From c8d15e296358095ce804f1a5825b2cdc9f551f7c Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 16 Jan 2018 14:58:48 -0800
Subject: [PATCH 001/966] Updating the docker login command. The email flag is
 deprecated. (#16171)

PiperOrigin-RevId: 181769938
---
 tensorflow/tools/docker/parameterized_docker_build.sh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index e7de7df856..1214b6b0a3 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -408,9 +408,8 @@ fi
 # Optional: set TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS to push image
 if [[ ! -z "${TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS}" ]]; then
 
-  docker login --username "${TF_DOCKER_USERNAME}" \
-  --email "${TF_DOCKER_EMAIL}" \
-  --password "${TF_DOCKER_PASSWORD}"
+  docker login -u "${TF_DOCKER_USERNAME}" \
+  -p "${TF_DOCKER_PASSWORD}"
 
   if [[ $? != "0" ]]; then
     die "FAIL: Unable to login. Invalid credentials."
-- 
GitLab


From 26891cccd3fad36fdfd19254435d6aec72397808 Mon Sep 17 00:00:00 2001
From: lspvic <lspvic@qq.com>
Date: Thu, 18 Jan 2018 19:39:45 +0800
Subject: [PATCH 002/966] Fix result shape of tf.tensordot unknown

---
 tensorflow/python/ops/math_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index cd07dad613..88260e2687 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -2768,7 +2768,8 @@ def tensordot(a, b, axes, name=None):
       if axes < 1:
         raise ValueError("'axes' must be at least 1.")
       if a_shape.ndims is not None:
-        return range(a_shape.ndims - axes, a_shape.ndims), range(axes)
+        return (list(xrange(a_shape.ndims - axes, a_shape.ndims)),
+                list(xrange(axes)))
       else:
         rank = array_ops.rank(a)
         return (range(rank - axes, rank, dtype=dtypes.int32),
-- 
GitLab


From cc0922c1af35a850077966d7fe95dfd5c208c4c4 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Thu, 18 Jan 2018 10:27:07 -0800
Subject: [PATCH 003/966] Fixing a typo for the argument to docker push.
 (#16204)

---
 tensorflow/tools/docker/parameterized_docker_build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 1214b6b0a3..fa867b65db 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -414,7 +414,7 @@ if [[ ! -z "${TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS}" ]]; then
   if [[ $? != "0" ]]; then
     die "FAIL: Unable to login. Invalid credentials."
   fi
-  docker push $1
+  docker push "${FINAL_IMG}"
   if [[ $? == "0" ]]; then
     docker logout
     echo "Successfully pushed Docker image ${FINAL_IMG}"
-- 
GitLab


From f063d85cf74e7ad5943d614f7ac4b7404d1ed32f Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Sat, 20 Jan 2018 00:52:59 +0900
Subject: [PATCH 004/966] fix typo

---
 tensorflow/contrib/eager/python/g3doc/guide.md     | 2 +-
 tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md
index 0095ffa0db..7eea93ce1f 100644
--- a/tensorflow/contrib/eager/python/g3doc/guide.md
+++ b/tensorflow/contrib/eager/python/g3doc/guide.md
@@ -292,7 +292,7 @@ def loss(weight, bias):
   error = prediction(training_inputs, weight, bias) - training_outputs
   return tf.reduce_mean(tf.square(error))
 
-# Function that returns the the derivative of loss with respect to
+# Function that returns the derivative of loss with respect to
 # weight and bias
 grad = tfe.gradients_function(loss)
 
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 3cda4bcccc..7019c29959 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -370,7 +370,7 @@ enum {
    * Looks up items from a given tensor.
    *
    * Each item in the output is a raw copy of the corresponding item in
-   * the input “values”. If the the given “lookup” indices are out of bounds,
+   * the input “values”. If the given “lookup” indices are out of bounds,
    * the op will fail and an error will be reported.
    *
    * Inputs:
-- 
GitLab


From 22c0a40e2f1980445b9616ea969931fb096595ff Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 09:18:11 -0800
Subject: [PATCH 005/966] Correct handling of dtype for Categorical sampling.

PiperOrigin-RevId: 182943806
---
 .../python/kernel_tests/distributions/categorical_test.py     | 4 ++++
 tensorflow/python/ops/distributions/categorical.py            | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/distributions/categorical_test.py b/tensorflow/python/kernel_tests/distributions/categorical_test.py
index 019c1bc353..ca2358fe99 100644
--- a/tensorflow/python/kernel_tests/distributions/categorical_test.py
+++ b/tensorflow/python/kernel_tests/distributions/categorical_test.py
@@ -100,6 +100,10 @@ class CategoricalTest(test.TestCase):
     self.assertEqual(
         dist.logits.dtype, dist.log_prob(np.array(
             0, dtype=np.int64)).dtype)
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
+      dist = make_categorical([], 5, dtype=dtype)
+      self.assertEqual(dist.dtype, dtype)
+      self.assertEqual(dist.dtype, dist.sample(5).dtype)
 
   def testUnknownShape(self):
     with self.test_session():
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index 60a515583e..9161e3fa9f 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -265,7 +265,9 @@ class Categorical(distribution.Distribution):
       logits_2d = self.logits
     else:
       logits_2d = array_ops.reshape(self.logits, [-1, self.event_size])
-    draws = random_ops.multinomial(logits_2d, n, seed=seed)
+    sample_dtype = dtypes.int64 if self.dtype.size > 4 else dtypes.int32
+    draws = random_ops.multinomial(
+        logits_2d, n, seed=seed, output_dtype=sample_dtype)
     draws = array_ops.reshape(
         array_ops.transpose(draws),
         array_ops.concat([[n], self.batch_shape_tensor()], 0))
-- 
GitLab


From 03760b7ad6e43e7f84cb4beb17814c58fe60f667 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Tue, 23 Jan 2018 09:46:25 -0800
Subject: [PATCH 006/966] The link translator expects "@{" for API links.

This is currently not rendering:

https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec

PiperOrigin-RevId: 182947562
---
 tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index b25ab1707e..b6d685b3fc 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -388,7 +388,7 @@ class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
   `metric_fn` runs on CPU to generate metrics and `tensors` represents the
   `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
   To be precise, TPU evaluation expects a slightly different signature from the
-  ${tf.estimator.Estimator}. While `EstimatorSpec.eval_metric_ops` expects a
+  @{tf.estimator.Estimator}. While `EstimatorSpec.eval_metric_ops` expects a
   dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
   The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
   `tensors` usually specify the model logits, which are transferred back from
-- 
GitLab


From 0bcbe479b4adaaa5b16000df4e703dd1389eaea2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 10:19:19 -0800
Subject: [PATCH 007/966] Increase shard count of
 tensorflow/contrib/learn:linear_test to avoid test timeouts.

The test tensorflow/contrib/learn:linear_test has been getting flaky timeouts t
a low rate for some time, but they increased greatly under the address
sanitizer due to a recent change there.

This CL increases the test's shard count to bring the time for each shard
well under the 5 minute timeout.

PiperOrigin-RevId: 182953078
---
 tensorflow/contrib/learn/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index ee3611ca93..3c782b54a8 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -494,7 +494,7 @@ py_test(
     name = "linear_test",
     size = "medium",
     srcs = ["python/learn/estimators/linear_test.py"],
-    shard_count = 4,
+    shard_count = 20,
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
-- 
GitLab


From 144bfce5be3d27d61c6370a2dd822e32815454ad Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Tue, 23 Jan 2018 10:59:00 -0800
Subject: [PATCH 008/966] Removed unecessary dependency

PiperOrigin-RevId: 182959711
---
 tensorflow/cc/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index ddcee3deee..c9ade5fb83 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -673,7 +673,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:tensorflow",
     ],
 )
 
-- 
GitLab


From 7da6a83b74f467929032dce95794ef0197d46b20 Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Tue, 23 Jan 2018 11:05:50 -0800
Subject: [PATCH 009/966] TFE usability: Implement `ndim` for
 `EagerTensor`+NumPy compatibility

Third party libraries like matplotlib often access `ndim`.

PiperOrigin-RevId: 182961097
---
 tensorflow/python/eager/tensor_test.py | 13 +++++++++++++
 tensorflow/python/framework/ops.py     |  5 +++++
 2 files changed, 18 insertions(+)

diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 2568d3dc05..0bd5a5dbaf 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -112,6 +112,19 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     numpy_tensor = np.asarray(tensor, dtype=np.int32)
     self.assertAllEqual(numpy_tensor, [1, 2, 3])
 
+  def testNdimsAgreesWithNumpy(self):
+    numpy_tensor = np.asarray(1.0)
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(numpy_tensor.ndim, tensor.ndim)
+
+    numpy_tensor = np.asarray([1.0, 2.0, 3.0])
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(numpy_tensor.ndim, tensor.ndim)
+
+    numpy_tensor = np.asarray([[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]])
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(numpy_tensor.ndim, tensor.ndim)
+
   def testCopy(self):
     t = constant_op.constant(1.0)
     tt = copy.copy(t)
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 0eb06ae913..2489982d93 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -775,6 +775,11 @@ class _EagerTensorBase(Tensor):
     """The shape of the tensor as a list."""
     return list(self._shape_tuple())
 
+  @property
+  def ndim(self):
+    """Returns the number of Tensor dimensions."""
+    return self.shape.ndims
+
   def cpu(self):
     """A copy of this Tensor with contents backed by host memory."""
     return self._copy(context.context(), "CPU:0")
-- 
GitLab


From d1020bfdedadc3da7c89a69651c45cc790922b69 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Tue, 23 Jan 2018 11:09:26 -0800
Subject: [PATCH 010/966] Run TF opensource presubmits with sanybridge CPU
 optimizations.

PiperOrigin-RevId: 182961789
---
 tensorflow/tools/ci_build/builds/libtensorflow.sh      | 2 +-
 tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh     | 3 ++-
 tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh    | 3 ++-
 tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh | 3 ++-
 tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh    | 3 ++-
 tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh     | 3 ++-
 tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh    | 3 ++-
 tensorflow/tools/ci_build/osx/cpu/run_contrib.sh       | 3 ++-
 tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh   | 3 ++-
 9 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh
index 26713dded8..aadf480d37 100755
--- a/tensorflow/tools/ci_build/builds/libtensorflow.sh
+++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh
@@ -51,8 +51,8 @@ function build_libtensorflow_tarball() {
   rm -rf ${DIR}
 
   TARBALL_SUFFIX="${1}"
-  BAZEL="bazel --bazelrc ./tensorflow/tools/ci_build/install/.bazelrc"
   BAZEL_OPTS="-c opt"
+  export CC_OPT_FLAGS='-mavx'
   if [ "${TF_NEED_CUDA}" == "1" ]; then
     BAZEL_OPTS="${BAZEL_OPTS} --config=cuda"
   fi
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
index e3e6b2f316..51e10f81f8 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
@@ -26,12 +26,13 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 # Only running cc tests, python version does not matter.
 export PYTHON_BIN_PATH=`which python`
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test --test_lang_filters=cc,java -k \
-    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --config=opt \
     --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
index 5110d52f31..ea14848b1a 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
@@ -26,11 +26,12 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=`which python2`
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \
-    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only --config=opt \
     --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
index df6016504c..6d017c8a1f 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
@@ -26,12 +26,13 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=`which python3`
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test -k \
-    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --config=opt \
     --test_output=errors -- \
     //tensorflow/contrib/... \
     -//tensorflow/contrib/lite/... \
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
index ea9e102936..a9accb9dd5 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
@@ -26,11 +26,12 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=`which python3`
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \
-    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only --config=opt \
     --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
index df196f829c..02224d8e9d 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
@@ -26,6 +26,7 @@ echo ""
 
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
+export CC_OPT_FLAGS='-mavx'
 
 export TF_NEED_CUDA=1
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
@@ -35,6 +36,6 @@ yes "" | $PYTHON_BIN_PATH configure.py
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
     --test_lang_filters=cc --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
-    --build_tests_only --test_output=errors --local_test_jobs=8 \
+    --build_tests_only --test_output=errors --local_test_jobs=8 --config=opt \
     --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
index abd256a895..0367a53d14 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
@@ -26,6 +26,7 @@ echo ""
 
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
+export CC_OPT_FLAGS='-mavx'
 
 export TF_NEED_CUDA=1
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
@@ -35,6 +36,6 @@ yes "" | $PYTHON_BIN_PATH configure.py
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
     --test_lang_filters=py --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
-    --build_tests_only --test_output=errors --local_test_jobs=8 \
+    --build_tests_only --test_output=errors --local_test_jobs=8 --config=opt \
     --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
index ddaaddc917..509ee38ec4 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
@@ -27,11 +27,12 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac \
     --test_timeout 300,450,1200,3600 \
-    --test_size_filters=small,medium \
+    --test_size_filters=small,medium --config=opt \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
     //tensorflow/contrib/... -//tensorflow/contrib/lite/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
index e026dcd08f..0554713670 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -27,11 +27,12 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac \
-    --test_timeout 300,450,1200,3600 \
+    --test_timeout 300,450,1200,3600 --config=opt \
     --test_size_filters=small,medium \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
-- 
GitLab


From 2a3559feb6564e4e46a56a71b200f6a17afe69e7 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Tue, 23 Jan 2018 11:20:29 -0800
Subject: [PATCH 011/966] Moves batch ops to core and exposes
 tf.contrib.batching

PiperOrigin-RevId: 182963906
---
 tensorflow/BUILD                              |   3 -
 tensorflow/contrib/BUILD                      |   2 -
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/batching/BUILD             |  47 ++---
 tensorflow/contrib/batching/kernels/BUILD     |  34 ----
 tensorflow/contrib/batching/ops/batch_ops.cc  | 164 ------------------
 .../contrib/batching/python/ops/batch_ops.py  |  12 +-
 tensorflow/contrib/cmake/python_modules.txt   |   1 -
 tensorflow/contrib/cmake/tf_core_ops.cmake    |   1 +
 tensorflow/contrib/cmake/tf_python.cmake      |   1 +
 tensorflow/core/BUILD                         |   3 +
 .../core/api_def/base_api/api_def_Batch.pbtxt |  42 +++++
 .../api_def/base_api/api_def_Unbatch.pbtxt    |  24 +++
 .../base_api/api_def_UnbatchGrad.pbtxt        |  20 +++
 tensorflow/core/kernels/BUILD                 |  17 ++
 .../kernels/batch_kernels.cc                  |   6 +-
 tensorflow/core/ops/batch_ops.cc              |  84 +++++++++
 tensorflow/python/BUILD                       |   7 +
 18 files changed, 218 insertions(+), 251 deletions(-)
 delete mode 100644 tensorflow/contrib/batching/kernels/BUILD
 delete mode 100644 tensorflow/contrib/batching/ops/batch_ops.cc
 create mode 100644 tensorflow/core/api_def/base_api/api_def_Batch.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_Unbatch.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_UnbatchGrad.pbtxt
 rename tensorflow/{contrib/batching => core}/kernels/batch_kernels.cc (99%)
 create mode 100644 tensorflow/core/ops/batch_ops.cc

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 84ef1447dd..1bf7c741b7 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -434,9 +434,6 @@ filegroup(
         "//tensorflow/contrib/all_reduce:all_files",
         "//tensorflow/contrib/android:all_files",
         "//tensorflow/contrib/batching:all_files",
-        "//tensorflow/contrib/batching/kernels:all_files",
-        "//tensorflow/contrib/batching/test_util:all_files",
-        "//tensorflow/contrib/batching/util:all_files",
         "//tensorflow/contrib/bayesflow:all_files",
         "//tensorflow/contrib/boosted_trees:all_files",
         "//tensorflow/contrib/boosted_trees/estimator_batch:all_files",
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 8bed0fabd7..f1e54432fa 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -111,7 +111,6 @@ cc_library(
     name = "contrib_kernels",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/batching:batch_ops_kernels",
         "//tensorflow/contrib/boosted_trees:boosted_trees_kernels",
         "//tensorflow/contrib/coder:all_kernels",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_kernels",
@@ -134,7 +133,6 @@ cc_library(
     name = "contrib_ops_op_lib",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/batching:batch_ops_op_lib",
         "//tensorflow/contrib/boosted_trees:boosted_trees_ops_op_lib",
         "//tensorflow/contrib/coder:all_ops",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_ops_op_lib",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index f600a8a998..8f6a3cb1ca 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # Add projects here, they will show up under tf.contrib.
+from tensorflow.contrib import batching
 from tensorflow.contrib import bayesflow
 from tensorflow.contrib import cloud
 from tensorflow.contrib import cluster_resolver
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index cd98f0e703..ee67909133 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -67,48 +67,14 @@ load(
 )
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
-tf_custom_op_library(
-    name = "python/ops/_batch_ops.so",
-    srcs = ["ops/batch_ops.cc"],
-    deps = [
-        "//tensorflow/contrib/batching/kernels:batch_kernels",
-    ],
-)
-
-tf_gen_op_libs(
-    op_lib_names = ["batch_ops"],
-)
-
-tf_gen_op_wrapper_py(
-    name = "batch_ops",
-    deps = [":batch_ops_op_lib"],
-)
-
-tf_kernel_library(
-    name = "batch_ops_kernels",
-    deps = [
-        "//tensorflow/contrib/batching/kernels:batch_kernels",
-        "//tensorflow/contrib/batching/util:periodic_function",
-        "//tensorflow/core/kernels:concat_lib",
-        "//tensorflow/core/kernels:ops_util",
-        "//tensorflow/core/kernels:split_lib",
-    ],
-    alwayslink = 1,
-)
-
-tf_custom_op_py_library(
+py_library(
     name = "batch_py",
     srcs = glob(["python/ops/*.py"]) + ["__init__.py"],
-    dso = [":python/ops/_batch_ops.so"],
-    kernels = [
-        ":batch_ops_kernels",
-        ":batch_ops_op_lib",
-    ],
     srcs_version = "PY2AND3",
     deps = [
-        ":batch_ops",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:batch_ops_gen",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
@@ -118,6 +84,14 @@ tf_custom_op_py_library(
     ],
 )
 
+cc_library(
+    name = "batch_ops_kernels",
+    deps = [
+        "//tensorflow/core/kernels:batch_kernels",
+    ],
+    alwayslink = 1,
+)
+
 py_test(
     name = "batch_ops_test",
     size = "small",
@@ -133,6 +107,7 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
         "//tensorflow/python:gradients",
         "//tensorflow/python:script_ops",
     ],
diff --git a/tensorflow/contrib/batching/kernels/BUILD b/tensorflow/contrib/batching/kernels/BUILD
deleted file mode 100644
index 6e53dd9a5f..0000000000
--- a/tensorflow/contrib/batching/kernels/BUILD
+++ /dev/null
@@ -1,34 +0,0 @@
-# Description:
-#   Contains kernels for the batching ops.
-
-package(default_visibility = ["//tensorflow:__subpackages__"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-cc_library(
-    name = "batch_kernels",
-    srcs = ["batch_kernels.cc"],
-    deps = [
-        "//tensorflow/contrib/batching:shared_batch_scheduler_hdrs",
-        "//tensorflow/contrib/batching/util:periodic_function_dynamic",
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/kernels:concat_lib_hdrs",
-        "//tensorflow/core/kernels:ops_util_hdrs",
-        "//tensorflow/core/kernels:split_lib_hdrs",
-    ],
-    alwayslink = 1,
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/batching/ops/batch_ops.cc b/tensorflow/contrib/batching/ops/batch_ops.cc
deleted file mode 100644
index 85e0ccba4a..0000000000
--- a/tensorflow/contrib/batching/ops/batch_ops.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/shape_inference.h"
-
-namespace tensorflow {
-
-REGISTER_OP("Batch")
-    .Input("in_tensors: T")
-    .Output("batched_tensors: T")
-    .Output("batch_index: int64")
-    .Output("id: int64")
-    .Attr("num_batch_threads: int")
-    .Attr("max_batch_size: int")
-    .Attr("batch_timeout_micros: int")
-    .Attr("allowed_batch_sizes: list(int) = []")
-    .Attr("grad_timeout_micros: int")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Attr("batching_queue: string = ''")
-    .Attr("T: list(type)")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      std::vector<shape_inference::ShapeHandle> in_shapes;
-      TF_RETURN_IF_ERROR(c->input("in_tensors", &in_shapes));
-      std::vector<shape_inference::ShapeHandle> out_shapes(in_shapes.size());
-      for (int i = 0; i < in_shapes.size(); ++i) {
-        TF_RETURN_IF_ERROR(
-            c->ReplaceDim(in_shapes[i], 0, c->UnknownDim(), &out_shapes[i]));
-      }
-      TF_RETURN_IF_ERROR(c->set_output("batched_tensors", out_shapes));
-      TF_RETURN_IF_ERROR(c->set_output("id", {c->Scalar()}));
-      TF_RETURN_IF_ERROR(c->set_output(
-          "batch_index",
-          {c->MakeShape({shape_inference::DimensionOrConstant(c->UnknownDim()),
-                         shape_inference::DimensionOrConstant(3)})}));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Batches all input tensors nondeterministically.
-
-When many instances of this Op are being run concurrently with the same
-container/shared_name in the same device, some will output zero-shaped Tensors
-and others will output Tensors of size up to max_batch_size.
-
-All Tensors in in_tensors are batched together (so, for example, labels and
-features should be batched with a single instance of this operation.
-
-Each invocation of batch emits an `id` scalar which will be used to identify
-this particular invocation when doing unbatch or its gradient.
-
-Each op which emits a non-empty batch will also emit a non-empty batch_index
-Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
-start, and length of elements of each set of Tensors present in batched_tensors.
-
-Batched tensors are concatenated along the first dimension, and all tensors in
-in_tensors must have the first dimension of the same size.
-
-in_tensors: The tensors to be batched.
-num_batch_threads: Number of scheduling threads for processing batches of work.
- Determines the number of batches processed in parallel.
-max_batch_size: Batch sizes will never be bigger than this.
-batch_timeout_micros: Maximum number of microseconds to wait before outputting
- an incomplete batch.
-allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
- nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
- batches up to one of those sizes. The entries must increase monotonically, and
- the final entry must equal max_batch_size.
-grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
-batched_tensors: Either empty tensors or a batch of concatenated Tensors.
-batch_index: If out_tensors is non-empty, has information to invert it.
-container: Controls the scope of sharing of this batch.
-id: always contains a scalar with a unique ID for this invocation of Batch.
-shared_name: Concurrently running instances of batch in the same device with the
- same container and shared_name will batch their elements together. If left
- empty, the op name will be used as the shared name.
-T: the types of tensors to be batched.
-)doc");
-
-REGISTER_OP("Unbatch")
-    .Input("batched_tensor: T")
-    .Input("batch_index: int64")
-    .Input("id: int64")
-    .Output("unbatched_tensor: T")
-    .Attr("timeout_micros: int")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle out_shape;
-      TF_RETURN_IF_ERROR(
-          c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &out_shape));
-      c->set_output(0, out_shape);
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Reverses the operation of Batch for a single output Tensor.
-
-An instance of Unbatch either receives an empty batched_tensor, in which case it
-asynchronously waits until the values become available from a concurrently
-running instance of Unbatch with the same container and shared_name, or receives
-a non-empty batched_tensor in which case it finalizes all other concurrently
-running instances and outputs its own element from the batch.
-
-batched_tensor: The possibly transformed output of Batch. The size of the first
- dimension should remain unchanged by the transformations for the operation to
- work.
-batch_index: The matching batch_index obtained from Batch.
-id: The id scalar emitted by Batch.
-unbatched_tensor: The Tensor corresponding to this execution.
-timeout_micros: Maximum amount of time (in microseconds) to wait to receive the
- batched input tensor associated with a given invocation of the op.
-container: Container to control resource sharing.
-shared_name: Instances of Unbatch with the same container and shared_name are
- assumed to possibly belong to the same batch. If left empty, the op name will
- be used as the shared name.
-)doc");
-
-REGISTER_OP("UnbatchGrad")
-    .Input("original_input: T")
-    .Input("batch_index: int64")
-    .Input("grad: T")
-    .Input("id: int64")
-    .Output("batched_grad: T")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->UnknownShapeOfRank(c->Rank(c->input(2))));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Gradient of Unbatch.
-
-Acts like Batch but using the given batch_index index of batching things as they
-become available. This ensures that the gradients are propagated back in the
-same session which did the forward pass.
-
-original_input: The input to the Unbatch operation this is the gradient of.
-batch_index: The batch_index given to the Unbatch operation this is the gradient
-of.
-grad: The downstream gradient.
-id: The id scalar emitted by Batch.
-batched_grad: The return value, either an empty tensor or the batched gradient.
-container: Container to control resource sharing.
-shared_name: Instances of UnbatchGrad with the same container and shared_name
- are assumed to possibly belong to the same batch. If left empty, the op name
- will be used as the shared name.
-  )doc");
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops.py b/tensorflow/contrib/batching/python/ops/batch_ops.py
index cee4d7b4a9..4e0b3f9af9 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops.py
@@ -18,18 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.batching.ops import gen_batch_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_batch_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from tensorflow.contrib.batching.ops.gen_batch_ops import *
+from tensorflow.python.ops.gen_batch_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.contrib.util import loader
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import resource_loader
-
-
-_batch_ops = loader.load_op_library(
-    resource_loader.get_path_to_datafile("_batch_ops.so"))
 
 
 @ops.RegisterGradient("Batch")
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index e37d059a84..dec6c513ba 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -109,7 +109,6 @@ tensorflow/contrib/android/java/org/tensorflow/contrib
 tensorflow/contrib/android/java/org/tensorflow/contrib/android
 tensorflow/contrib/android/jni
 tensorflow/contrib/batching
-tensorflow/contrib/batching/kernels
 tensorflow/contrib/batching/python
 tensorflow/contrib/batching/python/ops
 tensorflow/contrib/bayesflow
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 6f56e9d086..138993db35 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -15,6 +15,7 @@
 set(tf_op_lib_names
     "audio_ops"
     "array_ops"
+		"batch_ops"
     "bitwise_ops"
     "candidate_sampling_ops"
     "checkpoint_ops"
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 17bbdb1a86..b62a031749 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -314,6 +314,7 @@ endfunction()
 
 GENERATE_PYTHON_OP_LIB("audio_ops")
 GENERATE_PYTHON_OP_LIB("array_ops")
+GENERATE_PYTHON_OP_LIB("batch_ops")
 GENERATE_PYTHON_OP_LIB("bitwise_ops")
 GENERATE_PYTHON_OP_LIB("math_ops")
 GENERATE_PYTHON_OP_LIB("functional_ops")
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 232f7d9b3c..d73f003a1d 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -593,6 +593,7 @@ cc_library(
 tf_gen_op_libs(
     is_external = False,
     op_lib_names = [
+        "batch_ops",
         "bitwise_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
@@ -674,6 +675,7 @@ cc_library(
     deps = [
         ":array_ops_op_lib",
         ":audio_ops_op_lib",
+        ":batch_ops_op_lib",
         ":bitwise_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
@@ -809,6 +811,7 @@ cc_library(
     deps = [
         "//tensorflow/core/kernels:array",
         "//tensorflow/core/kernels:audio",
+        "//tensorflow/core/kernels:batch_kernels",
         "//tensorflow/core/kernels:bincount_op",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
diff --git a/tensorflow/core/api_def/base_api/api_def_Batch.pbtxt b/tensorflow/core/api_def/base_api/api_def_Batch.pbtxt
new file mode 100644
index 0000000000..aea11b64fd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Batch.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "Batch"
+  summary: "Batches all input tensors nondeterministically."
+  description: <<END
+When many instances of this Op are being run concurrently with the same
+container/shared_name in the same device, some will output zero-shaped Tensors
+and others will output Tensors of size up to max_batch_size.
+
+All Tensors in in_tensors are batched together (so, for example, labels and
+features should be batched with a single instance of this operation.
+
+Each invocation of batch emits an `id` scalar which will be used to identify
+this particular invocation when doing unbatch or its gradient.
+
+Each op which emits a non-empty batch will also emit a non-empty batch_index
+Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
+start, and length of elements of each set of Tensors present in batched_tensors.
+
+Batched tensors are concatenated along the first dimension, and all tensors in
+in_tensors must have the first dimension of the same size.
+
+in_tensors: The tensors to be batched.
+num_batch_threads: Number of scheduling threads for processing batches of work.
+ Determines the number of batches processed in parallel.
+max_batch_size: Batch sizes will never be bigger than this.
+batch_timeout_micros: Maximum number of microseconds to wait before outputting
+ an incomplete batch.
+allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+ nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+ batches up to one of those sizes. The entries must increase monotonically, and
+ the final entry must equal max_batch_size.
+grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+batch_index: If out_tensors is non-empty, has information to invert it.
+container: Controls the scope of sharing of this batch.
+id: always contains a scalar with a unique ID for this invocation of Batch.
+shared_name: Concurrently running instances of batch in the same device with the
+ same container and shared_name will batch their elements together. If left
+ empty, the op name will be used as the shared name.
+T: the types of tensors to be batched.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Unbatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_Unbatch.pbtxt
new file mode 100644
index 0000000000..6d10ea606d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Unbatch.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "Unbatch"
+  summary: "Reverses the operation of Batch for a single output Tensor."
+  description: <<END
+An instance of Unbatch either receives an empty batched_tensor, in which case it
+asynchronously waits until the values become available from a concurrently
+running instance of Unbatch with the same container and shared_name, or receives
+a non-empty batched_tensor in which case it finalizes all other concurrently
+running instances and outputs its own element from the batch.
+
+batched_tensor: The possibly transformed output of Batch. The size of the first
+ dimension should remain unchanged by the transformations for the operation to
+ work.
+batch_index: The matching batch_index obtained from Batch.
+id: The id scalar emitted by Batch.
+unbatched_tensor: The Tensor corresponding to this execution.
+timeout_micros: Maximum amount of time (in microseconds) to wait to receive the
+ batched input tensor associated with a given invocation of the op.
+container: Container to control resource sharing.
+shared_name: Instances of Unbatch with the same container and shared_name are
+ assumed to possibly belong to the same batch. If left empty, the op name will
+ be used as the shared name.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnbatchGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnbatchGrad.pbtxt
new file mode 100644
index 0000000000..487b4218d5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnbatchGrad.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "UnbatchGrad"
+  summary: "Gradient of Unbatch."
+  description: <<END
+Acts like Batch but using the given batch_index index of batching things as they
+become available. This ensures that the gradients are propagated back in the
+same session which did the forward pass.
+
+original_input: The input to the Unbatch operation this is the gradient of.
+batch_index: The batch_index given to the Unbatch operation this is the gradient
+of.
+grad: The downstream gradient.
+id: The id scalar emitted by Batch.
+batched_grad: The return value, either an empty tensor or the batched gradient.
+container: Container to control resource sharing.
+shared_name: Instances of UnbatchGrad with the same container and shared_name
+ are assumed to possibly belong to the same batch. If left empty, the op name
+ will be used as the shared name.
+END
+}
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index c0d3867770..96f0feaf3d 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -369,6 +369,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "batch_kernels",
+    srcs = ["batch_kernels.cc"],
+    deps = [
+        "//tensorflow/core:batch_ops_op_lib",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:concat_lib_hdrs",
+        "//tensorflow/core/kernels:ops_util_hdrs",
+        "//tensorflow/core/kernels:split_lib_hdrs",
+        "//tensorflow/core/kernels/batching_util:periodic_function_dynamic",
+        "//tensorflow/core/kernels/batching_util:shared_batch_scheduler_hdrs",
+    ],
+    alwayslink = 1,
+)
+
 tf_kernel_library(
     name = "record_input_op",
     srcs = [
@@ -4975,6 +4991,7 @@ filegroup(
             "debug_ops.*",
             "scatter_nd_op*",
             "critical_section.*",
+            "batch_kernels.*",
         ],
     ),
     visibility = ["//visibility:public"],
diff --git a/tensorflow/contrib/batching/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
similarity index 99%
rename from tensorflow/contrib/batching/kernels/batch_kernels.cc
rename to tensorflow/core/kernels/batch_kernels.cc
index 6041d8c9b2..5b4e1a809f 100644
--- a/tensorflow/contrib/batching/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -13,20 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/batching/shared_batch_scheduler.h"
-#include "tensorflow/contrib/batching/util/periodic_function.h"
+
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
+#include "tensorflow/core/kernels/batching_util/periodic_function.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/macros.h"
 
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
diff --git a/tensorflow/core/ops/batch_ops.cc b/tensorflow/core/ops/batch_ops.cc
new file mode 100644
index 0000000000..a64582acee
--- /dev/null
+++ b/tensorflow/core/ops/batch_ops.cc
@@ -0,0 +1,84 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("Batch")
+    .Input("in_tensors: T")
+    .Output("batched_tensors: T")
+    .Output("batch_index: int64")
+    .Output("id: int64")
+    .Attr("num_batch_threads: int")
+    .Attr("max_batch_size: int")
+    .Attr("batch_timeout_micros: int")
+    .Attr("allowed_batch_sizes: list(int) = []")
+    .Attr("grad_timeout_micros: int")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("batching_queue: string = ''")
+    .Attr("T: list(type)")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      std::vector<shape_inference::ShapeHandle> in_shapes;
+      TF_RETURN_IF_ERROR(c->input("in_tensors", &in_shapes));
+      std::vector<shape_inference::ShapeHandle> out_shapes(in_shapes.size());
+      for (int i = 0; i < in_shapes.size(); ++i) {
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(in_shapes[i], 0, c->UnknownDim(), &out_shapes[i]));
+      }
+      TF_RETURN_IF_ERROR(c->set_output("batched_tensors", out_shapes));
+      TF_RETURN_IF_ERROR(c->set_output("id", {c->Scalar()}));
+      TF_RETURN_IF_ERROR(c->set_output(
+          "batch_index",
+          {c->MakeShape({shape_inference::DimensionOrConstant(c->UnknownDim()),
+                         shape_inference::DimensionOrConstant(3)})}));
+      return Status::OK();
+    });
+
+REGISTER_OP("Unbatch")
+    .Input("batched_tensor: T")
+    .Input("batch_index: int64")
+    .Input("id: int64")
+    .Output("unbatched_tensor: T")
+    .Attr("timeout_micros: int")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle out_shape;
+      TF_RETURN_IF_ERROR(
+          c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &out_shape));
+      c->set_output(0, out_shape);
+      return Status::OK();
+    });
+
+REGISTER_OP("UnbatchGrad")
+    .Input("original_input: T")
+    .Input("batch_index: int64")
+    .Input("grad: T")
+    .Input("id: int64")
+    .Output("batched_grad: T")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->UnknownShapeOfRank(c->Rank(c->input(2))));
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a0193ca6ca..01b3e92d2d 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1387,6 +1387,13 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "batch_ops_gen",
+    visibility = [
+        "//tensorflow:__subpackages__",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "math_ops_gen",
     visibility = [
-- 
GitLab


From 25006a71a4a668049f338b396ba410d851c13c9d Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 23 Jan 2018 11:23:35 -0800
Subject: [PATCH 012/966] Make feature_column_test.py run with the C API
 enabled.

I only enabled running the C API with test classes that other testing
found failures in to avoid increasing the test time unnecessarily.

PiperOrigin-RevId: 182964349
---
 .../feature_column/feature_column_test.py     | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 2374680b96..d78d74c8ca 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -1072,6 +1072,7 @@ def get_linear_model_column_var(column):
                             'linear_model/' + column.name)[0]
 
 
+@test_util.with_c_api
 class LinearModelTest(test.TestCase):
 
   def test_raises_if_empty_feature_columns(self):
@@ -1325,10 +1326,16 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [price])
-      with _initialized_session():
-        with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
-          predictions.eval()
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(
+            Exception,
+            r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+          predictions = fc.linear_model(features, [price])
+      else:
+        predictions = fc.linear_model(features, [price])
+        with _initialized_session():
+          with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+            predictions.eval()
 
   def test_dense_reshaping(self):
     price = fc.numeric_column('price', shape=[1, 2])
@@ -1791,6 +1798,7 @@ class InputLayerTest(test.TestCase):
       self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
 
 
+@test_util.with_c_api
 class FunctionalInputLayerTest(test.TestCase):
 
   def test_raises_if_empty_feature_columns(self):
@@ -1855,10 +1863,16 @@ class FunctionalInputLayerTest(test.TestCase):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      net = fc.input_layer(features, [price])
-      with _initialized_session():
-        with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
-          net.eval()
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(
+            Exception,
+            r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+          net = fc.input_layer(features, [price])
+      else:
+        net = fc.input_layer(features, [price])
+        with _initialized_session():
+          with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+            net.eval()
 
   def test_reshaping(self):
     price = fc.numeric_column('price', shape=[1, 2])
-- 
GitLab


From 95f6860767116e1a831362c7d6233087fcee47dd Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Tue, 23 Jan 2018 11:37:58 -0800
Subject: [PATCH 013/966] Add `tf.contrib.distributions.Autoregressive`.

PiperOrigin-RevId: 182966827
---
 tensorflow/contrib/distributions/BUILD        |  13 ++
 tensorflow/contrib/distributions/__init__.py  |   2 +
 .../kernel_tests/autoregressive_test.py       |  94 ++++++++
 .../python/ops/autoregressive.py              | 208 ++++++++++++++++++
 4 files changed, 317 insertions(+)
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/autoregressive_test.py
 create mode 100644 tensorflow/contrib/distributions/python/ops/autoregressive.py

diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 95848af699..7d785c3636 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -128,6 +128,19 @@ cuda_py_test(
     tags = ["no_pip"],
 )
 
+cuda_py_test(
+    name = "autoregressive_test",
+    size = "small",
+    srcs = ["python/kernel_tests/autoregressive_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "binomial_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 7b401e178f..59cc5eae06 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import,line-too-long,g-importing-member
 
 from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.contrib.distributions.python.ops.autoregressive import *
 from tensorflow.contrib.distributions.python.ops.binomial import *
 from tensorflow.contrib.distributions.python.ops.cauchy import *
 from tensorflow.contrib.distributions.python.ops.chi2 import *
@@ -92,6 +93,7 @@ _allowed_symbols = [
     'NOT_REPARAMETERIZED',
     'ReparameterizationType',
     'Distribution',
+    'Autoregressive',
     'Binomial',
     'Bernoulli',
     'BernoulliWithSigmoidProbs',
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/autoregressive_test.py b/tensorflow/contrib/distributions/python/kernel_tests/autoregressive_test.py
new file mode 100644
index 0000000000..0928dc3f35
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/autoregressive_test.py
@@ -0,0 +1,94 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import autoregressive as autoregressive_lib
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.contrib.distributions.python.ops import test_util
+from tensorflow.contrib.distributions.python.ops.bijectors.affine import Affine
+from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import MaskedAutoregressiveFlow
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import transformed_distribution as transformed_distribution_lib
+from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.platform import test
+
+
+class AutogressiveTest(test_util.VectorDistributionTestHelpers, test.TestCase):
+  """Tests the Autoregressive distribution."""
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  def _random_scale_tril(self, event_size):
+    n = np.int32(event_size * (event_size + 1) // 2)
+    p = 2. * self._rng.random_sample(n).astype(np.float32) - 1.
+    return distribution_util.fill_triangular(0.25 * p)
+
+  def _normal_fn(self, affine_bijector):
+    def _fn(samples):
+      scale = math_ops.exp(affine_bijector.forward(samples))
+      return independent_lib.Independent(
+          normal_lib.Normal(loc=0., scale=scale, validate_args=True),
+          reinterpreted_batch_ndims=1)
+    return _fn
+
+  def testSampleAndLogProbConsistency(self):
+    batch_shape = []
+    event_size = 2
+    with self.test_session() as sess:
+      batch_event_shape = np.concatenate([batch_shape, [event_size]], axis=0)
+      sample0 = array_ops.zeros(batch_event_shape)
+      affine = Affine(scale_tril=self._random_scale_tril(event_size))
+      ar = autoregressive_lib.Autoregressive(
+          self._normal_fn(affine), sample0, validate_args=True)
+      self.run_test_sample_consistent_log_prob(
+          sess.run, ar, radius=1., center=0., rtol=0.01)
+
+  def testCompareToBijector(self):
+    """Demonstrates equivalence between TD, Bijector approach and AR dist."""
+    sample_shape = np.int32([4, 5])
+    batch_shape = np.int32([])
+    event_size = np.int32(2)
+    with self.test_session() as sess:
+      batch_event_shape = np.concatenate([batch_shape, [event_size]], axis=0)
+      sample0 = array_ops.zeros(batch_event_shape)
+      affine = Affine(scale_tril=self._random_scale_tril(event_size))
+      ar = autoregressive_lib.Autoregressive(
+          self._normal_fn(affine), sample0, validate_args=True)
+      ar_flow = MaskedAutoregressiveFlow(
+          is_constant_jacobian=True,
+          shift_and_log_scale_fn=lambda x: [None, affine.forward(x)],
+          validate_args=True)
+      td = transformed_distribution_lib.TransformedDistribution(
+          distribution=normal_lib.Normal(loc=0., scale=1.),
+          bijector=ar_flow,
+          event_shape=[event_size],
+          batch_shape=batch_shape,
+          validate_args=True)
+      x_shape = np.concatenate(
+          [sample_shape, batch_shape, [event_size]], axis=0)
+      x = 2. * self._rng.random_sample(x_shape).astype(np.float32) - 1.
+      td_log_prob_, ar_log_prob_ = sess.run([td.log_prob(x), ar.log_prob(x)])
+      self.assertAllClose(td_log_prob_, ar_log_prob_, atol=0., rtol=1e-6)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/autoregressive.py b/tensorflow/contrib/distributions/python/ops/autoregressive.py
new file mode 100644
index 0000000000..852298bf33
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/autoregressive.py
@@ -0,0 +1,208 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Autoregressive distribution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.distributions import distribution as distribution_lib
+from tensorflow.python.ops.distributions import util as distribution_util
+
+
+class Autoregressive(distribution_lib.Distribution):
+  """Autoregressive distributions.
+
+  The Autoregressive distribution enables learning (often) richer multivariate
+  distributions by repeatedly applying a [diffeomorphic](
+  https://en.wikipedia.org/wiki/Diffeomorphism) transformation (such as
+  implemented by `Bijector`s). Regarding terminology,
+
+    "Autoregressive models decompose the joint density as a product of
+    conditionals, and model each conditional in turn. Normalizing flows
+    transform a base density (e.g. a standard Gaussian) into the target density
+    by an invertible transformation with tractable Jacobian." [1]
+
+  In other words, the "autoregressive property" is equivalent to the
+  decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
+  `shift_and_log_scale_fn`, `masked_autoregressive_default_template`, achieves
+  this property by zeroing out weights in its `masked_dense` layers.
+
+  Practically speaking the autoregressive property means that there exists a
+  permutation of the event coordinates such that each coordinate is a
+  diffeomorphic function of only preceding coordinates. [2]
+
+  #### Mathematical Details
+
+  The probability function is,
+
+  ```none
+  prob(x; fn, n) = fn(x).prob(x)
+  ```
+
+  And a sample is generated by,
+
+  ```none
+  x = fn(...fn(fn(x0).sample()).sample()).sample()
+  ```
+
+  where the ellipses (`...`) represent `n-2` composed calls to `fn`, `fn`
+  constructs a `tf.distributions.Distribution`-like instance, and `x0` is a
+  fixed initializing `Tensor`.
+
+  #### Examples
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  def normal_fn(self, event_size):
+    n = event_size * (event_size + 1) / 2
+    p = tf.Variable(tfd.Normal(loc=0., scale=1.).sample(n))
+    affine = tfd.bijectors.Affine(
+        scale_tril=tfd.fill_triangular(0.25 * p))
+    def _fn(samples):
+      scale = math_ops.exp(affine.forward(samples)).eval()
+      return independent_lib.Independent(
+          normal_lib.Normal(loc=0., scale=scale, validate_args=True),
+          reinterpreted_batch_ndims=1)
+    return _fn
+
+  batch_and_event_shape = [3, 2, 4]
+  sample0 = array_ops.zeros(batch_and_event_shape)
+  ar = autoregressive_lib.Autoregressive(
+      self._normal_fn(batch_and_event_shape[-1]), sample0)
+  x = ar.sample([6, 5])
+  # ==> x.shape = [6, 5, 3, 2, 4]
+  prob_x = ar.prob(x)
+  # ==> x.shape = [6, 5, 3, 2]
+
+  ```
+
+  [1]: "Masked Autoregressive Flow for Density Estimation."
+       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
+       https://arxiv.org/abs/1705.07057
+
+  [2]: "Conditional Image Generation with PixelCNN Decoders."
+       Aaron van den Oord, Nal Kalchbrenner, Oriol Vinyals, Lasse Espeholt, Alex
+       Graves, Koray Kavukcuoglu. Arxiv, 2016.
+       https://arxiv.org/abs/1606.05328
+  """
+
+  def __init__(self,
+               distribution_fn,
+               sample0=None,
+               num_steps=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="Autoregressive"):
+    """Construct an `Autoregressive` distribution.
+
+    Args:
+      distribution_fn: Python `callable` which constructs a
+        `tf.distributions.Distribution`-like instance from a `Tensor` (e.g.,
+        `sample0`). The function must respect the "autoregressive property",
+        i.e., there exists a permutation of event such that each coordinate is a
+        diffeomorphic function of on preceding coordinates.
+      sample0: Initial input to `distribution_fn`; used to
+        build the distribution in `__init__` which in turn specifies this
+        distribution's properties, e.g., `event_shape`, `batch_shape`, `dtype`.
+        If unspecified, then `distribution_fn` should be default constructable.
+      num_steps: Number of times `distribution_fn` is composed from samples,
+        e.g., `num_steps=2` implies
+        `distribution_fn(distribution_fn(sample0).sample(n)).sample()`.
+      validate_args: Python `bool`.  Whether to validate input with asserts.
+        If `validate_args` is `False`, and the inputs are invalid,
+        correct behavior is not guaranteed.
+      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
+        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
+        result is undefined. When `False`, an exception is raised if one or
+        more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+        Default value: "Autoregressive".
+
+    Raises:
+      ValueError: if `num_steps` and
+        `distribution_fn(sample0).event_shape.num_elements()` are both `None`.
+      ValueError: if `num_steps < 1`.
+    """
+    parameters = locals()
+    with ops.name_scope(name):
+      self._distribution_fn = distribution_fn
+      self._sample0 = sample0
+      self._distribution0 = (distribution_fn() if sample0 is None
+                             else distribution_fn(sample0))
+      if num_steps is None:
+        num_steps = self._distribution0.event_shape.num_elements()
+        if num_steps is None:
+          raise ValueError("distribution_fn must generate a distribution "
+                           "with fully known `event_shape`.")
+      if num_steps < 1:
+        raise ValueError("num_steps ({}) must be at least 1.".format(num_steps))
+      self._num_steps = num_steps
+    super(Autoregressive, self).__init__(
+        dtype=self._distribution0.dtype,
+        reparameterization_type=self._distribution0.reparameterization_type,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        parameters=parameters,
+        graph_parents=self._distribution0._graph_parents,  # pylint: disable=protected-access
+        name=name)
+
+  @property
+  def distribution_fn(self):
+    return self._distribution_fn
+
+  @property
+  def sample0(self):
+    return self._sample0
+
+  @property
+  def num_steps(self):
+    return self._num_steps
+
+  @property
+  def distribution0(self):
+    return self._distribution0
+
+  def _batch_shape(self):
+    return self.distribution0.batch_shape
+
+  def _batch_shape_tensor(self):
+    return self.distribution0.batch_shape_tensor()
+
+  def _event_shape(self):
+    return self.distribution0.event_shape
+
+  def _event_shape_tensor(self):
+    return self.distribution0.event_shape_tensor()
+
+  def _sample_n(self, n, seed=None):
+    if seed is None:
+      seed = distribution_util.gen_new_seed(
+          seed=np.random.randint(2**32 - 1),
+          salt="autoregressive")
+    samples = self.distribution0.sample(n, seed=seed)
+    for _ in range(self._num_steps):
+      samples = self.distribution_fn(samples).sample(seed=seed)
+    return samples
+
+  def _log_prob(self, value):
+    return self.distribution_fn(value).log_prob(value)
+
+  def _prob(self, value):
+    return self.distribution_fn(value).prob(value)
-- 
GitLab


From 8c08ef800149c51076431d63477faa48bd1e5877 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 23 Jan 2018 11:46:18 -0800
Subject: [PATCH 014/966] [XLA] Remove LiteralTestUtil::EqualTuple / NearTuple.

Instead, simply DTRT when you pass a tuple to Equal/Near.

PiperOrigin-RevId: 182968283
---
 .../xla/tests/client_library_test_base.cc     |   4 +-
 .../compiler/xla/tests/literal_test_util.cc   | 132 +++++-------------
 .../compiler/xla/tests/literal_test_util.h    |  31 ++--
 3 files changed, 47 insertions(+), 120 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 7c9494f133..a677986cd9 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -387,7 +387,7 @@ void ClientLibraryTestBase::ComputeAndCompareTuple(
     return;
   }
   auto actual = actual_status.ConsumeValueOrDie();
-  LiteralTestUtil::ExpectEqualTuple(expected, *actual);
+  LiteralTestUtil::ExpectEqual(expected, *actual);
 }
 
 void ClientLibraryTestBase::ComputeAndCompareTuple(
@@ -399,7 +399,7 @@ void ClientLibraryTestBase::ComputeAndCompareTuple(
     return;
   }
   auto actual = actual_status.ConsumeValueOrDie();
-  LiteralTestUtil::ExpectNearTuple(expected, *actual, error);
+  LiteralTestUtil::ExpectNear(expected, *actual, error);
 }
 
 void ClientLibraryTestBase::ComputeAndCompare(
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index dd91dbe8b9..975feaa050 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -313,6 +313,10 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
     case TUPLE: {
       bool tuple_match = true;
       for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
+        SCOPED_TRACE(tensorflow::strings::StrCat(
+            "Tuple index ", i, " in ",
+            ShapeUtil::HumanString(expected.shape())));
+
         // Create LiteralViews of the expected and actual elements.
         auto result = Equal(LiteralView::Create(expected, {i}),
                             LiteralView::Create(actual, {i}));
@@ -336,47 +340,6 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
   return result;
 }
 
-/* static */ ::testing::AssertionResult LiteralTestUtil::EqualTuple(
-    const Literal& expected, const Literal& actual) {
-  VLOG(1) << "expected: " << expected.ToString();
-  VLOG(1) << "actual:   " << actual.ToString();
-
-  if (!ShapeUtil::IsTuple(expected.shape()) ||
-      !ShapeUtil::IsTuple(actual.shape())) {
-    return ::testing::AssertionFailure()
-           << "tuples expected shape = " << expected.shape().ShortDebugString()
-           << " actual shape = " << actual.shape().ShortDebugString();
-  }
-  AssertEqualShapes(expected.shape(), actual.shape());
-
-  ::testing::AssertionResult err = ::testing::AssertionSuccess();
-  for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
-    SCOPED_TRACE(tensorflow::strings::StrCat(
-        "Tuple index ", i, " in ", ShapeUtil::HumanString(expected.shape())));
-    const auto expected_element = LiteralView::Create(expected, {i});
-    const auto actual_element = LiteralView::Create(actual, {i});
-
-    ::testing::AssertionResult res = [&] {
-      if (ShapeUtil::IsTuple(expected_element.shape())) {
-        return EqualTuple(expected_element, actual_element);
-      } else {
-        return Equal(expected_element, actual_element);
-      }
-    }();
-
-    if (!res && err) {
-      err = res;
-    }
-  }
-
-  return err;
-}
-
-/* static */ void LiteralTestUtil::ExpectEqualTuple(const Literal& expected,
-                                                    const Literal& actual) {
-  EXPECT_TRUE(EqualTuple(expected, actual));
-}
-
 namespace {
 
 // Helper class for comparing floating-point literals within an error bound.
@@ -613,10 +576,37 @@ bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::Near(
     const Literal& expected, const Literal& actual, const ErrorSpec& error) {
-  NearComparator comparator(error);
-  return comparator.ExpectNear(expected, actual)
-             ? ::testing::AssertionSuccess()
-             : ::testing::AssertionFailure() << "values were not near";
+  ::testing::AssertionResult err =
+      EqualShapes(expected.shape(), actual.shape());
+  if (!err) {
+    return err;
+  }
+
+  if (ShapeUtil::IsTuple(expected.shape())) {
+    for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
+      SCOPED_TRACE(tensorflow::strings::StrCat(
+          "Tuple index ", i, " in ", ShapeUtil::HumanString(expected.shape())));
+      const auto expected_element = LiteralView::Create(expected, {i});
+      const auto actual_element = LiteralView::Create(actual, {i});
+
+      ::testing::AssertionResult res =
+          Near(expected_element, actual_element, error);
+      if (err && !res) {
+        err = res;
+      }
+    }
+    return err;
+  }
+
+  if (ShapeUtil::ElementIsFloating(expected.shape()) ||
+      ShapeUtil::ElementIsComplex(expected.shape())) {
+    NearComparator comparator(error);
+    return comparator.ExpectNear(expected, actual)
+               ? ::testing::AssertionSuccess()
+               : ::testing::AssertionFailure() << "values were not near";
+  }
+
+  return Equal(expected, actual);
 }
 
 /* static */ void LiteralTestUtil::ExpectNear(const Literal& expected,
@@ -629,65 +619,13 @@ bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
               : tensorflow::strings::StrCat("\nmessage: ", message));
 }
 
-/* static */ ::testing::AssertionResult LiteralTestUtil::NearTuple(
-    const Literal& expected, const Literal& actual, const ErrorSpec& error) {
-  VLOG(1) << "expected: " << expected.ToString();
-  VLOG(1) << "actual:   " << actual.ToString();
-
-  if (!ShapeUtil::IsTuple(expected.shape()) ||
-      !ShapeUtil::IsTuple(actual.shape())) {
-    return ::testing::AssertionFailure()
-           << "tuples expected shape = " << expected.shape().ShortDebugString()
-           << " actual shape = " << actual.shape().ShortDebugString();
-  }
-  AssertEqualShapes(expected.shape(), actual.shape());
-
-  ::testing::AssertionResult err = ::testing::AssertionSuccess();
-  for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
-    SCOPED_TRACE(tensorflow::strings::StrCat(
-        "Tuple index ", i, " in ", ShapeUtil::HumanString(expected.shape())));
-    const auto expected_element = LiteralView::Create(expected, {i});
-    const auto actual_element = LiteralView::Create(actual, {i});
-
-    ::testing::AssertionResult res = [&] {
-      if (ShapeUtil::IsTuple(expected_element.shape())) {
-        return NearTuple(expected_element, actual_element, error);
-      } else if (ShapeUtil::ElementIsFloating(expected_element.shape())) {
-        return Near(expected_element, actual_element, error);
-      } else {
-        return Equal(expected_element, actual_element);
-      }
-    }();
-
-    if (err && !res) {
-      err = res;
-    }
-  }
-  return err;
-}
-
-/* static */ void LiteralTestUtil::ExpectNearTuple(const Literal& expected,
-                                                   const Literal& actual,
-                                                   const ErrorSpec& error) {
-  EXPECT_TRUE(NearTuple(expected, actual, error));
-}
-
 /*static*/ ::testing::AssertionResult LiteralTestUtil::NearOrEqual(
     const Literal& expected, const Literal& actual,
     const tensorflow::gtl::optional<ErrorSpec>& error) {
-  bool is_tuple = ShapeUtil::IsTuple(expected.shape());
   if (error.has_value()) {
-    if (is_tuple) {
-      VLOG(1) << "Expects near tuple";
-      return NearTuple(expected, actual, *error);
-    }
     VLOG(1) << "Expects near";
     return Near(expected, actual, *error);
   }
-  if (is_tuple) {
-    VLOG(1) << "Expects equal tuple";
-    return EqualTuple(expected, actual);
-  }
   VLOG(1) << "Expects equal";
   return Equal(expected, actual);
 }
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index f53553c701..9b0724262d 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -111,17 +111,18 @@ class LiteralTestUtil {
   static void ExpectR4EqualArray4D(const Array4D<NativeT>& expected,
                                    const Literal& actual);
 
-  // Returns whether the two tuples are equal.
-  static ::testing::AssertionResult EqualTuple(
-      const Literal& expected, const Literal& actual) TF_MUST_USE_RESULT;
-
-  // Expects that the values of the elements in the expected and actual tuples
-  // are equal. Tuples are matched recursively.
-  static void ExpectEqualTuple(const Literal& expected, const Literal& actual);
-
   // Asserts that the expected and actual literals are within the given error
   // bound for all elements. Also, asserts that the rank, dimensions sizes, and
-  // bounds are equivalent. Only supported for floating point values.
+  // bounds are equivalent.
+  //
+  // Tuples are matched recursively.  When comparing tensors of
+  // non-floating-point type, checks for exact equality, ignoring the ErroSpec.
+  //
+  // If the shape of the literals is neither a complex/floating-point tensor nor
+  // a tuple which contains a complex/floating-point tensor, Near() is
+  // equivalent to Equal().  We don't raise an error in this case, because we
+  // want to allow callers to call Near() even if they have no preconceptions
+  // about the shapes being compared.
   static ::testing::AssertionResult Near(
       const Literal& expected, const Literal& actual,
       const ErrorSpec& error) TF_MUST_USE_RESULT;
@@ -170,18 +171,6 @@ class LiteralTestUtil {
                                   const Literal& actual,
                                   const ErrorSpec& error);
 
-  // Returns whether the values of the elements in the expected and actual
-  // tuples are within the given error bound. Tuples are matched recursively.
-  // If the elements of the tuple are not floating-point types, the error spec
-  // is ignored and exact equality is checked.
-  static ::testing::AssertionResult NearTuple(
-      const Literal& expected, const Literal& actual,
-      const ErrorSpec& error) TF_MUST_USE_RESULT;
-
-  // Expects that the expected and actual values are near.
-  static void ExpectNearTuple(const Literal& expected, const Literal& actual,
-                              const ErrorSpec& error);
-
   // If the error spec is given, returns whether the expected and the actual are
   // within the error bound; otherwise, returns whether they are equal. Tuples
   // will be compared recursively.
-- 
GitLab


From 64d42bf9d1a952e4dd00283f396d09f3328a6127 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Tue, 23 Jan 2018 12:01:11 -0800
Subject: [PATCH 015/966] Correctly incref items put into the tuple to keep
 msan happy.

PiperOrigin-RevId: 182970703
---
 tensorflow/python/eager/pywrap_tfe_src.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index b3ce09ce90..6162644036 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1091,8 +1091,9 @@ bool MaybeRunRecordGradientCallback(const tensorflow::OpDef* op_def,
 
   PyObject* inputs = PyTuple_New(op_def->input_arg_size());
   for (int i = 0; i < op_def->input_arg_size(); i++) {
-    PyTuple_SET_ITEM(
-        inputs, i, PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex + i));
+    auto* input = PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex + i);
+    Py_INCREF(input);
+    PyTuple_SET_ITEM(inputs, i, input);
   }
 
   int args_size = PyTuple_GET_SIZE(args);
@@ -1100,9 +1101,10 @@ bool MaybeRunRecordGradientCallback(const tensorflow::OpDef* op_def,
       args_size - op_def->input_arg_size() - kFastPathExecuteInputStartIndex;
   PyObject* attrs = PyTuple_New(num_attrs);
   for (int i = 0; i < num_attrs; i++) {
-    PyTuple_SET_ITEM(attrs, i,
-                     PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex +
-                                                op_def->input_arg_size() + i));
+    auto* attr = PyTuple_GET_ITEM(
+        args, kFastPathExecuteInputStartIndex + op_def->input_arg_size() + i);
+    Py_INCREF(attr);
+    PyTuple_SET_ITEM(attrs, i, attr);
   }
 
   PyObject* callback_args = Py_BuildValue("OOO", inputs, attrs, result);
-- 
GitLab


From 93035bd163b7f609c0a2023259f5dfb826e1c033 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 12:14:52 -0800
Subject: [PATCH 016/966] Allow fisher-vector multiplies on 1D vectors for
 CategoricalLogitsNegativeLogProbLoss.

PiperOrigin-RevId: 182972901
---
 .../kernel_tests/loss_functions_test.py       | 36 +++++++++++++++++++
 .../contrib/kfac/python/ops/loss_functions.py |  7 ++--
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py b/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
index 63f45ea55b..ae787b6f1a 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
@@ -113,6 +113,42 @@ class CategoricalLogitsNegativeLogProbLossTest(test.TestCase):
       self.assertListEqual(loss.input_minibatches, tower_logits)
       self.assertEqual(loss.num_registered_minibatches, num_towers)
 
+  def testMultiplyFisherSingleVector(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      logits = np.array([1., 2., 3.])
+      loss = loss_functions.CategoricalLogitsNegativeLogProbLoss(logits)
+
+      # the LossFunction.multiply_fisher docstring only says it supports the
+      # case where the vector is the same shape as the input natural parameters
+      # (i.e. the logits here), but here we also test leading dimensions
+      vector = np.array([1., 2., 3.])
+      vectors = [vector, vector.reshape(1, -1), np.stack([vector] * 4)]
+
+      probs = np.exp(logits - np.logaddexp.reduce(logits))
+      fisher = np.diag(probs) - np.outer(probs, probs)
+
+      for vector in vectors:
+        result = loss.multiply_fisher(vector)
+        expected_result = np.dot(vector, fisher)
+        self.assertAllClose(expected_result, sess.run(result))
+
+  def testMultiplyFisherBatch(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      logits = np.array([[1., 2., 3.], [4., 6., 8.]])
+      loss = loss_functions.CategoricalLogitsNegativeLogProbLoss(logits)
+
+      vector = np.array([[1., 2., 3.], [5., 3., 1.]])
+
+      na = np.newaxis
+      probs = np.exp(logits - np.logaddexp.reduce(logits, axis=-1,
+                                                  keepdims=True))
+      fishers = probs[..., na] * np.eye(3) - probs[..., na] * probs[..., na, :]
+
+      result = loss.multiply_fisher(vector)
+      expected_result = np.matmul(vector[..., na, :], fishers)[..., 0, :]
+      self.assertEqual(sess.run(result).shape, logits.shape)
+      self.assertAllClose(expected_result, sess.run(result))
+
 
 class OnehotCategoricalLogitsNegativeLogProbLossTest(test.TestCase):
 
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions.py b/tensorflow/contrib/kfac/python/ops/loss_functions.py
index 2daead2a71..cb3e698b9c 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions.py
@@ -660,19 +660,20 @@ class CategoricalLogitsNegativeLogProbLoss(DistributionNegativeLogProbLoss,
 
   def multiply_fisher(self, vector):
     probs = self._probs
-    return vector * probs - math_ops.reduce_sum(vector * probs, axis=1) * probs
+    return vector * probs - probs * math_ops.reduce_sum(
+        vector * probs, axis=-1, keep_dims=True)
 
   def multiply_fisher_factor(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - probs * math_ops.reduce_sum(
-        sqrt_probs * vector, axis=1, keep_dims=True)
+        sqrt_probs * vector, axis=-1, keep_dims=True)
 
   def multiply_fisher_factor_transpose(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - sqrt_probs * math_ops.reduce_sum(
-        probs * vector, axis=1, keep_dims=True)
+        probs * vector, axis=-1, keep_dims=True)
 
   def multiply_fisher_factor_replicated_one_hot(self, index):
     assert len(index) == 1, "Length of index was {}".format(len(index))
-- 
GitLab


From b47119257b76bdec542121aaed056c8b010fd19f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 12:19:06 -0800
Subject: [PATCH 017/966] Update ops-related pbtxt files.

PiperOrigin-RevId: 182973494
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 152 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 152 ++++++++++++++++++
 2 files changed, 304 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 438882dc9e..c2e2793760 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -7903,6 +7903,76 @@ op {
     }
   }
 }
+op {
+  name: "Batch"
+  input_arg {
+    name: "in_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batched_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  attr {
+    name: "num_batch_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_batch_size"
+    type: "int"
+  }
+  attr {
+    name: "batch_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "allowed_batch_sizes"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "grad_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "batching_queue"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "BatchCholesky"
   input_arg {
@@ -64232,6 +64302,88 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "Unbatch"
+  input_arg {
+    name: "batched_tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "unbatched_tensor"
+    type_attr: "T"
+  }
+  attr {
+    name: "timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "UnbatchGrad"
+  input_arg {
+    name: "original_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "batched_grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "UniformCandidateSampler"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 45eb1a9cac..60da36a939 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -2737,6 +2737,76 @@ op {
     }
   }
 }
+op {
+  name: "Batch"
+  input_arg {
+    name: "in_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batched_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  attr {
+    name: "num_batch_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_batch_size"
+    type: "int"
+  }
+  attr {
+    name: "batch_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "allowed_batch_sizes"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "grad_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "batching_queue"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "BatchCholesky"
   input_arg {
@@ -30381,6 +30451,88 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "Unbatch"
+  input_arg {
+    name: "batched_tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "unbatched_tensor"
+    type_attr: "T"
+  }
+  attr {
+    name: "timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "UnbatchGrad"
+  input_arg {
+    name: "original_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "batched_grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "UniformCandidateSampler"
   input_arg {
-- 
GitLab


From ba4aec48268d02f111cd7e2c2666f4e7b077e68a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 12:24:21 -0800
Subject: [PATCH 018/966] Internal Change

PiperOrigin-RevId: 182974191
---
 .../lite/toco/allocate_transient_arrays.cc    |   8 +-
 tensorflow/contrib/lite/toco/dump_graphviz.cc |   8 +-
 .../contrib/lite/toco/export_tensorflow.cc    |  46 ++---
 .../convert_expanddims_to_reshape.cc          |   6 +-
 .../convert_pure_conv_to_depthwise.cc         |   2 +-
 .../convert_trivial_transpose_to_reshape.cc   |   4 +-
 .../create_im2col_arrays.cc                   |   2 +-
 .../toco/graph_transformations/dequantize.cc  |   4 +-
 .../graph_transformations/drop_fake_quant.cc  |   2 +-
 .../drop_im2col_arrays.cc                     |   2 +-
 .../fuse_activation_functions.cc              |   2 +-
 .../fuse_binary_into_following_affine.cc      |   4 +-
 .../fuse_binary_into_preceding_affine.cc      |   4 +-
 .../graph_transformations.cc                  |  19 +-
 .../identify_l2_normalization.cc              |  12 +-
 .../graph_transformations/identify_l2_pool.cc |   4 +-
 .../graph_transformations/identify_relu1.cc   |   6 +-
 .../propagate_array_data_types.cc             |  20 +-
 .../propagate_fixed_sizes.cc                  | 174 +++++++++---------
 .../toco/graph_transformations/quantize.cc    |   2 +-
 .../read_fake_quant_min_max.cc                |   2 +-
 .../remove_final_dequantize_op.cc             |   2 +-
 .../remove_trivial_binary.cc                  |   2 +-
 .../remove_trivial_concatenation_input.cc     |   2 +-
 .../remove_trivial_passthrough.cc             |   2 +-
 .../graph_transformations/remove_unused_op.cc |   6 +-
 .../resolve_batch_normalization.cc            |   6 +-
 .../resolve_batch_to_space_nd_attributes.cc   |   4 +-
 .../resolve_constant_binary.cc                |  11 +-
 .../resolve_constant_concatenation.cc         |   2 +-
 .../resolve_constant_fake_quant.cc            |   2 +-
 .../resolve_constant_fill.cc                  |   4 +-
 .../resolve_constant_range.cc                 |  14 +-
 .../resolve_constant_shape_or_rank.cc         |   2 +-
 .../resolve_constant_stack.cc                 |   2 +-
 .../resolve_constant_strided_slice.cc         |   2 +-
 .../resolve_constant_unary.cc                 |   2 +-
 .../resolve_mean_attributes.cc                |   2 +-
 .../resolve_pad_attributes.cc                 |   2 +-
 .../resolve_reorder_axes.cc                   |   2 +-
 .../resolve_reshape_attributes.cc             |   2 +-
 .../resolve_slice_attributes.cc               |   4 +-
 .../resolve_space_to_batch_nd_attributes.cc   |   5 +-
 .../resolve_strided_slice_attributes.cc       |   6 +-
 .../resolve_tensorflow_concat.cc              |   2 +-
 .../resolve_tensorflow_matmul.cc              |   4 +-
 .../resolve_tensorflow_merge.cc               |   2 +-
 .../resolve_tensorflow_switch.cc              |   4 +-
 .../resolve_tensorflow_tile.cc                |   4 +-
 .../resolve_transpose_attributes.cc           |   2 +-
 .../resolve_constant_concatenation_test.cc    |  19 +-
 .../unfuse_activation_functions.cc            |   2 +-
 .../contrib/lite/toco/import_tensorflow.cc    |   2 +-
 tensorflow/contrib/lite/toco/model.h          |  39 +++-
 tensorflow/contrib/lite/toco/tflite/export.cc |   4 +-
 .../contrib/lite/toco/tflite/import_test.cc   |   2 +-
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   2 +-
 tensorflow/contrib/lite/toco/tooling_util.cc  |  49 ++---
 58 files changed, 288 insertions(+), 270 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index d4da8f5dfe..5961d30bf5 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -148,7 +148,7 @@ std::size_t TransientArraySize(const Model& model, const string& array_name,
   if (!IsAllocatableTransientArray(model, array_name)) {
     return 0;
   }
-  const auto& array = model.arrays.at(array_name);
+  const auto& array = &model.GetArray(array_name);
   CHECK(array->has_shape())
       << "Array '" << array_name << "' doesn't have a shape";
   if (array->data_type == ArrayDataType::kNone) {
@@ -185,7 +185,7 @@ void AllocateTransientArray(const Model& model, const string& array_name,
   }
   const std::size_t size =
       TransientArraySize(model, array_name, transient_data_alignment);
-  const auto& array = model.arrays.at(array_name);
+  const auto& array = &model.GetArray(array_name);
   CHECK(!array->alloc);
   allocator->Allocate(size, &array->GetOrCreateAlloc());
 }
@@ -197,7 +197,7 @@ void DeallocateTransientArray(const Model& model, const string& array_name,
   if (!IsAllocatableTransientArray(model, array_name)) {
     return;
   }
-  const auto& array = model.arrays.at(array_name);
+  const auto& array = &model.GetArray(array_name);
   CHECK(!!array->alloc);
   allocator->Deallocate(*array->alloc);
 }
@@ -231,7 +231,7 @@ void AllocateTransientArrays(Model* model,
   // Construct a sorted map of array names, so that other layout engines can
   // match exactly.
   std::map<string, const Array*> ordered_arrays_map;
-  for (const auto& pair : model->arrays) {
+  for (const auto& pair : model->GetArrayMap()) {
     ordered_arrays_map[pair.first] = pair.second.get();
   }
 
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc
index 39809216c7..c726eb6d86 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.cc
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc
@@ -278,8 +278,8 @@ std::vector<const Operator*> OperatorsToDump(const Model& model) {
   if (last_specified) {
     // Return only the part of the graph between graphviz_first_array
     // and graphviz_last_array.
-    CHECK(model.arrays.count(dump_options.graphviz_first_array));
-    CHECK(model.arrays.count(dump_options.graphviz_last_array));
+    CHECK(model.HasArray(dump_options.graphviz_first_array));
+    CHECK(model.HasArray(dump_options.graphviz_last_array));
     std::unordered_set<string> arrays_already_produced;
     std::vector<string> arrays_to_produce;
     arrays_to_produce.push_back(dump_options.graphviz_last_array);
@@ -336,7 +336,7 @@ void DumpGraphviz(const Model& model, string* output_file_contents) {
             op_properties.color.TextColorString().c_str());
     // Add nodes and edges for all inputs of the operator.
     for (const auto& input : op.inputs) {
-      if (model.arrays.count(input) == 0) {
+      if (!model.HasArray(input)) {
         // Arrays should _always_ exist. Except, perhaps, during development.
         continue;
       }
@@ -352,7 +352,7 @@ void DumpGraphviz(const Model& model, string* output_file_contents) {
     }
     // Add nodes and edges for all outputs of the operator.
     for (const auto& output : op.outputs) {
-      if (model.arrays.count(output) == 0) {
+      if (!model.HasArray(output)) {
         // Arrays should _always_ exist. Except, perhaps, during development.
         continue;
       }
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 90fa442746..4fc01dbc20 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -156,8 +156,8 @@ void ConvertFloatTensorConst(const Model& model, const string& name,
   const_op->set_name(name);
   (*const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
   auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
-  CHECK(model.arrays.count(name));
-  const auto& input_array = *model.arrays.at(name);
+  CHECK(model.HasArray(name));
+  const auto& input_array = model.GetArray(name);
   const auto& input_shape = input_array.shape();
   CHECK(input_array.buffer);
   CHECK(input_array.buffer->type == ArrayDataType::kFloat);
@@ -177,8 +177,8 @@ void ConvertFloatTensorConst(const Model& model, const string& name,
   const_op->set_name(name);
   (*const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
   auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
-  CHECK(model.arrays.count(name));
-  const auto& input_array = *model.arrays.at(name);
+  CHECK(model.HasArray(name));
+  const auto& input_array = model.GetArray(name);
   const auto& input_shape = input_array.shape();
   CHECK(input_array.buffer);
   CHECK(input_array.buffer->type == ArrayDataType::kFloat);
@@ -193,8 +193,8 @@ void ConvertIntTensorConst(const Model& model, const string& name,
   if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
     return;
   }
-  CHECK(model.arrays.count(name));
-  const auto& array = *model.arrays.at(name);
+  CHECK(model.HasArray(name));
+  const auto& array = model.GetArray(name);
   auto* const_op = tensorflow_graph->add_node();
   const_op->set_op("Const");
   const_op->set_name(name);
@@ -324,7 +324,7 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op,
     biasadd_op->add_input(conv_output);
     biasadd_op->add_input(src_op.inputs[2]);
     (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT);
-    CHECK(model.arrays.count(src_op.inputs[2]));
+    CHECK(model.HasArray(src_op.inputs[2]));
     const string& bias_array_name =
         WalkUpToConstantArray(model, src_op.inputs[2]);
     const auto& bias_array = model.GetArray(bias_array_name);
@@ -361,7 +361,7 @@ void ConvertDepthwiseConvOperator(const Model& model,
   // We need to convert that to H x W x InputDepth x Multiplier.
   // That's only a matter of constructing a Dims object; the actual
   // array layout is the same.
-  CHECK(model.arrays.count(src_op.inputs[1]));
+  CHECK(model.HasArray(src_op.inputs[1]));
   const string& src_weights_name =
       WalkUpToConstantArray(model, src_op.inputs[1]);
   const auto& src_weights_array = model.GetArray(src_weights_name);
@@ -404,7 +404,7 @@ void ConvertDepthwiseConvOperator(const Model& model,
     biasadd_op->add_input(conv_output);
     biasadd_op->add_input(src_op.inputs[2]);
     (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT);
-    CHECK(model.arrays.count(src_op.inputs[2]));
+    CHECK(model.HasArray(src_op.inputs[2]));
     const string& bias_name = WalkUpToConstantArray(model, src_op.inputs[2]);
     const auto& bias_array = model.GetArray(bias_name);
     // TODO(b/62904716) Bias arrays should be 1-D, and used directly.
@@ -469,10 +469,10 @@ void ConvertFullyConnectedOperator(const Model& model,
   (*matmul_op->mutable_attr())["T"].set_type(DT_FLOAT);
   (*matmul_op->mutable_attr())["transpose_a"].set_b(false);
   (*matmul_op->mutable_attr())["transpose_b"].set_b(false);
-  CHECK(model.arrays.count(src_op.inputs[1]));
+  CHECK(model.HasArray(src_op.inputs[1]));
   const string& fc_weights_name =
       WalkUpToConstantArray(model, src_op.inputs[1]);
-  const auto& fc_weights_array = *model.arrays.at(fc_weights_name);
+  const auto& fc_weights_array = model.GetArray(fc_weights_name);
   const auto& fc_weights_shape = fc_weights_array.shape();
   CHECK_EQ(fc_weights_shape.dimensions_count(), 2);
   CreateMatrixShapeTensorConst(reshape_shape, fc_weights_shape.dims(1), -1,
@@ -492,8 +492,8 @@ void ConvertFullyConnectedOperator(const Model& model,
     biasadd_op->add_input(matmul_output);
     biasadd_op->add_input(src_op.inputs[2]);
     (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT);
-    CHECK(model.arrays.count(src_op.inputs[2]));
-    const auto& bias_array = *model.arrays.at(src_op.inputs[2]);
+    CHECK(model.HasArray(src_op.inputs[2]));
+    const auto& bias_array = model.GetArray(src_op.inputs[2]);
     // TODO(b/62904716) Bias arrays should be 1-D, and used directly.
     Shape bias_shape_1d = bias_array.shape();
     UnextendShape(&bias_shape_1d, 1);
@@ -625,7 +625,7 @@ void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op,
     *reshape_op->add_input() = softmax_size;
     (*reshape_op->mutable_attr())["T"].set_type(DT_FLOAT);
 
-    const auto& input_shape = model.arrays.at(src_op.inputs[0])->shape();
+    const auto& input_shape = model.GetArray(src_op.inputs[0]).shape();
     int32 flattened_size = 1;
     for (int i = 0; i < input_shape.dimensions_count() - 1; ++i) {
       flattened_size *= input_shape.dims(i);
@@ -1013,8 +1013,8 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op,
   // Op names have been chosen to match the tf.slim LSTM naming
   // as closely as possible.
   const int axis =
-      model.arrays.at(src_op.inputs[LstmCellOperator::PREV_ACTIV_INPUT])
-          ->shape()
+      model.GetArray(src_op.inputs[LstmCellOperator::PREV_ACTIV_INPUT])
+          .shape()
           .dimensions_count() -
       1;
   // Note that DATA_INPUT may have extra size 1 dimensions, but TF concat
@@ -1033,9 +1033,9 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op,
 
   // Write weights
   const string weights_output = base + "weights";
-  CHECK(model.arrays.count(src_op.inputs[LstmCellOperator::WEIGHTS_INPUT]));
+  CHECK(model.HasArray(src_op.inputs[LstmCellOperator::WEIGHTS_INPUT]));
   const auto& weights_array =
-      *model.arrays.at(src_op.inputs[LstmCellOperator::WEIGHTS_INPUT]);
+      model.GetArray(src_op.inputs[LstmCellOperator::WEIGHTS_INPUT]);
   // Convert 4D FullyConnected weights into 2D matrix
   const auto& weights_shape = weights_array.shape();
   CHECK_EQ(weights_shape.dimensions_count(), 2);
@@ -1059,9 +1059,9 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op,
 
   // Write biases
   const string biases_output = base + "biases";
-  CHECK(model.arrays.count(src_op.inputs[LstmCellOperator::BIASES_INPUT]));
+  CHECK(model.HasArray(src_op.inputs[LstmCellOperator::BIASES_INPUT]));
   const auto& bias_array =
-      *model.arrays.at(src_op.inputs[LstmCellOperator::BIASES_INPUT]);
+      model.GetArray(src_op.inputs[LstmCellOperator::BIASES_INPUT]);
   // TODO(b/62904716) Bias arrays should be 1-D, and used directly.
   Shape bias_shape_1d = bias_array.shape();
   UnextendShape(&bias_shape_1d, 1);
@@ -1557,7 +1557,7 @@ void AddPlaceholderForRNNState(const Model& model, const string& name, int size,
   (*placeholder->mutable_attr())["dtype"].set_type(DT_FLOAT);
 
   auto* shape = (*placeholder->mutable_attr())["shape"].mutable_shape();
-  const auto& state_array = *model.arrays.at(name);
+  const auto& state_array = model.GetArray(name);
   if (state_array.has_shape()) {
     const auto& state_shape = state_array.shape();
     const int kDims = state_shape.dimensions_count();
@@ -1574,7 +1574,7 @@ void ExportTensorFlowGraphDefImplementation(const Model& model,
                                             GraphDef* tensorflow_graph) {
   for (const auto& input_array : model.flags.input_arrays()) {
     AddPlaceholder(input_array.name(),
-                   model.arrays.at(input_array.name())->data_type,
+                   model.GetArray(input_array.name()).data_type,
                    tensorflow_graph);
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
@@ -1588,7 +1588,7 @@ void ExportTensorFlowGraphDefImplementation(const Model& model,
   // by the above operators export. It's important that this comes
   // after, as some operators need to export arrays that they reference
   // in a specific way, rather than in the generic way done below.
-  for (const auto& array_pair : model.arrays) {
+  for (const auto& array_pair : model.GetArrayMap()) {
     const string& array_name = array_pair.first;
     const auto& array = *array_pair.second;
     if (array.buffer) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
index 3bde9b0169..56f48d47de 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
@@ -35,7 +35,7 @@ bool ConvertExpandDimsToReshape::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(expand_op->inputs.size(), 2);
   CHECK_EQ(expand_op->outputs.size(), 1);
 
-  const auto& input_array = *model->arrays[expand_op->inputs[0]];
+  const auto& input_array = model->GetArray(expand_op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
     return false;
@@ -46,7 +46,7 @@ bool ConvertExpandDimsToReshape::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
-  const auto& axis_array = *model->arrays[expand_op->inputs[1]];
+  const auto& axis_array = model->GetArray(expand_op->inputs[1]);
   if (!axis_array.has_shape()) {
     // Yield until input axis array shape has been resolved.
     return false;
@@ -86,7 +86,7 @@ bool ConvertExpandDimsToReshape::Run(Model* model, std::size_t op_index) {
   if (IsDiscardableArray(*model, axis_array_name) &&
       CountOpsWithInput(*model, axis_array_name) == 1 &&
       !GetOpWithOutput(*model, axis_array_name)) {
-    model->arrays.erase(axis_array_name);
+    model->EraseArray(axis_array_name);
   }
 
   // Replace the operator in the graph.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
index bf454c40c7..d38db85280 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
@@ -58,7 +58,7 @@ bool ConvertPureConvToDepthwise::Run(Model* model, std::size_t op_index) {
   depthwiseconv_op->outputs = {conv_op->outputs[0]};
   if (conv_op->outputs.size() > 1) {
     // delete the im2col array.
-    model->arrays.erase(conv_op->outputs[1]);
+    model->EraseArray(conv_op->outputs[1]);
   }
   depthwiseconv_op->fused_activation_function =
       conv_op->fused_activation_function;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
index a234c20924..c2b166033c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
@@ -29,7 +29,7 @@ bool ConvertTrivialTransposeToReshape::Run(Model* model, std::size_t op_index) {
   TransposeOperator* transpose_op =
       static_cast<TransposeOperator*>(transpose_it->get());
 
-  const auto& output_array = *model->arrays[transpose_op->outputs[0]];
+  const auto& output_array = model->GetArray(transpose_op->outputs[0]);
   if (!output_array.has_shape()) {
     // Yield until PropagateFixedSizes has been run on this op.
     return false;
@@ -70,7 +70,7 @@ bool ConvertTrivialTransposeToReshape::Run(Model* model, std::size_t op_index) {
   // Delete perm array if unused
   if (IsDiscardableArray(*model, perm_array_name) &&
       CountOpsWithInput(*model, perm_array_name) == 1) {
-    model->arrays.erase(perm_array_name);
+    model->EraseArray(perm_array_name);
   }
 
   // Replace the operator in the graph.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc b/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
index 1735b51e5b..076415ece8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
@@ -35,7 +35,7 @@ bool CreateIm2colArrays::Run(Model* model, std::size_t op_index) {
     // We already have an im2col array
     return false;
   }
-  const auto& weights_array = *model->arrays[conv_op->inputs[1]];
+  const auto& weights_array = model->GetArray(conv_op->inputs[1]);
   if (!weights_array.has_shape()) {
     // We need to yield until weights dims have been resolved, because
     // from the weights dims we determine whether an im2col array is
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc
index 79854cba34..498c864bde 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc
@@ -53,7 +53,7 @@ std::vector<std::unique_ptr<Operator>>::iterator FindFirstOpWithInput(
 }
 
 void ClearArrayQuantizationParams(const string& array_name, Model* model) {
-  auto* array = model->arrays.at(array_name).get();
+  auto* array = &model->GetArray(array_name);
   CHECK(array->quantization_params);
   for (auto& input_array : *model->flags.mutable_input_arrays()) {
     if (input_array.name() == array_name) {
@@ -77,7 +77,7 @@ void ClearArrayQuantizationParams(const string& array_name, Model* model) {
 
 bool DequantizeArray(const string& array_name,
                      GraphTransformation* transformation, Model* model) {
-  auto* array = model->arrays.at(array_name).get();
+  auto* array = &model->GetArray(array_name);
   if (!array->quantization_params) {
     return false;
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc
index fea360740f..95558ef5ec 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc
@@ -45,7 +45,7 @@ bool DropFakeQuant::Run(Model* model, std::size_t op_index) {
   // Drop min/max inputs
   for (int i = 1; i < fakequant_op->inputs.size(); i++) {
     if (CountOpsWithInput(*model, fakequant_op->inputs[i]) == 1) {
-      model->arrays.erase(fakequant_op->inputs[i]);
+      model->EraseArray(fakequant_op->inputs[i]);
     }
   }
   fakequant_op->inputs.resize(1);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc b/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc
index a3ed6663bc..f7fd878b7e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc
@@ -32,7 +32,7 @@ bool DropIm2colArrays::Run(Model* model, std::size_t op_index) {
 
   // Drop the im2col array.
   CHECK_EQ(conv_op->outputs.size(), 2);
-  model->arrays.erase(conv_op->outputs[1]);
+  model->EraseArray(conv_op->outputs[1]);
   conv_op->outputs.resize(1);
   AddMessageF("Dropped an im2col array for %s", LogName(*conv_op));
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
index ad4a6f9b78..88e59664ec 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
@@ -91,7 +91,7 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
   } else {
     LOG(FATAL) << "Unhandled activation function type";
   }
-  model->arrays.erase(ac_op->inputs[0]);
+  model->EraseArray(ac_op->inputs[0]);
   op->outputs[0] = ac_op->outputs[0];
   model->operators.erase(ac_it);
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
index 4619d8bbee..dcbbead517 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
@@ -285,13 +285,13 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
   AddMessageF("Fusing %s into the following %s", LogName(*binary_op),
               LogName(*following_op));
 
-  model->arrays.erase(binary_op->outputs[0]);
+  model->EraseArray(binary_op->outputs[0]);
   following_op->inputs[0] = binary_op->inputs[index_of_variable_input];
   const auto& old_constant_param_name =
       binary_op->inputs[index_of_constant_input];
   CHECK(IsConstantParameterArray(*model, old_constant_param_name));
   if (CountOpsWithInput(*model, old_constant_param_name) == 1) {
-    model->arrays.erase(old_constant_param_name);
+    model->EraseArray(old_constant_param_name);
   }
   model->operators.erase(binary_it);
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
index 8948653ec3..5b57178b18 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
@@ -309,7 +309,7 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
     LOG(FATAL) << "should not get here";
   }
 
-  model->arrays.erase(preceding_op->outputs[0]);
+  model->EraseArray(preceding_op->outputs[0]);
   preceding_op->outputs[0] = binary_op->outputs[0];
   preceding_op->fused_activation_function =
       binary_op->fused_activation_function;
@@ -317,7 +317,7 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
       binary_op->inputs[index_of_constant_input];
   CHECK(IsConstantParameterArray(*model, old_constant_param_name));
   if (CountOpsWithInput(*model, old_constant_param_name) == 1) {
-    model->arrays.erase(old_constant_param_name);
+    model->EraseArray(old_constant_param_name);
   }
   model->operators.erase(binary_it);
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
index f861c4147a..2340f0e850 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
@@ -31,13 +31,13 @@ namespace {
 
 void PrintModelStats(const string& label, const Model& model) {
   int quantized_arrays = 0;
-  for (const auto& array : model.arrays) {
+  for (const auto& array : model.GetArrayMap()) {
     if (array.second->quantization_params) {
       quantized_arrays++;
     }
   }
   LOG(INFO) << label << ": " << model.operators.size() << " operators, "
-            << model.arrays.size() << " arrays (" << quantized_arrays
+            << model.GetArrayMap().size() << " arrays (" << quantized_arrays
             << " quantized)";
 }
 
@@ -91,14 +91,9 @@ void DiscardUselessConnectedComponentsAndRNNBackEdges(Model* model) {
     }
   } while (found_new_useful_arrays);
   // Erase arrays that aren't useful, and that are discardable.
-  for (auto it = model->arrays.begin(); it != model->arrays.end();) {
-    if (useful_arrays.count(it->first) ||
-        !IsDiscardableArray(*model, it->first)) {
-      ++it;
-    } else {
-      it = model->arrays.erase(it);
-    }
-  }
+  model->EraseArrays([&](const string& name) {
+    return (!useful_arrays.count(name) && IsDiscardableArray(*model, name));
+  });
   // Erase operators that do not produce a useful output array.
   for (auto it = model->operators.begin(); it != model->operators.end();) {
     // Only need to test the first output, as we simultaneously added all of
@@ -118,8 +113,8 @@ void DiscardUselessConnectedComponentsAndRNNBackEdges(Model* model) {
   std::vector<RnnState> rnn_states_to_keep;
   for (const auto& rnn_state : model->flags.rnn_states()) {
     const bool dangling =
-        !model->arrays.count(rnn_state.back_edge_source_array()) ||
-        !model->arrays.count(rnn_state.state_array());
+        !model->HasArray(rnn_state.back_edge_source_array()) ||
+        !model->HasArray(rnn_state.state_array());
     if (dangling) {
       CHECK(rnn_state.discardable());
     } else {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc
index 01b75e37c6..419a0776a6 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc
@@ -150,19 +150,19 @@ bool IdentifyL2Normalization::Run(Model* model, std::size_t op_index) {
 
   // Erase the subgraph that is now replaced by L2Normalization
   model->operators.erase(FindOperator(model, square_op));
-  model->arrays.erase(sum_op->inputs[0]);
+  model->EraseArray(sum_op->inputs[0]);
   if (sum_op->inputs.size() > 1) {
-    model->arrays.erase(sum_op->inputs[1]);
+    model->EraseArray(sum_op->inputs[1]);
   }
   model->operators.erase(FindOperator(model, sum_op));
   if (add_op) {
-    model->arrays.erase(add_op->inputs[0]);
-    model->arrays.erase(add_op->inputs[1]);
+    model->EraseArray(add_op->inputs[0]);
+    model->EraseArray(add_op->inputs[1]);
     model->operators.erase(FindOperator(model, add_op));
   }
-  model->arrays.erase(sqrt_or_rsqrt_op->inputs[0]);
+  model->EraseArray(sqrt_or_rsqrt_op->inputs[0]);
   model->operators.erase(FindOperator(model, sqrt_or_rsqrt_op));
-  model->arrays.erase(div_or_mul_op->inputs[1]);
+  model->EraseArray(div_or_mul_op->inputs[1]);
   model->operators.erase(FindOperator(model, div_or_mul_op));
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc
index 1865416fc2..e4d52476c6 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc
@@ -92,8 +92,8 @@ bool IdentifyL2Pool::Run(Model* model, std::size_t op_index) {
   AddMessageF("Creating %s replacing equivalent subgraph", LogName(*l2pool_op));
 
   // Erase intermediate arrays, keeping input to square op.
-  model->arrays.erase(avpool_op->inputs[0]);
-  model->arrays.erase(sqrt_op->inputs[0]);
+  model->EraseArray(avpool_op->inputs[0]);
+  model->EraseArray(sqrt_op->inputs[0]);
 
   // Erase three operators being replaced.
   model->operators.erase(FindOperator(model, square_op));
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc
index cfc77024e7..d36e950609 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc
@@ -89,12 +89,12 @@ bool IdentifyRelu1::Run(Model* model, std::size_t op_index) {
   AddMessageF("Creating %s replacing equivalent subgraph", LogName(*relu1_op));
 
   // Erase Maximum scalar input & operator
-  model->arrays.erase(maximum_op->inputs[scalar_input_index]);
+  model->EraseArray(maximum_op->inputs[scalar_input_index]);
   model->operators.erase(FindOperator(model, maximum_op));
 
   // Erase Minimum inputs & operator
-  model->arrays.erase(minimum_op->inputs[0]);
-  model->arrays.erase(minimum_op->inputs[1]);
+  model->EraseArray(minimum_op->inputs[0]);
+  model->EraseArray(minimum_op->inputs[1]);
   model->operators.erase(FindOperator(model, minimum_op));
 
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index 29b55d9bfc..f0d107232b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -27,7 +27,7 @@ namespace {
 void SetDataTypeForAllOutputs(Model* model, Operator* op,
                               ArrayDataType data_type) {
   for (const auto& output : op->outputs) {
-    model->arrays[output]->data_type = data_type;
+    model->GetArray(output).data_type = data_type;
   }
 }
 }  // namespace
@@ -39,7 +39,7 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
   // If the data type of some input is unknown, we need to yield.
   for (const auto& input : op->inputs) {
     if (!model->IsOptionalArray(input) &&
-        model->arrays[input]->data_type == ArrayDataType::kNone) {
+        model->GetArray(input).data_type == ArrayDataType::kNone) {
       return false;
     }
   }
@@ -47,7 +47,7 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
   // end if we changed anything, and return the correct boolean value.
   std::unordered_map<string, ArrayDataType> old_output_data_types;
   for (const auto& output : op->outputs) {
-    old_output_data_types[output] = model->arrays[output]->data_type;
+    old_output_data_types[output] = model->GetArray(output).data_type;
   }
   // Do the actual output data types propagation.
   if (op->type == OperatorType::kDequantize ||
@@ -69,18 +69,18 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
              op->type == OperatorType::kFill) {
     // These operators produce an output with the same type as their 2nd input
     CHECK_GE(op->inputs.size(), 2);
-    const ArrayDataType data_type = model->arrays[op->inputs[1]]->data_type;
+    const ArrayDataType data_type = model->GetArray(op->inputs[1]).data_type;
     SetDataTypeForAllOutputs(model, op, data_type);
   } else if (op->type == OperatorType::kCast) {
     // Data type of the Cast op is specified.
     CHECK_EQ(op->outputs.size(), 1);
     auto* cast_op = static_cast<CastOperator*>(op);
-    model->arrays[op->outputs[0]]->data_type = cast_op->dst_data_type;
+    model->GetArray(op->outputs[0]).data_type = cast_op->dst_data_type;
   } else if (op->type == OperatorType::kArgMax) {
     // Data type of the ArgMax op is specified.
     CHECK_EQ(op->outputs.size(), 1);
     auto* argmax_op = static_cast<ArgMaxOperator*>(op);
-    model->arrays[op->outputs[0]]->data_type = argmax_op->output_data_type;
+    model->GetArray(op->outputs[0]).data_type = argmax_op->output_data_type;
   } else if (op->type == OperatorType::kRange) {
     auto* range_op = static_cast<RangeOperator*>(op);
     // Output type of the Range op can be set via an attribute
@@ -91,7 +91,7 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
     } else {
       // Otherwise use the first input
       CHECK_GE(op->inputs.size(), 1);
-      data_type = model->arrays[op->inputs[0]]->data_type;
+      data_type = model->GetArray(op->inputs[0]).data_type;
     }
     CHECK_EQ(op->outputs.size(), 1);
     SetDataTypeForAllOutputs(model, op, data_type);
@@ -103,7 +103,7 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
     for (int i = 0; i < unsupported_op->output_data_types.size(); ++i) {
       auto output = op->outputs[i];
       auto data_type = unsupported_op->output_data_types[i];
-      model->arrays[output]->data_type = data_type;
+      model->GetArray(output).data_type = data_type;
     }
   } else if (op->type == OperatorType::kExpandDims) {
     // Yield on ExpandDim until it is converted to Reshape
@@ -111,12 +111,12 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
   } else {
     // These operators produce outputs with the same type as their 1st input
     CHECK_GT(op->inputs.size(), 0);
-    const ArrayDataType data_type = model->arrays[op->inputs[0]]->data_type;
+    const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
     SetDataTypeForAllOutputs(model, op, data_type);
   }
   // Return true if any output data type changed, false if none changed.
   for (const auto& output : op->outputs) {
-    if (old_output_data_types[output] != model->arrays[output]->data_type) {
+    if (old_output_data_types[output] != model->GetArray(output).data_type) {
       return true;
     }
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index a939efb4db..ff0a3bd881 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -85,7 +85,7 @@ void ComputeBinaryOperatorOutputSize(const Shape& input_shape1,
 
 int GetOutputDepthFromWeights(const Model& model, const Operator& op) {
   const string& weights_name = op.inputs[1];
-  const auto& weights_shape = model.arrays.at(weights_name)->shape();
+  const auto& weights_shape = model.GetArray(weights_name).shape();
   if (op.type == OperatorType::kConv ||
       op.type == OperatorType::kFullyConnected) {
     return weights_shape.dims(0);
@@ -98,7 +98,7 @@ int GetOutputDepthFromWeights(const Model& model, const Operator& op) {
 
 bool EnsureBiasVectorShape(Model* model, Operator* op) {
   const string& weights_name = op->inputs[1];
-  const auto& weights_array = *model->arrays[weights_name];
+  const auto& weights_array = model->GetArray(weights_name);
   // Yield until weights shape has been resolved.
   if (!weights_array.has_shape()) {
     return false;
@@ -107,7 +107,7 @@ bool EnsureBiasVectorShape(Model* model, Operator* op) {
   if (op->inputs.size() < 3) {
     return false;
   }
-  auto& bias_array = *model->arrays[op->inputs[2]];
+  auto& bias_array = model->GetArray(op->inputs[2]);
   if (bias_array.has_shape()) {
     return true;
   }
@@ -126,7 +126,7 @@ void ProcessConvOperator(Model* model, ConvOperator* op) {
     return;
   }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -134,7 +134,7 @@ void ProcessConvOperator(Model* model, ConvOperator* op) {
   const auto& input_shape = input_array.shape();
   CHECK_EQ(input_shape.dimensions_count(), 4);
 
-  const auto& weights_array = *model->arrays[op->inputs[1]];
+  const auto& weights_array = model->GetArray(op->inputs[1]);
   // Yield until weights dims have been resolved.
   if (!weights_array.has_shape()) {
     return;
@@ -156,7 +156,7 @@ void ProcessConvOperator(Model* model, ConvOperator* op) {
   if (op->outputs.size() == 2) {
     const auto& output_shape = output_array.shape();
     const int input_depth = weights_shape.dims(3);
-    auto& im2col_array = *model->arrays[op->outputs[1]];
+    auto& im2col_array = model->GetArray(op->outputs[1]);
     im2col_array.copy_shape(Shape{output_shape.dims(0), output_shape.dims(1),
                                   output_shape.dims(2),
                                   input_depth * kheight * kwidth});
@@ -168,7 +168,7 @@ void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) {
     return;
   }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -176,7 +176,7 @@ void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) {
   const auto& input_shape = input_array.shape();
   CHECK_EQ(input_shape.dimensions_count(), 4);
 
-  const auto& weights_array = *model->arrays[op->inputs[1]];
+  const auto& weights_array = model->GetArray(op->inputs[1]);
   // Yield until weights dims have been resolved.
   if (!weights_array.has_shape()) {
     return;
@@ -209,7 +209,7 @@ void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) {
 }
 
 void ProcessDepthToSpaceOperator(Model* model, DepthToSpaceOperator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -232,7 +232,7 @@ void ProcessDepthToSpaceOperator(Model* model, DepthToSpaceOperator* op) {
 }
 
 void ProcessSpaceToDepthOperator(Model* model, SpaceToDepthOperator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -258,7 +258,7 @@ void ProcessSpaceToDepthOperator(Model* model, SpaceToDepthOperator* op) {
 void ProcessFillOperator(Model* model, FillOperator* op) {
   CHECK_EQ(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
     // We have already run
     return;
@@ -287,7 +287,7 @@ void ProcessFullyConnectedOperator(Model* model, FullyConnectedOperator* op) {
     return;
   }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -295,7 +295,7 @@ void ProcessFullyConnectedOperator(Model* model, FullyConnectedOperator* op) {
   const auto& input_shape = input_array.shape();
   CHECK_GE(input_shape.dimensions_count(), 1);
 
-  const auto& weights_array = *model->arrays[op->inputs[1]];
+  const auto& weights_array = model->GetArray(op->inputs[1]);
   // Yield until weights dims have been resolved.
   if (!weights_array.has_shape()) {
     return;
@@ -315,13 +315,13 @@ void ProcessFullyConnectedOperator(Model* model, FullyConnectedOperator* op) {
 
 void ProcessTensorFlowReshapeOperator(Model* model,
                                       TensorFlowReshapeOperator* op) {
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
     // We have already run
     return;
   }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
     return;
@@ -377,14 +377,14 @@ void ProcessTensorFlowReshapeOperator(Model* model,
 }
 
 void ProcessSimpleOperator(Model* model, Operator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
   }
 
   const string& output_name = op->outputs[0];
-  auto& output_array = *model->arrays[output_name];
+  auto& output_array = model->GetArray(output_name);
   if (output_array.has_shape()) {
     return;
   }
@@ -394,14 +394,14 @@ void ProcessSimpleOperator(Model* model, Operator* op) {
 
 void ProcessSimpleBinaryOperator(Model* model, Operator* op) {
   CHECK_EQ(op->inputs.size(), 2);
-  const auto& input0_array = *model->arrays[op->inputs[0]];
-  const auto& input1_array = *model->arrays[op->inputs[1]];
+  const auto& input0_array = model->GetArray(op->inputs[0]);
+  const auto& input1_array = model->GetArray(op->inputs[1]);
   // Yield until input dims have been resolved.
   if (!input0_array.has_shape() || !input1_array.has_shape()) {
     return;
   }
   const string& output_name = op->outputs[0];
-  auto& output_array = *model->arrays[output_name];
+  auto& output_array = model->GetArray(output_name);
   ComputeBinaryOperatorOutputSize(input0_array.shape(), input1_array.shape(),
                                   &output_array);
 }
@@ -424,11 +424,11 @@ bool KeepDims(const Operator& op) {
 
 void ProcessTensorFlowReductionOperator(Model* model, Operator* op) {
   CHECK_LE(op->inputs.size(), 2);
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
     return;
   }
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     return;
   }
@@ -436,7 +436,7 @@ void ProcessTensorFlowReductionOperator(Model* model, Operator* op) {
   const bool keep_dims = KeepDims(*op);
   if (op->inputs.size() == 2) {
     // There is a reduction_indices input.
-    const auto& reduction_array = *model->arrays[op->inputs[1]];
+    const auto& reduction_array = model->GetArray(op->inputs[1]);
     if (!reduction_array.buffer) {
       return;
     }
@@ -476,11 +476,11 @@ void ProcessSliceOperator(Model* model, SliceOperator* op) {
   if (op->begin.empty()) return;
 
   // Yield until input dims have been resolved.
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) return;
   const Shape& input_shape = input_array.shape();
 
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) return;
 
   CHECK_EQ(input_shape.dims().size(), op->size.size());
@@ -500,7 +500,7 @@ void ProcessSliceOperator(Model* model, SliceOperator* op) {
 
 void ProcessReorderAxesOperator(Model* model, ReorderAxesOperator* op) {
   const string& input_name = op->inputs[0];
-  const auto& input_array = *model->arrays[input_name];
+  const auto& input_array = model->GetArray(input_name);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -515,20 +515,20 @@ void ProcessReorderAxesOperator(Model* model, ReorderAxesOperator* op) {
 void ProcessConcatenationOperator(Model* model, ConcatenationOperator* op) {
   // Yield until input dims have been resolved.
   for (const auto& input_name : op->inputs) {
-    auto& input_array = *model->arrays[input_name];
+    auto& input_array = model->GetArray(input_name);
     if (!input_array.has_shape()) {
       return;
     }
   }
   auto& output_array = model->GetArray(op->outputs[0]);
   // Use 0 input as basis for output dimensions.
-  const auto& first_input_array = *model->arrays[op->inputs[0]];
+  const auto& first_input_array = model->GetArray(op->inputs[0]);
   output_array.copy_shape(first_input_array.shape());
   // Determine the concat size, and enfore that all inputs have
   // the same dimensions count.
   int concat_size = 0;
   for (const auto& input_name : op->inputs) {
-    auto& input_array = *model->arrays[input_name];
+    auto& input_array = model->GetArray(input_name);
     CHECK(input_array.has_shape());
     if (input_array.shape().dimensions_count() == 0) {
       continue;
@@ -548,16 +548,16 @@ void ProcessConcatenationOperator(Model* model, ConcatenationOperator* op) {
 
 void ProcessRangeOperator(Model* model, RangeOperator* op) {
   CHECK_EQ(op->inputs.size(), 3);
-  const auto& start_array = *model->arrays[op->inputs[0]];
+  const auto& start_array = model->GetArray(op->inputs[0]);
   if (!start_array.has_shape()) {
     // Yield until input dims have been resolved.
     return;
   }
-  const auto& limit_array = *model->arrays[op->inputs[1]];
+  const auto& limit_array = model->GetArray(op->inputs[1]);
   if (!limit_array.has_shape()) {
     return;
   }
-  const auto& delta_array = *model->arrays[op->inputs[2]];
+  const auto& delta_array = model->GetArray(op->inputs[2]);
   if (!delta_array.has_shape()) {
     return;
   }
@@ -599,7 +599,7 @@ void ProcessRangeOperator(Model* model, RangeOperator* op) {
 void ProcessTensorFlowSplitOperator(Model* model, TensorFlowSplitOperator* op) {
   CHECK_EQ(op->inputs.size(), 2);
   const string& input_name = op->inputs[1];
-  const auto& input_array = *model->arrays[input_name];
+  const auto& input_array = model->GetArray(input_name);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -618,13 +618,13 @@ void ProcessTensorFlowSplitOperator(Model* model, TensorFlowSplitOperator* op) {
 
   CHECK_EQ(op->outputs.size(), op->num_split);
   for (const auto& output : op->outputs) {
-    model->arrays[output]->copy_shape(output_shape);
+    model->GetArray(output).copy_shape(output_shape);
   }
 }
 
 void ProcessAveragePoolOperator(Model* model, AveragePoolOperator* op) {
   const string& input_name = op->inputs[0];
-  const auto& input_array = *model->arrays[input_name];
+  const auto& input_array = model->GetArray(input_name);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -641,7 +641,7 @@ void ProcessAveragePoolOperator(Model* model, AveragePoolOperator* op) {
 
 void ProcessMaxPoolOperator(Model* model, MaxPoolOperator* op) {
   const string& input_name = op->inputs[0];
-  const auto& input_array = *model->arrays[input_name];
+  const auto& input_array = model->GetArray(input_name);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -658,7 +658,7 @@ void ProcessMaxPoolOperator(Model* model, MaxPoolOperator* op) {
 
 void ProcessL2PoolOperator(Model* model, L2PoolOperator* op) {
   const string& input_name = op->inputs[0];
-  const auto& input_array = *model->arrays[input_name];
+  const auto& input_array = model->GetArray(input_name);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -679,14 +679,14 @@ void ProcessResizeBilinearOperator(Model* model, ResizeBilinearOperator* op) {
   CHECK_EQ(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
 
-  if (!model->arrays[op->inputs[0]]->has_shape() ||
-      !model->arrays[op->inputs[1]]->has_shape()) {
+  if (!model->GetArray(op->inputs[0]).has_shape() ||
+      !model->GetArray(op->inputs[1]).has_shape()) {
     return;
   }
-  const auto& input_data_shape = model->arrays[op->inputs[0]]->shape();
+  const auto& input_data_shape = model->GetArray(op->inputs[0]).shape();
 
   const string& output_size_name = op->inputs[1];
-  const auto& output_size_array = *model->arrays[output_size_name];
+  const auto& output_size_array = model->GetArray(output_size_name);
   CHECK(output_size_array.data_type == ArrayDataType::kInt32);
   CHECK(output_size_array.has_shape());
   const auto& output_size_shape = output_size_array.shape();
@@ -697,9 +697,9 @@ void ProcessResizeBilinearOperator(Model* model, ResizeBilinearOperator* op) {
   }
   std::vector<int32> output_shape =
       output_size_array.GetBuffer<ArrayDataType::kInt32>().data;
-  model->arrays[op->outputs[0]]->copy_shape(
-      Shape({input_data_shape.dims(0), output_shape[0], output_shape[1],
-             input_data_shape.dims(3)}));
+  model->GetArray(op->outputs[0])
+      .copy_shape(Shape({input_data_shape.dims(0), output_shape[0],
+                         output_shape[1], input_data_shape.dims(3)}));
 }
 
 void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
@@ -708,7 +708,7 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
   QCHECK_EQ(op->outputs.size(), LstmCellOperator::NUM_OUTPUTS);
 
   const auto& input_array =
-      *model->arrays[op->inputs[LstmCellOperator::DATA_INPUT]];
+      model->GetArray(op->inputs[LstmCellOperator::DATA_INPUT]);
   // Yield until all input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -717,7 +717,7 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
   CHECK_GE(input_shape.dimensions_count(), 2);
 
   const auto& prev_activ_array =
-      *model->arrays[op->inputs[LstmCellOperator::PREV_ACTIV_INPUT]];
+      model->GetArray(op->inputs[LstmCellOperator::PREV_ACTIV_INPUT]);
   // Yield until all input dims have been resolved.
   if (!prev_activ_array.has_shape()) {
     return;
@@ -726,7 +726,7 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
   CHECK_GE(prev_activ_shape.dimensions_count(), 2);
 
   const auto& weights_array =
-      *model->arrays[op->inputs[LstmCellOperator::WEIGHTS_INPUT]];
+      model->GetArray(op->inputs[LstmCellOperator::WEIGHTS_INPUT]);
   // Yield until weights dims have been resolved.
   if (!weights_array.has_shape()) {
     return;
@@ -735,7 +735,7 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
   CHECK_EQ(weights_shape.dimensions_count(), 2);
 
   const auto& bias_array =
-      *model->arrays[op->inputs[LstmCellOperator::BIASES_INPUT]];
+      model->GetArray(op->inputs[LstmCellOperator::BIASES_INPUT]);
   // Yield until bias dims have been resolved.
   if (!bias_array.has_shape()) {
     return;
@@ -744,7 +744,7 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
   CHECK_GE(bias_shape.dimensions_count(), 1);
 
   const auto& prev_state_array =
-      *model->arrays[op->inputs[LstmCellOperator::PREV_STATE_INPUT]];
+      model->GetArray(op->inputs[LstmCellOperator::PREV_STATE_INPUT]);
   // Yield until all input dims have been resolved.
   if (!prev_state_array.has_shape()) {
     return;
@@ -784,7 +784,7 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
 }
 
 void ProcessSpaceToBatchNDOperator(Model* model, SpaceToBatchNDOperator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -797,8 +797,8 @@ void ProcessSpaceToBatchNDOperator(Model* model, SpaceToBatchNDOperator* op) {
   const auto input_height = input_shape.dims(1);
   const auto input_width = input_shape.dims(2);
 
-  const auto& block_shape_array = *model->arrays[op->inputs[1]];
-  const auto& paddings_array = *model->arrays[op->inputs[2]];
+  const auto& block_shape_array = model->GetArray(op->inputs[1]);
+  const auto& paddings_array = model->GetArray(op->inputs[2]);
   const auto& block_shape_array_shape = block_shape_array.shape();
   const auto& paddings_array_shape = paddings_array.shape();
   QCHECK_EQ(block_shape_array_shape.dimensions_count(), 1);
@@ -830,13 +830,13 @@ void ProcessSpaceToBatchNDOperator(Model* model, SpaceToBatchNDOperator* op) {
   int output_height = height_with_paddings / block_height;
   int output_width = width_with_paddings / block_width;
 
-  model->arrays[op->outputs[0]]->copy_shape(
-      Shape({input_shape.dims(0) * block_height * block_width, output_height,
-             output_width, input_shape.dims(3)}));
+  model->GetArray(op->outputs[0])
+      .copy_shape(Shape({input_shape.dims(0) * block_height * block_width,
+                         output_height, output_width, input_shape.dims(3)}));
 }
 
 void ProcessBatchToSpaceNDOperator(Model* model, BatchToSpaceNDOperator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -846,8 +846,8 @@ void ProcessBatchToSpaceNDOperator(Model* model, BatchToSpaceNDOperator* op) {
   const auto input_height = input_shape.dims(1);
   const auto input_width = input_shape.dims(2);
 
-  const auto& block_shape_array = *model->arrays[op->inputs[1]];
-  const auto& crops_array = *model->arrays[op->inputs[2]];
+  const auto& block_shape_array = model->GetArray(op->inputs[1]);
+  const auto& crops_array = model->GetArray(op->inputs[2]);
   const auto& block_shape_array_shape = block_shape_array.shape();
   const auto& crops_array_shape = crops_array.shape();
   QCHECK_EQ(block_shape_array_shape.dimensions_count(), 1);
@@ -882,15 +882,15 @@ void ProcessBatchToSpaceNDOperator(Model* model, BatchToSpaceNDOperator* op) {
   int output_height = input_height * block_height;
   int output_width = input_width * block_width;
 
-  model->arrays[op->outputs[0]]->copy_shape(
-      Shape({input_shape.dims(0) / (block_height * block_width), output_height,
-             output_width, input_shape.dims(3)}));
+  model->GetArray(op->outputs[0])
+      .copy_shape(Shape({input_shape.dims(0) / (block_height * block_width),
+                         output_height, output_width, input_shape.dims(3)}));
 }
 
 void ProcessGatherOperator(Model* model, GatherOperator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
-  const auto& indices_array = *model->arrays[op->inputs[1]];
-  auto& output_array = *model->arrays[op->outputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  const auto& indices_array = model->GetArray(op->inputs[1]);
+  auto& output_array = model->GetArray(op->outputs[0]);
 
   // Bail if we already know the output shape.
   if (output_array.has_shape()) {
@@ -924,7 +924,7 @@ void ProcessPadOperator(Model* model, PadOperator* op) {
   CHECK_EQ(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
 
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) return;
@@ -932,7 +932,7 @@ void ProcessPadOperator(Model* model, PadOperator* op) {
   if (op->left_padding.empty()) return;
   CHECK_EQ(op->left_padding.size(), op->right_padding.size());
 
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) return;
 
   Shape output_shape = input_array.shape();
@@ -949,13 +949,13 @@ void ProcessPadOperator(Model* model, PadOperator* op) {
 void ProcessRankOperator(Model* model, RankOperator* op) {
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
     // Shape already propagated
     return;
   }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
     return;
@@ -970,13 +970,13 @@ void ProcessRankOperator(Model* model, RankOperator* op) {
 void ProcessShapeOperator(Model* model, TensorFlowShapeOperator* op) {
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
     // Shape already propagated
     return;
   }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
     return;
@@ -991,7 +991,7 @@ void ProcessShapeOperator(Model* model, TensorFlowShapeOperator* op) {
 void ProcessStackOperator(Model* model, StackOperator* op) {
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
     // Shape already propagated
     return;
@@ -1032,7 +1032,7 @@ void ProcessStackOperator(Model* model, StackOperator* op) {
 void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
     // Shape already propagated
     return;
@@ -1112,12 +1112,12 @@ void ProcessSqueezeOperator(Model* model, SqueezeOperator* op) {
   CHECK_EQ(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
 
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) return;
 
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) return;
 
   const std::vector<int>& input_dims = input_array.shape().dims();
@@ -1136,18 +1136,18 @@ void ProcessSqueezeOperator(Model* model, SqueezeOperator* op) {
 
 void ProcessSvdfOperator(Model* model, SvdfOperator* op) {
   CHECK(op->inputs.size() == 3 || op->inputs.size() == 4);
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) return;
 
-  auto& weights_feature_array = *model->arrays[op->inputs[1]];
+  auto& weights_feature_array = model->GetArray(op->inputs[1]);
   if (!weights_feature_array.has_shape()) return;
 
-  const auto& weights_time_array = *model->arrays[op->inputs[2]];
+  const auto& weights_time_array = model->GetArray(op->inputs[2]);
   if (!weights_time_array.has_shape()) return;
 
   const bool has_bias = (op->inputs.size() == 4);
   if (has_bias) {
-    const auto& bias_array = *model->arrays[op->inputs[3]];
+    const auto& bias_array = model->GetArray(op->inputs[3]);
     if (!bias_array.has_shape()) return;
   }
 
@@ -1164,13 +1164,13 @@ void ProcessSvdfOperator(Model* model, SvdfOperator* op) {
 }
 
 void ProcessTransposeOperator(Model* model, TransposeOperator* op) {
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
     // We have already run
     return;
   }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
     return;
@@ -1204,7 +1204,7 @@ void ProcessTransposeOperator(Model* model, TransposeOperator* op) {
 
 void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) {
   CHECK_EQ(op->inputs.size(), 2);
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -1222,7 +1222,7 @@ void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) {
   }
   output_dims.push_back(1);
   const string& output_name = op->outputs[0];
-  auto& output_array = *model->arrays[output_name];
+  auto& output_array = model->GetArray(output_name);
   if (output_array.has_shape()) {
     return;
   }
@@ -1236,8 +1236,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
   auto* op = it->get();
   std::unordered_map<string, std::vector<int>> old_output_dims;
   for (const auto& output : op->outputs) {
-    if (model->arrays[output]->has_shape()) {
-      old_output_dims[output] = model->arrays[output]->shape().dims();
+    if (model->GetArray(output).has_shape()) {
+      old_output_dims[output] = model->GetArray(output).shape().dims();
     }
   }
 
@@ -1433,10 +1433,10 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
   // Return true if any output dim changed, false if none changed.
   // Assumption: no transformation clears an output shape, they only add shapes.
   for (const auto& output : op->outputs) {
-    if (model->arrays[output]->has_shape() &&
-        (old_output_dims[output] != model->arrays[output]->shape().dims())) {
+    if (model->GetArray(output).has_shape() &&
+        (old_output_dims[output] != model->GetArray(output).shape().dims())) {
       AddMessageF("Set shape of %s to [%s]", output,
-                  absl::StrJoin(model->arrays[output]->shape().dims(), ","));
+                  absl::StrJoin(model->GetArray(output).shape().dims(), ","));
       return true;
     }
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 56082b965a..b973b2b813 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -412,7 +412,7 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
               model->flags.set_output_arrays(i, dequantize_op->inputs[0]);
             }
           }
-          model->arrays.erase(dequantize_op->outputs[0]);
+          model->EraseArray(dequantize_op->outputs[0]);
           model->operators.erase(dequantize_it);
         }
       }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
index 371ced388a..11f8d4b6ee 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
@@ -80,7 +80,7 @@ bool ReadFakeQuantMinMax::Run(Model* model, std::size_t op_index) {
     // else.
     for (int i = 1; i <= 2; i++) {
       if (CountOpsWithInput(*model, fakequant_op->inputs[i]) == 1) {
-        model->arrays.erase(fakequant_op->inputs[i]);
+        model->EraseArray(fakequant_op->inputs[i]);
       }
     }
     fakequant_op->inputs.resize(1);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc
index 3992e7d1ef..c3b2709a33 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc
@@ -51,7 +51,7 @@ bool RemoveFinalDequantizeOp::Run(Model* model, std::size_t op_index) {
 
   // Remove the node and its output array.
   AddMessageF("Removed final %s", LogName(*dequantize_op));
-  model->arrays.erase(output);
+  model->EraseArray(output);
   model->operators.erase(dequantize_it);
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
index 6add443f2d..8512e6bb5a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
@@ -81,7 +81,7 @@ bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
   // Now check if the constant operand makes this binary
   // operator trivial.
   const auto& constant_input_array =
-      *model->arrays[binary_op->inputs[index_of_constant_input]];
+      model->GetArray(binary_op->inputs[index_of_constant_input]);
   // For now, we only handle floats here.
   if (constant_input_array.data_type != ArrayDataType::kFloat) {
     return false;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
index 23a5c857e8..936854a04f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
@@ -59,7 +59,7 @@ bool RemoveTrivialConcatenationInput::Run(Model* model, std::size_t op_index) {
   for (const string& input : trivial_inputs) {
     if (IsDiscardableArray(*model, input) &&
         CountOpsWithInput(*model, input) == 1) {
-      model->arrays.erase(input);
+      model->EraseArray(input);
     }
   }
   concat_op->inputs = nontrivial_inputs;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
index 047389f69a..587f171bbf 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
@@ -124,7 +124,7 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
       }
     }
     if (!is_referenced) {
-      model->arrays.erase(removal_candidate);
+      model->EraseArray(removal_candidate);
     }
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
index e6cca8acf3..aa2c293382 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
@@ -33,7 +33,7 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
   // the model. We allow specifying an arbitrary input_array,
   // treating the part of the graph leading up to it as unused.
   for (const auto& output : op->outputs) {
-    CHECK(model->arrays.count(output));
+    CHECK(model->HasArray(output));
     // If this output is provided as the model's input array,
     // then we don't need this operator to produce its contents.
     if (IsInputArray(*model, output)) {
@@ -93,7 +93,7 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
     if (IsDiscardableArray(*model, input) &&
         CountOpsWithInput(*model, input) == 1 &&
         !GetOpWithOutput(*model, input)) {
-      model->arrays.erase(input);
+      model->EraseArray(input);
     }
   }
 
@@ -116,7 +116,7 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
       continue;
     }
     // Generic case: do delete this output array.
-    model->arrays.erase(output);
+    model->EraseArray(output);
   }
   model->operators.erase(it);
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
index 3eb7fa3896..fb109eb91b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
@@ -121,9 +121,9 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   }
 
   // Remove the old param arrays
-  model->arrays.erase(bn_op->inputs[1]);
-  model->arrays.erase(bn_op->inputs[2]);
-  model->arrays.erase(bn_op->inputs[3]);
+  model->EraseArray(bn_op->inputs[1]);
+  model->EraseArray(bn_op->inputs[2]);
+  model->EraseArray(bn_op->inputs[3]);
 
   // Remove the old operator
   DCHECK_EQ(bn_it->get(), bn_op);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc
index 7777d4f543..a06919e228 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc
@@ -42,7 +42,7 @@ bool ResolveBatchToSpaceNDAttributes::Run(Model* model, std::size_t op_index) {
     return false;
 
   // Handle crops
-  const auto& crops_array = *model->arrays[op->inputs[2]];
+  const auto& crops_array = model->GetArray(op->inputs[2]);
   if (!crops_array.has_shape()) return false;
   const std::vector<int>& crops_dims = crops_array.shape().dims();
   if (crops_dims.size() != 2) {
@@ -58,7 +58,7 @@ bool ResolveBatchToSpaceNDAttributes::Run(Model* model, std::size_t op_index) {
   }
 
   // Handle block_shape
-  const auto& block_shape_array = *model->arrays[op->inputs[1]];
+  const auto& block_shape_array = model->GetArray(op->inputs[1]);
   if (!block_shape_array.has_shape()) return false;
   const std::vector<int>& block_shape_dims = block_shape_array.shape().dims();
   CHECK_EQ(block_shape_dims.size(), 1);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
index fd51df4058..5e779f6765 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
@@ -166,8 +166,9 @@ void EvaluateBinaryOperatorOnConstantInputs(Model* model,
 
 void EvaluateBinaryOperatorOnConstantInputs(Model* model,
                                             const Operator* binary_op) {
-  const auto inputs_data_type = model->arrays[binary_op->inputs[0]]->data_type;
-  const auto output_data_type = model->arrays[binary_op->outputs[0]]->data_type;
+  const auto inputs_data_type = model->GetArray(binary_op->inputs[0]).data_type;
+  const auto output_data_type =
+      model->GetArray(binary_op->outputs[0]).data_type;
 #define TOCO_HANDLE_CASE(InputsDataType, OutputDataType)                    \
   if (inputs_data_type == InputsDataType &&                                 \
       output_data_type == OutputDataType) {                                 \
@@ -214,7 +215,7 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
-  auto& output_array = *model->arrays[binary_op->outputs[0]];
+  auto& output_array = model->GetArray(binary_op->outputs[0]);
   // Yield until the output array dims have been resolved.
   if (!output_array.has_shape()) {
     return false;
@@ -239,10 +240,10 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
 
   // Remove the binary operator and its inputs
   if (CountOpsWithInput(*model, binary_op->inputs[0]) == 1) {
-    model->arrays.erase(binary_op->inputs[0]);
+    model->EraseArray(binary_op->inputs[0]);
   }
   if (CountOpsWithInput(*model, binary_op->inputs[1]) == 1) {
-    model->arrays.erase(binary_op->inputs[1]);
+    model->EraseArray(binary_op->inputs[1]);
   }
   AddMessageF("Resolved constant %s to the equivalent constant array",
               LogName(*binary_op));
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index 9835f86398..833c97c758 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -189,7 +189,7 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
 
   // Remove all the resolved arrays.
   for (const string& input_name : concat_op->inputs) {
-    model->arrays.erase(input_name);
+    model->EraseArray(input_name);
   }
 
   // Remove concatenate operator
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
index 244adcc4c4..81fe37d7e0 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
@@ -66,7 +66,7 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
     output_buffer.data[i] = dst_val;
   }
   if (CountOpsWithInput(*model, fakequant_op->inputs[0]) == 1) {
-    model->arrays.erase(fakequant_op->inputs[0]);
+    model->EraseArray(fakequant_op->inputs[0]);
   }
   model->operators.erase(fakequant_it);
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
index 9da51d9147..f6f95481b5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
@@ -104,11 +104,11 @@ bool ResolveConstantFill::Run(Model* model, std::size_t op_index) {
   // Erase input arrays if no longer used
   if (IsDiscardableArray(*model, op->inputs[0]) &&
       CountOpsWithInput(*model, op->inputs[0]) == 1) {
-    model->arrays.erase(op->inputs[0]);
+    model->EraseArray(op->inputs[0]);
   }
   if (IsDiscardableArray(*model, op->inputs[1]) &&
       CountOpsWithInput(*model, op->inputs[1]) == 1) {
-    model->arrays.erase(op->inputs[1]);
+    model->EraseArray(op->inputs[1]);
   }
 
   // Erase the operator
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
index 383d54aa5a..1a0ba9e2bc 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
@@ -28,17 +28,17 @@ bool ResolveConstantRange::Run(Model* model, std::size_t op_index) {
   auto* op = static_cast<RangeOperator*>(base_op);
 
   CHECK_EQ(op->inputs.size(), 3);
-  const auto& start_array = *model->arrays[op->inputs[0]];
+  const auto& start_array = model->GetArray(op->inputs[0]);
   if (!start_array.has_shape()) {
     // Yield until all input dims have been resolved.
     return false;
   }
-  const auto& limit_array = *model->arrays[op->inputs[1]];
+  const auto& limit_array = model->GetArray(op->inputs[1]);
   if (!limit_array.has_shape()) {
     // Yield until all input dims have been resolved.
     return false;
   }
-  const auto& delta_array = *model->arrays[op->inputs[2]];
+  const auto& delta_array = model->GetArray(op->inputs[2]);
   if (!delta_array.has_shape()) {
     // Yield until all input dims have been resolved.
     return false;
@@ -52,7 +52,7 @@ bool ResolveConstantRange::Run(Model* model, std::size_t op_index) {
   }
 
   CHECK_EQ(op->outputs.size(), 1);
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
     return false;
@@ -87,15 +87,15 @@ bool ResolveConstantRange::Run(Model* model, std::size_t op_index) {
   // Delete the input array if no longer used
   if (IsDiscardableArray(*model, op->inputs[0]) &&
       CountOpsWithInput(*model, op->inputs[0]) == 1) {
-    model->arrays.erase(op->inputs[0]);
+    model->EraseArray(op->inputs[0]);
   }
   if (IsDiscardableArray(*model, op->inputs[1]) &&
       CountOpsWithInput(*model, op->inputs[1]) == 1) {
-    model->arrays.erase(op->inputs[1]);
+    model->EraseArray(op->inputs[1]);
   }
   if (IsDiscardableArray(*model, op->inputs[2]) &&
       CountOpsWithInput(*model, op->inputs[2]) == 1) {
-    model->arrays.erase(op->inputs[2]);
+    model->EraseArray(op->inputs[2]);
   }
 
   // Delete the operator
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
index 35b81dd550..9ea01acd05 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
@@ -62,7 +62,7 @@ bool ResolveConstantShapeOrRank::Run(Model* model, std::size_t op_index) {
   // Delete the input array if no longer used
   if (IsDiscardableArray(*model, op->inputs[0]) &&
       CountOpsWithInput(*model, op->inputs[0]) == 1) {
-    model->arrays.erase(op->inputs[0]);
+    model->EraseArray(op->inputs[0]);
   }
 
   model->operators.erase(it);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_stack.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_stack.cc
index 86c76141a4..ea0d6dc820 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_stack.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_stack.cc
@@ -101,7 +101,7 @@ bool ResolveConstantStack::Run(Model* model, std::size_t op_index) {
   for (const auto& input : op->inputs) {
     if (IsDiscardableArray(*model, input) &&
         CountOpsWithInput(*model, input) == 1) {
-      model->arrays.erase(input);
+      model->EraseArray(input);
     }
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
index 3976d9cbb4..a0cfc3d597 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
@@ -186,7 +186,7 @@ bool ResolveConstantStridedSlice::Run(Model* model, std::size_t op_index) {
   // Erase input array if no longer used
   if (IsDiscardableArray(*model, op->inputs[0]) &&
       CountOpsWithInput(*model, op->inputs[0]) == 1) {
-    model->arrays.erase(op->inputs[0]);
+    model->EraseArray(op->inputs[0]);
   }
 
   // Erase the operator
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index 26ff9d887b..1cd2aff28c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -199,7 +199,7 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
   }
   for (const auto& input : unary_op->inputs) {
     if (CountOpsWithInput(*model, input) == 1) {
-      model->arrays.erase(input);
+      model->EraseArray(input);
     }
   }
   AddMessageF("Resolved constant %s to the equivalent constant array",
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc
index b77be3f5c0..013b50ac9b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc
@@ -36,7 +36,7 @@ bool ResolveMeanAttributes::Run(Model* model, std::size_t op_index) {
   if (op->inputs.size() != 2) return false;
   if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
 
-  const auto& indices_array = *model->arrays[op->inputs[1]];
+  const auto& indices_array = model->GetArray(op->inputs[1]);
   if (!indices_array.has_shape()) return false;
   op->axis = indices_array.GetBuffer<ArrayDataType::kInt32>().data;
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc
index d5f5869c62..8a8e723cf7 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc
@@ -35,7 +35,7 @@ bool ResolvePadAttributes::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(op->inputs.size(), 2);
   if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
 
-  const auto& array = *model->arrays[op->inputs[1]];
+  const auto& array = model->GetArray(op->inputs[1]);
   if (!array.has_shape()) return false;
 
   const std::vector<int>& dims = array.shape().dims();
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
index b5093bc4c7..5c68f87f6c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
@@ -103,7 +103,7 @@ bool ResolveReorderAxes::Run(Model* model, std::size_t op_index) {
   AddMessageF("Reordered axes for array %s", input_array_name);
 
   // Remove the op and output array.
-  model->arrays.erase(output_array_name);
+  model->EraseArray(output_array_name);
   model->operators.erase(reorder_it);
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc
index bed2a85bd2..2e063e3554 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc
@@ -37,7 +37,7 @@ bool ResolveReshapeAttributes::Run(Model* model, std::size_t op_index) {
   if (!op->shape.empty()) return false;
 
   if (IsConstantParameterArray(*model, reshape_op->inputs[1])) {
-    const auto& constant_input_array = *model->arrays[reshape_op->inputs[1]];
+    const auto& constant_input_array = model->GetArray(reshape_op->inputs[1]);
     op->shape = constant_input_array.GetBuffer<ArrayDataType::kInt32>().data;
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc
index 1d0a2ec8f6..e760d08e5a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc
@@ -36,10 +36,10 @@ bool ResolveSliceAttributes::Run(Model* model, std::size_t op_index) {
   if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
   if (!IsConstantParameterArray(*model, op->inputs[2])) return false;
 
-  const auto& begin_array = *model->arrays[op->inputs[1]];
+  const auto& begin_array = model->GetArray(op->inputs[1]);
   if (!begin_array.has_shape()) return false;
 
-  const auto& size_array = *model->arrays[op->inputs[2]];
+  const auto& size_array = model->GetArray(op->inputs[2]);
   if (!size_array.has_shape()) return false;
 
   op->begin = begin_array.GetBuffer<ArrayDataType::kInt32>().data;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc
index a73f16735c..dad6aceccf 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc
@@ -45,7 +45,7 @@ bool ResolveSpaceToBatchNDAttributes::Run(Model* model, std::size_t op_index) {
     return false;
 
   // Handle paddings.
-  const auto& paddings_array = *model->arrays[op->inputs[paddings_index]];
+  const auto& paddings_array = model->GetArray(op->inputs[paddings_index]);
   if (!paddings_array.has_shape()) return false;
   const std::vector<int>& paddings_dims = paddings_array.shape().dims();
   if (paddings_dims.size() != 2) {
@@ -61,7 +61,8 @@ bool ResolveSpaceToBatchNDAttributes::Run(Model* model, std::size_t op_index) {
   }
 
   // Handle block_shape.
-  const auto& block_shape_array = *model->arrays[op->inputs[block_shape_index]];
+  const auto& block_shape_array =
+      model->GetArray(op->inputs[block_shape_index]);
   if (!block_shape_array.has_shape()) return false;
   const std::vector<int>& block_shape_dims = block_shape_array.shape().dims();
   CHECK_EQ(block_shape_dims.size(), 1);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
index dbe69adcbd..de4d06be2a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
@@ -31,13 +31,13 @@ bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
   }
 
   CHECK_EQ(op->inputs.size(), 4);
-  const auto& start_array = *model->arrays[op->inputs[1]];
+  const auto& start_array = model->GetArray(op->inputs[1]);
   if (!start_array.has_shape()) return false;
 
-  const auto& stop_array = *model->arrays[op->inputs[2]];
+  const auto& stop_array = model->GetArray(op->inputs[2]);
   if (!stop_array.has_shape()) return false;
 
-  const auto& stride_array = *model->arrays[op->inputs[3]];
+  const auto& stride_array = model->GetArray(op->inputs[3]);
   if (!stride_array.has_shape()) return false;
 
   if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
index c6723a880e..5c0c1e3478 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
@@ -75,7 +75,7 @@ bool ResolveTensorFlowConcat::Run(Model* model, std::size_t op_index) {
 
   // Remove the axis array if it is not used by anything else.
   if (CountOpsWithInput(*model, axis_name) == 1) {
-    model->arrays.erase(axis_name);
+    model->EraseArray(axis_name);
   }
   // Remove the TensorFlowConcat op
   model->operators.erase(concat_it);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
index bea7487051..ad1e56888e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
@@ -69,7 +69,7 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
                 LogName(*matmul_op), LogName(*fc_op));
     const auto& previous_op_output = previous_op->outputs[0];
     if (CountOpsWithInput(*model, previous_op_output) == 1) {
-      model->arrays.erase(previous_op_output);
+      model->EraseArray(previous_op_output);
     }
     CHECK_EQ(previous_op->inputs.size(), 2);
     fc_op->inputs = {previous_op->inputs[0], matmul_op->inputs[1]};
@@ -78,7 +78,7 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
       const auto& previous_op_shape = previous_op->inputs[1];
       if (CountOpsWithInput(*model, previous_op_shape) == 1 &&
           !GetOpWithOutput(*model, previous_op_shape)) {
-        model->arrays.erase(previous_op_shape);
+        model->EraseArray(previous_op_shape);
       }
       model->operators.erase(previous_op_it);
     }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
index cfa5ce0716..477e7f13da 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
@@ -55,7 +55,7 @@ bool ResolveTensorFlowMerge::Run(Model* model, std::size_t op_index) {
 
   // Remove the node and its output array.
   AddMessageF("Removing already-resolved %s", LogName(*merge_op));
-  model->arrays.erase(merge_op->outputs[0]);
+  model->EraseArray(merge_op->outputs[0]);
   model->operators.erase(merge_it);
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
index 150cf53da3..a418073441 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
@@ -103,7 +103,7 @@ bool ResolveTensorFlowSwitch::Run(Model* model, std::size_t op_index) {
   // Remove the output arrays if they are now unused.
   for (int i = 0; i < 2; i++) {
     if (!GetOpWithInput(*model, switch_op->outputs[i])) {
-      model->arrays.erase(switch_op->outputs[i]);
+      model->EraseArray(switch_op->outputs[i]);
     }
   }
   // Remove input arrays if they are only used by the switch itself and aren't
@@ -111,7 +111,7 @@ bool ResolveTensorFlowSwitch::Run(Model* model, std::size_t op_index) {
   for (const auto& input : switch_op->inputs) {
     if (CountOpsWithInput(*model, input) == 1 &&
         !GetOpWithOutput(*model, input)) {
-      model->arrays.erase(input);
+      model->EraseArray(input);
     }
   }
   // Remove the switch node itself.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_tile.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_tile.cc
index 9f7e7c42a2..1ddf54c778 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_tile.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_tile.cc
@@ -45,10 +45,10 @@ void RemoveTileOperator(Model* model, Operator* tile_op, Operator* binary_op,
   model->operators.erase(tile_it);
   if (!CountOpsWithInput(*model, tile_multiplier_array) &&
       !GetOpWithOutput(*model, tile_multiplier_array)) {
-    model->arrays.erase(tile_multiplier_array);
+    model->EraseArray(tile_multiplier_array);
   }
   if (!CountOpsWithInput(*model, tile_output_array)) {
-    model->arrays.erase(tile_output_array);
+    model->EraseArray(tile_output_array);
   }
 }
 }  // namespace
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_transpose_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_transpose_attributes.cc
index 12d966b261..a657ee00af 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_transpose_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_transpose_attributes.cc
@@ -35,7 +35,7 @@ bool ResolveTransposeAttributes::Run(Model* model, std::size_t op_index) {
   if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
 
   // Handling perm.
-  const auto& perm_array = *model->arrays[op->inputs[1]];
+  const auto& perm_array = model->GetArray(op->inputs[1]);
   if (!perm_array.has_shape()) return false;
 
   const std::vector<int>& perm_dims = perm_array.shape().dims();
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
index a14016e8e2..3a1d175b98 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-//#include "tensorflow/contrib/lite/kernels/test_util.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -168,11 +167,11 @@ TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis0) {
 
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
-  EXPECT_THAT(model.arrays.size(), 5);
+  EXPECT_THAT(model.GetArrayMap().size(), 5);
   (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
-  EXPECT_THAT(model.arrays.size(), 1);
+  EXPECT_THAT(model.GetArrayMap().size(), 1);
 
-  auto& concatenated_array = (*model.arrays.begin()).second;
+  auto& concatenated_array = (*model.GetArrayMap().begin()).second;
   EXPECT_THAT(concatenated_array->GetBuffer<toco::ArrayDataType::kFloat>().data,
               ElementsAreArray(ArrayFloatNear(
                   {0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  10., 11., 12.,
@@ -187,11 +186,11 @@ TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis1) {
 
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
-  EXPECT_THAT(model.arrays.size(), 5);
+  EXPECT_THAT(model.GetArrayMap().size(), 5);
   (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
-  EXPECT_THAT(model.arrays.size(), 1);
+  EXPECT_THAT(model.GetArrayMap().size(), 1);
 
-  auto& concatenated_array = (*model.arrays.begin()).second;
+  auto& concatenated_array = (*model.GetArrayMap().begin()).second;
   EXPECT_THAT(concatenated_array->GetBuffer<toco::ArrayDataType::kFloat>().data,
               ElementsAreArray(ArrayFloatNear(
                   {0.,  1.,  2.,  3.,  10., 11., 12., 13., 20., 21., 22.,
@@ -206,11 +205,11 @@ TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis2) {
 
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
-  EXPECT_THAT(model.arrays.size(), 5);
+  EXPECT_THAT(model.GetArrayMap().size(), 5);
   (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
-  EXPECT_THAT(model.arrays.size(), 1);
+  EXPECT_THAT(model.GetArrayMap().size(), 1);
 
-  auto& concatenated_array = (*model.arrays.begin()).second;
+  auto& concatenated_array = (*model.GetArrayMap().begin()).second;
   EXPECT_THAT(concatenated_array->GetBuffer<toco::ArrayDataType::kFloat>().data,
               ElementsAreArray(ArrayFloatNear(
                   {0.,  1.,  10., 11., 20., 21., 30., 31., 2.,  3.,  12.,
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc
index 4e273343df..2c7046c8c7 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc
@@ -63,7 +63,7 @@ bool UnfuseActivationFunctions::Run(Model* model, std::size_t op_index) {
   ac_op->outputs = op->outputs;
   const string& tmp_array_name =
       AvailableArrayName(*model, op->outputs[0] + "_unfused");
-  CHECK(!model->arrays.count(tmp_array_name));
+  CHECK(!model->HasArray(tmp_array_name));
   model->GetOrCreateArray(tmp_array_name);
   ac_op->inputs = {tmp_array_name};
   op->outputs = {tmp_array_name};
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 995e9d67ca..1947271f55 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -1652,7 +1652,7 @@ void StripCaretFromArrayNames(Model* model) {
       output = string(absl::StripPrefix(output, "^"));
     }
   }
-  for (auto& array : model->arrays) {
+  for (auto& array : model->GetArrayMap()) {
     if (absl::StartsWith(array.first, "^")) {
       LOG(FATAL) << "What?";
     }
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index b079750bed..54fbba7381 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -1521,15 +1521,19 @@ struct Array {
 
 // Our Model struct, represents an entire model (our "top-level" struct).
 // Owns everything.
-struct Model {
+class Model {
+ public:
+  using ArrayMap = std::unordered_map<string, std::unique_ptr<Array>>;
+
+  bool HasArray(const string& name) const { return arrays.count(name) > 0; }
   Array& GetArray(const string& name) const {
-    DCHECK(arrays.count(name));
+    DCHECK(HasArray(name));
     return *arrays.at(name);
   }
   Array& GetOrCreateArray(const string& name) {
     // Make sure name is not used by an optional array
     DCHECK(!optional_arrays.count(name));
-    if (!arrays.count(name)) {
+    if (!HasArray(name)) {
       Array* ptr = new Array;
       arrays[name] = std::unique_ptr<Array>(ptr);
     }
@@ -1544,18 +1548,27 @@ struct Model {
     return optional_arrays.count(name);
   }
 
+  // Note that this invalidates all array iterators.
+  void EraseArray(const string& name) { arrays.erase(name); }
+  void EraseArrays(std::function<bool(const string&)> discardable) {
+    for (auto it = arrays.begin(); it != arrays.end();) {
+      if (discardable(it->first)) {
+        it = arrays.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+  const ArrayMap& GetArrayMap() const { return arrays; }
+
   // Optional arrays are used for optional tensors,
   // these tensors do not have data, but with reserved names as op inputs.
   std::set<string> optional_arrays;
+
   // The list of operators. Notice how it's a list of unique_ptr's, implying
   // that the Model is what owns Operator's and keeps them alive.
   std::vector<std::unique_ptr<Operator>> operators;
-  // The associative array mapping names to Array's.
-  // Notice how it's a container of unique_ptr's, implying
-  // that the Model is what owns Array's and keeps them alive.
-  // The Operator's refer to these Array's by their name strings, not by their
-  // addresses. See Operator::inputs, Operator::outputs.
-  std::unordered_map<string, std::unique_ptr<Array>> arrays;
+
   // Generic flags, a place where we combine information passed to us via
   // command-line parameters (e.g. --input_width=N) with information that
   // we may or may not find in the input model file.
@@ -1564,6 +1577,14 @@ struct Model {
   std::size_t transient_data_size = 0;
   // For code-generation only: required alignment of the transient_data buffer
   std::size_t transient_data_alignment = 0;
+
+ private:
+  // The associative array mapping names to Array's.
+  // Notice how it's a container of unique_ptr's, implying
+  // that the Model is what owns Array's and keeps them alive.
+  // The Operator's refer to these Array's by their name strings, not by their
+  // addresses. See Operator::inputs, Operator::outputs.
+  std::unordered_map<string, std::unique_ptr<Array>> arrays;
 };
 }  // namespace toco
 
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index 440353203e..391ef87029 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -62,7 +62,7 @@ namespace details {
 void LoadTensorsMap(const Model& model, TensorsMap* tensors_map) {
   // First find a list of unique array names.
   std::set<string> names;
-  for (const auto& array_pair : model.arrays) {
+  for (const auto& array_pair : model.GetArrayMap()) {
     names.insert(array_pair.first);
   }
 
@@ -96,7 +96,7 @@ Offset<Vector<Offset<Tensor>>> ExportTensors(
   // tensors in the tensors_map.
   std::map<int, Offset<Tensor>> ordered_tensors;
 
-  for (const auto& array_pair : model.arrays) {
+  for (const auto& array_pair : model.GetArrayMap()) {
     const string& tensor_name = array_pair.first;
     const toco::Array& array = *array_pair.second;
 
diff --git a/tensorflow/contrib/lite/toco/tflite/import_test.cc b/tensorflow/contrib/lite/toco/tflite/import_test.cc
index 309fa6d7f6..aad6e780d5 100644
--- a/tensorflow/contrib/lite/toco/tflite/import_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/import_test.cc
@@ -114,7 +114,7 @@ TEST_F(ImportTest, Tensors) {
 
   auto model = Import(ModelFlags(), InputModelAsString());
 
-  ASSERT_GT(model->arrays.count("tensor_one"), 0);
+  ASSERT_GT(model->HasArray("tensor_one"), 0);
   Array& a1 = model->GetArray("tensor_one");
   EXPECT_EQ(ArrayDataType::kFloat, a1.data_type);
   EXPECT_THAT(a1.GetBuffer<ArrayDataType::kFloat>().data,
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 94b4d14696..afaa0fd0c7 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -133,7 +133,7 @@ void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) {
 
   for (int i = 0; i < model->flags.input_arrays_size(); i++) {
     string const& array_name = model->flags.input_arrays(i).name();
-    auto* array = model->arrays[array_name].get();
+    auto* array = &model->GetArray(array_name);
     // Note that the notion of changing data types only applies to real-numbers
     // arrays (see the documentation for inference_input_type).
     // TODO(benoitjacob) this is assuming that uint8 arrays are quantized,
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index f9093ab973..900c60bd90 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -93,7 +93,7 @@ int CountOpsWithInput(const Model& model, const string& array_name) {
 
 bool DeleteArrayIfUnused(const string& array_name, Model* model) {
   if (CountOpsWithInput(*model, array_name) == 0) {
-    model->arrays.erase(array_name);
+    model->EraseArray(array_name);
     return true;
   }
   return false;
@@ -566,11 +566,11 @@ int RequiredBufferSizeForShape(const Shape& shape) {
 }
 
 bool IsConstantParameterArray(const Model& model, const string& name) {
-  if (!model.arrays.count(name)) {
+  if (!model.HasArray(name)) {
     return false;
   }
 
-  return !!model.arrays.at(name)->buffer;
+  return !!model.GetArray(name).buffer;
 }
 
 namespace {
@@ -633,17 +633,17 @@ void CheckNonExistentIOArrays(const Model& model) {
     return;
   }
   for (const auto& input_array : model.flags.input_arrays()) {
-    CHECK(model.arrays.count(input_array.name()))
+    CHECK(model.HasArray(input_array.name()))
         << "Input array not found: " << input_array.name();
   }
   for (const string& output_array : model.flags.output_arrays()) {
-    CHECK(model.arrays.count(output_array))
+    CHECK(model.HasArray(output_array))
         << "Output array not found: " << output_array;
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
     if (!rnn_state.discardable()) {
-      CHECK(model.arrays.count(rnn_state.state_array()));
-      CHECK(model.arrays.count(rnn_state.back_edge_source_array()));
+      CHECK(model.HasArray(rnn_state.state_array()));
+      CHECK(model.HasArray(rnn_state.back_edge_source_array()));
     }
   }
 }
@@ -652,10 +652,10 @@ void CheckNonExistentIOArrays(const Model& model) {
 void CheckNoMissingArray(const Model& model) {
   for (const auto& op : model.operators) {
     for (const auto& input : op->inputs) {
-      CHECK(model.arrays.count(input) || model.optional_arrays.count(input));
+      CHECK(model.HasArray(input) || model.optional_arrays.count(input));
     }
     for (const auto& output : op->outputs) {
-      CHECK(model.arrays.count(output));
+      CHECK(model.HasArray(output));
     }
   }
   CheckNonExistentIOArrays(model);
@@ -664,12 +664,12 @@ void CheckNoMissingArray(const Model& model) {
 void FixNoMissingArray(Model* model) {
   for (const auto& op : model->operators) {
     for (const auto& input : op->inputs) {
-      if (!model->arrays.count(input)) {
+      if (!model->HasArray(input)) {
         model->GetOrCreateArray(input);
       }
     }
     for (const auto& output : op->outputs) {
-      if (!model->arrays.count(output)) {
+      if (!model->HasArray(output)) {
         model->GetOrCreateArray(output);
       }
     }
@@ -687,7 +687,7 @@ void FixNoMissingArray(Model* model) {
 
 void CheckNoOrphanedArray(const Model& model) {
   std::unordered_set<string> arrays_without_known_use;
-  for (const auto& array : model.arrays) {
+  for (const auto& array : model.GetArrayMap()) {
     if (IsDiscardableArray(model, array.first)) {
       arrays_without_known_use.insert(array.first);
     }
@@ -714,7 +714,7 @@ void CheckNoOrphanedArray(const Model& model) {
 
 void FixNoOrphanedArray(Model* model) {
   std::unordered_set<string> arrays_without_known_use;
-  for (const auto& array : model->arrays) {
+  for (const auto& array : model->GetArrayMap()) {
     arrays_without_known_use.insert(array.first);
   }
   for (const auto& op : model->operators) {
@@ -731,13 +731,13 @@ void FixNoOrphanedArray(Model* model) {
   }
   for (const auto& array : arrays_without_known_use) {
     if (IsDiscardableArray(*model, array)) {
-      model->arrays.erase(array);
+      model->EraseArray(array);
     }
   }
 }
 
 void CheckArrayFieldsConsistent(const Model& model) {
-  for (const auto& array_entry : model.arrays) {
+  for (const auto& array_entry : model.GetArrayMap()) {
     const auto& array = array_entry.second;
     if (array->has_shape()) {
       for (int d : array->shape().dims()) {
@@ -756,7 +756,7 @@ void CheckArrayFieldsConsistent(const Model& model) {
 
 void CheckOperatorOrdering(const Model& model) {
   std::unordered_set<string> arrays_behind_us;
-  for (const auto& array_entry : model.arrays) {
+  for (const auto& array_entry : model.GetArrayMap()) {
     if (!GetOpWithOutput(model, array_entry.first)) {
       arrays_behind_us.insert(array_entry.first);
     }
@@ -781,7 +781,7 @@ void CheckOperatorOrdering(const Model& model) {
 
 void FixOperatorOrdering(Model* model) {
   std::unordered_set<string> arrays_behind_us;
-  for (const auto& array_entry : model->arrays) {
+  for (const auto& array_entry : model->GetArrayMap()) {
     if (!GetOpWithOutput(*model, array_entry.first)) {
       arrays_behind_us.insert(array_entry.first);
     }
@@ -936,7 +936,8 @@ void CheckModelCounts(const Model& model) {
     if (count_type == "None") {
       continue;
     } else if (count_type == "Arrays") {
-      CheckCountInRange(model_check, model.arrays.size(), "count of arrays");
+      CheckCountInRange(model_check, model.GetArrayMap().size(),
+                        "count of arrays");
     } else if (count_type == "Total") {
       CheckCountInRange(model_check, model.operators.size(),
                         "count of all operator instances");
@@ -1297,7 +1298,7 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
       return false;
     }
   }
-  const auto& array = model.arrays.at(array_name);
+  const auto& array = &model.GetArray(array_name);
   // An array with a constant buffer isn't a transient array.
   if (!!array->buffer) {
     return false;
@@ -1310,13 +1311,13 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
 }
 
 string AvailableArrayName(const Model& model, const string& name) {
-  if (!model.arrays.count(name) && !model.optional_arrays.count(name)) {
+  if (!model.HasArray(name) && !model.optional_arrays.count(name)) {
     return name;
   }
   const int kNumSuffixesToTry = 1000;
   for (int i = 0; i < kNumSuffixesToTry; i++) {
     const string& name_with_suffix = toco::port::StringF("%s_%d", name, i);
-    if (!model.arrays.count(name_with_suffix)) {
+    if (!model.HasArray(name_with_suffix)) {
       return name_with_suffix;
     }
   }
@@ -1334,12 +1335,12 @@ string ShapeToString(const Shape& shape) {
 }
 
 void PrintArrayShape(Model* model, const string& name) {
-  if (!model->arrays[name]->has_shape()) {
+  if (!model->GetArray(name).has_shape()) {
     LOG(INFO) << name << " has no shape";
     return;
   }
   LOG(INFO) << name
-            << " has shape: " << ShapeToString(model->arrays[name]->shape());
+            << " has shape: " << ShapeToString(model->GetArray(name).shape());
 }
 
 bool IsArrayFullyConnectedWeights(const Model& model, const string& name) {
@@ -1673,7 +1674,7 @@ bool IsDiscardableArray(const Model& model, const string& array_name) {
 }
 
 void CheckFinalDataTypesSatisfied(const Model& model) {
-  for (const auto& array_entry : model.arrays) {
+  for (const auto& array_entry : model.GetArrayMap()) {
     const auto& array = *array_entry.second;
     if (array.final_data_type != ArrayDataType::kNone) {
       CHECK(array.final_data_type == array.data_type)
-- 
GitLab


From 5f4bbef22e1cb7841da282b9903a5dd24a6c1bd6 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Tue, 23 Jan 2018 12:39:22 -0800
Subject: [PATCH 019/966] [TF:XLA] Bump open source llvm revision to r323181

PiperOrigin-RevId: 182976206
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 25702087dc..d17dc81024 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -473,11 +473,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/cecc5a23281018f9bbdd6f659462b8c99d8e8926.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/cecc5a23281018f9bbdd6f659462b8c99d8e8926.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/95461910b8134c70de1d94f6854b64cfff3615d9.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/95461910b8134c70de1d94f6854b64cfff3615d9.tar.gz",
       ],
-      sha256 = "b0960749fe2bf78d7c8350c18d584cf6dd5abeeae20da43d4829cdbc0166626c",
-      strip_prefix = "llvm-cecc5a23281018f9bbdd6f659462b8c99d8e8926",
+      sha256 = "2e16617820ec59e3b57b6800aab00238c2e5955182b56a0fbc03bde12b5f2d7b",
+      strip_prefix = "llvm-95461910b8134c70de1d94f6854b64cfff3615d9",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
   )
 
-- 
GitLab


From 9f389fc4d36f0ffb79681817c25b8d7a329fc649 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 12:41:07 -0800
Subject: [PATCH 020/966] internal change

PiperOrigin-RevId: 182976445
---
 tensorflow/contrib/tpu/profiler/BUILD         | 11 +++++++++-
 .../tpu/profiler/capture_tpu_profile.cc       |  4 ++++
 tensorflow/contrib/tpu/profiler/version.h     | 21 +++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/tpu/profiler/version.h

diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 346c03067d..198da0203a 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -44,13 +44,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "version",
+    hdrs = ["version.h"],
+    visibility = ["//visibility:public"],
+)
+
 tf_cc_binary(
     name = "capture_tpu_profile",
-    srcs = ["capture_tpu_profile.cc"],
+    srcs = [
+        "capture_tpu_profile.cc",
+    ],
     visibility = ["//visibility:public"],
     deps = [
         ":dump_tpu_profile",
         ":tpu_profiler_proto_cc",
+        ":version",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_util",
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index b67f2f47a7..1cded9f8cf 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -26,6 +26,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
 #include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
+#include "tensorflow/contrib/tpu/profiler/version.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/init_main.h"
@@ -84,6 +85,9 @@ int main(int argc, char** argv) {
                        "Duration of tracing in ms. Default is 2000ms."),
   };
 
+  std::cout << "Welcome to the Cloud TPU Profiler v" << TPU_PROFILER_VERSION
+            << std::endl;
+
   tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_ok || FLAGS_service_addr.empty() || FLAGS_logdir.empty()) {
diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h
new file mode 100644
index 0000000000..a6c9a9503d
--- /dev/null
+++ b/tensorflow/contrib/tpu/profiler/version.h
@@ -0,0 +1,21 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+=============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
+
+#define TPU_PROFILER_VERSION "1.4.3"
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
-- 
GitLab


From 0eba6b6b6a0a0f0d56afd3e2c6945075706f9abb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 12:48:00 -0800
Subject: [PATCH 021/966] [XLA:GPU] Support BF16 data type.

Add an HLO pass to the GPU backend to implement BF16 operations with F32
operations.

Define macro XLA_BACKEND_SUPPORTS_BFLOAT16=1 when building tests for the GPU
backend to enable BF16 tests for GPU. Enable bfloat16_test and other BF16 tests
for GPU.

Add hlo_element_type_converter_test.

Add convolution tests and matrix multiplication tests for BF16.

PiperOrigin-RevId: 182977358
---
 tensorflow/compiler/xla/service/BUILD         |  10 ++
 tensorflow/compiler/xla/service/gpu/BUILD     |   1 +
 .../compiler/xla/service/gpu/gpu_compiler.cc  |   5 +
 .../xla/service/hlo_element_type_converter.cc | 102 ++++++++++++---
 .../hlo_element_type_converter_test.cc        | 123 ++++++++++++++++++
 .../compiler/xla/service/hlo_matchers.h       |   1 +
 tensorflow/compiler/xla/tests/BUILD           |   3 -
 .../compiler/xla/tests/bfloat16_test.cc       |   5 +-
 .../compiler/xla/tests/convolution_test.cc    |  23 ++++
 .../xla/tests/matrix_ops_simple_test.cc       |  72 ++++++++++
 .../compiler/xla/tests/reduce_window_test.cc  |   4 +-
 11 files changed, 326 insertions(+), 23 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 426fead41b..2e0ac256cc 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1944,6 +1944,16 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "hlo_element_type_converter_test",
+    srcs = ["hlo_element_type_converter_test.cc"],
+    deps = [
+        ":hlo_element_type_converter",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+    ],
+)
+
 cc_library(
     name = "device_memory_allocator",
     srcs = ["device_memory_allocator.cc"],
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index d7ca0f6846..df5e2e35f8 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -475,6 +475,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_constant_folding",
         "//tensorflow/compiler/xla/service:hlo_cse",
         "//tensorflow/compiler/xla/service:hlo_dce",
+        "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
         "//tensorflow/compiler/xla/service:hlo_proto",
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 8ba6f48dcf..21798ed606 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -58,6 +58,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
@@ -137,6 +138,10 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) {
 
     // TODO(b/64094172): make Call work on GPU instead of inlining.
     pipeline.AddPass<CallInliner>();
+    // Convert BF16 operations to F32 operations so that the GPU backend can
+    // support BF16 operations without directly implementing a BF16 lowering for
+    // most ops.
+    pipeline.AddPass<HloElementTypeConverter>(BF16, F32);
     pipeline.AddPass<DotDecomposer>();
     {
       auto& pass =
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index 1773bb401d..c782d1b0ad 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -54,45 +54,96 @@ bool HasOperandType(HloInstruction* hlo, PrimitiveType type) {
   return false;
 }
 
+// Finds out the Tuple Shape of the new instruction after converting the element
+// type of the operands of the original instruction from `from_type` to
+// `to_type`.
+//
+// This routine assumes the resulting `shape` of the original instruction is a
+// non-nested tuple. This assumption is currently safe as only kTuple, kInfeed,
+// kOutfeed, kCall, kCustomCall and kBatchNorm* HLO instructions can produce
+// results with tuple shapes, and this routine is only called to convert the
+// result shapes of kBatchNorm* HLO instructions, which are non-nested tuples.
+Shape GetConvertedTupleShape(const Shape& shape, PrimitiveType from_type,
+                             PrimitiveType to_type) {
+  std::vector<Shape> new_tuple_subshapes;
+  for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
+    Shape subshape = ShapeUtil::GetTupleElementShape(shape, i);
+    CHECK(!ShapeUtil::IsTuple(subshape));
+    if (subshape.element_type() == from_type) {
+      subshape = ShapeUtil::ChangeElementType(subshape, to_type);
+    }
+    new_tuple_subshapes.push_back(subshape);
+  }
+  return ShapeUtil::MakeTupleShape(new_tuple_subshapes);
+}
+
+// Converts the elements of the result of `hlo` to produce a new tuple with
+// shape `to_shape`.
+//
+// This routine assumes `hlo` is an instruction that produces a non-nested Tuple
+// as a result.
+HloInstruction* ConvertTupleElements(HloInstruction* hlo,
+                                     const Shape& to_shape) {
+  const Shape& shape = hlo->shape();
+  HloComputation* computation = hlo->parent();
+  std::vector<HloInstruction*> tuple_elements;
+  for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
+    const Shape& ele_shape = ShapeUtil::GetTupleElementShape(shape, i);
+    HloInstruction* element = computation->AddInstruction(
+        HloInstruction::CreateGetTupleElement(ele_shape, hlo, i));
+    const Shape& to_ele_shape = ShapeUtil::GetTupleElementShape(to_shape, i);
+    CHECK(!ShapeUtil::IsTuple(ele_shape));
+    if (ele_shape.element_type() != to_ele_shape.element_type()) {
+      element = computation->AddInstruction(
+          HloInstruction::CreateConvert(to_ele_shape, element));
+    }
+    tuple_elements.push_back(element);
+  }
+  return computation->AddInstruction(
+      HloInstruction::CreateTuple(tuple_elements));
+}
+
 }  // namespace
 
 HloElementTypeConverter::HloElementTypeConverter(
     PrimitiveType eliminate_type, PrimitiveType replace_with_type)
     : eliminate_type_(eliminate_type), replace_with_type_(replace_with_type) {}
 
+// This routine converts the arithmetic operations in the given module that use
+// eliminate_type_ to operations that use replace_with_type_.
 StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
   XLA_VLOG_LINES(
       3, "HloElementTypeConverter::Run(), before:\n" + module->ToString());
+
+  if (eliminate_type_ == replace_with_type_) {
+    return false;
+  }
+
   bool changed = false;
   for (auto* computation : module->computations()) {
     for (auto* hlo : computation->MakeInstructionPostOrder()) {
+      const auto opcode = hlo->opcode();
       // These are ops where it does not make sense to convert them.
-      if (hlo->opcode() == HloOpcode::kParameter ||
-          hlo->opcode() == HloOpcode::kConstant ||
-          hlo->opcode() == HloOpcode::kTuple ||
-          hlo->opcode() == HloOpcode::kConvert ||
-          hlo->opcode() == HloOpcode::kGetTupleElement ||
-          hlo->opcode() == HloOpcode::kInfeed ||
-          hlo->opcode() == HloOpcode::kOutfeed) {
+      if (opcode == HloOpcode::kParameter || opcode == HloOpcode::kConstant ||
+          opcode == HloOpcode::kTuple || opcode == HloOpcode::kConvert ||
+          opcode == HloOpcode::kGetTupleElement ||
+          opcode == HloOpcode::kInfeed || opcode == HloOpcode::kOutfeed) {
         continue;
       }
 
       // We cannot change a CustomCall since we have no way of adjusting the
       // called binary to expect the updated type.
-      if (hlo->opcode() == HloOpcode::kCustomCall) {
+      if (opcode == HloOpcode::kCustomCall) {
         continue;
       }
 
       // These are ops with embedded computations where it suffices to convert
       // the embedded computations instead of converting the ops themselves.
-      if (hlo->opcode() == HloOpcode::kWhile ||
-          hlo->opcode() == HloOpcode::kCall ||
-          hlo->opcode() == HloOpcode::kFusion ||
-          hlo->opcode() == HloOpcode::kMap ||
-          hlo->opcode() == HloOpcode::kReduce ||
-          hlo->opcode() == HloOpcode::kReduceWindow ||
-          hlo->opcode() == HloOpcode::kSelectAndScatter ||
-          hlo->opcode() == HloOpcode::kConditional) {
+      if (opcode == HloOpcode::kWhile || opcode == HloOpcode::kCall ||
+          opcode == HloOpcode::kFusion || opcode == HloOpcode::kMap ||
+          opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow ||
+          opcode == HloOpcode::kSelectAndScatter ||
+          opcode == HloOpcode::kConditional) {
         continue;
       }
       TF_RET_CHECK(hlo->called_computations().empty()) << hlo->ToString();
@@ -106,6 +157,11 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
         continue;
       }
 
+      // Handle instructions that perform arithmetic operations and contain
+      // operands with eliminate_type_.
+      //
+      // First, convert the operands with eliminate_type_ to operands with
+      // replace_with_type_.
       std::vector<HloInstruction*> new_operands;
       for (HloInstruction* operand : hlo->operands()) {
         if (operand->shape().element_type() == eliminate_type_) {
@@ -114,6 +170,10 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
         new_operands.push_back(operand);
       }
 
+      // Then find out the result type of the new instruction with the same
+      // opcode but using the converted operands, create the new instruction,
+      // and convert the result of the new instruction back to match the result
+      // type of the original instruction.
       HloInstruction* new_hlo;
       if (hlo->shape().element_type() == eliminate_type_) {
         Shape shape =
@@ -121,10 +181,20 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
         new_hlo = computation->AddInstruction(
             hlo->CloneWithNewOperands(shape, new_operands, hlo->GetModule()));
         new_hlo = ToElementType(new_hlo, eliminate_type_);
+      } else if (ShapeUtil::IsTuple(hlo->shape())) {
+        Shape old_shape = hlo->shape();
+        Shape new_shape = GetConvertedTupleShape(hlo->shape(), eliminate_type_,
+                                                 replace_with_type_);
+        new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
+            new_shape, new_operands, hlo->GetModule()));
+        // Convert the elements of the result of `new_hlo` to produce a new
+        // tuple with shape `old_shape`.
+        new_hlo = ConvertTupleElements(new_hlo, old_shape);
       } else {
         new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
             hlo->shape(), new_operands, hlo->GetModule()));
       }
+
       TF_RETURN_IF_ERROR(computation->ReplaceInstruction(hlo, new_hlo));
       changed = true;
     }
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
new file mode 100644
index 0000000000..2aed82409b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
@@ -0,0 +1,123 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class HloElementTypeConverterTest : public HloTestBase {
+ public:
+  std::unique_ptr<HloModule> CreateModuleFromHloString(
+      const string& hlo_string) {
+    return HloRunner::CreateModuleFromString(hlo_string,
+                                             GetDebugOptionsForTest())
+        .ValueOrDie();
+  }
+};
+
+TEST_F(HloElementTypeConverterTest, CustomCallsNotConverted) {
+  const string& hlo_string = R"(
+    HloModule custom_call
+    ENTRY CustomCall {
+      constant = bf16[1]{0} constant({12345})
+      ROOT custom-call = bf16[1,2,3]{0,2,1} custom-call(constant),
+           custom_call_target="foo"
+    }
+  )";
+  auto module = CreateModuleFromHloString(hlo_string);
+  HloElementTypeConverter type_converter(BF16, F32);
+  auto converted = type_converter.Run(module.get()).ConsumeValueOrDie();
+  EXPECT_TRUE(converted == false);
+}
+
+TEST_F(HloElementTypeConverterTest, InfeedsOutfeedsNotConverted) {
+  const string& hlo_string = R"(
+    HloModule InfeedOutfeed
+    ENTRY RoundTrip16MiBR1.v2 {
+      ROOT infeed = bf16[4]{0} infeed()
+      outfeed = () outfeed(infeed)
+    }
+  )";
+  auto module = CreateModuleFromHloString(hlo_string);
+  HloElementTypeConverter type_converter(BF16, F32);
+  auto converted = type_converter.Run(module.get()).ConsumeValueOrDie();
+  EXPECT_TRUE(converted == false);
+}
+
+TEST_F(HloElementTypeConverterTest, OperationsInNestedTuplesConverted) {
+  const string& hlo_string = R"(
+    HloModule NestedTuples
+    ENTRY NestedTuples.v5 {
+      constant.4 = bf16[] constant(42)
+      constant.2 = f32[2]{0} constant({1, 2})
+      constant.3 = bf16[] constant(42)
+      add = bf16[] add(constant.2, constant.3)
+      tuple = (f32[2]{0}, bf16[]) tuple(constant.2, add)
+      constant.5 = bf16[2]{0} constant({22, 44})
+      ROOT tuple.1 = ((f32[2]{0}, bf16[]), bf16[2]{0}) tuple(tuple, constant.5)
+    }
+  )";
+
+  auto module = CreateModuleFromHloString(hlo_string);
+  HloElementTypeConverter type_converter(BF16, F32);
+  auto converted = type_converter.Run(module.get()).ConsumeValueOrDie();
+  EXPECT_TRUE(converted == true);
+
+  const HloInstruction* bf16_op =
+      module->entry_computation()->root_instruction()->operand(0)->operand(1);
+  EXPECT_THAT(bf16_op, op::Convert(op::Add(op::Constant(), op::Convert())));
+}
+
+TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) {
+  const string& hlo_string = R"(
+    HloModule BatchNormGrad
+    ENTRY BatchNormGrad.v6 {
+      constant.4 = bf16[2,2,2,1]{3,2,1,0} constant(bf16[2,2,2,1] { { /*i0=0*/ 
+      { /*i1=0*/ {0}, {0} }, { /*i1=1*/ {0}, {0} } }, { /*i0=1*/ { /*i1=0*/ {0},
+      {0} }, { /*i1=1*/ {0}, {0} } } })
+      constant.5 = bf16[2]{0} constant({1, 1})
+      constant.6 = bf16[2]{0} constant({0, 0})
+      constant.7 = bf16[2]{0} constant({1, 1})
+      constant.8 = bf16[2,2,2,1]{3,2,1,0} constant(bf16[2,2,2,1] { { /*i0=0*/
+      { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} } }, { /*i0=1*/ { /*i1=0*/
+      {5}, {6} }, { /*i1=1*/ {7}, {8} } } })
+      ROOT batch-norm-grad = (bf16[2,2,2,1]{3,2,1,0}, bf16[2]{0}, bf16[2]{0})
+      batch-norm-grad(constant.4, constant.5, constant.6, constant.7,
+      constant.8), epsilon=0, feature_index=2
+    }
+  )";
+
+  auto module = CreateModuleFromHloString(hlo_string);
+  HloElementTypeConverter type_converter(BF16, F32);
+  auto converted = type_converter.Run(module.get()).ConsumeValueOrDie();
+  EXPECT_TRUE(converted == true);
+
+  const HloInstruction* tuple_instr =
+      module->entry_computation()->root_instruction();
+  ::testing::Matcher<const ::xla::HloInstruction*> batch_norm =
+      op::BatchNormGrad();
+  EXPECT_THAT(tuple_instr,
+              op::Tuple(op::Convert(op::GetTupleElement(batch_norm, 0)),
+                        op::Convert(op::GetTupleElement(batch_norm, 1)),
+                        op::Convert(op::GetTupleElement(batch_norm, 2))));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 992f55788b..9206cdac05 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -83,6 +83,7 @@ HLO_MATCHER(Abs);
 HLO_MATCHER(Add);
 HLO_MATCHER(Bitcast);
 HLO_MATCHER(Broadcast);
+HLO_MATCHER(BatchNormGrad);
 HLO_MATCHER(Call);
 HLO_MATCHER(Ceil);
 HLO_MATCHER(Clamp);
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 3922c779a0..84e7c1770b 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -815,9 +815,6 @@ xla_test(
 xla_test(
     name = "bfloat16_test",
     srcs = ["bfloat16_test.cc"],
-    blacklisted_backends = [
-        "gpu",
-    ],
     shard_count = 40,
     deps = [
         ":test_utils",
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
index e47fcad475..b853dfaa15 100644
--- a/tensorflow/compiler/xla/tests/bfloat16_test.cc
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -99,8 +99,9 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR4<bfloat16>(
-           {{{{static_cast<bfloat16>(-1.7f)}, {static_cast<bfloat16>(-2.04f)}},
-             {{static_cast<bfloat16>(0.105f)}, {static_cast<bfloat16>(0.65f)}}},
+           {{{{static_cast<bfloat16>(-1.6875f)},
+              {static_cast<bfloat16>(-2.04f)}},
+             {{static_cast<bfloat16>(0.105f)}, {static_cast<bfloat16>(0.66f)}}},
             {{{static_cast<bfloat16>(1.89f)}, {static_cast<bfloat16>(3.35f)}},
              {{static_cast<bfloat16>(3.7f)}, {static_cast<bfloat16>(6.04f)}}}})
            .get(),
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index a10e17dbf3..0ceb9aff37 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -608,5 +608,28 @@ INSTANTIATE_TEST_CASE_P(
 
 );
 
+TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
+  ComputationBuilder builder(client_, TestName());
+  Shape input_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
+  Shape filter_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto filter = builder.Parameter(1, filter_shape, "filter");
+  auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+
+  Array4D<bfloat16> input_data(1, 1, 1, 2);
+  input_data.FillWithYX(Array2D<bfloat16>({
+      {bfloat16(1), bfloat16(2)},
+  }));
+  Array4D<bfloat16> filter_data(1, 1, 1, 2);
+  filter_data.FillWithYX(Array2D<bfloat16>({
+      {bfloat16(5), bfloat16(6)},
+  }));
+
+  ComputeAndCompare(&builder, conv,
+                    {std::move(*Literal::CreateFromArray(input_data)),
+                     std::move(*Literal::CreateFromArray(filter_data))},
+                    error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
index 0fb87c3c2c..6c86dd5b9e 100644
--- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
@@ -221,5 +221,77 @@ INSTANTIATE_TEST_CASE_P(MatOpsDotAddTestInstances, MatOpsDotAddTest,
                         ::testing::Combine(::testing::Bool(), ::testing::Bool(),
                                            ::testing::Bool()));
 
+class MatOpsDotAddTest_bf16
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {};
+
+TEST_P(MatOpsDotAddTest_bf16, Dot_Add_2x2_2x2) {
+  bool row_major = std::get<0>(GetParam());
+  bool add_lhs = std::get<1>(GetParam());
+  bool transpose = std::get<2>(GetParam());
+  Array2D<bfloat16> lhs(
+      {{bfloat16(1.0f), bfloat16(2.0f)}, {bfloat16(3.0), bfloat16(4.0)}});
+  Array2D<bfloat16> rhs(
+      {{bfloat16(10.0f), bfloat16(11.0f)}, {bfloat16(12.0f), bfloat16(13.0f)}});
+
+  auto minor_to_major = [](bool row_major) -> std::vector<int64> {
+    return {row_major ? 1 : 0, row_major ? 0 : 1};
+  };
+
+  auto prim_type = primitive_util::NativeToPrimitiveType<bfloat16>();
+  Shape lhs_shape =
+      ShapeUtil::MakeShape(prim_type, {lhs.height(), lhs.width()});
+  Shape rhs_shape =
+      ShapeUtil::MakeShape(prim_type, {rhs.height(), rhs.width()});
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto lhs_handle,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2DWithLayout<bfloat16>(
+              lhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto rhs_handle,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2DWithLayout<bfloat16>(
+              rhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
+
+  ComputationBuilder builder(client_, TestName());
+  auto lhs_arg = builder.Parameter(0, lhs_shape, "lhs");
+  auto lhs_mat_arg = lhs_arg;
+  if (transpose) {
+    lhs_mat_arg = builder.Transpose(lhs_mat_arg, {1, 0});
+  }
+  auto rhs_arg = builder.Parameter(1, rhs_shape, "rhs");
+  auto result = builder.Dot(lhs_mat_arg, rhs_arg);
+  Array2D<bfloat16> expected;
+  if (add_lhs) {
+    result = builder.Add(result, lhs_arg);
+    if (transpose) {
+      expected = Array2D<bfloat16>(
+          {{bfloat16(47), bfloat16(52)}, {bfloat16(71), bfloat16(78)}});
+    } else {
+      expected = Array2D<bfloat16>(
+          {{bfloat16(35), bfloat16(39)}, {bfloat16(81), bfloat16(89)}});
+    }
+  } else {
+    result = builder.Add(result, rhs_arg);
+    if (transpose) {
+      expected = Array2D<bfloat16>(
+          {{bfloat16(56), bfloat16(61)}, {bfloat16(80), bfloat16(87)}});
+    } else {
+      expected = Array2D<bfloat16>(
+          {{bfloat16(44), bfloat16(48)}, {bfloat16(90), bfloat16(98)}});
+    }
+  }
+
+  ComputeAndCompareR2<bfloat16>(&builder, expected,
+                                {lhs_handle.get(), rhs_handle.get()},
+                                ErrorSpec(1e-6));
+}
+
+INSTANTIATE_TEST_CASE_P(MatOpsDotAddTestInstances, MatOpsDotAddTest_bf16,
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()));
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 51e0765f7f..73b37e201a 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -810,9 +810,9 @@ INSTANTIATE_TEST_CASE_P(
 
 class R4ReduceWindowAnyDimsTest : public R4ReduceWindowTest {};
 
-// TODO(b/72234705): Fix the test cases failed on CPU.
+// TODO(b/72234705): Fix the test cases failed on CPU and GPU.
 XLA_TEST_P(R4ReduceWindowAnyDimsTest,
-           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DoIt))) {
+           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) {
   DoIt();
 }
 
-- 
GitLab


From fb7ca01cc0dcb11a4805a2018ebb617b28b88d97 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 23 Jan 2018 12:48:44 -0800
Subject: [PATCH 022/966] Adding tf_export decorators/calls to TensorFlow
 functions and constants.

PiperOrigin-RevId: 182977460
---
 tensorflow/python/framework/constant_op.py    |  2 +
 tensorflow/python/framework/device.py         |  2 +
 tensorflow/python/framework/dtypes.py         | 27 +++++++++++
 tensorflow/python/framework/errors_impl.py    | 45 ++++++++++++++++++-
 tensorflow/python/framework/graph_io.py       |  2 +
 .../python/framework/graph_util_impl.py       |  6 +++
 tensorflow/python/framework/importer.py       |  2 +
 tensorflow/python/framework/load_library.py   |  3 ++
 tensorflow/python/framework/ops.py            | 25 +++++++++++
 tensorflow/python/framework/random_seed.py    |  3 ++
 tensorflow/python/framework/sparse_tensor.py  |  4 ++
 tensorflow/python/framework/tensor_shape.py   |  3 ++
 tensorflow/python/framework/tensor_util.py    |  3 ++
 tensorflow/python/framework/test_util.py      |  6 +++
 tensorflow/python/framework/versions.py       |  9 ++++
 tensorflow/tools/api/generator/BUILD          |  3 ++
 16 files changed, 144 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index ac915157f5..0a56d3f64d 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -52,6 +52,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _eager_reshape(tensor, shape, ctx):
@@ -131,6 +132,7 @@ def convert_to_eager_tensor(value, ctx, dtype=None):
     return ops.EagerTensor(value, context=handle, device=device, dtype=dtype)
 
 
+@tf_export("constant")
 def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
   """Creates a constant tensor.
 
diff --git a/tensorflow/python/framework/device.py b/tensorflow/python/framework/device.py
index 8f5125dcfe..ab06a2babf 100644
--- a/tensorflow/python/framework/device.py
+++ b/tensorflow/python/framework/device.py
@@ -19,8 +19,10 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("DeviceSpec")
 class DeviceSpec(object):
   """Represents a (possibly partial) specification for a TensorFlow device.
 
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index b0422eb6be..67ccf990d6 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -23,11 +23,13 @@ import numpy as np
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.util.tf_export import tf_export
 
 
 _np_bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
 
 
+@tf_export("DType")
 class DType(object):
   """Represents the type of the elements in a `Tensor`.
 
@@ -321,32 +323,55 @@ dtype_range = {np.bool_: (False, True),
 
 # Define standard wrappers for the types_pb2.DataType enum.
 resource = DType(types_pb2.DT_RESOURCE)
+tf_export("resource").export_constant(__name__, "resource")
 variant = DType(types_pb2.DT_VARIANT)
+tf_export("variant").export_constant(__name__, "variant")
 float16 = DType(types_pb2.DT_HALF)
+tf_export("float16").export_constant(__name__, "float16")
 half = float16
+tf_export("half").export_constant(__name__, "half")
 float32 = DType(types_pb2.DT_FLOAT)
+tf_export("float32").export_constant(__name__, "float32")
 float64 = DType(types_pb2.DT_DOUBLE)
+tf_export("float64").export_constant(__name__, "float64")
 double = float64
+tf_export("double").export_constant(__name__, "double")
 int32 = DType(types_pb2.DT_INT32)
+tf_export("int32").export_constant(__name__, "int32")
 uint8 = DType(types_pb2.DT_UINT8)
+tf_export("uint8").export_constant(__name__, "uint8")
 uint16 = DType(types_pb2.DT_UINT16)
+tf_export("uint16").export_constant(__name__, "uint16")
 uint32 = DType(types_pb2.DT_UINT32)
 uint64 = DType(types_pb2.DT_UINT64)
 int16 = DType(types_pb2.DT_INT16)
+tf_export("int16").export_constant(__name__, "int16")
 int8 = DType(types_pb2.DT_INT8)
+tf_export("int8").export_constant(__name__, "int8")
 string = DType(types_pb2.DT_STRING)
+tf_export("string").export_constant(__name__, "string")
 complex64 = DType(types_pb2.DT_COMPLEX64)
+tf_export("complex64").export_constant(__name__, "complex64")
 complex128 = DType(types_pb2.DT_COMPLEX128)
+tf_export("complex128").export_constant(__name__, "complex128")
 int64 = DType(types_pb2.DT_INT64)
+tf_export("int64").export_constant(__name__, "int64")
 bool = DType(types_pb2.DT_BOOL)
+tf_export("bool").export_constant(__name__, "bool")
 qint8 = DType(types_pb2.DT_QINT8)
+tf_export("qint8").export_constant(__name__, "qint8")
 quint8 = DType(types_pb2.DT_QUINT8)
+tf_export("quint8").export_constant(__name__, "quint8")
 qint16 = DType(types_pb2.DT_QINT16)
+tf_export("qint16").export_constant(__name__, "qint16")
 quint16 = DType(types_pb2.DT_QUINT16)
+tf_export("quint16").export_constant(__name__, "quint16")
 qint32 = DType(types_pb2.DT_QINT32)
+tf_export("qint32").export_constant(__name__, "qint32")
 resource_ref = DType(types_pb2.DT_RESOURCE_REF)
 variant_ref = DType(types_pb2.DT_VARIANT_REF)
 bfloat16 = DType(types_pb2.DT_BFLOAT16)
+tf_export("bfloat16").export_constant(__name__, "bfloat16")
 float16_ref = DType(types_pb2.DT_HALF_REF)
 half_ref = float16_ref
 float32_ref = DType(types_pb2.DT_FLOAT_REF)
@@ -578,8 +603,10 @@ _TF_TO_NP = {
 QUANTIZED_DTYPES = frozenset(
     [qint8, quint8, qint16, quint16, qint32, qint8_ref, quint8_ref, qint16_ref,
      quint16_ref, qint32_ref])
+tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES")
 
 
+@tf_export("as_dtype")
 def as_dtype(type_value):
   """Converts the given `type_value` to a `DType`.
 
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index c3b2c498c3..2a40316d51 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -25,8 +25,10 @@ from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("OpError", "errors.OpError")
 class OpError(Exception):
   """A generic error that is raised when TensorFlow execution fails.
 
@@ -133,25 +135,48 @@ class OpError(Exception):
 
 
 OK = error_codes_pb2.OK
+tf_export("errors.OK").export_constant(__name__, "OK")
 CANCELLED = error_codes_pb2.CANCELLED
+tf_export("errors.CANCELLED").export_constant(__name__, "CANCELLED")
 UNKNOWN = error_codes_pb2.UNKNOWN
+tf_export("errors.UNKNOWN").export_constant(__name__, "UNKNOWN")
 INVALID_ARGUMENT = error_codes_pb2.INVALID_ARGUMENT
+tf_export("errors.INVALID_ARGUMENT").export_constant(__name__,
+                                                     "INVALID_ARGUMENT")
 DEADLINE_EXCEEDED = error_codes_pb2.DEADLINE_EXCEEDED
+tf_export("errors.DEADLINE_EXCEEDED").export_constant(__name__,
+                                                      "DEADLINE_EXCEEDED")
 NOT_FOUND = error_codes_pb2.NOT_FOUND
+tf_export("errors.NOT_FOUND").export_constant(__name__, "NOT_FOUND")
 ALREADY_EXISTS = error_codes_pb2.ALREADY_EXISTS
+tf_export("errors.ALREADY_EXISTS").export_constant(__name__, "ALREADY_EXISTS")
 PERMISSION_DENIED = error_codes_pb2.PERMISSION_DENIED
+tf_export("errors.PERMISSION_DENIED").export_constant(__name__,
+                                                      "PERMISSION_DENIED")
 UNAUTHENTICATED = error_codes_pb2.UNAUTHENTICATED
+tf_export("errors.UNAUTHENTICATED").export_constant(__name__, "UNAUTHENTICATED")
 RESOURCE_EXHAUSTED = error_codes_pb2.RESOURCE_EXHAUSTED
+tf_export("errors.RESOURCE_EXHAUSTED").export_constant(__name__,
+                                                       "RESOURCE_EXHAUSTED")
 FAILED_PRECONDITION = error_codes_pb2.FAILED_PRECONDITION
+tf_export("errors.FAILED_PRECONDITION").export_constant(__name__,
+                                                        "FAILED_PRECONDITION")
 ABORTED = error_codes_pb2.ABORTED
+tf_export("errors.ABORTED").export_constant(__name__, "ABORTED")
 OUT_OF_RANGE = error_codes_pb2.OUT_OF_RANGE
+tf_export("errors.OUT_OF_RANGE").export_constant(__name__, "OUT_OF_RANGE")
 UNIMPLEMENTED = error_codes_pb2.UNIMPLEMENTED
+tf_export("errors.UNIMPLEMENTED").export_constant(__name__, "UNIMPLEMENTED")
 INTERNAL = error_codes_pb2.INTERNAL
+tf_export("errors.INTERNAL").export_constant(__name__, "INTERNAL")
 UNAVAILABLE = error_codes_pb2.UNAVAILABLE
+tf_export("errors.UNAVAILABLE").export_constant(__name__, "UNAVAILABLE")
 DATA_LOSS = error_codes_pb2.DATA_LOSS
+tf_export("errors.DATA_LOSS").export_constant(__name__, "DATA_LOSS")
 
 
 # pylint: disable=line-too-long
+@tf_export("errors.CancelledError")
 class CancelledError(OpError):
   """Raised when an operation or step is cancelled.
 
@@ -172,6 +197,7 @@ class CancelledError(OpError):
 # pylint: enable=line-too-long
 
 
+@tf_export("errors.UnknownError")
 class UnknownError(OpError):
   """Unknown error.
 
@@ -189,6 +215,7 @@ class UnknownError(OpError):
     super(UnknownError, self).__init__(node_def, op, message, error_code)
 
 
+@tf_export("errors.InvalidArgumentError")
 class InvalidArgumentError(OpError):
   """Raised when an operation receives an invalid argument.
 
@@ -209,6 +236,7 @@ class InvalidArgumentError(OpError):
                                                INVALID_ARGUMENT)
 
 
+@tf_export("errors.DeadlineExceededError")
 class DeadlineExceededError(OpError):
   """Raised when a deadline expires before an operation could complete.
 
@@ -223,6 +251,7 @@ class DeadlineExceededError(OpError):
                                                 DEADLINE_EXCEEDED)
 
 
+@tf_export("errors.NotFoundError")
 class NotFoundError(OpError):
   """Raised when a requested entity (e.g., a file or directory) was not found.
 
@@ -239,6 +268,7 @@ class NotFoundError(OpError):
     super(NotFoundError, self).__init__(node_def, op, message, NOT_FOUND)
 
 
+@tf_export("errors.AlreadyExistsError")
 class AlreadyExistsError(OpError):
   """Raised when an entity that we attempted to create already exists.
 
@@ -256,6 +286,7 @@ class AlreadyExistsError(OpError):
                                              ALREADY_EXISTS)
 
 
+@tf_export("errors.PermissionDeniedError")
 class PermissionDeniedError(OpError):
   """Raised when the caller does not have permission to run an operation.
 
@@ -273,6 +304,7 @@ class PermissionDeniedError(OpError):
                                                 PERMISSION_DENIED)
 
 
+@tf_export("errors.UnauthenticatedError")
 class UnauthenticatedError(OpError):
   """The request does not have valid authentication credentials.
 
@@ -287,6 +319,7 @@ class UnauthenticatedError(OpError):
                                                UNAUTHENTICATED)
 
 
+@tf_export("errors.ResourceExhaustedError")
 class ResourceExhaustedError(OpError):
   """Some resource has been exhausted.
 
@@ -302,6 +335,7 @@ class ResourceExhaustedError(OpError):
                                                  RESOURCE_EXHAUSTED)
 
 
+@tf_export("errors.FailedPreconditionError")
 class FailedPreconditionError(OpError):
   """Operation was rejected because the system is not in a state to execute it.
 
@@ -318,6 +352,7 @@ class FailedPreconditionError(OpError):
                                                   FAILED_PRECONDITION)
 
 
+@tf_export("errors.AbortedError")
 class AbortedError(OpError):
   """The operation was aborted, typically due to a concurrent action.
 
@@ -335,6 +370,7 @@ class AbortedError(OpError):
     super(AbortedError, self).__init__(node_def, op, message, ABORTED)
 
 
+@tf_export("errors.OutOfRangeError")
 class OutOfRangeError(OpError):
   """Raised when an operation iterates past the valid input range.
 
@@ -353,6 +389,7 @@ class OutOfRangeError(OpError):
                                           OUT_OF_RANGE)
 
 
+@tf_export("errors.UnimplementedError")
 class UnimplementedError(OpError):
   """Raised when an operation has not been implemented.
 
@@ -371,6 +408,7 @@ class UnimplementedError(OpError):
                                              UNIMPLEMENTED)
 
 
+@tf_export("errors.InternalError")
 class InternalError(OpError):
   """Raised when the system experiences an internal error.
 
@@ -385,6 +423,7 @@ class InternalError(OpError):
     super(InternalError, self).__init__(node_def, op, message, INTERNAL)
 
 
+@tf_export("errors.UnavailableError")
 class UnavailableError(OpError):
   """Raised when the runtime is currently unavailable.
 
@@ -399,6 +438,7 @@ class UnavailableError(OpError):
                                            UNAVAILABLE)
 
 
+@tf_export("errors.DataLossError")
 class DataLossError(OpError):
   """Raised when unrecoverable data loss or corruption is encountered.
 
@@ -437,10 +477,12 @@ _EXCEPTION_CLASS_TO_CODE = dict((
     (class_, code) for (code, class_) in _CODE_TO_EXCEPTION_CLASS.items()))
 
 
+@tf_export("errors.exception_type_from_error_code")
 def exception_type_from_error_code(error_code):
   return _CODE_TO_EXCEPTION_CLASS[error_code]
 
 
+@tf_export("errors.error_code_from_exception_type")
 def error_code_from_exception_type(cls):
   return _EXCEPTION_CLASS_TO_CODE[cls]
 
@@ -457,7 +499,8 @@ def _make_specific_exception(node_def, op, message, error_code):
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
-class raise_exception_on_not_ok_status(object):  # pylint: disable=invalid-name
+@tf_export("errors.raise_exception_on_not_ok_status")  # pylint: disable=invalid-name
+class raise_exception_on_not_ok_status(object):
   """Context manager to check for C API status."""
 
   def __enter__(self):
diff --git a/tensorflow/python/framework/graph_io.py b/tensorflow/python/framework/graph_io.py
index a0ea4ad48e..be30b16f5f 100644
--- a/tensorflow/python/framework/graph_io.py
+++ b/tensorflow/python/framework/graph_io.py
@@ -24,8 +24,10 @@ import os.path
 from google.protobuf import text_format
 from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('train.write_graph')
 def write_graph(graph_or_graph_def, logdir, name, as_text=True):
   """Writes a graph proto to a file.
 
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 6c7b455388..5a543317e6 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 _VARIABLE_OPS = {
     "Assign",
@@ -49,6 +50,7 @@ def _is_variable_op(op):
   return op in _VARIABLE_OPS
 
 
+@tf_export("graph_util.must_run_on_cpu")
 def must_run_on_cpu(node, pin_variables_on_cpu=False):
   """Returns True if the given node_def must run on CPU, otherwise False.
 
@@ -147,6 +149,7 @@ def _bfs_for_reachable_nodes(target_nodes, name_to_input_name):
   return nodes_to_keep
 
 
+@tf_export("graph_util.extract_sub_graph")
 def extract_sub_graph(graph_def, dest_nodes):
   """Extract the subgraph that can reach any of the nodes in 'dest_nodes'.
 
@@ -184,6 +187,7 @@ def extract_sub_graph(graph_def, dest_nodes):
   return out
 
 
+@tf_export("graph_util.tensor_shape_from_node_def_name")
 def tensor_shape_from_node_def_name(graph, input_name):
   """Convenience function to get a shape from a NodeDef's input string."""
   # To get a tensor, the name must be in the form <input>:<port>, for example
@@ -198,6 +202,7 @@ def tensor_shape_from_node_def_name(graph, input_name):
   return shape
 
 
+@tf_export("graph_util.convert_variables_to_constants")
 def convert_variables_to_constants(sess,
                                    input_graph_def,
                                    output_node_names,
@@ -270,6 +275,7 @@ def convert_variables_to_constants(sess,
   return output_graph_def
 
 
+@tf_export("graph_util.remove_training_nodes")
 def remove_training_nodes(input_graph, protected_nodes=None):
   """Prunes out nodes that aren't needed for inference.
 
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index a3dbe43f06..00fff8d040 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -36,6 +36,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(josh11b): SWIG the code from node_def_util instead of duplicating
@@ -369,6 +370,7 @@ def _GatherReturnElements(requested_return_elements, graph, results):
   return combined_return_elements
 
 
+@tf_export('import_graph_def')
 @deprecated_args(None, 'Please file an issue at '
                  'https://github.com/tensorflow/tensorflow/issues if you depend'
                  ' on this feature.',
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 909e6d4c7b..c997ead829 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -28,8 +28,10 @@ from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.python import pywrap_tensorflow as py_tf
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('load_op_library')
 def load_op_library(library_filename):
   """Loads a TensorFlow plugin, containing custom ops and kernels.
 
@@ -79,6 +81,7 @@ def load_op_library(library_filename):
   return module
 
 
+@tf_export('load_file_system_library')
 def load_file_system_library(library_filename):
   """Loads a TensorFlow plugin, containing file system implementation.
 
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 2489982d93..ce9ca07215 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -55,6 +55,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import decorator_utils
 from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Temporary global switch determining if we should enable the work-in-progress
@@ -191,6 +192,7 @@ class _TensorLike(object):
   pass
 
 
+@tf_export("Tensor")
 class Tensor(_TensorLike):
   """Represents one of the outputs of an `Operation`.
 
@@ -871,6 +873,7 @@ _tensor_conversion_func_lock = threading.Lock()
 register_dense_tensor_like_type(Tensor)
 
 
+@tf_export("convert_to_tensor")
 def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
   """Converts the given `value` to a `Tensor`.
 
@@ -1116,6 +1119,7 @@ def convert_n_to_tensor(values, dtype=None, name=None, preferred_dtype=None):
       as_ref=False)
 
 
+@tf_export("convert_to_tensor_or_indexed_slices")
 def convert_to_tensor_or_indexed_slices(value, dtype=None, name=None):
   """Converts the given object to a `Tensor` or an `IndexedSlices`.
 
@@ -1246,6 +1250,7 @@ def convert_n_to_tensor_or_indexed_slices(values, dtype=None, name=None):
 
 
 # TODO(josh11b): Add ctx argument to conversion_func() signature.
+@tf_export("register_tensor_conversion_function")
 def register_tensor_conversion_function(base_type,
                                         conversion_func,
                                         priority=100):
@@ -1306,6 +1311,7 @@ def register_tensor_conversion_function(base_type,
     _tensor_conversion_func_cache = {}
 
 
+@tf_export("IndexedSlices")
 class IndexedSlices(_TensorLike):
   """A sparse representation of a set of tensor slices at given indices.
 
@@ -1486,6 +1492,7 @@ def _create_c_op(graph, node_def, inputs, control_inputs):
   return c_op
 
 
+@tf_export("Operation")
 class Operation(object):
   """Represents a graph node that performs computation on tensors.
 
@@ -2213,6 +2220,7 @@ class Operation(object):
 _gradient_registry = registry.Registry("gradient")
 
 
+@tf_export("RegisterGradient")
 class RegisterGradient(object):
   """A decorator for registering the gradient function for an op type.
 
@@ -2255,6 +2263,7 @@ class RegisterGradient(object):
     return f
 
 
+@tf_export("NoGradient", "NotDifferentiable")
 def NotDifferentiable(op_type):
   """Specifies that ops of type `op_type` is not differentiable.
 
@@ -2574,6 +2583,7 @@ def _name_from_scope_name(name):
   return name[:-1] if (name and name[-1] == "/") else name
 
 
+@tf_export("Graph")
 class Graph(object):
   """A TensorFlow computation, represented as a dataflow graph.
 
@@ -4591,6 +4601,9 @@ class Graph(object):
 
 # TODO(agarwal): currently device directives in an outer eager scope will not
 # apply to inner graph mode code. Fix that.
+
+
+@tf_export("device")
 def device(device_name_or_function):
   """Wrapper for `Graph.device()` using the default graph.
 
@@ -4620,6 +4633,7 @@ def device(device_name_or_function):
     return context.device(device_name_or_function)
 
 
+@tf_export("container")
 def container(container_name):
   """Wrapper for `Graph.container()` using the default graph.
 
@@ -4633,6 +4647,7 @@ def container(container_name):
   return get_default_graph().container(container_name)
 
 
+@tf_export("colocate_with")
 def colocate_with(op, ignore_existing=False):
   if context.in_graph_mode():
     return get_default_graph().colocate_with(op, ignore_existing)
@@ -4643,6 +4658,7 @@ def colocate_with(op, ignore_existing=False):
       return _NullContextmanager()
 
 
+@tf_export("control_dependencies")
 def control_dependencies(control_inputs):
   """Wrapper for `Graph.control_dependencies()` using the default graph.
 
@@ -4760,6 +4776,7 @@ def default_session(session):
   return _default_session_stack.get_controller(session)
 
 
+@tf_export("get_default_session")
 def get_default_session():
   """Returns the default session for the current thread.
 
@@ -5049,6 +5066,7 @@ def eager_run(main=None, argv=None):
   app.run(main, argv)
 
 
+@tf_export("reset_default_graph")
 def reset_default_graph():
   """Clears the default graph stack and resets the global default graph.
 
@@ -5067,6 +5085,7 @@ def reset_default_graph():
   _default_graph_stack.reset()
 
 
+@tf_export("get_default_graph")
 def get_default_graph():
   """Returns the default graph for the current thread.
 
@@ -5187,6 +5206,7 @@ def _get_graph_from_inputs(op_input_list, graph=None):
   return graph or get_default_graph()
 
 
+@tf_export("GraphKeys")
 class GraphKeys(object):
   """Standard names to use for graph collections.
 
@@ -5335,6 +5355,7 @@ class GraphKeys(object):
     return cls.GLOBAL_VARIABLES
 
 
+@tf_export("add_to_collection")
 def add_to_collection(name, value):
   """Wrapper for `Graph.add_to_collection()` using the default graph.
 
@@ -5371,6 +5392,7 @@ def add_to_collections(names, value):
   get_default_graph().add_to_collections(names, value)
 
 
+@tf_export("get_collection_ref")
 def get_collection_ref(key):
   """Wrapper for `Graph.get_collection_ref()` using the default graph.
 
@@ -5394,6 +5416,7 @@ def get_collection_ref(key):
   return get_default_graph().get_collection_ref(key)
 
 
+@tf_export("get_collection")
 def get_collection(key, scope=None):
   """Wrapper for `Graph.get_collection()` using the default graph.
 
@@ -5430,6 +5453,7 @@ def get_all_collection_keys():
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
+@tf_export("name_scope", "keras.backend.name_scope")
 class name_scope(object):  # pylint: disable=invalid-name
   """A context manager for use when defining a Python op.
 
@@ -5576,6 +5600,7 @@ def prepend_name_scope(name, import_scope):
 
 # pylint: disable=g-doc-return-or-yield
 # pylint: disable=not-context-manager
+@tf_export("op_scope")
 @tf_contextlib.contextmanager
 def op_scope(values, name, default_name=None):
   """DEPRECATED. Same as name_scope above, just different argument order."""
diff --git a/tensorflow/python/framework/random_seed.py b/tensorflow/python/framework/random_seed.py
index 5f1130570d..1e74a790a3 100644
--- a/tensorflow/python/framework/random_seed.py
+++ b/tensorflow/python/framework/random_seed.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 DEFAULT_GRAPH_SEED = 87654321
@@ -32,6 +33,7 @@ def _truncate_seed(seed):
   return seed % _MAXINT32  # Truncate to fit into 32-bit integer
 
 
+@tf_export('get_seed')
 def get_seed(op_seed):
   """Returns the local seeds an operation should use given an op-specific seed.
 
@@ -78,6 +80,7 @@ def get_seed(op_seed):
   return seeds
 
 
+@tf_export('set_random_seed')
 def set_random_seed(seed):
   """Sets the graph-level random seed.
 
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 6218cc34ca..1fe81e5f17 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -23,6 +23,7 @@ import collections
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=protected-access
 _TensorLike = ops._TensorLike
@@ -31,6 +32,7 @@ _override_helper = ops._override_helper
 # pylint: enable=protected-access
 
 
+@tf_export("SparseTensor")
 class SparseTensor(_TensorLike):
   """Represents a sparse tensor.
 
@@ -222,8 +224,10 @@ class SparseTensor(_TensorLike):
 
 SparseTensorValue = collections.namedtuple(
     "SparseTensorValue", ["indices", "values", "dense_shape"])
+tf_export("SparseTensorValue")(SparseTensorValue)
 
 
+@tf_export("convert_to_tensor_or_sparse_tensor")
 def convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None):
   """Converts value to a `SparseTensor` or `Tensor`.
 
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 54ec15ea66..222071cb9e 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -19,8 +19,10 @@ from __future__ import print_function
 
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("Dimension")
 class Dimension(object):
   """Represents the value of one dimension in a TensorShape."""
 
@@ -397,6 +399,7 @@ def as_dimension(value):
     return Dimension(value)
 
 
+@tf_export("TensorShape")
 class TensorShape(object):
   """Represents the shape of a `Tensor`.
 
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 1b90c7ad4d..d2b8e80305 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -38,6 +38,7 @@ except ImportError:
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=g-import-not-at-top
 
 
@@ -328,6 +329,7 @@ def _AssertCompatible(values, dtype):
                       (dtype.name, repr(mismatch), type(mismatch).__name__))
 
 
+@tf_export("make_tensor_proto")
 def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
   """Create a TensorProto.
 
@@ -515,6 +517,7 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
   return tensor_proto
 
 
+@tf_export("make_ndarray")
 def MakeNdarray(tensor):
   """Create a numpy ndarray from a tensor.
 
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 729c939870..8124472a83 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -65,8 +65,10 @@ from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util.protobuf import compare
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("test.gpu_device_name")
 def gpu_device_name():
   """Returns the name of a GPU device if available or the empty string."""
   for x in device_lib.list_local_devices():
@@ -101,6 +103,7 @@ def assert_ops_in_graph(expected_ops, graph):
   return actual_ops
 
 
+@tf_export("test.assert_equal_graph_def")
 def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
   """Asserts that two `GraphDef`s are (mostly) the same.
 
@@ -630,6 +633,7 @@ def run_in_graph_and_eager_modes(
   return decorator
 
 
+@tf_export("test.is_gpu_available")
 def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
   """Returns whether TensorFlow can access a GPU.
 
@@ -678,6 +682,7 @@ def device(use_gpu):
     yield
 
 
+@tf_export("test.TestCase")
 class TensorFlowTestCase(googletest.TestCase):
   """Base class for tests that need to test TensorFlow.
   """
@@ -1326,6 +1331,7 @@ class TensorFlowTestCase(googletest.TestCase):
     # pylint: enable=invalid-name
 
 
+@tf_export("test.create_local_cluster")
 def create_local_cluster(num_workers, num_ps, protocol="grpc",
                          worker_config=None, ps_config=None):
   """Create and start local servers and return the associated `Server` objects.
diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index f03b81eb28..bdcbc15af6 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.util.tf_export import tf_export
 
 __version__ = pywrap_tensorflow.__version__
 __git_version__ = pywrap_tensorflow.__git_version__
@@ -28,16 +29,24 @@ __cxx11_abi_flag__ = pywrap_tensorflow.__cxx11_abi_flag__
 __monolithic_build__ = pywrap_tensorflow.__monolithic_build__
 
 VERSION = __version__
+tf_export("VERSION").export_constant(__name__, "VERSION")
 GIT_VERSION = __git_version__
+tf_export("GIT_VERSION").export_constant(__name__, "GIT_VERSION")
 COMPILER_VERSION = __compiler_version__
+tf_export("COMPILER_VERSION").export_constant(__name__, "COMPILER_VERSION")
 CXX11_ABI_FLAG = __cxx11_abi_flag__
 MONOLITHIC_BUILD = __monolithic_build__
 
 GRAPH_DEF_VERSION = pywrap_tensorflow.GRAPH_DEF_VERSION
+tf_export("GRAPH_DEF_VERSION").export_constant(__name__, "GRAPH_DEF_VERSION")
 GRAPH_DEF_VERSION_MIN_CONSUMER = (
     pywrap_tensorflow.GRAPH_DEF_VERSION_MIN_CONSUMER)
+tf_export("GRAPH_DEF_VERSION_MIN_CONSUMER").export_constant(
+    __name__, "GRAPH_DEF_VERSION_MIN_CONSUMER")
 GRAPH_DEF_VERSION_MIN_PRODUCER = (
     pywrap_tensorflow.GRAPH_DEF_VERSION_MIN_PRODUCER)
+tf_export("GRAPH_DEF_VERSION_MIN_PRODUCER").export_constant(
+    __name__, "GRAPH_DEF_VERSION_MIN_PRODUCER")
 
 __all__ = [
     "__version__",
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 8bbd247818..4c47ffe07c 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -48,6 +48,7 @@ genrule(
         "api/contrib/stat_summarizer/__init__.py",
         "api/distributions/__init__.py",
         "api/distributions/bijectors/__init__.py",
+        "api/errors/__init__.py",
         "api/image/__init__.py",
         "api/linalg/__init__.py",
         "api/nn/__init__.py",
@@ -55,7 +56,9 @@ genrule(
         "api/train/__init__.py",
         "api/app/__init__.py",
         "api/gfile/__init__.py",
+        "api/graph_util/__init__.py",
         "api/keras/__init__.py",
+        "api/keras/backend/__init__.py",
         "api/keras/datasets/__init__.py",
         "api/keras/datasets/boston_housing/__init__.py",
         "api/keras/datasets/cifar10/__init__.py",
-- 
GitLab


From 1ef8e4d519519b0d1bb84e31290db6830fb28887 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 12:52:36 -0800
Subject: [PATCH 023/966] Go: Update generated wrapper functions for TensorFlow
 ops.

PiperOrigin-RevId: 182977979
---
 tensorflow/go/op/wrappers.go | 1186 +++++++++++++++++-----------------
 1 file changed, 593 insertions(+), 593 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 8df4e98adc..2f1f3b0be4 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -163,6 +163,63 @@ func WriteSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Outp
 	return scope.AddOperation(opspec)
 }
 
+// Creates summary database writer accessible by given resource handle.
+//
+// This can be used to write tensors from the execution graph directly
+// to a database. Only SQLite is supported right now. This function
+// will create the schema if it doesn't exist. Entries in the Users,
+// Experiments, and Runs tables will be created automatically if they
+// don't already exist.
+//
+// Arguments:
+//	writer: Handle to SummaryWriter resource to overwrite.
+//	db_uri: For example "file:/tmp/foo.sqlite".
+//	experiment_name: Can't contain ASCII control characters or <>. Case
+// sensitive. If empty, then the Run will not be associated with any
+// Experiment.
+//	run_name: Can't contain ASCII control characters or <>. Case sensitive.
+// If empty, then each Tag will not be associated with any Run.
+//	user_name: Must be valid as both a DNS label and Linux username. If
+// empty, then the Experiment will not be associated with any User.
+//
+// Returns the created operation.
+func CreateSummaryDbWriter(scope *Scope, writer tf.Output, db_uri tf.Output, experiment_name tf.Output, run_name tf.Output, user_name tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "CreateSummaryDbWriter",
+		Input: []tf.Input{
+			writer, db_uri, experiment_name, run_name, user_name,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a summary file writer accessible by the given resource handle.
+//
+// Arguments:
+//	writer: A handle to the summary writer resource
+//	logdir: Directory where the event file will be written.
+//	max_queue: Size of the queue of pending events and summaries.
+//	flush_millis: How often, in milliseconds, to flush the pending events and
+// summaries to disk.
+//	filename_suffix: Every event file's name is suffixed with this suffix.
+//
+// Returns the created operation.
+func CreateSummaryFileWriter(scope *Scope, writer tf.Output, logdir tf.Output, max_queue tf.Output, flush_millis tf.Output, filename_suffix tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "CreateSummaryFileWriter",
+		Input: []tf.Input{
+			writer, logdir, max_queue, flush_millis, filename_suffix,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
 // For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
@@ -3164,39 +3221,6 @@ func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output,
 	return op.Output(0)
 }
 
-// Creates summary database writer accessible by given resource handle.
-//
-// This can be used to write tensors from the execution graph directly
-// to a database. Only SQLite is supported right now. This function
-// will create the schema if it doesn't exist. Entries in the Users,
-// Experiments, and Runs tables will be created automatically if they
-// don't already exist.
-//
-// Arguments:
-//	writer: Handle to SummaryWriter resource to overwrite.
-//	db_uri: For example "file:/tmp/foo.sqlite".
-//	experiment_name: Can't contain ASCII control characters or <>. Case
-// sensitive. If empty, then the Run will not be associated with any
-// Experiment.
-//	run_name: Can't contain ASCII control characters or <>. Case sensitive.
-// If empty, then each Tag will not be associated with any Run.
-//	user_name: Must be valid as both a DNS label and Linux username. If
-// empty, then the Experiment will not be associated with any User.
-//
-// Returns the created operation.
-func CreateSummaryDbWriter(scope *Scope, writer tf.Output, db_uri tf.Output, experiment_name tf.Output, run_name tf.Output, user_name tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "CreateSummaryDbWriter",
-		Input: []tf.Input{
-			writer, db_uri, experiment_name, run_name, user_name,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Adds Tensor 'bias' to Tensor 'input' for Quantized types.
 //
 // Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
@@ -7474,30 +7498,6 @@ func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...Va
 	return op.Output(0)
 }
 
-// Creates a summary file writer accessible by the given resource handle.
-//
-// Arguments:
-//	writer: A handle to the summary writer resource
-//	logdir: Directory where the event file will be written.
-//	max_queue: Size of the queue of pending events and summaries.
-//	flush_millis: How often, in milliseconds, to flush the pending events and
-// summaries to disk.
-//	filename_suffix: Every event file's name is suffixed with this suffix.
-//
-// Returns the created operation.
-func CreateSummaryFileWriter(scope *Scope, writer tf.Output, logdir tf.Output, max_queue tf.Output, flush_millis tf.Output, filename_suffix tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "CreateSummaryFileWriter",
-		Input: []tf.Input{
-			writer, logdir, max_queue, flush_millis, filename_suffix,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Elementwise computes the bitwise XOR of `x` and `y`.
 //
 // The result will have those bits set, that are different in `x` and `y`. The
@@ -10955,139 +10955,298 @@ func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_s
 	return op.Output(0)
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Flushes the writer's unwritten events.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	writer: A handle to the summary writer resource.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns the created operation.
+func FlushSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
+		Type: "FlushSummaryWriter",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			writer,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
 
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["mode"] = value
 	}
 }
 
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["round_mode"] = value
 	}
 }
 
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// want to use Nesterov momentum.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8, out[i] -= (range(T) + 1) / 2.0
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-// Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
-		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns the truth value of (x >= y) element-wise.
+// *MIN_COMBINED Mode Example*
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
-
-// Conv3DDataFormat sets the optional data_format attribute to value.
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DDilations sets the optional dilations attribute to value.
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
+//
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
+// ```
+//
+// Now we can quantize the elements of our tensor:
+// ```c++
+// result = round(input * s)
+// ```
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
+//
+// Arguments:
+//
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizeV2",
+		Input: []tf.Input{
+			input, min_range, max_range,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Component-wise divides a SparseTensor by a dense Tensor.
+//
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseDiv",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
+
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+//
+// want to use Nesterov momentum.
+//
+// accum = accum * momentum + grad
+// var -= lr * accum
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns the truth value of (x >= y) element-wise.
+//
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GreaterEqual",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
+
+// Conv3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv3DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
 	}
 }
 
@@ -14467,230 +14626,18 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 	return op.Output(0)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
 
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["out_type"] = value
 	}
 }
 
-// Computes gradients of average pooling function.
-//
-// Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
-		Input: []tf.Input{
-			orig_input_shape, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
-type ParseSingleSequenceExampleAttr func(optionalAttr)
-
-// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
-//
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
-//
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
-// The shape of context_dense_values[j] will match context_dense_shapes[j].
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
-//
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
-//
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
-	}
-}
-
-// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
-//
-// Arguments:
-//	serialized: A scalar containing a binary serialized SequenceExample proto.
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExample.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExample.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	debug_name: A scalar containing the name of the serialized proto.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty scalar if no name is available.
-func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ParseSingleSequenceExample",
-		Input: []tf.Input{
-			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
-}
-
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
-
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
 //
 // value: 1-D tensor of length 4.  The dilation factor for each dimension of
 // `input`. If set to k > 1, there will be k-1 skipped cells between each
@@ -18537,34 +18484,6 @@ func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
 	return scope.AddOperation(opspec)
 }
 
-// Adjust the hue of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
-//
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
-//
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
-//
-// Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AdjustHue",
-		Input: []tf.Input{
-			images, delta,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
 type ResourceApplyAdamAttr func(optionalAttr)
 
@@ -25550,34 +25469,274 @@ func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// DecodeWavAttr is an optional argument to DecodeWav.
-type DecodeWavAttr func(optionalAttr)
-
-// DecodeWavDesiredChannels sets the optional desired_channels attribute to value.
-//
-// value: Number of sample channels wanted.
-// If not specified, defaults to -1
-func DecodeWavDesiredChannels(value int64) DecodeWavAttr {
-	return func(m optionalAttr) {
-		m["desired_channels"] = value
-	}
-}
-
-// DecodeWavDesiredSamples sets the optional desired_samples attribute to value.
+// Adjust the hue of one or more images.
 //
-// value: Length of audio requested.
-// If not specified, defaults to -1
-func DecodeWavDesiredSamples(value int64) DecodeWavAttr {
-	return func(m optionalAttr) {
-		m["desired_samples"] = value
-	}
-}
-
-// Decode a 16-bit PCM WAV file to a float tensor.
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
 //
-// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied all the hue values,
+// and then remapped back to RGB colorspace.
 //
-// When desired_channels is set, if the input contains fewer channels than this
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
+//
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustHue",
+		Input: []tf.Input{
+			images, delta,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
+
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of average pooling function.
+//
+// Arguments:
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPool3DGrad",
+		Input: []tf.Input{
+			orig_input_shape, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
+type ParseSingleSequenceExampleAttr func(optionalAttr)
+
+// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+//
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_sparse_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+//
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
+// The shape of context_dense_values[j] will match context_dense_shapes[j].
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_dense_shapes"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+//
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_sparse_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+//
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A scalar containing a binary serialized SequenceExample proto.
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExample.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExample.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	debug_name: A scalar containing the name of the serialized proto.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty scalar if no name is available.
+func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ParseSingleSequenceExample",
+		Input: []tf.Input{
+			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
+}
+
+// DecodeWavAttr is an optional argument to DecodeWav.
+type DecodeWavAttr func(optionalAttr)
+
+// DecodeWavDesiredChannels sets the optional desired_channels attribute to value.
+//
+// value: Number of sample channels wanted.
+// If not specified, defaults to -1
+func DecodeWavDesiredChannels(value int64) DecodeWavAttr {
+	return func(m optionalAttr) {
+		m["desired_channels"] = value
+	}
+}
+
+// DecodeWavDesiredSamples sets the optional desired_samples attribute to value.
+//
+// value: Length of audio requested.
+// If not specified, defaults to -1
+func DecodeWavDesiredSamples(value int64) DecodeWavAttr {
+	return func(m optionalAttr) {
+		m["desired_samples"] = value
+	}
+}
+
+// Decode a 16-bit PCM WAV file to a float tensor.
+//
+// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
+//
+// When desired_channels is set, if the input contains fewer channels than this
 // then the last channel will be duplicated to give the requested number, else if
 // the input has more channels than requested then the additional channels will be
 // ignored.
@@ -28178,162 +28337,3 @@ func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
-
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
-
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
-	return func(m optionalAttr) {
-		m["mode"] = value
-	}
-}
-
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
-	return func(m optionalAttr) {
-		m["round_mode"] = value
-	}
-}
-
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8, out[i] -= (range(T) + 1) / 2.0
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
-//
-// Now we can quantize the elements of our tensor:
-// ```c++
-// result = round(input * s)
-// ```
-//
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
-//
-// Arguments:
-//
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"T": T}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
-		Input: []tf.Input{
-			input, min_range, max_range,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Flushes the writer's unwritten events.
-//
-// Arguments:
-//	writer: A handle to the summary writer resource.
-//
-// Returns the created operation.
-func FlushSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FlushSummaryWriter",
-		Input: []tf.Input{
-			writer,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-- 
GitLab


From 93da81caf6a9b24a1e3299655a51a03f4754cff6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 13:12:00 -0800
Subject: [PATCH 024/966] Add a test case to validate number of buckets after
 quantile generation.

Add an option to generate quantiles instead of sampled boundaries.
This option will always return exactly the number of requested boundaries,
regardless of duplicates and fewer inputs than requested boundaries.

PiperOrigin-RevId: 182980861
---
 tensorflow/contrib/boosted_trees/BUILD        |   1 +
 .../boosted_trees/kernels/quantile_ops.cc     |  27 ++++-
 .../contrib/boosted_trees/ops/quantile_ops.cc |   1 +
 .../python/kernel_tests/quantile_ops_test.py  | 100 +++++++++++++++++-
 .../boosted_trees/python/ops/quantile_ops.py  |  18 +++-
 .../resources/quantile_stream_resource.h      |  16 ++-
 6 files changed, 153 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD
index 392ac7fa1c..6fdcd0f996 100644
--- a/tensorflow/contrib/boosted_trees/BUILD
+++ b/tensorflow/contrib/boosted_trees/BUILD
@@ -196,6 +196,7 @@ py_test(
     name = "quantile_ops_test",
     size = "small",
     srcs = ["python/kernel_tests/quantile_ops_test.py"],
+    shard_count = 3,
     srcs_version = "PY2AND3",
     deps = [
         ":quantile_ops_py",
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index 8600c8c53c..88f3006407 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -46,6 +46,7 @@ const char* const kHandleName = "handle";
 const char* const kNextStampTokenName = "next_stamp_token";
 const char* const kStampTokenName = "stamp_token";
 const char* const kAreBucketsReadyName = "are_buckets_ready";
+const char* const kGenerateQuantiles = "generate_quantiles";
 // Names for sparse arguments.
 const char* const kNumSparseFeaturesName = "num_sparse_features";
 const char* const kSparseBucketsName = "sparse_buckets";
@@ -182,6 +183,16 @@ std::vector<float> GenerateBoundaries(const QuantileStream& stream,
   return boundaries;
 }
 
+// Generates quantiles on a finalized QuantileStream.
+std::vector<float> GenerateQuantiles(const QuantileStream& stream,
+                                     int num_quantiles) {
+  // Do not de-dup boundaries. Exactly num_quantiles+1 boundary values
+  // will be returned.
+  std::vector<float> boundaries = stream.GenerateQuantiles(num_quantiles);
+  CHECK_EQ(boundaries.size(), num_quantiles + 1);
+  return boundaries;
+}
+
 // Copies quantiles to output list.
 void CopyBoundaries(OpKernelContext* const context,
                     const std::vector<float>& boundaries, const int64 index,
@@ -224,6 +235,8 @@ class CreateQuantileAccumulatorOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->GetAttr(kNumQuantilesName, &num_quantiles_));
     OP_REQUIRES_OK(context, context->GetAttr(kMaxElementsName, &max_elements_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr(kGenerateQuantiles, &generate_quantiles_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -231,9 +244,9 @@ class CreateQuantileAccumulatorOp : public OpKernel {
     // other exceptions. If one already exists, it unrefs the new one.
     const Tensor* stamp_token_t;
     OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
-    auto result =
-        new QuantileStreamResource(epsilon_, num_quantiles_, max_elements_,
-                                   stamp_token_t->scalar<int64>()());
+    auto result = new QuantileStreamResource(epsilon_, num_quantiles_,
+                                             max_elements_, generate_quantiles_,
+                                             stamp_token_t->scalar<int64>()());
     auto status = CreateResource(context, HandleFromInput(context, 0), result);
     if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) {
       OP_REQUIRES(context, false, status);
@@ -246,6 +259,7 @@ class CreateQuantileAccumulatorOp : public OpKernel {
   // An upperbound on the number of enteries that the summaries might have
   // for a feature.
   int64 max_elements_;
+  bool generate_quantiles_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("CreateQuantileAccumulator").Device(DEVICE_CPU),
@@ -597,10 +611,15 @@ class QuantileAccumulatorFlushOp : public OpKernel {
         << "Passed stamp token: " << stamp_token << " "
         << "Current token: " << streams_resource->stamp();
     QuantileStream* stream = streams_resource->stream(stamp_token);
+    bool generate_quantiles = streams_resource->generate_quantiles();
     stream->Finalize();
+
     streams_resource->set_boundaries(
         stamp_token,
-        GenerateBoundaries(*stream, streams_resource->num_quantiles()));
+        generate_quantiles
+            ? GenerateQuantiles(*stream, streams_resource->num_quantiles())
+            : GenerateBoundaries(*stream, streams_resource->num_quantiles()));
+
     streams_resource->Reset(next_stamp_token);
   }
 };
diff --git a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
index 1fa70bafdd..bb57dcf8ae 100644
--- a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
@@ -39,6 +39,7 @@ REGISTER_OP("CreateQuantileAccumulator")
     .Attr("max_elements: int = 1099511627776")  // 1 << 40
     .Attr("epsilon: float")
     .Attr("num_quantiles: int")
+    .Attr("generate_quantiles: bool=False")
     .Input("quantile_accumulator_handle: resource")
     .Input("stamp_token: int64")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
index 888d5c57ed..eefa7ef0dc 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
@@ -106,9 +106,11 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     |   6   |  16   |  [16, 17, 18, 19, 20, 21]
     """
 
+    num_quantiles = 3
     with self.test_session() as sess:
       accumulator = quantile_ops.QuantileAccumulator(
-          init_stamp_token=0, num_quantiles=3, epsilon=0.001, name="q1")
+          init_stamp_token=0, num_quantiles=num_quantiles,
+          epsilon=0.001, name="q1")
       resources.initialize_resources(resources.shared_resources()).run()
     input_column = array_ops.placeholder(dtypes.float32)
     weights = array_ops.placeholder(dtypes.float32)
@@ -131,8 +133,104 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
       buckets, are_ready_flush = (sess.run(
           [buckets, are_ready_flush]))
       self.assertEqual(True, are_ready_flush)
+      self.assertEqual(num_quantiles + 1, len(buckets))
       self.assertAllEqual([1, 86., 170., 253.], buckets)
 
+  def testStreamingQuantileBucketsLowPrecisionInput(self):
+    """Tests inputs that simulate low precision float16 values."""
+
+    num_quantiles = 3
+    # set generate_quantiles to True since the test will generate fewer
+    # boundaries otherwise.
+    with self.test_session() as sess:
+      accumulator = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=num_quantiles,
+          epsilon=0.001, name="q1", generate_quantiles=True)
+      resources.initialize_resources(resources.shared_resources()).run()
+    input_column = array_ops.placeholder(dtypes.float32)
+    weights = array_ops.placeholder(dtypes.float32)
+    update = accumulator.add_summary(
+        stamp_token=0,
+        column=input_column,
+        example_weights=weights)
+
+    with self.test_session() as sess:
+      # This input is generated by integer in the range [2030, 2060]
+      # but represented by with float16 precision. Integers <= 2048 are
+      # exactly represented, whereas  numbers > 2048 are rounded; and hence
+      # numbers > 2048 are repeated. For precision loss / rounding, see:
+      # https://en.wikipedia.org/wiki/Half-precision_floating-point_format.
+      #
+      # The intent of the test is not handling of float16 values, but to
+      # validate the number of buckets is returned, in cases where  the input
+      # may contain repeated values.
+      inputs = [
+          2030.0, 2031.0, 2032.0, 2033.0, 2034.0, 2035.0, 2036.0, 2037.0,
+          2038.0, 2039.0, 2040.0, 2041.0, 2042.0, 2043.0, 2044.0, 2045.0,
+          2046.0, 2047.0, 2048.0, 2048.0, 2050.0, 2052.0, 2052.0, 2052.0,
+          2054.0, 2056.0, 2056.0, 2056.0, 2058.0, 2060.0
+      ]
+      sess.run(update,
+               {input_column: inputs,
+                weights: [1] * len(inputs)})
+
+    with self.test_session() as sess:
+      sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
+      are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
+      buckets, are_ready_flush = (sess.run(
+          [buckets, are_ready_flush]))
+      self.assertEqual(True, are_ready_flush)
+      self.assertEqual(num_quantiles + 1, len(buckets))
+      self.assertAllEqual([2030, 2040, 2050, 2060], buckets)
+
+  def _testStreamingQuantileBucketsHelper(self, inputs):
+    """Helper to test quantile buckets on different inputs."""
+
+    # Use 3 quantiles, 4 boundaries for simplicity.
+    num_quantiles = 3
+    # set generate_quantiles to True since the test will generate fewer
+    # boundaries otherwise.
+    with self.test_session() as sess:
+      accumulator = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=num_quantiles,
+          epsilon=0.001, name="q1", generate_quantiles=True)
+      resources.initialize_resources(resources.shared_resources()).run()
+    input_column = array_ops.placeholder(dtypes.float32)
+    weights = array_ops.placeholder(dtypes.float32)
+    update = accumulator.add_summary(
+        stamp_token=0,
+        column=input_column,
+        example_weights=weights)
+
+    with self.test_session() as sess:
+      sess.run(update,
+               {input_column: inputs,
+                weights: [1] * len(inputs)})
+
+    with self.test_session() as sess:
+      sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
+      are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
+      buckets, are_ready_flush = (sess.run(
+          [buckets, are_ready_flush]))
+      self.assertEqual(True, are_ready_flush)
+      self.assertEqual(num_quantiles + 1, len(buckets))
+
+  def testStreamingQuantileBucketsRepeatedSingleValue(self):
+    inputs = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    self._testStreamingQuantileBucketsHelper(inputs)
+
+  def testStreamingQ2antileBucketsRepeatedTwoValues(self):
+    inputs = [1, 1, 1, 2, 2, 2, 2, 2, 1, 1]
+    self._testStreamingQuantileBucketsHelper(inputs)
+
+  def testStreamingQ2antileBucketsRepeatedTwoValuesUnbalanced(self):
+    inputs = [7, 7, 7, 2, 7, 7, 2, 2, 7, 7]
+    self._testStreamingQuantileBucketsHelper(inputs)
+
+  def testStreamingQuantileBucketsFewerInputstThanBuckets(self):
+    inputs = [5]
+    self._testStreamingQuantileBucketsHelper(inputs)
+
   def testStreamingQuantileBuckets(self):
     """Sets up the quantile summary op test as follows.
 
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 294e04002a..97d57e8b23 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -47,7 +47,8 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
                num_quantiles,
                max_elements=None,
                name=None,
-               container=None):
+               container=None,
+               generate_quantiles=False):
     """Creates a QuantileAccumulator object.
 
     Args:
@@ -57,8 +58,11 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
       max_elements: Maximum number of elements added to the accumulator.
       name: the name to save the accumulator under.
       container: An optional `string`. Defaults to `""`
+      generate_quantiles: Generate quantiles instead of approximate boundaries.
+        If true, exactly `num_quantiles` will be produced in the final summary.
     """
     self._epsilon = epsilon
+    self._generate_quantiles = generate_quantiles
 
     name = _PATTERN.sub("", name)
     with ops.name_scope(name, "QuantileAccumulator") as name:
@@ -70,7 +74,8 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
           init_stamp_token,
           epsilon=epsilon,
           max_elements=max_elements,
-          num_quantiles=num_quantiles)
+          num_quantiles=num_quantiles,
+          generate_quantiles=generate_quantiles)
       is_initialized_op = gen_quantile_ops.quantile_accumulator_is_initialized(
           self._quantile_accumulator_handle)
     resources.register_resource(self._quantile_accumulator_handle,
@@ -176,7 +181,14 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
         summaries=summary)
 
   def flush(self, stamp_token, next_stamp_token):
-    """Finalizes quantile summary stream and resets it for next iteration."""
+    """Finalizes quantile summary stream and resets it for next iteration.
+
+    Args:
+      stamp_token: Exepcted current token.
+      next_stamp_token: Next value for the token.
+    Returns:
+      A list of quantiles or approximate boundaries.
+    """
     return gen_quantile_ops.quantile_accumulator_flush(
         quantile_accumulator_handle=self._quantile_accumulator_handle,
         stamp_token=stamp_token,
diff --git a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
index fb29f79e57..c1fa35eaf1 100644
--- a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
@@ -32,12 +32,14 @@ using QuantileStream =
 class QuantileStreamResource : public StampedResource {
  public:
   QuantileStreamResource(const float epsilon, const int32 num_quantiles,
-                         const int64 max_elements, int64 stamp_token)
+                         const int64 max_elements, bool generate_quantiles,
+                         int64 stamp_token)
       : stream_(epsilon, max_elements),
         are_buckets_ready_(false),
         epsilon_(epsilon),
         num_quantiles_(num_quantiles),
-        max_elements_(max_elements) {
+        max_elements_(max_elements),
+        generate_quantiles_(generate_quantiles) {
     set_stamp(stamp_token);
   }
 
@@ -74,6 +76,11 @@ class QuantileStreamResource : public StampedResource {
     are_buckets_ready_ = are_buckets_ready;
   }
 
+  bool generate_quantiles() const { return generate_quantiles_; }
+  void set_generate_quantiles(bool generate_quantiles) {
+    generate_quantiles_ = generate_quantiles;
+  }
+
  private:
   ~QuantileStreamResource() override {}
 
@@ -95,6 +102,11 @@ class QuantileStreamResource : public StampedResource {
   const int32 num_quantiles_;
   // An upper-bound for the number of elements.
   int64 max_elements_;
+
+  // Generate quantiles instead of approximate boundaries.
+  // If true, exactly `num_quantiles` will be produced in the final summary.
+  bool generate_quantiles_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(QuantileStreamResource);
 };
 
-- 
GitLab


From 8aa3253dbf881538d67e315e5ae6f1458d125416 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Tue, 23 Jan 2018 13:35:08 -0800
Subject: [PATCH 025/966] Add support for conv and matmul operators with
 variable filter.

PiperOrigin-RevId: 182984061
---
 .../contrib/lite/testing/generate_examples.py | 173 ++++++++++++------
 tensorflow/contrib/lite/toco/BUILD            |   1 +
 .../convert_reorder_axes.cc                   | 149 +++++++++++++++
 .../graph_transformations.h                   |   1 +
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   2 +-
 tensorflow/contrib/lite/toco/tooling_util.cc  |   4 +-
 tensorflow/contrib/lite/toco/tooling_util.h   |   5 +
 7 files changed, 277 insertions(+), 58 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/convert_reorder_axes.cc

diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 9713532326..56e4dfc7a2 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -853,34 +853,55 @@ def make_fused_batch_norm_tests(zip_path):
 def make_conv_tests(zip_path):
   """Make a set of tests to do convolution."""
 
-  test_parameters = [{
-      "input_shape": [[1, 3, 4, 3]],
-      "filter_shape": [[1, 1, 3, 2]],
-      "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
-      "padding": ["SAME", "VALID"],
-      "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
-  }, {
-      "input_shape": [[2, 14, 14, 2]],
-      "filter_shape": [[6, 6, 2, 2]],
-      "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
-      "padding": ["SAME", "VALID"],
-      "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
-  }]
+  test_parameters = [
+      {
+          "input_shape": [[1, 3, 4, 3]],
+          "filter_shape": [[1, 1, 3, 2]],
+          "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+          "padding": ["SAME", "VALID"],
+          "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
+          "constant_filter": [True, False],
+      },
+      {
+          "input_shape": [[2, 14, 14, 2]],
+          "filter_shape": [[6, 6, 2, 2]],
+          "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+          "padding": ["SAME", "VALID"],
+          "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
+          "constant_filter": [True, False],
+      }
+  ]
 
   def build_graph(parameters):
+    """Build a conv graph given `parameters`."""
     input_tensor = tf.placeholder(
         dtype=tf.float32, name="input", shape=parameters["input_shape"])
-    filter_values = create_tensor_data(np.float32, parameters["filter_shape"])
-    out = tf.nn.conv2d(input_tensor, filter_values,
-                       strides=parameters["strides"],
-                       padding=parameters["padding"],
-                       data_format=parameters["data_format"])
-    return [input_tensor], [out]
+
+    # Get filter input either as a placeholder or constants. Also get a list of
+    # the input tensors that are represented as placeholders.
+    if parameters["constant_filter"]:
+      filter_input = create_tensor_data(np.float32, parameters["filter_shape"])
+      input_tensors = [input_tensor]
+    else:
+      filter_input = tf.placeholder(
+          dtype=tf.float32, name="filter", shape=parameters["filter_shape"])
+      input_tensors = [input_tensor, filter_input]
+
+    out = tf.nn.conv2d(
+        input_tensor,
+        filter_input,
+        strides=parameters["strides"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(np.float32, parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    # Build list of input values either containing 1 tensor (input) or 2 tensors
+    # (input, filter) based on whether filter is constant or variable input.
+    values = [create_tensor_data(np.float32, parameters["input_shape"])]
+    if not parameters["constant_filter"]:
+      values.append(create_tensor_data(np.float32, parameters["filter_shape"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -889,45 +910,70 @@ def make_depthwiseconv_tests(zip_path):
   """Make a set of tests to do convolution."""
 
   # Tensorflow only supports equal strides
-  test_parameters = [{
-      "input_shape": [[1, 3, 4, 3], [1, 10, 10, 3]],
-      "filter_size": [[1, 1], [1, 2], [3, 3]],
-      "strides": [[1, 1, 1, 1], [1, 3, 3, 1]],
-      "channel_multiplier": [1, 2],
-      "rate": [[1, 1]],
-      "padding": ["SAME", "VALID"],
-      "data_format": ["NHWC"],
-  }, {
-      "input_shape": [[1, 3, 4, 3]],
-      "filter_size": [[1, 1]],
-      "strides": [[1, 1, 2, 1]],  # TF needs [1, x, x, 1]
-      "channel_multiplier": [2],
-      "rate": [[2, 2]],   #  Only [1, 1] is supported
-      "padding": ["SAME"],
-      "data_format": ["NHWC"],
-  }]
+  test_parameters = [
+      {
+          "input_shape": [[1, 3, 4, 3], [1, 10, 10, 3]],
+          "filter_size": [[1, 1], [1, 2], [3, 3]],
+          "strides": [[1, 1, 1, 1], [1, 3, 3, 1]],
+          "channel_multiplier": [1, 2],
+          "rate": [[1, 1]],
+          "padding": ["SAME", "VALID"],
+          "data_format": ["NHWC"],
+          "constant_filter": [True, False],
+      },
+      {
+          "input_shape": [[1, 3, 4, 3]],
+          "filter_size": [[1, 1]],
+          "strides": [[1, 1, 2, 1]],  # TF needs [1, x, x, 1]
+          "channel_multiplier": [2],
+          "rate": [[2, 2]],  #  Only [1, 1] is supported
+          "padding": ["SAME"],
+          "data_format": ["NHWC"],
+          "constant_filter": [True, False],
+      }
+  ]
 
-  def build_graph(parameters):
-    """Build a depthwise conv graph given `parameters`."""
+  def get_tensor_shapes(parameters):
     input_shape = parameters["input_shape"]
     filter_size = parameters["filter_size"]
+    filter_shape = filter_size + [
+        input_shape[3], parameters["channel_multiplier"]
+    ]
+    return [input_shape, filter_shape]
+
+  def build_graph(parameters):
+    """Build a depthwise conv graph given `parameters`."""
+    input_shape, filter_shape = get_tensor_shapes(parameters)
     input_tensor = tf.placeholder(
         dtype=tf.float32, name="input", shape=input_shape)
-    filter_shape = filter_size + [
-        input_shape[3], parameters["channel_multiplier"]]
-    filter_values = create_tensor_data(np.float32, filter_shape)
+
+    # Get filter input either as a placeholder or constants. Also get a list of
+    # the input tensors that are represented as placeholders.
+    if parameters["constant_filter"]:
+      filter_input = create_tensor_data(np.float32, filter_shape)
+      input_tensors = [input_tensor]
+    else:
+      filter_input = tf.placeholder(
+          dtype=tf.float32, name="filter", shape=filter_shape)
+      input_tensors = [input_tensor, filter_input]
+
     out = tf.nn.depthwise_conv2d(
-        input_tensor, filter_values,
+        input_tensor,
+        filter_input,
         strides=parameters["strides"],
         rate=parameters["rate"],
         padding=parameters["padding"],
         data_format=parameters["data_format"])
-    return [input_tensor], [out]
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(np.float32, parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    # Build list of input values either containing 1 tensor (input) or 2 tensors
+    # (input, filter) based on whether filter is constant or variable input.
+    input_shape, filter_shape = get_tensor_shapes(parameters)
+    values = [create_tensor_data(np.float32, input_shape)]
+    if not parameters["constant_filter"]:
+      values.append(create_tensor_data(np.float32, filter_shape))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -978,32 +1024,49 @@ def make_fully_connected_tests(zip_path):
       "shape2": [[3, 3]],
       "transpose_a": [True, False],
       "transpose_b": [True, False],
+      "constant_filter": [True, False],
   }, {
       "shape1": [[4, 4], [1, 4], [4]],
       "shape2": [[4, 4], [4, 1], [4]],
       "transpose_a": [False],
       "transpose_b": [False],
+      "constant_filter": [True, False],
   }, {
       "shape1": [[40, 37]],
       "shape2": [[37, 40]],
       "transpose_a": [False],
       "transpose_b": [False],
-
+      "constant_filter": [True, False],
   }]
 
   def build_graph(parameters):
+    """Build a matmul graph given `parameters`."""
     input_tensor1 = tf.placeholder(dtype=tf.float32, name="input1",
                                    shape=parameters["shape1"])
-    input_tensor2 = create_tensor_data(np.float32, parameters["shape2"])
+
+    # Get input_tensor2 either as a placeholder or constants. Also get a list of
+    # the input tensors that are represented as placeholders.
+    if parameters["constant_filter"]:
+      input_tensor2 = create_tensor_data(np.float32, parameters["shape2"])
+      input_tensors = [input_tensor1]
+    else:
+      input_tensor2 = tf.placeholder(
+          dtype=tf.float32, name="input2", shape=parameters["shape2"])
+      input_tensors = [input_tensor1, input_tensor2]
+
     out = tf.matmul(input_tensor1, input_tensor2,
                     transpose_a=parameters["transpose_a"],
                     transpose_b=parameters["transpose_b"])
-    return [input_tensor1], [out]
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values1 = create_tensor_data(np.float32, shape=parameters["shape1"])
-    return [input_values1], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values1])))
+    # Build list of input values either containing 1 tensor (input_values1) or 2
+    # tensors (input_values1, input_values2) based on whether the second input
+    # is a constant or variable input.
+    values = [create_tensor_data(np.float32, shape=parameters["shape1"])]
+    if not parameters["constant_filter"]:
+      values.append(create_tensor_data(np.float32, parameters["shape2"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 967e304742..ad8f0e4a47 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -171,6 +171,7 @@ cc_library(
     srcs = [
         "graph_transformations/convert_expanddims_to_reshape.cc",
         "graph_transformations/convert_pure_conv_to_depthwise.cc",
+        "graph_transformations/convert_reorder_axes.cc",
         "graph_transformations/convert_trivial_transpose_to_reshape.cc",
         "graph_transformations/create_im2col_arrays.cc",
         "graph_transformations/dequantize.cc",
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_reorder_axes.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_reorder_axes.cc
new file mode 100644
index 0000000000..0d274fc687
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_reorder_axes.cc
@@ -0,0 +1,149 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Creates a Reshape operator from ReorderAxes operator.
+TensorFlowReshapeOperator* CreateReshapeFromReorderAxes(
+    Model* model, ReorderAxesOperator* reorder_op, const Shape& input_shape) {
+  auto* reshape_op = new TensorFlowReshapeOperator;
+
+  // Copy inputs and outputs to Reshape.
+  reshape_op->inputs.push_back(reorder_op->inputs[0]);
+  reshape_op->outputs = reorder_op->outputs;
+
+  // Create reshape dimensions based on input shape. Conversion from
+  // ReorderAxes to Reshape requires a 4D input shape.
+  CHECK_EQ(input_shape.dimensions_count(), 4);
+  std::vector<int> reshape_dims = {1, input_shape.dims(0), input_shape.dims(1),
+                                   input_shape.dims(3) * input_shape.dims(2)};
+
+  // Create a new input array for Reshape.
+  string reshape_array_name =
+      AvailableArrayName(*model, reshape_op->outputs[0]);
+  reshape_op->inputs.push_back(reshape_array_name);
+
+  Array& reshape_array = model->GetOrCreateArray(reshape_array_name);
+  *(reshape_array.mutable_shape()->mutable_dims()) = {
+      1, static_cast<int>(reshape_dims.size())};
+  reshape_array.data_type = ArrayDataType::kInt32;
+  auto& reshape_buffer =
+      reshape_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  reshape_buffer.data = reshape_dims;
+
+  return reshape_op;
+}
+
+// Creates a Transpose operator from ReorderAxes operator.
+TransposeOperator* CreateTransposeFromReorderAxes(
+    Model* model, ReorderAxesOperator* reorder_op, const Shape& input_shape,
+    const AxesOrder& input_axes_order, const AxesOrder& output_axes_order) {
+  auto* transpose_op = new TransposeOperator;
+
+  // Copy inputs and outputs to Transpose.
+  transpose_op->inputs.push_back(reorder_op->inputs[0]);
+  transpose_op->outputs = reorder_op->outputs;
+
+  // Create permutations data based on input and output axes order.
+  std::vector<int> permutations_data;
+  GetShuffleShape(input_axes_order, output_axes_order, &permutations_data);
+
+  // Create a new input permutations array for Transpose.
+  string perm_array_name = AvailableArrayName(*model, transpose_op->outputs[0]);
+  transpose_op->inputs.push_back(perm_array_name);
+
+  Array& perm_array = model->GetOrCreateArray(perm_array_name);
+  *(perm_array.mutable_shape()->mutable_dims()) = {
+      static_cast<int>(permutations_data.size())};
+  perm_array.data_type = ArrayDataType::kInt32;
+  auto& perm_buffer = perm_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  perm_buffer.data = permutations_data;
+
+  return transpose_op;
+}
+
+// Converts ReorderAxes into Transpose and Reshape which are compatible with the
+// TFLite interpreter.
+bool ConvertReorderAxes::Run(Model* model, std::size_t op_index) {
+  auto reorder_it = model->operators.begin() + op_index;
+  if (reorder_it->get()->type != OperatorType::kReorderAxes) return false;
+
+  auto* reorder_op = static_cast<ReorderAxesOperator*>(reorder_it->get());
+  CHECK_EQ(reorder_op->inputs.size(), 1);
+  CHECK_EQ(reorder_op->outputs.size(), 1);
+
+  const auto& input_array_name = reorder_op->inputs[0];
+  const auto& output_array_name = reorder_op->outputs[0];
+  auto& input_array = model->GetArray(input_array_name);
+  auto& output_array = model->GetArray(output_array_name);
+
+  // Get input array. If kFakeQuant is the input into ReorderAxes, get the input
+  // array passed into kFakeQuant. kFakeQuant op is dropped when possible.
+  string constant_input_array_name = input_array_name;
+  if (!input_array.buffer) {
+    const auto* op_producing_input = GetOpWithOutput(*model, input_array_name);
+    if (op_producing_input &&
+        op_producing_input->type == OperatorType::kFakeQuant) {
+      constant_input_array_name = op_producing_input->inputs[0];
+    }
+  }
+
+  // Yield if input array contains constants or if output array size has not
+  // been adjusted to reflect the permutations in ReorderAxes. ReorderAxes will
+  // be merged into a constant array when possible.
+  if (IsConstantParameterArray(*model, constant_input_array_name)) return false;
+  if (!output_array.has_shape()) return false;
+
+  const auto input_axes_order = reorder_op->input_axes_order;
+  const auto output_axes_order = reorder_op->output_axes_order;
+  const Shape input_shape = input_array.shape();
+
+  // Creates a Reshape or Transpose operator depending on the conversion.
+  if (input_axes_order == AxesOrder::kHWIM &&
+      output_axes_order == AxesOrder::k1HWO) {
+    // Add Reshape operator into the graph. This special case is not just a
+    // permutation. The input dimensions get merged into 3 dimensions while the
+    // order of the elements does not change.
+    auto* reshape_op =
+        CreateReshapeFromReorderAxes(model, reorder_op, input_shape);
+    const auto reshape_it = model->operators.emplace(reorder_it, reshape_op);
+    reorder_it = reshape_it + 1;
+  } else {
+    // Add Transpose operator into the graph.
+    auto* transpose_op = CreateTransposeFromReorderAxes(
+        model, reorder_op, input_shape, input_axes_order, output_axes_order);
+    const auto transpose_it =
+        model->operators.emplace(reorder_it, transpose_op);
+    reorder_it = transpose_it + 1;
+  }
+
+  // Remove ReorderAxes operator from the graph.
+  CHECK_EQ(reorder_it->get(), reorder_op);
+  model->operators.erase(reorder_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 9ec9f92c90..d6950cf987 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -115,6 +115,7 @@ void RunGraphTransformations(Model* model, const string& message,
 DECLARE_GRAPH_TRANSFORMATION(ConvertExpandDimsToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertPureConvToDepthwise)
 DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialTransposeToReshape)
+DECLARE_GRAPH_TRANSFORMATION(ConvertReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(EnsureBiasVectors)
 DECLARE_GRAPH_TRANSFORMATION(FuseActivationFunctions)
 DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoFollowingAffine)
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index afaa0fd0c7..f2753c84e9 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -53,6 +53,7 @@ void MakeGeneralGraphTransformationsSet(
   CHECK(transformations->empty());
   transformations->Add(new ConvertExpandDimsToReshape);
   transformations->Add(new ConvertTrivialTransposeToReshape);
+  transformations->Add(new ConvertReorderAxes);
   transformations->Add(new ResolveReshapeAttributes);
   transformations->Add(new PropagateArrayDataTypes);
   transformations->Add(new PropagateFixedSizes);
@@ -96,7 +97,6 @@ void MakeGeneralGraphTransformationsSet(
 
 bool SupportsQuantization(FileFormat format) {
   return (format == GRAPHVIZ_DOT || format == TFLITE);
-  ;
 }
 
 bool SupportsFusedActivationFunction(FileFormat format) {
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 900c60bd90..8543ba4742 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1463,8 +1463,6 @@ bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
   return true;
 }
 
-namespace {
-
 void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
                      std::vector<int>* shuffle) {
   CHECK_EQ(AxesCount(input_axes_order), AxesCount(output_axes_order));
@@ -1499,6 +1497,8 @@ void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
   }
 }
 
+namespace {
+
 // Extend shuffle is designed to match ExtendShape, which pads the shape with
 // unit dimensions at the beginning.
 void ExtendShuffle(const std::vector<int>& input_shuffle, int newdim,
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index c81e77874e..72fb0fd1a7 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -274,6 +274,11 @@ bool EstimateArithmeticOpsCount(const Model& model, int64* result);
 
 int AxesCount(AxesOrder axes_order);
 
+// Returns the permutation of the dimensions based on the input axes order and
+// output axes order.
+void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
+                     std::vector<int>* shuffle);
+
 void ShuffleDims(const Shape& input_shape, AxesOrder input_axes_order,
                  AxesOrder output_axes_order, Shape* output_shape);
 void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order,
-- 
GitLab


From fa35e11bca893dde9177b6b9f8a335453ac97eee Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Tue, 23 Jan 2018 13:51:44 -0800
Subject: [PATCH 026/966] Fix StridedSlice. (1) Use input index instead of
 input name to make sure the naming is unique, because in rare cases, two
 inputs may connect to the same node. (2) Consecutive if statements each
 should have a return and should not fall through; this created un-wanted
 behavior. Use switch and break now.

PiperOrigin-RevId: 182986609
---
 .../grappler/optimizers/layout_optimizer.cc   | 34 ++++++++--
 .../optimizers/layout_optimizer_test.cc       | 49 +++++++------
 .../python/grappler/layout_optimizer_test.py  | 68 ++++++++++++++-----
 3 files changed, 101 insertions(+), 50 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index ea7b05d381..50e6ba4a64 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -590,7 +590,7 @@ class NodeProcessor : public GraphProcessor {
     // to ensure added_node is in the same frame with node_.
     NodeDef* added_node = graph_->add_node();
     *added_node = *input_node;
-    string base_name = strings::StrCat(node_->name(), "-", input_node->name());
+    string base_name = strings::StrCat(node_->name(), "-", input_index);
     string node_name = LayoutOptimizerNode(base_name);
     added_node->set_name(node_name);
     *node_->mutable_input(input_index) = node_name;
@@ -1647,12 +1647,32 @@ class StridedSliceProcessor : public SliceProcessor {
       return errors::InvalidArgument("invalid mask value: ", i);
     }
     if (i == 0 || i == 1 || i == 14 || i == 15) return Status::OK();
-    if (i == 2 || i == 3) i += 2;
-    if (i == 4 || i == 5) i += 4;
-    if (i == 6 || i == 7) i += 6;
-    if (i == 8 || i == 9) i -= 6;
-    if (i == 10 || i == 11) i -= 4;
-    if (i == 12 || i == 13) i -= 2;
+    switch (i) {
+      case 2:
+      case 3:
+        i += 2;
+        break;
+      case 4:
+      case 5:
+        i += 4;
+        break;
+      case 6:
+      case 7:
+        i += 6;
+        break;
+      case 8:
+      case 9:
+        i -= 6;
+        break;
+      case 10:
+      case 11:
+        i -= 4;
+        break;
+      case 12:
+      case 13:
+        i -= 2;
+        break;
+    }
     node_->mutable_attr()->at(mask).set_i(i);
     return Status::OK();
   }
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 587642c96e..5cb366df2d 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -172,8 +172,7 @@ TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
 
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
-  string input_name =
-      strings::StrCat("Conv2DBackpropInput-InputSizes", "-", "LayoutOptimizer");
+  string input_name = "Conv2DBackpropInput-0-LayoutOptimizer";
   auto input_sizes_node = node_map.GetNode(input_name);
   CHECK(input_sizes_node);
   auto conv2d_backprop_node = node_map.GetNode("Conv2DBackpropInput");
@@ -288,7 +287,7 @@ TEST_F(LayoutOptimizerTest, Pad) {
   auto pad = node_map.GetNode("p");
   EXPECT_EQ(pad->input(0), "Conv2D");
 
-  auto pad_const = node_map.GetNode("p-c-LayoutOptimizer");
+  auto pad_const = node_map.GetNode("p-1-LayoutOptimizer");
   EXPECT_TRUE(pad_const);
   EXPECT_TRUE(pad_const->attr().find("value") != pad_const->attr().end());
   Tensor tensor;
@@ -476,9 +475,9 @@ TEST_F(LayoutOptimizerTest, SplitDimC) {
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   auto split_node = node_map.GetNode("split");
-  EXPECT_EQ(split_node->input(0), "split-c-LayoutOptimizer");
+  EXPECT_EQ(split_node->input(0), "split-0-LayoutOptimizer");
   EXPECT_EQ(split_node->input(1), "Conv2D");
-  auto split_const = node_map.GetNode("split-c-LayoutOptimizer");
+  auto split_const = node_map.GetNode("split-0-LayoutOptimizer");
   EXPECT_EQ(split_const->op(), "Const");
   EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 1);
 }
@@ -496,9 +495,9 @@ TEST_F(LayoutOptimizerTest, SplitDimH) {
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   auto split_node = node_map.GetNode("split");
-  EXPECT_EQ(split_node->input(0), "split-c-LayoutOptimizer");
+  EXPECT_EQ(split_node->input(0), "split-0-LayoutOptimizer");
   EXPECT_EQ(split_node->input(1), "Conv2D");
-  auto split_const = node_map.GetNode("split-c-LayoutOptimizer");
+  auto split_const = node_map.GetNode("split-0-LayoutOptimizer");
   EXPECT_EQ(split_const->op(), "Const");
   EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 2);
 }
@@ -516,9 +515,9 @@ TEST_F(LayoutOptimizerTest, SplitDimW) {
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   auto split_node = node_map.GetNode("split");
-  EXPECT_EQ(split_node->input(0), "split-c-LayoutOptimizer");
+  EXPECT_EQ(split_node->input(0), "split-0-LayoutOptimizer");
   EXPECT_EQ(split_node->input(1), "Conv2D");
-  auto split_const = node_map.GetNode("split-c-LayoutOptimizer");
+  auto split_const = node_map.GetNode("split-0-LayoutOptimizer");
   EXPECT_EQ(split_const->op(), "Const");
   EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 3);
 }
@@ -536,9 +535,9 @@ TEST_F(LayoutOptimizerTest, SplitDimN) {
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   auto split_node = node_map.GetNode("split");
-  EXPECT_EQ(split_node->input(0), "split-c-LayoutOptimizer");
+  EXPECT_EQ(split_node->input(0), "split-0-LayoutOptimizer");
   EXPECT_EQ(split_node->input(1), "Conv2D");
-  auto split_const = node_map.GetNode("split-c-LayoutOptimizer");
+  auto split_const = node_map.GetNode("split-0-LayoutOptimizer");
   EXPECT_EQ(split_const->op(), "Const");
   EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 0);
 }
@@ -582,8 +581,8 @@ TEST_F(LayoutOptimizerTest, SplitSamePortToMultipleInputsOfSameNode) {
   EXPECT_EQ(concat_node->input(0), "split:1");
   EXPECT_EQ(concat_node->input(1), "split:1");
   EXPECT_EQ(concat_node->input(2), "split:1");
-  EXPECT_EQ(concat_node->input(3), "concat-axis-LayoutOptimizer");
-  auto concat_dim = node_map.GetNode("concat-axis-LayoutOptimizer");
+  EXPECT_EQ(concat_node->input(3), "concat-3-LayoutOptimizer");
+  auto concat_dim = node_map.GetNode("concat-3-LayoutOptimizer");
   EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 1);
 }
 
@@ -603,8 +602,8 @@ TEST_F(LayoutOptimizerTest, ConcatDimH) {
   auto concat_node = node_map.GetNode("concat");
   EXPECT_EQ(concat_node->input(0), "split");
   EXPECT_EQ(concat_node->input(1), "split:1");
-  EXPECT_EQ(concat_node->input(2), "concat-axis-LayoutOptimizer");
-  auto concat_dim = node_map.GetNode("concat-axis-LayoutOptimizer");
+  EXPECT_EQ(concat_node->input(2), "concat-2-LayoutOptimizer");
+  auto concat_dim = node_map.GetNode("concat-2-LayoutOptimizer");
   EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 2);
 }
 
@@ -648,8 +647,8 @@ TEST_F(LayoutOptimizerTest, ConcatDimW) {
   auto concat_node = node_map.GetNode("concat");
   EXPECT_EQ(concat_node->input(0), "split");
   EXPECT_EQ(concat_node->input(1), "split:1");
-  EXPECT_EQ(concat_node->input(2), "concat-axis-LayoutOptimizer");
-  auto concat_dim = node_map.GetNode("concat-axis-LayoutOptimizer");
+  EXPECT_EQ(concat_node->input(2), "concat-2-LayoutOptimizer");
+  auto concat_dim = node_map.GetNode("concat-2-LayoutOptimizer");
   EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 3);
 }
 
@@ -669,8 +668,8 @@ TEST_F(LayoutOptimizerTest, ConcatDimN) {
   auto concat_node = node_map.GetNode("concat");
   EXPECT_EQ(concat_node->input(0), "split");
   EXPECT_EQ(concat_node->input(1), "split:1");
-  EXPECT_EQ(concat_node->input(2), "concat-axis-LayoutOptimizer");
-  auto concat_dim = node_map.GetNode("concat-axis-LayoutOptimizer");
+  EXPECT_EQ(concat_node->input(2), "concat-2-LayoutOptimizer");
+  auto concat_dim = node_map.GetNode("concat-2-LayoutOptimizer");
   EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 0);
 }
 
@@ -690,8 +689,8 @@ TEST_F(LayoutOptimizerTest, ConcatDimC) {
   auto concat_node = node_map.GetNode("concat");
   EXPECT_EQ(concat_node->input(0), "split");
   EXPECT_EQ(concat_node->input(1), "split:1");
-  EXPECT_EQ(concat_node->input(2), "concat-axis-LayoutOptimizer");
-  auto concat_dim = node_map.GetNode("concat-axis-LayoutOptimizer");
+  EXPECT_EQ(concat_node->input(2), "concat-2-LayoutOptimizer");
+  auto concat_dim = node_map.GetNode("concat-2-LayoutOptimizer");
   EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 1);
 }
 
@@ -861,10 +860,10 @@ TEST_F(LayoutOptimizerTest, SliceConst) {
   NodeMap node_map(&output);
   auto slice_node = node_map.GetNode("slice");
   EXPECT_EQ(slice_node->input(0), "Conv2D");
-  EXPECT_EQ(slice_node->input(1), "slice-begin-LayoutOptimizer");
-  EXPECT_EQ(slice_node->input(2), "slice-size-LayoutOptimizer");
+  EXPECT_EQ(slice_node->input(1), "slice-1-LayoutOptimizer");
+  EXPECT_EQ(slice_node->input(2), "slice-2-LayoutOptimizer");
 
-  auto begin_const = node_map.GetNode("slice-begin-LayoutOptimizer");
+  auto begin_const = node_map.GetNode("slice-1-LayoutOptimizer");
   Tensor begin_tensor;
   EXPECT_TRUE(begin_tensor.FromProto(
       begin_const->mutable_attr()->at({"value"}).tensor()));
@@ -872,7 +871,7 @@ TEST_F(LayoutOptimizerTest, SliceConst) {
   test::FillValues<int>(&begin_tensor_expected, {0, 1, 2, 3});
   test::ExpectTensorEqual<int>(begin_tensor_expected, begin_tensor);
 
-  auto size_const = node_map.GetNode("slice-size-LayoutOptimizer");
+  auto size_const = node_map.GetNode("slice-2-LayoutOptimizer");
   Tensor size_tensor;
   EXPECT_TRUE(size_tensor.FromProto(
       size_const->mutable_attr()->at({"value"}).tensor()));
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 25c5ef6b68..578f86ca5a 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -376,7 +376,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertEqual(expected_num_transposes, num_transposes)
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self._assert_trans_nchw_to_nhwc('Pad-0-0', nodes)
-      self.assertIn('Pad-PaddingsConst-LayoutOptimizer', nodes)
+      self.assertIn('Pad-1-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testReduceSum(self):
@@ -587,7 +587,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertEqual(expected_num_transposes, num_transposes)
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self._assert_trans_nchw_to_nhwc('concat-0-0', nodes)
-      self.assertIn('concat-Const_2-LayoutOptimizer', nodes)
+      self.assertIn('concat-2-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testFill(self):
@@ -698,7 +698,7 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertEqual(expected_num_transposes, num_transposes)
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self._assert_trans_nchw_to_nhwc('ReverseV2-0-0', nodes)
-      self.assertIn('ReverseV2-DimsConst-LayoutOptimizer', nodes)
+      self.assertIn('ReverseV2-1-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testReverseWithNonConstDims(self):
@@ -867,7 +867,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self._assert_trans_nchw_to_nhwc('MaxPoolV2-0-0', nodes)
       self._assert_vec_nhwc_to_nchw('MaxPoolV2-2', nodes)
-      self.assertIn('MaxPoolV2-Const_2-LayoutOptimizer', nodes)
+      self.assertIn('MaxPoolV2-1-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testMaxPoolGradV2(self):
@@ -904,7 +904,7 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self._assert_trans_nchw_to_nhwc('MaxPoolGradV2-0-0', nodes)
       self._assert_vec_nhwc_to_nchw('MaxPoolGradV2-4', nodes)
-      self.assertIn('MaxPoolGradV2-Const_2-LayoutOptimizer', nodes)
+      self.assertIn('MaxPoolGradV2-3-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testSliceWithNonConstAxis(self):
@@ -977,16 +977,17 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self._assert_trans_nchw_to_nhwc('StridedSlice-0-0', nodes)
       self._assert_vec_nhwc_to_nchw('StridedSlice-2', nodes)
-      self.assertIn('StridedSlice-StridedSlice/begin-LayoutOptimizer', nodes)
-      self.assertIn('StridedSlice-StridedSlice/strides-LayoutOptimizer', nodes)
+      self.assertIn('StridedSlice-1-LayoutOptimizer', nodes)
+      self.assertIn('StridedSlice-3-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
-  def testStridedSliceWithMask(self):
+  def testStridedSliceWithMask1011(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
       conv = _two_layer_model(x)
-      # This will generate a StridedSlice op with begin mask and end mask.
+      # This will generate a StridedSlice op with begin mask and
+      # end mask 11(1011).
       s = conv[:, :, 1:-1, :]
       output = array_ops.identity(s)
 
@@ -1010,11 +1011,44 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertEqual(expected_num_transposes, num_transposes)
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self._assert_trans_nchw_to_nhwc('strided_slice-0-0', nodes)
-      self.assertIn('strided_slice-strided_slice/stack-LayoutOptimizer', nodes)
-      self.assertIn('strided_slice-strided_slice/stack_1-LayoutOptimizer',
-                    nodes)
-      self.assertIn('strided_slice-strided_slice/stack_2-LayoutOptimizer',
-                    nodes)
+      self.assertIn('strided_slice-1-LayoutOptimizer', nodes)
+      self.assertIn('strided_slice-2-LayoutOptimizer', nodes)
+      self.assertIn('strided_slice-3-LayoutOptimizer', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testStridedSliceWithMask0111(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      # This will generate a StridedSlice op with begin mask and
+      # end mask 7(0111).
+      s = conv[:, :, :, 1:-1]
+      output = array_ops.identity(s)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('strided_slice-0-0', nodes)
+      self.assertIn('strided_slice-1-LayoutOptimizer', nodes)
+      self.assertIn('strided_slice-2-LayoutOptimizer', nodes)
+      self.assertIn('strided_slice-3-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testStridedSliceGradWithNonConstAxis(self):
@@ -1055,10 +1089,8 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
       self._assert_trans_nchw_to_nhwc('StridedSliceGrad-0-0', nodes)
       self._assert_vec_nhwc_to_nchw('StridedSliceGrad-2', nodes)
-      self.assertIn('StridedSlice-StridedSliceGrad/begin-LayoutOptimizer',
-                    nodes)
-      self.assertIn('StridedSlice-StridedSliceGrad/strides-LayoutOptimizer',
-                    nodes)
+      self.assertIn('StridedSlice-1-LayoutOptimizer', nodes)
+      self.assertIn('StridedSlice-2-LayoutOptimizer', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testShapeN(self):
-- 
GitLab


From d50f68df6d1e4e41f4486ee71626454f6bd3ffe4 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 23 Jan 2018 14:06:13 -0800
Subject: [PATCH 027/966] C API: don't return source/sink control edges.

PiperOrigin-RevId: 182989157
---
 tensorflow/c/c_api.cc      | 14 ++++++++++----
 tensorflow/c/c_api_test.cc | 31 ++++++++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 6fc75a98f1..92c9adbec7 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -1469,7 +1469,13 @@ int TF_OperationOutputConsumers(TF_Output oper_out, TF_Input* consumers,
 }
 
 int TF_OperationNumControlInputs(TF_Operation* oper) {
-  return oper->node.in_edges().size() - oper->node.num_inputs();
+  int count = 0;
+  for (const auto* edge : oper->node.in_edges()) {
+    if (edge->IsControlEdge() && !edge->src()->IsSource()) {
+      ++count;
+    }
+  }
+  return count;
 }
 
 int TF_OperationGetControlInputs(TF_Operation* oper,
@@ -1477,7 +1483,7 @@ int TF_OperationGetControlInputs(TF_Operation* oper,
                                  int max_control_inputs) {
   int count = 0;
   for (const auto* edge : oper->node.in_edges()) {
-    if (edge->IsControlEdge()) {
+    if (edge->IsControlEdge() && !edge->src()->IsSource()) {
       if (count < max_control_inputs) {
         control_inputs[count] = ToOperation(edge->src());
       }
@@ -1490,7 +1496,7 @@ int TF_OperationGetControlInputs(TF_Operation* oper,
 int TF_OperationNumControlOutputs(TF_Operation* oper) {
   int count = 0;
   for (const auto* edge : oper->node.out_edges()) {
-    if (edge->IsControlEdge()) {
+    if (edge->IsControlEdge() && !edge->dst()->IsSink()) {
       ++count;
     }
   }
@@ -1502,7 +1508,7 @@ int TF_OperationGetControlOutputs(TF_Operation* oper,
                                   int max_control_outputs) {
   int count = 0;
   for (const auto* edge : oper->node.out_edges()) {
-    if (edge->IsControlEdge()) {
+    if (edge->IsControlEdge() && !edge->dst()->IsSink()) {
       if (count < max_control_outputs) {
         control_outputs[count] = ToOperation(edge->dst());
       }
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index df697e16d3..01954eb235 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -575,7 +575,7 @@ TEST(CAPI, ImportGraphDef) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
 
-  // Create a graph with two nodes: x and 3
+  // Create a simple graph.
   Placeholder(graph, s);
   ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
   ASSERT_TRUE(TF_GraphOperationByName(graph, "feed") != nullptr);
@@ -586,7 +586,7 @@ TEST(CAPI, ImportGraphDef) {
   ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
   ASSERT_TRUE(TF_GraphOperationByName(graph, "neg") != nullptr);
 
-  // Export to a GraphDef
+  // Export to a GraphDef.
   TF_Buffer* graph_def = TF_NewBuffer();
   TF_GraphToGraphDef(graph, graph_def, s);
   ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
@@ -606,6 +606,31 @@ TEST(CAPI, ImportGraphDef) {
   ASSERT_TRUE(feed != nullptr);
   ASSERT_TRUE(neg != nullptr);
 
+  // Test basic structure of the imported graph.
+  EXPECT_EQ(0, TF_OperationNumInputs(scalar));
+  EXPECT_EQ(0, TF_OperationNumInputs(feed));
+  ASSERT_EQ(1, TF_OperationNumInputs(neg));
+  TF_Output neg_input = TF_OperationInput({neg, 0});
+  EXPECT_EQ(scalar, neg_input.oper);
+  EXPECT_EQ(0, neg_input.index);
+
+  // Test that we can't see control edges involving the source and sink nodes.
+  TF_Operation* control_ops[100];
+  EXPECT_EQ(0, TF_OperationNumControlInputs(scalar));
+  EXPECT_EQ(0, TF_OperationGetControlInputs(scalar, control_ops, 100));
+  EXPECT_EQ(0, TF_OperationNumControlOutputs(scalar));
+  EXPECT_EQ(0, TF_OperationGetControlOutputs(scalar, control_ops, 100));
+
+  EXPECT_EQ(0, TF_OperationNumControlInputs(feed));
+  EXPECT_EQ(0, TF_OperationGetControlInputs(feed, control_ops, 100));
+  EXPECT_EQ(0, TF_OperationNumControlOutputs(feed));
+  EXPECT_EQ(0, TF_OperationGetControlOutputs(feed, control_ops, 100));
+
+  EXPECT_EQ(0, TF_OperationNumControlInputs(neg));
+  EXPECT_EQ(0, TF_OperationGetControlInputs(neg, control_ops, 100));
+  EXPECT_EQ(0, TF_OperationNumControlOutputs(neg));
+  EXPECT_EQ(0, TF_OperationGetControlOutputs(neg, control_ops, 100));
+
   // Import it again, with an input mapping, return outputs, and a return
   // operation, into the same graph.
   TF_DeleteImportGraphDefOptions(opts);
@@ -629,7 +654,7 @@ TEST(CAPI, ImportGraphDef) {
   ASSERT_TRUE(neg2 != nullptr);
 
   // Check input mapping
-  TF_Output neg_input = TF_OperationInput({neg, 0});
+  neg_input = TF_OperationInput({neg, 0});
   EXPECT_EQ(scalar, neg_input.oper);
   EXPECT_EQ(0, neg_input.index);
 
-- 
GitLab


From 634959be3220bf9263b1fac33ce41a0869fdc010 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 23 Jan 2018 14:24:53 -0800
Subject: [PATCH 028/966] Updating the update_version script to act
 appropriately for rc0 when it comes to the install sources tested
 configurations table along with a warning. Using perl rather than sed as it
 supports regex look behind.

PiperOrigin-RevId: 182992467
---
 tensorflow/tools/ci_build/update_version.py | 110 +++++++++++---------
 1 file changed, 61 insertions(+), 49 deletions(-)

diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py
index d2a63e5d66..347d0769a9 100755
--- a/tensorflow/tools/ci_build/update_version.py
+++ b/tensorflow/tools/ci_build/update_version.py
@@ -25,19 +25,19 @@
 # pylint: disable=superfluous-parens
 
 import argparse
-import fileinput
 import os
 import re
 import subprocess
 import time
 
-# File parameters
+# File parameters.
 TF_SRC_DIR = "tensorflow"
 VERSION_H = "%s/core/public/version.h" % TF_SRC_DIR
 SETUP_PY = "%s/tools/pip_package/setup.py" % TF_SRC_DIR
 README_MD = "./README.md"
 DEVEL_DOCKERFILE = "%s/tools/docker/Dockerfile.devel" % TF_SRC_DIR
 GPU_DEVEL_DOCKERFILE = "%s/tools/docker/Dockerfile.devel-gpu" % TF_SRC_DIR
+CPU_MKL_DEVEL_DOCKERFILE = "%s/tools/docker/Dockerfile.devel-cpu-mkl" % TF_SRC_DIR
 RELEVANT_FILES = [TF_SRC_DIR,
                   VERSION_H,
                   SETUP_PY,
@@ -45,17 +45,11 @@ RELEVANT_FILES = [TF_SRC_DIR,
                   DEVEL_DOCKERFILE,
                   GPU_DEVEL_DOCKERFILE]
 
-# Version type parameters
+# Version type parameters.
 NIGHTLY_VERSION = 1
 REGULAR_VERSION = 0
 
 
-def replace_line(old_line, new_line, filename):
-  """Replace a line in a file."""
-  for line in fileinput.input(filename, inplace=True):
-    print(line.rstrip().replace(old_line, new_line))
-
-
 def check_existence(filename):
   """Check the existence of file or dir."""
   if not os.path.exists(filename):
@@ -69,9 +63,12 @@ def check_all_files():
     check_existence(file_name)
 
 
-def replace_with_sed(query, filename):
+def replace_string_in_line(search, replace, filename):
   """Replace with sed when regex is required."""
-  subprocess.check_call(['sed', '-i', '-r', '-e', query, filename])
+  with open(filename, "r") as source:
+    content = source.read()
+  with open(filename, "w") as source:
+    source.write(re.sub(search, replace, content))
 
 
 class Version(object):
@@ -125,13 +122,13 @@ class Version(object):
     Raises:
       RuntimeError: If the version string is not valid.
     """
-    # Check validity of new version string
+    # Check validity of new version string.
     if not re.search(r"[0-9]+\.[0-9]+\.[a-zA-Z0-9]+", string):
       raise RuntimeError("Invalid version string: %s" % string)
 
     major, minor, extension = string.split(".", 2)
 
-    # Isolate patch and identifier string if identifier string exists
+    # Isolate patch and identifier string if identifier string exists.
     extension_split = extension.split("-", 1)
     patch = extension_split[0]
     if len(extension_split) == 2:
@@ -154,7 +151,7 @@ def get_current_semver_version():
     core/public/version.h
   """
 
-  # Get current version information
+  # Get current version information.
   version_file = open(VERSION_H, "r")
   for line in version_file:
     major_match = re.search("^#define TF_MAJOR_VERSION ([0-9]+)", line)
@@ -185,32 +182,33 @@ def get_current_semver_version():
 
 def update_version_h(old_version, new_version):
   """Update tensorflow/core/public/version.h."""
-  replace_line("#define TF_MAJOR_VERSION %s" % old_version.major,
-               "#define TF_MAJOR_VERSION %s" % new_version.major, VERSION_H)
-  replace_line("#define TF_MINOR_VERSION %s" % old_version.minor,
-               "#define TF_MINOR_VERSION %s" % new_version.minor, VERSION_H)
-  replace_line("#define TF_PATCH_VERSION %s" % old_version.patch,
-               "#define TF_PATCH_VERSION %s" % new_version.patch, VERSION_H)
-  replace_line("#define TF_VERSION_SUFFIX \"%s\"" %
-               old_version.identifier_string,
-               "#define TF_VERSION_SUFFIX \"%s\""
-               % new_version.identifier_string,
-               VERSION_H)
+  replace_string_in_line("#define TF_MAJOR_VERSION %s" % old_version.major,
+                         "#define TF_MAJOR_VERSION %s" % new_version.major,
+                         VERSION_H)
+  replace_string_in_line("#define TF_MINOR_VERSION %s" % old_version.minor,
+                         "#define TF_MINOR_VERSION %s" % new_version.minor,
+                         VERSION_H)
+  replace_string_in_line("#define TF_PATCH_VERSION %s" % old_version.patch,
+                         "#define TF_PATCH_VERSION %s" % new_version.patch,
+                         VERSION_H)
+  replace_string_in_line(
+      "#define TF_VERSION_SUFFIX \"%s\"" % old_version.identifier_string,
+      "#define TF_VERSION_SUFFIX \"%s\"" % new_version.identifier_string,
+      VERSION_H)
 
 
 def update_setup_dot_py(old_version, new_version):
   """Update setup.py."""
-  replace_line("_VERSION = '%s'" % old_version.string,
-               "_VERSION = '%s'" % new_version.string, SETUP_PY)
+  replace_string_in_line("_VERSION = '%s'" % old_version.string,
+                         "_VERSION = '%s'" % new_version.string, SETUP_PY)
 
 
 def update_readme(old_version, new_version):
   """Update README."""
   pep_440_str = new_version.pep_440_str
-  replace_with_sed(r"s/%s\.%s\.([[:alnum:]]+)-/%s-/g" % (old_version.major,
-                                                         old_version.minor,
-                                                         pep_440_str),
-                   README_MD)
+  replace_string_in_line(r"%s\.%s\.([[:alnum:]]+)-" % (old_version.major,
+                                                       old_version.minor),
+                         "%s-" % pep_440_str, README_MD)
 
 
 def update_md_files(old_version, new_version):
@@ -226,22 +224,29 @@ def update_md_files(old_version, new_version):
   for filename in ["linux", "mac", "windows", "sources"]:
     filepath = "%s/docs_src/install/install_%s.md" % (TF_SRC_DIR,
                                                       filename)
-    replace_with_sed("s/tensorflow-%s/tensorflow-%s/g"
-                     % (old_pep_version, new_pep_version), filepath)
-    replace_with_sed("s/tensorflow_gpu-%s/tensorflow_gpu-%s/g"
-                     % (old_pep_version, new_pep_version), filepath)
-    replace_with_sed("s/TensorFlow %s/TensorFlow %s/g"
-                     % (old_pep_version, new_pep_version), filepath)
+
+    if filename == "sources" and "rc0" in new_pep_version:
+      replace_string_in_line("(?<!<td>)tensorflow-%s" % old_pep_version,
+                             "tensorflow-%s" % new_pep_version, filepath)
+      replace_string_in_line("(?<!<td>)tensorflow_gpu-%s" % old_pep_version,
+                             "tensorflow_gpu-%s" % new_pep_version, filepath)
+    else:
+      replace_string_in_line("tensorflow-%s" % old_pep_version,
+                             "tensorflow-%s" % new_pep_version, filepath)
+      replace_string_in_line("tensorflow_gpu-%s" % old_pep_version,
+                             "tensorflow_gpu-%s" % new_pep_version, filepath)
+    replace_string_in_line("TensorFlow %s" % old_pep_version,
+                           "TensorFlow %s" % new_pep_version, filepath)
 
   for filename in ["java", "go", "c"]:
     filepath = "%s/docs_src/install/install_%s.md" % (TF_SRC_DIR,
                                                       filename)
-    replace_with_sed(r"s/x86_64-%s/x86_64-%s/g"
-                     % (old_version, new_version), filepath)
-    replace_with_sed(r"s/libtensorflow-%s.jar/libtensorflow-%s.jar/g"
-                     % (old_version, new_version), filepath)
-    replace_with_sed(r"s/<version>%s<\/version>/<version>%s<\/version>/g"
-                     % (old_version, new_version), filepath)
+    replace_string_in_line(r"x86_64-%s" % old_version,
+                           "x86_64-%s" % new_version, filepath)
+    replace_string_in_line(r"libtensorflow-%s.jar" % old_version,
+                           "libtensorflow-%s.jar" % new_version, filepath)
+    replace_string_in_line(r"<version>%s<\/version>" % old_version,
+                           "<version>%s</version>" % new_version, filepath)
 
 
 def major_minor_change(old_version, new_version):
@@ -266,10 +271,11 @@ def update_dockerfiles(old_version, new_version):
           % (old_r_major_minor_string, r_major_minor_string))
 
     # Update dockerfiles
-    replace_with_sed("s/%s/%s/g"
-                     % (old_r_major_minor, r_major_minor), DEVEL_DOCKERFILE)
-    replace_with_sed("s/%s/%s/g"
-                     % (old_r_major_minor, r_major_minor), GPU_DEVEL_DOCKERFILE)
+    replace_string_in_line(old_r_major_minor, r_major_minor, DEVEL_DOCKERFILE)
+    replace_string_in_line(old_r_major_minor, r_major_minor,
+                           GPU_DEVEL_DOCKERFILE)
+    replace_string_in_line(old_r_major_minor, r_major_minor,
+                           CPU_MKL_DEVEL_DOCKERFILE)
 
 
 def check_for_lingering_string(lingering_string):
@@ -333,7 +339,7 @@ def main():
   old_version = get_current_semver_version()
 
   if args.nightly:
-    # dev minor version is one ahead of official
+    # Dev minor version is one ahead of official.
     nightly_minor_ver = int(old_version.minor) + 1
     new_version = Version(old_version.major,
                           str(nightly_minor_ver),
@@ -349,12 +355,18 @@ def main():
   update_md_files(old_version, new_version)
   update_dockerfiles(old_version, new_version)
 
-  # Print transition details
+  # Print transition details.
   print("Major: %s -> %s" % (old_version.major, new_version.major))
   print("Minor: %s -> %s" % (old_version.minor, new_version.minor))
   print("Patch: %s -> %s\n" % (old_version.patch, new_version.patch))
 
   check_for_old_version(old_version, new_version)
+  if "rc0" in str(new_version):
+    print("\n\n\033[93mNOTE: Please update the tensorflow/docs_src/install/"
+          "install_sources.md and add a line for tensorflow-%s and "
+          "tensorflow_gpu-%s in the tested source configurations "
+          "table.\033[0m\n" % (new_version.pep_440_str,
+                               new_version.pep_440_str))
 
 
 if __name__ == "__main__":
-- 
GitLab


From 6d6fe1f8e6e47755ca9d1899e4348d7fa466f132 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 14:25:36 -0800
Subject: [PATCH 029/966] Replace the use of ConsumeValueOrDie with
 TF_ASSERT_OK_AND_ASSIGN.

PiperOrigin-RevId: 182992618
---
 .../service/hlo_element_type_converter_test.cc | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
index 2aed82409b..cb94d9f19b 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
@@ -43,8 +43,8 @@ TEST_F(HloElementTypeConverterTest, CustomCallsNotConverted) {
   )";
   auto module = CreateModuleFromHloString(hlo_string);
   HloElementTypeConverter type_converter(BF16, F32);
-  auto converted = type_converter.Run(module.get()).ConsumeValueOrDie();
-  EXPECT_TRUE(converted == false);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_FALSE(converted);
 }
 
 TEST_F(HloElementTypeConverterTest, InfeedsOutfeedsNotConverted) {
@@ -57,8 +57,8 @@ TEST_F(HloElementTypeConverterTest, InfeedsOutfeedsNotConverted) {
   )";
   auto module = CreateModuleFromHloString(hlo_string);
   HloElementTypeConverter type_converter(BF16, F32);
-  auto converted = type_converter.Run(module.get()).ConsumeValueOrDie();
-  EXPECT_TRUE(converted == false);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_FALSE(converted);
 }
 
 TEST_F(HloElementTypeConverterTest, OperationsInNestedTuplesConverted) {
@@ -77,9 +77,8 @@ TEST_F(HloElementTypeConverterTest, OperationsInNestedTuplesConverted) {
 
   auto module = CreateModuleFromHloString(hlo_string);
   HloElementTypeConverter type_converter(BF16, F32);
-  auto converted = type_converter.Run(module.get()).ConsumeValueOrDie();
-  EXPECT_TRUE(converted == true);
-
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_TRUE(converted);
   const HloInstruction* bf16_op =
       module->entry_computation()->root_instruction()->operand(0)->operand(1);
   EXPECT_THAT(bf16_op, op::Convert(op::Add(op::Constant(), op::Convert())));
@@ -106,9 +105,8 @@ TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) {
 
   auto module = CreateModuleFromHloString(hlo_string);
   HloElementTypeConverter type_converter(BF16, F32);
-  auto converted = type_converter.Run(module.get()).ConsumeValueOrDie();
-  EXPECT_TRUE(converted == true);
-
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_TRUE(converted);
   const HloInstruction* tuple_instr =
       module->entry_computation()->root_instruction();
   ::testing::Matcher<const ::xla::HloInstruction*> batch_norm =
-- 
GitLab


From 23e25320211c13b48f67db603a6829ab3908d71e Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 23 Jan 2018 14:31:15 -0800
Subject: [PATCH 030/966] Turn private Operation and Tensor fields into
 properties.

Unfortunately there are many uses of these private fields. Turning
them into properties that call public API methods will allow enabling
the C API before fixing all such uses.

PiperOrigin-RevId: 182993539
---
 tensorflow/python/framework/ops.py | 134 ++++++++++++++++++++---------
 1 file changed, 94 insertions(+), 40 deletions(-)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index ce9ca07215..9d3806e8fe 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -287,7 +287,7 @@ class Tensor(_TensorLike):
     self._op = op
     self._value_index = value_index
     self._dtype = dtypes.as_dtype(dtype)
-    self._shape = tensor_shape.unknown_shape()
+    self._shape_val = tensor_shape.unknown_shape()
     # List of operations that use this Tensor as input.  We maintain this list
     # to easily navigate a computation graph.
     self._consumers = []
@@ -381,7 +381,18 @@ class Tensor(_TensorLike):
               graph, self._as_tf_output(), num_dims, status)
         dim_list = [None if i == -1 else i for i in dim_list]
       return tensor_shape.TensorShape(dim_list)
-    return self._shape
+    return self._shape_val
+
+  @property
+  def _shape(self):
+    logging.warning("Tensor._shape is private, use Tensor.shape "
+                    "instead. Tensor._shape will eventually be removed.")
+    return self.shape
+
+  @_shape.setter
+  def _shape(self, value):
+    raise ValueError(
+        "Tensor._shape cannot be assigned, use Tensor.set_shape instead.")
 
   def __iter__(self):
     if context.in_graph_mode():
@@ -456,7 +467,7 @@ class Tensor(_TensorLike):
         this tensor.
     """
     if not _USE_C_API:
-      self._shape = self._shape.merge_with(shape)  # pylint: disable=protected-access
+      self._shape_val = self._shape_val.merge_with(shape)
       return
     if not isinstance(shape, tensor_shape.TensorShape):
       shape = tensor_shape.TensorShape(shape)
@@ -1572,7 +1583,7 @@ class Operation(object):
             "Cannot create a tensor proto whose content is larger than 2GB.")
       if not _VALID_OP_NAME_REGEX.match(node_def.name):
         raise ValueError("'%s' is not a valid node name" % node_def.name)
-      self._node_def = copy.deepcopy(node_def)
+      self._node_def_val = copy.deepcopy(node_def)
       c_op = None
     elif type(node_def).__name__ == "SwigPyObject":
       assert inputs is None
@@ -1581,7 +1592,7 @@ class Operation(object):
       assert input_types is None
       assert original_op is None
       assert op_def is None
-      self._node_def = None
+      self._node_def_val = None
       c_op = node_def
     else:
       raise TypeError("node_def needs to be a NodeDef: %s" % node_def)
@@ -1589,28 +1600,29 @@ class Operation(object):
     if not isinstance(g, Graph):
       raise TypeError("g needs to be a Graph: %s" % g)
     self._graph = g
+
     if inputs is None:
       inputs = []
     elif not isinstance(inputs, list):
       raise TypeError("inputs needs to be a list of Tensors: %s" % inputs)
-    self._inputs = list(inputs)  # Defensive copy.
-    for a in self._inputs:
+    self._inputs_val = list(inputs)  # Defensive copy.
+    for a in self._inputs_val:
       if not isinstance(a, Tensor):
         raise TypeError("input needs to be a Tensor: %s" % a)
     if input_types is None:
-      input_types = [i.dtype.base_dtype for i in self._inputs]
+      input_types = [i.dtype.base_dtype for i in self._inputs_val]
     else:
       if not all(
           x.is_compatible_with(i.dtype)
-          for i, x in zip(self._inputs, input_types)):
+          for i, x in zip(self._inputs_val, input_types)):
         raise TypeError("In op '%s', input types (%s) are not compatible "
                         "with expected types (%s)" %
-                        (self.node_def.name, [i.dtype for i in self._inputs],
+                        (self.node_def.name, [i.dtype for i in self._inputs_val],
                          input_types))
     self._input_types_val = input_types
 
     # Build the list of control inputs.
-    self._control_inputs = []
+    self._control_inputs_val = []
     if control_inputs:
       for c in control_inputs:
         control_op = None
@@ -1621,11 +1633,11 @@ class Operation(object):
         else:
           raise TypeError("Control input must be an Operation, "
                           "a Tensor, or IndexedSlices: %s" % c)
-        self._control_inputs.append(control_op)
+        self._control_inputs_val.append(control_op)
 
     self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._original_op = original_op
-    self._op_def = op_def
+    self._op_def_val = op_def
     self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
     self._control_flow_context = self.graph._get_control_flow_context()  # pylint: disable=protected-access
 
@@ -1641,15 +1653,15 @@ class Operation(object):
       # Refactor so we don't have to do this here.
       grouped_inputs = self._reconstruct_sequence_inputs(
           op_def, inputs, node_def.attr)
-      self._c_op = _create_c_op(self._graph, self._node_def, grouped_inputs,
-                                self._control_inputs)
+      self._c_op = _create_c_op(self._graph, self._node_def_val, grouped_inputs,
+                                self._control_inputs_val)
     else:
       self._c_op = None
 
     # Mark that we consume the inputs. This is unnecessary and unsupported with
     # the C API enabled, since the C API tracks the tensor consumers instead.
     if not self._c_op:
-      for input_tensor in self._inputs:
+      for input_tensor in self._inputs_val:
         input_tensor._add_consumer(self)  # pylint: disable=protected-access
 
     # Initialize self._outputs.
@@ -1764,7 +1776,7 @@ class Operation(object):
     if self._c_op:
       return c_api.TF_OperationName(self._c_op)
     else:
-      return self._node_def.name
+      return self._node_def_val.name
 
   @property
   def _id(self):
@@ -1783,7 +1795,7 @@ class Operation(object):
     if self._c_op:
       return c_api.TF_OperationDevice(self._c_op)
     else:
-      return self._node_def.device
+      return self._node_def_val.device
 
   @property
   def _output_types(self):
@@ -1843,7 +1855,7 @@ class Operation(object):
           self._c_op,  # pylint: disable=protected-access
           compat.as_str(_device_string(device)))
     else:
-      self._node_def.device = _device_string(device)
+      self._node_def_val.device = _device_string(device)
 
   def _add_input(self, tensor, dtype=None):
     """Add a new input to this operation.
@@ -1871,7 +1883,7 @@ class Operation(object):
         raise TypeError(
             "Cannot convert a tensor of type %s to an input of type %s" %
             (tensor.dtype.name, dtype.name))
-    self._inputs.append(tensor)
+    self._inputs_val.append(tensor)
     self._input_types_val.append(dtype)
     tensor._add_consumer(self)  # pylint: disable=protected-access
     self._recompute_node_def()
@@ -1901,8 +1913,8 @@ class Operation(object):
             self._tf_input(index),
             status)
     else:
-      self._inputs[index].consumers().remove(self)
-      self._inputs[index] = tensor
+      self._inputs_val[index].consumers().remove(self)
+      self._inputs_val[index] = tensor
       self._input_types_val[index] = tensor.dtype
       tensor._add_consumer(self)  # pylint: disable=protected-access
       self._recompute_node_def()
@@ -1928,7 +1940,7 @@ class Operation(object):
           if not isinstance(op, Operation):
             raise TypeError("op must be an Operation: %s" % op)
           _assert_same_graph(self, op)
-          self._control_inputs.append(op)
+          self._control_inputs_val.append(op)
         self._recompute_node_def()
 
   def _add_control_input(self, op):
@@ -1960,13 +1972,14 @@ class Operation(object):
     # TODO(skyewm): remove this function when we switch to C API
     if self._c_op: return
 
-    del self._node_def.input[:]
+    del self._node_def_val.input[:]
     # pylint: disable=protected-access
-    self._node_def.input.extend([t._as_node_def_input() for t in self._inputs])
+    self._node_def_val.input.extend(
+        [t._as_node_def_input() for t in self._inputs_val])
     # pylint: enable=protected-access
-    if self._control_inputs:
-      self._node_def.input.extend(
-          ["^%s" % op.name for op in self._control_inputs])
+    if self._control_inputs_val:
+      self._node_def_val.input.extend(
+          ["^%s" % op.name for op in self._control_inputs_val])
 
   def __str__(self):
     return str(self.node_def)
@@ -2016,7 +2029,17 @@ class Operation(object):
       ]
       # pylint: enable=protected-access
       return Operation._InputList(retval)
-    return Operation._InputList(self._inputs)
+    return Operation._InputList(self._inputs_val)
+
+  @property
+  def _inputs(self):
+    logging.warning("Operation._inputs is private, use Operation.inputs "
+                    "instead. Operation._inputs will eventually be removed.")
+    return self.inputs
+
+  @_inputs.setter
+  def _inputs(self, value):
+    raise ValueError("Cannot assign _inputs")
 
   @property
   def _input_types(self):
@@ -2030,6 +2053,10 @@ class Operation(object):
     else:
       return self._input_types_val
 
+  @_input_types.setter
+  def _input_types(self, value):
+    raise ValueError("Cannot assign _input_types")
+
   @property
   def control_inputs(self):
     """The `Operation` objects on which this op has a control dependency.
@@ -2053,7 +2080,22 @@ class Operation(object):
       ]
       # pylint: enable=protected-access
     else:
-      return self._control_inputs
+      return self._control_inputs_val
+
+  @property
+  def _control_inputs(self):
+    logging.warning("Operation._control_inputs is private, use "
+                    "Operation.control_inputs instead. "
+                    "Operation._control_inputs will eventually be removed.")
+    return self.control_inputs
+
+  @_control_inputs.setter
+  def _control_inputs(self, value):
+    logging.warning("Operation._control_inputs is private, use "
+                    "Operation.control_inputs instead. "
+                    "Operation._control_inputs will eventually be removed.")
+    self._remove_all_control_inputs()
+    self._add_control_inputs(value)
 
   @property
   def type(self):
@@ -2062,7 +2104,7 @@ class Operation(object):
       op_type = c_api.TF_OperationOpType(self._c_op)
       return op_type
     else:
-      return self._node_def.op
+      return self._node_def_val.op
 
   @property
   def graph(self):
@@ -2089,7 +2131,13 @@ class Operation(object):
       node_def.ParseFromString(compat.as_bytes(data))
       return node_def
     else:
-      return self._node_def
+      return self._node_def_val
+
+  @property
+  def _node_def(self):
+    logging.warning("Operation._node_def is private, use Operation.node_def "
+                    "instead. Operation._node_def will eventually be removed.")
+    return self.node_def
 
   @property
   def op_def(self):
@@ -2114,7 +2162,13 @@ class Operation(object):
       op_def.ParseFromString(compat.as_bytes(data))
       return op_def
     else:
-      return self._op_def
+      return self._op_def_val
+
+  @property
+  def _op_def(self):
+    logging.warning("Operation._op_def is private, use Operation.op_def "
+                    "instead. Operation._op_def will eventually be removed.")
+    return self.op_def
 
   @property
   def traceback(self):
@@ -2146,7 +2200,7 @@ class Operation(object):
       finally:
         c_api.TF_DeleteBuffer(buf)
     else:
-      self._node_def.attr[attr_name].CopyFrom(attr_value)
+      self._node_def_val.attr[attr_name].CopyFrom(attr_value)
 
   def get_attr(self, name):
     """Returns the value of the attr of this op with the given `name`.
@@ -2173,10 +2227,10 @@ class Operation(object):
       x = attr_value_pb2.AttrValue()
       x.ParseFromString(data)
     else:
-      if name not in self._node_def.attr:
+      if name not in self._node_def_val.attr:
         raise ValueError(
-            "No attr named '" + name + "' in " + str(self._node_def))
-      x = self._node_def.attr[name]
+            "No attr named '" + name + "' in " + str(self._node_def_val))
+      x = self._node_def_val.attr[name]
 
     # Treat an empty oneof value as an empty list.
     if not x.WhichOneof("value"):
@@ -4196,10 +4250,10 @@ class Graph(object):
       """
       self._graph = graph
       if control_inputs is None:
-        self._control_inputs = []
+        self._control_inputs_val = []
         self._new_stack = True
       else:
-        self._control_inputs = control_inputs
+        self._control_inputs_val = control_inputs
         self._new_stack = False
       self._seen_nodes = set()
       self._old_stack = None
@@ -4227,7 +4281,7 @@ class Graph(object):
 
     @property
     def control_inputs(self):
-      return self._control_inputs
+      return self._control_inputs_val
 
     def add_op(self, op):
       self._seen_nodes.add(op)
-- 
GitLab


From b2637597c5f23bbd3f5a71f9ec91b65898ea896f Mon Sep 17 00:00:00 2001
From: Yilei Yang <yileiyang@google.com>
Date: Thu, 14 Dec 2017 11:12:52 -0800
Subject: [PATCH 031/966] Continue to allow flag access before explicit parse.

Made tf.flags.FLAGS a wrapper of absl.flags.FLAGS, when the flag is access, parse flags implicitly with sys.argv if not yet.

PiperOrigin-RevId: 179068530
---
 tensorflow/python/BUILD                  |  5 ++-
 tensorflow/python/platform/flags.py      | 55 +++++++++++++++++++++++
 tensorflow/python/platform/flags_test.py | 57 +++++++++++++++++++++++-
 3 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 5255b69418..375a5a0720 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -171,7 +171,10 @@ tf_py_test(
     name = "flags_test",
     size = "small",
     srcs = ["platform/flags_test.py"],
-    additional_deps = [":platform"],
+    additional_deps = [
+        ":client_testlib",
+        ":platform",
+    ],
 )
 
 tf_py_test(
diff --git a/tensorflow/python/platform/flags.py b/tensorflow/python/platform/flags.py
index abd6f3d855..6225db7744 100644
--- a/tensorflow/python/platform/flags.py
+++ b/tensorflow/python/platform/flags.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import logging as _logging
+import sys as _sys
 
 # go/tf-wildcard-import
 from absl.flags import *  # pylint: disable=wildcard-import
@@ -59,6 +60,58 @@ def _wrap_define_function(original_function):
   return tf_decorator.make_decorator(original_function, wrapper)
 
 
+class _FlagValuesWrapper(object):
+  """Wrapper class for absl.flags.FLAGS.
+
+  The difference is that tf.flags.FLAGS implicitly parses flags with sys.argv
+  when accessing the FLAGS values before it's explicitly parsed,
+  while absl.flags.FLAGS raises an exception.
+  """
+
+  def __init__(self, flags_object):
+    self.__dict__['__wrapped'] = flags_object
+
+  def __getattribute__(self, name):
+    if name == '__dict__':
+      return super(_FlagValuesWrapper, self).__getattribute__(name)
+    return self.__dict__['__wrapped'].__getattribute__(name)
+
+  def __getattr__(self, name):
+    wrapped = self.__dict__['__wrapped']
+    # To maintain backwards compatibility, implicitly parse flags when reading
+    # a flag.
+    if not wrapped.is_parsed():
+      wrapped(_sys.argv)
+    return wrapped.__getattr__(name)
+
+  def __setattr__(self, name, value):
+    return self.__dict__['__wrapped'].__setattr__(name, value)
+
+  def __delattr__(self, name):
+    return self.__dict__['__wrapped'].__delattr__(name)
+
+  def __dir__(self):
+    return self.__dict__['__wrapped'].__dir__()
+
+  def __getitem__(self, name):
+    return self.__dict__['__wrapped'].__getitem__(name)
+
+  def __setitem__(self, name, flag):
+    return self.__dict__['__wrapped'].__setitem__(name, flag)
+
+  def __len__(self):
+    return self.__dict__['__wrapped'].__len__()
+
+  def __iter__(self):
+    return self.__dict__['__wrapped'].__iter__()
+
+  def __str__(self):
+    return self.__dict__['__wrapped'].__str__()
+
+  def __call__(self, *args, **kwargs):
+    return self.__dict__['__wrapped'].__call__(*args, **kwargs)
+
+
 # pylint: disable=invalid-name,used-before-assignment
 # absl.flags APIs use `default` as the name of the default value argument.
 # Allow the following functions continue to accept `default_value`.
@@ -68,3 +121,5 @@ DEFINE_bool = DEFINE_boolean
 DEFINE_float = _wrap_define_function(DEFINE_float)
 DEFINE_integer = _wrap_define_function(DEFINE_integer)
 # pylint: enable=invalid-name,used-before-assignment
+
+FLAGS = _FlagValuesWrapper(FLAGS)  # pylint: disable=used-before-assignment
diff --git a/tensorflow/python/platform/flags_test.py b/tensorflow/python/platform/flags_test.py
index e8200142dd..bd3c8e3995 100644
--- a/tensorflow/python/platform/flags_test.py
+++ b/tensorflow/python/platform/flags_test.py
@@ -17,11 +17,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
 import unittest
 
 from absl import flags as absl_flags
 
 from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
 
 
 flags.DEFINE_string(
@@ -48,8 +50,59 @@ flags.DEFINE_boolean(
 
 class FlagsTest(unittest.TestCase):
 
-  def test_global_flags_object(self):
-    self.assertIs(flags.FLAGS, absl_flags.FLAGS)
+  def setUp(self):
+    self.original_flags = flags.FlagValues()
+    self.wrapped_flags = flags._FlagValuesWrapper(self.original_flags)
+    flags.DEFINE_string(
+        'test', 'default', 'test flag', flag_values=self.wrapped_flags)
+
+  def test_attribute_overrides(self):
+    # Test that methods defined in absl.flags.FlagValues are the same as the
+    # wrapped ones.
+    self.assertEqual(flags.FLAGS.is_parsed, absl_flags.FLAGS.is_parsed)
+
+  def test_getattr(self):
+    self.assertFalse(self.wrapped_flags.is_parsed())
+    with test.mock.patch.object(sys, 'argv', new=['program', '--test=new']):
+      self.assertEqual('new', self.wrapped_flags.test)
+    self.assertTrue(self.wrapped_flags.is_parsed())
+
+  def test_setattr(self):
+    self.assertEqual('default', self.wrapped_flags.test)
+    self.wrapped_flags.test = 'new'
+    self.assertEqual('new', self.wrapped_flags.test)
+
+  def test_delattr(self):
+    del self.wrapped_flags.test
+    self.assertNotIn('test', self.wrapped_flags)
+    with self.assertRaises(AttributeError):
+      _ = self.wrapped_flags.test
+
+  def test_dir(self):
+    self.assertEqual(['test'], dir(self.wrapped_flags))
+
+  def test_getitem(self):
+    self.assertIs(self.original_flags['test'], self.wrapped_flags['test'])
+
+  def test_setitem(self):
+    flag = flags.Flag(flags.ArgumentParser(), flags.ArgumentSerializer(),
+                      'fruit', 'apple', 'the fruit type')
+    self.wrapped_flags['fruit'] = flag
+    self.assertIs(self.original_flags['fruit'], self.wrapped_flags['fruit'])
+    self.assertEqual('apple', self.wrapped_flags.fruit)
+
+  def test_len(self):
+    self.assertEqual(1, len(self.wrapped_flags))
+
+  def test_iter(self):
+    self.assertEqual(['test'], list(self.wrapped_flags))
+
+  def test_str(self):
+    self.assertEqual(str(self.wrapped_flags), str(self.original_flags))
+
+  def test_call(self):
+    self.wrapped_flags(['program', '--test=new'])
+    self.assertEqual('new', self.wrapped_flags.test)
 
   def test_keyword_arguments(self):
     test_cases = (
-- 
GitLab


From a1e777e379170ca10f422d78c75d9c709eb692c1 Mon Sep 17 00:00:00 2001
From: Nick Felt <nickfelt@google.com>
Date: Thu, 18 Jan 2018 15:50:16 -0800
Subject: [PATCH 032/966] Update tensorboard dependency to minimum of 0.4.0

This should address https://github.com/tensorflow/tensorboard/issues/877.

PiperOrigin-RevId: 182451796
---
 tensorflow/tools/pip_package/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 0c5f42b123..02310730fe 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -36,7 +36,7 @@ REQUIRED_PACKAGES = [
     'numpy >= 1.12.1',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'tensorflow-tensorboard',
+    'tensorflow-tensorboard >= 0.4.0',
 ]
 
 project_name = 'tensorflow'
-- 
GitLab


From 474929984808916ee89e26c8007081aa0975fe80 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 10 Jan 2018 18:32:40 -0800
Subject: [PATCH 033/966] Added the "Getting Started with TensorFlow for ML
 Beginners" chapter to Get Started home page.

PiperOrigin-RevId: 181548668
---
 tensorflow/docs_src/get_started/index.md      | 32 +++++++++----------
 tensorflow/docs_src/get_started/leftnav_files |  5 +++
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md
index d0cb69d211..b7bd1286e3 100644
--- a/tensorflow/docs_src/get_started/index.md
+++ b/tensorflow/docs_src/get_started/index.md
@@ -1,35 +1,35 @@
 # Getting Started
 
 TensorFlow is a tool for machine learning. While it contains a wide range of
-functionality, it is mainly designed for deep neural network models.
+functionality, TensorFlow is mainly designed for deep neural network models.
 
-The fastest way to build a fully-featured model trained on your data is to use
-TensorFlow's high-level API. In the following examples, we will use the
-high-level API on the classic [Iris dataset](https://en.wikipedia.org/wiki/Iris_flower_data_set).
-We will train a model that predicts what species a flower is based on its
-characteristics, and along the way get a quick introduction to the basic tasks
-in TensorFlow using Estimators.
+TensorFlow provides many APIs. This section focuses on the high-level APIs.
+If you are new to TensorFlow, begin by reading one of the following documents:
 
-This tutorial is divided into the following parts:
+  * @{$get_started/get_started_for_beginners}, which is aimed at readers
+    new to machine learning.
+  * @{$get_started/premade_estimators}, which is aimed at readers who have
+    experience in machine learning.
 
-  * @{$get_started/premade_estimators}, which shows you
-    how to quickly setup prebuilt models to train on in-memory data.
-  * @{$get_started/checkpoints}, which shows you how to save training progress,
+Then, read the following documents, which demonstrate the key features
+in the high-level APIs:
+
+  * @{$get_started/checkpoints}, which explains how to save training progress
     and resume where you left off.
   * @{$get_started/feature_columns}, which shows how an
     Estimator can handle a variety of input data types without changes to the
     model.
-  * @{$get_started/datasets_quickstart}, which is a minimal introduction to
-    the TensorFlow's input pipelines.
+  * @{$get_started/datasets_quickstart}, which introduces TensorFlow's
+    input pipelines.
   * @{$get_started/custom_estimators}, which demonstrates how
     to build and train models you design yourself.
 
 For more advanced users:
 
   * The @{$low_level_intro$Low Level Introduction} demonstrates how to use
-    tensorflow outside of the Estimator framework, for debugging and
+    TensorFlow outside of the Estimator framework, for debugging and
     experimentation.
-  * The remainder of the @{$programmers_guide$Programmer's Guide} contains
-    in-depth guides to various major components of TensorFlow.
+  * The @{$programmers_guide$Programmer's Guide} details major
+    TensorFlow components.
   * The @{$tutorials$Tutorials} provide walkthroughs of a variety of
     TensorFlow models.
diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files
index 668daae9cb..437791d6a3 100644
--- a/tensorflow/docs_src/get_started/leftnav_files
+++ b/tensorflow/docs_src/get_started/leftnav_files
@@ -1,5 +1,10 @@
 index.md
+
+### Getting Started
+get_started_for_beginners.md
 premade_estimators.md
+
+### Details
 checkpoints.md
 feature_columns.md
 datasets_quickstart.md
-- 
GitLab


From 392437c566f5363bc4ac25c4932fb0fbf48fb3ed Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 23 Jan 2018 14:54:22 -0800
Subject: [PATCH 034/966] Make bijectors/reshape_test.py work with C API
 enabled.

Some of the error messages change with the C API enabled due to slight
differences in shape inference (in this case, the C API catches shape
errors sooner). This change refactors some of the tests to expect
different error messages in different cases.

PiperOrigin-RevId: 182997183
---
 .../kernel_tests/bijectors/reshape_test.py    | 52 +++++++++++++++----
 1 file changed, 43 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
index 49451446b5..e216d88cb1 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
@@ -22,12 +22,15 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops.bijectors.reshape import Reshape
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class _ReshapeBijectorTest(object):
   """Base class for testing the reshape transformation.
 
@@ -136,7 +139,8 @@ class _ReshapeBijectorTest(object):
         sess.run(bijector.forward_event_shape_tensor(shape_in),
                  feed_dict=feed_dict)
 
-  def testInvalidDimensionsOpError(self):
+  # pylint: disable=invalid-name
+  def _testInvalidDimensionsOpError(self, expected_error_message):
 
     with self.test_session() as sess:
 
@@ -146,10 +150,10 @@ class _ReshapeBijectorTest(object):
           event_shape_in=shape_in,
           validate_args=True)
 
-      with self.assertRaisesError(
-          "elements must be either positive integers or `-1`."):
+      with self.assertRaisesError(expected_error_message):
         sess.run(bijector.forward_event_shape_tensor(shape_in),
                  feed_dict=feed_dict)
+  # pylint: enable=invalid-name
 
   def testValidButNonMatchingInputOpError(self):
     x = np.random.randn(4, 3, 2)
@@ -184,7 +188,8 @@ class _ReshapeBijectorTest(object):
         sess.run(bijector.forward(x),
                  feed_dict=feed_dict)
 
-  def testInputOutputMismatchOpError(self):
+  # pylint: disable=invalid-name
+  def _testInputOutputMismatchOpError(self, expected_error_message):
     x1 = np.random.randn(4, 2, 3)
     x2 = np.random.randn(4, 1, 1, 5)
 
@@ -196,13 +201,11 @@ class _ReshapeBijectorTest(object):
           event_shape_in=shape_in,
           validate_args=True)
 
-      # test that *all* methods check basic assertions
-      with self.assertRaisesError(
-          "Input to reshape is a tensor with"):
+      with self.assertRaisesError(expected_error_message):
         sess.run(bijector.forward(x1), feed_dict=fd_mismatched)
-      with self.assertRaisesError(
-          "Input to reshape is a tensor with"):
+      with self.assertRaisesError(expected_error_message):
         sess.run(bijector.inverse(x2), feed_dict=fd_mismatched)
+  # pylint: enable=invalid-name
 
   def testOneShapePartiallySpecified(self):
     expected_x = np.random.randn(4, 6)
@@ -262,6 +265,7 @@ class _ReshapeBijectorTest(object):
     raise NotImplementedError("Subclass failed to implement `build_shapes`.")
 
 
+@test_util.with_c_api
 class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest):
 
   def build_shapes(self, shape_in, shape_out):
@@ -299,7 +303,22 @@ class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest):
           validate_args=True)
       assert_bijective_and_finite(bijector, x, y, rtol=1e-6, atol=0)
 
+  def testInvalidDimensionsOpError(self):
+    if ops._USE_C_API:
+      error_message = "Invalid value in tensor used for shape: -2"
+    else:
+      error_message = "elements must be either positive integers or `-1`."
+    self._testInvalidDimensionsOpError(error_message)
+
+  def testInputOutputMismatchOpError(self):
+    if ops._USE_C_API:
+      error_message = "Cannot reshape a tensor with"
+    else:
+      error_message = "Input to reshape is a tensor with"
+    self._testInputOutputMismatchOpError(error_message)
+
 
+@test_util.with_c_api
 class ReshapeBijectorTestDynamic(test.TestCase, _ReshapeBijectorTest):
 
   def build_shapes(self, shape_in, shape_out):
@@ -313,7 +332,15 @@ class ReshapeBijectorTestDynamic(test.TestCase, _ReshapeBijectorTest):
   def assertRaisesError(self, msg):
     return self.assertRaisesOpError(msg)
 
+  def testInvalidDimensionsOpError(self):
+    self._testInvalidDimensionsOpError(
+        "elements must be either positive integers or `-1`.")
+
+  def testInputOutputMismatchOpError(self):
+    self._testInputOutputMismatchOpError("Input to reshape is a tensor with")
+
 
+@test_util.with_c_api
 class ReshapeBijectorTestDynamicNdims(test.TestCase, _ReshapeBijectorTest):
 
   def build_shapes(self, shape_in, shape_out):
@@ -325,6 +352,13 @@ class ReshapeBijectorTestDynamicNdims(test.TestCase, _ReshapeBijectorTest):
   def assertRaisesError(self, msg):
     return self.assertRaisesOpError(msg)
 
+  def testInvalidDimensionsOpError(self):
+    self._testInvalidDimensionsOpError(
+        "elements must be either positive integers or `-1`.")
+
+  def testInputOutputMismatchOpError(self):
+    self._testInputOutputMismatchOpError("Input to reshape is a tensor with")
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 67ed32d35148f382de92c9383347400875eaba40 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 23 Jan 2018 15:01:14 -0800
Subject: [PATCH 035/966] Tweaks to PR 16101 which adds stream selection
 support for `tf.contrib.ffmpeg.decode_audio`.

PiperOrigin-RevId: 182998275
---
 tensorflow/contrib/ffmpeg/decode_audio_op.cc  |  20 +++++++++++----
 .../contrib/ffmpeg/decode_audio_op_test.py    |  19 ++++++++++++--
 .../contrib/ffmpeg/default/ffmpeg_lib.cc      |  24 +++++++++++-------
 tensorflow/contrib/ffmpeg/ffmpeg_lib.h        |   2 +-
 tensorflow/contrib/ffmpeg/ffmpeg_ops.py       |   7 +++--
 .../testdata/mono_16khz_mp3_32khz_aac.mp4     | Bin 0 -> 69357 bytes
 6 files changed, 53 insertions(+), 19 deletions(-)
 create mode 100644 tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3_32khz_aac.mp4

diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op.cc b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
index 92fad70b1f..5ab57ca4cd 100644
--- a/tensorflow/contrib/ffmpeg/decode_audio_op.cc
+++ b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
@@ -44,7 +44,7 @@ const char* kValidFileFormats[] = {"mp3", "mp4", "ogg", "wav"};
 void Decode(OpKernelContext* context,
             const tensorflow::StringPiece& file_contents,
             const string& file_format, const int32 samples_per_second,
-            const int32 channel_count) {
+            const int32 channel_count, const string& stream) {
   // Write the input data to a temp file.
   const string temp_filename = io::GetTempFilename(file_format);
   OP_REQUIRES_OK(context, WriteFile(temp_filename, file_contents));
@@ -54,7 +54,7 @@ void Decode(OpKernelContext* context,
   std::vector<float> output_samples;
   Status result =
       ffmpeg::ReadAudioFile(temp_filename, file_format, samples_per_second,
-                            channel_count, &output_samples);
+                            channel_count, stream, &output_samples);
   if (result.code() == error::Code::NOT_FOUND) {
     OP_REQUIRES(
         context, result.ok(),
@@ -99,7 +99,12 @@ void Decode(OpKernelContext* context,
  */
 class DecodeAudioOpV2 : public OpKernel {
  public:
-  explicit DecodeAudioOpV2(OpKernelConstruction* context) : OpKernel(context) {}
+  explicit DecodeAudioOpV2(OpKernelConstruction* context) : OpKernel(context) {
+    string stream;
+    if (context->GetAttr("stream", &stream).ok()) {
+      stream_ = stream;
+    }
+  }
 
   void Compute(OpKernelContext* context) override {
     OP_REQUIRES(
@@ -153,8 +158,12 @@ class DecodeAudioOpV2 : public OpKernel {
         errors::InvalidArgument("channel_count must be positive, but got: ",
                                 channel_count));
 
-    Decode(context, contents, file_format, samples_per_second, channel_count);
+    Decode(context, contents, file_format, samples_per_second, channel_count,
+           stream_);
   }
+
+ private:
+  string stream_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("DecodeAudioV2").Device(DEVICE_CPU),
@@ -166,6 +175,7 @@ REGISTER_OP("DecodeAudioV2")
     .Input("samples_per_second: int32")
     .Input("channel_count: int32")
     .Output("sampled_audio: float")
+    .Attr("stream: string = ''")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       const Tensor* channels_tensor = c->input_tensor(3);
       if (channels_tensor == nullptr) {
@@ -237,7 +247,7 @@ class DecodeAudioOp : public OpKernel {
 
     const tensorflow::StringPiece file_contents = contents.scalar<string>()();
     Decode(context, file_contents, file_format_, samples_per_second_,
-           channel_count_);
+           channel_count_, "");
   }
 
  private:
diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op_test.py b/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
index 0d7c9cb99e..3dc663bb6f 100644
--- a/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
+++ b/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
@@ -33,7 +33,8 @@ class DecodeAudioOpTest(test.TestCase):
 
   def _loadFileAndTest(self, filename, file_format, duration_sec,
                        samples_per_second, channel_count,
-                       samples_per_second_tensor=None, feed_dict=None):
+                       samples_per_second_tensor=None, feed_dict=None,
+                       stream=None):
     """Loads an audio file and validates the output tensor.
 
     Args:
@@ -49,6 +50,9 @@ class DecodeAudioOpTest(test.TestCase):
       feed_dict: Used when evaluating the `decode_audio` op. If not
         provided, will be empty. Useful when providing a placeholder for
         `samples_per_second_tensor`.
+      stream: A string specifying which stream from the content file
+        should be decoded. The default value is '' which leaves the
+        decision to ffmpeg.
     """
     if samples_per_second_tensor is None:
       samples_per_second_tensor = samples_per_second
@@ -62,7 +66,7 @@ class DecodeAudioOpTest(test.TestCase):
           contents,
           file_format=file_format,
           samples_per_second=samples_per_second_tensor,
-          channel_count=channel_count)
+          channel_count=channel_count, stream=stream)
       audio = audio_op.eval(feed_dict=feed_dict or {})
       self.assertEqual(len(audio.shape), 2)
       self.assertNear(
@@ -72,6 +76,17 @@ class DecodeAudioOpTest(test.TestCase):
           0.1 * audio.shape[0])
       self.assertEqual(audio.shape[1], channel_count)
 
+  def testStreamIdentifier(self):
+    # mono_16khz_mp3_32khz_aac.mp4 was generated from:
+    # ffmpeg -i tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3.mp4 \
+    #        -i tensorflow/contrib/ffmpeg/testdata/mono_32khz_aac.mp4 \
+    #        -strict -2 -map 0:a -map 1:a \
+    #        tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3_32khz_aac.mp4
+    self._loadFileAndTest('mono_16khz_mp3_32khz_aac.mp4', 'mp4', 2.77, 20000,
+                          1, stream='0')
+    self._loadFileAndTest('mono_16khz_mp3_32khz_aac.mp4', 'mp4', 2.77, 20000,
+                          1, stream='1')
+
   def testMonoMp3(self):
     self._loadFileAndTest('mono_16khz.mp3', 'mp3', 0.57, 20000, 1)
     self._loadFileAndTest('mono_16khz.mp3', 'mp3', 0.57, 20000, 2)
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index 1e8af1458c..3c51deefbc 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -44,8 +44,10 @@ std::vector<string> FfmpegAudioCommandLine(const string& input_filename,
                                            const string& output_filename,
                                            const string& input_format_id,
                                            int32 samples_per_second,
-                                           int32 channel_count) {
-  return {"-nostats",             // No additional progress display.
+                                           int32 channel_count,
+                                           const string& stream) {
+  std::vector<string> command({
+          "-nostats",             // No additional progress display.
           "-nostdin",             // No interactive commands accepted.
           "-f", input_format_id,  // eg: "mp3"
           "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename,
@@ -58,8 +60,15 @@ std::vector<string> FfmpegAudioCommandLine(const string& input_filename,
           // Output set (in several ways) to signed 16-bit little-endian ints.
           "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le",
           "-sn",  // No subtitle recording.
-          "-y",   // Overwrite output file.
-          StrCat(output_filename)};
+          "-y"   // Overwrite output file.
+  });
+  if (!stream.empty()) {
+    command.emplace_back("-map");
+    command.emplace_back(StrCat("0:", stream));
+  }
+  command.emplace_back(StrCat(output_filename));
+
+  return command;
 }
 
 std::vector<string> FfmpegVideoCommandLine(const string& input_filename,
@@ -123,7 +132,6 @@ bool IsBinaryInstalled(const string& binary_name) {
   std::transform(args.begin(), args.end(), std::back_inserter(args_chars),
                  [](const string& s) { return const_cast<char*>(s.c_str()); });
   args_chars.push_back(nullptr);
-
   ::execvp(kFfmpegExecutable, args_chars.data());
   // exec only returns on error.
   const int error = errno;
@@ -308,13 +316,12 @@ Status WriteFile(const string& filename, StringPiece contents) {
 
 Status ReadAudioFile(const string& filename, const string& audio_format_id,
                      int32 samples_per_second, int32 channel_count,
-                     std::vector<float>* output_samples) {
+                     const string& stream, std::vector<float>* output_samples) {
   // Create an argument list.
   string output_filename = io::GetTempFilename("raw");
   const std::vector<string> args =
       FfmpegAudioCommandLine(filename, output_filename, audio_format_id,
-                             samples_per_second, channel_count);
-
+                             samples_per_second, channel_count, stream);
   // Unfortunately, it's impossible to differentiate an exec failure due to the
   // binary being missing and an error from the binary's execution. Therefore,
   // check to see if the binary *should* be available. If not, return an error
@@ -368,7 +375,6 @@ Status ReadVideoFile(const string& filename, std::vector<uint8>* output_data,
   // Create an argument list.
   const std::vector<string> args =
       FfmpegVideoCommandLine(filename, output_filename);
-
   // Execute ffmpeg and report errors.
   pid_t child_pid = ::fork();
   if (child_pid < 0) {
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
index c5ea1432bf..bc1733ead8 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
@@ -42,7 +42,7 @@ Status WriteFile(const string& filename, tensorflow::StringPiece contents);
 // contain a separate sample for each channel. Frames are ordered by time.
 Status ReadAudioFile(const string& filename, const string& audio_format_id,
                      int32 samples_per_second, int32 channel_count,
-                     std::vector<float>* output_samples);
+                     const string& stream, std::vector<float>* output_samples);
 
 // Creates an audio file using ffmpeg in a specific format. The samples are in
 // [-1.0, 1.0]. If there are multiple channels in the audio then each frame will
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
index 08b5a6ea48..020b5c99c6 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
@@ -31,7 +31,7 @@ _ffmpeg_so = loader.load_op_library(
 
 
 def decode_audio(contents, file_format=None, samples_per_second=None,
-                 channel_count=None):
+                 channel_count=None, stream=None):
   """Create an op that decodes the contents of an audio file.
 
   Note that ffmpeg is free to select the "best" audio track from an mp4.
@@ -51,6 +51,9 @@ def decode_audio(contents, file_format=None, samples_per_second=None,
         `contents` have more than this number, then some channels will
         be merged or dropped. If `contents` has fewer than this, then
         additional channels will be created from the existing ones.
+    stream: A string specifying which stream from the content file
+        should be decoded, e.g., '0' means the 0-th stream.
+        The default value is '' which leaves the decision to ffmpeg.
 
   Returns:
     A rank-2 tensor that has time along dimension 0 and channels along
@@ -61,7 +64,7 @@ def decode_audio(contents, file_format=None, samples_per_second=None,
   """
   return gen_decode_audio_op_py.decode_audio_v2(
       contents, file_format=file_format, samples_per_second=samples_per_second,
-      channel_count=channel_count)
+      channel_count=channel_count, stream=stream)
 
 
 ops.NotDifferentiable('DecodeAudio')
diff --git a/tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3_32khz_aac.mp4 b/tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3_32khz_aac.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..2485da86d60837800fbb0b390c440e674de25993
GIT binary patch
literal 69357
zcmZQzV30{GsVvAW&d+6FU}6B#nZ@}=xdkSM3=9k$X+^2242*0Ob5jya?lCd=B$g$c
zn(3Jt=ouOqFfg82EzG8%o3y;+^sC+1?#1o=yua(z<9YYap9xtr-AnF%PWg{aAGV*z
zR3vh_>~E~w<t$x!(C7ZcB1VbR9>4ovepg+*|3{~&;wK9RW?R=Cn8yED6Mrb<LdVg!
z8hR^ct#sYw`}zUnE=J}BO?(c|Hgo1rN|(QB$G1gSpeUoy&*`7^x0wb@g#1<2QiL5(
z&UkW)r*yKIlh2<@i5WaTiE4&-pIoguxxcbri+`JE!<;D&stPSvS|8g@d3q@&(QLK+
z<lvXQ+ONNc`REGxJw6}*Vl!h`&iTcE_kZ~lS7;M0xoG;1ZEX5`Ox17v-@fYJTtS||
zrQ3E(-jw-1_u%h+HVT3XjE^s+>1$UV+v4qi>`tb{zLjmA4>x*cd%R2D`DQCO^DYxs
z_2-^nqrTcK|7x?}|I+<A%X8MuuwoGVq4OcG_UDg2=i<7e3682C>ymGLc-v_ZJj;mt
zQ<e1RX?*9KpZ_TrSf((s?ZMZxMLy0?^WW7!oKv~Z;=}&nzk8ouC{z1?SN6fnOVzSh
zbF8GqWCU!Ei!c8y)ArRbn3tQKMUZ_({qrwB8W%7aN%#H;IDgG?{~g8&%<AC+EZYt+
zKm9B|;{n6J`7<OAJlj!W^lwtzzez3(41fP8Fx36uz+hkC!Nf0dF#154+~<AUi`V(N
znKbbAKX|~<z-H|nl4+FP`$_Z=hub>;o=+>nd!ITAs%*-i<v-=;?M<D1iZ+ZJR40F!
zwxf;xRL%X@M+^!+?+KDqd$c66Y!Rp33iZhcC$zgvJ0z$#kt>L2N0LPK4xLYB6@e1f
zTx-6H{I)sL+@#((Y1RJcuMN&GQe`|Op)jFMe)5&8DyMoZFR+}{SmIujzr}M#U1`vq
zxlI%191=2U$<N`O(62O&XXD3{9TIOdi$0Yv`0V$`>R|qyM>9Y2%0KyQ&&+5vUw)F=
zl=eJh=7Wz<Uf%FGhgWi8ap3{BS(o>nkm2C$;_W+Y{#a&`Z{Nd&3ARR(XP4?WFkd(M
zFoXTkpX2g8AI?N~@8xx0@>EIx`K-;8f_T&x3HHB8s#@pZzVO?m82A1O&*j5Bb4*ff
z^MZ_b+|;m(QZ2rgQuTbw)A_QKAKqw+(%H!s!Z}amz17hkmnkR1`eU<xJWJ?a7bV2`
z%+q+8*4CIqi?r`PQMKA=eqCAO{=MmUGlCZ+3G(?}JzRJ;()M3OYsCEc%MJXQ|36hX
zF#Mal>1PqUhxJR24dyo%gkRd=z#fpXwub+Ps{5<D1M)jKb%U-5DX=cN)v?7_DBCxk
zDP7O5z5jTl(;KaGjEqK0RBpa0UstTga3tN~@unA7WlisXj%Meb9iC^!kpERu?9-%v
zm3sye4==MkzqRhL^`W+;61Ipx$G({DnaL`f^IfO=+WT|-Z~o{U7cSxd`srBrotA?v
zlaiv+H!i-Fv{EI&dBua+0|#z+tlD*6N1nma;==A(Z%o84tDKQ{xh8*L>iKC)ovt`d
z6~Ae_T8rzxi0Jk}<vI4{oIScsY4Wbq8JC?{vA*B`-?^2Gx;;E29yNv>HJZ1Db&>ta
z=m(tw(zg6_&%{I|f2e-)o{2r`&t0<*J0uR<&N{utwfDbf_L9AoPyAY*vZfXm7Wnze
zOqF<ID48Y9<gY$w&BiA>4^%xREIj8%NbP$$nLXs2rNOpMkqmqxs`i>qZK^Y7Khk3m
zZb>puk+rFMu%b<W!-SXfycBmlcd5~zrF^o@`+(%6>bGkX=PE_ADC}rt@DA&oHqSTw
zRhEpTPOjghtszb+N@9Kfp#n2bIxX$3OguNI^w`TI6Iu%x*?z8n<k-b<LVy7T{yX?H
z$4px<<<`NWp|W+>r#Pl3DLan$pZrtt;UWLg`<E8UOa58sej(7~ZS_yR)YBgF5#DqD
z<f{Kj_|fJ3cKM@>_J{A^J>*~fk4N#{x)lE@FF1Zxw@&)`_}66hXYscy?N69Ll4w7*
zUr6xh=SN01NjLS3A1cMG|M}tmeEq`{t?MVI)NnsPRLA<$!2jevYxP(13PN9xv#EY!
zo;~poL#?vvSMEs@el|XGvKP0Ssh?O`Gk@cbllrswF)4pl{#51tGXKGv{*!qs-cQ2$
zRX?mgvBXL&rJeiH5}TDLI9)#Ol+ENQ(A(wooJsxRxdr}~%ao6$Pk1W%t6RXGGiIvw
zlNBkQFSq@$oxRPHXPz^gARl|lG_#W$HxF6NT=FK-^ZGfVRg+9M9#PIenOx}dVIoKD
zBEM#iquZ08b9~q*<h`KCDZs>~wC2P#W(N1RPeziJDi!v!sx#lW{A!=h`NpQoTX>ps
z_nRZz*Qg)b(6ZrAMsMs!o@XL$Zl0FK9N~@kQXGsbmkC&$=xAZ>ReICN&UDF8Yt8N>
zK2zSUTEeK)cWNuYxLTM>$%N%h0g{$Kd)~Jyo=FqPQ4%d!vP0#QnT9}ELVMbkiE<|M
zC-2C#xi)7$lP>qRjS1<MN8Y?VnP6!srK{T78~9RnM%h~?qlW@ko=Jr&?HY50CyFrX
z&i|7nP_AMzuVsGl%a$EtYkFfW`4p0*R;XN^sH*a0`<IRiX%D~YJssB`ONbk&?mR!K
zET>PP#k6{LoXyNeRjrkptJQX>9AxQe^Z4N5D6*Bi_pAN;rA1FJMlt_c+vmHspZCsZ
z1)KfdpBFyASW~xswso*diT%<OQAQHAjte6>+K+3_GFdFgytA%%$DOzR9E=l75BINH
z(kAA+|4?U?N!k6A2X(E|{bclYtyq@32G5;zxk%afWUopS>y7tZ7heAU6vHU<b^Y^z
zj(`LP1_lQE_+CbZX`Ss+&9B#1b|o#ZO}Cm?>nNtv+2EF&WD%n&v&^E+Xr-S2QO;XW
zmKfS<^s!G9ej{^d-ghm(n%ci}89(i8nGokSk7HW1fVbzVlSTKKCsbz2&7Y=v%<`t?
zo{DzP8^TZa<g^_>cqIK}P_6qMp<8aMd1n_cQQg_$u6u~%yFdHHH*;d9yj$SC_hXaQ
zyjjj&X`DrGcgpFW@=UNiX(=<K_1z)Ac0JXjRy#Rv-mG6N^rYY<bEDubKjqzrCN<C9
zydofX#mfl?l1^r&Ogxxy(&6yLsWTLQr93ENvI*93?<x)wONf<nsycn7!$|t{5(_?m
zh81o)lPp%KZ2HAiCg-(ZP4#@`MZNN4o(0@bQbUyHSjrZ0CeO5sP;Xiqq|(PAA=7EL
zR4|{VHDg_>X)5#Eot9-qJ3g}pif!6xx}-AG=DQxx#H*9m3Z*E%`gi4ZbxGy1!;_oK
zMYDcB<F?=n+Z!0Oq|5o#uJ_x`;=iZQpShKJ%9p-G=i;*Mp;6wy?{yZuKU?}bZtuO=
z)O*oE(>za?z2s<|qqO&aUEb<*FF6IJ<))pLpL5UFz)RrY7Y~z8r#7!<kA6mbiz$z;
z1-y0>?l`4X&!uxT?SPu-1dS4|E0q!dCtWCgpY_@M>z#kK#eLfvjAcSORGuD7kXW$d
z*qo3)R#CB8;n%i!*Ew_@x)HS0>(JYts>P|#<a7@u${f9Qck0G1B}bO~-M0L`J?C<I
zQiAIi7dM_$su4fF?%=Q~3E23l#yGXiJm+-YBF={C4?CUgBOMp3zDhb*W|P-FA$(%$
zQt_N*(b>rxW!P4_3F!adlzC#MS;p#wNlu+kp=r8Ty+7%vJf6y7BDuQI^WfwlHyb5`
z6#~wj?u}9HueUXvR4|qF*kEZHw6i#Q-aXqzyvlxG-WTZV7tT9ZJm3Gve>toF{@)uI
zI1KCB3&dH|PIqse5F9yUb@IOCNR_VRg}b9oEep@?&99vKYK!|N=Z4$I((igQePHCX
zTpvAwfq_Z?{`~mfdaE7NgPs+t?7pBHHIwV9YMEOor-d-bHAWddo_4lA|L#ELEB~fX
zcy8CDcv)%k!8}iM89zILzV|Wyt}V*P_jI^F5I??1KE#em_^0C+hWUr$r6#=;=ll@U
zDEIh<lRbl-(ZrvcUo!eH?C*A2{nu9gqxiFr{IWIqito-#Pk3P)(x%oZ7k!4~+r9<v
zCuVJm&A+Jfmvf)u%NR#{$*K$Ur<S(t5muOPbX4fMtk3jjt4JGnGv!leE<5|vmCFwW
z+c~r=+?jAImE&}v%)JSH_q7~<7)3BojJ<83lE0+Wh$Zcqa@zv8%3al8j>IbTD4bub
z%#ac&(-AwDHIsSs=an5N`X_j~nK&gFGO~0oDfiS=-SuGN;ah7L33jTqa2zSnS1+n7
zJ9a&a$x?=sPi0YY;S-~XNikxajG3KRR`>+GiM$^C!m!p)%j3ujF`tI5LN1<Lcu(KE
z9VmBQuasR(Fv^1AU*!6S2`>Vyl=!xQ3YPyK9uxJlzHmIQ;qg*wvbAS(Z-`Mp`o(4Q
zMvs>X6Bl0N{I&b_PW`;kO!j;FANe?cbpBA$@A~-2rnx2a!Y2t|`7o(?jX-&h;<~RF
z^_SXfc-Cpyd#CJItj|4J!`S8=xK8<4gXh$f4@y`{3{M`qD){W~lm3}Churs6{pmla
zKj}!#_T!$PPO)s*Bh$|KN+oB;y!PWNztfdePt^Fhzv#cqWUrq*>2v+0&I;!(lO9Q}
zUQxo+nNn#k<sQ0UdAgyFU&^d6Dy7?14oxa3a2Cq?P;j?Tt;nlgS)MoR?3(8GXZ_iK
zJ~H&3S95G|OHHy|Q^&~muKb(f$BYGGt2g_3_Nq!eX<VYZWP=;etTs)JwCxL`uX4^R
znd9@)W6oVqxs7*DeA<<=?xAPY437t@Y?6ioJ*7g<Dvu_$Bn29%tlBcgBYdX9a!>DP
z2b=%5_H}zmTFT#73i`=)HG0Ooi~qS@Zl*+wb?Ph<GF(*ZH|>P3(vOtf1dgk7+@)tL
zR~%XWxMN<VjjWJV`a}_@tmS0_Q@m3YE#_3ExOCn!6z{sfF(^QdF+5bQOlYY_=aVZF
zJAI~j*)X}V_x@*?BB9ci6u2zp{MT8<JN9|YGbj4)Is8NXZHaYEio+uVUZb9g5}xe`
znoq`B_d71j%lPf`fB8rAXT`ajeQq1`h=yFSh}<~i#<ZK-6Xtg<`Vp`ybTZqLN$!Sc
z4=@UuuD{}~+{$(klr9+mM@|UmQ45=#v+Az=gn!@aIx5t20#tVUxI9yOv29_e_Z+`9
zPL_9X%5F=UranRFxh=zVm*-1%`b(%DFlAXao3Y7B(kSsy*p)(#4IUl~9KU#|sd1cZ
zQqpKiy|3JT@Z>C~n91{m9!CZlo1~ORHkm32s_0Z1Ifz~PyDaxt-}Enw{Cd24-qiB%
zUe3bh6lTP-$%y0Fsfib!dpzBEq%geWjLIM1X|Czh<TK=#baZah@HCUua8zbfovqWv
z;<=qOewwcc$N57O3qAD=)z7G`Oqs~_j{WA-4W}KxzP_~7_m)>1tGirj>(@!{fyxY>
z0&d2s6DKt+3i9snED>qE;MlU6ecs8AC6{XI>a>sAD$P|hp4`eaCt}I1`N}7l6@tT<
z-cL%<G;I2IvQZ&fK*P$o^3T2I*OC|H|J5)4JdH10(TJ^EeW73DC!bD(c+3A0waR7L
zH}?l$N`E+0>BC*Qz-Vr5cEbz-$#({4CT`pK_pOIi!+q9!l`-KFGp`+1Q4uogR9Mo%
z<RpG?QSI)vQEnSc=C54o9ucxoi0{SEf0uvVe|z`(g9U1P7cg&hnW0<Sv*YdOj?nH`
ze{9d$&Tb3MD%augV|6;}r*)XcIxVE>&U@|YDVszUE-5$sHGLjs@nb_&(Ix(a1@9kP
zuLz82J>Kw+V}9YYy<1~9^M;&_x+xU#ZvKqRI#$-sS~3m(6PQ;U&2#1pXP11u@BE^p
z+TE*me*QXxfrG_`D~MgTgF`?6``qoUb(t6ccYd|C?@Y`3l>F$=@jG?fJJ(zm(6|}f
zmeFawHEo&cmgUipYrjps=D;BJ>r-u2XUmeGH`7YmC9b4@+2mB`Rez%-@lU4I+FI`L
z$ed?dhbA}uv~^%n6hC|K+x7!3tA#3A1lH_f;%jkMYvJ0I@POgp<V}y&%i|)MS$J+c
zq%*|v-+#WY;%D7g{+rj2Md;0WDaRM&uPJ>gE9uHlBcsP&4Zr$7dmZ?fe^F_(5YrC_
zA*TP%CLaF!4zdTm_V%v6b^d8_trgRU7r!o+o-ZyxzdQNKo172P-~A+d7BS9Hi)xv#
zGBf4M^i@t*wr|{V{sK>$QS~E5oA~F;7u<Ls?fHh;|MJ?^`O{B4eN&ZoclWlv^^fH4
ze`ub+@Z8JdFU2Lrb^Z1ZAAUB!t&rn4a2EPC_3yvmH*4)r)Y_krH~G{2;rLt@Q`SG#
zKWYVj^q;<W{^oqMKMh~(-D>5}Sup&YcI#JZt<}D{@qP{swFQ40;vfEIn7{fP<FA8<
zl23R{@cq2w$s_$a{+1=J=fs@*U;KD`D2TJnLv8E7&3)^?-cfq@^{MN}8!4}x6(nj@
z?8IhrWvMMKJjzgc-f-&{;o~RN`;ULv&T;4QpCU&gfq82m{=cMOXIuYo|DXRiCC@*U
z@1H++xxn!|ipBYdJk*ut{GK!kakxF?5PB#pdrqK5Pq=Tg)|!@1`OUxX-{8Mium1V~
z^Vk3D|Msj@{%Ifir|~y~{0SCQ%OZ=cmS&y?CqD_##Dx<%9A+d=YLgR_^;TitdcfYq
zI{M=Qd-<)u<3Bdo9Qa)OefGK9=l9zV<lNqzop(TZr$SGkoyR1O$?k_9|L~R66Se5Q
z^H3+}O{K-u>(A4ld*0E#CszJ%<Li%q-U&U_l|7`Q$SKYt<R~Q2?mqF*<CAwTPEtLj
za(u$XqBj#fi;Q-?J348?Chho7sijZ7j)*Jwt$P>GwR@Xe*u>-6siprmy^R!o@M+aa
z-O4BjAr8;OhXvYsI-C!+x2*75;CYx|W`&c4r*oUv5+kPAeR=|MUzl4Xsy{F&sD2CI
z!@xel=3l<`IR@U{%mth=JG`Er|CfC8$b4(7a|%vE6a1`|`#g552*&gYx!%~l%h7cG
z*ZFH7Iv+Z$ATV(yf6KAL8+XdgIVG6S@iB^+u75sZBdG9YsDE-odxOwqjUH!V?H3Fa
z1u`alxqeZ2;zadL51;(X@8JCLeyPBP(nU7fa{i?jZ9yx|PO3DemkamHOc9^3d!gdY
z#+WCZ&$b*{Dbtp_BxX{WaCB-%)Hb&dp8Lc6V|S{r+~KbqF|~%1D|BbjVxC!ifeHc>
zG}avxaNOK!x>$SCc1b3t4Uvp2jZ0!zEjr=(RHe91Ty(c<+V)+Q{-VbZU5fMUiduNE
zQAhbv#i?U5;+Cfms|jRvPrmrg_q_CXAMNjVvQOGi_$en4)?jzlk$>IebFQb2f9$od
zto2$Xd;Xh|BBx+qr=0_zQ)9Ep#4eA;?KjpbIh~ABn9w@6`$l}w%No{{M3bJ5B>~5d
zUS*l^@b2mPY%aQrq7gitQI3j76v70mIouVQPA!<^!u(J)gZcGe-U-b~hc~nMzTYT2
z`E5W&U%>9bA31Ms8$5aFs`etzyiDGztGjdZ1jnW|2g{^?xuo!=HyW?Iz{t+i#5&<%
zWZPzczsU)?!IkP$J)eaB%r-D{F+6bRbKmTwmg}>tk0i%Dc^q6}bMpMGlF;)H_X=K=
z=BzlpC@it)*{<mG@BI!1T-?rf>io{h8@7K>EV%n(w!2$zqifTP%EzC@YkX}l6pEfX
z_gKW>p6&fzhku1T*r=?sv6^;8SYvI$6Q-(lpMIRb`ewDB@Y!uEvJ0a;=Q2Ol|5+-T
zbjf_fGN*<QN2iz+x15Z!o>yP}AnfikyC*76fsQ8+syJlXZ+mmT&SS%}<WFsUGOIdY
ztNvcU<YoTyD5j|=mRxnaxnL#ZG*6b9j}7bDwR$GB>ds2B&fLD{$(Cn4hKo94CV$J1
z{KY=;$_xeu<_WtZ6PPy7^4;*5(PGy2?iM$F)>BSO$pzPgnpDqwP0)D}babUc&$*{U
zjHh^3^*a0Wba^TZ$}DDSE7eeVFOb|})Wqf;8Dhck@8|l54GUV`z<pKjwXt^(a9ntw
zqFrocG{L{M@Kde#fs{Y5f2gScX8rSF!nwcZ%8%mZgrDWBZa6sM!QPVgjeqXAxBl>$
z-~K1W;h(wU{ZC7Nws&@ZmDia3d49{u?0IevPp)EQ{;){e*OHfS#ysKH$<HjBxc52r
zG0b`O<Wotf#|5D|oBXD#z3xn09lKW1$!2MTqR7p=M-0r8qh<vfA5=*fm^?|beSuq`
zhvQtW#9SUl8Sax8LepZT7IOG0dOK@HsJ3-Z=6&;I!z9j!+UYeDZI*1Z{&i%R%qQ9V
zb0$8DaI%r_7rgrNMEa+cig_>kE=ReG9y?jq$K!ndPEOzQ7UAoqm#U86a!}m#dWltD
z%5>I4>2)UV-p%SuDs7HVs+4&slUFe>$<T79RKJM7FZb6FrW4_ZCT%yEbE@)B&z66y
z<&Q5|df|k!FH7RdhZ-r1m@18=6`T$<PQ5%~(Szj+;?=z#A1V#;Ty|=Wy5l$N;tif`
z2bBb7opA`^Fyj=waPIrbqmw+uPuy4<oZ=C^J8wmAp}^(?MR_svicWHG+{9PW{NT?;
z%Ro=R=BbZ_*0O6c%$(pX=ri>Vr{%;Br*8YSlZF3OytNzEr7m(9sa}|LIjJMG#Zyb!
znZs$%q_A^d_hP2yJNU^e2`E3&tdNPcF#fnfWzvjC%3`)UFIlDp2-+wKE|}!h$x>mU
z^Mgh4@QK2=k0wnNmRq)@-(}YpR=?GIO)VD0ISSm7R@SpNG7-Fz{(RfqPkf67URhqZ
z=n4-!J731(`PNS_7JJT)*SYoR!lqtLwFfIE_)O_lT(IIn=B4PCsFH(+!Ne`s<;&UH
zw;a3t?L$VjPp*AF>lVZH%UQP=&MMFqdRoEo&tm<fK!(-^1_lPEcfa0uCUAN_W~u6z
zR$scy*?5}byZRL=^~L*~RDbAyZ<H%P<@w=_r=H=Zj&JJ^U(`R#{^ZE??N%S{vZtv(
z?pK@qEAXeM`X90X982t#t%~B-eG}BzEmjh-bhP}(pgQmBq!0QT>SuS%w9|Z0RQLSb
zM?TKSOeW?mB?c$E7f*iiD@fADFN9OQU{U3oQ15=F8!G(klp7CB7KvUgS;8`<F>34U
zl`Koos4H_uCUYd6^Hy$DopoH`=t0HSQv$1)0^8i}7222<YEC_A#K^<t;eTYp;)N@A
zYRaev32Z#VJXP_)rYDwsHht0_PQN><etGh;iz`0*=iyv<nnQTbgoigT$;q6V@Zpb=
z>~$Sy(~!0cDrNy5r9Ce-ZY+|y@8!%tN%_v#CpO&^7gc=ZOI>|(&Vtr|{-vn~GAlVM
zk_^)(vd9TrIGG$WQV!Vib^py}Z-pOiiRqkIqZK$){q)o&7dCgCo*sGTbm^JXS$?W>
z<6?RPPA||rH|67>mA^#J2d&tiDQI#@IBnwGT`LSNWb0qt3N!rsT5h6@npD^x4^36s
zu!5&Pvb_zbBtoYrUHQW)$k5?9SMAnT>x`CL+e=<rZ8>eOcuTS9dC?a;hvj!IOjo*<
z`w8uhXtmnBb??k|mG!b*_t;M=FEZXbRdieW)e9ZV7y1`{R!z&EFL>3U@7~F)9H-~_
z^>SvFtT(Q_9<umQPyCJB&OiV3K3~kj=qeJ_z|_F-OwNvhF@ULo&G6Yjo;U`H-I4_o
z8)hb^Pd?Na)25>qw>_ydLxh3x0Hc8E`sW4Ey7(&7_C%$FDyxo6vyI%o|Ied4VxBhV
zI@%A^-qZ<OFz49B&HRdfsxnKwURotuTQt=KHwt|0&sDu)!{xpDvr$ZW<D|2XJl$sb
zY~gT}wEE*Rf68AW$G8>hM=cM^Oa0{NZ`{Qx!z<I4WHC#nK*CA<+Y@W?TD5)sB`GIU
zvLC;cTjQyqvhB(V6{{r%7K_TZ9#MJRnUNFyiR11Hx8xI3;^%DUa{O_~f@M;-%8Z1G
z4R=^;UZyh{>~`~*=n-18pZTxeQmL?sKXrIQqMx#!U-)GDt&o*5n|#Y8XI#HOzwY*j
zh)5SF8-bt|Cy)9F%TDxEUbO7*3=iLPeQGXu*%uz1%I>JH;xs2#b%L<gx@ERAoIIL0
z<aGCceR8t#W{Zx=oXDF-oEk04u1A($P-Xd4-Eq3Z&o?QGLExO(u@i#R8&vr=8YHZ{
z96j}fx|nmZ)8!VgBS}Udo){Lr`x7aC<J1Jz7iKL%;_oKR*i`UHu;QfiE?@rl^?m1>
z)!VKJp9oUO@e~VE`PuU9nMX=vfQPcJSxXV`8W*u05l6(FvK=J_IxXhqZ`5Da5p>q`
z*{pNR6{e(e?mDEVB<%V2XvHhZ4-!IC*>xBgn5^F2d9bD8Qp06euS;*27s|AGYicYr
za1>>I+Sj_U`Q07gUrWE(EUYRy$k0En@cixU=he~jrB4?zIEt`$?RvCHZp-tmbj5~Q
z4=1<zH0W=-aro7~Igb~9^ZfC3LD7ykPgBae!V5M$aG0^0HEFte#O@xE1q=pB2N;f<
ze7Uu~ar)Z_DH}}VoX?b3b54oRY<MxR^o@#B<?$8I3%-eYYIX~4KiI^+Yc-3-$vHb1
zr*Ey{UU>iZo}5n+3=#~7f9(JDFF(42fr01U+T!YaKi#6ecC1}BCm^p(XYQ$-m6K-p
zNP1g`H5gQ}c`Uc_P<PpuwD3x7#;QdEhs~q!-72<=mc79}S;ws`x}w!I-J#R<(V?}a
zJO%Ct82*{Be>ky|At?aV(yV)bgVCU%Gm6h?g2y}y#Z&u(j{IHw>QhYKMe7+(;c63K
z|J$MZi}CA0dx;_?>$4Lj<NVb>^ZziM^hfg_@0<LtBPqp`Wacjy{P2Ad=Rc=w-rI6K
zM2@_F!ucsaLEzKz4$r(TDVZ-$ywaQOccZ_jn8oDEiEvgA8R=fZ_Y$8ZCWeW3+}Y+V
zyj6M4jfo4RuUEfXu+8^E$3B%anszm(+td$yRLOXsG23pMKX1S4Nu!(hwHu!9TjsBM
z{A7x;*gu~>do6cvp7|Q5C!{xd7TnWbc5=U(|KWc;s)zq+2tRzK{P4WM#LEvof4E<2
z*&Ju8dZfm}x$%RO_30%)+}RW_{F&k1^C>7*(DJU#{7FA5`D&j|inI9mkx%%@i3d9v
z?Q&0hl%8AUZM<Oarpv1{*1TlQ;7t2GS<ob2HEAYqrKi_IfqoAEdE5QYT;5gGxcKs)
zvi|ixE>Es*U3sfi_DRMWb#qVW<di>Q=E=f`k~|YP8qU4luH0~3_)x-$q)*0nDt#75
za*|JelzzfF>HMa37Be<pw$o{|C<+zsP;NZ(ljUUJaewiRoJBi4AALS@)XHu0)37f_
ziJWeWocy%ecQ0E}T_L*nfBD0D^;MkVM*||P_X`$Y3+s$L@Nr(g=&?8IOoAE{WHzuX
zS{Q0%<h%P$dthTd`S2UhU6W!ao=r&)d)0L))YV)zYLk)olMnL`{QbW6?%()l77Yxn
z4$q4n7#J8681y0-1Q}Qzc02&pst+f297sxlOrtRsyb-wm`sA}PFY`GZMgF%n?K!_B
zMcm1>ihL9`ufX{s{}#@=$Q^q2PmUD)+GG=dDy8mZJ;$MV=ZZ=jw=;dkb<PZ*<O+Uw
zc)!@+>se5<QOif&KhKZ-7?UkS<-KR!=bou=`EXM%t}|eXh>>2Uo>-6#pCWUFszb{%
zjcLpwt|<zR&t1Jcx2(J3dE@gI%czrDGCfnhzD~#}5ZvV?J?qE@r|Io$PfQF|dot<p
zo7!paYDzZ2OC~OFIJxHJrw*qb?#Jsi6_5G2yUtQx{glNl)Z<V3n<pQ`Pii@G2YZz8
z>A7FD)tKWlDMYE-Mw0iqq4INmU4hN3KO|MYHRRhkS^3qvRbC7yR)m^aGqkwdH!WZL
zAvRSpXri3t;nz}f56*2{$^TBO%x%}QNz>Kh`=={h>J<un_W4TUgy(+WI$ft{9)ExG
zTJG-j4V(-yNA0euONY2S${*bLw5j|<%JUD4Y}gvPlI|A&^1O3MaHmb1Km<e0zu=<*
zruO|)l74Ho_U_0!zN;&I?YpRVA@60T{CVp?{r@p$<tJKC)`p&#DqC4}kvohtWc{D3
zYwIJnuf13naxd_r_QT-Pxp`OP0vDc-p803dr3H`VraV6xRH<2{`{G?uQR=NpmHYIT
z+MOv`P-bnv`_E(Ni|T$i8v>Y`=bq(e5LROFVG!|Q5Mf{tWoP6tUH{wwQXS5J!n5j=
z;A7#n_M88&@i=(0VeSPdQKQ<^H(c!IR!FqZ|JBInyg;!*KJ%W-qN_?ei5fN){VjH>
z0-x-~6(99YaBh9Uk^cFJ!SxQ`ZpXlBMJd1HCwTl_^@G8GVbzNH&etbQpQG5c#IsRJ
zJ>|ra$G)m9@1E?Ldgf%>4>qT%!ADdKH0*X6%=B~O{S<S)M`&@Ux>=yrrh;F)SL(6~
zMxAs!8e*R9D4;XH`_d1;`!DrR2sI^HNFDK3+L9}<{J8mN*=sy&lob2ZJolv~`5h?p
z*lkzcTFrlbiTK>*XCx1Lo<96;CCds|m4x5>rbk_!<(~fhkc+QK$Zz#oF*#X#n##nS
z6(us2g4|}h|K9)fto3)Rj*iy9^R6l}O$^cs+%ECk|CLMPjGjH5(#9)Vt_3_`Nj#>k
zlqh;iVQ!+H{`N&1jCFqcq#P?s6<s<{UD?JdcTv$9cky}3jpt%J6|`K`j!p~kZkFi1
zm@q5J;B!iPq_Sg5c#qP_=}{*gj|+w-894d56|`B~cxSIx?TK^ZWb)ZH-&*{FdZ^%{
zO~Gon4l1phUXyj^zx_=UkIDs&0WP9%Bg>PDICkuHoP1b8<FL>9>e{D`DuPN2S04A9
zQ|!O<!@b01kp|YL&&pMAo#+3${?FddKXaCOEM@k5@rvb^zWW*t^LGn;o+>@NyCUI7
z*PV%hS9e!*^e^4CQ9U?h!4!Vi*(wK3_J!o$@ysywH@W3<pSvW#DMjtOZpYob8SHQO
zn*aFO+-q>x{8-=h$qo#TthUDl7anAHTKsLg*`vRkx|)L$H}143-LN#ZaQ#1PRmam8
z{$5OG+s)dcxjEwe!iG7LVGH?ve7$v81Q@u_|Eg<X(Dv&xk66lDx@xIy(UuRc#+TAl
z-&>#8ah)E1=jvT<w#cu&**R0aQ_S)ocNg=VSIV67Y(s&!)5@lT<fYei7V%klmY%ru
zE@kJexjHLesyz~|WB6wY>arev0ICI<Vqbk>Vm~E#_vwkmf+ZXNMXSEGzuG9DQDP+e
zZHeY7#R(5xcE5lAWB!fx_804_74lNt3tuwH%k7U-`PWb+RM&h=@J`XB82uwEdp#cp
z$sD^VeeHzv2G?1SWG!8Os@to^mF_9ZzA@pNkNU=2Lf)?{R32PVKD1Tk!}M&YNR_h_
zl-Hk{+$i;;xu17Jn_F*^S0US?BmSQSZH?G88GAZ9yrkdF5ZzMewV~6QFI3l+{lk$a
zqoC|lmoymt-Y984X>U+p<jHzYJ?Nyi<);YUhk6H3b!JvX+NG;Z4}Q_Iv7F;!W~`;o
z>iMb{KDF$Y`6MYP;O#tFu{BUneU|#Uk1a;Ogv7qY&r#Lq@DB`V)7ElVn`AJ_K~QzY
z6U`k{EK1kREC`8ED~vXGSdts5$>t}Ydh+Si_9*T8vr7`xUNr`@b%($1ShIvj=2f;$
z<!gnmqk%$Q=W0ESG^eJS^hECDZOJ~U<o?uCcJ<U#?o6A^CuZGNv~4-fGNV80=ypyW
z@#dFSS5yS}|7a)+d04I12>N*?Y{$+&jV&I}=QrQ@URYVt$l=lZvCe*`=id58g%$Tq
z4qxs#+V+LxO|ztZ!j7}5r@lvf9F}uv?7x}b_E!A%NzS4Kd;5PL<&lX-wtLqEi)wf`
zE?((eIO)ydq}}Du5C5)RxO&2*T~?C$rb5pI1gEjQluS6ym3UN8=>-dCj1NN$>yeZ3
zT9c1lui0?l#<#*~`+4F24^G{;@hzWR^Wu6Xa|6RaOZ|rdosCKgpkyPz!|RC3ZkBYL
zmVW`>=j>%p)^XU03jDIY)lqoz$m+tRn8k;l>@MHIS#$e?Aiv><BB5<hc9+lPteNwR
z&HLH(9*>{dcK-emlc)G@>bT=6V-mf~Qsarb;Sp_xsW}I&l2y2F@@$)7x6kv^nh!;d
zQkOZVtqGXIc7bbZluORq2iZ-`?)P;+Tr1f1rhzkqpE>-=GJTyX&c<EG{Abk{rJ4HG
zi!EpkXzI50JH2vSLEfX(LP4Hrmlm#5mk!$K1QisR-t;i{@aFGRUAx|8PhWVPUyM=A
zwud@K9tlM*!3$OOH6t99n-_Up?np6JQA&L+B)Do(=ECg0wC>0WEg5}WS4=HCS%2!x
zKA0{1Svs~;;Bn<+X8!Q!8a40izg|c<k*sF;|9{}5SpKP=N8j74Zx3EnW1L^u>lN>+
z%qY37!%|>!pOZS*LW%ba_*@@t_0(uP-h9*j%>mD*b1f$yA1?MU3rIV4@S~sX&*{e(
zB|j2W;EWVqvg;MU>UFk1>Utjg_kXsuklDJ@M%wa+j}zyKm>(fi_N_Z@6TNP8=p3C@
zw=PaLu5I75Vqz@IqQguP+v@k<eerN!-C}{li)=UcSk||luaOdP3J45o6>@U%Ji(&k
z*23_B;h*XH7fGO4W%zqwm%`+|b5$x>p2@TrPws1CRkT>3+;dm(a)FVgowRFdk=?Ez
z9?tx06c6oClramKkSuiYhH8@UdY3+P?$T_};&YR{=Y=Vssp{li<}t6|&qcT++el^e
zpMZJQJnr*<sPuQ*$haSj6IH&tBT?q+B2GpP!vdp}8z~vb{~yjgrSg9oi;@Qa&#LAB
zZ(Y0+U|Z8E=UU)#GE=DU=*0gI6(uGep4|8%KI^^BAs1DpiH|RqtUdfg$*WSWt5p1D
z?c^r~!4n03w42wb+x^p8#J}{)g01Y=3YYwnEZMPN{hnB7*7;d?r)mC3-{|t><Bi3~
zO2xHyoqoT_f$f|7nYnX+tTmrm_tx_C-_2|5TZ=ayxc2Ym)wn}FXYIB6t=czpipqq3
zw&2(&R?jHVJ(=(JVx`7}i=LtDBfXSakL2FoqBvQ`$>@mN(v_ht2|l|MpC@=_DJwNf
zyj74`vT;go!joGpv*)YMR-Gl~bgxpoa<x_NvK3!?qFu5zx-Q!{JF2A!t~Y5+5Sr^3
zXDIQZgLTQ%6Uq%!(_9uUnk3>K+Tt%T@pORFE1NlsiX8v(c0T-Ztn0uzzI$blcD1@B
zHft-r7oV^%qtW}>G|xj5&wM%fefEweZd@m9I(!y9>8>!Fu>8U#flnWr6+^9pi!Pap
zJW^#j_HgHlO<t{yT1SGG|Gl~@{&VfDoUWHvo(ngNH*fUy+o-sHZ`HppVYxdOvt@){
zS?hH1uC_Z`Dpc=s`1Ss`Sq+_MW9;Q6w}(%QXfk}RGm9%>g0FkYZm*k%*Z;g@^7Ypy
z_MR=SQnu}rb}bJyHWbnJohY$7NU*^E@4w~Yd9t%l#iyLj-)f@6#4&l^jxQTbu2sc9
zFrB`Bn+s!7-lhZKM99Ekaqf%j7fo67y&Pv7LL)D*t2R5WeJrZ4&Qq3l{qC&5ev7GG
zFIBfZ;c{CtRix{R^ruySiv`|#J>yUi-OKv!P(^n8)Jc!hEp_v|?k`Y3%9K3Q0^IX6
z2aS($Kf1MnkvFYmTD_3ZDrG)N8J|lwlUpW!7eClky-&nVuA@(##Z6A3uX&c>i<lej
zUY7+A`zj=cS2S5xr^NkF+3nrWIhE0hpUKidf1=)|1eV|1jUC@j$d0X_RC_B*onaBH
z)Q{!&@6Pz}PFFp+y;sZO?t-2jyPiJRSb8$#?T&Pt=P$Rdc)!o=l%F2w<(S`RS^|`m
z#2g$oa(+ch2CQhT;mz}X>Exv9*2dkUn6N{H)9G!F&k84#lj?K2y_@z-DqvZcu&LPd
zoT24}4vo)mtOfr~@tCl`)vo`+i8U-P-rVv=f%4xyR@{u-cfM><sja~MxEnuH{}=z=
z{qB0<mZL028f-2<wBM<I-F$An^_M%n?Zq?2rgzl#zbZQLv^e_M^Q^VU)Yq0g*EKG!
z=-nrz!uMxlo%)+QUeA;SAGjCa6Y5g7+1>jj?%%7mnw=@@*)Hxqdi~eIZJ!>7Iow^m
zQCH^b{P~4%te2JLr8Ot{eVe?0*3qRK7&sXi4ZMu~ggpF&yo_5OYMeqLR2G{&1Yt!3
z1{sEb#_PAJaZF)hU|?Xp9&}*ZsfRo|-lcz{gXSwntM+g6cAT{P#Enxcl(|Y)i%ecS
zdC?~2C9caSJN6!_Tl^)+?f;`vhCkdKDvL{$S3R}4ab>Qdd{AZAo(tEHs$Av$$?GZh
zG4ALW37gPnS$FNmprl&keUBwqe>03cf8ovMwK05dTO*!cn*HYTxwz$vg)~I>SXcbC
z-2JIre^Gj3Nqup}8)qi-xwlrl<O|t-u6%c7(YAg2bC#`geJr)NMBL!=cI}@*qLZeC
zTjhlvIZ`D0`sS)nc1w;*E-MvfS*#bEIrFFd)M7DRL#vdmJjLr@CbQgSzYz6tT9In*
zg==psIO;rXSIydA5OhiGuH&Xl9?J`~?`&H8uuOQxmG=rw3<?YZn;sWv>1fF?{4-r=
zlI<$R^bl57t1CPCPgj^G@_A2_uO>&*TppL?6HaQKDh{)~3VGVL^qH&qw|r@eVP<fi
zP;^?j%-{2%=WByG#VpRQQxzK}ts)my>{z0da^lgm$;Y!f+%F3}<Wb-8=*gBDOwxa6
zN|>B{@?lBGCQ;?n?P^AMAGQi;cG$RF_X=xq;;9UExvm!F$m(=G<-w|5q5?(<ipJA7
zEpwf0GV7#^#LtvyqX#RQKm5Oc`tHrQ^Dau1s3!37-zwa_TGy?rPkzGNpO*Z0LJI#|
z*1q``H_z|tw>AGYk7p{U#^xmTr=CcPNVC}cf9jve^XL1g_7)2~=5JZSd~5E7i(5o8
zBkuC`FMnBkJm;OcOwFxjOGGu<#rrgFn9W?Z_}u2wm75<w>&@M7uxCf<=41Qs9#z&)
z+UU4{u^sQP-2J!gSDq9(zR|;E?Jw11k^3_J6N9e{T)Gsz`N!?<f_HgAXa1Kqd<<Lr
zF4s-DyKCom@zDFL_GFnI=en^kHM^;Ht-Dr;!E5EmlIzySuVA*|J!{j#_{Gif@Wp*P
z52xRE+qdDYjZ4ae$vb^*7wt`*xBbR?1(lUOEk2tjXZdNLPq<d3{%&$g<h;Xn=@+)A
ztBKdWs;OT0JmhEC>H?X`Iw2F=+PTeAM7EZ4h(BN{TDMs1S%y-rd2LQ=PL05flO`%w
z+b*8G@M=fYzwR4}Kklu{WMG)b@IKbqjK%EAtf$9<y>rh#pO89XQh?Rd;3ltff79?2
zeuBDhKKvEY7EbBA-Lh^mduCG9+*Lx-Pj*dSc28(`>&z(Y$9-YVCR5K%jgPu(=~@uO
z@XuK9OEeE>haw}W0Iqu=c(!m+WzR$@HI=qWR_cL*npO=H8GIafMYwAI?YeQKOLK<(
zw5`pXOW#iLNDO;D*Z9Te!))5OVq!hN_c-pKo*A7q@lv>5>x_cm+DT3&eO3?Gp7{7@
z$?GMtK_<K_bALUa_i0LZUs=e(@C}DbzZ$1z#9ezSt!`Y_D`|3MOTM_N>g3P4@6YF(
zT;8j^^@EuIMbApV#1EHuZ?F4tcXt2HqF${J({3)j6S7x(%b}2UCl@z7zA6zB{(Y+C
zw(>cN_r=4eT(8M{AEuG1XWOIsvj1k?Q|TM}$8UL`O?zE+<K2S`w{!mVf0UW&b?^PX
z%L#XVZ${^p9L^S9wB~2&(R+nIIC|NlKFnLN)#1VgM#-EbtKZDocCp~3NMQDC9}&Bo
zj}}y{KF?qQ8X4M><|V?wz`%L@00V;}bGz`vJwZ`71n1fC{p3AwqAYx#<zGeZKlLBh
zzuCMW#3;!ne5#bGpLp^Q&yQ#R&*ZN()mQ)LQTfj~Ki%_B?MF}E@G8aIJ9bJwS8RH&
zV&nMKNzV0si^Z8k6}g>u(#KAITxa6n+|jP~PR06tg5r_qcg3E0KY4UWC3r&HC6U#s
zRW6U6X5PIgx4}hOomJ(5vb<8G-E5_RNm{R5Vj14YnSbC`3*!qFHw~XTd0(pIxf6Gv
z?`z+_%lmFkP@Ve!`G)2<AAgydGoM}Z`?Coz<PL|OD%`bf*Cp+cWi`(vrQLtlWnSaI
zloV*}rgn95<>#H8J@(iC%)k26&f%@^>dz(LUvUW7UwCn5`P{<~{(f*^T;t&5uD0aD
z<Kp7?OaJX{+<9i+C)Ym}%PP+9`owxf&;5zA>xY_YCjL+UKmT1jUF4K-&i%i2>sPLM
zWOV#ozWTgG<@ILlr%icO@A(&R-2c#fz3JlV)rOMCWQ*?bZn`dI^<Ijz=Wz1Q!@q3L
z{FrCf;hi3_My6Np+OB`c_FiOrzmRD{4}*)-oXh*qo_muj=pJ2~Qe!4^^>taum-oja
zY+k>5GpX=YLEE`oHLkp$_W3Ol%6U86wDQ|Yt3y0Wcg^y;3{z$*Ow!%#^&(j;)UwBE
z>Oad>J&_?2Q&_KPy103kPb*+Odm(S0vdV4^2AgANt}tx!e|~#&!9T}t|381rBrb7n
z`OmkTha+_P@wGPumgiL6ZR9F{z{q8~{z+p*h>im&XW9IFknkgkbB60iPp#Y~f96?-
z+-IJ^<=UvIbx~`Q=Yt=G_JLI<?I+~Jgg)!nsPBF3Qr&O!&>&^A)T8L0MvKypdlv*x
zKRxMiN9gH}Lxwf4{O{GrcGd=b?v&U6EzsV)i&NkJ`y|VCUz{hqOg2#2E}^7%`0*)~
zW4|sX@=PyKeBx)~f2=M*vF@jyU`$W;lE=rC8>e|avzS}qQvCI#wQib0gLmeKPyAmz
z`D;o()g`}ov9UgX^0ViY!bg7sjpP+{W;{yy6(UoAa`NPb>X&S6{14rk&?ekE;ZBCf
zSxF0tLlt`%Ef4<*<j7Mv9Gi3Uaz~csOun}d7RkQeAn>AQNjvKefm_)gnZ+tbRY8(}
z6nXS`+PF4ye0Xz0<*LNoJs<6Ok8^(7=jGq@kVBryS+QMFV=9N|ik*hFr|%YDG!0YF
zn`xP@kR+VKbz%K)hsvX|K~KK@G-wf+B(UhrirT28CwuSC);zZP+_SdcJckJ$4mx2r
zLFI>)GuB9MdVlJfT4_Q`M}}8o)tN6(TivS|roVBXCEtE^60h65`ZbF_GBIbe?`h`#
z*tO=y^fe2^)~{w1)>hpT&D)i5BS!IhN36rbFrN==Ubx6r3p_cZrBmzt^v>lIVpmt*
zIh*);?ou{|6E`@`MW!=Yy_tBo|Bbu-^TNr7&kDXg7js)vpfUGlWw6Eh{@-WQic%N&
zE&AbDCsmnpR!g_`gINm`zb1RoX7!@~cXbZD;43fq5G>4N>Bhh$;+55Pu<1X0)erH~
zb#5V#GR1>a<Fiy|-@56R%6Z8{LsPta+4_~Lele_Pvx*KWd^r7k=~14?j|7w_^lG$f
zwP*(HR(i5$BEPJ{gA>7z|IRh$Ny=bgP?ydSlj_Ke65D*TywZD?j%lQuj_t=Q+bnPN
zbcVEX`f7Qx7wd7_ESugbv@614>Gd3srxOE}F0dJ=$*vCFy3%J|YRZ;F?>F%|pU7r3
zl4AgkAtiB5SO7}a%uy?^HnU$7crqg>;7iMjG|%_hlb=cl*&X|}!TZP3Kboo^W?xMC
zthdr6#fEW<@{K=I{`cxLPt*(h`%nBjf2Bvl@0T{ut(DJOFXT&q<Rq_Ruia9!^n*wL
zsu~$DpG$G9A8zWszTx@l-#Pb1HRsiTeg5)N{=o6Wj|6`f^0ter^GVv-KYmoV<7boq
z^YD3|-}Q9-d$Lcy{iyP0$GquXli&U=nIH3(<(J>3H*4HFSL|#tj&W$uc*|lm`9$*F
zhFC?0nW0ObCY*d*_C;h5U%tq#q>oJk*#ak1HD)|Xw_d{0S?=g5)F|-4)3>{S{^ccc
zHuH+oxB2VLoOV#1ThU^IbeUL(BA3AHKrhz6wGQ^3|8AP>nR)v7oxtCZIGda*J>2{p
z1Vx<GI)fTK;x0~jT=SrBxrB#B7Q>Si<+~itbDWY+-aL5AAa&*Tk~eFD3Kj`3ou)c*
zn#+~nALbN#e=92WS;^71-*D;)2aU<g(;WoLINH=xgPwdyiOP6ve=6nRse}{hYKpNB
z{?=!I+Vg||T4>0(KVSQnyy-iAcb~Ue%wFwxn>Cb0=6&DQA+zboob?)$QoUBHvzR-3
zxUF+~D{W%?bIEPn=(wM{d)XBZ{xgsHUo%@d;)CXy#s1%80>29GoD|F|WqqN$*RO_s
z>J_=7rs7SDcD7ydUg6YYT2wlB@2RW_OIo}P1sMvuRxR){<ezK#?B;cs6=(h43OzdO
zndddpdeNhX|MF`sYyNGr3(+)wy7l)y|Kne8wc35{|8c9;?m-RTYM!Y5nLi}HN%Zbl
z`NLDq<92_`-=2tMoww`fg@v5_0ZwEM8!ouCYJid%gT>w~My{GGy`H9GE84Ps9Hw)G
z_c(u8`{~F04__av>|K4r^H!U8Y(l%ivB{6>j;R0W|5wCPZ!Ir)qT;rR^MN@MZJ!04
z$~5MyKUUfIrEcYknuRY~tnKc!ZO=D$Q(W`lNzt7t?Wg}AmelW=SF9|&Tu`GR#?n@X
zNvv#1-R3Ht`E7qh+;<fT*)kt+sfaV^?F(_5ZlHYHUfRV@sA|dd<MX?M>UVgQ+RIFS
zTxDc^_n`jjI8()+9KY+_<{vDo>v;aeAZn6iT#ozX!Y0|p4>I|kesr{(T|9GQqo0$S
zNz1$<=j|CCud*i}xUD|3QP1;wit!(r`8=0}Pq@rqGtt<n#-}Rs{v6@w6`v~91Oz`s
zJb9vaah1RBMCGZG921LHc*w9gTg+o@vE)}$(pWO9sKT;n>La5(p~u#nY?EhA-rh3v
zz2JeQj?hW&?v3isM*c13OeMw^pE9|pdpPX#3$vKlZ{^*&C@9Wvo|B1@W0#v#=e;K9
z_9Dq?otBy>j83SX-7V0jJbB`)xh8#VJZlX)*9$ll)fCByA7=S3XQ+OD`;#v@Dlb)v
zmZ?fS(dd}xt>VvX(kS4t@bQ{mCw#AT9_zMZtK*%?7VYzErcok?jv|w#Q1`*t36qTq
zIHy>hlo9Kxv|QpfiLuW|@=)5sD6U}Jl?***0XvoIc;8Q5E9BWf;n@U>ci--=-1zR(
zwhuFwaaKfLo5>Rv(U3fmse|*j^TwTfS{7%m56k&?|MmOzr~j^VKL2i)?Uog*_7t_Q
z=qTNoA#iirt97Ols$#NTt~J*uG7Bv=liA_G;&*mi0Jp08Q>FtB1$t>G-X9XPxol$2
za^7vdkKS><YipvG?YiEx-g$Z+r{RewlRc8Bu}o%I>3Z!JPYKiUDxabQlhPJER(rC;
z^uS}aC(qg4xe6y26*mX^8Rdl+WN~fs-c`UT5W4<JLPw64m=vg6%TRy)*N?yd7!(vH
zXzs84#(PlU_L4(Ng)b(zM$CQFXnUMRebbX8KijV}$;rN1Vw3sANY3O{OWK2?Z9Cc?
zOZgqsX)Ie}(V|qHr{N@Tu6&-!e|4ou`{{U>NgwP5)$M0aekrXay3ysP$eiC%Yg*hU
zGyl7&?xN20Qe*Cl=~HTqBj22S>|QRF-abKSk&oi$*12MSN}NohodR=x56Ue)=W%9s
z<lzN#o1C7V-Mg;zt#sEFMJ0}wq6F2WI}5GFIW|q?Oj2bFRGZ{$DA?ws^1>-tVUli9
z&=a961ws`OOVrw}M5bsh*|3!}w$DprlSIe7NU2R82j#d8dS0+37AM(py-eX;se0sR
zqMqW^DFrSik(Wdwyt(slc63QH813lrJ15b0Im>$kN5A<cp);u*y*r~?cD!9Y^}6%o
z!&-OV3Ar2+-pDR@b;=<RgQXA3<>Ht0nQq*%JV7{;;}R<y*MUjaS5JDqF5U7z_THa`
zQC=-MO>=MZG&vd{e{H8+dHm?flwSVNZ>s(s+QgRpZinHW@28%$%z3AzC$_YJBS6f?
zfBvR5O1e#MGONurHCAnQ)NP(EBJwGzewXLR6HWF?-p8hEPCDnH++fOU)1oA-rL}&+
zulLiv8g?K2cK(ZZk;e&{{Q}KyKFW7mEp>Pu|JrI=H*fJcXQ@$Gv;Lz{g>|=&gL3@;
zSB%x`-fw2|`X@1=!Am3N)$_AVM-p0}r-r;xHfM1c6q)oeFmtQRRLf6CAd}4uY)AJ+
zGbkS2bhPi#IaUXefAcRaWD!`vs4gacy;Ec-&y$?UMal_*#trSq4V-%8((Id*4y`)H
zz@p8uxpG3z+-Dm;2Tp37BfET0jM9gMb>W@Mqo;a4ow!;~X{*02)6q(UbPZLlC#P1i
zd_QD!J*cQZ`&iD>_`4aOOC^5KO8MaaH+k;^aN(Oc=?bKV@6-=tGW)i~Xj;dIjRLhw
zVN%{__grbe^8c}m{|}}gG4rplU&*r1urNrT)yB{N*!?Ka8ucGc?mt!keetXbf3)OZ
z^6zW!Km6pYJ#wOyW%2`Cmi|NeGg|8MeiTfo=+xX<6!oUlF0w&!@|Qw;`NBiC8&o#k
zahUol#pzMd<4LE>h2k0P7=)ih9*P#6yxL;kq(H78pDfpfc}qUrZGCvjM@yxu7=C_b
zZVz|%vmSrge#*FC{kbWI-7)Zkl5KpJyWwWl9j{etW`7gtzp7s?@UYr?ou;gn=gW;M
zJMBHy*=J2S{gg8`F#pIFg~)5K_@cbmXLR&snx0g$IO1uP+TtE~xKqB!TOo+s=G{IE
zCb4_I(U+_wpDa%~A)(T5(z0qvNKk~YDsRtjHrEIy<0TUhUsTo?aatBLUFeG7G@(K%
zulbzH>C*%s%;QyjF{e`Bq*Te~`NY&(L52b*or9A%Pf@;P@ljV?Tq(xU*iA7_Q|QSF
zy@|5*amts!ELoQ3nWQOn`o)RG65fobINxqPVi?w$u%US6YWGPsN)fW2dg2E=b3Py0
zSk@Ub$1CPhNXp#@|D4=rc<uS*C+FYd&YrCJnB&=sNed(991tqySufMPaB_;1$;o4;
zg1_#q`R)|3K*&kMG_v80)o-l>oBK0VCa<{Qc_T80B{5RD@5ICd8k3YCe-rHbukme@
zN|)OjPA5Zil^zx$@1mkk_e_Zyvy?t+UHZJn>f(_(Law=KNgBLYWZ#xD&-~5YSXcJw
zn)%|rVSB1w@`Cr27)2IYg!DYi*r32CD9`F1;n*si(H|c9?$b`iuTS_6Z4G|9Vp5sJ
z2fc`sik}4{C5{`Yw0U@17+&F!k$4qyL1wLxXNg2s@Ee~E3B?Jk``lIrAGo{DQfjF`
zbJ>n3SyOFzZ{6t)n&7)uC1^tA0Z_8NAf_pC3EX1fzP2^NKk_iAw1;=gOU_Svecr1s
zDjuro@Lv7rwd&v3`=$$h6Q9`mbM}@=M}B7XFL*HdYxxDE`dRTuf9gNJ{efkA*Y8Na
z_D#xXUJ73+bh5lM;rKn(qvk0UePYKByj@Z>*?3=B+&#g~Pjc8BLZ*8z+-Aciwvb0=
zl0>x1h8;z^-E+@oI4e3m<lcVb$SI-L71K8ySuj!4#J+X*^@wj~zGg;2CV#oMnf3)m
z@^<RZn=&=?$t<NbnPvGWC-55Ovf5isI-c93q^x^cp|@j(q=&!~ezq&J_l%qkC9gk=
zwAJ!)Q`5A+yj!w=c1hc%?y1TXeB=++%3hdV5q#3nU)}GdWk3OgVAoyaU!OeE>^eL&
z-~3)Grm*zG+k2mWhc{P+t6o{K^PjxQ!DP8<e?r<n>^%6aU{X%`<;|00#jhQ9c~$No
z9j(qVq5b0Dqv>2)Pgm`&Ub9@}h?H^S*O^kYdX(;|s{S~8DM}}H>LqFWdpmde&c0@U
zeDSZ)UGHX}oR#3$Z)x;ya-LS1;d#|ES+%K3x2EuJ*`;wM$C;_;38N3^{IVmCY}GzJ
zN1WPQi$hmV6X@EP&=XM^ue;^#udAthYW^6cl!b{$^%S{=U)p^&b*s_W4PRppPFS<h
zVr}Gd(XKm7yPj)KSU=k!#qj6P<}(t1{w$xNJtswm;otmCf2OGP{#FV7^Xfl~1LO7i
zCm8&%e`8>O|C@pNJ4?zd&X4>D8FkhQ***MpQ2UQfQ{5}!=EIy7+^19iyq%_eYM<!j
zoAIho*0;M9Ih_za`5_`D;i9UX`?M&FCIK0znl;taPMtiE>bK#$c>e!;uhctkPgt!a
zRPb}Pobijq`@AbvK74w1_wj-^`scr%YrbcF+o*2VA~lbfGA+I036(r^bPv5y5_mGp
zJjJz8=jS@JGFx#O_Io-HP91nScV}=%Bd4ufadBOn_}%KDRbJCmzTKC*EHC>j?~z-3
z|CbKNV;?vh-^qnB-q{!)>hWuDF~`nT4)$())7HPZt9QjsMW}A;bCm)G<rnhV=WiGV
zzY_SgZb|=<{h<@CmOhGm`S8h(Z5GWQ?mkjeuGdJf6PKCfR?N}0-*iI9)!4l^4EmB@
z@bC9lRGFN7@sX_RB9Dd&h26{bHIyFwWjOQn>wfFm8$;NiWxl;Rqr}y6r}e%vbKNsl
zhp+A4RdLdfySZn-T;j<P1NqgHo#qwxO*(S*%bmtG5tH8XT(q@bq!3uBQ|O`OG4b>1
zt~STLPuP3j@o4kN*lYcrKl^$?@!_2h1K4AkFTOY*+^m%orQmktkwi_Sw#muopO<ti
z%#paZaQ{Omrdmd|=3l}u?C-U*IofQAnK6qaCVs*cM@KdD6xIi;)g^wYIH@=4dTdnu
zd)1bK_n=*CxW}mn*Ee{x@A>mtrT*~e>H@AdgA=M3KVH37(_(q#d-NxXT`p;d)Y&98
z&nSzjZCBYDyhn4Iqp8g_wuLhH0;cJ=3Op)1dAd2`it6kJw>Fs>Q@rg3diEUToRL{=
z<nG$1^7g0sQ;*EDnXyk8^v|q~PWPCcb?WJ~g%dSSUTvK#IpJwQngVMeO9lf21F!jx
z&K;m_r&rXtIugJolIq>p4vLqHj_BliWbY8Pvh|pDf6c^C_s=B7i|!9r*<TyqyeY2v
zpuqXhkE~v{Z2I?K`Bna*i}FXdg-pN9-^h~Yqn^y=mEshYXX2jOq;5V@I9}yMWRoTD
zLmOeEy-FNUkL>uDFs;NyAjfmsrdJ-8tVco(S{6=L*_n5cqrk&oQ|CxfXhDkh8MUNp
zLk4s0!)h9HZwF18;jYa*Nn?wUbS{U=G2SqTTqP#&4<3q79O~+Si!n5%@+>M+=QKE|
zRvPAZqb5}|;N+BSi<J8ejRM>ACR;HyElw}3@6D3``K3^aX(5mNcd-w_zgM!zOyx>h
zeR{#n35UN--&KEOYn=A_#huI*Y4R$Ueom^Dy`v^AyW3j3`|>Km=mRSq|7cDSzW%{F
zzBtcz&ZEa}Z#tE}ORuu2llJovlHQr-^nJ>8{?fdi&wIAL*Ktf^+)?kG8#kx-=VI|o
z`!D?J&cFBcz1-<r4%cTCcKga%>s;D6CHu+M*#=23w@!ESsS{!?ip)H1kn0{+JX>**
z$GHbiv3^-9lh)dYyS?-3a(T|P>FlkUT&g@=T4kP^zFsLBB3*|TIW;gaY)FuB<CgGx
za4K=t4x@87veq%SBnVnC{F|2dgGK9CC}@Kk1B3kTZw!C$Z(x7Nl9GF}@SmMOKhGq=
zpnb3P%IY~jC2}&f|Gy{t|6h9f#{(Z4A37gmEl_`;B%o$@?(ifxnO^rR+jm4S`8?;r
zRIe7rR~~n-PN;h^Z@q={lw{8@ycc=>mh>1*YUg`%@a`x3<d#T7uFZms+?yC|YM3W*
z=KiwO6F+m}OOfEte&s_a+#EU6H>EsZbk1GI&B=8}pQZZ8{|1fmKc@%(|9w3E|G%=`
z{m%t<eX?a<P<Y|P&Z3AML!G^T7BV%TB9A}5z4Pvq`_`>>wsQ~J%z6I2dda;1|2<M4
zPJCEdXC-9taMH?M4~?EIY?)%!^65vCsKF5?A;&g9m!9=XZ`ZiF=2=I7v+$9B-MMb-
zZRMZ!(_1<`oYqW#|Mx}V!RXW5#oyb$C}>hp6pZn&eeZnW<p(SO%O5QN@YUrN6q&y>
z|MXL41EZ3Mf+x#5fldyuha54ABFg{%&;9fLcm39D|18x%>$Tqv@)Y|hd0w%1m+R+_
z*m{}z|DIRcJPb~->$#Y69B%VaJpNwg=aJ<$v+vK1b9?=H@&CX7zyCi|_5bgOsT|IS
z3pSi|=9gW0C`3W3W#Qq6i65p~MAQ`6Z*tql|I@<tY3ii45k0LRy8i_kiI=T!iQT7l
zNHs~%LV?LCe$PYKi0*rgtm`($rs<tzc=r6+a~pZ?8H^h@@Gg*H6!;2TGx2{7%fCZL
z)7-85l^Yki#WES6b3LHsl5$m0;*AGOqPY6?MNb7%eg@rH^jj|UN)cOYfYj#vN6QK(
zXK*MfvM-pM>9TA~!!oZ9wN_yV56RCu*{(DGxUg{_^OI!h@0)&NSJG8hF>8y`=Sp!C
z=ReVPxU2Y7DMc;Z%^}SunElq5GzNc>`L2#a9p>{>4yTq2x4gO(vXP%bk)>aOrTMbU
z2~`0W1A&uAr%n}c%-x<OB+St#sOCP!<>kl9+8r935<FL2TcV^Qvc$4dPNey|<G!Hj
zb8p*ef8H6;a$w%Y*HtrO_@ZMOWqLj)#_la~I<-No*o$M0i=QHg;P*(5lMNoLR8oWj
zJzM27rUkh1YAOEE^w=jUrqV2|ZGCG)@$GXjeUDZud|a3J{$6hP7u7|tx9%=G$FsrO
z#BAY)fXM=Ohco0h8!9?XYtC%{8oi%^ZC11H!RqNxvV&@;%~sizwomu_+$A0@Yg$k2
z;3yDJ4H2}kRDE>lXv*~j%@=;EugWG^F1HrD%cv}WP-CO&RR7YZO*TJxEWgWqTyOS!
zX&<Lgx!T!B_fEc<r@N&1TA8G>aO0B;UmX>7rf|3{a#m$M(#|r)%<H{U*+k|2UnK-N
zPRu#tA~Q>}<coo}K!jvO%cM)*A*sd<J<I_rn>%05D-E8~(H+q?t^ROA#Pr1}p9C4!
zTzTNMCuheo!+EiLzvnUS6#aVF+Fa+l`=Xg=t_4|c@2zg%T9%!dW%(m!me!H_;+fuo
z>>D<1IH@RX;nAStCDW4R?*9G%>4kR65$7d2=C)4yb-Og>p^(HrtEp$>6T52s0^gr|
z=6HYem8pFB-=;k;;W_lSCt=aK>aSfZU&V#|ExNY(*@@SiHY~n-c>l6%BFlY6f*L^e
z3)AH5i$#w(mq~c)SWSFk-ScYVxtsjj;_0iV9^Gm)RU<yn^x0(58{CPCyD$123p!zx
z*~yS)EADF^dCp*u!Q`?UwaJ37Qe0H3^7eA|tbae3?O0BpmIcGVsoQ=j*J|#Y(O|om
zq3-_%_IDZ^gc&#<F}R9Yc!_mrDGCXBx{AkqSaia3mzU4>b+e{g=FRcv+_Rm<Eu-^n
zhikT?P^RV%rs6QaoJ*xgBfI-t)`Yt*n4l8Kpsd8m;;`s~hk(TsE>)2T&ZgEyeL`}q
z8dD|eT3^V2`ESY=|EYeq`m`<WlO+!s|Jl0JQ*{291Kj^44fa3$A7B3PhK0S`5d(JK
zUsJvw(SP_=@^yP51N)@836(!Lb+C%`iLx;8i8UHHtZ?!0e}8gLu)^*SorN|!^Q;e^
zjN9Y0Rd1!r^|<?@=?xFnKHgmNK{(rV@~lnSoq0#p8)DPVIwn5M<q(@;5faF7<BJcM
zUfZ!P83GQ*a(;TgddXj8<^<`aX2cwfHlFb0S#FD0{D=Q$$zS+C=k9wwqwIdk!SFwq
zubKT3{=sn|{>A+RhTDHWZ)d3a(az5Jp-?0&shd%Uh0lW9@Nen$6gefMLY0(hulKmD
zS2j~kHgAbf5fFN9DAch@ThM}Gi(|%xhYG?DHtO6AJq>9*b7FXBr`>*Gx-diO|8#Yu
zX8m0cQct#i(cLhC!Q;Tg?+<v=KmTj}!OW$5Y?aoc1)$YkGnf5hQsI5xu-=O)Mf4Nv
zgbD1|Pw>|MPq6s)x1sLA;|UdS<b?QTn-4kiiSE&MVBqkz5T4zlR6No0M~tGOev7#5
z?1-~##BaKUo85`sp=YtXCbXl|V2flW8zcXPTK);Y*DIch>^DeQ@OT+d&gZXuvcD|O
zq<lU6Z|QL#;p0~-dcNA67d|dLujKW)+ZPygyHnRWR<bSc&RqC}lcn)^!h~j)76}_=
zPtygiYgLLT&nil{VBf#p^}R<&+!y8+z1uG=TbK=V*T=ryTOa$@HvPli^66*4?@K=Y
zj6piONp|(gBQ|qybrkci<T<)Y*#D4>?fZS-=d>HPCDh7am_JeOL5qAli_QHH52mX>
z`BQ!{#pd8v<@_0fpX4W;_@Uf6@u5t`oXe9Q-{V&`=REFuv+U822Ya9Id9&(7blS~r
ztQk=p``&N+*Ybh!tNe=d#>dR6_qTjLtiE7d{{e=?@Be>1?A^k^Q1-aZB1!)I&;R!B
zpq*2X8Z{!E#1ueTk^k43jaQB&T~RnVX|~CPgA+VlnRc^U>nkL9_&iLp-oJvQMrn`t
zvageBR$IAyp7MBQpt7YhD5B%)9`2x*Jo{MMZtoQM%)im&C;M@iy^~LJe9YGsxbd>H
zluyd%h4Mi|77IIt=E|Eq`#lsN%x4t%e10m&hkdX7FBNzE%;xbvbf=<U;9-i^SykO5
z3%l-4a%Puk<L;jDB92M<$_AF-`+XD-RtVb3I9c5e50l!|<+iklzr*XA-kAx`fsM*}
zmUe+_6#WByW;;phWa@f)JlB{szj^9|i6?^<?d3dfCt2`3w1`oeI?u=LIGgIyM=TZk
zimKC}mF|~Q;FECu@yPxBgezwiZCgIg(U@l6An^P%=cj!JPUW1_3zvMJ=`*uSv9Y`P
z@RXI2(>Y)6u(V?rXxrXA;pCBXpE)0?)he`PKT%8cNc!kvu+w_ePV34;6|a}vwVC~(
z_BESs$9^{jPZbphMWIHHvuZ0#XTG>`@%Mt{z75Bl6{ZR#d${;6k~&kIQ~ga(b)MbC
z28pIwj%O5(+)CNrT4wnt)bgtnvz?73yO$zESYqc+voAcmt1e3%y<(u{dW&bbeCcBO
zRW-gHMJ{YD=j<KRXEYzRlb3V6bL3&jt{DzpY7ugVpM@rrtaUg!C50otf9?53?ho@*
zCbN3R)K0Hnk<j}yE_>SK-)HZ>tu=0HnJxKy{kat836Z)+lLLzv#Bc_6C3oF7`4hcS
z<=DJu3M)cSrKqNx+!d3K_{sbBo8*t6MYA2bRCYQoof>-RZgOgQ*oBY~Ne@Sl<tkUz
z`W`F}4t7_|wzr>u{^eVSWZD1mAKuE|{V<vT@z0pF<7Xf8DV$;8Y4H1z@b29^1qOy6
z)(_P`d{6vmv7td%q4JhTQ2e%{sE|{-f@0lGH@DpBTNS`lI(PYouSUzYANVZ@V7lOu
zFWI4SV4{|#=F<q7qmj}Foz)Ib+mY>6eL$Oi@7hJvi}tvIHpBeXzn~_i+tLI|+6*x}
zJ~MbL?owUyMrHlBk_Rr*kCSBkKi0VaV5l)r{=EBUO3nY*PIh%47Rsl-JyFL}Z_HBP
zJ^$5*_8FCe`ljWJVy?S+hrbPy5BhKRq`u+GrudcerzG_|KYC_tTVnKgMZ4awY0jTw
ze=bt}>b+$0)xxAo^GS=(ca*$Wn(*WHmvgm0dfcD<Kg?uj@q(p7x+9Cp`C<8CCVS(3
zJ10GvU(H#Y@U2t6OP-&zc%PJWTXjzRZFz6u-=&{Z+9iJII6v60t-kD?%I>t1_9Oap
zTx<)UPm+s$%3`(cL*KlOcBOYZ-*0I7y5mfhkkK`N_g(5o%a5?vH65JjrnTGCrk`1T
z|BXqeH^1$aSh2mqZ=210hI^Cz6nZUY8eFzXs+@b!j(z(s6O{yynF9W*BJ-}-_;-C^
z68#}4BiGWrX7c4r>c(BdS4s{=6;60Q>EuIY!LltqqQa+BIG=w=QL~wS%qN(08KaHv
zrAnDVj*IW6`%mfcI~+3k<h4^md@QXyRMa1KByVFe-r)Jb{K!TVj>IFL+n*m0xG2+S
z)N^*Cf~Cr&zFbAlnFb}c)(ho$k4|g0ot2jJCv)?^BTVu<H$-FYy=sG}hH>ynoqEH_
zye+lKrC61tzVp;EwX2VhPH1o3`lg)ySN-B&Ig8i4^cCy;udHbCFGX9-c*Q%9)cvzt
zMVg8?6qr@5m$&!mESK7q>YL{J<o=rTl5cXVf1cGjJ0WOU!(kJn#VwrXtClZHYF$$l
zz@m48UsC&EW5BU#F+Oa41v-Wmciq**_bTU?g{_HIUEJ1JrpNP9v|O-pvHs(-sF`tJ
zR`0zTwRzLi$&aTkK9=mbh4I`>zu@T^E$7QGu1G$a+`LfTB2Gt5W}1bSKs@)+c0JGI
zz5A6PtIKWQll$sXF=&1I`YS5JEz?>-DU?xh_1Y|krZdi+BJ&QnOgxmtQ5LT%^rQN>
zhT<QeeHwyq>=r3L$WIXdyu5>>+JBAEpD*=VM)G^~Z&b=J-gis&RlbAZ=k3Rh;vD~+
z75?-2>!CWkFFW}KKNMB93EViXdjGGO|Gj^|l|Q{c!(wZG=;Wt2ksKd8i-IRUmfg{3
zzhlDjRL}ddvs+a=AGLgn+0oy*%ac!D=zed*YtMqSCuQ3gbo}f;q_V2?QOs^TclpJt
zjW3S;{O_s!;=YyO=j|;XpWfYZpOwP-_MH2P`$8SRZ(n7x{O_uIxFS%Gu}j&$Y2wHB
zD&@l)zD^TnSmHIs$YLFr`>gcN*eyPvJ&q}AP3m~?{Yk~Lr05QRiI|4yU4gRZ8j6?J
zPdI;CO?*O|)!CM!*-X|lhFq68Ce7qzn0ClrciHZunYSuUlQZ0eZHvrGEkqAR%(~hm
z6d|FK_0l8!u+W_cp2@Ov7o9TaNL(`M^v9+cZVRt}DpRa}-eHlmYU*>E=cTITXxde6
zrl{yRz4z%&%lN~0jW+GnzyJ2%;^%d7SMGJ*pZ~hI?w~<p+|oNQs;l(FJ$`)`{`$Iq
z%F~CHG0YP^7O(^yQk0MSGEG{C-QvrBjk1_ex0_b(HeCOGb3&KoiTJyF^{%8P&70~I
zCvov>1&7L;C-={?vuP;It5=-A<-?1tLd|ziCr)acB5846`PPbaAG+q`NuCei82UU}
z;+M#hwGA%<yq@e+;hM{1Fd?+c^{ZWI+{9XUtv#2rzsa<6nf~UBKXPwASHJkDC*NOo
zo(WzXG4WDmLR4DeAHT`p4Rnny-dg$Va3`Fz_M9`fU~{h2POG=!j6A06pA>Ecuj%<e
zL8RrpLhFkdws7$;`%7naT8TSNej%GM{p@=;!5_WPF4=rbl(^Qxrm0iZnPy?AD^}5x
zvry+>$h_?{gnzQXy%gVazwb%y?2jJp3h!Kg>|GSgy!k@(q!{g4lh2$ud1udjt`|qP
z)LmEnF#C$w`&3ow)sr3ta!BiWM|CFdo)_f2W6m2xh1VUW>$JSrrgZo$@J?-2m21@c
z=qYk!(r=L`y*hnL7p*IN7+rXhLO!~zR=WM|{oC5zTh}T~lz#K-iPDm(Z#1<wWhBKO
z{4@P>C&O1GPPU0WB8N)uIc)IsG*k^ywRLGo3d)YS)X~!$Y}3RRsVp!xCs0R?W17B*
z*9MM5iyMy4m+^Xg>Rh?yGZ{&i<<Cy0Y*@1Hv1i&xmo?$4exB;gr#R-^?LH=7{)IQk
z<Ksz>=~l|lu9cr+b$fYwzI<4>Q9;<M!`YWN=*T>;z>brS8i6M+>-Gk9cKC@nv2E~M
zvq>a%a^IY_-rBReG#QU6^D$a4Q8YU!_~2yt0=Jt#qP`at?b!I!^GQXb?LLMUA5EzX
zoR=~=luz|6aPbuUy<`rj*;gis)0KC>J9RYirZ6!{Dok#5)Z{t$P))Ms#W{snF5x<p
z6%w@68YZYHCCqauJb6S>AwaR<>K!gVMRu;&<=0Q_RS8Sd@i}!VNzr&kLsIf77pFbS
z3zj^cm|P(G<VFOC)tlMZv#s8QWljiEOP6@4w|b-L9xjb<8BxJ&cN`CXRqc_&lbKR;
ze@6ULuE3m6_Wctiq`AA@j$OO$cHTyqi&G|O@oS?q3lCm4p5<A(tj@n>v&geHEeTKo
z#-lht{Nc;%f&cmc+Z#JDFc@5UWg%0~@6Tud@BjYY&hb7BnNv@c^l)Cuw&d9sA9+V*
zb<#=i7olvY7OdMj@!g7yRg(j@o={A;NLrWuCfx4tm4kCUSye8pt};(~BjO-BnYHJ1
z&ua&TIg(Oqj@B&ZPWxz?&e9XNcm`AI629dd+gZ+o%hLx7Hk@Pw_bVB?uS+qNR({lN
zJ{fpX_}v|!`7EC&+Mkt+>Hm=YGsOL|Zc^mD9{(dXHS_1xt1kK9JpWNj|Ht0HX#&4y
zz7mS(wwdOBF0NPc`|WQo?r+*}Z;G4nQBj{gt>dGehjVOBd$*nI#2?BZP5Nj5uw+j^
z@$9AG*)N;wlv<qqcc|oO%GBw!?sj<=qJCy$O2sjso*5}AA}v=0R@=;%e!Zl~Nww&w
zX3pBj6Q=G~Hmj5hdD`x^VE0rVe`T$mOo=BGKYWkfWWoJ)n_P6tPn&7_IzBTNS>$Bu
z#tW(SCi47SF<tYP;PHYa>(woLgb#Imx;M!?{J!Uhz69sSSDqK1wEX(>NOtyu$sfYG
z6i@x=<gpJ?eDywQ;#-?5^AiqxZs>TWJo#~L&UBjtf>$4PzB=RknImU~;M{igOQk9n
z|61mqHgIo`UAQBXFZFWA#~4mKBPI)pCBbd#o9|72DE!99l~r5e+%(l2Up=20r)V#j
zD*iQ-+sGsN1IwqNHOIbu`7wzvkXu41r**1=prD6@NNf}TCbu&+HJ9?1Yl+M`<Fa>^
z^7J`=2_9@4)iv6LW{URANtxKNDM;zur&NvUNya<=7QJxYwa9mpTWC?xlUq~PeW{n3
zwfxEB37m^|d{ANL+ZeJyqTPPCQCM!ni(Gqyy!Dlb%lBXYrT3n_`=9bBakbi>Z{;7f
z9F9dei|zS(EvB6HYTf6skBg<FpItre_r<8zvGy;A&Lh?HiIX*0G>nxTgl<M{za_Si
zi9twh+pmn1xt5zxzlhZRxOHVMr`V5MS5`^ZENjpWTF7(e(&aM(6$)1-S#D~X_IS>5
z)11u)TXQv@`3W}}DOxCNEVSV`Exq&pfu4=K=l<b1EhT)QTOKs2_t4S64U{n$7$*0b
zu(%)f$lc-Euh5$(<(~R}lU?^e{gmqcTRBcXbg?v_a^iD*#^lfZ4^`SfP5)`){9OKE
zlAYbhKnuxF(Q*sCpIm>jW&VwIa?b20IbVNOS+}N*dGe%B+vj*Z-EmU)naGq=K~Ivh
zA08>1V>2^PV1G#$vz@!r&gp4;X0&H=ugW%#ifoFRE&hFzA#c;c1B_`}si|x_mj4^)
z%=xscTs^vM$*$Ggw%fmbm+;E+Kj&)}rlQ1&6IdrCPFN8&cWdWb!GA10Jes<!heBRH
zk5Ey(p!8d1SNuHnyIXHBQeSju!m$?}4;2Mp|1=7=;MBb6aeTRU>;Y||r!_GX6ee}N
zy7=VrS=Dq2w|Ie>8A4_fy^<~;+m$yzymZX#j9TbchXjuY*(v9kym{>z__O~X^4aFC
ztZgloVyNw=AaWt?_#~@kHD=Dr6`dLrTbLXJjhwr0_52FG$uKA5iL%XvmqN|WYx?(0
z+w1gFf8H<G8MeB+>}G0o-_2m~_EFlDkUe+xBrcAIb=51ciU?)%%;K5h#c{8B;t6Lp
zM@`EKpG1np3|!O}%vdmaBHPPC$v0CR^{1Ut6rC0=+GE5be#EtC$KGwJHmmH;UYwfO
zt=4en!HvS-wR=`<He0i1Q@7RJDDB|Pw%zxfawnelZ9Ng$dQtk$n}p;@NjtmCf6b%*
z`;?sjwrAQ}iMP6(mrD1p&$_TjY}KCp?B$}qT5(4kwoFrFbztqhcjVQwc@n9%=1c!Z
zq}rM<y3EJO5{fv<q=9o&qUF=gOP~5Ua<2dTlRfc7QppqJT}3tRS6cq=|HRV&SoqOG
z8GS{?q^Z|+dSe)^el*4G^3==Rtmt0ne$ncXZT?$9{*M2#oPVZ$yeU&ZspE&bjdSP<
z)dPksn<W=zuJ8|ynZ_>VJ}39&%bk((!ry{KW85B`cwF>7b`8%~mfb}cEv9(h-{#(W
zNjbm7y(!1z{#>o|(*loQo4Hg_z@VtcDJ1{)L6we_X;FFVXSwElpW5m)>%FhTBEJUD
zoayaHD<7?0;rLbSLd%Jm3i<SVb4)rfpWXiW*sS_J+DcEZ3Pu=BcIbMs;?Di^!aF?H
z*1nul&-~ub`^HW_`44iNCrt1Xw)C>TTD|B)XWF|XFHW#O-T&#?s*E3trIbS^f4ghg
z%)Do=k@`Db(@brHJI3s5oEC7HvYh9ZmuV>wZe+J+Tfly0;|q@H1>T{(s@v|LD)8JC
zda?KTcTSHz?yUBT#}x}(p1CMbJU4M^Qsbd3hfdjhPWMc5I+3d=uyx8T5z!snRJ(lV
z@87Aoe{=G=mns!5lY)X$gc>F(bmm#5DXKgC<5XJqvdl(|o26&vlZ0iFh6>X*u{oPD
z3oT|8T-<B2{Xr*7nMK{=bGz8))*X40t=zr8ciFP5t|@yam-Ft7dm5Oabf=|Bo^9g&
zA1$oM0<Lmx5Sb{kYgzuDe=9Q<AJ~(;<#TDyoptgOO6lpREjneR`!^dXoKjfnDZZ=u
z_M%zfgFrwVj+z-$jXy^_*rxNZ?eA~?{{PTI9Y|rrz`!MQX`TM%E8R0^em>I2U0QcB
zT4tt$%fUtd(j1DQx>5L}jXbDiDs#;c?pS0XDdx@aw#v)xrA-@qsg9qj!?E2<o!4kD
z%D8kqRZXXFYKYPzeVg;5OLG_<Lv^C{Jk;`DY3E+L^UUD<M9W5po&G9cE*}7wcP33P
zka46tf>&O5WEiS^{Fzh}9OXatH%I%U^`9KPU)HOt&wRP$W9*FijejS3KbgLR<KNbL
zQ<aM8GdjOzsTf;rD*E+N-?2tS^-+b&%uR}=E@swD(o90H*Q@*}HAzZXY3--Hy2o2)
z4yR}-yVk4Kb8o($wOoG6#DiI<RjvGP_x{>nR`kE4%v4Of;JWw5qmj{Vx1M$Hn5H9t
zYqG#DKAYp6uX*0ZcW<gYs-^$lbk_gn`5G;Up4?dS=#&EIkpiQWiOE8?MW#Id$vmxp
zSK6#+KOwVVUK^*hc!%0d%Oq7^mZD#WxO-d$JOx=2PpD)HeeRG=d2G;_diOiW!;}fa
z*_N`w|B@UHRo1`TJ!68TzUKC{`8(IHE8fmNIk=>~g|GPeKL(eBEzj0oR%hvy&Y#dE
z;Ud84F?Di_$C8!pQ&J}d`2BYG_<2#Fee<f`bc1TX=Py@Z-*MuAU1?bKbm?E2GpujT
zxpqfh<)zxOKhJZoc%>$JIvO;+%Gr~1`i1SJr5!t`yv({%rm@+t>Y`25=lJ5gLcFRg
zmZaYij=QNhncFmEGrNR5cV+n3>MggU=7#JztN-_o*g=c%xWK3nTw;RaDqA?X7#6tj
zG5oX8opQpM!;Mo2l$`k2I;grkc=tZ(%=)lBo>}=?Et~t@xN_xlUs%dw9QtjatJFvy
z?@a&pWas*3&I<kolde4Q3}5G!cw$$g<V24H9d3fT8Y#;?W?pgWR5~)z>%<Cq*ULLT
ze6sMf?b^t-sOxCcQT3p-fMsX)+9vfrH;e8S54-r>(zDMtY{l2POAlYE-}1|UYgxYL
z728!`&sT*{`EUMj{=ahFitC;K)&)l%^pBW)&2{3p`ZfMqJM&U4SwqEyeBHB-y;!j5
z%nX*F=E3Wd?A0>b(pD_AGPtRFXsPdv7Kc#pvmzf))ron`nc9~r>K>xVW8!gXp^tKN
zuy@`TUdIP42Uw5ZdwDJ)M1$!;<tB+mDJPOT6D%11&0qG&I_qeZk4<F1z^?sECNeO8
z|J$GUw&uCy{O-T)+ZXWP|Hi=nPMc}MM)f|PKCf+la`(>d%$m*Qta5+D={4uAC!hIw
z@o=BZ`PF>QPuqXGM*nm={q|3nU%7Wi(pj#bhvzf=Iwoc5u_&nGjfz%%=FR9mU%tGG
zGJo-Zf4|<5tj#TFU95g>{4Tkt$cXhr+h3XKAH(P0i8;G~r-9cj`0i5w`^S%eI(=79
zUFEdlZIOw;XDo}G-rULY;<~ury7+yU=T#)mo_g(j<ch+6m->^^?|xs>jy|a|;qb?q
zDsHnjA5z@e+PkZIhu}QTiVt`H{I0#3q4f92ff<s;|L>dqQvdk7fKzu)OlMzMS(kTj
zN!%Potw>H@)xJ{`kEB|aCB2@Vn`$=K#A=U!aMA6`5c`jJ@2qs~?pkB3Zr2sVfBE@Z
zNru0v7rJ+T{!wPWp<bwl<)uJ#Y=q;w(4&j<%)^3jq@4Qy^Jo0|D{0ftO7?LYKNGzj
znpVy}wJ!LW8MC`Vnd760^PlxfPI`LjvFw_X>!0l7A2xjT|7$<_gz#5hop-YTkMCgk
z7r5riijEGJ)vu%1%S+C_niqQ2bNcZO93@gL+K)IAlmzDKRI)U8IHYXm%ydyz3jed)
zI<;~B-p6k%f7{m0Tff0C?;pSM!s{<eU$dxgF6gPLsCg}QM6)-n|M!cxA3tv`F=Oa(
zn=F)?-gsd)53g|e;nT-YZ_hP$v#RN8P!tT75BYw3_3G8DPp@8f<4CKUnyZ`Z5l?;%
z-|c>DwjVP(pd@h1C$UW}K!G>js<bq<uSS*ECCq2ViiL&`jxgNz-S+bQvKt?I+#G||
zT$B<R9v+sn`T6Djhv!dwYT6i_WOx%rg|+O{3%9ILd=khxMN#J6j)h_oprZ|@-1?<l
zE4XjUeHRD${cjk5|2tsu>u>kL9~=C04|hIad-MJOZ5=;7)R+I>m$Zw^Vd*aJTl>F$
zu{l0Nmm!eh^=*0e{y%@`i9c9pJn5sIyh7nTVLrdwO}1*4k}oz-{`Xn=%WFGLmhOKo
zZT7YQe8q3Z{Bl)odN^Ay&d;~aNiu`!TF-%-@9vZ={5ikR$@kSX-z!0?nwD1G>#bbB
zTF2kr%YWfBr<pjjUgCclE;+V%$0yc8UF>nM|0r-a$_dygsxT)DH*h|&n{3li{r~gt
z2m9Ig|I7`vX(%eNb*gGmp4Wcp@WaF>Dg_%(o}Zr{_<En{XQMR7`un@n=L#-5!Bc5a
z@&1sX{q0r`fwn_tlK*^;{`Z-8YI<eqlanEngYT(!p1iBNQqJx337fsE{JzYy3d%`e
zSa|4xz^B9vwv^j)O%7|$R9eoo+1Ap;;b_1xgTX^Ums2FBOM|6>$zx)M!VJNOo(otm
z@N-^2)Xg4$<4XhsClfnY69d~ZhW*y7*WRewx;H=mol}+0l_!dY2hRszGoDa?xo>)_
zu6M<pSuHd9?|xAFB*ekGSfQnZ<%!3`!Y3Y1DvC@3jM<AD7zMtr6X~u7FPCNi>oDb-
zTX4s%qZ7B5^mMi)C~aszWHGbg$hVjS^If02{CGEIN`;DULWgVPwa^R38#)+Ll%Ac)
z-f@%9@#u-nO;0r3)lI+H2npOw_TJIC^~rNz^|*-^xoR5n>_65UowfYzRjju8?|nzR
zk{4^G?tESwdRJJX%w6yPW>Km4B^DPxSnjtwE^_6t&9aHzKUt@ryZp{_(;mKst31PV
z**$MX7wWz&iPWk;xh3_M#)G2Q#)q<(zAU*_*l*cqbv9|<#)-l?tgC)q$yPs>HS_nT
zes}xC-{-$qvq{xC_EhrRdi5jc4S&}M`4@ku&njJ)@wa^6(Vg8N;^%!{b9T+n=J|EE
zHF+i_Oqwh+MeWSfT_J8fMg|j~37yC=c(C~Id&R{M^ltR6f4WD6#l=a*P>4O?u7irO
zkih=6!awI`x#p;Jxb%2lku+dg;Joo<P~R)FQnlU3GOJW}8ia1x|8M%?pO5q#Z-)K&
zlx6tQKx8s^hTtvLUA)id-=91^aNh;d##ol7_UkSO-Y(x|`6^rDSNW~wd(7>wnu%XN
z!DA?~uI|?E`xpKuE#SZXC;obITChq(g3IEMFAQH)dhg%e{NZYpE%%$f42m<#KK+gV
zS6yv)(C}&&*9}?qN7Cm%L@oYQw_$q!XXVY8_n-Z6_cnWMV!7D4;@i^B+T5)A0b!0O
zPpT!}SbBC&_Z5|{J=g#KIdFeMQtkVby2nGlXHIB1c=n~Xz$B*KU*p1A?mqhUDolOB
zbGxi_i>}_eeqn}Y{n9D6%Fa*!emobOyn6S<Gh1e^>@+wm2CAMJL@s*H3cKW*r0=-J
zNbpkL=4)4DL*DiFtO@isU0)z`HgReW=iHRbhEV}+jao;0PU$Q=&T~B4<-s?D$)Q^k
znwk_=@qTO*cFeE|ZeaNLL;qnzH|RhE=BVA%7}UGwIZwKx*nDup)j}hg)pI61m7G-7
zd($?ys3OPGwp?EQ&Mro~%q7af2Hw*$H@@>|?LEjN$I{R|_nyysB~F$H2USE;1Io*6
z*u5rrakQv3Y27Va^(1E2dEOtB0uy2tx7=}JEX{rS@Cjd#Ys?Hek;sTp5i6Z<8NZuZ
z`0p;=_$7aapogXO*FCq>X1|_ku`hB_-PB<H6-5iGjC@6x$VkNO@MK=m7~2%XvQCy;
z&-v8Z6WP<mH5PPlP|;P2-{!}0Qm4`|%qGslC}f70=q0ZSUTGRr_gvJS)p0ITW2aJd
z&q1AO{wy|ITaHgZIa$bwFJRibl{)LW)YT=tmAe%_PCm-g7o9kH(S&E;PGlR4C7-m|
z85AfX&M~)T)#DRK*SLjbO-lJ&)0!=z#WA<#g|PbJ>2G2kl(hsU8aHwW9XxrtBc%4S
z-C0km(vIM%k&iw6X8CR0_;;m6cuG*V>b0GPp|?()o#xH*C?c{Y#qrb^j@?R5Qy&?|
ztJ^J<lzzQ2Tj<I`<r<f@&rb3yX53IZxvf^R*vOmTW^U#Y?IUXnRMKypyXbk^&7aZ4
zRb@%!$F)lTycztQR8A-vJrl@nD!Assp_;Ih=T}#`YX^UiLZfhzeZGTe_$-q@vwoXc
zJkofmd5ZIHUna-31Q(VAc3<tx{ND#ZPM6;|y}ox&-LKexHZmyW*YY1XeqZ@<<AF&x
zSAz_=nV+ztm7$OWoc#VC<YC}=Qfj>VM4DA6U;pPh{qOc4VX0}nuEDp-Ie4q$!~NP5
z{?7f^ezMG8NBCK!kWs;tAN-c;AM&S8d?pij^wi`4&smbz|M*ldgn3E+*K;!L@V_iL
zTdC8w>(-Sm!Y2}ZBiIE#pINEVX68Pns(5ksgtq@Yb=4wEgal-r+64m?geI6&iEObb
zI`(iKcM|vXU%@l9UG6s5er(Z>Nj~s&&cQuvkEA=-+}owFE!AU*TEyC$H~;&e5adYO
z{4FnK&YF!jtW8yU7XH&e8buxxY<p?*Mac8`$_cIZZ1ueRCFV#~UWs7lY|2!b6eH@h
z$hF5OcdGgYPjz=DZ=OdIJFZ^#kk4}3yQ$=%(CnNkXLf2vJrZ9&*`>~65ogJXM-nra
zsInUhh-eAAXzAT!G?>!O<s@3Y$dNOUbJEedVp}AI#F9@PTCr?$jAmx&gn$0>3nnQ3
zkAJ%@vt-dmkv&apZX$bT?U8sosYhChFGS_sx>F0AC%u06da|3#rkv{+I_KsJOjMY&
z`H8~hmK=dek2#zTEf|$KQhb607EGQoiK9E4+3n*y2R6}%rqe^ylgf9L`A$yRT`1ig
zTC9IFciOBP6GPchjw|gsDzT0`g;uUyELSvPMsjnuio=450r`v$W&s`>7*-^)gJun&
z=j{A1u={9tKqdL+Cys_2TH3dGrp;5Cmwf$*?b%0<oHn@xeSF9%W4~%+t39{n<nup$
zreC-F;66X!<9To(U)eNI)mJ`qoR~RT3!UVWKXP*2&1o09I8kfUkyFS1Wj5db<oWKi
zOX<m1Yh}N4^dJ6KfBa;ZV*Bw4fdxWI^Ep4sPBgIe7dHB|z#t*!%kH@e<=JljWvtrF
z;)2&E9JTTP^i_aEpt|fT|GN6azh*5ge9D(_d||Cbrqe9t@?{*NvQs_MPbKcp-Myu)
zc3tBI_VPG?_HSQzpP0AK$LZC}n#i+XI)Vd~rW`)h_~Y)8&vM6SJ>z?^OZMZoguTYQ
zjNKPc7AUGvEl4@pqOImv+Uzv@iG_~+vK14B`%9)gO06{2I>98#;{M+)Jgqd6%iJTQ
z!nb^%YPR;|2sa^XuQb_tQZn|N`%gJuF??dBD9Pv|@H<K6#Z{p>LIvksToz1jEqC|u
zmu4%evuW@%W0>=)X5Pg=8yQwKoL^a#BE&Pr>FvstB?`>{-@LQB5_5a&@`SL%>Sml=
z2A(sH1g%*S|FcBolHp^?YrOn2GhDocGZvS;vt^yZ_x7(4M|aGo8vfg#IBK7CckItD
ziFH(%_}A))X@|uWH`ey>B`01NrEH8=V_v8g_-|i)s;#|2i$O?f=SAIXcDEKWCpL%N
zU1RvjYS|r5My5osO{?@3uii9>VJuj8^Xa9aDXF<u{e0j5RRy0*nD(>tw5NewIBWIx
zG?k+c>3TnSK0P$Jp6z)#B=g3q(=73;V$S*(%KQG9y+ratFJI3(4WlWCUHkh>+N__+
zZ9J@dY40<=$osVg3%FImr3nKA+YUuh1($8hg{-1kv_qbA74M19=$d=PBWF?B%2iQi
zwq@t8S~4~~RXiXt<H)+6Ze6K`tF8zNMnBVyTw&ZUwCN=8`pY_}%4|(|G(Uh(|5BK7
zph>`@17x~<H;aKx%M_btM{YasE%tUS^&<5=9zTRW%G_a+S-o|_hv#n_`kz1kaL@hL
zojGk<zB|uwyf;g+=fCg5Q6WE3puhQRQ-wa8(8ucwP9_@Ir1+X}*t-0jR{5Ask3&g7
zW#Y!ECluamJXf@sBb(>g`e!oFKc*t%`!9b#|2xm~_#PLRMU1Au60HK(+>C#^WLu)8
z*P6tNwOgm9R*LFWZ;APu#;k7AoA@QgI^<SRMo)R4$InZG9J4oBzNq*XD7-DeW#J`{
zFLzeT<OpTTE@}$b7Wk?BpebfCZ?ykCm(MmH(~RDU+TC3Fbn@1WHf;;fB(;|+k0;C%
zQ{=4i?o+$o=fJ0&wUftOLUq|Pf2E>=#lo}ORZmq|axkC#V6<nEVVt+-ex)78f20nH
zg)i~W>{RxWlnnf|M`pdmlrx2rUXS0jGt9^qI{n(?BTt~9uUgJR(e%y_y+3(_URzu`
z5@)O)t*$04t$BK>=iHd5UFUL(&K;i0+w$Pqj?RP4{RZ_W4#htLg`+mBB)u<N-eI3-
z&LAkcx^?oQDRXX#au~P^g;*wTHj|4wEL78ExL{&UQ9<IV3!Vp?o@=CO-=1h?WUuks
zfMHUbL}Iglb})ZW8_RizFaI7mum&<FFcdKSv(SGisNvugATR+G#ibv<|EQZ+ruSwi
zAJ5C8iHbi`rtANW7ySDEjF99{7K6#2);ooMDF0+s|JCxV!~Z}`)9YhiVxEydT>L)-
z{uS~3cXhw`k$=+P4+)%eJGP{M{hnF=6Y@?bWZgcj9`E3NXiGtir_EZS0&(LDs<WQ3
z7*2WGAtAZJP)OcUanh*?D<nNxm?i~md!pc?SR0m-<d`yRQ`z##E<8MZm(1QxJ{Wo6
zMdS;G;^<#R!Tt8L?yvB5RP#yX$<t^PbWs*(a$PRdHf<r-We%4Xr$A1ZgJCu?Q@#rv
zzM$~>iGD$lMO>5r#3Ik^3TN?F#kHRV<+eX+s=2;s!ttUgriXI82Rr^Odcg9-?Y;E_
z^<OT97Dl%54E~xE)DIW9gf^<gD9X*h?)hP#iL;tgnP!0GCfmseS$>9f%!}9Z)_GLn
zqbT?G>7km{PTsMNCsiJuRzC4l)Ao06w25fuy}9lu4ZMV-tER7;ICZ9C*hX&Sj9H?p
zOh(HmT=G>ntK+nH5j>fGqUcu3qDbYapMorX+b0NI%H_Pi-Z}iQZ+1%Cyq(sjyC)r<
z{B*)h`}qA|oU?-G2KY=)J1w;1DU%Iz)8q$19;fFgKeRRIv0(WyQP9uB@=1b|Dw`~$
zScVe|pEXBI*t5A;mvrnBWPh+K;)uiUr7GSkQ)cfhPYO3!n=zxvxv6*2i-z>iKeZ;A
z6rI*!VEz-lK}4)(|AJx$hPw{T9KzxZkqL}~+6~Ig4WRovo)p{&@d3|(%l~9NBld+m
z=4$lX<tF}(N>e5--KzYs-csmAAxD9%tzc2Ndsmp^iixpv`r=y!U)r{`-JdPI_SVU7
zeii=h8JsUZKl!v}RmGF@Vwa9idR`Z%`g-36_sjRrsy^BG#QDTupLWIr!ljeSCOC;m
zsVPc^c${z7b60Pc<T2Bj$J{J@!$Yy_rof%oCvwVCjJA2)YICv?DsgkV%2c?)rQfEF
z?V==?delW5HkLVe0&Q#;Rrt8}ip;z`S6;8}wJl!}Lndc%E_=X&$;ziY6n3^Gbj<W@
zvi{t1spg$r(+Ss&5)LnS3tyS^V9m#jH7lwZqUX$L{9(WKX4#CwAG_{vl_@h?J}X#l
z&BWeaMU~1r?OfrNjC~R1w|4A5Yw7M9(!I`m!t6;l$^Usf|916A?oPZ=@Z9zGswF4%
zI3+UGWmF!WSDBLP80)Jj=ej>-%H<nPN<T|tvs%omPZ;Iqi<LGu994~ANjah7!Q#m<
zX=2uf&LqJjH`c8XYFWLsGSJ-TwyWp@z3Y=!rc4Xw@p3#db!G_5gv_rq83aGSet2R^
zi`Q%si$9kxd=Pk+XZru#;zSAMU8(7glNe6$DSulQ%gC&)(zI|w%g#Jcjfa0KZ|soL
z`z+{@ZT_%jBJ<mGv$v)+$u&n!Qu*ZFaxlP^^`yL|qwEK^P5G6PLhUonD;H_{-k3J&
z*Z2Foxsq%cG*<d8i;@(4cr>YJk&K3c;IH#i&xBO?Ce0`hE8{DE_sE6CO6QeLlA3yP
z;G0crHqFz2^q+t4MF!p0TboLC#4mAjEa{jhqRk{_aflH#oX(KimMnY!^t(4L-|Mgc
zuX|gc|F8aeHUopqE8(m)JMP?Dq?Ng&cj+<*an+8p2<1R?CP8bx4IzeHIh9R0+EQM|
z?O`r^o*xo<7Sd*1aB{79RO*^H#!^DZ7JU=mDzqV6^2mV-@L_NPGa3#FaDWTCtn|m<
zMAcN<Cb)Ptg<1;gd;D3g{)clPhrs82L-k|@?}kU51yL;4^TpMF7yS_u{<-_{r#RLY
zfBgqV_R{~nCjB{FAAU)`ZT>P9TgewivgtMs6-JVI`~07Lzxb)n!&7#?k+Sz1!4K_I
zI}`V=a=!5Awf0@b1ec0!4*ec&{u?to&BdHU_fB0s@p9QGkEbo=!JMD}@%T4iQ(wK!
zJ=<dH$|oi_H;JSRO`oCK6v?wTsZv(aYCDJbCXX7E8KyI%<=j)ZsMW>V-B;bQ%e97O
z-rgrhS2{j*Jkh$mV1>Y)+|Fkfm3ETTl%snm9DK_8HD6HZh3tv;luw4X$JVsRzH<2`
zUix@$&-5wPj#jb?ZSK=1y^p;y-R9MkJ%uY&r`($O^mB)K%jCx!JPYa+BMXcwPMa=U
z(6mP2ftTvRjZ4%%FEZG%)4J&7$9<bT?wHK)HL8$r<_PL@+Q-n(d`ReMl4tJoyES})
zCI6?cFjm=^t3InxrY~Slq)S%NC6(Jeb6%#To>XQNlsVmT-e!BwtoV?StXENLvc4jd
z6(_e8Nk+QNG)g}yv{2~S{KyL`FP^A9TT?aD+PdVdlI3Sp$-G1ho$Ygt4{nM2vEaeV
z;+ntwRyy51{*F1Ve*!s#JC6C4ws^9uEq<q!wYb5JN9x{zBe!1Z@h+<3w0h<|>sr7v
z_gc3tC4U#IwlulstNoD;^37Fy{@`OqMON7sO^2`zCpuIf&eHv1c|mAG2Zv6^RGH_K
zr2kr3_@7{4_$<G4`kw@bB~FKDFo=uDF#I!Lqagjc2fS#5sd7h4_F?5Se~jE8*+2Qx
z{$}^5BIi@JB5iSw5-*ic{NZwc`2V_4oYmj20>5s)m^FEoz|wxz%e6neU+SM>s%Nv;
zI$|sRN=aIBN%C`!FZ>CDKX!h;(0~5@nkE0G;_j=<>p2B`mi6ljeY!v8WcB5Tp7QyB
zMb$qiUt{^XZcEQQ7WYZTMz+^4r`T|QifC_pCA76<>fg{ghr+(M?Cy6`d~sh+;L*)T
zA;nKN&J+sY^d#}CljMCKfwxs!!d~ve4E?NkCp-{7Sr(;Yxjss9-!Z}MGHu~{-jN47
zU;Xv-KYv$XzV|VqD^EK=i3oo1H#_04d~dg>tbCitr*n4hor#_)I~#5FezaNd66iOW
zvf$*kcS<HtpPXJ|+060knc(>hbzehWzn;!qp^Xz2mG!)2KRN0yH)!h=+Bc((Tg6@8
zqmS$Cew%sBTJ8%TpLkz&QIEG@y;o2$$>Y%uK|Ri#c9z3}?Hs);f)*FJ3imc{F<7GP
zqN>j1&GzTFmdPB`39*+r9VIrLIFQP)Nhi19l9;^Sp6+#lJS)Xql@-Mvsy@or5aqF~
z;^a1b`a|V1LwA9T%97cSt_A9xcr~}&>X@P!zxmCJ^$UF_vpKZzSlT#vKF&`*W63+)
z!|Bu&vqddYzod9xT;fex^}5vP@#`It_avpcRXQdmFR7fmb+<y)rL^Xx<$Z^@=RFaL
z>X{nh8Q!^#(IQ9f=oW95$pTIdCzrB#>Ly(ak@7L-X10#`qu8*<^7p>o84PnAB2)Ms
zPcg`VE+2d-a3dv5Oo|88?vnp*u2Y`s%Folibf=MT3C~NP)>uW}X%+1ikB-#teep@`
z!&HO3N$IQ3R4AMXKQiIabB}b3Mn3c79A9fEivM2H9{E;f=RAfviKk0D#5VGV<+O8L
zpYZ4oOMCRA6svkemEA0VGFPdduG!_^n|LBi#%b>(F*SjiA#LJv&Z4^%H=hx_lH&Pn
zm+RVN9eGuaCSNDXHi(K@v}UqsJnBd+eqtQyB#|xjWl_Z}7R_vq5=qhZZi*}3vFzlN
zX?w=VZShFrP9e{;EA0$dC$46gr>*6sr_-9#Y2Bc@>XLB(24$A=QYG#lr!<{5rI$j!
z>T;4Sp~ts!J~lkE<cw0yOYUzc)E08`C>%Stf%DV~U!L6x!ls|fR|y<Cp`^`X5YZGE
z8sWM<QFv>`+}A3L4k}tzTFz+l>2b3>`QV-ZrISx4xO~-BpFF);y`jySf3D)pJ$~yd
z=EWZ2R+uNO<juHEZLJ~CUnA$qYMa(yWAV_l*1O;4ro!{;*FhFGORJd{8$DXA#P6;!
zoUHZl>YPsnCq?7eskA45IIa3rqwnLlx+Ndo=JEcY>N2bM36s@~KlAQh+nn+%)4?Ss
z{{6&AgP`+051sp68#x`B#Lh*@%}_fuiN}XW#a)<9uJ!HRol)AKEJW;-Ggh?di1`;P
z@ttc8Vz&}LV%5;q`#(rUjmi4(<^BIYCO;EvS=cU;bTs*MB`6($Fl&ncwH(HctF9mV
z`Txtm?U#HRSi{=y{QvR)-~Xx07#JAXEB{Pm3fp*=fq@CM?3K?!NM_rT_kpHrf)OGg
z%rwtd&N%wPV9AoJ6I>>Iv|<a1&}!JtB&s8{Q#-9LM3rySBG<X{hK83~U)H|9GOb{f
z+pEdK%R8>_+UfA(`TGSIbwms9ba<&5-?!rK-u~rFgFm>3P}mG^^lks5q8_SraZ>0f
z8R0`IcFh0GPW)T`Uuj7l_m@i+*M+w(QIy^|;k*91qPp*Y`zHSOeUW7IT+83{q^G%(
z^Xa=U<}d&6(^ALqisPYK%Qu^fdHc6aeljmd{nPu88P3n*J2}6fZ285{p?=c(qrK!G
zHucZ9pC9Cx{QFY%yYd~EzoAz>KfDw3?=EAiXRy8IeO6wq^Cx?PfB)MPPAmL*k9Gby
zeEi9_I0y9;HA!ym_UgS)m;63nrT)~<&w0kLKl6P*Ka>-E7F4ab$N$(|kLe08T5K*q
zxKtPN%~9U0vSpt2fg^u)UoEP;_?3|_`}HHc+Mh<wC#H8Mp7U>i=lRigrPds7_Ir~i
zZdE;}Cwwk;^NI;FO+`)Ku7^2Oa)i&Mt;u|%aq83#rQlCTzIUn}6j&<LBJt$oZYhQ(
zZ(ndc-!ip#cF;nrqEAaMm2Od~@SD&s(dD+(sVdLQDelR<`$vvgi1kckmhoCAGnYqF
zYuBZ$CXX#ABlZVt%&~Z~@b3HqmSyfLKaB2fDtI`llh1dO<HVe=!dxe(rU<NhYkKit
zpZe`>O>3Pd^fyd-Em+%R9_J**p)2vB<!VlkhtXyYwv(rpXRru_aPOA$atRVh5uUFz
zr}IggyQ*Gl=HxX?Cr-4O<K<FxTE%K>Pg5pqw@_1U#>-79wtTOSvAlV8iBs$BLjkul
zOk%ReW(!ivogB7CIIdYU!|}Azk6_hHTCrDlJ#A4inRv~O+tiYIS1@1K@pC6%^q*jH
zIVMxZC*tWB^&l!dl7H^i;@1tVv3EKexE?pKGyIFte;5c!Pl_uH`A)7;yz=WMzr@cI
z?T70(KKU<JcSrf`pCJD^AB1Z5@16W(>Q|TPB|m+<KjeOmnSTCVjsLAZOWLQ{v$)iA
z*Bf`%c>ZY?`jh$hlj?`_FD3bB$wzX0jSF&qB448Ne#Mc!p<mX_Kg6FQ^h5Z$kzM4E
zF6YzB)IUx)@_uT2$?nL_$v>mtezNob{l)tszqab@{X3uhp5E_Slk{b!ZTM7=>f5(c
zEZCbltNz<KcmCKg?eJ-rTGOi>ufDcey6^KywaDb1UaS5rpG)xhWscwO9R63WlxoAT
ztL(Zyv!mkugo%&hl{!oAcG}xiFPT1l?+kyz`yOxYSSOs(nI7?A$)-O~rq^r@{-|W5
zxy0yKwR><$VUS3PP|hKXKNB^JIMULD7CTG}JmNZ4u~*i8)|z8pU8+ka&RsD5p<uU6
z?@|Hn&W8#U4tOjSzqO7@pk%5Fle(hx=M0{md~8OQa~Y*>3Mm{Gd?2bV^Okkmd#!0H
zzZK?4Pj{{2+`4<Mgo!i9%?3}M<sG`84*ht$i*3t~MQSGVRywML%zBz~X?n<euckG=
zp(i3$1$|~u)YJ=)^`5veT4>plrx#wW6W3XEv@|Oz>a+#di9@+=NgD2RgX)SGW*MDn
zRoODha#_cTIZHU^S%iB{nbV}^5;En<sVl0=!m3OumMRLR+OY+Z2UXX%2C@bzoC-R$
zyyV>JbrU>ZPF-d7XF^Alhe4;aT#3vkp#(vXe$~fKHVm@=4mmJ!H8r;69AE&SsQ92T
z;z|y<7^>g?{eObq1EI{UjIaW$^T&5aOxmDgBl&@4-eWiSJ>Q&c9&0&#exc=Z+F(|u
z5PLUAwYZA+ma<7Pah4L3I#rKs-Tf(+JIuTMBuB;O?ZPkT1oYQGU$P^;iL-p&D{sZ4
zC!T3fer6Z1`fQ(%^NGC;(^pkol)HJaBjLwRz1p*#1)_q-A8<U~sv^2bx%a^nwU3)(
zqdNq5&z)<|vZLHcb)}@8<4(on1}uknC?ESa?bC@9=O*4uIw9O9^ms>8%=AXVsVmeO
zj#SjkDeo}Uadon|7v;XaNU3On(1X9<nw~6K{<8DPjfpSz_Oxw$vgAR>%bl4r>u$6&
z9SHg5!_MY)%j4>4!9xbE8fE`VdOM#K|Jw6%nmSjcREla#Cg*eI0|hMK3Uy*RCFk5{
ze7t1St{{^)lWUwRxtCO{3n|%5<du_{*U#iWudvCY;z-UzPN!Rwp3FLAwJyZHEw|&9
z5lc=w$A{XP>IIWxnB%;IdsHW#6zuI*5XofnUF_<u%vvg-`fWmxr(bt^PZxKfgR177
zo5np8f?U*3URd`u_K`B{rIQOH|35A&%u4Yo$=~eFWY{URo97R=#h&?P>t=ebl?}c<
zV?zIbojTnePxkGPo!a2xaQku@kCGcpUF$8&#oU<!le{nfc>Z~bhyK*p-8S<XJOw+0
zJM-6VOnn&t^tMY&)X9>{RTo*kB!V{EZ8KzGV66JHH~+}@iR<R1u{OFhJMef+Sahja
zO#P!)+cc$1f`&Uore)laoB^s(8CYlBIWNln?uMBAQgsP#kkIk#U;f5_WMD|w`C549
zdyRVK;ksF`Uv`#$^H{(Au>u3bf-~=y2{ZB}9v7Q+QN^OAXGO}ks>IlJ6Y};k@_6|2
zv1W8W-NP$*#+^qtg=xv_ZL<|bIv?HW*d1`@u+%lC!@Jbxu911nTIo7fG4lb#zZrSI
z#A~JZrNs*!WME+aZlAz^=Wm03f%lSxli$`&a6Wm<slq~~VCEv>f|i}j{Zx-#%~*87
z%~8BgsB3n%^pO*diLPwLZB^y1A2v^5xW9J~la+s@z}wgr^Y>O5%CjC+y_Yue@trHv
z8@4>LxYg<5D8jTz{P)eK3l;rW)&zJR_^?knxp@8Wn~nbq82-!u`(J+7`ap{uj}t$e
zdh@43d9mMD+Sxut%9}qpS#^q&M~=U*q$yUpxnQ2I`oS$N#lfEQliZV!cf9fQH#M2~
zp{0Uve#_;_A4H9;s!}vBKVGHQkW<9KsKV6ccKrRh|F8G5-BwZz@w`xPxuCAx?~(9L
z1!l>^b&Hdkzv%xzZ*zZ}$#uTNJFlqic$dX!@^gRK`{V!ozpq~+`@Hr(@0~cczqijW
z?%iB2-h1@!{<6Qf4?Hfcxc%+@-Tj4o7;bOQ`+NKL$q#vc?n}xR$toXRVyk@eWKpdm
zhs~V+y%mPMDnfP>4}O^CWS8IM-kiyqw!=}kpW`8)$8+VDLxN|0l@Ao|QR=+BjAhF!
zm8C)q8cPBhZp^Je^!2)|_;cO0%nsX}9p2ynwk>_-v%mH8J~r0vuj~C`8+os`ey{b#
zdw<Wrll}3#|NHt-`5V<Aq}inl@*eepCVNY)xm#eXy!JKt#XhpS+bDb7QTfb1Ux82C
zZv@4@UM%#qu1)pX{LUl)Cf6}{e9imdKJN!d|1{ee=Qs9?KmK3dSz&%g*xp}x;kTLc
zo?j-}-d{HH;jc%QCzq7{6!M%qb%oC}<><MBM_!z`dgT0+lZjthHdX|MsZE+HG5Pw`
zB*9s~w}!6tFgvL=VU<S8q9t?L1p;4JNE*%5+QAf}8+n=Ic#9XWQpD`(CAEpG?RU$E
zsqAhKP%eJTzxK=fJIa>QJNIU<&PWVXo?&16YxNG<qc_Z1qqaX(w&Y%X_pQZq+xw5_
zO*@gi#r4z9D;AIEtr66m!V#$5)g`lcws&jDL7DPPHIr{zJYHmD^W^ZXU&8BJ7Z=>!
zWVV`X7H4X(gM#g9r_4zw|9@7E+;vau<FvpO9s|GcUPsp|d@!r((YkTzOJ1mEkl&(b
z?;Jhe<$tX#yZ*axWz~e6HGBU{|ET3#UG;e?H&=t7{y~KZ)^#BfArX8G|4i3klHlz;
z>{zfN1$?xw{Qfuc```X<@X=On-sPM+SGnk)-6X$MrU-2p%kx`3f8Bq?DKB2}pzUf)
z(${l(%BL1iTG^v`Ps)G$pLxokrysrIwx;KdT$l39(-U|m*`%LhdGV>pUc*js!o`Fa
zwI6lc6ed?J)LVYqv!Kt+zxCaTAK9w~4!EoKADT2(VA5@Ej)n&T7uIqZo9YE@D170u
zw54BR$|{%V3Vq?GLD8yywQ28tR0LBqQWhWIIQ2&0Pm$BgnoqoBSR%Kx@#bwZR-TyD
zvGzzvk#2{Zjn0cxa+(rfJ<bX$E&uLg?ohXX0-vwJ2@#bmo)?bvt!dGDt<#()bl~(~
zM+-9#4VHMJt4m!EDGRg&IyFp<I<Ii>o#n@AmFY(#pUrgfQ8?bZ;AAXUx*Ess*gprq
zTCrT~uoq4}t*R1OCT8ASA3JSVm*L^H%QWVld4GR(+5Otk`fU@pxVx8!*v<GMv*pI}
zCwXnLtM~fvEPCl>cufAx`L(f+E}UC>!Lpv`$o(C~?W-2ngs#Z*w+x!={#P%0;(np}
znx9t^jT!&W`uv>v*YUpJ;r<dg7&N_iKYaJQoY&--$$SQr0tSOCZ-4Hu3eCt1H?>}z
zAd)^e$ZxywhoaN-wtWft(8RzpEnNSjflz<v<L3;F0{$VVQg(&MsVFo#9QJ7qPFQ7M
z%ejDoLE1(m<bjcz@zifeMdJ%nB&Y8F@t*z6y7EIm3>Y1JBc?5xbYa#q&t=Q1XP8dw
z(VY;qvg_o!%q`rLk}mh<g<taW*FNjH;=<L$Xw$XFqh~X-n1&qDc4f}oyG<!pBq02H
zfP&#dZ!s(XmW$DL)`uGy{+a8GlyPhTo!`jFXEB{2H7?G7a-4GOP2od@LH4CqFXa_0
zLZ-7_^4#}O^@?49&<A#LH9IcnrUlba>o+giV38yt=dGXD@#)+Ocgu(#2@$Q64znAj
z8?R*SR-Cj}ZCRb2rUCQ**u7l;+FUlzJ^3j%x9h>N1Jm4knOO7RbzI!$5W;zJfy<S5
zD^(u3ZOjl#F4}GQ^2xJLA4S(p@}2S~Aj<LRk`GxY{vK5NpZ(7M!oB<6r(|*;?73d)
zzu-~V{zbnI&)mZv^*#9av%2DQyT3Gcmdi8Rf7#D|cK)-kL2tb-Fh#dJ&XFkO)-|5V
zk@RzE{<AU(sejwP%Wk(>yRSKzg(X|{#R-m`RS9pezBViQ;<hoHvFmZ-9`~(+uRh&&
z`=s<tpWkJ-ZdJU)`VAMWWYq3VnVPmtZPVMQg(u!d&1T-Ez}>OuVvNxgu_>Bds++u=
zI&-FIu9`UQq|>aGk|1WMo0r2BFNX;WrZfCASufzF#L~jXz%IRj`{3bQ2i)Zkta~Ts
z7Jag)mQg;rlBHkwpy08a$~PAZe&APBUYyCar02Iv>P_XjH*V}OV&lE9ck{)Wle6Xq
znRVXy^eDse*Yv-;y1DEXPN?4tcF*z-JbviJzs^}Fc;kar9N#2m=&Z0hVWD8T=rzAs
zCa?Rso&NUR{Bs_6>`6QkGwsB=Yr!*mj`}{9J((;z_1)oJmd6^;O7eOq+P>y5wOY3K
zRdmMNWg?x9PeNX#8hf%ozLsC9aQft)-hwS<!S{M5y@>B|p4_z3O)}Hx#{0Y74}I4f
zMRzXeWn&KGTDrf7*<?A-#=ubP*PjcQ^fji=X4w2~Rqvu(YJHm~v2io(bSj_r|FSu&
z9JAfVI3C`l_2Oc-B6Hb6yIW1yn#@*FJnWcwAtuLxLAH#+EB^5}S-*M)f$1!L{6TK#
zyytatupB!1S;%{8*`?Uci-qo-P+@!WQLtO5C1}!_SDwdrxSMt<hvjUjt2=ez*R;n<
zKUn-8u^4J}9Nn$vptAUR;M0)$hfn5j-}Pkm&Cq!VQ&;uQO_}#*d%f(<{kK#8)Ym+m
zn-lVGseT62Rz9I{-UW$oI7ByY+<I-|>wxDrTlXybdi(yRGxfqYYf@?!o($5@`{X^D
z|L<8_Nw=EplpUF?XRedJ^Uc!c?)$y-&6YncdH!YbQma+PZ#D<JU3u8HP<9Jb(Yy9e
zx#yx2f+UJswl#S;JuoqgOfi}x7U@vb^`>_AA(vVHi-cnO)x$5W$$xP(pon=>z*Ebu
zFBy2w7I8N?aJeur32$3fb5G#D_KVlI<JMQ5E<2lP9#%9Z>uRy-mT2X>$CIXtJnG3*
zeAr>dV9BQLD${W3;L0+c9r7kq_SWrRA%5aVRbbbKka+*t1FmQ8zOGAVz3Wt5Qx~`F
zn2j&b);)S*4h&5vA_KAvpT?e4+#d4%<s4?KJnea_RJGHVe%JYa)8)8PsA^)Cd+c@v
z3zd@$C53^VUSV_AEpy{MAkVQ#^vJQ+rHe9Ftx~O>bm!9P*mtFu{F#di82%Zp3;4*v
z!@%kE>Cb2V%YoM!oZ`$<3I*bn!)1+jy=|PZEb7vy)mzGqg+!WyLv1TBYN#kM`#rmR
zDk8+JP|8|Oy?)u-tF!-FTIO1WJ>)G=PSTsXP20UVth22=<bKirvb6zs-wJMfV`w63
zT2-*XNom#G-m2IY`nNrL<L_R{wVb2bv}(%h8O58Hi-q3#yU6)d=9URsCZXbAr-t4Q
zz9pX2v+a`>TkGDvoS__7j|opO?A~9=r}@TPvBym1ZHlh*e!*T=%QB|2*aCjmNAZ3w
zQnR<sI%48-+J%K}-~LK1EB$<bRW`#?jpI`zrmCH<P*pMBA=sW1{|B`6DNtwXF^(3g
zd)u!qx7t1P<=6W%pT9m@8|T=N*m?KBm1kQWuSBo;|91B_*2hkvb7R)XaeuqBWAF67
zV`jJZN-XfYyzJ}wgl{_gGtce%!=KQ%r8<0Z-nM5I!O0rm)~q}zDy=E{f=lZP*R-(Z
z75}dLzc_y_ch}hjCEoIYq-UO=T>Kl?o!IxZkE3wu>M30&9dehJ^0O5)m3b{KyP0+9
zhlFUQa?q_~%TDV1Eo*sp;lNI<X{numHakxHZ?9a>bYw|#y!*Tr5%F%J%Y5qt<D8~^
z73Ep?C^cnOqv6s<XHRbB#Vku^3e8=f`)m`}j~(Aq+7eeb2VA(f;sZZtyx*?UNtdR+
zI>65G&vdP+w<_0(BMf%72N)RG-!ZBPw0SH^Na%e1VTv6?(}IhJo1|+NGz*u^n8h8)
z(99jPVCKnRi`e)Ze%S5$mFBN=PDkK~0H@CkDFY_~j(zIPqEq`BJ05?inHR{lE~l&J
zZ}pTG{f3K0`=7szJev4$;cUwyA%@jFUp{_iKODRG&)me1`>tR6v?J+bY3=6+MqLl*
zt~(xGvZ(N>*~OBxtdlBZXPum5k>9k(f1AN;gB@S@teg8F_VXwEo!{qI9@D5V-@C`@
zY2lln@%*em<tv|0nf|9<{nU@U&gV4^eti5`vD;q!=l<eY%Z`i=&&s&)?o-!i%xdbC
zx}nC9DDm*5jSk22ZEx(`|L)0MzGJrKf5+L@#gCNdxBuL=c1n=MvAH+9tXNYnOnj8T
z?PHl%QG)1s_UM!g2U(i5c%Co+^Zw8O?B3fy|8M$n|KhjbTc)vf>3%Z{b3b@}9&dg2
zsrzRiy*$L&*q-~bGPYkmIb@+j()Js#OTQ`Ktev)Xmu=(5;)9?6U;n@ScU|_TFYg}+
z2y%$bVA!?Tqi|WmQ5E(*|2N)=Xq|9>-plPW6K~2~G_gCd;>mJ_{j1d&s$#D0m)x3u
zcBabD`;{N}zx=#|vq98CGyhBNqz|2!%k9+7%iL2ckE>YTj`}=HzIla?h$Z;AKNi=7
z^U|-@Xtz|lhIz*PT3@zy&eL5Va~*A`u$-GHbl$M({I-cv8c96&BoZVQ^e&V-JhWgf
zNiW<xcgiijADa%Zj=pnM_T3`WG;>upIWGqbhJTYc{hXY(b*A~%FMkh<|1SM_Ki=N`
z?fU)CZ^s#J`*q%aX-##PgyQY08ZrL5t5xf&9=g|6WKFm?YYT^tJl~v(*p6#omD%oY
zQ`_-Cov}|{c!r(D^L&4{_IVfB8=hoJzL0BB7xoh|j$CqG){ntn`FSnJU*<3R4tBjS
zoUCW7l{l`clH#6hrFDPbnOlF2qW#*xux*?DclC?ha2dNx_g^P6upMjsQsQW}>G_JI
zBEi!yfB*bF{POi04F<#MAC9qn?>Tn=&+|FI6e{mq*meGrU&yjy;t5wi>ob4%o;mkd
z@MP`etC9Wx!-8J4YTo#Befmq^j+UDLzCEHX)dh1rx4!u4-L>~U7oUibh?seaS%RQd
z-4WiW6$$U3@csF`Y8t4a_D%m$!-gXd7#JA%?=Y~KN)#EFvlPwPWU^>tQ`Hpti3eQE
zny=>s`Yc)?zP(&-{geC0|J`rSJFCj}e%|Llx0T$Md_Nj~Z2r7GvhwkN^S^E?(3`08
zuaU9&%9b*#zsv1^GFLQo-m~UtboRO&*q)GoOGCazO(Z=i_SLPh3eJY|3QralPKHk3
zShgjeEIYco%UdS4o||bRI-%w|6Z@$O+g+S9f)*63q<)^a@qux_`&!mtI|Kh;P*eKz
zHz_GQm&?knTkp&9<{L5Du}t>`h5A?8?>FMzsmamc{#|CZSX|?I77zQrrOyie4}Oc5
zv2RxrE8Hd;J^A*p&Ckor9KtV5;;7n|CMeQ=WOYGlKS$5<G^O_DtOII`Cd~2T(p$A#
z!Z%G(aleM|vUA;k-i7(EPFVKp5l?6H2{q@ZWwD9Zuh>+6O-WLip?rSr!i85vJ&YEz
zwX|x>Tyy4)Xi5;AZqe|;<+4lH$CmzxuE&#Hn=E)l_Xvf~OuDZ=`Dn<wgRO7BZcO)J
z@#Foyn1ho=;naeIUwUTPy}a?^%;s5hG-7g6dy-!D)}B0E`Q#K&+2aTI{ND4L9$v~H
zyfVPWXwAEZNRRSxPRm<9qBlC0Jt(mJa{A!5D;MYL+zz?mm8u6qNsp2q$S|_N?<oAw
zAmDT&BT2w6n2l4Edqq&;X6d4;=)N?YyUdq`lNV{+J0tS$_PS)B>-tTur{!ZB@{jwi
zD$>mPe|_h!gTLOks3kPNEcqP9v$Nuv-RZrL`M7`8&3~u0xH-+M!G(!?f&7to{-3{y
zC7o<l5c-qcuiY`rKY3oftZDsJ#|=+6r19*mJo|U*x{om%`P^45@%cHK)htk<%dhF<
z`KHiQM*I#%yW^gG<JmdqrOj!Mi*t^z-St<DZ|B8%+m?RV8|Jh6SEZT3%*m-2)1Ue*
zpQY^_EY@M`|D#-1vj6Z@Z|!~WS+B@_csbKz|HDk?ryP>Dv%hv<a(n&f{<po-H!5w6
zD)PSU>X4oL-I$r(<&5mimY6aN-OlDCcFO%P3_je8e--z*Xus98lPeETOOCoL{L!p?
z!N1I3(ls(E4ylWn^Z4^FFqW0OaNGTRe9%gu!S&vb$6Jj!I6Lb3dD=d^bHqzO$yzMO
z|5E<R)7XV8)-rtZk4al~Cg!-xyKT4S{5ehK;w5Xuml>S8AZ0w`$N5)ZBz|)*?`){e
zFK=6*Uv%B^s<e{H#fi~byt41+27c$C^WF2!jP_4o71kV>$Z@j4RKhyec|!7`kDW#W
zvy=Rvsk46m=wUO(VSd!@&-beiGXL5et$aaslGSZl*J}0Gn=*|T6eZL@pI3MK55K6|
zBH^GlPfPRc`4S%9*qr_OZ?;y~<Q45NYIYjbN3`r%!cjRv!fM?MQH5N_30j_giH0lY
z>~Ogkl%&gZ>%tShAVrgY<4vz~)10^p4Y>Sk{MDzuH%~}?@-9RTJlD#gRJJKvd@0Xa
zFQ1%T&1mi=X<NhuULN_j&{p7!8)I2ul~9DE%A^?zw|Mzug?*lA*KSy#*P5~K%u#nP
zMqP2=H3v=0SM0o;^VPU{DL?3XdHqLXpfxPK`Muc=ESsgA!d6U5u;X8#x#GHU%uJ`m
zU-Kq!Y2UEQL0B@ftj&h!@Y!Aa4}SVCa9Z6$XOHI0`O+oA)*W*+LM1&nT(h#(Ka{*l
zz}2WZ{^sJT-Wko;tCTO*PPg9u)oz)`lC`W_{Ig7MGHiP-E+aBy)!of+j9g~AIPP9}
zT;<SN)01L5IMz)RYMCiNSvA0EOLBQ>fy@i;AidJ=Adyxn`Nct=&rhk&{~=(J9XvrJ
zPjCI7UI8P)lu3-=&zy^$nsJP)NsmKba7BL*-{l71qy?RMQ)YJOt-XEe>JE*PYW077
zH5EzRyk1-nL?zyS_-QL<IdjR0mB$0`D$Z~HR$E)^#j3{WouL-|$@I!2L1h8i9L^_-
zdj72w+NY}hUDPqtNc++9%aePBUespA%*;C5YPaq9<QLzIJTxrXIF((Ll}h~^)z!-E
z4+!^GOfzKp@TrKkiLc9rQQ7v1LaK*V?xiM)S1tug3Rx}sPc~?@lt^f1XSgd$#h;uw
zcj*?bC&I!LyA)kjUTRw(S4}G9*nGmxlXcTCwQE-jJzZR8Xw8Uu?@{(onR{Yv-o*(W
zEkXb9vtDO<z`$r&Xjo_|F1aMG<w=YW6g~hg6M7^L8X;yXTpPh?!yc$O@sfa~T2e}X
z=Ewf+UeDCnCJ1a^$-(%O#e~Ty<K*@BJ1LCjleNkpx`@16IIVr==8}E)gBNv{D$HS=
z+*r`$<oIIi)SzHhR+l9So#AID9Dm>$u*hBRQE{s90i#B{kZF!dDqgoaT9!?i$sHh|
zxLE3%E7K(fpX>7Ny-%l?Fq)iHX`QsQLtR_M=R&7@pF_||pD8|{IJ+3t+^4L5^2yui
zgs-7?m)c|@&Z2cPle~O_I)ZoK-n7sou)|SlQs>+&4lfpk33^`aa#3BOJ9)((CPQVX
z?XLyT3QXNP;m`?HzCczNwVK|$2|ZOizfCxx;D6xUg*7Ky6#O<nIlpM;A&*oQ6Co{*
zi7sl_)OC)WcGf;NS8?K;Nn!KL8d|PB`7~)V!<@<|3P;o}=igMjI*E;^M`ThTQ^*Y`
z-cJ=qlL9*hSvE}Yc(O*=&St`jNrlC`)@_}{lH|}?7O$Qcv&^bT?uM_YlFa{_$p?-q
z-3Z#6v+9#TLh_G@OQK4*zrI*_ZAawq5U(<iIo2xclTPkiDdY00jb~w_di#073n@$P
zsQDib__4@%Ve}5K+m?DQjUQFkpPgTLq+sicoY?t2%G1sWpB9)F6sQ_Cf8oUQQ~KV;
zu6MUqn0h3DdAE>5;uhw+QF|9Jdcxybywle><E&f?-vgf+8zwMta&dBTR!mza6sNlK
zVORv0&x8eF$j8WqsE7)*yHxH86|ME((Ai=B?fX*>K^C!HDk>LM=1RE8PgC4`W8&ng
z>W3;Ab(}mBH?tUqd9=2?j8#v{b(Cnm<H%#9T_$N=c*5a?z!?|yNe@+iuoOx0DoR~F
z`C`+P6)uXs$0qE{x@u-)H|KP*c!%eU<DWR}7zIq`w7s>iF}2vW{+s7a+uQ}$9(}QC
zDH4)RYnd5;yHsG^mdS>d)6!Mh*t;X^{jc7BEhjM3TTQOkP;md{lPdEn^Cc!+d>&)f
zP{(6k7?rkmK97UCLde}!*FQch4_zSBR<(P|-t&ho7Ay9xKfpQtuTUo+AOHWieI1cE
zeoMPlTJ2<P&$!mE@WVW*Lt;+wxB8H^ZZ1o<c7C|3u<p`iog!CryUA_*LhINIFW*r}
zRs8thJ)m{U$@~B3hVMQeJn5~zYuphIyV46^HHtFpq-BFlBvcuNC3L=g3^_PE^-zAE
z>aJvt`>*!~Z%p}6^)K$(tP3t3JTmsn?h82@Y0Pn&RK?-c<QUxH-T22p>i(4wBM&_u
zzfCJHCN;m57oMGNY}I*|(Wv{Nfj~&(sRh64+kR{mYFqcdf6K>UJ1d_k?HRS@MJGN^
zH@Nk9hLh;c$mzR^{g3?imw1*hmbA`i$(zLmbAEf}yv{On%=Nt%7WnzLuULh}{!jMG
zkJh@&Ol@3kdic-ti>VWe*_QT{zn+*gNAO#uCFg5j^9P@@J1@5RaqQxl&FiwFMJ1%b
zzGRp8k>~4MUiEP{A3C1W6U4{g6hAF!*OrOw!gC)4boH<-(rQ*@lxYz^-=fHIZDyuP
z(u@SLC6Bf=YBR6maPW2O6q&tXa)Xj%i^sF$M?ZZ^n4CVLQNjPo@lPoapFDc}bb@j{
z|1-0do~w+r8DiF4$>5Fyl~4=}Le38Zwp&O}Nh>MiT5)dr%Cq0^t(q#W=X2ou+Duh$
ziD18-Tp3N?Zk*4JI~t6)dT=h&Xj$39vBGG|lC?q2duHixm{gK(Y7x2Z!q@u;K<hjn
z2{IgIPyii`a9!vDi{Z>oR(m=HW>>u2)mZL$!qJjpwVG0h%3Gmxv;3J;=A1D!cT#n#
zlxk4eHfdtq)tS7TU#BSD5fAp@a}++QHtX`9L&CZd3l%DsJaSYNSdzb^J6WKYVH%@~
z(S{60!5o`<o;HqYwTr~XT(eaEcuE*?1|NS~!@BO>wv*3PWmqI`zv^mDpA>HXJtSMh
zZ0V!)OfB<H|4q)k5iNfl1>;pykBHi|tZ-pAnfct@D8a*JnU_pZke{Qv(n_HoCf8Xm
zCr&Q<@Y}A8XA^s8xlBXi?M_vzLWL7*Q7lh=)jN(0Z7t|woWyy;TT-~E<CdG#s^BQG
z6VVGi3_biKwTf~jjcv3J*{u83WBk{sAaR0D){GfEkCZ}r^sn6YQkIz3p`Cf(M`@~O
zyQ&o@XK_cDquB!w|H=JR^XIgQCGylvVoBtfd?Lh?dA-8X1%=`vPEvCo@+|ozo;Rs`
zheCBrLsD6e@E)g%8B0Eha-Qn*`@CAG(pJYQK*m{Hr1gxZOl4>24kzC^&1RdYyH7ms
ztsLWWw^gxP&z-BtZ|hv6Cj}kqOAiUosZx8XF(XJ~O=))4tD_UA{i!UNuxj$ru8uX%
zt$Vr*RMrKky}Ze^k!Rn8dA`*ReOju_t3F&(>7IPZZDQvcLBr+RdZo?j<_gM!&#&AN
zSbS{7CyCdg1}<%<9vk@VNly@48``-&6oezcMQ#s`eA9O~G(9pkG%_&|d`iWmz-AZF
zL_Fi~M(x7R@B3bBmK?IE4dgY~QQ!XQqkVWqMEg2>E*5*s_(_NCt*Qd;JLY{>T=yf-
z{h`>6`A6-SKKaim-y>AteSh1Mf4VO^YN~(hIKSAxp!0*AfVU#gjGvwMJ~e9UAFQuD
z`D1wW<gA}+&)%v{sP-tlx5W9lU7F$(y;IJr)6~NpdDb>nng}gcX*<10;6s0n;<bvI
zva>ywq$u1~b!xOxxg_%Sh-u^{YXf1QB!5K<KQ&)YJJ}1JU*~fQf3SB^-@nC6&8Ow0
ziduxyn?}39>nFe8yW~H=*eJ^PB#-SUi9H`}-F~|XoG!T}Bk$o{+^7=e@}YR=clGOP
zGKPwW)hF%p+SIveBKKppJ2z!YKULjq`4H7qr6?@J<bB-RQQ1Stj=!FLhMTLAfS$mb
zPnH%IW)r6CziqLoQ4-a<xicsra`GafXilL{oqY<kpI;O{G4V-~aqP{BC-$-&mY987
zcg5sqg3TITM>89G(kIVnnIrL4OG7kj!@j2TevYachkOG?{FLnyr7yh_zQ($4^@LuZ
zmW-A+2M!j8r}<1UOxak%*=NZ!S14+pj^8nc{@{m83_~@x3T2&qpb{0};S<O;BgnnU
z&50vHdBQ}ChGe1kfVmPb1-G0`7}c*T_RG##WZLH?$`m2ZvL=}`%cSK)dB?*xZ{Are
zpG@2@Fil*@Yl~CBi{uwA>G3O%YR9TXyKJ7sd1Prx%ak`WwJ&)**z%<OHj~*QZ|3Ey
zD^7SvIO(+p2A^DcP;pY%t80ofI<F=PYAQebzv%d}8~dOBUFZ2uKkh<Utos?AhlR8B
zji#=u`lstM`|eRKmub3-0=I6_(oOnaz$jt5{&~PcaH&`SG=Ou5oVJt8!8T8oHI1{<
zX6M~gS|->nJ&!Z!)T9Ty581JOb+otn=u{_kY4YuJ%0Zq|B|kh|7T;3MdfLm;oj37N
z(!1`K&A&F4Z(^wypTk+P|6YpC^t&Ea$#W*Wo%>))tf$4+jvL#Pj;piEoN>^d&U?wl
zm8Gv*%KwDKC%Gw~F3Q@inXmDr<wx3<d3>U|49$|yOQd#OTKn((FHgCHma{HN?)|$v
zXI{LP-xY70+tZmj%_l#eCjGPD@M((R<Q5fOK`9Z=dq*dAC<!i_FjJzXEw=M&3yc1W
z{P+9o4vHOBh-SH;t>~fX6?^GWPLJPQp`vBN^Bv|&r92N@`OLFP!BKF7Qb@jF^CDGk
z9%svp*&Cj`m1wx&l*Z5|^^$9ANBkbAwTFaWMou?g^zQYNLW`g%=_95y4kewQd`_p$
z*u~j)nc|%3Q5`e2g_{-Tv`;dqxVK@A$F>vK=6DKpxC<H`o_WHtK_THv%Ey$q^P<%R
zAE>At@hUs1HfOcVoS93S8Ja?~BxYSspD+5^iC>bXrA6e^)|(8IB%L-h7c%TRQfVMD
z%lk>>g(m`W3lAJLJ+a+Vm8ILqFX+>P6CErMx}yYwmbxV@S2LR|(p@W|Ge_dvPY$#G
zvU8_jooVZi`JS-r@$FY@-Wf6TcKEEQ4r5>Kz2attQpSX1M>y8FYR`<g)GAW(_><T3
ziDC>4tTPsV3f$4kxTp)9BG~qR^H->pkx=G*cw>L~<bZFA4|htZT5x48?GP1Do;`U*
zvH9k0+6AeOH<ui6oRzq2spz!SS27JjO01boo&E>kSl3>&il1S{xzX0;QgG16<5h3s
zkDT2ZBi0zf&jT9Kljt(qwtQ)tIG5mT+o>u$ToeDv`=6`cKmC9Gxw`ecAI@oD)c>uj
zdd{(PMoZb-8lwx7U-Ai_3)DB@w|&>EA|qxIVX<z>6Px|(G!kC1Y9u%?mtV-9U&|#W
z_+iH<XZ{0>9G2@J1}R+u^&y%2d#`PJ^<)R5&C#Z!Nq%zPY%83WX)|0AKJd!(*Zwa}
z?yvs0aJ(sP32ag9J0K7nq<3!O$NmGG>}T%3f6{)se00Z$dOd-sGZqP!3O!5dQ|}Ob
zINzSLw)8K{gg<UI6^geCQ;IHn1|1Ua`4l0MvdeyN=ZE-&$)Ap=sUIn8SsAXF$Ejh&
z8TjDIhkN0Qx2-MtejYxNsxh(BbDRC2&bcm?)0;g%@^JEeH466eQ=P=Ia6*wwwSLQq
z3g*i!#eZ(huXyhA<KCXBdo?*Fnam507}kZVe-u7^#C%?f;<NvP6F%g#DR-O}aJi(a
zD7X4fN1EkBzT|$z(3=rU1a2BR%9b-IUup1P@#Rr?#)P}CPq<I;IDB#|M{J-T`x?dd
znznj5^A3CXJ5N++I>=$PA@`^4d66j=6ZtIXE6!QrG$lxayJhQ+NlMS2sAqMq*78?%
zD?Re$qwwwy<@Q}pd@6Hw4l3PKEc%+LE+g3UNt;u<;FTB4)04*^_pP$1PgJ?Ba@J|l
zDyM~=6I8Cf%$#Y(>TA%_AneD`AmO+|;rpc6s+%oEMS4#rtlCs^&hPcIRKfPC$`j@X
zo3l>UKddNLn<y|>hfzB0q~DF`ok5OA1aA4No6Gt0PF(O}(&AvP3mX}HB$7hgoKzMG
zJ+878Gii7<xml&@wf7d!V>A4=$jsO4us?e4;LP)<eqM0#;xuB^Xx)-DDWZdM$tB4-
z+Zs7V%3i1+5M<$7^tJbX<GlL#)X55){E|=KX`1-kzTfA)<coCyp{E$GblNB|Fu1%B
zD!svc^sbkngv9hs9j~`4g;x8S=ZHmjzMXl=yJt%{%Na)>e(C4ZhtBxy{XBh^W3SBf
zhv8G^2yLF<8W&U$QarzP>6AI*0e|KFzC|W{^W1y%+fkFb-+W)LDC%13-Ew!G%%bT*
z4^?B-G)o>Z@>s5an9wm%Cj@jF2czw}>zYfSW(eK0W}M%zEO3xxO1IgL6AyOmlozcG
zXn$6IZpq)RkCbX2w+UbCW;qo-nKRMyC13G5jw<)B6Z~(?*X#WCeCOoH0v->ISkj(&
ze70NXe#x#``KABV7Q2(_oWH$KKKZpQWZpARo3)DqUI<-H<V>o_)Mx*(r(fY=$`7sQ
zH(C;6m0RYx*%s-Yo_PLUM<M&<n@>C$m&i)=d=fnM(&K~gO}X{wIlVt<+TIUQU3)>`
zN`aBM#X_BT*UGL5J&oNnJ?@s^OW7II`OZ#wSM#6o=@W}NlWeRzCVrBie&Uxnm-o3i
zW99o*n)VE-9R;$DynGCkja+)W)lDY9Ws<UCwE6d8qV7fMAj{J%IZr%pZZGjx?P=^!
z6FPX}+?C=5C+!M#bc2G@?-rj(n&LNANN7p)*^L{HJUyY@eMsQ&8P(~V+_pAFv`#qh
ztz6V8dPsHhWY<t-p{<!boxF4BM{z|Z8agXYd-DA8ckgBY&a4Vj(hC*y^Hh+2IhA>f
zS&4Xg^pUj<E?EbBiz`%Jm{{3-Ek%wlObqu|6Bpcas8Cs8%_}ZJjZ2;?QP1DBH1oYz
z)cDa9_Ih&fB;_UROFUIQQc`?YOrD!E!E^aDp#`0$m(~OY$Qldzh}o4LRZ4l>8{%rP
zG@weCOPBZm`LCYR$Gf-6m9Vj>ePfEN_59ZJTj{EB=MSL|+vYR&Zqt9dxWebaeYtsp
zVG0MCI35<*Y;+b<y&2B?a>8?i2XEHPnRvaNywH5L)cL)OL<592=+@_oRSM=O%=0RL
zT*kHcw&~Ga@hh*i^+mreI&WVba^<4xzh~z!T0V;SGVh6~uad6Tkz|v}d$_|i&uDJ@
zkiJEGAKUgwwGtz3pR4MU%L_Vow9LBc!I{ZodE|0|qH*-^ll`Hu6OE(Sb$r;jy#cg$
z@!3Q}aBWflB;W*>T7=6gvB=4~!p~C=h8#V)<fZ2Y$$K$dCFU#@I53;(%)}4>mz*rN
zuby-=z4Mp*%_Bcmiz0F!io9ORV`z9p!P#>6gRWza0@^X2vfH*y7y5KlKk54?yUHIi
z?x$ix+L-qXUo9=F<NI{dp0}vd&(c_<Q#^oU@?t?XhCXj6XK{<^9M4mB)*Go_`K_pT
z^;T!m8QJV9%7-!&$~>PiIja`5zDn_9Vr5k2p6<yj$;T+U&EsL=lj2=YmYkoGJZ)JP
zaC9e3bX>Popy{>HS~sm-*BJ#*D9ikPJhj17<M`3TC)y<0X2rSukvICaE%SuR^oKrY
zCU`7lNsU&{+bLbTD$MV)XX>Yv4F)eH|Noxs;p+0F^OvBov(jyqiSjc$A|iFRX-qkB
zMe@WlN11yWeSDoNFFiB3+}<h61*sgIxOt*t(~U`Io*L*@>h9-MY<x0x+m~r4T(%jV
ze9Gr`#>j$Kkn>IQ<=___E#hi^?9vV?k*1T9I-)qmxfV4z?$CBx7iuX{xHd$nL{hHS
zbtU8cdwCb?)Bo&$`XlPfk<%hGg*+IxHokr0lyziJflEhEjrYyl^DEo8q)rr4QZ1{E
zXu3I>#qr`vk?2DgJw^K4{Z=R|E^#Q*c9aN8Z3x-2?&^gCPRpOSPI9&@20l>@ni}Wo
za5cQ(|ED)=kKT+3-c{8YA@aoW^TzN0Uw_`Qv*v;h0|O7Uo4fMm_$RB)pLu|`axE7B
zqpI|}`0$>`*X$n75iI+DH~-xIg|B_=GP-BHnDFKPi)S8eGv3S(R0@b)@aNo%-}(0|
zw>^FIvv10^(%`3EdA~m@HcU7j%rL>dnf-aErESjB<Rv2g2Xecwowm-s|KZN8&XSOK
zeI6_s8Hp_hheR2p=Bn?gEuQ{6{?`BOANNWa7(Se9uKmg!sc>N8YUf0D10Q`$Zvhqk
z9IpgZrL(FzLAw}2*%rL!VDZkK8L6K&ae*O+&odR#NfR7y2RSx|91c{PdE~Oo^UQE&
za3}3yVh=dEv#iX|W#o<5SD*4xQKs%k-~4vLgP9)JE=~BvD-_T7U%X|1OZtazihsQR
z_d4?ApPUr0@%N3JxR&y1%d0z|IK{L47irn=G5=t|^bc=;-SYl0|L1YVAI^Uc3w(dR
ziL=CKq2(ve#dTiC8#kq0nLgp=gqQQ?DSqO<onrIwrl;#el^xf%O#9y;bf@rBjB}R1
z`f9~11t~W1D-^GKI9+*^+Bspvk;DfbXU_`WyQ1E3V8R25M^<N2ekqlHWm#U>6tRa<
zH@V}ZWy{y9e=&;3O;jx1GUnzfxSPxLi#}hnL)}$%@yAYm*;k$A=Orh65#M;ChVOGH
zf5ewbGTL1NM_zS&=*v}q)M3o#ueB-qotg8(A2-_j{uDXyysLa-honsJrJ#Z%8+RRw
zkUp}hu!GyqS$d}8<l_^ja{qdATcUA+;PRT5RVs!OkAxOQY&@yX=A}BxLUy8<ukxuv
zmCun%();E(_kAdm)cfMxQZ6L$!&0MS6Bo;(-2yu}IooP`k_2QN5*1@xTuqvjCN}#_
zl5msZ(qju<s2iy0mcCg)MJdZQlh40#uByP6B7wC6jXYDNTv{cjg&g6un6pZF4^x4N
z)83QUBpDy6-gJ56q{@~*L6|40yx<A5)zP^VZaHL~+mcnSqBUjCiE?MplZEW@N<XeH
z2{IDaKBCBAUwv`f3=Y=%+%23pt+f|d^Y1K{WMDtWq40Hw<*P=XUq<RIr`D}KJ6n61
zedpia`p%!e=PTZ-FI%-cJK4mjMJ?=m<*X~hhgih6NQN>B9B53EJsK%{JwQRjlDXB_
zM(APG#<1R!)iukVjItlhGF{UAL13XIm!N<Kg9^japooQn{J&?#Et<7=!?Z)Q+79x8
z&$?-t5TX(01wKGRYG=`|V+D*_w}K*<2{q4FZ`HK6ObY+SspDE?ah_fH$KNjp=6`+q
zXWhg<Z>voD{wyiExyj0p!<~PP>dsf&>H^xuPfYl#e*KXB;m2P*+;7W-g&dx6J3=`>
z!Sj@|&`*ioK{f4d!awgmO|s|tZOGj-W#L3;jwAlJIlO%qW=z++G5JUKgHIM`1SPL@
z_PWS2G%QLoD?X_=QQ_mESbyHhy%F|`2mgkfPwW)xXz=PYnbzMPbwgU@+oa_mC%PnQ
zZF6^(n700a%3jM4C#GN4Z+)_d+og}YQSkO1)xaZ~Q40^al>|lAGYUMruX*Bc^G8qn
zs$UlKn=D@1RUhCyA*r(IyeZ3~>UARiN2;ByWlmbYI>DiE;(x@aBBohhe%k`x9r>fU
zS8>x(!4RQPt#nV8jiRYtM^uZFVy+*1veSHi$M5%jlOGnJywKrR#<g>ilax`8Q^oC;
zAJe7PAN>jBv%7U`9^cHgFx$9^itHQ?Go2*Q)%aU3Q7qGCXKFAs73pCU7W8nKYvrpb
zz*4YyBb%RQ=Y-#XS~^(Vg%)n}*FDbpYOCbkC-0qCDi%yw@u>8LU{90sBtJKqmQyMZ
z`BLmpe9`YozUq~_$XDZ#ir~}W#O*A9{1<#ab;CuN-)=(YY9j#wm2ZJ5*HWekmGXGi
z2pP6`DV=PYby-gO`lckKltqGuKf{^x*#G`~tiL;E*ZbF7mTsB2N4q09<V3Civz1pp
z_iW0YJ6q<@p@_|<JFd$=n0a=yC?8YC%*eDUoP6S5M-!iMY_77E*uAOlT?_jm^K<&_
zS8i^)e=pc0YIWCw&svh{kqK+Ji1GUbdfi#o@~w3uvp4rek6T|q*-iJ?ynM!H=c=$T
zrqK%vGu^rh%CneO8k(#NnJ+YdqF=T$ljWY1f<c|$8nRvOC7(aCd2Z$KW%55FaVNB5
z;>z70%s1_nx>(D{$b%UCpP>7Avcbs%-M5lIbJg^pk*VDBL1IZl+r4#ODmJs5l+C%E
z_tm&7eoua}Qvc}p6`d9SR{rxhG!(d~$1A;LSu8N|^3=d(%Sr@h@tLoU@YvB5(NoCe
ztazkG#QjhW!}K0o1@A+1Sf<UIyQWw1wS4(mk$YV7J2t*Qf28ue>><#VD7DiMsr+%C
zrTik#&aKW_?#{VI5osqc_AH&YY-Zn5b)Iuwih4VF#g4RXblK(4tXj9`OjhXKoxdK}
zuJtdTvuK*dS0+Y74nqzHMgxr=iIzl@j)Sx0wf(O5<iDy5x6N5P&+(T;FUxth4>w-l
zUZLQzp|`hH?fSi=0nKOMeMzl!IWZ}u!Pi&0Xd<TxXLh;DHpx$sQY{NMo%5aW`jk+n
z*ye+bEi>JGPpcJH9po15I5x-cR7cdKBYR{Y7S+6YQen2EWue1osa0NHi<Da3_U%5T
z*BaZaHsi^Xq>kr1LY6L6%C(nS8L?rG#*v;R0r!2@0*zB$1ippW-eqZHR&Z-&s^mGy
z<F;XH*OMcMj1wk0ew!!pAknM#YT|i|A1M=cnth{Rc=(^5n9r>kT<6{NxHDn(-F#)u
z&s(%kgoI>?@amNmF*o_~**7>bssCKEI9i$Oqg3{k4$spoyKbi_I%F_58@cx1?^hA(
zTBO$EX47QAJ=<M_$HeK}^@+XvG>%2e-W5B`AF(xX&ux=Q5ntvX{<gpH)%t5C`p-`Y
z9=XYMr@Q~0n~2sWL34hY_{H2@v+P!>rN5Tn5P9;4SI<m?Ycn#K{-2o8x_7%+R-src
z_jJ35nl{E4|4JL*KKQ|Q@pbbFPkAC2FfgzgJ(1;KVtLQeD4zX5k-m%VW=4i%al#+}
zh)><mbUx=)MBtO7b2Y$~I)kx}&<bT%ovRD^E}Ne6o~N<s6qC+6y;G_iUhVy6Vq~tJ
zQ##*zQc&~avpk)NJ`IzS6*l;M+*qdUFlY5e^=T|Sbf5dPE?t$aAAQ?so9M@B=O6|3
ziq0$(P-l)|3hT+L2jaCwGVTUWrJE#jIjzk&BJT@aC@y0EWXQjMzJ^DA`|UR``Jz8&
zw9o!^kY}$_)w&qhs-CyUPMEv-8%s<-rN3`Ujrd89zzf0?C0c75HP@>rOHX(raIPds
zZtstK?w_W<T@+_J$<OzbpjK+BfT4%gQNa$yDJLdAjSE(N6IbUg-K;Fba;ZAX(6TTs
zBc;`(O`gR&oMTGz_qWC;t@ZLZAC2BJm#g%YW%@o9{=St?J^?4&A}u2p6@Lty`8%M)
z$E8!r^}$lLcA*1@SMG?^FRHOs{m_5UWv}5=p;-Sce+{O91~<MbeNLHt;uF-je7EF}
zFI*Hi?Zpu*-bGbw3Kc@s`;JaXYiPTDY>BP>gG+I<Ep-;3T+ZSwSkSX{V$2!ABRjsC
zv^&0R`ISG*<CpTf3BB`FCtaI7-K9l!(k7+6N}jrcIrG|_C8w)4_8J}C^6=biMZ?V&
zCp_2q@TeZCXtY#P2${%I<X~?o?6~{FoSnu?HW`0n$<S-yonz!ZKWIj4%tUdUxpysj
z&9+QuJS?=mVd_Vfg3D{x?9~=%ST4NUr_U);NNOoxz@yNOX^B2<YMCwy9;%V|H9Q<o
zEj@l>!rx`x>JQXXCbsSGnJ(oORaI1%a{OjXdfWnWNBs;z7Apq%RhPaQ<{a?OIN&<D
z^})sP$20#d=bgRy@!P4a%chopIjebmTZNd)Oov&2zs%^K-80oPENk_@ow2VrZi}ct
zy-+>!4qNpHmj4svzOq{tMc4f9n5}R=<djf##L`o8HP>hHICO1k<m0qr{B<ht<m!l~
zI8|})wSjlmB#YnFOG;Dj=o7Yia$n>69H}=?ZJylEsr2qo`4q~yTU_$=tET<nbO>#@
zvYs@(z+s#6Y4#)aV*;%vlMP&SpLOJvK3V;%qJ8@QWsLSK?2m}_&)Hk%EZVG`mZ-;E
z(;+_L&4jrjZRNL*)V2J%t9-jsP$J>&oREo59&41X4DIGWE{booUzuW6oXT@j$5XB1
zdS>D4jL-=OvQJb9A7|OK{-kFihops>L8ol6vEAo>g$c)Acot?<{96BO;qCoi=KsYt
z?yI&g@HFb=kWhFX{QSMp7omd9Df%CaqH;X7Q@^ZF{B{3?Ku||gF^g50z;uyolP*2Y
zFuLWo;D}Smo*7PppIgkYduAE2Y<j13%O&TGvK%L~>+vgZRRkB#Q|-w#x_|0cQ}mvx
zey3QEuU=SNtgxp0Q53_yBThLih1aE{9=!UY(mLz&irt}K7yl1`9ww+Dwux=;>zluO
z7hdo<wYucDrUL8bi5(8x+)9%oY*^VNrX&{?Z|G=G-IeZearG09ldrzk9C%{NA(AST
z+!MR7=MziKhEDgXzls+cEt$h)|L2up)S{r-i@X2zY-OGpbZ@4-SFZ>EyuNMkkJ=>~
zpO}0>oHd6hKH7Noj@c$gn;2Gl_$df9oMG_h``76^bLIrO_!^`C`HzKn`l!_&v31pp
zdca`Su%R>2VPjx|*hG^9Y0~`W9m`uzKhkW8v0!8~UH{x*Be*21pTMbbByN*IqR?#V
zhsJt$kJYWd^lh%l@2ZU^=T+Sdd5k=NSUWjS{wKj%FP!J$eP(XOw6`ppfm`Cdt(qjf
zm%LDEk!n%qU%<KPq2`OZhy3gWTK0O{s4J*HdLJ_RSM&YOtVKe-EE7|H-85)c;NFz^
zGqmN#A_p_3pdbx_)n28HoK`||qTfB`r`vHV&hKz{oUT0Kz^#pzeSV^$Kh|$}{J~Y*
zAfa*Yan~6$^n`DIIuxswDWf){FY_zI)09uWZgM+rY}#RC_G|Bs<16m1I%)DmxItZ4
zB_M0_tYzvo>1H*|scp{>Jr_8^?_9DsmuGV0_jUW-cFg+SZ@%Vo?)}wYzTVIYK4a(8
z%V~LOrRGJuT`o_b?vLwA`A~YVB$4B(Ix~mAaFEX~XT=GMwk&G?>T;4&+fUru;?@)M
zg2_qsF3TMEQ|S*sKRV$t)BdQs+Ci(GN;Q+`Pkt`LFlR-}!V3!uU9#qMoN;ASa!8(k
z<*Q%YpM^{l=12<dS){(tzx?>ggb;V0NQH^dCkF1j<<h0hq&w@7Bg?BKgM<s}%eH@6
z%riGbWl4zQ%!S9DdLBA&IaIXbRbbwH(LxUigOi8UocZ`==A=GP^ImvdTkm>KNPg+x
z(<yItm+Wtxd(MMJQSY41?Cicufs5UeLX$=M=NF#UY!T+}JlW^_aCNT)!<LS;caJuG
zl>9qy?}R6l_xGr#vKY>|q|`ojlFw-+q0>gGAHJ>Ma4=3LQ2bobrcKkbO4WOkmU}TU
zI4Os9-3;Rs`f%jRmWmAwCR;4?&)tnkoBTU=nQpHRV`Jy~iywcO+&#%-!pmiU?Q6vG
z$s%p8o2q{wT@wCcMY|Q_?EhEm%>RWkaA$<ehMB2Hq-_d}S+irx6x*{sYAYv2KAX|C
zLN;Q{wcw&&qr0-Zq%_)R#{3X1wz)F*V~pW>why+&r>5v~D;(+GRjD`W^}CB!J01l4
z@lU)OdY$DCXx-35K?|1$hTsac?(gR>-+%mMnW@tz9bF!|vc==T<}RnUGt*ulNZBd>
zg+u*s$UfFdKc<(=_j=A^u_UNEZE}B?>Z}t2_dhI(%5PBl&mPa~Sz)$i(XMVKcTTVH
z!va^Ysqc8D@=JZUN1el_mTJC?K8BlCg5nxe5(S%VT2tDp_X~ggrW7~Bt4d==nN4P>
znAOHd_B!5OS*bPlSN7I4vNun9Ve-p&cT$iF)35vg=U;uMTR-)RUlPaDZA%JQDLH1X
zYK*&_A<%TpsqM$S43<3asSj^V;ykrF^Kqy14S~1&wkUpJzVgIky_w?j^=WFw8G0`(
z&EI-PJyo$;{=})$P{6OdxPo;$x1!lf9u}dhyaxYC&sDEJo>``1(-Nto=5g}Nk0=&{
z!e`0eeaTy8+t0r^cf(1#xA(U1SN?_G$Nlc?sAux$oO1j-=S;mVLP@GFVK3C@KVbQB
zkEboU!Ryu|?<1!Jl|HI}p0pxldQwQjKYQ(ZL3>?+pvv5X7cB#$uQs2}`JTvfKySL^
zWbrh8HMUmHnY^{zwYi*r?3rBn>tE3>HV3CYnsK#X_cg~~=Fi`fmcG3vt>(~TiwLKt
ze+=Tks-;h@dr;0;7r@{n!oj*~fih!&n)XDar3_jVbipel9u_r(C<sfi9AYd46<d`R
ze-ALSZs?mN>LA~8Fv@k+7A1iTHQ|as?EZzQ?4NqQy|cQ0#pJDB%5{-SFFk|Scv*91
zstf##|9E8nt7bd@1C6RJjBNYPPMsj(cCBN+<+n!rS^K9k*@xC-xKFwIP=0UCnTAY(
zjqQq4pB}NQKe8y6S-_jgPg|{}&dXQqlcLZz<~G$pvD3e5IE}>Typ?eIAdytDCHeXU
zuV)_Oo+qs;dH4+GNl%^pMpmOWE^sR2hsjU98lD)Jp8PmpS@=u(md?ucmsM(CH><Bd
zc;@<BZ<fc06nEVf))!N~Im!F{e+Q30LEl!&9bT`z_u$P%;X-APQ(k@CBwKvG<xlKB
z<yrFsP9NX$s#B?kWwlOc%A|)f52nSbf8@7NjcAFKubUJy$yzp3idX4r)G3EDA3Mg7
zmKsLgJn2nK7oI7qdV48)nc8cWm$Axgm!~@%ob^PZH6&~6OJQfv>rY!M-ks^oW^)Pn
zR`TWqV-C;r45KSf0SdQ`cNqRyn^^IEhOof1ps0r32VVuf@Nj8!(%_tzsNy^IysO+X
zNtIc#nw&>wrU~mj;&NFx*EdH?_tb-}jhY-2T<0EgVchEYe-f7jj|&5r>><uW%p(8V
zC%G@3x?tJ~UZztTXHQ+dxyiZglJlw!jWfhAPhi}{TIImIaChF>2NsshGoF<nFT7Xo
zUiq4N&wuV~-yJ?PR5DokB``ODx`j_1ZcNAlub27y>F=LL<HgfC75`72=VftS=;Ds`
zJ_#yQS$uX~s$=-J(0<bX#FJIpA1u5d<<B{p`BLzVpGv|%-zS|1k8`<rszj+Ya&(FY
z$0(kt337g@etD97ww<l;5A|<r+IwuicptYF+jM6_Ms<A8+XIe;5A^)DD?~|Nck*BQ
zW5N7MfAYL1eVrudx5@kX+zEZw^8_D7-Zq;v@67pUVhVe+8d&oM=0>WBy0rfM%4aaI
zsFGJKX5MBU_jISSIbMPmFZ5-ct+;9!Ii{?;+4AAHQjK)O<V$fULf)N9dBo*wsMP%W
z|K0~8PmZtToRw+yM6vaOie<gK;`twyk~a@k=S<`Xt$1_JlKa}aX;B_0SdKs2eZ_%u
z8h3QZtPd%lBa8~yPdfJaMA6+yJJCZP*%pSjep}iamu&ogV$n)Tp%X{-93@R1)fHHj
zoD}tCBN9IElbE;J&daq*Im+L?RVCKY#%x2s=mQIL71iTqEo)BxQxtXFa>ma`seHxc
z^%|4poMk5|=QH_RIZ3Wzv{}c}CiGBupG)sQ2B)WsVrEa9^hCyITJF__28-0<8G8eJ
zlueBEGNY9=R8stu!wSU&x^)HT=)B{2Xp|T^*GtnO@s+t&M<?s)O=i<hJ&dwpU|`^!
zSP-}NpK1M!6yXJ+EYEDY^G(O$m3j**OH4L1Fl309$rbfw8{Ct<+utr-GBrklL4sK<
z^TucEcB@4l^1c&|I;MPejrDsrvsLQ^$bfH4bo!PSvvw?DU|?Vtsr-F(PW{q9eQyg7
zhgPVc`%<U8db|CCDFKY@>zP+7{0s;bbYNiMGTF?){OtdYAKy0X=2b*@?+Gf93D~yi
ziq!Pz*H3s#PM&-G=iLg|&378|tsZ@w|Iv2Js^@>Q(vzngn4qweX^Dr!De<PQpM0k1
zPQEkc=*!Sd%}JZGCD!V?x0q<|0Il|V7}&{>1U(MVzW(g>`(IYGY`(!%l$d<%Z_9*c
z*Io4+{IB14I9d1b|C&nv#(&pU&y~3d9Wpd<QQFayr{{lbzkTOF-~F>V_20Gsv{L=3
zf5PQg_erIjQky*Yx)|4<XkWUobK<YmA7!f7sx@tPtve&gaOFhdr_xKZ`bUn~_`Xi6
z3;n1lFY}>DYU<-d6QVkg9234&SX6s?yU@&%X*tG=mzGSD_)w#1v))nt8Gn%QH})wf
ziq6b)z9e++$hpUBi(EF<y7ap&az6Ioq4Uf7Nh)=0Up&iWJUSFv<`i?5KRcQAe!^or
zC*_sjHp;90$myS1!{zQhMb$P%J}lt0@ce5hEB%ijsoDAANdLk4m4ct=TPjXz@d}+?
zxr8^Tc~Rw+bBBxzTGDx@++cZhK<&D)=R%`JCrw)>8yyq*Bg3%uR_9U{u1Oi5q5;ZQ
zuG|VuDFVG>t!;h+#tW7ugdP5}dKYWTCnlLqej+JBoh2>ZdTCw0mvp}hDL(Zqy7}@?
zTM(D&f=LRg$$DE28TsOLr@1(*2^vk|)a$oCtiTjgtWmR^nUzP+lBJ`;OQEA<-W3nk
z|KC2JdZiM2nsw_X*W)T%gi3=r8$8wLI;t6^_-yy+5~;fGvu*y)vX$>bf0sLy@PGbv
zs(sBf*VD1R|MM3~-P#fPOeWs2GfqmTg@ym+ryoTY4GawuFPQ#1FbXg*1Th9g3NSFF
z3yVXJgDsR|PznGo0$iI{TYcvAU8d**0!xFI=>N?Md77Mjb|xR^krV$D=O5LS-(uIR
zeCqXsk0%~171(Idqc1G{<NEJX#Xs5mry1%WO+RK7lWpTv>~T2L@<vjv`AwF8YyV}O
ztoyO=<)k0n50he;%iN`sTw67cs~@ORv}gUzF#pDT1Hng2)fxnj#wxONc$r)h{`r%?
zy4G0v<K<_WT9rlRSqZ8OGz7W}+QnX^*xdXW(tdOQQkQ+*RUf9;J$bUJM>*Me>W3#^
zZeOZheI#Y|uRGmkE|rcPMun;#>e({>zNb$9&_Cz0$Nu3YJDtxH<)ywoviU9I^sX$0
zktZccdY_g1!~&sM_C=GP-&r$lHGhAM6U!#C)dJJo6#E@5pD8!Zb8}2R{^a2Qhk86M
z9+q}3yO~_x`JXb}Y51Y=jZmQ0q;__BtEyQiRF3hy@QkQ`Io0{+iDf>#MM6$(E`=UP
z+8YkOj}K8$yw&p1&CRlt!KF#3w6v+6lZCH*g2M`x8Ix{x?p^Z0bMrH=S5r7Or-dpC
zb#xrm;d?s0%|UhgF_%Oo#|Z)*4EL-jpFYkW)61yDF+tI@VJ>TQ%Y+t}Ru#u<f$uf-
zt)u6kw(q{E^4McaZb=ZU?aS@IoOBl(O^f(?ws6<)%6*R)7h6^tt6crDRDMQE)L*j+
zPL|HB@wcxTD=;vyPhxLnU{q}}p5qjvuvdYhkm-6z3`=%_gN}mN0;W5RVy5fQsqs#8
zYEA&HRAKmk(sAC3;(W`M5B=t4HPt@4!{;E#`SX#i_@N^;D_=X=_??vc(PSoiIri?)
z7xQ2197<qNV-cS1GuN}_b!zAL`L04w_H%Ytt=|G#u@zl&Q=>siFhS}2&AYNZeyWeE
z>|FdeRM^b#|MSKF>b?pUe{aF}b!N(EjaX7nDi=-n{j90bZN>Id(n`XqGG?YtT*AD|
zEy9nhKk+?UQnbfqV)>G75{m*C`n<nz&vOF%-=LxbmybW2D)(Hpc~;hRc*4==9cPYA
zI`ZPgxrZv{l{x}O-MI?MCypGqndhwMZ@E}~?XAhzpLCkt<gGg~$4|-I_~40m78f=D
zcud>W<`k)!bKu$JjRJEtjyyWAE_Km*>zj%t8i^c7rl#LftYO(Ofuqe*T8LwVWq^xQ
zlNSHS7uzZhvWP8-v@qthS(3bEiU&)i)y70kan9gbo(3n~BGs2&5@R^T>9dR}zt16*
zqid(?`88SB$~aESrxmlmT)j}hSm$s`QDL~_L4~Sfl}m*WiofstzI1_~!716xJ2+UZ
zonCVq%Xd0_cWYSi=KB1Ca}5fD0t|LalD(bPCwbD|i5<;J5_2l;s`=2Gdg}H6{?EUD
zvK3wVF^~U${r;v6Uh7wXI$vJ@@6)*ku2qYUX$CD7;n5JBFKg}M8?9mTu&>LAdE)kd
zx3aqa+GBzX^Zg`wE`Vw;CeFnD$4=#~$y49>MViT`^Gwzm{qIX7_nmgQ+`Tqt#-ek{
zE)feN7wg*1&OUG=E_uRPhE$oCdp9|~>7E=CQIo&+3hViwy<LZMXQb`_I{*LqQ&#&=
z|JVP!{#8w_%?k$RFPF2rzFqwFx7Dd)nNNj8sfiPt`eOypk{!YhTk>A5(>*J>{M0$^
z3oE8+ssH5<)=;hLQ!8+2Sa!+Eh1<YT$KDQ7wRM9QsItHREScYaHow8`o<(napkYQ}
zq{_*)>JQI%viwi@zeUM@@%f2IYHT+g|Ga7GA`7(%TMu|tZ-3D#m;HCaiGM5XIaKP{
zpLQl}Sn~e)iJYA?`OSX#w6EIFe8hga`6Gkr$MU6w9(irQ<m~PjJi&7>!+gd+29tiB
z{Sq>L<sQcL$Da2N2<Vlx-j@}4xo^GVXW^HD`ZMcVRG<D+@OsO=Ue13>kx=dO6D|L=
zYImysDEcMl{wn|aC3^{*xh@}0o%eIo7goQ#vy(^tXpr5M?=R&Ju2EjnzQ6Oq(#f|9
zf@&roc=BiEd5&L~7fmcxvO0UWd#=Kpn|!h7JBt7F3SX?^Xmh{Q^2c$3qRx`rUFuUG
zr2H^<SHJLG#Ug!*$CK=d>x^_&uR648XE;yH>v)<hbdqCsf?&@h%}6U#&eO}CZJMKe
z*8F{M^pS~G{fhCfPn)X4lD4zolG&MI)*2UX^2T4_%DwV`_KVL?@7undLt=Y^$O?nX
zG$&ah&Z(P~p1TXQvJ0v*8A^#6zTiyE5!#S5Z&7Z^pG7*h@>vT-7wnw!<W`Mf1<$8^
zO+#OGua#C>Q+N;Y%s8Gc(%9#auCVNxP)^E{o2<9l7GCppojXyJqicoGVvgR0v78Et
z>7kRI^1eM$5e)WRsq*HQdfJh5mpJ030!}I%J9lP!)Orc?{|~o*ye>U^ndQRV?oCD!
z3|!&&Km5B>(ZGI!Uzgebm;-~aC1XGjgWKN&(mFdNrmf~+U=S@3(NodW)35*!Jq9+i
zC<JkExPcmc^83E9EJ#{v_2k~~HA_-dTAxpRGoSNBy$Ju(BQ{^xC~iI>5a!l3$yu>e
z{o(qBPb{{7@=^Wv{9UJh)OV#|Q<aL}9iP|5`9C&)95DUW`ENSzKiB`~d1Swme~(M8
z{Sw6~d;IO3B6JSgW&eDq`s?=}?MXkczIBRAI5PS4VfFTN0#D|dD1N+tkmWy9yyl7D
z-yeRIclpF*8T(U*uce*k^GW{FI*$`|+aFE(-T1*}`kWdM?|Gk{B2TE4%r2_&f555V
zG;gc&NAU+2<<%=a`unR3<+vNy9Pnq{qk8U{;QKo^)2>gMbpPeaw8u{bgqxeyH(Z@?
z>Z4GMcZgRbXLO~tLC5(yzE+PS)@+oimsCEn<EEVLwG&_LO5A6EG_t&@^5(;9;U1w&
zZ#;8)-0CFy4hy<uc3wIwbm3!?jr7b3vnQ^3bx})ETF=>fuKI*~4<;}7{5oZ|RQ&f2
zb2+!kC+&LIebUj{E!67A>(ORZ?=G15%0rUn$=#fJOn(>Zblf%4dS@#VlwN3N7hWi$
zwj*ZC3Q1vW_7#4MJIrQktySjY6=s>nwo~U-=gri+N&c|~D{G$!%}kv7QF%j;){!M%
zuYwFe89y-+GF+12ux69EPyBz*#4az9mA4dmVtDvWr)!!1@)L0?c;q>u<8Y>ik*>#)
zltWJz1>W)YZc-I?@!i<z(BPHgq&qp_vFCCjhl)=YC#KpZbY{G%G|J61o~^pV;$hnI
zseiVx#5C>Ob)@3*yK}KkpKV;lB$vw{e%kr`<iZK+PbBv9I@auep74l~QTVy1N}0q5
zhMxz1W|$o~zwiK~0Aea}hkVG#LjoR0=RbZSHSN>Q2qO-49p?!Z0`2quPVjy(Ut*1-
z&~Bjy@t1p>-2(DH?qON7+0sdtdzZkM@`;{b-p@+etNfv7`cmtcx_2HKZg5GT)+OdO
zU)ZtT`0hD{*oA!mE9OtCbd>M9_f^GSe9{Md$#q|kR6G|@o;tzd_+Lc}H+4&yB1el!
zo%fFj-F?aNL;X0*=70LC5AIFz?tA9B@1P{}E|r^$)$4xR=sv2DjkJBXW!|-eyw8o*
z5Bw68XDUi!T;x@ADB#h=oh3apj6Sa@P4{p)&ocKF%ZHsz()afHANVt&f1>qFK88h6
zb6@H-uGlhhIol;o4d=~I?q2S6RAk;VGsere$@F_z-kNFC1-$OrOp|N%XXpz57P_zW
z+5DBevjv12Svr;pE)?kzpV;<EJ5f}i(=mW~<z9Y|f8T%poicIa^BtTVbCyI*QV|u%
zl|8V`PC!z8sr9>UdSz_q);M3Yd35ymBR6xyiJTSur>nzkZGJ5B)l^_!H+y3nkCKxc
zi(cI9C3{4qT)HQo*<sA7bEHKxLMFK)(o1hTuXa9r^8d$Qo80s*gQtZ^8Fg!mG(5kz
zcl(sOyY+UgoD9nZp7?zK<2getqncgy;xwmekAHY9i+j>nw#Yr7UHq9`?%sk^{ye>G
zEg63^x4pQt)o`5#1H)kfkNvA&+AX+!V~b?w9MLeNL%WoscuG6X|B1i5@c2!*nf2bQ
znm?+;6mAGO%Kq;cS<{d#a#zudX~{iXx1&nu?97`>WcGM^$o|)qHwwDR!d)X3$nUW`
zC8}miPPE{rI`OxbClXzXgD<8g`y@r>-8Q&xyz;Jj-rTvOk+ze}ingalR4P30tSGPh
zw*Glm|6RxDT{}_^gNir?1_pQA>sx*2C={QnT(f8WWoNAwv9)<Dip@?h5AsZ^*57d5
z%b#PDm&3)vv|!^MydKRb(*>q_WqnRRnv-><XZP8~Vsp3d`LyYlgrS4Cipkl2a3R{T
zAVxzi3cUI#+x294DTBa+R|lU99Dn2)tI6}sqV=z)eqPO6<v(n7A%fppCKWvrIDA!|
zdx7we(w{c!zpwq(nE21jK9=K8(c>jXb0wySK0Z>lok#IVovq-{{eR*){&Vj?#VFr9
z_d{RZ)#ZK4C(2Z;!hMyWt+y2Vef!65)ldFg1-G^<Pk6>+$u6PzY<}j1A3MJ;nEv4Y
zi$eXV5~V8MPT6+?v#T=gby7TQOy8u`?Wna-esP;+=XUcopHkjuc|O~!^5dUJ|GYh?
z+-GEYg!gzgo;+D-C9&!vOX;49zN&q;g<@^Bk-~B6e2g+~0zHlIq#acRCTU8VIxjLv
zS#wt9;}aDDs}nA!*8>`ICLQuoY5dWfBgC(|o1JBvSFh(RnRmVYO^!-VLb5`kx<cFe
z-npi58a$cA<QaQeq5GCg$J8zdgIqJAUYi+)EQL3e^r!gE`XqV6%Z)#x&z!kTD<)o0
zU313=)7>X7rh4&Ia0!&|^biQ^);pH`=#^){6v-1Tm;V$o2x^>G39Jkh@!!7W_Lky}
zD=b;&X<fYgI(SBxpt@LF>~f9?Y0o`b&T7>h6x1$|i4@ru*!n1FjZlD}%fSWl$7ZtB
zF4yTRl@vWN)&J`yMSHo3#s6+f<f&LJS+My*?#-42lb!{I3$-Rzu1e=IvZy+>M5#GP
z$-Vq}Cr35+iJi+C^<Jqk1}D|qi?|hq$Ei#_EYvyW?}C<F?gyTlF1<3j=kTuA`5du|
z5{3@S)0g#6UTXf~aO302vmIJQSM~WZvRVY&_;VP>tWq$G6Lh_F$%x~`=BeCUPAST1
zPq(?|u&8C3?kWE5UaV8SO~sSNF4~6jOljv<_gMCPvF>*{rmNnrf?ii^Yzz0iuT@sp
z{N3#K({1@?U3I66e!?%FY`UgxzI3uqz^bo1yc_1qP2-p${LA2;s+^^p=Z8ZD?!Cs7
zc0^0)PqjGQ!0<0}{lkPE6I7OfC)^qJUiU_D^zWK*Jxp<LB%k4I#UnK~&KKwNaDLCf
zH&I@-=D7O3zd`<2>I&80|L;oq=UaPU{m1jaPXzz-#w?O=tv8(TQ}g8_x$U0?`T4#T
z$=<%T#IEx1YlZdv0uQ4mNuBZJJ<gzbWJ`+g^Q3sLJv01ISRAUEJzw}<@uynr!z_Ph
zKPi%Wzp3SQqDK*j%ch?}<*S4b&uF*(k|-ZmQPQq>%4^moDXSwo2_8m2ZPJBy8>DOu
zIC-|xWXar4p)Egy8kL2r76>^gDX3j3<X}yibBAMR9P6?}$>N+0QzXI*GZy`|Xny|q
zhDx`>Waf7&q6`gkzC0>RCUR~N`MdqoP9C|X-|Ba)tSdil=^h+B@$0Eew+_k1D(^1j
zKlIRO!q-lb?xM*;CudC7v1u1R=;@-%)8xT_cSn}(cmG|#*pye;8_6Cp`Xg1bpz(U`
z@y)4@XC8jkS)(~sMN03wN>S|ck2=kI{<=P0H(gFVN{qek(0s-#rSp=nGuu0UrpfY}
zLMbfE<~@1Im;dYgHP42!2hpMFd*>cL7P9X2tFEjTu?y>8H&&;eDpSpS7&o8&<=fp7
z-(;E(uU)lFY1{kb1y(yRoP3zPtpDFBLBXCox_hqnB&=APxivf@t8P16Nxt9bgdQC)
z&*;)uXH;z+f7`rY*#4$|$MyQFO9db1o}06HO~N$KBFC%G*rrH4V>DNc57xis<YBe{
zMYR1|`Bk4Li)bI)+RyaVt~gykf%&_k^5K0itQh2XcJ^4mYhKX)t}<Ehc!lroqo%7K
zF!GtM?~!(BJ#++A<TCzO5MasD{Pb|Y;ENl*(_Sn-xsZjmTP^N__hbI{lPM>gLT5g>
zrv9V+&7VzzYJwXR=6y=!XZro7fA^nF-jCUDUy6_WXCU->d4%fmk177~+d6lhR8H$C
z)O+T9Y_81wyswS=3|}AFS-d;4XZKShYrZX>dpKqpsp!S3H$8Cq+26tWOZ(Q7e}Xmo
zs?Y7roX^HAnV<Ps#Y(MZn!SM+Tf?LsoP7L8IbLN7Ke>Ble$_J-6FF_gM;tD{zE!jb
zTnLPH>5qNvBAqhLSi-w6t0QqoAm8k(9u>k%1Rq!n)qY<*;mX$-`Po9<7uVifZI#3s
zd0F5=9Gl{8n*irS|MV5FS5#WbBuqQx$-CC5WOY=Jnxk;9N(+aJM!>I`Ha4HiTpU)O
z+RS;eJ8;J&UL&LPmKIrRVre!rvKYS2N`I6Za4qTOyhW#2*ov6s1rHu>{nYpJl~?$N
z|MvS&_EcDYnSRx*%~^Eo`F(p&$$q_StN7hJpl<i&yGMK&|KHyg#k`x*PPi;nN=E5}
z{K41x@4s7h1$qe@hdiq+Xc9d+$<<=VwPzc;1b!w=<k~&=>#-T`i)tT9`gmH~e>PrV
zwb(AmeBbu3XT9J4PoK3h|Lw2m1&21PTP~fq-+Sx!|8`~jlfxD*-gj}*oK*_~xHOx2
zB9}ETQxW<3u}_xyc#Z$t{Kc)xx(s@nlgv!+-Tdpd$Y|N_&u#hqJ8GBSmd>8zE%SNl
zHKtTmyP6p{wHMu&G-2Dt%TaYh-1A@j%Sly+KjaJC<vzaWRX^Ih%tG_ZYIXg)az9)$
zoy9U8m=FI^{8Lr2TIsI;hYv-51`Mn&v#)6cbS-nct;xmv!FWw<qvnBCuFM6G%%?V2
z2dy>HFxB|d6>+6A{-LHo%Yt)fbKdeOZM&uEWvZ?{^RhQ*uae)1*RJPFwl?m%e}LiN
zY_ne?T)#uuewA?jE@NQdQn7#GZ;rixO8&M!_<KO{sLdw7{_9Oq_GJ^kK0oJFv;3mU
zZ~jEp8#Rmk6Q2rM>zzDwOu6T#&@nfKPpzKnpKb5|slD8y;V<`p)AR0cMt9{J#Exz@
z>UOx?bU)INgK2~KzVGw?AM#6AG~uvsz44)d|M~y=`JU$<s9&rJ@IP(Wt^9ud6qjoL
zDW1je11I!Kw=Q!GIeBM-o1@X1{%iG$q7P04xXZ8|V+x)>H~#;EhyKUaK5VXt+@pDj
zpXG^h=ZVV7pOtfdd@g1wsoSIY`n=AB7klr_KOx`zWW}!!?ML}tC;yuMaFX1w8v<`^
zmb4$%uND01`eKsZp{FW;-Jc(lIFrb)UHnKUS;Bu)PDk;JBd<idPx-8W=;h@7u4xjJ
zu7N;9pA{1ayNtWQyycq|Sl01L9<G@FqOQ%jg>}DhAJ=_x4(-C)$v4GbomgA<dy_U}
z;n8b?%q`+vdV;l+-+Q=-Chf1hDaj@O{KR?p=M6?uI!_`e{I_D9@bJTg|F{1h;Q!9A
zKK<0E%DV6G_snzP>zllI`4X1j`@1-QEM7G6(Y!s1cmMHEyiv1A`BV1|mw%?U!irDj
zdtEHQuQ>T;v0(4{&Y#f|9_zWL3wb_W)8gW-q<Z`Gy!z8NHS5d2rJqi@_^){XCV~C7
zfy?*Xzh5BKvFk|O7j4yv0(;-?(=nDg=fFH)ZQoH#hD{4LROm=?oA5a>$cZ;UtgsVr
zV%MI0>dB{_HftMX*3K@=%>~^f_Cx<+1*m}I@6=;p%6+h;A|%K<kHvjz@g}?N*9HDK
z*M0Y_dHIvU`*K{V>a+W1lRlT9-6UuKw_osQ_Ah4TFNYsplsBu2>7TU6&iSGJvJ?N(
z|H^rPFZ|pvePgAd%^oG$`NxjfEv_w5{;vJ=vcONy_mkqje2Uat`gW3>_8;vDf7aBq
zAF8jdPi?8YRP##t)p?Z(->*w6ehPjiDSuF2Q}xID-z?szW4BBfx*+_%E=cu5{LK{m
z`1;90@yp)NaM{y)vh(-!ox(rXew;A>?0>O|FO^SC<9zT^_<_xl`33J<4DYJUwO*3-
z#Ix(9t=*a_6)A-`Iz3%htIRXz4$eA}yz$7^PaKD5xv?Y_ESS*QqTG7m{Pna-mG^h1
zt)I}`qTblz_NCj8(QMn6LlX}jRBg?5lWODT6B1N9xpkF$)m#TB?zIjUlTJ9K&lPht
zIdP4bPsvzu!h)5bIh<@ZWp&Rrx^(CEvX`6|izH@D-BX!(*>LHuqDf0zH+XL8xVl_Z
zV)BYf%MQ6mnp}Ro?fFxIXwS6~uIsb9E4f}P{rdFDW82lNRj({LHl92)-F))t<I@)l
zoJ#39F~!fN^Si$K*~y!psHb(Fc)+qK=j3A*!6R?8D?S}Fm=m!@Cn@5_lE-damCj7E
zh?r1$&vb>JfQdmy!F)A=TL+5@7A!x}B+6kKp|XJcvF8Mq69Eb9c#d83%1U7>p4>Tk
z!JLmqM$x8reoa+fOLM#K{XFP+<<HDN``7c|{1?E$tGC&OcN62&n;Y(*V7u`085iT_
z2R{^EFn+9AP{~*D`}z!i3%UL4n13{Uo3}&m2LIkq%_pBT{!y^qAh$zq`!;?Je(Q4#
z{K_X7*e9Q3kY#tMj0<2WU=%Q2qagj`j7|b51^(ab<ru@a!_2>}p>eIi^)HX)mVXP>
z_xZXghFwwr_}}oat4^2H@0zbFaO+Ly&(Bv|YW<%U*>OFI_^$8J>-Kc5m*2UNdG{wz
z{*=GMqafaM(pU58ofXVWCOx{%6p<hx^=WnI#WR!d6g;Z6KkBlF`R0iq>1SMOG*_tZ
zeWtS0&Zc{TzyXQ5F*EJ>&v1TOch3KqZArWL=SX{*XD5DEUuCJaeLBgG=gE@_vxeT7
zgI05V=B=M4{PNF%Y0gswU)ZKhV_quscu!pVnI|V&93$p!TWJ@(Mg81vM!V~WJBr_%
z3NL)2B&)uB(x?ADoJAQ+k_wuPN>ZM^^!&Jv$$xr<!u&>yOuoYpCsp5bbDw)rb>*o^
zYl2E1d3}3TCdkrMk+Gz+Ln%A*Qlw6C$Ds{wrcUMa{7j3AmJ2tEOj_W*X|-h~%S?u%
zpR0{0Gjeupk$m^?X0w`5K%a{7Ga(gy{xz4UHf~<(awYZsxvZJ4*4)3NvX+`3^Ip5c
z`BTt}b<MHL7iQ^Q;?uR~er;-@>ae?YQtAJB-`AXw-(h^@OU<>MO{@-QKHmGU#_(tN
z&Z067wbswt>~W`)JpZ=|e)zp)rt`fg4Enh*^xEI8EVVs(S*&xv|BK>JcJK4KcAELP
z-m4C9>8s~Ex!~3Gw?1}HQaTL|Bq<#gVYnEu)az0aqm2b?aL=RHM*@~>wl*gf-^!ac
zjpO%;ZxgIcHJ48*Qdtmp*iGjZcZBtS(;bhw3*<c~eNYNuK6pBfZM*2!7nd%)VtTUd
zw6BW}$9?Cyj!E66h1=Jci&?)Hp3wJdeJs~oL)q?QyUqW%FS@=|R=b*U-ONWHIi?#e
z+dVO~!*%W4uIz;#2eQ0Bu6ZH3(Ybh|Y_8PW+oDNRrhe17bZ7ot{$7^VKD-A~ouqHZ
z2CR-06`kO+XNszYQ{b#@S=BJh1M8&D{nA(>GP%z)>U821x#u0*MCW<E++}n@X-i0{
zr_+^f>X5<gt^-N}pb5F-7vz*$`5sxHRFM+$I5puwZL|7U?JsKXz29TjGsg?Ru{|*T
zY56xj=ZEamI+G0kGdM5!rO|##{xYZj?earSaXUXS+A*rkXLxz!&&p2%)8GI8(BS>@
z_@PB{!cUickA75F`eTdtr_H}qRUh?>PWsBezO#f+!TD5eM*Eii<}JUrH+B5bZW6rw
zmLu=aJm;78k5c+q)JL(@EB%k)`1Sap%jz7C()BGVHPe6IRekk8W70Qq5A~y)p8WXt
zWPb1$Pr0{eJU+@AN;fM^5jy^e<EwXgK&d68O`CL>o4A7Vl;aZ@7H!|-BOz4L@uXme
zlOdZ^1IHpxh6g8}e{d;KSC}Lq(Gj}LcWTGgEiT2nAB9<hN-nrXsf83RSrH_V5mJ2O
z+{6iq6N=R$O6T!(EW2CIdGEsH14*6^hlRUEdS8U@Q<<F-D;-@g;;z0`x%cqI1BVn<
zkK9-3@p$5+TikSb@@csV5}rI3GLbn#Z=I6l_=3DDxdhU0eeHV6I-xe_T~YozH>Dqs
zmVBGMOex49CEjRhyT-!~_i4`dKJ(NC+{K&>7fxv9>0uI->NxJYPRVz!qlYTTO*YnZ
zuB_9Kxh#3PNkLR7OCrQlH^|@b+L<k?)7uM|v~bo;$`Nv8G;mbqlC*fS!cjv>S6^b{
zc`G4q#p^*Qc~u0m-ld#W^U+kA6|OSNNzL8iug9dTfAy~~_gYZPHSqz%t7pv83`x#L
z2K)jH((J#wCVh8(aiV%rai_ERto9w=0u17e5e=RSo<R%38^Cw27&fzjmZ~s+XEC05
z(u$G0!b$SoD(7vVn&J;sUwQJ6DgWfRe-r$tl^u$)-#_v1uD^-Oum1TtFMD{RAg;n`
zcW2$IeO5wWo^6=lX2ofD_*qijs-IoX&*wkbFnvy~hx5801?}B75zY_lFKx1~+Ar2p
zeShYOH&qKQrG@GozB}qKmR-}o&$i9I^UJ2%>xWu?%ZDhw&_8ukf7g8_&cgK>LT_zt
z+Nb=z;Qn~Ms_OUepKQE87=PF^E&g1}k9;Hb$MH*5_F5k3{2?ypKk0)})qfVD7cqPK
zW8S(H?3nYmpzVIcgh$3F^M1&#Wjembcju$(IhC@nI+Hx3`?f7}&hJs5(&8TAB=Ms~
z-eYdgN}a7e{Jh!z>Qgp&{Fr2-e(j>CTqf6zB;Atj932HN*XOBExZqvIWcpa>;C1!+
zCni7ARgACk*}-J_qDB3%!U4e;|AQ0TCn?)K6!729<@$P;$|Nr>WiOorJub`wmwb3;
z?Xr?wZ<M|EMbR1_hDk>(b5w+s0-Up#%vKSa@u{%cx;<QIg8ZrSG@eQ63*4GcoZsgr
z(phS><3r#nbtPy2=8T0WZG;p=vI8@{`#;C+R$o=~%EJ0d^u1QWuWhaRa=)(6_i~tg
zcCPE(?tVWHL(Vo;PR74FZJg?i_E&$t=?ZOF<y!IUwZG%1<FiY*Onh`BrfHYDU!&s!
z#s=A?$1>IwE)Rcka{;U40npl|3*x-JOp3{%3X}i(m;V!Z8f97wuc@gkNEj4^#;#HF
zJo06>Q=Gt$bMA&S56Cy$y9f$fu9%@S`JwET`IBt+IrmkIOkY|P<GrLHbC#!&#_ayg
zd+J-By8OD{$N6*qV!@AVAH_7!7kXs&ZuZWIxu=}@k9L&!ZF29->-=%KL-5maVbv*5
zT((7?uG_mZpkZ^0`5cpWxA#k`%0#Ajm#EYz-d?>f#(Uap)z>PI4CZ;W`Lo6}eqCa;
zK=oLGONG6;@*z`|!W&E#F`W{7c;<;G`L{efDL;MUW2vUt-A#gz|NiNhTsw8Cz@kT1
zHaoe*INzGGY_L?a>e19yThJ%oC-`cnQH=T&p$EJEv<O*Lf(||jHT|l`FkSD!#D_LA
zecL?sG>@Mww%MuRs>sgfbD1-#prxXxs7CFlY_*ZlNta&Z!|I2NST=-mq&*Wp5TeL*
zQf*BSgFufF$J(Zj8OyF+^|;s?c|qw#h|Qt=uu?^?Hf4sC2W|=yEk2sDDHpTlSIn7s
z>YSp6N}|FX)(MNFuFh~WS;VH!DKOVV<-zs?8J<GbCmeI$b1gGuP`c9J-{E^?eskKC
z84m;`w9CU5Z0s&OuF&AuF>Pj^#4MF`50w+U6h)^nX~xcojhxWradK+2+zaE4sf#Cu
zM20SDOL7)JEEI5(<A=bUDS_7Cj`Ae_(AcZ<>ioT}_hSC<?&<T>R9qZvqVcBw>)(5P
z1`G@gNBH-jyWi(f@w=;6^0oQTlb1H!GkWt{MrYE;bh8}+933fvQ%$zSO#UiU>}|IF
zo7W_rHOF_&m@HRx%<N3ai$(vrUEao9aan9PpWN_YiY0qr^ny3rRrjvWlJwXy@5Z~0
z>o!g3RLHTMw?o5lmb}FluB_cj7ujZO9eO-1=;|V!1xGgRI_0@Ed-GgY(@hCYx35J^
z3z-omSCC+t6`7$vsr=EjXXko?HD@tt3-v7Fdb=d9anHGB)lb(eN*Yv0G;Y0kfZ^ZV
zz8{N@co>?m+$>tFB*OO9%)^yIzW)Tj-ND}se<J=i)RY(qytVtFe5251&--50M|^YK
z56{b1eOWIpFn6JPIFtAJItA6oemc(QYIWvM{&kTr^>9bN<wAY6Y|bYclb+n2&{n-Z
zH7AlMn&aaiNy#q7t#kaP4VqOZwDZio`E-eP;>krjIf6JG_}6(XNxrSjB6d1wn!{A3
zo$-44)e^E##rOTu?stBDziwe&v(m{E3R5=CUp0N>b-Pt5&3_HL?fEV#@J+0EY9Pem
z9sBr_%+iSevxVMX@c14()oW4k)wyT0cdy7;wDv}UkG<2&nMs$5Zs&7Poh>M^;rGew
zyQN;Pn|1T$+PQQ0&-?rP4d=VReePG|(iKnF2zVcjsp(gF<<fkc^Yh(AnL-b7DL<EN
zv)RvjdiSI)Vt%~dSMo$Kzwe=(t<G+LuHEpQ6y{vZ%3?i#OUD<Tllq25lj8V3&Xni=
zJ}K_a1;NQjJd14v^`(E*Os~5);nVR+p7$Re`H{!Bw84Ax2c{VB7U7pOf&@%jHr-`t
zt7h@fo;XFwx%Q;RJC-&fPL8Fj-5b0nao(CdncHEN+w6j!In`A>|2Fb|sfsc_wdcy|
zdx1}7yY)ZBeZ5h-e23YTmp#ANw}$Ng@N<u}*qcV~reK>Ti9QiNyE(-(&))w2=kk-+
z2fMf4m^*jw#=|n2Yv-IeQK;_vVQx`~+B@%+jDPk||8z}UnDGF^zn}V7B)E9MC#DH^
zJvJ2Izd-QR*G)C*oKu%hemM8b^qcFCYw{aC>G(YFjQi#)lm6g8Cnx<9{%kOPo6QvO
zV{>h$>wbP{U;3}I<A?Tcft8C?Cp}8pas19A`#3w*iC_EMPJHU)aaX?7S>gOP(awCA
z@(GKX($2~gUbgJrezdcMr_toD!kJ8YnO{4mx7ILu&o7)5TyQ<CpugR03SZLZV}df5
zYTEk*U)<5@lbdO}q#<R}qhODe)Kli2)-wg(|J$m5Yfp~*kvM(j2YwuW;xVchcl?x=
zF0!1dQaaV+-s2}bk1d(DUc+lONAl+-mD6PudoK!Iic?>;#=q;L^37e1vNcBT)4cd3
z=Il|F2@~mKwf*2w+8JOdd5z=pDmM}CB2^YEmB=fmoT~!=aZG6c{(kHBTfcW#1~@f+
z@6&!_$jRMye9!E!?@y+^zUylyoBA#ORo&wQ73tTvt*<*=z)%usxM-{BWuY>$a}TeU
zZ+LQ{#`N$B%V|qx`ffbdS~vIDja)0QzIg>7j#*kWdA*d)7ymG0?)0qoBR|!{FDkTM
zl0KbUX76z5iFy*#-P})Rl05T1s<ocBj`;P{_PcI$9-~B-^d!%`5Q{8tzb{H#+U~E(
zmi)xEbG4(-)y{Lr9<0o1%yRpi9m=$CcJNW}e-$79YF1l)I-a`y#Np5h;hfnAV|GlN
ze3Wx3@8{|BQfECZ7GEv$^!dk??LrYolaHu8U}Q2~KgFE4%i|~m0|WEVf9#+Clzw4o
z-f|=%qr)JS!Be8rG$qE;zskS=H%I@2%Q5bglRMNVva=nx`RUN3sq%fEvihNYw*|iK
z7f?Q0=29xC>gn?IjJn{Gv+e4^y96K2*Y2!g{5nyWg=c+8i>Hb=&+I}KW`nl;R-yOz
zHBVO9+j&p>#5LQ*rBI`_<e^5fXV`-yJKP&QKKOlbFI8#&FulxRYM08AKy}HKj}ecG
zb_McfFHu*M<aSbE{a1If!L4RZ^9<!l50<Ph-V~$uwb9|)ZZ?m{FIOAch1{8RZ^@Jz
zL7h0QW41f3tpw(n33zjQ_PD1uDRHJK%Wvsm>vL%HW9q!wobi|=jm^dCwlIU+k|bA6
zts6#?hbPPw=x2R#WRLg5Cf27cyW|#3^H%UkP&OCyh`3~U&)fXyiOn-6Z1?Hgtowmw
zrqbrg`UxlP7V6CO__$}w+K4{0osuEX3Ma^F8@Nb{m0p_Uwq*8`Tfr64EqN}FThFy9
zv#sR3=@h-{$im~QN;|a;I-RC@F)R-hH1e1;|K_>n%1n~tOs^c(<mYVS<mkELxpm{K
z_^tI$N$+%~PiVe<Kj^>Nt-E(@FL%9qb7yix)zPGl(cxh`9ri5?`SkP6o(m$*9FG^d
zC^<^D1zI?=w^?pr^!oC5r`!257Pk5Aae{?cG%NllhRuEZ)Pq&Itx~-4n&K&W`3L(a
zMz)wREMQ<*Gh_3c`c41Wew{I4lhMcdF7GC8*Y)+?yuRh!+{~~QtTC@^g=Opme;qer
znKer=_4(Wq*~^`k&lUKV2S`3T!TH9lgzvd0hcU;Q7)Sl3clCs_BwA&kZC$6ak84`m
z-YJu=oH?e|x<lq{Z)juO_hi{mdv-ah?fb+g%rRlo_jPr@Prc-t8FhHyiA8D?#k81J
zOBOXNPXsmDE!UU0ak89X5oiLp+ZpN}v=y(4y>-c0S>V;)j{Xbwrx)5OUgZeZv|(>Q
zapX_oq}|VzbnXdVj#1vT%K3hMOH-WkBbDg~IdXnC>K%Xc$)2Zjn$0ZNGaC<U{Myp4
z)!^`n^U0T#4|`usZ`>)nHmX)pF~c);_ewo=c5A*nQ@r9>`lSy|Tro+gD5bD+kxfvf
z&4!b`N1i2n-Zs>8UZc28(q_A)>Y<G;`$W>R;w=Q_xZXc(sS@w<D2wayp6Z|)7bw%B
zB#}4Cd;U>Hb8e$FPVL2$_aE?h_Hjur%Z#;xk7FcScC;)xyzpVBP!P}6Yn35Sy16&q
zz3aT)qd`bF=<1CR$He~}=jyl=uqtSEp2gfLbEYh3kC(gr?na(4dxxdUoIRI+?#h36
zQCB|qo^7_sVSCFLMjOn6j!8=MN`3j<ux;|as?}MEn(?z04`%zduQyxscJ*)niJRZv
zc+c9m_fBzc>-Poe<^m!<O*{P@H`Yeh?X$R4EtxIFb^prkowHqE@*MIP7klryeAY>E
zxyD6%^A~1nMND9AZ~-k6(%X`u%;Lr@25LC-=KVOmdk0h1t`sx1A4Rp%CtUX3t8G`k
zx9cN^;fcq_OX@`b@pJy+c$cy@U0Etov!1u(kJ1-Uc`+Ylr!d}&osagixbG?yN|^iQ
zlkqff>kTs}mvD3%rc{2bn7Z54)MkxSX}aKEnXXq8cl=rD(9^Y~W3}8&sc_De%|cfu
z>9sd(n6kD|SLe+vr+~ZBd-~X2+=~x(y!{Y)(<QiFxnaJYz{Due??!tz1>BW*z`gLT
z;Do!LH~(vRvV47gT6i*JI-8yNg>SM?I~J%mB-s??Urs1-F%SsV=;}4i4C<QogZ+!3
zK~SoH&F^<sWrg!))f2zv-d~i}n0-TTx5q2H8$PZ+i_Y9R#5v`$hXhNK;E7|E^JVuH
zuHG2oSGzZH&7aU&mD8eLg)8lkU$?f~!12yDeZ_*<Q;xZjLNnBDHt4Ca=tkXPdceSD
zxr%{7bhhlf<hh0Gvy(r3`ts!9ZZ&(JkEi!;_sePkU6`?6BwUvBIAc-+0|TGk-g2kl
z?+jitlWOWx{<c1ppUWB9(sadH*rzxDfTFxxMNRW5e`$wVQf@A@PChxlKz;M~m;8GL
zJ%8v4%J5ZmYh3$ecf@U3O5b`N=aYBdOrKw4=RV{4N!J!H=hYe4nme^3#h3K1D{`Ol
z?nJ}{FBR>!wJHK1V|QI*VB=Bp)|>c(+egK37RyhwLK(Z9zR7CtoGv{~f46JAl0AG$
zV#`*hqxxHfB%<T59h|7Ht-a}rq?L;zm;IxTU4{}~>rzr<R!Zb6O`lLRkHLva@6x0w
zPuE3W|9GaXm}!wRoncC*sll5sp6kvXncwFnm96z8%|lsY(}h<8x0kGS4C=n9nLqjO
z^Bpg;6}q^MmMg57)bHGMkJlr!rqVL3*i%Ji!L;^}JvnpY7Ug}a-Zyc{f9rEkvJLlm
z^EQ2xI4dLbj?-FL?-l>K$fWtp7k8ON9zQDL(P%Sw#ftUC0_s*@?LU5gxNut4R#P5^
zvb!&|C!TOnjG8t7^vOxJtxtN-N9E6~;&j~@op*JM&S9gb7w4xPJD301v9k8aQISBG
zj>_i|weKQ+zmR;|c%|&ydiCQMB^y&1n9I+-`tsrJPVtyt9^<Fm6e1G;D6SU1I(eJK
zm5<)j3uVtO3w5Zo(0y~v|LS_Rb3e{q=6o}$bH?$5E2Z9B<}XasnA$R9L))qkf{drw
zW<BuCWSH7k&v<fK(b@~w4!H$aT|0Ya<w@Pij%L}eQ+7(Mnf_+i@vIXMI9GD3WHfQV
zT6Q9G9n%`^Jn+C-!%7D=2hJ9zMGOp_A6NW*er|8&3kHThdjbDu*K^-GKG|JQtK&TJ
z<xK5PojGzm&O7ROJZt#hx@?(x;-T`SH|8gPwx3;6Dq+hz;lq<HJ9}#5-<)cRIKS!a
zCsmd?YvMQCJqiEF)^|63L5Tmki|^)03bbwL)Cp6Nw)nxK;BfB3zT&vamFYKBPBjT6
zJh@aAKJj_ai`;jfJ}YnQ%sSqzrFZE5{fEEIu2mlXaWB8z%;d0q^4Hss5AT0Fr7HZ(
zmG^DwKmMB?v-z)FYZW9Smh)8NaOR0W@~iiyDK9a+S9kGV`eq&}J73n>f7A`1cC4Ga
z?RH3A#-HQfOItQZFIc(UTxI4nw^b{TeOTw&H1kRS#cjrH&8jEW{VqG6Flt-1e94ES
zt$CuRt>T>bH~E<vUAdrkB;4Tb#Ai{q`--*q8Jv-nH2%1H!bw?m)-&7N*QVuse;#{j
zzjt2P<OO;>F^!So9N!ez^7ac)jd#}*dU){g{8C;0`+scX)<<9LT=V4Nq2K8;ceD-8
z2RtgYn6s(q3Fm6<$<2QsFn^LgVBH_Tuf9ia^PaEG6ZjOE53n~p07q01gO7tT2g?yg
zg9g3TeGZRroi7jFTfo3w9p;_7^81m-$BSy0pLVIvd(j&4Wb2Qj=-R6{52$Cd6h#Hu
zoL?w3u~R*BNrFhn@qXXD{(xigytDSs{{CHM(Z2A7%o<G<=Rd@hs#{DE6%x7Ro2gak
zxzy!>gEEIpf>gPA-tYV;1<oN0W(MYI|M=ZB_pz3YWcisD;lF&Zb>^@%&6>t(!gP&i
z8{g?ydf7D=+v~4dRd%K8iQl{ZuaPhB&%VcR<{aLrd!T;HN5!h+4VQD5Nq?_%zwdj0
zXZx)m_jA^(Hl&rG`Ih?YX8*Qzkyq#Zz8zivKOisu_5O|-i{38#Sh)3<(}I&%YIUp9
zWMy-!n6J6k&9yXJxBnT-+@BJ!N@p;$P1unBB;Vt=|Agz8Gpo#(%81`RIb~<I#?4sQ
zlXsq(O*ymTp_TRBKX%`x-QF&}xMSyrD|eo$2Si8N6uL3!zxba0!+&~v_stid9UeX@
zD3_gcC+5$+Izi*d&puSx+&P!0=gw|Wz5n6Y-;Mw5OW*bVe7C#r@pbF=HC5+6zHZ)s
z{{iT}#Q;+t*27MP4l@>jE-t-){{AQScg)(4J^T*~9#7@C*5@nWv9^*;QQG2=gyfU2
zk5(3`xaqvM=$Yw~a7wxJ`h+7%uPs=*eI^}_ROnI>+{!Xj$|-njqeSWE9UiPf)f#Pg
zpRVgI6|V3~T(!S+#^sl{mOr|cHaT7Bj97!^Xa1QjFK7B65HY!XQZy{dH{m-^x8eOl
zjn_*yZ&<OwTKDobw&N;2PEKs6w;s=4)2Voo{rcQk-S(*T;F`3@vwy6f@pZ2d*Oa+e
zALPbgS5Qb;aOHpR!n<>p7Uj#&IQGx3=do?&1doF&H#L9Ts+M#rCjNtUdA;|6A9~J@
zwmm5QS9wuB|6IebrZvy?Z;C#9n-%C~<h*I$bN<68uBG1BJr+~hRw=lkWy`eX2in|E
z3NXsE70dOXVvL?^fBSq+z@`r^j+3Wa9AJ)~I(@m|*`zmjGuvJ}ZwNS<l=6mci`e!x
zQ#a*Z*sAEhkmJdh=0%+wF8zJK@7XJBt25qm=NpQCFl2xF^6RXDjNfzbV22A=v(3IH
zDNWztcD(So`t4oX?Wt>@pS}}*J#BIMmIrq-z6f4&?D@WK+Z#)(3XfG<al4;PT{)9a
z$3;z9#Y-Zq@@mVft|wxhucl4(I3RQJ*7hapg3~;8LYaKdPssJN`LTIwmXKlW!LHZ2
z+73^)2FC8;mx|kXR=9;BAvx=IwnRWz<F>A-Bic_^U0T*0EY1MF9N}SM!xRTs4vs^N
zb@$A#%PZ{9<KKL{`uYThdC|Rsr-BtKTGl;1S&%o;*mvsGsFTXaD|ByHb#R*8Tq9f(
ztSn^6$QN$5bHOo(g$j!MZyR3OCL=Ma!D>yT<Gs)6d|!)H%>7>Ztm!V<nEyhsNoqp4
z!`n-R%c4GOIuu4PS;Dku73ZdO_K3+0dd;atHNDnT8ISK5dig%OWa>|Weal_k(&Qs=
zESrBfoh!7#XT4g`Pbod$-!&6|d4Es7cky}ir@ETfM9cM$t>+*6EFQn+V)e)TlwwgP
zrbISNzneKcH^pU+)_xZ+PH*_PJM8&-@AtO3Ij<rgRr6Qc-e0Bkv9$fSr{eUoBx~cn
zr_Y#vpM1T!i09~w`|{6rtaZuH{OvDZ*m&dlsvqfs&v)EPd=_`@h4z=J7phmy-0Eii
zXzr8q;*)+|xN}%K>C2mmIpS_9t1DQ&6mHzNp0QcX+jgSW2gd!^|K7E^FYEonitXCn
zXZO6z&V}Y}nO*+LQkwVe6#j{4kI0;Jkn!{5b6{?6c=vnx+5_PI=7|~yv;??%m=YOQ
zI98P2?*5>+>Oa@>dD~+O7#Mj1I5x|6)kOuVzbY40=t_@Gc%S?&YwD+J&z9nat6eJt
zm5pV~%ne_!Fa57C_e9ujvclsx58cl-KM2lo-e(sV&i;ObXwE#oN9Sd^pNQW}SmSwC
zGs$q@mFsu&%`{v~d?(w!;+=hLZT0dCY4#^~Sn(ac9%bq7>=5<Fs=>~(`yJQju8Ku7
zk1Uc%xBc&4($9PSzM))RRa*5n^N{b4gI~1YG$?yF{ci2-O*_+7{cAaui}R%&6XkWq
z8%))s`YOr`%a6(L-o@^-y*kvPLOQ+fAV0s!9PK-1T;az=H~l+&R7l(_W%qPl%ZNfx
zZmyo4tm<sb@7k{UJ;^*?!sF)r&X_-;hn1gyDapK;@GkPyzC#cAbUqwD5hs&0Bltga
zss!uv<;o2285U~^7_oG}<;jj;e){m!=MO%LK0dKr&RtHOosVfvH|x5XoqX(aehTcu
zk1Retc_70)@%W75hi4e?lvn?K+#qvYPF;`TpXr*fZrqcY8UqavFfg#czi;)0{T)MB
z%EZ;>EW5mJpX7)?cID%`=KUTG)owgb?$uviZdxhOzcb^H(d5OOwfT?z_4c}7_ucW+
z`lG)BP9=Lxx{*6KTWh2G+y|N3PV*aI?dwiB&Xr;Mf~`z!#f}L9`<<0jzeJyJy8iXv
z<=@eMhv&SQsxUQqmrg-RSWbvp_Js(&&Lf@=1&Rue8l5c7OzY>FvQqrmoF36Gxzi;T
z#}}Gy+P8Q4UF%Q5DqRV8`os<~xV<sn?%WU($+t2^Yys!lIohp1_FSK5apbsKnUa?F
zq4P(-+AJ%2`mm6bfw{gp*yQk{&#r-<izcl)^)PpFR-j{iZjVM-d~LeC<&PTW{aciu
z-_~ZVK9|6CuJrTw3-6x&s5cA{6HPQ0exQ5(q4B=AJ~?b>BN`Yi6CXM-Xs_JFF+0d#
zv2|+H_em$i{GE2@E~}VU+4;sz!E@@Jdka^t)p^~rEVpOpjs9n6KSVB(3KqWgBgdoU
z<>XY;yKf`>73G5>a<21yxXjZPzW3Vh9f!NO7%ZFc`fS631w|se;~D;$tu-u@=-^Of
zY`FGbsM+`Y!FzvquPtF%v0UizxgSrnUw5YI{{9uadwRt!!+&3=O`H1c#{TfFy|u5`
ziIrV1R+83QalYcqjQ{1QK5YqU)_&{87Itfusq$6r$r1l}FP_@9PRKQI;;uPUw|ZZV
z@Q{C7-7NG|wII&!+A`Pcd(EC7bbfX>d$+`jP%)2{B0m*9Gs^!?Hj)f_Rkq&L@}$X)
zMYoMy%Z_h#>wW0HQpHtQ!<UQq=)+`jm$sIGmAR|B1lFyceD>XP9bV6B)y7ayhu5>r
zlm)F$mxKmgFAcSt*86kkmdFLicFDf!u)Dfw`zq7PK_=k=rg;?#ixN`){H@<!8?t}i
z0s9U<hChY^je#7BhQIq;--g`ebZ%hyXFBnX$;6hPto?H4Uw>B@PXGL);`M&}<*%0S
z{cZU`=4{@<^L1L8$9i2Z#D9%%I<jHG`b)RCKL&^WKl(I2xAlGfDG`y47n&|#>pPQk
z;Bug!!Bc^thdWQZHSM0OT9R$dw8*-8?~gU!*S-H`XwA?*#KpC2&ZkwIM7E|@O*+cT
zYTd=f6|Gzm-*rky?7+fhUAzl(nRi~?G<okIrdHNjR?{D?Z=G6r>Oxq-290df%)J7^
zp<DHYUq)Zn*IMRR#M$zD??cABlF|l&mo~nC_tO4=?uTQC>puLk%dao`arWbfD>e+A
zrT=>l^a&k#{=NQjQ?u>Mt#f|KHwo07`;+&8Q9^+ogc+U~GC!69u?S%X1`dnd{QNQo
z28Nv6vWyf^XCHx?UobFm$1pH5fDl5M5y1qr(D<MYX<%{e1TzBzb96~jVm1Q<Lrh6F
z*iL37rBKtjW01rk_Csah;%Iy{dsR|XN{Sg67-Uj&ic6q+7#SG2Wf&M(pdMnnoSTxF
z$iTp$kedQ_2UHn|vTI;q*m@uoyq1+gHzOsd2qX%^#rdUqAR2~)^GowmJQDL#a#BHR
znCfyf^U@d?7zB!QQ4Cc{fr!hb6s3aXLHa>n;3+MFoUOpXz~oe1l9a>1z>rm3QVfd>
zkeaC60uzu!VR}F`0|OHSXte|b$f*nr3}&gtDIgCsFgG+bs4_4zurxF@h&zaa)H0Sc
zFfb}TV{K??U=`sAd&<Dp(9poh1oC=u2{@QQsz7{QaPWWwi17d@_=-!4!IP(;bs1>*
zOmRtZ6-Xl#>oG7e#4|83Zf9U%dc(lLw32~=X*UA{(`^O@rppWrOph5Dn4%aMm~JpI
zFzsPrV4BXr!1R`Zf$0td1Je@*2BuvM42*La7?|ck<@Yf#Fs*{Jvl$o|uQM<(nK3Xh
zErF^B>3a`lgUko12if_Mfq`i=0|V171_maOo<$4{Ox6qxOhOC{j58P*7=AG@Ft9N|
zJPNw{8iY?2mn7#S`9p_+fr*8IfkTslfo~=QgNP&pgY;4c2Bj?w44Ts!81(HJ7)(Ah
zFxY%#U~q|NVDLK4zz}esfg$1~14BYS149}I14EWC14F@o28QyJ3=H*R3=D0q3=BP<
z3=9*k85pK=Gce4uW?)!wgn?m&D+9xZRtAP02N@U+2s1DoyTQP4_5=gNRUQV0yNnDB
zPf{2dUQc0Q_%xM);fD<a!{40@j7+x~7+E_R7}>fR7};hrFtROWU}W3Kz`(+eCu@P?
zmw|zS8?-zE>Qx3tQ1XK0FPIQCmXWwbW-UmNf(>M@$DOrm85kH-7|6(4%vuy=t#{;P
zty)MRpk*y4l&tlIfq`*H9yn_Wbs%Rg38btA3Y17lieu0~%2J>x0m;EIh!4V`GPeRP
zdqpxZFjg`!Fs@}_U^>Xaz?j9rz$C@M!0?QLf$<{)15*nF1CuWU1Ct&D0}~r0t1#YT
zU|?L#z`&@;z`$t0z`&%<z`&FW(a*G<fr0Ti0|Vm)1_mZp1_s8xP%}Z<C5M55sRwGt
zNd^W+&@eE_qgxpmm{b`U7~eB6FoLo($UKmpP7Dl80SpXGrjTq3Iz5On4Pr0DPjG3%
zz$nAOz<7^=f$=;81Ctm71Cu%f15+6T1IQgrAU}05FfbiuU|{NCU|<CK36#A->ckls
zn4}pPn9La%m{J%Rm`*Y<FnKdDFc~p0Fx4?IFu5@>FbOa)Fo{6SWt_&qz?jd#z_^lu
zfpI1S17kM>17j-#10yJ#3qjH-s0zS|k02EVx(o~q-3$!OyBQd`MHv_bofsIzt}rmj
zH83!!wlgs3R536Z&R}3LKh3~kzm|c)?HdDw?+ylr;N1)i(Q6qPk~J9^(w8tW<h*8J
zDBjJ$P}R=B(0G7>p_7q;p>GQV!(<@_hUtzB40G-?Ff3ldz_418fnjqA1H<m63=D^!
zF)*B*#=vl)kb&X)TLy;vTNoIg+c7Y_%VuEsYR<s$tDJ%1|7iwBmSP4*wq^!Kwuuak
zY;zeH*;X<zFeH_xlz=LXsN7UA4NAv~$hBB*N@h`FK>?_;0m*@}ZYHFvR9#t;4=M{~
ZQxd_NL5Y*WC$TKe)J)IBK+n*?5CER*t;qlY

literal 0
HcmV?d00001

-- 
GitLab


From 68a9ee1d4e041d7690a949718b2651b035a6bfad Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 23 Jan 2018 15:02:46 -0800
Subject: [PATCH 036/966] LabeledTensor: don't test Tensor.shape object, test
 value instead.

With the C API enabled, a new TensorShape object may be generated
every Tensor.get_shape call.

PiperOrigin-RevId: 182998609
---
 tensorflow/contrib/labeled_tensor/python/ops/core_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/labeled_tensor/python/ops/core_test.py b/tensorflow/contrib/labeled_tensor/python/ops/core_test.py
index 1f4a3ef568..e70b492374 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/core_test.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/core_test.py
@@ -225,7 +225,7 @@ class LabeledTensorTest(test_util.Base):
     tensor = array_ops.placeholder(dtypes.string, [None])
     actual = core.LabeledTensor(tensor, ['x'])
     self.assertIsNone(actual.axes['x'].size)
-    self.assertIs(actual.axes['x'].value, tensor.get_shape()[0])
+    self.assertIsNone(actual.axes['x'].value.value)
 
   def test_eq(self):
     self.assertEqual(self.lt, self.lt)
-- 
GitLab


From 58d227d36aa17d038c70c94787f415b1cd64d982 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 15:08:26 -0800
Subject: [PATCH 037/966] Switch models/ to use new test-specification format

PiperOrigin-RevId: 182999650
---
 tensorflow/contrib/lite/models/speech_test.cc | 189 ++++++++++++++++
 .../testdata/speech_asr_lm_model.test_spec    | 202 ++++++++++++++++++
 2 files changed, 391 insertions(+)
 create mode 100644 tensorflow/contrib/lite/models/speech_test.cc
 create mode 100644 tensorflow/contrib/lite/models/testdata/speech_asr_lm_model.test_spec

diff --git a/tensorflow/contrib/lite/models/speech_test.cc b/tensorflow/contrib/lite/models/speech_test.cc
new file mode 100644
index 0000000000..daa8c3100b
--- /dev/null
+++ b/tensorflow/contrib/lite/models/speech_test.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for speech models (Hotword, SpeakerId) using TFLite Ops.
+
+#include <memory>
+#include <string>
+
+#include <fstream>
+
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/parse_testdata.h"
+#include "tensorflow/contrib/lite/testing/split.h"
+#include "tensorflow/contrib/lite/testing/tflite_driver.h"
+
+namespace tflite {
+namespace {
+
+const char kDataPath[] = "third_party/tensorflow/contrib/lite/models/testdata/";
+
+bool Init(const string& in_file_name, testing::TfLiteDriver* driver,
+          std::ifstream* in_file) {
+  driver->SetModelBaseDir(kDataPath);
+  in_file->open(string(kDataPath) + in_file_name, std::ifstream::in);
+  return in_file->is_open();
+}
+
+// Converts a set of test files provided by the speech team into a single
+// test_spec. Input CSV files are supposed to contain a number of sequences per
+// line. Each sequence maps to a single invocation of the interpreter and the
+// output tensor after all sequences have run is compared to the corresponding
+// line in the output CSV file.
+bool ConvertCsvData(const string& model_name, const string& in_name,
+                    const string& out_name, const string& input_tensor,
+                    const string& output_tensor,
+                    const string& persistent_tensors, int sequence_size,
+                    std::ostream* out) {
+  auto data_path = [](const string& s) { return string(kDataPath) + s; };
+
+  *out << "load_model: \"" << data_path(model_name) << "\"" << std::endl;
+
+  *out << "init_state: \"" << persistent_tensors << "\"" << std::endl;
+
+  string in_file_name = data_path(in_name);
+  std::ifstream in_file(in_file_name);
+  if (!in_file.is_open()) {
+    std::cerr << "Failed to open " << in_file_name << std::endl;
+    return false;
+  }
+  string out_file_name = data_path(out_name);
+  std::ifstream out_file(out_file_name);
+  if (!out_file.is_open()) {
+    std::cerr << "Failed to open " << out_file_name << std::endl;
+    return false;
+  }
+
+  int invocation_count = 0;
+  string in_values;
+  while (std::getline(in_file, in_values, '\n')) {
+    std::vector<string> input = testing::Split<string>(in_values, ",");
+    int num_sequences = input.size() / sequence_size;
+
+    for (int j = 0; j < num_sequences; ++j) {
+      *out << "invoke {" << std::endl;
+      *out << "  id: " << invocation_count << std::endl;
+      *out << "  input: \"";
+      for (int k = 0; k < sequence_size; ++k) {
+        *out << input[k + j * sequence_size] << ",";
+      }
+      *out << "\"" << std::endl;
+
+      if (j == num_sequences - 1) {
+        string out_values;
+        if (!std::getline(out_file, out_values, '\n')) {
+          std::cerr << "Not enough lines in " << out_file_name << std::endl;
+          return false;
+        }
+        *out << "  output: \"" << out_values << "\"" << std::endl;
+      }
+
+      *out << "}" << std::endl;
+      ++invocation_count;
+    }
+  }
+  return true;
+}
+
+TEST(SpeechTest, HotwordOkGoogleRank1Test) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData(
+      "speech_hotword_model_rank1.tflite", "speech_hotword_model_in.csv",
+      "speech_hotword_model_out_rank1.csv", /*input_tensor=*/"0",
+      /*output_tensor=*/"18", /*persistent_tensors=*/"4",
+      /*sequence_size=*/40, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+TEST(SpeechTest, HotwordOkGoogleRank2Test) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData(
+      "speech_hotword_model_rank2.tflite", "speech_hotword_model_in.csv",
+      "speech_hotword_model_out_rank2.csv", /*input_tensor=*/"17",
+      /*output_tensor=*/"18", /*persistent_tensors=*/"1",
+      /*sequence_size=*/40, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+TEST(SpeechTest, SpeakerIdOkGoogleTest) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData(
+      "speech_speakerid_model.tflite", "speech_speakerid_model_in.csv",
+      "speech_speakerid_model_out.csv", /*input_tensor=*/"0",
+      /*output_tensor=*/"66",
+      /*persistent_tensors=*/"19,20,40,41,61,62",
+      /*sequence_size=*/80, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+TEST(SpeechTest, AsrAmTest) {
+  std::stringstream os;
+  ASSERT_TRUE(
+      ConvertCsvData("speech_asr_am_model.tflite", "speech_asr_am_model_in.csv",
+                     "speech_asr_am_model_out.csv", /*input_tensor=*/"0",
+                     /*output_tensor=*/"109",
+                     /*persistent_tensors=*/"19,20,40,41,61,62,82,83,103,104",
+                     /*sequence_size=*/320, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+// The original version of speech_asr_lm_model_test.cc ran a few sequences
+// through the interpreter and stored the sum of all the output, which was them
+// compared for correctness. In this test we are comparing all the intermediate
+// results.
+TEST(SpeechTest, AsrLmTest) {
+  std::ifstream in_file;
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(Init("speech_asr_lm_model.test_spec", &test_driver, &in_file));
+  ASSERT_TRUE(testing::ParseAndRunTests(&in_file, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+TEST(SpeechTest, EndpointerTest) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData(
+      "speech_endpointer_model.tflite", "speech_endpointer_model_in.csv",
+      "speech_endpointer_model_out.csv", /*input_tensor=*/"0",
+      /*output_tensor=*/"58",
+      /*persistent_tensors=*/"28,29,49,50",
+      /*sequence_size=*/320, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+TEST(SpeechTest, TtsTest) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData("speech_tts_model.tflite",
+                             "speech_tts_model_in.csv",
+                             "speech_tts_model_out.csv", /*input_tensor=*/"0",
+                             /*output_tensor=*/"74",
+                             /*persistent_tensors=*/"25,26,46,47,67,68,73",
+                             /*sequence_size=*/334, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/testdata/speech_asr_lm_model.test_spec b/tensorflow/contrib/lite/models/testdata/speech_asr_lm_model.test_spec
new file mode 100644
index 0000000000..5812de4b30
--- /dev/null
+++ b/tensorflow/contrib/lite/models/testdata/speech_asr_lm_model.test_spec
@@ -0,0 +1,202 @@
+load_model: "speech_asr_lm_model.tflite"
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 3
+  input: "63982"
+  input: "8409"
+  output: "-2.75389"
+}
+invoke {
+  id: 4
+  input: "8409"
+  input: "1488"
+  output: "0.601841"
+}
+invoke {
+  id: 5
+  input: "1488"
+  input: "63981"
+  output: "-0.314846"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 6
+  input: "63982"
+  input: "8409"
+  output: "-2.75389"
+}
+invoke {
+  id: 7
+  input: "8409"
+  input: "3082"
+  output: "-3.63721"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 8
+  input: "63982"
+  input: "8409"
+  output: "-2.75389"
+}
+invoke {
+  id: 9
+  input: "8409"
+  input: "18965"
+  output: "-6.93985"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 13
+  input: "63982"
+  input: "12516"
+  output: "-6.20867"
+}
+invoke {
+  id: 14
+  input: "12516"
+  input: "914"
+  output: "-0.407277"
+}
+invoke {
+  id: 15
+  input: "914"
+  input: "63981"
+  output: "-3.82091"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 19
+  input: "63982"
+  input: "12516"
+  output: "-6.20867"
+}
+invoke {
+  id: 20
+  input: "12516"
+  input: "914"
+  output: "-0.407277"
+}
+invoke {
+  id: 21
+  input: "914"
+  input: "48619"
+  output: "-4.02131"
+}
+invoke {
+  id: 22
+  input: "48619"
+  input: "63981"
+  output: "-0.677399"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 26
+  input: "63982"
+  input: "12516"
+  output: "-6.20867"
+}
+invoke {
+  id: 27
+  input: "12516"
+  input: "914"
+  output: "-0.407277"
+}
+invoke {
+  id: 28
+  input: "914"
+  input: "4700"
+  output: "-4.056"
+}
+invoke {
+  id: 29
+  input: "4700"
+  input: "63981"
+  output: "0.415889"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 30
+  input: "63982"
+  input: "12516"
+  output: "-6.20867"
+}
+invoke {
+  id: 31
+  input: "12516"
+  input: "914"
+  output: "-0.407277"
+invoke {
+  id: 32
+  input: "914"
+  input: "51923"
+  output: "-14.1147"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 34
+  input: "63982"
+  input: "5520"
+  output: "-4.56971"
+}
+invoke {
+  id: 35
+  input: "5520"
+  input: "16318"
+  output: "-1.54815"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 36
+  input: "63982"
+  input: "5520"
+  output: "-4.56971"
+}
+invoke {
+  id: 37
+  input: "5520"
+  input: "28303"
+  output: "-14.0947"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 38
+  input: "63982"
+  input: "12451"
+  output: "-6.24243"
+}
+invoke {
+  id: 39
+  input: "12451"
+  input: "752"
+  output: "0.0700736"
+}
+invoke {
+  id: 40
+  input: "752"
+  input: "11"
+  output: "-1.72744"
+}
+invoke {
+  id: 41
+  input: "11"
+  input: "19454"
+  output: "-3.19211"
+}
+invoke {
+  id: 42
+  input: "19454"
+  input: "16989"
+  output: "-4.01684"
+}
+invoke {
+  id: 43
+  input: "16989"
+  input: "40168"
+  output: "-8.91317"
+}
+invoke {
+  id: 44
+  input: "40168"
+  input: "63981"
+  output: "-0.675377"
+}
-- 
GitLab


From 3f3c9f73e269d1a24b7bb2841d17ad6be3353b7e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 15:11:22 -0800
Subject: [PATCH 038/966] Adds regularization_losses in remaining heads.

PiperOrigin-RevId: 183000075
---
 .../estimator/python/estimator/head.py        |  71 +++++--
 .../estimator/python/estimator/head_test.py   |  80 +++++++-
 tensorflow/python/estimator/canned/head.py    | 115 ++++++++---
 .../python/estimator/canned/head_test.py      | 186 +++++++++++++++++-
 4 files changed, 410 insertions(+), 42 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index d6ca33e189..fd0994490a 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -220,7 +220,7 @@ def multi_label_head(n_classes,
   `batch_size`.
 
   The head expects `logits` with shape `[D0, D1, ... DN, n_classes]`. In many
-  applications, the shape is `[batch_size, label_n_classes]`.
+  applications, the shape is `[batch_size, n_classes]`.
 
   Labels can be:
   * A multi-hot tensor of shape `[D0, D1, ... DN, n_classes]`
@@ -392,8 +392,32 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         processed_labels=processed_labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None):
-    """See `Head`."""
+      self, features, mode, logits, labels=None, train_op_fn=None,
+      regularization_losses=None):
+    """Returns an `EstimatorSpec`.
+
+    Args:
+      features: Input `dict` of `Tensor` or `SparseTensor` objects.
+      mode: Estimator's `ModeKeys`.
+      logits: logits `Tensor` with shape `[D0, D1, ... DN, n_classes]`.
+        For many applications, the shape is `[batch_size, n_classes]`.
+      labels: Labels with shape matching `logits`. Can be multi-hot `Tensor`
+        with shape `[D0, D1, ... DN, n_classes]` or `SparseTensor` with
+        `dense_shape` `[D0, D1, ... DN, ?]`. `labels` is required argument when
+        `mode` equals `TRAIN` or `EVAL`.
+      train_op_fn: Function that takes a scalar loss `Tensor` and returns
+        `train_op`. Required in TRAIN mode.
+      regularization_losses: A list of additional scalar losses to be added to
+        the training loss, such as regularization losses. These losses are
+        usually expressed as a batch average, so for best results users need to
+        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
+        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
+        avoid scaling errors.
+    Returns:
+      `EstimatorSpec`.
+    Raises:
+      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+    """
     with ops.name_scope(self._name, 'head'):
       logits = head_lib._check_logits_final_dim(logits, self.logits_dimension)  # pylint:disable=protected-access
 
@@ -422,18 +446,26 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
       (training_loss, unreduced_loss, weights,
        processed_labels) = self.create_loss(
            features=features, mode=mode, logits=logits, labels=labels)
+      if regularization_losses:
+        regularization_loss = math_ops.add_n(regularization_losses)
+        regularized_training_loss = math_ops.add_n(
+            [training_loss, regularization_loss])
+      else:
+        regularization_loss = None
+        regularized_training_loss = training_loss
 
       # Eval.
       if mode == model_fn.ModeKeys.EVAL:
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
-            loss=training_loss,
+            loss=regularized_training_loss,
             eval_metric_ops=self._eval_metric_ops(
                 labels=processed_labels,
                 probabilities=probabilities,
                 weights=weights,
-                unreduced_loss=unreduced_loss))
+                unreduced_loss=unreduced_loss,
+                regularization_loss=regularization_loss))
 
       # Train.
       if train_op_fn is None:
@@ -447,25 +479,31 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
       else:
         mean_loss = None
     with ops.name_scope(''):
+      keys = metric_keys.MetricKeys
       summary.scalar(
-          head_lib._summary_key(self._name, metric_keys.MetricKeys.LOSS),  # pylint:disable=protected-access
-          training_loss)
+          head_lib._summary_key(self._name, keys.LOSS),  # pylint:disable=protected-access
+          regularized_training_loss)
       if mean_loss is not None:
         summary.scalar(
-            head_lib._summary_key(  # pylint:disable=protected-access
-                self._name, metric_keys.MetricKeys.LOSS_MEAN),
+            head_lib._summary_key(self._name, keys.LOSS_MEAN),  # pylint:disable=protected-access
             mean_loss)
+      if regularization_loss is not None:
+        summary.scalar(
+            head_lib._summary_key(self._name, keys.LOSS_REGULARIZATION),  # pylint:disable=protected-access
+            regularization_loss)
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
-        loss=training_loss,
-        train_op=train_op_fn(training_loss))
+        loss=regularized_training_loss,
+        train_op=train_op_fn(regularized_training_loss))
 
-  def _eval_metric_ops(self, labels, probabilities, weights, unreduced_loss):
+  def _eval_metric_ops(
+      self, labels, probabilities, weights, unreduced_loss,
+      regularization_loss):
     """Returns a dict of metrics for eval_metric_ops."""
     with ops.name_scope(
         None, 'metrics',
-        [labels, probabilities, weights, unreduced_loss]):
+        [labels, probabilities, weights, unreduced_loss, regularization_loss]):
       keys = metric_keys.MetricKeys
       metric_ops = {
           # Estimator already adds a metric for loss.
@@ -482,6 +520,13 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                               weights=weights, curve='PR',
                               name=keys.AUC_PR),
       }
+      if regularization_loss is not None:
+        loss_regularization_key = head_lib._summary_key(  # pylint:disable=protected-access
+            self._name, keys.LOSS_REGULARIZATION)
+        metric_ops[loss_regularization_key] = (
+            metrics_lib.mean(
+                values=regularization_loss,
+                name=keys.LOSS_REGULARIZATION))
       for threshold in self._thresholds:
         accuracy_key = keys.ACCURACY_AT_THRESHOLD % threshold
         metric_ops[head_lib._summary_key(self._name, accuracy_key)] = (  # pylint:disable=protected-access
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index e39e44541d..1adbd6f0fe 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -399,12 +399,13 @@ class MultiLabelHead(test.TestCase):
 
   def _test_eval(
       self, head, logits, labels, expected_loss, expected_metrics,
-      features=None):
+      features=None, regularization_losses=None):
     spec = head.create_estimator_spec(
         features=features or {},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels,
+        regularization_losses=regularization_losses)
 
     # Assert spec contains expected tensors.
     self.assertIsNotNone(spec.loss)
@@ -486,6 +487,38 @@ class MultiLabelHead(test.TestCase):
         expected_loss=expected_loss,
         expected_metrics=expected_metrics)
 
+  def test_eval_with_regularization_losses(self):
+    n_classes = 2
+    head = head_lib.multi_label_head(
+        n_classes, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
+    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = sum(
+    #     labels * -log(sigmoid(logits)) +
+    #     (1 - labels) * -log(1 - sigmoid(logits))) / batch_size
+    expected_unregularized_loss = np.sum(
+        _sigmoid_cross_entropy(labels=labels, logits=logits)) / 2.
+    expected_regularized_loss = (
+        expected_unregularized_loss + expected_regularization_loss)
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_unregularized_loss,
+        keys.LOSS_REGULARIZATION: expected_regularization_loss,
+        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
+        # this assert tests that the algorithm remains consistent.
+        keys.AUC: 0.3333,
+        keys.AUC_PR: 0.7639,
+    }
+    self._test_eval(
+        head=head,
+        logits=logits,
+        labels=labels,
+        expected_loss=expected_regularized_loss,
+        expected_metrics=expected_metrics,
+        regularization_losses=regularization_losses)
+
   def test_eval_with_label_vocabulary(self):
     n_classes = 2
     head = head_lib.multi_label_head(
@@ -829,6 +862,49 @@ class MultiLabelHead(test.TestCase):
     self._test_train(
         head=head, logits=logits, labels=labels, expected_loss=expected_loss)
 
+  def test_train_with_regularization_losses(self):
+    head = head_lib.multi_label_head(
+        n_classes=2, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
+    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
+    regularization_losses = [1.5, 0.5]
+    # For large logits, sigmoid cross entropy loss is approximated as:
+    # loss = labels * (logits < 0) * (-logits) +
+    #        (1 - labels) * (logits > 0) * logits =>
+    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
+    # Average over classes and over batch and add regularization loss.
+    expected_loss = 35. / 4. + 2.
+    expected_summaries = {
+        metric_keys.MetricKeys.LOSS: expected_loss,
+        metric_keys.MetricKeys.LOSS_REGULARIZATION: 2.,
+    }
+    expected_train_result = 'my_train_op'
+    def _train_op_fn(loss):
+      return string_ops.string_join(
+          [constant_op.constant(expected_train_result),
+           string_ops.as_string(loss, precision=3)])
+
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        regularization_losses=regularization_losses)
+
+    # Assert predictions, loss, train_op, and summaries.
+    tol = 1e-3
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
+                                                  spec.scaffold.summary_op))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
+          train_result)
+      _assert_simple_summaries(self, expected_summaries, summary_str, tol)
+
   def test_train_with_weights(self):
     n_classes = 2
     head = head_lib.multi_label_head(n_classes, weight_column='example_weights')
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index 204e1119f2..94a5d3a342 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -627,15 +627,15 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
       logits: logits `Tensor` with shape `[D0, D1, ... DN, logits_dimension]`.
         For many applications, the shape is `[batch_size, logits_dimension]`.
       labels: Labels integer or string `Tensor` with shape matching `logits`,
-        namely `[D0, D1, ... DN, 1]`. `labels` is required argument when `mode`
-        equals `TRAIN` or `EVAL`.
+        namely `[D0, D1, ... DN, 1]` or `[D0, D1, ... DN]`. `labels` is
+        required argument when `mode` equals `TRAIN` or `EVAL`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
         `train_op`. Required in TRAIN mode.
       regularization_losses: A list of additional scalar losses to be added to
         the training loss, such as regularization losses. These losses are
         usually expressed as a batch average, so for best results users need to
-        set `loss_reduction=MEAN_PER_ELEMENT` or
-        `loss_reduction=SUM_BY_NONZERO_WEIGHTS` when creating the head to
+        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
+        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
         avoid scaling errors.
     Returns:
       `EstimatorSpec`.
@@ -827,10 +827,10 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
     return 1
 
   def _eval_metric_ops(self, labels, logits, logistic, class_ids, weights,
-                       unreduced_loss):
+                       unreduced_loss, regularization_loss):
     with ops.name_scope(None, 'metrics',
                         (labels, logits, logistic, class_ids, weights,
-                         unreduced_loss)):
+                         unreduced_loss, regularization_loss)):
       keys = metric_keys.MetricKeys
       labels_mean = _indicator_labels_mean(
           labels=labels, weights=weights, name=keys.LABEL_MEAN)
@@ -870,6 +870,11 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
                   curve='PR',
                   name=keys.AUC_PR)
       }
+      if regularization_loss is not None:
+        metric_ops[_summary_key(self._name, keys.LOSS_REGULARIZATION)] = (
+            metrics_lib.mean(
+                values=regularization_loss,
+                name=keys.LOSS_REGULARIZATION))
       for threshold in self._thresholds:
         accuracy_key = keys.ACCURACY_AT_THRESHOLD % threshold
         metric_ops[_summary_key(self._name,
@@ -924,8 +929,31 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
         processed_labels=labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None):
-    """See `Head`."""
+      self, features, mode, logits, labels=None, train_op_fn=None,
+      regularization_losses=None):
+    """Returns an `EstimatorSpec`.
+
+    Args:
+      features: Input `dict` of `Tensor` or `SparseTensor` objects.
+      mode: Estimator's `ModeKeys`.
+      logits: logits `Tensor` with shape `[D0, D1, ... DN, 1]`. For many
+        applications, the shape is `[batch_size, 1]`.
+      labels: Labels integer or string `Tensor` with shape matching `logits`,
+        namely `[D0, D1, ... DN, 1]` or `[D0, D1, ... DN]`. `labels` is required
+        argument when `mode` equals `TRAIN` or `EVAL`.
+      train_op_fn: Function that takes a scalar loss `Tensor` and returns
+        `train_op`. Required in TRAIN mode.
+      regularization_losses: A list of additional scalar losses to be added to
+        the training loss, such as regularization losses. These losses are
+        usually expressed as a batch average, so for best results users need to
+        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
+        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
+        avoid scaling errors.
+    Returns:
+      `EstimatorSpec`.
+    Raises:
+      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+    """
     # Predict.
     with ops.name_scope(self._name, 'head'):
       with ops.name_scope(None, 'predictions', (logits,)):
@@ -972,20 +1000,28 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
       (training_loss, unreduced_loss, weights, processed_labels) = (
           self.create_loss(
               features=features, mode=mode, logits=logits, labels=labels))
+      if regularization_losses:
+        regularization_loss = math_ops.add_n(regularization_losses)
+        regularized_training_loss = math_ops.add_n(
+            [training_loss, regularization_loss])
+      else:
+        regularization_loss = None
+        regularized_training_loss = training_loss
 
       # Eval.
       if mode == model_fn.ModeKeys.EVAL:
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
-            loss=training_loss,
+            loss=regularized_training_loss,
             eval_metric_ops=self._eval_metric_ops(
                 labels=processed_labels,
                 logits=logits,
                 logistic=logistic,
                 class_ids=class_ids,
                 weights=weights,
-                unreduced_loss=unreduced_loss))
+                unreduced_loss=unreduced_loss,
+                regularization_loss=regularization_loss))
 
       # Train.
       if train_op_fn is None:
@@ -999,18 +1035,22 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
       else:
         mean_loss = None
     with ops.name_scope(''):
+      keys = metric_keys.MetricKeys
       summary.scalar(
-          _summary_key(self._name, metric_keys.MetricKeys.LOSS),
-          training_loss)
+          _summary_key(self._name, keys.LOSS),
+          regularized_training_loss)
       if mean_loss is not None:
         summary.scalar(
-            _summary_key(self._name, metric_keys.MetricKeys.LOSS_MEAN),
-            mean_loss)
+            _summary_key(self._name, keys.LOSS_MEAN), mean_loss)
+      if regularization_loss is not None:
+        summary.scalar(
+            _summary_key(self._name, keys.LOSS_REGULARIZATION),
+            regularization_loss)
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
-        loss=training_loss,
-        train_op=train_op_fn(training_loss))
+        loss=regularized_training_loss,
+        train_op=train_op_fn(regularized_training_loss))
 
 
 def _regression_head_with_mean_squared_error_loss(
@@ -1111,7 +1151,8 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         processed_labels=labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None):
+      self, features, mode, logits, labels=None, train_op_fn=None,
+      regularization_losses=None):
     """Returns an `EstimatorSpec`.
 
     Args:
@@ -1125,6 +1166,12 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         `mode` equals `TRAIN` or `EVAL`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
         `train_op`. Required in TRAIN mode.
+      regularization_losses: A list of additional scalar losses to be added to
+        the training loss, such as regularization losses. These losses are
+        usually expressed as a batch average, so for best results users need to
+        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
+        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
+        avoid scaling errors.
     Returns:
       `EstimatorSpec`.
     Raises:
@@ -1147,20 +1194,34 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
 
       training_loss, unreduced_loss, weights, _ = self.create_loss(
           features=features, mode=mode, logits=logits, labels=labels)
+      if regularization_losses:
+        regularization_loss = math_ops.add_n(regularization_losses)
+        regularized_training_loss = math_ops.add_n(
+            [training_loss, regularization_loss])
+      else:
+        regularization_loss = None
+        regularized_training_loss = training_loss
 
       # Eval.
       if mode == model_fn.ModeKeys.EVAL:
+        keys = metric_keys.MetricKeys
         # Estimator already adds a metric for loss.
         eval_metric_ops = {
-            _summary_key(self._name, metric_keys.MetricKeys.LOSS_MEAN):
+            _summary_key(self._name, keys.LOSS_MEAN):
                 metrics_lib.mean(
                     values=unreduced_loss,
                     weights=weights)
         }
+        if regularization_loss is not None:
+          regularization_loss_key = _summary_key(
+              self._name, keys.LOSS_REGULARIZATION)
+          eval_metric_ops[regularization_loss_key] = metrics_lib.mean(
+              values=regularization_loss,
+              name=keys.LOSS_REGULARIZATION)
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
-            loss=training_loss,
+            loss=regularized_training_loss,
             eval_metric_ops=eval_metric_ops)
 
       # Train.
@@ -1175,18 +1236,22 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
       else:
         mean_loss = None
     with ops.name_scope(''):
+      keys = metric_keys.MetricKeys
       summary.scalar(
-          _summary_key(self._name, metric_keys.MetricKeys.LOSS),
-          training_loss)
+          _summary_key(self._name, keys.LOSS),
+          regularized_training_loss)
       if mean_loss is not None:
         summary.scalar(
-            _summary_key(self._name, metric_keys.MetricKeys.LOSS_MEAN),
-            mean_loss)
+            _summary_key(self._name, keys.LOSS_MEAN), mean_loss)
+      if regularization_loss is not None:
+        summary.scalar(
+            _summary_key(self._name, keys.LOSS_REGULARIZATION),
+            regularization_loss)
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
-        loss=training_loss,
-        train_op=train_op_fn(training_loss))
+        loss=regularized_training_loss,
+        train_op=train_op_fn(regularized_training_loss))
 
 
 def _assert_range(labels, n_classes, message=None):
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index 28b8e635fb..4e871e8f37 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -487,7 +487,7 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
   def test_eval_with_regularization_losses(self):
     n_classes = 3
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, loss_reduction=losses.Reduction.MEAN)
+        n_classes, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
     logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
     labels = np.array(((1,), (1,)), dtype=np.int64)
     features = {'x': np.array(((42,),), dtype=np.int32)}
@@ -790,7 +790,7 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
   def test_train_with_regularization_losses(self):
     n_classes = 3
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, loss_reduction=losses.Reduction.MEAN)
+        n_classes, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
 
     logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
     labels = np.array(((1,), (1,)), dtype=np.int64)
@@ -1485,6 +1485,53 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     ]
     self.assertItemsEqual(expected_metric_keys, spec.eval_metric_ops.keys())
 
+  def test_eval_with_regularization_losses(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    logits = np.array(((45,), (-41,),), dtype=np.float32)
+    labels = np.array(((1,), (1,),), dtype=np.int32)
+    features = {'x': np.array(((42,),), dtype=np.int32)}
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = sum(cross_entropy(labels, logits)) / batch_size
+    #                    = sum(0, 41) / 2 = 20.5
+    expected_unregularized_loss = 20.5
+    expected_regularized_loss = (
+        expected_unregularized_loss + expected_regularization_loss)
+
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels,
+        regularization_losses=regularization_losses)
+
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_unregularized_loss,
+        keys.LOSS_REGULARIZATION: expected_regularization_loss,
+        keys.ACCURACY: 1./2,
+        keys.PREDICTION_MEAN: 1./2,
+        keys.LABEL_MEAN: 2./2,
+        keys.ACCURACY_BASELINE: 2./2,
+        keys.AUC: 0.,
+        keys.AUC_PR: 1.,
+    }
+
+    # Assert predictions, loss, and metrics.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
+      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+      loss, metrics = sess.run((spec.loss, update_ops))
+      self.assertAllClose(expected_regularized_loss, loss)
+      # Check results of both update (in `metrics`) and value ops.
+      self.assertAllClose(expected_metrics, metrics)
+      self.assertAllClose(
+          expected_metrics, {k: value_ops[k].eval() for k in value_ops})
+
   def test_eval_with_vocabulary_list_create_loss(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
         label_vocabulary=['aang', 'iroh'])
@@ -1749,6 +1796,49 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
           },
           summary_str)
 
+  def test_train_with_regularization_losses(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+
+    logits = np.array(((45,), (-41,),), dtype=np.float32)
+    labels = np.array(((1,), (1,),), dtype=np.float64)
+    expected_train_result = b'my_train_op'
+    features = {'x': np.array(((42,),), dtype=np.float32)}
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = sum(cross_entropy(labels, logits)) / batch_size
+    #                    = sum(0, 41) / 2 = 20.5
+    # loss = unregularized_loss + regularization_loss = 7.
+    expected_loss = 22.5
+    def _train_op_fn(loss):
+      with ops.control_dependencies((check_ops.assert_equal(
+          math_ops.to_float(expected_loss), math_ops.to_float(loss),
+          name='assert_loss'),)):
+        return constant_op.constant(expected_train_result)
+
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        regularization_losses=regularization_losses)
+
+    # Assert predictions, loss, train_op, and summaries.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
+                                                  spec.scaffold.summary_op))
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          metric_keys.MetricKeys.LOSS_REGULARIZATION: (
+              expected_regularization_loss),
+      }, summary_str)
+
   def test_float_labels_train_create_loss(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
 
@@ -2512,6 +2602,51 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
     ]
     self.assertItemsEqual(expected_metric_keys, spec.eval_metric_ops.keys())
 
+  def test_eval_with_regularization_losses(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    self.assertEqual(1, head.logits_dimension)
+
+    logits = np.array(((45,), (41,),), dtype=np.float32)
+    labels = np.array(((43,), (44,),), dtype=np.int32)
+    features = {'x': np.array(((42,),), dtype=np.float32)}
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = ((43-45)^2 + (44-41)^2) / batch_size
+    #                    = (4 + 9) / 2 = 6.5
+    expected_unregularized_loss = 6.5
+    expected_regularized_loss = (
+        expected_unregularized_loss + expected_regularization_loss)
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels,
+        regularization_losses=regularization_losses)
+
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_unregularized_loss,
+        keys.LOSS_REGULARIZATION: expected_regularization_loss,
+    }
+
+    # Assert predictions, loss, and metrics.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
+      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+      prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
+      predictions, loss, metrics = sess.run((
+          spec.predictions[prediction_key], spec.loss, update_ops))
+      self.assertAllClose(logits, predictions)
+      self.assertAllClose(expected_regularized_loss, loss)
+      # Check results of both update (in `metrics`) and value ops.
+      self.assertAllClose(expected_metrics, metrics)
+      self.assertAllClose(
+          expected_metrics, {k: value_ops[k].eval() for k in value_ops})
+
   def test_train_create_loss(self):
     head = head_lib._regression_head_with_mean_squared_error_loss()
     logits = np.array(((45,), (41,),), dtype=np.float32)
@@ -2666,6 +2801,53 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
           },
           summary_str)
 
+  def test_train_with_regularization_losses(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45,), (41,),), dtype=np.float32)
+    labels = np.array(((43.,), (44.,),), dtype=np.float64)
+    expected_train_result = b'my_train_op'
+    features = {'x': np.array(((42.,),), dtype=np.float32)}
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = ((43-45)^2 + (44-41)^2) / batch_size
+    #                    = (4 + 9) / 2 = 6.5
+    # loss = unregularized_loss + regularization_loss = 8.5
+    expected_loss = 8.5
+    def _train_op_fn(loss):
+      with ops.control_dependencies((check_ops.assert_equal(
+          math_ops.to_float(expected_loss), math_ops.to_float(loss),
+          name='assert_loss'),)):
+        return constant_op.constant(expected_train_result)
+
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        regularization_losses=regularization_losses)
+
+    # Assert predictions, loss, train_op, and summaries.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
+      predictions, loss, train_result, summary_str = sess.run((
+          spec.predictions[prediction_key], spec.loss, spec.train_op,
+          spec.scaffold.summary_op))
+      self.assertAllClose(logits, predictions)
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          metric_keys.MetricKeys.LOSS_REGULARIZATION: (
+              expected_regularization_loss),
+      }, summary_str)
+
   def test_weighted_multi_example_eval(self):
     """1d label, 3 examples, 1 batch."""
     head = head_lib._regression_head_with_mean_squared_error_loss(
-- 
GitLab


From 75a5ae1d422c9e3efd8c3d4da2ef734c143c8aea Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Tue, 23 Jan 2018 15:20:43 -0800
Subject: [PATCH 039/966] [TF:XLA:CPU] Alias tensorflow::gtl to gtl for brevity

PiperOrigin-RevId: 183001493
---
 .../compiler/xla/service/cpu/ir_emitter.cc    | 31 +++++++++----------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index cfdf9f4ebc..ac8d5d33fa 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -72,6 +72,7 @@ namespace {
 using llvm_ir::AsStringRef;
 using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
+namespace gtl = tensorflow::gtl;
 }  // namespace
 
 namespace cpu {
@@ -491,7 +492,7 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) {
 }
 
 Status IrEmitter::HandleMap(HloInstruction* map) {
-  tensorflow::gtl::ArraySlice<HloInstruction*> operands(map->operands());
+  gtl::ArraySlice<HloInstruction*> operands(map->operands());
   HloComputation* function = map->to_apply();
   // The called computation should have been emitted previously.
   llvm::Function* mapped_ir_function = FindOrDie(emitted_functions_, function);
@@ -1632,7 +1633,7 @@ IrEmitter::EmitInnerLoopForVectorizedReduction(
     const ReductionGenerator& reduction_generator,
     const llvm_ir::IrArray::Index& output_index,
     const ShardedVectorType& accumulator_type, HloInstruction* init_value,
-    HloInstruction* arg, tensorflow::gtl::ArraySlice<int64> dimensions,
+    HloInstruction* arg, gtl::ArraySlice<int64> dimensions,
     unsigned element_alignment) {
   ShardedVector accumulator;
   accumulator.reserve(accumulator_type.size());
@@ -1736,7 +1737,7 @@ void IrEmitter::EmitShardedVectorStore(
 
 StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
-    tensorflow::gtl::ArraySlice<int64> dimensions, HloComputation* function,
+    gtl::ArraySlice<int64> dimensions, HloComputation* function,
     string* failure_reason) {
   ReductionGenerator reduction_generator =
       MatchReductionGenerator(function, failure_reason);
@@ -1881,7 +1882,7 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
   auto arg = reduce->mutable_operand(0);
   auto init_value = reduce->mutable_operand(1);
-  tensorflow::gtl::ArraySlice<int64> dimensions(reduce->dimensions());
+  gtl::ArraySlice<int64> dimensions(reduce->dimensions());
   HloComputation* function = reduce->to_apply();
   if (!options::VectorizedReduceDisabled(hlo_module_config_)) {
     string vectorization_failure_reason;
@@ -2001,7 +2002,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
   //
   // * Implement the memcpy within the innermost loop.
 
-  tensorflow::gtl::FlatSet<int64> inner_dims;
+  gtl::FlatSet<int64> inner_dims;
   for (int64 dim : LayoutUtil::MinorToMajor(layout)) {
     if (operand->shape().dimensions(dim) != slice->shape().dimensions(dim)) {
       break;
@@ -2329,8 +2330,7 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
 }
 
 Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
-  tensorflow::gtl::ArraySlice<HloInstruction*> operands(
-      custom_call->operands());
+  gtl::ArraySlice<HloInstruction*> operands(custom_call->operands());
   tensorflow::StringPiece custom_call_target(custom_call->custom_call_target());
   llvm::Type* i8_ptr_type = ir_builder_.getInt8PtrTy();
   llvm::AllocaInst* operands_alloca =
@@ -2461,8 +2461,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
 }
 
 StatusOr<bool> IrEmitter::EmitFastConcatenate(
-    HloInstruction* concatenate,
-    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    HloInstruction* concatenate, gtl::ArraySlice<HloInstruction*> operands,
     string* failure_reason) {
   if (ShouldEmitParallelLoopFor(*concatenate)) {
     *failure_reason =
@@ -2601,8 +2600,7 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source,
 }
 
 Status IrEmitter::HandleConcatenate(HloInstruction* concatenate) {
-  tensorflow::gtl::ArraySlice<HloInstruction*> operands(
-      concatenate->operands());
+  gtl::ArraySlice<HloInstruction*> operands(concatenate->operands());
   string failure_reason;
   TF_ASSIGN_OR_RETURN(
       bool successful,
@@ -2915,7 +2913,7 @@ llvm::Value* IrEmitter::EmitTempBufferPointer(
 // for a single element_type value, and loads it after call.
 llvm::Value* IrEmitter::EmitElementFunctionCall(
     llvm::Function* function, const Shape& return_shape,
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    gtl::ArraySlice<llvm::Value*> parameter_addresses,
     tensorflow::StringPiece name) {
   llvm::Value* return_value_buffer = EmitArrayFunctionCall(
       function, return_shape, 1, parameter_addresses, name);
@@ -2935,8 +2933,7 @@ llvm::Value* IrEmitter::EmitElementFunctionCall(
 //                 temps)
 //   return return_value_buffer  -- address of the return value.
 void IrEmitter::EmitArrayFunctionCallInto(
-    llvm::Function* function,
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    llvm::Function* function, gtl::ArraySlice<llvm::Value*> parameter_addresses,
     llvm::Value* return_value_buffer, tensorflow::StringPiece name) {
   ir_builder_.CreateCall(
       function, GetArrayFunctionCallArguments(
@@ -2949,7 +2946,7 @@ void IrEmitter::EmitArrayFunctionCallInto(
 
 llvm::Value* IrEmitter::EmitArrayFunctionCall(
     llvm::Function* function, const Shape& return_shape, int64 element_count,
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    gtl::ArraySlice<llvm::Value*> parameter_addresses,
     tensorflow::StringPiece name) {
   llvm::Value* elements =
       llvm::ConstantInt::get(ir_builder_.getInt64Ty(), element_count);
@@ -3059,8 +3056,8 @@ Status IrEmitter::EmitMemcpy(const HloInstruction& source,
 
 Status IrEmitter::ElementTypesSameAndSupported(
     const HloInstruction& instruction,
-    tensorflow::gtl::ArraySlice<const HloInstruction*> operands,
-    tensorflow::gtl::ArraySlice<PrimitiveType> supported_types) {
+    gtl::ArraySlice<const HloInstruction*> operands,
+    gtl::ArraySlice<PrimitiveType> supported_types) {
   for (auto operand : operands) {
     TF_RET_CHECK(
         ShapeUtil::SameElementType(operands[0]->shape(), operand->shape()));
-- 
GitLab


From f16cf555905d711c1877039e3b37240e9026c1f2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 15:41:33 -0800
Subject: [PATCH 040/966] Add a base transformer class with useful utilities.
 Intercept errors in the base class and include additional context
 information. Use the base transformer class with type_info.py.

PiperOrigin-RevId: 183004622
---
 tensorflow/contrib/py2tf/conversion.py        | 20 ++++---
 .../py2tf/convert/builtin_functions_test.py   |  2 +-
 .../contrib/py2tf/convert/call_trees_test.py  |  2 +-
 .../py2tf/convert/control_flow_test.py        |  2 +-
 .../py2tf/convert/print_functions_test.py     |  2 +-
 .../py2tf/convert/side_effect_guards_test.py  |  2 +-
 tensorflow/contrib/py2tf/pyct/BUILD           |  1 +
 .../py2tf/pyct/static_analysis/type_info.py   | 10 ++--
 .../pyct/static_analysis/type_info_test.py    | 25 ++++-----
 tensorflow/contrib/py2tf/pyct/transformer.py  | 52 +++++++++++++++++++
 10 files changed, 90 insertions(+), 28 deletions(-)
 create mode 100644 tensorflow/contrib/py2tf/pyct/transformer.py

diff --git a/tensorflow/contrib/py2tf/conversion.py b/tensorflow/contrib/py2tf/conversion.py
index 3bdbc66a99..38f1c0a14a 100644
--- a/tensorflow/contrib/py2tf/conversion.py
+++ b/tensorflow/contrib/py2tf/conversion.py
@@ -181,7 +181,8 @@ def function_to_graph(f, conversion_map, param_value_hints, owner_type=None):
         node_globals[fn.__name__] = fn
 
   namer = conversion_map.new_namer(node_globals)
-  node = node_to_graph(node, namer, node_globals, param_value_hints,
+  node = node_to_graph(node, tf_inspect.getsource(f), tf_inspect.getfile(f),
+                       namer, node_globals, param_value_hints,
                        conversion_map.nocompile_decorators)
 
   # Simulate a rename to ensure the top level is in the name map. This is needed
@@ -196,18 +197,23 @@ def function_to_graph(f, conversion_map, param_value_hints, owner_type=None):
   return node, conversion_map.name_map[f]
 
 
-def _static_analysis_pass(node, namespace, value_hints):
+def _static_analysis_pass(node, source, f, namespace, value_hints):
   node = access.resolve(node)
   node = live_values.resolve(node, namespace, config.PYTHON_LITERALS)
-  node = type_info.resolve(node, value_hints)
+  node = type_info.resolve(node, source, f, value_hints)
   return node
 
 
-def node_to_graph(node, namer, namespace, value_hints, nocompile_decorators):
+def node_to_graph(node, source, f, namer, namespace, value_hints,
+                  nocompile_decorators):
   """Convert Python code to equivalent TF graph mode code.
 
   Args:
     node: A Python AST node representing the code to convert.
+    source: Optional string containing the source code of the node. Used in
+        error messages.
+    f: Optional string indicating the file where the node originated. None if
+        unknown. Used in error messages.
     namer: A naming.Namer object.
     namespace: Dict mapping symbol names to their corresponding live objects.
     value_hints: A dict containing value hints for symbols like function
@@ -235,7 +241,7 @@ def node_to_graph(node, namer, namespace, value_hints, nocompile_decorators):
   # tree, which must be accounted. Although less efficient, it is most robust
   # to re-run the analysis.
 
-  node = _static_analysis_pass(node, namespace, value_hints)
+  node = _static_analysis_pass(node, source, f, namespace, value_hints)
   node = decorators.transform(node, nocompile_decorators)
   node = break_canonicalization.transform(node, namer)
 
@@ -245,14 +251,14 @@ def node_to_graph(node, namer, namespace, value_hints, nocompile_decorators):
   node = continue_canonicalization.transform(node, namer)
   namespace['len'] = len
 
-  node = _static_analysis_pass(node, namespace, value_hints)
+  node = _static_analysis_pass(node, None, None, namespace, value_hints)
   node = for_canonicalization.transform(node, namer)
   # for_canonicalization may insert new global references.
   node = builtin_functions.transform(node)
   # builtin_functions may insert new global references.
   namespace['print'] = print
 
-  node = _static_analysis_pass(node, namespace, value_hints)
+  node = _static_analysis_pass(node, None, None, namespace, value_hints)
   node = print_functions.transform(node)
   node = call_trees.transform(node, namer, namespace,
                               config.DEFAULT_UNCOMPILED_MODULES,
diff --git a/tensorflow/contrib/py2tf/convert/builtin_functions_test.py b/tensorflow/contrib/py2tf/convert/builtin_functions_test.py
index 633602f4d4..ab02b362aa 100644
--- a/tensorflow/contrib/py2tf/convert/builtin_functions_test.py
+++ b/tensorflow/contrib/py2tf/convert/builtin_functions_test.py
@@ -35,7 +35,7 @@ class BuiltinFunctionsTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, {})
+    node = type_info.resolve(node, None, None, {})
     return node
 
   def test_len(self):
diff --git a/tensorflow/contrib/py2tf/convert/call_trees_test.py b/tensorflow/contrib/py2tf/convert/call_trees_test.py
index 3367d41db3..78a6b53910 100644
--- a/tensorflow/contrib/py2tf/convert/call_trees_test.py
+++ b/tensorflow/contrib/py2tf/convert/call_trees_test.py
@@ -41,7 +41,7 @@ class CallTreesTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, {})
+    node = type_info.resolve(node, None, None, {})
     return node
 
   def test_basic(self):
diff --git a/tensorflow/contrib/py2tf/convert/control_flow_test.py b/tensorflow/contrib/py2tf/convert/control_flow_test.py
index 121af4ee94..64a317ee9c 100644
--- a/tensorflow/contrib/py2tf/convert/control_flow_test.py
+++ b/tensorflow/contrib/py2tf/convert/control_flow_test.py
@@ -46,7 +46,7 @@ class ControlFlowTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, {})
+    node = type_info.resolve(node, None, None, {})
     return node
 
   def test_simple_while(self):
diff --git a/tensorflow/contrib/py2tf/convert/print_functions_test.py b/tensorflow/contrib/py2tf/convert/print_functions_test.py
index 65e592b66e..8b6c238aa4 100644
--- a/tensorflow/contrib/py2tf/convert/print_functions_test.py
+++ b/tensorflow/contrib/py2tf/convert/print_functions_test.py
@@ -35,7 +35,7 @@ class PrintFunctionsTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, {})
+    node = type_info.resolve(node, None, None, {})
     return node
 
   def test_transform(self):
diff --git a/tensorflow/contrib/py2tf/convert/side_effect_guards_test.py b/tensorflow/contrib/py2tf/convert/side_effect_guards_test.py
index d932840186..1715e9eb95 100644
--- a/tensorflow/contrib/py2tf/convert/side_effect_guards_test.py
+++ b/tensorflow/contrib/py2tf/convert/side_effect_guards_test.py
@@ -43,7 +43,7 @@ class SideEffectGuardsTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, {})
+    node = type_info.resolve(node, None, None, {})
     return node
 
   def test_transform(self):
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD
index b60ed918f5..9dd564cb9f 100644
--- a/tensorflow/contrib/py2tf/pyct/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/BUILD
@@ -23,6 +23,7 @@ py_library(
         "parser.py",
         "pretty_printer.py",
         "templates.py",
+        "transformer.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
index 3e54590326..b17af5d844 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import transformer
 from tensorflow.python.util import tf_inspect
 
 
@@ -69,7 +70,7 @@ class Scope(object):
     raise KeyError(name)
 
 
-class TypeInfoResolver(gast.NodeTransformer):
+class TypeInfoResolver(transformer.Base):
   """Annotates symbols with type information where possible.
 
   Nodes currently annotated:
@@ -77,7 +78,8 @@ class TypeInfoResolver(gast.NodeTransformer):
     * Attribute (helps resolve object methods)
   """
 
-  def __init__(self, value_hints):
+  def __init__(self, value_hints, source, f):
+    super(TypeInfoResolver, self).__init__(source, f)
     self.scope = Scope(None)
     self.value_hints = value_hints
     self.function_level = 0
@@ -206,6 +208,6 @@ class TypeInfoResolver(gast.NodeTransformer):
     return node
 
 
-def resolve(node, value_hints):
+def resolve(node, source, f, value_hints):
   assert value_hints is not None
-  return TypeInfoResolver(value_hints).visit(node)
+  return TypeInfoResolver(value_hints, source, f).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
index 8526f42413..98dc7bf50f 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import transformer
 from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
 from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
@@ -63,7 +64,7 @@ class TypeInfoResolverTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, {'training': training}, {})
-    node = type_info.resolve(node, {})
+    node = type_info.resolve(node, None, None, {})
 
     call_node = node.body[0].body[0].value
     self.assertEquals(training.GradientDescentOptimizer,
@@ -80,7 +81,7 @@ class TypeInfoResolverTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, {'training': training}, {})
-    node = type_info.resolve(node, {})
+    node = type_info.resolve(node, None, None, {})
 
     attr_call_node = node.body[0].body[1].value.func
     self.assertEquals((training.__name__, 'GradientDescentOptimizer'),
@@ -95,7 +96,7 @@ class TypeInfoResolverTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, {'session': session}, {})
-    node = type_info.resolve(node, {})
+    node = type_info.resolve(node, None, None, {})
 
     constructor_call = node.body[0].body[0].items[0].context_expr
     self.assertEquals(session.Session, anno.getanno(constructor_call, 'type'))
@@ -118,8 +119,8 @@ class TypeInfoResolverTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, {'training': training}, {})
-    with self.assertRaises(ValueError):
-      node = type_info.resolve(node, {})
+    with self.assertRaises(transformer.PyFlowParseError):
+      node = type_info.resolve(node, None, None, {})
 
   def test_parameter_class_members(self):
 
@@ -129,8 +130,8 @@ class TypeInfoResolverTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, {'training': training}, {})
-    with self.assertRaises(ValueError):
-      node = type_info.resolve(node, {})
+    with self.assertRaises(transformer.PyFlowParseError):
+      node = type_info.resolve(node, None, None, {})
 
   def test_parameter_class_members_with_value_hints(self):
 
@@ -141,7 +142,7 @@ class TypeInfoResolverTest(test.TestCase):
     node = access.resolve(node)
     node = live_values.resolve(node, {'training': training}, {})
     node = type_info.resolve(
-        node, {
+        node, None, None, {
             'opt': (('%s.GradientDescentOptimizer' % training.__name__),
                     training.GradientDescentOptimizer(0.1))
         })
@@ -163,8 +164,8 @@ class TypeInfoResolverTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, {'bar': bar}, {})
-    with self.assertRaises(ValueError):
-      node = type_info.resolve(node, {})
+    with self.assertRaises(transformer.PyFlowParseError):
+      node = type_info.resolve(node, None, None, {})
 
   def test_nested_members(self):
 
@@ -175,8 +176,8 @@ class TypeInfoResolverTest(test.TestCase):
     node = parser.parse_object(test_fn)
     node = access.resolve(node)
     node = live_values.resolve(node, {'training': training}, {})
-    with self.assertRaises(ValueError):
-      node = type_info.resolve(node, {})
+    with self.assertRaises(transformer.PyFlowParseError):
+      node = type_info.resolve(node, None, None, {})
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/pyct/transformer.py b/tensorflow/contrib/py2tf/pyct/transformer.py
new file mode 100644
index 0000000000..1658a1b694
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/transformer.py
@@ -0,0 +1,52 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A node transformer that includes utilities for SCT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import pretty_printer
+
+
+class PyFlowParseError(SyntaxError):
+  pass
+
+
+class Base(gast.NodeTransformer):
+  """Base class for specialized transformers."""
+
+  def __init__(self, source, f):
+    self._lineno = 0
+    self._col_offset = 0
+    self._source = source
+    self._file = f
+
+  def visit(self, node):
+    try:
+      if self._source and hasattr(node, 'lineno'):
+        self._lineno = node.lineno
+        self._col_offset = node.col_offset
+      return super(Base, self).visit(node)
+    except ValueError as e:
+      msg = '%s\nOccurred at node:\n%s' % (str(e), pretty_printer.fmt(node))
+      if self._source:
+        line = self._source.splitlines()[self._lineno - 1]
+      else:
+        line = '<no source available>'
+      raise PyFlowParseError(
+          msg, (self._file, self._lineno, self._col_offset + 1, line))
-- 
GitLab


From 84a5c4bb26d6ce30d2243c4314320d0ff93fda47 Mon Sep 17 00:00:00 2001
From: Frank Perbet <fkp@google.com>
Date: Tue, 23 Jan 2018 15:59:09 -0800
Subject: [PATCH 041/966] Make namedtuples with identical name and field names
 to be considered as the same shallow structure.

PiperOrigin-RevId: 183007218
---
 tensorflow/python/util/nest.py      | 87 +++++++++++++++++++++++++----
 tensorflow/python/util/nest_test.py | 30 ++++++++++
 2 files changed, 107 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 4ce871de72..874df3d108 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -47,10 +47,25 @@ def _sorted(dict_):
     raise TypeError("nest only supports dicts with sortable keys.")
 
 
-def _is_namedtuple(instance):
-  """Returns True iff `instance` is a `namedtuple`."""
+def _is_namedtuple(instance, strict=False):
+  """Returns True iff `instance` is a `namedtuple`.
+
+  Args:
+    instance: An instance of a Python object.
+    strict: If True, `instance` is considered to be a `namedtuple` only if
+        it is a "plain" namedtuple. For instance, a class inheriting
+        from a `namedtuple` will be considered to be a `namedtuple`
+        iff `strict=False`.
+
+  Returns:
+    True if `instance` is a `namedtuple`.
+  """
+  # Attemp to limit the test to plain namedtuple (not stuff inheriting from it).
+  if not isinstance(instance, tuple):
+    return False
+  if strict and instance.__class__.__base__ != tuple:
+    return False
   return (
-      isinstance(instance, tuple) and
       hasattr(instance, "_fields") and
       isinstance(instance._fields, _collections.Sequence) and
       all(isinstance(f, _six.string_types) for f in instance._fields))
@@ -140,8 +155,37 @@ def flatten(nest):
   return _pywrap_tensorflow.Flatten(nest)
 
 
+def _same_namedtuples(nest1, nest2):
+  """Returns True if the two namedtuples have the same name and fields."""
+  if nest1._fields != nest2._fields:
+    return False
+  if nest1.__class__.__name__ != nest2.__class__.__name__:
+    return False
+  return True
+
+
 def _recursive_assert_same_structure(nest1, nest2, check_types):
-  """Helper function for `assert_same_structure`."""
+  """Helper function for `assert_same_structure`.
+
+  See `assert_same_structure` for further information about namedtuples.
+
+  Args:
+    nest1: An arbitrarily nested structure.
+    nest2: An arbitrarily nested structure.
+    check_types: If `True` (default) types of sequences are checked as
+        well, including the keys of dictionaries. If set to `False`, for example
+        a list and a tuple of objects will look the same if they have the same
+        size. Note that namedtuples with identical name and fields are always
+        considered to have the same shallow structure.
+
+  Returns:
+    True if `nest1` and `nest2` have the same structure.
+
+  Raises:
+    ValueError: If the two structure don't have the same nested structre.
+    TypeError: If the two structure don't have the same sequence type.
+    ValueError: If the two dictionaries don't have the same set of keys.
+  """
   is_sequence_nest1 = is_sequence(nest1)
   if is_sequence_nest1 != is_sequence(nest2):
     raise ValueError(
@@ -154,11 +198,21 @@ def _recursive_assert_same_structure(nest1, nest2, check_types):
   if check_types:
     type_nest1 = type(nest1)
     type_nest2 = type(nest2)
-    if type_nest1 != type_nest2:
-      raise TypeError(
-          "The two structures don't have the same sequence type. First "
-          "structure has type %s, while second structure has type %s."
-          % (type_nest1, type_nest2))
+
+    # Duck-typing means that nest should be fine with two different namedtuples
+    # with identical name and fields.
+    if _is_namedtuple(nest1, True) and _is_namedtuple(nest2, True):
+      if not _same_namedtuples(nest1, nest2):
+        raise TypeError(
+            "The two namedtuples don't have the same sequence type. First "
+            "structure has type %s, while second structure has type %s."
+            % (type_nest1, type_nest2))
+    else:
+      if type_nest1 != type_nest2:
+        raise TypeError(
+            "The two structures don't have the same sequence type. First "
+            "structure has type %s, while second structure has type %s."
+            % (type_nest1, type_nest2))
 
     if isinstance(nest1, dict):
       keys1 = set(_six.iterkeys(nest1))
@@ -178,13 +232,24 @@ def _recursive_assert_same_structure(nest1, nest2, check_types):
 def assert_same_structure(nest1, nest2, check_types=True):
   """Asserts that two structures are nested in the same way.
 
+  Note that namedtuples with identical name and fields are always considered
+  to have the same shallow structure (even with `check_types=True`).
+  For intance, this code will print `True`:
+
+  ```python
+  def nt(a, b):
+    return collections.namedtuple('foo', 'a b')(a, b)
+  print(assert_same_structure(nt(0, 1), nt(2, 3)))
+  ```
+
   Args:
     nest1: an arbitrarily nested structure.
     nest2: an arbitrarily nested structure.
     check_types: if `True` (default) types of sequences are checked as
         well, including the keys of dictionaries. If set to `False`, for example
         a list and a tuple of objects will look the same if they have the same
-        size.
+        size. Note that namedtuples with identical name and fields are always
+        considered to have the same shallow structure.
 
   Raises:
     ValueError: If the two structures do not have the same number of elements or
@@ -354,6 +419,8 @@ def map_structure(func, *structure, **check_types_dict):
       `True` (default) the types of iterables within the structures have to be
       same (e.g. `map_structure(func, [1], (1,))` raises a `TypeError`
       exception). To allow this set this argument to `False`.
+      Note that namedtuples with identical name and fields are always
+      considered to have the same shallow structure.
 
   Returns:
     A new structure with the same arity as `structure`, whose values correspond
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 4906649f01..6bec397db5 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -258,6 +258,36 @@ class NestTest(test.TestCase):
                                  "don't have the same set of keys"):
       nest.assert_same_structure({"a": 1}, {"b": 1})
 
+    same_name_type_0 = collections.namedtuple("same_name", ("a", "b"))
+    same_name_type_1 = collections.namedtuple("same_name", ("a", "b"))
+    nest.assert_same_structure(same_name_type_0(0, 1), same_name_type_1(2, 3))
+
+    # This assertion is expected to pass: two namedtuples with the same
+    # name and field names are considered to be identical.
+    same_name_type_2 = collections.namedtuple("same_name_1", ("x", "y"))
+    same_name_type_3 = collections.namedtuple("same_name_1", ("x", "y"))
+    nest.assert_same_structure(
+        same_name_type_0(same_name_type_2(0, 1), 2),
+        same_name_type_1(same_name_type_3(2, 3), 4))
+
+    expected_message = "The two structures don't have the same.*"
+    with self.assertRaisesRegexp(ValueError, expected_message):
+      nest.assert_same_structure(same_name_type_0(0, same_name_type_1(1, 2)),
+                                 same_name_type_1(same_name_type_0(0, 1), 2))
+
+    same_name_type_1 = collections.namedtuple("not_same_name", ("a", "b"))
+    self.assertRaises(TypeError, nest.assert_same_structure,
+                      same_name_type_0(0, 1), same_name_type_1(2, 3))
+
+    same_name_type_1 = collections.namedtuple("same_name", ("x", "y"))
+    self.assertRaises(TypeError, nest.assert_same_structure,
+                      same_name_type_0(0, 1), same_name_type_1(2, 3))
+
+    class SameNamedType1(collections.namedtuple("same_name", ("a", "b"))):
+      pass
+    self.assertRaises(TypeError, nest.assert_same_structure,
+                      same_name_type_0(0, 1), SameNamedType1(2, 3))
+
   def testMapStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
     structure2 = (((7, 8), 9), 10, (11, 12))
-- 
GitLab


From 3fd539dce556ba11cb5ee88cbdd75614be0590b7 Mon Sep 17 00:00:00 2001
From: Andrew Selle <aselle@google.com>
Date: Tue, 23 Jan 2018 16:03:41 -0800
Subject: [PATCH 042/966] Make the toco python wrapper lazy loaded to avoid dep
 error.

PiperOrigin-RevId: 183007943
---
 tensorflow/contrib/lite/python/lite.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 4d87a5907b..3c369774be 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -31,10 +31,18 @@ import tempfile
 from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
 from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
 from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
-from tensorflow.contrib.lite.toco.python.tensorflow_wrap_toco import TocoConvert as _toco_convert_protos
 from tensorflow.python.framework import dtypes as _dtypes
 from tensorflow.python.platform import resource_loader as _resource_loader
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+# Lazy load since some of the performance benchmark skylark rules
+# break dependencies.
+_toco_python = LazyLoader(
+    "tensorflow_wrap_toco", globals(),
+    "tensorflow.contrib.lite.toco.python."
+    "tensorflow_wrap_toco")
+del LazyLoader
 
 # Enum types from the protobuf promoted to the API
 FLOAT = _types_pb2.FLOAT
@@ -86,7 +94,8 @@ def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
   # TODO(aselle): When toco does not use fatal errors for failure, we can
   # switch this on.
   if not _toco_from_proto_bin:
-    return _toco_convert_protos(model_flags_str, toco_flags_str, input_data_str)
+    return _toco_python.TocoConvert(
+        model_flags_str, toco_flags_str, input_data_str)
 
   with tempfile.NamedTemporaryFile() as fp_toco, \
            tempfile.NamedTemporaryFile() as fp_model, \
-- 
GitLab


From f5c6d3f28f382d09c19f36ff99a8966231d85c15 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 16:22:43 -0800
Subject: [PATCH 043/966] Add unidirectional sequence LSTM to TFLite Ops.

PiperOrigin-RevId: 183010993
---
 tensorflow/contrib/lite/kernels/BUILD         |   13 +
 tensorflow/contrib/lite/kernels/register.cc   |    3 +
 .../kernels/unidirectional_sequence_lstm.cc   |  527 ++++++++
 .../unidirectional_sequence_lstm_test.cc      | 1089 +++++++++++++++++
 tensorflow/contrib/lite/model.cc              |    1 +
 tensorflow/contrib/lite/nnapi_delegate.cc     |    1 +
 tensorflow/contrib/lite/schema/schema.fbs     |    1 +
 .../contrib/lite/schema/schema_generated.h    |    9 +-
 8 files changed, 1641 insertions(+), 3 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
 create mode 100644 tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index bd390d664d..2d51b8727f 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -106,6 +106,7 @@ cc_library(
         "sub.cc",
         "svdf.cc",
         "transpose.cc",
+        "unidirectional_sequence_lstm.cc",
         "unidirectional_sequence_rnn.cc",
     ],
     hdrs = [
@@ -249,6 +250,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "unidirectional_sequence_lstm_test",
+    size = "small",
+    srcs = ["unidirectional_sequence_lstm_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "unidirectional_sequence_rnn_test",
     size = "small",
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 45ad5f1890..c9e74cb8d5 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -48,6 +48,7 @@ TfLiteRegistration* Register_MUL();
 TfLiteRegistration* Register_L2_NORMALIZATION();
 TfLiteRegistration* Register_LOCAL_RESPONSE_NORMALIZATION();
 TfLiteRegistration* Register_LSTM();
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_LSTM();
 TfLiteRegistration* Register_PAD();
 TfLiteRegistration* Register_RESHAPE();
 TfLiteRegistration* Register_RESIZE_BILINEAR();
@@ -89,6 +90,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
              Register_LOCAL_RESPONSE_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LSTM, Register_LSTM());
+  AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
+             Register_UNIDIRECTIONAL_SEQUENCE_LSTM());
   AddBuiltin(BuiltinOperator_PAD, Register_PAD());
   AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
   AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR());
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
new file mode 100644
index 0000000000..9cdb58714e
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
@@ -0,0 +1,527 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace unidirectional_sequence_lstm {
+
+// Input Tensors of size {max_time, n_batch, n_input}
+constexpr int kInputTensor = 0;
+
+// Input weight tensors of size: {n_cell, n_input}
+constexpr int kInputToInputWeightsTensor = 1;  // Optional
+constexpr int kInputToForgetWeightsTensor = 2;
+constexpr int kInputToCellWeightsTensor = 3;
+constexpr int kInputToOutputWeightsTensor = 4;
+
+// Recurrent weight tensors of size {n_cell, n_output}
+constexpr int kRecurrentToInputWeightsTensor = 5;  // Optional
+constexpr int kRecurrentToForgetWeightsTensor = 6;
+constexpr int kRecurrentToCellWeightsTensor = 7;
+constexpr int kRecurrentToOutputWeightsTensor = 8;
+
+// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
+constexpr int kCellToInputWeightsTensor = 9;    // Optional
+constexpr int kCellToForgetWeightsTensor = 10;  // Optional
+constexpr int kCellToOutputWeightsTensor = 11;  // Optional
+
+// Gates bias tensors of size {n_cell}
+constexpr int kInputGateBiasTensor = 12;  // Optional
+constexpr int kForgetGateBiasTensor = 13;
+constexpr int kCellGateBiasTensor = 14;
+constexpr int kOutputGateBiasTensor = 15;
+
+// Projection weight tensor of size {n_output, n_cell}
+constexpr int kProjectionWeightsTensor = 16;  // Optional
+// Projection bias tensor of size {n_output}
+constexpr int kProjectionBiasTensor = 17;  // Optional
+
+// Output tensors.
+constexpr int kScratchBufferTensor = 0;
+constexpr int kOutputStateTensor = 1;
+constexpr int kCellStateTensor = 2;
+constexpr int kOutputTensor = 3;
+
+// Check that input tensor dimensions matches with each other.
+TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
+                                        TfLiteNode* node, int n_input,
+                                        int n_output, int n_cell) {
+  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+
+  // Making sure clipping parameters have valid values.
+  // == 0 means no clipping
+  //  > 0 means clipping
+  TF_LITE_ENSURE(context, params->cell_clip >= 0);
+  TF_LITE_ENSURE(context, params->proj_clip >= 0);
+
+  TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  if (input_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
+  }
+
+  TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
+
+  TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, kInputToCellWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
+
+  TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+  if (recurrent_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
+                      n_cell);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
+                      n_output);
+  }
+
+  TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, kRecurrentToForgetWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
+                    n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
+                    n_output);
+
+  TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, kRecurrentToCellWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
+                    n_output);
+
+  // We make sure the input-gate's parameters are either both present (regular
+  // LSTM) or not at all (CIFG-LSTM).
+  const bool cifg_weights_all_or_none =
+      ((input_to_input_weights != nullptr) &&
+       (recurrent_to_input_weights != nullptr)) ||
+      ((input_to_input_weights == nullptr) &&
+       (recurrent_to_input_weights == nullptr));
+  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
+
+  TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+  if (cell_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+  if (cell_to_forget_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+  if (cell_to_output_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
+  }
+
+  // Making sure the peephole weights are there all or none.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool peephole_weights_all_or_none =
+      ((cell_to_input_weights != nullptr || use_cifg) &&
+       (cell_to_forget_weights != nullptr) &&
+       (cell_to_output_weights != nullptr)) ||
+      ((cell_to_input_weights == nullptr) &&
+       (cell_to_forget_weights == nullptr) &&
+       (cell_to_output_weights == nullptr));
+  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
+
+  // Make sure the input gate bias is present only when not a CIFG-LSTM.
+  TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
+  } else {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, kForgetGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* output_gate_bias =
+      GetInput(context, node, kOutputGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+  if (projection_weights) {
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
+  }
+
+  TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+  if (projection_bias) {
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
+  }
+
+  // Making sure the projection tensors are consistent:
+  // 1) If projection weight is not present, then projection bias should not be
+  // present.
+  // 2) If projection weight is present, then projection bias is optional.
+  // TODO(ghodrat): make sure this is correct.
+  const bool projecton_tensors_consistent =
+      ((projection_weights != nullptr) || (projection_bias == nullptr));
+  TF_LITE_ENSURE(context, projecton_tensors_consistent == true);
+
+  return kTfLiteOk;
+}
+
+// Resize the output, state and scratch tensors based on the sizes of the input
+// tensors. Also check that the size of the input tensors match each other.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 18);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 4);
+
+  // Inferring batch size, number of outputs and sequence length and
+  // number of cells from the input tensors.
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input->dims->size > 1);
+  const int max_time = input->dims->data[0];
+  const int n_batch = input->dims->data[1];
+  const int n_input = input->dims->data[2];
+
+  TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, kInputToOutputWeightsTensor);
+  const int n_cell = input_to_output_weights->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
+
+  TfLiteTensor* recurrent_to_output_weights =
+      GetInput(context, node, kRecurrentToOutputWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
+                    n_cell);
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Check that input tensor dimensions matches with each other.
+  CheckInputTensorDimensions(context, node, n_input, n_output, n_cell);
+
+  // Get the pointer to output, state and scratch buffer tensors.
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
+  TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor);
+  // TODO(ghodrat): Modify this as soon as we have a finalized method for
+  // scratch buffers.
+  TfLiteTensor* scratch_buffer = GetOutput(context, node, kScratchBufferTensor);
+
+  // Resize the output and output_state tensors.
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(3);
+  output_size->data[0] = max_time;
+  output_size->data[1] = n_batch;
+  output_size->data[2] = n_output;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size));
+
+  TfLiteIntArray* output_state_size = TfLiteIntArrayCreate(2);
+  output_state_size->data[0] = n_batch;
+  output_state_size->data[1] = n_output;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, output_state, output_state_size));
+
+  // Resize the scratch buffer tensor.
+  TfLiteIntArray* cell_size = TfLiteIntArrayCreate(2);
+  cell_size->data[0] = n_batch;
+  cell_size->data[1] = n_cell;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, cell_state, cell_size));
+
+  // Mark state tensors as persistent tensors.
+  output_state->allocation_type = kTfLiteArenaRwPersistent;
+  cell_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  if (use_cifg) {
+    TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+    scratch_buffer_size->data[0] = n_batch;
+    // Reserving space for Cell, Forget, Output gates
+    scratch_buffer_size->data[1] = n_cell * 3;
+    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
+                                                     scratch_buffer_size));
+  } else {
+    TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+    scratch_buffer_size->data[0] = n_batch;
+    // Reserving space for Input, Cell, Forget, Output gates
+    scratch_buffer_size->data[1] = n_cell * 4;
+    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
+                                                     scratch_buffer_size));
+  }
+  return kTfLiteOk;
+}
+
+// The LSTM Op engine.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+
+  TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, kInputToCellWeightsTensor);
+  TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, kInputToOutputWeightsTensor);
+
+  TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+  TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, kRecurrentToForgetWeightsTensor);
+  TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, kRecurrentToCellWeightsTensor);
+  TfLiteTensor* recurrent_to_output_weights =
+      GetInput(context, node, kRecurrentToOutputWeightsTensor);
+
+  TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+  TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+  TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+
+  TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+  TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, kForgetGateBiasTensor);
+  TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  TfLiteTensor* output_gate_bias =
+      GetInput(context, node, kOutputGateBiasTensor);
+
+  TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+  TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+
+  TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
+  TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  const int max_time = input->dims->data[0];
+  const int n_batch = input->dims->data[1];
+  const int n_input = input->dims->data[2];
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_cell = input_to_output_weights->dims->data[0];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existense of only one to the get the condition.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool use_peephole = (cell_to_output_weights != nullptr);
+
+  // Index the scratch buffers pointers to the global scratch buffer.
+  TfLiteTensor* scratch_buffer = GetOutput(context, node, kScratchBufferTensor);
+  float* input_gate_scratch = nullptr;
+  float* cell_scratch = nullptr;
+  float* forget_gate_scratch = nullptr;
+  float* output_gate_scratch = nullptr;
+  if (use_cifg) {
+    cell_scratch = scratch_buffer->data.f;
+    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+  } else {
+    input_gate_scratch = scratch_buffer->data.f;
+    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
+  }
+
+  for (int t = 0; t < max_time; t++) {
+    const float* input_ptr_time = input->data.f + t * n_batch * n_input;
+    // Initialize scratch buffers with bias.
+    if (!use_cifg) {
+      tensor_utils::VectorBatchVectorAssign(input_gate_bias->data.f, n_cell,
+                                            n_batch, input_gate_scratch);
+    }
+    tensor_utils::VectorBatchVectorAssign(forget_gate_bias->data.f, n_cell,
+                                          n_batch, forget_gate_scratch);
+    tensor_utils::VectorBatchVectorAssign(cell_bias->data.f, n_cell, n_batch,
+                                          cell_scratch);
+    tensor_utils::VectorBatchVectorAssign(output_gate_bias->data.f, n_cell,
+                                          n_batch, output_gate_scratch);
+
+    // For each batch and cell: compute input_weight * input.
+    if (!use_cifg) {
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          input_to_input_weights->data.f, n_cell, n_input, input_ptr_time,
+          n_batch, input_gate_scratch, /*result_stride=*/1);
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_forget_weights->data.f, n_cell, n_input, input_ptr_time,
+        n_batch, forget_gate_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_cell_weights->data.f, n_cell, n_input, input_ptr_time, n_batch,
+        cell_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_output_weights->data.f, n_cell, n_input, input_ptr_time,
+        n_batch, output_gate_scratch, /*result_stride=*/1);
+
+    // For each batch and cell: compute recurrent_weight * output_state.
+    if (!use_cifg) {
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          recurrent_to_input_weights->data.f, n_cell, n_output,
+          output_state->data.f, n_batch, input_gate_scratch,
+          /*result_stride=*/1);
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_forget_weights->data.f, n_cell, n_output,
+        output_state->data.f, n_batch, forget_gate_scratch,
+        /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_cell_weights->data.f, n_cell, n_output,
+        output_state->data.f, n_batch, cell_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_output_weights->data.f, n_cell, n_output,
+        output_state->data.f, n_batch, output_gate_scratch,
+        /*result_stride=*/1);
+
+    // For each batch and cell: update input gate.
+    if (!use_cifg) {
+      if (use_peephole) {
+        tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+            cell_to_input_weights->data.f, n_cell, cell_state->data.f, n_batch,
+            input_gate_scratch);
+      }
+      tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+                                         input_gate_scratch);
+    }
+
+    // For each batch and cell: update forget gate.
+    if (use_peephole) {
+      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+          cell_to_forget_weights->data.f, n_cell, cell_state->data.f, n_batch,
+          forget_gate_scratch);
+    }
+    tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+                                       forget_gate_scratch);
+
+    // For each batch and cell: update the cell.
+    tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch,
+                                           cell_state->data.f, n_batch * n_cell,
+                                           cell_state->data.f);
+    tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
+                                          params->activation, cell_scratch);
+    if (use_cifg) {
+      tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
+                               forget_gate_scratch);
+      tensor_utils::VectorVectorCwiseProductAccumulate(
+          cell_scratch, forget_gate_scratch, n_batch * n_cell,
+          cell_state->data.f);
+    } else {
+      tensor_utils::VectorVectorCwiseProductAccumulate(
+          cell_scratch, input_gate_scratch, n_batch * n_cell,
+          cell_state->data.f);
+    }
+    if (params->cell_clip > 0.0) {
+      tensor_utils::ClipVector(cell_state->data.f, n_batch * n_cell,
+                               params->cell_clip, cell_state->data.f);
+    }
+
+    // For each batch and cell: update the output gate.
+    if (use_peephole) {
+      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+          cell_to_output_weights->data.f, n_cell, cell_state->data.f, n_batch,
+          output_gate_scratch);
+    }
+    tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+                                       output_gate_scratch);
+    tensor_utils::ApplyActivationToVector(cell_state->data.f, n_batch * n_cell,
+                                          params->activation, cell_scratch);
+    tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
+                                           n_batch * n_cell,
+                                           output_gate_scratch);
+
+    // For each batch: update the projection and output_state.
+    const bool use_projection_weight = (projection_weights != nullptr);
+    const bool use_projection_bias = (projection_bias != nullptr);
+    float* output_ptr_time = output->data.f + t * n_batch * n_output;
+    if (use_projection_weight) {
+      if (use_projection_bias) {
+        tensor_utils::VectorBatchVectorAssign(projection_bias->data.f, n_output,
+                                              n_batch, output_ptr_time);
+      } else {
+        tensor_utils::ZeroVector(output_ptr_time, n_batch * n_output);
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          projection_weights->data.f, n_output, n_cell, output_gate_scratch,
+          n_batch, output_ptr_time, /*result_stride=*/1);
+      if (params->proj_clip > 0.0) {
+        tensor_utils::ClipVector(output_ptr_time, n_batch * n_output,
+                                 params->proj_clip, output_ptr_time);
+      }
+    } else {
+      tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
+                               output_ptr_time);
+    }
+    tensor_utils::CopyVector(output_ptr_time, n_batch * n_output,
+                             output_state->data.f);
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace unidirectional_sequence_lstm
+
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_LSTM() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 unidirectional_sequence_lstm::Prepare,
+                                 unidirectional_sequence_lstm::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc
new file mode 100644
index 0000000000..93b635ae57
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc
@@ -0,0 +1,1089 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Sequential LSTM op.
+
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class UnidirectionalLSTMOpModel : public SingleOpModel {
+ public:
+  UnidirectionalLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
+                            int sequence_length, bool use_cifg,
+                            bool use_peephole, bool use_projection_weights,
+                            bool use_projection_bias, float cell_clip,
+                            float proj_clip,
+                            const std::vector<std::vector<int>>& input_shapes)
+      : n_batch_(n_batch),
+        n_input_(n_input),
+        n_cell_(n_cell),
+        n_output_(n_output),
+        sequence_length_(sequence_length) {
+    input_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      input_to_input_weights_ = AddNullInput();
+    } else {
+      input_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    input_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    input_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    input_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      recurrent_to_input_weights_ = AddNullInput();
+    } else {
+      recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_peephole) {
+      if (use_cifg) {
+        cell_to_input_weights_ = AddNullInput();
+      } else {
+        cell_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      }
+      cell_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+      cell_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    } else {
+      cell_to_input_weights_ = AddNullInput();
+      cell_to_forget_weights_ = AddNullInput();
+      cell_to_output_weights_ = AddNullInput();
+    }
+
+    if (use_cifg) {
+      input_gate_bias_ = AddNullInput();
+    } else {
+      input_gate_bias_ = AddInput(TensorType_FLOAT32);
+    }
+    forget_gate_bias_ = AddInput(TensorType_FLOAT32);
+    cell_bias_ = AddInput(TensorType_FLOAT32);
+    output_gate_bias_ = AddInput(TensorType_FLOAT32);
+
+    if (use_projection_weights) {
+      projection_weights_ = AddInput(TensorType_FLOAT32);
+      if (use_projection_bias) {
+        projection_bias_ = AddInput(TensorType_FLOAT32);
+      } else {
+        projection_bias_ = AddNullInput();
+      }
+    } else {
+      projection_weights_ = AddNullInput();
+      projection_bias_ = AddNullInput();
+    }
+
+    scratch_buffer_ = AddOutput(TensorType_FLOAT32);
+    // TODO(ghodrat): Modify these states when we have a permanent solution for
+    // persistent buffer.
+    output_state_ = AddOutput(TensorType_FLOAT32);
+    cell_state_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
+                 BuiltinOptions_LSTMOptions,
+                 CreateLSTMOptions(builder_, ActivationFunctionType_TANH,
+                                   cell_clip, proj_clip)
+                     .Union());
+    BuildInterpreter(input_shapes);
+  }
+
+  void SetInputToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_input_weights_, f);
+  }
+
+  void SetInputToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_forget_weights_, f);
+  }
+
+  void SetInputToCellWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_cell_weights_, f);
+  }
+
+  void SetInputToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_output_weights_, f);
+  }
+
+  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_input_weights_, f);
+  }
+
+  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_forget_weights_, f);
+  }
+
+  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_cell_weights_, f);
+  }
+
+  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_output_weights_, f);
+  }
+
+  void SetCellToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_input_weights_, f);
+  }
+
+  void SetCellToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_forget_weights_, f);
+  }
+
+  void SetCellToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_output_weights_, f);
+  }
+
+  void SetInputGateBias(std::initializer_list<float> f) {
+    PopulateTensor(input_gate_bias_, f);
+  }
+
+  void SetForgetGateBias(std::initializer_list<float> f) {
+    PopulateTensor(forget_gate_bias_, f);
+  }
+
+  void SetCellBias(std::initializer_list<float> f) {
+    PopulateTensor(cell_bias_, f);
+  }
+
+  void SetOutputGateBias(std::initializer_list<float> f) {
+    PopulateTensor(output_gate_bias_, f);
+  }
+
+  void SetProjectionWeights(std::initializer_list<float> f) {
+    PopulateTensor(projection_weights_, f);
+  }
+
+  void SetProjectionBias(std::initializer_list<float> f) {
+    PopulateTensor(projection_bias_, f);
+  }
+
+  void ResetOutputState() {
+    const int zero_buffer_size = n_cell_ * n_batch_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(output_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  void ResetCellState() {
+    const int zero_buffer_size = n_cell_ * n_batch_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(cell_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+  int num_inputs() { return n_input_; }
+  int num_outputs() { return n_output_; }
+  int num_cells() { return n_cell_; }
+  int num_batches() { return n_batch_; }
+  int sequence_length() { return sequence_length_; }
+
+ private:
+  int input_;
+  int input_to_input_weights_;
+  int input_to_forget_weights_;
+  int input_to_cell_weights_;
+  int input_to_output_weights_;
+
+  int recurrent_to_input_weights_;
+  int recurrent_to_forget_weights_;
+  int recurrent_to_cell_weights_;
+  int recurrent_to_output_weights_;
+
+  int cell_to_input_weights_;
+  int cell_to_forget_weights_;
+  int cell_to_output_weights_;
+
+  int input_gate_bias_;
+  int forget_gate_bias_;
+  int cell_bias_;
+  int output_gate_bias_;
+
+  int projection_weights_;
+  int projection_bias_;
+
+  int output_;
+  int output_state_;
+  int cell_state_;
+  int scratch_buffer_;
+
+  int n_batch_;
+  int n_input_;
+  int n_cell_;
+  int n_output_;
+  int sequence_length_;
+};
+
+TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  UnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/false, /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      });
+
+  lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
+                               -0.34550029, 0.04266912, -0.15680569,
+                               -0.34856534, 0.43890524});
+
+  lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163,
+                              -0.20583314, 0.44344562, 0.22077113,
+                              -0.29909778});
+
+  lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935,
+                                -0.31343272, -0.40032279, 0.44781327,
+                                0.01387155, -0.35593212});
+
+  lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829,
+                                0.40525138, 0.44272184, 0.03897077, -0.1556896,
+                                0.19487578});
+
+  lstm.SetInputGateBias({0., 0., 0., 0.});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324,
+       -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322,
+       -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841,
+       -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659,
+       -0.46367589, 0.26016325, -0.03894562, -0.16368064});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892,
+       -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436,
+       0.28053468, 0.01560611, -0.20127171, -0.01140004});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793,
+       0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421,
+       -0.51818722, -0.15390486, 0.0468148, 0.39922136});
+
+  // Input should have n_input * sequence_length many values.
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_golden_output[] = {-0.02973187, 0.1229473,   0.20885126,
+                                       -0.15358765, -0.03716109, 0.12507336,
+                                       0.41193449,  -0.20860538, -0.15053082,
+                                       0.09120187,  0.24278517,  -0.12222792};
+
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
+
+  float* batch0_start = lstm_input;
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  float* golden_start = lstm_golden_output;
+  float* golden_end =
+      golden_start + lstm.num_outputs() * lstm.sequence_length();
+  std::vector<float> expected;
+  expected.insert(expected.end(), golden_start, golden_end);
+  EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  UnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
+      /*use_peephole=*/true, /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      });
+
+  lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
+                              0.04717243, 0.48944736, -0.38535351,
+                              -0.17212132});
+
+  lstm.SetInputToForgetWeights({-0.55291498, -0.42866567, 0.13056988,
+                                -0.3633365, -0.22755712, 0.28253698, 0.24407166,
+                                0.33826375});
+
+  lstm.SetInputToOutputWeights({0.10725588, -0.02335852, -0.55932593,
+                                -0.09426838, -0.44257352, 0.54939759,
+                                0.01533556, 0.42751634});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToCellWeights(
+      {0.54066205, -0.32668582, -0.43562764, -0.56094903, 0.42957711,
+       0.01841056, -0.32764608, -0.33027974, -0.10826075, 0.20675004,
+       0.19069612, -0.03026325, -0.54532051, 0.33003211, 0.44901288,
+       0.21193194});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.13832897, -0.0515101, -0.2359007, -0.16661474, -0.14340827,
+       0.36986142, 0.23414481, 0.55899, 0.10798943, -0.41174671, 0.17751795,
+       -0.34484994, -0.35874045, -0.11352962, 0.27268326, 0.54058349});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.41613156, 0.42610586, -0.16495961, -0.5663873, 0.30579174, -0.05115908,
+       -0.33941799, 0.23364776, 0.11178309, 0.09481031, -0.26424935, 0.46261835,
+       0.50248802, 0.26114327, -0.43736315, 0.33149987});
+
+  lstm.SetCellToForgetWeights(
+      {0.47485286, -0.51955009, -0.24458408, 0.31544167});
+  lstm.SetCellToOutputWeights(
+      {-0.17135078, 0.82760304, 0.85573703, -0.77109635});
+
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_golden_output[] = {-0.36444446, -0.00352185, 0.12886585,
+                                       -0.05163646, -0.42312205, -0.01218222,
+                                       0.24201041,  -0.08124574, -0.358325,
+                                       -0.04621704, 0.21641694,  -0.06471302};
+
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
+
+  float* batch0_start = lstm_input;
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  float* golden_start = lstm_golden_output;
+  float* golden_end =
+      golden_start + lstm.num_outputs() * lstm.sequence_length();
+  std::vector<float> expected;
+  expected.insert(expected.end(), golden_start, golden_end);
+  EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 20;
+  const int n_output = 16;
+  const int sequence_length = 4;
+
+  UnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/true, /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  lstm.SetInputToInputWeights(
+      {0.021393683,  0.06124551,    0.046905167,  -0.014657677,  -0.03149463,
+       0.09171803,   0.14647801,    0.10797193,   -0.0057968358, 0.0019193048,
+       -0.2726754,   0.10154029,    -0.018539885, 0.080349885,   -0.10262385,
+       -0.022599787, -0.09121155,   -0.008675967, -0.045206103,  -0.0821282,
+       -0.008045952, 0.015478081,   0.055217247,  0.038719587,   0.044153627,
+       -0.06453243,  0.05031825,    -0.046935108, -0.008164439,  0.014574226,
+       -0.1671009,   -0.15519552,   -0.16819797,  -0.13971269,   -0.11953059,
+       0.25005487,   -0.22790983,   0.009855087,  -0.028140958,  -0.11200698,
+       0.11295408,   -0.0035217577, 0.054485075,  0.05184695,    0.064711206,
+       0.10989193,   0.11674786,    0.03490607,   0.07727357,    0.11390585,
+       -0.1863375,   -0.1034451,    -0.13945189,  -0.049401227,  -0.18767063,
+       0.042483903,  0.14233552,    0.13832581,   0.18350165,    0.14545603,
+       -0.028545704, 0.024939531,   0.050929718,  0.0076203286,  -0.0029723682,
+       -0.042484224, -0.11827596,   -0.09171104,  -0.10808628,   -0.16327988,
+       -0.2273378,   -0.0993647,    -0.017155107, 0.0023917493,  0.049272764,
+       0.0038534778, 0.054764505,   0.089753784,  0.06947234,    0.08014476,
+       -0.04544234,  -0.0497073,    -0.07135631,  -0.048929106,  -0.004042012,
+       -0.009284026, 0.018042054,   0.0036860977, -0.07427302,   -0.11434604,
+       -0.018995456, 0.031487543,   0.012834908,  0.019977754,   0.044256654,
+       -0.39292613,  -0.18519334,   -0.11651281,  -0.06809892,   0.011373677});
+
+  lstm.SetInputToForgetWeights(
+      {-0.0018401089, -0.004852237,  0.03698424,   0.014181704,   0.028273236,
+       -0.016726194,  -0.05249759,   -0.10204261,  0.00861066,    -0.040979505,
+       -0.009899187,  0.01923892,    -0.028177269, -0.08535103,   -0.14585495,
+       0.10662567,    -0.01909731,   -0.017883534, -0.0047269356, -0.045103323,
+       0.0030784295,  0.076784775,   0.07463696,   0.094531395,   0.0814421,
+       -0.12257899,   -0.033945758,  -0.031303465, 0.045630626,   0.06843887,
+       -0.13492945,   -0.012480007,  -0.0811829,   -0.07224499,   -0.09628791,
+       0.045100946,   0.0012300825,  0.013964662,  0.099372394,   0.02543059,
+       0.06958324,    0.034257296,   0.0482646,    0.06267997,    0.052625068,
+       0.12784666,    0.07077897,    0.025725935,  0.04165009,    0.07241905,
+       0.018668644,   -0.037377294,  -0.06277783,  -0.08833636,   -0.040120605,
+       -0.011405586,  -0.007808335,  -0.010301386, -0.005102167,  0.027717464,
+       0.05483423,    0.11449111,    0.11289652,   0.10939839,    0.13396506,
+       -0.08402166,   -0.01901462,   -0.044678304, -0.07720565,   0.014350063,
+       -0.11757958,   -0.0652038,    -0.08185733,  -0.076754324,  -0.092614375,
+       0.10405491,    0.052960336,   0.035755895,  0.035839386,   -0.012540553,
+       0.036881298,   0.02913376,    0.03420159,   0.05448447,    -0.054523353,
+       0.02582715,    0.02327355,    -0.011857179, -0.0011980024, -0.034641717,
+       -0.026125094,  -0.17582615,   -0.15923657,  -0.27486774,   -0.0006143371,
+       0.0001771948,  -8.470171e-05, 0.02651807,   0.045790765,   0.06956496});
+
+  lstm.SetInputToCellWeights(
+      {-0.04580283,   -0.09549462,   -0.032418985,  -0.06454633,
+       -0.043528453,  0.043018587,   -0.049152344,  -0.12418144,
+       -0.078985475,  -0.07596889,   0.019484362,   -0.11434962,
+       -0.0074034138, -0.06314844,   -0.092981495,  0.0062155537,
+       -0.025034338,  -0.0028890965, 0.048929527,   0.06235075,
+       0.10665918,    -0.032036792,  -0.08505916,   -0.10843358,
+       -0.13002433,   -0.036816437,  -0.02130134,   -0.016518239,
+       0.0047691227,  -0.0025825808, 0.066017866,   0.029991534,
+       -0.10652836,   -0.1037554,    -0.13056071,   -0.03266643,
+       -0.033702414,  -0.006473424,  -0.04611692,   0.014419339,
+       -0.025174323,  0.0396852,     0.081777506,   0.06157468,
+       0.10210095,    -0.009658194,  0.046511717,   0.03603906,
+       0.0069369148,  0.015960095,   -0.06507666,   0.09551598,
+       0.053568836,   0.06408714,    0.12835667,    -0.008714329,
+       -0.20211966,   -0.12093674,   0.029450472,   0.2849013,
+       -0.029227901,  0.1164364,     -0.08560263,   0.09941786,
+       -0.036999565,  -0.028842626,  -0.0033637602, -0.017012902,
+       -0.09720865,   -0.11193351,   -0.029155117,  -0.017936034,
+       -0.009768936,  -0.04223324,   -0.036159635,  0.06505112,
+       -0.021742892,  -0.023377212,  -0.07221364,   -0.06430552,
+       0.05453865,    0.091149814,   0.06387331,    0.007518393,
+       0.055960953,   0.069779344,   0.046411168,   0.10509911,
+       0.07463894,    0.0075130584,  0.012850982,   0.04555431,
+       0.056955688,   0.06555285,    0.050801456,   -0.009862683,
+       0.00826772,    -0.026555609,  -0.0073611983, -0.0014897042});
+
+  lstm.SetInputToOutputWeights(
+      {-0.0998932,   -0.07201956,  -0.052803773,  -0.15629593,  -0.15001918,
+       -0.07650751,  0.02359855,   -0.075155355,  -0.08037709,  -0.15093534,
+       0.029517552,  -0.04751393,  0.010350531,   -0.02664851,  -0.016839722,
+       -0.023121163, 0.0077019283, 0.012851257,   -0.05040649,  -0.0129761,
+       -0.021737747, -0.038305793, -0.06870586,   -0.01481247,  -0.001285394,
+       0.10124236,   0.083122835,  0.053313006,   -0.062235646, -0.075637154,
+       -0.027833903, 0.029774971,  0.1130802,     0.09218906,   0.09506135,
+       -0.086665764, -0.037162706, -0.038880914,  -0.035832845, -0.014481564,
+       -0.09825003,  -0.12048569,  -0.097665586,  -0.05287633,  -0.0964047,
+       -0.11366429,  0.035777505,  0.13568819,    0.052451383,  0.050649304,
+       0.05798951,   -0.021852335, -0.099848844,  0.014740475,  -0.078897946,
+       0.04974699,   0.014160473,  0.06973932,    0.04964942,   0.033364646,
+       0.08190124,   0.025535367,  0.050893165,   0.048514254,  0.06945813,
+       -0.078907564, -0.06707616,  -0.11844508,   -0.09986688,  -0.07509403,
+       0.06263226,   0.14925587,   0.20188436,    0.12098451,   0.14639415,
+       0.0015017595, -0.014267382, -0.03417257,   0.012711468,  0.0028300495,
+       -0.024758482, -0.05098548,  -0.0821182,    0.014225672,  0.021544158,
+       0.08949725,   0.07505268,   -0.0020780868, 0.04908258,   0.06476295,
+       -0.022907063, 0.027562456,  0.040185735,   0.019567577,  -0.015598739,
+       -0.049097303, -0.017121866, -0.083368234,  -0.02332002,  -0.0840956});
+
+  lstm.SetInputGateBias(
+      {0.02234832,  0.14757581,   0.18176508,  0.10380666,  0.053110216,
+       -0.06928846, -0.13942584,  -0.11816189, 0.19483899,  0.03652339,
+       -0.10250295, 0.036714908,  -0.18426876, 0.036065217, 0.21810818,
+       0.02383196,  -0.043370757, 0.08690144,  -0.04444982, 0.00030581196});
+
+  lstm.SetForgetGateBias({0.035185695, -0.042891346, -0.03032477, 0.23027696,
+                          0.11098921,  0.15378423,   0.09263801,  0.09790885,
+                          0.09508917,  0.061199076,  0.07665568,  -0.015443159,
+                          -0.03499149, 0.046190713,  0.08895977,  0.10899629,
+                          0.40694186,  0.06030037,   0.012413437, -0.06108739});
+
+  lstm.SetCellBias({-0.024379363, 0.0055531194, 0.23377132,   0.033463873,
+                    -0.1483596,   -0.10639995,  -0.091433935, 0.058573797,
+                    -0.06809782,  -0.07889636,  -0.043246906, -0.09829136,
+                    -0.4279842,   0.034901652,  0.18797937,   0.0075234566,
+                    0.016178843,  0.1749513,    0.13975595,   0.92058027});
+
+  lstm.SetOutputGateBias(
+      {0.046159424,  -0.0012809046, 0.03563469,   0.12648113, 0.027195795,
+       0.35373217,   -0.018957434,  0.008907322,  -0.0762701, 0.12018895,
+       0.04216877,   0.0022856654,  0.040952638,  0.3147856,  0.08225149,
+       -0.057416286, -0.14995944,   -0.008040261, 0.13208859, 0.029760877});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.001374326,   -0.078856036,   0.10672688,    0.029162422,
+       -0.11585556,    0.02557986,     -0.13446963,   -0.035785314,
+       -0.01244275,    0.025961924,    -0.02337298,   -0.044228926,
+       -0.055839065,   -0.046598054,   -0.010546039,  -0.06900766,
+       0.027239809,    0.022582639,    -0.013296484,  -0.05459212,
+       0.08981,        -0.045407712,   0.08682226,    -0.06867011,
+       -0.14390695,    -0.02916037,    0.000996957,   0.091420636,
+       0.14283475,     -0.07390571,    -0.06402044,   0.062524505,
+       -0.093129106,   0.04860203,     -0.08364217,   -0.08119002,
+       0.009352075,    0.22920375,     0.0016303885,  0.11583097,
+       -0.13732095,    0.012405723,    -0.07551853,   0.06343048,
+       0.12162708,     -0.031923793,   -0.014335606,  0.01790974,
+       -0.10650317,    -0.0724401,     0.08554849,    -0.05727212,
+       0.06556731,     -0.042729504,   -0.043227166,  0.011683251,
+       -0.013082158,   -0.029302018,   -0.010899579,  -0.062036745,
+       -0.022509435,   -0.00964907,    -0.01567329,   0.04260106,
+       -0.07787477,    -0.11576462,    0.017356863,   0.048673786,
+       -0.017577527,   -0.05527947,    -0.082487635,  -0.040137455,
+       -0.10820036,    -0.04666372,    0.022746278,   -0.07851417,
+       0.01068115,     0.032956902,    0.022433773,   0.0026891115,
+       0.08944216,     -0.0685835,     0.010513544,   0.07228705,
+       0.02032331,     -0.059686817,   -0.0005566496, -0.086984694,
+       0.040414046,    -0.1380399,     0.094208956,   -0.05722982,
+       0.012092817,    -0.04989123,    -0.086576,     -0.003399834,
+       -0.04696032,    -0.045747425,   0.10091314,    0.048676282,
+       -0.029037097,   0.031399418,    -0.0040285117, 0.047237843,
+       0.09504992,     0.041799378,    -0.049185462,  -0.031518843,
+       -0.10516937,    0.026374253,    0.10058866,    -0.0033195973,
+       -0.041975245,   0.0073591834,   0.0033782164,  -0.004325073,
+       -0.10167381,    0.042500053,    -0.01447153,   0.06464186,
+       -0.017142897,   0.03312627,     0.009205989,   0.024138335,
+       -0.011337001,   0.035530265,    -0.010912711,  0.0706555,
+       -0.005894094,   0.051841937,    -0.1401738,    -0.02351249,
+       0.0365468,      0.07590991,     0.08838724,    0.021681072,
+       -0.10086113,    0.019608743,    -0.06195883,   0.077335775,
+       0.023646897,    -0.095322326,   0.02233014,    0.09756986,
+       -0.048691444,   -0.009579111,   0.07595467,    0.11480546,
+       -0.09801813,    0.019894179,    0.08502348,    0.004032281,
+       0.037211012,    0.068537936,    -0.048005626,  -0.091520436,
+       -0.028379958,   -0.01556313,    0.06554592,    -0.045599163,
+       -0.01672207,    -0.020169014,   -0.011877351,  -0.20212261,
+       0.010889619,    0.0047078193,   0.038385306,   0.08540671,
+       -0.017140968,   -0.0035865551,  0.016678626,   0.005633034,
+       0.015963363,    0.00871737,     0.060130805,   0.028611384,
+       0.10109069,     -0.015060172,   -0.07894427,   0.06401885,
+       0.011584063,    -0.024466386,   0.0047652307,  -0.09041358,
+       0.030737216,    -0.0046374933,  0.14215417,    -0.11823516,
+       0.019899689,    0.006106124,    -0.027092824,  0.0786356,
+       0.05052217,     -0.058925,      -0.011402121,  -0.024987547,
+       -0.0013661642,  -0.06832946,    -0.015667673,  -0.1083353,
+       -0.00096863037, -0.06988685,    -0.053350925,  -0.027275559,
+       -0.033664223,   -0.07978348,    -0.025200296,  -0.017207067,
+       -0.058403496,   -0.055697463,   0.005798788,   0.12965427,
+       -0.062582195,   0.0013350133,   -0.10482091,   0.0379771,
+       0.072521195,    -0.0029455067,  -0.13797039,   -0.03628521,
+       0.013806405,    -0.017858358,   -0.01008298,   -0.07700066,
+       -0.017081132,   0.019358726,    0.0027079724,  0.004635139,
+       0.062634714,    -0.02338735,    -0.039547626,  -0.02050681,
+       0.03385117,     -0.083611414,   0.002862572,   -0.09421313,
+       0.058618143,    -0.08598433,    0.00972939,    0.023867095,
+       -0.053934585,   -0.023203006,   0.07452513,    -0.048767887,
+       -0.07314807,    -0.056307215,   -0.10433547,   -0.06440842,
+       0.04328182,     0.04389765,     -0.020006588,  -0.09076438,
+       -0.11652589,    -0.021705797,   0.03345259,    -0.010329105,
+       -0.025767034,   0.013057034,    -0.07316461,   -0.10145612,
+       0.06358255,     0.18531723,     0.07759293,    0.12006465,
+       0.1305557,      0.058638252,    -0.03393652,   0.09622831,
+       -0.16253184,    -2.4580743e-06, 0.079869635,   -0.070196845,
+       -0.005644518,   0.06857898,     -0.12598175,   -0.035084512,
+       0.03156317,     -0.12794146,    -0.031963028,  0.04692781,
+       0.030070418,    0.0071660685,   -0.095516115,  -0.004643372,
+       0.040170413,    -0.062104587,   -0.0037324072, 0.0554317,
+       0.08184801,     -0.019164372,   0.06791302,    0.034257166,
+       -0.10307039,    0.021943003,    0.046745934,   0.0790918,
+       -0.0265588,     -0.007824208,   0.042546265,   -0.00977924,
+       -0.0002440307,  -0.017384544,   -0.017990116,  0.12252321,
+       -0.014512694,   -0.08251313,    0.08861942,    0.13589665,
+       0.026351685,    0.012641483,    0.07466548,    0.044301085,
+       -0.045414884,   -0.051112458,   0.03444247,    -0.08502782,
+       -0.04106223,    -0.028126027,   0.028473156,   0.10467447});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.057784554,  -0.026057621,  -0.068447545,   -0.022581743,
+       0.14811787,    0.10826372,    0.09471067,     0.03987225,
+       -0.0039523416, 0.00030638507, 0.053185795,    0.10572994,
+       0.08414449,    -0.022036452,  -0.00066928595, -0.09203576,
+       0.032950465,   -0.10985798,   -0.023809856,   0.0021431844,
+       -0.02196096,   -0.00326074,   0.00058621005,  -0.074678116,
+       -0.06193199,   0.055729095,   0.03736828,     0.020123724,
+       0.061878487,   -0.04729229,   0.034919553,    -0.07585433,
+       -0.04421272,   -0.044019096,  0.085488975,    0.04058006,
+       -0.06890133,   -0.030951202,  -0.024628663,   -0.07672815,
+       0.034293607,   0.08556707,    -0.05293577,    -0.033561368,
+       -0.04899627,   0.0241671,     0.015736353,    -0.095442444,
+       -0.029564252,  0.016493602,   -0.035026584,   0.022337519,
+       -0.026871363,  0.004780428,   0.0077918363,   -0.03601621,
+       0.016435321,   -0.03263031,   -0.09543275,    -0.047392778,
+       0.013454138,   0.028934088,   0.01685226,     -0.086110644,
+       -0.046250615,  -0.01847454,   0.047608484,    0.07339695,
+       0.034546845,   -0.04881143,   0.009128804,    -0.08802852,
+       0.03761666,    0.008096139,   -0.014454086,   0.014361001,
+       -0.023502491,  -0.0011840804, -0.07607001,    0.001856849,
+       -0.06509276,   -0.006021153,  -0.08570962,    -0.1451793,
+       0.060212336,   0.055259194,   0.06974018,     0.049454916,
+       -0.027794661,  -0.08077226,   -0.016179763,   0.1169753,
+       0.17213494,    -0.0056326236, -0.053934924,   -0.0124349,
+       -0.11520337,   0.05409887,    0.088759385,    0.0019655675,
+       0.0042065294,  0.03881498,    0.019844765,    0.041858196,
+       -0.05695512,   0.047233116,   0.038937137,    -0.06542224,
+       0.014429736,   -0.09719407,   0.13908425,     -0.05379757,
+       0.012321099,   0.082840554,   -0.029899208,   0.044217527,
+       0.059855383,   0.07711018,    -0.045319796,   0.0948846,
+       -0.011724666,  -0.0033288454, -0.033542685,   -0.04764985,
+       -0.13873616,   0.040668588,   0.034832682,    -0.015319203,
+       -0.018715994,  0.046002675,   0.0599172,      -0.043107376,
+       0.0294216,     -0.002314414,  -0.022424703,   0.0030315618,
+       0.0014641669,  0.0029166266,  -0.11878115,    0.013738511,
+       0.12375372,    -0.0006038222, 0.029104086,    0.087442465,
+       0.052958444,   0.07558703,    0.04817258,     0.044462286,
+       -0.015213451,  -0.08783778,   -0.0561384,     -0.003008196,
+       0.047060397,   -0.002058388,  0.03429439,     -0.018839769,
+       0.024734668,   0.024614193,   -0.042046934,   0.09597743,
+       -0.0043254104, 0.04320769,    0.0064070094,   -0.0019131786,
+       -0.02558259,   -0.022822596,  -0.023273505,   -0.02464396,
+       -0.10991725,   -0.006240552,  0.0074488563,   0.024044557,
+       0.04383914,    -0.046476185,  0.028658995,    0.060410924,
+       0.050786525,   0.009452605,   -0.0073054377,  -0.024810238,
+       0.0052906186,  0.0066939713,  -0.0020913032,  0.014515517,
+       0.015898481,   0.021362653,   -0.030262267,   0.016587038,
+       -0.011442813,  0.041154444,   -0.007631438,   -0.03423484,
+       -0.010977775,  0.036152758,   0.0066366293,   0.11915515,
+       0.02318443,    -0.041350313,  0.021485701,    -0.10906167,
+       -0.028218046,  -0.00954771,   0.020531068,    -0.11995105,
+       -0.03672871,   0.024019798,   0.014255957,    -0.05221243,
+       -0.00661567,   -0.04630967,   0.033188973,    0.10107534,
+       -0.014027541,  0.030796422,   -0.10270911,    -0.035999842,
+       0.15443139,    0.07684145,    0.036571592,    -0.035900835,
+       -0.0034699554, 0.06209149,    0.015920248,    -0.031122351,
+       -0.03858649,   0.01849943,    0.13872518,     0.01503974,
+       0.069941424,   -0.06948533,   -0.0088794185,  0.061282158,
+       -0.047401894,  0.03100163,    -0.041533746,   -0.10430945,
+       0.044574402,   -0.01425562,   -0.024290353,   0.034563623,
+       0.05866852,    0.023947537,   -0.09445152,    0.035450947,
+       0.02247216,    -0.0042998926, 0.061146557,    -0.10250651,
+       0.020881841,   -0.06747029,   0.10062043,     -0.0023941975,
+       0.03532124,    -0.016341697,  0.09685456,     -0.016764693,
+       0.051808182,   0.05875331,    -0.04536488,    0.001626336,
+       -0.028892258,  -0.01048663,   -0.009793449,   -0.017093895,
+       0.010987891,   0.02357273,    -0.00010856845, 0.0099760275,
+       -0.001845119,  -0.03551521,   0.0018358806,   0.05763657,
+       -0.01769146,   0.040995963,   0.02235177,     -0.060430344,
+       0.11475477,    -0.023854522,  0.10071741,     0.0686208,
+       -0.014250481,  0.034261297,   0.047418304,    0.08562733,
+       -0.030519066,  0.0060542435,  0.014653856,    -0.038836084,
+       0.04096551,    0.032249358,   -0.08355519,    -0.026823482,
+       0.056386515,   -0.010401743,  -0.028396193,   0.08507674,
+       0.014410365,   0.020995233,   0.17040324,     0.11511526,
+       0.02459721,    0.0066619175,  0.025853224,    -0.023133837,
+       -0.081302024,  0.017264642,   -0.009585969,   0.09491168,
+       -0.051313367,  0.054532815,   -0.014298593,   0.10657464,
+       0.007076659,   0.10964551,    0.0409152,      0.008275321,
+       -0.07283536,   0.07937492,    0.04192024,     -0.1075027});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.037322544,   0.018592842,   0.0056175636,  -0.06253426,
+       0.055647098,    -0.05713207,   -0.05626563,   0.005559383,
+       0.03375411,     -0.025757805,  -0.088049285,  0.06017052,
+       -0.06570978,    0.007384076,   0.035123326,   -0.07920549,
+       0.053676967,    0.044480428,   -0.07663568,   0.0071805613,
+       0.08089997,     0.05143358,    0.038261272,   0.03339287,
+       -0.027673481,   0.044746667,   0.028349208,   0.020090483,
+       -0.019443132,   -0.030755889,  -0.0040000007, 0.04465846,
+       -0.021585021,   0.0031670958,  0.0053199246,  -0.056117613,
+       -0.10893326,    0.076739706,   -0.08509834,   -0.027997585,
+       0.037871376,    0.01449768,    -0.09002357,   -0.06111149,
+       -0.046195522,   0.0422062,     -0.005683705,  -0.1253618,
+       -0.012925729,   -0.04890792,   0.06985068,    0.037654128,
+       0.03398274,     -0.004781977,  0.007032333,   -0.031787455,
+       0.010868644,    -0.031489216,  0.09525667,    0.013939797,
+       0.0058680447,   0.0167067,     0.02668468,    -0.04797466,
+       -0.048885044,   -0.12722108,   0.035304096,   0.06554885,
+       0.00972396,     -0.039238118,  -0.05159735,   -0.11329045,
+       0.1613692,      -0.03750952,   0.06529313,    -0.071974665,
+       -0.11769596,    0.015524369,   -0.0013754242, -0.12446318,
+       0.02786344,     -0.014179351,  0.005264273,   0.14376344,
+       0.015983658,    0.03406988,    -0.06939408,   0.040699873,
+       0.02111075,     0.09669095,    0.041345075,   -0.08316494,
+       -0.07684199,    -0.045768797,  0.032298047,   -0.041805092,
+       0.0119405,      0.0061010392,  0.12652606,    0.0064572375,
+       -0.024950314,   0.11574242,    0.04508852,    -0.04335324,
+       0.06760663,     -0.027437469,  0.07216407,    0.06977076,
+       -0.05438599,    0.034033038,   -0.028602652,  0.05346137,
+       0.043184172,    -0.037189785,  0.10420091,    0.00882477,
+       -0.054019816,   -0.074273005,  -0.030617684,  -0.0028467078,
+       0.024302477,    -0.0038869337, 0.005332455,   0.0013399826,
+       0.04361412,     -0.007001822,  0.09631092,    -0.06702025,
+       -0.042049985,   -0.035070654,  -0.04103342,   -0.10273396,
+       0.0544271,      0.037184782,   -0.13150354,   -0.0058036847,
+       -0.008264958,   0.042035464,   0.05891794,    0.029673764,
+       0.0063542654,   0.044788733,   0.054816857,   0.062257513,
+       -0.00093483756, 0.048938446,   -0.004952862,  -0.007730018,
+       -0.04043371,    -0.017094059,  0.07229206,    -0.023670016,
+       -0.052195564,   -0.025616996,  -0.01520939,   0.045104615,
+       -0.007376126,   0.003533447,   0.006570588,   0.056037236,
+       0.12436656,     0.051817212,   0.028532185,   -0.08686856,
+       0.11868599,     0.07663395,    -0.07323171,   0.03463402,
+       -0.050708205,   -0.04458982,   -0.11590894,   0.021273347,
+       0.1251325,      -0.15313013,   -0.12224372,   0.17228661,
+       0.023029093,    0.086124025,   0.006445803,   -0.03496501,
+       0.028332196,    0.04449512,    -0.042436164,  -0.026587414,
+       -0.006041347,   -0.09292539,   -0.05678812,   0.03897832,
+       0.09465633,     0.008115513,   -0.02171956,   0.08304309,
+       0.071401566,    0.019622514,   0.032163795,   -0.004167056,
+       0.02295182,     0.030739572,   0.056506045,   0.004612461,
+       0.06524936,     0.059999723,   0.046395954,   -0.0045512207,
+       -0.1335546,     -0.030136576,  0.11584653,    -0.014678886,
+       0.0020118146,   -0.09688814,   -0.0790206,    0.039770417,
+       -0.0329582,     0.07922767,    0.029322514,   0.026405897,
+       0.04207835,     -0.07073373,   0.063781224,   0.0859677,
+       -0.10925287,    -0.07011058,   0.048005477,   0.03438226,
+       -0.09606514,    -0.006669445,  -0.043381985,  0.04240257,
+       -0.06955775,    -0.06769346,   0.043903265,   -0.026784198,
+       -0.017840602,   0.024307009,   -0.040079936,  -0.019946516,
+       0.045318738,    -0.12233574,   0.026170589,   0.0074471775,
+       0.15978073,     0.10185836,    0.10298046,    -0.015476589,
+       -0.039390966,   -0.072174534,  0.0739445,     -0.1211869,
+       -0.0347889,     -0.07943156,   0.014809798,   -0.12412325,
+       -0.0030663363,  0.039695457,   0.0647603,     -0.08291318,
+       -0.018529687,   -0.004423833,  0.0037507233,  0.084633216,
+       -0.01514876,    -0.056505352,  -0.012800942,  -0.06994386,
+       0.012962922,    -0.031234352,  0.07029052,    0.016418684,
+       0.03618972,     0.055686004,   -0.08663945,   -0.017404709,
+       -0.054761406,   0.029065743,   0.052404847,   0.020238016,
+       0.0048197987,   -0.0214882,    0.07078733,    0.013016777,
+       0.06262858,     0.009184685,   0.020785125,   -0.043904778,
+       -0.0270329,     -0.03299152,   -0.060088247,  -0.015162964,
+       -0.001828936,   0.12642565,    -0.056757294,  0.013586685,
+       0.09232601,     -0.035886683,  0.06000002,    0.05229691,
+       -0.052580316,   -0.082029596,  -0.010794592,  0.012947712,
+       -0.036429964,   -0.085508935,  -0.13127148,   -0.017744139,
+       0.031502828,    0.036232427,   -0.031581745,  0.023051167,
+       -0.05325106,    -0.03421577,   0.028793324,   -0.034633752,
+       -0.009881397,   -0.043551125,  -0.018609839,  0.0019097115,
+       -0.008799762,   0.056595087,   0.0022273948,  0.055752404});
+
+  lstm.SetRecurrentToOutputWeights({
+      0.025825322,   -0.05813119,  0.09495884,   -0.045984812,   -0.01255415,
+      -0.0026479573, -0.08196161,  -0.054914974, -0.0046604523,  -0.029587349,
+      -0.044576716,  -0.07480124,  -0.082868785, 0.023254942,    0.027502948,
+      -0.0039728214, -0.08683098,  -0.08116779,  -0.014675607,   -0.037924774,
+      -0.023314456,  -0.007401714, -0.09255757,  0.029460307,    -0.08829125,
+      -0.005139627,  -0.08989442,  -0.0555066,   0.13596267,     -0.025062224,
+      -0.048351806,  -0.03850004,  0.07266485,   -0.022414139,   0.05940088,
+      0.075114764,   0.09597592,   -0.010211725, -0.0049794707,  -0.011523867,
+      -0.025980417,  0.072999895,  0.11091378,   -0.081685916,   0.014416728,
+      0.043229222,   0.034178585,  -0.07530371,  0.035837382,    -0.085607,
+      -0.007721233,  -0.03287832,  -0.043848954, -0.06404588,    -0.06632928,
+      -0.073643476,  0.008214239,  -0.045984086, 0.039764922,    0.03474462,
+      0.060612556,   -0.080590084, 0.049127717,  0.04151091,     -0.030063879,
+      0.008801774,   -0.023021035, -0.019558564, 0.05158114,     -0.010947698,
+      -0.011825728,  0.0075720972, 0.0699727,    -0.0039981045,  0.069350146,
+      0.08799282,    0.016156472,  0.035502106,  0.11695009,     0.006217345,
+      0.13392477,    -0.037875112, 0.025745004,  0.08940699,     -0.00924166,
+      0.0046702605,  -0.036598757, -0.08811812,  0.10522024,     -0.032441203,
+      0.008176899,   -0.04454919,  0.07058152,   0.0067963637,   0.039206743,
+      0.03259838,    0.03725492,   -0.09515802,  0.013326398,    -0.052055415,
+      -0.025676316,  0.03198509,   -0.015951829, -0.058556724,   0.036879618,
+      0.043357447,   0.028362012,  -0.05908629,  0.0059240665,   -0.04995891,
+      -0.019187413,  0.0276265,    -0.01628143,  0.0025863599,   0.08800015,
+      0.035250366,   -0.022165963, -0.07328642,  -0.009415526,   -0.07455109,
+      0.11690406,    0.0363299,    0.07411125,   0.042103454,    -0.009660886,
+      0.019076364,   0.018299393,  -0.046004917, 0.08891175,     0.0431396,
+      -0.026327137,  -0.051502608, 0.08979574,   -0.051670972,   0.04940282,
+      -0.07491107,   -0.021240504, 0.022596184,  -0.034280192,   0.060163025,
+      -0.058211457,  -0.051837247, -0.01349775,  -0.04639988,    -0.035936575,
+      -0.011681591,  0.064818054,  0.0073146066, -0.021745546,   -0.043124277,
+      -0.06471268,   -0.07053354,  -0.029321948, -0.05330136,    0.016933719,
+      -0.053782392,  0.13747959,   -0.1361751,   -0.11569455,    0.0033329215,
+      0.05693899,    -0.053219706, 0.063698,     0.07977434,     -0.07924483,
+      0.06936997,    0.0034815092, -0.007305279, -0.037325785,   -0.07251102,
+      -0.033633437,  -0.08677009,  0.091591336,  -0.14165086,    0.021752775,
+      0.019683983,   0.0011612234, -0.058154266, 0.049996935,    0.0288841,
+      -0.0024567875, -0.14345716,  0.010955264,  -0.10234828,    0.1183656,
+      -0.0010731248, -0.023590032, -0.072285876, -0.0724771,     -0.026382286,
+      -0.0014920527, 0.042667855,  0.0018776858, 0.02986552,     0.009814309,
+      0.0733756,     0.12289186,   0.018043943,  -0.0458958,     0.049412545,
+      0.033632483,   0.05495232,   0.036686596,  -0.013781798,   -0.010036754,
+      0.02576849,    -0.08307328,  0.010112348,  0.042521734,    -0.05869831,
+      -0.071689695,  0.03876447,   -0.13275425,  -0.0352966,     -0.023077697,
+      0.10285965,    0.084736146,  0.15568255,   -0.00040734606, 0.027835453,
+      -0.10292561,   -0.032401145, 0.10053256,   -0.026142767,   -0.08271222,
+      -0.0030240538, -0.016368777, 0.1070414,    0.042672627,    0.013456989,
+      -0.0437609,    -0.022309763, 0.11576483,   0.04108048,     0.061026827,
+      -0.0190714,    -0.0869359,   0.037901703,  0.0610107,      0.07202949,
+      0.01675338,    0.086139716,  -0.08795751,  -0.014898893,   -0.023771819,
+      -0.01965048,   0.007955471,  -0.043740474, 0.03346837,     -0.10549954,
+      0.090567775,   0.042013682,  -0.03176985,  0.12569028,     -0.02421228,
+      -0.029526481,  0.023851605,  0.031539805,  0.05292009,     -0.02344001,
+      -0.07811758,   -0.08834428,  0.10094801,   0.16594367,     -0.06861939,
+      -0.021256343,  -0.041093912, -0.06669611,  0.035498552,    0.021757556,
+      -0.09302526,   -0.015403468, -0.06614931,  -0.051798206,   -0.013874718,
+      0.03630673,    0.010412845,  -0.08077351,  0.046185967,    0.0035662893,
+      0.03541868,    -0.094149634, -0.034814864, 0.003128424,    -0.020674974,
+      -0.03944324,   -0.008110165, -0.11113267,  0.08484226,     0.043586485,
+      0.040582247,   0.0968012,    -0.065249965, -0.028036479,   0.0050708856,
+      0.0017462453,  0.0326779,    0.041296225,  0.09164146,     -0.047743853,
+      -0.015952192,  -0.034451712, 0.084197424,  -0.05347844,    -0.11768019,
+      0.085926116,   -0.08251791,  -0.045081906, 0.0948852,      0.068401024,
+      0.024856757,   0.06978981,   -0.057309967, -0.012775832,   -0.0032452994,
+      0.01977615,    -0.041040014, -0.024264973, 0.063464895,    0.05431621,
+  });
+
+  lstm.SetCellToInputWeights(
+      {0.040369894, 0.030746894,  0.24704495,  0.018586371,  -0.037586458,
+       -0.15312155, -0.11812848,  -0.11465643, 0.20259799,   0.11418174,
+       -0.10116027, -0.011334949, 0.12411352,  -0.076769054, -0.052169047,
+       0.21198851,  -0.38871562,  -0.09061183, -0.09683246,  -0.21929175});
+
+  lstm.SetCellToForgetWeights(
+      {-0.01998659,  -0.15568835,  -0.24248174,   -0.012770197, 0.041331276,
+       -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766,
+       -0.047248036, 0.021479502,  0.033189066,   0.11952997,   -0.020432774,
+       0.64658105,   -0.06650122,  -0.03467612,   0.095340036,  0.23647355});
+
+  lstm.SetCellToOutputWeights(
+      {0.08286371,  -0.08261836, -0.51210177, 0.002913762, 0.17764764,
+       -0.5495371,  -0.08460716, -0.24552552, 0.030037103, 0.04123544,
+       -0.11940523, 0.007358328, 0.1890978,   0.4833202,   -0.34441817,
+       0.36312827,  -0.26375428, 0.1457655,   -0.19724406, 0.15548733});
+
+  lstm.SetProjectionWeights(
+      {-0.009802181,  0.09401916,    0.0717386,     -0.13895074,  0.09641832,
+       0.060420845,   0.08539281,    0.054285463,   0.061395317,  0.034448683,
+       -0.042991187,  0.019801661,   -0.16840284,   -0.015726732, -0.23041931,
+       -0.024478018,  -0.10959692,   -0.013875541,  0.18600968,   -0.061274476,
+       0.0138165,     -0.08160894,   -0.07661644,   0.032372914,  0.16169067,
+       0.22465782,    -0.03993472,   -0.004017731,  0.08633481,   -0.28869787,
+       0.08682067,    0.17240396,    0.014975425,   0.056431185,  0.031037588,
+       0.16702051,    0.0077946745,  0.15140012,    0.29405436,   0.120285,
+       -0.188994,     -0.027265169,  0.043389652,   -0.022061434, 0.014777949,
+       -0.20203483,   0.094781205,   0.19100232,    0.13987629,   -0.036132768,
+       -0.06426278,   -0.05108664,   0.13221376,    0.009441198,  -0.16715929,
+       0.15859416,    -0.040437475,  0.050779544,   -0.022187516, 0.012166504,
+       0.027685808,   -0.07675938,   -0.0055694645, -0.09444123,  0.0046453946,
+       0.050794356,   0.10770313,    -0.20790008,   -0.07149004,  -0.11425117,
+       0.008225835,   -0.035802525,  0.14374903,    0.15262283,   0.048710253,
+       0.1847461,     -0.007487823,  0.11000021,    -0.09542012,  0.22619456,
+       -0.029149994,  0.08527916,    0.009043713,   0.0042746216, 0.016261552,
+       0.022461696,   0.12689082,    -0.043589946,  -0.12035478,  -0.08361797,
+       -0.050666027,  -0.1248618,    -0.1275799,    -0.071875185, 0.07377272,
+       0.09944291,    -0.18897448,   -0.1593054,    -0.06526116,  -0.040107165,
+       -0.004618631,  -0.067624845,  -0.007576253,  0.10727444,   0.041546922,
+       -0.20424393,   0.06907816,    0.050412357,   0.00724631,   0.039827548,
+       0.12449835,    0.10747581,    0.13708383,    0.09134148,   -0.12617786,
+       -0.06428341,   0.09956831,    0.1208086,     -0.14676677,  -0.0727722,
+       0.1126304,     0.010139365,   0.015571211,   -0.038128063, 0.022913318,
+       -0.042050496,  0.16842307,    -0.060597885,  0.10531834,   -0.06411776,
+       -0.07451711,   -0.03410368,   -0.13393489,   0.06534304,   0.003620307,
+       0.04490757,    0.05970546,    0.05197996,    0.02839995,   0.10434969,
+       -0.013699693,  -0.028353551,  -0.07260381,   0.047201227,  -0.024575593,
+       -0.036445823,  0.07155557,    0.009672501,   -0.02328883,  0.009533515,
+       -0.03606021,   -0.07421458,   -0.028082801,  -0.2678904,   -0.13221288,
+       0.18419984,    -0.13012612,   -0.014588381,  -0.035059117, -0.04824723,
+       0.07830115,    -0.056184657,  0.03277091,    0.025466874,  0.14494097,
+       -0.12522776,   -0.098633975,  -0.10766018,   -0.08317623,  0.08594209,
+       0.07749552,    0.039474737,   0.1776665,     -0.07409566,  -0.0477268,
+       0.29323658,    0.10801441,    0.1154011,     0.013952499,  0.10739139,
+       0.10708251,    -0.051456142,  0.0074137426,  -0.10430189,  0.10034707,
+       0.045594677,   0.0635285,     -0.0715442,    -0.089667566, -0.10811871,
+       0.00026344223, 0.08298446,    -0.009525053,  0.006585689,  -0.24567553,
+       -0.09450807,   0.09648481,    0.026996298,   -0.06419476,  -0.04752702,
+       -0.11063944,   -0.23441927,   -0.17608605,   -0.052156363, 0.067035615,
+       0.19271925,    -0.0032889997, -0.043264326,  0.09663576,   -0.057112187,
+       -0.10100678,   0.0628376,     0.04447668,    0.017961001,  -0.10094388,
+       -0.10190601,   0.18335468,    0.10494553,    -0.052095775, -0.0026118709,
+       0.10539724,    -0.04383912,   -0.042349473,  0.08438151,   -0.1947263,
+       0.02251204,    0.11216432,    -0.10307853,   0.17351969,   -0.039091777,
+       0.08066188,    -0.00561982,   0.12633002,    0.11335965,   -0.0088127935,
+       -0.019777594,  0.06864014,    -0.059751723,  0.016233567,  -0.06894641,
+       -0.28651384,   -0.004228674,  0.019708522,   -0.16305895,  -0.07468996,
+       -0.0855457,    0.099339016,   -0.07580735,   -0.13775392,  0.08434318,
+       0.08330512,    -0.12131499,   0.031935584,   0.09180414,   -0.08876437,
+       -0.08049874,   0.008753825,   0.03498998,    0.030215185,  0.03907079,
+       0.089751154,   0.029194152,   -0.03337423,   -0.019092513, 0.04331237,
+       0.04299654,    -0.036394123,  -0.12915532,   0.09793732,   0.07512415,
+       -0.11319543,   -0.032502122,  0.15661901,    0.07671967,   -0.005491124,
+       -0.19379048,   -0.218606,     0.21448623,    0.017840758,  0.1416943,
+       -0.07051762,   0.19488361,    0.02664691,    -0.18104725,  -0.09334311,
+       0.15026465,    -0.15493552,   -0.057762887,  -0.11604192,  -0.262013,
+       -0.01391798,   0.012185008,   0.11156489,    -0.07483202,  0.06693364,
+       -0.26151478,   0.046425626,   0.036540434,   -0.16435726,  0.17338543,
+       -0.21401681,   -0.11385144,   -0.08283257,   -0.069031075, 0.030635102,
+       0.010969227,   0.11109743,    0.010919218,   0.027526086,  0.13519906,
+       0.01891392,    -0.046839405,  -0.040167913,  0.017953383,  -0.09700955,
+       0.0061885654,  -0.07000971,   0.026893595,   -0.038844477, 0.14543656});
+
+  static float lstm_input[][20] = {
+      {// Batch0: 4 (input_sequence_size) * 5 (n_input)
+       0.787926, 0.151646, 0.071352, 0.118426, 0.458058, 0.596268, 0.998386,
+       0.568695, 0.864524, 0.571277, 0.073204, 0.296072, 0.743333, 0.069199,
+       0.045348, 0.867394, 0.291279, 0.013714, 0.482521, 0.626339},
+
+      {// Batch1: 4 (input_sequence_size) * 5 (n_input)
+       0.295743, 0.544053, 0.690064, 0.858138, 0.497181, 0.642421, 0.524260,
+       0.134799, 0.003639, 0.162482, 0.640394, 0.930399, 0.050782, 0.432485,
+       0.988078, 0.082922, 0.563329, 0.865614, 0.333232, 0.259916}};
+
+  static float lstm_golden_output[][64] = {
+      {// Batch0: 4 (input_sequence_size) * 16 (n_output)
+       -0.00396806, 0.029352,     -0.00279226, 0.0159977,   -0.00835576,
+       -0.0211779,  0.0283512,    -0.0114597,  0.00907307,  -0.0244004,
+       -0.0152191,  -0.0259063,   0.00914318,  0.00415118,  0.017147,
+       0.0134203,   -0.0166936,   0.0381209,   0.000889694, 0.0143363,
+       -0.0328911,  -0.0234288,   0.0333051,   -0.012229,   0.0110322,
+       -0.0457725,  -0.000832209, -0.0202817,  0.0327257,   0.0121308,
+       0.0155969,   0.0312091,    -0.0213783,  0.0350169,   0.000324794,
+       0.0276012,   -0.0263374,   -0.0371449,  0.0446149,   -0.0205474,
+       0.0103729,   -0.0576349,   -0.0150052,  -0.0292043,  0.0376827,
+       0.0136115,   0.0243435,    0.0354492,   -0.0189322,  0.0464512,
+       -0.00251373, 0.0225745,    -0.0308346,  -0.0317124,  0.0460407,
+       -0.0189395,  0.0149363,    -0.0530162,  -0.0150767,  -0.0340193,
+       0.0286833,   0.00824207,   0.0264887,   0.0305169},
+      {// Batch1: 4 (input_sequence_size) * 16 (n_output)
+       -0.013869,    0.0287268,   -0.00334693, 0.00733398,  -0.0287926,
+       -0.0186926,   0.0193662,   -0.0115437,  0.00422612,  -0.0345232,
+       0.00223253,   -0.00957321, 0.0210624,   0.013331,    0.0150954,
+       0.02168,      -0.0141913,  0.0322082,   0.00227024,  0.0260507,
+       -0.0188721,   -0.0296489,  0.0399134,   -0.0160509,  0.0116039,
+       -0.0447318,   -0.0150515,  -0.0277406,  0.0316596,   0.0118233,
+       0.0214762,    0.0293641,   -0.0204549,  0.0450315,   -0.00117378,
+       0.0167673,    -0.0375007,  -0.0238314,  0.038784,    -0.0174034,
+       0.0131743,    -0.0506589,  -0.0048447,  -0.0240239,  0.0325789,
+       0.00790065,   0.0220157,   0.0333314,   -0.0264787,  0.0387855,
+       -0.000764675, 0.0217599,   -0.037537,   -0.0335206,  0.0431679,
+       -0.0211424,   0.010203,    -0.062785,   -0.00832363, -0.025181,
+       0.0412031,    0.0118723,   0.0239643,   0.0394009}};
+
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
+
+  for (int i = 0; i < lstm.sequence_length(); i++) {
+    float* batch0_start = lstm_input[0] + i * lstm.num_inputs();
+    float* batch0_end = batch0_start + lstm.num_inputs();
+
+    lstm.SetInput(2 * i * lstm.num_inputs(), batch0_start, batch0_end);
+
+    float* batch1_start = lstm_input[1] + i * lstm.num_inputs();
+    float* batch1_end = batch1_start + lstm.num_inputs();
+    lstm.SetInput((2 * i + 1) * lstm.num_inputs(), batch1_start, batch1_end);
+  }
+
+  lstm.Invoke();
+
+  std::vector<float> expected;
+  for (int i = 0; i < lstm.sequence_length(); i++) {
+    float* golden_start_batch0 = lstm_golden_output[0] + i * lstm.num_outputs();
+    float* golden_end_batch0 = golden_start_batch0 + lstm.num_outputs();
+    float* golden_start_batch1 = lstm_golden_output[1] + i * lstm.num_outputs();
+    float* golden_end_batch1 = golden_start_batch1 + lstm.num_outputs();
+    expected.insert(expected.end(), golden_start_batch0, golden_end_batch0);
+    expected.insert(expected.end(), golden_start_batch1, golden_end_batch1);
+  }
+  EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 4b0c853f77..4d8a6d10c8 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -463,6 +463,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
     case BuiltinOperator_LSTM: {
       TfLiteLSTMParams* params = MallocPOD<TfLiteLSTMParams>();
       if (auto* lstm_params = op->builtin_options_as_LSTMOptions()) {
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index b3602f799e..998a7b7614 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -322,6 +322,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
+      case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
       case tflite::BuiltinOperator_L2_NORMALIZATION:
       case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
       case tflite::BuiltinOperator_MUL:
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 260a87c93b..6fcd3e51a4 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -117,6 +117,7 @@ enum BuiltinOperator : byte {
   SUB = 41,
   DIV = 42,
   SQUEEZE = 43,
+  UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
 }
 
 // Options for the builtin operators.
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index fd98be8f70..6eb9ae2926 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -206,11 +206,12 @@ enum BuiltinOperator {
   BuiltinOperator_SUB = 41,
   BuiltinOperator_DIV = 42,
   BuiltinOperator_SQUEEZE = 43,
+  BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_SQUEEZE
+  BuiltinOperator_MAX = BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[41] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[42] {
   static BuiltinOperator values[] = {
       BuiltinOperator_ADD,
       BuiltinOperator_AVERAGE_POOL_2D,
@@ -252,7 +253,8 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[41] {
       BuiltinOperator_MEAN,
       BuiltinOperator_SUB,
       BuiltinOperator_DIV,
-      BuiltinOperator_SQUEEZE};
+      BuiltinOperator_SQUEEZE,
+      BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM};
   return values;
 }
 
@@ -301,6 +303,7 @@ inline const char **EnumNamesBuiltinOperator() {
                                 "SUB",
                                 "DIV",
                                 "SQUEEZE",
+                                "UNIDIRECTIONAL_SEQUENCE_LSTM",
                                 nullptr};
   return names;
 }
-- 
GitLab


From 4af0e15de653c148608a61ae2575acd493160795 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Wed, 24 Jan 2018 09:30:10 +0900
Subject: [PATCH 044/966] fix typo

---
 tensorflow/python/estimator/canned/dnn_testing_utils.py    | 2 +-
 tensorflow/python/estimator/canned/linear_testing_utils.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index 2bdec69303..706575985f 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -877,7 +877,7 @@ class BaseDNNWarmStartingTest(object):
 
     # Create a second DNNClassifier, warm-started from the first.  Use a
     # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).  Use a a new FeatureColumn with a
+    # accumulator values that change).  Use a new FeatureColumn with a
     # different vocabulary for occupation.
     new_vocab_list = ['doctor', 'consultant', 'engineer']
     new_vocab_file = os.path.join(self._ckpt_and_vocab_dir,
diff --git a/tensorflow/python/estimator/canned/linear_testing_utils.py b/tensorflow/python/estimator/canned/linear_testing_utils.py
index cccb9af4b2..3e9183cf1b 100644
--- a/tensorflow/python/estimator/canned/linear_testing_utils.py
+++ b/tensorflow/python/estimator/canned/linear_testing_utils.py
@@ -2003,7 +2003,7 @@ class BaseLinearWarmStartingTest(object):
 
     # Create a second LinearClassifier, warm-started from the first.  Use a
     # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).  Use a a new FeatureColumn with a
+    # accumulator values that change).  Use a new FeatureColumn with a
     # different vocabulary for occupation.
     new_vocab_list = ['doctor', 'consultant', 'engineer']
     new_vocab_file = os.path.join(self._ckpt_and_vocab_dir,
-- 
GitLab


From 5dfcb10d324873ccd7d5f6cca7bc503cea9cd9cc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 16:38:55 -0800
Subject: [PATCH 045/966] Enable and fix some bfloat16 tests.

PiperOrigin-RevId: 183013346
---
 tensorflow/compiler/tests/binary_ops_test.py | 14 ++++++++++----
 tensorflow/compiler/tests/image_ops_test.py  | 15 ++++++++++-----
 tensorflow/compiler/tests/unary_ops_test.py  |  6 ++++--
 tensorflow/core/kernels/pack_op.cc           |  1 -
 tensorflow/core/ops/image_ops.cc             |  6 ++++--
 tensorflow/python/framework/test_util.py     |  6 +++++-
 tensorflow/python/lib/core/bfloat16_test.py  | 19 +++++++++++++++++++
 7 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 65706b35d6..16856bd736 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -43,7 +43,7 @@ class BinaryOpsTest(XLATestCase):
         output = op(pa, pb)
       result = session.run(output, {pa: a, pb: b})
       if equality_test is None:
-        equality_test = self.assertAllClose
+        equality_test = self.assertAllCloseAccordingToType
       equality_test(result, expected, rtol=1e-3)
 
   def _testSymmetricBinary(self, op, a, b, expected, equality_test=None):
@@ -54,14 +54,20 @@ class BinaryOpsTest(XLATestCase):
     """Tests closeness of two lists of floats."""
     self.assertEqual(len(result), len(expected))
     for i in range(len(result)):
-      self.assertAllClose(result[i], expected[i], rtol)
+      self.assertAllCloseAccordingToType(result[i], expected[i], rtol)
 
   def testFloatOps(self):
     for dtype in self.float_types:
+      if dtype == dtypes.bfloat16.as_numpy_dtype:
+        a = -1.01
+        b = 4.1
+      else:
+        a = -1.001
+        b = 4.01
       self._testBinary(
           lambda x, y: math_ops.approximate_equal(x, y, tolerance=0.0001),
-          np.array([[[[-1, 2.00009999], [-3, 4.01]]]], dtype=dtype),
-          np.array([[[[-1.001, 2], [-3.00009, 4]]]], dtype=dtype),
+          np.array([[[[-1, 2.00009999], [-3, b]]]], dtype=dtype),
+          np.array([[[[a, 2], [-3.00009, 4]]]], dtype=dtype),
           expected=np.array([[[[False, True], [True, False]]]], dtype=dtype))
 
       self._testBinary(
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index e84b790037..538fa8e8e5 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -65,7 +65,7 @@ class RGBToHSVTest(XLATestCase):
       # Verify that processing batch elements together is the same as separate
       self.assertAllClose(batch1, join1)
       self.assertAllClose(batch2, join2)
-      self.assertAllClose(batch2, inp)
+      self.assertAllCloseAccordingToType(batch2, inp, bfloat16_atol=0.03)
 
   def testRGBToHSVRoundTrip(self):
     data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
@@ -77,21 +77,25 @@ class RGBToHSVTest(XLATestCase):
           hsv = image_ops.rgb_to_hsv(placeholder)
           rgb = image_ops.hsv_to_rgb(hsv)
         rgb_tf = rgb.eval(feed_dict={placeholder: rgb_np})
-      self.assertAllClose(rgb_tf, rgb_np)
+      self.assertAllCloseAccordingToType(rgb_tf, rgb_np, bfloat16_atol=0.03)
 
   def testRGBToHSVNumpy(self):
     """Tests the RGB to HSV conversion matches a reference implementation."""
     for nptype in self.float_types:
       rgb_flat = np.random.random(64 * 3).reshape((64, 3)).astype(nptype)
       rgb_np = rgb_flat.reshape(4, 4, 4, 3)
-      hsv_np = np.array([colorsys.rgb_to_hsv(r, g, b) for r, g, b in rgb_flat])
+      hsv_np = np.array([
+          colorsys.rgb_to_hsv(
+              r.astype(np.float64), g.astype(np.float64), b.astype(np.float64))
+          for r, g, b in rgb_flat
+      ])
       hsv_np = hsv_np.reshape(4, 4, 4, 3)
       with self.test_session():
         placeholder = array_ops.placeholder(nptype)
         with self.test_scope():
           hsv_op = image_ops.rgb_to_hsv(placeholder)
         hsv_tf = hsv_op.eval(feed_dict={placeholder: rgb_np})
-      self.assertAllClose(hsv_tf, hsv_np)
+      self.assertAllCloseAccordingToType(hsv_tf, hsv_np)
 
 
 class AdjustContrastTest(XLATestCase):
@@ -427,7 +431,8 @@ class ResizeBilinearTest(XLATestCase):
           np.zeros([1, input_shape[0], input_shape[1], 1], dtype=dtype),
           align_corners=True)
       out = sess.run(resized, {grads: grads_np[np.newaxis, :, :, np.newaxis]})
-      self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out)
+      self.assertAllCloseAccordingToType(expected[np.newaxis, :, :, np.newaxis],
+                                         out)
 
   def testAlignCorners1x2To3x2(self):
     for dtype in self.float_types:
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 0a6fe04d3c..8e4b8a3833 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -67,8 +67,10 @@ class UnaryOpsTest(XLATestCase):
         output = op(pinp)
       result = session.run(output, {pinp: inp})
       if equality_test is None:
-        equality_test = self.assertAllCloseAccordingToType
-      equality_test(result, expected, rtol=rtol, atol=atol)
+        self.assertAllCloseAccordingToType(
+            result, expected, rtol=rtol, atol=atol, bfloat16_rtol=0.03)
+      else:
+        equality_test(result, expected, rtol=rtol, atol=atol)
 
   def ListsAreClose(self, result, expected, rtol, atol):
     """Tests closeness of two lists of floats."""
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 2923c38662..2033fbf5dc 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -139,7 +139,6 @@ class PackOp : public OpKernel {
 
 TF_CALL_ALL_TYPES(REGISTER_PACK);
 TF_CALL_QUANTIZED_TYPES(REGISTER_PACK);
-TF_CALL_bfloat16(REGISTER_PACK);
 TF_CALL_variant(REGISTER_PACK);
 
 #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION)
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 31cc662d21..7484ebb078 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -181,7 +181,9 @@ REGISTER_OP("ResizeBilinear")
     .Input("images: T")
     .Input("size: int32")
     .Output("resized_images: float")
-    .Attr("T: {int8, uint8, int16, uint16, int32, int64, half, float, double}")
+    .Attr(
+        "T: {int8, uint8, int16, uint16, int32, int64, bfloat16, half, "
+        "float, double}")
     .Attr("align_corners: bool = false")
     .SetShapeFn(ResizeShapeFn);
 
@@ -212,7 +214,7 @@ REGISTER_OP("ResizeBilinearGrad")
     .Input("grads: float")
     .Input("original_image: T")
     .Output("output: T")
-    .Attr("T: {float, half, double}")
+    .Attr("T: {float, bfloat16, half, double}")
     .Attr("align_corners: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(1));
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 8124472a83..7b09891fc1 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1130,7 +1130,11 @@ class TensorFlowTestCase(googletest.TestCase):
       print("not close dif = ", np.abs(x - y))
       print("not close tol = ", atol + rtol * np.abs(y))
       print("dtype = %s, shape = %s" % (a.dtype, a.shape))
-      np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, err_msg=msg)
+      # TODO(xpan): There seems to be a bug:
+      # tensorflow/compiler/tests:binary_ops_test pass with float32
+      # nan even though the equal_nan is False by default internally.
+      np.testing.assert_allclose(
+          a, b, rtol=rtol, atol=atol, err_msg=msg, equal_nan=True)
 
   def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
     """Asserts that two numpy arrays, or dicts of same, have near values.
diff --git a/tensorflow/python/lib/core/bfloat16_test.py b/tensorflow/python/lib/core/bfloat16_test.py
index 985a11272c..09d4b01fa4 100644
--- a/tensorflow/python/lib/core/bfloat16_test.py
+++ b/tensorflow/python/lib/core/bfloat16_test.py
@@ -25,6 +25,7 @@ import numpy as np
 
 # pylint: disable=unused-import,g-bad-import-order
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import dtypes
 from tensorflow.python.platform import test
 
 
@@ -160,6 +161,24 @@ class Bfloat16Test(test.TestCase):
       for w in self.float_values():
         self.assertEqual(v != w, bfloat16(v) != bfloat16(w))
 
+  def testNan(self):
+    a = np.isnan(bfloat16(float("nan")))
+    self.assertTrue(a)
+    np.testing.assert_allclose(np.array([1.0, a]), np.array([1.0, a]))
+
+    a = np.array(
+        [bfloat16(1.34375),
+         bfloat16(1.4375),
+         bfloat16(float("nan"))],
+        dtype=dtypes.bfloat16.as_numpy_dtype)
+    b = np.array(
+        [bfloat16(1.3359375),
+         bfloat16(1.4375),
+         bfloat16(float("nan"))],
+        dtype=dtypes.bfloat16.as_numpy_dtype)
+    np.testing.assert_allclose(
+        a, b, rtol=0.1, atol=0.1, equal_nan=True, err_msg="", verbose=True)
+
 
 class Bfloat16NumPyTest(test.TestCase):
 
-- 
GitLab


From d725e36efc65ba614690b28c864a08b5e669c32e Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Tue, 23 Jan 2018 16:50:47 -0800
Subject: [PATCH 046/966] Adds a mechanism for whitelisting ops for use in
 map_fns using a WHITELIST_OP_FOR_DATASET_FUNCTIONS macro. Changes criteria
 for whitelisting stateful dataset ops. We used to check for the presence of
 the output_shapes attr. Now we just match the name and the output type.

PiperOrigin-RevId: 183014929
---
 tensorflow/core/BUILD                  |  1 +
 tensorflow/core/framework/dataset.h    | 75 ++++++++++++++++++++++++++
 tensorflow/core/kernels/data/dataset.h | 14 ++++-
 3 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/core/framework/dataset.h

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index d73f003a1d..fc49825748 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -432,6 +432,7 @@ tf_cuda_library(
         "framework/cancellation.h",
         "framework/common_shape_fns.h",
         "framework/control_flow.h",  # TODO(josh11b): Make internal?
+        "framework/dataset.h",
         "framework/device_base.h",
         "framework/function.h",
         "framework/graph_def_util.h",
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
new file mode 100644
index 0000000000..2c2c7e7c58
--- /dev/null
+++ b/tensorflow/core/framework/dataset.h
@@ -0,0 +1,75 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_FRAMEWORK_DATASET_H_
+#define TENSORFLOW_FRAMEWORK_DATASET_H_
+
+namespace tensorflow {
+namespace dataset {
+// Registry for stateful ops that need to be used in dataset functions.
+// See below macro for usage details.
+class WhitelistedStatefulOpRegistry {
+ public:
+  Status Add(StringPiece op_name) {
+    op_names_.insert(op_name);
+    return Status::OK();
+  }
+
+  bool Contains(StringPiece op_name) {
+    return op_names_.find(op_name) != op_names_.end();
+  }
+
+  static WhitelistedStatefulOpRegistry* Global() {
+    static WhitelistedStatefulOpRegistry* reg =
+        new WhitelistedStatefulOpRegistry;
+    return reg;
+  }
+
+ private:
+  WhitelistedStatefulOpRegistry() {}
+  WhitelistedStatefulOpRegistry(WhitelistedStatefulOpRegistry const& copy);
+  WhitelistedStatefulOpRegistry operator=(
+      WhitelistedStatefulOpRegistry const& copy);
+  std::set<StringPiece> op_names_;
+};
+
+}  // namespace dataset
+
+// Use this macro to whitelist an op that is marked stateful but needs to be
+// used inside a map_fn in an input pipeline. This is only needed if you wish
+// to be able to checkpoint the state of the input pipeline. We currently
+// do not allow stateful ops to be defined inside of map_fns since it is not
+// possible to save their state.
+// Note that the state of the whitelisted ops inside functions will not be
+// saved during checkpointing, hence this should only be used if the op is
+// marked stateful for reasons like to avoid constant folding during graph
+// optimiztion but is not stateful.
+// If possible, try to remove the stateful flag on the op first.
+// Example usage:
+//
+//   WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LegacyStatefulReader");
+//
+#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS(name) \
+  WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(__COUNTER__, name)
+#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(ctr, name) \
+  WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name)
+#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name)        \
+  static ::tensorflow::Status whitelist_op##ctr TF_ATTRIBUTE_UNUSED =      \
+      ::tensorflow::dataset::WhitelistedStatefulOpRegistry::Global()->Add( \
+          name)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_FRAMEWORK_DATASET_H_
diff --git a/tensorflow/core/kernels/data/dataset.h b/tensorflow/core/kernels/data/dataset.h
index 3cb3c08a32..fbaaf22527 100644
--- a/tensorflow/core/kernels/data/dataset.h
+++ b/tensorflow/core/kernels/data/dataset.h
@@ -19,11 +19,13 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -193,9 +195,17 @@ class GraphDefBuilderWrapper {
     return Status::OK();
   }
 
+  // Returns whether an op has been whitelisted for use inside map_fns.
+  // Uses a heuristic to whitelist source dataset ops which have been
+  // marked stateful due to b/65524810.
+  // Also looks up the `op_def->name` in the global
+  // `WhitelistedStatefulOpRegistry`.
   bool IsOpWhitelisted(const OpDef* op_def) const {
-    return StringPiece(op_def->name()).ends_with("Dataset") &&
-           HasAttr(op_def, "output_shapes");
+    return (StringPiece(op_def->name()).ends_with("Dataset") &&
+            op_def->output_arg_size() == 1 &&
+            op_def->output_arg(0).type() == DT_VARIANT) ||
+           dataset::WhitelistedStatefulOpRegistry::Global()->Contains(
+               op_def->name());
   }
 
   bool HasAttr(const string& op_type_name, const string& attr_name) const;
-- 
GitLab


From 8c30d9a635b9d6456239fe9ec5526a8bba998e34 Mon Sep 17 00:00:00 2001
From: Chris Leary <leary@google.com>
Date: Tue, 23 Jan 2018 17:11:30 -0800
Subject: [PATCH 047/966] [XLA:python] Pull static method off of
 ComputationBuilder class.

PiperOrigin-RevId: 183017700
---
 tensorflow/compiler/xla/python/xla_client.py | 23 ++++++++++----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 5455adafcd..9cfe1249f5 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -555,22 +555,12 @@ class ComputationBuilder(object):
       A ComputationDataHandle representing the added pad op.
     """
     if not isinstance(padding_config, xla_data_pb2.PaddingConfig):
-      padding_config = self._GetPaddingConfigFromTriples(padding_config)
+      padding_config = GetPaddingConfigFromTriples(padding_config)
     return _wrap_data_handle(
         self._client.Pad(_unwrap_data_handle(operand),
                          _unwrap_data_handle(padding_value),
                          padding_config))
 
-  def _GetPaddingConfigFromTriples(self, triples):
-    """Create PaddingConfig proto from list of triples of integers."""
-    padding_config = xla_data_pb2.PaddingConfig()
-    for lo, hi, interior in triples:
-      dimension = padding_config.dimensions.add()
-      dimension.edge_padding_low = lo
-      dimension.edge_padding_high = hi
-      dimension.interior_padding = interior
-    return padding_config
-
   def Reshape(self, operand, dimensions, new_sizes):
     """Reshape op."""
     return _wrap_data_handle(
@@ -997,3 +987,14 @@ def get_replica_count():
   yet or not.
   """
   return c_api.GetReplicaCount()
+
+
+def GetPaddingConfigFromTriples(triples):
+  """Create PaddingConfig proto from list of triples of integers."""
+  padding_config = xla_data_pb2.PaddingConfig()
+  for lo, hi, interior in triples:
+    dimension = padding_config.dimensions.add()
+    dimension.edge_padding_low = lo
+    dimension.edge_padding_high = hi
+    dimension.interior_padding = interior
+  return padding_config
-- 
GitLab


From 678560c3fc5be1a1a2b8632136cbce0390d2c240 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Tue, 23 Jan 2018 17:26:50 -0800
Subject: [PATCH 048/966] Mark TensorArrayV2 ops and deprecated io_ops
 deprecated.

PiperOrigin-RevId: 183019408
---
 tensorflow/core/ops/data_flow_ops.cc | 36 ++++++++++++++--------------
 tensorflow/core/ops/io_ops.cc        | 16 ++++++-------
 tensorflow/core/public/version.h     |  4 +++-
 3 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index cf949ed647..12c27c7984 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -787,7 +787,6 @@ REGISTER_OP("TensorArray")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArrayV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayV2")
     .Input("size: int32")
     .Attr("dtype: type")
@@ -802,7 +801,8 @@ REGISTER_OP("TensorArrayV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
       c->set_output(0, c->Vector(2));
       return Status::OK();
-    });
+    })
+    .Deprecated(26, "Use TensorArrayV3");
 REGISTER_OP("TensorArrayGrad")
     .Input("handle: string")
     .Input("flow_in: float")
@@ -811,7 +811,6 @@ REGISTER_OP("TensorArrayGrad")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArrayGradV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayGradV2")
     .Input("handle: string")
     .Input("flow_in: float")
@@ -825,7 +824,8 @@ REGISTER_OP("TensorArrayGradV2")
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
       c->set_output(0, c->Vector(2));
       return Status::OK();
-    });
+    })
+    .Deprecated(26, "Use TensorArrayGradV3");
 REGISTER_OP("TensorArrayWrite")
     .Input("handle: Ref(string)")
     .Input("index: int32")
@@ -835,7 +835,6 @@ REGISTER_OP("TensorArrayWrite")
     .Attr("T: type")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArrayWriteV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayWriteV2")
     .Input("handle: string")
     .Input("index: int32")
@@ -853,7 +852,8 @@ REGISTER_OP("TensorArrayWriteV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       return shape_inference::ScalarShape(c);
-    });
+    })
+    .Deprecated(26, "Use TensorArrayWriteV3");
 REGISTER_OP("TensorArrayRead")
     .Input("handle: Ref(string)")
     .Input("index: int32")
@@ -862,7 +862,6 @@ REGISTER_OP("TensorArrayRead")
     .Attr("dtype: type")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArrayReadV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayReadV2")
     .Input("handle: string")
     .Input("index: int32")
@@ -878,7 +877,8 @@ REGISTER_OP("TensorArrayReadV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
       return shape_inference::UnknownShape(c);
-    });
+    })
+    .Deprecated(26, "Use TensorArrayReadV3");
 REGISTER_OP("TensorArrayPack")
     .Input("handle: Ref(string)")
     .Input("flow_in: float")
@@ -904,7 +904,6 @@ REGISTER_OP("TensorArrayGather")
     .Attr("element_shape: shape = { unknown_rank: true }")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArrayGatherV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayGatherV2")
     .Input("handle: string")
     .Input("indices: int32")
@@ -920,7 +919,8 @@ REGISTER_OP("TensorArrayGatherV2")
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
       return shape_inference::UnknownShape(c);
-    });
+    })
+    .Deprecated(26, "Use TensorArrayGatherV3");
 REGISTER_OP("TensorArrayScatter")
     .Input("handle: Ref(string)")
     .Input("indices: int32")
@@ -930,7 +930,6 @@ REGISTER_OP("TensorArrayScatter")
     .Attr("T: type")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(19, "Use TensorArrayGradV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayScatterV2")
     .Input("handle: string")
     .Input("indices: int32")
@@ -946,7 +945,8 @@ REGISTER_OP("TensorArrayScatterV2")
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       return shape_inference::ScalarShape(c);
-    });
+    })
+    .Deprecated(26, "Use TensorArrayScatterV3");
 REGISTER_OP("TensorArrayConcat")
     .Input("handle: Ref(string)")
     .Input("flow_in: float")
@@ -983,7 +983,6 @@ REGISTER_OP("TensorArraySplit")
     .Attr("T: type")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArraySplitV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArraySplitV2")
     .Input("handle: string")
     .Input("value: T")
@@ -1000,14 +999,14 @@ REGISTER_OP("TensorArraySplitV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       return shape_inference::ScalarShape(c);
-    });
+    })
+    .Deprecated(26, "Use TensorArraySplitV3");
 REGISTER_OP("TensorArraySize")
     .Input("handle: Ref(string)")
     .Input("flow_in: float")
     .Output("size: int32")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArraySizeV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArraySizeV2")
     .Input("handle: string")
     .Input("flow_in: float")
@@ -1018,12 +1017,12 @@ REGISTER_OP("TensorArraySizeV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
       return shape_inference::ScalarShape(c);
-    });
+    })
+    .Deprecated(26, "Use TensorArraySizeV3");
 REGISTER_OP("TensorArrayClose")
     .Input("handle: Ref(string)")
     .SetShapeFn([](InferenceContext* c) { return Status::OK(); })
     .Deprecated(16, "Use TensorArrayCloseV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayCloseV2")
     .Input("handle: string")
     .SetShapeFn([](InferenceContext* c) {
@@ -1032,7 +1031,8 @@ REGISTER_OP("TensorArrayCloseV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
       return Status::OK();
-    });
+    })
+    .Deprecated(26, "Use TensorArrayCloseV3");
 
 // --------------------------------------------------------------------------
 
diff --git a/tensorflow/core/ops/io_ops.cc b/tensorflow/core/ops/io_ops.cc
index 21f0d02ff2..7db4d0c4b6 100644
--- a/tensorflow/core/ops/io_ops.cc
+++ b/tensorflow/core/ops/io_ops.cc
@@ -272,14 +272,14 @@ REGISTER_OP("WholeFileReaderV2")
     .SetIsStateful()
     .SetShapeFn(shape_inference::ScalarShape);
 
-// TODO(cwhipkey): mark this deprecated in favor of V2.
 REGISTER_OP("TextLineReader")
     .Output("reader_handle: Ref(string)")
     .Attr("skip_header_lines: int = 0")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput);
+    .SetShapeFn(TwoElementOutput)
+    .Deprecated(26, "Use TextLineReaderV2");
 
 REGISTER_OP("TextLineReaderV2")
     .Output("reader_handle: resource")
@@ -289,7 +289,6 @@ REGISTER_OP("TextLineReaderV2")
     .SetIsStateful()
     .SetShapeFn(shape_inference::ScalarShape);
 
-// TODO(cwhipkey): mark this deprecated in favor of V2.
 REGISTER_OP("FixedLengthRecordReader")
     .Output("reader_handle: Ref(string)")
     .Attr("header_bytes: int = 0")
@@ -299,7 +298,8 @@ REGISTER_OP("FixedLengthRecordReader")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput);
+    .SetShapeFn(TwoElementOutput)
+    .Deprecated(26, "Use FixedLengthRecordReaderV2");
 
 REGISTER_OP("FixedLengthRecordReaderV2")
     .Output("reader_handle: resource")
@@ -313,14 +313,14 @@ REGISTER_OP("FixedLengthRecordReaderV2")
     .SetIsStateful()
     .SetShapeFn(shape_inference::ScalarShape);
 
-// TODO(cwhipkey): mark this deprecated in favor of V2.
 REGISTER_OP("TFRecordReader")
     .Output("reader_handle: Ref(string)")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .Attr("compression_type: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput);
+    .SetShapeFn(TwoElementOutput)
+    .Deprecated(26, "Use TFRecordReaderV2");
 
 REGISTER_OP("TFRecordReaderV2")
     .Output("reader_handle: resource")
@@ -337,13 +337,13 @@ REGISTER_OP("LMDBReader")
     .SetIsStateful()
     .SetShapeFn(TwoElementOutput);
 
-// TODO(cwhipkey): mark this deprecated in favor of V2.
 REGISTER_OP("IdentityReader")
     .Output("reader_handle: Ref(string)")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput);
+    .SetShapeFn(TwoElementOutput)
+    .Deprecated(26, "Use IdentityReaderV2");
 
 REGISTER_OP("IdentityReaderV2")
     .Output("reader_handle: resource")
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 3baab75e27..836efc079f 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -94,10 +94,12 @@ limitations under the License.
 // 26. Add a bool 'stripped_default_attrs' to MetaInfoDef indicating
 //     whether default-valued attrs have been stripped from the nodes in the
 //     GraphDef. (7dec2017)
+// 27. Deprecate TensorArray ops v2 in favor of v3 and deprecated io_ops
+//     deprecated in favor of V2 ops. (2018/01/23)
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 25
+#define TF_GRAPH_DEF_VERSION 26
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
-- 
GitLab


From a3e81ec2892126056ad6c1feb9161bc16c2c2975 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Tue, 23 Jan 2018 17:32:39 -0800
Subject: [PATCH 049/966] [TPU] Add extra type checking in Python around the
 infeed operation, since it is a common mistake to make.

PiperOrigin-RevId: 183020077
---
 tensorflow/contrib/tpu/python/ops/tpu_ops.py | 58 ++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index 33e47f674d..a49a3dcf29 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import platform
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 
 if platform.system() != "Windows":
@@ -40,6 +41,63 @@ if platform.system() != "Windows":
     del op  # Unused
     # The gradient of a cross replica sum is also a cross-replica sum.
     return gen_tpu_ops.cross_replica_sum(grad)
+
+  # This extra type checking exists to give a more helpful error message in
+  # the common case that uint8 and int64 values are infed. Remove when both
+  # types are supported.
+
+  _SUPPORTED_INFEED_DTYPES = set([
+      dtypes.int32, dtypes.bfloat16, dtypes.float32
+  ])
+
+  def infeed_dequeue(dtype, shape, name=None):
+    """A placeholder op for a value that will be fed into the computation.
+
+    Args:
+      dtype: A `tf.DType`. The type of elements in the tensor.
+      shape: A `tf.TensorShape` or list of `ints`. The shape of the tensor.
+      name: A name for the operation (optional).
+
+    Returns:
+      A `Tensor` of type `dtype`.
+      A tensor that will be provided using the infeed mechanism.
+
+    Raises:
+      TypeError: If 'dtype` is not a supported infeed type.
+    """
+    if dtype not in _SUPPORTED_INFEED_DTYPES:
+      raise TypeError(
+          "{} is not a supported TPU infeed type. Supported types are: "
+          "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
+
+    return gen_tpu_ops.infeed_dequeue(dtype, shape, name=name)
+
+  # pylint: disable=redefined-outer-name
+  def infeed_dequeue_tuple(dtypes, shapes, name=None):
+    """A placeholder op for values fed into the TPU simultaneously as a tuple.
+
+    Args:
+      dtypes: A list of `tf.DType`s that has length `>= 1`.
+        The element types of each element in `outputs`.
+      shapes: A list of shapes (each a `tf.TensorShape` or list of `ints`).
+        The shapes of each tensor in `outputs`.
+      name: A name for the operation (optional).
+
+    Returns:
+      A list of `Tensor` objects of type `dtypes`.
+      A list of tensors that will be provided using the infeed mechanism.
+
+    Raises:
+      TypeError: If a type in 'dtypes` is not a supported infeed type.
+    """
+    for dtype in dtypes:
+      if dtype not in _SUPPORTED_INFEED_DTYPES:
+        raise TypeError(
+            "{} is not a supported TPU infeed type. Supported types are: "
+            "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
+    return gen_tpu_ops.infeed_dequeue_tuple(dtypes, shapes, name=name)
+  # pylint: enable=redefined-outer-name
+
 else:
   # We have already built the appropriate libraries into the binary via CMake
   # if we have built contrib, so we don't need this
-- 
GitLab


From 6dfe00ca4114371ed47c93810219736e3deda2d2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 17:35:53 -0800
Subject: [PATCH 050/966] Support StridedSlice in TFLite for 1D-4D tensors.

PiperOrigin-RevId: 183020501
---
 tensorflow/contrib/lite/builtin_op_data.h     |  20 +-
 tensorflow/contrib/lite/kernels/BUILD         |  13 +
 .../internal/reference/reference_ops.h        |  51 ++-
 tensorflow/contrib/lite/kernels/register.cc   |   2 +
 .../contrib/lite/kernels/strided_slice.cc     | 256 ++++++++++++
 .../lite/kernels/strided_slice_test.cc        | 375 ++++++++++++++++++
 tensorflow/contrib/lite/model.cc              |  12 +
 tensorflow/contrib/lite/nnapi_delegate.cc     |   1 +
 tensorflow/contrib/lite/schema/schema.fbs     |  10 +
 .../contrib/lite/schema/schema_generated.h    | 239 ++++++++++-
 tensorflow/contrib/lite/testing/BUILD         |   1 +
 .../contrib/lite/testing/generate_examples.py |  92 +++++
 .../testing/generated_examples_zip_test.cc    |   1 +
 .../resolve_strided_slice_attributes.cc       |   4 +
 .../contrib/lite/toco/import_tensorflow.cc    |   2 +
 tensorflow/contrib/lite/toco/model.h          |   3 +
 .../contrib/lite/toco/tflite/operator.cc      |  26 ++
 .../contrib/lite/toco/tflite/operator_test.cc |  22 +
 18 files changed, 1107 insertions(+), 23 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/strided_slice.cc
 create mode 100644 tensorflow/contrib/lite/kernels/strided_slice_test.cc
 mode change 100755 => 100644 tensorflow/contrib/lite/schema/schema_generated.h

diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 3b43a1fd5d..ab07c58c92 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -88,7 +88,9 @@ typedef struct {
   TfLiteFusedActivation activation;
 } TfLiteSequenceRNNParams;
 
-typedef struct { TfLiteFusedActivation activation; } TfLiteFullyConnectedParams;
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteFullyConnectedParams;
 
 typedef enum {
   kTfLiteLshProjectionUnknown = 0,
@@ -96,9 +98,13 @@ typedef enum {
   kTfLiteLshProjectionDense = 2,
 } TfLiteLSHProjectionType;
 
-typedef struct { TfLiteLSHProjectionType type; } TfLiteLSHProjectionParams;
+typedef struct {
+  TfLiteLSHProjectionType type;
+} TfLiteLSHProjectionParams;
 
-typedef struct { float beta; } TfLiteSoftmaxParams;
+typedef struct {
+  float beta;
+} TfLiteSoftmaxParams;
 
 typedef struct {
   int axis;
@@ -226,6 +232,14 @@ typedef struct {
   int num_squeeze_dims;
 } TfLiteSqueezeParams;
 
+typedef struct {
+  int begin_mask;
+  int end_mask;
+  int ellipsis_mask;
+  int new_axis_mask;
+  int shrink_axis_mask;
+} TfLiteStridedSliceParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 2d51b8727f..4195e7553c 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -103,6 +103,7 @@ cc_library(
         "space_to_batch_nd.cc",
         "space_to_depth.cc",
         "squeeze.cc",
+        "strided_slice.cc",
         "sub.cc",
         "svdf.cc",
         "transpose.cc",
@@ -518,6 +519,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "strided_slice_test",
+    size = "small",
+    srcs = ["strided_slice_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 1d86183d94..03708eb32b 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -2330,6 +2330,18 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+inline bool LoopCondition(int index, int stop, int stride) {
+  return stride > 0 ? index < stop : index > stop;
+}
+
+inline int StartIndex(int start, int stride, int dim, bool masked) {
+  return masked ? (stride > 0 ? 0 : dim - 1) : start;
+}
+
+inline int StopIndex(int stop, int stride, int dim, bool masked) {
+  return masked ? (stride > 0 ? dim : -1) : stop;
+}
+
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                          int begin_mask, int end_mask,
@@ -2337,20 +2349,35 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                          const std::vector<int>& stops,
                          const std::vector<int>& strides, T* output_data,
                          const Dims<4>& output_dims) {
-  const int start_b = (begin_mask & 8) ? 0 : starts[3];
-  const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
-  const int start_h = (begin_mask & 4) ? 0 : starts[2];
-  const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
-  const int start_w = (begin_mask & 2) ? 0 : starts[1];
-  const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
-  const int start_d = (begin_mask & 1) ? 0 : starts[0];
-  const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];
+  TFLITE_DCHECK_EQ(starts.size(), 4);
+  TFLITE_DCHECK_EQ(stops.size(), 4);
+  TFLITE_DCHECK_EQ(strides.size(), 4);
+  const int start_b =
+      StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8);
+  const int stop_b =
+      StopIndex(stops[3], strides[3], input_dims.sizes[3], end_mask & 8);
+  const int start_h =
+      StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4);
+  const int stop_h =
+      StopIndex(stops[2], strides[2], input_dims.sizes[2], end_mask & 4);
+  const int start_w =
+      StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2);
+  const int stop_w =
+      StopIndex(stops[1], strides[1], input_dims.sizes[1], end_mask & 2);
+  const int start_d =
+      StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1);
+  const int stop_d =
+      StopIndex(stops[0], strides[0], input_dims.sizes[0], end_mask & 1);
 
   T* out_ptr = output_data;
-  for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
-    for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
-      for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
-        for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
+  for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]);
+       in_b += strides[3]) {
+    for (int in_h = start_h; LoopCondition(in_h, stop_h, strides[2]);
+         in_h += strides[2]) {
+      for (int in_w = start_w; LoopCondition(in_w, stop_w, strides[1]);
+           in_w += strides[1]) {
+        for (int in_d = start_d; LoopCondition(in_d, stop_d, strides[0]);
+             in_d += strides[0]) {
           *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
         }
       }
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index c9e74cb8d5..f605deaa5b 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -58,6 +58,7 @@ TfLiteRegistration* Register_GATHER();
 TfLiteRegistration* Register_TRANSPOSE();
 TfLiteRegistration* Register_MEAN();
 TfLiteRegistration* Register_SQUEEZE();
+TfLiteRegistration* Register_STRIDED_SLICE();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -103,6 +104,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DIV, Register_DIV());
   AddBuiltin(BuiltinOperator_SUB, Register_SUB());
   AddBuiltin(BuiltinOperator_SQUEEZE, Register_SQUEEZE());
+  AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE());
 }
 
 TfLiteRegistration* BuiltinOpResolver::FindOp(
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
new file mode 100644
index 0000000000..91ba4a9b78
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -0,0 +1,256 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <cmath>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace strided_slice {
+
+enum KernelType {
+  kReference,
+  // TODO(soroosh): add kGenericOptimized
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kBeginTensor = 1;
+constexpr int kEndTensor = 2;
+constexpr int kStridesTensor = 3;
+constexpr int kOutputTensor = 0;
+
+struct StridedSliceContext {
+  StridedSliceContext(TfLiteContext* context, TfLiteNode* node) {
+    params = reinterpret_cast<TfLiteStridedSliceParams*>(node->builtin_data);
+    input = GetInput(context, node, kInputTensor);
+    begin = GetInput(context, node, kBeginTensor);
+    end = GetInput(context, node, kEndTensor);
+    strides = GetInput(context, node, kStridesTensor);
+    output = GetOutput(context, node, kOutputTensor);
+    dims = NumDimensions(input);
+  }
+  TfLiteStridedSliceParams* params;
+  TfLiteTensor* input;
+  TfLiteTensor* begin;
+  TfLiteTensor* end;
+  TfLiteTensor* strides;
+  TfLiteTensor* output;
+  int dims;
+};
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 4);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  StridedSliceContext op_context(context, node);
+
+  // Ensure validity of input tensor and its dimension
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.begin), 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.end), 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.strides), 1);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+  // Only INT32 begin/end/strides are supported
+  // TODO(soroosh) add support for INT64
+  TF_LITE_ENSURE_EQ(context, op_context.begin->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, op_context.end->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, op_context.strides->type, kTfLiteInt32);
+  TF_LITE_ENSURE_MSG(context, op_context.dims <= 4,
+                     "StridedSlice op only supports 1D-4D input arrays.");
+
+  // TODO(soroosh): add the following missing functionalities
+  TF_LITE_ENSURE_MSG(context, op_context.params->ellipsis_mask == 0,
+                     "ellipsis_mask is not implemented yet.");
+  TF_LITE_ENSURE_MSG(context, op_context.params->new_axis_mask == 0,
+                     "new_axis_mask is not implemented yet.");
+  TF_LITE_ENSURE_MSG(context, op_context.params->shrink_axis_mask == 0,
+                     "shrink_axis_mask is not implemented yet.");
+
+  // TODO(soroosh): optimize for constant tensors to do allocation in Prepare
+  op_context.output->allocation_type = kTfLiteDynamic;
+  return kTfLiteOk;
+}  // namespace strided_slice
+
+// TODO(soroosh): consolidate with BytesRequired in interpreter.h
+TfLiteStatus BytesRequired(TfLiteContext* context, TfLiteType type,
+                           const int* dims, int dims_size, size_t* bytes) {
+  // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
+  // MultiplyWithoutOverflow.
+  TF_LITE_ENSURE(context, bytes != nullptr);
+  size_t count = 1;
+  for (int k = 0; k < dims_size; k++) count *= dims[k];
+  switch (type) {
+    case kTfLiteFloat32:
+      *bytes = sizeof(float) * count;
+      break;
+    case kTfLiteInt32:
+      *bytes = sizeof(int32_t) * count;
+      break;
+    case kTfLiteUInt8:
+      *bytes = sizeof(uint8_t) * count;
+      break;
+    case kTfLiteInt64:
+      *bytes = sizeof(int64_t) * count;
+      break;
+    default:
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+// Reverse order of bits in the mask to match the expected order in kernel
+inline int ReverseMaskBits(int mask, int num_dimensions) {
+  int out = 0;
+  for (int dim = 0; dim < num_dimensions; dim++) {
+    out <<= 1;
+    out += (mask & 1);
+    mask >>= 1;
+  }
+  return out;
+}
+
+// This Op only supports 1-4D cases and since we use the reference 4D
+// implementation, the 1-3D tensors are mapped to 4D.
+const int kMaxDim = 4;
+
+inline int32_t PositiveRemainder(int32_t dividend, int32_t divisor) {
+  return (divisor + (dividend % divisor)) % divisor;
+}
+
+inline int32_t ClampedIndex(int32_t index, int dim, bool pos_stride) {
+  return pos_stride
+             ? (index >= dim ? dim
+                             : PositiveRemainder(
+                                   std::min(std::max(index, -dim), dim), dim))
+             : (index < -dim
+                    ? -1
+                    : PositiveRemainder(
+                          std::min(std::max(index, -dim), dim - 1), dim));
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  StridedSliceContext op_context(context, node);
+
+  std::vector<int> starts;
+  std::vector<int> stops;
+  std::vector<int> strides;
+
+  // Determine size of output tensor and map indices
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(op_context.dims);
+  for (int idx = op_context.dims - 1; idx >= 0; --idx) {
+    int dim = op_context.input->dims->data[idx];
+    int32_t stride = GetTensorData<int32_t>(op_context.strides)[idx];
+    TF_LITE_ENSURE_MSG(context, stride != 0, "stride value has to be non-zero");
+    bool pos_stride = stride > 0;
+
+    int32_t begin =
+        op_context.params->begin_mask & (1 << idx)
+            ? pos_stride ? 0 : dim - 1
+            : ClampedIndex(GetTensorData<int32_t>(op_context.begin)[idx], dim,
+                           pos_stride);
+    int32_t end =
+        op_context.params->end_mask & (1 << idx)
+            ? pos_stride ? dim : -1
+            : ClampedIndex(GetTensorData<int32_t>(op_context.end)[idx], dim,
+                           pos_stride);
+
+    // This is valid for both positive and negative strides
+    output_shape->data[idx] = ceil((end - begin) / static_cast<float>(stride));
+    output_shape->data[idx] =
+        output_shape->data[idx] < 0 ? 0 : output_shape->data[idx];
+    starts.emplace_back(begin);
+    stops.emplace_back(end);
+    strides.emplace_back(stride);
+  }
+
+  for (int i = op_context.dims; i < kMaxDim; i++) {
+    starts.emplace_back(0);
+    stops.emplace_back(1);
+    strides.emplace_back(1);
+  }
+
+  TF_LITE_ENSURE_STATUS(
+      context->ResizeTensor(context, op_context.output, output_shape));
+
+  size_t required_bytes;
+  TF_LITE_ENSURE_OK(
+      context,
+      BytesRequired(context, op_context.output->type, output_shape->data,
+                    output_shape->size, &required_bytes));
+  TfLiteTensorRealloc(required_bytes, op_context.output);
+
+  op_context.params->begin_mask =
+      ReverseMaskBits(op_context.params->begin_mask, op_context.dims);
+  op_context.params->end_mask =
+      ReverseMaskBits(op_context.params->end_mask, op_context.dims);
+
+#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                 \
+  kernel_type::StridedSlice(                                          \
+      GetTensorData<data_type>(op_context.input),                     \
+      GetTensorDims(op_context.input), op_context.params->begin_mask, \
+      op_context.params->end_mask, starts, stops, strides,            \
+      GetTensorData<data_type>(op_context.output),                    \
+      GetTensorDims(op_context.output))
+
+  switch (op_context.input->type) {
+    case kTfLiteFloat32:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, float);
+      }
+      break;
+    case kTfLiteInt32:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, int32_t);
+      }
+      break;
+    case kTfLiteInt64:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, int64_t);
+      }
+      break;
+    default:
+      context->ReportError(context,
+                           "Type is currently not supported "
+                           "by StridedSlice.");
+      return kTfLiteError;
+  }
+#undef TF_LITE_STRIDED_SLICE
+  return kTfLiteOk;
+}
+
+}  // namespace strided_slice
+
+TfLiteRegistration* Register_STRIDED_SLICE_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, strided_slice::Prepare,
+      strided_slice::Eval<strided_slice::kReference>};
+  return &r;
+}
+
+// TODO(soroosh): add optimized
+TfLiteRegistration* Register_STRIDED_SLICE() {
+  return Register_STRIDED_SLICE_REF();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
new file mode 100644
index 0000000000..cd4a364682
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -0,0 +1,375 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class StridedSliceOpModel : public SingleOpModel {
+ public:
+  StridedSliceOpModel(std::initializer_list<int> input_shape,
+                      std::initializer_list<int> begin_shape,
+                      std::initializer_list<int> end_shape,
+                      std::initializer_list<int> strides_shape, int begin_mask,
+                      int end_mask, int ellipsis_mask, int new_axis_mask,
+                      int shrink_axis_mask) {
+    input_ = AddInput(TensorType_FLOAT32);
+    begin_ = AddInput(TensorType_INT32);
+    end_ = AddInput(TensorType_INT32);
+    strides_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(
+        BuiltinOperator_STRIDED_SLICE, BuiltinOptions_StridedSliceOptions,
+        CreateStridedSliceOptions(builder_, begin_mask, end_mask, ellipsis_mask,
+                                  new_axis_mask, shrink_axis_mask)
+            .Union());
+    BuildInterpreter({input_shape, begin_shape, end_shape, strides_shape});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+  void SetBegin(std::initializer_list<int32> data) {
+    PopulateTensor<int32>(begin_, data);
+  }
+  void SetEnd(std::initializer_list<int32> data) {
+    PopulateTensor<int32>(end_, data);
+  }
+  void SetStrides(std::initializer_list<int32> data) {
+    PopulateTensor<int32>(strides_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int begin_;
+  int end_;
+  int strides_;
+  int output_;
+};
+
+TEST(StridedSliceOpTest, UnsupportedInputSize) {
+  EXPECT_DEATH(
+      StridedSliceOpModel({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
+      "StridedSlice op only supports 1D-4D input arrays.");
+}
+
+TEST(StridedSliceOpTest, UnssupportedArgs) {
+  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 1, 0, 0),
+               "ellipsis_mask is not implemented yet.");
+  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
+               "new_axis_mask is not implemented yet.");
+  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 0, 1),
+               "shrink_axis_mask is not implemented yet.");
+}
+
+TEST(StridedSliceOpTest, In1D) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3}));
+}
+
+TEST(StridedSliceOpTest, In1D_EmptyOutput) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({10});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({0}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegativeBegin) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-3});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3}));
+}
+
+TEST(StridedSliceOpTest, In1D_OutOfRangeBegin) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-5});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegativeEnd) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({-2});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
+}
+
+TEST(StridedSliceOpTest, In1D_OutOfRangeEnd) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-3});
+  m.SetEnd({5});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 4}));
+}
+
+TEST(StridedSliceOpTest, In1D_BeginMask) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStride) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-2});
+  m.SetEnd({-3});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+}
+
+TEST(StridedSliceOpTest, In1D_OutOfRangeBeginNegativeStride) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({5});
+  m.SetEnd({2});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegativeEndNegativeStride) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({2});
+  m.SetEnd({-4});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 2}));
+}
+
+TEST(StridedSliceOpTest, In1D_OutOfRangeEndNegativeStride) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-3});
+  m.SetEnd({-5});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 1}));
+}
+
+TEST(StridedSliceOpTest, In1D_EndMask) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 1, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 4}));
+}
+TEST(StridedSliceOpTest, In1D_NegStride) {
+  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3});
+  m.SetBegin({-1});
+  m.SetEnd({-4});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 2, 1}));
+}
+
+TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
+  StridedSliceOpModel m({2}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2});
+  m.SetBegin({0});
+  m.SetEnd({2});
+  m.SetStrides({2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
+TEST(StridedSliceOpTest, In1D_OddLenStride2) {
+  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3});
+  m.SetBegin({0});
+  m.SetEnd({3});
+  m.SetStrides({2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3}));
+}
+
+TEST(StridedSliceOpTest, In2D_Identity) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+TEST(StridedSliceOpTest, In2D) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, 0});
+  m.SetEnd({2, 2});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5}));
+}
+
+TEST(StridedSliceOpTest, In2D_Stride2) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({2, 2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3}));
+}
+
+TEST(StridedSliceOpTest, In2D_NegStride) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, -1});
+  m.SetEnd({2, -4});
+  m.SetStrides({2, -1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 5, 4}));
+}
+
+TEST(StridedSliceOpTest, In2D_BeginMask) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, 0});
+  m.SetEnd({2, 2});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 4, 5}));
+}
+
+TEST(StridedSliceOpTest, In2D_EndMask) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, 0});
+  m.SetEnd({2, 2});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5, 6}));
+}
+
+TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 2, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, -2});
+  m.SetEnd({2, -4});
+  m.SetStrides({1, -1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 5, 4}));
+}
+TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, -2});
+  m.SetEnd({2, -3});
+  m.SetStrides({1, -1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 4}));
+}
+
+TEST(StridedSliceOpTest, In3D_Identity) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3, 2}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+}
+
+TEST(StridedSliceOpTest, In3D_NegStride) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({-1, -1, -1});
+  m.SetEnd({-3, -4, -3});
+  m.SetStrides({-1, -1, -1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3, 2}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}));
+}
+TEST(StridedSliceOpTest, In3D_Strided2) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({2, 2, 2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 4d8a6d10c8..a01b74f9da 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -618,6 +618,18 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_STRIDED_SLICE: {
+      auto* params = MallocPOD<TfLiteStridedSliceParams>();
+      if (auto* schema_params = op->builtin_options_as_StridedSliceOptions()) {
+        params->begin_mask = schema_params->begin_mask();
+        params->end_mask = schema_params->end_mask();
+        params->ellipsis_mask = schema_params->ellipsis_mask();
+        params->new_axis_mask = schema_params->new_axis_mask();
+        params->shrink_axis_mask = schema_params->shrink_axis_mask();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
   }
   return builtin_data;
 }
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 998a7b7614..d5b9319407 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -339,6 +339,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_DIV:
       case tflite::BuiltinOperator_SUB:
       case tflite::BuiltinOperator_SQUEEZE:
+      case tflite::BuiltinOperator_STRIDED_SLICE:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 6fcd3e51a4..8ddad4d251 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -118,6 +118,7 @@ enum BuiltinOperator : byte {
   DIV = 42,
   SQUEEZE = 43,
   UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
+  STRIDED_SLICE = 45,
 }
 
 // Options for the builtin operators.
@@ -153,6 +154,7 @@ union BuiltinOptions {
   DivOptions,
   SqueezeOptions,
   SequenceRNNOptions,
+  StridedSliceOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -340,6 +342,14 @@ table SqueezeOptions {
   squeeze_dims:[int];
 }
 
+table StridedSliceOptions {
+  begin_mask: int;
+  end_mask: int;
+  ellipsis_mask: int;
+  new_axis_mask: int;
+  shrink_axis_mask: int;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
old mode 100755
new mode 100644
index 6eb9ae2926..b756891f66
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -120,6 +120,9 @@ struct MeanOptionsT;
 struct SqueezeOptions;
 struct SqueezeOptionsT;
 
+struct StridedSliceOptions;
+struct StridedSliceOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -207,11 +210,12 @@ enum BuiltinOperator {
   BuiltinOperator_DIV = 42,
   BuiltinOperator_SQUEEZE = 43,
   BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
+  BuiltinOperator_STRIDED_SLICE = 45,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM
+  BuiltinOperator_MAX = BuiltinOperator_STRIDED_SLICE
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[42] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[43] {
   static BuiltinOperator values[] = {
       BuiltinOperator_ADD,
       BuiltinOperator_AVERAGE_POOL_2D,
@@ -254,7 +258,8 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[42] {
       BuiltinOperator_SUB,
       BuiltinOperator_DIV,
       BuiltinOperator_SQUEEZE,
-      BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM};
+      BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
+      BuiltinOperator_STRIDED_SLICE};
   return values;
 }
 
@@ -304,6 +309,7 @@ inline const char **EnumNamesBuiltinOperator() {
                                 "DIV",
                                 "SQUEEZE",
                                 "UNIDIRECTIONAL_SEQUENCE_LSTM",
+                                "STRIDED_SLICE",
                                 nullptr};
   return names;
 }
@@ -346,11 +352,12 @@ enum BuiltinOptions {
   BuiltinOptions_DivOptions = 29,
   BuiltinOptions_SqueezeOptions = 30,
   BuiltinOptions_SequenceRNNOptions = 31,
+  BuiltinOptions_StridedSliceOptions = 32,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_SequenceRNNOptions
+  BuiltinOptions_MAX = BuiltinOptions_StridedSliceOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[32] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[33] {
   static BuiltinOptions values[] = {
       BuiltinOptions_NONE,
       BuiltinOptions_Conv2DOptions,
@@ -383,7 +390,8 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[32] {
       BuiltinOptions_SubOptions,
       BuiltinOptions_DivOptions,
       BuiltinOptions_SqueezeOptions,
-      BuiltinOptions_SequenceRNNOptions};
+      BuiltinOptions_SequenceRNNOptions,
+      BuiltinOptions_StridedSliceOptions};
   return values;
 }
 
@@ -420,6 +428,7 @@ inline const char **EnumNamesBuiltinOptions() {
                                 "DivOptions",
                                 "SqueezeOptions",
                                 "SequenceRNNOptions",
+                                "StridedSliceOptions",
                                 nullptr};
   return names;
 }
@@ -593,6 +602,11 @@ struct BuiltinOptionsTraits<SequenceRNNOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SequenceRNNOptions;
 };
 
+template <>
+struct BuiltinOptionsTraits<StridedSliceOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_StridedSliceOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -950,6 +964,16 @@ struct BuiltinOptionsUnion {
                ? reinterpret_cast<const SequenceRNNOptionsT *>(value)
                : nullptr;
   }
+  StridedSliceOptionsT *AsStridedSliceOptions() {
+    return type == BuiltinOptions_StridedSliceOptions
+               ? reinterpret_cast<StridedSliceOptionsT *>(value)
+               : nullptr;
+  }
+  const StridedSliceOptionsT *AsStridedSliceOptions() const {
+    return type == BuiltinOptions_StridedSliceOptions
+               ? reinterpret_cast<const StridedSliceOptionsT *>(value)
+               : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj,
@@ -3532,6 +3556,111 @@ flatbuffers::Offset<SqueezeOptions> CreateSqueezeOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const SqueezeOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct StridedSliceOptionsT : public flatbuffers::NativeTable {
+  typedef StridedSliceOptions TableType;
+  int32_t begin_mask;
+  int32_t end_mask;
+  int32_t ellipsis_mask;
+  int32_t new_axis_mask;
+  int32_t shrink_axis_mask;
+  StridedSliceOptionsT()
+      : begin_mask(0),
+        end_mask(0),
+        ellipsis_mask(0),
+        new_axis_mask(0),
+        shrink_axis_mask(0) {}
+};
+
+struct StridedSliceOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef StridedSliceOptionsT NativeTableType;
+  enum {
+    VT_BEGIN_MASK = 4,
+    VT_END_MASK = 6,
+    VT_ELLIPSIS_MASK = 8,
+    VT_NEW_AXIS_MASK = 10,
+    VT_SHRINK_AXIS_MASK = 12
+  };
+  int32_t begin_mask() const { return GetField<int32_t>(VT_BEGIN_MASK, 0); }
+  int32_t end_mask() const { return GetField<int32_t>(VT_END_MASK, 0); }
+  int32_t ellipsis_mask() const {
+    return GetField<int32_t>(VT_ELLIPSIS_MASK, 0);
+  }
+  int32_t new_axis_mask() const {
+    return GetField<int32_t>(VT_NEW_AXIS_MASK, 0);
+  }
+  int32_t shrink_axis_mask() const {
+    return GetField<int32_t>(VT_SHRINK_AXIS_MASK, 0);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_BEGIN_MASK) &&
+           VerifyField<int32_t>(verifier, VT_END_MASK) &&
+           VerifyField<int32_t>(verifier, VT_ELLIPSIS_MASK) &&
+           VerifyField<int32_t>(verifier, VT_NEW_AXIS_MASK) &&
+           VerifyField<int32_t>(verifier, VT_SHRINK_AXIS_MASK) &&
+           verifier.EndTable();
+  }
+  StridedSliceOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      StridedSliceOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<StridedSliceOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct StridedSliceOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_begin_mask(int32_t begin_mask) {
+    fbb_.AddElement<int32_t>(StridedSliceOptions::VT_BEGIN_MASK, begin_mask, 0);
+  }
+  void add_end_mask(int32_t end_mask) {
+    fbb_.AddElement<int32_t>(StridedSliceOptions::VT_END_MASK, end_mask, 0);
+  }
+  void add_ellipsis_mask(int32_t ellipsis_mask) {
+    fbb_.AddElement<int32_t>(StridedSliceOptions::VT_ELLIPSIS_MASK,
+                             ellipsis_mask, 0);
+  }
+  void add_new_axis_mask(int32_t new_axis_mask) {
+    fbb_.AddElement<int32_t>(StridedSliceOptions::VT_NEW_AXIS_MASK,
+                             new_axis_mask, 0);
+  }
+  void add_shrink_axis_mask(int32_t shrink_axis_mask) {
+    fbb_.AddElement<int32_t>(StridedSliceOptions::VT_SHRINK_AXIS_MASK,
+                             shrink_axis_mask, 0);
+  }
+  explicit StridedSliceOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  StridedSliceOptionsBuilder &operator=(const StridedSliceOptionsBuilder &);
+  flatbuffers::Offset<StridedSliceOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<StridedSliceOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<StridedSliceOptions> CreateStridedSliceOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t begin_mask = 0,
+    int32_t end_mask = 0, int32_t ellipsis_mask = 0, int32_t new_axis_mask = 0,
+    int32_t shrink_axis_mask = 0) {
+  StridedSliceOptionsBuilder builder_(_fbb);
+  builder_.add_shrink_axis_mask(shrink_axis_mask);
+  builder_.add_new_axis_mask(new_axis_mask);
+  builder_.add_ellipsis_mask(ellipsis_mask);
+  builder_.add_end_mask(end_mask);
+  builder_.add_begin_mask(begin_mask);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<StridedSliceOptions> CreateStridedSliceOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -3816,6 +3945,11 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
                ? static_cast<const SequenceRNNOptions *>(builtin_options())
                : nullptr;
   }
+  const StridedSliceOptions *builtin_options_as_StridedSliceOptions() const {
+    return builtin_options_type() == BuiltinOptions_StridedSliceOptions
+               ? static_cast<const StridedSliceOptions *>(builtin_options())
+               : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -4023,6 +4157,12 @@ Operator::builtin_options_as<SequenceRNNOptions>() const {
   return builtin_options_as_SequenceRNNOptions();
 }
 
+template <>
+inline const StridedSliceOptions *
+Operator::builtin_options_as<StridedSliceOptions>() const {
+  return builtin_options_as_StridedSliceOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -4962,11 +5102,11 @@ inline void SequenceRNNOptions::UnPackTo(
   {
     auto _e = time_major();
     _o->time_major = _e;
-  }
+  };
   {
     auto _e = fused_activation_function();
     _o->fused_activation_function = _e;
-  }
+  };
 }
 
 inline flatbuffers::Offset<SequenceRNNOptions> SequenceRNNOptions::Pack(
@@ -6040,6 +6180,67 @@ inline flatbuffers::Offset<SqueezeOptions> CreateSqueezeOptions(
   return tflite::CreateSqueezeOptions(_fbb, _squeeze_dims);
 }
 
+inline StridedSliceOptionsT *StridedSliceOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new StridedSliceOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void StridedSliceOptions::UnPackTo(
+    StridedSliceOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = begin_mask();
+    _o->begin_mask = _e;
+  };
+  {
+    auto _e = end_mask();
+    _o->end_mask = _e;
+  };
+  {
+    auto _e = ellipsis_mask();
+    _o->ellipsis_mask = _e;
+  };
+  {
+    auto _e = new_axis_mask();
+    _o->new_axis_mask = _e;
+  };
+  {
+    auto _e = shrink_axis_mask();
+    _o->shrink_axis_mask = _e;
+  };
+}
+
+inline flatbuffers::Offset<StridedSliceOptions> StridedSliceOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateStridedSliceOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<StridedSliceOptions> CreateStridedSliceOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const StridedSliceOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _begin_mask = _o->begin_mask;
+  auto _end_mask = _o->end_mask;
+  auto _ellipsis_mask = _o->ellipsis_mask;
+  auto _new_axis_mask = _o->new_axis_mask;
+  auto _shrink_axis_mask = _o->shrink_axis_mask;
+  return tflite::CreateStridedSliceOptions(_fbb, _begin_mask, _end_mask,
+                                           _ellipsis_mask, _new_axis_mask,
+                                           _shrink_axis_mask);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(
     const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
@@ -6552,6 +6753,10 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier,
       auto ptr = reinterpret_cast<const SequenceRNNOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_StridedSliceOptions: {
+      auto ptr = reinterpret_cast<const StridedSliceOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default:
       return false;
   }
@@ -6700,6 +6905,10 @@ inline void *BuiltinOptionsUnion::UnPack(
       auto ptr = reinterpret_cast<const SequenceRNNOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_StridedSliceOptions: {
+      auto ptr = reinterpret_cast<const StridedSliceOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default:
       return nullptr;
   }
@@ -6835,6 +7044,10 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(
       auto ptr = reinterpret_cast<const SequenceRNNOptionsT *>(value);
       return CreateSequenceRNNOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_StridedSliceOptions: {
+      auto ptr = reinterpret_cast<const StridedSliceOptionsT *>(value);
+      return CreateStridedSliceOptions(_fbb, ptr, _rehasher).Union();
+    }
     default:
       return 0;
   }
@@ -6985,6 +7198,11 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u)
           *reinterpret_cast<SequenceRNNOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_StridedSliceOptions: {
+      value = new StridedSliceOptionsT(
+          *reinterpret_cast<StridedSliceOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -7147,6 +7365,11 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_StridedSliceOptions: {
+      auto ptr = reinterpret_cast<StridedSliceOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default:
       break;
   }
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 933da11353..50e8ca75f8 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -46,6 +46,7 @@ gen_zipped_test_files(
         "space_to_batch_nd.zip",
         "space_to_depth.zip",
         "squeeze.zip",
+        "strided_slice.zip",
         "sub.zip",
         "transpose.zip",
     ],
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 56e4dfc7a2..6204471e52 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1447,6 +1447,97 @@ def make_squeeze_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_strided_slice_tests(zip_path):
+  """Make a set of tests to do strided_slice."""
+
+  # TODO(soroosh): add test/support for uint8.
+  test_parameters = [
+      # 4-D
+      {
+          "dtype": [tf.float32, tf.int32, tf.int64],
+          "index_type": [tf.int32],
+          "input_shape": [[12, 2, 2, 5]],
+          "begin": [[0, 0, 0, 0], [1, 0, 1, 0]],
+          "end": [[8, 2, 2, 3], [12, 2, 2, 5]],
+          "strides": [None, [1, 1, 1, 1], [2, 1, 3, 1]],
+          "begin_mask": [None, 0, 1, 2, 8],
+          "end_mask": [None, 0, 1, 2, 8],
+      },
+      # 2-D
+      {
+          "dtype": [tf.float32, tf.int32, tf.int64],
+          "index_type": [tf.int32],
+          "input_shape": [[2, 3]],
+          "begin": [[0, 0], [1, 0]],
+          "end": [[2, 3], [2, 2]],
+          "strides": [None, [1, 1], [2, 2]],
+          "begin_mask": [None, 0, 1, 2],
+          "end_mask": [None, 0, 1, 2],
+      },
+      # Negative strides
+      {
+          "dtype": [tf.float32, tf.int32, tf.int64],
+          "index_type": [tf.int32],
+          "input_shape": [[2, 3]],
+          "begin": [[0, -1]],
+          "end": [[2, -3]],
+          "strides": [[1, -1]],
+          "begin_mask": [None, 0, 1, 2],
+          "end_mask": [None, 0, 1, 2],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build graph for stride_slice test."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    begin = tf.placeholder(
+        dtype=parameters["index_type"],
+        name="begin",
+        shape=[len(parameters["input_shape"])])
+    end = tf.placeholder(
+        dtype=parameters["index_type"],
+        name="end",
+        shape=[len(parameters["input_shape"])])
+    strides = (
+        tf.placeholder(
+            dtype=parameters["index_type"],
+            name="strides",
+            shape=[len(parameters["input_shape"])])
+        if parameters["strides"] is not None else None)
+    tensors = [input_tensor, begin, end]
+    if strides is not None:
+      tensors.append(strides)
+    out = tf.strided_slice(
+        input_tensor,
+        begin,
+        end,
+        strides,
+        begin_mask=parameters["begin_mask"],
+        end_mask=parameters["end_mask"])
+    return tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Build inputs for stride_slice test."""
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    index_type = _TF_TYPE_INFO[parameters["index_type"]][0]
+    begin_values = np.array(parameters["begin"]).astype(index_type)
+    end_values = np.array(parameters["end"]).astype(index_type)
+    stride_values = (
+        np.array(parameters["strides"]).astype(index_type)
+        if parameters["strides"] is not None else None)
+    values = [input_values, begin_values, end_values]
+    if stride_values is not None:
+      values.append(stride_values)
+
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_l2_pool(input_tensor, ksize, strides, padding, data_format):
   """Given an input perform a sequence of TensorFlow ops to produce l2pool."""
   return tf.sqrt(tf.nn.avg_pool(
@@ -1505,6 +1596,7 @@ def main(unused_args):
         "transpose.zip": make_transpose_tests,
         "mean.zip": make_mean_tests,
         "squeeze.zip": make_squeeze_tests,
+        "strided_slice.zip": make_strided_slice_tests,
     }
     out = FLAGS.zip_to_output
     bin_path = FLAGS.toco
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index c8a6e07abd..c29cd85c4d 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -263,6 +263,7 @@ INSTANTIATE_TESTS(div)
 INSTANTIATE_TESTS(transpose)
 INSTANTIATE_TESTS(mean)
 INSTANTIATE_TESTS(squeeze)
+INSTANTIATE_TESTS(strided_slice)
 
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
index de4d06be2a..7e8b249b07 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
@@ -33,6 +33,10 @@ bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(op->inputs.size(), 4);
   const auto& start_array = model->GetArray(op->inputs[1]);
   if (!start_array.has_shape()) return false;
+  if (toco::RequiredBufferSizeForShape(start_array.shape()) > 4) {
+    // Only 1-4D arrays are supported for now.
+    return false;
+  }
 
   const auto& stop_array = model->GetArray(op->inputs[2]);
   if (!stop_array.has_shape()) return false;
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 1947271f55..e8f318cd43 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -1179,6 +1179,8 @@ void ConvertStridedSliceOperator(const NodeDef& node,
                                  const TensorFlowImportFlags& tf_import_flags,
                                  Model* model) {
   CHECK_EQ(node.op(), "StridedSlice");
+  // TODO(soroosh): The 4th input (strides) should be e optional, to be
+  // consistent with TF.
   CheckInputsCount(node, tf_import_flags, 4);
 
   auto* op = new StridedSliceOperator;
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 54fbba7381..3cda63a8ce 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -738,6 +738,9 @@ struct PadOperator : Operator {
 //
 // Inputs:
 //   inputs[0]: required: the input array
+//   inputs[1]: required: the begin array
+//   inputs[2]: required: the end array
+//   inputs[3]: optional: the strides array
 //
 // TensorFlow equivalent: StridedSlice
 struct StridedSliceOperator : Operator {
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 0111e1ed92..0c2b570aad 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -617,6 +617,30 @@ class Split : public CustomOperator<TensorFlowSplitOperator> {
   }
 };
 
+class StridedSlice
+    : public BuiltinOperator<StridedSliceOperator,
+                             ::tflite::StridedSliceOptions,
+                             ::tflite::BuiltinOptions_StridedSliceOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateStridedSliceOptions(
+        *builder, op.begin_mask, op.end_mask, op.ellipsis_mask,
+        op.new_axis_mask, op.shrink_axis_mask);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->begin_mask = options.begin_mask();
+    op->end_mask = options.end_mask();
+    op->ellipsis_mask = options.ellipsis_mask();
+    op->new_axis_mask = options.new_axis_mask();
+    op->shrink_axis_mask = options.shrink_axis_mask();
+  }
+};
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   using BaseOperator::BaseOperator;
@@ -777,6 +801,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new Mean(::tflite::BuiltinOperator_MEAN, OperatorType::kMean));
   ops.emplace_back(
       new Squeeze(::tflite::BuiltinOperator_SQUEEZE, OperatorType::kSqueeze));
+  ops.emplace_back(new StridedSlice(::tflite::BuiltinOperator_STRIDED_SLICE,
+                                    OperatorType::kStridedSlice));
 
   // Custom Operators.
   ops.emplace_back(new Cast("CAST", OperatorType::kCast));
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 77c70847d1..de79c70e1b 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -398,6 +398,28 @@ TEST_F(OperatorTest, Squeeze) {
   EXPECT_EQ(op.squeeze_dims, output_toco_op->squeeze_dims);
 }
 
+TEST_F(OperatorTest, StridedSlice) {
+  StridedSliceOperator op;
+
+  op.begin_mask = 1;
+  op.end_mask = 2;
+  op.ellipsis_mask = 1;
+  op.new_axis_mask = 1;
+  op.shrink_axis_mask = 2;
+
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("STRIDED_SLICE", OperatorType::kStridedSlice), op);
+  EXPECT_EQ(op.start_indices, output_toco_op->start_indices);
+  EXPECT_EQ(op.stop_indices, output_toco_op->stop_indices);
+  EXPECT_EQ(op.strides, output_toco_op->strides);
+  EXPECT_EQ(op.begin_mask, output_toco_op->begin_mask);
+  EXPECT_EQ(op.end_mask, output_toco_op->end_mask);
+  EXPECT_EQ(op.end_mask, output_toco_op->end_mask);
+  EXPECT_EQ(op.ellipsis_mask, output_toco_op->ellipsis_mask);
+  EXPECT_EQ(op.new_axis_mask, output_toco_op->new_axis_mask);
+  EXPECT_EQ(op.shrink_axis_mask, output_toco_op->shrink_axis_mask);
+}
+
 TEST_F(OperatorTest, TensorFlowUnsupported) {
   TensorFlowUnsupportedOperator op;
   op.tensorflow_op = "MyCustomUnsupportedOp";
-- 
GitLab


From 68e17d497497119c24ad506dac4e34e127cf836c Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Tue, 23 Jan 2018 17:54:45 -0800
Subject: [PATCH 051/966] [UPDATE]   Refactoring optimization pass and shape
 inference

shape inference
  Removed shape refiner and apply shape inference through
  grappler/costs/graph_properties

  Currently using static shape inference

optimization pass
  Moved Python optimization pass to c++ optimization pass
---
 tensorflow/contrib/tensorrt/BUILD             |   9 +-
 .../contrib/tensorrt/convert/convert_graph.cc |  87 +++++++++---
 .../contrib/tensorrt/convert/convert_nodes.cc |  24 ++--
 .../contrib/tensorrt/convert/convert_nodes.h  |   5 +-
 .../contrib/tensorrt/convert/inferShapes.cc   | 125 ------------------
 .../contrib/tensorrt/convert/inferShapes.h    |  39 ------
 .../contrib/tensorrt/python/trt_convert.py    |  40 +++---
 7 files changed, 114 insertions(+), 215 deletions(-)
 delete mode 100644 tensorflow/contrib/tensorrt/convert/inferShapes.cc
 delete mode 100644 tensorflow/contrib/tensorrt/convert/inferShapes.h

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 723c9f5434..f92b60b03a 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -174,12 +174,10 @@ cc_library(
         "convert/convert_nodes.cc",
         "convert/convert_graph.cc",
         "segment/segment.cc",
-        "convert/inferShapes.cc",
     ],
     hdrs=[
         "convert/convert_nodes.h",
         "convert/convert_graph.h",
-        "convert/inferShapes.h",
         "segment/segment.h",
         "segment/union_find.h",
     ],
@@ -192,7 +190,12 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:core_cpu_base",
-        #"//third_party/eigen3",
+        "//tensorflow/core/grappler/optimizers:constant_folding",
+        "//tensorflow/core/grappler/optimizers:layout_optimizer",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler:devices",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        #"//tensorflow/core/grappler/clusters:single_machine",
     ],
 )
 
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 29aa555467..cdde56dae2 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -28,7 +28,6 @@ limitations under the License.
 #include "NvInfer.h"
 
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
-#include "tensorflow/contrib/tensorrt/convert/inferShapes.h"
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -40,6 +39,17 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 #define _TF_LOG_DEBUG ::tensorflow::internal::LogMessage(__FILE__, __LINE__, -1)
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/layout_optimizer.h" 
+#include "tensorflow/core/grappler/devices.h"
+//#include "tensorflow/core/grappler/clusters/single_machine.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils.h"
+
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+
 //------------------------------------------------------------------------------
 namespace tensorrt {
 namespace convert {
@@ -114,7 +124,8 @@ std::unordered_map<std::string, std::vector<int>> BuildTensorNameMap(
 tensorflow::Status ConvertSubGraphToTensorRT(
     tensorflow::Graph& graph, const std::vector<std::string>& output_names,
     const std::set<int>& subgraph_node_ids, size_t max_batch_size,
-    size_t max_workspace_size, const ShapeMap& shape_map) {
+    size_t max_workspace_size,
+    const tensorflow::grappler::GraphProperties& graph_properties) {
   tensorflow::EdgeSet subgraph_incoming_edges;
   GetSubGraphIncomingEdges(graph, subgraph_node_ids, &subgraph_incoming_edges);
 
@@ -152,7 +163,7 @@ tensorflow::Status ConvertSubGraphToTensorRT(
   tensorflow::NodeDef trt_node_def;
   TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(
       graph, subgraph_node_ids, subgraph_inputs, subgraph_outputs,
-      max_batch_size, max_workspace_size, shape_map, &trt_node_def));
+      max_batch_size, max_workspace_size, graph_properties, &trt_node_def));
   tensorflow::Status status;
   tensorflow::Node* trt_node = graph.AddNode(trt_node_def, &status);
 
@@ -199,18 +210,62 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<std::string>& output_names, size_t max_batch_size,
     size_t max_workspace_size, tensorflow::GraphDef* new_graph_def) {
-  ShapeMap shape_map;
-  TF_RETURN_IF_ERROR(
-      tensorflow::trt::inferShapes(graph_def, output_names, shape_map));
-  std::stringstream oss;
-  for (auto& n : shape_map) {  // nodes
-    oss << " Node= " << n.first << ", ";
-    for (auto o : n.second) {  // outputs
-      oss << o.first.DebugString() << " T= " << o.second << ", ";
-    }
-    LOG(DEBUG) << oss.str();
-    oss.str("");
-  }
+
+  // optimization pass
+  tensorflow::grappler::GrapplerItem item;
+  item.fetch = output_names;
+  tensorflow::GraphDef gdef;
+
+  // layout optimization
+  item.graph = graph_def;
+  tensorflow::grappler::LayoutOptimizer optimizer;
+  tensorflow::grappler::Cluster* gCluster;
+
+  // virtual cluster
+  tensorflow::DeviceProperties device_properties;
+  device_properties.set_type("GPU");
+  device_properties.mutable_environment()->insert({"architecture", "6"});
+  gCluster =
+    new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
+
+  // single machine
+  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
+  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
+  LOG(DEBUG) << "cpu_cores: " << num_cpu_cores;
+  LOG(DEBUG) << "gpus: " << num_gpus;
+  // int timeout_s = 60 * 10;
+  // gCluster = new tensorflow::grappler::SingleMachine(
+  //                  timeout_s, num_cpu_cores, num_gpus);
+
+  tensorflow::Status status = optimizer.Optimize(gCluster, item, &gdef);
+
+  if (status !=tensorflow::Status::OK())
+    return status;
+ 
+  // constant folding
+  item.graph = gdef;
+  tensorflow::grappler::ConstantFolding fold(nullptr);
+  status = fold.Optimize(nullptr, item, &gdef);
+  if (status !=tensorflow::Status::OK())
+    return status;
+  
+  // AJ refactoring shape inference through grappler/GraphProperties.
+  tensorflow::grappler::GraphProperties static_graph_properties(item);
+  static_graph_properties.InferStatically(false);
+  // TF_CHECK_OK(static_graph_prop.InferStatically(false));
+  // ShapeMap shape_map;
+  // TF_RETURN_IF_ERROR(
+  //     tensorflow::trt::inferShapes(gdef, output_names, shape_map));
+  // std::stringstream oss;
+  // for (auto& n : shape_map) {  // nodes
+  //   oss << " Node= " << n.first << ", ";
+  //   for (auto o : n.second) {  // outputs
+  //     oss << o.first.DebugString() << " T= " << o.second << ", ";
+  //   }
+  //   LOG(DEBUG) << oss.str();
+  //   oss.str("");
+  // }
+
   // Build full graph
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
                                              graph_def.library());
@@ -243,7 +298,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     }
     TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
         graph, output_names, subgraph_node_ids, max_batch_size,
-        max_workspace_size, shape_map));
+        max_workspace_size, static_graph_properties));
   }
   graph.ToGraphDef(new_graph_def);
   return tensorflow::Status::OK();
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 83f78d7eff..6c77cdc0b6 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -1548,7 +1548,8 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids,
     const std::vector<std::pair<int, int>>& input_inds,
     const std::vector<std::pair<int, int>>& output_inds, size_t max_batch_size,
-    size_t max_workspace_size, const ShapeMap& shape_map,
+    size_t max_workspace_size,
+    const tensorflow::grappler::GraphProperties& graph_properties,
     tensorflow::NodeDef* trt_node) {
   // Visit nodes in reverse topological order and construct the TRT network.
 
@@ -1605,20 +1606,20 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     input_names.push_back(node_name);  // insert original node name without port
     // TODO(jie): alternative :)
     // tensorflow::DataType tf_dtype = node->output_type(output_idx);
-    if (shape_map.count(node_name) == 0)
+    if (!graph_properties.HasOutputProperties(node_name))
       return tensorflow::errors::Internal("failed to find input node: " +
                                           node_name);
 
-    auto input_entry_vec = shape_map.at(node_name);
-    if (static_cast<int>(input_entry_vec.size()) < output_idx)
+    auto op_info_vec = graph_properties.GetOutputProperties(node_name);
+    if (static_cast<int>(op_info_vec.size()) < output_idx)
       return tensorflow::errors::Internal(
           "accessing output index of: " + std::to_string(output_idx) +
           ", at node: " + node_name + "with output entry from shape_map: " +
-          std::to_string(input_entry_vec.size()));
+          std::to_string(op_info_vec.size()));
 
-    auto input_entry = input_entry_vec.at(output_idx);
+    auto op_info = op_info_vec.at(output_idx);
 
-    tensorflow::DataType tf_dtype = input_entry.second;
+    tensorflow::DataType tf_dtype = op_info.dtype();
     input_dtypes.push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
@@ -1627,15 +1628,16 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     LOG(DEBUG) << "accessing output index of: " << std::to_string(output_idx)
                << ", at node: " << node_name
                << "with output entry from shape_map: "
-               << std::to_string(input_entry_vec.size());
+               << std::to_string(op_info_vec.size());
+
     // TODO(ben,jie): update TRT input format/dimension
     nvinfer1::DimsCHW input_dim_psuedo_chw;
     for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
 
-    for (int i = 1; i < input_entry.first.dims(); i++) {
+    for (int i = 1; i < op_info.shape().dim_size(); i++) {
       LOG(DEBUG) << "dimension: " << i
-                 << " , size: " << input_entry.first.dim_size(i);
-      input_dim_psuedo_chw.d[i - 1] = input_entry.first.dim_size(i);
+                 << " , size: " << op_info.shape().dim(i).size();
+      input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
     }
 
     // TODO(ben,jie): proper way to restore input tensor name?
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index a624582dec..dc59c37892 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -20,10 +20,10 @@ limitations under the License.
 #include <vector>
 #include <utility>
 
-#include "tensorflow/contrib/tensorrt/convert/inferShapes.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
 
 namespace tensorrt {
 namespace convert {
@@ -34,7 +34,8 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         input_inds,  // {node_id, output_idx}
     const std::vector<std::pair<int, int>>&
         output_inds,  // {node_id, output_idx}
-    size_t max_batch_size, size_t max_workspace_size, const ShapeMap& shape_map,
+    size_t max_batch_size, size_t max_workspace_size,
+    const tensorflow::grappler::GraphProperties& graph_prop,
     tensorflow::NodeDef* trt_node);
 }  // namespace convert
 }  // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/convert/inferShapes.cc b/tensorflow/contrib/tensorrt/convert/inferShapes.cc
deleted file mode 100644
index c7f0f0023d..0000000000
--- a/tensorflow/contrib/tensorrt/convert/inferShapes.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/tensorrt/convert/inferShapes.h"
-#include <functional>
-#include "tensorflow/core/common_runtime/shape_refiner.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/types.pb_text.h"
-#include "tensorflow/core/graph/algorithm.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/logging.h"
-
-#define _TF_LOG_DEBUG ::tensorflow::internal::LogMessage(__FILE__, __LINE__, -1)
-
-namespace tensorflow {
-namespace trt {
-std::vector<tensorflow::DataType> getTypes(const tensorflow::OpDef& op,
-                                           const tensorflow::NodeDef& nd,
-                                           bool inp = true) {
-  const auto& attrMap = nd.attr();
-  auto getType = [&attrMap](decltype(
-                     op.input_arg(0)) a) -> std::vector<tensorflow::DataType> {
-    std::vector<tensorflow::DataType> tvec;
-    if (!a.type_list_attr().empty()) {  // get the list types
-      const auto& tl = attrMap.at(a.type_list_attr()).list();
-      int tsize = tl.type_size();
-      tvec.reserve(tsize);
-      for (int t = 0; t < tsize; t++) {
-        tvec.push_back(tl.type(t));
-      }
-      return tvec;
-    }
-    tensorflow::DataType cType = tensorflow::DT_INVALID;
-    if (a.type() != tensorflow::DT_INVALID) {  // get defined types
-      cType = a.type();
-    } else if (!a.type_attr().empty()) {
-      cType = attrMap.at(a.type_attr()).type();
-    }
-    if (!a.number_attr().empty()) {  // numbertypes
-      int64 nTensors = attrMap.at(a.number_attr()).i();
-      tvec = std::vector<tensorflow::DataType>(nTensors, cType);
-      return tvec;
-    }
-    tvec.push_back(cType);
-    return tvec;
-  };
-  std::vector<tensorflow::DataType> types;
-  if (inp) {
-    int n_inputs = op.input_arg_size();
-    for (int i = 0; i < n_inputs; i++) {
-      auto tout = getType(op.input_arg(i));
-      LOG(DEBUG) << "Node= " << nd.name() << " #inputs" << tout.size();
-      types.insert(types.end(), tout.begin(), tout.end());
-    }
-  } else {
-    int n_outputs = op.output_arg_size();
-    // types.resize(n_outputs);
-    for (int i = 0; i < n_outputs; i++) {
-      auto tout = getType(op.output_arg(i));
-      LOG(DEBUG) << "Node= " << nd.name() << " #outputs" << tout.size();
-      types.insert(types.end(), tout.begin(), tout.end());
-    }
-  }
-  return types;
-}
-
-tensorflow::Status inferShapes(const tensorflow::GraphDef& graph_def,
-                               const std::vector<std::string>& output_names,
-                               ShapeMap& shapes) {
-  tensorflow::Graph g(OpRegistry::Global());
-  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
-      tensorflow::GraphConstructorOptions(), graph_def, &g));
-  std::vector<tensorflow::Node*> POnodes;
-  tensorflow::GetPostOrder(g, &POnodes);
-  tensorflow::ShapeRefiner refiner(graph_def.versions().producer(),
-                                   OpRegistry::Global());
-  for (auto n = POnodes.rbegin(); n != POnodes.rend(); ++n) {
-    TF_CHECK_OK(refiner.AddNode(*n));
-  }
-
-  auto shape2PTS = [](tensorflow::shape_inference::InferenceContext* ic,
-                      const tensorflow::shape_inference::ShapeHandle& sh)
-      -> tensorflow::PartialTensorShape {
-    std::vector<int64> dims;
-    int64 rank = ic->Rank(sh);
-    for (int64 i = 0; i < rank; i++) {
-      auto dh = ic->Dim(sh, i);
-      dims.push_back(ic->Value(dh));
-    }
-    return tensorflow::PartialTensorShape(dims);
-  };
-  for (const auto& n : POnodes) {
-    auto ic = refiner.GetContext(n);
-    if (ic) {
-      int nOuts = ic->num_outputs();
-      auto types = getTypes(n->op_def(), n->def(), false);
-      std::vector<
-          std::pair<tensorflow::PartialTensorShape, tensorflow::DataType>>
-          SAT;
-      for (int i = 0; i < nOuts; i++) {
-        auto PTS = shape2PTS(ic, ic->output(i));
-        SAT.push_back({PTS, types.at(i)});
-      }
-      shapes[n->name()] = SAT;
-    } else {
-      LOG(WARNING) << "Node " << n->name() << " doesn't have InferenceContext!";
-    }
-  }
-  return tensorflow::Status::OK();
-}
-}  // namespace trt
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/convert/inferShapes.h b/tensorflow/contrib/tensorrt/convert/inferShapes.h
deleted file mode 100644
index b94f1ee893..0000000000
--- a/tensorflow/contrib/tensorrt/convert/inferShapes.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_INFERSHAPES_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_INFERSHAPES_H_
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include <utility>
-
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/lib/core/status.h"
-
-typedef std::unordered_map<std::string,
-                           std::vector<std::pair<tensorflow::PartialTensorShape,
-                                                 tensorflow::DataType>>>
-    ShapeMap;
-namespace tensorflow {
-namespace trt {
-tensorflow::Status inferShapes(const tensorflow::GraphDef& graph_def,
-                               const std::vector<std::string>& output_names,
-                               ShapeMap& shapes);
-}
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_INFERSHAPES_H_
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index a66afa8d05..521ccdce42 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -48,36 +48,38 @@ def CreateInferenceGraph(input_graph_def, outputs,max_batch_size=1,max_workspace
   #   output_graph_def_string = trt_convert(
   #       input_graph_def_string,outputs,
   #       max_batch_size,max_workspace_size, status)
-  g = tf.Graph()
-  with g.as_default():
-    tf.import_graph_def(input_graph_def, name="")
-  rewriter_config = rewriter_config_pb2.RewriterConfig()
-  rewriter_config.optimizers.append('layout')
-  rewriter_config.optimizers.append('constfold')
+  # g = tf.Graph()
+  # with g.as_default():
+  #   tf.import_graph_def(input_graph_def, name="")
+  # rewriter_config = rewriter_config_pb2.RewriterConfig()
+  # rewriter_config.optimizers.append('layout')
+  # rewriter_config.optimizers.append('constfold')
 
-  # mark output nodes as fetch
-  train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
-  for node_name in outputs:
-    out_node = g.get_operation_by_name(node_name)
-    for i in range(0,len(out_node.outputs)):
-      train_op.append(out_node.outputs[0])
+  # # mark output nodes as fetch
+  # train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+  # for node_name in outputs:
+  #   out_node = g.get_operation_by_name(node_name)
+  #   for i in range(0,len(out_node.outputs)):
+  #     train_op.append(out_node.outputs[0])
 
-  # constant folding
-  mg = meta_graph.create_meta_graph_def(graph=g)
-  meta_graph.add_collection_def(mg, ops.GraphKeys.TRAIN_OP)
-  optimized_graph_def_str = \
-    tf_optimizer.OptimizeGraph(rewriter_config, mg).SerializeToString()
+  # # constant folding
+  # mg = meta_graph.create_meta_graph_def(graph=g)
+  # meta_graph.add_collection_def(mg, ops.GraphKeys.TRAIN_OP)
+  # optimized_graph_def_str = \
+  #   tf_optimizer.OptimizeGraph(rewriter_config, mg).SerializeToString()
+  input_graph_def_str= \
+    input_graph_def.SerializeToString()
 
   # TODO(sami): Fix this when we can return status from C++ library
   # There is a problem with the TF internal library setup that doesn't allow us to return a status object from C++.
   # Thus we return a  pair or strings where first one is encoded status and the second one is the
   # transformed graphs protobuf string.
   out = trt_convert(
-      optimized_graph_def_str ,outputs,
+      input_graph_def_str ,outputs,
       max_batch_size,max_workspace_size)
   status = out[0]
   output_graph_def_string = out[1]
-  del optimized_graph_def_str #save some memory
+  del input_graph_def_str #save some memory
   if len(status) < 2:
     raise _impl.UnknownError(None,None,status)
   if status[:2] != "OK":
-- 
GitLab


From 8aaff49102d5d652521f229da44437aa0cd578ec Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Fri, 19 Jan 2018 19:13:43 -0800
Subject: [PATCH 052/966] Turn the op_performance_data proto lib into a header
 only library by default

PiperOrigin-RevId: 182621348

Signed-off-by: Jie <jiej@nvidia.com>
---
 tensorflow/core/BUILD                         |  6 +++--
 tensorflow/core/grappler/costs/BUILD          | 24 +++++++++----------
 .../core/platform/default/build_config.bzl    |  8 +++++++
 tensorflow/python/BUILD                       |  4 ++--
 4 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 579174efa3..f2f66fc567 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -136,6 +136,8 @@ load(
     "tf_nano_proto_library",
     "tf_protos_all",
     "tf_protos_all_impl",
+    "tf_protos_grappler",
+    "tf_protos_grappler_impl",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -1529,7 +1531,7 @@ cc_library(
         "@snappy",
         "@zlib_archive//:zlib",
         "@protobuf_archive//:protobuf",
-    ] + tf_protos_all_impl(),
+    ] + tf_protos_all_impl() + tf_protos_grappler_impl(),
 )
 
 # File compiled with extra flags to get cpu-specific acceleration.
@@ -2094,7 +2096,7 @@ tf_cuda_library(
         ":core_cpu_base",
         ":proto_text",
         "//tensorflow/core/grappler:grappler_item",
-    ] + if_static([":core_cpu_impl"]) + tf_protos_all(),
+    ] + if_static([":core_cpu_impl"]) + tf_protos_all() + tf_protos_grappler(),
 )
 
 tf_cuda_library(
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 7abc155c19..0fe01e9c9e 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -1,6 +1,10 @@
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library", "tf_cc_test")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_protos_grappler",
+)
 
 filegroup(
     name = "all_files",
@@ -37,6 +41,7 @@ tf_proto_library(
     name = "op_performance_data",
     srcs = ["op_performance_data.proto"],
     cc_api_version = 2,
+    default_header = True,
     protodeps = tf_additional_all_protos(),
     visibility = ["//visibility:public"],
 )
@@ -47,7 +52,6 @@ cc_library(
     hdrs = ["graph_properties.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":op_performance_data_cc",
         ":utils",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
@@ -55,7 +59,7 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 tf_cc_test(
@@ -135,7 +139,7 @@ tf_cuda_library(
     hdrs = ["utils.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":op_performance_data_cc",
+        "//third_party/eigen3",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
@@ -143,8 +147,7 @@ tf_cuda_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:utils",
-        "//third_party/eigen3",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 tf_cc_test(
@@ -207,9 +210,8 @@ cc_library(
     hdrs = ["op_context.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":op_performance_data_cc",
         "//tensorflow/core:protos_all_cc",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 cc_library(
@@ -276,12 +278,11 @@ cc_library(
     deps = [
         ":cost_estimator",
         ":op_context",
-        ":op_performance_data_cc",
+        "//third_party/eigen3",
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler/clusters:utils",
-        "//third_party/eigen3",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 tf_cc_test(
@@ -305,7 +306,6 @@ cc_library(
         ":cost_estimator",
         ":graph_properties",
         ":op_level_cost_estimator",
-        ":op_performance_data_cc",
         ":utils",
         ":virtual_placer",
         ":virtual_scheduler",
@@ -314,7 +314,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 tf_cc_test(
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index e9c510c93c..2102c5cca3 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -378,6 +378,14 @@ def tf_protos_all():
       extra_deps=tf_protos_all_impl(),
       otherwise=["//tensorflow/core:protos_all_cc"])
 
+def tf_protos_grappler_impl():
+  return ["//tensorflow/core/grappler/costs:op_performance_data_cc_impl"]
+
+def tf_protos_grappler():
+  return if_static(
+      extra_deps=tf_protos_grappler_impl(),
+      otherwise=["//tensorflow/core/grappler/costs:op_performance_data_cc"])
+
 def tf_env_time_hdrs():
   return [
       "platform/env_time.h",
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 3493ed76f3..dbb29d9878 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -32,6 +32,7 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library_py")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_lib_deps")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_grappler")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_plugin_deps")
 load("//tensorflow/python:build_defs.bzl", "tf_gen_op_wrapper_private_py")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_verbs_deps")
@@ -209,9 +210,8 @@ cc_library(
         "//tensorflow/core/grappler/costs:analytical_cost_estimator",
         "//tensorflow/core/grappler/costs:cost_estimator",
         "//tensorflow/core/grappler/costs:measuring_cost_estimator",
-        "//tensorflow/core/grappler/costs:op_performance_data_cc",
         "//tensorflow/core/grappler/costs:utils",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 cc_library(
-- 
GitLab


From 320e07a3c7f2815b31cd534ea714616040abe91c Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Tue, 23 Jan 2018 18:12:36 -0800
Subject: [PATCH 053/966] [BUG_FIX]   git patch issue fixed. modified:  
 tensorflow/contrib/tensorrt/convert/convert_graph.cc

---
 tensorflow/contrib/tensorrt/convert/convert_graph.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index cdde56dae2..e90790716c 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -268,10 +268,10 @@ tensorflow::Status ConvertGraphDefToTensorRT(
 
   // Build full graph
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
-                                             graph_def.library());
+                                             gdef.library());
   tensorflow::Graph graph(flib);
   TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
-      tensorflow::GraphConstructorOptions(), graph_def, &graph));
+      tensorflow::GraphConstructorOptions(), gdef, &graph));
 
   // Segment the graph into subgraphs that can be converted to TensorRT
   tensorrt::segment::SegmentOptions segment_options;
@@ -282,7 +282,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   segment_options.minimum_segment_size = 2;
   tensorrt::segment::SegmentNodesVector segments;
   TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
-      graph_def, IsTensorRTCandidate, segment_options, &segments));
+      gdef, IsTensorRTCandidate, segment_options, &segments));
   if (segments.size() > 1) {
     // LOG(WARNING) << "Multiple TensorRT candidate subgraphs were found, "
     //<< "but only the first can be converted.";
-- 
GitLab


From f073d5f56dd07b42d381a5a8ce6868b3ea6ac8f2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 18:18:15 -0800
Subject: [PATCH 054/966] Update ops-related pbtxt files.

PiperOrigin-RevId: 183024949
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 468 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |  54 ++
 2 files changed, 522 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index c2e2793760..47de3d398e 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -18822,6 +18822,57 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "FixedLengthRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
 op {
   name: "FixedLengthRecordReaderV2"
   output_arg {
@@ -21384,6 +21435,32 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "IdentityReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
 op {
   name: "IdentityReaderV2"
   output_arg {
@@ -38544,6 +38621,46 @@ op {
     }
   }
 }
+op {
+  name: "ResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ResizeBilinearGrad"
   input_arg {
@@ -38577,6 +38694,40 @@ op {
     }
   }
 }
+op {
+  name: "ResizeBilinearGrad"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ResizeNearestNeighbor"
   input_arg {
@@ -61858,6 +62009,39 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TFRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
 op {
   name: "TFRecordReaderV2"
   output_arg {
@@ -62259,6 +62443,16 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "TensorArrayCloseV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArrayCloseV3"
   input_arg {
@@ -62436,6 +62630,41 @@ op {
     }
   }
 }
+op {
+  name: "TensorArrayGatherV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArrayGatherV3"
   input_arg {
@@ -62513,6 +62742,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorArrayGradV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "grad_handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "source"
+    type: "string"
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
 op {
   name: "TensorArrayGradV3"
   input_arg {
@@ -62619,6 +62871,32 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorArrayReadV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArrayReadV3"
   input_arg {
@@ -62701,6 +62979,36 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorArrayScatterV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArrayScatterV3"
   input_arg {
@@ -62763,6 +63071,24 @@ op {
     type: DT_INT32
   }
 }
+op {
+  name: "TensorArraySizeV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArraySizeV3"
   input_arg {
@@ -62837,6 +63163,36 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorArraySplitV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArraySplitV3"
   input_arg {
@@ -62938,6 +63294,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorArrayV2"
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    name: "dynamic_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "clear_after_read"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "tensor_array_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
 op {
   name: "TensorArrayV3"
   input_arg {
@@ -63103,6 +63508,36 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorArrayWriteV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArrayWriteV3"
   input_arg {
@@ -63481,6 +63916,39 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TextLineReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "skip_header_lines"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
 op {
   name: "TextLineReaderV2"
   output_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 60da36a939..fe0b362d8f 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -8540,6 +8540,10 @@ op {
       s: ""
     }
   }
+  deprecation {
+    version: 26
+    explanation: "Use FixedLengthRecordReaderV2"
+  }
   is_stateful: true
 }
 op {
@@ -10137,6 +10141,10 @@ op {
       s: ""
     }
   }
+  deprecation {
+    version: 26
+    explanation: "Use IdentityReaderV2"
+  }
   is_stateful: true
 }
 op {
@@ -19699,6 +19707,7 @@ op {
         type: DT_UINT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -19733,6 +19742,7 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_DOUBLE
       }
@@ -28618,6 +28628,10 @@ op {
       s: ""
     }
   }
+  deprecation {
+    version: 26
+    explanation: "Use TFRecordReaderV2"
+  }
   is_stateful: true
 }
 op {
@@ -28888,6 +28902,10 @@ op {
     name: "handle"
     type: DT_STRING
   }
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayCloseV3"
+  }
 }
 op {
   name: "TensorArrayCloseV3"
@@ -29067,6 +29085,10 @@ op {
       }
     }
   }
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayGatherV3"
+  }
 }
 op {
   name: "TensorArrayGatherV3"
@@ -29144,6 +29166,10 @@ op {
     name: "source"
     type: "string"
   }
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayGradV3"
+  }
   is_stateful: true
 }
 op {
@@ -29253,6 +29279,10 @@ op {
     name: "dtype"
     type: "type"
   }
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayReadV3"
+  }
 }
 op {
   name: "TensorArrayReadV3"
@@ -29336,6 +29366,10 @@ op {
     name: "T"
     type: "type"
   }
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayScatterV3"
+  }
 }
 op {
   name: "TensorArrayScatterV3"
@@ -29399,6 +29433,10 @@ op {
     name: "size"
     type: DT_INT32
   }
+  deprecation {
+    version: 26
+    explanation: "Use TensorArraySizeV3"
+  }
 }
 op {
   name: "TensorArraySizeV3"
@@ -29474,6 +29512,10 @@ op {
     name: "T"
     type: "type"
   }
+  deprecation {
+    version: 26
+    explanation: "Use TensorArraySplitV3"
+  }
 }
 op {
   name: "TensorArraySplitV3"
@@ -29575,6 +29617,10 @@ op {
       s: ""
     }
   }
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayV3"
+  }
   is_stateful: true
 }
 op {
@@ -29692,6 +29738,10 @@ op {
     name: "T"
     type: "type"
   }
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayWriteV3"
+  }
 }
 op {
   name: "TensorArrayWriteV3"
@@ -30069,6 +30119,10 @@ op {
       s: ""
     }
   }
+  deprecation {
+    version: 26
+    explanation: "Use TextLineReaderV2"
+  }
   is_stateful: true
 }
 op {
-- 
GitLab


From 7035501c1b35d52d80d8fde3a95492d83a96f495 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Tue, 23 Jan 2018 18:28:49 -0800
Subject: [PATCH 055/966] Fix License date

---
 third_party/tensorrt/LICENSE | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/tensorrt/LICENSE b/third_party/tensorrt/LICENSE
index d3da228420..96c0e3bcaf 100644
--- a/third_party/tensorrt/LICENSE
+++ b/third_party/tensorrt/LICENSE
@@ -1,4 +1,4 @@
-Copyright 2015 The TensorFlow Authors.  All rights reserved.
+Copyright 2018 The TensorFlow Authors.  All rights reserved.
 
                                  Apache License
                            Version 2.0, January 2004
-- 
GitLab


From 93307b6ea9d94a6d69783a9afad51dbe11788d41 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 18:46:01 -0800
Subject: [PATCH 056/966] Go: Update generated wrapper functions for TensorFlow
 ops.

PiperOrigin-RevId: 183027491
---
 tensorflow/go/op/wrappers.go | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 2f1f3b0be4..7873201f3e 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2461,6 +2461,8 @@ func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
 }
 
 // Deprecated. Use TensorArrayV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
 func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -10400,6 +10402,8 @@ func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output,
 }
 
 // Deprecated. Use TensorArraySplitV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
 func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -11813,6 +11817,8 @@ func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output
 }
 
 // Deprecated. Use TensorArrayReadV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
 func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -20211,6 +20217,8 @@ func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size
 }
 
 // Deprecated. Use TensorArrayGradV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
 func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -21504,6 +21512,8 @@ func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
 }
 
 // Deprecated. Use TensorArrayGatherV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
 func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -22228,6 +22238,8 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 
 // Deprecated. Use TensorArrayCloseV3
 //
+// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+//
 // Returns the created operation.
 func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
@@ -23247,6 +23259,8 @@ func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size
 }
 
 // Deprecated. Use TensorArrayGradV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
 func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -23397,6 +23411,8 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 }
 
 // Deprecated. Use TensorArrayScatterV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
 func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -23601,6 +23617,8 @@ func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, o
 }
 
 // Deprecated. Use TensorArraySizeV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
 func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
-- 
GitLab


From 31301635e25a3e7bbac32c8527ceb60060c2264f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 18:49:09 -0800
Subject: [PATCH 057/966] Add nested support for `assertAllClose()` so `a` and
 `b` can be arbitrarily nested of a numpy array, tuple, list, namedtuple or
 anything can be converted to a numpy array.

PiperOrigin-RevId: 183027752
---
 tensorflow/python/framework/test_util.py      | 93 ++++++++++++++-----
 tensorflow/python/framework/test_util_test.py | 34 +++++--
 2 files changed, 96 insertions(+), 31 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 7b09891fc1..0133318456 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1136,41 +1136,84 @@ class TensorFlowTestCase(googletest.TestCase):
       np.testing.assert_allclose(
           a, b, rtol=rtol, atol=atol, err_msg=msg, equal_nan=True)
 
-  def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
-    """Asserts that two numpy arrays, or dicts of same, have near values.
-
-    This does not support nested dicts. `a` and `b` can be namedtuples too,
-    which are converted to dicts.
-
-    Args:
-      a: The expected numpy ndarray (or anything can be converted to one), or
-        dict of same. Must be a dict iff `b` is a dict.
-      b: The actual numpy ndarray (or anything can be converted to one), or
-        dict of same. Must be a dict iff `a` is a dict.
-      rtol: relative tolerance.
-      atol: absolute tolerance.
+  def _assertAllCloseRecursive(self, a, b, rtol=1e-6, atol=1e-6, path=None):
+    path = path or []
+    path_str = (("[" + "][".join([str(p) for p in path]) + "]") if path else "")
 
-    Raises:
-      ValueError: if only one of `a` and `b` is a dict.
-    """
     # Check if a and/or b are namedtuples.
     if hasattr(a, "_asdict"):
       a = a._asdict()
     if hasattr(b, "_asdict"):
       b = b._asdict()
-    is_a_dict = isinstance(a, dict)
-    if is_a_dict != isinstance(b, dict):
-      raise ValueError("Can't compare dict to non-dict, %s vs %s." % (a, b))
-    if is_a_dict:
+    a_is_dict = isinstance(a, dict)
+    if a_is_dict != isinstance(b, dict):
+      raise ValueError("Can't compare dict to non-dict, a%s vs b%s." %
+                       (path_str, path_str))
+    if a_is_dict:
       self.assertItemsEqual(
-          a.keys(), b.keys(),
-          msg="mismatched keys, expected %s, got %s" % (a.keys(), b.keys()))
+          a.keys(),
+          b.keys(),
+          msg="mismatched keys: a%s has keys %s, but b%s has keys %s" %
+          (path_str, a.keys(), path_str, b.keys()))
       for k in a:
+        path.append(k)
+        self._assertAllCloseRecursive(
+            a[k], b[k], rtol=rtol, atol=atol, path=path)
+        del path[-1]
+    elif isinstance(a, (list, tuple)):
+      # Try to directly compare a, b as ndarrays; if not work, then traverse
+      # through the sequence, which is more expensive.
+      try:
+        a_as_ndarray = np.array(a)
+        b_as_ndarray = np.array(b)
         self._assertArrayLikeAllClose(
-            a[k], b[k], rtol=rtol, atol=atol,
-            msg="%s: expected %s, got %s." % (k, a, b))
+            a_as_ndarray,
+            b_as_ndarray,
+            rtol=rtol,
+            atol=atol,
+            msg="Mismatched value: a%s is different from b%s." % (path_str,
+                                                                  path_str))
+      except (ValueError, TypeError) as e:
+        if len(a) != len(b):
+          raise ValueError(
+              "Mismatched length: a%s has %d items, but b%s has %d items" %
+              (path_str, len(a), path_str, len(b)))
+        for idx, (a_ele, b_ele) in enumerate(zip(a, b)):
+          path.append(str(idx))
+          self._assertAllCloseRecursive(
+              a_ele, b_ele, rtol=rtol, atol=atol, path=path)
+          del path[-1]
+    # a and b are ndarray like objects
     else:
-      self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)
+      self._assertArrayLikeAllClose(
+          a,
+          b,
+          rtol=rtol,
+          atol=atol,
+          msg="Mismatched value: a%s is different from b%s." % (path_str,
+                                                                path_str))
+
+  def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
+    """Asserts that two structures of numpy arrays, have near values.
+
+    `a` and `b` can be arbitrarily nested structures. A layer of a nested
+    structure can be a `dict`, `namedtuple`, `tuple` or `list`.
+
+    Args:
+      a: The expected numpy `ndarray`, or anything that can be converted into a
+          numpy `ndarray`, or any arbitrarily nested of structure of these.
+      b: The actual numpy `ndarray`, or anything that can be converted into a
+          numpy `ndarray`, or any arbitrarily nested of structure of these.
+      rtol: relative tolerance.
+      atol: absolute tolerance.
+
+    Raises:
+      ValueError: if only one of `a[p]` and `b[p]` is a dict or
+          `a[p]` and `b[p]` have different length, where `[p]` denotes a path
+          to the nested structure, e.g. given `a = [(1, 1), {'d': (6, 7)}]` and
+          `[p] = [1]['d']`, then `a[p] = (6, 7)`.
+    """
+    self._assertAllCloseRecursive(a, b, rtol=rtol, atol=atol)
 
   def assertAllCloseAccordingToType(self,
                                     a,
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index 6ddb3533e5..3594d125bf 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import copy
 import random
 import threading
 
@@ -252,12 +253,30 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(AssertionError, r"Not equal to tolerance"):
       self.assertAllClose(expected, {"a": a, "b": b, "c": c_copy})
 
-  def testAllCloseNestedDicts(self):
-    a = {"a": 1, "b": 2, "nested": {"d": 3, "e": 4}}
-    with self.assertRaisesRegexp(
-        TypeError,
-        r"inputs could not be safely coerced to any supported types"):
-      self.assertAllClose(a, a)
+  def testAllCloseListOfNamedtuples(self):
+    my_named_tuple = collections.namedtuple("MyNamedTuple", ["x", "y"])
+    l1 = [
+        my_named_tuple(x=np.array([[2.3, 2.5]]), y=np.array([[0.97, 0.96]])),
+        my_named_tuple(x=np.array([[3.3, 3.5]]), y=np.array([[0.98, 0.99]]))
+    ]
+    l2 = [
+        ([[2.3, 2.5]], [[0.97, 0.96]]),
+        ([[3.3, 3.5]], [[0.98, 0.99]]),
+    ]
+    self.assertAllClose(l1, l2)
+
+  def testAllCloseNestedStructure(self):
+    a = {"x": np.ones((3, 2, 4)) * 7, "y": (2, [{"nested": {"m": 3, "n": 4}}])}
+    self.assertAllClose(a, a)
+
+    b = copy.deepcopy(a)
+    self.assertAllClose(a, b)
+
+    # Test mismatched values
+    b["y"][1][0]["nested"]["n"] = 4.2
+    with self.assertRaisesRegexp(AssertionError,
+                                 r"\[y\]\[1\]\[0\]\[nested\]\[n\]"):
+      self.assertAllClose(a, b)
 
   def testArrayNear(self):
     a = [1, 2]
@@ -282,6 +301,9 @@ class TestUtilTest(test_util.TensorFlowTestCase):
         control_flow_ops.Assert(x, y).run()
 
   def testAssertAllCloseAccordingToType(self):
+    # test plain int
+    self.assertAllCloseAccordingToType(1, 1, rtol=1e-8, atol=1e-8)
+
     # test float64
     self.assertAllCloseAccordingToType(
         np.asarray([1e-8], dtype=np.float64),
-- 
GitLab


From 11429a5e25cf7bf9b45a564283a35e1495de6741 Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Tue, 23 Jan 2018 20:09:58 -0800
Subject: [PATCH 058/966] [tf.data] Fixes flaky timeouts of
 stats_dataset_ops_test.

PiperOrigin-RevId: 183033139
---
 tensorflow/contrib/data/python/kernel_tests/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 1fbf18f30a..1cf0202fd8 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -475,7 +475,7 @@ py_test(
 
 py_test(
     name = "stats_dataset_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["stats_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
-- 
GitLab


From 2a9866020dbcb53de32bcd984150545a623e34b7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 21:51:06 -0800
Subject: [PATCH 059/966] Make an API for non-slot variables. This new API is
 safer than the existing AdamOptimizer code when switching between multiple
 graphs or multiple threads.

PiperOrigin-RevId: 183039296
---
 .../eager/python/checkpointable_test.py       | 48 +++++++-------
 .../python/training/lazy_adam_optimizer.py    |  5 +-
 .../opt/python/training/nadam_optimizer.py    | 15 +++--
 tensorflow/python/training/adam.py            | 62 ++++++++-----------
 tensorflow/python/training/adam_test.py       |  7 ++-
 tensorflow/python/training/optimizer.py       | 27 ++++++--
 .../training/sync_replicas_optimizer_test.py  |  5 +-
 7 files changed, 90 insertions(+), 79 deletions(-)

diff --git a/tensorflow/contrib/eager/python/checkpointable_test.py b/tensorflow/contrib/eager/python/checkpointable_test.py
index f820990bbe..ff419614f5 100644
--- a/tensorflow/contrib/eager/python/checkpointable_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_test.py
@@ -70,42 +70,36 @@ class CheckpointableAdam(adam.AdamOptimizer, checkpointable.Checkpointable):
     checkpointable.Checkpointable.__init__(self)
     adam.AdamOptimizer.__init__(self, *args, **kwargs)
 
-  # NOTE: Copied from AdamOptimizer with modifications to use add_variable
+  # NOTE: Copied from Optimizer with modifications to use add_variable
   # for non-slot variables. These contortions are necessary to maintain
   # checkpoint compatibility with variable.name based saving.
-  def _create_slots(self, var_list):
-    # Create the beta1 and beta2 accumulators on the same device as the first
-    # variable. Sort the var_list to make sure this device is consistent across
-    # workers (these need to go on the same PS, otherwise some updates are
-    # silently ignored).
-    first_var = min(var_list, key=lambda x: x.name)
-
-    create_new = self._beta1_power is None
-    if not create_new and context.in_graph_mode():
-      create_new = (self._beta1_power.graph is not first_var.graph)
-
-    if create_new:
-      with ops.colocate_with(first_var):
+  # TODO(allenl): Make this cleaner.
+  def _create_non_slot_variable(self, initial_value, name, colocate_with):
+    """Add an extra variable, not associated with a slot."""
+    if context.in_graph_mode():
+      graph = colocate_with.graph
+    else:
+      graph = None
 
+    key = (name, graph)
+    v = self._non_slot_dict.get(key, None)
+    if v is None:
+      with ops.colocate_with(colocate_with):
         def _variable_getter(name, shape, dtype, initializer):
           del shape, dtype  # not used, but there for compatibility
           return variable_scope.variable(
               name=name, initial_value=initializer, trainable=False)
 
-        self._beta1_power = self.add_variable(
-            name="beta1_power",
-            shape=[],
-            initializer=self._beta1,
+        initial_value = ops.convert_to_tensor(initial_value)
+        v = self.add_variable(
+            name=name,
+            shape=initial_value.get_shape(),
+            initializer=initial_value,
             getter=_variable_getter)
-        self._beta2_power = self.add_variable(
-            name="beta2_power",
-            shape=[],
-            initializer=self._beta2,
-            getter=_variable_getter)
-    # Create slots for the first and second moments.
-    for v in var_list:
-      self._zeros_slot(v, "m", self._name)
-      self._zeros_slot(v, "v", self._name)
+
+      self._non_slot_dict[key] = v
+
+    return v
 
   # TODO(allenl): Override slot variable creation (_get_or_make_slot,
   # _get_or_make_slot_with_initializer, _zeros_slot) to allow deferred
diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
index 4c3fec0672..aeca900bc8 100644
--- a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
@@ -47,8 +47,9 @@ class LazyAdamOptimizer(adam.AdamOptimizer):
   """
 
   def _apply_sparse(self, grad, var):
-    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
-    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
     lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
     beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
     beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
index a4421ecfe6..44a8890cb1 100644
--- a/tensorflow/contrib/opt/python/training/nadam_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
@@ -34,12 +34,13 @@ class NadamOptimizer(adam.AdamOptimizer):
   def _apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.apply_adam(
         var,
         m,
         v,
-        math_ops.cast(self._beta1_power, var.dtype.base_dtype),
-        math_ops.cast(self._beta2_power, var.dtype.base_dtype),
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(beta2_power, var.dtype.base_dtype),
         math_ops.cast(self._lr_t, var.dtype.base_dtype),
         math_ops.cast(self._beta1_t, var.dtype.base_dtype),
         math_ops.cast(self._beta2_t, var.dtype.base_dtype),
@@ -51,12 +52,13 @@ class NadamOptimizer(adam.AdamOptimizer):
   def _resource_apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.resource_apply_adam(
         var.handle,
         m.handle,
         v.handle,
-        math_ops.cast(self._beta1_power, grad.dtype.base_dtype),
-        math_ops.cast(self._beta2_power, grad.dtype.base_dtype),
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta2_power, grad.dtype.base_dtype),
         math_ops.cast(self._lr_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
@@ -66,8 +68,9 @@ class NadamOptimizer(adam.AdamOptimizer):
         use_nesterov=True)
 
   def _apply_sparse_shared(self, grad, var, indices, scatter_add):
-    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
-    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
     lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
     beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
     beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 266f5563e0..0c69f8bf39 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -24,7 +24,6 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
 
@@ -101,19 +100,16 @@ class AdamOptimizer(optimizer.Optimizer):
     self._beta2_t = None
     self._epsilon_t = None
 
-    # Variables to accumulate the powers of the beta parameters.
-    # Created in _create_slots when we know the variables to optimize.
-    self._beta1_power = None
-    self._beta2_power = None
-
     # Created in SparseApply if needed.
     self._updated_lr = None
 
   def _get_beta_accumulators(self):
-    return self._beta1_power, self._beta2_power
-
-  def _non_slot_variables(self):
-    return self._get_beta_accumulators()
+    if context.in_graph_mode():
+      graph = ops.get_default_graph()
+    else:
+      graph = None
+    return (self._get_non_slot_variable("beta1_power", graph=graph),
+            self._get_non_slot_variable("beta2_power", graph=graph))
 
   def _create_slots(self, var_list):
     # Create the beta1 and beta2 accumulators on the same device as the first
@@ -121,19 +117,13 @@ class AdamOptimizer(optimizer.Optimizer):
     # workers (these need to go on the same PS, otherwise some updates are
     # silently ignored).
     first_var = min(var_list, key=lambda x: x.name)
+    self._create_non_slot_variable(initial_value=self._beta1,
+                                   name="beta1_power",
+                                   colocate_with=first_var)
+    self._create_non_slot_variable(initial_value=self._beta2,
+                                   name="beta2_power",
+                                   colocate_with=first_var)
 
-    create_new = self._beta1_power is None
-    if not create_new and context.in_graph_mode():
-      create_new = (self._beta1_power.graph is not first_var.graph)
-
-    if create_new:
-      with ops.colocate_with(first_var):
-        self._beta1_power = variable_scope.variable(self._beta1,
-                                                    name="beta1_power",
-                                                    trainable=False)
-        self._beta2_power = variable_scope.variable(self._beta2,
-                                                    name="beta2_power",
-                                                    trainable=False)
     # Create slots for the first and second moments.
     for v in var_list:
       self._zeros_slot(v, "m", self._name)
@@ -148,10 +138,11 @@ class AdamOptimizer(optimizer.Optimizer):
   def _apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.apply_adam(
         var, m, v,
-        math_ops.cast(self._beta1_power, var.dtype.base_dtype),
-        math_ops.cast(self._beta2_power, var.dtype.base_dtype),
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(beta2_power, var.dtype.base_dtype),
         math_ops.cast(self._lr_t, var.dtype.base_dtype),
         math_ops.cast(self._beta1_t, var.dtype.base_dtype),
         math_ops.cast(self._beta2_t, var.dtype.base_dtype),
@@ -161,10 +152,11 @@ class AdamOptimizer(optimizer.Optimizer):
   def _resource_apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.resource_apply_adam(
         var.handle, m.handle, v.handle,
-        math_ops.cast(self._beta1_power, grad.dtype.base_dtype),
-        math_ops.cast(self._beta2_power, grad.dtype.base_dtype),
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta2_power, grad.dtype.base_dtype),
         math_ops.cast(self._lr_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
@@ -172,8 +164,9 @@ class AdamOptimizer(optimizer.Optimizer):
         grad, use_locking=self._use_locking)
 
   def _apply_sparse_shared(self, grad, var, indices, scatter_add):
-    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
-    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
     lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
     beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
     beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
@@ -217,12 +210,11 @@ class AdamOptimizer(optimizer.Optimizer):
   def _finish(self, update_ops, name_scope):
     # Update the power accumulators.
     with ops.control_dependencies(update_ops):
-      with ops.colocate_with(self._beta1_power):
-        update_beta1 = self._beta1_power.assign(
-            self._beta1_power * self._beta1_t,
-            use_locking=self._use_locking)
-        update_beta2 = self._beta2_power.assign(
-            self._beta2_power * self._beta2_t,
-            use_locking=self._use_locking)
+      beta1_power, beta2_power = self._get_beta_accumulators()
+      with ops.colocate_with(beta1_power):
+        update_beta1 = beta1_power.assign(
+            beta1_power * self._beta1_t, use_locking=self._use_locking)
+        update_beta2 = beta2_power.assign(
+            beta2_power * self._beta2_t, use_locking=self._use_locking)
     return control_flow_ops.group(*update_ops + [update_beta1, update_beta2],
                                   name=name_scope)
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index ffb66abc4c..a521f1299e 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -174,8 +174,11 @@ class AdamOptimizerTest(test.TestCase):
         opt = adam.AdamOptimizer()
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         opt_variables = opt.variables()
-        self.assertIn(opt._beta1_power, opt_variables)
-        self.assertIn(opt._beta2_power, opt_variables)
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertTrue(beta2_power is not None)
+        self.assertIn(beta1_power, opt_variables)
+        self.assertIn(beta2_power, opt_variables)
 
         with ops.Graph().as_default():
           # Shouldn't return non-slot variables from other graphs.
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 56cf4d42ee..038469b1ba 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import slot_creator
 from tensorflow.python.util import nest
@@ -299,6 +300,7 @@ class Optimizer(object):
     # Dictionary of slots.
     #  {slot_name : { variable_to_train: slot_for_the_variable, ...}, ... }
     self._slots = {}
+    self._non_slot_dict = {}
 
   def get_name(self):
     return self._name
@@ -603,17 +605,32 @@ class Optimizer(object):
     # Sort variables by name so that the return is deterministic.
     return sorted(optimizer_variables, key=lambda v: v.name)
 
+  def _create_non_slot_variable(self, initial_value, name, colocate_with):
+    """Add an extra variable, not associated with a slot."""
+    if context.in_graph_mode():
+      graph = colocate_with.graph
+    else:
+      graph = None
+
+    key = (name, graph)
+    v = self._non_slot_dict.get(key, None)
+    if v is None:
+      with ops.colocate_with(colocate_with):
+        v = variable_scope.variable(initial_value, name=name, trainable=False)
+      self._non_slot_dict[key] = v
+
+    return v
+
+  def _get_non_slot_variable(self, name, graph=None):
+    return self._non_slot_dict.get((name, graph), None)
+
   def _non_slot_variables(self):
     """Additional variables created by the `Optimizer`.
 
-    This method should be overridden by child classes which create extra
-    variables, so that `variables()` includes the `Optimizer`'s non-slot
-    variables.
-
     Returns:
       A list or tuple of variables.
     """
-    return []
+    return self._non_slot_dict.values()
 
   def _assert_valid_dtypes(self, tensors):
     """Asserts tensors are all valid types (see `_valid_dtypes`).
diff --git a/tensorflow/python/training/sync_replicas_optimizer_test.py b/tensorflow/python/training/sync_replicas_optimizer_test.py
index 297284f80c..fff17402e2 100644
--- a/tensorflow/python/training/sync_replicas_optimizer_test.py
+++ b/tensorflow/python/training/sync_replicas_optimizer_test.py
@@ -286,8 +286,9 @@ class SyncReplicasOptimizerHookTest(test.TestCase):
     global_step = variables.Variable(0, name="global_step", trainable=False)
     opt.minimize(v, global_step=global_step)
     opt_variables = opt.variables()
-    self.assertIn(opt._opt._beta1_power, opt_variables)
-    self.assertIn(opt._opt._beta2_power, opt_variables)
+    beta1_power, beta2_power = opt._opt._get_beta_accumulators()
+    self.assertIn(beta1_power, opt_variables)
+    self.assertIn(beta2_power, opt_variables)
 
 
 if __name__ == "__main__":
-- 
GitLab


From 95158cb83907d76e8df258fec23816c15ef4477e Mon Sep 17 00:00:00 2001
From: lspvic <lspvic@qq.com>
Date: Tue, 23 Jan 2018 03:19:57 +0800
Subject: [PATCH 060/966] Check axes value ranges

---
 tensorflow/python/ops/math_ops.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 88260e2687..f4f18733f8 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -2765,9 +2765,12 @@ def tensordot(a, b, axes, name=None):
     """Generates two sets of contraction axes for the two tensor arguments."""
     a_shape = a.get_shape()
     if isinstance(axes, compat.integral_types):
-      if axes < 1:
-        raise ValueError("'axes' must be at least 1.")
+      if axes < 0:
+        raise ValueError("'axes' must be at least 0.")
       if a_shape.ndims is not None:
+        if axes > a_shape.ndims:
+          raise ValueError("'axes' must not be larger than the number of "
+                           "dimensions of tensor %s." % a)
         return (list(xrange(a_shape.ndims - axes, a_shape.ndims)),
                 list(xrange(axes)))
       else:
-- 
GitLab


From 15d2baefdba2371761a93d7ff38c837ea433403f Mon Sep 17 00:00:00 2001
From: lspvic <lspvic@qq.com>
Date: Wed, 24 Jan 2018 13:43:03 +0800
Subject: [PATCH 061/966] Add tests for tensordot scalar axes

---
 .../python/kernel_tests/tensordot_op_test.py  | 54 +++++++++----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py
index f375157287..38205518b5 100644
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@@ -64,7 +64,7 @@ class TensordotTest(test_lib.TestCase):
     a = [[1, 2], [3, 4]]
     b = [[1, 2], [3, 4]]
     # Invalid static axes.
-    for axes_value in -1, 0, [1], [[1]], [[1], [0, 1]]:
+    for axes_value in -1, 3, [1], [[1]], [[1], [0, 1]]:
       with self.assertRaises(ValueError):
         math_ops.tensordot(a, b, axes_value)
 
@@ -87,7 +87,7 @@ class TensordotTest(test_lib.TestCase):
 
   # Test case for 11950
   def test_valid_axis(self):
-    for axes_value in [1, 2], [[1], [2]]:
+    for axes_value in [1, 2], [[1], [2]], [[], []], 0:
       with self.test_session() as sess:
         np_a = np.ones((3,3))
         np_b = np.array([2, 3, 1])[None, None]
@@ -102,29 +102,29 @@ class TensordotTest(test_lib.TestCase):
 
 
   def test_partial_shape_inference(self):
-    a = array_ops.placeholder(dtypes.float32)
-    b = array_ops.placeholder(dtypes.float32)
-    axes = ([1], [0])
-    output = math_ops.tensordot(a, b, axes)
-    self.assertEqual(output.get_shape().ndims, None)
-    a.set_shape([None, 2])
-    b.set_shape([2, 3])
-    output = math_ops.tensordot(a, b, axes)
-    output_shape = output.get_shape()
-    self.assertEqual(output_shape.ndims, 2)
-    output_shape = output_shape.as_list()
-    self.assertEqual(output_shape[0], None)
-    self.assertEqual(output_shape[1], 3)
-    a = array_ops.placeholder(dtypes.float32)
-    b = array_ops.placeholder(dtypes.float32)
-    a.set_shape([2, 2])
-    b.set_shape([2, None])
-    output = math_ops.tensordot(a, b, axes)
-    output_shape = output.get_shape()
-    self.assertEqual(output_shape.ndims, 2)
-    output_shape = output_shape.as_list()
-    self.assertEqual(output_shape[0], 2)
-    self.assertEqual(output_shape[1], None)
+    for axes in ([1],[0]), 1:
+      a = array_ops.placeholder(dtypes.float32)
+      b = array_ops.placeholder(dtypes.float32)
+      output = math_ops.tensordot(a, b, axes)
+      self.assertEqual(output.get_shape().ndims, None)
+      a.set_shape([None, 2])
+      b.set_shape([2, 3])
+      output = math_ops.tensordot(a, b, axes)
+      output_shape = output.get_shape()
+      self.assertEqual(output_shape.ndims, 2)
+      output_shape = output_shape.as_list()
+      self.assertEqual(output_shape[0], None)
+      self.assertEqual(output_shape[1], 3)
+      a = array_ops.placeholder(dtypes.float32)
+      b = array_ops.placeholder(dtypes.float32)
+      a.set_shape([2, 2])
+      b.set_shape([2, None])
+      output = math_ops.tensordot(a, b, axes)
+      output_shape = output.get_shape()
+      self.assertEqual(output_shape.ndims, 2)
+      output_shape = output_shape.as_list()
+      self.assertEqual(output_shape[0], 2)
+      self.assertEqual(output_shape[1], None)
 
 
 def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
@@ -191,8 +191,8 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
         low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype_)
     b_np = np.random.uniform(
         low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype_)
-    all_axes = [1]
-    if a_np.ndim > 1:
+    all_axes = [0, 1]
+    if a_np.ndim > 2:
       all_axes.append(a_np.ndim - 1)
     for axes in all_axes:
       np_ans = np.tensordot(a_np, b_np, axes=axes)
-- 
GitLab


From abce81d8203d2f111a8cb7d54aeb18bd464465c6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Jan 2018 23:17:06 -0800
Subject: [PATCH 062/966] Add Real NVP and NICE bijectors and tests.

PiperOrigin-RevId: 183044638
---
 tensorflow/contrib/distributions/BUILD        |  16 +
 .../kernel_tests/bijectors/real_nvp_test.py   | 144 +++++++++
 .../python/ops/bijectors/__init__.py          |   3 +
 .../python/ops/bijectors/real_nvp.py          | 282 ++++++++++++++++++
 4 files changed, 445 insertions(+)
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
 create mode 100644 tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py

diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 7d785c3636..7f510c4221 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -931,6 +931,22 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "real_nvp_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/real_nvp_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "permute_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
new file mode 100644
index 0000000000..46fe779741
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
@@ -0,0 +1,144 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MaskedAutoregressiveFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.contrib.distributions.python.ops import test_util
+from tensorflow.contrib.distributions.python.ops.bijectors.invert import Invert
+from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import real_nvp_default_template
+from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import RealNVP
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import transformed_distribution as transformed_distribution_lib
+from tensorflow.python.platform import test
+
+
+class RealNVPTest(test_util.VectorDistributionTestHelpers, test.TestCase):
+
+  @property
+  def _real_nvp_kwargs(self):
+    return {
+        "shift_and_log_scale_fn": real_nvp_default_template(
+            hidden_layers=[3], shift_only=False),
+        "is_constant_jacobian": False,
+    }
+
+  def testBijector(self):
+    x_ = np.arange(3 * 4 * 2).astype(np.float32).reshape(3, 4 * 2)
+    with self.test_session() as sess:
+      nvp = RealNVP(
+          num_masked=4,
+          validate_args=True,
+          **self._real_nvp_kwargs)
+      x = constant_op.constant(x_)
+      forward_x = nvp.forward(x)
+      # Use identity to invalidate cache.
+      inverse_y = nvp.inverse(array_ops.identity(forward_x))
+      fldj = nvp.forward_log_det_jacobian(x)
+      # Use identity to invalidate cache.
+      ildj = nvp.inverse_log_det_jacobian(array_ops.identity(forward_x))
+      variables.global_variables_initializer().run()
+      [
+          forward_x_,
+          inverse_y_,
+          ildj_,
+          fldj_,
+      ] = sess.run([
+          forward_x,
+          inverse_y,
+          ildj,
+          fldj,
+      ])
+      self.assertEqual("real_nvp", nvp.name)
+      self.assertAllClose(forward_x_, forward_x_, rtol=1e-6, atol=0.)
+      self.assertAllClose(x_, inverse_y_, rtol=1e-5, atol=0.)
+      self.assertAllClose(ildj_, -fldj_, rtol=1e-6, atol=0.)
+
+  def testMutuallyConsistent(self):
+    dims = 4
+    with self.test_session() as sess:
+      nvp = RealNVP(
+          num_masked=3,
+          validate_args=True,
+          **self._real_nvp_kwargs)
+      dist = transformed_distribution_lib.TransformedDistribution(
+          distribution=normal_lib.Normal(loc=0., scale=1.),
+          bijector=nvp,
+          event_shape=[dims],
+          validate_args=True)
+      self.run_test_sample_consistent_log_prob(
+          sess_run_fn=sess.run,
+          dist=dist,
+          num_samples=int(1e5),
+          radius=1.,
+          center=0.,
+          rtol=0.02)
+
+  def testInvertMutuallyConsistent(self):
+    dims = 4
+    with self.test_session() as sess:
+      nvp = Invert(RealNVP(
+          num_masked=3,
+          validate_args=True,
+          **self._real_nvp_kwargs))
+      dist = transformed_distribution_lib.TransformedDistribution(
+          distribution=normal_lib.Normal(loc=0., scale=1.),
+          bijector=nvp,
+          event_shape=[dims],
+          validate_args=True)
+      self.run_test_sample_consistent_log_prob(
+          sess_run_fn=sess.run,
+          dist=dist,
+          num_samples=int(1e5),
+          radius=1.,
+          center=0.,
+          rtol=0.02)
+
+
+class NICETest(RealNVPTest):
+
+  @property
+  def _real_nvp_kwargs(self):
+    return {
+        "shift_and_log_scale_fn": real_nvp_default_template(
+            hidden_layers=[2], shift_only=True),
+        "is_constant_jacobian": True,
+    }
+
+
+class RealNVPConstantShiftScaleTest(RealNVPTest):
+
+  @property
+  def _real_nvp_kwargs(self):
+
+    def constant_shift_log_scale_fn(x0, output_units):
+      del x0, output_units
+      shift = constant_op.constant([0.1])
+      log_scale = constant_op.constant([0.5])
+      return shift, log_scale
+
+    return {
+        "shift_and_log_scale_fn": constant_shift_log_scale_fn,
+        "is_constant_jacobian": True,
+    }
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index bc0ec7f195..93923c3f08 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -29,6 +29,7 @@
 @@MaskedAutoregressiveFlow
 @@Permute
 @@PowerTransform
+@@RealNVP
 @@Reshape
 @@Sigmoid
 @@SigmoidCentered
@@ -39,6 +40,7 @@
 
 @@masked_autoregressive_default_template
 @@masked_dense
+@@real_nvp_default_template
 """
 
 from __future__ import absolute_import
@@ -60,6 +62,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.invert import *
 from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import *
 from tensorflow.contrib.distributions.python.ops.bijectors.permute import *
 from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import *
+from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import *
 from tensorflow.contrib.distributions.python.ops.bijectors.reshape import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered import *
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
new file mode 100644
index 0000000000..2840f52e74
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
@@ -0,0 +1,282 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Real NVP bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core as layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import template as template_ops
+from tensorflow.python.ops.distributions import bijector as bijector_lib
+
+
+__all__ = [
+    "RealNVP",
+    "real_nvp_default_template"
+]
+
+
+class RealNVP(bijector_lib.Bijector):
+  """RealNVP "affine coupling layer" for vector-valued events.
+
+  Real NVP models a normalizing flow on a `D`-dimensional distribution via a
+  single `D-d`-dimensional conditional distribution [1]:
+
+  `y[d:D] = y[d:D] * math_ops.exp(log_scale_fn(y[d:D])) + shift_fn(y[d:D])`
+  `y[0:d] = x[0:d]`
+
+  The last `D-d` units are scaled and shifted based on the first `d` units only,
+  while the first `d` units are 'masked' and left unchanged. Real NVP's
+  `shift_and_log_scale_fn` computes vector-valued quantities. For
+  scale-and-shift transforms that do not depend on any masked units, i.e.
+  `d=0`, use the `tfb.Affine` bijector with learned parameters instead.
+
+  Masking is currently only supported for base distributions with
+  `event_ndims=1`. For more sophisticated masking schemes like checkerboard or
+  channel-wise masking [2], use the `tfb.Permute` bijector to re-order desired
+  masked units into the first `d` units. For base distributions with
+  `event_ndims > 1`, use the `tfb.Reshape` bijector to flatten the event shape.
+
+  Recall that the MAF bijector [2] implements a normalizing flow via an
+  autoregressive transformation. MAF and IAF have opposite computational
+  tradeoffs - MAF can train all units in parallel but must sample units
+  sequentially, while IAF must train units sequentially but can sample in
+  parallel. In contrast, Real NVP can compute both forward and inverse
+  computations in parallel. However, the lack of an autoregressive
+  transformations makes it less expressive on a per-bijector basis.
+
+  A "valid" `shift_and_log_scale_fn` must compute each `shift` (aka `loc` or
+  "mu" [2]) and `log(scale)` (aka "alpha" [2]) such that each are broadcastable
+  with the arguments to `forward` and `inverse`, i.e., such that the
+  calculations in `forward`, `inverse` [below] are possible. For convenience,
+  `real_nvp_default_nvp` is offered as a possible `shift_and_log_scale_fn`
+  function.
+
+  NICE [3] is a special case of the Real NVP bijector which discards the scale
+  transformation, resulting in a constant-time inverse-log-determinant-Jacobian.
+  To use a NICE bijector instead of Real NVP, `shift_and_log_scale_fn` should
+  return `(shift, None)`, and `is_constant_jacobian` should be set to `True` in
+  the `RealNVP` constructor. Calling `real_nvp_default_template` with
+  `shift_only=True` returns one such NICE-compatible `shift_and_log_scale_fn`.
+
+  Caching: the scalar input depth `D` of the base distribution is not known at
+  construction time. The first call to any of `forward(x)`, `inverse(x)`,
+  `inverse_log_det_jacobian(x)`, or `forward_log_det_jacobian(x)` memoizes
+  `D`, which is re-used in subsequent calls. This shape must be known prior to
+  graph execution (which is the case if using tf.layers).
+
+  #### Example Use
+
+  ```python
+  tfd = tf.contrib.distributions
+  tfb = tfd.bijectors
+
+  # A common choice for a normalizing flow is to use a Gaussian for the base
+  # distribution. (However, any continuous distribution would work.) E.g.,
+  nvp = tfd.TransformedDistribution(
+      distribution=tfd.MultivariateNormalDiag(loc=[0., 0., 0.])),
+      bijector=tfb.RealNVP(
+          num_masked=2,
+          shift_and_log_scale_fn=tfb.real_nvp_default_template(
+              hidden_layers=[512, 512])))
+
+  x = nvp.sample()
+  nvp.log_prob(x)
+  nvp.log_prob(0.)
+  ```
+
+  For more examples, see [4].
+
+  [1]: "Density Estimation using Real NVP."
+       Laurent Dinh, Jascha Sohl-Dickstein, Samy Bengio. ICLR. 2017.
+       https://arxiv.org/abs/1605.08803
+
+  [2]: "Masked Autoregressive Flow for Density Estimation."
+       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
+       https://arxiv.org/abs/1705.07057
+
+  [3]: "NICE: Non-linear Independent Components Estimation."
+       Laurent Dinh, David Krueger, Yoshua Bengio. ICLR. 2015.
+       https://arxiv.org/abs/1410.8516
+
+  [4]: "Normalizing Flows Tutorial, Part 2: Modern Normalizing Flows."
+       Eric Jang. Blog post. January 2018.
+       http://blog.evjang.com/2018/01/nf2.html
+  """
+
+  def __init__(self,
+               num_masked,
+               shift_and_log_scale_fn,
+               is_constant_jacobian=False,
+               validate_args=False,
+               name=None):
+    """Creates the Real NVP or NICE bijector.
+
+    Args:
+      num_masked: Python `int` indicating that the first `d` units of the event
+        should be masked. Must be in the closed interval `[1, D-1]`, where `D`
+        is the event size of the base distribution.
+      shift_and_log_scale_fn: Python `callable` which computes `shift` and
+        `log_scale` from both the forward domain (`x`) and the inverse domain
+        (`y`). Calculation must respect the "autoregressive property" (see class
+        docstring). Suggested default
+        `masked_autoregressive_default_template(hidden_layers=...)`.
+        Typically the function contains `tf.Variables` and is wrapped using
+        `tf.make_template`. Returning `None` for either (both) `shift`,
+        `log_scale` is equivalent to (but more efficient than) returning zero.
+      is_constant_jacobian: Python `bool`. Default: `False`. When `True` the
+        implementation assumes `log_scale` does not depend on the forward domain
+        (`x`) or inverse domain (`y`) values. (No validation is made;
+        `is_constant_jacobian=False` is always safe but possibly computationally
+        inefficient.)
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+
+    Raises:
+      ValueError: If num_masked < 1.
+    """
+    name = name or "real_nvp"
+    if num_masked <= 0:
+      raise ValueError("num_masked must be a positive integer.")
+    self._num_masked = num_masked
+    # At construction time, we don't know input_depth.
+    self._input_depth = None
+    self._shift_and_log_scale_fn = shift_and_log_scale_fn
+    super(RealNVP, self).__init__(
+        event_ndims=1,
+        is_constant_jacobian=is_constant_jacobian,
+        validate_args=validate_args,
+        name=name)
+
+  def _cache_input_depth(self, x):
+    if self._input_depth is None:
+      self._input_depth = x.shape.with_rank_at_least(1)[-1].value
+      if self._input_depth is None:
+        raise NotImplementedError(
+            "Rightmost dimension must be known prior to graph execution.")
+      if self._num_masked >= self._input_depth:
+        raise ValueError(
+            "Number of masked units must be smaller than the event size.")
+
+  def _forward(self, x):
+    self._cache_input_depth(x)
+    # Performs scale and shift.
+    x0, x1 = x[:, :self._num_masked], x[:, self._num_masked:]
+    shift, log_scale = self._shift_and_log_scale_fn(
+        x0, self._input_depth - self._num_masked)
+    y1 = x1
+    if log_scale is not None:
+      y1 *= math_ops.exp(log_scale)
+    if shift is not None:
+      y1 += shift
+    y = array_ops.concat([x0, y1], axis=-1)
+    return y
+
+  def _inverse(self, y):
+    self._cache_input_depth(y)
+    # Performs un-shift and un-scale.
+    y0, y1 = y[:, :self._num_masked], y[:, self._num_masked:]
+    shift, log_scale = self._shift_and_log_scale_fn(
+        y0, self._input_depth - self._num_masked)
+    x1 = y1
+    if shift is not None:
+      x1 -= shift
+    if log_scale is not None:
+      x1 *= math_ops.exp(-log_scale)
+    x = array_ops.concat([y0, x1], axis=-1)
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    self._cache_input_depth(y)
+    y0 = y[:, :self._num_masked]
+    _, log_scale = self._shift_and_log_scale_fn(
+        y0, self._input_depth - self._num_masked)
+    if log_scale is None:
+      return constant_op.constant(0., dtype=y.dtype, name="ildj")
+    return -math_ops.reduce_sum(log_scale, axis=-1)
+
+  def _forward_log_det_jacobian(self, x):
+    self._cache_input_depth(x)
+    x0 = x[:, :self._num_masked]
+    _, log_scale = self._shift_and_log_scale_fn(
+        x0, self._input_depth - self._num_masked)
+    if log_scale is None:
+      return constant_op.constant(0., dtype=x.dtype, name="ildj")
+    return math_ops.reduce_sum(log_scale, axis=-1)
+
+
+def real_nvp_default_template(
+    hidden_layers,
+    shift_only=False,
+    activation=nn_ops.relu,
+    name=None,
+    *args,
+    **kwargs):
+  """Build a scale-and-shift function using a multi-layer neural network.
+
+  This will be wrapped in a make_template to ensure the variables are only
+  created once. It takes the `d`-dimensional input x[0:d] and returns the `D-d`
+  dimensional outputs `loc` ("mu") and `log_scale` ("alpha").
+
+  Arguments:
+    hidden_layers: Python `list`-like of non-negative integer, scalars
+      indicating the number of units in each hidden layer. Default: `[512, 512].
+    shift_only: Python `bool` indicating if only the `shift` term shall be
+      computed (i.e. NICE bijector). Default: `False`.
+    activation: Activation function (callable). Explicitly setting to `None`
+      implies a linear activation.
+    name: A name for ops managed by this function. Default:
+      "real_nvp_default_template".
+    *args: `tf.layers.dense` arguments.
+    **kwargs: `tf.layers.dense` keyword arguments.
+
+  Returns:
+    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
+    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
+
+  Raises:
+    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
+      graph execution.
+  """
+
+  with ops.name_scope(name, "real_nvp_default_template"):
+    def _fn(x, output_units):
+      """Fully connected MLP parameterized via `real_nvp_template`."""
+      for units in hidden_layers:
+        x = layers.dense(
+            inputs=x,
+            units=units,
+            activation=activation,
+            *args,
+            **kwargs)
+      x = layers.dense(
+          inputs=x,
+          units=(1 if shift_only else 2) * output_units,
+          activation=None,
+          *args,
+          **kwargs)
+      if shift_only:
+        return x, None
+      shift, log_scale = array_ops.split(x, 2, axis=-1)
+      return shift, log_scale
+    return template_ops.make_template(
+        "real_nvp_default_template", _fn)
-- 
GitLab


From 03975654063aa72bb2aa93b28aff544b22c677f9 Mon Sep 17 00:00:00 2001
From: Yifei Feng <fengyifei2026@gmail.com>
Date: Tue, 23 Jan 2018 23:43:35 -0800
Subject: [PATCH 063/966] Make graph transform tool accessible via command line
 for pip install. (#15967)

* Make graph transform tool accessible via command line for pip install.
RELNOTE: Make graph transform tool available from command line as
`transform_graph` for pip package.
Fix  #13287.

* Fix buildifier format error.

* Make wrapper work with bazel.

* Update bazel path to data dir.
---
 tensorflow/tools/graph_transforms/BUILD       | 12 ++++++
 .../graph_transforms_wrapper.py               | 39 +++++++++++++++++++
 tensorflow/tools/pip_package/BUILD            |  2 +
 .../tools/pip_package/build_pip_package.sh    |  4 +-
 tensorflow/tools/pip_package/setup.py         |  1 +
 5 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/tools/graph_transforms/graph_transforms_wrapper.py

diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index b5465b7fb3..31c4349431 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -317,6 +317,18 @@ tf_py_test(
     main = "python/transform_graph_test.py",
 )
 
+py_binary(
+    name = "graph_transforms_wrapper",
+    srcs = ["graph_transforms_wrapper.py"],
+    srcs_version = "PY2AND3",
+    data = [
+        ":transform_graph",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/tools/graph_transforms/graph_transforms_wrapper.py b/tensorflow/tools/graph_transforms/graph_transforms_wrapper.py
new file mode 100644
index 0000000000..e12feddba2
--- /dev/null
+++ b/tensorflow/tools/graph_transforms/graph_transforms_wrapper.py
@@ -0,0 +1,39 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Wrapper for runninmg graph_transform binary embedded in pip site-package.
+NOTE: this mainly exists since PIP setup.py cannot install binaries to bin/.
+It can only install Python "console-scripts." This will work as a console
+script. See tools/pip_package/setup.py (search for CONSOLE_SCRIPTS).
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import tensorflow as tf
+
+
+def main():
+  # Pip installs the binary in aux-bin off of main site-package install.
+  # Just find it and exec, passing all arguments in the process.
+  pip_binary = os.path.join(tf.__path__[0], 'aux-bin/transform_graph')
+  bazel_binary = ('bazel-bin/tensorflow/tools/graph_transforms/'
+                  'graph_transforms_wrapper.runfiles/org_tensorflow/'
+                  'tensorflow/tools/graph_transforms/transform_graph')
+  binary = bazel_binary if os.path.isfile(bazel_binary) else pip_binary
+  os.execvp(binary, sys.argv)
+
+main()
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index c789e2ba0c..d61ddb889c 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -179,6 +179,8 @@ sh_binary(
             "//tensorflow/python/tools:tools_pip",
             "//tensorflow/python:test_ops",
             "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
+            "//tensorflow/tools/graph_transforms:graph_transforms_wrapper",
+            "//tensorflow/tools/graph_transforms:transform_graph",
         ],
     }) + if_mkl(["//third_party/mkl:intel_binary_blob"]),
 )
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index ca8c272a08..32b8c54a56 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -137,9 +137,11 @@ function main() {
         fi
       fi
     fi
-    # Install toco as a binary in aux-bin.
     mkdir "${TMPDIR}/tensorflow/aux-bin"
+    # Install toco as a binary in aux-bin.
     cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
+    # Install graph transform tool as a bianry in aux-bin
+    cp bazel-bin/tensorflow/tools/graph_transforms/transform_graph ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 62df6453fb..0034fe447c 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -74,6 +74,7 @@ CONSOLE_SCRIPTS = [
     'freeze_graph = tensorflow.python.tools.freeze_graph:main',
     'toco_from_protos = tensorflow.contrib.lite.toco.python.toco_from_protos:main',
     'toco = tensorflow.contrib.lite.toco.python.toco_wrapper:main',
+    'transform_graph = tensorflow.tools.graph_transforms.graph_transforms_wrapper:main',
     'saved_model_cli = tensorflow.python.tools.saved_model_cli:main',
     # We need to keep the TensorBoard command, even though the console script
     # is now declared by the tensorboard pip package. If we remove the
-- 
GitLab


From 7c687707ee6e5aecc045be4c80a732585c7b045b Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Wed, 24 Jan 2018 16:45:59 +0900
Subject: [PATCH 064/966] Fix typo (#16340)

* fix typo

* fix typo
---
 tensorflow/python/estimator/canned/dnn_testing_utils.py    | 2 +-
 tensorflow/python/estimator/canned/linear_testing_utils.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index 2bdec69303..706575985f 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -877,7 +877,7 @@ class BaseDNNWarmStartingTest(object):
 
     # Create a second DNNClassifier, warm-started from the first.  Use a
     # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).  Use a a new FeatureColumn with a
+    # accumulator values that change).  Use a new FeatureColumn with a
     # different vocabulary for occupation.
     new_vocab_list = ['doctor', 'consultant', 'engineer']
     new_vocab_file = os.path.join(self._ckpt_and_vocab_dir,
diff --git a/tensorflow/python/estimator/canned/linear_testing_utils.py b/tensorflow/python/estimator/canned/linear_testing_utils.py
index cccb9af4b2..3e9183cf1b 100644
--- a/tensorflow/python/estimator/canned/linear_testing_utils.py
+++ b/tensorflow/python/estimator/canned/linear_testing_utils.py
@@ -2003,7 +2003,7 @@ class BaseLinearWarmStartingTest(object):
 
     # Create a second LinearClassifier, warm-started from the first.  Use a
     # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).  Use a a new FeatureColumn with a
+    # accumulator values that change).  Use a new FeatureColumn with a
     # different vocabulary for occupation.
     new_vocab_list = ['doctor', 'consultant', 'engineer']
     new_vocab_file = os.path.join(self._ckpt_and_vocab_dir,
-- 
GitLab


From 2e5ff39e56fcc63475dd4ecae18e3724695bf0a3 Mon Sep 17 00:00:00 2001
From: Taehoon Lee <me@taehoonlee.com>
Date: Wed, 24 Jan 2018 16:48:23 +0900
Subject: [PATCH 065/966] Fix typos (#16349)

---
 tensorflow/cc/tools/freeze_saved_model_test.cc                  | 2 +-
 tensorflow/compiler/xla/tests/test_utils.cc                     | 2 +-
 .../contrib/opt/python/training/model_average_optimizer_test.py | 2 +-
 tensorflow/contrib/py2tf/api.py                                 | 2 +-
 .../contrib/receptive_field/python/util/graph_compute_order.py  | 2 +-
 tensorflow/contrib/verbs/README.md                              | 2 +-
 tensorflow/contrib/verbs/rdma.h                                 | 2 +-
 tensorflow/core/kernels/mkl_softmax_op.cc                       | 2 +-
 tensorflow/python/layers/convolutional.py                       | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tensorflow/cc/tools/freeze_saved_model_test.cc b/tensorflow/cc/tools/freeze_saved_model_test.cc
index 57244a4f0a..52a81a5028 100644
--- a/tensorflow/cc/tools/freeze_saved_model_test.cc
+++ b/tensorflow/cc/tools/freeze_saved_model_test.cc
@@ -71,7 +71,7 @@ class FreezeTest : public ::testing::Test {
     return Status::OK();
   }
 
-  // Adds `graph_def` to `saved_model_bundle` and intializes a session with
+  // Adds `graph_def` to `saved_model_bundle` and initializes a session with
   // `init_node`.
   Status AddGraphDefToSavedModelBundle(const GraphDef& graph_def,
                                        const string& init_node,
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 8b10aef5b8..0e90a32358 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -34,7 +34,7 @@ void PopulateWithRandomFloatingPointData(Literal* literal) {
   TF_CHECK_OK(literal->Populate<FloatT>(
       [&](tensorflow::gtl::ArraySlice<int64> indices) {
         // Generate a random uniforma number from -0.0625 and 0.0625 and bias it
-        // with  a position dependent nubmer with mean 0.037109375. These number
+        // with  a position dependent number with mean 0.037109375. These number
         // should allow for long chains of accumulation without being too close
         // to zero or to large to accumulate all numbers accurately.
         return (generator(engine) - 1.0625) +
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
index a73aa772bb..a264946057 100644
--- a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
@@ -148,7 +148,7 @@ class ModelAverageOptimizerTest(test.TestCase):
     self.assertAllEqual(1.0, sessions[0].run(global_var_1))
     self.assertAllEqual(0, sessions[0].run(global_step))
 
-    # iteration 2, global varibale update
+    # iteration 2, global variable update
     thread_0 = self.checkedThread(
       target=self._run, args=(train_ops[0], sessions[0]))
     thread_1 = self.checkedThread(
diff --git a/tensorflow/contrib/py2tf/api.py b/tensorflow/contrib/py2tf/api.py
index 9a2b70c53c..0fd833e29c 100644
--- a/tensorflow/contrib/py2tf/api.py
+++ b/tensorflow/contrib/py2tf/api.py
@@ -86,7 +86,7 @@ def convert_inline(f, *args, **kwargs):
 def convert(recursive=False, arg_value_hints=None):
   """Decorator that compiles a function to graph mode.
 
-  The decorator is dynamic - invoking compilation whenever the decorated fuction
+  The decorator is dynamic - invoking compilation whenever the decorated function
   is called. This means the parameter values are known at compilation.
 
   Args:
diff --git a/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py b/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py
index b2360fec6c..0388079f20 100644
--- a/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py
+++ b/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py
@@ -61,7 +61,7 @@ def _compute_output_resolution(input_spatial_resolution, kernel_size, stride,
     stride: Stride (int).
     total_padding: Total padding to be applied (int).
   Returns:
-    output_resolution: Ouput dimension (int) or None.
+    output_resolution: Output dimension (int) or None.
   """
   if (input_spatial_resolution is None) or (kernel_size is None) or (
       stride is None) or (total_padding is None):
diff --git a/tensorflow/contrib/verbs/README.md b/tensorflow/contrib/verbs/README.md
index 1b99f4ce4f..647e68c695 100644
--- a/tensorflow/contrib/verbs/README.md
+++ b/tensorflow/contrib/verbs/README.md
@@ -115,7 +115,7 @@ When the receiver receives the RDMA write, it will locate the relevant **RdmaTen
 		* Reallocate the result tensor (and proxy tensor if required).
 		* Re-send the request to the remote side.
 	* **RecvTensorContent()**    - Receive tensor content from the remote side (RDMA write was completed).
-		* Decode proto if required and/or move to GPU if the content was not written to it directly (GPU direct is not avaliable).
+		* Decode proto if required and/or move to GPU if the content was not written to it directly (GPU direct is not available).
 		* Invoke the done callback.
 * **class RdmaTensorResponse**   - Holds and manages information for a single tensor response throughout the entire send cycle. API:
 	* **Start()**                - Start the response sequence. 
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index 68b3d59f56..d57c5138b1 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -269,7 +269,7 @@ class RdmaTensorRequest {
   // Receive tensor content (RDMA write was completed).
   //
   // Decode proto if required and/or move to GPU if the content was not
-  // written to it directly (GPU direct is not avaliable). Afterwards,
+  // written to it directly (GPU direct is not available). Afterwards,
   // invoke Done().
   void RecvTensorContent();
 
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index 896d562933..e976632514 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -105,7 +105,7 @@ class MklSoftmaxOp : public OpKernel {
       // Softmax MklDnn output layout is same as input layout.
       auto dst_pd = src.GetUsrMemPrimDesc();
 
-      // if input is MKL shape, ouput is also MKL shape.
+      // if input is MKL shape, output is also MKL shape.
       // if input is TF shape, output is also TF shape
       if (src_mkl_shape.IsMklTensor()) {
         output_mkl_shape.SetMklTensor(true);
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 79c421f4c9..d5147b237b 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -1094,7 +1094,7 @@ class SeparableConv1D(_SeparableConv):
       strides = (1, 1, 1) + self.strides
       spatial_start_dim = 2
 
-    # Explictly broadcast inputs and kernels to 4D.
+    # Explicitly broadcast inputs and kernels to 4D.
     # TODO(fchollet): refactor when a native separable_conv1d op is available.
     inputs = array_ops.expand_dims(inputs, spatial_start_dim)
     depthwise_kernel = array_ops.expand_dims(self.depthwise_kernel, 0)
-- 
GitLab


From 41a3bfe1e970b420ac2454020487cafa5a7cd0b6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 01:17:31 -0800
Subject: [PATCH 066/966] Use RunConfig.master to set up the Session for
 running prediction in Estimator.predict()

PiperOrigin-RevId: 183053484
---
 tensorflow/python/estimator/estimator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 90eecc1fda..b4c6b2747e 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -453,6 +453,7 @@ class Estimator(object):
       with training.MonitoredSession(
           session_creator=training.ChiefSessionCreator(
               checkpoint_filename_with_path=checkpoint_path,
+              master=self._config.master,
               scaffold=estimator_spec.scaffold,
               config=self._session_config),
           hooks=input_hooks + hooks) as mon_sess:
-- 
GitLab


From 6ce118c193561e500fa5ed3e4b3683c4d447ff76 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Wed, 24 Jan 2018 21:24:54 +0900
Subject: [PATCH 067/966] fix typo

---
 tensorflow/core/profiler/internal/tfprof_stats.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h
index d78abda588..01b17e2679 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.h
+++ b/tensorflow/core/profiler/internal/tfprof_stats.h
@@ -83,7 +83,7 @@ class TFStats {
   const MultiGraphNodeProto& ShowMultiGraphNode(const string& cmd,
                                                 const Options& opts) const;
 
-  // A a (partial) graph to existing graph.
+  // Add a (partial) graph to existing graph.
   void AddGraph(std::unique_ptr<GraphDef> graph);
 
   // Add a step of run time meta data.
@@ -118,7 +118,7 @@ class TFStats {
   MultiGraphNodeProto empty_multi_graph_node_;
 
   std::map<int64, string> id_to_string_;
-  // Graph nodes covered by RunMetdata, that is traced with run time stats.
+  // Graph nodes covered by RunMetadata, that is traced with run time stats.
   std::set<int64> covered_nodes_;
 };
 
-- 
GitLab


From 61d12729f7607caad8ddae30f4727042a353fb6e Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Wed, 24 Jan 2018 06:21:09 -0800
Subject: [PATCH 068/966] [TPU] Raise error for unimplemented TPU op.

Raise an error as soon as detected that an unsupported TPU op is added. Update the list of unimplemented ops.

PiperOrigin-RevId: 183076718
---
 tensorflow/compiler/tf2xla/xla_op_registry.cc |   2 +
 tensorflow/contrib/tpu/python/tpu/tpu.py      | 511 +++++++++++++++++-
 2 files changed, 491 insertions(+), 22 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 0dde6a986c..bbe808595d 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -255,6 +255,8 @@ void XlaOpRegistry::RegisterCompilationKernels() {
 std::vector<const KernelDef*> XlaOpRegistry::DeviceKernels(
     const string& compilation_device_name,
     bool include_compilation_only_kernels) {
+  // Ensure compilation kernels registered.
+  RegisterCompilationKernels();
   std::vector<const KernelDef*> kernels;
   XlaOpRegistry& registry = Instance();
   mutex_lock lock(registry.mutex_);
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 8fec379aad..d552f77f50 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -37,21 +37,479 @@ from tensorflow.python.util import compat
 # that's introduced outside of the infeed.
 _BLACKLISTED_OPS = set([
     "Placeholder",
+    "PlaceholderV2",
+    "PlaceholderWithDefault",
+    "ScalarSummary",
+])
+
+# Operations that are deprecated and won't be implemented.
+_DEPRECATED_OPS = set([
+    "AdjustContrast",
+    "AudioSummary",
+    "BatchCholesky",
+    "BatchCholeskyGrad",
+    "BatchFFT",
+    "BatchFFT2D",
+    "BatchFFT3D",
+    "BatchIFFT",
+    "BatchIFFT2D",
+    "BatchIFFT3D",
+    "BatchMatrixBandPart",
+    "BatchMatrixDeterminant",
+    "BatchMatrixDiag",
+    "BatchMatrixDiagPart",
+    "BatchMatrixInverse",
+    "BatchMatrixSetDiag",
+    "BatchMatrixSolve",
+    "BatchMatrixSolveLs",
+    "BatchMatrixTriangularSolve",
+    "BatchNormWithGlobalNormalization",
+    "BatchNormWithGlobalNormalizationGrad",
+    "BatchSelfAdjointEig",
+    "BatchSelfAdjointEigV2",
+    "BatchSvd",
+    "Conv3DBackpropFilter",
+    "Conv3DBackpropInput",
+    "FixedLengthRecordReader",
+    "IdentityReader",
+    "NegTrain",
+    "PlaceholderV2",
+    "QuantizeAndDequantize",
+    "RandomCrop",
+    "RandomPoisson",
+    "SelfAdjointEig",
+    "Skipgram",
+    "TFRecordReader",
+    "TensorArray",
+    "TensorArrayClose",
+    "TensorArrayCloseV2",
+    "TensorArrayConcat",
+    "TensorArrayConcatV2",
+    "TensorArrayGather",
+    "TensorArrayGatherV2",
+    "TensorArrayGrad",
+    "TensorArrayGradV2",
+    "TensorArrayPack",
+    "TensorArrayRead",
+    "TensorArrayReadV2",
+    "TensorArrayScatter",
+    "TensorArrayScatterV2",
+    "TensorArraySize",
+    "TensorArraySizeV2",
+    "TensorArraySplit",
+    "TensorArraySplitV2",
+    "TensorArrayUnpack",
+    "TensorArrayV2",
+    "TensorArrayWrite",
+    "TensorArrayWriteV2",
+    "TextLineReader",
+    "TileGrad",
+    "TopK",
+])
+
+# Operations that don't make it to the TPU but are added to the graph.
+_TRANSFORMED_OPS = set([
+    "Enter",
+    "Exit",
+    "LoopCond",
+    "Merge",
+    "NextIteration",
+    "Switch",
+    "TPUReplicateMetadata",
 ])
 
 # These operations will currently fail to compile, but we should be able to
 # support them eventually via CPU offload or extending our operation set.
 _NOT_IMPLEMENTED_OPS = set([
-    "AudioSummary",
+    "Abort",
+    "AccumulateNV2",
+    "Acos",
+    "AddManySparseToTensorsMap",
+    "AddSparseToTensorsMap",
+    "AddV2",
+    "AllCandidateSampler",
+    "Asin",
+    "AsString",
+    "Atan",
+    "AudioSpectrogram",
     "AudioSummaryV2",
+    "BatchDataset",
+    "Betainc",
+    "Bincount",
+    "Bitcast",
+    "BitwiseXor",
+    "Bucketize",
+    "BytesProducedStatsDataset",
+    "CacheDataset",
+    "CholeskyGrad",
+    "CompareAndBitpack",
+    "ComputeAccidentalHits",
+    "ConcatenateDataset",
+    "ConjugateTranspose",
+    "Copy",
+    "CopyHost",
+    "CriticalSectionOp",
+    "CropAndResize",
+    "CropAndResizeGradBoxes",
+    "CropAndResizeGradImage",
+    "CTCBeamSearchDecoder",
+    "CTCGreedyDecoder",
+    "CTCLoss",
+    "Cumprod",
+    "DataFormatDimMap",
+    "DataFormatVecPermute",
+    "DatasetToSingleElement",
+    "DebugGradientIdentity",
+    "DebugIdentity",
+    "DebugNanCount",
+    "DebugNumericSummary",
+    "DecodeAndCropJpeg",
+    "DecodeBase64",
+    "DecodeBmp",
+    "DecodeCompressed",
+    "DecodeCSV",
+    "DecodeGif",
+    "DecodeJpeg",
+    "DecodeJSONExample",
+    "DecodePng",
+    "DecodeRaw",
+    "DecodeWav",
+    "DeleteSessionTensor",
+    "DenseToDenseSetOperation",
+    "DenseToSparseBatchDataset",
+    "DenseToSparseSetOperation",
+    "Dequantize",
+    "DeserializeIterator",
+    "DeserializeManySparse",
+    "DeserializeSparse",
+    "DestroyResourceOp",
+    "Digamma",
+    "Dilation2D",
+    "Dilation2DBackpropFilter",
+    "Dilation2DBackpropInput",
+    "DrawBoundingBoxes",
+    "DynamicPartition",
+    "EagerPyFunc",
+    "EditDistance",
+    "EmptyTensorList",
+    "EncodeBase64",
+    "EncodeJpeg",
+    "EncodePng",
+    "EncodeWav",
+    "Erf",
+    "Erfc",
+    "ExecuteInCriticalSection",
+    "ExtractGlimpse",
+    "ExtractImagePatches",
+    "ExtractJpegShape",
+    "Fact",
+    "FakeQuantWithMinMaxArgs",
+    "FakeQuantWithMinMaxArgsGradient",
+    "FakeQuantWithMinMaxVars",
+    "FakeQuantWithMinMaxVarsGradient",
+    "FakeQuantWithMinMaxVarsPerChannel",
+    "FakeQuantWithMinMaxVarsPerChannelGradient",
+    "FIFOQueueV2",
+    "FilterDataset",
+    "FixedLengthRecordDataset",
+    "FixedLengthRecordReaderV2",
+    "FixedUnigramCandidateSampler",
+    "FlatMapDataset",
+    "FractionalAvgPool",
+    "FractionalAvgPoolGrad",
+    "FractionalMaxPool",
+    "FractionalMaxPoolGrad",
+    "FusedPadConv2D",
+    "FusedResizeAndPadConv2D",
+    "GatherNd",
+    "GenerateVocabRemapping",
+    "GetSessionHandle",
+    "GetSessionHandleV2",
+    "GetSessionTensor",
+    "GroupByWindowDataset",
+    "GuaranteeConst",
+    "HashTableV2",
+    "HistogramFixedWidth",
     "HistogramSummary",
+    "IdentityReaderV2",
+    "Igamma",
+    "Igammac",
+    "IgnoreErrorsDataset",
     "ImageSummary",
+    "ImmutableConst",
+    "InitializeTableFromTextFileV2",
+    "InitializeTableV2",
+    "InterleaveDataset",
+    "InTopK",
+    "InTopKV2",
+    "InvGrad",
+    "Iterator",
+    "IteratorFromStringHandle",
+    "IteratorGetNext",
+    "IteratorSetStatsAggregator",
+    "IteratorToStringHandle",
+    "LatencyStatsDataset",
+    "LearnedUnigramCandidateSampler",
+    "Lgamma",
+    "ListDiff",
+    "LoadAndRemapMatrix",
+    "LogMatrixDeterminant",
+    "LogUniformCandidateSampler",
+    "LookupTableExportV2",
+    "LookupTableFindV2",
+    "LookupTableImportV2",
+    "LookupTableInsertV2",
+    "LookupTableSizeV2",
+    "MakeIterator",
+    "MapAndBatchDataset",
+    "MapClear",
+    "MapDataset",
+    "MapIncompleteSize",
+    "MapPeek",
+    "MapSize",
+    "MapStage",
+    "MapUnstage",
+    "MapUnstageNoKey",
+    "MatchingFiles",
+    "MatrixBandPart",
+    "MatrixDeterminant",
+    "MatrixExponential",
+    "MatrixInverse",
+    "MatrixSetDiag",
+    "MatrixSolve",
+    "MatrixSolveLs",
+    "MatrixTriangularSolve",
+    "MaxPool3DGradGrad",
+    "MaxPoolGradGrad",
+    "MaxPoolGradGradV2",
+    "MaxPoolGradGradWithArgmax",
+    "MaxPoolGradV2",
+    "MaxPoolGradWithArgmax",
+    "MaxPoolV2",
+    "MaxPoolWithArgmax",
     "MergeSummary",
+    "MergeV2Checkpoints",
+    "Mfcc",
+    "MirrorPadGrad",
+    "MutableDenseHashTableV2",
+    "MutableHashTableOfTensorsV2",
+    "MutableHashTableV2",
+    "NonMaxSuppression",
+    "NonMaxSuppressionV2",
+    "NthElement",
+    "OneShotIterator",
+    "OrderedMapClear",
+    "OrderedMapIncompleteSize",
+    "OrderedMapPeek",
+    "OrderedMapSize",
+    "OrderedMapStage",
+    "OrderedMapUnstage",
+    "OrderedMapUnstageNoKey",
+    "PaddedBatchDataset",
+    "PaddingFIFOQueueV2",
+    "ParallelConcat",
+    "ParallelInterleaveDataset",
+    "ParallelMapDataset",
+    "ParameterizedTruncatedNormal",
+    "ParseExample",
+    "ParseSingleExample",
+    "ParseSingleSequenceExample",
+    "ParseTensor",
+    "Placeholder",
+    "PlaceholderWithDefault",
+    "Polygamma",
+    "PopulationCount",
+    "PrefetchDataset",
     "Print",
+    "PriorityQueueV2",
+    "PyFunc",
+    "PyFuncStateless",
+    "Qr",
+    "QuantizeAndDequantizeV3",
+    "QuantizedAdd",
+    "QuantizedAvgPool",
+    "QuantizedBatchNormWithGlobalNormalization",
+    "QuantizedBiasAdd",
+    "QuantizedConcat",
+    "QuantizedConv2D",
+    "QuantizedInstanceNorm",
+    "QuantizedMatMul",
+    "QuantizedMaxPool",
+    "QuantizedMul",
+    "QuantizeDownAndShrinkRange",
+    "QuantizedRelu",
+    "QuantizedRelu6",
+    "QuantizedReluX",
+    "QuantizedReshape",
+    "QuantizedResizeBilinear",
+    "QuantizeV2",
+    "QueueCloseV2",
+    "QueueDequeueManyV2",
+    "QueueDequeueUpToV2",
+    "QueueDequeueV2",
+    "QueueEnqueueManyV2",
+    "QueueEnqueueV2",
+    "QueueIsClosedV2",
+    "QueueSizeV2",
+    "RandomDataset",
+    "RandomGamma",
+    "RandomPoissonV2",
+    "RandomShuffle",
+    "RandomShuffleQueueV2",
+    "RangeDataset",
+    "ReaderNumRecordsProducedV2",
+    "ReaderNumWorkUnitsCompletedV2",
+    "ReaderReadUpToV2",
+    "ReaderReadV2",
+    "ReaderResetV2",
+    "ReaderRestoreStateV2",
+    "ReaderSerializeStateV2",
+    "ReadFile",
+    "RecordInput",
+    "ReduceJoin",
+    "RemoteCall",
+    "RemoteFusedGraphExecute",
+    "RepeatDataset",
+    "RequantizationRange",
+    "Requantize",
+    "ResizeArea",
+    "ResizeBicubic",
+    "ResizeBicubicGrad",
+    "ResizeNearestNeighbor",
+    "ResizeNearestNeighborGrad",
+    "ResourceApplyAdadelta",
+    "ResourceApplyAdagradDA",
+    "ResourceApplyAddSign",
+    "ResourceApplyCenteredRMSProp",
+    "ResourceApplyPowerSign",
+    "ResourceApplyProximalAdagrad",
+    "ResourceApplyProximalGradientDescent",
+    "ResourceCountUpTo",
+    "ResourceScatterAdd",
+    "ResourceScatterNdUpdate",
+    "ResourceScatterUpdate",
+    "ResourceSparseApplyAdadelta",
+    "ResourceSparseApplyAdagrad",
+    "ResourceSparseApplyAdagradDA",
+    "ResourceSparseApplyCenteredRMSProp",
+    "ResourceSparseApplyFtrl",
+    "ResourceSparseApplyFtrlV2",
+    "ResourceSparseApplyMomentum",
+    "ResourceSparseApplyProximalAdagrad",
+    "ResourceSparseApplyProximalGradientDescent",
+    "ResourceSparseApplyRMSProp",
+    "Restore",
+    "RestoreSlice",
+    "RestoreV2",
+    "ReverseSequence",
+    "SampleDistortedBoundingBox",
+    "SampleDistortedBoundingBoxV2",
+    "Save",
+    "SaveSlices",
+    "SaveV2",
     "ScalarSummary",
+    "ScanDataset",
+    "ScatterNd",
+    "ScatterNdNonAliasingAdd",
+    "SdcaFprint",
+    "SdcaOptimizer",
+    "SegmentMax",
+    "SegmentMean",
+    "SegmentMin",
+    "SegmentProd",
+    "SegmentSum",
+    "SelfAdjointEigV2",
+    "SerializeIterator",
+    "SerializeManySparse",
+    "SerializeSparse",
+    "SerializeTensor",
+    "SetSize",
+    "ShardedFilename",
+    "ShardedFilespec",
+    "ShuffleAndRepeatDataset",
+    "ShuffleDataset",
+    "SkipDataset",
+    "Snapshot",
+    "SparseAdd",
+    "SparseAddGrad",
+    "SparseConcat",
+    "SparseCross",
+    "SparseDenseCwiseAdd",
+    "SparseDenseCwiseDiv",
+    "SparseDenseCwiseMul",
+    "SparseFillEmptyRows",
+    "SparseFillEmptyRowsGrad",
+    "SparseReduceMax",
+    "SparseReduceMaxSparse",
+    "SparseReduceSum",
+    "SparseReduceSumSparse",
+    "SparseReorder",
+    "SparseReshape",
+    "SparseSegmentMean",
+    "SparseSegmentMeanGrad",
+    "SparseSegmentMeanWithNumSegments",
+    "SparseSegmentSqrtN",
+    "SparseSegmentSqrtNGrad",
+    "SparseSegmentSqrtNWithNumSegments",
+    "SparseSegmentSum",
+    "SparseSegmentSumWithNumSegments",
+    "SparseSlice",
+    "SparseSoftmax",
+    "SparseSparseMaximum",
+    "SparseSparseMinimum",
+    "SparseSplit",
+    "SparseTensorDenseAdd",
+    "SparseTensorDenseMatMul",
+    "SparseTensorSliceDataset",
+    "SparseToDense",
+    "SparseToSparseSetOperation",
+    "SqlDataset",
+    "Stage",
+    "StageClear",
+    "StagePeek",
+    "StageSize",
+    "StatelessTruncatedNormal",
+    "StatsAggregatorHandle",
+    "StatsAggregatorSummary",
+    "StringJoin",
+    "StringSplit",
+    "StringToHashBucket",
+    "StringToHashBucketFast",
+    "StringToHashBucketStrong",
+    "StringToNumber",
+    "Substr",
+    "Svd",
+    "TakeDataset",
+    "TakeManySparseFromTensorsMap",
+    "TensorDataset",
+    "TensorListFromTensor",
+    "TensorListLength",
+    "TensorListPopBack",
+    "TensorListPushBack",
+    "TensorListStack",
+    "TensorSliceDataset",
     "TensorSummary",
     "TensorSummaryV2",
-    ])
+    "TextLineDataset",
+    "TextLineReaderV2",
+    "TFRecordDataset",
+    "TFRecordReaderV2",
+    "ThreadUnsafeUnigramCandidateSampler",
+    "TopKV2",
+    "UniformCandidateSampler",
+    "Unique",
+    "UniqueDataset",
+    "UniqueV2",
+    "UniqueWithCounts",
+    "UnsortedSegmentMax",
+    "Unstage",
+    "VarHandleOp",
+    "Where",
+    "WholeFileReaderV2",
+    "WriteFile",
+    "Zeta",
+    "ZipDataset",
+])
 
 _MAX_WARNING_LINES = 5
 
@@ -124,32 +582,42 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   def __init__(self, name):
     super(TPUReplicateContext, self).__init__()
     self._name = name
-    self._unsupported_ops = []
-
-  def report_unsupported_operations(self):
-    if self._unsupported_ops:
-      op_str = "\n".join(["  %s (%s)" % (op.type, op.name)
-                          for op in self._unsupported_ops[:_MAX_WARNING_LINES]])
-      logging.warning("%d unsupported operations found: \n%s",
-                      len(self._unsupported_ops), op_str)
-      if len(self._unsupported_ops) > _MAX_WARNING_LINES:
-        logging.warning("... and %d more" %
-                        (len(self._unsupported_ops) - _MAX_WARNING_LINES))
+    self._verified_functions = set()
 
   def AddOp(self, op):
     self._AddOpInternal(op)
 
   def _AddOpInternal(self, op):
-    # pylint: disable=protected-access
-    if op.type in _BLACKLISTED_OPS:
-      logging.error("Operation of type %s (%s) is not supported on the TPU. "
-                    "Execution will fail if this op is used in the graph. " %
-                    (op.type, op.name))
 
-    if op.type in _NOT_IMPLEMENTED_OPS:
-      self._unsupported_ops.append(op)
+    def check_supported_type(op_name, op_type):
+      # pylint: disable=protected-access
+      if op_type in _TRANSFORMED_OPS:
+        pass
+      elif op_type in _BLACKLISTED_OPS:
+        logging.error(
+            "Operation of type `%s` (%s) is not supported on the TPU. "
+            "Compilation will fail if this op is used in the graph. ", op_type,
+            op_name)
+      elif op_type in _DEPRECATED_OPS:
+        raise NotImplementedError(
+            "Operation of type %s (%s) is deprecated and won't be supported on"
+            " TPU." % (op_type, op_name))
+      elif op_type in _NOT_IMPLEMENTED_OPS:
+        raise NotImplementedError(
+            "Operation of type %s (%s) not supported on TPU." % (op_type,
+                                                                 op_name))
+      elif op.graph._is_function(op_type):
+        if op_type not in self._verified_functions:
+          fun_def = op.graph._get_function(op_type).definition
+          for node in fun_def.node_def:
+            check_supported_type(node.name, node.op)
+          self._verified_functions.add(op_type)
+      # pylint: enable=protected-access
 
-    if any(x.dtype._is_ref_dtype for x in op.inputs):
+    # pylint: disable=protected-access
+    check_supported_type(op.name, op.type)
+    if any(x.dtype._is_ref_dtype for x in op.inputs) or any(
+        x.dtype._is_ref_dtype for x in op.outputs):
       raise NotImplementedError(
           "Non-resource Variables are not supported inside TPU computations "
           "(operator name: %s)" % op.name)
@@ -360,7 +828,6 @@ def replicate(computation,
           new_output_tensors.append(array_ops.identity(t))
       output_tensors = new_output_tensors
     finally:
-      context.report_unsupported_operations()
       context.Exit()
 
     # Fan-out: Builds a TPUReplicatedOutput node for each output.
-- 
GitLab


From 67cd3121296671d32c8150e9e57ac8f296f367ae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 07:59:17 -0800
Subject: [PATCH 069/966] Remove unused ResizeBilinear options. Use a tensor to
 read new sizes.

PiperOrigin-RevId: 183085403
---
 .../contrib/lite/kernels/resize_bilinear.cc   | 44 ++++++++++---------
 .../lite/kernels/resize_bilinear_test.cc      | 29 +++++++-----
 tensorflow/contrib/lite/schema/schema.fbs     |  2 -
 .../testing/generated_examples_zip_test.cc    |  5 ++-
 4 files changed, 46 insertions(+), 34 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
index 1613c9a89f..9a419af023 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
@@ -33,49 +33,53 @@ enum KernelType {
 };
 
 constexpr int kInputTensor = 0;
+constexpr int kSizeTensor = 1;
 constexpr int kOutputTensor = 0;
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  auto* params =
-      reinterpret_cast<TfLiteResizeBilinearParams*>(node->builtin_data);
-
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
   TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* size = GetInput(context, node, kSizeTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   // TODO(ahentz): Our current implementations rely on the inputs being 4D.
   TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1);
 
   // TODO(ahentz): Our current implementations only support float32.
-  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, input->type, output->type);
-
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
-  output_size->data[0] = input->dims->data[0];
-  output_size->data[1] = params->new_height;
-  output_size->data[2] = params->new_width;
-  output_size->data[3] = input->dims->data[3];
-
-  return context->ResizeTensor(context, output, output_size);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, size->type, kTfLiteInt32);
+  // ResizeBilinear creates a float tensor even when the input is made of
+  // integers.
+  output->type = kTfLiteFloat32;
+
+  // TODO(ahentz): if the input is constant, we can allocate here.
+  output->allocation_type = kTfLiteDynamic;
+  return kTfLiteOk;
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params =
-      reinterpret_cast<TfLiteResizeBilinearParams*>(node->builtin_data);
-
   TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteTensor* size = GetInput(context, node, kSizeTensor);
 
-  // We have to fake a tensor here, to satisfy ResizeBilinear().
-  int32 output_size_data[2] = {params->new_height, params->new_width};
+  // TODO(ahentz): we only need to do this here if it wasn't done in Eval().
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+  output_size->data[0] = input->dims->data[0];
+  const int32* size_data = GetTensorData<int32>(size);
+  output_size->data[1] = size_data[0];
+  output_size->data[2] = size_data[1];
+  output_size->data[3] = input->dims->data[3];
+  context->ResizeTensor(context, output, output_size);
+  TfLiteTensorRealloc(output->bytes, output);
 
   if (output->type == kTfLiteFloat32) {
 #define TF_LITE_RESIZE_BILINEAR(type)                                     \
   type::ResizeBilinear(GetTensorData<float>(input), GetTensorDims(input), \
-                       output_size_data, GetTensorDims({1, 1, 1, 2}),     \
+                       GetTensorData<int32>(size), GetTensorDims(size),   \
                        GetTensorData<float>(output), GetTensorDims(output))
 
     if (kernel_type == kReference) {
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
index 314a71e210..2b1aaf654f 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
@@ -25,47 +25,52 @@ using ::testing::ElementsAreArray;
 
 class ResizeBilinearOpModel : public SingleOpModel {
  public:
-  ResizeBilinearOpModel(std::initializer_list<int> input_shape, int new_height,
-                        int new_width) {
+  ResizeBilinearOpModel(std::initializer_list<int> input_shape) {
     input_ = AddInput(TensorType_FLOAT32);
+    size_ = AddInput(TensorType_INT32);
     output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(
-        BuiltinOperator_RESIZE_BILINEAR, BuiltinOptions_ResizeBilinearOptions,
-        CreateResizeBilinearOptions(builder_, new_height, new_width).Union());
-    BuildInterpreter({input_shape});
+    SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR,
+                 BuiltinOptions_ResizeBilinearOptions,
+                 CreateResizeBilinearOptions(builder_).Union());
+    BuildInterpreter({input_shape, {2}});
   }
 
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor(input_, data);
   }
+  void SetSize(std::initializer_list<int> data) { PopulateTensor(size_, data); }
 
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
 
  private:
   int input_;
+  int size_;
   int output_;
 };
 
 TEST(ResizeBilinearOpTest, HorizontalResize) {
-  ResizeBilinearOpModel m({1, 1, 2, 1}, 1, 3);
+  ResizeBilinearOpModel m({1, 1, 2, 1});
   m.SetInput({3, 6});
+  m.SetSize({1, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6})));
 }
 
 TEST(ResizeBilinearOpTest, VerticalResize) {
-  ResizeBilinearOpModel m({1, 2, 1, 1}, 3, 1);
+  ResizeBilinearOpModel m({1, 2, 1, 1});
   m.SetInput({3, 9});
+  m.SetSize({3, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9})));
 }
 
 TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
-  ResizeBilinearOpModel m({1, 2, 2, 1}, 3, 3);
+  ResizeBilinearOpModel m({1, 2, 2, 1});
   m.SetInput({
       3, 6,  //
       9, 12  //
   });
+  m.SetSize({3, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
                                  3, 5, 6,    //
@@ -75,13 +80,14 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
 }
 
 TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
-  ResizeBilinearOpModel m({2, 2, 2, 1}, 3, 3);
+  ResizeBilinearOpModel m({2, 2, 2, 1});
   m.SetInput({
       3, 6,   //
       9, 12,  //
       4, 10,  //
       10, 16  //
   });
+  m.SetSize({3, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
                                  3, 5, 6,     //
@@ -94,11 +100,12 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
 }
 
 TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
-  ResizeBilinearOpModel m({1, 2, 2, 2}, 3, 3);
+  ResizeBilinearOpModel m({1, 2, 2, 2});
   m.SetInput({
       3, 4, 6, 10,    //
       9, 10, 12, 16,  //
   });
+  m.SetSize({3, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
                                  3, 4, 5, 8, 6, 10,      //
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 8ddad4d251..da7db9bcf4 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -266,8 +266,6 @@ table LSTMOptions {
 }
 
 table ResizeBilinearOptions {
-  new_height:int;
-  new_width:int;
 }
 
 // A call operation options
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index c29cd85c4d..36aa09090b 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -85,7 +85,10 @@ std::map<string, string> kBrokenTests = {
     {R"(l2normdim=\[2,3\],epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
 
     // ResizeBilinear looks completely incompatible with Tensorflow
-    {R"(resize_bilinear)", "67964336"},
+    {R"(resize_bilinear.*dtype=tf.int32)", "72401107"},
+    {R"(resize_bilinearalign_corners=True,.*,size=\[2,2\])", "72401483"},
+    {R"(resize_bilinearalign_corners=True,.*,size=\[4,3\])", "72401483"},
+    {R"(resize_bilinearalign_corners=True,.*,size=\[5,6\])", "72401483"},
 
     // Transpose only supports 1D-4D input tensors.
     {R"(transposedtype=.*,input_shape=\[.,.,.,.,.\],perm=.*)", "71545879"},
-- 
GitLab


From 0f103fbb49b952cfcdcdfcb38d011f59db6dd97e Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Wed, 24 Jan 2018 08:00:10 -0800
Subject: [PATCH 070/966] Automated g4 rollback of changelist 183076718

PiperOrigin-RevId: 183085489
---
 tensorflow/compiler/tf2xla/xla_op_registry.cc |   2 -
 tensorflow/contrib/tpu/python/tpu/tpu.py      | 511 +-----------------
 2 files changed, 22 insertions(+), 491 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index bbe808595d..0dde6a986c 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -255,8 +255,6 @@ void XlaOpRegistry::RegisterCompilationKernels() {
 std::vector<const KernelDef*> XlaOpRegistry::DeviceKernels(
     const string& compilation_device_name,
     bool include_compilation_only_kernels) {
-  // Ensure compilation kernels registered.
-  RegisterCompilationKernels();
   std::vector<const KernelDef*> kernels;
   XlaOpRegistry& registry = Instance();
   mutex_lock lock(registry.mutex_);
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index d552f77f50..8fec379aad 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -37,479 +37,21 @@ from tensorflow.python.util import compat
 # that's introduced outside of the infeed.
 _BLACKLISTED_OPS = set([
     "Placeholder",
-    "PlaceholderV2",
-    "PlaceholderWithDefault",
-    "ScalarSummary",
-])
-
-# Operations that are deprecated and won't be implemented.
-_DEPRECATED_OPS = set([
-    "AdjustContrast",
-    "AudioSummary",
-    "BatchCholesky",
-    "BatchCholeskyGrad",
-    "BatchFFT",
-    "BatchFFT2D",
-    "BatchFFT3D",
-    "BatchIFFT",
-    "BatchIFFT2D",
-    "BatchIFFT3D",
-    "BatchMatrixBandPart",
-    "BatchMatrixDeterminant",
-    "BatchMatrixDiag",
-    "BatchMatrixDiagPart",
-    "BatchMatrixInverse",
-    "BatchMatrixSetDiag",
-    "BatchMatrixSolve",
-    "BatchMatrixSolveLs",
-    "BatchMatrixTriangularSolve",
-    "BatchNormWithGlobalNormalization",
-    "BatchNormWithGlobalNormalizationGrad",
-    "BatchSelfAdjointEig",
-    "BatchSelfAdjointEigV2",
-    "BatchSvd",
-    "Conv3DBackpropFilter",
-    "Conv3DBackpropInput",
-    "FixedLengthRecordReader",
-    "IdentityReader",
-    "NegTrain",
-    "PlaceholderV2",
-    "QuantizeAndDequantize",
-    "RandomCrop",
-    "RandomPoisson",
-    "SelfAdjointEig",
-    "Skipgram",
-    "TFRecordReader",
-    "TensorArray",
-    "TensorArrayClose",
-    "TensorArrayCloseV2",
-    "TensorArrayConcat",
-    "TensorArrayConcatV2",
-    "TensorArrayGather",
-    "TensorArrayGatherV2",
-    "TensorArrayGrad",
-    "TensorArrayGradV2",
-    "TensorArrayPack",
-    "TensorArrayRead",
-    "TensorArrayReadV2",
-    "TensorArrayScatter",
-    "TensorArrayScatterV2",
-    "TensorArraySize",
-    "TensorArraySizeV2",
-    "TensorArraySplit",
-    "TensorArraySplitV2",
-    "TensorArrayUnpack",
-    "TensorArrayV2",
-    "TensorArrayWrite",
-    "TensorArrayWriteV2",
-    "TextLineReader",
-    "TileGrad",
-    "TopK",
-])
-
-# Operations that don't make it to the TPU but are added to the graph.
-_TRANSFORMED_OPS = set([
-    "Enter",
-    "Exit",
-    "LoopCond",
-    "Merge",
-    "NextIteration",
-    "Switch",
-    "TPUReplicateMetadata",
 ])
 
 # These operations will currently fail to compile, but we should be able to
 # support them eventually via CPU offload or extending our operation set.
 _NOT_IMPLEMENTED_OPS = set([
-    "Abort",
-    "AccumulateNV2",
-    "Acos",
-    "AddManySparseToTensorsMap",
-    "AddSparseToTensorsMap",
-    "AddV2",
-    "AllCandidateSampler",
-    "Asin",
-    "AsString",
-    "Atan",
-    "AudioSpectrogram",
+    "AudioSummary",
     "AudioSummaryV2",
-    "BatchDataset",
-    "Betainc",
-    "Bincount",
-    "Bitcast",
-    "BitwiseXor",
-    "Bucketize",
-    "BytesProducedStatsDataset",
-    "CacheDataset",
-    "CholeskyGrad",
-    "CompareAndBitpack",
-    "ComputeAccidentalHits",
-    "ConcatenateDataset",
-    "ConjugateTranspose",
-    "Copy",
-    "CopyHost",
-    "CriticalSectionOp",
-    "CropAndResize",
-    "CropAndResizeGradBoxes",
-    "CropAndResizeGradImage",
-    "CTCBeamSearchDecoder",
-    "CTCGreedyDecoder",
-    "CTCLoss",
-    "Cumprod",
-    "DataFormatDimMap",
-    "DataFormatVecPermute",
-    "DatasetToSingleElement",
-    "DebugGradientIdentity",
-    "DebugIdentity",
-    "DebugNanCount",
-    "DebugNumericSummary",
-    "DecodeAndCropJpeg",
-    "DecodeBase64",
-    "DecodeBmp",
-    "DecodeCompressed",
-    "DecodeCSV",
-    "DecodeGif",
-    "DecodeJpeg",
-    "DecodeJSONExample",
-    "DecodePng",
-    "DecodeRaw",
-    "DecodeWav",
-    "DeleteSessionTensor",
-    "DenseToDenseSetOperation",
-    "DenseToSparseBatchDataset",
-    "DenseToSparseSetOperation",
-    "Dequantize",
-    "DeserializeIterator",
-    "DeserializeManySparse",
-    "DeserializeSparse",
-    "DestroyResourceOp",
-    "Digamma",
-    "Dilation2D",
-    "Dilation2DBackpropFilter",
-    "Dilation2DBackpropInput",
-    "DrawBoundingBoxes",
-    "DynamicPartition",
-    "EagerPyFunc",
-    "EditDistance",
-    "EmptyTensorList",
-    "EncodeBase64",
-    "EncodeJpeg",
-    "EncodePng",
-    "EncodeWav",
-    "Erf",
-    "Erfc",
-    "ExecuteInCriticalSection",
-    "ExtractGlimpse",
-    "ExtractImagePatches",
-    "ExtractJpegShape",
-    "Fact",
-    "FakeQuantWithMinMaxArgs",
-    "FakeQuantWithMinMaxArgsGradient",
-    "FakeQuantWithMinMaxVars",
-    "FakeQuantWithMinMaxVarsGradient",
-    "FakeQuantWithMinMaxVarsPerChannel",
-    "FakeQuantWithMinMaxVarsPerChannelGradient",
-    "FIFOQueueV2",
-    "FilterDataset",
-    "FixedLengthRecordDataset",
-    "FixedLengthRecordReaderV2",
-    "FixedUnigramCandidateSampler",
-    "FlatMapDataset",
-    "FractionalAvgPool",
-    "FractionalAvgPoolGrad",
-    "FractionalMaxPool",
-    "FractionalMaxPoolGrad",
-    "FusedPadConv2D",
-    "FusedResizeAndPadConv2D",
-    "GatherNd",
-    "GenerateVocabRemapping",
-    "GetSessionHandle",
-    "GetSessionHandleV2",
-    "GetSessionTensor",
-    "GroupByWindowDataset",
-    "GuaranteeConst",
-    "HashTableV2",
-    "HistogramFixedWidth",
     "HistogramSummary",
-    "IdentityReaderV2",
-    "Igamma",
-    "Igammac",
-    "IgnoreErrorsDataset",
     "ImageSummary",
-    "ImmutableConst",
-    "InitializeTableFromTextFileV2",
-    "InitializeTableV2",
-    "InterleaveDataset",
-    "InTopK",
-    "InTopKV2",
-    "InvGrad",
-    "Iterator",
-    "IteratorFromStringHandle",
-    "IteratorGetNext",
-    "IteratorSetStatsAggregator",
-    "IteratorToStringHandle",
-    "LatencyStatsDataset",
-    "LearnedUnigramCandidateSampler",
-    "Lgamma",
-    "ListDiff",
-    "LoadAndRemapMatrix",
-    "LogMatrixDeterminant",
-    "LogUniformCandidateSampler",
-    "LookupTableExportV2",
-    "LookupTableFindV2",
-    "LookupTableImportV2",
-    "LookupTableInsertV2",
-    "LookupTableSizeV2",
-    "MakeIterator",
-    "MapAndBatchDataset",
-    "MapClear",
-    "MapDataset",
-    "MapIncompleteSize",
-    "MapPeek",
-    "MapSize",
-    "MapStage",
-    "MapUnstage",
-    "MapUnstageNoKey",
-    "MatchingFiles",
-    "MatrixBandPart",
-    "MatrixDeterminant",
-    "MatrixExponential",
-    "MatrixInverse",
-    "MatrixSetDiag",
-    "MatrixSolve",
-    "MatrixSolveLs",
-    "MatrixTriangularSolve",
-    "MaxPool3DGradGrad",
-    "MaxPoolGradGrad",
-    "MaxPoolGradGradV2",
-    "MaxPoolGradGradWithArgmax",
-    "MaxPoolGradV2",
-    "MaxPoolGradWithArgmax",
-    "MaxPoolV2",
-    "MaxPoolWithArgmax",
     "MergeSummary",
-    "MergeV2Checkpoints",
-    "Mfcc",
-    "MirrorPadGrad",
-    "MutableDenseHashTableV2",
-    "MutableHashTableOfTensorsV2",
-    "MutableHashTableV2",
-    "NonMaxSuppression",
-    "NonMaxSuppressionV2",
-    "NthElement",
-    "OneShotIterator",
-    "OrderedMapClear",
-    "OrderedMapIncompleteSize",
-    "OrderedMapPeek",
-    "OrderedMapSize",
-    "OrderedMapStage",
-    "OrderedMapUnstage",
-    "OrderedMapUnstageNoKey",
-    "PaddedBatchDataset",
-    "PaddingFIFOQueueV2",
-    "ParallelConcat",
-    "ParallelInterleaveDataset",
-    "ParallelMapDataset",
-    "ParameterizedTruncatedNormal",
-    "ParseExample",
-    "ParseSingleExample",
-    "ParseSingleSequenceExample",
-    "ParseTensor",
-    "Placeholder",
-    "PlaceholderWithDefault",
-    "Polygamma",
-    "PopulationCount",
-    "PrefetchDataset",
     "Print",
-    "PriorityQueueV2",
-    "PyFunc",
-    "PyFuncStateless",
-    "Qr",
-    "QuantizeAndDequantizeV3",
-    "QuantizedAdd",
-    "QuantizedAvgPool",
-    "QuantizedBatchNormWithGlobalNormalization",
-    "QuantizedBiasAdd",
-    "QuantizedConcat",
-    "QuantizedConv2D",
-    "QuantizedInstanceNorm",
-    "QuantizedMatMul",
-    "QuantizedMaxPool",
-    "QuantizedMul",
-    "QuantizeDownAndShrinkRange",
-    "QuantizedRelu",
-    "QuantizedRelu6",
-    "QuantizedReluX",
-    "QuantizedReshape",
-    "QuantizedResizeBilinear",
-    "QuantizeV2",
-    "QueueCloseV2",
-    "QueueDequeueManyV2",
-    "QueueDequeueUpToV2",
-    "QueueDequeueV2",
-    "QueueEnqueueManyV2",
-    "QueueEnqueueV2",
-    "QueueIsClosedV2",
-    "QueueSizeV2",
-    "RandomDataset",
-    "RandomGamma",
-    "RandomPoissonV2",
-    "RandomShuffle",
-    "RandomShuffleQueueV2",
-    "RangeDataset",
-    "ReaderNumRecordsProducedV2",
-    "ReaderNumWorkUnitsCompletedV2",
-    "ReaderReadUpToV2",
-    "ReaderReadV2",
-    "ReaderResetV2",
-    "ReaderRestoreStateV2",
-    "ReaderSerializeStateV2",
-    "ReadFile",
-    "RecordInput",
-    "ReduceJoin",
-    "RemoteCall",
-    "RemoteFusedGraphExecute",
-    "RepeatDataset",
-    "RequantizationRange",
-    "Requantize",
-    "ResizeArea",
-    "ResizeBicubic",
-    "ResizeBicubicGrad",
-    "ResizeNearestNeighbor",
-    "ResizeNearestNeighborGrad",
-    "ResourceApplyAdadelta",
-    "ResourceApplyAdagradDA",
-    "ResourceApplyAddSign",
-    "ResourceApplyCenteredRMSProp",
-    "ResourceApplyPowerSign",
-    "ResourceApplyProximalAdagrad",
-    "ResourceApplyProximalGradientDescent",
-    "ResourceCountUpTo",
-    "ResourceScatterAdd",
-    "ResourceScatterNdUpdate",
-    "ResourceScatterUpdate",
-    "ResourceSparseApplyAdadelta",
-    "ResourceSparseApplyAdagrad",
-    "ResourceSparseApplyAdagradDA",
-    "ResourceSparseApplyCenteredRMSProp",
-    "ResourceSparseApplyFtrl",
-    "ResourceSparseApplyFtrlV2",
-    "ResourceSparseApplyMomentum",
-    "ResourceSparseApplyProximalAdagrad",
-    "ResourceSparseApplyProximalGradientDescent",
-    "ResourceSparseApplyRMSProp",
-    "Restore",
-    "RestoreSlice",
-    "RestoreV2",
-    "ReverseSequence",
-    "SampleDistortedBoundingBox",
-    "SampleDistortedBoundingBoxV2",
-    "Save",
-    "SaveSlices",
-    "SaveV2",
     "ScalarSummary",
-    "ScanDataset",
-    "ScatterNd",
-    "ScatterNdNonAliasingAdd",
-    "SdcaFprint",
-    "SdcaOptimizer",
-    "SegmentMax",
-    "SegmentMean",
-    "SegmentMin",
-    "SegmentProd",
-    "SegmentSum",
-    "SelfAdjointEigV2",
-    "SerializeIterator",
-    "SerializeManySparse",
-    "SerializeSparse",
-    "SerializeTensor",
-    "SetSize",
-    "ShardedFilename",
-    "ShardedFilespec",
-    "ShuffleAndRepeatDataset",
-    "ShuffleDataset",
-    "SkipDataset",
-    "Snapshot",
-    "SparseAdd",
-    "SparseAddGrad",
-    "SparseConcat",
-    "SparseCross",
-    "SparseDenseCwiseAdd",
-    "SparseDenseCwiseDiv",
-    "SparseDenseCwiseMul",
-    "SparseFillEmptyRows",
-    "SparseFillEmptyRowsGrad",
-    "SparseReduceMax",
-    "SparseReduceMaxSparse",
-    "SparseReduceSum",
-    "SparseReduceSumSparse",
-    "SparseReorder",
-    "SparseReshape",
-    "SparseSegmentMean",
-    "SparseSegmentMeanGrad",
-    "SparseSegmentMeanWithNumSegments",
-    "SparseSegmentSqrtN",
-    "SparseSegmentSqrtNGrad",
-    "SparseSegmentSqrtNWithNumSegments",
-    "SparseSegmentSum",
-    "SparseSegmentSumWithNumSegments",
-    "SparseSlice",
-    "SparseSoftmax",
-    "SparseSparseMaximum",
-    "SparseSparseMinimum",
-    "SparseSplit",
-    "SparseTensorDenseAdd",
-    "SparseTensorDenseMatMul",
-    "SparseTensorSliceDataset",
-    "SparseToDense",
-    "SparseToSparseSetOperation",
-    "SqlDataset",
-    "Stage",
-    "StageClear",
-    "StagePeek",
-    "StageSize",
-    "StatelessTruncatedNormal",
-    "StatsAggregatorHandle",
-    "StatsAggregatorSummary",
-    "StringJoin",
-    "StringSplit",
-    "StringToHashBucket",
-    "StringToHashBucketFast",
-    "StringToHashBucketStrong",
-    "StringToNumber",
-    "Substr",
-    "Svd",
-    "TakeDataset",
-    "TakeManySparseFromTensorsMap",
-    "TensorDataset",
-    "TensorListFromTensor",
-    "TensorListLength",
-    "TensorListPopBack",
-    "TensorListPushBack",
-    "TensorListStack",
-    "TensorSliceDataset",
     "TensorSummary",
     "TensorSummaryV2",
-    "TextLineDataset",
-    "TextLineReaderV2",
-    "TFRecordDataset",
-    "TFRecordReaderV2",
-    "ThreadUnsafeUnigramCandidateSampler",
-    "TopKV2",
-    "UniformCandidateSampler",
-    "Unique",
-    "UniqueDataset",
-    "UniqueV2",
-    "UniqueWithCounts",
-    "UnsortedSegmentMax",
-    "Unstage",
-    "VarHandleOp",
-    "Where",
-    "WholeFileReaderV2",
-    "WriteFile",
-    "Zeta",
-    "ZipDataset",
-])
+    ])
 
 _MAX_WARNING_LINES = 5
 
@@ -582,42 +124,32 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   def __init__(self, name):
     super(TPUReplicateContext, self).__init__()
     self._name = name
-    self._verified_functions = set()
+    self._unsupported_ops = []
+
+  def report_unsupported_operations(self):
+    if self._unsupported_ops:
+      op_str = "\n".join(["  %s (%s)" % (op.type, op.name)
+                          for op in self._unsupported_ops[:_MAX_WARNING_LINES]])
+      logging.warning("%d unsupported operations found: \n%s",
+                      len(self._unsupported_ops), op_str)
+      if len(self._unsupported_ops) > _MAX_WARNING_LINES:
+        logging.warning("... and %d more" %
+                        (len(self._unsupported_ops) - _MAX_WARNING_LINES))
 
   def AddOp(self, op):
     self._AddOpInternal(op)
 
   def _AddOpInternal(self, op):
+    # pylint: disable=protected-access
+    if op.type in _BLACKLISTED_OPS:
+      logging.error("Operation of type %s (%s) is not supported on the TPU. "
+                    "Execution will fail if this op is used in the graph. " %
+                    (op.type, op.name))
 
-    def check_supported_type(op_name, op_type):
-      # pylint: disable=protected-access
-      if op_type in _TRANSFORMED_OPS:
-        pass
-      elif op_type in _BLACKLISTED_OPS:
-        logging.error(
-            "Operation of type `%s` (%s) is not supported on the TPU. "
-            "Compilation will fail if this op is used in the graph. ", op_type,
-            op_name)
-      elif op_type in _DEPRECATED_OPS:
-        raise NotImplementedError(
-            "Operation of type %s (%s) is deprecated and won't be supported on"
-            " TPU." % (op_type, op_name))
-      elif op_type in _NOT_IMPLEMENTED_OPS:
-        raise NotImplementedError(
-            "Operation of type %s (%s) not supported on TPU." % (op_type,
-                                                                 op_name))
-      elif op.graph._is_function(op_type):
-        if op_type not in self._verified_functions:
-          fun_def = op.graph._get_function(op_type).definition
-          for node in fun_def.node_def:
-            check_supported_type(node.name, node.op)
-          self._verified_functions.add(op_type)
-      # pylint: enable=protected-access
+    if op.type in _NOT_IMPLEMENTED_OPS:
+      self._unsupported_ops.append(op)
 
-    # pylint: disable=protected-access
-    check_supported_type(op.name, op.type)
-    if any(x.dtype._is_ref_dtype for x in op.inputs) or any(
-        x.dtype._is_ref_dtype for x in op.outputs):
+    if any(x.dtype._is_ref_dtype for x in op.inputs):
       raise NotImplementedError(
           "Non-resource Variables are not supported inside TPU computations "
           "(operator name: %s)" % op.name)
@@ -828,6 +360,7 @@ def replicate(computation,
           new_output_tensors.append(array_ops.identity(t))
       output_tensors = new_output_tensors
     finally:
+      context.report_unsupported_operations()
       context.Exit()
 
     # Fan-out: Builds a TPUReplicatedOutput node for each output.
-- 
GitLab


From 1fc5c8393ab8903ed43cd8ced39d8b4a9b837960 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 08:35:13 -0800
Subject: [PATCH 071/966] Adds sharding in head_test to prevent timeouts.

PiperOrigin-RevId: 183089222
---
 tensorflow/python/estimator/BUILD | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 6343615737..58d540b1ec 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -624,8 +624,9 @@ py_library(
 
 py_test(
     name = "head_test",
-    size = "small",
+    size = "medium",
     srcs = ["canned/head_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
-- 
GitLab


From a346b737046574a86268bf920e4c88f19a455830 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 24 Jan 2018 09:01:25 -0800
Subject: [PATCH 072/966] Don't set unused Operation private fields if the C
 API is enabled.

This helps catch invalid uses of the private fields with the C API
enabled, and refactors Operation.__init__ in preparation for enabling
the C API and eventually getting rid of the _USE_C_API toggle.

PiperOrigin-RevId: 183092099
---
 tensorflow/python/framework/ops.py | 31 +++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 9d3806e8fe..6c4563f769 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -1583,7 +1583,6 @@ class Operation(object):
             "Cannot create a tensor proto whose content is larger than 2GB.")
       if not _VALID_OP_NAME_REGEX.match(node_def.name):
         raise ValueError("'%s' is not a valid node name" % node_def.name)
-      self._node_def_val = copy.deepcopy(node_def)
       c_op = None
     elif type(node_def).__name__ == "SwigPyObject":
       assert inputs is None
@@ -1592,7 +1591,6 @@ class Operation(object):
       assert input_types is None
       assert original_op is None
       assert op_def is None
-      self._node_def_val = None
       c_op = node_def
     else:
       raise TypeError("node_def needs to be a NodeDef: %s" % node_def)
@@ -1605,24 +1603,22 @@ class Operation(object):
       inputs = []
     elif not isinstance(inputs, list):
       raise TypeError("inputs needs to be a list of Tensors: %s" % inputs)
-    self._inputs_val = list(inputs)  # Defensive copy.
-    for a in self._inputs_val:
+    for a in inputs:
       if not isinstance(a, Tensor):
         raise TypeError("input needs to be a Tensor: %s" % a)
     if input_types is None:
-      input_types = [i.dtype.base_dtype for i in self._inputs_val]
+      input_types = [i.dtype.base_dtype for i in inputs]
     else:
       if not all(
           x.is_compatible_with(i.dtype)
-          for i, x in zip(self._inputs_val, input_types)):
+          for i, x in zip(inputs, input_types)):
         raise TypeError("In op '%s', input types (%s) are not compatible "
                         "with expected types (%s)" %
-                        (self.node_def.name, [i.dtype for i in self._inputs_val],
+                        (self.node_def.name, [i.dtype for i in inputs],
                          input_types))
-    self._input_types_val = input_types
 
     # Build the list of control inputs.
-    self._control_inputs_val = []
+    control_input_ops = []
     if control_inputs:
       for c in control_inputs:
         control_op = None
@@ -1633,11 +1629,20 @@ class Operation(object):
         else:
           raise TypeError("Control input must be an Operation, "
                           "a Tensor, or IndexedSlices: %s" % c)
-        self._control_inputs_val.append(control_op)
+        control_input_ops.append(control_op)
+
+    # Don't set private fields with C API enabled to catch users who need to
+    # switch to public API.
+    # TODO(skyewm): delete these fields once we remove _USE_C_API
+    if not self._graph._c_graph:
+      self._inputs_val = list(inputs)  # Defensive copy.
+      self._input_types_val = input_types
+      self._control_inputs_val = control_input_ops
+      self._node_def_val = copy.deepcopy(node_def)
+      self._op_def_val = op_def
 
     self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._original_op = original_op
-    self._op_def_val = op_def
     self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
     self._control_flow_context = self.graph._get_control_flow_context()  # pylint: disable=protected-access
 
@@ -1653,8 +1658,8 @@ class Operation(object):
       # Refactor so we don't have to do this here.
       grouped_inputs = self._reconstruct_sequence_inputs(
           op_def, inputs, node_def.attr)
-      self._c_op = _create_c_op(self._graph, self._node_def_val, grouped_inputs,
-                                self._control_inputs_val)
+      self._c_op = _create_c_op(self._graph, node_def, grouped_inputs,
+                                control_input_ops)
     else:
       self._c_op = None
 
-- 
GitLab


From 37bfad3c33c005077630b021ca927608dd70bb3e Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 24 Jan 2018 09:10:53 -0800
Subject: [PATCH 073/966] Allow setting per-thread device copying policies in
 TFE.

PiperOrigin-RevId: 183093407
---
 tensorflow/c/eager/c_api.cc          | 19 ++++++++++-
 tensorflow/c/eager/c_api.h           | 12 +++++++
 tensorflow/c/eager/c_api_internal.h  |  7 ++++
 tensorflow/c/eager/c_api_test.cc     | 49 ++++++++++++++++++++++++++++
 tensorflow/python/eager/context.py   | 12 +++++++
 tensorflow/python/eager/core_test.py |  9 +++++
 tensorflow/python/pywrap_tfe.i       |  2 ++
 7 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index e56cfa5885..e4e33016c2 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -118,6 +118,23 @@ void TFE_ContextClearCaches(TFE_Context* ctx) {
   tensorflow::gtl::STLDeleteValues(&ctx->kernel_cache);
 }
 
+void TFE_ContextSetThreadLocalDevicePlacementPolicy(
+    TFE_Context* ctx, TFE_ContextDevicePlacementPolicy policy) {
+  tensorflow::mutex_lock ml(ctx->policy_map_mu);
+  ctx->thread_local_policies[std::this_thread::get_id()] = policy;
+}
+
+extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy(
+    TFE_Context* ctx) {
+  tensorflow::mutex_lock ml(ctx->policy_map_mu);
+  auto policy_map_it =
+      ctx->thread_local_policies.find(std::this_thread::get_id());
+  if (policy_map_it != ctx->thread_local_policies.end()) {
+    return policy_map_it->second;
+  }
+  return ctx->policy;
+}
+
 TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
   tensorflow::Tensor tensor;
   status->status = tensorflow::TF_TensorToTensor(t, &tensor);
@@ -434,7 +451,7 @@ tensorflow::Status ValidateInputTypeAndPlacement(
     const tensorflow::Device* actual_device =
         op->input_devices[i] == nullptr ? host_device : op->input_devices[i];
     if (expected_device != actual_device) {
-      switch (ctx->policy) {
+      switch (TFE_ContextGetDevicePlacementPolicy(ctx)) {
         case TFE_DEVICE_PLACEMENT_EXPLICIT:
           // TODO(xpan): See if we could bubble python related error up
           // to python level.
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 9b0fd037da..26e6d94753 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -93,6 +93,18 @@ TF_CAPI_EXPORT extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
 // ops.
 TF_CAPI_EXPORT extern void TFE_ContextClearCaches(TFE_Context* ctx);
 
+// Sets a thread-local device placement policy. After this call, other calls to
+// TFE_Execute in the same thread will use the device policy specified here
+// instead of the device policy used to construct the context. This has no
+// effect on the device policy used by other program threads.
+TF_CAPI_EXPORT extern void TFE_ContextSetThreadLocalDevicePlacementPolicy(
+    TFE_Context*, TFE_ContextDevicePlacementPolicy);
+
+// Returns the device placement policy to be used by this context in the current
+// thread.
+TF_CAPI_EXPORT extern TFE_ContextDevicePlacementPolicy
+TFE_ContextGetDevicePlacementPolicy(TFE_Context*);
+
 // A handle to a tensor on a device.
 //
 // Like a TF_Tensor, a TFE_TensorHandle refers to a tensor with a value, shape,
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 55a04d48ba..f40c41576c 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <cstddef>
 #include <memory>
 #include <string>
+#include <thread>
 #include <vector>
 
 #include "tensorflow/c/c_api.h"
@@ -45,6 +46,12 @@ struct TFE_Context {
 
   TFE_ContextDevicePlacementPolicy policy;
 
+  // Note: we cannot use C++11 thread_local here as there is no concept of a
+  // thread-local-object-local variable in C++11.
+  tensorflow::mutex policy_map_mu;
+  std::unordered_map<std::thread::id, TFE_ContextDevicePlacementPolicy>
+      thread_local_policies GUARDED_BY(policy_map_mu);
+
   // TFE_Context is an extension of TF_Session. And TF_Session needs a TF_Graph.
   TF_Session* session;
   tensorflow::Rendezvous* rendezvous;
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 423a7e1ff7..18e7a64435 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -321,6 +321,55 @@ TEST(CAPI, TensorHandleSilentCopy) {
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 }
 
+TEST(CAPI, TensorHandleSilentCopyLocal) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetDevicePlacementPolicy(opts,
+                                             TFE_DEVICE_PLACEMENT_EXPLICIT);
+  TFE_Context* ctx = TFE_NewContext(opts, status.get());
+  TFE_ContextSetThreadLocalDevicePlacementPolicy(ctx,
+                                                 TFE_DEVICE_PLACEMENT_SILENT);
+  TFE_DeleteContextOptions(opts);
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TFE_TensorHandle* hcpu = TestMatrixTensorHandle();
+  TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  const int num_devices = TF_DeviceListCount(devices);
+
+  // Disable the test if no GPU is present.
+  if (num_devices > 1) {
+    const int device_to_use = 1;
+    const string name(TF_DeviceListName(devices, device_to_use, status.get()));
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    TFE_TensorHandle* hgpu =
+        TFE_TensorHandleCopyToDevice(hcpu, ctx, name.c_str(), status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu);
+    TFE_OpSetDevice(matmul, name.c_str(), status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+    TFE_TensorHandle* retvals[1];
+    int num_retvals = 1;
+    TFE_Execute(matmul, &retvals[0], &num_retvals, status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+    TFE_DeleteOp(matmul);
+    TFE_DeleteTensorHandle(retvals[0]);
+    TFE_DeleteTensorHandle(hgpu);
+  }
+
+  TF_DeleteDeviceList(devices);
+  TF_DeleteTensor(t);
+  TFE_DeleteTensorHandle(hcpu);
+  TFE_DeleteContext(ctx, status.get());
+  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+}
+
 TEST(CAPI, Execute) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index cbf588336d..df483b69c4 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -411,6 +411,18 @@ class Context(object):
       self._initialize_handle_and_devices()
     pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._context_handle)
 
+  @contextlib.contextmanager
+  def device_policy(self, policy):
+    old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
+        self._context_handle)
+    pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
+        self._handle, policy)
+    try:
+      yield
+    finally:
+      pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
+          self._handle, old)
+
   def disable_run_metadata(self):
     """Disables tracing of op execution via RunMetadata."""
     if not self._context_handle:
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index a70fa72804..82438ecadb 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -173,6 +173,15 @@ class TFETest(test_util.TensorFlowTestCase):
     with self.assertRaises(RuntimeError):
       x.gpu(context.context().num_gpus() + 1)
 
+  def testCopyScope(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+    constant = constant_op.constant(1.0)
+    with ops.device('gpu:0'):
+      with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+        c = constant + 1.0
+    self.assertAllEqual(c, 2.0)
+
   def testNumpyForceCPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index de0e5856ff..ed7e4b17ad 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -24,6 +24,8 @@ limitations under the License.
 %rename("%s") TFE_ContextDisableRunMetadata;
 %rename("%s") TFE_ContextExportRunMetadata;
 %rename("%s") TFE_ContextClearCaches;
+%rename("%s") TFE_ContextGetDevicePlacementPolicy;
+%rename("%s") TFE_ContextSetThreadLocalDevicePlacementPolicy;
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
 %rename("%s") TFE_Py_RegisterExceptionClass;
-- 
GitLab


From ffdae0a35785337bd10fed289d3998ba0e7c014b Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Wed, 24 Jan 2018 09:15:32 -0800
Subject: [PATCH 074/966] Rename internal field.

PiperOrigin-RevId: 183093901
---
 tensorflow/python/estimator/run_config.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index dc714d4d22..894005550d 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -484,7 +484,7 @@ class RunConfig(object):
         self._num_ps_replicas = _count_ps(self._cluster_spec)
         self._num_worker_replicas = _count_worker(
             self._cluster_spec, chief_task_type=TaskType.CHIEF)
-        self._global_id = _get_global_id_in_cluster(
+        self._global_id_in_cluster = _get_global_id_in_cluster(
             self._cluster_spec,
             self._task_type,
             self._task_id,
@@ -495,14 +495,14 @@ class RunConfig(object):
         self._master = _LOCAL_MASTER
         self._num_ps_replicas = 0
         self._num_worker_replicas = 0
-        self._global_id = None  # undefined
+        self._global_id_in_cluster = None  # undefined
 
       self._is_chief = self._task_type == TaskType.CHIEF
     else:
       # Local mode.
       self._task_type = task_env.get(_TASK_TYPE_KEY, TaskType.WORKER)
       self._task_id = int(task_env.get(_TASK_ID_KEY, 0))
-      self._global_id = 0
+      self._global_id_in_cluster = 0
 
       if self._task_type != TaskType.WORKER:
         raise ValueError(
@@ -537,7 +537,7 @@ class RunConfig(object):
       raise ValueError('If `master` node exists in `cluster`, task_type '
                        '`evaluator` is not supported.')
 
-    self._global_id = _get_global_id_in_cluster(
+    self._global_id_in_cluster = _get_global_id_in_cluster(
         self._cluster_spec,
         self._task_type,
         self._task_id,
@@ -619,7 +619,7 @@ class RunConfig(object):
     Returns:
       An integer id.
     """
-    return self._global_id
+    return self._global_id_in_cluster
 
   @property
   def task_type(self):
-- 
GitLab


From 4e5da8880a1de695212777628f33db07833675c6 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <vomjom@vomjom.net>
Date: Wed, 24 Jan 2018 09:20:45 -0800
Subject: [PATCH 075/966] Remove path_to_str from the public API (#16339)

* Remove path_to_str from the public API

* Move path_to_str to another file
---
 tensorflow/python/estimator/estimator.py      |  3 +-
 tensorflow/python/estimator/run_config.py     |  4 ++-
 tensorflow/python/util/compat.py              | 15 --------
 tensorflow/python/util/compat_internal.py     | 34 +++++++++++++++++++
 .../tools/api/golden/tensorflow.compat.pbtxt  |  4 ---
 5 files changed, 39 insertions(+), 21 deletions(-)
 create mode 100644 tensorflow/python/util/compat_internal.py

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index bd65d565b5..f53e170acc 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -54,6 +54,7 @@ from tensorflow.python.training import saver
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
+from tensorflow.python.util import compat_internal
 from tensorflow.python.util import nest
 
 
@@ -159,7 +160,7 @@ class Estimator(object):
       self._config = config
 
     # Model directory.
-    model_dir = compat.path_to_str(model_dir)
+    model_dir = compat_internal.path_to_str(model_dir)
     if (model_dir is not None) and (self._config.model_dir is not None):
       if model_dir != self._config.model_dir:
         # TODO(alanyee): remove this suppression after it is no longer needed
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index db30329897..ee77961b39 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -28,6 +28,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
+from tensorflow.python.util import compat_internal
 
 
 _USE_DEFAULT = object()
@@ -444,7 +445,8 @@ class RunConfig(object):
     if tf_config:
       logging.info('TF_CONFIG environment variable: %s', tf_config)
 
-    model_dir = _get_model_dir(tf_config, compat.path_to_str(model_dir))
+    model_dir = _get_model_dir(tf_config,
+                               compat_internal.path_to_str(model_dir))
 
     RunConfig._replace(
         self,
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 3ab0bd16fa..07382d93df 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -21,7 +21,6 @@ In addition to the functions below, `as_str` converts an object to a `str`.
 @@as_bytes
 @@as_text
 @@as_str_any
-@@path_to_str
 
 ## Types
 The compatibility module also provides the following types:
@@ -109,20 +108,6 @@ def as_str_any(value):
     return str(value)
 
 
-def path_to_str(path):
-  """Returns the file system path representation of a `PathLike` object, else as it is.
-
-  Args:
-    path: An object that can be converted to path representation.
-
-  Returns:
-    A `str` object.
-  """
-  if hasattr(path, "__fspath__"):
-    path = as_str_any(path.__fspath__())
-  return path
-
-
 # Numpy 1.8 scalars don't inherit from numbers.Integral in Python 3, so we
 # need to check them specifically.  The same goes from Real and Complex.
 integral_types = (_numbers.Integral, _np.integer)
diff --git a/tensorflow/python/util/compat_internal.py b/tensorflow/python/util/compat_internal.py
new file mode 100644
index 0000000000..a299b2fc3c
--- /dev/null
+++ b/tensorflow/python/util/compat_internal.py
@@ -0,0 +1,34 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functions for Python 2 vs. 3 compatibility that are private to TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+def path_to_str(path):
+  """Returns the file system path representation of a `PathLike` object, else as it is.
+
+  Args:
+    path: An object that can be converted to path representation.
+
+  Returns:
+    A `str` object.
+  """
+  if hasattr(path, "__fspath__"):
+    path = as_str_any(path.__fspath__())
+  return path
diff --git a/tensorflow/tools/api/golden/tensorflow.compat.pbtxt b/tensorflow/tools/api/golden/tensorflow.compat.pbtxt
index bab480ff9b..ccc6031400 100644
--- a/tensorflow/tools/api/golden/tensorflow.compat.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.compat.pbtxt
@@ -32,8 +32,4 @@ tf_module {
     name: "as_text"
     argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], "
   }
-  member_method {
-    name: "path_to_str"
-    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
-  }
 }
-- 
GitLab


From 21fd0369d932331dd9edc1cf4df437a452b4a335 Mon Sep 17 00:00:00 2001
From: "freedom\" Koan-Sin Tan" <koansin.tan@gmail.com>
Date: Thu, 25 Jan 2018 01:31:13 +0800
Subject: [PATCH 076/966] [tflite] make calling NNAPI work again (#16256)

calling PrepareOpsAndTensors() before using NN API looks
1. unnecessary
2. will decrease next_node_to_prepare_ so that the next
   next_node_to_prepare_ == nodes_and_registration_.size()
will fail
---
 tensorflow/contrib/lite/interpreter.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 5f5981e45a..69a597dc5a 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -291,7 +291,6 @@ TfLiteStatus Interpreter::Invoke() {
 
   TfLiteStatus status = kTfLiteOk;
   if (nnapi_delegate_) {
-    TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
     if (next_node_to_prepare_ == nodes_and_registration_.size()) {
       TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this));
       return kTfLiteOk;
-- 
GitLab


From 7b62a71e2d46c148df7d5704972f4592bc5e0f1b Mon Sep 17 00:00:00 2001
From: Ian Langmore <langmore@google.com>
Date: Wed, 24 Jan 2018 09:49:47 -0800
Subject: [PATCH 077/966] * BUGFIX:  See code associated with
 scale_identity_multiplier * BUGFIX:  See code associated with 'weight' inside
 sample_n * Use parameterization `temperature` rather than `mix_scale`. *
 Simplify documentation and link to arXiv paper for details * Document that we
 allow `temperature` to have any shape broadcastable with `mix_loc`.  This is
 a mute point to some degree since we require K = 2 now. * Add some tests

PiperOrigin-RevId: 183098275
---
 .../kernel_tests/vector_diffeomixture_test.py | 148 +++++++--
 .../python/ops/vector_diffeomixture.py        | 300 ++++++++----------
 2 files changed, 258 insertions(+), 190 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
index d292b04665..04f047aa0c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
@@ -27,6 +27,8 @@ from tensorflow.python.ops.linalg import linear_operator_diag as linop_diag_lib
 from tensorflow.python.ops.linalg import linear_operator_identity as linop_identity_lib
 from tensorflow.python.platform import test
 
+rng = np.random.RandomState(0)
+
 
 class VectorDiffeomixtureTest(
     test_util.VectorDistributionTestHelpers, test.TestCase):
@@ -37,7 +39,7 @@ class VectorDiffeomixtureTest(
       dims = 4
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
-          mix_scale=[1.],
+          temperature=[1.],
           distribution=normal_lib.Normal(0., 1.),
           loc=[
               None,
@@ -66,7 +68,7 @@ class VectorDiffeomixtureTest(
       dims = 4
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
-          mix_scale=[1.],
+          temperature=[1.],
           distribution=normal_lib.Normal(1., 1.5),
           loc=[
               None,
@@ -95,7 +97,7 @@ class VectorDiffeomixtureTest(
       dims = 4
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
-          mix_scale=[1.],
+          temperature=[1.],
           distribution=normal_lib.Normal(0., 1.),
           loc=[
               None,
@@ -122,12 +124,39 @@ class VectorDiffeomixtureTest(
       self.run_test_sample_consistent_log_prob(
           sess.run, vdm, radius=4., center=2., rtol=0.01)
 
+  def testSampleProbConsistentBroadcastMixTwoBatchDims(self):
+    dims = 4
+    loc_1 = rng.randn(2, 3, dims).astype(np.float32)
+
+    with self.test_session() as sess:
+      vdm = vdm_lib.VectorDiffeomixture(
+          mix_loc=(rng.rand(2, 3, 1) - 0.5).astype(np.float32),
+          temperature=[1.],
+          distribution=normal_lib.Normal(0., 1.),
+          loc=[
+              None,
+              loc_1,
+          ],
+          scale=[
+              linop_identity_lib.LinearOperatorScaledIdentity(
+                  num_rows=dims,
+                  multiplier=[np.float32(1.1)],
+                  is_positive_definite=True),
+          ] * 2,
+          validate_args=True)
+      # Ball centered at component0's mean.
+      self.run_test_sample_consistent_log_prob(
+          sess.run, vdm, radius=2., center=0., rtol=0.01)
+      # Larger ball centered at component1's mean.
+      self.run_test_sample_consistent_log_prob(
+          sess.run, vdm, radius=3., center=loc_1, rtol=0.02)
+
   def testMeanCovarianceNoBatch(self):
     with self.test_session() as sess:
       dims = 3
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
-          mix_scale=[10.],
+          temperature=[1 / 10.],
           distribution=normal_lib.Normal(0., 1.),
           loc=[
               np.float32([-2.]),
@@ -147,12 +176,94 @@ class VectorDiffeomixtureTest(
       self.run_test_sample_consistent_mean_covariance(
           sess.run, vdm, rtol=0.02, cov_rtol=0.08)
 
+  def testTemperatureControlsHowMuchThisLooksLikeDiscreteMixture(self):
+    # As temperature decreases, this should approach a mixture of normals, with
+    # components at -2, 2.
+    with self.test_session() as sess:
+      dims = 1
+      vdm = vdm_lib.VectorDiffeomixture(
+          mix_loc=[0.],
+          temperature=[[2.], [1.], [0.2]],
+          distribution=normal_lib.Normal(0., 1.),
+          loc=[
+              np.float32([-2.]),
+              np.float32([2.]),
+          ],
+          scale=[
+              linop_identity_lib.LinearOperatorScaledIdentity(
+                  num_rows=dims,
+                  multiplier=np.float32(0.5),
+                  is_positive_definite=True),
+          ] * 2,  # Use the same scale for each component.
+          quadrature_size=8,
+          validate_args=True)
+
+      samps = vdm.sample(10000)
+      self.assertAllEqual((10000, 3, 1), samps.shape)
+      samps_ = sess.run(samps).reshape(10000, 3)  # Make scalar event shape.
+
+      # One characteristic of a discrete mixture (as opposed to a "smear") is
+      # that more weight is put near the component centers at -2, 2, and thus
+      # less weight is put near the origin.
+      prob_of_being_near_origin = (np.abs(samps_) < 1).mean(axis=0)
+      self.assertGreater(
+          prob_of_being_near_origin[0], prob_of_being_near_origin[1])
+      self.assertGreater(
+          prob_of_being_near_origin[1], prob_of_being_near_origin[2])
+
+      # Run this test as well, just because we can.
+      self.run_test_sample_consistent_mean_covariance(
+          sess.run, vdm, rtol=0.02, cov_rtol=0.08)
+
+  def testConcentrationLocControlsHowMuchWeightIsOnEachComponent(self):
+    with self.test_session() as sess:
+      dims = 1
+      vdm = vdm_lib.VectorDiffeomixture(
+          mix_loc=[[-1.], [0.], [1.]],
+          temperature=[0.5],
+          distribution=normal_lib.Normal(0., 1.),
+          loc=[
+              np.float32([-2.]),
+              np.float32([2.]),
+          ],
+          scale=[
+              linop_identity_lib.LinearOperatorScaledIdentity(
+                  num_rows=dims,
+                  multiplier=np.float32(0.5),
+                  is_positive_definite=True),
+          ] * 2,  # Use the same scale for each component.
+          quadrature_size=8,
+          validate_args=True)
+
+      samps = vdm.sample(10000)
+      self.assertAllEqual((10000, 3, 1), samps.shape)
+      samps_ = sess.run(samps).reshape(10000, 3)  # Make scalar event shape.
+
+      # One characteristic of putting more weight on a component is that the
+      # mean is closer to that component's mean.
+      # Get the mean for each batch member, the names signify the value of
+      # concentration for that batch member.
+      mean_neg1, mean_0, mean_1 = samps_.mean(axis=0)
+
+      # Since concentration is the concentration for component 0,
+      # concentration = -1 ==> more weight on component 1, which has mean = 2
+      # concentration = 0 ==> equal weight
+      # concentration = 1 ==> more weight on component 0, which has mean = -2
+      self.assertLess(-2, mean_1)
+      self.assertLess(mean_1, mean_0)
+      self.assertLess(mean_0, mean_neg1)
+      self.assertLess(mean_neg1, 2)
+
+      # Run this test as well, just because we can.
+      self.run_test_sample_consistent_mean_covariance(
+          sess.run, vdm, rtol=0.02, cov_rtol=0.08)
+
   def testMeanCovarianceNoBatchUncenteredNonStandardBase(self):
     with self.test_session() as sess:
       dims = 3
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
-          mix_scale=[10.],
+          temperature=[0.1],
           distribution=normal_lib.Normal(-1., 1.5),
           loc=[
               np.float32([-2.]),
@@ -177,7 +288,7 @@ class VectorDiffeomixtureTest(
       dims = 3
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
-          mix_scale=[10.],
+          temperature=[0.1],
           distribution=normal_lib.Normal(0., 1.),
           loc=[
               np.float32([[-2.]]),
@@ -205,7 +316,7 @@ class VectorDiffeomixtureTest(
       dims = 4
       vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[0.],
-          mix_scale=[1.],
+          temperature=[0.1],
           distribution=normal_lib.Normal(0., 1.),
           loc=[
               None,
@@ -229,29 +340,6 @@ class VectorDiffeomixtureTest(
       self.run_test_sample_consistent_log_prob(
           sess.run, vdm, radius=4., center=2., rtol=0.005)
 
-  # TODO(jvdillon): We've tested that (i) .sample and .log_prob are consistent,
-  # (ii) .mean, .stddev etc... and .sample are consistent. However, we haven't
-  # tested that the quadrature approach well-approximates the integral.
-  #
-  # To that end, consider adding these tests:
-  #
-  # Test1: In the limit of high mix_scale, this approximates a discrete mixture,
-  # and there are many discrete mixtures where we can explicitly compute
-  # mean/var, etc... So test1 would choose one of those discrete mixtures and
-  # show our mean/var/etc... is close to that.
-  #
-  # Test2:  In the limit of low mix_scale, the a diffeomixture of Normal(-5, 1),
-  # Normal(5, 1) should (I believe...must check) should look almost like
-  # Uniform(-5, 5), and thus (i) .prob(x) should be about 1/10 for x in (-5, 5),
-  # and (ii) the first few moments should approximately match that of
-  # Uniform(-5, 5)
-  #
-  # Test3:  If mix_loc is symmetric, then for any mix_scale, our
-  # quadrature-based diffeomixture of Normal(-1, 1), Normal(1, 1) should have
-  # mean zero, exactly.
-
-  # TODO(jvdillon): Add more tests which verify broadcasting.
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index 7ce8a83fd9..0c747f8e68 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -50,20 +50,25 @@ __all__ = [
 
 
 def quadrature_scheme_softmaxnormal_gauss_hermite(
-    loc, scale, quadrature_size,
+    normal_loc, normal_scale, quadrature_size,
     validate_args=False, name=None):
   """Use Gauss-Hermite quadrature to form quadrature on `K - 1` simplex.
 
+  A `SoftmaxNormal` random variable `Y` may be generated via
+
+  ```
+  Y = SoftmaxCentered(X),
+  X = Normal(normal_loc, normal_scale)
+  ```
+
   Note: for a given `quadrature_size`, this method is generally less accurate
   than `quadrature_scheme_softmaxnormal_quantiles`.
 
   Args:
-    loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
-      Represents the `location` parameter of the SoftmaxNormal used for
-      selecting one of the `K` affine transformations.
-    scale: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
-      Represents the `scale` parameter of the SoftmaxNormal used for
-      selecting one of the `K` affine transformations.
+    normal_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
+      The location parameter of the Normal used to construct the SoftmaxNormal.
+    normal_scale: `float`-like `Tensor`. Broadcastable with `normal_loc`.
+      The scale parameter of the Normal used to construct the SoftmaxNormal.
     quadrature_size: Python `int` scalar representing the number of quadrature
       points.
     validate_args: Python `bool`, default `False`. When `True` distribution
@@ -80,24 +85,25 @@ def quadrature_scheme_softmaxnormal_gauss_hermite(
       associated with each grid point.
   """
   with ops.name_scope(name, "quadrature_scheme_softmaxnormal_gauss_hermite",
-                      [loc, scale]):
-    loc = ops.convert_to_tensor(loc, name="loc")
-    dt = loc.dtype.base_dtype
-    scale = ops.convert_to_tensor(scale, dtype=dt, name="scale")
+                      [normal_loc, normal_scale]):
+    normal_loc = ops.convert_to_tensor(normal_loc, name="normal_loc")
+    dt = normal_loc.dtype.base_dtype
+    normal_scale = ops.convert_to_tensor(
+        normal_scale, dtype=dt, name="normal_scale")
 
-    loc = maybe_check_quadrature_param(loc, "loc", validate_args)
-    scale = maybe_check_quadrature_param(scale, "scale", validate_args)
+    normal_scale = maybe_check_quadrature_param(
+        normal_scale, "normal_scale", validate_args)
 
     grid, probs = np.polynomial.hermite.hermgauss(deg=quadrature_size)
-    grid = grid.astype(loc.dtype.as_numpy_dtype)
-    probs = probs.astype(loc.dtype.as_numpy_dtype)
+    grid = grid.astype(dt.dtype.as_numpy_dtype)
+    probs = probs.astype(dt.dtype.as_numpy_dtype)
     probs /= np.linalg.norm(probs, ord=1, keepdims=True)
-    probs = ops.convert_to_tensor(probs, name="probs", dtype=loc.dtype)
+    probs = ops.convert_to_tensor(probs, name="probs", dtype=dt)
 
     grid = softmax(
         -distribution_util.pad(
-            (loc[..., array_ops.newaxis] +
-             np.sqrt(2.) * scale[..., array_ops.newaxis] * grid),
+            (normal_loc[..., array_ops.newaxis] +
+             np.sqrt(2.) * normal_scale[..., array_ops.newaxis] * grid),
             axis=-2,
             front=True),
         axis=-2)  # shape: [B, components, deg]
@@ -106,18 +112,23 @@ def quadrature_scheme_softmaxnormal_gauss_hermite(
 
 
 def quadrature_scheme_softmaxnormal_quantiles(
-    loc, scale, quadrature_size,
+    normal_loc, normal_scale, quadrature_size,
     validate_args=False, name=None):
   """Use SoftmaxNormal quantiles to form quadrature on `K - 1` simplex.
 
+  A `SoftmaxNormal` random variable `Y` may be generated via
+
+  ```
+  Y = SoftmaxCentered(X),
+  X = Normal(normal_loc, normal_scale)
+  ```
+
   Args:
-    loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
-      Represents the `location` parameter of the SoftmaxNormal used for
-      selecting one of the `K` affine transformations.
-    scale: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
-      Represents the `scale` parameter of the SoftmaxNormal used for
-      selecting one of the `K` affine transformations.
-    quadrature_size: Python scalar `int` representing the number of quadrature
+    normal_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
+      The location parameter of the Normal used to construct the SoftmaxNormal.
+    normal_scale: `float`-like `Tensor`. Broadcastable with `normal_loc`.
+      The scale parameter of the Normal used to construct the SoftmaxNormal.
+    quadrature_size: Python `int` scalar representing the number of quadrature
       points.
     validate_args: Python `bool`, default `False`. When `True` distribution
       parameters are checked for validity despite possibly degrading runtime
@@ -132,15 +143,17 @@ def quadrature_scheme_softmaxnormal_quantiles(
     probs:  Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
       associated with each grid point.
   """
-  with ops.name_scope(name, "softmax_normal_grid_and_probs", [loc, scale]):
-    loc = ops.convert_to_tensor(loc, name="loc")
-    dt = loc.dtype.base_dtype
-    scale = ops.convert_to_tensor(scale, dtype=dt, name="scale")
+  with ops.name_scope(name, "softmax_normal_grid_and_probs",
+                      [normal_loc, normal_scale]):
+    normal_loc = ops.convert_to_tensor(normal_loc, name="normal_loc")
+    dt = normal_loc.dtype.base_dtype
+    normal_scale = ops.convert_to_tensor(
+        normal_scale, dtype=dt, name="normal_scale")
 
-    loc = maybe_check_quadrature_param(loc, "loc", validate_args)
-    scale = maybe_check_quadrature_param(scale, "scale", validate_args)
+    normal_scale = maybe_check_quadrature_param(
+        normal_scale, "normal_scale", validate_args)
 
-    dist = normal_lib.Normal(loc=loc, scale=scale)
+    dist = normal_lib.Normal(loc=normal_loc, scale=normal_scale)
 
     def _get_batch_ndims():
       """Helper to get dist.batch_shape.ndims, statically if possible."""
@@ -195,114 +208,51 @@ def quadrature_scheme_softmaxnormal_quantiles(
 class VectorDiffeomixture(distribution_lib.Distribution):
   """VectorDiffeomixture distribution.
 
-  The VectorDiffeomixture is an approximation to a [compound distribution](
-  https://en.wikipedia.org/wiki/Compound_probability_distribution), i.e.,
+  A vector diffeomixture (VDM) is a distribution parameterized by a convex
+  combination of `K` component `loc` vectors, `loc[k], k = 0,...,K-1`, and `K`
+  `scale` matrices `scale[k], k = 0,..., K-1`.  It approximates the following
+  [compound distribution]
+  (https://en.wikipedia.org/wiki/Compound_probability_distribution)
 
   ```none
-  p(x) = int_{X} q(x | v) p(v) dv
-       = lim_{Q->infty} sum{ prob[i] q(x | loc=sum_k^K lambda[k;i] loc[k],
-                                            scale=sum_k^K lambda[k;i] scale[k])
-                            : i=0, ..., Q-1 }
+  p(x) = int p(x | z) p(z) dz,
+  where z is in the K-simplex, and
+  p(x | z) := p(x | loc=sum_k z[k] loc[k], scale=sum_k z[k] scale[k])
   ```
 
-  where `q(x | v)` is a vector version of the `distribution` argument and `p(v)`
-  is a SoftmaxNormal parameterized by `mix_loc` and `mix_scale`. The
-  vector-ization of `distribution` entails an affine transformation of iid
-  samples from `distribution`.  The `prob` term is from quadrature and
-  `lambda[k] = sigmoid(mix_loc[k] + sqrt(2) mix_scale[k] grid[k])` where the
-  `grid` points correspond to the `prob`s.
-
-  In the non-approximation case, a draw from the mixture distribution (the
-  "prior") represents the convex weights for different affine transformations.
-  I.e., draw a mixing vector `v` (from the `K-1`-simplex) and let the final
-  sample be: `y = (sum_k^K v[k] scale[k]) @ x + (sum_k^K v[k] loc[k])` where `@`
-  denotes matrix multiplication.  However, the non-approximate distribution does
-  not have an analytical probability density function (pdf). Therefore the
-  `VectorDiffeomixture` class implements an approximation based on
-  [numerical quadrature](
-  https://en.wikipedia.org/wiki/Numerical_integration) (default:
-  [Gauss--Hermite quadrature](
-  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)). I.e., in
-  Note: although the `VectorDiffeomixture` is approximately the
-  `SoftmaxNormal-Distribution` compound distribution, it is itself a valid
-  distribution. It possesses a `sample`, `log_prob`, `mean`, `covariance` which
-  are all mutually consistent.
-
-  #### Intended Use
-
-  This distribution is noteworthy because it implements a mixture of
-  `Vector`-ized distributions yet has samples differentiable in the
-  distribution's parameters (aka "reparameterized"). It has an analytical
-  density function with `O(dKQ)` complexity. `d` is the vector dimensionality,
-  `K` is the number of components, and `Q` is the number of quadrature points.
-  These properties make it well-suited for Bayesian Variational Inference, i.e.,
-  as a surrogate family for the posterior.
-
-  For large values of `mix_scale`, the `VectorDistribution` behaves increasingly
-  like a discrete mixture. (In most cases this limit is only achievable by also
-  increasing the quadrature polynomial degree, `Q`.)
-
-  The term `Vector` is consistent with similar named Tensorflow `Distribution`s.
-  For more details, see the "About `Vector` distributions in Tensorflow."
-  section.
-
-  The term `Diffeomixture` is a portmanteau of
-  [diffeomorphism](https://en.wikipedia.org/wiki/Diffeomorphism) and [compound
-  mixture](https://en.wikipedia.org/wiki/Compound_probability_distribution). For
-  more details, see the "About `Diffeomixture`s and reparametrization.`"
-  section.
-
-  #### Mathematical Details
-
-  The `VectorDiffeomixture` approximates a SoftmaxNormal-mixed ("prior")
-  [compound distribution](
-  https://en.wikipedia.org/wiki/Compound_probability_distribution).
-  Using variable-substitution and [numerical quadrature](
-  https://en.wikipedia.org/wiki/Numerical_integration) (default:
-  [Gauss--Hermite quadrature](
-  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)) we can
-  redefine the distribution to be a parameter-less convex combination of `K`
-  different affine combinations of a `d` iid samples from `distribution`.
-
-  That is, defined over `R**d` this distribution is parameterized by a
-  (batch of) length-`K` `mix_loc` and `mix_scale` vectors, a length-`K` list of
-  (a batch of) length-`d` `loc` vectors, and a length-`K` list of `scale`
-  `LinearOperator`s each operating on a (batch of) length-`d` vector space.
-  Finally, a `distribution` parameter specifies the underlying base distribution
-  which is "lifted" to become multivariate ("lifting" is the same concept as in
-  `TransformedDistribution`).
-
-  The probability density function (pdf) is,
+  The integral `int p(x | z) p(z) dz` is approximated with a quadrature scheme
+  adapted to the mixture density `p(z)`.  The `N` quadrature points `z_{N, n}`
+  and weights `w_{N, n}` (which are non-negative and sum to 1) are chosen
+  such that
 
-  ```none
-  pdf(y; mix_loc, mix_scale, loc, scale, phi)
-    = sum{ prob[i] phi(f_inverse(x; i)) / abs(det(interp_scale[i]))
-          : i=0, ..., Q-1 }
-  ```
+  ```q_N(x) := sum_{n=1}^N w_{n, N} p(x | z_{N, n}) --> p(x)```
 
-  where, `phi` is the base distribution pdf, and,
+  as `N --> infinity`.
 
-  ```none
-  f_inverse(x; i) = inv(interp_scale[i]) @ (x - interp_loc[i])
-  interp_loc[i]   = sum{ lambda[k; i] loc[k]   : k=0, ..., K-1 }
-  interp_scale[i] = sum{ lambda[k; i] scale[k] : k=0, ..., K-1 }
-  ```
+  Since `q_N(x)` is in fact a mixture (of `N` points), we may sample from
+  `q_N` exactly.  It is important to note that the VDM is *defined* as `q_N`
+  above, and *not* `p(x)`.  Therefore, sampling and pdf may be implemented as
+  exact (up to floating point error) methods.
 
-  and,
+  A common choice for the conditional `p(x | z)` is a multivariate Normal.
 
-  ```none
-  grid, weight = np.polynomial.hermite.hermgauss(quadrature_size)
-  prob[k]   = weight[k] / sqrt(pi)
-  lambda[k; i] = sigmoid(mix_loc[k] + sqrt(2) mix_scale[k] grid[i])
+  The implemented marginal `p(z)` is the `SoftmaxNormal`, which is a
+  `K-1` dimensional Normal transformed by a `SoftmaxCentered` bijector, making
+  it a density on the `K`-simplex.  That is,
+
+  ```
+  Z = SoftmaxCentered(X),
+  X = Normal(mix_loc / temperature, 1 / temperature)
   ```
 
-  The distribution corresponding to `phi` must be a scalar-batch, scalar-event
-  distribution. Typically it is reparameterized. If not, it must be a function
-  of non-trainable parameters.
+  The default quadrature scheme chooses `z_{N, n}` as `N` midpoints of
+  the quantiles of `p(z)` (generalized quantiles if `K > 2`).
 
-  WARNING: If you backprop through a VectorDiffeomixture sample and the "base"
-  distribution is both: not `FULLY_REPARAMETERIZED` and a function of trainable
-  variables, then the gradient is not guaranteed correct!
+  See [1] for more details.
+
+  [1]. "Quadrature Compound: An approximating family of distributions"
+       Joshua Dillon, Ian Langmore, arXiv preprints
+       https://arxiv.org/abs/1801.03080
 
   #### About `Vector` distributions in TensorFlow.
 
@@ -310,12 +260,11 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   particularly useful in [variational Bayesian
   methods](https://en.wikipedia.org/wiki/Variational_Bayesian_methods).
 
-  Conditioned on a draw from the SoftmaxNormal, `Y|v` is a vector whose
+  Conditioned on a draw from the SoftmaxNormal, `X|z` is a vector whose
   components are linear combinations of affine transformations, thus is itself
-  an affine transformation. Therefore `Y|v` lives in the vector space generated
-  by vectors of affine-transformed distributions.
+  an affine transformation.
 
-  Note: The marginals `Y_1|v, ..., Y_d|v` are *not* generally identical to some
+  Note: The marginals `X_1|v, ..., X_d|v` are *not* generally identical to some
   parameterization of `distribution`.  This is due to the fact that the sum of
   draws from `distribution` are not generally itself the same `distribution`.
 
@@ -331,12 +280,16 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   optimize Monte-Carlo objectives. Such objectives are a finite-sample
   approximation of an expectation and arise throughout scientific computing.
 
+  WARNING: If you backprop through a VectorDiffeomixture sample and the "base"
+  distribution is both: not `FULLY_REPARAMETERIZED` and a function of trainable
+  variables, then the gradient is not guaranteed correct!
+
   #### Examples
 
   ```python
   tfd = tf.contrib.distributions
 
-  # Create two batches of VectorDiffeomixtures, one with mix_loc=[0.] and
+  # Create two batches of VectorDiffeomixtures, one with mix_loc=[0.],
   # another with mix_loc=[1]. In both cases, `K=2` and the affine
   # transformations involve:
   # k=0: loc=zeros(dims)  scale=LinearOperatorScaledIdentity
@@ -344,7 +297,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   dims = 5
   vdm = tfd.VectorDiffeomixture(
       mix_loc=[[0.], [1]],
-      mix_scale=[1.],
+      temperature=[1.],
       distribution=tfd.Normal(loc=0., scale=1.),
       loc=[
           None,  # Equivalent to `np.zeros(dims, dtype=np.float32)`.
@@ -364,7 +317,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
 
   def __init__(self,
                mix_loc,
-               mix_scale,
+               temperature,
                distribution,
                loc=None,
                scale=None,
@@ -373,15 +326,24 @@ class VectorDiffeomixture(distribution_lib.Distribution):
                validate_args=False,
                allow_nan_stats=True,
                name="VectorDiffeomixture"):
-    """Constructs the VectorDiffeomixture on `R**d`.
+    """Constructs the VectorDiffeomixture on `R^d`.
+
+    The vector diffeomixture (VDM) approximates the compound distribution
+
+    ```none
+    p(x) = int p(x | z) p(z) dz,
+    where z is in the K-simplex, and
+    p(x | z) := p(x | loc=sum_k z[k] loc[k], scale=sum_k z[k] scale[k])
+    ```
 
     Args:
-      mix_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`. Represents
-        the `location` parameter of the SoftmaxNormal used for selecting one of
-        the `K` affine transformations.
-      mix_scale: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`.
-        Represents the `scale` parameter of the SoftmaxNormal used for selecting
-        one of the `K` affine transformations.
+      mix_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`.
+        In terms of samples, larger `mix_loc[..., k]` ==>
+        `Z` is more likely to put more weight on its `kth` component.
+      temperature: `float`-like `Tensor`. Broadcastable with `mix_loc`.
+        In terms of samples, smaller `temperature` means one component is more
+        likely to dominate.  I.e., smaller `temperature` makes the VDM look more
+        like a standard mixture of `K` components.
       distribution: `tf.Distribution`-like instance. Distribution from which `d`
         iid samples are used as input to the selected affine transformation.
         Must be a scalar-batch, scalar-event distribution.  Typically
@@ -401,8 +363,9 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         transformation. `LinearOperator`s must have shape `[B1, ..., Bb, d, d]`,
         `b >= 0`, i.e., characterizes `b`-batches of `d x d` matrices
       quadrature_size: Python `int` scalar representing number of
-        quadrature points.
-      quadrature_fn: Python callable taking `mix_loc`, `mix_scale`,
+        quadrature points.  Larger `quadrature_size` means `q_N(x)` better
+        approximates `p(x)`.
+      quadrature_fn: Python callable taking `normal_loc`, `normal_scale`,
         `quadrature_size`, `validate_args` and returning `tuple(grid, probs)`
         representing the SoftmaxNormal grid and corresponding normalized weight.
         normalized) weight.
@@ -430,7 +393,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       ValueError: if `not distribution.is_scalar_event`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[mix_loc, mix_scale]):
+    with ops.name_scope(name, values=[mix_loc, temperature]):
       if not scale or len(scale) < 2:
         raise ValueError("Must specify list (or list-like object) of scale "
                          "LinearOperators, one for each component with "
@@ -473,8 +436,15 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         raise NotImplementedError("Currently only bimixtures are supported; "
                                   "len(scale)={} is not 2.".format(len(scale)))
 
+      mix_loc = ops.convert_to_tensor(
+          mix_loc, dtype=dtype, name="mix_loc")
+      temperature = ops.convert_to_tensor(
+          temperature, dtype=dtype, name="temperature")
       self._grid, probs = tuple(quadrature_fn(
-          mix_loc, mix_scale, quadrature_size, validate_args))
+          mix_loc / temperature,
+          1. / temperature,
+          quadrature_size,
+          validate_args))
 
       # Note: by creating the logits as `log(prob)` we ensure that
       # `self.mixture_distribution.logits` is equivalent to
@@ -618,7 +588,14 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     weight = array_ops.gather(
         array_ops.reshape(self.grid, shape=[-1]),
         ids + offset)
-    weight = weight[..., array_ops.newaxis]
+    # At this point, weight flattened all batch dims into one.
+    # We also need to append a singleton to broadcast with event dims.
+    if self.batch_shape.is_fully_defined():
+      new_shape = [-1] + self.batch_shape.as_list() + [1]
+    else:
+      new_shape = array_ops.concat(
+          ([-1], self.batch_shape_tensor(), [1]), axis=0)
+    weight = array_ops.reshape(weight, shape=new_shape)
 
     if len(x) != 2:
       # We actually should have already triggered this exception. However as a
@@ -686,7 +663,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
 
     # To compute E[Cov(Z|V)], we'll add matrices within three categories:
     # scaled-identity, diagonal, and full. Then we'll combine these at the end.
-    scaled_identity = None
+    scale_identity_multiplier = None
     diag = None
     full = None
 
@@ -694,10 +671,12 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       s = aff.scale  # Just in case aff.scale has side-effects, we'll call once.
       if (s is None
           or isinstance(s, linop_identity_lib.LinearOperatorIdentity)):
-        scaled_identity = add(scaled_identity, p[..., k, array_ops.newaxis])
+        scale_identity_multiplier = add(scale_identity_multiplier,
+                                        p[..., k, array_ops.newaxis])
       elif isinstance(s, linop_identity_lib.LinearOperatorScaledIdentity):
-        scaled_identity = add(scaled_identity, (p[..., k, array_ops.newaxis] *
-                                                math_ops.square(s.multiplier)))
+        scale_identity_multiplier = add(
+            scale_identity_multiplier,
+            (p[..., k, array_ops.newaxis] * math_ops.square(s.multiplier)))
       elif isinstance(s, linop_diag_lib.LinearOperatorDiag):
         diag = add(diag, (p[..., k, array_ops.newaxis] *
                           math_ops.square(s.diag_part())))
@@ -709,12 +688,13 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         full = add(full, x)
 
     # We must now account for the fact that the base distribution might have a
-    # non-unity variance. Recall that `Cov(SX+m) = S.T Cov(X) S = S.T S Var(X)`.
+    # non-unity variance. Recall that, since X ~ iid Law(X_0),
+    #   `Cov(SX+m) = S Cov(X) S.T = S S.T Diag(Var(X_0))`.
     # We can scale by `Var(X)` (vs `Cov(X)`) since X corresponds to `d` iid
     # samples from a scalar-event distribution.
     v = self.distribution.variance()
-    if scaled_identity is not None:
-      scaled_identity *= v
+    if scale_identity_multiplier is not None:
+      scale_identity_multiplier *= v
     if diag is not None:
       diag *= v[..., array_ops.newaxis]
     if full is not None:
@@ -723,10 +703,10 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     if diag_only:
       # Apparently we don't need the full matrix, just the diagonal.
       r = add(diag, full)
-      if r is None and scaled_identity is not None:
+      if r is None and scale_identity_multiplier is not None:
         ones = array_ops.ones(self.event_shape_tensor(), dtype=self.dtype)
-        return scaled_identity * ones
-      return add(r, scaled_identity)
+        return scale_identity_multiplier[..., array_ops.newaxis] * ones
+      return add(r, scale_identity_multiplier)
 
     # `None` indicates we don't know if the result is positive-definite.
     is_positive_definite = (True if all(aff.scale.is_positive_definite
@@ -742,10 +722,10 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       to_add.append(linop_full_lib.LinearOperatorFullMatrix(
           matrix=full,
           is_positive_definite=is_positive_definite))
-    if scaled_identity is not None:
+    if scale_identity_multiplier is not None:
       to_add.append(linop_identity_lib.LinearOperatorScaledIdentity(
           num_rows=self.event_shape_tensor()[0],
-          multiplier=scaled_identity,
+          multiplier=scale_identity_multiplier,
           is_positive_definite=is_positive_definite))
 
     return (linop_add_lib.add_operators(to_add)[0].to_dense()
-- 
GitLab


From d9f93c42a50b1f1401d9c186eac0ae8dc9093c3b Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Wed, 24 Jan 2018 10:02:35 -0800
Subject: [PATCH 078/966] Merge changes from github.

PiperOrigin-RevId: 183100142
---
 RELEASE.md                                    |  174 ++-
 WORKSPACE                                     |    8 +-
 tensorflow/BUILD                              |    3 +
 tensorflow/c/eager/c_api.cc                   |    3 +-
 .../compiler/xla/tests/dynamic_ops_test.cc    |   28 +
 tensorflow/contrib/cmake/README.md            |    4 +-
 .../contrib/cmake/external/snappy.cmake       |    2 +-
 tensorflow/contrib/cmake/python_modules.txt   |    4 +
 .../contrib/cmake/python_sanity_test.py       |  124 ++
 .../contrib/cmake/tf_core_framework.cmake     |    2 +
 tensorflow/contrib/cmake/tf_python.cmake      |    9 +-
 .../cudnn_rnn/kernels/cudnn_rnn_ops.cc        |    2 +-
 .../cudnn_rnn/python/ops/cudnn_rnn_ops.py     |    2 +-
 .../contrib/eager/python/g3doc/guide.md       |    2 +-
 .../estimator/python/estimator/extenders.py   |    2 +-
 .../contrib/ffmpeg/default/ffmpeg_lib.cc      |    4 +-
 .../framework/python/ops/script_ops.py        |    1 +
 tensorflow/contrib/gan/python/train.py        |   16 +-
 .../contrib/layers/python/layers/layers.py    |    8 +-
 .../contrib/lite/download_dependencies.sh     |    9 +-
 tensorflow/contrib/lite/kernels/gather.cc     |    7 +-
 .../contrib/lite/kernels/gather_test.cc       |   27 +-
 .../lite/models/testdata/g3doc/README.md      |    4 +-
 .../contrib/lite/nnapi/NeuralNetworksShim.h   |    2 +-
 .../contrib/lite/testing/generate_examples.py |    2 +-
 .../lite/toco/g3doc/cmdline_reference.md      |    2 +-
 tensorflow/contrib/lite/tools/BUILD           |   21 +
 .../contrib/makefile/download_dependencies.sh |   13 +-
 tensorflow/contrib/opt/BUILD                  |   22 +
 tensorflow/contrib/opt/__init__.py            |    5 +-
 .../training/model_average_optimizer.py       |  299 ++++
 .../training/model_average_optimizer_test.py  |  200 +++
 tensorflow/contrib/periodic_resample/BUILD    |   18 +
 .../kernels/periodic_resample_op.h            |   11 +
 .../periodic_resample/ops/array_ops.cc        |   42 +-
 .../kernel_tests/periodic_resample_op_test.py |   16 +-
 .../python/kernel_tests/core_rnn_cell_test.py |    6 +
 .../rnn/python/kernel_tests/rnn_cell_test.py  |   95 ++
 tensorflow/contrib/rnn/python/ops/rnn_cell.py |  257 +++-
 .../kernel_tests/beam_search_decoder_test.py  |   88 ++
 .../seq2seq/python/ops/beam_search_decoder.py |   31 +-
 tensorflow/contrib/verbs/BUILD                |    6 +-
 tensorflow/contrib/verbs/README.md            |  174 ++-
 .../contrib/verbs/grpc_verbs_service.cc       |   12 +-
 .../verbs/patch_notes_verbs_with_0_copies.md  |   87 ++
 tensorflow/contrib/verbs/rdma.cc              | 1348 ++++++++++-------
 tensorflow/contrib/verbs/rdma.h               |  504 +++---
 tensorflow/contrib/verbs/rdma_mgr.cc          |  213 ++-
 tensorflow/contrib/verbs/rdma_mgr.h           |    1 +
 .../contrib/verbs/rdma_rendezvous_mgr.cc      |  114 +-
 tensorflow/contrib/verbs/verbs_server_lib.cc  |    1 +
 tensorflow/contrib/verbs/verbs_service.proto  |    6 +
 .../contrib/verbs/verbs_with_0_copies.png     |  Bin 0 -> 62862 bytes
 .../contrib/verbs/verbs_with_0_copies.xml     |    1 +
 .../verbs_with_0_copies_phase1_protocol.jpg   |  Bin 0 -> 88799 bytes
 .../verbs_with_0_copies_phase1_protocol.xml   |    1 +
 tensorflow/core/BUILD                         |    3 +
 tensorflow/core/framework/types.h             |    7 +
 tensorflow/core/graph/mkl_layout_pass.cc      |   58 +-
 tensorflow/core/kernels/BUILD                 |   17 +
 tensorflow/core/kernels/cuda_solvers.h        |    2 +-
 tensorflow/core/kernels/cwise_op_pow.cc       |    7 +-
 tensorflow/core/kernels/cwise_ops.h           |   35 +
 tensorflow/core/kernels/cwise_ops_common.cc   |    5 +
 tensorflow/core/kernels/decode_image_op.cc    |   39 +-
 tensorflow/core/kernels/matmul_op.cc          |    1 +
 tensorflow/core/kernels/mkl_aggregate_ops.cc  |   24 +
 tensorflow/core/kernels/mkl_concat_op.cc      |    4 +-
 .../core/kernels/mkl_conv_grad_filter_ops.cc  |    7 +
 .../core/kernels/mkl_conv_grad_input_ops.cc   |    7 +
 tensorflow/core/kernels/mkl_conv_ops.cc       |    7 +
 tensorflow/core/kernels/mkl_conv_ops.h        |   29 +-
 .../core/kernels/mkl_fused_batch_norm_op.cc   |  360 +++--
 .../core/kernels/mkl_input_conversion_op.cc   |    2 +-
 tensorflow/core/kernels/mkl_lrn_op.cc         |   34 +-
 tensorflow/core/kernels/mkl_maxpooling_op.cc  |   14 +-
 .../core/kernels/mkl_pooling_ops_common.h     |    4 +-
 tensorflow/core/kernels/mkl_reshape_op.cc     |   15 +-
 tensorflow/core/kernels/mkl_softmax_op.cc     |  163 ++
 tensorflow/core/kernels/pooling_ops_common.cc |    2 +-
 .../core/kernels/spectrogram_test_utils.cc    |   14 +
 .../core/kernels/transpose_functor_cpu.cc     |   12 +
 .../core/kernels/transpose_functor_gpu.cu.cc  |   21 +
 tensorflow/core/kernels/xent_op.cc            |   10 +-
 tensorflow/core/lib/gif/gif_io.cc             |   16 +-
 tensorflow/core/lib/gif/gif_io.h              |    3 +-
 tensorflow/core/ops/nn_ops.cc                 |    8 +
 tensorflow/core/platform/s3/aws_logging.cc    |    1 +
 tensorflow/core/platform/s3/s3_file_system.cc |   50 +-
 tensorflow/core/public/version.h              |    2 +-
 .../docs_src/api_guides/python/python_io.md   |   10 +-
 tensorflow/docs_src/install/install_c.md      |    2 +-
 tensorflow/docs_src/install/install_go.md     |    2 +-
 tensorflow/docs_src/install/install_java.md   |   22 +-
 tensorflow/docs_src/install/install_linux.md  |   22 +-
 tensorflow/docs_src/install/install_mac.md    |   10 +-
 .../docs_src/install/install_sources.md       |   14 +-
 .../tutorials/word2vec/word2vec_basic.py      |  104 +-
 .../kernel_tests/batch_dataset_op_test.py     |   22 +
 tensorflow/python/estimator/estimator.py      |    8 +-
 tensorflow/python/estimator/run_config.py     |    6 +-
 tensorflow/python/framework/constant_op.py    |    1 -
 tensorflow/python/kernel_tests/conv1d_test.py |   44 +-
 .../python/kernel_tests/cwise_ops_test.py     |   27 +
 .../python/kernel_tests/metrics_test.py       |   29 +-
 .../python/kernel_tests/xent_op_test.py       |   10 +
 tensorflow/python/layers/pooling.py           |   18 +-
 tensorflow/python/layers/pooling_test.py      |    8 +-
 tensorflow/python/lib/core/py_func.cc         |   19 +-
 tensorflow/python/ops/histogram_ops.py        |   65 +
 tensorflow/python/ops/histogram_ops_test.py   |   53 +
 tensorflow/python/ops/image_ops_impl.py       |   46 +-
 tensorflow/python/ops/image_ops_test.py       |   10 +
 tensorflow/python/ops/losses/losses_impl.py   |    2 +-
 tensorflow/python/ops/metrics_impl.py         |   56 +-
 tensorflow/python/ops/nn_impl.py              |    5 +-
 tensorflow/python/ops/nn_ops.py               |    9 +-
 tensorflow/python/ops/nn_test.py              |   12 +-
 tensorflow/python/ops/rnn_cell_impl.py        |    4 +
 tensorflow/python/summary/summary.py          |    5 +-
 tensorflow/python/util/compat.py              |   15 +
 .../tools/api/golden/tensorflow.compat.pbtxt  |    4 +
 ...sorflow.nn.rnn_cell.-dropout-wrapper.pbtxt |    4 +
 tensorflow/tools/api/golden/tensorflow.pbtxt  |    4 +
 .../tools/api/golden/tensorflow.summary.pbtxt |    2 +-
 tensorflow/tools/benchmark/BUILD              |    3 +-
 tensorflow/tools/benchmark/README.md          |    1 +
 .../tools/ci_build/builds/libtensorflow.sh    |    2 +-
 tensorflow/tools/ci_build/ci_sanity.sh        |   39 +-
 .../docker/parameterized_docker_build.sh      |    2 +-
 tensorflow/tools/pip_package/BUILD            |   21 -
 .../tools/pip_package/check_load_py_test.py   |    3 +
 .../tools/pip_package/pip_smoke_test.py       |    9 +-
 tensorflow/tools/pip_package/setup.py         |    2 +-
 tensorflow/tools/test/BUILD                   |    9 -
 tensorflow/tools/test/check_futures_test.py   |    2 +-
 tensorflow/workspace.bzl                      |    2 +
 third_party/git/git_configure.bzl             |    5 +
 third_party/toolchains/clang6/BUILD           |    1 +
 third_party/toolchains/clang6/CROSSTOOL.tpl   |  587 +++++++
 third_party/toolchains/clang6/README.md       |  101 ++
 third_party/toolchains/clang6/clang.BUILD     |  162 ++
 third_party/toolchains/clang6/repo.bzl        |   30 +
 143 files changed, 5265 insertions(+), 1416 deletions(-)
 create mode 100644 tensorflow/contrib/cmake/python_sanity_test.py
 create mode 100644 tensorflow/contrib/opt/python/training/model_average_optimizer.py
 create mode 100644 tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
 create mode 100644 tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md
 create mode 100644 tensorflow/contrib/verbs/verbs_with_0_copies.png
 create mode 100644 tensorflow/contrib/verbs/verbs_with_0_copies.xml
 create mode 100644 tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.jpg
 create mode 100644 tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.xml
 create mode 100644 tensorflow/core/kernels/mkl_softmax_op.cc
 create mode 100644 third_party/toolchains/clang6/BUILD
 create mode 100644 third_party/toolchains/clang6/CROSSTOOL.tpl
 create mode 100644 third_party/toolchains/clang6/README.md
 create mode 100644 third_party/toolchains/clang6/clang.BUILD
 create mode 100644 third_party/toolchains/clang6/repo.bzl

diff --git a/RELEASE.md b/RELEASE.md
index b24e83f053..fdf10407fd 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -13,46 +13,146 @@
 * [TensorFlow Lite](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/lite)
   dev preview is now available.
 * CUDA 9 and cuDNN 7 support.
+* Accelerated Linear Algebra (XLA):
+  * Add `complex64` support to XLA compiler.
+  * `bfloat` support is now added to XLA infrastructure.
+  * Make `ClusterSpec` propagation work with XLA devices.
+  * Use a determinisitic executor to generate XLA graph.
+* `tf.contrib`:
+  * `tf.contrib.distributions`:
+    * Add `tf.contrib.distributions.Autoregressive`.
+    * Make `tf.contrib.distributions` QuadratureCompound classes support batch
+    * Infer `tf.contrib.distributions.RelaxedOneHotCategorical` `dtype` from arguments.
+    * Make `tf.contrib.distributions` quadrature family parameterized by
+      `quadrature_grid_and_prob` vs `quadrature_degree`.
+    * `auto_correlation` added to `tf.contrib.distributions`
+  * Add `tf.contrib.bayesflow.layers`, a collection of probabilistic (neural) layers.
+  * Add `tf.contrib.bayesflow.halton_sequence`.
+  * Add `tf.contrib.data.make_saveable_from_iterator.`
+  * Add `tf.contrib.data.shuffle_and_repeat`.
+  * Add new custom transformation: `tf.contrib.data.scan()`.
+  * `tf.contrib.distributions.bijectors`:
+    * Add `tf.contrib.distributions.bijectors.MaskedAutoregressiveFlow`.
+    * Add `tf.contrib.distributions.bijectors.Permute`.
+    * Add `tf.contrib.distributions.bijectors.Gumbel`.
+    * Add `tf.contrib.distributions.bijectors.Reshape`.
+    * Support shape inference (i.e., shapes containing -1) in the Reshape bijector.
+* Add `streaming_precision_recall_at_equal_thresholds,` a method for computing
+  streaming precision and recall with `O(num_thresholds + size of predictions)`
+  time and space complexity.
+* Change `RunConfig` default behavior to not set a random seed, making random
+  behavior independently random on distributed workers. We expect this to
+  generally improve training performance. Models that do rely on determinism
+  should set a random seed explicitly.
+* Replaced the implementation of `tf.flags` with `absl.flags`.
+* Add support for `CUBLAS_TENSOR_OP_MATH` in fp16 GEMM
+* Add support for CUDA on NVIDIA Tegra devices
 
 ## Bug Fixes and Other Changes
-* `auto_correlation` added to `tf.contrib.distributions`.
-* Add `DenseFlipout` probabilistic layer.
-* Restandardize `DenseVariational` as simpler template for other probabilistic layers.
-* Make `tf.contrib.distributions` QuadratureCompound classes support batch.
+* Documentation updates:
+  * Clarified that you can only install TensorFlow on 64-bit machines.
+  * Added a short doc explaining how `Estimator`s save checkpoints.
+  * Add documentation for ops supported by the `tf2xla` bridge.
+  * Fix minor typos in the doc of `SpaceToDepth` and `DepthToSpace`.
+  * Updated documentation comments in `mfcc_mel_filterbank.h` and `mfcc.h` to
+    clarify that the input domain is squared magnitude spectra and the weighting
+    is done on linear magnitude spectra (sqrt of inputs).
+  * Change `tf.contrib.distributions` docstring examples to use `tfd` alias
+    rather than `ds`, `bs`.
+  * Fix docstring typos in `tf.distributions.bijectors.Bijector`.
+  * `tf.assert_equal` no longer raises `ValueError.` It now raises
+    `InvalidArgumentError,` as documented.
+  * Update Getting Started docs and API intro.
+* Google Cloud Storage (GCS):
+  * Add userspace DNS caching for the GCS client.
+  * Customize request timeouts for the GCS filesystem.
+  * Improve GCS filesystem caching.
+* Bug Fixes:
+  * Fix bug where partitioned integer variables got their wrong shapes. Before
+  * Fix correctness bug in CPU and GPU implementations of Adadelta.
+  * Fix a bug in `import_meta_graph`'s handling of partitioned variables when
+    importing into a scope. WARNING: This may break loading checkpoints of
+    graphs with partitioned variables saved after using `import_meta_graph` with
+    a non-empty `import_scope` argument.
+  * Fix bug in offline debugger which prevented viewing events.
+  * Added the `WorkerService.DeleteWorkerSession` method to the gRPC interface,
+    to fix a memory leak. Ensure that your master and worker servers are running
+    the same version of TensorFlow to avoid compatibility issues.
+  * Fix bug in peephole implementation of BlockLSTM cell.
+  * Fix bug by casting dtype of `log_det_jacobian` to match `log_prob` in
+    `TransformedDistribution`.
+  * Fix a bug in `import_meta_graph`'s handling of partitioned variables when
+  * Ensure `tf.distributions.Multinomial` doesn't underflow in `log_prob`.
+    Before this change, all partitions of an integer variable were initialized
+    with the shape of the unpartitioned variable; after this change they are
+    initialized correctly.
+* Other:
+  * Add necessary shape util support for bfloat16.
+  * Add a way to run ops using a step function to MonitoredSession.
+  * Add `DenseFlipout` probabilistic layer.
+  * A new flag `ignore_live_threads` is available on train. If set to `True`, it
+    will ignore threads that remain running when tearing down infrastructure
+    after successfully completing training, instead of throwing a RuntimeError.
+  * Restandardize `DenseVariational` as simpler template for other probabilistic
+    layers.
+  * `tf.data` now supports `tf.SparseTensor` components in dataset elements.
+  * It is now possible to iterate over `Tensor`s.
+  * Allow `SparseSegmentReduction` ops to have missing segment IDs.
+  * Modify custom export strategy to account for multidimensional sparse float
+    splits.
+  * `Conv2D`, `Conv2DBackpropInput`, `Conv2DBackpropFilter` now supports arbitrary
+    dilations with GPU and cuDNNv6 support.
+  * `Estimator` now supports `Dataset`: `input_fn` can return a `Dataset`
+    instead of `Tensor`s.
+  * Add `RevBlock`, a memory-efficient implementation of reversible residual layers.
+  * Reduce BFCAllocator internal fragmentation.
+  * Add `cross_entropy` and `kl_divergence` to `tf.distributions.Distribution`.
+  * Add `tf.nn.softmax_cross_entropy_with_logits_v2` which enables backprop
+    w.r.t. the labels.
+  * GPU back-end now uses `ptxas` to compile generated PTX.
+  * `BufferAssignment`'s protocol buffer dump is now deterministic.
+  * Change embedding op to use parallel version of `DynamicStitch`.
+  * Add support for sparse multidimensional feature columns.
+  * Speed up the case for sparse float columns that have only 1 value.
+  * Allow sparse float splits to support multivalent feature columns.
+  * Add `quantile` to `tf.distributions.TransformedDistribution`.
+  * Add `NCHW_VECT_C` support for `tf.depth_to_space` on GPU.
+  * Add `NCHW_VECT_C` support for `tf.space_to_depth` on GPU.
+
+## API Changes
+* Rename `SqueezeDims` attribute to `Axis` in C++ API for Squeeze op.
 * `Stream::BlockHostUntilDone` now returns Status rather than bool.
-* Customize request timeouts for the GCS filesystem.
+* Minor refactor: move stats files from `stochastic` to `common` and remove
+  `stochastic`.
 
 ## Thanks to our Contributors
 
 This release contains contributions from many people at Google, as well as:
 
-4d55397500, Abdullah Alrasheed, abenmao, Adam Salvail, Aditya Dhulipala, Ag Ramesh,
-Akimasa Kimura, Alan Du, Alan Yee, Alexander, Amit Kushwaha, Amy, Andrei Costinescu,
-Andrei Nigmatulin, Andrew Erlichson, Andrew Myers, Andrew Stepanov, Androbin, AngryPowman,
-Anish Shah, Anton Daitche, Artsiom Chapialiou, asdf2014, Aseem Raj Baranwal, Ash Hall,
-Bart Kiers, Batchu Venkat Vishal, ben, Ben Barsdell, Bill Piel, Carl Thomé, Catalin Voss,
-Changming Sun, Chengzhi Chen, Chi Zeng, Chris Antaki, Chris Donahue, Chris Oelmueller,
-Chris Tava, Clayne Robison, Codrut, Courtial Florian, Dalmo Cirne, Dan J, Darren Garvey,
-David Kristoffersson, David Norman, David RöThlisberger, DavidNorman, Dhruv, DimanNe,
-Dorokhov, Duncan Mac-Vicar P, EdwardDixon, EMCP, error.d, FAIJUL, Fan Xia,
-Francois Xavier, Fred Reiss, Freedom" Koan-Sin Tan, Fritz Obermeyer, Gao, Xiang,
-Guenther Schmuelling, Guo Yejun (郭叶军), Hans Gaiser, HectorSVC, Hyungsuk Yoon,
-James Pruegsanusak, Jay Young, Jean Wanka, Jeff Carpenter, Jeremy Rutman, Jeroen BéDorf,
-Jett Jones, Jimmy Jia, jinghuangintel, jinze1994, JKurland, Joel Hestness, joetoth,
-John B Nelson, John Impallomeni, John Lawson, Jonas, Jonathan Dekhtiar, joshkyh, Jun Luan,
-Jun Mei, Kai Sasaki, Karl Lessard, karl@kubx.ca, Kb Sriram, Kenichi Ueno, Kevin Slagle,
-Kongsea, Lakshay Garg, lhlmgr, Lin Min, liu.guangcong, Loki Der Quaeler, Louie Helm,
-lucasmoura, Luke Iwanski, Lyndon White, Mahmoud Abuzaina, Marcel Puyat, Mark Aaron Shirley,
-Michele Colombo, MtDersvan, Namrata-Ibm, Nathan Luehr, Naurril, Nayana Thorat, Nicolas Lopez,
-Niranjan Hasabnis, Nolan Liu, Nouce, Oliver Hennigh, osdamv, Patrik Erdes,
-Patryk Chrabaszcz, Pavel Christof, Penghao Cen, postBG, Qingqing Cao, Qingying Chen, qjivy,
-Raphael, Rasmi, raymondxyang, Renze Yu, resec, Roffel, Ruben Vereecken, Ryohei Kuroki,
-sandipmgiri, Santiago Castro, Scott Kirkland, Sean Vig, Sebastian Raschka, Sebastian Weiss,
-Sergey Kolesnikov, Sergii Khomenko, Shahid, Shivam Kotwalia, Stuart Berg, Sumit Gouthaman,
-superzerg, Sven Mayer, tetris, Ti Zhou, Tiago Freitas Pereira, Tian Jin, Tomoaki Oiki,
-Vaibhav Sood, vfdev, Vivek Rane, Vladimir Moskva, wangqr, Weber Xie, Will Frey,
-Yan Facai (颜发才), yanivbl6, Yaroslav Bulatov, Yixing Lao, Yong Tang, youkaichao,
-Yuan (Terry) Tang, Yue Zhang, Yuxin Wu, Ziming Dong, ZxYuan, 黄璞
+Adam Zahran, Ag Ramesh, Alan Lee, Alan Yee, Alex Sergeev, Alexander, Amir H. Jadidinejad,
+Amy, Anastasios Doumoulakis, Andrei Costinescu, Andrei Nigmatulin, Anthony Platanios,
+Anush Elangovan, arixlin, Armen Donigian, ArtëM Sobolev, Atlas7, Ben Barsdell, Bill Prin,
+Bo Wang, Brett Koonce, Cameron Thomas, Carl Thomé, Cem Eteke, cglewis, Changming Sun,
+Charles Shenton, Chi-Hung, Chris Donahue, Chris Filo Gorgolewski, Chris Hoyean Song,
+Chris Tava, Christian Grail, Christoph Boeddeker, cinqS, Clayne Robison, codrut3, concerttttt,
+CQY, Dan Becker, Dan Jarvis, Daniel Zhang, David Norman, dmaclach, Dmitry Trifonov,
+Donggeon Lim, dongpilYu, Dr. Kashif Rasul, Edd Wilder-James, Eric Lv, fcharras, Felix Abecassis,
+FirefoxMetzger, formath, FredZhang, Gaojin Cao, Gary Deer, Guenther Schmuelling, Hanchen Li,
+Hanmin Qin, hannesa2, hyunyoung2, Ilya Edrenkin, Jackson Kontny, Jan, Javier Luraschi,
+Jay Young, Jayaram Bobba, Jeff, Jeff Carpenter, Jeremy Sharpe, Jeroen BéDorf, Jimmy Jia,
+Jinze Bai, Jiongyan Zhang, Joe Castagneri, Johan Ju, Josh Varty, Julian Niedermeier,
+JxKing, Karl Lessard, Kb Sriram, Keven Wang, Koan-Sin Tan, Kyle Mills, lanhin, LevineHuang,
+Loki Der Quaeler, Loo Rong Jie, Luke Iwanski, LáSzló Csomor, Mahdi Abavisani, Mahmoud Abuzaina,
+ManHyuk, Marek ŠUppa, MathSquared, Mats Linander, Matt Wytock, Matthew Daley, Maximilian Bachl,
+mdymczyk, melvyniandrag, Michael Case, Mike Traynor, miqlas, Namrata-Ibm, Nathan Luehr,
+Nathan Van Doorn, Noa Ezra, Nolan Liu, Oleg Zabluda, opensourcemattress, Ouwen Huang,
+Paul Van Eck, peisong, Peng Yu, PinkySan, pks, powderluv, Qiao Hai-Jun, Qiao Longfei,
+Rajendra Arora, Ralph Tang, resec, Robin Richtsfeld, Rohan Varma, Ryohei Kuroki, SaintNazaire,
+Samuel He, Sandeep Dcunha, sandipmgiri, Sang Han, scott, Scott Mudge, Se-Won Kim, Simon Perkins,
+Simone Cirillo, Steffen Schmitz, Suvojit Manna, Sylvus, Taehoon Lee, Ted Chang, Thomas Deegan,
+Till Hoffmann, Tim, Toni Kunic, Toon Verstraelen, Tristan Rice, Urs KöSter, Utkarsh Upadhyay,
+Vish (Ishaya) Abrams, Winnie Tsang, Yan Chen, Yan Facai (颜发才), Yi Yang, Yong Tang,
+Youssef Hesham, Yuan (Terry) Tang, Zhengsheng Wei, zxcqwe4906, 张志豪, 田传武 
 
 We are also grateful to all who filed issues or helped resolve them, asked and
 answered questions, and were part of inspiring discussions.
@@ -60,7 +160,15 @@ answered questions, and were part of inspiring discussions.
 # Release 1.4.1
 
 ## Bug Fixes and Other Changes
-* `LinearClassifier` fix for the Google Cloud Machine Learning Engine.
+* `LinearClassifier` fix.
+
+# Release 1.4.0
+
+## Major Features And Improvements
+* `tf.keras` is now part of the core TensorFlow API.
+* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of
+  the core TensorFlow API.
+  * The API is now subject to backwards compatibility guarantees.
 
 # Release 1.4.0
 
diff --git a/WORKSPACE b/WORKSPACE
index b40913801b..7ae39374f1 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -2,11 +2,11 @@ workspace(name = "org_tensorflow")
 
 http_archive(
     name = "io_bazel_rules_closure",
-    sha256 = "110fe68753413777944b473c25eed6368c4a0487cee23a7bac1b13cc49d3e257",
-    strip_prefix = "rules_closure-4af89ef1db659eb41f110df189b67d4cf14073e1",
+    sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657",
+    strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/4af89ef1db659eb41f110df189b67d4cf14073e1.tar.gz",
-        "https://github.com/bazelbuild/rules_closure/archive/4af89ef1db659eb41f110df189b67d4cf14073e1.tar.gz",  # 2017-08-28
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",
+        "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",  # 2018-01-16
     ],
 )
 
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 1bf7c741b7..2ea0e38c78 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -662,6 +662,9 @@ filegroup(
         "//tensorflow/tools/quantization:all_files",
         "//tensorflow/tools/test:all_files",
         "//tensorflow/user_ops:all_files",
+        "//third_party/eigen3:all_files",
+        "//third_party/fft2d:all_files",
+        "//third_party/flatbuffers:all_files",
         "//third_party/hadoop:all_files",
         "//third_party/sycl:all_files",
         "//third_party/sycl/sycl:all_files",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index e4e33016c2..ecba8c0109 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -190,6 +190,7 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
   bool is_same_device =
       (srcd == dstd) || (DeviceName(srcd) == DeviceName(dstd));
   const bool dst_cpu = IsCPU(dstd);
+  const bool src_cpu = IsCPU(srcd);
   if (is_same_device) {
     return new TFE_TensorHandle(h->t, dst_cpu ? nullptr : dstd);
   }
@@ -213,7 +214,7 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
     return new TFE_TensorHandle(dst, dst_cpu ? nullptr : dstd);
   }
   tensorflow::DeviceContext* src_device_context = nullptr;
-  if (!IsCPU(srcd)) {
+  if (!src_cpu) {
     src_device_context = srcd->tensorflow_gpu_device_info()->default_context;
   }
   tensorflow::DeviceContext* dst_device_context = nullptr;
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index ae3f887240..877dc7db0e 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -595,6 +595,11 @@ XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousSingleElement) {
   // Single element, no wrap.
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<float>(operand_shape, /*index=*/1, /*size=*/1);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousSingleElementBF16) {
+  // Single element, no wrap.
+  std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<bfloat16>(operand_shape, /*index=*/1, /*size=*/1);
 }
 
@@ -602,6 +607,11 @@ XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleElements) {
   // Multiple element, no wrap.
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<float>(operand_shape, /*index=*/1, /*size=*/2);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleElementsBF16) {
+  // Multiple element, no wrap.
+  std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<bfloat16>(operand_shape, /*index=*/1, /*size=*/2);
 }
 
@@ -609,6 +619,11 @@ XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleWrapping) {
   // Multiple element, wrapping.
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<float>(operand_shape, /*index=*/3, /*size=*/2);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleWrappingBF16) {
+  // Multiple element, wrapping.
+  std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<bfloat16>(operand_shape, /*index=*/3, /*size=*/2);
 }
 
@@ -616,12 +631,21 @@ XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousTooLarge) {
   // Multiple element, update size larger than operand.
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<float>(operand_shape, /*index=*/5, /*size=*/2);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousTooLargeBF16) {
+  // Multiple element, update size larger than operand.
+  std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous<bfloat16>(operand_shape, /*index=*/5, /*size=*/2);
 }
 
 XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousUnaligned) {
   std::vector<int32> operand_shape({3, 123, 247});
   RunR3Contiguous<float>(operand_shape, /*index=*/1, /*size=*/1);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousUnalignedBF16) {
+  std::vector<int32> operand_shape({3, 123, 247});
   RunR3Contiguous<bfloat16>(operand_shape, /*index=*/1, /*size=*/1);
 }
 
@@ -629,6 +653,10 @@ XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousUnaligned) {
 XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_GPU(R3ContiguousLarger)) {
   std::vector<int32> operand_shape({32, 128, 1024});
   RunR3Contiguous<float>(operand_shape, /*index=*/7, /*size=*/1);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_GPU(R3ContiguousLargerBF16)) {
+  std::vector<int32> operand_shape({32, 128, 1024});
   RunR3Contiguous<bfloat16>(operand_shape, /*index=*/7, /*size=*/1);
 }
 
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 4be733a280..8f85a75ee4 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -30,7 +30,7 @@ bindings.
 
 * CMake version 3.5 or later.
 
-* [Git](http://git-scm.com)
+* [Git](https://git-scm.com)
 
 * [SWIG](http://www.swig.org/download.html)
 
@@ -48,7 +48,7 @@ bindings.
 
 * Microsoft Windows 10
   - Microsoft Visual Studio Enterprise 2015 with Visual C++ 2015
-  - [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.continuum.io/downloads)
+  - [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.anaconda.com/download/)
   - [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
   - [swigwin-3.0.10](http://www.swig.org/download.html)
   - [NVidia CUDA Toolkit 8.0](https://developer.nvidia.com/cuda-downloads)
diff --git a/tensorflow/contrib/cmake/external/snappy.cmake b/tensorflow/contrib/cmake/external/snappy.cmake
index 013b3a862f..fd57734298 100644
--- a/tensorflow/contrib/cmake/external/snappy.cmake
+++ b/tensorflow/contrib/cmake/external/snappy.cmake
@@ -47,4 +47,4 @@ ExternalProject_Add(snappy
 )
 
 # actually enables snappy in the source code
-add_definitions(-DTF_USE_SNAPPY)
\ No newline at end of file
+add_definitions(-DTF_USE_SNAPPY)
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index dec6c513ba..7db454bd83 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -1,3 +1,5 @@
+# python_sanity_test.py will complain about invalid or missing entries
+# problematic entries can be commented for temporary whitelisting
 tensorflow
 tensorflow/core
 tensorflow/core/example
@@ -307,6 +309,8 @@ tensorflow/contrib/metrics
 tensorflow/contrib/metrics/python
 tensorflow/contrib/metrics/python/metrics
 tensorflow/contrib/metrics/python/ops
+tensorflow/contrib/mpi_collectives/python
+tensorflow/contrib/mpi_collectives/python/ops
 tensorflow/contrib/model_pruning
 tensorflow/contrib/model_pruning/examples
 tensorflow/contrib/model_pruning/examples/cifar10
diff --git a/tensorflow/contrib/cmake/python_sanity_test.py b/tensorflow/contrib/cmake/python_sanity_test.py
new file mode 100644
index 0000000000..3be5bd1b23
--- /dev/null
+++ b/tensorflow/contrib/cmake/python_sanity_test.py
@@ -0,0 +1,124 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Complain about invalid or missing entries in python_*.txt files.
+Problematic entries can be commented for temporary whitelisting.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+
+
+def abs_path(path):
+  root = os.path.dirname(__file__)
+
+  for _ in range(3):
+    root = os.path.join(root, os.pardir)
+
+  path = os.path.join(root, path)
+  path = os.path.abspath(path)
+  return path
+
+def read_entries(test):
+  with open(abs_path(test.entries_file), "r") as f:
+    lines = f.readlines()
+
+  lines = [line.strip() for line in lines]
+  lines = [line for line in lines if line]
+
+  test.entries = []
+  test.whitelist = []
+
+  for line in lines:
+    # line is comment
+    if line.startswith('#'):
+      line = line[1:].strip()
+      # whitelist entry
+      if line.startswith('tensorflow/'):
+        test.whitelist.append(line)
+    # line has comment -> strip comment
+    elif line.find('#') != -1:
+      line = line[:line.find('#')].strip()
+      test.entries.append(line)
+    else:
+      test.entries.append(line)
+
+def test_invalid_directories(test):
+  for entry in test.entries:
+    if not os.path.isdir(abs_path(entry)):
+      problem = "'" + test.entries_file + "' contains invalid '" + entry + "'"
+      solution = "Please remove the invalid entry (or add the missing directory)."
+      raise AssertionError(problem + "\n" + solution)
+
+def test_missing_directory(test, path):
+  if path in test.whitelist:
+    return
+
+  dir_exists = os.path.isdir(abs_path(path))
+  entry_exists = path in test.entries
+
+  if dir_exists and not entry_exists:
+    problem = "'" + test.entries_file + "' is missing '" + path + "'"
+    solution = "Please add the missing entry (comment to whitelist if needed)."
+    raise AssertionError(problem + "\n" + solution)
+
+
+class PythonModuleTest(unittest.TestCase):
+
+  def setUp(self):
+    self.entries_file = "tensorflow/contrib/cmake/python_modules.txt"
+    read_entries(self)
+
+  def testInvalidEntries(self):
+    test_invalid_directories(self)
+
+  def testMissingModules(self):
+    module_names = next(os.walk(abs_path("tensorflow/contrib")))[1]
+
+    for module_name in module_names:
+      path = "tensorflow/contrib/" + module_name
+
+      test_missing_directory(self, path + "/python")
+      test_missing_directory(self, path + "/python/ops")
+      test_missing_directory(self, path + "/python/kernels")
+      test_missing_directory(self, path + "/python/layers")
+
+
+class PythonProtoTest(unittest.TestCase):
+
+  def setUp(self):
+    self.entries_file = "tensorflow/contrib/cmake/python_protos.txt"
+    read_entries(self)
+
+  def testInvalidEntries(self):
+    test_invalid_directories(self)
+
+
+class PythonProtoCCTest(unittest.TestCase):
+
+  def setUp(self):
+    self.entries_file = "tensorflow/contrib/cmake/python_protos_cc.txt"
+    read_entries(self)
+
+  def testInvalidEntries(self):
+    test_invalid_directories(self)
+
+
+if __name__ == "__main__":
+  unittest.main()
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 24d7fb82a2..129c208ecd 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -126,7 +126,9 @@ endfunction()
 file(GLOB_RECURSE tf_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
     "${tensorflow_source_dir}/tensorflow/core/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/proto/*.proto"
+    "${tensorflow_source_dir}/tensorflow/contrib/tpu/proto/*.proto"
 )
+
 RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
     ${tensorflow_source_dir} ${tf_protos_cc_srcs}
 )
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index b62a031749..8862390d2b 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -126,7 +126,8 @@ STRING(REGEX REPLACE ";" "\\\\;" python_protos "${python_protos}")
 STRING(REGEX REPLACE "\n" ";" python_protos "${python_protos}")
 
 foreach(python_proto ${python_protos})
-  if(NOT python_proto MATCHES "\#")
+  if(NOT python_proto MATCHES "^\#")
+    STRING(REGEX REPLACE " *\#.*" "" python_proto "${python_proto}")
     if(NOT EXISTS "${tensorflow_source_dir}/${python_proto}")
       message(SEND_ERROR "Python proto directory not found: ${python_proto}")
     endif()
@@ -147,7 +148,8 @@ STRING(REGEX REPLACE ";" "\\\\;" python_protos_cc "${python_protos_cc}")
 STRING(REGEX REPLACE "\n" ";" python_protos_cc "${python_protos_cc}")
 
 foreach(python_proto_cc ${python_protos_cc})
-  if(NOT python_proto_cc MATCHES "\#")
+  if(NOT python_proto_cc MATCHES "^\#")
+    STRING(REGEX REPLACE " *\#.*" "" python_proto_cc "${python_proto_cc}")
     if(NOT EXISTS "${tensorflow_source_dir}/${python_proto_cc}")
       message(SEND_ERROR "Python proto CC directory not found: ${python_proto_cc}")
     endif()
@@ -209,7 +211,8 @@ STRING(REGEX REPLACE ";" "\\\\;" python_modules "${python_modules}")
 STRING(REGEX REPLACE "\n" ";" python_modules "${python_modules}")
 
 foreach(python_module ${python_modules})
-  if(NOT python_module MATCHES "\#")
+  if(NOT python_module MATCHES "^\#")
+    STRING(REGEX REPLACE " *\#.*" "" python_module "${python_module}")
     if(NOT EXISTS "${tensorflow_source_dir}/${python_module}")
       message(SEND_ERROR "Python module not found: ${python_module}")
     endif()
diff --git a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
index 6b0452e7af..ba9686e94e 100644
--- a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
@@ -649,7 +649,7 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
     }
     const int num_params_per_layer = num_params_ / num_layers / num_dirs;
     // Number of params applied on inputs. The rest are applied on recurrent
-    // hiddden states.
+    // hidden states.
     const int num_params_input_state = num_params_per_layer / 2;
     CHECK(num_params_ % (num_layers * num_dirs) == 0)
         << "Number of params is not a multiple of num_layers * num_dirs.";
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 5f6e3be28f..e87162f0ee 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -1542,7 +1542,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
       params: the parameter buffer created for this model.
       is_training: whether this operation will be used in training or inference.
     Returns:
-      output: the output sequuence.
+      output: the output sequence.
       output_h: the final state for h.
     """
     return _cudnn_rnn_no_input_c(
diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md
index 0095ffa0db..7eea93ce1f 100644
--- a/tensorflow/contrib/eager/python/g3doc/guide.md
+++ b/tensorflow/contrib/eager/python/g3doc/guide.md
@@ -292,7 +292,7 @@ def loss(weight, bias):
   error = prediction(training_inputs, weight, bias) - training_outputs
   return tf.reduce_mean(tf.square(error))
 
-# Function that returns the the derivative of loss with respect to
+# Function that returns the derivative of loss with respect to
 # weight and bias
 grad = tfe.gradients_function(loss)
 
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index 29c3c73585..c99bf8badb 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -100,7 +100,7 @@ def add_metrics(estimator, metric_fn):
 
 
 def clip_gradients_by_norm(optimizer, clip_norm):
-  """Returns an optimizer which clips gradients before appliying them.
+  """Returns an optimizer which clips gradients before applying them.
 
   Example:
 
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index 3c51deefbc..c85b1837ab 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -82,7 +82,9 @@ std::vector<string> FfmpegVideoCommandLine(const string& input_filename,
           "-probesize",
           StrCat(kDefaultProbeSize),
           "-loglevel",
-          "error",  // Print errors only.
+          // Info is needed to get the information about stream, etc.
+          // It is generated to a separate file, not stdout/stderr.
+          "info",
           "-hide_banner",  // Skip printing build options, version, etc.
           "-vcodec",
           "rawvideo",
diff --git a/tensorflow/contrib/framework/python/ops/script_ops.py b/tensorflow/contrib/framework/python/ops/script_ops.py
index fdbb77bbe4..5d269fefdc 100644
--- a/tensorflow/contrib/framework/python/ops/script_ops.py
+++ b/tensorflow/contrib/framework/python/ops/script_ops.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+
 """Script Language Operators. See the @{$python/script_ops} guide.
 
 @@py_func
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index a32ddd7a06..5d0ac93aec 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -279,14 +279,16 @@ def acgan_model(
     generator_inputs = _convert_tensor_or_l_or_d(generator_inputs)
     generated_data = generator_fn(generator_inputs)
   with variable_scope.variable_scope(discriminator_scope) as dis_scope:
-    (discriminator_gen_outputs, discriminator_gen_classification_logits
-    ) = _validate_acgan_discriminator_outputs(
-        discriminator_fn(generated_data, generator_inputs))
+    with ops.name_scope(dis_scope.name+'/generated/'):
+      (discriminator_gen_outputs, discriminator_gen_classification_logits
+      ) = _validate_acgan_discriminator_outputs(
+          discriminator_fn(generated_data, generator_inputs))
   with variable_scope.variable_scope(dis_scope, reuse=True):
-    real_data = ops.convert_to_tensor(real_data)
-    (discriminator_real_outputs, discriminator_real_classification_logits
-    ) = _validate_acgan_discriminator_outputs(
-        discriminator_fn(real_data, generator_inputs))
+    with ops.name_scope(dis_scope.name+'/real/'):
+      real_data = ops.convert_to_tensor(real_data)
+      (discriminator_real_outputs, discriminator_real_classification_logits
+      ) = _validate_acgan_discriminator_outputs(
+          discriminator_fn(real_data, generator_inputs))
   if check_shapes:
     if not generated_data.shape.is_compatible_with(real_data.shape):
       raise ValueError(
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index f3229a1605..ef2b673074 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -479,8 +479,12 @@ def batch_norm(inputs,
 
     Sergey Ioffe, Christian Szegedy
 
-  Can be used as a normalizer function for conv2d and fully_connected.
-
+  Can be used as a normalizer function for conv2d and fully_connected. The
+  normalization is over all but the last dimension if `data_format` is `NHWC`
+  and all but the second dimension if `data_format` is `NCHW`.  In case of a 2D
+  tensor this corresponds to the batch dimension, while in case of a 4D tensor this
+  corresponds to the batch and space dimensions.
+  
   Note: when training, the moving_mean and moving_variance need to be updated.
   By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
   need to be added as a dependency to the `train_op`. For example:
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index 362e5bee25..e1b7b3613a 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -22,7 +22,14 @@ cd "$SCRIPT_DIR/../../.."
 DOWNLOADS_DIR=tensorflow/contrib/lite/downloads
 BZL_FILE_PATH=tensorflow/workspace.bzl
 
-EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+# Ensure it is being run from repo root
+if [ ! -f $BZL_FILE_PATH ]; then
+  echo "Could not find ${BZL_FILE_PATH}":
+  echo "Likely you are not running this from the root directory of the repository.";
+  exit 1;
+fi
+
+EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
diff --git a/tensorflow/contrib/lite/kernels/gather.cc b/tensorflow/contrib/lite/kernels/gather.cc
index f8df797daf..0e4187d1ea 100644
--- a/tensorflow/contrib/lite/kernels/gather.cc
+++ b/tensorflow/contrib/lite/kernels/gather.cc
@@ -42,9 +42,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, positions->type, kTfLiteInt32);
   // Check that input and output types match.
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
-  // TODO(mgubin): only 1D positions are currently supported.
-  TF_LITE_ENSURE_EQ(context, NumDimensions(positions), 1);
+  // TODO(mgubin): only 0D or 1D positions are currently supported.
+  TF_LITE_ENSURE(context, NumDimensions(positions) <= 1);
   // TODO(mgubin): Only default axis == 0 is supported.
+  TF_LITE_ENSURE_EQ(context, params->axis, 0);
   // Check conditions for different types.
   switch (input->type) {
     case kTfLiteFloat32:
@@ -64,7 +65,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   }
   const int num_dimensions =
       NumDimensions(input) + NumDimensions(positions) - 1;
-  TF_LITE_ENSURE(context, params->axis < num_dimensions);
+  TF_LITE_ENSURE(context, params->axis <= num_dimensions);
   TfLiteIntArray* output_shape = TfLiteIntArrayCreate(num_dimensions);
   int output_index = 0;
   for (int i = 0; i < params->axis; ++i) {
diff --git a/tensorflow/contrib/lite/kernels/gather_test.cc b/tensorflow/contrib/lite/kernels/gather_test.cc
index 6343d3b4ef..658d977b8d 100644
--- a/tensorflow/contrib/lite/kernels/gather_test.cc
+++ b/tensorflow/contrib/lite/kernels/gather_test.cc
@@ -48,8 +48,8 @@ class GatherOpModel : public SingleOpModel {
     PopulateStringTensor(input_, data);
   }
 
-  void SetPositions(std::initializer_list<int32> data) {
-    PopulateTensor<int32>(positions_, data);
+  void SetPositions(std::initializer_list<int> data) {
+    PopulateTensor<int>(positions_, data);
   }
 
   std::vector<float> GetOutputFloat() { return ExtractVector<float>(output_); }
@@ -76,6 +76,29 @@ TEST(GatherOpTest, Shuffle) {
               ElementsAreArray(ArrayFloatNear({0.7, 0.8, -2, 0.2})));
 }
 
+TEST(GatherOpTest, Test0DIndex) {
+  GatherOpModel m({2, 2}, TensorType_FLOAT32, {});
+  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputFloat(),
+              ElementsAreArray(ArrayFloatNear({0.7, 0.8})));
+  EXPECT_THAT(m.GetOutputShape(),
+              ElementsAreArray({2}));
+}
+
+TEST(GatherOpTest, Test0DIndexWith0DResult) {
+  // 0D tensor is special case in current TFLite. Test it once to make sure
+  // existing workarounds are fine with it.
+  GatherOpModel m({3}, TensorType_FLOAT32, {});
+  m.SetInputFloat({1.0, 2.0, 3.0});
+  m.SetPositions({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputFloat(),
+              ElementsAreArray(ArrayFloatNear({2.0})));
+  EXPECT_TRUE(m.GetOutputShape().empty());
+}
+
 TEST(FloatGatherOpTest, Duplicate) {
   GatherOpModel m({1, 2, 2}, TensorType_FLOAT32, {2});
   m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/README.md b/tensorflow/contrib/lite/models/testdata/g3doc/README.md
index 667a588383..1c47e00aae 100644
--- a/tensorflow/contrib/lite/models/testdata/g3doc/README.md
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/README.md
@@ -53,7 +53,7 @@ with the corresponding parameters as shown in the figure.
 ### Automatic Speech Recognizer (ASR) Acoustic Model (AM)
 
 The acoustic model for automatic speech recognition is the neural network model
-for matching phonemes to the input autio features. It generates posterior
+for matching phonemes to the input audio features. It generates posterior
 probabilities of phonemes from speech frontend features (log-mel filterbanks).
 It has an input size of 320 (float), an output size of 42 (float), five LSTM
 layers and one fully connected layers with a Softmax activation function, with
@@ -68,7 +68,7 @@ for predicting the probability of a word given previous words in a sentence.
 It generates posterior probabilities of the next word based from a sequence of
 words. The words are encoded as indices in a fixed size dictionary.
 The model has two inputs both of size one (integer): the current word index and
-next word index, an output size of one (float): the log probability. It consits
+next word index, an output size of one (float): the log probability. It consists
 of three embedding layer, three LSTM layers, followed by a multiplication, a
 fully connected layers and an addition.
 The corresponding parameters as shown in the figure.
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 3cda4bcccc..7019c29959 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -370,7 +370,7 @@ enum {
    * Looks up items from a given tensor.
    *
    * Each item in the output is a raw copy of the corresponding item in
-   * the input “values”. If the the given “lookup” indices are out of bounds,
+   * the input “values”. If the given “lookup” indices are out of bounds,
    * the op will fail and an error will be reported.
    *
    * Inputs:
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 6204471e52..c225cd4f00 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1170,7 +1170,7 @@ def make_pad_tests(zip_path):
 def make_reshape_tests(zip_path):
   """Make a set of tests to do reshape."""
 
-  # Alll shapes below are suitable for tensors with 420 elements.
+  # All shapes below are suitable for tensors with 420 elements.
   test_parameters = [{
       "dtype": [tf.float32, tf.int32],
       "input_shape": [[3, 4, 5, 7], [4, 105], [21, 5, 2, 2], [420]],
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
index 4776741ab9..5e07795223 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
@@ -229,7 +229,7 @@ additional information about the multiple input arrays:
     well-formed quantized representation of these graphs. Such graphs should be
     fixed, but as a temporary work-around, setting this
     reorder_across_fake_quant flag allows the converter to perform necessary
-    graph transformaitons on them, at the cost of no longer faithfully matching
+    graph transformations on them, at the cost of no longer faithfully matching
     inference and training arithmetic.
 
 ### Logging flags
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 6e2d633765..20df905270 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -27,6 +27,27 @@ tf_cc_binary(
     ],
 )
 
+tf_cc_binary(
+    name = "benchmark_model",
+    srcs = ["benchmark_model.cc"],
+    linkopts = select({
+        "//tensorflow:android": [
+            "-pie",
+            "-landroid",
+            "-lm",
+            "-z defs",
+            "-Wl,--exclude-libs,ALL",  # Exclude syms in all libs from auto export
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":mutable_op_resolver",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+    ],
+)
+
 cc_library(
     name = "gen_op_registration",
     srcs = ["gen_op_registration.cc"],
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 0a47f50c43..4ae18b2cef 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -63,12 +63,17 @@ download_and_extract() {
   elif [[ "${url}" == *zip ]]; then
     tempdir=$(mktemp -d)
     tempdir2=$(mktemp -d)
-    wget -P ${tempdir} ${url}
-    unzip ${tempdir}/* -d ${tempdir2}
+    if [[ "$OSTYPE" == "darwin"* ]]; then
+      # macOS (AKA darwin) doesn't have wget.
+      (cd "${tempdir}"; curl --remote-name --silent --location "${url}")
+    else
+      wget -P "${tempdir}" "${url}"
+    fi
+    unzip "${tempdir}"/* -d "${tempdir2}"
     # unzip has no strip components, so unzip to a temp dir, and move the files
     # we want from the tempdir to destination.
-    cp -R ${tempdir2}/*/* ${dir}/
-    rm -rf ${tempdir2} ${tempdir}
+    cp -R "${tempdir2}"/*/* "${dir}"/
+    rm -rf "${tempdir2}" "${tempdir}"
   fi
 
   # Delete any potential BUILD files, which would interfere with Bazel builds.
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 9c961f2b9c..827279bd47 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -19,6 +19,7 @@ py_library(
         "python/training/elastic_average_optimizer.py",
         "python/training/external_optimizer.py",
         "python/training/lazy_adam_optimizer.py",
+        "python/training/model_average_optimizer.py",
         "python/training/moving_average_optimizer.py",
         "python/training/multitask_optimizer_wrapper.py",
         "python/training/nadam_optimizer.py",
@@ -193,6 +194,27 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "model_average_optimizer_test",
+    srcs = ["python/training/model_average_optimizer_test.py"],
+    additional_deps = [
+        ":opt_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//third_party/py/numpy",
+    ],
+    tags = [
+        "notap",  # This test launches local server.
+    ],
+)
+
 py_test(
     name = "sign_decay_test",
     srcs = ["python/training/sign_decay_test.py"],
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index 90d2f92462..6c1bb1adc0 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.opt.python.training.nadam_optimizer import *
 from tensorflow.contrib.opt.python.training.powersign import *
 from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
 from tensorflow.contrib.opt.python.training.elastic_average_optimizer import *
+from tensorflow.contrib.opt.python.training.model_average_optimizer import *
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -48,7 +49,9 @@ _allowed_symbols = [
     'MultitaskOptimizerWrapper',
     'clip_gradients_by_global_norm',
     'ElasticAverageOptimizer',
-    'ElasticAverageCustomGetter'
+    'ElasticAverageCustomGetter',
+    'ModelAverageOptimizer',
+    'ModelAverageCustomGetter'
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
new file mode 100644
index 0000000000..47509ecca6
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
@@ -0,0 +1,299 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Wrapper optimizer for Model Average """
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import constant_op
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import data_flow_ops
+
+GLOBAL_VARIABLE_NAME = 'global_center_variable'
+
+
+class ModelAverageCustomGetter(object):
+  """Custom_getter class is used to do:
+  1. Change trainable variables to local collection and place them at worker
+    device
+  2. Generate global variables
+    Notice that the class should be used with tf.replica_device_setter,
+    so that the global center variables and global step variable can be placed
+    at ps device. Besides, use 'tf.get_variable' instead of 'tf.Variable' to
+    use this custom getter.
+
+  For example,
+  ma_custom_getter = ModelAverageCustomGetter(worker_device)
+  with tf.device(
+    tf.train.replica_device_setter(
+      worker_device=worker_device,
+      ps_device="/job:ps/cpu:0",
+      cluster=cluster)),
+    tf.variable_scope('',custom_getter=ma_custom_getter):
+    hid_w = tf.get_variable(
+      initializer=tf.truncated_normal(
+          [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
+          stddev=1.0 / IMAGE_PIXELS),
+      name="hid_w")
+    hid_b = tf.get_variable(initializer=tf.zeros([FLAGS.hidden_units]),
+                            name="hid_b")
+  """
+
+  def __init__(self, worker_device):
+    """Create a new `ElasticAverageCustomGetter`.
+
+    Args:
+      worker_device: String.  Name of the `worker` job.
+    """
+    self._worker_device = worker_device
+    self._local_2_global = {}
+
+  def __call__(self, getter, name, trainable, collections, *args, **kwargs):
+    if trainable:
+      with ops.device(self._worker_device):
+        local_var = getter(name, trainable=True,
+                           collections=[ops.GraphKeys.LOCAL_VARIABLES],
+                           *args, **kwargs)
+
+      global_variable = variable_scope.variable(
+        name='%s/%s' % (GLOBAL_VARIABLE_NAME, name),
+        initial_value=local_var.initialized_value(),
+        trainable=False,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+
+      self._local_2_global[local_var] = global_variable
+      return local_var
+    else:
+      return getter(name, trainable, collections, *args, **kwargs)
+
+
+class ModelAverageOptimizer(optimizer.Optimizer):
+  """Wrapper optimizer that implements the Model Average algorithm.
+  This is a sync optimizer. During the training, each worker will update
+  the local variables and maintains its own local_step, which starts from 0
+  and is incremented by 1 after each update of local variables. Whenever the
+  interval_steps divides the local step, the local variables from all the
+  workers will be averaged and assigned to global center variables. Then the
+  local variables will be assigned by global center variables.
+  """
+
+  def __init__(
+      self,
+      opt,
+      num_worker,
+      is_chief,
+      ma_custom_getter,
+      interval_steps=100,
+      use_locking=True,
+      name="ModelAverageOptimizer"):
+    """Construct a new model average optimizer.
+
+    Args:
+      opt: The actual optimizer that will be used to update local variables
+      num_worker: The number of workers
+      is_chief: whether chief worker
+      ma_custom_getter: ModelAverageCustomGetter
+      interval_steps: An int point value to controls the frequency of the
+        average of local variables
+      use_locking: If True use locks for update operations
+      name: string. Optional name of the returned operation
+    """
+    super(ModelAverageOptimizer, self).__init__(use_locking, name)
+    self._opt = opt
+    self._num_worker = num_worker
+    self._is_chief = is_chief
+    self._local_2_global = ma_custom_getter._local_2_global
+    self._interval_steps = interval_steps
+    self._accumulator_list = []
+    self._chief_init_op = None
+
+    self._local_step = variable_scope.get_variable(
+      initializer=0,
+      trainable=False,
+      collections=[ops.GraphKeys.LOCAL_VARIABLES],
+      name="local_step")
+
+    self._opt._prepare()
+
+  def compute_gradients(self, *args, **kwargs):
+    """Compute gradients of "loss" for the variables in "var_list".
+
+    This simply wraps the compute_gradients() from the real optimizer.
+
+    Args:
+      *args: Arguments for compute_gradients().
+      **kwargs: Keyword arguments for compute_gradients().
+
+    Returns:
+      A list of (gradient, variable) pairs.
+    """
+    return self._opt.compute_gradients(*args, **kwargs)
+
+  def _local_vars_update(self, var_list):
+    """Get the update ops for the local variables in "var_list".
+
+    Args:
+      var_list: Optional list or tuple of 'tf.Variable' to update
+
+    Returns:
+      An update op
+    """
+    if not var_list:
+      raise ValueError(
+        'The list of local_variables should not be empty')
+    update_ops = []
+    global_center_vars = [self._local_2_global[var] for var in var_list]
+    for lvar, gvar in zip(var_list, global_center_vars):
+      with ops.device(lvar.device):
+        update_ops.append(state_ops.assign(lvar, gvar.read_value()))
+    return control_flow_ops.group(*(update_ops))
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    This contains most of the synchronization implementation and also wraps the
+    apply_gradients() from the real optimizer. The chief work updates global
+    variables.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        compute_gradients().
+      global_step: Optional Variable to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the Optimizer constructor.
+
+    Returns:
+      A conditional 'Operation' that update both local and global variables or
+      just local variables
+
+    Raises:
+      ValueError: If the grads_and_vars is empty.
+      ValueError: If global step is not provided, the staleness cannot be
+        checked.
+    """
+
+    # update local variables
+    if not grads_and_vars:
+      raise ValueError("Must supply at least one variable")
+    if global_step is None:
+      raise ValueError("Global step is required")
+
+    apply_updates = self._opt.apply_gradients(grads_and_vars)
+    with ops.control_dependencies([apply_updates]):
+      local_update = state_ops.assign_add(
+        self._local_step, 1, name='local_step_update').op
+
+    # update global variables.
+    def _Update_global_variables():
+      local_vars = [v for g, v in grads_and_vars if g is not None]
+      global_vars = [self._local_2_global[v] for v in local_vars]
+      # sync queue
+      with ops.colocate_with(global_step):
+        sync_queue = data_flow_ops.FIFOQueue(-1, [dtypes.bool], shapes=[[]],
+                                             shared_name='sync_queue')
+      train_ops = []
+      aggregated_vars = []
+      with ops.name_scope(None, self._name + '/global'):
+        for var, gvar in zip(local_vars, global_vars):
+          with ops.device(gvar.device):
+            if isinstance(var._ref(), ops.Tensor):
+              var_accum = data_flow_ops.ConditionalAccumulator(
+                var.dtype,
+                shape=var.get_shape(),
+                shared_name=gvar.name + "/var_accum")
+              train_ops.append(
+                var_accum.apply_grad(var._ref(), local_step=global_step))
+              aggregated_vars.append(var_accum.take_grad(self._num_worker))
+            else:
+              raise ValueError("Unknown local variable type!")
+            self._accumulator_list.append((var_accum, gvar.device))
+      # chief worker updates global vars and enqueues tokens to the sync queue
+      if self._is_chief:
+        update_ops = []
+        with ops.control_dependencies(train_ops):
+          for avg_var, gvar in zip(aggregated_vars, global_vars):
+            with ops.device(gvar.device):
+              update_ops.append(state_ops.assign(gvar, avg_var))
+          with ops.device(global_step.device):
+            update_ops.append(state_ops.assign_add(global_step, 1))
+        with ops.control_dependencies(update_ops), ops.device(
+            global_step.device):
+          tokens = array_ops.fill([self._num_worker - 1],
+                                  constant_op.constant(False))
+          sync_op = sync_queue.enqueue_many(tokens)
+      else:
+        with ops.control_dependencies(train_ops), ops.device(
+            global_step.device):
+          sync_op = sync_queue.dequeue()
+
+      with ops.control_dependencies([sync_op]):
+        local_update_op = self._local_vars_update(local_vars)
+      return local_update_op
+
+    with ops.control_dependencies([local_update]):
+      condition = math_ops.equal(math_ops.mod(
+        self._local_step, self._interval_steps), 0)
+      conditional_update = control_flow_ops.cond(
+        condition, _Update_global_variables, control_flow_ops.no_op)
+
+    chief_init_ops = []
+    for accum, dev in self._accumulator_list:
+      with ops.device(dev):
+        chief_init_ops.append(
+          accum.set_global_step(
+            global_step, name="SetGlobalStep"))
+    self._chief_init_op = control_flow_ops.group(*(chief_init_ops))
+
+    return conditional_update
+
+  def get_init_op(self):
+    """Returns the op to let all the local variables equal to the global
+     variables before the training begins"""
+    return self._local_vars_update(variables.trainable_variables())
+
+  def make_session_run_hook(self):
+    """Creates a hook to handle ModelAverage ops such as initialization."""
+    return _ModelAverageOptimizerHook(self, self._is_chief)
+
+
+class _ModelAverageOptimizerHook(session_run_hook.SessionRunHook):
+  def __init__(self, ma_optimizer, is_chief):
+    """Creates hook to handle ModelAverageOptimizer initialization ops.
+
+    Args:
+      ea_optimizer: `ModelAverageOptimizer` which this hook will initialize.
+      is_chief: `Bool`, whether is this a chief replica or not.
+    """
+    self._ma_optimizer = ma_optimizer
+    self._is_chief = is_chief
+
+  def begin(self):
+    self._local_init_op = variables.local_variables_initializer()
+    self._global_init_op = None
+    if self._is_chief:
+      self._global_init_op = variables.global_variables_initializer()
+      self._chief_init_op = self._ma_optimizer._chief_init_op
+    self._variable_init_op = self._ma_optimizer.get_init_op()
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
new file mode 100644
index 0000000000..a73aa772bb
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
@@ -0,0 +1,200 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ModelAverageOptimizer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import portpicker
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import server_lib
+from tensorflow.python.training import training
+from tensorflow.python.training import training_util
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import device_setter
+from tensorflow.contrib.opt.python.training.model_average_optimizer import \
+  ModelAverageOptimizer, ModelAverageCustomGetter, GLOBAL_VARIABLE_NAME
+
+
+def create_local_cluster(num_workers, num_ps, protocol="grpc"):
+  """Create local GRPC servers and return them."""
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
+  cluster_dict = {
+    "worker": ["localhost:%s" % port for port in worker_ports],
+    "ps": ["localhost:%s" % port for port in ps_ports]
+  }
+  cs = server_lib.ClusterSpec(cluster_dict)
+
+  workers = [
+    server_lib.Server(
+      cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+    for ix in range(num_workers)
+  ]
+  ps_servers = [
+    server_lib.Server(
+      cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+    for ix in range(num_ps)
+  ]
+
+  return cluster_dict, workers, ps_servers
+
+
+# Creates the workers and return their sessions, graphs, train_ops.
+# Cheif worker will update at last
+def _get_workers(num_workers, steps, workers):
+  sessions = []
+  graphs = []
+  train_ops = []
+  for worker_id in range(num_workers):
+    graph = ops.Graph()
+    is_chief = (worker_id == 0)
+    with graph.as_default():
+      worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
+      ma_coustom = ModelAverageCustomGetter(
+        worker_device=worker_device)
+      with variable_scope.variable_scope('',
+                                         custom_getter=ma_coustom), ops.device(
+        device_setter.replica_device_setter(worker_device=worker_device,
+                                            ps_device="/job:ps/task:0/cpu:0",
+                                            ps_tasks=1)):
+
+        global_step = variables.Variable(0, name='global_step',
+                                         trainable=False)
+        var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
+        var_1 = variable_scope.get_variable(initializer=1.0, name="v1")
+
+      with ops.device("/job:worker/task:" + str(worker_id)):
+        if worker_id == 0:
+          grads_0 = constant_op.constant(-1.0)
+          grads_1 = constant_op.constant(-1.0)
+        else:
+          grads_0 = constant_op.constant(-2.0)
+          grads_1 = constant_op.constant(-2.0)
+        sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
+        opt = ModelAverageOptimizer(
+          opt=sgd_opt,
+          num_worker=num_workers,
+          ma_custom_getter=ma_coustom,
+          is_chief=is_chief,
+          interval_steps=steps
+        )
+        train_op = [
+          opt.apply_gradients(
+            [[grads_0, var_0],
+             [grads_1, var_1]], global_step)
+        ]
+      easgd_hook = opt.make_session_run_hook()
+      # Creates MonitoredSession
+      sess = training.MonitoredTrainingSession(workers[worker_id].target,
+                                               hooks=[easgd_hook])
+
+    sessions.append(sess)
+    graphs.append(graph)
+    train_ops.append(train_op)
+  return sessions, graphs, train_ops
+
+
+class ModelAverageOptimizerTest(test.TestCase):
+  def _run(self, train_op, sess):
+    sess.run(train_op)
+
+  def test1Workers2Period(self):
+    num_workers = 2
+    steps = 2
+    num_ps = 1
+    cluster, workers, _ = create_local_cluster(num_workers=num_workers,
+                                               num_ps=num_ps)
+
+    sessions, graphs, train_ops = _get_workers(num_workers,
+                                               steps,
+                                               workers)
+
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    global_step = training_util.get_global_step(graphs[0])
+    global_var_0 = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
+    global_var_1 = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
+
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(global_var_0))
+    self.assertAllEqual(1.0, sessions[0].run(global_var_1))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    sessions[0].run(train_ops[0])
+    sessions[1].run(train_ops[1])
+
+    self.assertAllEqual(1.0, sessions[0].run(var_0))
+    self.assertAllEqual(2.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(global_var_0))
+    self.assertAllEqual(1.0, sessions[0].run(global_var_1))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    # iteration 2, global varibale update
+    thread_0 = self.checkedThread(
+      target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+      target=self._run, args=(train_ops[1], sessions[1]))
+    thread_0.start()
+    thread_1.start()
+    thread_0.join()
+    thread_1.join()
+
+    self.assertAllEqual(3.0, sessions[0].run(var_0))
+    self.assertAllEqual(4.0, sessions[0].run(var_1))
+    self.assertAllEqual(3.0, sessions[0].run(global_var_0))
+    self.assertAllEqual(4.0, sessions[0].run(global_var_1))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+    # iteration 3
+    sessions[0].run(train_ops[0])
+
+    self.assertAllEqual(4.0, sessions[0].run(var_0))
+    self.assertAllEqual(5.0, sessions[0].run(var_1))
+    self.assertAllEqual(3.0, sessions[0].run(global_var_0))
+    self.assertAllEqual(4.0, sessions[0].run(global_var_1))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+  def testPS2TasksWithClusterSpecClass(self):
+    cluster_spec = server_lib.ClusterSpec({
+      "ps": ["ps0:2222", "ps1:2222"],
+      "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    worker_device = "/job:worker/task:0"
+    ma_coustom = ModelAverageCustomGetter(
+      worker_device=worker_device)
+    from tensorflow.python.training import device_setter
+    with ops.device(
+        device_setter.replica_device_setter(cluster=cluster_spec,
+                                            worker_device=worker_device,
+                                            ps_device="/job:ps")), \
+         variable_scope.variable_scope('', custom_getter=ma_coustom):
+      v = variable_scope.get_variable(initializer=[1, 2], name="v")
+      w = variable_scope.get_variable(initializer=[2, 1], name='w')
+      v_g, w_g = ma_coustom._local_2_global[v], ma_coustom._local_2_global[w]
+      self.assertDeviceEqual("/job:worker/task:0", v.device)
+      self.assertDeviceEqual("job:ps/task:0", v_g.device)
+      self.assertDeviceEqual("/job:worker/task:0", w.device)
+      self.assertDeviceEqual("job:ps/task:1", w_g.device)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD
index 71582f9c9a..bd9078ae76 100644
--- a/tensorflow/contrib/periodic_resample/BUILD
+++ b/tensorflow/contrib/periodic_resample/BUILD
@@ -6,6 +6,7 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "py_test",
     "tf_gen_op_libs",
     "tf_custom_op_library",
     "tf_custom_op_py_library",
@@ -64,11 +65,28 @@ py_library(
         "python/__init__.py",
     ],
     srcs_version = "PY2AND3",
+    tags = [
+        "notap",
+    ],
     deps = [
         ":periodic_resample_op_py",
     ],
 )
 
+py_test(
+    name = "periodic_resample_op_test",
+    srcs = ["python/kernel_tests/periodic_resample_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "notap",
+    ],
+    deps = [
+        ":init_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
 # py_library(
 #     name = "periodic_resample_op_py",
 #     srcs = ["python/ops/periodic_resample_op.py"],
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
index bef21f7a5c..ba410f025d 100644
--- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
@@ -100,6 +100,8 @@ template <class InputDataT,
                   desired_shape.size(), "."));
 
   bool found = false;
+  const auto& input_tensor_shape = input_tensor.shape();
+
   for (int i = 0; i < rank; ++i) {
     // if (desired_shape(i) < 1) {
     if (desired_shape[i] < 1) {
@@ -111,6 +113,15 @@ template <class InputDataT,
       adjustable_dimension = i;
       found = true;
     } else {
+      OP_REQUIRES(
+          context, desired_shape[i] >= input_tensor_shape.dim_size(i),
+          tensorflow::errors::InvalidArgument(
+              "periodic_resample expects the size of non-adjustable "
+              "dimensions be at least as large as size of input tensor."
+              " Dimension ", i, " input tensor has size ",
+              input_tensor_shape.dim_size(i), ", desired shape has size ",
+              desired_shape[i], "."));
+
       // target_dimensions[i] = desired_shape(i);
       target_dimensions[i] = desired_shape[i];
       new_sliced_size *= target_dimensions[i];
diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
index c90fc06c7f..82bd796956 100644
--- a/tensorflow/contrib/periodic_resample/ops/array_ops.cc
+++ b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
@@ -34,26 +34,40 @@ This function implements a slightly more generic version of the subpixel
 convolutions found in this [paper](https://arxiv.org/abs/1609.05158).
 
 The formula for computing the elements in the `output` tensor is as follows:
+
   `T` = `values` tensor of rank `R`
+
   `S` = desired `shape` of output tensor (vector of length `R`)
+
   `P` = `output` tensor of rank `R`
-  \((T_1,\ldots,T_R)\) = shape(`T`)
-  \([S_1,\ldots,S_q,\ldots,S_R]\) = elements of vector `S`
 
-  A single element in `S` is left unspecified (denoted \(S_q=-1\)).
-  Let \(f_i\) denote the (possibly non-integer) factor that relates the original
-  dimension to the desired dimensions, \(S_i=f_i T_i\), for \(i\neq q\) where
-  \(f_i>0\).
+  \\((T_1,\\ldots,T_R)\\) = shape(`T`)
+
+  \\([S_1,\\ldots,S_q,\\ldots,S_R]\\) = elements of vector `S`
+
+  A single element in `S` is left unspecified (denoted \\(S_q=-1\\)).
+
+  Let \\(f_i\\) denote the (possibly non-integer) factor that relates the original
+  dimension to the desired dimensions, \\(S_i=f_i T_i\\), for \\(i\\neq q\\) where
+  \\(f_i>0\\).
+
   Define the following:
-    \(g_i=\lceil f_i\rceil\)
-    \(t=\prod_i T_i\)
-    \(s=\prod_{i\neq q} S_i\)
-  \(S_q\) can then be defined as by \(S_q=\lfloor t/s\rfloor\).
+
+  \\(g_i=\\lceil f_i\\rceil\\)
+
+  \\(t=\\prod_i T_i\\)
+
+  \\(s=\\prod_{i\\neq q} S_i\\)
+
+  \\(S_q\\) can then be defined by \\(S_q=\\lfloor t/s\\rfloor\\).
   The elements of the resulting tensor are defined as
-  \(P_{s_1,\ldots,s_R}=T_{h_1,\ldots,h_q,\ldots,h_R}\).
-  The \(h_i\) (\(i\neq q\)) are defined by \(h_i=\lfloor s_i/g_i\rfloor\).
-  \(h_q=S_q\sum_{j\neq q}^{q-1}G_j \mathrm{mod}(s_j,g_j) + s_q\), where
-  \(G_j=\prod_{i}^{j-1}g_i\) (\(G_0=1\)).
+
+  \\(P_{s_1,\\ldots,s_R}=T_{h_1,\\ldots,h_q,\\ldots,h_R}\\).
+
+  The \\(h_i\\) (\\(i\\neq q\\)) are defined by \\(h_i=\\lfloor s_i/g_i\\rfloor\\).
+
+  \\(h_q=S_q\\sum_{j\\neq q}^{q-1}G_j \\mathrm{mod}(s_j,g_j) + s_q\\), where
+  \\(G_j=\\prod_{i}^{j-1}g_i\\) (\\(G_0=1\\)).
 
 One drawback of this method is that whenever the output dimensions are slightly
 less than integer multiples of the input dimensions, many of the tensor elements
diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
index 1d727870f6..30a2077570 100644
--- a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
+++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
@@ -19,8 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy
-import tensorflow
+
 from tensorflow.contrib.periodic_resample import periodic_resample
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
@@ -96,6 +97,19 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase):
       result = periodic_resample(input_tensor, desired_shape).eval()
       self.assertAllEqual(result, output_tensor)
 
+  def testPeriodicResampleErrors(self):
+    input_tensor = numpy.zeros(shape=[1, 2, 2, 4])
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          'Dimension 3 input tensor has size 4, desired shape has size 1'):
+        periodic_resample(input_tensor, [None, 4, 4, 1]).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          '4, to be the same as the length of the desired shape, 3'):
+        periodic_resample(input_tensor, [None, 4, 4]).eval()
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index b5d81b7caa..cafeb56ad8 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -663,6 +663,12 @@ class DropoutWrapperTest(test.TestCase):
         self.assertEqual(res[1].h.shape, (batch_size, 3))
         return res
 
+  def testWrappedCellProperty(self):
+    cell = rnn_cell_impl.BasicRNNCell(10)
+    wrapper = rnn_cell_impl.DropoutWrapper(cell)
+    # Github issue 15810
+    self.assertEqual(wrapper.wrapped_cell, cell)
+
   def testDropoutWrapperKeepAllConstantInput(self):
     keep = array_ops.ones([])
     res = self._testDropoutWrapper(
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 73789206f3..70aaba1728 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -1549,5 +1549,100 @@ class BenchmarkLSTMCellXLA(test.Benchmark):
                   benchmark_results["wall_time"]]]))
 
 
+class WeightNormLSTMCellTest(test.TestCase):
+  """Compared cell output with pre-calculated values."""
+
+  def _cell_output(self, cell):
+    """Calculate cell output"""
+
+    with self.test_session() as sess:
+      init = init_ops.constant_initializer(0.5)
+      with variable_scope.variable_scope("root",
+                                         initializer=init):
+        x = array_ops.zeros([1, 2])
+        c0 = array_ops.zeros([1, 2])
+        h0 = array_ops.zeros([1, 2])
+
+        state0 = rnn_cell.LSTMStateTuple(c0, h0)
+
+        xout, sout = cell()(x, state0)
+
+      sess.run([variables.global_variables_initializer()])
+      res = sess.run([xout, sout], {
+          x.name: np.array([[1., 1.]]),
+          c0.name: 0.1 * np.asarray([[0, 1]]),
+          h0.name: 0.1 * np.asarray([[2, 3]]),
+      })
+
+    actual_state_c = res[1].c
+    actual_state_h = res[1].h
+
+    return actual_state_c, actual_state_h
+
+  def testBasicCell(self):
+    """Tests cell w/o peepholes and w/o normalisation"""
+
+    def cell():
+      return contrib_rnn_cell.WeightNormLSTMCell(2,
+                                                 norm=False,
+                                                 use_peepholes=False)
+
+    actual_c, actual_h = self._cell_output(cell)
+
+    expected_c = np.array([[0.65937078, 0.74983585]])
+    expected_h = np.array([[0.44923624, 0.49362513]])
+
+    self.assertAllClose(expected_c, actual_c, 1e-5)
+    self.assertAllClose(expected_h, actual_h, 1e-5)
+
+  def testNonbasicCell(self):
+    """Tests cell with peepholes and w/o normalisation"""
+
+    def cell():
+      return contrib_rnn_cell.WeightNormLSTMCell(2,
+                                                 norm=False,
+                                                 use_peepholes=True)
+
+    actual_c, actual_h = self._cell_output(cell)
+
+    expected_c = np.array([[0.65937084, 0.7574988]])
+    expected_h = np.array([[0.4792085, 0.53470564]])
+
+    self.assertAllClose(expected_c, actual_c, 1e-5)
+    self.assertAllClose(expected_h, actual_h, 1e-5)
+
+
+  def testBasicCellWithNorm(self):
+    """Tests cell w/o peepholes and with normalisation"""
+
+    def cell():
+      return contrib_rnn_cell.WeightNormLSTMCell(2,
+                                                 norm=True,
+                                                 use_peepholes=False)
+
+    actual_c, actual_h = self._cell_output(cell)
+
+    expected_c = np.array([[0.50125383, 0.58805949]])
+    expected_h = np.array([[0.32770363, 0.37397948]])
+
+    self.assertAllClose(expected_c, actual_c, 1e-5)
+    self.assertAllClose(expected_h, actual_h, 1e-5)
+
+  def testNonBasicCellWithNorm(self):
+    """Tests cell with peepholes and with normalisation"""
+
+    def cell():
+      return contrib_rnn_cell.WeightNormLSTMCell(2,
+                                                 norm=True,
+                                                 use_peepholes=True)
+
+    actual_c, actual_h = self._cell_output(cell)
+
+    expected_c = np.array([[0.50125383, 0.59587258]])
+    expected_h = np.array([[0.35041603, 0.40873795]])
+
+    self.assertAllClose(expected_c, actual_c, 1e-5)
+    self.assertAllClose(expected_h, actual_h, 1e-5)
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index e4667828cd..d7ae6621db 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import nn_impl
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
@@ -328,7 +329,7 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
   def __init__(self, num_units, use_peepholes=False,
                cell_clip=None, initializer=None,
                num_unit_shards=1, forget_bias=1.0,
-               feature_size=None, frequency_skip=None,
+               feature_size=None, frequency_skip=1,
                reuse=None):
     """Initialize the parameters for an LSTM cell.
 
@@ -2723,3 +2724,257 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
     h = r * self._activation(c) + (1.0 - r) * inputs
 
     return h, c
+
+
+class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
+  """Weight normalized LSTM Cell. Adapted from `rnn_cell_impl.LSTMCell`.
+
+    The weight-norm implementation is based on:
+    https://arxiv.org/abs/1602.07868
+    Tim Salimans, Diederik P. Kingma.
+    Weight Normalization: A Simple Reparameterization to Accelerate
+    Training of Deep Neural Networks
+
+    The default LSTM implementation based on:
+    http://www.bioinf.jku.at/publications/older/2604.pdf
+    S. Hochreiter and J. Schmidhuber.
+    "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
+
+    The class uses optional peephole connections, optional cell clipping
+    and an optional projection layer.
+
+    The optional peephole implementation is based on:
+    https://research.google.com/pubs/archive/43905.pdf
+    Hasim Sak, Andrew Senior, and Francoise Beaufays.
+    "Long short-term memory recurrent neural network architectures for
+    large scale acoustic modeling." INTERSPEECH, 2014.
+  """
+
+  def __init__(self, num_units, norm=True, use_peepholes=False,
+               cell_clip=None, initializer=None, num_proj=None,
+               proj_clip=None, forget_bias=1, activation=None,
+               reuse=None):
+    """Initialize the parameters of a weight-normalized LSTM cell.
+
+    Args:
+      num_units: int, The number of units in the LSTM cell
+      norm: If `True`, apply normalization to the weight matrices. If False,
+        the result is identical to that obtained from `rnn_cell_impl.LSTMCell`
+      use_peepholes: bool, set `True` to enable diagonal/peephole connections.
+      cell_clip: (optional) A float value, if provided the cell state is clipped
+        by this value prior to the cell output activation.
+      initializer: (optional) The initializer to use for the weight matrices.
+      num_proj: (optional) int, The output dimensionality for the projection
+        matrices.  If None, no projection is performed.
+      proj_clip: (optional) A float value.  If `num_proj > 0` and `proj_clip` is
+        provided, then the projected values are clipped elementwise to within
+        `[-proj_clip, proj_clip]`.
+      forget_bias: Biases of the forget gate are initialized by default to 1
+        in order to reduce the scale of forgetting at the beginning of
+        the training.
+      activation: Activation function of the inner states.  Default: `tanh`.
+      reuse: (optional) Python boolean describing whether to reuse variables
+        in an existing scope.  If not `True`, and the existing scope already has
+        the given variables, an error is raised.
+    """
+    super(WeightNormLSTMCell, self).__init__(_reuse=reuse)
+
+    self._scope = 'wn_lstm_cell'
+    self._num_units = num_units
+    self._norm = norm
+    self._initializer = initializer
+    self._use_peepholes = use_peepholes
+    self._cell_clip = cell_clip
+    self._num_proj = num_proj
+    self._proj_clip = proj_clip
+    self._activation = activation or math_ops.tanh
+    self._forget_bias = forget_bias
+
+    self._weights_variable_name = "kernel"
+    self._bias_variable_name = "bias"
+
+    if num_proj:
+      self._state_size = rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
+      self._output_size = num_proj
+    else:
+      self._state_size = rnn_cell_impl.LSTMStateTuple(num_units, num_units)
+      self._output_size = num_units
+
+  @property
+  def state_size(self):
+    return self._state_size
+
+  @property
+  def output_size(self):
+    return self._output_size
+
+  def _normalize(self, weight, name):
+    """Apply weight normalization.
+
+    Args:
+      weight: a 2D tensor with known number of columns.
+      name: string, variable name for the normalizer.
+    Returns:
+      A tensor with the same shape as `weight`.
+    """
+
+    output_size = weight.get_shape().as_list()[1]
+    g = vs.get_variable(name, [output_size], dtype=weight.dtype)
+    return nn_impl.l2_normalize(weight, dim=0) * g
+
+  def _linear(self, args,
+              output_size,
+              norm,
+              bias,
+              bias_initializer=None,
+              kernel_initializer=None):
+    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
+
+    Args:
+      args: a 2D Tensor or a list of 2D, batch x n, Tensors.
+      output_size: int, second dimension of W[i].
+      bias: boolean, whether to add a bias term or not.
+      bias_initializer: starting value to initialize the bias
+        (default is all zeros).
+      kernel_initializer: starting value to initialize the weight.
+
+    Returns:
+      A 2D Tensor with shape [batch x output_size] equal to
+      sum_i(args[i] * W[i]), where W[i]s are newly created matrices.
+
+    Raises:
+      ValueError: if some of the arguments has unspecified or wrong shape.
+    """
+    if args is None or (nest.is_sequence(args) and not args):
+      raise ValueError("`args` must be specified")
+    if not nest.is_sequence(args):
+      args = [args]
+
+    # Calculate the total size of arguments on dimension 1.
+    total_arg_size = 0
+    shapes = [a.get_shape() for a in args]
+    for shape in shapes:
+      if shape.ndims != 2:
+        raise ValueError("linear is expecting 2D arguments: %s" % shapes)
+      if shape[1].value is None:
+        raise ValueError("linear expects shape[1] to be provided for shape %s, "
+                         "but saw %s" % (shape, shape[1]))
+      else:
+        total_arg_size += shape[1].value
+
+    dtype = [a.dtype for a in args][0]
+
+    # Now the computation.
+    scope = vs.get_variable_scope()
+    with vs.variable_scope(scope) as outer_scope:
+      weights = vs.get_variable(
+          self._weights_variable_name, [total_arg_size, output_size],
+          dtype=dtype,
+          initializer=kernel_initializer)
+      if norm:
+        wn = []
+        st = 0
+        with ops.control_dependencies(None):
+          for i in range(len(args)):
+            en = st + shapes[i][1].value
+            wn.append(self._normalize(weights[st:en, :],
+                                      name='norm_{}'.format(i)))
+            st = en
+
+          weights = array_ops.concat(wn, axis=0)
+
+      if len(args) == 1:
+        res = math_ops.matmul(args[0], weights)
+      else:
+        res = math_ops.matmul(array_ops.concat(args, 1), weights)
+      if not bias:
+        return res
+
+      with vs.variable_scope(outer_scope) as inner_scope:
+        inner_scope.set_partitioner(None)
+        if bias_initializer is None:
+          bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
+
+        biases = vs.get_variable(
+            self._bias_variable_name, [output_size],
+            dtype=dtype,
+            initializer=bias_initializer)
+
+      return nn_ops.bias_add(res, biases)
+
+  def call(self, inputs, state):
+    """Run one step of LSTM.
+
+    Args:
+      inputs: input Tensor, 2D, batch x num_units.
+      state: A tuple of state Tensors, both `2-D`, with column sizes
+       `c_state` and `m_state`.
+
+    Returns:
+      A tuple containing:
+
+      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
+        LSTM after reading `inputs` when previous state was `state`.
+        Here output_dim is:
+           num_proj if num_proj was set,
+           num_units otherwise.
+      - Tensor(s) representing the new state of LSTM after reading `inputs` when
+        the previous state was `state`.  Same type and shape(s) as `state`.
+
+    Raises:
+      ValueError: If input size cannot be inferred from inputs via
+        static shape inference.
+    """
+    dtype = inputs.dtype
+    num_units = self._num_units
+    sigmoid = math_ops.sigmoid
+    c, h = state
+
+    input_size = inputs.get_shape().with_rank(2)[1]
+    if input_size.value is None:
+      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+    with vs.variable_scope(self._scope, initializer=self._initializer):
+
+      concat = self._linear([inputs, h], 4 * num_units,
+                            norm=self._norm, bias=True)
+
+      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+      i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
+
+      if self._use_peepholes:
+        w_f_diag = vs.get_variable("w_f_diag", shape=[num_units], dtype=dtype)
+        w_i_diag = vs.get_variable("w_i_diag", shape=[num_units], dtype=dtype)
+        w_o_diag = vs.get_variable("w_o_diag", shape=[num_units], dtype=dtype)
+
+        new_c = (c * sigmoid(f + self._forget_bias + w_f_diag * c)
+                 + sigmoid(i + w_i_diag * c) * self._activation(j))
+      else:
+        new_c = (c * sigmoid(f + self._forget_bias)
+                 + sigmoid(i) * self._activation(j))
+
+      if self._cell_clip is not None:
+        # pylint: disable=invalid-unary-operand-type
+        new_c = clip_ops.clip_by_value(new_c, -self._cell_clip, self._cell_clip)
+        # pylint: enable=invalid-unary-operand-type
+      if self._use_peepholes:
+        new_h = sigmoid(o + w_o_diag * new_c) * self._activation(new_c)
+      else:
+        new_h = sigmoid(o) * self._activation(new_c)
+
+      if self._num_proj is not None:
+        with vs.variable_scope("projection"):
+          new_h = self._linear(new_h,
+                               self._num_proj,
+                               norm=self._norm,
+                               bias=False)
+
+        if self._proj_clip is not None:
+          # pylint: disable=invalid-unary-operand-type
+          new_h = clip_ops.clip_by_value(new_h,
+                                         -self._proj_clip,
+                                         self._proj_clip)
+          # pylint: enable=invalid-unary-operand-type
+
+      new_state = rnn_cell_impl.LSTMStateTuple(new_c, new_h)
+      return new_h, new_state
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
index d2beac5f31..f498b2bb57 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -225,6 +225,94 @@ class TestBeamStep(test.TestCase):
     self.assertAllEqual(next_state_.log_probs, expected_log_probs)
 
 
+class TestLargeBeamStep(test.TestCase):
+  """
+  Tests a single step of beam search in such
+  case that beam size is larger than vocabulary size.
+  """
+
+  def setUp(self):
+    super(TestLargeBeamStep, self).setUp()
+    self.batch_size = 2
+    self.beam_width = 8
+    self.vocab_size = 5
+    self.end_token = 0
+    self.length_penalty_weight = 0.6
+
+
+  def test_step(self):
+    def get_probs():
+      """this simulates the initialize method in BeamSearchDecoder"""
+      log_prob_mask = array_ops.one_hot(array_ops.zeros([self.batch_size],
+                                                        dtype=dtypes.int32),
+                                        depth=self.beam_width, on_value=True,
+                                        off_value=False, dtype=dtypes.bool)
+
+      log_prob_zeros = array_ops.zeros([self.batch_size, self.beam_width],
+                                       dtype=dtypes.float32)
+      log_prob_neg_inf = array_ops.ones([self.batch_size, self.beam_width],
+                                        dtype=dtypes.float32) * -np.Inf
+
+      log_probs = array_ops.where(log_prob_mask, log_prob_zeros,
+                                  log_prob_neg_inf)
+      return log_probs
+
+    log_probs = get_probs()
+    dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])
+
+    _finished = array_ops.one_hot(
+        array_ops.zeros([self.batch_size], dtype=dtypes.int32),
+        depth=self.beam_width, on_value=False,
+        off_value=True, dtype=dtypes.bool)
+    _lengths = np.zeros([self.batch_size, self.beam_width], dtype=np.int64)
+    _lengths[:, 0]=2
+    _lengths = constant_op.constant(_lengths, dtype=dtypes.int64)
+
+    beam_state = beam_search_decoder.BeamSearchDecoderState(
+        cell_state=dummy_cell_state,
+        log_probs=log_probs,
+        lengths=_lengths,
+        finished=_finished)
+
+    logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
+                      0.0001)
+    logits_[0, 0, 2] = 1.9
+    logits_[0, 0, 3] = 2.1
+    logits_[0, 1, 3] = 3.1
+    logits_[0, 1, 4] = 0.9
+    logits_[1, 0, 1] = 0.5
+    logits_[1, 1, 2] = 2.7
+    logits_[1, 2, 2] = 10.0
+    logits_[1, 2, 3] = 0.2
+    logits = constant_op.constant(logits_, dtype=dtypes.float32)
+    log_probs = nn_ops.log_softmax(logits)
+
+    outputs, next_beam_state = beam_search_decoder._beam_search_step(
+        time=2,
+        logits=logits,
+        next_cell_state=dummy_cell_state,
+        beam_state=beam_state,
+        batch_size=ops.convert_to_tensor(self.batch_size),
+        beam_width=self.beam_width,
+        end_token=self.end_token,
+        length_penalty_weight=self.length_penalty_weight)
+
+    with self.test_session() as sess:
+      outputs_, next_state_, state_, log_probs_ = sess.run(
+          [outputs, next_beam_state, beam_state, log_probs])
+
+    self.assertEqual(outputs_.predicted_ids[0, 0], 3)
+    self.assertEqual(outputs_.predicted_ids[0, 1], 2)
+    self.assertEqual(outputs_.predicted_ids[1, 0], 1)
+    neg_inf = -np.Inf
+    self.assertAllEqual(next_state_.log_probs[:, -3:],
+                        [[neg_inf, neg_inf, neg_inf],
+                         [neg_inf, neg_inf, neg_inf]])
+    self.assertEqual((next_state_.log_probs[:, :-3] > neg_inf).all(), True)
+    self.assertEqual((next_state_.lengths[:, :-3] > 0).all(), True)
+    self.assertAllEqual(next_state_.lengths[:, -3:], [[0, 0, 0],
+                                                      [0, 0, 0]])
+
 class BeamSearchDecoderTest(test.TestCase):
 
   def _testDynamicDecodeRNN(self, time_major, has_attention):
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index ebe25ce077..a5f7169c31 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-
 import numpy as np
 
 from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
@@ -229,8 +228,11 @@ class BeamSearchDecoder(decoder.Decoder):
     self._start_tokens = array_ops.tile(
         array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
     self._start_inputs = self._embedding_fn(self._start_tokens)
-    self._finished = array_ops.zeros(
-        [self._batch_size, self._beam_width], dtype=dtypes.bool)
+    
+    self._finished = array_ops.one_hot(
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width, on_value=False,
+        off_value=True, dtype=dtypes.bool)
 
   @property
   def batch_size(self):
@@ -298,11 +300,15 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     finished, start_inputs = self._finished, self._start_inputs
 
+    log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width, on_value=0.0, off_value=-np.Inf,
+        dtype=nest.flatten(self._initial_cell_state)[0].dtype)
+
+
     initial_state = BeamSearchDecoderState(
         cell_state=self._initial_cell_state,
-        log_probs=array_ops.zeros(
-            [self._batch_size, self._beam_width],
-            dtype=nest.flatten(self._initial_cell_state)[0].dtype),
+        log_probs=log_probs,
         finished=finished,
         lengths=array_ops.zeros(
             [self._batch_size, self._beam_width], dtype=dtypes.int64))
@@ -563,18 +569,11 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
   time = ops.convert_to_tensor(time, name="time")
   # During the first time step we only consider the initial beam
   scores_shape = array_ops.shape(scores)
-  scores_flat = control_flow_ops.cond(
-      time > 0,
-      lambda: array_ops.reshape(scores, [batch_size, -1]),
-      lambda: scores[:, 0])
-  num_available_beam = control_flow_ops.cond(
-      time > 0, lambda: math_ops.reduce_prod(scores_shape[1:]),
-      lambda: math_ops.reduce_prod(scores_shape[2:]))
+  scores_flat = array_ops.reshape(scores, [batch_size, -1])
 
   # Pick the next beams according to the specified successors function
-  next_beam_size = math_ops.minimum(
-      ops.convert_to_tensor(beam_width, dtype=dtypes.int32, name="beam_width"),
-      num_available_beam)
+  next_beam_size = ops.convert_to_tensor(beam_width, dtype=dtypes.int32,
+                                         name="beam_width")
   next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=next_beam_size)
 
   next_beam_scores.set_shape([static_batch_size, beam_width])
diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD
index 38a84ffb10..80a5d07ea4 100644
--- a/tensorflow/contrib/verbs/BUILD
+++ b/tensorflow/contrib/verbs/BUILD
@@ -99,7 +99,7 @@ cc_library(
     alwayslink = 1,
 )
 
-tf_cuda_library(
+cc_library(
     name = "rdma_rendezvous_mgr",
     srcs = ["rdma_rendezvous_mgr.cc"],
     hdrs = ["rdma_rendezvous_mgr.h"],
@@ -114,7 +114,7 @@ tf_cuda_library(
     ],
 )
 
-cc_library(
+tf_cuda_library(
     name = "rdma_mgr",
     srcs = ["rdma_mgr.cc"],
     hdrs = ["rdma_mgr.h"],
@@ -141,6 +141,8 @@ tf_cuda_library(
         "//conditions:default": [],
     }),
     deps = [
+        ":grpc_verbs_client",
+        ":verbs_service_proto_cc",
         ":verbs_util",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
diff --git a/tensorflow/contrib/verbs/README.md b/tensorflow/contrib/verbs/README.md
index 7c1c8ea459..1b99f4ce4f 100644
--- a/tensorflow/contrib/verbs/README.md
+++ b/tensorflow/contrib/verbs/README.md
@@ -24,66 +24,144 @@ The design is based on TensorFlow r1.0. An RDMA path is added between servers fo
 
 During the server setup, an RDMA manager is created to manage low-level RDMA components such as RDMA channel and RDMA adapter, an RDMA rendezvous manager is created to oversee send/recv operations between servers. Following the distributed TensorFlow design philosophy, the send operation is passive, i.e. merely placing a tensor in the local out-going table. It is the receive operation that actually initiates the tensor transfer.
 
-TensorFlow dynamically allocates memory for tensors that are to be sent or received. This causes difficulty for RDMA operations where pinned memory is required. Two remedies are possible, either the memory is pinned, transfer, then unpinned for each and every tensor to be transferred, or a buffer is pre-allocated and pinned for each tensor. The former incurs significant operation overhead since pinning and unpinning memory for each dynamically generated tensor is slow. The latter incurs large memory overhead and extra copying from the tensor to its pinned buffer, but may still be faster than the former. The second approach is adopted in this design. Each RDMA channel, representing a RDMA connection to a peer, contains a table of pinned buffers for all the seen tensors that requires transfer. It is assumed that the tensor size rarely changes across different steps. So only one buffer is created for the same tensor across all the steps. In the rare case when the tensor size does increases, the old buffer is discarded and new buffer of larger size is created and pinned.
+TensorFlow dynamically allocates memory for tensors that are to be sent or received. This causes difficulty for RDMA operations where pinned memory is required. Few remedies are possible:
+1. The memory is pinned, transfered, then unpinned for each and every tensor to be transferred. This incurs significant operation overhead since pinning and unpinning memory for each dynamically generated tensor is slow. 
+2. Buffer is pre-allocated and pinned for each tensor. This incurs large memory overhead and extra copying from the tensor to its pinned buffer, but may still be faster than the former.
+3. Following HKUST research on the use of GPU direct, and their [GDR implementation](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/gdr/README.md), there is a smart way to benefit from the TensorFlow allocation theme which is mostly pool based, i.e allocators pre-allocate a large memory block, and allocate the tensors from there. By attaching a custom Visitor to relevant alloactors, we can do a single registration of the entire memory block, which zeros the registration overhead. Once the block is registered, each new tensor allocated will be at a registred address, which will allow us to do direct RDMA writes to it.
 
-When a tensor is prepared for transfer, it is first converted to TensorProto, then the proto is serialized to byte array and copied to the pinned buffer. The content of the buffer is transferred to the remote node via RDMA write. On the remote side, the process is reversed. This is illustrated in the diagram below. The conversion of TensorProto is introduced to simplify transfer of string-tensors. Also since the TensorProto lives in host memory, even if the origin tensor lives in the device, the pinned buffers are all allocated in the host memory.
-![TensorFlow RDMA path](./design_diagram.png)
+For best performance, we will adopt HKUST 0 copies approach in our solution. This means:
+
+1. Tensor writes will be done directly from the source tensor to the **result** tensor, with no memory copies in between. This should be done for all DMAable tensors which are located either on CPU or on a RDMA compatible GPU device (GPU direct). 
+2. Non DMAable tensors (CanMemCopy == false) will be serialized to a TensorProto on the sender side, RDMA written to a registered buffer on the receiver side, and then deserialized by the receiver.
+3. Tensors which are located on a non-RDMA-compatible GPU, will be RDMA written to a registered CPU **proxy** buffer on the receiver side, and then copied to GPU by the receiver.
 
-The following improvements can be made in the future. First, conversion to TensorProto and serialization can be avoided for numeric (float/int) tensors since their internal buffer can be access directly as byte array. Second, the pinned buffer may be allocated on device if the tensor is located in the device. This avoids extra device-to-host copy at the expense of extra device memory consumption.
 ## Design details
 
-### RDMA components
+### Terminology
 
-* **RDMA adapter:** The base for RDMA communications. It may contain multiple channels and buffers.  It is responsible for handling various incoming RDMA messages.
-* **RDMA channel:** Responsible for RDMA connection to a particular node. It manages multiple buffers. A channel has a callback table which stores all the callbacks for the requested tensors.
-* **RDMA buffer:** Responsible for sending or receiving data. It has a fixed size memory to store the data. It has a queue to store the pending jobs. There are three types of buffers, message buffer, ACK buffer and tensor buffer. A channel has two message buffers, two ack buffers and many tensor buffers.
-* **RDMA manager:** Manages the adapter and channels, including channel creation, channel setup via GRPC service, channel lookup, etc.
-* **RDMA rendezvous manager:** manages multiple rdma rendezvous.
-* **RDMA rendezvous:** a derived class of BaseRemoteRendezvous. This class is the back end for "send" and "recv" ops. When the sendrecv_op wants to send or receive a tensor, it calls the rendezvous' "send" and "recv" functions respectively. Rendezvous are identified by "step_id", a random number, so that tensors for different iterations don't get mixed up.
+* **Sender** - The node which sends the tensor.
+* **Receiver** - The node which receives the tensor.
+* **Result tensor** - The destination tensor, allocated on its appropriate device.
+* **Proxy tensor** - A CPU allocated tensor, which will be used in the case where the result tensor cannot be RDMA written to directly (GPU direct is disabled or not available). The RDMA write will therefore be done to the proxy tensor, and afterwards we will do a manual local copy from it to the result tensor.
 
-### The SEND operation
+### Messages
 
-In TensorFlow, when rendezvous sends a tensor, it merely puts a tensor in a local table in the corresponding rendezvous. If the tensor has been requested, a callback exists in the table. "send" will activate the callback, which tries to send the tensor across the node.
+* RDMA_MESSAGE_TENSOR_REQUEST
+* RDMA_MESSAGE_META_DATA_RESPONSE
+* RDMA_MESSAGE_TENSOR_RE_REQUEST
 
+### Transport protocol
 
-### The RECV operation
+The tensor transfer process is initiated when the receiver requests a tensor. In code it is done by calling **Rendezvous::Recv()** or **Rendezvous::RecvAsync()**. The TensorFlow base implementation handles the case where the requested tensor is located on the same node. The more interesting case where the requested tensor is located on a remote node (receiver != sender) is to be handled in a derivation of the pure virtual **BaseRemoteRendezvous::RecvFromRemoteAsync()**. TensorFlow provides a default GRPC based implementation which comes in the vanilla version but suffers in scalability when running large models. Our RDMA based implementation presumes to be more scalable. HKUST's contrib GDR implementation is more scalable than GRPC, and less scalable than ours, only because we did our evolution based on it.
 
-When a tensor is requested, rendezvous' recv function is called. The function first places a callback in the channel's callback table, which will be activated once the tensor is sent from the source. In the next step, a message is sent to notify the source of the requested tensor. Once the source receives the message, it will check locally for the tensor, if not found, a callback is placed in the table, otherwise, the tensor id will be placed at corresponding RDMA buffer's job queue for future transmission. When a tensor is scheduled to be transmitted, the RDMA buffer needs to have the memory allocated and initialized (registered with the remote buffer info). If the memory is not ready, the transmission is deferred, a message is sent to the destination to establish the memory first. The other case a transmission can be deferred is when the buffer is still being used by an on-going transmission.
+Our entry point is the implementation of **RdmaRemoteRendezvous::RecvFromRemoteAsync()**, located in rdma_rendezvous_mgr.cc. The implementation creates a new **RdmaTensorRequest** object, keyed by request index (uint32_t), stores it in a list of pending requests, and calls its **Start()** method. The **Start()** method basically does 2 things:
 
-### Three types of RDMA buffers
+1. Allocate the result tensor (and the proxy tensor if required).
+2. Send a **RDMA_MESSAGE_TENSOR_REQUEST** to the sender, containing the address of the destination tensor (result/proxy) for RDMA write.
 
-* **Message buffer:** responsible for sending message only.
-* **Ack buffer:** once a message is sent, the recipient needs to send an ack via the ack buffer to free up the message buffer. An ack buffer is exclusively for its coupled message buffer.
-* **Tensor buffer:** responsible for sending tensors. The recipient needs to send back a message to free up the sending buffer.
+In order to allocate the result and proxy tensors, we need to know the tensor's meta-data, i.e. shape and data-type for DMAable tensors, and proto-size for serialized tensors. Unfortunately, this information is only available on the sender side which complicates manners. In order to avoid sending extra messages for querying the meta-data at each step, we store a local meta-data cache per tensor, which will only be update upon changes. Based on the assumption that the meta-data of a tensor rarely changes between steps, we expect that on most times the cache will only be updated once. The sender is responsible to detect changes in the meta-data, and update the receiver. In order for the sender to know that the meta-data had changed, each **RDMA_MESSAGE_TENSOR_REQUEST** will contain the meta-data that the receiver had grabbed from the local cache. The sender will then compare the meta-data from the message to the tensor's new meta-data.
 
-### RDMA packet format
+When the sender receives an **RDMA_MESSAGE_TENSOR_REQUEST**, it will create a new **RdmaTensorResponse** object for the given request message, store it in a list of pending responses, and will invoke its **Start()** method. The **Start()** method does the following:
 
-|type|name_size|name|step_id|buffer_size|remote_addr|rkey|is_dead|data_type|tensor_shape|tensor_bytes|tensor_buffer|
+1. Grab the source tensor from the local table (In code, **RecvLocalAsync()**).
+2. If the source tensor is not DMAable, serialize it to a TensorProto.
+3. If the source tensor is located on a device which cannot be DMA written from, copy it to CPU.
+4. If it is the first time this tensor is requested, or if the tensor's meta-data changed:
+	1. Clone the tensor's data to be sent later.
+	2. Send a **RDMA_MESSAGE_META_DATA_RESPONSE** containing the new meta-data.
+5. Otherwise:
+	1. RDMA write the tensor (or TensorProto) to the destination address and rkey specified in the request message. The immediate value for the write will be the request index.
 
-### Six types of RDMA messages
-* RDMA_MESSAGE_ACK
-* RDMA_MESSAGE_BUFFER_IDLE
-* RDMA_MESSAGE_BUFFER_REQUEST
-* RDMA_MESSAGE_BUFFER_RESPONSE
-* RDMA_MESSAGE_TENSOR_REQUEST
-* RDMA_MESSAGE_TENSOR_WRITE
-
-### Actions upon receiving RDMA messages
-* RDMA_MESSAGE_ACK
-  * sender: mark local ack buffer idle.
-  * receiver: mark remote message buffer idle, send next item.
-* RDMA_MESSAGE_BUFFER_IDLE
-  * sender: mark local message buffer idle, send next item.
-  * receiver: send ack, set remote tensor buffer idle, send next item.
-* RDMA_MESSAGE_BUFFER_REQUEST
-  * sender: mark local message buffer idle, send next item.
-  * receiver: send ack, find or create tensor buffer, send BUFFER_RESPONSE.
-* RDMA_MESSAGE_BUFFER_RESPONSE
-  * sender: mark local message buffer idle, send next item.
-  * receiver: send ack, set remote buffer info, set local and remote buffer idle, send next item.
-* RDMA_MESSAGE_TENSOR_REQUEST
-  * sender: mark local message buffer idle, send next item.
-  * receiver: send ack, find or create tensor buffer, enqueue tensor id, send next item.
-* RDMA_MESSAGE_TENSOR_WRITE
-  * sender: mark local message buffer idle, send next item.
-  * receiver: run callback.
+
+When the receiver receives the **RDMA_MESSAGE_META_DATA_RESPONSE**, it will locate the relevant **RdmaTensorRequest** using the request index specified in the message, and invoke its **RecvTensorMetaData()** which does the following:
+
+1. Update the local meta-data cache.
+2. Reallocate the result/proxy tensors. 
+3. Re-send the tensor request. For tracability, the new message has a different name: **RDMA_MESSAGE_TENSOR_RE_REQUEST**.
+
+When the sender receives a **RDMA_MESSAGE_TENSOR_RE_REQUEST**, it will locate the relevant **RdmaTensorResponse** using the request index specified in the message, and invoke its **Resume()** method, which will RDMA write the contents of the tensor that was cloned earlier, to the new remote address specified in the re-request.
+
+When the receiver receives the RDMA write, it will locate the relevant **RdmaTensorRequest** using the request index which is the immediate value. It will then invoke its **RecvTensorContent()** which does the following:
+
+1. Proxy copy/deserialize if required.
+2. Invoke the done callback.
+3. Deallocate the result/proxy tensors and remove the request from the pending list.
+
+![alt text](verbs_with_0_copies.png "Transport protocol")
+
+### Additional design notes
+
+1. When the sender receives a tensor request, the source tensor may or may not be ready yet. The situation is handled through a process of tag matching:
+	* If the request arrives before the tensor is ready, then a callback is put in a local table, and will be invoked once the tensor arrives.
+	* If the tensor is ready before the request arives, than the tensor is put in a local table. When the request arrives, it will invoke the callback immediatly.
+   In code it is done by calling **RecvLocalAsync()**, which receives the tensor's key, step-id, and the callback.
+2. When the callback is invoked, the relevant tensor is removed from the tag matching table. In the case where we need to send the tensor's meta-data, the **RdmaTensorResponse** will store a copy of the tensor until the re-request arrives.
+3. The sending of protocol messages (**RDMA_MESSAGE_TENSOR_REQUEST**, **RDMA_MESSAGE_META_DATA_RESPONSE** and **RDMA_MESSAGE_TENSOR_RE_REQUEST**) is done by the class **RdmaMessageBuffer**. All messages are sent using RDMA writes from/to fixed messages buffers. This implies that we cannot send on a specific channel more than one message at a time. In order to synchronize the messages, the **RdmaMessageBuffer** holds the a local and remote buffer statuses which can be either busy or idle. When a write is issued, both statuses will be changed to busy. When the write-complete event is received, the local status is changed to idle. When the write is received on the remote side, the remote side will parse the message, and return an ACK back to the sending side on which the sending side will update the remote status to idle. When both the local and remote statuses are idle, the next message can be sent.
+5. ACK writes are empty writes (hence they require no buffer) with immediate value 0xFFFFFFFE. Message writes have the immediate value 0xFFFFFFFF. All other writes are tensor-content writes whose immediate value is the request-index.
+
+### RDMA components
+
+* **enum RdmaImmDataType**       - Immediate types to distinguish between different RDMA writes on the remote side. Ack writes and control-message writes have a fixed immediate value. The rest of the writes are tensor writes and the immediate value is the relevant request index.
+* **enum  RdmaWriteIDType**      - Types to distinguish between different RDMA write-complete events: Ack, control message and tensor writes.
+* **class RdmaWriteID**          - Context for RDMA write complete events. Holds the RdmaWriteIDType and additional data.
+* **class RdmaTensorMetaData**   - Meta-data for a tensor (type, shape, is_dead, proto_size).
+* **class RdmaMemoryMgr**        - Manages the meta-data cache, and the registered memory regions.
+* **class RdmaTensorRequest**    - Holds and manages information for a single tensor request throughout the entire receive cycle. API:
+	* **Start()**                - Start the request sequence.
+		* Allocate the result tensor (and proxy tensor if required).
+		* Send RDMA_MESSAGE_TENSOR_REQUEST to the remote side.
+	* **RecvTensorMetaData()**   - Receive meta-data from the remote side.
+		* Update the local meta-data cache.
+		* Reallocate the result tensor (and proxy tensor if required).
+		* Re-send the request to the remote side.
+	* **RecvTensorContent()**    - Receive tensor content from the remote side (RDMA write was completed).
+		* Decode proto if required and/or move to GPU if the content was not written to it directly (GPU direct is not avaliable).
+		* Invoke the done callback.
+* **class RdmaTensorResponse**   - Holds and manages information for a single tensor response throughout the entire send cycle. API:
+	* **Start()**                - Start the response sequence. 
+		* Find the tensor in the local tag-match table.
+		* Compare the tensor's meta-data to the meta-data in the message (taken from the requester's local cache). 
+			* If meta-data changed:
+				* Clone the tensor to be sent later.
+				* Send a meta-data update message and wait for re-request.
+			* Else:
+				* Send the tensor's content (using direct RDMA write).
+	* **Resume()**               - Resume the response sequence after a re-request. Send the tensor's content that was cloned earlier.
+	* **Destroy()**              - Destroy the response's resources and remove it form the pending list.
+* **class RdmaAdapter**          - The base for RDMA communications. It may contain multiple channels and buffers.  It is responsible for handling various incoming RDMA messages.
+* **class RdmaChannel**          - Responsible for RDMA connection to a particular node. It manages messagee buffers. A channel has a request table which stores all the pending tensor requests.
+* **class RdmaMessageBuffer**    - Responsible for sending or receiving messages. It has a fixed size memory to store the data. It has a queue to store the pending jobs. A channel has two message buffers one for tx and one for rx.
+* **class RdmaMgr**              - Manages the adapter and channels, including channel creation, channel setup via GRPC service, channel lookup, etc.
+* **class RdmaRendezvousMgr**    - Manages multiple rdma rendezvous.
+* **class RdmaRemoteRendezvous** - A derived class of BaseRemoteRendezvous. This class is the back end for "send" and "recv" ops. When the sendrecv_op wants to send or receive a tensor, it calls the rendezvous' "send" and "recv" functions respectively. Rendezvous are identified by "step_id", a random number, so that tensors for different iterations don't get mixed up.
+
+### Message structure:
+
+| type | name_size | name | step_id | request_index | remote_addr/checksum | rkey | is_dead | data_type | tensor_shape | tensor_bytes | error_status          |
+|------|---------- |------|---------|---------------|----------------------|------|---------|-----------|--------------|--------------|-----------------------|
+|  1B  |    2B     | 512  |  8B     |      8B       |           8B         |   4B |      1B |     XB    |    XB        |    8B        | Size - 4B, proto - XB |
+
+* **RDMA_MESSAGE_TENSOR_REQUEST**  - (receiver ==> sender) The original tensor request. 
+	* type - The message type.
+	* name (name_size) - Name of the requested tensor.
+	* step_id - Step ID.
+	* request_index - Request index.
+	* remote_addr/rkey - Address/rkey of the result/proxy tensor. Irrelevant for first-time request.
+	* is_dead/data_type/tensor_shape/tensor_bytes - The current meta-data as stored in the receiver local cache. The sender will use that information to know if the receiver's cache requires updating.
+* **RDMA_MESSAGE_META_DATA_RESPONSE**  - (sender ==> receiver) The meta-data update message in case meta-data had changed (or if it is the first time the tensor is requested).
+	* type - The message type.
+	* request_index - Request index.
+	* is_dead/data_type/tensor_shape/tensor_bytes - The up-to-date meta-data.
+	* checksum - In data validation mode, this will hold the checksum of the source tensor.
+* **RDMA_MESSAGE_TENSOR_RE_REQUEST** - (receiver ==> sender) Tensor re-requset after meta-data update and reallocation of result/proxy tensors.
+	* type - The message type.
+	* name (name_size) - Name of the requested tensor.
+	* step_id - Step ID.
+	* request_index - Request index.
+	* remote_addr/rkey - Address/rkey of the reallocated result/proxy tensor.
+* **RDMA_MESSAGE_ERROR_STATUS** - (sender ==> receiver) Notify the receiver that an error had occured on the sender side, so it can propagate it to the upper levels.
+	* type - The message type.
+	* name (name_size) - Name of the requested tensor.
+	* step_id - Step ID.
+	* request_index - Request index.
+	* error_status - The error status (code, message, details).
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.cc b/tensorflow/contrib/verbs/grpc_verbs_service.cc
index f2af6b79fb..742f946c95 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service.cc
+++ b/tensorflow/contrib/verbs/grpc_verbs_service.cc
@@ -122,17 +122,15 @@ Status GrpcVerbsService::GetRemoteAddressSync(
   rc->SetRemoteAddress(ra, false);
   rc->Connect();
   int i = 0;
-  int idx[] = {1, 0, 3, 2};
-  std::vector<RdmaBuffer*> mb(rc->message_buffers());
-  CHECK_EQ(request->mr_size(), 4);
+  int idx[] = {1, 0};
+  std::vector<RdmaMessageBuffer*> mb(rc->message_buffers());
+  CHECK_EQ(request->mr_size(), RdmaChannel::kNumMessageBuffers);
   for (const auto& mr : request->mr()) {
     // the connections are crossed, i.e.
     // local tx_message_buffer <---> remote rx_message_buffer_
     // local rx_message_buffer <---> remote tx_message_buffer_
-    // local tx_ack_buffer <---> remote rx_ack_buffer_
-    // local rx_ack_buffer <---> remote tx_ack_buffer_
-    // hence idx[] = {1, 0, 3, 2}.
-    RdmaBuffer* rb = mb[idx[i]];
+    // hence idx[] = {1, 0}.
+    RdmaMessageBuffer* rb = mb[idx[i]];
     RemoteMR rmr;
     rmr.remote_addr = mr.remote_addr();
     rmr.rkey = mr.rkey();
diff --git a/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md b/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md
new file mode 100644
index 0000000000..956b8f2147
--- /dev/null
+++ b/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md
@@ -0,0 +1,87 @@
+## Verbs implementation to use direct tensor writes (0 copies)
+
+### Motivation:
+
+Following HKUST research on the use of GPU direct, and their [GDR implementation](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/gdr/README.md), we wish to adopt the 0 copies approach and apply it to the current verbs implementation, while keeping the current implementation advantages, such as configurability and the use of RDMA for control messages.
+
+### Performance:
+
+Compared with the current GRPC, verbs and GDR implementation, the result implementation gave the best performance for every model, with any number of nodes. For VGG16 on 8 nodes with 4 P100 GPUs each, the prototype beat the second place by over 15%.
+
+### Implementation requirements:
+
+1. Tensor writes need to be done directly from the source Tensor to the destination Tensor, with no memory copies in between. This should be done for all DMAble tensors which are located either on CPU or on a RDMA compatible GPU device (GPU direct). 
+2. Non DMAble tensors (CanMemCopy == false) will be serialized to proto on the sender side, RDMA written to a registered buffer on the receiver side, and then deserialized by the receiver.
+3. Tensors which are located on a non-RDMA-compatible GPU, will be RDMA written to a registered CPU proxy buffer on the receiver side, and then copied to GPU by the receiver.
+
+### Implementation constrains:
+
+For best stability and proof of correctness, we will divide the implementation to two stages:
+1. At first stage we will keep changes to the current implementation to the minimum possible. The expense will be that we may have unused or unnecessary code leftovers, which may also affect performance. 
+2. At second stage, we will re-iterate over the code and remove irrelevant code parts.
+The design of the solution aims that we will achieve both stages with relative ease. 
+
+### Design guidelines:
+
+1. Since we do not want to do any unnecessary memory copying, we will no longer allocate a fixed CPU buffer as the destination for the RDMA write. Instead we will do the writing directly to the result tensor, or if the result tensor is on a device which does not support RDMA, we will do the writing to a proxy CPU tensor and then copy its content to the result tensor.
+2. The address of the destination Tensor needs to be sent to the sender side for writing, meaning that the result/proxy tensor should be pre-allocated on the receiver side, prior to sending the tensor request. In order to do that, we need to know its meta-data, i.e. shape and data-type for DMAble tensors, and proto-size for serialized tensors. Unfortunately, this information is only available on the sender side which complicates manners. In order to avoid sending extra messages for querying the meta-data on each step, we store a local meta-data cache per tensor. Based on the assumption that the meta-data of a tensor rarely changes between steps, we expect that on most times the cache will only be updated once. When the sender receives a request for a tensor, if it is the first time this tensor is requested, or in the rare case that the meta-data did change, the sender will first send a meta-data response, on which the receiver will update the local cache, and reallocate the result/proxy tensors if required. When the receiver sends the tensor request, it will contain also the meta-data currently stored in its local cache, so the sender can compare it to see if there was a change.
+3. When the sender writes the tensor content to the result tensor, no additional data is being written with it. That means we need to reside on ibverbs immediate (uint32_t) to indicate which request we are responding to (in order to trigger the receive callback). The easiest and most elegant way is to key the recv callback with a unique request_index (uint32_t), instead of the current key_with_step_id (string). 
+4. Since the sender no longer writes the tensor from/to fixed buffers, we no longer need to schedule the writes using the local/remote status. In addition we no longer rely on the RmdaTensorBuffer members as the source/destination addresses and rkey/lkey. Instead, each RdmaTensorBuffer will hold multiple "Response" objects (one per step-id), from which we derive destination address and rkey. The source address and lkey are always the ones of the source Tensor.
+5. With the addition of tensor pre-allocation, we noticed there is a large code similarity between sending the first tensor request and re-sending the request in case of meta-data changes. After implementing a common method for tensor pre-allocation, it turned out that implementation becomes much simpler by encapsulating the process of request sending/re-sending, meta-data response callback and content response callback, all in a single "Request" class. The request class holds all the relevant request information, which reduces excessive parameter passing and lambda capturing. This decision is purely for elegance and code simplicity, and we decided to implement it in first stage because it makes the implementation much easier.
+
+### New types/classes:
+
+* **enum RdmaImmDataType** - Immediate types to distinguish between different RDMA writes on the remote side. Ack writes and control-message writes have a fixed immediate value. The rest of the writes are tensor writes and the immediate value is the relevant request index.
+* **enum  RdmaWriteIDType**    - Types to distinguish between different RDMA write-complete events: Ack, control message, tensor DMA write and tensor proto write.
+* **class RdmaWriteID**        - Context for RDMA write complete events. Holds the RdmaWriteIDType and additional data.
+* **class RemoteAddressContext** - Remote address information (address + mr). Will be passed as write context for tensor proto writes.
+* **class RdmaTensorMetaData** - Meta-data for a tensor (type, shape, is_dead, proto_size).
+* **class RdmaMemoryMgr**      - Manages the meta-data cache, and the registered memory regions.
+* **class RdmaTensorRequest**  - Holds and manages information for a single tensor request throughout the entire receive cycle. API:
+	* Start() - Start the request.
+	* RecvTensorMetaData() - Receive meta-data from the remote side.
+	* RecvTensorContent() - Receive tensor content from the remote side and invoke the done() callback. 
+* **class RdmaTensorResponse** - Holds information for a single tensor response, such as destination address and rkey.
+
+### Protocol changes:
+
+The protocol messages themselves will remain mostly unchanged at the first stage, but will be used differently, as described below. The current messages structures already have most of the required fields for the new implementation. The only change is the "buffer_size" field which is no longer used since we are no longer sending additional information with the tensor, and thus it is now always equal to the "tensor_bytes" field. Instead, we use that field to pass the "request_index".
+
+### Message structure:
+
+| type | name_size | name | step_id | request_index | remote_addr | rkey | is_dead | data_type | tensor_shape | tensor_bytes |
+|------|---------- |------|---------|---------------|-------------|------|---------|-----------|--------------|--------------|
+|  1B  |    2B     | 512  |  8B     |      8B       |         8B  |   4B |      1B |     XB    |    XB        |    8B        |
+
+* **RDMA_MESSAGE_TENSOR_REQUEST**  - (receiver ==> sender) The original tensor request. 
+	* type - The message type.
+	* name (name_size) - Name of the requested tensor.
+	* step_id - Step ID.
+	* request_index - Request index.
+	* remote_addr/rkey - Address/rkey of the result/proxy tensor. Irrelevant for first-time request.
+	* is_dead/data_type/tensor_shape/tensor_bytes - The current meta-data as stored in the receiver local cache. The sender will use that information to know if the receiver's cache requires updating.
+* **RDMA_MESSAGE_BUFFER_REQUEST**  - (sender ==> receiver) The meta-data update message in case meta-data had changed (or if it is the first time the tensor is requested).
+	* type - The message type.
+	* request_index - Request index.
+	* is_dead/data_type/tensor_shape/tensor_bytes - The up-to-date meta-data.
+* **RDMA_MESSAGE_BUFFER_RESPONSE** - (receiver ==> sender) Tensor re-requset after meta-data update and reallocation of result/proxy tensors.
+	* type - The message type.
+	* name (name_size) - Name of the requested tensor.
+	* step_id - Step ID.
+	* request_index - Request index.
+	* remote_addr/rkey - Address/rkey of the reallocated result/proxy tensor.
+	* is_dead/data_type/tensor_shape/tensor_bytes - The new meta-data. Will be removed in the next phase.
+* **RDMA_MESSAGE_TENSOR_WRITE**    - (sender ==> receiver) No longer sent. There is only a direct write of the tensor content to the result/proxy tensor. Request index passed as the immediate value of the write.
+* **RDMA_MESSAGE_TENSOR_IDLE**     - (receiver ==> sender) No longer sent.
+
+![alt text](verbs_with_0_copies_phase1_protocol.jpg "Phase 1 message protocol")
+
+### Second stage optimizations:
+1. Remove unused code leftovers.
+2. Remove the ACK buffer completely, since we can rely completely on its immediate value.
+
+### Future optimizations:
+1. Map the tensor names to indexes, to significantly reduce the request message size.
+2. Understand the purpose of empty tensors and if we can skip remote fetching for them.
+3. Consider concatenating multiple requests and/or using multiple message buffers.
+4. Consider a no-request architecture.
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index ae9a384565..ec5271abe0 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -16,18 +16,19 @@ limitations under the License.
 #ifdef TENSORFLOW_USE_VERBS
 
 #include "tensorflow/contrib/verbs/rdma.h"
-#include <fcntl.h>
+#include "tensorflow/contrib/verbs/verbs_service.pb.h"
 #include <cstdlib>
 #include <fcntl.h>
-#include "tensorflow/contrib/verbs/verbs_util.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
 #if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
 #endif
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -41,32 +42,19 @@ namespace tensorflow {
 #define RoCE_V2 "RoCE v2"
 
 namespace {
-// hash name to 32-bit integer
-uint32_t NameHash(const string& name) {
-  return Hash32(name.data(), name.size(), 0x1234ABCD);
-}
 
 // convenience function for printing message
 string MessageTypeToString(RdmaMessageType rmt) {
   switch (rmt) {
-    case RDMA_MESSAGE_ACK:
-      return "RDMA_MESSAGE_ACK";
-      break;
-    case RDMA_MESSAGE_BUFFER_IDLE:
-      return "RDMA_MESSAGE_BUFFER_IDLE";
-      break;
-    case RDMA_MESSAGE_BUFFER_REQUEST:
-      return "RDMA_MESSAGE_BUFFER_REQUEST";
+    case RDMA_MESSAGE_META_DATA_UPDATE:
+      return "RDMA_MESSAGE_META_DATA_UPDATE";
       break;
-    case RDMA_MESSAGE_BUFFER_RESPONSE:
-      return "RDMA_MESSAGE_BUFFER_RESPONSE";
+    case RDMA_MESSAGE_TENSOR_RE_REQUEST:
+      return "RDMA_MESSAGE_TENSOR_RE_REQUEST";
       break;
     case RDMA_MESSAGE_TENSOR_REQUEST:
       return "RDMA_MESSAGE_TENSOR_REQUEST";
       break;
-    case RDMA_MESSAGE_TENSOR_WRITE:
-      return "RDMA_MESSAGE_TENSOR_WRITE";
-      break;
     default:
       return "UNKNOWN MESSAGE";
   }
@@ -347,7 +335,7 @@ uint32_t set_param(uint32_t default_val, const char* env_param) {
 
 enum ibv_mtu set_mtu(uint8_t port_num, ibv_context* context) {
   ibv_port_attr port_attr;
-  enum ibv_mtu mtu;
+  enum ibv_mtu mtu = IBV_MTU_512;
   string mtu_s;
   int rc, mtu_i;
 
@@ -468,97 +456,70 @@ void RdmaAdapter::Process_CQ() {
         rc->Recv();
         // imm_data is the index of RX buffer in the buffer table.
         uint32_t imm_data = wc_[i].imm_data;
-        RdmaBuffer* rb = rc->FindBuffer(imm_data);
+        RdmaMessageBuffer* rb;
         RdmaMessage rm;
-        RdmaMessage::ParseMessage(rm, rb->buffer_);
-        VLOG(2) << "recv RDMA message: " << MessageTypeToString(rm.type_);
 
-        if (rm.type_ == RDMA_MESSAGE_ACK) {
+        if (imm_data == RDMA_IMM_DATA_ACK) {
           // receive an ack to a message
           rb = rc->tx_message_buffer_;
           rb->SetBufferStatus(remote, idle);
           rb->SendNextItem();
-        } else if (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) {
-          // received a request-for-tensor message
-          // send ack to release remote tx message buffer
-          RdmaBuffer* ab = rc->tx_ack_buffer_;
-          ab->SendNextItem();
-          // find or create buffer
-          RdmaBuffer* tb = rc->FindOrCreateBuffer(rm.name_);
-          string key_with_step_id =
-              VerbsUtil::AppendStepidToKey(rm.name_, rm.step_id_);
-          tb->EnqueueItem(key_with_step_id);
-          // send the next tensor
-          worker_env_->compute_pool->Schedule([tb]() { tb->SendNextItem(); });
-        } else if (rm.type_ == RDMA_MESSAGE_BUFFER_IDLE) {
-          // receive tensor-buffer-ready message
-          // send ack to release remote tx message buffer
-          RdmaBuffer* ab = rc->tx_ack_buffer_;
-          ab->SendNextItem();
-          // find buffer
-          RdmaTensorBuffer* tb =
-              reinterpret_cast<RdmaTensorBuffer*>(rc->FindBuffer(rm.name_));
-          tb->SetBufferStatus(remote, idle);
-          worker_env_->compute_pool->Schedule([tb]() { tb->ReSendNextItem(); });
-        } else if (rm.type_ == RDMA_MESSAGE_BUFFER_REQUEST) {
-          // remote host requests to create a tensor buffer;
-          // send ack to release remote tx message buffer
-          RdmaBuffer* ab = rc->tx_ack_buffer_;
-          ab->SendNextItem();
-          // find or create the buffer
-          RdmaBuffer* tb = rc->FindOrCreateBuffer(rm.name_, TENSOR);
-          RemoteMR rmr;
-          rmr.remote_addr = rm.remote_addr_;
-          rmr.rkey = rm.rkey_;
-          tb->SetRemoteMR(rmr, true);
-          tb->CreateCPUBuffer(rm.buffer_size_);
-          // create RDMA_MESSAGE_BUFFER_RESPONSE message
-          RdmaMessage br;
-          br.type_ = RDMA_MESSAGE_BUFFER_RESPONSE;
-          br.name_size_ = rm.name_.size();
-          br.name_ = rm.name_;
-          br.buffer_size_ = rm.buffer_size_;
-          br.remote_addr_ = reinterpret_cast<uint64_t>(tb->buffer_);
-          br.rkey_ = tb->self_->rkey;
-          string message = RdmaMessage::CreateMessage(br);
-          RdmaBuffer* mb = rc->tx_message_buffer_;
-          mb->EnqueueItem(message);
-          mb->SendNextItem();
-        } else if (rm.type_ == RDMA_MESSAGE_BUFFER_RESPONSE) {
-          // remote creates a buffer and responds
-          // send ack to release remote tx message buffer
-          RdmaBuffer* ab = rc->tx_ack_buffer_;
-          ab->SendNextItem();
-          // find buffer
-          RdmaTensorBuffer* tb =
-              reinterpret_cast<RdmaTensorBuffer*>(rc->FindBuffer(rm.name_));
-          CHECK(rm.buffer_size_ == tb->size_)
-              << "rm.buffer_size = " << rm.buffer_size_
-              << "tb->size_ = " << tb->size_ << "rm.name_ = " << rm.name_;
-          RemoteMR rmr;
-          rmr.remote_addr = rm.remote_addr_;
-          rmr.rkey = rm.rkey_;
-          tb->SetRemoteMR(rmr, true);
-          tb->SetBufferStatus(local, idle);
-          tb->SetBufferStatus(remote, idle);
-          worker_env_->compute_pool->Schedule([tb]() { tb->ReSendNextItem(); });
-        } else if (rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) {
-          // tensor RDMA write completed
-          worker_env_->compute_pool->Schedule([rm, rc]() {
-            string key_with_step_id =
-                VerbsUtil::AppendStepidToKey(rm.name_, rm.step_id_);
-            rc->RunRecvCallback(key_with_step_id);
-          });
+          continue;
         }
-      } else if (wc_[i].opcode == IBV_WC_RDMA_WRITE) {
-        RdmaBuffer* rb = reinterpret_cast<RdmaBuffer*>(wc_[i].wr_id);
-        rb->SetBufferStatus(local, idle);
-        RdmaMessage rm;
+
+        if (imm_data <= RDMA_IMM_MAX_REQUEST_ID) {
+          // receive a tensor RDMA write
+          uint32_t request_index = imm_data;
+          RdmaTensorRequest* request = rc->GetTensorRequest(request_index);
+          request->RecvTensorContent();
+          continue;
+        }
+
+        // receive a control message
+        rb = rc->rx_message_buffer_;
         RdmaMessage::ParseMessage(rm, rb->buffer_);
-        VLOG(2) << "sent RDMA message: " << MessageTypeToString(rm.type_);
-        if (rm.type_ != RDMA_MESSAGE_ACK) {
-          worker_env_->compute_pool->Schedule([rb]() { rb->SendNextItem(); });
+        RdmaMessageBuffer::SendAck(rc);
+        RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+                    << ": Received " << MessageTypeToString(rm.type_) << " "
+                    << "#" << rm.request_index_ << ": " << rm.name_;
+
+        if (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) {
+          RdmaTensorResponse* response = rc->AddTensorResponse(rm);
+          response->Start();
+        } else if (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) {
+          RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
+          request->RecvTensorMetaData(rm.data_type_, rm.tensor_shape_,
+                                      rm.is_dead_, rm.tensor_bytes_);
+#ifdef RDMA_DATA_VALIDATION
+          request->RecvTensorChecksum(rm.checksum_);
+#endif
+        } else if (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST) {
+          RdmaTensorResponse* response = rc->UpdateTensorResponse(rm);
+          response->Resume();
+        } else if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
+          RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
+          request->RecvErrorStatus(rm.status_);
+        }
+      } else if (wc_[i].opcode == IBV_WC_RDMA_WRITE) {
+        RdmaWriteID* wr_id = reinterpret_cast<RdmaWriteID*>(wc_[i].wr_id);
+        RDMA_LOG(2) << "Write complete of type " << wr_id->write_type;
+        switch (wr_id->write_type) {
+          case RDMA_WRITE_ID_ACK:
+            break;
+          case RDMA_WRITE_ID_MESSAGE: {
+            RdmaMessageBuffer* rb =
+                reinterpret_cast<RdmaMessageBuffer*>(wr_id->write_context);
+            rb->SetBufferStatus(local, idle);
+            rb->SendNextItem();
+            break;
+          }
+          case RDMA_WRITE_ID_TENSOR_WRITE: {
+            RdmaTensorResponse* response =
+                reinterpret_cast<RdmaTensorResponse*>(wr_id->write_context);
+            response->Destroy();
+          }
         }
+        delete wr_id;
       }
     }
   }
@@ -588,8 +549,10 @@ int RdmaChannel::PingPostSend() {
 
 RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
                          const string remote_name)
-    : adapter_(adapter), local_name_(local_name), remote_name_(remote_name) {
-
+    : adapter_(adapter),
+      local_name_(local_name),
+      remote_name_(remote_name),
+      request_serial_(0) {
   struct ibv_sge list;
 
   mr_ = ibv_reg_mr(adapter_->pd_, ping_buff_, kPingBuffSize,
@@ -651,29 +614,15 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
 
   // create message and ack buffers, then initialize the tables.
   {
-    const string buffer_names[] = {"tx_message_buffer", "rx_message_buffer",
-                                   "tx_ack_buffer",     "rx_ack_buffer"};
+    const string buffer_names[] = {"tx_message_buffer", "rx_message_buffer"};
     tx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[0]);
     rx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[1]);
-    tx_ack_buffer_ = new RdmaAckBuffer(this, buffer_names[2]);
-    rx_ack_buffer_ = new RdmaAckBuffer(this, buffer_names[3]);
     message_buffers_.reserve(kNumMessageBuffers);
     message_buffers_.push_back(tx_message_buffer_);
     message_buffers_.push_back(rx_message_buffer_);
-    message_buffers_.push_back(tx_ack_buffer_);
-    message_buffers_.push_back(rx_ack_buffer_);
     // create buffer on host
     tx_message_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize);
     rx_message_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize);
-    tx_ack_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaAckBufferSize);
-    rx_ack_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaAckBufferSize);
-    // bt_mu_.lock() is not used in constructor.
-    for (int i = 0; i < kNumMessageBuffers; i++) {
-      uint32_t index = NameHash(buffer_names[i]);
-      buffer_table_.insert({index, message_buffers_[i]});
-      buffer_index_name_table_.insert({index, buffer_names[i]});
-      buffer_name_index_table_.insert({buffer_names[i], index});
-    }
   }
   CHECK(PingPostRecv() == 0) << "Couldn't post receive from " << remote_name_
                              << " with error " << std::strerror(errno);
@@ -684,8 +633,6 @@ RdmaChannel::~RdmaChannel() {
   CHECK(!ibv_destroy_qp(qp_)) << "Failed to destroy QP";
   delete tx_message_buffer_;
   delete rx_message_buffer_;
-  delete tx_ack_buffer_;
-  delete rx_ack_buffer_;
 }
 
 void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) {
@@ -716,114 +663,31 @@ void RdmaChannel::Recv() {
   CHECK(!ibv_post_recv(qp_, &wr, &bad_wr)) << "Failed to post recv";
 }
 
-// Lookup 32-bit buffer index from buffer name
-// Args:
-//   buffer_name: name of the buffer
-// Returns:
-//   32-bit index
-uint32_t RdmaChannel::LookupBufferIndex(const string& buffer_name) {
-  mutex_lock lock{bt_mu_};
-  BufferNameIndexTable::iterator iter =
-      buffer_name_index_table_.find(buffer_name);
-  CHECK(iter != buffer_name_index_table_.end());
-  return iter->second;
-}
-
-// Find a buffer by its 32-bit index
-// Args:
-//   index: 32-bit hash code of the tensor buffer name
-// Returns:
-//   name of the tensor buffer
-RdmaBuffer* RdmaChannel::FindBuffer(const uint32_t index) {
-  mutex_lock lock{bt_mu_};
-  BufferTable::iterator iter = buffer_table_.find(index);
-  CHECK(iter != buffer_table_.end());
-  return iter->second;
-}
-
-// Find a buffer by its name
-// Args:
-//   name: name of the buffer
-// Returns:
-//   the named rdma buffer
-RdmaBuffer* RdmaChannel::FindBuffer(const string& name) {
-  uint32_t index = LookupBufferIndex(name);
-  return FindBuffer(index);
-}
-
-// Find a buffer if it exists, otherwise create one.
-// The memory inside the created buffer is not allocated.
-// Args:
-//   name: the name of the buffer
-//   buffer_type: TENSOR, MESSAGE or ACK.
-// Returns:
-//   the named buffer
-RdmaBuffer* RdmaChannel::FindOrCreateBuffer(const string& name,
-                                            BufferType buffer_type) {
-  mutex_lock lock{bt_mu_};
-  RdmaBuffer* rb;
-  // find index
-  BufferNameIndexTable::iterator iter = buffer_name_index_table_.find(name);
-  if (iter != buffer_name_index_table_.end()) {
-    uint32_t index = iter->second;
-    // find buffer
-    BufferTable::iterator iter = buffer_table_.find(index);
-    CHECK(iter != buffer_table_.end());
-    rb = iter->second;
-  } else {
-    uint32_t index = NameHash(name);
-    if (buffer_type == TENSOR) {
-      rb = new RdmaTensorBuffer(this, name);
-    } else if (buffer_type == MESSAGE) {
-      rb = new RdmaMessageBuffer(this, name);
-    } else if (buffer_type == ACK) {
-      rb = new RdmaAckBuffer(this, name);
-    }
-    buffer_name_index_table_.insert({name, index});
-    buffer_index_name_table_.insert({index, name});
-    buffer_table_.insert({index, rb});
+RdmaTensorRequest* RdmaChannel::InsertTensorRequest(
+    const string& key, int64 step_id, Device* dst_dev,
+    const Rendezvous::Args recv_args,
+    const RdmaTensorRequest::RecvDoneCallback& done) {
+  mutex_lock lock{ct_mu_};
+  uint32_t request_index = request_serial_++;
+  if (request_serial_ > RDMA_IMM_MAX_REQUEST_ID) {
+    request_serial_ = 0;
   }
-  CHECK(rb);
-  return rb;
+  RdmaTensorRequest request(request_index, key, step_id, this, dst_dev,
+                            recv_args, done);
+  auto it = request_table_.emplace(request_index, request);
+  return &it.first->second;
 }
 
-// Insert callback to the callback_table.
-// The callback is activated when the corresponding tensor is received.
-// Arg:
-//   key: the name of the tensor
-//   recv_done: the callback associated with the tensor.
-// Returns:
-//   None
-void RdmaChannel::InsertRecvCallback(const string& key,
-                                     std::function<void()> recv_done) {
+void RdmaChannel::RemoveTensorRequest(uint32_t request_index) {
   mutex_lock lock{ct_mu_};
-  callback_table_.insert({key, recv_done});
+  request_table_.erase(request_index);
 }
 
-// Remove callback from the callback_table.
-// Arg:
-//   key: the name of the tensor
-// Returns:
-//   None
-void RdmaChannel::RemoveRecvCallback(const string& key) {
+RdmaTensorRequest* RdmaChannel::GetTensorRequest(uint32_t request_index) {
   mutex_lock lock{ct_mu_};
-  callback_table_.erase(key);
-}
-
-// Run named callback in the callback_table.
-// Arg:
-//   key: the name of the tensor
-// Returns:
-//   None
-void RdmaChannel::RunRecvCallback(const string& key) {
-  std::function<void()> recv_done;
-  {
-    mutex_lock lock{ct_mu_};
-    CallbackTable::iterator iter = callback_table_.find(key);
-    CHECK(iter != callback_table_.end());
-    recv_done = iter->second;
-  }
-  recv_done();
+  RequestTable::iterator iter = request_table_.find(request_index);
+  CHECK(iter != request_table_.end());
+  return &iter->second;
 }
 
 void RdmaChannel::Connect() {
@@ -888,25 +752,22 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
 
     connected_ = true;
   } else {
-    LOG(INFO) << "channel already connected";
+    RDMA_LOG(2) << "channel already connected";
   }
 }
 
-RdmaBuffer::RdmaBuffer(RdmaChannel* channel, string name)
+RdmaMessageBuffer::RdmaMessageBuffer(RdmaChannel* channel, string name)
     : channel_(channel), name_(name) {}
 
-RdmaBuffer::~RdmaBuffer() {
+RdmaMessageBuffer::~RdmaMessageBuffer() {
   CHECK(!ibv_dereg_mr(self_)) << "ibv_dereg_mr failed";
   FreeBuffer();
 }
 
-void RdmaBuffer::FreeBuffer() {
+void RdmaMessageBuffer::FreeBuffer() {
   if ((buffer_ != nullptr) && buffer_on_host_) {
     free(buffer_);
   }
-  // TODO
-  // release buffer if it is on device.
-  // We don't support RDMABuffer on device at this moment.
 }
 
 // Allocate CPU memory for the Rdma buffer
@@ -915,7 +776,7 @@ void RdmaBuffer::FreeBuffer() {
 //   lock: whether or not mutex_lock the process to protect concurrency.
 // Returns:
 //   None
-void RdmaBuffer::CreateCPUBuffer(size_t size, bool lock) {
+void RdmaMessageBuffer::CreateCPUBuffer(size_t size, bool lock) {
   CHECK(size > 0);
   if (lock) {
     mu_.lock();
@@ -943,7 +804,7 @@ void RdmaBuffer::CreateCPUBuffer(size_t size, bool lock) {
 //   override: whether override existing information
 // Returns:
 //   None
-void RdmaBuffer::SetRemoteMR(RemoteMR rmr, bool override) {
+void RdmaMessageBuffer::SetRemoteMR(RemoteMR rmr, bool override) {
   mutex_lock lock{mu_};
   if ((override) || (remote_status_ == none)) {
     remote_.remote_addr = rmr.remote_addr;
@@ -956,63 +817,51 @@ void RdmaBuffer::SetRemoteMR(RemoteMR rmr, bool override) {
 }
 
 // Put a task in the buffer's job queue
-void RdmaBuffer::EnqueueItem(string item) {
+void RdmaMessageBuffer::EnqueueItem(string item) {
   mutex_lock lock{mu_};
   queue_.push(item);
 }
 
 // Rdma-Write the content of the buffer
-void RdmaBuffer::Write(uint32_t imm_data, size_t buffer_size) {
+void RdmaMessageBuffer::Write(uint32_t imm_data, size_t buffer_size) {
+  Write(channel_, imm_data, buffer_size, (uint64_t)buffer_, self_->lkey,
+        remote_.remote_addr, remote_.rkey, RDMA_WRITE_ID_MESSAGE, this);
+}
+
+// Generalized Write method
+void RdmaMessageBuffer::Write(const RdmaChannel* channel, uint32_t imm_data,
+                              size_t buffer_size, uint64_t src_addr,
+                              uint32_t lkey, uint64_t remote_addr,
+                              uint32_t rkey, RdmaWriteIDType write_type,
+                              void* write_context) {
   struct ibv_sge list;
-  list.addr = (uint64_t)buffer_;
+  list.addr = src_addr;
   list.length = buffer_size;
-  list.lkey = self_->lkey;
+  list.lkey = lkey;
 
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
-  wr.wr_id = (uint64_t) this;
+  wr.wr_id = (uint64_t) new RdmaWriteID(write_type, write_context);
   wr.sg_list = &list;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
   wr.send_flags = IBV_SEND_SIGNALED;
   wr.imm_data = imm_data;
-  wr.wr.rdma.remote_addr = (uint64_t)remote_.remote_addr;
-  wr.wr.rdma.rkey = remote_.rkey;
+  wr.wr.rdma.remote_addr = remote_addr;
+  wr.wr.rdma.rkey = rkey;
 
   struct ibv_send_wr* bad_wr;
-  CHECK(!ibv_post_send(channel_->qp_, &wr, &bad_wr)) << "Failed to post send";
-}
-
-RdmaAckBuffer::RdmaAckBuffer(RdmaChannel* channel, string name)
-    : RdmaBuffer(channel, name) {}
-
-RdmaMessageBuffer::RdmaMessageBuffer(RdmaChannel* channel, string name)
-    : RdmaBuffer(channel, name) {}
-
-RdmaTensorBuffer::RdmaTensorBuffer(RdmaChannel* channel, string name)
-    : RdmaBuffer(channel, name) {}
-
-RdmaTensorBuffer::~RdmaTensorBuffer() {
-  for (Itable it = retable.begin(); it != retable.end(); ++it) {
-    delete (it->second);
-  }
+  CHECK(!ibv_post_send(channel->qp_, &wr, &bad_wr)) << "Failed to post send";
 }
 
 // Send the next ack from the buffer's job queue.
-void RdmaAckBuffer::SendNextItem() {
-  uint32_t imm_data = LookupBufferIndex("rx_ack_buffer");
-  RdmaMessage rm;
-  rm.name_ = "rx_ack_buffer";
-  rm.type_ = RDMA_MESSAGE_ACK;
-  rm.name_size_ = rm.name_.size();
-  string message = RdmaMessage::CreateMessage(rm);
-  memcpy(buffer_, message.data(), message.size());
-  Write(imm_data, message.size());
+void RdmaMessageBuffer::SendAck(const RdmaChannel* channel) {
+  Write(channel, RDMA_IMM_DATA_ACK, 0, 0, 0, 0, 0, RDMA_WRITE_ID_ACK, nullptr);
 }
 
 // Send the next message from the buffer's job queue.
 void RdmaMessageBuffer::SendNextItem() {
-  uint32_t imm_data = LookupBufferIndex("rx_message_buffer");
+  uint32_t imm_data = RDMA_IMM_DATA_MESSAGE;
   mu_.lock();
   if (!queue_.empty() && (local_status_ == idle) && (remote_status_ == idle)) {
     local_status_ = busy;
@@ -1029,244 +878,392 @@ void RdmaMessageBuffer::SendNextItem() {
   }
 }
 
-Rendezvous::DoneCallback RdmaTensorBuffer::getRecvTensorCallback(
-    const string& key_with_step_id, const string& key, int64 step_id,
-    const Rendezvous::ParsedKey& parsed) {
-  Rendezvous::DoneCallback cb = [this, key_with_step_id, key, step_id, parsed](
-      const Status& status, const Rendezvous::Args& send_args,
-      const Rendezvous::Args& recv_args, const Tensor& in, bool is_dead) {
-    CHECK(status.ok()) << "RecvLocalAsync was not ok, key" << key_with_step_id
-                       << " error message: " << status.error_message();
-    size_t buffer_size = RdmaMessage::kMessageTotalBytes;
-    size_t tensor_bytes = 0;
-    // Figures out which device the tensor is hosted on.
-    Device* src_dev = nullptr;
-    Status s = channel_->adapter_->worker_env_->device_mgr->LookupDevice(
-        parsed.src_device, &src_dev);
-    CHECK(s.ok()) << "src device not found";
-    // Does the device have the right incarnation number we expect?
-    CHECK(src_dev->attributes().incarnation() == parsed.src_incarnation)
-        << "RecvTensor expects a different device incarnation: "
-        << parsed.src_incarnation << " vs. "
-        << src_dev->attributes().incarnation()
-        << ". Your worker job was probably restarted. Check your "
-        << "worker job for the reason why it was restarted.";
-    Device* dst_dev = nullptr;
-    // destination is on CPU.
-    s = channel_->adapter_->worker_env_->device_mgr->LookupDevice("CPU:0",
-                                                                  &dst_dev);
-    CHECK(s.ok()) << "dst device not found";
-    AllocatorAttributes dst_alloc_attr;
-    dst_alloc_attr.set_on_host(true);
-
-    bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
-    // string tensor needs to be serialized
-    Tensor copy;
-    TensorProto proto;
-    if (src_dev->tensorflow_gpu_device_info() &&
-        (!send_args.alloc_attrs.on_host())) {
 #if GOOGLE_CUDA
-      CHECK(send_args.device_context) << "send dev name: " << src_dev->name()
-                                      << " gpu_info: "
-                                      << src_dev->tensorflow_gpu_device_info();
-
-      if (can_memcpy) {
-        AllocatorAttributes host_alloc_attrs;
-        host_alloc_attrs.set_gpu_compatible(true);
-        host_alloc_attrs.set_on_host(true);
-        Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
-        copy = Tensor(alloc, in.dtype(), in.shape());
-        tensor_bytes = in.TotalBytes();
-        buffer_size += tensor_bytes;
-        GPUUtil::CopyGPUTensorToCPU(
-            src_dev, send_args.device_context, &in, &copy,
-            [this, copy, tensor_bytes, buffer_size, key, in, step_id,
-             key_with_step_id, is_dead, send_args, recv_args](const Status& s) {
-              CHECK(s.ok()) << "copy tensor from gpu sync";
-              StringPiece copy_buf;
-              copy_buf = copy.tensor_data();
-              PostCopyOperations(true, buffer_size, tensor_bytes, key, in,
-                                 step_id, is_dead, key_with_step_id, &copy,
-                                 NULL, &copy_buf, send_args, recv_args);
-            });
-      } else {
-        // "val" is on a GPU. No longer uses GPUUtil to fill the proto, use
-        // aync instead
-        GPUUtil::SetProtoFromGPU(
-            in, src_dev, send_args.device_context, &proto, is_dead,
-	    [this, proto, buffer_size, key, in, step_id, key_with_step_id,
-            is_dead, send_args, recv_args](const Status& s) mutable {
-              CHECK(s.ok()) << "copy proto from gpu sync";
-              auto tensor_bytes = proto.ByteSize();
-              buffer_size += tensor_bytes;
-              PostCopyOperations(false, buffer_size, tensor_bytes, key, in,
-                                 step_id, is_dead, key_with_step_id, NULL,
-                                 &proto, NULL, send_args, recv_args);
-            });
-      }
-#endif  // GOOGLE_CUDA
-    } else {
-      // tensor is in CPU memory.
-      StringPiece copy_buf;
-      if (can_memcpy) {
-        copy_buf = in.tensor_data();
-        tensor_bytes = in.TotalBytes();
-      } else {
-        in.AsProtoTensorContent(&proto);
-        tensor_bytes = proto.ByteSize();
-      }
-      buffer_size += tensor_bytes;
-      PostCopyOperations(can_memcpy, buffer_size, tensor_bytes, key, in,
-                         step_id, is_dead, key_with_step_id, &copy, &proto,
-                         &copy_buf, send_args, recv_args);
+static void CountCopies(const std::string& key, void* src_addr, void* dst_addr,
+                        size_t tensor_bytes, bool is_gpu_to_cpu) {
+#ifdef RDMA_COUNT_COPIES
+  static uint64_t numGPUToCPUCopies = 0;
+  static uint64_t numGPUToCPUCopiedBytes = 0;
+  static uint64_t numCPUToGPUCopies = 0;
+  static uint64_t numCPUToGPUCopiedBytes = 0;
+  static uint64_t numTotalCopies = 0;
+
+  if (is_gpu_to_cpu) {
+    ++numGPUToCPUCopies;
+    numGPUToCPUCopiedBytes += tensor_bytes;
+  } else {
+    ++numCPUToGPUCopies;
+    numCPUToGPUCopiedBytes += tensor_bytes;
+  }
+  if ((++numTotalCopies % 0x400) == 0) {
+    RDMA_LOG(0) << "Tensor copies:"
+                << " GPU to CPU: " << numGPUToCPUCopies
+                << " (" << numGPUToCPUCopiedBytes << " Bytes)"
+                << " CPU to GPU: " << numCPUToGPUCopies
+                << " (" << numCPUToGPUCopiedBytes << " Bytes)";
+  }
+  RDMA_LOG(2) << "Copying tensor " << key
+              << " From: " << src_addr << " To: " << dst_addr;
+#endif // RDMA_COUNT_COPIES
+}
+#endif // GOOGLE_CUDA
+
+#ifdef RDMA_DATA_VALIDATION
+static uint64_t Checksum(Device* device, const DeviceContext* device_context,
+                         const Tensor& in) {
+  uint64 checksum = 0;
+  if (DataTypeCanUseMemcpy(in.dtype())) {
+#if GOOGLE_CUDA
+    if (in.TotalBytes() == 0) {
+      return 0;
     }
-  };
-  return cb;
+    checksum = (device_context != nullptr)
+                   ? GPUUtil::Checksum(device, device_context, in)
+                   : GPUUtil::Checksum(in);
+#endif // GOOGLE_CUDA
+  } else {
+    string s = in.SummarizeValue(999999);
+    checksum = Hash64(s.c_str(), s.size(), 0);
+  }
+  return checksum;
 }
 
-// Send the next tensor from the buffer's job queue.
-void RdmaTensorBuffer::SendNextItem() {
-  // get the key
-  string key_with_step_id = "";
-  {
-    mutex_lock lock{mu_};
-    if (!queue_.empty()) {
-      key_with_step_id = queue_.front();
-      queue_.pop();
+static void ValidateChecksum(uint64_t expected, uint64_t actual,
+                             const Tensor& in, uint32_t request_index,
+                             const std::string& key, const std::string& msg) {
+  RDMA_LOG(2) << "Request #" << request_index << ": " << key
+              << ": Checksum: " << std::hex << " Expected = 0x" << expected
+              << ". Actual = 0x" << actual << ".";
+
+  if (expected != actual) {
+    // Checksum failed. There is one case where this is allowed - if the
+    // tensor is an AssignAdd of the global step. Since the data-validation
+    // always postpones the Tensor response in order to send a checksum message,
+    // it is possible that the global-step was updated while the response was
+    // still in queue.
+    if ((in.TotalBytes() == 8) && (in.dtype() == DT_INT64)) {
+      int64_t prev_val = *(int64_t*)DMAHelper::base(&in) - 1;
+      actual = Hash64((const char*)&prev_val, 8, 0);
+    }
+    if (expected != actual) {
+      LOG(FATAL) << "[" << msg << "]: Checksum validation failed for request #"
+                 << request_index << ": " << key << std::hex << " "
+                 << DataTypeString(in.dtype()) << " "
+                 << in.shape().DebugString() << " (0x" << in.TotalBytes()
+                 << " bytes): "
+                 << " Expected 0x" << expected << ". Got 0x" << actual << ".";
     }
   }
+}
+#endif // RDMA_DATA_VALIDATION
+
+#if GOOGLE_CUDA
+// Sync the 'done' operation on the GPU stream, but without all the data
+// copying.
+static void StreamGPUOp(Device* gpu_device,
+                        const DeviceContext* device_context,
+                        StatusCallback done) {
+  Tensor dummy1, dummy2;
+  GPUUtil::CopyGPUTensorToCPU(
+      gpu_device, device_context, &dummy1, &dummy2, done);
+}
+#endif  // GOOGLE_CUDA
+
+RdmaTensorResponse* RdmaChannel::AddTensorResponse(const RdmaMessage& rm) {
+  mutex_lock lock{mu_};
+  auto it =
+      responses_table_.emplace(rm.request_index_, RdmaTensorResponse(this, rm));
+  CHECK(it.second) << "Response with the ID " << rm.request_index_
+                   << " already exists.";
+  return &it.first->second;
+}
+
+RdmaTensorResponse* RdmaChannel::UpdateTensorResponse(const RdmaMessage& rm) {
+  mutex_lock lock{mu_};
+  auto it = responses_table_.find(rm.request_index_);
+  CHECK(it != responses_table_.end()) << "No response found.";
+  RdmaTensorResponse* response = &it->second;
+  response->Update(rm);
+  return response;
+}
 
-  // send the tensor if a key is acquired.
-  if (key_with_step_id != "") {
-    VLOG(2) << "try to send tensor: " << key_with_step_id;
-    string key;
-    int64 step_id;
-    VerbsUtil::GetKeyAndStepId(key_with_step_id, key, step_id);
-    CHECK(key.compare(name_) == 0);
-    Rendezvous::ParsedKey parsed;
-    Rendezvous::ParseKey(key, &parsed);
-    Rendezvous::DoneCallback cb =
-        getRecvTensorCallback(key_with_step_id, key, step_id, parsed);
-    channel_->adapter_->worker_env_->rendezvous_mgr->RecvLocalAsync(step_id,
-                                                                    parsed, cb);
+void RdmaChannel::RemoveTensorResponse(uint32_t request_index) {
+  mutex_lock lock{mu_};
+  responses_table_.erase(request_index);
+}
+
+void RdmaTensorResponse::Start() {
+  Rendezvous::ParsedKey parsed;
+  Status s = Rendezvous::ParseKey(rm_.name_, &parsed);
+  if (!s.ok()) {
+    SendErrorStatus(s);
+    return;
   }
+
+  channel_->adapter_->worker_env_->rendezvous_mgr->RecvLocalAsync(
+      rm_.step_id_, parsed,
+      [this, parsed](const Status& status, const Rendezvous::Args& send_args,
+                     const Rendezvous::Args& recv_args, const Tensor& in,
+                     bool is_dead) {
+        CHECK(status.ok()) << "RecvLocalAsync was not ok."
+                           << " error message: " << status.error_message();
+        RecvHandler(parsed, send_args, recv_args, in, is_dead);
+      });
 }
 
-void RdmaTensorBuffer::ReSendNextItem() {
-  // get the key
-  string key_with_step_id = "";
-  {
-    mutex_lock lock{mu_};
-    if (!requeue.empty()) {
-      key_with_step_id = requeue.front();
-      requeue.pop();
-    }
+void RdmaTensorResponse::Resume() { SendContent(*tensor_, *proto_, is_dead_); }
+
+// Helper for RecvTensor. Validates "key" and returns the source
+// device in "*src_dev".
+Status RdmaTensorResponse::PrepareRecvTensor(
+    const Rendezvous::ParsedKey& parsed, Device** src_dev) {
+  // Figures out which device the tensor is hosted on.
+  string local_name = DeviceNameUtils::LocalName(parsed.src_device);
+  TF_RETURN_IF_ERROR(channel_->adapter_->worker_env_->device_mgr->LookupDevice(
+      local_name, src_dev));
+
+  // Does the device have the right incarnation number we expect?
+  if ((*src_dev)->attributes().incarnation() != parsed.src_incarnation) {
+    return errors::Aborted(
+        "RecvTensor expects a different device incarnation: ",
+        parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
+        ". Your worker job was probably restarted. Check your "
+        "worker job for the reason why it was restarted.");
   }
 
-  // send the tensor if a key is acquired.
-  if (key_with_step_id != "") {
-    VLOG(2) << "try to send tensor: " << key_with_step_id;
-    string key;
-    int64 step_id;
-    VerbsUtil::GetKeyAndStepId(key_with_step_id, key, step_id);
-    CHECK(key.compare(name_) == 0);
-    Rendezvous::ParsedKey parsed;
-    Rendezvous::ParseKey(key, &parsed);
-    Rendezvous::DoneCallback cb =
-        getRecvTensorCallback(key_with_step_id, key, step_id, parsed);
-    ReItem* item;
-    {
-      mutex_lock lock{mu_};
-      Itable it = retable.find(key_with_step_id);
-      CHECK(it != retable.end()) << "Could not find dup-recv context";
-      item = it->second;
-      retable.erase(it);
+  return Status::OK();
+}
+
+void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
+                                     const Rendezvous::Args& send_args,
+                                     const Rendezvous::Args& recv_args,
+                                     const Tensor& in, bool is_dead) {
+  Status s = PrepareRecvTensor(parsed, &src_dev_);
+  if (!s.ok()) {
+    SendErrorStatus(s);
+    return;
+  }
+
+  meta_data_changed_ = TensorMetaDataChanged(in, is_dead);
+#ifdef RDMA_DATA_VALIDATION
+  // Always send a meta data message with the source checksum
+  meta_data_changed_ = rm_.type_ == RDMA_MESSAGE_TENSOR_REQUEST;
+  checksum_ = Checksum(src_dev_, send_args.device_context, in);
+#endif
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  // string tensor needs to be serialized
+  Tensor copy;
+  TensorProto proto;
+  const bool on_host = send_args.alloc_attrs.on_host();
+  if (src_dev_->tensorflow_gpu_device_info() && !on_host) {
+#if GOOGLE_CUDA
+    DeviceContext* send_dev_context = send_args.device_context;
+    CHECK(send_dev_context)
+        << "send dev name: " << src_dev_->name()
+        << " gpu_info: " << src_dev_->tensorflow_gpu_device_info();
+
+    if (can_memcpy) {
+      // If the tensor is located on a GDR compatible GPU, there is no need to
+      // copy it. We can send directly from the source, just need to make sure
+      // we are in sync with the GPU stream.
+      // If the tensor's meta-data changed however, we will need to clone it,
+      // so anyway we'll have to copy it from GPU to CPU first. If at some
+      // point in time Clone() is changed to only save a shallow copy, we can
+      // skip the copy here as well.
+      if ((in.TotalBytes() > 0) && !meta_data_changed_ &&
+          (RdmaMemoryMgr::Singleton().FindMemoryRegion(
+              (void*)DMAHelper::base(&in), in.TotalBytes()) != nullptr)) {
+        StreamGPUOp(src_dev_, send_dev_context,
+                    [this, in, proto, is_dead](const Status& s) {
+                      Send(in, proto, is_dead, s);
+                    });
+        return;
+      }
+
+      // The tensor must be copied from GPU to CPU, because either:
+      // 1. The tensor is located on a non GDR compatible GPU.
+      // 2. The tensor's meta-data has changed.
+      Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+      copy = Tensor(alloc, in.dtype(), in.shape());
+      CountCopies(rm_.name_, (void*)DMAHelper::base(&in),
+                  (void*)DMAHelper::base(&copy), in.TotalBytes(), true);
+      GPUUtil::CopyGPUTensorToCPU(
+          src_dev_, send_dev_context, &in, &copy,
+          [this, copy, proto, is_dead](const Status& s) {
+            Send(copy, proto, is_dead, s);
+          });
+    } else {
+      GPUUtil::SetProtoFromGPU(
+          in, src_dev_, send_args.device_context, &proto, is_dead,
+          [this, in, proto, is_dead](const Status& s) mutable {
+            Send(in, proto, is_dead, s);
+          });
+    }
+#else
+    SendErrorStatus(errors::Internal("No GPU device in process"));
+#endif  // GOOGLE_CUDA
+  } else {
+    // tensor is in CPU memory.
+    if (!can_memcpy) {
+      in.AsProtoTensorContent(&proto);
     }
-    cb(Status::OK(), item->send_args, item->recv_args, item->in, item->is_dead);
-    delete (item);
+    Send(in, proto, is_dead, Status::OK());
+  }
+}
+
+void RdmaTensorResponse::Send(const Tensor& in, const TensorProto& proto,
+                              bool is_dead, const Status& status) {
+  if (!status.ok()) {
+    SendErrorStatus(status);
+    return;
+  }
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  bool proto_size_changed = (!can_memcpy) &&
+                            (proto.ByteSize() != rm_.tensor_bytes_);
+  if (meta_data_changed_ || proto_size_changed) {
+    Clone(in, proto, is_dead);
+    SendMetaData(in, proto, is_dead);
+  } else {
+    SendContent(in, proto, is_dead);
+  }
+}
+
+bool RdmaTensorResponse::TensorMetaDataChanged(const Tensor& in, bool is_dead) {
+  return (rm_.data_type_ != in.dtype()) || (rm_.tensor_shape_ != in.shape()) ||
+         (rm_.is_dead_ != is_dead);
+}
+
+void RdmaTensorResponse::Clone(const Tensor& in, const TensorProto& proto,
+                               bool is_dead) {
+  // Clone the data to be sent later. For simplicity, we clone the tensor's
+  // data even if it is already a copy. Performance is less of a concern here
+  // since the meta-data hardly ever changes. The reason we create a copy, is
+  // that some tensors share their buffer between different step-ids, so the
+  // tensor content may change before re-request was completed.
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  if (can_memcpy && (in.TotalBytes() > 0)) {
+    AllocatorAttributes host_alloc_attrs;
+    host_alloc_attrs.set_nic_compatible(true);
+    host_alloc_attrs.set_on_host(true);
+    Allocator* allocator = src_dev_->GetAllocator(host_alloc_attrs);
+    tensor_ = new Tensor(allocator, in.dtype(), in.shape());
+    memcpy(DMAHelper::base(tensor_), DMAHelper::base(&in), in.TotalBytes());
+  } else {
+    tensor_ = new Tensor(in.dtype(), in.shape());
   }
+  if (!can_memcpy) {
+    proto_ = new TensorProto(proto);
+  }
+  is_dead_ = is_dead;
 }
 
-void RdmaTensorBuffer::PostCopyOperations(
-    bool can_memcpy, size_t buffer_size, size_t tensor_bytes, const string& key,
-    const Tensor& in, int64 step_id, bool is_dead,
-    const string& key_with_step_id, const Tensor* copy,
-    const TensorProto* proto, const StringPiece* copy_buf,
-    const Rendezvous::Args& send_args, const Rendezvous::Args& recv_args) {
-  // prepare message
+void RdmaTensorResponse::SendMetaData(const Tensor& in,
+                                      const TensorProto& proto, bool is_dead) {
+  RDMA_LOG(2) << "Request #" << rm_.request_index_
+              << ": Meta data changed: " << rm_.name_;
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  size_t tensor_bytes = (can_memcpy) ? in.TotalBytes() : proto.ByteSize();
+
+  // Send meta-data update:
   RdmaMessage rm;
-  rm.name_size_ = key.size();
-  rm.name_ = key;
+  rm.type_ = RDMA_MESSAGE_META_DATA_UPDATE;
+  rm.name_size_ = rm_.name_.size();
+  rm.name_ = rm_.name_;
   rm.tensor_shape_ = in.shape();
   rm.data_type_ = in.dtype();
-  rm.step_id_ = step_id;
+  rm.step_id_ = rm_.step_id_;
   rm.is_dead_ = is_dead;
   rm.tensor_bytes_ = tensor_bytes;
-  rm.buffer_size_ = buffer_size;
-  mu_.lock();
-  if (local_status_ == none || (buffer_size > size_ && local_status_ == idle &&
-                                remote_status_ == idle)) {
-    if ((local_status_ != none) && (buffer_size > size_)) {
-      VLOG(2) << "Extend RDMA buffer from " << size_ << " to " << buffer_size;
-    }
-    CreateCPUBuffer(buffer_size, false);
-    // Need to be received again, put into the re-recv queue and the table
-    requeue.push(key_with_step_id);
-    ReItem* item = new ReItem(send_args, recv_args, in, is_dead);
-    retable.insert(std::pair<string, ReItem*>(key_with_step_id, item));
-    mu_.unlock();
-    // no longer used: put back the key since it is not sent;
-    // ask the remote to create the same buffer
-    rm.type_ = RDMA_MESSAGE_BUFFER_REQUEST;
-    rm.remote_addr_ = reinterpret_cast<uint64_t>(buffer_);
-    rm.rkey_ = self_->rkey;
-    string message = RdmaMessage::CreateMessage(rm);
-    channel_->tx_message_buffer_->EnqueueItem(message);
-    channel_->tx_message_buffer_->SendNextItem();
-  } else if ((local_status_ == idle) && (remote_status_ == idle)) {
-    // both buffers are ready, send the tensor
-    local_status_ = busy;
-    remote_status_ = busy;
-    // local/remote_status_ won't be set back to idle
-    // unitl Write() is successful
-    mu_.unlock();
-    if (!((buffer_size == size_ && rm.data_type_ != DT_STRING) ||
-          (buffer_size <= size_ && rm.data_type_ == DT_STRING))) {
-      VLOG(2) << "Tensor and buffer size do not agree,"
-              << " buffer_size = " << size_
-              << " requested tensor size = " << buffer_size << in.DebugString();
-    }
-    uint32_t imm_data = LookupBufferIndex(key);
-    rm.type_ = RDMA_MESSAGE_TENSOR_WRITE;
-    string message = RdmaMessage::CreateMessage(rm);
-    memcpy(buffer_, message.data(), message.size());
-    if (!is_dead) {
-      // copy the tensor buffer content
-      void* output = static_cast<void*>(static_cast<char*>(buffer_) +
-                                        RdmaMessage::kTensorBufferStartIndex);
-      CHECK(tensor_bytes + RdmaMessage::kTensorBufferStartIndex <= size_);
-      if (can_memcpy) {
-        CHECK(copy != NULL) << "callback missing pointer to copy tensor";
-        CHECK(copy_buf != NULL) << "callback missing pointer to copy buffer";
-        CHECK(copy_buf->size() == tensor_bytes)
-            << "unexpected tensor size: " << copy_buf->size()
-            << " != " << tensor_bytes;
-        memcpy(output, copy_buf->data(), tensor_bytes);
-      } else {
-        CHECK(proto != NULL) << "callback missing pointer to proto tensor";
-        proto->SerializeToArray(output, tensor_bytes);
+  rm.request_index_ = rm_.request_index_;
+#ifdef RDMA_DATA_VALIDATION
+  rm.checksum_ = checksum_;
+#endif
+  RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+              << ": Sending RDMA_MESSAGE_META_DATA_UPDATE #"
+              << rm.request_index_ << ": " << rm.name_
+              << " (shape = " << rm.tensor_shape_.DebugString() << "."
+              << " data-type = " << DataTypeString(rm.data_type_) << "."
+              << " is-dead = " << rm.is_dead_ << ")";
+
+  string message = RdmaMessage::CreateMessage(rm);
+  channel_->tx_message_buffer_->EnqueueItem(message);
+  channel_->tx_message_buffer_->SendNextItem();
+}
+
+void RdmaTensorResponse::SendContent(const Tensor& in, const TensorProto& proto,
+                                     bool is_dead) {
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  size_t tensor_bytes = (can_memcpy) ? in.TotalBytes() : proto.ByteSize();
+  uint32_t imm_data = rm_.request_index_;
+  if (!is_dead) {
+    if (can_memcpy) {
+      src_buffer_ = const_cast<TensorBuffer*>(DMAHelper::buffer(&in));
+      if (src_buffer_ != nullptr) {
+        src_buffer_->Ref();  // Keep buffer alive until write is complete
+        src_addr_ = src_buffer_->data();
+        mr_ = RdmaMemoryMgr::Singleton().FindMemoryRegion(src_addr_,
+                                                          tensor_bytes);
       }
     } else {
-      buffer_size = RdmaMessage::kMessageTotalBytes;
+      RDMA_LOG(2) << "Encoding proto: " << rm_.name_
+                  << " (Size: " << tensor_bytes << ") " << in.DebugString();
+      src_addr_ = malloc(tensor_bytes);
+      mr_ = ibv_reg_mr(channel_->adapter_->pd_, src_addr_, tensor_bytes,
+                       IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+      proto.SerializeToArray(src_addr_, tensor_bytes);
     }
-    Write(imm_data, buffer_size);
   } else {
-    // Need to be received again, put into the re-recv queue and the table
-    requeue.push(key_with_step_id);
-    ReItem* item = new ReItem(send_args, recv_args, in, is_dead);
-    retable.insert(std::pair<string, ReItem*>(key_with_step_id, item));
-    mu_.unlock();
+    tensor_bytes = 0;
   }
+
+  uint32_t lkey = (mr_ == nullptr) ? 0 : mr_->lkey;
+  RDMA_LOG(1) << "Step 0x" << std::hex << rm_.step_id_ << std::dec
+              << ": Sending tensor content #" << rm_.request_index_ << " from "
+              << std::hex << src_addr_ << " (0x" << lkey << ")"
+              << " to " << rm_.remote_addr_ << " (0x" << rm_.rkey_
+              << "): " << rm_.name_ << " (size: 0x" << std::hex << tensor_bytes
+              << ")";
+
+  RdmaMessageBuffer::Write(channel_, imm_data, tensor_bytes,
+                           (uint64_t)src_addr_, lkey, rm_.remote_addr_,
+                           rm_.rkey_, RDMA_WRITE_ID_TENSOR_WRITE, this);
+}
+
+void RdmaTensorResponse::SendErrorStatus(const Status& status) {
+  RdmaMessage rm;
+  rm.type_ = RDMA_MESSAGE_ERROR_STATUS;
+  rm.name_size_ = rm_.name_.size();
+  rm.name_ = rm_.name_;
+  rm.step_id_ = rm_.step_id_;
+  rm.request_index_ = rm_.request_index_;
+  rm.status_ = status;
+  LOG(ERROR) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+             << ": Sending RDMA_MESSAGE_ERROR_STATUS #"
+             << rm.request_index_ << ": " << rm.name_
+             << ". Status: " << status.ToString();
+
+  string message = RdmaMessage::CreateMessage(rm);
+  channel_->tx_message_buffer_->EnqueueItem(message);
+  channel_->tx_message_buffer_->SendNextItem();
+
+  // Destroy the response.
+  Destroy();
+}
+
+void RdmaTensorResponse::Destroy() {
+  if (src_buffer_ != nullptr) {
+    src_buffer_->Unref();
+  }
+  if (tensor_ != nullptr) {
+    delete tensor_;
+  }
+  if (proto_ != nullptr) {
+    ibv_dereg_mr(mr_);
+    free(src_addr_);
+    delete proto_;
+  }
+  // Remove response from the pending list:
+  channel_->RemoveTensorResponse(rm_.request_index_);
 }
 
 // Create a RdmaMessage according to the pre-defined format
@@ -1276,43 +1273,46 @@ void RdmaTensorBuffer::PostCopyOperations(
 //   message in string format
 string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
   // Rdma Message format
-  // type|name_size|name|step_id|buffer_size|remote_addr|rkey|is_dead|...
-  //   1B|    2B   | 512|  8B   |    8B     |       8B  | 4B |    1B |...
-  // ...|data_type|tensor_shape|tensor_bytes|tensor_buffer
-  // ...|   XB    |    XB      |    8B      |...
+  // type|name_size|name|step_id|request_index|remote_addr|rkey|is_dead|...
+  //   1B|    2B   | 512|  8B   |     8B      |       8B  | 4B |    1B |...
+  // ...|data_type|tensor_shape|tensor_bytes|error_status          |
+  // ...|   XB    |    XB      |    8B      |size - 4B, proto - XB |
   //
-  // ACK:             type|13|"rx_ack_buffer"
-  // TENSOR_REQUEST:  type|name_size|tensor_name|step_id
-  // TENSOR_WRITE:    type|name_size|tensor_name|step_id|...|is_dead
-  //                 |data_type|tensor_shape|tensor_bytes
-  // BUFFER_IDLE:     type|name_size|buffer_name
-  // BUFFER_REQUEST:
-  // type|name_size|buffer_name|...|buffer_size|remote_addr|rkey|
-  // BUFFER_RESPONSE:
-  // type|name_size|buffer_name|...|buffer_size|remote_addr|rkey|
-  char message[kMessageTotalBytes];
+  // ACK:             Imm-type: ACK
+  // TENSOR_REQUEST:  Imm-type: MESSAGE
+  //                  Fields: type, request_index, name, step_id, remote_addr,
+  //                      rkey, is_dead, data_type, tensor_shape, tensor_bytes
+  // META_DATA_UPDATE: Imm-type: MESSAGE
+  //                  Fields: type, request_index, is_dead, data_type,
+  //                      tensor_shape, tensor_bytes
+  // TENSOR_RE_REQUST: Imm-type: MESSAGE
+  //                  Fields: type, request_index, name, step_id, remote_addr,
+  //                      rkey, is_dead, data_type, tensor_shape, tensor_bytes
+  // ERROR_STATUS:    Imm-type: MESSAGE
+  //                  Fields: type, request_index, name, step_id, error_status
+  // Tensor content:  Imm-type: request_index
+  size_t message_size = kMessageTotalBytes;
+  char message[kMessageTotalBytes + kErrorStatusMaxSize];
   // type
   message[kTypeStartIndex] = static_cast<char>(rm.type_) & 0xff;
-  // size of name
-  memcpy(&message[kNameSizeStartIndex], &rm.name_size_, sizeof(rm.name_size_));
-  // name
-  memcpy(&message[kNameStartIndex], rm.name_.data(), rm.name_.size());
-  // buffer_size, remote_addr, rkey
-  if ((rm.type_ == RDMA_MESSAGE_BUFFER_REQUEST) ||
-      (rm.type_ == RDMA_MESSAGE_BUFFER_RESPONSE)) {
-    memcpy(&message[kBufferSizeStartIndex], &rm.buffer_size_,
-           sizeof(rm.buffer_size_));
+  // request index
+  memcpy(&message[kRequestIndexStartIndex], &rm.request_index_,
+         sizeof(rm.request_index_));
+  // name, step_id, remote_addr, rkey
+  if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
+      (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
+    memcpy(&message[kNameSizeStartIndex], &rm.name_size_,
+           sizeof(rm.name_size_));
+    memcpy(&message[kNameStartIndex], rm.name_.data(), rm.name_.size());
     memcpy(&message[kRemoteAddrStartIndex], &rm.remote_addr_,
            sizeof(rm.remote_addr_));
     memcpy(&message[kRkeyStartIndex], &rm.rkey_, sizeof(rm.rkey_));
-  }
-  // step_id
-  if ((rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) ||
-      (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST)) {
     memcpy(&message[kStepIdStartIndex], &rm.step_id_, sizeof(rm.step_id_));
   }
   // is_dead, data_type, tensor_shape, tensor_bytes
-  if (rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) {
+  if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
+      (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) ||
+      (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
     memcpy(&message[kIsDeadStartIndex], &rm.is_dead_, sizeof(rm.is_dead_));
 
     memcpy(&message[kDataTypeStartIndex], &rm.data_type_,
@@ -1322,7 +1322,31 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
     memcpy(&message[kTensorBytesStartIndex], &rm.tensor_bytes_,
            sizeof(rm.tensor_bytes_));
   }
-  return string(message, kMessageTotalBytes);
+  // checksum
+#ifdef RDMA_DATA_VALIDATION
+  memcpy(&message[kChecksumStartIndex], &rm.checksum_, sizeof(rm.checksum_));
+#endif
+  // error status
+  if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
+    ::grpc::Status gs = ToGrpcStatus(rm.status_);
+    ErrorStatusProto gsProto;
+    gsProto.set_error_code(gs.error_code());
+    gsProto.set_error_message(gs.error_message());
+    gsProto.set_error_details(gs.error_details());
+    uint32_t gsProtoSize = gsProto.ByteSize();
+    if (gsProtoSize + 4 > kErrorStatusMaxSize) {
+      LOG(ERROR) << "Error status (" << gsProtoSize + 4 << " bytes) "
+                 << "is too big to fit in RDMA message ("
+                 << kErrorStatusMaxSize << " bytes). Truncated.";
+      gsProtoSize = kErrorStatusMaxSize - 4;
+    }
+    uint32_t* proto_size = (uint32_t*)&message[kErrorStatusStartIndex];
+    *proto_size = gsProtoSize;
+    gsProto.SerializeToArray(&message[kErrorStatusStartIndex + 4],
+                             gsProtoSize);
+    message_size += gsProtoSize + 4;
+  }
+  return string(message, message_size);
 }
 
 // Parse a RdmaMessage according to the pre-defined format
@@ -1335,26 +1359,24 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
   char* message = static_cast<char*>(buffer);
   // type
   rm.type_ = static_cast<RdmaMessageType>(message[kTypeStartIndex]);
-  // name_size_
-  memcpy(&rm.name_size_, &message[kNameSizeStartIndex], sizeof(rm.name_size_));
-  // name
-  rm.name_ = string(&message[kNameStartIndex], rm.name_size_);
-  // buffer_size, remote_addr, rkey
-  if ((rm.type_ == RDMA_MESSAGE_BUFFER_REQUEST) ||
-      (rm.type_ == RDMA_MESSAGE_BUFFER_RESPONSE)) {
-    memcpy(&rm.buffer_size_, &message[kBufferSizeStartIndex],
-           sizeof(rm.buffer_size_));
+  // request index
+  memcpy(&rm.request_index_, &message[kRequestIndexStartIndex],
+         sizeof(rm.request_index_));
+  // name, step_id, remote_addr, rkey
+  if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
+      (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
+    memcpy(&rm.name_size_, &message[kNameSizeStartIndex],
+           sizeof(rm.name_size_));
+    rm.name_ = string(&message[kNameStartIndex], rm.name_size_);
     memcpy(&rm.remote_addr_, &message[kRemoteAddrStartIndex],
            sizeof(rm.remote_addr_));
     memcpy(&rm.rkey_, &message[kRkeyStartIndex], sizeof(rm.rkey_));
-  }
-  // step_id
-  if ((rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) ||
-      (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST)) {
     memcpy(&rm.step_id_, &message[kStepIdStartIndex], sizeof(rm.step_id_));
   }
   // data_type, tensor_bytes, tensor_shape, is_dead
-  if (rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) {
+  if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
+      (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) ||
+      (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
     memcpy(&rm.is_dead_, &message[kIsDeadStartIndex], sizeof(rm.is_dead_));
     memcpy(&rm.data_type_, &message[kDataTypeStartIndex],
            sizeof(rm.data_type_));
@@ -1363,6 +1385,294 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
     memcpy(&rm.tensor_bytes_, &message[kTensorBytesStartIndex],
            sizeof(rm.tensor_bytes_));
   }
+  // checksum
+#ifdef RDMA_DATA_VALIDATION
+  memcpy(&rm.checksum_, &message[kChecksumStartIndex], sizeof(rm.checksum_));
+#endif
+  // error status
+  if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
+    ErrorStatusProto gsProto;
+    uint32_t gsProtoSize = *(uint32_t*)&message[kErrorStatusStartIndex];
+    CHECK(ParseProtoUnlimited(
+        &gsProto, &message[kErrorStatusStartIndex + 4], gsProtoSize))
+        << "Failed to parse error status proto from message. Aborting.";
+    ::grpc::Status gs((::grpc::StatusCode)gsProto.error_code(),
+                      gsProto.error_message(), gsProto.error_details());
+    rm.status_ = FromGrpcStatus(gs);
+  }
+}
+
+//*****************************************************************************
+// RdmaMemoryMgr
+//*****************************************************************************
+
+ibv_mr* RdmaMemoryMgr::FindMemoryRegion(void* addr, size_t length) {
+  mutex_lock l(mrs_mu_);
+  auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
+  if (iter == std::end(mrs_) || iter->get()->addr > addr) {
+    return nullptr;
+  } else {
+    return iter->get();
+  }
+}
+
+void RdmaMemoryMgr::InsertMemoryRegion(void* addr, size_t length,
+                                       const std::string& allocator_name) {
+  if (length == 0) return;
+  ibv_mr* mr = ibv_reg_mr(pd_, addr, length,
+                          IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+  RDMA_LOG(1) << "Insert memory region 0x" << std::hex << mr->rkey << ". ["
+              << addr << "-" << (void*)((uint64_t)addr + length - 1) << "]"
+              << " SIZE: 0x" << length << " (" << allocator_name << ").";
+  if (mr != nullptr) {
+    mutex_lock l(mrs_mu_);
+    auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
+    mrs_.insert(iter, {mr, &MRDeleter});
+  } else {
+    LOG(WARNING) << "Cannot register memory region";
+  }
+}
+
+void RdmaMemoryMgr::EvictMemoryRegion(void* addr, size_t length) {
+  if (length == 0) return;
+  mutex_lock l(mrs_mu_);
+  auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
+  if (iter != std::end(mrs_) && iter->get()->addr == addr) {
+    mrs_.erase(iter);
+    RDMA_LOG(1) << "Evict memory region 0x" << std::hex << iter->get()->rkey;
+
+  } else {
+    LOG(WARNING) << "Failed to de-register memory region";
+  }
+}
+
+const TensorMetaData* RdmaMemoryMgr::GetTensorMetaData(
+    const std::string& tensor_name) {
+  mutex_lock l(tensor_meta_data_mu_);
+  auto it = tensors_meta_data_.find(tensor_name);
+  if (it == tensors_meta_data_.end()) {
+    return nullptr;
+  }
+  return &it->second;
+}
+
+const TensorMetaData* RdmaMemoryMgr::SetTensorMetaData(
+    const std::string& tensor_name, DataType dtype, const TensorShape& shape,
+    bool is_dead, size_t proto_size) {
+  mutex_lock l(tensor_meta_data_mu_);
+  TensorMetaData& meta_data = tensors_meta_data_[tensor_name];
+  meta_data.data_type_ = dtype;
+  meta_data.tensor_shape_ = shape;
+  meta_data.proto_size_ = proto_size;
+  meta_data.is_dead_ = is_dead;
+  return &meta_data;
+}
+
+//*****************************************************************************
+// RdmaTensorRequest
+//*****************************************************************************
+
+RdmaTensorRequest::RdmaTensorRequest(
+    uint32_t index, const string& key, int64 step_id, RdmaChannel* channel,
+    Device* dst_dev, const Rendezvous::Args recv_args,
+    const RdmaTensorRequest::RecvDoneCallback& done)
+    : index_(index),
+      key_(key),
+      step_id_(step_id),
+      channel_(channel),
+      dst_dev_(dst_dev),
+      recv_args_(recv_args),
+      meta_data_(RdmaMemoryMgr::Singleton().GetTensorMetaData(key)),
+      result_tensor_(nullptr),
+      proxy_tensor_(nullptr),
+      rdma_addr_(nullptr),
+      mr_(nullptr),
+      done_(done) {}
+
+RdmaTensorRequest::~RdmaTensorRequest() { DeallocateTensors(); }
+
+void RdmaTensorRequest::Done(const Status& s) {
+  Tensor val = std::move(*result_tensor_);
+
+#ifdef RDMA_DATA_VALIDATION
+  // Validate checksum
+  // Unfortunately we can't always do a Checksum directly on the result tensor.
+  // If the result tensor is on GPU, then we need to copy it back to CPU. If
+  // we happen to be in the midst of a proxy callback, then the copying will
+  // get stuck.
+  uint64_t checksum = (proxy_tensor_ != nullptr)
+                          ? Checksum(nullptr, nullptr, *proxy_tensor_)
+                          : Checksum(dst_dev_, recv_args_.device_context, val);
+  ValidateChecksum(checksum_, checksum, val, index_, key_, "RDMA");
+#endif
+
+  Rendezvous::Args recv_args = std::move(recv_args_);
+  bool is_dead = (meta_data_ == nullptr) ? false : meta_data_->is_dead_;
+  RecvDoneCallback done = done_;
+  DeallocateTensors();
+  channel_->RemoveTensorRequest(index_);
+  done(s, Rendezvous::Args(), recv_args, val, is_dead);
+}
+
+void RdmaTensorRequest::DeallocateTensors() {
+  if (result_tensor_ != nullptr) {
+    delete result_tensor_;
+    result_tensor_ = nullptr;
+  }
+  if (proxy_tensor_ != nullptr) {
+    delete proxy_tensor_;
+    proxy_tensor_ = nullptr;
+  }
+}
+
+bool RdmaTensorRequest::AllocateTensors() {
+  result_tensor_ =
+      new Tensor(dst_dev_->GetAllocator(recv_args_.alloc_attrs),
+                 meta_data_->data_type_, meta_data_->tensor_shape_);
+
+  size_t tensor_size = result_tensor_->TotalBytes();
+  bool can_memcpy = DataTypeCanUseMemcpy(result_tensor_->dtype());
+  if (can_memcpy) {
+    if (tensor_size == 0) {
+      return true;
+    }
+    rdma_addr_ = DMAHelper::base(result_tensor_);
+    mr_ = RdmaMemoryMgr::Singleton().FindMemoryRegion(rdma_addr_, tensor_size);
+#if GOOGLE_CUDA
+    if (mr_ == nullptr) {
+      // Can't RDMA directly to result. Use a proxy.
+      proxy_tensor_ =
+          new Tensor(ProcessState::singleton()->GetCUDAHostAllocator(0),
+                     result_tensor_->dtype(), result_tensor_->shape());
+      rdma_addr_ = DMAHelper::base(proxy_tensor_);
+      mr_ =
+          RdmaMemoryMgr::Singleton().FindMemoryRegion(rdma_addr_, tensor_size);
+    }
+#endif
+  } else {
+    uint32_t proto_size = meta_data_->proto_size_;
+    rdma_addr_ = malloc(proto_size);
+    mr_ = ibv_reg_mr(RdmaMemoryMgr::Singleton().pd_, rdma_addr_, proto_size,
+                     IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+  }
+  CHECK(mr_ != nullptr) << " No memory region found for address " << rdma_addr_
+                        << ": " << key_;
+  return true;
+}
+
+void RdmaTensorRequest::AllocateTensorsAsync(StatusCallback done) {
+  AllocateTensors();
+  bool on_host = recv_args_.alloc_attrs.on_host();
+  if (dst_dev_->tensorflow_gpu_device_info() && !on_host &&
+      (proxy_tensor_ == nullptr)) {
+#if GOOGLE_CUDA
+        // We need to sync the memory allocation on the GPU:
+        StreamGPUOp(dst_dev_, recv_args_.device_context, done);
+#endif
+  } else {
+    done(Status::OK());
+  }
+}
+
+void RdmaTensorRequest::Send(RdmaMessageType message_type) {
+  RdmaMessageBuffer* rb = channel_->tx_message_buffer_;
+  RdmaMessage rm;
+  rm.type_ = message_type;
+  rm.request_index_ = index_;
+  rm.name_size_ = key_.size();
+  rm.name_ = key_;
+  rm.step_id_ = step_id_;
+  rm.remote_addr_ = (uint64_t)rdma_addr_;
+  if (meta_data_ != nullptr) {
+    rm.data_type_ = meta_data_->data_type_;
+    rm.tensor_shape_ = meta_data_->tensor_shape_;
+    rm.is_dead_ = meta_data_->is_dead_;
+    rm.tensor_bytes_ = meta_data_->proto_size_;
+  } else {
+    rm.data_type_ = DT_INVALID;
+  }
+  rm.rkey_ = (mr_ == nullptr) ? 0 : mr_->rkey;
+
+  RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+              << ": Sending  " << MessageTypeToString(message_type)
+              << " #" << index_ << ": "
+              << rm.name_ << " on " << rdma_addr_
+              << " (rkey: 0x" << std::hex << rm.rkey_ << ")";
+
+  string message = RdmaMessage::CreateMessage(rm);
+  rb->EnqueueItem(message);
+  rb->SendNextItem();
+}
+
+void RdmaTensorRequest::RecvTensorMetaData(DataType dtype, TensorShape shape,
+                                           bool is_dead, size_t proto_size) {
+  meta_data_ = RdmaMemoryMgr::Singleton().SetTensorMetaData(
+      key_, dtype, shape, is_dead, proto_size);
+
+  DeallocateTensors();
+  AllocateTensorsAsync([this](const Status& s) {
+    Send(RDMA_MESSAGE_TENSOR_RE_REQUEST);
+  });
+}
+
+void RdmaTensorRequest::RecvTensorContent() {
+  bool can_memcpy = DataTypeCanUseMemcpy(meta_data_->data_type_);
+  size_t message_size =
+      can_memcpy ? result_tensor_->TotalBytes() : meta_data_->proto_size_;
+  RDMA_LOG(1) << "Step 0x" << std::hex << step_id_ << std::dec
+              << ": Received tensor content #" << index_ << ": "
+              << key_ << " (Size: 0x" << std::hex << message_size << ")";
+
+  Tensor val;
+
+#if GOOGLE_CUDA
+  if (proxy_tensor_ != nullptr) {
+    CountCopies(key_, (void*)DMAHelper::base(proxy_tensor_),
+                (void*)DMAHelper::base(result_tensor_),
+                result_tensor_->TotalBytes(), false);
+    GPUUtil::CopyCPUTensorToGPU(proxy_tensor_, recv_args_.device_context,
+                                dst_dev_, result_tensor_,
+                                [this](const Status& s) {
+                                  CHECK(s.ok()) << "copy tensor to gpu sync";
+                                  Done(s);
+                                });
+    return;
+  }
+#endif
+
+  if (can_memcpy) {
+    Done(Status::OK());
+  } else {
+    RDMA_LOG(2) << "Decoding proto: " << key_
+                << " (Size: " << meta_data_->proto_size_ << ")";
+    TensorProto proto;
+    CHECK(ParseProtoUnlimited(&proto, rdma_addr_, meta_data_->proto_size_))
+        << "fail to parse proto from array";
+    ibv_dereg_mr(mr_);
+    free(rdma_addr_);
+    Status s = dst_dev_->MakeTensorFromProto(proto, recv_args_.alloc_attrs,
+                                             result_tensor_);
+    Done(s);
+  }
+}
+
+void RdmaTensorRequest::RecvErrorStatus(const Status& status) {
+  if (result_tensor_ == nullptr) {
+    result_tensor_ = new Tensor();
+  }
+  LOG(ERROR) << "Received RDMA_MESSAGE_ERROR_STATUS: " << status.ToString();
+  Done(status);
+}
+
+void RdmaTensorRequest::Start() {
+  meta_data_ = RdmaMemoryMgr::Singleton().GetTensorMetaData(key_);
+  if (meta_data_ != nullptr) {
+    AllocateTensorsAsync([this](const Status& s) {
+      Send(RDMA_MESSAGE_TENSOR_REQUEST);
+    });
+  } else {
+    Send(RDMA_MESSAGE_TENSOR_REQUEST);
+  }
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index fea2327d77..68b3d59f56 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/contrib/verbs/verbs_util.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -43,6 +44,11 @@ namespace tensorflow {
 #define SL_DEFAULT 0
 #define TRAFFIC_CLASS 0
 
+#define RDMA_LOG_0 LOG(INFO)
+#define RDMA_LOG_1 VLOG(1)
+#define RDMA_LOG_2 VLOG(2)
+#define RDMA_LOG(LEVEL) RDMA_LOG_##LEVEL
+
 struct RdmaParams {
   uint8_t port_num;
   uint8_t sgid_index;
@@ -76,29 +82,303 @@ enum Location {
   local,
   remote
 };
-enum BufferType {
-  ACK,
-  MESSAGE,
-  TENSOR
-};
+
 enum RdmaMessageType {
-  RDMA_MESSAGE_ACK,
-  RDMA_MESSAGE_BUFFER_IDLE,
-  RDMA_MESSAGE_BUFFER_REQUEST,
-  RDMA_MESSAGE_BUFFER_RESPONSE,
+  RDMA_MESSAGE_META_DATA_UPDATE,
+  RDMA_MESSAGE_TENSOR_RE_REQUEST,
   RDMA_MESSAGE_TENSOR_REQUEST,
-  RDMA_MESSAGE_TENSOR_WRITE
+  RDMA_MESSAGE_ERROR_STATUS,
+};
+
+struct RdmaMessage {
+  RdmaMessageType type_;
+  uint16_t name_size_;
+  string name_;
+  int64 step_id_;
+  uint64_t request_index_;
+  union {
+    uint64_t remote_addr_;
+#ifdef RDMA_DATA_VALIDATION
+    uint64_t checksum_;
+#endif
+  };
+  uint32_t rkey_;
+  bool is_dead_;
+  DataType data_type_;
+  TensorShape tensor_shape_;
+  size_t tensor_bytes_;
+
+  // For error status:
+  Status status_;
+
+  // type|name_size|name|step_id|request_index|remote_addr/checksum|rkey|...
+  //   1B|    2B   | 512|  8B   |     8B      |       8B           | 4B |...
+  // ...|is_dead|data_type|tensor_shape|tensor_bytes|error_status          |
+  // ...|    1B |   XB    |    XB      |    8B      |size - 4B, proto - XB |
+  static const size_t kNameCapacity = 512;
+  static const size_t kTypeStartIndex = 0;
+  static const size_t kNameSizeStartIndex = kTypeStartIndex + sizeof(type_);
+  static const size_t kNameStartIndex =
+      kNameSizeStartIndex + sizeof(name_size_);
+  static const size_t kStepIdStartIndex = kNameStartIndex + kNameCapacity;
+  static const size_t kRequestIndexStartIndex =
+      kStepIdStartIndex + sizeof(step_id_);
+  static const size_t kRemoteAddrStartIndex =
+      kRequestIndexStartIndex + sizeof(request_index_);
+  static const size_t kChecksumStartIndex = kRemoteAddrStartIndex;
+  static const size_t kRkeyStartIndex =
+      kRemoteAddrStartIndex + sizeof(remote_addr_);
+  static const size_t kIsDeadStartIndex = kRkeyStartIndex + sizeof(rkey_);
+  static const size_t kDataTypeStartIndex =
+      kIsDeadStartIndex + sizeof(is_dead_);
+  static const size_t kTensorShapeStartIndex =
+      kDataTypeStartIndex + sizeof(data_type_);
+  static const size_t kTensorBytesStartIndex =
+      kTensorShapeStartIndex + sizeof(TensorShape);
+  static const size_t kErrorStatusStartIndex =
+      kTensorBytesStartIndex + sizeof(tensor_bytes_);
+  static const size_t kErrorStatusMaxSize = 4096;
+
+  static const size_t kMessageTotalBytes = kErrorStatusStartIndex;
+  static const size_t kRdmaMessageBufferSize =
+      kMessageTotalBytes + kErrorStatusMaxSize;
+  static string CreateMessage(const RdmaMessage& rm);
+  static void ParseMessage(RdmaMessage& rm, void* buffer);
+};
+
+// Immediate types for RDMA write
+enum RdmaImmDataType {
+  RDMA_IMM_MAX_REQUEST_ID = 0xFFFFFFFD,
+  RDMA_IMM_DATA_ACK = 0xFFFFFFFE,
+  RDMA_IMM_DATA_MESSAGE = 0xFFFFFFFF
+};
+
+// Write types for RDMA write-complete events
+enum RdmaWriteIDType {
+  RDMA_WRITE_ID_ACK,
+  RDMA_WRITE_ID_MESSAGE,
+  RDMA_WRITE_ID_TENSOR_WRITE
+};
+
+// Context for RDMA write-complete events
+class RdmaWriteID {
+ public:
+  RdmaWriteID(RdmaWriteIDType write_type, void* write_context)
+      : write_type(write_type), write_context(write_context) {}
+
+  RdmaWriteIDType write_type;
+  void* write_context;
+};
+
+// Tensor meta-data
+class TensorMetaData {
+ public:
+  TensorShape tensor_shape_;
+  DataType data_type_;
+  size_t proto_size_;
+  bool is_dead_;
+
+  std::ostream& print(std::ostream& out) const {
+    out << "Dtype = " << DataTypeString(data_type_)
+        << ", Shape = " << tensor_shape_.DebugString() << ", Proto size = 0x"
+        << std::hex << proto_size_ << ", Is dead = " << is_dead_;
+    return out;
+  }
+};
+
+inline std::ostream& operator<<(std::ostream& out,
+                                const TensorMetaData& meta_data) {
+  return meta_data.print(out);
+}
+
+class RdmaChannel;
+
+void MRDeleter(ibv_mr* mr);
+using MemoryRegionPtr = std::unique_ptr<ibv_mr, decltype(&MRDeleter)>;
+
+// RdmaMemoryMgr
+// Manages the local meta-data cache, and the registered RDMA memory regions.
+class RdmaMemoryMgr {
+ public:
+  static RdmaMemoryMgr& Singleton() {
+    static RdmaMemoryMgr instance;
+    return instance;
+  }
+
+  // Memory regions
+  ibv_mr* FindMemoryRegion(void* addr, size_t length);
+  void InsertMemoryRegion(void* addr, size_t length,
+                          const std::string& allocator_name);
+  void EvictMemoryRegion(void* addr, size_t length);
+
+  // Tensor meta-data cache
+  const TensorMetaData* GetTensorMetaData(const std::string& tensor_name);
+  const TensorMetaData* SetTensorMetaData(const std::string& tensor_name,
+                                          DataType dtype,
+                                          const TensorShape& shape,
+                                          bool is_dead, size_t proto_size);
+
+  struct ibv_pd* pd_;
+
+ protected:
+  RdmaMemoryMgr() : pd_(nullptr) {}
+
+  static bool Comparator(const void* ptr, const MemoryRegionPtr& other) {
+    return ptr < reinterpret_cast<char*>(other->addr) + other->length;
+  }
+
+ private:
+  mutex tensor_meta_data_mu_;
+  std::unordered_map<std::string, TensorMetaData> tensors_meta_data_;
+
+  // Managed memory regions
+  mutex mrs_mu_;
+  std::vector<MemoryRegionPtr> mrs_ GUARDED_BY(mrs_mu_);
 };
-class RdmaBuffer;
+
+// RdmaTensorRequest
+// Represents a single tensor request.
+class RdmaTensorRequest {
+ public:
+  typedef Rendezvous::DoneCallback RecvDoneCallback;
+
+  // Creates a tensor request identified by index.
+  RdmaTensorRequest(uint32_t index, const string& key, int64 step_id,
+                    RdmaChannel* channel, Device* dst_dev,
+                    const Rendezvous::Args recv_args,
+                    const RecvDoneCallback& done);
+  ~RdmaTensorRequest();
+
+  // Request unique index.
+  uint32_t index() { return index_; }
+
+  // Start the tensor request sequence.
+  //
+  // 1. Allocate the result tensor (and proxy tensor if required).
+  // 2. Send RDMA_MESSAGE_TENSOR_REQUEST to the remote side.
+  void Start();
+
+  // Receive tensor meta-data.
+  //
+  // 1. Update the local meta-data cache.
+  // 2. Reallocate the result tensor (and proxy tensor if required).
+  // 3. Re-send the request to the remote side.
+  void RecvTensorMetaData(DataType dtype, TensorShape shape, bool is_dead,
+                          size_t proto_size);
+
+  // Receive tensor content (RDMA write was completed).
+  //
+  // Decode proto if required and/or move to GPU if the content was not
+  // written to it directly (GPU direct is not avaliable). Afterwards,
+  // invoke Done().
+  void RecvTensorContent();
+
+  // Receive error status (in case of a remote error).
+  // Invoke Done() with the status code.
+  void RecvErrorStatus(const Status& status);
+
+#ifdef RDMA_DATA_VALIDATION
+  // Receive tensor checksum
+  //
+  // For validation: Get and store the Tensor's expected checksum for the
+  // current request. Compare the result Tensor's checksum with the stored
+  // checksum right before invoking Done().
+  void RecvTensorChecksum(uint64_t checksum) { checksum_ = checksum; }
+#endif
+
+ private:
+  void Done(const Status& s);
+  void Send(RdmaMessageType message_type);
+  bool AllocateTensors();
+  void AllocateTensorsAsync(StatusCallback done);
+  void DeallocateTensors();
+
+  uint32_t index_;
+  string key_;
+  int64 step_id_;
+  RdmaChannel* channel_;
+  Device* dst_dev_;
+  Rendezvous::Args recv_args_;
+  const TensorMetaData* meta_data_;
+  Tensor* result_tensor_;
+  Tensor* proxy_tensor_;
+  void* rdma_addr_;
+  ibv_mr* mr_;
+  RecvDoneCallback done_;
+#ifdef RDMA_DATA_VALIDATION
+  uint64_t checksum_;
+#endif
+};
+
+// RdmaTensorResponse
+// Represents a single tensor response.
+class RdmaTensorResponse {
+ public:
+  // Creates a response for request message.
+  RdmaTensorResponse(RdmaChannel* channel, const RdmaMessage& rm)
+      : channel_(channel), rm_(rm) {}
+
+  void Update(const RdmaMessage& rm) { rm_ = rm; }
+
+  // Start the tensor response sequence.
+  //
+  // 1. Find the tensor in the local tag-match table and invoke RecvHandler.
+  //    (Using RecvLocalAsync()).
+  // 2. Compare the tensor's meta-data to the meta-data in the message (taken
+  //    from the requester's local cache).
+  //    If meta-data changed:
+  //    a. Clone the tensor to be sent later.
+  //    b. Send a meta-data update message and wait for re-request.
+  //    Else:
+  //    a. Send the tensor's content (using direct RDMA write).
+  void Start();
+
+  // Resume the response sequence, after a re-request.
+  //
+  // 1. Send the tensor's content that was cloned earlier.
+  void Resume();
+
+  // Destroy the response's resources and remove it from the pending list.
+  void Destroy();
+
+ private:
+  void RecvHandler(Rendezvous::ParsedKey parsed,
+                   const Rendezvous::Args& send_args,
+                   const Rendezvous::Args& recv_args, const Tensor& in,
+                   bool is_dead);
+  void Clone(const Tensor& in, const TensorProto& proto, bool is_dead);
+  void Send(const Tensor& in, const TensorProto& proto, bool is_dead,
+            const Status& status);
+  bool TensorMetaDataChanged(const Tensor& in, bool is_dead);
+  Status PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
+                           Device** src_dev);
+  void SendMetaData(const Tensor& in, const TensorProto& proto, bool is_dead);
+  void SendContent(const Tensor& in, const TensorProto& proto, bool is_dead);
+  void SendErrorStatus(const Status& status);
+
+  RdmaChannel* channel_;
+  RdmaMessage rm_;  // The request message
+  Device* src_dev_ = nullptr;
+  TensorBuffer* src_buffer_ = nullptr;
+  void* src_addr_ = nullptr;
+  ibv_mr* mr_ = nullptr;
+  uint64_t checksum_ = 0;
+  bool meta_data_changed_ = false;
+
+  // Re-item:
+  TensorProto* proto_ = nullptr;
+  Tensor* tensor_ = nullptr;
+  bool is_dead_ = false;
+};
+
+class RdmaMessageBuffer;
 // Class that represents the Rdma Adapter.
 // Responsible for creation of the completion queue, and handling
 // of work completions.
 class RdmaAdapter {
   friend class RdmaChannel;
-  friend class RdmaBuffer;
-  friend class RdmaAckBuffer;
   friend class RdmaMessageBuffer;
-  friend class RdmaTensorBuffer;
+  friend class RdmaTensorResponse;
   friend class RdmaMgr;
   friend class RdmaRemoteRendezvous;
 
@@ -133,10 +413,10 @@ class RdmaAdapter {
 // Responsible for connecting queue pairs.
 class RdmaChannel {
   friend class RdmaAdapter;
-  friend class RdmaBuffer;
-  friend class RdmaAckBuffer;
   friend class RdmaMessageBuffer;
   friend class RdmaTensorBuffer;
+  friend class RdmaTensorRequest;
+  friend class RdmaTensorResponse;
   friend class RdmaMgr;
   friend class RdmaRemoteRendezvous;
 
@@ -146,22 +426,28 @@ class RdmaChannel {
   ~RdmaChannel();
   inline const RdmaAddress& self() { return self_; }
   RdmaAddress address() const;
-  inline const std::vector<RdmaBuffer*>& message_buffers() const {
+  inline const std::vector<RdmaMessageBuffer*>& message_buffers() const {
     return message_buffers_;
   }
   void Connect(const RdmaAddress& remoteAddr);
   void Connect();
   void Recv();
-  RdmaBuffer* FindBuffer(const uint32_t index);
-  RdmaBuffer* FindBuffer(const string& name);
-  RdmaBuffer* FindOrCreateBuffer(const string& name,
-                                 BufferType buffer_type = TENSOR);
-  uint32_t LookupBufferIndex(const string& buffer_name);
   void SetRemoteAddress(const RdmaAddress& ra, bool override);
-  void InsertRecvCallback(const string& key, std::function<void()> recv_done);
-  void RemoveRecvCallback(const string& key);
-  void RunRecvCallback(const string& key);
-  static const int kNumMessageBuffers = 4;
+
+  // Requests:
+  RdmaTensorRequest* InsertTensorRequest(
+      const string& key, int64 step_id, Device* dst_dev,
+      const Rendezvous::Args recv_args,
+      const RdmaTensorRequest::RecvDoneCallback& done);
+  void RemoveTensorRequest(uint32_t request_index);
+  RdmaTensorRequest* GetTensorRequest(uint32_t request_index);
+
+  // Responses:
+  RdmaTensorResponse* AddTensorResponse(const RdmaMessage& rm);
+  RdmaTensorResponse* UpdateTensorResponse(const RdmaMessage& rm);
+  void RemoveTensorResponse(uint32_t request_index);
+
+  static const int kNumMessageBuffers = 2;
   static const int kPingRecvWrid = 0;
 
  private:
@@ -179,36 +465,31 @@ class RdmaChannel {
   string remote_name_;
   ibv_qp* qp_;
   mutex mu_;
-  bool connected_ GUARDED_BY(bt_mu_) = false;
-  RdmaAddress remote_ GUARDED_BY(bt_mu_);
-  bool remote_set_ GUARDED_BY(bt_mu_) = false;
+  bool connected_ GUARDED_BY(mu_) = false;
+  RdmaAddress remote_ GUARDED_BY(mu_);
+  bool remote_set_ GUARDED_BY(mu_) = false;
   mutex ct_mu_;
-  typedef std::unordered_map<string, std::function<void()> > CallbackTable;
-  CallbackTable callback_table_ GUARDED_BY(ct_mu_);
-  mutex bt_mu_;
-  typedef std::unordered_map<unsigned int, RdmaBuffer*> BufferTable;
-  BufferTable buffer_table_ GUARDED_BY(bt_mu_);
-  typedef std::unordered_map<uint32_t, string> BufferIndexNameTable;
-  BufferIndexNameTable buffer_index_name_table_ GUARDED_BY(bt_mu_);
-  typedef std::unordered_map<string, uint32_t> BufferNameIndexTable;
-  BufferNameIndexTable buffer_name_index_table_ GUARDED_BY(bt_mu_);
-  RdmaBuffer* tx_message_buffer_;
-  RdmaBuffer* rx_message_buffer_;
-  RdmaBuffer* tx_ack_buffer_;
-  RdmaBuffer* rx_ack_buffer_;
-  std::vector<RdmaBuffer*> message_buffers_;
+  typedef std::unordered_map<uint32_t, RdmaTensorRequest> RequestTable;
+  RequestTable request_table_ GUARDED_BY(ct_mu_);
+  uint32_t request_serial_ GUARDED_BY(ct_mu_);
+  mutex responses_mu_;
+  typedef std::unordered_map<uint32_t, RdmaTensorResponse> ResponsesTable;
+  ResponsesTable responses_table_ GUARDED_BY(responses_mu_);
+  RdmaMessageBuffer* tx_message_buffer_;
+  RdmaMessageBuffer* rx_message_buffer_;
+  std::vector<RdmaMessageBuffer*> message_buffers_;
 };
 
-// Class that represents a buffer for Rdma writes and reads.
-class RdmaBuffer {
+// Class that represents a buffer for Rdma message sending.
+class RdmaMessageBuffer {
   friend class RdmaChannel;
   friend class RdmaAdapter;
   friend class RdmaMgr;
   friend class RdmaRemoteRendezvous;
 
  public:
-  explicit RdmaBuffer(RdmaChannel* channel, string name);
-  virtual ~RdmaBuffer();
+  explicit RdmaMessageBuffer(RdmaChannel* channel, string name);
+  ~RdmaMessageBuffer();
 
   inline void* buffer() const { return buffer_; }
   inline ibv_mr* self() const { return self_; }
@@ -223,13 +504,15 @@ class RdmaBuffer {
   }
   void FreeBuffer();
   void EnqueueItem(string Item);
-  virtual void SendNextItem() {};
+  void SendNextItem();
   void CreateCPUBuffer(size_t size, bool lock = true);
   void SetRemoteMR(RemoteMR rmi, bool override);
-  uint32_t LookupBufferIndex(const string& buffer_name) {
-    return const_cast<RdmaChannel*>(channel_)->LookupBufferIndex(buffer_name);
-  }
   void Write(uint32_t imm_data, size_t buffer_size);
+  static void Write(const RdmaChannel* channel, uint32_t imm_data,
+                    size_t buffer_size, uint64_t src_addr, uint32_t lkey,
+                    uint64_t remote_addr, uint32_t rkey,
+                    RdmaWriteIDType write_type, void* write_context);
+  static void SendAck(const RdmaChannel* channel);
 
  protected:
   const RdmaChannel* channel_;
@@ -245,125 +528,6 @@ class RdmaBuffer {
   BufferStatus remote_status_ GUARDED_BY(mu_) = none;
 };
 
-class RdmaAckBuffer : public RdmaBuffer {
- public:
-  explicit RdmaAckBuffer(RdmaChannel* channel, string name);
-  virtual ~RdmaAckBuffer() override {}
-  void SendNextItem() override;
-};
-
-class RdmaMessageBuffer : public RdmaBuffer {
-  friend class RdmaChannel;
-  friend class RdmaAapater;
-
- public:
-  explicit RdmaMessageBuffer(RdmaChannel* channel, string name);
-  virtual ~RdmaMessageBuffer() override {}
-  void SendNextItem() override;
-};
-
-class RdmaTensorBuffer : public RdmaBuffer {
- public:
-  explicit RdmaTensorBuffer(RdmaChannel* channel, string name);
-  virtual ~RdmaTensorBuffer() override;
-  void SendNextItem() override;
-  void PostCopyOperations(bool can_memcpy, size_t buffer_size,
-                          size_t tensor_bytes, const string& key,
-                          const Tensor& in, int64 step_id, bool is_dead,
-                          const string& key_with_step_id, const Tensor* copy,
-                          const TensorProto* proto, const StringPiece* copy_buf,
-                          const Rendezvous::Args& send_args,
-                          const Rendezvous::Args& recv_args);
-
-  void ReSendNextItem();
-
- private:
-  Rendezvous::DoneCallback getRecvTensorCallback(
-      const string& key_with_step_id, const string& key, int64 step_id,
-      const Rendezvous::ParsedKey& parsed);
-
-  struct ReItem {
-    Rendezvous::Args send_args;
-    Rendezvous::Args recv_args;
-    Tensor in;
-    bool is_dead;
-
-    ReItem(const Rendezvous::Args& send_args_,
-           const Rendezvous::Args& recv_args_, const Tensor& in_, bool is_dead_)
-        : send_args(send_args_),
-          recv_args(recv_args_),
-          in(in_),
-          is_dead(is_dead_) {
-      if (send_args.device_context) {
-        send_args.device_context->Ref();
-      }
-      if (recv_args.device_context) {
-        recv_args.device_context->Ref();
-      }
-    }
-
-    ~ReItem() {
-      if (send_args.device_context) {
-        send_args.device_context->Unref();
-      }
-      if (recv_args.device_context) {
-        recv_args.device_context->Unref();
-      }
-    }
-  };
-  typedef std::map<string, ReItem*> Table;
-  typedef Table::iterator Itable;
-
-  std::queue<string> requeue GUARDED_BY(mu_);
-  Table retable GUARDED_BY(mu_);
-};
-
-struct RdmaMessage {
-  RdmaMessageType type_;
-  uint16_t name_size_;
-  string name_;
-  int64 step_id_;
-  uint64_t buffer_size_;
-  uint64_t remote_addr_;
-  uint32_t rkey_;
-  bool is_dead_;
-  DataType data_type_;
-  TensorShape tensor_shape_;
-  size_t tensor_bytes_;
-
-  // type|name_size|name|step_id|buffer_size|remote_addr|rkey|is_dead|...
-  //   1B|    2B   | 512|  8B   |    8B     |       8B  | 4B |    1B |...
-  // ...|data_type|tensor_shape|tensor_bytes|tensor_buffer
-  // ...|   XB    |    XB      |    8B      |...
-  //
-  static const size_t kNameCapacity = 512;
-  static const size_t kTypeStartIndex = 0;
-  static const size_t kNameSizeStartIndex = kTypeStartIndex + sizeof(type_);
-  static const size_t kNameStartIndex =
-      kNameSizeStartIndex + sizeof(name_size_);
-  static const size_t kStepIdStartIndex = kNameStartIndex + kNameCapacity;
-  static const size_t kBufferSizeStartIndex =
-      kStepIdStartIndex + sizeof(step_id_);
-  static const size_t kRemoteAddrStartIndex =
-      kBufferSizeStartIndex + sizeof(buffer_size_);
-  static const size_t kRkeyStartIndex =
-      kRemoteAddrStartIndex + sizeof(remote_addr_);
-  static const size_t kIsDeadStartIndex = kRkeyStartIndex + sizeof(rkey_);
-  static const size_t kDataTypeStartIndex =
-      kIsDeadStartIndex + sizeof(is_dead_);
-  static const size_t kTensorShapeStartIndex =
-      kDataTypeStartIndex + sizeof(data_type_);
-  static const size_t kTensorBytesStartIndex =
-      kTensorShapeStartIndex + sizeof(TensorShape);
-  static const size_t kTensorBufferStartIndex =
-      kTensorBytesStartIndex + sizeof(tensor_bytes_);
-  static const size_t kMessageTotalBytes = kTensorBufferStartIndex;
-  static const size_t kRdmaMessageBufferSize = kMessageTotalBytes;
-  static const size_t kRdmaAckBufferSize = kMessageTotalBytes;
-  static string CreateMessage(const RdmaMessage& rm);
-  static void ParseMessage(RdmaMessage& rm, void* buffer);
-};
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_VERBS
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 9cb307bcfa..f3644af0b4 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -16,11 +16,16 @@ limitations under the License.
 #ifdef TENSORFLOW_USE_VERBS
 
 #include "tensorflow/contrib/verbs/rdma_mgr.h"
+#include <fstream>
 #include <vector>
 #include "tensorflow/contrib/verbs/grpc_verbs_client.h"
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
+#include "tensorflow/core/common_runtime/bfc_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -53,7 +58,7 @@ RdmaMgr::RdmaMgr(const WorkerEnv* const worker_env,
 void RdmaMgr::SetupChannels() {
   for (const auto& p : channel_table_) {
     string worker_name = p.first;
-    LOG(INFO) << "connecting to remote node " << worker_name;
+    RDMA_LOG(2) << "Connecting to remote node " << worker_name;
     RdmaChannel* rc = p.second;
     GetRemoteAddressRequest req;
     GetRemoteAddressResponse resp;
@@ -78,39 +83,49 @@ void RdmaMgr::SetupChannels() {
       mr->set_rkey(rc->message_buffers_[i]->self_->rkey);
     }
     // synchronous call
-    Status s = client->GetRemoteAddress(&req, &resp);
-    // save obtained remote addresses
-    // connect to the remote channel
-    if (s.ok()) {
-      CHECK(worker_name.compare(resp.host_name()) == 0);
-      RdmaAddress ra;
-      ra.lid = resp.channel().lid();
-      ra.qpn = resp.channel().qpn();
-      ra.psn = resp.channel().psn();
-      ra.snp = resp.channel().snp();
-      ra.iid = resp.channel().iid();
-      rc->SetRemoteAddress(ra, false);
-      rc->Connect();
-      int i = 0;
-      int idx[] = {1, 0, 3, 2};
-      for (const auto& mr : resp.mr()) {
-        // the connections are crossed, i.e.
-        // local tx_message_buffer <---> remote rx_message_buffer_
-        // local rx_message_buffer <---> remote tx_message_buffer_
-        // local tx_ack_buffer <---> remote rx_ack_buffer_
-        // local rx_ack_buffer <---> remote tx_ack_buffer_
-        // hence idx[] = {1, 0, 3, 2}.
-        RdmaBuffer* rb = rc->message_buffers_[idx[i]];
-        RemoteMR rmr;
-        rmr.remote_addr = mr.remote_addr();
-        rmr.rkey = mr.rkey();
-        rb->SetRemoteMR(rmr, false);
-        i++;
+    Status s;
+    int attempts = 0;
+    static const int max_num_attempts = 5;
+    do {
+      s = client->GetRemoteAddress(&req, &resp);
+      // save obtained remote addresses
+      // connect to the remote channel
+      if (s.ok()) {
+        CHECK(worker_name.compare(resp.host_name()) == 0);
+        RdmaAddress ra;
+        ra.lid = resp.channel().lid();
+        ra.qpn = resp.channel().qpn();
+        ra.psn = resp.channel().psn();
+        ra.snp = resp.channel().snp();
+        ra.iid = resp.channel().iid();
+        rc->SetRemoteAddress(ra, false);
+        rc->Connect();
+        int i = 0;
+        int idx[] = {1, 0};
+        for (const auto& mr : resp.mr()) {
+          // the connections are crossed, i.e.
+          // local tx_message_buffer <---> remote rx_message_buffer_
+          // local rx_message_buffer <---> remote tx_message_buffer_
+          // hence idx[] = {1, 0}.
+          RdmaMessageBuffer* rb = rc->message_buffers_[idx[i]];
+          RemoteMR rmr;
+          rmr.remote_addr = mr.remote_addr();
+          rmr.rkey = mr.rkey();
+          rb->SetRemoteMR(rmr, false);
+          i++;
+        }
+        CHECK(i == RdmaChannel::kNumMessageBuffers);
+      } else {
+        LOG(ERROR) << "Connecting to " << worker_name
+                   << ": Got " << s.error_message() << ". Retrying ("
+                   << (attempts + 1) << "/" << max_num_attempts << ")..." ;
+        if (++attempts == max_num_attempts) {
+          break;
+        }
+        worker_env_->env->SleepForMicroseconds(2000000);
       }
-      CHECK(i == RdmaChannel::kNumMessageBuffers);
-    } else {
-      LOG(ERROR) << s.error_message();
-    }
+    } while (!s.ok());
+    RDMA_LOG(0) << "Connected to remote node " << worker_name;
     delete client;
   }
 }
@@ -183,6 +198,138 @@ RdmaChannel* RdmaMgr::FindChannel(const string& name) {
   return iter->second;
 }
 
+bool IsGDRAvailable() {
+#if defined(__APPLE__)
+  return false;
+#elif defined(PLATFORM_WINDOWS)
+  return false;
+#else
+  std::ifstream ifs("/proc/modules");
+  string line;
+  while (std::getline(ifs, line)) {
+    auto sep = line.find(' ');
+    CHECK_NE(sep, std::string::npos);
+    if (line.substr(0, sep) == "nv_peer_mem") {
+      return true;
+    }
+  }
+  return false;
+#endif
+}
+
+int TryToReadNumaNode(ibv_device* device) {
+#if defined(__APPLE__)
+  LOG(INFO) << "OS X does not support NUMA - returning NUMA node 0";
+  return 0;
+#elif defined(PLATFORM_WINDOWS)
+  // Windows support for NUMA is not currently implemented. Return node 0.
+  return 0;
+#else
+  VLOG(2) << "Trying to read NUMA node for device: " << device->name;
+  static const int kUnknownNumaNode = -1;
+
+  auto filename = string(device->ibdev_path) + "/device/numa_node";
+
+  std::ifstream ifs(filename.c_str());
+  string content;
+  CHECK(std::getline(ifs, content));
+
+  int32 value;
+  if (strings::safe_strto32(content, &value)) {
+    if (value < 0) {
+      LOG(INFO) << "Successful NUMA node read from SysFS had negative value ("
+                << value << "), but there must be at least one NUMA node"
+                            ", so returning NUMA node zero";
+      return 0;
+    }
+    LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
+    return value;
+  }
+  return kUnknownNumaNode;
+#endif
+}
+
+void MRDeleter(ibv_mr* mr) {
+  if (mr) {
+    ibv_dereg_mr(mr);
+  }
+}
+
+// TODO(byronyi): remove this class duplicated from the one in
+// common/runtime/gpu/pool_allocator.h when it is available in common_runtime
+class BasicCPUAllocator : public SubAllocator {
+ public:
+  ~BasicCPUAllocator() override {}
+
+  void* Alloc(size_t alignment, size_t num_bytes) override {
+    return port::AlignedMalloc(num_bytes, alignment);
+  }
+  void Free(void* ptr, size_t) override { port::AlignedFree(ptr); }
+};
+
+// TODO(byronyi): remove this class and its registration when the default
+// cpu_allocator() returns visitable allocator
+class BFCRdmaAllocator : public BFCAllocator {
+ public:
+  BFCRdmaAllocator()
+      : BFCAllocator(new BasicCPUAllocator(), 1LL << 36, true, "cpu_rdma_bfc") {
+  }
+};
+
+REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocator);
+
+void RdmaMgr::InitAllocators() {
+  RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_;
+
+  Allocator* allocators[] = {
+#if GOOGLE_CUDA
+    ProcessState::singleton()->GetCUDAHostAllocator(0),
+    ProcessState::singleton()->GetCPUAllocator(0),
+#endif  // GOOGLE_CUDA
+    cpu_allocator(),
+  };
+
+  using namespace std::placeholders;
+
+  std::set<Allocator*> instrumented_;
+
+  // Host memory allocators
+  for (Allocator* allocator : allocators) {
+    VisitableAllocator::Visitor alloc_visitor =
+        std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
+                  &RdmaMemoryMgr::Singleton(), _1, _2, allocator->Name());
+    VisitableAllocator::Visitor free_visitor = std::bind(
+        &RdmaMemoryMgr::EvictMemoryRegion, &RdmaMemoryMgr::Singleton(), _1, _2);
+
+    auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
+    CHECK(visitable_allocator) << "is not visitable for instrumentation"
+                               << allocator->Name();
+    // Make sure we don't instrument the same allocator twice
+    if (instrumented_.find(allocator) == std::end(instrumented_)) {
+      visitable_allocator->AddAllocVisitor(alloc_visitor);
+      visitable_allocator->AddFreeVisitor(free_visitor);
+      instrumented_.insert(allocator);
+      LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name();
+    }
+  }
+
+#if GOOGLE_CUDA
+  if (IsGDRAvailable()) {
+    // Note we don't free allocated GPU memory so there is no free visitor
+    int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
+
+    char buf[8];
+    sprintf(buf, "gpu");
+    VisitableAllocator::Visitor cuda_alloc_visitor =
+        std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
+                  &RdmaMemoryMgr::Singleton(), _1, _2, std::string(buf));
+
+    ProcessState::singleton()->AddGPUAllocVisitor(bus_id, cuda_alloc_visitor);
+    LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
+  }
+#endif  // GOOGLE_CUDA
+}
+
 }  // end namespace tensorflow
 
 #endif
diff --git a/tensorflow/contrib/verbs/rdma_mgr.h b/tensorflow/contrib/verbs/rdma_mgr.h
index e711e60478..29227a9544 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_mgr.h
@@ -38,6 +38,7 @@ class RdmaMgr {
   RdmaChannel* FindChannel(const string& key);
   void SetupChannels();
   bool ConnectivityCheck();
+  void InitAllocators();
   const string& local_worker() { return local_worker_; }
 
  private:
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
index 74f6681af3..ad3dce1784 100644
--- a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
@@ -21,10 +21,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
-#if GOOGLE_CUDA
-#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#include "tensorflow/core/common_runtime/gpu/process_state.h"
-#endif  // GOOGLE_CUDA
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -36,11 +32,6 @@ class RdmaRemoteRendezvous : public BaseRemoteRendezvous {
   RdmaRemoteRendezvous(const WorkerEnv* env, int64 step_id, RdmaMgr* rdma_mgr)
       : BaseRemoteRendezvous(env, step_id), rdma_mgr_(rdma_mgr) {}
 
-  void RecvPostCopyOps(const string& key, const string& key_with_step_id,
-                       const Rendezvous::Args& recv_args,
-                       const DoneCallback& done, const RdmaMessage& rm,
-                       RdmaChannel* rc, Tensor& val, const Status& s);
-
  protected:
   void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed,
                            const Rendezvous::Args& args,
@@ -74,101 +65,18 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
   RdmaChannel* rc = rdma_mgr_->FindChannel(src_name);
   string key(std::move(parsed.FullKey().ToString()));
   string key_with_step_id = VerbsUtil::AppendStepidToKey(key, step_id_);
-  // insert callback
-  rc->InsertRecvCallback(key_with_step_id, [this, key, key_with_step_id, rc,
-                                            recv_args, parsed, done]() {
-    Status src_s, dst_s, s;
-    Device* src_dev, *dst_dev;
-    src_s = env_->device_mgr->LookupDevice("CPU:0", &src_dev);
-    dst_s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_dev);
-    if (!src_s.ok() || !dst_s.ok()) {
-      s = src_s.ok() ? dst_s : src_s;
-      LOG(ERROR) << "s is not ok, error code " << s.error_message();
-      done(s, Args(), recv_args, Tensor(), true);
-      return;
-    }
-    RdmaBuffer* rb = rc->FindBuffer(key);
-    RdmaMessage rm;
-    CHECK(rb->size_ >= RdmaMessage::kMessageTotalBytes);
-    RdmaMessage::ParseMessage(rm, rb->buffer_);
-    CHECK(rm.type_ == RDMA_MESSAGE_TENSOR_WRITE);
-    Tensor val;
-    if (!rm.is_dead_) {
-      void* input = static_cast<char*>(rb->buffer_) +
-                    RdmaMessage::kTensorBufferStartIndex;
-      bool can_memcpy = DataTypeCanUseMemcpy(rm.data_type_);
-      if (can_memcpy) {
-        if (dst_dev->tensorflow_gpu_device_info() &&
-            (!recv_args.alloc_attrs.on_host())) {
-#if GOOGLE_CUDA
-          CHECK(recv_args.device_context)
-              << "send dev name: " << src_dev->name()
-              << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-          Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
-          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
-          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
-
-          Allocator* dst_alloc = dst_dev->GetAllocator(recv_args.alloc_attrs);
-          Tensor gpu_copy(dst_alloc, rm.data_type_, rm.tensor_shape_);
-
-          GPUUtil::CopyCPUTensorToGPU(
-              &copy, recv_args.device_context, dst_dev, &gpu_copy,
-              [this, gpu_copy, key, key_with_step_id, recv_args, done, rm, rc](
-                  const Status& s) {
-                CHECK(s.ok()) << "copy tensor to gpu sync";
-                Tensor val;
-                val = std::move(gpu_copy);
-                RecvPostCopyOps(key, key_with_step_id, recv_args, done, rm, rc,
-                                val, s);
-              });
-#endif  // GOOGLE_CUDA
-          return;
-        } else {
-          AllocatorAttributes host_alloc_attrs;
-          host_alloc_attrs.set_gpu_compatible(true);
-          host_alloc_attrs.set_on_host(true);
-          Allocator* alloc = dst_dev->GetAllocator(host_alloc_attrs);
-          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
-          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
-          val = std::move(copy);
-        }
-      } else {
-        TensorProto proto;
-        CHECK(rm.tensor_bytes_ + RdmaMessage::kTensorBufferStartIndex <=
-              rb->size_);
-        CHECK(ParseProtoUnlimited(&proto, input, rm.tensor_bytes_))
-            << "fail to parse proto from array";
-        s = dst_dev->MakeTensorFromProto(proto, recv_args.alloc_attrs, &val);
-      }
-    }
-    RecvPostCopyOps(key, key_with_step_id, recv_args, done, rm, rc, val, s);
-  });
-  // append key to message queue
-  RdmaBuffer* rb = rc->tx_message_buffer_;
-  RdmaMessage rm;
-  rm.type_ = RDMA_MESSAGE_TENSOR_REQUEST;
-  rm.name_size_ = key.size();
-  rm.name_ = key;
-  rm.step_id_ = step_id_;
-  string message = RdmaMessage::CreateMessage(rm);
-  rb->EnqueueItem(message);
-  rb->SendNextItem();
-}
 
-void RdmaRemoteRendezvous::RecvPostCopyOps(
-    const string& key, const string& key_with_step_id,
-    const Rendezvous::Args& recv_args, const DoneCallback& done,
-    const RdmaMessage& rm, RdmaChannel* rc, Tensor& val, const Status& s) {
-  rc->RemoveRecvCallback(key_with_step_id);
-  RdmaMessage br;
-  br.type_ = RDMA_MESSAGE_BUFFER_IDLE;
-  br.name_size_ = key.size();
-  br.name_ = key;
-  string message = RdmaMessage::CreateMessage(br);
-  RdmaBuffer* tb = rc->tx_message_buffer_;
-  tb->EnqueueItem(message);
-  tb->SendNextItem();
-  done(s, Args(), recv_args, val, rm.is_dead_);
+  Device* dst_dev;
+  s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_dev);
+  CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
+  if (!s.ok()) {
+    done(s, Args(), recv_args, Tensor(), true);
+    return;
+  }
+
+  RdmaTensorRequest* request =
+      rc->InsertTensorRequest(key, step_id_, dst_dev, recv_args, done);
+  request->Start();
 }
 
 RdmaRendezvousMgr::RdmaRendezvousMgr(const WorkerEnv* env)
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index a606ef75a4..47ed83f521 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -104,6 +104,7 @@ Status VerbsServer::Start() {
           [this] { verbs_service_->HandleRPCsLoop(); }));
       rdma_mgr_->SetupChannels();
       CHECK(rdma_mgr_->ConnectivityCheck()) << "Connectivity check failed!";
+      rdma_mgr_->InitAllocators();
       verbs_state_ = CONNECTED;
     }
   }
diff --git a/tensorflow/contrib/verbs/verbs_service.proto b/tensorflow/contrib/verbs/verbs_service.proto
index 0df1fed4b9..abdae1d84f 100644
--- a/tensorflow/contrib/verbs/verbs_service.proto
+++ b/tensorflow/contrib/verbs/verbs_service.proto
@@ -50,6 +50,12 @@ message GetRemoteAddressResponse {
   repeated MemoryRegion mr = 3;
 }
 
+message ErrorStatusProto {
+  int32 error_code = 1;
+  string error_message = 2;
+  string error_details = 3;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 //
 // VerbsService
diff --git a/tensorflow/contrib/verbs/verbs_with_0_copies.png b/tensorflow/contrib/verbs/verbs_with_0_copies.png
new file mode 100644
index 0000000000000000000000000000000000000000..0641e2fd50da3738e3b8113f4324156063bf52a2
GIT binary patch
literal 62862
zcmeAS@N?(olHy`uVBq!ia0y~yU@l}}U|Pw+#=yWJl(%#j1A_vCr;B4qMckXY+;>8Q
ze;t=MPM*RUyTV6<=c)<kPM%Y$YSTLkeKp;Vh%5~X(VChy#Vc^}6@$CWIuwg$&ve>(
zR^w8D;#8AeS|*~&$Nqm_^Zoyvd7JNT{eJWJo6qx#zuE2o`}4<(+VgSQwy$?zyZQdt
z+vkzzZ*_O<Q;$(#U_wJ16*h=>?|8qzy|4Rd#z7AO21X>*I7w~8^|)$5ml#if#u7Oe
z2L>dh;K!oy=#fd(22mtI1{Q|}PdFWpO#LImz<^{DqX0vrg<wNR_vMU(VQAu$)ENZB
zKLs-|pcv1<^2mijNzZRFYqTS>wG1p?^B7!WPWm%4ptzM$phK9kV~0hrSPm<)9~cER
zDwzZdRqa_BP>pdInI1T85EZ>WZ)w@vTNk_a_r0iou3!Jt+v??#$uEArUcWrK&$3M-
zCuUvj?y|t|@9t(kI?{RU$;->jXWKnyf`@8j4#%J0_x~?TpI6!T@6U7l%hCCJU9+#R
zGrjtu>gAl`a|^fS-VSPJ<L#>3yQS!<*REHqR%g7twDignPvKd%)n-@M$J<M*XfQKb
zsxk;pR~A&i`eyt6x-S=;`7d4#kB|MrueMQANpJam%@8(dYE|`pu(;oDQFPwU*0`#d
zs{VX?;$Gd~zrV8c&tmz%D>5n_kBGj#FlA4oK%u2rVNC4dHr|&XkIOH2>y<j1)Z}>N
zXwr-X6^;f55E8L%%)PzsqObYglxvPN^~8!MtNAW6%e|GdYiq)M=Ody#wo>OD7?kvu
zbVP5-5S&|fD>HLx1dHvr8_8Fe`^*3O^ZESQ^u_M|X8F?#8o>sFjCEKkQ1N!_^%s9W
zpI^Ry|G!zUuC4X{ez$!7&!6h{KRN%tum7+7?b6pbHzz+kJ3D<sOx@3?CY7I3tR6Hl
zcL@o)#02tj_b3?&*6*o&{rmm?|8fp8EN^TJ^#qR}Eqiyz^X{%vV}7N5Z2WRLhgiNG
z{rE6=xu5CBE9>`s;tE|KH}{;~?;PQ(%7d)p7wmq&c`Va>cDDKDDVo7O0X5I(mUrBb
z**GER{yy8-nvX}H9n6T&zP4r|yIjQq8<n4oM?T(uKCgOR@%eeDg^zE1m3*wnkUwgl
zTd!2A%gTGNzu*7=Z#o;Vl#6p4Pa)U!{Cz*$=GA_SJlrDxs%GQeKL?&|&)ar4=6nMq
z^O@Sq%X}AWg|3=0f2aPL&d09`;Xxy!-zaIEcER*|Oft`&hi%dsZ*FYd@_yg%Ws-ls
z@BcqnZ|4)C&yQ|r&ySo@+7q|hvvzm+d$ZRI_W%33{>xGE_!(c_udj=J8NUD5RO6b-
zx24VVWNd#vnf$CSuKdH_t=ZSjUYo49E!<OA_kQp9pxlr5_CM~k?u*)*<tl4kwxsIo
ztEFFKKR?{6bo=t|^8BbB1&aE|y2}$1BQFXzbeuL&adeg0U-;_M(r!U{J;CGgS^r}0
z+1xz4Ng;V*$puIDrL|vPtzI9r+;47=Kyhq%Okrzf!If2^r)$peH;BhoG}isTeSevm
z;OinChIU?QH}m^7#jg~fiTbUS-}HQL`Mj668`<S%ta=y!|JQWShEJbP>rdxae9Hb{
zZuvb=e)~TgF29?m8-1yn-_FBbwlw5bvF@XFdp@1o(tAN>(XWH-@*;P|IoRN#_(t)=
z^!<NYf8^W5?|Js(`g-};>bG0d<_f92j5d2Mu{Z2-sq5!wXMHWy*PnfpeWXM1%3^o^
zUk~}~XZ-4jU$eP)QjM!!!tbA|zbC1B_po0Sn{&VRyX@bO{q-jLJ7aS<s!vkhT=(~N
z{PJJxTxHg;wfHkdc>f70zG6<6CZUFo(+*l49lt-F*6*F(vp4LMYTctw^%s}C^-Z^K
zX7AhoAg;;H=fNlEc0S+ACqEhwKJwdlCvMrdx3{lX9H~hD@agI4;QXFF@1C~{aT|Sm
z^zo_cqZ@hmX8d`vxPKPAVfCROXQ#yca;&{udR;Q`*gey_6@TBA@7I3bx#evl<0(nQ
z)uF4?>@F$!-9Mdszv}hcHeNjeXqn&mNOeQzWwl)&k4a~~yR$RXZBu!d>*<;4^E}n(
zRV2O2-+y;<&NV|v<Idvf<@2hv&d#yC+*0-FsGP+&8LuM~<RhbxoSkoff6COgChIoc
zIXB0$_(hAbpMiX~VETQ2|Co>GUMFqgQPMlSqNUQnBlOR))}x&jSO0z6zJKO+1Dy|k
zc0W^g{r~s-Y$1z`^uIr!&r3HR%C9eR68Wd}y1*?q?|7f=v$K5M=KG#3{Iq%gUme{&
z6Xh4xoIl7e?-O79H8e_6ch?`G*##2n?am*LK3vGm&S#Rpe&643(T_8erGH*p^!4xe
z`|4sDe-xgzi&t}5%Kc-v|G~Ji{Qb7udB%sXh_!9&c)YFZ>#L_c;=0H5g>&EqvqIJ5
zUUMH8fp&#mdjEc(f35Vb<7M*9H@Q-d*A2f#^<J0zey5l}w(Mr=vdVK(=W<Uj-G1yv
zX`kNl=$J3FT-R@}=<ce!cy|8BS*^Ek)c^n6yW;Wur%(6B+22&^`+Y)x<y#M(Gs*%d
zK$Y@wr9OEtot!7U{N09PK}tJryuG*3xjkg{x96MF&Kf<RxS_oNsPO*18(oF|@2UEl
zCA{c>BJ-P9_FpbIcgiR2ncn+&zqEN?&YtEPw(gic+x_hFZ*9>mjMn+{$3<rQ;f)V;
zcKwJEzjyigqJaPNSf9=R_eI_6&xgal!8QEtswbR#<x{L$oa)aloI6LLO<(uX8!Kbq
zlo~#0RiO}I!%}hfpi<jr|67S&NlR;WCdVyri9CKo>D;~J>$lez+*W#2abL#W(RYTy
z!5K%gh3?H?=B!)UKc~a-ZD;hxB-W4D`cqF&oB36+`ui4v%kw|~v*>GhbV6D8QHAZ-
zE5ZIV#eeT`P|#uQ*dfzt!Ds#CzlGRrTfd^4Oq!P&Je{WW+yA?v@;_ov#m3+XJ6~Ot
zoMN<fv3ydUQs3msADDa1e}6ifzVCxt%-?`y!DCE5a#eAO6ZK|W@EUgCJ2h2%d8ftt
zvlkqHwN}GhLS4-a?(fz_ZSAV)w|{^0`3sf3^*-&=X}kM2y570$Z|}P{db=Ovhg(v~
z|L1)_Hn;BA%ghIRHU$^`NZp)vmdWwjBmZuT=|8wb6!VxEQ<k5xVc)aw#~#6{-y3>m
zEHByg#Be(A_?F`uW2TwAx%Nk@mAYSh^QU`u-?Z+PUXOLVvGd!hr%p#Q3#V63DPRXx
zp5VGphU3GmX5}`EBLRnWRkhS_m-;OG!|LcdN1J)4_Gi`h;Ac)p3a=H2m6~Q<={S7h
z;4vj-2LG5g#*fto1x);9H$UGq*!N}w!}N~L_xUPK*V`7px%+s7SD*Cv-Cqu$zqjLc
zxZdQB$2ao$7@xN5<$FA{DE@YC@4sKK(<X14yEUkLhr#owFN@YGOLe{3&0Gu*CJ{#2
zZ6~so?tRanzgByD@jHfdQ_fD`v*YiV%l>Dl35k^_UNgMeWl^Vip7V)H@eUX1myTCo
z8@>&^tfY4)viz2CyZE&qzg!d<1jQYX6khC=HlNkb-)+3lLZ@N>f=?G7`R@9&E%V2R
zhtG=VdR|(e`TO7^;r0T{Z~yMr{eBy=In8(1i$&cpZY1|F)s5b^B<SDg!;@R*FZtQ4
z^r&Q#=(_2fqx|OE-3@tZW$$M{Bl1sSlJLD3vtKrKoOa=u-^>6GR)>#58{)ZcM{Yk_
z6enG{sy<Hm<IzC7IajT2m-=i|I(JWax$uOJwQ<#te&kq)<<Ie)BYe4VUZaox?VWxd
zJ08v5U$~h^NpDfd<45`X|8D!KboyAVb*bI*n^Q~v=`=<BDDaB1{yg#CgICk%PWrxb
zW5*?TsSO<|>t~#kSZN$}E7gLLJ3Zx*@3xsXg?HTV%|4zKxcp*j_58|bl8;n0e{Fg{
zwa52PdEMfp{MnVSmmbx0iMb;uzd;<@=D)<5aq)2dckA+ZIp<EU4PM^oS#<3A3A;}x
zl#d9SZ!*vO^yK8TgdFS7r58Xx>C?S_Q*(9r`n+u?3!Jv^{eExu4-UpYn@Jn|-*#>m
zzEJo{=-%V=oR4?JTXkE=OPyOZ^?I#9;g4fax)@wy_*`W62R_o*uD|&7QI5}?In9il
z!+ZCZ)_hoUf9hMy!e1WD(E+<|aaZ}x(aT<b>FN0|hOEEh#CqmfPCmNo{?l?lnM0bz
zUq#(*q5Yo;_DnTP1m))*nYp4l=Fg<r$`wBh3ifS^JnkY`C~cNw@Y=L;=bQA+wSF@D
z9iA8XTz9-5)PLWykYhQIk#j-6I4J*jSgdQh{OHs4iTp>Nb?<*Mr)+-go0^Yxrgf*E
zl+IA$mg2LX(mrFx(VKIuN?)a%+O1dTZCdx|#}xA(+ltNU=gso>{>YtPyRqSM0pqu}
zj}ITzuzo!9vG&_dwNKB^q>8uC{N#Mcdq>d@&HaBLojq7|P`I646r5NXIF8Ie@ayaA
zvsTKNPiNozbkOrr#ryp+H`gRD>afVW{yxUKZ_cCXm+y#2A1TaZ`L^%lucN9@ZfriD
zZF29qy1VW{Ve$VzV|8pBI!;gQc${Tnsyn}5Epcki6NcUAEo<%K-<*A99xUT@g5Bej
z_`e?Ga|ZEw_pHB7p8w}aCNH;(m`%;Q+t%;*Bnz$hFjHxIOi^dy&)u7()<$kV)^UiH
zTWp5W9|g<bZ#JKOIQ!1B`-U7ct8T7~-CeZp$6Vg$Mswt=&fERgvHkU8u^J~=_o>Hy
z*5~G|KGby0?uL-kx(GS$4b0GFwMg)RDZ~DbXP+-esGK~VQCC@gN^|*)uiW!XwX;=r
zZ#8&5V}(-R?YQlEa|Kgm_)fjHh%^3pbJw10kM>r-(cR)IGd=m@$={D7dh}v;T=+71
zZ+=Y;!~QeZ+Gl?LD^PgxnW)!$MFzq4n;4y8LnQ`5@x=$5RzedALV;krC4vL3YDQI$
zh6#G27)>X1%{u{#O7E^2>No3fG&JxxvhSMDX$Q%=oazl<0}ioE-0k*Zc)jqAVAza~
z*SEG+ehXWu!l5$bz>b;RocwE9_ziY}hZz3*H@M1NbDqJc@IbBMYrvM7jDq65@?Omj
z8B7%!t!{1L3@_l#Bv80f#!O<aI#UF{D>Lt?n$hsVkv3$x1-IAU4tsTTr>$<xhe<DU
zKW~n@w1V^Uima_tN9Oasx;%B)Hz|j9`AB;M-te7QSBG7CY%I5M_O5TsIv=}T5Pe%G
zyz1P6-p9u#sWr5RdGgy^@iQp|II$=^^2*q7@#+PZ4RTz)I%^#+c3TJ*h{xCQZr!(L
ztzq3eVdIxCEid;@c&sh1l=uBztV!OH1yc%y9j|Zw?yxFkWs%7#4PK@$2Dzvie4uO+
zAO$ipdCtYtZ#WWcr^uRXhJI1%J9=e8-S)u$3%@Vllce+}@6Dy|d9#08oSkLWDP@x7
zl&pSsN!C?<pU<~SU)y%<`trg(h`;>L@AvzcM{TY5@;^6kme%&%*I`x37nzFJ$2@lD
zUm84XuIusnC0St01qx>{gRL$CS)F&{;zI!zg-1>Yita>xQR=fzIn4IO#3}dMo$%9~
zTC!r51#=Id*{eS1&Y6n0TNs|~D}Q}u+B>;_4J!}xEid8RrdnUWcg}|HHtD5~4IKQs
zXP7{v0~)+c0)<Nr&%H=3W7!~gsxLanrZ7o^dzIN*fB&yxEOYPAv-m41enrkU`p?S%
zuS^!jsTbA6y>@dxSlBv~5tPxFIx;A&GdU)?_e>wdHRqd!-E%uG-@PJd#y9^;^`o4P
z>WR~0>h7F4Xj1TFM-gY>8!LrJi+r`Fd{b{|7fa*dXkg%&!p7LK<B~-B7b9z?2>#UL
zx@Jn}L@#M5PMaQ6$17VH@^?zd<i7>6yY7eTrCjnqQYfKjQ+QEL$ctUQp*^f1Y=<zY
z5%kE1K}ql2V)onGvly;9U*r(~JI}&1N$F6}?$d|fy|dfCOlW3@`Z=+}IY)~=wFEx1
zvll$>E}Y;Wv!%1;(m~D#3q|LMzGDH8x(hdSbbo$v@LLzCXfrwFs{d$5^*75xA*GX#
zy!RBn*Q-0-u{HI*-m=@fOAJEVdj8n%-Ty{5sXunBl<B2Ins19v_(|p}Ei5TuQvHA7
z)~41MhqycMRcXuKI;k#f`&;K?BfF4pykE(`pDx+gm)-yT>9kAw9LFWEGj30N@vyz~
z+g<a$cVx>2{=O@(K6SrR{o>y5LO1t+m)>Rnr*TE{@lDoCa{q0=61rXQ;^*_zYu{}O
z|K9oj;#bd`ZM@$jCcG?uct~|eZ0xl9=SkM7To=DypT70s(W~1or!9KB$v#=nH2c^0
z`p?fco%gq1`zL}|V#9H}+jZ4za(?Hw#r?fpd?b6%;xpc}=bz!v^#1OA%kuLX+s~K(
z#a<{ZIy=3Yuc&_ijcE!pP4=e>(ixZp3K<n1PMf3}@agr!4rO_6LGg|Kja`rD>~NS-
z5X{Fh<&U%W8Kp4&52*)FEm|13*p@qV?}YlNqDpZxpPxoqY%VyX67faiwd<xU+l$pN
z_L>Wwo#UC(ZIQd)HasQq-*NkQ8~>kBKHV*~{(P?BilZ~dGH$UwRQ%q>TUYk$Y@wue
z?)LE8yX17eTzGcpB!1kS7b`hAa`P#cEqC(2yMF$#+BjqW=DE{%%{MX+TYKxX)1h_d
z;%_saIX>&IW1QHWF0qQkk3mV#Li<i+6^mD$tw`Ys#YZ9zC*3N8uirB%ywh>z;C(^K
z#TFb(1T}uPPs<k6?v7VG%3bm8z{clp-MqE@)i>7dXyUI`xwT_rhxn#93zG#mBskuD
ze{0Ll$1epK4><MgEp!y$s`Tibb^nU0uO+u{a35oGtbX+8$i!`q)f}m1DZgTV?XRtl
z>|7Q4ox@?ae6?z9c!1;MuXE3~b_fgV-m^?ptF3su^;4YN`hTo5&DZ^W_9}kc=b-DV
z$r<PM|5jHr3LX&_Pqo|WqBrkor8<M)^ztVe25u0qtWkKR#N<*az~Q~%iF{O`jiAn@
zibk7Kt-y9Z7t^dM5A*NLc=7G_={?13qlM4Tjea&!c%#Zwx!Nf&cD=r2|5@5Wuu$jU
z4dWwCZ+p!<)O@wF+yBdl9Vx6{x54bKYf0@oR*CC|6(8g_3Wv;L-MfB~iwyUD@i>K~
z^A9waFX1kI`|!!PtpY;3w;G5`r{-9STo-Gr@xNf_K6Cmlv6<>tS3++e_}un9bDhha
z>orXs+`lu{P57<&NQ*^V%a@~ZQarz_jC;q94u(J(mP10Ipuc`kS@z$Htsl0z%Cvvo
znBMLzee-bH+pR9j?n1p%Thk}z=%4$#X#V&6tb&F8wqarE%l%yB>mtNH9e>*=_jbSf
zx$O9PTQaM2-s}zeRV+3|>CTQl-$HKf$vLLPb@XuNdsCiW{sxoP&#jKzxZb^g)7}X(
zc^Z$G?)Ci^XK*`jditrON4Fj3vn}ndoKgAms{i_(OP?>;lf$9($f$Sn{hCH1obMg-
zvFYB_hQo1NcDuyvE_0b{EgW0GcyW@d%eL)j`3gUCRTlFusd)74tnw_w#c%k;6m*{b
z_#_lsH`%HBPR#GmG4qQnRNfzc+<n;h=Oz2i;-|m8`(W#Mi1)D110SVx+JF4JeLfzz
zx++xcTh{cAwnewH#g7Re|9<M;QH6(_1D`E5c=Ge*^UmbihdOpBREz(8Hu;|WsfWjQ
zd;+EY_a`nU^l(i1Q>F1J!tsbJOUEMxfoTpW?YHa^vdy{iKwj>}E#c$yc04@Xxo*#<
zGlvh)lm0o=R(21^wAMYg0ggsrU%DD}o6lW2ss5bXk<_RUyY238$@D)Ywp(5&z|Y1o
z_R7h5JCZ~S=V%63PuR03#i}LfSfESJg@2y1TeIJLnmqd2dbTX?)vq_=EO*W??$_Jh
z{G9K5=#i%j+tbeZNxB^Qc~-q+$A<prbMC)ae^ep=Z^y*u%m*I|L$==f-`@5+LLqW%
zy6@bScUg5|-cqKCQ42OXT8mv<w&3d{JAnd0jh|lwj$|?liVG@qH9JIr3gM!+wwJ2U
zSIE!So^xrcOq1)K8yj8j?wZ;f@<}Q3z2&11UVPSDY%SvCgsTm{D$TG@PSm((e6L*G
z?a0mCTMNtU=HD-mujKFF>wjizzR!`M2^)?m2wRHJvk)t^%DHrIj%IS|dB*TZ;xFTu
z^-SNk{QA}FInS4_JAKYiv-*pLPEd91+l||f#!slNnfXlkHmjg`N?7;{+c>4Zqe0vK
za$hc(wUvp}P_STY*7b#Yw!b82|2%W&WAZ!B5W5U@#Yat(4%O~d0GF`>ipUweE|MX#
z<L6xKwE4p7E8mq`Rn7g|zgNM^cQ4O0?byB93bDtfUYO+yroT_TbX@*MV$(Ot^tkz_
zcgB}R)$cF)_KRnq!t?h#&#n0^Y!&CuQ}K2F>rI<`xuV~8US8N3qjtOaQB3;zO#l1<
z!R5z~<j39m^JZ_>oq2NX)oXr!R(Srib*8^SVWRxmmA}pP>t^$1RXJUITIx8xBl%-#
z`TXdW(d!$(yb0dST={4Z%k3W*onKzq)gF90Z2z)E=Bi8K@p;Ri&#UW6J6k)ouI^0X
zv+_qfAK#a)zqC@5@y4>RFExXIZdUGF<!@`feR(T)oLA+~ZM7SZcAOP`umAP$Ci7pg
zwE$oKZ2Ta7>AhL7xW3j+>ldNtzNy_auhRmx>janhpPlI!y6TPpWc54KuT6_}`=!`a
zy#Fu1#&!4gWiEfb?%ytX;Zn=qzsPH<*7KA-fu0sjs{7@*+E?GXD^MuNxX6N|MYW+l
zEFx)#Jd2P+gR8ds(L!1Mi4E^mmvX=Fm3EfCU(?)|cYWTgZM!szwik)|olNT0|9I$T
z+fJL(T|FGfH%?BQwMb{zFaEvr`Rq5@ovlq$+xz#Ll5(0|{Qpd?Ba`1J2VP(LJJ#(_
zVXd_J8h!PuBdaUEe_%gTdwYBCSDo*d_Sj$EwRN?x{f~VY5C0DCw{w0mxj#&ZTU)U7
zR@vQm?C<v`vuya%%=(OXZnc!+iua$h+K(>%^7nyNyX3^xduLB%ml17IYWNzU5hGA|
zv0$MIhYP5kRMKX-`rx^?72158()H5YL>yhWiTlhJF{w@!EIE9cW%_sTJib?*I?w(c
z+%`ub;_Hu`TVGFV{i=HGw(8gC`N4;!^SVOv^;~7D??kNGp(j`fYGd9>=GOfAXVU5Q
zo|TW6$bD?OsnpjxmGAl&uB{^eb2^-EXRkQD*XsY#DIJ&pe$Xm@en8Nq?o5Et?oxYc
z`P(MGJ+JCA1ef2w*fe!-{w|(9g%g)mg|%*!JH-{fO2DS@;*W(U94(;Md`5}}1E`r<
zsdzx?-R|@MT}x-1E*JegZ{hoY?pJn1m)}~Rb#=|H^GiFDnT;wRxxDG<cznU5Hq-z5
z4MFB$dC!Q2-;Q*a`%KTju%h#0(?)iGz34j|4K6L5RWo1jxSVljw3BkKp=#;9vf~O;
zceZDrl0H%D$)VY8F<0W)y2Drbx_8{(UE+}(B*>YxX_k&^1jp-z8O?lHda@J5nUo&A
zGS$DWzb)<I>Meyoi<F<NGR=^<)fUrfvEH^&$Lq%=+rp?L(7wzBwH-g=cI(;u%e)Uh
zy(RsO>Z7J9$qO~j@^0@kJ^NbSj8FRShV|;_{EwVWcv&`W@9rBfIeBb1XiN|Wb)^q;
zV0VDLy;=9a-4Ba}1dsRT{dsPmdDPfHc=7#vPNnCv1W)yW9P(z$CVQW6<u407_pbMO
zAN`omV@Ar-Q@>}`Cf20h5<cGa@ulVE)2WBIxXzJ1wes}YiYJPbJ6xUY-r5#En#sLO
z{hap^-<bxtZuZ?1K7P+-eQn)BHc9hzUxwEU*Ib<0B!D&A5~?39xw!t{)_GxPX1i~l
zJ>Sng_O9Lr-9(o;-y~iZxMrR?w_}B~NNn}JJw_(4?ib5id8=(ddeY~|rp-RPoHT#h
zU5~RA?45HpX!_>EHl-F%x7hcXzdcu-W%hE-#F?9dPgz#qTD+tB`niy0zUwY)8!zWu
zI>o?ueQkRqyR11Is0p7Ca!y+D(V~yenF1WX4E$>^oCp9fZ29DL!23wy<Nr6ec8A$~
z*LfSebr<LQKL_tsr*Apwe<W2W#>rgapa&=KwpUu$oG%`eRnn8PpTgF7f#t&mu3jEi
z&|HdwKFc4MIT9Z}#{YMU&%5JryyUH8D4+G2>v|nK5)QKoiuVLxdUvQw<5q{$5!c7<
zmlOnMF~~(NoVXA?jq#+_K|t_$+nV@4X0@MAoh~`Ec6;F2$$sCq@G9x)A8vC?@?v<s
zFoT&Ldt^Jh%1D1csy=^4GRLdfB2C$Qzl4_83KUjKnwba%gHnmKMaPbeKLJuKPQn!r
z@BM!9qBOrvuN{<?6d0I9{TqyfU%i=9r~B^SzPoQGuP%CWe#OPJv8V2G35I`s+$#3S
zn?XtM){UcHM+#@%D$NeBmMeSdbg=N+q^9sqwl8)3u4Y~Fu+}k>I%m7oICbWyEcTld
zi%gl0M)P;?nD(_!reb2i9^1xLP}h}#$tU4y@#hz_^Y<BY`fFXaFkn5Z`smH!4W-Xq
z^k=PeUiI^yLh9kZ3AHW`H$FMOFHhZCBD+*z%f6U9$IWc*9t-YTnVmn&B9A5Z;yd%F
z=Wd>MwO+BfhruOg%S-`jL3REOd3)Je#R~rzNSG&0P`t9>qT#a`p?#oErqq-^HwH+o
zfjV|cDjU9Sb^Y?Q`hDQzb=w2K#+#U_-BRjfoiwL6Y-?0+5qtjc&K=wDM@=dFu<bhI
zgTR|2K}S5Bgx@*$+c|egK7BpObTNmyU`cE>{}NEeak^(x7Q2$(t%BoDM@}a8^h(LQ
zN1Nqy={$PEU2$<~_~j++?i1#yK2m@5McSy=k25*V`dFVtM#t&f`oGTzi_Mg^{%+8o
zer}ePW~p75%#oA(FF*bI;iY-*_ulzgF3V5L=^g%heO0V_YQNa_TYi75YxBD8q+4%S
z6nqgl{O!gv_2BdSzF%08ST6bdr%?1FU(+dKOag^7xIy7P!G~$~w0Z~ad6jKaWqTU=
zESINR3Z{RzT9NSRYoqR?Hy!fpKVRA5yEkm&{{O!o<nD7{H22ouXKBU9qNcwqzxYvo
zZdBfaqK$s9E-jw_xyCYX|C~#2bgGgsHeE@)o73?6%<gGx>vFF-ePetcGf%(v-JIHE
zbF*!a|9w<;(`@GvIq~+Ca}WM}<5JR-dLPkY@iOT~f!DtI`eM$%x?iY1+E%-<bysPY
zcR7E31E}`bTzDit<L<T=hC7x8EWPr^Q~WG;&dhACw_kEU_e%YXzaeXXHJYsO4}Wzl
zyP@!C@$Z_#*mVN_e-Bt}dH3+?D`CMChkRX>roXhV5UKre&wAyAi@DL;F9jDbn|=TD
z(Rm(qtB-v9;QaH^>Ghr;Z|q2{-T5hJ^}E%7uNmAban4j@5NxjmS1c!0HzXfsi~4tG
zbI9i<@t2sRUj<#?cGlK<^>yBL%=^CnQlG4TE_<%xYj#2FPv-NE_QmOUC|EsO@oS!v
zUmV->q`4{AG?dww$8C81m2LfTi#36J65?J@3V*(T#;vr)r|Umm^vb&YtK{<`+Y?oN
z@t-!YNL7p4u(Kt$C_V3AHJhM#Z)K9}oVQ}v<6ZOxkH=S4F4*`w#!0lTx9&!V>ECU~
zTVwZ&URLsBnX>2VrMI{9%fHOC_<eSEi)6=gpCgO&!n-Zr{@b2>{;NEno^jn7Ta&Cq
zCc?V}!gSncH!Tg9Qd;-rn2aqOqkx7k)BSfh9J}RbI%)G;&WtMAdGlsvB=?E?MfOhX
zYzuGgPO1LVlp}rdHG}SoANS+tS^T!yuJKwW`pC`KCY7AI`(v)nDLJ9M|ALtREZHor
z*HV?7dF!9QF<rc+al)ytzxP0e+w2Iz<<9J^6I%pAR_)0@#kyX2_6KR%zstN^&gYi%
zEIL@b+sc{y4ezN>0i~TgKrK6I-S8QUbRXrtzofX9=lemisbwCm;StNkvSzl-*S?}~
zE5tbyH0iVx8b<<0gvHGo+y4fISKpnnWbanz7oV&iPjG&{bnl9(=eJz=opX1VN&mFz
z9h={FZ1UUWIw!6?<G1Sk#_ZTdN3R$ApZBc1l(TWBX2=z7Nm-YeqjmnE8o_r>$7cD0
z9#57@eT%Ot>8(*eSNHkp^Uli)r`9>Abz6j+`Q4uU{OgC0)xzo3kG8M}-JYI1?QF(n
zqnk?=A5F5|_d=GV!QrJ4$0l{ZYkZsbDst#X+uSVu@po-mYU30$*E=813wgcR?ytW_
z{apUsC9X2<{-?@BIJLiSy8ZQ0)$3`#TdJ<MuT0U1Q{tPqLt(RS{db2WuE}#f1fPFp
znAE<YMR}8TY`4XJ+rlW3E{0dve&<P<7X_ShKK|8zo<Y|!zgaJLtNR_F^1C(o*2Yhh
z9=P5<|MV+!?#<oZ!TFcJNU!p#ezc|IvXfBtqeZm`W0opDDmpJ>+Qh)o;INQ$P3>RP
zUo|gxU)hmee*VprL??~X`j}4NHxnGI=PFhId25@uYu)+l^;t!XG9jD8vb<a8cYMAr
zEM7KuiR&C$&ZF})txFaOSH~)u&At8VQb>@HzTojn&X}?yJ?6J3g}>bp4&QiRt7At-
zO|tHzJ@+nEEWJBbzv{l@_lw2F^5)fd-V{IDvq`Tqwf54&ZfCcD^X|oe>suLac4^=4
zAG*q~-n^L-Cn>+wF1*Yu++v09qd&Jkp7vYmI!AVz;!KOb_nx`VZd)2ErL=C#G3mXm
zi~=5ZjSHS1*}1LCK=z;Y>7>)zS;ti$y_w(>&gRbP`sLT}_gt^PaGjm9)nVN=&rSa}
zO+NDSrjq`8>+<*Rf2;0zY|?d|-~arS@dNcod#)9q@SFMny{1cy$+L|@g%@{qT~Y{;
zVTm{*yZ7DMM+R(R4d6<9iQtOse@Y(xZ;IKyJ>Pt$#&_#S=~rHCk3A(?DEm!zN!%Ub
zIS%XM3?9$f@!(8xsk%#y$t3K}q7AP;-u!a+`F}2zNBe#}5sor^FS)Ax<I<8JN2go7
zZSGioBT;z$f^TPBGXsTY-np{X+*ii@kj%v=<zJkqYzvqMYKLxyR&MA`lBTX5U%T%w
z^N*{X*|+nn*{aG*C+BT*Jn}U4#sv=_&IDVL+y@F++d~Q+9lw99uV1Kdp0gl%*VhGF
z?Xu6Z#a&|J+IV}9OaTq6vhhLcQ*<ls>T<;XmLGn8uHe-FH@951=T%Spwk~R}RQYQ^
z`y18od(0e<q`I_7d(8kFVlSrTC;vf1QfhVB>WLele>*sFXV=VWZL#ffc6N^iv+g~L
z+4b#Hyye?uX$xiH<zHey{=2lLdP#oKm4m!<t0$#Qdn8@@s{Uu?n~uhnR&V~@l$?=M
z78ki`*H^hIUYB39-qfxRoy|9Q#rp?=AuEGYKdrjI%{l$YlW!%@{PsUqdcw0;;q8s@
z9dqy8+rVg>dF_hl(_N>&Z`u1UZI*SqTIpNc3tu$F!xzd$ou0p6OZ?elKF}EX3GT+W
z=jYFPTkjIo7yN#8|Fz3qm7iCXeZKs1#p>Iv+fJ)K+GFIJw#wx4{VzUGH~+OcRk`t7
zl;e8AJ=X*`v2lL-KL2-$y2{CI^JC7P%X+!WTRmvwdGWqW+u|O-+{m{{t^3kCz1OF{
z%=EpxE@#!cDz2?^r<VC2jac@=b<W&RS4%s#bFW#lbSvx8yNBf!e&=k<VR!QnT9<sM
z!||!v!Ch~*p0D40dpi3byX~#-FUEiH`CBZjc1q{{yW-WBpBl_x?RxXUTHfvU%U$Q~
z{;LJsFZK8S_c3bc=gdC)k0#10%{E7jw?4L-e*W3<Z53A!KRz>E?8?n^lE(gNdP~@M
z&6ix-wdT~Oj_ez2%EM9vIY8s=pF9(OE)opk5PZ(H{LYW_^_RNR=Ph&Fzqj?(oU2MA
zY|%$<HYvYIx?U>MaJ|rLArDVM@iwJ%Tlp^jy}LJC{p*`McBR+!W-oGWsoghsh8Fvj
z&r**6%06>{iP@L)_3)(CIWx+nJ*^g-zu2R9rAp(#mj0u<uKJI1x*X-A;@UH_9G{l@
zb<|jXzx}^qO1WG04i({`W;fpzHzoHklC|DvG;cZoz3TQK240mbI_}==+EJ*?E_^#+
zb=lmIxu00`Z@#*Ith(gT$+M>_U+sw9cChZmPPw|cMMuB=*qK(e?#R^%b=w8yH?7=k
zpSk<Vgim^R-+Kw)-&H@;Wlr|F9j7X;PJYdNp8M;CrjG8T#*%JL3>*T18#{KGa>p<2
zJw1K)hm%>nQf+dxeI8D6WNo~D>-g#F!)>?H_sA9B=`^W%@b<`S@!C4w`MKKq+js5@
zo;7#sugLXN-xLWQd?fnxN%=O}?Jrd?E%9Hz-D3Nk6OZd|zLx&GDc(IHlYiQM%bomv
za!%1^`HPZg&+~b{eP7eO`M-BWUhh0HW%J7=cGK1S7ME4s|22CHhxYmodHuMu{X1h5
z^|DnY``7Qg<9FWn?+nRg%jq{v<$n3t+nxS#_Ilj1#>2-azI7B?25N)8_O^7Nc|D_M
zV}yihqLk+n3HM1i14H9pC-2u-=CromseWSj7Kb-3F#)CeidAuW8M^DXw09lvbe2_}
z5?GyM7yp0d&z`-NpO!xS{qwY4>Y*1~*yHtFVz$RIXTCY%cxCH#2IZnJhi|Q~uX|VU
zx4mJ1&$f<J%hk_m=QwtMP~iXQqvZE%OXcNbQ^YTQt^Kn4_`kzqPk+a(Xcu#dnG)z6
zDZtQJBc!pVWA*JdJ1y-V?EYWt-@E(kg4$JKXIK56=@N6n)OUi0%HFt#;-`fwm0!;)
zj*gvGb@KPO=<YMOo-VDdSGzgu!Pi5Vz8$IDx;Xw=)n1?L0<o`o|L$He&*JCrGo|*X
z>APNv#|z1(?&tZoIsTfe?eBZei`(u_t%_WCoqgH6iOqA9Jl6l^TU))O^5VO#_qO!w
zZLlp|*L*)#E^_^^*uT?GEMK7HcWsZ(r?(%ucG<OcoPKpta7DlxmzZTocim04zIRLe
zTvWH*z4gmld&Q*w1WaLP@%Q?h`rXR^$Z6r{w}hWpo&NmlkGQAYHXV_t3lG$OeKmi6
z$L0J9U!+CP`n~u3Jj-=v(#@SQ%WTW*>h~Pl(|P(z-xtUCdpF<D>)fk%e!-rfkN-bc
zS|X&RcVRQ9c_RbI6oW>mt{t(f!!BN*Z5DKO{{J~r_X>iS1bz@I{FR?E;qR}RN-1yl
z|BiAowcD0?YnNNM(YrY<N>#2RFMso|D~>o7e3-jhf1_AR{?zMMPS+G1mFl*9_;PE0
zTD4)W|E(W}-<NDFsn{E{ZRXETi@onQ|9)^>wWnjUhtB?Al{uyN|9t!Ov+|Kjl$Nmj
z^<NR&Oe$+Txwgu6eOaov=+H*N!gaO2i59!|SU*ylUiIVPL4NC>MMa-P{<c0d4^vvP
zrQ`LTz4CXhSCpShUiP+2$nT`yd#|bAeQgaaV}iRa?)JT#-xB>wfOF?{vBGT$SK}+}
z_kC+t_NiEOWwyTmoYnG<&N^EeS~|Et_a5F0?gH61I(6>&eynd{Z~k6K_wCvKrO~ld
zYT5toPf_!ax$t7T&peB8TP3bHrM<eRjwFA*bGdkqpzE)?>tgKF51hFsyu6WJuyD_}
z3%?(JIUOa~Y_%ld)pYTlx3}e$?wd0`*C;n?{yZbkQy=vAxXk%mWHmWp>-FwU$3EX`
z&**m8|BK&pOI+0bO@f8rsv<ro+-^O&sa)Lp`f-86iSmm3M2|Qfxyfm1r1+7GbAmbV
z+qz=oop-Lpte2f17dX#CzrTMvug<zxGEwJue6u`K@a4n5D~rzyT#B?Rol~)``q7r^
zJLg&7RqZI6?SFpeysE`hPZnp@pR<2|Z|iKW&~;O)vNQI7oxJgS{RN|~kG*^zRXO~+
zI{EYywbrW7Ed{51{P{quegkw^xKtnQ_&-~Gzc0I~t)jYyQRfbYq;}qqw}jQ}XTO=g
zy>6bx?|j+o8%s~CMeTicxac)kp0%{U%=FJ)+a`W+Uti$K`2OkEOZETv{xf^MDQ5mR
zp_h$Y<JC@||5LOtzF#`nGrXzd(VGn2*>h#>GW|Cg%(JNUe6RW~rudlu5zgRh|2flx
z568(@6}EJoKAC)E^Q1-Mg)_xo#My4XAJzF{$B$3hr;ci5>&v$XSNTTG_0*KFesn`v
zUCGz%^{LrQeg1C#tsVa&>73t5{=3!ecO5^^n>DxWOoOgY?)sc%mXChj5u2klZ|{-P
zN#BmF{ga^^TN*3>>grnCt-C+{>8d?skyZcptk#t^skNU^emS{%<=v_AGwRP*tg(tM
zKK$_6{qDFZPu{+JzHhDM{Ph3j{@)Tc%jBNxX8FG@ujIGh{u6gyJ=OpB)4q_IKJ)Yc
zK5}1Kwj(w+OzYydkMGaf{??vlm}+`;#Xs4XpKb-*KDVdYtyw`JnW<HzaOaV<Zj0)R
zb6>unuA5P^$@n*G`7+zWP1o;>g?};+{dVT&-|e}4o7DYSc+%%jDHZ?k_y6tF`ETD$
zDZ5=)|G!dp+2*F#)2~}`uXt;F#q5USj5wvf(?0VIPQ6+E{PedYg-%mWK6YOgm-M>s
zhW^~IcE-Q=ZJ1}F_daxgN6OxJSF0bT*lN^RXz!Tcu{r);b??%>cE{%Jh_MU2^jfP*
zaCz+Gq+NFYzTA4(r^nA<yzG0`&6}D}jqUyCv;=fn&outKCU)<sC0luC&%Lx&{oK=4
z;)2hGQkC8;x&JNBY<K^<;zu!yj;a>K@Rwdce0{3QF+2YA_xjV`hh~@GJ$q&U^?XnD
zr5k^*`YurMw)67a8{c=Flm7B6`PI{#XTN1fX86Y#cJ7$H?!HfQTfN&<?f?C4cd89|
zC4L1rhjv?th5mj2<4$KZOUUol+b=B@&t3kmmT#eH_IFM74SVdiyFPeO_h0L}*zKcE
z0WP=K#oBvhx4%x<_%c0xTQTcXhXo;=DxpU{R@|!ZfAW6j{P5P(H|sC&H4?uS$(0(b
z9J&4N{`#5U)}MU-UawADcXjHrH&dccoctzww`@j0(Dj)+zTHn-Upn_}O!4Q>3LnL0
z-QHbd5bqfGe((M_b3?AVMISM>{HOA&_%?55eE7@NIo*PBah2?wJ|5ZmtaV-d`K{7c
z_x<Mo&z|df;!w7CWcbFpXV2w$TL1r>A20a)_(c27#)+qX-8lDrfud-Z?UX6cvo3b;
z*wJt;zt(8yna7vZZ$!E&OkX%9CVrt@l#t&_*E^NZ=Xkxfy61ZQ&m)aruU^ZVeB}4P
zv@?Ca)Be2=i*`MEHE-Gf%j*1wa=WkJi}5-eRkoxv`S+!klf@Y?H}Pi{?4NzsmbuN&
z#aBw+ceA{#-`D<C-qHJiT)I+yOMiC2f9p&CH*}`IS#^5b#cyxzO&)o)1YBM9f9XyC
z5N@4EVRa#$>vp~4`1Q2YIw&dN{B584v+d9Q*T3}Y>R&ZqhEM59tEad3y%$vE6l_;#
z<9O2NFv&IMod)ablXaH4(@%Y=+z?g}zEN1?SfSpPsueemYptjiD0K8>-Esb2g7gB9
zi}#ZDU%b?|%g9<R*Ih{}OfWfWMf<e*9mzh%%kDU9+r=ce-15C1@0Hy4eaY`*zKhkL
zoil%wbNbrbXU1ZAI%nGMSl{e07pqiadwtdR#+2JTyPXyPy2h-}lYY7M=kLou0_V>P
ze0Axw!K<3XYP(iO-@Txe_j1#zOvOH<H%gE8*j;B@b5t^Zwt>IyjU4?s*;bB6T>ng3
zXu)B@;c(<>UepiWdq=B<RGib^={WheDliC!bG#D0t>zMQ^popbg@8T>!!C=DGj1-N
zW-M1d&#iCnG_^-!drgj3-1w5-E*ms^*LO|p+3l_|Pfz`>=-X4VU9B(H+k4tO?^3>h
z{IZ2V%uU|<o4(j?p0mt*o!zB!|2cdIO&$fmUHW+bSG(Ol^RxYDzd3kW&NTYS&D@`F
z?{0p3B%G1OLBWEtW5<Iv1?n6}raCOV=3MwR^)|2I^eGcv?30`q1n}$H`^PA{$aJsu
z4}0${zh+I!v3JkEmQViHw$o+mb%pHy+1d*`u0P-AQQWz0VsiF!)u2Raqt5*|eB30s
zm|VRWl-7B$v!yjU@NtS{*-kmEC$dGWV@J~?_s2^;76kCGO%f^m%J<t$H+bnE8Iz5j
zw)IDXr)i#6sjo}ueSfCTif`){{g2No-*z^4Xdlhj`!&a(NucoOlHP3&3k+Dew#sSw
z<QL!9a*262bCH|C2i1TT?O}GVF%xVHb*A`P<SiErs`<E)F?!oSjjj5+pk+F_e>Q)c
z^RxEfH|yWClzF#2nEW?)&J&mPvo%S1Tc?=&Re#+iI{Qag0|Uno;f9XGJdBM`gf*^I
zX=tV>KAL1S>4kd^hsdjip*>3H?8Fj|wog6PY4LlGg<OX9&kh;ig!*IluNQMa`TThH
z8=Z2QMjwTJ@!e+s_U;w$t5?5gwZ8aK%(f$GlMV$4Xc#bd>|o+&n8n^`BLFg=KlZe@
zus~tu!5%k<1pY=R+bu3f5_Lc`0Sj($Ivf%G&2i99pu=xL!0UxCrir`r3KSkS<lC^Q
z1>_o5!EmR^Jh7~eGe8TBeli@Y5#VqDsbpAo^pRM{jtCy%9v>l)nF^04nRH(e7w|A+
z?AVdguV2N^<O5Q;<U)Ig_bHD>@*5Qcz)j?wWgI-LjqvTjUJamh;pxbYoGLnY-0%^U
z;9^qsXHZ)AWH|%F6oy8pwe6Fd#FX?jl{@Q>v^gx4i&A>Dt?r}h*4Gu{dw*;<zchpQ
zUAghyds}Tw-y5I$Q+MXWypWxr?EKd4E~|Z9eZS}UyyX*<mtTD2-X8pS@vAgbvz+I;
z+wXgqf8VM3+G<nQjuTw0Pd9v8ytU-}ht4k#YRdz9U$L{iYVAq>9NfRSEPS5)3ZF}P
zT&K)$*YWQxH<B%^m|Ij6UtO&EG2M9C9pQ>So3<+UJym%0$RcZlB+DT|P{_}jdbHZy
zCFWxvua36{D3Edl%OOQr-ia<Fmzb~{o3uhEDlsedtx)>;i`T+s&j0I-N_wl0vvM^$
ztl<#JvK7(xush~-L^W0AV*d>RjVsRD9UaA;#@|-O7CYA;{=V$BzNwtmzMOv9YEd?x
z+KyQ!cC(_k{ZQR&^8f4<b-(L{?`|$%vWNeX=oZ$cHw-`W`_KCI?nRyM-c1>Wr>sxJ
zE!Pa5zs$Ni#=Vx^*5kN&rh~BB`GreQ+bwxHxqZnVxtjZ-*S{L>zy7w$S1hyq_jK!w
zZ^HT)7foL_W7p30Io$DothY_qkLxhczIpLa^`jg6wj?~feN1@Uqr;{qx!3kg*?Z@|
zf7p8SMUmgnuJf1p^nPo4h{caev#siVx9bZo<s@HUW2rCMwrkCn%g;Vddp3{ze0JW9
zofU7RcTRd)|J!V>&;I5vxxB}(Wv$QJB>Xcfzp;Du&P=vH)z7y5uM5q1a${xMnHiDO
z=Y5|h9Ua@g?)G~}fhp_v`9GWd!FJ_K{T1%>FFu`7wRN>^;Vk`r{qKzPEX*xCcHH>m
z*}PC;f+;95Z8F<&g3IZM=*@-$b3hr<SzB6gd&)~L<)DZiCKkKqaUOosx<|H5^ZUo9
z@0YrxX1M;J{r|^uLB5)e6F`l#x#j0puC%Hse5d+oM~@gc`?VFb#G+0v>d?>3K6~r@
z*`$io*WO;)`rSjye35ve<hJRz6K7Z6F7#J}tpzaL`|q=F;~|&VGBL(amp&<~DYV`5
zBF*vO-=eGf-9`3-&qL*J>mBO9Ep3sbRQjf1W2U-C#7f;qDR*32yynliyT#si`lHnT
z-tOz?Vt22+|KOuZiGz66!}aTf_a3gBCA6vTk!wZ4{T*{ZE`De5ccS5=faq=M4`08Z
zQF!sQ@0)2GQ<n+VtvLR3gLBsGpUaM)NuA8$Qpx%7X!z~VFVm$jy?JIAb8T1aOzvNH
z>o+8Cb9R|?=I^)g_yiW?|8h#_epm7fPXDvN^@SoRg-#LqBv9xn>G?(3jD_p#g-^GI
z)#ph?3K;vuy}4(e7j?@ixaGoz#_baPGp^TG`OoRFxa~bH^FU5N|Mt6&dW3t^eZ(Fl
zt-p6!wL48rs_J~;1l#1YiiI6Mf@YsAtCyxU>dh00cG+oDI5&Tfl5fPVuP==bsjql`
zIxqfvY&Fwc+bGA%jSnPh-XFg5MBM%If_>(d@1~3F-WqZBuZaG#JIjxXc3b3be|vP2
zZM}|)>yOU#^GE&88{X{`i=DOpevQ?`9rKrWo?q}-srJFAs`E(`e$IGlJ?Y7L#YdBD
zcD<11m;%nkntzasXVGiI>Q26gI9LtZclb))pWoJ_@i}1Uk$H1(xU?@Xl$8DS!nEL<
z(oFG(pW}bB$5y^v@1vp0&AG2+#e5EJKb3<@eonGWd9Bx6$h$2j9p{;Fqi$V1ce{bo
zIX^9Bo01zV|LCoV;JCD}B4Dq}dU<_&#b;8^x(D8D`c^Zcz(cHX*2?)ElLg{!^^IHA
z(jI9motyVr=GTeOe%%`uJQfO3IsS}sn!2;%qfabSXB!+CIYqK;Lk{RY-;&v}W6mR&
zRD}sML8WG*>m2Vnp6}o6Eq=4HI_ce?Ew%;g-b~rluQ_?fqp~-yM*q(!)_(0@<!;Kq
zD$VIq&iR@DZi+?DbZ-m#%6jvEs`|1I!hw&bCi%xqxZt5%Wzl2mF;hXQ&$g>>o`rqB
z8219-+1oFk>c2K)!zmw0gS{?uIC2GJwyEo_=$LHV=6LL?|C;Q5N{>?hlv+Mpec0jc
z{Jn1))b(ZyDm8VS?vwUvbeIE5QzGhab=y!%J!!$|-L{2$>OH@upS)*sllw@s+V3}S
zd}iwTv+H+kjo5Nj<$dC(Tf)n~H&t>9OyBO=&gzxKb?>%nHA`&uoQnU`_Lloi+Fo&W
z`?JgSOI7VVcd4J#-q<BJe^a>c`|v3j0;}K8sj#o~+u&{zoOW43$#2rK4>l8Q3mskN
z+!H?j`&HAM@-353v)mLuUd>op_erAIV#yo%C*mB5-;bR9=liiK$$!p&Woacnv-(R3
z8XPIUpn7i&uimC6fkIa^1q1b;pqz2a?a0S_3m4qfe7eu)re<Y}>fy!lNy365lN4IT
zUw_-DwzGY5R6&|gtoi@wbmlAf?AQPG&=X8gpZw)i-Obg}>`~k6oL|i`<CJ{z;d1eL
z%hs|tH(s9c)BIYw_3@(zzoxN#J<_S(Q`wpO=w`{sBGs^upY$?!?{S%P?%T=aLx~$V
z&#SmN^Xg^oIg?y>oby|0+@G~^*;~%T;qBE$8mio3533LS3-wucN#Ukn>()8K%YXLC
z-BW&gH6>8$ThIO__L_>1CfOZ&A<vQEA9KZ$K~Q|E!l`Bl52jYJLdCPXt8TvyII!XO
zmXdV2x-aKTGGB{H9%gE0(zxm(y7QUDl<n7R|0F#)x%2bOW3TIEx31dqNz``V(PM_o
zKWtjTqol{3p~1vts2s53Yrq+U%2_DQfR9IxvQ06%^19$-j%$po&KK>fiTZJ}A1!V@
zUU7c?>FW~VEXP-D+uYu#Z_S-{rE2f7>vo_0wkyZ&@yQieKJH@{{^hwqp{TFK*Jg)@
z;LPx2-t%Kj9Xsw6xacYf%m8JELb1ZehmYI@7#dv!8alj>7%UQ2c;F3+za>1}E-^uF
zLVr7WK{dC+qa<V31PzWUHjEuRCe3GJ2w(y=5xuygI(97SQTZh(&D2_z;dta|TGxR9
z#9<^0SU~A{ipUn^GE!?w$L7fsL!LNPF0GxeGh>pSy|nbFjn8$ZcfT>(`ApVqs@?MY
z$61*IIUSC;s&;%*2>1iCN~_N3piWB%_c6}$Mu&48BCi&*3YJgz6rG=Yh`VFQyd&;@
z6|p<T`YnGSTDT}esG+0#>4t+D0vxU&&jh%MeLO2r=xSqOpgWU=>+1y}!Q)9S{70T;
z>)+~5^V!+{<ihi%r|o>K|9hQFy<NXw%cST{$Y)7zMY|KT-d{~$_EM+TJhpzN<({7>
z)1O^7HUDCsdvfo(-+zuUPTy+G@XDv>)66;k->+V`_PJdHYJJUldGy<w$lp6R&OL9t
z`S*LPKewNmthn!6^1o$f=JNw<|JO{;?B<tz-n_un-txq*+Sy^64^KEPKPstS#pU^L
zkJr@Nmq+*SbJYL$miN({wwhxS##f%-DZKwBGVM^<{Fle~Pi9^=lQG^%&$05i;nlA<
zAJ56%ee%z-#o;#v8aht5Nu6qN*aHf75q+<F*N}@5?`{h*naA#T6OXjWW}SHCaOC8q
zrP?bZPxn15$g^BB^QZv#eo^^#hE7i14(o#7J!ak!X>ya@Y;yys4wo(S$W`9-#BM#K
zh~d%H9|!Vf@2NlX+<yA&h7G&&@0g_9M;v+i@AuZ#?=R2W!O3r*cSN+E@7KrV!p|pP
zeQjgDe?9+0g_PBT?&mj4&UCr1Osk4~EvDI*qkaA9(T3x}pP$ux&s?Cmv`AsAc<}mP
zi?&J6);Hie%gxxaV^4ycvO+)rQ)?8z=$6ZtIWkIm?FNCO94w&b^t88~!huH?IUbRy
zUa}?PqEoN^X4ac`535U^7rAeqd(B39-i_0XUteiDelK2io}H$6%Y41VN|*Dd*UOoG
z5PN0)!cj<nhMm8kP2r{g3mygTm~-We_?G13e#*;!e0qFb?i{Fj=vbStw5{}?_J5D5
z#apwx*G1>;pZ@M$aj@5(O$vMVW>+iM&7NO1M{r`zO(rjvWqHakW>$ThQPDdwc7C`;
z>V>lA9@}@CvY^cx#@AkCz7zsAi#3mE`y5ea75K%-B<lE}@m+eD%hr9RS%T`1x?5+y
z-jKw3H(79z>!j%&)eqONzkKNWea}t{{q<A%z6&Q?x=gn9UpK*Rr+Yi!r}rg0mgR-B
zxc|Rr9&lXGIjH(Uj^Og&@6{iDseQC?*S5E<TRtr|)$E@oZ~x7VPsVJbRoRXia@C*o
z_br^VO(OaHOi*R&xiOw^+P32!hisni^J#DT&6z7}mUUfx>)k!|iWd?Bg*_&9XlGq8
z;A7$PSs?IwAyc_!`q}V~j?*(sS8%X6D5QgyQi)~la=gtwncII(i#Xfc8E;>nYE{>Z
z7F2&4vBk^Ys8TpmB=^=HUEd#-Ut>R?ReAI!&N_Df6~*6|jwFj;QGWKcQ~3R|@Y=xJ
zyjJoB@tFc1T^-uJ7rXrRD=+^3`~Cj$;|UR5Om-{^j|^-b7PxR!bXrue7wPWHtgq7B
z`d+s~-A`z7%T3|qzi)r}XT3f}I_ik$CILAkzJqsmW}9z)9Wkvm=-Z={>3igT_mouS
z%eM!+i5A{zt2nvCZe!#3C7E?X^G@9}xOw}T`Q4@ti)#Nl1{w8VS=IJj^7iRGZZKKh
z?>zh2D`JIZwSl*FC$3lAB{_k?CB`9Ohpq@?$Buv<1;rDE8#=T<F*a2yOweYEQ|fy?
zJ;3qYx)qxh*uEaQnK;vmdr?cku1&#goJ&Rch1&Ty<?h*^b4^%(*(R4)1;;?cKF8<R
zq`iCSoxc1D&usDSR)<rTUs39NTU^QhR+;t9d{b%Jw2btLTWk~R+(lVL3-6fACTT64
z@~8Uj?t6*$rk$BT9##J+dOLmd^BLm7f``}rD!Q1r?wQrAPJ70V9Y6XL4Zkrct&?zP
zHA@984T~<)U<X$=#gDpVg{$Yyn`1pcZ_m5yqDiZ~x7hx9G5_VIYq~Yx8N)gZSBS7A
zIWZ{dsWFQh8!J9aGD|SHKApiuMva4G3P)p-XyL8PY5c8kudeD?n6zFv{-zw$yY$K0
zEgKiPCabx%KAU^eu7;1vhe3(&^8zIfha;gZp;sa`m;?%2mHNvZ70$DCg&uh-$vlgh
zQJ{m7v15k>PY{o9OGo#^1qV|=!w@rM6ati36drk{EV$?^;K9Jyv4e#tNwADjX&nbU
zt6HirgOXl$i3SJLN6>EO^&gwIDg>MYsjvK^z3#n0q3Cfj)93RUTw?ysT<j{K!NJ(E
zqbb3Cu>u2AB&d*NlIluPVGx`yD45qF2(q46%!jp6gC*j~L(oAZjXMM~D*nCMF@wuG
z84?Jr#>|2Z9o#M8DI3V5f(8bTBQqIXV&=H>Z@UdjpX#771@J0G2L=`)e+DJJkIpec
zvMn9jD<8V0BhL3)U;;Wj?AZ?Wt=9z#J!K=maD(P(p_&z38I<@YD|)hk1{7w!;0Eor
zg4)+G85I6*lQ<lXsAig66bDUmLr;of?65e~ctp6N!~2iG;%3ln3U~#M0t3^BBZ+(}
zoeVB9liY-B85t3J8r1oU6+W^kJQA`$@<IX08U~xf8O)wa41&{_Ty%+tIt%Qr2c3sy
zJU|VABm9O|9MDCHV9Wevo;hH3SH&X(c4$!a@PSscfY)O*FmN0YZnvDE!X!}WDdqcx
z6)6B1=8bw~G**Uu0^@1)n`?D(OXlSl<@<ljmcG8$>vE)5)_U5zJ)ixu?(ehpon^wg
z#X(^ji^8KM%NzqKq#SUd|G@S5dSBD5D+}cRzHkSfBzEcA+US*4Utev>zOMJ{-R}2i
z&i{V9U0$KdL5I`f$kC!32_{Gx*g=h{_Sct%7Z<x<-hRJs_O7qT?f)5meQ~k5jbDD=
zN((#o#zm?Og5g`7nGYl9?nMF#`F9-IWnJ#qeD+;6tM=Q?cC-9@Jo+&r8#JN|m;?%0
z<(VX4#R@3b2L!TgG3Sfh(Y2_j>NBW7zJKJR9I`v^ab)Dn>?x=`+}EZxgTW<+mD%kp
zNF&UUR@DRdb_6bV3p_v1cBA;Z4bF|%id`6#^xPE!d6CU26iUdCS^RkR#oyjyiTW{x
ze1dDZJQp~*+*z9($Ct^L$KH5J>4U>}WAD|6U+6+LigX;9ZoO~ee%*gPVmguT>9X}2
zbMJl;uv&1+j&&2;*(QfPj*uB0pzZQ`Ixe~j6AYLhr|GG5>}cWD+Qkem+5$>gwp3W}
zOO&<xwPDtmhs$SYoc#1vpxD(s%0L5jG}4A61@;^+Dh!ZaOOSG{fuAvYk8JdRfmNH{
z_Xn$%z4<fkUhGfFBQj}$Yd|qo^&_E12E-`5l3)yWsl!I$6Ds#|ek(rfs&z@6FAplo
z`2|iGGj{B-VRkQu1Xrh;K-eC~GpbcqQ+%E)Jn~9gaq(i3B7<PKjxwV)Bn}_>D6Fbz
zp3(jJ#o@9Ji!+QEMkr0l{>WxHeZ|W_qq&V8+A|lrxp%iRxWsJiVR-|oMLL8VyTtMZ
ziVJ7)ZV*-CD`sH%1lpf_{=h{Uh*Q^aK8fDrc;;x;t|>hq1qxM-ue`|EDA3TMJ#Aux
zEX1WpeHL7J#GABkMml>=bH`yBhDICVhK|$51t#p^Ko!ttI;wV0;aQStt=F-LjvbRE
zvr4u^a5x+hJ?(LT4_rGnCMgBH0h^J=ozsZLjDRaFk3eR0Ras5#Da7uMRuzHrk8FnF
zCvG-xQ+#B=?x1j<Md8sW9+|U{^X5cE8<%u$_eomkdf9nJ*3{arES?0PKcJYY=egKC
z19XVz&4z<95C_c>@Yq#xTIJcfyebY^2E*x-ceZ$SC_FOYMm5!8qVNpEWH#T~X1zcD
zn`T|{h<yF^x6*q*%g<iEds2mpkG`2HZIr^X_uH-P%;LhQr>2&CzgvE}ncr@Pnf+rk
z{>f?jE-{zfME+`4F$om9+PfGCfC`0AehuR1Yrej}fB(;X@pG}e%a#^CJ{I`**VoT^
z{(4e6r60rm&b<9K?OyChN!`eO^UhiQ3(#^iJpQ@hKm(&m?XQxo>+52(^qh`7eIz~c
zw$&%SN5}f*ypr8{d|taYyZL6kSu(Ta$Hbl09&hjM?QY|J;kkL4pO*U?-*&$%<sURa
z`|0yC(^ll3ydS^ZY_0$DyuIv!)8+mKmFM5zHnplOFTYOCG*K#a?Y8H-b7ECLTW@*y
z@XD`E+xD&9X>t2)Rp^GQt6oy(MFzYwcIU2(X`auWd+CkN=f~3BVoSd2u5U=bB;gX{
z!^V{cY8g1lapv6Jbya6p<Pp$`lt;Q>D)(6&`qRgHH#9uf)qA?$(&F>B=1)~`Zp{`i
zeSdGRn%|rZovYmMZDz%-)9f&>|7BfP=ly=2j-TQ2&+lHZ-)~mz8>et%p2hl~5hpCe
zyK=={ju>X>COxw=oP4h?JFt~gTTIpY*Il>2?f2#fFO(=dw4tNA08~O`eKl!(^0hkr
zZ=%bbXF7|vg}t?sbPbabJiaaap<2_sACvufyO*cDYlmbaP`ge+gpqH4%#H#@(7AJ8
zjtKi-m|>Wl@%-G}r$_FB4lzpixBID5`{|^*+mDSLZ?3PG=aseElAxF`kiW2PcA3b+
zbiX*c0{@h}Ou^&(pNn(9y}jN1|MTCec46!0#6S12xOH;(ZVi_?x>7%$=gqx!sq6dn
zpwFA5mQ^nQlKv*}_08{cQl@Y8c0Ku&?a|V#TN0z>w<pPF$)lN%>o<SzYwfGx-um6E
zdi$66d*6r_ZcF{FX0`3*$&$L)+An_Y><^x*_4~5r^R0fHzaMDMx-@(J-Z?_$dcWgl
zeNW%D*PGp#`@OXN)r!Xv45#03ai5&K_2{Ovj@u7@KR4^QmTX~2?PJlH9S@d@{Pwqt
z%CzYy`dE8rR_ntZg3T|w%JzG9Cg(l=DA^xy{T}n@cQ1QRUl&%=(^F=(28Y;wmM!VI
z7Z<r6Q$GHCa>YWO-%r0Z{c07D^U&S$IXkl|TOj}93-QS_gaym*RX$%DCnr<!pmC*b
zVf3~f&)f3TTQ=O!SRB6NlB?pQE$VsS*UfcvDS25pU64!1%`7h>e^1Gun+xn>S7xtg
zbTjFQNUnK(ZJn8%8lUf#?;p~)Ra~@}k9odfO2O+JJKKMn$KScf{(i4>`S+byTdMMI
z@0vPqmhs&`^YjJvudb5Mc(eD%r|g~;m%hFFl%wCRwr+9Y<HsGXb8lEzRn#jxE&rRi
zN3ihrz8jN1$z=WhQKuWQbKl40cb=Uwx_3bFP<ST{bi%-%$yGeyeS~v@ZXc@M@P0Gv
zZ0{SQc~#=+rv(a+TIy}!0_F6sMuu)PzoJ9)%)aG(IDdAQskiZ3&&g^g)!%Y9hx>gq
z_{gjD=hM~xx7Pa>{_M72%<6sZ;z!=s7dC$7U#Ru_pGEvx^>uTT1d6SC!k;}lYkuG7
z`@QP*o;jP6j&^OC&h>_I-_vQ)JsY}T-#fd*ZwaKc`)%9JLoU@XZz!ucdA(1+oEr1}
zhSE8{`yz#^aeIaR^3qslinT-?dATh%&V}>t6QigUPwVjV9i3{n7Kb>@woN}QsK4XK
zrzr(hrT@A!A1)2*V|_A(bFuCBBZd22&ZG$ymhD)a=iF^kT|aMMzw4@Oei<jNzw+z?
zg@v^-hw?y0>xv2{J;CL!bEmWZjPY^a6Z*zRW66xmN`5wnf4|>vzczaNxekkpM_OfX
z4qu5A5IoMdDcsY>S!vdf?`0kG|NRcJOy9fv#?ErT?zPY7%?tK>vcqLgivpL9*rda5
z_CAk~^Z2%=-HNl7-KFucXWNS}!gWdwRbM2XCd)}=9uhcu@7QrOU*ST%-xe=V_PEP=
zeLX*QkLw|uSS3H96&d==#jXg2Wp!Ivdv@J9{&236)Ym1=9nBrwp9Kzq7ppNaRdd`>
zI_Fohs9$!5L86n^PfxpVM;!M&Eng<)bS6{q_}-~1YM{eg75CO8`p48gZw$|76)nt}
z@<x4k^$%vXnxN{cH*Y4^&EHt?pf)u-e7)E0#j3e4j;P&AUV89Ff%CSbqO(qZzn$1=
zHF>dE&$jP=x6d<Wyq@O!<(~C_gNF-iJZ;}}7I!M|b8ip6C3n<kp2cmm>C-3dt&KXe
zaruSUb`_5{{T4bbJYCj&3b;QTq>#{Uv3J6=o<^%FK`i&W_4j352nn%BwfuPPfxxOD
z$;8P&XZ8OSDc;uc*zVSIQ6=lteQx*m&u!|qFgu^`FH`^R%bb^K_owf*_pY9Or1`m(
z-GO}nId9fJt}QQ{B%=H12Dio`ulK)CZBh4YufMghV~);|r+<sS27hCHRp2GQ{jtBz
z*$T^C|2cE#22~4ahu$;K)n72Zqxf#SZQ;ZHCj<%~&Roa>X=422blCHB{?7RBpCZLa
zuNFr<RtgYN@;mfL#j@kvw~YCU&rjVtA!ogt@9s^Xo0_Tq^EyOMmaKH0qarb}__@lQ
zNw+_q_Is)NNTsp+-R5sEUe>*`Z5Fz8^6mV4kM3;onUuSJ@65QF|C>Db9ZEWXuX=@X
z)~f)Yv^#6BJ5+sJc{6RkTzpa9+?E+d```YaA-*ZNt$u@w(~-Xq`CVkKr@j9(MbqVa
zeEhfH*6&qzeZTjm`qnnpsvn7qYQL*qD&6zP=%Qr$lIg3Xr@#35yi>|d=l9=FPhFnR
zkN=kcw|c+g4cL()-5%R@OV`GFmmHUM+5da)?YMQ(-#xD%pYIvDQStKgS>3T=uQuGA
zS7UT>^LZiJiUsE3xBbK?FW9qFr#Q{*SCaKIKh;+k7WOQ!&D>}7Ew1)V%KVzV@3D3H
z*Z0Ft<cQpt7Ta!cVcV~>^0x1D<l>4WuP^_de|yHtJ$K8i^9rqhRVx?&+H(0@&G9GY
z%xrId#<gudY<k|<@bL4E>I@&l?l^N8hTnNSS^xDXy+@y}8g7{^&t##lXY>3>$Bu)|
zmye3*Px$)2V&U0tA3=pp9JZfMY;t|JCbGI^flk?1rpsc5Q7svXPt94X9<F{}{Zw?3
z@p&EXm9BHPT@Tg&o_Tjy$?sgoSKoJTJFXg#lXfWi>MC2Ax7AI4j%UyJZ||_!`{BEA
z?5_Vc@yqP0Zf2jC55M1hML#+Fyx{(wPmf<vZFzJ5=C<mOvrJaz$lR~rpYyM?x2$sI
ze&5-f#Cg7*kJ*3M|8J7hu_LM0+eNSREhy&xuG+?MBzjMqP?BEU<Eh!%ijRtXx3IjP
z?4ow3G<@4BA(uzDpC0M5hzvQ!bV=3hi>;2&joIb*DqXIwp3bTKt!#sM$KL!s45IzB
zbq;LL*H4b#`bsX}SwQgn-|sHc=0blz_NU6(Z_7Q{wCVmyC(FqeLeg>7XWw$3`LZ*v
zVsl55{!%dqGyC;*shckQtFS(l4L#y%A5;59T6?-cQRf7ouhpFvD_=Kpcg(Bq)0pfb
z8Xara5s<2QOi8<=L-_qd-Kw3k;gz=MzTchx&&eg{^7ov3dyY42>2o|O-?cvR?E&UJ
zDjW`bj#kf5eD=t;?q(Ff;B;NfDa?&JoGM{QQa>Fr)ZeS{Y<^_Ohot#?i*#<>t5|%-
z`@*i$oBv&p+<cm&zu<briKN|by_>A1%++@N{kBQKJ1_a&9i#q^mf&BNkKP!VR2$BF
zS)3OycDLWotK@EJ^{s33UtaJG-@mA9htHnHT^9T8+C&StMVvYkz@2(qSp9SHIhjR<
z$)|KCUWl{bUG%i`^24KX+g=(ZKGaZL^XTBSfRvqI%KeV@#y9Nw;52jk`zgV5lfM3h
ztpzyMGS~aa$G?tmPJFU#Px`39@KLP4kSj~|NK8P`$sPPkdXkSj;*NAUB)QDltA38J
zzNC|PdPnse!$pCMJFH5TetkH+slfG{N{`%L_ai&SRy*2gTD44m{lqr-4dd=_QHv%j
zpT2cf>D;};&$sIPe{;WCy}HCh`n>1gow^=5cHho_yJzwC;hB`>uYW6iO?{-Bbl@Iy
z+WY*;zqZUcd*%2FB|VS7dj4}bas>1KTvB+^%~0*Lls}50@~vR$#)*GhzQx(*_Vdl3
zGM&%J-_-f--nEgNeKvf35x6aVqU~Ly>tD-{-iv;I%Q&Nd^66@c#p!xJy8aB0bZ>5E
zO47UeSbJ}ChjwqFn|?RgZ#q_~7vp!A{bZ5ZmVR3G(X+FU{~S$KKEmxUR$SxxS7(Vu
zW~<JDpZ|ZQzn!_^cx3mEvbP;p<!X-#cBy)I<m5*L@K1kR`1yAA(QhZ7m+e^ccXPN$
z+ysNG3ZJ7}+|=$F&70}6$@@s^tK<+t@g;TBJ2vml)O$Z`dhV|{zP(dfw4!dXa4w(n
zYpc6o%SLaOQ``*m^50nUz3O~iD9j(THY!!|@Vbp(HYu;z!E5xq!2hCPLxuJ(#gzfg
z3eU7e@(*gacWD0*IM5KJJi$Wn_`~Q;99QqnT2TF{!cgb;;|ik_=Qqu`vnA7CNT_;K
z$|r#gLGv1mi&Mkjoea^mXXjnM<#w6s><iytoUasoJJmK&?DhX2?veIy?Vd&*NqsbT
zZO7-+0+K4yDa&jNqa1D~FMM6zX%W6GWZhxy-%GP)Tscnecr{%t(M9I^Tdz-#)#vSr
zGuZL%Nax(ThqEv3-%vK?)?q8hzpWMC7ZTDgigxVOS#Wd#hf53(8>gCDw}X=F933|E
zoEySSFQ;ZsY!6Ofo)UOaoLeF?<a3;9<dq^Lp_|6(^5zPjo_m&c#4YKQ^}ez;I?_Y6
z`gn!EOtnQ*m&N*?#&3`AozqxhtGsRR8ISL0|J|HgXt3h=g|tPs>2-THix$Q$y7q4J
z^@vojcZ!}?MuuBl=jetUDSrJ)Wmisd<y#kp%GBvwUT@45cpr7)h{n3d%tz`r9z1>f
zMB>IG{}Ag*Up4eP?>X$5dc=VJoyw7zgpiFp1eNr}4znmvc2StFl=u13_LmEmsXvkL
z&pxQzo&K?F0|$4JpG@}-x86(6$5PG<aqHcCoZLRCa_Ri@TPmg{^Onr6tA4b@?rrQ9
zci|iJKc0`9U(Lxj{rU;fsOc~N*iP52Q##jeu6|8GF#Ub!XRA++HtmVOUd-CKu5S0Y
z?8-{5w;Mv&&tLl1XI{u}yX*UmDl-pqR>nM6tD5V3qV)FjBdJ`;JXL;kwp|vhy0h;7
zKHv3wYaMM?{;cO(=qk@7kUvANjz!_wBtv_JN0SWuUbyyfJn=fRcJ6ncT9ag{t2RGH
ztd>Q8mk?}#9<V6t$na>b(h9ZhejPh2`gL7m_8jNGc6@?^p39w2CxtVAv&&zo+nN=)
z^l<-z`LQ?E+Ej{CHeIh%UuSoD-cFgB0U}p_y!G6?H>1_yhW6>jZ>OHxo_qA1?tPmJ
zaXOq1dyW>}=sK3sv2&4RS3sM?k=8@4vmF(z8a0&O&3=13^X83<Ul#BEwuntU&Lvx1
zcZvKpyB%9xW4<Ul@`8psKnEq}Z<N`#$W$t6T}Jz{ObM5mJ;%7OsWmukQ_8#D^>j<l
z8;Mm{Hz;1suG^)!G40`MfkMt1!Y`OO8W=cQ6d68>wTZps^=3%Y)7g0F4!>Zy#Jh~G
zEu1OJkFM#@@PGgRpJP_R#iJpw#rJz$Z0^v0*x9nKnSo=9OruK2_N%MIg0}nRW*+_R
zefik>==Y8#2ais_3_o!vs@LK1>93wMCHrk$lHGZ9!q@$b)LogqzVXY8C4~!L@ue^b
zZshGhU(vGbo1Dk|_5V%2zW17{#lCit#{1?O+NTmdC0PZ-?>uIHE##rFrsMYuf8&h9
zhr(Ant`6Js-~aUz!Nv~n2P}@$LE}yblqRrzREteIXbj!_UGih%SLOb#Vueb7w&maU
z0-Xq>clZ6G$NpBPMbb01JXUo~u5&(<I*YOOr;O4%gF6?qIE6X19{u}1|No2+C;#XC
zZ<2~BP<$J^^N+M`#}1)+9xtpo8XO*Rs(9{U6)cRJQ2XhkS5?6mfm@p@g=KGlNqGhB
z{mwEiowV}RzK_8#zlc4T+PJOYp!bn`+X97`N%GJ0K5`SZYse~e<tBUIc?I(|PdB}P
zXSZF7U-A0gUz@fqo2y!1Klj!az1xQ_EOS<WWnCe1Q*PUmZi|kck7}+s91(rK<G?Bw
zVGgB7Wi`Jv_nMcP$Oh~-`}<~2eC4Hv4(|^xj@KO+SiGh=2px?!Q|fcQdc)JW_jukr
zVdKoZx2|4U>Ta)m$&LTLzwIxcg}L!Ln&M58E=M*R%r^ZV`s=6R@r$4xjemEUy0L7k
zkg<Oz_c?!E^qw~M>y~Hkgs(1>O>W-j<$A>NjOcY`PNj7_jxk@G!qDiWly~|5{YBs9
zcFgayc<i%k@<WCEjvYcX3|=U4G&r0T@YqywS?Sz8VRbpZd;V&_Z*Q5A^nR9D;kJfN
z-!HNX7AjrY@$Tc|_Dc&(`OWW{zuj0}`mg)Tx9IMmd*<IxZDc?H?by^=9nCYe``m?G
zVhVeh*Jvm*)hXqDz3hK+*7EsF*w*STdgU%NaUz#XjGN02MsVMCo`Yt`_8gnSZ3#&m
zRXe?JbRYYDx9MGHacgjw#Ao5c?d2~gnAMluSoSVhNO(5O-$x1OV*-M9?&Vj~+invm
z&5@$=K=skR|9>lfRb4%PWy96hkd;|id%k|))p_*89(tqCQTGQYwAX#IJ-&I$k!U@a
zIadzO+vZriE9iP0^O^OVJ|4+ze4+kGrR8#d+0<3)=hB!Ar{7NxIOsCx_1Vtw>wAqP
zU+TvO<h<d!u2-_Y=;jsKD{T9J7hTM|u*N6xn1Xoi=GPy%yjOH=>ssj-edO)ET}M?6
zZl=$#UGa}~ZP&?=8%*Hg%L%HC`#m~?MDhb&=fufBP4bTs5M2KK$j5y)+Y%qQedSnw
z=EK~E8K*xj6x<Zy=rU)HU{S!?sFglWGw&7bxO6d1xkBO5w+U?>+&^<qUVIyMr1s<a
z)Gax8B&>Fv+!*rx%H^UfOsgPsbQ&g%(>t_NKZ=DOImw-M&gJCvmf%zGf0ie14o>4*
z{^W0wmGx5XDU)uus-OFt+dO0SjpYINTxIGdEp7*Fk7Is$b9oxORf*wb^>wMM<7f9c
zf)2VcmaAP6Qx#FWK}B9|=M(w$OZUCl6BzbruCz&3(k2Dn>hi)@<&)CPJh~!3>ZH!z
z^lfQnz25b8r|quq)Y@JC-iXKIt6sjqGac_DyRL1qay9NUq5ET^;_kWVWR`qozBI4t
zm*Xn=jCEyCYQ${WHZ}fi{(jF;y2Vt#lA(CE)wkH?zt`Sa?{f6Ip3=L``}et=UhcDu
zO+41cRQlhZN78QGpdyulsZx1?tIUz;dmF@;D0W!XzY!=FHP-#C_vp)P3G23-tGgXt
z=PYTz`tE(!ReqDCr;kprH;inLkm%^xY%KpVy{7P0gteWOM^~hvVPMd;S7o{D7j8SB
zwlwySPM=C?9%vkf?`le*#>{CYVj9*d4)>RzcNW}Oaq{W)8N5d)>Aq^}*8iT9{(art
zl{ZQ`-5=-#w97>4r!HUeW9pT?+vd(W`Rd(6{wGf_oa_3z>g~#@u6Ne5uUqO{Uq4wY
zu0Sz#W$f-nvt9l|y`gqM>D~R`R~MMN&vE0Pwr25T>1zv<rDdE#wf<Y2pSECUml}hh
zc+UpiXMQ;Wllj(ejo)x+$7-2Im(Cr+etFZZ<^&!Ic$_?YS#HHwZpAAvv$xLJ#M*m3
zPH*}PuGyQ)PPJTIX|nxWM&;7$TU)38-Mr5I_BvOYcAkp6%UxqG2>9IXtGM%K${sWQ
zm+{BX+4h8AG@V!UBW=$9)1QyqyL`2;?w<8;neyKqUQ;c~S+_yX=C2S)5d3~=R(?kH
z<l+@aJ>3J(zN)yFys3df_{Nr8_Otb`m2Z7H%2&1Fc%&-p)8Fs+pO35;R(*8D`bddO
z%f{{Q>P8opmIOVm4OZ-MnsQh-*MAOQ=9J$tg%`K&IIWiY7BsrjX(6k$@~!pKn~^Iw
zxNl27u%XaoNB;GYA4_x}l|(3Ae{9w7dd_p=mA$|0bRYfsCQ`NaaN?q0b3|WupFJ&B
zC?mB>Uvl=Gv;UYvS^Q%f4=j!Z4IZ6PW)v&5{<hjN`(fv4<x-Q-Gb}m&cQ|>oE+wiy
z=}f*DTBPK+eA3jv2}&V1qLr&39VuRH|M~Ngo9!0ADjzj1nZc|4XwR$9pBd5~cN#a9
zov~eM+YmR;!g|Y<_YU6S5^HqdHB~(-Dbt8Pk{K9xhJWdXZHfLd;N!&{7+4;;GgLo%
zWS+KRM-pgJpyc&QB85t)Qud29PhioU)A0m!u2;WQyv(NKiA~?+%mgc{{O5=j?#b`|
zeonUjQRcqtM}LCOu!$DdX=ayNEaiK=<?ZJEX}{w3s{3_Ye6-2(Q#!GwWAppV^DKV<
zV9Nad=UVuMCm)r2BTBX|-rRlap4~T%T`n@!PO-JqYkNgMcbtr0d+yDY<0cDtEjK+q
zUoe9cR3bYpaN<0oX}xsK=6kV!rg?r?CcMJaBxo5+@;dQ%dI!p;dG2)I<eMfn#bD;i
zx7$>NI<K2LZf{T7q&|KA?!Jzse>EF}o%lqimqt!FGjrz_{hU~V{Fg_Bv>zGO{(O7E
z>7>d@ekt8gDl!4~JSAyI?3&*#zPj3OiCk@)#3r3P7O}eRs@wK{ORGx#$L+@9wag>D
z%xk^KW=GdK@u#BxZm`*VbmzQpnTbdLZrB`LbnJNXxoqD_1})CQGiRRT()#=SKfm-}
zgI;;#D=FKzyx;u&bmqK6PVLKfo!Y&r`d0PR9RJ09=cfdm^7*XthIyI5_s4T~I0d-R
z24%eo@l1tQ>VDS*;%j%#joPxuU}}6ry?x#hg{Y4QgLXaX`nK|Zb*Hgq!|kcA(WiE_
zbgj{DQZeAxdbC9LFVhX}owN6zJNe${=H*#u-PC;^md@v}I>(=|zI~6-CDBFt0{MBb
zIt2@N&IzgOv?;pD_ll`8Mqq>MoGU;7IX})keq27dIZQlg>8em>)1%!i|M<m04ZQ}3
zkAfDTx>mnP58w9DWKBe$+=NmQ`Kw#XzG}U?Rjtsuy=$+1jcG<nk6__9@6x#wqvKdM
zoBMBSKD{&VSE|l_hv~@^So(CGn&cRtmWoXFzVhO$Qu_O68B3D;tS7ed@436@z_mD!
z@V!Dmk4{W{WOCv6*RR~!|MuER+<Co1YrpcMhz&(oCdpi1!f*MUW0kp>uEE9R8qvev
zc^&5e(*<1SH1oT}eEj>c$@Lz~9v7ME5w_noqds4md}YB^*O0#*%0@|FsxQypXpje)
z9@OAsY85Nq)MvH$Xyr4W`|I|y&6wcz@y`dlx|7@1_FG?yj*i&UsqS~AFDLb)yKCo;
z&8~C!@_DLQrd$v7S*LUTn?v{a_ig8FgWUr<Up(?qoa^uKOXq7*tjyM<-#^|w9<`-t
zzY3SvyW-r>6Lp>9++99a?pxgc-F#y5t>6r9>zCn*hn*GgHQBl;JZdUD6sr)hh2@Wn
zjP%}@@AEHK^WX7zpHrxIHM6QI-(KNSSJ{k22~hC2s0C;kE{|{9T;9$qt={`M;FR{Q
zzisoXtNMP<&G{_Axa{=C7azEmUx{0!pmdHe|BIJRIy2v7g|l|eH?$W@Z+}?IGU?s*
zZEK@1fBX6R*BhO?pCZri1E1hDQKe|JZR<BVi;b=S*Z0SKvhuh5!om~D@JLt2pkqhD
zo&s|Y5!pr`r9R$!A73v2a_`2)5{I|9zHBb8dwP?-W5*)tI0+U|gJL4*ljt*<9}gVc
zJYn~PN2i0Izd7~My=wiwcPh{MADrLPart3$&H5JYs!ofyq1=$5bhItBy1h*-Hb-Cm
z%w>1k!iptVv=@1I??`>G=V$GAQVi16(l?S7JZk>>Nq$ho`|ij0MU{Gl9_j90Z{ZSC
zb|-*~rK=5eirKM$@9qCDOiez1p?+JvBd^|$3E$Sm?2M6Dc+`~^kqAEK`N<rIeMjcJ
z4SZ~{?{L4(tg?T<?`KNx&Rw|7`pxrP?nj-Mt8e6-pL$;~;FnBP?fmdX{G9GnYzvv&
z`;8xV)y*^3=Z}~0pYu9Hv)VeqY@UTZf6_X2l_Sw6i#CcEew#g0K{K_~%%xZV)x+~~
z@7`oMPl>b25L1m^HKFkA&NtStia=e1C12d@XE{oX?epS|+Pdg(eU$pzO9o1LKR<^j
zdVsnh3T7-)^X#q7wuNn3D60ExOT~rAwW+<!t{YYP&pD^H@~~!ZzI6A3lfrIOZ(FNz
zTEBTSrOsWaa_;hM`J+#!&i(Y#^77e2kD5D^r>pyQd&I7vukQE#X1@QN+K{Pv-tvN?
zW&Sa)=Nvn>-?`Acwc~PR>65kX_bgtuo)CTg+AC06%+Li?C)kU#9O|;jEB@Q`^UJ^Y
z@=MsnV|}*Pee9khndlmGIBjq7Mb<0eW4A@U5`HccWDyLPD|GF&ysdmh$?FK;`s-T?
zHaSZ?)lmDr#*$BeiohX7<$C}3`qjtZ{7ouUI&S?^?$z{PziQ0)D=#pTJ(@CKczJ)v
zruZ4z{nI$ie{*P6^Usz&rP0U5C!1S-_w2Isa~N8UYpdqId{*ir!X2~!&6G{~8<cDd
z1@d>+o;ezI<fcXwm*INe*>(J)e+@o-`d8U$aosZGQ|iU}FP)W}mmT`*WvHF=hJE@T
z-B;IE>aMM*Uuk*1uU=iL;+Fie*^55Un>BZuocW2vO{Y246VL2_?0GD=fM<_(YhK>+
zl9yMf^>!qC-44?+e>iokl-ZK1v;O7}6drly|66ceA%KfT;n5%Wor{jdXr5VH^EI$q
zXLIi5_j7jSyygkwd!);;^uu=*uOqBymkA#cem?cnG(rBC<<ei}tLn94*B$Xcop|}z
z*{(f`lMXAEz6!51UTprhtHb8Wg{pHQTWZSM#b)i2|FuTY^0D)><~r*m`!7FVS+R7H
z=yluWYjVHOo_+bvmDvezKTVRFemilY27B%;=^9n%srMP@JzQR#IqgK)5yQ3l+w#qw
z-nmx`n(p|Z6tz1;@$ZIj2lqV};7sLP-lD$F<he{0FH@m1gP?fM5rt<>g{O|ke!C|B
zcago<?@evlH95gWsjGW}w#SM+(w%N{`ubD}LG^dZmd;1`XJ4OZQTXt-obCSN$6G84
zJ1wdguNI3d`xT)-yE*T`*P_PXbEiJ~K66WUx5dWG+045AjP^2HZmjkAUURh0wQK)3
z4JAF*6KeiT-pbcF{Jy%%bt<1dlhV2$?XqG`3>+;A41(fJV(A)pKJuQ8|0Agzv8k^w
zKcc-aw}2yb<)6m|Qy;vWq{{Gd+Wdw7Egiy!itMs!PS34QJHKDLIeFevJHg5~wUahD
z{PVfjx5GuIKJU8jJ#*{YHM)KO`u7?$PG7X+*LTB@^3%@ux*k=?b}`V?@CrD{%j*_s
z($S@p#p0^i)X*?J)M1BX<Q|s}1qtP(11kcB%qKl)J0|LCH&OMJTA(9KD}$Txzw<AD
z&;96g_uagAFXzpStNUzIsbm~`JKD7N_1xglTshZlw?|(ld3&y&?tZFz*^1J)Av;U_
z;);CMYfg=ldpoW6lvMuT2H#GRO-Ekx?u(hWZvBQcSHpkUm^T`z^m*U>aAkhR**}#f
z6|;k1wlFpqiS>BI__^$0U~y1TVo@mkS;~^4XP=kp%UY?Ocm8<Xqc?usr&=%MSuVY8
z<UV(P-Mfy;mYj%*7M~@ew|mUFE4VrBw{Pd>YCn$UhKo~o#mrrP`Qi>MsVw!vWlK&K
z2Nr!5w<}i{n0BO4Gx~mAH{aQh4h1WDRrr*JSsWDfSQH8mR&g1L_rzZccE0Cz`S8YR
z7WUp6rg8hLAAJejI;Y>|+LMP9|H#c3ds6OyM}4+=ny%%P+6rDN(~~w2F4{c0mgzsI
z@`%p-7#02PwuQfrtmwbC?p&3?k`uG)9Mrp4U$-r^5?bCm;WuyfqeH(BwQCmEWwCaC
zcILj`#K19S5`&7K%&!SM3oO)B#ZN?MRvTRtdcI|QufNBY^Wuj5=L@VDT)BHWX6a;_
z^zV84y;AZ~gJSZF^Zq(2bvqV&OsGs*bNl6#vgO`ZvaY8WojCvhfpXbWJHO>_uM$-+
zdexn`nHw}M$Vhp+$DBB>mFCxHxii1Mthx1wrL1C(-16vK>z!@9+m9UGd_N#tfT8h`
zV8fBt(^5v_dgpgMWvlFV>pvRxXi=lUR))q)LJdczPP6x7XB6nLWAung{Ka7uF0n4~
ztmr4PweuQ}xURMjkQVsh%OLbzT2iQifn!MzgUY%MvV1eOb?yf}=d08%<-akVzq31S
z|AH`u2QwK|&KYaPa4;!M2x6MB<57uC$~ucV-Jx7h)<!J(tj#VoUEX{NQ{xGNh9j=a
zv;zbM7#cf-8jgr=PgW>C`c$?obHerwonmVC9x;hKl)?oxq!>M7E(R=C5MXGW0dncG
z#|p(uAI{5PIbnOv$*F3wZ&c5{ed2Du<k`t-zm6O=*mbC8p3L);{p=P$)t6ts-2Knt
zm*c%4x$6tx9}qN2y72YOr_bG+S1B&{xV9>GI@|ZRo@=AFtJeM4zUNH(uFA~$b~h(|
zzR3Tetm?teaKC5F7cJ{+g_Z^X`z5u!uE1()$$<+SCDqEFKF?bGb3w`SV4ttaa(kvE
z$0@5<%=%MW{Zp#$Z|%#^k$-cJvj{zZFTK0!E!(b7w~VBW7O0o*f5cSZ>Ad6D)a!f;
z%4BCnTiTkW%a-nc#Ql2zbNis#RbTRw{(ir7boI)k7pqrRuUGqW=RV&lt*uW#R%bk9
zmJj@#R=xInY2Bo4Z&&U%$}Fng_qO}5@za8xR=Vk%($9a-NVqxC<n=#i<H|nikN#4o
zZ-igpx7xdIMdW#b3yRzVCuTFK=;fCzRB1WW+O#oYro%n1wrvX}k4HT^^SpN^|H(3?
z<@0}Ed!@cSdiu-u+Slo8o_gPZxA4KUmOH0EzdG~DwQgJ9&mOHbd!J7~Sab{HZY`Hz
ze&BrFWOoLcxb^!~d4v4+H!6Ed{H)ZQZ}a(_$l2|2@maRP{&I^KeExpw7PtBCPo@73
ztth{L@XE_gdDq;}q?|jWDY-6AweRw)vR6fC3RAC}8~uCp>EV~<TdmbEvzPTK6vnyx
zJuMBKw8QRt?$XSs^1;1Q|CfH8+_&JOa9zgLKyMlEsT`AU1_`d17oNNLocQ|<DcOJ9
zvP|mVS*;5GeY+>Yvi981%#)Xkj?MS{w~KG}^|0mI>vO%Pm)^>|ditGIy=~KK#|0l)
z6bf6@-z;T0F-P3%=x(-8VK&!VH8igYr88aR-<oDvSas9&<QdN0cR$~@*x$LBUAWut
z=-lKrZ%V(oyLbHfo0`8b$##10e9QFj1=;#WN!uLftu8ylFMVNMY<#5CkxMI8^}>H!
z%rf7$#{R3kk(F{|OkZ8vY?nViCykWr{CeK1oO-+PaUENEMA1_f-%oWHismf2-q$<1
zq)7Ik!T;@7>R(8#p8qy(f`$EJ_2@@+W%piQ*?Zl?>fw&GFIUgSn!NsZ{N<lw|HZSb
z$`^XCm&!@~b79WMup{rLUY{1cn{RJM*wKPnmd^vD_9gN5N*69JJ^O<5d(i2}{mWYU
zrv!ga>ovNr7i%;9*1TIrtDBU3!{fHiEqn80pK8pXM=NeCw*P4tvkN=D`ABEL+*?)Q
zZfvEAJMErlN6PK=>y@sZtJ?6(c%qZ_r3n`GkNLx=w=t-!Gr2R@tM$y(ZT1TJOP|Vz
zz3iN1v3{b(@{DN?S0{g0er0~9BYOS5<~9GHiJ!iF>ifNgw@>G7Q!-bwonG=X(Y351
zNKa_{-|q9d>W>^%-nGT^Exm7V(XIFP(&4U489Ps1wfnsA-laTU+4ODp_oKGWJmN8D
zOYn`C`jf>H-fjP=oVPGg_}#flvBj24=kj>X`!u0M_jg%*oO-A6;<FQWJlJk0+bP`n
zIOh5Jz0F~3BlA0>=D)VOyyeqDlgeK<RXG=hywBO)-*)&{&(Yi~8}6CZU6S#>7x77D
z{;}S^4VQGI_nq2x&G%6o&yOB4eantCpPLz`tL{H&yR!Ye*el}&6Z1Qx_bp<ZTmLS6
zCdZrrq0Zu8G29WqH<v}7wn_C|8gBWyD)z%XXFi1q_KY4e|K4!T6fz3;*cJNu-qzpu
zzr1_ZpLvsg_RBe8yes~mbibs?9-^J68W5GiBBWSYRX_EO`6nk2wHq~S?{%l1THta1
z!Nc+kQ#zGjPBD9{6L@)|_POL=v5zx+ewIAaSkPxxyQOUF+k0x}oyOmfoQyMxYtsna
z^7i(&i#?g;5f}GOpJwAe+icD1P5$q$)rCiEl>DBf(zjmNQLxka^+a#Cmfd&b4EH|Q
zIWf<Axou&U|NPtYY?f;%UyuDhJG^u9iZxsA-npK>?P}k`+n>(IcxwrEe*TpE|DBX|
z=i>~??wLEU6#Y{^e<4=%^ClI)BR=)lKAfmB*^tu}%HbW;D|N13L{-RqbrQ2~bll`u
z@*jCkb3d7${aq`&YvuQ_U0+to?|j#J+1o((XGrzSANQ-~$F4iSRIlu`xv!=1_hX$V
zMUrLJ-_KP2daWA6%k)qPRFN6KxuZ~=D17!D|EI8seM?W-3JR%zc=N5Wl6AGRfsTuk
z{iJ!UDUYuF&N#M(bLNZb(=2pDmL0jtIgO)P^5w~^Y_V$Z+l5~~`s34iIrsY8<drp#
zF18za8-6WacEnTVTxRA%;douuveR6}TQ9G@|7P00t}~~MqRch5(p1jnw9i+a7S(b0
z+fm0yd+JVWukY?UvN5N2%Q~sGF|l`7UGJa$UR9QNa<lK9Z$~^&mmWF$&Ovbb`BcrN
zI@7QGUl;j!|1~-5x|92CHT~n&g`O{+_pgd8yxF0GMWOKFMx7MD2|Ej3I^8?;RC4Pb
zA=Sb-{(BERr1w8@;Qp7d>#lvT_I6munUnR3%7uRHbu)z2oi0B*uKTi{&*DPYk*8Vl
z`>G$EIUI0@ZL?<Jvqrf@FPZCyZ5&jtA5?qxyuP|jaI5X_<tF9NLRLo>?daZFenBQV
z?`eVb@@f5x98({!eX(ZGn{T2@LM2~gm%!Engzt$?o}#j@@Nj+l+z`$G->NordipLZ
zDqHUJY3r}gyGr+M<<oe$blNlTMU~5*I6mq!_X`Z#0rJWxoszr>I}=)bE3LmjQGS`A
zGn<Fav}t9sZlRx3^1DQ%owr@5Zdown^a~e(6w9S=mR@>#`krNER^$uY!e0SG#rH$k
zecKrDpybq5lLu{iN0tYdz4%dF^|W>W1>fssYI~ntw%IuOxS+d9s{9fcC)ZzZ_T67L
zr*qG#d!C-nwuMJ`TZ=hA+ViF}Ine6qlDpG9K6cAXfB3oWukr6odt4s*WInFH9~!9t
znS*Kj`TJGNmwokM)~(spGKt^v;fvevZ0~QL=5=cSPZ8gfmt*`oJz_o{Gw}N4&mi=C
zexyi{!?QnMnVy`LU%w+>HoG(V$FqMLH-*!==FL6vQKjzc@6&7h&!o4!ysY`u*e-Pv
zzuCsg7V|E=-ZiCnuFq?`m}J+)PnL5p-RVA7`ed7F?zKIqPEIP@lvreVTixl>*2^;`
zP0G%98Y?#~J7Q7G_-2U<chBW{)|ty~t{u>;U%qs1l+W}3-|n5^Tx7++SN`ms)z@~v
zUwf@{{`-A@tDYG$9}1iII{jVw=G)cBpYF8X>{onFwa{X&(Vys?DJth`bPlXCy}H_C
z&d)g(YKx9~+s``ir{{m}_sZ4NcCWVSbMW~5wO@DDR6)nx&yT!Ro$YtNbMtENAA37x
z91ln{ddxBR3*5V^>x`-WB)%0tn^%NwfAO_xkEm>=3RlnGBd$%0+PyaXsi-X2+`gv3
zvy0=2)s>dO1pk;WHkMgjObQdk89id+ZYWLXn7H$eUgI9uybiIgP7`)0J^E&(sGn5I
zB)U|2YMNV||AvxP4VPa|uGu^1lH@Vwo6L+FPD$$@@po!Fr7z$RU}%gGZa8vu*Y8y!
zc5f$Io`~Lbta#tr#v@O?`CdZ$6@o80LYP?`6gIIa6i$3Q%R%&a(e>+Jwr{sz_I!Ho
zqOjvvmJ18-J0tW-EMtd?o}r3fVQd%M(q0A?y<FSAZ%qswKU5hyKYQ!MU008NrjY+p
zs{iOJU6pkaO9J^gL^>E$);T0h)a7V!NaApKBy@S|I$qCNXB_WcGR*s&tE!?GU*y5g
zB&f*H`T4rw?R`Hc?fNd|dH(O)_mP~(dv{FCdwYK}+ce*e_vVM$&B(g4B5n51dA9M^
zanHZaoIk6`w!ZSU%C5a@_)YV#o$$Zp68m~8^L(qS>YqBh(mxrj3jUqo&y%9lBXw=|
zt$DXD1<y)~eX{j#-7Dk&6VqQj4ZmL~d%IQRy1y)UsjkYpfRx2E^Y5hceG)rcD7rPu
z@zJ7d$}9LeeyA{XerESkduNmR{A9JAzDJDRl~3Nm9xEG;gub2Z^yyqaOUq;~A#uLr
zJDv)e&urdvG`D5ywtW+JT>9kxQbAw_gUUI1C-Y9@r#;2bPTX1ZTYfuNWyZORS9M2O
zQ{uK)pTFl*vOaq{-?gpm@+A&xj}8h>klX7X`F?)=yG)b%cd}tyUclNVFCXoUoTzYe
z{^{JxKNn4M9+-Z4bZNWs&%!I2@0$HD<u<Qd`Y`$Z!Z6+F3ui3*FMSftHY$Cx>B-EB
zKl}8qcg~oy`|;`6Egv=>4clvxf7ao8=&SQT&#sd2JHP+WG*Qmzt!cJrqd$a%{kD(V
zesfaVT{FMspQY_H`FiIStPi!7l@#UUme(ln@p|PUGrj)&o7ONpZn@YT!zo*K{js}V
zy6mg8-Nsi(gYu8BPV0RY_Pe_5ZAR>4*P<mno8xCNZukG*D$9R1=2hJGbBd9IDbF51
zb?11gd~Uh(@8FlK=k8upcQN@|Z~0;6*QsCajf=a3e*ZQTd!OYI^D)AG<B}DpWuEAE
zEmr$g+H}Nqe}90pKmjPZnLGUPxbx)YXPt{lFOv)f9!~N!n|?+(YTvt!uHHI<53WzW
zKJDVlwf&pzZvXhW{M!C@o)@0>*6&NQmGlA~)H@&BY|;6^5~sg5Me|Ac^X}Wx!Z~7s
z>mEP9T^8_vVfn=`*6kB~ez^ujOfak8bo8>(#b53+(*#-kj_aRme|<augUaf;Wk0jS
z{DhuQk-r{$sh~1h?cAcc@c5*uk5eO8Y8Ix&d(Jj#Pdy!;`fJP4i@j_2ZG3$6ka5x-
zo>>OF<ZXXn4_{e$Sbmq@eiPOBQxD}Cmz+Om@mJ+FJEQr&OL>Bto`-X%)|EwPX$M&)
zpW4vVV|=?axi0hM{@6OT{Mh{(?s~;{SrrO*ax8uPdBsVYC%Uqa*9EL?IU?Gn^NF=l
zNwDF_Pv+e1>VAu4jUu-#NzKTAbv3`nZHG$i`PF)7e2%<bXRQ7~sW5GdPWalh9sK^w
zZqL7ZYgSM9gBg>n&rM#ReQU`+RjEJsg}#2?l<xCNZ*q0<CB6B@Azr68oxZOrbp89U
z)0<sRp7HOV|KX$1&qvw5otJadUq02C@uqUhidomcd0Z*Ev*h=Rr-km9Z#=A(`R-j5
z>(i%J{Ze^im7sg9{4_81%CDzdmPQ$y{?9#r!*^Ec-$mQn4X%IZ6q?>0u**(=E2wBN
zy~lMm$|Gjs38i=e4IxI4IrbOglnc$j?wYf1XJOA)m%#mJm;-!nZoaL*d^%@Q+$NpN
zYo9%HnXCT5JFaH(x)*<fvo%$_S@wSkvJZY_A+<EVDso}$&1WiiUkXU9v-e&k#uB{f
zjZ&df;j@KChn)4Tr+(kV)jj{y+a;ZqJ3f9=Hh8vmr^>lLcJ6u8EcTX6eLZWn{!gjT
z3c|XDLb6i|7fjE6RrFOTc;~g4f3M$5N6cESSh)0|+}EIP-Qo$`SM+yOT;&%MU;oc3
z*Kq+CqsN^6jt@g0`B+|Fnh`kpwUoKlp{P|FNk{+ZzmvW6d-9!c<%LnfaX;?wS+c%z
z^Rbc}3%73i_RRB%k9&j9$xluv{T|I(<akGHxyO9T&dI8kI)!o5+^Tv#UYb=-+z~U;
zVz<YFDF>alq`oeT+FQ1F>zYe%S$yn;o~zzU(~JG8wass$h2&<X&3@kHXFb--PyA|h
zud9+*W!;P|kHf<jf=asmhJx)<&W|2l@Lr+Ju|uHY$WLjxI90!IyIAiEFaI9<QOHD0
zYPZeVr<rBKrGDDg?<@4D+`Z(x+_q3h@pIk(lC;XDvZlK|cij_S`aR;LoKakyR`u?Z
z-Dma|Z{ok8x&N#5|4iM=DTn<*3n*?)UcNXuulkXR{=N0;7ybPX3kGg?cHQGOXQ%qP
zC)!QvUNYC;uIs9tc{pm$KE^7`HTRBZdd)fKufN<x^;|Ch^8NXVAJ?}X5uINt<INNO
zOd-EehV|o0V--ES+SXZ34Z5I!ZhYweNM+Tu*Ck%HvFk2t{)~$3duFC7CRp+I?EkYq
z*Up~kiaFd~c4B2;)ymm%T+655w|v{bd9ualFD`pULY~{F9JsnA^YikV&$pZu3bO0`
z_~*0V;@I6~x99m~&inE5NL2BL*M<5MEk2j{*kyiPz|6nE+iG6u(-N=cPg`TQy~(tG
zx<vlA-RyJyM=U>@{{FwVJS@%6;^}0ENVU}b`M1pv*f@TXm#@vyJzD$6-#h2C@J-=+
zIvX~fnKE7IbNBa{O%2ChvUYX{=TF_Y&M}qmQ`j#%A7@^L!ps+`TMRF<C=_<SySL_8
zr_Bi#s}rBL{g(ToT>9?Hz0b`XivzzE6?m<)dGw?5hRov=3Vfw)M~*I>z9`&%zY%+7
zcbvT1PPYj=RKDxon#RR6VaJp5&36+u@4Tu}?OQytUq8y}di?Lt7aoX4Jv!4db={8X
z&UQ}OD^9HtkokM%^1BOyp7Ph9@BEp$e}1-NVO_?<sL2j`Gn@A$o|)3bpt4TnN}w7?
ziwA>{xROr_E0e+m52gt_T*|U9N~fN>bCH*Sv8nX-3!C`OE?cgZy0BAS-jiKEVC{>q
zPK_^B1f)B?)4xyIchx<=>h40J->dfQUv_Zi()9&i*&kjvri<@=DsNM)DtkFE<Kk)8
zM$nw-gXs(^=kyI@-n%)RmU+UPyflO*NwIKgQ*%-0yxo7EU-{8vT)Zga|0nN~fAark
ze71_2AOrI57u6^xCQo&SPVXxI>C0+o+uio$<+I;l(z^J{72$>>OY1mxFeh|cSN}LM
zXG+|R3U(De^`$AX44vL#^@~guCZsSy!j6sUr3yo*cl!S7JX4i(#;dNxa&&sfPG5Xb
zK_HrG!j2ndvRZsh3J)9@gxVKA^kNWdXME(sAk@B*kx4;7oN2<29bZIVaWW}9n987X
zPJ88)IQCBO%;VJ=W-8~HRbrT!6a+Y#ChYJiVb%g^P-66m=`ff)LsUhtbFY-&Y=y#y
z1ul9D3{0)+44v9G$x|4Z6ar#EBZQy&VlyY~xKSE;QH!TjTPjP0g-JnR0)vX4f`bZZ
zP*R4`Bj(92l_=3B&{*W65NVH?DOJp+jEn*W$_$-{e=sl#cz{L;FIAa5`R4G*iEY+`
zHwuNEuY62cS%mx;gr<MsbNlR|(8{7vsQISE-nZ?@&xKRhWk){x0N#$VL%`x$Q)}T$
zEs)<Bm@GLR9u+BheP^ApqvX4N6hCNY5EOL`ObP<FOcQpz`0BYNi>0$Wq<n!9RGa~<
z#$kd54^ueP1PdMk0mvjRgzOMw^oZH=OSx!k;}O?&-T{h8vW-(f(O1LpB+E`kPh1gn
zx-i5rFnLs!p;KG${f((?o!zn17al^kjuVs`D%ja`<y7=E9G9>o9VWogSitG<=+M)d
zJMJhHcD@cW0UtXLnw|p<!9QbBC^US3z$si_C_LlP33>4H1n`Lh4h$@RK=tn9sk>Gy
z7Os3<WFmnisqlzJp-}Msf}n7Iq3~C74nL7yqcFjjL8x62l4zK~!2$Mx0t3@UC5F!K
zuzHtuW)pUtDCN`wPqjkzPAFoUuwx2;L(JLsBTtoD&Vx3i!JOi-0MwH_`c!uJC56I`
zuZ==b9P^Mxp^)?ag9T@skFctCe1Vz=u~R^g(IaMujqr47NFyJz$+Q8yDfq}71{J+$
zvJo#{IX~j^mG}u+as!cTXs~CRU{Sv(sN7UVuQaxg4O!9!r16n#$g5Y5kG$?OYoLbP
zCQgS(21+p?(g7OK5Pv8{u_zQ?eAR#ADw~k_PCds~WZ584;{3!oVfD`mJ7QWsSb}EB
zp-y4o5Sh-PvTnzo<Zvk!y|UOgE@a23fE@EsR%T1(gdGc5Cdwg269da3RfbOP>FJM(
znvbZi^S-EnoZM!B`cQkAxnpZRVis~L*CWX>3Vd*95PIGp{%T#@k)`WSUra!fZ0rzj
zIO4je*(=u8BW7ha!&79}G&q<sddRr7Xft}quz@G48yGlTnh%`49M|FI8o({kq0t!6
zZ)3g4lzV^FgdHDds%g1<#4Oq++zVM=*I~#wZO6a6=Q;|x14;!nWEx-c@AsLo<Im4c
zBJbQ?et&y=dwRW|homIagdHbJShXO@3z~EoOzux*P|>@=(0GLPvG~p_$Y~5b|1hP@
zX4czI29<NdYR9`33J?Bp)iYJmD~fI6f+UM2Qy5hAmdf_0RVy)ces19u66Xo7337h)
z=yLc9O%70OIDjH|(%t<Tu?9>Nb|m}}=;S`8U$j!OaN=vB5J=WiSPCjz-d3307H0IA
zqwdpjg!Qn<{3{c7{P-f8#Rbj|6LOd)>`3{pA0^(yprR*X@A9bWV|PpzsAOXi1(idL
z0v?)-9x-0la%aD}FbF;86jmxcXmC{B6xuofCtnrNIR4GQ-Pc%^89KR-@lV{5@JDI7
z6mrrE@?sF0eyPuWv#nsmk;+3Y9x(<!A3b?HyZz%AE`#KglWGi|-evz6l^*19c+~XK
zSw&A`|B0n-NTpNb6rqMAOSe3}nB&i)P&m&(aWq&!h57Ia7SFoYyxqMSmzJ!0Tl0{0
z(dKhXzy1_2>ef@L`up`w_Oo-UyWZ_O_5I!c`76TLpZY(o<j?&{<%`{1&f7SboH3kw
zz2*bsMON{Uf7|Qs_kQcv7n)mJ_Ea=4Rla&nZS7gB7k|!FecoSOyKd(Tv)5)BdVlI>
zo4M5gGY-7F>s-v)s#!jBel@RueXaLJ_<p5be?G1HtNK%}DnnJiZf0rXzBzGmyZ)7Z
zH9CH7L*o6V<*SybJ-BxNXW!|VKVN>_-k`Xf?acdqzdN$7{mFYXtMp2;|FMlv9-EwV
z@BjC1w^HU4+vR1iihQOWx+xRCYn@;6rCv+B)lLjT?T(C0=RlLeZtO^jeNw#QJeN-`
z{%T+DB>k29KU@88Rg->`vaql1%qg7D-raS%9`FC<!(o?aPr~BYtFO-6vNgogciT=`
zrNU(sUhGK?syWQ-61zL>+ULXktG#+JKVSAFvvOjQ&$>g-r#3!*RlbGO-8OOhHp{K?
z+>INydn<4Hxch$X@h|T7tER}*SsG5x-m&-Dj+3G0cmLHKTN+-s%p-H-<)YL&{gQvr
z${Xw-|C2C(w_9HAkcRud{@G@so8mVt_{1)}+fMwO&ccdTCk7#LF1M6P0iZHTmK#wf
zO?nc3B5%_g^~FLv4VOABkKVGPDCy_s`GQy1#aiBc@`A<vw#BbwlOsEKbe?&0Bk<(6
z3C{bv>nbK%=q3OD=JilWa8+pci<{}%>f7Q}{M4@J-o7<`?(5>25yh1@h1Zx)%s<I;
zUsvjpd&Zm{EK@Cvt<u$Bl{LM(V)u7nti8wm;%b#gPp@pNIlgpy{+hWlGyf)r?|bt0
z3g742pL|ccj_f?S=*iDh+UEkZS6Zu_^M3z(7n{&?ZgD+7Nk)&D4Sem0BHZ(n_LTf1
z&fWL!|Burww0Zd4gOOW$zj?J;;BvpI|Mi8It6%U5ch}qFamUwO(Dtv1pnJ!U(zB*6
z&1{0a5)a}ZRZPm=zQ<pG^5)+${y!o^BJ0kiF3A%vURS<SHSfONqVsmEdtB~StuEd&
z_4O*{`AQouo!{g#?f&)J(|_k(@tCt^^39h|mq)$(>E5<BTD$%Phw~$gMQ8PQ%d;sI
zvhMbGe#F%#CaK=QprRL{?u01ERep+Yx;;60d->_S{Zlw*c-&F$7x10wbmX+a^)-r*
zR1@Z0zq0ba=bUNw%d-87=5}~bS-xmd(f;seHWzpKSj~=RcH!7U*KJ2;R{!u3H2f|+
z^R(4nyYr%jMn`X)@pk7azIr*AWtZ!tIiIecma)vAW+;{<R<Ju`^*wz}riUU+FC}Ru
zWn4R~w<-O)+xo{w@^#PR%rxWs8FznDEqZB|SKOPgU;Sv#w`(WwSr_X#KjJEz-*!Zl
zN7u-Y0TlLrEhu5XME%M8{qdF)uTOSa-Z3#&<=x8V9na3MPdVnz6Z!0i>Zaq9gYS7C
z)!)0_>)+dL)7{@YCOO7zTx2x=asPS|$IiCONI%WJtJFW2u_sy7=6UtZ-Ez}aqWI+e
z`xnJhELA@XUAz=%X}oF4ccrd7{`3CC9=V#W;<x0)Yq=9O$Crw?MJ-A3S8?C9$1S+n
z^^uBKrh0Xm&~)vr(?4I>F8h<eyV-xkoJ$1KPKvsUOl_g{m!_3B-gZ7dsd0L?-O5S-
zb)7b+o-SASnUzvJciD2)KK0eVmU#W#e%-b(?Pi(LlJyto%a=tgHJPO@{i?HcU-hv+
zecKiY9Y2&-uF)H7yJNqr-xB$=cEyuJei}Zw?7w%e$E^1|z8tzBqWQCYZ`FO-yvOPO
zGX1P#NqtW7)w!v>U*`G#Gj#SVIvf7zjJ({JAU~H!d%je~U3Yl&=}?=CrXoWp_qB_l
z_5`&4_N;3?b8?mC!c?Yb_tc7&R;4zrPd_+m1+RRSlF;$f-_Gm}$qVH5jrYEjxjfZ=
zk%M>ZbdNcj_Rm%}<;_2R>0-=s=B$?!568Ov`!v;8|I^Iu^>csjv|AgbUuyDt{c%or
zAIn9n_-`BVH*d6_8?>V;GHB-c9bbOsZneH~q(@|zr_A)^d44OW$DiU?c-Fb(PX=f4
znx|UF|J3b}J=Nqf=epp7pH?1o(nEF_fI3kJtz?D7d*Yu2IX_amu6lC)gv*)@MFE|c
z@2Txh-<oMTkyF)M+FM3H{4uk~na*W*U9R`-nrAY*pLIg&g1qXp7XG`_Z+J$gg`B)2
ze02Wyyx6Z9wVwH2{QL92%jg>)%6#|I==Do8roV4GC)d1H3BQrG(`fqpr@!Y|>^<-I
ze5Up%aHya6ZBYq2VQ+E6);`eq)28j`-c?rSJr4L4Jw4fO^Mxr1E{`_tQ~e*YqoHrt
zrfiMxc^)wu5)(b0I2|4}mFghX0Fchu2fv2aE`Oh=war!g7krl~NM&Hs|MuFsdV9{o
z!%^E8PHwv=rm+0fk_kHwa7=V;0vn=-Y{<eSK?7y`T`{{g0ye%1?JAn1WD_I)Ipt8$
z75yHgvo9u{yL`IHd%M+=#(0U^2?uQM?Wq;1-CN52_U^{!ESb3eTYhUfu6TLGDAah&
zDF7GeJc!~P-o#X6iFovHcd`213u~?S7QXlSb$<K*h1ZggUvA`;etMM&G{-VQLlsoV
zSxF&9&k6xZBi4tb<H+0_i<gH!E`M(C5?{BcGwRPGb5ooD>HDG?J3sgGa0a<B2#K%R
ztccl0KGJePdVBwpTI=$O9x~z8XE&d}U_X7Y@7LS&-H+eBaz5N!p>SWp!$(sD8;-EP
zF7SvksGI1`+u0p6eSs6C!D*<N;Pz<At@%3j)34e6=8-K4G0rM>-}(HS!y}u;hgvmN
zK$+nsQf7ellaibccs|<kt9o1h@^ZO9vbF20OtY*@t=^tLZ++%t6S&WK*$3q8?^vDP
zF#Cw7@V#C2dT)OwNzXM%5Rm;g*=(I|^O2)V9tWPCB+zh#^)MgSu-@Y#WB&B7y4>=$
z+TlT$y{&^cznj{=;Kp8sLc^C8A)8e|;k>~aQTD^y@Ex3t>xGV=z4_sl_LhG~#aF!Z
zT)QIlriVw2&o7CqJ$#_Dc3%UetgTbgE1K#K>kuwD!ort%L*sRc*Y;O6I}4T9{y3~0
zvNlgPtNgodsGh6Jx+8NtA1V8S{8|m^#(a%mw`<Nh8Ka(z{eP9h&7oZp29`-45588X
zU)o;1f7xXB-i6KmS<^C;f>iVFwy)0o(-EC<rDgZZll`;OZ&@e5V#&I&g0pnjn%U3u
z;=UbQZIXOT)+F_rjn$Gc+2bWYpS;~Ev||IZUtfv;{<0{{w(1f;pM^nDsWZ&=3M*Nr
z{r{JEYh$9V>HA7v(_JejU-j{bd3Uev$kVhbSF}OP@Wddga=&GniK)fEid9#aWIlP-
z2JMD5I85UF^LTyt%g5&T7Rb-56qBw0th;jR4nEm>KVf(NWy$^aozLtG^w(C^Tl&uX
zn4so#eBvHP^ODW?<PA!6bgMs4^-;;Yn9P59?ca6V+C00Hql1pVY0o^mJ3sy2<eG?o
zyUMIC?^^fzs`-Tvg~2PTzDh+N?q0t${Z+-Un%81hAD@_AGu6IdecWvBx%WNARuNxU
z#w}j<yv#$YPpv5K^Sk+pSD&s_7fZW!FwlE<k;U12d#l%M60@<m?X^AOqW6_o>dV!C
zzIK|hBjGNzA$DD_V#dC|Uxc%&uHBrtT3QP>=;5ip;pyM+nOB4B7conlF8kSDzU1HC
za~?61i*=Wabl%>RxmVTs_WmC>oBwPId%MeIN522ACEonruPSCP@wnrj9iFbc_R0Al
zSMx7$ytay5weqz6qB>{qI==Pc_jmd2kC>6Czx$?;qnO6>lDqYV%DuX;{r|T2Y+IK+
z@yYc2Wi{VkKGO+I5&Zp;Q$6+7^N_xR3EO6U^}ECHYd@Rgqo%LWa`E3M`RvS}Pdu+I
zJMX)di$@Z+7692BnF<?TZ$AIx?eFIomhC$G@{D1*%DR(rs_&+ly*0XMK5@fif18=h
z?wj@<o8rIy+kwKkU-zZNPG4EZ`+Aj!%>Mt^UV9r~G@mw4U9NV8m&v)9RWGNk1Fg=>
zd~>1ah^MTI)w8X#oBRuYZ_AA|(t2y4>UZkq#@gAc+y1ikZCR8b^loWvm5P-Vdnb2U
zHaIJtyz<^>mIwRQb<Xbdio8@_w|hal_6-g}oC-qMRfG>-5>gT>Pia0D*|b>IZ<_w8
zNltRgd+uE7Ir36<x>4VkBDc!(Q>?d_$mmW{owvzm^5=&d^QV_>=<qb@SM~E=IxTST
zl#o+5cc*VxE!4N)Z}B>N;q(P+AAd7-a$j2wF0^J=U0>$=<@@ctrOEnleM<YRv?i6h
zEOmem0cEmW+NAEs-95)|<>IhSJb_9Pzl^tg-0}9Fce-oCPS-~&PS4HS-l&}Ow$j`l
ze&2E0k;>fmz9W{Nb7GfV)O4D@<$e22IggkPa&4e;<K~s<>3e<O`|X@OYu;Ts!JQ|*
z32CuGMi@kxPK(?4SYKPed~tQ0bFXln)l!d{_u7|zFFa{y`Yzhe=gH)=(#F+~o)`-%
z?^XBnp5wASYHQx>UFv<RlRi)QoAIkaIQY?-)CH5)-QH?`Y0b;YM&34OZ41lZZn%AW
z`S&9?J@2$mFF6@yrS)`1^&^$2^6H=KH_kEAH0OA<y#B5G?{9Cp*Sb#FaUsA>U%BtZ
z+PU`qW$XTT@a5iX$}-JzG3B?Nvg+@7#)`g}FT1!+AtN3gnvK`r|K+XSU7nvcPx$!V
zlzGcE)IB$aWS(5Tqqyhl>R(%2_b%Cx^Cl(g`$H+I5VL!`{HBS1`g9`i`W+8_y~=As
z2ao#yy=b-Y(z_C`-Akv;nSM9Ua`|P&uR-3$ma3<-)|Jgz9&F|M=*_9bk0zB~)^8Wh
zGyFR%WTlx>;lU~)P;sOa^f^&I`d&?Q*_tTV-S&Ug|NdUR{>A?Lb<*9t7EgjyUJ3y|
zOzkE)F6!T2yj*!g-#l;0_kSNaLe~ek-wM4Gtm+#yUwAsVUl89j4=?MYui-20+K;d*
z^MgibR>H?Kg`6Ioz3qOnzWm)6l|t#b+SY6Pe=uFOT`9U%OFGk=K}h`lWl_b#*1t)h
zDy8)Z>*Mw-P|F#ZI29j+J_@t@XptMabJDE1TQg^smiOIS8v01cNajKhlW@ZkR%TFx
zfgjnr1KmgVmi~{q_4lY*mL;s-Yd@lzlCVIM71Z`@6BQEgkq?58<Eg;w_?vpm?`*EM
z-fR56{L9{~t1I4KdAX(j>@3S@-p<pV9O6r6fcibpK@N4!66oM<<e#_G$gI@W73A3m
zGq}aXWSb@*sonG^HtP2q<B|)3XIGq+eLC+oYv<=4F7aicas7zpaNUg4b|l~8?e(n<
z3>08!{37%t;pN;AuFlQTemkGtOn))|^u1-j-kx9lIPcZ-)xj9e3=dwWjfxDN-M--q
z7b#4rVhWtN1F3Z*Zff(N>Hc3P^wy1{DaVmE1{FQC*w(HlhgY0aq8|CZxHk9Rj&ilX
z&u{+^c)juQ%L8q_pI!;0H>%`=njLaD93CZI4qc(av1Fcurs})by3v{Mr{7+A?BUlH
zZ?AMY)G~H*xA1og`+=(JW>B*P!#=M^aj$N-f6KY+)V?*(S)F&UZh^hSBPL}RHcmBA
zOK2ykP?;dblH|-FBtCncldGe`R~9eTLY{x}+~OAk!^?P=@+%ZBOmI=-1da7x+Xz?h
zppy6X7H`SZt*Is{4*sv|TL1U{a+LmaDw{oEH{aQ{IdZc$_w9=R*!uPUXZe-BwpHdE
z|5d+yboI)k7kjVl4HvF@m(QLmdh^o{+btg&`&U$M-h1u0mA&WfZ&zYxn(VZ@|Ly3%
znMFHd=7t$vEPed_oc(w0S@z|!S6BYkefeeAqBPs)BIhar58=ijub3Hg6blnCxx{&b
z1}d*DhPzOqOX&LZU29F&{z+aH_;|lb-6cVjszdy%J!HJEu1K7Jqe!grRpnK)S2Z8`
zq$0{x*4__T+m&Eho9Fdlhv5G;L1NdtT?3Zno}c$Vb?V{D4LQ568k;q<h_W_zfYy^Q
zQ4Zi0&`@CXh%wj!xAA}<;nlNS-!8O#ytd=6_IlmrWwKQl*K|EJDL%J&OU+R!Df35e
zuV-Bpez$bmad}^Ed5z`I&PrX}roV5?r`+l+Zz<C?<zmX6J+YNH%WiFa{Ck>ACx>j&
ziTaF($y+xm73LLfsmd+g_375KlC0OUl9E?ipZ=P&Ueq)D&WqPNj8~eTi!6Mc?V_Ss
zxbdgff<=Z344vJM>I=Id^IsAUptdwPkv~u?ywmZs`2B*=>}%1Bu07v%&SY(kf$u!K
ze!1u^M<#7Pzy0k7yX3-TLuvcWKGl4kN~PvrY1zc3xA(SBciQEBX9DB=i;q;<FZO6k
z>$d%6GhecA@AiQCjJ-A9cRq=lZklG{wY=kqW$ilFHGd<m*QU;kx^YHhPHejKqomKG
z4z9_duEFASPN9wp{!9~g*c|*DZ-neLAw%Qur_a_uk=(U9DQ?;k-B_FJT^nAWmy(O@
z2Q7Pw*l^_PiomlIyImfoh2|_sJ)Ir&S9#yUWxo5CRqkK!Z2tA0*Yvor%GYfRt2mdb
z&d<`9{U7~)^XacUueiQHcv;Xi_|ch7cfP%w`L*Y)gQKgX0uyK^>IF*(BMTR(@y}xb
z8c<)t)_6mpV;0h4%%mTWpH+LhAAXV`tbbYrwiMIXbMxeFyR}}W$}TxndRDc5qol|R
z4GtI3G{7R+=B!4C4p7@Z0W_`yDc>}_Vjj2sKWDWv=-*or$<wUSN~t^jW<ELp`r6NH
zxBQQL>z<wN@_Eaen{z8>=Gzu;WuD*caEsI7(Iq>k3;#?&U5wYM@URN->HNG&z0dXK
zl*wh$N=NnUat=Jd{q6i4@6SS=#!GwuuT@UZo4^07pYQ95Y*9Z_>ZRq27Wtp8$q8KR
zW8&xjRiSXB<i8aeXFx^JwZs*A98Vk>gr>8bOl5%-<_<X=E`g8E9A{b+6}-3W$jtMd
z$q$}ARQ{B~^x62|_PUbBntQ9BzSHVgp3wK?_0}yPe?N<}sPkV`3R&=%!^W+}#iXjn
z(5YR>zMw)tg9((8PQWvggNmM+s2ix95AtRZntp4Y`{7_v!hrM{(MvgpM?v;13<|8E
z(6o|-yAM%#GN|Z%b!K1)asU;Sld%<)<Qmc7ac7(Uz9sMWx?L)+|6Lt+_c*pSc(?c2
zR_`m@n+ILEobizP>6JZ^i}y$r+CmrP`QNL25?XLXsPgFs&`yrMHDY&-D<!V3tlh9Y
z%)@?O;V;iCf4jJCmU`ZDjr;lR_PdXhp9Nf2`?~r=?bEGKkIX+izq<YTzf;@!3Vql+
zyOY;1Jg4xWAGDHA|8o{RMI?B|+zyVuv?6iu=T-J9dipt)s#;4@93FLWoxHyGuwvo6
zcy^DNxb563OpOuX{<R)l>jT$EZ=Uol&a$oQ-*oHd^Ihk{TKeMCj`S_&TC9G@TeshS
zmem&fsbcz-y2728E9ca|%bfY5*@e}2;gqT~yZ#qy?@wf2zfgSozPX)zML3t=y;x}Z
z*f@}X-@8-Ur`LQh-RU=1?diK8MdG?OIfkHoeropZQ>72Z_aFcIRNVi_?a~*if4{Hk
zdidqf>HNrv8$z?MUEag$pZe<b+y5DN)4XSu-ahlEMREUn<;8~%>TX)8Tp07ucC&cY
z*QgIKlDZfAX+JG_d+Fe=XN%^~O*Gv6ICQgH>FU3>yUVuRdK+4`qi62!vgpSRKTqFy
z_+s6&Ws)C$pLx50nSa4<Ex9efdS-r=wpr<Q|CLS2y}Wpj^)+8hjpnty_xH2wW1H%`
z_Ry7;F|wVPYj)P}<n1_eb8_>MqtE<7>Ah08;mFh8b6y;f7Mp?@%cqGuQcs6RPCE4H
z(xVs0t{M5vJ2CsT-czf2AIkRa*lYT-{_WZSTaPrx>jXKoeOWSpZh4g&|GYWYo-)fX
zZVCV`4)oTJ?E0aay7vBuPE*N$HMjX**?-ZSRrg%rkx5LvUi<4alV6^lpYw5x&z66J
z?Wv2V%;}4t!|%T<Qe#JK+0WBU?)W~M^Xc)f?jMEdEAlSoD=oaFmS+C`yMKDc%-ZDR
z+27~wH9=dEm*#TKH?D<!d++I)hN&~I%Dq&M%R2qF^5@m^HFXys`@YhxJ?)eFbK$$^
zZSL1LADjKoc(+xj!l`}%znaoov${W-VS9J-PA|>M?`5r>dwEmxx^|B_J%8tg$7XOE
z@8|C<-dF4NDCzTZR#Dc*J)91YKDn`Xa<}oVbX16A0WEo7;-+@wj?ug~e)H_~vlnS9
znG~MUU&}G$a%q+S9FIAYs;6Dg_xvch`SyHPuIsTBCcUp$t9PYbI<siucbUqY#%X%`
zA}2*QPCRd^r7B#~xnzb=s(9z?>c3ATlQN$zzsw=Oexm;7n!9DPoF?BhDxV%bWSDoU
z_R6Oni!<MRu<QG+y7XRDSgEyYm*(}%ok^XSznU$M*z~;LHRy=HQgzvzFA|rZX1u&q
zbNAR~P4&MP-gbMxb#H0B{MxoV?pX4=_a2k%+mER3EWc<bumUt9WhMlg>g8YQs9?tO
z$L*2Jl@!T^CR3*GI6K#{>)8EB$%UIH+kQ?E-fPo)`Gsn~AH!KsBmYCw_xw<cz4&wa
zwIDAs&bpUN>y*N3Ca`$V(J7r`bSAYm?dGx^3t6VaU!VWkYVl*D#peLYhYP3AJyji-
zdeHl)kYnkjIP<tXrL^nqQ3uX!c~&vu=>(m^Su1BBIVluxt5>Y+d**1k$~l?Od%jFO
zK6jeI`oE7tCnXmvYX-S07P>O^COIr<0Ts9p5Do18Lh0)1mm-Sa%ecRhi{7G=%|GYv
z%PE`sHK+R&T0T>m`Twl<uUY2VKJBtmr=upNxt;s6-}mIFOMzuV&$mpuXfxxG?;;hg
zqZ2Lky_HLCD<oG>H;Gf}Q|`)}Xt6y*jQhg0wYM|B-uilN)2SHs1S`)u9osmjOLl(N
zD%^DZg7^MiCwD#57E=HHzr^a<>gkQM{+F#{S1SDZML<wR&pywCm8nn()ZmSV*D(t6
zo!rU(GVkLj{f*pY?)#0&(B@`(`n)Nb;x8X?Zk*;CtX9cC`SLuAz2av)9o{VRt%_VA
z<8!cjcS`?)yK?IH_NTs^JNa<*oKMmFKhBSApU-kv_<6~Ou4(=^S1<dned<zR@%uS0
z_LV*bE+*GcE{jv~<MVx3s8IcAkH;MF<!XE0l$-ER|F}uL?>p0{A5ZL_J)N+renP#&
zhu!La-}1dgM5-U%$)7u6$BjRkpoU03(}W#+E=F>~D=!B-q2tx<d*=CRUVmyecT(d?
z&i*|1*&P8V88vMp&A*$TG_6^|I=^S-=V$+~EoXf7V*cei73)r8|H&m*e~V5utj@gr
z%-{HY=4938J%7IZfiK<D{AoEg`S`)S3zfY`3ZpjX3EkZCIJJC3S8$2v_0yh@zGTk7
zqrOL{>|pZwoAX}1+%jj9YRr7UmHu*9H!XYpxP5)xyOZ9UkrA!$53<W9%nf<<bW3oe
z?)lEkHOr^*pK>vc^!jV}Sl`Tio{ctF=V@V?NsSIYpoIaYV(@mAKxemaYpzz79_yFg
z{@&@8_nGULoSS>=d5qYLso_!lt2kya4!`Gj{Ubj!Y&G73$ke6!CI8)elUIEBbb<%e
zN>O2KoB^)F4B%CmKze6!)sBjNLg5{qbqm#h3SFJpvDos%m(Q=eOf|R<8fM(JbULp8
zS>()upH^Sj|2$(I+voDJXsWDh-_BPtFU~6#wyO2~aa`~Pl;(diz#ADyA|E9&ONz2G
z3UIVCsOX9GoM2&5LX4eKtEu9!QstaY&yooT=JeH04>oxE<4?`2fF9fYpI5ju9TYlQ
z6be1xgl^$v0WBbD>g9m@nxRuVQ&x2T`$q8zJ8tllhL|vdRx`bDF=1pW0yP+=mEifo
zL8ULXMZV+g`o9_Wv+Td~O=`E(kKd^F`e*p3m~G8`HVyAqRb2Sh<Juvga`yVG^kpxD
zfBlNNezU6L^W*v`Cs51a)THdU>mEP9xO49rVTP}sJ)dUU%zp=3ufFN0>4u!HuRi+g
z&i~?_A5r@J`MjBLw^#qzelqh-hh53{Gwqw_F>n7lSvM^5&xFD{fi2I}`8U10xi#5M
z#!~C%zctq?yHj%5q_w!17&#ptF)_Imfo>1-h}q!R1R1~(kY<W{6k(C(Bbj!xW#^R%
zuFg+VZhi{Avd&rm^}{=#CZtVM6x$E#&+zwN6FO0Os`L*--ZZt(jc0WNF29#?;_Y9P
z?l~>yIZIGV*X0k)`@c@|oTwhF_xx~@Y2`1Q+hrS$y!^I%>gx5EC+?W}{cqzF#&dap
zB9cHW{~sN=-Ya3=tk&~R{MP1|Nya|{0tG&JG6+2vme6Bwln??<Q<%VGBf;&FOP9qY
zRik5qn!4IiSAG}=pUBgnpX$2o$F555d7GYeF3z&@*R^>z&$RQ4r<J}zsnj%|%`yMJ
zA2Hm&<YDrA#m=Ue&rf=)_}zLRD%W}%)Zi$4dn>n7a{rwlg)POPZ4Eab)^@z!_2j6z
z@yk71M6WkEM1eX8w?1c-&jMw%uV(O!c0ow})6t}~?;DchX6^XUSd=|;=`<Df&gyUL
z*Izole&6JY7Ry(Nde6~WFH*3_W%A<6GH<D;QhMR`Pong%UwPW~pjYbF?5eNQ^KUYz
zd(Fw)Kl{kb*E@V=X6{~g+jUFT+h0B|XM5${rH}i}aoxLaUyEGjAN>d~$=UhbT2Wa{
zo|AS2ZFc#oAdt$mU#_BT%4+efyAiv-y}d0SH4ikdr=%><Va90Qd3n*Y=tQ?uhdPd&
z{N%KL%NdVwiHSRERQ)b(+~Q?j<SHB)vpwhTJk`3m+kLMc6xOm#QYy@RyMM(t>o=Yy
zA-6lLTqir+c+4*(ezwkSt%E`dlb=xfyO&8XXMF#?-1zssJuZ))B#78oKdPvoeXss`
z$&Sux7M40K4?hT<xm*1$=zHAM<$kB)Y<~89U+g)>@BQ1yPp3Gl_MQImbnWC9-!|D@
zbJPrCzn8q?=gChYeGg)<t+bsM)MmG8Vqy@d!=p*JHZRQ$zcKm6rEP3WkM}~Hv0bp?
zSChj$mN_bYn<p=As@b(jXKBjGPg0%6(^F3sxr-S`I%SqyEuFq}$|jLX{xhHK`uJ}X
zYrpMuOVuF#^f{r1bN+q2c>J2_pG(VwKXJ~UUSe~4lAdm1jQ#Ex3x0Yqe!f4e*KTgm
z=f_qrmd^QUcTeHGp1YAihc2T>OwMk-OQQVMLb5N9Eh?I*Eb#3|^AXo|-GPb%8a$0b
zw(9E))^h5H_qo{Z%ixTye|!G$>5!)_&rgf8XdC%jFnYusvfklxmb+Rgc2U}b8#9DB
zs{c4XO0p6N$QAhDt*}aHxpHpW>@zE--8ZY=IZrw3kCw|6pG7Udc9zAxYRPs`c*~+t
zm?_WLnpuBi@{2=Pgm$V;1&7D<D-)Z_8XFuHRtdHDfrn`)GpO(-FA`umG|xqRQQryU
zoh*ms>L=`Y(kHrwhh-Co!y^L)&?#J?6)Dr_b%om=p67CW_16`-D>++i|0xtMeHS=|
zpUG8;p;KEZLxh<Lyq!qTz$qocS>cVRi9&OOJX2!7y~iB+UpvHDl0ZHB$LE}{Ix3`s
z7S+A}pk!+P#<OJNs@^0aCxtg3**m>Y9B?ppP!MOFX0Zn}#;%~p(zi1qF=ka))v*?b
z8<StmFp<96!_m0mxV(y<l5BStXcZrL`CJ17he#ht#i6Srmg*BB{yh@dWCoou04qun
z<fs<n-J|fJ9W4L6-EJK#((2j{)y56#7dN&vcunsCD+u4T&gCF{v{r$E>8O&xH<eux
zU)?NEDu5$ZH92O1D6$GYP*^gXar{sO%l};1pA8y4h0GT)uq1gZyzwm2QCx6I6_P5Z
zuF(!O0MFxsUD)8Tk)y@7Q!i|9pq~*3IFL_Dh-a`Osc7H^xw3(msSYAv_u_FB@_Nn)
zA&zPxS#ORNB_|;YR9}0E@POwLAd6@NbeIzT7j=i`PnpaEj-D8!W2{k(NGcvUfub(~
zHTppD0r7^yG>|8c3o&WM8bJ&=xl4!@KHdz9if&L;Bwdw$s?rP&$>}|XjbX@<wL<_D
z6*oqsVl*m7a|JwOjOGeZSvouls8*$d!@TAf_Edg;5x)PID6fP;LyAyrV4&mnyJgZ}
z-`t#>c79&$iboUmIZ^~0jwCw369I_fz{g}={%(m?>8lI>e!urO&AzrI_x85Hx3{)t
zo}Xv?<?Z(SQUC2e9AG{u$kOG+AQUcgMuP=utAN5gmQ$xrWxTqw^2_`B|JIkM$p3l3
zZjy09K}%o1zxs2)e+j{cBS%+k4hRL+9EY{CTw!YgKx?5H1sEDH2|mcUv#?#xE5813
z=`x|Kt3nsAjoz-r6}~IaGxg+z9U*1bQoKl2#sq#w*<NQ6J^3aNVKzpO7zvSy>`0ZC
zgBHh&44Fj*D_?x_FcT6IKbhwUn?wiKDT}-s78eFD^I3SXnLSxQCd69v;n$`ks(YF*
z+JIaLb5#LnfyuhKy;WacE}x&IAG0H4;%s-OjT3gvUcUFiM#~*MESEeVTxUM-mwxzS
z7D%6mAmf?#per{svf@}SnaFqb9{Kvd%q?zir;Eab5~j=Ybu%xOT=uG}UsMZ9cMn_|
zRQWzD_V5dAP@k~vr+^CIeHMWZ%f>Btw<O(*U110E&l6t;q3PyUDn5*jTh6vC6lzLb
z$w=D7x#aD#sc-JNyarX02WB#;@GWL$nZvmx*N&z0@ODO)BtL~*^R+X|#aDqh9YF>r
z`8b0e6mmZ+dc<(Cna-M#!<4x@Q+2obMJtfY9aQ*|8CX6kg59AvKd{L}K|m3t!j;2z
z6?mV7t7?NsOkm;S4KoBiZr5-KiEn1?dezk6sgP@~+rt1d!$X_VBPOa^?(H`xusw68
z2R3FqC>&zBRCc+mAbdd<$kPF!WyVd9T%Q?2{H*tkS?nr1$j=uZgXAZKF-_R9<(ti|
zx2+Dho;gpj&|l1Sy}_Xc<Ym!^>m06vyu4rmi$Wpm(e5+JehRst7e;vA>3e%Squ~BL
z@R>|)k=Iw}FL?NPS-{s<rcTmzr|(rv1+CgK+t1n^w<yfs(ss+HKPf3j9E;x?FY{Z#
z);lX;`RQFhS9(2*T6g)zrhixJPh@0$TKw$V??3mhd<ryKkz;fA|J%JKQM-%1j_Is-
zJE{?QD=*6Ml~jl*C=WP+W==)bjl3m5QIw=v_?NBk49CHHd#f`Zy*OCnHM^|R+hd*K
znMsG&*EPF6*|lZC{O$v>mzKIOTiY#HJ>y<gZKgt}Zp)Fvc}jUvFZfo5E?>}dX-4I(
zyr|4K$y+2rR?J{f;d{&giD8lI&d2LgCf?kV80)sFMC5Ew$i|qEJC?_!7-a$<Ul;DT
zn=)}hsr<TSFUxxu8$P}4*>ZS_h2U{k{&VM^9G+<DS^dC~ee%zlGt)lxup29-g~y+p
z@$}1T{qhT!{q2&jetqowdNs%FC)JtL<)+;zl4ZMdp*L*r{yFC?mNWa#v^<~1KHc#B
zKgqD#`Bh1ZTP?P!hMuXL5w_>I+1*?3vtHM(oGfhr?8Ew4>&%zk^&3B>Z)<M9?-;Ds
zx8;_0Wl>!HjQ`QE>`Q#qtWMP)Szoibc1PyJWvzzbR$`_)L#Ov|4u{`&LEYG1{%ME)
zc4XY&SNpr^>5`z+zTQfKc@r(B&43=!vrPGw`4*QKAF96U9=yCs{ZQ%Y4I4|oFEO0C
zv|*V*k*QDT<^O39ckX?Y=H2n*DAU#5?-RVke(XALcQ=Rg(WB-6W(fMq&b;;S=<~2y
zw@=sm+S@%`;!|7wRdz~fxxS#ganSP9@jvfbU;7j=%`uj*_~Jwh^SYY5o6O@k9tpU<
zJ9G21$kUo{TY5j69@X2zIO$W*rLxOjmF0_nDljl<g7(7{K41LO5|kN5PJ7JRsh)S>
z!p4~zb9U~TpxUjRD`YsSV`>kJuDh(&zdJ(epQUtjmwCMMh*)b?YQ%K1`tXvR;74CH
zu3Y~x&+gy%p3L$KM^=8n*pn%2blT-?$LzZvW{W$bPHfA+o%U3FqNVQYs7G^t6iPok
zHz!N<^rPjklfCA|d1-z<GU?Qu@ALNPd))Tb&JXYa-|1_p&d}Li$-d|ptLlVpKLd1B
z{geu$f}S|=P7{cf(@<`mF4{NGV(u~C_x@I0>hrrV#8h}r*Yegp>oMo@L#;g`AAbn@
zpYzJ+nzeJv<mvXaE~zM$-JW5s+bO*8i%_R9qe$Ro_Pz~AUZyV26G}hc|M>~C*AoU2
z_0GvJmTMlGu;b9(?U%|fYsq*%0rw0psxfqSm;GN*wiN7^HL88>T&GV=?zc;kPQ2uN
zQg@Q-qmqkD!!NCxs=Z1kA<6C0naL7MWsHT=yyo<IdCDz|nK{uy*V(UqhR2g`<;E)e
z`QUNls``a#E0oJRj!gaLw$$7Iwy(C;lR#lm$)usj=n+#`xp+$jsO-}CCY0{B;>?RV
z_r#Y?XTJ6J@1w((Z{*}3Y*Y8+6MATyyUZoC^YJFtuBYk#bEa7=zpUw+^mFyG{)7pa
zwa?lX#;NFu?~j>i(btjWA5)lLmAN}pbh@4wGpH7O0a`k+Tt8s)3#dKEojv6Bl&UX2
zk<9<q;<h|k^{rA>8cWxI-F}GztISKfLfgyFzgv@jCo{6ddhVsX$VGKw^YY}cUwm8Q
zrQSJNbw%0o;8%a<Disz5oZ6kgf1c$7*5$!s@1ITEHaqO&iA5?Pzd<*JGH>k^1D8Ll
zg%_o?R`g0U+NiIqc&wajzV^{XS5HvUDxk|WVMoSS_AFstrp(#=j*k|Z{u2t8-kUOo
z-(@0uz5237!7Ptkc~OE+{j0!@f<R7(M_pagtw(#nMe&Xqya6HNEIy!WA&~{F0yg@z
zT^wTKo*u3kW+p+k3EO6ewd`gJ1&2zLP{R>X5q+b6OOOK<3Jv8qtgz!)@^)FP9B8v?
z;})TYBTKhFznG`ba>-1-=}4ud=ngg(B_B{7#!=3?3cM(d6|{p@BQ8a*7-CbBT|!{3
zfX8iLZrgWG3E%S;y<V?g_2c0x@vWK8U!L0wUtK%9rbWKS;mdJ*Azpd8HER=}3a=6K
z;AN5o?W<)IC{P1eb$s{P5LNEc)&IadkjyUL+39jUHhfQ!ckHF-^Mzwe0#k3-e_j12
zTR6<_=XC~6@JR+TY^)p(k6IX1AZ9Rn#GJ9-5O8(-WDDaU^(OWA#j$*LGhUeQSG+XQ
zT5#6&`1NbnYPM~^>t>P}^fdkMF2TFy@_}b(x%~OZtuOHRh5OWtj{n|oK6ff_V{p_y
zt?+$wtps}|9Ivbpyl8wr<lp{27q8fRHJgK8{aAWi_3P(Z{M$M81vC^HJz_W?E`D)B
z7*Z$yRP5wxYOn@Xrv`F1LK`MqOcaX?`}yzp)TF~nS9g^z{j=wj*HWI3pew4}`q%Y5
zdRfvfCfF+-UU}E>cuM`Tmn9-TDrfxe{kZk}z4|QM-9<B>Ut4=Fb=TACHydxJ&Jr|<
z6Vl9md28#Yxqp%_J}KK!Q}nm!>0?1_J?2IyP%KZ<`_yL*iRD81jVtP3fvWAPzxcj$
z9hhPfocH9u`5gz9qR$W8r#d}V@qM<>YPV{;PMT8X{@<}n6>h#gI{8I>Gh66%+p3f0
z2b<o!nB5`ye6?%P_4cK~-H)G~bcy@?@s#ZKMhDO^F&Bq;<neisgn5HM(nR4c3#cHj
zuCLD9uURNG@zw@K>+ZzETFQA=PN`iJW`?Dl>zP-0E>87a?)jO0^G?>;Ew8EUTpWG(
z>m;AdojUukOq!`JfBr$CB8NyHgUY&soQcJMK!rF<XZQaZuG~T3Ce@Vk?cbk0?>h2w
z&X;)>acLP|b7EG1zq0Yq-Ky-X&Mter=0q;Lm;X#vNL_GW%%@bhM<u(z8eNR}S)tyq
z?YJNT)M#<;?h%GI!<g7~XHCch<))>(x7Tj5n{Qh<XRl^q+LS)!O26MvE%xqIKbL8_
zRC25Jmw@M6@_s&>v_7W#(VkNa_`mEBVc~LR5DM4Wbi$kyl512GuPowSsV4BuH~aPN
z{EG|K=TFL-*O^(fZ+Y-7*GFeAYkuXN!sDYK`GjMx%-i+Kv3}>5f3C7x8oY1Wk&`QD
z)FgE3{GQ?*vtwP^^59SNZ1sOGc_>-k`IAXyv*LsuDc`iCMEM#Go;yFhvSPzdheuqe
z#dkJ49OP^%y?pD#mH8QO|5TPl%)b1hg^`&%E?)#(rNfpe#LU%WN<7>C$ca}-eEK{m
zTgL;94ie^T`JbIxB&!{Msj*$&@9J!mrHj|S;JODYrWlw6l^Hs_Bia{OO;i#%R-+Nb
zdn!=UBW6w&cPT5QG?QaqRQ&rJj@nwwF8%xc)?HU`*9<eA+L`;}-*b4#xJiLp%`Tv|
zflYIpv|QU9Cj1OYk$SjTX@Z6Ph5}6vmx&Jh%-6nMZMQ&F^zw@j$$bltrO(MU&HZ~?
z-|^9+8z)x?gA5Q5RN*@eYH&TVmhaNOe8cO`qT_PQe5J1i?srr!+BDy=lqvL%rP}LV
z#^+u-1gD)_?3{J=<V?Ff@i%vOw=cVIIZf~VKW<PPa^@@j>c^q84Bz%OZLbEcn};5=
z7ryU~KChJN!pHtLH!pvU`_*{+$C1VS7e5pRznFYhI`aVU-HfZhPiG!nG=0TYf4}dO
zTpwib*>-=e!n?zri`g%0=H3cietOr=Ynt<CUwig^`i0;7ceY2x)Nsny-6@>;N3pPR
zx4a~{fIi64qFMOK+~WHE8M3#xUwp97_T|&(;^|wn4_w$d?^yzCb7PO=0fh<YYc$RY
zBx@Gx6m0*td)A(N4nN$E>g(3jsTkzRo)P-k{&WA*f6m82?ddMP!nn%Hzs!$J?LS+I
z?b@$jYd5Vvg-f$AyWW1jUF}cRtcx3!WxpK$6?sFU@MoXd5^xo8NSMR>(UaqEPa4ae
z`RTmSBW8c2vax;JsrxIucYo2i#lNI=dAHCb@IpuSNUrb7i)ulOfTvCNTALqL^Z4hR
z%V$g8d{n>qJbHV@tOd6{=eXps@$YY5<|E@7U%Rt;Ufm}_?WE6-h5a@yQs9=GlY7-^
zkwui%7v3J%MrE&0*<##E``^^vy%gwpGW?U#>B$}EIMuf<`((I12)ereWydexwZC4@
zR)1a{wYzxD?3L5R&p9s&)Y*IbUX7}7=i}GA%2vg=UH|$@dd&+_@A-bpsa`Jr*W>g4
z)%^V-xIN<M2Zk5+DLl3Y)m3Uj>X**WfBol;QsL+N>inrIRu&YkS}&JzX}x&Kt~u*o
ze!8rc``G-<uOgSS-{SMrEoKHrt+c<s_xD!wOTV@6ZHzcO*V6Rz`zo6&tAf>qBmZQ)
z&Dxdzsv>*Myow@e?YpPG-`n`~T&eIrS^d42R{gfLe>>wd|C;NwuhsAWc`VBA=h;5}
z6-QgeFaBTsaqa_dXOEbS?Yb%89Rmx6IkXBtZGZKlYO4K%mpi+IQ{Sri{fqOgvdzCx
z@BB3X6C=BfOL3Z?+VaqVnz`5aPp#5Z_!+uB@bWD2k9P|GF@KNJF4(eQ*VX)ce~s5H
zv;TVJ&>G9GSj#<Cr!&uqRi!IC-)Gr#uDvE*<|50+o+F0d_7hF4J12jy`YM}J`|)R+
z+C(cs*O>+P<I0NN=B3Ph^ey-Hi^smN*K%%)J9NBsbM&c<YmxdplM53KO;0~I(Mfn-
z{wC*Y@9jU=UVE>7EoT4q_?e=heFyz?<z1Uk9g99K?)qYuS$3Xo<BlcX{Hy2v+2<kS
z&E0){1!sBsk_nO={DLmJExT$WS5;L1T;I90_4UdBXN6YYJ|?tydcmuGC$`QBdj2>h
zzSwf@JML+!kx?s^Clwh_|23<ALCODOYhS-R`gJOP$N0N9a4b0)`|JBjclk{!(MvX!
z-#@s}<IcKX>6d%ER+bd(ttz=vIkhPJd(#ouFMhpQpn-pf6V;FQtZUmi+2ZfKyD`q^
zGk+XU64cuLQ=(kw|J!e$wr~2apSNUV>fe?BCtSQ%CiwH>9_Qopql<T*oqa9+U)!ns
z@1nN84x7n6^Xp~%2cKh3tdyDQT{!pW`)!f=JeNR+E3-b1|KKqv<ox<0Gpiq|d^+;;
z<zDw^LX5?c(*B3E-^paV&Ph2xeVWF+n%lamB~eum!sOnIPM$4rf6We=ot1@qm&!~N
zjWo-+=K9|Ea?qDclb#znooGsSQ#-3FRCfCJmX?=QyWVnM{`*X=Fskf{n=aGp*V-4I
zCB@@Z{jOiXqy3_FvV7-dq1CQ8a@zae6jtx~*Lb${@~uzDUVpi_{{6CL>2rO*7TcVV
z77~BM)&X909^k>`s9d-!FYf;_i?^MZR~Mey+4iURsr;U=_Z^o1Z2Vb%<7Iy8{YK@C
zOE(ruc255GWY*$}we!ye`gBe%@>zIv(<P_Avj3IOXYv0$Z+J^HNV?N_{|_bIvi%k_
z&8>Dz&v)5xVRc-;Saa#+B9`r+&M@0=Ir4FmSLYAgzfTs=Tso!7Ugq@N%V#I6_I($9
zHC@O1!S@v5%{xOJr`32_#Vx9Qs2aP8Pwk?4X5^y<mrusIsdsKpUNk+$>t@+$qdi`8
z_APv^Dm-bs&hMCy?`}u3zj*Oyuio-27af+D`OK-3KRd%r@wI~R`uc>sc~P(D)ihnR
z|7o%HMTOzjyjv#E?{WxD7Z%h5jUqKp64Y3F<fUz)RD)(h)w)AM%m1_O@|-h$|I}yS
z*gvyZ7SHC?|8BF%@$s6S6Qciq;Xhl&xopbj^7DDOo#Xbr(bc)Rz%%gRgdGWUWp8@O
zn4f;9;=SBs#vP6`@#R0#zFtokkDT)5WxwBh>ywKv|K4R|ER-%j_tv+cVo~28&N>t1
zwO9W9p2%5(YWmBopLJa*v@V*uSUF>-QlV09n#zUjC#Lb8o1@E4=6;s4y!<HhU8nJX
z1IxnipOPEzUcdZu+1G#hX&27%|93oIvZrH}d{M{pd^2yE|NQ!(UI#<t3&DuX3wZzL
z+?Wuc_3UfA({lNRONGjc8J2s@xvY8fSHAiBppU=I<JV1It5CFPv*q-n?jw~yrRQop
ziF>sxOg;cwXCZJb?$M5)e$|gEVz<7rHJ<K!%6DB|^Rb!CPCiqrba#ju^)1=?wsUfA
z;Ie=d>s>abpR3u@pt%5~e&?bsH)B>k*rM)N*Zb&iS=x~cnU;%|WM191R`p7n%DZib
z<zby}FYSe@IG4Or{><XHQ*rm(&dcT{pZP_K&Au=D_|MYvL)&YW)0%IuUES?DNAuCM
zh{-21C)gHNZ?DahO$b^WWtvr&cs${L#u7I0j9mu{<G;q-kH6L{s~3JU4|%_T`|sN)
zs-NBcc4VfY_3r&Y@2=ZBG0k`Wl+psJ^1e5nllPymvaeoeTUaKPzRuv1{e;=YmbGdQ
zDr{a|+7HzVL1V24o7peFxjD66Ozq*+8BeF={oXJqPwGx%+3T5a{p2Qb?&%YlxnJbz
zuh7#kT^7eI50<OAaPXSv%x9k*H@|zvsD8(8ugrp+H&52An0xu8x8M8j2dDlyMT)%?
z>wIidm6^6ZJ8y~jbeq|w1|1#`bPpf9wBzC5)i&L3(=>Nn_1@R>)8&zm-?Pp+yJGS-
z|0%q5TC3{&-l=78f1R>2yyE+4hmL;D&h@wF-fet!#kS{MPJP`qx4h5C7UxIB|9Q_9
zWmqdEd-0P?R!P-ul{<?suz`~Uqf+)%7e&=`nS7JAh0_1=n))xlvfTdF<PD|EXHB%&
zzJF>}bkoi`llWC9T6Wdh<*d7GoLawk-u0VP58F%4E;gIF_(kfhUy^$z_8e7CP%qTU
z7gqOMq8YsG!uS3EYj15%_y4FB@?cVub|LGtN7lccYAWv@3CWukC%5YDZbOd;9tYm>
z`>rfJ>HGTee!XS<&(19r6OZ?4m;L-WD4ZL5z)XvWLQW_5#w^wUvpbpHx-QPXsPF7O
z=fv$TnU{a;OrLgO>h$TC8yJ}v-52OLZ8=iO^0aK`pF@wIT2$B{-RU{UrTB}!Z{4S3
zTYn06tU7Wxrkpvq^0BGavqRl48mhj2%9aTN75@wzF4GuP&h5-rD4Zy}&2_DW_njm6
z)T1YwPuyPr|M&ec*{tj9d~a{d)y}Hrij`1VccEpXJ%`8)hWlIeq6_Wk{^^eU(#*`K
zQIL58at_Y|5tc_9g-ai_7+v;mc<%hY|Bc7}n$Nvo_nYjoI3;mybq4$Fme9l}%uI#q
z2h4R{9zD9(y}};UeBB}3a3ry%0n`z6eRN2Uf5MIpol5osCmb279~roSMo<-gvM3au
ze4o6<K%#NW+fHt`)mg8ayk+LkQ&H!?Y5V5pI>()Uy=TOo)-}8TdG`9tACdh(MIKe{
zZjYJWFUAp-xA2!{#@<Eq?tLpcB^(vBc-z}Waiv%IK|_@e2b3EPRL<E%UVOR5BUaUN
z?k7-t^J>WQtLm1gTz?g7e!sgtetFkT{>gTG124_uKl7~e_}f(T@SXRR^=_a3a&PCd
zv$J;3`|~D@@BIIN2LHF0xc&b8a+aR`*1aZ~>0&K~4fE%j&+dQ4b!q)1v)tvuZ3lgx
z2L^OrKCkoG>8_mY0_{)@%Z4L!eG=#OMXsphs-EDkYyWv!>+>V7Yuf|lK}nUvPl2cN
z@_B3H`Kqr&oOiqP%*Z_Z{KhF6U%8Wq|If3iGN}FflrQW4L#0<Gk>;x{D)+rzHTCq%
zOQ);DD(A1yEPTT`^LumfAHDXPrAKPi)<1bs>L-;Z-8D0O-<}fLvz43W`V+UP=AQ55
zZjQXDa&FC<>nEm8WA<A-KT_#D%k6!+(lhyIi8zAxgqD7On7;L+cU`O@`{KF+WBZIr
z@(ep){e2ob?|m9y#?HR^j@#ax4bM1rrn1CityScC;mi9o3N8QN+B$jKz8~V+^?zqx
z-W=dl_G`N7=b3u3)^lg>jJfqntke48l4~J$&%%u6uU}R_v-r19UG3$+DsLaO=X_>%
z`dFm)pyYDZ|9g^KUd#VqASSE7=vBMSBThwsp&#`X&?MY{U=~}hQ}qq=?dSaV&3VTC
zm#y#2uj(AT;J0h9Pmem^@HBnpNAJ_~ZL+`Lc<jH3_qE-Uf6m7vu6w$io?=@0|IxYd
zi*Ksmow2zSeckuG_mA3n%jaHNHdnRA$bNV3nYkv(w^+R8=GT1^tom?q{{_&k3gTtE
z?)<8scm3p_Cmp|dbDvk06`$X)Upwy?uhhh^6EzFt<}<E&KGpC1g2&lkjML6Of9(1D
z&7%6MxE0U4^M1a3{`TXYE#dbaCCzqLa8_3y+qk7)dH%1R2W_rDy$%_?-(&io_gB&6
z9SbU#E7m$H7pghvo6Gwpd&zin@t5pe`61!$zi#QxhhF`9w&ZEf)9Uvrhkno5-f{2G
z{d?ZLQl=a8<ese$v)X&-&KKTylk{xYl+?`ikZGS8`7R=<#P;X1hb41s9G|$o4p~<z
zvVP0gL*BbC{1DuJVFhP-w70BDeBO4CcXwt6uif^k$ZOr>pR(4`=Ek3YNnU<;f@9^r
zoiC;J?G37PBI{q-r<j17Ej5RGKB)ffeqMGtYWoY$sQ*VEUs?3{Xvpr&%X2g$9)S9|
zR<F9&S*}0I3h62zRW10uMSMm5c4H^)E8br;?t9FcS6}+#N3l24xvc-jzdj3duigLC
z==_7%X1Nj5XF7k}U)s|4x*}?K@v1)yPJb0*6p+mLw0QAL@!GtNE$yz}Pnw@s+V~en
z`(>UBo-?h;>gw^7ms8}gPkrw9DMHIOJo?WSd0nxX@Vj=)Ef?OjD^8ek(Re2R=|5-Z
zBzx7(;pg5t@AI`)vF6$H=J@}%IFNYm{hW$t#wQdDe+K;4^S!9j=Nr%69=+AK^Ky|Q
zlQQ%4`lAzPpIsVTmBP~Xf3~ty6=&lc&c_~C-qrur{=B+JQn4heEOX!b|9>yux*9pr
zajrVI^8Pc@lS&VVJbJP<M!hU*+Z+|Y?~Tzb?oLS(jEgn<9(?xyPx-3%bIpy-e<U0>
zlwDbTcgwatAFcL&f4zI<%gFTLzmxq7TbVk&kI!(_c4T0A<f-7(`FYo@lbQZ9?ingl
zProeQb!PvUJzY<`o-_Qp_r?17rKfl6C;k+iUVp)Cy7sK-%q7paOt`u#xcy)GPorBq
z{wcq9ubij8@yE&j2SN*rCs~Mz^a-8pf8ck1ZgG^C<{_<O<KUXJyI$>_JTE)5^Rn`l
z8n?Ay?jBkBJ#wO3=6uUlbLZv0Hnq21e_81HvF_i8R@RmYi9efkD(3UTin`a`;jb5Y
z%(=g`Y}t{Qsgs`-G5lj~Q4ad#&A9k-e9qs~lm1Nl_*uK^(k}k%DTfYPx#+$)`?TfH
zzS(c~Zh2yP$y<2KgqJJk?=vizSEZ@)|F-weWgaIF8iv({-=FZ)^q8K|^7_Dw@nyx4
zOwX4(e=0MV=``48y_@;!%IXNmYX`*NuMXHVY0sT4Wx01e-iP`uZGL@q@#1rv@3)*a
zZ95{m{(gWEsG9kqDlqNHPfL>{B0{_6FTPdr`~PCCYTxZMO3R-|RmCly{-0BRdGi|=
zh9A3>rA?MSeZR+X`8M13e_wvQ*R`Cov*V|;&Hrfm9HpQnxqC@++a7&O-rDD5*{T0L
zU5+cAOZ2(&XM;$&%ulA%yS;v&`S?WTdfeyJs)c2H?%ZPU{QSFD|J_&C=@$0pa+$lr
zJ16TOb)6pPxjJm!3Za)pT}OH~=lkiz3Ne-cdV2Uxm%>Dg&Xk!#%UQ3kd_7mGWLe0s
z-Xkxie|MGNKQ}30{X~nr>oeb;KmYs4ibs6kGp0OqeY9iQ|G&?qqXG?k_uc%+x^>3~
zrC;@?THkeFh^y#H)pkXJ##S0+L6;Q$m$U!*!g=OfzMUSob(fob-Y}VUw{4-4{~n%6
z)9p@asoDB(j()V#{`%LamB+d3YmZgEe)CyRVP5r{n}64z*=Johe|llnM2nlZ7W$tv
zx%s2>h~?Ata|h=<>xo)$rdaxn$DGMWo=SI4{&Pn~Z~iyOM@6-_y?$3Zs-8P9<kfp{
z^^upnORI`kOt&qJna7i1eCAEbVH5xIPn*@x`8_&gTj=+6mU+s~pK%vn7R@q#toQnu
zUi;31-6^qKF3+=wyuLR7(H<2CrOrL?dl%1K8?}AyrgA^K$ErM=jx<dV&UmT)c0ym>
z%Q#iPs7>F#e|~v+di)|WS?f!V_G{*EKT`YQ%+!nR%~DMa91REl3xbZl+?XyH@@S9V
z!Yd0t-4$N`+%IZbN~f~CmQu0q%;g{ZcRYKg?)O?Ga8hlu_5NR%tp2oo@B3R8=6aIn
z)oBI>QASS}$B?wjg*$aFy`E>WT{QmZo6gHWKNdOf?K=7HfrVwf<F`LgZwWtFDSWmk
zl4a)N1!d3mPF!0QBXoSx>eSoWzc;As-P*pO$!m_?+W-TPj=AhRE+<OFsru!KJkt-^
z=_m0>x_yqC=Ay;^%eO>d()fD)G~20Teq}M!GUxeasu#*=Uin$68!quG*uwH{gJSX;
z-V=B1_PYDa=Nvlow8U%gor5-iH{CWhztrtL@9dnIrPH}@%=KHDzePSjYJO*tTlJ6W
z%gX9h`?7cb{HgloU;aB#^V6wazC32xqdKKRox<Z=q@!lJEe;W2U=;Y^_@H@dxsb}b
zPYcZVrZx$l`s4I(+v$)0o38sk`eD4+s7pCv`?uXztKHws6IL%u_;*6*vq_xL%J1K1
z7pv|TUVS#T$I9u6TA|Y9Pj9S>r8{;`33~Nti~IQ=1*^FxFU2}1%Wq4axrk3=$6U3S
zDutK)H(fpU`sI`|A@!P~kDo5tt(#tHrMmyQs?@7*ThFK4J$Y<4>4!JBR$=GzcD5!5
zIhId*Gjrp=Hg4VX2{JMoVi(3FG~JzDfs>=b;R&b1Bc=PU8&@v4J?Yd96D1{4-p>KT
zc|rR=FYI`rQE94BsQR8ML?M@@>CTpY_ck6D+WGgpa#rERqgxVRo}MRf*x(U!%T4MQ
zH`7N|hR*K%`>q?GeUh=Ryca9-TH>pfXj9^Zp8;1YMHkmi*pU?ACaJK1qowfjuY_|t
zw>B23%2xb%7_!$h*#CTI^N~b}CI*fppauDQwyjl74*ysV&ES6;U-c*0>CvP6?HP_|
zIA9~wXOXTC3u-v>)TM>nfq_K{w8V5`pM$Bxf(DjDv-m|fbWJ=bB_#gxo#WF<9tvk>
zU%R&b{vzjM8-W-Jr$>`4W?fJd@K9s)h%x#tx|WI2SK-Xh0I8E=F^|4*PW|)Y1!!LX
zNlv-c<m+4fX1*5gKl3=LIPAn@);#{HC8w2qkAI%spZTG~EO7eNSIbyCy{{NJB|9iU
zPBD%%;bYNZS~`ngm9^ec)i1v(CE*Xl@!(rqls{X<2rs+5aee#s)$cu--deE>iT5gU
zcWU4D<rWg}eb11^$fPg<bc&<7attF=p*m>$OEc?Ki_KYa%6`j^6xuyw7Wn%4>X*;w
zeniRsot(b@<Y%KFky2&+d0VR0*q<+5-1O-W|6d=|eQk?dJ{@qalUNq1w*R>P-#-G0
zi{~pVU3A->-W{?&?(nM9`^(-gS*ri9HEi#Go6l<`e)`N)UA%aj-<%yi)yw*ppWmLh
zP}o1u89LXmvU2t9&tWZ7^Ms$9gO&<xo>!HW*v4p|5LEo8BJ(Np^7B74k82s7e;d8~
zS8dg=lkOh&-F~-J-=8%0eNtI|X7+s7!{=8$OyBw>Yrjprzy0!~HD(JhS8#kQUF{RQ
zYPOh)Ue7&eIndfz1%^&-w$v$%OiNV*R{adfjFydi={niMUQoii^YX+k%g)Za{9vE$
z%KHZ<9^pUp?6v*xlFeTeUQK<!ck<T<o+rKSuf}=H*q78C;JvHEIf32N>xjSi@{Yr~
z$?uj(URFy{>Wb3KzPfJq*9Xpb`TkDu6_kGIws{V8{ehaG|4RPn+uxiGU&F5Vs;b!T
z(}f~$p?5Og)64&}7S_(VtQuPtaVVu|*WNd2t7iN)D6B1!jR=C>kJ7&A^l6byzt5o4
zOYTJ0M1;JxIK_N(>Ak(b`L<29c<C+{Ja@lM&)nagzx#jte0or3`~Aw}u;?GEf9~Ac
z|6`?nnw-bF=?7nZQTxAPU)j=KnU0SRy>fiQ%%l*I!8Boqld#)Ag@7thikiCb>BB1v
zG=$VAsv0LfyV4RPbo`~ot3yVICnehD#y!_AdZbb)zczNRpn@3tI_>ASg|BL-t$CTz
z8(X`TA)`~cQQ+bgp;M1C!`<)cuQ=-ZO1a6b{8@;$Wc>2amxYdJZYlBIpDoj?Ie*?m
zm8Q+Lx3}rE#Vua4>)si!S81m<pYwO$xn|#Nt41l4tR(HiwBKJoMSGV2=~Pd5IqT<e
zeA?IL|D4^TL%&|Cvt^5T=i=l0Z&vPv_3<n7q9*K6*xR-R6u~MCo!V--Q&^dnDuN?8
zO1f^P>qLu{+c+kjm@eNrS+z&U_0J{mUq9~qc}x5>-h1a_jNi{olb&x<d#lv4ok!aH
zaohLvF28*$FFy6YStfM+^zSdH^JXp*liMp2a;fvrx&LqXO8zul8nAq_k<Yy-i^xYV
z2VDH#OzWFfn<8g*`Rglbqlp%C=WCuXGfI2cd!<PI?c}F(gZST_=GK2XNi{u9Bv9p?
zNT~Y5x?d;VjbrZqc)cQJ!ulk_KA37zY&lQ9^K#}}7MD*(%8`lgkGA*}xt!G%Qa4J}
zn{RSE*XzzQ-q$(&1sf;J7ze#5Yv>Md^IM)XD=5S5(HoV%#cg#;g+dF9yJKce3|H?=
zKIb>l;+^oi@Vn0Zi+Nw$ZJ9QIR^Z+b`}3w+P6L%T+9CZ)@d6Bu8$c&Ht9AWxP^e<@
zGL`R=>->`y63}^ha!qmn<lC3F#4Hb<rF3+1#m(ZS%GuT5-)#HVvNh|jS@DMs$<E^2
z!pmPPGMOa2cx&=^-Nu<5veRwTpNVAKPrE(O!ZM-dd&y4S<tDD-?~@-tol-XG^T%ah
zXa0@Vv5)(HB=hB^MNhYY&WMrWJ9OCZo06619KY$={&V8eR<_3cn)Lnq$09G?bU&fz
z+tmH0c@?>YF6#U<Pr6=b`I+}>lb?lDf2;9jKD{i0W7$;xWm2ZAmP)qg-j1()%Mi8y
zBm0%VUHURN-`xCcD-}`p<bUCx8Ch3;tWWQoQ@@+<Y$oV9if>n9=Usky2sC&OzL~Hw
z3A8P0aZEtx<5e%8W>)#nxou~rf4=kba=lYUZhcE$Ojh2xCl7Kcj9KxoMe{Ffo>i?a
z+O;I_@{iqh&hOV(9KU6)&R(_W_qW&6uADq55$9jC#d>a#adFoX&z)sYcg*!$x%9M6
z;pKNKzGs_f@=F-C{rhvoDf|4~o!c+mNmj2t^)|D-jD1z`cZ1uuh0o$FZN*ypuY1V!
zvu_XfS$ekULwC*8FE1CweJ}j=Q)=3)Wvw1_=Ere8kMfvfef(<n>dRBFn=Msjwkw^w
zLt!stC%4o3l1#<IrOEx192Z1@qE##GH&?9RLDh8^cpd&J2$(a03y3cJAfL|aKmOh?
z)^$~0*<LRG$~a+B=)y?R!<jcfg_^vE-T?Qr`2B`^@9$-+%q@s9{+0E7-I;XrukW0u
z$C=*v^=F~w<CVWf#V5?x@M3v=di}oqr9Tr&d82+DaR2gi+uyY>MD`<{;<5JY`=9++
zmdWj%9r52b?}&ZomlpY`z%{!{_nj5e3+~%iyV1V%{neIvzi&H+MallWROwYYHTTcE
zD!uG08~k&uPxc(qjgGs#>i?Q9UX$W;iCb!1d8;zc?dsvM+S;gG==XKXw*tGl`u7rD
z|6R*pG41HGAIchgRh=GPy8S()vWjC$mfaMQLlbr^`J!@5P#P8lE^k!UxxWvb)=;`{
z$KI`1COyr4`Dkaz_H&A$LrzK}mw&tR{nu&T&dp1*-pOvsI$M=>eWCc*o$EGl)hYh;
zrSi*-MXFmqG!|u5q}U!;ovmcQes5)iYW~^TZ&hXg3PnCFS~j)bN80|M<FpmM^GjOP
zJF~z3zrA~Z!1*pFz6Z}QR&khfw|1Om|INPjix`WLP@4Yh!~0qUU0m3jW_KN1wR_vT
zwKu9xhlXh_Q84vBbE?Bj>EOQ$@~3s07PViP&?RuV{okIWGBZx^-2d+H+xO?bnir=%
zE1v)D-@Es7X8t_$CUM`HnZ<m6vQNM5>#zHL=i!x?+{cU#`^V(&eq*cOdr-G!rNE7_
z*v{LUM~*%ae<kS@!Le9zLE9%j*_uL~<Xdg2UwO<P?vAg|xnJ;7RiAgc-~9JtXa4+8
zHS-pE#PvvfmV;Vj(C39dXQQhPo$lP~6m>{g?Xa?Yn!xmWm3LcCDct#`|Q&W^|X
z9;(|{r=5Ovd-uJ{1mh#E_40p@xVPE=dwJ9FL2Uhx=EtvYH}lt*$6V}wesg8&?YFAj
zt3wPES|142?f<{<^D%}md*%0JoxZ<pb?)h~M`HiQ4&R*F``j%2gIM@!r@7yy9zA+>
zdyB8X?cY;VZyT&l=a}~Kd%$&V`#=4LH{PZFkNqcjEw9rlW_RuCRF*{!D}M%P+bSH{
zdTH{lz+R|pJI)v$IXZRa%43b&|5nGCR{ywI``GKsqg8iJdXE0hzT6;wPBj1goO$ML
zYmOQ{eqPO6_;K3VQ0F^0UrGM6i{1HIValKX=}DbVF_zV;!Y5P%vh1d;u6?cN-Vwch
z!)M_L0gcj&LXST2AGK`TadhgAm8Txv4+?)&HmUD<_d>&({Hg4--;`KC^;6V)^Rl}>
z`dZu3s$=il&WnHNdM0mMJ6+?**Sr6t?ze*-zUyx5j#U4nIZK1LCf{RPsu$%H^RIBC
z@{zTh)b+O5+>_ezYTk|=@$Wvz*SwYJxcBGS|D2oWx4nwn{6k=>#Hww#_m^#39{f)2
z$kBuyw`~g_PP?!n{yp#S|JE|Q{>(qTbb7pr-Hp`c?Q5UN`_@j?IP&%C{ixf^!S>#A
ziqW=RDSMS;%B~E~!k_IYWSnCHBDOu+^5$fprhhHZQ{R5>{l^`|4@Yuzo&L7qt=f^+
z+Pcf%eCKc2`R3)6jeVClSnl0w^8dxl-7bpj-v7J)%Z|nCY<twj*zAsOeLud4g$sJC
z-^)KbJ@5aA-|1~f%L|TvU*B3jq4t5Zy!8G3Wp{hG^Gd7d%iTXL7B@-u-rLeYC$1j8
zdR<<6+FtgYkFxpY^L~CR_^A3`HvZq=#p-hZ_iu`R8Sr+-esJ`f|KDi1U8?H%tvgjq
z=Kta-{QG6=v~Su0S$0>te%x}t^RPe8u;$aZYBS~tj==EP;?F;-D<*!v|Nq;cg2!v6
z56921&$B4Je7t?}{9W0fRc|s@Mt0o({_1+q!kpcoZU4>9=c%v!cK&ea^?N34`_JF0
zE7|ewM5m$4z1mmTKC)hY{?2;+)VeEuT`XR)TUvH>v7Pi;HOu&jZ}@q!+3&X<o5|DB
zb5-DxQId9-gCZy^Ii0P&*&SQm(S1;7&8)w_#O-Cz)qLAl=&HxE;^n_B>h(YVK3qBf
ze+5s;^EscpKJLC>^Q3X9;=7~baS>JfBpxlx`gUaRo8%Lm<(dJn7O5V2yJ>OFT!zLZ
zCZ`y~DpuhWYM_+4wlP~T<E>)eu2-T>H-GcT?OE#i@Y>1C@jp&|*nR&`R`FAvn#8LD
z+b{og%HYwiU2b1{I^tfqe05RGU*SjBY!=P=w+5W<?nLfBCe9%+qq9N4d~#m^FH^RN
zhN*m1^5jkPtg1V>S7#S)&AoMLgM8l9sXKO@*Sk|D8TP2m>i(BImupKN7H<zulTGG|
zhU5$$S0<J$L4ikEFEziaOwa>4GVXDu>`c=mtp=ud4~sS3-Y!>}aGKfw(7&DA_x+VU
zd@}iZZ&0ECouA@+^6uOh>)ZMMUT@*oyh8swUuyX4UKSsI{XO17#-^mzw(zr5@!jjo
zuDE)4ysmq}s1VTGAQ0{wzEYzx1r*#*xX!;hutrgDi|v{vt?hGLmIoh;);zqn|6zL7
z?xzo>`|>Mac-lNW@zJ{dzTMx`x%2%jZR2@*9(qoBsD95+(w=|5taau0g6p!|bLM5I
z)h@Q`<pSrAUOtVpN*5M12wYECo+Z@C3bHE9Drv1^p1wz+|DCe!aqph)=~E4eS^#pG
z_TA5?)EyE+85Q$FHSa42tOw<|y-Cs8ddP)T_*sF#K&HFg9EErJ85v|brp)5s>STRe
z;5yfaGJyyYa1qmkJs>rYoLy~yWf^m0khremy5@sRJ6WUz1s+Mg^f;@q!DYdfniW(1
z1Rq7c^nAO*4eX-TX&oyx8sD+D?0DDC(7*@IstZMjjEdOkj%-#BYmPO}59Vdx+uL_p
z&vv=M^tmyOEF3CaTSOiSamu(Y;BFN9yf7@`y3nJjgBSUjbU8T+b-ym#CXf-mV5-3N
zGW|?hMkZC(mK~FRol=_+=+JSxW32`g%QqE;BX2v;A7tSuQBXKidg0_O4+Rd6!efk#
zOj|iP3b%flQYEAi;N{Svt-3yjX(nsSjx!IslpPcz9Xh<16)P|@&E?=IoLjObO-SHT
z4<k~tcUUl`K_J{KZYPToH%H;jgQD&Y4x1VTru#g1U}CWn6nJFyGUKd@!VyVk7LF@Q
z3P-M93V!9;;GoH<xNgdxBF+>sfk#Ho+I<X-DU6DGGtV`$aFi%194WnYah9h;hcO$6
zzzSD~j@3(-UzyC%7{ufhqgnY>=!A}d(;RswkbxhjeXm_RZPhAuMkba`+#H2BPl|dp
zFfcN)a0n<kG%z%lFe>WZJvR}`1&g21Q#g{U9<Bi9fD|z@v0P$p*->KcE(GO*G%M_G
zfQCKTB(Q1^ae+sd-pY6)+wg|TDW<H56-8^TLx=bFVim9+sFB0hac0gjzcf+8gptXT
c^&k7wN9=J9Vl7h{7#J8lUHx3vIVCg!01=BMr2qf`

literal 0
HcmV?d00001

diff --git a/tensorflow/contrib/verbs/verbs_with_0_copies.xml b/tensorflow/contrib/verbs/verbs_with_0_copies.xml
new file mode 100644
index 0000000000..16130a961b
--- /dev/null
+++ b/tensorflow/contrib/verbs/verbs_with_0_copies.xml
@@ -0,0 +1 @@
+<mxfile userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36" version="7.8.7" editor="www.draw.io" type="device"><diagram name="Page-1" id="74e2e168-ea6b-b213-b513-2b3c1d86103e">7Vxtc9o4EP41zKQfmsGW3/hIgPQ60/RyIZ1rPzHClsFXY1FZEOivP8mW8ZsAB2yXtHQ6jb2SJXl3n0e7K6cdMFhsPhC4nD9gB/kdtetsOmDYUVVFUw32g0u2scRUu7FgRjxHdEoFY+8nEsKk28pzUJjrSDH2qbfMC20cBMimORkkBL/ku7nYz8+6hDNUEoxt6Jel/3oOnQup0u2mDX8hbzYXU1u6aJhC+/uM4FUg5uuowI3+xM0LmIwl+odz6OCXjAiMOmBAMKbx1WIzQD7XbaK2+Ln7Pa27dRMU0CoP6CB+Yg39FUqWHC2MbhNlRK+D+APdDrh7mXsUjZfQ5q0vzPxMNqcLn90p7NL1fH+AfUzYfYAD1ulOzIAIRZu9y1R2L8+cCuEFomTLumx2mo8fEf5kiduX1DhWIptn7GIkQigcYrYbOlUKuxB6kevIkqjI8NkMd463zqnK+LHihrtjL0rfQ9+bBR3QZz185NK0lV3NxM9olHAJg0Q2ppDQm3dJE1tatjUjjqbOS+tfTSKbEskK2lqYcsua+r6PbUgRJwIUhJiEt03OqfI5xyhwbp5Hn8d/P02eRv98GY2f37VhAam2c7PVBVDGTg5Elmtzu1OCv6NMi2FbaOru5iuBVQLp/fgFefwqRhnAalcC4B3jngPghGxrR/ATstfPkTs+IAqHkMKbC/GQ2oD3ZenEsGNah+/ZNeTbLrTnqHkAPiHYLuxlHAjKVCBjg8q0+XsBGahtAlkxjkcryGGRnLjFhM7xDAfQH6XSu7yWMxr9D1G6FcEoXFHMROkInzBein579RjiFbGTEFIsjW3oM5R002IZX+NBbRPkQ+qt89HoWZpTG6LAiCQGBMUgJShc4iBshRvs9SfGDX4/3AZ2s7QbUcAAL7dRGsKvH79wyCPisWd/8hf/6EZv/2PlEeTsffs3B3dT+aX7tv4r0M20RbZfszff+GC3Or/dePRr7u6bmKgaJ2hlTkhS5fo4QTz6iD22lJ0pe728KU2jYKF4UeKp1Eh9QuA2023JO4QH5jELO4RZyECP9E9SttRH4hWkHrPTSTXm00rMd++RkD57Cw7cjjngf1UDLjjggmm4LPJGjN4kwhvMYTBDDmMccF8V53O8mK7C4xh3GHvY1MOMjYbMbzjCLgP3LW/zvePbfDiHS37p+mjT5xWfCKyOuBzaPgxDzz5Ioa5lI9vOIr5bRnxJzVNL1/TKiLckQYBaEfAZXesSVSeyM3kBFCI6tcgL8euUeKE0kNbt3jK/MUxLUSxD10wjP65SjW9OgHjiHm35y5k+Id0FuhflFIbSu1WtHlAVy1KApqs5U2pFU1Z1kcPDgoo70ikeUi5zfkNhyUl4OJh3gbypRUFTUuMUMeTQZoZHTH7Hudbj4aloWHiOE8Unsi0gH7M0wd+gzN+axH3UOqK28ob7Gf++qraK8U6bqpblw00VQqJM75GFyfI0r61yMM/+5MFadgVPwwc+xAvxorz0Jq7SdaITI8qM/e61S3/zqZsh8cvmQjigH9+S28zlRMK2i+2q7tWqKdmrW8rYrKIFtWr74/6M7Zwd1CwZdFcaztDBm4eJ1mqFA1Q4fn0jmU6yn+WQYlZESjtRrVpIdTTzxDi2OJBe9IWaaimleZR6ayOgQj29EfeTlNbOdT+j7H7gstzvcPZjgkaSqtIXEPUlVaC8JdR9rDqIo7Xf6VRVUlqMI2uCN9soQOXnDPcOyh4veJWOF0oDyyLj6PTkY7BmYGMXDs+p+IGu7/NPl45Fxa9S0ZEz1aHkdJf7aeBE77rAayReGoX0tEzjzQfxxePWdoP4JN6UAHyuVIKAyNFlCEeMSEnGUHzEPXY6vVbAXMX2ghkT6Ondc5QevFf3GZ05HnH9aHebe46DgibKBoqp5yzbKxt299Fb1rDFHOAku6pN2ZV/J/EnW9U0jlq115RRZamEYO0iL7o4CiDsHWmldgSu2+1y8ihBdvjQnzyMxuP+h9Ek/1Vcxt7xyCUWnjbgBRdWBwRWnqoVyTeq0uiyjkKgVq65Nmf8h9FzfzLss3+eRuPHvz+PR1cHkDgA0Np0AFm9rXH0XwnggP21Vglg/0lALfYv1s+vFpdY3NDbtHhT2XfNSXUi+LhYxP2ruCF3Qv47M8VRBQO9yvsOJYiyVLLm9773kO+E+VfPLpBulyzPHaSp7sRjXrqJRQFciMaQouWE2V70XGCKJtBxiBB8R9v4in+mPYk/0z6SGZ9w6nUMuTwvNqaGbnTKNUDXVaMa4KVh2MhjWJV86QRkGbZeB4ab/tWihjAsg1m31PvPBbpUPweBfoXtebAFkmCrOdjKvk+8wvYPhO3r9ucrsk9Att7mhpyMcUV2ceqC818+viueVF1xtwd3hqR2XRfu2G36XxzEJ8/p/yMBRv8D</diagram></mxfile>
\ No newline at end of file
diff --git a/tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.jpg b/tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8bc69b889d68f0a6a8faa64f08eab22637646842
GIT binary patch
literal 88799
zcmex=<NpH&0WUXCHwH#V1_nk3Mh1rew;7xnn3+HTL^3loGqbR<fB**@8!H<p2NxG7
z2PY>N4?hnVHy<}AC$AtcAHRTrpa2(-kg$+|Fu#C+0LTzVkWOY64i**;0d7ui0g}Q0
z0}O&3OjDTynHiNBm;@P_1sVSzVUT5DWMF0l0|qEyWn*XIU}j|E{C|WYPJn@djgg6o
ziJOC$lbwZ;iJgIwiJ66!ja^Vkm_t!SR7_mSFf3x~<*O2ofuW_-H(yaUGBdXbPb{A_
zW!lzDPC>zGlb39{_?Xk!#5Ad-QN^Wjo9e`clFlJTWg8FvKf)l*2(=z+Hv<FA{QpN7
z>;xH@7@3$@IoX+6m_UvMOA4|Gi6|NdUSt(^3`%UAxNswzsFIO!;iTY)A55GMUQ+%j
zmej<)N#*}71|DWc1|~sfK?Zw<{|q4q5LOHt7K>iy)awa5TCZj2ooQci(V^~`cj2U|
zR@-LgTwCO*amj75%Df|KuKV{nACCXeAhoYs&f(SiMXUGiu`R!(|DJWl{r?QzQT1&F
z_0w!c>u+(cm@B?ZFYv<pMR~3hAC-Q3Ho3BD&HVOg#p6qMY|fc}!_RbA$YRHdR%vOI
zKgtI({+m@&^fGkuT+NLyxBm((SA800F>Q_Zb9akRk2GD<nxo8(OjdM!3kq5CRWg64
zByUCi{95Lc_tuBzUgMip)$Zu+96$eX(Y<wT(~^y5`M8}u$+oF*s-|qMr_%lII^{t1
z+V)BQOV1Ur`j}_hulb)L!*<)u)LwhlTRu#?Ww+MtneG02+T1d$YKb4)lpaON+U>oP
zx^p7qzq<_o8Eo|aGu&cbvFblVdavHjQ1^~r*FBrw`3hOf+D!5Dx-#{Ycls`K6ZMWs
zb^JnmqQwrs?fsJWpCM!0?7NSoCU<>VH+hAzZt8}2o_$aJcioMNc;znK(p-4l)AOI(
zn=Q9G&h&KW9*}?G@h7O0(^S5ryYOG)mj^E+C*3HSRKNK0UzKMWX0m<POqEXkTem6a
z<>vHVEo)}ECr-5SR9Vxp$L#Wz#kS{LT9^Nw`&Zj&&HnID|N6yu-Ps$rc-<=Nnta{5
zZ9bOyX3e?!X0K9I=Dsoy{`PFKvhvL=<yXwt?#fpAC)lbjNENe?b^XKk(R#YZv#EQ^
zqJsMus-!LZ>9=#2)t{_~_9sjq{)_)oz2$ms(TnAm>`coxrmsG|qUfZ5^5u&Y&u9jo
zvf{kdb$Q9dwY)l$mVCODGWpR*&cA*yr_V^e|K|GvoeehAN^<?LcwRI(_S9+8V`Hfe
znTK4a3q9hQ<8;MCpQFEiv*5exuZ{mEH2i0<wkpVex1c2N@R#PrQ{6V+NZP!KgY(_&
z03Bhqb2$}ng#sQgN%6DWF7|NEcfZ#+Iyx@zdph$=)XU@v3AJh)_9`xYS)6`}nd6+u
zj-aP8LFZfbRwYjSbXMKG^RTk*zN<&olWi|eR=UIgwTAs?#*w-ydljnkUe9ZLFjwu<
z8ez$`Q)cFS=EcnPZn&J*l-uE3d|Qn(tKdiPR^^qd`~E5aXYeW5#kpghvVU*H6XW$u
zuQ{b@f15n@s8V)^r_j8D@AjN-Zt2(R>?N2osF|7Bb?444)h$`wUAg9PGw0M3lNFy%
zlFnQrxY9eQe?><}&za+0>+5f`U*wyu7G^K;Il5v+$*s^mTxI73bE*r)O`q5-`4^P)
zU2e^_TFd<KEwlA5v*+Ah=NA~Lwt3U-*THJ9cvbf9{iJMkXYZ#w?LRNK-7fj~B6|6{
zjB~LUgW5w&E-}gX-!##k-S6G=aK*K}B8%&XQWoF2+vD{kq&kn`KZE<1<G~+%lMk)l
z%J2J0;(Lf$hpGFNi<-76)tcN+9aCDDoH%i5k+QPb!Uw*pFZD0}XYj7#dS12lV!i*<
z$LxJ43XML_w%nw;iudpx-X&L-&xxF!zAGd~ZIY4abT0oQed+Z}`xYnNxD%VV>_0=s
z)^+YH`lOAXy?nAJ>!N9$(1|;J(`DR~@}|suf9Q5fwLI6ajx+q<dDwsMzMQ?pZEeh>
zzv7?o3Y}SN*v;!Q?exoTx#BlRCcJ4m_H?`I7LA=}RrBPVHW}TnTen$2s;gFim&})r
zOVTy_m%8p-8u}{E;jHi4kR7jjc^5~YHG3r8w{*v?AeV11qf;(z_Y^zg#_*pZz|DD5
z$=m}Q>n8ko?lzmN`@2!iyq#qQPdCd>ZfKk2Cpo#>-1AWQjj30BXO>QnKlpUrq@VXa
z^d`s}M!YCrv}>oiPnO1uu6OTV?U*(*Z^cP2+bOpuJe=ftY?6m}hq9^0mGq6dvy&st
zw!A$4+jq-y!E3diI{#dB${a&eyqf~$rwiPi_+Ve3YThI9BW{a>dQMzQORIjv{%Z#F
zpVn*driWV{UjOr%@x+qL+h6===u8yz3%Pi7+2Tzz{?0yQvFvW)Q>TZvJ1*4cZPxAS
zm{6NmCD&WB`^wvZ7m=5^SobYCvGZift({*kKMU=w+UIj)wbM=`^PAhH3wk$y=$W*)
zXZveqW9Gl+jQ{GsFkD_Y>DJ2rm;V_&d4pcMPxW5ZFZg5QMwVTldn1fmb+_0Xezi+^
z-L}b8a^+U-m8!>Fck+MXASQRP8b=wvuG#ES91}0!c{8l+i||9|<QY?n;|g1saz2Wj
z6uhgayVywS`<X;r?kdapBwOwU8T=kRx+^ZFe!FJ7)_YULl*vgh%fsexGT6Gf*lE%t
zqp&B{6+P{%92g(wweOYv<@AEjb;Vqn&2uV$pB3Egv-$WYmRl~SuIfQ4%g$NuJ@2W@
zedAkxQX$)Vp`K&a%YPaEyByb5DkdekU}}V1aPx`ziobd$oqQIow3c7)PulH%vy`=e
zlsz~7s_6N4r$6!E)CT6$R<AEw2kvtC;@!C<s(0m^*en-S<E(pbJ93ZqtX7fgZ=No@
zMM&~t;5ILn!rV<gJ-^!YV|)HHY+WDlEZM4ZiuaL{YyGm*4xT!_^6920tHq-I89$o%
z+xWe6OFytI%5!ycV2<-&J_h#IUk+aiUV1OTtIWML-K90)$@H)-Z?B|?i1#g=G$X?+
z$Zy)~Og1kSh4bmsCM#{DC(abxf3axwvXtkacrGt3^L6Fkmf>A0k*gB1TyAsz_QE0o
z@4_s*MJlu33Ff7xKT`VQ!C=$;@7!IMtFb)CzdG#?Gc;OSta2tLDqP{Y+ms+BO%|8^
z8!R^Y{jKBL7XC!y-^3E;t31xjUf*<m!&*N5=sO*wIWbd{4&Ac+G^uyC#*xK^Y^~L=
z9L3HoPk#OSKZAP%<CArBWF<<2LyLIWW2)?9t{=WqYdUFr!m$mJs-0&vT^8qeoNk%A
zQ!;05$EB^}LjK9}=iQIgZ;7_JvSVksC#S`{d5Kqjbor7+cwDADJ$%f~Bj~Q_$!HJV
zA3Ya7@%-@E;azO_x`3gQeQ{mWt7EK}!{ff@>m0Ra7d~rgyLiK!7w5jkb6raRrgkTI
z?N`CSyMk<Mh)J9zs9E@W-K5~`brYXT`BtkICOs(-7g@4xQqbbvSC-vXvRiDjX42Zd
zf49lW9b935>*4~7C9;ACH{_a~+4w4LD%)z6sfPPfGLBxkAN15{vC6dDLNg<$-OgL2
zB>&<CgN^kP`!}C=u6VJ1WmNDg%PEnO>sFtP-MZ$7hpB6;Nq_G=riY$uI?r>aABz4c
zDVXD;S;40K@7kSk`NbEwmSn!=vvrx4uyOuJ_nE2!x5JLOTwbR?^MTmWqXJV>x4lxn
zbNrVM1HWX+l^37a+*&*1OV!3hS+b`lHl=)A{-n(BR6B2GD&Mi7C(_x`DT|GAj~19r
zzJ0{?k>btE>m9zPN0!PvuHSNB<n0k2ef_LUAMc&-mEWke^v7${4|^VoJa4*nK1Ha1
z_0en2Hxw8?JonP;{i|xYZ<6R=y<f_P-o=x4hJ}2O_`CV+mOINjc6hqv=`LBRHf3?g
z&J}^unMTI1MPqGWe7zj@pJCaw`6W}X->Rv-Cb?#++|gN*Ig4buQ!cMavs64j<@Ktd
zX<0!Nyp>E#*L-{RpTXII@n7|mTKQi&UlOm&d=9LV4@{C=roS?{I@N62+b<?7jHWCO
zy1wata_yAuJX5Cg7loa2d*W7A^H%+3@yrs7D{tJRHqH8TILKJ*WbW0=;l^AtQCHSl
zn+x&_MR}dC{T|(Iq_^miZLj<52@FYpc}qSUT$<OjLu}sjtFo(4XlDD_yr`XU`OkFA
z)nRvXH9f)ur$>dPm^F})j7F6b3;_u--R@)|!>&#t!&BR1%UDX+X1Cp%vQ#?L*w|S2
z%$dI*)$g>aKiTX4%D0{)NxrCLzPj5lmCnLPmzJ{o*>m04#V=*p7rLV3owM+#xW!-9
z_P1D1IPz!uioc%g9|}FtKO6jNJ=4wP9XRA=uG+2oAUP_H7Hn98CDW?(`@Um#ZmnN9
zL@pLxUcdL7;U^RQ1rGOlIgYJAw07lxhUUdwljf~G+QPuTpq%_Xk`S>?x5wj{mWs!5
zt-GZ+1g~vfC!G~EC3nf0GiSCN8-G7Czer;Kq_6TTP5&r%)SkRlGgtb>B-P{HUQ=W(
z{%9w?u*o=P8Ma7yCy)Qhs{Sjp{s|xwhIhk$Q~y<axKsNcVU@4lq`jl!Dr=v!&H0V1
zlXh!f(wAJyX(}oH)OMSq(~_&Jxz5O)dLEeRyZTk;+3P3QuFSZ$lK;fhIc_(+u3yU&
zeyd)psj2$$U8bkxlvM`xxAt%b_%GP~U9t10e&Dak@rPtjB-MFesataWL(jSHxHtEF
z59OQu`o696WLMjA`L-i<H`ZtPoG6~N^~9SX6QOxQbJROGdvP~hOPi;c;~i?M9<2R(
z)kc@(FX4H|rY0sDUEAa`P06Tp-Lce39i6<odFv-_4xIf`MT-AmxS*iFA>S(X+0nh>
z*g_%w!`FSjg|5dp>G+k-3s}?mCbMwGr2DGRZ};A`oiKUI>y+DG2`A49NA;@jvY3|Q
z75=0);>5j6-92vuR(I9bep$6`*3<Qqa!u0;gR=B&&&F)Jb7qR`(d*mpI-YS})o!+3
ziXRa_h>W{DJbNx{o7XDUWJ#F_&#hD4Z|~$Wm?66%x#Mi_$;;)Xs$4<gr=QNhHLJ6Z
zp=H_im*GEi$`T)ZzqoPv$D7|LF0qgEJH5qO@3^<0OKyAEVy8>5zU3ZKy!(Ua(v+D}
zbB<W8K2V%pxU3voSRc9`|2|M_(Wd>LMHOLvPbOr()y|n3lke86x?INjWpr@J#fg*n
z^F-IW9#|B_@$LS;vi)HecXyXYTs^J3Rp@NUcH_vD*SSj$<z`hb+nF!@@}b%Fe_ywC
zrnE|D1?HqZQ@rXtE4r5%TR104d9UAYS$oY|_HD$aKTo?mw5MkB3pG79TH&$ybLFPc
zo1eO;dFy#jnsD=@rnP*p(#!Zu(jqgNa+lo6S6<V0H}dW^TRm$TbAyUYO;zT)PiK_}
z_lh$=Idytgv&qD~l!^-dw{eV@ycg`nmcqJX+qR4PXieIBY0o3wt9K_~4R|qCPH5?l
z--TC|>#|hl#6`_^eDmDCq5ojuQkRsNX|YNhD@rHoDm<$6U7gN(T08BbOTwMSUAI@I
zK383OeB#c9$L@M=J8pXTsm`TVLBZKG+CwV@&#pcCp8=c41Fx>X@oCe(=Sq(%byjF>
z>-ARpIW1YXrZPw}_r{a8_tmdVoxH+RlUJkDODS+kPfy9AYrotd#>Cx7+Nqjm8!G3r
zEMMh#V!<1M!*jNCPD;JHYii+FRerNe)Bj8gJH8oYHD7L??q7IrYKNycr3vRY?Jwjl
zE!fDNQ>_)-;`Jot3GasN^i4ub7diE5pEG%!a@k)$#d_acDgH*Mn|&$4%j4r8hrKnJ
zIhEh!wRfW06kFaevu4H}OG{Z~V$oXqb#{%5i<gqdGll(IEBJEtn%86Vu6ILU*8I>4
z!D-eVeNPPSRbSZ$W&d!Syga`8P4bhBc!x$`ab2t5_mkz7oqn<Kez?$+^+)U0{2x{Z
zD>Yx;iEv=3Vy{4{BiXbO#o7@!#qa8#pGxu<c}$mcm0vEu$)@AaiZffc-L_iQ?@@94
zrwNyN%8u}DGmgeoYe)D_@}9EnvCZW>$Iezy`k7<%Q0$JT(xafPh<S^%a^^HMuj$PX
zpB!|<vg~Th^OL7;tIz83c=FA~TuFJduJxP8)f?t18$Wa^^USK8$2Dhb!SVyA^a@vZ
zJt`E;&Em7%*EZ+jvn?5uPM=xJ^}una?p8@Dk4u?GcfucMl-bMfS{&y&!Cs4{^HREZ
z<%?HkcZ;Q-o_I5V!-pkPT$+^<bv}GhnS5yOrRrsC*IYEsO=a7@v}p~uLcg@!k+nHm
z$ww!hvEsS9+Esb3p`d5ys?0u#O+w3s(pqMpw4LIz`F8Q7^q|s0@tsrUT;m<XUZn+I
zxjaYO)K6p5vMo)e@hKh04K+-451Hm4J>+_5lA&ev#q09zayHd6w{^C^u_#=ftd)FG
z_++P?caUVJW}21YrjS#?M>0)UY8gdMNtx_(%zJ6=h7J2pyv$!_nx3`lXOHW|Qx?9y
zEsKKne&~Mg?wK@2XUCO!LU%P)^U}>b%{w*S^RwkGcOKDI*UI*27rVxN+DT}O$-}iv
zx2$8CD0e#Ln)|l@45>!{Jh>y*UP`{^`*hmR(o2tpKTGs8DI4Aix^yP2)7wr}Q}b?C
z>2;oOx;ujU>kCsvQYAnG{3z)F%wc-E9D6#-sEvF6s`tZv`@%#9#yjO;#S9E`p~Vr#
z=6bmbPFLp4jP#y4XI7R>ZN;CGD}ISHIUl7won!P`t>Ua(P-p5!`Ti9@3oUje=FZLi
zT|aH>JMU=Qch29+MNj{E)~m89>igM^_oGisMrxaDdR^?gcDFy2;ouv+Qln?zL(@aD
za(jae+HYCyx^n$MnQ+St-4j9bj}z_(g)9z~T=+zz<5y2lN5JY|erXfFGJSflu+gdD
zd*Lya>({1~RV==#w^dl6sbrFOyrk-ri5h3NE8W<2+}1B-vFB2gt9RG0UbyAn-C280
z<&Fv8@C!VCsOWG=%bbK|kLIlZ=qA)tdQe?$**3GS-f^dYU16WTaj*Zi&M386D{rQh
ze^q;~S1z`GLx`9_o5<8jbBZT<sA$Ye<=0-TzVB{5$L%fUMl<KWJZk2cePDKQW}vRX
zBqg=y2Ai{-%9M1s6>`72vHyqE!;e=!8U=P`Zmq4GUA8v-_q=+YYVDpUQO4_|3Z^6n
zUj7kXzNu`|p74X(p002AL}g~}J;Of#>*>(XJ?v2iUbniJ96x%u<H?;S#iOY;&#W~&
z{?r^*@0DR!nEqhJu1VDsdw!m+xY}7zD)i#hOTp~hbN&9=1!^nr`J(mHjL+SU<x^O`
zZm#)}c`7^V1asWqKMtR|;^IoJBEMI^cPY>MT~<>2+jQPM=Kl=mfBH`;n6h!oa#pS`
z-Plv#r^aQvE*1Z%{?R%%<$Ae(%Qp_5S*p)M&dq&#UBb_CcIxzv3cGflxanpwYwI_!
z=)i@#${Q9*FG=6_*IV~QchUXil`r$wrrcaM>Dl%fd9kU}Ql|>e@eY|D%HzCg%?5Av
zJ?WiOw%k6Fr*f9fclmwKnKLED&dd9H8D`#HcI{m6t*`rz7N2e3G~?vOm9c)7ndbz`
z&#if$Z5hX{Qu^oeskyVl%lt1auz9;I>eG?Cs%MHjC7;f7w3o|Hf0d^t(V8A^G5g^g
z_ezf3?VkSP4U2MmwwuhHIdNir#kciYews@ob!xu3c*ne#+|(WOq+MBc#%s-0*&F9a
z=YE*3EAw{h;f?b(ew{d9=<#VTr>bsQ=Dzk#zh_^2^Jcf~n{%`F9VwctHC1EJ&V@lH
zZ$G+j*V%PxO7e%J=eT@tKkx3#*gda}agxKnjYmEoNivVxJ|X7p?0ChD=G<=8X|c82
zPM))voO~>`chcM+v!mwOZJGAc$WBA9KXUG~{P6y}DOTMp#LR*dwOn=teRO+td1bZS
zlg`X_xmrrEOg@~jJ+mk4BlnsTnXA^><uChIN9XOkx;j7cw0-Kwz(ZG#tz7QMrS~)I
zkC#`jX;pUkfq&{5_ilZ=UCAgqIqXK%<-J0iRW~l0cxB6(hC8Wy=I;98vBczcwS32S
z$N2|RY+mx)Ubzt~6>-(_*1}q&N4H|sMD|_1zJ9~)&_<(&$5U6dWJk@m7dm-vlb+E1
z6>mF^9_M?jv9~Y$>c*(QwkwWLvYos3w8@nhSDW&dXlm^Fem3Z5+zQkD*88zvb8=Vz
zYG9wYmD&7k;aT~8C${`&usieSeW2Yg6?LOCsk`n4-JQ^ORwK7SxmajX+^KM`Z{K^>
ztTyoGoZ8B-d_vIeWbx_`-%_V?ugve0y=kkseMVum(Dui<oHwqxZaWs8zvKKplbuIB
zmzS;EX8U{g-DkUVZ~td#D{5=EshU!`!}D19S+Q9sw|W_7+PpfuWS?Z*t@V2wJ}ncz
zq^@XdcJkemwBnpiw=OrvtIM5O8QH1Ucl7&CuBa;qjixV_y5i;KtFlSu&C+*kyM1S8
z&w05oy*DsAee$Limcho!Yu+fjWWFf6k#NtuXHHs_UQnmD*Q7^bDK+fJVpCQuJr%O*
zsGsJ|+hwbFYrnp?dsj}h^qV+q#;A&_%UyGdBWtqFJhUdCuF&-|@-<J@-B1*kAIJD~
zt<u#y$2B)z{^9e=&bxX-?1Ja<m!mGYx3aE!<aO!EjPoL9zK5LWELn2lb4%J{-TpPF
z>#qEstyAu2)cvc{w%=~oyqFc|KB(4g54bdCYQc(L^^Ftdj=4Ge*jdd}o4l6g&A!{~
zFWj=UjJ{cuuKf7rr_xH-b9tw3PWpLA@3WiH=B$@XE=OHGZNYDq*|scY`_I_eP``yK
zH}%<UcFW9C{mX57JZkHfR=3jLg~y^>wp(`Ue_v@b?f3M+`IlY4Dm{D8-#F{Zt>zVB
zmoNJLopS4^#o|J(m>}t(Uft8a#hFI0K2)U0J%7J?O<T!Nuc>a6{A0hDME$k;_9Z?!
zmOW4V#=L+-PD_?+?3tN2X=Q!CU98`%!d=bf^FppHx}16Eiv<s(<gvwpFV~r`3;Vfw
z@~j<IJ74pZZ%k6!+p#&;$wGO@wB716E?P}i)xA2YS4-KmP@Mm3>MF@Awd!Y$(()~G
zRuzY5CGNj*X1<o9f99h6_U6_&rYkXSZtM1bzrV(RPv+9KrmM}}PRG~#|DCx$dE=%@
zA+MY~uY|0dvu62B*R_TI4|nA*xjO6Ay)D}>$ko4l_V0X{u}{(Iz^zQ{RBh*2Y`tjs
z#&<Wzk=qkzMeRI4wWN>PbY*<rqcY|DD|Q_}9c7xXe$X^)R@-~MEr$)gB`yX>mvxn?
zhkI=_pL({}<VtfP%d$(8JhyGzQ88ty<dW?^-cwe(@#R{p=k;#Su3Nf#*^!IijOF76
z{kb{aO<sQP4)vd|lb-q^u4v1);NTnI<ysj#;;rW|j(=%TQMGrH`;whY=9fJwTNtW6
zWm|lzM&{OaQd3VoERFssHT}_tf3iDvbsyjHa&c@}k*Dd@+p+up^1io;3w*M(s6^vj
zsIpy|<+YewQ}2{M>nXi!JN-so1H+w92cQiAX3-;dPhQ&j^?vabdZaAyd-9T>R^}H~
zy!(~<FZbV+bEykqXZ>dHJmre#f|D;5qOYED*YnZ*I8n*^<o!C)ZJSJ1Y*%_^B>TAO
z*E>1OK(%)VXTCe$(Pu8mwD;#Fp6#~}Bw2~QZomG>$>-S8u4C~_BmQ|dAHJORVuR+!
zX(_InriBR#i|rYwb7r;ps$6O@d(Nrox^X(=iXHxiA@eV87H#_T>2g`7!3yEXwqMz=
zUd+lf(pB<Qf9z{^D>yT7dX(5?-D~rN9{Y3hzmG4<G&Zrk(=7dNYTpf?mEO57mLlKs
zOx`>b*cN;?nD2ODX3)fkD;MkOZT9ck{7Tnh<K<<Wf1jF@ZL_Ag?_jIsnQFrma{K1(
zmgJjgtUAeM=ejb>g@-&R`_)eL2ny;mnK}1T*_GJ1Tiw4({EbriQVhR}t>10D&-&S#
zxyM6Ra6~;^q<lukX^Nk;+by5V{~1=OF8N^|^uxb<(R#O{9>0Aod(;CIkDgT8u4S|G
zT1mT}$~XBbJFTrHm){K3<t~ijaz3f_IPRrU*wNTyU*`wuzMSf-wsTp2di-Y3;7WVm
zZOa=z$@FbM`!(D*N=060R^eu^_=i!QYToXGOO~$tY9D0qQZH$zj!spWv)MMq%eIW|
zm&IbPt9{dMTXyZ?S`DQ#<7GEB&a!M>K4WiZ+K+Ag-HD0IFFCK>xMa`tNqVJAq$|tI
zioyjo-u%=*8d9vXRO{xn3zMA!=JTB7UDE3n_S&M;N^|0_Cu_3os(E|FY7eb@61r^4
z@07e%o}ST~vw}FcY+Bx3tlX3RFldsW;G0F0FYP`pzA(M$?ebHzLU+b4_7z^InRZ)u
z60h!pp7X7$*Ee~u*e+5b^>yy$AoIy-rdy1Y;va1?+7+?2XRqvy%v|4GF|Rv?=gOX{
zE;gUg*3ql27vFKD#`)0;Rl{7_GfVHDtTsHMytUVGO5}Z)r7lXAdNz;ymM&ct6>sv1
zcRO#*JWu_e@8)J7*w4z!ZW7PBvv|XmW7pZ_EcL~gx<th}Jvv<KvvPB7&DS!gOFlv(
znQD`^$lcMfy|V7kY16AY!qQ#$9493{&rMD8b)FsH6@61Tr>OVnQc=O!BIk1|mp868
z5^D1N%rkkr&Z`ismql-F*Db!g>yucIlF72wGy7F}cP14tJ?^fh^u1KiYKz&Vj7xc6
zm$Dy8QGZl*QQP=h=bYfh+B1s8m1TDxS-Es~ad^p&FFTsDUkkkn*>StmCG5#ZantQ~
zH~m(6Mixr0wKj{_%Ih)-spP#nw`jwUjc+72yF?mp*s=Dm(6gl}o2RB^o9>$;-ThES
zajIm&q-%S7uAXjvRTj?eZ{*ced8g~yYuTByzCI^5b_dNVQ2DU3XlmT6$mzPe`kR8L
zm~LD%OJ&J--ImK%mnVe>gx`v?cF!s+ojkkajn~i8XI~HNtMa<VL~5BQugxsjF(dAE
zuI~=FrPb#<YFyWBnY&HysM^6j7N1sRo$5N8a%e|x_D>tOOP9R$ScO}r3v7O5A}=W@
zsp~Xx`CoB`mzT}fJf8Gn*Hzn)Rb4+tXUIrw3c0wVzw7AbpsS{Co+@i5pK$89mNqGH
zsmJgAD{pO;)|-0vo&U~PJ?nN$&Xif;@#bXgR5{DpXZPQf;VrOM*u!eFPi5L2Poq1(
z?p&GcSub>=RL$`5(Y`HLyG1Wlb2;TG?K}J`aeHjZo#6bXS&l0`)Mh=MII+-j<BS@Y
z8W&|N?%9uW6F(M2uPNFTn)>H<`ql9MGFcfBOU7$Lw@YiM9-B60sj9bk{-i0dt~`6G
zY|`l&EBh|#`ZqcC6(`qJ-P-zV&ZLl8MN^)9i=XhOJ*I#2i8tXpo}5t8jCrCm%_Vom
z<(+pF&FzKGZG2O)ZjDY-bkM%je`l-aYj0ALcJrUO;(V&kyh#1L#Sh&BbM%{JX7c%*
zoprI}<6@t7p&z%C+`OafF8P@YZt3Vf=4g<9{9&O;(uZH$9{&!$y>7>Iw;k7oH;Zpt
zU8R2WwD$e=6W)fITrD}-S(vh@?Chb!T>sKdUY=|@+a{@6xlMj-yX}DB%!xB+r0HM$
z``3Dn>sHe|iMMC8JGQN8?|xKRc%$F!a(3zwA<pAs%R?@om(;!9YZIg5a;swdrH!+q
zO;RJ(!gIwphNkTJu*e|WYwC@Soi%zIm(pCaqt~tIu01Iu{bucnwF-Q`Z~1)RGB9X4
zAdOXUW$etne($%$Pbu*W3fa!DzT1c7xBdLx&B4I;cn7q;fl!2N9R`Ms2!=K0D`yyA
zxyj~m_LI!|qxb4;H=js7{H9#eQ}fV`$-BNOe_Sxd{k`NH)s>&Ls=upm4K|*)YNtC_
zWYM*G(RVnC+Yc$-KUm1VGcUHz_Qa8!-9|19_54<M%EErHdgz>Ky5igYB&9n?n13@K
zb}x;b6zq6HGv%9W?Ub^ouGNhm&rG&VnzT+-P-DBY@{JQcLAj-`{WFU$9bCR~#<!<g
zk!GhQTyMYKHAPc1x_8eVSChw?rVDzHthf;4x^u#8Go71@7zE!%H-HAU8Rau;!+pM3
zd=nCxzD71!Fe-QHuI*t{G>;0ZvZQZR<ubP2rvGQ!e}<^;tk<u0Y&R8}tR`}8(r>;c
zJ)IeWdF~tEq||dt3QqhgcT7b#fk6Qhehdr@F<V~0ukESn<$fWcxoGLqsGpPXz1(M+
z*z+ax%ImI6>$9wdAD=mPE6ZPHr&pBF*Ufp8d_BVMWO!}6U8J~{^*@8^Z}}fq2KVg?
zUoc39*ZI1B_PZMInY8aF-=A6C&*rY$H|c2YChNY-p^Q_@pInJIc^mp=n)cfX?@zAS
zmw9XTOP%oB3cMj~q4y^T)?D5ieko%8Hs}3ni+(Q8TEAptd`^RD;OyHOdeVj6H<J0!
zZ_ilvg!|;oIj7Gq3z2+PGDlq^T<6jx&uK@zyms}R_<pAlhYiYBQ!=lG&D#I2?7>fW
z(UYIfx6ba~Rki(idh5AvTdwBR?1_GE3sW~Ne7ZU}JoI8XE04FzrfI%5hkNq3dF4uL
z+%9FmmNwb*?RGWCi}QUqxi4E+7_sflZK>6I$5W3@x~#q4(d=~YlH0bkgScB$($Z4W
zH!i8K{b?$G);hFSwd;3^^`|WTS!Jt!d5eBKaV$c|X5CWhzrTO%7rwCkt+{qUVae(p
zTP9siU75P|&D2{bMP~2b5_F^9{o-!l)VtgB>jhu`c<Z}qqigKu!Y2pc%v|j>r{Gp_
zY}!0Nk-R!ijX&udr~A6izm@42aqQxqsA;~XSH9lZADdjT>crM}QUZMvy~kU=oe1X_
zTykbx;Jmx;8;fuRky036Ph5t!?d!aEv%d)HXY6#^Q@JM8J!tOQNxo03?rd{C9+zId
zY~Ekr#lQAlyk;7BJg?Mgujn+rOa8}2g=XD(dd#9I<kpnuhBwyg&vLt&`{2X${YT%|
z#J{-OtToS9M>_k|w<QkC*E>D7l<#Z_TXS7DLv_=Vk5c`mg{q$R6&<z6L4@SOiTtci
zzNZRzSI2u_&1?GbZT3`+V<sy?3zmz`Nt%6H+S4=A>$jADbm}}a_KcEiQ?^&logTgD
z<h&r29ZxKZ6z9jc9iFgw!;;mt{iTKCfu%WV?()HlzivH~qIdfArflh&nQh5FyNiMY
zmoMBFl$kOs#%-pPg?Zq_N#@4Jk4(h_I*<~nB2of%n`LXBcJ=Mv?XSP5OFivL5n1f#
zrR8fr&9Lv-yfrH$;|z1P)h^||RsNLkuH*DkX!nNwdr!)5oXeEjeJ!?g&$ox~PxO4w
zHE5f1QgT*LC(Ba#+Zv}=lo?-q?D_EK?K7*6KDz3(ceQJ1YiKsFOUti-hdbJLNI8W}
zpOjXkr|OdWpW&q7bafp0%s4{F_Wi{8<MrZ?O7>?~#vWHX;^|&g7PaZt$w=+g-ZfiO
z!?sSmAHL|q<CG6Zr~mwC&^wuW<xmxCao8kfVMBqJLSdVH%A}rIWa`P8`=)R2FMch5
zOnqtjtXyxg?Lt?4`fkbEq?*RN=j?ixxbsR4?@q-T?24Mowm&+q)W)W)7yS4gdk$AR
z&F!Uc5<2hq?k^Q@qRU0*>px!YdU>s?hPBQ*!_%p=u7&-m%gpWzuQadTJ#Cfdt%YXg
z$*D8ybfaUoUGc8ltvb>4QP^D<UrW|0Gv)Wy)W$P*xW33Us0tVNRK9c9eND^fAc+~b
zdt|m)+-aK7nzyWX^Uai4x0}=MIyWN~CGALICiLOsq^<tl!BO9BSBGwwbf3EV!mMe6
zTAp4;Tk@`D#Ov%+S1+CU<#ybmxZ|F4PG~GizFD?7)%@eU>3m6|n!BEboqD`ydXR_3
zM=51hO-|K9p<Ugp+mG2RWK5F`o%M9*)97ghYl=N`k8IZ1bhc&Uf}WjcHCg=3UcSG!
zD88rTM8|jIP8=!AC1Xo;P~q-a{pgo3>&qWS7QNSS5?boFEaG%lS77(<=Bh=LZgbeq
zPMUXllUVbk*I5rX-M;;1oA2FhlTT{Vdrrx_xJ}7&5?Ha;cvENk@iRu|79IWFE80`l
zCOWO(ne^{JgVD*NnHLl-nRcFTU;LJJId?kmmgA{QRvr};l<pI{zA^Xc#?=*dNaeOH
zQYj`T*0pu<=6_$`Utb%am^EkrDbtX}TI()d-Kedtu~k~X)ncin#%-gpa(B-cVdk;k
zZElOFr*+HPJ_=RYJmt3E9=B$%8xuCAdYa8qR*F0OE?RJTvf_0kzGZ1sFK^Vj&VNi>
zQ%`X}SIVkA%Uovqnmx|)oSnNf<`!>k-LH=B;~RhU>Enn>*4CC%=b)Efx4uTlt-aM*
zv$J#6m5*X8O>Xxrd0MeuUF~tE>SuFpldHGOuDv&T*7sO_!sD4fk>4ij6rNgoN%fd)
zPFIeq!sJsE+?rJ<N(!C0d?;moeX^0fjIhVN0}0nJ9=LL5T`h06y5y?PGuO1NW*Hf&
zXyzVvKIXdbKf`<TSFerr7K$Sk4ajv@XGD}&<fP4ScHKQFGF>nERbYu$$XV&AOMz3@
zJh?jCciHAsXCqhdHj=u(Xq&NBs!50BHg730i{JjCLA!Rm4>M}JryiYnq-fITH%nc2
z2|PZ2aDP*#-on(47uM9Bx|DtH$xY#sZDniko;yC_OWlEqhFnfR&RjWp!pk>zVV0Ae
z7tiUn){+x(Wk_Yy#>2<H_1)ip`_Qemn~sP2XYEW{7xVS@>FTVgb(izg&(2Ic`&oUh
z^|q7dCY>*S`aTQodzBaI^v+drW$>ayc^j{ZJkC{JGOJoidbNL3)1A(Gj$hAiIWyZ#
zR5ZIeFW_u!H_!1+%hQ6jS}l23ckGz(+B;Zny-;tfNtW(*+Y>#`xQdDQ+@8zDw%T34
z|G2jLT*RkmcA|<=#$E-U>eI|OdAhy3o@bdnb@z^&CYN<Dnf#jMR~x#b?az^jkkAt=
z`VNUr>dUd&c5QJ`MMbYmu1d#AmuSg%?(ZXy;c8m+F6KD&Gwg0UYmRS>Q`2hgr;*`F
zyURZ2exJ31XElz*CT3t7fU}XqaMjIwt>nEGkAAkXJ1z?inbsGxH|fu>-=ez9)iz66
z>Q6Y*WG5yQE;v&&Bgog&QF5W$QvUTwg_1yTM3&lu*K+LD_RhsTZp)?eZWgP@`7ezU
z$v^I$yO5*R&bcjZk-Zd0$KDTYoy()<c4d|3tlaVX{br+0?uoZU{jw+8tqK&B+^9PF
zb#Aq!q<mD!WK~5k&q;kZjL*7UYE3EAeIB8Ds4(Xjw@k-$kK|cbmU)~xrSi?=QqW?_
z`L||Hx@~M~wne+xY~HI~>)(nuJm0Eu^z#cJ=}(zg{xg`)>=B$Ovx8q!(6cWsb>~E(
z+r}!pTC{TytU+2BBZ!zjJU-F=<?%QDO7>a|%uPREzxXN?cT77WB1%#ESg&2y!{dd~
zKc;<>D)rFa{#xzoe})4WHXfgG_0e>ZldUsXS%%A;=o7c|+%S1U`-L4(k9c{$Kj&So
zUeEDOx4*VeDAjFp`c2z2;!Cf@#D!n~HtU(NSWMnV$;awJccw|lH&suU&Wfs-ymJ=M
zj{b_==(4AO4cAsHHgbthdavOZQW11rvu)1XIk}hG!)|*O+JBYKF#f?&_)=6QrcP~{
zWxUbVta8nbai4Wly`RoBxz#=WTw%?oyKdFWNgr!iw@$hesT-~%xW(O1DS6Y!DWNwl
zj=ap6^XK?`^}Q)8HI%&bMVszfW=09QOwpLD>{T^+V=3Q?D#zslp*=g!8C|+}_<mie
z_4|FV&l~(c5hPdXr<=BYPNZ(y(p}j>Qx*r#T)n1py88B08`s)g4o`p1ZyH$HXPudv
zKjD?`QgO3~zQ>+SO51X!b!Cj&tBL#vGY_lEGo9&anN~k1(sBKiXA84(_sRwJSZ)22
zI>RjJwba%rTUI=`xRb25TQy~oyHreiZ;PaM>)Ic!pY}f8wD;M625GksR%;f8<?^2C
zlR0TDo~pTQ)9$d>cWYfYUEATRvS>w5$9CoHmTz}X)NG7ZxMX3FcJ8>Q^0{><5@m{?
zMcCUZ9xOOpGvj3G(QQXcrFQJh5t6oBe0xP($hGA=*0gQr&^ed1N#f_#L+aP39R21j
zz^8LPS@^{LP905^OS2y7F+b3g-}<U;<)$l|KT5yFdrwXYe*dcEUY~1BYLr*7?X=%Q
zyg#{9)t2me)v0Nxy5&*F1-D7YuPip-Gur#?(!GYv<^aj2uHT1?Y_^<rZQ7J=8d)rK
zbDK+XpL@#6eat3BXAiq`&MbI)#coPf@Z$5vl4r^m34eU0vT$0k*um?Pi7z(It<;tB
zJnZ+@?s5Bp)Ca4Rj6MXpKED%gFz=tL>B*|uf>!0Xe_hMfU0KR??!)RuE8BhlzE^Qi
zymo23m!{5ym$Oy2<}K@t=1pJR_~_;FINvf)z2xo_MyA(AcD@sy^jNm3*Kf94=JSbp
z_qk7b^|z*^EPf_6C3ESPwpr2M-@=#PaLcp|oBp*Z+~UzPNw3aiHR-e4rh8tIl1V(G
z$?bKzWXELH<)uO)E=vWEow}rzy0i28;nHm{Hb%bM?h~m|l(I9pQ%mh$%=WoKXHMKH
z%r+Ie6Sc$VTC~R0T1M|p+fyswFbhq1vOR9)qmb8OdiU+BmmW8rlryXR?XGQV(I321
z=JQ9&J>R4C#ojmE?$-G&lPa(4dS*VmnyPu?j7-zbug7|}^LCuGyp|oU;WQ;JZLzWO
zy>E<VdZ+g4$=m6k(f-r)W9_o=P_<1~lf@^vWCcpUmn_sS-Fe{p0nZ1v`)(Xvz`(%(
z>sr#NKVOrkmh{M`>-){LpxJ>&kIzc4U07YA6~yUjDkyh*qNZA%?&)J%V%w~mJ2Tg=
zox5paYNU<Fc|GltQhw(=@8wrc2_3pWVawN96;sa`-7d{>aeAdxbZ3&9Y<Kk2$&WoJ
zO3f3#JU7jVGjB)Rg)EZ=Jtun3bgwQ}`m@B;b>g$%Yi+L9u#4?Fo*UCTb^GLpo3uP5
zo*8Qy&73W|b?s7*dp`5B?yAT6%blILl2f<mic7rDX5pSSO^5Q9XC6J~-K?#!u`Dz>
z;!k%_mDB5oYC6pwm&CSDaO>poZn<@m+sN4D?UATu;;YNIZM?bttYKBwp1NA^=>_^P
z^QY!U-dLfsGIpZE;|*<LnUR_*^O8h7<4%95`W?8wDx$>blll8)hr8KQGWfTe%_@n^
ztn->Q<K>(1NbhHUD?Qb&c$lm?f6!Qb_npkjJR>Qev=B~-(^1!#N9y^iuBltQrg(D9
z*PdCYMQlFqb2r><a;Rp)iX|)gtS)LyNIetI*%6hackOjl%<k#S1Qv>%{J!<O*E_8u
z)xDC(<h(hj7b`wBO_sfsyH%BU`>Fgl8Fy_oytYrio@)|jFRYYy>)^5FK2>`=TD`3|
z|5jc5bL~2L^-WSwC+2OA&Qwu3>1E_Ov5&_+B6lml#L}&Y46|N_g-d)6e=e7LXlA#p
zzT6qJmAo4sO+A)boT|F`#%$x;YdWqcy)S&$X_Yx?s+}?)pWfdPH^ZzQCO*?<`E8gT
zns<5Kx8#X2E*&dY)m+P6aldBL?IkWXM^$>-mwfsaGb?P(<8PvdDj$tIVvlMbxj0$p
zbm56Pi-LY#nku=}%_aSsr|tF|N9*`@XB8$`y46-Lov|n4$Dfs^Yur1hpO~;SSaL?-
zl&MmyXRFOJ&0AEk<LXB7#_LxTPHdGA+mIu#cxJP${;_MjmPy5Mh(tzzj-B)Au9v0T
z@(1qT@i}w*58wIK-+oMW+u@@0xgYi#oSJi7_jTOjIqgeaR&`ErUw=S7uq@@%rbK^_
zI_FP!-}X#ds=wmthMNmL16SO=amnf7jLcLv%h1Wm1smiJ9!*YNdv?ugKXzrsMK?>=
zOgdX?7}742bJv_#J7q_}$3pG<_Y*c=_tFS>6mxv#OruSH`iYY@bG~{wC23AsXRqoK
zpK|xA;JYOsR<gOp{VaNHb++Vh<de#M%8~4HlfB;l;oejAvOGYh>!zxg_xUno&D^W1
zDH&dsosmM{;+wDS**1Arw9jm-BWu*n<+sm@F8O--Mr(<W^o9o~=gi_R(LI+qW$pAE
z+{zZG%;%bYcTVdwEtutU%zc^Z@xT=~*<3q3tBze(>pwHCV$-{>5VOawH~F3OTYqlw
zy`tSa&F9S3ofAtnPW!F8vgeZRTCd`_cJ2LFQYHxtt@0M1J5zQ0wN~A2%Ql|K<<a%@
zT(Z;egui_A(T$V(_V&JRTW~Y{;?wQBa{CS()n6MJD(B;~^re|b@FbDL-XWm}b2lqT
z3a*&+b5>AV>d}p@Ph6iAztk~*mAj$s`Qwc*7Orc$rkmoHV9z*Z=S^SB-GwTee$MKW
z{)UDjY0CnpD*t5P`rhM@O;*V^`v*(!-STWm{kE!SXP%VE{9mCvSM(oAS!DFeFnw2N
zw35*LO>28i4Ne9{JHK6;GIf$j^t30{@?2A*r|(((!_BkdHDA8je)bLfFY}~zOPr}R
z)CvuqY;6=$98<dMRJdqCgj0Y=#>trNp$D&C$y7F4e<nO;;#P*!CHDhMlLI8zpSW{*
zj@xeWIiH{2be-;5RVA5wbiwo+ch`BREjFoLZ=a<6JNfqdiBWeaD{c77T2*qW<fVSW
z+>LkMrUm|-v+v9sId}by-#>hRGcR5L(9`!1I8J>NoI2|Tmr1R^rr=g4uT1Y8(dSaS
zOWxnyV=#N_zCZHG_h%mdvtI1qze79aLreYDPOQ<~@}I$XW8}tH53YMnRC=sCnK5l=
zPH`tk?y8q8DU)73G8OyJa8}{ev9Rn0qn!4LPd9GX?vrnS5Nf`;u<OS+^Mbo4{B*u<
zdF31b&(6@T;AyF<r>ez@m%%L_kMuVCf1Hxlw>~V*B;uLn(j9AVR9eZEt#h3ht+&r9
z=rU*Np~*6<WuL4*WHQM!`?cOac|oI%Q)fNk(2ALPn)`K3)-k5U^J~0?ggmNewLi9+
z{IFCl({uVwfmgh0u`w)lsseKb93<^s=S`B8GqhzoVt8hG+?H3fZn31_oX&Juvsuoi
z^qccO?%uVRg80|$>ghPPK`#ov;#lfJ{io>7jThVMg<bXUCSOcG7wxq_XZ7BK>NQn5
zm8Ubq-%c%GC$)m_@|IJ3ui1*(Jl(ST+o|2x*>@dZbbQXXsdYEQs~#>&5nFdTOsiNh
zQ1M(b&%&*{a+h3rydr0NplY;W;rmDS0$uN#H9pm?zqGD4rZCCUnZI?udFYn6wY{&d
z@3cL0Yo6TFh-Y0_zq>l)vR~}q3!{GZSAVshUp>5wXYt8j`@MF4E$rWCP@DeabYrd1
zw)dt-Rv<f=zqH@A-!fP0@1}KM58c1Z7Wm|E{L+|TJIWoWRN9BX47Xb+_v7H#`H%IO
zUDobCTc{=XTCQ8J=vL_O)Y#X}$gbG_9j-(6LaLnq)nB3KSGPkfZn&tO_c_L^9VsXt
ze2@4N@p;;%^|!U&$LRh==qQnNpZv9MYVoUw?|bAU{~0|n@w&a|y3oRR8((aEuDdjT
zTj;$Vwn*X2U60U#aDMe;_G|S$J9tMuylymxN0T(}Jlt|kaeb8J3Ukfl2D0e~i=tDv
zEuYLZU#Z>5X~O(%fo7Mx0w-2K-*!Z2$JaBfOMHE9=(<*k%sU<wRdd&C-;vubDT|%_
zk31B+U47W{s^*8RuZz9jym@9QB;&S8{E0{Hq1<KBJ}R@OZPiZl^5{5o=6sTp?3HY%
zJ)&NDFTYpK@Z%FYt-EZ8&EM6l{Py44yzEzcq?F5?vLkb6&zm)6+Q!HX-6=Epwq*u?
zV{yw2&I~kZ3Y`2brKVkFcly*X6Vw-8w9M^%S#-Yfbla63({@@{moAh1YAGbN$6fhg
z<)!}&rKcSyufATmC$E`*QFBkym5*AhvL$^gCM+`X-EunhkylWx>wkv7wr8?JP8Mv@
z++)?rUFxT4I??0W+U<d#CSR)-6np-Yqr<18$N8P#<R>rgyG+!x>5BKWPpK6A8Nk4}
zFoA)A;lciet4_XLmi&EUwc5UFw<B#8SNcrkdv;7pS@fUb;Dq-Am3p@MRd1`OR$hH(
zH6wrWd8Ms0w0*AR#V3_-TD&KvZ@qL{Yw7&fN4Emp`3mQ|91@P@?5Vr2xMN$PZ}{aa
zOD9iNm@rRW{HWr!nYkSPQPUoG=xy?pT<Pig+2!gPRf#owc581<=I@SFSs1l_qE|m}
z=%)Vmw7B9KIUnVvEPVMixm1^2S}HiP<F51SldnF@R0kW@B>DO5nz32y*%g;MW1~C$
zs=gX?6N6%M7YB(QvY50zz^zr|#90^5(0YyR;P>33S2TYm?aX`pef`;d@qRP;oIt@7
z-F>YqkH=|rO`5u6>8zli3KJHqtj*Hr3UZy~b0O!ezF#I6=bh(2?l5hBA|rQdYvQ_7
z&u31yS*I#4Z*VE4Pxi+)+wC{Jmi_DWG*x@WIMLgJEmc)oWRh*~eEHvpjDGveKamS-
zZ+TW?w#}kAD(&jst=B~U2q$(1R+?mN*f{Hzz@zzvEGym$O`7`2^XR)v71DtfTVj$$
zV|rue_8O;4)w;?^d@J)a+8lIiPnn|ggZQ3KcUhTt!gfmLL5p>#Otviam@d+75?iry
zw&u)<A8OWoj1-<6b1iI{(xR>Tn)keXW(PmjxgES)VeXr{$amhV(rMoEl2+f2WJ@oy
z)ju<5;qf(d)@(X4Vb!^ViMg{pk6Wf2EY;+Etb0~R^2ax2k?;?5L*E5UT)MF%%vI>L
zXq8DtWN2LctyiIkGoRE>{FZ#eW%`_ulAIh%!Q9DRCX;^f+!4!hcFcToKQ)rCnCoj^
zL~78RMTczWm1nEC8|U1Lo*+H(#_BtlcTVn=oH<ddySC+)MwG^`-K%zdj@TDIZH_f_
zd;6)GLPeZgie+O`CQZu~mpi4bth=MvetIHn<=Sl}JCFDXt`$D_CgsMqFC9jYPAt{9
z6p?aUJ=WjsAfKw{bLEv8Uj4;4)i<nsQn79E#@P9pt#0$q&X=~_G*wT{qnpEXean?C
zih5<<*_B3}xwFD*bd6qxxZQ7wUfuL@a^m-ERtvlCIVSv$jdd(!^4V0#lC$UM(Wez^
zHSYU0({~C^Ecp6%XTF%zN#ToIjs4Y*?VZYOv1r<ym}%vi-3gaptu+yxHCa)2rQ5BJ
zN$Zy8)NyuQ;VoKUx_!wL3BKswV=<g!q4~Eq3rx;Atvh?QTF{wxk=t`NsZK5uI(4S!
zk33^a&#CEd#dBsg&at~M?4~kHMk-FBjPFFx@3Kp=!EWuhoVkN`%c`B4R`bN{*?Wyn
zIb-pY<x;z6J-$}4X0F=AzU5msC;LoJOy+#{+%u%q>~ev%?AO%|ckGmloFa|fIScX@
zKZxX3ejm7g;#23O^U9Zbj*GY2ZGCO^QY?E-squu%?25kI0@ZAm?8sI1QYx9?zH!>`
zuiS-MtVXkr#A#fvC^l6!uwlu&IVbd_UViJ<2WF`cK32HJeUDh2_*eepDIdX(JhiXo
z?;fXzEY~xN=1y@ey>aCBo)*vAYd2oMGESMi%R6t&SB^V%{Xs_Q202yl{g*#l^I*-6
zpf^v?w<>nNn<B2HyrIkbbjgLZhdYn_&Y7vwv7@5r($vW}bj3GZ()ubi=~igA&aqi{
zs;4hCj5WQYFk{L#b7K`rjjIor>glp&Dg{Lc{%0_+_BdwvJvg!ZWt*g=jM<mVk=krl
zJ04D93%S#((|yEkvlGXEhN<T~r#(qd7r5HBHGzR?2_w>;yJg+e=GMO7*gtV$Efd=o
zyY<Ci-G1cH-?5>A;hi;Tkpg^G0fS)-q6Aek<lkxKKY4Mjkjsuw8h<VRPJH72)csgW
z*m9RD;a8QqCvtstefDgNQ3}jSo4VknhuyxVDkn}Kb8}Nrdblq6M#-9Fp8Z?fRc@DW
zcz!#=b7y7y)SZ*2?d;p}xKsI`<NDlP-R<`_HC)+x-p2Ccqu1)?TA^*SjhU~Cyga`a
zU+wg${Cd}8(vp=%&omah7zJmk&J@2XzWepfD_=jBB>m!?>YDhn?82*uzGcrQ1x<Pu
zTRdTH;R26mGv~|}zN=Q_^&pd1Rnzu+VNLY%dPd8eId6`qWR_L?PTo2D(W`~ClW(s!
zzsy(j{PBhr#?gIKHLkUVrF;^V)l&H&72=lp*zb?)_e(a?N|!CaPPzW1^Y!$egN;fh
zVsk7`%;fc(bviXKcWGu#uT0!85x2DJ{Iv~fWg8{uu1OZ(WjyJ|i+P#LHQ&md=IMHL
z?3i`Xy@G{1E`L7f6jd3sRAam2ht+A<T+-@uoa`36y^A`k|8dq{p-a71*3K;tZM$x)
zy)<Q6%EH6ymuB7AEvTyX(DwSZ>KxZ=<^?=w=A1bZb8*kI?IF6wJocLHp`muOstY4R
z;<BfH+Z3CaXV-RShuEpJO;Hol+>b4tDc_bh!>g8c<F-8~bo32!gqeG5gRQomwWw*A
zD|`D=%X7-ktdNK!-Z>|as!Z|ME}#BzV*c?x4Qa{7#vB#<R;O$;nPAF0CrsDP+;BF_
z>6Nn;Uy8loG+AxRHknk;mPvt=+otSRICEm+(QU7+%Cs-`t!8fSekO6vu*2yJ7w7vE
z^V8+H1aEt6l8sL>dwnwDUeHf(Wv3q%of#_c4=v4q!N7B-=gfJDYc}pGQ!38Z+Lc||
zd8aU{D2=&R%qT1+#ce0wVQ&xrzg~grpSt|!m!z${EOg*gkGc5a)keQ8I@g?ddZeXI
zI9v3J%90g#y={(K`6==#W+|EGELm}4T3Yl8*0&qdwjDk%<-D*ws<JTd%U_m@_fGd~
z1uaRvu;%*h>C&agMe+({gM99oteKQ)X{Y-~S>F4u_Vr~SzGU(ls&ka{BrZ=}6|?fv
z;`1}(mI^LY*A=|h@lI8G`%RVY9;b_+2u*S8cWjvR?z_76-ZRCDm-qVj>Km>(r2Odg
z%JV0tn&tNPOj~lrVns*S!!3s-qSC~fGow8Hj=#Rs^Ue0u+KDB#n{57_R(`L%WLLY8
zi`0n;sZTdwIdY@6>$t9`XW6Dep^$3rk~O>jK74UNV9AA`jH6q<eT-w?oZ?x-?Yk+W
zZ_Cv1llv-jPad5g?CIRylF_z#;-<-Gy>>j-@94B~iBaorxg+%Q&uWHU)@e((cDc_u
z^ZbdX%b%sYC3RJgOcb0rbEVqSyhR_s=IJX)&Xk-Pu)NtUZ)to=pL3m!#nq3QGlI{~
zvR2H9?zecv@h3yCNWe>PaZvd7?!tTq0i3J6GM!(ctnz9)TmQuKKg0gUf4U2w_rx#N
zez#@S?>}0Xo_)%F=F7B6&RTYodUB}kp_;6zik#kEIVZlG3h}z6K6vL+*N&vMHR-P8
z?^&m`jr`tLt8cw#Y4guNyTWqrj!5n1?pf7E+iutP_djs>c`oSW(mj78ex3PZ@N?(V
z-1mP3FYe0BJiOy*TcSSS*@UY{kHno#*saBrvg+E-xt;%xJ9?O^9=o>4ap|Y*;@6HV
ztu;#hUY2%+M%;4Pb~)ykP$u8b*OGjPTXNl+En^Iqt!@ik>8lyN<hG~gmp3)OD&2+g
z+g_yK_O)wmDnIeGOm6PY1ly^%>h$zZM`yj+$5)wq`1I}RN3ZX{J?+rh>0i>{t-N#e
z+}qvayKYrk&D|Y+{KnmNrGCZs_omAmcjw8ox93Z>>;GqX{x81jKZCgGe})NHGmDyb
z?cK9jr|y+tkj3FtvlPFO$UE0M&uN^?tGTkgPE_vbrj8SpZT(B1E%85AYFk|RC)bi!
z^zzgh#kw06X3S~cP{y6E(tFudDki&hc9v(sid`L5Rg)I0tlzzRcJ9<~r#2P$$i~(u
zY&)MF(AKG^K5O!$>fV&>yU_*jH<t1^ZC-daMy9P>dGhRizpqRW|Mu*bqO|WxZSgsk
z+pf91ocKxX*~`H9N2+HEnajt%X;Z)X#@5E^>4|0UeGiAsKRK(YZ*5nu+nn_qeYdVO
zOYH2ewD=sm{r%Oa+jp$J^kwO3ed$Bz{xbx9S?zau!h;J%<xidz$VU60u05)nb6oYt
z@rPa>9p6lK<2k$QcD{alZQAC>>zijJZk>B=;g@;g`Jt+>Og)@6gLU7WnKGNB;_4B<
zmCrdjJ>PlX@7Q;v+`Z?!@6=xV#;jWA{EJBfA2qqQAB~&VsnNGN(ynEz?c|Iz=imBV
z-+oBfYWmWj&lWun)KGaYad6ugai!RUTMl1N47EKf<ZC3|w^FmRU0}(~1%1w)dU{({
zoR|{#O<nfQ;-^(YizmDKRZfdkTeo!TZTHD#SFWULP0TyFLVIgZV&}72<!4v_segNT
z&&up=`<_U}+~bT}ea2p@T+jJr*}q3Rv$@<-7fxBZ<IlA@2^ZQ$78}*3ZwhEOc6`69
zZr8_kKljbJl(f7n@WP$gbA4~`J}KGtr7db%$1QQo7svWoE!J;R)SWr)SeEXkIz{bC
z8LE5MnlHR`C*E|8pM9ub$fnnydJ1>%*nDA%!t7elXB!^HJ1#C+F~_9BeHYJ3Esay_
zIxQv#-w1P_p7O19>x?rtQL9v<XI|~@7hDu>akotQ?VRRHx5+_aF{%e9-S~ATvGK@n
zd8cV|>vUII`K`;)@Z7cX<f`1e&f1>K9=-CjOkNc@xp3-onc$+pm)mx~xxfFSSk3WZ
zy(Les%rKjKdU>bOy_s)%fA6k%J(*3kIxAPDX4<_}zqN;plWty7<@B18R_htcuKH3m
zH@GqA&h=O8`p*`ng<WNNGfVi@On)(}+tDqb*NSW@v)(OKo*NYAx>H{AqoC-l=??mZ
zQ9h0p=N2uLbQ5KBHIa_y+Ip&2ZR2&JMZIc<9!_^mRJV2NsXoz|();hKm*WxRi_7)X
zjIVoLjSwp4oVmDi$!tl9o8K7s+{rtZXQjAZliNEpQ$zRcSB<l3X|+i^IDc!mpXS<{
zta0$w9@SYvXH92?FV$~6tnKwyZBogSJ4@6y8eN-q@9OIGyxaFXmu1HJe2eT|clWHv
z;o=mT)@|K7VTPM|Iy$!JoZj^0-6ng<&)mo2IIF)avUWP3Q*^4_^ImDujv33#Q*U*p
zua0)E7H?W&t>w5f`dy97l4-|W4`1DRuK)Z`uaf88E46ejwy!#MdS}Y={|q1HH{SI%
z6|H|h`zr&3g*R&R4AD@7u-M-z|7Q@a{WPookh|cm#>Ky?SD9ISomTD?w)&Xrx)ib4
z+CzOy*Q&Vr8+kq5H}_JjP>ff{l$M#*zfU}}|0sT6bNL^Y!X){?ll7O5@4q@PPW{!s
z_YMrq7YqnC105LHT0u@@;#WOb&~x_I_s=|5eP2&aKNZ4yGkmjVpq#R@x|F$4&DO8t
zEpz)<v^4duy*0mh`KF9T$M!5Zs;w*X(fiaVt>@1|ukYxS3+5O0H<;FYIdD>J>9-4R
zeL|0dE;&DEk6Ge!^rh;JN1KeiS4Ui0ZgxyrO=X*ExO=3^o0)|tIo%aAqrJUT9^L(|
zsWNG^=Zx*oBHm>u>#6QzpLKNce97g?JI>C{o@V-f+h)&?de%h};ihU$4<oIjg{HYp
z&5KUmY3{1Erfc28pl3;C->1IbRQOZUOz!9w)>YL{e!ohRn(MCUS<pSL=IEQbCsSU|
zdvscN>-5OnHJk2b|5cLQ;%6RMXgKRnm7UAeLyDW8s41WDI@_mqU3L1Lpq*JCgj~0W
z3obvVn*Qo{fl=AUl@WS@LN>?a+u1&aq)D%zth-F?<(4VZcc*S#y7kS&O}^SKKf<af
z&S$<QExpB3%%1PB;ElZt8l7$~+Pdtrq+!s@V$N;8+y3a~-%`}@D!Z|3SJ!6#MQ<v8
zP1);G^<J(hNcYJtV?&EwTu-mqonEVDGF!Of+Gh{-AYWCDoC(t=Ojdp4=HF7Ivdns2
zc(LKOb@vYI=s#K1w(MSxRm!1`8A~;%#}r<!DtR_x+36{@?wck>m#!~1`Re&n+*7V^
z*ZUnjNAI0j?fZ0!+l`KuB3GAfRaxrkpOUjgp6QBbU%thXC1*7Z*{v6^i#l3bTV}Y@
zl*i}@SJ%&vYYVrl9o?A4?6*|2blWqdeWJQY)?7Tk{l-bH$6h_{?-qW^t`ANNPqB=7
z!*p7~wKDXE_of50HYUta581MlwKFK@jeEzmMUSHH=f=7%ySh<eeoKwHURhOR)#pgx
zZH7W3v+hJL3t6)3*4dlugfk}mk~RtyDof1p@;UME#Nzuat!3YdocJbiW4_=yGnFmV
zXN7;But-_?Kf@Qp*=G)Kin6F&X5W2f+n=17ym}&&f^`>ZpEJ1QzO&=59hcCpI6aoH
z6IZIZq_|Y6##Wn}PSkolb9TUuJnOXe@?1imy8jtIIITMxyNhoDgD8A0Hi>(#Avu>J
zX-E5=x#}k`?(0;FaY0&+^Ca+B;cw*={ihljc)6i!u~OoQQv&d=yhm=6J>}*0-R8ey
zXX(*zwyVXm*Pea1_~_nuPp@2XPo2&)yKmLIn@g@tncjT(MwZ{R*fiIjRr^i<GfY(f
z_mBO@w$^Hi!-i|jY<qn2wr{w#ZR5mjL!0L^&ra3G`0d<vC2gMa>xa6N;yw!e+!s90
zCuZ7@<Ja65xAnf%-~GjX^5T*eSG@kj^Da8_qVsmOmUT{(*VO*_C(jiYpAF_YoNG9z
z@LTWU`<40@TTC~3Ojr~sl=}4bzJ)<gEVuS1bM<CdCborK2}{YIu%cv=PJ~|Ou7jG~
zdc0S%oGP=TF1e)5J##u^^NZ_?#jb3X__N0I+ro?HnjNv7`BvIX<-C48aW8x}&%58?
zQAcmij-_VnMBYt)sJkT3MBDTI!h6?mK76BR=QcxgUfUC!^D#zB9m{;VmHW2s2+D}I
z;qqM4(_f@4bLXw>{nN8j=6ycTmo2s@A!Js6GMmhtZKwE_m<Ag8czJH;3BEo@;#HBe
zY41tzXD2m&^e(agW|uT|cU!b}@ugblQ>D?O2aEqyJ?7qW?CtCOW$!`@&)hx9;xf%o
zD$Yry<>REE9knW3zN)V|aps?pmEDV&iWkRZS5Fj+oxAG&<2`dGRhzBe;H;T>(#Knq
zRa391XVRnD1v@5fEqs4$?VB0@`ZD+F>`6VPQ?OW{zh6t&>Rj|Y%@)tw^Ri9XoPD|@
zQ#t7F<H*&e`8#LM%?eqr@ha-y-IilUKIe`8><==uO+3B!QQWmt$1*>jvOFJZ&-+g;
zBIYIMv|llrGAx@?`BoQYuW#LCzvaWeZ&FKFS@M3>m1BCR&Nye{@6FALxoj>=%*8mj
zwB3DbuDGt;Fzw+RZ^r^%uKNd!V!s<)uCV03w!KtHFm}SN;JHH2?fgCa3PqMmPM*m5
z#N<_X?>SbDo=e&GQof#{*{U!8Gc-pz9a-qVVRmf)gRer`H?l-~W?jE@X~HSbLo-F6
z%}sCR(U^LuT2oJ>|KOcIkq2+<{mv;Ki)Bm9ytLbT+SAOW9nJ6e%XP6H`C;&pd-kEt
zrQhRn3zJVbP0r#wD(mIFNoZ5JRJoI<R=`)TIMZY5C%HUlS^NlbbBm3BlVTjx?e#_N
z)v{Ud?&)w`*_d>sk+)DUt~uUFajk8y<QIcik~2=OO+9e=$RnORc0wP{SKsy3daW95
z_q3-|_okO7@21DyK}ri|=I)D5St`%vbKCUIJdt;sU)?XZx>Ast8m23%y5;kda}%O=
z3JV-bJr!zPF1cG+SIIWIDdwJOtdLcdQtFKfDSFPQ3-cy^?zok^$xhg7itF+Fx$CR;
zCidM(+40TZwSTq5;YBIaLnT>rj(S)5Oxb@}cK;@W9V@dpURi7&88zpT%A0G)Hm-Oo
z6}+@BF(6ZaX5FHG$yBdyBhxp#jxR7il(Ki(hR^wmZ1a{*+<9n`K6jBK|NN5N%RZ0A
zw4{FBySQ6!#>?#c>nAEsi|TQySz;?YJvuKscg2qj?K>t-&0F$j@ts(|$y={~{@1nR
zeC3Dwi2TQ^;*C@jg(a`pB(k24d)4)tXUEps{n1O$#2x>(TQG0(jWYos<R(e2pS>>U
z)*L?HDcMqz)3zG^?%lc8wWh7tX2$ynPraiXpDnF%S@G<8be77blP;UC&$;l|q+VO+
zmMiz`?QeEm(+bL2s$_UJk)`cvnWps#^&K1&@;(%<R6VrV<STzmT>md`q0~87lKHJx
zN1S(#HquqIm(%9gnfAKiX5r;LLEV(8+8Za`d*$Wx>ylf*a!=isTh(WdnJswL7OCxb
z?O3(f^y;lK(jn%Z(LsBa%(Lg0%$yi&<Q3mj$JJ|d)T;Gw>y?Ru1@jKBDK9GAt94rc
zp>pT1U!6wM{+7?mg96gxq_v$p|1&)0_UgW8B$J!5PLOSyX=<`r?d15RaD(SJ;v&vY
z3%+!7A>Urrld=<jak3xCU3*VB|G+E%fL=e|H9NUt7hO5?pMhb|(KnJsrb|o9#O-$Z
zObcuFTq@*u_fp1pBV)U$T^%R38Q)oWN+@%!XT23ukjHBC)lwTGx3ucc36+?lb52jK
zu$s?nO^Vr{Fd0o%<@u+bCcIuYIp=1$ly{xVjOFp!dCRV>njY~^l<`mQEuU$Z*lfc(
zeeISUcdluBD!Ddu(!&q4c1*gQ)*QK1BmTDB&)Bxir+b37tn8J_iJbN}g>#}@)}7p&
zXH_+vPX>jhR116R34N%M+jPHBQrxj*p6-#`hmKmFKd;$QxH9<3iLMs!{|sHZE6!Og
zi*#OA<L#W`6|$b=#MNu6A<MQcl1bfjzR<ShO8dt<lRhghwhY`@cVt6R@{ZVjX0s*w
zgigNLIAe35#*;0X%3de4a__0Aw{8x5dD9@f+ooM~{R!PrjyqejH%s~0ZM&U#o^y%R
zj%(gQ0hy}0>W7vJ&J+}!$aU6ia;BQ7nT%Rg#Tp%Tb3OCiO`MaL`cxTbnN9GX^65_L
zwxFyi$vd}m{+KLPb6GSqU;6lmlV|U5En1b;@p78KYgKV)$MUE9F8yrtE}qsq%}Zs;
zq-g0=DsOjAyp|{QGcaS>wwdP&SIt`OIV**8QNE~7w%C@1?V6nvmgwa?d%kjuh3}sB
zOKl<d4W_s)o1>b4cvIOUi;h}_V)4?@id*SHH!hy4x~EdMD5;#+T=GQD+RX60c_xSE
zEDqN8x|z6TmdIkIsGVLx+p=YC55JwWwd~O1S*uqHJoU3XQEKx%X%=s5sm6{9-z}x;
zNw&Mb-#H?}%W>(~>7!2!HI|8No~o6cS@bBo$F0mOY$n@b?uuJ|(=At<E?uIk6*#G<
z*Hc+DczJI2&;JZ57N>sA>=)bqr6pV3W>#kA=NgL`o2MsEte&+vcWR!0_@v+3scYTb
zGAD2Hly%9m+%M=m-6Q5+-uk^g!H<nq0}k#@mY*y-$LN`Jn|Cf(QGK4Y=YdS#*-P9G
zT|RoKRdUDVK!vU4CA0nSuZgwZA`@_Dk(<e~-<qH9ZMFWedB5D>==tsY<^MX*^Xqwe
z<;=-7pIlX@Mj5>h^4zp}+6?Upw=AS5Jy9y(IA_9?H&ga(>pi5hq%)4IY+ljqTOFr$
zpS(*?t@ZD`eJklJYi7Qf^twZB=^Jf6s+`rd?dkdT_mPqDN5P3R4$iou>wjuV=IgcT
zTUHiYE_xWXaF5rmsV@b0$NXM$`s-ztr9WPM2+4e=c|`w>-b6uh&Ud;GOFwO7|8QsG
z_jL<Td%u{l(r-t0A)m$450lQOZF=}f$@62O%A~nHJsmC%7i;a;t#6JwW8L1j>#Deq
z_w)%7o^y(Ynw?^bqSvqd?%i;9AG=JDg$<va+`<SiLBFR<W*h8?)!M388s|90`N-K{
zn-*@}w$7)==oPbQ-_h5JJH`B`C;5Ayyz^u6>=?CZafAMjZL#KOHfR2fy(T)})={g*
z>W6vY{bfCG$|gz$&GR&P6kB>Z>cTS7Z$TzY4+YBW?iYA);+NUWvuo3Q&bp|-eq&^H
zbL+|4pOcmyUGKAQd!LSZoSCxF)lJi;3Eh{R-n!#mw%1yNRa1{$tll+CTXT}Y)0Q{0
zWd3yeZ)%YWIK^9hvviBX`2&s?xeIMPy|}xJwmkLnx_Bj2bNQMF*WP+fUsjy;Zt?Z9
z;=-5jb;ORR?7L~P<Bi!Cg-fjkl9Ka|W#^_%dhK_7-=AIn#zE&pH~qTnc2Ya%ZeG_U
zZ{DkhN+x2x$JIVcDew8bto4Vc*A#C%yQt7j<tl2Y{Toum<$ujh-hA}a%#E`rN(sGM
zvE=HeJDI$b<a4jhovkwOv69lAGacP4y6?+d`3fF3)OfGX6S_5KWxhkm%5!(l%~buC
zwEv7VSLydFo87sl-Bh`fsiNK9a%qxhxQR&7mw!)gh3{Ozz;NUOtnF-HDF37T{gli9
zge*34A$6O@6u-($|Ep8|H}dJDy4CN#FfiCM6S+SXW;7OhCt@dTu<L2Recx{!KXGAS
zBXg8|_|9L8Ys}AsPVy?3g&o5HF+tMTj(de^sKKh=>~iHB9-GV0yd8e|_<>1&zsn5Y
z?z^XZJicgF^T#yXm*xLfnQV~h=9E~iX|Km_bm(B#ZezcvrtFW-R9|}kH^h3%75Q$%
z_ul)z1}<%wD_S~#I#Wr;`?SnE%NG2J?5~Rb&#<Z;*(6?&62`^u)i@2wSiR=Y4qhqi
zo3qL_SJ;as-<w|lY9W$wy*kXUvFT3_WL&&==JHNo#~+*JzU=<b5Xy&eRAz#d)@yq`
zX1Bg?X>(6y@gJS>ed+yQD-h-__GY*k9{ICC%A|a=$-CfyI-TQt_5L$lbw)BzDuS(R
z*ZI>87Z>lHxg6mIS%i6)4w!^qtBYalm3?y-5e@-J=DAH{T(oZUrw1m~50XW1yFxt{
z9ew#yb?w&u@k}=1Avb3(xOO|!bC%DE{@fGa?sRxHT$;=37rE|cRY|h%T?PhesR@zm
z|8&}a|6qS~i%tI2n@_*YDiu}>i#I&=d(XQ^dY)$AZ8j}#724Y=JXv(nyi;+Z#VL;@
z-K9P9droo$eG3%)t#0hRdST$E!{?=o9`4FWU23sU#52>&$jjr*H&4m0;(L^=%jF;a
zp6`76Yx1+5ev6j&?0yCAne}BwZQuW)($4tivdFH>vhSAOmk#(K({-~wqez-9dfJo9
zoNqj>US^N>t53TrSS-@y{A6)Z=aDr%zuvypFN~^bQ<>|0T6)Rs*p6FI#ESHmSbX95
zEMF-W6j^cR9CzB~l&mL3vr|=;E}oK-CAakLo%-%~wfy>5+w&g(i7$RpS+}Mz)9_#I
z*`3QOjrvMBe_m6qkDPV;PUwLxC$m3DOU~5U9=**y!Qip4a94e{&~uR~3-dmUY%%?I
zN~Mr<U#DeS^n|BIw>O_z^_JtMz&A%WyX@47_F@n2>M2+5G`N+u^!)deAu1Lw^MoE|
zbh0gc*?0H&kA2TCeEJ*Fa(dN^;;ea>?rfcTyf07no>jY^*8E5Q9XmgqU0zZ3>+;c!
zzn>(0G?;YiuhX*2yEI}oKFzrCOxsl`clOkeZ{Gz97RjE8JDM=<ti_{XtxMK^9(_|a
zSv<XMv<lC$NbLw&WRuymf16Zoi`&(`Zo3yOt(*5xB|Q41>b9NNeNLGv9WL^kbo=vM
zHOrQ^c;%30Rn45WnK$-ow<c9=V+iW%_{tWZR`A)nJhis?@Zo)TWam7-vtK@*?e^FF
zeMfX<clUp+J-_hjuMK@lbHqYl|7S?qRh0L<>&=_hsYzd^{G69v7+)+pM>b-uuvot0
z%&C8FXx08Z<bUIdpq<~+<I`GKPFZNJyU6zZMJt}yI#(j1;+|eJUZ?*_ds{@-xrfg?
zZuQ!E1zbs!Ex6E>yJgy+CDWTGJ-VNC+Z!HneVImCOHX%N`MRFE*R*t<Ro}9-vnxGK
zB;TICalX|{_sp)2^Z62;kCZK*{%24!xf6SaKQ{5)=51GR3hC>KOHKA)Y|XaOY?5S?
z|DNDWkGm7O)$W+8a_C-b%{??(x2NN1d9cx&Tgw_(d=Ec+XvUnGhM(Ehp9OC`b>eEf
z=ywIpnIC5ro-n+&&dc|TqE_J*-R&~QAAil?UieRc=NFs(VMt-4rgi-*-|X~RnMJW4
z-Hi`7U7hN?b>dvt$(=`LGUe>dF?jW4WuV&9=Qk4bq?AOOE!{3z?pbzq*N#aGcRvdk
z?DsCp*2#M_-y-B2M^;K?%+YPXbuA|UR`M?@<aV4m@Asrh)n6z4z7h5~Y^P{kXrg!B
z-&QR-7vT_TK5w;`LSfGpCZ965@VMkB<(ug>ZOV$yEams>CWnR`TI{fLXXe&DZ(7B(
z(lpK<7hAg4E$!y~s40v8Gn_kmMO*8_)LCi<8K3U%_5CX2J=@Gw-?q%xPqpvK+hU{2
zRk>mPcCYiTmYqE(oSBi+@woo**O{MoYzwSad>Z+*-Bb0#&8JsC<b+T1Y}?iM_RXp6
zD>73j@tWwmgp^H4J@hDt^NGA*PTrF9(`<up@A%Ike&?55{AxaUe%LovZhp|7%QoW0
zv#ur0vUTiwSXtY7Gwktq;Ya4`Av&+aR*R|Z@-vm(@%F^Tbls`Shg%-)^zd9>lDFCD
z`!ZYIlV3zP&#jmnGb3l$a_-z|S&owP!^7kLeJFcpfAqUu`}9}GY73YA5Pf>3`{$;w
znLCZ<Jzu+{GA&%lQ2T*PS&Hfu^?y?qO#aE@x7TWRY2^nsgDE~*c7dVynYKMTZgygA
z@vH8hxu;*5>0H;mwQV9_e3G!ioVzoYEsM^$X_U+IQ*v$0<-4xl-II@=HOO`BPM@*s
zo8~Pm?ng7@SD9^Jv-HrWNYBdPJ9(CFQ2{NV7jM{{y4lO;q^$E@jvYO>gl;9oN~&~p
zcP&et|Ig?4o&OA<?AO6VW!KSz_4jj6efIg(66;lz?r|%k;+n7Kv*=S(J#E{+vgPdb
zXe)>dc6lo$@y0CKVu{(Z)E)0F&t9s@GX2mW-|-J_4k8##wVlegtuCA1BU3hsLHReV
zL{3D+iRAOXPuEf!II!qGY*XHC<iJu7FFWBSdg46mqcatl?6G8p^Rkc5Okk8pSb&K1
z^uCX29t{2Pyp1T7jgMDsR$*}e28+JKaDycu?}^0Wa-NU(bWX~+|5N(?)Z{-yOP`L@
z&bH4oZ;vc>n{-MgLg?0;j@PMPo^FY1DN`rP-SG^}-@Yg5nBDqG{~4C{*RQtO<A1gO
zTmu91o&%>cKg2q&TG<+Y!{YPacF7%o&fVHK&yHnM`R$GUx!*qKS^Y+C*F1^vI2!5s
zv%qqi7<VY{b_vT1P}4<tFg0#OX6x%T&cB>_`pQ)G)Jp&Ka@i}ltWS6g#I>AW8MRx!
zV_MdRluIs?l~s1`T%M#FfoR19o}Tzm<X7d-{b%-9JZ=sh^368A8%smFbmHbsin<;B
z^@H-!Qs0@gcyqT+nYrXl$G1EExhGmy@M>NNc{=Ou^yO|QcP}t7Zd?IrDGwAW8Rq^Y
z_~r9w{~3(_KJMcUNsW|$=)Zja>_4Z!_gR<*43x8Q+dJePTtDU4;R}a@pMH6>;!=&f
z^2zKi)izmmj(R$p@3*q?X7_J+dxmA9mdZlC6IpsEy(-G9)SlGrTmRLKM<TQ!`C8bK
zG#!s##f~>dP0qPyO)5({Jxf(pTwGjYU0fJ`gC;O>2OR?g!-0)C(GnATLOdq+h3<O2
zL3wR<jM<f?8n-=X&YYQ-mj1re?g{JwXJ@3d^TeiVSQY*VJ9?qwVo%Xp7kBu9(~5`#
z&Zjfhflh%IJ1u(zyFA0Qm0!afMx{aL_9F!wmS9o#MBFZbveNNKhMQ@fXs-VsS&5J-
zU-NVn7;KsM;Eix%C7HW!JzLeKcy7mrBx#nEdFElBcikU()~MBe6p!o*?OSl?*1K!^
zu{Q$G|7WOAxcPC#KCv6G?Emb@yz!r5Qi{>aD3PzWyLb6K4>S2TJvw{VDvfrlM+ZA{
z1SeJ8jZ`m{?^OLT^Y@Y1G|e!p=Hl$#5qrxM-R=MWZB9S7=tp$&ivJ9s9xhM*=Q*J%
zZ{z0ax2-w(Z=AnzdehU@y(#XGRm6j$m2bZd@YFwH&93q~WaW)Le<~{GOkQvKDr%z4
zuB1B0h|^m()t%UN**0j(^ofsNx$~WL^3U6%(y9J_V&|<pw^f%u%z9;>_eeZE<+xY$
zw5A^+_k(1<pE+xz{z2tTVZ%AY6Yts6|J>el%BY~c+l0$z(qxsyAk8(>6HgndN5A%7
zEB`sHHD`(ZlR&{v!wIKWcWZuoHr=qZSM01;-{#CG;s?C_<L<_0+Mf9P$xuIfcFLAL
zy0LC=?>N1BGvlRq(S>-f*{_`E{hYbH<sZw;zK*0>R|Ct}N!R+#NUBoHc%ZED$;)SA
ztH-hpdG0zV=H{=}+fsivN_5kN`iU-XD#v3|maG>qDOFptu7QD51y@`W9^6Ts^?uIn
z<emR?9U}i_cVG5fwQ08QonR@~Gs|UD3s1`A&f)BGPMI!q*QI6JAM4de+&Sh=Dl(|G
zKDPXcxXCo(qV)9ruGurAH}`C`TRT-~=0wFqiJn=`Yj<4d_Dob;d*-aJprqJiWBxU>
zXN5i2+pn?ws>9Tn)0sh)TDf1hY@OBJ-f_k4l$lYWc;q|xt6M$)axYKQl00TCGhK4B
zSkB4N&)VVwcN(s2R`uvQ72o$brd4OhNo}XITDKcLl|02>9=n#d^4P5tf9hkNKR<i<
z^kp40+b5Z!26=Bzxt=bZ*DDn?ZQ5m-N$pY-UKY(YR<*cfrgZc0{iIjxuB?n|S}~*5
zE64D6G~3%?s}%Fd!e7m;3X5l52$G1-le%N1@^Slie!(}@Io3;GTwzytm>)Se&D773
z%`~@Zapcmx+s6{Uf-@)mKD*<mMsHr;V?C3HA7@JSY*U(X?8|-GqnqALHdzyMdF~Y5
z*3-`CKW;j?ZJzvw#rez~e^kHD7ut3Bq4Gn6S##q~nCzA*S3m!?@=cTPyx`34FvID7
zD;KRu-xy!t`uv$&!_;kyvn;1~9tyhR;-UN9cVXY8@0O~jw)5s@{w@8rM&r%8C2P-2
z<T;YE;Docj$(6Pl74uD3dYMi0^a}G-4qRNfdiO6wkC3C$!4vO?_$Iwe{bQZc=Wbc>
z)GNBHVuIhMZ6VF-bKIGG*X;1wlcf|`diYXT#goI6K5_rn)D1elq$uWR%jfgkRgdg=
zy*hCD-{(TyEt5QRRcbUHRXV!zh4O_q`>yA^pT%0I%-1!^#`D7BvnuCKOq6?gY;K>w
z^vBm%u3b`A?$wB1S<y3XI>&?>*+RZAlXkYcYin)1T%t6qROa`c)Mt90*K?veT$Y-v
z%1rVJUCZTk@zC}YC$1jJG*X#+O03VJJu_4>a-F}{qPd3Qa&5sE=QiIIyrp();?9ld
zojNADA7~G{xVp6HyYW56eSH%1)eiC8S*$)g=5NlfSv!nWHsoA#om9Q2qivdJmTAr;
zRlDWAnx3j^Q)W&)e#lOUOK(;B)mvw!*fLg3-SK5s#8IVf#+_5nX!eU7pYZWyu&!10
zR-Ox)Mh{grw%yNDUURi%>dqq>x$BHyuM$16^W>~{huN>qG_@BTtJ}GAMV8B<%(O|~
zX^S42SZvifvB!N^>72*TA9)?JUd}!_?d-Box;38cVqQ_f8Evoo%yL&u-YM04b!qj?
ziA%mI>1~$3x~?bSTvcvs?BBbxOS;Y8F1iuE&fbdoMaPWUM$=~%E_Ryf&bjc4MV#RL
zz)2NpAH_SJ7GD1|*<kOnPc})bmW64Wi|^bt@347~rq@h9XJ2{M$2l{jQ`Vl;lRmr9
zqu|;ie@?-}`!6Yd57^K>t+4t{*0NiycX^A}emK)*<}S+PH|2KrgUS~xbM=?n95T_I
zDpc;&alW?OYxjkkdZo4P(Zy4r3Z3lg{Bmi2U3T7H+nH%ot(H&QcIKqZwhLQ|c4m4m
zndJH41G~<-{;k(S%L;S8${S8k&02L_c1K}>wdVE&o4n(NTSS8TCMz2$`>QvcR7sns
z95iK*>m&D5r$3!l-m!hjiOu517QT<q#UGnl^R+bjV#wtanO9CsIPK%^k~_!g$Ju=*
zlVtxhRPG82z2!GyW?ikmsEw7L%x|qTQ=Ln`+69G`eu{Wo^+fR4lt7L<w^v+=(UsI)
z$9rZ<miC^xEUUJy*_y3!-e~pf*=CPTx?iqp>6|c8_ET-khTDR(yC!*`RGlx(_1H)$
zUQ)<o--R!F%Z%5at2aHR7CH5{`1=TtGNpMFRRya>U6xE{xfETv_N1qi;I?NO4_@7@
z=svRC>7_rP>pO3^KRmw2GqZAZZYM3)Pr8)N-L+-G;~NH-Q)c98dV310Uhgi7l$^LE
zqx4#(EnjedV2Zfymb_1E9~IWL-FoVm+?o}{$*IqnQRupH=7&!cukV_Y9<%Dj!@e-5
zx5YQ-1*eMHy1deQ@j0&h)cR(X*>|U%oF;8rAnjqY_HgvPPtz9N&JOi6p6qo(Gk?=f
z(O0pmQIE}LWJh(r%$wzTy=Tg#hk9(6JpG>k`+CUum9j<I3&p1|^b{C0LLk)zY2)_n
z(@UAdKSlNI56})>`P0|s)NYQcJGSQNRM~X>47%5K>sH^NsfP+CEtd<NcB*#sL)#N4
zx(iQt9X-q`s`=&Bi#zwdqx;Jz)XqOz?HW~LuvT4U_O8iSo=W^@NJ;s2((G-@<lI-b
zF3fx`Zv#(#K67ZMRDWvEdf8>GrW}zlSbe`I=w9N{U4FCA=w4mi%Mz`6WZvyJsaI0x
zUi>oS+>=*j`o*<LM|AY+w3$-37D;5>PRf~b>gJU_OFnzroH*$zxH@-Q^z?m}&reNR
zxn<d`-P_%rH&>VG&7QYU`bJKjk?=IRrIr?szDu6gcX1VT{m6N$@@CVvb!%?z+P~JJ
zGMwq!*{G>YC9f`XFPSOnJ$=Kg!Yx|d8gJZnl#dk|yR|5Eo(oTTw>VOArowmIwyO(&
zhB;17cq(r+Wu~Pm%S)5(dP}!#^E=u1aQQ3`PoW?`<;FuwO1je@=?h%uJteZyD6lK*
z)vtfww%y&ehc7Pjo4fRhDXYTPTX;HU@oVRO@0ylcvA+G0*ROr**(I|Wc!ZbCTmF1j
z@ze)hNA-loRvu41x36!-TI-tzi$8_D6q+_Uh-bob5w~XLS0dVTE3S&aDUExyT*oXU
z`{lVbgSOR=cdU4F<=v6ON5;8s@1<s!x_CX(?b)U{H7|6r(~^~**2mvoJzKlm%I=?f
zzS?irQ!y)B7WXc-&<KrM_jLQ5kWIT?|Ahb88k^#K?>__Q>jud($-<()qF-C(BIl)i
z3w$8;$mLYwk}KP9%5a$oyWbGxJ*O%t_`^#{ci;8eb2Asr+B`Yk;#$zs)NA3AwJsvx
z{B$e*j%aQz({yghc$})MdQ2u#y`k48Q|ZI>oo}KuuY7hAE1&vy^Wy!{Z^F~({%7cV
zY&q-emBrJ#o2IQ?{_#z&g}TK1DW8rnG`r=0p~2?$BB@tp6>YH-pUr$bXSGb!o+Wo5
ze=m9CG}l*giOxNxXpLOGO`AOn1Q%SJyDg~aeA1tTrt<?o_L`qt_Ud@@%~XrM#g@W4
z3%3|db((VAaM~3Quc}#vdRvy}?wnjbKjNIsa;L(ct+zb)^@i=cZN5JE>W#~rtv<>f
z{<KVPZI(UvR81$dACo7~?(&mdEPd3{>C=_98mIIE%WcJ&<<^|Y{g`uHa_#A@Rqsmn
zbgE3f?s?SyPxN`KThpdJ@=3DdoT}bft@`|4;DS9$cTOg;I~P~~xmu88B<p@f()iAN
z%k2tcYPU;HIs5NwQQj;Z<9WML^R=e+tnS?Juf!vMSgy*=uHE?N_WJE<_YTjhJC(G!
zZDn`Q<rbmxz}0KLZP!+1z5Zujnbox`a(#9F18u4K#i@NSj|JBIt8aXMJ^1ug$$Kp8
zr@fL20v-Ok<eQQ5jT0x5E!37jd6#N2wd(Vpb$7deSr(-)RzE&-Wu{QT;XLyj+7^X<
zPCI(5m#Td%sBqtQd+zIG%?m45p3E-0d+F`If9&>Zzb(6-`pEj^N~%hv?fo48u&;cr
z>(Q*6Ggt5R_>#dREq~?CqcYA#+l|)!c$T`Y=h)8TjWhL5<pwU%xx7(1rNVc?cSn_9
z`!)Ngb)-wmC3BXG&uX9Jy)n1UZ3o|DUPIqQm(OjxsTAqpsc}+e@{J`kXLWqPpR72?
z(rL<536-y}U)-7Z@9O3)Z`Qx5Gv{|&r&Qd#a=UoW`>3fNUmS17&DnbD)Y;h8H}enJ
zTx`g)i_c$NR{5pZ^PJif!H=2Kyprnl&O|MHqPkTo?C{_9x%ve&#g6!|>Dhi~itANF
z*V*^KZ`gPG(JQ+JH=~anmD}iDzGHTQqOtnKV}3>tuNzGFnfl1zFw^AgOv%!&!&6rJ
zs!hJO{_k#^vTst4Y&(x9F1IS4lh64%|KaM-=6SQSE?!^dck)Zj{R5SsJxZ08PHvv1
z<@MI#<far^_4HJM_b+CywYq9?OH}fhS-WJm>iQXzZqJ(RnZL7C^2M&NlkYy<<vcm%
z$J3?<H^YypE1z)IsPtWN$5V2W*i=>bjq?MS{Ac(fb@|xkm2RPz%6h+-XusOLJ?-wU
zKA!l^7JKA1FFsi{soyg6sQ%1olTB+*1qOxx+!tGz<@IvWALg?Ywt8y^re_~9<yJbe
z;Bm#0nZfZVGH$G&8l2~0dOTOROeR%l+I^uj({5dg(Ri(=+vhevWKH2&!yQ*<TF$GT
z<h%OV@01;qYzAFB*nTclbG!XCUb<(;`P@mbx?9ft`ZdL<U-M%1+qGXxa$jD*TfA@m
zMw!LFx!t+PG@}-k&+^k;&N}IPyj}EN(~`@!EX*=h&uuTC+~K><dA{dK<<x09Avc^R
z^GWj@E&VoSi>1JgOg^EYEgwGY+U|3q`eR;<J9qRd$t%6;hd!mRw^*}wbJ(gh`x|Gb
zn<<_zT@1P~!H!8N#>Hjb+VA(*#QQhp8Tp#5&yAhCci(c8+9^jvda?pnaykVC2Kj~M
zKEE&WwK(tBuH@{yw`&<2GQZ{5Pe~IGT=!KzAgV|#+-#Q#*B{^QS-T~dm*s9(6D>Na
zl6qySOv{RWBJ(4DJh9q5>C%_ZY?f1>xow;_rF1Oq+xFI`QZ#(6bhX{7`6k7)CZCCK
zT2!jx&&6>eZISBrdnTJSZ{FNplKb}h>iv=1|4H^&%$gCZvD0bximxHpMYEqq>Z`B&
z&%jmaX?DwV^MwV5GPP&^o||AEVRAO+q-xORO9yu48BbU>DPQ;bJ*S(68EsxqKWv$#
zC+zj>{2}}5JXsU%RWG;9+MFG`%e2I-V$RBwMNf7fnzPESa<Swe%dMKKsx=q;cH4FT
zJ;1<w-T=}tU|?Vtx$d=Rw(yQMS7J@4vaz0XR8mvUy(#0F`z+PQ*L_-zrk(ntwMR8s
zs)Z(PU2w%EC3S0G=GUp))^52vw<fbA@!69t({FjVw9gLI<WrgAzG>1TWmWM=$vg*s
z_0ZH`lN(<-&u|kCtmNiDvT4?vgjq#?Hdk$qscbsD{=}3wD>=5rd>2<eE|9C<cvQ%{
z@o>--letQ#i+A4YwcBrcD({(Lwp7ZgZM|LXFP<7*X<0ryTuty>RKP8nTs^fysVPgR
z>wI(CekCpQ8*B2k!{^r+ba7hRR;$a&UcS0!T}@ek?S(td{)U`(s-;35dz61THTxUz
z1iibhdUQih-Mo8$Q{HZEtxXnuazm%*%#`0NJT)&X2lbcE{JP?w0BZxg=+?8LBF=LQ
z6ONd1oV3YXGkI74kx4&%{yx&%EV5eGap%^%YvOiqSUmsVzk{0}2iE9ryi)(O;PS@*
z3@Rs5gSKeCsxB{?d2UVSTfJ@9O+y0tN{==R+)!5exNDQ&D|;dDhsNJKcAW`YQ_A-2
zdfCRRcZd4wfB$AX-|PBe+ws8v3{MaH9se^~k>zf3y8gB@A^DB-H%?DI9WHgE|Coo~
zl5OtWZ!MS<KcS4z^V-Vbjg>z>e6&=LE55Q-xp&vGbMZNERP2-Oll4wdTW2~^zGp(L
zXHI|qR!5cn3U3%BSFCtrz`&p~0UZ05s-Z`x{4VyMapYKc+EVw(EuXj_s4ichw{FY4
zCv`t1rmsD_DJo8E-Qu!$ilt>fB};c^=(qZQRXksF=JKI>CZFRH9G*v?b#GQPDk$&X
zxFGP!l$pw0b+Zocl-aCQx^K&_i~ku+Gr~?e{&N+Sm@LV&YR9Jed6~r$Gfw7QS@GIz
zo?{KmmalI=Ecved{lt;IdAXB=K5Q+O-d@PPdXw@}-P4XWTKOyas-LCJ6aFETW~JDb
z9qM{-NA6dvg`F>EF78QR%BB;!%)0HdYo^}To0W5m7N6~S&b)DcF2BX)mG<?mM-*e!
zcWQK;IC0`k+j93wK`A|&zp~k$6rJh3_UqDX_diz`Mr*SF6mdyjm7AJ7@px!&O{Tlx
zmy><@CR0Dfc}l7t$&2Ia=s7VZHg#v;j+35odM{r3ddAB6z281XU-8`Y&6;!Sb$X=*
zkLt@mRXh@Ms_IxUzh|z#prYr4ig_8=CVMVDcGfd6Z{qeB>l`OV$R&2CR2uKp+9>_?
ziu>-`w#d5d+P>4r{MD>;H=oV**W4<d;pE@2N67Z*r833Nn{yYf*;v(?UMXqtmiLFR
z<;{ziyi~UQduf-&F>RB|>LR6}UAxZI8m@_#I;@$T98qT-=^DtXHsk8ojndtcO~%GX
z&b|{>RnpRGT^}0@d8&b<8kIO&h}Z(%fw(E<@rlJ>9)EM6C_j~f(f#z-*{lcO`%CSz
z>GGYFxU9M-B<b6<V^>NwJ-z$1WXgkXN=?|Lr>txzEVZ<@b<(~=ADH{*JzA}}s<YBP
zQLeV;Td<PJjCVq}OSg)>3<`;Lb93I{z@U9)=Reg6H_Su*b8iaGb=fX(U-3%bj7`?E
zpYM7qeTcP_myhq+uD3B!h=HMh^$9J#rICl$Og~pOFVJb$yr{{is;9MElm`lGA1Gws
zRj#*p`<<Q^CI?pSfYN5p%k0X^20EQvPMfGMU;VM$TCL*RCMW6OjmkS()_NOx1uRyT
zoSEakV{^dW!zW%T$6nuTyW-QuW4F|=?`Lxq`(2Ua;Ut^$Ymc|5#BAF|=QHmyFflO5
zO#62AifPfh+^1?L8orv}?*>Xv5}0)7eA1mW|Cp4FjbG`WVZ6p<x9-=vr!ObIexm6g
zzH6GQ$z$)#TwM?EPGzqX{mEXJQmS+6f7mWT1V8ct_ltkLU+&xR@}FeD6-)7*zE@2`
zE<N7Bt8R2KGf~FtL(ink8h3h+-!a+NWViXwrKw0)59}+RmlT!TXFv7Xbe(J0HY_nc
zwkhb^>(zm(+oEz+vP-Y7IH{4#k`>g`sgjm#etE6mRh!un&ysJ}8M4bBPkXGnd&TNX
zwcTkPON%;-u1)<kVY^WEY);jp6BFDXuI^RUZGM^i=JttSGuIj~xR=KBac+*q%D}6S
zd)I6;d#IY?w!K&V(9Ue7!rGKcne3+pD-V4>;~%p)eT$X*q-Ul2kJ1Akt#}q{vf~q1
zx=Cl|d?T|bncpuSz5e({x$Lo1%v$deN9`k@V(uV?`<z>bti|?wF6!tjuRoiud)4-v
z@$y*{ZFa6y?tAU4xb3>kN$Z84PZzqRAAKl(@QTvzPg(He?aL7dfKQz36|icpY5m>k
zi*qY%>eMGcKbM&HP&ew@@((9HE4(z<U8%i$;mX`KuPdUK&T_1}w@<ZnzGCR}n`@8f
z-1eHb^hnT!&0bTFEW6UT`OY@mZT%;$^fL2&xlXNfHZ#<Ilev0DmrD3-@4&}Sm3|v?
z3l-hG9tBRE)$5Y_Uhr$*JRC<@Cw+*hR9h<j_V)U``_iv%t?Dj++ADc%sn$EA$ec%0
z;t%I+-xPj&YU<uMQ>QI7UYs^N^6t88PiK||%5Hyqd8;sId-mN40TTsTw)L*(<jCC=
z6qHu2bLQ8tj@{Z@PFls9-MZ~oXchA9?=x|ud9LkQtCMH@&Z<14X5-#7Z9U5rHy8J9
zzb8&RejLYX(M<<0cAw?A`F7W>u(*q#re<ha%v!h9T4RUjp0#STeATv@%)RU%c3aj=
zD%U<}`OzI$wIV``v>u&n6tgwG_1xgMu%y~ObKe_1E47sLStjqSe!joM&G2Hg|D&YW
zM;_ja+bCFZ+v2z(zu_5;w|4oH_vRLS?oCTeOFdM7R`Dth7d0OYoLn?(cXjlwUsk#o
zx}9&u1uc}gbU38s<<aF5NB68cdotR1^3T~-x3ArI%vgCaH}UZN_~Mm2W8YL>>@!pg
zJk+)0_Ov^>)?CJXs?`<!9p64qJAO^M@U7h1rIRz$Q>Q=qe(l=U-plJQm28_i?TkyI
zRKV?tQt7FMI#<sane%+J(_1Kv!(qK9RaLXYw|%?6^5pxTl#s`q8Y^%8OqO`F?aa#P
zq`hmaweKB2m+vl<_D<!S(V4@VCsHDf*<@BuzT4q%&82<k%8_s_D}iO#_Wn5QX)fZH
zdd$hs-u<oSx)U;c7aw{3^49cFrCEnIZ4(w;aKpXS)A;$ewi8#De3+@~lDbjj`~B5<
z$oE>vBAt((wjrYA{lxgAKfcd3`Zj&)QgL1-FTG#WR=oDCS#>%kbZge8tTV6HUD$PE
z`;M=BmsQV*bvbe;ddt-^VF`<kAsHqLSC-v$K0iw!ZOY2$GPXV6jg5`%muWAV7F91P
zxG~o2lzgM;^QVU1=37^Gyt)&0$L!jtggGq>u5Es;I;&$(^xF*&ab$Cs<D6c4AH(X)
zZywCLxniw<My1o9SgXrH-P@K+Y+ZTsu69=WoO!94alZH7Oqt0ZZ0z;&#<cAA$D&iF
z3*2ut3XKa^@>w>!aI4DMZFkn?vaQ=+>*D(O#vk|33ollbhHVKekG$qPNnIi}H^g=G
z8xdoHE7vuyxr!NUMeVF!(wh}1sBu2`*O7XpQ?8MNPu;D#a@n2v`Tefb?pB@p{aVm(
z_Ttu_Ovih#%&UXDGtZv#+_7i9c6ijx+x;(6Sy|`0G@bk;&9$e@`_jwIG;7&_*K-S|
z#Q3h{S^D*Z>d~bvuUD%~+7mrdGP2-~*kbmh!tGa1%y^@s=XLTVx7Xae&%+-5x->~|
zalyA&A9Ejn?8?XCC|!SkugQBv_jfmaFgp6_oVBdDjH}z_%rCi-v#O@uKIgl4?aA$B
zr*+GwUkJ-A%v2WE5%mkr3;BHQ5ud+mrpc@woiaAj8oz#&{b@3in{XmqW@bm!tG?KL
zZ@s0b>JCr4n#8#*<i5_d){VSdKQ6U($qMRTzoy3x*NNzIQj@o>?YbRz_Q>&R8!r}B
zzI&?kcB|Ou{qKC2U49#;nqx9=m*`Qp?4EU=DHdy;UMVg-xw7x*lw?Pth#8ww4?j><
zndaqWF=_3Yj?)L_x#!f|U6~d6!*5a26;YA0o}&BHy(jP0i{hE$<*B%R^=<daCEJa|
z1FMf&RO2Yo`FVLWv;4EJre9ni_V(C>r+b_VRt9Q@w4XG4I!i3iTX)H{m1fsc{+J5w
zl<P{`Dd(C#S?|y_mesr14R4z`mu}f^WYlqzyI|U_eJAb<?GSN`&Pz#I^h!R;Ry(vb
zc*fm#3sNd=xwoEL9qPMUbjiUD87if#Y~1D=t4zC_{r$}O;v1*L_aknGLB7sGc+sVx
zGt;(zzy3W|HC`|Isb}C+?_*|LuSAF4oVvAQ%T~2nnJU{<3JfnUTc(%oJV$Ins+f@B
z^(c#iTye=CnVDPGa;U0qn_YN3;C8o{T;b6j^VXiNO(`xg(UxyamwfkflcdOo-0RiC
z^0hBsHOgg8S^8P1X3~UAb+N5WxhgJr*)4hF;(8oMS=Hsv>Ljgvxo-Y*r>mO-FGt_%
z3p=8+DlG4`)>T>WxYN46X;YW%Q=NJ9b;?w=2`7%NP4_*%)m^WV>)6fLouVCC8&90B
zy!vrRpSt48M^1a9g=bH!=;>cmtN*2Fz3K_BYmfC$KDkzQ%5>L?*lCwHSnKxoE?wTc
z)b5g}r`ONiU2SJwcP?M+QfP}*N+DmgvaQ=&FFNaP`Je4+hv(nl@vnS-uvgjED@pU+
zwq4piY15W{sTVF5v^NGHydtgZndNhM^VT)JA)mdXDmhczWuio;eE2BTb2sc+meGyw
zB>lQmM(c1?ySWl>Qj2!i_*?B$Tc^4*Z0ap9?YE*SyC-d2)o=<|;~`m?+zW$H59pVf
zcI)%(z^gvHx4c-rYK_zSz9%d7zSZfiW1H={GV7)}Tlz-9c}Lvp8JH6oS@}(lKS|np
zvu9;zV|=DyXV15&Nv$7tHAM-exK*!D6)aeCe*MuMYa?Fqo?Z7g>FeEAng0ymzwyqO
z>VBBpxA;H9>9)E3Kh>PIrY`gkzP)?Nxzc3+%+hT;S1sx&UZ*3`c2H8i_v)nbz#o(1
zIp^<jy_PgJd4;*z?ycR|%S4L*Gu*#nR+FXQy3MNpKZBlBnf;TQ9BQwnEo(P#nDDpf
zZ_jSMs2$f_>WmyqRX5r0yUH@@PxFQ%zpY+#yU!nQ?>uSsdEU~MGP&2xihidjD*l%I
zEm;<`wQx<xpOmIo-YM0!t2kAEvL`T{b#+f{U|>|4xKqxz_u*ImCz5%N(VpSrc{Q&k
zbC301d*%}3XOS}HU7Abu{GKCj`m1)u&t9s1=&Q2Wey=5EN`C7e*(@^u!Fw*U=vtjT
zQ^-1bc`lznE?Tv!vp1IeF5c>|Ws-UN#EUNDOUWz0%`V@jQ?N33X@=>N{$!I$m(pTm
zTp!C+E;;ME;GL14@AQK+6Sb}#G?H7qzHXiMnq}*vF0>l$)L-c~b)nXsJGV=goai_a
zslDi=T5n6}rrOX;e^fVD8(-;(YTa_uB5k#G$48GPLPDFf%f;g4nI?f8((&!aiOPN7
z4qD9PUM{rutKL1A)ly+P;zD)RslkbfwpX?+wdOoKH}8<!KQ8SUqm;9m#!9MRZ>&kY
z>~MIOm#NsHq&IPzK`BS3XMAbhbbBhVCfAd-M^zUE?QC&tS<>I!vTK4%*a<)9(xY4D
z{f(zxcqcqBc_)*sIxp9%ElH8u(=ye~%&x5c;d|_4JiAo+#|azxZC>j>^N#j86U8&_
z+(NI*vpsot@;Ob&_3(PAy8WbmR-VGNb;3rrJ9_rGC0U)CcUt1q+xhFxeRdaio%%S}
zDJjg;$XR9PsZ^s&k7No?KGB&Ic6R5nwmrG3n_mk~N!|If@UYUSNh+;JrWUeJza6X@
z?dRbXHo4MmTE~g=MW!oyHUAY(oS0{GdhyM&;>@+j*`8~y(RQAj)BUUHj86I~mk>=?
z@xHl%T&9VtHqFi}OI2Ql?iXIR_vm}u-w$rr-+Op}PaMy#H*V3^(isd~q6`cSPo{>8
z1;0J3mn<x)J8P<J+T|1a4b$fB-6sMcwa9{xTDagJwSX@4@V^{kt~@QpZ~L|BQTET)
za^3W>Ke0$zC|Qv*n_LWivsUcB&bx1Uh@q{pNrB*$N>w$FL-Uv6Tjx=-49hwX!izf`
zZ#tj7dt&vI%TG)SS41hlZP7G2Z?EV0^-Zp)&l1H+A4@f-+qgfvzuNf3rW3bn*M0lR
z`p$Z~*ZPe|vmYIs=-YKO+HCu*$(r6dS9#k#xs(doH=VWqC~%8aKd4l5u1((7bicO~
zPgi<R@!fs!MuGH=C*I*rX-l?wxi`69Kl)g4)1BBno9pvm?#sRNPxG#w#mTGIa<782
z<qtl68oO@ebVL8+D#u$@7k4Z2to86&9JuXPXXfMlj^6l~?P6!so^SD<*|JmbQ{$VA
z+dHSWo|~Jg*rI2j>XCNWXsXbjjw_F1yY3riJuFQ>=2tj<L6L2g7SB32gIPk8e0F<W
z^w6xaiQPEqwe^P&^1t_mi}xRTbo=1#;@3;Atu;QOtCN+zDR%Da#Fu${wPZRr&P$wH
zu;)u9zoO2`+^wa$IcqZ=XA1Rf*L^?po&K!o9<9sV=VDh+xf2uj=2Br1)0E4BvCfl}
zc23yxxo5#2*(+ayN-j*Yy*Bwn`JMWJk6qKUQ|x1xZa3UgdS@m7?rwe2NiI+PuCL|O
zHMz5{(xA9P{M3^QFXi&)mLKP{Y|Ae@ttfl)M$co`v<_GHowxOFTD`iI<*voAylJl2
zgsD2y+NMl)5?f>^vf^~>z6lXFb0+nz(^@9$dzI()zdN=}(Snb*N@jRn__=nHx0asj
z$H}XlCM*#OnR{)LVEK;NcLuqMM^ob0vE&51r#NS=YqRMo47-uqd*++3kxbM?KBtt)
zChN9MF<YcMan~ZVOCH9Vy5e~YGtV4*`)TWygpbMLUT0@)J;CM2s`XBFjp$WXUCk4P
zQ!c#`ni=C2Bpuy$>wK@r_hTQT)>Xz?C#NpCx=}AzYmTGWx)al_<)d>o)ixDedSfnk
z)AO~{P8)9BZ}9=^9?ezk-f^ntcxXmKa_*DDK)ZFSGAn;F|Lr{BUHDQni#c6(+mA`!
z1t)#gJ9bQqo+xwA(aVL|Vp+g-_nxrMfVIm{_fHJ<aMoU4Bz@+v@8&#}-6HpDk4fdu
z-R5=2RB_96NtF|qLc3#>4Xc-gba!qH3)4Q4e0trc{wD9KnkFYxRaX4nqnUd&$#AC4
z)T6;^`&6YmxASHOzMWip-8=h}#*f`UrC-fd{IRj~nzrg*-t8(&x%&K0JG;xS%e0sm
zx%SAUGiN5HrKH3bly>{;=&a3_-d)?gEX}3UXz6vYUQXW)>%^~}*&gjJ#JROp=T_7Y
z|F_!&X2-I`u4`sFZc=4u>FRDL#$6d{^`3QG_hhe?>r8HVRNU#wS@`!y<vLZHOSxO;
zi!+6^)G$VNIl3_h%3b%Cs%P$=s^xk%R*yY7Qtp)D;i#aOkIzf@FXfn&Tc|K$Q=#wY
z9cHhkldXO(U7vjEQ{LShra~esF5FA~W%=fedWxh-k&o-9AQ8V=-P4wA%Z&TSIpf9L
zRhn70SGSgOe_B=e#5Lw8-^#*!?uIIlkND{B(^HFxk_ieul$K?Z<E6}fV$P*B*XEZ~
zc7EG&v@N*x_9|67+nT4+MiR;sBihxst<Y8Uo6apXQ9dbW%A(+ENy%x)x1_N;owMz^
zp~a^xx2{k3D^IuN%frteYfougGI3YYBG0HU=d;sw^Umd`SRdVX+jM8u!tDaDgkEQ<
zDyDdOb^Pk-Ka%`os+h@*bBAZjE`PkQ)#SC`f@$pni*9-b`L8%};!IEfF=b21Q>F1Y
zx3(+P$w%`9h98@@<nQ{F&ndc^_P;u}F8TIxqOLtB_l9T9k<k&;1*7Aol;Yw|4<#<y
z<{i(e-tkT8&X1r;cm6(^Sbca&=xdFy4GaR1@i(6Ro&xLoF9lA{axD%MnK{S9Z+XDA
ztl+ONf;euh-J~ixC1Y*j2cyq5I;DRn>aIC^yp%EbpwiK~%XWOx+&F8ZoU4r6ywoc~
z7aobqsw`QlekCO(&CUI>)P-Mr&;OBrxUcTfiPnWjRF7Tr^6Kc-QohsEe_t{1oN2#D
z$*GVGtI4Zo)Tygwr+rDw*H=0a{X+T1m6knQW_9$hIJ0ZJ2<zjY{_bMG_5D}rZ0a>r
z{pfWp!np3Roa6kKw~3M;?L}3p=h*(Z;QEc(NGajpjbnlRX>%1HrdljLlI=BH*8178
z>5?@bg|?e+{Ac)KWMpJA@0gpLn@dWHb>E}lySR>tS98pIYnrs&P*h0fxxp#v=w-Kj
z{jAzfmkNb$6qoy_{E=fF{2ce=V#-qtenfsQjrO~K<KEip$#>4KeE2G~FJo<dNeJ>u
z=qM-6=OYf1w_b3{X8!6gtLKMH{C*5R7G3<J_UL)+qX*DK&*(3a96fP=^ojxGl7#!8
zZL!u};b1fFz_?cRjaR14HxF9B(d?n5#)ZirEnK(V`!Egj9QRP<8ydp4nlCkXv;37i
z&9V8Cx%ZQklih8nMW*YokGd1Jc+I3KOXl4<bLK=x_nI@uzAy(ZV{Le7DI3Sy6+hd3
z($bOuk?A4p|E8qKl>K2*bU%=(-^kYdN}tvJnvOuu!-$Vl!nYSLeptG6$%WTm|FSNu
zoih1Pn~Uj1-ujM!Cv)^}I^7C(-5hi?aI4~?A3~3+zTN++zRUl<vP{sS9j}-Co-b!^
zsowUo(UtMr#-m23c(2RsuIIG-#i{G#SQk>Ncpo~w>V9x@Yx1$cxVy*tTdz%<?7UX{
z=W3^ILH?g41rB|yOj+}vVV>=AlgZs%ruHoin!a+1L8+eDG>=YOv*OoNQ_jqbvKCJY
zIx)+%u&|hCcA-m*bxv#P`zyMOg|6Na5705%t@EDc_>B$QSFBuH)v13bx5eIarcvdZ
zSv}t!HQTRD4xG;4{{HQTJweqjLDk-^YTF)cyL-cPw#kdt1vCFPCTAABHcVXeP4{l@
z<$}}Gyza_)Klm`IT5#I2TGlnQ3f_hGB?){wwX^8csq1f7$@JT{OHGwr$IrW2_1&gG
zohy?aHF_p&zO%#4vV7sM=IEHk3(B6%S+-@#rp#OAtGj9&k2frfk5)CADn85KB3o&~
zlqKJcRaJh3#7D^*T<PMfYG07}S>0!9_*BJb`pZI3yr~SnoBOTWR50L3vD1}Xof&=Y
zLR0ngRF{`_r*`ISG?lI`zg%&ncV2wg{X=IXR?U>#`D4e~9?O1_<@cO@H>KsPZqd26
z<v+uos50F%$I~=d8Loem9{8kU>O09?Cd~*f(N7CMTJzWTIp-aj@LS6%#BHj$!L-RY
zu1r!@x!SmOS=B|g`3gHX9V@yW8Km?$wv+3bpUs)CGTf>*f66Y$<s5P1loPJWlY6qZ
z*VCTW{LZY!!CPft-`Tp?q^j|R#gS7{t8zMT-OP_TG2yYF#a6}k`8~dqBAo*KEFbDz
zJvL=Y$CXoUAu(!S+AdaH(9BKD3Ny`Q`u1`c$E4TqlI+f>>8dSqTr=n4osRAzWz!1J
zSFgo1eobCp^#098i<!@?{ysJKsynuEwVSw}bf%JV^qNUKZhH!Kd=C`w*(ccgcm~^A
zmu=-*yB3~$C-%+VIb}+W*J9t9ZmH96R!p0rb|&blbZ6Mfpq`75g06lu`IDGwwmRiz
zQF5w4?(58;HEqIc6;?{E>gzmfHSNj4JR@tvlyyqxd|p3(T;I<)an^*>vX6DjXC39+
z7JApSW2Hr!e_(LlRIN#m4$e5)ox3@!pmWK#U-vVv+!EO`rETUTrB|<C3H@+e{XnWQ
zXthtV`r6XMha1gmjAxx%8n*D-$s;1mE^I9dJy89_eS6@WwAiH0z0t4kbcpT<ZJVLB
zV0DsAwD(NmMV4$y0VfyQHnhbB|6W_e7V~YU%Hr-K;~TA-YgNx_e%NJUG9x$X`Sq8p
zujn0{5*U~9T<Y=q2}{<x-JbnnPfFI^b^6<Ed%kxorep`2_Is=`yk7J}YGwP$MTu7p
z-Yzzu7%sS7s5xrF^3EETMUGNEJ>OL(&hK!Gjn7ElsQr2Mne!5PV%};@+H3Ca&E0WL
z|9GLFt2bXUC*O)`*+yRGk6tS)E8R$)(KanB=FG;PX1&!X)ZNRKS8RLsB>!?*vCpw~
z&CTk8QB&NNomXYIOz;LJ_x@B>ftk%4zJ&g`Cb>k*F81QnsS~x9-ns4?cX@g7t=et(
zOVvJ!HF?E2t2{oE?I`SRY@~T={mPzIzh=JpIjeK|*UKxjb&S_k>71Ep6SDE%=0v`m
zVn?cvd9E(+PE~))@$beyLHY945xR2m;f<D+VsVb?hC9-ug}n7VJCB`SxA6L{$Ew?I
zO*+*v^-%T9X-6L$eSX^4lNH)5%4^JTShQd9Xoj|$+v=&FQa0vF>ZQwy)+q1RT%;T?
z6)G@ooyv+{&OlZ7f@<H-^IpAPDctz^vBTMAPhOphna(r$#8fda?rD8`mHV12c{O^s
zEN@iFjM{f$X4uE6`KB|hruLsJW7=t~d}vEibV2%_7B!pZ{++8^PP(m~(z0WV<dRO$
zT=zeBe;CY-T7TxPV8}Dg2?6VJTqCaR%(_#u=$cT-Buk#6xI$0=o6>gdoy$4qc+TQp
zpEb?yRgPh;!C__PwtKU(RTGYf%NRABnmu9Z#EOn<8IJOwcAhl4(-WUzI6ZLsz0~cl
znm^ZB1*{HN)KGD2@W?sKBP0`dGJ45U-7FPdH&0dd)@?_lJlZDdIxx@6`4M{N^K;MK
zDUy>su2mOm{Bd!8Y;0fAz2<1NXY7<4C47~eABG#BpR2kpAk9-b^6inu9|gsG&is9(
zw^`P7OJtnup&b)``*X(DPPN?pTxb7_<Q1pyzb?3x`scD@;evxJGUPN*WS-n+om0F_
zV@~z@W1g>6wz>bgBINyvvqe4pY1H=p&-VLWjtHFVJ8Pm#cl#5`kZCzdj<H<Px2LE5
z)Z978dg7bgnI<bb{(dSru}?&!J!P%p6-lAE3Fp0H{X1^2)6D2QT4$8I$*p>lKwgM$
z+N6)-Qd8D;uXU@QafPAu;zB9&>}+ezz>sZ{;gx$`KX1Gm@AhclF}}}L-OrO(?dv^S
z+hZCaJ!>XzNru$eXJ5t79S^OQ?)oiZefE|9+`d)6%tgN$Ty<caT0Z+qytQoTmptt^
z3GdBU>?^Zc{o+pejl-b~Y@zq^18d%thF{#Veq+me_chDDTVBvLIa@UI?(EVDiC?Zw
zGph8+pRf7D*W&dY4wZ#A-ckK26IHg~=;&UN!L0RS?hl=e`_KL}$o;)@fH{<bfla%k
z_T;6SxzaBtsUC4t{I2f#sU&}q$8<SY`Q`GPY&!m|Xx?;qm2Q#NpG6_3kNB)n_U3il
zJ6BHW#P|OU)q=}|4nM#CxO8jo)eizo)1=hJgEm)l?|F6Z%F5ZR^42X=wR294*yP=|
z)H|f_Na4Cfou8U2n%;hA^EXQMv|c-Hbx3oQ)*rjahKrkP^W0Yb+G)nKRWDKO_iUe)
zF55Y_*>6?;C~>BAy2|-XqZ@DM-plWM6ua)3`_>~Hm#r7u%@lOj=WP1MrFLx<oNQA~
zP6?$S^Qu(K@O&%%MM`_l)?K<Q@>Xg8ls#(FX?R*TG2dMAaq!t|XKIgHRBwsa&DnAD
z?X{R1m-LN7eYSg<&c9n3eW)nfr%qJQw*O{ekbl$XPpW4vU8bnM)YaW0EVD&XCBu2$
zXOYErDuOxnI`fzAHDkS1wWsSe>&%2%7Ut%B%Qak>rd(+b@^}?dxjE!&ZEx0tpjkJ1
zS-gV!6SH*-PfVR@A2HXiFXD{P)H$iQTVviT`rFDbpIy0QPqWvoyO*4L?s;{bI5TOH
z*pKcSceB{e>}#3*2YZtuo|Y#?Y+L?Hv*@<hyE9$W!X}$W9ZwXJ4mb4O;cIqM{KgNF
zN1f4AOLkeLo`3kkVCA;0T|K*a+S+RN-SCb+JLRQjZTp05&t0Fo8#-(9mevW*wAieB
z{?)&I&)&s^KJ7~`eEs3(j;Bx0uq->ck}qOPaqo>&HN8t#8C^ZLS;cI7@S(L=76l4<
zS=*o8+!gbmVW+fi$obb(`Gq@^nvNPg%MH}3?)UI+SI+ur)3)!}t>7cux|70Q8>?z8
zP2)B@^m&O@@Pxz{C!RkMNIX2ZP;!RvQpwz7P7k+Tig_6HG*Y8i)6`<I+U$%@+3L0D
z^j77*cegoOagFP0(Zh)7<Q?aWgJ<|97COqDILUtA`|A_G6P}uTd)tMkteLt~ce6g%
z=VdqV1x|Xdxn{@araz$%O|Elgac+B3G~@Rsp-;C<t!G`R*)p-hZSLfq6=%+y?6_)k
zEc%9ucdYB#m&!4YdnX(IaTm?Hm;CP7thSk|MzemVwVmV&l<M39+S*tAx^>pWw4GJ`
z3%{Fw3A-}&Q&zOgYM-)@IMIEcK2=YSs0-B89Gy5ndFKJQt@EQQrth5D$C@!OKkHp)
zpiyiotDSSKm6UaKZK9FQ^7&zJ_usn7;&WZfJaDbyoN!Y=jVq~F6h7tbS8iSL;X`NC
z@znyWZy!_ePxDn1mOGTPli#COyC^F#;HR6Nh?$GqZBE^(S9faX-<fW_KR<Mnj??k*
zuW74PZK9SqyXiL@=d_2sW?L>h>6MnMaGs{?e$JI!CgyMQXn8F!nCv$>OU*aqS>!Br
ziRP@GS2#67FUw7pSQPFV{HowyVcWIkVavrgSy%*a^6rhz654J$y|i>zZ@0_Rxw>y=
zey-d2>}J%>$g`<}Q@jg0PRcA@=pFYg+AP!P;~V3(u2)`X`8uE0$~El{+V~=5!mG;>
zSH&cb_JqgPPJTV*=hc0_x_KYOx#p(q(Ocx-vsbXozE)jg<>aX@S-Ph;%!qB~m6Vd6
zl%ZUxdE?AB)x0?4{|rx#Sg*}#FI_u#>qV}8`*fJ%CEZeaKGxW1D&7CjaHE7@`}wT3
zlO4`}KEb-`S9`G0vRkHU=D~q&8?6@kv?py?rhH<?s^^|urQa)jrs^(MS+VOvw1)SS
z$Y=h_A)k2iz6Y1uOXh8p4cy4iduU>@DBqrgD<7BsyXs>yU%q3?qpI#A-CajgQc}{F
zI3!3dotov#=5O%i>CZM}Z=t>I89CV-+j6s>@AJI-XuH{?{YqM%)rFd>qMLHAWEOYy
zH;HRpi8<uAC}qm?l~L_cPfJUquTH%YCwKR3_WS)3*Y>3>4f8Nr>X+k^UAgFi?H{2J
z9Sd#*<a{~7)3@`l_myhTt$t~rH<TBy_#idss)<LC%;cmmDR!S?LblEKSF|~zs+luk
zvr*({vxlPMTh|BN+;(`~{E*~T?OZP-YcxK3r%&d~4iwkz-(JXabESXLkN%QEl`~g7
z4J9K^op)t8V8rt}=uLLq^ziIS<_^cg)7ehl7hBn#n8_rZ<|;l_@>uM1vn<<b8HwM^
zZk$w!@%G_;w>)*_>Vt|WuJ){5QC1fAG39pBkH*tGCtXQ9al>cVckOv0nJm?lPx_>&
zOo^VT&@(A`Vo%SP7d*RGD{fnQ+%WptW}Zd=SazpeKNsigaX00Apz66`?_gFTt=VRG
zA1M76s+h2J=0|z1?m*G3qfr}honxGdbhpH^z>I(M9$R&mt_l%RWmh<%tft$%X5XX8
ziq4v>f*YzuJ69XIOV6>%df@l`@x5ovJKia5j{hlC|NcRJcSeo-ZNwqQ?MG_%Yu88q
zXE;-vQdQx<_}*Uu2FB-eLBpPsX<O5N>d8-h*<-~mcdUOxYUkR``IBz_Rx(dKS#bMq
zrm=h5E&fhx`SAW^BgwVp-nGtq+WSu}z3^*0zxMGp>tA*G#pS9@X1ub@Ie%87#_dN=
zJ7xuOo>RW5t=3&CwsEng_WI|aj>=prOJD9#+4g){?d;HL*Kh2bG<9yOoVe#4-;mXx
zI{q^xo{xSRth>#}Vyewe!<~X_HR7JRZTqIDea?FAt;Pu_PiiP-GV`ZI++FId@7OzW
z+p_RziwldVpD&&Et#F#}(X~yE?{YsZ%zmWGS)%0{yWaHAORgo6n;Je}_wn_-%4@me
z;Jr(yH?5zYI$`a~6&5`nZ+5IXG*LCDcB=Zp<0o!+s!bL9>t_6W-o#lm^H*Dj%<<Ot
zws4NnJ##DLD4W%PhCe-OiaS=u996isNY&e<K&sbzTlIwBc3%G(KK~1T+9PUH%QfYS
zy40lbqFv9PEV{fUYR}dkt9P$BxovIotYv0%HFuc2Tl@UawQ3PI?S+!M%e)z9M_To+
zoGEfW^X8;CHk0{XU6?8}JZEnB8Rq(|B6^}wY-GW_-}1M)rn}{)o-$suT;<s2o<~pj
zoLgkwp577{c&uZE-sDpc>Ui1$ZtfKI2nup}xoh8&71vy<*Y~a8c;i*T(JN=g7pE47
zt(d#ymapq&Usuz0MS)Fw!m>|_MXWFvy>@eR|Fp}O9@w0`xH~4VXic}q$}49bceGDV
z{wP=Xo^S8sUct~kd@EhmMB1Wr?snzQ^60!=TI(4uDfn(%fA+q`-`LViS3X<K_B24-
z?x>Qv;EvO-PQ8~j+z+`d)n&P@6*SFl{pRThH<moMZ36B7an-cg8Tjm5TIA33(`TQR
zSgBp(=l$^XM>k*5KC?Mf;`Wxm+xGYPvI7aFmdhMntyxcO&Q06ny|X`Q$F7oxe<wR-
zO_{&(SW-&G$s+5P#h$7`s`|B4(wdd^B-UD--=wqIW^ZThs;U{MHyHi=Ix$<jVB6fK
zZ^B}4Zo4}tG*8Im^hz&F&%hc@!{%?s(r2%%l{|Wg>+Onbo3A~(^q-+ScGh*(54zz~
zPiGeF3cL0@T>j3N4hH@SRydExAmu1UP|j-Ge?+6mPdsN~uI8efD|1%dnsBj(v(UCW
z|Ig&Rpi>7Xh)z*9&D36`Dyo{67G|+5(s9@0>G==U?>nll?9e~ex5WDW%vlxw4?H`5
zm=&z4IHi4i$Hfo3E?vqfU0QFN5q8S+pR<zG3j((Ec|M#|db##|zx15Vhf5Al36Rg0
z3D#L~xA64#50l?7m~`cc_*3^FS?EazM{2Y!!emk<-`>gi5u2H5Vs&Z%tS#E9D*qW2
zC-Ug0PxCte^YNWxPmA525w92+k<J<@K0BXl=e(Mix|^DfJf+U(smgbB>`6<1FIYOm
zY|?J!#0!?s&bP*#w=0=;({ZLsZ~Atj{v$4N#^7xZCD$&dR*G{ya(-*0TKX;I+4(t}
z{xdA>TdZHd_OR#l&pxfkn%NdEs93JJT<yl&jpt@8DtxzJQhmjwoARGQ=RK$i%#EL#
zb?-mJtiRU2iEgE4S*}*wy0uU22p5vie5k#C({i(#r_=xPluB0r&6D!kKPyGgM8cGp
z`KTqYrP=Y^zLKND{iQ!n+PX|T@t<K<lV_b{BI8wmUJmQi(Qp4V$e*5Du#hAB)&T~-
z<^)izBJinQ>UG>w=0Eoz-(PNE8aTDY?~j;C{^$Dsx_1eTs~Q*xPs+1u>s)p>k}ka;
znEumD*~*)v+wxR!+oD;gc-?Nr`0RMURBc|;x9F7CH*0&{T;{G@V7j+kJ=v_<X@&B)
zy=%gCPHoJ%sNlAG%2Agu>e~z@jLl|OUXLz(qOs%px3HMZ%c}EU+pS$DbLmdy@f53-
zWzr{3%s3eqb40Q%*dv$IJ<9&>Oq01M{^*|hqgcqWTkXxMi5G86dawUnv}Tr%$@Pe<
zVb}Mo&O5rM+q~gU?v#Z~R=P|zo9wu2xk%FoPnY>DwdyavIaX&Ki+i^6$&7VQZ>>eX
zWgQQWd6Maony0C)G?(+K!uQhj2eZl^-Cmz$xT0asnY6S*YnRp1{UJTtrYAq_6b<cI
zu37TVweqCEg|v*^OS6u&NG^3-D%>)uxK#VljWaE4`KBbzI6KWc*(+E=CvBCli<$4g
zB$3OnGxKy02TEOgxN?%6p5Er5p6|x5b~1EtEOT@}{8~Bf^5$drr<4i5PQGe!bEVCs
z%;~a)YLgy5dSqM~J#)qBB)vti@2xFg{7rb%@0`=pSKpi$nOJrw#kV`;toIs=9EA(5
z;%SbxnRddjZ>G3^;FVx{6FXIH%W2u@oyOB%PtKdOOQj<0>{+!bx6f`lxcS=(Cy!pw
zz*&3Wo?kI3)`?^GwKHxuG3oqA3S%ZM+h!fOIP(6pNlE?_rYz|$pPjby+DX?(ccxXp
z-T8H9uLjej->00b!kv?Srye+=ykqmZIX<g9UQY7ccKB}3Wo!51X{|Z)yh5gG+QhgX
zy?Vu!?_Xz>d9}Xcf@^QS^~Lr?6fXa!Y;s;LXtC*O=hJUyefOD`x?h}esal=Ngv$r?
zj^3P?5@i*>X~|yxprT)HxwD+ZC$0`>nZ4a)+qB-i*&C;A3tM){BWST^^X+8ao=K?*
z#|pG$-0oewHGPw`by%W%G*_hNvgoe!noG7{Jrc*?-*I*4^gTKW;z@6oK9x!f-|l^4
zMRCt$p^cL+7e3$XwW2NfW`1`g-=biCMYVaVf)=*|o*myVcf=*WYOcXijpxCq*CaXT
z28DVq@0ZN`939QS<8q(o<$@{OWlT-nvVykTsm|JNd?(sL@952GAtjZ^3Zuj{H^i(~
zI~Lz+m2)D@V1D11j<u&Eb#JWnSr(h>Q=y@<^w_lw73Ef!6gT_S3>oEqpSNyH(nQ5W
zh1DYTJ{ex$YgE$dp<=iFvXb(e3YRHwrrpZbywRyLNw;kYQ?cR9Dai*mu}dsmw&QG$
zkwxgW*sZgzZVE=v_;t%p<IN-`_tZ&_mYdGTsI1>Ou_DGTb58@uPVpVYG}ajS9`8WD
zbmF6Y&h>vT^MC)4e{@S;@YA2Qf7Aan7=E4j@}XUS^%n;Q_IV`?3=E;gwHAxd=NQh`
z?hKw;F0%FQ5u=_hs(Q933+IVjOj>*K(5wEWhqfnn{Yfd+vg@5;W-l(@$#?B*#*vpg
zFA}BKZ%TVod2Di}U(Ut6O%L^i{W-<so-7P_IobCvZ`jOB<*Lq;zopeO?z<9id3ycK
zZ=CYUwK3Dbx$pn?vQFuh=$?PIeQoQRbdo&{VzPX^&aAo^B5}t}#YL0n;4|e3Kacc1
zvN)^x*zCg1)o=YfI@*1d)kUYi-T%{QvghtazmB`iwdi*)?OXI-#p3#o+B9KRchx=<
z$EM}I%LQuYc0M-0eq^z-=bO2c=Iwp6>5EbCZSUQy-t5;|KC|9vR?PO*I)bv*VY4c8
zmb%>gvpi@X`=qp*%%F}P<#J&=uJlHpG08SxfBxA`pY@ab9#8)`wdgBT+NM7<S3Xvo
zxvDlx!&_zIrj@*tXD>WtE|$6}r_wBb+H=WR+hdM%)*5DQU8ZKqFaE~kM4EQsq;>3`
z=a?pJpQHRta%EbdsGutAc7cnRSJoE&%lcO?dRDHbs@eLoOm)DUkJa8cg(vLsyBN0N
zcHbkR_Slf}2$Nl2o--FWKAm^cWZsk5>O6Oc1ur_6&N!+g#pM}ZxbXVLy8$LsYz|5u
z-(1RPx3#u=?UC7;D)JrOZ#&}*=RT0SysGQ^ZJF?ZqRzDeDaP*;lRmv(9D4Kdd2^o0
z8`XT0eO46Bd{{6yD{|M=SuwY#9^RNK`8h1?uHbvmvw?1bhIb6V?wl6t-f_EZmBz8G
zNw+iACA?&B{F->g#p{F5uAaR<=6av@t6%IBYf}nbwl-zn`uA^MoVPzYwd7?;UY^x|
zhSyEHKm2A!KRmmYEo@y{Bx_8CZNu5x$)&kwmpC=OLZ&M@Cw__wn%nVn-p`p6=N*fY
z=xe+3*iyLCcUk_lr)C1L?k-(h^(k4qY@c+us>SVmqtpHtOQodm9-sDlezNDs_eavh
zm!JOZRkLrqr`)L&|G63uZXcbgq?5AuR@kGr>K(IlbyvEp&SMT#=v7s<&b_*=R5)sD
zw8Z5XQ4Q~k?@sxB$L`&oRWj>4&Qu5~6im)DTQ=d=H{*EbC7;_SM+z>vrF$anXY>@c
zR~Bd8=Y&m*IA(tPYS-4wS-z1c&E{TPIBnL}Ylr=3Uo4m2*_kM=amDA^&yyv6k2j{?
z@a<X|`t`iU)IF#D7g@y=+<D*>^7vKh*-6nFJ!j5E@w@d(EIz4a)Z@>#;@HyA<CA%>
z1c$qJ-JOztTSa$f%Czm3vM!5WD?iegkDPd<r0%q-(!JZt$3I`*Tew5VaW?0f&R@b?
z^r~VWd);1MnJX-x>@_RsLhDSa6K88|oXP{<7=7Pr%yD$%pKYb8H7CA>iD|Y=Y4@G>
zN$$%?@B9|~z^nJLSFeiK!=N1@QwvsXU-{Nt*Xv8kOpRMnlegq*+=}N3in^Y5?cTK;
zAsV;M;#8_*Wiq-B%V|b6&pLg4qvwl_%a6^QEwTEs#Z7rtwJioS{T26qcZ#h|i5B1E
z;=HflRW0ee@teZ631?R?h`h76Ty5dDESawD^9rWDvY6Yn{3O@y+|GMS%C7?DBMxqS
zTX$VeGv{-6#I(-d)XwNQ_wJHiLajnRsp`{8-#Sew<1?5P)%j@3ttoC(@^U3-@jZ(C
zSKjudXk%y9fp39gI(r;d6Rl*6^9ydAt>j;rE;;S?hYP-wcqU9%kutmMmL0TS@6%e3
z9igk1%-T99QRVC1-4<JCy?YmLocDH<sFinA*uUCuWjVGQ%%>x#ZFn_x?d==WB+XSO
z-RYV1Pf0!L?ZqiS-a2HNGjqJ%G*Qq)b=S)s>BnA9vXSlR^?bYL%GuBz^<1a)K6!4)
zy0>mc=qAH?7bC74>v_-7))tw4&g((Nm0J-Z>|&GddPT0(eEnf(XViDMJCAG)nAKn0
z4Yyj)w=b{EJ-J}H(eJOS#nF9J&0G^cN(G<D+;rniulr8PcZ+YF*rv2PYo7nCuvemQ
z<D|>Qro~Ks_@S#RJK5@!<kV!<m8?g$NUr&rlF}3}6tMVC_ga_IXYbWTPM%%$e1Svd
zzWth4PJY{Ha_D$yj<Ra^=^N*tyk57?=Vs5L%=XBhEwfc6rYza_aINpAsZtVla*^*A
zoi>aOyShu){;1{$(<uAs?gv}8YtD{&%Pbmv?bki8s|mX;H7W&tb(Tz8(WNT5)-5}5
zQjp$;w_g^?y6oaFY&iLHv(3D)wfgMSCn~&(Fg=$SXw;X|e&wRZtB^(6)6!0b`4=j>
zJerU@@%@2UQ?1u-zbR+Byy&f=wu-}6%dHn@OxPCmD)8Kl(_VMEl$DjeuEg`l2TpvG
ztkd_(`gB^@&T7WzTyxsPRv4RZPG4<S{`G3)<%&azUaQt+JUcRxQK;gH#bJ)XUAMa*
z=K6e^e89!qx$AEFEbl98)6Q@C&!F*cR+@Wg__T!UWiiGw$4;*3+qhBHG<U=6$zkWY
zuH3#m?b5Z$4{UD+e&`jG-8UuliuZzV7x=ZG-dZ^S(aliXI{RtQ!c4!IS-cgUym$S=
ztfhIU%`AJ*2CwpYzC?3-XJ&}n?MxdFW#xy(`F^*(>Ucv=ZcLv2-o5Yf+AXd-cPv?Q
zW>Wm0nK_zPd17yMmdw1ye0<HbnWi?s*1GMysyW>}l5gI(WZCXe#^Ve28y!-T?&V!%
zHp^so-=~=&OIJ)<GHY((kx3o93}4^*ZF15u>&<F|Gl?gTnwkc9nw<?;bu?F2yMK1j
zxxL$!W=>q2sprqML*~OCBW1r?*0+o1JWqVRKrZLf+{m-<zb;y}sP<i;z1*p++%*s9
zZqf4CAnTHT!#VG|sp!=kiksAZwccC~ocGe`p*>5LPpXAg-kCiqOE)kuEKFcv;J<@>
zvh41T{|rasYnILbF!5+dt-I=f25-6l3}KZO^H=@e%fP_zZ~)dTVUqgKAh__PMeL^5
zEz`QwmTp>{`E9>ajDOtyFY{j7*Yx-_tXU|z+A~&AW2LsS@y4S@yLeHS7kou5FL2u2
zns_X59+u?=*Ii8iDLYRp*qzFqsVjEYJJs#-+d7vW^Y-q4X?v;er^J(alE-7s@BH|<
z^R$h*kEAQ(cQP$PU&pY3@w{1$*0ajbOSYZjIjNFuyJJe7lG4YS^Fdn_qCI1$(sfGB
zZ|$N}pY%kJwBM@mdi`eQ+4+{K{~4TR-QxZ;M7K@We>Ri5myOri;iI3kpYO)o$#V@|
z9^b7q@()zqWd97bKR?2uH2AO+-=}N8>yLkW9;4wT^=$(K^MV_&xJ07SCatczg<iD&
z5OHz;*?+dbcQ&xCVqoA|HlOv1)T$Ywwaxw3HwtI>pLx6b<>LpQ>b1KizTM9kd%S+9
zFZ_N2sR%yTmY&cl^QSY<Oib7si}O+eKhU)UjaCD6H9^<2C1=iTIq$b=uE&-IOQ&;-
zES+1fW>Z~Wxo)dy>z*yjU54U<lX$v|mF}F#J*@RI`G-!n`m_HGihu9?fZp67ebe>r
zbkBVAsQ0%s?k(|oS-Y%yWySPT&Lf$xWb!Uuo9Ciwo?V!8V*Q4RoR1#m87_6nbBkPZ
z;@71V^TeqonZNHw|0^)w|L<ys&WDS&`llQ9`PoX#HL>3W!CC^kfPrIpUeREu{C?)b
zs~1+>OnlzEv*50Y(WN=@Tq;|ZN~)@)Rjc&;>b@rkKaPImr#tK3_UwMOK2mO;aID_<
z7o7}(e|N*q@rNJ3-EVDEK35F)(ci|uH!WGZGsZmkm7-s4$&@o~D-AEaJT_sHk@wnJ
z9Y4femI`WYGrIHZ*P&Qdo!=Z~pI0e*&W%0u#?tS&)7{=P{@io6&dlpmdS%<QU2n1c
zam+)+lZw|qI-(ml`DLnXXs@W=tVdhFZWrQ;?XlQ$aaL0Dla!R#XP2@}7vJUaV~Sh#
z#L}<)Tjm;H?%b(f7HsmLLAqYSI5l=_*~(U*yX(KZpED`^A^pR)j(_ilDR2)W&KLK$
z{?D*|rern8j9Kx^CK#>!c}3RxnDUcty_%iZbwk`;Uiz+><mJ46l4ZB&Bq2Aiz=<EG
zA8_)MGwcmdUi(P&$JdlBwWM#{r8!4(1qG$s+b8)<ED)}-jj_(%9(MRQ7W<r6%)90n
zcSvfwU*Rq8ulK`MbA?0KUcT;_yDX|RoL9{)=UDC*^;WH*psYux@(q)8C%HIJ_AjaS
z+j5BO@uL}Ab1Ek-HWoT`enW)IYZa5p-nCOcXL#<(R5sGnTcj^`=p?qo)#sHjdXoA^
zA>pOk>`NEFoA)G!%=ntMX2!OdtgdTHW>1#II<GABbmHiBTXuQZwnsPir0A@>cHQ2}
zcJT*gOEV>zJe$@nlS`NAxA$s9PJ5|Zu)GL#t>PaSJ9w(v0QYA{sqn3nr}LWZ7W)^-
z)V;c`y71t~U$?9O_T4(9nyhnj!|mFg@0O}8uSx&*tj<{U*Id?1QKoT6*B;ckx$MM4
zL$+Ejt;cF>kC;3@msdFHroxr2cAFzt7a6O}J5v3=0B#AE$n%y}y|lU~f5+`y<D+Vi
z|2#SIHe^=BIp=3fRi><;v_#6HQ^m!7+Y=WTtF+8>^KRS^c^W36X07(Bb)RqO<f}P)
zA#C4_9wuJ)m-f7IW}D;0(oQ|h1M?5h{m*dw%$&WgPQN|pO;7mpM?NtB%a3KdcADut
z4xO^3o73xdCI6fq{A=@O3W^JAY**4fapbBRdvrYKk`$w<QzIAL4Bjp-kSKDJEA^;u
zbL6qaW#2Qs0~fb$`7m?l^vIcW)?;RagjdVf78%VEx2kPSDVVeNL_rs~N8UN#S&z@I
zP5pM&(7ND+$yznHmpRe;Z+CUCsmUoh?z{Hxvbx*;g)7%3cRvf(bM+|~joN$S$KqKN
zY9`<5`0jR3NoX%-c5k=7<rjDO*_wE%kd(vk>bA^uS$f`yOSwj4%I3h8pY&uq51Xzz
zTb-A-G_N+zF7!s5eQZzo!@p~jLSvH!ZzOi=Djs=roO9b^zdP1TE=(0W5-lk>?Y6*6
zEb--TvAt*c!^ah`=7&rT2$w!RW45S8Vf3SrB?~4@KlAO*9Th>9>EDjHxb8b-BsTNq
z-rU~1N6Y3v4orV?!tC8s<Df%wn?2v}N?9}U$j8$8L1+G?sUof&KuY+9`I6NfOc&p(
zxi*XQWcI0@o#Npe?l$YLN>H6m*c7kKY1dER)m`Z6HL2-?$0aw{E6SN)PMfaX)t_m;
z#BRZDx72q@uhX24UzYsxpTTXbam4K-^Zzq!J+k^!cRywnCEfX+Dd{0rQtz4UzP(-E
zd!yKl6C%IX-f~q*S#;pO$?W3_ozhl$QUY0z?{~Lazu(Be$5;Qd)vp^{E||QR=w%Xq
z<JBuVtK&s#`wUOj=>kpeTw%97I=PgTJp~1GRIy#kAb9RS!`o*@-vmpnGM7uNoick)
zy#L2tm7h~~8~Lmd^>MF!$@iaOZ)vuZr&rW}hAE3}&)+u|bHsY-LVKX?n%NN(u;0kg
z1G+*1Yvs^Z@m?&YPZ@s6vzTJc;mW?UM?s4{Q-yRmbIV*xH!W3JY%tl#>qLK^%8Ko`
z?-?e`ho6LB=)C{A0|Vndv;lj8>pyz=JP*B{qPt{Ew5je?Y5%Q1LCbIdGc5V;`TI#y
z3TrCI*(k3UovxmdGcSGl&!C$Xl<0OMaVM+m{xo};->RE*_Xs=rOxP+P-~M1?y~)u@
zcqb-w6pY%W=E%OS-?v-xSc}5D2@H%oaUbNL8Y{e?wO9Jj{m1ur&yZSyGP%HT)d8t7
z2xg|n>eRFLru@18#D4dDkn)Is+)^ih?mtmqZpQ#>A1;4in*Z~_x&I8mLl>a)L5rTi
zrVI!%gni05JM2}Pe`c)MIXl^~X-hY?U+~^lFvT;-)5U#DvhFv-%lEl;KDjNvocA}R
zSSMC(-RuuFJSMaJm!JG#!XWvdp+;x(NvVF{-CqOVw@#~64t%-2Zk1H5-hK81nI_+V
z+Oah-+an}oKWsX$l4Wc5{;IS0(%TPy@T$%G&!B$b0z>*=gv>s%$S1w-%kKVKky*04
zMxJrW<@-;5NHOsIXF$l5A3AL_amAP0`$Kv4rsp?5Fqu{TXFjt7<NWV%3BDhZ>61+5
zW|zNOxb)K74>fF){p%+`$Y3!3hmb&c&16^XKK_PFneRW<@G`K~BP6UJrq1)YQk8pu
zRois$?FT<d`P%;TZ(P7|{x@8r9T5hXUY7l^WA<FWe?UWL`{QkQ_aA?)UoAiJwnonB
zr-dJjes-u9e9Vb{yYKcJr!TJ$Ej}%~KW(q~p*338E)y9TH!gtX8aCu!SmUam)r-m>
zY+PJF`%m@vVg}w2m@*j67JBLUhlp$bXa70>-uZD=&+4W94>qouKl@Mn_u>b<A*r!a
zb>GAOGtA+6|DR!N6ovqUDOE4pkXVOU=qM<GxRkq6q;5{#iBHO(7BDb7!N(O~en6sG
z5%>Es&C-i1{l4#*om=Y{4)BJztKbc9hZvZ5yn|~&Dbldam%<f77^ikcAc{1Q@{NDQ
zOecS?pYZQp4dbew)r<Y_y}JLoaqfSHZ>t^A_zs{hF~k^jlIvRXiW%!T-sa?V{^mQe
zqwrY5@y1ZSgU_$Ed|Wf>PTHOLpsF)xw%u^qz4lh#Sz$}Aw>f^+-_m_%yj(J8<x-1|
z^AlFIoIK(5&2!$VoL$@O1*d)en{sqh%xt^M$;a2^WqHY*R^6W)Ex7!gpq5mp+P-2x
zlSx5)<T-7t-?+J7GdgwZ&#p;ry>Ctno=kaCGHX)jPu(eAkw#t@Sx&EXZPq=WapIQQ
zPtS*$Y92;6e*HRsa(&NwzcmXco!Vx$Tj|^VtzT4Aw+H@WSNUyvGU5%Z)K@LfE0gvt
zmR#~LYhze<^QKCr^|Sm<CY{Vw+jiQv_EJ@h*I~1rC+9VW2kQ#xSgUP{4qD_9cZYYk
zsITzbS)HCnk8-vvE9))3<K=a=*RFk9z=QWsJl@{(7kPHo&80jnC_-gcZ%wA>Le-6u
zKfILfD-xuZciewvbEaK)^^yE=@ACT2Luoc$D|tL?S8VEeav~&MV0v_>W^sFGrhgF6
zzw5~sGTMAss$_P{pDkLW^~w9k(mlI8o?TI&S9)uQ=q91VcRh`7K9@1_oH$d{ZIZIF
zg!hy}!&M8XTHMXvd~@cqZ(mlM{tJ8^^v!spcEZw*(x+0Zj4Ndx*4(+0Dp9lYo~g`Y
zTd!}P8mHcTwUt|(7^xOn*?cM9RO_?!)3QrS(UCHBiFy4TmpsG6`&-WE3%*HmS39X$
z@Tcy$=80$DRAWO*Pfz>Lpqp28%Q0_h!Gd1*jp7|0XDZkGZuE^g^l6TEK>i1bv#S*+
zoAz{`5Psq{Ytn}J6tkU%=d?JNY}m4pt>)+?kMp%{XD{(>I`!Il^Qu`{KaHPzFPw3v
z;^M5}b$cGoHd0y0scfoo{)kI--zke-TMa(1)VSsKR@?Mq>Gt^Kw|>XZ6;0oKLq=Zv
z;f9h`&lU$BGnzC-HGQ_q*JCfWms>1bw^(Ck?6UJWt-dbV@M2@m)IU2}qi1=#SeZ}e
zbK0%@blypSPP451^()S%Oj7>%#wy{p9DjPzU(xuI;1t(+yN=)O?Y_BNbf-o**M*gu
zeL2%Pe_Y9as4qG1@adqE<$+vBP5mEkJH7mf?u+{iOk7H9GUuj>g&Y+um>+Djbg7&B
zgNgISj$VoCo3S&wH?@+-drot3)^2U14HM3W#8@7G^<>lQf6QG|Z$0OkuwQb;5|xhb
z+Dp!T{Th$b1KpN{gstnF+n+PttL)9T%P)4i-?-;`_@QbvXIQD)x=TU+56h;QI?cKJ
zbEmuQn<?+sOGjMS*_zI{;<V(UHF|TiE<V565$!N(#fb^8e0`0cOxc!W_G(h}(yeQz
zR4-5Z&|Nxh%>z@(WQ)+&%WI=-ej9RU2!%=t$$G9_7Bgwu*|xhu-X+_%PU<gt6DOG8
z^K{G3T(SM-(MvneX1`gRv*7TFtklaEe`U`od2t3y&zPr{ymH$~*G<=^^LnUE^6oyI
zx@Bi7-@fl4^$C$tkxrcxuAE+0<cgxc({<AonEGoGuRiWSwAJgjmFb)7Yu`uBd-65@
z{Ngw3Rl}bBXLvgOxU=0()vRvaEBhZ;X1=aW*?jcXS&OJjbM{hsu5`O}?>qZ4EnPa;
z)9%*mo>O&GmGZ5rv0=UD{$SRQZOXRWbdT!=uKHmoZ~N+W=Qlo&u9F_qX8r1%#>%@`
z*(j#fE%Q*KnoFA7!{tTFMz53&pP#&9wO{R1-tMi-P6mJ6^d!P9wOD1@<E=bP9?Fy)
z)L4FG>XzklO6KKuN_KLK6bm;^u3v6cDmi`ObHODkWpghC-AelL^VWV%b;%`VMn4ws
zxvlLr`@zj83wH3@Xuq-(ddjud?CRE6x%Sh}-Hy6;>)y13_9+DhEBypkJi8+_CD|f=
zZcx{X$vtUTl9tWgbSf;US~%N#+ch7iW#>#C&u!T>-H5rYXVI*_+Lc;LS2CUN1WFdF
zOg6IZIlm@)^>gEN)^3gy6WW$+n;tMz@J`|-W!)1~Zg+ZJsOJiCNqs+&-_S|dt$SAT
zO}#}iS-h-k)^PDuTmE>aGh4ar@Vu?NPA|D5>1&p{U+kopP5lqKX^W<GUfGbk?a%W`
zZ%Q=g&KC?!S-)bBdSsBa*|j=j&xO|VpS+J)ZKzoJNM_2`=*7wJuic+y6Q8k@pZ}!D
zODo>fW;Zo0UH6us`nlui+G|thZd($P_w~46;cM@#Zq4s9w?tLV`rVGNNjcV%<!Isd
zrfo*Bh90X>&)F20*z^r^^X-nrPA%r0-D2^U#cYj^g;b$umi5||$F6y)yxC-;=>9-*
ze)v<x=QgiqJ^2)yv~FVJ%Fwq@v~w0XE;$-Ilk3LrxZBlwUWq$0lq03HOgyra?ypsU
z$hl?vq{O06N1n`ZeV+1kM{J6R@5Rd}=SvDl_Po|}HQnS_R1vZzXL-n!S^jm#H_jBf
zPti`#@{Bawmh*FMZGP^w+qpNYuhvd{^1N-imGR`28cv?EHNLC#B7<A<7A+6@8FT5>
zdR<@Ju(0i!cPa&xjE!C${B3n?ow8|UU!;HJ;gWC8t2Ex|TyRU>cKXJD28G{#*L37q
zU+*~ed*S4C*%_Zy4A~_%&nUd`sbbc%*@jnqXHJw4>gm~Ldt#g6^oPHa=eS<slIY%d
zqOj#^=H1kinOh4NS)INVT^Un&_N3vxoHKi>l~Z%C?>lrfyya=J>3!3^>FOs}XipWI
zGCL-skk5AJw%c<ugPyn**U1Zhn$oOiU-9j>N7BY1uaCFhZr-?W)63Yo&()MC{ABBS
zwm5RDROS2m*~KkESKXpRX8$NQ&#eqSopUzj;KKXoj|o-s+~(RIotCA&vD)(N@xlzR
zOH+3pDc$Cl8RY+VSI4j8hhE2QdDL|7+Oku%=gm4U%r;Mcyv<12>5-#F%u8N_WwSxC
zd}Wi8-sT%=fqmy?ukkw0O`eby%i=Aa5u~);wSC<&nQxZ`r>yj<-ulVx!=wy5alviJ
zAK7i>oS(33rIp|MxJN-nVvoEwH$C2>(R3>6M0S_7=9J5N7AN;i_m#;|HF=n`=ysmo
z#yKB<w2D2LoVhhSKIm-zyZuggZ_K>i@-29hR`_y%!JvSam6xZb+pIj5*Oh7RdOWN$
zr^xHH&7;ZbD{j=z)Y-0Py`pq^khjx?>ZP7WprZwp6`kUCXD%*&a<J&!P0jYC(jc|f
z@*6dHPN{Ql&Mmf-?VHbY(t7snywq#DH!e?@dpuia?fS`ViiguDig|0NUXy;HCw8rM
zqNbK-j+@tO-S+8kH{^OXyOx(M`&6xyS=DOfdr$6ec1-YVKCh6Of>WkdYes8KcsMD>
zC@Lsr?ym1Ik0oVY>`C>FdwD<fKf|f3kKS%HKK!kCqNV+m*P%=MRy>?~)csLl?zVSZ
zk7b??os?&I=AY)#Z1a=5jyE4Vp<`HN)0W#Cymq6h=#Havo;}{NMd;Gh-Evm9l}%Qv
zf4iqFU$Z={Kt}KCte8nJZ)}?9tf$??GJRH0d(6`<JNdTVnkcIk_tI-_5XYJR<X5(v
zB(yg!yn5!>vC?nL9)EcAR^epFT{WpEKPKPuF$-F%d3oK0%nuzGQl<-}Zu~0vwZCsy
z;FF76&D0iT25rt-bInF$V!*Ua*}2{mT+BmiyMA5VrL>Z5v2o$Mcuu*OcfLhcKJ8nx
z(koN#%G$isTQ{3+PhIsh|Kf$J8}+G`X3Na_q?Rm~xUyvF%x#lrx5&=fx!O%DD09cE
zonbp8HH|}eUbi(l{!T|zH+RkKs-@qOBRpJXz9$|oyyUsf-^*F;a7xM+xmS^bf?sFy
z-RTcE&<Q*FYIe3~_e8-dCuZI-4SH_Qx->5{X1em0E$*HZrM8z`ic!h>DE3(EeeAsF
zmTqg3KkLp|aLlOk!_8U#fxF&xo{>=t`kr*s{K74c6I1SsYo6^d{y6QZqSv3j%Db*u
z&8{(4-RkF>QZ{A%)ez5lcdktpT^1~RTuN27b)oNv+8MX~`MEly^AD$XAGy18$>Orf
z*~Qacf8}g_b?sY@(e%xKx*kg;zVcg|%Hes)C_K+=a@pkNsZ-D89!=kpG&}2(soGX)
zj(W}B){PV0${wZ0=Or85)by+q7w0$S6D-i#{=L(!y7cY#h{yF|8^j8|&RkfhVjo`g
zTy4jr>z)&pqW<W!?s&Ux`IHoo$eBV`ooeYD#U7tMn~}CC?9B0J;a6;~g*QDYh+S2y
zveYHy+ml^+*FSsg5SePSIpaisZLikH($3a|^><1#E_dx&w%O*``qeRSZcH;N)jv7i
z*<$Ci*8PI}H?Q5SD&;e&*>2OF3+_jZ`Zbc~rMz5nDL3x-qqVia?oZpg=Kbn7?FyGm
zmimio9t-YKovvN=I4E*TQ0~zchpzeFw+Tu4bU%0UI;or^UY0%45fRENeQ$Oh3BBYu
z@AX@=Om&f5_tb26&Xp>QB2`YDPkyZ|_R4$4^sha)9n*?^?A$V3Zs;Gb*k-KHIsHPr
z;G=&*-bJrJN_;k6eU-EL%XZBxmX%9Gd$PJazM1`ar?;}GJh*2v+n1zHRUOT17rcUa
z7n#g^`ru89tz?_8Ynv&TlTO|n8)45=yVizq_DgD<xbm1QwU7Bf!^0&LryX;>zT@@5
zN6X{#jIVB9bUw;|&An^CbZ^!CXV?<c-1=mWsnw3MC$}f%dY-c9^_u8umt|Pb6TB~b
z_kAVbs2m3N1vTK+(cznXoemcKo>MP-YSXJ(vwc=YJ-FmNU6a=~;aBjHB}P-_UK{BN
zz50Hw<?p6TZmcZ~3`4E6>2^IQ=&;R2Ugv|W0<z~$OxkwUa8mrKG8gwK$GS-;7%L^i
z-RAGTy8k(|^?!!9VNGuv(oAPHw*)*Fc_*~+-t~>Po4#K><EU|A(Z{8p#wIJ(uX%Y+
ze3OziX<yH+*qf7|9h$j9bnfO2Gc%QJ;}$;&Q2G|K{JN5JN55Q=RCe$4n8-;2Q$Fha
z+I2u`1<&d`mzT!{zFL3oSm;}U?<Oy=PpMk{dE?Y-7sgc$r}mz;Dt#)q>d%|hxQqP7
zese3`SM9f!4BrAiEqO$1T;bg*RujI^ujfj~CxK;<Q+Y(9r9vjZGBUF5=?Q)DrQP%6
zY_q(Ldkv1P%1M5Iayr{2V}A3UlJZ;^nga9OQm@o0i`F9rsWVb=3VH^&+?p~eSJtF(
z#vB_}uU=0#qg#_R?RQ02pABIPeV2K8y~fw@=gC^%I;v-tygp?c`t!!Ay^HjUDTvRN
zahZIRpIpn2O_`P0869!r_FTPEmGpbXNlJREYuOgx=;&Va$3^Skv$<BE`a|o=jMitg
z?_BP;XYtkexjn0IDNSC^)pgnP@3|f`eZl5wHKv}9Z}vp{-95HYNF_orvX#v^@bPh{
zsMKi_Je)H;b3DDgS9El2Q&RjY8IEL%BC^l+9cH^T_ul1D#;M4DltXerBikxwE#xF?
zi<Dd$B6YT;yT8kszjOPgIaZ}D(|MmiJ-Mjs^|>8&rkYufE8Tzkd>8wosJmoB%cP)5
zH#)kLlzy;9B0B)tT1}~f<~&X9xuzdy%n3=B`l_lb+qv!Sw|!^l;&X$zWiOuSVz}y%
z8i(YacBGI{;H8a5_ZhqV6L$11Eor@wBX7N+yVyPFTFjXpML}s(r)SSHS<%xorFE&x
zRMkwQAdx96)x4C9E+Gf1tca}4mff;vvW&k8PP-X*LCq^$<zCwLa?Xw*>8E>7XXR{<
z_FOCLR%88D{D1?`FSYCyvSKRB=0~l6o_Z_fam&%PYK7k0Qo_bJx9Oc!-Q6=kDA-o@
z%G%CP{(=^^f_y3796kPxnllYkwoRW=J9QHH=>DjXSTE0s6DLj&oY;S}Q0aWv7s-pW
zCQ3PHWj>2_)a6rj-85;*{J@D5XG+QkPWbe%;OcxQ3IFsRHZv}pByZv@yAiW;?N+7E
z>!F+GE-rj3sLLJrQMPF6#x>_R{_w5Y!1^9>yKIM#>=&;?8-Mz$9$ENmN1w`^T;DYv
zw>+l{iMTA8f5^=x^^uWq^2w#J`w1Atbr_N)`Syfdd3{7jwyM3)z324Dl_`(ko$Hy+
zDYVoqPsvL<@bi>u=O^4ddDA|v;Cvm^&FVFgOZFUVS@GgVYx}fVrMBGp>1J;_Yx~6l
z_f2&Vj!(*%m#Q*#;pKv}?7IXe-+4QA+Nra9*yrB8zSqgJ<Mfipg4%^ABVwny2J@P6
zOj)@{@2JK(52H-3#B<`IM{bEd;?&mX_0pZ3Jykhi%dtakj}@<Y^XyY|wpsoBz;q^&
zw1i#TVhg!ja*S?w=1l8j_I_O(!{&bc$Yj$^8>h~C!IR8+HRrzZsV(1p=ek63?b+0H
z?#AoL8Nni36c*0rE<9>*Yj$MuTxBP7|47Lde_Ve(Tl?(lwjzyJAGZ}{Yx|r!)V5Vf
zU5%w8XEXmMp?r(Bm~j1rLB1}n>r1Bx?Qxtsaq4UX9%jRrj58x1eCNKr#&XI1tB=dX
zwk`X+IMZr*Wc0_AIzqF8RM*d!yxUo`>@3HfD{eJRPUn{d&-hl||Dep!P<7SS(vKpy
ze-`t`**uwc;*_(gN2c43CySL#S6;i~;-@j&WU1OVH$S)B2R5hXP5EBoXURL)Nk_5!
zx3=_iW!ZgkcSB4MHk;gh;QY?jQ_sC+isGXDR3YB%Lbf!^V%y@h<65zYcWquVqfTj$
z(`JRPigPMwcLm?%s=2n)uWVPkMyEzjcj-d)P1}rboLR0cDf!AluJ}|{u>I1Ih_7V}
zjC(du60<vd_qKVYw8*+0Cs$m`Y`HYyQOapM*3}cda#^ZX?&v9rTS&dw?dvhI<-m%C
z7H3>7UD`L97#BUZ?em+mBy8n+KE+v``>NuMAKgE2qvw>-#)-2|@OUj=k~^#LNNVN0
zd1doN`%IpB`(24HIXzKx*~vwP(}R}Ha@H%fxz2VvDr9P2TC=iaZA$jM@|a(S(Z0It
z@{P-CG<vUZo~V24tGVv<Yp3tdC=}W|X~j9!Lr?PN_FO*m>QQR4wpoKj%)3cuAF@`+
zPH%s_wQO2(`gO0Q*SbC&bEk@V9uJ8Knl5u$%ehNA*jHEQqe{=2iyJ4_hsSCeM%9Me
zT9j5ivX;B0Dzj}#%9_nZK~EI7D&}09#dEK3UU6pdl2<<cJtw)m%FW!2jEfkjEjK%H
zy-#eh(uJvEYx*~O?U|Kxq4e&=8}a#vj~_q&pJB!Q?tlFPS6WPy)<!TeG|+nFpMk-s
zQ%bYEHzwlli+zi1+eMFT*HORhz1DT9@ufR<>MBZ>Yqv*4T}fLyS>LR6rB~FXY~#TH
z46)}*mVMtk*=t{}PVFw!0EX1qb0y3F6lI2=UE06hdFQz){~7vxwSI2d`0WMwcnQl(
z^FN+k8gDku?k?YRtKff@Caa%aI(~aG=zNJg&%Nr8t4*!B74f^odd^G#A5Si=pY1K5
z*Binh8D2bB^FM>ktkBPy$!{;bD_&~<cpxp++j-g7zHc`FmUii)91&e^PpOc}(=JL%
z%J=kytxCvz`<d_LqV~QUo8_E0$`&c-*i5^h>o#2~aCwpHQkRq^^CKm5l207_H0Q(C
zohuI7dfxkX>(-s`vLywl))-qx$GzX4DUT~GI^R7ltXF&WpCLH^PxH>0_>;T#U%CCC
zLGkqaUHg~pSO2$cmq^vU_?^te=x;5u{nzQecE`@{(iYqJzFm*Sg>Fw4^to=)zul#E
z;)hL3em~i)tY4GsJ!NC*<&q6Q*XH)_U3WUgDsJiCsTE$?Ie+#Szs6-xM|G|1i@OeI
zIK^+?n{{n#wxn@_rQRiG&FQl7S)bjO3R*OOpECEvKS9OHi7(0wUD-8G=5L6amF?{5
zxN)wQv|y^#q%42Ut1}g=m-M<GO5Z6s<K^?p(3B{dedZZ|Zr!>y{ovYlhj!V@)_=-*
z6XVy1l#&)AyQnVJ?3Q_RxciQ?Yc`yzV>!;b;?vr=Zo%o(GEId7w)1KR&hj{)WG^V$
zx4Z9~#Fva>{om8q_9rT?m~eOIjW>&&-_1L_<<|6#x>^6cyo>Ft3!9EVUb0xVt!(O*
z(!II4-Vd(l&CbzXbouqqy65^h)2CHY@})OgOHcnf@-a*5*pw-jX6H9P+R&=;%~QHB
zXOgkqBHdqSTu-0+BDY_>caHx_1C1L`u1+poCpVK<U|!s_7>%0q(-z;=dZqlm|AE-j
zo~K+NdtN5SOqmyQd9q)0(Q4=D?QWOk#OB1#`Ln-xJxYEnN6KsR_cYBtjZb~Oo$%=+
zPZxKdwD#nnCz-jbhm?#R*RE4i%}J@w-yg}j50SXqOm~Khwy$^@ws4wvR#E2qT?IND
zCQRBjfBHLz???tCXW~WGhy<zdE?oRm-uka*{~1oCL(}Gx7lE%U>z?rA3@Y{AQ;+_c
zz51{3{!e@emo(omt@o_+|LfK&x19{zZNG<qnf~NI!<CFt+ebqlO9eu5suL~$e7E9Y
z>1AJ&`<ACR=yc7u^}T4bYmQ%P$c$C30`okV=H2x4>huZ}oVZ|{lHI<2Yopma#W?S|
zwo0yev*^r@=uDQD<rN#xd{cU*x7k?YxaXPjsQFj#N}b)3%+fyL%%XCIw%^~3gXLJW
zCnqW{TjKrDu4tw|celxw$w9p=H#NBm*=&1M%R3jX+M0bo>~7!PcG-yH;=-NFve~vK
z&vIqr=07-hLaO*VEi>!xD+}gs+A(3uM~zu-zZI8=x4eFRbKB%=OT}I)uSkeeo%nQS
zXx-M5{xZ>Z$*eK2c*>O9p6aPI>n-Y=T7TrMz4T<U==Tq{OuI1Y(T0qF%~xJd`K@#5
z-@mlf6_L!DdGED1<!Vm&#qp!dq-^poB^R$<mu2^}Y*P+g>w3gg@KRdR4#esPmn8Y3
zlKJXxzf?L4p9KCY{H=VV|5O75FSp2alj(BhoO=&mP0T!TBF$^VRExR!n=Y$<@4c5K
zdhn3jtsR`wOHOod5q!Trro7^nYO3HSb)z*C3zMQ`Z=2kma413K>88hidosOU9_?&-
zb!Ca>;hbqTo?iaNwlXZwQtr-NX1`#^QI)NhX8-<u;?jB==(1*WWvgp@)UVm5t~s+a
zxr4l?xlFZKx^v^3Y$KHqUxQw}$@<Nv7MH#=&gW=RjIWJ#`J$Dx_L*pIHaj~_y71}b
zEEC~LVV_QHx6@PA^t#|+Y&dg@<SyHYJ=Ic67oIkfsmoIpSa2;Se4WuU#lEQMD6#1`
zi>zC=ER_CeQZ40i`B3VP^)VSz*RClG%ba<aeSh)J+2wZMH%W%hnOUT?=hBm$ZQ_#I
zL33X_%~G2*Z^?okL9Xk;JGeu$FJ2a!?{$^8%zJXshFv#J&GN!dr}9U;)p{3Rny_`t
zV%6E}EFM~1*G*ZvqT{4mrhMIp63yL37w7C(OOxJKkQW|w`=XcP`L1b`+Ryxyc`7yG
zkkAt8w);JoT2`lu&0(C`bLLF!g|2TC@&dE?V%<%R-pFT_uqzhN>^Hqs<>K-=S(okN
zE|)DUJ%d-+Z!PFv5%JpOy4>2G^NW_JpNpMq;$^ApKk=qUnMrYZ%eAEw*Jg)Xrfk*q
zdi3hg?pp_3Z|pYu`a(5ZKzl*pE4fUMf`nyDUz&baR&u%ZWXIC2qSwU^O_{RIJ1F3_
z+wGU0^KX{h9$E5NGcV8i_DjiS?nc^NF*8bv<r8lhE|XTfXg77`<SFilmR7L++~&4a
zQn_wp4Ws3ZGiQofUOWvBY&&-RbNBC<#!tt&<(^uqvK%cbT`2SI;o5Dp%3HmiLcB}8
z3w8*(ESXr)c}kXl*~+<M?{4d!_@uq#o8a?x*Djv8JK;Zr;`GSWsY~0WGG;l&mwa;)
zS{&5t6*zO=FAKIbV`KA87o~nqeHRk`yfW2Ca#H%<WGUs3VON`j<%7=~ypl^ivebI5
zr|K4uib>VQwjXcb>tm@|zKg5pu=>iTH`CJ2EN;|Yw&2X9%bB@+7V9EU=(BD+>AUcW
z**>PnPTRx2ZxEc>bLQOdyLZd>@8mnxdgptI?bb(+<5SA?%{T3qZ*Om}JNcpf(SL^0
z{|o^a85nm@xO;$sg8}*4x<yZy$5+2eev%RI(72X=^_{<g`=p=0(@|is?t1`Rjl+Yy
zR*E@PGTi0;!DP4p4CX(+zk9|Yg?pJK%m66O#Eo>wP3wlB&D&0`GWe1z$s6mt&U+=F
zU(%v`u2oAkSF$YY^)uL_)5~RhX~~Ky?)#2!oGHJn7=Bnzs>QUO_OqW%Gr93@!n5^P
zLt{=Yo<2eMR@(!!Kcxk`*0K6&x;5FYJ=tu0|A<}rl$6KaQ>TABel2}rd-#%l@BheI
z&Yu0s&UVr3lS*0hvi>e}U$|ZT@=~vTlU_|u7R)s}GUG}1!>)&lX>Q9n_Zfefeq`;H
za-Oeu)=O-?61mlC{(RMlt8S_j!p+_VnjV`ySM#g6s+ZFoleNo*nw0w%nQ<4JY@7UA
zZiCzA%q=-t%e78^mXb-6eKOU#q3n8rz)b#0SLOxhZToerH2qNeq1F|DA1fIdDQ2??
zoV}MESe()oW{~#CHa9)vv*Ch;J9%e!1llZp<Ef-OC0k{p<h0v;d9`2br^kQEoR}{W
za;M0zaJFIS%)3!HH!5*Ph53{obzK>?Smm}$%1V|gOONC{+7YtIRPz1Cul!t(zFcEB
zd(GY+@%zfJozqiz?pU-epVqW$>xAD+p8b~m9AT8dy4c9GqGQ+JwVs#MJ>F>*`Al+4
z4GnbciCVPC_3qN1h_9byqkXMz-JFm&OYBzX6Cs~7J?Cq?kEECOPfNOVcXg7UcgW}I
zJzAkr(lc&NwVbkK%5n)M`%McbEt$E}dZJLr-;ZDAPXCS$zL^o;w0QoU6JJkVE4ea9
z?s(1S!dW*zMZVp2TCgo*&U(|e^|t%MW1n|ysWy41d)9R7{bI%gyVX53Z@=l;Wo|js
z+HUiMjNN8>r}QO%c)j}Geaxh9N_fGX#^x+F!?ibJUEf&U+@!GXljUlOlMk(Kr>Yt5
z-*ww_hs$K=lvmT1yg$5CaIN*m*J5+GulT%Wm7jmCv|=Nd%5&wtrY)MDlU~iv<aAu$
zbN$GauLaY8U;AeK=!Qp%)%4{O-9jo)s~USRZ5H%$Y1he(=$RHW$#KQBJdI8@m)Ftn
zA3an4d368E&TsGLDNlV_w5{vP;#nW}nk|)FsV;U`>{eD-<>st?F}=$+_X%B@Yjbbc
z+WUXr+8kbIp0}R$(qHHGr#|l0KDF?Iy^@{j)PrU()Lly3x<aGZNqKI&e#ld0i~E@%
zkCU28%27$Cy07onx|z0yW<32UX4tam>8({Q4*j>JZ}v>e{<ZW(BIgs;?IE6yny$y@
zvvePSy*k(NKt|f-uzTquqL)6Y3TZw*y*YKdPDpi5;H9O4d+LO$ChqUqCuo=E5<T;c
zi!%G$Kb2N<B?AxXm36Opc`B%^-%{<<EyJFr5-YuKKG)Q=Ua4AP@*(DN=IP+AzxU7O
z?tJ<~Y9Z5m#mlle=TbL&_gi*Hgzrquc&Nsz=9FjB&gJ-Vl5|ujzuDB4p8BjFsuOo^
z+I-tKc>073M^FA<dU@8YS8HQ>uIQXoW!{jPv#llV%k>FQ&Tr0nH7Dba&m`5N(}(2N
zO<r4N$~P&_+c@mQHZ28j7tS)i`>sA2pF$=@MHXHzZsn5-nJUBsx?v!<$K(F1usM$=
zYrp<9=RbqY@~JtOH)-G4ur<kg(yd&ZJl!R0UGh{lKTOHI)3NJ(lE|Uo=`DxOn-utm
zJ{FxDWpLh7W%81{mu7e8KACh~!|dI%h)D0>e|DAX{+(ZU$=a(}W9KrPFV{+~xzEM*
ztNQFs>@xE`lJiYfILpy%Qf8#!#Ch*ZCr%IKUeNn)y3AeAcTL+%d|a!#H{E_UWs$Ns
z`}KbxPI?---<H(awd+W2-Qvu*TYK-?hE?{Ro!>KE^iIUL9*eJuIq7?*sJbnynzVkq
z((m9O?1z%Ijtj??X1eP{@7<d9Ct2c$tKQ*F+FG9bpU+HUoa_0Ld$I3wvAW%XhPN|j
z_U?MAwJbdPXr1+@Nw=;o`OJBC>c%A#H&skM_J}*5BeqC-$Bnx(zkjZHl(f(4iN(BP
zop)2FJ%6}oi`o2tVOy5${j7NIvFcW_T`3J*7qiml%=0@YRQb(fPM^P^sZZI)-<D}p
zCQb27?(aD>DJ6Ylso=yPE%kM_cGb4YzOOfFgoS%)*3P)1nPYqF>TxHrEnZ6T(%L3F
z+$NcPeN+8LQso!x#ZtzY<6qVNHr?=zOxpR)vRKIFXt=4;zN?+OJ9RZpHCtA0m6UFo
zWNaedx>53t+JeaNs7*UX%LJF1+&<?X^?28Gq3O|q3y(-HIn(JGDCM&!B_*XcMRD1-
z?qwwn>mpzI%?)~DSSa1|Tc)wv_R2=Fb-t^4y=$+kO!08m3e@#7nJFnKIPaL7{fu98
zmt8j6c|*q6>Dc^}t8Q8fubjQDZ{@`(CC^!R`@Ekis(3rjT&WVRY0t?Pz9UvFKxEOi
zx8gxlT3&q<$#U7IpwabD?d6o8d8gIC`7M{x+oF7I(MQ3~_Tn>=Jv-Oh*BdX&*Qhg3
zGxgXl=2hKO`gf|~*_gxbP3t)(Zx8(XmGiXf1-;G-@|r=54t9Ncc`2%2&N{2~`P@iF
z-i4~WrYw~Y6cm)4IX_U~{F5)O7nfb`wvSyg<LwroV|PoY25YZ<v|sk7XKr>-Wy`Zu
zH!gYHwDEde$hy6SOD|d=S9e(#)0Sh)UhY{feX8SDbFa0%guBr@E!`QxS2JIKSh8UH
zk%Oi3Q785|Z<jjr;m^g+pbc-<&HFmTD4Z*J>f~8drl}~uy%o7_vC3MzU;fGZH7yof
zw{kB^U`cKLqjfv$SEr3;(NX>#t*e|hRF+PeG$n27nmOS?-D^1m#Xs%vIB{-o%WF2)
z72m&IsW5%xeeC69w<&LXYnG)PDw!93c9L}I($$?AA?2JYDbaFAuT{S}b!*428?g)w
zjtbD`0ORitq@B{?h4MeT-%q*xPsm~;|B5I3UnbvwbzWciYkc_v2F4c>5`%pOG}KcV
zlmg<Ezl2+g3ojK$&)Kbi^7e^`(Kk}Bn;9f4Yt1TscewWaMvE1^o))uLs--{Nddz*}
zykjorbKpnPND9X)zEV9@=9@PE>E*bxcYgN4qDu}qUpH1>`{abipOc<m0yo`D-L_vj
z<*O2-w%E4)?5ElDrd_Gp_;uoohilJjzpv}~Jm+4Y)OEIQo(XsLJRWZBj(ha2v_woq
zbHc$hbI*D1cskkNC~$60$J}TcfAz+_6ZUQLU99rr`^t?oPfYjiQG3|+TSn^?k7QZk
zozTmnx=WTAc!%rC95R`>&~M=q%}mc9EY-`Sx4bm%I@B1ewVnI)I~D(xbBo?+6e=%#
z;p0@xb4LB__uEb}nb|ikOkHvHz`A(3qx+q0l?#_^z0;9D^y$SV?XJBA*>~HHr^YB%
z^$JXv)U3GFmvw2$hZDhCjuU4uxmBasZXKPwf4crO=`+rX7kNESWNuo$(kgp;i0}NB
z9;dS=_CA>!n76?6hVC~1ZGI~Y<_GTZa$aNgIc16FVXx~~yswqmOJC{97rt!O>v`Fx
z&i4t|%NHAWw;kcoT4E|GovUUn<Dt55vQo)n?I~W4Usqa7{Mvj;$+a)D`-0<vcMA{C
zdb%ZN@;2GF_80TpPUn5FJ*;u!bmNgVS5DeYRn0J2eN5M?a?Mt$_O#O6q@AKdPaL(g
zw(HJJ)Y!J(bdpyv+v#&s!Mi5Z@C*A*)Kyz{y3q5~;@(+d|8$=&S~T~v=B&N0TDt$T
zC*L`q$1Bt0#dv6U?Kdu)Xpf-#wJo}A`!ZMDDQ))W&Qi_Eywh`HYRF7+jbrP-Y*v`P
zY*NvjSt&eC?~curz8I*wvgnlM4qvV4o-H+tDrVW4F8P@mc6;7Q+wW$lQ<Fs3oL+y}
zNM>2e^vkkcuiPSoPa7*62hExA+Q+12NzYloT}`%~Drrkq)t$Dy6pmRFx9PIP!A;%X
z)m=Y7PY$~i@$74olH|5uPv8BDHre)_J8hawX}0lisW6u;twTYXk@Y$%Q@_nPdfh!X
zqE$-P%3ouy&}*?Z$Dg|#dwy{vo7Urfzq?$P>nyfb+$Mb9pLxQirGI<=9dq+Ae08h3
z?X75b*d0&boj$)3EFK?g_I_<9UEIkw+4gk0M}Nnp>WM#8q9q@FyQ`)rarmX$rNrVj
zx$^n4*0b3XC#_mNcT<(2w3c3_UG`JEY|n*yimV6nG`8RPDr<YCVpdOgWN5|JD;%a}
zLfL^vmG0ifa*59+t0zsn8$CxeI{n%s!JU$l`PVnCo-_4Hi_!8GhxNWx2WJ`h8ccQ;
z*6OzxSv{Za)MoLziNQZNos?dyo-_4Ol)2N%G`Fd5o;Rh%cv}6-T`oCodS;j-=Ue-U
zwUNoo12SbNt@iMjm7d_`bR|e+$#M0-Zz;1xr7UOexOc2YJ$2gkRZ$P#Nxcj0f4pPq
zl;1tYGS=VIcC=MT%W`}FnN(hW;nR=lvYrofxs2}goarvsd(B(4w3~CLX~f>Izc)WG
zTQ_&rm7{uhPI<{hzwBCmKUaoVbvdt})w0`mvJ;lhJM?O)Z+QIKB^Jjk9jCWvRed@s
zvf)Gc^gw;fwWqv(nhAPYI{BnU7qZ;k;s5!<<LFtHHEexb)?9ti6(_i0S9jd>%uTO&
zj{WSZmfW;7d7F5MVfOPf@3ku~G6pJr%bfJnWuN6ag-iQ{jLJK`{hqVOZMb=rKV}u{
znY0;8Czot{;^rlzcf8W)MEBui^}pKgDmhL5wk+G9Yl{2M*me2I7n3J%%KW%QSoFi(
zd7m}Cd@niY&bl&5_eaGxWqrBZzyJMMZ*B2vqh-?9JBPFk-+Fhl%bYe^CA~Z_aQf6G
z$MeHo<sOPYjr&r1a?TDuC9acOb$WKDWvdCyJ?X8iXuLw|<l?BUO({|N%c@Q`r0jHa
z4Aqm3^(_l#d}(ygY)VRt!rVLOlZ@>fI?nv+sJ(otuf0_Fufh3<o;ww0ekY4`lyX<?
zOv_BG+BPNYsnm%}xw$u4H_lgY>2*4=w|KG6fsbj1Hj`XTBdop7e)e^<dcHO~=h_OD
zw2wB^&AcX`iS|tI=(V}0x1e(N?!JSolTux!JSUi@rlwpoI+WsjW!0OW`|6R8W89M5
zQ~lo`ee7wr<-^}?N3XcKUw7Vm;jhn$kJn6I&JkHT-P3b&u$bS@3!P%x^B<ag*37x~
zD(K;~cb}s@mmdD@9{iA(S>~l;#j|Nwp8quKd6sAM$I?u!V@(q0=6J#BB9m*qi_)1o
zE=>+Jx!h;JQS#G?ZTIs!RcCIDoGBUOYj-O)H#&D$Nc6-v^%D+WS?T-wUvxlV&o;U2
zBPHtt)2}|&*t6`c)ZD3Mn^r7;a?M0ljk(Y>OY`h@pUb|>r|uNidZznSXZ9m+54Xoj
z2h#l(w0keVZt?oUbGK$i&DwV9BJH!sqH{lY{Ol^Vxm2pH6e&6J<D{uO1y?QEH-Z1g
z^C=}I5xJ)dZ%#9tAirxW%d|U>TV=0W1WdZTs=8#?rHS9dg3mk*+~TdzQSeMwMQ>GM
zOjB-Hf9i!#TxqXUH;3nSeOVWyS+rAgLLcWIPJPzY)1|q)rWSlO-}X?|a<|FW{|p!W
z46oHa`xRPWo;&~Mkz+F_e0{o4Ux|BF={)twrq|kqaYB3KA}23%sg|67>-|oy(!AN(
zK6l;vlM6lb<yLLF_V-Xn--V4{)1Dft>ezD{UI?DD^yA_=MRSy&Su9r3*kh8TbxrK?
zv(g)Lr_D_h{g|sJo0GjW@X;c!XOp!`Zm4`d=lDY-Qh3S9O^f|b7X0e@A{bXz5SCdl
z+IeD0<;=SrCnXN_we99?yEDyXVvnD(XOUrnpXbbVMxjSHianY9=zhaDy?qNm6}xSf
zwL13sQs9hJ3ujL~d}^YS$t$tU#h+&6%IMABIA_cAB{xk9Olp=c%+$+H$?~kKU@872
zb4xQ)>-Lpcy&ZXahFV2Ai%#`~_Iu|kub(Bh?5t*`+7^{78OBcAP3FD4bE4<l=@*8!
zVWmBB-#$y8P_tz>@_D{H|CU?6HOB*6iTg8FeO)9xhr3{h=VEh9O;1V9)D2%WxQ;FE
z+T=PXf6YhHXFadf44a>No|ZVRdi3Z=oi3HiZN)3Pmbs_?a0?U^GS7+eI@6i`X1hQ}
zQ2oEcAI@n<KUKwA?b&>#H@c|XOH*g{s*kr`%w#=V6Ex|IQ?1*pu=Xn#mt1(Lx@FeR
zIlPi9Ic{|1J<!*BmvzZC)Y^LS`CmODwkpXcbEVFGoHI)#s`}Q-%dhm3&ZNn776wcc
zc(0jd^w?x(MSs$rGcD8Do_RaZh=0ZBy;S&PM!7-ZD~k`iwU{r3e9qr@XxgH_PZJN!
zD|U<4xsp4vLS>QJ<?RgXEm!Mq6`mY>fPtqQyfBi1fr0%U^0wvJBlVB;_XYd^@OX5@
zZt<l53`_g#SKI9Izgl~*fq{KNLJ~1!(-;jvBmrC}6J|TV`feYR-}du&HwWlsLYNl9
zC&|E65}=WfI&4pqm3;Z>j?`|k4HNsLPAxl;{=}eP>9}Ba)N)m8MX%f+-laBImI{Tq
zEZNm6G-;~f#JtI0SkDHBhrU{0<$OkHPUfY;d2%Kzw+n5VStvODw#uHAB>kO|a$$G=
zwfG0v*wl#&+xT0%#zyL{&MLb1+wYdB=COH^`=<Z1koYR@zG6$r#Fvj^uO0P{*t*PW
z+3O7D>qotw?P?eK9C*WI;TDBUDZXD<oH*TiRJ&CB_udPhV!0<L$Iny~ou&OEa_yvD
zmnwH$dUdz-*u28c1v}TSxqCBr(u8$Ffx<0mUVeM;^z>`z)jO%L;Cqr%8D0EK(AMGc
zg^%0!`CEK6d!F(BtKqJ{EIYDJ@i`XHIkI8vWV=&oZVy9rm#J?z(k`5GCG3>r949A<
zlu239&o_l0J(BfE>))Igi6sSZ*7QppSZUPWWn60T;9*3mt!Veg&g#$0PEFR_H07mG
zfLBmVDqHo6nZ~x4U--doE>~7Qb$kAW&Q~liX55ZE9DdKn<9Wv0{VUs_OY}KjQky55
z=D6(Bhi5O=Otjx%yldC)Tf0kU`<Z2w$!LpC`eAW0({!zkwu(k)wIu)Cc(>m@-txy^
z8OZ$FHP6?@TdMqiv5fOt+i)YtJvo~W=bN?3=A2Zyb-HuW&FD>wjJ(RRxPsZ`*wR$q
z_==A*Z)Mm1ni2HN?Cz1KqpWO84CB3iY24kO@}l#mu<A8wmUiiidn%jSqkC@2I4(T=
zO4;c`_51oGhSqO{OiNjFPFxdxf9uxyO|>WfY+Wc6SLK|U|E7K8(&ItRwo{L|q(3MX
z*bVnMGdxi!KKQsPtl)FMP-vER@>k=Y%kCFvthX}PpDZi6&04O}Bg8JzO>MVwIsak3
zw7-+T?c=}o<9yw!ZKW5RChM<$>m+wvzLD*&`IN7bY$kX2#&~Kc=A3YPwDQ`?V=<4-
zg{G|BYW8xa%VyOA-otPkWlx;=wC(v9rK*NLereeWpZf(~U965@*<!gc|GKiTLDi{a
z!I6Ewb*FBfTD6bYw`iuS@ZP2wK5v#RJ;`$D(v+nx9)0VQmu=QdTd20^KZDkxypL}t
zp182|*4AW3zT@d7+hvbh+~$+A?7ADX*r{j7>EiouYh2Ucxw%i>Zog3Y3VV@}&Dx!R
zIp!^JKA5b!O)k45bj7;)!CE^n+`D~vd*@wu?QJuz##tY|5fiiTaOrW&+r`{<c4>EP
zW2`IA*R8OS72o*lY+~5*!^u2q)3e^*x>L^G@%7{BLNzbPXp5zqI)(CrsydTbzVfv3
z{NVN&OQN6Ja%`=aT;U_jTh@R5jZ)|G#-G)A@OY+NV%M#M?V)xv7jHQ<(ej3`&8ur?
zI%0l$<(X^|I;mZ8;i2V$-U;_^^IAs!%kQ<iXDq$=+k)7QG1DHY-4fYTyW&!(rpo7l
z_O$|^f~xa3P4Loh$lMDLWw_rtD|}M6dd>ZP{YQ${mavmm9}L-c*`;{T>gc*M@5r$Q
z9$TjS#tO}uIiEF*XWw+slP$j{J>0gFE7+}V_CuwI;RfF#!wcg&*t;%0=?P1*C=Ffa
z>d&<Ec!ohsnA4k%qLAM~cN2v|Ucb}0dg#6I@6|~rv#V#yFR7k5@oDb)7cO5J;ejE%
z=<%-k{~2uDEuHIEHXM4f!Q<+RUF*~9dptKZ*Ojk1>!x%oPe%24;dI5lN!zb&b<@k7
z?co&tbdtbS=@$2W*ZVAM-s+?qr7bp3OIcTyyu5YOhyM1Y$8q=g`JNtGxA)f>&+oA8
zjY#F1I$}okCN__cOpWR(vk(5j{vMtVtYI!>n)mK#sm*nvTdunQmOtRXhgn=O-@UPJ
z`K(7KUD1F256EK)VfNh{WBpB!WQuJ6HNU|gi{JUnlXlI^?zp6x|J9zM4vRzOz-|y+
z5_JDn9m8LE)FG0s?cJ_dmFtu}mhQjuhw(Q&P!Yvy^={ECpJUu6de^V~!TcRduKQl5
zz4F<PUZrXO83K_9dE71qnS0!svaDog_O$;DDOPX#_>{w$1YBHNmUQe0m|y)+$oWNq
z$Z}_X3E#-qpL6HcR<68K)OUb^dCiIS<-0DwS1f$*X81?(-fRCot;hFD9RJYG!Lzd~
zHuOPPalxIg@6~JVK8eeHNtQn(`)<qc6EW2nd+IqhG!%>MT78f&PkLh9`_kq2oF3on
zN&exy_uBkQ?*1yn{)b`$d`Z(oD-_QjF4De>(1);3=@8?YpjYczEO+F&RwMNBeUGYr
zqG$W!n0-rxgC)on?dH5V-N|m~y=srZvxhq+=KfRc+<eKKby2#+O4pn-@;cL%q!LYh
zY<?TR5W2iNzrEGlDZWxOaN0}t7^T}C-!J{XoB4ghj_->l{t0@Y_|L#qYxC#+v-mxQ
z2UoSj%oKfxaEmoUc_PCz&C>8DnLFl6dl6yQjtC|l1(~CIt3Ei-ZhI2G8?H|P5lpTv
z%rll=jpLwx@Xqa(vaxvU(s4M);E>R5U$aSG0xi>yOqp0J2m<^IxZUO`ep=EQ*i)v!
z!1NT>U}G>^*LlC`Snt35kMHlE0Uf)5bPx+e7W}ja0_`CxN;7Mr#!1g=<v*HVK7X#C
zS@-cc?vo9`E+Fxc1+X5P5<IAmH(JyGVfp3rXaA-CJ}!fQo|kG(RqFW<&Gnz$A}=#9
z_R+0AUhFL!x2!p~>yhMI*VDzEH$Eqx^PS67Eg5jf(bL@LMtv*G%~>a>^QtUz>e-vf
zmTYE_e8bKAWLB_8<)+H1n;xpVxW)Lc>C6Z_er>9tM*oTvJ&fP+xSA2=u7Rw@m{*#y
zy3P$dw8W@W@M)CQ<&~-nGnUL*vQjO2;>5|;l9JVrf4yJWQSY7q!t&mdfI|MAR{oP0
z*9y7p_?+0eR%2ys_L~}i+oUYh+Bo^0=?In9FOuXh-TuW_zEZS7?D^~bIQ~v6SHJW8
zcQ${&aKwJ;_AfH;gJi!L^1sae%Wi$8>j1lbbk&C%w~4`<4}aJB3zulG<#GNMUGI4A
ztI`I>^Vja}kr!MN^5(<e)An!)#rw_z|EAr)P`uY;4MXwQ?d#c(WQtnb*cY8gNR-3v
z)Mk+RoBR6UN2w9u7^UN}i9)yb>8`E5b7QllOPkC?laJfBOqx_JDLCy(>Xp<TXI&A;
zvM@)PX8Bd7=g!`Dcg?Nd%qvruL^v>%&V@~u;7MROq(sF^qq!!WQgNS}@!D_6wAMwF
ziY6_Yndj!_^Cl(regEryZHj+&_b+%}HPP{i-IJGge!X8jg&v=6Q`@p=Y3cfnA8o#N
zuDJTOqVDN@xXKBL^cBVsJ%8=q3OkjTgD$R%PwZNJk=wuc+&qc>9wx_U-if;L_L`@_
z%QSJbXNn6AFQ0x|y729-!aB|+zh(a!E0<dTZioADVgv&?vvge2D#Pt{%U`#zH$O7z
znmZ3?p5v?A_VvM!UZv&f?e8Lg!!sk_FNyXq+x|7redU_K3=TkLkCitc{+?C`mvE+z
zV<KL!QHl;fx^;c^Y_VjG35VyX%`>|;Dab!r$>>hcuAc3ZD|TjHzxP|>r<C{wg>1LA
zp67MmZZ5yQb#~xdt4Uix;}Dydz)}|B!ks4^Y4=Z$^E<!EPhQ-2nW$&eW$$O7QYrW|
zfPryg^6^`Vxz%cUeBZOby6yV)?UUP0-gj9~R!NIVPI$G%pd)KzMQ3)RmbdSmuq$qp
zgDO1Pk{yoxa!%d;^YB^A9{+M_%V`^?+>WT-t<^BY?WLO6Td~Qls(Q28TyCc{JML(D
z6n9GMX681rIZ0wNU8PFPc6k>k8Fl)r=Im?Vc`_<i^)tuGv*+LLaZ7z<+peOkc=^?z
zwP((mD&O50b+1yR=#ck>o28G8Dz7E3JluDBo#5);W4FDo9$BYsr<%WbQnmL^^)1T|
zU5O29?7I9)&7~*K&$e7??dL2JPo9(O+6_~hox7q>IUl=kx>Wj!@5D)AF^ld@39a?>
z+4I|PRjE+wHgB&#d#3Slu)0r<3ROI{e64h<+Uvq6irj_A{OgSU1%*zx=eEt{cl@$W
z)b%M>;qsL0DF$=a&X&rZvopg-_5D%Nrpl{j+H!(#F3W6DU!5}T=+?e6!S0C19}F`~
zH9|v^C(3q8^67GB28Ok6+_>b#nMunoR7<|8&Z(aE$XInln&&G+t+jC{Uz7?))%FD5
zxshBl-S_c2V{Q5UTc&La5?ZdV_w}fy#H3a$mz-T$@mxG-ugsjpy<Gh2-R!wyygk2q
z;;wv(iL1S9T3TYsV=w2Dx@pS7>teqyE9wc~saDn1v@Xco{Gn~WxyJ@>o`aTWc2#n3
z@m?`0LtE3c?MaJztB^R)9F5bpnm0P1n&k<qs(5_}-u~^~<d>hXPFQ*G_@o`BSJz*0
z%?*mPiwW;#d&KeMyRmWRX4RP=RW7+rRyZp-ORYF`byRD<r@wb(Y43@X`TEm??)<D+
zyEM8W+9mX4(2^^sDvbj6iCvkZa^EY=+qq?tK1;3P-PkL07QLFZOva|lT#5Dc_vJU%
zd%S6X{Y*)7k=l9nY14hKEHa(k*0;zgyeD?*qN<8#W)-_H<lal(%yneh-U$(wX)|47
zG%L-xYMw4lna|KOS6QWkbyHQ#QMP#xZ)~1=?n2NasaFY>(E^uVcRlEPbm`HFA5&_w
zf-X#zy1MJkcVoNFcPqM&oGrO6UzDfYl`V4De^1`0uRAuc>5DxxebrMnPr+5fq3^5$
zXV_kuVzz9y_mU;*#gQuy#jvccy2`aCHFC}ii9Itc#mv*I`<>EE+U7UxSm|})%+{5U
zHB}yoO*U2J`Jkybclu6^tyXtMFVE_kDj6bGxpK;xb~X2|r+cnvFFFzTC*P1UZ%&wu
zo$7q)T-E5c>HA)V{b%6V#k(}%-j2#SnU}S9taqGnd0Nn&)EV!!d!H@ZvfNkT!qy6J
zpQ4;8DU)r_bf;eTIJ17unaw<>qeZ34jLp@LCEU9HL}E`{df1iIxm%rQYfM;i&aBvC
z%952_sycI2b~N9f*85$rCwhrm{=~GPWaD11vfiS#3zo9B-`)_wC%J2iXVJyf8$r>f
zJ~NAMKQNK*)H8jva9v#HJn@NhzD8ZOS+3J1=utZJl0?Bd<+dlqPPd{$ZcW&1tg3M0
z{C#7Yz%TiJi=Om9%?<LfJU{!Dir%T9M@Eldn(TRP81irfILBVQFm-ij^ok$-Wf?cp
zdVB2tGX!sa^w{5f^Lj(CZ0$2ei&dKb=H{Gz{d2PNwlAskQz!0oKdU>(bL-V3lO5mh
zxU^<Xk+fE<)5*)VdNyYgFC{VO=pIX+^w6M_J80G;qd==i*QN^fMsM@<`x`xX-EogE
z|E?`GDb|#I;}W9Oxhbn=+MGNgvDSs0g?vJnmaN>OF-KMC#3eT`nQv9;H*@b@cidZ~
z8|D$%b;#&e@Hy><&h2OQR#s2kb4cJ;&q+78$?=ghJ9kM=))T#S>d-!&PwtGKsr&_h
zWIUB`|7W;U!vA#Jg-b%0GgTI{czSv4>gYb={7Z5Qq<q#x9Oou-vFP&pz26K!ndmQY
zxYmC4UEPZNyq~{|2{6bWFMy~fgCu<AE(61b4R{P@L~_-#?rC#t-*4=nxUiOqZHwLd
z;;(K$^5^f^(7^D{n)qJcQKW`&2U7Fp#A5iSQYkS8yP|lPh<7IodM=$3dDO9|S}^~L
z@toxNf}X>M%MY#y`u(l!3U5lDNzp^q6NWKLW+%V-{;OQ^UH^OTjw$cvsj5twY`Hm6
z?W_iCS={26PdLLaq^Hk#lWAgj)#&Q<wGvJb3|!pu_Dv3ZtI%<}R6FUl-o`(+D|h9}
z%dI~>|B6iJF75PyBiydH-8ClNiJYm(Ym@z;(96h3$#dq+iEqS`&TMLE&o?|0+wtYw
zrd6fKdL@P9Dvf8E-R|2QsCZSoJISe+`=<Mq8ug7|CHg0?oMD@@T`~Tt$d_~94y;yQ
zR=XtZTu8IpHs=(-nI9$h-16j7SzoGt@TRA!j?QTZ4y!ruIreSax?u5{6>oB0_)N~`
zQtj5aXE<Nm`R&r;;OGyl-*)VA|G;6BAGPJS-u8`aju)-E`SEph+tSG6b7gPnTdH1<
z%$+vRxy`G1#;FOJH~Kp+x$b=DvRN$k0RszKG$S=4f>Bzt^5HvwEv_*?e<#8r{nymh
zybmWmN{ZOa{WC5<=f2UdCgmkJC&$F7Yeg?}o3SydKIc`(nKQk*OOL*3Uik8=Ua{jP
zqr`N}H?MD7&rvmgb}aR}<jRg$=>>`_CYBp{2}}<1Jo?dUbI@6@4?BYIluN@`D#IMN
zCd=i|yUoIr^?hG1x_M9anCCK;f|*5U52=ex(YzM4RaM5#BY5q^nYA0M|COIgbUJ5c
zoTYkip=I&Jr=bGrHk(d;oR+;L*h9^RspFu~q*J~utwOg8&nep}$NhDE_3?CiWOQk~
z(z0j!Onw;M4mMKu%+%ZRIZL-!+o|KEr_slW(++owHebsBn3cKOcGj=l{qO5e%Lkr_
zyQt?D(6wZ})P#8zH~lVVeK>QuGAdf>@!ETl(-xnRwDX?y@=(IVTb?fWjM?O*dgf-Y
z>h={<S=pB9xGrni+9gwpRMclI>zVuE#E<SIp`UfjU$sd4Y<pzpSD^Hm|FNHW$d!zB
zTi&Nhae3=mWCWdMKUSmVbmGLXlWwuj6*~?UoM~BoxxVAl&8c1|PkoxnxO}%@>b6ta
zz3Qip9_sd5S9Cv+kE~dxZyOwHFTHBgYEh$|ZC}i;6@^@!xFs(yPIAGC9p(4i^VYT-
z9|#K%SaIpawM$X6ZfwiW{C4A;^4oN^)3c{3vrC?=xl|Z5WqpuP?y2zIg*}UAdmcBP
zys^V@l~jB9y1dYoX_KyY-?+0k&?j__!`zr(m!<nR-V06(dU(p~Qd-)>wOn^y4>TzS
z^sc|U*3DJVe1_#zz2cd3w72f{y;0b?D%3Ugx^%SKw4cjQ2WRFUs`L|FS%31C^1aI~
ziJ2$;P5czSBAF(hwoY1~y_wg>#=7`N#kYsAZr6(kxE=hkqRgyZ;3Jot`dx+Fo2Fc?
z_N>a{KQ?`u=c9uD2Z1xTwRi0P&oJ?Ae)hc`rn#MV)3!eU&oHUTx+lM>_o~IwTLzmm
zea+XsHG3+#^S;Sa?kK~li+v91e|ljhcW*&#?CF#1Qj=fodTK4%7T{l|sXytGWG?%G
zEM<F+D{HURiQN_qUiykJvFl~YnzMUDJ10d%UA`8vStvxKBR0^pRClMY%0Y#zk~ZgA
z=6Ln=?%OW9bHY7Er`BlRu2(O5j&@6K*>af2%p-S+$~Lc{Nv}LXLt~`^UCR#^^i;0q
z_##m%n;dp5YlDl`w@9DOMn=XZ6DB>AoYY^Z`zQTFZ+hFR#}db?g~NB>-k-Pc?_BoT
z?N9fxXkA?$%{ObR$4OT=v4Ws|{?i|Xy=;oy<{jAj<X7v;9cTaCVcTvwF{9APrc5he
z)v+W0Q@Uo#pKzhen&};rnxaBHygKIi{r+*j>DFiS6EluwzS+w3WZufFrWe*_^SWf-
z+<4pTPNu)YjHgnQH;Om4Ectfl@=n3&3$i!pNX=VkE9)oj%C${p$+;w%tY=Y)D^;o|
zpUBlzS$fEBQ=r)I-;aOIZ%#DZ5WjlM;T6xEr|NIM-><ZHx>Q-%@6S`Fs+{!8bA327
z|EC*2b6ohI`?ucLzWt}a<?G7o(34YQiV~A%{d+Uza=upg)U~O#MJMKk+`H>JuWG?H
z-MB3q=a;-Z*US37NMdiVVaVH~9RU*_8m;J1uQt6Nv~2QO^?)U{6V6PUGRghS=e0g_
zTkbgTI{$p%-+j{-Yxs10Ty(sM=aA>5ldX!2jGRs<+u5nAe18~Jwe5>{gr3mREx*4R
zyfS<9HqzqrvqgI+)kV9xxZPeSR58czq-V}zqZ{21<jXet#P-SgnTOnGk-MheCY`#)
z{y)QV-R{5F|H^0oWYJ+$x;8cJ+n0TFW9P}rvMzs{uJSx+X=6+7v}-N(Teok0R(@m7
z)Xl0Nb#vv)wEu=2k2|VWvh26csdfCa9Z!2hTSd}W2JhxH^S!wz&`as8+PY;|Ud~kM
zxfQhU{^eZhwoANEmg-4)T$?iQiTnGB@?lRbH8oZmt~!=kcq+p*^kc`#850&SY>QSl
zHh$>DF=_6ZGrzizCGRLpEcmowUZ#0<*7ekH`{ysRE1q7NG+pZy*Hnq4ZvPoXCrw%R
z{$r|V*1xN1kG5KRswOb-ZN_%|ED_y8k(%crF^4y4&GB3B|0!nOwR<`<*Sar@Vf*Q`
zVcDgvy1}Xj^QBKHsmz@Dt~S;=WL>28^ahcRs?*D#_8vT0H2a4|le%BglEo>;i(N{E
zZ%0koG5fQ~?Y>D97Tw&sW{T@BU)O7Dsn36?aR1n`<ju7<+dJ2NJFc#~Y1HYz!$X~y
zBjf6^r3D&OH;VmHbAM#8SuEuP&q7;;<k_E>uJ$f=c|3L6`4I0DXI}Gh`<eV_P|TA~
zHF)~z<e}R|$+f$_-Ch$fd(L26(Y6iln|`@{3V-}9I^@)bjODt0`!_9Z4mY^?VD%xD
zw^!;JA{XUcG-KKvyYg3dURc-4D>iAFfhT;do%?fT&mF9}e!$oM(B`K)*W4y~ZL|Hg
z?M|I~h{@TA#XBFB3QO3ewu@Zeu9`2uAv#jO;nGT(noUOGO(|I)zg8SN(`|H)X=CXs
ztMuLjt=SW^)C|M9PA=Kc<#W5@jj8HZNujV?*EXpxIqRupGIQcb@k3TI=Qo6H>iA*3
zb19dRW#Ml<TdOsZn(GZW&Nt;LEEef!jT02R$N7gbP3PnT(HB*n4H0K|uUUKcMVjxi
zN!dHQrH)RITc_LeeZsAftn8IqzmxR0+BejjL>?`D@?z@o*u;3H+M-+~rR)sHJ;%kl
zwyCT*Gi9FIS<kRzF0X9=o|q<<+QWd-b|AIYO7Q$<!YhRBrw1&TjuLt`J7`jhx8Ln0
z|Ag-JY}Y^K=KPCsVbV_T-aD6)i(chy_Fl6tcxA~&qnj&yZ6B&9<!AI5T<4VUI4k{i
z-m|)|>K+C;%f&aYsQb2l<3w@R<;zNbzFpsz%w2Ie|6t(zL*D)^ah&h3q?t|-+i7m#
zbKmo`woS2ls7mHV^|NOeKH2EC^GB*)NyXWtuTq{0J)XRK(;}x7m5E219z9fktkIL~
zU&<{sF?8Pk)|8pT>-=^WmAWpSYM*nVIN*@5ZmOi=1BGvWiwd`0*;=q`snDFjmCTnM
z)UDO6Z8|#>Z}okfe0=@UEiab4o;KiGwJf$hQFO)C^~a*0J@!+*x^1)P!bfUXZi%>M
z-b}e!+UwWu`)S6nJ<md|7R+An+j?YLYVmDjM|&Y=U!6%_kM5kEe){Ong0F9PO5Xpf
zZZNgw(W@<M*82R`40?3!`mw09yz4^RC!LNtDk!;>W$C0TDbXL6e0zN({PwA@{QlW*
z-jrsZsy_AO;zT9eX_jGcl!IRt>Mj#JQWg9<x;tp{fk!ba^97fBx^rq4&)igOmbBx#
z`(K}`eRbL;&w{+y9L>#8jEM4G@-}((lqGT(1BJ}fcHBD4vQ#VT!jk!WUd~*xHP@nf
zy4#uepS@!>=k{&PjP{PWqC7qB@v{}LC$z`Jt3J+|(6XahcUqR&iEaMkH%`A>H(@KU
zdy?_5`WdDZZF5T6`=kxdRTur142o9OHS#lO?p5#96Mok8=w_DVq#Y>_v)k15Hn}d|
z`t8GIu|>X{&zwlyRLCy-IL#$?(q_HtJVBMq`#EP_$_(oGu3S{hxYKsUg#)QYMRy}k
z7ja+Pa%bbNS8qbUJv`B08T0#-Zt85^ecPf-SMn}W3Yso(-_&E`!<Fs_H<`tXo4jP6
z81#JljyrdQF8$#w70`a_w&>(Tt<X8&;}h;g`|9o%^K|ppi!8cTTq++f9w-|Ut?0V>
z;sN6|%7L4B=AGB*%=S8)dFguSnJl9V`-=TMJMt}VA4|WM8CdXDsK$MN#FD5izMcN7
zE_=VpS3Q{h`1x@@?TycqE~~Cne&<x?+^2dZVvD!i+k#2FDk)yCqqe<HO-W7r!EChn
z+OO$vZykS^wRS?Qr53X#f5Nt-v2%WO2A;~YuKBK_aQSq3UY4iU(-jl8zTJ5>Ia@DR
zd}qvljZ<Hq&6{sBS$?a<lBeHf4^NB_5}LXw`?JvH_bPLoyHW+a-|ox|n!4%AsbgVz
z4{XkJFUj*%)|k({uWZJCGqs?<$`Q(<@8&rxS*aCqW)`gBIT^F}o@tpnr>gqClOIky
z`mITEUs@SfP_Vk%uJ6^UUDh?*Ew!?2o*Fbq7JiiKWpPV$bKT$3v8!2I_2A-g#`$-j
z?_??abh|1eVD6eDVwdw5UeUQXW!Z)4ODBnVDfx42dny;%-cilJx?!Jt?aIJgQx5%@
zUCCL<^&=>Cr~H401>F&A*VHmFI52H>T@`*aRMl*H#$q4uUCn`<v(HTOc6%FVm3GM7
zX`9iTm9tBi8HL1HPns9RamjmASDj+}vt7=8SJSQHuI&Dkc8OWc{72LrRhyZ73+B}2
z23<?>tjvzO_9S%0>{7QZ$1R(~jicb5q^zjyr8nNqxOOUO<I8Pf+t!P$p1oVW@|xJD
zvfHzM&ld04HQ|=0?zXv;T9&G~C>ZyeU77i7vUHHi?Eu47ni`9(EDhHiR*Ib{l%04^
zb-~T~A{VN^O3DjL)-XM6dcI@c(&V1hiqzd@qVHGcN_A@G9+PwYu;cXEHJ!Out%6H)
z*K~Rs85t>yu6Z1h@~Co24p+O^W}b65e{J4gXp(nzOUK8P_X4G^+U4Y)x%ytYd-d@R
z@2ori;W|3Q`cszvtg=<VyhXpAD1UM#-sElQmucE>CotnYV}kF`tnO!XSM8g0w04v1
zPgC);)}ghkUB6Qp-XWdVB8z-H%zm{+KbL2%U$QYir@hYC^|RmAc+aGLHyMx*e`!ZK
zv*7*775g%8t$wK!ep~VW<iMKCTf;9!tl#E<d<+H32_U217>$k5L<K)*X_2;0`cgan
zt~*cee%rFjKTG?G-;?s~8{N&P+?syy=<Ul7@3-fEv)%qtVk%NjQxZ16vey4(!R|jr
z{I0qIdA@67R`{%aZ4)tJ>aI%@CI?PEI8*GE?l)!Iy^k4+UY2%W`<2>p+tW+v%PkEd
z8842LDoc+$^>h~--!A^)Iw!Pbp2)1bI)}5DZ9hFxTz*~Zl|1)DTaM*SiJl(#CZ$d>
zOD;rrYQ(3ovo5mtZ1c18!&8~9_blsYOj{}XA+l-5H<mQ_LqGS+9uIQeIQ_<)oIlF1
z?G|g9EqWO|t8vwiy?ZC+-CVqS+2+Kbp^`kircYG7o-FNQdZc#wl*!(Ca<^xhd*)w1
zWE$9$UAI2+<-gS4WZkT=`3W2J6#S-r-IgfoZ?s$@n$ubLKf~*!B~uoaX1xyTPbn4e
zIW}?cSMlEWE1sVDa<5D}(_fw8GJDagX&-;v)E0Oh?5pPYF5J!+{O)Y2RwbWwminA$
z79VF$+96{cIF(!Wps||z6RT8i!MAG;*Nb<WYxuNrc5pQcX?F*=ylz#D&sv;5rJcXm
z{aEhw7QND0>sQUJ(NWtiy63RO%!%iAB$lrWJT*Jk&cw(#E5uFMaH;C{?qp^2Iez!E
zmQFimKQDOQ&3~eDrlm(qd1hPHOjvEispD2D_~g18*D@`iu%D}fwV#UL(#@OXtEGFz
z%}YG|Mr3@V#9W5`oso&tw+sCW{gQL4_GR+AC!az?3ps;hqNGBMCk0&CYPMuob5QW+
z6^=XqESSG!R@1+AJ`;XRmaMXtS!5P)yXXAkq=kLa8lH+z^{l7We3>01!z;OEiN?MY
zO{OYa%uM~BrEQ<bF4}eZSmcTczc=%0TXN00p1ju6YsU`GO<BR!Kh$d6AM0+D{S~`n
zy5*eN8sEQ8Tqrf+>RNl1so$!WpG`S*{Yd(^*xT`9M_gh&UcDBq)J^(yE2&aUede{y
zrGImeNv2CJvd>zr{kn*K)e_0vq+=HoPqxGyJ!tYZ-(cslG`EyEL2l<8(-wN|@muqK
z!<_5dnrU}73eSuS<4WE2^zidS;phDGk1Z<b-o$;tsXS8qfUaHG+T?<F!k$X1O(}Cf
zCcmrTofTQ>qx!^Z^WzUufx25J?YrH1WIBJ`qqUir<W>gi{`Txik-iqF_4Bs4l3STh
z#j)j=PIu)zHQQd<F6lO9!S|aUrc&m{s<&T==Py|OSjup++Y{d%M->+yH|&{aT-f<+
zj&Jp};DFx+9%rNFPUwkB)g9l(pL{#_-;I6TD@<<2%4pnJyN9Eu>-yUGGT)G0;jwAL
zIcpz#?f=tk9C%tqc~ZK3%2Kz*rW##!{c&sNrhOEjRavo`&1m|^S6Vk>TW^XTxvlbE
zh&Sk`$IUHkvx3)|#Mr2~9=)PB(NwRkZ0^Y`)0EHV@px=Xu^0WeLuJcjwk5j`UEVp-
zV(LN<+n&=!uZ(qjq$l#8l&d(lML&4q_47-&r|V95E2f?8%NFwUN@~1jkw|ON!$mrI
zXAAkHQ<iEacLe#IUU}I#Fn8lRg|y0R@`?SH=aTj<if!4>vF*tsN5yTkA}cO!)#Q`s
z{5oARdfF4G-E*H*{q|07H4dKqAV7HP`K+}+E`6+=EYAD5=kx1Gt8OvRBhytTcs}s_
zI&1qskJ*c_{n}}!8+0=yddIE47&9Zbg<GzsE_sm|o@y*BJK^!$S-#7+Jy+EX*NxHB
z&pmUy>SXTTv^mW3D_$D*MhCBZ8ljfVDx~%B_7!6z6Bp}(Bg+FWs~i+*R=bwB;})+G
z?>ym1$v1Pa1eeV^QCe!cUf;*qef^p{6WTT|yU`ZL>CNKyxoxYb#<E+U9!5d&fg0B)
z`3r4VS{D|qZ4w~D6CL^S&2^Rb{zqG;2j83;DE0ERQs%Tp$NClSs&yVJyt4OLR%)h+
z>Q*WL+;6&PcIRw0GSS&<F-68(al13$rlWJ`R`cu>T6?^^Y|>BZl{G&)E~Vv~JPPnq
zwaD;~be*wT&%r48rFzF^u~X*`i=R@s=DK=jVYKCsHAU+lx!TT;aq>R4$>iN+m3`Af
zmaLdEDKIZpenLsxmZYWMGrnwBOS>id)P()@n#rtsmR!Xry;RLIW}bc+RB>ri(8<im
zirk5Uou`h4<vJL5NIma6zc(~6dHb6Kr8hSw+_kvO&UProDe^?$b?#-a!sZ3<mhN51
z^>lTk*Q<Lo3orGal<=9;FLde6nrC{uc0F6$t(IsKW}xbkySS3|X6k{=XM$3)!P&||
z{w<exwuK$)PW!^N)pc@&_o>{C-5nWyUn)MXS@>bvDU0<&lMg&Q6<93r+D~wnf6Ivr
zsqecrniv+O|25uaUy^w_CwH#PS*fb;TM9ys*@k&eP5U^fM9SlqvQoyBI9019D&Jli
z+byh@E5DlmE$vJFOa9g?saGe)9Q+)4wJ>FGVaP?@&LWXZA<I==j8&|%0#|wk@ort|
zzH8dTrSQ{HY`Z&e-*_Ui*7WO9mRXwLLO(t|wR+N~=nHa%TUHicTk=(E!eSH2>4CF+
zym^9!U-V!6rl;*IZT#Z>;>$&B*^jqptBDE6-6?-2oT)x1lkMC~r#X*ICVrhLDKPE3
zdP&6R%uDNU+<G5m-S-%DUhm{xI+a^qhQG-zU&&uo?%HoQSL<)@y04H!ZT((se0F(h
z{Kl>Kme`)WD`s=6Wc4?*-B%h+10=(DS}!_2+iz;!vFlY6kj(|(H-~f|F|wS=s#&vE
zajggm37x3DI(zYUAsLIFrPGuRZh7i%xt*!xB`{^8$|M&MV0g{IK=ibrYs!sfy0b3t
z?oW%j8u~S)RnhO@=C(U4-bsacd9K{faVgD9$>`3mo{l3cx=wD2dgC?s_UVXg>yM>+
zcukB9);i@lYl^3yn9{b#t8~w=xzwzrd*+*xu@J|Z?f#iXH6|NQED5>zXim9yOv5F?
zTi+M2)pL*bxZTjn?PTV6N6mG6$I+m?Dd{syUFXEQU3szjncZoPwQqH$RzEmYI(@?L
zB5}!=M!erE*D4pAbK0<EdIe<#^{%_-ACXxj{_4%T)l%DcE$Q2yv|npoqI+F2N1=>|
z+5J?v)>%H=y-pV^-#BV*aB9hy${kzQ9eZOWee0)*uV31}TQ$FrFY=9fwrJsz9k1?A
zdnD%nE=SeN*vu@a)7$pV`U`QIUu3uaGGE29eaXqBw5^vnE&t>i9T}}Q=hnKeOWN~#
zPG>e1^<Fi7sHo-XVLEHt(n3{hq3Y!scfNQZEWGAbaxHspTHO7+A2*#A;yqricw*Vf
zwSI5jEY8ey^0+j`AZ5Avjp^*-QKnUK(t3$OyUuioyK>(CZgTC<tTL$?&lI1%xU%J3
z_)VS}feWuLRdahdpYNsrVcnOKv##Ghyyoc2TRVzmWowTd2~C=Au5){x*Bs>}lNp^4
z8aJ9ORz2zU-E^Xq$E2mQ`;_XGop)dMxo78Gn7iy&E3<Ux<hA^Fj?Z*i`EK&dTwR%t
zJdw6ZhnL+FazEqz;*pc+!EZ-)o^fXIQ=J$ex9pwIr(J1#C;3b=wdp&WaO+X@vc={h
z`?nbz>FxcKY`ft~%%|9`hd)~f92JdA4|U|S&h<))I(fKj>z>yyCQLHYtD1bu?1z8L
zjvXCm)jf7yzjNu+e!W#$&y**sBwhP9b#vCn(^dNp$E2D~krp%R)s0kjdH3W?iJ9vm
zH<y$pGrG;3g3|U*dATlIU483`0-0ai8uv_6Hq?up?^MPeuBqiD+x1Ar_+)uwma@6f
zC!R|;KZu=D{eJS&s!xp#SFT3x787GN?KHgb%JTH2+kvt@Q#7NOw|py^l6B+P<!yIs
zBmZ72UD>m_W{%jWCezuGQmYcfZwC8rFFSd;Q}&2(b%eo;m;ORm9(C-@bd=iR@%_=W
zz1M49S6$J2vheZqgjt@YQ?Gqoka*oV*z%<AmgV=84mO2kc%;lzExNC|($n$XvXh<x
zk=<s|np@mT>%1jT{;W1O^jdBC-L&N7>Zf*hcFwt%Z|7;1S-Iq!^B)eifYf6#Uh$lK
zDH+q^x^}%`*=jlIz^Wy#RVG4<mA9)cwwXS&XXUAin`WIJPWBHy59f63oD;pHzArx3
zvMnO3NcXv#OU}fsJayZYgHCCQhb^96`(r2UvUTRn{Ih}D^{m!8Owt)`x3n$T#P8_F
zw&v&THsAa%M|a0hn{|ArCT!nXU8r3rXD6q(_S^lWk6fYCx>7dX-FWO_`h`Ua+rw6G
z->~|v<f@Z?79Z_i>-uT3+>(l3x^vQE-IS@iDw~YGCa-N+d+(Cox0!R#sZVilpPM{o
zYG##Q%&WM5{aLqyC4-`UDr1h!>U2NtWqEX?ytq7P>4b|5UGsR%6pi=mS$k$$|M=!9
zDE^<}MQwxhPTo7j9fG@X?#tBI;{921JvW^eo{4$zwkOgvJnNI%#4EnOPF9y*svpb?
z;<%f?qGwJ{@8V^d&lWqMxAHyJuePX^&oQ{E%=D;ilgARL>TN8wUiysBrntGa&o)&|
zsWUPP3VvbuN0?{n)05sB>S0qB`U!b=dV0=W((@<HB`fUc`$G9c)_srEb*CGNYRzrb
zESwj!qG$37uaG56Px3B|7V5aXpR50vn^L9tzKE#$WZ^5EI`Cs5<OM(5thMSEpJqHM
zGVG-3W!*DhHf-!(S~mIGFR>?upY&@sZ25OgeFdNE`)OkHZyfo`yn%iGy!EfVKTQ{2
ztW&l6V1dO?n5pba-%oS<ze%cUyuk=jA~?-9^uZ39n(q5Ly7n78UolUte%iYHo1v}4
z8;0}YUsZp4cbt}6^&y7O4rV;#iM^i=UH&aGZ$TMD`s!bk{wzI`+8<h>$1V>uyx~dQ
z&&15XJ^lgK48~Xgde#L!GCIC$k2rHb-0&ipt7RJ`U;SIMU$am$`Dz?@<8ip*mM}N-
z9pL#|9~3`jN6*Gr>sb#xh8r#eb2EDaTh;!R`ln*tBEE(<J-`|y%(n4Yra#qF9t{v^
z8X+-2<YW9oo1bRN9Xfckov+a7cHhD+s~c{lR?9O5xj)p6`!G-Rj{85Q-%m~cGqm*S
zIKQ$q3z{aT=l4GDZmrH|Lrtx<`p*_HFz+}(a7m9qd4-zb<mjtUB-Nzdvok$57i}te
z^(=lOYg$U$GMOk}U&&cP)k{tviD!7c<>2fsSGeXpeEIU}wUBeGVpI)}_)nR-VR7L4
z?iJk=xASwpKl)gA*INeD-MVGT>ocZh22{NI5bL_KsbbD;)+^?P4quyd9H!h<HRPE+
z<IvPouT}IrwmGV*O@8>u?#3+1^JS*Xd?#&9&sDnTy7%_m>z^X#Y)`P{n3*}bFzv8M
zrss_*+e2#9k1RILi(0{X=H!_(Zv1X9zy8=0<z}iDwPs>yZnV+e6Dmb2>xz4%?th5U
z6J5BBbIaHHpZtGX%TAZQ5W~)RTB|5m|JcVrwt~s_T$gl1jSWv-ZCz@^@@m$DySZyQ
zgTfZY2d>$sr+eCy;mnOQH)lSp(wnnhao36DC-1IXr-yVDUg%h|sne8CF>{vt5x1>^
zMyD$L^OpUotrmaU*KufK=%mYGcgsYg-^9ro9kLeh-zX@nzf|w<&SG=*f7cKGj+cCY
zby-7NvazAg?#0=bHP#&4u0<SOWWCs?Z<*;cy|92Ag@v8%QBw}xSDG|gPgT*wETsA2
z>#*ZTbJkq)2(7YK_L8XDy2PVV$#8dkz|oqpyXBoRzDE-}c^7Jh2i2(NxE@)$x>T--
zapuOEn<giBMy0KJC~lth*0jw$r90D*Q?q7H!D5?3v%M}YQu2FWD75Q*l9I{9nQvW=
z@6x-FTzTcqs;zr$+f-~5HJ8skoV4u4y_v_d)!&?T+b-?7YqO`~w&@#HwF*<*yzV#{
zn;4s<27U1=yzIDs-KmADFDw>E9RIg(qHvO@mrSl#@W*Qpg>FTfWkkCyS>2kxxzk8l
zMxaDq;drLT^_FKLA(f&^%a%@$IyPlh*^Fr(&yM%U&GZcBRk@Yx;guq(>ay5QDR9C~
z24lmSQ*<pZWz5PqJ^tYCi8-fMu72gQ#LVU8Dx>>R^MX!%GcF9`Ijb^B>Gta58|#Hi
zUTgNw@jJ0G=aH_j>z5gy-Ycw|X{72=IPKC!VIh&1ic{8d<Z7#KdezhML*(X6jV%m<
z<y&Ik8vJJ{?%gwM`p4P6E}6f)ennoXxRRE+Q_mqQ?8wqn@fN3oF0?G<c;L+9`RI|V
z#!1(-(<V)6j#js>?N(0{+4*{LndRQN{B5VFZW8(Ao2ed<r<%65r_;mecI}x=2X$k~
znQf2vifa_Ed6X9#*JC>^#X3=Q`_02ji(dMld2G)4+f>uLaLdZ8vw~X8p54(5?pCtX
z%<g5MY%kmH&#j+jGecraPH89Kwb^SwN7PQAsOUTSq_OA9Cgq%I?wk5kCtL>w(Oaz&
zlj&M#-x;l5HsNBKNK{8lZ^wM+wCyqVEmPL9tUTVU={R#m=ic*gTQ@K`Uvs{O^Vs@;
zUwhAA#d2XSc<C3xGK7Kg&#pY-Y_ou=7M^x2D`QmB-wVz=zOgR0|35?H!sM^n&Kbdd
zI+LbMoHZqVqoDY{q~%?&-c5d9mLHTTHNP{(%<w|qUZGX5Ot#N<loVX)k(m`JC^<1N
z^|9f_aDC&M-TviP3zn|f_;ioUQQzI$bY|DcdOh1MWz`vHd-?Q@ZF-yUuRdSFR^Iq*
zSN7tHHr~k>pGfw~%$k0$GtzRU(W<D-#hTd@rk(Zb>kBtrT5REQ_K<s2Pe-iV(<ymX
zOW1p>w{N$pd*x=na;D6ht!La``Q>xGy+0wujN|sko|#cQvgfGIUYX0ga8V(rsZ{tO
zZ*8@*cah7tMjn+jo)&v@ozupfo1es`22TiB>>0gc&cg^vyFX&Syn-up#ksB5W~lwU
zsnhbrmVN$Xi@NweZTWLzOTtc0mbtVf(w6bigy})cEb}*Qv1qq!>5~%`e826GwCtr?
z{e1Zi{gHydS6C`@xU9S;Ui5Z}ccS&|OwCDgh3{A-XHD=d=;R7qX)QUkXS-|eI{h_;
zDtGRO-woVu_x4q$q5jgl8;{)1Svp1SSV&p8d8?l9!}8Rxc`?~0lG8RWvrjJ8YnT2P
zFRePQBPBj-Qjk>iN&hG6*Zwp7XJ9`P_wB&tsE*9zfyd^uTsC{zH*<w}?yO~62g`Jy
zY|Gdh{mRiSJG9f|&DOk|LT2VZJA|gZJ>=qgxqMMn`h>Zak;&&Kmu{TZ(xx17s_RV0
z+4({elP>G23f=O2mvZ~bqw=6#dz9~eV%j-Va%OC~|3>zSs@pS9T-Lifd;8>b1)afB
zPe0xHows(U?+?`n^Z6s?p6|_Lci*eJF3XuGd9zHGi)ZK`A-&}u-i9g#pOX#zY!)R>
ze&9LTw_u@K9H;u4j056HZ_d3p;r4m|o?mq$-#^|(Q(jHgTdcWdbz#h<j5(?@p1kke
ze2lNu#xcs>erK)zV%yxPnDBcyL;WXf?O0qedAo7QnWgUk848|k^L-a{T-r$4dy>$j
zODS&dI|aKBu-@3MsrPcFh5yxM&)d3uALZRy|E8Y1WBR#ZmFMEYS(7|VPPf->U4CTI
ztv3wfJH>Y}FoN?90|PPZI2p=ikBDXeXHfG0`-lHyE`P`GwEdq{{xjS^@Sj2K!jJyx
zPxD{f|LkAW@6Y+A`u-vY2Aj{w8!``?ek)mZHtw>thTf+qN7kRbpIW1`PITFxDU)w}
zSH5v#SO1D+!>{&Bq<5U*o^E>X#Jr7{gwN0M%E;=}xpDk~UUc*^+Zp8@XOFD!t^Uuj
zJKFV%?c&tF$KQn(`)yyeey*C7Y@YR{q=i>EO%FcwDqyMLEYHjjA+C@07T;-S%>2hT
z<LwbYVFw@6h*e>?wXRhN#;x7n;d*$ZLABW1zsmzl`8vXSk|u}$Gd5?td)a33r0lTm
zzO7SILw-#t<y^GwuvBl%RTuN59d~CN?$+Gyd+h9GzxOLW9u{hOb+7gEZ!+0ydb)V$
z`zI00{MY(!dZ)Z1V`+b8c08ldx<@i)^E9-*Gg2p&roCLMow;Roig;7{&fear+D6Sz
z!O)ffX`}0sCwb>)+vw{Vsw`fadU#XsA}0%;f0tWrtaF@Stx@NRe&-pd?6rLE2G`Af
zGHkuxbEZrRRyp5ZTrtVT<@s$vL9vHg8?sK>d2jx%=;V3tsd&h{&n&O}<|jrxi`8lp
zb1JLdP^A)a-e$?BLt%Z)6K;1pPTKoVShwf9@++fLCEugA&U>yKyXS}4`Kz9D>p4Rw
z&s~zHoY~|jtjejoCydYj$fe39k3{Bgnz>}cB#$qDW_8V+{3z<o>W)9R{@ERw^q4bv
z<B!v+pRT3uvgls%&FIdaBs)I-*3c)DH*1}-o22@87U#}uYr?nOIq9jSq_XQ%kawh@
zs;!st@1P&-S3lKniJ9Gcz(zMmM%^y=xN%p$@l?0FSC`CP%I&$+;&YkTrQBpIQB|8$
zzS<ui{PEXVAGx(^->gLs=SCD=@^f8gzH`z8t!?c8nzCzqPpDPq2!4uwT&n54DyrvX
zj91(dk2&XdIb`|S+8eowe2rYRoBznuW2qe<eFIf*oT^<Z)N%d5WV=YSP2Q{3PPeLW
zR5@R}Nv-+i)VNytfEOEAgbGKsCUUt%{V05Vbnea7ss4KvIdgii`)|Au<F;;-X`cJl
zEobGXzHBq*ip?`$F0t3g^W$!=Xo1Yf8}4-c@ebEKvXI5wU3i^5=ey_~n{OYo{nsq`
zHhyz>tH*<6vo*J#y*yf*YA!xicgmv$vu-VUpKbD?Pg-!T<?X%iwy)dw_}sUuiI*hz
z==s|EoIDr1amLn&L#k1abyGHL|CXNj*~8hZZ2zX)AI{7+JL#3GQlpZVC!Que<L;}x
zB`=owovWW*rn9H&<;#sKp~u&28YQn?HvOhx$n?rQ$(5e*x@S8s)ogaubg91iGHm*4
zAwQ<I=12P_yysk6YE|*K!Ta&!M;d=*_WtO(x}Wd&*B{59?wj$jEad7{LBn6Wwgzm=
zSTI#+mDHRi@%&sdZPRB(`>a&mc8_uSijEVfR}}5fu2d>{>NihF<hgoaTCc_WwvwfO
zZ_Uk9)s`%lpLA*S+((OZrB;;}<z`*2J!^MfZr-8Ym)y?z+V#dq9LZAd+p=Pg=2XFz
zzH*<Ul<#g2TB5z;*LKf0)sm$XB~Iy-hKDRlyBYH(ZM~?Q<w}zsSE`ztg(rQ{oTwzd
zqA=~edCA((Q6kfYQqoe^sl1tV^R>yollp7-91hu(b8YPvsn`|!)-4S&_b)P8=_<$U
zZR|NwvZ6Zwv9g%C!5+)2CkyZR&CH%{x>{2!gstcPw2LaUCv1NBVXmWDlh^q}N={`)
zH`?c(+_U-QoZj|}+LzjtW9FqwFIn<3R<DS2*<!z}ms#gmzBO6twKOu#_xO+Rc2^%%
z+WtALeP;7o{j={sofMw7)@N_Cw9FfiH`}9T1n5Y4MF&NDzYgk{v~>3BNXZ?2LXNdt
zj!n&aw=wO^&h*Jo%qp&?xpqh1*2<mq@ym91#pxWg%eZ+hcIp{41x<Q*I=5PL%e%}+
zM)vONzFS^OYR{ZiAXBi3J>F&NZP8%a8QUk_n(f{8onLar?5GP~_lvxy#3>ueo4hQW
zxiGxjZEiAWnBP3<!&)6r%362MyHvx!dD^S++wULSw^ZGjpYu~%(0yiv#;dNL=oUHG
zc}r$BUD=u*FYJ5v@Z)&tZGneEyZ6<buMdj)V^$I$c39^{|EinIHaz-1)2_5yG|ce%
zBF><VjDgj0mn1zGK3#f6^|7VOy2`=|mDUB5TvFD4UwDwOscqLG*S)nW9G91Fxh$k=
zEXh&&?6$Jng{>3wmMrNxsgk}iuR6#2_>oWFthrCsEDjcLT6{Km>RQ%p_n@#aua~yH
ztM_i&Gdn$%!)vao<nFx>7})!=p!FAss(WPrQT)E<@;@qtN$8zU{a+E^e{DW}<Zt-h
z7YqzlY(x)UgAB%oN#4@}HpDOnu{Ton;Y*zeL5XHz;}g+I#+jK*a(R9POuErs+`sdy
z_&&S4Z-k%jbN*$pZ=KDllB{D|de(M#{4E1kOkLy1F!z1JeBDp-7hAs;z*&WPdCNn8
z7OKx!xuc2E`rbji=qL7zxnDg*u+Cn2wO>z}2>De}L1D2XaW~zbt>T&D)3ar*N3_3s
z?Ubo1Cr+=pGFkVV$x2V#6KDFr&6>7Zer?vWMdnJ{Hf~csu1w1c>h%hmve;D9D{xk?
zmoXR^X2G3^<XD@#ajQQ)vYa)$LqT{i+^4c|mh$YCSL<Tj4TB3=n9BDy)oede?<Rir
zFoHF7<dIgo3dX`csB5w{@WsjpK`znVN&Acgqk`MqHalmiOk8oIWBZNn+K#p@nrHn@
zrfw}OlMYy};q}QqgMshx3gm;lKFa4HjRb!Bv-WTLe+I*^6JI{G>#zRez`#DQ1f1rG
zC&<n~du&y^nUn>R(ki)g+;VkIdKb>;5{eegEBtP^==<^8@K`%O)2}z;sO$0c+nvv~
zM1{8Eh<sgeN>%wRYA)cQUVTITbWQu0#M&5ZILmU^-0-JIibv%MhL6)z#QhGU@S@<+
zW$#eAu6e8~t~<7AKQ<LzYLSv{V%Fp-)zi^k+p+ydN4C?PE1x~1!_yPDauv0#S1D^?
zcw+_Mo=1F&Vqn_1GI)1n`>9P&i>rlP+<#c?sMwLRW$pIYw`=z#8@<2(H{###e=`{V
zoWHG8r_6pzuvaqDLa*gN!>;IyUuq^g&5hzc8z23}d4K)a<lpaqCEnlvYvbSVe+?Kv
z)%`yHZ~2r8xn=F{tDhaO`_J&|`1ku?4nq|tFuvbY|Ekix;lhirH8)_^l)wMCroQ$+
zgAar4_x`{B8Xu%&<<`%Z{r#U|wfz11%Y1+TGhFR|zyD<e`@WC=8CE`(V|HGA^>Z@R
zOt|F?-|yLfwOM>1qoitI4%|%Z@BbOX?Dzd=SjHd+aT_mRfB4xxn4#@3Hy&WVU-6&e
z%BOzDg%@A_+z9g$%#jS$@8y5Z3w&T=WgBmXFcj`e2L9T|{~3Z$*|3fJ9Cw`Gth==O
z+>2Ll^LC4G{H=86-|^$e4<DAD^5dKCzw7nl`}7Vlf8%qQv+7f><oWFI+PCx7`?~s+
UGV>W2Y?~*Lki6ka?f>5d0GQHug#Z8m

literal 0
HcmV?d00001

diff --git a/tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.xml b/tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.xml
new file mode 100644
index 0000000000..484e7c78ae
--- /dev/null
+++ b/tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.xml
@@ -0,0 +1 @@
+<mxfile userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36" version="7.8.4" editor="www.draw.io" type="device"><diagram name="Page-1" id="74e2e168-ea6b-b213-b513-2b3c1d86103e">7Vxbc5s4FP41nuk+pMMd8ujYTrYzTZqNk9nmyaOAbNhgRIWc2P31K4G4yzZ1AHtaZzJjOBLS8TnnOzeRDNTRcn2DQejeIgf6A0Vy1gN1PFAUWVMM+sEom4RiGpcJYYE9h0/KCVPvJ+REiVNXngOj0kSCkE+8sEy0URBAm5RoAGP0Xp42R3551xAsYI0wtYFfp/7rOcTlVFmS8oG/obdw+daWzgdegP26wGgV8P0GijqPf5LhJUjX4vMjFzjovUBSJwN1hBEiydVyPYI+k20qtuS56y2jGd8YBqTJA1bywBvwVzDl2PDpo1eO98b4IxsuE+PHijF1ReCaXADfWwQDdUhn+HBO8lF6teCf8SpRCIKUNiUAk09/pUOUqeJogRxvXaa2z01Ke8ECDnpkDCxDehG8ROzjgs4c+j6yAYGPMIgQjkoC64WBKQycT4+Tu+m3h9nD5J+nyfSxax62a6K0m1LaRYlxBpklS3T43fUInIbAZqPv1C9RmkuWPr2Ts6ffIKZMbQWLnEGQujaIlpDgDZ3CH7A4aLlTkw1+/567CCX1EG7BO2RuA3C3tMiWzqFJLzg6xUhNPUbrUH2A9ltia7eQgDEgoHOTa6juNnZjBj2GoE9M1UHc/X4xZq+erq8nDLPT+29308nWTU8LRqrSJ4xkQwCjikCgQ5MBfoswcdECBcCf5NSrssgK4vkPErLh+QxYEURJ+QpfEQpLYmQb7RYi5QutsJ3O4rzSQLqA6TRNLGwMfUC8t/L6H5Kc0pEDivHiON/wU+hQyDzAKERBBLsHDfN8XylM/WG0Cezu9/syj/XyY+VhZjvDPgP7gKGsJRL7LiMUbm7unx7R6F6UXT2dUp7XqSCmEHuUsZ/wqN/45HMnQztq8qQfw8lT0eDN9+LNM1vss85u1x75Xrp75hsdGBq0emhIqvAPhAb+6D3y6M6ZKi+VsipNo6KhhAf+VK6kIcZgU5gWsglR831Us1LL7plvWLvnW9rO+fQi4Ti3sEyGzQKmVguY1x6OyKO3hMyZmCP2W/MyBbeQYDdNy0cuCBbQoX5GvW6KchctX1bRfoQ7NCTZxEPU2YypWTFEdoH6nnO9y/25XuSCkF3Ofbgess5RDFWHX45tH0SRZ5eFNfd8f4R8hOMl0gZPAe9SHe+HodoS5HuKWOIFieoCgaa0D2K/mrwrVewn3NewX1tI1aXPpiwZpiXLlqFrplFeV27mUw6AZWoEfVlFi/5cGhxR9bpx+VmxLlVFtixZ1XSlpDCtqrCmhrB7WbVhbDnEDtSaHTzDqGYKLA8rKzoiGL3CVNUBCmBF+5zEk7exTfUMKf2KuVKPlRt8YOk5TpxpiJxzOfvowherdV+sCcxHaSP/qofCO/T7itqSjihqUYOjq+KKFUD3KBSW7H2VQBfbUqgiAw/jW7bCO6bap5+fkr7cID5BIlSrv8r4iVVThsDAusurVH1/BO2zvOI1VJZwHRx0FVMQdLsposyqBrVmgW57EfWRUGjWFLqlIXdidq/12kVQ6xlDTSKnXU+kAaZk4KZY5P1klXKloNDMA/PI6kJ6VeMtdSVq+8jtdg3UBgcUnRiZoEl1oJEZdSNTj2pku2sMU+2kdEnbSR2ULmrdX7d9FjxK8qLf6ShY0Fo78FSmvyOWETtiubl/aqSHftgaw0h45LGbq/hJWqzte+TEEm3rmHl2muwIYO7KjYDA62ERziF1p7ggdbbiFqEfXpfTUsr2ggUl6PndY5zBXyjbNIgoY3M/jmQuLdth0E2JXlLsZUO9VrP0g9QqakC2olb2FsifrNRqedCrVkXFAQ9vVSc3R3EaYWfpWK5ImphJEuOxBtnx7XB2O5lOhzeTWfntvILCk5VrLvWlAzM4sZ5b1mNLD5gtgfJFOWYbTTet3t/sTvnZa15n5W9Tvqr1qXxRO6xz5Sfv+J21L9C+1iv0t/fbW9F+loLHK1T71tloVe1na8hydr1Pa+iqMm+54E4JX5bLZH4TE2UGyv6Spboq902/ZH27akDRAUzL3/vag74Tlb96kUGyCeFAGfHOAIzIzKNWuk5IAVjywYjAcEZ1z2cuEYEz4DiYE17hJrmidgRmDiBgb/F7wOHTPuRSzTnGi6EbA1EXULHtE8SwXMawInhvSBVl8nobGO76j6I6wrAIZlJt9p8LdKF8dgL9DNuPwVYVJGLdwVb0tt8Ztn8gbD8Un7e+kXvG/i9hX+8zZKdrnLFf3boCj9P3AA2PAs+424I7Q9D0bgt39Db/1wTJuXX+/x/Uyf8=</diagram></mxfile>
\ No newline at end of file
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index fc49825748..94973a0e52 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -279,6 +279,7 @@ cc_library(
         "platform/platform.h",
         "platform/protobuf.h",
         "platform/types.h",
+        "platform/windows/cpu_info.h",
         "lib/bfloat16/bfloat16.h",
     ] + tf_additional_proto_hdrs() + glob(tf_env_time_hdrs()),
     copts = tf_copts(),
@@ -865,6 +866,7 @@ cc_library(
         "//tensorflow/core/kernels:mkl_pooling_ops",
         "//tensorflow/core/kernels:mkl_relu_op",
         "//tensorflow/core/kernels:mkl_reshape_op",
+        "//tensorflow/core/kernels:mkl_softmax_op",
         "//tensorflow/core/kernels:mkl_tfconv_op",
         "//tensorflow/core/kernels:mkl_aggregate_ops",
     ]),
@@ -2831,6 +2833,7 @@ tf_cc_test_mkl(
         "//tensorflow/core/kernels:mkl_pooling_ops",
         "//tensorflow/core/kernels:mkl_relu_op",
         "//tensorflow/core/kernels:mkl_reshape_op",
+        "//tensorflow/core/kernels:mkl_softmax_op",
         "//tensorflow/core/kernels:mkl_tfconv_op",
     ]),
 )
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index cb8e77f1df..ded6aa0991 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -453,6 +453,13 @@ inline bool DataTypeIsInteger(DataType dt) {
   return kDataTypeIsInteger.Contains(dt);
 }
 
+// Is the dtype a signed integral type?
+constexpr DataTypeSet kDataTypeIsSigned =
+    ToSet(DT_INT8) | ToSet(DT_INT16) | ToSet(DT_INT32) | ToSet(DT_INT64);
+inline bool DataTypeIsSigned(DataType dt) {
+  return kDataTypeIsSigned.Contains(dt);
+}
+
 // Is the dtype an unsigned integral type?
 constexpr DataTypeSet kDataTypeIsUnsigned =
     ToSet(DT_UINT8) | ToSet(DT_UINT16) | ToSet(DT_UINT32) | ToSet(DT_UINT64);
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 89b23f22fd..55bc401b9d 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -2456,9 +2456,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     // NOTE: names are alphabetically sorted.
     rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
                       CopyAttrsAddN, AddNRewrite});
-    rinfo_.push_back({csinfo_.add,
+    /* rinfo_.push_back({csinfo_.add,
                       mkl_op_registry::GetMklOpName(csinfo_.add),
-                      CopyAttrsDataType, AlwaysRewrite});
+                      CopyAttrsDataType, AlwaysRewrite}); */
     rinfo_.push_back({csinfo_.avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
                       CopyAttrsPooling, AlwaysRewrite});
@@ -3117,7 +3117,9 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
     Node* orig_input0 = nullptr;
     TF_CHECK_OK(orig_node->input_node(0,
                                       const_cast<const Node**>(&orig_input0)));
-    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+    // Allow duplicate while adding control edge as it would fail (return
+    // NULL) if we try to add duplicate edge.
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true));
   }
 
   (*out)->set_assigned_device_name(orig_node->assigned_device_name());
@@ -3382,8 +3384,8 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
     std::unique_ptr<Graph>* g, Node** out, Node* orig_node) {
   // We use a tensor of shape {1} and value 0 to represent
   // dummy float tensor. We need this as a dummy workspace tensor.
-  // Workspace tensor has type float.
-  const DataType dt = DataTypeToEnum<float>::v();
+  // Workspace tensor has type uint8.
+  const DataType dt = DataTypeToEnum<uint8>::v();
   TensorProto proto;
   proto.set_dtype(dt);
   float zero[1] = {0};
@@ -3413,7 +3415,9 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
     Node* orig_input0 = nullptr;
     TF_CHECK_OK(orig_node->input_node(0,
                                       const_cast<const Node**>(&orig_input0)));
-    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+    // Allow duplicate while adding control edge as it would fail (return
+    // NULL) if we try to add duplicate edge.
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true));
   }
 
   (*out)->set_assigned_device_name(orig_node->assigned_device_name());
@@ -3863,12 +3867,16 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   // node are already copied in BuildNode. We handle control edges now.
   for (const Edge* e : pred->in_edges()) {
     if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true));
     }
   }
   for (const Edge* e : succ->in_edges()) {
     if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true));
     }
   }
 
@@ -3876,14 +3884,18 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   // First, we will fix outgoing control edges from 'pred' node.
   for (const Edge* e : pred->out_edges()) {
     if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
     }
   }
 
   // Second, we will fix outgoing control and data edges from 'succ' node.
   for (const Edge* e : succ->out_edges()) {
     if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
     } else {
       // BiasAdd has only 1 output (at slot 0) and merged node also has only 1
       // output (at slot 0).
@@ -3966,12 +3978,16 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
   // edges now.
   for (const Edge* e : badd->in_edges()) {
     if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true));
     }
   }
   for (const Edge* e : fltr->in_edges()) {
     if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true));
     }
   }
 
@@ -3987,7 +4003,9 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
 
   for (const Edge* e : badd->out_edges()) {
     if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
     } else {
       CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeBiasGradOutputIdx,
                                   e->dst(), e->dst_input()));
@@ -3997,7 +4015,11 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
   // Second, we will fix outgoing control and data edges from 'fltr' node.
   for (const Edge* e : fltr->out_edges()) {
     if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+      // We allow duplicate edge for this case since we already add control
+      // edge from new_node in line 3990. Line below could be adding same
+      // edge to same destination again. In such case, if we do not allow
+      // duplicate edge, then this call will fail.
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
     } else {
       CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeFilterGradOutputIdx,
                                   e->dst(), e->dst_input()));
@@ -4091,7 +4113,9 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
   // already copied in BuildNode. We need to handle control edges now.
   for (const Edge* e : orig_node->in_edges()) {
     if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true));
     }
   }
 
@@ -4104,7 +4128,9 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
   // GetTensorDataIndex provides this mapping function.
   for (const Edge* e : orig_node->out_edges()) {
     if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
     } else {
       CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(),
                             e->src()->num_outputs()),
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 96f0feaf3d..fd99409c9b 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -5846,6 +5846,23 @@ tf_mkl_kernel_library(
     ]),
 )
 
+tf_mkl_kernel_library(
+    name = "mkl_softmax_op",
+    prefix = "mkl_softmax",
+    deps = [
+        ":bounds_check",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:nn_ops_op_lib",
+    ] + if_mkl([
+        "//third_party/mkl:intel_binary_blob",
+        "@mkl_dnn//:mkl_dnn",
+    ]),
+)
+
 tf_mkl_kernel_library(
     name = "mkl_fused_batch_norm_op",
     srcs = ["mkl_fused_batch_norm_op.cc"],
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index 3c389a82ab..ecfa23750c 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -427,7 +427,7 @@ inline DeviceLapackInfo CudaSolver::GetDeviceLapackInfo(
     int64 size, const string& debug_info) {
   DeviceLapackInfo new_dev_info(context_, size, debug_info);
   scratch_tensor_refs_.emplace_back(new_dev_info.tensor());
-  return std::move(new_dev_info);
+  return new_dev_info;
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc
index 5fb0735ac1..cf86478b0f 100644
--- a/tensorflow/core/kernels/cwise_op_pow.cc
+++ b/tensorflow/core/kernels/cwise_op_pow.cc
@@ -16,8 +16,9 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER7(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double, int32,
-          int64, complex64, complex128);
+REGISTER5(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double,
+          complex64, complex128);
+REGISTER2(BinaryOp, CPU, "Pow", functor::safe_pow, int32, int64);
 
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Pow", functor::pow, float, Eigen::half, double,
@@ -25,5 +26,5 @@ REGISTER4(BinaryOp, GPU, "Pow", functor::pow, float, Eigen::half, double,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(BinaryOp, SYCL, "Pow", functor::pow, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index da70b1e314..06918075a4 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <type_traits>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
@@ -115,6 +116,35 @@ struct functor_traits<scalar_binary_pow_op_google<Scalar, Exponent>> {
   enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
 };
 
+template <typename Scalar, typename Exponent>
+struct safe_scalar_binary_pow_op {
+  static_assert(std::is_integral<Scalar>::value, "Integer type expected");
+  static_assert(std::is_integral<Exponent>::value &&
+                    std::is_signed<Exponent>::value,
+                "Signed integer type expected");
+
+  bool* const error;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_scalar_binary_pow_op(bool* error)
+      : error(error) {}
+
+  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a,
+                                             const Exponent& b) const {
+    const Exponent safe_b = tensorflow::internal::SubtleMustCopy(b);
+    if (TF_PREDICT_TRUE(safe_b >= 0)) {
+      return numext::pow(a, safe_b);
+    } else {
+      *error = true;
+      return 0;
+    }
+  }
+};
+
+template <typename Scalar, typename Exponent>
+struct functor_traits<safe_scalar_binary_pow_op<Scalar, Exponent>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+
 template <typename T, typename DivOrMod>
 struct safe_div_or_mod_op {
   static_assert(std::is_integral<T>::value, "Integer type expected");
@@ -741,6 +771,11 @@ struct floor_div_real : base<T, Eigen::internal::google_floor_div_real<T>> {};
 template <typename T>
 struct pow : base<T, Eigen::internal::scalar_binary_pow_op_google<T, T>> {};
 
+template <typename T>
+struct safe_pow : base<T, Eigen::internal::safe_scalar_binary_pow_op<T, T>> {
+  static const bool has_errors = true;
+};
+
 template <typename T>
 struct maximum : base<T, Eigen::internal::scalar_max_op<T>> {};
 
diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc
index 693c6467ac..e561e59cf5 100644
--- a/tensorflow/core/kernels/cwise_ops_common.cc
+++ b/tensorflow/core/kernels/cwise_ops_common.cc
@@ -40,6 +40,11 @@ void BinaryOpShared::SetComputeError(OpKernelContext* ctx) {
   if ((op == "Div" || op == "Mod" || op == "FloorMod" || op == "FloorDiv") &&
       DataTypeIsInteger(ctx->op_kernel().input_type(0))) {
     ctx->CtxFailure(errors::InvalidArgument("Integer division by zero"));
+  } else if ((op == "Pow") &&
+             DataTypeIsInteger(ctx->op_kernel().input_type(0)) &&
+             DataTypeIsSigned(ctx->op_kernel().input_type(1))) {
+    ctx->CtxFailure(errors::InvalidArgument(
+        "Integers to negative integer powers are not allowed"));
   } else {
     ctx->CtxFailure(
         errors::Internal("Unexpected error in binary operator "
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
index ceb152c3f0..44dcbf834c 100644
--- a/tensorflow/core/kernels/decode_image_op.cc
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -87,11 +87,10 @@ class DecodeImageOp : public OpKernel {
       channels_ = 3;
     } else {
       OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
-      OP_REQUIRES(
-          context,
-          channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4,
-          errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
-                                  channels_));
+      OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 ||
+                               channels_ == 4,
+                  errors::InvalidArgument(
+                      "channels must be 0, 1, 3, or 4, got ", channels_));
     }
     flags_.components = channels_;
 
@@ -115,9 +114,8 @@ class DecodeImageOp : public OpKernel {
 
     if (format_ == kJpgFormat) {
       OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
-      OP_REQUIRES(context,
-                  flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 ||
-                      flags_.ratio == 8,
+      OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 ||
+                               flags_.ratio == 4 || flags_.ratio == 8,
                   errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
                                           flags_.ratio));
       OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling",
@@ -132,9 +130,8 @@ class DecodeImageOp : public OpKernel {
       string dct_method;
       OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method));
       OP_REQUIRES(
-          context,
-          (dct_method.empty() || dct_method == "INTEGER_FAST" ||
-           dct_method == "INTEGER_ACCURATE"),
+          context, (dct_method.empty() || dct_method == "INTEGER_FAST" ||
+                    dct_method == "INTEGER_ACCURATE"),
           errors::InvalidArgument("dct_method must be one of "
                                   "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}"));
       if (dct_method == "INTEGER_FAST") {
@@ -160,9 +157,9 @@ class DecodeImageOp : public OpKernel {
         errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ",
                                 FileFormatString(magic, input)));
     OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
-                errors::InvalidArgument(
-                    FileFormatString(magic, input),
-                    " contents are too large for int: ", input.size()));
+                errors::InvalidArgument(FileFormatString(magic, input),
+                                        " contents are too large for int: ",
+                                        input.size()));
     OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8,
                 errors::InvalidArgument(FileFormatString(magic, input),
                                         " does not support uint16 output"));
@@ -215,10 +212,9 @@ class DecodeImageOp : public OpKernel {
             input.data(), input.size(), flags, nullptr /* nwarn */,
             [=, &output](int width, int height, int channels) -> uint8* {
               Status status(context->allocate_output(
-                  0,
-                  format_ == kGifFormat
-                      ? TensorShape({1, height, width, channels})
-                      : TensorShape({height, width, channels}),
+                  0, format_ == kGifFormat
+                         ? TensorShape({1, height, width, channels})
+                         : TensorShape({height, width, channels}),
                   &output));
               if (!status.ok()) {
                 VLOG(1) << status;
@@ -294,6 +290,7 @@ class DecodeImageOp : public OpKernel {
 
     // Decode GIF, allocating tensor once the size is known.
     Tensor* output = nullptr;
+    string error_string;
     OP_REQUIRES(
         context,
         gif::Decode(input.data(), input.size(),
@@ -320,8 +317,10 @@ class DecodeImageOp : public OpKernel {
                         return nullptr;
                       }
                       return output->flat<uint8>().data();
-                    }),
-        errors::InvalidArgument("Invalid GIF data, size ", input.size()));
+                    },
+                    &error_string),
+        errors::InvalidArgument("Invalid GIF data (size ", input.size(), "), ",
+                                error_string));
   }
 
  private:
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index b3acb91202..cb68690f28 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -539,6 +539,7 @@ struct MatMulFunctor<SYCLDevice, T> {
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T").Label("eigen"), \
       MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>);
+
 #define REGISTER_CPU(T)                                             \
   REGISTER_KERNEL_BUILDER(                                          \
       Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"),     \
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 44b94be3a0..bb5eceab27 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -61,6 +61,18 @@ class MklAddNOp : public OpKernel {
     GetMklShape(ctx, src2_idx, &(mkl_context.input2_shape));
     bool input2_in_mkl_format = mkl_context.input2_shape.IsMklTensor();
 
+    // if the shapes of two tensors are not same raise op error
+    TensorShape src1_shape, src2_shape;
+    src1_shape = input0.shape();
+    src2_shape = input1.shape();
+    if (!src1_shape.IsSameSize(src2_shape) ){
+      ctx->SetStatus( 
+          errors::InvalidArgument(
+          "Inputs to operation ", this->name(), " of type ", this->type_string(),
+          " must have the same size and shape.  Input 0: ",
+          src1_shape.DebugString(), " != input 1: ",
+          src2_shape.DebugString()));
+    }
     // handle the case of a scalar
     if (!input1_in_mkl_format && input0.dims() == 0) {
       const TensorShape& o_shape = input0.shape();
@@ -307,6 +319,18 @@ class MklAddNOp : public OpKernel {
        src1_mkl_shape.GetDimension(): src1_tensor.dims();
       int src2_dims_size = input2_in_mkl_format?
        src2_mkl_shape.GetDimension(): src2_tensor.dims();
+      // if the shapes of two tensors are not same raise op error
+      TensorShape src1_shape, src2_shape;
+      src1_shape = src1_tensor.shape();
+      src2_shape = src2_tensor.shape();
+      if (!src1_shape.IsSameSize(src2_shape) ){
+	ctx->SetStatus( 
+            errors::InvalidArgument(
+            "Inputs to operation ", this->name(), " of type ", this->type_string(),
+            " must have the same size and shape.  Input 0: ",
+            src1_shape.DebugString(), " != input 1: ",
+            src2_shape.DebugString()));
+      }
 
       if (!input1_in_mkl_format && src1_dims_size == 0) {
          Tensor* dst_tensor = nullptr;
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 82771792d7..d109bb6bcf 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -598,7 +598,6 @@ class MklConcatOp : public OpKernel {
             concat_dim_tensor.shape().DebugString()));
       int32 concat_dim = internal::SubtleMustCopy(
                            concat_dim_tensor.scalar<int32>()());
-      if (concat_dim < 0) concat_dim = N + concat_dim;
 
       // check that ranks of all tensors match
       // and that their shapes match except for concat_dim.
@@ -609,6 +608,9 @@ class MklConcatOp : public OpKernel {
                                          input_shapes[0].GetTfShape() :
                                          input_tensors[0].shape();
       size_t expected_dims = expected_shape.dims();
+
+      if (concat_dim < 0) concat_dim = expected_dims + concat_dim;
+
       for (auto& s : input_shapes) {
         if (s == expected_shape) {++i; continue;}
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 793fa24d99..54d4916d49 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -467,6 +467,13 @@ class MklConv2DCustomBackpropFilterOp :
     return filter_tf_shape;
   }
 
+  TensorShape GetOutputTfShape(const TensorShape& input_shape,
+                               const TensorShape& filter_shape,
+                               const TensorShape& outbprop_shape) {
+    // Shape of output of Conv2DBackpropFilter is same as shape of filter.
+    return filter_shape;
+  }
+
   const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
                                     const memory::dims& fwd_filter_dims) {
     // Shape of output of Conv2DBackpropFilter is same as shape of filter.
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index db9e97e7ca..ef6db58d31 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -396,6 +396,13 @@ class MklConv2DCustomBackpropInputOp :
     return GetTfShape(context, kInputIndex_Filter);
   }
 
+  TensorShape GetOutputTfShape(const TensorShape& input_shape,
+                               const TensorShape& filter_shape,
+                               const TensorShape& outbprop_shape) {
+    // Output Shape of Conv2DBackpropInput is same as shape of Conv2D 'input'.
+    return input_shape;
+  }
+
   const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
                                     const memory::dims& fwd_filter_dims) {
     // Output Shape of Conv2DBackpropInput is same as shape of Conv2D 'input'.
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index a4e139bb54..0e77b45993 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -551,6 +551,13 @@ class MklConv2DOp : public OpKernel {
         output_mkl_shape.SetMklTensor(false);
         AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor,
                                     src_tf_shape, output_mkl_shape);
+
+        // MklConv2D also outputs converted filter as 2nd output of Conv2D.
+        filter_mkl_shape.SetMklTensor(false);
+        Tensor* output_filter_tensor = nullptr;
+        AllocateOutputSetMklShape(context, kOutputIndex_Filter,
+                                  &output_filter_tensor,
+                                  filter_tf_shape, filter_mkl_shape);
         return;
       }
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index b6883dbaa2..c6456bd5c3 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -390,6 +390,29 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
       TensorShape filter_tf_shape = MakeFilterTfShape(context, filter_tensor);
       TensorShape outbprop_tf_shape = GetTfShape(context, kOutbpropIdx);
 
+      // Corner cases: output with 0 elements and 0 batch size.
+      Tensor* output_tensor = nullptr;
+      if (input_tf_shape.num_elements() == 0 ||
+          filter_tf_shape.num_elements() == 0 ||
+          outbprop_tf_shape.num_elements() == 0) {
+        MklDnnShape output_mkl_shape;
+        output_mkl_shape.SetMklTensor(false);
+        TensorShape output_tf_shape = GetOutputTfShape(input_tf_shape,
+                                                       filter_tf_shape,
+                                                       outbprop_tf_shape);
+        const int kOutputIdx = 0;
+        AllocateOutputSetMklShape(context, kOutputIdx, &output_tensor,
+                                    output_tf_shape, output_mkl_shape);
+        CHECK_NOTNULL(output_tensor);
+
+        // if output tensor has more than 0 elements, we need to 0 them out.
+        for (size_t i = 0; i < output_tf_shape.num_elements(); ++i) {
+          output_tensor->flat<T>().data()[i] = 0;
+        }
+
+        return;
+      }
+
       // By default, all dims are in MKL order. Only dims in TF order
       // are those with prefix tf_order.
       memory::dims outbprop_dims, fwd_input_dims, fwd_filter_dims;
@@ -471,7 +494,6 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
       output.SetOpMemDesc(bwd_output_dims, memory::format::any);
 
       // Operator-specific call to create and execute primitive.
-      Tensor* output_tensor = nullptr;
       CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter,
                       &outbackprop, &output, &output_tensor,
                       strides, padding_l, padding_r,
@@ -507,6 +529,11 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
   virtual TensorShape MakeFilterTfShape(OpKernelContext* context,
                                        const Tensor& filter_tensor) = 0;
 
+  /// Get the TensorFlow shape of output tensor.
+  virtual TensorShape GetOutputTfShape(const TensorShape& input_shape,
+                                       const TensorShape& filter_shape,
+                                       const TensorShape& outbprop_shape) = 0;
+
   /// Get shape of output in MKL-DNN order. Computes shape of output from
   /// input shape (fwd_input_dims) and filter shape (fwd_filter_dims).
   virtual
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index a761562a4b..8340a91d05 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -703,27 +703,31 @@ class MklFusedBatchNormOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
-      const size_t src_index = 0;    // index of src input tensor
-      const size_t scale_index = 1;  // index of scale tensor
-      const size_t shift_index = 2;  // index of shift tensor
-      const size_t mean_index = 3;   // index of est_mean tensor
-      const size_t var_index = 4;    // index of est_variance tensor
-
-      const Tensor& src_tensor          = MklGetInput(context, src_index);
-      const Tensor& scale_tensor        = MklGetInput(context, scale_index);
-      const Tensor& shift_tensor        = MklGetInput(context, shift_index);
-      const Tensor& est_mean_tensor     = MklGetInput(context, mean_index);
-      const Tensor& est_variance_tensor = MklGetInput(context, var_index);
-
+      const size_t kSrcIndex = 0;       // index of src input tensor
+      const size_t kScaleIndex = 1;     // index of scale tensor
+      const size_t kShiftIndex = 2;     // index of shift tensor
+      const size_t kMeanIndex = 3;      // index of est_mean tensor
+      const size_t kVarianceIndex = 4;  // index of est_variance tensor
+
+      const Tensor& src_tensor          = MklGetInput(context, kSrcIndex);
+      const Tensor& scale_tensor        = MklGetInput(context, kScaleIndex);
+      const Tensor& shift_tensor        = MklGetInput(context, kShiftIndex);
+      const Tensor& est_mean_tensor     = MklGetInput(context, kMeanIndex);
+      const Tensor& est_variance_tensor = MklGetInput(context,
+                                                      kVarianceIndex);
+
+      TensorShape tf_shape_src;
       MklDnnShape dnn_shape_src;
-      GetMklShape(context, src_index, &dnn_shape_src);
+      GetMklShape(context, kSrcIndex, &dnn_shape_src);
 
       if (dnn_shape_src.IsMklTensor()) {
+        tf_shape_src = dnn_shape_src.GetTfShape();
         OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
                     errors::InvalidArgument(
                         "input must be 4-dimensional",
                         src_tensor.shape().DebugString()));
       } else {
+        tf_shape_src = src_tensor.shape();
         OP_REQUIRES(context, src_tensor.dims() == 4,
                     errors::InvalidArgument(
                         "input must be 4-dimensional",
@@ -756,39 +760,35 @@ class MklFusedBatchNormOp : public OpKernel {
                         est_variance_tensor.shape().DebugString()));
       }
 
+      // special case: input with 0 element and 0 batch size
+      Tensor* dst_tensor = nullptr;
+      if (tf_shape_src.num_elements() == 0) {
+         HandleEmptyInput(context,
+                          tf_shape_src,
+                          scale_tensor.shape(),
+                          &dst_tensor);
+         return;
+      }
+
       if (dnn_shape_src.IsMklTensor())
         depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
       else
         ExtractParams(context);
 
       // Indices of output tensors
-      const size_t dst_index = 0;
-      const size_t batch_mean_index = 1;
-      const size_t batch_variance_index = 2;
-      const size_t saved_mean_index = 3;
-      const size_t saved_variance_index = 4;
+      const size_t kDstIndex = 0;
 
-      // allocate batch mean output tensor
+      // allocate 4 output TF tensors
       Tensor* batch_mean_tensor = nullptr;
-      MklDnnShape mkl_shape_batch_mean;
-      mkl_shape_batch_mean.SetMklTensor(false);
-      AllocateOutputSetMklShape(context,
-                                batch_mean_index,
-                                &batch_mean_tensor,
-                                scale_tensor.shape(),
-                                mkl_shape_batch_mean);
-      CHECK_NOTNULL(batch_mean_tensor);
-
-      // Batch variance
       Tensor* batch_variance_tensor = nullptr;
-      MklDnnShape mkl_shape_batch_variance;
-      mkl_shape_batch_variance.SetMklTensor(false);
-      AllocateOutputSetMklShape(context,
-                                batch_variance_index,
-                                &batch_variance_tensor,
-                                scale_tensor.shape(),
-                                mkl_shape_batch_variance);
-      CHECK_NOTNULL(batch_variance_tensor);
+      Tensor* saved_mean_tensor = nullptr;
+      Tensor* saved_variance_tensor = nullptr;
+      AllocateTFOutputs(context,
+                        scale_tensor.shape(),
+                        &batch_mean_tensor,
+                        &batch_variance_tensor,
+                        &saved_mean_tensor,
+                        &saved_variance_tensor);
 
       if (is_training_)
         SetMeanVariance(*batch_mean_tensor, *batch_variance_tensor);
@@ -844,26 +844,6 @@ class MklFusedBatchNormOp : public OpKernel {
         weights_data[k + depth_] = shift_tf[k];
       }
 
-      // Mean and variance (without Bessel's correction) saved for backward
-      // computation to serve as pre-computed mean and variance.
-      Tensor* saved_mean_tensor = nullptr;
-      MklDnnShape mkl_shape_saved_mean;
-      mkl_shape_saved_mean.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, saved_mean_index,
-                                &saved_mean_tensor,
-                                scale_tensor.shape(),
-                                mkl_shape_saved_mean);
-      CHECK_NOTNULL(saved_mean_tensor);
-
-      Tensor* saved_variance_tensor = nullptr;
-      MklDnnShape mkl_shape_saved_variance;
-      mkl_shape_saved_variance.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, saved_variance_index,
-                                &saved_variance_tensor,
-                                scale_tensor.shape(),
-                                mkl_shape_saved_variance);
-      CHECK_NOTNULL(saved_variance_tensor);
-
       // set mean primitive
       auto mean_desc = memory::desc({1, depth_},
                                     MklDnnType<T>(),
@@ -902,7 +882,6 @@ class MklFusedBatchNormOp : public OpKernel {
       // allocate dst tensor
       MklDnnShape dnn_shape_dst;
       TensorShape tf_shape_dst;
-      Tensor* dst_tensor = nullptr;
       if (dnn_shape_src.IsMklTensor()) {
         dnn_shape_dst.SetMklTensor(true);
         auto dst_pd = bnrm_fwd_pd.dst_primitive_desc();
@@ -915,7 +894,7 @@ class MklFusedBatchNormOp : public OpKernel {
         dnn_shape_dst.SetMklTensor(false);
         tf_shape_dst = src_tensor.shape();
       }
-      AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+      AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor,
                                 tf_shape_dst, dnn_shape_dst);
 
       // Output of batchnorm has same shape as input.
@@ -958,10 +937,8 @@ class MklFusedBatchNormOp : public OpKernel {
         size_t adjust_size = orig_size - 1;
         adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
       }
-      T* batch_variance_data_tf = reinterpret_cast<T*>(
-                                  batch_variance_tensor->flat<T>().data());
       for (int k=0; k < depth_; k++)
-        batch_variance_data_tf[k] =
+        batch_variance_tensor->flat<T>().data()[k] =
             (reinterpret_cast<T*>(variance_m.get_data_handle()))[k]
             * adjust_factor;
     } catch (mkldnn::error &e) {
@@ -994,8 +971,100 @@ class MklFusedBatchNormOp : public OpKernel {
     variance_values_ = reinterpret_cast<T*>(
                        const_cast<T*>(variance.flat<T>().data()));
   }
-};
 
+  void HandleEmptyInput(OpKernelContext* context,
+                        TensorShape tf_shape_src,
+                        TensorShape tf_shape_scale,
+                        Tensor** dst_tensor) {
+    CHECK_NOTNULL(dst_tensor);
+
+    const size_t kDstIndex = 0;
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kDstIndex, dst_tensor,
+                              tf_shape_src, dnn_shape_dst);
+    CHECK_NOTNULL(*dst_tensor);
+    memset(const_cast<char*>((*dst_tensor)->tensor_data().data()), 0,
+           (*dst_tensor)->tensor_data().size());
+
+    Tensor* batch_mean_tensor = nullptr;
+    Tensor* batch_variance_tensor = nullptr;
+    Tensor* saved_mean_tensor = nullptr;
+    Tensor* saved_variance_tensor = nullptr;
+    AllocateTFOutputs(context, tf_shape_scale,
+                      &batch_mean_tensor,
+                      &batch_variance_tensor,
+                      &saved_mean_tensor,
+                      &saved_variance_tensor);
+  }
+
+  void AllocateTFOutputs(OpKernelContext* context,
+                         TensorShape tf_shape_scale,
+                         Tensor** batch_mean_tensor,
+                         Tensor** batch_variance_tensor,
+                         Tensor** saved_mean_tensor,
+                         Tensor** saved_variance_tensor) {
+    CHECK_NOTNULL(batch_mean_tensor);
+    CHECK_NOTNULL(batch_variance_tensor);
+    CHECK_NOTNULL(saved_mean_tensor);
+    CHECK_NOTNULL(saved_variance_tensor);
+
+    const size_t kBatchMeanIndex = 1;
+    const size_t kBatchVarianceIndex = 2;
+    const size_t kSavedMeanIndex = 3;
+    const size_t kSavedVarianceIndex = 4;
+
+    // allocate batch mean output tensor
+    MklDnnShape mkl_shape_batch_mean;
+    mkl_shape_batch_mean.SetMklTensor(false);
+    AllocateOutputSetMklShape(context,
+                              kBatchMeanIndex,
+                              batch_mean_tensor,
+                              tf_shape_scale,
+                              mkl_shape_batch_mean);
+    CHECK_NOTNULL(*batch_mean_tensor);
+    // set NAN mean value in case of empty input tensor
+    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+      (*batch_mean_tensor)->flat<T>().data()[k] = NAN;
+
+    // allocate batch variance output tensor
+    MklDnnShape mkl_shape_batch_variance;
+    mkl_shape_batch_variance.SetMklTensor(false);
+    AllocateOutputSetMklShape(context,
+                              kBatchVarianceIndex,
+                              batch_variance_tensor,
+                              tf_shape_scale,
+                              mkl_shape_batch_variance);
+    CHECK_NOTNULL(*batch_variance_tensor);
+    // set NAN variance value in case of empty input tensor
+    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+      (*batch_variance_tensor)->flat<T>().data()[k] = NAN;
+
+    // Mean and variance (without Bessel's correction) saved for backward
+    // computation to serve as pre-computed mean and variance.
+    MklDnnShape mkl_shape_saved_mean;
+    mkl_shape_saved_mean.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kSavedMeanIndex,
+                              saved_mean_tensor,
+                              tf_shape_scale,
+                              mkl_shape_saved_mean);
+    CHECK_NOTNULL(*saved_mean_tensor);
+    // set NAN mean value in case of empty input tensor
+    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+      (*saved_mean_tensor)->flat<T>().data()[k] = NAN;
+
+    MklDnnShape mkl_shape_saved_variance;
+    mkl_shape_saved_variance.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kSavedVarianceIndex,
+                              saved_variance_tensor,
+                              tf_shape_scale,
+                              mkl_shape_saved_variance);
+    CHECK_NOTNULL(*saved_variance_tensor);
+    // set NAN variance value in case of empty input tensor
+    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+      (*saved_variance_tensor)->flat<T>().data()[k] = NAN;
+  }
+};
 
 template <typename Device, typename T>
 class MklFusedBatchNormGradOp : public OpKernel {
@@ -1009,34 +1078,37 @@ class MklFusedBatchNormGradOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
     OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
                 errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
   }
 
   void Compute(OpKernelContext* context) override {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
-
-      const size_t diff_dst_index = 0;  // index of diff_dst tensor
-      const size_t src_index = 1;       // index of src input tensor
-      const size_t scale_index = 2;     // index of scale tensor
-      const size_t mean_index = 3;      // index of saved_mean tensor
-      const size_t variance_index = 4;  // index of saved_variance tensor
-      const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
-      const Tensor& src_tensor = MklGetInput(context, src_index);
-      const Tensor& scale_tensor = MklGetInput(context, scale_index);
-      const Tensor& saved_mean_tensor = MklGetInput(context, mean_index);
+      const size_t kDiffDstIndex = 0;   // index of diff_dst tensor
+      const size_t kSrcIndex = 1;       // index of src input tensor
+      const size_t kScaleIndex = 2;     // index of scale tensor
+      const size_t kMeanIndex = 3;      // index of saved_mean tensor
+      const size_t kVarianceIndex = 4;  // index of saved_variance tensor
+      const Tensor& diff_dst_tensor = MklGetInput(context, kDiffDstIndex);
+      const Tensor& src_tensor = MklGetInput(context, kSrcIndex);
+      const Tensor& scale_tensor = MklGetInput(context, kScaleIndex);
+      const Tensor& saved_mean_tensor = MklGetInput(context, kMeanIndex);
       const Tensor& saved_variance_tensor = MklGetInput(context,
-                                            variance_index);
+                                            kVarianceIndex);
 
       MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
-      GetMklShape(context, src_index, &dnn_shape_src);
-      GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+      GetMklShape(context, kSrcIndex, &dnn_shape_src);
+      GetMklShape(context, kDiffDstIndex, &dnn_shape_diff_dst);
+      TensorShape tf_shape_src, tf_shape_diff_dst;
 
       if (dnn_shape_diff_dst.IsMklTensor()) {
+        tf_shape_diff_dst = dnn_shape_diff_dst.GetTfShape();
         OP_REQUIRES(context, dnn_shape_diff_dst.GetDimension() == 4,
                     errors::InvalidArgument(
                         "input must be 4-dimensional",
                         diff_dst_tensor.shape().DebugString()));
       } else {
+        tf_shape_diff_dst = diff_dst_tensor.shape();
         OP_REQUIRES(context, diff_dst_tensor.dims() == 4,
                     errors::InvalidArgument(
                         "input must be 4-dimensional",
@@ -1044,11 +1116,13 @@ class MklFusedBatchNormGradOp : public OpKernel {
       }
 
       if (dnn_shape_src.IsMklTensor()) {
+        tf_shape_src = dnn_shape_src.GetTfShape();
         OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
                     errors::InvalidArgument(
                         "input must be 4-dimensional",
                          src_tensor.shape().DebugString()));
       } else {
+        tf_shape_src = src_tensor.shape();
         OP_REQUIRES(context, src_tensor.dims() == 4,
                     errors::InvalidArgument(
                         "input must be 4-dimensional",
@@ -1069,6 +1143,15 @@ class MklFusedBatchNormGradOp : public OpKernel {
                       "saved variance must be 1-dimensional",
                       saved_variance_tensor.shape().DebugString()));
 
+      Tensor* diff_src_tensor = nullptr;
+      if (tf_shape_src.num_elements() == 0 ||
+          tf_shape_diff_dst.num_elements() == 0) {
+         HandleEmptyInput(context, tf_shape_src,
+                          scale_tensor.shape(),
+                          &diff_src_tensor);
+         return;
+      }
+
       if (dnn_shape_src.IsMklTensor())
         depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
       else
@@ -1165,25 +1248,21 @@ class MklFusedBatchNormGradOp : public OpKernel {
       auto diff_weights_m = memory(diff_weights_pd);
 
       auto bnrm_fwd_desc = batch_normalization_forward::desc(
-                                prop_kind::forward_training,
-                                src.GetUsrMemDesc(),
-                                epsilon_,
-                                use_scale_shift);
+                               prop_kind::forward_training,
+                               src.GetUsrMemDesc(),
+                               epsilon_,
+                               is_training_ ? use_scale_shift :
+                               (use_scale_shift | use_global_stats));
       auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
                                 bnrm_fwd_desc,
                                 cpu_engine);
 
       // Indices of output tensors
-      const size_t diff_src_index = 0;    // index of diff_src tensor
-      const size_t diff_scale_index = 1;  // index of diff_scale tensor
-      const size_t diff_shift_index = 2;  // index of diff_shift tensor
-      const size_t p1_index = 3;  // index of 1st placeholder tensor
-      const size_t p2_index = 4;  // index of 2nd placeholder tensor
+      const size_t kDiffSrcIndex = 0;    // index of diff_src tensor
 
       // allocate diff_src tensor
       MklDnnShape dnn_shape_diff_src;
       TensorShape tf_shape_diff_src;
-      Tensor* diff_src_tensor = nullptr;
       if (dnn_shape_src.IsMklTensor()) {
         dnn_shape_diff_src.SetMklTensor(true);
         auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc();
@@ -1201,7 +1280,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
         dnn_shape_diff_src.SetMklTensor(false);
         tf_shape_diff_src = src_tensor.shape();
       }
-      AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+      AllocateOutputSetMklShape(context, kDiffSrcIndex, &diff_src_tensor,
                                 tf_shape_diff_src, dnn_shape_diff_src);
 
       diff_src.SetUsrMem(src_md, diff_src_tensor);
@@ -1212,7 +1291,15 @@ class MklFusedBatchNormGradOp : public OpKernel {
                                diff_src.GetUsrMemDesc(),
                                src.GetUsrMemDesc(),
                                epsilon_,
-                               use_scale_shift);
+                               /* for inference, specify use_global_stats
+                                  1. on fwd prop, use mean and variance
+                                     provided as inputs
+                                  2. on bwd prop, mean and variance are
+                                     considered as constants. Thus, 
+                                     reduce the amout of MKL computations
+                               */
+                               is_training_ ? use_scale_shift :
+                               (use_scale_shift | use_global_stats));
       auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc(
                                bnrm_bwd_desc,
                                cpu_engine,
@@ -1232,41 +1319,22 @@ class MklFusedBatchNormGradOp : public OpKernel {
       net.push_back(bnrm_bwd_op);
       stream(stream::kind::eager).submit(net).wait();
 
-      // separate out scale and shift grad and copy to individual tensors
-      const TensorShape& tf_shape_scale_shift = scale_tensor.shape();
+      // allocate 4 output TF tensors
       Tensor* diff_scale_tensor = nullptr;
-      MklDnnShape mkl_shape_diff_scale;
-      mkl_shape_diff_scale.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, diff_scale_index, &diff_scale_tensor,
-                                tf_shape_scale_shift, mkl_shape_diff_scale);
-
       Tensor* diff_shift_tensor = nullptr;
-      MklDnnShape mkl_shape_diff_shift;
-      mkl_shape_diff_shift.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, diff_shift_index, &diff_shift_tensor,
-                                tf_shape_scale_shift, mkl_shape_diff_shift);
+      AllocateTFOutputs(context, scale_tensor.shape(),
+                        &diff_scale_tensor,
+                        &diff_shift_tensor);
 
       // copy data: diff_scale and diff_shift
       T* diff_weights_data_dnn = reinterpret_cast<T*>
                                  (diff_weights_m.get_data_handle());
-      float* diff_scale_data_tf = const_cast<float*>(
-             static_cast<const float*>(diff_scale_tensor->flat<T>().data()));
-      float* diff_shift_data_tf = const_cast<float*>(
-             static_cast<const float*>(diff_shift_tensor->flat<T>().data()));
       for (int i = 0; i < depth_; i++) {
-        diff_scale_data_tf[i] = diff_weights_data_dnn[i];
-        diff_shift_data_tf[i] = diff_weights_data_dnn[i + depth_];
+        diff_scale_tensor->flat<T>().data()[i] =
+                              diff_weights_data_dnn[i];
+        diff_shift_tensor->flat<T>().data()[i] =
+                              diff_weights_data_dnn[i + depth_];
       }
-
-      // Placeholders for estimated_mean and estimated_variance, which are
-      // used for inference and thus not needed here for gradient computation.
-      Tensor* p1_tensor = nullptr, *p2_tensor = nullptr;
-      MklDnnShape mkl_shape_p;
-      mkl_shape_p.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, p1_index, &p1_tensor,
-                                TensorShape({}), mkl_shape_p);
-      AllocateOutputSetMklShape(context, p2_index, &p2_tensor,
-                                TensorShape({}), mkl_shape_p);
     } catch (mkldnn::error &e) {
       string error_msg = "Status: " + std::to_string(e.status) +
                           ", message: " + string(e.message) +
@@ -1282,12 +1350,74 @@ class MklFusedBatchNormGradOp : public OpKernel {
   T epsilon_;
   TensorFormat tensor_format_;
   int depth_;             // batch normalization is done for per channel.
+  bool is_training_;
 
   void ExtractParams(OpKernelContext* context) {
       const Tensor& input = MklGetInput(context, 0);
       depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
   }
 
+  void HandleEmptyInput(OpKernelContext* context,
+                        TensorShape tf_shape_src,
+                        TensorShape tf_shape_scale_shift,
+                        Tensor** diff_src_tensor) {
+    const size_t kDiffSrcIndex = 0;
+
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kDiffSrcIndex, diff_src_tensor,
+                                tf_shape_src, dnn_shape_diff_src);
+    for (size_t i=0; i < (*diff_src_tensor)->shape().num_elements(); i++)
+       (*diff_src_tensor)->flat<T>().data()[i] = 0;
+
+    Tensor* diff_scale_tensor = nullptr;
+    Tensor* diff_shift_tensor = nullptr;
+    AllocateTFOutputs(context,
+                      tf_shape_scale_shift,
+                      &diff_scale_tensor,
+                      &diff_shift_tensor);
+  }
+
+  void AllocateTFOutputs(OpKernelContext* context,
+                        TensorShape tf_shape_scale_shift,
+                        Tensor** diff_scale_tensor,
+                        Tensor** diff_shift_tensor) {
+    CHECK_NOTNULL(diff_scale_tensor);
+    CHECK_NOTNULL(diff_shift_tensor);
+
+    const size_t kDiffScaleIndex = 1;
+    const size_t kDiffShiftIndex = 2;
+    const size_t kP1Index = 3;
+    const size_t kP2Index = 4;
+
+    // separate out scale and shift grad and copy to individual tensors
+    MklDnnShape mkl_shape_diff_scale;
+    mkl_shape_diff_scale.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kDiffScaleIndex, diff_scale_tensor,
+                              tf_shape_scale_shift, mkl_shape_diff_scale);
+    CHECK_NOTNULL(*diff_scale_tensor);
+    for (size_t i=0; i < (*diff_scale_tensor)->shape().num_elements(); i++)
+       (*diff_scale_tensor)->flat<T>().data()[i] = 0;
+
+    MklDnnShape mkl_shape_diff_shift;
+    mkl_shape_diff_shift.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kDiffShiftIndex, diff_shift_tensor,
+                              tf_shape_scale_shift, mkl_shape_diff_shift);
+    CHECK_NOTNULL(*diff_shift_tensor);
+    for (size_t i=0; i < (*diff_shift_tensor)->shape().num_elements(); i++)
+       (*diff_shift_tensor)->flat<T>().data()[i] = 0;
+
+    // Placeholders for estimated_mean and estimated_variance, which are
+    // used for inference and thus not needed here for gradient computation.
+    Tensor* p1_tensor = nullptr, *p2_tensor = nullptr;
+    MklDnnShape mkl_shape_p;
+    mkl_shape_p.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kP1Index, &p1_tensor,
+                              TensorShape({}), mkl_shape_p);
+    AllocateOutputSetMklShape(context, kP2Index, &p2_tensor,
+                              TensorShape({}), mkl_shape_p);
+  }
+
   memory::dims GetMeanVarianceDims() {
     return memory::dims({1, depth_});
   }
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 001834b13b..4b5f7b8310 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -396,7 +396,7 @@ class MklInputConversionOp : public OpKernel {
       auto cpu_engine = engine(engine::cpu, 0);
       MklDnnData<T> tf_input(&cpu_engine);
       auto input_tf_md = mkl_output_mkl_shape.GetTfLayout();
-      tf_input.SetUsrMem(input_tf_md, &tf_tensor);
+      tf_input.SetUsrMem(input_tf_md, tf_tensor);
 
       // Create reorder between tensorflow layout and Mkl layout.
       std::vector<primitive> net;
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index a8f28202f4..95e0404ba8 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -43,7 +43,7 @@ limitations under the License.
 using mkldnn::lrn_forward;
 using mkldnn::lrn_backward;
 using mkldnn::prop_kind;
-using mkldnn::algorithm::lrn_across_channels;
+using mkldnn::lrn_across_channels;
 using mkldnn::stream;
 #endif
 
@@ -910,17 +910,23 @@ class MklLRNOp : public OpKernel {
       Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
       GetBandMatrix<T>(depth, depth_radius_, &multiplier);
 
-      Tensor *output_dnn_data, *workspace;
-      MklDnnShape mkl_output_mkl_shape, mkl_workspace_mkl_shape;
+      Tensor *output_dnn_data = nullptr;
+      MklDnnShape mkl_output_mkl_shape;
       mkl_output_mkl_shape.SetMklTensor(false);
       mkl_output_mkl_shape.SetDimensions(4);
       AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
                                 input.shape(), mkl_output_mkl_shape);
+      CHECK_NOTNULL(output_dnn_data);
 
-      mkl_workspace_mkl_shape.SetMklTensor(false);
-      mkl_workspace_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace,
-                                input.shape(), mkl_workspace_mkl_shape);
+      Tensor* workspace_tensor = nullptr;
+      MklDnnShape workspace_mkl_shape;
+      workspace_mkl_shape.SetMklTensor(false);
+      TensorShape workspace_tf_shape;
+      workspace_tf_shape.AddDim(0);
+      AllocateOutputSetMklShape(context, kIdxWorkspace,
+                              &workspace_tensor,
+                              workspace_tf_shape, workspace_mkl_shape);
+      CHECK_NOTNULL(workspace_tensor);
 
       auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
       Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
@@ -1344,12 +1350,14 @@ class MklLRNGradOp : public OpKernel {
             errors::InvalidArgument("Output image must be 4-dimensional"));
     }
 
-    if (workspace_dnn_shape.IsMklTensor()) {
-      OP_REQUIRES(context, workspace_dnn_shape.IsMklTensor() == false,
-             errors::InvalidArgument("Workspace should not be MKL Tensor."));
-    } else {
-      OP_REQUIRES(context, workspace_tensor.dims() == 1,
-            errors::InvalidArgument("Workspace must be 1-dimensional"));
+    if (workspace_enabled_) {
+      if (workspace_dnn_shape.IsMklTensor()) {
+        OP_REQUIRES(context, workspace_dnn_shape.IsMklTensor() == false,
+              errors::InvalidArgument("Workspace should not be MKL Tensor."));
+      } else {
+        OP_REQUIRES(context, workspace_tensor.dims() == 1,
+              errors::InvalidArgument("Workspace must be 1-dimensional"));
+      }
     }
   }
 
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index de4d7d2e72..82c5229bab 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -517,7 +517,7 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
 
       MklDnnData<T> dnn_data_input(&cpu_engine);
       MklDnnData<T> dnn_data_output(&cpu_engine);
-      MklDnnData<T> dnn_data_wksp(&cpu_engine);
+      MklDnnData<uint8> dnn_data_wksp(&cpu_engine);
 
       // initialize variables for the pooling op
       MklPoolParameters pool_params;
@@ -588,16 +588,16 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
 
     void AllocateWorkspaceTensor(OpKernelContext* context,
                 const pooling_forward::primitive_desc& pool_fwd_prim_desc,
-                MklDnnData<T>* dnn_data_wksp) {
+                MklDnnData<uint8>* dnn_data_wksp) {
         CHECK_NOTNULL(dnn_data_wksp);
         Tensor* workspace_tensor = nullptr;
         memory::primitive_desc workspace_pd
                     = pool_fwd_prim_desc.workspace_primitive_desc();
-        size_t workspace_t_elems = this->GetNumTElements(workspace_pd);
+        size_t workspace_bytes = workspace_pd.get_size();
         MklDnnShape workspace_mkl_shape;
         workspace_mkl_shape.SetMklTensor(false);
         TensorShape workspace_tf_shape;
-        workspace_tf_shape.AddDim(workspace_t_elems);
+        workspace_tf_shape.AddDim(workspace_bytes);
         AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace,
                                 &workspace_tensor,
                                 workspace_tf_shape, workspace_mkl_shape);
@@ -651,7 +651,7 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
         if (!context->status().ok()) return;
 
         MklDnnData<T> grad_dnn_data(&cpu_engine);
-        MklDnnData<T> workspace_dnn_data(&cpu_engine);
+        MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
         MklDnnData<T> output_dnn_data(&cpu_engine);
         Tensor* output_tensor = nullptr;
         MklPoolParameters pool_params;
@@ -770,7 +770,7 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
 
     void ConfigureWorkspace(const Tensor& workspace_tensor,
                         memory::primitive_desc workspace_pd,
-                        MklDnnData<T> *workspace_dnn_data) {
+                        MklDnnData<uint8> *workspace_dnn_data) {
         CHECK_NOTNULL(workspace_dnn_data);
 
         workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
@@ -811,7 +811,7 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
                     errors::InvalidArgument("Gradient must be "
                     "4-dimensional"));
         }
-        if (this->workspace_enabled_){
+        if (this->workspace_enabled_) {
             // The workspace should not be an MKL tensor
             OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
                     errors::InvalidArgument("Workspace tensor should not"
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index d33e91a15d..b974b2c59a 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -231,7 +231,7 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
                   const pooling_forward::primitive_desc& pool_fwd_desc,
                   const MklDnnData<T>* src,
                   MklDnnData<T>* dst,
-                  MklDnnData<T>* wksp = nullptr) {
+                  MklDnnData<uint8>* wksp = nullptr) {
     std::vector<primitive> net;
 
     // Create pooling primitive and add it to net
@@ -307,7 +307,7 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
     MklDnnData<T>* input_gradient_diff_dst,
     MklDnnData<T>* output_diff_src,
     const memory::primitive_desc& target_diff_dst_pd,
-    const MklDnnData<T>* workspace = nullptr) {
+    const MklDnnData<uint8>* workspace = nullptr) {
 
     std::vector<primitive> net;
 
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 11c92ebdb4..b41e529357 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -256,11 +256,18 @@ class MklReshapeOp : public OpKernel {
               AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
                                         shape_to, mkl_shape_output);
 
-              // Insert reorder between Mkl layout and TensorFlow layout.
+              // Insert reorder between Mkl layout and TensorFlow layout if
+              // needed. If reorder is not needed but reshape is needed (since
+              // shape_from != shape_to), then we just copy input tensor to
+              // output tensor with target shape (we cannot forward Mkl layout
+              // in such case because shape has changed.)
               std::vector<primitive> net;
-              CHECK_EQ(dnn_data_input.CheckReorderToOpMem(output_tf_pd,
-                       output_tensor, &net), true);
-              stream(stream::kind::eager).submit(net).wait();
+              if (dnn_data_input.CheckReorderToOpMem(output_tf_pd,
+                       output_tensor, &net)) {
+                stream(stream::kind::eager).submit(net).wait();
+              } else {
+                output_tensor->CopyFrom(input_tensor, shape_to);
+              }
               return;
           } else {
             // If dimensions that are being expanded or collapsed are
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
new file mode 100644
index 0000000000..896d562933
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -0,0 +1,163 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+#ifdef INTEL_MKL
+#ifdef INTEL_MKL_DNN
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/tensor_format.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "mkldnn.h"
+#include "mkldnn_types.h"
+#include "tensorflow/core/platform/default/logging.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+#include "mkldnn.hpp"
+using mkldnn::stream;
+using mkldnn::prop_kind;
+using mkldnn::softmax_forward;
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+
+
+template <typename Device, typename T>
+class MklSoftmaxOp : public OpKernel {
+ public:
+  ~MklSoftmaxOp() {}
+
+  explicit MklSoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+
+      // src_tensor now points to the 0-th input of global data struct "context"
+      size_t src_idx = 0;
+      const Tensor& src_tensor = MklGetInput(context, src_idx);
+
+      // Add: get MklShape
+      MklDnnShape src_mkl_shape;
+      GetMklShape(context, src_idx, &src_mkl_shape);
+
+
+      // src_dims is the dimenstion of src_tensor
+      // dim of the dst will also be same as src_dims
+      auto src_tf_shape = src_mkl_shape.IsMklTensor() ?
+                          src_mkl_shape.GetTfShape() : src_tensor.shape();
+      auto src_dims = TFShapeToMklDnnDims(src_tf_shape);
+      auto output_dims = src_dims;
+
+      // Create softmax memory for src, dst: both are defined in mkl_util.h,
+      // they are wrapper
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);
+
+      // If input is in MKL layout, then simply grab input layout; otherwise,
+      // construct input Tf layout. For TF layout, although input shape
+      // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
+      // layout
+      auto src_md = src_mkl_shape.IsMklTensor()
+                    ? src_mkl_shape.GetMklLayout()
+                    : memory::desc(src_dims, MklDnnType<T>(),
+                                         memory::format::nc);
+
+      // src: setting memory descriptor and op memory descriptor
+      // Basically following two functions maps the TF "src_tensor" to mkl
+      // tensor object "src"
+      // following functions are in mkl_util.h
+      // data format is "nc" for src and dst; since the src and dst buffer is
+      // always in 2D shape
+      src.SetUsrMem(src_md, &src_tensor);
+      src.SetOpMemDesc(src_dims, memory::format::nc);
+
+      // creating a memory descriptor
+      int axis = 1;  // axis to which softmax will be applied
+      auto softmax_fwd_desc = softmax_forward::desc(prop_kind::forward_scoring,
+                                                    src.GetOpMemDesc(), axis);
+      auto softmax_fwd_pd = softmax_forward::primitive_desc(softmax_fwd_desc,
+                                                            cpu_engine);
+
+      // add: output
+      Tensor* output_tensor = nullptr;
+      MklDnnShape output_mkl_shape;
+      TensorShape output_tf_shape;  // shape of output TF tensor.
+      // Softmax MklDnn output layout is same as input layout.
+      auto dst_pd = src.GetUsrMemPrimDesc();
+
+      // if input is MKL shape, ouput is also MKL shape.
+      // if input is TF shape, output is also TF shape
+      if (src_mkl_shape.IsMklTensor()) {
+        output_mkl_shape.SetMklTensor(true);
+        output_mkl_shape.SetMklLayout(&dst_pd);
+        output_mkl_shape.SetElemType(MklDnnType<T>());
+        output_mkl_shape.SetTfLayout(output_dims.size(), output_dims,
+                                     memory::format::nc);
+        output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
+      } else {  // then output is also TF shape
+        output_mkl_shape.SetMklTensor(false);
+        output_tf_shape = MklDnnDimsToTFShape(output_dims);
+      }
+      // Allocate output shape (MKL or TF based on the above)
+      AllocateOutputSetMklShape(context, 0, &output_tensor, output_tf_shape,
+                                output_mkl_shape);
+
+      // Output_dims and input_dims are same
+      dst.SetUsrMem(src_md, output_tensor);
+
+      // finally creating the "softmax op" using the primitive descriptor, src
+      // and dst
+      auto softmax_fwd =
+          softmax_forward(softmax_fwd_pd, src.GetOpMem(), dst.GetOpMem());
+
+      // execute net (pushing to the stream)
+      // following 3 are common for all mkl dnn ops
+      std::vector<primitive> net;
+      net.push_back(softmax_fwd);
+      stream(stream::kind::eager).submit(net).wait();
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) + ", message: " +
+                         string(e.message) + ", in file " + string(__FILE__) +
+                         ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+};
+
+/* Register DNN kernels for supported operations and supported types - right now
+ * it is only Softmax and f32 */
+#define REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES(type)             \
+  REGISTER_KERNEL_BUILDER(Name("_MklSoftmax")                       \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklSoftmaxOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES);
+
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index 6a52a15c93..d4241b5809 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -222,7 +222,7 @@ void DnnPoolingOp<T>::Compute(
                                       output_desc, &output_data)
                     .ok();
   OP_REQUIRES(context, status,
-              errors::Internal("cudnn PoolBackward launch failed"));
+              errors::Internal("cudnn PoolForward launch failed"));
 
   if (data_format == FORMAT_NHWC) {
     /// Transform the output data from NCHW back to NHWC
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.cc b/tensorflow/core/kernels/spectrogram_test_utils.cc
index 046f6344df..bc30330d61 100644
--- a/tensorflow/core/kernels/spectrogram_test_utils.cc
+++ b/tensorflow/core/kernels/spectrogram_test_utils.cc
@@ -70,10 +70,24 @@ bool ReadRawFloatFileToComplexVector(
   int offset = 0;
   const int end = data_string.size();
   while (offset < end) {
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    char arr[4];
+    for (int i = 0; i < kBytesPerValue; ++i ) {
+      arr[3 - i] = *(data_string.data() + offset + i);
+    }
+    memcpy(&real_out, arr, kBytesPerValue);
+    offset += kBytesPerValue;
+    for (int i = 0; i < kBytesPerValue; ++i ) {
+      arr[3 - i] = *(data_string.data() + offset + i);
+    }
+    memcpy(&imag_out, arr, kBytesPerValue);
+    offset += kBytesPerValue;
+#else
     memcpy(&real_out, data_string.data() + offset, kBytesPerValue);
     offset += kBytesPerValue;
     memcpy(&imag_out, data_string.data() + offset, kBytesPerValue);
     offset += kBytesPerValue;
+#endif
     if (row_counter >= row_length) {
       data->push_back(data_row);
       data_row.clear();
diff --git a/tensorflow/core/kernels/transpose_functor_cpu.cc b/tensorflow/core/kernels/transpose_functor_cpu.cc
index 41b73fdaf4..6594f7ee7b 100644
--- a/tensorflow/core/kernels/transpose_functor_cpu.cc
+++ b/tensorflow/core/kernels/transpose_functor_cpu.cc
@@ -88,6 +88,18 @@ struct Transpose<CPUDevice, T, conjugate> {
         internal::TransposeUsingEigen<CPUDevice, T, 5>(d, in, perm, conjugate,
                                                        out);
         break;
+      case 6:
+	internal::TransposeUsingEigen<CPUDevice, T, 6>(d, in, perm, conjugate,
+						       out);
+	break;
+      case 7:
+	internal::TransposeUsingEigen<CPUDevice, T, 7>(d, in, perm, conjugate,
+						       out);
+	break;
+      case 8:
+        internal::TransposeUsingEigen<CPUDevice, T, 8>(d, in, perm, conjugate,
+						       out);
+	break;
       default:
         TransposeSimple<T, conjugate>(d, in, perm, out);
         break;
diff --git a/tensorflow/core/kernels/transpose_functor_gpu.cu.cc b/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
index 493dac9a7c..d6a237d6c1 100644
--- a/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
@@ -201,6 +201,27 @@ struct Transpose<GPUDevice, T, conjugate> {
                                                          out);
         }
         break;
+      case 6:
+        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
+                                                             out)) {
+          internal::TransposeUsingEigen<GPUDevice, T, 6>(d, in, perm, conjugate,
+                                                         out);
+        }
+        break;
+      case 7:
+        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
+                                                             out)) {
+          internal::TransposeUsingEigen<GPUDevice, T, 7>(d, in, perm, conjugate,
+                                                         out);
+        }
+        break;
+      case 8:
+        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
+                                                             out)) {
+          internal::TransposeUsingEigen<GPUDevice, T, 8>(d, in, perm, conjugate,
+                                                         out);
+        }
+        break;
       default:
         internal::TransposeSimple<T, conjugate>(d, in, perm, out);
         break;
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index dc21cee3a8..0f8d027caa 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -67,10 +67,12 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
     // Try to reuse the logits_in buffer for the backprop output.
     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                 {0}, 1, logits_in.shape(), &back_out));
-    functor::XentFunctor<Device, T> functor;
-    functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
-            labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
-            back_out->matrix<T>());
+    if (logits_in.dim_size(0) > 0) {
+      functor::XentFunctor<Device, T> functor;
+      functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
+              labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
+              back_out->matrix<T>());
+      }
   }
 };
 
diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
index b5c0d9f621..0f6999c88f 100644
--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/gif/gif_io.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/gif.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
@@ -44,7 +45,8 @@ int input_callback(GifFileType* gif_file, GifByteType* buf, int size) {
 }
 
 uint8* Decode(const void* srcdata, int datasize,
-              std::function<uint8*(int, int, int, int)> allocate_output) {
+              const std::function<uint8*(int, int, int, int)>& allocate_output,
+              string* error_string) {
   int error_code = D_GIF_SUCCEEDED;
   InputBufferInfo info = {reinterpret_cast<const uint8*>(srcdata), datasize};
   GifFileType* gif_file =
@@ -57,17 +59,17 @@ uint8* Decode(const void* srcdata, int datasize,
     }
   });
   if (error_code != D_GIF_SUCCEEDED) {
-    LOG(ERROR) << "Fail to open gif file, reason: "
-               << GifErrorString(error_code);
+    *error_string = strings::StrCat("failed to open gif file: ",
+                                    GifErrorString(error_code));
     return nullptr;
   }
   if (DGifSlurp(gif_file) != GIF_OK) {
-    LOG(ERROR) << "Fail to slurp gif file, reason: "
-               << GifErrorString(gif_file->Error);
+    *error_string = strings::StrCat("failed to slurp gif file: ",
+                                    GifErrorString(gif_file->Error));
     return nullptr;
   }
   if (gif_file->ImageCount <= 0) {
-    LOG(ERROR) << "Gif file does not contain any image";
+    *error_string = strings::StrCat("gif file does not contain any image");
     return nullptr;
   }
 
@@ -83,7 +85,7 @@ uint8* Decode(const void* srcdata, int datasize,
     GifImageDesc* img_desc = &this_image->ImageDesc;
     if (img_desc->Left != 0 || img_desc->Top != 0 || img_desc->Width != width ||
         img_desc->Height != height) {
-      LOG(ERROR) << "Can't process optimized gif.";
+      *error_string = strings::StrCat("can't process optimized gif");
       return nullptr;
     }
 
diff --git a/tensorflow/core/lib/gif/gif_io.h b/tensorflow/core/lib/gif/gif_io.h
index 5399e6a538..0a7967a5a1 100644
--- a/tensorflow/core/lib/gif/gif_io.h
+++ b/tensorflow/core/lib/gif/gif_io.h
@@ -43,7 +43,8 @@ namespace tensorflow {
 namespace gif {
 
 uint8* Decode(const void* srcdata, int datasize,
-              std::function<uint8*(int, int, int, int)> allocate_output);
+              const std::function<uint8*(int, int, int, int)>& allocate_output,
+              string* error_string);
 
 }  // namespace gif
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 536fc7c0c1..3f72b41569 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1818,7 +1818,11 @@ REGISTER_OP("_MklMaxPool")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
+#ifndef INTEL_MKL_DNN
     .Output("workspace: T")
+#else
+    .Output("workspace: uint8")
+#endif
     .Output("mkl_output: uint8")
     .Output("mkl_workspace: uint8")
     .SetShapeFn(shape_inference::MaxPoolShape)
@@ -1840,7 +1844,11 @@ REGISTER_OP("_MklMaxPoolGrad")
     .Input("orig_input: T")
     .Input("orig_output: T")
     .Input("grad: T")
+#ifndef INTEL_MKL_DNN
     .Input("workspace: T")
+#else
+    .Input("workspace: uint8")
+#endif
     .Input("mkl_orig_input: uint8")
     .Input("mkl_orig_output: uint8")
     .Input("mkl_grad: uint8")
diff --git a/tensorflow/core/platform/s3/aws_logging.cc b/tensorflow/core/platform/s3/aws_logging.cc
index 41b854d634..fbca0acc36 100644
--- a/tensorflow/core/platform/s3/aws_logging.cc
+++ b/tensorflow/core/platform/s3/aws_logging.cc
@@ -48,6 +48,7 @@ void AWSLogSystem::LogStream(Aws::Utils::Logging::LogLevel log_level,
 
 void AWSLogSystem::LogMessage(Aws::Utils::Logging::LogLevel log_level,
                               const std::string& message) {
+  if (message == "Initializing Curl library") return;
   switch (log_level) {
     case Aws::Utils::Logging::LogLevel::Info:
       LOG(INFO) << message;
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 397f26ec0b..ebda3a2065 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -14,11 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/platform/s3/s3_file_system.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/s3/aws_logging.h"
 #include "tensorflow/core/platform/s3/s3_crypto.h"
 
 #include <aws/core/Aws.h>
+#include <aws/core/config/AWSProfileConfigLoader.h>
 #include <aws/core/utils/FileSystemUtils.h>
 #include <aws/core/utils/logging/AWSLogging.h>
 #include <aws/core/utils/logging/LogSystemInterface.h>
@@ -54,13 +56,37 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
       cfg.endpointOverride = Aws::String(endpoint);
     }
     const char* region = getenv("AWS_REGION");
+    if (!region) {
+      // TODO (yongtang): `S3_REGION` should be deprecated after 2.0.
+      region = getenv("S3_REGION");
+    }
     if (region) {
       cfg.region = Aws::String(region);
     } else {
-      // TODO (yongtang): `S3_REGION` should be deprecated after 2.0.
-      const char* region = getenv("S3_REGION");
-      if (region) {
-        cfg.region = Aws::String(region);
+      // Load config file (e.g., ~/.aws/config) only if AWS_SDK_LOAD_CONFIG
+      // is set with a truthy value.
+      const char* load_config_env = getenv("AWS_SDK_LOAD_CONFIG");
+      string load_config =
+          load_config_env ? str_util::Lowercase(load_config_env) : "";
+      if (load_config == "true" || load_config == "1") {
+        Aws::String config_file;
+        // If AWS_CONFIG_FILE is set then use it, otherwise use ~/.aws/config.
+        const char* config_file_env = getenv("AWS_CONFIG_FILE");
+        if (config_file_env) {
+          config_file = config_file_env;
+        } else {
+          const char* home_env = getenv("HOME");
+          if (home_env) {
+            config_file = home_env;
+            config_file += "/.aws/config";
+          }
+        }
+        Aws::Config::AWSConfigFileProfileConfigLoader loader(config_file);
+        loader.Load();
+        auto profiles = loader.GetProfiles();
+        if (!profiles["default"].GetRegion().empty()) {
+          cfg.region = profiles["default"].GetRegion();
+        }
       }
     }
     const char* use_https = getenv("S3_USE_HTTPS");
@@ -79,6 +105,22 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
         cfg.verifySSL = true;
       }
     }
+    const char* connect_timeout = getenv("S3_CONNECT_TIMEOUT_MSEC");
+    if (connect_timeout) {
+      int64 timeout;
+
+      if (strings::safe_strto64(connect_timeout, &timeout)) {
+        cfg.connectTimeoutMs = timeout;
+      }
+    }
+    const char* request_timeout = getenv("S3_REQUEST_TIMEOUT_MSEC");
+    if (request_timeout) {
+      int64 timeout;
+
+      if (strings::safe_strto64(request_timeout, &timeout)) {
+        cfg.requestTimeoutMs = timeout;
+      }
+    }
 
     init = true;
   }
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 836efc079f..67da7bf452 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc0"
+#define TF_VERSION_SUFFIX "-rc1"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/api_guides/python/python_io.md b/tensorflow/docs_src/api_guides/python/python_io.md
index a5444408fe..06282e49d5 100644
--- a/tensorflow/docs_src/api_guides/python/python_io.md
+++ b/tensorflow/docs_src/api_guides/python/python_io.md
@@ -14,16 +14,16 @@ suitable if fast sharding or other non-sequential access is desired.
 
 ## TFRecords Format Details
 
-A TFRecords file contains a sequence of strings with CRC hashes.  Each record
-has the format
+A TFRecords file contains a sequence of strings with CRC32C (32-bit CRC using
+the Castagnoli polynomial) hashes.  Each record has the format
 
     uint64 length
     uint32 masked_crc32_of_length
     byte   data[length]
     uint32 masked_crc32_of_data
 
-and the records are concatenated together to produce the file.  The CRC32s
-are [described here](https://en.wikipedia.org/wiki/Cyclic_redundancy_check),
-and the mask of a CRC is
+and the records are concatenated together to produce the file. CRCs are
+[described here](https://en.wikipedia.org/wiki/Cyclic_redundancy_check), and
+the mask of a CRC is
 
     masked_crc = ((crc >> 15) | (crc << 17)) + 0xa282ead8ul
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index d79cd1415d..ba1a4118ae 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 49f5350405..87cc647317 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0-rc1.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 47b1251427..37e109a6e4 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.5.0-rc0</version>
+  <version>1.5.0-rc1</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.5.0-rc0</version>
+                 <version>1.5.0-rc1</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.4.0</version>
+  <version>1.5.0-rc1</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.4.0</version>
+  <version>1.5.0-rc1</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0-rc0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0-rc1.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.5.0-rc0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.5.0-rc1.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.5.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.5.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.5.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.5.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index bb1d9a9f57..03f12dff08 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -293,7 +293,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -705,14 +705,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index cf1c5157f8..e13ddadab7 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -115,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -238,7 +238,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc0-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -347,7 +347,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -520,7 +520,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl
 </pre>
 
 
@@ -528,5 +528,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc0-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 90e93f56c5..f494cc7a7c 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -361,10 +361,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.5.0rc0 on Linux:
+for TensorFlow 1.5.0rc1 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0rc0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0rc1-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -462,9 +462,12 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+
+<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.5.0-rc1</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>6</td><td>8</td></tr>
- <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.3.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.2.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>5.1</td><td>8</td></tr>
@@ -477,8 +480,9 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
- <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.1.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.2</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.1.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.2</td><td>5.1</td><td>8</td></tr>
@@ -489,6 +493,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.5.0-rc1</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 87cd95165e..7d1650f05e 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import collections
 import math
 import os
+import sys
+import argparse
 import random
 from tempfile import gettempdir
 import zipfile
@@ -30,6 +32,24 @@ from six.moves import urllib
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
+from tensorflow.contrib.tensorboard.plugins import projector
+
+# Give a folder path as an argument with '--log_dir' to save
+# TensorBoard summaries. Default is a log folder in current directory.
+current_path = os.path.dirname(os.path.realpath(sys.argv[0]))
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+      '--log_dir',
+      type=str,
+      default=os.path.join(current_path, 'log'),
+      help='The log directory for TensorBoard summaries.')
+FLAGS, unparsed = parser.parse_known_args()
+
+# Create the directory for TensorBoard variables if there is not.
+if not os.path.exists(FLAGS.log_dir):
+  os.makedirs(FLAGS.log_dir)
+
 # Step 1: Download the data.
 url = 'http://mattmahoney.net/dc/'
 
@@ -156,38 +176,47 @@ graph = tf.Graph()
 with graph.as_default():
 
   # Input data.
-  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
-  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
-  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
+  with tf.name_scope('inputs'):
+    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
+    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
+    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
 
   # Ops and variables pinned to the CPU because of missing GPU implementation
   with tf.device('/cpu:0'):
     # Look up embeddings for inputs.
-    embeddings = tf.Variable(
-        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
-    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
+    with tf.name_scope('embeddings'):
+      embeddings = tf.Variable(
+          tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
+      embed = tf.nn.embedding_lookup(embeddings, train_inputs)
 
     # Construct the variables for the NCE loss
-    nce_weights = tf.Variable(
-        tf.truncated_normal([vocabulary_size, embedding_size],
-                            stddev=1.0 / math.sqrt(embedding_size)))
-    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
+    with tf.name_scope('weights'):
+      nce_weights = tf.Variable(
+          tf.truncated_normal([vocabulary_size, embedding_size],
+                              stddev=1.0 / math.sqrt(embedding_size)))
+    with tf.name_scope('biases'):
+      nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
 
   # Compute the average NCE loss for the batch.
   # tf.nce_loss automatically draws a new sample of the negative labels each
   # time we evaluate the loss.
   # Explanation of the meaning of NCE loss:
   #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
-  loss = tf.reduce_mean(
-      tf.nn.nce_loss(weights=nce_weights,
-                     biases=nce_biases,
-                     labels=train_labels,
-                     inputs=embed,
-                     num_sampled=num_sampled,
-                     num_classes=vocabulary_size))
+  with tf.name_scope('loss'):
+    loss = tf.reduce_mean(
+        tf.nn.nce_loss(weights=nce_weights,
+                       biases=nce_biases,
+                       labels=train_labels,
+                       inputs=embed,
+                       num_sampled=num_sampled,
+                       num_classes=vocabulary_size))
+
+  # Add the loss value as a scalar to summary.
+  tf.summary.scalar('loss', loss)
 
   # Construct the SGD optimizer using a learning rate of 1.0.
-  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
+  with tf.name_scope('optimizer'):
+    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
 
   # Compute the cosine similarity between minibatch examples and all embeddings.
   norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
@@ -197,13 +226,22 @@ with graph.as_default():
   similarity = tf.matmul(
       valid_embeddings, normalized_embeddings, transpose_b=True)
 
+  # Merge all summaries.
+  merged = tf.summary.merge_all()
+
   # Add variable initializer.
   init = tf.global_variables_initializer()
 
+  # Create a saver.
+  saver = tf.train.Saver()
+
 # Step 5: Begin training.
 num_steps = 100001
 
 with tf.Session(graph=graph) as session:
+  # Open a writer to write summaries.
+  writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)
+
   # We must initialize all variables before we use them.
   init.run()
   print('Initialized')
@@ -214,10 +252,21 @@ with tf.Session(graph=graph) as session:
         batch_size, num_skips, skip_window)
     feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
 
+    # Define metadata variable.
+    run_metadata = tf.RunMetadata()
+
     # We perform one update step by evaluating the optimizer op (including it
     # in the list of returned values for session.run()
-    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
+    # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
+    # Feed metadata variable to session for visualizing the graph in TensorBoard.
+    _, summary, loss_val = session.run([optimizer, merged, loss], feed_dict=feed_dict, run_metadata=run_metadata)
     average_loss += loss_val
+    
+    # Add returned summaries to writer in each step.
+    writer.add_summary(summary, step)
+    # Add metadata to visualize the graph for the last run.
+    if step == (num_steps - 1):
+      writer.add_run_metadata(run_metadata, 'step%d' % step)
 
     if step % 2000 == 0:
       if step > 0:
@@ -240,6 +289,23 @@ with tf.Session(graph=graph) as session:
         print(log_str)
   final_embeddings = normalized_embeddings.eval()
 
+  # Write corresponding labels for the embeddings.
+  with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
+    for i in xrange(vocabulary_size):
+      f.write(reverse_dictionary[i] + '\n')
+
+  # Save the model for checkpoints.
+  saver.save(session, os.path.join(FLAGS.log_dir, "model.ckpt"))
+
+  # Create a configuration for visualizing embeddings with the labels in TensorBoard.
+  config = projector.ProjectorConfig()
+  embedding_conf = config.embeddings.add()
+  embedding_conf.tensor_name = embeddings.name
+  embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
+  projector.visualize_embeddings(writer, config)
+
+writer.close()
+
 # Step 6: Visualize the embeddings.
 
 
diff --git a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
index 53c8be1d1d..eac1c1960d 100644
--- a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -301,6 +302,27 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testPaddedBatchDatasetUnicode(self):
+    # See GitHub issue 16149
+    def generator():
+      data = [
+          [u'Простой', u'тест', u'юникода'],
+          [u'никогда', u'не', u'бывает', u'простым']]
+
+      for seq in data:
+        yield seq, [0, 1, 2, 3]
+
+    dataset = dataset_ops.Dataset.from_generator(
+        generator,
+        (dtypes.string, dtypes.int32),
+        (tensor_shape.TensorShape([None]), tensor_shape.TensorShape([None])))
+    padded_dataset = dataset.padded_batch(2, padded_shapes=([None], [None]),
+                                          padding_values=('', 0))
+    with self.test_session() as sess:
+      next_element = padded_dataset.make_one_shot_iterator().get_next()
+      sess.run(next_element)
+
+
   def testPaddedBatchDatasetShapeSpecifications(self):
     int_placeholder = array_ops.placeholder(dtypes.int32)
     float_placeholder = array_ops.placeholder(dtypes.float32)
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index b4c6b2747e..2e914fa7e0 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -128,9 +128,10 @@ class Estimator(object):
 
       model_dir: Directory to save model parameters, graph and etc. This can
         also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model. If `None`, the model_dir in
-        `config` will be used if set. If both are set, they must be same. If
-        both are `None`, a temporary directory will be used.
+        continue training a previously saved model. If `PathLike` object, the
+        path will be resolved. If `None`, the model_dir in `config` will be used
+        if set. If both are set, they must be same. If both are `None`, a
+        temporary directory will be used.
       config: Configuration object.
       params: `dict` of hyper parameters that will be passed into `model_fn`.
               Keys are names of parameters, values are basic python types.
@@ -158,6 +159,7 @@ class Estimator(object):
       self._config = config
 
     # Model directory.
+    model_dir = compat.path_to_str(model_dir)
     if (model_dir is not None) and (self._config.model_dir is not None):
       if model_dir != self._config.model_dir:
         # TODO(alanyee): remove this suppression after it is no longer needed
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 894005550d..61a537022b 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -27,6 +27,7 @@ import six
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
+from tensorflow.python.util import compat
 
 
 _USE_DEFAULT = object()
@@ -399,7 +400,8 @@ class RunConfig(object):
 
     Args:
       model_dir: directory where model parameters, graph, etc are saved. If
-        `None`, will use a default value set by the Estimator.
+        `PathLike` object, the path will be resolved. If `None`, will use a
+        default value set by the Estimator.
       tf_random_seed: Random seed for TensorFlow initializers.
         Setting this value allows consistency between reruns.
       save_summary_steps: Save summaries every this many steps.
@@ -442,7 +444,7 @@ class RunConfig(object):
     if tf_config:
       logging.info('TF_CONFIG environment variable: %s', tf_config)
 
-    model_dir = _get_model_dir(tf_config, model_dir)
+    model_dir = _get_model_dir(tf_config, compat.path_to_str(model_dir))
 
     RunConfig._replace(
         self,
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index 0a56d3f64d..d3d8c9c154 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -60,7 +60,6 @@ def _eager_reshape(tensor, shape, ctx):
   attr_t = tensor._datatype_enum()  # pylint: disable=protected-access
   attr_tshape, (shape,) = execute.args_to_matching_eager(
       [shape], ctx, dtypes.int32)
-  attr_tshape = attr_tshape
   inputs_flat = [tensor, shape]
   attrs = ("T", attr_t, "Tshape", attr_tshape)
   result, = execute.execute(
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index d92797a7d3..e2e6205911 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -30,27 +30,29 @@ from tensorflow.python.platform import test
 class Conv1DTest(test.TestCase):
 
   def testBasic(self):
-    """Test that argument passing to conv2d is handled properly."""
-
-    x = constant_op.constant([1, 2, 3, 4], dtype=dtypes.float32)
-    x = array_ops.expand_dims(x, 0)  # Add batch dimension
-    x = array_ops.expand_dims(x, 2)  # And depth dimension
-    filters = constant_op.constant([2, 1], dtype=dtypes.float32)
-    filters = array_ops.expand_dims(filters, 1)  # in_channels
-    filters = array_ops.expand_dims(filters, 2)  # out_channels
-    # Filters is 2x1x1
-    for stride in [1, 2]:
-      with self.test_session(use_gpu=test.is_gpu_available()):
-        c = nn_ops.conv1d(x, filters, stride, padding="VALID")
-        reduced = array_ops.squeeze(c)
-        output = reduced.eval()
-        if stride == 1:
-          self.assertEqual(len(output), 3)
-          self.assertAllClose(output,
-                              [2 * 1 + 1 * 2, 2 * 2 + 1 * 3, 2 * 3 + 1 * 4])
-        else:
-          self.assertEqual(len(output), 2)
-          self.assertAllClose(output, [2 * 1 + 1 * 2, 2 * 3 + 1 * 4])
+    """Test that argument passing to conv1d is handled properly."""
+    # TODO(yongtang): dtypes.float64 can only be enabled once conv2d support
+    # dtypes.float64, as conv1d implicitly calls conv2d after expand_dims.
+    for dtype in [dtypes.float16, dtypes.float32]:
+      x = constant_op.constant([1, 2, 3, 4], dtype=dtype)
+      x = array_ops.expand_dims(x, 0)  # Add batch dimension
+      x = array_ops.expand_dims(x, 2)  # And depth dimension
+      filters = constant_op.constant([2, 1], dtype=dtype)
+      filters = array_ops.expand_dims(filters, 1)  # in_channels
+      filters = array_ops.expand_dims(filters, 2)  # out_channels
+      # Filters is 2x1x1
+      for stride in [1, 2]:
+        with self.test_session(use_gpu=test.is_gpu_available()):
+          c = nn_ops.conv1d(x, filters, stride, padding="VALID")
+          reduced = array_ops.squeeze(c)
+          output = reduced.eval()
+          if stride == 1:
+            self.assertEqual(len(output), 3)
+            self.assertAllClose(output,
+                                [2 * 1 + 1 * 2, 2 * 2 + 1 * 3, 2 * 3 + 1 * 4])
+          else:
+            self.assertEqual(len(output), 2)
+            self.assertAllClose(output, [2 * 1 + 1 * 2, 2 * 3 + 1 * 4])
 
   def testConv1DTranspose(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index cea12ea8ec..a91917b27f 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
@@ -1168,6 +1169,32 @@ class BinaryOpTest(test.TestCase):
       self._compareCpu(x1, x2, np.arctan2, math_ops.atan2)
       self._compareGpu(x1, x2, np.arctan2, math_ops.atan2)
 
+  def testPowNegativeExponent(self):
+    for dtype in [np.int32, np.int64]:
+      with self.test_session(use_gpu=False) as sess:
+        with self.assertRaisesRegexp(
+            errors_impl.InvalidArgumentError,
+            "Integers to negative integer powers are not allowed"):
+          x = np.array([5, 2]).astype(dtype)
+          y = np.array([-2, 3]).astype(dtype)
+          sess.run(math_ops.pow(x, y))
+
+      with self.test_session(use_gpu=False) as sess:
+        with self.assertRaisesRegexp(
+            errors_impl.InvalidArgumentError,
+            "Integers to negative integer powers are not allowed"):
+          x = np.array([5, 2]).astype(dtype)
+          y = np.array([2, -3]).astype(dtype)
+          sess.run(math_ops.pow(x, y))
+
+      with self.test_session(use_gpu=False) as sess:
+        with self.assertRaisesRegexp(
+            errors_impl.InvalidArgumentError,
+            "Integers to negative integer powers are not allowed"):
+          x = np.array([5, 2]).astype(dtype)
+          y = -3
+          sess.run(math_ops.pow(x, y))
+
 
 class ComparisonOpTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index 3358b78efd..e0e752147c 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -3628,7 +3628,8 @@ class MeanPerClassAccuracyTest(test.TestCase):
         predictions=array_ops.ones([10, 1]),
         labels=array_ops.ones([10, 1]),
         num_classes=2)
-    _assert_metric_variables(self, ('mean_accuracy/total_confusion_matrix:0',))
+    _assert_metric_variables(self, ('mean_accuracy/count:0',
+                                    'mean_accuracy/total:0'))
 
   def testMetricsCollections(self):
     my_collection_name = '__metrics__'
@@ -3797,23 +3798,6 @@ class MeanPerClassAccuracyTest(test.TestCase):
       desired_output = np.mean([1.0 / 2.0, 2.0 / 3.0, 0.])
       self.assertAlmostEqual(desired_output, mean_accuracy.eval())
 
-  def testUpdateOpEvalIsAccumulatedConfusionMatrix(self):
-    predictions = array_ops.concat([
-        constant_op.constant(0, shape=[5]), constant_op.constant(1, shape=[5])
-    ], 0)
-    labels = array_ops.concat([
-        constant_op.constant(0, shape=[3]), constant_op.constant(1, shape=[7])
-    ], 0)
-    num_classes = 2
-    with self.test_session() as sess:
-      mean_accuracy, update_op = metrics.mean_per_class_accuracy(
-          labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
-      confusion_matrix = update_op.eval()
-      self.assertAllEqual([[3, 0], [2, 5]], confusion_matrix)
-      desired_mean_accuracy = np.mean([3. / 3., 5. / 7.])
-      self.assertAlmostEqual(desired_mean_accuracy, mean_accuracy.eval())
-
   def testAllCorrect(self):
     predictions = array_ops.zeros([40])
     labels = array_ops.zeros([40])
@@ -3822,7 +3806,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
       sess.run(variables.local_variables_initializer())
-      self.assertEqual(40, update_op.eval()[0])
+      self.assertEqual(1.0, update_op.eval()[0])
       self.assertEqual(1.0, mean_accuracy.eval())
 
   def testAllWrong(self):
@@ -3833,7 +3817,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
       sess.run(variables.local_variables_initializer())
-      self.assertAllEqual([[0, 0], [40, 0]], update_op.eval())
+      self.assertAllEqual([0.0, 0.0], update_op.eval())
       self.assertEqual(0., mean_accuracy.eval())
 
   def testResultsWithSomeMissing(self):
@@ -3852,8 +3836,9 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes, weights=weights)
       sess.run(variables.local_variables_initializer())
-      self.assertAllEqual([[2, 0], [2, 4]], update_op.eval())
-      desired_mean_accuracy = np.mean([2. / 2., 4. / 6.])
+      desired_accuracy = np.array([2. / 2., 4. / 6.], dtype=np.float32)
+      self.assertAllEqual(desired_accuracy, update_op.eval())
+      desired_mean_accuracy = np.mean(desired_accuracy)
       self.assertAlmostEqual(desired_mean_accuracy, mean_accuracy.eval())
 
 
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index 43be08f8a1..c6c7c4e26c 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -240,6 +240,16 @@ class XentTest(test.TestCase):
     self._testXentWrapper(features, labels, dim=-1, use_gpu=False)
     self._testXentWrapper(features, labels, dim=-1, use_gpu=True)
 
+  def testZeroDimension(self):
+    features = np.zeros([0, 2, 4]).astype(np.float32)
+    labels = np.zeros([0, 2, 4]).astype(np.float32)
+    np_loss, _ = self._npXent(features, labels)
+    with self.test_session(use_gpu=True) as sess:
+      loss = nn_ops.softmax_cross_entropy_with_logits(
+          labels=labels, logits=features)
+      tf_loss = sess.run(loss)
+    self.assertAllEqual(np_loss, tf_loss)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index c6bd7aae07..ab06a3a408 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -63,14 +63,18 @@ class _Pooling1D(base.Layer):
   def call(self, inputs):
     # There is no TF op for 1D pooling, hence we make the inputs 4D.
     if self.data_format == 'channels_last':
-      inputs = array_ops.expand_dims(inputs, 2)
-      pool_shape = (1,) + self.pool_size + (1, 1)
-      strides = (1,) + self.strides + (1, 1)
-      data_format = 'NHWC'
-    else:
+      # input is NWC, make it NHWC
       inputs = array_ops.expand_dims(inputs, 1)
+      # pool on the W dim
       pool_shape = (1, 1) + self.pool_size + (1,)
       strides = (1, 1) + self.strides + (1,)
+      data_format = 'NHWC'
+    else:
+      # input is NCW, make it NCHW
+      inputs = array_ops.expand_dims(inputs, 2)
+      # pool on the W dim
+      pool_shape = (1, 1, 1) + self.pool_size
+      strides = (1, 1, 1) + self.strides
       data_format = 'NCHW'
 
     outputs = self.pool_function(
@@ -81,9 +85,9 @@ class _Pooling1D(base.Layer):
         data_format=data_format)
 
     if self.data_format == 'channels_last':
-      return array_ops.squeeze(outputs, 2)
-    else:
       return array_ops.squeeze(outputs, 1)
+    else:
+      return array_ops.squeeze(outputs, 2)
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
diff --git a/tensorflow/python/layers/pooling_test.py b/tensorflow/python/layers/pooling_test.py
index 589fee5f71..e4d4ed4a2a 100644
--- a/tensorflow/python/layers/pooling_test.py
+++ b/tensorflow/python/layers/pooling_test.py
@@ -110,19 +110,19 @@ class PoolingTest(test.TestCase):
 
   def testCreateMaxPooling1DChannelsFirst(self):
     width = 7
-    images = random_ops.random_uniform((5, width, 4))
+    images = random_ops.random_uniform((5, 4, width))
     layer = pooling_layers.MaxPooling1D(
         2, strides=2, data_format='channels_first')
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
+    self.assertListEqual(output.get_shape().as_list(), [5, 4, 3])
 
   def testCreateAveragePooling1DChannelsFirst(self):
     width = 7
-    images = random_ops.random_uniform((5, width, 4))
+    images = random_ops.random_uniform((5, 4, width))
     layer = pooling_layers.AveragePooling1D(
         2, strides=2, data_format='channels_first')
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
+    self.assertListEqual(output.get_shape().as_list(), [5, 4, 3])
 
   def testCreateMaxPooling3D(self):
     depth, height, width = 6, 7, 9
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index dc56b39486..d3bfa0ee33 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
 #include "tensorflow/python/lib/core/py_util.h"
 #include "tensorflow/python/lib/core/safe_ptr.h"
+
 #include <Python.h>
 
 namespace tensorflow {
@@ -141,7 +142,8 @@ bool IsSingleNone(PyObject* obj) {
     return false;
   }
   std::array<npy_intp, 0> indices;
-  char* item_ptr = static_cast<char*>(PyArray_GetPtr(array_obj, indices.data()));
+  char* item_ptr =
+      static_cast<char*>(PyArray_GetPtr(array_obj, indices.data()));
   PyObject* item = PyArray_GETITEM(array_obj, item_ptr);
   CHECK(item);
   return item == Py_None;
@@ -301,13 +303,22 @@ Status ConvertNdarrayToTensor(PyObject* obj, Tensor* ret) {
         if (PyBytes_AsStringAndSize(input_data[i], &el, &el_size) == -1) {
 #if PY_MAJOR_VERSION >= 3
           el = PyUnicode_AsUTF8AndSize(input_data[i], &el_size);
-          if (!el) {
+#else
+          el = nullptr;
+          if (PyUnicode_Check(input_data[i])) {
+            PyObject* unicode = PyUnicode_AsUTF8String(input_data[i]);
+            if (unicode) {
+              if (PyString_AsStringAndSize(unicode, &el, &el_size) == -1) {
+                Py_DECREF(unicode);
+                el = nullptr;
+              }
+            }
+          }
 #endif
+          if (!el) {
             return errors::Unimplemented("Unsupported object type ",
                                          input_data[i]->ob_type->tp_name);
-#if PY_MAJOR_VERSION >= 3
           }
-#endif
         }
         tflat(i) = string(el, el_size);
       }
diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py
index d46084a41f..b2de2e5015 100644
--- a/tensorflow/python/ops/histogram_ops.py
+++ b/tensorflow/python/ops/histogram_ops.py
@@ -17,6 +17,7 @@
 
 Please see @{$python/histogram_ops} guide.
 
+@@histogram_fixed_width_bins
 @@histogram_fixed_width
 """
 
@@ -33,6 +34,70 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
+def histogram_fixed_width_bins(values,
+                               value_range,
+                               nbins=100,
+                               dtype=dtypes.int32,
+                               name=None):
+  """Bins the given values for use in a histogram.
+
+  Given the tensor `values`, this operation returns a rank 1 `Tensor`
+  representing the indices of a histogram into which each element
+  of `values` would be binned. The bins are equal width and
+  determined by the arguments `value_range` and `nbins`.
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
+      values <= value_range[0] will be mapped to hist[0],
+      values >= value_range[1] will be mapped to hist[-1].
+    nbins:  Scalar `int32 Tensor`.  Number of histogram bins.
+    dtype:  dtype for returned histogram.
+    name:  A name for this operation (defaults to 'histogram_fixed_width').
+
+  Returns:
+    A `Tensor` holding the indices of the binned values whose shape matches
+    `values`.
+
+  Examples:
+
+  ```python
+  # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+  nbins = 5
+  value_range = [0.0, 5.0]
+  new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+
+  with tf.get_default_session() as sess:
+    indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5)
+    variables.global_variables_initializer().run()
+    sess.run(indices) => [0, 0, 1, 2, 4]
+  ```
+  """
+  with ops.name_scope(name, 'histogram_fixed_width_bins',
+                      [values, value_range, nbins]) as scope:
+    values = ops.convert_to_tensor(values, name='values')
+    shape = array_ops.shape(values)
+
+    values = array_ops.reshape(values, [-1])
+    value_range = ops.convert_to_tensor(value_range, name='value_range')
+    nbins = ops.convert_to_tensor(nbins, dtype=dtypes.int32, name='nbins')
+    nbins_float = math_ops.cast(nbins, values.dtype)
+
+    # Map tensor values that fall within value_range to [0, 1].
+    scaled_values = math_ops.truediv(values - value_range[0],
+                                     value_range[1] - value_range[0],
+                                     name='scaled_values')
+
+    # map tensor values within the open interval value_range to {0,.., nbins-1},
+    # values outside the open interval will be zero or less, or nbins or more.
+    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
+
+    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
+    indices = math_ops.cast(
+        clip_ops.clip_by_value(indices, 0, nbins_float - 1), dtypes.int32)
+    return array_ops.reshape(indices, shape)
+
+
 @tf_export('histogram_fixed_width')
 def histogram_fixed_width(values,
                           value_range,
diff --git a/tensorflow/python/ops/histogram_ops_test.py b/tensorflow/python/ops/histogram_ops_test.py
index 19ad6cd2ba..80ee090575 100644
--- a/tensorflow/python/ops/histogram_ops_test.py
+++ b/tensorflow/python/ops/histogram_ops_test.py
@@ -21,11 +21,64 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import histogram_ops
 from tensorflow.python.platform import test
 
 
+class BinValuesFixedWidth(test.TestCase):
+
+  def test_empty_input_gives_all_zero_counts(self):
+    # Bins will be:
+    #   (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+    value_range = [0.0, 5.0]
+    values = []
+    expected_bins = []
+    with self.test_session():
+      bins = histogram_ops.histogram_fixed_width_bins(values, value_range, nbins=5)
+      self.assertEqual(dtypes.int32, bins.dtype)
+      self.assertAllClose(expected_bins, bins.eval())
+
+  def test_1d_values_int32_output(self):
+    # Bins will be:
+    #   (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+    value_range = [0.0, 5.0]
+    values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+    expected_bins = [0, 0, 1, 2, 4, 4]
+    with self.test_session():
+      bins = histogram_ops.histogram_fixed_width_bins(
+          values, value_range, nbins=5, dtype=dtypes.int64)
+      self.assertEqual(dtypes.int32, bins.dtype)
+      self.assertAllClose(expected_bins, bins.eval())
+
+  def test_1d_float64_values_int32_output(self):
+    # Bins will be:
+    #   (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+    value_range = np.float64([0.0, 5.0])
+    values = np.float64([-1.0, 0.0, 1.5, 2.0, 5.0, 15])
+    expected_bins = [0, 0, 1, 2, 4, 4]
+    with self.test_session():
+      bins = histogram_ops.histogram_fixed_width_bins(
+          values, value_range, nbins=5)
+      self.assertEqual(dtypes.int32, bins.dtype)
+      self.assertAllClose(expected_bins, bins.eval())
+
+  def test_2d_values(self):
+    # Bins will be:
+    #   (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+    value_range = [0.0, 5.0]
+    values = constant_op.constant(
+      [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]],
+      shape=(2, 3))
+    expected_bins = [[0, 0, 1], [2, 4, 4]]
+    with self.test_session():
+      bins = histogram_ops.histogram_fixed_width_bins(
+          values, value_range, nbins=5)
+      self.assertEqual(dtypes.int32, bins.dtype)
+      self.assertAllClose(expected_bins, bins.eval())
+
+
 class HistogramFixedWidthTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index d860a3b618..b713c44717 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -148,6 +148,28 @@ def _Check3DImage(image, require_static=True):
     return []
 
 
+def _Assert3DImage(image):
+  """Assert that we are working with a properly shaped image.
+
+    Performs the check statically if possible (i.e. if the shape
+    is statically known). Otherwise adds a control dependency
+    to an assert op that checks the dynamic shape.
+
+    Args:
+      image: 3-D Tensor of shape [height, width, channels]
+
+    Raises:
+      ValueError: if `image.shape` is not a 3-vector.
+
+    Returns:
+      If the shape of `image` could be verified statically, `image` is
+      returned unchanged, otherwise there will be a control dependency
+      added that asserts the correct dynamic shape.
+    """
+  return control_flow_ops.with_dependencies(
+    _Check3DImage(image, require_static=False), image)
+
+
 def _CheckAtLeast3DImage(image, require_static=True):
   """Assert that we are working with properly shaped image.
 
@@ -223,8 +245,7 @@ def random_flip_up_down(image, seed=None):
   """
   with ops.name_scope(None, 'random_flip_up_down', [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    image = control_flow_ops.with_dependencies(
-        _Check3DImage(image, require_static=False), image)
+    image = _Assert3DImage(image)
     uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
     mirror_cond = math_ops.less(uniform_random, .5)
     result = control_flow_ops.cond(mirror_cond,
@@ -255,8 +276,7 @@ def random_flip_left_right(image, seed=None):
   """
   with ops.name_scope(None, 'random_flip_left_right', [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    image = control_flow_ops.with_dependencies(
-        _Check3DImage(image, require_static=False), image)
+    image = _Assert3DImage(image)
     uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
     mirror_cond = math_ops.less(uniform_random, .5)
     result = control_flow_ops.cond(mirror_cond,
@@ -286,8 +306,7 @@ def flip_left_right(image):
   """
   with ops.name_scope(None, 'flip_left_right', [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    image = control_flow_ops.with_dependencies(
-        _Check3DImage(image, require_static=False), image)
+    image = _Assert3DImage(image)
     return fix_image_flip_shape(image,
                                 array_ops.reverse(image, [1], name=scope))
 
@@ -312,8 +331,7 @@ def flip_up_down(image):
   """
   with ops.name_scope(None, 'flip_up_down', [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    image = control_flow_ops.with_dependencies(
-        _Check3DImage(image, require_static=False), image)
+    image = _Assert3DImage(image)
     return fix_image_flip_shape(image,
                                 array_ops.reverse(image, [0], name=scope))
 
@@ -332,8 +350,7 @@ def rot90(image, k=1, name=None):
   """
   with ops.name_scope(name, 'rot90', [image, k]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    image = control_flow_ops.with_dependencies(
-        _Check3DImage(image, require_static=False), image)
+    image = _Assert3DImage(image)
     k = ops.convert_to_tensor(k, dtype=dtypes.int32, name='k')
     k.get_shape().assert_has_rank(0)
     k = math_ops.mod(k, 4)
@@ -373,8 +390,7 @@ def transpose_image(image):
   """
   with ops.name_scope(None, 'transpose_image', [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    image = control_flow_ops.with_dependencies(
-        _Check3DImage(image, require_static=False), image)
+    image = _Assert3DImage(image)
     return array_ops.transpose(image, [1, 0, 2], name=scope)
 
 
@@ -410,8 +426,7 @@ def central_crop(image, central_fraction):
     if central_fraction == 1.0:
       return image
 
-    image = control_flow_ops.with_dependencies(
-        _Check3DImage(image, require_static=False), image)
+    image = _Assert3DImage(image)
 
     img_shape = array_ops.shape(image)
     depth = image.get_shape()[2]
@@ -848,8 +863,7 @@ def per_image_standardization(image):
   """
   with ops.name_scope(None, 'per_image_standardization', [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    image = control_flow_ops.with_dependencies(
-        _Check3DImage(image, require_static=False), image)
+    image = _Assert3DImage(image)
     num_pixels = math_ops.reduce_prod(array_ops.shape(image))
 
     image = math_ops.cast(image, dtype=dtypes.float32)
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 3a49d41c9e..80911ffe07 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -2833,6 +2833,16 @@ class PngTest(test_util.TensorFlowTestCase):
 
 class GifTest(test_util.TensorFlowTestCase):
 
+  def testOptimizedGifErrorString(self):
+    filename = "tensorflow/core/lib/gif/testdata/optimized.gif"
+
+    with self.test_session(use_gpu=True) as sess:
+      gif = io_ops.read_file(filename)
+      image = image_ops.decode_gif(gif)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError, "can't process optimized gif"):
+        gif, image = sess.run([gif, image])
+
   def testValid(self):
     # Read some real GIFs
     prefix = "tensorflow/core/lib/gif/testdata/"
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index e292bccc2c..72508eb435 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -512,7 +512,7 @@ def mean_pairwise_squared_error(
 
   Raises:
     ValueError: If the shape of `predictions` doesn't match that of `labels` or
-      if the shape of `weights` is invalid.  Also if `labels` or `predictions
+      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
       is None.
   """
   if labels is None:
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 92b3ff2250..2d77e26081 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -831,8 +831,8 @@ def mean_per_class_accuracy(labels,
   Calculates the accuracy for each class, then takes the mean of that.
 
   For estimation of the metric over a stream of data, the function creates an
-  `update_op` operation that updates these variables and returns the
-  `mean_accuracy`.
+  `update_op` operation that updates the accuracy of each class and returns
+  them.
 
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
@@ -843,8 +843,8 @@ def mean_per_class_accuracy(labels,
       shape is [batch size] and type `int32` or `int64`. The tensor will be
       flattened if its rank > 1.
     num_classes: The possible number of labels the prediction task can
-      have. This value must be provided, since a confusion matrix of
-      dimension = [num_classes, num_classes] will be allocated.
+      have. This value must be provided, since two variables with shape =
+      [num_classes] will be allocated.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
       be either `1`, or the same as the corresponding `labels` dimension).
@@ -857,7 +857,7 @@ def mean_per_class_accuracy(labels,
 
   Returns:
     mean_accuracy: A `Tensor` representing the mean per class accuracy.
-    update_op: An operation that increments the confusion matrix.
+    update_op: An operation that updates the accuracy tensor.
 
   Raises:
     ValueError: If `predictions` and `labels` have mismatched shapes, or if
@@ -872,27 +872,43 @@ def mean_per_class_accuracy(labels,
 
   with variable_scope.variable_scope(name, 'mean_accuracy',
                                      (predictions, labels, weights)):
+    labels = math_ops.to_int64(labels)
+
+    # Flatten the input if its rank > 1.
+    if labels.get_shape().ndims > 1:
+      labels = array_ops.reshape(labels, [-1])
+
+    if predictions.get_shape().ndims > 1:
+      predictions = array_ops.reshape(predictions, [-1])
+
     # Check if shape is compatible.
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
-    total_cm, update_op = _streaming_confusion_matrix(
-        labels, predictions, num_classes, weights=weights)
+    total = metric_variable([num_classes], dtypes.float32, name='total')
+    count = metric_variable([num_classes], dtypes.float32, name='count')
 
-    def compute_mean_accuracy(name):
-      """Compute the mean per class accuracy via the confusion matrix."""
-      per_row_sum = math_ops.to_float(math_ops.reduce_sum(total_cm, 1))
-      cm_diag = math_ops.to_float(array_ops.diag_part(total_cm))
-      denominator = per_row_sum
+    ones = array_ops.ones([array_ops.size(labels)], dtypes.float32)
 
-      # If the value of the denominator is 0, set it to 1 to avoid
-      # zero division.
-      denominator = array_ops.where(
-          math_ops.greater(denominator, 0), denominator,
-          array_ops.ones_like(denominator))
-      accuracies = math_ops.div(cm_diag, denominator)
-      return math_ops.reduce_mean(accuracies, name=name)
+    if labels.dtype != predictions.dtype:
+      predictions = math_ops.cast(predictions, labels.dtype)
+    is_correct = math_ops.to_float(math_ops.equal(predictions, labels))
+
+    if weights is not None:
+      if weights.get_shape().ndims > 1:
+        weights = array_ops.reshape(weights, [-1])
+      weights = math_ops.to_float(weights)
+
+      is_correct = is_correct * weights
+      ones = ones * weights
+
+    update_total_op = state_ops.scatter_add(total, labels, ones)
+    update_count_op = state_ops.scatter_add(count, labels, is_correct)
+
+    per_class_accuracy = _safe_div(count, total, None)
 
-    mean_accuracy_v = compute_mean_accuracy('mean_accuracy')
+    mean_accuracy_v = math_ops.reduce_mean(per_class_accuracy,
+                                           name='mean_accuracy')
+    update_op = _safe_div(update_count_op, update_total_op, name='update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, mean_accuracy_v)
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 67eee1c29e..837ee02e64 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -196,7 +196,10 @@ def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None):
       targets * -log(sigmoid(logits)) +
           (1 - targets) * -log(1 - sigmoid(logits))
 
-  The argument `pos_weight` is used as a multiplier for the positive targets:
+  A value `pos_weights > 1` decreases the false negative count, hence increasing the recall.
+  Conversely setting `pos_weights < 1` decreases the false positive count and increases the precision.
+  This can be seen from the fact that `pos_weight` is introduced as a multiplicative coefficient for the positive targets term 
+  in the loss expression:
 
       targets * -log(sigmoid(logits)) * pos_weight +
           (1 - targets) * -log(1 - sigmoid(logits))
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 09aa45dae1..32b14f86b5 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1558,7 +1558,8 @@ def leaky_relu(features, alpha=0.2, name=None):
   http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf
 
   Args:
-    features: A `Tensor` representing preactivation values.
+    features: A `Tensor` representing preactivation values. Must be one of
+      the following types: `float16`, `float32`, `float64`, `int32`, `int64`.
     alpha: Slope of the activation function at x < 0.
     name: A name for the operation (optional).
 
@@ -1567,7 +1568,9 @@ def leaky_relu(features, alpha=0.2, name=None):
   """
   with ops.name_scope(name, "LeakyRelu", [features, alpha]):
     features = ops.convert_to_tensor(features, name="features")
-    alpha = ops.convert_to_tensor(alpha, name="alpha")
+    if features.dtype.is_integer:
+      features = math_ops.to_float(features)
+    alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha")
     return math_ops.maximum(alpha * features, features)
 
 
@@ -2323,7 +2326,7 @@ def conv1d(value, filters, stride, padding,
   returned to the caller.
 
   Args:
-    value: A 3D `Tensor`.  Must be of type `float32` or `float64`.
+    value: A 3D `Tensor`.  Must be of type `float16` or `float32`.
     filters: A 3D `Tensor`.  Must have the same type as `input`.
     stride: An `integer`.  The number of entries by which
       the filter is moved right at each step.
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 66bc0803b7..6767564024 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -878,11 +878,13 @@ class LeakyReluTest(test_lib.TestCase):
     self.assertAllClose(inputs, outputs)
 
   def testValues(self):
-    np_values = np.array([-1.0, 0.0, 0.5, 1.0, 2.0], dtype=np.float32)
-    outputs = nn_ops.leaky_relu(constant_op.constant(np_values))
-    with self.test_session() as sess:
-      outputs = sess.run(outputs)
-    self.assertAllClose(outputs, [-0.2, 0.0, 0.5, 1.0, 2.0])
+    for dtype in [np.int32, np.int64, np.float16, np.float32, np.float64]:
+      np_values = np.array([-2, -1, 0, 1, 2], dtype=dtype)
+      outputs = nn_ops.leaky_relu(constant_op.constant(np_values))
+      with self.test_session() as sess:
+        outputs = sess.run(outputs)
+      tol = 2e-3 if dtype == np.float16 else 1e-6
+      self.assertAllClose(outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol)
 
 
 class SwishTest(test_lib.TestCase):
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 1bf5551aff..f1ac3e9baf 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -987,6 +987,10 @@ class DropoutWrapper(RNNCell):
     string = (str(self._seed) + salt).encode("utf-8")
     return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF
 
+  @property
+  def wrapped_cell(self):
+    return self._cell
+
   @property
   def state_size(self):
     return self._cell.state_size
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 355593eca5..92c1fcadd2 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -286,12 +286,13 @@ def merge(inputs, collections=None, name=None):
   return val
 
 
-def merge_all(key=_ops.GraphKeys.SUMMARIES):
+def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None):
   """Merges all summaries collected in the default graph.
 
   Args:
     key: `GraphKey` used to collect the summaries.  Defaults to
       `GraphKeys.SUMMARIES`.
+    scope: Optional scope used to filter the summary ops, using `re.match`
 
   Returns:
     If no summaries were collected, returns None.  Otherwise returns a scalar
@@ -310,7 +311,7 @@ def merge_all(key=_ops.GraphKeys.SUMMARIES):
     raise RuntimeError(
         'Merging tf.summary.* ops is not compatible with eager execution. '
         'Use tf.contrib.summary instead.')
-  summary_ops = _ops.get_collection(key)
+  summary_ops = _ops.get_collection(key, scope=scope)
   if not summary_ops:
     return None
   else:
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 07382d93df..3ab0bd16fa 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -21,6 +21,7 @@ In addition to the functions below, `as_str` converts an object to a `str`.
 @@as_bytes
 @@as_text
 @@as_str_any
+@@path_to_str
 
 ## Types
 The compatibility module also provides the following types:
@@ -108,6 +109,20 @@ def as_str_any(value):
     return str(value)
 
 
+def path_to_str(path):
+  """Returns the file system path representation of a `PathLike` object, else as it is.
+
+  Args:
+    path: An object that can be converted to path representation.
+
+  Returns:
+    A `str` object.
+  """
+  if hasattr(path, "__fspath__"):
+    path = as_str_any(path.__fspath__())
+  return path
+
+
 # Numpy 1.8 scalars don't inherit from numbers.Integral in Python 3, so we
 # need to check them specifically.  The same goes from Real and Complex.
 integral_types = (_numbers.Integral, _np.integer)
diff --git a/tensorflow/tools/api/golden/tensorflow.compat.pbtxt b/tensorflow/tools/api/golden/tensorflow.compat.pbtxt
index ccc6031400..bab480ff9b 100644
--- a/tensorflow/tools/api/golden/tensorflow.compat.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.compat.pbtxt
@@ -32,4 +32,8 @@ tf_module {
     name: "as_text"
     argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], "
   }
+  member_method {
+    name: "path_to_str"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index f61a5a28e3..97edf245f6 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "weights"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "wrapped_cell"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'cell\', \'input_keep_prob\', \'output_keep_prob\', \'state_keep_prob\', \'variational_recurrent\', \'input_size\', \'dtype\', \'seed\', \'dropout_state_filter_visitor\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1.0\', \'1.0\', \'False\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 35917e94ad..db1ed42185 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -1160,6 +1160,10 @@ tf_module {
     name: "histogram_fixed_width"
     argspec: "args=[\'values\', \'value_range\', \'nbins\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'100\', \"<dtype: \'int32\'>\", \'None\'], "
   }
+  member_method {
+    name: "histogram_fixed_width_bins"
+    argspec: "args=[\'values\', \'value_range\', \'nbins\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'100\', \"<dtype: \'int32\'>\", \'None\'], "
+  }
   member_method {
     name: "identity"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
index 326e077d39..871ebb5247 100644
--- a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
@@ -50,7 +50,7 @@ tf_module {
   }
   member_method {
     name: "merge_all"
-    argspec: "args=[\'key\'], varargs=None, keywords=None, defaults=[\'summaries\'], "
+    argspec: "args=[\'key\', \'scope\'], varargs=None, keywords=None, defaults=[\'summaries\', \'None\'], "
   }
   member_method {
     name: "scalar"
diff --git a/tensorflow/tools/benchmark/BUILD b/tensorflow/tools/benchmark/BUILD
index caa6629c49..6ed2594e6a 100644
--- a/tensorflow/tools/benchmark/BUILD
+++ b/tensorflow/tools/benchmark/BUILD
@@ -61,10 +61,11 @@ tf_cc_test(
 
 # This binary may be built for either desktop or Android.
 # A typical Android build command will look like the following:
-# bazel build -c opt tensorflow/core:android_tensorflow_lib \
+# bazel build tensorflow/core:android_tensorflow_lib \
 # --crosstool_top=//external:android/crosstool \
 # --cpu=armeabi-v7a \
 # --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+# --config monolithic
 tf_cc_binary(
     name = "benchmark_model",
     testonly = 1,
diff --git a/tensorflow/tools/benchmark/README.md b/tensorflow/tools/benchmark/README.md
index ca0da2d41b..e64af2bfe1 100644
--- a/tensorflow/tools/benchmark/README.md
+++ b/tensorflow/tools/benchmark/README.md
@@ -17,6 +17,7 @@ bazel build -c opt \
   --crosstool_top=//external:android/crosstool \
   --cpu=armeabi-v7a \
   --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
+  --config monolithic \
   tensorflow/tools/benchmark:benchmark_model
 ```
 
diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh
index aadf480d37..9b3ff0cba7 100755
--- a/tensorflow/tools/ci_build/builds/libtensorflow.sh
+++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh
@@ -51,7 +51,7 @@ function build_libtensorflow_tarball() {
   rm -rf ${DIR}
 
   TARBALL_SUFFIX="${1}"
-  BAZEL_OPTS="-c opt"
+  BAZEL_OPTS="-c opt --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0"
   export CC_OPT_FLAGS='-mavx'
   if [ "${TF_NEED_CUDA}" == "1" ]; then
     BAZEL_OPTS="${BAZEL_OPTS} --config=cuda"
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index b728c878da..aa341b144c 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -26,6 +26,8 @@
 SCRIPT_DIR=$( cd ${0%/*} && pwd -P )
 source "${SCRIPT_DIR}/builds/builds_common.sh"
 
+ROOT_DIR=$( cd "$SCRIPT_DIR/../../.." && pwd -P )
+
 # Helper functions
 die() {
   echo $@
@@ -418,15 +420,8 @@ do_bazel_nobuild() {
 }
 
 do_pip_smoke_test() {
-  BUILD_CMD="bazel build ${BAZEL_FLAGS} //tensorflow/tools/pip_package:pip_smoke_test"
-  ${BUILD_CMD}
-  cmd_status \
-    "Pip smoke test has failed. Please make sure any new TensorFlow are added to the tensorflow/tools/pip_package:build_pip_package dependencies."
-
-  RUN_CMD="bazel-bin/tensorflow/tools/pip_package/pip_smoke_test"
-  ${RUN_CMD}
-  cmd_status \
-    "The pip smoke test failed."
+  cd "$ROOT_DIR/tensorflow/tools/pip_package"
+  python pip_smoke_test.py
 }
 
 do_code_link_check() {
@@ -500,20 +495,23 @@ do_clang_format_check() {
 }
 
 do_check_load_py_test() {
-  BUILD_CMD="bazel build ${BAZEL_FLAGS} //tensorflow/tools/pip_package:check_load_py_test"
-  ${BUILD_CMD}
-  cmd_status \
-    "check_load_py_test failed to build."
+  cd "$ROOT_DIR/tensorflow/tools/pip_package"
+  python check_load_py_test.py
+}
 
-  BUILD_CMD="bazel-bin/tensorflow/tools/pip_package/check_load_py_test"
-  ${BUILD_CMD}
-  cmd_status \
-    "check_load_py_test failed."
+do_cmake_python_sanity() {
+  cd "$ROOT_DIR/tensorflow/contrib/cmake"
+  python -m unittest -v python_sanity_test
+}
+
+do_check_futures_test() {
+  cd "$ROOT_DIR/tensorflow/tools/test"
+  python check_futures_test.py
 }
 
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency")
 
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS="--config=hdfs --config=gcp"
@@ -548,7 +546,10 @@ while [[ ${COUNTER} -lt "${#SANITY_STEPS[@]}" ]]; do
 "${SANITY_STEPS[COUNTER]} (${SANITY_STEPS_DESC[COUNTER]}) ==="
   echo ""
 
+  # subshell: don't leak variables or changes of working directory
+  (
   ${SANITY_STEPS[COUNTER]} ${INCREMENTAL_FLAG}
+  )
   RESULT=$?
 
   if [[ ${RESULT} != "0" ]]; then
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 1214b6b0a3..fa867b65db 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -414,7 +414,7 @@ if [[ ! -z "${TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS}" ]]; then
   if [[ $? != "0" ]]; then
     die "FAIL: Unable to login. Invalid credentials."
   fi
-  docker push $1
+  docker push "${FINAL_IMG}"
   if [[ $? == "0" ]]; then
     docker logout
     echo "Successfully pushed Docker image ${FINAL_IMG}"
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index ff5dd6a0b0..c789e2ba0c 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -47,27 +47,6 @@ py_binary(
     deps = ["//tensorflow:tensorflow_py"],
 )
 
-py_test(
-    name = "pip_smoke_test",
-    srcs = ["pip_smoke_test.py"],
-    data = [
-        "//tensorflow:all_opensource_files",
-    ],
-    tags = [
-        "manual",
-        "notap",
-    ],
-)
-
-py_binary(
-    name = "check_load_py_test",
-    srcs = ["check_load_py_test.py"],
-    data = [
-        "//tensorflow:all_opensource_files",
-    ],
-    srcs_version = "PY2AND3",
-)
-
 # On Windows, python binary is a zip file of runfiles tree.
 # Add everything to its data dependency for generating a runfiles tree
 # for building the pip package on Windows.
diff --git a/tensorflow/tools/pip_package/check_load_py_test.py b/tensorflow/tools/pip_package/check_load_py_test.py
index 79d11b08ce..e2fe1121d7 100644
--- a/tensorflow/tools/pip_package/check_load_py_test.py
+++ b/tensorflow/tools/pip_package/check_load_py_test.py
@@ -22,6 +22,9 @@ import os
 import subprocess
 
 
+os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
+
+
 def check_output_despite_error(args):
   """Get output of args from command line, even if there are errors.
 
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index cddf9c8f44..8eee489e2d 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -23,8 +23,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import subprocess
 
+
+os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
+
+
 PIP_PACKAGE_QUERY_EXPRESSION = \
   'deps(//tensorflow/tools/pip_package:build_pip_package)'
 
@@ -134,8 +139,8 @@ def main():
 
     raise RuntimeError("""One or more dependencies are not in the pip package.
 Please either blacklist the dependencies in
-tensorflow/tensorflow/tensorflow/tools/pip_package/pip_smoke_test.py
-or add them to tensorflow/tensorflow/tensorflow/tools/pip_package/BUILD.""")
+//tensorflow/tools/pip_package/pip_smoke_test.py
+or add them to //tensorflow/tools/pip_package/BUILD.""")
 
   else:
     print("TEST PASSED")
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index b2ca4a43c1..62df6453fb 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.5.0-rc0'
+_VERSION = '1.5.0-rc1'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD
index 28d651e910..159a8c1cfb 100644
--- a/tensorflow/tools/test/BUILD
+++ b/tensorflow/tools/test/BUILD
@@ -104,12 +104,3 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
-
-py_test(
-    name = "check_futures_test",
-    size = "small",
-    srcs = ["check_futures_test.py"],
-    data = ["//tensorflow:all_opensource_files"],
-    srcs_version = "PY2AND3",
-    deps = ["@six_archive//:six"],
-)
diff --git a/tensorflow/tools/test/check_futures_test.py b/tensorflow/tools/test/check_futures_test.py
index 1c07511888..9181c9bd4a 100644
--- a/tensorflow/tools/test/check_futures_test.py
+++ b/tensorflow/tools/test/check_futures_test.py
@@ -33,7 +33,7 @@ import re
 
 import six
 
-BASE_DIR = os.path.normpath(os.path.join(__file__, '../../..'))
+BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
 FUTURES_PATTERN = re.compile(r'^from __future__ import (\w+)\s*$')
 FUTURES_PATTERN_2 = re.compile(
     r'^from __future__ import (\w+), (\w+), (\w+)\s*$')
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index d17dc81024..79e14e0e92 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -5,6 +5,7 @@ load("//third_party/mkl:build_defs.bzl", "mkl_repository")
 load("//third_party/git:git_configure.bzl", "git_configure")
 load("//third_party/py:python_configure.bzl", "python_configure")
 load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
+load("//third_party/toolchains/clang6:repo.bzl", "clang6_configure")
 load("//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl", "arm_compiler_configure")
 load("//third_party:repo.bzl", "tf_http_archive")
 load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
@@ -65,6 +66,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   # files, in case the parsing of those build files depends on the bazel
   # version we require here.
   check_bazel_version_at_least("0.5.4")
+  clang6_configure(name="local_config_clang6")
   cuda_configure(name="local_config_cuda")
   git_configure(name="local_config_git")
   sycl_configure(name="local_config_sycl")
diff --git a/third_party/git/git_configure.bzl b/third_party/git/git_configure.bzl
index 47e2125854..8e2839bdc2 100644
--- a/third_party/git/git_configure.bzl
+++ b/third_party/git/git_configure.bzl
@@ -38,6 +38,11 @@ def _git_conf_impl(repository_ctx):
       Label("@org_tensorflow//tensorflow/tools/git:gen_git_source.py"))
   generated_files_path = repository_ctx.path("gen")
 
+  r = repository_ctx.execute(
+      ["test", "-f", "%s/.git/logs/HEAD" % tensorflow_root_path])
+  if r.return_code == 0:
+    unused_var = repository_ctx.path(Label("//:.git/HEAD")) # pylint: disable=unused-variable
+
   result = repository_ctx.execute([
       _get_python_bin(repository_ctx),
       python_script_path, "--configure", tensorflow_root_path,
diff --git a/third_party/toolchains/clang6/BUILD b/third_party/toolchains/clang6/BUILD
new file mode 100644
index 0000000000..ffd0fb0cdc
--- /dev/null
+++ b/third_party/toolchains/clang6/BUILD
@@ -0,0 +1 @@
+package(default_visibility = ["//visibility:public"])
diff --git a/third_party/toolchains/clang6/CROSSTOOL.tpl b/third_party/toolchains/clang6/CROSSTOOL.tpl
new file mode 100644
index 0000000000..6b7e5a8808
--- /dev/null
+++ b/third_party/toolchains/clang6/CROSSTOOL.tpl
@@ -0,0 +1,587 @@
+major_version: "v1"
+minor_version: "llvm:6.0.0"
+default_target_cpu: "k8"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "k8-clang-6.0-cxx-4.8-linux-gnu"
+}
+
+toolchain {
+  compiler: "clang6"         # bazel build --compiler=clang6
+  target_cpu: "k8"           # bazel build --cpu=k8
+  target_libc: "GLIBC_2.19"  # bazel build --glibc=GLIBC_2.19
+
+  abi_libc_version: "2.19"
+  abi_version: "gcc-4.8-cxx11"
+  builtin_sysroot: ""
+  cc_target_os: "linux-gnu"
+  default_python_version: "python2.7"
+  dynamic_runtimes_filegroup: "dynamic-runtime-libs-k8"
+  host_system_name: "x86_64-unknown-linux-gnu"
+  needsPic: true
+  static_runtimes_filegroup: "static-runtime-libs-k8"
+  supports_embedded_runtimes: true
+  supports_fission: true
+  supports_gold_linker: true
+  supports_incremental_linker: true
+  supports_interface_shared_objects: true
+  supports_normalizing_ar: true
+  supports_start_end_lib: true
+  supports_thin_archives: true
+  target_system_name: "x86_64-unknown-linux-gnu"
+  toolchain_identifier: "k8-clang-6.0-cxx-4.8-linux-gnu"
+
+  tool_path { name: "ar" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-ar" }
+  tool_path { name: "as" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-as" }
+  tool_path { name: "compat-ld" path: "%package(@local_config_clang6//clang6)%/llvm/bin/ld.lld" }
+  tool_path { name: "cpp" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-cpp" }
+  tool_path { name: "dwp" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-dwp" }
+  tool_path { name: "gcc" path: "%package(@local_config_clang6//clang6)%/llvm/bin/clang" }
+  tool_path { name: "gcov" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-cov" }
+  tool_path { name: "ld" path: "%package(@local_config_clang6//clang6)%/llvm/bin/ld.lld" }
+  tool_path { name: "llvm-profdata" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-profdata" }
+  tool_path { name: "nm" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-nm" }
+  tool_path { name: "objcopy" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-objcopy" }
+  tool_path { name: "objdump" path: "%package(@local_config_clang6//clang6)%/sbin/objdump" }
+  tool_path { name: "strip" path: "%package(@local_config_clang6//clang6)%/sbin/strip" }
+
+  unfiltered_cxx_flag: "-no-canonical-prefixes"
+
+  # Make C++ compilation deterministic. Use linkstamping instead of these
+  # compiler symbols.
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+
+  # This action_config makes features flags propagate
+  # to CC_FLAGS for genrules, and eventually skylark.
+  action_config {
+    action_name: "cc-flags-make-variable"
+    config_name: "cc-flags-make-variable"
+  }
+
+  # Security hardening on by default.
+  # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+  # We need to undef it before redefining it as some distributions now have
+  # it enabled by default.
+  compiler_flag: "-U_FORTIFY_SOURCE"
+  compiler_flag: "-D_FORTIFY_SOURCE=1"
+  compiler_flag: "-fstack-protector"
+  linker_flag: "-Wl,-z,relro,-z,now"
+
+  # This adds a little bit more durability to our Clang build.
+  #
+  # At the moment, this only only be needed for:
+  # - add_boringssl_s390x.patch: --Wa,--noexecstack
+  #
+  # Folks who do maintenance work on TF Bazel Clang should consider
+  # commenting out these lines, while doing that work, to gain a better
+  # understanding of what the intersection of support looks like between GCC
+  # and Clang. Please note that,  Bazel does not support
+  # -Xclang-only / -Xgcc-only.
+  compiler_flag: "-Wno-unknown-warning-option"
+  compiler_flag: "-Wno-unused-command-line-argument"
+  compiler_flag: "-Wno-ignored-optimization-argument"
+
+  #### Common compiler options. ####
+  compiler_flag: "-D_REENTRANT"
+  compiler_flag: "-D__STDC_FORMAT_MACROS"
+  compiler_flag: "-DSUPPRESS_USE_FILE_OFFSET64"
+  compiler_flag: "-Wall"
+  compiler_flag: "-Wformat-security"
+  compiler_flag: "-Wframe-larger-than=16384"
+  compiler_flag: "-Wno-char-subscripts"
+  compiler_flag: "-Wno-error=deprecated-declarations"
+  compiler_flag: "-Wno-uninitialized"
+  compiler_flag: "-Wno-sign-compare"
+  compiler_flag: "-Wno-strict-overflow"
+  compiler_flag: "-Wno-unused-function"
+  compiler_flag: "-fdiagnostics-show-option"
+  compiler_flag: "-fmessage-length=0"
+  compiler_flag: "-fno-exceptions"
+  compiler_flag: "-fno-omit-frame-pointer"
+  compiler_flag: "-fno-strict-aliasing"
+  compiler_flag: "-fno-use-init-array"
+  compiler_flag: "-funsigned-char"
+  compiler_flag: "-gmlt"
+  cxx_flag: "-Wno-deprecated"
+  cxx_flag: "-Wno-invalid-offsetof"  # Needed for protobuf code (2017-11-07)
+  cxx_flag: "-fshow-overloads=best"
+  compiler_flag: "-Wthread-safety-analysis"
+
+  # Python extensions unfortunately make this go wild.
+  compiler_flag: "-Wno-writable-strings"
+
+  # GCC's warning produces too many false positives:
+  cxx_flag: "-Woverloaded-virtual"
+  cxx_flag: "-Wnon-virtual-dtor"
+
+  # Enable coloring even if there's no attached terminal. Bazel removes the
+  # escape sequences if --nocolor is specified. This isn't supported by gcc
+  # on Ubuntu 14.04.
+  compiler_flag: "-fcolor-diagnostics"
+
+  # Disable some broken warnings from Clang.
+  compiler_flag: "-Wno-ambiguous-member-template"
+  compiler_flag: "-Wno-pointer-sign"
+
+  # These warnings have a low signal to noise ratio.
+  compiler_flag: "-Wno-reserved-user-defined-literal"
+  compiler_flag: "-Wno-return-type-c-linkage"
+  compiler_flag: "-Wno-invalid-source-encoding"
+
+  # Per default we switch off any layering related warnings.
+  compiler_flag: "-Wno-private-header"
+
+  # Clang-specific warnings that we explicitly enable for TensorFlow. Some of
+  # these aren't on by default, or under -Wall, or are subsets of warnings
+  # turned off above.
+  compiler_flag: "-Wfloat-overflow-conversion"
+  compiler_flag: "-Wfloat-zero-conversion"
+  compiler_flag: "-Wfor-loop-analysis"
+  compiler_flag: "-Wgnu-redeclared-enum"
+  compiler_flag: "-Winfinite-recursion"
+  compiler_flag: "-Wliteral-conversion"
+  compiler_flag: "-Wself-assign"
+  compiler_flag: "-Wstring-conversion"
+  compiler_flag: "-Wtautological-overlap-compare"
+  compiler_flag: "-Wunused-comparison"
+  compiler_flag: "-Wvla"
+  cxx_flag: "-Wdeprecated-increment-bool"
+
+  # Clang code-generation flags for performance optimization.
+  compiler_flag: "-faligned-allocation"
+  compiler_flag: "-fnew-alignment=8"
+
+  # Clang defaults to C99 while GCC defaults to C89. GCC plugins are written in
+  # C89 and don't have a BUILD rule we could add a copts flag to.
+  gcc_plugin_compiler_flag: "-std=gnu89"
+
+  compilation_mode_flags {
+    mode: FASTBUILD
+  }
+
+  compilation_mode_flags {
+    mode: DBG
+    compiler_flag: "-g"
+  }
+
+  compilation_mode_flags {
+    mode: OPT
+    compiler_flag: "-g0"
+    compiler_flag: "-fdebug-types-section"
+    compiler_flag: "-DNDEBUG"
+    compiler_flag: "-fno-split-dwarf-inlining"
+    compiler_flag: "-Os"
+    compiler_flag: "-fexperimental-new-pass-manager"
+    compiler_flag: "-fdebug-info-for-profiling"
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+    linker_flag: "-Wl,--gc-sections"
+    linker_flag: "-Wl,-z,relro,-z,now"
+  }
+
+  # Features indicating whether this is a host compile or not. Exactly one of
+  # these will be implicitly provided by bazel.
+  feature { name: "host" }
+  feature { name: "nonhost" }
+
+  # Features indicating which compiler will be used for code generation.
+  feature {
+    name: "llvm_codegen"
+    provides: "codegen"
+    enabled: true
+  }
+
+  # Features for compilation modes. Exactly one of these will be implicitly
+  # provided by bazel.
+  feature { name: "fastbuild" }
+  feature { name: "dbg" }
+  feature { name: "opt" }
+
+  # Features controlling the C++ language mode.
+  feature {
+    name: "c++11"
+    provides: "c++std"
+    flag_set {
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-nostdinc++"
+        flag: "-std=c++11"
+        flag: "-Wc++14-extensions"
+        flag: "-Wc++2a-extensions"
+        flag: "-Wno-binary-literal"
+      }
+    }
+  }
+  feature {
+    name: "c++14"
+    provides: "c++std"
+    flag_set {
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-nostdinc++"
+        flag: "-std=c++14"
+        flag: "-Wc++11-compat"
+        flag: "-Wno-c++11-compat-binary-literal"
+        flag: "-Wc++2a-extensions"
+      }
+    }
+  }
+  feature {
+    name: "c++17"
+    provides: "c++std"
+    flag_set {
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-nostdinc++"
+        flag: "-std=c++17"
+        flag: "-Wc++11-compat"
+        flag: "-Wno-c++11-compat-binary-literal"
+        flag: "-Wc++2a-extensions"
+      }
+    }
+  }
+  feature {
+    name: "c++2a"
+    provides: "c++std"
+    flag_set {
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-nostdinc++"
+        flag: "-std=c++2a"
+        flag: "-Wc++11-compat"
+        flag: "-Wno-c++11-compat-binary-literal"
+      }
+    }
+  }
+  feature {
+    name: "c++default"
+    enabled: true
+    flag_set {
+      # Provide the c++11 flags if no standard is selected
+      with_feature {
+        not_feature: "c++11"
+        not_feature: "c++14"
+        not_feature: "c++17"
+        not_feature: "c++2a"
+      }
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-nostdinc++"
+        flag: "-std=c++11"
+        flag: "-Wc++14-extensions"
+        flag: "-Wc++2a-extensions"
+        flag: "-Wno-binary-literal"
+      }
+    }
+  }
+
+  feature {
+    name: "use_compiler_rt"
+    requires { feature: "llvm_codegen" }
+    # TODO(saugustine): At the moment, "use_compiler_rt" also
+    # requires "linking_mode_flags { mode: FULLY_STATIC" ... },
+    # but that isn't a feature. We should probably convert it.
+    flag_set {
+      action: "c++-link"
+      action: "c++-link-interface-dynamic-library"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-executable"
+      # "link" is a misnomer for these actions. They are really just
+      # invocations of ar.
+      #action: "c++-link-pic-static-library"
+      #action: "c++-link-static-library"
+      #action: "c++-link-alwayslink-static-library"
+      #action: "c++-link-pic-static-library"
+      #action: "c++-link-alwayslink-pic-static-library"
+      flag_group {
+        flag: "-rtlib=compiler-rt"
+        flag: "-lunwind"
+      }
+    }
+  }
+
+  feature {
+    name: "pie"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "cc-flags-make-variable"
+      action: "lto-backend"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-mpie-copy-relocations"
+        flag: "-fPIE"
+      }
+    }
+    flag_set {
+      action: "cc-flags-make-variable"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+      }
+    }
+  }
+
+  # Pic must appear after pie, because pic may need to override pie, and bazel
+  # turns it on selectively. These don't interact with other options.
+  #
+  # TODO: In practice, normal vs pic vs pie is a ternary mode. We should
+  # implement it that way. This will require changes to bazel, which only
+  # calculates whether or not pic is needed, not pie.
+  #
+  # NOTE: Bazel might make this all a moot point.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-codegen"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      expand_if_all_available: "pic"
+      flag_group {
+        flag: "-fPIC"
+      }
+    }
+  }
+
+  feature {
+    name: "gold"
+    enabled: true
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-interface-dynamic-library"
+      flag_group {
+        expand_if_none_available: "lto"
+        flag: "-fuse-ld=gold"
+      }
+    }
+  }
+
+  # This is great if you want linking TensorFlow to take ten minutes.
+  feature {
+    name: "lto"
+    requires { feature: "nonhost" }
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-flto=thin"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-interface-dynamic-library"
+      flag_group {
+        flag: "-flto=thin"
+      }
+    }
+  }
+
+  feature {
+    name: "parse_headers"
+    flag_set {
+      action: "c++-header-parsing"
+      flag_group {
+        flag: "-xc++-header"
+        flag: "-fsyntax-only"
+      }
+    }
+  }
+
+  feature {
+    name: "preprocess_headers"
+    flag_set {
+      action: "c++-header-preprocessing"
+      flag_group {
+        flag: "-xc++"
+        flag: "-E"
+      }
+    }
+  }
+
+  feature {
+    name: "per_object_debug_info"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-codegen"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "lto-backend"
+      flag_group {
+        flag: "-gsplit-dwarf"
+        flag: "-ggnu-pubnames"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-interface-dynamic-library"
+      flag_group {
+        expand_if_all_available: "is_using_fission"
+        flag: "-Wl,--gdb-index"
+      }
+    }
+  }
+
+  feature {
+    name: "xray"
+    requires {
+      feature: "llvm_codegen"
+      feature: "nonhost"
+    }
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "c++-link-interface-dynamic-library"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-fxray-instrument"
+      }
+    }
+  }
+
+  feature {
+    name: "minimal_ubsan"
+    requires { feature: "llvm_codegen" }
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      flag_group {
+        flag: "-fsanitize=return,returns-nonnull-attribute,vla-bound,unreachable,float-cast-overflow"
+        flag: "-fsanitize-trap=all"
+        flag: "-DUNDEFINED_BEHAVIOR_SANITIZER"
+      }
+    }
+  }
+
+  feature {
+    name: "minimal_ubsan_enabled_by_default"
+    requires {
+      feature: "llvm_codegen"
+      feature: "fastbuild"
+    }
+    enabled: true
+    implies: "minimal_ubsan"
+  }
+
+  cxx_builtin_include_directory: "%package(@local_config_clang6//clang6)%/llvm/lib/clang/6.0.0/include"
+  cxx_builtin_include_directory: "/usr/include"
+
+  unfiltered_cxx_flag: "-cxx-isystem"
+  unfiltered_cxx_flag: "/usr/include/c++/4.8"
+  unfiltered_cxx_flag: "-cxx-isystem"
+  unfiltered_cxx_flag: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  unfiltered_cxx_flag: "-isystem"
+  unfiltered_cxx_flag: "%package(@local_config_clang6//clang6)%/llvm/lib/clang/6.0.0/include"
+  unfiltered_cxx_flag: "-isystem"
+  unfiltered_cxx_flag: "/usr/include/x86_64-linux-gnu"
+  unfiltered_cxx_flag: "-isystem"
+  unfiltered_cxx_flag: "/usr/include"
+
+  linker_flag: "-Wl,--build-id=md5"
+  linker_flag: "-Wl,--fatal-warnings"
+  linker_flag: "-Wl,--hash-style=gnu"
+  linker_flag: "-no-canonical-prefixes"
+  linker_flag: "--target=x86_64-unknown-linux-gnu"
+
+  linker_flag: "-L/usr/lib/gcc/x86_64-linux-gnu/4.8"
+
+  # This is the minimum x86 architecture TensorFlow supports.
+  compiler_flag: "-DARCH_K8"
+  compiler_flag: "-m64"
+
+  # These are for Linux.
+  ld_embed_flag: "-melf_x86_64"
+  linker_flag: "-Wl,--eh-frame-hdr"
+  linker_flag: "-Wl,-z,max-page-size=0x1000"
+
+  # Google never uses the stack like a heap, e.g. alloca(), because tcmalloc
+  # and jemalloc are so fast. However copts=["$(STACK_FRAME_UNLIMITED)"] can be
+  # specified when that can't be the case.
+  make_variable {
+    name: "STACK_FRAME_UNLIMITED"
+    value: "-Wframe-larger-than=100000000 -Wno-vla"
+  }
+
+  # These flags are for folks who build C/C++ code inside genrules.
+  make_variable {
+    name: "CC_FLAGS"
+    value: "-no-canonical-prefixes --target=x86_64-unknown-linux-gnu -fno-omit-frame-pointer -fno-tree-vrp -msse3"
+  }
+
+  feature {
+    name: "copts"
+    flag_set {
+      expand_if_all_available: "copts"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "lto-backend"
+      flag_group {
+        iterate_over: "copts"
+        flag: "%{copts}"
+      }
+    }
+  }
+
+  # Please do not statically link libstdc++. This would probably lead to a lot
+  # of bloat since OpKernels need to use linkstatic=1 because  b/27630669 and
+  # it could cause memory leaks since Python uses dlopen() on our libraries:
+  # https://stackoverflow.com/a/35015415
+  linker_flag: "-lstdc++"
+  linker_flag: "-lm"
+  linker_flag: "-lpthread"
+  linker_flag: "-l:/lib/x86_64-linux-gnu/libc-2.19.so"
+}
diff --git a/third_party/toolchains/clang6/README.md b/third_party/toolchains/clang6/README.md
new file mode 100644
index 0000000000..0c6be25a0e
--- /dev/null
+++ b/third_party/toolchains/clang6/README.md
@@ -0,0 +1,101 @@
+# TensorFlow Bazel Clang
+
+This is a specialized toolchain that uses an old Debian with a new Clang that
+can cross compile to any x86_64 microarchitecture. It's intended to build Linux
+binaries that only require the following ABIs:
+
+- GLIBC_2.18
+- CXXABI_1.3.7 (GCC 4.8.3)
+- GCC_4.2.0
+
+Which are available on at least the following Linux platforms:
+
+- Ubuntu 14+
+- CentOS 7+
+- Debian 8+
+- SuSE 13.2+
+- Mint 17.3+
+- Manjaro 0.8.11
+
+# System Install
+
+On Debian 8 (Jessie) Clang 6.0 can be installed as follows:
+
+```sh
+cat >>/etc/apt/sources.list <<'EOF'
+deb http://apt.llvm.org/jessie/ llvm-toolchain-jessie main
+deb-src http://apt.llvm.org/jessie/ llvm-toolchain-jessie main
+EOF
+wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -
+apt-key fingerprint |& grep '6084 F3CF 814B 57C1 CF12  EFD5 15CF 4D18 AF4F 7421'
+apt-get update
+apt-get install clang lld
+```
+
+# Bazel Configuration
+
+This toolchain can compile TensorFlow in 2m30s on a 96-core Skylake GCE VM if
+the following `.bazelrc` settings are added:
+
+```
+startup --host_jvm_args=-Xmx30G
+startup --host_jvm_args=-Xms30G
+startup --host_jvm_args=-XX:MaxNewSize=3g
+startup --host_jvm_args=-XX:-UseAdaptiveSizePolicy
+startup --host_jvm_args=-XX:+UseConcMarkSweepGC
+startup --host_jvm_args=-XX:TargetSurvivorRatio=70
+startup --host_jvm_args=-XX:SurvivorRatio=6
+startup --host_jvm_args=-XX:+UseCMSInitiatingOccupancyOnly
+startup --host_jvm_args=-XX:CMSFullGCsBeforeCompaction=1
+startup --host_jvm_args=-XX:CMSInitiatingOccupancyFraction=75
+
+build --jobs=100
+build --local_resources=200000,100,100
+build --crosstool_top=@local_config_clang6//clang6
+build --noexperimental_check_output_files
+build --nostamp
+build --config=opt
+build --noexperimental_check_output_files
+build --copt=-march=native
+build --host_copt=-march=native
+```
+
+# x86_64 Microarchitectures
+
+## Intel CPU Line
+
+- 2003 P6 M           SSE SSE2
+- 2004 prescott       SSE3 SSSE3 (-march=prescott)
+- 2006 core           X64 SSE4.1 (only on 45nm variety) (-march=core2)
+- 2008 nehalem        SSE4.2 VT-x VT-d (-march=nehalem)
+- 2010 westmere       CLMUL AES (-march=westmere)
+- 2012 sandybridge    AVX TXT (-march=sandybridge)
+- 2012 ivybridge      F16C MOVBE (-march=ivybridge)
+- 2013 haswell        AVX2 TSX BMI2 FMA (-march=haswell)
+- 2014 broadwell      RDSEED ADCX PREFETCHW (-march=broadwell - works on trusty gcc4.9)
+- 2015 skylake        SGX ADX MPX AVX-512[xeon-only] (-march=skylake / -march=skylake-avx512 - needs gcc7)
+- 2018 cannonlake     AVX-512 SHA (-march=cannonlake - needs clang5)
+
+## Intel Low Power CPU Line
+
+- 2013 silvermont     SSE4.1 SSE4.2 VT-x (-march=silvermont)
+- 2016 goldmont       SHA (-march=goldmont - needs clang5)
+
+## AMD CPU Line
+
+- 2003 k8             SSE SSE2 (-march=k8)
+- 2005 k8 (Venus)     SSE3 (-march=k8-sse3)
+- 2008 barcelona      SSE4a?! (-march=barcelona)
+- 2011 bulldozer      SSE4.1 SSE4.2 CLMUL AVX AES FMA4?! (-march=bdver1)
+- 2011 piledriver     FMA (-march=bdver2)
+- 2015 excavator      AVX2 BMI2 MOVBE (-march=bdver4)
+
+## Google Compute Engine Supported CPUs
+
+- 2012 sandybridge 2.6gHz -march=sandybridge
+- 2012 ivybridge   2.5gHz -march=ivybridge
+- 2013 haswell     2.3gHz -march=haswell
+- 2014 broadwell   2.2gHz -march=broadwell
+- 2015 skylake     2.0gHz -march=skylake-avx512
+
+See: <https://cloud.google.com/compute/docs/cpu-platforms>
diff --git a/third_party/toolchains/clang6/clang.BUILD b/third_party/toolchains/clang6/clang.BUILD
new file mode 100644
index 0000000000..802d62c17c
--- /dev/null
+++ b/third_party/toolchains/clang6/clang.BUILD
@@ -0,0 +1,162 @@
+package(default_visibility = ["//visibility:public"])
+
+# Please note that the output of these tools is unencumbered.
+licenses(["restricted"])  # NCSA, GPLv3 (e.g. gold)
+
+filegroup(
+    name = "ar",
+    srcs = ["llvm/bin/llvm-ar"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "as",
+    srcs = ["llvm/bin/llvm-as"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "cpp",
+    srcs = ["llvm/bin/llvm-cpp"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "dwp",
+    srcs = ["llvm/bin/llvm-dwp"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "gcc",
+    srcs = ["llvm/bin/clang"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "gcov",
+    srcs = ["llvm/bin/llvm-cov"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "ld",
+    srcs = ["llvm/bin/ld.lld"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "nm",
+    srcs = ["llvm/bin/llvm-nm"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "objcopy",
+    srcs = ["llvm/bin/llvm-objcopy"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "objdump",
+    srcs = ["llvm/bin/llvm-objdump"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "profdata",
+    srcs = ["llvm/bin/llvm-profdata"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "strip",
+    srcs = ["sbin/strip"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "xray",
+    srcs = ["llvm/bin/llvm-xray"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "includes",
+    srcs = glob(["llvm/lib/clang/6.0.0/include/**"]),
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "libraries",
+    srcs = glob([
+        "lib/*.*",
+        "lib/clang/6.0.0/lib/linux/*.*",
+    ]),
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "compiler_files",
+    srcs = [
+        ":as",
+        ":gcc",
+        ":includes",
+    ],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "linker_files",
+    srcs = [
+        ":ar",
+        ":ld",
+        ":libraries",
+    ],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = [
+        ":compiler_files",
+        ":dwp",
+        ":gcov",
+        ":linker_files",
+        ":nm",
+        ":objcopy",
+        ":objdump",
+        ":profdata",
+        ":strip",
+        ":xray",
+    ],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],  # bazel crashes without this
+    output_licenses = ["unencumbered"],
+)
+
+cc_toolchain_suite(
+    name = "clang6",
+    toolchains = {
+        "k8|clang6": ":clang6-k8",
+    },
+)
+
+cc_toolchain(
+    name = "clang6-k8",
+    all_files = ":all_files",
+    compiler_files = ":compiler_files",
+    cpu = "k8",
+    dwp_files = ":dwp",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":linker_files",
+    objcopy_files = ":objcopy",
+    output_licenses = ["unencumbered"],
+    static_runtime_libs = [":empty"],
+    strip_files = ":strip",
+    supports_param_files = 1,
+)
diff --git a/third_party/toolchains/clang6/repo.bzl b/third_party/toolchains/clang6/repo.bzl
new file mode 100644
index 0000000000..b81f44506f
--- /dev/null
+++ b/third_party/toolchains/clang6/repo.bzl
@@ -0,0 +1,30 @@
+"""Repository rule for Debian 8 Jessie Clang-6.0 portable Linux builds."""
+
+def _clang6_configure(ctx):
+  # TODO(jart): It'd probably be better to use Bazel's struct.to_proto()
+  #             method to generate a gigantic CROSSTOOL file that allows
+  #             Clang to support everything.
+  ctx.symlink(
+      ctx.os.environ.get('TF_LLVM_PATH',
+                         '/usr/lib/llvm-6.0'),
+      'clang6/llvm')
+  ctx.symlink(
+      ctx.os.environ.get('STRIP', '/usr/bin/strip'),
+      'clang6/sbin/strip')
+  ctx.symlink(
+      ctx.os.environ.get('OBJDUMP', '/usr/bin/objdump'),
+      'clang6/sbin/objdump')
+  ctx.symlink(ctx.attr._build, 'clang6/BUILD')
+  ctx.template('clang6/CROSSTOOL', ctx.attr._crosstool, {
+      '%package(@local_config_clang6//clang6)%': str(ctx.path('clang6')),
+  })
+
+clang6_configure = repository_rule(
+    implementation = _clang6_configure,
+    attrs = {
+        '_build': attr.label(
+            default=str(Label('//third_party/toolchains/clang6:clang.BUILD'))),
+        '_crosstool': attr.label(
+            default=str(Label('//third_party/toolchains/clang6:CROSSTOOL.tpl'))),
+    },
+)
-- 
GitLab


From 0d14f82da314f58bb7c9c4f5c866f1ffed056841 Mon Sep 17 00:00:00 2001
From: Loo Rong Jie <loorongjie@gmail.com>
Date: Thu, 25 Jan 2018 02:37:24 +0800
Subject: [PATCH 079/966] [Bazel/Windows] Don't use -Wl and -lm on Windows
 (#16341)

---
 tensorflow/tensorflow.bzl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 383c97344a..cbb64a3489 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -258,6 +258,8 @@ def _rpath_linkopts(name):
       clean_dep("//tensorflow:darwin"): [
           "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),),
       ],
+      clean_dep("//tensorflow:windows"): [],
+      clean_dep("//tensorflow:windows_msvc"): [],
       "//conditions:default": [
           "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),),
       ],
@@ -600,6 +602,8 @@ def tf_cc_test(name,
         "//tensorflow:android": [
             "-pie",
           ],
+        clean_dep("//tensorflow:windows"): [],
+        clean_dep("//tensorflow:windows_msvc"): [],
         "//conditions:default": [
             "-lpthread",
             "-lm"
@@ -1246,6 +1250,8 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
           "//conditions:default": [
               "-lm",
           ],
+          clean_dep("//tensorflow:windows"): [],
+          clean_dep("//tensorflow:windows_msvc"): [],
           clean_dep("//tensorflow:darwin"): [],
       }),)
 
-- 
GitLab


From 32e3acde7fd75c2a34fd10b6f11cd2df864e6e32 Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Wed, 24 Jan 2018 10:34:07 -0800
Subject: [PATCH 080/966] TFE: Add graph benchmark for linear regression, for
 comparison with our eager execution benchmark.

PiperOrigin-RevId: 183105403
---
 .../python/examples/linear_regression/BUILD   | 10 +++
 .../linear_regression/linear_regression.py    | 18 ++--
 .../linear_regression_graph_test.py           | 85 +++++++++++++++++++
 .../linear_regression_test.py                 |  8 +-
 4 files changed, 113 insertions(+), 8 deletions(-)
 create mode 100644 tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py

diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
index bab7ad0c70..f86331af6f 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
@@ -23,3 +23,13 @@ cuda_py_test(
         "//tensorflow:tensorflow_py",
     ],
 )
+
+cuda_py_test(
+    name = "linear_regression_graph_test",
+    size = "small",
+    srcs = ["linear_regression_graph_test.py"],
+    additional_deps = [
+        ":linear_regression",
+        "//tensorflow:tensorflow_py",
+    ],
+)
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
index f4b7d67f94..6ce4de6ee0 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
@@ -63,6 +63,10 @@ class LinearModel(tfe.Network):
     return self._hidden_layer(xs)
 
 
+def mean_square_loss(model, xs, ys):
+  return tf.reduce_mean(tf.square(model(xs) - ys))
+
+
 def fit(model, dataset, optimizer, verbose=False, logdir=None):
   """Fit the linear-regression model.
 
@@ -76,10 +80,8 @@ def fit(model, dataset, optimizer, verbose=False, logdir=None):
   """
 
   # The loss function to optimize.
-  def mean_square_loss(xs, ys):
-    return tf.reduce_mean(tf.square(model(xs) - ys))
-
-  loss_and_grads = tfe.implicit_value_and_gradients(mean_square_loss)
+  mse = lambda xs, ys: mean_square_loss(model, xs, ys)
+  loss_and_grads = tfe.implicit_value_and_gradients(mse)
 
   tf.train.get_or_create_global_step()
   if logdir:
@@ -103,14 +105,20 @@ def fit(model, dataset, optimizer, verbose=False, logdir=None):
 
 def synthetic_dataset(w, b, noise_level, batch_size, num_batches):
   """tf.data.Dataset that yields synthetic data for linear regression."""
+  return synthetic_dataset_helper(w, b,
+                                  tf.shape(w)[0], noise_level, batch_size,
+                                  num_batches)
+
 
+def synthetic_dataset_helper(w, b, num_features, noise_level, batch_size,
+                             num_batches):
   # w is a matrix with shape [N, M]
   # b is a vector with shape [M]
   # So:
   # - Generate x's as vectors with shape [batch_size N]
   # - y = tf.matmul(x, W) + b + noise
   def batch(_):
-    x = tf.random_normal([batch_size, tf.shape(w)[0]])
+    x = tf.random_normal([batch_size, num_features])
     y = tf.matmul(x, w) + b + noise_level * tf.random_normal([])
     return x, y
 
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
new file mode 100644
index 0000000000..557ad42752
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
@@ -0,0 +1,85 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Graph benchmark for linear regression, to contrast with eager execution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import tensorflow as tf
+from tensorflow.contrib.eager.python.examples.linear_regression import linear_regression
+
+
+class GraphLinearRegressionBenchmark(tf.test.Benchmark):
+
+  def benchmarkGraphLinearRegression(self):
+    num_epochs = 10
+    num_batches = 200
+    batch_size = 64
+    dataset = linear_regression.synthetic_dataset_helper(
+        w=tf.random_uniform([3, 1]),
+        b=tf.random_uniform([1]),
+        num_features=3,
+        noise_level=0.01,
+        batch_size=batch_size,
+        num_batches=num_batches)
+    iterator = dataset.make_initializable_iterator()
+    x, y = iterator.get_next()
+
+    model = linear_regression.LinearModel()
+
+    if tf.test.is_gpu_available():
+      use_gpu = True
+      device = "/device:GPU:0"
+    else:
+      use_gpu = False
+      device = "/device:CPU:0"
+
+    with tf.device(device):
+      loss = linear_regression.mean_square_loss(model, x, y)
+      optimization_step = tf.train.GradientDescentOptimizer(
+          learning_rate=0.1).minimize(loss)
+
+    with tf.Session() as sess:
+      sess.run(tf.global_variables_initializer())
+
+      def train(num_epochs):
+        for _ in range(num_epochs):
+          sess.run(iterator.initializer)
+          try:
+            while True:
+              _, _ = sess.run([optimization_step, loss])
+          except tf.errors.OutOfRangeError:
+            pass
+
+      # Warmup: a single epoch.
+      train(1)
+
+      start_time = time.time()
+      train(num_epochs)
+      wall_time = time.time() - start_time
+
+      examples_per_sec = num_epochs * num_batches * batch_size / wall_time
+      self.report_benchmark(
+          name="graph_train_%s" %
+          ("gpu" if use_gpu else "cpu"),
+          iters=num_epochs * num_batches,
+          extras={"examples_per_sec": examples_per_sec},
+          wall_time=wall_time)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py
index 39e7aabd7b..e53234b51a 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py
@@ -83,6 +83,7 @@ class LinearRegressionTest(tf.test.TestCase):
 class EagerLinearRegressionBenchmark(tf.test.Benchmark):
 
   def benchmarkEagerLinearRegression(self):
+    num_epochs = 10
     num_batches = 200
     batch_size = 64
     dataset = linear_regression.synthetic_dataset(
@@ -102,14 +103,15 @@ class EagerLinearRegressionBenchmark(tf.test.Benchmark):
       linear_regression.fit(model, burn_in_dataset, optimizer)
 
       start_time = time.time()
-      linear_regression.fit(model, dataset, optimizer)
+      for _ in range(num_epochs):
+        linear_regression.fit(model, dataset, optimizer)
       wall_time = time.time() - start_time
 
-      examples_per_sec = num_batches * batch_size / wall_time
+      examples_per_sec = num_epochs * num_batches * batch_size / wall_time
       self.report_benchmark(
           name="eager_train_%s" %
           ("gpu" if tfe.num_gpus() > 0 else "cpu"),
-          iters=num_batches,
+          iters=num_epochs * num_batches,
           extras={"examples_per_sec": examples_per_sec},
           wall_time=wall_time)
 
-- 
GitLab


From 4ce6decdbb8ca5e394d986bb327848dba767cc15 Mon Sep 17 00:00:00 2001
From: Cesc <Cescfangs@users.noreply.github.com>
Date: Thu, 25 Jan 2018 02:38:59 +0800
Subject: [PATCH 081/966] support preconditioner for `conjugated_gradient()` in
 `linear_equations.py` (#16237)

* support preconditioner for conjugated_gradient() in linear_equations.py

* support preconditioner for conjugated_gradient() in linear_equations.py

amend email to pass CLA check

* add jacobi, None test, insert np extra op when preconditioner=None

* using linalg_ops.norm() instead of util.l2norm_squared()
---
 .../kernel_tests/linear_equations_test.py     | 51 +++++++++++++-----
 .../solvers/python/kernel_tests/util_test.py  | 35 +++++++++++++
 .../solvers/python/ops/linear_equations.py    | 52 ++++++++++++++-----
 tensorflow/contrib/solvers/python/ops/util.py | 17 ++++++
 4 files changed, 129 insertions(+), 26 deletions(-)

diff --git a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
index 930df2414b..7b609ae96b 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
@@ -45,32 +45,55 @@ def _get_linear_equations_tests(dtype_, use_static_shape_, shape_):
         low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_)
     # Make a selfadjoint, positive definite.
     a_np = np.dot(a_np.T, a_np)
+    # jacobi preconditioner
+    jacobi_np = np.zeros_like(a_np)
+    jacobi_np[range(a_np.shape[0]), range(a_np.shape[1])] = (1.0 /
+                                                             a_np.diagonal())
     rhs_np = np.random.uniform(
         low=-1.0, high=1.0, size=shape_[0]).astype(dtype_)
+    x_np = np.zeros_like(rhs_np)
     tol = 1e-6 if dtype_ == np.float64 else 1e-3
     max_iter = 20
     with self.test_session() as sess:
       if use_static_shape_:
         a = constant_op.constant(a_np)
         rhs = constant_op.constant(rhs_np)
+        x = constant_op.constant(x_np)
+        jacobi = constant_op.constant(jacobi_np)
       else:
         a = array_ops.placeholder(dtype_)
         rhs = array_ops.placeholder(dtype_)
+        x = array_ops.placeholder(dtype_)
+        jacobi = array_ops.placeholder(dtype_)
       operator = util.create_operator(a)
-      cg_graph = linear_equations.conjugate_gradient(
-          operator, rhs, tol=tol, max_iter=max_iter)
-      if use_static_shape_:
-        cg_val = sess.run(cg_graph)
-      else:
-        cg_val = sess.run(cg_graph, feed_dict={a: a_np, rhs: rhs_np})
-      norm_r0 = np.linalg.norm(rhs_np)
-      norm_r = np.sqrt(cg_val.gamma)
-      self.assertLessEqual(norm_r, tol * norm_r0)
-      # Validate that we get an equally small residual norm with numpy
-      # using the computed solution.
-      r_np = rhs_np - np.dot(a_np, cg_val.x)
-      norm_r_np = np.linalg.norm(r_np)
-      self.assertLessEqual(norm_r_np, tol * norm_r0)
+      preconditioners = [None, util.identity_operator(a),
+                         util.create_operator(jacobi)]
+      cg_results = []
+      for preconditioner in preconditioners:
+        cg_graph = linear_equations.conjugate_gradient(
+            operator, rhs, preconditioner=preconditioner,
+            x=x, tol=tol, max_iter=max_iter)
+        if use_static_shape_:
+          cg_val = sess.run(cg_graph)
+        else:
+          cg_val = sess.run(cg_graph, feed_dict={a: a_np, rhs: rhs_np, x: x_np,
+                                                 jacobi: jacobi_np})
+        norm_r0 = np.linalg.norm(rhs_np)
+        norm_r = np.linalg.norm(cg_val.r)
+        self.assertLessEqual(norm_r, tol * norm_r0)
+        # Validate that we get an equally small residual norm with numpy
+        # using the computed solution.
+        r_np = rhs_np - np.dot(a_np, cg_val.x)
+        norm_r_np = np.linalg.norm(r_np)
+        self.assertLessEqual(norm_r_np, tol * norm_r0)
+        cg_results.append(cg_val)
+      # Validate that we get same results using identity_preconditioner
+      # and None
+      self.assertEqual(cg_results[0].i, cg_results[1].i)
+      self.assertAlmostEqual(cg_results[0].gamma, cg_results[1].gamma)
+      self.assertAllClose(cg_results[0].r, cg_results[1].r, rtol=tol)
+      self.assertAllClose(cg_results[0].x, cg_results[1].x, rtol=tol)
+      self.assertAllClose(cg_results[0].p, cg_results[1].p, rtol=tol)
 
   return [test_conjugate_gradient]
 
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/util_test.py b/tensorflow/contrib/solvers/python/kernel_tests/util_test.py
index 1566984b27..12e94369cb 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/util_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/util_test.py
@@ -63,6 +63,41 @@ class UtilTest(test.TestCase):
   def testCreateOperatorUnknownShape(self):
     self._testCreateOperator(False)
 
+  def _testIdentityOperator(self, use_static_shape_):
+    for dtype in np.float32, np.float64:
+      a_np = np.array([[1., 2.], [3., 4.], [5., 6.]], dtype=dtype)
+      x_np = np.array([[2.], [-3.]], dtype=dtype)
+      y_np = np.array([[2], [-3.], [5.]], dtype=dtype)
+      with self.test_session() as sess:
+        if use_static_shape_:
+          a = constant_op.constant(a_np, dtype=dtype)
+          x = constant_op.constant(x_np, dtype=dtype)
+          y = constant_op.constant(y_np, dtype=dtype)
+        else:
+          a = array_ops.placeholder(dtype)
+          x = array_ops.placeholder(dtype)
+          y = array_ops.placeholder(dtype)
+        id_op = util.identity_operator(a)
+        ax = id_op.apply(x)
+        aty = id_op.apply_adjoint(y)
+        op_shape = ops.convert_to_tensor(id_op.shape)
+        if use_static_shape_:
+          op_shape_val, ax_val, aty_val = sess.run([op_shape, ax, aty])
+        else:
+          op_shape_val, ax_val, aty_val = sess.run(
+              [op_shape, ax, aty], feed_dict={a: a_np,
+                                              x: x_np,
+                                              y: y_np})
+      self.assertAllEqual(op_shape_val, [3, 2])
+      self.assertAllClose(ax_val, x_np)
+      self.assertAllClose(aty_val, y_np)
+
+  def testIdentityOperator(self):
+    self._testIdentityOperator(True)
+
+  def testIdentityOperatorUnknownShape(self):
+    self._testIdentityOperator(False)
+
   def testL2Norm(self):
     with self.test_session():
       x_np = np.array([[2], [-3.], [5.]])
diff --git a/tensorflow/contrib/solvers/python/ops/linear_equations.py b/tensorflow/contrib/solvers/python/ops/linear_equations.py
index 8cba56eba6..4dfaa97ac9 100644
--- a/tensorflow/contrib/solvers/python/ops/linear_equations.py
+++ b/tensorflow/contrib/solvers/python/ops/linear_equations.py
@@ -27,10 +27,13 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import linalg_ops
 
 
 def conjugate_gradient(operator,
                        rhs,
+                       preconditioner=None,
+                       x=None,
                        tol=1e-4,
                        max_iter=20,
                        name="conjugate_gradient"):
@@ -55,6 +58,15 @@ def conjugate_gradient(operator,
         vector with the result of applying the operator to `x`, i.e. if
        `operator` represents matrix `A`, `apply` should return `A * x`.
     rhs: A rank-1 `Tensor` of shape `[N]` containing the right-hand size vector.
+    preconditioner: An object representing a linear operator, see `operator`
+      for detail. The preconditioner should approximate the inverse of `A`.
+      An efficient preconditioner could dramatically improve the rate of
+      convergence. If `preconditioner` represents matrix `M`(`M` approximates
+      `A^{-1}`), the algorithm uses `preconditioner.apply(x)` to estimate
+      `A^{-1}x`. For this to be useful, the cost of applying `M` should be
+      much lower than computing `A^{-1}` directly.
+    x: A rank-1 `Tensor` of shape `[N]` containing the initial guess for the
+      solution.
     tol: A float scalar convergence tolerance.
     max_iter: An integer giving the maximum number of iterations.
     name: A name scope for the operation.
@@ -65,35 +77,51 @@ def conjugate_gradient(operator,
       - x: A rank-1 `Tensor` of shape `[N]` containing the computed solution.
       - r: A rank-1 `Tensor` of shape `[M]` containing the residual vector.
       - p: A rank-1 `Tensor` of shape `[N]`. `A`-conjugate basis vector.
-      - gamma: \\(||r||_2^2\\)
+      - gamma: \\(r \dot M \dot r\\), equivalent to  \\(||r||_2^2\\) when
+        `preconditioner=None`.
   """
   # ephemeral class holding CG state.
   cg_state = collections.namedtuple("CGState", ["i", "x", "r", "p", "gamma"])
 
   def stopping_criterion(i, state):
-    return math_ops.logical_and(i < max_iter, state.gamma > tol)
+    return math_ops.logical_and(i < max_iter,
+                                linalg_ops.norm(state.r) > tol)
 
-  # TODO(rmlarsen): add preconditioning
   def cg_step(i, state):
     z = operator.apply(state.p)
     alpha = state.gamma / util.dot(state.p, z)
     x = state.x + alpha * state.p
     r = state.r - alpha * z
-    gamma = util.l2norm_squared(r)
-    beta = gamma / state.gamma
-    p = r + beta * state.p
+    if preconditioner is None:
+      gamma = util.dot(r, r)
+      beta = gamma / state.gamma
+      p = r + beta * state.p
+    else:
+      q = preconditioner.apply(r)
+      gamma = util.dot(r, q)
+      beta = gamma / state.gamma
+      p = q + beta * state.p
     return i + 1, cg_state(i + 1, x, r, p, gamma)
 
   with ops.name_scope(name):
     n = operator.shape[1:]
     rhs = array_ops.expand_dims(rhs, -1)
-    gamma0 = util.l2norm_squared(rhs)
-    tol = tol * tol * gamma0
-    x = array_ops.expand_dims(
-        array_ops.zeros(
-            n, dtype=rhs.dtype.base_dtype), -1)
+    if x is None:
+      x = array_ops.expand_dims(
+          array_ops.zeros(
+              n, dtype=rhs.dtype.base_dtype), -1)
+      r0 = rhs
+    else:
+      x = array_ops.expand_dims(x, -1)
+      r0 = rhs - operator.apply(x)
+    if preconditioner is None:
+      p0 = r0
+    else:
+      p0 = preconditioner.apply(r0)
+    gamma0 = util.dot(r0, p0)
+    tol = tol * linalg_ops.norm(r0)
     i = constant_op.constant(0, dtype=dtypes.int32)
-    state = cg_state(i=i, x=x, r=rhs, p=rhs, gamma=gamma0)
+    state = cg_state(i=i, x=x, r=r0, p=p0, gamma=gamma0)
     _, state = control_flow_ops.while_loop(stopping_criterion, cg_step,
                                            [i, state])
     return cg_state(
diff --git a/tensorflow/contrib/solvers/python/ops/util.py b/tensorflow/contrib/solvers/python/ops/util.py
index 777e0c185d..96947e8eea 100644
--- a/tensorflow/contrib/solvers/python/ops/util.py
+++ b/tensorflow/contrib/solvers/python/ops/util.py
@@ -45,6 +45,23 @@ def create_operator(matrix):
       apply_adjoint=lambda v: math_ops.matmul(matrix, v, adjoint_a=True))
 
 
+def identity_operator(matrix):
+  """Creates a linear operator from a rank-2 identity tensor."""
+
+  linear_operator = collections.namedtuple(
+      "LinearOperator", ["shape", "dtype", "apply", "apply_adjoint"])
+  shape = matrix.get_shape()
+  if shape.is_fully_defined():
+    shape = shape.as_list()
+  else:
+    shape = array_ops.shape(matrix)
+  return linear_operator(
+      shape=shape,
+      dtype=matrix.dtype,
+      apply=lambda v: v,
+      apply_adjoint=lambda v: v)
+
+
 # TODO(rmlarsen): Measure if we should just call matmul.
 def dot(x, y):
   return math_ops.reduce_sum(math_ops.conj(x) * y)
-- 
GitLab


From 18cb45c1cc056648cfc98311963a94cfe0f984b3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 10:35:22 -0800
Subject: [PATCH 082/966] Internal Change

PiperOrigin-RevId: 183105591
---
 .../lite/models/smartreply/predictor_test.cc  |   8 +-
 .../lite/models/speech_asr_am_model_test.cc   | 127 ------------------
 .../lite/models/speech_asr_lm_model_test.cc   | 122 -----------------
 .../models/speech_endpointer_model_test.cc    | 104 --------------
 .../lite/models/speech_hotword_model_test.cc  | 114 ----------------
 .../models/speech_speakerid_model_test.cc     | 121 -----------------
 .../lite/models/speech_tts_model_test.cc      | 116 ----------------
 tensorflow/contrib/lite/models/test_utils.h   |  84 ------------
 8 files changed, 7 insertions(+), 789 deletions(-)
 delete mode 100644 tensorflow/contrib/lite/models/speech_asr_am_model_test.cc
 delete mode 100644 tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc
 delete mode 100644 tensorflow/contrib/lite/models/speech_endpointer_model_test.cc
 delete mode 100644 tensorflow/contrib/lite/models/speech_hotword_model_test.cc
 delete mode 100644 tensorflow/contrib/lite/models/speech_speakerid_model_test.cc
 delete mode 100644 tensorflow/contrib/lite/models/speech_tts_model_test.cc
 delete mode 100644 tensorflow/contrib/lite/models/test_utils.h

diff --git a/tensorflow/contrib/lite/models/smartreply/predictor_test.cc b/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
index 97d3c650e2..e6c8d966f1 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
+++ b/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
@@ -22,8 +22,9 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
+//#include "tensorflow/contrib/lite/models/test_utils.h"
 #include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/core/platform/test.h"
 
 namespace tflite {
 namespace custom {
@@ -33,6 +34,11 @@ namespace {
 const char kModelName[] = "smartreply_ondevice_model.bin";
 const char kSamples[] = "smartreply_samples.tsv";
 
+string TestDataPath() {
+  return string(StrCat(tensorflow::testing::TensorFlowSrcRoot(), "/",
+                       "contrib/lite/models/testdata/"));
+}
+
 MATCHER_P(IncludeAnyResponesIn, expected_response, "contains the response") {
   bool has_expected_response = false;
   for (const auto &item : *arg) {
diff --git a/tensorflow/contrib/lite/models/speech_asr_am_model_test.cc b/tensorflow/contrib/lite/models/speech_asr_am_model_test.cc
deleted file mode 100644
index bf95b313f3..0000000000
--- a/tensorflow/contrib/lite/models/speech_asr_am_model_test.cc
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for speech ASR AM model using TFLite Ops.
-
-#include <string.h>
-
-#include <memory>
-#include <string>
-
-#include "base/logging.h"
-#include "file/base/path.h"
-#include "testing/base/public/googletest.h"
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
-
-namespace tflite {
-namespace models {
-
-constexpr int kModelInputTensor = 0;
-constexpr int kLstmLayer1OutputStateTensor = 19;
-constexpr int kLstmLayer1CellStateTensor = 20;
-constexpr int kLstmLayer2OutputStateTensor = 40;
-constexpr int kLstmLayer2CellStateTensor = 41;
-constexpr int kLstmLayer3OutputStateTensor = 61;
-constexpr int kLstmLayer3CellStateTensor = 62;
-constexpr int kLstmLayer4OutputStateTensor = 82;
-constexpr int kLstmLayer4CellStateTensor = 83;
-constexpr int kLstmLayer5OutputStateTensor = 103;
-constexpr int kLstmLayer5CellStateTensor = 104;
-constexpr int kModelOutputTensor = 109;
-
-TEST(SpeechAsrAm, RandomIOTest) {
-  // Read the model.
-  string tflite_file_path =
-      file::JoinPath(TestDataPath(), "speech_asr_am_model.tflite");
-  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
-  CHECK(model) << "Failed to mmap model " << tflite_file_path;
-
-  // Initialize the interpreter.
-  ops::builtin::BuiltinOpResolver builtins;
-  std::unique_ptr<Interpreter> interpreter;
-  InterpreterBuilder(*model, builtins)(&interpreter);
-  CHECK(interpreter != nullptr);
-  interpreter->AllocateTensors();
-
-  // Load the input frames.
-  Frames input_frames;
-  const string input_file_path =
-      file::JoinPath(TestDataPath(), "speech_asr_am_model_in.csv");
-  ReadFrames(input_file_path, &input_frames);
-
-  // Load the golden output results.
-  Frames output_frames;
-  const string output_file_path =
-      file::JoinPath(TestDataPath(), "speech_asr_am_model_out.csv");
-  ReadFrames(output_file_path, &output_frames);
-
-  const int speech_batch_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[0];
-  const int speech_input_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[1];
-  const int speech_output_size =
-      interpreter->tensor(kModelOutputTensor)->dims->data[1];
-
-  float* input_ptr = interpreter->tensor(kModelInputTensor)->data.f;
-  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
-
-  // Clear the LSTM state for layers.
-  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer4OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer4OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer4CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer4CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer5OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer5OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer5CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer5CellStateTensor)->bytes);
-
-
-  for (int i = 0; i < input_frames.size(); i++) {
-    // Feed the input to model.
-    int frame_ptr = 0;
-    for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
-      input_ptr[k] = input_frames[i][frame_ptr++];
-    }
-    // Run the model.
-    interpreter->Invoke();
-    // Validate the output.
-    for (int k = 0; k < speech_output_size; k++) {
-      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 5.2e-4);
-    }
-  }
-}
-
-}  // namespace models
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc b/tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc
deleted file mode 100644
index 53f2b66da4..0000000000
--- a/tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for speech ASR LM model using TFLite Ops.
-
-#include <string.h>
-
-#include <memory>
-#include <string>
-
-#include "base/logging.h"
-#include "file/base/path.h"
-#include "testing/base/public/googletest.h"
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
-
-namespace tflite {
-namespace models {
-
-constexpr int kModelInput1Tensor = 0;
-constexpr int kModelInput2Tensor = 66;
-constexpr int kLstmLayer1OutputStateTensor = 21;
-constexpr int kLstmLayer1CellStateTensor = 22;
-constexpr int kLstmLayer2OutputStateTensor = 42;
-constexpr int kLstmLayer2CellStateTensor = 43;
-constexpr int kLstmLayer3OutputStateTensor = 63;
-constexpr int kLstmLayer3CellStateTensor = 64;
-constexpr int kModelOutputTensor = 75;
-
-static void ClearLstmStates(Interpreter* interpreter) {
-  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
-}
-
-TEST(SpeechAsrLm, EndToEndTest) {
-  // Read the model.
-  string tflite_file_path =
-      file::JoinPath(TestDataPath(), "speech_asr_lm_model.tflite");
-  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
-  CHECK(model) << "Failed to mmap model " << tflite_file_path;
-
-  // Initialize the interpreter.
-  ops::builtin::BuiltinOpResolver builtins;
-  std::unique_ptr<Interpreter> interpreter;
-  InterpreterBuilder(*model, builtins)(&interpreter);
-  CHECK(interpreter != nullptr);
-  interpreter->AllocateTensors();
-
-  // Load the input frames.
-  Frames input_frames;
-  const string input_file_path =
-      file::JoinPath(TestDataPath(), "speech_asr_lm_model_in.csv");
-  ReadFrames(input_file_path, &input_frames);
-
-  // Load the golden output results.
-  Frames output_frames;
-  const string output_file_path =
-      file::JoinPath(TestDataPath(), "speech_asr_lm_model_out.csv");
-  ReadFrames(output_file_path, &output_frames);
-
-  CHECK_EQ(interpreter->tensor(kModelInput1Tensor)->dims->size, 1);
-  const int input1_size =
-      interpreter->tensor(kModelInput1Tensor)->dims->data[0];
-  CHECK_EQ(input1_size, 1);
-  CHECK_EQ(interpreter->tensor(kModelInput2Tensor)->dims->size, 1);
-  const int output_size =
-      interpreter->tensor(kModelOutputTensor)->dims->data[0];
-  CHECK_EQ(output_size, 1);
-
-  int* input_lookup_ptr = interpreter->tensor(kModelInput1Tensor)->data.i32;
-  int* output_lookup_ptr = interpreter->tensor(kModelInput2Tensor)->data.i32;
-  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
-
-
-  for (int i = 0; i < input_frames.size(); i++) {
-    float output_score = 0.0f;
-    // Reset LSTM states for each sequence.
-    ClearLstmStates(interpreter.get());
-    // For subsequent inputs feed them sequentially, one-by-one.
-    for (int k = 1; k < input_frames[i].size(); k++) {
-      // Feed the inputs to model.
-      input_lookup_ptr[0] = static_cast<int32>(input_frames[i][k - 1]);
-      output_lookup_ptr[0] = static_cast<int32>(input_frames[i][k]);
-      // Run the model.
-      interpreter->Invoke();
-      // Sum up the outputs.
-      output_score += output_ptr[0];
-    }
-    // Validate the output.
-    ASSERT_NEAR(output_score, output_frames[i][0], 1.4e-5);
-  }
-}
-
-}  // namespace models
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_endpointer_model_test.cc b/tensorflow/contrib/lite/models/speech_endpointer_model_test.cc
deleted file mode 100644
index f7e136113a..0000000000
--- a/tensorflow/contrib/lite/models/speech_endpointer_model_test.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for speech EndPointer model using TFLite Ops.
-
-#include <string.h>
-
-#include <memory>
-#include <string>
-
-#include "base/logging.h"
-#include "testing/base/public/googletest.h"
-#include <gtest/gtest.h>
-#include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
-
-namespace tflite {
-namespace models {
-
-constexpr int kModelInputTensor = 0;
-constexpr int kLstmLayer1OutputStateTensor = 28;
-constexpr int kLstmLayer1CellStateTensor = 29;
-constexpr int kLstmLayer2OutputStateTensor = 49;
-constexpr int kLstmLayer2CellStateTensor = 50;
-constexpr int kModelOutputTensor = 58;
-
-TEST(SpeechEndpointer, EndpointerTest) {
-  // Read the model.
-  string tflite_file_path =
-      StrCat(TestDataPath(), "/", "speech_endpointer_model.tflite");
-  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
-  CHECK(model) << "Failed to read model from file " << tflite_file_path;
-
-  // Initialize the interpreter.
-  ops::builtin::BuiltinOpResolver builtins;
-  std::unique_ptr<Interpreter> interpreter;
-  InterpreterBuilder(*model, builtins)(&interpreter);
-  CHECK(interpreter != nullptr);
-  interpreter->AllocateTensors();
-
-  // Load the input frames.
-  Frames input_frames;
-  const string input_file_path =
-      StrCat(TestDataPath(), "/", "speech_endpointer_model_in.csv");
-  ReadFrames(input_file_path, &input_frames);
-
-  // Load the golden output results.
-  Frames output_frames;
-  const string output_file_path =
-      StrCat(TestDataPath(), "/", "speech_endpointer_model_out.csv");
-  ReadFrames(output_file_path, &output_frames);
-
-  const int speech_batch_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[0];
-  const int speech_input_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[1];
-  const int speech_output_size =
-      interpreter->tensor(kModelOutputTensor)->dims->data[1];
-
-  float* input_ptr = interpreter->tensor(kModelInputTensor)->data.f;
-  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
-
-  // Clear the LSTM state for layers.
-  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
-
-  for (int i = 0; i < input_frames.size(); i++) {
-    // Feed the input to model.
-    int frame_ptr = 0;
-    for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
-      input_ptr[k] = input_frames[i][frame_ptr++];
-    }
-    // Run the model.
-    interpreter->Invoke();
-    // Validate the output.
-    for (int k = 0; k < speech_output_size; k++) {
-      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 1e-5);
-    }
-  }
-}
-
-}  // namespace models
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_hotword_model_test.cc b/tensorflow/contrib/lite/models/speech_hotword_model_test.cc
deleted file mode 100644
index f69cae8d2c..0000000000
--- a/tensorflow/contrib/lite/models/speech_hotword_model_test.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for speech Hotword model using TFLite Ops.
-
-#include <string.h>
-
-#include <memory>
-#include <string>
-
-#include "base/logging.h"
-#include "testing/base/public/googletest.h"
-#include <gtest/gtest.h>
-#include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
-
-namespace tflite {
-namespace models {
-
-void RunTest(int model_input_tensor, int svdf_layer_state_tensor,
-             int model_output_tensor, const string& model_name,
-             const string& golden_in_name, const string& golden_out_name) {
-  // Read the model.
-  string tflite_file_path = StrCat(TestDataPath(), "/", model_name);
-  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
-  CHECK(model) << "Failed to read model from file " << tflite_file_path;
-
-  // Initialize the interpreter.
-  ops::builtin::BuiltinOpResolver builtins;
-  std::unique_ptr<Interpreter> interpreter;
-  InterpreterBuilder(*model, builtins)(&interpreter);
-  CHECK(interpreter != nullptr);
-  interpreter->AllocateTensors();
-
-  // Reset the SVDF layer state.
-  memset(interpreter->tensor(svdf_layer_state_tensor)->data.raw, 0,
-         interpreter->tensor(svdf_layer_state_tensor)->bytes);
-
-  // Load the input frames.
-  Frames input_frames;
-  const string input_file_path = StrCat(TestDataPath(), "/", golden_in_name);
-  ReadFrames(input_file_path, &input_frames);
-
-  // Load the golden output results.
-  Frames output_frames;
-  const string output_file_path = StrCat(TestDataPath(), "/", golden_out_name);
-  ReadFrames(output_file_path, &output_frames);
-
-  const int speech_batch_size =
-      interpreter->tensor(model_input_tensor)->dims->data[0];
-  const int speech_input_size =
-      interpreter->tensor(model_input_tensor)->dims->data[1];
-  const int speech_output_size =
-      interpreter->tensor(model_output_tensor)->dims->data[1];
-  const int input_sequence_size =
-      input_frames[0].size() / (speech_input_size * speech_batch_size);
-  float* input_ptr = interpreter->tensor(model_input_tensor)->data.f;
-  float* output_ptr = interpreter->tensor(model_output_tensor)->data.f;
-
-  // The first layer (SVDF) input size is 40 (speech_input_size). Each speech
-  // input frames for this model is 1600 floats, which can be fed to input in a
-  // sequence of size 40 (input_sequence_size).
-  for (int i = 0; i < TestInputSize(input_frames); i++) {
-    int frame_ptr = 0;
-    for (int s = 0; s < input_sequence_size; s++) {
-      for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
-        input_ptr[k] = input_frames[i][frame_ptr++];
-      }
-      interpreter->Invoke();
-    }
-    // After the whole frame (1280 floats) is fed, we can check the output frame
-    // matches with the golden output frame.
-    for (int k = 0; k < speech_output_size; k++) {
-      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 1e-5);
-    }
-  }
-}
-
-TEST(SpeechHotword, OkGoogleTestRank1) {
-  constexpr int kModelInputTensor = 0;
-  constexpr int kSvdfLayerStateTensor = 4;
-  constexpr int kModelOutputTensor = 18;
-
-  RunTest(kModelInputTensor, kSvdfLayerStateTensor, kModelOutputTensor,
-          "speech_hotword_model_rank1.tflite", "speech_hotword_model_in.csv",
-          "speech_hotword_model_out_rank1.csv");
-}
-
-TEST(SpeechHotword, OkGoogleTestRank2) {
-  constexpr int kModelInputTensor = 17;
-  constexpr int kSvdfLayerStateTensor = 1;
-  constexpr int kModelOutputTensor = 18;
-  RunTest(kModelInputTensor, kSvdfLayerStateTensor, kModelOutputTensor,
-          "speech_hotword_model_rank2.tflite", "speech_hotword_model_in.csv",
-          "speech_hotword_model_out_rank2.csv");
-}
-
-}  // namespace models
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc b/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc
deleted file mode 100644
index e208fac8df..0000000000
--- a/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for speech SpeakerId model using TFLite Ops.
-
-#include <string.h>
-
-#include <memory>
-#include <string>
-
-#include "base/logging.h"
-#include "testing/base/public/googletest.h"
-#include <gtest/gtest.h>
-#include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
-#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
-
-void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
-
-namespace tflite {
-namespace models {
-
-constexpr int kModelInputTensor = 0;
-constexpr int kLstmLayer1OutputStateTensor = 19;
-constexpr int kLstmLayer1CellStateTensor = 20;
-constexpr int kLstmLayer2OutputStateTensor = 40;
-constexpr int kLstmLayer2CellStateTensor = 41;
-constexpr int kLstmLayer3OutputStateTensor = 61;
-constexpr int kLstmLayer3CellStateTensor = 62;
-constexpr int kModelOutputTensor = 66;
-
-void SpeakerIdTest(bool useNNAPI) {
-  // Read the model.
-  string tflite_file_path =
-      StrCat(TestDataPath(), "/", "speech_speakerid_model.tflite");
-  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
-  CHECK(model) << "Failed to read model from file " << tflite_file_path;
-
-  // Initialize the interpreter.
-  ::tflite::MutableOpResolver resolver;
-  RegisterSelectedOps(&resolver);
-  std::unique_ptr<Interpreter> interpreter;
-  InterpreterBuilder(*model, resolver)(&interpreter);
-  CHECK(interpreter != nullptr);
-
-  interpreter->UseNNAPI(useNNAPI);
-
-  interpreter->AllocateTensors();
-
-  // Load the input frames.
-  Frames input_frames;
-  const string input_file_path =
-      StrCat(TestDataPath(), "/", "speech_speakerid_model_in.csv");
-  ReadFrames(input_file_path, &input_frames);
-
-  // Load the golden output results.
-  Frames output_frames;
-  const string output_file_path =
-      StrCat(TestDataPath(), "/", "speech_speakerid_model_out.csv");
-  ReadFrames(output_file_path, &output_frames);
-
-  const int speech_batch_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[0];
-  const int speech_input_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[1];
-  const int speech_output_size =
-      interpreter->tensor(kModelOutputTensor)->dims->data[1];
-
-  float* input_ptr = interpreter->tensor(kModelInputTensor)->data.f;
-  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
-
-  // Clear the LSTM state for layers.
-  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
-  for (int i = 0; i < input_frames.size(); i++) {
-    // Feed the input to model.
-    int frame_ptr = 0;
-    for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
-      input_ptr[k] = input_frames[i][frame_ptr++];
-    }
-    // Run the model.
-    interpreter->Invoke();
-    // Validate the output.
-    for (int k = 0; k < speech_output_size; k++) {
-      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 1e-5);
-    }
-  }
-}
-
-TEST(SpeechSpeakerId, OkGoogleTest) { SpeakerIdTest(false); }
-
-TEST(SpeechSpeakerId, OkGoogleTestUsingNNAPI) { SpeakerIdTest(true); }
-
-}  // namespace models
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_tts_model_test.cc b/tensorflow/contrib/lite/models/speech_tts_model_test.cc
deleted file mode 100644
index 8829177689..0000000000
--- a/tensorflow/contrib/lite/models/speech_tts_model_test.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for speech TTS model using TFLite Ops.
-
-#include <string.h>
-
-#include <memory>
-#include <string>
-
-#include "base/logging.h"
-#include "testing/base/public/googletest.h"
-#include <gtest/gtest.h>
-#include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
-
-namespace tflite {
-namespace models {
-
-constexpr int kModelInputTensor = 0;
-constexpr int kLstmLayer1OutputStateTensor = 25;
-constexpr int kLstmLayer1CellStateTensor = 26;
-constexpr int kLstmLayer2OutputStateTensor = 46;
-constexpr int kLstmLayer2CellStateTensor = 47;
-constexpr int kLstmLayer3OutputStateTensor = 67;
-constexpr int kLstmLayer3CellStateTensor = 68;
-constexpr int kRnnLayerHiddenStateTensor = 73;
-constexpr int kModelOutputTensor = 74;
-
-TEST(SpeechTTS, RandomIOTest) {
-  // Read the model.
-  string tflite_file_path =
-      StrCat(TestDataPath(), "/", "speech_tts_model.tflite");
-  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
-  CHECK(model) << "Failed to mmap model " << tflite_file_path;
-
-  // Initialize the interpreter.
-  ops::builtin::BuiltinOpResolver builtins;
-  std::unique_ptr<Interpreter> interpreter;
-  InterpreterBuilder(*model, builtins)(&interpreter);
-  CHECK(interpreter != nullptr);
-  interpreter->AllocateTensors();
-
-  // Load the input frames.
-  Frames input_frames;
-  const string input_file_path =
-      StrCat(TestDataPath(), "/", "speech_tts_model_in.csv");
-  ReadFrames(input_file_path, &input_frames);
-
-  // Load the golden output results.
-  Frames output_frames;
-  const string output_file_path =
-      StrCat(TestDataPath(), "/", "speech_tts_model_out.csv");
-  ReadFrames(output_file_path, &output_frames);
-
-  const int speech_batch_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[0];
-  const int speech_input_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[1];
-  const int speech_output_size =
-      interpreter->tensor(kModelOutputTensor)->dims->data[1];
-
-  float* input_ptr = interpreter->tensor(kModelInputTensor)->data.f;
-  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
-
-  // Clear the LSTM state for layers.
-  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kRnnLayerHiddenStateTensor)->data.raw, 0,
-         interpreter->tensor(kRnnLayerHiddenStateTensor)->bytes);
-
-  for (int i = 0; i < input_frames.size(); i++) {
-    // Feed the input to model.
-    int frame_ptr = 0;
-    for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
-      input_ptr[k] = input_frames[i][frame_ptr++];
-    }
-    // Run the model.
-    interpreter->Invoke();
-    // Validate the output.
-    for (int k = 0; k < speech_output_size; k++) {
-      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 1e-5);
-    }
-  }
-}
-
-}  // namespace models
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/test_utils.h b/tensorflow/contrib/lite/models/test_utils.h
deleted file mode 100644
index 1e14c26a35..0000000000
--- a/tensorflow/contrib/lite/models/test_utils.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_TEST_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_TEST_UTILS_H_
-
-#include <stdlib.h>
-#include <string.h>
-
-#include <fstream>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace tflite {
-namespace models {
-using Frames = std::vector<std::vector<float>>;
-}  // namespace models
-}  // namespace tflite
-
-#ifndef __ANDROID__
-#include "absl/strings/str_cat.h"
-#include "tensorflow/core/platform/test.h"
-
-inline string TestDataPath() {
-  return string(StrCat(tensorflow::testing::TensorFlowSrcRoot(), "/",
-                       "contrib/lite/models/testdata/"));
-}
-inline int TestInputSize(const tflite::models::Frames& input_frames) {
-  return input_frames.size();
-}
-#else
-inline string TestDataPath() {
-  return string("third_party/tensorflow/contrib/lite/models/testdata/");
-}
-
-inline int TestInputSize(const tflite::models::Frames& input_frames) {
-  // Android TAP is very slow, we only test the first 20 frames.
-  return 20;
-}
-#endif
-
-namespace tflite {
-namespace models {
-
-// Read float data from a comma-separated file:
-// Each line will be read into a float vector.
-// The return result will be a vector of float vectors.
-void ReadFrames(const string& csv_file_path, Frames* frames) {
-  std::ifstream csv_file(csv_file_path);
-  string line;
-  while (std::getline(csv_file, line, '\n')) {
-    std::vector<float> fields;
-    // Used by strtok_r internaly for successive calls on the same string.
-    char* save_ptr = nullptr;
-
-    // Tokenize the line.
-    char* next_token =
-        strtok_r(const_cast<char*>(line.c_str()), ",", &save_ptr);
-    while (next_token != nullptr) {
-      float f = strtod(next_token, nullptr);
-      fields.push_back(f);
-      next_token = strtok_r(nullptr, ",", &save_ptr);
-    }
-    frames->push_back(fields);
-  }
-  csv_file.close();
-}
-
-}  // namespace models
-}  // namespace tflite
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_TEST_UTILS_H_
-- 
GitLab


From a68bf328ff6d4261203f2aa723d77174a771a0ec Mon Sep 17 00:00:00 2001
From: brett koonce <koonce@hello.com>
Date: Wed, 24 Jan 2018 10:45:52 -0800
Subject: [PATCH 083/966] minor spelling tweaks for eager execution docs
 (#16355)

---
 tensorflow/contrib/eager/python/evaluator.py           |  2 +-
 .../contrib/eager/python/examples/resnet50/README.md   |  2 +-
 .../contrib/eager/python/examples/resnet50/resnet50.py |  2 +-
 .../contrib/eager/python/examples/rnn_ptb/README.md    |  2 +-
 .../contrib/eager/python/examples/rnn_ptb/rnn_ptb.py   |  4 ++--
 tensorflow/contrib/eager/python/examples/spinn/data.py | 10 +++++-----
 tensorflow/contrib/eager/python/network_test.py        |  2 +-
 tensorflow/contrib/eager/python/saver.py               |  2 +-
 8 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index 3faaeef590..68e7b5421f 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -178,7 +178,7 @@ class Evaluator(object):
       call_op: An op that updates evaluation state on a mini-batch of examples.
         Must generate an tf.errors.OutOfRangeError when done.
       results_op: A dictionary of tensors that compute the final evaluation
-        results from the evaulation state.
+        results from the evaluation state.
       sess: The Session to run the evaluation in. Defaults to the default
         Session.
 
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/README.md b/tensorflow/contrib/eager/python/examples/resnet50/README.md
index db023e6c97..79e4600529 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/README.md
+++ b/tensorflow/contrib/eager/python/examples/resnet50/README.md
@@ -34,7 +34,7 @@ bazel run -c opt --config=cuda :resnet50_graph_test -- --benchmarks=.
 
 (Or remove the `--config=cuda` flag for running on CPU instead of GPU).
 
-On October 31, 2017, the benchmarks demostrated comparable performance
+On October 31, 2017, the benchmarks demonstrated comparable performance
 for eager and graph execution of this particular model when using
 a single NVIDIA Titan X (Pascal) GPU on a host with an
 Intel Xeon E5-1650 CPU @ 3.50GHz and a batch size of 32.
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
index b302a87e0e..9982fdb07e 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
@@ -97,7 +97,7 @@ class _ConvBlock(tfe.Network):
 
   Args:
       kernel_size: the kernel size of middle conv layer at main path
-      filters: list of integers, the filterss of 3 conv layer at main path
+      filters: list of integers, the filters of 3 conv layer at main path
       stage: integer, current stage label, used for generating layer names
       block: 'a','b'..., current block label, used for generating layer names
       data_format: data_format for the input ('channels_first' or
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md b/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
index 743ebb68ee..966177e91c 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
@@ -40,7 +40,7 @@ bazel run -c opt --config=cuda :rnn_ptb_graph_test -- --benchmarks=.
 
 (Or remove the `--config=cuda` flag for running on CPU instead of GPU).
 
-On October 31, 2017, the benchmarks demostrated slightly better performance
+On October 31, 2017, the benchmarks demonstrated slightly better performance
 (3-6%) for graph execution over eager execution for this particular model when
 using a single NVIDIA Titan X (Pascal) GPU on a host with an Intel Xeon E5-1650
 CPU @ 3.50GHz and a batch size of 32.
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index 7b9637a9d5..d34e9ea68b 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -88,7 +88,7 @@ class Embedding(tf.layers.Layer):
 
 
 class PTBModel(tfe.Network):
-  """LSTM for word language modelling.
+  """LSTM for word language modeling.
 
   Model described in:
   (Zaremba, et. al.) Recurrent Neural Network Regularization
@@ -340,7 +340,7 @@ if __name__ == "__main__":
   parser.add_argument(
       "--logdir", type=str, default="", help="Directory for checkpoint.")
   parser.add_argument(
-      "--epoch", type=int, default=20, help="Number of epoches.")
+      "--epoch", type=int, default=20, help="Number of epochs.")
   parser.add_argument("--batch-size", type=int, default=20, help="Batch size.")
   parser.add_argument(
       "--seq-len", type=int, default=35, help="Sequence length.")
diff --git a/tensorflow/contrib/eager/python/examples/spinn/data.py b/tensorflow/contrib/eager/python/examples/spinn/data.py
index a6e046320f..fcaae0a4f8 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/data.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/data.py
@@ -51,11 +51,11 @@ def get_non_parenthesis_words(items):
   """Get the non-parenthesis items from a SNLI parsed sentence.
 
   Args:
-    items: Data items from a parsed SNLI setence, with parentheses. E.g.,
+    items: Data items from a parsed SNLI sentence, with parentheses. E.g.,
       ["(", "Man", "(", "(", "(", "(", "(", "wearing", "pass", ")", ...
 
   Returns:
-    A list of non-parenthis word items, all converted to lower case. E.g.,
+    A list of non-parentheses word items, all converted to lower case. E.g.,
       ["man", "wearing", "pass", ...
   """
   return [x.lower() for x in items if x not in PARENTHESES and x]
@@ -201,7 +201,7 @@ def load_word_vectors(data_root, vocab):
 
 
 def calculate_bins(length2count, min_bin_size):
-  """Cacluate bin boundaries given a histogram of lengths and mininum bin size.
+  """Calculate bin boundaries given a histogram of lengths and minimum bin size.
 
   Args:
     length2count: A `dict` mapping length to sentence count.
@@ -335,9 +335,9 @@ class SnliData(object):
         # The sorting above and the batching here makes sure that sentences of
         # similar max lengths are batched together, minimizing the inefficiency
         # due to uneven max lengths. The sentences are batched differently in
-        # each call to get_generator() due to the shuffling before sotring
+        # each call to get_generator() due to the shuffling before sorting
         # above. The pad_and_reverse_word_ids() and pad_transitions() functions
-        # take care of any remaning unevenness of the max sentence lengths.
+        # take care of any remaining unevenness of the max sentence lengths.
         end = min(begin + batch_size, len(labels))
         # Transpose, because the SPINN model requires time-major, instead of
         # batch-major.
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index 8e6b947e5c..81c77e41ac 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -688,7 +688,7 @@ class NetworkTest(test.TestCase):
     net2(one)
     # Layer names typically are globally unique rather than being unique within
     # the scope of their first use. However, within a Network they must be named
-    # locally so that previous Layer consutrciton does not interfere with
+    # locally so that previous Layer construction does not interfere with
     # variable naming (e.g. add a Layer construction before the Network,
     # suddenly your previously saved checkpoint is incompatible).
     self.assertEqual("dense", net1.l1.name)
diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py
index 57b070ec6e..62421849c7 100644
--- a/tensorflow/contrib/eager/python/saver.py
+++ b/tensorflow/contrib/eager/python/saver.py
@@ -82,7 +82,7 @@ def restore_variables_on_create(save_path, map_func=None):
       map_func_wrapper = lambda self, x: x
     else:
       if not callable(map_func):
-        raise ValueError("map_func must be callaled.")
+        raise ValueError("map_func must be callable.")
       map_func_wrapper = lambda self, x: map_func(x)
 
     ckpt_var_cache = dict()
-- 
GitLab


From 44a4674d196b27b1e1f998de910af945a25b2c52 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Wed, 24 Jan 2018 10:56:37 -0800
Subject: [PATCH 084/966] Revert #15967. Make pip package size too large.

---
 tensorflow/tools/graph_transforms/BUILD       | 12 ------
 .../graph_transforms_wrapper.py               | 39 -------------------
 tensorflow/tools/pip_package/BUILD            |  2 -
 .../tools/pip_package/build_pip_package.sh    |  2 -
 tensorflow/tools/pip_package/setup.py         |  1 -
 5 files changed, 56 deletions(-)
 delete mode 100644 tensorflow/tools/graph_transforms/graph_transforms_wrapper.py

diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index 31c4349431..b5465b7fb3 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -317,18 +317,6 @@ tf_py_test(
     main = "python/transform_graph_test.py",
 )
 
-py_binary(
-    name = "graph_transforms_wrapper",
-    srcs = ["graph_transforms_wrapper.py"],
-    srcs_version = "PY2AND3",
-    data = [
-        ":transform_graph",
-    ],
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/tools/graph_transforms/graph_transforms_wrapper.py b/tensorflow/tools/graph_transforms/graph_transforms_wrapper.py
deleted file mode 100644
index e12feddba2..0000000000
--- a/tensorflow/tools/graph_transforms/graph_transforms_wrapper.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Wrapper for runninmg graph_transform binary embedded in pip site-package.
-NOTE: this mainly exists since PIP setup.py cannot install binaries to bin/.
-It can only install Python "console-scripts." This will work as a console
-script. See tools/pip_package/setup.py (search for CONSOLE_SCRIPTS).
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import sys
-import tensorflow as tf
-
-
-def main():
-  # Pip installs the binary in aux-bin off of main site-package install.
-  # Just find it and exec, passing all arguments in the process.
-  pip_binary = os.path.join(tf.__path__[0], 'aux-bin/transform_graph')
-  bazel_binary = ('bazel-bin/tensorflow/tools/graph_transforms/'
-                  'graph_transforms_wrapper.runfiles/org_tensorflow/'
-                  'tensorflow/tools/graph_transforms/transform_graph')
-  binary = bazel_binary if os.path.isfile(bazel_binary) else pip_binary
-  os.execvp(binary, sys.argv)
-
-main()
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index d61ddb889c..c789e2ba0c 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -179,8 +179,6 @@ sh_binary(
             "//tensorflow/python/tools:tools_pip",
             "//tensorflow/python:test_ops",
             "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
-            "//tensorflow/tools/graph_transforms:graph_transforms_wrapper",
-            "//tensorflow/tools/graph_transforms:transform_graph",
         ],
     }) + if_mkl(["//third_party/mkl:intel_binary_blob"]),
 )
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 32b8c54a56..dc31e4c5f7 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -140,8 +140,6 @@ function main() {
     mkdir "${TMPDIR}/tensorflow/aux-bin"
     # Install toco as a binary in aux-bin.
     cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
-    # Install graph transform tool as a bianry in aux-bin
-    cp bazel-bin/tensorflow/tools/graph_transforms/transform_graph ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 0034fe447c..62df6453fb 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -74,7 +74,6 @@ CONSOLE_SCRIPTS = [
     'freeze_graph = tensorflow.python.tools.freeze_graph:main',
     'toco_from_protos = tensorflow.contrib.lite.toco.python.toco_from_protos:main',
     'toco = tensorflow.contrib.lite.toco.python.toco_wrapper:main',
-    'transform_graph = tensorflow.tools.graph_transforms.graph_transforms_wrapper:main',
     'saved_model_cli = tensorflow.python.tools.saved_model_cli:main',
     # We need to keep the TensorBoard command, even though the console script
     # is now declared by the tensorboard pip package. If we remove the
-- 
GitLab


From 5031e9f169a488eebd00eb7602e27ba6516f2486 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 24 Jan 2018 10:55:55 -0800
Subject: [PATCH 085/966] Fix update_api_def.sh script to output ApiDefs
 instead of ApiDef proto.

PiperOrigin-RevId: 183109303
---
 tensorflow/core/api_def/update_api_def.cc     |  8 ++---
 tensorflow/core/api_def/update_api_def.h      |  2 +-
 .../core/api_def/update_api_def_test.cc       | 32 ++++++++++---------
 3 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/tensorflow/core/api_def/update_api_def.cc b/tensorflow/core/api_def/update_api_def.cc
index 1a6d15ec68..ea9a148260 100644
--- a/tensorflow/core/api_def/update_api_def.cc
+++ b/tensorflow/core/api_def/update_api_def.cc
@@ -224,14 +224,14 @@ void RemoveDocs(const std::vector<const OpDef*>& ops,
 }
 }  // namespace
 
-// Returns ApiDef text representation in multi-line format
+// Returns ApiDefs text representation in multi-line format
 // constructed based on the given op.
 string CreateApiDef(const OpDef& op) {
-  ApiDef api_def;
-  FillBaseApiDef(&api_def, op);
+  ApiDefs api_defs;
+  FillBaseApiDef(api_defs.add_op(), op);
 
   const std::vector<string> multi_line_fields = {"description"};
-  string new_api_defs_str = api_def.DebugString();
+  string new_api_defs_str = api_defs.DebugString();
   return PBTxtToMultiline(new_api_defs_str, multi_line_fields);
 }
 
diff --git a/tensorflow/core/api_def/update_api_def.h b/tensorflow/core/api_def/update_api_def.h
index 5eae7e528e..5d6c15010d 100644
--- a/tensorflow/core/api_def/update_api_def.h
+++ b/tensorflow/core/api_def/update_api_def.h
@@ -21,7 +21,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Returns ApiDef text representation in multi-line format
+// Returns ApiDefs text representation in multi-line format
 // constructed based on the given op.
 string CreateApiDef(const OpDef& op);
 
diff --git a/tensorflow/core/api_def/update_api_def_test.cc b/tensorflow/core/api_def/update_api_def_test.cc
index 8948f2c1d5..4200c9da23 100644
--- a/tensorflow/core/api_def/update_api_def_test.cc
+++ b/tensorflow/core/api_def/update_api_def_test.cc
@@ -173,30 +173,32 @@ description: "Description\nfor Op1."
   OpDef op;
   protobuf::TextFormat::ParseFromString(op_text, &op);  // NOLINT
 
-  const string expected_api_def = R"(graph_op_name: "Op1"
-in_arg {
-  name: "a"
-  description: <<END
+  const string expected_api_def = R"(op {
+  graph_op_name: "Op1"
+  in_arg {
+    name: "a"
+    description: <<END
 Description for a.
 END
-}
-out_arg {
-  name: "output"
-  description: <<END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
 Description for output.
 END
-}
-attr {
-  name: "b"
-  description: <<END
+  }
+  attr {
+    name: "b"
+    description: <<END
 Description for b.
 END
-}
-summary: "Summary for Op1."
-description: <<END
+  }
+  summary: "Summary for Op1."
+  description: <<END
 Description
 for Op1.
 END
+}
 )";
   EXPECT_EQ(expected_api_def, CreateApiDef(op));
 }
-- 
GitLab


From 98c4a4efaf836e120505b4f1d52d7508802e004e Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Wed, 24 Jan 2018 10:57:28 -0800
Subject: [PATCH 086/966] Introduce experimental quantize training and eval
 apis. These APIs will include options that may have undefined behavior, still
 being researched. The non experimental APIs will have better hardware support
 and will be more suitable for users aiming to rewrite graphs to provide to
 backends (e.g. TFLite).

PiperOrigin-RevId: 183109634
---
 tensorflow/contrib/quantize/__init__.py       |  2 +
 .../contrib/quantize/python/quantize_graph.py | 64 +++++++++++++++++++
 .../quantize/python/quantize_graph_test.py    | 56 +++++++---------
 3 files changed, 88 insertions(+), 34 deletions(-)

diff --git a/tensorflow/contrib/quantize/__init__.py b/tensorflow/contrib/quantize/__init__.py
index 5d4e4575c9..933200e607 100644
--- a/tensorflow/contrib/quantize/__init__.py
+++ b/tensorflow/contrib/quantize/__init__.py
@@ -27,6 +27,8 @@ from tensorflow.python.util.all_util import remove_undocumented
 _allowed_symbols = [
     "create_eval_graph",
     "create_training_graph",
+    "experimental_create_eval_graph",
+    "experimental_create_training_graph",
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index d647bb94e8..bbd9743d80 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -128,3 +128,67 @@ def create_eval_graph(input_graph, elements=None, device_name_or_function=None):
       is_training=False,
       elements=elements,
       device_name_or_function=device_name_or_function)
+
+
+def experimental_create_training_graph(input_graph,
+                                       elements=None,
+                                       device_name_or_function=None):
+  """Returns a transformed training input_graph for simulated quantization.
+
+  This function has additional experimental options not (yet) available to
+  create_training_graph. The resulting behavior may be undefined.
+  The forward pass has fake quantization ops inserted to simulate the error
+  introduced by quantization.
+
+  Args:
+    input_graph: The tf.Graph to be transformed.
+    elements: (Optional) List of Tensors and Operations in input_graph whose
+        corresponding elements in the new graph will be returned.
+    device_name_or_function: (Optional) The device name or function to use.
+
+  Returns:
+    g is new tf.Graph that is rewritten for simulated quantization.
+    l is a list of Tensors/Operations in g corresponding to the provided input
+        elements, if elements is not None.
+
+  Raises:
+    ValueError: If elements contains an element that isn't a tf.Tensor or
+        tf.Operation.
+  """
+  return _create_graph(
+      input_graph=input_graph,
+      is_training=True,
+      elements=elements,
+      device_name_or_function=device_name_or_function)
+
+
+def experimental_create_eval_graph(input_graph,
+                                   elements=None,
+                                   device_name_or_function=None):
+  """Returns a transformed eval input_graph for simulated quantization.
+
+  This function has additional experimental options not (yet) available to
+  create_eval_graph. The resulting behavior may be undefined.
+  The forward pass has fake quantization ops inserted to simulate the error
+  introduced by quantization.
+
+  Args:
+    input_graph: The tf.Graph to be transformed.
+    elements: (Optional) List of Tensors and Operations in input_graph whose
+        corresponding elements in the new graph will be returned.
+    device_name_or_function: (Optional) The device name or function to use.
+
+  Returns:
+    g is new tf.Graph that is rewritten for simulated quantization.
+    l is a list of Tensors/Operations in g corresponding to the provided input
+        elements, if elements is not None.
+
+  Raises:
+    ValueError: If elements contains an element that isn't a tf.Tensor or
+        tf.Operation.
+  """
+  return _create_graph(
+      input_graph=input_graph,
+      is_training=False,
+      elements=elements,
+      device_name_or_function=device_name_or_function)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index 3407ace391..514862a0ab 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -31,28 +31,30 @@ from tensorflow.python.platform import googletest
 
 
 class QuantizeGraphTest(test_util.TensorFlowTestCase):
-
   # We have a lot of other tests that test the details of the rewrite, here we
   # just the specific features of the quantize_graph API.
-  def testReturnedElementsTraining(self):
-    self._TestReturnElements(True)
 
-  def testReturnedElementsEval(self):
-    self._TestReturnElements(False)
+  def _RunTestOverParameters(self, test_fn):
+    rewrite_fns = [
+        quantize_graph.create_training_graph,
+        quantize_graph.create_eval_graph,
+        quantize_graph.experimental_create_training_graph,
+        quantize_graph.experimental_create_eval_graph,
+    ]
+    for fn in rewrite_fns:
+      test_fn(fn)
+
+  def testReturnedElements(self):
+    self._RunTestOverParameters(self._TestReturnElements)
 
-  def _TestReturnElements(self, is_training):
+  def _TestReturnElements(self, fn):
     graph = ops.Graph()
     with graph.as_default():
       a = constant_op.constant(1.0)
       b = variables.Variable(2.0)
       c = a + b
     elements = [a, b, c.op]
-    if is_training:
-      q_graph, returned_elements = quantize_graph.create_training_graph(
-          graph, elements=elements)
-    else:
-      q_graph, returned_elements = quantize_graph.create_eval_graph(
-          graph, elements=elements)
+    q_graph, returned_elements = fn(graph, elements=elements)
     # Make sure q_graph is different from graph.
     self.assertTrue(graph != q_graph)
     # Check that the returned elements are part of the new graph.
@@ -62,35 +64,26 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
     for element, returned_element in zip(elements, returned_elements):
       self.assertEqual(element.name, returned_element.name)
 
-  def testNoReturnElementsTraining(self):
-    self._TestNoReturnElements(True)
+  def testNoReturnElements(self):
+    self._RunTestOverParameters(self._TestNoReturnElements)
 
-  def testNoReturnElementsEval(self):
-    self._TestNoReturnElements(False)
-
-  def _TestNoReturnElements(self, is_training):
+  def _TestNoReturnElements(self, fn):
     graph = ops.Graph()
     with graph.as_default():
       a = constant_op.constant(1.0)
       b = variables.Variable(2.0)
       _ = a + b
-    if is_training:
-      q_graph = quantize_graph.create_training_graph(graph)
-    else:
-      q_graph = quantize_graph.create_eval_graph(graph)
+    q_graph = fn(graph)
     # Check that quantize_graph didn't return a tuple when elements isn't
     # provided.
     self.assertTrue(isinstance(q_graph, ops.Graph))
     # Make sure q_graph is different from graph.
     self.assertTrue(graph != q_graph)
 
-  def testDeviceNameTraining(self):
-    self._TestDeviceName(True)
-
-  def testDeviceNameEval(self):
-    self._TestDeviceName(False)
+  def testDeviceName(self):
+    self._RunTestOverParameters(self._TestDeviceName)
 
-  def _TestDeviceName(self, is_training):
+  def _TestDeviceName(self, fn):
     graph = ops.Graph()
     with graph.as_default():
       batch_size, height, width, depth = 5, 128, 128, 3
@@ -106,12 +99,7 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       _ = nn_ops.relu6(conv)
 
     device_name = '/job:oink/task:0/device:CPU:0'
-    if is_training:
-      q_graph = quantize_graph.create_training_graph(
-          graph, device_name_or_function=device_name)
-    else:
-      q_graph = quantize_graph.create_eval_graph(
-          graph, device_name_or_function=device_name)
+    q_graph = fn(graph, device_name_or_function=device_name)
 
     orig_variable_names = set(
         [v.name for v in graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
-- 
GitLab


From 94276a1e7c6f718eada8dbdfa23a3751669e2247 Mon Sep 17 00:00:00 2001
From: Renat Idrisov <ren@ngs.ru>
Date: Wed, 24 Jan 2018 20:24:01 +0100
Subject: [PATCH 087/966] Setting proper sonames on Linux (#15307)

* Setting proper sonames on Linux

* Revert "Setting proper sonames on Linux"

This reverts commit 141d796dcd3cb6b7a96a1b0841952daf60f72f1c.

* Adding soname to tf_cc_shared_object rule definition
---
 tensorflow/tensorflow.bzl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index cbb64a3489..f32d456155 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -291,6 +291,7 @@ def tf_cc_shared_object(
               "-Wl,-install_name,@rpath/" + name.split("/")[-1],
           ],
           "//conditions:default": [
+              "-Wl,-soname," + name.split("/")[-1],
           ],
       }),
       **kwargs)
-- 
GitLab


From 127146d8d68f607d68699931030cdaa0044c58fa Mon Sep 17 00:00:00 2001
From: Rajendra arora <rajendraarora16@yahoo.com>
Date: Thu, 25 Jan 2018 00:54:31 +0530
Subject: [PATCH 088/966] Making string values in legacy constant (#16326)

---
 tensorflow/contrib/session_bundle/bundle_shim.py | 8 ++++----
 tensorflow/contrib/session_bundle/constants.py   | 3 +++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/session_bundle/bundle_shim.py b/tensorflow/contrib/session_bundle/bundle_shim.py
index 062c9cc680..3149875e41 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim.py
+++ b/tensorflow/contrib/session_bundle/bundle_shim.py
@@ -82,7 +82,7 @@ def _convert_default_signature_to_signature_def(signatures):
   """
   default_signature = signatures.default_signature
   signature_def = meta_graph_pb2.SignatureDef()
-  if default_signature.WhichOneof("type") == "regression_signature":
+  if default_signature.WhichOneof("type") == legacy_constants.REGRESSION_SIGNATURE:
     regression_signature = default_signature.regression_signature
     signature_def.method_name = signature_constants.REGRESS_METHOD_NAME
     _add_input_to_signature_def(regression_signature.input.tensor_name,
@@ -91,7 +91,7 @@ def _convert_default_signature_to_signature_def(signatures):
     _add_output_to_signature_def(regression_signature.output.tensor_name,
                                  signature_constants.REGRESS_OUTPUTS,
                                  signature_def)
-  elif default_signature.WhichOneof("type") == "classification_signature":
+  elif default_signature.WhichOneof("type") == legacy_constants.CLASSIFICATION_SIGNATURE:
     classification_signature = default_signature.classification_signature
     signature_def.method_name = signature_constants.CLASSIFY_METHOD_NAME
     _add_input_to_signature_def(classification_signature.input.tensor_name,
@@ -132,8 +132,8 @@ def _convert_named_signatures_to_signature_def(signatures):
       signature_constants.PREDICT_OUTPUTS]
   # TODO(pdudnik): what if there are other signatures? Mimic cr/140900781 once
   # it is submitted.
-  if (input_signature.WhichOneof("type") != "generic_signature" or
-      output_signature.WhichOneof("type") != "generic_signature"):
+  if (input_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE or
+      output_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE):
     raise RuntimeError("Named input and output signatures can only be "
                        "up-converted if they are generic signature. "
                        "Input signature type is %s, output signature type is "
diff --git a/tensorflow/contrib/session_bundle/constants.py b/tensorflow/contrib/session_bundle/constants.py
index 6ced73241a..e833baee79 100644
--- a/tensorflow/contrib/session_bundle/constants.py
+++ b/tensorflow/contrib/session_bundle/constants.py
@@ -32,3 +32,6 @@ INIT_OP_KEY = "serving_init_op"
 SIGNATURES_KEY = "serving_signatures"
 ASSETS_KEY = "serving_assets"
 GRAPH_KEY = "serving_graph"
+REGRESSION_SIGNATURE = "regression_signature"
+CLASSIFICATION_SIGNATURE = "classification_signature"
+GENERIC_SIGNATURE = "generic_signature"
-- 
GitLab


From 7f8e600bfb4ff5973bd1ec178b65538e2446fb69 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 11:29:08 -0800
Subject: [PATCH 089/966] fix typos in schema.fbs comments.

PiperOrigin-RevId: 183114994
---
 tensorflow/contrib/lite/schema/schema.fbs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index da7db9bcf4..2172135f49 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -53,12 +53,12 @@ table Tensor {
   type:TensorType;
   // An index that refers to the buffers table at the root of the model. Or,
   // if there is no data buffer associated (i.e. intermediate results), then
-  // this is 0 (which refers to an always existant empty buffer).
+  // this is 0 (which refers to an always existent empty buffer).
   //
   // The data_buffer itself is an opaque container, with the assumption that the
   // target device is little-endian. In addition, all builtin operators assume
   // the memory is ordered such that if `shape` is [4, 3, 2], then index
-  // [i, j, k] maps to data_buffer[i*3*2 + j*3 + k].
+  // [i, j, k] maps to data_buffer[i*3*2 + j*2 + k].
   buffer:uint;
   name:string;  // For debugging and importing back into tensorflow.
   quantization:QuantizationParameters;  // Optional.
-- 
GitLab


From ad07a86d75ab06bbcfd6f8f6a24debd9036a52d0 Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Wed, 24 Jan 2018 11:31:06 -0800
Subject: [PATCH 090/966] Fixed linter errors.

PiperOrigin-RevId: 183115307
---
 .../contrib/cmake/python_sanity_test.py       |  18 +-
 .../contrib/layers/python/layers/layers.py    | 661 +++++++-------
 .../training/model_average_optimizer.py       | 119 +--
 .../training/model_average_optimizer_test.py  | 103 ++-
 .../kernel_tests/periodic_resample_op_test.py |  39 +-
 .../rnn/python/kernel_tests/rnn_cell_test.py  | 852 +++++++++---------
 tensorflow/contrib/rnn/python/ops/rnn_cell.py | 597 ++++++------
 .../kernel_tests/beam_search_decoder_test.py  |  79 +-
 .../seq2seq/python/ops/beam_search_decoder.py |  99 +-
 tensorflow/contrib/verbs/rdma.cc              | 109 ++-
 tensorflow/contrib/verbs/rdma.h               |  11 +-
 tensorflow/contrib/verbs/rdma_mgr.cc          |  29 +-
 tensorflow/core/kernels/mkl_aggregate_ops.cc  | 188 ++--
 tensorflow/core/kernels/mkl_softmax_op.cc     |  33 +-
 .../core/kernels/spectrogram_test_utils.cc    |   4 +-
 .../core/kernels/transpose_functor_cpu.cc     |  16 +-
 .../tutorials/word2vec/word2vec_basic.py      |  81 +-
 .../kernel_tests/batch_dataset_op_test.py     | 103 ++-
 tensorflow/python/ops/histogram_ops.py        |  13 +-
 tensorflow/python/ops/histogram_ops_test.py   |  10 +-
 tensorflow/python/ops/image_ops_impl.py       | 235 ++---
 tensorflow/python/ops/metrics_impl.py         | 530 ++++++-----
 tensorflow/python/ops/nn_impl.py              |  26 +-
 tensorflow/python/ops/nn_test.py              |  19 +-
 tensorflow/python/util/compat.py              |   5 +-
 .../tools/pip_package/pip_smoke_test.py       |  29 +-
 26 files changed, 2119 insertions(+), 1889 deletions(-)

diff --git a/tensorflow/contrib/cmake/python_sanity_test.py b/tensorflow/contrib/cmake/python_sanity_test.py
index 3be5bd1b23..e0056823a8 100644
--- a/tensorflow/contrib/cmake/python_sanity_test.py
+++ b/tensorflow/contrib/cmake/python_sanity_test.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""
-Complain about invalid or missing entries in python_*.txt files.
+"""Complain about invalid or missing entries in python_*.txt files.
+
 Problematic entries can be commented for temporary whitelisting.
 """
 
@@ -35,6 +35,7 @@ def abs_path(path):
   path = os.path.abspath(path)
   return path
 
+
 def read_entries(test):
   with open(abs_path(test.entries_file), "r") as f:
     lines = f.readlines()
@@ -47,25 +48,28 @@ def read_entries(test):
 
   for line in lines:
     # line is comment
-    if line.startswith('#'):
+    if line.startswith("#"):
       line = line[1:].strip()
       # whitelist entry
-      if line.startswith('tensorflow/'):
+      if line.startswith("tensorflow/"):
         test.whitelist.append(line)
     # line has comment -> strip comment
-    elif line.find('#') != -1:
-      line = line[:line.find('#')].strip()
+    elif line.find("#") != -1:
+      line = line[:line.find("#")].strip()
       test.entries.append(line)
     else:
       test.entries.append(line)
 
+
 def test_invalid_directories(test):
   for entry in test.entries:
     if not os.path.isdir(abs_path(entry)):
       problem = "'" + test.entries_file + "' contains invalid '" + entry + "'"
-      solution = "Please remove the invalid entry (or add the missing directory)."
+      solution = ("Please remove the invalid entry (or add the missing "
+                  "directory).")
       raise AssertionError(problem + "\n" + solution)
 
+
 def test_missing_directory(test, path):
   if path in test.whitelist:
     return
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index ef2b673074..7c52da7b49 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -54,47 +54,17 @@ from tensorflow.python.layers.maxout import maxout
 
 # TODO(b/28426988): Replace legacy_* fns migrated from slim.
 # TODO(b/28426988): Remove legacy_* when all uses have migrated to new API.
-__all__ = ['avg_pool2d',
-           'avg_pool3d',
-           'batch_norm',
-           'bias_add',
-           'conv2d',
-           'conv3d',
-           'conv2d_in_plane',
-           'conv2d_transpose',
-           'conv3d_transpose',
-           'convolution',
-           'convolution2d',
-           'convolution2d_in_plane',
-           'convolution2d_transpose',
-           'convolution3d',
-           'convolution3d_transpose',
-           'dropout',
-           'elu',
-           'flatten',
-           'fully_connected',
-           'GDN',
-           'gdn',
-           'layer_norm',
-           'linear',
-           'pool',
-           'max_pool2d',
-           'max_pool3d',
-           'one_hot_encoding',
-           'relu',
-           'relu6',
-           'repeat',
-           'scale_gradient',
-           'separable_conv2d',
-           'separable_convolution2d',
-           'softmax',
-           'spatial_softmax',
-           'stack',
-           'unit_norm',
-           'legacy_fully_connected',
-           'legacy_linear',
-           'legacy_relu',
-           'maxout']
+__all__ = [
+    'avg_pool2d', 'avg_pool3d', 'batch_norm', 'bias_add', 'conv2d', 'conv3d',
+    'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution',
+    'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose',
+    'convolution3d', 'convolution3d_transpose', 'dropout', 'elu', 'flatten',
+    'fully_connected', 'GDN', 'gdn', 'layer_norm', 'linear', 'pool',
+    'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat',
+    'scale_gradient', 'separable_conv2d', 'separable_convolution2d', 'softmax',
+    'spatial_softmax', 'stack', 'unit_norm', 'legacy_fully_connected',
+    'legacy_linear', 'legacy_relu', 'maxout'
+]
 
 DATA_FORMAT_NCHW = 'NCHW'
 DATA_FORMAT_NHWC = 'NHWC'
@@ -139,13 +109,14 @@ def avg_pool2d(inputs,
     raise ValueError('data_format has to be either NCHW or NHWC.')
   with ops.name_scope(scope, 'AvgPool2D', [inputs]) as sc:
     inputs = ops.convert_to_tensor(inputs)
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
-    layer = pooling_layers.AveragePooling2D(pool_size=kernel_size,
-                                            strides=stride,
-                                            padding=padding,
-                                            data_format=df,
-                                            _scope=sc)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    layer = pooling_layers.AveragePooling2D(
+        pool_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        _scope=sc)
     outputs = layer.apply(inputs)
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
@@ -187,13 +158,14 @@ def avg_pool3d(inputs,
     raise ValueError('data_format has to be either NCDHW or NDHWC.')
   with ops.name_scope(scope, 'AvgPool3D', [inputs]) as sc:
     inputs = ops.convert_to_tensor(inputs)
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
-    layer = pooling_layers.AveragePooling3D(pool_size=kernel_size,
-                                            strides=stride,
-                                            padding=padding,
-                                            data_format=df,
-                                            _scope=sc)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    layer = pooling_layers.AveragePooling3D(
+        pool_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        _scope=sc)
     outputs = layer.apply(inputs)
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
@@ -298,8 +270,8 @@ def _fused_batch_norm(inputs,
       raise ValueError('Inputs %s has undefined rank' % inputs.name)
     elif original_rank not in [2, 4]:
       raise ValueError('Inputs %s has unsupported rank.'
-                       ' Expected 2 or 4 but got %d' % (
-                           inputs.name, original_rank))
+                       ' Expected 2 or 4 but got %d' % (inputs.name,
+                                                        original_rank))
     if original_rank == 2:
       channels = inputs.get_shape()[-1].value
       if channels is None:
@@ -393,6 +365,7 @@ def _fused_batch_norm(inputs,
     def _fused_batch_norm_training():
       return nn.fused_batch_norm(
           inputs, gamma, beta, epsilon=epsilon, data_format=data_format)
+
     def _fused_batch_norm_inference():
       return nn.fused_batch_norm(
           inputs,
@@ -403,9 +376,9 @@ def _fused_batch_norm(inputs,
           epsilon=epsilon,
           is_training=False,
           data_format=data_format)
-    outputs, mean, variance = utils.smart_cond(is_training,
-                                               _fused_batch_norm_training,
-                                               _fused_batch_norm_inference)
+
+    outputs, mean, variance = utils.smart_cond(
+        is_training, _fused_batch_norm_training, _fused_batch_norm_inference)
 
     # If `is_training` doesn't have a constant value, because it is a `Tensor`,
     # a `Variable` or `Placeholder` then is_training_value will be None and
@@ -415,6 +388,7 @@ def _fused_batch_norm(inputs,
     if need_updates:
       if updates_collections is None:
         no_updates = lambda: outputs
+
         def _force_updates():
           """Internal function forces updates moving_vars if is_training."""
           update_moving_mean = moving_averages.assign_moving_average(
@@ -424,9 +398,11 @@ def _fused_batch_norm(inputs,
           with ops.control_dependencies(
               [update_moving_mean, update_moving_variance]):
             return array_ops.identity(outputs)
+
         outputs = utils.smart_cond(is_training, _force_updates, no_updates)
       else:
         moving_vars_fn = lambda: (moving_mean, moving_variance)
+
         def _delay_updates():
           """Internal function that delay updates moving_vars if is_training."""
           update_moving_mean = moving_averages.assign_moving_average(
@@ -434,9 +410,9 @@ def _fused_batch_norm(inputs,
           update_moving_variance = moving_averages.assign_moving_average(
               moving_variance, variance, decay, zero_debias=False)
           return update_moving_mean, update_moving_variance
-        update_mean, update_variance = utils.smart_cond(is_training,
-                                                        _delay_updates,
-                                                        moving_vars_fn)
+
+        update_mean, update_variance = utils.smart_cond(
+            is_training, _delay_updates, moving_vars_fn)
         ops.add_to_collections(updates_collections, update_mean)
         ops.add_to_collections(updates_collections, update_variance)
 
@@ -482,9 +458,10 @@ def batch_norm(inputs,
   Can be used as a normalizer function for conv2d and fully_connected. The
   normalization is over all but the last dimension if `data_format` is `NHWC`
   and all but the second dimension if `data_format` is `NCHW`.  In case of a 2D
-  tensor this corresponds to the batch dimension, while in case of a 4D tensor this
+  tensor this corresponds to the batch dimension, while in case of a 4D tensor
+  this
   corresponds to the batch and space dimensions.
-  
+
   Note: when training, the moving_mean and moving_variance need to be updated.
   By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
   need to be added as a dependency to the `train_op`. For example:
@@ -592,10 +569,9 @@ def batch_norm(inputs,
   #   implementation in normalization_layers.BatchNormalization.
   inputs = ops.convert_to_tensor(inputs)
   rank = inputs.get_shape().ndims
-  possible_to_fuse = (batch_weights is None and
-                      not renorm and
-                      rank in [2, 4] and
-                      adjustment is None)
+  possible_to_fuse = (
+      batch_weights is None and not renorm and rank in [2, 4] and
+      adjustment is None)
   if fused and possible_to_fuse and (
       zero_debias_moving_mean or rank == 2 or
       updates_collections is not ops.GraphKeys.UPDATE_OPS):
@@ -623,7 +599,9 @@ def batch_norm(inputs,
 
   layer_variable_getter = _build_variable_getter()
   with variable_scope.variable_scope(
-      scope, 'BatchNorm', [inputs], reuse=reuse,
+      scope,
+      'BatchNorm', [inputs],
+      reuse=reuse,
       custom_getter=layer_variable_getter) as sc:
     inputs = ops.convert_to_tensor(inputs)
 
@@ -671,15 +649,15 @@ def batch_norm(inputs,
       outputs = layer.apply(inputs, training=is_training)
 
       # Add variables to collections.
-      _add_variable_to_collections(
-          layer.moving_mean, variables_collections, 'moving_mean')
-      _add_variable_to_collections(
-          layer.moving_variance, variables_collections, 'moving_variance')
+      _add_variable_to_collections(layer.moving_mean, variables_collections,
+                                   'moving_mean')
+      _add_variable_to_collections(layer.moving_variance, variables_collections,
+                                   'moving_variance')
       if layer.beta is not None:
         _add_variable_to_collections(layer.beta, variables_collections, 'beta')
       if layer.gamma is not None:
-        _add_variable_to_collections(
-            layer.gamma, variables_collections, 'gamma')
+        _add_variable_to_collections(layer.gamma, variables_collections,
+                                     'gamma')
 
       if activation_fn is not None:
         outputs = activation_fn(outputs)
@@ -719,8 +697,8 @@ def batch_norm(inputs,
       params_shape = inputs_shape[-1:]
       params_shape_broadcast = None
     if not params_shape.is_fully_defined():
-      raise ValueError('Inputs %s has undefined channels dimension %s.' % (
-          inputs.name, params_shape))
+      raise ValueError('Inputs %s has undefined channels dimension %s.' %
+                       (inputs.name, params_shape))
 
     # Allocate parameters for the beta and gamma of the normalization.
     beta, gamma = None, None
@@ -731,23 +709,25 @@ def batch_norm(inputs,
                                                         'beta')
       beta_initializer = param_initializers.get('beta',
                                                 init_ops.zeros_initializer())
-      beta = variables.model_variable('beta',
-                                      shape=params_shape,
-                                      dtype=dtype,
-                                      initializer=beta_initializer,
-                                      collections=beta_collections,
-                                      trainable=trainable)
+      beta = variables.model_variable(
+          'beta',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=beta_initializer,
+          collections=beta_collections,
+          trainable=trainable)
     if scale:
-      gamma_collections = utils.get_variable_collections(variables_collections,
-                                                         'gamma')
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
       gamma_initializer = param_initializers.get('gamma',
                                                  init_ops.ones_initializer())
-      gamma = variables.model_variable('gamma',
-                                       shape=params_shape,
-                                       dtype=dtype,
-                                       initializer=gamma_initializer,
-                                       collections=gamma_collections,
-                                       trainable=trainable)
+      gamma = variables.model_variable(
+          'gamma',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=gamma_initializer,
+          collections=gamma_collections,
+          trainable=trainable)
 
     # Create moving_mean and moving_variance variables and add them to the
     # appropriate collections. We disable variable partitioning while creating
@@ -796,8 +776,8 @@ def batch_norm(inputs,
           mean, variance = nn.moments(inputs, moments_axes)
       else:
         if data_format == DATA_FORMAT_NCHW:
-          mean, variance = nn.weighted_moments(inputs, moments_axes,
-                                               batch_weights, keep_dims=True)
+          mean, variance = nn.weighted_moments(
+              inputs, moments_axes, batch_weights, keep_dims=True)
           mean = array_ops.reshape(mean, [-1])
           variance = array_ops.reshape(variance, [-1])
         else:
@@ -806,19 +786,21 @@ def batch_norm(inputs,
 
       moving_vars_fn = lambda: (moving_mean, moving_variance)
       if updates_collections is None:
+
         def _force_updates():
           """Internal function forces updates moving_vars if is_training."""
           update_moving_mean = moving_averages.assign_moving_average(
               moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
           update_moving_variance = moving_averages.assign_moving_average(
               moving_variance, variance, decay, zero_debias=False)
-          with ops.control_dependencies([update_moving_mean,
-                                         update_moving_variance]):
+          with ops.control_dependencies(
+              [update_moving_mean, update_moving_variance]):
             return array_ops.identity(mean), array_ops.identity(variance)
-        mean, variance = utils.smart_cond(is_training,
-                                          _force_updates,
+
+        mean, variance = utils.smart_cond(is_training, _force_updates,
                                           moving_vars_fn)
       else:
+
         def _delay_updates():
           """Internal function that delay updates moving_vars if is_training."""
           update_moving_mean = moving_averages.assign_moving_average(
@@ -827,9 +809,8 @@ def batch_norm(inputs,
               moving_variance, variance, decay, zero_debias=False)
           return update_moving_mean, update_moving_variance
 
-        update_mean, update_variance = utils.smart_cond(is_training,
-                                                        _delay_updates,
-                                                        moving_vars_fn)
+        update_mean, update_variance = utils.smart_cond(
+            is_training, _delay_updates, moving_vars_fn)
         ops.add_to_collections(updates_collections, update_mean)
         ops.add_to_collections(updates_collections, update_variance)
         # Use computed moments during training and moving_vars otherwise.
@@ -897,8 +878,8 @@ def bias_add(inputs,
   """
   if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
     raise ValueError('data_format has to be either NCHW or NHWC.')
-  with variable_scope.variable_scope(scope, 'BiasAdd', [inputs],
-                                     reuse=reuse) as sc:
+  with variable_scope.variable_scope(
+      scope, 'BiasAdd', [inputs], reuse=reuse) as sc:
     inputs = ops.convert_to_tensor(inputs)
     dtype = inputs.dtype.base_dtype
     inputs_shape = inputs.get_shape()
@@ -913,13 +894,16 @@ def bias_add(inputs,
       raise ValueError('`C` dimension must be known but is None')
     biases_collections = utils.get_variable_collections(variables_collections,
                                                         'biases')
-    biases = variables.model_variable('biases',
-                                      shape=[num_features,],
-                                      dtype=dtype,
-                                      initializer=initializer,
-                                      regularizer=regularizer,
-                                      collections=biases_collections,
-                                      trainable=trainable)
+    biases = variables.model_variable(
+        'biases',
+        shape=[
+            num_features,
+        ],
+        dtype=dtype,
+        initializer=initializer,
+        regularizer=regularizer,
+        collections=biases_collections,
+        trainable=trainable)
     outputs = nn.bias_add(inputs, biases, data_format=data_format)
     if activation_fn is not None:
       outputs = activation_fn(outputs)
@@ -1019,8 +1003,10 @@ def convolution(inputs,
   if data_format not in [None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC', 'NCDHW']:
     raise ValueError('Invalid data_format: %r' % (data_format,))
 
-  layer_variable_getter = _build_variable_getter(
-      {'bias': 'biases', 'kernel': 'weights'})
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'kernel': 'weights'
+  })
 
   with variable_scope.variable_scope(
       scope, 'Conv', [inputs], reuse=reuse,
@@ -1038,26 +1024,27 @@ def convolution(inputs,
       raise ValueError('Convolution not supported for input with rank',
                        input_rank)
 
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
-    layer = layer_class(filters=num_outputs,
-                        kernel_size=kernel_size,
-                        strides=stride,
-                        padding=padding,
-                        data_format=df,
-                        dilation_rate=rate,
-                        activation=None,
-                        use_bias=not normalizer_fn and biases_initializer,
-                        kernel_initializer=weights_initializer,
-                        bias_initializer=biases_initializer,
-                        kernel_regularizer=weights_regularizer,
-                        bias_regularizer=biases_regularizer,
-                        activity_regularizer=None,
-                        trainable=trainable,
-                        name=sc.name,
-                        dtype=inputs.dtype.base_dtype,
-                        _scope=sc,
-                        _reuse=reuse)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    layer = layer_class(
+        filters=num_outputs,
+        kernel_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        dilation_rate=rate,
+        activation=None,
+        use_bias=not normalizer_fn and biases_initializer,
+        kernel_initializer=weights_initializer,
+        bias_initializer=biases_initializer,
+        kernel_regularizer=weights_regularizer,
+        bias_regularizer=biases_regularizer,
+        activity_regularizer=None,
+        trainable=trainable,
+        name=sc.name,
+        dtype=inputs.dtype.base_dtype,
+        _scope=sc,
+        _reuse=reuse)
     outputs = layer.apply(inputs)
 
     # Add variables to collections.
@@ -1073,6 +1060,7 @@ def convolution(inputs,
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
+
 convolution2d = convolution
 convolution3d = convolution
 
@@ -1148,13 +1136,14 @@ def convolution2d_in_plane(
     weights_shape = [kernel_h, kernel_w, 1, 1]
     weights_collections = utils.get_variable_collections(
         variables_collections, 'weights')
-    weights = variables.model_variable('weights',
-                                       shape=weights_shape,
-                                       dtype=dtype,
-                                       initializer=weights_initializer,
-                                       regularizer=weights_regularizer,
-                                       collections=weights_collections,
-                                       trainable=trainable)
+    weights = variables.model_variable(
+        'weights',
+        shape=weights_shape,
+        dtype=dtype,
+        initializer=weights_initializer,
+        regularizer=weights_regularizer,
+        collections=weights_collections,
+        trainable=trainable)
     depthwise_weights = array_ops.tile(weights, [1, 1, num_filters_in, 1])
     outputs = nn.depthwise_conv2d(inputs, depthwise_weights,
                                   [1, stride_h, stride_w, 1], padding)
@@ -1165,13 +1154,16 @@ def convolution2d_in_plane(
       if biases_initializer is not None:
         biases_collections = utils.get_variable_collections(
             variables_collections, 'biases')
-        biases = variables.model_variable('biases',
-                                          shape=[num_filters_in,],
-                                          dtype=dtype,
-                                          initializer=biases_initializer,
-                                          regularizer=biases_regularizer,
-                                          collections=biases_collections,
-                                          trainable=trainable)
+        biases = variables.model_variable(
+            'biases',
+            shape=[
+                num_filters_in,
+            ],
+            dtype=dtype,
+            initializer=biases_initializer,
+            regularizer=biases_regularizer,
+            collections=biases_collections,
+            trainable=trainable)
         outputs = nn.bias_add(outputs, biases)
 
     if activation_fn is not None:
@@ -1244,19 +1236,23 @@ def convolution2d_transpose(
     ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
     ValueError: If `C` dimension of `inputs` is None.
   """
-  layer_variable_getter = _build_variable_getter(
-      {'bias': 'biases', 'kernel': 'weights'})
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'kernel': 'weights'
+  })
 
   with variable_scope.variable_scope(
-      scope, 'Conv2d_transpose', [inputs], reuse=reuse,
+      scope,
+      'Conv2d_transpose', [inputs],
+      reuse=reuse,
       custom_getter=layer_variable_getter) as sc:
     if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
       raise ValueError('data_format has to be either NCHW or NHWC.')
 
     inputs = ops.convert_to_tensor(inputs)
 
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
     layer = convolutional_layers.Convolution2DTranspose(
         filters=num_outputs,
         kernel_size=kernel_size,
@@ -1353,19 +1349,23 @@ def convolution3d_transpose(
     ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
     ValueError: If `C` dimension of `inputs` is None.
   """
-  layer_variable_getter = _build_variable_getter(
-      {'bias': 'biases', 'kernel': 'weights'})
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'kernel': 'weights'
+  })
 
   with variable_scope.variable_scope(
-      scope, 'Conv3d_transpose', [inputs], reuse=reuse,
+      scope,
+      'Conv3d_transpose', [inputs],
+      reuse=reuse,
       custom_getter=layer_variable_getter) as sc:
     if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
       raise ValueError('data_format has to be either NCDHW or NDHWC.')
 
     inputs = ops.convert_to_tensor(inputs)
 
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
     layer = convolutional_layers.Convolution3DTranspose(
         filters=num_outputs,
         kernel_size=kernel_size,
@@ -1434,19 +1434,18 @@ def dropout(inputs,
   with variable_scope.variable_scope(
       scope, 'Dropout', [inputs], custom_getter=_model_variable_getter) as sc:
     inputs = ops.convert_to_tensor(inputs)
-    layer = core_layers.Dropout(rate=1 - keep_prob,
-                                noise_shape=noise_shape,
-                                seed=seed,
-                                name=sc.name,
-                                _scope=sc)
+    layer = core_layers.Dropout(
+        rate=1 - keep_prob,
+        noise_shape=noise_shape,
+        seed=seed,
+        name=sc.name,
+        _scope=sc)
     outputs = layer.apply(inputs, training=is_training)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
 
 @add_arg_scope
-def flatten(inputs,
-            outputs_collections=None,
-            scope=None):
+def flatten(inputs, outputs_collections=None, scope=None):
   """Flattens the input while maintaining the batch_size.
 
     Assumes that the first dimension represents the batch.
@@ -1478,8 +1477,8 @@ def _sparse_inner_flatten(inputs, new_rank):
 
   outer_dimensions = inputs.dense_shape[:new_rank - 1]
   inner_dimensions = inputs.dense_shape[new_rank - 1:]
-  new_shape = array_ops.concat((outer_dimensions,
-                                [math_ops.reduce_prod(inner_dimensions)]), 0)
+  new_shape = array_ops.concat(
+      (outer_dimensions, [math_ops.reduce_prod(inner_dimensions)]), 0)
   flattened = sparse_ops.sparse_reshape(inputs, new_shape)
   return flattened
 
@@ -1545,10 +1544,18 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
   return utils.collect_named_outputs(output_collections, sc, flattened)
 
 
-def _model_variable_getter(getter, name, shape=None, dtype=None,
-                           initializer=None, regularizer=None, trainable=True,
-                           collections=None, caching_device=None,
-                           partitioner=None, rename=None, use_resource=None,
+def _model_variable_getter(getter,
+                           name,
+                           shape=None,
+                           dtype=None,
+                           initializer=None,
+                           regularizer=None,
+                           trainable=True,
+                           collections=None,
+                           caching_device=None,
+                           partitioner=None,
+                           rename=None,
+                           use_resource=None,
                            **_):
   """Getter that uses model_variable for compatibility with core layers."""
   short_name = name.split('/')[-1]
@@ -1557,25 +1564,34 @@ def _model_variable_getter(getter, name, shape=None, dtype=None,
     name_components[-1] = rename[short_name]
     name = '/'.join(name_components)
   return variables.model_variable(
-      name, shape=shape, dtype=dtype, initializer=initializer,
-      regularizer=regularizer, collections=collections, trainable=trainable,
-      caching_device=caching_device, partitioner=partitioner,
-      custom_getter=getter, use_resource=use_resource)
+      name,
+      shape=shape,
+      dtype=dtype,
+      initializer=initializer,
+      regularizer=regularizer,
+      collections=collections,
+      trainable=trainable,
+      caching_device=caching_device,
+      partitioner=partitioner,
+      custom_getter=getter,
+      use_resource=use_resource)
 
 
 def _build_variable_getter(rename=None):
   """Build a model variable getter that respects scope getter and renames."""
+
   # VariableScope will nest the getters
   def layer_variable_getter(getter, *args, **kwargs):
     kwargs['rename'] = rename
     return _model_variable_getter(getter, *args, **kwargs)
+
   return layer_variable_getter
 
 
 def _add_variable_to_collections(variable, collections_set, collections_name):
   """Adds variable (or all its parts) to all collections with that name."""
-  collections = utils.get_variable_collections(
-      collections_set, collections_name) or []
+  collections = utils.get_variable_collections(collections_set,
+                                               collections_name) or []
   variables_list = [variable]
   if isinstance(variable, tf_variables.PartitionedVariable):
     variables_list = [v for v in variable]
@@ -1644,15 +1660,19 @@ def fully_connected(inputs,
     ValueError: If x has rank less than 2 or if its last dimension is not set.
   """
   if not isinstance(num_outputs, six.integer_types):
-    raise ValueError(
-        'num_outputs should be int or long, got %s.' % (num_outputs,))
+    raise ValueError('num_outputs should be int or long, got %s.' %
+                     (num_outputs,))
 
-  layer_variable_getter = _build_variable_getter({'bias': 'biases',
-                                                  'kernel': 'weights'})
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'kernel': 'weights'
+  })
 
   with variable_scope.variable_scope(
-      scope, 'fully_connected', [inputs],
-      reuse=reuse, custom_getter=layer_variable_getter) as sc:
+      scope,
+      'fully_connected', [inputs],
+      reuse=reuse,
+      custom_getter=layer_variable_getter) as sc:
     inputs = ops.convert_to_tensor(inputs)
     layer = core_layers.Dense(
         units=num_outputs,
@@ -1758,15 +1778,17 @@ class GDN(base.Layer):
                inverse=False,
                beta_min=1e-6,
                gamma_init=.1,
-               reparam_offset=2 ** -18,
+               reparam_offset=2**-18,
                data_format='channels_last',
                activity_regularizer=None,
                trainable=True,
                name=None,
                **kwargs):
-    super(GDN, self).__init__(trainable=trainable, name=name,
-                              activity_regularizer=activity_regularizer,
-                              **kwargs)
+    super(GDN, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=activity_regularizer,
+        **kwargs)
     self.inverse = inverse
     self._beta_min = beta_min
     self._gamma_init = gamma_init
@@ -1801,8 +1823,9 @@ class GDN(base.Layer):
     with ops.name_scope(name, 'GDNLowerBound', [inputs, bound]) as scope:
       inputs = ops.convert_to_tensor(inputs, name='inputs')
       bound = ops.convert_to_tensor(bound, name='bound')
-      with ops.get_default_graph().gradient_override_map(
-          {'Maximum': 'GDNLowerBound'}):
+      with ops.get_default_graph().gradient_override_map({
+          'Maximum': 'GDNLowerBound'
+      }):
         return math_ops.maximum(inputs, bound, name=scope)
 
   @staticmethod
@@ -1829,12 +1852,14 @@ class GDN(base.Layer):
       raise ValueError('The channel dimension of the inputs to `GDN` '
                        'must be defined.')
     self._input_rank = input_shape.ndims
-    self.input_spec = base.InputSpec(ndim=input_shape.ndims,
-                                     axes={channel_axis: num_channels})
+    self.input_spec = base.InputSpec(
+        ndim=input_shape.ndims, axes={
+            channel_axis: num_channels
+        })
 
-    pedestal = array_ops.constant(self._reparam_offset ** 2, dtype=self.dtype)
+    pedestal = array_ops.constant(self._reparam_offset**2, dtype=self.dtype)
     beta_bound = array_ops.constant(
-        (self._beta_min + self._reparam_offset ** 2) ** .5, dtype=self.dtype)
+        (self._beta_min + self._reparam_offset**2)**.5, dtype=self.dtype)
     gamma_bound = array_ops.constant(self._reparam_offset, dtype=self.dtype)
 
     def beta_initializer(shape, dtype=None, partition_info=None):
@@ -1848,19 +1873,21 @@ class GDN(base.Layer):
       eye = linalg_ops.eye(shape[0], dtype=dtype)
       return math_ops.sqrt(self._gamma_init * eye + pedestal)
 
-    beta = self.add_variable('reparam_beta',
-                             shape=[num_channels],
-                             initializer=beta_initializer,
-                             dtype=self.dtype,
-                             trainable=True)
+    beta = self.add_variable(
+        'reparam_beta',
+        shape=[num_channels],
+        initializer=beta_initializer,
+        dtype=self.dtype,
+        trainable=True)
     beta = self._lower_bound(beta, beta_bound)
     self.beta = math_ops.square(beta) - pedestal
 
-    gamma = self.add_variable('reparam_gamma',
-                              shape=[num_channels, num_channels],
-                              initializer=gamma_initializer,
-                              dtype=self.dtype,
-                              trainable=True)
+    gamma = self.add_variable(
+        'reparam_gamma',
+        shape=[num_channels, num_channels],
+        initializer=gamma_initializer,
+        dtype=self.dtype,
+        trainable=True)
     gamma = self._lower_bound(gamma, gamma_bound)
     self.gamma = math_ops.square(gamma) - pedestal
 
@@ -1875,8 +1902,11 @@ class GDN(base.Layer):
 
     # Compute normalization pool.
     if self.data_format == 'channels_first':
-      norm_pool = nn.convolution(math_ops.square(inputs), gamma, 'VALID',
-                                 data_format='NC' + 'DHW'[-(ndim - 2):])
+      norm_pool = nn.convolution(
+          math_ops.square(inputs),
+          gamma,
+          'VALID',
+          data_format='NC' + 'DHW' [-(ndim - 2):])
       if ndim == 3:
         norm_pool = array_ops.expand_dims(norm_pool, 2)
         norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
@@ -1918,7 +1948,7 @@ def gdn(inputs,
         inverse=False,
         beta_min=1e-6,
         gamma_init=.1,
-        reparam_offset=2 ** -18,
+        reparam_offset=2**-18,
         data_format='channels_last',
         activity_regularizer=None,
         trainable=True,
@@ -1984,17 +2014,18 @@ def gdn(inputs,
   Returns:
     Output tensor.
   """
-  layer = GDN(inverse=inverse,
-              beta_min=beta_min,
-              gamma_init=gamma_init,
-              reparam_offset=reparam_offset,
-              data_format=data_format,
-              activity_regularizer=activity_regularizer,
-              trainable=trainable,
-              name=name,
-              dtype=inputs.dtype.base_dtype,
-              _scope=name,
-              _reuse=reuse)
+  layer = GDN(
+      inverse=inverse,
+      beta_min=beta_min,
+      gamma_init=gamma_init,
+      reparam_offset=reparam_offset,
+      data_format=data_format,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
   return layer.apply(inputs)
 
 
@@ -2070,8 +2101,8 @@ def layer_norm(inputs,
       or if `inputs.shape[begin_params_axis:]` is not fully defined at
       graph build time.
   """
-  with variable_scope.variable_scope(scope, 'LayerNorm', [inputs],
-                                     reuse=reuse) as sc:
+  with variable_scope.variable_scope(
+      scope, 'LayerNorm', [inputs], reuse=reuse) as sc:
     inputs = ops.convert_to_tensor(inputs)
     inputs_shape = inputs.shape
     inputs_rank = inputs_shape.ndims
@@ -2081,15 +2112,14 @@ def layer_norm(inputs,
     if begin_norm_axis < 0:
       begin_norm_axis = inputs_rank + begin_norm_axis
     if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
-      raise ValueError(
-          'begin_params_axis (%d) and begin_norm_axis (%d) '
-          'must be < rank(inputs) (%d)'
-          % (begin_params_axis, begin_norm_axis, inputs_rank))
+      raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
+                       'must be < rank(inputs) (%d)' %
+                       (begin_params_axis, begin_norm_axis, inputs_rank))
     params_shape = inputs_shape[begin_params_axis:]
     if not params_shape.is_fully_defined():
       raise ValueError(
-          'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' % (
-              inputs.name, begin_params_axis, inputs_shape))
+          'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
+          (inputs.name, begin_params_axis, inputs_shape))
     # Allocate parameters for the beta and gamma of the normalization.
     beta, gamma = None, None
     if center:
@@ -2103,8 +2133,8 @@ def layer_norm(inputs,
           collections=beta_collections,
           trainable=trainable)
     if scale:
-      gamma_collections = utils.get_variable_collections(variables_collections,
-                                                         'gamma')
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
       gamma = variables.model_variable(
           'gamma',
           shape=params_shape,
@@ -2118,7 +2148,11 @@ def layer_norm(inputs,
     # Compute layer normalization using the batch_normalization function.
     variance_epsilon = 1e-12
     outputs = nn.batch_normalization(
-        inputs, mean, variance, offset=beta, scale=gamma,
+        inputs,
+        mean,
+        variance,
+        offset=beta,
+        scale=gamma,
         variance_epsilon=variance_epsilon)
     outputs.set_shape(inputs_shape)
     if activation_fn is not None:
@@ -2164,13 +2198,14 @@ def max_pool2d(inputs,
     raise ValueError('data_format has to be either NCHW or NHWC.')
   with ops.name_scope(scope, 'MaxPool2D', [inputs]) as sc:
     inputs = ops.convert_to_tensor(inputs)
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
-    layer = pooling_layers.MaxPooling2D(pool_size=kernel_size,
-                                        strides=stride,
-                                        padding=padding,
-                                        data_format=df,
-                                        _scope=sc)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    layer = pooling_layers.MaxPooling2D(
+        pool_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        _scope=sc)
     outputs = layer.apply(inputs)
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
@@ -2213,13 +2248,14 @@ def max_pool3d(inputs,
     raise ValueError('data_format has to be either NCDHW or NDHWC.')
   with ops.name_scope(scope, 'MaxPool3D', [inputs]) as sc:
     inputs = ops.convert_to_tensor(inputs)
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
-    layer = pooling_layers.MaxPooling3D(pool_size=kernel_size,
-                                        strides=stride,
-                                        padding=padding,
-                                        data_format=df,
-                                        _scope=sc)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    layer = pooling_layers.MaxPooling3D(
+        pool_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        _scope=sc)
     outputs = layer.apply(inputs)
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
@@ -2272,8 +2308,8 @@ def pool(inputs,
 
   """
   # pylint: enable=line-too-long
-  with ops.name_scope(scope, '%s_pool' %
-                      (pooling_type.lower()), [inputs]) as sc:
+  with ops.name_scope(scope, '%s_pool' % (pooling_type.lower()),
+                      [inputs]) as sc:
     inputs = ops.convert_to_tensor(inputs)
     input_rank = inputs.get_shape().ndims
     if input_rank is None:
@@ -2318,18 +2354,16 @@ def one_hot_encoding(labels,
     labels = ops.convert_to_tensor(labels)
     if labels.dtype == dtypes.int32:
       labels = standard_ops.to_int64(labels)
-    outputs = standard_ops.one_hot(labels,
-                                   num_classes,
-                                   on_value=on_value,
-                                   off_value=off_value)
+    outputs = standard_ops.one_hot(
+        labels, num_classes, on_value=on_value, off_value=off_value)
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
 
 def _apply_activation(y, activation_fn, output_collections):
   if activation_fn is not None:
     y = activation_fn(y)
-  ops.add_to_collections(list(output_collections or []) +
-                         [ops.GraphKeys.ACTIVATIONS], y)
+  ops.add_to_collections(
+      list(output_collections or []) + [ops.GraphKeys.ACTIVATIONS], y)
   return y
 
 
@@ -2374,7 +2408,7 @@ def repeat(inputs, repetitions, layer, *args, **kwargs):
         scope = 'repeat'
     outputs = inputs
     for i in range(repetitions):
-      kwargs['scope'] = scope + '_' + str(i+1)
+      kwargs['scope'] = scope + '_' + str(i + 1)
       outputs = layer(outputs, *args, **kwargs)
     return outputs
 
@@ -2389,8 +2423,8 @@ def _scale_gradient_grad(op, grad):
   return [grad * op.inputs[1], None]
 
 
-@function.Defun(python_grad_func=_scale_gradient_grad,
-                shape_func=_scale_gradient_shape)
+@function.Defun(
+    python_grad_func=_scale_gradient_grad, shape_func=_scale_gradient_shape)
 def scale_gradient(inputs, gradient_multiplier):
   """Identity operation, but with the gradient multiplied by a tensor.
 
@@ -2495,18 +2529,21 @@ def separable_convolution2d(
   """
   if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
     raise ValueError('data_format has to be either NCHW or NHWC.')
-  layer_variable_getter = _build_variable_getter(
-      {'bias': 'biases',
-       'depthwise_kernel': 'depthwise_weights',
-       'pointwise_kernel': 'pointwise_weights'})
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'depthwise_kernel': 'depthwise_weights',
+      'pointwise_kernel': 'pointwise_weights'
+  })
 
   with variable_scope.variable_scope(
-      scope, 'SeparableConv2d', [inputs], reuse=reuse,
+      scope,
+      'SeparableConv2d', [inputs],
+      reuse=reuse,
       custom_getter=layer_variable_getter) as sc:
     inputs = ops.convert_to_tensor(inputs)
 
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
     if num_outputs is not None:
       # Apply separable conv using the SeparableConvolution2D layer.
       layer = convolutional_layers.SeparableConvolution2D(
@@ -2539,8 +2576,8 @@ def separable_convolution2d(
       _add_variable_to_collections(layer.pointwise_kernel,
                                    variables_collections, 'weights')
       if layer.bias is not None:
-        _add_variable_to_collections(layer.bias,
-                                     variables_collections, 'biases')
+        _add_variable_to_collections(layer.bias, variables_collections,
+                                     'biases')
 
       if normalizer_fn is not None:
         normalizer_params = normalizer_params or {}
@@ -2555,8 +2592,7 @@ def separable_convolution2d(
       weights_collections = utils.get_variable_collections(
           variables_collections, 'weights')
 
-      depthwise_shape = [kernel_h, kernel_w,
-                         num_filters_in, depth_multiplier]
+      depthwise_shape = [kernel_h, kernel_w, num_filters_in, depth_multiplier]
       depthwise_weights = variables.model_variable(
           'depthwise_weights',
           shape=depthwise_shape,
@@ -2570,9 +2606,13 @@ def separable_convolution2d(
                      1, stride_h, stride_w, 1
                  ]
 
-      outputs = nn.depthwise_conv2d(inputs, depthwise_weights, strides, padding,
-                                    rate=utils.two_element_tuple(rate),
-                                    data_format=data_format)
+      outputs = nn.depthwise_conv2d(
+          inputs,
+          depthwise_weights,
+          strides,
+          padding,
+          rate=utils.two_element_tuple(rate),
+          data_format=data_format)
       num_outputs = depth_multiplier * num_filters_in
 
       if normalizer_fn is not None:
@@ -2582,13 +2622,16 @@ def separable_convolution2d(
         if biases_initializer is not None:
           biases_collections = utils.get_variable_collections(
               variables_collections, 'biases')
-          biases = variables.model_variable('biases',
-                                            shape=[num_outputs,],
-                                            dtype=dtype,
-                                            initializer=biases_initializer,
-                                            regularizer=biases_regularizer,
-                                            trainable=trainable,
-                                            collections=biases_collections)
+          biases = variables.model_variable(
+              'biases',
+              shape=[
+                  num_outputs,
+              ],
+              dtype=dtype,
+              initializer=biases_initializer,
+              regularizer=biases_regularizer,
+              trainable=trainable,
+              collections=biases_collections)
           outputs = nn.bias_add(outputs, biases, data_format=data_format)
 
     if activation_fn is not None:
@@ -2673,23 +2716,24 @@ def spatial_softmax(features,
 
     with ops.name_scope('spatial_softmax_op', 'spatial_softmax_op', [features]):
       # Create tensors for x and y coordinate values, scaled to range [-1, 1].
-      pos_x, pos_y = array_ops.meshgrid(math_ops.lin_space(-1., 1., num=height),
-                                        math_ops.lin_space(-1., 1., num=width),
-                                        indexing='ij')
+      pos_x, pos_y = array_ops.meshgrid(
+          math_ops.lin_space(-1., 1., num=height),
+          math_ops.lin_space(-1., 1., num=width),
+          indexing='ij')
       pos_x = array_ops.reshape(pos_x, [height * width])
       pos_y = array_ops.reshape(pos_y, [height * width])
-      
+
       if temperature is None:
         temp_initializer = init_ops.ones_initializer()
       else:
         temp_initializer = init_ops.constant_initializer(temperature)
-          
+
       if not trainable:
         temp_collections = None
       else:
         temp_collections = utils.get_variable_collections(
-              variables_collections, 'temperature')
-      
+            variables_collections, 'temperature')
+
       temperature = variables.model_variable(
           'temperature',
           shape=(),
@@ -2703,14 +2747,14 @@ def spatial_softmax(features,
         features = array_ops.reshape(
             array_ops.transpose(features, [0, 3, 1, 2]), [-1, height * width])
 
-      softmax_attention = nn.softmax(features/temperature)
+      softmax_attention = nn.softmax(features / temperature)
       expected_x = math_ops.reduce_sum(
           pos_x * softmax_attention, [1], keep_dims=True)
       expected_y = math_ops.reduce_sum(
           pos_y * softmax_attention, [1], keep_dims=True)
       expected_xy = array_ops.concat([expected_x, expected_y], 1)
-      feature_keypoints = array_ops.reshape(
-          expected_xy, [-1, num_channels.value * 2])
+      feature_keypoints = array_ops.reshape(expected_xy,
+                                            [-1, num_channels.value * 2])
       feature_keypoints.set_shape([None, num_channels.value * 2])
   return feature_keypoints
 
@@ -2762,7 +2806,7 @@ def stack(inputs, layer, stack_args, **kwargs):
         scope = 'stack'
     outputs = inputs
     for i in range(len(stack_args)):
-      kwargs['scope'] = scope + '_' + str(i+1)
+      kwargs['scope'] = scope + '_' + str(i + 1)
       layer_args = stack_args[i]
       if not isinstance(layer_args, (list, tuple)):
         layer_args = [layer_args]
@@ -2793,11 +2837,10 @@ def unit_norm(inputs, dim, epsilon=1e-7, scope=None):
       raise ValueError('The input rank must be known.')
     input_rank = len(inputs.get_shape().as_list())
     if dim < 0 or dim >= input_rank:
-      raise ValueError(
-          'dim must be positive but smaller than the input rank.')
+      raise ValueError('dim must be positive but smaller than the input rank.')
 
-    lengths = math_ops.sqrt(epsilon + math_ops.reduce_sum(
-        math_ops.square(inputs), dim, True))
+    lengths = math_ops.sqrt(
+        epsilon + math_ops.reduce_sum(math_ops.square(inputs), dim, True))
     multiples = []
     if dim > 0:
       multiples.append(array_ops.ones([dim], dtypes.int32))
@@ -2938,29 +2981,31 @@ def legacy_fully_connected(x,
       raise ValueError('last dimension of x must be known but is None')
     dtype = x.dtype.base_dtype
 
-    weight_collections = set(list(weight_collections or []) +
-                             [ops.GraphKeys.GLOBAL_VARIABLES])
-    w = variable_scope.get_variable('weights',
-                                    shape=[num_input_units, num_output_units],
-                                    dtype=dtype,
-                                    initializer=weight_init,
-                                    collections=weight_collections,
-                                    regularizer=weight_regularizer,
-                                    trainable=trainable)
-    x_2_dim = x if len(dims) <= 2 else array_ops.reshape(x,
-                                                         [-1, num_input_units])
+    weight_collections = set(
+        list(weight_collections or []) + [ops.GraphKeys.GLOBAL_VARIABLES])
+    w = variable_scope.get_variable(
+        'weights',
+        shape=[num_input_units, num_output_units],
+        dtype=dtype,
+        initializer=weight_init,
+        collections=weight_collections,
+        regularizer=weight_regularizer,
+        trainable=trainable)
+    x_2_dim = x if len(dims) <= 2 else array_ops.reshape(
+        x, [-1, num_input_units])
     y = standard_ops.matmul(x_2_dim, w)
 
     if bias_init is not None:
-      bias_collections = set(list(bias_collections or []) +
-                             [ops.GraphKeys.GLOBAL_VARIABLES])
-      b = variable_scope.get_variable('bias',
-                                      shape=[num_output_units],
-                                      dtype=dtype,
-                                      initializer=bias_init,
-                                      collections=bias_collections,
-                                      regularizer=bias_regularizer,
-                                      trainable=trainable)
+      bias_collections = set(
+          list(bias_collections or []) + [ops.GraphKeys.GLOBAL_VARIABLES])
+      b = variable_scope.get_variable(
+          'bias',
+          shape=[num_output_units],
+          dtype=dtype,
+          initializer=bias_init,
+          collections=bias_collections,
+          regularizer=bias_regularizer,
+          trainable=trainable)
 
       y = nn.bias_add(y, b)
 
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
index 47509ecca6..a7c97a1da2 100644
--- a/tensorflow/contrib/opt/python/training/model_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
@@ -12,30 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
-"""Wrapper optimizer for Model Average """
+"""Wrapper optimizer for Model Average."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import constant_op
-from tensorflow.python.training import optimizer
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.ops import math_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import session_run_hook
 
-GLOBAL_VARIABLE_NAME = 'global_center_variable'
+GLOBAL_VARIABLE_NAME = "global_center_variable"
 
 
 class ModelAverageCustomGetter(object):
-  """Custom_getter class is used to do:
+  """Custom_getter class is used to do.
+
   1. Change trainable variables to local collection and place them at worker
     device
   2. Generate global variables
@@ -73,15 +73,18 @@ class ModelAverageCustomGetter(object):
   def __call__(self, getter, name, trainable, collections, *args, **kwargs):
     if trainable:
       with ops.device(self._worker_device):
-        local_var = getter(name, trainable=True,
-                           collections=[ops.GraphKeys.LOCAL_VARIABLES],
-                           *args, **kwargs)
+        local_var = getter(
+            name,
+            trainable=True,
+            collections=[ops.GraphKeys.LOCAL_VARIABLES],
+            *args,
+            **kwargs)
 
       global_variable = variable_scope.variable(
-        name='%s/%s' % (GLOBAL_VARIABLE_NAME, name),
-        initial_value=local_var.initialized_value(),
-        trainable=False,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+          name="%s/%s" % (GLOBAL_VARIABLE_NAME, name),
+          initial_value=local_var.initialized_value(),
+          trainable=False,
+          collections=[ops.GraphKeys.GLOBAL_VARIABLES])
 
       self._local_2_global[local_var] = global_variable
       return local_var
@@ -91,6 +94,7 @@ class ModelAverageCustomGetter(object):
 
 class ModelAverageOptimizer(optimizer.Optimizer):
   """Wrapper optimizer that implements the Model Average algorithm.
+
   This is a sync optimizer. During the training, each worker will update
   the local variables and maintains its own local_step, which starts from 0
   and is incremented by 1 after each update of local variables. Whenever the
@@ -99,15 +103,14 @@ class ModelAverageOptimizer(optimizer.Optimizer):
   local variables will be assigned by global center variables.
   """
 
-  def __init__(
-      self,
-      opt,
-      num_worker,
-      is_chief,
-      ma_custom_getter,
-      interval_steps=100,
-      use_locking=True,
-      name="ModelAverageOptimizer"):
+  def __init__(self,
+               opt,
+               num_worker,
+               is_chief,
+               ma_custom_getter,
+               interval_steps=100,
+               use_locking=True,
+               name="ModelAverageOptimizer"):
     """Construct a new model average optimizer.
 
     Args:
@@ -124,18 +127,18 @@ class ModelAverageOptimizer(optimizer.Optimizer):
     self._opt = opt
     self._num_worker = num_worker
     self._is_chief = is_chief
-    self._local_2_global = ma_custom_getter._local_2_global
+    self._local_2_global = ma_custom_getter._local_2_global  # pylint:disable=protected-access
     self._interval_steps = interval_steps
     self._accumulator_list = []
     self._chief_init_op = None
 
     self._local_step = variable_scope.get_variable(
-      initializer=0,
-      trainable=False,
-      collections=[ops.GraphKeys.LOCAL_VARIABLES],
-      name="local_step")
+        initializer=0,
+        trainable=False,
+        collections=[ops.GraphKeys.LOCAL_VARIABLES],
+        name="local_step")
 
-    self._opt._prepare()
+    self._opt._prepare()  # pylint:disable=protected-access
 
   def compute_gradients(self, *args, **kwargs):
     """Compute gradients of "loss" for the variables in "var_list".
@@ -159,10 +162,12 @@ class ModelAverageOptimizer(optimizer.Optimizer):
 
     Returns:
       An update op
+
+    Raises:
+      ValueError: if var_list is empty.
     """
     if not var_list:
-      raise ValueError(
-        'The list of local_variables should not be empty')
+      raise ValueError("The list of local_variables should not be empty")
     update_ops = []
     global_center_vars = [self._local_2_global[var] for var in var_list]
     for lvar, gvar in zip(var_list, global_center_vars):
@@ -204,28 +209,29 @@ class ModelAverageOptimizer(optimizer.Optimizer):
     apply_updates = self._opt.apply_gradients(grads_and_vars)
     with ops.control_dependencies([apply_updates]):
       local_update = state_ops.assign_add(
-        self._local_step, 1, name='local_step_update').op
+          self._local_step, 1, name="local_step_update").op
 
     # update global variables.
-    def _Update_global_variables():
+    def _update_global_variables():  # pylint: disable=missing-docstring
       local_vars = [v for g, v in grads_and_vars if g is not None]
       global_vars = [self._local_2_global[v] for v in local_vars]
       # sync queue
       with ops.colocate_with(global_step):
-        sync_queue = data_flow_ops.FIFOQueue(-1, [dtypes.bool], shapes=[[]],
-                                             shared_name='sync_queue')
+        sync_queue = data_flow_ops.FIFOQueue(
+            -1, [dtypes.bool], shapes=[[]], shared_name="sync_queue")
       train_ops = []
       aggregated_vars = []
-      with ops.name_scope(None, self._name + '/global'):
+      with ops.name_scope(None, self._name + "/global"):
         for var, gvar in zip(local_vars, global_vars):
+          # pylint: disable=protected-access
           with ops.device(gvar.device):
             if isinstance(var._ref(), ops.Tensor):
               var_accum = data_flow_ops.ConditionalAccumulator(
-                var.dtype,
-                shape=var.get_shape(),
-                shared_name=gvar.name + "/var_accum")
+                  var.dtype,
+                  shape=var.get_shape(),
+                  shared_name=gvar.name + "/var_accum")
               train_ops.append(
-                var_accum.apply_grad(var._ref(), local_step=global_step))
+                  var_accum.apply_grad(var._ref(), local_step=global_step))
               aggregated_vars.append(var_accum.take_grad(self._num_worker))
             else:
               raise ValueError("Unknown local variable type!")
@@ -254,24 +260,26 @@ class ModelAverageOptimizer(optimizer.Optimizer):
       return local_update_op
 
     with ops.control_dependencies([local_update]):
-      condition = math_ops.equal(math_ops.mod(
-        self._local_step, self._interval_steps), 0)
+      condition = math_ops.equal(
+          math_ops.mod(self._local_step, self._interval_steps), 0)
       conditional_update = control_flow_ops.cond(
-        condition, _Update_global_variables, control_flow_ops.no_op)
+          condition, _update_global_variables, control_flow_ops.no_op)
 
     chief_init_ops = []
     for accum, dev in self._accumulator_list:
       with ops.device(dev):
         chief_init_ops.append(
-          accum.set_global_step(
-            global_step, name="SetGlobalStep"))
+            accum.set_global_step(global_step, name="SetGlobalStep"))
     self._chief_init_op = control_flow_ops.group(*(chief_init_ops))
 
     return conditional_update
 
   def get_init_op(self):
-    """Returns the op to let all the local variables equal to the global
-     variables before the training begins"""
+    """Returns the op.
+
+    This method lets all the local variables equal to the global
+    variables before the training begins.
+    """
     return self._local_vars_update(variables.trainable_variables())
 
   def make_session_run_hook(self):
@@ -279,12 +287,13 @@ class ModelAverageOptimizer(optimizer.Optimizer):
     return _ModelAverageOptimizerHook(self, self._is_chief)
 
 
-class _ModelAverageOptimizerHook(session_run_hook.SessionRunHook):
+class _ModelAverageOptimizerHook(session_run_hook.SessionRunHook):  # pylint: disable=missing-docstring
+
   def __init__(self, ma_optimizer, is_chief):
     """Creates hook to handle ModelAverageOptimizer initialization ops.
 
     Args:
-      ea_optimizer: `ModelAverageOptimizer` which this hook will initialize.
+      ma_optimizer: `ModelAverageOptimizer` which this hook will initialize.
       is_chief: `Bool`, whether is this a chief replica or not.
     """
     self._ma_optimizer = ma_optimizer
@@ -295,5 +304,5 @@ class _ModelAverageOptimizerHook(session_run_hook.SessionRunHook):
     self._global_init_op = None
     if self._is_chief:
       self._global_init_op = variables.global_variables_initializer()
-      self._chief_init_op = self._ma_optimizer._chief_init_op
+      self._chief_init_op = self._ma_optimizer._chief_init_op  # pylint: disable=protected-access
     self._variable_init_op = self._ma_optimizer.get_init_op()
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
index a73aa772bb..29ecd22839 100644
--- a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
@@ -18,18 +18,18 @@ from __future__ import division
 from __future__ import print_function
 
 import portpicker
+
+from tensorflow.contrib.opt.python.training import model_average_optimizer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import device_setter
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import server_lib
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import device_setter
-from tensorflow.contrib.opt.python.training.model_average_optimizer import \
-  ModelAverageOptimizer, ModelAverageCustomGetter, GLOBAL_VARIABLE_NAME
 
 
 def create_local_cluster(num_workers, num_ps, protocol="grpc"):
@@ -37,20 +37,20 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc"):
   worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
   ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
   cluster_dict = {
-    "worker": ["localhost:%s" % port for port in worker_ports],
-    "ps": ["localhost:%s" % port for port in ps_ports]
+      "worker": ["localhost:%s" % port for port in worker_ports],
+      "ps": ["localhost:%s" % port for port in ps_ports]
   }
   cs = server_lib.ClusterSpec(cluster_dict)
 
   workers = [
-    server_lib.Server(
-      cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
-    for ix in range(num_workers)
+      server_lib.Server(
+          cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_workers)
   ]
   ps_servers = [
-    server_lib.Server(
-      cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
-    for ix in range(num_ps)
+      server_lib.Server(
+          cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_ps)
   ]
 
   return cluster_dict, workers, ps_servers
@@ -67,16 +67,16 @@ def _get_workers(num_workers, steps, workers):
     is_chief = (worker_id == 0)
     with graph.as_default():
       worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
-      ma_coustom = ModelAverageCustomGetter(
-        worker_device=worker_device)
-      with variable_scope.variable_scope('',
-                                         custom_getter=ma_coustom), ops.device(
-        device_setter.replica_device_setter(worker_device=worker_device,
-                                            ps_device="/job:ps/task:0/cpu:0",
-                                            ps_tasks=1)):
-
-        global_step = variables.Variable(0, name='global_step',
-                                         trainable=False)
+      ma_coustom = model_average_optimizer.ModelAverageCustomGetter(
+          worker_device=worker_device)
+      with variable_scope.variable_scope(
+          "", custom_getter=ma_coustom), ops.device(
+              device_setter.replica_device_setter(
+                  worker_device=worker_device,
+                  ps_device="/job:ps/task:0/cpu:0",
+                  ps_tasks=1)):
+
+        global_step = variables.Variable(0, name="global_step", trainable=False)
         var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
         var_1 = variable_scope.get_variable(initializer=1.0, name="v1")
 
@@ -88,22 +88,20 @@ def _get_workers(num_workers, steps, workers):
           grads_0 = constant_op.constant(-2.0)
           grads_1 = constant_op.constant(-2.0)
         sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
-        opt = ModelAverageOptimizer(
-          opt=sgd_opt,
-          num_worker=num_workers,
-          ma_custom_getter=ma_coustom,
-          is_chief=is_chief,
-          interval_steps=steps
-        )
+        opt = model_average_optimizer.ModelAverageOptimizer(
+            opt=sgd_opt,
+            num_worker=num_workers,
+            ma_custom_getter=ma_coustom,
+            is_chief=is_chief,
+            interval_steps=steps)
         train_op = [
-          opt.apply_gradients(
-            [[grads_0, var_0],
-             [grads_1, var_1]], global_step)
+            opt.apply_gradients([[grads_0, var_0], [grads_1, var_1]],
+                                global_step)
         ]
       easgd_hook = opt.make_session_run_hook()
       # Creates MonitoredSession
-      sess = training.MonitoredTrainingSession(workers[worker_id].target,
-                                               hooks=[easgd_hook])
+      sess = training.MonitoredTrainingSession(
+          workers[worker_id].target, hooks=[easgd_hook])
 
     sessions.append(sess)
     graphs.append(graph)
@@ -112,6 +110,7 @@ def _get_workers(num_workers, steps, workers):
 
 
 class ModelAverageOptimizerTest(test.TestCase):
+
   def _run(self, train_op, sess):
     sess.run(train_op)
 
@@ -119,18 +118,18 @@ class ModelAverageOptimizerTest(test.TestCase):
     num_workers = 2
     steps = 2
     num_ps = 1
-    cluster, workers, _ = create_local_cluster(num_workers=num_workers,
-                                               num_ps=num_ps)
+    _, workers, _ = create_local_cluster(
+        num_workers=num_workers, num_ps=num_ps)
 
-    sessions, graphs, train_ops = _get_workers(num_workers,
-                                               steps,
-                                               workers)
+    sessions, graphs, train_ops = _get_workers(num_workers, steps, workers)
 
-    var_0 = graphs[0].get_tensor_by_name('v0:0')
-    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    var_0 = graphs[0].get_tensor_by_name("v0:0")
+    var_1 = graphs[0].get_tensor_by_name("v1:0")
     global_step = training_util.get_global_step(graphs[0])
-    global_var_0 = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
-    global_var_1 = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
+    global_var_0 = graphs[0].get_tensor_by_name(
+        model_average_optimizer.GLOBAL_VARIABLE_NAME + "/v0:0")
+    global_var_1 = graphs[0].get_tensor_by_name(
+        model_average_optimizer.GLOBAL_VARIABLE_NAME + "/v1:0")
 
     # Verify the initialized value.
     self.assertAllEqual(0.0, sessions[0].run(var_0))
@@ -150,9 +149,9 @@ class ModelAverageOptimizerTest(test.TestCase):
 
     # iteration 2, global varibale update
     thread_0 = self.checkedThread(
-      target=self._run, args=(train_ops[0], sessions[0]))
+        target=self._run, args=(train_ops[0], sessions[0]))
     thread_1 = self.checkedThread(
-      target=self._run, args=(train_ops[1], sessions[1]))
+        target=self._run, args=(train_ops[1], sessions[1]))
     thread_0.start()
     thread_1.start()
     thread_0.join()
@@ -175,20 +174,20 @@ class ModelAverageOptimizerTest(test.TestCase):
 
   def testPS2TasksWithClusterSpecClass(self):
     cluster_spec = server_lib.ClusterSpec({
-      "ps": ["ps0:2222", "ps1:2222"],
-      "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
     })
     worker_device = "/job:worker/task:0"
-    ma_coustom = ModelAverageCustomGetter(
-      worker_device=worker_device)
+    ma_coustom = model_average_optimizer.ModelAverageCustomGetter(
+        worker_device=worker_device)
     from tensorflow.python.training import device_setter
     with ops.device(
         device_setter.replica_device_setter(cluster=cluster_spec,
                                             worker_device=worker_device,
                                             ps_device="/job:ps")), \
-         variable_scope.variable_scope('', custom_getter=ma_coustom):
+         variable_scope.variable_scope("", custom_getter=ma_coustom):
       v = variable_scope.get_variable(initializer=[1, 2], name="v")
-      w = variable_scope.get_variable(initializer=[2, 1], name='w')
+      w = variable_scope.get_variable(initializer=[2, 1], name="w")
       v_g, w_g = ma_coustom._local_2_global[v], ma_coustom._local_2_global[w]
       self.assertDeviceEqual("/job:worker/task:0", v.device)
       self.assertDeviceEqual("job:ps/task:0", v_g.device)
@@ -196,5 +195,5 @@ class ModelAverageOptimizerTest(test.TestCase):
       self.assertDeviceEqual("job:ps/task:1", w_g.device)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
index 30a2077570..a25de55e18 100644
--- a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
+++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
@@ -53,12 +53,11 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase):
 
   def testPeriodicResampleBasic3D(self):
 
-    input_tensor = numpy.arange(2*2*4).reshape((2, 2, 4))
+    input_tensor = numpy.arange(2 * 2 * 4).reshape((2, 2, 4))
     desired_shape = numpy.array([4, 4, None])
-    output_tensor = numpy.array([[[0], [2], [4], [6]],
-                                 [[1], [3], [5], [7]],
-                                 [[8], [10], [12], [14]],
-                                 [[9], [11], [13], [15]]])
+    output_tensor = numpy.array([[[0], [2], [4], [6]], [[1], [3], [5], [7]],
+                                 [[8], [10], [12], [14]], [[9], [11], [13],
+                                                           [15]]])
 
     # NOTE: output_tensor != input_tensor.reshape((4, 4, -1))
     with self.test_session():
@@ -72,24 +71,18 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase):
 
   def testPeriodicResampleBasic4D(self):
 
-    input_tensor = numpy.arange(2*2*2*8).reshape((2, 2, 2, 8))
+    input_tensor = numpy.arange(2 * 2 * 2 * 8).reshape((2, 2, 2, 8))
     desired_shape = numpy.array([4, 4, 4, None])
-    output_tensor = numpy.array([[[[0], [4], [8], [12]],
-                                  [[2], [6], [10], [14]],
-                                  [[16], [20], [24], [28]],
-                                  [[18], [22], [26], [30]]],
-                                 [[[1], [5], [9], [13]],
-                                  [[3], [7], [11], [15]],
-                                  [[17], [21], [25], [29]],
-                                  [[19], [23], [27], [31]]],
-                                 [[[32], [36], [40], [44]],
-                                  [[34], [38], [42], [46]],
-                                  [[48], [52], [56], [60]],
-                                  [[50], [54], [58], [62]]],
-                                 [[[33], [37], [41], [45]],
-                                  [[35], [39], [43], [47]],
-                                  [[49], [53], [57], [61]],
-                                  [[51], [55], [59], [63]]]])
+    output_tensor = numpy.array(
+        [[[[0], [4], [8], [12]], [[2], [6], [10], [14]],
+          [[16], [20], [24], [28]], [[18], [22], [26], [30]]],
+         [[[1], [5], [9], [13]], [[3], [7], [11], [15]], [[17], [21], [25],
+                                                          [29]],
+          [[19], [23], [27],
+           [31]]], [[[32], [36], [40], [44]], [[34], [38], [42], [46]],
+                    [[48], [52], [56], [60]], [[50], [54], [58], [62]]],
+         [[[33], [37], [41], [45]], [[35], [39], [43], [47]],
+          [[49], [53], [57], [61]], [[51], [55], [59], [63]]]])
 
     # NOTE: output_tensor != input_tensor.reshape((4, 4, 4, -1))
     with self.test_session():
@@ -111,5 +104,5 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase):
         periodic_resample(input_tensor, [None, 4, 4]).eval()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 70aaba1728..c780e85d72 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -53,14 +53,12 @@ class RNNCellTest(test.TestCase):
       batch_size = 3
       input_size = 4
       expected_output = np.array(
-          [[0.121753, 0.121753],
-           [0.103349, 0.103349],
-           [0.100178, 0.100178]],
+          [[0.121753, 0.121753], [0.103349, 0.103349], [0.100178, 0.100178]],
           dtype=np.float32)
       expected_state = np.array(
-          [[0.137523, 0.137523, 0.121753, 0.121753],
-           [0.105450, 0.105450, 0.103349, 0.103349],
-           [0.100742, 0.100742, 0.100178, 0.100178]],
+          [[0.137523, 0.137523, 0.121753, 0.121753], [
+              0.105450, 0.105450, 0.103349, 0.103349
+          ], [0.100742, 0.100742, 0.100178, 0.100178]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -69,14 +67,14 @@ class RNNCellTest(test.TestCase):
         output, state = contrib_rnn_cell.CoupledInputForgetGateLSTMCell(
             num_units=num_units, forget_bias=1.0, state_is_tuple=False)(x, m)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            x.name:
-                np.array([[1., 1., 1., 1.],
-                          [2., 2., 2., 2.],
-                          [3., 3., 3., 3.]]),
-            m.name:
-                0.1 * np.ones((batch_size, state_size))
-        })
+        res = sess.run(
+            [output, state], {
+                x.name:
+                    np.array([[1., 1., 1., 1.], [2., 2., 2., 2.],
+                              [3., 3., 3., 3.]]),
+                m.name:
+                    0.1 * np.ones((batch_size, state_size))
+            })
         # This is a smoke test: Only making sure expected values didn't change.
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], expected_output)
@@ -101,14 +99,14 @@ class RNNCellTest(test.TestCase):
             frequency_skip=frequency_skip,
             forget_bias=1.0)(x, m)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            x.name:
-                np.array([[1., 1., 1., 1.],
-                          [2., 2., 2., 2.],
-                          [3., 3., 3., 3.]]),
-            m.name:
-                0.1 * np.ones((batch_size, int(state_size * (num_shifts))))
-        })
+        res = sess.run(
+            [output, state], {
+                x.name:
+                    np.array([[1., 1., 1., 1.], [2., 2., 2., 2.],
+                              [3., 3., 3., 3.]]),
+                m.name:
+                    0.1 * np.ones((batch_size, int(state_size * (num_shifts))))
+            })
         self.assertEqual(len(res), 2)
         # The numbers in results were not calculated, this is mostly just a
         # smoke test.
@@ -141,17 +139,14 @@ class RNNCellTest(test.TestCase):
             state_is_tuple=True)
         inputs = constant_op.constant(
             np.array(
-                [[1., 1., 1., 1.],
-                 [2., 2., 2., 2.],
-                 [3., 3., 3., 3.]],
+                [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
                 dtype=np.float32),
             dtype=dtypes.float32)
         state_value = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
-        init_state = cell.state_tuple_type(
-            *([state_value, state_value] * num_shifts))
+        init_state = cell.state_tuple_type(*(
+            [state_value, state_value] * num_shifts))
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
         res = sess.run([output, state])
@@ -198,11 +193,10 @@ class RNNCellTest(test.TestCase):
                 dtype=np.float32),
             dtype=dtypes.float32)
         state_value = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
-        init_state = cell.state_tuple_type(
-            *([state_value, state_value] * total_blocks))
+        init_state = cell.state_tuple_type(*(
+            [state_value, state_value] * total_blocks))
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
         res = sess.run([output, state])
@@ -230,20 +224,28 @@ class RNNCellTest(test.TestCase):
     frequency_skip = 1
     num_shifts = int((input_size - feature_size) / frequency_skip + 1)
     expected_output = np.array(
-        [[0.416383, 0.416383, 0.403238, 0.403238, 0.524020, 0.524020,
-          0.565425, 0.565425, 0.557865, 0.557865, 0.609699, 0.609699],
-         [0.627331, 0.627331, 0.622393, 0.622393, 0.688342, 0.688342,
-          0.708078, 0.708078, 0.694245, 0.694245, 0.715171, 0.715171],
-         [0.711050, 0.711050, 0.709197, 0.709197, 0.736533, 0.736533,
-          0.744264, 0.744264, 0.737390, 0.737390, 0.745250, 0.745250]],
+        [[
+            0.416383, 0.416383, 0.403238, 0.403238, 0.524020, 0.524020,
+            0.565425, 0.565425, 0.557865, 0.557865, 0.609699, 0.609699
+        ], [
+            0.627331, 0.627331, 0.622393, 0.622393, 0.688342, 0.688342,
+            0.708078, 0.708078, 0.694245, 0.694245, 0.715171, 0.715171
+        ], [
+            0.711050, 0.711050, 0.709197, 0.709197, 0.736533, 0.736533,
+            0.744264, 0.744264, 0.737390, 0.737390, 0.745250, 0.745250
+        ]],
         dtype=np.float32)
     expected_state = np.array(
-        [[0.625556, 0.625556, 0.416383, 0.416383, 0.759134, 0.759134,
-          0.524020, 0.524020, 0.798795, 0.798795, 0.557865, 0.557865],
-         [0.875488, 0.875488, 0.627331, 0.627331, 0.936432, 0.936432,
-          0.688342, 0.688342, 0.941961, 0.941961, 0.694245, 0.694245],
-         [0.957327, 0.957327, 0.711050, 0.711050, 0.979522, 0.979522,
-          0.736533, 0.736533, 0.980245, 0.980245, 0.737390, 0.737390]],
+        [[
+            0.625556, 0.625556, 0.416383, 0.416383, 0.759134, 0.759134,
+            0.524020, 0.524020, 0.798795, 0.798795, 0.557865, 0.557865
+        ], [
+            0.875488, 0.875488, 0.627331, 0.627331, 0.936432, 0.936432,
+            0.688342, 0.688342, 0.941961, 0.941961, 0.694245, 0.694245
+        ], [
+            0.957327, 0.957327, 0.711050, 0.711050, 0.979522, 0.979522,
+            0.736533, 0.736533, 0.980245, 0.980245, 0.737390, 0.737390
+        ]],
         dtype=np.float32)
     for state_is_tuple in [False, True]:
       with self.test_session() as sess:
@@ -259,18 +261,16 @@ class RNNCellTest(test.TestCase):
               couple_input_forget_gates=True,
               state_is_tuple=state_is_tuple)
           inputs = constant_op.constant(
-              np.array([[1., 1., 1., 1.],
-                        [2., 2., 2., 2.],
-                        [3., 3., 3., 3.]],
-                       dtype=np.float32),
+              np.array(
+                  [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+                  dtype=np.float32),
               dtype=dtypes.float32)
           if state_is_tuple:
             state_value = constant_op.constant(
-                0.1 * np.ones(
-                    (batch_size, num_units), dtype=np.float32),
+                0.1 * np.ones((batch_size, num_units), dtype=np.float32),
                 dtype=dtypes.float32)
-            init_state = cell.state_tuple_type(
-                *([state_value, state_value] * num_shifts))
+            init_state = cell.state_tuple_type(*(
+                [state_value, state_value] * num_shifts))
           else:
             init_state = constant_op.constant(
                 0.1 * np.ones(
@@ -302,32 +302,40 @@ class RNNCellTest(test.TestCase):
       frequency_skip = 1
       num_shifts = int((input_size - feature_size) / frequency_skip + 1)
       expected_output = np.array(
-          [[0.464130, 0.464130, 0.419165, 0.419165, 0.593283, 0.593283,
-            0.738350, 0.738350, 0.661638, 0.661638, 0.866774, 0.866774,
-            0.520789, 0.520789, 0.476968, 0.476968, 0.604341, 0.604341,
-            0.760207, 0.760207, 0.635773, 0.635773, 0.850218, 0.850218],
-           [0.669636, 0.669636, 0.628966, 0.628966, 0.736057, 0.736057,
-            0.895927, 0.895927, 0.755559, 0.755559, 0.954359, 0.954359,
-            0.692621, 0.692621, 0.652363, 0.652363, 0.737517, 0.737517,
-            0.899558, 0.899558, 0.745984, 0.745984, 0.946840, 0.946840],
-           [0.751109, 0.751109, 0.711716, 0.711716, 0.778357, 0.778357,
-            0.940779, 0.940779, 0.784530, 0.784530, 0.980604, 0.980604,
-            0.759940, 0.759940, 0.720652, 0.720652, 0.778552, 0.778552,
-            0.941606, 0.941606, 0.781035, 0.781035, 0.977731, 0.977731]],
+          [[
+              0.464130, 0.464130, 0.419165, 0.419165, 0.593283, 0.593283,
+              0.738350, 0.738350, 0.661638, 0.661638, 0.866774, 0.866774,
+              0.520789, 0.520789, 0.476968, 0.476968, 0.604341, 0.604341,
+              0.760207, 0.760207, 0.635773, 0.635773, 0.850218, 0.850218
+          ], [
+              0.669636, 0.669636, 0.628966, 0.628966, 0.736057, 0.736057,
+              0.895927, 0.895927, 0.755559, 0.755559, 0.954359, 0.954359,
+              0.692621, 0.692621, 0.652363, 0.652363, 0.737517, 0.737517,
+              0.899558, 0.899558, 0.745984, 0.745984, 0.946840, 0.946840
+          ], [
+              0.751109, 0.751109, 0.711716, 0.711716, 0.778357, 0.778357,
+              0.940779, 0.940779, 0.784530, 0.784530, 0.980604, 0.980604,
+              0.759940, 0.759940, 0.720652, 0.720652, 0.778552, 0.778552,
+              0.941606, 0.941606, 0.781035, 0.781035, 0.977731, 0.977731
+          ]],
           dtype=np.float32)
       expected_state = np.array(
-          [[0.710660, 0.710660, 0.464130, 0.464130, 0.877293, 0.877293,
-            0.593283, 0.593283, 0.958505, 0.958505, 0.661638, 0.661638,
-            0.785405, 0.785405, 0.520789, 0.520789, 0.890836, 0.890836,
-            0.604341, 0.604341, 0.928512, 0.928512, 0.635773, 0.635773],
-           [0.967579, 0.967579, 0.669636, 0.669636, 1.038811, 1.038811,
-            0.736057, 0.736057, 1.058201, 1.058201, 0.755559, 0.755559,
-            0.993088, 0.993088, 0.692621, 0.692621, 1.040288, 1.040288,
-            0.737517, 0.737517, 1.048773, 1.048773, 0.745984, 0.745984],
-           [1.053842, 1.053842, 0.751109, 0.751109, 1.079919, 1.079919,
-            0.778357, 0.778357, 1.085620, 1.085620, 0.784530, 0.784530,
-            1.062455, 1.062455, 0.759940, 0.759940, 1.080101, 1.080101,
-            0.778552, 0.778552, 1.082402, 1.082402, 0.781035, 0.781035]],
+          [[
+              0.710660, 0.710660, 0.464130, 0.464130, 0.877293, 0.877293,
+              0.593283, 0.593283, 0.958505, 0.958505, 0.661638, 0.661638,
+              0.785405, 0.785405, 0.520789, 0.520789, 0.890836, 0.890836,
+              0.604341, 0.604341, 0.928512, 0.928512, 0.635773, 0.635773
+          ], [
+              0.967579, 0.967579, 0.669636, 0.669636, 1.038811, 1.038811,
+              0.736057, 0.736057, 1.058201, 1.058201, 0.755559, 0.755559,
+              0.993088, 0.993088, 0.692621, 0.692621, 1.040288, 1.040288,
+              0.737517, 0.737517, 1.048773, 1.048773, 0.745984, 0.745984
+          ], [
+              1.053842, 1.053842, 0.751109, 0.751109, 1.079919, 1.079919,
+              0.778357, 0.778357, 1.085620, 1.085620, 0.784530, 0.784530,
+              1.062455, 1.062455, 0.759940, 0.759940, 1.080101, 1.080101,
+              0.778552, 0.778552, 1.082402, 1.082402, 0.781035, 0.781035
+          ]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -339,17 +347,16 @@ class RNNCellTest(test.TestCase):
             forget_bias=1.0,
             num_frequency_blocks=[num_shifts])
         inputs = constant_op.constant(
-            np.array([[1.0, 1.1, 1.2, 1.3],
-                      [2.0, 2.1, 2.2, 2.3],
-                      [3.0, 3.1, 3.2, 3.3]],
-                     dtype=np.float32),
+            np.array(
+                [[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3],
+                 [3.0, 3.1, 3.2, 3.3]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         state_value = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
-        init_state = cell.state_tuple_type(
-            *([state_value, state_value] * num_shifts * 2))
+        init_state = cell.state_tuple_type(*(
+            [state_value, state_value] * num_shifts * 2))
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
         res = sess.run([output, state])
@@ -375,32 +382,40 @@ class RNNCellTest(test.TestCase):
       frequency_skip = 1
       num_shifts = int((input_size - feature_size) / frequency_skip + 1)
       expected_output = np.array(
-          [[0.464130, 0.464130, 0.419165, 0.419165, 0.593283, 0.593283,
-            0.738350, 0.738350, 0.661638, 0.661638, 0.866774, 0.866774,
-            0.322645, 0.322645, 0.276068, 0.276068, 0.584654, 0.584654,
-            0.690292, 0.690292, 0.640446, 0.640446, 0.840071, 0.840071],
-           [0.669636, 0.669636, 0.628966, 0.628966, 0.736057, 0.736057,
-            0.895927, 0.895927, 0.755559, 0.755559, 0.954359, 0.954359,
-            0.493625, 0.493625, 0.449236, 0.449236, 0.730828, 0.730828,
-            0.865996, 0.865996, 0.749429, 0.749429, 0.944958, 0.944958],
-           [0.751109, 0.751109, 0.711716, 0.711716, 0.778357, 0.778357,
-            0.940779, 0.940779, 0.784530, 0.784530, 0.980604, 0.980604,
-            0.608587, 0.608587, 0.566683, 0.566683, 0.777345, 0.777345,
-            0.925820, 0.925820, 0.782597, 0.782597, 0.976858, 0.976858]],
+          [[
+              0.464130, 0.464130, 0.419165, 0.419165, 0.593283, 0.593283,
+              0.738350, 0.738350, 0.661638, 0.661638, 0.866774, 0.866774,
+              0.322645, 0.322645, 0.276068, 0.276068, 0.584654, 0.584654,
+              0.690292, 0.690292, 0.640446, 0.640446, 0.840071, 0.840071
+          ], [
+              0.669636, 0.669636, 0.628966, 0.628966, 0.736057, 0.736057,
+              0.895927, 0.895927, 0.755559, 0.755559, 0.954359, 0.954359,
+              0.493625, 0.493625, 0.449236, 0.449236, 0.730828, 0.730828,
+              0.865996, 0.865996, 0.749429, 0.749429, 0.944958, 0.944958
+          ], [
+              0.751109, 0.751109, 0.711716, 0.711716, 0.778357, 0.778357,
+              0.940779, 0.940779, 0.784530, 0.784530, 0.980604, 0.980604,
+              0.608587, 0.608587, 0.566683, 0.566683, 0.777345, 0.777345,
+              0.925820, 0.925820, 0.782597, 0.782597, 0.976858, 0.976858
+          ]],
           dtype=np.float32)
       expected_state = np.array(
-          [[0.710660, 0.710660, 0.464130, 0.464130, 0.877293, 0.877293,
-            0.593283, 0.593283, 0.958505, 0.958505, 0.661638, 0.661638,
-            0.516575, 0.516575, 0.322645, 0.322645, 0.866628, 0.866628,
-            0.584654, 0.584654, 0.934002, 0.934002, 0.640446, 0.640446],
-           [0.967579, 0.967579, 0.669636, 0.669636, 1.038811, 1.038811,
-            0.736057, 0.736057, 1.058201, 1.058201, 0.755559, 0.755559,
-            0.749836, 0.749836, 0.493625, 0.493625, 1.033488, 1.033488,
-            0.730828, 0.730828, 1.052186, 1.052186, 0.749429, 0.749429],
-           [1.053842, 1.053842, 0.751109, 0.751109, 1.079919, 1.079919,
-            0.778357, 0.778357, 1.085620, 1.085620, 0.784530, 0.784530,
-            0.895999, 0.895999, 0.608587, 0.608587, 1.078978, 1.078978,
-            0.777345, 0.777345, 1.083843, 1.083843, 0.782597, 0.782597]],
+          [[
+              0.710660, 0.710660, 0.464130, 0.464130, 0.877293, 0.877293,
+              0.593283, 0.593283, 0.958505, 0.958505, 0.661638, 0.661638,
+              0.516575, 0.516575, 0.322645, 0.322645, 0.866628, 0.866628,
+              0.584654, 0.584654, 0.934002, 0.934002, 0.640446, 0.640446
+          ], [
+              0.967579, 0.967579, 0.669636, 0.669636, 1.038811, 1.038811,
+              0.736057, 0.736057, 1.058201, 1.058201, 0.755559, 0.755559,
+              0.749836, 0.749836, 0.493625, 0.493625, 1.033488, 1.033488,
+              0.730828, 0.730828, 1.052186, 1.052186, 0.749429, 0.749429
+          ], [
+              1.053842, 1.053842, 0.751109, 0.751109, 1.079919, 1.079919,
+              0.778357, 0.778357, 1.085620, 1.085620, 0.784530, 0.784530,
+              0.895999, 0.895999, 0.608587, 0.608587, 1.078978, 1.078978,
+              0.777345, 0.777345, 1.083843, 1.083843, 0.782597, 0.782597
+          ]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -413,17 +428,16 @@ class RNNCellTest(test.TestCase):
             num_frequency_blocks=[num_shifts],
             backward_slice_offset=1)
         inputs = constant_op.constant(
-            np.array([[1.0, 1.1, 1.2, 1.3],
-                      [2.0, 2.1, 2.2, 2.3],
-                      [3.0, 3.1, 3.2, 3.3]],
-                     dtype=np.float32),
+            np.array(
+                [[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3],
+                 [3.0, 3.1, 3.2, 3.3]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         state_value = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
-        init_state = cell.state_tuple_type(
-            *([state_value, state_value] * num_shifts * 2))
+        init_state = cell.state_tuple_type(*(
+            [state_value, state_value] * num_shifts * 2))
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
         res = sess.run([output, state])
@@ -474,8 +488,8 @@ class RNNCellTest(test.TestCase):
     for state_is_tuple in [False, True]:
       with ops.Graph().as_default():
         with self.test_session() as sess:
-          with variable_scope.variable_scope("state_is_tuple_" + str(
-              state_is_tuple)):
+          with variable_scope.variable_scope(
+              "state_is_tuple_" + str(state_is_tuple)):
             lstm_cell = rnn_cell.BasicLSTMCell(
                 num_units, state_is_tuple=state_is_tuple)
             cell = contrib_rnn_cell.AttentionCellWrapper(
@@ -525,16 +539,15 @@ class RNNCellTest(test.TestCase):
     for state_is_tuple in [False, True]:
       with ops.Graph().as_default():
         with self.test_session() as sess:
-          with variable_scope.variable_scope("state_is_tuple_" + str(
-              state_is_tuple)):
+          with variable_scope.variable_scope(
+              "state_is_tuple_" + str(state_is_tuple)):
             lstm_cell = rnn_cell.BasicLSTMCell(
                 num_units, state_is_tuple=state_is_tuple)
             cell = contrib_rnn_cell.AttentionCellWrapper(
                 lstm_cell, attn_length, state_is_tuple=state_is_tuple)
             if state_is_tuple:
               zeros = constant_op.constant(
-                  0.1 * np.ones(
-                      [batch_size, num_units], dtype=np.float32),
+                  0.1 * np.ones([batch_size, num_units], dtype=np.float32),
                   dtype=dtypes.float32)
               attn_state_zeros = constant_op.constant(
                   0.1 * np.ones(
@@ -579,22 +592,25 @@ class RNNCellTest(test.TestCase):
          [1.018088, 0.378983, -0.572179, 0.268591]],
         dtype=np.float32)
     expected_state = np.array(
-        [[0.74946702, 0.34681597, 0.26474735, 1.06485605, 0.38465962,
-          0.11420801, 0.10272158, 0.30925757, 0.63899988, 0.7181077,
-          0.47534478, 0.33715725, 0.58086717, 0.49446869, 0.7641536,
-          0.12814975, 0.92231739, 0.89857256, 0.21889746, 0.38442063,
-          0.53481543, 0.8876909, 0.45823169, 0.5905602, 0.78038228,
-          0.56501579, 0.03971386, 0.09870267, 0.8074435, 0.66821432,
-          0.99211812, 0.12295902, 1.14606023, 0.34370938, -0.79251152,
-          0.51843399],
-         [0.5179342, 0.48682183, -0.25426468, 0.96810579, 0.28809637,
-          0.13607743, -0.11446252, 0.26792109, 0.78047138, 0.63460857,
-          0.49122369, 0.52007174, 0.73000264, 0.66986895, 0.73576689,
-          0.86301267, 0.87887371, 0.35185754, 0.93417215, 0.64732957,
-          0.63173044, 0.66627824, 0.53644657, 0.20477486, 0.98458421,
-          0.38277245, 0.03746676, 0.92510188, 0.57714164, 0.84932971,
-          0.36127412, 0.12125921, 1.1362772, 0.34361625, -0.78150457,
-          0.70582712]],
+        [[
+            0.74946702, 0.34681597, 0.26474735, 1.06485605, 0.38465962,
+            0.11420801, 0.10272158, 0.30925757, 0.63899988, 0.7181077,
+            0.47534478, 0.33715725, 0.58086717, 0.49446869, 0.7641536,
+            0.12814975, 0.92231739, 0.89857256, 0.21889746, 0.38442063,
+            0.53481543, 0.8876909, 0.45823169, 0.5905602, 0.78038228,
+            0.56501579, 0.03971386, 0.09870267, 0.8074435, 0.66821432,
+            0.99211812, 0.12295902, 1.14606023, 0.34370938, -0.79251152,
+            0.51843399
+        ], [
+            0.5179342, 0.48682183, -0.25426468, 0.96810579, 0.28809637,
+            0.13607743, -0.11446252, 0.26792109, 0.78047138, 0.63460857,
+            0.49122369, 0.52007174, 0.73000264, 0.66986895, 0.73576689,
+            0.86301267, 0.87887371, 0.35185754, 0.93417215, 0.64732957,
+            0.63173044, 0.66627824, 0.53644657, 0.20477486, 0.98458421,
+            0.38277245, 0.03746676, 0.92510188, 0.57714164, 0.84932971,
+            0.36127412, 0.12125921, 1.1362772, 0.34361625, -0.78150457,
+            0.70582712
+        ]],
         dtype=np.float32)
     seed = 12345
     random_seed.set_random_seed(seed)
@@ -602,7 +618,8 @@ class RNNCellTest(test.TestCase):
     for state_is_tuple in [False, True]:
       with session.Session() as sess:
         with variable_scope.variable_scope(
-            "state_is_tuple", reuse=state_is_tuple,
+            "state_is_tuple",
+            reuse=state_is_tuple,
             initializer=init_ops.glorot_uniform_initializer()):
           lstm_cell = rnn_cell.BasicLSTMCell(
               num_units, state_is_tuple=state_is_tuple)
@@ -646,36 +663,31 @@ class RNNCellTest(test.TestCase):
   def testNASCell(self):
     num_units = 6
     batch_size = 3
-    expected_output = np.array([[0.576751, 0.576751, 0.576751, 0.576751,
-                                 0.576751, 0.576751],
-                                [0.618936, 0.618936, 0.618936, 0.618936,
-                                 0.618936, 0.618936],
-                                [0.627393, 0.627393, 0.627393, 0.627393,
-                                 0.627393, 0.627393]])
-    expected_state = np.array([[0.71579772, 0.71579772, 0.71579772, 0.71579772,
-                                0.71579772, 0.71579772, 0.57675087, 0.57675087,
-                                0.57675087, 0.57675087, 0.57675087, 0.57675087],
-                               [0.78041625, 0.78041625, 0.78041625, 0.78041625,
-                                0.78041625, 0.78041625, 0.6189357, 0.6189357,
-                                0.61893570, 0.6189357, 0.6189357, 0.6189357],
-                               [0.79457647, 0.79457647, 0.79457647, 0.79457647,
-                                0.79457653, 0.79457653, 0.62739348, 0.62739348,
-                                0.62739348, 0.62739348, 0.62739348, 0.62739348]
-                              ])
+    expected_output = np.array(
+        [[0.576751, 0.576751, 0.576751, 0.576751, 0.576751, 0.576751],
+         [0.618936, 0.618936, 0.618936, 0.618936, 0.618936, 0.618936],
+         [0.627393, 0.627393, 0.627393, 0.627393, 0.627393, 0.627393]])
+    expected_state = np.array([[
+        0.71579772, 0.71579772, 0.71579772, 0.71579772, 0.71579772, 0.71579772,
+        0.57675087, 0.57675087, 0.57675087, 0.57675087, 0.57675087, 0.57675087
+    ], [
+        0.78041625, 0.78041625, 0.78041625, 0.78041625, 0.78041625, 0.78041625,
+        0.6189357, 0.6189357, 0.61893570, 0.6189357, 0.6189357, 0.6189357
+    ], [
+        0.79457647, 0.79457647, 0.79457647, 0.79457647, 0.79457653, 0.79457653,
+        0.62739348, 0.62739348, 0.62739348, 0.62739348, 0.62739348, 0.62739348
+    ]])
     with self.test_session() as sess:
       with variable_scope.variable_scope(
-          "nas_test",
-          initializer=init_ops.constant_initializer(0.5)):
+          "nas_test", initializer=init_ops.constant_initializer(0.5)):
         cell = contrib_rnn_cell.NASCell(num_units=num_units)
         inputs = constant_op.constant(
-            np.array([[1., 1., 1., 1.],
-                      [2., 2., 2., 2.],
-                      [3., 3., 3., 3.]],
-                     dtype=np.float32),
+            np.array(
+                [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         state_value = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
         init_state = rnn_cell.LSTMStateTuple(state_value, state_value)
         output, state = cell(inputs, init_state)
@@ -699,39 +711,34 @@ class RNNCellTest(test.TestCase):
     num_units = 6
     batch_size = 3
     num_proj = 5
-    expected_output = np.array([[1.697418, 1.697418, 1.697418, 1.697418,
-                                 1.697418],
-                                [1.840037, 1.840037, 1.840037, 1.840037,
-                                 1.840037],
-                                [1.873985, 1.873985, 1.873985, 1.873985,
-                                 1.873985]])
-    expected_state = np.array([[0.69855207, 0.69855207, 0.69855207, 0.69855207,
-                                0.69855207, 0.69855207, 1.69741797, 1.69741797,
-                                1.69741797, 1.69741797, 1.69741797],
-                               [0.77073824, 0.77073824, 0.77073824, 0.77073824,
-                                0.77073824, 0.77073824, 1.84003687, 1.84003687,
-                                1.84003687, 1.84003687, 1.84003687],
-                               [0.78973997, 0.78973997, 0.78973997, 0.78973997,
-                                0.78973997, 0.78973997, 1.87398517, 1.87398517,
-                                1.87398517, 1.87398517, 1.87398517]])
+    expected_output = np.array(
+        [[1.697418, 1.697418, 1.697418, 1.697418,
+          1.697418], [1.840037, 1.840037, 1.840037, 1.840037, 1.840037],
+         [1.873985, 1.873985, 1.873985, 1.873985, 1.873985]])
+    expected_state = np.array([[
+        0.69855207, 0.69855207, 0.69855207, 0.69855207, 0.69855207, 0.69855207,
+        1.69741797, 1.69741797, 1.69741797, 1.69741797, 1.69741797
+    ], [
+        0.77073824, 0.77073824, 0.77073824, 0.77073824, 0.77073824, 0.77073824,
+        1.84003687, 1.84003687, 1.84003687, 1.84003687, 1.84003687
+    ], [
+        0.78973997, 0.78973997, 0.78973997, 0.78973997, 0.78973997, 0.78973997,
+        1.87398517, 1.87398517, 1.87398517, 1.87398517, 1.87398517
+    ]])
     with self.test_session() as sess:
       with variable_scope.variable_scope(
-          "nas_proj_test",
-          initializer=init_ops.constant_initializer(0.5)):
+          "nas_proj_test", initializer=init_ops.constant_initializer(0.5)):
         cell = contrib_rnn_cell.NASCell(num_units=num_units, num_proj=num_proj)
         inputs = constant_op.constant(
-            np.array([[1., 1., 1., 1.],
-                      [2., 2., 2., 2.],
-                      [3., 3., 3., 3.]],
-                     dtype=np.float32),
+            np.array(
+                [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         state_value_c = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
         state_value_h = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_proj), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_proj), dtype=np.float32),
             dtype=dtypes.float32)
         init_state = rnn_cell.LSTMStateTuple(state_value_c, state_value_h)
         output, state = cell(inputs, init_state)
@@ -755,24 +762,20 @@ class RNNCellTest(test.TestCase):
     num_units = 2
     batch_size = 3
     expected_state_and_output = np.array(
-        [[0.13752282, 0.13752282],
-         [0.10545051, 0.10545051],
+        [[0.13752282, 0.13752282], [0.10545051, 0.10545051],
          [0.10074195, 0.10074195]],
         dtype=np.float32)
     with self.test_session() as sess:
       with variable_scope.variable_scope(
-          "ugrnn_cell_test",
-          initializer=init_ops.constant_initializer(0.5)):
+          "ugrnn_cell_test", initializer=init_ops.constant_initializer(0.5)):
         cell = contrib_rnn_cell.UGRNNCell(num_units=num_units)
         inputs = constant_op.constant(
-            np.array([[1., 1., 1., 1.],
-                      [2., 2., 2., 2.],
-                      [3., 3., 3., 3.]],
-                     dtype=np.float32),
+            np.array(
+                [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         init_state = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
@@ -786,13 +789,11 @@ class RNNCellTest(test.TestCase):
     num_units = 2
     batch_size = 3
     expected_state = np.array(
-        [[0.13752282, 0.13752282],
-         [0.10545051, 0.10545051],
+        [[0.13752282, 0.13752282], [0.10545051, 0.10545051],
          [0.10074195, 0.10074195]],
         dtype=np.float32)
     expected_output = np.array(
-        [[2.00431061, 2.00431061],
-         [4.00060606, 4.00060606],
+        [[2.00431061, 2.00431061], [4.00060606, 4.00060606],
          [6.00008249, 6.00008249]],
         dtype=np.float32)
     with self.test_session() as sess:
@@ -802,14 +803,12 @@ class RNNCellTest(test.TestCase):
         cell = contrib_rnn_cell.IntersectionRNNCell(
             num_units=num_units, num_in_proj=num_units)
         inputs = constant_op.constant(
-            np.array([[1., 1., 1., 1.],
-                      [2., 2., 2., 2.],
-                      [3., 3., 3., 3.]],
-                     dtype=np.float32),
+            np.array(
+                [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         init_state = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
@@ -824,19 +823,17 @@ class RNNCellTest(test.TestCase):
     batch_size = 3
     cell = contrib_rnn_cell.IntersectionRNNCell(num_units=num_units)
     inputs = constant_op.constant(
-        np.array([[1., 1., 1., 1.],
-                  [2., 2., 2., 2.],
-                  [3., 3., 3., 3.]],
-                 dtype=np.float32),
+        np.array(
+            [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+            dtype=np.float32),
         dtype=dtypes.float32)
     init_state = constant_op.constant(
-        0.1 * np.ones(
-            (batch_size, num_units), dtype=np.float32),
+        0.1 * np.ones((batch_size, num_units), dtype=np.float32),
         dtype=dtypes.float32)
-    with self.assertRaisesRegexp(
-        ValueError, "Must have input size == output size for "
-                    "Intersection RNN. To fix, num_in_proj should "
-                    "be set to num_units at cell init."):
+    with self.assertRaisesRegexp(ValueError,
+                                 "Must have input size == output size for "
+                                 "Intersection RNN. To fix, num_in_proj should "
+                                 "be set to num_units at cell init."):
       cell(inputs, init_state)
 
   def testPhasedLSTMCell(self):
@@ -845,13 +842,11 @@ class RNNCellTest(test.TestCase):
       batch_size = 3
       input_size = 4
       expected_state_c = np.array(
-          [[6.450831e-04, 4.697885e-04],
-           [9.862894e-05, 7.212213e-04],
+          [[6.450831e-04, 4.697885e-04], [9.862894e-05, 7.212213e-04],
            [4.401947e-04, 9.143004e-04]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[4.621217e-04, 3.365449e-04],
-           [7.438179e-05, 5.439147e-04],
+          [[4.621217e-04, 3.365449e-04], [7.438179e-05, 5.439147e-04],
            [3.347936e-04, 6.953785e-04]],
           dtype=np.float32)
       with variable_scope.variable_scope(
@@ -864,14 +859,14 @@ class RNNCellTest(test.TestCase):
         output, state = contrib_rnn_cell.PhasedLSTMCell(num_units=num_units)(
             (t, x), state0)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            t.name:
-                np.array([[1.], [2.], [3.]]),
-            x.name:
-                np.array([[1., 1., 1., 1.],
-                          [2., 2., 2., 2.],
-                          [3., 3., 3., 3.]]),
-        })
+        res = sess.run(
+            [output, state], {
+                t.name:
+                    np.array([[1.], [2.], [3.]]),
+                x.name:
+                    np.array([[1., 1., 1., 1.], [2., 2., 2., 2.],
+                              [3., 3., 3., 3.]]),
+            })
         # This is a smoke test, making sure expected values are unchanged.
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], res[1].h)
@@ -880,36 +875,32 @@ class RNNCellTest(test.TestCase):
 
   def testConv1DLSTMCell(self):
     with self.test_session() as sess:
-      shape = [2,1]
+      shape = [2, 1]
       filter_size = [3]
       num_features = 1
       batch_size = 2
       expected_state_c = np.array(
-          [[[1.4375670191], [1.4375670191]],
-           [[2.7542609292], [2.7542609292]]],
+          [[[1.4375670191], [1.4375670191]], [[2.7542609292], [2.7542609292]]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[[0.6529865603], [0.6529865603]],
-           [[0.8736877431], [0.8736877431]]],
+          [[[0.6529865603], [0.6529865603]], [[0.8736877431], [0.8736877431]]],
           dtype=np.float32)
       with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(1.0/2.0)):
+          "root", initializer=init_ops.constant_initializer(1.0 / 2.0)):
         x = array_ops.placeholder(dtypes.float32, [None, None, 1])
-        cell = contrib_rnn_cell.Conv1DLSTMCell(input_shape=shape,
-                                               kernel_shape=filter_size,
-                                               output_channels=num_features)
+        cell = contrib_rnn_cell.Conv1DLSTMCell(
+            input_shape=shape,
+            kernel_shape=filter_size,
+            output_channels=num_features)
         hidden = cell.zero_state(array_ops.shape(x)[0], dtypes.float32)
         output, state = cell(x, hidden)
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            hidden[0].name:
-                np.array([[[1.],[1.]],
-                          [[2.],[2.]]]),
-            x.name:
-                np.array([[[1.],[1.]],
-                          [[2.],[2.]]]),
-        })
+        res = sess.run(
+            [output, state], {
+                hidden[0].name: np.array([[[1.], [1.]], [[2.], [2.]]]),
+                x.name: np.array([[[1.], [1.]], [[2.], [2.]]]),
+            })
         # This is a smoke test, making sure expected values are unchanged.
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], res[1].h)
@@ -918,44 +909,40 @@ class RNNCellTest(test.TestCase):
 
   def testConv2DLSTMCell(self):
     with self.test_session() as sess:
-      shape = [2,2,1]
-      filter_size = [3,3]
+      shape = [2, 2, 1]
+      filter_size = [3, 3]
       num_features = 1
       batch_size = 2
       expected_state_c = np.array(
-          [[[[1.4375670191], [1.4375670191]],
-            [[1.4375670191], [1.4375670191]]],
-           [[[2.7542609292], [2.7542609292]],
-            [[2.7542609292], [2.7542609292]]]],
+          [[[[1.4375670191], [1.4375670191]], [[1.4375670191], [1.4375670191]]],
+           [[[2.7542609292], [2.7542609292]], [[2.7542609292], [2.7542609292]]
+           ]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[[[0.6529865603], [0.6529865603]],
-            [[0.6529865603], [0.6529865603]]],
-           [[[0.8736877431], [0.8736877431]],
-            [[0.8736877431], [0.8736877431]]]],
+          [[[[0.6529865603], [0.6529865603]], [[0.6529865603], [0.6529865603]]],
+           [[[0.8736877431], [0.8736877431]], [[0.8736877431], [0.8736877431]]
+           ]],
           dtype=np.float32)
       with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(1.0/4.0)):
+          "root", initializer=init_ops.constant_initializer(1.0 / 4.0)):
         x = array_ops.placeholder(dtypes.float32, [None, None, None, 1])
-        cell = contrib_rnn_cell.Conv2DLSTMCell(input_shape=shape,
-                                               kernel_shape=filter_size,
-                                               output_channels=num_features)
+        cell = contrib_rnn_cell.Conv2DLSTMCell(
+            input_shape=shape,
+            kernel_shape=filter_size,
+            output_channels=num_features)
         hidden = cell.zero_state(array_ops.shape(x)[0], dtypes.float32)
         output, state = cell(x, hidden)
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            hidden[0].name:
-                np.array([[[[1.],[1.]],
-                           [[1.],[1.]]],
-                          [[[2.],[2.]],
-                           [[2.],[2.]]]]),
-            x.name:
-                np.array([[[[1.],[1.]],
-                           [[1.],[1.]]],
-                          [[[2.],[2.]],
-                           [[2.],[2.]]]]),
-        })
+        res = sess.run(
+            [output, state], {
+                hidden[0].name:
+                    np.array([[[[1.], [1.]], [[1.], [1.]]], [[[2.], [2.]],
+                                                             [[2.], [2.]]]]),
+                x.name:
+                    np.array([[[[1.], [1.]], [[1.], [1.]]], [[[2.], [2.]],
+                                                             [[2.], [2.]]]]),
+            })
         # This is a smoke test, making sure expected values are unchanged.
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], res[1].h)
@@ -964,36 +951,33 @@ class RNNCellTest(test.TestCase):
 
   def testConv3DLSTMCell(self):
     with self.test_session() as sess:
-      shape = [2,2,2,1]
-      filter_size = [3,3,3]
+      shape = [2, 2, 2, 1]
+      filter_size = [3, 3, 3]
       num_features = 1
       batch_size = 2
       expected_state_c = np.array(
-         [[[[[1.4375670191], [1.4375670191]],
-            [[1.4375670191], [1.4375670191]]],
-           [[[1.4375670191], [1.4375670191]],
-            [[1.4375670191], [1.4375670191]]]],
-          [[[[2.7542609292], [2.7542609292]],
-            [[2.7542609292], [2.7542609292]]],
-           [[[2.7542609292], [2.7542609292]],
-            [[2.7542609292], [2.7542609292]]]]],
+          [[[[[1.4375670191], [1.4375670191]], [[1.4375670191], [1.4375670191]]
+            ], [[[1.4375670191], [1.4375670191]], [[1.4375670191],
+                                                   [1.4375670191]]]],
+           [[[[2.7542609292], [2.7542609292]], [[2.7542609292], [2.7542609292]]
+            ], [[[2.7542609292], [2.7542609292]], [[2.7542609292],
+                                                   [2.7542609292]]]]],
           dtype=np.float32)
       expected_state_h = np.array(
-         [[[[[0.6529865603], [0.6529865603]],
-            [[0.6529865603], [0.6529865603]]],
-           [[[0.6529865603], [0.6529865603]],
-            [[0.6529865603], [0.6529865603]]]],
-          [[[[0.8736877431], [0.8736877431]],
-            [[0.8736877431], [0.8736877431]]],
-           [[[0.8736877431], [0.8736877431]],
-            [[0.8736877431], [0.8736877431]]]]],
+          [[[[[0.6529865603], [0.6529865603]], [[0.6529865603], [0.6529865603]]
+            ], [[[0.6529865603], [0.6529865603]], [[0.6529865603],
+                                                   [0.6529865603]]]],
+           [[[[0.8736877431], [0.8736877431]], [[0.8736877431], [0.8736877431]]
+            ], [[[0.8736877431], [0.8736877431]], [[0.8736877431],
+                                                   [0.8736877431]]]]],
           dtype=np.float32)
       with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(1.0/8.0)):
+          "root", initializer=init_ops.constant_initializer(1.0 / 8.0)):
         x = array_ops.placeholder(dtypes.float32, [None, None, None, None, 1])
-        cell = contrib_rnn_cell.Conv3DLSTMCell(input_shape=shape,
-                                               kernel_shape=filter_size,
-                                               output_channels=num_features)
+        cell = contrib_rnn_cell.Conv3DLSTMCell(
+            input_shape=shape,
+            kernel_shape=filter_size,
+            output_channels=num_features)
         hidden = cell.zero_state(array_ops.shape(x)[0], dtypes.float32)
         output, state = cell(x, hidden)
 
@@ -1056,8 +1040,8 @@ class RNNCellTest(test.TestCase):
             num_units=num_units, number_of_groups=number_of_groups)
         cell = rnn_cell.LSTMCell(num_units=num_units)
         self.assertTrue(isinstance(gcell.state_size, tuple))
-        zero_state = gcell.zero_state(batch_size=batch_size,
-                                      dtype=dtypes.float32)
+        zero_state = gcell.zero_state(
+            batch_size=batch_size, dtype=dtypes.float32)
         gh, gs = gcell(x, zero_state)
         h, g = cell(x, zero_state)
 
@@ -1080,16 +1064,16 @@ class RNNCellTest(test.TestCase):
         glstm_input = array_ops.ones([batch_size, num_units])
         gcell = contrib_rnn_cell.GLSTMCell(
             num_units=num_units, number_of_groups=number_of_groups)
-        gcell_zero_state = gcell.zero_state(batch_size=batch_size,
-                                            dtype=dtypes.float32)
+        gcell_zero_state = gcell.zero_state(
+            batch_size=batch_size, dtype=dtypes.float32)
         gh, gs = gcell(glstm_input, gcell_zero_state)
 
         # input for LSTM cell simulating single G-LSTM group
         lstm_input = array_ops.ones([batch_size, num_units / number_of_groups])
         # note division by number_of_groups. This cell one simulates G-LSTM group
         cell = rnn_cell.LSTMCell(num_units=int(num_units / number_of_groups))
-        cell_zero_state = cell.zero_state(batch_size=batch_size,
-                                          dtype=dtypes.float32)
+        cell_zero_state = cell.zero_state(
+            batch_size=batch_size, dtype=dtypes.float32)
         h, g = cell(lstm_input, cell_zero_state)
 
         sess.run([variables.global_variables_initializer()])
@@ -1099,6 +1083,7 @@ class RNNCellTest(test.TestCase):
         self.assertAllClose(gh_res[:, int(num_units / number_of_groups):],
                             h_res, 1e-5)
 
+
 class LayerNormBasicLSTMCellTest(test.TestCase):
 
   # NOTE: all the values in the current test case have been calculated.
@@ -1119,13 +1104,14 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         cell = rnn_cell.MultiRNNCell([single_cell() for _ in range(2)])
         g, out_m = cell(x, state)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, out_m], {
-            x.name: np.array([[1., 1.]]),
-            c0.name: 0.1 * np.asarray([[0, 1]]),
-            h0.name: 0.1 * np.asarray([[2, 3]]),
-            c1.name: 0.1 * np.asarray([[4, 5]]),
-            h1.name: 0.1 * np.asarray([[6, 7]]),
-        })
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.array([[1., 1.]]),
+                c0.name: 0.1 * np.asarray([[0, 1]]),
+                h0.name: 0.1 * np.asarray([[2, 3]]),
+                c1.name: 0.1 * np.asarray([[4, 5]]),
+                h1.name: 0.1 * np.asarray([[6, 7]]),
+            })
 
         expected_h = np.array([[-0.38079708, 0.38079708]])
         expected_state0_c = np.array([[-1.0, 1.0]])
@@ -1155,11 +1141,12 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         cell = contrib_rnn_cell.LayerNormBasicLSTMCell(2)
         g, out_m = cell(x, state)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, out_m], {
-            x.name: np.array([[1., 1., 1.]]),
-            c.name: 0.1 * np.asarray([[0, 1]]),
-            h.name: 0.1 * np.asarray([[2, 3]]),
-        })
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.array([[1., 1., 1.]]),
+                c.name: 0.1 * np.asarray([[0, 1]]),
+                h.name: 0.1 * np.asarray([[2, 3]]),
+            })
 
         expected_h = np.array([[-0.38079708, 0.38079708]])
         expected_c = np.array([[-1.0, 1.0]])
@@ -1168,7 +1155,6 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         self.assertAllClose(res[1].c, expected_c, 1e-5)
         self.assertAllClose(res[1].h, expected_h, 1e-5)
 
-
   def testBasicLSTMCellWithoutNorm(self):
     """Tests that BasicLSTMCell with layer_norm=False."""
     with self.test_session() as sess:
@@ -1186,19 +1172,20 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         cell = rnn_cell.MultiRNNCell([single_cell() for _ in range(2)])
         g, out_m = cell(x, state)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, out_m], {
-          x.name: np.array([[1., 1.]]),
-          c0.name: 0.1 * np.asarray([[0, 1]]),
-          h0.name: 0.1 * np.asarray([[2, 3]]),
-          c1.name: 0.1 * np.asarray([[4, 5]]),
-          h1.name: 0.1 * np.asarray([[6, 7]]),
-        })
-
-        expected_h = np.array([[ 0.70230919, 0.72581059]])
-        expected_state0_c = np.array([[ 0.8020075,  0.89599884]])
-        expected_state0_h = np.array([[ 0.56668288,  0.60858738]])
-        expected_state1_c = np.array([[ 1.17500675,  1.26892781]])
-        expected_state1_h = np.array([[ 0.70230919,  0.72581059]])
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.array([[1., 1.]]),
+                c0.name: 0.1 * np.asarray([[0, 1]]),
+                h0.name: 0.1 * np.asarray([[2, 3]]),
+                c1.name: 0.1 * np.asarray([[4, 5]]),
+                h1.name: 0.1 * np.asarray([[6, 7]]),
+            })
+
+        expected_h = np.array([[0.70230919, 0.72581059]])
+        expected_state0_c = np.array([[0.8020075, 0.89599884]])
+        expected_state0_h = np.array([[0.56668288, 0.60858738]])
+        expected_state1_c = np.array([[1.17500675, 1.26892781]])
+        expected_state1_h = np.array([[0.70230919, 0.72581059]])
 
         actual_h = res[0]
         actual_state0_c = res[1][0].c
@@ -1215,21 +1202,22 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
       with variable_scope.variable_scope(
           "other", initializer=init_ops.constant_initializer(0.5)) as vs:
         x = array_ops.zeros(
-          [1, 3])  # Test BasicLSTMCell with input_size != num_units.
+            [1, 3])  # Test BasicLSTMCell with input_size != num_units.
         c = array_ops.zeros([1, 2])
         h = array_ops.zeros([1, 2])
         state = rnn_cell.LSTMStateTuple(c, h)
         cell = contrib_rnn_cell.LayerNormBasicLSTMCell(2, layer_norm=False)
         g, out_m = cell(x, state)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, out_m], {
-          x.name: np.array([[1., 1., 1.]]),
-          c.name: 0.1 * np.asarray([[0, 1]]),
-          h.name: 0.1 * np.asarray([[2, 3]]),
-        })
-
-        expected_h = np.array([[ 0.64121795, 0.68166804]])
-        expected_c = np.array([[ 0.88477188, 0.98103917]])
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.array([[1., 1., 1.]]),
+                c.name: 0.1 * np.asarray([[0, 1]]),
+                h.name: 0.1 * np.asarray([[2, 3]]),
+            })
+
+        expected_h = np.array([[0.64121795, 0.68166804]])
+        expected_c = np.array([[0.88477188, 0.98103917]])
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], expected_h, 1e-5)
         self.assertAllClose(res[1].c, expected_c, 1e-5)
@@ -1250,13 +1238,14 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
             [contrib_rnn_cell.LayerNormBasicLSTMCell(2) for _ in range(2)])
         h, (s0, s1) = cell(x, (state0, state1))
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([h, s0, s1], {
-            x.name: np.array([[1., 1.]]),
-            c0.name: 0.1 * np.asarray([[0, 1]]),
-            h0.name: 0.1 * np.asarray([[2, 3]]),
-            c1.name: 0.1 * np.asarray([[4, 5]]),
-            h1.name: 0.1 * np.asarray([[6, 7]]),
-        })
+        res = sess.run(
+            [h, s0, s1], {
+                x.name: np.array([[1., 1.]]),
+                c0.name: 0.1 * np.asarray([[0, 1]]),
+                h0.name: 0.1 * np.asarray([[2, 3]]),
+                c1.name: 0.1 * np.asarray([[4, 5]]),
+                h1.name: 0.1 * np.asarray([[6, 7]]),
+            })
 
         expected_h = np.array([[-0.38079708, 0.38079708]])
         expected_h0 = np.array([[-0.38079708, 0.38079708]])
@@ -1344,11 +1333,12 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
 
         g, s = cell(x, state)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, s], {
-            x.name: np.ones([1, 5]),
-            c.name: np.ones([1, 5]),
-            h.name: np.ones([1, 5]),
-        })
+        res = sess.run(
+            [g, s], {
+                x.name: np.ones([1, 5]),
+                c.name: np.ones([1, 5]),
+                h.name: np.ones([1, 5]),
+            })
 
         # Since the returned tensors are of size [1,n]
         # get the first component right now.
@@ -1374,35 +1364,35 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         self.assertIn(dropped_count, allowed_low)
 
 
-def _create_multi_lstm_cell_ops(batch_size, num_units, input_depth,
-                                num_layers, max_time, compiled):
+def _create_multi_lstm_cell_ops(batch_size, num_units, input_depth, num_layers,
+                                max_time, compiled):
   with variable_scope.variable_scope(
       "root",
       initializer=init_ops.random_uniform_initializer(-0.1, 0.1, seed=2)):
     inputs = variable_scope.get_variable(
-        "inputs", initializer=random_ops.random_uniform(
+        "inputs",
+        initializer=random_ops.random_uniform(
             (max_time, batch_size, input_depth), seed=1))
     maybe_xla = lambda c: contrib_rnn_cell.CompiledWrapper(c) if compiled else c
     cell = rnn_cell.MultiRNNCell(
         [maybe_xla(rnn_cell.LSTMCell(num_units)) for _ in range(num_layers)])
-    initial_state = cell.zero_state(
-        batch_size=batch_size, dtype=dtypes.float32)
+    initial_state = cell.zero_state(batch_size=batch_size, dtype=dtypes.float32)
     outputs, final_state = rnn.dynamic_rnn(
-        cell=cell, inputs=inputs, initial_state=initial_state,
-        time_major=True)
+        cell=cell, inputs=inputs, initial_state=initial_state, time_major=True)
     flat_final_state = nest.flatten(final_state)
     trainable_variables = variables.trainable_variables()
     outputs_grad = gradients_impl.gradients(
-        [outputs],
-        trainable_variables + [inputs] + nest.flatten(initial_state))
+        [outputs], trainable_variables + [inputs] + nest.flatten(initial_state))
     final_state_grad = gradients_impl.gradients(
         flat_final_state,
         trainable_variables + [inputs] + nest.flatten(initial_state))
 
-    return {"outputs": outputs,
-            "final_state": flat_final_state,
-            "outputs_grad": outputs_grad,
-            "final_state_grad": final_state_grad}
+    return {
+        "outputs": outputs,
+        "final_state": flat_final_state,
+        "outputs_grad": outputs_grad,
+        "final_state_grad": final_state_grad
+    }
 
 
 class CompiledWrapperTest(test.TestCase):
@@ -1420,8 +1410,10 @@ class CompiledWrapperTest(test.TestCase):
     random_seed.set_random_seed(1234)
     with self.test_session(graph=ops.Graph()) as sess:
       xla_ops = _create_multi_lstm_cell_ops(
-          batch_size=batch_size, num_units=num_units,
-          input_depth=input_depth, num_layers=num_layers,
+          batch_size=batch_size,
+          num_units=num_units,
+          input_depth=input_depth,
+          num_layers=num_layers,
           max_time=max_time,
           compiled=True)
       sess.run([variables.global_variables_initializer()])
@@ -1430,8 +1422,10 @@ class CompiledWrapperTest(test.TestCase):
     random_seed.set_random_seed(1234)
     with self.test_session(graph=ops.Graph()) as sess:
       non_xla_ops = _create_multi_lstm_cell_ops(
-          batch_size=batch_size, num_units=num_units,
-          input_depth=input_depth, num_layers=num_layers,
+          batch_size=batch_size,
+          num_units=num_units,
+          input_depth=input_depth,
+          num_layers=num_layers,
           max_time=max_time,
           compiled=False)
       sess.run([variables.global_variables_initializer()])
@@ -1440,16 +1434,16 @@ class CompiledWrapperTest(test.TestCase):
     self.assertAllClose(
         non_xla_results["outputs"], xla_results["outputs"], atol=atol)
 
-    for xla_value, non_xla_value in zip(
-        xla_results["final_state"], non_xla_results["final_state"]):
+    for xla_value, non_xla_value in zip(xla_results["final_state"],
+                                        non_xla_results["final_state"]):
       self.assertAllClose(xla_value, non_xla_value, atol=atol)
 
-    for xla_g, non_xla_g in zip(
-        xla_results["outputs_grad"], non_xla_results["outputs_grad"]):
+    for xla_g, non_xla_g in zip(xla_results["outputs_grad"],
+                                non_xla_results["outputs_grad"]):
       self.assertAllClose(xla_g, non_xla_g, atol=atol)
 
-    for xla_g, non_xla_g in zip(
-        xla_results["final_state_grad"], non_xla_results["final_state_grad"]):
+    for xla_g, non_xla_g in zip(xla_results["final_state_grad"],
+                                non_xla_results["final_state_grad"]):
       self.assertAllClose(xla_g, non_xla_g, atol=atol)
 
   def testMultiRNNCellWithStateTuple(self):
@@ -1463,19 +1457,20 @@ class CompiledWrapperTest(test.TestCase):
         # Test incorrectness of state
         with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
           rnn_cell.MultiRNNCell(
-              [rnn_cell.GRUCell(2)
-               for _ in range(2)], state_is_tuple=True)(x, m_bad)
+              [rnn_cell.GRUCell(2) for _ in range(2)],
+              state_is_tuple=True)(x, m_bad)
 
         _, ml = rnn_cell.MultiRNNCell(
-            [rnn_cell.GRUCell(2)
-             for _ in range(2)], state_is_tuple=True)(x, m_good)
+            [rnn_cell.GRUCell(2) for _ in range(2)],
+            state_is_tuple=True)(x, m_good)
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run(ml, {
-            x.name: np.array([[1., 1.]]),
-            m_good[0].name: np.array([[0.1, 0.1]]),
-            m_good[1].name: np.array([[0.1, 0.1]])
-        })
+        res = sess.run(
+            ml, {
+                x.name: np.array([[1., 1.]]),
+                m_good[0].name: np.array([[0.1, 0.1]]),
+                m_good[1].name: np.array([[0.1, 0.1]])
+            })
 
         # The numbers in results were not calculated, this is just a
         # smoke test.  However, these numbers should match those of
@@ -1490,24 +1485,20 @@ class BenchmarkLSTMCellXLA(test.Benchmark):
     num_layers = 3
     max_time = 50
     print("benchmarkDynamicRNNWithMultiLSTMCell")
-    print("\t" +
-          "\t".join(["inter_th", "intra_th",
-                     "batch_size", "num_units", "input_depth", "device",
-                     "compiled", "wall_time"]))
+    print("\t" + "\t".join([
+        "inter_th", "intra_th", "batch_size", "num_units", "input_depth",
+        "device", "compiled", "wall_time"
+    ]))
 
     warmup_run = True
-    for (threads,
-         device,
-         num_units,
-         batch_size,
-         input_depth,
-         compiled) in itertools.product(
-             [{"inter": 0, "intra": 0}, {"inter": 1, "intra": 4}],
-             ["cpu", "gpu"],
-             [32, 512],
-             [1, 32, 256],
-             [32, 512],
-             [False, True]):
+    for (threads, device, num_units, batch_size, input_depth,
+         compiled) in itertools.product([{
+             "inter": 0,
+             "intra": 0
+         }, {
+             "inter": 1,
+             "intra": 4
+         }], ["cpu", "gpu"], [32, 512], [1, 32, 256], [32, 512], [False, True]):
       if threads["inter"] != 0:
         # We only care about testing inter/intra op limitations on
         # CPU with small batch size, to mimic embedded devices.
@@ -1523,30 +1514,35 @@ class BenchmarkLSTMCellXLA(test.Benchmark):
       with session.Session(config=config, graph=ops.Graph()) as sess:
         with ops.device("/%s:0" % device):
           ops_dict = _create_multi_lstm_cell_ops(
-              batch_size=batch_size, num_units=num_units,
-              input_depth=input_depth, num_layers=num_layers,
+              batch_size=batch_size,
+              num_units=num_units,
+              input_depth=input_depth,
+              num_layers=num_layers,
               max_time=max_time,
               compiled=compiled)
         sess.run([variables.global_variables_initializer()])
         all_ops = nest.flatten(ops_dict.values())
         all_ops_group = control_flow_ops.group(*all_ops)
-        name_suffix = (
-            "inter_th_%d_intra_th_%d_bs_%d_units_%d_inputdepth_%d"
-            "_device_%s_xla_%s" % (
-                threads["inter"], threads["intra"],
-                batch_size, num_units, input_depth, device, compiled))
+        name_suffix = ("inter_th_%d_intra_th_%d_bs_%d_units_%d_inputdepth_%d"
+                       "_device_%s_xla_%s" %
+                       (threads["inter"], threads["intra"], batch_size,
+                        num_units, input_depth, device, compiled))
         if warmup_run:
           self.run_op_benchmark(
               sess, all_ops_group, min_iters=30, name="ignore_warmup")
           warmup_run = False
         benchmark_results = self.run_op_benchmark(
-            sess, all_ops_group, min_iters=50,
+            sess,
+            all_ops_group,
+            min_iters=50,
             name="benchmarkDynamicRNNWithMultiLSTMCell_%s" % name_suffix)
-        print("\t" +
-              "\t".join(["%s" % x for x in [
-                  threads["inter"], threads["intra"],
-                  batch_size, num_units, input_depth, device, compiled,
-                  benchmark_results["wall_time"]]]))
+        print("\t" + "\t".join([
+            "%s" % x
+            for x in [
+                threads["inter"], threads["intra"], batch_size, num_units,
+                input_depth, device, compiled, benchmark_results["wall_time"]
+            ]
+        ]))
 
 
 class WeightNormLSTMCellTest(test.TestCase):
@@ -1557,8 +1553,7 @@ class WeightNormLSTMCellTest(test.TestCase):
 
     with self.test_session() as sess:
       init = init_ops.constant_initializer(0.5)
-      with variable_scope.variable_scope("root",
-                                         initializer=init):
+      with variable_scope.variable_scope("root", initializer=init):
         x = array_ops.zeros([1, 2])
         c0 = array_ops.zeros([1, 2])
         h0 = array_ops.zeros([1, 2])
@@ -1568,11 +1563,12 @@ class WeightNormLSTMCellTest(test.TestCase):
         xout, sout = cell()(x, state0)
 
       sess.run([variables.global_variables_initializer()])
-      res = sess.run([xout, sout], {
-          x.name: np.array([[1., 1.]]),
-          c0.name: 0.1 * np.asarray([[0, 1]]),
-          h0.name: 0.1 * np.asarray([[2, 3]]),
-      })
+      res = sess.run(
+          [xout, sout], {
+              x.name: np.array([[1., 1.]]),
+              c0.name: 0.1 * np.asarray([[0, 1]]),
+              h0.name: 0.1 * np.asarray([[2, 3]]),
+          })
 
     actual_state_c = res[1].c
     actual_state_h = res[1].h
@@ -1583,9 +1579,8 @@ class WeightNormLSTMCellTest(test.TestCase):
     """Tests cell w/o peepholes and w/o normalisation"""
 
     def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(2,
-                                                 norm=False,
-                                                 use_peepholes=False)
+      return contrib_rnn_cell.WeightNormLSTMCell(
+          2, norm=False, use_peepholes=False)
 
     actual_c, actual_h = self._cell_output(cell)
 
@@ -1599,9 +1594,8 @@ class WeightNormLSTMCellTest(test.TestCase):
     """Tests cell with peepholes and w/o normalisation"""
 
     def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(2,
-                                                 norm=False,
-                                                 use_peepholes=True)
+      return contrib_rnn_cell.WeightNormLSTMCell(
+          2, norm=False, use_peepholes=True)
 
     actual_c, actual_h = self._cell_output(cell)
 
@@ -1611,14 +1605,12 @@ class WeightNormLSTMCellTest(test.TestCase):
     self.assertAllClose(expected_c, actual_c, 1e-5)
     self.assertAllClose(expected_h, actual_h, 1e-5)
 
-
   def testBasicCellWithNorm(self):
     """Tests cell w/o peepholes and with normalisation"""
 
     def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(2,
-                                                 norm=True,
-                                                 use_peepholes=False)
+      return contrib_rnn_cell.WeightNormLSTMCell(
+          2, norm=True, use_peepholes=False)
 
     actual_c, actual_h = self._cell_output(cell)
 
@@ -1632,9 +1624,8 @@ class WeightNormLSTMCellTest(test.TestCase):
     """Tests cell with peepholes and with normalisation"""
 
     def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(2,
-                                                 norm=True,
-                                                 use_peepholes=True)
+      return contrib_rnn_cell.WeightNormLSTMCell(
+          2, norm=True, use_peepholes=True)
 
     actual_c, actual_h = self._cell_output(cell)
 
@@ -1644,5 +1635,6 @@ class WeightNormLSTMCellTest(test.TestCase):
     self.assertAllClose(expected_c, actual_c, 1e-5)
     self.assertAllClose(expected_h, actual_h, 1e-5)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index d7ae6621db..8adf5dce6e 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Module for constructing RNN Cells."""
 from __future__ import absolute_import
 from __future__ import division
@@ -56,16 +55,15 @@ def _get_concat_variable(name, shape, dtype, num_shards):
       return value
 
   concat_variable = array_ops.concat(sharded_variable, 0, name=concat_name)
-  ops.add_to_collection(ops.GraphKeys.CONCATENATED_VARIABLES,
-                        concat_variable)
+  ops.add_to_collection(ops.GraphKeys.CONCATENATED_VARIABLES, concat_variable)
   return concat_variable
 
 
 def _get_sharded_variable(name, shape, dtype, num_shards):
   """Get a list of sharded variables with the given dtype."""
   if num_shards > shape[0]:
-    raise ValueError("Too many shards: shape=%s, num_shards=%d" %
-                     (shape, num_shards))
+    raise ValueError("Too many shards: shape=%s, num_shards=%d" % (shape,
+                                                                   num_shards))
   unit_shard_size = int(math.floor(shape[0] / num_shards))
   remaining_rows = shape[0] - unit_shard_size * num_shards
 
@@ -74,8 +72,9 @@ def _get_sharded_variable(name, shape, dtype, num_shards):
     current_size = unit_shard_size
     if i < remaining_rows:
       current_size += 1
-    shards.append(vs.get_variable(name + "_%d" % i, [current_size] + shape[1:],
-                                  dtype=dtype))
+    shards.append(
+        vs.get_variable(
+            name + "_%d" % i, [current_size] + shape[1:], dtype=dtype))
   return shards
 
 
@@ -177,9 +176,8 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
     """
     super(CoupledInputForgetGateLSTMCell, self).__init__(_reuse=reuse)
     if not state_is_tuple:
-      logging.warn(
-          "%s: Using a concatenated state is slower and will soon be "
-          "deprecated.  Use state_is_tuple=True.", self)
+      logging.warn("%s: Using a concatenated state is slower and will soon be "
+                   "deprecated.  Use state_is_tuple=True.", self)
     self._num_units = num_units
     self._use_peepholes = use_peepholes
     self._initializer = initializer
@@ -196,12 +194,14 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
     self._norm_shift = norm_shift
 
     if num_proj:
-      self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
-                          if state_is_tuple else num_units + num_proj)
+      self._state_size = (
+          rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
+          if state_is_tuple else num_units + num_proj)
       self._output_size = num_proj
     else:
-      self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_units)
-                          if state_is_tuple else 2 * num_units)
+      self._state_size = (
+          rnn_cell_impl.LSTMStateTuple(num_units, num_units)
+          if state_is_tuple else 2 * num_units)
       self._output_size = num_units
 
   @property
@@ -251,8 +251,8 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
     concat_w = _get_concat_variable(
-        "W", [input_size.value + num_proj, 3 * self._num_units],
-        dtype, self._num_unit_shards)
+        "W", [input_size.value + num_proj, 3 * self._num_units], dtype,
+        self._num_unit_shards)
 
     b = vs.get_variable(
         "B",
@@ -299,9 +299,9 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
       m = sigmoid(o) * self._activation(c)
 
     if self._num_proj is not None:
-      concat_w_proj = _get_concat_variable(
-          "W_P", [self._num_units, self._num_proj],
-          dtype, self._num_proj_shards)
+      concat_w_proj = _get_concat_variable("W_P",
+                                           [self._num_units, self._num_proj],
+                                           dtype, self._num_proj_shards)
 
       m = math_ops.matmul(m, concat_w_proj)
       if self._proj_clip is not None:
@@ -309,8 +309,9 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
         m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
         # pylint: enable=invalid-unary-operand-type
 
-    new_state = (rnn_cell_impl.LSTMStateTuple(c, m)
-                 if self._state_is_tuple else array_ops.concat([c, m], 1))
+    new_state = (
+        rnn_cell_impl.LSTMStateTuple(c, m)
+        if self._state_is_tuple else array_ops.concat([c, m], 1))
     return m, new_state
 
 
@@ -326,10 +327,15 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
   It uses peep-hole connections and optional cell clipping.
   """
 
-  def __init__(self, num_units, use_peepholes=False,
-               cell_clip=None, initializer=None,
-               num_unit_shards=1, forget_bias=1.0,
-               feature_size=None, frequency_skip=1,
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
+               cell_clip=None,
+               initializer=None,
+               num_unit_shards=1,
+               forget_bias=1.0,
+               feature_size=None,
+               frequency_skip=1,
                reuse=None):
     """Initialize the parameters for an LSTM cell.
 
@@ -399,7 +405,7 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
     actual_input_size = freq_inputs[0].get_shape().as_list()[1]
 
     concat_w = _get_concat_variable(
-        "W", [actual_input_size + 2*self._num_units, 4 * self._num_units],
+        "W", [actual_input_size + 2 * self._num_units, 4 * self._num_units],
         dtype, self._num_unit_shards)
 
     b = vs.get_variable(
@@ -418,23 +424,23 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
           "W_O_diag", shape=[self._num_units], dtype=dtype)
 
     # initialize the first freq state to be zero
-    m_prev_freq = array_ops.zeros([int(inputs.get_shape()[0]),
-                                   self._num_units], dtype)
+    m_prev_freq = array_ops.zeros([int(inputs.get_shape()[0]), self._num_units],
+                                  dtype)
     for fq in range(len(freq_inputs)):
-      c_prev = array_ops.slice(state, [0, 2*fq*self._num_units],
+      c_prev = array_ops.slice(state, [0, 2 * fq * self._num_units],
                                [-1, self._num_units])
-      m_prev = array_ops.slice(state, [0, (2*fq+1)*self._num_units],
+      m_prev = array_ops.slice(state, [0, (2 * fq + 1) * self._num_units],
                                [-1, self._num_units])
       # i = input_gate, j = new_input, f = forget_gate, o = output_gate
-      cell_inputs = array_ops.concat([freq_inputs[fq], m_prev, m_prev_freq],
-                                     1)
+      cell_inputs = array_ops.concat([freq_inputs[fq], m_prev, m_prev_freq], 1)
       lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
       i, j, f, o = array_ops.split(
           value=lstm_matrix, num_or_size_splits=4, axis=1)
 
       if self._use_peepholes:
-        c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
-             sigmoid(i + w_i_diag * c_prev) * tanh(j))
+        c = (
+            sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
+            sigmoid(i + w_i_diag * c_prev) * tanh(j))
       else:
         c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * tanh(j))
 
@@ -472,11 +478,11 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
     input_size = input_feat.get_shape().with_rank(2)[-1].value
     if input_size is None:
       raise ValueError("Cannot infer input_size from static shape inference.")
-    num_feats = int((input_size - self._feature_size) / (
-        self._frequency_skip)) + 1
+    num_feats = int(
+        (input_size - self._feature_size) / (self._frequency_skip)) + 1
     freq_inputs = []
     for f in range(num_feats):
-      cur_input = array_ops.slice(input_feat, [0, f*self._frequency_skip],
+      cur_input = array_ops.slice(input_feat, [0, f * self._frequency_skip],
                                   [-1, self._feature_size])
       freq_inputs.append(cur_input)
     return freq_inputs
@@ -498,11 +504,16 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
   The code uses optional peephole connections, shared_weights and cell clipping.
   """
 
-  def __init__(self, num_units, use_peepholes=False,
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
                share_time_frequency_weights=False,
-               cell_clip=None, initializer=None,
-               num_unit_shards=1, forget_bias=1.0,
-               feature_size=None, frequency_skip=None,
+               cell_clip=None,
+               initializer=None,
+               num_unit_shards=1,
+               forget_bias=1.0,
+               feature_size=None,
+               frequency_skip=None,
                num_frequency_blocks=None,
                start_freqindex_list=None,
                end_freqindex_list=None,
@@ -580,10 +591,10 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         for freq_index in range(self._num_frequency_blocks[block_index]):
           name_prefix = "state_f%02d_b%02d" % (freq_index, block_index)
           state_names += ("%s_c, %s_m," % (name_prefix, name_prefix))
-      self._state_tuple_type = collections.namedtuple(
-          "GridLSTMStateTuple", state_names.strip(","))
-      self._state_size = self._state_tuple_type(
-          *([num_units, num_units] * self._total_blocks))
+      self._state_tuple_type = collections.namedtuple("GridLSTMStateTuple",
+                                                      state_names.strip(","))
+      self._state_size = self._state_tuple_type(*(
+          [num_units, num_units] * self._total_blocks))
     else:
       self._state_tuple_type = None
       self._state_size = num_units * self._total_blocks * 2
@@ -626,7 +637,10 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
     state_out_lst = []
     for block in range(len(freq_inputs)):
       m_out_lst_current, state_out_lst_current = self._compute(
-          freq_inputs[block], block, state, batch_size,
+          freq_inputs[block],
+          block,
+          state,
+          batch_size,
           state_is_tuple=self._state_is_tuple)
       m_out_lst.extend(m_out_lst_current)
       state_out_lst.extend(state_out_lst_current)
@@ -637,7 +651,11 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
     m_out = array_ops.concat(m_out_lst, 1)
     return m_out, state_out
 
-  def _compute(self, freq_inputs, block, state, batch_size,
+  def _compute(self,
+               freq_inputs,
+               block,
+               state,
+               batch_size,
                state_prefix="state",
                state_is_tuple=True):
     """Run the actual computation of one step LSTM.
@@ -666,8 +684,8 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
     actual_input_size = freq_inputs[0].get_shape().as_list()[1]
 
     concat_w_f = _get_concat_variable(
-        "W_f_%d" % block, [actual_input_size + 2 * self._num_units,
-                           num_gates * self._num_units],
+        "W_f_%d" % block,
+        [actual_input_size + 2 * self._num_units, num_gates * self._num_units],
         dtype, self._num_unit_shards)
     b_f = vs.get_variable(
         "B_f_%d" % block,
@@ -675,10 +693,9 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         initializer=init_ops.zeros_initializer(),
         dtype=dtype)
     if not self._share_time_frequency_weights:
-      concat_w_t = _get_concat_variable(
-          "W_t_%d" % block, [actual_input_size + 2 * self._num_units,
-                             num_gates * self._num_units],
-          dtype, self._num_unit_shards)
+      concat_w_t = _get_concat_variable("W_t_%d" % block, [
+          actual_input_size + 2 * self._num_units, num_gates * self._num_units
+      ], dtype, self._num_unit_shards)
       b_t = vs.get_variable(
           "B_t_%d" % block,
           shape=[num_gates * self._num_units],
@@ -691,7 +708,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         w_f_diag_freqf = vs.get_variable(
             "W_F_diag_freqf_%d" % block, shape=[self._num_units], dtype=dtype)
         w_f_diag_freqt = vs.get_variable(
-            "W_F_diag_freqt_%d"% block, shape=[self._num_units], dtype=dtype)
+            "W_F_diag_freqt_%d" % block, shape=[self._num_units], dtype=dtype)
       w_i_diag_freqf = vs.get_variable(
           "W_I_diag_freqf_%d" % block, shape=[self._num_units], dtype=dtype)
       w_i_diag_freqt = vs.get_variable(
@@ -725,8 +742,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         m_prev_time = getattr(state, name_prefix + "_m")
       else:
         c_prev_time = array_ops.slice(
-            state, [0, 2 * freq_index * self._num_units],
-            [-1, self._num_units])
+            state, [0, 2 * freq_index * self._num_units], [-1, self._num_units])
         m_prev_time = array_ops.slice(
             state, [0, (2 * freq_index + 1) * self._num_units],
             [-1, self._num_units])
@@ -736,8 +752,8 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
           [freq_inputs[freq_index], m_prev_time, m_prev_freq], 1)
 
       # F-LSTM
-      lstm_matrix_freq = nn_ops.bias_add(math_ops.matmul(cell_inputs,
-                                                         concat_w_f), b_f)
+      lstm_matrix_freq = nn_ops.bias_add(
+          math_ops.matmul(cell_inputs, concat_w_f), b_f)
       if self._couple_input_forget_gates:
         i_freq, j_freq, o_freq = array_ops.split(
             value=lstm_matrix_freq, num_or_size_splits=num_gates, axis=1)
@@ -752,8 +768,8 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         f_time = f_freq
         o_time = o_freq
       else:
-        lstm_matrix_time = nn_ops.bias_add(math_ops.matmul(cell_inputs,
-                                                           concat_w_t), b_t)
+        lstm_matrix_time = nn_ops.bias_add(
+            math_ops.matmul(cell_inputs, concat_w_t), b_t)
         if self._couple_input_forget_gates:
           i_time, j_time, o_time = array_ops.split(
               value=lstm_matrix_time, num_or_size_splits=num_gates, axis=1)
@@ -765,8 +781,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       # F-LSTM c_freq
       # input gate activations
       if self._use_peepholes:
-        i_freq_g = sigmoid(i_freq +
-                           w_i_diag_freqf * c_prev_freq +
+        i_freq_g = sigmoid(i_freq + w_i_diag_freqf * c_prev_freq +
                            w_i_diag_freqt * c_prev_time)
       else:
         i_freq_g = sigmoid(i_freq)
@@ -775,9 +790,8 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         f_freq_g = 1.0 - i_freq_g
       else:
         if self._use_peepholes:
-          f_freq_g = sigmoid(f_freq + self._forget_bias +
-                             w_f_diag_freqf * c_prev_freq +
-                             w_f_diag_freqt * c_prev_time)
+          f_freq_g = sigmoid(f_freq + self._forget_bias + w_f_diag_freqf *
+                             c_prev_freq + w_f_diag_freqt * c_prev_time)
         else:
           f_freq_g = sigmoid(f_freq + self._forget_bias)
       # cell state
@@ -792,12 +806,10 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       # input gate activations
       if self._use_peepholes:
         if self._share_time_frequency_weights:
-          i_time_g = sigmoid(i_time +
-                             w_i_diag_freqf * c_prev_freq +
+          i_time_g = sigmoid(i_time + w_i_diag_freqf * c_prev_freq +
                              w_i_diag_freqt * c_prev_time)
         else:
-          i_time_g = sigmoid(i_time +
-                             w_i_diag_timef * c_prev_freq +
+          i_time_g = sigmoid(i_time + w_i_diag_timef * c_prev_freq +
                              w_i_diag_timet * c_prev_time)
       else:
         i_time_g = sigmoid(i_time)
@@ -807,13 +819,11 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       else:
         if self._use_peepholes:
           if self._share_time_frequency_weights:
-            f_time_g = sigmoid(f_time + self._forget_bias +
-                               w_f_diag_freqf * c_prev_freq +
-                               w_f_diag_freqt * c_prev_time)
+            f_time_g = sigmoid(f_time + self._forget_bias + w_f_diag_freqf *
+                               c_prev_freq + w_f_diag_freqt * c_prev_time)
           else:
-            f_time_g = sigmoid(f_time + self._forget_bias +
-                               w_f_diag_timef * c_prev_freq +
-                               w_f_diag_timet * c_prev_time)
+            f_time_g = sigmoid(f_time + self._forget_bias + w_f_diag_timef *
+                               c_prev_freq + w_f_diag_timet * c_prev_time)
         else:
           f_time_g = sigmoid(f_time + self._forget_bias)
       # cell state
@@ -826,8 +836,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
 
       # F-LSTM m_freq
       if self._use_peepholes:
-        m_freq = sigmoid(o_freq +
-                         w_o_diag_freqf * c_freq +
+        m_freq = sigmoid(o_freq + w_o_diag_freqf * c_freq +
                          w_o_diag_freqt * c_time) * tanh(c_freq)
       else:
         m_freq = sigmoid(o_freq) * tanh(c_freq)
@@ -835,12 +844,10 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       # T-LSTM m_time
       if self._use_peepholes:
         if self._share_time_frequency_weights:
-          m_time = sigmoid(o_time +
-                           w_o_diag_freqf * c_freq +
+          m_time = sigmoid(o_time + w_o_diag_freqf * c_freq +
                            w_o_diag_freqt * c_time) * tanh(c_time)
         else:
-          m_time = sigmoid(o_time +
-                           w_o_diag_timef * c_freq +
+          m_time = sigmoid(o_time + w_o_diag_timef * c_freq +
                            w_o_diag_timet * c_time) * tanh(c_time)
       else:
         m_time = sigmoid(o_time) * tanh(c_time)
@@ -879,16 +886,18 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       raise ValueError("Cannot infer input_size from static shape inference.")
     if slice_offset > 0:
       # Padding to the end
-      inputs = array_ops.pad(
-          input_feat, array_ops.constant([0, 0, 0, slice_offset], shape=[2, 2],
-                                         dtype=dtypes.int32),
-          "CONSTANT")
+      inputs = array_ops.pad(input_feat,
+                             array_ops.constant(
+                                 [0, 0, 0, slice_offset],
+                                 shape=[2, 2],
+                                 dtype=dtypes.int32), "CONSTANT")
     elif slice_offset < 0:
       # Padding to the front
-      inputs = array_ops.pad(
-          input_feat, array_ops.constant([0, 0, -slice_offset, 0], shape=[2, 2],
-                                         dtype=dtypes.int32),
-          "CONSTANT")
+      inputs = array_ops.pad(input_feat,
+                             array_ops.constant(
+                                 [0, 0, -slice_offset, 0],
+                                 shape=[2, 2],
+                                 dtype=dtypes.int32), "CONSTANT")
       slice_offset = 0
     else:
       inputs = input_feat
@@ -898,13 +907,13 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         raise ValueError("Length of num_frequency_blocks"
                          " is not 1, but instead is %d",
                          len(self._num_frequency_blocks))
-      num_feats = int((input_size - self._feature_size) / (
-          self._frequency_skip)) + 1
+      num_feats = int(
+          (input_size - self._feature_size) / (self._frequency_skip)) + 1
       if num_feats != self._num_frequency_blocks[0]:
         raise ValueError(
             "Invalid num_frequency_blocks, requires %d but gets %d, please"
-            " check the input size and filter config are correct." % (
-                self._num_frequency_blocks[0], num_feats))
+            " check the input size and filter config are correct." %
+            (self._num_frequency_blocks[0], num_feats))
       block_inputs = []
       for f in range(num_feats):
         cur_input = array_ops.slice(
@@ -927,18 +936,18 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         start_index = self._start_freqindex_list[b]
         end_index = self._end_freqindex_list[b]
         cur_size = end_index - start_index
-        block_feats = int((cur_size - self._feature_size) / (
-            self._frequency_skip)) + 1
+        block_feats = int(
+            (cur_size - self._feature_size) / (self._frequency_skip)) + 1
         if block_feats != self._num_frequency_blocks[b]:
           raise ValueError(
               "Invalid num_frequency_blocks, requires %d but gets %d, please"
-              " check the input size and filter config are correct." % (
-                  self._num_frequency_blocks[b], block_feats))
+              " check the input size and filter config are correct." %
+              (self._num_frequency_blocks[b], block_feats))
         block_inputs = []
         for f in range(block_feats):
           cur_input = array_ops.slice(
-              inputs, [0, start_index + slice_offset + f *
-                       self._frequency_skip],
+              inputs,
+              [0, start_index + slice_offset + f * self._frequency_skip],
               [-1, self._feature_size])
           block_inputs.append(cur_input)
         freq_inputs.append(block_inputs)
@@ -954,11 +963,16 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
   The current implementation uses different weights for the two directions.
   """
 
-  def __init__(self, num_units, use_peepholes=False,
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
                share_time_frequency_weights=False,
-               cell_clip=None, initializer=None,
-               num_unit_shards=1, forget_bias=1.0,
-               feature_size=None, frequency_skip=None,
+               cell_clip=None,
+               initializer=None,
+               num_unit_shards=1,
+               forget_bias=1.0,
+               feature_size=None,
+               frequency_skip=None,
                num_frequency_blocks=None,
                start_freqindex_list=None,
                end_freqindex_list=None,
@@ -1017,8 +1031,8 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
           state_names += ("%s_c, %s_m," % (name_prefix, name_prefix))
     self._state_tuple_type = collections.namedtuple(
         "BidirectionalGridLSTMStateTuple", state_names.strip(","))
-    self._state_size = self._state_tuple_type(
-        *([num_units, num_units] * self._total_blocks * 2))
+    self._state_size = self._state_tuple_type(*(
+        [num_units, num_units] * self._total_blocks * 2))
     self._output_size = 2 * num_units * self._total_blocks * 2
 
   def call(self, inputs, state):
@@ -1052,8 +1066,12 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
       fwd_state_out_lst = []
       for block in range(len(fwd_inputs)):
         fwd_m_out_lst_current, fwd_state_out_lst_current = self._compute(
-            fwd_inputs[block], block, state, batch_size,
-            state_prefix="fwd_state", state_is_tuple=True)
+            fwd_inputs[block],
+            block,
+            state,
+            batch_size,
+            state_prefix="fwd_state",
+            state_is_tuple=True)
         fwd_m_out_lst.extend(fwd_m_out_lst_current)
         fwd_state_out_lst.extend(fwd_state_out_lst_current)
     # Backward processing
@@ -1064,8 +1082,12 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
         # Reverse the blocks
         bwd_inputs_reverse = bwd_inputs[block][::-1]
         bwd_m_out_lst_current, bwd_state_out_lst_current = self._compute(
-            bwd_inputs_reverse, block, state, batch_size,
-            state_prefix="bwd_state", state_is_tuple=True)
+            bwd_inputs_reverse,
+            block,
+            state,
+            batch_size,
+            state_prefix="bwd_state",
+            state_is_tuple=True)
         bwd_m_out_lst.extend(bwd_m_out_lst_current)
         bwd_state_out_lst.extend(bwd_state_out_lst_current)
     state_out = self._state_tuple_type(*(fwd_state_out_lst + bwd_state_out_lst))
@@ -1076,6 +1098,7 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
 
 # pylint: disable=protected-access
 _Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
+
 # pylint: enable=protected-access
 
 
@@ -1085,8 +1108,14 @@ class AttentionCellWrapper(rnn_cell_impl.RNNCell):
   Implementation based on https://arxiv.org/abs/1409.0473.
   """
 
-  def __init__(self, cell, attn_length, attn_size=None, attn_vec_size=None,
-               input_size=None, state_is_tuple=True, reuse=None):
+  def __init__(self,
+               cell,
+               attn_length,
+               attn_size=None,
+               attn_vec_size=None,
+               input_size=None,
+               state_is_tuple=True,
+               reuse=None):
     """Create a cell with attention.
 
     Args:
@@ -1116,16 +1145,15 @@ class AttentionCellWrapper(rnn_cell_impl.RNNCell):
     if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError("The parameter cell is not RNNCell.")
     if nest.is_sequence(cell.state_size) and not state_is_tuple:
-      raise ValueError("Cell returns tuple of states, but the flag "
-                       "state_is_tuple is not set. State size is: %s"
-                       % str(cell.state_size))
+      raise ValueError(
+          "Cell returns tuple of states, but the flag "
+          "state_is_tuple is not set. State size is: %s" % str(cell.state_size))
     if attn_length <= 0:
-      raise ValueError("attn_length should be greater than zero, got %s"
-                       % str(attn_length))
+      raise ValueError(
+          "attn_length should be greater than zero, got %s" % str(attn_length))
     if not state_is_tuple:
-      logging.warn(
-          "%s: Using a concatenated state is slower and will soon be "
-          "deprecated.  Use state_is_tuple=True.", self)
+      logging.warn("%s: Using a concatenated state is slower and will soon be "
+                   "deprecated.  Use state_is_tuple=True.", self)
     if attn_size is None:
       attn_size = cell.output_size
     if attn_vec_size is None:
@@ -1161,8 +1189,8 @@ class AttentionCellWrapper(rnn_cell_impl.RNNCell):
     else:
       states = state
       state = array_ops.slice(states, [0, 0], [-1, self._cell.state_size])
-      attns = array_ops.slice(
-          states, [0, self._cell.state_size], [-1, self._attn_size])
+      attns = array_ops.slice(states, [0, self._cell.state_size],
+                              [-1, self._attn_size])
       attn_states = array_ops.slice(
           states, [0, self._cell.state_size + self._attn_size],
           [-1, self._attn_size * self._attn_length])
@@ -1200,8 +1228,8 @@ class AttentionCellWrapper(rnn_cell_impl.RNNCell):
     tanh = math_ops.tanh
 
     with vs.variable_scope("attention"):
-      k = vs.get_variable(
-          "attn_w", [1, 1, self._attn_size, self._attn_vec_size])
+      k = vs.get_variable("attn_w",
+                          [1, 1, self._attn_size, self._attn_vec_size])
       v = vs.get_variable("attn_v", [self._attn_vec_size])
       hidden = array_ops.reshape(attn_states,
                                  [-1, self._attn_length, 1, self._attn_size])
@@ -1228,7 +1256,8 @@ class HighwayWrapper(rnn_cell_impl.RNNCell):
     https://arxiv.org/abs/1505.00387
   """
 
-  def __init__(self, cell,
+  def __init__(self,
+               cell,
                couple_carry_transform_gates=True,
                carry_bias_init=1.0):
     """Constructs a `HighwayWrapper` for `cell`.
@@ -1260,8 +1289,7 @@ class HighwayWrapper(rnn_cell_impl.RNNCell):
     carry_weight = vs.get_variable("carry_w", [input_size, input_size])
     carry_bias = vs.get_variable(
         "carry_b", [input_size],
-        initializer=init_ops.constant_initializer(
-            self._carry_bias_init))
+        initializer=init_ops.constant_initializer(self._carry_bias_init))
     carry = math_ops.sigmoid(nn_ops.xw_plus_b(inp, carry_weight, carry_bias))
     if self._couple_carry_transform_gates:
       transform = 1 - carry
@@ -1270,11 +1298,9 @@ class HighwayWrapper(rnn_cell_impl.RNNCell):
                                          [input_size, input_size])
       transform_bias = vs.get_variable(
           "transform_b", [input_size],
-          initializer=init_ops.constant_initializer(
-              -self._carry_bias_init))
-      transform = math_ops.sigmoid(nn_ops.xw_plus_b(inp,
-                                                    transform_weight,
-                                                    transform_bias))
+          initializer=init_ops.constant_initializer(-self._carry_bias_init))
+      transform = math_ops.sigmoid(
+          nn_ops.xw_plus_b(inp, transform_weight, transform_bias))
     return inp * carry + out * transform
 
   def __call__(self, inputs, state, scope=None):
@@ -1294,9 +1320,11 @@ class HighwayWrapper(rnn_cell_impl.RNNCell):
     """
     outputs, new_state = self._cell(inputs, state, scope=scope)
     nest.assert_same_structure(inputs, outputs)
+
     # Ensure shapes match
     def assert_shape_match(inp, out):
       inp.get_shape().assert_is_compatible_with(out.get_shape())
+
     nest.map_structure(assert_shape_match, inputs, outputs)
     res_outputs = nest.map_structure(self._highway, inputs, outputs)
     return (res_outputs, new_state)
@@ -1322,10 +1350,16 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
   Stanislau Semeniuta, Aliaksei Severyn, Erhardt Barth.
   """
 
-  def __init__(self, num_units, forget_bias=1.0,
-               input_size=None, activation=math_ops.tanh,
-               layer_norm=True, norm_gain=1.0, norm_shift=0.0,
-               dropout_keep_prob=1.0, dropout_prob_seed=None,
+  def __init__(self,
+               num_units,
+               forget_bias=1.0,
+               input_size=None,
+               activation=math_ops.tanh,
+               layer_norm=True,
+               norm_gain=1.0,
+               norm_shift=0.0,
+               dropout_keep_prob=1.0,
+               dropout_prob_seed=None,
                reuse=None):
     """Initializes the basic LSTM cell.
 
@@ -1410,8 +1444,8 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
     if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1:
       g = nn_ops.dropout(g, self._keep_prob, seed=self._seed)
 
-    new_c = (c * math_ops.sigmoid(f + self._forget_bias)
-             + math_ops.sigmoid(i) * g)
+    new_c = (
+        c * math_ops.sigmoid(f + self._forget_bias) + math_ops.sigmoid(i) * g)
     if self._layer_norm:
       new_c = self._norm(new_c, "state", dtype=dtype)
     new_h = self._activation(new_c) * math_ops.sigmoid(o)
@@ -1433,8 +1467,7 @@ class NASCell(rnn_cell_impl.RNNCell):
   The class uses an optional projection layer.
   """
 
-  def __init__(self, num_units, num_proj=None,
-               use_biases=False, reuse=None):
+  def __init__(self, num_units, num_proj=None, use_biases=False, reuse=None):
     """Initialize the parameters for a NAS cell.
 
     Args:
@@ -1504,12 +1537,10 @@ class NASCell(rnn_cell_impl.RNNCell):
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
     # Variables for the NAS cell. W_m is all matrices multiplying the
     # hiddenstate and W_inputs is all matrices multiplying the inputs.
-    concat_w_m = vs.get_variable(
-        "recurrent_kernel", [num_proj, 8 * self._num_units],
-        dtype)
+    concat_w_m = vs.get_variable("recurrent_kernel",
+                                 [num_proj, 8 * self._num_units], dtype)
     concat_w_inputs = vs.get_variable(
-        "kernel", [input_size.value, 8 * self._num_units],
-        dtype)
+        "kernel", [input_size.value, 8 * self._num_units], dtype)
 
     m_matrix = math_ops.matmul(m_prev, concat_w_m)
     inputs_matrix = math_ops.matmul(inputs, concat_w_inputs)
@@ -1524,10 +1555,10 @@ class NASCell(rnn_cell_impl.RNNCell):
 
     # The NAS cell branches into 8 different splits for both the hiddenstate
     # and the input
-    m_matrix_splits = array_ops.split(axis=1, num_or_size_splits=8,
-                                      value=m_matrix)
-    inputs_matrix_splits = array_ops.split(axis=1, num_or_size_splits=8,
-                                           value=inputs_matrix)
+    m_matrix_splits = array_ops.split(
+        axis=1, num_or_size_splits=8, value=m_matrix)
+    inputs_matrix_splits = array_ops.split(
+        axis=1, num_or_size_splits=8, value=inputs_matrix)
 
     # First layer
     layer1_0 = sigmoid(inputs_matrix_splits[0] + m_matrix_splits[0])
@@ -1559,9 +1590,8 @@ class NASCell(rnn_cell_impl.RNNCell):
 
     # Projection layer if specified
     if self._num_proj is not None:
-      concat_w_proj = vs.get_variable(
-          "projection_weights", [self._num_units, self._num_proj],
-          dtype)
+      concat_w_proj = vs.get_variable("projection_weights",
+                                      [self._num_units, self._num_proj], dtype)
       new_m = math_ops.matmul(new_m, concat_w_proj)
 
     new_state = rnn_cell_impl.LSTMStateTuple(new_c, new_m)
@@ -1584,8 +1614,12 @@ class UGRNNCell(rnn_cell_impl.RNNCell):
   "Capacity and Trainability in Recurrent Neural Networks" Proc. ICLR 2017.
   """
 
-  def __init__(self, num_units, initializer=None, forget_bias=1.0,
-               activation=math_ops.tanh, reuse=None):
+  def __init__(self,
+               num_units,
+               initializer=None,
+               forget_bias=1.0,
+               activation=math_ops.tanh,
+               reuse=None):
     """Initialize the parameters for an UGRNN cell.
 
     Args:
@@ -1640,8 +1674,8 @@ class UGRNNCell(rnn_cell_impl.RNNCell):
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
-    with vs.variable_scope(vs.get_variable_scope(),
-                           initializer=self._initializer):
+    with vs.variable_scope(
+        vs.get_variable_scope(), initializer=self._initializer):
       cell_inputs = array_ops.concat([inputs, state], 1)
       if self._linear is None:
         self._linear = _Linear(cell_inputs, 2 * self._num_units, True)
@@ -1681,9 +1715,13 @@ class IntersectionRNNCell(rnn_cell_impl.RNNCell):
   RNNs so it may not achieve best performance with depth 1.
   """
 
-  def __init__(self, num_units, num_in_proj=None,
-               initializer=None, forget_bias=1.0,
-               y_activation=nn_ops.relu, reuse=None):
+  def __init__(self,
+               num_units,
+               num_in_proj=None,
+               initializer=None,
+               forget_bias=1.0,
+               y_activation=nn_ops.relu,
+               reuse=None):
     """Initialize the parameters for an +RNN cell.
 
     Args:
@@ -1747,8 +1785,8 @@ class IntersectionRNNCell(rnn_cell_impl.RNNCell):
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
-    with vs.variable_scope(vs.get_variable_scope(),
-                           initializer=self._initializer):
+    with vs.variable_scope(
+        vs.get_variable_scope(), initializer=self._initializer):
       # read-in projections (should be used for first layer in deep +RNN
       # to transform size of inputs from I --> N)
       if input_size.value != self._num_units:
@@ -1765,13 +1803,13 @@ class IntersectionRNNCell(rnn_cell_impl.RNNCell):
       n_dim = i_dim = self._num_units
       cell_inputs = array_ops.concat([inputs, state], 1)
       if self._linear2 is None:
-        self._linear2 = _Linear(cell_inputs, 2*n_dim + 2*i_dim, True)
+        self._linear2 = _Linear(cell_inputs, 2 * n_dim + 2 * i_dim, True)
       rnn_matrix = self._linear2(cell_inputs)
 
-      gh_act = rnn_matrix[:, :n_dim]                           # b x n
-      h_act = rnn_matrix[:, n_dim:2*n_dim]                     # b x n
-      gy_act = rnn_matrix[:, 2*n_dim:2*n_dim+i_dim]            # b x i
-      y_act = rnn_matrix[:, 2*n_dim+i_dim:2*n_dim+2*i_dim]     # b x i
+      gh_act = rnn_matrix[:, :n_dim]  # b x n
+      h_act = rnn_matrix[:, n_dim:2 * n_dim]  # b x n
+      gy_act = rnn_matrix[:, 2 * n_dim:2 * n_dim + i_dim]  # b x i
+      y_act = rnn_matrix[:, 2 * n_dim + i_dim:2 * n_dim + 2 * i_dim]  # b x i
 
       h = tanh(h_act)
       y = self._y_activation(y_act)
@@ -1817,6 +1855,7 @@ class CompiledWrapper(rnn_cell_impl.RNNCell):
     if self._compile_stateful:
       compile_ops = True
     else:
+
       def compile_ops(node_def):
         global _REGISTERED_OPS
         if _REGISTERED_OPS is None:
@@ -1827,10 +1866,7 @@ class CompiledWrapper(rnn_cell_impl.RNNCell):
       return self._cell(inputs, state, scope=scope)
 
 
-def _random_exp_initializer(minval,
-                            maxval,
-                            seed=None,
-                            dtype=dtypes.float32):
+def _random_exp_initializer(minval, maxval, seed=None, dtype=dtypes.float32):
   """Returns an exponential distribution initializer.
 
   Args:
@@ -1849,10 +1885,7 @@ def _random_exp_initializer(minval,
     del partition_info  # Unused.
     return math_ops.exp(
         random_ops.random_uniform(
-            shape,
-            math_ops.log(minval),
-            math_ops.log(maxval),
-            dtype,
+            shape, math_ops.log(minval), math_ops.log(maxval), dtype,
             seed=seed))
 
   return _initializer
@@ -1956,8 +1989,7 @@ class PhasedLSTMCell(rnn_cell_impl.RNNCell):
       if self._linear1 is None:
         self._linear1 = _Linear(in_mask_gates, 2 * self._num_units, True)
 
-      mask_gates = math_ops.sigmoid(
-          self._linear1(in_mask_gates))
+      mask_gates = math_ops.sigmoid(self._linear1(in_mask_gates))
       [input_gate, forget_gate] = array_ops.split(
           axis=1, num_or_size_splits=2, value=mask_gates)
 
@@ -1981,12 +2013,12 @@ class PhasedLSTMCell(rnn_cell_impl.RNNCell):
 
     period = vs.get_variable(
         "period", [self._num_units],
-        initializer=_random_exp_initializer(
-            self._period_init_min, self._period_init_max))
+        initializer=_random_exp_initializer(self._period_init_min,
+                                            self._period_init_max))
     phase = vs.get_variable(
         "phase", [self._num_units],
-        initializer=init_ops.random_uniform_initializer(
-            0., period.initial_value))
+        initializer=init_ops.random_uniform_initializer(0.,
+                                                        period.initial_value))
     ratio_on = vs.get_variable(
         "ratio_on", [self._num_units],
         initializer=init_ops.constant_initializer(self._ratio_on),
@@ -2008,6 +2040,7 @@ class PhasedLSTMCell(rnn_cell_impl.RNNCell):
 
     return new_h, new_state
 
+
 class ConvLSTMCell(rnn_cell_impl.RNNCell):
   """Convolutional LSTM recurrent network cell.
 
@@ -2041,7 +2074,7 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
     """
     super(ConvLSTMCell, self).__init__(name=name)
 
-    if conv_ndims != len(input_shape)-1:
+    if conv_ndims != len(input_shape) - 1:
       raise ValueError("Invalid input_shape {} for conv_ndims={}.".format(
           input_shape, conv_ndims))
 
@@ -2060,8 +2093,8 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
     state_size = tensor_shape.TensorShape(
         self._input_shape[:-1] + [self._output_channels])
     self._state_size = rnn_cell_impl.LSTMStateTuple(state_size, state_size)
-    self._output_size = tensor_shape.TensorShape(self._input_shape[:-1]
-                                                 + [self._total_output_channels])
+    self._output_size = tensor_shape.TensorShape(
+        self._input_shape[:-1] + [self._total_output_channels])
 
   @property
   def output_size(self):
@@ -2073,13 +2106,10 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
 
   def call(self, inputs, state, scope=None):
     cell, hidden = state
-    new_hidden = _conv([inputs, hidden],
-                       self._kernel_shape,
-                       4*self._output_channels,
-                       self._use_bias)
-    gates = array_ops.split(value=new_hidden,
-                            num_or_size_splits=4,
-                            axis=self._conv_ndims+1)
+    new_hidden = _conv([inputs, hidden], self._kernel_shape,
+                       4 * self._output_channels, self._use_bias)
+    gates = array_ops.split(
+        value=new_hidden, num_or_size_splits=4, axis=self._conv_ndims + 1)
 
     input_gate, new_input, forget_gate, output_gate = gates
     new_cell = math_ops.sigmoid(forget_gate + self._forget_bias) * cell
@@ -2091,29 +2121,35 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
     new_state = rnn_cell_impl.LSTMStateTuple(new_cell, output)
     return output, new_state
 
+
 class Conv1DLSTMCell(ConvLSTMCell):
   """1D Convolutional LSTM recurrent network cell.
 
   https://arxiv.org/pdf/1506.04214v1.pdf
   """
+
   def __init__(self, name="conv_1d_lstm_cell", **kwargs):
     """Construct Conv1DLSTM. See `ConvLSTMCell` for more details."""
     super(Conv1DLSTMCell, self).__init__(conv_ndims=1, **kwargs)
 
+
 class Conv2DLSTMCell(ConvLSTMCell):
   """2D Convolutional LSTM recurrent network cell.
 
   https://arxiv.org/pdf/1506.04214v1.pdf
   """
+
   def __init__(self, name="conv_2d_lstm_cell", **kwargs):
     """Construct Conv2DLSTM. See `ConvLSTMCell` for more details."""
     super(Conv2DLSTMCell, self).__init__(conv_ndims=2, **kwargs)
 
+
 class Conv3DLSTMCell(ConvLSTMCell):
   """3D Convolutional LSTM recurrent network cell.
 
   https://arxiv.org/pdf/1506.04214v1.pdf
   """
+
   def __init__(self, name="conv_3d_lstm_cell", **kwargs):
     """Construct Conv3DLSTM. See `ConvLSTMCell` for more details."""
     super(Conv3DLSTMCell, self).__init__(conv_ndims=3, **kwargs)
@@ -2138,7 +2174,7 @@ def _conv(args, filter_size, num_features, bias, bias_start=0.0):
   shapes = [a.get_shape().as_list() for a in args]
   shape_length = len(shapes[0])
   for shape in shapes:
-    if len(shape) not in [3,4,5]:
+    if len(shape) not in [3, 4, 5]:
       raise ValueError("Conv Linear expects 3D, 4D "
                        "or 5D arguments: %s" % str(shapes))
     if len(shape) != len(shapes[0]):
@@ -2149,40 +2185,36 @@ def _conv(args, filter_size, num_features, bias, bias_start=0.0):
   dtype = [a.dtype for a in args][0]
 
   # determine correct conv operation
-  if   shape_length == 3:
+  if shape_length == 3:
     conv_op = nn_ops.conv1d
     strides = 1
   elif shape_length == 4:
     conv_op = nn_ops.conv2d
-    strides = shape_length*[1]
+    strides = shape_length * [1]
   elif shape_length == 5:
     conv_op = nn_ops.conv3d
-    strides = shape_length*[1]
+    strides = shape_length * [1]
 
   # Now the computation.
   kernel = vs.get_variable(
-      "kernel",
-      filter_size + [total_arg_size_depth, num_features],
-      dtype=dtype)
+      "kernel", filter_size + [total_arg_size_depth, num_features], dtype=dtype)
   if len(args) == 1:
-    res = conv_op(args[0],
-                  kernel,
-                  strides,
-                  padding='SAME')
+    res = conv_op(args[0], kernel, strides, padding="SAME")
   else:
-    res = conv_op(array_ops.concat(axis=shape_length-1, values=args),
-                  kernel,
-                  strides,
-                  padding='SAME')
+    res = conv_op(
+        array_ops.concat(axis=shape_length - 1, values=args),
+        kernel,
+        strides,
+        padding="SAME")
   if not bias:
     return res
   bias_term = vs.get_variable(
       "biases", [num_features],
       dtype=dtype,
-      initializer=init_ops.constant_initializer(
-          bias_start, dtype=dtype))
+      initializer=init_ops.constant_initializer(bias_start, dtype=dtype))
   return res + bias_term
 
+
 class GLSTMCell(rnn_cell_impl.RNNCell):
   """Group LSTM cell (G-LSTM).
 
@@ -2194,8 +2226,13 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
   "Factorization Tricks for LSTM Networks", ICLR 2017 workshop.
   """
 
-  def __init__(self, num_units, initializer=None, num_proj=None,
-               number_of_groups=1, forget_bias=1.0, activation=math_ops.tanh,
+  def __init__(self,
+               num_units,
+               initializer=None,
+               num_proj=None,
+               number_of_groups=1,
+               forget_bias=1.0,
+               activation=math_ops.tanh,
                reuse=None):
     """Initialize the parameters of G-LSTM cell.
 
@@ -2232,11 +2269,15 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
     if self._num_proj:
       if self._num_proj % self._number_of_groups != 0:
         raise ValueError("num_proj must be divisible by number_of_groups")
-      self._group_shape = [int(self._num_proj / self._number_of_groups),
-                           int(self._num_units / self._number_of_groups)]
+      self._group_shape = [
+          int(self._num_proj / self._number_of_groups),
+          int(self._num_units / self._number_of_groups)
+      ]
     else:
-      self._group_shape = [int(self._num_units / self._number_of_groups),
-                           int(self._num_units / self._number_of_groups)]
+      self._group_shape = [
+          int(self._num_units / self._number_of_groups),
+          int(self._num_units / self._number_of_groups)
+      ]
 
     if num_proj:
       self._state_size = rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
@@ -2268,10 +2309,11 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
       subset of inputs corresponding to group "group_id",
       a Tensor, 2D, [batch x num_units/number_of_groups]
     """
-    return array_ops.slice(input_=inputs,
-                           begin=[0, group_id * group_size],
-                           size=[self._batch_size, group_size],
-                           name=("GLSTM_group%d_input_generation" % group_id))
+    return array_ops.slice(
+        input_=inputs,
+        begin=[0, group_id * group_size],
+        size=[self._batch_size, group_size],
+        name=("GLSTM_group%d_input_generation" % group_id))
 
   def call(self, inputs, state):
     """Run one step of G-LSTM.
@@ -2310,10 +2352,13 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
       for group_id in range(self._number_of_groups):
         with vs.variable_scope("group%d" % group_id):
           x_g_id = array_ops.concat(
-            [self._get_input_for_group(inputs, group_id,
-                                       self._group_shape[0]),
-             self._get_input_for_group(m_prev, group_id,
-                                       self._group_shape[0])], axis=1)
+              [
+                  self._get_input_for_group(inputs, group_id,
+                                            self._group_shape[0]),
+                  self._get_input_for_group(m_prev, group_id,
+                                            self._group_shape[0])
+              ],
+              axis=1)
           if self._linear1 is None:
             self._linear1 = _Linear(x_g_id, 4 * self._group_shape[1], False)
           R_k = self._linear1(x_g_id)  # pylint: disable=invalid-name
@@ -2324,34 +2369,35 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
         f_parts.append(f_k)
         o_parts.append(o_k)
 
-      bi = vs.get_variable(name="bias_i",
-                           shape=[self._num_units],
-                           dtype=dtype,
-                           initializer=
-                           init_ops.constant_initializer(0.0, dtype=dtype))
-      bj = vs.get_variable(name="bias_j",
-                           shape=[self._num_units],
-                           dtype=dtype,
-                           initializer=
-                           init_ops.constant_initializer(0.0, dtype=dtype))
-      bf = vs.get_variable(name="bias_f",
-                           shape=[self._num_units],
-                           dtype=dtype,
-                           initializer=
-                           init_ops.constant_initializer(0.0, dtype=dtype))
-      bo = vs.get_variable(name="bias_o",
-                           shape=[self._num_units],
-                           dtype=dtype,
-                           initializer=
-                           init_ops.constant_initializer(0.0, dtype=dtype))
+      bi = vs.get_variable(
+          name="bias_i",
+          shape=[self._num_units],
+          dtype=dtype,
+          initializer=init_ops.constant_initializer(0.0, dtype=dtype))
+      bj = vs.get_variable(
+          name="bias_j",
+          shape=[self._num_units],
+          dtype=dtype,
+          initializer=init_ops.constant_initializer(0.0, dtype=dtype))
+      bf = vs.get_variable(
+          name="bias_f",
+          shape=[self._num_units],
+          dtype=dtype,
+          initializer=init_ops.constant_initializer(0.0, dtype=dtype))
+      bo = vs.get_variable(
+          name="bias_o",
+          shape=[self._num_units],
+          dtype=dtype,
+          initializer=init_ops.constant_initializer(0.0, dtype=dtype))
 
       i = nn_ops.bias_add(array_ops.concat(i_parts, axis=1), bi)
       j = nn_ops.bias_add(array_ops.concat(j_parts, axis=1), bj)
       f = nn_ops.bias_add(array_ops.concat(f_parts, axis=1), bf)
       o = nn_ops.bias_add(array_ops.concat(o_parts, axis=1), bo)
 
-    c = (math_ops.sigmoid(f + self._forget_bias) * c_prev +
-         math_ops.sigmoid(i) * math_ops.tanh(j))
+    c = (
+        math_ops.sigmoid(f + self._forget_bias) * c_prev +
+        math_ops.sigmoid(i) * math_ops.tanh(j))
     m = math_ops.sigmoid(o) * self._activation(c)
 
     if self._num_proj is not None:
@@ -2636,10 +2682,12 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
 
 class SRUCell(rnn_cell_impl._LayerRNNCell):
   """SRU, Simple Recurrent Unit
+
      Implementation based on
      Training RNNs as Fast as CNNs (cf. https://arxiv.org/abs/1709.02755).
 
-     This variation of RNN cell is characterized by the simplified data dependence
+     This variation of RNN cell is characterized by the simplified data
+     dependence
      between hidden states of two consecutive time steps. Traditionally, hidden
      states from a cell at time step t-1 needs to be multiplied with a matrix
      W_hh before being fed into the ensuing cell at time step t.
@@ -2657,8 +2705,8 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
       will share weights, but to avoid mistakes we require reuse=True in such
       cases.
   """
-  def __init__(self, num_units,
-               activation=None, reuse=None, name=None):
+
+  def __init__(self, num_units, activation=None, reuse=None, name=None):
     super(SRUCell, self).__init__(_reuse=reuse, name=name)
     self._num_units = num_units
     self._activation = activation or math_ops.tanh
@@ -2676,8 +2724,8 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
 
   def build(self, inputs_shape):
     if inputs_shape[1].value is None:
-      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
-                       % inputs_shape)
+      raise ValueError(
+          "Expected inputs.shape[-1] to be known, saw shape: %s" % inputs_shape)
 
     input_depth = inputs_shape[1].value
 
@@ -2712,12 +2760,12 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
     """Simple recurrent unit (SRU) with num_units cells."""
 
     U = math_ops.matmul(inputs, self._kernel)
-    x_bar, f_intermediate, r_intermediate = array_ops.split(value=U,
-                                                            num_or_size_splits=3,
-                                                            axis=1)
+    x_bar, f_intermediate, r_intermediate = array_ops.split(
+        value=U, num_or_size_splits=3, axis=1)
 
-    f_r = math_ops.sigmoid(nn_ops.bias_add(array_ops.concat(
-        [f_intermediate, r_intermediate], 1), self._bias))
+    f_r = math_ops.sigmoid(
+        nn_ops.bias_add(
+            array_ops.concat([f_intermediate, r_intermediate], 1), self._bias))
     f, r = array_ops.split(value=f_r, num_or_size_splits=2, axis=1)
 
     c = f * state + (1.0 - f) * x_bar
@@ -2750,9 +2798,16 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
     large scale acoustic modeling." INTERSPEECH, 2014.
   """
 
-  def __init__(self, num_units, norm=True, use_peepholes=False,
-               cell_clip=None, initializer=None, num_proj=None,
-               proj_clip=None, forget_bias=1, activation=None,
+  def __init__(self,
+               num_units,
+               norm=True,
+               use_peepholes=False,
+               cell_clip=None,
+               initializer=None,
+               num_proj=None,
+               proj_clip=None,
+               forget_bias=1,
+               activation=None,
                reuse=None):
     """Initialize the parameters of a weight-normalized LSTM cell.
 
@@ -2779,7 +2834,7 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
     """
     super(WeightNormLSTMCell, self).__init__(_reuse=reuse)
 
-    self._scope = 'wn_lstm_cell'
+    self._scope = "wn_lstm_cell"
     self._num_units = num_units
     self._norm = norm
     self._initializer = initializer
@@ -2822,7 +2877,8 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
     g = vs.get_variable(name, [output_size], dtype=weight.dtype)
     return nn_impl.l2_normalize(weight, dim=0) * g
 
-  def _linear(self, args,
+  def _linear(self,
+              args,
               output_size,
               norm,
               bias,
@@ -2877,8 +2933,8 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
         with ops.control_dependencies(None):
           for i in range(len(args)):
             en = st + shapes[i][1].value
-            wn.append(self._normalize(weights[st:en, :],
-                                      name='norm_{}'.format(i)))
+            wn.append(
+                self._normalize(weights[st:en, :], name="norm_{}".format(i)))
             st = en
 
           weights = array_ops.concat(wn, axis=0)
@@ -2936,8 +2992,8 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
 
     with vs.variable_scope(self._scope, initializer=self._initializer):
 
-      concat = self._linear([inputs, h], 4 * num_units,
-                            norm=self._norm, bias=True)
+      concat = self._linear(
+          [inputs, h], 4 * num_units, norm=self._norm, bias=True)
 
       # i = input_gate, j = new_input, f = forget_gate, o = output_gate
       i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
@@ -2947,11 +3003,13 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
         w_i_diag = vs.get_variable("w_i_diag", shape=[num_units], dtype=dtype)
         w_o_diag = vs.get_variable("w_o_diag", shape=[num_units], dtype=dtype)
 
-        new_c = (c * sigmoid(f + self._forget_bias + w_f_diag * c)
-                 + sigmoid(i + w_i_diag * c) * self._activation(j))
+        new_c = (
+            c * sigmoid(f + self._forget_bias + w_f_diag * c) +
+            sigmoid(i + w_i_diag * c) * self._activation(j))
       else:
-        new_c = (c * sigmoid(f + self._forget_bias)
-                 + sigmoid(i) * self._activation(j))
+        new_c = (
+            c * sigmoid(f + self._forget_bias) +
+            sigmoid(i) * self._activation(j))
 
       if self._cell_clip is not None:
         # pylint: disable=invalid-unary-operand-type
@@ -2964,15 +3022,12 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
 
       if self._num_proj is not None:
         with vs.variable_scope("projection"):
-          new_h = self._linear(new_h,
-                               self._num_proj,
-                               norm=self._norm,
-                               bias=False)
+          new_h = self._linear(
+              new_h, self._num_proj, norm=self._norm, bias=False)
 
         if self._proj_clip is not None:
           # pylint: disable=invalid-unary-operand-type
-          new_h = clip_ops.clip_by_value(new_h,
-                                         -self._proj_clip,
+          new_h = clip_ops.clip_by_value(new_h, -self._proj_clip,
                                          self._proj_clip)
           # pylint: enable=invalid-unary-operand-type
 
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
index f498b2bb57..9265540317 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -46,20 +46,18 @@ class TestGatherTree(test.TestCase):
 
     # create (batch_size, max_time, beam_width) matrix and transpose it
     predicted_ids = np.array(
-        [[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-         [[2, 3, 4], [5, 6, 7], [8, 9, 10]]],
+        [[[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[2, 3, 4], [5, 6, 7], [8, 9, 10]]],
         dtype=np.int32).transpose([1, 0, 2])
     parent_ids = np.array(
-        [[[0, 0, 0], [0, 1, 1], [2, 1, 2]],
-         [[0, 0, 0], [1, 2, 0], [2, 1, 1]]],
+        [[[0, 0, 0], [0, 1, 1], [2, 1, 2]], [[0, 0, 0], [1, 2, 0], [2, 1, 1]]],
         dtype=np.int32).transpose([1, 0, 2])
 
     # sequence_lengths is shaped (batch_size = 3)
     max_sequence_lengths = [3, 3]
 
-    expected_result = np.array(
-        [[[2, 2, 2], [6, 5, 6], [7, 8, 9]],
-         [[2, 4, 4], [7, 6, 6], [8, 9, 10]]]).transpose([1, 0, 2])
+    expected_result = np.array([[[2, 2, 2], [6, 5, 6], [7, 8, 9]],
+                                [[2, 4, 4], [7, 6, 6],
+                                 [8, 9, 10]]]).transpose([1, 0, 2])
 
     res = beam_search_ops.gather_tree(
         predicted_ids,
@@ -157,8 +155,8 @@ class TestBeamStep(test.TestCase):
     self.assertAllEqual(outputs_.predicted_ids, [[3, 3, 2], [2, 2, 1]])
     self.assertAllEqual(outputs_.parent_ids, [[1, 0, 0], [2, 1, 0]])
     self.assertAllEqual(next_state_.lengths, [[3, 3, 3], [3, 3, 3]])
-    self.assertAllEqual(next_state_.finished, [[False, False, False],
-                                               [False, False, False]])
+    self.assertAllEqual(next_state_.finished,
+                        [[False, False, False], [False, False, False]])
 
     expected_log_probs = []
     expected_log_probs.append(state_.log_probs[0][[1, 0, 0]])
@@ -212,8 +210,8 @@ class TestBeamStep(test.TestCase):
     self.assertAllEqual(outputs_.parent_ids, [[1, 0, 0], [1, 2, 0]])
     self.assertAllEqual(outputs_.predicted_ids, [[0, 3, 2], [2, 0, 1]])
     self.assertAllEqual(next_state_.lengths, [[1, 3, 3], [3, 1, 3]])
-    self.assertAllEqual(next_state_.finished, [[True, False, False],
-                                               [False, True, False]])
+    self.assertAllEqual(next_state_.finished,
+                        [[True, False, False], [False, True, False]])
 
     expected_log_probs = []
     expected_log_probs.append(state_.log_probs[0][[1, 0, 0]])
@@ -226,9 +224,10 @@ class TestBeamStep(test.TestCase):
 
 
 class TestLargeBeamStep(test.TestCase):
-  """
-  Tests a single step of beam search in such
-  case that beam size is larger than vocabulary size.
+  """Tests large beam step.
+
+  Tests a single step of beam search in such case that beam size is larger than
+  vocabulary size.
   """
 
   def setUp(self):
@@ -239,19 +238,21 @@ class TestLargeBeamStep(test.TestCase):
     self.end_token = 0
     self.length_penalty_weight = 0.6
 
-
   def test_step(self):
-    def get_probs():
-      """this simulates the initialize method in BeamSearchDecoder"""
-      log_prob_mask = array_ops.one_hot(array_ops.zeros([self.batch_size],
-                                                        dtype=dtypes.int32),
-                                        depth=self.beam_width, on_value=True,
-                                        off_value=False, dtype=dtypes.bool)
 
-      log_prob_zeros = array_ops.zeros([self.batch_size, self.beam_width],
-                                       dtype=dtypes.float32)
-      log_prob_neg_inf = array_ops.ones([self.batch_size, self.beam_width],
-                                        dtype=dtypes.float32) * -np.Inf
+    def get_probs():
+      """this simulates the initialize method in BeamSearchDecoder."""
+      log_prob_mask = array_ops.one_hot(
+          array_ops.zeros([self.batch_size], dtype=dtypes.int32),
+          depth=self.beam_width,
+          on_value=True,
+          off_value=False,
+          dtype=dtypes.bool)
+
+      log_prob_zeros = array_ops.zeros(
+          [self.batch_size, self.beam_width], dtype=dtypes.float32)
+      log_prob_neg_inf = array_ops.ones(
+          [self.batch_size, self.beam_width], dtype=dtypes.float32) * -np.Inf
 
       log_probs = array_ops.where(log_prob_mask, log_prob_zeros,
                                   log_prob_neg_inf)
@@ -260,12 +261,15 @@ class TestLargeBeamStep(test.TestCase):
     log_probs = get_probs()
     dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])
 
+    # pylint: disable=invalid-name
     _finished = array_ops.one_hot(
         array_ops.zeros([self.batch_size], dtype=dtypes.int32),
-        depth=self.beam_width, on_value=False,
-        off_value=True, dtype=dtypes.bool)
+        depth=self.beam_width,
+        on_value=False,
+        off_value=True,
+        dtype=dtypes.bool)
     _lengths = np.zeros([self.batch_size, self.beam_width], dtype=np.int64)
-    _lengths[:, 0]=2
+    _lengths[:, 0] = 2
     _lengths = constant_op.constant(_lengths, dtype=dtypes.int64)
 
     beam_state = beam_search_decoder.BeamSearchDecoderState(
@@ -298,20 +302,20 @@ class TestLargeBeamStep(test.TestCase):
         length_penalty_weight=self.length_penalty_weight)
 
     with self.test_session() as sess:
-      outputs_, next_state_, state_, log_probs_ = sess.run(
+      outputs_, next_state_, _, _ = sess.run(
           [outputs, next_beam_state, beam_state, log_probs])
 
     self.assertEqual(outputs_.predicted_ids[0, 0], 3)
     self.assertEqual(outputs_.predicted_ids[0, 1], 2)
     self.assertEqual(outputs_.predicted_ids[1, 0], 1)
     neg_inf = -np.Inf
-    self.assertAllEqual(next_state_.log_probs[:, -3:],
-                        [[neg_inf, neg_inf, neg_inf],
-                         [neg_inf, neg_inf, neg_inf]])
+    self.assertAllEqual(
+        next_state_.log_probs[:, -3:],
+        [[neg_inf, neg_inf, neg_inf], [neg_inf, neg_inf, neg_inf]])
     self.assertEqual((next_state_.log_probs[:, :-3] > neg_inf).all(), True)
     self.assertEqual((next_state_.lengths[:, :-3] > 0).all(), True)
-    self.assertAllEqual(next_state_.lengths[:, -3:], [[0, 0, 0],
-                                                      [0, 0, 0]])
+    self.assertAllEqual(next_state_.lengths[:, -3:], [[0, 0, 0], [0, 0, 0]])
+
 
 class BeamSearchDecoderTest(test.TestCase):
 
@@ -338,8 +342,8 @@ class BeamSearchDecoderTest(test.TestCase):
       initial_state = cell.zero_state(batch_size, dtypes.float32)
       if has_attention:
         inputs = array_ops.placeholder_with_default(
-            np.random.randn(batch_size, decoder_max_time,
-                            input_depth).astype(np.float32),
+            np.random.randn(batch_size, decoder_max_time, input_depth).astype(
+                np.float32),
             shape=(None, None, input_depth))
         tiled_inputs = beam_search_decoder.tile_batch(
             inputs, multiplier=beam_width)
@@ -359,8 +363,7 @@ class BeamSearchDecoderTest(test.TestCase):
       cell_state = cell.zero_state(
           dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width)
       if has_attention:
-        cell_state = cell_state.clone(
-            cell_state=initial_state)
+        cell_state = cell_state.clone(cell_state=initial_state)
       bsd = beam_search_decoder.BeamSearchDecoder(
           cell=cell,
           embedding=embedding,
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index a5f7169c31..d6184d6109 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -37,7 +37,6 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
 
-
 __all__ = [
     "BeamSearchDecoderOutput",
     "BeamSearchDecoderState",
@@ -48,8 +47,8 @@ __all__ = [
 
 
 class BeamSearchDecoderState(
-    collections.namedtuple("BeamSearchDecoderState", ("cell_state", "log_probs",
-                                                      "finished", "lengths"))):
+    collections.namedtuple("BeamSearchDecoderState",
+                           ("cell_state", "log_probs", "finished", "lengths"))):
   pass
 
 
@@ -85,11 +84,12 @@ def _tile_batch(t, multiplier):
   tiled_static_batch_size = (
       t.shape[0].value * multiplier if t.shape[0].value is not None else None)
   tiled = array_ops.tile(array_ops.expand_dims(t, 1), tiling)
-  tiled = array_ops.reshape(
-      tiled, array_ops.concat(([shape_t[0] * multiplier], shape_t[1:]), 0))
+  tiled = array_ops.reshape(tiled,
+                            array_ops.concat(
+                                ([shape_t[0] * multiplier], shape_t[1:]), 0))
   tiled.set_shape(
-      tensor_shape.TensorShape(
-          [tiled_static_batch_size]).concatenate(t.shape[1:]))
+      tensor_shape.TensorShape([tiled_static_batch_size]).concatenate(
+          t.shape[1:]))
   return tiled
 
 
@@ -197,8 +197,8 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError("cell must be an RNNCell, received: %s" % type(cell))
-    if (output_layer is not None
-        and not isinstance(output_layer, layers_base.Layer)):
+    if (output_layer is not None and
+        not isinstance(output_layer, layers_base.Layer)):
       raise TypeError(
           "output_layer must be a Layer, received: %s" % type(output_layer))
     self._cell = cell
@@ -223,16 +223,17 @@ class BeamSearchDecoder(decoder.Decoder):
     self._beam_width = beam_width
     self._length_penalty_weight = length_penalty_weight
     self._initial_cell_state = nest.map_structure(
-        self._maybe_split_batch_beams,
-        initial_state, self._cell.state_size)
+        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
     self._start_tokens = array_ops.tile(
         array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
     self._start_inputs = self._embedding_fn(self._start_tokens)
-    
+
     self._finished = array_ops.one_hot(
         array_ops.zeros([self._batch_size], dtype=dtypes.int32),
-        depth=self._beam_width, on_value=False,
-        off_value=True, dtype=dtypes.bool)
+        depth=self._beam_width,
+        on_value=False,
+        off_value=True,
+        dtype=dtypes.bool)
 
   @property
   def batch_size(self):
@@ -250,8 +251,7 @@ class BeamSearchDecoder(decoder.Decoder):
       # dimensions to get the output size of the rnn with the layer
       # applied to the top.
       output_shape_with_unknown_batch = nest.map_structure(
-          lambda s: tensor_shape.TensorShape([None]).concatenate(s),
-          size)
+          lambda s: tensor_shape.TensorShape([None]).concatenate(s), size)
       layer_output_shape = self._output_layer.compute_output_shape(
           output_shape_with_unknown_batch)
       return nest.map_structure(lambda s: s[1:], layer_output_shape)
@@ -302,10 +302,11 @@ class BeamSearchDecoder(decoder.Decoder):
 
     log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
         array_ops.zeros([self._batch_size], dtype=dtypes.int32),
-        depth=self._beam_width, on_value=0.0, off_value=-np.Inf,
+        depth=self._beam_width,
+        on_value=0.0,
+        off_value=-np.Inf,
         dtype=nest.flatten(self._initial_cell_state)[0].dtype)
 
-
     initial_state = BeamSearchDecoderState(
         cell_state=self._initial_cell_state,
         log_probs=log_probs,
@@ -365,11 +366,12 @@ class BeamSearchDecoder(decoder.Decoder):
     t_shape = array_ops.shape(t)
     static_batch_size = tensor_util.constant_value(self._batch_size)
     batch_size_beam_width = (
-        None if static_batch_size is None
-        else static_batch_size * self._beam_width)
+        None
+        if static_batch_size is None else static_batch_size * self._beam_width)
     reshaped_t = array_ops.reshape(
-        t, array_ops.concat(
-            ([self._batch_size * self._beam_width], t_shape[2:]), 0))
+        t,
+        array_ops.concat(([self._batch_size * self._beam_width], t_shape[2:]),
+                         0))
     reshaped_t.set_shape(
         (tensor_shape.TensorShape([batch_size_beam_width]).concatenate(s)))
     return reshaped_t
@@ -398,8 +400,9 @@ class BeamSearchDecoder(decoder.Decoder):
       s = tensor_shape.TensorShape(s)
     t_shape = array_ops.shape(t)
     reshaped_t = array_ops.reshape(
-        t, array_ops.concat(
-            ([self._batch_size, self._beam_width], t_shape[1:]), 0))
+        t,
+        array_ops.concat(([self._batch_size, self._beam_width], t_shape[1:]),
+                         0))
     static_batch_size = tensor_util.constant_value(self._batch_size)
     expected_reshaped_shape = tensor_shape.TensorShape(
         [static_batch_size, self._beam_width]).concatenate(s)
@@ -409,8 +412,8 @@ class BeamSearchDecoder(decoder.Decoder):
                        "We expected it to have shape "
                        "(batch_size, beam_width, depth) == %s.  Perhaps you "
                        "forgot to create a zero_state with "
-                       "batch_size=encoder_batch_size * beam_width?"
-                       % (reshaped_t.shape, expected_reshaped_shape))
+                       "batch_size=encoder_batch_size * beam_width?" %
+                       (reshaped_t.shape, expected_reshaped_shape))
     reshaped_t.set_shape(expected_reshaped_shape)
     return reshaped_t
 
@@ -482,15 +485,13 @@ class BeamSearchDecoder(decoder.Decoder):
       cell_state = state.cell_state
       inputs = nest.map_structure(
           lambda inp: self._merge_batch_beams(inp, s=inp.shape[2:]), inputs)
-      cell_state = nest.map_structure(
-          self._maybe_merge_batch_beams,
-          cell_state, self._cell.state_size)
+      cell_state = nest.map_structure(self._maybe_merge_batch_beams, cell_state,
+                                      self._cell.state_size)
       cell_outputs, next_cell_state = self._cell(inputs, cell_state)
       cell_outputs = nest.map_structure(
           lambda out: self._split_batch_beams(out, out.shape[1:]), cell_outputs)
       next_cell_state = nest.map_structure(
-          self._maybe_split_batch_beams,
-          next_cell_state, self._cell.state_size)
+          self._maybe_split_batch_beams, next_cell_state, self._cell.state_size)
 
       if self._output_layer is not None:
         cell_outputs = self._output_layer(cell_outputs)
@@ -553,7 +554,8 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
   lengths_to_add = array_ops.one_hot(
       indices=array_ops.fill([batch_size, beam_width], end_token),
       depth=vocab_size,
-      on_value=np.int64(0), off_value=np.int64(1),
+      on_value=np.int64(0),
+      off_value=np.int64(1),
       dtype=dtypes.int64)
   add_mask = math_ops.to_int64(math_ops.logical_not(previously_finished))
   lengths_to_add *= array_ops.expand_dims(add_mask, 2)
@@ -572,8 +574,8 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
   scores_flat = array_ops.reshape(scores, [batch_size, -1])
 
   # Pick the next beams according to the specified successors function
-  next_beam_size = ops.convert_to_tensor(beam_width, dtype=dtypes.int32,
-                                         name="beam_width")
+  next_beam_size = ops.convert_to_tensor(
+      beam_width, dtype=dtypes.int32, name="beam_width")
   next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=next_beam_size)
 
   next_beam_scores.set_shape([static_batch_size, beam_width])
@@ -592,11 +594,11 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
   #       name="next_beam_word_ids")
   # would be a lot cleaner but for reasons unclear, that hides the results of
   # the op which prevents capturing it with tfdbg debug ops.
-  raw_next_word_ids = math_ops.mod(word_indices, vocab_size,
-                                   name="next_beam_word_ids")
+  raw_next_word_ids = math_ops.mod(
+      word_indices, vocab_size, name="next_beam_word_ids")
   next_word_ids = math_ops.to_int32(raw_next_word_ids)
-  next_beam_ids = math_ops.to_int32(word_indices / vocab_size,
-                                    name="next_beam_parent_ids")
+  next_beam_ids = math_ops.to_int32(
+      word_indices / vocab_size, name="next_beam_parent_ids")
 
   # Append new ids to current predictions
   previously_finished = _tensor_gather_helper(
@@ -605,9 +607,10 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
       batch_size=batch_size,
       range_size=beam_width,
       gather_shape=[-1])
-  next_finished = math_ops.logical_or(previously_finished,
-                                      math_ops.equal(next_word_ids, end_token),
-                                      name="next_beam_finished")
+  next_finished = math_ops.logical_or(
+      previously_finished,
+      math_ops.equal(next_word_ids, end_token),
+      name="next_beam_finished")
 
   # Calculate the length of the next predictions.
   # 1. Finished beams remain unchanged.
@@ -768,8 +771,12 @@ def _maybe_tensor_gather_helper(gather_indices, gather_from, batch_size,
     return gather_from
 
 
-def _tensor_gather_helper(gather_indices, gather_from, batch_size,
-                          range_size, gather_shape, name=None):
+def _tensor_gather_helper(gather_indices,
+                          gather_from,
+                          batch_size,
+                          range_size,
+                          gather_shape,
+                          name=None):
   """Helper for gathering the right indices from the tensor.
 
   This works by reshaping gather_from to gather_shape (e.g. [-1]) and then
@@ -800,9 +807,9 @@ def _tensor_gather_helper(gather_indices, gather_from, batch_size,
         array_ops.reshape(gather_from, gather_shape), gather_indices)
     final_shape = array_ops.shape(gather_from)[:1 + len(gather_shape)]
     static_batch_size = tensor_util.constant_value(batch_size)
-    final_static_shape = (tensor_shape.TensorShape([static_batch_size])
-                          .concatenate(
-                              gather_from.shape[1:1 + len(gather_shape)]))
+    final_static_shape = (
+        tensor_shape.TensorShape([static_batch_size]).concatenate(
+            gather_from.shape[1:1 + len(gather_shape)]))
     output = array_ops.reshape(output, final_shape, name="output")
     output.set_shape(final_static_shape)
     return output
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index ec5271abe0..7d95b6522c 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -15,10 +15,11 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_VERBS
 
+#include <fcntl.h>
+#include <cstdlib>
+
 #include "tensorflow/contrib/verbs/rdma.h"
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
-#include <cstdlib>
-#include <fcntl.h>
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/process_util.h"
@@ -27,15 +28,15 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
 #endif
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
-#include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/lib/core/threadpool.h"
 
 namespace tensorflow {
 
@@ -447,9 +448,9 @@ void RdmaAdapter::Process_CQ() {
     CHECK_GE(ne, 0);
     for (int i = 0; i < ne; ++i) {
       CHECK(wc_[i].status == IBV_WC_SUCCESS)
-          << "Failed status \n" << ibv_wc_status_str(wc_[i].status) << " "
-          << wc_[i].status << " " << static_cast<int>(wc_[i].wr_id) << " "
-          << wc_[i].vendor_err;
+          << "Failed status \n"
+          << ibv_wc_status_str(wc_[i].status) << " " << wc_[i].status << " "
+          << static_cast<int>(wc_[i].wr_id) << " " << wc_[i].vendor_err;
       if (wc_[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
         RdmaChannel* rc = reinterpret_cast<RdmaChannel*>(wc_[i].wr_id);
         // put back a recv wr.
@@ -538,7 +539,7 @@ int RdmaChannel::PingPostRecv() {
 int RdmaChannel::PingPostSend() {
   struct ibv_send_wr wr, *bad_wr;
   memset(&wr, 0, sizeof(wr));
-  wr.wr_id = (uint64_t) this;
+  wr.wr_id = (uint64_t)this;
   wr.sg_list = &ping_sge_list_;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_SEND;
@@ -658,7 +659,7 @@ void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) {
 void RdmaChannel::Recv() {
   struct ibv_recv_wr wr;
   memset(&wr, 0, sizeof(wr));
-  wr.wr_id = (uint64_t) this;
+  wr.wr_id = (uint64_t)this;
   struct ibv_recv_wr* bad_wr;
   CHECK(!ibv_post_recv(qp_, &wr, &bad_wr)) << "Failed to post recv";
 }
@@ -729,11 +730,11 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
     attr.ah_attr.grh.traffic_class = adapter_->params_.traffic_class;
 
     int r;
-    CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_AV |
-                                              IBV_QP_PATH_MTU |
-                                              IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
-                                              IBV_QP_MAX_DEST_RD_ATOMIC |
-                                              IBV_QP_MIN_RNR_TIMER)))
+    CHECK(!(r = ibv_modify_qp(qp_, &attr,
+                              IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
+                                  IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
+                                  IBV_QP_MAX_DEST_RD_ATOMIC |
+                                  IBV_QP_MIN_RNR_TIMER)))
         << "QP to Ready to Receive " << r;
 
     memset(&attr, 0, sizeof(ibv_qp_attr));
@@ -744,10 +745,10 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
     attr.rnr_retry = 7; /* infinite */
     attr.max_rd_atomic = 1;
 
-    CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT |
-                                              IBV_QP_RETRY_CNT |
-                                              IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
-                                              IBV_QP_MAX_QP_RD_ATOMIC)))
+    CHECK(!(r = ibv_modify_qp(qp_, &attr,
+                              IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
+                                  IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
+                                  IBV_QP_MAX_QP_RD_ATOMIC)))
         << "QP to Ready to Send " << r;
 
     connected_ = true;
@@ -897,16 +898,16 @@ static void CountCopies(const std::string& key, void* src_addr, void* dst_addr,
   }
   if ((++numTotalCopies % 0x400) == 0) {
     RDMA_LOG(0) << "Tensor copies:"
-                << " GPU to CPU: " << numGPUToCPUCopies
-                << " (" << numGPUToCPUCopiedBytes << " Bytes)"
-                << " CPU to GPU: " << numCPUToGPUCopies
-                << " (" << numCPUToGPUCopiedBytes << " Bytes)";
+                << " GPU to CPU: " << numGPUToCPUCopies << " ("
+                << numGPUToCPUCopiedBytes << " Bytes)"
+                << " CPU to GPU: " << numCPUToGPUCopies << " ("
+                << numCPUToGPUCopiedBytes << " Bytes)";
   }
-  RDMA_LOG(2) << "Copying tensor " << key
-              << " From: " << src_addr << " To: " << dst_addr;
-#endif // RDMA_COUNT_COPIES
+  RDMA_LOG(2) << "Copying tensor " << key << " From: " << src_addr
+              << " To: " << dst_addr;
+#endif  // RDMA_COUNT_COPIES
 }
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 #ifdef RDMA_DATA_VALIDATION
 static uint64_t Checksum(Device* device, const DeviceContext* device_context,
@@ -920,7 +921,7 @@ static uint64_t Checksum(Device* device, const DeviceContext* device_context,
     checksum = (device_context != nullptr)
                    ? GPUUtil::Checksum(device, device_context, in)
                    : GPUUtil::Checksum(in);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
   } else {
     string s = in.SummarizeValue(999999);
     checksum = Hash64(s.c_str(), s.size(), 0);
@@ -955,17 +956,16 @@ static void ValidateChecksum(uint64_t expected, uint64_t actual,
     }
   }
 }
-#endif // RDMA_DATA_VALIDATION
+#endif  // RDMA_DATA_VALIDATION
 
 #if GOOGLE_CUDA
 // Sync the 'done' operation on the GPU stream, but without all the data
 // copying.
-static void StreamGPUOp(Device* gpu_device,
-                        const DeviceContext* device_context,
+static void StreamGPUOp(Device* gpu_device, const DeviceContext* device_context,
                         StatusCallback done) {
   Tensor dummy1, dummy2;
-  GPUUtil::CopyGPUTensorToCPU(
-      gpu_device, device_context, &dummy1, &dummy2, done);
+  GPUUtil::CopyGPUTensorToCPU(gpu_device, device_context, &dummy1, &dummy2,
+                              done);
 }
 #endif  // GOOGLE_CUDA
 
@@ -1072,7 +1072,7 @@ void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
       // skip the copy here as well.
       if ((in.TotalBytes() > 0) && !meta_data_changed_ &&
           (RdmaMemoryMgr::Singleton().FindMemoryRegion(
-              (void*)DMAHelper::base(&in), in.TotalBytes()) != nullptr)) {
+               (void*)DMAHelper::base(&in), in.TotalBytes()) != nullptr)) {
         StreamGPUOp(src_dev_, send_dev_context,
                     [this, in, proto, is_dead](const Status& s) {
                       Send(in, proto, is_dead, s);
@@ -1118,8 +1118,8 @@ void RdmaTensorResponse::Send(const Tensor& in, const TensorProto& proto,
     return;
   }
   bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
-  bool proto_size_changed = (!can_memcpy) &&
-                            (proto.ByteSize() != rm_.tensor_bytes_);
+  bool proto_size_changed =
+      (!can_memcpy) && (proto.ByteSize() != rm_.tensor_bytes_);
   if (meta_data_changed_ || proto_size_changed) {
     Clone(in, proto, is_dead);
     SendMetaData(in, proto, is_dead);
@@ -1238,9 +1238,8 @@ void RdmaTensorResponse::SendErrorStatus(const Status& status) {
   rm.request_index_ = rm_.request_index_;
   rm.status_ = status;
   LOG(ERROR) << "Step 0x" << std::hex << rm.step_id_ << std::dec
-             << ": Sending RDMA_MESSAGE_ERROR_STATUS #"
-             << rm.request_index_ << ": " << rm.name_
-             << ". Status: " << status.ToString();
+             << ": Sending RDMA_MESSAGE_ERROR_STATUS #" << rm.request_index_
+             << ": " << rm.name_ << ". Status: " << status.ToString();
 
   string message = RdmaMessage::CreateMessage(rm);
   channel_->tx_message_buffer_->EnqueueItem(message);
@@ -1336,14 +1335,13 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
     uint32_t gsProtoSize = gsProto.ByteSize();
     if (gsProtoSize + 4 > kErrorStatusMaxSize) {
       LOG(ERROR) << "Error status (" << gsProtoSize + 4 << " bytes) "
-                 << "is too big to fit in RDMA message ("
-                 << kErrorStatusMaxSize << " bytes). Truncated.";
+                 << "is too big to fit in RDMA message (" << kErrorStatusMaxSize
+                 << " bytes). Truncated.";
       gsProtoSize = kErrorStatusMaxSize - 4;
     }
     uint32_t* proto_size = (uint32_t*)&message[kErrorStatusStartIndex];
     *proto_size = gsProtoSize;
-    gsProto.SerializeToArray(&message[kErrorStatusStartIndex + 4],
-                             gsProtoSize);
+    gsProto.SerializeToArray(&message[kErrorStatusStartIndex + 4], gsProtoSize);
     message_size += gsProtoSize + 4;
   }
   return string(message, message_size);
@@ -1393,8 +1391,8 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
   if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
     ErrorStatusProto gsProto;
     uint32_t gsProtoSize = *(uint32_t*)&message[kErrorStatusStartIndex];
-    CHECK(ParseProtoUnlimited(
-        &gsProto, &message[kErrorStatusStartIndex + 4], gsProtoSize))
+    CHECK(ParseProtoUnlimited(&gsProto, &message[kErrorStatusStartIndex + 4],
+                              gsProtoSize))
         << "Failed to parse error status proto from message. Aborting.";
     ::grpc::Status gs((::grpc::StatusCode)gsProto.error_code(),
                       gsProto.error_message(), gsProto.error_details());
@@ -1566,8 +1564,8 @@ void RdmaTensorRequest::AllocateTensorsAsync(StatusCallback done) {
   if (dst_dev_->tensorflow_gpu_device_info() && !on_host &&
       (proxy_tensor_ == nullptr)) {
 #if GOOGLE_CUDA
-        // We need to sync the memory allocation on the GPU:
-        StreamGPUOp(dst_dev_, recv_args_.device_context, done);
+    // We need to sync the memory allocation on the GPU:
+    StreamGPUOp(dst_dev_, recv_args_.device_context, done);
 #endif
   } else {
     done(Status::OK());
@@ -1594,9 +1592,8 @@ void RdmaTensorRequest::Send(RdmaMessageType message_type) {
   rm.rkey_ = (mr_ == nullptr) ? 0 : mr_->rkey;
 
   RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec
-              << ": Sending  " << MessageTypeToString(message_type)
-              << " #" << index_ << ": "
-              << rm.name_ << " on " << rdma_addr_
+              << ": Sending  " << MessageTypeToString(message_type) << " #"
+              << index_ << ": " << rm.name_ << " on " << rdma_addr_
               << " (rkey: 0x" << std::hex << rm.rkey_ << ")";
 
   string message = RdmaMessage::CreateMessage(rm);
@@ -1610,9 +1607,8 @@ void RdmaTensorRequest::RecvTensorMetaData(DataType dtype, TensorShape shape,
       key_, dtype, shape, is_dead, proto_size);
 
   DeallocateTensors();
-  AllocateTensorsAsync([this](const Status& s) {
-    Send(RDMA_MESSAGE_TENSOR_RE_REQUEST);
-  });
+  AllocateTensorsAsync(
+      [this](const Status& s) { Send(RDMA_MESSAGE_TENSOR_RE_REQUEST); });
 }
 
 void RdmaTensorRequest::RecvTensorContent() {
@@ -1620,8 +1616,8 @@ void RdmaTensorRequest::RecvTensorContent() {
   size_t message_size =
       can_memcpy ? result_tensor_->TotalBytes() : meta_data_->proto_size_;
   RDMA_LOG(1) << "Step 0x" << std::hex << step_id_ << std::dec
-              << ": Received tensor content #" << index_ << ": "
-              << key_ << " (Size: 0x" << std::hex << message_size << ")";
+              << ": Received tensor content #" << index_ << ": " << key_
+              << " (Size: 0x" << std::hex << message_size << ")";
 
   Tensor val;
 
@@ -1667,9 +1663,8 @@ void RdmaTensorRequest::RecvErrorStatus(const Status& status) {
 void RdmaTensorRequest::Start() {
   meta_data_ = RdmaMemoryMgr::Singleton().GetTensorMetaData(key_);
   if (meta_data_ != nullptr) {
-    AllocateTensorsAsync([this](const Status& s) {
-      Send(RDMA_MESSAGE_TENSOR_REQUEST);
-    });
+    AllocateTensorsAsync(
+        [this](const Status& s) { Send(RDMA_MESSAGE_TENSOR_REQUEST); });
   } else {
     Send(RDMA_MESSAGE_TENSOR_REQUEST);
   }
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index 68b3d59f56..b6c41de6ee 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -73,15 +73,8 @@ struct RemoteMR {
   uint64_t remote_addr;
   uint32_t rkey;
 };
-enum BufferStatus {
-  none,
-  idle,
-  busy
-};
-enum Location {
-  local,
-  remote
-};
+enum BufferStatus { none, idle, busy };
+enum Location { local, remote };
 
 enum RdmaMessageType {
   RDMA_MESSAGE_META_DATA_UPDATE,
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index f3644af0b4..369bd986df 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -116,9 +116,9 @@ void RdmaMgr::SetupChannels() {
         }
         CHECK(i == RdmaChannel::kNumMessageBuffers);
       } else {
-        LOG(ERROR) << "Connecting to " << worker_name
-                   << ": Got " << s.error_message() << ". Retrying ("
-                   << (attempts + 1) << "/" << max_num_attempts << ")..." ;
+        LOG(ERROR) << "Connecting to " << worker_name << ": Got "
+                   << s.error_message() << ". Retrying (" << (attempts + 1)
+                   << "/" << max_num_attempts << ")...";
         if (++attempts == max_num_attempts) {
           break;
         }
@@ -159,19 +159,17 @@ bool RdmaMgr::ConnectivityCheck() {
       ibv_wc_status s = rdma_adapter_->wc_[i].status;
       // recv complete
       if ((int)rdma_adapter_->wc_[i].wr_id == RdmaChannel::kPingRecvWrid) {
-        CHECK(s == IBV_WC_SUCCESS) << ": " << ibv_wc_status_str(
-                                                  rdma_adapter_->wc_[i].status)
-                                   << "(" << rdma_adapter_->wc_[i].status
-                                   << ") for PING_RECV_WRID";
+        CHECK(s == IBV_WC_SUCCESS)
+            << ": " << ibv_wc_status_str(rdma_adapter_->wc_[i].status) << "("
+            << rdma_adapter_->wc_[i].status << ") for PING_RECV_WRID";
         ++rcnt;
         // send complete
       } else {
         RdmaChannel* rc =
             reinterpret_cast<RdmaChannel*>(rdma_adapter_->wc_[i].wr_id);
-        CHECK(s == IBV_WC_SUCCESS) << ": " << ibv_wc_status_str(
-                                                  rdma_adapter_->wc_[i].status)
-                                   << "(" << rdma_adapter_->wc_[i].status
-                                   << ") to " << rc->remote_name_;
+        CHECK(s == IBV_WC_SUCCESS)
+            << ": " << ibv_wc_status_str(rdma_adapter_->wc_[i].status) << "("
+            << rdma_adapter_->wc_[i].status << ") to " << rc->remote_name_;
         ++scnt;
       }
     }  // for
@@ -238,8 +236,9 @@ int TryToReadNumaNode(ibv_device* device) {
   if (strings::safe_strto32(content, &value)) {
     if (value < 0) {
       LOG(INFO) << "Successful NUMA node read from SysFS had negative value ("
-                << value << "), but there must be at least one NUMA node"
-                            ", so returning NUMA node zero";
+                << value
+                << "), but there must be at least one NUMA node"
+                   ", so returning NUMA node zero";
       return 0;
     }
     LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
@@ -302,8 +301,8 @@ void RdmaMgr::InitAllocators() {
         &RdmaMemoryMgr::EvictMemoryRegion, &RdmaMemoryMgr::Singleton(), _1, _2);
 
     auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
-    CHECK(visitable_allocator) << "is not visitable for instrumentation"
-                               << allocator->Name();
+    CHECK(visitable_allocator)
+        << "is not visitable for instrumentation" << allocator->Name();
     // Make sure we don't instrument the same allocator twice
     if (instrumented_.find(allocator) == std::end(instrumented_)) {
       visitable_allocator->AddAllocVisitor(alloc_visitor);
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index bb5eceab27..89d37d2f87 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -65,13 +65,11 @@ class MklAddNOp : public OpKernel {
     TensorShape src1_shape, src2_shape;
     src1_shape = input0.shape();
     src2_shape = input1.shape();
-    if (!src1_shape.IsSameSize(src2_shape) ){
-      ctx->SetStatus( 
-          errors::InvalidArgument(
-          "Inputs to operation ", this->name(), " of type ", this->type_string(),
-          " must have the same size and shape.  Input 0: ",
-          src1_shape.DebugString(), " != input 1: ",
-          src2_shape.DebugString()));
+    if (!src1_shape.IsSameSize(src2_shape)) {
+      ctx->SetStatus(errors::InvalidArgument(
+          "Inputs to operation ", this->name(), " of type ",
+          this->type_string(), " must have the same size and shape.  Input 0: ",
+          src1_shape.DebugString(), " != input 1: ", src2_shape.DebugString()));
     }
     // handle the case of a scalar
     if (!input1_in_mkl_format && input0.dims() == 0) {
@@ -82,17 +80,16 @@ class MklAddNOp : public OpKernel {
                                 mkl_context.output_shape);
       float user_i1 = (input0.scalar<T>()());
       float user_i2 = (input1.scalar<T>()());
-      out_tensor->scalar<T>()() =
-          std::plus<float>{}(user_i1, user_i2);
+      out_tensor->scalar<T>()() = std::plus<float>{}(user_i1, user_i2);
       return;
     }
 
     mkl_context.in_dims = input1_in_mkl_format
-        ? mkl_context.input1_shape.GetDimension()
-        : input0.dims();
+                              ? mkl_context.input1_shape.GetDimension()
+                              : input0.dims();
     mkl_context.in_dims = input2_in_mkl_format
-        ? mkl_context.input2_shape.GetDimension()
-        : input1.dims();
+                              ? mkl_context.input2_shape.GetDimension()
+                              : input1.dims();
 
     // If there is nothing to compute, return.
     if (!input1_in_mkl_format && !input2_in_mkl_format) {
@@ -101,7 +98,7 @@ class MklAddNOp : public OpKernel {
         Tensor* out_tensor = nullptr;
         mkl_context.output_shape.SetMklTensor(false);
         AllocateOutputSetMklShape(ctx, src1_idx, &out_tensor, o_shape,
-                                 mkl_context.output_shape);
+                                  mkl_context.output_shape);
         return;
       }
     }
@@ -110,9 +107,9 @@ class MklAddNOp : public OpKernel {
     mkl_context.in_strides = new size_t[mkl_context.in_dims];
     // Generate size, stride for input if input is in MKL format.
     if (input1_in_mkl_format || input2_in_mkl_format) {
-      const MklShape* tmp_mkl_shape =
-        (input1_in_mkl_format) ? &mkl_context.input1_shape :
-        &mkl_context.input2_shape;
+      const MklShape* tmp_mkl_shape = (input1_in_mkl_format)
+                                          ? &mkl_context.input1_shape
+                                          : &mkl_context.input2_shape;
       for (int i = 0; i < mkl_context.in_dims; i++) {
         mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
         mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
@@ -136,32 +133,33 @@ class MklAddNOp : public OpKernel {
 
     Tensor mkl_tmp_input1_buf_tensor, mkl_tmp_input2_buf_tensor;
     mkl_context.MklPrepareAddNInputs(ctx, &mkl_tmp_input1_buf_tensor,
-    &mkl_tmp_input2_buf_tensor);
+                                     &mkl_tmp_input2_buf_tensor);
     Tensor* output = nullptr;
     if (input1_in_mkl_format || input2_in_mkl_format) {
-     TensorShape tf_shape;
-     mkl_context.output_shape.SetMklTensor(true);
-     mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise, dnnResourceDst);
-
-     mkl_context.output_shape.SetTfLayout(
-        mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-     if (input1_in_mkl_format == true) {
-      mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
-      mkl_context.input1_shape.GetTfToMklDimMap());
-     } else {
-      mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
-      mkl_context.input2_shape.GetTfToMklDimMap());
-     }
-     tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                        mkl_context.output_shape.GetMklLayout())) /
-                    sizeof(T));
-
-     AllocateOutputSetMklShape(ctx, src1_idx, &output, tf_shape,
-                              mkl_context.output_shape);
+      TensorShape tf_shape;
+      mkl_context.output_shape.SetMklTensor(true);
+      mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise,
+                                            dnnResourceDst);
+
+      mkl_context.output_shape.SetTfLayout(
+          mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
+      if (input1_in_mkl_format == true) {
+        mkl_context.output_shape.SetTfDimOrder(
+            mkl_context.in_dims, mkl_context.input1_shape.GetTfToMklDimMap());
+      } else {
+        mkl_context.output_shape.SetTfDimOrder(
+            mkl_context.in_dims, mkl_context.input2_shape.GetTfToMklDimMap());
+      }
+      tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+                          mkl_context.output_shape.GetMklLayout())) /
+                      sizeof(T));
+
+      AllocateOutputSetMklShape(ctx, src1_idx, &output, tf_shape,
+                                mkl_context.output_shape);
     } else {
-     const TensorShape& o_shape = input1.shape();
-     mkl_context.output_shape.SetMklTensor(false);
-     AllocateOutputSetMklShape(ctx, src1_idx, &output, o_shape,
+      const TensorShape& o_shape = input1.shape();
+      mkl_context.output_shape.SetMklTensor(false);
+      AllocateOutputSetMklShape(ctx, src1_idx, &output, o_shape,
                                 mkl_context.output_shape);
     }
 
@@ -189,18 +187,16 @@ class MklAddNOp : public OpKernel {
     void MklCreateInputLayouts(OpKernelContext* context) {
       bool input1_in_mkl_format = input1_shape.IsMklTensor();
       if (!input1_in_mkl_format) {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&lt_input1, in_dims, in_sizes, in_strides),
-            E_SUCCESS);
+        CHECK_EQ(dnnLayoutCreate_F32(&lt_input1, in_dims, in_sizes, in_strides),
+                 E_SUCCESS);
       } else {
         lt_input1 = static_cast<dnnLayout_t>(input1_shape.GetCurLayout());
       }
 
       bool input2_in_mkl_format = input2_shape.IsMklTensor();
       if (!input2_in_mkl_format) {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&lt_input2, in_dims, in_sizes, in_strides),
-            E_SUCCESS);
+        CHECK_EQ(dnnLayoutCreate_F32(&lt_input2, in_dims, in_sizes, in_strides),
+                 E_SUCCESS);
       } else {
         lt_input2 = static_cast<dnnLayout_t>(input2_shape.GetCurLayout());
       }
@@ -276,14 +272,14 @@ class MklAddNOp : public OpKernel {
       bool input2_in_mkl_format = input2_shape.IsMklTensor();
       dnnDelete_F32(Eltwise);
       if (!input1_in_mkl_format || !input2_in_mkl_format) {
-         delete [] in_sizes;
-         delete [] in_strides;
+        delete[] in_sizes;
+        delete[] in_strides;
       }
       if (!input1_in_mkl_format) {
-         dnnLayoutDelete_F32(lt_input1);
+        dnnLayoutDelete_F32(lt_input1);
       }
       if (!input2_in_mkl_format) {
-         dnnLayoutDelete_F32(lt_input2);
+        dnnLayoutDelete_F32(lt_input2);
       }
     }
   } MklAddNOpContext;
@@ -315,45 +311,44 @@ class MklAddNOp : public OpKernel {
       GetMklShape(ctx, src2_idx, &src2_mkl_shape);
       bool input1_in_mkl_format = src1_mkl_shape.IsMklTensor();
       bool input2_in_mkl_format = src2_mkl_shape.IsMklTensor();
-      int src1_dims_size = input1_in_mkl_format?
-       src1_mkl_shape.GetDimension(): src1_tensor.dims();
-      int src2_dims_size = input2_in_mkl_format?
-       src2_mkl_shape.GetDimension(): src2_tensor.dims();
+      int src1_dims_size = input1_in_mkl_format ? src1_mkl_shape.GetDimension()
+                                                : src1_tensor.dims();
+      int src2_dims_size = input2_in_mkl_format ? src2_mkl_shape.GetDimension()
+                                                : src2_tensor.dims();
       // if the shapes of two tensors are not same raise op error
       TensorShape src1_shape, src2_shape;
       src1_shape = src1_tensor.shape();
       src2_shape = src2_tensor.shape();
-      if (!src1_shape.IsSameSize(src2_shape) ){
-	ctx->SetStatus( 
-            errors::InvalidArgument(
-            "Inputs to operation ", this->name(), " of type ", this->type_string(),
+      if (!src1_shape.IsSameSize(src2_shape)) {
+        ctx->SetStatus(errors::InvalidArgument(
+            "Inputs to operation ", this->name(), " of type ",
+            this->type_string(),
             " must have the same size and shape.  Input 0: ",
-            src1_shape.DebugString(), " != input 1: ",
-            src2_shape.DebugString()));
+            src1_shape.DebugString(),
+            " != input 1: ", src2_shape.DebugString()));
       }
 
       if (!input1_in_mkl_format && src1_dims_size == 0) {
-         Tensor* dst_tensor = nullptr;
-         MklShape mkl_shape_dst;
-         mkl_shape_dst.SetMklTensor(false);
-         AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
-         src1_tensor.shape(), mkl_shape_dst);
-         float user_i1 = (src1_tensor.scalar<T>()());
-         float user_i2 = (src2_tensor.scalar<T>()());
-         dst_tensor->scalar<T>()() =
-           std::plus<float>{}(user_i1, user_i2);
-         return;
-       }
+        Tensor* dst_tensor = nullptr;
+        MklShape mkl_shape_dst;
+        mkl_shape_dst.SetMklTensor(false);
+        AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
+                                  src1_tensor.shape(), mkl_shape_dst);
+        float user_i1 = (src1_tensor.scalar<T>()());
+        float user_i2 = (src2_tensor.scalar<T>()());
+        dst_tensor->scalar<T>()() = std::plus<float>{}(user_i1, user_i2);
+        return;
+      }
 
       // If there is nothing to compute, return.
       if (!input1_in_mkl_format && !input2_in_mkl_format) {
         if (src1_tensor.shape().num_elements() == 0) {
-           Tensor* dst_tensor = nullptr;
-           MklShape mkl_shape_dst;
-           mkl_shape_dst.SetMklTensor(false);
-           AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
-           src1_tensor.shape(), mkl_shape_dst);
-           return;
+          Tensor* dst_tensor = nullptr;
+          MklShape mkl_shape_dst;
+          mkl_shape_dst.SetMklTensor(false);
+          AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
+                                    src1_tensor.shape(), mkl_shape_dst);
+          return;
         }
       }
 
@@ -362,7 +357,7 @@ class MklAddNOp : public OpKernel {
       MklDnnData<T> src2(&cpu_engine);
       MklDnnData<T> dst(&cpu_engine);
 
-      int tmp_size = input1_in_mkl_format ? src2_dims_size: src1_dims_size;
+      int tmp_size = input1_in_mkl_format ? src2_dims_size : src1_dims_size;
       memory::dims dims(tmp_size);
       memory::dims strides(tmp_size);
       memory::desc md1({}, memory::data_undef, memory::format_undef);
@@ -392,21 +387,19 @@ class MklAddNOp : public OpKernel {
         md1 = src1_mkl_shape.GetMklLayout();
 
         memory::format src1_mkl_data_format = src1_mkl_shape.GetTfDataFormat();
-        auto src1_tf_data_format = MklDnnDataFormatToTFDataFormat(
-                                    src1_mkl_data_format);
-        auto src2_dims = TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(),
-                                    src1_tf_data_format);
-        md2 = memory::desc(src2_dims, MklDnnType<T>(),
-                           src1_mkl_data_format);
+        auto src1_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(src1_mkl_data_format);
+        auto src2_dims =
+            TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(), src1_tf_data_format);
+        md2 = memory::desc(src2_dims, MklDnnType<T>(), src1_mkl_data_format);
       } else if (input2_in_mkl_format && !input1_in_mkl_format) {
         // Same comment as above.
         memory::format src2_mkl_data_format = src2_mkl_shape.GetTfDataFormat();
-        auto src2_tf_data_format = MklDnnDataFormatToTFDataFormat(
-                                     src2_mkl_data_format);
-        auto src1_dims = TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(),
-                                    src2_tf_data_format);
-        md1 = memory::desc(src1_dims, MklDnnType<T>(),
-                           src2_mkl_data_format);
+        auto src2_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(src2_mkl_data_format);
+        auto src1_dims =
+            TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(), src2_tf_data_format);
+        md1 = memory::desc(src1_dims, MklDnnType<T>(), src2_mkl_data_format);
 
         md2 = src2_mkl_shape.GetMklLayout();
       } else {
@@ -480,20 +473,19 @@ class MklAddNOp : public OpKernel {
         output_mkl_shape.SetMklTensor(false);
         output_tf_shape = src1_tensor.shape();
       }
-      AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
-                                output_tf_shape, output_mkl_shape);
+      AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor, output_tf_shape,
+                                output_mkl_shape);
       dst.SetUsrMemDataHandle(dst_tensor);
 
       // Create Sum op, and submit net for execution.
       net.push_back(sum(sum_pd, inputs, dst.GetOpMem()));
       stream(stream::kind::eager).submit(net).wait();
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(ctx, errors::Aborted("Operation received an exception:",
-                                            error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          ctx, errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 };
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index 896d562933..c46eabdde1 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -17,13 +17,13 @@ limitations under the License.
 #ifdef INTEL_MKL
 #ifdef INTEL_MKL_DNN
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #include "mkldnn.h"
 #include "mkldnn_types.h"
@@ -31,16 +31,14 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"
 
 #include "mkldnn.hpp"
-using mkldnn::stream;
 using mkldnn::prop_kind;
 using mkldnn::softmax_forward;
+using mkldnn::stream;
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-
-
 template <typename Device, typename T>
 class MklSoftmaxOp : public OpKernel {
  public:
@@ -60,11 +58,11 @@ class MklSoftmaxOp : public OpKernel {
       MklDnnShape src_mkl_shape;
       GetMklShape(context, src_idx, &src_mkl_shape);
 
-
       // src_dims is the dimenstion of src_tensor
       // dim of the dst will also be same as src_dims
-      auto src_tf_shape = src_mkl_shape.IsMklTensor() ?
-                          src_mkl_shape.GetTfShape() : src_tensor.shape();
+      auto src_tf_shape = src_mkl_shape.IsMklTensor()
+                              ? src_mkl_shape.GetTfShape()
+                              : src_tensor.shape();
       auto src_dims = TFShapeToMklDnnDims(src_tf_shape);
       auto output_dims = src_dims;
 
@@ -77,10 +75,10 @@ class MklSoftmaxOp : public OpKernel {
       // construct input Tf layout. For TF layout, although input shape
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
       // layout
-      auto src_md = src_mkl_shape.IsMklTensor()
-                    ? src_mkl_shape.GetMklLayout()
-                    : memory::desc(src_dims, MklDnnType<T>(),
-                                         memory::format::nc);
+      auto src_md =
+          src_mkl_shape.IsMklTensor()
+              ? src_mkl_shape.GetMklLayout()
+              : memory::desc(src_dims, MklDnnType<T>(), memory::format::nc);
 
       // src: setting memory descriptor and op memory descriptor
       // Basically following two functions maps the TF "src_tensor" to mkl
@@ -95,8 +93,8 @@ class MklSoftmaxOp : public OpKernel {
       int axis = 1;  // axis to which softmax will be applied
       auto softmax_fwd_desc = softmax_forward::desc(prop_kind::forward_scoring,
                                                     src.GetOpMemDesc(), axis);
-      auto softmax_fwd_pd = softmax_forward::primitive_desc(softmax_fwd_desc,
-                                                            cpu_engine);
+      auto softmax_fwd_pd =
+          softmax_forward::primitive_desc(softmax_fwd_desc, cpu_engine);
 
       // add: output
       Tensor* output_tensor = nullptr;
@@ -136,9 +134,9 @@ class MklSoftmaxOp : public OpKernel {
       net.push_back(softmax_fwd);
       stream(stream::kind::eager).submit(net).wait();
     } catch (mkldnn::error& e) {
-      string error_msg = "Status: " + std::to_string(e.status) + ", message: " +
-                         string(e.message) + ", in file " + string(__FILE__) +
-                         ":" + std::to_string(__LINE__);
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
       OP_REQUIRES_OK(
           context,
           errors::Aborted("Operation received an exception:", error_msg));
@@ -148,7 +146,7 @@ class MklSoftmaxOp : public OpKernel {
 
 /* Register DNN kernels for supported operations and supported types - right now
  * it is only Softmax and f32 */
-#define REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES(type)             \
+#define REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES(type)          \
   REGISTER_KERNEL_BUILDER(Name("_MklSoftmax")                       \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<type>("T")            \
@@ -156,7 +154,6 @@ class MklSoftmaxOp : public OpKernel {
                           MklSoftmaxOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES);
 
-
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL_DNN
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.cc b/tensorflow/core/kernels/spectrogram_test_utils.cc
index bc30330d61..872a6e9d1b 100644
--- a/tensorflow/core/kernels/spectrogram_test_utils.cc
+++ b/tensorflow/core/kernels/spectrogram_test_utils.cc
@@ -72,12 +72,12 @@ bool ReadRawFloatFileToComplexVector(
   while (offset < end) {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     char arr[4];
-    for (int i = 0; i < kBytesPerValue; ++i ) {
+    for (int i = 0; i < kBytesPerValue; ++i) {
       arr[3 - i] = *(data_string.data() + offset + i);
     }
     memcpy(&real_out, arr, kBytesPerValue);
     offset += kBytesPerValue;
-    for (int i = 0; i < kBytesPerValue; ++i ) {
+    for (int i = 0; i < kBytesPerValue; ++i) {
       arr[3 - i] = *(data_string.data() + offset + i);
     }
     memcpy(&imag_out, arr, kBytesPerValue);
diff --git a/tensorflow/core/kernels/transpose_functor_cpu.cc b/tensorflow/core/kernels/transpose_functor_cpu.cc
index 6594f7ee7b..5198df7e16 100644
--- a/tensorflow/core/kernels/transpose_functor_cpu.cc
+++ b/tensorflow/core/kernels/transpose_functor_cpu.cc
@@ -89,17 +89,17 @@ struct Transpose<CPUDevice, T, conjugate> {
                                                        out);
         break;
       case 6:
-	internal::TransposeUsingEigen<CPUDevice, T, 6>(d, in, perm, conjugate,
-						       out);
-	break;
+        internal::TransposeUsingEigen<CPUDevice, T, 6>(d, in, perm, conjugate,
+                                                       out);
+        break;
       case 7:
-	internal::TransposeUsingEigen<CPUDevice, T, 7>(d, in, perm, conjugate,
-						       out);
-	break;
+        internal::TransposeUsingEigen<CPUDevice, T, 7>(d, in, perm, conjugate,
+                                                       out);
+        break;
       case 8:
         internal::TransposeUsingEigen<CPUDevice, T, 8>(d, in, perm, conjugate,
-						       out);
-	break;
+                                                       out);
+        break;
       default:
         TransposeSimple<T, conjugate>(d, in, perm, out);
         break;
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 7d1650f05e..f6906b0f79 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -40,10 +40,10 @@ current_path = os.path.dirname(os.path.realpath(sys.argv[0]))
 
 parser = argparse.ArgumentParser()
 parser.add_argument(
-      '--log_dir',
-      type=str,
-      default=os.path.join(current_path, 'log'),
-      help='The log directory for TensorBoard summaries.')
+    '--log_dir',
+    type=str,
+    default=os.path.join(current_path, 'log'),
+    help='The log directory for TensorBoard summaries.')
 FLAGS, unparsed = parser.parse_known_args()
 
 # Create the directory for TensorBoard variables if there is not.
@@ -81,6 +81,7 @@ def read_data(filename):
     data = tf.compat.as_str(f.read(f.namelist()[0])).split()
   return data
 
+
 vocabulary = read_data(filename)
 print('Data size', len(vocabulary))
 
@@ -106,20 +107,22 @@ def build_dataset(words, n_words):
   reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
   return data, count, dictionary, reversed_dictionary
 
+
 # Filling 4 global variables:
 # data - list of codes (integers from 0 to vocabulary_size-1).
 #   This is the original text but words are replaced by their codes
 # count - map of words(strings) to count of occurrences
 # dictionary - map of words(strings) to their codes(integers)
 # reverse_dictionary - maps codes(integers) to words(strings)
-data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
-                                                            vocabulary_size)
+data, count, dictionary, reverse_dictionary = build_dataset(
+    vocabulary, vocabulary_size)
 del vocabulary  # Hint to reduce memory.
 print('Most common words (+UNK)', count[:5])
 print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
 
 data_index = 0
 
+
 # Step 3: Function to generate a training batch for the skip-gram model.
 def generate_batch(batch_size, num_skips, skip_window):
   global data_index
@@ -149,28 +152,28 @@ def generate_batch(batch_size, num_skips, skip_window):
   data_index = (data_index + len(data) - span) % len(data)
   return batch, labels
 
+
 batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
 for i in range(8):
-  print(batch[i], reverse_dictionary[batch[i]],
-        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
+  print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
+        reverse_dictionary[labels[i, 0]])
 
 # Step 4: Build and train a skip-gram model.
 
 batch_size = 128
 embedding_size = 128  # Dimension of the embedding vector.
-skip_window = 1       # How many words to consider left and right.
-num_skips = 2         # How many times to reuse an input to generate a label.
-num_sampled = 64      # Number of negative examples to sample.
+skip_window = 1  # How many words to consider left and right.
+num_skips = 2  # How many times to reuse an input to generate a label.
+num_sampled = 64  # Number of negative examples to sample.
 
 # We pick a random validation set to sample nearest neighbors. Here we limit the
 # validation samples to the words that have a low numeric ID, which by
 # construction are also the most frequent. These 3 variables are used only for
 # displaying model accuracy, they don't affect calculation.
-valid_size = 16     # Random set of words to evaluate similarity on.
+valid_size = 16  # Random set of words to evaluate similarity on.
 valid_window = 100  # Only pick dev samples in the head of the distribution.
 valid_examples = np.random.choice(valid_window, valid_size, replace=False)
 
-
 graph = tf.Graph()
 
 with graph.as_default():
@@ -192,8 +195,9 @@ with graph.as_default():
     # Construct the variables for the NCE loss
     with tf.name_scope('weights'):
       nce_weights = tf.Variable(
-          tf.truncated_normal([vocabulary_size, embedding_size],
-                              stddev=1.0 / math.sqrt(embedding_size)))
+          tf.truncated_normal(
+              [vocabulary_size, embedding_size],
+              stddev=1.0 / math.sqrt(embedding_size)))
     with tf.name_scope('biases'):
       nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
 
@@ -204,12 +208,13 @@ with graph.as_default():
   #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
   with tf.name_scope('loss'):
     loss = tf.reduce_mean(
-        tf.nn.nce_loss(weights=nce_weights,
-                       biases=nce_biases,
-                       labels=train_labels,
-                       inputs=embed,
-                       num_sampled=num_sampled,
-                       num_classes=vocabulary_size))
+        tf.nn.nce_loss(
+            weights=nce_weights,
+            biases=nce_biases,
+            labels=train_labels,
+            inputs=embed,
+            num_sampled=num_sampled,
+            num_classes=vocabulary_size))
 
   # Add the loss value as a scalar to summary.
   tf.summary.scalar('loss', loss)
@@ -221,8 +226,8 @@ with graph.as_default():
   # Compute the cosine similarity between minibatch examples and all embeddings.
   norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
   normalized_embeddings = embeddings / norm
-  valid_embeddings = tf.nn.embedding_lookup(
-      normalized_embeddings, valid_dataset)
+  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
+                                            valid_dataset)
   similarity = tf.matmul(
       valid_embeddings, normalized_embeddings, transpose_b=True)
 
@@ -248,8 +253,8 @@ with tf.Session(graph=graph) as session:
 
   average_loss = 0
   for step in xrange(num_steps):
-    batch_inputs, batch_labels = generate_batch(
-        batch_size, num_skips, skip_window)
+    batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
+                                                skip_window)
     feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
 
     # Define metadata variable.
@@ -259,9 +264,12 @@ with tf.Session(graph=graph) as session:
     # in the list of returned values for session.run()
     # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
     # Feed metadata variable to session for visualizing the graph in TensorBoard.
-    _, summary, loss_val = session.run([optimizer, merged, loss], feed_dict=feed_dict, run_metadata=run_metadata)
+    _, summary, loss_val = session.run(
+        [optimizer, merged, loss],
+        feed_dict=feed_dict,
+        run_metadata=run_metadata)
     average_loss += loss_val
-    
+
     # Add returned summaries to writer in each step.
     writer.add_summary(summary, step)
     # Add metadata to visualize the graph for the last run.
@@ -295,7 +303,7 @@ with tf.Session(graph=graph) as session:
       f.write(reverse_dictionary[i] + '\n')
 
   # Save the model for checkpoints.
-  saver.save(session, os.path.join(FLAGS.log_dir, "model.ckpt"))
+  saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))
 
   # Create a configuration for visualizing embeddings with the labels in TensorBoard.
   config = projector.ProjectorConfig()
@@ -317,21 +325,24 @@ def plot_with_labels(low_dim_embs, labels, filename):
   for i, label in enumerate(labels):
     x, y = low_dim_embs[i, :]
     plt.scatter(x, y)
-    plt.annotate(label,
-                 xy=(x, y),
-                 xytext=(5, 2),
-                 textcoords='offset points',
-                 ha='right',
-                 va='bottom')
+    plt.annotate(
+        label,
+        xy=(x, y),
+        xytext=(5, 2),
+        textcoords='offset points',
+        ha='right',
+        va='bottom')
 
   plt.savefig(filename)
 
+
 try:
   # pylint: disable=g-import-not-at-top
   from sklearn.manifold import TSNE
   import matplotlib.pyplot as plt
 
-  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
+  tsne = TSNE(
+      perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
   plot_only = 500
   low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
   labels = [reverse_dictionary[i] for i in xrange(plot_only)]
diff --git a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
index eac1c1960d..bd80b9dbf5 100644
--- a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
@@ -51,8 +51,9 @@ class BatchDatasetTest(test.TestCase):
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-                .repeat(count).batch(batch_size).make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+        .repeat(count).batch(batch_size).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -68,7 +69,7 @@ class BatchDatasetTest(test.TestCase):
         result = sess.run(get_next)
         for component, result_component in zip(components, result):
           for j in range(14):
-            self.assertAllEqual(component[(i*14 + j) % 7]**2,
+            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
                                 result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -83,12 +84,12 @@ class BatchDatasetTest(test.TestCase):
         result = sess.run(get_next)
         for component, result_component in zip(components, result):
           for j in range(8):
-            self.assertAllEqual(component[(i*8 + j) % 7]**2,
+            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                                 result_component[j])
       result = sess.run(get_next)
       for component, result_component in zip(components, result):
         for j in range((14 * 7) % 8):
-          self.assertAllEqual(component[((num_batches - 1)*8 + j) % 7]**2,
+          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
                               result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -189,33 +190,34 @@ class BatchDatasetTest(test.TestCase):
         sess.run(get_next)
 
   def testBatchShapeError(self):
+
     def generator():
       yield [1.0, 2.0, 3.0]
       yield [4.0, 5.0, 6.0]
       yield [7.0, 8.0, 9.0, 10.0]
 
-    iterator = (dataset_ops.Dataset.from_generator(generator, dtypes.float32,
-                                                   output_shapes=[None])
-                .batch(3)
-                .make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_generator(
+            generator, dtypes.float32, output_shapes=[None]).batch(3)
+        .make_initializable_iterator())
     next_element = iterator.get_next()
 
     with self.test_session() as sess:
       sess.run(iterator.initializer)
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
-          r"Cannot batch tensors with different shapes in component 0. "
-          r"First element had shape \[3\] and element 2 had shape \[4\]."):
+          r'Cannot batch tensors with different shapes in component 0. '
+          r'First element had shape \[3\] and element 2 had shape \[4\].'):
         sess.run(next_element)
 
   def testPaddedBatchDataset(self):
     seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
     padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(seq_lens)
-                .map(lambda x: array_ops.fill([x], x)).padded_batch(
-                    4,
-                    padded_shapes=padded_shape).make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(seq_lens)
+        .map(lambda x: array_ops.fill([x], x)).padded_batch(
+            4, padded_shapes=padded_shape).make_initializable_iterator())
 
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -223,35 +225,40 @@ class BatchDatasetTest(test.TestCase):
     with self.test_session() as sess:
       # Test with random sequence lengths, and max padding.
       random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(init_op, feed_dict={padded_shape: [-1],
-                                   seq_lens: random_seq_lens})
+      sess.run(
+          init_op, feed_dict={
+              padded_shape: [-1],
+              seq_lens: random_seq_lens
+          })
       for i in range(8):
         result = sess.run(get_next)
         padded_len = np.max(result)
         self.assertEqual((4, padded_len), result.shape)
         for j in range(4):
-          seq_len = random_seq_lens[(i*4)+j]
+          seq_len = random_seq_lens[(i * 4) + j]
           self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
           self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
       # Test with random sequence lengths, and constant padding.
-      sess.run(init_op, feed_dict={padded_shape: [25],
-                                   seq_lens: random_seq_lens})
+      sess.run(
+          init_op, feed_dict={
+              padded_shape: [25],
+              seq_lens: random_seq_lens
+          })
       for i in range(8):
         result = sess.run(get_next)
         self.assertEqual((4, 25), result.shape)
         for j in range(4):
-          seq_len = random_seq_lens[(i*4)+j]
+          seq_len = random_seq_lens[(i * 4) + j]
           self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
           self.assertAllEqual(result[j, seq_len:], [0] * (25 - seq_len))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
       # Test correct handling of empty tensors.
-      sess.run(init_op, feed_dict={padded_shape: [-1],
-                                   seq_lens: [0, 0, 0, 0]})
+      sess.run(init_op, feed_dict={padded_shape: [-1], seq_lens: [0, 0, 0, 0]})
       result = sess.run(get_next)
       self.assertAllEqual([[], [], [], []], result)
       with self.assertRaises(errors.OutOfRangeError):
@@ -259,8 +266,7 @@ class BatchDatasetTest(test.TestCase):
 
       # Test error handling with constant sequence lengths, and
       # too-short padding.
-      sess.run(init_op, feed_dict={padded_shape: [5],
-                                   seq_lens: [6, 5, 5, 5]})
+      sess.run(init_op, feed_dict={padded_shape: [5], seq_lens: [6, 5, 5, 5]})
       with self.assertRaises(errors.DataLossError):
         result = sess.run(get_next)
 
@@ -271,11 +277,13 @@ class BatchDatasetTest(test.TestCase):
     def fill_tuple(x):
       filled = array_ops.fill([x], x)
       return (filled, string_ops.as_string(filled))
-    iterator = (dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
-                .padded_batch(
-                    4,
-                    padded_shapes=(padded_shape, padded_shape),
-                    padding_values=(-1, "<end>")).make_initializable_iterator())
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
+        .padded_batch(
+            4,
+            padded_shapes=(padded_shape, padded_shape),
+            padding_values=(-1, '<end>')).make_initializable_iterator())
 
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -283,46 +291,46 @@ class BatchDatasetTest(test.TestCase):
     with self.test_session() as sess:
       # Test with random sequence lengths, and max padding.
       random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(init_op, feed_dict={padded_shape: [-1],
-                                   seq_lens: random_seq_lens})
+      sess.run(
+          init_op, feed_dict={
+              padded_shape: [-1],
+              seq_lens: random_seq_lens
+          })
       for i in range(8):
         result = sess.run(get_next)
         padded_len = np.max(result[0])
         self.assertEqual((4, padded_len), result[0].shape)
         self.assertEqual((4, padded_len), result[1].shape)
         for j in range(4):
-          seq_len = random_seq_lens[(i*4)+j]
+          seq_len = random_seq_lens[(i * 4) + j]
           self.assertAllEqual(result[0][j, :seq_len], [seq_len] * seq_len)
           self.assertAllEqual(result[0][j, seq_len:],
                               [-1] * (padded_len - seq_len))
           self.assertAllEqual(result[1][j, :seq_len],
                               [compat.as_bytes(str(seq_len))] * seq_len)
           self.assertAllEqual(result[1][j, seq_len:],
-                              [b"<end>"] * (padded_len - seq_len))
+                              [b'<end>'] * (padded_len - seq_len))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
   def testPaddedBatchDatasetUnicode(self):
     # See GitHub issue 16149
     def generator():
-      data = [
-          [u'Простой', u'тест', u'юникода'],
-          [u'никогда', u'не', u'бывает', u'простым']]
+      data = [[u'Простой', u'тест', u'юникода'],
+              [u'никогда', u'не', u'бывает', u'простым']]
 
       for seq in data:
         yield seq, [0, 1, 2, 3]
 
     dataset = dataset_ops.Dataset.from_generator(
-        generator,
-        (dtypes.string, dtypes.int32),
+        generator, (dtypes.string, dtypes.int32),
         (tensor_shape.TensorShape([None]), tensor_shape.TensorShape([None])))
-    padded_dataset = dataset.padded_batch(2, padded_shapes=([None], [None]),
-                                          padding_values=('', 0))
+    padded_dataset = dataset.padded_batch(
+        2, padded_shapes=([None], [None]), padding_values=('', 0))
     with self.test_session() as sess:
       next_element = padded_dataset.make_one_shot_iterator().get_next()
       sess.run(next_element)
 
-
   def testPaddedBatchDatasetShapeSpecifications(self):
     int_placeholder = array_ops.placeholder(dtypes.int32)
     float_placeholder = array_ops.placeholder(dtypes.float32)
@@ -346,15 +354,16 @@ class BatchDatasetTest(test.TestCase):
                        constant_op.constant([-1, -1], dtype=dtypes.int64),
                        constant_op.constant([37], dtype=dtypes.int64)))
 
-    for dataset in [dynamic_padding_from_tensor_shapes,
-                    dynamic_padding_from_lists,
-                    dynamic_padding_from_lists_with_minus_one,
-                    dynamic_padding_from_tensors]:
+    for dataset in [
+        dynamic_padding_from_tensor_shapes, dynamic_padding_from_lists,
+        dynamic_padding_from_lists_with_minus_one, dynamic_padding_from_tensors
+    ]:
       self.assertEqual([None, None], dataset.output_shapes[0].as_list())
       self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
       self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
 
   def testPaddedBatchSparseError(self):
+
     def _map_fn(i):
       return sparse_tensor.SparseTensorValue(
           indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
@@ -363,5 +372,5 @@ class BatchDatasetTest(test.TestCase):
       _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py
index b2de2e5015..f079e56b10 100644
--- a/tensorflow/python/ops/histogram_ops.py
+++ b/tensorflow/python/ops/histogram_ops.py
@@ -74,7 +74,7 @@ def histogram_fixed_width_bins(values,
   ```
   """
   with ops.name_scope(name, 'histogram_fixed_width_bins',
-                      [values, value_range, nbins]) as scope:
+                      [values, value_range, nbins]):
     values = ops.convert_to_tensor(values, name='values')
     shape = array_ops.shape(values)
 
@@ -84,9 +84,10 @@ def histogram_fixed_width_bins(values,
     nbins_float = math_ops.cast(nbins, values.dtype)
 
     # Map tensor values that fall within value_range to [0, 1].
-    scaled_values = math_ops.truediv(values - value_range[0],
-                                     value_range[1] - value_range[0],
-                                     name='scaled_values')
+    scaled_values = math_ops.truediv(
+        values - value_range[0],
+        value_range[1] - value_range[0],
+        name='scaled_values')
 
     # map tensor values within the open interval value_range to {0,.., nbins-1},
     # values outside the open interval will be zero or less, or nbins or more.
@@ -138,5 +139,5 @@ def histogram_fixed_width(values,
   """
   with ops.name_scope(name, 'histogram_fixed_width',
                       [values, value_range, nbins]) as name:
-    return gen_math_ops._histogram_fixed_width(values, value_range, nbins,
-                                               dtype=dtype, name=name)
+    return gen_math_ops._histogram_fixed_width(  # pylint: disable=protected-access
+        values, value_range, nbins, dtype=dtype, name=name)
diff --git a/tensorflow/python/ops/histogram_ops_test.py b/tensorflow/python/ops/histogram_ops_test.py
index 80ee090575..a226ac81bb 100644
--- a/tensorflow/python/ops/histogram_ops_test.py
+++ b/tensorflow/python/ops/histogram_ops_test.py
@@ -36,7 +36,8 @@ class BinValuesFixedWidth(test.TestCase):
     values = []
     expected_bins = []
     with self.test_session():
-      bins = histogram_ops.histogram_fixed_width_bins(values, value_range, nbins=5)
+      bins = histogram_ops.histogram_fixed_width_bins(
+          values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
       self.assertAllClose(expected_bins, bins.eval())
 
@@ -69,8 +70,7 @@ class BinValuesFixedWidth(test.TestCase):
     #   (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
     value_range = [0.0, 5.0]
     values = constant_op.constant(
-      [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]],
-      shape=(2, 3))
+        [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]], shape=(2, 3))
     expected_bins = [[0, 0, 1], [2, 4, 4]]
     with self.test_session():
       bins = histogram_ops.histogram_fixed_width_bins(
@@ -140,8 +140,8 @@ class HistogramFixedWidthTest(test.TestCase):
       self.assertEqual(dtypes.int32, hist.dtype)
       self.assertAllClose(expected_bin_counts, hist.eval())
 
-      hist = histogram_ops.histogram_fixed_width(values, value_range,
-                                                 nbins=placeholder)
+      hist = histogram_ops.histogram_fixed_width(
+          values, value_range, nbins=placeholder)
       self.assertEquals(hist.shape.ndims, 1)
       self.assertIs(hist.shape[0].value, None)
       self.assertEqual(dtypes.int32, hist.dtype)
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index b713c44717..76da3bed31 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -12,15 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Implementation of image ops."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -28,7 +25,6 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_image_ops
 from tensorflow.python.ops import gen_nn_ops
@@ -38,7 +34,6 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.util.tf_export import tf_export
 
-
 ops.NotDifferentiable('RandomCrop')
 # TODO(b/31222613): This op may be differentiable, and there may be
 # latent bugs here.
@@ -110,8 +105,9 @@ def _ImageDimensions(image, rank):
   else:
     static_shape = image.get_shape().with_rank(rank).as_list()
     dynamic_shape = array_ops.unstack(array_ops.shape(image), rank)
-    return [s if s is not None else d
-            for s, d in zip(static_shape, dynamic_shape)]
+    return [
+        s if s is not None else d for s, d in zip(static_shape, dynamic_shape)
+    ]
 
 
 def _Check3DImage(image, require_static=True):
@@ -132,18 +128,19 @@ def _Check3DImage(image, require_static=True):
   try:
     image_shape = image.get_shape().with_rank(3)
   except ValueError:
-    raise ValueError("'image' (shape %s) must be three-dimensional." %
-                     image.shape)
+    raise ValueError(
+        "'image' (shape %s) must be three-dimensional." % image.shape)
   if require_static and not image_shape.is_fully_defined():
-    raise ValueError("'image' (shape %s) must be fully defined." %
-                     image_shape)
+    raise ValueError("'image' (shape %s) must be fully defined." % image_shape)
   if any(x == 0 for x in image_shape):
-    raise ValueError("all dims of 'image.shape' must be > 0: %s" %
-                     image_shape)
+    raise ValueError("all dims of 'image.shape' must be > 0: %s" % image_shape)
   if not image_shape.is_fully_defined():
-    return [check_ops.assert_positive(array_ops.shape(image),
-                                      ["all dims of 'image.shape' "
-                                       "must be > 0."])]
+    return [
+        check_ops.assert_positive(
+            array_ops.shape(image),
+            ["all dims of 'image.shape' "
+             'must be > 0.'])
+    ]
   else:
     return []
 
@@ -167,7 +164,7 @@ def _Assert3DImage(image):
       added that asserts the correct dynamic shape.
     """
   return control_flow_ops.with_dependencies(
-    _Check3DImage(image, require_static=False), image)
+      _Check3DImage(image, require_static=False), image)
 
 
 def _CheckAtLeast3DImage(image, require_static=True):
@@ -195,12 +192,15 @@ def _CheckAtLeast3DImage(image, require_static=True):
   if require_static and not image_shape.is_fully_defined():
     raise ValueError('\'image\' must be fully defined.')
   if any(x == 0 for x in image_shape):
-    raise ValueError('all dims of \'image.shape\' must be > 0: %s' %
-                     image_shape)
+    raise ValueError(
+        'all dims of \'image.shape\' must be > 0: %s' % image_shape)
   if not image_shape.is_fully_defined():
-    return [check_ops.assert_positive(array_ops.shape(image),
-                                      ["all dims of 'image.shape' "
-                                       "must be > 0."])]
+    return [
+        check_ops.assert_positive(
+            array_ops.shape(image),
+            ["all dims of 'image.shape' "
+             'must be > 0.'])
+    ]
   else:
     return []
 
@@ -248,10 +248,11 @@ def random_flip_up_down(image, seed=None):
     image = _Assert3DImage(image)
     uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
     mirror_cond = math_ops.less(uniform_random, .5)
-    result = control_flow_ops.cond(mirror_cond,
-                                   lambda: array_ops.reverse(image, [0]),
-                                   lambda: image,
-                                   name=scope)
+    result = control_flow_ops.cond(
+        mirror_cond,
+        lambda: array_ops.reverse(image, [0]),
+        lambda: image,
+        name=scope)
     return fix_image_flip_shape(image, result)
 
 
@@ -279,10 +280,11 @@ def random_flip_left_right(image, seed=None):
     image = _Assert3DImage(image)
     uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
     mirror_cond = math_ops.less(uniform_random, .5)
-    result = control_flow_ops.cond(mirror_cond,
-                                   lambda: array_ops.reverse(image, [1]),
-                                   lambda: image,
-                                   name=scope)
+    result = control_flow_ops.cond(
+        mirror_cond,
+        lambda: array_ops.reverse(image, [1]),
+        lambda: image,
+        name=scope)
     return fix_image_flip_shape(image, result)
 
 
@@ -307,8 +309,8 @@ def flip_left_right(image):
   with ops.name_scope(None, 'flip_left_right', [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
     image = _Assert3DImage(image)
-    return fix_image_flip_shape(image,
-                                array_ops.reverse(image, [1], name=scope))
+    return fix_image_flip_shape(image, array_ops.reverse(
+        image, [1], name=scope))
 
 
 @tf_export('image.flip_up_down')
@@ -332,8 +334,8 @@ def flip_up_down(image):
   with ops.name_scope(None, 'flip_up_down', [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
     image = _Assert3DImage(image)
-    return fix_image_flip_shape(image,
-                                array_ops.reverse(image, [0], name=scope))
+    return fix_image_flip_shape(image, array_ops.reverse(
+        image, [0], name=scope))
 
 
 @tf_export('image.rot90')
@@ -356,19 +358,19 @@ def rot90(image, k=1, name=None):
     k = math_ops.mod(k, 4)
 
     def _rot90():
-      return array_ops.transpose(array_ops.reverse_v2(image, [1]),
-                                 [1, 0, 2])
+      return array_ops.transpose(array_ops.reverse_v2(image, [1]), [1, 0, 2])
+
     def _rot180():
       return array_ops.reverse_v2(image, [0, 1])
+
     def _rot270():
-      return array_ops.reverse_v2(array_ops.transpose(image, [1, 0, 2]),
-                                  [1])
-    cases = [(math_ops.equal(k, 1), _rot90),
-             (math_ops.equal(k, 2), _rot180),
+      return array_ops.reverse_v2(array_ops.transpose(image, [1, 0, 2]), [1])
+
+    cases = [(math_ops.equal(k, 1), _rot90), (math_ops.equal(k, 2), _rot180),
              (math_ops.equal(k, 3), _rot270)]
 
-    ret = control_flow_ops.case(cases, default=lambda: image, exclusive=True,
-                                name=scope)
+    ret = control_flow_ops.case(
+        cases, default=lambda: image, exclusive=True, name=scope)
     ret.set_shape([None, None, image.get_shape()[2]])
     return ret
 
@@ -518,8 +520,10 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
         ]), [4, 2])
     padded = array_ops.pad(image, paddings)
 
-    padded_shape = [None if _is_tensor(i) else i
-                    for i in [batch, target_height, target_width, depth]]
+    padded_shape = [
+        None if _is_tensor(i) else i
+        for i in [batch, target_height, target_width, depth]
+    ]
     padded.set_shape(padded_shape)
 
     if not is_batch:
@@ -593,12 +597,13 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
     image = control_flow_ops.with_dependencies(assert_ops, image)
 
     cropped = array_ops.slice(
-        image,
-        array_ops.stack([0, offset_height, offset_width, 0]),
+        image, array_ops.stack([0, offset_height, offset_width, 0]),
         array_ops.stack([-1, target_height, target_width, -1]))
 
-    cropped_shape = [None if _is_tensor(i) else i
-                     for i in [batch, target_height, target_width, depth]]
+    cropped_shape = [
+        None if _is_tensor(i) else i
+        for i in [batch, target_height, target_width, depth]
+    ]
     cropped.set_shape(cropped_shape)
 
     if not is_batch:
@@ -663,8 +668,8 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
       target_height = control_flow_ops.with_dependencies(
           assert_ops, target_height)
     if _is_tensor(target_width):
-      target_width = control_flow_ops.with_dependencies(
-          assert_ops, target_width)
+      target_width = control_flow_ops.with_dependencies(assert_ops,
+                                                        target_width)
 
     def max_(x, y):
       if _is_tensor(x) or _is_tensor(y):
@@ -709,10 +714,12 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
     _, resized_height, resized_width, _ = _ImageDimensions(resized, rank=4)
 
     assert_ops = []
-    assert_ops += _assert(equal_(resized_height, target_height), ValueError,
-                          'resized height is not correct.')
-    assert_ops += _assert(equal_(resized_width, target_width), ValueError,
-                          'resized width is not correct.')
+    assert_ops += _assert(
+        equal_(resized_height, target_height), ValueError,
+        'resized height is not correct.')
+    assert_ops += _assert(
+        equal_(resized_width, target_width), ValueError,
+        'resized width is not correct.')
 
     resized = control_flow_ops.with_dependencies(assert_ops, resized)
 
@@ -813,22 +820,17 @@ def resize_images(images,
       return images
 
     if method == ResizeMethod.BILINEAR:
-      images = gen_image_ops.resize_bilinear(images,
-                                             size,
-                                             align_corners=align_corners)
+      images = gen_image_ops.resize_bilinear(
+          images, size, align_corners=align_corners)
     elif method == ResizeMethod.NEAREST_NEIGHBOR:
-      images = gen_image_ops.resize_nearest_neighbor(images,
-                                                     size,
-                                                     align_corners=
-                                                     align_corners)
+      images = gen_image_ops.resize_nearest_neighbor(
+          images, size, align_corners=align_corners)
     elif method == ResizeMethod.BICUBIC:
-      images = gen_image_ops.resize_bicubic(images,
-                                            size,
-                                            align_corners=align_corners)
+      images = gen_image_ops.resize_bicubic(
+          images, size, align_corners=align_corners)
     elif method == ResizeMethod.AREA:
-      images = gen_image_ops.resize_area(images,
-                                         size,
-                                         align_corners=align_corners)
+      images = gen_image_ops.resize_area(
+          images, size, align_corners=align_corners)
     else:
       raise ValueError('Resize method is not implemented.')
 
@@ -869,8 +871,9 @@ def per_image_standardization(image):
     image = math_ops.cast(image, dtype=dtypes.float32)
     image_mean = math_ops.reduce_mean(image)
 
-    variance = (math_ops.reduce_mean(math_ops.square(image)) -
-                math_ops.square(image_mean))
+    variance = (
+        math_ops.reduce_mean(math_ops.square(image)) -
+        math_ops.square(image_mean))
     variance = gen_nn_ops.relu(variance)
     stddev = math_ops.sqrt(variance)
 
@@ -971,9 +974,8 @@ def adjust_brightness(image, delta):
     orig_dtype = image.dtype
     flt_image = convert_image_dtype(image, dtypes.float32)
 
-    adjusted = math_ops.add(flt_image,
-                            math_ops.cast(delta, dtypes.float32),
-                            name=name)
+    adjusted = math_ops.add(
+        flt_image, math_ops.cast(delta, dtypes.float32), name=name)
 
     return convert_image_dtype(adjusted, orig_dtype, saturate=True)
 
@@ -1012,9 +1014,8 @@ def adjust_contrast(images, contrast_factor):
     flt_images = convert_image_dtype(images, dtypes.float32)
 
     # pylint: disable=protected-access
-    adjusted = gen_image_ops._adjust_contrastv2(flt_images,
-                                                contrast_factor=contrast_factor,
-                                                name=name)
+    adjusted = gen_image_ops._adjust_contrastv2(
+        flt_images, contrast_factor=contrast_factor, name=name)
     # pylint: enable=protected-access
 
     return convert_image_dtype(adjusted, orig_dtype, saturate=True)
@@ -1061,10 +1062,10 @@ def adjust_gamma(image, gamma=1, gain=1):
       gamma = control_flow_ops.with_dependencies(assert_op, gamma)
 
     # scale = max(dtype) - min(dtype).
-    scale = constant_op.constant(image.dtype.limits[1] - image.dtype.limits[0],
-                                 dtype=dtypes.float32)
+    scale = constant_op.constant(
+        image.dtype.limits[1] - image.dtype.limits[0], dtype=dtypes.float32)
     # According to the definition of gamma correction.
-    adjusted_img = (img / scale) ** gamma * scale * gain
+    adjusted_img = (img / scale)**gamma * scale * gain
 
     return adjusted_img
 
@@ -1195,9 +1196,8 @@ def grayscale_to_rgb(images, name=None):
   with ops.name_scope(name, 'grayscale_to_rgb', [images]) as name:
     images = ops.convert_to_tensor(images, name='images')
     rank_1 = array_ops.expand_dims(array_ops.rank(images) - 1, 0)
-    shape_list = (
-        [array_ops.ones(rank_1,
-                        dtype=dtypes.int32)] + [array_ops.expand_dims(3, 0)])
+    shape_list = ([array_ops.ones(rank_1, dtype=dtypes.int32)] +
+                  [array_ops.expand_dims(3, 0)])
     multiples = array_ops.concat(shape_list, 0)
     rgb = array_ops.tile(images, multiples, name=name)
     rgb.set_shape(images.get_shape()[:-1].concatenate([3]))
@@ -1393,8 +1393,7 @@ def decode_image(contents, channels=None, name=None):
       gif_channels = 0 if channels is None else channels
       good_channels = math_ops.logical_and(
           math_ops.not_equal(gif_channels, 1, name='check_gif_channels'),
-          math_ops.not_equal(gif_channels, 4, name='check_gif_channels')
-      )
+          math_ops.not_equal(gif_channels, 4, name='check_gif_channels'))
       channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images'
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
       with ops.control_dependencies([assert_channels]):
@@ -1417,8 +1416,8 @@ def decode_image(contents, channels=None, name=None):
     def _jpeg():
       """Decodes a jpeg image."""
       jpeg_channels = 0 if channels is None else channels
-      good_channels = math_ops.not_equal(jpeg_channels, 4,
-                                         name='check_jpeg_channels')
+      good_channels = math_ops.not_equal(
+          jpeg_channels, 4, name='check_jpeg_channels')
       channels_msg = ('Channels must be in (None, 0, 1, 3) when decoding JPEG '
                       'images')
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
@@ -1496,16 +1495,21 @@ def total_variation(images, name=None):
 
     # Calculate the total variation by taking the absolute value of the
     # pixel-differences and summing over the appropriate axis.
-    tot_var = (math_ops.reduce_sum(math_ops.abs(pixel_dif1), axis=sum_axis) +
-               math_ops.reduce_sum(math_ops.abs(pixel_dif2), axis=sum_axis))
+    tot_var = (
+        math_ops.reduce_sum(math_ops.abs(pixel_dif1), axis=sum_axis) +
+        math_ops.reduce_sum(math_ops.abs(pixel_dif2), axis=sum_axis))
 
   return tot_var
 
 
 @tf_export('image.sample_distorted_bounding_box')
-def sample_distorted_bounding_box(image_size, bounding_boxes, seed=None,
-                                  seed2=None, min_object_covered=None,
-                                  aspect_ratio_range=None, area_range=None,
+def sample_distorted_bounding_box(image_size,
+                                  bounding_boxes,
+                                  seed=None,
+                                  seed2=None,
+                                  min_object_covered=None,
+                                  aspect_ratio_range=None,
+                                  area_range=None,
                                   max_attempts=None,
                                   use_image_if_no_bounding_boxes=None,
                                   name=None):
@@ -1521,10 +1525,12 @@ def sample_distorted_bounding_box(image_size, bounding_boxes, seed=None,
   The output of this Op is a single bounding box that may be used to crop the
   original image. The output is returned as 3 tensors: `begin`, `size` and
   `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-  image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+  image. The latter may be supplied to `tf.image.draw_bounding_boxes` to
+  visualize
   what the bounding box looks like.
 
-  Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+  Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`.
+  The
   bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
   height of the underlying image.
 
@@ -1552,23 +1558,27 @@ def sample_distorted_bounding_box(image_size, bounding_boxes, seed=None,
   false and no bounding boxes are supplied, an error is raised.
 
   Args:
-    image_size: A `Tensor`. Must be one of the following types: `uint8`, `int8`, `int16`, `int32`, `int64`.
+    image_size: A `Tensor`. Must be one of the following types: `uint8`, `int8`,
+      `int16`, `int32`, `int64`.
       1-D, containing `[height, width, channels]`.
     bounding_boxes: A `Tensor` of type `float32`.
       3-D with shape `[batch, N, 4]` describing the N bounding boxes
       associated with the image.
     seed: An optional `int`. Defaults to `0`.
       If either `seed` or `seed2` are set to non-zero, the random number
-      generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+      generator is seeded by the given `seed`.  Otherwise, it is seeded by a
+        random
       seed.
     seed2: An optional `int`. Defaults to `0`.
       A second seed to avoid seed collision.
     min_object_covered: A Tensor of type `float32`. Defaults to `0.1`.
       The cropped area of the image must contain at least this
-      fraction of any bounding box supplied. The value of this parameter should be
+      fraction of any bounding box supplied. The value of this parameter should
+        be
       non-negative. In the case of 0, the cropped area does not need to overlap
       any of the bounding boxes supplied.
-    aspect_ratio_range: An optional list of `floats`. Defaults to `[0.75, 1.33]`.
+    aspect_ratio_range: An optional list of `floats`. Defaults to `[0.75,
+      1.33]`.
       The cropped area of the image must have an aspect ratio =
       width / height within this range.
     area_range: An optional list of `floats`. Defaults to `[0.05, 1]`.
@@ -1576,32 +1586,41 @@ def sample_distorted_bounding_box(image_size, bounding_boxes, seed=None,
       supplied image within in this range.
     max_attempts: An optional `int`. Defaults to `100`.
       Number of attempts at generating a cropped region of the image
-      of the specified constraints. After `max_attempts` failures, return the entire
+      of the specified constraints. After `max_attempts` failures, return the
+        entire
       image.
     use_image_if_no_bounding_boxes: An optional `bool`. Defaults to `False`.
       Controls behavior if no bounding boxes supplied.
-      If true, assume an implicit bounding box covering the whole input. If false,
+      If true, assume an implicit bounding box covering the whole input. If
+        false,
       raise an error.
     name: A name for the operation (optional).
 
   Returns:
     A tuple of `Tensor` objects (begin, size, bboxes).
 
-    begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+    begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing
+    `[offset_height, offset_width, 0]`. Provide as input to
       `tf.slice`.
-    size: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[target_height, target_width, -1]`. Provide as input to
+    size: A `Tensor`. Has the same type as `image_size`. 1-D, containing
+    `[target_height, target_width, -1]`. Provide as input to
       `tf.slice`.
-    bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+    bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing
+    the distorted bounding box.
       Provide as input to `tf.image.draw_bounding_boxes`.
   """
   with ops.name_scope(name, 'sample_distorted_bounding_box'):
-    return gen_image_ops._sample_distorted_bounding_box_v2(image_size,
-                bounding_boxes, seed=seed,
-                seed2=seed2, min_object_covered=min_object_covered,
-                aspect_ratio_range=aspect_ratio_range, area_range=area_range,
-                max_attempts=max_attempts,
-                use_image_if_no_bounding_boxes=use_image_if_no_bounding_boxes,
-                name=name)
+    return gen_image_ops._sample_distorted_bounding_box_v2(  # pylint: disable=protected-access
+        image_size,
+        bounding_boxes,
+        seed=seed,
+        seed2=seed2,
+        min_object_covered=min_object_covered,
+        aspect_ratio_range=aspect_ratio_range,
+        area_range=area_range,
+        max_attempts=max_attempts,
+        use_image_if_no_bounding_boxes=use_image_if_no_bounding_boxes,
+        name=name)
 
 
 @tf_export('image.non_max_suppression')
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 2d77e26081..7776ff08c4 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -100,27 +100,29 @@ def _remove_squeezable_dimensions(predictions, labels, weights):
     # Use dynamic rank.
     weights_rank_tensor = array_ops.rank(weights)
     rank_diff = weights_rank_tensor - array_ops.rank(predictions)
+
     def _maybe_expand_weights():
       return control_flow_ops.cond(
           math_ops.equal(rank_diff, -1),
-          lambda: array_ops.expand_dims(weights, [-1]),
-          lambda: weights)
+          lambda: array_ops.expand_dims(weights, [-1]), lambda: weights)
+
     # Don't attempt squeeze if it will fail based on static check.
     if ((weights_rank is not None) and
         (not weights_shape.dims[-1].is_compatible_with(1))):
       maybe_squeeze_weights = lambda: weights
     else:
       maybe_squeeze_weights = lambda: array_ops.squeeze(weights, [-1])
+
     def _maybe_adjust_weights():
       return control_flow_ops.cond(
-          math_ops.equal(rank_diff, 1),
-          maybe_squeeze_weights,
+          math_ops.equal(rank_diff, 1), maybe_squeeze_weights,
           _maybe_expand_weights)
+
     # If weights are scalar, do nothing. Otherwise, try to add or remove a
     # dimension to match predictions.
     weights = control_flow_ops.cond(
-        math_ops.equal(weights_rank_tensor, 0),
-        lambda: weights, _maybe_adjust_weights)
+        math_ops.equal(weights_rank_tensor, 0), lambda: weights,
+        _maybe_adjust_weights)
   return predictions, labels, weights
 
 
@@ -165,14 +167,14 @@ def _maybe_expand_labels(labels, predictions):
         if predictions_rank == labels_rank + 1:
           return array_ops.expand_dims(labels, -1, name=scope)
         raise ValueError(
-            'Unexpected labels shape %s for predictions shape %s.' % (
-                labels.get_shape(), predictions.get_shape()))
+            'Unexpected labels shape %s for predictions shape %s.' %
+            (labels.get_shape(), predictions.get_shape()))
 
     # Otherwise, use dynamic shape.
     return control_flow_ops.cond(
-        math_ops.equal(array_ops.rank(predictions), array_ops.rank(labels) + 1),
-        lambda: array_ops.expand_dims(labels, -1, name=scope),
-        lambda: labels)
+        math_ops.equal(array_ops.rank(predictions),
+                       array_ops.rank(labels) + 1),
+        lambda: array_ops.expand_dims(labels, -1, name=scope), lambda: labels)
 
 
 def _safe_div(numerator, denominator, name):
@@ -264,8 +266,11 @@ def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None):
 
 
 @tf_export('metrics.mean')
-def mean(values, weights=None, metrics_collections=None,
-         updates_collections=None, name=None):
+def mean(values,
+         weights=None,
+         metrics_collections=None,
+         updates_collections=None,
+         name=None):
   """Computes the (weighted) mean of the given values.
 
   The `mean` function creates two local variables, `total` and `count`
@@ -340,8 +345,12 @@ def mean(values, weights=None, metrics_collections=None,
 
 
 @tf_export('metrics.accuracy')
-def accuracy(labels, predictions, weights=None, metrics_collections=None,
-             updates_collections=None, name=None):
+def accuracy(labels,
+             predictions,
+             weights=None,
+             metrics_collections=None,
+             updates_collections=None,
+             name=None):
   """Calculates how often `predictions` matches `labels`.
 
   The `accuracy` function creates two local variables, `total` and
@@ -395,12 +404,15 @@ def accuracy(labels, predictions, weights=None, metrics_collections=None,
   if labels.dtype != predictions.dtype:
     predictions = math_ops.cast(predictions, labels.dtype)
   is_correct = math_ops.to_float(math_ops.equal(predictions, labels))
-  return mean(is_correct, weights, metrics_collections,
-              updates_collections, name or 'accuracy')
+  return mean(is_correct, weights, metrics_collections, updates_collections,
+              name or 'accuracy')
 
 
-def _confusion_matrix_at_thresholds(
-    labels, predictions, thresholds, weights=None, includes=None):
+def _confusion_matrix_at_thresholds(labels,
+                                    predictions,
+                                    thresholds,
+                                    weights=None,
+                                    includes=None):
   """Computes true_positives, false_negatives, true_negatives, false_positives.
 
   This function creates up to four local variables, `true_positives`,
@@ -498,8 +510,8 @@ def _confusion_matrix_at_thresholds(
   if weights is not None:
     weights = weights_broadcast_ops.broadcast_weights(
         math_ops.to_float(weights), predictions)
-    weights_tiled = array_ops.tile(array_ops.reshape(
-        weights, [1, -1]), [num_thresholds, 1])
+    weights_tiled = array_ops.tile(
+        array_ops.reshape(weights, [1, -1]), [num_thresholds, 1])
     thresh_tiled.get_shape().assert_is_compatible_with(
         weights_tiled.get_shape())
   else:
@@ -515,8 +527,9 @@ def _confusion_matrix_at_thresholds(
         math_ops.logical_and(label_is_pos, pred_is_pos))
     if weights_tiled is not None:
       is_true_positive *= weights_tiled
-    update_ops['tp'] = state_ops.assign_add(
-        true_p, math_ops.reduce_sum(is_true_positive, 1))
+    update_ops['tp'] = state_ops.assign_add(true_p,
+                                            math_ops.reduce_sum(
+                                                is_true_positive, 1))
     values['tp'] = true_p
 
   if 'fn' in includes:
@@ -526,8 +539,9 @@ def _confusion_matrix_at_thresholds(
         math_ops.logical_and(label_is_pos, pred_is_neg))
     if weights_tiled is not None:
       is_false_negative *= weights_tiled
-    update_ops['fn'] = state_ops.assign_add(
-        false_n, math_ops.reduce_sum(is_false_negative, 1))
+    update_ops['fn'] = state_ops.assign_add(false_n,
+                                            math_ops.reduce_sum(
+                                                is_false_negative, 1))
     values['fn'] = false_n
 
   if 'tn' in includes:
@@ -537,8 +551,9 @@ def _confusion_matrix_at_thresholds(
         math_ops.logical_and(label_is_neg, pred_is_neg))
     if weights_tiled is not None:
       is_true_negative *= weights_tiled
-    update_ops['tn'] = state_ops.assign_add(
-        true_n, math_ops.reduce_sum(is_true_negative, 1))
+    update_ops['tn'] = state_ops.assign_add(true_n,
+                                            math_ops.reduce_sum(
+                                                is_true_negative, 1))
     values['tn'] = true_n
 
   if 'fp' in includes:
@@ -548,17 +563,24 @@ def _confusion_matrix_at_thresholds(
         math_ops.logical_and(label_is_neg, pred_is_pos))
     if weights_tiled is not None:
       is_false_positive *= weights_tiled
-    update_ops['fp'] = state_ops.assign_add(
-        false_p, math_ops.reduce_sum(is_false_positive, 1))
+    update_ops['fp'] = state_ops.assign_add(false_p,
+                                            math_ops.reduce_sum(
+                                                is_false_positive, 1))
     values['fp'] = false_p
 
   return values, update_ops
 
 
 @tf_export('metrics.auc')
-def auc(labels, predictions, weights=None, num_thresholds=200,
-        metrics_collections=None, updates_collections=None,
-        curve='ROC', name=None, summation_method='trapezoidal'):
+def auc(labels,
+        predictions,
+        weights=None,
+        num_thresholds=200,
+        metrics_collections=None,
+        updates_collections=None,
+        curve='ROC',
+        name=None,
+        summation_method='trapezoidal'):
   """Computes the approximate AUC via a Riemann sum.
 
   The `auc` function creates four local variables, `true_positives`,
@@ -626,14 +648,14 @@ def auc(labels, predictions, weights=None, num_thresholds=200,
     raise RuntimeError('tf.metrics.auc is not supported when eager execution '
                        'is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'auc', (labels, predictions, weights)):
+  with variable_scope.variable_scope(name, 'auc',
+                                     (labels, predictions, weights)):
     if curve != 'ROC' and curve != 'PR':
-      raise ValueError('curve must be either ROC or PR, %s unknown' %
-                       (curve))
+      raise ValueError('curve must be either ROC or PR, %s unknown' % (curve))
     kepsilon = 1e-7  # to account for floating point imprecisions
-    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                  for i in range(num_thresholds-2)]
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
     thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
 
     values, update_ops = _confusion_matrix_at_thresholds(
@@ -641,6 +663,7 @@ def auc(labels, predictions, weights=None, num_thresholds=200,
 
     # Add epsilons to avoid dividing by 0.
     epsilon = 1.0e-6
+
     def compute_auc(tp, fn, tn, fp, name):
       """Computes the roc-auc or pr-auc based on confusion counts."""
       rec = math_ops.div(tp + epsilon, tp + fn + epsilon)
@@ -671,11 +694,10 @@ def auc(labels, predictions, weights=None, num_thresholds=200,
         raise ValueError('Invalid summation_method: %s' % summation_method)
 
     # sum up the areas of all the trapeziums
-    auc_value = compute_auc(
-        values['tp'], values['fn'], values['tn'], values['fp'], 'value')
-    update_op = compute_auc(
-        update_ops['tp'], update_ops['fn'], update_ops['tn'], update_ops['fp'],
-        'update_op')
+    auc_value = compute_auc(values['tp'], values['fn'], values['tn'],
+                            values['fp'], 'value')
+    update_op = compute_auc(update_ops['tp'], update_ops['fn'],
+                            update_ops['tn'], update_ops['fp'], 'update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, auc_value)
@@ -687,7 +709,9 @@ def auc(labels, predictions, weights=None, num_thresholds=200,
 
 
 @tf_export('metrics.mean_absolute_error')
-def mean_absolute_error(labels, predictions, weights=None,
+def mean_absolute_error(labels,
+                        predictions,
+                        weights=None,
                         metrics_collections=None,
                         updates_collections=None,
                         name=None):
@@ -746,7 +770,10 @@ def mean_absolute_error(labels, predictions, weights=None,
 
 
 @tf_export('metrics.mean_cosine_distance')
-def mean_cosine_distance(labels, predictions, dim, weights=None,
+def mean_cosine_distance(labels,
+                         predictions,
+                         dim,
+                         weights=None,
                          metrics_collections=None,
                          updates_collections=None,
                          name=None):
@@ -802,10 +829,8 @@ def mean_cosine_distance(labels, predictions, dim, weights=None,
       radial_diffs, reduction_indices=[
           dim,
       ], keepdims=True)
-  mean_distance, update_op = mean(radial_diffs, weights,
-                                  None,
-                                  None,
-                                  name or 'mean_cosine_distance')
+  mean_distance, update_op = mean(radial_diffs, weights, None, None, name or
+                                  'mean_cosine_distance')
   mean_distance = math_ops.subtract(1.0, mean_distance)
   update_op = math_ops.subtract(1.0, update_op)
 
@@ -906,8 +931,8 @@ def mean_per_class_accuracy(labels,
 
     per_class_accuracy = _safe_div(count, total, None)
 
-    mean_accuracy_v = math_ops.reduce_mean(per_class_accuracy,
-                                           name='mean_accuracy')
+    mean_accuracy_v = math_ops.reduce_mean(
+        per_class_accuracy, name='mean_accuracy')
     update_op = _safe_div(update_count_op, update_total_op, name='update_op')
 
     if metrics_collections:
@@ -975,13 +1000,14 @@ def mean_iou(labels,
     raise RuntimeError('tf.metrics.mean_iou is not supported when '
                        'eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'mean_iou', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'mean_iou',
+                                     (predictions, labels, weights)):
     # Check if shape is compatible.
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
     total_cm, update_op = _streaming_confusion_matrix(labels, predictions,
                                                       num_classes, weights)
+
     def compute_mean_iou(name):
       """Compute the mean intersection-over-union via the confusion matrix."""
       sum_over_row = math_ops.to_float(math_ops.reduce_sum(total_cm, 0))
@@ -992,22 +1018,21 @@ def mean_iou(labels,
       # The mean is only computed over classes that appear in the
       # label or prediction tensor. If the denominator is 0, we need to
       # ignore the class.
-      num_valid_entries = math_ops.reduce_sum(math_ops.cast(
-          math_ops.not_equal(denominator, 0), dtype=dtypes.float32))
+      num_valid_entries = math_ops.reduce_sum(
+          math_ops.cast(
+              math_ops.not_equal(denominator, 0), dtype=dtypes.float32))
 
       # If the value of the denominator is 0, set it to 1 to avoid
       # zero division.
       denominator = array_ops.where(
-          math_ops.greater(denominator, 0),
-          denominator,
+          math_ops.greater(denominator, 0), denominator,
           array_ops.ones_like(denominator))
       iou = math_ops.div(cm_diag, denominator)
 
       # If the number of valid entries is 0 (no classes) we return 0.
       result = array_ops.where(
           math_ops.greater(num_valid_entries, 0),
-          math_ops.reduce_sum(iou, name=name) / num_valid_entries,
-          0)
+          math_ops.reduce_sum(iou, name=name) / num_valid_entries, 0)
       return result
 
     mean_iou_v = compute_mean_iou('mean_iou')
@@ -1022,7 +1047,10 @@ def mean_iou(labels,
 
 
 @tf_export('metrics.mean_relative_error')
-def mean_relative_error(labels, predictions, normalizer, weights=None,
+def mean_relative_error(labels,
+                        predictions,
+                        normalizer,
+                        weights=None,
                         metrics_collections=None,
                         updates_collections=None,
                         name=None):
@@ -1081,15 +1109,16 @@ def mean_relative_error(labels, predictions, normalizer, weights=None,
       predictions, normalizer)
   predictions.get_shape().assert_is_compatible_with(normalizer.get_shape())
   relative_errors = array_ops.where(
-      math_ops.equal(normalizer, 0.0),
-      array_ops.zeros_like(labels),
+      math_ops.equal(normalizer, 0.0), array_ops.zeros_like(labels),
       math_ops.div(math_ops.abs(labels - predictions), normalizer))
   return mean(relative_errors, weights, metrics_collections,
               updates_collections, name or 'mean_relative_error')
 
 
 @tf_export('metrics.mean_squared_error')
-def mean_squared_error(labels, predictions, weights=None,
+def mean_squared_error(labels,
+                       predictions,
+                       weights=None,
                        metrics_collections=None,
                        updates_collections=None,
                        name=None):
@@ -1143,13 +1172,16 @@ def mean_squared_error(labels, predictions, weights=None,
   predictions, labels, weights = _remove_squeezable_dimensions(
       predictions=predictions, labels=labels, weights=weights)
   squared_error = math_ops.square(labels - predictions)
-  return mean(squared_error, weights, metrics_collections,
-              updates_collections, name or 'mean_squared_error')
+  return mean(squared_error, weights, metrics_collections, updates_collections,
+              name or 'mean_squared_error')
 
 
 @tf_export('metrics.mean_tensor')
-def mean_tensor(values, weights=None, metrics_collections=None,
-                updates_collections=None, name=None):
+def mean_tensor(values,
+                weights=None,
+                metrics_collections=None,
+                updates_collections=None,
+                name=None):
   """Computes the element-wise (weighted) mean of the given tensors.
 
   In contrast to the `mean` function which returns a scalar with the
@@ -1216,9 +1248,8 @@ def mean_tensor(values, weights=None, metrics_collections=None,
       update_count_op = state_ops.assign_add(count, num_values)
 
     def compute_mean(total, count, name):
-      non_zero_count = math_ops.maximum(count,
-                                        array_ops.ones_like(count),
-                                        name=name)
+      non_zero_count = math_ops.maximum(
+          count, array_ops.ones_like(count), name=name)
       return math_ops.truediv(total, non_zero_count, name=name)
 
     mean_t = compute_mean(total, count, 'value')
@@ -1234,7 +1265,9 @@ def mean_tensor(values, weights=None, metrics_collections=None,
 
 
 @tf_export('metrics.percentage_below')
-def percentage_below(values, threshold, weights=None,
+def percentage_below(values,
+                     threshold,
+                     weights=None,
                      metrics_collections=None,
                      updates_collections=None,
                      name=None):
@@ -1281,14 +1314,13 @@ def percentage_below(values, threshold, weights=None,
                        'eager execution is enabled.')
 
   is_below_threshold = math_ops.to_float(math_ops.less(values, threshold))
-  return mean(is_below_threshold,
-              weights,
-              metrics_collections,
-              updates_collections,
-              name or 'percentage_below_threshold')
+  return mean(is_below_threshold, weights, metrics_collections,
+              updates_collections, name or 'percentage_below_threshold')
 
 
-def _count_condition(values, weights=None, metrics_collections=None,
+def _count_condition(values,
+                     weights=None,
+                     metrics_collections=None,
                      updates_collections=None):
   """Sums the weights of cases where the given values are True.
 
@@ -1318,8 +1350,8 @@ def _count_condition(values, weights=None, metrics_collections=None,
 
   values = math_ops.to_float(values)
   if weights is not None:
-    with ops.control_dependencies((
-        check_ops.assert_rank_in(weights, (0, array_ops.rank(values))),)):
+    with ops.control_dependencies((check_ops.assert_rank_in(
+        weights, (0, array_ops.rank(values))),)):
       weights = math_ops.to_float(weights)
       values = math_ops.multiply(values, weights)
 
@@ -1336,7 +1368,9 @@ def _count_condition(values, weights=None, metrics_collections=None,
 
 
 @tf_export('metrics.false_negatives')
-def false_negatives(labels, predictions, weights=None,
+def false_negatives(labels,
+                    predictions,
+                    weights=None,
                     metrics_collections=None,
                     updates_collections=None,
                     name=None):
@@ -1372,21 +1406,24 @@ def false_negatives(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.false_negatives is not supported when '
                        'eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'false_negatives', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'false_negatives',
+                                     (predictions, labels, weights)):
 
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
         labels=math_ops.cast(labels, dtype=dtypes.bool),
         weights=weights)
-    is_false_negative = math_ops.logical_and(math_ops.equal(labels, True),
-                                             math_ops.equal(predictions, False))
+    is_false_negative = math_ops.logical_and(
+        math_ops.equal(labels, True), math_ops.equal(predictions, False))
     return _count_condition(is_false_negative, weights, metrics_collections,
                             updates_collections)
 
 
 @tf_export('metrics.false_negatives_at_thresholds')
-def false_negatives_at_thresholds(labels, predictions, thresholds, weights=None,
+def false_negatives_at_thresholds(labels,
+                                  predictions,
+                                  thresholds,
+                                  weights=None,
                                   metrics_collections=None,
                                   updates_collections=None,
                                   name=None):
@@ -1440,7 +1477,9 @@ def false_negatives_at_thresholds(labels, predictions, thresholds, weights=None,
 
 
 @tf_export('metrics.false_positives')
-def false_positives(labels, predictions, weights=None,
+def false_positives(labels,
+                    predictions,
+                    weights=None,
                     metrics_collections=None,
                     updates_collections=None,
                     name=None):
@@ -1477,21 +1516,24 @@ def false_positives(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.false_positives is not supported when '
                        'eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'false_positives', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'false_positives',
+                                     (predictions, labels, weights)):
 
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
         labels=math_ops.cast(labels, dtype=dtypes.bool),
         weights=weights)
-    is_false_positive = math_ops.logical_and(math_ops.equal(labels, False),
-                                             math_ops.equal(predictions, True))
+    is_false_positive = math_ops.logical_and(
+        math_ops.equal(labels, False), math_ops.equal(predictions, True))
     return _count_condition(is_false_positive, weights, metrics_collections,
                             updates_collections)
 
 
 @tf_export('metrics.false_positives_at_thresholds')
-def false_positives_at_thresholds(labels, predictions, thresholds, weights=None,
+def false_positives_at_thresholds(labels,
+                                  predictions,
+                                  thresholds,
+                                  weights=None,
                                   metrics_collections=None,
                                   updates_collections=None,
                                   name=None):
@@ -1545,7 +1587,9 @@ def false_positives_at_thresholds(labels, predictions, thresholds, weights=None,
 
 
 @tf_export('metrics.true_negatives')
-def true_negatives(labels, predictions, weights=None,
+def true_negatives(labels,
+                   predictions,
+                   weights=None,
                    metrics_collections=None,
                    updates_collections=None,
                    name=None):
@@ -1582,21 +1626,24 @@ def true_negatives(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.true_negatives is not '
                        'supported when eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'true_negatives', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'true_negatives',
+                                     (predictions, labels, weights)):
 
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
         labels=math_ops.cast(labels, dtype=dtypes.bool),
         weights=weights)
-    is_true_negative = math_ops.logical_and(math_ops.equal(labels, False),
-                                            math_ops.equal(predictions, False))
+    is_true_negative = math_ops.logical_and(
+        math_ops.equal(labels, False), math_ops.equal(predictions, False))
     return _count_condition(is_true_negative, weights, metrics_collections,
                             updates_collections)
 
 
 @tf_export('metrics.true_negatives_at_thresholds')
-def true_negatives_at_thresholds(labels, predictions, thresholds, weights=None,
+def true_negatives_at_thresholds(labels,
+                                 predictions,
+                                 thresholds,
+                                 weights=None,
                                  metrics_collections=None,
                                  updates_collections=None,
                                  name=None):
@@ -1650,7 +1697,9 @@ def true_negatives_at_thresholds(labels, predictions, thresholds, weights=None,
 
 
 @tf_export('metrics.true_positives')
-def true_positives(labels, predictions, weights=None,
+def true_positives(labels,
+                   predictions,
+                   weights=None,
                    metrics_collections=None,
                    updates_collections=None,
                    name=None):
@@ -1687,21 +1736,24 @@ def true_positives(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.true_positives is not '
                        'supported when eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'true_positives', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'true_positives',
+                                     (predictions, labels, weights)):
 
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
         labels=math_ops.cast(labels, dtype=dtypes.bool),
         weights=weights)
-    is_true_positive = math_ops.logical_and(math_ops.equal(labels, True),
-                                            math_ops.equal(predictions, True))
+    is_true_positive = math_ops.logical_and(
+        math_ops.equal(labels, True), math_ops.equal(predictions, True))
     return _count_condition(is_true_positive, weights, metrics_collections,
                             updates_collections)
 
 
 @tf_export('metrics.true_positives_at_thresholds')
-def true_positives_at_thresholds(labels, predictions, thresholds, weights=None,
+def true_positives_at_thresholds(labels,
+                                 predictions,
+                                 thresholds,
+                                 weights=None,
                                  metrics_collections=None,
                                  updates_collections=None,
                                  name=None):
@@ -1755,8 +1807,11 @@ def true_positives_at_thresholds(labels, predictions, thresholds, weights=None,
 
 
 @tf_export('metrics.precision')
-def precision(labels, predictions, weights=None,
-              metrics_collections=None, updates_collections=None,
+def precision(labels,
+              predictions,
+              weights=None,
+              metrics_collections=None,
+              updates_collections=None,
               name=None):
   """Computes the precision of the predictions with respect to the labels.
 
@@ -1805,8 +1860,8 @@ def precision(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.precision is not '
                        'supported when eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'precision', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'precision',
+                                     (predictions, labels, weights)):
 
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
@@ -1814,22 +1869,27 @@ def precision(labels, predictions, weights=None,
         weights=weights)
 
     true_p, true_positives_update_op = true_positives(
-        labels, predictions, weights, metrics_collections=None,
-        updates_collections=None, name=None)
+        labels,
+        predictions,
+        weights,
+        metrics_collections=None,
+        updates_collections=None,
+        name=None)
     false_p, false_positives_update_op = false_positives(
-        labels, predictions, weights, metrics_collections=None,
-        updates_collections=None, name=None)
+        labels,
+        predictions,
+        weights,
+        metrics_collections=None,
+        updates_collections=None,
+        name=None)
 
     def compute_precision(tp, fp, name):
       return array_ops.where(
-          math_ops.greater(tp + fp, 0),
-          math_ops.div(tp, tp + fp),
-          0,
-          name)
+          math_ops.greater(tp + fp, 0), math_ops.div(tp, tp + fp), 0, name)
 
     p = compute_precision(true_p, false_p, 'value')
-    update_op = compute_precision(
-        true_positives_update_op, false_positives_update_op, 'update_op')
+    update_op = compute_precision(true_positives_update_op,
+                                  false_positives_update_op, 'update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, p)
@@ -1841,10 +1901,13 @@ def precision(labels, predictions, weights=None,
 
 
 @tf_export('metrics.precision_at_thresholds')
-def precision_at_thresholds(labels, predictions, thresholds,
+def precision_at_thresholds(labels,
+                            predictions,
+                            thresholds,
                             weights=None,
                             metrics_collections=None,
-                            updates_collections=None, name=None):
+                            updates_collections=None,
+                            name=None):
   """Computes precision values for different `thresholds` on `predictions`.
 
   The `precision_at_thresholds` function creates four local variables,
@@ -1900,12 +1963,13 @@ def precision_at_thresholds(labels, predictions, thresholds,
 
     # Avoid division by zero.
     epsilon = 1e-7
+
     def compute_precision(tp, fp, name):
       return math_ops.div(tp, epsilon + tp + fp, name='precision_' + name)
 
     prec = compute_precision(values['tp'], values['fp'], 'value')
-    update_op = compute_precision(
-        update_ops['tp'], update_ops['fp'], 'update_op')
+    update_op = compute_precision(update_ops['tp'], update_ops['fp'],
+                                  'update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, prec)
@@ -1917,8 +1981,11 @@ def precision_at_thresholds(labels, predictions, thresholds,
 
 
 @tf_export('metrics.recall')
-def recall(labels, predictions, weights=None,
-           metrics_collections=None, updates_collections=None,
+def recall(labels,
+           predictions,
+           weights=None,
+           metrics_collections=None,
+           updates_collections=None,
            name=None):
   """Computes the recall of the predictions with respect to the labels.
 
@@ -1965,30 +2032,36 @@ def recall(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.recall is not supported is not '
                        'supported when eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'recall', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'recall',
+                                     (predictions, labels, weights)):
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
         labels=math_ops.cast(labels, dtype=dtypes.bool),
         weights=weights)
 
     true_p, true_positives_update_op = true_positives(
-        labels, predictions, weights, metrics_collections=None,
-        updates_collections=None, name=None)
+        labels,
+        predictions,
+        weights,
+        metrics_collections=None,
+        updates_collections=None,
+        name=None)
     false_n, false_negatives_update_op = false_negatives(
-        labels, predictions, weights, metrics_collections=None,
-        updates_collections=None, name=None)
+        labels,
+        predictions,
+        weights,
+        metrics_collections=None,
+        updates_collections=None,
+        name=None)
 
     def compute_recall(true_p, false_n, name):
       return array_ops.where(
           math_ops.greater(true_p + false_n, 0),
-          math_ops.div(true_p, true_p + false_n),
-          0,
-          name)
+          math_ops.div(true_p, true_p + false_n), 0, name)
 
     rec = compute_recall(true_p, false_n, 'value')
-    update_op = compute_recall(
-        true_positives_update_op, false_negatives_update_op, 'update_op')
+    update_op = compute_recall(true_positives_update_op,
+                               false_negatives_update_op, 'update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, rec)
@@ -2022,8 +2095,8 @@ def _select_class_id(ids, selected_id):
   """
   ids = sparse_tensor.convert_to_tensor_or_sparse_tensor(ids)
   if isinstance(ids, sparse_tensor.SparseTensor):
-    return sparse_ops.sparse_retain(
-        ids, math_ops.equal(ids.values, selected_id))
+    return sparse_ops.sparse_retain(ids, math_ops.equal(ids.values,
+                                                        selected_id))
 
   # TODO(ptucker): Make this more efficient, maybe add a sparse version of
   # tf.equal and tf.reduce_any?
@@ -2031,12 +2104,13 @@ def _select_class_id(ids, selected_id):
   # Shape of filled IDs is the same as `ids` with the last dim collapsed to 1.
   ids_shape = array_ops.shape(ids, out_type=dtypes.int64)
   ids_last_dim = array_ops.size(ids_shape) - 1
-  filled_selected_id_shape = math_ops.reduced_shape(
-      ids_shape, array_ops.reshape(ids_last_dim, [1]))
+  filled_selected_id_shape = math_ops.reduced_shape(ids_shape,
+                                                    array_ops.reshape(
+                                                        ids_last_dim, [1]))
 
   # Intersect `ids` with the selected ID.
-  filled_selected_id = array_ops.fill(
-      filled_selected_id_shape, math_ops.to_int64(selected_id))
+  filled_selected_id = array_ops.fill(filled_selected_id_shape,
+                                      math_ops.to_int64(selected_id))
   result = sets.set_intersection(filled_selected_id, ids)
   return sparse_tensor.SparseTensor(
       indices=result.indices, values=result.values, dense_shape=ids_shape)
@@ -2096,15 +2170,15 @@ def _sparse_true_positive_at_k(labels,
   Returns:
     A [D1, ... DN] `Tensor` of true positive counts.
   """
-  with ops.name_scope(
-      name, 'true_positives', (predictions_idx, labels, weights)):
-    labels, predictions_idx = _maybe_select_class_id(
-        labels, predictions_idx, class_id)
+  with ops.name_scope(name, 'true_positives',
+                      (predictions_idx, labels, weights)):
+    labels, predictions_idx = _maybe_select_class_id(labels, predictions_idx,
+                                                     class_id)
     tp = sets.set_size(sets.set_intersection(predictions_idx, labels))
     tp = math_ops.to_double(tp)
     if weights is not None:
-      with ops.control_dependencies((
-          weights_broadcast_ops.assert_broadcastable(weights, tp),)):
+      with ops.control_dependencies((weights_broadcast_ops.assert_broadcastable(
+          weights, tp),)):
         weights = math_ops.to_double(weights)
         tp = math_ops.multiply(tp, weights)
     return tp
@@ -2148,11 +2222,12 @@ def _streaming_sparse_true_positive_at_k(labels,
   Raises:
     ValueError: If `weights` is not `None` and has an incompatible shape.
   """
-  with ops.name_scope(
-      name, _at_k_name('true_positive', k, class_id=class_id),
-      (predictions_idx, labels, weights)) as scope:
+  with ops.name_scope(name, _at_k_name('true_positive', k, class_id=class_id),
+                      (predictions_idx, labels, weights)) as scope:
     tp = _sparse_true_positive_at_k(
-        predictions_idx=predictions_idx, labels=labels, class_id=class_id,
+        predictions_idx=predictions_idx,
+        labels=labels,
+        class_id=class_id,
         weights=weights)
     batch_total_tp = math_ops.to_double(math_ops.reduce_sum(tp))
 
@@ -2189,18 +2264,16 @@ def _sparse_false_negative_at_k(labels,
   Returns:
     A [D1, ... DN] `Tensor` of false negative counts.
   """
-  with ops.name_scope(
-      None, 'false_negatives', (predictions_idx, labels, weights)):
-    labels, predictions_idx = _maybe_select_class_id(labels,
-                                                     predictions_idx,
+  with ops.name_scope(None, 'false_negatives',
+                      (predictions_idx, labels, weights)):
+    labels, predictions_idx = _maybe_select_class_id(labels, predictions_idx,
                                                      class_id)
-    fn = sets.set_size(sets.set_difference(predictions_idx,
-                                           labels,
-                                           aminusb=False))
+    fn = sets.set_size(
+        sets.set_difference(predictions_idx, labels, aminusb=False))
     fn = math_ops.to_double(fn)
     if weights is not None:
-      with ops.control_dependencies((
-          weights_broadcast_ops.assert_broadcastable(weights, fn),)):
+      with ops.control_dependencies((weights_broadcast_ops.assert_broadcastable(
+          weights, fn),)):
         weights = math_ops.to_double(weights)
         fn = math_ops.multiply(fn, weights)
     return fn
@@ -2244,11 +2317,12 @@ def _streaming_sparse_false_negative_at_k(labels,
   Raises:
     ValueError: If `weights` is not `None` and has an incompatible shape.
   """
-  with ops.name_scope(
-      name, _at_k_name('false_negative', k, class_id=class_id),
-      (predictions_idx, labels, weights)) as scope:
+  with ops.name_scope(name, _at_k_name('false_negative', k, class_id=class_id),
+                      (predictions_idx, labels, weights)) as scope:
     fn = _sparse_false_negative_at_k(
-        predictions_idx=predictions_idx, labels=labels, class_id=class_id,
+        predictions_idx=predictions_idx,
+        labels=labels,
+        class_id=class_id,
         weights=weights)
     batch_total_fn = math_ops.to_double(math_ops.reduce_sum(fn))
 
@@ -2335,9 +2409,8 @@ def recall_at_k(labels,
     raise RuntimeError('tf.metrics.recall_at_k is not '
                        'supported when eager execution is enabled.')
 
-  with ops.name_scope(
-      name, _at_k_name('recall', k, class_id=class_id),
-      (predictions, labels, weights)) as scope:
+  with ops.name_scope(name, _at_k_name('recall', k, class_id=class_id),
+                      (predictions, labels, weights)) as scope:
     _, top_k_idx = nn.top_k(predictions, k)
     return recall_at_top_k(
         labels=labels,
@@ -2404,16 +2477,21 @@ def recall_at_top_k(labels,
     `predictions`, or if either `metrics_collections` or `updates_collections`
     are not a list or tuple.
   """
-  with ops.name_scope(name,
-                      _at_k_name('recall', k, class_id=class_id),
+  with ops.name_scope(name, _at_k_name('recall', k, class_id=class_id),
                       (predictions_idx, labels, weights)) as scope:
     labels = _maybe_expand_labels(labels, predictions_idx)
     top_k_idx = math_ops.to_int64(predictions_idx)
     tp, tp_update = _streaming_sparse_true_positive_at_k(
-        predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
+        predictions_idx=top_k_idx,
+        labels=labels,
+        k=k,
+        class_id=class_id,
         weights=weights)
     fn, fn_update = _streaming_sparse_false_negative_at_k(
-        predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
+        predictions_idx=top_k_idx,
+        labels=labels,
+        k=k,
+        class_id=class_id,
         weights=weights)
 
     metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope)
@@ -2427,9 +2505,13 @@ def recall_at_top_k(labels,
 
 
 @tf_export('metrics.recall_at_thresholds')
-def recall_at_thresholds(labels, predictions, thresholds,
-                         weights=None, metrics_collections=None,
-                         updates_collections=None, name=None):
+def recall_at_thresholds(labels,
+                         predictions,
+                         thresholds,
+                         weights=None,
+                         metrics_collections=None,
+                         updates_collections=None,
+                         name=None):
   """Computes various recall values for different `thresholds` on `predictions`.
 
   The `recall_at_thresholds` function creates four local variables,
@@ -2483,6 +2565,7 @@ def recall_at_thresholds(labels, predictions, thresholds,
 
     # Avoid division by zero.
     epsilon = 1e-7
+
     def compute_recall(tp, fn, name):
       return math_ops.div(tp, epsilon + tp + fn, name='recall_' + name)
 
@@ -2499,7 +2582,9 @@ def recall_at_thresholds(labels, predictions, thresholds,
 
 
 @tf_export('metrics.root_mean_squared_error')
-def root_mean_squared_error(labels, predictions, weights=None,
+def root_mean_squared_error(labels,
+                            predictions,
+                            weights=None,
                             metrics_collections=None,
                             updates_collections=None,
                             name=None):
@@ -2552,9 +2637,9 @@ def root_mean_squared_error(labels, predictions, weights=None,
 
   predictions, labels, weights = _remove_squeezable_dimensions(
       predictions=predictions, labels=labels, weights=weights)
-  mse, update_mse_op = mean_squared_error(
-      labels, predictions, weights, None, None,
-      name or 'root_mean_squared_error')
+  mse, update_mse_op = mean_squared_error(labels, predictions, weights, None,
+                                          None, name or
+                                          'root_mean_squared_error')
 
   rmse = math_ops.sqrt(mse)
   update_rmse_op = math_ops.sqrt(update_mse_op)
@@ -2569,9 +2654,14 @@ def root_mean_squared_error(labels, predictions, weights=None,
 
 
 @tf_export('metrics.sensitivity_at_specificity')
-def sensitivity_at_specificity(
-    labels, predictions, specificity, weights=None, num_thresholds=200,
-    metrics_collections=None, updates_collections=None, name=None):
+def sensitivity_at_specificity(labels,
+                               predictions,
+                               specificity,
+                               weights=None,
+                               num_thresholds=200,
+                               metrics_collections=None,
+                               updates_collections=None,
+                               name=None):
   """Computes the specificity at a given sensitivity.
 
   The `sensitivity_at_specificity` function creates four local
@@ -2632,8 +2722,9 @@ def sensitivity_at_specificity(
   with variable_scope.variable_scope(name, 'sensitivity_at_specificity',
                                      (predictions, labels, weights)):
     kepsilon = 1e-7  # to account for floating point imprecisions
-    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                  for i in range(num_thresholds-2)]
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
     thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
 
     values, update_ops = _confusion_matrix_at_thresholds(
@@ -2645,8 +2736,7 @@ def sensitivity_at_specificity(
       tf_index = math_ops.cast(tf_index, dtypes.int32)
 
       # Now, we have the implicit threshold, so compute the sensitivity:
-      return math_ops.div(tp[tf_index],
-                          tp[tf_index] + fn[tf_index] + kepsilon,
+      return math_ops.div(tp[tf_index], tp[tf_index] + fn[tf_index] + kepsilon,
                           name)
 
     sensitivity = compute_sensitivity_at_specificity(
@@ -2685,8 +2775,8 @@ def _expand_and_tile(tensor, multiple, dim=0, name=None):
   """
   if multiple < 1:
     raise ValueError('Invalid multiple %s, must be > 0.' % multiple)
-  with ops.name_scope(
-      name, 'expand_and_tile', (tensor, multiple, dim)) as scope:
+  with ops.name_scope(name, 'expand_and_tile',
+                      (tensor, multiple, dim)) as scope:
     # Sparse.
     tensor = sparse_tensor.convert_to_tensor_or_sparse_tensor(tensor)
     if isinstance(tensor, sparse_tensor.SparseTensor):
@@ -2786,8 +2876,8 @@ def _sparse_average_precision_at_top_k(labels, predictions_idx):
   Raises:
     ValueError: if the last dimension of predictions_idx is not set.
   """
-  with ops.name_scope(
-      None, 'average_precision', (predictions_idx, labels)) as scope:
+  with ops.name_scope(None, 'average_precision',
+                      (predictions_idx, labels)) as scope:
     predictions_idx = math_ops.to_int64(predictions_idx, name='predictions_idx')
     if predictions_idx.get_shape().ndims == 0:
       raise ValueError('The rank of predictions_idx must be at least 1.')
@@ -2824,10 +2914,12 @@ def _sparse_average_precision_at_top_k(labels, predictions_idx):
     retrieved_per_k = math_ops.cumsum(
         array_ops.ones_like(relevant_per_k), axis=-1, name='retrieved_per_k')
     precision_per_k = math_ops.div(
-        math_ops.to_double(tp_per_k), math_ops.to_double(retrieved_per_k),
+        math_ops.to_double(tp_per_k),
+        math_ops.to_double(retrieved_per_k),
         name='precision_per_k')
     relevant_precision_per_k = math_ops.multiply(
-        precision_per_k, math_ops.to_double(relevant_per_k),
+        precision_per_k,
+        math_ops.to_double(relevant_per_k),
         name='relevant_precision_per_k')
 
     # Reduce along k dimension to get the sum, yielding a [D1, ... DN] tensor.
@@ -3017,9 +3109,8 @@ def average_precision_at_k(labels,
 
   if k < 1:
     raise ValueError('Invalid k=%s.' % k)
-  with ops.name_scope(
-      name, _at_k_name('average_precision', k),
-      (predictions, labels, weights)) as scope:
+  with ops.name_scope(name, _at_k_name('average_precision', k),
+                      (predictions, labels, weights)) as scope:
     # Calculate top k indices to produce [D1, ... DN, k] tensor.
     _, predictions_idx = nn.top_k(predictions, k)
     return _streaming_sparse_average_precision_at_top_k(
@@ -3060,17 +3151,16 @@ def _sparse_false_positive_at_k(labels,
   Returns:
     A [D1, ... DN] `Tensor` of false positive counts.
   """
-  with ops.name_scope(
-      None, 'false_positives', (predictions_idx, labels, weights)):
-    labels, predictions_idx = _maybe_select_class_id(labels,
-                                                     predictions_idx,
+  with ops.name_scope(None, 'false_positives',
+                      (predictions_idx, labels, weights)):
+    labels, predictions_idx = _maybe_select_class_id(labels, predictions_idx,
                                                      class_id)
-    fp = sets.set_size(sets.set_difference(
-        predictions_idx, labels, aminusb=True))
+    fp = sets.set_size(
+        sets.set_difference(predictions_idx, labels, aminusb=True))
     fp = math_ops.to_double(fp)
     if weights is not None:
-      with ops.control_dependencies((
-          weights_broadcast_ops.assert_broadcastable(weights, fp),)):
+      with ops.control_dependencies((weights_broadcast_ops.assert_broadcastable(
+          weights, fp),)):
         weights = math_ops.to_double(weights)
         fp = math_ops.multiply(fp, weights)
     return fp
@@ -3114,11 +3204,12 @@ def _streaming_sparse_false_positive_at_k(labels,
   Raises:
     ValueError: If `weights` is not `None` and has an incompatible shape.
   """
-  with ops.name_scope(
-      name, _at_k_name('false_positive', k, class_id=class_id),
-      (predictions_idx, labels, weights)) as scope:
+  with ops.name_scope(name, _at_k_name('false_positive', k, class_id=class_id),
+                      (predictions_idx, labels, weights)) as scope:
     fp = _sparse_false_positive_at_k(
-        predictions_idx=predictions_idx, labels=labels, class_id=class_id,
+        predictions_idx=predictions_idx,
+        labels=labels,
+        class_id=class_id,
         weights=weights)
     batch_total_fp = math_ops.to_double(math_ops.reduce_sum(fp))
 
@@ -3190,10 +3281,16 @@ def precision_at_top_k(labels,
     labels = _maybe_expand_labels(labels, predictions_idx)
     top_k_idx = math_ops.to_int64(predictions_idx)
     tp, tp_update = _streaming_sparse_true_positive_at_k(
-        predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
+        predictions_idx=top_k_idx,
+        labels=labels,
+        k=k,
+        class_id=class_id,
         weights=weights)
     fp, fp_update = _streaming_sparse_false_positive_at_k(
-        predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
+        predictions_idx=top_k_idx,
+        labels=labels,
+        k=k,
+        class_id=class_id,
         weights=weights)
 
     metric = math_ops.div(tp, math_ops.add(tp, fp), name=scope)
@@ -3323,9 +3420,14 @@ def precision_at_k(labels,
 
 
 @tf_export('metrics.specificity_at_sensitivity')
-def specificity_at_sensitivity(
-    labels, predictions, sensitivity, weights=None, num_thresholds=200,
-    metrics_collections=None, updates_collections=None, name=None):
+def specificity_at_sensitivity(labels,
+                               predictions,
+                               sensitivity,
+                               weights=None,
+                               num_thresholds=200,
+                               metrics_collections=None,
+                               updates_collections=None,
+                               name=None):
   """Computes the specificity at a given sensitivity.
 
   The `specificity_at_sensitivity` function creates four local
@@ -3386,8 +3488,9 @@ def specificity_at_sensitivity(
   with variable_scope.variable_scope(name, 'specificity_at_sensitivity',
                                      (predictions, labels, weights)):
     kepsilon = 1e-7  # to account for floating point imprecisions
-    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                  for i in range(num_thresholds-2)]
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
     thresholds = [0.0 - kepsilon] + thresholds + [1.0 - kepsilon]
 
     values, update_ops = _confusion_matrix_at_thresholds(
@@ -3419,8 +3522,7 @@ def specificity_at_sensitivity(
       tf_index = math_ops.cast(tf_index, dtypes.int32)
 
       # Now, we have the implicit threshold, so compute the specificity:
-      return math_ops.div(tn[tf_index],
-                          tn[tf_index] + fp[tf_index] + kepsilon,
+      return math_ops.div(tn[tf_index], tn[tf_index] + fp[tf_index] + kepsilon,
                           name)
 
     specificity = compute_specificity_at_sensitivity(
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 837ee02e64..3268fd0e0a 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -196,9 +196,12 @@ def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None):
       targets * -log(sigmoid(logits)) +
           (1 - targets) * -log(1 - sigmoid(logits))
 
-  A value `pos_weights > 1` decreases the false negative count, hence increasing the recall.
-  Conversely setting `pos_weights < 1` decreases the false positive count and increases the precision.
-  This can be seen from the fact that `pos_weight` is introduced as a multiplicative coefficient for the positive targets term 
+  A value `pos_weights > 1` decreases the false negative count, hence increasing
+  the recall.
+  Conversely setting `pos_weights < 1` decreases the false positive count and
+  increases the precision.
+  This can be seen from the fact that `pos_weight` is introduced as a
+  multiplicative coefficient for the positive targets term
   in the loss expression:
 
       targets * -log(sigmoid(logits)) * pos_weight +
@@ -646,9 +649,12 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
 
 
 @tf_export("nn.moments")
-def moments(x, axes,
-            shift=None,  # pylint: disable=unused-argument
-            name=None, keep_dims=False):
+def moments(
+    x,
+    axes,
+    shift=None,  # pylint: disable=unused-argument
+    name=None,
+    keep_dims=False):
   """Calculate the mean and variance of `x`.
 
   The mean and variance are calculated by aggregating the contents of `x`
@@ -692,8 +698,8 @@ def moments(x, axes,
       mean = array_ops.squeeze(mean, axes)
       variance = array_ops.squeeze(variance, axes)
     if x.dtype == dtypes.float16:
-      return (math_ops.cast(mean, dtypes.float16), math_ops.cast(
-          variance, dtypes.float16))
+      return (math_ops.cast(mean, dtypes.float16),
+              math_ops.cast(variance, dtypes.float16))
     else:
       return (mean, variance)
 
@@ -824,8 +830,8 @@ def batch_normalization(x,
     inv = math_ops.rsqrt(variance + variance_epsilon)
     if scale is not None:
       inv *= scale
-    return x * inv + (offset - mean * inv
-                      if offset is not None else -mean * inv)
+    return x * inv + (
+        offset - mean * inv if offset is not None else -mean * inv)
 
 
 @tf_export("nn.fused_batch_norm")
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 6767564024..5a45bdc1e5 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -131,8 +131,7 @@ class LogPoissonLossTest(test_lib.TestCase):
     y_np = self._log_poisson_loss(x_np, z_np, compute_full_loss=False)
     y_np_stirling = self._log_poisson_loss(x_np, z_np, compute_full_loss=True)
     y_tf = nn_impl.log_poisson_loss(z_np, x_np, compute_full_loss=False)
-    y_tf_stirling = nn_impl.log_poisson_loss(
-        z_np, x_np, compute_full_loss=True)
+    y_tf_stirling = nn_impl.log_poisson_loss(z_np, x_np, compute_full_loss=True)
     y_tf_np = self.evaluate(y_tf)
     y_tf_np_stirling = self.evaluate(y_tf_stirling)
     eps = 1e-3
@@ -773,8 +772,8 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     def _SoftmaxCrossEntropyWithLogits(logits, targets):
       # logits, targets: float arrays of the same shape.
       assert logits.shape == targets.shape
-      stable_exp_logits = np.exp(logits - np.amax(
-          logits, axis=1, keepdims=True))
+      stable_exp_logits = np.exp(
+          logits - np.amax(logits, axis=1, keepdims=True))
       pred = stable_exp_logits / np.sum(stable_exp_logits, 1, keepdims=True)
       return -np.sum(targets * np.log(pred + 1.0e-20), axis=1)
 
@@ -865,8 +864,8 @@ class LeakyReluTest(test_lib.TestCase):
     batch_size = 3
     height, width = 4, 4
     np.random.seed(1)  # Make it reproducible.
-    inputs = np.random.uniform(
-        size=(batch_size, height, width, 3)).astype(np.float32)
+    inputs = np.random.uniform(size=(batch_size, height, width, 3)).astype(
+        np.float32)
     inputs = constant_op.constant(inputs)
 
     outputs = nn_ops.leaky_relu(inputs)
@@ -884,7 +883,8 @@ class LeakyReluTest(test_lib.TestCase):
       with self.test_session() as sess:
         outputs = sess.run(outputs)
       tol = 2e-3 if dtype == np.float16 else 1e-6
-      self.assertAllClose(outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol)
+      self.assertAllClose(
+          outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol)
 
 
 class SwishTest(test_lib.TestCase):
@@ -915,7 +915,10 @@ class SwishTest(test_lib.TestCase):
 
 class MomentsTest(test_lib.TestCase):
 
-  def doOutputTest(self, input_shape, moments_axes, tol=1e-4,
+  def doOutputTest(self,
+                   input_shape,
+                   moments_axes,
+                   tol=1e-4,
                    check_gradients=False):
     for mu in [0.0, 1.0, 1e3]:
       for sigma in [1.0, 0.1]:
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 3ab0bd16fa..270d96a3c7 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Functions for Python 2 vs. 3 compatibility.
 
 ## Conversion routines
@@ -118,7 +117,7 @@ def path_to_str(path):
   Returns:
     A `str` object.
   """
-  if hasattr(path, "__fspath__"):
+  if hasattr(path, '__fspath__'):
     path = as_str_any(path.__fspath__())
   return path
 
@@ -129,11 +128,9 @@ integral_types = (_numbers.Integral, _np.integer)
 real_types = (_numbers.Real, _np.integer, _np.floating)
 complex_types = (_numbers.Complex, _np.number)
 
-
 # Either bytes or text.
 bytes_or_text_types = (bytes, _six.text_type)
 
-
 _allowed_symbols = [
     'as_str',
     'bytes_or_text_types',
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 8eee489e2d..38a9007387 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """This pip smoke test verifies dependency files exist in the pip package.
 
 This script runs bazel queries to see what python files are required by the
@@ -26,13 +25,12 @@ from __future__ import print_function
 import os
 import subprocess
 
+os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
 
-os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
-
-
-PIP_PACKAGE_QUERY_EXPRESSION = \
-  'deps(//tensorflow/tools/pip_package:build_pip_package)'
+PIP_PACKAGE_QUERY_EXPRESSION = (
+    "deps(//tensorflow/tools/pip_package:build_pip_package)")
 
+# pylint: disable=g-backslash-continuation
 PY_TEST_QUERY_EXPRESSION = 'deps(\
   filter("^((?!benchmark).)*$",\
   kind(py_test,\
@@ -40,6 +38,7 @@ PY_TEST_QUERY_EXPRESSION = 'deps(\
   + //tensorflow/contrib/... \
   - //tensorflow/contrib/tensorboard/... \
   - attr(tags, "manual|no_pip", //tensorflow/...))), 1)'
+# pylint: enable=g-backslash-continuation
 
 # Hard-coded blacklist of files if not included in pip package
 # TODO(amitpatankar): Clean up blacklist.
@@ -90,15 +89,15 @@ def main():
   """
 
   # pip_package_dependencies_list is the list of included files in pip packages
-  pip_package_dependencies = subprocess.check_output([
-      'bazel', 'query', PIP_PACKAGE_QUERY_EXPRESSION])
+  pip_package_dependencies = subprocess.check_output(
+      ["bazel", "query", PIP_PACKAGE_QUERY_EXPRESSION])
   pip_package_dependencies_list = pip_package_dependencies.strip().split("\n")
   print("Pip package superset size: %d" % len(pip_package_dependencies_list))
 
   # tf_py_test_dependencies is the list of dependencies for all python
   # tests in tensorflow
-  tf_py_test_dependencies = subprocess.check_output([
-      'bazel', 'query', PY_TEST_QUERY_EXPRESSION])
+  tf_py_test_dependencies = subprocess.check_output(
+      ["bazel", "query", PY_TEST_QUERY_EXPRESSION])
   tf_py_test_dependencies_list = tf_py_test_dependencies.strip().split("\n")
   print("Pytest dependency subset size: %d" % len(tf_py_test_dependencies_list))
 
@@ -119,8 +118,7 @@ def main():
 
       # Check if the dependency is in the pip package, the blacklist, or
       # should be ignored because of its file extension
-      if not (ignore or
-              dependency in pip_package_dependencies_list or
+      if not (ignore or dependency in pip_package_dependencies_list or
               dependency in BLACKLIST):
         missing_dependencies.append(dependency)
 
@@ -131,9 +129,9 @@ def main():
     for missing_dependency in missing_dependencies:
       print("\nMissing dependency: %s " % missing_dependency)
       print("Affected Tests:")
-      rdep_query = 'rdeps(kind(py_test, \
-      //tensorflow/python/...), %s)' % missing_dependency
-      affected_tests = subprocess.check_output(['bazel', 'query', rdep_query])
+      rdep_query = ("rdeps(kind(py_test, //tensorflow/python/...), %s)" %
+                    missing_dependency)
+      affected_tests = subprocess.check_output(["bazel", "query", rdep_query])
       affected_tests_list = affected_tests.split("\n")[:-2]
       print("\n".join(affected_tests_list))
 
@@ -145,5 +143,6 @@ or add them to //tensorflow/tools/pip_package/BUILD.""")
   else:
     print("TEST PASSED")
 
+
 if __name__ == "__main__":
   main()
-- 
GitLab


From e8e1f1083d2c6ba4c2fb4e7e804d36624775e971 Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Wed, 24 Jan 2018 11:42:22 -0800
Subject: [PATCH 091/966] [BatchNorm] Remove CPU implementation

We now use batchnorm rewriter (tensorflow/compiler/xla/service/batchnorm_rewriter.h) to expand batch norm into smaller ops. A specific implementation should not be needed anymore (for CPU).

RELNOTES:n/a
PiperOrigin-RevId: 183117252
---
 .../compiler/xla/service/cpu/ir_emitter.cc    | 199 ------------------
 .../compiler/xla/service/cpu/ir_emitter.h     |   2 -
 2 files changed, 201 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index ac8d5d33fa..b03a9f9aa5 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -1226,205 +1226,6 @@ static llvm_ir::IrArray::Index FillReducedDimensionIndex(
   return index_with_free_var;
 }
 
-Status IrEmitter::HandleBatchNormTraining(HloInstruction* batch_norm_training) {
-  // The output of BatchNormTraining is a tuple of three element:
-  //   - An N-dimensional array containing normalized values.
-  //   - A 1 dimensional array containing the mean value for each feature.
-  //   - A 1 dimensional array containing the variance value for each feature.
-  HloInstruction* operand = batch_norm_training->operands()[0];
-  HloInstruction* scale = batch_norm_training->operands()[1];
-  HloInstruction* offset = batch_norm_training->operands()[2];
-  float epsilon = batch_norm_training->epsilon();
-  int64 feature_index = batch_norm_training->feature_index();
-  TF_RET_CHECK(ShapeUtil::IsTuple(batch_norm_training->shape()) &&
-               ShapeUtil::TupleElementCount(batch_norm_training->shape()) == 3);
-
-  const Shape& output_shape =
-      ShapeUtil::GetTupleElementShape(batch_norm_training->shape(), 0);
-  const Shape& feature_shape =
-      ShapeUtil::GetTupleElementShape(batch_norm_training->shape(), 1);
-
-  // Reduce vector of the non-feature dimensions.
-  std::vector<int64> dimensions_to_reduce;
-
-  for (int64 i = 0; i < operand->shape().dimensions_size(); ++i) {
-    if (i != feature_index) {
-      dimensions_to_reduce.push_back(i);
-    }
-  }
-
-  // Get the second and third allocations in the output tuple, which should be
-  // used to store the result of mean and variance value calculation.
-  TF_ASSIGN_OR_RETURN(
-      const BufferAllocation::Slice slice_mean,
-      assignment_.GetUniqueSlice(batch_norm_training, /*index=*/{1}));
-  TF_ASSIGN_OR_RETURN(
-      const BufferAllocation::Slice slice_var,
-      assignment_.GetUniqueSlice(batch_norm_training, /*index=*/{2}));
-  const int feature_count = output_shape.dimensions(feature_index);
-  const int size_in_elements = ShapeUtil::ElementsIn(output_shape);
-  TF_RET_CHECK(ShapeUtil::ElementsIn(operand->shape()) == size_in_elements);
-  const int elements_per_feature = size_in_elements / feature_count;
-
-  llvm::Value* mean = EmitTempBufferPointer(slice_mean, feature_shape);
-  llvm_ir::IrArray mean_array(mean, feature_shape);
-
-  llvm::Value* var = EmitTempBufferPointer(slice_var, feature_shape);
-  llvm_ir::IrArray var_array(var, feature_shape);
-
-  // This loop calculates mean and variance for each feature.
-  //
-  // In theory this could be swapped by multi-output fusion. We will evaluate
-  // this when it's ready.
-  //
-  // For variance calculation, we use a simplified formula so we can fuse the
-  // computation into the same loop to calculate mean: Var=E(X^2) - E(X)^2.
-  TF_RETURN_IF_ERROR(
-      llvm_ir::LoopEmitter(
-          [&](const llvm_ir::IrArray::Index& index) {
-            PrimitiveType element_type = operand->shape().element_type();
-            // Used to calculate E(X).
-            llvm::Value* sum_address = llvm_ir::EmitAllocaAtFunctionEntry(
-                llvm_ir::PrimitiveTypeToIrType(element_type, module_),
-                "sum_address", &ir_builder_,
-                MinimumAlignmentForPrimitiveType(element_type));
-
-            // Used to calculate E(X^2).
-            llvm::Value* sum_square_address =
-                llvm_ir::EmitAllocaAtFunctionEntry(
-                    llvm_ir::PrimitiveTypeToIrType(element_type, module_),
-                    "sum_square_address", &ir_builder_,
-                    MinimumAlignmentForPrimitiveType(element_type));
-
-            ir_builder_.CreateStore(
-                llvm::ConstantFP::get(ir_builder_.getFloatTy(), 0.0),
-                sum_address);
-
-            ir_builder_.CreateStore(
-                llvm::ConstantFP::get(ir_builder_.getFloatTy(), 0.0),
-                sum_square_address);
-
-            llvm_ir::ForLoopNest loops(IrName(batch_norm_training, "inner"),
-                                       &ir_builder_);
-
-            const llvm_ir::IrArray::Index reduced_dims_index =
-                loops.AddLoopsForShapeOnDimensions(
-                    operand->shape(), dimensions_to_reduce, "reduction_dim");
-
-            SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(),
-                                  &ir_builder_);
-
-            llvm_ir::IrArray operand_array(GetIrArrayFor(operand));
-            llvm_ir::IrArray::Index input_index =
-                FillReducedDimensionIndex(reduced_dims_index, index);
-            llvm::Value* new_value =
-                operand_array.EmitReadArrayElement(input_index, &ir_builder_);
-
-            llvm::Value* new_value_square =
-                ir_builder_.CreateFMul(new_value, new_value);
-
-            llvm::Value* current_sum = ir_builder_.CreateLoad(sum_address);
-            llvm::Value* current_sum_square =
-                ir_builder_.CreateLoad(sum_square_address);
-            // Update sum.
-            ir_builder_.CreateStore(
-                ir_builder_.CreateFAdd(current_sum, new_value), sum_address);
-
-            // Update sum square.
-            ir_builder_.CreateStore(
-                ir_builder_.CreateFAdd(current_sum_square, new_value_square),
-                sum_square_address);
-
-            SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(),
-                                  &ir_builder_);
-
-            llvm::Value* sum = ir_builder_.CreateLoad(sum_address);
-            llvm::Value* elements_per_feature_value = llvm::ConstantFP::get(
-                ir_builder_.getFloatTy(), elements_per_feature);
-            llvm::Value* mean =
-                ir_builder_.CreateFDiv(sum, elements_per_feature_value);
-            llvm::Value* mean_square = ir_builder_.CreateFMul(mean, mean);
-            llvm::Value* sum_square =
-                ir_builder_.CreateLoad(sum_square_address);
-
-            // Var=E(X^2) - E(X)^2.
-            llvm::Value* var = ir_builder_.CreateFSub(
-                ir_builder_.CreateFDiv(sum_square, elements_per_feature_value),
-                mean_square);
-
-            var_array.EmitWriteArrayElement(index, var, &ir_builder_);
-            return mean;
-          },
-          mean_array, &ir_builder_)
-          .EmitLoop(IrName(batch_norm_training, "mean_var")));
-
-  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(batch_norm_training));
-  TF_ASSIGN_OR_RETURN(
-      const BufferAllocation::Slice slice,
-      assignment_.GetUniqueSlice(batch_norm_training, /*index=*/{0}));
-
-  llvm::Value* normalized = EmitTempBufferPointer(slice, output_shape);
-
-  llvm_ir::IrArray target_array(normalized, output_shape);
-
-  AddAliasingInformationToIrArray(*batch_norm_training, &target_array);
-
-  TF_RETURN_IF_ERROR(
-      llvm_ir::LoopEmitter(
-          [this, mean_array, var_array, epsilon, operand, dimensions_to_reduce,
-           feature_index, offset, scale](const llvm_ir::IrArray::Index& index) {
-            // The following logic normalizes the input value, scales and shifts
-            // it:
-            //
-            // normalized = (input - mean) / sqrt(variance + epsilon)
-            // result = normalized * scale + offset
-
-            // Current index in the feature dimension.
-            llvm_ir::IrArray::Index feature_index_value(1,
-                                                        index[feature_index]);
-
-            llvm::Value* mean = mean_array.EmitReadArrayElement(
-                feature_index_value, &ir_builder_);
-            llvm::Value* var = var_array.EmitReadArrayElement(
-                feature_index_value, &ir_builder_);
-
-            llvm_ir::IrArray operand_array(GetIrArrayFor(operand));
-            llvm::Value* input =
-                operand_array.EmitReadArrayElement(index, &ir_builder_);
-
-            llvm::Value* variance_with_epsilon = ir_builder_.CreateFAdd(
-                var, llvm::ConstantFP::get(ir_builder_.getFloatTy(), epsilon));
-            llvm::Function* func_llvm_sqrt = llvm::Intrinsic::getDeclaration(
-                module_, llvm::Intrinsic::sqrt, {ir_builder_.getFloatTy()});
-            llvm::Value* variance_sqrt =
-                ir_builder_.CreateCall(func_llvm_sqrt, {variance_with_epsilon});
-            llvm::Value* normalized = ir_builder_.CreateFDiv(
-                ir_builder_.CreateFSub(input, mean), variance_sqrt);
-            llvm_ir::IrArray offset_array(GetIrArrayFor(offset));
-            llvm::Value* offset = offset_array.EmitReadArrayElement(
-                feature_index_value, &ir_builder_);
-            llvm_ir::IrArray scale_array(GetIrArrayFor(scale));
-            llvm::Value* scale = scale_array.EmitReadArrayElement(
-                feature_index_value, &ir_builder_);
-            llvm::Value* result = ir_builder_.CreateFAdd(
-                ir_builder_.CreateFMul(normalized, scale), offset);
-
-            return result;
-          },
-          target_array, &ir_builder_)
-          .EmitLoop(IrName(batch_norm_training, "normalize")));
-
-  llvm_ir::EmitTuple(GetIrArrayFor(batch_norm_training),
-                     {normalized, mean, var}, &ir_builder_, module_);
-  return Status::OK();
-}
-
-Status IrEmitter::HandleBatchNormGrad(HloInstruction* batch_norm_grad) {
-  // TODO(b/62843645) Implement BatchNormGrad on CPU backend.
-  return Unimplemented(
-      "BatchNormGrad is not implemented on CPU. See b/62843645.");
-}
-
 Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   VLOG(2) << "HandleParameter: " << parameter->ToString();
   auto param_number = parameter->parameter_number();
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 66f2aeeab3..5094402514 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -125,8 +125,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleDot(HloInstruction* dot) override;
   Status HandleConvolution(HloInstruction* convolution) override;
   Status HandleFft(HloInstruction* fft) override;
-  Status HandleBatchNormTraining(HloInstruction* batch_norm_training) override;
-  Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override;
   Status HandleCrossReplicaSum(HloInstruction* crs) override;
   Status HandleInfeed(HloInstruction* infeed) override;
   Status HandleOutfeed(HloInstruction* outfeed) override;
-- 
GitLab


From 94faa1e921d647f321ca1b3204a25aa0dbede856 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 24 Jan 2018 11:44:55 -0800
Subject: [PATCH 092/966] Internal change.

PiperOrigin-RevId: 183117665
---
 tensorflow/tools/api/generator/BUILD          |  3 +++
 .../tools/api/generator/create_python_api.py  | 22 ++++++++++++++-----
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 4c47ffe07c..d110316395 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -86,4 +86,7 @@ py_library(
     name = "python_api",
     srcs = [":python_api_gen"],
     srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib:contrib_py",  # keep
+    ],
 )
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 2c9a2fa731..1557314939 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -31,6 +31,7 @@ from tensorflow.python.util import tf_decorator
 _API_CONSTANTS_ATTR = '_tf_api_constants'
 _API_NAMES_ATTR = '_tf_api_names'
 _API_DIR = '/api/'
+_CONTRIB_IMPORT = 'from tensorflow import contrib'
 _GENERATED_FILE_HEADER = """\"\"\"Imports for Python API.
 
 This file is MACHINE GENERATED! Do not edit.
@@ -50,11 +51,17 @@ def format_import(source_module_name, source_name, dest_name):
   Returns:
     An import statement string.
   """
-  if source_name == dest_name:
-    return 'from %s import %s' % (source_module_name, source_name)
+  if source_module_name:
+    if source_name == dest_name:
+      return 'from %s import %s' % (source_module_name, source_name)
+    else:
+      return 'from %s import %s as %s' % (
+          source_module_name, source_name, dest_name)
   else:
-    return 'from %s import %s as %s' % (
-        source_module_name, source_name, dest_name)
+    if source_name == dest_name:
+      return 'import %s' % source_name
+    else:
+      return 'import %s as %s' % (source_name, dest_name)
 
 
 def get_api_imports():
@@ -74,6 +81,9 @@ def get_api_imports():
     # Only look at tensorflow modules.
     if not module or 'tensorflow.' not in module.__name__:
       continue
+    # Do not generate __init__.py files for contrib modules for now.
+    if '.contrib.' in module.__name__ or module.__name__.endswith('.contrib'):
+      continue
 
     for module_contents_name in dir(module):
       attr = getattr(module, module_contents_name)
@@ -151,8 +161,10 @@ def create_api_files(output_files):
       os.makedirs(os.path.dirname(file_path))
     open(file_path, 'a').close()
 
-  # Add imports to output files.
   module_imports = get_api_imports()
+  module_imports['tf'].append(_CONTRIB_IMPORT)  # Include all of contrib.
+
+  # Add imports to output files.
   missing_output_files = []
   for module, exports in module_imports.items():
     # Make sure genrule output file list is in sync with API exports.
-- 
GitLab


From 23ed83de415b465fb5570216b8c171d63187289e Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Wed, 24 Jan 2018 20:58:28 +0100
Subject: [PATCH 093/966] Annotated return types for chaining dataset
 operations (#16359)

---
 tensorflow/python/data/ops/dataset_ops.py | 44 +++++++++++------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 0594c6d6a7..c1ba67e474 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -201,7 +201,7 @@ class Dataset(object):
       tensors: A nested structure of tensors.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return TensorDataset(tensors)
 
@@ -214,7 +214,7 @@ class Dataset(object):
         0th dimension.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return TensorSliceDataset(tensors)
 
@@ -227,7 +227,7 @@ class Dataset(object):
       sparse_tensor: A `tf.SparseTensor`.
 
     Returns:
-      A `Dataset` of rank-(N-1) sparse tensors.
+      Dataset: A `Dataset` of rank-(N-1) sparse tensors.
     """
     return SparseTensorSliceDataset(sparse_tensor)
 
@@ -313,7 +313,7 @@ class Dataset(object):
         `generator`.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     if not callable(generator):
       raise TypeError("`generator` must be callable.")
@@ -456,7 +456,7 @@ class Dataset(object):
         len(args) == 3 -> start = args[0], stop = args[1, stop = args[2]
 
     Returns:
-      A `RangeDataset`.
+      Dataset: A `RangeDataset`.
 
     Raises:
       ValueError: if len(args) == 0.
@@ -500,7 +500,7 @@ class Dataset(object):
       datasets: A nested structure of datasets.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return ZipDataset(datasets)
 
@@ -526,7 +526,7 @@ class Dataset(object):
       dataset: `Dataset` to be concatenated.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return ConcatenateDataset(self, dataset)
 
@@ -538,7 +538,7 @@ class Dataset(object):
         maximum number elements that will be buffered when prefetching.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return PrefetchDataset(self, buffer_size)
 
@@ -561,7 +561,7 @@ class Dataset(object):
         the filename pattern that will be matched.
 
     Returns:
-     A `Dataset` of strings corresponding to file names.
+     Dataset: A `Dataset` of strings corresponding to file names.
     """
     return Dataset.from_tensor_slices(gen_io_ops.matching_files(file_pattern))
 
@@ -578,7 +578,7 @@ class Dataset(object):
         indefinitely.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return RepeatDataset(self, count)
 
@@ -602,7 +602,7 @@ class Dataset(object):
         iterated over. (Defaults to `True`.)
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return ShuffleDataset(self, buffer_size, seed, reshuffle_each_iteration)
 
@@ -615,7 +615,7 @@ class Dataset(object):
         If a filename is not provided, the dataset will be cached in memory.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return CacheDataset(self, filename)
 
@@ -629,7 +629,7 @@ class Dataset(object):
         dataset, the new dataset will contain all elements of this dataset.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return TakeDataset(self, count)
 
@@ -644,7 +644,7 @@ class Dataset(object):
         is -1, skips the entire dataset.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return SkipDataset(self, count)
 
@@ -691,7 +691,7 @@ class Dataset(object):
       index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
 
     Raises:
       ValueError: if `num_shards` or `index` are illegal values. Note: error
@@ -735,7 +735,7 @@ class Dataset(object):
         consecutive elements of this dataset to combine in a single batch.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return BatchDataset(self, batch_size)
 
@@ -764,7 +764,7 @@ class Dataset(object):
         the empty string for string types.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return PaddedBatchDataset(self, batch_size, padded_shapes, padding_values)
 
@@ -780,7 +780,7 @@ class Dataset(object):
         specified, elements will be processed sequentially.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     if num_parallel_calls is None:
       return MapDataset(self, map_func)
@@ -796,7 +796,7 @@ class Dataset(object):
         `Dataset`.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return FlatMapDataset(self, map_func)
 
@@ -865,7 +865,7 @@ class Dataset(object):
         input element before cycling to another input element.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return InterleaveDataset(self, map_func, cycle_length, block_length)
 
@@ -878,7 +878,7 @@ class Dataset(object):
         scalar `tf.bool` tensor.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return FilterDataset(self, predicate)
 
@@ -902,7 +902,7 @@ class Dataset(object):
         returns a `Dataset`.
 
     Returns:
-      The `Dataset` returned by applying `transformation_func` to this dataset.
+      Dataset: The `Dataset` returned by applying `transformation_func` to this dataset.
     """
     dataset = transformation_func(self)
     if not isinstance(dataset, Dataset):
-- 
GitLab


From 02c2214b8da916a4a9e9eb3fde2711e49e7b4703 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Wed, 24 Jan 2018 12:12:02 -0800
Subject: [PATCH 094/966] [XLA] Fix crash in HloOrdering::ToString().

HloOrdering::ToString() was crashing when given a computation containing
a fusion instruction.
PiperOrigin-RevId: 183121645
---
 tensorflow/compiler/xla/service/BUILD         |  1 +
 .../compiler/xla/service/hlo_ordering.cc      |  2 +-
 .../compiler/xla/service/hlo_ordering_test.cc | 52 +++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 2e0ac256cc..d0e2dc88ea 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -908,6 +908,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index 6f6e679a21..68e3c9618c 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -249,7 +249,7 @@ bool PredecessorHloOrdering::ExecutesBeforeInSameComputation(
 string PredecessorHloOrdering::ToStringHelper(const string& name) const {
   std::vector<string> pieces;
   pieces.push_back(name);
-  for (auto* computation : module_->computations()) {
+  for (auto* computation : module_->MakeNonfusionComputations()) {
     pieces.push_back(tensorflow::strings::Printf("computation %s:",
                                                  computation->name().c_str()));
     const auto all = computation->MakeInstructionPostOrder();
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index 33bafd05c1..aba66114de 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -310,5 +311,56 @@ TEST_F(HloOrderingTest, ValuesInWhileComputations) {
       *dataflow));
 }
 
+// Regression test for HloOrdering::ToString() crashing when fed a computation
+// containing a fusion node.
+TEST_F(HloOrderingTest, ToStringDoesNotCrash) {
+  const char* module_str = R"(
+HloModule test_module
+
+body.v8 {
+  prev.1 = (s32[], f32[3]{0}, f32[3]{0}, f32[3]{0}) parameter(0)
+  get-tuple-element.4 = s32[] get-tuple-element(prev.1), index=0
+  constant.1 = s32[] constant(1)
+  add = s32[] add(get-tuple-element.4, constant.1)
+  get-tuple-element.5 = f32[3]{0} get-tuple-element(prev.1), index=3
+  get-tuple-element.6 = f32[3]{0} get-tuple-element(prev.1), index=1
+  get-tuple-element.7 = f32[3]{0} get-tuple-element(prev.1), index=2
+  ROOT tuple = (s32[], f32[3]{0}, f32[3]{0}, f32[3]{0}) tuple(add, get-tuple-element.5, get-tuple-element.6, get-tuple-element.7)
+}
+
+condition.v4 {
+  constant.2 = s32[] constant(2)
+  prev.2 = (s32[], f32[3]{0}, f32[3]{0}, f32[3]{0}) parameter(0)
+  get-tuple-element.8 = s32[] get-tuple-element(prev.2), index=0
+  ROOT greater-than = pred[] greater-than(constant.2, get-tuple-element.8)
+}
+
+fused_computation {
+  get-tuple-element.5.param_1 = f32[3]{0} parameter(1)
+  get-tuple-element.6.param_2 = f32[3]{0} parameter(2)
+  add.4 = f32[3]{0} add(get-tuple-element.5.param_1, get-tuple-element.6.param_2)
+  get-tuple-element.7.param_1.1 = f32[3]{0} parameter(0)
+  ROOT add.5 = f32[3]{0} add(add.4, get-tuple-element.7.param_1.1)
+}
+
+ENTRY while.v11 {
+  constant.5 = s32[] constant(0)
+  constant.6 = f32[3]{0} constant({1, 1, 1})
+  constant.7 = f32[3]{0} constant({2, 2, 2})
+  constant.8 = f32[3]{0} constant({3, 3, 3})
+  tuple.1 = (s32[], f32[3]{0}, f32[3]{0}, f32[3]{0}) tuple(constant.5, constant.6, constant.7, constant.8)
+  while = (s32[], f32[3]{0}, f32[3]{0}, f32[3]{0}) while(tuple.1), condition=condition.v4, body=body.v8
+  get-tuple-element.9 = f32[3]{0} get-tuple-element(while), index=3
+  get-tuple-element.10 = f32[3]{0} get-tuple-element(while), index=1
+  get-tuple-element.11 = f32[3]{0} get-tuple-element(while), index=2
+  ROOT fusion = f32[3]{0} fusion(get-tuple-element.9, get-tuple-element.10, get-tuple-element.11), kind=kLoop, calls=fused_computation
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(module_str));
+  DependencyHloOrdering ordering(module.get());
+  ordering.ToString();  // Shouldn't crash.
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 83d3a47d25b87c7e14ffc902489401b47bf568f1 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 24 Jan 2018 12:46:09 -0800
Subject: [PATCH 095/966] Initializes the context before setting the device
 policy

PiperOrigin-RevId: 183126037
---
 tensorflow/python/eager/context.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index df483b69c4..8d708b8b04 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -413,6 +413,8 @@ class Context(object):
 
   @contextlib.contextmanager
   def device_policy(self, policy):
+    if not self._context_handle:
+      self._initialize_handle_and_devices()
     old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
         self._context_handle)
     pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
-- 
GitLab


From 0c97ad742c176b3854c03067afc42eac20810fe9 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Wed, 24 Jan 2018 12:52:02 -0800
Subject: [PATCH 096/966] Remove note about Ubuntu 14 incompatibility due to
 future change plans

---
 RELEASE.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index 3bb612848d..eb7323dac9 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -2,8 +2,6 @@
 
 ## Breaking Changes
 * Prebuilt binaries are now built against CUDA 9 and cuDNN 7.
-* Our Linux binaries are built using ubuntu 16 containers, potentially
-  introducing glibc incompatibility issues with ubuntu 14.
 * Starting from 1.6 release, our prebuilt binaries will use AVX instructions.
   This may break TF on older CPUs.
 
-- 
GitLab


From a7a74f141525d6163f2ea54c5b6c2965fe1d6d35 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 24 Jan 2018 12:47:00 -0800
Subject: [PATCH 097/966] tf_contextlib forgotten oops

PiperOrigin-RevId: 183126126
---
 tensorflow/python/eager/context.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 8d708b8b04..0569791352 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -411,7 +411,7 @@ class Context(object):
       self._initialize_handle_and_devices()
     pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._context_handle)
 
-  @contextlib.contextmanager
+  @tf_contextlib.contextmanager
   def device_policy(self, policy):
     if not self._context_handle:
       self._initialize_handle_and_devices()
-- 
GitLab


From c8f6966ec7f071e25ab8c5a8168539d18a86fad9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 13:09:10 -0800
Subject: [PATCH 098/966] Portability fix for parse_testdata.cc

PiperOrigin-RevId: 183129228
---
 tensorflow/contrib/lite/testing/parse_testdata.cc | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/testing/parse_testdata.cc b/tensorflow/contrib/lite/testing/parse_testdata.cc
index 7c371f2bd4..0caef0fe22 100644
--- a/tensorflow/contrib/lite/testing/parse_testdata.cc
+++ b/tensorflow/contrib/lite/testing/parse_testdata.cc
@@ -18,6 +18,7 @@ limitations under the License.
 // ASCII file.
 #include "tensorflow/contrib/lite/testing/parse_testdata.h"
 
+#include <cinttypes>
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
@@ -218,8 +219,8 @@ TfLiteStatus CheckOutputs(tflite::Interpreter* interpreter,
         int32_t computed = data[idx];
         int32_t reference = example.outputs[0].flat_data[idx];
         if (std::abs(computed - reference) > 0) {
-          fprintf(stderr, "output[%zu][%zu] did not match %d vs reference %f\n",
-                  i, idx, data[idx], example.outputs[0].flat_data[idx]);
+          fprintf(stderr, "output[%zu][%zu] did not match %d vs reference %d\n",
+                  i, idx, computed, reference);
           return kTfLiteError;
         }
       }
@@ -231,8 +232,9 @@ TfLiteStatus CheckOutputs(tflite::Interpreter* interpreter,
         int64_t reference = example.outputs[0].flat_data[idx];
         if (std::abs(computed - reference) > 0) {
           fprintf(stderr,
-                  "output[%zu][%zu] did not match %ld vs reference %f\n", i,
-                  idx, data[idx], example.outputs[0].flat_data[idx]);
+                  "output[%zu][%zu] did not match %" PRId64
+                  " vs reference %" PRId64 "\n",
+                  i, idx, computed, reference);
           return kTfLiteError;
         }
       }
-- 
GitLab


From c75a6087c19ed7203081af0b0f0bdb215aedfc00 Mon Sep 17 00:00:00 2001
From: Austin Anderson <angerson@google.com>
Date: Wed, 24 Jan 2018 13:24:18 -0800
Subject: [PATCH 099/966] Update version names to 1.5.0 from 1.5.0-rc1

---
 tensorflow/core/public/version.h              |  2 +-
 tensorflow/docs_src/install/install_c.md      |  2 +-
 tensorflow/docs_src/install/install_go.md     |  2 +-
 tensorflow/docs_src/install/install_java.md   | 22 +++++++++----------
 tensorflow/docs_src/install/install_linux.md  | 22 +++++++++----------
 tensorflow/docs_src/install/install_mac.md    | 10 ++++-----
 .../docs_src/install/install_sources.md       | 14 ++++++------
 tensorflow/tools/pip_package/setup.py         |  2 +-
 8 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index c2fad5dbd8..f28d89125e 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index ba1a4118ae..14add7c77e 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 87cc647317..d2af9d9843 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 49dc3cf47f..d87fedcf77 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.5.0-rc1</version>
+  <version>1.5.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.5.0-rc1</version>
+                 <version>1.5.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.5.0-rc1</version>
+  <version>1.5.0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.5.0-rc1</version>
+  <version>1.5.0</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0-rc1.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.5.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.5.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.5.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.5.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.5.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.5.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index f94bd4e5e6..ac8836748b 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -293,7 +293,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -705,14 +705,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 1851275571..ae4d760a2c 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -115,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -238,7 +238,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -347,7 +347,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -520,7 +520,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl
 </pre>
 
 
@@ -528,7 +528,7 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index f2af2919df..52e1bc79c3 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -355,10 +355,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.5.0rc1 on Linux:
+for TensorFlow 1.5.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -457,8 +457,8 @@ Stack Overflow and specify the `tensorflow` tag.
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
 
-<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.5.0-rc1</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@@ -474,7 +474,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@@ -487,8 +487,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.5.0-rc1</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 0c5f42b123..7654b8df1b 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.5.0-rc1'
+_VERSION = '1.5.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
-- 
GitLab


From 68ab9a99cf07ed5216d310417873572043803a1e Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Wed, 24 Jan 2018 13:21:54 -0800
Subject: [PATCH 100/966] Make TFLite Pad op have parity with TF Pad op.

PiperOrigin-RevId: 183131070
---
 tensorflow/contrib/lite/builtin_op_data.h     |   5 -
 tensorflow/contrib/lite/kernels/pad.cc        | 102 ++++++++++------
 tensorflow/contrib/lite/kernels/pad_test.cc   | 109 +++++++++++++-----
 tensorflow/contrib/lite/kernels/test_util.cc  |  36 ++++--
 tensorflow/contrib/lite/kernels/test_util.h   |   7 +-
 tensorflow/contrib/lite/model.cc              |  19 ---
 tensorflow/contrib/lite/schema/schema.fbs     |   2 -
 .../contrib/lite/schema/schema_generated.h    |  62 +---------
 .../contrib/lite/testing/generate_examples.py |  27 ++++-
 .../testing/generated_examples_zip_test.cc    |  59 +++++-----
 .../contrib/lite/toco/tflite/operator.cc      |  10 +-
 .../contrib/lite/toco/tflite/operator_test.cc |  10 --
 12 files changed, 237 insertions(+), 211 deletions(-)

diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index ab07c58c92..8269de8e3f 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -172,11 +172,6 @@ typedef struct {
 } TfLiteResizeBilinearParams;
 
 typedef struct {
-  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
-  // For now we will fix the maximum possible number of dimensions.
-  int before_padding[8];
-  int after_padding[8];
-  int num_dimensions;
 } TfLitePadParams;
 
 typedef struct {
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index 1a0d9d1505..569bf0fe8f 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -33,65 +33,93 @@ enum KernelType {
   kGenericOptimized,
 };
 
-// TODO(nupurgarg): Padding represented as a tensor is ignored. Only use the
-// `left_padding` and `right_padding` specified in `params`.
 struct PadContext {
   PadContext(TfLiteContext* context, TfLiteNode* node) {
-    params = reinterpret_cast<TfLitePadParams*>(node->builtin_data);
     input = GetInput(context, node, 0);
+    paddings = GetInput(context, node, 1);
     output = GetOutput(context, node, 0);
+    dims = NumDimensions(input);
   }
-  TfLitePadParams* params;
   TfLiteTensor* input;
+  TfLiteTensor* paddings;
   TfLiteTensor* output;
+  int dims;
 };
 
+// Resizes output array based on the input size and padding size. This function
+// is callable from both Prepare() and Eval() as long as the caller ensures the
+// paddings data is present.
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                PadContext* op_context) {
+  // TODO(nupurgarg): Our current implementations rely on the inputs being 4D.
+  TF_LITE_ENSURE_EQ(context, op_context->dims, 4);
+
+  // Ensures the paddings array is dims x 2.
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(op_context->paddings, 0),
+                    op_context->dims);
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(op_context->paddings, 1), 2);
+
+  // Determines the size of the output tensor.
+  const TfLiteIntArray* input_size = op_context->input->dims;
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(op_context->dims);
+  const int32* paddings_data = GetTensorData<int32>(op_context->paddings);
+
+  for (int idx = 0; idx < op_context->dims; ++idx) {
+    int before_padding = *paddings_data++;
+    int after_padding = *paddings_data++;
+
+    TF_LITE_ENSURE_MSG(context, (before_padding >= 0 && after_padding >= 0),
+                       "Pad value has to be greater than equal to 0.");
+
+    output_size->data[idx] =
+        (input_size->data[idx] + before_padding + after_padding);
+  }
+
+  return context->ResizeTensor(context, op_context->output, output_size);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  // Determines size of output tensor.
   PadContext op_context(context, node);
-  int dims = NumDimensions(op_context.input);
-  TF_LITE_ENSURE_EQ(context, dims, op_context.params->num_dimensions);
   TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
 
-  // TODO(nupurgarg): Our current implementations rely on the inputs being 4D.
-  TF_LITE_ENSURE_EQ(context, dims, 4);
-
-  const TfLiteIntArray* input_size = op_context.input->dims;
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(dims);
-  for (int idx = 0; idx < dims; ++idx) {
-    TF_LITE_ENSURE_MSG(context,
-                       (op_context.params->before_padding[idx] >= 0 &&
-                        op_context.params->after_padding[idx] >= 0),
-                       "Pad value has to be greater than equal to 0.");
-    output_size->data[idx] =
-        (input_size->data[idx] + op_context.params->before_padding[idx] +
-         op_context.params->after_padding[idx]);
+  // TODO(nupurgarg): Create wrapper functions for dynamic tensor logic.
+  // Exit early if paddings is a non-const tensor. Set output tensor to
+  // dynamic so output size can be determined in Eval.
+  if (op_context.paddings->allocation_type != kTfLiteMmapRo) {
+    op_context.output->allocation_type = kTfLiteDynamic;
+    return kTfLiteOk;
   }
-
-  return context->ResizeTensor(context, op_context.output, output_size);
+  return ResizeOutputTensor(context, &op_context);
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   PadContext op_context(context, node);
 
-  std::vector<int> before_padding(
-      op_context.params->before_padding,
-      op_context.params->before_padding + op_context.params->num_dimensions);
-  std::vector<int> after_padding(
-      op_context.params->after_padding,
-      op_context.params->after_padding + op_context.params->num_dimensions);
-
-  // TODO(nupurgarg): Change TOCO's implementation to use padding arrays
-  // in forward order (depth, width, height, batch).
-  // Converts from int[] = {depth, width, height, batch} to int[] = {batch,
-  // height, width, depth} to match TOCO's implementation of pad in
-  // referenced_ops.h and optimized_ops.h.
-  std::reverse(before_padding.begin(), before_padding.end());
-  std::reverse(after_padding.begin(), after_padding.end());
+  // Resize the output tensor if the output tensor is dynamic.
+  if (op_context.output->allocation_type == kTfLiteDynamic) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
+  }
+
+  // TODO(nupurgarg): Change kernel implementation to take in int* instead of
+  // vector<int> to remove malloc from Eval().
+  // Create before and after padding arrays that are accepted by the kernel.
+  std::vector<int> before_padding;
+  std::vector<int> after_padding;
+  const int32* paddings_data = GetTensorData<int32>(op_context.paddings);
+
+  // TODO(nupurgarg): Change kernel implementation to use padding arrays in
+  // forward order (depth, width, height, batch).
+  // Build paddings in order of int[] = {batch, height, width, depth} to match
+  // kernel implementation of Pad in referenced_ops.h and optimized_ops.h.
+  for (int idx = op_context.dims - 1; idx >= 0; --idx) {
+    before_padding.push_back(paddings_data[idx * 2]);
+    after_padding.push_back(paddings_data[idx * 2 + 1]);
+  }
 
 #define TF_LITE_PAD(type, scalar)                                           \
   type::Pad(GetTensorData<scalar>(op_context.input),                        \
diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/contrib/lite/kernels/pad_test.cc
index f3ea9417df..28834ad071 100644
--- a/tensorflow/contrib/lite/kernels/pad_test.cc
+++ b/tensorflow/contrib/lite/kernels/pad_test.cc
@@ -25,52 +25,87 @@ using ::testing::ElementsAreArray;
 
 class PadOpModel : public SingleOpModel {
  public:
-  PadOpModel(std::initializer_list<int> input_shape,
-             std::initializer_list<int> before_padding,
-             std::initializer_list<int> after_padding) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(
-        BuiltinOperator_PAD, BuiltinOptions_PadOptions,
-        CreatePadOptions(builder_, builder_.CreateVector<int>(before_padding),
-                         builder_.CreateVector<int>(after_padding))
-            .Union());
-    BuildInterpreter({input_shape});
-  }
-
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetPaddings(std::initializer_list<int> paddings) {
+    PopulateTensor<int>(paddings_, paddings);
+  }
+
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
- private:
+ protected:
   int input_;
   int output_;
+  int paddings_;
+};
+
+// Tests case where paddings is a const tensor.
+//
+// Example usage is as follows:
+//    PadOpDynamicModel m(input_shape, paddings_shape, paddings_data);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class PadOpConstModel : public PadOpModel {
+ public:
+  PadOpConstModel(std::initializer_list<int> input_shape,
+                  std::initializer_list<int> paddings_shape,
+                  std::initializer_list<int> paddings) {
+    input_ = AddInput(TensorType_FLOAT32);
+    paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
+                 CreatePadOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Test case where paddings is a non-const tensor.
+//
+// Example usage is as follows:
+//    PadOpDynamicModel m(input_shape, paddings_shape);
+//    m.SetInput(input_data);
+//    m.SetPaddings(paddings_data);
+//    m.Invoke();
+class PadOpDynamicModel : public PadOpModel {
+ public:
+  PadOpDynamicModel(std::initializer_list<int> input_shape,
+                    std::initializer_list<int> paddings_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    paddings_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
+                 CreatePadOptions(builder_).Union());
+    BuildInterpreter({input_shape, paddings_shape});
+  }
 };
 
 TEST(PadOpTest, TooManyDimensions) {
   EXPECT_DEATH(
-      PadOpModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 2, 3, 4, 5, 6, 7, 8, 9},
-                 {1, 2, 3, 4, 5, 6, 7, 8, 9}),
+      PadOpConstModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {9, 2},
+                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}),
       "dims != 4");
 }
 
-// TODO(nupurgarg): Test case where before padding and after padding arrays
-// don't contain the same number of dimensions.
 TEST(PadOpTest, UnequalDimensions) {
-  EXPECT_DEATH(PadOpModel({1, 1, 2, 1}, {1, 2, 3}, {1, 2, 3}),
-               "dims != op_context.params->num_dimensions");
+  EXPECT_DEATH(PadOpConstModel({1, 1, 2, 1}, {3, 2}, {1, 1, 2, 2, 3, 3}),
+               "3 != 4");
 }
 
 TEST(PadOpTest, InvalidPadValue) {
-  EXPECT_DEATH(PadOpModel({1, 1, 2, 1}, {0, 1, 2, 0}, {0, -1, -1, 0}),
-               "Pad value has to be greater than equal to 0.");
+  EXPECT_DEATH(
+      PadOpConstModel({1, 1, 2, 1}, {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}),
+      "Pad value has to be greater than equal to 0.");
 }
 
-TEST(PadOpTest, SimpleTest) {
-  PadOpModel m({1, 2, 2, 1}, {0, 1, 1, 0}, {0, 1, 1, 0});
+TEST(PadOpTest, SimpleConstTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadOpConstModel m({1, 2, 2, 1}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
@@ -78,10 +113,30 @@ TEST(PadOpTest, SimpleTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
-TEST(PadOpTest, AdvancedTest) {
-  // The padding is input in the order of batch, height, width, depth.
-  PadOpModel m({1, 2, 3, 1}, {0, 0, 1, 0}, {0, 2, 3, 0});
+TEST(PadOpTest, SimpleDynamicTest) {
+  PadOpDynamicModel m({1, 2, 2, 1}, {4, 2});
+  m.SetInput({1, 2, 3, 4});
+  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
+                                               0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadOpTest, AdvancedConstTest) {
+  PadOpConstModel m({1, 2, 3, 1}, {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST(PadOpTest, AdvancedDynamicTest) {
+  PadOpDynamicModel m({1, 2, 3, 1}, {4, 2});
   m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray({0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0,
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
index b69f2b3e4b..3a58e7ec32 100644
--- a/tensorflow/contrib/lite/kernels/test_util.cc
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -49,7 +49,7 @@ std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values,
   return matchers;
 }
 
-int SingleOpModel::AddTensor(TensorData t) {
+int SingleOpModel::AddTensor(TensorData t, std::initializer_list<int> data) {
   int id = tensors_.size();
 
   // This is slightly different depending on whether we are adding a
@@ -78,8 +78,23 @@ int SingleOpModel::AddTensor(TensorData t) {
         builder_.CreateVector<int64_t>({t.zero_point}));
   }
 
-  tensors_.push_back(CreateTensor(builder_, builder_.CreateVector<int>({}),
-                                  t.type, /*buffer=*/0,
+  int buffer_id = 0;
+  if (data.size()) {
+    // Initialize buffers list with empty buffer to allow for non-const tensors.
+    if (buffers_.empty()) {
+      buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector({})));
+    }
+
+    // Add data as a Buffer to buffers list.
+    buffer_id = buffers_.size();
+    auto data_buffer =
+        builder_.CreateVector(reinterpret_cast<const uint8_t*>(data.begin()),
+                              sizeof(int) * data.size());
+    buffers_.push_back(CreateBuffer(builder_, data_buffer));
+  }
+
+  tensors_.push_back(CreateTensor(builder_, builder_.CreateVector<int>(t.shape),
+                                  t.type, /*buffer=*/buffer_id,
                                   /*name=*/0, q_params));
 
   tensor_data_[id] = t;
@@ -88,7 +103,15 @@ int SingleOpModel::AddTensor(TensorData t) {
 }
 
 int SingleOpModel::AddInput(const TensorData& t) {
-  int id = AddTensor(t);
+  int id = AddTensor(t, {});
+  inputs_.push_back(id);
+  return id;
+}
+
+int SingleOpModel::AddConstInput(TensorType type,
+                                 std::initializer_list<int> data,
+                                 std::initializer_list<int> shape) {
+  int id = AddTensor(TensorData{type, shape}, data);
   inputs_.push_back(id);
   return id;
 }
@@ -100,7 +123,7 @@ int SingleOpModel::AddNullInput() {
 }
 
 int SingleOpModel::AddOutput(const TensorData& t) {
-  int id = AddTensor(t);
+  int id = AddTensor(t, {});
   outputs_.push_back(id);
   return id;
 }
@@ -142,8 +165,7 @@ void SingleOpModel::BuildInterpreter(
   subgraphs.push_back(subgraph);
   auto subgraphs_flatbuffer = builder_.CreateVector(subgraphs);
 
-  std::vector<flatbuffers::Offset<Buffer>> buffers_vec;
-  auto buffers = builder_.CreateVector(buffers_vec);
+  auto buffers = builder_.CreateVector(buffers_);
   auto description = builder_.CreateString("programmatic model");
   builder_.Finish(CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
                               subgraphs_flatbuffer, description, buffers));
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
index 531c1366a8..b9c0ba8f47 100644
--- a/tensorflow/contrib/lite/kernels/test_util.h
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -98,6 +98,10 @@ class SingleOpModel {
   int AddInput(TensorType type) { return AddInput(TensorData{type}); }
   int AddInput(const TensorData& t);
 
+  // Add a Tensor containing const data and return the tensor id.
+  int AddConstInput(TensorType type, std::initializer_list<int> data,
+                    std::initializer_list<int> shape);
+
   // Add a null input tensor (optional input) and return kOptionalTensor.
   int AddNullInput();
 
@@ -181,7 +185,7 @@ class SingleOpModel {
   std::unique_ptr<tflite::Interpreter> interpreter_;
 
  private:
-  int AddTensor(TensorData t);
+  int AddTensor(TensorData t, std::initializer_list<int> data);
 
   std::map<int, TensorData> tensor_data_;
   std::vector<int32_t> inputs_;
@@ -189,6 +193,7 @@ class SingleOpModel {
   std::vector<flatbuffers::Offset<Tensor>> tensors_;
   std::vector<flatbuffers::Offset<OperatorCode>> opcodes_;
   std::vector<flatbuffers::Offset<Operator>> operators_;
+  std::vector<flatbuffers::Offset<Buffer>> buffers_;
   std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_;
 };
 
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index a01b74f9da..95949be9e6 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -486,25 +486,6 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       break;
     }
     case BuiltinOperator_PAD: {
-      auto* params = MallocPOD<TfLitePadParams>();
-      if (auto* schema_params = op->builtin_options_as_PadOptions()) {
-        auto* before_padding = schema_params->before_padding();
-        FlatBufferIntVectorToArray(sizeof(params->before_padding),
-                                   before_padding, params->before_padding,
-                                   error_reporter);
-
-        auto* after_padding = schema_params->after_padding();
-        FlatBufferIntVectorToArray(sizeof(params->after_padding), after_padding,
-                                   params->after_padding, error_reporter);
-
-        if (before_padding->Length() != after_padding->Length()) {
-          error_reporter->Report(
-              "Before padding and after padding arrays need to contain the "
-              "same number of dimensions.\n");
-        }
-        params->num_dimensions = after_padding->Length();
-      }
-      builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_RESHAPE: {
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 2172135f49..ec202cd407 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -275,8 +275,6 @@ table CallOptions {
 }
 
 table PadOptions {
-  before_padding:[int];
-  after_padding:[int];
 }
 
 table ReshapeOptions {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index b756891f66..c04a73a2bf 100644
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -2657,26 +2657,13 @@ flatbuffers::Offset<CallOptions> CreateCallOptions(
 
 struct PadOptionsT : public flatbuffers::NativeTable {
   typedef PadOptions TableType;
-  std::vector<int32_t> before_padding;
-  std::vector<int32_t> after_padding;
   PadOptionsT() {}
 };
 
 struct PadOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef PadOptionsT NativeTableType;
-  enum { VT_BEFORE_PADDING = 4, VT_AFTER_PADDING = 6 };
-  const flatbuffers::Vector<int32_t> *before_padding() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BEFORE_PADDING);
-  }
-  const flatbuffers::Vector<int32_t> *after_padding() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_AFTER_PADDING);
-  }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_BEFORE_PADDING) &&
-           verifier.Verify(before_padding()) &&
-           VerifyOffset(verifier, VT_AFTER_PADDING) &&
-           verifier.Verify(after_padding()) && verifier.EndTable();
+    return VerifyTableStart(verifier) && verifier.EndTable();
   }
   PadOptionsT *UnPack(
       const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2691,14 +2678,6 @@ struct PadOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
 struct PadOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_before_padding(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_padding) {
-    fbb_.AddOffset(PadOptions::VT_BEFORE_PADDING, before_padding);
-  }
-  void add_after_padding(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_padding) {
-    fbb_.AddOffset(PadOptions::VT_AFTER_PADDING, after_padding);
-  }
   explicit PadOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
       : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2712,24 +2691,11 @@ struct PadOptionsBuilder {
 };
 
 inline flatbuffers::Offset<PadOptions> CreatePadOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_padding = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_padding = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb) {
   PadOptionsBuilder builder_(_fbb);
-  builder_.add_after_padding(after_padding);
-  builder_.add_before_padding(before_padding);
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<PadOptions> CreatePadOptionsDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int32_t> *before_padding = nullptr,
-    const std::vector<int32_t> *after_padding = nullptr) {
-  return tflite::CreatePadOptions(
-      _fbb, before_padding ? _fbb.CreateVector<int32_t>(*before_padding) : 0,
-      after_padding ? _fbb.CreateVector<int32_t>(*after_padding) : 0);
-}
-
 flatbuffers::Offset<PadOptions> CreatePadOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -5572,24 +5538,6 @@ inline void PadOptions::UnPackTo(
     PadOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  {
-    auto _e = before_padding();
-    if (_e) {
-      _o->before_padding.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->before_padding[_i] = _e->Get(_i);
-      }
-    }
-  };
-  {
-    auto _e = after_padding();
-    if (_e) {
-      _o->after_padding.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->after_padding[_i] = _e->Get(_i);
-      }
-    }
-  };
 }
 
 inline flatbuffers::Offset<PadOptions> PadOptions::Pack(
@@ -5609,11 +5557,7 @@ inline flatbuffers::Offset<PadOptions> CreatePadOptions(
     const flatbuffers::rehasher_function_t *__rehasher;
   } _va = {&_fbb, _o, _rehasher};
   (void)_va;
-  auto _before_padding =
-      _o->before_padding.size() ? _fbb.CreateVector(_o->before_padding) : 0;
-  auto _after_padding =
-      _o->after_padding.size() ? _fbb.CreateVector(_o->after_padding) : 0;
-  return tflite::CreatePadOptions(_fbb, _before_padding, _after_padding);
+  return tflite::CreatePadOptions(_fbb);
 }
 
 inline ReshapeOptionsT *ReshapeOptions::UnPack(
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index c225cd4f00..a639351657 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1141,28 +1141,43 @@ def make_pad_tests(zip_path):
           "input_shape": [[1, 1, 2, 1], [2, 1, 1, 1]],
           "paddings": [[[0, 0], [0, 1], [2, 3], [0, 0]], [[0, 1], [0, 0],
                                                           [0, 0], [2, 3]]],
+          "constant_paddings": [True, False],
       },
       # Non-4D use case.
       {
           "dtype": [tf.int32, tf.int64, tf.float32],
           "input_shape": [[1, 2], [0, 1, 2]],
           "paddings": [[[0, 1], [2, 3]]],
+          "constant_paddings": [True, False],
       },
   ]
 
   def build_graph(parameters):
+    """Build a pad graph given `parameters`."""
     input_tensor = tf.placeholder(
         dtype=parameters["dtype"],
         name="input",
         shape=parameters["input_shape"])
-    out = tf.pad(input_tensor, paddings=parameters["paddings"])
-    return [input_tensor], [out]
+
+    # Get paddings as either a placeholder or constants.
+    if parameters["constant_paddings"]:
+      paddings = parameters["paddings"]
+      input_tensors = [input_tensor]
+    else:
+      shape = [len(parameters["paddings"]), 2]
+      paddings = tf.placeholder(dtype=tf.int32, name="padding", shape=shape)
+      input_tensors = [input_tensor, paddings]
+
+    out = tf.pad(input_tensor, paddings=paddings)
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(parameters["dtype"],
-                                      parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_paddings"]:
+      values.append(np.array(parameters["paddings"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 36aa09090b..41652a07d2 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -48,50 +48,51 @@ tensorflow::Env* env = tensorflow::Env::Default();
 // TODO(ahentz): make sure we clean this list up frequently.
 std::map<string, string> kBrokenTests = {
     // Add doesn't support broadcasting.
-    {R"(adda.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
-    {R"(mula.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
-    {R"(diva.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
-    {R"(suba.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
+    {R"(^\/adda.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
+    {R"(^\/mula.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
+    {R"(^\/diva.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
+    {R"(^\/suba.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
 
     // Add only supports float32. (and "constant" tests use Add)
-    {R"(adda.*int32)", "68808744"},
-    {R"(constant.*int32)", "68808744"},
-    {R"(mul.*int32)", "68808744"},
-    {R"(div.*int32)", "68808744"},
-    {R"(sub.*int32)", "68808744"},
+    {R"(^\/adda.*int32)", "68808744"},
+    {R"(^\/constant.*int32)", "68808744"},
+    {R"(^\/mul.*int32)", "68808744"},
+    {R"(^\/div.*int32)", "68808744"},
+    {R"(^\/sub.*int32)", "68808744"},
 
     // Pad only supports 4D tensors.
-    {R"(paddtype=.*,input_shape=\[.,.\],paddings=\[\[.,.\],\[.,.\]\])",
+    {R"(^\/pad.*,input_shape=\[.,.\],paddings=\[\[.,.\],\[.,.\]\])",
      "70527055"},
 
     // L2Norm only supports tensors with 4D or fewer.
-    {R"(l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"},
+    {R"(^\/l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"},
 
     // SpaceToBatch only supports 4D tensors.
-    {R"(space_to_batch_nd.*input_shape=\[1,4,4,4,1,1\])", "70848787"},
+    {R"(^\/space_to_batch_nd.*input_shape=\[1,4,4,4,1,1\])", "70848787"},
 
     // L2Norm only works for dim=-1.
-    {R"(l2normdim=-2,epsilon=.*,input_shape=\[.,.\])", "67963812"},
-    {R"(l2normdim=0,epsilon=.*,input_shape=\[.,.\])", "67963812"},
-    {R"(l2normdim=-2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(l2normdim=-2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(l2normdim=2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(l2normdim=2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(l2normdim=0,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(l2normdim=0,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(l2normdim=1,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(l2normdim=1,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(l2normdim=\[2,3\],epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(l2normdim=\[2,3\],epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2normdim=-2,epsilon=.*,input_shape=\[.,.\])", "67963812"},
+    {R"(^\/l2normdim=0,epsilon=.*,input_shape=\[.,.\])", "67963812"},
+    {R"(^\/l2normdim=-2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2normdim=-2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2normdim=2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2normdim=2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2normdim=0,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2normdim=0,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2normdim=1,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2normdim=1,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2normdim=\[2,3\],epsilon=.*,input_shape=\[3,15,14,3\])",
+     "67963812"},
+    {R"(^\/l2normdim=\[2,3\],epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
 
     // ResizeBilinear looks completely incompatible with Tensorflow
-    {R"(resize_bilinear.*dtype=tf.int32)", "72401107"},
-    {R"(resize_bilinearalign_corners=True,.*,size=\[2,2\])", "72401483"},
-    {R"(resize_bilinearalign_corners=True,.*,size=\[4,3\])", "72401483"},
-    {R"(resize_bilinearalign_corners=True,.*,size=\[5,6\])", "72401483"},
+    {R"(^\/resize_bilinear.*dtype=tf.int32)", "72401107"},
+    {R"(^\/resize_bilinearalign_corners=True,.*,size=\[2,2\])", "72401483"},
+    {R"(^\/resize_bilinearalign_corners=True,.*,size=\[4,3\])", "72401483"},
+    {R"(^\/resize_bilinearalign_corners=True,.*,size=\[5,6\])", "72401483"},
 
     // Transpose only supports 1D-4D input tensors.
-    {R"(transposedtype=.*,input_shape=\[.,.,.,.,.\],perm=.*)", "71545879"},
+    {R"(^\/transposedtype=.*,input_shape=\[.,.,.,.,.\],perm=.*)", "71545879"},
 };
 
 // Allows test data to be unzipped into a temporary directory and makes
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 0c2b570aad..d75d1fcc5b 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -474,19 +474,11 @@ class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    auto before_padding = builder->CreateVector(op.left_padding);
-    auto after_padding = builder->CreateVector(op.right_padding);
-    return ::tflite::CreatePadOptions(*builder, before_padding, after_padding);
+    return ::tflite::CreatePadOptions(*builder);
   }
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {
-    op->left_padding.insert(op->left_padding.end(),
-                            options.before_padding()->begin(),
-                            options.before_padding()->end());
-    op->right_padding.insert(op->right_padding.end(),
-                             options.after_padding()->begin(),
-                             options.after_padding()->end());
   }
 };
 
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index de79c70e1b..9036a16d1c 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -258,16 +258,6 @@ TEST_F(OperatorTest, BuiltinMaxPool) {
   EXPECT_EQ(op.kheight, output_toco_op->kheight);
 }
 
-TEST_F(OperatorTest, BuiltinPad) {
-  PadOperator op;
-  op.left_padding = {1, 2, 3};
-  op.right_padding = {1, 2, 3};
-  auto output_toco_op =
-      SerializeAndDeserialize(GetOperator("PAD", OperatorType::kPad), op);
-  EXPECT_EQ(op.left_padding, output_toco_op->left_padding);
-  EXPECT_EQ(op.right_padding, output_toco_op->right_padding);
-}
-
 TEST_F(OperatorTest, BuiltinReshape) {
   TensorFlowReshapeOperator op;
   op.shape = {1, 2, 4, 5, 8};
-- 
GitLab


From 7ed85251e0b17f04a073100717b97a36ebc1b61d Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 24 Jan 2018 13:31:33 -0800
Subject: [PATCH 101/966] Update docs and test cases for missing types in
 `tf.zeros_like` (#15934)

* Update docs for missing types in `tf.zeros_like`

This fix update docs for missing types in `tf.zeros_like`
where `half` and `string` are supported but
not listed.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Also enable test cases for missing types to `tf.zeros_like`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/constant_op_test.py | 14 ++++++++------
 tensorflow/python/ops/array_ops.py                 |  6 +++---
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 030c690167..576bb68ba4 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -454,18 +454,20 @@ class ZerosLikeTest(test.TestCase):
 
   def testZerosLikeCPU(self):
     for dtype in [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int8,
-        dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16, dtypes_lib.int32,
-        dtypes_lib.int64, dtypes_lib.bool, dtypes_lib.complex64,
-        dtypes_lib.complex128, dtypes_lib.string
+        dtypes_lib.half, dtypes_lib.float32, dtypes_lib.float64,
+        dtypes_lib.int8, dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16,
+        dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.bool,
+        dtypes_lib.complex64, dtypes_lib.complex128, dtypes_lib.string
     ]:
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=False)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=False)
 
   def testZerosLikeGPU(self):
     for dtype in [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int32,
-        dtypes_lib.bool, dtypes_lib.int64, dtypes_lib.string
+        dtypes_lib.half, dtypes_lib.float32, dtypes_lib.float64,
+        dtypes_lib.int32, dtypes_lib.int64,
+        dtypes_lib.complex64, dtypes_lib.complex128,
+        dtypes_lib.bool
     ]:
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=True)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=True)
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 6210204d80..24a0c18619 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1588,9 +1588,9 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
 
   Args:
     tensor: A `Tensor`.
-    dtype: A type for the returned `Tensor`. Must be `float32`, `float64`,
-      `int8`, `uint8`, `int16`, `uint16`, int32`, `int64`,
-      `complex64`, `complex128` or `bool`.
+    dtype: A type for the returned `Tensor`. Must be `float16`, `float32`,
+      `float64`, `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
+      `complex64`, `complex128`, `bool` or `string`.
     name: A name for the operation (optional).
     optimize: if true, attempt to statically determine the shape of 'tensor'
     and encode it as a constant.
-- 
GitLab


From 64cbff313353c248241b7a447f5ef57838f116fa Mon Sep 17 00:00:00 2001
From: Darjan Salaj <salaj.au@gmail.com>
Date: Wed, 24 Jan 2018 22:31:51 +0100
Subject: [PATCH 102/966] Documentation fix to contrib.signals (#15877)

* Documentation fix to contrib.signals

Fixed very confusing little typo

* Update contrib.signal.md

* Update contrib.signal.md
---
 tensorflow/docs_src/api_guides/python/contrib.signal.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/docs_src/api_guides/python/contrib.signal.md b/tensorflow/docs_src/api_guides/python/contrib.signal.md
index 85ef3ad134..0f7690f80a 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.signal.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.signal.md
@@ -28,14 +28,14 @@ The `axis` parameter to @{tf.contrib.signal.frame} allows you to frame tensors
 with inner structure (e.g. a spectrogram):
 
 ```python
-# `magnitude_spectrograms` is a [batch_size, ?, 127] tensor of spectrograms. We
+# `magnitude_spectrograms` is a [batch_size, ?, 129] tensor of spectrograms. We
 # would like to produce overlapping fixed-size spectrogram patches; for example,
 # for use in a situation where a fixed size input is needed.
 magnitude_spectrograms = tf.abs(tf.contrib.signal.stft(
     signals, frame_length=256, frame_step=64, fft_length=256))
 
-# `spectrogram_patches` is a [batch_size, ?, 64, 127] tensor containing a
-# variable number of [64, 127] spectrogram patches per batch item.
+# `spectrogram_patches` is a [batch_size, ?, 64, 129] tensor containing a
+# variable number of [64, 129] spectrogram patches per batch item.
 spectrogram_patches = tf.contrib.signal.frame(
     magnitude_spectrograms, frame_length=64, frame_step=16, axis=1)
 ```
-- 
GitLab


From a651cd8a44426d2ce6cf62138ea5f18e922656e9 Mon Sep 17 00:00:00 2001
From: Scott Tseng <scott.tseng@kikatech.com>
Date: Thu, 25 Jan 2018 05:41:55 +0800
Subject: [PATCH 103/966] Fix a buildscript error which prevents macro being
 used by other workspaces (#16241)

Converting to label and back to string will prepend current workspace,
e.g: "//tensorflow" -> "@org_tensorflow//tensorflow". Projects who
integrate TensorFlow from another Bazel workspace, and use macros like
tflite_copts() need the prefix.

For more info please take a look at: https://github.com/bazelbuild/bazel/issues/1551
---
 tensorflow/contrib/lite/build_def.bzl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 0a097d5a69..19829e4991 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -5,25 +5,25 @@ def tflite_copts():
   copts = [
       "-DFARMHASH_NO_CXX_STRING",
   ] + select({
-          "//tensorflow:android_arm64": [
+          str(Label("//tensorflow:android_arm64")): [
               "-std=c++11",
               "-O3",
           ],
-          "//tensorflow:android_arm": [
+          str(Label("//tensorflow:android_arm")): [
               "-mfpu=neon",
               "-mfloat-abi=softfp",
               "-std=c++11",
               "-O3",
           ],
-          "//tensorflow:android_x86": [
+          str(Label("//tensorflow:android_x86")): [
               "-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK",
           ],
-          "//tensorflow:ios_x86_64": [
+          str(Label("//tensorflow:ios_x86_64")): [
               "-msse4.1",
           ],
           "//conditions:default": [],
   }) + select({
-      "//tensorflow:with_default_optimizations": [],
+      str(Label("//tensorflow:with_default_optimizations")): [],
       "//conditions:default": ["-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK"],
   })
 
-- 
GitLab


From aa53e0f5411a844bf4b7e0ea832c18485697d3e0 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 24 Jan 2018 13:39:02 -0800
Subject: [PATCH 104/966] [TF:XLA] Bump open source llvm revision to r323309

PiperOrigin-RevId: 183133556
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 79e14e0e92..9145d9e58a 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -475,11 +475,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/95461910b8134c70de1d94f6854b64cfff3615d9.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/95461910b8134c70de1d94f6854b64cfff3615d9.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/11a2ca6eea8a7fe240a14c0c35fd2017341279be.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/11a2ca6eea8a7fe240a14c0c35fd2017341279be.tar.gz",
       ],
-      sha256 = "2e16617820ec59e3b57b6800aab00238c2e5955182b56a0fbc03bde12b5f2d7b",
-      strip_prefix = "llvm-95461910b8134c70de1d94f6854b64cfff3615d9",
+      sha256 = "b5429ccf8d57273cb8489714f728c997cd720ec66fc2c0292422ab8f0e729ce0",
+      strip_prefix = "llvm-11a2ca6eea8a7fe240a14c0c35fd2017341279be",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
   )
 
-- 
GitLab


From a8343c49796018234b692831050865509c6e5398 Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Wed, 24 Jan 2018 13:50:09 -0800
Subject: [PATCH 105/966] fix lite problem

---
 tensorflow/contrib/lite/model.cc | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index a01b74f9da..409259b14f 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -476,13 +476,6 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       break;
     }
     case BuiltinOperator_RESIZE_BILINEAR: {
-      auto* params = MallocPOD<TfLiteResizeBilinearParams>();
-      if (auto* schema_params =
-              op->builtin_options_as_ResizeBilinearOptions()) {
-        params->new_height = schema_params->new_height();
-        params->new_width = schema_params->new_width();
-      }
-      builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_PAD: {
-- 
GitLab


From f7b60fd704ced02f250b2dd0da436cf0ad9c9a8c Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 24 Jan 2018 14:02:14 -0800
Subject: [PATCH 106/966] Silent copies for int32 tensors in TFE.

Fixes #16106

PiperOrigin-RevId: 183137298
---
 tensorflow/c/eager/c_api.cc              |  9 ++++++++-
 tensorflow/c/eager/c_api.h               |  6 ++++--
 tensorflow/c/eager/c_api_internal.h      |  3 ++-
 tensorflow/python/eager/context.py       |  4 ++++
 tensorflow/python/eager/core_test.py     |  9 +++++++++
 tensorflow/python/eager/function_test.py |  5 +++--
 tensorflow/python/eager/ops_test.py      | 10 ----------
 tensorflow/python/framework/ops.py       |  8 +++++---
 tensorflow/python/pywrap_tfe.i           |  1 +
 9 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index ecba8c0109..a76c8f5ec0 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -453,9 +453,16 @@ tensorflow::Status ValidateInputTypeAndPlacement(
         op->input_devices[i] == nullptr ? host_device : op->input_devices[i];
     if (expected_device != actual_device) {
       switch (TFE_ContextGetDevicePlacementPolicy(ctx)) {
-        case TFE_DEVICE_PLACEMENT_EXPLICIT:
+        case TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32:
           // TODO(xpan): See if we could bubble python related error up
           // to python level.
+          if (op->inputs[i].dtype() == tensorflow::DT_INT32) {
+            // Note: enabling silent copies of int32 tensors to match behavior
+            // of graph mode.
+            break;
+          }
+          TF_FALLTHROUGH_INTENDED;
+        case TFE_DEVICE_PLACEMENT_EXPLICIT:
           return tensorflow::errors::InvalidArgument(
               "Tensors on conflicting devices:"
               " cannot compute ",
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 26e6d94753..387de07894 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -61,14 +61,16 @@ TF_CAPI_EXPORT extern void TFE_ContextOptionsSetConfig(
 // Controls how to act when we try to run an operation on a given device but
 // some input tensors are not on that device.
 typedef enum TFE_ContextDevicePlacementPolicy {
-  // The default: running operations with input tensors on the wrong device will
-  // fail.
+  // Running operations with input tensors on the wrong device will fail.
   TFE_DEVICE_PLACEMENT_EXPLICIT = 0,
   // Copy the tensor to the right device but log a warning.
   TFE_DEVICE_PLACEMENT_WARN = 1,
   // Silently copy the tensor, which has a performance cost since the
   // operation will be blocked till the copy completes.
   TFE_DEVICE_PLACEMENT_SILENT = 2,
+  // Default placement policy which silently copies int32 tensors but not other
+  // dtypes.
+  TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
 } TFE_ContextDevicePlacementPolicy;
 
 TF_CAPI_EXPORT extern void TFE_ContextOptionsSetDevicePlacementPolicy(
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index f40c41576c..a6f76c732f 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -38,7 +38,8 @@ limitations under the License.
 
 struct TFE_ContextOptions {
   TF_SessionOptions session_options;
-  TFE_ContextDevicePlacementPolicy policy{TFE_DEVICE_PLACEMENT_EXPLICIT};
+  TFE_ContextDevicePlacementPolicy policy{
+      TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32};
 };
 
 struct TFE_Context {
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 0569791352..b6c7d82323 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -49,6 +49,8 @@ _MAXINT32 = 2**31 - 1
 DEVICE_PLACEMENT_EXPLICIT = pywrap_tensorflow.TFE_DEVICE_PLACEMENT_EXPLICIT
 DEVICE_PLACEMENT_WARN = pywrap_tensorflow.TFE_DEVICE_PLACEMENT_WARN
 DEVICE_PLACEMENT_SILENT = pywrap_tensorflow.TFE_DEVICE_PLACEMENT_SILENT
+DEVICE_PLACEMENT_SILENT_FOR_INT32 = (
+    pywrap_tensorflow.TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32)
 
 
 # TODO(agarwal): better name ?
@@ -122,6 +124,8 @@ class Context(object):
            right device but raises a warning.
          tfe.DEVICE_PLACEMENT_SILENT: silently copies the tensors. This might
            hide performance problems.
+         tfe.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies int32 tensors,
+           raising errors on the other ones.
     """
     self._eager_context = _EagerContext()
     self._context_handle = None
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index 82438ecadb..ee3c10633e 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import nn_ops
 
 
 def execute(op_name, num_outputs, inputs, attrs=None):
@@ -112,6 +113,14 @@ class TFETest(test_util.TensorFlowTestCase):
     # is enabled; the stack entry should reflect this fact.
     self.assertFalse(stack_entry.is_building_function)
 
+  def testInt32GPU(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+    with ops.device('gpu:0'):
+      xent = nn_ops.sparse_softmax_cross_entropy_with_logits(
+          logits=[[0.0, 0.0]], labels=[0])
+    self.assertAllClose(xent, [0.69314718])
+
   def _runInThread(self, target, args):
     t = threading.Thread(target=target, args=args)
     try:
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 9b08a35ff1..0babc29f17 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -400,10 +400,11 @@ class FunctionTest(test.TestCase):
 
     # The Reshape op requires the shape tensor to be placed in host memory.
     reshape = function.defun(array_ops.reshape)
-    value = constant_op.constant([1., 2.]).gpu()
+    value = constant_op.constant([1., 2.])
     shape = constant_op.constant([2, 1]).gpu()
     with self.assertRaises(errors.InvalidArgumentError):
-      reshape(value, shape)
+      with ops.device('gpu:0'):
+        reshape(value, shape)
 
   def testDifferentiableFunctionNoneOutputs(self):
 
diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py
index f8c5037dcf..f2e70341d9 100644
--- a/tensorflow/python/eager/ops_test.py
+++ b/tensorflow/python/eager/ops_test.py
@@ -24,7 +24,6 @@ from tensorflow.python.eager import execute
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
@@ -246,15 +245,6 @@ class OpsTest(test_util.TensorFlowTestCase):
     reshaped = array_ops.reshape(value, shape)
     self.assertAllEqual([[1], [2]], reshaped.cpu())
 
-    # And if the shape is in device memory, it should complain
-    # TODO(ashankar): Revisit this - perhaps instead of complaining,
-    # it should implicitly copy the tensor to host memory?
-    with self.assertRaisesRegexp(
-        errors.InvalidArgumentError,
-        'cannot compute Reshape as input #1 was expected to be on.*'
-        'using.*DEVICE_PLACEMENT_SILENT'):
-      reshaped = array_ops.reshape(value, shape.gpu())
-
   def testInt64(self):
     # Fill requires the first input to be an int32 tensor.
     self.assertAllEqual(
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 6c4563f769..a1c2e07e94 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -5048,6 +5048,8 @@ def enable_eager_execution(config=None, device_policy=None):
          right device but raises a warning.
        tfe.DEVICE_PLACEMENT_SILENT: silently copies the tensors. This might
          hide performance problems.
+       tfe.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies int32 tensors,
+         raising errors on the other ones.
 
   Raises:
     ValueError: If trying to create a context after using graph operations
@@ -5059,10 +5061,10 @@ def enable_eager_execution(config=None, device_policy=None):
         "config must be a tf.ConfigProto, but got %s" % type(config))
   if device_policy not in (None, context.DEVICE_PLACEMENT_EXPLICIT,
                            context.DEVICE_PLACEMENT_WARN,
-                           context.DEVICE_PLACEMENT_SILENT):
+                           context.DEVICE_PLACEMENT_SILENT,
+                           context.DEVICE_PLACEMENT_SILENT_FOR_INT32):
     raise ValueError(
-        "device_policy must be one of None, tfe.DEVICE_PLACEMENT_EXPLICIT, "
-        "tfe.DEVICE_PLACEMENT_WARN, tfe.DEVICE_PLACEMENT_SILENT"
+        "device_policy must be one of None, tfe.DEVICE_PLACEMENT_*"
     )
   # pylint: disable=protected-access
   if context._default_mode == context.GRAPH_MODE:
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index ed7e4b17ad..3f25311a83 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -121,6 +121,7 @@ limitations under the License.
 %rename("%s") TFE_DEVICE_PLACEMENT_EXPLICIT;
 %rename("%s") TFE_DEVICE_PLACEMENT_WARN;
 %rename("%s") TFE_DEVICE_PLACEMENT_SILENT;
+%rename("%s") TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32;
 
 %include "tensorflow/c/eager/c_api.h"
 
-- 
GitLab


From 3ec5b5855eb4dc781f471df2f8c981b1f0951ff2 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 24 Jan 2018 14:03:09 -0800
Subject: [PATCH 107/966] [Eager mode] Add an Eager-optimized version of the
 IteratorGetNext op.

Since in eager mode the kernel execution happens on the calling thread
(which would be blocked anyway), we can invoke the iterator
synchronously, and do not need to perform a context switch to and from
a background thread.

Improves the latency benchmarks in
//third_party/tensorflow/contrib/eager/python:datasets_test by
approximately 7us to 13us per element.

PiperOrigin-RevId: 183137488
---
 tensorflow/contrib/eager/python/datasets.py   |  7 +-
 .../api_def_IteratorGetNextSync.pbtxt         | 10 +++
 tensorflow/core/kernels/data/iterator_ops.cc  | 39 ++++++++++-
 tensorflow/core/ops/dataset_ops.cc            | 68 +++++++++----------
 4 files changed, 85 insertions(+), 39 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_IteratorGetNextSync.pbtxt

diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index a7f50c13bb..544a3eafc0 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -141,7 +141,12 @@ class Iterator(object):
         # TODO(ashankar): Consider removing this ops.device() contextmanager
         # and instead mimic ops placement in graphs: Operations on resource
         # handles execute on the same device as where the resource is placed.
-        ret = gen_dataset_ops.iterator_get_next(
+        # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
+        # because in eager mode this code will run synchronously on the calling
+        # thread. Therefore we do not need to make a defensive context switch
+        # to a background thread, and can achieve a small constant performance
+        # boost by invoking the iterator synchronously.
+        ret = gen_dataset_ops.iterator_get_next_sync(
             self._resource,
             output_types=self._flat_output_types,
             output_shapes=self._flat_output_shapes)
diff --git a/tensorflow/core/api_def/base_api/api_def_IteratorGetNextSync.pbtxt b/tensorflow/core/api_def/base_api/api_def_IteratorGetNextSync.pbtxt
new file mode 100644
index 0000000000..641679e8ea
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IteratorGetNextSync.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "IteratorGetNextSync"
+  summary: "Gets the next output from the given iterator."
+  description: <<END
+This operation is a synchronous version IteratorGetNext. It should only be used
+in situations where the iterator does not block the calling thread, or where
+the calling thread is not a member of the thread pool used to execute parallel
+operations (e.g. in eager mode).
+END
+}
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 244df137cb..56044a3d41 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -829,8 +829,8 @@ class IteratorGetNextOp : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     IteratorResource* iterator;
-    OP_REQUIRES_OK(ctx,
-                   LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
+    OP_REQUIRES_OK_ASYNC(
+        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
     // The call to `iterator->GetNext()` may block and depend on an
     // inter-op thread pool thread, so we issue the call from the
     // owned thread pool.
@@ -870,6 +870,39 @@ class IteratorGetNextOp : public AsyncOpKernel {
   std::unique_ptr<thread::ThreadPool> thread_pool_;
 };
 
+class IteratorGetNextSyncOp : public OpKernel {
+ public:
+  explicit IteratorGetNextSyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    IteratorResource* iterator;
+    OP_REQUIRES_OK(ctx,
+                   LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
+    core::ScopedUnref unref_iterator(iterator);
+
+    std::vector<Tensor> components;
+    bool end_of_sequence = false;
+
+    IteratorContext::Params params;
+    params.env = ctx->env();
+    params.stats_aggregator_getter = [iterator]() {
+      return iterator->stats_aggregator();
+    };
+    params.runner = *(ctx->runner());
+    params.function_library = iterator->function_library();
+    IteratorContext iter_ctx(std::move(params));
+
+    OP_REQUIRES_OK(ctx,
+                   iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
+    OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence"));
+
+    for (int i = 0; i < components.size(); ++i) {
+      // TODO(mrry): Check that the shapes match the shape attrs.
+      ctx->set_output(i, components[i]);
+    }
+  }
+};
+
 class IteratorToStringHandleOp : public OpKernel {
  public:
   explicit IteratorToStringHandleOp(OpKernelConstruction* ctx)
@@ -1033,6 +1066,8 @@ REGISTER_KERNEL_BUILDER(Name("OneShotIterator").Device(DEVICE_CPU),
                         OneShotIteratorOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_CPU),
                         IteratorGetNextOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE_CPU),
+                        IteratorGetNextSyncOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle").Device(DEVICE_CPU),
                         IteratorToStringHandleOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandle").Device(DEVICE_CPU),
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index b86816bb54..2cae814eab 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -409,53 +409,49 @@ REGISTER_OP("OneShotIterator")
     .SetIsStateful()
     .SetShapeFn(shape_inference::ScalarShape);
 
+namespace {
+
+Status IteratorGetNextShapeFn(shape_inference::InferenceContext* c) {
+  shape_inference::ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+  std::vector<PartialTensorShape> output_shapes;
+  TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+  if (output_shapes.size() != c->num_outputs()) {
+    return errors::InvalidArgument(
+        "`output_shapes` must be the same length as `output_types` (",
+        output_shapes.size(), " vs. ", c->num_outputs());
+  }
+  for (size_t i = 0; i < output_shapes.size(); ++i) {
+    shape_inference::ShapeHandle output_shape_handle;
+    TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+        output_shapes[i], &output_shape_handle));
+    c->set_output(static_cast<int>(i), output_shape_handle);
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
 REGISTER_OP("IteratorGetNext")
     .Input("iterator: resource")
     .Output("components: output_types")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      std::vector<PartialTensorShape> output_shapes;
-      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
-      if (output_shapes.size() != c->num_outputs()) {
-        return errors::InvalidArgument(
-            "`output_shapes` must be the same length as `output_types` (",
-            output_shapes.size(), " vs. ", c->num_outputs());
-      }
-      for (size_t i = 0; i < output_shapes.size(); ++i) {
-        shape_inference::ShapeHandle output_shape_handle;
-        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
-            output_shapes[i], &output_shape_handle));
-        c->set_output(static_cast<int>(i), output_shape_handle);
-      }
-      return Status::OK();
-    });
+    .SetShapeFn(IteratorGetNextShapeFn);
+
+REGISTER_OP("IteratorGetNextSync")
+    .Input("iterator: resource")
+    .Output("components: output_types")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(IteratorGetNextShapeFn);
 
 REGISTER_OP("DatasetToSingleElement")
     .Input("dataset: variant")
     .Output("components: output_types")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      std::vector<PartialTensorShape> output_shapes;
-      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
-      if (output_shapes.size() != c->num_outputs()) {
-        return errors::InvalidArgument(
-            "`output_shapes` must be the same length as `output_types` (",
-            output_shapes.size(), " vs. ", c->num_outputs());
-      }
-      for (size_t i = 0; i < output_shapes.size(); ++i) {
-        shape_inference::ShapeHandle output_shape_handle;
-        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
-            output_shapes[i], &output_shape_handle));
-        c->set_output(static_cast<int>(i), output_shape_handle);
-      }
-      return Status::OK();
-    });
+    .SetShapeFn(IteratorGetNextShapeFn);
 
 REGISTER_OP("IteratorToStringHandle")
     .Input("resource_handle: resource")
-- 
GitLab


From 43b4de97147ee7d62f5161ea07c76700f2089e36 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 14:17:05 -0800
Subject: [PATCH 108/966] Adds loss_reduction argument to canned estimators.

PiperOrigin-RevId: 183139970
---
 tensorflow/python/estimator/BUILD             |  3 +++
 tensorflow/python/estimator/canned/dnn.py     | 16 ++++++++++++---
 .../estimator/canned/dnn_linear_combined.py   | 20 ++++++++++++++-----
 tensorflow/python/estimator/canned/linear.py  | 20 ++++++++++++++-----
 ...nsorflow.estimator.-d-n-n-classifier.pbtxt |  2 +-
 ...or.-d-n-n-linear-combined-classifier.pbtxt |  2 +-
 ...tor.-d-n-n-linear-combined-regressor.pbtxt |  2 +-
 ...ensorflow.estimator.-d-n-n-regressor.pbtxt |  2 +-
 ...sorflow.estimator.-linear-classifier.pbtxt |  2 +-
 ...nsorflow.estimator.-linear-regressor.pbtxt |  2 +-
 10 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 58d540b1ec..41f55b12af 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -267,6 +267,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "@six_archive//:six",
     ],
 )
@@ -356,6 +357,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "@six_archive//:six",
     ],
 )
@@ -680,6 +682,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index 0392ff9a71..ba96d738ae 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import training_util
 
@@ -280,6 +281,7 @@ class DNNClassifier(estimator.Estimator):
       input_layer_partitioner=None,
       config=None,
       warm_start_from=None,
+      loss_reduction=losses.Reduction.SUM,
   ):
     """Initializes a `DNNClassifier` instance.
 
@@ -323,15 +325,19 @@ class DNNClassifier(estimator.Estimator):
         string filepath is provided instead of a `WarmStartSettings`, then all
         weights are warm-started, and it is assumed that vocabularies and Tensor
         names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     """
     if n_classes == 2:
       head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
           weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     else:
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes, weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
       """Call the defined shared _dnn_model_fn and possibly warm-start."""
@@ -441,6 +447,7 @@ class DNNRegressor(estimator.Estimator):
       input_layer_partitioner=None,
       config=None,
       warm_start_from=None,
+      loss_reduction=losses.Reduction.SUM,
   ):
     """Initializes a `DNNRegressor` instance.
 
@@ -478,6 +485,8 @@ class DNNRegressor(estimator.Estimator):
         string filepath is provided instead of a `WarmStartSettings`, then all
         weights are warm-started, and it is assumed that vocabularies and Tensor
         names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     """
 
     def _model_fn(features, labels, mode, config):
@@ -488,7 +497,8 @@ class DNNRegressor(estimator.Estimator):
           mode=mode,
           head=head_lib.  # pylint: disable=protected-access
           _regression_head_with_mean_squared_error_loss(
-              label_dimension=label_dimension, weight_column=weight_column),
+              label_dimension=label_dimension, weight_column=weight_column,
+              loss_reduction=loss_reduction),
           hidden_units=hidden_units,
           feature_columns=tuple(feature_columns or []),
           optimizer=optimizer,
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index 1d06a54a32..d29c892662 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import sync_replicas_optimizer
 from tensorflow.python.training import training_util
@@ -309,7 +310,8 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
                label_vocabulary=None,
                input_layer_partitioner=None,
                config=None,
-               warm_start_from=None):
+               warm_start_from=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a DNNLinearCombinedClassifier instance.
 
     Args:
@@ -356,6 +358,8 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
         string filepath is provided instead of a `WarmStartSettings`, then all
         weights are warm-started, and it is assumed that vocabularies and Tensor
         names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
 
     Raises:
       ValueError: If both linear_feature_columns and dnn_features_columns are
@@ -371,12 +375,14 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
     if n_classes == 2:
       head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
           weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     else:
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes,
           weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
       """Call the _dnn_linear_combined_model_fn and possibly warm-start."""
@@ -490,7 +496,8 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
                weight_column=None,
                input_layer_partitioner=None,
                config=None,
-               warm_start_from=None):
+               warm_start_from=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a DNNLinearCombinedRegressor instance.
 
     Args:
@@ -531,6 +538,8 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
         string filepath is provided instead of a `WarmStartSettings`, then all
         weights are warm-started, and it is assumed that vocabularies and Tensor
         names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
 
     Raises:
       ValueError: If both linear_feature_columns and dnn_features_columns are
@@ -552,7 +561,8 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
           mode=mode,
           head=head_lib.  # pylint: disable=protected-access
           _regression_head_with_mean_squared_error_loss(
-              label_dimension=label_dimension, weight_column=weight_column),
+              label_dimension=label_dimension, weight_column=weight_column,
+              loss_reduction=loss_reduction),
           linear_feature_columns=linear_feature_columns,
           linear_optimizer=linear_optimizer,
           dnn_feature_columns=dnn_feature_columns,
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index 97cfd24a10..7a80dfacc2 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import ftrl
 from tensorflow.python.training import training_util
@@ -245,7 +246,8 @@ class LinearClassifier(estimator.Estimator):
                optimizer='Ftrl',
                config=None,
                partitioner=None,
-               warm_start_from=None):
+               warm_start_from=None,
+               loss_reduction=losses.Reduction.SUM):
     """Construct a `LinearClassifier` estimator object.
 
     Args:
@@ -282,6 +284,8 @@ class LinearClassifier(estimator.Estimator):
         string filepath is provided instead of a `WarmStartSettings`, then all
         weights and biases are warm-started, and it is assumed that vocabularies
         and Tensor names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
 
     Returns:
       A `LinearClassifier` estimator.
@@ -292,11 +296,13 @@ class LinearClassifier(estimator.Estimator):
     if n_classes == 2:
       head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
           weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     else:
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes, weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
       """Call the defined shared _linear_model_fn and possibly warm-start."""
@@ -388,7 +394,8 @@ class LinearRegressor(estimator.Estimator):
                optimizer='Ftrl',
                config=None,
                partitioner=None,
-               warm_start_from=None):
+               warm_start_from=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a `LinearRegressor` instance.
 
     Args:
@@ -417,9 +424,12 @@ class LinearRegressor(estimator.Estimator):
         string filepath is provided instead of a `WarmStartSettings`, then all
         weights and biases are warm-started, and it is assumed that vocabularies
         and Tensor names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     """
     head = head_lib._regression_head_with_mean_squared_error_loss(  # pylint: disable=protected-access
-        label_dimension=label_dimension, weight_column=weight_column)
+        label_dimension=label_dimension, weight_column=weight_column,
+        loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
       """Call the defined shared _linear_model_fn and possibly warm-start."""
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
index 46d5957057..efc441ae2f 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
index 439e87375b..20ce879870 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
index f79a8be3f6..73211aaf8b 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
index c466dcb4c2..27a159639d 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
index cb9e95588d..c45318b98a 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
index 637f19ba26..04a2aa080d 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
-- 
GitLab


From c80dd872bfb722b45eff48f3fdd6fc06c3a5aefb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 14:17:07 -0800
Subject: [PATCH 109/966] Expose shared_embedding_columns in tf.feature_column.

PiperOrigin-RevId: 183139975
---
 tensorflow/python/feature_column/BUILD        |  3 ++
 .../python/feature_column/feature_column.py   | 10 +++----
 .../feature_column/feature_column_lib.py      |  1 +
 .../feature_column/feature_column_test.py     | 29 +++++++++----------
 .../golden/tensorflow.feature_column.pbtxt    |  4 +++
 5 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index 76d44fc474..a758f8a4fc 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -85,6 +85,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:partitioned_variables",
@@ -93,6 +94,8 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:numpy_io",
     ],
 )
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index a7fe528ee1..7feb209cc4 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -657,11 +657,11 @@ def embedding_column(
       trainable=trainable)
 
 
-def _shared_embedding_columns(
+def shared_embedding_columns(
     categorical_columns, dimension, combiner='mean', initializer=None,
     shared_embedding_collection_name=None, ckpt_to_load_from=None,
     tensor_name_in_ckpt=None, max_norm=None, trainable=True):
-  """List of `_DenseColumn`s that convert from sparse, categorical input.
+  """List of dense columns that convert from sparse, categorical input.
 
   This is similar to `embedding_column`, except that that it produces a list of
   embedding columns that share the same embedding weights.
@@ -670,7 +670,7 @@ def _shared_embedding_columns(
   impression video IDs that share the same vocabulary), and you want to convert
   them to a dense representation (e.g., to feed to a DNN).
 
-  Inputs must be a list of `_CategoricalColumn` created by any of the
+  Inputs must be a list of categorical columns created by any of the
   `categorical_column_*` function. They must all be of the same type and have
   the same arguments except `key`. E.g. they can be
   categorical_column_with_vocabulary_file with the same vocabulary_file. Some or
@@ -714,7 +714,7 @@ def _shared_embedding_columns(
   ```
 
   Args:
-    categorical_columns: List of `_CategoricalColumn`s created by a
+    categorical_columns: List of categorical columns created by a
       `categorical_column_with_*` function. These columns produce the sparse IDs
       that are inputs to the embedding lookup. All columns must be of the same
       type and have the same arguments except `key`. E.g. they can be
@@ -744,7 +744,7 @@ def _shared_embedding_columns(
     trainable: Whether or not the embedding is trainable. Default is True.
 
   Returns:
-    A list of `_DenseColumn`s that converts from sparse input. The order of
+    A list of dense columns that converts from sparse input. The order of
     results follows the ordering of `categorical_columns`.
 
   Raises:
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
index 8a57986764..505a1408d2 100644
--- a/tensorflow/python/feature_column/feature_column_lib.py
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -29,6 +29,7 @@ _allowed_symbols = [
     'linear_model',
     'make_parse_example_spec',
     'embedding_column',
+    'shared_embedding_columns',
     'crossed_column',
     'numeric_column',
     'bucketized_column',
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index d78d74c8ca..6f366e7722 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -29,7 +29,6 @@ from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column as fc_lib
 from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.feature_column.feature_column import _CategoricalColumn
 from tensorflow.python.feature_column.feature_column import _DenseColumn
@@ -4151,7 +4150,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc_lib._shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
     self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
@@ -4197,7 +4196,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -4250,7 +4249,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    original_a, _ = fc_lib._shared_embedding_columns(
+    original_a, _ = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -4288,7 +4287,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
-      fc_lib._shared_embedding_columns(
+      fc.shared_embedding_columns(
           [categorical_column_a, categorical_column_b], dimension=2,
           initializer='not_fn')
 
@@ -4303,7 +4302,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         ValueError,
         'all categorical_columns must have the same type.*'
         '_IdentityCategoricalColumn.*_HashedCategoricalColumn'):
-      fc_lib._shared_embedding_columns(
+      fc.shared_embedding_columns(
           [categorical_column_a, categorical_column_b, categorical_column_c],
           dimension=2)
 
@@ -4316,11 +4315,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='bbb', num_buckets=3)
     weighted_categorical_column_b = fc.weighted_categorical_column(
         categorical_column_b, weight_feature_key='bbb_weights')
-    fc_lib._shared_embedding_columns(
+    fc.shared_embedding_columns(
         [weighted_categorical_column_a, categorical_column_b], dimension=2)
-    fc_lib._shared_embedding_columns(
+    fc.shared_embedding_columns(
         [categorical_column_a, weighted_categorical_column_b], dimension=2)
-    fc_lib._shared_embedding_columns(
+    fc.shared_embedding_columns(
         [weighted_categorical_column_a, weighted_categorical_column_b],
         dimension=2)
 
@@ -4329,7 +4328,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     b = fc.categorical_column_with_vocabulary_list(
         key='bbb', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_embedded, b_embedded = fc_lib._shared_embedding_columns(
+    a_embedded, b_embedded = fc.shared_embedding_columns(
         [a, b], dimension=2)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
@@ -4364,7 +4363,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
   def test_transform_feature(self):
     a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     b = fc.categorical_column_with_identity(key='bbb', num_buckets=3)
-    a_embedded, b_embedded = fc_lib._shared_embedding_columns(
+    a_embedded, b_embedded = fc.shared_embedding_columns(
         [a, b], dimension=2)
     features = {
         'aaa': sparse_tensor.SparseTensor(
@@ -4434,7 +4433,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension, initializer=_initializer)
 
@@ -4496,7 +4495,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension, initializer=_initializer)
 
@@ -4536,7 +4535,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension, initializer=_initializer)
 
@@ -4642,7 +4641,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension, initializer=_initializer,
         trainable=trainable)
diff --git a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
index 018e8c909a..24a58fb118 100644
--- a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
@@ -48,6 +48,10 @@ tf_module {
     name: "numeric_column"
     argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "shared_embedding_columns"
+    argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
+  }
   member_method {
     name: "weighted_categorical_column"
     argspec: "args=[\'categorical_column\', \'weight_feature_key\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
-- 
GitLab


From 1b0a771d92227c0d31a24d1dbd37d9fa28637e8e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 14:18:30 -0800
Subject: [PATCH 110/966] Update ops-related pbtxt files.

PiperOrigin-RevId: 183140179
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 24 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 24 +++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 47de3d398e..65ab81931a 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -22679,6 +22679,30 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "IteratorGetNextSync"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "IteratorSetStatsAggregator"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index fe0b362d8f..b57206c9c4 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -10866,6 +10866,30 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "IteratorGetNextSync"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "IteratorSetStatsAggregator"
   input_arg {
-- 
GitLab


From f8347ceebbad0e06552633fcdf8e63f52246ba62 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 24 Jan 2018 14:23:02 -0800
Subject: [PATCH 111/966] Remove THIRD_PARTY_ from #include guards

They don't make sense in the open source repository.

PiperOrigin-RevId: 183140889
---
 tensorflow/c/c_test_util.h                                | 6 +++---
 tensorflow/c/python_api.h                                 | 6 +++---
 tensorflow/cc/framework/cc_op_gen.h                       | 6 +++---
 tensorflow/cc/framework/grad_op_registry.h                | 6 +++---
 tensorflow/cc/framework/gradient_checker.h                | 6 +++---
 tensorflow/cc/framework/gradients.h                       | 6 +++---
 tensorflow/cc/framework/ops.h                             | 6 +++---
 tensorflow/cc/framework/scope.h                           | 6 +++---
 tensorflow/cc/framework/scope_internal.h                  | 6 +++---
 tensorflow/cc/framework/testutil.h                        | 6 +++---
 tensorflow/cc/framework/while_gradients.h                 | 6 +++---
 tensorflow/cc/gradients/grad_testutil.h                   | 6 +++---
 tensorflow/cc/ops/const_op.h                              | 6 +++---
 tensorflow/cc/ops/standard_ops.h                          | 6 +++---
 tensorflow/cc/ops/while_loop.h                            | 6 +++---
 tensorflow/cc/profiler/profiler.h                         | 6 +++---
 tensorflow/cc/saved_model/constants.h                     | 6 +++---
 tensorflow/cc/saved_model/loader.h                        | 6 +++---
 tensorflow/cc/saved_model/signature_constants.h           | 6 +++---
 tensorflow/cc/saved_model/tag_constants.h                 | 6 +++---
 tensorflow/cc/tools/freeze_saved_model.h                  | 6 +++---
 tensorflow/cc/training/coordinator.h                      | 6 +++---
 tensorflow/cc/training/queue_runner.h                     | 6 +++---
 tensorflow/compiler/tf2xla/kernels/shape_util.h           | 2 +-
 tensorflow/compiler/xla/execution_options_util.h          | 6 +++---
 tensorflow/compiler/xla/iterator_util.h                   | 6 +++---
 .../compiler/xla/legacy_flags/debug_options_flags.h       | 6 +++---
 .../compiler/xla/legacy_flags/debug_options_parsers.h     | 6 +++---
 .../compiler/xla/service/cpu/cpu_hlo_support_checker.h    | 6 +++---
 .../xla/service/cpu/custom_call_target_registry.h         | 6 +++---
 .../compiler/xla/service/cpu/external_constant_pool.h     | 6 +++---
 tensorflow/compiler/xla/service/cpu/ir_function.h         | 6 +++---
 .../compiler/xla/service/cpu/orc_jit_memory_mapper.h      | 6 +++---
 .../compiler/xla/service/cpu/parallel_loop_emitter.h      | 6 +++---
 .../compiler/xla/service/cpu/parallel_task_assignment.h   | 6 +++---
 tensorflow/compiler/xla/service/cpu/runtime_fork_join.h   | 6 +++---
 tensorflow/compiler/xla/service/cpu/runtime_matvec.h      | 6 +++---
 tensorflow/compiler/xla/service/cpu/shape_partition.h     | 6 +++---
 tensorflow/compiler/xla/service/dot_decomposer.h          | 6 +++---
 tensorflow/compiler/xla/service/gpu/for_thunk.h           | 6 +++---
 tensorflow/compiler/xla/service/gpu/fusion_merger.h       | 6 +++---
 tensorflow/compiler/xla/service/gpu/gpu_constants.h       | 6 +++---
 .../compiler/xla/service/gpu/gpu_hlo_support_checker.h    | 6 +++---
 tensorflow/compiler/xla/service/gpu/while_transformer.h   | 6 +++---
 tensorflow/compiler/xla/service/hlo_evaluator.h           | 6 +++---
 tensorflow/compiler/xla/service/hlo_profile_printer.h     | 6 +++---
 tensorflow/compiler/xla/service/hlo_tfgraph_builder.h     | 6 +++---
 tensorflow/compiler/xla/service/hlo_verifier.h            | 6 +++---
 .../compiler/xla/service/llvm_ir/kernel_support_library.h | 6 +++---
 tensorflow/compiler/xla/service/llvm_ir/ops.h             | 6 +++---
 tensorflow/compiler/xla/service/logical_buffer_analysis.h | 6 +++---
 tensorflow/compiler/xla/service/while_loop_simplifier.h   | 6 +++---
 .../compiler/xla/service/zero_sized_hlo_elimination.h     | 6 +++---
 tensorflow/compiler/xla/sparse_index_array.h              | 6 +++---
 tensorflow/compiler/xla/statusor_internals.h              | 6 +++---
 tensorflow/compiler/xla/tests/filecheck.h                 | 6 +++---
 tensorflow/contrib/android/asset_manager_filesystem.h     | 6 +++---
 .../contrib/batching/adaptive_shared_batch_scheduler.h    | 6 +++---
 tensorflow/contrib/batching/basic_batch_scheduler.h       | 6 +++---
 tensorflow/contrib/batching/batch_scheduler.h             | 6 +++---
 tensorflow/contrib/batching/shared_batch_scheduler.h      | 6 +++---
 tensorflow/contrib/batching/test_util/fake_clock_env.h    | 6 +++---
 tensorflow/contrib/batching/util/periodic_function.h      | 6 +++---
 .../lib/learner/common/accumulators/class-partition-key.h | 6 +++---
 .../common/accumulators/feature-stats-accumulator.h       | 6 +++---
 .../lib/learner/common/partitioners/example_partitioner.h | 6 +++---
 .../lib/learner/common/stats/feature-split-candidate.h    | 6 +++---
 .../lib/learner/common/stats/gradient-stats.h             | 6 +++---
 .../boosted_trees/lib/learner/common/stats/node-stats.h   | 6 +++---
 .../boosted_trees/lib/learner/common/stats/split-stats.h  | 6 +++---
 .../boosted_trees/lib/models/multiple_additive_trees.h    | 6 +++---
 .../lib/quantiles/weighted_quantiles_buffer.h             | 6 +++---
 .../lib/quantiles/weighted_quantiles_stream.h             | 6 +++---
 .../lib/quantiles/weighted_quantiles_summary.h            | 6 +++---
 .../boosted_trees/lib/testutil/batch_features_testutil.h  | 6 +++---
 .../contrib/boosted_trees/lib/testutil/random_tree_gen.h  | 6 +++---
 .../contrib/boosted_trees/lib/trees/decision_tree.h       | 6 +++---
 .../contrib/boosted_trees/lib/utils/batch_features.h      | 6 +++---
 .../contrib/boosted_trees/lib/utils/dropout_utils.h       | 6 +++---
 tensorflow/contrib/boosted_trees/lib/utils/example.h      | 6 +++---
 .../contrib/boosted_trees/lib/utils/examples_iterable.h   | 6 +++---
 tensorflow/contrib/boosted_trees/lib/utils/macros.h       | 6 +++---
 .../contrib/boosted_trees/lib/utils/optional_value.h      | 6 +++---
 tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h | 6 +++---
 tensorflow/contrib/boosted_trees/lib/utils/random.h       | 6 +++---
 .../boosted_trees/lib/utils/sparse_column_iterable.h      | 6 +++---
 tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h | 6 +++---
 .../resources/decision_tree_ensemble_resource.h           | 6 +++---
 .../boosted_trees/resources/quantile_stream_resource.h    | 6 +++---
 .../contrib/boosted_trees/resources/stamped_resource.h    | 6 +++---
 .../contrib/cloud/kernels/bigquery_table_accessor.h       | 6 +++---
 .../cloud/kernels/bigquery_table_accessor_test_data.h     | 6 +++---
 tensorflow/contrib/coder/kernels/range_coder.h            | 6 +++---
 tensorflow/contrib/coder/kernels/range_coder_ops_util.h   | 6 +++---
 tensorflow/contrib/ffmpeg/ffmpeg_lib.h                    | 6 +++---
 .../contrib/fused_conv/kernels/fused_conv_ops_gpu.h       | 6 +++---
 tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h   | 6 +++---
 tensorflow/contrib/lite/allocation.h                      | 6 +++---
 tensorflow/contrib/lite/arena_planner.h                   | 6 +++---
 tensorflow/contrib/lite/builtin_op_data.h                 | 6 +++---
 tensorflow/contrib/lite/context.h                         | 6 +++---
 tensorflow/contrib/lite/error_reporter.h                  | 6 +++---
 tensorflow/contrib/lite/graph_info.h                      | 6 +++---
 tensorflow/contrib/lite/interpreter.h                     | 6 +++---
 tensorflow/contrib/lite/kernels/activation_functor.h      | 6 +++---
 tensorflow/contrib/lite/kernels/gemm_support.h            | 6 +++---
 tensorflow/contrib/lite/kernels/internal/common.h         | 6 +++---
 tensorflow/contrib/lite/kernels/internal/compatibility.h  | 6 +++---
 .../lite/kernels/internal/optimized/depthwiseconv_float.h | 6 +++---
 .../lite/kernels/internal/optimized/depthwiseconv_uint8.h | 6 +++---
 .../internal/optimized/eigen_spatial_convolutions.h       | 6 +++---
 .../eigen_tensor_reduced_instantiations_google.h          | 6 +++---
 .../optimized/eigen_tensor_reduced_instantiations_oss.h   | 6 +++---
 .../lite/kernels/internal/optimized/multithreaded_conv.h  | 6 +++---
 .../lite/kernels/internal/optimized/neon_tensor_utils.h   | 6 +++---
 .../lite/kernels/internal/optimized/optimized_ops.h       | 6 +++---
 .../lite/kernels/internal/reference/depthwiseconv_float.h | 6 +++---
 .../lite/kernels/internal/reference/depthwiseconv_uint8.h | 6 +++---
 .../kernels/internal/reference/portable_tensor_utils.h    | 6 +++---
 .../lite/kernels/internal/reference/reference_ops.h       | 6 +++---
 tensorflow/contrib/lite/kernels/internal/round.h          | 6 +++---
 tensorflow/contrib/lite/kernels/internal/tensor.h         | 6 +++---
 tensorflow/contrib/lite/kernels/internal/tensor_utils.h   | 6 +++---
 tensorflow/contrib/lite/kernels/internal/types.h          | 6 +++---
 tensorflow/contrib/lite/kernels/kernel_util.h             | 6 +++---
 tensorflow/contrib/lite/kernels/op_macros.h               | 6 +++---
 tensorflow/contrib/lite/kernels/padding.h                 | 6 +++---
 tensorflow/contrib/lite/kernels/register.h                | 6 +++---
 tensorflow/contrib/lite/kernels/test_util.h               | 6 +++---
 tensorflow/contrib/lite/memory_planner.h                  | 6 +++---
 tensorflow/contrib/lite/model.h                           | 6 +++---
 tensorflow/contrib/lite/models/smartreply/predictor.h     | 6 +++---
 tensorflow/contrib/lite/nnapi_delegate.h                  | 6 +++---
 tensorflow/contrib/lite/optional_debug_tools.h            | 6 +++---
 tensorflow/contrib/lite/simple_memory_arena.h             | 6 +++---
 tensorflow/contrib/lite/string_util.h                     | 6 +++---
 tensorflow/contrib/lite/testing/message.h                 | 6 +++---
 tensorflow/contrib/lite/testing/parse_testdata.h          | 6 +++---
 tensorflow/contrib/lite/testing/split.h                   | 6 +++---
 tensorflow/contrib/lite/testing/test_runner.h             | 6 +++---
 tensorflow/contrib/lite/testing/tflite_driver.h           | 6 +++---
 tensorflow/contrib/lite/testing/tokenize.h                | 6 +++---
 tensorflow/contrib/lite/testing/util.h                    | 6 +++---
 tensorflow/contrib/lite/toco/allocate_transient_arrays.h  | 6 +++---
 tensorflow/contrib/lite/toco/args.h                       | 6 +++---
 tensorflow/contrib/lite/toco/dump_graphviz.h              | 6 +++---
 tensorflow/contrib/lite/toco/export_tensorflow.h          | 6 +++---
 tensorflow/contrib/lite/toco/format_port.h                | 6 +++---
 .../toco/graph_transformations/graph_transformations.h    | 6 +++---
 .../graph_transformations/remove_trivial_passthrough.h    | 6 +++---
 tensorflow/contrib/lite/toco/import_tensorflow.h          | 6 +++---
 tensorflow/contrib/lite/toco/model.h                      | 6 +++---
 tensorflow/contrib/lite/toco/model_cmdline_flags.h        | 7 +++----
 tensorflow/contrib/lite/toco/runtime/common.h             | 6 +++---
 tensorflow/contrib/lite/toco/runtime/types.h              | 6 +++---
 tensorflow/contrib/lite/toco/tensorflow_util.h            | 6 +++---
 tensorflow/contrib/lite/toco/tflite/builtin_operator.h    | 6 +++---
 tensorflow/contrib/lite/toco/tflite/custom_operator.h     | 6 +++---
 tensorflow/contrib/lite/toco/tflite/export.h              | 6 +++---
 tensorflow/contrib/lite/toco/tflite/import.h              | 6 +++---
 tensorflow/contrib/lite/toco/tflite/operator.h            | 6 +++---
 tensorflow/contrib/lite/toco/tflite/simple_operator.h     | 6 +++---
 tensorflow/contrib/lite/toco/tflite/types.h               | 6 +++---
 tensorflow/contrib/lite/toco/toco_cmdline_flags.h         | 6 +++---
 tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h | 6 +++---
 tensorflow/contrib/lite/toco/toco_port.h                  | 6 +++---
 tensorflow/contrib/lite/toco/toco_tooling.h               | 6 +++---
 tensorflow/contrib/lite/toco/toco_types.h                 | 6 +++---
 tensorflow/contrib/lite/toco/tooling_util.h               | 6 +++---
 tensorflow/contrib/lite/tools/gen_op_registration.h       | 6 +++---
 tensorflow/contrib/lite/tools/mutable_op_resolver.h       | 6 +++---
 tensorflow/contrib/lite/version.h                         | 6 +++---
 tensorflow/contrib/nccl/kernels/nccl_manager.h            | 6 +++---
 tensorflow/contrib/nearest_neighbor/kernels/heap.h        | 6 +++---
 .../nearest_neighbor/kernels/hyperplane_lsh_probes.h      | 6 +++---
 .../contrib/reduce_slice_ops/kernels/reduce_slice_ops.h   | 6 +++---
 tensorflow/contrib/resampler/kernels/resampler_ops.h      | 7 +++----
 tensorflow/contrib/rnn/kernels/blas_gemm.h                | 6 +++---
 tensorflow/contrib/rnn/kernels/gru_ops.h                  | 6 +++---
 tensorflow/contrib/rnn/kernels/lstm_ops.h                 | 6 +++---
 .../saved_model/cc/saved_model/signature_def_utils.h      | 6 +++---
 tensorflow/contrib/seq2seq/kernels/beam_search_ops.h      | 6 +++---
 tensorflow/contrib/session_bundle/bundle_shim.h           | 6 +++---
 tensorflow/contrib/session_bundle/session_bundle.h        | 6 +++---
 tensorflow/contrib/session_bundle/signature.h             | 6 +++---
 tensorflow/contrib/session_bundle/test_util.h             | 6 +++---
 tensorflow/contrib/tensor_forest/kernels/data_spec.h      | 6 +++---
 tensorflow/contrib/tensor_forest/kernels/tree_utils.h     | 6 +++---
 .../tensor_forest/kernels/v4/candidate_graph_runner.h     | 6 +++---
 .../tensor_forest/kernels/v4/decision-tree-resource.h     | 6 +++---
 .../tensor_forest/kernels/v4/decision_node_evaluator.h    | 6 +++---
 .../tensor_forest/kernels/v4/fertile-stats-resource.h     | 6 +++---
 .../tensor_forest/kernels/v4/graph_collection_operator.h  | 6 +++---
 tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h  | 6 +++---
 tensorflow/contrib/tensor_forest/kernels/v4/input_data.h  | 6 +++---
 .../contrib/tensor_forest/kernels/v4/input_target.h       | 6 +++---
 .../tensor_forest/kernels/v4/leaf_model_operators.h       | 6 +++---
 tensorflow/contrib/tensor_forest/kernels/v4/params.h      | 7 +++----
 .../tensor_forest/kernels/v4/split_collection_operators.h | 8 +++-----
 tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h  | 6 +++---
 tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h  | 6 +++---
 tensorflow/contrib/tpu/profiler/dump_tpu_profile.h        | 6 +++---
 tensorflow/contrib/tpu/profiler/trace_events_to_json.h    | 6 +++---
 tensorflow/contrib/tpu/profiler/version.h                 | 6 +++---
 .../contrib/util/convert_graphdef_memmapped_format_lib.h  | 6 +++---
 tensorflow/contrib/verbs/grpc_verbs_client.h              | 6 +++---
 tensorflow/contrib/verbs/grpc_verbs_service.h             | 6 +++---
 tensorflow/contrib/verbs/grpc_verbs_service_impl.h        | 6 +++---
 tensorflow/contrib/verbs/rdma.h                           | 6 +++---
 tensorflow/contrib/verbs/rdma_mgr.h                       | 6 +++---
 tensorflow/contrib/verbs/rdma_rendezvous_mgr.h            | 6 +++---
 tensorflow/contrib/verbs/verbs_server_lib.h               | 6 +++---
 tensorflow/core/api_def/update_api_def.h                  | 6 +++---
 tensorflow/core/common_runtime/function_testlib.h         | 6 +++---
 tensorflow/core/common_runtime/gpu/gpu_id.h               | 6 +++---
 tensorflow/core/common_runtime/gpu/gpu_id_utils.h         | 6 +++---
 .../core/common_runtime/gpu/gpu_managed_allocator.h       | 6 +++---
 tensorflow/core/common_runtime/graph_optimizer.h          | 6 +++---
 tensorflow/core/common_runtime/memory_types.h             | 6 +++---
 tensorflow/core/common_runtime/pending_counts.h           | 6 +++---
 .../common_runtime/process_function_library_runtime.h     | 6 +++---
 tensorflow/core/common_runtime/profile_handler.h          | 6 +++---
 tensorflow/core/common_runtime/renamed_device.h           | 6 +++---
 tensorflow/core/common_runtime/rendezvous_util.h          | 6 +++---
 tensorflow/core/common_runtime/shape_refiner.h            | 6 +++---
 .../core/common_runtime/stats_publisher_interface.h       | 6 +++---
 .../cluster_function_library_runtime.h                    | 6 +++---
 tensorflow/core/distributed_runtime/local_master.h        | 6 +++---
 tensorflow/core/distributed_runtime/message_wrappers.h    | 6 +++---
 tensorflow/core/distributed_runtime/partial_run_mgr.h     | 6 +++---
 tensorflow/core/distributed_runtime/recent_request_ids.h  | 6 +++---
 tensorflow/core/distributed_runtime/request_id.h          | 6 +++---
 .../distributed_runtime/rpc/async_service_interface.h     | 6 +++---
 tensorflow/core/distributed_runtime/rpc/grpc_call.h       | 6 +++---
 tensorflow/core/distributed_runtime/rpc/grpc_channel.h    | 6 +++---
 .../core/distributed_runtime/rpc/grpc_client_cq_tag.h     | 6 +++---
 .../core/distributed_runtime/rpc/grpc_master_service.h    | 6 +++---
 .../distributed_runtime/rpc/grpc_master_service_impl.h    | 6 +++---
 .../core/distributed_runtime/rpc/grpc_remote_master.h     | 6 +++---
 .../core/distributed_runtime/rpc/grpc_remote_worker.h     | 6 +++---
 .../distributed_runtime/rpc/grpc_serialization_traits.h   | 6 +++---
 tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h | 6 +++---
 tensorflow/core/distributed_runtime/rpc/grpc_session.h    | 6 +++---
 tensorflow/core/distributed_runtime/rpc/grpc_state.h      | 6 +++---
 tensorflow/core/distributed_runtime/rpc/grpc_testlib.h    | 6 +++---
 tensorflow/core/distributed_runtime/rpc/grpc_util.h       | 6 +++---
 .../core/distributed_runtime/rpc/grpc_worker_cache.h      | 6 +++---
 .../core/distributed_runtime/rpc/grpc_worker_service.h    | 6 +++---
 .../distributed_runtime/rpc/grpc_worker_service_impl.h    | 6 +++---
 tensorflow/core/distributed_runtime/server_lib.h          | 6 +++---
 tensorflow/core/distributed_runtime/session_mgr.h         | 6 +++---
 tensorflow/core/distributed_runtime/worker.h              | 6 +++---
 tensorflow/core/distributed_runtime/worker_session.h      | 6 +++---
 tensorflow/core/example/example_parser_configuration.h    | 6 +++---
 tensorflow/core/framework/common_shape_fns.h              | 6 +++---
 tensorflow/core/framework/shape_inference.h               | 6 +++---
 tensorflow/core/framework/shape_inference_testutil.h      | 6 +++---
 tensorflow/core/graph/gradients.h                         | 6 +++---
 tensorflow/core/grappler/costs/op_context.h               | 6 +++---
 tensorflow/core/grappler/costs/virtual_scheduler.h        | 6 +++---
 .../core/grappler/optimizers/dependency_optimizer.h       | 6 +++---
 tensorflow/core/grappler/optimizers/static_schedule.h     | 6 +++---
 tensorflow/core/grappler/utils/frame.h                    | 6 +++---
 tensorflow/core/grappler/utils/scc.h                      | 6 +++---
 tensorflow/core/grappler/utils/topological_sort.h         | 6 +++---
 tensorflow/core/kernels/adjust_hsv_gpu.cu.h               | 6 +++---
 tensorflow/core/kernels/batch_util.h                      | 6 +++---
 .../batching_util/adaptive_shared_batch_scheduler.h       | 7 +++----
 .../core/kernels/batching_util/basic_batch_scheduler.h    | 6 +++---
 tensorflow/core/kernels/batching_util/batch_scheduler.h   | 6 +++---
 tensorflow/core/kernels/batching_util/fake_clock_env.h    | 6 +++---
 tensorflow/core/kernels/batching_util/periodic_function.h | 7 +++----
 .../core/kernels/batching_util/shared_batch_scheduler.h   | 6 +++---
 tensorflow/core/kernels/bitcast_op.h                      | 6 +++---
 tensorflow/core/kernels/captured_function.h               | 6 +++---
 tensorflow/core/kernels/cast_op_impl.h                    | 6 +++---
 tensorflow/core/kernels/compare_and_bitpack_op.h          | 6 +++---
 tensorflow/core/kernels/cuda_device_array.h               | 6 +++---
 tensorflow/core/kernels/cuda_device_array_gpu.h           | 6 +++---
 tensorflow/core/kernels/data/captured_function.h          | 6 +++---
 tensorflow/core/kernels/data/dataset.h                    | 6 +++---
 tensorflow/core/kernels/data/dataset_utils.h              | 6 +++---
 tensorflow/core/kernels/data/sql/driver_manager.h         | 6 +++---
 tensorflow/core/kernels/data/sql/query_connection.h       | 6 +++---
 .../core/kernels/data/sql/sqlite_query_connection.h       | 6 +++---
 tensorflow/core/kernels/data/stats_aggregator.h           | 6 +++---
 tensorflow/core/kernels/data/window_dataset.h             | 6 +++---
 tensorflow/core/kernels/dataset.h                         | 6 +++---
 tensorflow/core/kernels/deep_conv2d.h                     | 6 +++---
 tensorflow/core/kernels/depthwise_conv_op.h               | 6 +++---
 tensorflow/core/kernels/determinant_op.h                  | 6 +++---
 tensorflow/core/kernels/eigen_activations.h               | 6 +++---
 tensorflow/core/kernels/eigen_attention.h                 | 6 +++---
 .../core/kernels/eigen_backward_cuboid_convolutions.h     | 6 +++---
 .../core/kernels/eigen_backward_spatial_convolutions.h    | 4 ++--
 tensorflow/core/kernels/eigen_cuboid_convolution.h        | 6 +++---
 tensorflow/core/kernels/eigen_pooling.h                   | 6 +++---
 tensorflow/core/kernels/eigen_softmax.h                   | 6 +++---
 tensorflow/core/kernels/eigen_spatial_convolutions.h      | 6 +++---
 tensorflow/core/kernels/eigen_volume_patch.h              | 6 +++---
 tensorflow/core/kernels/eye_functor.h                     | 6 +++---
 tensorflow/core/kernels/fake_quant_ops_functor.h          | 6 +++---
 tensorflow/core/kernels/gather_functor_gpu.cu.h           | 6 +++---
 tensorflow/core/kernels/gpu_utils.h                       | 6 +++---
 tensorflow/core/kernels/hexagon/graph_transferer.h        | 6 +++---
 tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h | 6 +++---
 tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h | 6 +++---
 tensorflow/core/kernels/i_remote_fused_graph_executor.h   | 6 +++---
 .../core/kernels/i_remote_fused_graph_ops_definitions.h   | 6 +++---
 tensorflow/core/kernels/list_kernels.h                    | 6 +++---
 tensorflow/core/kernels/meta_support.h                    | 6 +++---
 tensorflow/core/kernels/mfcc.h                            | 6 +++---
 tensorflow/core/kernels/mfcc_dct.h                        | 6 +++---
 tensorflow/core/kernels/mfcc_mel_filterbank.h             | 6 +++---
 tensorflow/core/kernels/mirror_pad_op_cpu_impl.h          | 6 +++---
 tensorflow/core/kernels/neon/depthwiseconv_float.h        | 6 +++---
 tensorflow/core/kernels/neon/types.h                      | 6 +++---
 tensorflow/core/kernels/population_count_op.h             | 6 +++---
 tensorflow/core/kernels/quantization_utils.h              | 6 +++---
 tensorflow/core/kernels/reference_gemm.h                  | 6 +++---
 .../kernels/remote_fused_graph_execute_op_test_utils.h    | 6 +++---
 .../core/kernels/remote_fused_graph_execute_utils.h       | 6 +++---
 tensorflow/core/kernels/reshape_util.h                    | 6 +++---
 tensorflow/core/kernels/scatter_nd_op_cpu_impl.h          | 6 +++---
 tensorflow/core/kernels/segment_reduction_ops.h           | 6 +++---
 tensorflow/core/kernels/slice_op_cpu_impl.h               | 6 +++---
 tensorflow/core/kernels/spectrogram.h                     | 6 +++---
 tensorflow/core/kernels/spectrogram_test_utils.h          | 6 +++---
 tensorflow/core/kernels/tile_ops_cpu_impl.h               | 6 +++---
 tensorflow/core/kernels/tile_ops_gpu_impl.h               | 6 +++---
 tensorflow/core/kernels/winograd_transform.h              | 6 +++---
 tensorflow/core/kernels/xsmm_conv2d.h                     | 6 +++---
 tensorflow/core/lib/core/bitmap.h                         | 6 +++---
 tensorflow/core/lib/gtl/compactptrset.h                   | 6 +++---
 tensorflow/core/lib/gtl/flatmap.h                         | 6 +++---
 tensorflow/core/lib/gtl/flatrep.h                         | 6 +++---
 tensorflow/core/lib/gtl/flatset.h                         | 6 +++---
 tensorflow/core/lib/io/buffered_inputstream.h             | 2 +-
 tensorflow/core/lib/io/compression.h                      | 6 +++---
 tensorflow/core/lib/io/inputstream_interface.h            | 2 +-
 tensorflow/core/lib/io/snappy/snappy_outputbuffer.h       | 6 +++---
 tensorflow/core/lib/io/zlib_outputbuffer.h                | 6 +++---
 tensorflow/core/lib/monitoring/collected_metrics.h        | 6 +++---
 tensorflow/core/lib/monitoring/collection_registry.h      | 6 +++---
 tensorflow/core/lib/monitoring/counter.h                  | 6 +++---
 tensorflow/core/lib/monitoring/gauge.h                    | 6 +++---
 tensorflow/core/lib/monitoring/metric_def.h               | 6 +++---
 tensorflow/core/lib/monitoring/mobile_counter.h           | 6 +++---
 tensorflow/core/lib/monitoring/mobile_gauge.h             | 6 +++---
 tensorflow/core/lib/monitoring/mobile_sampler.h           | 6 +++---
 tensorflow/core/lib/monitoring/sampler.h                  | 6 +++---
 tensorflow/core/lib/strings/proto_text_util.h             | 6 +++---
 tensorflow/core/platform/cloud/gcs_dns_cache.h            | 6 +++---
 tensorflow/core/platform/cloud/oauth_client.h             | 6 +++---
 tensorflow/core/platform/cloud/retrying_utils.h           | 6 +++---
 tensorflow/core/platform/cloud/time_util.h                | 6 +++---
 tensorflow/core/platform/cuda_libdevice_path.h            | 6 +++---
 tensorflow/core/platform/cupti_wrapper.h                  | 6 +++---
 tensorflow/core/platform/default/gpu/cupti_wrapper.h      | 6 +++---
 tensorflow/core/platform/demangle.h                       | 6 +++---
 tensorflow/core/platform/file_statistics.h                | 6 +++---
 tensorflow/core/platform/hadoop/hadoop_file_system.h      | 6 +++---
 tensorflow/core/platform/stacktrace_handler.h             | 6 +++---
 .../internal/advisor/accelerator_utilization_checker.h    | 6 +++---
 tensorflow/core/profiler/internal/advisor/checker.h       | 6 +++---
 .../internal/advisor/expensive_operation_checker.h        | 6 +++---
 .../profiler/internal/advisor/internal_checker_runner.h   | 6 +++---
 .../core/profiler/internal/advisor/operation_checker.h    | 6 +++---
 .../core/profiler/internal/advisor/tfprof_advisor.h       | 6 +++---
 tensorflow/core/profiler/internal/print_model_analysis.h  | 6 +++---
 tensorflow/core/profiler/internal/tfprof_code.h           | 6 +++---
 tensorflow/core/profiler/internal/tfprof_constants.h      | 6 +++---
 tensorflow/core/profiler/internal/tfprof_graph.h          | 6 +++---
 tensorflow/core/profiler/internal/tfprof_node.h           | 6 +++---
 tensorflow/core/profiler/internal/tfprof_node_show.h      | 6 +++---
 tensorflow/core/profiler/internal/tfprof_op.h             | 6 +++---
 tensorflow/core/profiler/internal/tfprof_scope.h          | 6 +++---
 tensorflow/core/profiler/internal/tfprof_show.h           | 6 +++---
 tensorflow/core/profiler/internal/tfprof_show_multi.h     | 6 +++---
 tensorflow/core/profiler/internal/tfprof_stats.h          | 6 +++---
 tensorflow/core/profiler/internal/tfprof_tensor.h         | 6 +++---
 tensorflow/core/profiler/internal/tfprof_timeline.h       | 6 +++---
 tensorflow/core/profiler/internal/tfprof_utils.h          | 6 +++---
 tensorflow/core/profiler/tfprof_options.h                 | 6 +++---
 tensorflow/core/util/command_line_flags.h                 | 6 +++---
 tensorflow/core/util/example_proto_fast_parsing.h         | 6 +++---
 tensorflow/core/util/example_proto_helper.h               | 6 +++---
 tensorflow/core/util/matmul_autotune.h                    | 6 +++---
 tensorflow/core/util/strided_slice_op.h                   | 6 +++---
 tensorflow/examples/android/jni/object_tracking/config.h  | 6 +++---
 .../examples/android/jni/object_tracking/flow_cache.h     | 6 +++---
 .../examples/android/jni/object_tracking/frame_pair.h     | 6 +++---
 tensorflow/examples/android/jni/object_tracking/geom.h    | 6 +++---
 .../examples/android/jni/object_tracking/gl_utils.h       | 6 +++---
 .../examples/android/jni/object_tracking/image-inl.h      | 6 +++---
 tensorflow/examples/android/jni/object_tracking/image.h   | 6 +++---
 .../examples/android/jni/object_tracking/image_data.h     | 6 +++---
 .../examples/android/jni/object_tracking/image_utils.h    | 6 +++---
 .../examples/android/jni/object_tracking/integral_image.h | 6 +++---
 .../examples/android/jni/object_tracking/jni_utils.h      | 4 ++--
 .../examples/android/jni/object_tracking/keypoint.h       | 6 +++---
 .../android/jni/object_tracking/keypoint_detector.h       | 6 +++---
 tensorflow/examples/android/jni/object_tracking/logging.h | 6 +++---
 .../android/jni/object_tracking/object_detector.h         | 6 +++---
 .../examples/android/jni/object_tracking/object_model.h   | 6 +++---
 .../examples/android/jni/object_tracking/object_tracker.h | 6 +++---
 .../examples/android/jni/object_tracking/optical_flow.h   | 6 +++---
 tensorflow/examples/android/jni/object_tracking/sprite.h  | 6 +++---
 .../examples/android/jni/object_tracking/time_log.h       | 6 +++---
 .../examples/android/jni/object_tracking/tracked_object.h | 6 +++---
 tensorflow/examples/android/jni/object_tracking/utils.h   | 6 +++---
 tensorflow/examples/speech_commands/accuracy_utils.h      | 6 +++---
 tensorflow/examples/speech_commands/recognize_commands.h  | 6 +++---
 .../examples/wav_to_spectrogram/wav_to_spectrogram.h      | 6 +++---
 tensorflow/python/eager/python_eager_op_gen.h             | 6 +++---
 tensorflow/python/framework/cpp_shape_inference.h         | 6 +++---
 tensorflow/python/framework/python_op_gen_internal.h      | 6 +++---
 tensorflow/python/lib/core/ndarray_tensor.h               | 6 +++---
 tensorflow/python/lib/core/py_seq_tensor.h                | 6 +++---
 tensorflow/python/lib/core/safe_ptr.h                     | 6 +++---
 tensorflow/python/util/kernel_registry.h                  | 6 +++---
 tensorflow/python/util/util.h                             | 6 +++---
 tensorflow/tools/graph_transforms/file_utils.h            | 6 +++---
 .../tools/proto_text/gen_proto_text_functions_lib.h       | 6 +++---
 .../Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h           | 6 +++---
 .../Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h         | 6 +++---
 .../Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h          | 6 +++---
 .../Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h        | 6 +++---
 third_party/fft2d/fft.h                                   | 6 +++---
 429 files changed, 1279 insertions(+), 1286 deletions(-)

diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index 3429009a71..6acc2fec00 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_C_C_TEST_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_C_C_TEST_UTIL_H_
+#ifndef TENSORFLOW_C_C_TEST_UTIL_H_
+#define TENSORFLOW_C_C_TEST_UTIL_H_
 
 #include "tensorflow/c/c_api.h"
 
@@ -136,4 +136,4 @@ class CSession {
   std::vector<TF_Operation*> targets_;
 };
 
-#endif  // THIRD_PARTY_TENSORFLOW_C_C_TEST_UTIL_H_
+#endif  // TENSORFLOW_C_C_TEST_UTIL_H_
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index b51ef2b531..aa9d9e06b2 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
-#define THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
+#ifndef TENSORFLOW_C_PYTHON_API_H_
+#define TENSORFLOW_C_PYTHON_API_H_
 
 #include "tensorflow/c/c_api.h"
 
@@ -39,4 +39,4 @@ void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
+#endif  // TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/cc/framework/cc_op_gen.h b/tensorflow/cc/framework/cc_op_gen.h
index 1b5f7dd923..c7256a7dc3 100644
--- a/tensorflow/cc/framework/cc_op_gen.h
+++ b/tensorflow/cc/framework/cc_op_gen.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
+#define TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
 
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
@@ -28,4 +28,4 @@ void WriteCCOps(const OpList& ops, const ApiDefMap& api_def_map,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
diff --git a/tensorflow/cc/framework/grad_op_registry.h b/tensorflow/cc/framework/grad_op_registry.h
index 190b96f685..0fc5abb20c 100644
--- a/tensorflow/cc/framework/grad_op_registry.h
+++ b/tensorflow/cc/framework/grad_op_registry.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
+#define TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
 
 #include <unordered_map>
 
@@ -72,4 +72,4 @@ class GradOpRegistry {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
diff --git a/tensorflow/cc/framework/gradient_checker.h b/tensorflow/cc/framework/gradient_checker.h
index d055c60d09..1aa215a908 100644
--- a/tensorflow/cc/framework/gradient_checker.h
+++ b/tensorflow/cc/framework/gradient_checker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
+#define TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -60,4 +60,4 @@ Status ComputeGradientError(const Scope& scope, const Output& x,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
diff --git a/tensorflow/cc/framework/gradients.h b/tensorflow/cc/framework/gradients.h
index 717f6f0636..0a377ad56d 100644
--- a/tensorflow/cc/framework/gradients.h
+++ b/tensorflow/cc/framework/gradients.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
+#define TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -49,4 +49,4 @@ Output NoGradient();
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
diff --git a/tensorflow/cc/framework/ops.h b/tensorflow/cc/framework/ops.h
index 8d4154220c..a085e1d6e2 100644
--- a/tensorflow/cc/framework/ops.h
+++ b/tensorflow/cc/framework/ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_OPS_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_OPS_H_
+#define TENSORFLOW_CC_FRAMEWORK_OPS_H_
 
 #include <type_traits>
 
@@ -296,4 +296,4 @@ class InputList {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_OPS_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_OPS_H_
diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h
index 0225ac0472..30c32bd44b 100644
--- a/tensorflow/cc/framework/scope.h
+++ b/tensorflow/cc/framework/scope.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
+#define TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
 
 #include <memory>
 #include <string>
@@ -242,4 +242,4 @@ struct CompositeOpScopes {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
diff --git a/tensorflow/cc/framework/scope_internal.h b/tensorflow/cc/framework/scope_internal.h
index 968c366550..8efcfed20d 100644
--- a/tensorflow/cc/framework/scope_internal.h
+++ b/tensorflow/cc/framework/scope_internal.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
+#define TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
 
 #include "tensorflow/cc/framework/scope.h"
 
@@ -117,4 +117,4 @@ class Scope::Impl {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
diff --git a/tensorflow/cc/framework/testutil.h b/tensorflow/cc/framework/testutil.h
index a3e19870ec..7ad6fb4a67 100644
--- a/tensorflow/cc/framework/testutil.h
+++ b/tensorflow/cc/framework/testutil.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
+#define TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -44,4 +44,4 @@ void GetTensor(const Scope& scope, const std::vector<Output>& assign_vars,
 }  // namespace test
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
diff --git a/tensorflow/cc/framework/while_gradients.h b/tensorflow/cc/framework/while_gradients.h
index 8f592accc9..cb4e579c85 100644
--- a/tensorflow/cc/framework/while_gradients.h
+++ b/tensorflow/cc/framework/while_gradients.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
+#define TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -37,4 +37,4 @@ Status AddWhileLoopGradient(WhileContext* while_ctx, const Scope& scope,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
diff --git a/tensorflow/cc/gradients/grad_testutil.h b/tensorflow/cc/gradients/grad_testutil.h
index d31f412754..70c81f1a73 100644
--- a/tensorflow/cc/gradients/grad_testutil.h
+++ b/tensorflow/cc/gradients/grad_testutil.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
+#ifndef TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
+#define TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -32,4 +32,4 @@ Status CallGradFunction(const Scope& scope, const Operation& op,
 }  // namespace test
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
+#endif  // TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
diff --git a/tensorflow/cc/ops/const_op.h b/tensorflow/cc/ops/const_op.h
index d11fda475b..424a683665 100644
--- a/tensorflow/cc/ops/const_op.h
+++ b/tensorflow/cc/ops/const_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_OPS_CONST_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CC_OPS_CONST_OP_H_
+#ifndef TENSORFLOW_CC_OPS_CONST_OP_H_
+#define TENSORFLOW_CC_OPS_CONST_OP_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -82,4 +82,4 @@ std::vector<NodeBuilder::NodeOut> AsNodeOutList(const Scope& scope,
 }  // namespace ops
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_OPS_CONST_OP_H_
+#endif  // TENSORFLOW_CC_OPS_CONST_OP_H_
diff --git a/tensorflow/cc/ops/standard_ops.h b/tensorflow/cc/ops/standard_ops.h
index 0c021f0b3a..98f53010ec 100644
--- a/tensorflow/cc/ops/standard_ops.h
+++ b/tensorflow/cc/ops/standard_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_
+#ifndef TENSORFLOW_CC_OPS_STANDARD_OPS_H_
+#define TENSORFLOW_CC_OPS_STANDARD_OPS_H_
 
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/candidate_sampling_ops.h"
@@ -37,4 +37,4 @@ limitations under the License.
 #include "tensorflow/cc/ops/training_ops.h"
 #include "tensorflow/cc/ops/user_ops.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_
+#endif  // TENSORFLOW_CC_OPS_STANDARD_OPS_H_
diff --git a/tensorflow/cc/ops/while_loop.h b/tensorflow/cc/ops/while_loop.h
index a04476056a..727237b5c7 100644
--- a/tensorflow/cc/ops/while_loop.h
+++ b/tensorflow/cc/ops/while_loop.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_OPS_WHILE_LOOP_H_
-#define THIRD_PARTY_TENSORFLOW_CC_OPS_WHILE_LOOP_H_
+#ifndef TENSORFLOW_CC_OPS_WHILE_LOOP_H_
+#define TENSORFLOW_CC_OPS_WHILE_LOOP_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -71,4 +71,4 @@ Status BuildWhileLoop(const Scope& scope, const std::vector<Output>& inputs,
 }  // namespace ops
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_OPS_WHILE_LOOP_H_
+#endif  // TENSORFLOW_CC_OPS_WHILE_LOOP_H_
diff --git a/tensorflow/cc/profiler/profiler.h b/tensorflow/cc/profiler/profiler.h
index e1ce315d3c..6077c45c58 100644
--- a/tensorflow/cc/profiler/profiler.h
+++ b/tensorflow/cc/profiler/profiler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_PROFILER_PROFILER_H_
-#define THIRD_PARTY_TENSORFLOW_CC_PROFILER_PROFILER_H_
+#ifndef TENSORFLOW_CC_PROFILER_PROFILER_H_
+#define TENSORFLOW_CC_PROFILER_PROFILER_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -94,4 +94,4 @@ class Profiler {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_PROFILER_PROFILER_H_
+#endif  // TENSORFLOW_CC_PROFILER_PROFILER_H_
diff --git a/tensorflow/cc/saved_model/constants.h b/tensorflow/cc/saved_model/constants.h
index c940df8a87..645a3f101d 100644
--- a/tensorflow/cc/saved_model/constants.h
+++ b/tensorflow/cc/saved_model/constants.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
+#ifndef TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
+#define TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
 
 namespace tensorflow {
 
@@ -47,4 +47,4 @@ constexpr char kSavedModelVariablesFilename[] = "variables";
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
+#endif  // TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
diff --git a/tensorflow/cc/saved_model/loader.h b/tensorflow/cc/saved_model/loader.h
index 3d634dd515..a8e098fa54 100644
--- a/tensorflow/cc/saved_model/loader.h
+++ b/tensorflow/cc/saved_model/loader.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 /// SavedModel loading functions and SavedModelBundle struct.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
-#define THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
+#ifndef TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
+#define TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
 
 #include <string>
 #include <unordered_set>
@@ -61,4 +61,4 @@ bool MaybeSavedModelDirectory(const string& export_dir);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
+#endif  // TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
diff --git a/tensorflow/cc/saved_model/signature_constants.h b/tensorflow/cc/saved_model/signature_constants.h
index b2d39bd55b..7d8c07f5cf 100644
--- a/tensorflow/cc/saved_model/signature_constants.h
+++ b/tensorflow/cc/saved_model/signature_constants.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
+#ifndef TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
+#define TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
 
 namespace tensorflow {
 
@@ -66,4 +66,4 @@ static constexpr char kRegressOutputs[] = "outputs";
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
+#endif  // TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
diff --git a/tensorflow/cc/saved_model/tag_constants.h b/tensorflow/cc/saved_model/tag_constants.h
index b71cb263ca..68a090e0c4 100644
--- a/tensorflow/cc/saved_model/tag_constants.h
+++ b/tensorflow/cc/saved_model/tag_constants.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
+#ifndef TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
+#define TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
 
 namespace tensorflow {
 
@@ -32,4 +32,4 @@ constexpr char kSavedModelTagTrain[] = "train";
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
+#endif  // TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
diff --git a/tensorflow/cc/tools/freeze_saved_model.h b/tensorflow/cc/tools/freeze_saved_model.h
index bd5e0516c8..b10f29805a 100644
--- a/tensorflow/cc/tools/freeze_saved_model.h
+++ b/tensorflow/cc/tools/freeze_saved_model.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_
-#define THIRD_PARTY_TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_
+#ifndef TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_
+#define TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_
 
 #include <unordered_set>
 
@@ -40,4 +40,4 @@ Status FreezeSavedModel(const SavedModelBundle& saved_model_bundle,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_
+#endif  // TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_
diff --git a/tensorflow/cc/training/coordinator.h b/tensorflow/cc/training/coordinator.h
index 0e01b19cd9..7168b77525 100644
--- a/tensorflow/cc/training/coordinator.h
+++ b/tensorflow/cc/training/coordinator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_TRAINING_COORDINATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CC_TRAINING_COORDINATOR_H_
+#ifndef TENSORFLOW_CC_TRAINING_COORDINATOR_H_
+#define TENSORFLOW_CC_TRAINING_COORDINATOR_H_
 
 #include <atomic>
 #include <memory>
@@ -128,4 +128,4 @@ class Coordinator {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_TRAINING_COORDINATOR_H_
+#endif  // TENSORFLOW_CC_TRAINING_COORDINATOR_H_
diff --git a/tensorflow/cc/training/queue_runner.h b/tensorflow/cc/training/queue_runner.h
index 2d34500323..21189b4b04 100644
--- a/tensorflow/cc/training/queue_runner.h
+++ b/tensorflow/cc/training/queue_runner.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
-#define THIRD_PARTY_TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
+#ifndef TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
+#define TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
 
 #include <memory>
 #include <string>
@@ -137,4 +137,4 @@ class QueueRunner : public RunnerInterface {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
+#endif  // TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.h b/tensorflow/compiler/tf2xla/kernels/shape_util.h
index 575086e118..ca57be3d47 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_util.h
+++ b/tensorflow/compiler/tf2xla/kernels/shape_util.h
@@ -31,4 +31,4 @@ Status TensorShapeToConstant(const TensorShape& input_shape,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_
+#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_
diff --git a/tensorflow/compiler/xla/execution_options_util.h b/tensorflow/compiler/xla/execution_options_util.h
index 562da78e83..a8ca27ec8d 100644
--- a/tensorflow/compiler/xla/execution_options_util.h
+++ b/tensorflow/compiler/xla/execution_options_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
+#ifndef TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
 
 #include "tensorflow/compiler/xla/xla.pb.h"
 
@@ -26,4 +26,4 @@ ExecutionOptions CreateDefaultExecutionOptions();
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
+#endif  // TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
diff --git a/tensorflow/compiler/xla/iterator_util.h b/tensorflow/compiler/xla/iterator_util.h
index a39999705e..a8bb8c7a7e 100644
--- a/tensorflow/compiler/xla/iterator_util.h
+++ b/tensorflow/compiler/xla/iterator_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
 
 #include <iterator>
 #include <utility>
@@ -95,4 +95,4 @@ UnwrappingIterator<NestedIter> MakeUnwrappingIterator(NestedIter iter) {
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h
index d0ef8e66ab..b53157f59c 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
+#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
 
 #include <vector>
 
@@ -35,4 +35,4 @@ xla::DebugOptions GetDebugOptionsFromFlags();
 }  // namespace legacy_flags
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
index 0c238e6a5d..e9cf435d83 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
+#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
 
 #include <vector>
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -148,4 +148,4 @@ inline bool parse_xla_reduce_precision_option(
 }  // namespace legacy_flags
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h
index 2271af7b24..2924b63659 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_HLO_SUPPORT_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_HLO_SUPPORT_CHECKER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_HLO_SUPPORT_CHECKER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_HLO_SUPPORT_CHECKER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
@@ -39,4 +39,4 @@ class CpuHloSupportChecker : public HloPassInterface {
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_HLO_SUPPORT_CHECKER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_HLO_SUPPORT_CHECKER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h b/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h
index 2994642356..664125ecc9 100644
--- a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h
+++ b/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
 
 // This file is depended on by kernels that have to build for mobile devices.
 // For this reason, we avoid relying on TensorFlow and instead only use the
@@ -71,4 +71,4 @@ class RegisterCustomCallTarget {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
diff --git a/tensorflow/compiler/xla/service/cpu/external_constant_pool.h b/tensorflow/compiler/xla/service/cpu/external_constant_pool.h
index 9c00d476b1..8008a56df4 100644
--- a/tensorflow/compiler/xla/service/cpu/external_constant_pool.h
+++ b/tensorflow/compiler/xla/service/cpu/external_constant_pool.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
 
 #include <memory>
 
@@ -62,4 +62,4 @@ class ExternalConstantPool {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.h b/tensorflow/compiler/xla/service/cpu/ir_function.h
index 1fd2da4dce..557aa4a6bf 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
 
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -131,4 +131,4 @@ Status EmitCallToParallelForkJoin(
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h b/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h
index 2d29550fd5..f896384115 100644
--- a/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h
+++ b/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
 
 #include <memory>
 
@@ -53,4 +53,4 @@ class Registrar {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
index 9335d2818e..ce92e36a94 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
 
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
@@ -70,4 +70,4 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
index 5801ec8d27..7140dabe51 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
 
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -99,4 +99,4 @@ class ParallelTaskAssigner : public HloPassInterface {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h
index fcf1cc6207..1cf0ec6e3d 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -30,4 +30,4 @@ extern void __xla_cpu_runtime_ParallelForkJoin(
 
 }  // extern "C"
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matvec.h b/tensorflow/compiler/xla/service/cpu/runtime_matvec.h
index cb7e0a81f0..1bd8dfb377 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_matvec.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matvec.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -42,4 +42,4 @@ void EigenMatVecF64(double* out, double* lhs, double* rhs, tensorflow::int64 m,
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.h b/tensorflow/compiler/xla/service/cpu/shape_partition.h
index 7a2d00421c..33d02b70e6 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition.h
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
 
 #include <vector>
 
@@ -102,4 +102,4 @@ class ShapePartitionIterator {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.h b/tensorflow/compiler/xla/service/dot_decomposer.h
index 5ff0ab34ea..1959b687f1 100644
--- a/tensorflow/compiler/xla/service/dot_decomposer.h
+++ b/tensorflow/compiler/xla/service/dot_decomposer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
@@ -41,4 +41,4 @@ class DotDecomposer : public HloPassInterface {
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h
index 525a2af941..832494d17e 100644
--- a/tensorflow/compiler/xla/service/gpu/for_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
 
 #include <vector>
 
@@ -49,4 +49,4 @@ class ForThunk : public Thunk {
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.h b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
index bd720f8584..4c523a66de 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.h
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
@@ -44,4 +44,4 @@ class FusionMerger : public HloPassInterface {
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_constants.h b/tensorflow/compiler/xla/service/gpu/gpu_constants.h
index 572c856282..eb1ca4c6c9 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_constants.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_constants.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONSTANTS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONSTANTS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONSTANTS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONSTANTS_H_
 
 #include "tensorflow/compiler/xla/types.h"
 
@@ -28,4 +28,4 @@ extern const int64 kCudaMallocAlignBytes;
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONSTANTS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONSTANTS_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h
index d9550f81b5..d63e213d2b 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SUPPORT_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SUPPORT_CHECKER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SUPPORT_CHECKER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SUPPORT_CHECKER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
@@ -39,4 +39,4 @@ class GpuHloSupportChecker : public HloPassInterface {
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SUPPORT_CHECKER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SUPPORT_CHECKER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.h b/tensorflow/compiler/xla/service/gpu/while_transformer.h
index a4f527fce0..fe3a954e18 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.h
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -40,4 +40,4 @@ StatusOr<std::tuple<int64, int64, int64>> CanTransformWhileToFor(
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 02bb8b0a47..3b2b697e49 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
 
 #include <memory>
 
@@ -195,4 +195,4 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.h b/tensorflow/compiler/xla/service/hlo_profile_printer.h
index 2f056490ae..35152e744d 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.h
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
 
 #include <cstdint>
 #include <string>
@@ -100,4 +100,4 @@ class HloProfilePrinter {
 };
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
index 9aa3e501d5..c4876b852e 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
@@ -56,4 +56,4 @@ class HloTfGraphBuilder {
 }  // namespace hlo_graph_dumper
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 6368611f32..5a1d864e03 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
@@ -127,4 +127,4 @@ class HloVerifier : public HloPassInterface {
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 827e092a3f..1c00b2aabd 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
 
 #include <string>
 
@@ -179,4 +179,4 @@ class KernelSupportLibrary {
 };
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ops.h b/tensorflow/compiler/xla/service/llvm_ir/ops.h
index f72f482e31..175b081e84 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ops.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
 
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
@@ -90,4 +90,4 @@ Status EmitParallelFusedDynamicUpdateSliceInPlace(
 }  // namespace llvm_ir
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.h b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
index 598d08b720..f4c63dd86b 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.h
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
 
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -90,4 +90,4 @@ class LogicalBufferAnalysis : public DfsHloVisitorWithDefault {
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.h b/tensorflow/compiler/xla/service/while_loop_simplifier.h
index 50dac32a4a..d3d55634c9 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.h
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
@@ -41,4 +41,4 @@ class WhileLoopSimplifier : public HloPassInterface {
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
index 63afab4206..063e312df6 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_ZERO_SIZED_HLO_ELIMINATION_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_ZERO_SIZED_HLO_ELIMINATION_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ZERO_SIZED_HLO_ELIMINATION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_ZERO_SIZED_HLO_ELIMINATION_H_
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
@@ -29,4 +29,4 @@ class ZeroSizedHloElimination : public HloPassInterface {
   }
 };
 }  // namespace xla
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_ZERO_SIZED_HLO_ELIMINATION_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_ZERO_SIZED_HLO_ELIMINATION_H_
diff --git a/tensorflow/compiler/xla/sparse_index_array.h b/tensorflow/compiler/xla/sparse_index_array.h
index 903fee5255..f2ce22d672 100644
--- a/tensorflow/compiler/xla/sparse_index_array.h
+++ b/tensorflow/compiler/xla/sparse_index_array.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Utility class for managing sparse array indices.
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SPARSE_INDEX_ARRAY_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SPARSE_INDEX_ARRAY_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SPARSE_INDEX_ARRAY_H_
+#define TENSORFLOW_COMPILER_XLA_SPARSE_INDEX_ARRAY_H_
 
 #include <vector>
 
@@ -173,4 +173,4 @@ void SparseIndexArray::SortWithValues(
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SPARSE_INDEX_ARRAY_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SPARSE_INDEX_ARRAY_H_
diff --git a/tensorflow/compiler/xla/statusor_internals.h b/tensorflow/compiler/xla/statusor_internals.h
index a2fda5bb3c..14636bd144 100644
--- a/tensorflow/compiler/xla/statusor_internals.h
+++ b/tensorflow/compiler/xla/statusor_internals.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
+#define TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
 
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/core/platform/macros.h"
@@ -242,4 +242,4 @@ struct TraitsBase<false, false> {
 }  // namespace internal_statusor
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
diff --git a/tensorflow/compiler/xla/tests/filecheck.h b/tensorflow/compiler/xla/tests/filecheck.h
index 493ff7414b..3830d5a44d 100644
--- a/tensorflow/compiler/xla/tests/filecheck.h
+++ b/tensorflow/compiler/xla/tests/filecheck.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
+#ifndef TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
+#define TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
 
 #include <string>
 
@@ -30,4 +30,4 @@ StatusOr<bool> RunFileCheck(const string& input, const string& pattern);
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
+#endif  // TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
diff --git a/tensorflow/contrib/android/asset_manager_filesystem.h b/tensorflow/contrib/android/asset_manager_filesystem.h
index 2b43939f14..665304b5ee 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.h
+++ b/tensorflow/contrib/android/asset_manager_filesystem.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
+#ifndef TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
+#define TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
 
 #include <android/asset_manager.h>
 #include <android/asset_manager_jni.h>
@@ -79,4 +79,4 @@ class AssetManagerFileSystem : public FileSystem {
 };
 
 }  // namespace tensorflow
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
+#endif  // TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
diff --git a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
index 60861f83f4..86250e6692 100644
--- a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
+#ifndef TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
 
 #include "tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler.h b/tensorflow/contrib/batching/basic_batch_scheduler.h
index 63ba8fcf45..d9b37da693 100644
--- a/tensorflow/contrib/batching/basic_batch_scheduler.h
+++ b/tensorflow/contrib/batching/basic_batch_scheduler.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
+#ifndef TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
 
 #include "tensorflow/core/kernels/batching_util/basic_batch_scheduler.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/batch_scheduler.h b/tensorflow/contrib/batching/batch_scheduler.h
index 3afce2761f..8e94e1fd8b 100644
--- a/tensorflow/contrib/batching/batch_scheduler.h
+++ b/tensorflow/contrib/batching/batch_scheduler.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
+#ifndef TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
 
 #include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/shared_batch_scheduler.h b/tensorflow/contrib/batching/shared_batch_scheduler.h
index 7eb1e20c42..83a59695d7 100644
--- a/tensorflow/contrib/batching/shared_batch_scheduler.h
+++ b/tensorflow/contrib/batching/shared_batch_scheduler.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
+#ifndef TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
 
 #include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/test_util/fake_clock_env.h b/tensorflow/contrib/batching/test_util/fake_clock_env.h
index ced27a8833..40a39a5569 100644
--- a/tensorflow/contrib/batching/test_util/fake_clock_env.h
+++ b/tensorflow/contrib/batching/test_util/fake_clock_env.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
+#ifndef TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
+#define TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
 
 #include "tensorflow/core/kernels/batching_util/fake_clock_env.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
diff --git a/tensorflow/contrib/batching/util/periodic_function.h b/tensorflow/contrib/batching/util/periodic_function.h
index fb61bc2eea..aa2ed0a385 100644
--- a/tensorflow/contrib/batching/util/periodic_function.h
+++ b/tensorflow/contrib/batching/util/periodic_function.h
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
+#ifndef TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
+#define TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
 
 #include "tensorflow/core/kernels/batching_util/periodic_function.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h
index e1bef02788..3c54868951 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
 
 #include "tensorflow/core/lib/hash/hash.h"
 
@@ -58,4 +58,4 @@ struct ClassPartitionKey {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h
index 3814edb567..ec4e7c52bb 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
 
 #include <unordered_map>
 #include <vector>
@@ -79,4 +79,4 @@ class FeatureStatsAccumulator {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h b/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h
index aed0d9fdac..37a7103704 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
 
 #include <vector>
 #include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
@@ -50,4 +50,4 @@ class ExamplePartitioner {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
index 339c2e0fde..382b85cf0b 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 //
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
 
 #include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h"
 #include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
@@ -58,4 +58,4 @@ struct FeatureSplitCandidate {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
index 34e3ddb777..3dd03215d8 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
 
 #include <math.h>
 
@@ -190,4 +190,4 @@ inline GradientStats operator-(const GradientStats& a, const GradientStats& b) {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
index 642a183aec..cd925f6b65 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/Eigen/Eigenvalues"
@@ -298,4 +298,4 @@ struct NodeStats {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
index 054ccd9a8c..81ee2774bd 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
 
 #include <string>
 
@@ -81,4 +81,4 @@ struct SplitStats {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h
index ee29a8aa79..cc3dc226cd 100644
--- a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h
+++ b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
 
 #include <vector>
 
@@ -45,4 +45,4 @@ class MultipleAdditiveTrees {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
index 70037d5bd8..804b218f1c 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
 
 #include <algorithm>
 #include <unordered_map>
@@ -129,4 +129,4 @@ constexpr decltype(CompareFn())
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
index fd577ad712..1c4181f1b1 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
 
 #include <memory>
 #include <vector>
@@ -322,4 +322,4 @@ WeightedQuantilesStream<ValueType, WeightType, CompareFn>::GetQuantileSpecs(
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
index c329c6d4f7..aec232f3cb 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
 
 #include <cstring>
 #include <vector>
@@ -334,4 +334,4 @@ constexpr decltype(CompareFn())
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h b/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h
index d95878ec87..b98190b10d 100644
--- a/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
 
 #include "tensorflow/contrib/boosted_trees/lib/utils/batch_features.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -42,4 +42,4 @@ void RandomlyInitializeBatchFeatures(
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h
index 5e12429ba7..1838b4cee2 100644
--- a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
 
 #include <memory>
 
@@ -72,4 +72,4 @@ class RandomTreeGen {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h
index 604ff02744..43526c229a 100644
--- a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
 
 #include "tensorflow/contrib/boosted_trees/lib/utils/example.h"
 #include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"  // NOLINT
@@ -46,4 +46,4 @@ class DecisionTree {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
index badc629a11..da5e744851 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
 
 #include <vector>
 #include "tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h"
@@ -92,4 +92,4 @@ class BatchFeatures {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
index c3f1c918ca..928bfbfe5c 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
 
 #include <unordered_set>
 #include <vector>
@@ -74,4 +74,4 @@ class DropoutUtils {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/example.h b/tensorflow/contrib/boosted_trees/lib/utils/example.h
index 54f60e1dee..1371ff337f 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/example.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/example.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
 
 #include <algorithm>
 #include <unordered_set>
@@ -131,4 +131,4 @@ struct Example {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h
index 5b33c81588..1b654e1c44 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
 
 #include <vector>
 
@@ -205,4 +205,4 @@ class ExamplesIterable {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/macros.h b/tensorflow/contrib/boosted_trees/lib/utils/macros.h
index 28ea0a4dc1..9a53fb2ef7 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/macros.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/macros.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
 
 #include "tensorflow/core/platform/macros.h"
 
@@ -23,4 +23,4 @@
     return (STATUS);                              \
   }
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/optional_value.h b/tensorflow/contrib/boosted_trees/lib/utils/optional_value.h
index c141fe059d..b2166f53d7 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/optional_value.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/optional_value.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
 
 #include "tensorflow/core/platform/logging.h"
 
@@ -44,4 +44,4 @@ class OptionalValue {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h b/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h
index c80431b558..ec06787e1d 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
+#ifndef TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
+#define TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
 
 #include "tensorflow/core/lib/core/threadpool.h"
 
@@ -30,4 +30,4 @@ void ParallelFor(int64 batch_size, int64 desired_parallelism,
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
+#endif  // TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/random.h b/tensorflow/contrib/boosted_trees/lib/utils/random.h
index 6dd55fcacc..546d344f55 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/random.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/random.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
+#ifndef TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
+#define TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
 
 #include "tensorflow/core/lib/random/simple_philox.h"
 
@@ -36,4 +36,4 @@ inline int32 PoissonBootstrap(random::SimplePhilox* rng) {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
+#endif  // TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.h b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.h
index 9664c9d1c6..87fb1fbf5a 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -127,4 +127,4 @@ class SparseColumnIterable {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h b/tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h
index 58f5e5a0d1..475d3718ec 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -57,4 +57,4 @@ class TensorUtils {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
index ad9c8961aa..3ebf28ea44 100644
--- a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
 
 #include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
 #include "tensorflow/contrib/boosted_trees/resources/stamped_resource.h"
@@ -179,4 +179,4 @@ class DecisionTreeEnsembleResource : public StampedResource {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
diff --git a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
index c1fa35eaf1..fdaaae7f47 100644
--- a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
 
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h"
 #include "tensorflow/contrib/boosted_trees/proto/quantiles.pb.h"  // NOLINT
@@ -113,4 +113,4 @@ class QuantileStreamResource : public StampedResource {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
diff --git a/tensorflow/contrib/boosted_trees/resources/stamped_resource.h b/tensorflow/contrib/boosted_trees/resources/stamped_resource.h
index aabeeb9851..957bbe8d61 100644
--- a/tensorflow/contrib/boosted_trees/resources/stamped_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/stamped_resource.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
 
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -39,4 +39,4 @@ class StampedResource : public ResourceBase {
 
 }  // namespace boosted_trees
 }  // namespace tensorflow
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h
index 7d0eee59ae..b349063715 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
+#define TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
 
 #include <map>
 #include <memory>
@@ -205,4 +205,4 @@ class BigQueryTableAccessor {
 };
 
 }  // namespace tensorflow
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
index b2b11f4f57..59f2333298 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
+#define TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
 
 #include <string>
 
@@ -401,4 +401,4 @@ const string kTestEmptyRow = R"({
 }  // namespace
 }  // namepsace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
diff --git a/tensorflow/contrib/coder/kernels/range_coder.h b/tensorflow/contrib/coder/kernels/range_coder.h
index c24fb707fc..f46413072e 100644
--- a/tensorflow/contrib/coder/kernels/range_coder.h
+++ b/tensorflow/contrib/coder/kernels/range_coder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_H_
+#ifndef TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_H_
+#define TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_H_
 
 #include <limits>
 #include <string>
@@ -106,4 +106,4 @@ class RangeDecoder {
 };
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_H_
+#endif  // TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_H_
diff --git a/tensorflow/contrib/coder/kernels/range_coder_ops_util.h b/tensorflow/contrib/coder/kernels/range_coder_ops_util.h
index 95241a8682..b8aabcef62 100644
--- a/tensorflow/contrib/coder/kernels/range_coder_ops_util.h
+++ b/tensorflow/contrib/coder/kernels/range_coder_ops_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_OPS_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_OPS_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_OPS_UTIL_H_
+#define TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_OPS_UTIL_H_
 
 #include <vector>
 
@@ -30,4 +30,4 @@ Status MergeAxes(const TensorShape& broadcast_shape,
                  std::vector<int64>* merged_storage_shape_pointer);
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_OPS_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_OPS_UTIL_H_
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
index bc1733ead8..a8d5a0dd83 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_FFMPEG_FFMPEG_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_FFMPEG_FFMPEG_LIB_H_
+#ifndef TENSORFLOW_CONTRIB_FFMPEG_FFMPEG_LIB_H_
+#define TENSORFLOW_CONTRIB_FFMPEG_FFMPEG_LIB_H_
 
 #include <string>
 #include <vector>
@@ -61,4 +61,4 @@ Status ReadVideoFile(const string& filename, std::vector<uint8>* output_data,
 }  // namespace ffmpeg
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_FFMPEG_DEFAULT_FFMPEG_LIB_H_
+#endif  // TENSORFLOW_CONTRIB_FFMPEG_DEFAULT_FFMPEG_LIB_H_
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
index fa7a3c03aa..ba52697679 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
+#ifndef TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
+#define TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
 
 #if GOOGLE_CUDA
 
@@ -72,4 +72,4 @@ class FusedConvParameters : public ConvParameters {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
+#endif  // TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h
index 194ae2ba47..8968da6d82 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h
@@ -11,8 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
+#ifndef TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
+#define TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
 
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
@@ -84,4 +84,4 @@ struct AdjustHsvInYiqGPU {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
+#endif  // TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
diff --git a/tensorflow/contrib/lite/allocation.h b/tensorflow/contrib/lite/allocation.h
index ee8a7ccd0b..68aee2e644 100644
--- a/tensorflow/contrib/lite/allocation.h
+++ b/tensorflow/contrib/lite/allocation.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Main abstraction controlling the tflite interpreter.
 // See context.h for the API for defining operations (TfLiteRegistration).
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
+#define TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
 
 #include <cstdio>
 #include <cstdlib>
@@ -91,4 +91,4 @@ class MemoryAllocation : public Allocation {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
diff --git a/tensorflow/contrib/lite/arena_planner.h b/tensorflow/contrib/lite/arena_planner.h
index bd87414ec3..58bc164619 100644
--- a/tensorflow/contrib/lite/arena_planner.h
+++ b/tensorflow/contrib/lite/arena_planner.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
+#define TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
 
 #include <memory>
 #include <vector>
@@ -104,4 +104,4 @@ class ArenaPlanner : public MemoryPlanner {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 8269de8e3f..0b48ef4741 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
+#define TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
 
 #include <stdint.h>
 
@@ -239,4 +239,4 @@ typedef struct {
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index fca7116503..d6dfc20ae8 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -26,8 +26,8 @@ limitations under the License.
 // TfLiteRegistration - the implementation of a conceptual operation.
 //
 // Some abstractions in this file are created and managed by Interpreter.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
+#define TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -296,4 +296,4 @@ typedef struct {
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
diff --git a/tensorflow/contrib/lite/error_reporter.h b/tensorflow/contrib/lite/error_reporter.h
index d5715e4f90..da193d2586 100644
--- a/tensorflow/contrib/lite/error_reporter.h
+++ b/tensorflow/contrib/lite/error_reporter.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
+#define TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
 
 #include <cstdarg>
 #include "tensorflow/contrib/lite/context.h"
@@ -51,4 +51,4 @@ ErrorReporter* DefaultErrorReporter();
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
diff --git a/tensorflow/contrib/lite/graph_info.h b/tensorflow/contrib/lite/graph_info.h
index 5481aede60..57690058c4 100644
--- a/tensorflow/contrib/lite/graph_info.h
+++ b/tensorflow/contrib/lite/graph_info.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
+#define TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
 
 #include <vector>
 
@@ -50,4 +50,4 @@ class GraphInfo {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 38dd402e8a..4f732769f9 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Main abstraction controlling the tflite interpreter.
 // See context.h for the API for defining operations (TfLiteRegistration).
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
+#define TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
 
 #include <cstdio>
 #include <cstdlib>
@@ -363,4 +363,4 @@ class Interpreter {
 };
 
 }  // namespace tflite
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
diff --git a/tensorflow/contrib/lite/kernels/activation_functor.h b/tensorflow/contrib/lite/kernels/activation_functor.h
index cfb3369e99..41ec3cca33 100644
--- a/tensorflow/contrib/lite/kernels/activation_functor.h
+++ b/tensorflow/contrib/lite/kernels/activation_functor.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
 
 #include <algorithm>
 #include <cmath>
@@ -55,4 +55,4 @@ class ActivationFunctor {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.h b/tensorflow/contrib/lite/kernels/gemm_support.h
index b531959ffb..466781cbce 100644
--- a/tensorflow/contrib/lite/kernels/gemm_support.h
+++ b/tensorflow/contrib/lite/kernels/gemm_support.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
 
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/context.h"
@@ -51,4 +51,4 @@ void SetMaxNumThreads(TfLiteContext* context, int num_threads);
 }  // namespace gemm_support
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/common.h b/tensorflow/contrib/lite/kernels/internal/common.h
index 28f19a2506..fdeacedace 100644
--- a/tensorflow/contrib/lite/kernels/internal/common.h
+++ b/tensorflow/contrib/lite/kernels/internal/common.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
 
 #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
 #ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
@@ -104,4 +104,4 @@ inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
index 796a03566a..1d963afb7e 100644
--- a/tensorflow/contrib/lite/kernels/internal/compatibility.h
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
 
 #include <cassert>
 #include <cstdint>
@@ -75,4 +75,4 @@ using uint16 = std::uint16_t;
 using int32 = std::int32_t;
 using uint32 = std::uint32_t;
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
index da34c8aef9..81796e295d 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
 
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
@@ -1057,4 +1057,4 @@ void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
 }  // namespace optimized_ops
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index 051ed2a2c4..f993fd6a00 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
 
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
@@ -1913,4 +1913,4 @@ void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
 }  // namespace optimized_ops
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
index 8004c24a99..f21fbf532a 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
@@ -16,8 +16,8 @@ limitations under the License.
 // Copied from tensorflow/core/kernels/eigen_spatial_convolutions.h.
 // TODO(petewarden) - move this to a common location in Eigen itself.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
 
 #define EIGEN_USE_CUSTOM_THREAD_POOL
 #define EIGEN_USE_THREADS
@@ -228,4 +228,4 @@ EIGEN_DEVICE_FUNC
 
 // clang-format on
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
index 7f78f69360..d85e06a5d5 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
 
 #define EIGEN_USE_CUSTOM_THREAD_POOL
 #define EIGEN_USE_THREADS
@@ -140,4 +140,4 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
 
 #include "Eigen/src/Core/util/ReenableStupidWarnings.h"
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_H
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_H
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
index 1d5c316194..d34708b8fd 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
@@ -19,8 +19,8 @@ limitations under the License.
 // clang-format off
 
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
 
 
 #include "Eigen/Core"
@@ -164,4 +164,4 @@ typedef unsigned __int64 uint64_t;
 #include "Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
index b3615f4658..0bfb4e9b1f 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
 
 #include <assert.h>
 #include <stdint.h>
@@ -192,4 +192,4 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims,
 }  // namespace multithreaded_ops
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
index 3a4af87304..b7e317dc60 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
 
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
@@ -110,4 +110,4 @@ void ReductionSumVector(const float* input_vector, float* output_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 1b1f455855..c35b9da938 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
 
 #include <assert.h>
 #include <stdint.h>
@@ -3805,4 +3805,4 @@ void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
 #pragma GCC diagnostic pop
 #endif
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
index 8e0f234545..9aabee5000 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
 
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
@@ -112,4 +112,4 @@ void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
 }  // end namespace reference_ops
 }  // end namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
index 8a80558b32..e9b6baeaee 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
 
 #include <algorithm>
 
@@ -135,4 +135,4 @@ void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
 }  // end namespace reference_ops
 }  // end namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
index 7f90d731b8..afc3e26e79 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
 
 // TDOD(ghodrat): Remove this header file and the dependency to internal data
 // structure.
@@ -186,4 +186,4 @@ void ReductionSumVector(const float* input_vector, float* output_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 03708eb32b..64651d8348 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
 
 #include <stdint.h>
 #include <sys/types.h>
@@ -2655,4 +2655,4 @@ void Transpose(const T* input, const Dims<4>& input_dims, T* output,
 }  // namespace reference_ops
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/round.h b/tensorflow/contrib/lite/kernels/internal/round.h
index 38525b0e20..f299d0bd87 100644
--- a/tensorflow/contrib/lite/kernels/internal/round.h
+++ b/tensorflow/contrib/lite/kernels/internal/round.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
 
 #include <cmath>
 
@@ -36,4 +36,4 @@ inline T TfLiteRound(const T x) {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
index 1961e1a2d5..dfe76c2afd 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
 
 #include <vector>
 #include "tensorflow/contrib/lite/context.h"
@@ -83,4 +83,4 @@ inline Dims<4> GetTensorDims(const TfLiteTensor* tensor) {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
index e7e2994397..40d144979b 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 
@@ -113,4 +113,4 @@ void ReductionSumVector(const float* input_vector, float* output_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 5989ac8fcd..afe131b06e 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
 
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
 
@@ -134,4 +134,4 @@ bool IsPackedWithoutStrides(const Dims<N>& dims) {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
index 25556ae456..1cf30ecff9 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
@@ -62,4 +62,4 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/op_macros.h b/tensorflow/contrib/lite/kernels/op_macros.h
index 63670efcb1..7568eaa88e 100644
--- a/tensorflow/contrib/lite/kernels/op_macros.h
+++ b/tensorflow/contrib/lite/kernels/op_macros.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
 
 #include <cstdio>
 
@@ -31,4 +31,4 @@ limitations under the License.
     if ((x) != (y)) TF_LITE_FATAL(#x " didn't equal " #y); \
   } while (0)
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/padding.h b/tensorflow/contrib/lite/kernels/padding.h
index 3a60274524..40b8476b37 100644
--- a/tensorflow/contrib/lite/kernels/padding.h
+++ b/tensorflow/contrib/lite/kernels/padding.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
 
 namespace tflite {
 
@@ -25,4 +25,4 @@ inline int ComputePadding(int stride, int in_size, int filter_size,
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
diff --git a/tensorflow/contrib/lite/kernels/register.h b/tensorflow/contrib/lite/kernels/register.h
index 28f5e0fcc8..b9cff0ae21 100644
--- a/tensorflow/contrib/lite/kernels/register.h
+++ b/tensorflow/contrib/lite/kernels/register.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
 
 #include <unordered_map>
 #include "tensorflow/contrib/lite/context.h"
@@ -47,4 +47,4 @@ class BuiltinOpResolver : public OpResolver {
 }  // namespace ops
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_BUILTIN_KERNELS_H
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_BUILTIN_KERNELS_H
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
index b9c0ba8f47..cc445299ff 100644
--- a/tensorflow/contrib/lite/kernels/test_util.h
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
 
 #include <vector>
 
@@ -202,4 +202,4 @@ template <>
 std::vector<string> SingleOpModel::ExtractVector(int index);
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
diff --git a/tensorflow/contrib/lite/memory_planner.h b/tensorflow/contrib/lite/memory_planner.h
index b11d86c375..5cd6c20850 100644
--- a/tensorflow/contrib/lite/memory_planner.h
+++ b/tensorflow/contrib/lite/memory_planner.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
+#define TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
 
 #include "tensorflow/contrib/lite/context.h"
 
@@ -42,4 +42,4 @@ class MemoryPlanner {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
index e0c96f7f04..a467df5bb4 100644
--- a/tensorflow/contrib/lite/model.h
+++ b/tensorflow/contrib/lite/model.h
@@ -31,8 +31,8 @@ limitations under the License.
 // OpResolver must be defined to provide your kernel implementations to the
 // interpreter. This is environment specific and may consist of just the builtin
 // ops, or some custom operators you defined to extend tflite.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODEL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODEL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_MODEL_H_
+#define TENSORFLOW_CONTRIB_LITE_MODEL_H_
 
 #include <memory>
 #include "tensorflow/contrib/lite/error_reporter.h"
@@ -173,4 +173,4 @@ class InterpreterBuilder {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODEL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_MODEL_H_
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor.h b/tensorflow/contrib/lite/models/smartreply/predictor.h
index d17323a3f9..90260c8d62 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor.h
+++ b/tensorflow/contrib/lite/models/smartreply/predictor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
+#define TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
 
 #include <string>
 #include <vector>
@@ -77,4 +77,4 @@ struct SmartReplyConfig {
 }  // namespace custom
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
diff --git a/tensorflow/contrib/lite/nnapi_delegate.h b/tensorflow/contrib/lite/nnapi_delegate.h
index f29aa9e18e..e98000929a 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.h
+++ b/tensorflow/contrib/lite/nnapi_delegate.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
+#define TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
 
 #include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/context.h"
@@ -63,4 +63,4 @@ class NNAPIDelegate {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
diff --git a/tensorflow/contrib/lite/optional_debug_tools.h b/tensorflow/contrib/lite/optional_debug_tools.h
index 54d4876095..1b6998cda3 100644
--- a/tensorflow/contrib/lite/optional_debug_tools.h
+++ b/tensorflow/contrib/lite/optional_debug_tools.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Optional debugging functionality. For small sized binaries, these are not
 // needed.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
+#define TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
 
 #include "tensorflow/contrib/lite/interpreter.h"
 
@@ -29,4 +29,4 @@ TfLiteStatus ValidateInterpreterState(const Interpreter* interpreter);
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
diff --git a/tensorflow/contrib/lite/simple_memory_arena.h b/tensorflow/contrib/lite/simple_memory_arena.h
index 07a38c4243..0c5e00a1f2 100644
--- a/tensorflow/contrib/lite/simple_memory_arena.h
+++ b/tensorflow/contrib/lite/simple_memory_arena.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
+#define TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
 
 #include <list>
 #include <memory>
@@ -85,4 +85,4 @@ class SimpleMemoryArena {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
diff --git a/tensorflow/contrib/lite/string_util.h b/tensorflow/contrib/lite/string_util.h
index 12872d1123..c35a2fff3c 100644
--- a/tensorflow/contrib/lite/string_util.h
+++ b/tensorflow/contrib/lite/string_util.h
@@ -37,8 +37,8 @@ limitations under the License.
 //   # described above.
 //   buf.WriteToTensor(tensor)
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
 
 #include <vector>
 
@@ -88,4 +88,4 @@ int GetStringCount(const TfLiteTensor* tensor);
 StringRef GetString(const TfLiteTensor* tensor, int string_index);
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
diff --git a/tensorflow/contrib/lite/testing/message.h b/tensorflow/contrib/lite/testing/message.h
index 78ef7e2cbe..e2bc408214 100644
--- a/tensorflow/contrib/lite/testing/message.h
+++ b/tensorflow/contrib/lite/testing/message.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
 
 #include <memory>
 #include <string>
@@ -79,4 +79,4 @@ class Message {
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
diff --git a/tensorflow/contrib/lite/testing/parse_testdata.h b/tensorflow/contrib/lite/testing/parse_testdata.h
index 90839fe245..7ebf362eb9 100644
--- a/tensorflow/contrib/lite/testing/parse_testdata.h
+++ b/tensorflow/contrib/lite/testing/parse_testdata.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
+#define TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
 
 #include <vector>
 #include "tensorflow/contrib/lite/interpreter.h"
@@ -71,4 +71,4 @@ bool ParseAndRunTests(std::istream* input, TestRunner* test_runner);
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
diff --git a/tensorflow/contrib/lite/testing/split.h b/tensorflow/contrib/lite/testing/split.h
index cfc1e929e9..428cfda4f2 100644
--- a/tensorflow/contrib/lite/testing/split.h
+++ b/tensorflow/contrib/lite/testing/split.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
 
 #include <cstdlib>
 #include <string>
@@ -83,4 +83,4 @@ inline std::vector<uint8_t> Split(const string& s, const string& delimiter) {
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
diff --git a/tensorflow/contrib/lite/testing/test_runner.h b/tensorflow/contrib/lite/testing/test_runner.h
index f4b26949b5..60eaafa474 100644
--- a/tensorflow/contrib/lite/testing/test_runner.h
+++ b/tensorflow/contrib/lite/testing/test_runner.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
 
 #include <memory>
 #include <string>
@@ -121,4 +121,4 @@ class TestRunner {
 
 }  // namespace testing
 }  // namespace tflite
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.h b/tensorflow/contrib/lite/testing/tflite_driver.h
index 4440d4285e..25689a9fb4 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.h
+++ b/tensorflow/contrib/lite/testing/tflite_driver.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
 
 #include <map>
 
@@ -59,4 +59,4 @@ class TfLiteDriver : public TestRunner {
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
diff --git a/tensorflow/contrib/lite/testing/tokenize.h b/tensorflow/contrib/lite/testing/tokenize.h
index daccf0e84a..7ed8eb96b7 100644
--- a/tensorflow/contrib/lite/testing/tokenize.h
+++ b/tensorflow/contrib/lite/testing/tokenize.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
 
 #include <istream>
 #include <string>
@@ -39,4 +39,4 @@ void Tokenize(std::istream* input, TokenProcessor* processor);
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
diff --git a/tensorflow/contrib/lite/testing/util.h b/tensorflow/contrib/lite/testing/util.h
index 4d4304f022..6d20aec141 100644
--- a/tensorflow/contrib/lite/testing/util.h
+++ b/tensorflow/contrib/lite/testing/util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
 
 namespace tflite {
 
@@ -25,4 +25,4 @@ inline void LogToStderr() {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.h b/tensorflow/contrib/lite/toco/allocate_transient_arrays.h
index 12d0d0498f..59d8ada1e9 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.h
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
 
 #include "tensorflow/contrib/lite/toco/model.h"
 
@@ -41,4 +41,4 @@ void AllocateTransientArrays(Model* model,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index e988cbda33..8004a1a37a 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -15,8 +15,8 @@ limitations under the License.
 // This abstracts command line arguments in toco.
 // Arg<T> is a parseable type that can register a default value, be able to
 // parse itself, and keep track of whether it was specified.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
 
 #include <functional>
 #include <unordered_map>
@@ -232,4 +232,4 @@ struct ParsedTocoFlags {
 };
 
 }  // namespace toco
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.h b/tensorflow/contrib/lite/toco/dump_graphviz.h
index 0fb28e3de8..ea5a4031c3 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.h
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
 
 #include <string>
 
@@ -25,4 +25,4 @@ void DumpGraphviz(const Model& model, string* output_file_contents);
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.h b/tensorflow/contrib/lite/toco/export_tensorflow.h
index eca9774576..79682153a8 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.h
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
 
 #include <string>
 #include "tensorflow/contrib/lite/toco/model.h"
@@ -24,4 +24,4 @@ void ExportTensorFlowGraphDef(const Model& model, string* output_file_contents);
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
diff --git a/tensorflow/contrib/lite/toco/format_port.h b/tensorflow/contrib/lite/toco/format_port.h
index 0e999001e0..eb81e90faf 100644
--- a/tensorflow/contrib/lite/toco/format_port.h
+++ b/tensorflow/contrib/lite/toco/format_port.h
@@ -16,8 +16,8 @@ limitations under the License.
 // and util::format::AppendF. Unfortunately, type safety is not as good as a
 // a full C++ example.
 // TODO(aselle): When absl adds support for StrFormat, use that instead.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
 
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -74,4 +74,4 @@ inline string StringF(const char* fmt, Args&&... args) {
 }  // namespace port
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index d6950cf987..4ac2265be9 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
 
 #include <cstddef>
 #include <initializer_list>
@@ -193,4 +193,4 @@ class RemoveTrivialReshape : public GraphTransformation {
 
 }  // end namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
index a06181ca0b..9d448c3ee9 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
@@ -54,4 +54,4 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.h b/tensorflow/contrib/lite/toco/import_tensorflow.h
index 312e3b8f17..2177872334 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.h
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
 
 #include <memory>
 #include <string>
@@ -39,4 +39,4 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 3cda63a8ce..b1b9b718bb 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
 
 #include <initializer_list>
 #include <memory>
@@ -1591,4 +1591,4 @@ class Model {
 };
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.h b/tensorflow/contrib/lite/toco/model_cmdline_flags.h
index 027d7ae1aa..c868d5c7d0 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.h
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
 
 #include <string>
 #include <unordered_map>
@@ -40,5 +40,4 @@ ParsedModelFlags* GlobalParsedModelFlags();
 
 }  // namespace toco
 
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
diff --git a/tensorflow/contrib/lite/toco/runtime/common.h b/tensorflow/contrib/lite/toco/runtime/common.h
index bd55544f57..3c6828840c 100644
--- a/tensorflow/contrib/lite/toco/runtime/common.h
+++ b/tensorflow/contrib/lite/toco/runtime/common.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
 
 #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
 #ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
@@ -23,4 +23,4 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
diff --git a/tensorflow/contrib/lite/toco/runtime/types.h b/tensorflow/contrib/lite/toco/runtime/types.h
index df63b2d59e..f5de5a5781 100644
--- a/tensorflow/contrib/lite/toco/runtime/types.h
+++ b/tensorflow/contrib/lite/toco/runtime/types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
 
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
@@ -29,4 +29,4 @@ using tflite::RequiredBufferSizeForDims;
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_util.h b/tensorflow/contrib/lite/toco/tensorflow_util.h
index 152b4f7a72..61f9104268 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_util.h
+++ b/tensorflow/contrib/lite/toco/tensorflow_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
 
 #include <string>
 #include <vector>
@@ -29,4 +29,4 @@ void LogDumpGraphDef(int log_level, const string& message,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/builtin_operator.h b/tensorflow/contrib/lite/toco/tflite/builtin_operator.h
index 93cc79ddb6..cfe7ecd9f9 100644
--- a/tensorflow/contrib/lite/toco/tflite/builtin_operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/builtin_operator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
 
 #include "absl/memory/memory.h"
 #include "tensorflow/contrib/lite/toco/tflite/operator.h"
@@ -71,4 +71,4 @@ class BuiltinOperator : public BaseOperator {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/custom_operator.h b/tensorflow/contrib/lite/toco/tflite/custom_operator.h
index 1a4bfac7d4..bd5713618f 100644
--- a/tensorflow/contrib/lite/toco/tflite/custom_operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/custom_operator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
 
 #include "flatbuffers/flexbuffers.h"
 #include "absl/memory/memory.h"
@@ -71,4 +71,4 @@ class CustomOperator : public BaseOperator {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/export.h b/tensorflow/contrib/lite/toco/tflite/export.h
index 44012b7126..8c79cb8200 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.h
+++ b/tensorflow/contrib/lite/toco/tflite/export.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
 
 #include "tensorflow/contrib/lite/toco/model.h"
 
@@ -73,4 +73,4 @@ void LoadOperatorsMap(const Model& model, OperatorsMap* operators_map);
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/import.h b/tensorflow/contrib/lite/toco/tflite/import.h
index 3c27a2843c..280677bae1 100644
--- a/tensorflow/contrib/lite/toco/tflite/import.h
+++ b/tensorflow/contrib/lite/toco/tflite/import.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
 
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/toco/model.h"
@@ -46,4 +46,4 @@ void LoadOperatorsTable(const ::tflite::Model &input_model,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.h b/tensorflow/contrib/lite/toco/tflite/operator.h
index 37df302d46..88af3d6ab6 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/operator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
 
 #include "flatbuffers/flatbuffers.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
@@ -86,4 +86,4 @@ class BaseOperator {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/simple_operator.h b/tensorflow/contrib/lite/toco/tflite/simple_operator.h
index 992b98baca..72678c82a2 100644
--- a/tensorflow/contrib/lite/toco/tflite/simple_operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/simple_operator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
 
 #include "tensorflow/contrib/lite/toco/tflite/operator.h"
 
@@ -47,4 +47,4 @@ class SimpleOperator : public BaseOperator {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/types.h b/tensorflow/contrib/lite/toco/tflite/types.h
index f7c5140510..3923756fc9 100644
--- a/tensorflow/contrib/lite/toco/tflite/types.h
+++ b/tensorflow/contrib/lite/toco/tflite/types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
 
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/toco/model.h"
@@ -55,4 +55,4 @@ struct ActivationFunction {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.h b/tensorflow/contrib/lite/toco/toco_cmdline_flags.h
index ba35ca8d5d..46eb3f5728 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.h
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
 
 #include <string>
 #include <vector>
@@ -33,4 +33,4 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
diff --git a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h b/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
index ae0541f62b..d6c3ba6543 100644
--- a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
+++ b/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
 
 #include <string>
 
@@ -31,4 +31,4 @@ struct GraphVizDumpOptions {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h
index b5cb7a11e7..0572848cb5 100644
--- a/tensorflow/contrib/lite/toco/toco_port.h
+++ b/tensorflow/contrib/lite/toco/toco_port.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
 
 // Portability layer for toco tool. Mainly, abstract filesystem access so we
 // can build and use on google internal environments and on OSX.
@@ -77,4 +77,4 @@ void CopyToBuffer(const string& src, char* dest);
 }  // namespace port
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.h b/tensorflow/contrib/lite/toco/toco_tooling.h
index 9c5a93a211..e731c149ee 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.h
+++ b/tensorflow/contrib/lite/toco/toco_tooling.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
 
 #include <memory>
 #include <string>
@@ -47,4 +47,4 @@ inline void Export(const TocoFlags& toco_flags, const Model& model,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
diff --git a/tensorflow/contrib/lite/toco/toco_types.h b/tensorflow/contrib/lite/toco/toco_types.h
index ad42497ada..d72a3bd1f3 100644
--- a/tensorflow/contrib/lite/toco/toco_types.h
+++ b/tensorflow/contrib/lite/toco/toco_types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
 
 #include <string>
 #include "tensorflow/core/platform/platform.h"
@@ -42,4 +42,4 @@ using tensorflow::uint8;
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 72fb0fd1a7..5986d63649 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
 
 #include <algorithm>
 #include <cmath>
@@ -300,4 +300,4 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration.h b/tensorflow/contrib/lite/tools/gen_op_registration.h
index 318859e23d..5f2ac6ca97 100644
--- a/tensorflow/contrib/lite/tools/gen_op_registration.h
+++ b/tensorflow/contrib/lite/tools/gen_op_registration.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
 
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/string.h"
@@ -36,4 +36,4 @@ void ReadOpsFromModel(const ::tflite::Model* model,
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
diff --git a/tensorflow/contrib/lite/tools/mutable_op_resolver.h b/tensorflow/contrib/lite/tools/mutable_op_resolver.h
index 906553da57..573a359c45 100644
--- a/tensorflow/contrib/lite/tools/mutable_op_resolver.h
+++ b/tensorflow/contrib/lite/tools/mutable_op_resolver.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
 
 #include <map>
 #include "tensorflow/contrib/lite/context.h"
@@ -52,4 +52,4 @@ class MutableOpResolver : public OpResolver {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/version.h b/tensorflow/contrib/lite/version.h
index a751afabe7..efd63f4006 100644
--- a/tensorflow/contrib/lite/version.h
+++ b/tensorflow/contrib/lite/version.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_VERSION_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_VERSION_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_VERSION_H_
+#define TENSORFLOW_CONTRIB_LITE_VERSION_H_
 
 // The version number of the Schema. Ideally all changes will be backward
 // compatible. If that ever changes, we must ensure that version is the first
 // entry in the new tflite root so that we can see that version is not 1.
 #define TFLITE_SCHEMA_VERSION (3)
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_VERSION_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_VERSION_H_
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index cb1719c3be..bb219e0edc 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
+#define TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
 
 #ifdef GOOGLE_CUDA
 
@@ -136,4 +136,4 @@ class NcclManager {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
diff --git a/tensorflow/contrib/nearest_neighbor/kernels/heap.h b/tensorflow/contrib/nearest_neighbor/kernels/heap.h
index 6e33a574e2..32925569a8 100644
--- a/tensorflow/contrib/nearest_neighbor/kernels/heap.h
+++ b/tensorflow/contrib/nearest_neighbor/kernels/heap.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
+#ifndef TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
+#define TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
 
 #include <cassert>
 #include <cstdint>
@@ -205,4 +205,4 @@ class AugmentedHeap : public HeapBase<KeyType, DataType> {
 }  // namespace nearest_neighbor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
+#endif  // TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
diff --git a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.h b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.h
index 1670e2f83b..c53205e1a4 100644
--- a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.h
+++ b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
+#ifndef TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
+#define TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -232,4 +232,4 @@ class HyperplaneMultiprobe {
 }  // namespace nearest_neighbor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
+#endif  // TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
index fc3a2da9b3..9bb1724a2c 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -81,4 +81,4 @@ CALL_ALL_REDUCEOPS(ReduceSliceFunctorReduceop)
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.h b/tensorflow/contrib/resampler/kernels/resampler_ops.h
index 8258ecaf5d..85d3676efa 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.h
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
+#define TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
 
 #if PLATFORM_WINDOWS
 #define __restrict__ __restrict
@@ -64,5 +64,4 @@ struct ResamplerGrad2DFunctor{
 }  // namespace functor
 }  // namespace tensorflow
 
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.h b/tensorflow/contrib/rnn/kernels/blas_gemm.h
index e33eceadff..a52c934233 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.h
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
+#ifndef TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
+#define TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -74,4 +74,4 @@ struct TensorBlasGemm<Device, T, false /* USE_CUBLAS */> {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
+#endif  // TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
diff --git a/tensorflow/contrib/rnn/kernels/gru_ops.h b/tensorflow/contrib/rnn/kernels/gru_ops.h
index 06a5665062..3e2cb39e64 100644
--- a/tensorflow/contrib/rnn/kernels/gru_ops.h
+++ b/tensorflow/contrib/rnn/kernels/gru_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
+#define TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/contrib/rnn/kernels/blas_gemm.h"
@@ -181,4 +181,4 @@ struct GRUBlockCellBprop : public GRUCell {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.h b/tensorflow/contrib/rnn/kernels/lstm_ops.h
index 1906581b16..bc6b85f3f1 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops.h
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
+#define TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/contrib/rnn/kernels/blas_gemm.h"
@@ -291,4 +291,4 @@ struct BlockLSTMBprop : public LSTMBlockCell {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
diff --git a/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h b/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h
index c0df224bc8..b732cdd41e 100644
--- a/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h
+++ b/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Helpers for working with the SignatureDefs of TensorFlow SavedModels.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
+#define TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
 
 #include <string>
 #include <utility>
@@ -66,4 +66,4 @@ Status FindOutputTensorNameByKey(const SignatureDef& signature_def,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
diff --git a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h
index 693b02dc43..34da8c82cd 100644
--- a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h
+++ b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
+#define TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -38,4 +38,4 @@ struct GatherTree {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
diff --git a/tensorflow/contrib/session_bundle/bundle_shim.h b/tensorflow/contrib/session_bundle/bundle_shim.h
index e24efa0de1..4628b6ab1b 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim.h
+++ b/tensorflow/contrib/session_bundle/bundle_shim.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Shim for systems that need to load both SessionBundle and
 // SavedModelBundle interchangeably during migration to SavedModel.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
+#ifndef TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
+#define TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
 
 #include <memory>
 
@@ -67,4 +67,4 @@ Status LoadSessionBundleOrSavedModelBundle(
 
 }  // namespace serving
 }  // namespace tensorflow
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
+#endif  // TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
diff --git a/tensorflow/contrib/session_bundle/session_bundle.h b/tensorflow/contrib/session_bundle/session_bundle.h
index 2ff258411d..b2be46efa6 100644
--- a/tensorflow/contrib/session_bundle/session_bundle.h
+++ b/tensorflow/contrib/session_bundle/session_bundle.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Low-level functionality for setting up a inference Session.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
+#ifndef TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
+#define TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
 
 #include <memory>
 
@@ -82,4 +82,4 @@ bool IsPossibleExportDirectory(const StringPiece export_dir);
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
+#endif  // TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
diff --git a/tensorflow/contrib/session_bundle/signature.h b/tensorflow/contrib/session_bundle/signature.h
index 0049bea008..4ef1277cec 100644
--- a/tensorflow/contrib/session_bundle/signature.h
+++ b/tensorflow/contrib/session_bundle/signature.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Helpers for working with TensorFlow exports and their signatures.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
+#ifndef TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
+#define TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
 
 #include <string>
 #include <utility>
@@ -121,4 +121,4 @@ Status BindGenericNames(const GenericSignature& signature,
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
+#endif  // TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
diff --git a/tensorflow/contrib/session_bundle/test_util.h b/tensorflow/contrib/session_bundle/test_util.h
index dd0fc8d1c0..f0d41ce5a4 100644
--- a/tensorflow/contrib/session_bundle/test_util.h
+++ b/tensorflow/contrib/session_bundle/test_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
+#define TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
 
 #include <string>
 
@@ -35,4 +35,4 @@ string TestSrcDirPath(const string& relative_path);
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/data_spec.h b/tensorflow/contrib/tensor_forest/kernels/data_spec.h
index 05590d6992..0a3abe56df 100644
--- a/tensorflow/contrib/tensor_forest/kernels/data_spec.h
+++ b/tensorflow/contrib/tensor_forest/kernels/data_spec.h
@@ -15,8 +15,8 @@
 // This is a surrogate for using a proto, since it doesn't seem to be possible
 // to use protos in a dynamically-loaded/shared-linkage library, which is
 // what is used for custom ops in tensorflow/contrib.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
 #include <unordered_map>
 
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -138,4 +138,4 @@ class TensorForestDataSpec {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index 35f9fb7eaf..dad9df4898 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
 
 #include <limits>
 
@@ -307,4 +307,4 @@ void GetParentWeightedMean(float leaf_sum, const float* leaf_data,
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h
index 4bd1f06c72..2e7368dc12 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
 #include <string>
 #include <vector>
 
@@ -70,4 +70,4 @@ class CandidateGraphRunner {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
index bf88216d66..cced26b903 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
 
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
 #include "tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h"
@@ -88,4 +88,4 @@ class DecisionTreeResource : public ResourceBase {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
index 3f03c2d05b..85ce7b825b 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
 
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model_extensions.pb.h"
@@ -104,4 +104,4 @@ struct CandidateEvalatorCollection {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
index dacf033d99..0d6712e9e5 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
 
 #include <vector>
 
@@ -98,4 +98,4 @@ class FertileStatsResource : public ResourceBase {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h b/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h
index 2ae3a79b3d..4ae48179af 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
 
 #include <vector>
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
@@ -78,4 +78,4 @@ class GraphRunnerSplitCollectionOperator : public SplitCollectionOperator {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
index 3e41ab50b9..f938d08c84 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
 #include <unordered_map>
 #include <vector>
 
@@ -609,4 +609,4 @@ class LeastSquaresRegressionGrowStats : public GrowStats {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
index e3d4edbf8a..eafad6b591 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
 #include <ctime>
 #include <unordered_map>
 #include "google/protobuf/any.pb.h"
@@ -123,4 +123,4 @@ class TensorDataSet {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
index 0309ec1de9..44ec09c50e 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 
@@ -89,4 +89,4 @@ class TensorInputTarget : public StoredInputTarget<SingleDimStorageType> {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h
index 946a648f22..cc4ec8dc9e 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
 
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
 #include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
@@ -146,4 +146,4 @@ class LeafModelOperatorFactory {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params.h b/tensorflow/contrib/tensor_forest/kernels/v4/params.h
index 97a9d8d096..b0ed949424 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/params.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/params.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
 
 #include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
 #include "tensorflow/core/platform/types.h"
@@ -28,5 +28,4 @@ float ResolveParam(const DepthDependentParam& param, int32 depth);
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
index 6c21c0bd34..ad52f89fad 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
 
 #include <vector>
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
@@ -128,6 +128,4 @@ class AnyCollectionCreator : public CollectionCreator {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h
index 8e002d0414..e6140065bb 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
 #include "tensorflow/contrib/tensor_forest/proto/fertile_stats.pb.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -47,4 +47,4 @@ float WeightedSmoothedGini(float sum, float square, int num_classes);
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
index b6e543b96f..289c81e9d5 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
 #include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
 #include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
 
@@ -71,4 +71,4 @@ class TestableDataSet : public TensorDataSet {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
index 65b92aa418..25b958bcfe 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#define TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
 
 #include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -35,4 +35,4 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
 }  // namespace tpu
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
diff --git a/tensorflow/contrib/tpu/profiler/trace_events_to_json.h b/tensorflow/contrib/tpu/profiler/trace_events_to_json.h
index 992eae43d9..3bd76dd01c 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events_to_json.h
+++ b/tensorflow/contrib/tpu/profiler/trace_events_to_json.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#define TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
 
 #include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
 #include "tensorflow/core/platform/types.h"
@@ -29,4 +29,4 @@ string TraceEventsToJson(const Trace &trace);
 }  // namespace tpu
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h
index a6c9a9503d..0f645a5492 100644
--- a/tensorflow/contrib/tpu/profiler/version.h
+++ b/tensorflow/contrib/tpu/profiler/version.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 =============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
+#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
+#define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 
 #define TPU_PROFILER_VERSION "1.4.3"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
+#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.h b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.h
index 6518e7a10f..61fc6f36f7 100644
--- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.h
+++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
+#ifndef TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
+#define TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
 
 #include <string>
 
@@ -31,4 +31,4 @@ Status ConvertConstantsToImmutable(const string& in_graph_filename,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
+#endif  // TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
diff --git a/tensorflow/contrib/verbs/grpc_verbs_client.h b/tensorflow/contrib/verbs/grpc_verbs_client.h
index 358977f925..2cfaa4986c 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_client.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_client.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
+#ifndef TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
+#define TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
 
 #include "tensorflow/contrib/verbs/grpc_verbs_service_impl.h"
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
@@ -47,4 +47,4 @@ class GrpcVerbsClient {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
+#endif  // TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.h b/tensorflow/contrib/verbs/grpc_verbs_service.h
index aa509602b5..444c863b94 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_service.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
+#define TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
 
 #ifdef TENSORFLOW_USE_VERBS
 
@@ -69,4 +69,4 @@ void SetNewVerbsService(GrpcVerbsService** handle, const WorkerEnv* worker_env,
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_VERBS
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
index 86431ca030..1f0f10517e 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
+#ifndef TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
+#define TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
 
 #include "grpc++/impl/codegen/async_stream.h"
 #include "grpc++/impl/codegen/async_unary_call.h"
@@ -86,4 +86,4 @@ class VerbsService GRPC_FINAL {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
+#endif  // TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index b6c41de6ee..94203ee2b3 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_H_
+#define TENSORFLOW_CONTRIB_VERBS_RDMA_H_
 
 #ifdef TENSORFLOW_USE_VERBS
 
@@ -524,4 +524,4 @@ class RdmaMessageBuffer {
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_VERBS
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_H_
diff --git a/tensorflow/contrib/verbs/rdma_mgr.h b/tensorflow/contrib/verbs/rdma_mgr.h
index 29227a9544..9fffc335bb 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
+#define TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
 
 #ifdef TENSORFLOW_USE_VERBS
 
@@ -55,4 +55,4 @@ class RdmaMgr {
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_VERBS
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h
index 2dedd6c48f..c0d6f59c48 100644
--- a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
+#define TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
 
 #ifdef TENSORFLOW_USE_VERBS
 
@@ -60,4 +60,4 @@ class RdmaRendezvousMgr : public BaseRendezvousMgr {
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_USE_VERBS
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.h b/tensorflow/contrib/verbs/verbs_server_lib.h
index 855380129f..54ce8c1d47 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.h
+++ b/tensorflow/contrib/verbs/verbs_server_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
+#define TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
 
 #ifdef TENSORFLOW_USE_VERBS
 
@@ -63,4 +63,4 @@ class VerbsServer : public GrpcServer {
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_VERBS
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
diff --git a/tensorflow/core/api_def/update_api_def.h b/tensorflow/core/api_def/update_api_def.h
index 5d6c15010d..1e285c0688 100644
--- a/tensorflow/core/api_def/update_api_def.h
+++ b/tensorflow/core/api_def/update_api_def.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_
+#ifndef TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_
+#define TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_
 // Functions for updating ApiDef when new ops are added.
 
 #include "tensorflow/core/framework/op_def.pb.h"
@@ -42,4 +42,4 @@ void CreateApiDefs(const OpList& ops, const string& api_def_dir,
                    const string& op_file_pattern);
 
 }  // namespace tensorflow
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_
+#endif  // TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_
diff --git a/tensorflow/core/common_runtime/function_testlib.h b/tensorflow/core/common_runtime/function_testlib.h
index 0bf6699f5a..3ddb26de92 100644
--- a/tensorflow/core/common_runtime/function_testlib.h
+++ b/tensorflow/core/common_runtime/function_testlib.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/core/framework/function.h"
@@ -34,4 +34,4 @@ Output Call(Scope* scope, const string& op_name, const string& fn_name,
 }  // namespace test
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id.h b/tensorflow/core/common_runtime/gpu/gpu_id.h
index ff81ccd432..4e9c4abce1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_id.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
 
 #include "tensorflow/core/lib/gtl/int_type.h"
 
@@ -85,4 +85,4 @@ TF_LIB_GTL_DEFINE_INT_TYPE(CudaGpuId, int32);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
index 78e51c84c1..6d196b16ed 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
@@ -58,4 +58,4 @@ class GpuIdUtil {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h
index 006b2ca448..2d49a64c0f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
 
 #include "tensorflow/core/framework/allocator.h"
 
@@ -33,4 +33,4 @@ class GpuManagedAllocator : public Allocator {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h
index 8f3a082134..8477cea126 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.h
+++ b/tensorflow/core/common_runtime/graph_optimizer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
 
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/graph/graph.h"
@@ -60,4 +60,4 @@ class GraphOptimizer {
 
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
diff --git a/tensorflow/core/common_runtime/memory_types.h b/tensorflow/core/common_runtime/memory_types.h
index fa0a7595f3..f854acfdc5 100644
--- a/tensorflow/core/common_runtime/memory_types.h
+++ b/tensorflow/core/common_runtime/memory_types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
 
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/graph/graph.h"
@@ -45,4 +45,4 @@ Status MemoryTypeForOutput(const DeviceType& device_type, const Graph* g,
 
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
diff --git a/tensorflow/core/common_runtime/pending_counts.h b/tensorflow/core/common_runtime/pending_counts.h
index 5707f52592..5e1925c401 100644
--- a/tensorflow/core/common_runtime/pending_counts.h
+++ b/tensorflow/core/common_runtime/pending_counts.h
@@ -1,5 +1,5 @@
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
 
 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
@@ -328,4 +328,4 @@ inline PendingCounts::Handle PendingCounts::Layout::CreateHandle(
 
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index 38003b7726..a1adc4b6b3 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
 
 #include <unordered_map>
 
@@ -173,4 +173,4 @@ class ProcessFunctionLibraryRuntime {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
diff --git a/tensorflow/core/common_runtime/profile_handler.h b/tensorflow/core/common_runtime/profile_handler.h
index 57c83c2e6f..9d31b1aecb 100644
--- a/tensorflow/core/common_runtime/profile_handler.h
+++ b/tensorflow/core/common_runtime/profile_handler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
 
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/graph/types.h"
@@ -80,4 +80,4 @@ class ProfileHandler {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
index c5c204d4fa..fe4df1c106 100644
--- a/tensorflow/core/common_runtime/renamed_device.h
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -134,4 +134,4 @@ class RenamedDevice : public Device {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/rendezvous_util.h b/tensorflow/core/common_runtime/rendezvous_util.h
index 3b6354603b..aad910f6d8 100644
--- a/tensorflow/core/common_runtime/rendezvous_util.h
+++ b/tensorflow/core/common_runtime/rendezvous_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
 
 #include <map>
 
@@ -49,4 +49,4 @@ Status RecvOutputsFromRendezvous(Rendezvous* rendezvous, NamedTensors* out,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
diff --git a/tensorflow/core/common_runtime/shape_refiner.h b/tensorflow/core/common_runtime/shape_refiner.h
index da42c30ce9..75eb5bf0d2 100644
--- a/tensorflow/core/common_runtime/shape_refiner.h
+++ b/tensorflow/core/common_runtime/shape_refiner.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
 
 #include <vector>
 
@@ -303,4 +303,4 @@ class ShapeRefiner {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
diff --git a/tensorflow/core/common_runtime/stats_publisher_interface.h b/tensorflow/core/common_runtime/stats_publisher_interface.h
index b285420798..f063ee5297 100644
--- a/tensorflow/core/common_runtime/stats_publisher_interface.h
+++ b/tensorflow/core/common_runtime/stats_publisher_interface.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
 
 #include "tensorflow/core/common_runtime/build_graph_options.h"
 #include "tensorflow/core/common_runtime/profile_handler.h"
@@ -61,4 +61,4 @@ std::unique_ptr<StatsPublisherInterface> CreateNoOpStatsPublisher(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
index 3deb80dff7..d3ca350e36 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
 
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 #include "tensorflow/core/distributed_runtime/worker_session.h"
@@ -74,4 +74,4 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
diff --git a/tensorflow/core/distributed_runtime/local_master.h b/tensorflow/core/distributed_runtime/local_master.h
index 5fc21d3a1e..c20b40329a 100644
--- a/tensorflow/core/distributed_runtime/local_master.h
+++ b/tensorflow/core/distributed_runtime/local_master.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
 
 #include <memory>
 
@@ -98,4 +98,4 @@ class LocalMaster : public MasterInterface {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 7113d73dd7..79fa6f926e 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_
 
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/cost_graph.pb.h"
@@ -702,4 +702,4 @@ class NonOwnedProtoRunStepResponse : public MutableRunStepResponseWrapper {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW
+#endif  // TENSORFLOW
diff --git a/tensorflow/core/distributed_runtime/partial_run_mgr.h b/tensorflow/core/distributed_runtime/partial_run_mgr.h
index af56e723a9..e95f4da6c3 100644
--- a/tensorflow/core/distributed_runtime/partial_run_mgr.h
+++ b/tensorflow/core/distributed_runtime/partial_run_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
 
 #include <unordered_map>
 
@@ -84,4 +84,4 @@ class PartialRunMgr {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.h b/tensorflow/core/distributed_runtime/recent_request_ids.h
index 396235dcc6..e8e45331dd 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids.h
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
 
 #include <vector>
 
@@ -69,4 +69,4 @@ class RecentRequestIds {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
diff --git a/tensorflow/core/distributed_runtime/request_id.h b/tensorflow/core/distributed_runtime/request_id.h
index 288c49153d..a882b69ab1 100644
--- a/tensorflow/core/distributed_runtime/request_id.h
+++ b/tensorflow/core/distributed_runtime/request_id.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_
 
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/types.h"
@@ -28,4 +28,4 @@ int64 GetUniqueRequestId();
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/async_service_interface.h b/tensorflow/core/distributed_runtime/rpc/async_service_interface.h
index 63b0f2272d..b2730a583b 100644
--- a/tensorflow/core/distributed_runtime/rpc/async_service_interface.h
+++ b/tensorflow/core/distributed_runtime/rpc/async_service_interface.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
 
 namespace tensorflow {
 
@@ -38,4 +38,4 @@ class AsyncServiceInterface {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_call.h b/tensorflow/core/distributed_runtime/rpc/grpc_call.h
index 2ab0a40f33..ecad1274cc 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_call.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_call.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
 
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/platform/macros.h"
@@ -265,4 +265,4 @@ class Call : public UntypedCall<Service> {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
index c662cde9be..de9840fca8 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
 
 #include <map>
 #include <memory>
@@ -93,4 +93,4 @@ Status NewHostPortGrpcChannel(const string& target,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h b/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h
index 95c2c935f0..d367b83ee7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
 
 #include "grpc++/grpc++.h"
 
@@ -41,4 +41,4 @@ class GrpcClientCQTag {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
index 8770dcc3ac..473604f257 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
 
 #include <memory>
 #include "tensorflow/core/platform/types.h"
@@ -34,4 +34,4 @@ AsyncServiceInterface* NewGrpcMasterService(Master* master,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 412395c526..4e203e260a 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
 
 #include "grpc++/impl/codegen/async_stream.h"
 #include "grpc++/impl/codegen/async_unary_call.h"
@@ -186,4 +186,4 @@ class MasterService final {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
index d661caaa60..c80668e899 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
 
 #include "tensorflow/core/distributed_runtime/master_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
@@ -24,4 +24,4 @@ namespace tensorflow {
 MasterInterface* NewGrpcMaster(const SharedGrpcChannelPtr& channel);
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
index 8ad4133540..709c3833e7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
-#define THIRD_PARTY_TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
+#ifndef TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
+#define TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
 
 #include <memory>
 
@@ -35,4 +35,4 @@ WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
+#endif  // TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
index b35d4843e8..dd114d39c6 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
 
 #include "grpc++/impl/codegen/proto_utils.h"
 #include "grpc++/support/slice.h"
@@ -231,4 +231,4 @@ class UnlimitedSizeProtoSerializationTraits {
       : public UnlimitedSizeProtoSerializationTraits<MessageType> {}; \
   }  // namespace grpc
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index c3f513d492..8b12ac1461 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
 
 #include <memory>
 
@@ -141,4 +141,4 @@ class GrpcServer : public ServerInterface {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
index 300f727124..d87956a135 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
 
 #include <memory>
 #include <string>
@@ -130,4 +130,4 @@ class GrpcSession : public Session {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index 3f80bdfb70..0b6f9474dd 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
 
 #include <utility>
 
@@ -96,4 +96,4 @@ class RPCState : public GrpcClientCQTag {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
index 5e81b90189..4b3a03b1d7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
 
 #include <memory>
 #include <string>
@@ -70,4 +70,4 @@ class TestCluster {
 }  // end namespace test
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index bb85478347..d5e7e9f5b3 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
 
 #include <memory>
 
@@ -114,4 +114,4 @@ class GrpcByteBufferSource : public ::grpc::protobuf::io::ZeroCopyInputStream {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h
index 17a307a6d9..7a35fdbca0 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
@@ -29,4 +29,4 @@ WorkerCacheInterface* NewGrpcWorkerCacheWithLocalWorker(
     const string& local_target);
 
 }  // namespace tensorflow
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index 28b389b25f..78a21fd9f6 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
 
 #include "tensorflow/core/distributed_runtime/recent_request_ids.h"
 #include "tensorflow/core/distributed_runtime/worker.h"
@@ -54,4 +54,4 @@ std::unique_ptr<AsyncServiceInterface> NewGrpcWorkerService(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index fb23f8631f..1a5e2edfb2 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
 
 #include "grpc++/impl/codegen/async_stream.h"
 #include "grpc++/impl/codegen/async_unary_call.h"
@@ -147,4 +147,4 @@ class WorkerService final {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
diff --git a/tensorflow/core/distributed_runtime/server_lib.h b/tensorflow/core/distributed_runtime/server_lib.h
index a064d20cdb..275f526d31 100644
--- a/tensorflow/core/distributed_runtime/server_lib.h
+++ b/tensorflow/core/distributed_runtime/server_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
 
 #include <memory>
 
@@ -95,4 +95,4 @@ Status NewServer(const ServerDef& server_def,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index ba077c3acc..3ce260d12e 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
 
 #include <functional>
 
@@ -87,4 +87,4 @@ class SessionMgr {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
diff --git a/tensorflow/core/distributed_runtime/worker.h b/tensorflow/core/distributed_runtime/worker.h
index c62347926f..62fa5f3cf5 100644
--- a/tensorflow/core/distributed_runtime/worker.h
+++ b/tensorflow/core/distributed_runtime/worker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
 
 #include <unordered_map>
 
@@ -120,4 +120,4 @@ class Worker : public WorkerInterface {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h
index 9da3bb253f..0fd19ac27f 100644
--- a/tensorflow/core/distributed_runtime/worker_session.h
+++ b/tensorflow/core/distributed_runtime/worker_session.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
 
 #include <string>
 
@@ -61,4 +61,4 @@ struct WorkerSession {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
diff --git a/tensorflow/core/example/example_parser_configuration.h b/tensorflow/core/example/example_parser_configuration.h
index 69955ec4cb..3d06bd55e2 100644
--- a/tensorflow/core/example/example_parser_configuration.h
+++ b/tensorflow/core/example/example_parser_configuration.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_
+#ifndef TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_
+#define TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_
 
 #include <string>
 #include <vector>
@@ -53,4 +53,4 @@ Status ExampleParserConfigurationProtoToFeatureVectors(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSE_CONFIGURATION_H_
+#endif  // TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSE_CONFIGURATION_H_
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index c0deb473a2..293c40e04d 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
+#ifndef TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
+#define TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
 
 #include <array>
 
@@ -287,4 +287,4 @@ Status ExplicitShape(InferenceContext* c);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
+#endif  // TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index 4a4ef12635..d552ec1693 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
 
 #include <vector>
 
@@ -787,4 +787,4 @@ Status InferenceContext::GetAttr(StringPiece attr_name, T* value) const {
 }  // namespace shape_inference
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
diff --git a/tensorflow/core/framework/shape_inference_testutil.h b/tensorflow/core/framework/shape_inference_testutil.h
index fbfd24538b..7977841482 100644
--- a/tensorflow/core/framework/shape_inference_testutil.h
+++ b/tensorflow/core/framework/shape_inference_testutil.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
 
 #include <vector>
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -98,4 +98,4 @@ class ShapeInferenceTestutil {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
diff --git a/tensorflow/core/graph/gradients.h b/tensorflow/core/graph/gradients.h
index 75906e6ce9..ddfed084b0 100644
--- a/tensorflow/core/graph/gradients.h
+++ b/tensorflow/core/graph/gradients.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
+#ifndef TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
+#define TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
 
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -55,4 +55,4 @@ Status AddSymbolicGradients(gtl::ArraySlice<NodeOut> y_node_outputs,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
+#endif  // TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
diff --git a/tensorflow/core/grappler/costs/op_context.h b/tensorflow/core/grappler/costs/op_context.h
index 735a1e68ea..6391de4a91 100644
--- a/tensorflow/core/grappler/costs/op_context.h
+++ b/tensorflow/core/grappler/costs/op_context.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
 
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
@@ -36,4 +36,4 @@ struct OpContext {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index c180250908..8ccc51f545 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
 
 #include <list>
 #include <memory>
@@ -342,4 +342,4 @@ class VirtualScheduler {
 }  // namespace grappler
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index 3f6f418bee..02d8a0f32a 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
 
 #include <unordered_set>
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
@@ -73,4 +73,4 @@ class DependencyOptimizer : public GraphOptimizer {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.h b/tensorflow/core/grappler/optimizers/static_schedule.h
index aa2726a2bd..678b4d193f 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule.h
+++ b/tensorflow/core/grappler/optimizers/static_schedule.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
 
 #include <unordered_map>
 
@@ -47,4 +47,4 @@ Status EstimateRequiredTimes(
 }  // namespace grappler
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
diff --git a/tensorflow/core/grappler/utils/frame.h b/tensorflow/core/grappler/utils/frame.h
index be726ae795..95b72748f4 100644
--- a/tensorflow/core/grappler/utils/frame.h
+++ b/tensorflow/core/grappler/utils/frame.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
 
 #include <unordered_map>
 #include "tensorflow/core/framework/graph.pb.h"
@@ -40,4 +40,4 @@ Status IdentifyFramesWithNodeMap(const GraphDef& graph, const NodeMap& node_map,
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
diff --git a/tensorflow/core/grappler/utils/scc.h b/tensorflow/core/grappler/utils/scc.h
index 4e46169971..4fb7aab647 100644
--- a/tensorflow/core/grappler/utils/scc.h
+++ b/tensorflow/core/grappler/utils/scc.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
 
 #include <unordered_map>
 #include "tensorflow/core/framework/graph.pb.h"
@@ -43,4 +43,4 @@ int IdentifyLoops(const GraphDef& graph,
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
index f2c9bbfa4e..7700fe41e4 100644
--- a/tensorflow/core/grappler/utils/topological_sort.h
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -28,4 +28,4 @@ Status TopologicalSort(GraphDef* graph);
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
diff --git a/tensorflow/core/kernels/adjust_hsv_gpu.cu.h b/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
index c160ce2c33..49df5ae296 100644
--- a/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
+++ b/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
@@ -11,8 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
+#define TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
 
 #if GOOGLE_CUDA
 
@@ -143,4 +143,4 @@ __global__ void adjust_hsv_nhwc(const int64 number_elements,
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h
index b066e2a574..0d634ae7b0 100644
--- a/tensorflow/core/kernels/batch_util.h
+++ b/tensorflow/core/kernels/batch_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
+#define TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -35,4 +35,4 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index);
 }  // namespace batch_util
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index ff8ebb349f..25c5f9cf42 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
-
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
 
 #include <algorithm>
 #include <functional>
@@ -657,4 +656,4 @@ size_t ASBSQueue<TaskType>::SchedulingCapacity() const {
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
diff --git a/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h b/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h
index 9207972100..2b5a991caf 100644
--- a/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_
 
 #include <stddef.h>
 #include <cstddef>
@@ -265,4 +265,4 @@ BasicBatchScheduler<TaskType>::BasicBatchScheduler(
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_
diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler.h b/tensorflow/core/kernels/batching_util/batch_scheduler.h
index a5316f152b..f6d9a8f0c8 100644
--- a/tensorflow/core/kernels/batching_util/batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/batch_scheduler.h
@@ -23,8 +23,8 @@ limitations under the License.
 //
 // This file defines an abstract BatchScheduler class.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_
 
 #include <stddef.h>
 #include <algorithm>
@@ -278,4 +278,4 @@ void Batch<TaskType>::Close() {
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_
diff --git a/tensorflow/core/kernels/batching_util/fake_clock_env.h b/tensorflow/core/kernels/batching_util/fake_clock_env.h
index b2848afe07..60f1cbe7bd 100644
--- a/tensorflow/core/kernels/batching_util/fake_clock_env.h
+++ b/tensorflow/core/kernels/batching_util/fake_clock_env.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_
 
 #include <functional>
 #include <string>
@@ -73,4 +73,4 @@ class FakeClockEnv : public EnvWrapper {
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_
diff --git a/tensorflow/core/kernels/batching_util/periodic_function.h b/tensorflow/core/kernels/batching_util/periodic_function.h
index 6811cd015e..dbf1733dcc 100644
--- a/tensorflow/core/kernels/batching_util/periodic_function.h
+++ b/tensorflow/core/kernels/batching_util/periodic_function.h
@@ -49,9 +49,8 @@ limitations under the License.
 //     PeriodicFunction periodic_function_;
 //   };
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_
-
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_
 
 #include "tensorflow/core/kernels/batching_util/periodic_function.h"
 
@@ -132,4 +131,4 @@ class PeriodicFunction {
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
index 3736d8ef64..b77289aded 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_
 
 #include <stddef.h>
 #include <deque>
@@ -702,4 +702,4 @@ size_t QueueHandle<TaskType>::SchedulingCapacity() const {
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_
diff --git a/tensorflow/core/kernels/bitcast_op.h b/tensorflow/core/kernels/bitcast_op.h
index 0413569e79..900ab6f35c 100644
--- a/tensorflow/core/kernels/bitcast_op.h
+++ b/tensorflow/core/kernels/bitcast_op.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // See docs in ../ops/array_ops.cc.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
+#define TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
 
 #include <string.h>  // for memcpy
 
@@ -27,4 +27,4 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/casts.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
diff --git a/tensorflow/core/kernels/captured_function.h b/tensorflow/core/kernels/captured_function.h
index cdf191f4c7..2d2d87134e 100644
--- a/tensorflow/core/kernels/captured_function.h
+++ b/tensorflow/core/kernels/captured_function.h
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
+#define TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
 
 #include "tensorflow/core/kernels/data/captured_function.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index 6309e4a4dc..470e9e0804 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
 
 #define EIGEN_USE_THREADS
 
@@ -181,4 +181,4 @@ GetSyclCastFromDouble(DataType dst_dtype);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.h b/tensorflow/core/kernels/compare_and_bitpack_op.h
index 8e020249c1..af8566c7ce 100644
--- a/tensorflow/core/kernels/compare_and_bitpack_op.h
+++ b/tensorflow/core/kernels/compare_and_bitpack_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
+#define TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -39,4 +39,4 @@ struct CompareAndBitpack {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
diff --git a/tensorflow/core/kernels/cuda_device_array.h b/tensorflow/core/kernels/cuda_device_array.h
index a570993cf8..e7a5db0683 100644
--- a/tensorflow/core/kernels/cuda_device_array.h
+++ b/tensorflow/core/kernels/cuda_device_array.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
+#define TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
 
 #if GOOGLE_CUDA
 
@@ -117,4 +117,4 @@ class CudaDeviceArrayOnHost {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
diff --git a/tensorflow/core/kernels/cuda_device_array_gpu.h b/tensorflow/core/kernels/cuda_device_array_gpu.h
index 220f762636..64fa3cb806 100644
--- a/tensorflow/core/kernels/cuda_device_array_gpu.h
+++ b/tensorflow/core/kernels/cuda_device_array_gpu.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Contains structs and functions to be included in device code.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
 
 #if GOOGLE_CUDA
 
@@ -47,4 +47,4 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ValueType* GetCudaDeviceArrayOnDevice(
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h
index 99e0ef426e..32d2bc3aae 100644
--- a/tensorflow/core/kernels/data/captured_function.h
+++ b/tensorflow/core/kernels/data/captured_function.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
 
 #include <memory>
 #include <vector>
@@ -105,4 +105,4 @@ class CapturedFunction {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
diff --git a/tensorflow/core/kernels/data/dataset.h b/tensorflow/core/kernels/data/dataset.h
index fbaaf22527..2ef31ddfaa 100644
--- a/tensorflow/core/kernels/data/dataset.h
+++ b/tensorflow/core/kernels/data/dataset.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
 
 #include <memory>
 
@@ -606,4 +606,4 @@ Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 40bc873584..6c4191c2be 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
@@ -32,4 +32,4 @@ Status MakeIteratorFromInputElement(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
diff --git a/tensorflow/core/kernels/data/sql/driver_manager.h b/tensorflow/core/kernels/data/sql/driver_manager.h
index 0d0c38eb58..a34691b5a2 100644
--- a/tensorflow/core/kernels/data/sql/driver_manager.h
+++ b/tensorflow/core/kernels/data/sql/driver_manager.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
 
 #include "tensorflow/core/kernels/data/sql/query_connection.h"
 
@@ -38,4 +38,4 @@ class DriverManager {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
diff --git a/tensorflow/core/kernels/data/sql/query_connection.h b/tensorflow/core/kernels/data/sql/query_connection.h
index 1947148972..f31017bd19 100644
--- a/tensorflow/core/kernels/data/sql/query_connection.h
+++ b/tensorflow/core/kernels/data/sql/query_connection.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
 
 #include "tensorflow/core/framework/tensor.h"
 
@@ -64,4 +64,4 @@ class QueryConnection {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
index b36b69eae4..787c17d6c0 100644
--- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
+++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
 
 #include <memory>
 
@@ -53,4 +53,4 @@ class SqliteQueryConnection : public QueryConnection {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/data/stats_aggregator.h b/tensorflow/core/kernels/data/stats_aggregator.h
index 4cb8dba5cb..076a56b0bf 100644
--- a/tensorflow/core/kernels/data/stats_aggregator.h
+++ b/tensorflow/core/kernels/data/stats_aggregator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
 
 #include <memory>
 #include <string>
@@ -81,4 +81,4 @@ class StatsAggregatorResource : public ResourceBase {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
diff --git a/tensorflow/core/kernels/data/window_dataset.h b/tensorflow/core/kernels/data/window_dataset.h
index 25396bd3e7..97c31668ac 100644
--- a/tensorflow/core/kernels/data/window_dataset.h
+++ b/tensorflow/core/kernels/data/window_dataset.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
 
 #include <vector>
 
@@ -45,4 +45,4 @@ Status NewWindowDataset(std::vector<std::vector<Tensor>> elements,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
diff --git a/tensorflow/core/kernels/dataset.h b/tensorflow/core/kernels/dataset.h
index 2aa6dbe6f3..69ab78d635 100644
--- a/tensorflow/core/kernels/dataset.h
+++ b/tensorflow/core/kernels/dataset.h
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATASET_H_
+#define TENSORFLOW_CORE_KERNELS_DATASET_H_
 
 #include "tensorflow/core/kernels/data/dataset.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATASET_H_
diff --git a/tensorflow/core/kernels/deep_conv2d.h b/tensorflow/core/kernels/deep_conv2d.h
index c3f6f66dc9..17a0230516 100644
--- a/tensorflow/core/kernels/deep_conv2d.h
+++ b/tensorflow/core/kernels/deep_conv2d.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
+#define TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
 
 #include "tensorflow/core/framework/types.h"
 
@@ -114,4 +114,4 @@ struct DeepConv2D {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
diff --git a/tensorflow/core/kernels/depthwise_conv_op.h b/tensorflow/core/kernels/depthwise_conv_op.h
index 097a9f5bfa..ba262d56ee 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.h
+++ b/tensorflow/core/kernels/depthwise_conv_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
+#define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/types.h"
@@ -284,4 +284,4 @@ struct DepthwiseInputCopyOp {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
diff --git a/tensorflow/core/kernels/determinant_op.h b/tensorflow/core/kernels/determinant_op.h
index e931e328e4..eefdfe0ae4 100644
--- a/tensorflow/core/kernels/determinant_op.h
+++ b/tensorflow/core/kernels/determinant_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
 
 #include "tensorflow/core/framework/tensor_types.h"
 
@@ -44,4 +44,4 @@ struct LogDeterminantFromPivotedLUFunctor {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
diff --git a/tensorflow/core/kernels/eigen_activations.h b/tensorflow/core/kernels/eigen_activations.h
index 57c8157b87..99b4b2abe6 100644
--- a/tensorflow/core/kernels/eigen_activations.h
+++ b/tensorflow/core/kernels/eigen_activations.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -122,4 +122,4 @@ struct functor_traits<scalar_clip_op<Scalar> > {
 
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h
index f4c42372b1..3a94b8c993 100644
--- a/tensorflow/core/kernels/eigen_attention.h
+++ b/tensorflow/core/kernels/eigen_attention.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -239,4 +239,4 @@ ExtractGlimpses(const Input& input,
 
 } // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
diff --git a/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h b/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h
index a44e7197a9..e13e548f86 100644
--- a/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h
+++ b/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/eigen_volume_patch.h"
@@ -617,4 +617,4 @@ CuboidConvolutionBackwardKernel(
 
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
index d172de8e18..aec7697810 100644
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
diff --git a/tensorflow/core/kernels/eigen_cuboid_convolution.h b/tensorflow/core/kernels/eigen_cuboid_convolution.h
index 2dca664a86..62e9f9123d 100644
--- a/tensorflow/core/kernels/eigen_cuboid_convolution.h
+++ b/tensorflow/core/kernels/eigen_cuboid_convolution.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/eigen_volume_patch.h"
@@ -224,4 +224,4 @@ CuboidConvolution(const Input& input, const Kernel& kernel,
 
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 94100d71ec..972036833f 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/eigen_volume_patch.h"
@@ -610,4 +610,4 @@ CuboidAvgPooling(const Input& input, DenseIndex patchPlanes,
 
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
diff --git a/tensorflow/core/kernels/eigen_softmax.h b/tensorflow/core/kernels/eigen_softmax.h
index 20bb8a44dd..a2930a726f 100644
--- a/tensorflow/core/kernels/eigen_softmax.h
+++ b/tensorflow/core/kernels/eigen_softmax.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -87,4 +87,4 @@ SoftMax(const Input& input, const float beta)
 
 } // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 7702f3e70a..2fe64cd72a 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -1069,4 +1069,4 @@ EIGEN_DEVICE_FUNC
 
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
diff --git a/tensorflow/core/kernels/eigen_volume_patch.h b/tensorflow/core/kernels/eigen_volume_patch.h
index afd5f37e35..a3d795813d 100644
--- a/tensorflow/core/kernels/eigen_volume_patch.h
+++ b/tensorflow/core/kernels/eigen_volume_patch.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -653,4 +653,4 @@ OVERRIDE_EVALUATOR(Eigen::DefaultDevice);
 
 };  // namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
diff --git a/tensorflow/core/kernels/eye_functor.h b/tensorflow/core/kernels/eye_functor.h
index 70f093f813..3799cfba9a 100644
--- a/tensorflow/core/kernels/eye_functor.h
+++ b/tensorflow/core/kernels/eye_functor.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
 
 #include "tensorflow/core/framework/tensor_types.h"
 
@@ -29,4 +29,4 @@ struct EyeFunctor {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/fake_quant_ops_functor.h b/tensorflow/core/kernels/fake_quant_ops_functor.h
index 7aaad6e6c7..81189866c3 100644
--- a/tensorflow/core/kernels/fake_quant_ops_functor.h
+++ b/tensorflow/core/kernels/fake_quant_ops_functor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
 
 #include <tuple>
 
@@ -277,4 +277,4 @@ struct FakeQuantWithMinMaxVarsPerChannelGradientFunctor {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.h b/tensorflow/core/kernels/gather_functor_gpu.cu.h
index a50b51b54b..11ea63d730 100644
--- a/tensorflow/core/kernels/gather_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/gather_functor_gpu.cu.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
+#define TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
 
 #if GOOGLE_CUDA
 
@@ -118,4 +118,4 @@ struct GatherFunctor<GPUDevice, T, Index> {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h
index 366877bcf5..ffc733e6bb 100644
--- a/tensorflow/core/kernels/gpu_utils.h
+++ b/tensorflow/core/kernels/gpu_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
 
 #if GOOGLE_CUDA
 
@@ -162,4 +162,4 @@ class AutoTuneSingleton {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h
index 125d1fd200..a360d188cc 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.h
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H_
+#define TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H_
 
 #include <array>
 #include <unordered_map>
@@ -225,4 +225,4 @@ class GraphTransferer {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H
+#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
index 8eb3995fc4..dca1f94a9b 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
+#define TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
 
 #include <unordered_map>
 #include <vector>
@@ -88,4 +88,4 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
diff --git a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
index 993a5f9a3a..b9328c8e0e 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
+#define TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
 
 #include <unordered_map>
 
@@ -55,4 +55,4 @@ class HexagonOpsDefinitions final : public IRemoteFusedGraphOpsDefinitions {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H
+#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H
diff --git a/tensorflow/core/kernels/i_remote_fused_graph_executor.h b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
index 05b76172b2..eb6b64da58 100644
--- a/tensorflow/core/kernels/i_remote_fused_graph_executor.h
+++ b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
+#define TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
 
 #include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -72,4 +72,4 @@ class IRemoteFusedGraphExecutor {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
diff --git a/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h b/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h
index 7d3329f490..9e51c9f51f 100644
--- a/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h
+++ b/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
+#define TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
 
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/macros.h"
@@ -43,4 +43,4 @@ class IRemoteFusedGraphOpsDefinitions {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 1e6cbfebbe..9733883001 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
+#define TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
 
 #define EIGEN_USE_THREADS
 #if GOOGLE_CUDA
@@ -270,4 +270,4 @@ Status TensorListZerosLike(OpKernelContext* c, const TensorList& x,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
diff --git a/tensorflow/core/kernels/meta_support.h b/tensorflow/core/kernels/meta_support.h
index 53aece78e8..97f39eb598 100644
--- a/tensorflow/core/kernels/meta_support.h
+++ b/tensorflow/core/kernels/meta_support.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
+#ifndef TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
+#define TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
 
 #include "meta/multi_thread_gemm.h"
 #include "meta/multi_thread_transform.h"
@@ -109,4 +109,4 @@ void Clamp(OpKernelContext* context, const quint8* input, int input_count,
 }  // namespace meta
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
+#endif  // TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h
index 0d5d9fb90f..8268f47203 100644
--- a/tensorflow/core/kernels/mfcc.h
+++ b/tensorflow/core/kernels/mfcc.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Basic class for computing MFCCs from spectrogram slices.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MFCC_H_
+#define TENSORFLOW_CORE_KERNELS_MFCC_H_
 
 #include <vector>
 
@@ -74,4 +74,4 @@ class Mfcc {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MFCC_H_
diff --git a/tensorflow/core/kernels/mfcc_dct.h b/tensorflow/core/kernels/mfcc_dct.h
index 4fa3c01628..888b8e8df8 100644
--- a/tensorflow/core/kernels/mfcc_dct.h
+++ b/tensorflow/core/kernels/mfcc_dct.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Basic minimal DCT class for MFCC speech processing.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
+#define TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
 
 #include <vector>
 
@@ -41,4 +41,4 @@ class MfccDct {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h
index a766a20cbc..1bdc2dc93b 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.h
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Basic class for applying a mel-scale mapping to a power spectrum.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
+#define TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
 
 #include <vector>
 #include "tensorflow/core/framework/op_kernel.h"
@@ -63,4 +63,4 @@ class MfccMelFilterbank {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
diff --git a/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h b/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
index bb22b2aa91..6716a26fac 100644
--- a/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
+++ b/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
+#define TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
 
 #define EIGEN_USE_THREADS
 
@@ -41,4 +41,4 @@ TF_CALL_NUMBER_TYPES(DEFINE_CPU_SPECS);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/neon/depthwiseconv_float.h b/tensorflow/core/kernels/neon/depthwiseconv_float.h
index acd58a644f..11f5be7c03 100644
--- a/tensorflow/core/kernels/neon/depthwiseconv_float.h
+++ b/tensorflow/core/kernels/neon/depthwiseconv_float.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
+#ifndef TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
+#define TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
 
 #include "public/gemmlowp.h"
 #include "tensorflow/core/kernels/neon/types.h"
@@ -722,4 +722,4 @@ void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
 }  // end namespace neon
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
+#endif  // TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
diff --git a/tensorflow/core/kernels/neon/types.h b/tensorflow/core/kernels/neon/types.h
index 4ece22f015..05ff1bcc6c 100644
--- a/tensorflow/core/kernels/neon/types.h
+++ b/tensorflow/core/kernels/neon/types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
+#ifndef TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
+#define TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
 
 #include "tensorflow/core/platform/logging.h"
 
@@ -70,4 +70,4 @@ inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
 }  // end namespace neon
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
+#endif  // TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
diff --git a/tensorflow/core/kernels/population_count_op.h b/tensorflow/core/kernels/population_count_op.h
index de89582e13..2c98129673 100644
--- a/tensorflow/core/kernels/population_count_op.h
+++ b/tensorflow/core/kernels/population_count_op.h
@@ -14,8 +14,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -35,4 +35,4 @@ struct PopulationCount {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
diff --git a/tensorflow/core/kernels/quantization_utils.h b/tensorflow/core/kernels/quantization_utils.h
index 7c18496357..9fafe6bb65 100644
--- a/tensorflow/core/kernels/quantization_utils.h
+++ b/tensorflow/core/kernels/quantization_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
 
 #define EIGEN_USE_THREADS
 
@@ -956,4 +956,4 @@ class TensorflowGemmContext : public gemmlowp::MultiThreadGemmContextBase {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
diff --git a/tensorflow/core/kernels/reference_gemm.h b/tensorflow/core/kernels/reference_gemm.h
index bb2a21720f..c9cc04ed1b 100644
--- a/tensorflow/core/kernels/reference_gemm.h
+++ b/tensorflow/core/kernels/reference_gemm.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
+#ifndef TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
+#define TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
 
 #include <stdlib.h>
 
@@ -92,4 +92,4 @@ void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c,
 }
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
+#endif  // TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
index 3fa052108e..7de45eaaa1 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -86,4 +86,4 @@ class TestRemoteFusedGraphExecutor final : public IRemoteFusedGraphExecutor {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
index 541c26baaf..f047144278 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
 
 #include <unordered_map>
 #include <unordered_set>
@@ -312,4 +312,4 @@ class RemoteFusedGraphExecuteUtils {
 };
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
diff --git a/tensorflow/core/kernels/reshape_util.h b/tensorflow/core/kernels/reshape_util.h
index ed583afd13..6777748b63 100644
--- a/tensorflow/core/kernels/reshape_util.h
+++ b/tensorflow/core/kernels/reshape_util.h
@@ -13,8 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
+#define TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -28,4 +28,4 @@ void Reshape(OpKernelContext *context, const Tensor &input_indices_in,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index cffc326174..c6c9d4e658 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
 
 // Functor definitions for ScatterND ops, must be compilable by nvcc.
 
@@ -257,4 +257,4 @@ REGISTER_SCATTER_ND_MATH_SYCL(int32);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index b10bea72ba..bcdd42c80c 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
@@ -98,4 +98,4 @@ struct UnsortedSegmentMaxFunctor: public UnsortedSegmentBaseFunctor<Device, T, I
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
diff --git a/tensorflow/core/kernels/slice_op_cpu_impl.h b/tensorflow/core/kernels/slice_op_cpu_impl.h
index 58dc7df3e0..47f1d5342a 100644
--- a/tensorflow/core/kernels/slice_op_cpu_impl.h
+++ b/tensorflow/core/kernels/slice_op_cpu_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
 
 #define EIGEN_USE_THREADS
 
@@ -47,4 +47,4 @@ DEFINE_SYCL_KERNELS(int32);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/spectrogram.h b/tensorflow/core/kernels/spectrogram.h
index 5476a0a961..fef0e64942 100644
--- a/tensorflow/core/kernels/spectrogram.h
+++ b/tensorflow/core/kernels/spectrogram.h
@@ -28,8 +28,8 @@ limitations under the License.
 // window = hann(window_length_samples, 'periodic');
 // S = abs(spectrogram(audio, window, overlap_samples)).^2;
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
+#define TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
 
 #include <complex>
 #include <deque>
@@ -109,4 +109,4 @@ class Spectrogram {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.h b/tensorflow/core/kernels/spectrogram_test_utils.h
index 59a903549e..d4187076e7 100644
--- a/tensorflow/core/kernels/spectrogram_test_utils.h
+++ b/tensorflow/core/kernels/spectrogram_test_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
 
 #include <complex>
 #include <string>
@@ -78,4 +78,4 @@ void SineWave(int sample_rate, float frequency, float duration_seconds,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h
index a6eed4935d..054b31ef9e 100644
--- a/tensorflow/core/kernels/tile_ops_cpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
 
 #define EIGEN_USE_THREADS
 
@@ -68,4 +68,4 @@ TF_CALL_int64(DEFINE_TYPE);
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/tile_ops_gpu_impl.h b/tensorflow/core/kernels/tile_ops_gpu_impl.h
index 592f99e9b7..8da337dabd 100644
--- a/tensorflow/core/kernels/tile_ops_gpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
 
 // Header used to split up compilation of GPU tile ops.  For each type you want
 // to have tile ops, create a .cu.cc file containing
@@ -56,4 +56,4 @@ limitations under the License.
   }                             \
   }
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
diff --git a/tensorflow/core/kernels/winograd_transform.h b/tensorflow/core/kernels/winograd_transform.h
index 5caee9fdc1..d22710e503 100644
--- a/tensorflow/core/kernels/winograd_transform.h
+++ b/tensorflow/core/kernels/winograd_transform.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
+#ifndef TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
+#define TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
 
 #include "tensorflow/core/kernels/deep_conv2d.h"
 
@@ -374,4 +374,4 @@ void WinogradTransform<T>::GetOutputTransformMatrix(const int64 rows,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
+#endif  // TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
diff --git a/tensorflow/core/kernels/xsmm_conv2d.h b/tensorflow/core/kernels/xsmm_conv2d.h
index b439511dc7..003291329a 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.h
+++ b/tensorflow/core/kernels/xsmm_conv2d.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
+#ifndef TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
+#define TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
 
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/util/tensor_format.h"
@@ -57,4 +57,4 @@ struct XsmmBkwFilterConv2D {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
+#endif  // TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
diff --git a/tensorflow/core/lib/core/bitmap.h b/tensorflow/core/lib/core/bitmap.h
index b30479fa1b..8ff1e666b4 100644
--- a/tensorflow/core/lib/core/bitmap.h
+++ b/tensorflow/core/lib/core/bitmap.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
+#define TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
 
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -103,4 +103,4 @@ inline void Bitmap::clear(size_t i) {
 }  // namespace core
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
diff --git a/tensorflow/core/lib/gtl/compactptrset.h b/tensorflow/core/lib/gtl/compactptrset.h
index 1d4d6cc8d2..d3d23b94aa 100644
--- a/tensorflow/core/lib/gtl/compactptrset.h
+++ b/tensorflow/core/lib/gtl/compactptrset.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
+#define TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
 
 #include <type_traits>
 #include "tensorflow/core/lib/gtl/flatset.h"
@@ -205,4 +205,4 @@ class CompactPointerSet {
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
diff --git a/tensorflow/core/lib/gtl/flatmap.h b/tensorflow/core/lib/gtl/flatmap.h
index 6dd67ad2ea..889d2ddaa6 100644
--- a/tensorflow/core/lib/gtl/flatmap.h
+++ b/tensorflow/core/lib/gtl/flatmap.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
+#define TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
 
 #include <stddef.h>
 #include <functional>
@@ -379,4 +379,4 @@ class FlatMap {
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
diff --git a/tensorflow/core/lib/gtl/flatrep.h b/tensorflow/core/lib/gtl/flatrep.h
index bb405b327a..0d7e7487fc 100644
--- a/tensorflow/core/lib/gtl/flatrep.h
+++ b/tensorflow/core/lib/gtl/flatrep.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
+#define TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
 
 #include <string.h>
 #include <utility>
@@ -328,4 +328,4 @@ class FlatRep {
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
diff --git a/tensorflow/core/lib/gtl/flatset.h b/tensorflow/core/lib/gtl/flatset.h
index 2b7f31ab22..f31e3abe41 100644
--- a/tensorflow/core/lib/gtl/flatset.h
+++ b/tensorflow/core/lib/gtl/flatset.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
+#define TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
 
 #include <stddef.h>
 #include <functional>
@@ -278,4 +278,4 @@ class FlatSet {
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
diff --git a/tensorflow/core/lib/io/buffered_inputstream.h b/tensorflow/core/lib/io/buffered_inputstream.h
index 2b824f35f8..924619f40f 100644
--- a/tensorflow/core/lib/io/buffered_inputstream.h
+++ b/tensorflow/core/lib/io/buffered_inputstream.h
@@ -104,4 +104,4 @@ class BufferedInputStream : public InputStreamInterface {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_LIB_IO_BUFFERED_INPUTSTREAM_H_
+#endif  // TENSORFLOW_LIB_IO_BUFFERED_INPUTSTREAM_H_
diff --git a/tensorflow/core/lib/io/compression.h b/tensorflow/core/lib/io/compression.h
index 7a0c5c12a7..ef90c60a3a 100644
--- a/tensorflow/core/lib/io/compression.h
+++ b/tensorflow/core/lib/io/compression.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
+#define TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
 
 namespace tensorflow {
 namespace io {
@@ -27,4 +27,4 @@ extern const char kGzip[];
 }
 }
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
diff --git a/tensorflow/core/lib/io/inputstream_interface.h b/tensorflow/core/lib/io/inputstream_interface.h
index 096248693b..3083d20776 100644
--- a/tensorflow/core/lib/io/inputstream_interface.h
+++ b/tensorflow/core/lib/io/inputstream_interface.h
@@ -54,4 +54,4 @@ class InputStreamInterface {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_INPUTSTREAM_INTERFACE_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_INPUTSTREAM_INTERFACE_H_
diff --git a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.h b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.h
index 5d330a2c5a..5aea503846 100644
--- a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.h
+++ b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
+#define TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
 
 #include <string>
 #include "tensorflow/core/lib/core/status.h"
@@ -117,4 +117,4 @@ class SnappyOutputBuffer {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.h b/tensorflow/core/lib/io/zlib_outputbuffer.h
index 5cad2e9457..3d86d89a99 100644
--- a/tensorflow/core/lib/io/zlib_outputbuffer.h
+++ b/tensorflow/core/lib/io/zlib_outputbuffer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
+#define TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
 
 #include <zlib.h>
 
@@ -143,4 +143,4 @@ class ZlibOutputBuffer : public WritableFile {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
diff --git a/tensorflow/core/lib/monitoring/collected_metrics.h b/tensorflow/core/lib/monitoring/collected_metrics.h
index acdb0d86ed..e200981609 100644
--- a/tensorflow/core/lib/monitoring/collected_metrics.h
+++ b/tensorflow/core/lib/monitoring/collected_metrics.h
@@ -17,8 +17,8 @@ limitations under the License.
 // These are to be used only by the CollectionRegistry and exporters which
 // collect metrics using the CollectionRegistry.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
 
 #include <map>
 #include <memory>
@@ -151,4 +151,4 @@ struct CollectedMetrics {
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
diff --git a/tensorflow/core/lib/monitoring/collection_registry.h b/tensorflow/core/lib/monitoring/collection_registry.h
index 2c8e250c56..63cc0f550d 100644
--- a/tensorflow/core/lib/monitoring/collection_registry.h
+++ b/tensorflow/core/lib/monitoring/collection_registry.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
 
 #include <map>
 #include <memory>
@@ -356,4 +356,4 @@ MetricCollector<metric_kind, Value, NumLabels> MetricCollectorGetter::Get(
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
diff --git a/tensorflow/core/lib/monitoring/counter.h b/tensorflow/core/lib/monitoring/counter.h
index 7240348a9b..8ff810db41 100644
--- a/tensorflow/core/lib/monitoring/counter.h
+++ b/tensorflow/core/lib/monitoring/counter.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
 
 // We replace this implementation with a null implementation for mobile
 // platforms.
@@ -172,4 +172,4 @@ CounterCell* Counter<NumLabels>::GetCell(const Labels&... labels)
 }  // namespace tensorflow
 
 #endif  // IS_MOBILE_PLATFORM
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
diff --git a/tensorflow/core/lib/monitoring/gauge.h b/tensorflow/core/lib/monitoring/gauge.h
index ec978a9193..ee9a862f40 100644
--- a/tensorflow/core/lib/monitoring/gauge.h
+++ b/tensorflow/core/lib/monitoring/gauge.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
 
 // We replace this implementation with a null implementation for mobile
 // platforms.
@@ -241,4 +241,4 @@ GaugeCell<ValueType>* Gauge<ValueType, NumLabels>::GetCell(
 }  // namespace tensorflow
 
 #endif  // IS_MOBILE_PLATFORM
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
diff --git a/tensorflow/core/lib/monitoring/metric_def.h b/tensorflow/core/lib/monitoring/metric_def.h
index f046842618..5ecadcc427 100644
--- a/tensorflow/core/lib/monitoring/metric_def.h
+++ b/tensorflow/core/lib/monitoring/metric_def.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
 
 #include <array>
 #include <vector>
@@ -139,4 +139,4 @@ class MetricDef : public AbstractMetricDef {
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
diff --git a/tensorflow/core/lib/monitoring/mobile_counter.h b/tensorflow/core/lib/monitoring/mobile_counter.h
index c30bfe026f..c297d843d2 100644
--- a/tensorflow/core/lib/monitoring/mobile_counter.h
+++ b/tensorflow/core/lib/monitoring/mobile_counter.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Null implementation of the Counter metric for mobile platforms.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
 
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -64,4 +64,4 @@ class Counter {
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
diff --git a/tensorflow/core/lib/monitoring/mobile_gauge.h b/tensorflow/core/lib/monitoring/mobile_gauge.h
index ac13ad35c0..a03b41aef3 100644
--- a/tensorflow/core/lib/monitoring/mobile_gauge.h
+++ b/tensorflow/core/lib/monitoring/mobile_gauge.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Null implementation of the Gauge metric for mobile platforms.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
 
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -69,4 +69,4 @@ class Gauge {
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
diff --git a/tensorflow/core/lib/monitoring/mobile_sampler.h b/tensorflow/core/lib/monitoring/mobile_sampler.h
index cf390e5c7f..77310dd619 100644
--- a/tensorflow/core/lib/monitoring/mobile_sampler.h
+++ b/tensorflow/core/lib/monitoring/mobile_sampler.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Null implementation of the Sampler metric for mobile platforms.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
 
 #include <memory>
 
@@ -98,4 +98,4 @@ class Sampler {
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
diff --git a/tensorflow/core/lib/monitoring/sampler.h b/tensorflow/core/lib/monitoring/sampler.h
index c7a05428e2..a4f397f556 100644
--- a/tensorflow/core/lib/monitoring/sampler.h
+++ b/tensorflow/core/lib/monitoring/sampler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
 
 // We replace this implementation with a null implementation for mobile
 // platforms.
@@ -215,4 +215,4 @@ SamplerCell* Sampler<NumLabels>::GetCell(const Labels&... labels)
 }  // namespace tensorflow
 
 #endif  // IS_MOBILE_PLATFORM
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
diff --git a/tensorflow/core/lib/strings/proto_text_util.h b/tensorflow/core/lib/strings/proto_text_util.h
index ed6d0af010..05dbda6e15 100644
--- a/tensorflow/core/lib/strings/proto_text_util.h
+++ b/tensorflow/core/lib/strings/proto_text_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
+#ifndef TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
+#define TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
 
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
@@ -164,4 +164,4 @@ bool ProtoParseStringLiteralFromScanner(Scanner* scanner, string* value);
 }  // namespace strings
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
+#endif  // TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.h b/tensorflow/core/platform/cloud/gcs_dns_cache.h
index dd95c18f35..40f16f1044 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache.h
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
-#define THIRD_PARTY_TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
+#ifndef TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
+#define TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
 
 #include <random>
 
@@ -74,4 +74,4 @@ class GcsDnsCache {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
+#endif  // TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
diff --git a/tensorflow/core/platform/cloud/oauth_client.h b/tensorflow/core/platform/cloud/oauth_client.h
index 1614c7b315..519d69acf9 100644
--- a/tensorflow/core/platform/cloud/oauth_client.h
+++ b/tensorflow/core/platform/cloud/oauth_client.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
 
 #include <memory>
 #include "include/json/json.h"
@@ -59,4 +59,4 @@ class OAuthClient {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
diff --git a/tensorflow/core/platform/cloud/retrying_utils.h b/tensorflow/core/platform/cloud/retrying_utils.h
index 99ab216e97..546b8d1c4a 100644
--- a/tensorflow/core/platform/cloud/retrying_utils.h
+++ b/tensorflow/core/platform/cloud/retrying_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
 
 #include <functional>
 #include "tensorflow/core/lib/core/status.h"
@@ -47,4 +47,4 @@ class RetryingUtils {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
diff --git a/tensorflow/core/platform/cloud/time_util.h b/tensorflow/core/platform/cloud/time_util.h
index b1bb7f1119..d6d4bc499f 100644
--- a/tensorflow/core/platform/cloud/time_util.h
+++ b/tensorflow/core/platform/cloud/time_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
 
 #include "tensorflow/core/lib/core/status.h"
 
@@ -26,4 +26,4 @@ Status ParseRfc3339Time(const string& time, int64* mtime_nsec);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
diff --git a/tensorflow/core/platform/cuda_libdevice_path.h b/tensorflow/core/platform/cuda_libdevice_path.h
index 601d0db6d4..6ef565ecd3 100644
--- a/tensorflow/core/platform/cuda_libdevice_path.h
+++ b/tensorflow/core/platform/cuda_libdevice_path.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
+#define TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -29,4 +29,4 @@ string LibdeviceRoot();
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
diff --git a/tensorflow/core/platform/cupti_wrapper.h b/tensorflow/core/platform/cupti_wrapper.h
index c909dcd35b..9a17ab60c0 100644
--- a/tensorflow/core/platform/cupti_wrapper.h
+++ b/tensorflow/core/platform/cupti_wrapper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
+#define TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
 
 #include "tensorflow/core/platform/platform.h"
 
@@ -24,4 +24,4 @@ limitations under the License.
 #include "tensorflow/core/platform/default/gpu/cupti_wrapper.h"
 #endif
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.h b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
index 38e01cefad..acd889e474 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.h
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
+#define TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
 
 #if GOOGLE_CUDA
 
@@ -76,4 +76,4 @@ class CuptiWrapper {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
diff --git a/tensorflow/core/platform/demangle.h b/tensorflow/core/platform/demangle.h
index c2def217a1..ce33be2e68 100644
--- a/tensorflow/core/platform/demangle.h
+++ b/tensorflow/core/platform/demangle.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
+#define TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -28,4 +28,4 @@ string Demangle(const char* mangled);
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
diff --git a/tensorflow/core/platform/file_statistics.h b/tensorflow/core/platform/file_statistics.h
index 7629db6ef9..9e3489b1ad 100644
--- a/tensorflow/core/platform/file_statistics.h
+++ b/tensorflow/core/platform/file_statistics.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
+#define TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -36,4 +36,4 @@ struct FileStatistics {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.h b/tensorflow/core/platform/hadoop/hadoop_file_system.h
index 447e83158a..5f2b222622 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.h
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
+#define TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
 
 #include "tensorflow/core/platform/env.h"
 
@@ -70,4 +70,4 @@ class HadoopFileSystem : public FileSystem {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
diff --git a/tensorflow/core/platform/stacktrace_handler.h b/tensorflow/core/platform/stacktrace_handler.h
index d36c82c9ba..a52970fdaa 100644
--- a/tensorflow/core/platform/stacktrace_handler.h
+++ b/tensorflow/core/platform/stacktrace_handler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
+#define TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
 
 namespace tensorflow {
 namespace testing {
@@ -25,4 +25,4 @@ void InstallStacktraceHandler();
 }  // namespace testing
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
diff --git a/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h b/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h
index c6544fe0b0..25766668d8 100644
--- a/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h
+++ b/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 // This checker checks the accelerator's utilization.
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
 
 #include "tensorflow/core/profiler/internal/advisor/checker.h"
 
@@ -106,4 +106,4 @@ class AcceleratorUtilizationChecker : public Checker {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/checker.h b/tensorflow/core/profiler/internal/advisor/checker.h
index 4b5ebcf9e8..5d7da39e6b 100644
--- a/tensorflow/core/profiler/internal/advisor/checker.h
+++ b/tensorflow/core/profiler/internal/advisor/checker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
 
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/profiler/internal/tfprof_stats.h"
@@ -49,4 +49,4 @@ class Checker {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h b/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h
index 145782c7bd..f5ac5c9c5a 100644
--- a/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h
+++ b/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 // This checker checks the most expensive operations.
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_
 
 #include "tensorflow/core/profiler/internal/advisor/checker.h"
 
@@ -137,4 +137,4 @@ class ExpensiveOperationChecker : public Checker {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OP_CHECKER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OP_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h b/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h
index ec52741b19..6fc16cf903 100644
--- a/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h
+++ b/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
 
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
 #include "tensorflow/core/profiler/tfprof_options.pb.h"
@@ -31,4 +31,4 @@ AdviceProto RunInternalCheckers(const AdvisorOptionsProto& options,
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/operation_checker.h b/tensorflow/core/profiler/internal/advisor/operation_checker.h
index f0bd72fa40..6c1d5cd670 100644
--- a/tensorflow/core/profiler/internal/advisor/operation_checker.h
+++ b/tensorflow/core/profiler/internal/advisor/operation_checker.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // This checker checks common wrong configurations of operations.
 //
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
 
 #include "tensorflow/core/profiler/internal/advisor/checker.h"
 
@@ -74,4 +74,4 @@ class OperationChecker : public Checker {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h b/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h
index 42bd6d5438..270662bd4a 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
 
 #include "tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h"
 #include "tensorflow/core/profiler/internal/advisor/checker.h"
@@ -78,4 +78,4 @@ class Advisor {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
diff --git a/tensorflow/core/profiler/internal/print_model_analysis.h b/tensorflow/core/profiler/internal/print_model_analysis.h
index 90166aa7d5..29666ab936 100644
--- a/tensorflow/core/profiler/internal/print_model_analysis.h
+++ b/tensorflow/core/profiler/internal/print_model_analysis.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
 
 #include <string>
 
@@ -63,4 +63,4 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_code.h b/tensorflow/core/profiler/internal/tfprof_code.h
index bcbdc1b48c..38395f967c 100644
--- a/tensorflow/core/profiler/internal/tfprof_code.h
+++ b/tensorflow/core/profiler/internal/tfprof_code.h
@@ -16,8 +16,8 @@ limitations under the License.
 // Build a tree structure based on the TensorFlow model's python code stacks.
 // Stats are aggregated from descendants to ancestors.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
 
 #include <map>
 #include <memory>
@@ -94,4 +94,4 @@ class TFCode : public TFMultiShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_constants.h b/tensorflow/core/profiler/internal/tfprof_constants.h
index 6a4eaaa890..d4a47931a2 100644
--- a/tensorflow/core/profiler/internal/tfprof_constants.h
+++ b/tensorflow/core/profiler/internal/tfprof_constants.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
 
 namespace tensorflow {
 namespace tfprof {
@@ -34,4 +34,4 @@ static const char* const kCkptVarType = "_checkpoint_variables";
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_graph.h b/tensorflow/core/profiler/internal/tfprof_graph.h
index f7eef9c835..356a459a65 100644
--- a/tensorflow/core/profiler/internal/tfprof_graph.h
+++ b/tensorflow/core/profiler/internal/tfprof_graph.h
@@ -16,8 +16,8 @@ limitations under the License.
 // Build a graph structure based on op inputs/outputs. The graph is a directed
 // acyclic graph pointing *from outputs to inputs*.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
 
 #include <deque>
 #include <map>
@@ -86,4 +86,4 @@ class TFGraph : public TFShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h
index 255a0987e6..0a97b1cb0f 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.h
+++ b/tensorflow/core/profiler/internal/tfprof_node.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
 
 #include <map>
 #include <set>
@@ -915,4 +915,4 @@ bool IsCanonicalDevice(const string& device);
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_node_show.h b/tensorflow/core/profiler/internal/tfprof_node_show.h
index ca6f9bca5e..517da67d74 100644
--- a/tensorflow/core/profiler/internal/tfprof_node_show.h
+++ b/tensorflow/core/profiler/internal/tfprof_node_show.h
@@ -21,8 +21,8 @@ limitations under the License.
 // ScopeNode and GraphNode each maps to one TFGraphNode.
 // CodeNode and OpNode each maps to one TFMultiGraphNode.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
 
 #include <algorithm>
 #include <string>
@@ -156,4 +156,4 @@ class OpNode : public ShowMultiNode {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_op.h b/tensorflow/core/profiler/internal/tfprof_op.h
index fcc5e68f47..fe1c3b2ae8 100644
--- a/tensorflow/core/profiler/internal/tfprof_op.h
+++ b/tensorflow/core/profiler/internal/tfprof_op.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Build a flat structure of ops.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
 
 #include <deque>
 #include <map>
@@ -76,4 +76,4 @@ class TFOp : public TFMultiShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_scope.h b/tensorflow/core/profiler/internal/tfprof_scope.h
index bb847c0866..235dfde803 100644
--- a/tensorflow/core/profiler/internal/tfprof_scope.h
+++ b/tensorflow/core/profiler/internal/tfprof_scope.h
@@ -17,8 +17,8 @@ limitations under the License.
 // For example, 'name1/name2' is a child of 'name1'.
 // Stats are aggregated from descendants to ancestors.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
 
 #include <map>
 #include <memory>
@@ -74,4 +74,4 @@ class TFScope : public TFShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_show.h b/tensorflow/core/profiler/internal/tfprof_show.h
index 2067ea3b73..4d6de06070 100644
--- a/tensorflow/core/profiler/internal/tfprof_show.h
+++ b/tensorflow/core/profiler/internal/tfprof_show.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Parent class and utilities for tfprof_graph and tfprof_scope.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
 
 #include <algorithm>
 #include <string>
@@ -151,4 +151,4 @@ string FormatAcceleratorExecTime(const T* node, const Options& opts) {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_show_multi.h b/tensorflow/core/profiler/internal/tfprof_show_multi.h
index ac0ada0449..2a2208d8e7 100644
--- a/tensorflow/core/profiler/internal/tfprof_show_multi.h
+++ b/tensorflow/core/profiler/internal/tfprof_show_multi.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Parent class and utilities for tfprof_code.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
 
 #include <algorithm>
 #include <string>
@@ -127,4 +127,4 @@ class TFMultiShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h
index d78abda588..0790cb0ca6 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.h
+++ b/tensorflow/core/profiler/internal/tfprof_stats.h
@@ -20,8 +20,8 @@ limitations under the License.
 // 3. Accept command and options to selectively aggregate stats for analysis
 //    and print out the results.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
 
 #include <map>
 #include <memory>
@@ -125,4 +125,4 @@ class TFStats {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_tensor.h b/tensorflow/core/profiler/internal/tfprof_tensor.h
index 9f72e081c9..7a08857720 100644
--- a/tensorflow/core/profiler/internal/tfprof_tensor.h
+++ b/tensorflow/core/profiler/internal/tfprof_tensor.h
@@ -19,8 +19,8 @@ limitations under the License.
 //    is not supported by TensorFlow CheckPointReader library, though it is
 //    supported in current code.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
 
 #include <typeinfo>
 
@@ -173,4 +173,4 @@ class TFProfTensor {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.h b/tensorflow/core/profiler/internal/tfprof_timeline.h
index b8174cdecb..4428ab571f 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.h
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
 
 #include "include/json/json.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -191,4 +191,4 @@ class Timeline {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_utils.h b/tensorflow/core/profiler/internal/tfprof_utils.h
index afca3df7f8..d4f80afce0 100644
--- a/tensorflow/core/profiler/internal/tfprof_utils.h
+++ b/tensorflow/core/profiler/internal/tfprof_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
 
 #include <string>
 #include <vector>
@@ -72,4 +72,4 @@ string QueryDoc(const string& cmd, const Options& opts);
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
diff --git a/tensorflow/core/profiler/tfprof_options.h b/tensorflow/core/profiler/tfprof_options.h
index 463f5b3c3a..d61deb72ac 100644
--- a/tensorflow/core/profiler/tfprof_options.h
+++ b/tensorflow/core/profiler/tfprof_options.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
 
 #include <set>
 #include <string>
@@ -183,4 +183,4 @@ tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
diff --git a/tensorflow/core/util/command_line_flags.h b/tensorflow/core/util/command_line_flags.h
index 121c7063c9..928ae8a4e9 100644
--- a/tensorflow/core/util/command_line_flags.h
+++ b/tensorflow/core/util/command_line_flags.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
+#ifndef TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
+#define TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
 
 #include <functional>
 #include <string>
@@ -134,4 +134,4 @@ class Flags {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
+#endif  // TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
diff --git a/tensorflow/core/util/example_proto_fast_parsing.h b/tensorflow/core/util/example_proto_fast_parsing.h
index fe59ec77ca..1b08f02267 100644
--- a/tensorflow/core/util/example_proto_fast_parsing.h
+++ b/tensorflow/core/util/example_proto_fast_parsing.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
+#ifndef TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
+#define TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
 
 #include <string>
 #include <unordered_map>
@@ -94,4 +94,4 @@ bool TestFastParse(const string& serialized, Example* example);
 }  // namespace example
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
+#endif  // TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
diff --git a/tensorflow/core/util/example_proto_helper.h b/tensorflow/core/util/example_proto_helper.h
index 8b3c6c5a3f..e511704962 100644
--- a/tensorflow/core/util/example_proto_helper.h
+++ b/tensorflow/core/util/example_proto_helper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
+#ifndef TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
+#define TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
 
 #include <string>
 #include <vector>
@@ -314,4 +314,4 @@ class ParseSingleSequenceExampleAttrs {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
+#endif  // TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
diff --git a/tensorflow/core/util/matmul_autotune.h b/tensorflow/core/util/matmul_autotune.h
index 5366623883..5846cae2fc 100644
--- a/tensorflow/core/util/matmul_autotune.h
+++ b/tensorflow/core/util/matmul_autotune.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // The utility to check matmul autotune related flags.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
+#ifndef TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
+#define TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
 
 namespace tensorflow {
 
@@ -25,4 +25,4 @@ bool MatmulDoFP32ComputationFP16Input();
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
+#endif  // TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
diff --git a/tensorflow/core/util/strided_slice_op.h b/tensorflow/core/util/strided_slice_op.h
index abca98f27b..25ecccd285 100644
--- a/tensorflow/core/util/strided_slice_op.h
+++ b/tensorflow/core/util/strided_slice_op.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
+#ifndef TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
+#define TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -62,4 +62,4 @@ Status ValidateStridedSliceOp(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
+#endif  // TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/config.h b/tensorflow/examples/android/jni/object_tracking/config.h
index 86e9fc71b6..47de2d2c15 100644
--- a/tensorflow/examples/android/jni/object_tracking/config.h
+++ b/tensorflow/examples/android/jni/object_tracking/config.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
 
 #include <math.h>
 
@@ -297,4 +297,4 @@ struct TrackerConfig {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/flow_cache.h b/tensorflow/examples/android/jni/object_tracking/flow_cache.h
index 8813ab6d71..b62e334ecd 100644
--- a/tensorflow/examples/android/jni/object_tracking/flow_cache.h
+++ b/tensorflow/examples/android/jni/object_tracking/flow_cache.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/geom.h"
 #include "tensorflow/examples/android/jni/object_tracking/utils.h"
@@ -303,4 +303,4 @@ class FlowCache {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/frame_pair.h b/tensorflow/examples/android/jni/object_tracking/frame_pair.h
index 8f409fe806..6c8ac9be98 100644
--- a/tensorflow/examples/android/jni/object_tracking/frame_pair.h
+++ b/tensorflow/examples/android/jni/object_tracking/frame_pair.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/keypoint.h"
 
@@ -100,4 +100,4 @@ class FramePair {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/geom.h b/tensorflow/examples/android/jni/object_tracking/geom.h
index 2819063616..c975e40144 100644
--- a/tensorflow/examples/android/jni/object_tracking/geom.h
+++ b/tensorflow/examples/android/jni/object_tracking/geom.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/logging.h"
 #include "tensorflow/examples/android/jni/object_tracking/utils.h"
@@ -316,4 +316,4 @@ inline BoundingSquare GetCenteredSquare(const BoundingBox& original_box) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/gl_utils.h b/tensorflow/examples/android/jni/object_tracking/gl_utils.h
index bd5c233f4f..a29e677d3c 100755
--- a/tensorflow/examples/android/jni/object_tracking/gl_utils.h
+++ b/tensorflow/examples/android/jni/object_tracking/gl_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
 
 #include <GLES/gl.h>
 #include <GLES/glext.h>
@@ -52,4 +52,4 @@ inline static void MapWorldSquareToUnitSquare(const BoundingSquare& square) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/image-inl.h b/tensorflow/examples/android/jni/object_tracking/image-inl.h
index 9c4c389aa7..61d69908b5 100644
--- a/tensorflow/examples/android/jni/object_tracking/image-inl.h
+++ b/tensorflow/examples/android/jni/object_tracking/image-inl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
 
 #include <stdint.h>
 
@@ -641,4 +641,4 @@ inline void Image<T>::FromArray(const T* const pixels, const int stride,
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/image.h b/tensorflow/examples/android/jni/object_tracking/image.h
index b7a2301f5e..a436f0e0a1 100644
--- a/tensorflow/examples/android/jni/object_tracking/image.h
+++ b/tensorflow/examples/android/jni/object_tracking/image.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
 
 #include <stdint.h>
 
@@ -338,4 +338,4 @@ inline std::ostream& operator<<(std::ostream& stream, const Image<t>& image) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/image_data.h b/tensorflow/examples/android/jni/object_tracking/image_data.h
index 445cdb57a3..c4f91d8cbd 100644
--- a/tensorflow/examples/android/jni/object_tracking/image_data.h
+++ b/tensorflow/examples/android/jni/object_tracking/image_data.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
 
 #include <stdint.h>
 #include <memory>
@@ -261,4 +261,4 @@ class ImageData {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/image_utils.h b/tensorflow/examples/android/jni/object_tracking/image_utils.h
index ac9ffd90f8..b4ad7000b3 100644
--- a/tensorflow/examples/android/jni/object_tracking/image_utils.h
+++ b/tensorflow/examples/android/jni/object_tracking/image_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
 
 #include <stdint.h>
 
@@ -295,4 +295,4 @@ inline void NormalizeImage(Image<float>* const image) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/integral_image.h b/tensorflow/examples/android/jni/object_tracking/integral_image.h
index 8e82334abf..caf9b7d2ab 100755
--- a/tensorflow/examples/android/jni/object_tracking/integral_image.h
+++ b/tensorflow/examples/android/jni/object_tracking/integral_image.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/geom.h"
 #include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
@@ -184,4 +184,4 @@ class IntegralImage : public Image<uint32_t> {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/jni_utils.h b/tensorflow/examples/android/jni/object_tracking/jni_utils.h
index 21fbabb521..b81d9e0c12 100644
--- a/tensorflow/examples/android/jni/object_tracking/jni_utils.h
+++ b/tensorflow/examples/android/jni/object_tracking/jni_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_
 
 #include <stdint.h>
 
diff --git a/tensorflow/examples/android/jni/object_tracking/keypoint.h b/tensorflow/examples/android/jni/object_tracking/keypoint.h
index 719f9aff3f..93405a5b2a 100644
--- a/tensorflow/examples/android/jni/object_tracking/keypoint.h
+++ b/tensorflow/examples/android/jni/object_tracking/keypoint.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/geom.h"
 #include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
@@ -45,4 +45,4 @@ inline std::ostream& operator<<(std::ostream& stream, const Keypoint keypoint) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/keypoint_detector.h b/tensorflow/examples/android/jni/object_tracking/keypoint_detector.h
index 33d228128d..2e85b835a7 100644
--- a/tensorflow/examples/android/jni/object_tracking/keypoint_detector.h
+++ b/tensorflow/examples/android/jni/object_tracking/keypoint_detector.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
 
 #include <stdint.h>
 #include <vector>
@@ -125,4 +125,4 @@ class KeypointDetector {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/logging.h b/tensorflow/examples/android/jni/object_tracking/logging.h
index dbc89af2f7..852a749399 100644
--- a/tensorflow/examples/android/jni/object_tracking/logging.h
+++ b/tensorflow/examples/android/jni/object_tracking/logging.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
 
 #include <android/log.h>
 #include <string.h>
@@ -118,4 +118,4 @@ void LogPrintF(const int severity, const char* format, ...);
 
 #endif
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/object_detector.h b/tensorflow/examples/android/jni/object_tracking/object_detector.h
index 2525567678..a65c7b0db7 100644
--- a/tensorflow/examples/android/jni/object_tracking/object_detector.h
+++ b/tensorflow/examples/android/jni/object_tracking/object_detector.h
@@ -20,8 +20,8 @@ limitations under the License.
 // Defines the ObjectDetector class that is the main interface for detecting
 // ObjectModelBases in frames.
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
 
 #include <float.h>
 #include <map>
@@ -227,4 +227,4 @@ class ObjectDetector : public ObjectDetectorBase {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/object_model.h b/tensorflow/examples/android/jni/object_tracking/object_model.h
index be33aea638..5e81c49080 100644
--- a/tensorflow/examples/android/jni/object_tracking/object_model.h
+++ b/tensorflow/examples/android/jni/object_tracking/object_model.h
@@ -19,8 +19,8 @@ limitations under the License.
 
 // Contains ObjectModelBase declaration.
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
 
 #ifdef __RENDER_OPENGL__
 #include <GLES/gl.h>
@@ -99,4 +99,4 @@ class ObjectModel : public ObjectModelBase {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/object_tracker.h b/tensorflow/examples/android/jni/object_tracking/object_tracker.h
index eb281fad37..20c7627fc5 100644
--- a/tensorflow/examples/android/jni/object_tracking/object_tracker.h
+++ b/tensorflow/examples/android/jni/object_tracking/object_tracker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
 
 #include <map>
 #include <string>
@@ -267,4 +267,4 @@ inline std::ostream& operator<<(std::ostream& stream,
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/optical_flow.h b/tensorflow/examples/android/jni/object_tracking/optical_flow.h
index 2206375beb..f98ae22bd6 100644
--- a/tensorflow/examples/android/jni/object_tracking/optical_flow.h
+++ b/tensorflow/examples/android/jni/object_tracking/optical_flow.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/geom.h"
 #include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
@@ -97,4 +97,4 @@ class OpticalFlow {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/sprite.h b/tensorflow/examples/android/jni/object_tracking/sprite.h
index 05a13fea11..b54a68458f 100755
--- a/tensorflow/examples/android/jni/object_tracking/sprite.h
+++ b/tensorflow/examples/android/jni/object_tracking/sprite.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
 
 #include <GLES/gl.h>
 #include <GLES/glext.h>
@@ -199,4 +199,4 @@ class Sprite {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/time_log.h b/tensorflow/examples/android/jni/object_tracking/time_log.h
index 60911da396..0073e11596 100644
--- a/tensorflow/examples/android/jni/object_tracking/time_log.h
+++ b/tensorflow/examples/android/jni/object_tracking/time_log.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Utility functions for performance profiling.
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
 
 #include <stdint.h>
 
@@ -134,4 +134,4 @@ inline static void TimeLog(const char* const str) {
 inline static void PrintTimeLog() {}
 #endif
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/tracked_object.h b/tensorflow/examples/android/jni/object_tracking/tracked_object.h
index cda14e19d2..d7f1a7019b 100644
--- a/tensorflow/examples/android/jni/object_tracking/tracked_object.h
+++ b/tensorflow/examples/android/jni/object_tracking/tracked_object.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
 
 #ifdef __RENDER_OPENGL__
 #include "tensorflow/examples/android/jni/object_tracking/gl_utils.h"
@@ -183,4 +183,4 @@ inline std::ostream& operator<<(std::ostream& stream,
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/utils.h b/tensorflow/examples/android/jni/object_tracking/utils.h
index 51cdfcdcfb..2e98734ec4 100644
--- a/tensorflow/examples/android/jni/object_tracking/utils.h
+++ b/tensorflow/examples/android/jni/object_tracking/utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
 
 #include <math.h>
 #include <stdint.h>
@@ -378,4 +378,4 @@ inline bool Invert2x2(const T* const a, float* const a_inv) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
diff --git a/tensorflow/examples/speech_commands/accuracy_utils.h b/tensorflow/examples/speech_commands/accuracy_utils.h
index 8d918cb64b..eea048365b 100644
--- a/tensorflow/examples/speech_commands/accuracy_utils.h
+++ b/tensorflow/examples/speech_commands/accuracy_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
+#ifndef TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
+#define TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
 
 #include <vector>
 
@@ -57,4 +57,4 @@ void PrintAccuracyStats(const StreamingAccuracyStats& stats);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
+#endif  // TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
diff --git a/tensorflow/examples/speech_commands/recognize_commands.h b/tensorflow/examples/speech_commands/recognize_commands.h
index 7f8041f9ed..a7cd194bec 100644
--- a/tensorflow/examples/speech_commands/recognize_commands.h
+++ b/tensorflow/examples/speech_commands/recognize_commands.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
+#ifndef TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
+#define TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
 
 #include <deque>
 #include <unordered_set>
@@ -76,4 +76,4 @@ class RecognizeCommands {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
+#endif  // TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
index fa8cb0abe9..eada07e06f 100644
--- a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+#ifndef TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+#define TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
@@ -28,4 +28,4 @@ tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
                                     tensorflow::int32 stride, float brightness,
                                     const tensorflow::string& output_image);
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+#endif  // TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
diff --git a/tensorflow/python/eager/python_eager_op_gen.h b/tensorflow/python/eager/python_eager_op_gen.h
index f9dfdf0408..d27b00139d 100644
--- a/tensorflow/python/eager/python_eager_op_gen.h
+++ b/tensorflow/python/eager/python_eager_op_gen.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
+#ifndef TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
+#define TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
 
 #include <string>
 #include <vector>
@@ -40,4 +40,4 @@ string GetEagerPythonWrappers(const char* op_list_buf, size_t op_list_len);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
+#endif  // TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
diff --git a/tensorflow/python/framework/cpp_shape_inference.h b/tensorflow/python/framework/cpp_shape_inference.h
index afca7277c7..c6ab6b106f 100644
--- a/tensorflow/python/framework/cpp_shape_inference.h
+++ b/tensorflow/python/framework/cpp_shape_inference.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
+#ifndef TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
+#define TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
 
 // Must be included first
 #include "tensorflow/python/lib/core/numpy.h"
@@ -51,4 +51,4 @@ std::vector<string> RunCppShapeInference(
 }  // namespace swig
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
+#endif  // TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
diff --git a/tensorflow/python/framework/python_op_gen_internal.h b/tensorflow/python/framework/python_op_gen_internal.h
index 6b53825a6d..d09b36a3e8 100644
--- a/tensorflow/python/framework/python_op_gen_internal.h
+++ b/tensorflow/python/framework/python_op_gen_internal.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
+#ifndef TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
+#define TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
 
 #include <unordered_map>
 
@@ -112,4 +112,4 @@ class GenPythonOp {
 }  // namespace python_op_gen_internal
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
+#endif  // TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
diff --git a/tensorflow/python/lib/core/ndarray_tensor.h b/tensorflow/python/lib/core/ndarray_tensor.h
index 5172d504bd..b2cd4133ca 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.h
+++ b/tensorflow/python/lib/core/ndarray_tensor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
 
 // Must be included first.
 #include "tensorflow/python/lib/core/numpy.h"
@@ -45,4 +45,4 @@ Status TensorToNdarray(const Tensor& t, PyObject** ret);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
diff --git a/tensorflow/python/lib/core/py_seq_tensor.h b/tensorflow/python/lib/core/py_seq_tensor.h
index 6dc4d9c777..c6e5080c62 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.h
+++ b/tensorflow/python/lib/core/py_seq_tensor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
 
 #include <Python.h>
 
@@ -34,4 +34,4 @@ Status PySeqToTensor(PyObject* obj, PyObject* dtype, Tensor* ret);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
diff --git a/tensorflow/python/lib/core/safe_ptr.h b/tensorflow/python/lib/core/safe_ptr.h
index 80db840aeb..32d2868886 100644
--- a/tensorflow/python/lib/core/safe_ptr.h
+++ b/tensorflow/python/lib/core/safe_ptr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
 
 #include <memory>
 
@@ -66,4 +66,4 @@ Safe_TF_StatusPtr make_safe(TF_Status* status);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
diff --git a/tensorflow/python/util/kernel_registry.h b/tensorflow/python/util/kernel_registry.h
index c00b60d91b..1ba76f020b 100644
--- a/tensorflow/python/util/kernel_registry.h
+++ b/tensorflow/python/util/kernel_registry.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 // Functions for getting information about kernels registered in the binary.
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
+#ifndef TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
+#define TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/platform/types.h"
@@ -31,4 +31,4 @@ string TryFindKernelClass(const string& serialized_node_def);
 }  // namespace swig
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
+#endif  // TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index 493d26b497..2af71dc753 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 // Functions for getting information about kernels registered in the binary.
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_UTIL_H_
+#ifndef TENSORFLOW_PYTHON_UTIL_UTIL_H_
+#define TENSORFLOW_PYTHON_UTIL_UTIL_H_
 
 #include <Python.h>
 
@@ -71,4 +71,4 @@ void RegisterSequenceClass(PyObject* sequence_class);
 }  // namespace swig
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_UTIL_H_
+#endif  // TENSORFLOW_PYTHON_UTIL_UTIL_H_
diff --git a/tensorflow/tools/graph_transforms/file_utils.h b/tensorflow/tools/graph_transforms/file_utils.h
index 4737e95abc..a3723f5cd3 100644
--- a/tensorflow/tools/graph_transforms/file_utils.h
+++ b/tensorflow/tools/graph_transforms/file_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
+#ifndef TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
+#define TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -29,4 +29,4 @@ Status LoadTextOrBinaryGraphFile(const string& file_name, GraphDef* graph_def);
 }  // namespace graph_transforms
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
+#endif  // TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h
index 44387bbd4d..e18d749cff 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
+#ifndef TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
+#define TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
 
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -50,4 +50,4 @@ ProtoTextFunctionCode GetProtoTextFunctionCode(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
+#endif  // TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
index c210b1712c..cb1636256d 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
@@ -1,5 +1,5 @@
-#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
-#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
+#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
+#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
 
 #ifdef _MSC_VER
 
@@ -502,4 +502,4 @@ struct functor_traits<scalar_product_op<QInt32, double>> {
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
+#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
index 7a222fddc1..8f9906dbf9 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
@@ -1,5 +1,5 @@
-#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
-#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
+#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
+#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
 
 #include "PacketMathAVX2.h"
 
@@ -542,4 +542,4 @@ EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
+#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
index 045384d7fc..7b4ecc752f 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
@@ -1,5 +1,5 @@
-#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
-#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
+#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
+#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
 
 namespace Eigen {
 namespace internal {
@@ -63,4 +63,4 @@ pcast<Packet8q32i, Packet32q8u>(const Packet8q32i& a, const Packet8q32i& b,
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
+#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
index cd7120ec00..26735743d4 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
@@ -1,5 +1,5 @@
-#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
-#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
+#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
+#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
 
 namespace Eigen {
 namespace internal {
@@ -177,4 +177,4 @@ pcast<Packet16q32i, Packet32q16u>(const Packet16q32i& a,
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
+#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
diff --git a/third_party/fft2d/fft.h b/third_party/fft2d/fft.h
index 252cc01fec..31b4935089 100644
--- a/third_party/fft2d/fft.h
+++ b/third_party/fft2d/fft.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Declarations for 1D FFT routines in third_party/fft2d/fft.
 
-#ifndef THIRD_PARTY_FFT2D_FFT_H__
-#define THIRD_PARTY_FFT2D_FFT_H__
+#ifndef FFT2D_FFT_H__
+#define FFT2D_FFT_H__
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,4 +33,4 @@ extern void dfst(int, double *, double *, int *, double *);
 }
 #endif
 
-#endif  // THIRD_PARTY_FFT2D_FFT_H__
+#endif  // FFT2D_FFT_H__
-- 
GitLab


From 98809980c420bb5db4759ce796d6b6dcc8877b73 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 14:41:35 -0800
Subject: [PATCH 112/966] [XLA] Enable F16 for the CPU and GPU backends.

Enhance the IR emitter to handle mathematics operations with F16 types.
Add tests for F16 mathematics operations.

PiperOrigin-RevId: 183144087
---
 .../xla/service/cpu/elemental_ir_emitter.cc   |  32 ++-
 .../xla/service/gpu/elemental_ir_emitter.cc   |  41 ++-
 .../compiler/xla/service/llvm_ir/llvm_util.cc |   7 +
 tensorflow/compiler/xla/tests/BUILD           |  24 ++
 .../xla/tests/client_library_test_base.h      |   5 +
 tensorflow/compiler/xla/tests/half_test.cc    | 257 ++++++++++++++++++
 .../compiler/xla/tests/literal_test_util.cc   |  12 +
 7 files changed, 362 insertions(+), 16 deletions(-)
 create mode 100644 tensorflow/compiler/xla/tests/half_test.cc

diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
index ebd96c4c42..99c5e16db7 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
@@ -33,8 +33,14 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
   switch (op->opcode()) {
     case HloOpcode::kTanh: {
       PrimitiveType element_type = op->shape().element_type();
+      bool cast_result_to_fp16 = false;
       string function_name;
       switch (element_type) {
+        case F16:
+          cast_result_to_fp16 = true;
+          operand_value = ir_builder_->CreateFPCast(operand_value,
+                                                    ir_builder_->getFloatTy());
+          TF_FALLTHROUGH_INTENDED;
         case F32:
           function_name = "tanhf";
           break;
@@ -44,7 +50,7 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
         default:
           return Unimplemented("tanh");
       }
-      // Create function declaration for 'tanhf'.
+      // Create a function declaration.
       llvm::Function* function =
           llvm::cast<llvm::Function>(module_->getOrInsertFunction(
               llvm_ir::AsStringRef(function_name), operand_value->getType(),
@@ -52,8 +58,12 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
       function->setCallingConv(llvm::CallingConv::C);
       function->setDoesNotThrow();
       function->setDoesNotAccessMemory();
-      // Create instruction to call 'tanhf'.
-      return ir_builder_->CreateCall(function, operand_value);
+      // Create an instruction to call the function.
+      llvm::Value* result = ir_builder_->CreateCall(function, operand_value);
+      if (cast_result_to_fp16) {
+        result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+      }
+      return result;
     }
     default:
       return ElementalIrEmitter::EmitFloatUnaryOp(op, operand_value);
@@ -63,7 +73,13 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
 StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(
     PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const {
   string function_name;
+  bool cast_result_to_fp16 = false;
   switch (prim_type) {
+    case F16:
+      cast_result_to_fp16 = true;
+      lhs = ir_builder_->CreateFPCast(lhs, ir_builder_->getFloatTy());
+      rhs = ir_builder_->CreateFPCast(rhs, ir_builder_->getFloatTy());
+      TF_FALLTHROUGH_INTENDED;
     case F32:
       function_name = "atan2f";
       break;
@@ -73,7 +89,7 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(
     default:
       return Unimplemented("atan2");
   }
-  // Create function declaration for 'atan2'.
+  // Create a function declaration.
   llvm::Function* function =
       llvm::cast<llvm::Function>(module_->getOrInsertFunction(
           llvm_ir::AsStringRef(function_name), lhs->getType(), lhs->getType(),
@@ -81,8 +97,12 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(
   function->setCallingConv(llvm::CallingConv::C);
   function->setDoesNotThrow();
   function->setDoesNotAccessMemory();
-  // Create instruction to call 'atan2'.
-  return ir_builder_->CreateCall(function, {lhs, rhs});
+  // Create an instruction to call the function.
+  llvm::Value* result = ir_builder_->CreateCall(function, {lhs, rhs});
+  if (cast_result_to_fp16) {
+    result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+  }
+  return result;
 }
 
 llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator(
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 4b511cb4bb..5af7a77ea8 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -72,9 +72,27 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLibdeviceMathCall(
     tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
     PrimitiveType output_type) const {
   // The libdevice math functions differentiate between "double" and "float" by
-  // appending an 'f' to the function's name.
+  // appending an 'f' to the function's name. libdevice doesn't have f16 math
+  // functions, so we convert the operands to f32 before calling the function
+  // and then convert the result back to f16.
   string munged_callee = callee_name;
+  bool cast_result_to_fp16 = false;
+  std::vector<llvm::Value*> converted_operands(operands.begin(),
+                                               operands.end());
+  std::vector<PrimitiveType> converted_input_types(input_types.begin(),
+                                                   input_types.end());
   switch (output_type) {
+    case F16:
+      cast_result_to_fp16 = true;
+      for (int64 i = 0; i < operands.size(); ++i) {
+        if (input_types[i] == F16) {
+          converted_operands[i] = ir_builder_->CreateFPCast(
+              converted_operands[i], ir_builder_->getFloatTy());
+          converted_input_types[i] = F32;
+        }
+      }
+      output_type = F32;
+      TF_FALLTHROUGH_INTENDED;
     case F32:
       StrAppend(&munged_callee, "f");
       break;
@@ -84,7 +102,13 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLibdeviceMathCall(
       return Unimplemented("Bad type for libdevice math call: %s",
                            PrimitiveType_Name(output_type).c_str());
   }
-  return EmitMathCall(munged_callee, operands, input_types, output_type);
+  llvm::Value* result = EmitMathCall(munged_callee, converted_operands,
+                                     converted_input_types, output_type)
+                            .ValueOrDie();
+  if (cast_result_to_fp16) {
+    result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+  }
+  return result;
 }
 
 StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLlvmIntrinsicMathCall(
@@ -92,10 +116,13 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLlvmIntrinsicMathCall(
     tensorflow::gtl::ArraySlice<llvm::Value*> operands,
     tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
     PrimitiveType output_type) const {
-  // llvm intrinsics differentiate between float/double functions via the ".f32"
-  // and ".f64" suffixes.
+  // llvm intrinsics differentiate between half/float/double functions via
+  // the suffixes ".f16", ".f32" and ".f64".
   string munged_callee = callee_name;
   switch (output_type) {
+    case F16:
+      StrAppend(&munged_callee, ".f16");
+      break;
     case F32:
       StrAppend(&munged_callee, ".f32");
       break;
@@ -233,12 +260,6 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
   PrimitiveType input_type = op->operand(0)->shape().element_type();
   PrimitiveType output_type = op->shape().element_type();
   switch (op->opcode()) {
-    case HloOpcode::kFloor:
-      return EmitLibdeviceMathCall("__nv_floor", {operand_value}, {input_type},
-                                   output_type);
-    case HloOpcode::kCeil:
-      return EmitLibdeviceMathCall("__nv_ceil", {operand_value}, {input_type},
-                                   output_type);
     case HloOpcode::kTanh:
       return EmitLibdeviceMathCall("__nv_tanh", {operand_value}, {input_type},
                                    output_type);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index d2bcb38d09..8d1e6338e1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -150,6 +150,8 @@ llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
       // addition to an addition on this type (int16) - this is just the type
       // used for storage.
       return llvm::Type::getInt16Ty(module->getContext());
+    case F16:
+      return llvm::Type::getHalfTy(module->getContext());
     case S32:
     case U32:
       return llvm::Type::getInt32Ty(module->getContext());
@@ -292,6 +294,11 @@ llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index,
             ir_element_type,
             tensorflow::bit_cast<uint16>(literal.Get<bfloat16>(*multi_index)));
         break;
+      case F16:
+        value = llvm::ConstantFP::get(
+            ir_element_type,
+            static_cast<float>(literal.Get<half>(*multi_index)));
+        break;
       case F64:
         value = llvm::ConstantFP::get(ir_element_type,
                                       literal.Get<double>(*multi_index));
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 84e7c1770b..6ab406d6c3 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -844,6 +844,30 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "half_test",
+    srcs = ["half_test.cc"],
+    backends = [
+        "cpu",
+        "gpu",
+    ],
+    deps = [
+        ":test_utils",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "slice_test",
     srcs = ["slice_test.cc"],
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index a559a653df..ba0319990b 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -431,6 +431,7 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, half>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -456,6 +457,7 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, half>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -481,6 +483,7 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, half>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -506,6 +509,7 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, half>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -531,6 +535,7 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, half>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
diff --git a/tensorflow/compiler/xla/tests/half_test.cc b/tensorflow/compiler/xla/tests/half_test.cc
new file mode 100644
index 0000000000..ec2f49d43b
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/half_test.cc
@@ -0,0 +1,257 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+
+// Tests the handling of the basic mathematics operations with F16 operands.
+
+namespace xla {
+namespace {
+
+class HalfTestBase : public ClientLibraryTestBase {
+ protected:
+  const ErrorSpec error_spec_{0.001, 0.001};
+  // Number of elements in the input buffers.
+  static const int kNumElements = 4;
+};
+
+using UnaryBuildFuncTy =
+    std::function<void(ComputationBuilder*, const ComputationDataHandle& src)>;
+
+struct UnaryOpTestParam {
+  std::function<half(half)> compute_func;
+  UnaryBuildFuncTy build_func;
+};
+
+class UnaryOpTest : public HalfTestBase,
+                    public ::testing::WithParamInterface<UnaryOpTestParam> {};
+
+XLA_TEST_P(UnaryOpTest, Ops) {
+  std::vector<half> x({half(1.4), half(-2.3), half(3.2), half(-4.1)});
+  ComputationBuilder builder(client_, TestName());
+  ComputationDataHandle x_opnd;
+  auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
+                                        &builder, &x_opnd);
+
+  std::function<half(half)> compute_func = GetParam().compute_func;
+  std::vector<half> expected;
+  for (int64 i = 0; i < x.size(); ++i) {
+    expected.push_back(compute_func(x[i]));
+  }
+
+  UnaryBuildFuncTy build_func = GetParam().build_func;
+  build_func(&builder, x_opnd);
+
+  ComputeAndCompareR1<half>(&builder, expected, {x_data.get()}, error_spec_);
+}
+
+half sign_imp(half value) {
+  const float x(std::move(value));
+  return half((x < .0) ? -1 : (x > .0));
+}
+
+half round_imp(half value) {
+  return half(round(static_cast<float>(std::move(value))));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    half, UnaryOpTest,
+    ::testing::Values(UnaryOpTestParam{[](half x) { return abs(x); },
+                                       &ComputationBuilder::Abs},
+                      UnaryOpTestParam{[](half x) { return round_imp(x); },
+                                       &ComputationBuilder::Round},
+                      UnaryOpTestParam{[](half x) { return ceil(x); },
+                                       &ComputationBuilder::Ceil},
+                      UnaryOpTestParam{[](half x) { return cos(x); },
+                                       &ComputationBuilder::Cos},
+                      UnaryOpTestParam{[](half x) { return exp(x); },
+                                       &ComputationBuilder::Exp},
+                      UnaryOpTestParam{[](half x) { return floor(x); },
+                                       &ComputationBuilder::Floor},
+                      UnaryOpTestParam{[](half x) { return log(x); },
+                                       &ComputationBuilder::Log},
+                      UnaryOpTestParam{[](half x) { return -x; },
+                                       &ComputationBuilder::Neg},
+                      UnaryOpTestParam{[](half x) { return sign_imp(x); },
+                                       &ComputationBuilder::Sign},
+                      UnaryOpTestParam{[](half x) { return sin(x); },
+                                       &ComputationBuilder::Sin},
+                      UnaryOpTestParam{[](half x) { return tanh(x); },
+                                       &ComputationBuilder::Tanh}
+
+                      ));
+
+struct UnaryPredTestParam {
+  std::function<bool(half)> compute_func;
+  UnaryBuildFuncTy build_func;
+};
+
+class UnaryPredTest : public HalfTestBase,
+                      public ::testing::WithParamInterface<UnaryPredTestParam> {
+};
+
+XLA_TEST_P(UnaryPredTest, Ops) {
+  std::vector<half> x({half(1.4), half(-2.3), half(3.2), half(-4.1)});
+  ComputationBuilder builder(client_, TestName());
+  ComputationDataHandle x_opnd;
+  auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
+                                        &builder, &x_opnd);
+
+  std::function<bool(half)> compute_func = GetParam().compute_func;
+  CHECK_EQ(kNumElements, x.size());
+  bool expected[kNumElements];
+  for (int64 i = 0; i < x.size(); ++i) {
+    expected[i] = compute_func(x[i]);
+  }
+
+  UnaryBuildFuncTy build_func = GetParam().build_func;
+  build_func(&builder, x_opnd);
+
+  ComputeAndCompareR1<bool>(&builder, expected, {x_data.get()});
+}
+
+INSTANTIATE_TEST_CASE_P(half, UnaryPredTest,
+                        ::testing::Values(UnaryPredTestParam{
+                            [](half x) { return isfinite(x); },
+                            &ComputationBuilder::IsFinite}));
+
+using BinaryBuildFuncTy = std::function<void(
+    ComputationBuilder*, const ComputationDataHandle& x,
+    const ComputationDataHandle& y, tensorflow::gtl::ArraySlice<int64>)>;
+
+struct BinaryOpTestParam {
+  std::function<half(half, half)> compute_func;
+  BinaryBuildFuncTy build_func;
+};
+
+class BinaryOpTest : public HalfTestBase,
+                     public ::testing::WithParamInterface<BinaryOpTestParam> {};
+
+XLA_TEST_P(BinaryOpTest, Ops) {
+  std::vector<half> x({half(1.0), half(2.0), half(3.0), half(-4.0)});
+  std::vector<half> y({half(0.4), half(-0.3), half(0.2), half(0.1)});
+  ComputationBuilder builder(client_, TestName());
+  ComputationDataHandle x_opnd;
+  auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
+                                        &builder, &x_opnd);
+
+  ComputationDataHandle y_opnd;
+  auto y_data = CreateR1Parameter<half>(y, /*parameter_number=*/1, "y",
+                                        &builder, &y_opnd);
+
+  std::function<half(half, half)> compute_func = GetParam().compute_func;
+  std::vector<half> expected;
+  for (int64 i = 0; i < x.size(); ++i) {
+    expected.push_back(compute_func(x[i], y[i]));
+  }
+
+  BinaryBuildFuncTy build_func = GetParam().build_func;
+  build_func(&builder, x_opnd, y_opnd, {});
+
+  ComputeAndCompareR1<half>(&builder, expected, {x_data.get(), y_data.get()},
+                            error_spec_);
+}
+
+half atan2_imp(half x, half y) {
+  return half(atan2(static_cast<float>(std::move(x)),
+                    static_cast<float>(std::move(y))));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    half, BinaryOpTest,
+    ::testing::Values(
+        BinaryOpTestParam{[](half x, half y) { return x + y; },
+                          &ComputationBuilder::Add},
+        BinaryOpTestParam{[](half x, half y) { return atan2_imp(x, y); },
+                          &ComputationBuilder::Atan2},
+        BinaryOpTestParam{[](half x, half y) { return x / y; },
+                          &ComputationBuilder::Div},
+        BinaryOpTestParam{[](half x, half y) { return max(x, y); },
+                          &ComputationBuilder::Max},
+        BinaryOpTestParam{[](half x, half y) { return min(x, y); },
+                          &ComputationBuilder::Min},
+        BinaryOpTestParam{[](half x, half y) { return x * y; },
+                          &ComputationBuilder::Mul},
+        BinaryOpTestParam{[](half x, half y) { return pow(x, y); },
+                          &ComputationBuilder::Pow},
+        BinaryOpTestParam{[](half x, half y) { return x - y; },
+                          &ComputationBuilder::Sub}
+
+        ));
+
+struct BinaryPredTestParam {
+  std::function<bool(half, half)> compute_func;
+  BinaryBuildFuncTy build_func;
+};
+
+class BinaryPredTest
+    : public HalfTestBase,
+      public ::testing::WithParamInterface<BinaryPredTestParam> {};
+
+XLA_TEST_P(BinaryPredTest, Ops) {
+  std::vector<half> x({half(1.0), half(2.0), half(0.2), half(-4.0)});
+  std::vector<half> y({half(0.4), half(-0.3), half(0.2), half(0.1)});
+  ComputationBuilder builder(client_, TestName());
+  ComputationDataHandle x_opnd;
+  auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
+                                        &builder, &x_opnd);
+
+  ComputationDataHandle y_opnd;
+  auto y_data = CreateR1Parameter<half>(y, /*parameter_number=*/1, "y",
+                                        &builder, &y_opnd);
+
+  std::function<bool(half, half)> compute_func = GetParam().compute_func;
+  CHECK_EQ(kNumElements, x.size());
+  bool expected[kNumElements];
+  for (int64 i = 0; i < x.size(); ++i) {
+    expected[i] = compute_func(x[i], y[i]);
+  }
+
+  BinaryBuildFuncTy build_func = GetParam().build_func;
+  build_func(&builder, x_opnd, y_opnd, {});
+
+  ComputeAndCompareR1<bool>(&builder, expected, {x_data.get(), y_data.get()});
+}
+
+INSTANTIATE_TEST_CASE_P(
+    half, BinaryPredTest,
+    ::testing::Values(BinaryPredTestParam{[](half x, half y) { return x == y; },
+                                          &ComputationBuilder::Eq},
+                      BinaryPredTestParam{[](half x, half y) { return x != y; },
+                                          &ComputationBuilder::Ne},
+                      BinaryPredTestParam{[](half x, half y) { return x >= y; },
+                                          &ComputationBuilder::Ge},
+                      BinaryPredTestParam{[](half x, half y) { return x > y; },
+                                          &ComputationBuilder::Gt},
+                      BinaryPredTestParam{[](half x, half y) { return x <= y; },
+                                          &ComputationBuilder::Le},
+                      BinaryPredTestParam{[](half x, half y) { return x < y; },
+                                          &ComputationBuilder::Lt}
+
+                      ));
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 975feaa050..f8205de702 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -301,6 +301,9 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
     case BF16:
       match = ExpectLiteralsEqual<bfloat16>(expected, actual, &multi_index, 0);
       break;
+    case F16:
+      match = ExpectLiteralsEqual<half>(expected, actual, &multi_index, 0);
+      break;
     case F32:
       match = ExpectLiteralsEqual<float>(expected, actual, &multi_index, 0);
       break;
@@ -380,6 +383,9 @@ class NearComparator {
       case BF16:
         ExpectLiteralsNear<bfloat16>(expected, actual, 0);
         break;
+      case F16:
+        ExpectLiteralsNear<half>(expected, actual, 0);
+        break;
       case F32:
         ExpectLiteralsNear<float>(expected, actual, 0);
         break;
@@ -572,6 +578,12 @@ bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
                           static_cast<float>(actual));
 }
 
+template <>
+bool NearComparator::ExpectValuesNear<half>(half expected, half actual) {
+  return ExpectValuesNear(static_cast<float>(std::move(expected)),
+                          static_cast<float>(std::move(actual)));
+}
+
 }  // namespace
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::Near(
-- 
GitLab


From 170246dadb4926fb35000bba8beffb73f2299968 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 14:56:16 -0800
Subject: [PATCH 113/966] Go: Update generated wrapper functions for TensorFlow
 ops.

PiperOrigin-RevId: 183146357
---
 tensorflow/go/op/wrappers.go | 132 +++++++++++++++++------------------
 1 file changed, 66 insertions(+), 66 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 7873201f3e..5b19c90238 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -5486,6 +5486,72 @@ func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_f
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// SummaryWriterAttr is an optional argument to SummaryWriter.
+type SummaryWriterAttr func(optionalAttr)
+
+// SummaryWriterSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func SummaryWriterSharedName(value string) SummaryWriterAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// SummaryWriterContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func SummaryWriterContainer(value string) SummaryWriterAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// Returns a handle to be used to access a summary writer.
+//
+// The summary writer is an in-graph resource which can be used by ops to write
+// summaries to event files.
+//
+// Returns the summary writer resource. Scalar handle.
+func SummaryWriter(scope *Scope, optional ...SummaryWriterAttr) (writer tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SummaryWriter",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes gradients for SparseSegmentMean.
+//
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
+//
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMeanGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Applies softmax to a batched N-D `SparseTensor`.
 //
 // The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
@@ -18597,72 +18663,6 @@ func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
-		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SummaryWriterAttr is an optional argument to SummaryWriter.
-type SummaryWriterAttr func(optionalAttr)
-
-// SummaryWriterSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func SummaryWriterSharedName(value string) SummaryWriterAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// SummaryWriterContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func SummaryWriterContainer(value string) SummaryWriterAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// Returns a handle to be used to access a summary writer.
-//
-// The summary writer is an in-graph resource which can be used by ops to write
-// summaries to event files.
-//
-// Returns the summary writer resource. Scalar handle.
-func SummaryWriter(scope *Scope, optional ...SummaryWriterAttr) (writer tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SummaryWriter",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
 type ResizeBicubicGradAttr func(optionalAttr)
 
-- 
GitLab


From b3294d74c73d8f01412ffeeac616d0b8a2f4e18e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 15:07:21 -0800
Subject: [PATCH 114/966] When a device list API such as TF_DeviceListName()
 succeeds, set the output status to OK.

PiperOrigin-RevId: 183148319
---
 tensorflow/c/c_api.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 92c9adbec7..3c7f041b39 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -927,6 +927,7 @@ int TF_DeviceListCount(const TF_DeviceList* list) {
       status->status = InvalidArgument("index out of bounds");            \
       return err_val;                                                     \
     }                                                                     \
+    status->status = Status::OK();                                        \
     return list->response[index].accessor;                                \
   }
 
-- 
GitLab


From 2d78a128d67b7469e0da1143fab51d96ab9aa3aa Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Wed, 24 Jan 2018 15:10:19 -0800
Subject: [PATCH 115/966] Parameterized docker build now supports a local pip
 whl file path.

PiperOrigin-RevId: 183148798
---
 .../docker/parameterized_docker_build.sh      | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index fa867b65db..b4fba5b8f5 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -34,6 +34,11 @@
 #     If set to a non-empty string, will use it as the URL from which the
 #     pip wheel file will be downloaded (instead of building the pip locally).
 #
+#   TF_DOCKER_BUILD_CENTRAL_PIP_IS_LOCAL
+#     (Optional)
+#     If set to a non-empty string, we will treat TF_DOCKER_BUILD_CENTRAL_PIP
+#     as a path rather than a url.
+#
 #   TF_DOCKER_BUILD_IMAGE_NAME:
 #     (Optional)
 #     If set to any non-empty value, will use it as the image of the
@@ -234,6 +239,32 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
 "COPY ${PIP_WHL} /\n"\
 "RUN pip --no-cache-dir install /${PIP_WHL}" "${ORIG_DOCKERFILE}" \
     > "${DOCKERFILE}"
+
+  # Build from a local whl file path rather than an URL
+  elif [[ ! -z "${TF_DOCKER_BUILD_CENTRAL_PIP_IS_LOCAL}" ]]; then
+    PIP_WHL="${TF_DOCKER_BUILD_CENTRAL_PIP}"
+    if [[ -z "${PIP_WHL}" ]]; then
+      die "ERROR: Cannot locate the specified pip whl file"
+    fi
+    echo "Specified PIP whl file is at: ${PIP_WHL}"
+
+    # Copy the pip file to tmp directory
+    cp "${PIP_WHL}" "${TMP_DIR}/" || \
+        die "ERROR: Failed to copy wheel file: ${PIP_WHL}"
+
+    # Use string replacement to put the correct file name into the Dockerfile
+    PIP_WHL=$(basename "${PIP_WHL}")
+
+    # Modify the non-devel Dockerfile to point to the correct pip whl file
+    # location
+    sed -e "/# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/,"\
+"/# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/c"\
+"COPY ${PIP_WHL} /\n"\
+"RUN pip --no-cache-dir install /${PIP_WHL}" "${ORIG_DOCKERFILE}" \
+    > "${DOCKERFILE}"
+    echo "Using local pip wheel from: ${TF_DOCKER_BUILD_CENTRAL_PIP}"
+    echo
+
   else
     echo "Downloading pip wheel from: ${TF_DOCKER_BUILD_CENTRAL_PIP}"
     echo
-- 
GitLab


From a5eaaf5996fc8a802e8d15de95447c849067c94f Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Wed, 24 Jan 2018 15:10:28 -0800
Subject: [PATCH 116/966] Modify dynamic_rnn to use max_seq_length for its
 while_loop.

This is especially important with XLA compilation, which will usually have
a maximum time dimension on the input that is (sometimes much larger than)
max(sequence_length).  In these cases, a lot of compute can be saved.

PiperOrigin-RevId: 183148831
---
 tensorflow/python/ops/rnn.py | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index a079b4485f..a1008f1c83 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -35,7 +35,6 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
@@ -726,6 +725,8 @@ def _dynamic_rnn_loop(cell,
   if sequence_length is not None:
     min_sequence_length = math_ops.reduce_min(sequence_length)
     max_sequence_length = math_ops.reduce_max(sequence_length)
+  else:
+    max_sequence_length = time_steps
 
   time = array_ops.constant(0, dtype=dtypes.int32, name="time")
 
@@ -810,28 +811,18 @@ def _dynamic_rnn_loop(cell,
 
     return (time + 1, output_ta_t, new_state)
 
-  # TODO(pbar) `loop_bound` can be reduced to `max_sequence_length` once
-  # TensorArray shape inference is working.  When sequence lengths are highly
-  # variable, this will reduce the performance overheads of padding to a fixed
-  # maximum length.
-  loop_bound = time_steps
-
-  # This is a workaround since we cannot currently use maximum_iterations if
-  # time_steps is defined inside control flow, see the comment in
-  # control_flow_ops.py.
-  if (context.in_eager_mode() or
-      not (control_flow_util.IsInWhileLoop(time_steps.op) or
-           control_flow_util.IsInCond(time_steps.op))):
-    maximum_iterations = time_steps
+  if in_graph_mode:
+    loop_bound = max_sequence_length
   else:
-    maximum_iterations = None
+    # Using max_sequence_length isn't currently supported in the Eager branch.
+    loop_bound = time_steps
 
   _, output_final_ta, final_state = control_flow_ops.while_loop(
       cond=lambda time, *_: time < loop_bound,
       body=_time_step,
       loop_vars=(time, output_ta, state),
       parallel_iterations=parallel_iterations,
-      maximum_iterations=maximum_iterations,
+      maximum_iterations=time_steps,
       swap_memory=swap_memory)
 
   # Unpack final output if not using output tuples.
-- 
GitLab


From 6908cc233c679b8fe61d99a30d3828362caf47be Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Wed, 24 Jan 2018 15:20:10 -0800
Subject: [PATCH 117/966] [ Fixes for PR comments]   - removed commented code  
 - moved all code to tensorflow namespace   - removed a dangling config
 parameter from tensorflow/BUILD   - fixed Copyright notice years

---
 tensorflow/BUILD                              |   8 -
 tensorflow/contrib/tensorrt/__init__.py       |   2 +-
 .../contrib/tensorrt/convert/convert_graph.cc |  52 +++----
 .../contrib/tensorrt/convert/convert_graph.h  |   5 +-
 .../contrib/tensorrt/convert/convert_nodes.cc | 143 ++----------------
 .../contrib/tensorrt/convert/convert_nodes.h  |   5 +-
 .../contrib/tensorrt/kernels/trt_engine_op.cc |  28 +---
 .../contrib/tensorrt/kernels/trt_engine_op.h  |   4 +-
 tensorflow/contrib/tensorrt/log/trt_logger.cc |   2 +-
 tensorflow/contrib/tensorrt/log/trt_logger.h  |   2 +-
 .../contrib/tensorrt/ops/trt_engine_op.cc     |   6 +-
 .../tensorrt/python/ops/trt_engine_op.py      |   2 +-
 .../contrib/tensorrt/python/trt_convert.py    |  41 ++---
 .../contrib/tensorrt/segment/segment.cc       |   5 +-
 tensorflow/contrib/tensorrt/segment/segment.h |   5 +-
 .../contrib/tensorrt/segment/segment_test.cc  |   2 +-
 .../contrib/tensorrt/segment/union_find.h     |   5 +-
 .../contrib/tensorrt/shape_fn/trt_shfn.cc     |   2 +-
 .../contrib/tensorrt/shape_fn/trt_shfn.h      |   2 +-
 tensorflow/contrib/tensorrt/trt_conversion.i  |   2 +-
 third_party/tensorrt/BUILD.tpl                |   4 -
 21 files changed, 78 insertions(+), 249 deletions(-)

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index b374462d32..da37564697 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -358,14 +358,6 @@ config_setting(
     },
 )
 
-config_setting(
-    name = "using_tensorrt",
-    define_values = {
-        "using_tensorrt":"true",
-    },
-    visibility = ["//visibility:public"],
-)
-
 config_setting(
     name = "with_mpi_support",
     values = {"define": "with_mpi_support=true"},
diff --git a/tensorflow/contrib/tensorrt/__init__.py b/tensorflow/contrib/tensorrt/__init__.py
index 0d69ffe466..5072ab1196 100644
--- a/tensorflow/contrib/tensorrt/__init__.py
+++ b/tensorflow/contrib/tensorrt/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index e90790716c..9a1815d6b9 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -42,7 +42,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h" 
 #include "tensorflow/core/grappler/devices.h"
-//#include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -51,24 +50,28 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 
 //------------------------------------------------------------------------------
+namespace tensorflow {
 namespace tensorrt {
 namespace convert {
-
 namespace {
 
 static std::unordered_set<std::string> output_nodes;
 bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
+// LINT.IfChange
   static const std::set<std::string> candidate_ops = {
       "Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu",
       "Add",      "Mul",   "Sub",    "Rsqrt",   "Pad"  // "Placeholder" ,"Mean"
                                                        // TODO(ben,jie): ...
   };
+// LINT.ThenChange(
+//    https://www.tensorflow.org/code/tensorflow/contrib/tensorrt/convert/convert_nodes.h)
+
   if (output_nodes.count(node_def.name())) return false;
   return candidate_ops.count(node_def.op());
 }
 
-void GetSubGraphIncomingEdges(tensorflow::Graph const& graph,
-                              std::set<int> const& subgraph_node_ids,
+void GetSubGraphIncomingEdges(const tensorflow::Graph & graph,
+                              const std::set<int> &subgraph_node_ids,
                               tensorflow::EdgeSet* incoming_edges) {
   for (int node_id : subgraph_node_ids) {
     tensorflow::Node const* node = graph.FindNodeId(node_id);
@@ -83,8 +86,8 @@ void GetSubGraphIncomingEdges(tensorflow::Graph const& graph,
   }
 }
 
-void GetSubGraphOutgoingEdges(tensorflow::Graph const& graph,
-                              std::set<int> const& subgraph_node_ids,
+void GetSubGraphOutgoingEdges(const tensorflow::Graph &graph,
+                              const std::set<int> &subgraph_node_ids,
                               tensorflow::EdgeSet* outgoing_edges) {
   for (int node_id : subgraph_node_ids) {
     tensorflow::Node const* node = graph.FindNodeId(node_id);
@@ -123,7 +126,9 @@ std::unordered_map<std::string, std::vector<int>> BuildTensorNameMap(
 
 tensorflow::Status ConvertSubGraphToTensorRT(
     tensorflow::Graph& graph, const std::vector<std::string>& output_names,
-    const std::set<int>& subgraph_node_ids, size_t max_batch_size,
+    const std::set<int>& subgraph_node_ids,
+    size_t max_batch_size, // max batch size that engine will be created for
+    // max amount of memory that engine will be allowed to consume, in bytes
     size_t max_workspace_size,
     const tensorflow::grappler::GraphProperties& graph_properties) {
   tensorflow::EdgeSet subgraph_incoming_edges;
@@ -139,7 +144,6 @@ tensorflow::Status ConvertSubGraphToTensorRT(
   std::set<std::pair<int, int>> subgraph_outputs_set;
   // Collect outputs referenced from output_names
   auto output_name_to_index_map = BuildTensorNameMap(output_names);
-  // for (int node_id : subgraph_node_ids_no_placeholder) {
   for (int node_id : subgraph_node_ids) {
     tensorflow::Node* node = graph.FindNodeId(node_id);
     if (output_name_to_index_map.count(node->name())) {
@@ -150,8 +154,6 @@ tensorflow::Status ConvertSubGraphToTensorRT(
   }
   // Collect outputs referenced from outgoing edges
   tensorflow::EdgeSet subgraph_outgoing_edges;
-  // GetSubGraphOutgoingEdges(graph, subgraph_node_ids_no_placeholder,
-  //  &subgraph_outgoing_edges);
   GetSubGraphOutgoingEdges(graph, subgraph_node_ids, &subgraph_outgoing_edges);
   for (tensorflow::Edge const* edge : subgraph_outgoing_edges) {
     subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
@@ -233,9 +235,6 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
   LOG(DEBUG) << "cpu_cores: " << num_cpu_cores;
   LOG(DEBUG) << "gpus: " << num_gpus;
-  // int timeout_s = 60 * 10;
-  // gCluster = new tensorflow::grappler::SingleMachine(
-  //                  timeout_s, num_cpu_cores, num_gpus);
 
   tensorflow::Status status = optimizer.Optimize(gCluster, item, &gdef);
 
@@ -252,19 +251,6 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   // AJ refactoring shape inference through grappler/GraphProperties.
   tensorflow::grappler::GraphProperties static_graph_properties(item);
   static_graph_properties.InferStatically(false);
-  // TF_CHECK_OK(static_graph_prop.InferStatically(false));
-  // ShapeMap shape_map;
-  // TF_RETURN_IF_ERROR(
-  //     tensorflow::trt::inferShapes(gdef, output_names, shape_map));
-  // std::stringstream oss;
-  // for (auto& n : shape_map) {  // nodes
-  //   oss << " Node= " << n.first << ", ";
-  //   for (auto o : n.second) {  // outputs
-  //     oss << o.first.DebugString() << " T= " << o.second << ", ";
-  //   }
-  //   LOG(DEBUG) << oss.str();
-  //   oss.str("");
-  // }
 
   // Build full graph
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
@@ -274,26 +260,23 @@ tensorflow::Status ConvertGraphDefToTensorRT(
       tensorflow::GraphConstructorOptions(), gdef, &graph));
 
   // Segment the graph into subgraphs that can be converted to TensorRT
-  tensorrt::segment::SegmentOptions segment_options;
+  tensorflow::tensorrt::segment::SegmentOptions segment_options;
   // TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
   for (auto node : output_names) output_nodes.insert(node);
 
   // TODO(sami): this should be passed as a knob!!!!
   segment_options.minimum_segment_size = 2;
-  tensorrt::segment::SegmentNodesVector segments;
+  tensorflow::tensorrt::segment::SegmentNodesVector segments;
   TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
       gdef, IsTensorRTCandidate, segment_options, &segments));
   if (segments.size() > 1) {
-    // LOG(WARNING) << "Multiple TensorRT candidate subgraphs were found, "
-    //<< "but only the first can be converted.";
-    // segments.erase(++segments.begin(), segments.end());
     LOG(INFO) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
   }
   std::unordered_map<std::string, tensorflow::Node*> node_map;
   TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
-  for (std::set<std::string> const& subgraph_node_names : segments) {
+  for (const std::set<std::string> &subgraph_node_names : segments) {
     std::set<int> subgraph_node_ids;
-    for (std::string const& node_name : subgraph_node_names) {
+    for (const std::string &node_name : subgraph_node_names) {
       subgraph_node_ids.insert(node_map.at(node_name)->id());
     }
     TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
@@ -306,3 +289,4 @@ tensorflow::Status ConvertGraphDefToTensorRT(
 
 }  // namespace convert
 }  // namespace tensorrt
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index cd713de888..7695f66c3d 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
+namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
@@ -30,5 +31,5 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     size_t max_workspace_size, tensorflow::GraphDef* new_graph_def);
 }
 }  // namespace tensorrt
-
+}  // namespace tensorrt
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 6c77cdc0b6..c22d8bcd04 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -45,6 +45,7 @@ limitations under the License.
 //  would work!
 #define CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)
 //------------------------------------------------------------------------------
+namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
@@ -121,12 +122,12 @@ static std::vector<std::pair<int, int>> createSamePadding(
   CHECK_EQ((size_t)stride.nbDims, inputDims.size());  // TODO(jie): N+C? NC+?
 
   for (size_t i = 0; i < inputDims.size(); ++i) {
-    /* formula to calculate the padding */
+    // formula to calculate the padding
     int p = ((inputDims[i] - 1) / stride.d[i]) * stride.d[i] + kernel.d[i] -
             inputDims[i];
     p = (p > 0) ? p : 0;
 
-    /* right precedence padding, like in TensorFlow */
+    // right precedence padding, like in TensorFlow
     int left = p / 2;
     int right = p - left;
 
@@ -138,7 +139,6 @@ static std::vector<std::pair<int, int>> createSamePadding(
   return padding;
 }
 
-// class TRT_ShapedWeights : public nvinfer1::Weights {
 class TRT_ShapedWeights {
  public:
   nvinfer1::Dims shape_;
@@ -271,21 +271,6 @@ class TFAttrs {
     return _attrs.count(key) ? this->get<T>(key) : default_value;
   }
 };
-// template <>
-// float TFAttrs::get<float>(std::string key) const {
-//  return this->at(key)->f();
-//}
-
-// template <>
-// int TFAttrs::get<int>(std::string key) const {
-//  return (int)this->at(key)->i();
-//}
-
-// template <>
-// bool TFAttrs::get<bool>(std::string key) const {
-//  auto value = this->at(key)->i();
-//  return bool(value);
-//}
 
 template <>
 std::string TFAttrs::get<std::string>(std::string key) const {
@@ -305,43 +290,14 @@ nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(std::string key) const {
   // Note: No dimension type information is included
   return dims;
 }
-// template <>
-// nvinfer1::DimsHW TFAttrs::get<nvinfer1::DimsHW>(std::string key) const {
-//  nvinfer1::Dims dims = this->get<nvinfer1::Dims>(key);
-//  CHECK_EQ(dims.nbDims, 2);
-//  return nvinfer1::DimsHW(dims.d[0], dims.d[1]);
-//}
-// template <>
-// nvinfer1::Permutation TFAttrs::get<nvinfer1::Permutation>(
-//    std::string key) const {
-//  auto values = this->get<std::vector<int>>(key);
-//  nvinfer1::Permutation perm;
-//  std::copy(values.begin(), values.end(), perm.order);
-//  // Fill unused values with -1 to aid debugging
-//  std::fill(perm.order + values.size(), perm.order + nvinfer1::Dims::MAX_DIMS,
-//            -1);
-//  return perm;
-//}
-// template <>
-// nvinfer1::Dims TFAttrs::getShape<nvinfer1::Dims>(std::string key) const {
-//  auto attr = this->at(key)->shape();
-//  nvinfer1::Dims dims;
-//  dims.nbDims = attr.dim_size();
-//  for (int i = 0; i < dims.nbDims; i++) dims.d[i] = attr.dim(i).size();
-//  return dims;
-//}
-// template<> TRT_ShapedWeights TFAttrs::get<TRT_ShapedWeights>(std::string key)
-// const {
-//  tensorflow::TensorProto const* tf_weights_tensor = &this->at(key)->tensor();
-// TODO(jie): Implement this
-//  return convert_tf_weights(tf_weights_tensor);
-//}
+
 template <>
 nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(std::string key) const {
   nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
   TF_CHECK_OK(convert_dtype(this->at(key)->type(), &trt_dtype));
   return trt_dtype;
 }
+
 template <>
 tensorflow::DataType TFAttrs::get<tensorflow::DataType>(std::string key) const {
   return this->at(key)->type();
@@ -376,7 +332,6 @@ void reorder_rsck_to_kcrs(TRT_ShapedWeights const& iweights,
   oweights->shape_.d[1] = c;
   oweights->shape_.d[2] = r;
   oweights->shape_.d[3] = s;
-  // nvinfer1::DimsNCHW istrides = {1, s, c*r*s, r*s};
   nvinfer1::DimsNCHW istrides = {1, k, s * k * c, c * k};
   nvinfer1::DimsNCHW ostrides = {c * r * s, r * s, s, 1};
   switch (iweights.type_) {
@@ -390,12 +345,6 @@ void reorder_rsck_to_kcrs(TRT_ShapedWeights const& iweights,
   }
 }
 
-/* not used. clean up needed.
-nvinfer1::Weights make_dummy_weights(nvinfer1::DataType
-dtype=nvinfer1::DataType::kFLOAT) { nvinfer1::Weights w; w.count  = 0; w.values
-= nullptr; w.type   = dtype; return w;
-}
-*/
 
 struct InferDeleter {
   template <typename T>
@@ -521,11 +470,11 @@ class Converter {
   }
 };
 
-/*******************************************************************************
-  Constant folding functions
-  TODO(jie): once optimizer kicks in, we should have done constant folding
-there.
-*******************************************************************************/
+// ****************************************************************************
+//  Constant folding functions
+//  TODO(jie): once optimizer kicks in, we should have done constant folding
+//    there.
+//*****************************************************************************/
 struct LambdaFactory {
   enum class OP_CATEGORY : int { RSQRT = 0, NEG, ADD, MUL, SUB };
   OP_CATEGORY op;
@@ -812,21 +761,6 @@ tensorflow::Status BinaryTensorOpWeight(
   // default to channel-wise
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
-  /*
-  if (weights.count() == 1) {
-    LOG(DEBUG) << "UNIFORM";
-    scale_mode = nvinfer1::ScaleMode::kUNIFORM;
-  } else if (dims_w.nbDims == 1) {
-    // TODO(jie): should we check for implicit chennel wise binary op
-    //   where weights has shape 1x1xC?
-    LOG(DEBUG) << "CHANNEL";
-    scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-  } else {
-    // TODO(jie): check weight shape.
-    // broadcast is not fully supported
-    LOG(DEBUG) << "ELEMENTWISE";
-    scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-  } */
 
   if (weights.count() == 1) {
     LOG(DEBUG) << "UNIFORM";
@@ -856,21 +790,6 @@ tensorflow::Status BinaryTensorOpWeight(
     }
   }
 
-  // transpose last dimension
-  /*
-  std::vector<int> permutation(dims_t.nbDims + 1);
-  if (scale_mode == nvinfer1::ScaleMode::kCHANNEL && dims_t.nbDims > 1) {
-    // we swap the last dimension into channel for trt.
-    // because of tensorflow default broadcasting rules.
-    for (int i = 0; i < static_cast<int>(permutation.size()); i++) {
-      permutation[i] = i;
-    }
-    permutation[1] = dims_t.nbDims;
-    permutation[dims_t.nbDims] = 1;
-    tensor = ctx.transposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
-                                 permutation);
-  }
-  */
 
   // prepare weights
   TRT_ShapedWeights shiftWeights(weights.type_);
@@ -898,12 +817,6 @@ tensorflow::Status BinaryTensorOpWeight(
       scaleWeights, powerWeights);
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  // transpose back dimension
-  /*
-  if (scale_mode == nvinfer1::ScaleMode::kCHANNEL && dims_t.nbDims > 1) {
-    output_tensor = ctx.transposeTensor(output_tensor, permutation);
-  }
-  */
 
   // pass the output
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
@@ -978,7 +891,6 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
                                  std::vector<TRT_TensorOrWeights> const& inputs,
                                  std::vector<TRT_TensorOrWeights>* outputs) {
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
-  // nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   // TODO(jie): handle NHWC/NCHW transpose;
   TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
   TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
@@ -1020,16 +932,12 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
                                 {static_cast<int>(tensor_dim.d[1]),
                                  static_cast<int>(tensor_dim.d[2])});
   } else {
-    // return tensorflow::errors::Unimplemented(
-    //          "Current Conv2D cannot support padding other than SAME");
     padding = {{0, 0}, {0, 0}};
   }
 
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
-    // return tensorflow::errors::Unimplemented(
-    //         "Asymmetric padding not implemented yet");
     LOG(DEBUG) << "padding!!!: " << padding[0].first << padding[0].second
                                  << padding[1].first << padding[1].second;
 
@@ -1125,8 +1033,6 @@ tensorflow::Status ConvertPool(Converter& ctx,
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
-    // return tensorflow::errors::Unimplemented(
-    //          "Asymmetric padding not implemented yet");
     LOG(DEBUG) << "padding!!!: " << padding[0].first << padding[0].second << padding[1].first << padding[1].second;
     auto padLayer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
@@ -1176,11 +1082,9 @@ tensorflow::Status ConvertScale(Converter& ctx,
         "only supports tensor op weight for now, at " + node_def.name());
   // implement tensor binaryOp weight [channel wise] for now;
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
-  // nvinfer1::ITensor* tensor = inputs.at(0).tensor();
 
   // TODO(jie): handle NHWC/NCHW transpose;
   TRT_ShapedWeights weights = inputs.at(1).weights();
-  // nvinfer1::Weights empty_weights{weights.type, nullptr, 0};
   TRT_ShapedWeights empty_weights(weights.type_);
 
   TFAttrs attrs(node_def);
@@ -1246,12 +1150,6 @@ tensorflow::Status ConvertConst(Converter& ctx,
       weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
                                   scalar_shape);
     }
-    // LOG(INFO) << " add: " << weights_tensor.float_val().data();
-    // LOG(INFO) << " value: " << (*weights_tensor.float_val().data());
-
-    // weights = ctx.get_temp_weights(dtype, scalar_shape);
-    // std::memcpy(const_cast<void*>(weights.values),
-    //           weights_tensor.float_val().data(), weights.size_bytes());
   } else if (!weights_tensor.tensor_content().empty()) {
     LOG(DEBUG) << "TENSOR!!!" << node_def.name();
     weights = TRT_ShapedWeights(dtype, weights_tensor.tensor_content().data(),
@@ -1337,22 +1235,15 @@ tensorflow::Status ConvertReduce(Converter& ctx,
 
   TFAttrs attrs(node_def);
   // TODO(jie): handle data type
-  // auto data_type = attrs.get<nvinfer1::DataType>("T");
   // index type here is done through TF type
   //   so I can leverage their EnumToDataType for my cast
   auto index_type = attrs.get<tensorflow::DataType>("Tidx");
-  // auto keep_dims_flag = attrs.get<bool>("keep_dims");
 
   // Only expect to handle INT32 as attributes for now
   if (index_type != tensorflow::DataType::DT_INT32)
     return tensorflow::errors::Unimplemented("Tidx supports only DT_INT32");
-  // auto pad_data = const_cast<tensorflow::EnumToDataType<padding_type>::Type*>
-  //                  (pads.values);
   auto index_list_data =
       static_cast<int*>(const_cast<void*>(index_list.values_));
-  // auto index_list_data =
-  //       const_cast<tensorflow::EnumToDataType<index_type>::Type*>
-  //         (index_list.values);
 
   // hack warning:
   //   have to fall back to pool layer since reduce is not in public TRT yet.
@@ -1440,7 +1331,6 @@ tensorflow::Status ConvertPad(Converter& ctx,
   //   so I can leverage their EnumToDataType for my cast
   auto padding_type = attrs.get<tensorflow::DataType>("Tpaddings");
   // TODO(jie): handle data type conversion for TRT?
-  // auto data_type = attrs.get<nvinfer1::DataType>("T");
 
   if (pads.shape_.d[0] != nbDims || pads.shape_.d[1] != 2)
     return tensorflow::errors::InvalidArgument(
@@ -1451,8 +1341,6 @@ tensorflow::Status ConvertPad(Converter& ctx,
   if (padding_type != tensorflow::DataType::DT_INT32)
     return tensorflow::errors::Unimplemented(
         "Tpaddings supports only DT_INT32");
-  // auto pad_data = const_cast<tensorflow::EnumToDataType<padding_type>::Type*>
-  //                  (pads.values);
   auto pad_data = static_cast<int*>(const_cast<void*>(pads.values_));
 
   std::vector<int32_t> pad_index;
@@ -1560,7 +1448,6 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   std::list<tensorflow::Node*> order;
   for (tensorflow::Node* node : order_vec) {
     if (subgraph_node_ids.count(node->id())) {
-      // order.push_back(node);
       order.push_front(node);  // we want topological order to contstruct the
                                // network layer by layer
     }
@@ -1568,10 +1455,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   // topological order is needed to build TRT network
   LOG(DEBUG) << "BUILDING 1";
 
-  //  nvinfer1::ILogger::Severity verbosity =
-  //  nvinfer1::ILogger::Severity::kWARNING;
   tensorflow::tensorrt::Logger trt_logger;
-  //  TRT_Logger trt_logger(verbosity);
 
   LOG(DEBUG) << "BUILDING 2";
 
@@ -1605,7 +1489,6 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     auto node_name = node->name();
     input_names.push_back(node_name);  // insert original node name without port
     // TODO(jie): alternative :)
-    // tensorflow::DataType tf_dtype = node->output_type(output_idx);
     if (!graph_properties.HasOutputProperties(node_name))
       return tensorflow::errors::Internal("failed to find input node: " +
                                           node_name);
@@ -1719,9 +1602,6 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     engine_plan_string = std::move(
         std::string(engine_plan_data, engine_plan_data + engine_plan->size()));
   }
-  // std::ofstream engine_out("mini.engine");
-  // engine_out << engine_plan_string;
-  // engine_out.close();
 
   LOG(INFO) << "finished engine";
 
@@ -1759,3 +1639,4 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 
 }  // namespace convert
 }  // namespace tensorrt
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index dc59c37892..9446dd5dcb 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 
+namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
@@ -39,5 +40,5 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     tensorflow::NodeDef* trt_node);
 }  // namespace convert
 }  // namespace tensorrt
-
+}  // namespace tensorflow
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index a1524a592a..bef508d996 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor.h"
-// Use TF logging f
 
 
 namespace tensorflow {
@@ -29,9 +28,6 @@ using namespace nvinfer1;
 namespace tensorrt {
 
 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
-  // char *gieModelStream{nullptr};
-  // size_t size{0};
-
   // read serialized_engine
   std::string serialized_engine;
   OP_REQUIRES_OK(context,
@@ -48,7 +44,7 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   trt_engine_ptr_.reset(infer->deserializeCudaEngine(
       serialized_engine.c_str(), serialized_engine.size(), nullptr));
 
-  trt_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
+  trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
   // runtime is safe to delete after engine creation
   infer->destroy();
   std::stringstream oss;
@@ -83,8 +79,6 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
     }
   }
 
-  // CHECK_NE(cudaStreamCreate(&stream_),0); // logic here is wrong
-  // cudaStreamCreate(&stream_);
 }
 
 void TRTEngineOp::Compute(OpKernelContext* context) {
@@ -107,20 +101,16 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
       valid = false;
       break;
     }
-    // int64 input_shape.dim_size(int d)
-    // int input_shape.dims()
     switch (trt_engine_ptr_->getBindingDataType(bindingIndex)) {
       case nvinfer1::DataType::kFLOAT:
         LOG(INFO) << "float";
         buffers[bindingIndex] = (void*)(input_tensor.flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
-        LOG(INFO) << "half";
-        // buffers[bindingIndex] = (void*)input_tensor.flat<float16>().data();
+        LOG(FATAL) << "half size is not supported yet!";
         break;
       case nvinfer1::DataType::kINT8:
-        LOG(INFO) << "int8";
-        // buffers[bindingIndex] = (void*)input_tensor.flat<int8>().data();
+        LOG(FATAL) << "int8 is not supported yet!";
         break;
     }
   }
@@ -149,8 +139,6 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
 
     OP_REQUIRES_OK(context,
                    context->allocate_output(i, output_shape, &output_tensor));
-    // buffers[bindingIndex] = (void*)output_tensor->flat<float>();
-    // buffers[bindingIndex] = output_tensor->flat<float>().data();
     switch (trt_engine_ptr_->getBindingDataType(bindingIndex)) {
       case nvinfer1::DataType::kFLOAT:
         LOG(INFO) << "float";
@@ -158,12 +146,10 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
             reinterpret_cast<void*>(output_tensor->flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
-        LOG(INFO) << "half";
-        // buffers[bindingIndex] = (void*)output_tensor->flat<float16>().data();
+        LOG(FATAL) << "half size is not supported yet!";
         break;
       case nvinfer1::DataType::kINT8:
-        LOG(INFO) << "int8";
-        // buffers[bindingIndex] = (void*)output_tensor->flat<int8>().data();
+        LOG(FATAL) << "int8 is not supported yet!";
         break;
     }
   }
@@ -174,7 +160,7 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
                                                 ->implementation()
                                                 ->CudaStreamMemberHack()));
 
-  trt_context_ptr_->enqueue(nbBatch, &buffers[0], *stream, nullptr);
+  trt_execution_context_ptr_->enqueue(nbBatch, &buffers[0], *stream, nullptr);
   cudaStreamSynchronize(*stream);
 }
 
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index 631fc114f2..2f2d453dda 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -43,7 +43,7 @@ class TRTEngineOp : public OpKernel {
   using destroyed_ptr = std::unique_ptr<T, Destroyer<T>>;
   destroyed_ptr<nvinfer1::ICudaEngine> trt_engine_ptr_;
   // TODO(samikama) context should go to a resource manager!
-  destroyed_ptr<nvinfer1::IExecutionContext> trt_context_ptr_;
+  destroyed_ptr<nvinfer1::IExecutionContext> trt_execution_context_ptr_;
   std::vector<string> input_nodes_;
   std::vector<string> output_nodes_;
 };
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index 545a4aac50..fc3d778002 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index 10a78b7a1d..3a3a29516a 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -1,5 +1,5 @@
 // -*- c++ -*-
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
index 38d3707190..7139ff9618 100644
--- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -28,8 +28,8 @@ REGISTER_OP("TRTEngineOp")
     .Attr("serialized_engine: string")
     .Attr("input_nodes: list(string)")
     .Attr("output_nodes: list(string)")
-    .Attr("InT: list({int8, float16, float32})")
-    .Attr("OutT: list({int8, float16, float32})")
+    .Attr("InT: list({float32})")
+    .Attr("OutT: list({float32})")
     .Input("in_tensor: InT")
     .Output("out_tensor: OutT")
     .SetShapeFn(shape_inference::TRTEngineOpShapeInference);
diff --git a/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py b/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
index ce78d328de..c4ab9b89ea 100644
--- a/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
+++ b/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 521ccdce42..4f4dcf65aa 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -36,7 +36,7 @@ def CreateInferenceGraph(input_graph_def, outputs,max_batch_size=1,max_workspace
 
   Args:
     input_graph_def: GraphDef object containing a model to be transformed.
-    outputs: List of node names for the model outputs.
+    outputs: List of tensors or node names for the model outputs.
     max_batch_size: max size for the input batch
     max_workspace_size: parameter to control memory allocation (in Bytes)
 
@@ -44,36 +44,21 @@ def CreateInferenceGraph(input_graph_def, outputs,max_batch_size=1,max_workspace
     New GraphDef with TRTEngineOps placed in graph replacing subgraphs.
   """
 
-  # with errors.raise_exception_on_not_ok_status() as status:
-  #   output_graph_def_string = trt_convert(
-  #       input_graph_def_string,outputs,
-  #       max_batch_size,max_workspace_size, status)
-  # g = tf.Graph()
-  # with g.as_default():
-  #   tf.import_graph_def(input_graph_def, name="")
-  # rewriter_config = rewriter_config_pb2.RewriterConfig()
-  # rewriter_config.optimizers.append('layout')
-  # rewriter_config.optimizers.append('constfold')
-
-  # # mark output nodes as fetch
-  # train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
-  # for node_name in outputs:
-  #   out_node = g.get_operation_by_name(node_name)
-  #   for i in range(0,len(out_node.outputs)):
-  #     train_op.append(out_node.outputs[0])
-
-  # # constant folding
-  # mg = meta_graph.create_meta_graph_def(graph=g)
-  # meta_graph.add_collection_def(mg, ops.GraphKeys.TRAIN_OP)
-  # optimized_graph_def_str = \
-  #   tf_optimizer.OptimizeGraph(rewriter_config, mg).SerializeToString()
+  out_names=[]
+  for i in outputs:
+    if isinstance(i,ops.Tensor):
+      out_names.append(i.name)
+    else:
+      out_names.append(i)
+      
   input_graph_def_str= \
     input_graph_def.SerializeToString()
 
   # TODO(sami): Fix this when we can return status from C++ library
-  # There is a problem with the TF internal library setup that doesn't allow us to return a status object from C++.
-  # Thus we return a  pair or strings where first one is encoded status and the second one is the
-  # transformed graphs protobuf string.
+  # There is a problem with the TF internal library setup that doesn't
+  # allow us to return a status object from C++.  Thus we return a
+  # pair or strings where first one is encoded status and the second
+  # one is the transformed graphs protobuf string.
   out = trt_convert(
       input_graph_def_str ,outputs,
       max_batch_size,max_workspace_size)
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 41da528247..967e29df76 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 
 //------------------------------------------------------------------------------
+namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 
@@ -227,7 +228,6 @@ tensorflow::Status SegmentGraph(
         graph.RemoveNode(node);
       }
     }
-    // tensorflow::DumpGraph("Post-Segment", &graph);
   }
 
   // Convert the segments into the expected return format
@@ -257,3 +257,4 @@ tensorflow::Status SegmentGraph(
 
 }  // namespace segment
 }  // namespace tensorrt
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index b5aee5bc34..d64c325b9e 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
+namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 
@@ -49,5 +50,5 @@ tensorflow::Status SegmentGraph(
 
 }  // namespace segment
 }  // namespace tensorrt
-
+}  // namespace tensorflow
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index dcd0c71ed7..cded3b6b61 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/tensorrt/segment/union_find.h b/tensorflow/contrib/tensorrt/segment/union_find.h
index 8ae877cd05..0d3cf62cdf 100644
--- a/tensorflow/contrib/tensorrt/segment/union_find.h
+++ b/tensorflow/contrib/tensorrt/segment/union_find.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
 #define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
 
+namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 
@@ -73,5 +74,5 @@ UnionFind<T>* UnionFind<T>::FindRoot() {
 
 }  // namespace segment
 }  // namespace tensorrt
-
+}  // namespace tensorflow
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index 72022b99e2..48b12fde76 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h
index 90a226d91d..f09b261139 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index 5f8e73a59f..38cdabdff0 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -54,7 +54,7 @@
     }
     tensorflow::GraphDef outGraph;
     tensorflow::Status conversion_status =
-      tensorrt::convert::ConvertGraphDefToTensorRT(graph_def,
+        tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(graph_def,
 						   output_names,
 						   max_batch_size,
 						   max_workspace_size,
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
index 8962751f56..a8e52d13d3 100644
--- a/third_party/tensorrt/BUILD.tpl
+++ b/third_party/tensorrt/BUILD.tpl
@@ -1,8 +1,4 @@
 # -*- python -*-
-# Description:
-#   provide tensorrt information
-
-#TODO(Sami) these needs to be defined 
 
 licenses(["notice"])  
 
-- 
GitLab


From bf83571eb14d57ce32320bf8f49134458659a441 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 15:10:52 -0800
Subject: [PATCH 118/966] internal change

PiperOrigin-RevId: 183148922
---
 tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index 0ed5b2fad3..b842951eb2 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -171,7 +171,6 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
           DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
     }
   }
-  TF_RETURN_IF_ERROR(DumpGraphEvents(logdir, run, response, os));
 
   return Status::OK();
 }
-- 
GitLab


From 1df1544aeb8a6311c98a0d9ee9b6946e035fdbeb Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 24 Jan 2018 15:23:12 -0800
Subject: [PATCH 119/966] Make tensor_util_test.py work with the C API enabled.

Some of the shape inference error messages have changed. We also need
to wrap an InvalidArgumentError as a ValueError for backwards compat.

PiperOrigin-RevId: 183150857
---
 .../python/framework/tensor_util_test.py       | 15 +++++++++++++--
 tensorflow/python/framework/ops.py             | 18 +++++++++++-------
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
index 2effe8eb26..8cdb340f2d 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
@@ -77,6 +78,7 @@ class AssertScalarIntTest(test.TestCase):
               [3, 4], dtype=dtypes.int32))
 
 
+@test_util.with_c_api
 class WithShapeTest(test.TestCase):
 
   def _assert_with_shape(self, tensor, expected_value, expected_shape,
@@ -213,16 +215,25 @@ class WithShapeTest(test.TestCase):
       tensor_partial_shape.set_shape([None, 2])
 
       for incompatible_shape in [[0], [1]]:
+        if ops._USE_C_API:
+          error_message = "Shapes must be equal rank, but are 2 and 1"
+        else:
+          error_message = r"Shapes \(\?, 2\) and \([01],\) are not compatible"
         self.assertRaisesRegexp(
-            ValueError, r"Shapes \(\?, 2\) and \([01],\) are not compatible",
+            ValueError, error_message,
             tensor_util.with_shape, incompatible_shape, tensor_partial_shape)
       for incompatible_shape in [[1, 2, 1]]:
         self.assertRaisesRegexp(ValueError, "Dimensions must be equal",
                                 tensor_util.with_shape, incompatible_shape,
                                 tensor_partial_shape)
       for incompatible_shape in [[2, 1]]:
+        if ops._USE_C_API:
+          error_message = (r"Dimension 1 in both shapes must be equal, but are "
+                           r"2 and 1. Shapes are \[\?,2\] and \[2,1\].")
+        else:
+          error_message = r"Shapes \(\?, 2\) and \(2, 1\) are not compatible"
         self.assertRaisesRegexp(
-            ValueError, r"Shapes \(\?, 2\) and \(2, 1\) are not compatible",
+            ValueError, error_message,
             tensor_util.with_shape, incompatible_shape, tensor_partial_shape)
 
       compatible_shape = [2, 2]
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index a1c2e07e94..b107670275 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -481,13 +481,17 @@ class Tensor(_TensorLike):
           dim_list.append(-1)
         else:
           dim_list.append(dim.value)
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_api.TF_GraphSetTensorShape_wrapper(
-          self._op._graph._c_graph,  # pylint: disable=protected-access
-          self._as_tf_output(),
-          dim_list,
-          unknown_shape,
-          status)
+    try:
+      with errors.raise_exception_on_not_ok_status() as status:
+        c_api.TF_GraphSetTensorShape_wrapper(
+            self._op._graph._c_graph,  # pylint: disable=protected-access
+            self._as_tf_output(),
+            dim_list,
+            unknown_shape,
+            status)
+    except errors.InvalidArgumentError as e:
+      # Convert to ValueError for backwards compatibility.
+      raise ValueError(str(e))
 
   @property
   def value_index(self):
-- 
GitLab


From 7bf8ccdb4ef5b0b28c1cf0d5084e07ffbf0e2703 Mon Sep 17 00:00:00 2001
From: Yutaka Leon <yleon@google.com>
Date: Wed, 24 Jan 2018 15:45:55 -0800
Subject: [PATCH 120/966] Set export_outs in KMeans' EstimatorSpec.

PiperOrigin-RevId: 183154542
---
 .../contrib/factorization/python/ops/kmeans.py     | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index 9a5413fc3f..4d0f9b2424 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -25,6 +25,7 @@ import time
 from tensorflow.contrib.factorization.python.ops import clustering_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export_output
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -32,6 +33,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary import summary
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
@@ -207,6 +209,15 @@ class _ModelFn(object):
       training_hooks.append(
           _LossRelativeChangeHook(loss, self._relative_tolerance))
 
+    export_outputs = {
+        KMeansClustering.ALL_DISTANCES:
+            export_output.PredictOutput(all_distances[0]),
+        KMeansClustering.CLUSTER_INDEX:
+            export_output.PredictOutput(model_predictions[0]),
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.PredictOutput(model_predictions[0])
+    }
+
     return model_fn_lib.EstimatorSpec(
         mode=mode,
         predictions={
@@ -216,7 +227,8 @@ class _ModelFn(object):
         loss=loss,
         train_op=training_op,
         eval_metric_ops={KMeansClustering.SCORE: metrics.mean(loss)},
-        training_hooks=training_hooks)
+        training_hooks=training_hooks,
+        export_outputs=export_outputs)
 
 
 # TODO(agarwal,ands): support sharded input.
-- 
GitLab


From ffa63e57bdd703ae051ae849af5b5a272fca2223 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 24 Jan 2018 16:10:12 -0800
Subject: [PATCH 121/966] [TF:XLA] Replace most of HloProfilePrinter by a
 protocol buffer

This change replaces the meat of HloProfilePrinter with a protobuf
HloProfilePrinterData.  The original plan was to serialize HloProfilePrinter
into C++ source code and put that in a .cc file along with the string for the
xla::ProgramShape.  However, since we now directly serialize xla::ProgramShape
into a .o file, for consistency I think we should do the same thing for
HloProfilePrinter (instead of adding yet another output file to tfcompile).

The change itself is fairly simple, it is large mostly due to the mass renaming
I had to do.

PiperOrigin-RevId: 183158192
---
 .../tf2xla/xla_compiled_cpu_function.cc       |   2 +-
 .../tf2xla/xla_compiled_cpu_function.h        |  20 +--
 .../tf2xla/xla_jit_compiled_cpu_function.cc   |   6 +-
 tensorflow/compiler/xla/service/BUILD         |   6 +
 .../compiler/xla/service/cpu/cpu_compiler.cc  |  10 +-
 .../xla/service/cpu/cpu_executable.cc         |   4 +-
 .../compiler/xla/service/cpu/cpu_executable.h |   2 +-
 .../service/cpu/parallel_cpu_executable.cc    |   4 +-
 .../xla/service/cpu/parallel_cpu_executable.h |   2 +-
 tensorflow/compiler/xla/service/executable.cc |   2 +-
 tensorflow/compiler/xla/service/executable.h  |  21 ++--
 .../compiler/xla/service/gpu/gpu_compiler.cc  |   4 +-
 .../xla/service/gpu/gpu_executable.cc         |   4 +-
 .../compiler/xla/service/gpu/gpu_executable.h |   2 +-
 .../xla/service/hlo_execution_profile.cc      | 118 ++++++++----------
 .../xla/service/hlo_execution_profile.h       |  15 ++-
 .../xla/service/hlo_execution_profile_test.cc |   4 +-
 .../xla/service/hlo_profile_printer.cc        |  45 ++++---
 .../xla/service/hlo_profile_printer.h         |  79 +-----------
 .../service/hlo_profile_printer_data.proto    |  60 +++++++++
 tensorflow/compiler/xla/service/service.cc    |   2 +-
 .../xla/tests/xla_hlo_profile_test.cc         |   3 +-
 tensorflow/compiler/xla/util.h                |   8 +-
 23 files changed, 208 insertions(+), 215 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/hlo_profile_printer_data.proto

diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
index 79da701fd2..672e19bd93 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
@@ -29,7 +29,7 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
       arg_names_(static_data.arg_names),
       result_names_(static_data.result_names),
       program_shape_(static_data.program_shape),
-      hlo_profile_printer_(static_data.hlo_profile_printer) {
+      hlo_profile_printer_data_(static_data.hlo_profile_printer_data) {
   // Allocate arg and temp buffers.
   if (alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS) {
     alloc_args_ = tensorflow::tfcompile::runtime::MallocContiguousBuffers(
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index e0ae3ed9a8..48a8c083ca 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -26,7 +26,7 @@ limitations under the License.
 // never use this functionality.
 namespace xla {
 class ProgramShape;
-class HloProfilePrinter;
+class HloProfilePrinterData;
 }
 
 namespace tensorflow {
@@ -77,12 +77,14 @@ class XlaCompiledCpuFunction {
     // [Optional] Arg and result shapes.
     const xla::ProgramShape* program_shape = nullptr;
 
-    // [Optional] Profile printer.  Null if profiling is disabled.
-    const xla::HloProfilePrinter* hlo_profile_printer = nullptr;
+    // [Optional] Profile printer data.  Null if profiling is disabled.
+    const xla::HloProfilePrinterData* hlo_profile_printer_data = nullptr;
 
     // [Optional] The number of profile counters expected in the profile counter
     // buffer by the generated code and hlo_profile_printer.  0 if profiling is
-    // disabled.
+    // disabled.  This information is already present in
+    // hlo_profile_printer_data but xla::HloProfilePrinterData is forward
+    // declared so we don't have access to that information here.
     int64 profile_counters_size = 0;
   };
 
@@ -205,10 +207,12 @@ class XlaCompiledCpuFunction {
   // program shape isn't available.
   const xla::ProgramShape* ProgramShape() const { return program_shape_; }
 
-  bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; }
-  const xla::HloProfilePrinter& hlo_profile_printer() const {
+  bool hlo_profiling_enabled() const {
+    return hlo_profile_printer_data_ != nullptr;
+  }
+  const xla::HloProfilePrinterData& hlo_profile_printer_data() const {
     assert(hlo_profiling_enabled());
-    return *hlo_profile_printer_;
+    return *hlo_profile_printer_data_;
   }
 
  private:
@@ -234,7 +238,7 @@ class XlaCompiledCpuFunction {
   const char** arg_names_ = nullptr;
   const char** result_names_ = nullptr;
   const xla::ProgramShape* program_shape_ = nullptr;
-  const xla::HloProfilePrinter* hlo_profile_printer_ = nullptr;
+  const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 584417bc72..1fe6e69ff2 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -182,10 +182,10 @@ XlaJitCompiledCpuFunction::Compile(
   jit->static_data_.program_shape = jit->program_shape_.get();
 
   if (cpu_executable->hlo_profiling_enabled()) {
-    jit->static_data_.hlo_profile_printer =
-        &cpu_executable->hlo_profile_printer();
+    jit->static_data_.hlo_profile_printer_data =
+        &cpu_executable->hlo_profile_printer_data();
     jit->static_data_.profile_counters_size =
-        cpu_executable->hlo_profile_printer().profile_counters_size();
+        cpu_executable->hlo_profile_printer_data().profile_counters_size();
   }
 
   return std::move(jit_unique_ptr);
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index d0e2dc88ea..2c5f3ea1dd 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -29,6 +29,11 @@ xla_proto_library(
     deps = ["//tensorflow/compiler/xla:xla_data_proto"],
 )
 
+xla_proto_library(
+    name = "hlo_profile_printer_data",
+    srcs = ["hlo_profile_printer_data.proto"],
+)
+
 # Filegroup used to collect source files for dependency checking.
 filegroup(
     name = "c_srcs",
@@ -2267,6 +2272,7 @@ cc_library(
     srcs = ["hlo_profile_printer.cc"],
     hdrs = ["hlo_profile_printer.h"],
     deps = [
+        ":hlo_profile_printer_data",
         ":human_readable_profile_builder",
         "//tensorflow/compiler/xla:types",
     ],
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index f0507982b3..33af77e1a8 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -485,7 +485,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx;
   std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
   std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
-  std::unique_ptr<HloProfilePrinter> hlo_profile_printer;
+  std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data;
   if (module->config().hlo_profiling_enabled()) {
     hlo_profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
 
@@ -505,8 +505,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
 
     HloCostAnalysis cost_analysis(shape_size_bytes);
     TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis));
-    hlo_profile_printer =
-        CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
+    hlo_profile_printer_data =
+        CreateHloProfilePrinterData(*hlo_profile_index_map, cost_analysis);
     computation_to_profile_idx =
         hlo_profile_index_map->computation_to_profile_idx();
   }
@@ -619,7 +619,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     cpu_executable.reset(new ParallelCpuExecutable(
         std::move(jit), std::move(assignment), std::move(module),
         std::move(function_names), std::move(aligned_constants),
-        std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
+        std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
 
     if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
@@ -698,7 +698,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     jit->AddModule(std::move(llvm_module));
     cpu_executable.reset(new CpuExecutable(
         std::move(jit), std::move(assignment), std::move(module), function_name,
-        std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
+        std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
 
     if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index f335bd1bbc..802d0a6fb4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -55,9 +55,9 @@ CpuExecutable::CpuExecutable(
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<const HloModule> hlo_module,
     const string& entry_function_name,
-    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                  std::move(hlo_profile_index_map)),
       jit_(std::move(jit)),
       assignment_(std::move(assignment)) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 50443a5995..267b89a10b 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -51,7 +51,7 @@ class CpuExecutable : public Executable {
                 std::unique_ptr<const BufferAssignment> assignment,
                 std::unique_ptr<const HloModule> hlo_module,
                 const string& entry_function_name,
-                std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~CpuExecutable() override {}
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index d1b88b27f0..cd997f0789 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -61,9 +61,9 @@ ParallelCpuExecutable::ParallelCpuExecutable(
     std::unique_ptr<const HloInstructionMap<string>> function_names,
     std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
         aligned_constants,
-    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                  std::move(hlo_profile_index_map)),
       jit_(std::move(jit)),
       assignment_(std::move(assignment)),
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index 90ac94ef92..c393e9b8ea 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -55,7 +55,7 @@ class ParallelCpuExecutable : public Executable {
       std::unordered_map<const HloInstruction*,
                          std::unique_ptr<unsigned char[]>>
           aligned_constants,
-      std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~ParallelCpuExecutable() override {}
 
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 21e7fbea29..90481c7a88 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -73,7 +73,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
   std::unique_ptr<HloExecutionProfile> profile_ptr =
       module_config().debug_options().xla_hlo_profile() &&
               hlo_profiling_enabled()
-          ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer(),
+          ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer_data(),
                                             &hlo_profile_index_map())
           : nullptr;
 
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 5ecfdffe21..0aee535ee7 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -44,13 +44,14 @@ namespace xla {
 // interface that is used for launching compiled programs across platforms.
 class Executable {
  public:
-  explicit Executable(std::unique_ptr<const HloModule> hlo_module,
-                      std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
-                      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+  explicit Executable(
+      std::unique_ptr<const HloModule> hlo_module,
+      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
+      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
       : hlo_module_(std::move(hlo_module)),
-        hlo_profile_printer_(std::move(hlo_profile_printer)),
+        hlo_profile_printer_data_(std::move(hlo_profile_printer_data)),
         hlo_profile_index_map_(std::move(hlo_profile_index_map)) {
-    CHECK_EQ(hlo_profile_printer_.get() == nullptr,
+    CHECK_EQ(hlo_profile_printer_data_.get() == nullptr,
              hlo_profile_index_map_.get() == nullptr);
   }
   virtual ~Executable() {}
@@ -116,9 +117,9 @@ class Executable {
         "Equality test on this executable is not implemented.");
   }
 
-  const HloProfilePrinter& hlo_profile_printer() const {
+  const HloProfilePrinterData& hlo_profile_printer_data() const {
     CHECK(hlo_profiling_enabled());
-    return *hlo_profile_printer_;
+    return *hlo_profile_printer_data_;
   }
 
   const HloProfileIndexMap& hlo_profile_index_map() const {
@@ -129,7 +130,9 @@ class Executable {
   // Returns whether this executable was compiled with HLO profilings support
   // enabled. If not, the caller should not expect an hlo_execution_profile
   // passed to ExecuteOnStream above to be populated during execution.
-  bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; }
+  bool hlo_profiling_enabled() const {
+    return hlo_profile_printer_data_ != nullptr;
+  }
 
   const HloModule& module() const { return *hlo_module_; }
 
@@ -179,7 +182,7 @@ class Executable {
   // execution.
   int64 execution_count_ = 0;
 
-  std::unique_ptr<HloProfilePrinter> hlo_profile_printer_;
+  std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data_;
   std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 21798ed606..0cca3ca092 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -593,14 +593,14 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
   XLA_VLOG_LINES(2, thunk_schedule->ToString());
 
   std::unique_ptr<HloProfileIndexMap> profile_index_map;
-  std::unique_ptr<HloProfilePrinter> profile_printer;
+  std::unique_ptr<HloProfilePrinterData> profile_printer;
 
   if (module->config().hlo_profiling_enabled()) {
     HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
     TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
     profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
     profile_printer =
-        CreateHloProfilePrinter(*profile_index_map, cost_analysis);
+        CreateHloProfilePrinterData(*profile_index_map, cost_analysis);
   }
 
   auto* gpu_executable = new GpuExecutable(
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 51d164cdf4..f5d67b9ea9 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -116,9 +116,9 @@ GpuExecutable::GpuExecutable(
     std::unique_ptr<const ThunkSchedule> thunk_schedule,
     std::unique_ptr<const HloModule> hlo_module,
     std::unique_ptr<const BufferAssignment> assignment,
-    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                  std::move(hlo_profile_index_map)),
       ptx_(ptx),
       cubin_(cubin),
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 00da64dfad..b19cfd43de 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -54,7 +54,7 @@ class GpuExecutable : public Executable {
                 std::unique_ptr<const ThunkSchedule> thunk_schedule,
                 std::unique_ptr<const HloModule> hlo_module,
                 std::unique_ptr<const BufferAssignment> assignment,
-                std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
 
   // This should be called after set_ir_module_string.
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index 849aac0b12..f0df93b61d 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -40,83 +40,75 @@ HloProfileIndexMap::HloProfileIndexMap(const HloModule& module) {
   }
 }
 
-std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
+std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
     const HloProfileIndexMap& hlo_profile_index_map,
     const HloCostAnalysis& cost_analysis) {
-  using HloComputationInfo = HloProfilePrinter::HloComputationInfo;
-  using HloInstructionInfo = HloProfilePrinter::HloInstructionInfo;
-
-  HloComputationInfo* computation_infos =
-      new HloComputationInfo[hlo_profile_index_map.computation_count()];
-
-  // There are two "indices" in play here.  The first one is the index of the
-  // HloComputationInfo or HloInstructionInfo in the array that contains said
-  // HloComputationInfo or HloInstructionInfo.  The second index is the index of
-  // the HloComputationInfo or HloInstructionInfo in the profile counters array,
-  // as decided by hlo_profile_index_map.  The latter index is always referred
-  // to as "profile_index".
-
-  size_t computation_index_in_static_data = 0;
-  size_t max_profile_index = hlo_profile_index_map.total_count();
-  for (const auto& pair : hlo_profile_index_map.computation_to_profile_idx()) {
-    CHECK_LT(pair.second, max_profile_index);
+  using HloComputationInfo = HloProfilePrinterData::HloComputationInfo;
+  using HloInstructionInfo = HloProfilePrinterData::HloInstructionInfo;
+
+  size_t profile_counters_size = hlo_profile_index_map.total_count();
+
+  std::unique_ptr<HloProfilePrinterData> profile_printer_data =
+      MakeUnique<HloProfilePrinterData>();
+  profile_printer_data->set_profile_counters_size(profile_counters_size);
+  profile_printer_data->mutable_computation_infos()->Reserve(
+      hlo_profile_index_map.computation_count());
+
+  const auto& computation_to_profile_idx_map =
+      hlo_profile_index_map.computation_to_profile_idx();
+
+  // computation_to_profile_idx_map's order is not deterministic so create a
+  // deterministic computation_and_profile_idx_list so that we end up with a
+  // deterministic HloProfilePrinterData protobuf.
+
+  std::vector<std::pair<const HloComputation*, int64>>
+      computation_and_profile_idx_list(computation_to_profile_idx_map.begin(),
+                                       computation_to_profile_idx_map.end());
+
+  // The profile indices were computed deterministically in
+  // HloProfileIndexMap::HloProfileIndexMap.
+  c_sort(computation_and_profile_idx_list,
+         [](const std::pair<const HloComputation*, int64>& left,
+            const std::pair<const HloComputation*, int64>& right) {
+           return left.second < right.second;
+         });
+
+  for (const auto& pair : computation_and_profile_idx_list) {
+    CHECK_LT(pair.second, profile_counters_size);
     const HloComputation* computation = pair.first;
-    size_t current_computation_index = computation_index_in_static_data++;
     HloComputationInfo* computation_info =
-        &computation_infos[current_computation_index];
+        profile_printer_data->add_computation_infos();
 
-    computation_info->name = strdup(computation->name().c_str());
-    computation_info->profile_index = pair.second;
-    computation_info->instructions =
-        new HloInstructionInfo[computation->instruction_count()];
-    computation_info->instructions_size = computation->instruction_count();
+    computation_info->set_name(computation->name());
+    computation_info->set_profile_index(pair.second);
+    computation_info->mutable_instruction_infos()->Reserve(
+        computation->instruction_count());
 
-    size_t instruction_index_in_static_data = 0;
     for (const HloInstruction* hlo : computation->instructions()) {
-      HloProfilePrinter::HloInstructionInfo* instruction_info =
-          &computation_info->instructions[instruction_index_in_static_data++];
-      instruction_info->long_name = strdup(hlo->ToString().c_str());
-      instruction_info->short_name = strdup(
-          hlo->ToString(HloPrintOptions().set_compact_operands(true)).c_str());
-      instruction_info->category = strdup(hlo->ToCategory().c_str());
-      instruction_info->flop_count = cost_analysis.flop_count(*hlo);
-      instruction_info->transcendental_count =
-          cost_analysis.transcendental_count(*hlo);
-      instruction_info->bytes_accessed = cost_analysis.bytes_accessed(*hlo);
-      instruction_info->optimal_seconds = cost_analysis.optimal_seconds(*hlo);
-      instruction_info->profile_index =
-          hlo_profile_index_map.GetProfileIndexFor(*hlo);
-      CHECK_LT(instruction_info->profile_index, max_profile_index);
+      HloInstructionInfo* instruction_info =
+          computation_info->add_instruction_infos();
+      instruction_info->set_long_name(hlo->ToString());
+      instruction_info->set_short_name(
+          hlo->ToString(HloPrintOptions().set_compact_operands(true)));
+      instruction_info->set_category(hlo->ToCategory());
+      instruction_info->set_flop_count(cost_analysis.flop_count(*hlo));
+      instruction_info->set_transcendental_count(
+          cost_analysis.transcendental_count(*hlo));
+      instruction_info->set_bytes_accessed(cost_analysis.bytes_accessed(*hlo));
+      instruction_info->set_optimal_seconds(
+          cost_analysis.optimal_seconds(*hlo));
+      instruction_info->set_profile_index(
+          hlo_profile_index_map.GetProfileIndexFor(*hlo));
     }
   }
 
-  auto deleter = [](HloProfilePrinter::HloComputationInfo* computation_infos,
-                    int64 computation_infos_size) {
-    for (int64 i = 0; i < computation_infos_size; i++) {
-      HloInstructionInfo* instruction_infos = computation_infos[i].instructions;
-      for (int64 j = 0; j < computation_infos[i].instructions_size; j++) {
-        // We can't make instruction_infos[j].long_name etc. non-const pointers
-        // since they may point into static storage, so we have a const_cast
-        // here.
-        free(const_cast<char*>(instruction_infos[j].long_name));
-        free(const_cast<char*>(instruction_infos[j].short_name));
-        free(const_cast<char*>(instruction_infos[j].category));
-      }
-      delete[] instruction_infos;
-      free(const_cast<char*>(computation_infos[i].name));
-    }
-    delete[] computation_infos;
-  };
-
-  return MakeUnique<HloProfilePrinter>(
-      computation_infos, hlo_profile_index_map.computation_count(),
-      /*profile_counters_size=*/max_profile_index, deleter);
+  return profile_printer_data;
 }
 
 HloExecutionProfile::HloExecutionProfile(
-    const HloProfilePrinter* hlo_profile_printer,
+    const HloProfilePrinterData* hlo_profile_printer_data,
     const HloProfileIndexMap* hlo_profile_index_map)
-    : hlo_profile_printer_(*hlo_profile_printer),
+    : hlo_profile_printer_data_(*hlo_profile_printer_data),
       hlo_profile_index_map_(*hlo_profile_index_map),
       profile_counters_(
           /*count*/ hlo_profile_index_map_.total_count(),
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 1a6b069609..6fb91b9bef 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -77,8 +77,8 @@ class HloProfileIndexMap {
   std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_;
 };
 
-// Create an instance of `HloProfilePrinter` that owns its memory.
-std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
+// Create an instance of `HloProfilePrinterData`.
+std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
     const HloProfileIndexMap& hlo_profile_index_map,
     const HloCostAnalysis& cost_analysis);
 
@@ -90,7 +90,7 @@ class HloExecutionProfile {
  public:
   using DeviceDescription = perftools::gputools::DeviceDescription;
 
-  HloExecutionProfile(const HloProfilePrinter* hlo_profile_printer,
+  HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data,
                       const HloProfileIndexMap* hlo_profile_index_map);
 
   // Record how many cycles this HLO took to execute.
@@ -117,11 +117,10 @@ class HloExecutionProfile {
   // debugging; e.g. emits cycle counts, execution time at the nominal device
   // frequency, and the effective throughput given the provided cost_analysis
   // for the operations in a given computation. Returns an empty string if it
-  // wasn't possible to generate a printable version. cost_analysis should be a
-  // clean analysis that can be used to visit the computation.
+  // wasn't possible to generate a printable version.
   string ToString(const DeviceDescription& device_description) const {
-    return hlo_profile_printer_.ToString(profile_counters_.data(),
-                                         device_description.clock_rate_ghz());
+    return PrintHloProfile(hlo_profile_printer_data_, profile_counters_.data(),
+                           device_description.clock_rate_ghz());
   }
 
   std::vector<int64>* mutable_profile_counters() { return &profile_counters_; }
@@ -130,7 +129,7 @@ class HloExecutionProfile {
   }
 
  private:
-  const HloProfilePrinter& hlo_profile_printer_;
+  const HloProfilePrinterData& hlo_profile_printer_data_;
   const HloProfileIndexMap& hlo_profile_index_map_;
 
   // Stores per-Hlo profile counters.  This is the only thing that changes when
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index b1e6729e2b..a0cb28246d 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -73,8 +73,8 @@ TEST_F(HloExecutionProfileTest, Basic) {
 
   HloCostAnalysis cost_analysis(shape_size_function);
   HloProfileIndexMap profile_index_map(*hlo_module);
-  std::unique_ptr<HloProfilePrinter> profile_printer =
-      CreateHloProfilePrinter(profile_index_map, cost_analysis);
+  std::unique_ptr<HloProfilePrinterData> profile_printer =
+      CreateHloProfilePrinterData(profile_index_map, cost_analysis);
   HloExecutionProfile execution_profile(profile_printer.get(),
                                         &profile_index_map);
 
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.cc b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
index e944ad1513..dcc2279301 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.cc
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
@@ -18,20 +18,20 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
 
 namespace xla {
-string HloProfilePrinter::ToString(const int64* counters,
-                                   double clock_rate_ghz) const {
+string PrintHloProfile(const HloProfilePrinterData& hlo_profile_printer_data,
+                       const int64* counters, double clock_rate_ghz) {
+  using HloComputationInfo = HloProfilePrinterData::HloComputationInfo;
+  using HloInstructionInfo = HloProfilePrinterData::HloInstructionInfo;
+
   string result;
 
-  for (int computation_idx = 0; computation_idx < computation_infos_size_;
-       computation_idx++) {
-    const HloComputationInfo& computation = computation_infos_[computation_idx];
-    const HloInstructionInfo* instructions_begin = computation.instructions;
-    const HloInstructionInfo* instructions_end =
-        computation.instructions + computation.instructions_size;
+  for (const HloComputationInfo& computation_info :
+       hlo_profile_printer_data.computation_infos()) {
+    const auto& instruction_infos = computation_info.instruction_infos();
     bool any_instruction_profiled =
-        std::any_of(instructions_begin, instructions_end,
+        std::any_of(instruction_infos.begin(), instruction_infos.end(),
                     [&](const HloInstructionInfo& instruction_info) {
-                      return counters[instruction_info.profile_index] != 0;
+                      return counters[instruction_info.profile_index()] != 0;
                     });
 
     if (!any_instruction_profiled) {
@@ -41,16 +41,19 @@ string HloProfilePrinter::ToString(const int64* counters,
     // Once we start using this in AOT for real, we will probably need a more
     // minimal version of HumanReadableProfileBuilder.
     HumanReadableProfileBuilder builder(
-        computation.name, counters[computation.profile_index], clock_rate_ghz);
+        computation_info.name(), counters[computation_info.profile_index()],
+        clock_rate_ghz);
 
-    for (const auto* instruction = instructions_begin;
-         instruction != instructions_end; instruction++) {
+    for (const auto& instruction_info : instruction_infos) {
       builder.AddOp(
-          /*op_name=*/instruction->long_name,
-          /*short_name=*/instruction->short_name, instruction->category,
-          counters[instruction->profile_index], instruction->flop_count,
-          instruction->transcendental_count, instruction->bytes_accessed,
-          instruction->optimal_seconds);
+          /*op_name=*/instruction_info.long_name(),
+          /*short_name=*/instruction_info.short_name(),
+          instruction_info.category(),
+          counters[instruction_info.profile_index()],
+          instruction_info.flop_count(),
+          instruction_info.transcendental_count(),
+          instruction_info.bytes_accessed(),
+          instruction_info.optimal_seconds());
     }
 
     result += builder.ToString();
@@ -58,10 +61,4 @@ string HloProfilePrinter::ToString(const int64* counters,
 
   return result;
 }
-
-HloProfilePrinter::~HloProfilePrinter() {
-  if (deleter_) {
-    deleter_(computation_infos_, computation_infos_size_);
-  }
-}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.h b/tensorflow/compiler/xla/service/hlo_profile_printer.h
index 35152e744d..b72325c755 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.h
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.h
@@ -20,84 +20,13 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/compiler/xla/service/hlo_profile_printer_data.pb.h"
 #include "tensorflow/compiler/xla/types.h"
 
 namespace xla {
-// Instances of this class can pretty-print profile counters gathered from
-// running an XLA computation without having access to the backing module.
-class HloProfilePrinter {
- public:
-  // Holds meta information about an HloInstruction.
-  //
-  // The pointer-typed fields can be owning or non-owning -- this decision is
-  // manifested as the deleter_ function in the containing HloProfilePrinter.
-  struct HloInstructionInfo {
-    // Textual information for pretty printing.
-    const char* long_name;
-    const char* short_name;
-    const char* category;
-
-    // Metrics computed by HloCostAnalysis.
-    float flop_count;
-    float transcendental_count;
-    float bytes_accessed;
-    float optimal_seconds;
-
-    // The index into the profile counters array for the HloInstruction
-    // corresponding to this HloInstructionInfo.
-    int64 profile_index;
-  };
-
-  // Holds meta information about an HloComputation.
-  //
-  // The pointer-typed fields can be owning or non-owning -- this decision is
-  // manifested as the deleter_ function in the containing HloProfilePrinter.
-  struct HloComputationInfo {
-    const char* name;
-
-    // The index into the profile counters array for the HloInstruction
-    // corresponding to this HloComputationInfo.
-    int64 profile_index;
-
-    HloInstructionInfo* instructions;
-    int64 instructions_size;
-  };
-
-  HloProfilePrinter(
-      HloComputationInfo* computation_infos, int64 computation_infos_size,
-      int64 profile_counters_size,
-      std::function<void(HloComputationInfo*, int64)> deleter = nullptr)
-      : computation_infos_(computation_infos),
-        computation_infos_size_(computation_infos_size),
-        profile_counters_size_(profile_counters_size),
-        deleter_(std::move(deleter)) {}
-
-  HloProfilePrinter(HloProfilePrinter&& other) {
-    std::swap(other.computation_infos_, computation_infos_);
-    std::swap(other.computation_infos_size_, computation_infos_size_);
-    std::swap(other.deleter_, deleter_);
-  }
-
-  HloProfilePrinter(const HloProfilePrinter&) = delete;
-  HloProfilePrinter& operator=(const HloProfilePrinter&) = delete;
-
-  // Converts the profile counter sequence `counters` to a human readable string
-  // representation.
-  string ToString(const int64* counters, double clock_rate_ghz) const;
-
-  // Returns the size of the profile buffer expected by this printer.
-  int64 profile_counters_size() const { return profile_counters_size_; }
-
-  ~HloProfilePrinter();
-
- private:
-  // The `computation_infos_` field can be owning or non-owning -- this decision
-  // is manifested as the deleter_ function.
-  HloComputationInfo* computation_infos_ = nullptr;
-  int64 computation_infos_size_ = 0;
-  int64 profile_counters_size_ = 0;
-  std::function<void(HloComputationInfo*, int64)> deleter_;
-};
+// Pretty-print an array of profile counters using hlo_profile_printer_data.
+string PrintHloProfile(const HloProfilePrinterData& hlo_profile_printer_data,
+                       const int64* counters, double clock_rate_ghz);
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto b/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto
new file mode 100644
index 0000000000..9f22b733fe
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package xla;
+
+option cc_enable_arenas = true;
+
+// Describes how to pretty-print a profile counter array gathered for a specific
+// HloModule.
+message HloProfilePrinterData {
+  // Pretty-printer information about an HloInstruction.
+  message HloInstructionInfo {
+    string long_name = 1;
+    string short_name = 2;
+    string category = 3;
+
+    // Metrics computed by HloCostAnalysis.
+    float flop_count = 4;
+    float transcendental_count = 5;
+    float bytes_accessed = 6;
+    float optimal_seconds = 7;
+
+    // The index into the profile counters array for the HloInstruction
+    // corresponding to this HloInstructionInfo.
+    int64 profile_index = 8;
+  }
+
+  // Pretty-printer information about an HloComputation.
+  message HloComputationInfo {
+    string name = 1;
+
+    // The index into the profile counters array for the HloComputation
+    // corresponding to this HloComputationInfo.
+    int64 profile_index = 2;
+
+    // HloInstructionInfos for every HloInstruction in the HloComputation for
+    // corresponding to this HloComputattionInfo.
+    repeated HloInstructionInfo instruction_infos = 3;
+  }
+
+  // HloComputationInfos for every HloComputation in the HloModule.
+  repeated HloComputationInfo computation_infos = 1;
+
+  // The size of the profile counters array we will pretty-print.
+  int64 profile_counters_size = 2;
+}
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index e230d25f1e..926ebbe314 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -569,7 +569,7 @@ Service::ExecuteParallelAndRegisterResult(
     se::Stream* stream = index_to_profiled_stream.second;
     Executable* executable = executables[device];
     const HloModule& module = executable->module();
-    HloExecutionProfile hlo_profile(&executable->hlo_profile_printer(),
+    HloExecutionProfile hlo_profile(&executable->hlo_profile_printer_data(),
                                     &executable->hlo_profile_index_map());
     TF_RETURN_IF_ERROR(
         executable->PopulateExecutionProfile(&hlo_profile, stream->parent()));
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 146fbadcb6..1d2f436194 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -110,7 +110,8 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
 
   Executable* executable = local_executable->executable();
   HloExecutionProfile hlo_execution_profile(
-      &executable->hlo_profile_printer(), &executable->hlo_profile_index_map());
+      &executable->hlo_profile_printer_data(),
+      &executable->hlo_profile_index_map());
 
   TF_ASSERT_OK_AND_ASSIGN(
       Backend::StreamPtr stream_ptr,
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index bb2db2010c..1d7dd34449 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -398,13 +398,11 @@ std::vector<std::pair<int64, int64>> CommonFactors(
 // Removes illegal characters from filenames.
 string SanitizeFileName(string file_name);
 
-// Simple wrapper around std::all_of.
 template <typename Container, typename Predicate>
 bool c_all_of(Container container, Predicate predicate) {
   return std::all_of(std::begin(container), std::end(container), predicate);
 }
 
-// Simple wrapper around std::transform.
 template <typename InputContainer, typename OutputIterator,
           typename UnaryOperation>
 OutputIterator c_transform(InputContainer input_container,
@@ -414,7 +412,6 @@ OutputIterator c_transform(InputContainer input_container,
                         output_iterator, unary_op);
 }
 
-// Simple wrapper around std::copy_if.
 template <class InputContainer, class OutputIterator, class UnaryPredicate>
 OutputIterator c_copy_if(InputContainer input_container,
                          OutputIterator output_iterator,
@@ -423,6 +420,11 @@ OutputIterator c_copy_if(InputContainer input_container,
                       output_iterator, predicate);
 }
 
+template <class InputContainer, class Comparator>
+void c_sort(InputContainer& input_container, Comparator comparator) {
+  std::sort(input_container.begin(), input_container.end(), comparator);
+}
+
 }  // namespace xla
 
 #define XLA_LOG_LINES(SEV, STRING) \
-- 
GitLab


From e76a7a7c8bccb1fb67559160c9a06ba3a722fd54 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 24 Jan 2018 16:10:37 -0800
Subject: [PATCH 122/966] [TF:XLA:CPU] Fix an issue with explicitly vectorized
 reductions

The explicitly vectorized reduction kernel assumes that the input and the output
layouts are "same" (not identical which is not possible, but have the same
ordering of the un-reduced dimensions).  This change makes IrEmitter check this
precondition.

PiperOrigin-RevId: 183158249
---
 .../compiler/xla/service/cpu/ir_emitter.cc    |  51 +++++++
 tensorflow/compiler/xla/tests/BUILD           |  13 ++
 .../compiler/xla/tests/reduce_hlo_test.cc     | 132 ++++++++++++++++++
 3 files changed, 196 insertions(+)
 create mode 100644 tensorflow/compiler/xla/tests/reduce_hlo_test.cc

diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index b03a9f9aa5..71e8133189 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -62,6 +62,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -1271,6 +1272,52 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   return Status::OK();
 }
 
+// Returns true if the relative order of the unreduced dimensions stays the same
+// through the reduce operation.
+static bool ReductionPreservesLayout(const HloInstruction& reduce) {
+  DCHECK_EQ(reduce.opcode(), HloOpcode::kReduce);
+
+  // Maps dimensions that were not reduced from their dimension numbers in the
+  // source shape to their dimensions numbers in the destination shape.
+  //
+  // So if we reduce f32[A,B,C,D] on dimensions 1 and 2, this map contains
+  // [0->0, 3->1].
+  gtl::FlatMap<int64, int64> unreduced_dim_map;
+
+  gtl::FlatSet<int64> reduced_dims(reduce.dimensions().begin(),
+                                   reduce.dimensions().end());
+
+  const Shape& operand_shape = reduce.operand(0)->shape();
+  const Shape& result_shape = reduce.shape();
+
+  int64 delta = 0;
+  for (int64 i = 0; i < operand_shape.dimensions_size(); i++) {
+    if (reduced_dims.count(i)) {
+      delta++;
+    } else {
+      InsertOrDie(&unreduced_dim_map, i, i - delta);
+    }
+  }
+
+  // Iterate dimensions minor to major and check that the corresponding
+  // dimensions in the source and target shapes are equivalent.
+  int64 result_dim_idx = 0;
+  for (int64 operand_dim_idx = 0;
+       operand_dim_idx < operand_shape.dimensions_size(); operand_dim_idx++) {
+    int64 operand_dim = operand_shape.layout().minor_to_major(operand_dim_idx);
+    if (!reduced_dims.count(operand_dim)) {
+      if (FindOrDie(unreduced_dim_map, operand_dim) !=
+          result_shape.layout().minor_to_major(result_dim_idx++)) {
+        return false;
+      }
+    }
+  }
+
+  CHECK_EQ(result_dim_idx, result_shape.dimensions_size());
+
+  return true;
+}
+
 IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
     HloComputation* function, string* failure_reason) const {
   CHECK_EQ(function->num_parameters(), 2);
@@ -1540,6 +1587,10 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
     gtl::ArraySlice<int64> dimensions, HloComputation* function,
     string* failure_reason) {
+  if (!ReductionPreservesLayout(*reduce)) {
+    return false;
+  }
+
   ReductionGenerator reduction_generator =
       MatchReductionGenerator(function, failure_reason);
   if (!reduction_generator) {
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 6ab406d6c3..bc15bd9593 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1072,6 +1072,19 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "reduce_hlo_test",
+    srcs = ["reduce_hlo_test.cc"],
+    deps = [
+        ":client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "call_test",
     srcs = ["call_test.cc"],
diff --git a/tensorflow/compiler/xla/tests/reduce_hlo_test.cc b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
new file mode 100644
index 0000000000..c0a2c0ca4c
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
@@ -0,0 +1,132 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <array>
+
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+// Tests the Reduce HLO in ways that can't be done using the ComputationBuilder
+// API.
+
+namespace xla {
+namespace {
+
+namespace str_util = tensorflow::str_util;
+namespace strings = tensorflow::strings;
+
+struct ReduceLayout {
+  std::array<int64, 4> input_minor_to_major;
+  std::array<int64, 3> output_minor_to_major;
+
+  string ToString() const {
+    return strings::StrCat(str_util::Join(input_minor_to_major, "x"), "_",
+                           str_util::Join(output_minor_to_major, "x"));
+  }
+};
+
+string PrintReduceLayout(
+    ::testing::TestParamInfo<ReduceLayout> reduce_layout_param) {
+  return reduce_layout_param.param.ToString();
+}
+
+void PrintTo(const ReduceLayout& reduce_layout, ::std::ostream* os) {
+  *os << reduce_layout.ToString();
+}
+
+class ReduceWithLayoutTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<ReduceLayout> {};
+
+StatusOr<std::unique_ptr<HloModule>> GetParsedModule() {
+  const char* const hlo_string = R"(
+HloModule BadReduce
+
+Sum {
+  x.1 = f32[] parameter(0)
+  y.1 = f32[] parameter(1)
+  ROOT add.1 = f32[] add(x.1, y.1)
+}
+
+ENTRY reduce.1 {
+  parameter = f32[2,2,2,3]{3,2,1,0} parameter(0)
+  init_value = f32[] constant(0)
+  reduce = f32[2,2,3]{2,1,0} reduce(parameter, init_value), dimensions={1}, to_apply=Sum
+  ROOT copy = f32[2,2,3]{2,1,0} copy(reduce)
+}
+)";
+
+  return tools::Parse(hlo_string);
+}
+
+// TODO(b/72454718): XLA:GPU does not support executing code compiled without
+// optimizations.
+XLA_TEST_P(ReduceWithLayoutTest, DISABLED_ON_GPU(Reduce)) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module, GetParsedModule());
+  HloInstruction* reduce_instruction =
+      module->entry_computation()->root_instruction()->mutable_operand(0);
+  ASSERT_EQ(reduce_instruction->opcode(), HloOpcode::kReduce);
+
+  const ReduceLayout& reduce_layout = GetParam();
+
+  Shape* reduce_output_shape = reduce_instruction->mutable_shape();
+  *reduce_output_shape->mutable_layout() =
+      LayoutUtil::MakeLayout(reduce_layout.output_minor_to_major);
+
+  Shape* reduce_input_shape =
+      reduce_instruction->mutable_operand(0)->mutable_shape();
+  *reduce_input_shape->mutable_layout() =
+      LayoutUtil::MakeLayout(reduce_layout.input_minor_to_major);
+
+  std::unique_ptr<Literal> reduce_input =
+      Literal::CreateR4<float>({{ /*i0=0*/
+                                 {/*i1=0*/
+                                  {-0.246092796, -0.179497838, -0.161181688},
+                                  {-0.151643038, -0.240213156, -0.198156}},
+                                 {/*i1=1*/
+                                  {-0.14222312, -0.162200093, -0.193907976},
+                                  {-0.239411, -0.198166847, -0.172471642}}},
+                                { /*i0=1*/
+                                 {/*i1=0*/
+                                  {-0.22965157, -0.218723893, -0.129257083},
+                                  {-0.188762426, -0.16123569, -0.181166649}},
+                                 {/*i1=1*/
+                                  {-0.241772294, -0.245131493, -0.160247207},
+                                  {-0.179881215, -0.23383224, -0.121976733}}}});
+
+  EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), ErrorSpec(1e-5)));
+}
+
+INSTANTIATE_TEST_CASE_P(ReduceWithLayoutTest_Instantiation,
+                        ReduceWithLayoutTest,
+                        ::testing::Values(                           //
+                            ReduceLayout{{3, 2, 1, 0}, {0, 1, 2}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {0, 2, 1}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {1, 2, 0}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {1, 0, 2}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {2, 0, 1}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {2, 1, 0}},   //
+                            ReduceLayout{{3, 1, 2, 0}, {1, 2, 0}},   //
+                            ReduceLayout{{1, 2, 3, 0}, {1, 0, 2}},   //
+                            ReduceLayout{{0, 2, 1, 3}, {2, 0, 1}}),  //
+                        PrintReduceLayout);
+
+}  // namespace
+}  // namespace xla
-- 
GitLab


From dc0511e51a8f2af8b0f053d7f04352f0b0be58fa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 16:23:09 -0800
Subject: [PATCH 123/966] Basic AddN support in toco

PiperOrigin-RevId: 183160197
---
 tensorflow/contrib/lite/toco/BUILD            |  1 +
 .../contrib/lite/toco/export_tensorflow.cc    | 15 ++++++
 .../convert_trivial_addn_to_add.cc            | 51 +++++++++++++++++++
 .../graph_transformations.h                   |  1 +
 .../propagate_fixed_sizes.cc                  | 25 +++++++++
 .../contrib/lite/toco/import_tensorflow.cc    | 15 ++++++
 tensorflow/contrib/lite/toco/model.h          | 11 ++++
 .../contrib/lite/toco/tflite/operator.cc      |  2 +
 tensorflow/contrib/lite/toco/toco_tooling.cc  |  1 +
 tensorflow/contrib/lite/toco/tooling_util.cc  | 11 ++++
 10 files changed, 133 insertions(+)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index ad8f0e4a47..041e248790 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -172,6 +172,7 @@ cc_library(
         "graph_transformations/convert_expanddims_to_reshape.cc",
         "graph_transformations/convert_pure_conv_to_depthwise.cc",
         "graph_transformations/convert_reorder_axes.cc",
+        "graph_transformations/convert_trivial_addn_to_add.cc",
         "graph_transformations/convert_trivial_transpose_to_reshape.cc",
         "graph_transformations/create_im2col_arrays.cc",
         "graph_transformations/dequantize.cc",
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 4fc01dbc20..529df3cd2e 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -519,6 +519,18 @@ void ConvertAddOperator(const Model& model, const AddOperator& src_op,
   (*add_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertAddNOperator(const Model& model, const AddNOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* add_op = tensorflow_graph->add_node();
+  add_op->set_op("AddN");
+  add_op->set_name(src_op.outputs[0]);
+  for (const auto& input : src_op.inputs) {
+    *add_op->add_input() = input;
+  }
+  (*add_op->mutable_attr())["N"].set_i(src_op.inputs.size());
+  (*add_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertMulOperator(const Model& model, const MulOperator& src_op,
                         GraphDef* tensorflow_graph) {
   auto* add_op = tensorflow_graph->add_node();
@@ -1406,6 +1418,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kAdd) {
     ConvertAddOperator(model, static_cast<const AddOperator&>(src_op),
                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kAddN) {
+    ConvertAddNOperator(model, static_cast<const AddNOperator&>(src_op),
+                        tensorflow_graph);
   } else if (src_op.type == OperatorType::kMul) {
     ConvertMulOperator(model, static_cast<const MulOperator&>(src_op),
                        tensorflow_graph);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc
new file mode 100644
index 0000000000..dcaaddbf3b
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// This pass will convert an AddN operator with only 2 inputs into a regular Add
+// operator, to which more optimizations may apply.
+bool ConvertTrivialAddNToAdd::Run(Model* model, std::size_t op_index) {
+  auto addn_it = model->operators.begin() + op_index;
+  if (addn_it->get()->type != OperatorType::kAddN) {
+    return false;
+  }
+  AddNOperator* addn_op = static_cast<AddNOperator*>(addn_it->get());
+  CHECK_GE(addn_op->inputs.size(), 2);
+  CHECK_EQ(addn_op->outputs.size(), 1);
+
+  // We only reduce AddN with N=2 to a regular Add.
+  if (addn_op->inputs.size() != 2) {
+    return false;
+  }
+
+  // Copy inputs & outputs to regular Add.
+  auto* add_op = new AddOperator;
+  add_op->inputs.push_back(addn_op->inputs[0]);
+  add_op->inputs.push_back(addn_op->inputs[1]);
+  add_op->outputs = addn_op->outputs;
+
+  // Replace the AddN operator in the graph.
+  const auto add_it = model->operators.emplace(addn_it, add_op);
+  addn_it = add_it + 1;
+  CHECK_EQ(addn_it->get(), addn_op);
+  model->operators.erase(addn_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 4ac2265be9..e11bebcd4e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -114,6 +114,7 @@ void RunGraphTransformations(Model* model, const string& message,
 // List of all graph transformations
 DECLARE_GRAPH_TRANSFORMATION(ConvertExpandDimsToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertPureConvToDepthwise)
+DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialAddNToAdd)
 DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialTransposeToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(EnsureBiasVectors)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index ff0a3bd881..4fb3b6ae7a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -406,6 +406,28 @@ void ProcessSimpleBinaryOperator(Model* model, Operator* op) {
                                   &output_array);
 }
 
+void ProcessAddNOperator(Model* model, Operator* op) {
+  // Yield until all input dims have been resolved.
+  //
+  // TODO(myenik): Since AddN does not support broadcasting, maybe we could
+  // actually use this to improve shape propagation by propagating the shape of
+  // one input to all other inputs once it is resolved instead of just the
+  // output, since all inputs must be the same size and shape for a well-formed
+  // graph.
+  for (const auto& input : op->inputs) {
+    const auto& input_array = model->GetArray(input);
+    if (!input_array.has_shape()) {
+      return;
+    }
+  }
+
+  // AddN does not support broadcasting, all inputs must be the same shape, so
+  // we just take the first input shape and apply it to the output.
+  const auto& input0_array = model->GetArray(op->inputs[0]);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  output_array.copy_shape(input0_array.shape());
+}
+
 bool KeepDims(const Operator& op) {
   switch (op.type) {
     case OperatorType::kTensorFlowMin:
@@ -1282,6 +1304,9 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kTensorFlowGreaterEqual:
       ProcessSimpleBinaryOperator(model, op);
       break;
+    case OperatorType::kAddN:
+      ProcessAddNOperator(model, op);
+      break;
     case OperatorType::kConv:
       ProcessConvOperator(model, static_cast<ConvOperator*>(op));
       break;
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index e8f318cd43..ca378af4c5 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -696,6 +696,19 @@ void ConvertAddOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertAddNOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "AddN");
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  auto* op = new AddNOperator;
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
 void ConvertMulOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
@@ -1862,6 +1875,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertSquareOperator(node, tf_import_flags, model);
     } else if (node.op() == "Add") {
       ConvertAddOperator(node, tf_import_flags, model);
+    } else if (node.op() == "AddN") {
+      ConvertAddNOperator(node, tf_import_flags, model);
     } else if (node.op() == "Mul") {
       ConvertMulOperator(node, tf_import_flags, model);
     } else if (node.op() == "Sub") {
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index b1b9b718bb..d1af371fd4 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -32,6 +32,7 @@ enum class OperatorType {
   kNone,
   // General-purpose neural network operators.
   kAdd,
+  kAddN,
   kAveragePool,
   kBatchNormalization,
   kConv,
@@ -559,6 +560,16 @@ struct AddOperator : Operator {
   AddOperator() : Operator(OperatorType::kAdd) {}
 };
 
+// Element-wise addition operator for N inputs.
+//
+// Inputs:
+//   inputs[i]: The i-th array to add together to form the output.
+//
+// TensorFlow equivalent: AddN
+struct AddNOperator : Operator {
+  AddNOperator() : Operator(OperatorType::kAddN) {}
+};
+
 // Concatenation operator: concatenates its inputs
 // along the axis.
 //
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index d75d1fcc5b..298f49025f 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -807,6 +807,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
 
   // There operators are supported by Toco, but not by TF Lite, and has no
   // attributes.
+  ops.emplace_back(
+      new SimpleOperator<AddNOperator>("ADDN", OperatorType::kAddN));
   ops.emplace_back(new SimpleOperator<NegOperator>("NEG", OperatorType::kNeg));
   ops.emplace_back(new SimpleOperator<TensorFlowRsqrtOperator>(
       "RSQRT", OperatorType::kTensorFlowRsqrt));
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index f2753c84e9..720c33777d 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -52,6 +52,7 @@ void MakeGeneralGraphTransformationsSet(
     GraphTransformationsSet* transformations) {
   CHECK(transformations->empty());
   transformations->Add(new ConvertExpandDimsToReshape);
+  transformations->Add(new ConvertTrivialAddNToAdd);
   transformations->Add(new ConvertTrivialTransposeToReshape);
   transformations->Add(new ConvertReorderAxes);
   transformations->Add(new ResolveReshapeAttributes);
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 8543ba4742..99a54a300b 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -197,6 +197,7 @@ const char* OperatorTypeName(OperatorType type) {
   case OperatorType::k##c:              \
     return #c;
     HANDLE_OPERATORTYPENAME_CASE(Add)
+    HANDLE_OPERATORTYPENAME_CASE(AddN)
     HANDLE_OPERATORTYPENAME_CASE(AveragePool)
     HANDLE_OPERATORTYPENAME_CASE(BatchNormalization)
     HANDLE_OPERATORTYPENAME_CASE(Conv)
@@ -1396,6 +1397,16 @@ bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
         total += RequiredBufferSizeForShape(output_array.shape());
         break;
       }
+      case OperatorType::kAddN: {
+        const auto& output_array = model.GetArray(op->outputs[0]);
+        if (!output_array.has_shape()) {
+          return false;
+        }
+        // AddN cost is roughly the same cost as N-1 Adds.
+        const int num_adds = op->inputs.size() - 1;
+        total += num_adds * RequiredBufferSizeForShape(output_array.shape());
+        break;
+      }
       case OperatorType::kLogistic:
       case OperatorType::kSoftmax:
       case OperatorType::kTanh: {
-- 
GitLab


From 7b44779ff2031e73ea46ecc7cf2a73405d22a57a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 16:28:20 -0800
Subject: [PATCH 124/966] Prepare internal kernels for BroadcastMul and
 BroadcastAdd

PiperOrigin-RevId: 183160907
---
 .../internal/optimized/optimized_ops.h        | 48 ++++++++++++---
 .../internal/reference/reference_ops.h        | 60 ++++++++++++++-----
 2 files changed, 86 insertions(+), 22 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index c35b9da938..8163c76cfd 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1538,9 +1538,10 @@ void Add(const int32* input1_data, const Dims<4>& input1_dims,
 // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
 // is no longer referenced in this file, move NdArrayDesc<T> from types.h to
 // reference_ops.h.
-template <FusedActivationFunctionType Ac, typename T>
+template <typename T>
 void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
                   const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
                   T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
 
@@ -1563,15 +1564,30 @@ void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                          const Dims<4>& input1_dims, int32 input1_offset,
                          int32 input1_multiplier, int input1_shift,
@@ -1772,9 +1788,10 @@ void Mul(const int32* input1_data, const Dims<4>& input1_dims,
 // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
 // is no longer referenced in this file, move NdArrayDesc<T> from types.h to
 // reference_ops.h.
-template <FusedActivationFunctionType Ac, typename T>
+template <typename T>
 void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
                   const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
                   T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastMul");
 
@@ -1797,15 +1814,30 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastMul(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                          int32 input1_offset, const uint8* input2_data,
                          const Dims<4>& input2_dims, int32 input2_offset,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 64651d8348..31bade26f9 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -889,10 +889,11 @@ inline void Add(int left_shift, const uint8* input1_data,
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
 // generate max(D1, D2) nested for loops.
-template <FusedActivationFunctionType Ac>
-void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
-                  const float* input2_data, const Dims<4>& input2_dims,
-                  float* output_data, const Dims<4>& output_dims) {
+template <typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
 
   NdArrayDesc<4> desc1;
@@ -914,15 +915,30 @@ void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                          const Dims<4>& input1_dims, int32 input1_offset,
                          int32 input1_multiplier, int input1_shift,
@@ -1053,10 +1069,11 @@ void Mul(const float* input1_data, const Dims<4>& input1_dims,
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
 // generate max(D1, D2) nested for loops.
-template <FusedActivationFunctionType Ac>
-void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
-                  const float* input2_data, const Dims<4>& input2_dims,
-                  float* output_data, const Dims<4>& output_dims) {
+template <typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastMul");
 
   NdArrayDesc<4> desc1;
@@ -1078,15 +1095,30 @@ void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastMul(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                          int32 input1_offset, const uint8* input2_data,
                          const Dims<4>& input2_dims, int32 input2_offset,
-- 
GitLab


From 0454180a47bdbe7174a2f51ecbe76b2aed2ce719 Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Wed, 24 Jan 2018 16:41:50 -0800
Subject: [PATCH 125/966] Defer logging infeed error messages for a short time
 to see if the main session returns.

PiperOrigin-RevId: 183162866
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 457 ++++++++++--------
 1 file changed, 249 insertions(+), 208 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index b6d685b3fc..2ae3a26a85 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ===================================================================
-
 """TPUEstimator class."""
 
 from __future__ import absolute_import
@@ -24,6 +23,7 @@ from contextlib import contextmanager
 import copy
 import threading
 import time
+import traceback
 
 import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
@@ -60,7 +60,6 @@ from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
 
-
 _INITIAL_LOSS = 1e7
 _ZERO_LOSS = 0.
 _TPU_ESTIMATOR = 'tpu_estimator'
@@ -86,8 +85,7 @@ def _create_global_step(graph):
         initializer=init_ops.zeros_initializer(),
         trainable=False,
         use_resource=True,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES,
-                     ops.GraphKeys.GLOBAL_STEP])
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP])
 
 
 def _create_or_get_iterations_per_loop():
@@ -100,8 +98,8 @@ def _create_or_get_iterations_per_loop():
     raise RuntimeError('Multiple iterations_per_loop_var in collection.')
 
   with ops.colocate_with(training_util.get_global_step()):
-    with variable_scope.variable_scope(_TPU_ESTIMATOR,
-                                       reuse=variable_scope.AUTO_REUSE):
+    with variable_scope.variable_scope(
+        _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE):
       return variable_scope.get_variable(
           _ITERATIONS_PER_LOOP_VAR,
           initializer=init_ops.zeros_initializer(),
@@ -242,9 +240,9 @@ class _TPUContext(object):
         return self._eval_batch_size
       return None
 
-    global_batch_size = (self._train_batch_size if
-                         mode == model_fn_lib.ModeKeys.TRAIN
-                         else self._eval_batch_size)
+    global_batch_size = (
+        self._train_batch_size
+        if mode == model_fn_lib.ModeKeys.TRAIN else self._eval_batch_size)
     # On TPU
     if self.is_input_sharded_per_core():
       return global_batch_size // self.num_cores
@@ -291,8 +289,9 @@ class _TPUContext(object):
     # The tpu job is determined by the run_config. Right now, this method is
     # required as tpu_config is not part of the RunConfig.
     mode = self._assert_mode()
-    master = (run_config.evaluation_master if mode == model_fn_lib.ModeKeys.EVAL
-              else run_config.master)
+    master = (
+        run_config.evaluation_master
+        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
     if master in _LOCAL_MASTERS:
       return None
 
@@ -319,6 +318,7 @@ class _TPUContext(object):
   def tpu_host_placement_function(self):
     """Returns the TPU host place function."""
     master = self.master_job
+
     def _placement_function(_sentinal=None, core_id=None, host_id=None):  # pylint: disable=invalid-name
       assert _sentinal is None
       if core_id is not None and host_id is not None:
@@ -333,19 +333,23 @@ class _TPUContext(object):
         if core_id is not None:
           host_id = core_id / 8
         return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
+
     return _placement_function
 
   @property
   def tpu_device_placement_function(self):
     master = self.master_job
     job_device = '' if master is None else ('/job:%s' % master)
+
     def _placement_function(i):
       return '%s/task:%d/device:TPU:%d' % (job_device, i / 8, i % 8)
+
     return _placement_function
 
   @property
   def tpu_ordinal_function(self):
     """Returns the TPU ordinal fn."""
+
     def _tpu_ordinal_function(index):
       """Return the TPU ordinal associated with a shard.
 
@@ -358,6 +362,7 @@ class _TPUContext(object):
         The ordinal of the TPU device the shard's infeed should be placed on.
       """
       return index % 8
+
     return _tpu_ordinal_function
 
 
@@ -371,14 +376,16 @@ class _SIGNAL(object):
   STOP = -2
 
 
-class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
-    'mode',
-    'predictions',
-    'loss',
-    'train_op',
-    'eval_metrics',
-    'export_outputs',
-    'scaffold_fn'])):
+class TPUEstimatorSpec(
+    collections.namedtuple('TPUEstimatorSpec', [
+        'mode',
+        'predictions',
+        'loss',
+        'train_op',
+        'eval_metrics',
+        'export_outputs',
+        'scaffold_fn'
+    ])):
   """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
 
   See `EstimatorSpec` for `mode`, 'predictions, 'loss', 'train_op', and
@@ -416,111 +423,116 @@ class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
     """Creates a validated `TPUEstimatorSpec` instance."""
     if eval_metrics is not None:
       _EvalMetrics.validate(eval_metrics)
-    return super(TPUEstimatorSpec, cls).__new__(cls,
-                                                mode=mode,
-                                                predictions=predictions,
-                                                loss=loss,
-                                                train_op=train_op,
-                                                eval_metrics=eval_metrics,
-                                                export_outputs=export_outputs,
-                                                scaffold_fn=scaffold_fn)
+    return super(TPUEstimatorSpec, cls).__new__(
+        cls,
+        mode=mode,
+        predictions=predictions,
+        loss=loss,
+        train_op=train_op,
+        eval_metrics=eval_metrics,
+        export_outputs=export_outputs,
+        scaffold_fn=scaffold_fn)
 
   def as_estimator_spec(self):
     """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
     eval_metric_ops = _EvalMetrics.to_metric_metric_ops_for_cpu(
         self.eval_metrics)
     scaffold = self.scaffold_fn() if self.scaffold_fn else None
-    return model_fn_lib.EstimatorSpec(mode=self.mode,
-                                      predictions=self.predictions,
-                                      loss=self.loss,
-                                      train_op=self.train_op,
-                                      eval_metric_ops=eval_metric_ops,
-                                      export_outputs=self.export_outputs,
-                                      scaffold=scaffold)
+    return model_fn_lib.EstimatorSpec(
+        mode=self.mode,
+        predictions=self.predictions,
+        loss=self.loss,
+        train_op=self.train_op,
+        eval_metric_ops=eval_metric_ops,
+        export_outputs=self.export_outputs,
+        scaffold=scaffold)
+
+
+class _OpQueueContext(object):
+  """Manages work queue and thread for a infeed/outfeed thread."""
+
+  def __init__(self, name, target, args):
+    self._name = name
+    self._queue = Queue.Queue()
+    args = (self,) + args
+    self._thread = threading.Thread(name=name, target=target, args=args)
+    self._thread.daemon = True
+    self._thread.start()
+
+  def stop(self):
+    self._queue.put(_SIGNAL.STOP)
+
+  def send_next_batch_signal(self, iterations):
+    self._queue.put(iterations)
+
+  def read_iteration_counts(self):
+    while True:
+      signal = self._queue.get(block=True)
+      logging.debug('%s read signal %s', self._name, signal)
+      if signal == _SIGNAL.STOP:
+        logging.info('%s received signal, stopping.', self._name)
+        return
+      yield signal
 
+  def join(self):
+    logging.info('Shutting down %s thread.' % self._name)
+    self.stop()
+    self._thread.join()
 
-class _InfeedOutfeedThreadBaseController(object):
-  """This wraps the infeed/outfeed thread and stops when Estimator finishes."""
 
-  def __init__(self, thd):
-    self._signal_queue = Queue.Queue()
-    thd.daemon = True
-    thd.start()
-    self._thd = thd
+class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
+  """A Session hook setting up the TPU initialization, infeed, and outfeed.
 
-  def block_and_get_signal(self):
-    return self._signal_queue.get()
+  This hook does two major things:
+  1. initialize and shutdown TPU system.
+  2. launch and join the threads for infeed enqueue and (optional) outfeed
+     dequeue.
+  """
 
-  def send_next_batch_signal(self, signal=_SIGNAL.NEXT_BATCH):
-    self._signal_queue.put(signal)
+  def __init__(self, ctx, enqueue_ops, dequeue_ops=None):
+    self._master_job = ctx.master_job
+    self._enqueue_ops = enqueue_ops
+    self._dequeue_ops = dequeue_ops
+    self._initial_infeed_sleep_secs = (
+        ctx.config.tpu_config.initial_infeed_sleep_secs)
+    self._session_cancel_timer = None
 
-  def join(self):
-    self._signal_queue.put(_SIGNAL.STOP)
-    self._thd.join()
+    self._feed_error = None
+    self._finished = False
 
+  def begin(self):
+    logging.info('TPU job name %s', self._master_job)
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+    self._init_op = [tpu.initialize_system(job=self._master_job)]
+    self._finalize_op = [tpu.shutdown_system(job=self._master_job)]
 
-class _OutfeedThreadController(_InfeedOutfeedThreadBaseController):
-  """This wraps the outfeed thread and stops when Estimator finishes."""
+  def _log_error(self, session, error):
+    """Log an infeed or outfeed error.
 
-  def __init__(self, session, dequeue_ops):
-    super(_OutfeedThreadController, self).__init__(
-        threading.Thread(target=self._execute_dequeue_ops,
-                         args=(session, dequeue_ops)))
+    This logs a short error message immediately, and schedules a timer to
+    emit the full stack trace and error message after a short period of time.
+    If the main session has terminated by the time the timer triggers, we
+    assume the real source of the error was from the main session and avoid
+    emitting a stack trace for the infeed.
 
-  def _execute_dequeue_ops(self, session, dequeue_ops):
-    count = 0
-    while True:
-      signal = self.block_and_get_signal()
-      if signal == _SIGNAL.STOP:
-        logging.info('Stop outfeed thread.')
-        return
+    Args:
+      session: `tf.Session`, session to be terminated
+      error: exception that triggered logging.
+    """
+    logging.warning(
+        '\n\n'
+        'Error occurred during infeed/outfeed.  This may be due to a compile '
+        'error in the main session.  Waiting for a short time for the main '
+        'session to come back.\n\n%s', error)
 
-      iterations = signal
-      for i in range(iterations):
-        logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
-        session.run(dequeue_ops)
-      count += 1
+    self._feed_error = traceback.format_exc()
 
-  def join(self):
-    logging.info('Waiting for Outfeed Thread to exit.')
-    super(_OutfeedThreadController, self).join()
-
-
-class _InfeedThreadController(_InfeedOutfeedThreadBaseController):
-  """This wraps the infeed thread and stops when Estimator finishes."""
-
-  def __init__(self, session, enqueue_ops, initial_infeed_sleep_secs):
-    super(_InfeedThreadController, self).__init__(
-        threading.Thread(
-            target=self._input_thread_fn_for_loading,
-            args=(session, enqueue_ops, initial_infeed_sleep_secs)))
-
-  def _input_thread_fn_for_loading(self, session, enqueue_ops,
-                                   initial_infeed_sleep_secs):
-    count = 0
-    if initial_infeed_sleep_secs:
-      logging.info('Infeed thread sleeping for %d seconds.',
-                   initial_infeed_sleep_secs)
-      time.sleep(initial_infeed_sleep_secs)
-      logging.info('Infeed thread starting after sleep')
-    try:
-      while True:
-        signal = self._signal_queue.get()
-        if signal == _SIGNAL.STOP:
-          logging.info('Stop Infeed input thread.')
-          return
-
-        if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-          # Enqueue batches for next loop.
-          session.run(enqueue_ops)
-        else:
-          iterations = signal
-          for i in range(iterations):
-            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
-            session.run(enqueue_ops)
-          count += 1
+    # If we've already encountered a feed error, don't schedule another
+    # cancellation op.
+    if self._session_cancel_timer:
+      return
 
-    except Exception:  # pylint: disable=broad-except
+    def _cancel_session():
       # Close the session to avoid the main thread from hanging. If input
       # pipeline triggers any error, the infeed thread dies but the main thread
       # for TPU computation waits for the infeed enqueue forever. Close the
@@ -535,77 +547,94 @@ class _InfeedThreadController(_InfeedOutfeedThreadBaseController):
       # exception in the main thread, instead of the expected compile error.
       # User code that depends on having the proper exception type will
       # therefore be confused.
-      logging.error(
-          'Failed running infeed, closing session.\n'
-          'You may see an exception from your main session after this. '
-          'Sleep for 2 minutes before close Session from infeed thread to '
-          'allow the main thread returning an error first, if any.',
-          exc_info=1
-      )
-      time.sleep(120)
-      logging.error('Closing the failed session.')
-      session.close()
-
-  def join(self):
-    logging.info('Waiting for Infeed Thread to exit.')
-    super(_InfeedThreadController, self).join()
-
-
-class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
-  """A Session hook setting up the TPU initialization, infeed, and outfeed.
-
-  This hook does two major things:
-  1. initialize and shutdown TPU system.
-  2. launch and join the threads for infeed enqueue and (optional) outfeed
-     dequeue.
-  """
+      time.sleep(5)
+
+      # If the main session is still running, the infeed/outfeed errors are
+      # legitimate, and should be logged.
+      if not self._finished:
+        logging.error('Feed error: %s', self._feed_error)
+        logging.error('Closing session.  A RuntimeError should follow.')
+        session.close()
+
+    self._session_cancel_timer = threading.Thread(target=_cancel_session)
+    self._session_cancel_timer.daemon = True
+    self._session_cancel_timer.start()
+
+  def _run_infeed(self, queue_ctx, session):
+    logging.info('Starting infeed thread controller.')
+    if self._initial_infeed_sleep_secs:
+      logging.info('%s thread sleeping for %d seconds.', self._name,
+                   self._initial_infeed_sleep_secs)
+      time.sleep(self._initial_infeed_sleep_secs)
+      logging.info('%s thread starting after sleep', self._name)
 
-  def __init__(self, ctx, enqueue_ops, dequeue_ops=None):
-    self._master_job = ctx.master_job
-    self._enqueue_ops = enqueue_ops
-    self._dequeue_ops = dequeue_ops
-    self._initial_infeed_sleep_secs = (
-        ctx.config.tpu_config.initial_infeed_sleep_secs)
+    try:
+      if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+        for _ in queue_ctx.read_iteration_counts():
+          session.run(self._enqueue_ops)
+      else:
+        for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+          for i in xrange(steps):
+            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
+            session.run(self._enqueue_ops)
+      logging.debug('Infeed thread finished, shutting down.')
+    except Exception as e:  # pylint: disable=broad-except
+      self._log_error(session, e)
 
-  def begin(self):
-    logging.info('TPU job name %s', self._master_job)
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-    self._init_op = [tpu.initialize_system(job=self._master_job)]
-    self._finalize_op = [tpu.shutdown_system(job=self._master_job)]
+  def _run_outfeed(self, queue_ctx, session):
+    logging.info('Starting outfeed thread controller.')
+    try:
+      for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+        for i in xrange(steps):
+          logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
+          session.run(self._dequeue_ops)
+    except Exception as e:  # pylint: disable=broad-except
+      self._log_error(session, e)
 
   def after_create_session(self, session, coord):
     logging.info('Init TPU system')
-    session.run(self._init_op,
-                options=config_pb2.RunOptions(timeout_in_ms=5*60*1000))
+    session.run(
+        self._init_op,
+        options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
 
     logging.info('Start infeed thread controller')
-    self._infeed_thd_controller = _InfeedThreadController(
-        session, self._enqueue_ops, self._initial_infeed_sleep_secs)
+    self._infeed_controller = _OpQueueContext(
+        name='InfeedController', target=self._run_infeed, args=(session,))
 
     if self._dequeue_ops is not None:
       logging.info('Start outfeed thread controller')
-      self._outfeed_thd_controller = _OutfeedThreadController(
-          session, self._dequeue_ops)
+      self._outfeed_controller = _OpQueueContext(
+          name='OutfeedController', target=self._run_outfeed, args=(session,))
 
   def before_run(self, run_context):
+    if self._feed_error:
+      logging.warning('Feed error occurred, terminating session.')
+      run_context.request_stop()
+      return
+
     iterations = run_context.session.run(self._iterations_per_loop_var)
 
     logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
+    self._infeed_controller.send_next_batch_signal(iterations)
 
-    self._infeed_thd_controller.send_next_batch_signal(iterations)
     if self._dequeue_ops is not None:
       # TODO(xiejw): Refactor the outfeed dequeue into tf.while_loop.
-      logging.info(
-          'Dequeue next (%d) batch(es) of data from outfeed.', iterations)
-      self._outfeed_thd_controller.send_next_batch_signal(iterations)
+      logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
+                   iterations)
+      self._outfeed_controller.send_next_batch_signal(iterations)
 
   def end(self, session):
+    if self._session_cancel_timer:
+      logging.warning('Feed error occurred; waiting for message.')
+      self._session_cancel_timer.join()
+
+    self._finished = True
     logging.info('Stop infeed thread controller')
-    self._infeed_thd_controller.join()
+    self._infeed_controller.join()
 
     if self._dequeue_ops is not None:
       logging.info('Stop output thread controller')
-      self._outfeed_thd_controller.join()
+      self._outfeed_controller.join()
 
     logging.info('Shutdown TPU system.')
     session.run(self._finalize_op)
@@ -676,8 +705,8 @@ class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
       run_context.request_stop()
     else:
       iterations = self._next_iterations(global_step, self._last_step)
-      self._iterations_per_loop_var.load(iterations,
-                                         session=run_context.session)
+      self._iterations_per_loop_var.load(
+          iterations, session=run_context.session)
 
 
 class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
@@ -698,8 +727,8 @@ class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
     self._iterations_per_loop_var.load(self._num_steps, session=session)
 
 
-def generate_per_core_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder):
+def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn,
+                                              inputs_structure_recorder):
   """Generates infeed enqueue ops for per-core input_fn on a single host."""
   captured_infeed_queue = _CapturedObject()
 
@@ -729,9 +758,9 @@ def generate_per_core_enqueue_ops_fn_for_host(
         per_host_sharded_inputs)
 
     per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-        per_host_sharded_inputs,
-        tpu_ordinal_function=ctx.tpu_ordinal_function)
+        per_host_sharded_inputs, tpu_ordinal_function=ctx.tpu_ordinal_function)
     return per_host_enqueue_ops
+
   return enqueue_ops_fn, captured_infeed_queue
 
 
@@ -748,8 +777,7 @@ def generate_per_host_enqueue_ops_fn_for_host(
         features, labels = inputs
       else:
         features, labels = inputs, None
-      inputs_structure_recorder.validate_and_record_structure(
-          features, labels)
+      inputs_structure_recorder.validate_and_record_structure(features, labels)
       unsharded_tensor_list = (
           inputs_structure_recorder.flatten_features_and_labels(
               features, labels))
@@ -763,9 +791,9 @@ def generate_per_host_enqueue_ops_fn_for_host(
 
       per_host_enqueue_ops = (
           infeed_queue.split_inputs_and_generate_enqueue_ops(
-              unsharded_tensor_list,
-              placement_function=lambda x: device))
+              unsharded_tensor_list, placement_function=lambda x: device))
       return per_host_enqueue_ops
+
   return enqueue_ops_fn, captured_infeed_queue
 
 
@@ -815,6 +843,7 @@ class _InputPipeline(object):
 
     def validate_and_record_structure(self, features, labels):
       """Validates and records the structure of features` and `labels`."""
+
       def _extract_key_names(tensor_or_dict):
         if tensor_or_dict is None:
           return []
@@ -842,8 +871,8 @@ class _InputPipeline(object):
       flattened_inputs = []
       if self._feature_names:
         # We need a fixed ordering for enqueueing and dequeueing.
-        flattened_inputs.extend([features[name]
-                                 for name in self._feature_names])
+        flattened_inputs.extend(
+            [features[name] for name in self._feature_names])
       else:
         flattened_inputs.append(features)
 
@@ -870,11 +899,11 @@ class _InputPipeline(object):
         ValueError: If the number of expected tensors from `flattened_inputs`
           mismatches the recorded structure.
       """
-      expected_num_features = (len(self._feature_names) if self._feature_names
-                               else 1)
+      expected_num_features = (
+          len(self._feature_names) if self._feature_names else 1)
       if self._has_labels:
-        expected_num_labels = (len(self._label_names) if self._label_names
-                               else 1)
+        expected_num_labels = (
+            len(self._label_names) if self._label_names else 1)
       else:
         expected_num_labels = 0
 
@@ -895,8 +924,8 @@ class _InputPipeline(object):
       if expected_num_labels == 0:
         unflattened_label = None
       elif self._label_names:
-        unflattened_label = dict(zip(self._label_names,
-                                     flattened_inputs[expected_num_features:]))
+        unflattened_label = dict(
+            zip(self._label_names, flattened_inputs[expected_num_features:]))
       else:
         # Single tensor case.
         unflattened_label = flattened_inputs[expected_num_features]
@@ -961,8 +990,9 @@ class _InputPipeline(object):
                     self._ctx, self._input_fn, self._inputs_structure_recorder))
 
             if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-              enqueue_ops.append(_wrap_computation_in_while_loop(
-                  device=host_device, op_fn=enqueue_ops_fn))
+              enqueue_ops.append(
+                  _wrap_computation_in_while_loop(
+                      device=host_device, op_fn=enqueue_ops_fn))
             else:
               enqueue_ops.append(enqueue_ops_fn())
             # Infeed_queue_getter must be called after enqueue_ops_fn is called.
@@ -979,8 +1009,9 @@ class _InputPipeline(object):
                     self._batch_axis, host_device))
 
             if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-              enqueue_ops.append(_wrap_computation_in_while_loop(
-                  device=host_device, op_fn=enqueue_ops_fn))
+              enqueue_ops.append(
+                  _wrap_computation_in_while_loop(
+                      device=host_device, op_fn=enqueue_ops_fn))
             else:
               enqueue_ops.append(enqueue_ops_fn())
             infeed_queues.append(captured_infeed_queue.get())
@@ -1066,6 +1097,7 @@ class _ModelFnWrapper(object):
 
       with ops.control_dependencies([train_op]):
         return array_ops.identity(loss)
+
     return train_step, captured_scaffold_fn
 
   def convert_to_single_tpu_eval_step(self, dequeue_fn):
@@ -1114,6 +1146,7 @@ class _ModelFnWrapper(object):
 
       with ops.control_dependencies([outfeed_ops]):
         return math_ops.add(total_loss, loss)
+
     return eval_step, eval_metrics, captured_scaffold_fn
 
   def _call_model_fn(self, features, labels):
@@ -1138,10 +1171,9 @@ class _ModelFnWrapper(object):
       kwargs['params'] = params
 
     if 'params' not in model_fn_args:
-      raise ValueError(
-          'model_fn ({}) does not include params argument, '
-          'required by TPUEstimator to pass batch size as '
-          'params[\'batch_size\']'.format(self._model_fn))
+      raise ValueError('model_fn ({}) does not include params argument, '
+                       'required by TPUEstimator to pass batch size as '
+                       'params[\'batch_size\']'.format(self._model_fn))
 
     batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
     if batch_size_for_model_fn is not None:
@@ -1348,8 +1380,9 @@ class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
   def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
     examples_per_sec = self._batch_size * elapsed_steps / elapsed_time
     if self._summary_writer is not None:
-      example_summary = Summary(value=[Summary.Value(
-          tag='examples_sec', simple_value=examples_per_sec)])
+      example_summary = Summary(value=[
+          Summary.Value(tag='examples_sec', simple_value=examples_per_sec)
+      ])
       self._summary_writer.add_summary(example_summary, global_step)
     logging.info('examples/sec: %g', examples_per_sec)
 
@@ -1488,9 +1521,8 @@ class TPUEstimator(estimator_lib.Estimator):
           '`config` must be provided with type `tpu_config.RunConfig`')
 
     if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
-      raise ValueError(
-          '{} are reserved keys but existed in params {}.'.format(
-              _RESERVED_PARAMS_KEYS, params))
+      raise ValueError('{} are reserved keys but existed in params {}.'.format(
+          _RESERVED_PARAMS_KEYS, params))
 
     if use_tpu:
       if train_batch_size is None:
@@ -1571,8 +1603,9 @@ class TPUEstimator(estimator_lib.Estimator):
     if max_steps is not None:
       util_lib.check_positive_integer(max_steps, 'Train max_steps')
 
-    return [_TPUStopAtStepHook(self._iterations_per_training_loop, steps,
-                               max_steps)]
+    return [
+        _TPUStopAtStepHook(self._iterations_per_training_loop, steps, max_steps)
+    ]
 
   def _convert_eval_steps_to_hooks(self, steps):
     with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx:
@@ -1640,6 +1673,7 @@ class TPUEstimator(estimator_lib.Estimator):
       # `features` in `model_fn` signature.
       def _input_fn():
         return input_fn(**kwargs)
+
       return _input_fn
 
   def _augment_model_fn(self, model_fn, batch_axis):
@@ -1695,9 +1729,10 @@ class TPUEstimator(estimator_lib.Estimator):
         total_loss, eval_metric_ops, scaffold = _eval_on_tpu_system(
             ctx, model_fn_wrapper, dequeue_fn)
         iterations_per_loop_var = _create_or_get_iterations_per_loop()
-        mean_loss = math_ops.div(
-            total_loss,
-            math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype))
+        mean_loss = math_ops.div(total_loss,
+                                 math_ops.cast(
+                                     iterations_per_loop_var,
+                                     dtype=total_loss.dtype))
 
         # Creates a dummy metric update_op for all metrics. Estimator expects
         # all metrics in eval_metric_ops have update_op and calls them one by
@@ -1725,6 +1760,7 @@ class TPUEstimator(estimator_lib.Estimator):
             evaluation_hooks=hooks,
             eval_metric_ops=eval_metric_ops,
             scaffold=scaffold)
+
     return _model_fn
 
 
@@ -1737,15 +1773,16 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
       model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn))
 
   def multi_tpu_eval_steps_on_single_shard():
-    return training_loop.repeat(iterations_per_loop_var,
-                                single_tpu_eval_step,
-                                [_ZERO_LOSS],
-                                name='loop')
+    return training_loop.repeat(
+        iterations_per_loop_var,
+        single_tpu_eval_step, [_ZERO_LOSS],
+        name='loop')
 
-  (loss,) = tpu.shard(multi_tpu_eval_steps_on_single_shard,
-                      inputs=[],
-                      num_shards=num_cores,
-                      outputs_from_all_shards=False)
+  (loss,) = tpu.shard(
+      multi_tpu_eval_steps_on_single_shard,
+      inputs=[],
+      num_shards=num_cores,
+      outputs_from_all_shards=False)
 
   scaffold = _get_scaffold(captured_scaffold_fn)
   return loss, eval_metric_ops, scaffold
@@ -1762,14 +1799,14 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   def multi_tpu_train_steps_on_single_shard():
     return training_loop.repeat(
         iterations_per_loop_var,
-        single_tpu_train_step,
-        [_INITIAL_LOSS],
+        single_tpu_train_step, [_INITIAL_LOSS],
         name=b'loop')
 
-  (loss,) = tpu.shard(multi_tpu_train_steps_on_single_shard,
-                      inputs=[],
-                      num_shards=num_cores,
-                      outputs_from_all_shards=False)
+  (loss,) = tpu.shard(
+      multi_tpu_train_steps_on_single_shard,
+      inputs=[],
+      num_shards=num_cores,
+      outputs_from_all_shards=False)
 
   scaffold = _get_scaffold(captured_scaffold_fn)
   return loss, scaffold
@@ -1777,6 +1814,7 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
 
 def _wrap_computation_in_while_loop(device, op_fn):
   """Wraps the ops generated by `op_fn` in tf.while_loop."""
+
   def computation(i):
     with ops.control_dependencies(op_fn()):
       return i + 1
@@ -1788,7 +1826,8 @@ def _wrap_computation_in_while_loop(device, op_fn):
     iterations = array_ops.identity(iterations_per_loop_var)
     return control_flow_ops.while_loop(
         lambda i: i < iterations,
-        computation, [constant_op.constant(0)], parallel_iterations=1)
+        computation, [constant_op.constant(0)],
+        parallel_iterations=1)
 
 
 def _validate_tpu_training_graph():
@@ -1801,8 +1840,9 @@ def _validate_tpu_training_graph():
 
   # Check if there is atleast one CrossReplicaSum operation in the graph
   # This should be introduced by using the CrossShardOptimizer wrapper
-  cross_replica_sum_ops = [o for o in operations
-                           if o.type == _CROSS_REPLICA_SUM_OP]
+  cross_replica_sum_ops = [
+      o for o in operations if o.type == _CROSS_REPLICA_SUM_OP
+  ]
   if not cross_replica_sum_ops:
     raise ValueError(
         'CrossShardOptimizer must be used for model training on TPUs.')
@@ -1849,9 +1889,11 @@ def _get_scaffold(captured_scaffold_fn):
 
   if scaffold:
     wrapped_finalize = scaffold.finalize
+
     def _finalize():
       with _CapturingContext('Inside Scaffold.finalize'):
         wrapped_finalize()
+
     scaffold.finalize = _finalize
   return scaffold
 
@@ -1866,9 +1908,8 @@ class _CapturingContext(control_flow_ops.ControlFlowContext):
   def AddOp(self, op):  # pylint: disable=invalid-name
     for c in op.inputs:
       if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
-        raise ValueError(
-            '{}: Op {} depends on TPU computation {}, '
-            'which is not allowed.'.format(self._message, op, c))
+        raise ValueError('{}: Op {} depends on TPU computation {}, '
+                         'which is not allowed.'.format(self._message, op, c))
 
   def __enter__(self):
     # pylint: disable=protected-access
-- 
GitLab


From 5e3ed99469d32e494869b8d044620b2ef8e96a40 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 17:11:42 -0800
Subject: [PATCH 126/966] Dedup control dependencies where the node already
 depends on the source as a non-control input.

PiperOrigin-RevId: 183166819
---
 .../grappler/optimizers/dependency_optimizer.cc |  6 +-----
 tensorflow/python/framework/function_test.py    | 17 +++++++++++------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 1f68ecbade..d2da125236 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -58,11 +58,7 @@ void PruneControlInputs(NodeDef* node) {
   int pos = 0;
   while (pos < node->input_size()) {
     const string& input = node->input(pos);
-    // TODO(rmlarsen): Remove control inputs that also appears as a regular
-    // inputs. Currently, doing so breaks testControlFlowStrictness in
-    // python/framework/function_test.
-    //    if (!inputs.insert(NodeName(input)).second && IsControlInput(input)) {
-    if (IsControlInput(input) && !inputs.insert(input).second) {
+    if (!inputs.insert(NodeName(input)).second && IsControlInput(input)) {
       VLOG(1) << "**** Removing duplicate control input: " << input
               << " from node " << node->DebugString();
       node->mutable_input()->SwapElements(pos, node->input_size() - 1);
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 57e5a724c9..a4ca3f9a89 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -26,6 +26,7 @@ import numpy as np
 
 from tensorflow.core.framework import function_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -451,13 +452,17 @@ class FunctionTest(test.TestCase):
                                          lambda y: AssertFail(y), [x])
       # pylint: enable=unnecessary-lambda
 
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
     # Enables inlining.
-    config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-        optimizer_options=config_pb2.OptimizerOptions(
-            opt_level=config_pb2.OptimizerOptions.L0,
-            do_common_subexpression_elimination=True,
-            do_function_inlining=True,
-            do_constant_folding=True)))
+    config = config_pb2.ConfigProto(
+        graph_options=config_pb2.GraphOptions(
+            optimizer_options=config_pb2.OptimizerOptions(
+                opt_level=config_pb2.OptimizerOptions.L0,
+                do_common_subexpression_elimination=True,
+                do_function_inlining=True,
+                do_constant_folding=True),
+            rewrite_options=rewriter_config))
 
     with session.Session(config=config) as sess:
       # Since the 'False' branch is not taken, the assertion should not fire.
-- 
GitLab


From 95a568febcee480cd6d4e6a6bd687754b1ca1422 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Jan 2018 14:20:14 -0800
Subject: [PATCH 127/966] Make bfloat16 work correctly with matmul

PiperOrigin-RevId: 182437226
---
 .../compiler/tf2xla/kernels/matmul_op.cc      | 15 ++++++++++++++-
 .../kernel_tests/sparse_matmul_op_test.py     | 19 +++++++++++++------
 tensorflow/python/ops/gradient_checker.py     | 14 ++++++++++++--
 tensorflow/python/ops/math_ops.py             |  8 +++++++-
 4 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
index 644abd5905..886baf8115 100644
--- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
@@ -29,10 +29,12 @@ constexpr std::array<DataType, 5> kMatmulTypes = {
 class MatMulOp : public XlaOpKernel {
  public:
   explicit MatMulOp(OpKernelConstruction* ctx, bool is_sparse = false)
-      : XlaOpKernel(ctx) {
+      : XlaOpKernel(ctx), is_sparse_(is_sparse) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
     if (is_sparse) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("Ta", &a_type_));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("Tb", &b_type_));
       // SparseMatMul is actually dense matmul with a hint that one or
       // both of the inputs may contain a lot of zeroes. On CPU these
       // inputs are dynamically converted to sparse representation
@@ -66,14 +68,25 @@ class MatMulOp : public XlaOpKernel {
 
     xla::ComputationDataHandle a = ctx->Input(0);
     xla::ComputationDataHandle b = ctx->Input(1);
+    if (is_sparse_) {
+      if (a_type_ == DT_BFLOAT16) {
+        a = ctx->builder()->ConvertElementType(a, xla::F32);
+      }
+      if (b_type_ == DT_BFLOAT16) {
+        b = ctx->builder()->ConvertElementType(b, xla::F32);
+      }
+    }
     auto lhs = (transpose_a_) ? ctx->builder()->Transpose(a, {1, 0}) : a;
     auto rhs = (transpose_b_) ? ctx->builder()->Transpose(b, {1, 0}) : b;
     ctx->SetOutput(0, ctx->builder()->Dot(lhs, rhs));
   }
 
  private:
+  bool is_sparse_;
   bool transpose_a_;
   bool transpose_b_;
+  DataType a_type_;
+  DataType b_type_;
 };
 
 REGISTER_XLA_OP(Name("MatMul").TypeConstraint("T", kMatmulTypes), MatMulOp);
diff --git a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
index 6ca4479671..4935ed6ca5 100644
--- a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
@@ -69,7 +69,7 @@ class SparseMatMulTest(test.TestCase):
 
     np_ans = np.matrix(np_x) * np.matrix(np_y)
     self.assertShapeEqual(np_ans, tf_ans)
-    self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-4)
+    self.assertAllCloseAccordingToType(np_ans, out, rtol=1e-4, atol=1e-4)
 
   def testBasic(self):
     x = np.arange(0., 4.).reshape([4, 1]).astype(np.float32)
@@ -128,7 +128,8 @@ class SparseMatMulTest(test.TestCase):
 
 class MatMulGradientTest(test.TestCase):
 
-  def _testGradients(self, tr_a, tr_b, sp_a, sp_b, a_dtype, b_dtype, name):
+  def _testGradients(self, tr_a, tr_b, sp_a, sp_b, a_dtype, b_dtype, delta,
+                     name):
     with self.test_session():
       a = constant_op.constant(
           RandMatrix(
@@ -151,12 +152,12 @@ class MatMulGradientTest(test.TestCase):
           a, [2, 3] if tr_a else [3, 2],
           m, [3, 4],
           x_init_value=a.eval(),
-          delta=1 / 64.) + gradient_checker.compute_gradient_error(
+          delta=delta) + gradient_checker.compute_gradient_error(
               b, [4, 2] if tr_b else [2, 4],
               m, [3, 4],
               x_init_value=b.eval(),
-              delta=1 / 64.))
-    self.assertLess(err, 1 / 128.)
+              delta=delta))
+    self.assertLess(err, delta / 2.)
 
   def testGradientInput(self):
     for tr_a in [True, False]:
@@ -165,9 +166,15 @@ class MatMulGradientTest(test.TestCase):
           for sp_b in [True, False]:
             for a_dtype in (dtypes.float32, dtypes.bfloat16):
               for b_dtype in (dtypes.float32, dtypes.bfloat16):
+                # Note: bfloat16 only has 7 mantissa bits, versus float32 with
+                # 10. Hence, we shift by 2 bits to pass the test.
+                if a_dtype == dtypes.bfloat16 and b_dtype == dtypes.bfloat16:
+                  delta = 1 / 16.
+                else:
+                  delta = 1 / 64.
                 name = "sparse_matmul_%s_%s_%s_%s" % (tr_a, tr_b, sp_a, sp_b)
                 self._testGradients(tr_a, tr_b, sp_a, sp_b, a_dtype, b_dtype,
-                                    name)
+                                    delta, name)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py
index 1ff1968055..b9f42f9eb2 100644
--- a/tensorflow/python/ops/gradient_checker.py
+++ b/tensorflow/python/ops/gradient_checker.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -151,6 +152,15 @@ def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
     and "y_size" columns where "x_size" is the number of elements in x and
     "y_size" is the number of elements in y.
   """
+  # bfloat16 doesn't have enough bits to represent high precision numbers such
+  # as delta. Convert to float32 here. Since numeric_jacobian is expected to
+  # be the groundtruth to compare against, it shouldn't lose any information.
+  if x.dtype == dtypes.bfloat16:
+    x = math_ops.cast(x, dtypes.float32)
+  if y.dtype == dtypes.bfloat16:
+    y = math_ops.cast(y, dtypes.float32)
+  if x_data.dtype == dtypes.bfloat16.as_numpy_dtype:
+    x_data = x_data.astype(np.float32)
 
   # To compute the jacobian, we treat x and y as one-dimensional vectors
   x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
@@ -206,8 +216,8 @@ def _compute_gradient(x,
                       extra_feed_dict=None):
   """Computes the theoretical and numerical jacobian."""
   t = dtypes.as_dtype(x.dtype)
-  allowed_types = [dtypes.float16, dtypes.float32, dtypes.float64,
-                   dtypes.complex64, dtypes.complex128]
+  allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32,
+                   dtypes.float64, dtypes.complex64, dtypes.complex128]
   assert t.base_dtype in allowed_types, "Don't support type %s for x" % t.name
   t2 = dtypes.as_dtype(y.dtype)
   assert t2.base_dtype in allowed_types, "Don't support type %s for y" % t2.name
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 6c5bdc661f..1c2720e973 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -2003,7 +2003,7 @@ def matmul(a,
       # matmul currently doesn't handle bfloat16 inputs.
       use_sparse_matmul = True
     if use_sparse_matmul:
-      return sparse_matmul(
+      ret = sparse_matmul(
           a,
           b,
           transpose_a=transpose_a,
@@ -2011,6 +2011,12 @@ def matmul(a,
           a_is_sparse=a_is_sparse,
           b_is_sparse=b_is_sparse,
           name=name)
+      # sparse_matmul always returns float32, even with
+      # bfloat16 inputs. This prevents us from configuring bfloat16 training.
+      # casting to bfloat16 also matches non-sparse matmul behavior better.
+      if a.dtype == dtypes.bfloat16 and b.dtype == dtypes.bfloat16:
+        ret = cast(ret, dtypes.bfloat16)
+      return ret
     else:
       return gen_math_ops._mat_mul(
           a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
-- 
GitLab


From 65aa9ee2500b0108d89f5fd1368ec3b73b273082 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 17:16:58 -0800
Subject: [PATCH 128/966] Fix comparison bug in remove_trivial_binary

PiperOrigin-RevId: 183167508
---
 .../toco/graph_transformations/remove_trivial_binary.cc   | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
index 8512e6bb5a..95a50c6179 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
@@ -89,14 +89,14 @@ bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
   const auto& constant_input_float_data =
       constant_input_array.GetBuffer<ArrayDataType::kFloat>().data;
   bool is_trivial = false;
-  if (binary_op->type != OperatorType::kAdd) {
+  if (binary_op->type == OperatorType::kAdd) {
     is_trivial = AreAllBufferElementsEqualTo(constant_input_float_data, 0.f);
-  } else if (binary_op->type != OperatorType::kSub) {
+  } else if (binary_op->type == OperatorType::kSub) {
     is_trivial = index_of_constant_input == 1 &&
                  AreAllBufferElementsEqualTo(constant_input_float_data, 0.f);
-  } else if (binary_op->type != OperatorType::kMul) {
+  } else if (binary_op->type == OperatorType::kMul) {
     is_trivial = AreAllBufferElementsEqualTo(constant_input_float_data, 1.f);
-  } else if (binary_op->type != OperatorType::kDiv) {
+  } else if (binary_op->type == OperatorType::kDiv) {
     is_trivial = index_of_constant_input == 1 &&
                  AreAllBufferElementsEqualTo(constant_input_float_data, 1.f);
   }
-- 
GitLab


From 10eab61afb2292b4b8ad82426b7650c77e483d7a Mon Sep 17 00:00:00 2001
From: Jerome <jvgonzalvo@up.edu.ph>
Date: Thu, 25 Jan 2018 09:21:51 +0800
Subject: [PATCH 129/966] Created dense_to_sparse in contrib.layers (#16119)

* Added ctc_loss_dense_labels. This does the conversion of dense labels into sparse ones to be passed into the core ctc_loss function.

* Removed constant_op from the import.

* Matched ctc_loss_dense_labels with the other layers ops.

* Added ctc_loss_dense_labels to contrib.layers __init__.py file

* Added missing comma to list of ops.

* Reordred arguments for ctc_loss_dense_labels

Labels should be first then inputs for ctc_loss.

* Removed ctc_loss_dense_labels.

Replaced it with dense_to_sparse instead so that there'll be only one ctc_loss function.

* Replaced ctc_loss_dense_labels with dense_to_sparse

* Fixed dense_to_sparse. Some of the names of the variables did not match with that of the parameters.

* Updated documentation for dense_to_sparse since it can accept a tensor of any shape.

* Added test case for dense_to_sparse.

* Updated documentation. Dense to sparse accepts int tensors.

* Fixed testDenseFromConstantToSparse.

The sparse_to_dense order of arguments in the test are wrong and the expected constant should be of int64.
---
 tensorflow/contrib/layers/__init__.py         |  1 +
 .../contrib/layers/python/layers/layers.py    | 27 ++++++++++++++++++-
 .../layers/python/layers/layers_test.py       | 12 +++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index 6c624929f2..ef419862b4 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -27,6 +27,7 @@ See the @{$python/contrib.layers} guide.
 @@convolution2d_transpose
 @@conv3d_transpose
 @@convolution3d_transpose
+@@dense_to_sparse
 @@dropout
 @@elu
 @@embedding_lookup_unique
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 7c52da7b49..c8e3307ee8 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.layers.python.layers import initializers
 from tensorflow.contrib.layers.python.layers import utils
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
@@ -58,7 +59,8 @@ __all__ = [
     'avg_pool2d', 'avg_pool3d', 'batch_norm', 'bias_add', 'conv2d', 'conv3d',
     'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution',
     'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose',
-    'convolution3d', 'convolution3d_transpose', 'dropout', 'elu', 'flatten',
+    'convolution3d', 'convolution3d_transpose', 'dense_to_sparse',
+           'dropout', 'elu', 'flatten',
     'fully_connected', 'GDN', 'gdn', 'layer_norm', 'linear', 'pool',
     'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat',
     'scale_gradient', 'separable_conv2d', 'separable_convolution2d', 'softmax',
@@ -1400,6 +1402,29 @@ def convolution3d_transpose(
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
 
+@add_arg_scope
+def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
+  """Converts a dense tensor into a sparse tensor.
+  An example use would be to convert dense labels to sparse ones
+  so that they can be fed to the ctc_loss.
+
+  Args:
+     tensor: An `int` `Tensor` to be converted to a `Sparse`.
+     eos_token: An integer.
+       It is part of the target label that signfies the end of a sentence.
+     outputs_collections: Collection to add the outputs.
+     scope: Optional scope for name_scope.
+  """
+  with variable_scope.variable_scope(
+      scope, 'dense_to_sparse', [tensor]) as sc:
+    tensor = ops.convert_to_tensor(tensor)
+    indices = array_ops.where(math_ops.not_equal(tensor, constant_op.constant(eos_token, tensor.dtype)))
+    values = array_ops.gather_nd(tensor, indices)
+    shape = array_ops.shape(tensor, out_type=dtypes.int64)
+    outputs = sparse_tensor.SparseTensor(indices, values, shape)
+    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+
+
 @add_arg_scope
 def dropout(inputs,
             keep_prob=0.5,
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index a9bdbe0138..c5790c7622 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
@@ -1292,6 +1293,17 @@ class ConvolutionInPlaneTest(test.TestCase):
       self.assertAllClose(result, expected, rtol=1e-5, atol=1e-5)
 
 
+class DenseToSparseTest(test.TestCase):
+
+  def testDenseFromConstantToSparse(self):
+    expected_constant = np.reshape(np.arange(24, dtype=np.int64), (3, 4, 2))
+    tensor = constant_op.constant(expected_constant)
+    sparse = _layers.dense_to_sparse(tensor)
+    dense = sparse_ops.sparse_to_dense(sparse.indices, sparse.dense_shape, sparse.values)
+    with self.test_session() as sess:
+      constant = sess.run(dense)
+      self.assertAllEqual(expected_constant, constant)
+
 class DropoutTest(test.TestCase):
 
   def testCreateDropout(self):
-- 
GitLab


From e2b93e6c45fc9e0745ec4b70a7ba0d1b86f42c94 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 24 Jan 2018 17:22:27 -0800
Subject: [PATCH 130/966] Fix sample_distorted_bounding_box where
 min_object_covered could be None (#15531)

* Fix sample_distorted_bounding_box where min_object_covered could be None

This fix tried to address the issue raised in 15529 where
not providing min_object_covered a value will result in
a ValueError. In the docstring, however, min_object_covered
has been described as default to 0.1.

The reason for the issue is that when sample_distorted_bounding_box
switched to V2, min_object_covered has been changed form an attr
to an input. As input could not have a default value,
min_object_covered=None will result in an error.

This fix adds the check so that a default value 0.1 will be
provided if min_object_covered=None.

This fix fixes 15529.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for min_object_covered=None with sample_distorted_bounding_box.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Move min_object_covered = 0.1 to the args

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update golden API

with
```
bazel-bin/tensorflow/tools/api/tests/api_compatibility_test
           --update_goldens True
```

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/image_ops_impl.py       |  2 +-
 tensorflow/python/ops/image_ops_test.py       | 19 +++++++++++++++++++
 .../tools/api/golden/tensorflow.image.pbtxt   |  2 +-
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 76da3bed31..9bd452155c 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1507,7 +1507,7 @@ def sample_distorted_bounding_box(image_size,
                                   bounding_boxes,
                                   seed=None,
                                   seed2=None,
-                                  min_object_covered=None,
+                                  min_object_covered=0.1,
                                   aspect_ratio_range=None,
                                   area_range=None,
                                   max_attempts=None,
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 80911ffe07..0c5ed2150d 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -1857,6 +1857,25 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
 
+  def testDefaultMinObjectCovered(self):
+    # By default min_object_covered=0.1 if not provided
+    with self.test_session(use_gpu=True):
+      image_size = constant_op.constant(
+          [40, 50, 1], shape=[3], dtype=dtypes.int32)
+      bounding_box = constant_op.constant(
+          [0.0, 0.0, 1.0, 1.0],
+          shape=[4],
+          dtype=dtypes.float32,)
+      begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
+          image_size=image_size,
+          bounding_boxes=bounding_box,
+          aspect_ratio_range=(0.75, 1.33),
+          area_range=(0.05, 1.0))
+
+      self.assertAllEqual([3], begin.get_shape().as_list())
+      self.assertAllEqual([3], end.get_shape().as_list())
+      self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
+
 
 class ResizeImagesTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index f32353c957..441621a2a0 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -174,7 +174,7 @@ tf_module {
   }
   member_method {
     name: "sample_distorted_bounding_box"
-    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "total_variation"
-- 
GitLab


From 6743031da633d1ce284a606c49eb00e793c1d729 Mon Sep 17 00:00:00 2001
From: Mohamed Aly <mohamedadaly@users.noreply.github.com>
Date: Wed, 24 Jan 2018 17:23:10 -0800
Subject: [PATCH 131/966] Changes to fix a bug in ResolveConstantConcat whereby
 shared tensors are removed without checking if they are used in other
 operators in the graph (#16012)

---
 .../toco/graph_transformations/graph_transformations.cc    | 1 +
 .../resolve_constant_concatenation.cc                      | 5 ++++-
 tensorflow/contrib/lite/toco/tooling_util.cc               | 7 +++++--
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
index 2340f0e850..6961e23690 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
@@ -132,6 +132,7 @@ bool GraphTransformationsPass(int increment, Model* model,
   CHECK(increment == 1 || increment == -1);
   bool changed = false;
   if (model->operators.empty()) {
+    LOG(INFO) << "Model is empty!!!";
     return false;
   }
   int op_index = increment == 1 ? 0 : model->operators.size() - 1;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index 833c97c758..5ac449749a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -189,7 +189,10 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
 
   // Remove all the resolved arrays.
   for (const string& input_name : concat_op->inputs) {
-    model->EraseArray(input_name);
+    // Check to prevent removal of shared tensors
+    if(CountOpsWithInput(*model, input_name) == 1) {
+      model->EraseArray(input_name);
+    }
   }
 
   // Remove concatenate operator
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 8543ba4742..69187bb142 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -652,10 +652,13 @@ void CheckNonExistentIOArrays(const Model& model) {
 void CheckNoMissingArray(const Model& model) {
   for (const auto& op : model.operators) {
     for (const auto& input : op->inputs) {
-      CHECK(model.HasArray(input) || model.optional_arrays.count(input));
+      CHECK(model.HasArray(input) || model.optional_arrays.count(input))
+       << "Input: " << input << " missing for op: " 
+       << op->outputs[0] << ".";
     }
     for (const auto& output : op->outputs) {
-      CHECK(model.HasArray(output));
+      CHECK(model.HasArray(output)) << "Output: " << output 
+        << " missing.";
     }
   }
   CheckNonExistentIOArrays(model);
-- 
GitLab


From 7c4f482a851d12a3b0187bbf31db65ff6b7a7ad3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 17:18:47 -0800
Subject: [PATCH 132/966] Add copy method to EagerVariableStore.

EagerVariableStore.copy creates a new EagerVariableStore instance containing new variables so that they can be modified without affecting the variables in the old store.

PiperOrigin-RevId: 183167776
---
 .../kernel_tests/variable_scope_test.py       | 24 ++++++++++++++
 tensorflow/python/ops/variable_scope.py       | 31 +++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 238d4b58d5..8527f116f9 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -131,6 +131,30 @@ class VariableScopeTest(test.TestCase):
       self.assertFalse(v in store.non_trainable_variables())
       self.assertTrue(w in store.non_trainable_variables())
 
+      # Test copying.
+      new_store = store.copy()
+      with new_store.as_default():
+        new_v = variable_scope.get_variable("v")
+        new_w = variable_scope.get_variable("w")
+      self.assertEqual(new_v.numpy(), v.numpy())
+      self.assertEqual(new_w.numpy(), w.numpy())
+      self.assertTrue(new_v in new_store.variables())
+      self.assertTrue(new_w in new_store.variables())
+      self.assertTrue(new_v in new_store.trainable_variables())
+      self.assertFalse(new_w in new_store.trainable_variables())
+      self.assertFalse(new_v in new_store.non_trainable_variables())
+      self.assertTrue(new_w in new_store.non_trainable_variables())
+
+      # Check that variables are separate instances.
+      for v in store.variables():
+        v.assign(-1)
+      for v in new_store.variables():
+        v.assign(1)
+      for v in store.variables():
+        self.assertEqual(v.numpy(), -1)
+      for v in new_store.variables():
+        self.assertEqual(v.numpy(), 1)
+
   @test_util.run_in_graph_and_eager_modes()
   def testInitFromNonTensorValue(self):
     v = variable_scope.get_variable("v4", initializer=4, dtype=dtypes.int32)
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index c52d5fff5d..db594ac6a0 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -27,6 +27,7 @@ import sys
 import traceback
 
 import six
+from six import iteritems
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
@@ -1242,6 +1243,36 @@ class EagerVariableStore(object):
                   key=lambda x: x.name)
     # pylint: enable=protected-access
 
+  def copy(self):
+    """Copy this variable store and all of its contents.
+
+    Variables contained in this store will be copied over to the new variable
+    store, meaning that they can be modified without affecting the variables in
+    this store.
+
+    Returns:
+      A new EagerVariableStore instance containing copied variables.
+    """
+    # pylint: disable=protected-access
+    new_store = EagerVariableStore()
+    for key, var in iteritems(self._store._vars):
+      # Strip device out of variable name.
+      try:
+        index = var.name.index(":")
+      except ValueError:
+        stripped_var_name = var.name
+      else:
+        stripped_var_name = var.name[:index]
+
+      # Create new variable with same value, name, and "trainable" flag.
+      new_var = resource_variable_ops.ResourceVariable(
+          var.read_value(),
+          name=stripped_var_name,
+          trainable=var._trainable)
+      new_store._store._vars[key] = new_var
+    return new_store
+    # pylint: enable=protected-access
+
 
 @tf_export("get_variable")
 def get_variable(name,
-- 
GitLab


From 0412e0946bdd2765d5c3dba0cc9b12b8650f564a Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Wed, 24 Jan 2018 17:24:50 -0800
Subject: [PATCH 133/966] Add R3 and R5 tests to select and scatter.

- Added evaluator support so that we can test S&S in arbitrary dimensions.

RELNOTES: n/a
PiperOrigin-RevId: 183168473
---
 .../compiler/xla/service/hlo_evaluator.cc     | 189 +++++++++++++++---
 tensorflow/compiler/xla/tests/BUILD           |   1 +
 .../xla/tests/select_and_scatter_test.cc      | 189 +++++++++++-------
 3 files changed, 276 insertions(+), 103 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 2112cf57c7..e3f5c17e35 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -43,6 +43,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -166,6 +167,34 @@ StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOpImpl(
   return std::move(result);
 }
 
+// For one particular placement of a window in a base shape (the placement is
+// represented as `window_count_index`), iterates inside the window. Translates
+// the window index into base index. If the base index is within bound, call `f`
+// with the base index.
+void IterateThroughWindow(
+    const Shape& window_shape, const Window& window, const Shape& base_shape,
+    const tensorflow::gtl::ArraySlice<int64>& window_count_index,
+    const std::function<void(const std::vector<int64>&)>& f) {
+  const int64 rank = ShapeUtil::Rank(base_shape);
+  DimensionVector window_index(rank);
+  std::fill(window_index.begin(), window_index.end(), 0);
+  do {
+    std::vector<int64> base_index(rank);
+    bool out_of_bound = false;
+    for (int64 i = 0; i < rank; ++i) {
+      base_index[i] = window_count_index[i] * window.dimensions(i).stride() +
+                      window_index[i] - window.dimensions(i).padding_low();
+      if (base_index[i] < 0 || base_index[i] >= base_shape.dimensions(i)) {
+        out_of_bound = true;
+        break;
+      }
+    }
+    if (!out_of_bound) {
+      f(base_index);
+    }
+  } while (IndexUtil::BumpIndices(window_shape, &window_index));
+}
+
 }  // namespace
 
 template <typename ReturnT, typename ElementwiseT>
@@ -1420,6 +1449,111 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override {
+    auto operand = select_and_scatter->operand(0);
+    auto source = select_and_scatter->operand(1);
+    const Window& window = select_and_scatter->window();
+
+    const Literal& init_literal =
+        parent_->GetEvaluatedLiteralFor(select_and_scatter->operand(2));
+    TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape()));
+    auto init_scalar = init_literal.Get<ReturnT>({});
+
+    auto result = Literal::CreateFromShape(select_and_scatter->shape());
+
+    // Initialize result array with the init value.
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> output_index) {
+          return init_scalar;
+        }));
+
+    std::vector<int64> window_dimension_sizes;
+    for (const auto& window_dimension : window.dimensions()) {
+      window_dimension_sizes.push_back(window_dimension.size());
+    }
+    const Shape window_shape = ShapeUtil::MakeShape(
+        operand->shape().element_type(), window_dimension_sizes);
+
+    HloComputation* select = select_and_scatter->select();
+    HloComputation* scatter = select_and_scatter->scatter();
+
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+    const Literal& source_literal = parent_->GetEvaluatedLiteralFor(source);
+
+    int64 rank = ShapeUtil::Rank(operand_literal.shape());
+
+    HloEvaluator embedded_evaluator;
+    DimensionVector source_index(rank);
+
+    std::fill(source_index.begin(), source_index.end(), 0);
+    do {
+      // For each element in `source`, we place a window in `operand`. For each
+      // window placement, we iterate inside the window twice:
+      //
+      // 1. Find the selected index by applying `select` function to all
+      // elements. E.g., If the `select` function is GreaterEqual, the first
+      // iteration through the window finds the biggest value and returns its
+      // index.
+      //
+      // 2. Using the selected index, scatter value from `source` to result. We
+      // do this by iterating through the window, and compare each index with
+      // the selected index.
+      tensorflow::gtl::optional<ReturnT> selected_val;
+      tensorflow::gtl::optional<std::vector<int64>> selected_index;
+
+      IterateThroughWindow(
+          window_shape, window, operand_literal.shape(), source_index,
+          [&](const std::vector<int64>& operand_index) {
+            auto curr_val = operand_literal.Get<ReturnT>(operand_index);
+            if (!selected_val) {
+              selected_val = curr_val;
+              selected_index = operand_index;
+            }
+            const auto curr_val_literal = Literal::CreateR0<ReturnT>(curr_val);
+            const auto selected_val_literal =
+                Literal::CreateR0<ReturnT>(*selected_val);
+
+            const std::vector<const Literal*> args = {
+                curr_val_literal.get(), selected_val_literal.get()};
+            std::unique_ptr<Literal> computed_result =
+                embedded_evaluator.Evaluate<const Literal*>(*select, args)
+                    .ConsumeValueOrDie();
+            bool selected = computed_result->Get<bool>({});
+            if (selected) {
+              selected_val = curr_val;
+              selected_index = operand_index;
+            }
+            embedded_evaluator.ResetVisitStates();
+          });
+
+      IterateThroughWindow(
+          window_shape, window, operand_literal.shape(), source_index,
+          [&](const std::vector<int64>& operand_index) {
+            if (std::equal(operand_index.begin(), operand_index.end(),
+                           selected_index->begin())) {
+              auto source = source_literal.Get<ReturnT>(source_index);
+              auto scattered = result->Get<ReturnT>(operand_index);
+              const auto source_literal = Literal::CreateR0<ReturnT>(source);
+              const auto scattered_literal =
+                  Literal::CreateR0<ReturnT>(scattered);
+
+              const std::vector<const Literal*> args = {
+                  source_literal.get(), scattered_literal.get()};
+              std::unique_ptr<Literal> computed_result =
+                  embedded_evaluator.Evaluate<const Literal*>(*scatter, args)
+                      .ConsumeValueOrDie();
+              result->Set(operand_index, computed_result->Get<ReturnT>({}));
+              // Clear visit states so that the we can use the evaluator again
+              // on the same computation.
+              embedded_evaluator.ResetVisitStates();
+            }
+          });
+    } while (IndexUtil::BumpIndices(source->shape(), &source_index));
+
+    parent_->evaluated_[select_and_scatter] = std::move(result);
+    return Status::OK();
+  }
+
   Status HandleReduceWindow(HloInstruction* reduce_window) override {
     auto operand = reduce_window->operand(0);
     const Window& window = reduce_window->window();
@@ -1468,39 +1602,28 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
           std::fill(window_index.begin(), window_index.end(), 0);
           std::fill(operand_index.begin(), operand_index.end(), 0);
 
-          do {
-            bool out_of_bound = false;
-            for (int i = 0; i < operand_index.size(); ++i) {
-              operand_index[i] =
-                  output_index[i] * window.dimensions(i).stride() +
-                  window_index[i] - window.dimensions(i).padding_low();
-              if (operand_index[i] < 0 ||
-                  operand_index[i] >= operand_literal.shape().dimensions(i)) {
-                out_of_bound = true;
-                break;
-              }
-            }
-            if (!out_of_bound) {
-              auto curr_val = operand_literal.Get<ReturnT>(operand_index);
-
-              // Evaluate computation with specified literal operands.
-              const auto curr_val_literal =
-                  Literal::CreateR0<ReturnT>(curr_val);
-              const auto result_val_literal =
-                  Literal::CreateR0<ReturnT>(result_val);
-              const std::vector<const Literal*> args = {
-                  curr_val_literal.get(), result_val_literal.get()};
-              std::unique_ptr<Literal> computed_result =
-                  embedded_evaluator.Evaluate<const Literal*>(*function, args)
-                      .ConsumeValueOrDie();
-
-              // Clear visit states so that the we can use the evaluate again on
-              // the same computation.
-              embedded_evaluator.ResetVisitStates();
-
-              result_val = computed_result->Get<ReturnT>({});
-            }
-          } while (IndexUtil::BumpIndices(window_shape, &window_index));
+          IterateThroughWindow(
+              window_shape, window, operand_literal.shape(), output_index,
+              [&](const std::vector<int64>& operand_index) {
+                auto curr_val = operand_literal.Get<ReturnT>(operand_index);
+
+                // Evaluate computation with specified literal operands.
+                const auto curr_val_literal =
+                    Literal::CreateR0<ReturnT>(curr_val);
+                const auto result_val_literal =
+                    Literal::CreateR0<ReturnT>(result_val);
+                const std::vector<const Literal*> args = {
+                    curr_val_literal.get(), result_val_literal.get()};
+                std::unique_ptr<Literal> computed_result =
+                    embedded_evaluator.Evaluate<const Literal*>(*function, args)
+                        .ConsumeValueOrDie();
+
+                // Clear visit states so that the we can use the evaluate again
+                // on the same computation.
+                embedded_evaluator.ResetVisitStates();
+
+                result_val = computed_result->Get<ReturnT>({});
+              });
 
           return result_val;
         }));
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index bc15bd9593..3afd52b6b2 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1034,6 +1034,7 @@ xla_test(
     name = "select_and_scatter_test",
     timeout = "long",
     srcs = ["select_and_scatter_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index 62ff349e9c..9ee94b8571 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -39,8 +39,8 @@ namespace xla {
 namespace {
 
 struct SelectAndScatterTestParam {
-  Array4D<float> operand_shape;
-  Array4D<float> source_shape;
+  std::vector<int64> operand_shape;
+  std::vector<int64> source_shape;
   Padding padding_type;
   tensorflow::gtl::ArraySlice<int64> window_dimensions;
   tensorflow::gtl::ArraySlice<int64> window_strides;
@@ -69,83 +69,132 @@ class SelectAndScatterTest
   Computation min_f32_;
 };
 
-XLA_TEST_P(SelectAndScatterTest, R4Randomized) {
-  Array4D<float> o(GetParam().operand_shape);
+XLA_TEST_P(SelectAndScatterTest, ParamTest) {
+  auto operand_shape = GetParam().operand_shape;
+  Array<float> o(operand_shape);
   o.FillRandom(1.5f);
-  auto operand = builder_.ConstantR4FromArray4D(o);
+  auto operand = builder_.ConstantFromArray(o);
 
-  Array4D<float> s(GetParam().source_shape);
+  auto source_shape = GetParam().source_shape;
+  Array<float> s(source_shape);
   s.FillRandom(12.0f);
-  auto source = builder_.ConstantR4FromArray4D(s);
-
-  builder_.SelectAndScatter(operand, ge_f32_, GetParam().window_dimensions,
-                            GetParam().window_strides, GetParam().padding_type,
-                            source, builder_.ConstantR0<float>(0.0f), add_f32_);
+  auto source = builder_.ConstantFromArray(s);
 
-  auto e = ReferenceUtil::SelectAndScatter4DGePlus(
-      o, s, 0.0f, GetParam().window_dimensions, GetParam().window_strides,
-      GetParam().padding_type == Padding::kSame);
+  auto select_and_scatter = builder_.SelectAndScatter(
+      operand, ge_f32_, GetParam().window_dimensions, GetParam().window_strides,
+      GetParam().padding_type, source, builder_.ConstantR0<float>(0.0f),
+      add_f32_);
 
-  ComputeAndCompareR4<float>(&builder_, *e, {}, ErrorSpec(1e-5));
+  ComputeAndCompare(&builder_, select_and_scatter, {}, ErrorSpec(1e-5));
 }
 
 INSTANTIATE_TEST_CASE_P(
     SelectAndScatterTest_Instantiation, SelectAndScatterTest,
-    ::testing::Values(SelectAndScatterTestParam{{6, 6, 256, 128},
-                                                {3, 3, 256, 128},
-                                                Padding::kSame,
-                                                {3, 3, 1, 1},
-                                                {2, 2, 1, 1}},
-                      SelectAndScatterTestParam{{7, 7, 256, 128},
-                                                {3, 3, 256, 128},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {2, 2, 1, 1}},
-                      SelectAndScatterTestParam{{6, 7, 256, 128},
-                                                {3, 3, 256, 128},
-                                                Padding::kValid,
-                                                {2, 3, 1, 1},
-                                                {2, 2, 1, 1}},
-                      SelectAndScatterTestParam{{6, 7, 256, 128},
-                                                {2, 3, 256, 128},
-                                                Padding::kValid,
-                                                {2, 3, 1, 1},
-                                                {3, 2, 1, 1}},
-                      SelectAndScatterTestParam{{9, 9, 16, 128},
-                                                {3, 3, 16, 128},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{3, 3, 4, 4},
-                                                {1, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{3, 3, 4, 4},
-                                                {1, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{9, 3, 4, 4},
-                                                {3, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{7, 3, 4, 4},
-                                                {3, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {2, 3, 1, 1}},
-                      SelectAndScatterTestParam{{1, 1, 5, 5},
-                                                {1, 1, 5, 5},
-                                                Padding::kSame,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{7, 7, 8, 256},
-                                                {4, 4, 8, 256},
-                                                Padding::kSame,
-                                                {2, 2, 1, 1},
-                                                {2, 2, 1, 1}}));
+    ::testing::Values(
+        SelectAndScatterTestParam{{6, 6, 6, 4, 4},
+                                  {3, 3, 3, 4, 4},
+                                  Padding::kSame,
+                                  {3, 3, 3, 1, 1},
+                                  {2, 2, 2, 1, 1}},
+        SelectAndScatterTestParam{{7, 7, 7, 4, 4},
+                                  {3, 3, 3, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 3, 1, 1},
+                                  {2, 2, 2, 1, 1}},
+
+        SelectAndScatterTestParam{{8, 8, 8, 4, 4},
+                                  {1, 3, 3, 4, 4},
+                                  Padding::kValid,
+                                  {8, 4, 4, 1, 1},
+                                  {1, 2, 2, 1, 1}},
+        SelectAndScatterTestParam{{6, 6, 256, 128},
+                                  {3, 3, 256, 128},
+                                  Padding::kSame,
+                                  {3, 3, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{{7, 7, 256, 128},
+                                  {3, 3, 256, 128},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{{6, 7, 256, 128},
+                                  {3, 3, 256, 128},
+                                  Padding::kValid,
+                                  {2, 3, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{{6, 7, 256, 128},
+                                  {2, 3, 256, 128},
+                                  Padding::kValid,
+                                  {2, 3, 1, 1},
+                                  {3, 2, 1, 1}},
+        SelectAndScatterTestParam{{9, 9, 16, 128},
+                                  {3, 3, 16, 128},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{3, 3, 4, 4},
+                                  {1, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{3, 3, 4, 4},
+                                  {1, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{9, 3, 4, 4},
+                                  {3, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{7, 3, 4, 4},
+                                  {3, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {2, 3, 1, 1}},
+        SelectAndScatterTestParam{{1, 1, 5, 5},
+                                  {1, 1, 5, 5},
+                                  Padding::kSame,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{7, 7, 8, 256},
+                                  {4, 4, 8, 256},
+                                  Padding::kSame,
+                                  {2, 2, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{
+            {6, 4, 4}, {3, 4, 4}, Padding::kSame, {3, 1, 1}, {2, 1, 1}},
+        SelectAndScatterTestParam{
+            {6, 256, 128}, {3, 256, 128}, Padding::kSame, {3, 1, 1}, {2, 1, 1}},
+        SelectAndScatterTestParam{{7, 256, 128},
+                                  {3, 256, 128},
+                                  Padding::kValid,
+                                  {3, 1, 1},
+                                  {2, 1, 1}},
+        SelectAndScatterTestParam{{6, 256, 128},
+                                  {3, 256, 128},
+                                  Padding::kValid,
+                                  {2, 1, 1},
+                                  {2, 1, 1}},
+        SelectAndScatterTestParam{{6, 256, 128},
+                                  {2, 256, 128},
+                                  Padding::kValid,
+                                  {2, 1, 1},
+                                  {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {9, 16, 128}, {3, 16, 128}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {3, 4, 4}, {1, 4, 4}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {3, 4, 4}, {1, 4, 4}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {9, 4, 4}, {3, 4, 4}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {7, 4, 4}, {3, 4, 4}, Padding::kValid, {3, 1, 1}, {2, 1, 1}},
+        SelectAndScatterTestParam{
+            {1, 5, 5}, {1, 5, 5}, Padding::kSame, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {7, 8, 256}, {4, 8, 256}, Padding::kSame, {2, 1, 1}, {2, 1, 1}}));
 
 // Test for F32 1D array, with a zero-element input.
 XLA_TEST_F(SelectAndScatterTest, R1S0F32) {
-- 
GitLab


From 8a5b0f457c89e3ae77f67628654c1b64c4e65000 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 24 Jan 2018 17:49:28 -0800
Subject: [PATCH 134/966] Make resource_variable_ops_test.py work with the C
 API enabled.

PiperOrigin-RevId: 183171341
---
 tensorflow/python/kernel_tests/resource_variable_ops_test.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 7b131a5b8c..b4b555591d 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
 
   def tearDown(self):
@@ -342,14 +343,14 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v = resource_variable_ops.ResourceVariable(
           2.0, caching_device="/job:localhost")
       self.assertEqual("/job:localhost", v.value().device)
-      with self.assertRaisesRegexp(ValueError, "No attr named '_class'"):
+      with self.assertRaises(ValueError):
         _ = v.value().op.get_attr("_class")
 
     with ops.colocate_with(v.op):
       w = resource_variable_ops.ResourceVariable(
           2.0, caching_device="/job:localhost")
       self.assertEqual("/job:localhost", w.value().device)
-      with self.assertRaisesRegexp(ValueError, "No attr named '_class'"):
+      with self.assertRaises(ValueError):
         _ = w.value().op.get_attr("_class")
 
   def testSharedName(self):
-- 
GitLab


From 2c3205a96c50e2488815b6e8dd8d8c18dca5d431 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 14 Dec 2017 11:18:05 -0800
Subject: [PATCH 135/966] Enable bfloat16 tests and add a filter for currently
 failed tests.

PiperOrigin-RevId: 179069257
---
 tensorflow/compiler/tests/binary_ops_test.py  |   2 +-
 .../compiler/tests/tensor_array_ops_test.py   |   3 +-
 tensorflow/compiler/tests/unary_ops_test.py   |   3 +-
 tensorflow/compiler/tests/xla_test.py         | 105 ++++++++++++++----
 tensorflow/compiler/tf2xla/lib/util.cc        |   2 +-
 tensorflow/compiler/tf2xla/lib/util.h         |   2 +-
 tensorflow/compiler/tf2xla/xla_op_registry.h  |   8 +-
 tensorflow/core/kernels/split_op.cc           |   2 +
 tensorflow/python/framework/test_util.py      |   6 +
 9 files changed, 100 insertions(+), 33 deletions(-)

diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 654dc15e86..905dd9fc7b 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -547,7 +547,7 @@ class BinaryOpsTest(XLATestCase):
       self._testDivision(dtype)
 
   def testFloatDivision(self):
-    for dtype in self.float_types + self.complex_types:
+    for dtype in self.float_types | self.complex_types:
       self._testDivision(dtype)
 
   def _testRemainder(self, dtype):
diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py
index ac039e0162..a62925a181 100644
--- a/tensorflow/compiler/tests/tensor_array_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_array_ops_test.py
@@ -330,8 +330,7 @@ class TensorArrayTest(xla_test.XLATestCase):
     # Find two different floating point types, create an array of
     # the first type, but try to read the other type.
     if len(self.float_types) > 1:
-      dtype1 = self.float_types[0]
-      dtype2 = self.float_types[1]
+      dtype1, dtype2 = list(self.float_types)[:2]
       with self.test_session(), self.test_scope():
         ta = tensor_array_ops.TensorArray(
             dtype=dtype1, tensor_array_name="foo", size=3)
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 0da7442a24..b0623c0fbc 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -573,7 +573,8 @@ class UnaryOpsTest(XLATestCase):
 
   def testCast(self):
     shapes = [[], [4], [2, 3], [2, 0, 4]]
-    types = [dtypes.bool, dtypes.int32, dtypes.float32] + self.complex_tf_types
+    types = (set([dtypes.bool, dtypes.int32, dtypes.float32]) |
+             self.complex_tf_types)
     for shape in shapes:
       for src_type in types:
         for dst_type in types:
diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py
index 0be127997e..7e1f5c76ed 100644
--- a/tensorflow/compiler/tests/xla_test.py
+++ b/tensorflow/compiler/tests/xla_test.py
@@ -53,41 +53,100 @@ class XLATestCase(test.TestCase):
     super(XLATestCase, self).__init__(method_name)
     self.device = FLAGS.test_device
     self.has_custom_call = (self.device == 'XLA_CPU')
-    self.all_tf_types = [
+    self._all_tf_types = set([
         dtypes.as_dtype(types_pb2.DataType.Value(name))
         for name in FLAGS.types.split(',')
-    ]
-    self.int_tf_types = [
-        dtype for dtype in self.all_tf_types if dtype.is_integer
-    ]
-    self.float_tf_types = [
-        dtype for dtype in self.all_tf_types if dtype.is_floating
-    ]
-    self.complex_tf_types = [
-        dtype for dtype in self.all_tf_types if dtype.is_complex
-    ]
-    self.numeric_tf_types = (
-        self.int_tf_types + self.float_tf_types + self.complex_tf_types)
-
-    self.all_types = [dtype.as_numpy_dtype for dtype in self.all_tf_types]
-    self.int_types = [dtype.as_numpy_dtype for dtype in self.int_tf_types]
-    self.float_types = [dtype.as_numpy_dtype for dtype in self.float_tf_types]
-    self.complex_types = [
+    ])
+    self.int_tf_types = set([
+        dtype for dtype in self._all_tf_types if dtype.is_integer
+    ])
+    self._float_tf_types = set([
+        dtype for dtype in self._all_tf_types if dtype.is_floating
+    ])
+    self.complex_tf_types = set([
+        dtype for dtype in self._all_tf_types if dtype.is_complex
+    ])
+    self._numeric_tf_types = set(
+        self.int_tf_types | self._float_tf_types | self.complex_tf_types)
+
+    self._all_types = set(
+        [dtype.as_numpy_dtype for dtype in self._all_tf_types])
+    self.int_types = set([dtype.as_numpy_dtype for dtype in self.int_tf_types])
+    self._float_types = set(
+        [dtype.as_numpy_dtype for dtype in self._float_tf_types])
+    self.complex_types = set([
         dtype.as_numpy_dtype for dtype in self.complex_tf_types
-    ]
-    self.numeric_types = self.int_types + self.float_types + self.complex_types
+    ])
+    self._numeric_types = set(
+        self.int_types | self._float_types | self.complex_types)
 
     # Parse the manifest file, if any, into a regex identifying tests to
     # disable
     self.disabled_regex = None
+    self._method_types_filter = dict()
+    # TODO(xpan): Make it text proto if it doesn't scale.
+    # Each line of the manifest file specifies an entry. The entry can be
+    # 1) TestNameRegex  // E.g. CumprodTest.* Or
+    # 2) TestName TypeName  // E.g. AdamOptimizerTest.testSharing DT_BFLOAT16
+    # The 1) disables the entire test. While 2) only filter some numeric types
+    # so that they are not used in those tests.
+
     if FLAGS.disabled_manifest is not None:
       comments_re = re.compile('#.*$')
       manifest_file = open(FLAGS.disabled_manifest, 'r')
-      lines = manifest_file.read().splitlines()
-      lines = [comments_re.sub('', l).strip() for l in lines]
-      self.disabled_regex = re.compile('|'.join(lines))
+      disabled_tests = []
+      disabled_method_types = []
+      for l in manifest_file.read().splitlines():
+        entry = comments_re.sub('', l).strip().split(' ')
+        if len(entry) == 1:
+          disabled_tests.append(entry[0])
+        elif len(entry) == 2:
+          disabled_method_types.append(
+              (entry[0], entry[1].strip().split(',')))
+        else:
+          raise ValueError('Bad entry in manifest file.')
+
+      self.disabled_regex = re.compile('|'.join(disabled_tests))
+      for method, types in disabled_method_types:
+        self._method_types_filter[method] = set([
+            dtypes.as_dtype(types_pb2.DataType.Value(name)).as_numpy_dtype
+            for name in types])
       manifest_file.close()
 
+  @property
+  def all_tf_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    tf_types = set([dtypes.as_dtype(t)
+                    for t in self._method_types_filter.get(name, set())])
+    return self._all_tf_types - tf_types
+
+  @property
+  def float_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._float_types - self._method_types_filter.get(name, set())
+
+  @property
+  def float_tf_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._float_tf_types - self._method_types_filter.get(name, set())
+
+  @property
+  def numeric_tf_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    tf_types = set([dtypes.as_dtype(t)
+                    for t in self._method_types_filter.get(name, set())])
+    return self._numeric_tf_types - tf_types
+
+  @property
+  def numeric_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._numeric_types - self._method_types_filter.get(name, set())
+
+  @property
+  def all_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._all_types - self._method_types_filter.get(name, set())
+
   def setUp(self):
     super(XLATestCase, self).setUp()
     name = '{}.{}'.format(type(self).__name__, self._testMethodName)
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index 943248aedb..ce24b61b5d 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -28,7 +28,7 @@ limitations under the License.
 namespace tensorflow {
 
 xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder,
-                                 xla::Shape& shape) {
+                                 const xla::Shape& shape) {
   return builder->Broadcast(
       builder->ConstantLiteral(xla::Literal::Zero(shape.element_type())),
       xla::AsInt64Slice(shape.dimensions()));
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index 8fba6b5cf2..fb138b4f73 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -25,7 +25,7 @@ namespace tensorflow {
 
 // Returns a zero-filled tensor with shape `shape`.
 xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder,
-                                 xla::Shape& shape);
+                                 const xla::Shape& shape);
 
 // Returns a floating point scalar constant of 'type' with 'value'.
 // If 'type' is complex, returns a real value with zero imaginary component.
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index 2959d2ab69..8bfd9758f7 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -45,11 +45,11 @@ extern const char* const DEVICE_GPU_XLA_JIT;  // "GPU_XLA_JIT"
 extern const char* const DEVICE_XLA_CPU;
 extern const char* const DEVICE_XLA_GPU;
 
-constexpr std::array<DataType, 3> kFloatTypes = {
-    {DT_HALF, DT_FLOAT, DT_DOUBLE}};
-constexpr std::array<DataType, 8> kNumericTypes = {
+constexpr std::array<DataType, 4> kFloatTypes = {
+    {DT_HALF, DT_FLOAT, DT_DOUBLE, DT_BFLOAT16}};
+constexpr std::array<DataType, 9> kNumericTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
-     DT_COMPLEX64}};
+     DT_COMPLEX64, DT_BFLOAT16}};
 
 constexpr std::array<DataType, 8> kCpuAllTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE,
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index 58e1a73be6..094ba8bb86 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -360,6 +360,8 @@ class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
 
 TF_CALL_ALL_TYPES(REGISTER_SPLIT);
 REGISTER_SPLIT(quint8);
+// TODO(xpan): Merge bfloat16 into TF_CALL_ALL_TYPES
+REGISTER_SPLIT(bfloat16);
 
 #undef REGISTER_SPLIT
 
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 26904aadab..644e3bb515 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -51,6 +51,7 @@ from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
@@ -1168,6 +1169,7 @@ class TensorFlowTestCase(googletest.TestCase):
     """
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
+    # types with lower tol are put later to overwrite previous ones.
     if (a.dtype == np.float32 or b.dtype == np.float32 or
         a.dtype == np.complex64 or b.dtype == np.complex64):
       rtol = max(rtol, float_rtol)
@@ -1175,6 +1177,10 @@ class TensorFlowTestCase(googletest.TestCase):
     if a.dtype == np.float16 or b.dtype == np.float16:
       rtol = max(rtol, half_rtol)
       atol = max(atol, half_atol)
+    if (a.dtype == dtypes.bfloat16.as_numpy_dtype or
+        b.dtype == dtypes.bfloat16.as_numpy_dtype):
+      rtol = max(rtol, half_rtol)
+      atol = max(atol, half_atol)
 
     self.assertAllClose(a, b, rtol=rtol, atol=atol)
 
-- 
GitLab


From f76387a10f5188b059bc13223c5e9040a3bb7143 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 24 Jan 2018 17:52:19 -0800
Subject: [PATCH 136/966] Make batch_sequences_with_states_test.py work with C
 API enabled.

PiperOrigin-RevId: 183171572
---
 .../batch_sequences_with_states_test.py       | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index 2a0ef0e6b3..04538405e4 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -320,6 +320,18 @@ class BatchSequencesWithStatesTest(test.TestCase):
   def testNotAMultiple(self):
     num_unroll = 3  # Not a divisor of value_length -
     # so padding would have been necessary.
+
+    # Use placeholder_with_default in sequences to make sure we get runtime
+    # error instead of shape inference error
+    sequences = {
+        "seq1": array_ops.placeholder_with_default(self.sequences["seq1"],
+                                                   shape=(None, 5)),
+        "seq2": array_ops.placeholder_with_default(self.sequences["seq2"],
+                                                   shape=(None, 4, 2)),
+        "seq3": self.sequences["seq3"],
+        "seq4": self.sequences["seq4"],
+    }
+
     with self.test_session() as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    ".*should be a multiple of: 3, but saw "
@@ -330,7 +342,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
           with coord.stop_on_exception():
             next_batch = sqss.batch_sequences_with_states(
                 input_key=self.key,
-                input_sequences=self.sequences,
+                input_sequences=sequences,
                 input_context=self.context,
                 input_length=3,
                 initial_states=self.initial_states,
@@ -493,6 +505,18 @@ class BatchSequencesWithStatesTest(test.TestCase):
         expected_seq4_batch2=expected_seq4_batch2)
 
 
+class BatchSequencesWithStatesTestWithCApi(BatchSequencesWithStatesTest):
+
+  def setUp(self):
+    self._prev_value = ops._USE_C_API
+    ops._USE_C_API = True
+    super(BatchSequencesWithStatesTestWithCApi, self).setUp()
+
+  def tearDown(self):
+    super(BatchSequencesWithStatesTestWithCApi, self).tearDown()
+    ops._USE_C_API = self._prev_value
+
+
 class PaddingTest(test.TestCase):
 
   def testPaddingInvalidLengths(self):
-- 
GitLab


From 4aacd356fe7354b044d7c5787fb2366219294658 Mon Sep 17 00:00:00 2001
From: Ian Langmore <langmore@google.com>
Date: Wed, 24 Jan 2018 17:56:55 -0800
Subject: [PATCH 137/966] VISIBILITY_FIX:  Add 'auto_correlation' to
 _allowed_symbols.

PiperOrigin-RevId: 183171955
---
 tensorflow/contrib/distributions/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 59cc5eae06..60a187e541 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -85,6 +85,7 @@ from tensorflow.python.ops.distributions.uniform import *
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
+    'auto_correlation',
     'bijectors',
     'Cauchy',
     'ConditionalDistribution',
-- 
GitLab


From 1323fc76958c9bf4907cf8bc53c9db0a85fb7d9b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Jan 2018 18:20:02 -0800
Subject: [PATCH 138/966] Remove a duplicate bfloat16 registration.

PiperOrigin-RevId: 182143007
---
 tensorflow/core/kernels/identity_op.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
index 7b8abf5494..5d288ff19c 100644
--- a/tensorflow/core/kernels/identity_op.cc
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -102,7 +102,6 @@ REGISTER_SYCL_HOST_KERNEL(bool);
                           IdentityOp)
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
-REGISTER_GPU_KERNEL(bfloat16);
 REGISTER_GPU_KERNEL(Variant);
 
 #undef REGISTER_GPU_KERNEL
-- 
GitLab


From 764f90eca2ac7bdb858c30888ca7b4bb0c47dee1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 15 Dec 2017 10:09:07 -0800
Subject: [PATCH 139/966] Fix bfloat16 numerics issues in the tests.

PiperOrigin-RevId: 179207115
---
 tensorflow/compiler/tests/ftrl_test.py      | 40 ++++++++++-----------
 tensorflow/compiler/tests/momentum_test.py  | 23 ++++++------
 tensorflow/compiler/tests/unary_ops_test.py |  2 +-
 tensorflow/python/framework/test_util.py    | 10 ++++--
 4 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index 7e3871312c..f9db4cf201 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -161,9 +161,9 @@ class FtrlOptimizerTest(XLATestCase):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(
+        self.assertAllCloseAccordingToType(
             np.array([-2.55607247, -3.98729396]), var0.eval(), 1e-5, 1e-5)
-        self.assertAllClose(
+        self.assertAllCloseAccordingToType(
             np.array([-0.28232238, -0.56096673]), var1.eval(), 1e-5, 1e-5)
 
   def testFtrlWithL1(self):
@@ -189,10 +189,10 @@ class FtrlOptimizerTest(XLATestCase):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(np.array([-7.66718769, -10.91273689]), var0.eval(),
-                            rtol=1e-4)
-        self.assertAllClose(np.array([-0.93460727, -1.86147261]), var1.eval(),
-                            rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-7.66718769, -10.91273689]), var0.eval(), rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.93460727, -1.86147261]), var1.eval(), rtol=1e-4)
 
   def testFtrlWithL1_L2(self):
     for dtype in self.float_types:
@@ -217,10 +217,10 @@ class FtrlOptimizerTest(XLATestCase):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(np.array([-0.24059935, -0.46829352]), var0.eval(),
-                            rtol=1e-5)
-        self.assertAllClose(np.array([-0.02406147, -0.04830509]), var1.eval(),
-                            rtol=1e-5)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.24059935, -0.46829352]), var0.eval(), rtol=1e-5)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02406147, -0.04830509]), var1.eval(), rtol=1e-5)
 
   def testFtrlWithL1_L2_L2Shrinkage(self):
     """Test the new FTRL op with support for l2 shrinkage.
@@ -244,18 +244,18 @@ class FtrlOptimizerTest(XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
 
         # Run 10 steps FTRL
         for _ in range(10):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(np.array([-0.21931979, -0.40642974]), var0.eval(),
-                            rtol=1e-4)
-        self.assertAllClose(np.array([-0.0282721, -0.07188385]), var1.eval(),
-                            rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.21931979, -0.40642974]), var0.eval(), rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.0282721, -0.07188385]), var1.eval(), rtol=1e-4)
 
   # When variables are initialized with Zero, FTRL-Proximal has two properties:
   # 1. Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical
@@ -272,8 +272,8 @@ class FtrlOptimizerTest(XLATestCase):
       with self.test_session(), self.test_scope():
         val2, val3 = self.equivAdagradTest_AdagradPart(steps, dtype)
 
-    self.assertAllClose(val0, val2, rtol=1e-4)
-    self.assertAllClose(val1, val3, rtol=1e-4)
+    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-4)
+    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-4)
 
   def testEquivGradientDescentwithoutRegularization(self):
     steps = 5
@@ -284,8 +284,8 @@ class FtrlOptimizerTest(XLATestCase):
         val2, val3 = self.equivGradientDescentTest_GradientDescentPart(
             steps, dtype)
 
-    self.assertAllClose(val0, val2, rtol=1e-5)
-    self.assertAllClose(val1, val3, rtol=1e-5)
+    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-5)
+    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-5)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/momentum_test.py b/tensorflow/compiler/tests/momentum_test.py
index c00e3035a0..af9394e7d7 100644
--- a/tensorflow/compiler/tests/momentum_test.py
+++ b/tensorflow/compiler/tests/momentum_test.py
@@ -96,28 +96,27 @@ class MomentumOptimizerTest(XLATestCase):
   def testNesterovMomentum(self):
     for dtype in self.float_types:
       with self.test_session(), self.test_scope():
-        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-        var0_np = np.array([1.0, 2.0], dtype=dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype)
+        var0 = resource_variable_ops.ResourceVariable([0.1, 0.2], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([0.3, 0.4], dtype=dtype)
+        var0_np = np.array([0.1, 0.2], dtype=dtype)
+        var1_np = np.array([0.3, 0.4], dtype=dtype)
         accum0_np = np.array([0.0, 0.0], dtype=dtype)
         accum1_np = np.array([0.0, 0.0], dtype=dtype)
-        cost = 5 * var0 * var0 + 3 * var1
+        cost = 0.4 * var0 * var0 + 0.9 * var1
         global_step = resource_variable_ops.ResourceVariable(
             array_ops.zeros([], dtypes.int32), name="global_step")
         mom_op = momentum_lib.MomentumOptimizer(
-            learning_rate=2.0, momentum=0.9, use_nesterov=True)
+            learning_rate=0.1, momentum=0.9, use_nesterov=True)
         opt_op = mom_op.minimize(cost, global_step, [var0, var1])
         variables.global_variables_initializer().run()
         for _ in range(1, 5):
           opt_op.run()
           var0_np, accum0_np = self._update_nesterov_momentum_numpy(
-              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
-          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
-                                                                    accum1_np,
-                                                                    3, 2.0, 0.9)
-          self.assertAllClose(var0_np, var0.eval())
-          self.assertAllClose(var1_np, var1.eval())
+              var0_np, accum0_np, var0_np * 0.8, 0.1, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(
+              var1_np, accum1_np, 0.9, 0.1, 0.9)
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
 
   def testTensorLearningRateAndMomentum(self):
     for dtype in self.float_types:
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index b0623c0fbc..ecba5a4fb0 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -67,7 +67,7 @@ class UnaryOpsTest(XLATestCase):
         output = op(pinp)
       result = session.run(output, {pinp: inp})
       if equality_test is None:
-        equality_test = self.assertAllClose
+        equality_test = self.assertAllCloseAccordingToType
       equality_test(result, expected, rtol=rtol, atol=atol)
 
   def ListsAreClose(self, result, expected, rtol, atol):
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 644e3bb515..7627fb3e69 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1151,7 +1151,9 @@ class TensorFlowTestCase(googletest.TestCase):
                                     float_rtol=1e-6,
                                     float_atol=1e-6,
                                     half_rtol=1e-3,
-                                    half_atol=1e-3):
+                                    half_atol=1e-3,
+                                    bfloat16_rtol=1e-2,
+                                    bfloat16_atol=1e-2):
     """Like assertAllClose, but also suitable for comparing fp16 arrays.
 
     In particular, the tolerance is reduced to 1e-3 if at least
@@ -1166,6 +1168,8 @@ class TensorFlowTestCase(googletest.TestCase):
       float_atol: absolute tolerance for float32.
       half_rtol: relative tolerance for float16.
       half_atol: absolute tolerance for float16.
+      bfloat16_rtol: relative tolerance for bfloat16.
+      bfloat16_atol: absolute tolerance for bfloat16.
     """
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
@@ -1179,8 +1183,8 @@ class TensorFlowTestCase(googletest.TestCase):
       atol = max(atol, half_atol)
     if (a.dtype == dtypes.bfloat16.as_numpy_dtype or
         b.dtype == dtypes.bfloat16.as_numpy_dtype):
-      rtol = max(rtol, half_rtol)
-      atol = max(atol, half_atol)
+      rtol = max(rtol, bfloat16_rtol)
+      atol = max(atol, bfloat16_atol)
 
     self.assertAllClose(a, b, rtol=rtol, atol=atol)
 
-- 
GitLab


From cbd1ee59d28f94c369738cbba8b6a4faed1e5fad Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 24 Jan 2018 17:56:57 -0800
Subject: [PATCH 140/966] Don't close session in debug_gradients_test.py.

PiperOrigin-RevId: 183171960
---
 tensorflow/python/debug/lib/debug_gradients_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py
index 6fd89e018a..b6c7280a41 100644
--- a/tensorflow/python/debug/lib/debug_gradients_test.py
+++ b/tensorflow/python/debug/lib/debug_gradients_test.py
@@ -39,7 +39,7 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
     self.sess = session.Session()
-    with self.sess:
+    with self.sess.as_default():
       self.u = variables.Variable(2.0, name="u")
       self.v = variables.Variable(3.0, name="v")
       self.w = math_ops.multiply(self.u.value(), self.v.value(), name="w")
-- 
GitLab


From 4153e7afff4e17bbef866bd4811b0392ddb25b53 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 18:23:18 -0800
Subject: [PATCH 141/966] Refactoring pass. Add a container context class to
 shorten argument lists and expose context information in a more organized
 manner. Clean up names, docs and tests. Rename submodule to avoid clashing
 with the new @covert decorator.

PiperOrigin-RevId: 183174971
---
 tensorflow/BUILD                              |   2 +-
 tensorflow/contrib/py2tf/BUILD                |   4 +-
 tensorflow/contrib/py2tf/api.py               |  66 ++++----
 tensorflow/contrib/py2tf/config.py            |   1 +
 tensorflow/contrib/py2tf/conversion.py        | 145 +++++++++---------
 tensorflow/contrib/py2tf/conversion_test.py   |  15 +-
 .../py2tf/{convert => converters}/BUILD       |  43 +++---
 .../py2tf/{convert => converters}/__init__.py |   0
 .../break_canonicalization.py                 |   0
 .../break_canonicalization_test.py            |  20 +--
 .../builtin_functions.py                      |   0
 .../builtin_functions_test.py                 |  18 +--
 .../{convert => converters}/call_trees.py     |   0
 .../call_trees_test.py                        |  20 +--
 .../continue_canonicalization.py              |   0
 .../continue_canonicalization_test.py         |  20 +--
 .../{convert => converters}/control_flow.py   |   0
 .../control_flow_test.py                      |  24 +--
 .../py2tf/converters/converter_test_base.py   |  48 ++++++
 .../{convert => converters}/decorators.py     |   0
 .../for_canonicalization.py                   |   0
 .../for_canonicalization_test.py              |  16 +-
 .../logical_expressions.py                    |   0
 .../logical_expressions_test.py               |  10 +-
 .../print_functions.py                        |   0
 .../print_functions_test.py                   |  18 +--
 .../side_effect_guards.py                     |   0
 .../side_effect_guards_test.py                |  18 +--
 tensorflow/contrib/py2tf/pyct/BUILD           |   1 +
 tensorflow/contrib/py2tf/pyct/context.py      |  42 +++++
 .../py2tf/pyct/static_analysis/type_info.py   |  16 +-
 .../pyct/static_analysis/type_info_test.py    |  61 +++-----
 tensorflow/contrib/py2tf/pyct/transformer.py  |  18 ++-
 tensorflow/tools/pip_package/BUILD            |   3 +-
 34 files changed, 328 insertions(+), 301 deletions(-)
 rename tensorflow/contrib/py2tf/{convert => converters}/BUILD (79%)
 rename tensorflow/contrib/py2tf/{convert => converters}/__init__.py (100%)
 rename tensorflow/contrib/py2tf/{convert => converters}/break_canonicalization.py (100%)
 rename tensorflow/contrib/py2tf/{convert => converters}/break_canonicalization_test.py (84%)
 rename tensorflow/contrib/py2tf/{convert => converters}/builtin_functions.py (100%)
 rename tensorflow/contrib/py2tf/{convert => converters}/builtin_functions_test.py (68%)
 rename tensorflow/contrib/py2tf/{convert => converters}/call_trees.py (100%)
 rename tensorflow/contrib/py2tf/{convert => converters}/call_trees_test.py (78%)
 rename tensorflow/contrib/py2tf/{convert => converters}/continue_canonicalization.py (100%)
 rename tensorflow/contrib/py2tf/{convert => converters}/continue_canonicalization_test.py (83%)
 rename tensorflow/contrib/py2tf/{convert => converters}/control_flow.py (100%)
 rename tensorflow/contrib/py2tf/{convert => converters}/control_flow_test.py (79%)
 create mode 100644 tensorflow/contrib/py2tf/converters/converter_test_base.py
 rename tensorflow/contrib/py2tf/{convert => converters}/decorators.py (100%)
 rename tensorflow/contrib/py2tf/{convert => converters}/for_canonicalization.py (100%)
 rename tensorflow/contrib/py2tf/{convert => converters}/for_canonicalization_test.py (75%)
 rename tensorflow/contrib/py2tf/{convert => converters}/logical_expressions.py (100%)
 rename tensorflow/contrib/py2tf/{convert => converters}/logical_expressions_test.py (85%)
 rename tensorflow/contrib/py2tf/{convert => converters}/print_functions.py (100%)
 rename tensorflow/contrib/py2tf/{convert => converters}/print_functions_test.py (65%)
 rename tensorflow/contrib/py2tf/{convert => converters}/side_effect_guards.py (100%)
 rename tensorflow/contrib/py2tf/{convert => converters}/side_effect_guards_test.py (72%)
 create mode 100644 tensorflow/contrib/py2tf/pyct/context.py

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 2ea0e38c78..9099463c4f 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -527,7 +527,7 @@ filegroup(
         "//tensorflow/contrib/periodic_resample:all_files",
         "//tensorflow/contrib/predictor:all_files",
         "//tensorflow/contrib/py2tf:all_files",
-        "//tensorflow/contrib/py2tf/convert:all_files",
+        "//tensorflow/contrib/py2tf/converters:all_files",
         "//tensorflow/contrib/py2tf/pyct:all_files",
         "//tensorflow/contrib/py2tf/pyct/static_analysis:all_files",
         "//tensorflow/contrib/quantize:all_files",
diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/py2tf/BUILD
index 7358822ef5..d395de986d 100644
--- a/tensorflow/contrib/py2tf/BUILD
+++ b/tensorflow/contrib/py2tf/BUILD
@@ -26,7 +26,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/py2tf/convert",
+        "//tensorflow/contrib/py2tf/converters",
         "//tensorflow/contrib/py2tf/pyct",
         "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "@gast_archive//:gast",
@@ -46,7 +46,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
-        "//tensorflow/contrib/py2tf/convert",
+        "//tensorflow/contrib/py2tf/converters",
         "//tensorflow/contrib/py2tf/pyct",
         "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "@gast_archive//:gast",
diff --git a/tensorflow/contrib/py2tf/api.py b/tensorflow/contrib/py2tf/api.py
index 9a2b70c53c..1f250d5f57 100644
--- a/tensorflow/contrib/py2tf/api.py
+++ b/tensorflow/contrib/py2tf/api.py
@@ -83,7 +83,7 @@ def convert_inline(f, *args, **kwargs):
   return convert(arg_value_hints)(f)(*args, **kwargs)
 
 
-def convert(recursive=False, arg_value_hints=None):
+def convert(recursive=False, arg_types=None):
   """Decorator that compiles a function to graph mode.
 
   The decorator is dynamic - invoking compilation whenever the decorated fuction
@@ -92,8 +92,7 @@ def convert(recursive=False, arg_value_hints=None):
   Args:
     recursive: Whether to recusrively convert any functions that the decorator
         function may call.
-    arg_value_hints: A dict mapping parameter names to objects that can hint
-        at the type of those parameters.
+    arg_types: See to_graph.
 
   Returns:
     A decorator that compiles the given function to graph mode.
@@ -101,8 +100,8 @@ def convert(recursive=False, arg_value_hints=None):
   Raises:
     ValueError: If any of the arguments are illegal.
   """
-  if arg_value_hints is None:
-    arg_value_hints = {}
+  if arg_types is None:
+    arg_types = {}
 
   def decorator(f):
     """Decorator implementation."""
@@ -111,22 +110,23 @@ def convert(recursive=False, arg_value_hints=None):
     def wrapper(*args, **kwargs):
       """Wrapper that calls the compiled version of the wrapped function."""
       partial_types = ()
+      arg_values = {}
       arg_names = tf_inspect.getargspec(f)[0]
       for name, arg in zip(arg_names, args):
+        arg_values[name] = arg
         arg_class = arg.__class__
-        if tf_inspect.isclass(arg_class):
-          # If arg_value_hints specifies any name, use that instead.
-          # TODO(mdan): Shouldn't this just be in the func's globals?
-          if name not in arg_value_hints:
-            arg_value_hints[name] = (arg_class.__name__, arg_class)
+        # If arg_value_hints specifies any name, use that instead.
+        if name not in arg_types:
+          arg_types[name] = (arg_class.__name__, arg_class)
+        if name == 'self' and tf_inspect.isclass(arg_class):
           # Annotated methods need to specify that their owner type is partial,
           # otherwise other members they call will not be converted.
-          if name == 'self':
-            partial_types = (arg_class,)
+          partial_types = (arg_class,)
       wrapped = to_graph(
           f,
           recursive=recursive,
-          arg_value_hints=arg_value_hints,
+          arg_values=arg_values,
+          arg_types=arg_types,
           partial_types=partial_types)
       return wrapped(*args, **kwargs)
 
@@ -138,7 +138,11 @@ def convert(recursive=False, arg_value_hints=None):
   return decorator
 
 
-def to_graph(o, recursive=True, arg_value_hints=None, partial_types=None):
+def to_graph(e,
+             recursive=True,
+             arg_values=None,
+             arg_types=None,
+             partial_types=None):
   """Compile a Python entity into equivalent TensorFlow code.
 
   Currently supported entities:
@@ -148,11 +152,13 @@ def to_graph(o, recursive=True, arg_value_hints=None, partial_types=None):
   Classes are handled by converting all their methods into a new class.
 
   Args:
-    o: A Python function or class.
+    e: A Python entity.
     recursive: Whether to recusrively convert any functions that the decorator
         function may call.
-    arg_value_hints: A dict mapping parameter names to objects that can hint
-        at the type of those parameters.
+    arg_values: A dict containing value hints for symbols like function
+        parameters.
+    arg_types: A dict containing type hints for symbols like function
+        parameters.
     partial_types: A set of types (e.g. classes) that will not be converted
         entirely. Calls to member functions for these types will be renamed
         independently.
@@ -165,7 +171,7 @@ def to_graph(o, recursive=True, arg_value_hints=None, partial_types=None):
       recursive=recursive,
       nocompile_decorators=(convert, graph_ready, convert_inline),
       partial_types=partial_types)
-  _, name = conversion.object_to_graph(o, conversion_map, arg_value_hints)
+  _, name = conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
 
   module = gast.Module([])
   for import_line in config.COMPILED_IMPORT_STATEMENTS:
@@ -176,16 +182,17 @@ def to_graph(o, recursive=True, arg_value_hints=None, partial_types=None):
 
   # The compiled code should see everything the entry function saw.
   # TODO(mdan): This might not work well if the call tree spans modules?
-  if tf_inspect.isfunction(o):
-    compiled_node.__dict__.update(six.get_function_globals(o))
+  if tf_inspect.isfunction(e):
+    compiled_node.__dict__.update(six.get_function_globals(e))
 
   compiled_fn = getattr(compiled_node, name)
   return compiled_fn
 
 
-def to_code(o,
+def to_code(e,
             recursive=True,
-            arg_value_hints=None,
+            arg_values=None,
+            arg_types=None,
             partial_types=None,
             indentation='  '):
   """Return the equivalent of an entity in TensorFlow code.
@@ -193,14 +200,11 @@ def to_code(o,
   See `to_graph` for more details.
 
   Args:
-    o: A Python function or class.
-    recursive: Whether to recusrively convert any functions that the decorator
-        function may call.
-    arg_value_hints: A dict mapping parameter names to objects that can hint
-        at the type of those parameters.
-    partial_types: A set of types (e.g. classes) that will not be converted
-        entirely. Calls to member functions for these types will be renamed
-        independently.
+    e: A Python entity.
+    recursive: See to_graph.
+    arg_values: See to_graph.
+    arg_types: See to_graph.
+    partial_types: See to_graph.
     indentation: String, when to use for each level of indentation.
 
   Returns:
@@ -210,7 +214,7 @@ def to_code(o,
       recursive=recursive,
       nocompile_decorators=(convert, graph_ready, convert_inline),
       partial_types=partial_types)
-  conversion.object_to_graph(o, conversion_map, arg_value_hints)
+  conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
 
   imports = '\n'.join(config.COMPILED_IMPORT_STATEMENTS)
   code = '\n'.join(
diff --git a/tensorflow/contrib/py2tf/config.py b/tensorflow/contrib/py2tf/config.py
index 0a9d52136e..8c502a7a9e 100644
--- a/tensorflow/contrib/py2tf/config.py
+++ b/tensorflow/contrib/py2tf/config.py
@@ -22,6 +22,7 @@ PYTHON_LITERALS = {
     'None': None,
     'False': False,
     'True': True,
+    'float': float,
 }
 
 DEFAULT_UNCOMPILED_MODULES = set((
diff --git a/tensorflow/contrib/py2tf/conversion.py b/tensorflow/contrib/py2tf/conversion.py
index 38f1c0a14a..b484eebbd5 100644
--- a/tensorflow/contrib/py2tf/conversion.py
+++ b/tensorflow/contrib/py2tf/conversion.py
@@ -23,16 +23,17 @@ import six
 
 from tensorflow.contrib.py2tf import config
 from tensorflow.contrib.py2tf import naming
-from tensorflow.contrib.py2tf.convert import break_canonicalization
-from tensorflow.contrib.py2tf.convert import builtin_functions
-from tensorflow.contrib.py2tf.convert import call_trees
-from tensorflow.contrib.py2tf.convert import continue_canonicalization
-from tensorflow.contrib.py2tf.convert import control_flow
-from tensorflow.contrib.py2tf.convert import decorators
-from tensorflow.contrib.py2tf.convert import for_canonicalization
-from tensorflow.contrib.py2tf.convert import logical_expressions
-from tensorflow.contrib.py2tf.convert import print_functions
-from tensorflow.contrib.py2tf.convert import side_effect_guards
+from tensorflow.contrib.py2tf.converters import break_canonicalization
+from tensorflow.contrib.py2tf.converters import builtin_functions
+from tensorflow.contrib.py2tf.converters import call_trees
+from tensorflow.contrib.py2tf.converters import continue_canonicalization
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import decorators
+from tensorflow.contrib.py2tf.converters import for_canonicalization
+from tensorflow.contrib.py2tf.converters import logical_expressions
+from tensorflow.contrib.py2tf.converters import print_functions
+from tensorflow.contrib.py2tf.converters import side_effect_guards
+from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
@@ -51,9 +52,9 @@ class ConversionMap(object):
         function may call.
     nocompile_decorators: tuple of decorator functions that toggle compilation
         off.
-    dependency_cache: dict[object]: ast; maps original objects to their
+    dependency_cache: dict[object]: ast; maps original entities to their
         converted AST
-    name_map: dict[string]: string; maps original objects to the name of
+    name_map: dict[string]: string; maps original entities to the name of
         their converted counterparts
   """
 
@@ -66,8 +67,8 @@ class ConversionMap(object):
     self.dependency_cache = {}
     self.name_map = {}
 
-  def new_namer(self, global_symbols):
-    return naming.Namer(global_symbols, self.recursive, self.name_map,
+  def new_namer(self, namespace):
+    return naming.Namer(namespace, self.recursive, self.name_map,
                         self.partial_types)
 
   def update_name_map(self, namer):
@@ -76,48 +77,47 @@ class ConversionMap(object):
         if self.name_map[o] != name:
           raise ValueError(
               'Calls to %s were converted using multiple names (%s). This is '
-              'possible when an object with one of these names already '
+              'possible when an entity with one of these names already '
               'existed. To fix, avoid using any of these names.')
       else:
         self.name_map[o] = name
 
-  def add_to_cache(self, original_object, converted_ast):
-    self.dependency_cache[original_object] = converted_ast
+  def add_to_cache(self, original_entity, converted_ast):
+    self.dependency_cache[original_entity] = converted_ast
 
 
-def object_to_graph(o, conversion_map, value_hints):
-  """Compile a Python object into equivalent TensorFlow.
+def entity_to_graph(o, conversion_map, arg_values, arg_types):
+  """Compile a Python entity into equivalent TensorFlow.
 
-  The function will also recursively compile all the objects that `o`
+  The function will also recursively compile all the entities that `o`
   references, updating `dependency_cache`.
 
   This function is reentrant, and relies on dependency_cache to avoid
   generating duplicate code.
 
   Args:
-    o: A Python object.
+    o: A Python entity.
     conversion_map: A ConversionMap object.
-    value_hints: A dict containing value hints for symbols like function
+    arg_values: A dict containing value hints for symbols like function
+        parameters.
+    arg_types: A dict containing type hints for symbols like function
         parameters.
 
   Returns:
     A tuple (ast, new_name):
-        * ast: An AST representing an object with interface equivalent to `o`,
+        * ast: An AST representing an entity with interface equivalent to `o`,
             but which when executed it creates TF a graph.
-        * new_name: The symbol name under which the new object can be found.
+        * new_name: The symbol name under which the new entity can be found.
 
   Raises:
-    ValueError: if the object is not supported.
+    ValueError: if the entity type is not supported.
   """
-  if value_hints is None:
-    value_hints = {}
-
   if tf_inspect.isclass(o):
-    node, new_name = class_to_graph(o, conversion_map, value_hints)
+    node, new_name = class_to_graph(o, conversion_map)
   elif tf_inspect.isfunction(o):
-    node, new_name = function_to_graph(o, conversion_map, value_hints)
+    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
   elif tf_inspect.ismethod(o):
-    node, new_name = function_to_graph(o, conversion_map, value_hints)
+    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
   else:
     raise ValueError(
         'Entity "%s" has unsupported type "%s". Only functions and classes are '
@@ -132,25 +132,26 @@ def object_to_graph(o, conversion_map, value_hints):
           # Class members are converted with their objects, unless they're
           # only converted partially.
           continue
-        object_to_graph(obj, conversion_map, None)
+        entity_to_graph(obj, conversion_map, {}, {})
 
   return node, new_name
 
 
-def class_to_graph(c, conversion_map, param_value_hints):
-  """Specialization of `object_to_graph` for classes."""
+def class_to_graph(c, conversion_map):
+  """Specialization of `entity_to_graph` for classes."""
   converted_members = {}
   members = tf_inspect.getmembers(c, predicate=tf_inspect.ismethod)
   if not members:
     raise ValueError('Cannot convert %s: it has no member methods.')
 
-  if 'self' in param_value_hints:
-    raise ValueError('Hints may not be provided for reserved name "self".')
-  param_value_hints['self'] = (c.__name__, c)
-
   class_globals = None
   for _, m in members:
-    node, _ = function_to_graph(m, conversion_map, param_value_hints, c)
+    node, _ = function_to_graph(
+        m,
+        conversion_map=conversion_map,
+        arg_values={},
+        arg_types={'self': (c.__name__, c)},
+        owner_type=c)
     # TODO(mdan): Do not assume all members have the same view of globals.
     if class_globals is None:
       class_globals = six.get_function_globals(m)
@@ -167,10 +168,11 @@ def class_to_graph(c, conversion_map, param_value_hints):
   return node, class_name
 
 
-def function_to_graph(f, conversion_map, param_value_hints, owner_type=None):
-  """Specialization of `object_to_graph` for callable functions."""
+def function_to_graph(f, conversion_map, arg_values, arg_types,
+                      owner_type=None):
+  """Specialization of `entity_to_graph` for callable functions."""
   node = parser.parse_object(f).body[0]
-  node_globals = six.get_function_globals(f)
+  namespace = six.get_function_globals(f)
 
   # This is needed for non-global functions.
   closure = six.get_function_closure(f)
@@ -178,12 +180,17 @@ def function_to_graph(f, conversion_map, param_value_hints, owner_type=None):
     for e in closure:
       if callable(e.cell_contents):
         fn = e.cell_contents
-        node_globals[fn.__name__] = fn
-
-  namer = conversion_map.new_namer(node_globals)
-  node = node_to_graph(node, tf_inspect.getsource(f), tf_inspect.getfile(f),
-                       namer, node_globals, param_value_hints,
-                       conversion_map.nocompile_decorators)
+        namespace[fn.__name__] = fn
+
+  namer = conversion_map.new_namer(namespace)
+  ctx = context.EntityContext(
+      namer=namer,
+      source_code=tf_inspect.getsource(f),
+      source_file=tf_inspect.getfile(f),
+      namespace=namespace,
+      arg_values=arg_values,
+      arg_types=arg_types)
+  node = node_to_graph(node, ctx, conversion_map.nocompile_decorators)
 
   # Simulate a rename to ensure the top level is in the name map. This is needed
   # for top level functions, and it also helps the consistency verification made
@@ -197,34 +204,26 @@ def function_to_graph(f, conversion_map, param_value_hints, owner_type=None):
   return node, conversion_map.name_map[f]
 
 
-def _static_analysis_pass(node, source, f, namespace, value_hints):
+def _static_analysis_pass(node, ctx):
   node = access.resolve(node)
-  node = live_values.resolve(node, namespace, config.PYTHON_LITERALS)
-  node = type_info.resolve(node, source, f, value_hints)
+  node = live_values.resolve(node, ctx.namespace, config.PYTHON_LITERALS)
+  node = type_info.resolve(node, ctx)
   return node
 
 
-def node_to_graph(node, source, f, namer, namespace, value_hints,
-                  nocompile_decorators):
+def node_to_graph(node, ctx, nocompile_decorators):
   """Convert Python code to equivalent TF graph mode code.
 
   Args:
     node: A Python AST node representing the code to convert.
-    source: Optional string containing the source code of the node. Used in
-        error messages.
-    f: Optional string indicating the file where the node originated. None if
-        unknown. Used in error messages.
-    namer: A naming.Namer object.
-    namespace: Dict mapping symbol names to their corresponding live objects.
-    value_hints: A dict containing value hints for symbols like function
-        parameters.
+    ctx: An EntityContext object.
     nocompile_decorators: A tuple containing decorators to be stripped from
         functions during conversion.
 
   Returns:
     A tuple (node, deps):
         * node: A Python ast node, representing the converted code.
-        * deps: A set of strings, the fully qualified names of object
+        * deps: A set of strings, the fully qualified names of entity
             dependencies that this node has.
   """
   # TODO(mdan): Verify arguments for correctness.
@@ -241,30 +240,30 @@ def node_to_graph(node, source, f, namer, namespace, value_hints,
   # tree, which must be accounted. Although less efficient, it is most robust
   # to re-run the analysis.
 
-  node = _static_analysis_pass(node, source, f, namespace, value_hints)
+  node = _static_analysis_pass(node, ctx)
   node = decorators.transform(node, nocompile_decorators)
-  node = break_canonicalization.transform(node, namer)
+  node = break_canonicalization.transform(node, ctx.namer)
 
   # Note: sequencing continue canonicalization before for loop one avoids
   # dealing with the extra loop increment operation that the for
   # canonicalization creates.
-  node = continue_canonicalization.transform(node, namer)
-  namespace['len'] = len
+  node = continue_canonicalization.transform(node, ctx.namer)
+  ctx.namespace['len'] = len
 
-  node = _static_analysis_pass(node, None, None, namespace, value_hints)
-  node = for_canonicalization.transform(node, namer)
+  node = _static_analysis_pass(node, ctx)
+  node = for_canonicalization.transform(node, ctx.namer)
   # for_canonicalization may insert new global references.
   node = builtin_functions.transform(node)
   # builtin_functions may insert new global references.
-  namespace['print'] = print
+  ctx.namespace['print'] = print
 
-  node = _static_analysis_pass(node, None, None, namespace, value_hints)
+  node = _static_analysis_pass(node, ctx)
   node = print_functions.transform(node)
-  node = call_trees.transform(node, namer, namespace,
+  node = call_trees.transform(node, ctx.namer, ctx.namespace,
                               config.DEFAULT_UNCOMPILED_MODULES,
                               nocompile_decorators)
-  node = control_flow.transform(node, namer)
+  node = control_flow.transform(node, ctx.namer)
   node = logical_expressions.transform(node)
-  node = side_effect_guards.transform(node, namer)
+  node = side_effect_guards.transform(node, ctx.namer)
 
   return node
diff --git a/tensorflow/contrib/py2tf/conversion_test.py b/tensorflow/contrib/py2tf/conversion_test.py
index e48bfe4464..26f915f4f4 100644
--- a/tensorflow/contrib/py2tf/conversion_test.py
+++ b/tensorflow/contrib/py2tf/conversion_test.py
@@ -26,20 +26,23 @@ from tensorflow.python.platform import test
 
 class ConversionTest(test.TestCase):
 
-  def test_object_to_graph_unsupported_types(self):
+  def test_entity_to_graph_unsupported_types(self):
     with self.assertRaises(ValueError):
-      conversion.object_to_graph('dummy', None, {})
+      conversion_map = conversion.ConversionMap(True, (), ())
+      conversion.entity_to_graph('dummy', conversion_map, None, None)
+
+  def test_entity_to_graph_callable(self):
 
-  def test_object_to_graph_callable(self):
     def f(a):
       return a
 
     conversion_map = conversion.ConversionMap(True, (), ())
-    ast, new_name = conversion.object_to_graph(f, conversion_map, {})
+    ast, new_name = conversion.entity_to_graph(f, conversion_map, None, None)
     self.assertTrue(isinstance(ast, gast.FunctionDef), ast)
     self.assertEqual('tf__f', new_name)
 
-  def test_object_to_graph_call_tree(self):
+  def test_entity_to_graph_call_tree(self):
+
     def g(a):
       return a
 
@@ -47,7 +50,7 @@ class ConversionTest(test.TestCase):
       return g(a)
 
     conversion_map = conversion.ConversionMap(True, (), ())
-    conversion.object_to_graph(f, conversion_map, {})
+    conversion.entity_to_graph(f, conversion_map, None, None)
 
     self.assertTrue(f in conversion_map.dependency_cache)
     self.assertTrue(g in conversion_map.dependency_cache)
diff --git a/tensorflow/contrib/py2tf/convert/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
similarity index 79%
rename from tensorflow/contrib/py2tf/convert/BUILD
rename to tensorflow/contrib/py2tf/converters/BUILD
index 050e2ef108..2b0a1234e6 100644
--- a/tensorflow/contrib/py2tf/convert/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -15,7 +15,7 @@ filegroup(
 )
 
 py_library(
-    name = "convert",
+    name = "converters",
     srcs = [
         "break_canonicalization.py",
         "builtin_functions.py",
@@ -35,13 +35,26 @@ py_library(
     ],
 )
 
+py_library(
+    name = "test_lib",
+    srcs = [
+        "converter_test_base.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        ":converters",
+        "//tensorflow/contrib/py2tf/pyct/static_analysis",
+        "@gast_archive//:gast",
+    ],
+)
+
 py_test(
     name = "break_canonicalization_test",
     srcs = ["break_canonicalization_test.py"],
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -50,9 +63,8 @@ py_test(
     name = "call_trees_test",
     srcs = ["call_trees_test.py"],
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -61,9 +73,8 @@ py_test(
     name = "continue_canonicalization_test",
     srcs = ["continue_canonicalization_test.py"],
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -72,9 +83,8 @@ py_test(
     name = "control_flow_test",
     srcs = ["control_flow_test.py"],
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -83,9 +93,8 @@ py_test(
     name = "builtin_functions_test",
     srcs = ["builtin_functions_test.py"],
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -94,9 +103,8 @@ py_test(
     name = "for_canonicalization_test",
     srcs = ["for_canonicalization_test.py"],
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -105,9 +113,8 @@ py_test(
     name = "logical_expressions_test",
     srcs = ["logical_expressions_test.py"],
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -116,9 +123,8 @@ py_test(
     name = "print_functions_test",
     srcs = ["print_functions_test.py"],
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
         "@gast_archive//:gast",
     ],
@@ -128,9 +134,8 @@ py_test(
     name = "side_effect_guards_test",
     srcs = ["side_effect_guards_test.py"],
     deps = [
-        ":convert",
+        ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/python:client_testlib",
     ],
 )
diff --git a/tensorflow/contrib/py2tf/convert/__init__.py b/tensorflow/contrib/py2tf/converters/__init__.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/__init__.py
rename to tensorflow/contrib/py2tf/converters/__init__.py
diff --git a/tensorflow/contrib/py2tf/convert/break_canonicalization.py b/tensorflow/contrib/py2tf/converters/break_canonicalization.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/break_canonicalization.py
rename to tensorflow/contrib/py2tf/converters/break_canonicalization.py
diff --git a/tensorflow/contrib/py2tf/convert/break_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
similarity index 84%
rename from tensorflow/contrib/py2tf/convert/break_canonicalization_test.py
rename to tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
index 23c4c4d3e2..b5ba2ad923 100644
--- a/tensorflow/contrib/py2tf/convert/break_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
@@ -18,11 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import break_canonicalization
-from tensorflow.contrib.py2tf.convert import control_flow
+from tensorflow.contrib.py2tf.converters import break_canonicalization
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.python.platform import test
 
 
@@ -32,12 +31,7 @@ class TestNamer(control_flow.SymbolNamer):
     return name_root
 
 
-class BreakCanonicalizationTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    return node
+class BreakCanonicalizationTest(converter_test_base.TestCase):
 
   def test_basic_break(self):
 
@@ -50,7 +44,7 @@ class BreakCanonicalizationTest(test.TestCase):
         v.append(x)
       return v
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = break_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
@@ -82,7 +76,7 @@ class BreakCanonicalizationTest(test.TestCase):
         v.append(x)
       return v
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = break_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
@@ -110,7 +104,7 @@ class BreakCanonicalizationTest(test.TestCase):
         v.append(x)
       return v, u, w
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = break_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
diff --git a/tensorflow/contrib/py2tf/convert/builtin_functions.py b/tensorflow/contrib/py2tf/converters/builtin_functions.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/builtin_functions.py
rename to tensorflow/contrib/py2tf/converters/builtin_functions.py
diff --git a/tensorflow/contrib/py2tf/convert/builtin_functions_test.py b/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
similarity index 68%
rename from tensorflow/contrib/py2tf/convert/builtin_functions_test.py
rename to tensorflow/contrib/py2tf/converters/builtin_functions_test.py
index ab02b362aa..b5358da6bc 100644
--- a/tensorflow/contrib/py2tf/convert/builtin_functions_test.py
+++ b/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
@@ -18,32 +18,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import builtin_functions
+from tensorflow.contrib.py2tf.converters import builtin_functions
+from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class BuiltinFunctionsTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, None, None, {})
-    return node
+class BuiltinFunctionsTest(converter_test_base.TestCase):
 
   def test_len(self):
 
     def test_fn(a):
       return len(a)
 
-    node = self._parse_and_analyze(test_fn, {'len': len})
+    node = self.parse_and_analyze(test_fn, {'len': len})
     node = builtin_functions.transform(node)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', array_ops)
diff --git a/tensorflow/contrib/py2tf/convert/call_trees.py b/tensorflow/contrib/py2tf/converters/call_trees.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/call_trees.py
rename to tensorflow/contrib/py2tf/converters/call_trees.py
diff --git a/tensorflow/contrib/py2tf/convert/call_trees_test.py b/tensorflow/contrib/py2tf/converters/call_trees_test.py
similarity index 78%
rename from tensorflow/contrib/py2tf/convert/call_trees_test.py
rename to tensorflow/contrib/py2tf/converters/call_trees_test.py
index 78a6b53910..8cb8d7be0f 100644
--- a/tensorflow/contrib/py2tf/convert/call_trees_test.py
+++ b/tensorflow/contrib/py2tf/converters/call_trees_test.py
@@ -18,12 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import call_trees
+from tensorflow.contrib.py2tf.converters import call_trees
+from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -35,14 +32,7 @@ class TestNamer(call_trees.FunctionNamer):
     return 'renamed_%s' % original_name
 
 
-class CallTreesTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, None, None, {})
-    return node
+class CallTreesTest(converter_test_base.TestCase):
 
   def test_basic(self):
 
@@ -55,7 +45,7 @@ class CallTreesTest(test.TestCase):
     def test_fn_2(a):
       return test_fn_1(a) + 1
 
-    node = self._parse_and_analyze(test_fn_2, {'test_fn_1': test_fn_1})
+    node = self.parse_and_analyze(test_fn_2, {'test_fn_1': test_fn_1})
     node = call_trees.transform(node, TestNamer(), {}, (), ())
     result = compiler.ast_to_object(node)
     # Only test_fn_2 is transformed, so we'll insert renamed_test_fn_1 manually.
@@ -70,7 +60,7 @@ class CallTreesTest(test.TestCase):
       a = math_ops.add(a, constant_op.constant(1))
       return a
 
-    node = self._parse_and_analyze(test_fn, {
+    node = self.parse_and_analyze(test_fn, {
         'math_ops': math_ops,
         'constant_op': constant_op
     })
diff --git a/tensorflow/contrib/py2tf/convert/continue_canonicalization.py b/tensorflow/contrib/py2tf/converters/continue_canonicalization.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/continue_canonicalization.py
rename to tensorflow/contrib/py2tf/converters/continue_canonicalization.py
diff --git a/tensorflow/contrib/py2tf/convert/continue_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
similarity index 83%
rename from tensorflow/contrib/py2tf/convert/continue_canonicalization_test.py
rename to tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
index a041ff4641..c1fe903a2d 100644
--- a/tensorflow/contrib/py2tf/convert/continue_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
@@ -18,11 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import continue_canonicalization
-from tensorflow.contrib.py2tf.convert import control_flow
+from tensorflow.contrib.py2tf.converters import continue_canonicalization
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.python.platform import test
 
 
@@ -32,12 +31,7 @@ class TestNamer(control_flow.SymbolNamer):
     return name_root
 
 
-class ContinueCanonicalizationTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    return node
+class ContinueCanonicalizationTest(converter_test_base.TestCase):
 
   def test_basic_continue(self):
 
@@ -50,7 +44,7 @@ class ContinueCanonicalizationTest(test.TestCase):
         v.append(x)
       return v
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = continue_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
@@ -71,7 +65,7 @@ class ContinueCanonicalizationTest(test.TestCase):
         v.append(x)
       return v
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = continue_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
@@ -97,7 +91,7 @@ class ContinueCanonicalizationTest(test.TestCase):
         v.append(x)
       return v, u, w
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
     node = continue_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
diff --git a/tensorflow/contrib/py2tf/convert/control_flow.py b/tensorflow/contrib/py2tf/converters/control_flow.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/control_flow.py
rename to tensorflow/contrib/py2tf/converters/control_flow.py
diff --git a/tensorflow/contrib/py2tf/convert/control_flow_test.py b/tensorflow/contrib/py2tf/converters/control_flow_test.py
similarity index 79%
rename from tensorflow/contrib/py2tf/convert/control_flow_test.py
rename to tensorflow/contrib/py2tf/converters/control_flow_test.py
index 64a317ee9c..054e33750d 100644
--- a/tensorflow/contrib/py2tf/convert/control_flow_test.py
+++ b/tensorflow/contrib/py2tf/converters/control_flow_test.py
@@ -18,12 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import control_flow
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
@@ -40,14 +37,7 @@ class TestNamer(control_flow.SymbolNamer):
       i += 1
 
 
-class ControlFlowTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, None, None, {})
-    return node
+class ControlFlowTest(converter_test_base.TestCase):
 
   def test_simple_while(self):
 
@@ -59,7 +49,7 @@ class ControlFlowTest(test.TestCase):
         i += 1
       return s, i, n
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
@@ -75,7 +65,7 @@ class ControlFlowTest(test.TestCase):
         n -= 1
       return n
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
@@ -94,7 +84,7 @@ class ControlFlowTest(test.TestCase):
         b = 2 * n
       return a, b
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
@@ -112,7 +102,7 @@ class ControlFlowTest(test.TestCase):
         n = -n
       return n
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
diff --git a/tensorflow/contrib/py2tf/converters/converter_test_base.py b/tensorflow/contrib/py2tf/converters/converter_test_base.py
new file mode 100644
index 0000000000..ed006bad6d
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/converter_test_base.py
@@ -0,0 +1,48 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Base class for tests in this module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.pyct import context
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct.static_analysis import access
+from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
+from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+from tensorflow.python.platform import test
+
+
+class TestCase(test.TestCase):
+
+  def parse_and_analyze(self,
+                        test_fn,
+                        namespace,
+                        arg_types=None,
+                        include_type_analysis=True):
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=None,
+        source_file=None,
+        namespace=namespace,
+        arg_values=None,
+        arg_types=arg_types)
+    node = parser.parse_object(test_fn)
+    node = access.resolve(node)
+    node = live_values.resolve(node, namespace, {})
+    if include_type_analysis:
+      node = type_info.resolve(node, ctx)
+    return node
diff --git a/tensorflow/contrib/py2tf/convert/decorators.py b/tensorflow/contrib/py2tf/converters/decorators.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/decorators.py
rename to tensorflow/contrib/py2tf/converters/decorators.py
diff --git a/tensorflow/contrib/py2tf/convert/for_canonicalization.py b/tensorflow/contrib/py2tf/converters/for_canonicalization.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/for_canonicalization.py
rename to tensorflow/contrib/py2tf/converters/for_canonicalization.py
diff --git a/tensorflow/contrib/py2tf/convert/for_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
similarity index 75%
rename from tensorflow/contrib/py2tf/convert/for_canonicalization_test.py
rename to tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
index 8de2d1a0f8..a6e6350fd4 100644
--- a/tensorflow/contrib/py2tf/convert/for_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
@@ -18,11 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import control_flow
-from tensorflow.contrib.py2tf.convert import for_canonicalization
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import for_canonicalization
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.python.platform import test
 
 
@@ -32,12 +31,7 @@ class TestNamer(control_flow.SymbolNamer):
     return name_root
 
 
-class ControlFlowTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    return node
+class ControlFlowTest(converter_test_base.TestCase):
 
   def test_basic_for(self):
 
@@ -47,7 +41,7 @@ class ControlFlowTest(test.TestCase):
         s += e
       return s
 
-    node = self._parse_and_analyze(test_fn, {})
+    node = self.parse_and_analyze(test_fn, {})
     node = for_canonicalization.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
 
diff --git a/tensorflow/contrib/py2tf/convert/logical_expressions.py b/tensorflow/contrib/py2tf/converters/logical_expressions.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/logical_expressions.py
rename to tensorflow/contrib/py2tf/converters/logical_expressions.py
diff --git a/tensorflow/contrib/py2tf/convert/logical_expressions_test.py b/tensorflow/contrib/py2tf/converters/logical_expressions_test.py
similarity index 85%
rename from tensorflow/contrib/py2tf/convert/logical_expressions_test.py
rename to tensorflow/contrib/py2tf/converters/logical_expressions_test.py
index f07fa017b9..d711065099 100644
--- a/tensorflow/contrib/py2tf/convert/logical_expressions_test.py
+++ b/tensorflow/contrib/py2tf/converters/logical_expressions_test.py
@@ -18,21 +18,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import logical_expressions
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import logical_expressions
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class GradientsFunctionTest(test.TestCase):
+class GradientsFunctionTest(converter_test_base.TestCase):
 
   def test_equals(self):
 
     def test_fn(a, b):
       return a == b
 
-    node = parser.parse_object(test_fn)
+    node = self.parse_and_analyze(test_fn, {})
     node = logical_expressions.transform(node)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', math_ops)
@@ -46,7 +46,7 @@ class GradientsFunctionTest(test.TestCase):
     def test_fn(a, b, c):
       return (a or b) and (a or b or c)
 
-    node = parser.parse_object(test_fn)
+    node = self.parse_and_analyze(test_fn, {})
     node = logical_expressions.transform(node)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', math_ops)
diff --git a/tensorflow/contrib/py2tf/convert/print_functions.py b/tensorflow/contrib/py2tf/converters/print_functions.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/print_functions.py
rename to tensorflow/contrib/py2tf/converters/print_functions.py
diff --git a/tensorflow/contrib/py2tf/convert/print_functions_test.py b/tensorflow/contrib/py2tf/converters/print_functions_test.py
similarity index 65%
rename from tensorflow/contrib/py2tf/convert/print_functions_test.py
rename to tensorflow/contrib/py2tf/converters/print_functions_test.py
index 8b6c238aa4..475196ce10 100644
--- a/tensorflow/contrib/py2tf/convert/print_functions_test.py
+++ b/tensorflow/contrib/py2tf/converters/print_functions_test.py
@@ -20,30 +20,20 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf.convert import print_functions
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import print_functions
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.platform import test
 
 
-class PrintFunctionsTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, None, None, {})
-    return node
+class PrintFunctionsTest(converter_test_base.TestCase):
 
   def test_transform(self):
 
     def test_fn(a):
       print(a)
 
-    node = self._parse_and_analyze(test_fn, {'print': print})
+    node = self.parse_and_analyze(test_fn, {'print': print})
     node = print_functions.transform(node)
     result = compiler.ast_to_object(node)
 
diff --git a/tensorflow/contrib/py2tf/convert/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
similarity index 100%
rename from tensorflow/contrib/py2tf/convert/side_effect_guards.py
rename to tensorflow/contrib/py2tf/converters/side_effect_guards.py
diff --git a/tensorflow/contrib/py2tf/convert/side_effect_guards_test.py b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
similarity index 72%
rename from tensorflow/contrib/py2tf/convert/side_effect_guards_test.py
rename to tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
index 1715e9eb95..5c56973dc2 100644
--- a/tensorflow/contrib/py2tf/convert/side_effect_guards_test.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
@@ -18,12 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.convert import side_effect_guards
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import side_effect_guards
 from tensorflow.contrib.py2tf.pyct import compiler
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
-from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
-from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
@@ -37,14 +34,7 @@ class TestNamer(side_effect_guards.SymbolNamer):
     return name_root
 
 
-class SideEffectGuardsTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn, namespace):
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
-    node = type_info.resolve(node, None, None, {})
-    return node
+class SideEffectGuardsTest(converter_test_base.TestCase):
 
   def test_transform(self):
 
@@ -52,7 +42,7 @@ class SideEffectGuardsTest(test.TestCase):
       state_ops.assign(a, a + 1)
       return a
 
-    node = self._parse_and_analyze(test_fn, {'state_ops': state_ops})
+    node = self.parse_and_analyze(test_fn, {'state_ops': state_ops})
     node = side_effect_guards.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
     setattr(result, 'state_ops', state_ops)
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD
index 9dd564cb9f..e0331dbc97 100644
--- a/tensorflow/contrib/py2tf/pyct/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/BUILD
@@ -20,6 +20,7 @@ py_library(
         "__init__.py",
         "anno.py",
         "compiler.py",
+        "context.py",
         "parser.py",
         "pretty_printer.py",
         "templates.py",
diff --git a/tensorflow/contrib/py2tf/pyct/context.py b/tensorflow/contrib/py2tf/pyct/context.py
new file mode 100644
index 0000000000..73f3613d09
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/context.py
@@ -0,0 +1,42 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Conversion context containers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class EntityContext(object):
+  """Contains information about an entity, like source code.
+
+  Attributes:
+    namer: Namer that matches the contract of all converters.
+    source_code: The entity's source code.
+    source_file: The entity's source file.
+    namespace: Dict[str->*], containing symbols visible to the entity
+        (excluding parameters).
+    arg_values: Dict[str->*], containing parameter values, if known.
+    arg_types: Dict[str->*], containing parameter types, if known.
+  """
+
+  def __init__(self, namer, source_code, source_file, namespace, arg_values,
+               arg_types):
+    self.namer = namer
+    self.source_code = source_code
+    self.source_file = source_file
+    self.namespace = namespace
+    self.arg_values = {} if arg_values is None else arg_values
+    self.arg_types = {} if arg_types is None else arg_types
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
index b17af5d844..0042aa90ed 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
@@ -78,10 +78,9 @@ class TypeInfoResolver(transformer.Base):
     * Attribute (helps resolve object methods)
   """
 
-  def __init__(self, value_hints, source, f):
-    super(TypeInfoResolver, self).__init__(source, f)
+  def __init__(self, context):
+    super(TypeInfoResolver, self).__init__(context)
     self.scope = Scope(None)
-    self.value_hints = value_hints
     self.function_level = 0
 
   def visit_FunctionDef(self, node):
@@ -122,13 +121,11 @@ class TypeInfoResolver(transformer.Base):
     self.generic_visit(node)
     if isinstance(node.ctx, gast.Param):
       self.scope.setval(node.id, gast.Name(node.id, gast.Load(), None))
-      # TODO(mdan): Member functions should not need type hints.
-      # We could attemp to extract im_class from the live_val annotation.
-      if self.function_level == 1 and node.id in self.value_hints:
+      if self.function_level == 1 and node.id in self.context.arg_types:
         # Forge a node to hold the type information, so that method calls on
         # it can resolve the type.
         type_holder = gast.Name(node.id, gast.Load(), None)
-        type_string, type_obj = self.value_hints[node.id]
+        type_string, type_obj = self.context.arg_types[node.id]
         anno.setanno(type_holder, 'type', type_obj)
         anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.')))
         self.scope.setval(node.id, type_holder)
@@ -208,6 +205,5 @@ class TypeInfoResolver(transformer.Base):
     return node
 
 
-def resolve(node, source, f, value_hints):
-  assert value_hints is not None
-  return TypeInfoResolver(value_hints, source, f).visit(node)
+def resolve(node, context):
+  return TypeInfoResolver(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
index 98dc7bf50f..a491f49ca3 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct import transformer
 from tensorflow.contrib.py2tf.pyct.static_analysis import access
@@ -55,17 +56,27 @@ class ScopeTest(test.TestCase):
 
 class TypeInfoResolverTest(test.TestCase):
 
+  def _parse_and_analyze(self, test_fn, namespace, arg_types=None):
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=None,
+        source_file=None,
+        namespace=namespace,
+        arg_values=None,
+        arg_types=arg_types)
+    node = parser.parse_object(test_fn)
+    node = access.resolve(node)
+    node = live_values.resolve(node, namespace, {})
+    node = type_info.resolve(node, ctx)
+    return node
+
   def test_constructor_detection(self):
 
     def test_fn():
       opt = training.GradientDescentOptimizer(0.1)
       return opt
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
-    node = type_info.resolve(node, None, None, {})
-
+    node = self._parse_and_analyze(test_fn, {'training': training})
     call_node = node.body[0].body[0].value
     self.assertEquals(training.GradientDescentOptimizer,
                       anno.getanno(call_node, 'type'))
@@ -78,11 +89,7 @@ class TypeInfoResolverTest(test.TestCase):
       opt = training.GradientDescentOptimizer(0.1)
       opt.minimize(0)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
-    node = type_info.resolve(node, None, None, {})
-
+    node = self._parse_and_analyze(test_fn, {'training': training})
     attr_call_node = node.body[0].body[1].value.func
     self.assertEquals((training.__name__, 'GradientDescentOptimizer'),
                       anno.getanno(attr_call_node, 'type_fqn'))
@@ -93,11 +100,7 @@ class TypeInfoResolverTest(test.TestCase):
       with session.Session() as sess:
         sess.run(x)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'session': session}, {})
-    node = type_info.resolve(node, None, None, {})
-
+    node = self._parse_and_analyze(test_fn, {'session': session})
     constructor_call = node.body[0].body[0].items[0].context_expr
     self.assertEquals(session.Session, anno.getanno(constructor_call, 'type'))
     self.assertEquals((session.__name__, 'Session'),
@@ -116,33 +119,25 @@ class TypeInfoResolverTest(test.TestCase):
         opt = training.GradientDescentOptimizer(0.01)
       opt.minimize(0)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
     with self.assertRaises(transformer.PyFlowParseError):
-      node = type_info.resolve(node, None, None, {})
+      self._parse_and_analyze(test_fn, {'training': training})
 
   def test_parameter_class_members(self):
 
     def test_fn(opt):
       opt.minimize(0)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
     with self.assertRaises(transformer.PyFlowParseError):
-      node = type_info.resolve(node, None, None, {})
+      self._parse_and_analyze(test_fn, {'training': training})
 
   def test_parameter_class_members_with_value_hints(self):
 
     def test_fn(opt):
       opt.minimize(0)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
-    node = type_info.resolve(
-        node, None, None, {
+    node = self._parse_and_analyze(
+        test_fn, {'training': training},
+        arg_types={
             'opt': (('%s.GradientDescentOptimizer' % training.__name__),
                     training.GradientDescentOptimizer(0.1))
         })
@@ -161,11 +156,8 @@ class TypeInfoResolverTest(test.TestCase):
       foo = bar
       foo()
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'bar': bar}, {})
     with self.assertRaises(transformer.PyFlowParseError):
-      node = type_info.resolve(node, None, None, {})
+      self._parse_and_analyze(test_fn, {'bar': bar})
 
   def test_nested_members(self):
 
@@ -173,11 +165,8 @@ class TypeInfoResolverTest(test.TestCase):
       foo = training.GradientDescentOptimizer(0.1)
       foo.bar.baz()
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'training': training}, {})
     with self.assertRaises(transformer.PyFlowParseError):
-      node = type_info.resolve(node, None, None, {})
+      self._parse_and_analyze(test_fn, {'training': training})
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/pyct/transformer.py b/tensorflow/contrib/py2tf/pyct/transformer.py
index 1658a1b694..d5aa23eaeb 100644
--- a/tensorflow/contrib/py2tf/pyct/transformer.py
+++ b/tensorflow/contrib/py2tf/pyct/transformer.py
@@ -30,23 +30,29 @@ class PyFlowParseError(SyntaxError):
 class Base(gast.NodeTransformer):
   """Base class for specialized transformers."""
 
-  def __init__(self, source, f):
+  def __init__(self, context):
+    """Initialize the transformer. Subclasses should call this.
+
+    Args:
+      context: An EntityContext.
+    """
     self._lineno = 0
     self._col_offset = 0
-    self._source = source
-    self._file = f
+    self.context = context
 
   def visit(self, node):
     try:
-      if self._source and hasattr(node, 'lineno'):
+      source_code = self.context.source_code
+      source_file = self.context.source_file
+      if source_code and hasattr(node, 'lineno'):
         self._lineno = node.lineno
         self._col_offset = node.col_offset
       return super(Base, self).visit(node)
     except ValueError as e:
       msg = '%s\nOccurred at node:\n%s' % (str(e), pretty_printer.fmt(node))
-      if self._source:
+      if source_code:
         line = self._source.splitlines()[self._lineno - 1]
       else:
         line = '<no source available>'
       raise PyFlowParseError(
-          msg, (self._file, self._lineno, self._col_offset + 1, line))
+          msg, (source_file, self._lineno, self._col_offset + 1, line))
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index c789e2ba0c..598080ed27 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -152,7 +152,8 @@ sh_binary(
             "//tensorflow/contrib/nn:nn_py",
             "//tensorflow/contrib/predictor:predictor_pip",
             "//tensorflow/contrib/py2tf:py2tf_internal",
-            "//tensorflow/contrib/py2tf/convert:convert",
+            "//tensorflow/contrib/py2tf/converters:converters",
+            "//tensorflow/contrib/py2tf/converters:test_lib",
             "//tensorflow/contrib/py2tf/pyct:pyct",
             "//tensorflow/contrib/py2tf/pyct/static_analysis:static_analysis",
             "//tensorflow/contrib/receptive_field:receptive_field_pip",
-- 
GitLab


From faf3547da8bde4aa05ac65562250901f1784e562 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Wed, 24 Jan 2018 19:01:16 -0800
Subject: [PATCH 142/966] [XLA] Allow buffers for CustomCalls to be reused.

PiperOrigin-RevId: 183177944
---
 tensorflow/compiler/xla/service/buffer_assignment.cc     | 9 ++++-----
 tensorflow/compiler/xla/tests/local_client_aot_test.cc   | 3 +--
 .../compiler/xla/tests/local_client_aot_test_helper.cc   | 3 +--
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 33fe11b81d..323620c131 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -846,14 +846,13 @@ Status BufferAssigner::AssignBuffersForComputation(
       continue;
     }
 
-    if (is_thread_local || instruction->opcode() == HloOpcode::kCustomCall) {
-      // Custom call operations never have reusable buffers. Also we do not
-      // reuse thread-local buffers for now, because they are dynamically
-      // allocated and their lifetimes are hard to compute.
+    if (is_thread_local) {
+      // We do not reuse thread-local buffers for now, because they are
+      // dynamically allocated and their lifetimes are hard to compute.
       BufferAllocation* allocation = assignment->NewAllocation(
           *buffer, buffer_size, is_thread_local, /*is_reusable=*/false);
       VLOG(3) << "New allocation #" << allocation->index()
-              << " for thread-local/CustomCall: " << *buffer;
+              << " for thread-local: " << *buffer;
       continue;
     }
 
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test.cc b/tensorflow/compiler/xla/tests/local_client_aot_test.cc
index 569d5944ca..47cab79604 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test.cc
@@ -44,8 +44,7 @@ TEST_F(LocalClientAotTest, Constant) {
   OpaqueData opaque_data{100, 20, 3};
   void* parameters[] = {&opaque_data};
   float out = 0;
-  char tmp[4] = {0};
-  void* temporary_buffers[] = {nullptr, &out, &tmp};
+  void* temporary_buffers[] = {nullptr, &out};
   SumAndDouble(&out, &run_options, parameters, temporary_buffers);
   EXPECT_EQ(out, 246.0f);
 
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
index 4d3b513b09..3704ddd801 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
@@ -87,10 +87,9 @@ int main(int argc, char** argv) {
   // It's lame to hard-code the buffer assignments, but we need
   // local_client_aot_test.cc to be able to easily invoke the function.
   CHECK_EQ(result->result_buffer_index(), 1);
-  CHECK_EQ(result->buffer_sizes().size(), 3);
+  CHECK_EQ(result->buffer_sizes().size(), 2);
   CHECK_EQ(result->buffer_sizes()[0], -1);             // param buffer
   CHECK_EQ(result->buffer_sizes()[1], sizeof(float));  // result buffer
-  CHECK_EQ(result->buffer_sizes()[2], sizeof(float));  // temp buffer
   if (triple.isOSBinFormatELF()) {
     // Check the ELF magic.
     CHECK_EQ(result->object_file_data()[0], 0x7F);
-- 
GitLab


From b25e892311fbdb308f89605ede30fce1b138c7f6 Mon Sep 17 00:00:00 2001
From: Rui Zhao <rzhao@google.com>
Date: Wed, 24 Jan 2018 19:32:10 -0800
Subject: [PATCH 143/966] Allow passing candidate sampling seed in to
 sampled_softmax_loss for testing.

PiperOrigin-RevId: 183180196
---
 tensorflow/python/ops/nn_impl.py                | 16 ++++++++++++----
 tensorflow/tools/api/golden/tensorflow.nn.pbtxt |  2 +-
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 3268fd0e0a..55fcd176d6 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -969,7 +969,8 @@ def _compute_sampled_logits(weights,
                             subtract_log_q=True,
                             remove_accidental_hits=False,
                             partition_strategy="mod",
-                            name=None):
+                            name=None,
+                            seed=None):
   """Helper function for nce_loss and sampled_softmax_loss functions.
 
   Computes sampled output training logits and labels suitable for implementing
@@ -1007,6 +1008,8 @@ def _compute_sampled_logits(weights,
         if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
         Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
     name: A name for the operation (optional).
+    seed: random seed for candidate sampling. Default to None, which doesn't set
+        the op-level random seed for candidate sampling.
   Returns:
     out_logits: `Tensor` object with shape
         `[batch_size, num_true + num_sampled]`, for passing to either
@@ -1036,7 +1039,8 @@ def _compute_sampled_logits(weights,
           num_true=num_true,
           num_sampled=num_sampled,
           unique=True,
-          range_max=num_classes)
+          range_max=num_classes,
+          seed=seed)
     # NOTE: pylint cannot tell that 'sampled_values' is a sequence
     # pylint: disable=unpacking-non-sequence
     sampled, true_expected_count, sampled_expected_count = (
@@ -1255,7 +1259,8 @@ def sampled_softmax_loss(weights,
                          sampled_values=None,
                          remove_accidental_hits=True,
                          partition_strategy="mod",
-                         name="sampled_softmax_loss"):
+                         name="sampled_softmax_loss",
+                         seed=None):
   """Computes and returns the sampled softmax training loss.
 
   This is a faster way to train a softmax classifier over a huge number of
@@ -1316,6 +1321,8 @@ def sampled_softmax_loss(weights,
         if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
         Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
     name: A name for the operation (optional).
+    seed: random seed for candidate sampling. Default to None, which doesn't set
+        the op-level random seed for candidate sampling.
 
   Returns:
     A `batch_size` 1-D tensor of per-example sampled softmax losses.
@@ -1333,7 +1340,8 @@ def sampled_softmax_loss(weights,
       subtract_log_q=True,
       remove_accidental_hits=remove_accidental_hits,
       partition_strategy=partition_strategy,
-      name=name)
+      name=name,
+      seed=seed)
   sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
       labels=labels, logits=logits)
   # sampled_losses is a [batch_size] tensor.
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
index 8ce022e454..455590d866 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
@@ -262,7 +262,7 @@ tf_module {
   }
   member_method {
     name: "sampled_softmax_loss"
-    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\'], "
+    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\', \'seed\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\', \'None\'], "
   }
   member_method {
     name: "selu"
-- 
GitLab


From 1a6216e61e804019cd64732d6f95d4d9bbedb594 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 19:47:58 -0800
Subject: [PATCH 144/966] [XLA] Fix HloModule clone.

Currently the cloning of an instruction is usually "shallow": the called
computations of the instruction are reused in the clone. This mechanism is
useful when the hlo graph need to be modified in place (e.g. inliner, and some
hlo passes). One exception is the fusion instruction: it's always "deep"
copied, which means the fused computation is copied as well. So when we deep
cloning an HLO module, don't re-copy the fused computation, and do let the
instruction's clone function know where to put the copied fused computation.

PiperOrigin-RevId: 183181206
---
 tensorflow/compiler/xla/service/hlo_module.cc | 21 ++++++++--
 .../compiler/xla/service/hlo_module_test.cc   | 42 +++++++++++++++++++
 2 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 58bb942211..99d8dd04e5 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -523,7 +523,15 @@ std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
 
   std::unordered_map<HloComputation*, HloComputation*> clone_map;
   for (auto& computation : computations_) {
-    auto cloned_computation = computation->Clone(suffix);
+    if (computation->IsFusionComputation()) {
+      // Cloning of a fused computation is handled by its fusion instruction.
+      continue;
+    }
+
+    // When cloning a computation, pass in the new module, so that for any
+    // fusion instruction in this computation, the fused computation will be
+    // deep cloned to the new module.
+    auto cloned_computation = computation->Clone(suffix, module.get());
     InsertOrDie(&clone_map, computation.get(), cloned_computation.get());
 
     if (entry_computation_ == computation.get()) {
@@ -537,8 +545,15 @@ std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
     for (auto* instruction : cloned_computation->instructions()) {
       // Rewrite instruction's called_computation to point to the cloned
       // computations.
-      instruction->ReplaceCalledComputations(
-          [&](HloComputation* hlo) { return FindOrDie(clone_map, hlo); });
+      instruction->ReplaceCalledComputations([&](HloComputation* hlo) {
+        if (hlo->IsFusionComputation()) {
+          // Cloning of a fused computation has already been handled when its
+          // fusion instruction is cloned. So this hlo computation is already
+          // the cloned one.
+          return hlo;
+        }
+        return FindOrDie(clone_map, hlo);
+      });
     }
   }
   return module;
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index 0f5d3dccb7..cd51fa4e85 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -105,6 +105,48 @@ TEST_F(HloModuleTest, CloneTest) {
   }
 }
 
+TEST_F(HloModuleTest, CloneHasFusion) {
+  auto module = CreateNewModule();
+
+  // Create the fused computation.
+  HloComputation* fused_computation;
+  {
+    auto b = HloComputation::Builder("Fused");
+    auto x = b.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "x"));
+    b.AddInstruction(
+        HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, x, x));
+    fused_computation = module->AddEmbeddedComputation(b.Build());
+  }
+
+  // Create the entry computation.
+  {
+    auto b = HloComputation::Builder("Entry");
+    auto input = b.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
+    b.AddInstruction(
+        HloInstruction::CreateFusion(r0f32_, HloInstruction::FusionKind::kInput,
+                                     /*operands=*/{input}, fused_computation));
+    module->AddEntryComputation(b.Build());
+  }
+
+  auto post_order = module->MakeComputationPostOrder();
+  auto cloned_module = module->Clone("copy");
+  auto post_order_copied = cloned_module->MakeComputationPostOrder();
+
+  EXPECT_EQ(post_order.size(), post_order_copied.size());
+  for (auto origin = post_order.begin(), copied = post_order_copied.begin();
+       origin != post_order.end() && copied != post_order_copied.end();
+       ++origin, ++copied) {
+    if ((*origin)->name() == "Fused") {
+      // Clone of the fused computation is handled when its fusion instruction
+      // is cloned, which always use suffix ".clone".
+      EXPECT_EQ((*origin)->name() + ".clone", (*copied)->name());
+    } else {
+      EXPECT_EQ((*origin)->name() + ".copy", (*copied)->name());
+    }
+  }
+}
+
 TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
   // Create a module with a diamond call graph of computations.
   auto module = CreateNewModule();
-- 
GitLab


From 0d11ed25e49e9dfbb45c15c1af9a5892e9146768 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 20:13:14 -0800
Subject: [PATCH 145/966] Make the graph generation for TFBT deterministic.

PiperOrigin-RevId: 183183086
---
 tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
index 23168bf493..b281a4c6d1 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
@@ -104,7 +104,7 @@ def run_handler_scheduled_ops(per_handler_ops, stamp, worker_device):
   batched_ops = collections.defaultdict(list)
   # Group the ops by their batching_key. Ops that share the same batching key
   # can be executed together.
-  for handler in per_handler_ops.keys():
+  for handler in sorted(per_handler_ops.keys()):
     for op in per_handler_ops[handler]:
       batched_ops[(op.batching_key(), op.batch_runner_fn())].append(op)
   op_results = {}
-- 
GitLab


From b986f8944d22b922e9a5feb5b7234b9a0b1087ce Mon Sep 17 00:00:00 2001
From: Chris Leary <leary@google.com>
Date: Wed, 24 Jan 2018 20:51:33 -0800
Subject: [PATCH 146/966] [XLA] Add more tests for ConvertElementType.

PiperOrigin-RevId: 183185601
---
 tensorflow/compiler/xla/tests/unary_op_test.cc | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index fa4192e928..835e2d7e55 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -215,5 +215,23 @@ XLA_TEST_F(UnaryOpTest, SignAbsTestR2) {
   ComputeAndCompareR2<float>(&builder, {{0, 0}, {0, 0}}, {});
 }
 
+XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToS32) {
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<int32>({0, 1});
+  auto rhs = builder.ConstantR1<int32>({1, 1});
+  builder.ConvertElementType(builder.Eq(lhs, rhs), S32);
+
+  ComputeAndCompareR1<int32>(&builder, {0, 1}, {});
+}
+
+XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToF32) {
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<int32>({0, 1});
+  auto rhs = builder.ConstantR1<int32>({1, 1});
+  builder.ConvertElementType(builder.Eq(lhs, rhs), F32);
+
+  ComputeAndCompareR1<float>(&builder, {0.0, 1.0}, {});
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 2fad3428e3b13c963375970cbfa9eea554a16486 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 19 Dec 2017 15:50:58 -0800
Subject: [PATCH 147/966] Add comparison ufuncs for numpy bfloat16 type. Fix 2
 bfloat16 tests.

PiperOrigin-RevId: 179614898
---
 tensorflow/python/lib/core/bfloat16.cc      | 106 ++++++++++++++++++++
 tensorflow/python/lib/core/bfloat16_test.py |  18 ++++
 tensorflow/python/lib/core/numpy.h          |   1 +
 3 files changed, 125 insertions(+)

diff --git a/tensorflow/python/lib/core/bfloat16.cc b/tensorflow/python/lib/core/bfloat16.cc
index 4902978e2d..7f07deebef 100644
--- a/tensorflow/python/lib/core/bfloat16.cc
+++ b/tensorflow/python/lib/core/bfloat16.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <array>
+
 #include "tensorflow/python/lib/core/bfloat16.h"
 
 #include "tensorflow/core/framework/numeric_types.h"
@@ -477,8 +479,61 @@ bool RegisterBfloat16Cast(int numpy_type, bool cast_is_safe) {
   return true;
 }
 
+template <typename InType, typename OutType, typename Functor>
+void BinaryUFunc(char** args, npy_intp* dimensions, npy_intp* steps,
+                 void* data) {
+  const char* i0 = args[0];
+  const char* i1 = args[1];
+  char* o = args[2];
+  for (npy_intp k = 0; k < *dimensions; k++) {
+    InType x = *reinterpret_cast<const InType*>(i0);
+    InType y = *reinterpret_cast<const InType*>(i1);
+    *reinterpret_cast<OutType*>(o) = Functor()(x, y);
+    i0 += steps[0];
+    i1 += steps[1];
+    o += steps[2];
+  }
+}
+
+template <typename Functor>
+void CompareUFunc(char** args, npy_intp* dimensions, npy_intp* steps,
+                  void* data) {
+  BinaryUFunc<bfloat16, npy_bool, Functor>(args, dimensions, steps, data);
+}
+
+struct Bfloat16EqFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a == b; }
+};
+struct Bfloat16NeFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a != b; }
+};
+struct Bfloat16LtFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a < b; }
+};
+struct Bfloat16GtFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a > b; }
+};
+struct Bfloat16LeFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a <= b; }
+};
+struct Bfloat16GeFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a >= b; }
+};
+
 // Initializes the module.
 bool Initialize() {
+  // It's critical to import umath to avoid crash in open source build.
+  import_umath1(false);
+
+  Safe_PyObjectPtr numpy_str = make_safe(MakePyString("numpy"));
+  if (!numpy_str) {
+    return false;
+  }
+  Safe_PyObjectPtr numpy = make_safe(PyImport_Import(numpy_str.get()));
+  if (!numpy) {
+    return false;
+  }
+
   // We hit a mysterious crash if we haven't initialized numpy before this:
   PyBfloat16_Type.tp_base = &PyGenericArrType_Type;
 
@@ -536,6 +591,57 @@ bool Initialize() {
                                         /*cast_is_safe=*/true)) {
     return false;
   }
+
+  // Register ufuncs
+  auto register_ufunc = [&](const char* name, PyUFuncGenericFunction fn,
+                            const std::array<int, 3>& types) {
+    Safe_PyObjectPtr ufunc_obj =
+        make_safe(PyObject_GetAttrString(numpy.get(), name));
+    if (!ufunc_obj) {
+      return false;
+    }
+    PyUFuncObject* ufunc = reinterpret_cast<PyUFuncObject*>(ufunc_obj.get());
+    if (types.size() != ufunc->nargs) {
+      PyErr_Format(PyExc_AssertionError,
+                   "ufunc %s takes %d arguments, loop takes %lu", name,
+                   ufunc->nargs, types.size());
+      return false;
+    }
+    if (PyUFunc_RegisterLoopForType(ufunc, npy_bfloat16_, fn,
+                                    const_cast<int*>(types.data()),
+                                    nullptr) < 0) {
+      return false;
+    }
+    return true;
+  };
+
+  // Comparisons
+  const std::array<int, 3> compare_types = {npy_bfloat16_, npy_bfloat16_,
+                                            NPY_BOOL};
+
+  if (!register_ufunc("equal", CompareUFunc<Bfloat16EqFunctor>,
+                      compare_types)) {
+    return false;
+  }
+  if (!register_ufunc("not_equal", CompareUFunc<Bfloat16NeFunctor>,
+                      compare_types)) {
+    return false;
+  }
+  if (!register_ufunc("less", CompareUFunc<Bfloat16LtFunctor>, compare_types)) {
+    return false;
+  }
+  if (!register_ufunc("greater", CompareUFunc<Bfloat16GtFunctor>,
+                      compare_types)) {
+    return false;
+  }
+  if (!register_ufunc("less_equal", CompareUFunc<Bfloat16LeFunctor>,
+                      compare_types)) {
+    return false;
+  }
+  if (!register_ufunc("greater_equal", CompareUFunc<Bfloat16GeFunctor>,
+                      compare_types)) {
+    return false;
+  }
   return true;
 }
 
diff --git a/tensorflow/python/lib/core/bfloat16_test.py b/tensorflow/python/lib/core/bfloat16_test.py
index 0872348c51..985a11272c 100644
--- a/tensorflow/python/lib/core/bfloat16_test.py
+++ b/tensorflow/python/lib/core/bfloat16_test.py
@@ -172,6 +172,24 @@ class Bfloat16NumPyTest(test.TestCase):
     self.assertEqual("[[bfloat16(1) bfloat16(2) bfloat16(3)]]", str(x))
     self.assertAllEqual(x, x)
     self.assertAllClose(x, x)
+    self.assertTrue((x == x).all())
+
+  def testComparisons(self):
+    x = np.array([401408, 7, -32], dtype=np.float32)
+    bx = x.astype(bfloat16)
+    y = np.array([82432, 7, 0], dtype=np.float32)
+    by = y.astype(bfloat16)
+    self.assertAllEqual(x == y, bx == by)
+    self.assertAllEqual(x != y, bx != by)
+    self.assertAllEqual(x < y, bx < by)
+    self.assertAllEqual(x > y, bx > by)
+    self.assertAllEqual(x <= y, bx <= by)
+    self.assertAllEqual(x >= y, bx >= by)
+
+  def testEqual2(self):
+    a = np.array([401408], bfloat16)
+    b = np.array([82432], bfloat16)
+    self.assertFalse(a.__eq__(b))
 
   def testCasts(self):
     for dtype in [
diff --git a/tensorflow/python/lib/core/numpy.h b/tensorflow/python/lib/core/numpy.h
index 0eafe890db..25322b458b 100644
--- a/tensorflow/python/lib/core/numpy.h
+++ b/tensorflow/python/lib/core/numpy.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include <Python.h>
 
 #include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
 
 namespace tensorflow {
 
-- 
GitLab


From 72c420a32702f7a7638c0130a7d7dc1db4469840 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Jan 2018 22:33:14 -0800
Subject: [PATCH 148/966] Automated g4 rollback of changelist 183171572

PiperOrigin-RevId: 183192221
---
 .../batch_sequences_with_states_test.py       | 26 +------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index 04538405e4..2a0ef0e6b3 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -320,18 +320,6 @@ class BatchSequencesWithStatesTest(test.TestCase):
   def testNotAMultiple(self):
     num_unroll = 3  # Not a divisor of value_length -
     # so padding would have been necessary.
-
-    # Use placeholder_with_default in sequences to make sure we get runtime
-    # error instead of shape inference error
-    sequences = {
-        "seq1": array_ops.placeholder_with_default(self.sequences["seq1"],
-                                                   shape=(None, 5)),
-        "seq2": array_ops.placeholder_with_default(self.sequences["seq2"],
-                                                   shape=(None, 4, 2)),
-        "seq3": self.sequences["seq3"],
-        "seq4": self.sequences["seq4"],
-    }
-
     with self.test_session() as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    ".*should be a multiple of: 3, but saw "
@@ -342,7 +330,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
           with coord.stop_on_exception():
             next_batch = sqss.batch_sequences_with_states(
                 input_key=self.key,
-                input_sequences=sequences,
+                input_sequences=self.sequences,
                 input_context=self.context,
                 input_length=3,
                 initial_states=self.initial_states,
@@ -505,18 +493,6 @@ class BatchSequencesWithStatesTest(test.TestCase):
         expected_seq4_batch2=expected_seq4_batch2)
 
 
-class BatchSequencesWithStatesTestWithCApi(BatchSequencesWithStatesTest):
-
-  def setUp(self):
-    self._prev_value = ops._USE_C_API
-    ops._USE_C_API = True
-    super(BatchSequencesWithStatesTestWithCApi, self).setUp()
-
-  def tearDown(self):
-    super(BatchSequencesWithStatesTestWithCApi, self).tearDown()
-    ops._USE_C_API = self._prev_value
-
-
 class PaddingTest(test.TestCase):
 
   def testPaddingInvalidLengths(self):
-- 
GitLab


From e52f17b1e273eaafbddd2581c65a535a198918e0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Jan 2018 15:52:12 -0800
Subject: [PATCH 149/966] Enable bfloat16 for CPU kernels

PiperOrigin-RevId: 182124532
---
 tensorflow/contrib/batching/util/BUILD        |   1 +
 tensorflow/contrib/ffmpeg/default/BUILD       |   1 +
 .../contrib/tensor_forest/kernels/v4/BUILD    |   2 +
 tensorflow/core/BUILD                         |   8 +-
 tensorflow/core/framework/numeric_types.h     | 174 ++---------
 tensorflow/core/framework/register_types.h    |  18 +-
 tensorflow/core/kernels/concat_lib_cpu.cc     |   1 -
 tensorflow/core/kernels/concat_op.cc          |   1 -
 tensorflow/core/kernels/constant_op_gpu.cu.cc |  26 +-
 tensorflow/core/kernels/cross_op.cc           |   1 +
 tensorflow/core/kernels/fill_functor.cc       |   2 +
 .../core/kernels/save_restore_tensor.cc       |   6 +-
 tensorflow/core/kernels/slice_op.cc           |   3 -
 tensorflow/core/kernels/slice_op_cpu_impl.h   |   1 -
 tensorflow/core/kernels/split_lib_cpu.cc      |   1 -
 tensorflow/core/kernels/split_op.cc           |   2 -
 tensorflow/core/kernels/split_v_op.cc         |   1 -
 tensorflow/core/kernels/strided_slice_op.cc   |   1 -
 .../core/kernels/strided_slice_op_impl.h      |   1 -
 tensorflow/core/kernels/tensor_array_ops.cc   |   2 -
 tensorflow/core/kernels/transpose_op.cc       |   2 -
 tensorflow/core/lib/bfloat16/bfloat16.cc      |  25 ++
 tensorflow/core/lib/bfloat16/bfloat16.h       | 276 ++++++++++++++++++
 tensorflow/core/lib/hash/hash.h               |   7 +
 tensorflow/core/lib/strings/strcat.h          |   3 +
 .../core/platform/default/build_config.bzl    |   1 +
 tensorflow/core/platform/types.h              |   6 +
 tensorflow/python/BUILD                       |   1 +
 28 files changed, 381 insertions(+), 193 deletions(-)
 create mode 100644 tensorflow/core/lib/bfloat16/bfloat16.cc
 create mode 100644 tensorflow/core/lib/bfloat16/bfloat16.h

diff --git a/tensorflow/contrib/batching/util/BUILD b/tensorflow/contrib/batching/util/BUILD
index f33a08cb81..3ccd3e08d9 100644
--- a/tensorflow/contrib/batching/util/BUILD
+++ b/tensorflow/contrib/batching/util/BUILD
@@ -28,6 +28,7 @@ cc_library(
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:protos_all_cc",
+        "//third_party/eigen3",
     ],
 )
 
diff --git a/tensorflow/contrib/ffmpeg/default/BUILD b/tensorflow/contrib/ffmpeg/default/BUILD
index 949ae9ad9e..6b455567d7 100644
--- a/tensorflow/contrib/ffmpeg/default/BUILD
+++ b/tensorflow/contrib/ffmpeg/default/BUILD
@@ -19,6 +19,7 @@ cc_library(
     ],
     deps = [
         "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
         "@protobuf_archive//:protobuf_headers",
     ],
 )
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
index b7876e1df6..794b76d858 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
@@ -302,6 +302,7 @@ cc_library(
             "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc",
         ],
         [
+            "//third_party/eigen3",
             "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc_headers_only",
             "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc_headers_only",
         ],
@@ -322,6 +323,7 @@ cc_library(
     srcs = ["params.cc"],
     hdrs = ["params.h"],
     deps = [
+        "//third_party/eigen3",
         "//tensorflow/core:framework_headers_lib",
     ] + if_static(
         [
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 2956aae2e9..497281041f 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -274,7 +274,8 @@ cc_library(
         "platform/platform.h",
         "platform/protobuf.h",
         "platform/types.h",
-    ] + glob(tf_additional_proto_hdrs()) + glob(tf_env_time_hdrs()),
+        "lib/bfloat16/bfloat16.h",
+    ] + tf_additional_proto_hdrs() + glob(tf_env_time_hdrs()),
     copts = tf_copts(),
     deps = tf_lib_proto_parsing_deps(),
 )
@@ -286,6 +287,7 @@ cc_library(
 cc_library(
     name = "lib",
     hdrs = [
+        "lib/bfloat16/bfloat16.h",
         "lib/core/arena.h",
         "lib/core/bitmap.h",
         "lib/core/bits.h",
@@ -549,6 +551,7 @@ cc_library(
         "framework/numeric_types.h",
         "framework/tensor_types.h",
         "framework/type_traits.h",
+        "lib/bfloat16/bfloat16.h",
         "platform/default/dynamic_annotations.h",
         "platform/default/integral_types.h",
         "platform/default/logging.h",
@@ -1562,6 +1565,7 @@ cc_library(
         "platform/jpeg.h",
     ]),
     hdrs = [
+        "lib/bfloat16/bfloat16.h",
         "lib/core/stringpiece.h",
         "lib/jpeg/jpeg_handle.h",
         "lib/jpeg/jpeg_mem.h",
@@ -1589,6 +1593,7 @@ cc_library(
         "platform/gif.h",
     ]),
     hdrs = [
+        "lib/bfloat16/bfloat16.h",
         "lib/core/stringpiece.h",
         "lib/gif/gif_io.h",
         "lib/gtl/cleanup.h",
@@ -1616,6 +1621,7 @@ cc_library(
         "platform/png.h",
     ]),
     hdrs = [
+        "lib/bfloat16/bfloat16.h",
         "lib/core/casts.h",
         "lib/core/stringpiece.h",
         "lib/png/png_io.h",
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index e7268fd7a7..988a18da0e 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -41,165 +41,39 @@ typedef Eigen::QInt32 qint32;
 typedef Eigen::QInt16 qint16;
 typedef Eigen::QUInt16 quint16;
 
-// see framework/bfloat16.h for description.
-struct bfloat16 {
-  EIGEN_DEVICE_FUNC bfloat16() {}
-
-  EIGEN_DEVICE_FUNC explicit bfloat16(const float v) {
-    if (Eigen::numext::isnan(v)) {
-      value = NAN_VALUE;
-      return;
-    }
-    const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    value = p[0];
-#else
-    value = p[1];
-#endif
-  }
-
-  // Following the convention of numpy, converting between complex and
-  // float will lead to loss of imag value.
-  explicit EIGEN_DEVICE_FUNC bfloat16(const complex64& val)
-      : bfloat16(val.real()) {}
-
-  explicit EIGEN_DEVICE_FUNC bfloat16(const complex128& val)
-      : bfloat16(static_cast<float>(val.real())) {}
-
-  template <class T>
-  explicit EIGEN_DEVICE_FUNC bfloat16(const T& val)
-      : bfloat16(static_cast<float>(val)) {}
-
-  EIGEN_DEVICE_FUNC explicit operator float() const {
-    float result;
-
-    uint16_t* q = reinterpret_cast<uint16_t*>(&result);
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    q[0] = value;
-    q[1] = 0;
-#else
-    q[0] = 0;
-    q[1] = value;
-#endif
-    return result;
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator bool() const {
-    return static_cast<bool>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator Eigen::half() const {
-    return static_cast<Eigen::half>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator short() const {
-    return static_cast<short>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator int() const {
-    return static_cast<int>(float(*this));
-  }
+}  // namespace tensorflow
 
-  EIGEN_DEVICE_FUNC explicit operator long() const {
-    return static_cast<long>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator char() const {
-    return static_cast<char>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator signed char() const {
-    return static_cast<signed char>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator unsigned char() const {
-    return static_cast<unsigned char>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator unsigned int() const {
-    return static_cast<unsigned int>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator unsigned long() const {
-    return static_cast<unsigned long>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator unsigned long long() const {
-    return static_cast<unsigned long long>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator long long() const {
-    return static_cast<long long>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator double() const {
-    return static_cast<double>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator complex64() const {
-    return complex64(float(*this), float(0.0));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator complex128() const {
-    return complex128(double(*this), double(0.0));
-  }
-
-  static bfloat16 epsilon() {
-    bfloat16 x;
-    x.value = 0x3c00;  // 0x1.0p-7
-    return x;
-  }
+namespace Eigen {
+// TOOD(xpan): We probably need to overwrite more methods to have correct eigen
+// behavior. E.g. loest(), is_integer, etc. See NumTraits.h in eigen.
+template <>
+struct NumTraits<tensorflow::bfloat16>
+    : GenericNumTraits<tensorflow::bfloat16> {};
 
-  uint16_t value;
+using ::tensorflow::operator==;
+using ::tensorflow::operator!=;
 
-  // A value that represents "not a number".
-  static const uint16_t NAN_VALUE = 0x7FC0;
-};
+namespace numext {
 
-inline bfloat16 operator+(bfloat16 a, bfloat16 b) {
-  return bfloat16(static_cast<float>(a) + static_cast<float>(b));
-}
-inline bfloat16 operator-(bfloat16 a, bfloat16 b) {
-  return bfloat16(static_cast<float>(a) - static_cast<float>(b));
-}
-inline bfloat16 operator*(bfloat16 a, bfloat16 b) {
-  return bfloat16(static_cast<float>(a) * static_cast<float>(b));
-}
-inline bfloat16 operator/(bfloat16 a, bfloat16 b) {
-  return bfloat16(static_cast<float>(a) / static_cast<float>(b));
-}
-inline bfloat16 operator-(bfloat16 a) {
-  a.value ^= 0x8000;
-  return a;
-}
-inline bool operator<(bfloat16 a, bfloat16 b) {
-  return static_cast<float>(a) < static_cast<float>(b);
-}
-inline bool operator<=(bfloat16 a, bfloat16 b) {
-  return static_cast<float>(a) <= static_cast<float>(b);
-}
-inline bool operator==(bfloat16 a, bfloat16 b) {
-  return static_cast<float>(a) == static_cast<float>(b);
-}
-inline bool operator!=(bfloat16 a, bfloat16 b) {
-  return static_cast<float>(a) != static_cast<float>(b);
-}
-inline bool operator>(bfloat16 a, bfloat16 b) {
-  return static_cast<float>(a) > static_cast<float>(b);
-}
-inline bool operator>=(bfloat16 a, bfloat16 b) {
-  return static_cast<float>(a) >= static_cast<float>(b);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE tensorflow::bfloat16 log(
+    const tensorflow::bfloat16& x) {
+  return static_cast<tensorflow::bfloat16>(::logf(static_cast<float>(x)));
 }
 
-}  // end namespace tensorflow
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE tensorflow::bfloat16 exp(
+    const tensorflow::bfloat16& x) {
+  return static_cast<tensorflow::bfloat16>(::expf(static_cast<float>(x)));
+}
 
-namespace Eigen {
 template <>
-struct NumTraits<tensorflow::bfloat16> : GenericNumTraits<uint16_t> {};
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE tensorflow::bfloat16 abs(
+    const tensorflow::bfloat16& x) {
+  return static_cast<tensorflow::bfloat16>(::fabsf(static_cast<float>(x)));
+}
 
-using ::tensorflow::operator==;
-using ::tensorflow::operator!=;
+}  // namespace numext
 }  // namespace Eigen
 
 #ifdef COMPILER_MSVC
diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index 0f186a7a06..320531f03a 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -155,11 +155,16 @@ limitations under the License.
       TF_CALL_uint8(m) TF_CALL_int8(m)
 
 #define TF_CALL_REAL_NUMBER_TYPES(m) \
+  TF_CALL_INTEGRAL_TYPES(m)          \
+  TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)
+
+#define TF_CALL_REAL_NUMBER_TYPES_NO_BFLOAT16(m) \
   TF_CALL_INTEGRAL_TYPES(m) TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m)
 
-#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m)                         \
-  TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m) TF_CALL_int64(m) \
-      TF_CALL_uint16(m) TF_CALL_int16(m) TF_CALL_uint8(m) TF_CALL_int8(m)
+#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m)                              \
+  TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)   \
+      TF_CALL_int64(m) TF_CALL_uint16(m) TF_CALL_int16(m) TF_CALL_uint8(m) \
+          TF_CALL_int8(m)
 
 // Call "m" for all number types, including complex64 and complex128.
 #define TF_CALL_NUMBER_TYPES(m) \
@@ -194,6 +199,13 @@ limitations under the License.
 #define TF_CALL_QUANTIZED_TYPES(m) \
   TF_CALL_qint8(m) TF_CALL_quint8(m) TF_CALL_qint32(m)
 
+// Types used for save and restore ops.
+#define TF_CALL_SAVE_RESTORE_TYPES(m)                                     \
+  TF_CALL_INTEGRAL_TYPES(m)                                               \
+  TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m) TF_CALL_complex64(m) \
+      TF_CALL_complex128(m) TF_CALL_bool(m) TF_CALL_string(m)             \
+          TF_CALL_QUANTIZED_TYPES(m)
+
 #ifdef TENSORFLOW_SYCL_NO_DOUBLE
 #define TF_CALL_SYCL_double(m)
 #else  // TENSORFLOW_SYCL_NO_DOUBLE
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index 743e3acfd5..43731114c0 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -72,7 +72,6 @@ REGISTER(qint8)
 REGISTER(quint16)
 REGISTER(qint16)
 REGISTER(qint32)
-REGISTER(bfloat16)
 TF_CALL_variant(REGISTER)
 
 #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) && \
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index 8e480aa995..ae1b5da32e 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -172,7 +172,6 @@ REGISTER_CONCAT(qint8);
 REGISTER_CONCAT(quint16);
 REGISTER_CONCAT(qint16);
 REGISTER_CONCAT(qint32);
-REGISTER_CONCAT(bfloat16);
 
 #undef REGISTER_CONCAT
 
diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc
index d1a1e34ec3..3487606778 100644
--- a/tensorflow/core/kernels/constant_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/constant_op_gpu.cu.cc
@@ -77,7 +77,7 @@ struct FillFunctor<GPUDevice, T> {
 
 #define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
 TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
-DEFINE_FILL_GPU(bool);
+TF_CALL_bool(DEFINE_FILL_GPU);
 #undef DEFINE_FILL_GPU
 
 // Partial specialization of FillFunctor<Device=GPUDevice, T>.
@@ -88,15 +88,9 @@ struct SetZeroFunctor<GPUDevice, T> {
   }
 };
 
-#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>
-DEFINE_SETZERO_GPU(bool);
-DEFINE_SETZERO_GPU(Eigen::half);
-DEFINE_SETZERO_GPU(float);
-DEFINE_SETZERO_GPU(double);
-DEFINE_SETZERO_GPU(complex64);
-DEFINE_SETZERO_GPU(complex128);
-DEFINE_SETZERO_GPU(int32);
-DEFINE_SETZERO_GPU(int64);
+#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>;
+TF_CALL_NUMBER_TYPES(DEFINE_SETZERO_GPU);
+TF_CALL_bool(DEFINE_SETZERO_GPU);
 #undef DEFINE_SETZERO_GPU
 
 // Partial specialization of FillFunctor<Device=GPUDevice, T>.
@@ -107,15 +101,9 @@ struct SetOneFunctor<GPUDevice, T> {
   }
 };
 
-#define DEFINE_SETONE_GPU(T) template struct SetOneFunctor<GPUDevice, T>
-DEFINE_SETONE_GPU(bool);
-DEFINE_SETONE_GPU(Eigen::half);
-DEFINE_SETONE_GPU(float);
-DEFINE_SETONE_GPU(double);
-DEFINE_SETONE_GPU(complex64);
-DEFINE_SETONE_GPU(complex128);
-DEFINE_SETONE_GPU(int32);
-DEFINE_SETONE_GPU(int64);
+#define DEFINE_SETONE_GPU(T) template struct SetOneFunctor<GPUDevice, T>;
+TF_CALL_NUMBER_TYPES(DEFINE_SETONE_GPU);
+TF_CALL_bool(DEFINE_SETONE_GPU);
 #undef DEFINE_SETONE_GPU
 
 }  // end namespace functor
diff --git a/tensorflow/core/kernels/cross_op.cc b/tensorflow/core/kernels/cross_op.cc
index 05a33a97b4..b29524f1f9 100644
--- a/tensorflow/core/kernels/cross_op.cc
+++ b/tensorflow/core/kernels/cross_op.cc
@@ -105,6 +105,7 @@ TF_CALL_REAL_NUMBER_TYPES(DECLARE_GPU_KERNEL);
   REGISTER_KERNEL_BUILDER(                                        \
       Name("Cross").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
       CrossOp<GPUDevice, type>);
+
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 #endif
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index ea0cc139f3..6bc004c236 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -41,6 +41,7 @@ void SetZeroFunctor<Eigen::ThreadPoolDevice, string>::operator()(
   template struct SetZeroFunctor<Eigen::ThreadPoolDevice, T>;
 DEFINE_SETZERO_CPU(bool);
 DEFINE_SETZERO_CPU(Eigen::half);
+DEFINE_SETZERO_CPU(bfloat16);
 DEFINE_SETZERO_CPU(float);
 DEFINE_SETZERO_CPU(double);
 DEFINE_SETZERO_CPU(uint8);
@@ -85,6 +86,7 @@ void SetOneFunctor<Eigen::ThreadPoolDevice, T>::operator()(
   template struct SetOneFunctor<Eigen::ThreadPoolDevice, T>;
 DEFINE_SETONE_CPU(bool);
 DEFINE_SETONE_CPU(Eigen::half);
+DEFINE_SETONE_CPU(bfloat16);
 DEFINE_SETONE_CPU(float);
 DEFINE_SETONE_CPU(double);
 DEFINE_SETONE_CPU(uint8);
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index 6b06cf650a..2a0d94c8bc 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -109,8 +109,7 @@ void SaveTensors(
     break;
 
     switch (input.dtype()) {
-      TF_CALL_POD_STRING_TYPES(WRITER_ADD)
-      TF_CALL_QUANTIZED_TYPES(WRITER_ADD)
+      TF_CALL_SAVE_RESTORE_TYPES(WRITER_ADD)
       default:
         context->SetStatus(errors::Unimplemented("Saving data type ",
                                                  DataTypeString(input.dtype()),
@@ -225,8 +224,7 @@ void RestoreTensor(OpKernelContext* context,
     break;
 
   switch (type) {
-    TF_CALL_POD_STRING_TYPES(READER_COPY)
-    TF_CALL_QUANTIZED_TYPES(READER_COPY)
+    TF_CALL_SAVE_RESTORE_TYPES(READER_COPY)
     default:
       context->SetStatus(errors::Unimplemented(
           "Restoring data type ", DataTypeString(type), " not yet supported"));
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index d46701749b..12c6901067 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -439,7 +439,6 @@ namespace functor {
   DECLARE_CPU_SPEC(T, 7);
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N);
-DECLARE_FOR_N(bfloat16);
 
 #undef DECLARE_FOR_N
 #undef DECLARE_CPU_SPEC
@@ -456,7 +455,6 @@ DECLARE_FOR_N(bfloat16);
 
 TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
 TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
-REGISTER_SLICE(bfloat16);
 #undef REGISTER_SLICE
 #else
 #define REGISTER_SLICE(type)                             \
@@ -469,7 +467,6 @@ REGISTER_SLICE(bfloat16);
 
 TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
 TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
-REGISTER_SLICE(bfloat16);
 #undef REGISTER_SLICE
 #endif  // INTEL_MKL
 
diff --git a/tensorflow/core/kernels/slice_op_cpu_impl.h b/tensorflow/core/kernels/slice_op_cpu_impl.h
index a70805658e..58dc7df3e0 100644
--- a/tensorflow/core/kernels/slice_op_cpu_impl.h
+++ b/tensorflow/core/kernels/slice_op_cpu_impl.h
@@ -30,7 +30,6 @@ using CpuDevice = Eigen::ThreadPoolDevice;
   template struct functor::Slice<CpuDevice, T, CPU_PROVIDED_IXDIM>;
 
 TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS);
-DEFINE_CPU_KERNELS(bfloat16);
 
 #undef DEFINE_CPU_KERNELS
 
diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc
index 6583f96a91..25026208d1 100644
--- a/tensorflow/core/kernels/split_lib_cpu.cc
+++ b/tensorflow/core/kernels/split_lib_cpu.cc
@@ -41,7 +41,6 @@ void Split<Eigen::ThreadPoolDevice, T>::operator()(
 
 TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS)
 DEFINE_CPU_KERNELS(quint8)
-DEFINE_CPU_KERNELS(bfloat16)
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index 094ba8bb86..58e1a73be6 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -360,8 +360,6 @@ class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
 
 TF_CALL_ALL_TYPES(REGISTER_SPLIT);
 REGISTER_SPLIT(quint8);
-// TODO(xpan): Merge bfloat16 into TF_CALL_ALL_TYPES
-REGISTER_SPLIT(bfloat16);
 
 #undef REGISTER_SPLIT
 
diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc
index 3316e5fcc9..f1078ac349 100644
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@@ -406,7 +406,6 @@ class SplitVOpGPU : public SplitVOpBase<GPUDevice, T, Tlen> {
   REGISTER_SPLIT(type, int64);
 
 TF_CALL_ALL_TYPES(REGISTER_SPLIT_LEN);
-REGISTER_SPLIT_LEN(bfloat16);
 
 #undef REGISTER_SPLIT_LEN
 #undef REGISTER_SPLIT
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 73b6d4cf6a..7c213e14d2 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -386,7 +386,6 @@ class StridedSliceAssignOp : public OpKernel {
                           StridedSliceAssignOp<CPUDevice, type>)
 
 TF_CALL_ALL_TYPES(REGISTER_STRIDED_SLICE);
-REGISTER_STRIDED_SLICE(bfloat16);
 
 #undef REGISTER_STRIDED_SLICE
 
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index afe3a051e6..a84ba38ef4 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -288,7 +288,6 @@ DECLARE_FOR_N_GPU(int64);
 #endif  // END GOOGLE_CUDA
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
-DECLARE_FOR_N_CPU(bfloat16);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define PREVENT_FOR_N_SYCL(T) \
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index cca6d0e35f..9a3d2e124e 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -709,7 +709,6 @@ TF_CALL_POD_STRING_TYPES(REGISTER_GATHER_AND_PACK);
 REGISTER_GATHER_AND_PACK(quint8);
 REGISTER_GATHER_AND_PACK(qint8);
 REGISTER_GATHER_AND_PACK(qint32);
-REGISTER_GATHER_AND_PACK(bfloat16);
 
 #undef REGISTER_GATHER_AND_PACK
 
@@ -940,7 +939,6 @@ TF_CALL_POD_STRING_TYPES(REGISTER_CONCAT);
 REGISTER_CONCAT(quint8);
 REGISTER_CONCAT(qint8);
 REGISTER_CONCAT(qint32);
-REGISTER_CONCAT(bfloat16);
 
 #undef REGISTER_CONCAT
 
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 96c051c636..2e0d18b634 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -230,7 +230,6 @@ Status ConjugateTransposeCpuOp::DoTranspose(OpKernelContext* ctx,
                               .HostMemory("perm"),    \
                           MklConjugateTransposeCpuOp);
 TF_CALL_ALL_TYPES(REGISTER);
-REGISTER(bfloat16);
 #undef REGISTER
 
 #else  // INTEL_MKL
@@ -247,7 +246,6 @@ REGISTER(bfloat16);
                               .HostMemory("perm"),    \
                           ConjugateTransposeCpuOp);
 TF_CALL_ALL_TYPES(REGISTER)
-REGISTER(bfloat16);
 #undef REGISTER
 #endif  // INTEL_MKL
 
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.cc b/tensorflow/core/lib/bfloat16/bfloat16.cc
new file mode 100644
index 0000000000..a591717fd1
--- /dev/null
+++ b/tensorflow/core/lib/bfloat16/bfloat16.cc
@@ -0,0 +1,25 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
+
+#include "third_party/eigen3/Eigen/Core"
+
+namespace tensorflow {
+
+B16_DEVICE_FUNC bfloat16::operator Eigen::half() const {
+  return static_cast<Eigen::half>(float(*this));
+}
+}  // end namespace tensorflow
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
new file mode 100644
index 0000000000..f9cca0ef2a
--- /dev/null
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -0,0 +1,276 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_
+#define TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_
+
+#include <complex>
+
+#ifdef __CUDACC__
+// All functions callable from CUDA code must be qualified with __device__
+#define B16_DEVICE_FUNC __host__ __device__
+
+#else
+#define B16_DEVICE_FUNC
+
+#endif
+
+namespace Eigen {
+struct half;
+}
+
+namespace tensorflow {
+
+// Single precision complex.
+typedef std::complex<float> complex64;
+// Double precision complex.
+typedef std::complex<double> complex128;
+
+// see framework/bfloat16.h for description.
+struct bfloat16 {
+  B16_DEVICE_FUNC bfloat16() {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const float v) {
+    if (float_isnan(v)) {
+      value = NAN_VALUE;
+      return;
+    }
+    const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    value = p[0];
+#else
+    value = p[1];
+#endif
+  }
+
+  B16_DEVICE_FUNC explicit bfloat16(const double val)
+      : bfloat16(static_cast<float>(val)) {}
+  // Following the convention of numpy, converting between complex and
+  // float will lead to loss of imag value.
+  B16_DEVICE_FUNC explicit bfloat16(const complex64& val)
+      : bfloat16(val.real()) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const complex128& val)
+      : bfloat16(static_cast<float>(val.real())) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const unsigned short val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const unsigned int val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const int val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const long val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const long long val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  template <class T>
+  B16_DEVICE_FUNC explicit bfloat16(const T& val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  B16_DEVICE_FUNC explicit operator float() const {
+    float result;
+
+    uint16_t* q = reinterpret_cast<uint16_t*>(&result);
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    q[0] = value;
+    q[1] = 0;
+#else
+    q[0] = 0;
+    q[1] = value;
+#endif
+    return result;
+  }
+
+  B16_DEVICE_FUNC explicit operator bool() const {
+    return static_cast<bool>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator Eigen::half() const;
+
+  B16_DEVICE_FUNC explicit operator short() const {
+    return static_cast<short>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator int() const {
+    return static_cast<int>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator long() const {
+    return static_cast<long>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator char() const {
+    return static_cast<char>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator signed char() const {
+    return static_cast<signed char>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator unsigned char() const {
+    return static_cast<unsigned char>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator unsigned short() const {
+    return static_cast<unsigned short>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator unsigned int() const {
+    return static_cast<unsigned int>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator unsigned long() const {
+    return static_cast<unsigned long>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator unsigned long long() const {
+    return static_cast<unsigned long long>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator long long() const {
+    return static_cast<long long>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator double() const {
+    return static_cast<double>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator complex64() const {
+    return complex64(float(*this), float(0.0));
+  }
+
+  B16_DEVICE_FUNC explicit operator complex128() const {
+    return complex128(double(*this), double(0.0));
+  }
+
+  static bfloat16 epsilon() {
+    bfloat16 x;
+    x.value = 0x3c00;  // 0x1.0p-7
+    return x;
+  }
+
+  uint16_t value;
+
+  // A value that represents "not a number".
+  static const uint16_t NAN_VALUE = 0x7FC0;
+
+ private:
+  B16_DEVICE_FUNC bool float_isnan(const float& x) {
+#ifdef __CUDA_ARCH__
+    return ::isnan(x);
+#else
+    return std::isnan(x);
+#endif
+  }
+};
+
+B16_DEVICE_FUNC inline std::ostream& operator<<(std::ostream& os,
+                                                const bfloat16& dt) {
+  os << static_cast<float>(dt);
+  return os;
+}
+
+B16_DEVICE_FUNC inline bfloat16 operator+(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) + static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator+(bfloat16 a, int b) {
+  return bfloat16(static_cast<float>(a) + static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator+(int a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) + static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator-(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) - static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator*(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) * static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator/(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) / static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator-(bfloat16 a) {
+  a.value ^= 0x8000;
+  return a;
+}
+B16_DEVICE_FUNC inline bool operator<(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) < static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bool operator<=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) <= static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bool operator==(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) == static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bool operator!=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) != static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bool operator>(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) > static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bool operator>=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) >= static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bfloat16& operator+=(bfloat16& a, bfloat16 b) {
+  a = a + b;
+  return a;
+}
+B16_DEVICE_FUNC inline bfloat16& operator-=(bfloat16& a, bfloat16 b) {
+  a = a - b;
+  return a;
+}
+B16_DEVICE_FUNC inline bfloat16 operator++(bfloat16& a) {
+  a += bfloat16(1);
+  return a;
+}
+B16_DEVICE_FUNC inline bfloat16 operator--(bfloat16& a) {
+  a -= bfloat16(1);
+  return a;
+}
+B16_DEVICE_FUNC inline bfloat16 operator++(bfloat16& a, int) {
+  bfloat16 original_value = a;
+  ++a;
+  return original_value;
+}
+B16_DEVICE_FUNC inline bfloat16 operator--(bfloat16& a, int) {
+  bfloat16 original_value = a;
+  --a;
+  return original_value;
+}
+B16_DEVICE_FUNC inline bfloat16& operator*=(bfloat16& a, bfloat16 b) {
+  a = a * b;
+  return a;
+}
+B16_DEVICE_FUNC inline bfloat16& operator/=(bfloat16& a, bfloat16 b) {
+  a = a / b;
+  return a;
+}
+}  // end namespace tensorflow
+
+namespace std {
+template <>
+struct hash<tensorflow::bfloat16> {
+  size_t operator()(const tensorflow::bfloat16& v) const {
+    return hash<float>()(static_cast<float>(v));
+  }
+};
+}  // namespace std
+
+#endif  // TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_
diff --git a/tensorflow/core/lib/hash/hash.h b/tensorflow/core/lib/hash/hash.h
index 0fb12966af..4d312ab7e8 100644
--- a/tensorflow/core/lib/hash/hash.h
+++ b/tensorflow/core/lib/hash/hash.h
@@ -64,6 +64,13 @@ struct hash<T*> {
   }
 };
 
+template <>
+struct hash<bfloat16> {
+  size_t operator()(const bfloat16& t) const {
+    return std::hash<float>()(static_cast<float>(t));
+  }
+};
+
 template <>
 struct hash<string> {
   size_t operator()(const string& s) const {
diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h
index 8e35549ed4..5835b0101d 100644
--- a/tensorflow/core/lib/strings/strcat.h
+++ b/tensorflow/core/lib/strings/strcat.h
@@ -119,6 +119,9 @@ class AlphaNum {
 
   AlphaNum(float f)  // NOLINT(runtime/explicit)
       : piece_(digits_, strlen(FloatToBuffer(f, digits_))) {}
+  AlphaNum(bfloat16 f)  // NOLINT(runtime/explicit)
+      : piece_(digits_, strlen(FloatToBuffer(static_cast<float>(f), digits_))) {
+  }
   AlphaNum(double f)  // NOLINT(runtime/explicit)
       : piece_(digits_, strlen(DoubleToBuffer(f, digits_))) {}
 
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 948334d27b..4ff750909e 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -509,6 +509,7 @@ def tf_additional_cloud_kernel_deps():
 def tf_lib_proto_parsing_deps():
   return [
       ":protos_all_cc",
+      "//third_party/eigen3",
       "//tensorflow/core/platform/default/build_config:proto_parsing",
   ]
 
diff --git a/tensorflow/core/platform/types.h b/tensorflow/core/platform/types.h
index 93b82ecb7a..41428bb2c8 100644
--- a/tensorflow/core/platform/types.h
+++ b/tensorflow/core/platform/types.h
@@ -29,6 +29,12 @@ limitations under the License.
 #error Define the appropriate PLATFORM_<foo> macro for this platform
 #endif
 
+#if defined(PLATFORM_WINDOWS)
+#include "tensorflow/core/platform/windows/cpu_info.h"
+#endif
+
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
+
 namespace tensorflow {
 
 // Define tensorflow::string to refer to appropriate platform specific type.
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 375a5a0720..46e81646a7 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -384,6 +384,7 @@ tf_cc_shared_object(
     }),
     deps = [
         "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
         "@protobuf_archive//:protobuf_headers",
     ],
 )
-- 
GitLab


From 86967885684433c86d4764d82e5d975e3ef4ab8e Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Wed, 24 Jan 2018 23:36:20 -0800
Subject: [PATCH 150/966] Fix eager Pooling1D unit test for
 data_format='channels_first'

PiperOrigin-RevId: 183196050
---
 tensorflow/python/layers/pooling_test.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/layers/pooling_test.py b/tensorflow/python/layers/pooling_test.py
index e4d4ed4a2a..7533674e5a 100644
--- a/tensorflow/python/layers/pooling_test.py
+++ b/tensorflow/python/layers/pooling_test.py
@@ -96,33 +96,41 @@ class PoolingTest(test.TestCase):
 
   def testCreateMaxPooling1D(self):
     width = 7
-    images = random_ops.random_uniform((5, width, 4))
+    channels = 3
+    images = random_ops.random_uniform((5, width, channels))
     layer = pooling_layers.MaxPooling1D(2, strides=2)
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, width // 2, channels])
 
   def testCreateAveragePooling1D(self):
     width = 7
-    images = random_ops.random_uniform((5, width, 4))
+    channels = 3
+    images = random_ops.random_uniform((5, width, channels))
     layer = pooling_layers.AveragePooling1D(2, strides=2)
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, width // 2, channels])
 
   def testCreateMaxPooling1DChannelsFirst(self):
     width = 7
-    images = random_ops.random_uniform((5, 4, width))
+    channels = 3
+    images = random_ops.random_uniform((5, channels, width))
     layer = pooling_layers.MaxPooling1D(
         2, strides=2, data_format='channels_first')
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 4, 3])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, channels, width // 2])
 
   def testCreateAveragePooling1DChannelsFirst(self):
     width = 7
-    images = random_ops.random_uniform((5, 4, width))
+    channels = 3
+    images = random_ops.random_uniform((5, channels, width))
     layer = pooling_layers.AveragePooling1D(
         2, strides=2, data_format='channels_first')
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 4, 3])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, channels, width // 2])
 
   def testCreateMaxPooling3D(self):
     depth, height, width = 6, 7, 9
-- 
GitLab


From 9814329d58a392af905266a38f68e7212db6eecf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 00:05:16 -0800
Subject: [PATCH 151/966] Add WarmStartSettings configuration for all
 Estimators.

PiperOrigin-RevId: 183197945
---
 tensorflow/python/estimator/canned/dnn.py     | 31 +++++------------
 .../estimator/canned/dnn_linear_combined.py   | 31 +++++------------
 tensorflow/python/estimator/canned/linear.py  | 31 +++++------------
 tensorflow/python/estimator/estimator.py      | 34 ++++++++++++++++++-
 tensorflow/python/estimator/estimator_test.py | 28 +++++++++++++++
 .../python/estimator/warm_starting_util.py    |  6 ++--
 .../tensorflow.estimator.-estimator.pbtxt     |  2 +-
 7 files changed, 89 insertions(+), 74 deletions(-)

diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index ba96d738ae..0f274a23c0 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -22,7 +22,6 @@ import six
 
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import optimizers
 from tensorflow.python.feature_column import feature_column as feature_column_lib
@@ -340,8 +339,8 @@ class DNNClassifier(estimator.Estimator):
           loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
-      """Call the defined shared _dnn_model_fn and possibly warm-start."""
-      estimator_spec = _dnn_model_fn(
+      """Call the defined shared _dnn_model_fn."""
+      return _dnn_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -353,17 +352,10 @@ class DNNClassifier(estimator.Estimator):
           dropout=dropout,
           input_layer_partitioner=input_layer_partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(DNNClassifier, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
 
 
 class DNNRegressor(estimator.Estimator):
@@ -490,8 +482,8 @@ class DNNRegressor(estimator.Estimator):
     """
 
     def _model_fn(features, labels, mode, config):
-      """Call the defined shared _dnn_model_fn and possibly warm-start."""
-      estimator_spec = _dnn_model_fn(
+      """Call the defined shared _dnn_model_fn."""
+      return _dnn_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -506,14 +498,7 @@ class DNNRegressor(estimator.Estimator):
           dropout=dropout,
           input_layer_partitioner=input_layer_partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(DNNRegressor, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index d29c892662..1a0f4c5c39 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -23,7 +23,6 @@ import math
 import six
 
 from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.canned import dnn
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import linear
@@ -385,8 +384,8 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
           loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
-      """Call the _dnn_linear_combined_model_fn and possibly warm-start."""
-      estimator_spec = _dnn_linear_combined_model_fn(
+      """Call the _dnn_linear_combined_model_fn."""
+      return _dnn_linear_combined_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -400,17 +399,10 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
           dnn_dropout=dnn_dropout,
           input_layer_partitioner=input_layer_partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(DNNLinearCombinedClassifier, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
 
 
 class DNNLinearCombinedRegressor(estimator.Estimator):
@@ -554,8 +546,8 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
                        'must be defined.')
 
     def _model_fn(features, labels, mode, config):
-      """Call the _dnn_linear_combined_model_fn and possibly warm-start."""
-      estimator_spec = _dnn_linear_combined_model_fn(
+      """Call the _dnn_linear_combined_model_fn."""
+      return _dnn_linear_combined_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -572,14 +564,7 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
           dnn_dropout=dnn_dropout,
           input_layer_partitioner=input_layer_partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(DNNLinearCombinedRegressor, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index 7a80dfacc2..a5b1172e72 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -23,7 +23,6 @@ import math
 import six
 
 from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import optimizers
 from tensorflow.python.feature_column import feature_column as feature_column_lib
@@ -305,8 +304,8 @@ class LinearClassifier(estimator.Estimator):
           loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
-      """Call the defined shared _linear_model_fn and possibly warm-start."""
-      estimator_spec = _linear_model_fn(
+      """Call the defined shared _linear_model_fn."""
+      return _linear_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -315,19 +314,12 @@ class LinearClassifier(estimator.Estimator):
           optimizer=optimizer,
           partitioner=partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(LinearClassifier, self).__init__(
         model_fn=_model_fn,
         model_dir=model_dir,
-        config=config)
+        config=config,
+        warm_start_from=warm_start_from)
 
 
 class LinearRegressor(estimator.Estimator):
@@ -432,8 +424,8 @@ class LinearRegressor(estimator.Estimator):
         loss_reduction=loss_reduction)
 
     def _model_fn(features, labels, mode, config):
-      """Call the defined shared _linear_model_fn and possibly warm-start."""
-      estimator_spec = _linear_model_fn(
+      """Call the defined shared _linear_model_fn."""
+      return _linear_model_fn(
           features=features,
           labels=labels,
           mode=mode,
@@ -442,16 +434,9 @@ class LinearRegressor(estimator.Estimator):
           optimizer=optimizer,
           partitioner=partitioner,
           config=config)
-      # pylint: disable=protected-access
-      warm_start_settings = warm_starting_util._get_default_warm_start_settings(
-          warm_start_from)
-      if warm_start_settings:
-        warm_starting_util._warm_start(warm_start_settings)
-      # pylint: enable=protected-access
-
-      return estimator_spec
 
     super(LinearRegressor, self).__init__(
         model_fn=_model_fn,
         model_dir=model_dir,
-        config=config)
+        config=config,
+        warm_start_from=warm_start_from)
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 2e914fa7e0..face20d530 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -35,6 +35,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator import util
+from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.export.export import build_all_signature_defs
 from tensorflow.python.estimator.export.export import get_temp_export_dir
 from tensorflow.python.estimator.export.export import get_timestamped_export_dir
@@ -96,9 +97,22 @@ class Estimator(object):
   @end_compatibility
   """
 
-  def __init__(self, model_fn, model_dir=None, config=None, params=None):
+  def __init__(self, model_fn, model_dir=None, config=None, params=None,
+               warm_start_from=None):
     """Constructs an `Estimator` instance.
 
+    See @{$estimators} for more information. To warm-start an `Estimator`:
+
+    ```python
+    estimator = tf.estimator.DNNClassifier(
+        feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
+        hidden_units=[1024, 512, 256],
+        warm_start_from="/path/to/checkpoint/dir")
+    ```
+
+    For more details on warm-start configuration, see
+    @{tf.estimator.WarmStartSettings$WarmStartSettings}.
+
     Args:
       model_fn: Model function. Follows the signature:
 
@@ -135,6 +149,12 @@ class Estimator(object):
       config: Configuration object.
       params: `dict` of hyper parameters that will be passed into `model_fn`.
               Keys are names of parameters, values are basic python types.
+      warm_start_from: Optional string filepath to a checkpoint to warm-start
+                       from, or a `tf.estimator.WarmStartSettings` object to
+                       fully configure warm-starting.  If the string filepath is
+                       provided instead of a `WarmStartSettings`, then all
+                       variables are warm-started, and it is assumed that
+                       vocabularies and Tensor names are unchanged.
 
     Raises:
       RuntimeError: If eager execution is enabled.
@@ -192,6 +212,11 @@ class Estimator(object):
     self._model_fn = model_fn
     self._params = copy.deepcopy(params or {})
 
+    # pylint: disable=protected-access
+    self._warm_start_settings = (
+        warm_starting_util._get_default_warm_start_settings(warm_start_from))
+    # pylint: enable=protected-access
+
   @property
   def model_dir(self):
     return self._model_dir
@@ -781,6 +806,13 @@ class Estimator(object):
       worker_hooks.extend(input_hooks)
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
+
+      if self._warm_start_settings:
+        logging.info('Warm-starting with WarmStartSettings: %s' %
+                     (self._warm_start_settings,))
+        # pylint: disable=protected-access
+        warm_starting_util._warm_start(self._warm_start_settings)
+        # pylint: enable=protected-access
       # Check if the user created a loss summary, and add one if they didn't.
       # We assume here that the summary is called 'loss'. If it is not, we will
       # make another one with the name 'loss' to ensure it shows up in the right
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index ed1676a92d..833f3dcac3 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -52,6 +52,7 @@ from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
@@ -629,6 +630,33 @@ class EstimatorTrainTest(test.TestCase):
     self.assertEqual(
         10, estimator._load_global_step_from_checkpoint_dir(est.model_dir))
 
+  def test_warm_starts(self):
+    def _make_model_fn(x):
+      def _variable_creating_model_fn(features, labels, mode):
+        _, _ = features, labels
+        variable_scope.get_variable('x', initializer=x)
+        global_step = training.get_global_step()
+        return model_fn_lib.EstimatorSpec(
+            mode,
+            loss=constant_op.constant(1.),
+            train_op=state_ops.assign_add(global_step, 1))
+      return _variable_creating_model_fn
+
+    est = estimator.Estimator(model_fn=_make_model_fn(42.))
+    est.train(dummy_input_fn, steps=10)
+
+    warm_started_est = estimator.Estimator(
+        model_fn=_make_model_fn(36.),
+        warm_start_from=est.model_dir)
+    warm_started_est.train(dummy_input_fn, steps=5)
+    # warm_start is called after the model_fn, so x should have the value
+    # from the checkpoint.
+    self.assertEqual(42., warm_started_est.get_variable_value('x'))
+    # global_step should not be warm-started.
+    self.assertEqual(
+        5, estimator._load_global_step_from_checkpoint_dir(
+            warm_started_est.model_dir))
+
   def test_max_step(self):
     est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
     est.train(dummy_input_fn, max_steps=5)
diff --git a/tensorflow/python/estimator/warm_starting_util.py b/tensorflow/python/estimator/warm_starting_util.py
index c748b318b7..ad95c71234 100644
--- a/tensorflow/python/estimator/warm_starting_util.py
+++ b/tensorflow/python/estimator/warm_starting_util.py
@@ -402,10 +402,10 @@ def _warm_start_var_with_vocab(var,
 
 
 def _warm_start(warm_start_settings):
-  """Warmstarts a model using the given settings.
+  """Warm-starts a model using the given settings.
 
-  Currently, this is intended for use only in canned Estimators.  Once made
-  public, it can be used in any model_fn.
+  If you are using a tf.estimator.Estimator, this will automatically be called
+  during training.
 
   Args:
     warm_start_settings: An object of `WarmStartSettings`.
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
index d0bf043754..76f527f796 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
@@ -20,7 +20,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_fn\', \'model_dir\', \'config\', \'params\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'model_fn\', \'model_dir\', \'config\', \'params\', \'warm_start_from\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "evaluate"
-- 
GitLab


From 15b368d31f38547aa3eb69f2c05dacff8ce8858a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 04:48:59 -0800
Subject: [PATCH 152/966] Fixes FunctionLibraryRuntime to be destroyed before
 ExecutorImpl when NewLocalExecutor returns an error status.

PiperOrigin-RevId: 183220585
---
 tensorflow/core/common_runtime/direct_session.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index e9bdd922ba..20c59ad42b 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -1143,8 +1143,8 @@ Status DirectSession::GetOrCreateExecutors(
     options.debug_options = run_state_args->debug_options;
   }
 
-  std::shared_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
   std::unique_ptr<FunctionInfo> func_info(new FunctionInfo);
+  std::shared_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
 
   // The executor_lock_ is intentionally released while executor is
   // being created.
-- 
GitLab


From 028ef1e67201700e8d9d77af64655f1dd20ae665 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 08:05:30 -0800
Subject: [PATCH 153/966] Drop the manually_create field from RnnState.

Initially, I thought that the shape of RNN state arrays could always be
determined by shape propagation. Then I came across some graphs where this
wasn't so easy to infer, so I introduced manually_create thinking of it
as a hack. Today I took another look at dropping that hack, and had a
"D'oh" moment when I realized that the cyclic nature of RNN graphs makes
it impossible to infer the shapes of all arrays by usual propagation.
For example, in a LSTM cell, the input array is concatenated with
a state array, so if we don't already know the shape of that state array,
shape propagation stops there.

Thus, this change removes manually_create by making toco always behave as
if manually_create=true, i.e. early-creating all RNN state arrays with
the shape explicitly specified by the user. The next TODO item here
(see model_flags.proto) is to introduce a generic 'shape' field, so far
the current 'size' field only allows specifying 1-D shapes.

PiperOrigin-RevId: 183239252
---
 .../lite/toco/allocate_transient_arrays.cc        |  4 +---
 .../contrib/lite/toco/model_cmdline_flags.cc      |  3 ---
 tensorflow/contrib/lite/toco/model_flags.proto    | 15 +++------------
 tensorflow/contrib/lite/toco/tooling_util.cc      | 13 ++++++-------
 4 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 5961d30bf5..49cc1fc2aa 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -158,9 +158,7 @@ std::size_t TransientArraySize(const Model& model, const string& array_name,
         LOG(FATAL)
             << "A RNN state array, " << array_name << ", still does not "
             << "have a known data type after all graph transformations have "
-            << "run. That's mostly a toco bug --- sorry. For now, you can "
-            << "work around this issue by adding manually_create:true in the "
-            << "--rnn_state description of this RNN state.";
+            << "run.";
       }
     }
     LOG(FATAL) << "An array, " << array_name << ", still does not "
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 790b3443ce..36520d9c55 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -327,9 +327,6 @@ void ReadModelFlagsFromCommandLineFlags(
         CHECK(absl::SimpleAtoi(value, &size));
         CHECK_GT(size, 0);
         rnn_state_proto->set_size(size);
-      } else if (key == "manually_create") {
-        CHECK_EQ(absl::AsciiStrToLower(value), "true");
-        rnn_state_proto->set_manually_create(true);
       } else {
         LOG(FATAL) << "Unknown key '" << key << "' in --rnn_states";
       }
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index 13fea29a07..9070ddc883 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -81,19 +81,10 @@ message RnnState {
   optional string state_array = 1;
   optional string back_edge_source_array = 2;
   optional bool discardable = 5;
-  // TODO(benoitjacob): drop the 'size' field. Should be redundant with
-  // --input_shapes and shapes propagation.
+  // size allows to specify a 1-D shape for the RNN state array.
+  // Will be expanded with 1's to fit the model.
+  // TODO(benoitjacob): should allow a generic, explicit shape.
   optional int32 size = 3;
-  // TODO(benoitjacob): manually_create is a temporary hack:
-  // due to discrepancies between the current toco dims tracking and
-  // TensorFlow shapes, for some models we need to manually create RNN state
-  // arrays with a specified shape.
-  // Maybe we should actually implement back-edges as operators of their own,
-  // which would remove the need for much special-casing, including here,
-  // we could probably consistently let PropagateFixedSizes handle state
-  // arrays.
-  // TODO(benoitjacob): should really drop manually_create now.
-  optional bool manually_create = 4;
 }
 
 // ModelFlags encodes properties of a model that, depending on the file
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 99a54a300b..df785a5102 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -958,7 +958,9 @@ void CheckModelCounts(const Model& model) {
 void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
                    std::vector<int>* out_dims) {
   CHECK(out_dims->empty());
-  if (num_dims == 1) {
+  if (num_dims == 0) {
+    return;
+  } else if (num_dims == 1) {
     CHECK_EQ(batch, 1);
     *out_dims = {depth};
   } else if (num_dims == 2) {
@@ -990,13 +992,13 @@ void CreateOrCheckRnnStateArray(const string& name, int size, Model* model) {
   if (array.has_shape()) {
     num_dims = array.shape().dimensions_count();
   }
-  std::vector<int> dims;
-  MakeArrayDims(num_dims, batch, 1, 1, size, &dims);
   CHECK(array.data_type == ArrayDataType::kFloat ||
         array.data_type == ArrayDataType::kNone);
   array.data_type = ArrayDataType::kFloat;
-  if (!array.has_shape()) {
+  if (!array.has_shape() && num_dims >= 0) {
     Shape* shape = array.mutable_shape();
+    std::vector<int> dims;
+    MakeArrayDims(num_dims, batch, 1, 1, size, &dims);
     *shape->mutable_dims() = dims;
   }
 }
@@ -1185,9 +1187,6 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
   }
   // Creation of the RNN state arrays
   for (const auto& rnn_state : model->flags.rnn_states()) {
-    if (!rnn_state.manually_create()) {
-      continue;
-    }
     CreateOrCheckRnnStateArray(rnn_state.state_array(), rnn_state.size(),
                                model);
   }
-- 
GitLab


From 10d7ddfa9bb95d65f7245dae4230a00b0badde06 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 08:20:51 -0800
Subject: [PATCH 154/966] Automated g4 rollback of changelist 183239252

PiperOrigin-RevId: 183241034
---
 .../lite/toco/allocate_transient_arrays.cc        |  4 +++-
 .../contrib/lite/toco/model_cmdline_flags.cc      |  3 +++
 tensorflow/contrib/lite/toco/model_flags.proto    | 15 ++++++++++++---
 tensorflow/contrib/lite/toco/tooling_util.cc      | 13 +++++++------
 4 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 49cc1fc2aa..5961d30bf5 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -158,7 +158,9 @@ std::size_t TransientArraySize(const Model& model, const string& array_name,
         LOG(FATAL)
             << "A RNN state array, " << array_name << ", still does not "
             << "have a known data type after all graph transformations have "
-            << "run.";
+            << "run. That's mostly a toco bug --- sorry. For now, you can "
+            << "work around this issue by adding manually_create:true in the "
+            << "--rnn_state description of this RNN state.";
       }
     }
     LOG(FATAL) << "An array, " << array_name << ", still does not "
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 36520d9c55..790b3443ce 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -327,6 +327,9 @@ void ReadModelFlagsFromCommandLineFlags(
         CHECK(absl::SimpleAtoi(value, &size));
         CHECK_GT(size, 0);
         rnn_state_proto->set_size(size);
+      } else if (key == "manually_create") {
+        CHECK_EQ(absl::AsciiStrToLower(value), "true");
+        rnn_state_proto->set_manually_create(true);
       } else {
         LOG(FATAL) << "Unknown key '" << key << "' in --rnn_states";
       }
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index 9070ddc883..13fea29a07 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -81,10 +81,19 @@ message RnnState {
   optional string state_array = 1;
   optional string back_edge_source_array = 2;
   optional bool discardable = 5;
-  // size allows to specify a 1-D shape for the RNN state array.
-  // Will be expanded with 1's to fit the model.
-  // TODO(benoitjacob): should allow a generic, explicit shape.
+  // TODO(benoitjacob): drop the 'size' field. Should be redundant with
+  // --input_shapes and shapes propagation.
   optional int32 size = 3;
+  // TODO(benoitjacob): manually_create is a temporary hack:
+  // due to discrepancies between the current toco dims tracking and
+  // TensorFlow shapes, for some models we need to manually create RNN state
+  // arrays with a specified shape.
+  // Maybe we should actually implement back-edges as operators of their own,
+  // which would remove the need for much special-casing, including here,
+  // we could probably consistently let PropagateFixedSizes handle state
+  // arrays.
+  // TODO(benoitjacob): should really drop manually_create now.
+  optional bool manually_create = 4;
 }
 
 // ModelFlags encodes properties of a model that, depending on the file
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index df785a5102..99a54a300b 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -958,9 +958,7 @@ void CheckModelCounts(const Model& model) {
 void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
                    std::vector<int>* out_dims) {
   CHECK(out_dims->empty());
-  if (num_dims == 0) {
-    return;
-  } else if (num_dims == 1) {
+  if (num_dims == 1) {
     CHECK_EQ(batch, 1);
     *out_dims = {depth};
   } else if (num_dims == 2) {
@@ -992,13 +990,13 @@ void CreateOrCheckRnnStateArray(const string& name, int size, Model* model) {
   if (array.has_shape()) {
     num_dims = array.shape().dimensions_count();
   }
+  std::vector<int> dims;
+  MakeArrayDims(num_dims, batch, 1, 1, size, &dims);
   CHECK(array.data_type == ArrayDataType::kFloat ||
         array.data_type == ArrayDataType::kNone);
   array.data_type = ArrayDataType::kFloat;
-  if (!array.has_shape() && num_dims >= 0) {
+  if (!array.has_shape()) {
     Shape* shape = array.mutable_shape();
-    std::vector<int> dims;
-    MakeArrayDims(num_dims, batch, 1, 1, size, &dims);
     *shape->mutable_dims() = dims;
   }
 }
@@ -1187,6 +1185,9 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
   }
   // Creation of the RNN state arrays
   for (const auto& rnn_state : model->flags.rnn_states()) {
+    if (!rnn_state.manually_create()) {
+      continue;
+    }
     CreateOrCheckRnnStateArray(rnn_state.state_array(), rnn_state.size(),
                                model);
   }
-- 
GitLab


From 9acbe517b5fa5d3a88770c7a02450d755691beda Mon Sep 17 00:00:00 2001
From: mktozk <mktozk@users.noreply.github.com>
Date: Fri, 26 Jan 2018 02:01:37 +0900
Subject: [PATCH 155/966] Fix Conv3DTranspose in tf.keras (#16307)

* Update convolutional.py

Fix Conv3DTranspose

* Update tensorflow.keras.layers.-conv3-d-transpose.pbtxt

* Update tensorflow.keras.layers.-convolution3-d-transpose.pbtxt

* Update convolutional.py

* Fix compute_output_shape() in Conv3DTranspose

* Fix compute_output_shape() in Conv3DTranspose

* Update convolutional.py
---
 .../python/keras/_impl/keras/layers/convolutional.py     | 2 +-
 tensorflow/python/layers/convolutional.py                | 9 ++++++---
 .../tensorflow.keras.layers.-conv3-d-transpose.pbtxt     | 1 +
 ...nsorflow.keras.layers.-convolution3-d-transpose.pbtxt | 1 +
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 22496e8a76..f0f5e1fb46 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -563,7 +563,7 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class Conv3DTranspose(tf_convolutional_layers.Conv3D, Layer):
+class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index d5147b237b..e8dba3cea3 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -1904,6 +1904,7 @@ class Conv3DTranspose(Conv3D):
           dtype=self.dtype)
     else:
       self.bias = None
+    self.built = True
 
   def call(self, inputs):
     inputs_shape = array_ops.shape(inputs)
@@ -1974,6 +1975,8 @@ class Conv3DTranspose(Conv3D):
 
     if self.use_bias:
       outputs_shape = outputs.shape.as_list()
+      if outputs_shape[0] is None:
+        outputs_shape[0] = -1
       if self.data_format == 'channels_first':
         outputs_4d = array_ops.reshape(outputs, [
             outputs_shape[0], outputs_shape[1],
@@ -2007,11 +2010,11 @@ class Conv3DTranspose(Conv3D):
 
     output_shape[c_axis] = self.filters
     output_shape[d_axis] = utils.deconv_output_length(
-        output_shape[d_axis], stride_d, kernel_d, self.padding)
+        output_shape[d_axis], kernel_d, self.padding, stride_d)
     output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], stride_h, kernel_h, self.padding)
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
     output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], stride_w, kernel_w, self.padding)
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
     return tensor_shape.TensorShape(output_shape)
 
 
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index d898c54627..11e05f884d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index a7001bbe34..58724a1e16 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.layers.Convolution3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
-- 
GitLab


From 76d621efecc63a08821c294390115ff5f96d47e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Kapica?= <pawel.kapica@gmail.com>
Date: Thu, 25 Jan 2018 18:02:17 +0100
Subject: [PATCH 156/966] RGB<->YIQ colorspace conversion (#15555)

* image colorspace conversion using tensordot op

* fixed bug with assuming input tensor is 4D

* fix

* fix - push with all changes commited

* fix tuple object in np.random.rand

* updated goldens
---
 tensorflow/python/ops/image_ops.py            |   4 +
 tensorflow/python/ops/image_ops_impl.py       | 103 ++++++++++++++++++
 tensorflow/python/ops/image_ops_test.py       |  58 ++++++++++
 .../tools/api/golden/tensorflow.image.pbtxt   |  16 +++
 4 files changed, 181 insertions(+)

diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index 3b0b5a978c..de12c5f63f 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -49,6 +49,10 @@ See the @{$python/image} guide.
 @@grayscale_to_rgb
 @@hsv_to_rgb
 @@rgb_to_hsv
+@@rgb_to_yiq
+@@yiq_to_rgb
+@@rgb_to_yuv
+@@yuv_to_rgb
 @@convert_image_dtype
 @@adjust_brightness
 @@random_brightness
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 9bd452155c..721efcf786 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1668,3 +1668,106 @@ def non_max_suppression(boxes,
     return gen_image_ops._non_max_suppression_v2(boxes, scores, max_output_size,
                                                  iou_threshold)
     # pylint: enable=protected-access
+
+
+_rgb_to_yiq_kernel = [[0.299, 0.59590059, 0.2115],
+                      [0.587, -0.27455667, -0.52273617],
+                      [0.114, -0.32134392, 0.31119955]]
+
+
+def rgb_to_yiq(images):
+  """Converts one or more images from RGB to YIQ.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the YIQ
+  value of the pixels.
+  The output is only well defined if the value in images are in [0,1].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(_rgb_to_yiq_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
+
+
+_yiq_to_rgb_kernel = [[1, 1, 1],
+                      [0.95598634, -0.27201283, -1.10674021],
+                      [0.6208248, -0.64720424, 1.70423049]]
+
+
+def yiq_to_rgb(images):
+  """Converts one or more images from YIQ to RGB.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+  value of the pixels.
+  The output is only well defined if the Y value in images are in [0,1],
+  I value are in [-0.5957,0.5957] and Q value are in [-0.5226,0.5226].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(_yiq_to_rgb_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
+
+
+_rgb_to_yuv_kernel = [[0.299, -0.14714119, 0.61497538],
+                      [0.587, -0.28886916, -0.51496512],
+                      [0.114, 0.43601035, -0.10001026]]
+
+
+def rgb_to_yuv(images):
+  """Converts one or more images from RGB to YUV.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the YUV
+  value of the pixels.
+  The output is only well defined if the value in images are in [0,1].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(_rgb_to_yuv_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
+
+
+_yuv_to_rgb_kernel = [[1, 1, 1],
+                      [0, -0.394642334, 2.03206185],
+                      [1.13988303, -0.58062185, 0]]
+
+
+def yuv_to_rgb(images):
+  """Converts one or more images from YUV to RGB.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+  value of the pixels.
+  The output is only well defined if the Y value in images are in [0,1],
+  U and V value are in [-0.5,0.5].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(_yuv_to_rgb_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
+
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 0c5ed2150d..9834384634 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -85,6 +85,64 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
       self.assertAllClose(rgb_tf, rgb_np)
 
 
+class RGBToYIQTest(test_util.TensorFlowTestCase):
+
+  def testBatch(self):
+    # Build an arbitrary RGB image
+    np.random.seed(7)
+    batch_size = 5
+    shape = (batch_size, 2, 7, 3)
+
+    for nptype in [np.float32, np.float64]:
+      inp = np.random.rand(*shape).astype(nptype)
+
+      # Convert to YIQ and back, as a batch and individually
+      with self.test_session(use_gpu=True) as sess:
+        batch0 = constant_op.constant(inp)
+        batch1 = image_ops.rgb_to_yiq(batch0)
+        batch2 = image_ops.yiq_to_rgb(batch1)
+        split0 = array_ops.unstack(batch0)
+        split1 = list(map(image_ops.rgb_to_yiq, split0))
+        split2 = list(map(image_ops.yiq_to_rgb, split1))
+        join1 = array_ops.stack(split1)
+        join2 = array_ops.stack(split2)
+        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+
+      # Verify that processing batch elements together is the same as separate
+      self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, join2, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, inp, rtol=1e-4, atol=1e-4)
+
+
+class RGBToYUVTest(test_util.TensorFlowTestCase):
+
+  def testBatch(self):
+    # Build an arbitrary RGB image
+    np.random.seed(7)
+    batch_size = 5
+    shape = (batch_size, 2, 7, 3)
+
+    for nptype in [np.float32, np.float64]:
+      inp = np.random.rand(*shape).astype(nptype)
+
+      # Convert to YUV and back, as a batch and individually
+      with self.test_session(use_gpu=True) as sess:
+        batch0 = constant_op.constant(inp)
+        batch1 = image_ops.rgb_to_yuv(batch0)
+        batch2 = image_ops.yuv_to_rgb(batch1)
+        split0 = array_ops.unstack(batch0)
+        split1 = list(map(image_ops.rgb_to_yuv, split0))
+        split2 = list(map(image_ops.yuv_to_rgb, split1))
+        join1 = array_ops.stack(split1)
+        join2 = array_ops.stack(split2)
+        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+
+      # Verify that processing batch elements together is the same as separate
+      self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, join2, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, inp, rtol=1e-4, atol=1e-4)
+
+
 class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
 
   def _RGBToGrayscale(self, images):
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index 441621a2a0..baedf596e8 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -168,6 +168,14 @@ tf_module {
     name: "rgb_to_hsv"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "rgb_to_yiq"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rgb_to_yuv"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "rot90"
     argspec: "args=[\'image\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
@@ -184,4 +192,12 @@ tf_module {
     name: "transpose_image"
     argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "yiq_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "yuv_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
 }
-- 
GitLab


From bab793233e42b7654771769d4d4a7974b432883c Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Thu, 25 Jan 2018 09:32:22 -0800
Subject: [PATCH 157/966] Add shard count to select and scatter tests

RELNOTES: n/a
PiperOrigin-RevId: 183250076
---
 tensorflow/compiler/xla/tests/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 3afd52b6b2..02ad9d982f 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1034,6 +1034,7 @@ xla_test(
     name = "select_and_scatter_test",
     timeout = "long",
     srcs = ["select_and_scatter_test.cc"],
+    shard_count = 40,
     tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
-- 
GitLab


From 03bb1c4a6014fdf3f10f301f093ec02d84f717c7 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Thu, 25 Jan 2018 09:34:11 -0800
Subject: [PATCH 158/966] Add functions to encapsulate the logic for checking
 and setting tensor type.

PiperOrigin-RevId: 183250334
---
 tensorflow/contrib/lite/kernels/kernel_util.h | 16 ++++++++++++++++
 tensorflow/contrib/lite/kernels/pad.cc        | 17 ++++++++---------
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
index 1cf30ecff9..bfdfba00f5 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -44,6 +44,22 @@ inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
   return nullptr;
 }
 
+// Determines whether tensor is constant.
+inline bool IsConstantTensor(TfLiteTensor* tensor) {
+  return tensor->allocation_type == kTfLiteMmapRo;
+}
+
+// Determines whether tensor is dynamic. Note that a tensor can be non-const and
+// not dynamic. This function specificially checks for a dynamic tensor.
+inline bool IsDynamicTensor(TfLiteTensor* tensor) {
+  return tensor->allocation_type == kTfLiteDynamic;
+}
+
+// Sets tensor to dynamic.
+inline void SetTensorToDynamic(TfLiteTensor* tensor) {
+  tensor->allocation_type = kTfLiteDynamic;
+}
+
 // Calculates the multiplication factor for a quantized convolution (or
 // quantized depthwise convolution) involving the given tensors. Returns an
 // error if the scales of the tensors are not compatible.
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index 569bf0fe8f..4003ed10df 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -51,17 +51,14 @@ struct PadContext {
 // paddings data is present.
 TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
                                 PadContext* op_context) {
-  // TODO(nupurgarg): Our current implementations rely on the inputs being 4D.
-  TF_LITE_ENSURE_EQ(context, op_context->dims, 4);
-
   // Ensures the paddings array is dims x 2.
   TF_LITE_ENSURE_EQ(context, SizeOfDimension(op_context->paddings, 0),
                     op_context->dims);
   TF_LITE_ENSURE_EQ(context, SizeOfDimension(op_context->paddings, 1), 2);
 
   // Determines the size of the output tensor.
-  const TfLiteIntArray* input_size = op_context->input->dims;
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(op_context->dims);
+  TfLiteIntArray* input_size = op_context->input->dims;
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
   const int32* paddings_data = GetTensorData<int32>(op_context->paddings);
 
   for (int idx = 0; idx < op_context->dims; ++idx) {
@@ -85,11 +82,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   PadContext op_context(context, node);
   TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
 
-  // TODO(nupurgarg): Create wrapper functions for dynamic tensor logic.
+  // TODO(nupurgarg): Our current implementations rely on the inputs being 4D.
+  TF_LITE_ENSURE_EQ(context, op_context.dims, 4);
+
   // Exit early if paddings is a non-const tensor. Set output tensor to
   // dynamic so output size can be determined in Eval.
-  if (op_context.paddings->allocation_type != kTfLiteMmapRo) {
-    op_context.output->allocation_type = kTfLiteDynamic;
+  if (!IsConstantTensor(op_context.paddings)) {
+    SetTensorToDynamic(op_context.output);
     return kTfLiteOk;
   }
   return ResizeOutputTensor(context, &op_context);
@@ -100,7 +99,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   PadContext op_context(context, node);
 
   // Resize the output tensor if the output tensor is dynamic.
-  if (op_context.output->allocation_type == kTfLiteDynamic) {
+  if (IsDynamicTensor(op_context.output)) {
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
     TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
   }
-- 
GitLab


From ee8b13791a921964cad1fc699e72ff1f738b704f Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 25 Jan 2018 09:42:55 -0800
Subject: [PATCH 159/966] Improve SkipNBytes to RandomInputStream  (#14557)

* Convert SkipNBytes in InputStreamInterface to pure virtual

This fix tries to address the issue raised in 14512 where SkipNBytes
will takes a long time as it needs to read through the file
even though it is possible to random access.

This fix convert SkipNBytes in InputStreamInterface to pure virtual
so that it is possible to have different implementations based
on the underlying file type (seekable vs. no seekable)

This fix fixes 14512.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add missing implementation in inputstream_interface_test.cc

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update implementation of RandomInputStream

So that if random access read of 1 byte to the expected pos
is able to complete then return immediately. Otherwise,
read through until the pos is reached.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add SkipNBytes to SnappyInputBuffer

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Remove inputstream_interface.cc from makefile

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Remove pure virtual and use override for RandomInputStream

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/lib/io/random_inputstream.cc | 37 ++++++++++++++++++++
 tensorflow/core/lib/io/random_inputstream.h  |  2 ++
 2 files changed, 39 insertions(+)

diff --git a/tensorflow/core/lib/io/random_inputstream.cc b/tensorflow/core/lib/io/random_inputstream.cc
index 8b8c1392a1..09336e79cd 100644
--- a/tensorflow/core/lib/io/random_inputstream.cc
+++ b/tensorflow/core/lib/io/random_inputstream.cc
@@ -57,6 +57,43 @@ Status RandomAccessInputStream::ReadNBytes(int64 bytes_to_read,
   return Status::OK();
 }
 
+// To limit memory usage, the default implementation of SkipNBytes() only reads
+// 8MB at a time.
+static constexpr int64 kMaxSkipSize = 8 * 1024 * 1024;
+
+Status RandomAccessInputStream::SkipNBytes(int64 bytes_to_skip) {
+  if (bytes_to_skip < 0) {
+    return errors::InvalidArgument("Can't skip a negative number of bytes");
+  }
+  std::unique_ptr<char[]> scratch(new char[kMaxSkipSize]);
+  // Try to read 1 bytes first, if we could complete the read then EOF is
+  // not reached yet and we could return.
+  if (bytes_to_skip > 0) {
+    StringPiece data;
+    Status s = file_->Read(pos_ + bytes_to_skip - 1, 1, &data, scratch.get());
+    if ((s.ok() || errors::IsOutOfRange(s)) && data.size() == 1) {
+      pos_ += bytes_to_skip;
+      return Status::OK();
+    }
+  }
+  // Read kDefaultSkipSize at a time till bytes_to_skip.
+  while (bytes_to_skip > 0) {
+    int64 bytes_to_read = std::min<int64>(kMaxSkipSize, bytes_to_skip);
+    StringPiece data;
+    Status s = file_->Read(pos_, bytes_to_read, &data, scratch.get());
+    if (s.ok() || errors::IsOutOfRange(s)) {
+      pos_ += data.size();
+    } else {
+      return s;
+    }
+    if (data.size() < bytes_to_read) {
+      return errors::OutOfRange("reached end of file");
+    }
+    bytes_to_skip -= bytes_to_read;
+  }
+  return Status::OK();
+}
+
 int64 RandomAccessInputStream::Tell() const { return pos_; }
 
 }  // namespace io
diff --git a/tensorflow/core/lib/io/random_inputstream.h b/tensorflow/core/lib/io/random_inputstream.h
index 09ebe9ba49..bdbdbd71ff 100644
--- a/tensorflow/core/lib/io/random_inputstream.h
+++ b/tensorflow/core/lib/io/random_inputstream.h
@@ -34,6 +34,8 @@ class RandomAccessInputStream : public InputStreamInterface {
 
   Status ReadNBytes(int64 bytes_to_read, string* result) override;
 
+  Status SkipNBytes(int64 bytes_to_skip) override;
+
   int64 Tell() const override;
 
   Status Seek(int64 position) {
-- 
GitLab


From e487c1d00559cb0af036e0ab981f6eca065217bd Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Thu, 25 Jan 2018 09:44:04 -0800
Subject: [PATCH 160/966] [TF RNN] Ensure dynamic_rnn runs at least one step to
 propagate dynamic shape info.

PiperOrigin-RevId: 183251689
---
 tensorflow/python/ops/rnn.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index a1008f1c83..a10e1963d1 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -812,7 +812,10 @@ def _dynamic_rnn_loop(cell,
     return (time + 1, output_ta_t, new_state)
 
   if in_graph_mode:
-    loop_bound = max_sequence_length
+    # Make sure that we run at least 1 step, if necessary, to ensure
+    # the TensorArrays pick up the dynamic shape.
+    loop_bound = math_ops.minimum(
+        time_steps, math_ops.maximum(1, max_sequence_length))
   else:
     # Using max_sequence_length isn't currently supported in the Eager branch.
     loop_bound = time_steps
-- 
GitLab


From f8adc061e8c1134082e8716725c7f1899917d340 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Thu, 25 Jan 2018 09:53:10 -0800
Subject: [PATCH 161/966] Surface error if input is not flat list of eager
 tensors

PiperOrigin-RevId: 183253112
---
 tensorflow/python/eager/pywrap_tfe_src.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 6162644036..647f03351d 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -766,6 +766,9 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
     return;
   }
   std::vector<tensorflow::int64> input_ids = MakeTensorIDList(input_tensors);
+  if (PyErr_Occurred()) {
+    return;
+  }
   std::vector<tensorflow::eager::TapeTensor> output_info;
   PyObject* seq = PySequence_Fast(output_tensors,
                                   "expected a sequence of integer tensor ids");
-- 
GitLab


From 9384314e0cbf6f315d870200fc5abe421deefcab Mon Sep 17 00:00:00 2001
From: David Goodwin <davidg@nvidia.com>
Date: Thu, 25 Jan 2018 10:36:31 -0800
Subject: [PATCH 162/966] improve comment describing segmentation algorithm

---
 tensorflow/contrib/tensorrt/segment/segment.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 967e29df76..45bd5cdf14 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -146,8 +146,14 @@ tensorflow::Status SegmentGraph(
     node_segments.emplace_back(node);
   }
 
-  // Visit nodes in reverse topological order and use edge
-  // contraction to merge candidate nodes.
+  // The segmentation algorithm below visits nodes in reverse
+  // topological order and attempts to merge nodes along output
+  // edges. That means that subgraphs grow from the output-side of the
+  // network towards the inputs. In general this is not guaranteed to
+  // produce a globally optimal segmentation. In the future if we have
+  // a measure of how beneficial it is to include a given node in a
+  // TRT subgraph then we can revisit this algorithm to take advantage
+  // of that information.
   std::vector<tensorflow::Node*> order;
   tensorflow::GetPostOrder(graph, &order);
 
-- 
GitLab


From f492fac8e31f15b9529d20f9787cafc888669fbc Mon Sep 17 00:00:00 2001
From: Sam Matzek <smatzek@us.ibm.com>
Date: Thu, 25 Jan 2018 12:39:31 -0600
Subject: [PATCH 163/966] Build libjpeg-turbo ALTIVEC SIMD (#16409)

The libjpeg-turbo package has ALTIVEC SIMD and this updates the
third_party build to build the ALTIVEC SIMD on the appropriate
platform.
---
 third_party/jpeg/jpeg.BUILD | 51 +++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD
index 37924125cf..ca2d38d687 100644
--- a/third_party/jpeg/jpeg.BUILD
+++ b/third_party/jpeg/jpeg.BUILD
@@ -34,6 +34,10 @@ libjpegturbo_copts = select({
         "-mfloat-abi=softfp",
         "-fprefetch-loop-arrays",
     ],
+    ":linux_ppc64le": [
+        "-mcpu=power8",
+        "-mtune=power8",
+    ],
     "//conditions:default": [],
 })
 
@@ -123,10 +127,50 @@ cc_library(
         ":k8": [":simd_x86_64"],
         ":armeabi-v7a": [":simd_armv7a"],
         ":arm64-v8a": [":simd_armv8a"],
+        ":linux_ppc64le": [":simd_altivec"],
         "//conditions:default": [":simd_none"],
     }),
 )
 
+cc_library(
+    name = "simd_altivec",
+    srcs = [
+        "jchuff.h",
+        "jconfig.h",
+        "jdct.h",
+        "jerror.h",
+        "jinclude.h",
+        "jmorecfg.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jsimd.h",
+        "jsimddct.h",
+        "simd/jsimd.h",
+        "simd/jccolor-altivec.c",
+        "simd/jcgray-altivec.c",
+        "simd/jcsample-altivec.c",
+        "simd/jdcolor-altivec.c",
+        "simd/jdmerge-altivec.c",
+        "simd/jdsample-altivec.c",
+        "simd/jfdctfst-altivec.c",
+        "simd/jfdctint-altivec.c",
+        "simd/jidctfst-altivec.c",
+        "simd/jidctint-altivec.c",
+        "simd/jquanti-altivec.c",
+        "simd/jsimd_powerpc.c",
+        "simd/jsimd_altivec.h",
+        "simd/jcsample.h",
+    ],
+    hdrs = [
+      "simd/jdmrgext-altivec.c",  # should have been named .inc
+      "simd/jccolext-altivec.c",  # should have been named .inc
+      "simd/jcgryext-altivec.c",  # should have been named .inc
+      "simd/jdcolext-altivec.c",  # should have been named .inc
+    ],
+    copts = libjpegturbo_copts,
+    nocopts = libjpegturbo_nocopts,
+)
+
 cc_library(
     name = "simd_x86_64",
     srcs = [
@@ -381,6 +425,7 @@ genrule(
         ":k8": "cp $(location jconfig_nowin_simd.h) $@",
         ":armeabi-v7a": "cp $(location jconfig_nowin_simd.h) $@",
         ":arm64-v8a": "cp $(location jconfig_nowin_simd.h) $@",
+        ":linux_ppc64le": "cp $(location jconfig_nowin_simd.h) $@",
         "//conditions:default": "cp $(location jconfig_nowin_nosimd.h) $@",
     }),
 )
@@ -498,3 +543,9 @@ config_setting(
     name = "windows_msvc",
     values = {"cpu": "x64_windows_msvc"},
 )
+
+config_setting(
+   name = "linux_ppc64le",
+   values = {"cpu": "ppc"},
+
+)
-- 
GitLab


From 949dd29d3a8bdc21328c9e94721b344310686eab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 10:36:25 -0800
Subject: [PATCH 164/966] Simplify the template mechanism by specifying
 templates using multi-line strings instead of functions. This loses the
 syntax verification on templates, but it avoids the clutter of lint overrides
 and the duplication of parameter names, so things are more readable.

Addresses #16318

PiperOrigin-RevId: 183260854
---
 .../converters/break_canonicalization.py      | 26 ++---
 .../py2tf/converters/builtin_functions.py     |  7 +-
 .../contrib/py2tf/converters/call_trees.py    | 19 +---
 .../converters/continue_canonicalization.py   | 22 ++---
 .../contrib/py2tf/converters/control_flow.py  | 96 +++++++------------
 .../py2tf/converters/for_canonicalization.py  | 28 +++---
 .../py2tf/converters/side_effect_guards.py    | 26 ++---
 tensorflow/contrib/py2tf/pyct/templates.py    | 55 ++++++-----
 .../contrib/py2tf/pyct/templates_test.py      | 36 ++++---
 9 files changed, 132 insertions(+), 183 deletions(-)

diff --git a/tensorflow/contrib/py2tf/converters/break_canonicalization.py b/tensorflow/contrib/py2tf/converters/break_canonicalization.py
index ef58573445..2ae65e3007 100644
--- a/tensorflow/contrib/py2tf/converters/break_canonicalization.py
+++ b/tensorflow/contrib/py2tf/converters/break_canonicalization.py
@@ -33,31 +33,25 @@ class BreakCanonicalizationTransformer(gast.NodeTransformer):
     self.break_uses = []
 
   def _create_break_check(self):
-
-    def template(var_name):
-      (not var_name)  # pylint:disable=pointless-statement
-
-    expr, = templates.replace(
-        template, var_name=gast.Name(self.break_uses[-1][1], None, None))
+    template = """
+      (not var_name)
+    """
+    expr, = templates.replace(template, var_name=self.break_uses[-1][1])
     return expr.value
 
   def _create_break_trigger(self):
-
-    def template(var_name):  # pylint:disable=unused-argument
+    template = """
       var_name = True
-
-    block = templates.replace(
-        template, var_name=gast.Name(self.break_uses[-1][1], None, None))
+    """
+    block = templates.replace(template, var_name=self.break_uses[-1][1])
     block.append(gast.Continue())
     return block
 
   def _create_break_init(self):
-
-    def template(var_name):  # pylint:disable=unused-argument
+    template = """
       var_name = False
-
-    assign, = templates.replace(
-        template, var_name=gast.Name(self.break_uses[-1][1], None, None))
+    """
+    assign, = templates.replace(template, var_name=self.break_uses[-1][1])
     return assign
 
   # TODO(mdan): Surely the transformer supports this better?
diff --git a/tensorflow/contrib/py2tf/converters/builtin_functions.py b/tensorflow/contrib/py2tf/converters/builtin_functions.py
index b80c96c97a..7f6b64a34c 100644
--- a/tensorflow/contrib/py2tf/converters/builtin_functions.py
+++ b/tensorflow/contrib/py2tf/converters/builtin_functions.py
@@ -29,10 +29,9 @@ class BuiltinFunctionTransformer(gast.NodeTransformer):
   # TODO(mdan): Bring print_functions in here.
 
   def _convert_len(self, node):
-
-    def template(args):
-      tf.shape(args)[0]  # pylint:disable=undefined-variable,expression-not-assigned
-
+    template = """
+      tf.shape(args)[0]
+    """
     new_call = templates.replace(template, args=node.args)[0].value
     return new_call
 
diff --git a/tensorflow/contrib/py2tf/converters/call_trees.py b/tensorflow/contrib/py2tf/converters/call_trees.py
index df071f596f..0aae030450 100644
--- a/tensorflow/contrib/py2tf/converters/call_trees.py
+++ b/tensorflow/contrib/py2tf/converters/call_trees.py
@@ -151,7 +151,7 @@ class CallTreeTransformer(gast.NodeTransformer):
     else:
       new_name = self.namer.compiled_function_name(
           '__'.join(target_fqn), live_object=target_obj)
-    node.func = gast.Name(id=new_name, ctx=gast.Load(), annotation=None)
+    node.func = gast.Name(new_name, gast.Load(), None)
     return node
 
   def _rename_member_function_of_known_type(self, node):
@@ -184,26 +184,17 @@ class CallTreeTransformer(gast.NodeTransformer):
   def _wrap_to_py_func_no_return(self, node):
     args_scope = anno.getanno(node, 'args_scope')
     # TODO(mdan): Properly handle varargs, kwargs, etc.
-    args = tuple(gast.Name(n, gast.Load(), None) for n in args_scope.used)
-
-    # pylint:disable=undefined-variable,unused-argument,function-redefined
-
-    def template(call, wrapper, args):
-
+    template = """
       def wrapper(args):
         call(args)
         return 1
-
       tf.py_func(wrapper, [args], [tf.int64])
-
-    # pylint:enable=undefined-variable,unused-argument,function-redefined
-
-    wrapper_name = self.namer.compiled_function_name(node.func.id)
+    """
     wrapper_def, call_expr = templates.replace(
         template,
         call=node.func,
-        wrapper=gast.Name(wrapper_name, gast.Load(), None),
-        args=args)
+        wrapper=self.namer.compiled_function_name(node.func.id),
+        args=tuple(gast.Name(n, gast.Load(), None) for n in args_scope.used))
     anno.setanno(call_expr.value, 'args_scope', args_scope)
     # TODO(mdan): Rename this annotation to 'graph_ready'
     anno.setanno(wrapper_def, 'skip_processing', True)
diff --git a/tensorflow/contrib/py2tf/converters/continue_canonicalization.py b/tensorflow/contrib/py2tf/converters/continue_canonicalization.py
index 7f8ace77a8..486f0f6509 100644
--- a/tensorflow/contrib/py2tf/converters/continue_canonicalization.py
+++ b/tensorflow/contrib/py2tf/converters/continue_canonicalization.py
@@ -33,32 +33,28 @@ class ContinueCanonicalizationTransformer(gast.NodeTransformer):
     self.continuation_uses = []
 
   def _create_continuation_check(self):
-
-    def template(var_name):
+    template = """
       if not var_name:
         pass
-
-    cond, = templates.replace(
-        template, var_name=gast.Name(self.continuation_uses[-1][1], None, None))
+    """
+    cond, = templates.replace(template, var_name=self.continuation_uses[-1][1])
     cond.body = []
     return cond
 
   def _create_continuation_trigger(self):
-
-    def template(var_name):  # pylint:disable=unused-argument
+    template = """
       var_name = True
-
+    """
     assign, = templates.replace(
-        template, var_name=gast.Name(self.continuation_uses[-1][1], None, None))
+        template, var_name=self.continuation_uses[-1][1])
     return assign
 
   def _create_continuation_init(self):
-
-    def template(var_name):  # pylint:disable=unused-argument
+    template = """
       var_name = False
-
+    """
     assign, = templates.replace(
-        template, var_name=gast.Name(self.continuation_uses[-1][1], None, None))
+        template, var_name=self.continuation_uses[-1][1])
     return assign
 
   def _visit_and_reindent_if_necessary(self, nodes):
diff --git a/tensorflow/contrib/py2tf/converters/control_flow.py b/tensorflow/contrib/py2tf/converters/control_flow.py
index 8ebd9ad93d..a40c7b28f7 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow.py
+++ b/tensorflow/contrib/py2tf/converters/control_flow.py
@@ -75,29 +75,6 @@ class ControlFlowTransformer(gast.NodeTransformer):
       raise ValueError(
           'The else branch creates new symbols that the if branch does not.')
 
-    def template(  # pylint:disable=missing-docstring
-        test,
-        body_name,
-        body,
-        orelse_name,
-        orelse,
-        aliased,
-        aliases,  # pylint:disable=unused-argument
-        aliased_results,
-        results):  # pylint:disable=unused-argument
-
-      def body_name():  # pylint:disable=function-redefined
-        aliases, = aliased,  # pylint:disable=unused-variable
-        body  # pylint:disable=pointless-statement
-        return (aliased_results,)
-
-      def orelse_name():  # pylint:disable=function-redefined
-        aliases, = aliased,  # pylint:disable=unused-variable
-        orelse  # pylint:disable=pointless-statement
-        return (aliased_results,)
-
-      results = tf.cond(test, body_name, orelse_name)  # pylint:disable=undefined-variable
-
     all_modified = tuple(body_scope.modified | orelse_scope.modified)
     all_referenced = body_scope.referenced | orelse_scope.referenced
 
@@ -107,10 +84,10 @@ class ControlFlowTransformer(gast.NodeTransformer):
     need_alias = (
         (body_scope.modified | orelse_scope.modified) -
         (body_scope.created | orelse_scope.created))
-    aliased = tuple(need_alias)
-    aliases = tuple(
-        self.namer.new_symbol(s, all_referenced) for s in aliased)
-    alias_map = dict(zip(aliased, aliases))
+    aliased_orig_names = tuple(need_alias)
+    aliased_new_names = tuple(
+        self.namer.new_symbol(s, all_referenced) for s in aliased_orig_names)
+    alias_map = dict(zip(aliased_orig_names, aliased_new_names))
     node_body = node.body
     node_body = [SymbolRenamer(alias_map).visit(n) for n in node_body]
     node_orelse = node.orelse
@@ -122,20 +99,29 @@ class ControlFlowTransformer(gast.NodeTransformer):
       results = gast.Tuple(
           tuple(gast.Name(s, None, None) for s in all_modified), None)
 
+    template = """
+      def body_name():
+        aliased_new_names, = aliased_orig_names,
+        body
+        return (all_results,)
+      def orelse_name():
+        aliased_new_names, = aliased_orig_names,
+        orelse
+        return (all_results,)
+      results = tf.cond(test, body_name, orelse_name)
+    """
+    body_name = self.namer.new_symbol('if_true', all_referenced)
     return templates.replace(
         template,
         test=node.test,
-        body_name=gast.Name(
-            self.namer.new_symbol('if_true', all_referenced), None, None),
+        body_name=body_name,
         body=node_body,
-        orelse_name=gast.Name(
-            self.namer.new_symbol('if_false', all_referenced), None, None),
+        orelse_name=self.namer.new_symbol('if_false', all_referenced),
         orelse=node_orelse,
-        aliased=tuple(gast.Name(s, None, None) for s in aliased),
-        aliases=tuple(gast.Name(s, None, None) for s in aliases),
-        aliased_results=tuple(
-            gast.Name(alias_map[s] if s in aliased else s, None, None)
-            for s in all_modified),
+        aliased_orig_names=tuple(aliased_orig_names),
+        aliased_new_names=tuple(aliased_new_names),
+        all_results=tuple(alias_map[s] if s in aliased_orig_names else s
+                          for s in all_modified),
         results=results)
 
   def visit_While(self, node):
@@ -144,38 +130,28 @@ class ControlFlowTransformer(gast.NodeTransformer):
     body_scope = anno.getanno(node, 'body_scope')
     body_closure = tuple(body_scope.modified - body_scope.created)
 
-    def template(
-        state,  # pylint:disable=unused-argument
-        state_ast_tuple,  # pylint:disable=unused-argument
-        test_name,
-        test,  # pylint:disable=unused-argument
-        body_name,
-        body):
-
-      def test_name(state):  # pylint:disable=function-redefined,unused-argument
-        return test
-
-      def body_name(state):  # pylint:disable=function-redefined,unused-argument
-        body  # pylint:disable=pointless-statement
-        return state,
-
-      state_ast_tuple = tf.while_loop(test_name, body_name, [state])  # pylint:disable=undefined-variable
-
-    test_name = self.namer.new_symbol('loop_test', body_scope.referenced)
-    body_name = self.namer.new_symbol('loop_body', body_scope.referenced)
     if len(body_closure) == 1:
-      state = gast.Name(body_closure[0], None, None)
+      state = body_closure[0]
       state_ast_tuple = state
     else:
-      state = tuple(gast.Name(n, None, None) for n in body_closure)
-      state_ast_tuple = gast.Tuple(state, None)
+      state = tuple(body_closure)
+      state_ast_tuple = gast.Tuple(
+          tuple(gast.Name(n, None, None) for n in state), None)
+    template = """
+      def test_name(state):
+        return test
+      def body_name(state):
+        body
+        return state,
+      state_ast_tuple = tf.while_loop(test_name, body_name, [state])
+    """
     node = templates.replace(
         template,
         state=state,
         state_ast_tuple=state_ast_tuple,
-        test_name=gast.Name(test_name, gast.Load(), None),
+        test_name=self.namer.new_symbol('loop_test', body_scope.referenced),
         test=node.test,
-        body_name=gast.Name(body_name, gast.Load(), None),
+        body_name=self.namer.new_symbol('loop_body', body_scope.referenced),
         body=node.body)
 
     return node
diff --git a/tensorflow/contrib/py2tf/converters/for_canonicalization.py b/tensorflow/contrib/py2tf/converters/for_canonicalization.py
index 52360789cd..c284689b90 100644
--- a/tensorflow/contrib/py2tf/converters/for_canonicalization.py
+++ b/tensorflow/contrib/py2tf/converters/for_canonicalization.py
@@ -42,46 +42,40 @@ class ForLoopCanonicalizationTransformer(gast.NodeTransformer):
     # Or maybe we should replace range with tf.range?
 
     if anno.hasanno(node, 'extra_cond'):
-
-      def template(loop_iter, target, body, i, n, extra_cond):  # pylint:disable=unused-argument
+      template = """
         i = 0
-        n = len(loop_iter)  # pylint:disable=undefined-variable
+        n = len(loop_iter)
         while i < n and extra_cond:
           # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
           target = loop_iter[i]
-          body  # pylint:disable=pointless-statement
+          body
           i += 1
-
+      """
       return templates.replace(
           template,
           loop_iter=node.iter,
           target=node.target,
           body=node.body,
-          i=gast.Name(
-              self.namer.new_symbol('i', body_scope.referenced), None, None),
-          n=gast.Name(
-              self.namer.new_symbol('n', body_scope.referenced), None, None),
+          i=self.namer.new_symbol('i', body_scope.referenced),
+          n=self.namer.new_symbol('n', body_scope.referenced),
           extra_cond=anno.getanno(node, 'extra_cond'))
     else:
-
-      def template(loop_iter, target, body, i, n):  # pylint:disable=unused-argument
+      template = """
         i = 0
-        n = len(loop_iter)  # pylint:disable=undefined-variable
+        n = len(loop_iter)
         while i < n:
           # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
           target = loop_iter[i]
           body  # pylint:disable=pointless-statement
           i += 1
-
+      """
       return templates.replace(
           template,
           loop_iter=node.iter,
           target=node.target,
           body=node.body,
-          i=gast.Name(
-              self.namer.new_symbol('i', body_scope.referenced), None, None),
-          n=gast.Name(
-              self.namer.new_symbol('n', body_scope.referenced), None, None))
+          i=self.namer.new_symbol('i', body_scope.referenced),
+          n=self.namer.new_symbol('n', body_scope.referenced))
 
   def visit_Continue(self, node):
     assert False, 'continue statement should be desugared at this point'
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
index 1f25303fba..a88828ff80 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
@@ -94,12 +94,10 @@ class SideEffectGuardTransformer(gast.NodeTransformer):
     return node
 
   def _gate_symbols(self, guard_statement, guarded_args):
-
-    def template(args):  # pylint:disable=unused-argument
-      (args,) = (tf.identity(a) for a in (args,))  # pylint:disable=undefined-variable
-
-    guards = templates.replace(
-        template, args=tuple(gast.Name(a, None, None) for a in guarded_args))
+    template = """
+      (args,) = (tf.identity(a) for a in (args,))
+    """
+    guards = templates.replace(template, args=tuple(guarded_args))
     guard_statement.body.extend(guards)
     return guard_statement
 
@@ -110,29 +108,25 @@ class SideEffectGuardTransformer(gast.NodeTransformer):
       #   opt.minimize(loss)
       # or:
       #   tf.py_func(...)
-
       args_scope = anno.getanno(node.value, 'args_scope')
       temp_name = self.namer.new_symbol('temp', args_scope.parent.referenced)
       # TODO(mdan): Unsafe reference modification!
       args_scope.mark_write(temp_name)
-
-      def template(call, temp_result):
+      template = """
         temp_result = call
         if temp_result is not None:
           if not isinstance(temp_result, (list, tuple)):
             temp_result = (temp_result,)
-          ctx = tf.control_dependencies(temp_result)  # pylint:disable=undefined-variable
+          ctx = tf.control_dependencies(temp_result)
         else:
-          ctx = contextmanager(lambda: (yield))()  # pylint:disable=undefined-variable
+          ctx = contextmanager(lambda: (yield))()
         with ctx:
           # TODO(mdan): Also insert ops to re-fetch if variables are involved.
           pass  # Will be removed below.
-
-      # TODO(mdan): This is brittle. Reorganize this mechanism.
+      """
+      # TODO(mdan): This is brittle. Reorganize the mechanism.
       statements = templates.replace(
-          template,
-          call=node.value,
-          temp_result=gast.Name(temp_name, None, None))
+          template, call=node.value, temp_result=temp_name)
       control_deps_guard = statements[-1]
       control_deps_guard.body = []
 
diff --git a/tensorflow/contrib/py2tf/pyct/templates.py b/tensorflow/contrib/py2tf/pyct/templates.py
index 4fadc793e6..77c5fbe02a 100644
--- a/tensorflow/contrib/py2tf/pyct/templates.py
+++ b/tensorflow/contrib/py2tf/pyct/templates.py
@@ -80,37 +80,46 @@ class ReplaceTransformer(gast.NodeTransformer):
       return node
 
 
+def _strings_to_names(n):
+  if isinstance(n, str):
+    # Note: the node will receive the ctx value from the template, see
+    # ReplaceTransformer.visit_Name.
+    return gast.Name(id=n, ctx=None, annotation=None)
+  if isinstance(n, list):
+    return [_strings_to_names(e) for e in n]
+  if isinstance(n, tuple):
+    return tuple(_strings_to_names(e) for e in n)
+  return n
+
+
 def replace(template, **replacements):
   """Replace placeholders in a Python template.
 
+  AST Name and Tuple nodes always receive the context that inferred from
+  the template. However, when replacing more complex nodes (that can potentially
+  contain Name children), then the caller is responsible for setting the
+  appropriate context.
+
   Args:
-    template: A function to be used as a template. Any placeholder is expected
-        to also be a function argument.
+    template: A string representing Python code. Any symbol name can be used
+        that appears in the template code can be used as placeholder.
     **replacements: A mapping from placeholder names to (lists of) AST nodes
-        that these placeholders will be replaced by.
+        that these placeholders will be replaced by. String values are also
+        supported as a shorthand for AST Name nodes with the respective ID.
 
   Returns:
-    body: An AST node or list of AST nodes with the replacements made. If the
-        template was a function, a list will be returned. If the template was a
-        node, the same node will be returned. If the template was a string, an
-        AST node will be returned (a `Module` node in the case of a multi-line
-        string, an `Expr` node otherwise).
+    An AST node or list of AST nodes with the replacements made. If the
+    template was a function, a list will be returned. If the template was a
+    node, the same node will be returned. If the template was a string, an
+    AST node will be returned (a `Module` node in the case of a multi-line
+    string, an `Expr` node otherwise).
 
   Raises:
-    ValueError: If a function is used as a template and an incorrect set of
-        replacements was passed.
+    ValueError: if the arguments are incorrect.
   """
-  tree = parser.parse_object(template).body[0]
-  placeholders = set(arg.id for arg in tree.args.args)
-  tree.args.args = []
-  if tree.args.vararg:
-    placeholders.add(tree.args.vararg)
-    tree.args.vararg = None
-  if set(replacements.keys()) != placeholders:
-    raise ValueError(
-        'too many or few replacements. replacements: %s; placeholders: %s' %
-        (replacements.keys(), placeholders))
-
-  # Perform the replacement, stripping the function into which the template was
-  # wrapped.
+  if not isinstance(template, str):
+    raise ValueError('Expected string template, got %s' % type(template))
+  tree = parser.parse_str(template)
+  for k in replacements:
+    replacements[k] = _strings_to_names(replacements[k])
   return ReplaceTransformer(replacements).visit(tree).body
diff --git a/tensorflow/contrib/py2tf/pyct/templates_test.py b/tensorflow/contrib/py2tf/pyct/templates_test.py
index 2ad8b9317b..1143131283 100644
--- a/tensorflow/contrib/py2tf/pyct/templates_test.py
+++ b/tensorflow/contrib/py2tf/pyct/templates_test.py
@@ -28,46 +28,42 @@ from tensorflow.python.platform import test
 class TemplatesTest(test.TestCase):
 
   def test_replace_variable(self):
-    def template(a):  # pylint:disable=unused-argument
-      def test_fn(a):  # pylint:disable=unused-variable
+    template = """
+      def test_fn(a):
         a += 1
         a = 2 * a + 1
-        return b  # pylint:disable=undefined-variable
+        return b
+    """
 
-    node = templates.replace(
-        template, a=gast.Name('b', gast.Load(), None))[0]
+    node = templates.replace(template, a='b')[0]
     result = compiler.ast_to_object(node)
     self.assertEquals(7, result.test_fn(2))
 
   def test_replace_function_name(self):
-    def template(fname):  # pylint:disable=unused-argument
-      def fname(a):  # pylint:disable=function-redefined
+    template = """
+      def fname(a):
         a += 1
         a = 2 * a + 1
         return a
+    """
 
-    node = templates.replace(
-        template, fname=gast.Name('test_fn', gast.Load(), None))[0]
+    node = templates.replace(template, fname='test_fn')[0]
     result = compiler.ast_to_object(node)
     self.assertEquals(7, result.test_fn(2))
 
   def test_code_block(self):
-    def template(block):  # pylint:disable=unused-argument
-      def test_fn(a):  # pylint:disable=unused-variable
-        block  # pylint:disable=pointless-statement
+    template = """
+      def test_fn(a):
+        block
         return a
+    """
 
     node = templates.replace(
         template,
         block=[
-            gast.Assign(
-                [
-                    gast.Name('a', gast.Store(), None)
-                ],
-                gast.BinOp(
-                    gast.Name('a', gast.Load(), None),
-                    gast.Add(),
-                    gast.Num(1))),
+            gast.Assign([
+                gast.Name('a', None, None)
+            ], gast.BinOp(gast.Name('a', None, None), gast.Add(), gast.Num(1))),
         ] * 2)[0]
     result = compiler.ast_to_object(node)
     self.assertEquals(3, result.test_fn(1))
-- 
GitLab


From dcb918a8f64790c615d1ec018b7b6e141a6a8653 Mon Sep 17 00:00:00 2001
From: Jerome <jvgonzalvo@up.edu.ph>
Date: Fri, 26 Jan 2018 02:41:30 +0800
Subject: [PATCH 165/966] Modified Implementation of ndlstm_base_dynamic.
 (#16402)

* Added ctc_loss_dense_labels. This does the conversion of dense labels into sparse ones to be passed into the core ctc_loss function.

* Removed constant_op from the import.

* Matched ctc_loss_dense_labels with the other layers ops.

* Added ctc_loss_dense_labels to contrib.layers __init__.py file

* Added missing comma to list of ops.

* Reordred arguments for ctc_loss_dense_labels

Labels should be first then inputs for ctc_loss.

* Removed ctc_loss_dense_labels.

Replaced it with dense_to_sparse instead so that there'll be only one ctc_loss function.

* Replaced ctc_loss_dense_labels with dense_to_sparse

* Fixed dense_to_sparse. Some of the names of the variables did not match with that of the parameters.

* Updated documentation for dense_to_sparse since it can accept a tensor of any shape.

* Added test case for dense_to_sparse.

* Updated documentation. Dense to sparse accepts int tensors.

* Fixed testDenseFromConstantToSparse.

The sparse_to_dense order of arguments in the test are wrong and the expected constant should be of int64.

* Modified implementation of ndlstm_base_dynamic.

It now uses a BasicLSTMCell that has state_is_tuple=True to address deprecation. Right now it is still unknown why it was set to false in the first place.
---
 tensorflow/contrib/ndlstm/python/lstm1d.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/ndlstm/python/lstm1d.py b/tensorflow/contrib/ndlstm/python/lstm1d.py
index d3c3531f40..b24e332e4a 100644
--- a/tensorflow/contrib/ndlstm/python/lstm1d.py
+++ b/tensorflow/contrib/ndlstm/python/lstm1d.py
@@ -22,7 +22,6 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn
@@ -85,18 +84,11 @@ def ndlstm_base_dynamic(inputs, noutput, scope=None, reverse=False):
     Output sequence (length, batch_size, noutput)
   """
   with variable_scope.variable_scope(scope, "SeqLstm", [inputs]):
-    # TODO(tmb) make batch size, sequence_length dynamic
-    # example: sequence_length = tf.shape(inputs)[0]
-    _, batch_size, _ = _shape(inputs)
-    lstm_cell = rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False)
-    state = array_ops.zeros([batch_size, lstm_cell.state_size])
-    sequence_length = int(inputs.get_shape()[0])
-    sequence_lengths = math_ops.to_int64(
-        array_ops.fill([batch_size], sequence_length))
+    lstm_cell = rnn_cell.BasicLSTMCell(noutput)
     if reverse:
       inputs = array_ops.reverse_v2(inputs, [0])
     outputs, _ = rnn.dynamic_rnn(
-        lstm_cell, inputs, sequence_lengths, state, time_major=True)
+      lstm_cell, inputs, time_major=True, dtype=inputs.dtype)
     if reverse:
       outputs = array_ops.reverse_v2(outputs, [0])
     return outputs
-- 
GitLab


From ab9ff0047b10dbca2bf819d4656a59213f98184a Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Fri, 26 Jan 2018 03:52:06 +0900
Subject: [PATCH 166/966] fix typos (#16384)

---
 tensorflow/compiler/xla/client/computation_builder.h | 2 +-
 tensorflow/tools/docs/pretty_docs.py                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index d82ba63e8a..ea4cdb7667 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -67,7 +67,7 @@ class ComputationBuilder {
   // OpMetadata is often applied to a series of XLA HLO instructions. As a
   // result, OpMetadata is set on the Computation Builder. All subsequent
   // instructions generated via this Computation Builder will have the same
-  // OpMetadata attached until a call to ClearOpMetdata.
+  // OpMetadata attached until a call to ClearOpMetadata.
   void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
 
   // Clears the HloMetadata state.
diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py
index c033c16ae9..b5df633800 100644
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@@ -323,7 +323,7 @@ class _Metadata(object):
   """
 
   def __init__(self, name):
-    """Creata a Metadata builder.
+    """Create a Metadata builder.
 
     Args:
       name: The name of the page being described by the Metadata block.
-- 
GitLab


From 5b8e8ce30c4f424b5a9c2906e2d95bc177181133 Mon Sep 17 00:00:00 2001
From: ted chang <htchang@us.ibm.com>
Date: Thu, 25 Jan 2018 11:05:08 -0800
Subject: [PATCH 167/966] Add additional argument to freeze_graph (#15906)

---
 tensorflow/python/tools/freeze_graph.py      | 22 ++++++++++++++------
 tensorflow/python/tools/freeze_graph_test.py |  3 ++-
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index 0ddf09260b..a2e86a1c43 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -72,7 +72,8 @@ def freeze_graph_with_def_protos(input_graph_def,
                                  variable_names_blacklist="",
                                  input_meta_graph_def=None,
                                  input_saved_model_dir=None,
-                                 saved_model_tags=None):
+                                 saved_model_tags=None,
+                                 checkpoint_version=saver_pb2.SaverDef.V2):
   """Converts all variables in a graph and checkpoint into constants."""
   del restore_op_name, filename_tensor_name  # Unused by updated loading code.
 
@@ -100,7 +101,8 @@ def freeze_graph_with_def_protos(input_graph_def,
     _ = importer.import_graph_def(input_graph_def, name="")
   with session.Session() as sess:
     if input_saver_def:
-      saver = saver_lib.Saver(saver_def=input_saver_def)
+      saver = saver_lib.Saver(saver_def=input_saver_def,
+                              write_version=checkpoint_version)
       saver.restore(sess, input_checkpoint)
     elif input_meta_graph_def:
       restorer = saver_lib.import_meta_graph(
@@ -124,7 +126,8 @@ def freeze_graph_with_def_protos(input_graph_def,
           # 'global_step' or a similar housekeeping element) so skip it.
           continue
         var_list[key] = tensor
-      saver = saver_lib.Saver(var_list=var_list)
+      saver = saver_lib.Saver(var_list=var_list,
+                              write_version=checkpoint_version)
       saver.restore(sess, input_checkpoint)
       if initializer_nodes:
         sess.run(initializer_nodes.split(","))
@@ -217,7 +220,8 @@ def freeze_graph(input_graph,
                  variable_names_blacklist="",
                  input_meta_graph=None,
                  input_saved_model_dir=None,
-                 saved_model_tags=tag_constants.SERVING):
+                 saved_model_tags=tag_constants.SERVING,
+                 checkpoint_version=saver_pb2.SaverDef.V2):
   """Converts all variables in a graph and checkpoint into constants."""
   input_graph_def = None
   if input_saved_model_dir:
@@ -236,7 +240,8 @@ def freeze_graph(input_graph,
       input_graph_def, input_saver_def, input_checkpoint, output_node_names,
       restore_op_name, filename_tensor_name, output_graph, clear_devices,
       initializer_nodes, variable_names_whitelist, variable_names_blacklist,
-      input_meta_graph_def, input_saved_model_dir, saved_model_tags.split(","))
+      input_meta_graph_def, input_saved_model_dir,
+      saved_model_tags.split(","), checkpoint_version=checkpoint_version)
 
 
 def main(unused_args):
@@ -246,7 +251,7 @@ def main(unused_args):
                FLAGS.output_graph, FLAGS.clear_devices, FLAGS.initializer_nodes,
                FLAGS.variable_names_whitelist, FLAGS.variable_names_blacklist,
                FLAGS.input_meta_graph, FLAGS.input_saved_model_dir,
-               FLAGS.saved_model_tags)
+               FLAGS.saved_model_tags, checkpoint_version=checkpoint_version)
 
 
 if __name__ == "__main__":
@@ -267,6 +272,11 @@ if __name__ == "__main__":
       type=str,
       default="",
       help="TensorFlow variables file to load.")
+  parser.add_argument(
+      "--checkpoint_version",
+      type=int,
+      default=saver_pb2.SaverDef.V2,
+      help="Tensorflow variable file format")
   parser.add_argument(
       "--output_graph",
       type=str,
diff --git a/tensorflow/python/tools/freeze_graph_test.py b/tensorflow/python/tools/freeze_graph_test.py
index feeed7102c..342732465d 100644
--- a/tensorflow/python/tools/freeze_graph_test.py
+++ b/tensorflow/python/tools/freeze_graph_test.py
@@ -86,7 +86,8 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
     freeze_graph.freeze_graph(
         input_graph_path, input_saver_def_path, input_binary, checkpoint_path,
         output_node_names, restore_op_name, filename_tensor_name,
-        output_graph_path, clear_devices, "", "", input_meta_graph)
+        output_graph_path, clear_devices, "", "", input_meta_graph,
+        checkpoint_version=saver_write_version)
 
     # Now we make sure the variable is now a constant, and that the graph still
     # produces the expected result.
-- 
GitLab


From 2a16133061ba3f8fa60c0338cd629f2211f9b17d Mon Sep 17 00:00:00 2001
From: ted chang <htchang@us.ibm.com>
Date: Thu, 25 Jan 2018 11:16:27 -0800
Subject: [PATCH 168/966] Add checkpoint file prefix check (#14341)

Additionally fix Two failed tests caused by the PR

Fixes #9465
---
 .../contrib/slim/python/slim/evaluation_test.py       |  2 +-
 tensorflow/python/training/saver.py                   | 11 ++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index 870f504d10..f5a9299d26 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -236,7 +236,7 @@ class SingleEvaluationTest(test.TestCase):
   def _prepareCheckpoint(self, checkpoint_path):
     init_op = control_flow_ops.group(variables.global_variables_initializer(),
                                      variables.local_variables_initializer())
-    saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
+    saver = saver_lib.Saver()
     with self.test_session() as sess:
       sess.run(init_op)
       saver.save(sess, checkpoint_path)
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 2c59b82ebe..4f3773c0fc 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -1592,9 +1592,9 @@ class Saver(object):
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Returns:
-      A string: path prefix used for the checkpoint files.  If the saver is
-        sharded, this string ends with: '-?????-of-nnnnn' where 'nnnnn'
-        is the number of shards created.
+      A string: path prefix used for the checkpoint files. If checkpoint
+        format is V1 and the saver is sharded, this string ends with:
+         '-?????-of-nnnnn' where 'nnnnn' is the number of shards created.
       If the saver is empty, returns None.
 
     Raises:
@@ -1744,6 +1744,11 @@ class Saver(object):
       return
     if save_path is None:
       raise ValueError("Can't load save_path when it is None.")
+    if (os.path.isfile(save_path) and
+        self._write_version != saver_pb2.SaverDef.V1):
+      raise ValueError("The specified path: %s is a file."
+                       " Please specify only the path prefix"
+                       " to the checkpoint files." % save_path)
     logging.info("Restoring parameters from %s", save_path)
     if context.in_graph_mode():
       sess.run(self.saver_def.restore_op_name,
-- 
GitLab


From a8c4e8d96de7c0978851a5f9718bbd6b8056d862 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Thu, 25 Jan 2018 11:17:08 -0800
Subject: [PATCH 169/966] [XLA] Make xla_hlo_profile_test less flaky

Instead of relying on some oeprations always taking longer than others (and this
appearing in a specific order in the rendered HLO profile), pick them out by
opcode.

PiperOrigin-RevId: 183268593
---
 tensorflow/compiler/xla/BUILD                 |   1 +
 tensorflow/compiler/xla/map_util.h            |  21 +++
 tensorflow/compiler/xla/tests/BUILD           |   1 +
 .../xla/tests/xla_hlo_profile_test.cc         | 125 +++++++++++++-----
 4 files changed, 112 insertions(+), 36 deletions(-)

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 438f1443f1..c22fd37129 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -182,6 +182,7 @@ cc_library(
     deps = [
         ":status",
         ":status_macros",
+        ":statusor",
         ":types",
         ":xla_data_proto",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/map_util.h b/tensorflow/compiler/xla/map_util.h
index 50659c1240..0ad0b91330 100644
--- a/tensorflow/compiler/xla/map_util.h
+++ b/tensorflow/compiler/xla/map_util.h
@@ -16,6 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_MAP_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_MAP_UTIL_H_
 
+#include <functional>
+#include <sstream>
+
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -44,6 +49,22 @@ typename Collection::value_type::second_type& FindOrDie(
   return it->second;
 }
 
+// Like FindOrDie but returns an error instead of dying if `key` is not in
+// `container`.
+template <class Collection>
+StatusOr<
+    std::reference_wrapper<const typename Collection::value_type::second_type>>
+MaybeFind(const Collection& collection,
+          const typename Collection::value_type::first_type& key) {
+  typename Collection::const_iterator it = collection.find(key);
+  if (it == collection.end()) {
+    std::ostringstream os;
+    os << key;
+    return NotFound("key not found: %s", os.str().c_str());
+  }
+  return {it->second};
+}
+
 // Inserts the key-value pair into the collection. Dies if key was already
 // present.
 template <class Collection>
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 02ad9d982f..ac11081699 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -351,6 +351,7 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:platform_util",
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 1d2f436194..9ad2a19853 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -19,12 +19,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -32,6 +34,7 @@ limitations under the License.
 namespace xla {
 namespace {
 namespace se = ::perftools::gputools;
+namespace gtl = ::tensorflow::gtl;
 
 class HloProfileTest : public ClientLibraryTestBase {};
 
@@ -43,39 +46,74 @@ struct ParsedProfileOutputLine {
   string trops;
   string bytes_per_sec;
   string bytes_per_cycle;
-  string name;
+  string opcode;
 };
 
-StatusOr<ParsedProfileOutputLine> ParseProfileOutputLine(const string& line,
-                                                         bool expect_flops,
-                                                         bool expect_trops) {
+::testing::AssertionResult HasFlops(
+    const ParsedProfileOutputLine& parsed_line) {
+  if (RE2::FullMatch(parsed_line.flops, "[0-9.TGMk]+FLOP/s")) {
+    return ::testing::AssertionSuccess()
+           << "'flops' field present in  " << parsed_line.opcode << ": '"
+           << parsed_line.flops << "'";
+  }
+
+  return ::testing::AssertionFailure()
+         << "'flops' field absent in  " << parsed_line.opcode << ": '"
+         << parsed_line.flops << "'";
+}
+
+::testing::AssertionResult HasTrops(
+    const ParsedProfileOutputLine& parsed_line) {
+  if (RE2::FullMatch(parsed_line.trops, "[0-9.TGMk]+TROP/s")) {
+    return ::testing::AssertionSuccess()
+           << "'trops' field present in  " << parsed_line.opcode << ": '"
+           << parsed_line.trops << "'";
+  }
+
+  return ::testing::AssertionFailure()
+         << "'trops' field absent in  " << parsed_line.opcode << ": '"
+         << parsed_line.trops << "'";
+}
+
+Status ParseOneProfileOutputLine(
+    const string& line, bool expect_hlo,
+    gtl::FlatMap<string, ParsedProfileOutputLine>* parsed_results) {
   string separator = "[^:]*:: +";
   string match_percentage = "\\d+\\.\\d\\d%";
   string match_cycles = "(\\d+) cycles +\\( *(" + match_percentage + ")\\)";
   string match_usecs = "([0-9.]+) usec";
-  string match_flops = expect_flops ? "([0-9.TGMk]+)FLOP/s" : "(<none>)";
-  string match_trops = expect_trops ? "([0-9.TGMk]+)TROP/s" : "(<none>)";
+  string match_flops = "([^ ]+)";
+  string match_trops = "([^ ]+)";
   string match_bytes_per_sec = "([0-9.TGMKi]+)B/s";
   string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle";
+
+  // The underlined part is what we're trying to match with match_opcode:
+  //
+  //   %dot33 = f32[256,256]{1,0} dot(...)
+  //                              ^^^
+
+  string match_opcode =
+      expect_hlo ? "%[^=]+= [^ ]+ ([^(]+)\\(.*" : "(\\[total\\])";
   string regexp_pattern = tensorflow::strings::StrCat(
       " +", match_cycles, separator, match_usecs, separator, match_flops,
       separator, match_trops, separator, match_bytes_per_sec, separator,
-      match_bytes_per_cycle, separator, "(.*)");
+      match_bytes_per_cycle, separator, match_opcode);
 
-  RE2 pattern(regexp_pattern);
   ParsedProfileOutputLine parsed_line;
   bool matched = RE2::FullMatch(
-      line, pattern, &parsed_line.cycles, &parsed_line.cycles_percentage,
+      line, regexp_pattern, &parsed_line.cycles, &parsed_line.cycles_percentage,
       &parsed_line.usec, &parsed_line.flops, &parsed_line.trops,
       &parsed_line.bytes_per_sec, &parsed_line.bytes_per_cycle,
-      &parsed_line.name);
+      &parsed_line.opcode);
   if (!matched) {
     return tensorflow::errors::InvalidArgument(
         "Input did not match regexp.  Input: ", line,
         ", Regexp: ", regexp_pattern);
   }
 
-  return parsed_line;
+  InsertOrDie(parsed_results, parsed_line.opcode, parsed_line);
+
+  return Status::OK();
 }
 
 // Returns void so that we can ASSERT.
@@ -148,7 +186,7 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
                           ClientLibrary::GetOrCreateLocalClient(platform));
 
   ComputationBuilder builder(client, TestName());
-  auto result = builder.Tanh(builder.Dot(
+  auto result = builder.Tanh(builder.Add(
       builder.Parameter(0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"),
       builder.Parameter(1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs")));
 
@@ -161,31 +199,43 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
   std::vector<string> profile_output_lines =
       tensorflow::str_util::Split(profile_output, '\n');
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      ParsedProfileOutputLine total_profile,
-      ParseProfileOutputLine(profile_output_lines[1], /*expect_flops=*/true,
-                             /*expect_trops=*/true));
+  gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines;
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      ParsedProfileOutputLine dot_profile,
-      ParseProfileOutputLine(profile_output_lines[2], /*expect_flops=*/true,
-                             /*expect_trops=*/false));
+  TF_ASSERT_OK(ParseOneProfileOutputLine(
+      profile_output_lines[1], /*expect_hlo=*/false, &parsed_profile_lines));
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      ParsedProfileOutputLine tanh_profile,
-      ParseProfileOutputLine(profile_output_lines[3], /*expect_flops=*/false,
-                             /*expect_trops=*/true));
+  TF_ASSERT_OK(ParseOneProfileOutputLine(
+      profile_output_lines[2], /*expect_hlo=*/true, &parsed_profile_lines));
+
+  TF_ASSERT_OK(ParseOneProfileOutputLine(
+      profile_output_lines[3], /*expect_hlo=*/true, &parsed_profile_lines));
+
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_profile,
+                          MaybeFind(parsed_profile_lines, "[total]"));
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile,
+                          MaybeFind(parsed_profile_lines, "add"));
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine tanh_profile,
+                          MaybeFind(parsed_profile_lines, "tanh"));
 
   EXPECT_GT(total_profile.cycles, 0);
   EXPECT_EQ(total_profile.cycles_percentage, "100.00%");
 
+  EXPECT_TRUE(HasFlops(total_profile));
+  EXPECT_TRUE(HasTrops(total_profile));
+
   EXPECT_GT(total_profile.cycles, dot_profile.cycles);
   EXPECT_NE(dot_profile.cycles_percentage, "0.00%");
   EXPECT_NE(dot_profile.cycles_percentage, "100.00%");
 
+  EXPECT_TRUE(HasFlops(dot_profile));
+  EXPECT_FALSE(HasTrops(dot_profile));
+
   EXPECT_GT(total_profile.cycles, tanh_profile.cycles);
   EXPECT_NE(tanh_profile.cycles_percentage, "0.00%");
   EXPECT_NE(tanh_profile.cycles_percentage, "100.00%");
+
+  EXPECT_FALSE(HasFlops(tanh_profile));
+  EXPECT_TRUE(HasTrops(tanh_profile));
 }
 
 // TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
@@ -220,7 +270,7 @@ XLA_TEST_F(HloProfileTest,
     auto matrix = builder.GetTupleElement(state, 1);
     auto next_iteration = builder.Add(builder.GetTupleElement(state, 0),
                                       builder.ConstantR0<int32>(1));
-    builder.Tuple({next_iteration, builder.Dot(matrix, matrix)});
+    builder.Tuple({next_iteration, builder.Add(matrix, matrix)});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
@@ -249,20 +299,23 @@ XLA_TEST_F(HloProfileTest,
 
   ASSERT_NE(while_body_profile_start, profile_output_lines.end());
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      ParsedProfileOutputLine total_while_body_profile,
-      ParseProfileOutputLine(*std::next(while_body_profile_start, 1),
-                             /*expect_flops=*/false,
-                             /*expect_trops=*/false));
+  gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines;
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      ParsedProfileOutputLine dot_profile,
-      ParseProfileOutputLine(*std::next(while_body_profile_start, 2),
-                             /*expect_flops=*/false,
-                             /*expect_trops=*/false));
+  TF_ASSERT_OK(
+      ParseOneProfileOutputLine(*std::next(while_body_profile_start, 1),
+                                /*expect_hlo=*/false, &parsed_profile_lines));
+
+  TF_ASSERT_OK(
+      ParseOneProfileOutputLine(*std::next(while_body_profile_start, 2),
+                                /*expect_hlo=*/true, &parsed_profile_lines));
+
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_while_body_profile,
+                          MaybeFind(parsed_profile_lines, "[total]"));
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile,
+                          MaybeFind(parsed_profile_lines, "add"));
 
   EXPECT_GT(total_while_body_profile.cycles, 0);
-  EXPECT_EQ(total_while_body_profile.name, "[total]");
+  EXPECT_EQ(total_while_body_profile.opcode, "[total]");
   EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.00%");
 
   EXPECT_GT(total_while_body_profile.cycles, dot_profile.cycles);
-- 
GitLab


From 45d47f30243dd4a26705f24b2a82188b0ec9b7d2 Mon Sep 17 00:00:00 2001
From: Andrew Harp <andrewharp@users.noreply.github.com>
Date: Thu, 25 Jan 2018 14:56:22 -0500
Subject: [PATCH 170/966] Don't load libcupti.so from regular path on Android
 (#16303)

* Don't load libcupti.so from regular path on Android

* replace NVIDIA_TEGRA with ANDROID_TEGRA to be less redundant and more specific
---
 tensorflow/contrib/makefile/Makefile                | 6 +++---
 tensorflow/core/common_runtime/gpu/gpu_device.cc    | 2 +-
 tensorflow/core/framework/register_types.h          | 2 +-
 tensorflow/stream_executor/cuda/cuda_diagnostics.cc | 2 +-
 tensorflow/stream_executor/dso_loader.cc            | 8 ++++++++
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index dd5770dc99..c50f8ceec0 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -377,10 +377,10 @@ $(MARCH_OPTION) \
 
 	ifeq ($(BUILD_FOR_TEGRA),1)
 		NVCC := $(JETPACK)/cuda/bin/nvcc
-		NVCCFLAGS := -x=cu -D__CUDACC__ -DNVCC -DNVIDIA_TEGRA -ccbin $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-g++ --std c++11 --expt-relaxed-constexpr -m64 -gencode arch=compute_53,\"code=sm_53\" -gencode arch=compute_62,\"code=sm_62\" -DEIGEN_AVOID_STL_ARRAY -DTENSORFLOW_USE_EIGEN_THREADPOOL -DLANG_CXX11 -DEIGEN_HAS_C99_MATH -DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=5.3
+		NVCCFLAGS := -x=cu -D__CUDACC__ -DNVCC -DANDROID_TEGRA -ccbin $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-g++ --std c++11 --expt-relaxed-constexpr -m64 -gencode arch=compute_53,\"code=sm_53\" -gencode arch=compute_62,\"code=sm_62\" -DEIGEN_AVOID_STL_ARRAY -DTENSORFLOW_USE_EIGEN_THREADPOOL -DLANG_CXX11 -DEIGEN_HAS_C99_MATH -DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=5.3
 		CXXFLAGS4NVCC =\
 -DIS_SLIM_BUILD \
--DNVIDIA_TEGRA \
+-DANDROID_TEGRA \
 -fno-exceptions \
 -DNDEBUG $(OPTFLAGS) \
 -march=armv8-a \
@@ -391,7 +391,7 @@ $(MARCH_OPTION) \
 		CXXFLAGS +=\
 -DGOOGLE_CUDA=1 \
 -D__ANDROID_TYPES_FULL__ \
--DNVIDIA_TEGRA \
+-DANDROID_TEGRA \
 -DEIGEN_AVOID_STL_ARRAY \
 -DEIGEN_HAS_C99_MATH \
 -DLANG_CXX11 -DTENSORFLOW_USE_EIGEN_THREADPOOL -DTF_EXTRA_CUDA_CAPABILITIES=5.3
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 0e5b6b7ef8..933d700f60 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -762,7 +762,7 @@ int64 MinSystemMemory(int64 available_memory) {
   // is necessary.
   min_system_memory *= 2;
 #endif
-#if defined(NVIDIA_TEGRA)
+#if defined(ANDROID_TEGRA)
   // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM and Video RAM
   min_system_memory = 1<<30;
 #endif
diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index edc93aec7f..e062adffe8 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -53,7 +53,7 @@ limitations under the License.
 */
 
 #if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION) || \
-    defined(NVIDIA_TEGRA)
+    defined(ANDROID_TEGRA)
 
 // All types are supported, so all macros are invoked.
 //
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index f35542e18f..933c103f52 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -232,7 +232,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
       result = StringToDriverVersion(version);
     }
 #else
-#if !defined(PLATFORM_WINDOWS) && !defined(NVIDIA_TEGRA)
+#if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
   // Callback used when iterating through DSOs. Looks for the driver-interfacing
   // DSO and yields its version number into the callback data, when found.
   auto iterate_phdr =
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
index 5210a81092..d71938634d 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -96,10 +96,18 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
 }
 
 /* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
+#if defined(ANDROID_TEGRA)
+  // On Android devices the CUDA version number is not added to the library name.
+  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
+                                      "cupti", ""),
+                                  GetCudaCuptiLibraryPath()),
+                      dso_handle);
+#else
   return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                       "cupti", GetCudaVersion()),
                                   GetCudaCuptiLibraryPath()),
                       dso_handle);
+#endif
 }
 
 static mutex& GetRpathMutex() {
-- 
GitLab


From 351c0a533a111636333b4ebeede16485cf679ca9 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 25 Jan 2018 12:02:36 -0800
Subject: [PATCH 171/966] Add C0330 bad-continuation check to pylint.

PiperOrigin-RevId: 183270896
---
 .../copy_graph/python/util/copy_elements.py   |  75 +-
 .../python/layers/feature_column_test.py      | 150 ++-
 .../learn/python/learn/datasets/__init__.py   |  23 +-
 .../learn/python/learn/datasets/synthetic.py  |  66 +-
 .../python/learn/estimators/estimator.py      | 262 ++---
 .../python/learn/estimators/estimator_test.py | 223 +++--
 .../learn/estimators/estimators_test.py       |  32 +-
 .../contrib/learn/python/learn/monitors.py    | 102 +-
 .../learn/python/learn/utils/export_test.py   |  34 +-
 .../learn/python/learn/utils/gc_test.py       |  49 +-
 .../contrib/metrics/python/ops/metric_ops.py  |  77 +-
 .../metrics/python/ops/metric_ops_test.py     | 933 ++++++++----------
 .../examples/cifar10/cifar10_pruning.py       |  86 +-
 tensorflow/contrib/mpi_collectives/mpi_ops.py |  22 +-
 .../training/elastic_average_optimizer.py     | 115 ++-
 .../elastic_average_optimizer_test.py         |  99 +-
 .../predictor/predictor_factories_test.py     |   4 +-
 .../kernel_tests/sparsemax_loss_test.py       |  34 +-
 .../python/kernel_tests/sparsemax_test.py     |  48 +-
 .../reading_data/fully_connected_reader.py    |  49 +-
 .../examples/label_image/label_image.py       |  41 +-
 tensorflow/python/client/session.py           | 185 ++--
 tensorflow/python/client/session_test.py      | 310 +++---
 .../inputs/queues/feeding_functions.py        | 101 +-
 .../python/kernel_tests/array_ops_test.py     |  97 +-
 .../python/kernel_tests/diag_op_test.py       | 225 ++---
 .../python/kernel_tests/map_stage_op_test.py  | 105 +-
 .../python/kernel_tests/pooling_ops_test.py   |  72 +-
 .../python/kernel_tests/reader_ops_test.py    |  23 +-
 .../python/kernel_tests/relu_op_test.py       |  20 +-
 .../kernel_tests/sparse_slice_op_test.py      | 102 +-
 .../python/kernel_tests/stage_op_test.py      |  34 +-
 tensorflow/python/layers/maxout.py            |  34 +-
 tensorflow/python/ops/array_grad.py           | 116 ++-
 tensorflow/python/ops/control_flow_ops.py     | 481 +++++----
 tensorflow/python/ops/data_flow_ops.py        | 390 +++++---
 tensorflow/python/ops/gradients_impl.py       |  91 +-
 tensorflow/python/ops/nn_grad.py              | 298 +++---
 tensorflow/python/ops/nn_grad_test.py         |  13 +-
 tensorflow/python/ops/special_math_ops.py     | 116 +--
 .../python/ops/special_math_ops_test.py       |  74 +-
 tensorflow/python/tools/inspect_checkpoint.py |  14 +-
 .../python/training/coordinator_test.py       |  70 +-
 tensorflow/tools/ci_build/ci_sanity.sh        |   8 +-
 tensorflow/tools/compatibility/ast_edits.py   |  71 +-
 tensorflow/tools/compatibility/tf_upgrade.py  |   4 +-
 46 files changed, 2937 insertions(+), 2641 deletions(-)

diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index bae66ffd42..b806799202 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -35,10 +35,10 @@ from tensorflow.python.ops.variables import Variable
 from tensorflow.python.client.session import Session
 from tensorflow.python.framework import ops
 
-__all__ = ["copy_op_to_graph", "copy_variable_to_graph", "get_copied_op"]
+__all__ = ['copy_op_to_graph', 'copy_variable_to_graph', 'get_copied_op']
 
 
-def copy_variable_to_graph(org_instance, to_graph, scope=""):
+def copy_variable_to_graph(org_instance, to_graph, scope=''):
   """Given a `Variable` instance from one `Graph`, initializes and returns
   a copy of it from another `Graph`, under the specified scope
   (default `""`).
@@ -56,12 +56,11 @@ def copy_variable_to_graph(org_instance, to_graph, scope=""):
   """
 
   if not isinstance(org_instance, Variable):
-    raise TypeError(str(org_instance) + " is not a Variable")
+    raise TypeError(str(org_instance) + ' is not a Variable')
 
   #The name of the new variable
-  if scope != "":
-    new_name = (scope + '/' +
-                org_instance.name[:org_instance.name.index(':')])
+  if scope != '':
+    new_name = (scope + '/' + org_instance.name[:org_instance.name.index(':')])
   else:
     new_name = org_instance.name[:org_instance.name.index(':')]
 
@@ -73,15 +72,15 @@ def copy_variable_to_graph(org_instance, to_graph, scope=""):
   for name, collection in org_instance.graph._collections.items():
     if org_instance in collection:
       if (name == ops.GraphKeys.GLOBAL_VARIABLES or
-          name == ops.GraphKeys.TRAINABLE_VARIABLES or
-          scope == ''):
+          name == ops.GraphKeys.TRAINABLE_VARIABLES or scope == ''):
         collections.append(name)
       else:
         collections.append(scope + '/' + name)
 
   #See if its trainable.
-  trainable = (org_instance in org_instance.graph.get_collection(
-      ops.GraphKeys.TRAINABLE_VARIABLES))
+  trainable = (
+      org_instance in org_instance.graph.get_collection(
+          ops.GraphKeys.TRAINABLE_VARIABLES))
   #Get the initial value
   with org_instance.graph.as_default():
     temp_session = Session()
@@ -89,17 +88,17 @@ def copy_variable_to_graph(org_instance, to_graph, scope=""):
 
   #Initialize the new variable
   with to_graph.as_default():
-    new_var = Variable(init_value,
-                       trainable,
-                       name=new_name,
-                       collections=collections,
-                       validate_shape=False)
+    new_var = Variable(
+        init_value,
+        trainable,
+        name=new_name,
+        collections=collections,
+        validate_shape=False)
 
   return new_var
 
 
-def copy_op_to_graph(org_instance, to_graph, variables,
-                     scope=""):
+def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
   """Returns a copy of an operation from another Graph under a specified scope.
 
   Given an `Operation` `org_instance` from one `Graph`,
@@ -139,14 +138,12 @@ def copy_op_to_graph(org_instance, to_graph, variables,
   #If a variable by the new name already exists, return the
   #correspondng tensor that will act as an input
   if new_name in copied_variables:
-    return to_graph.get_tensor_by_name(
-        copied_variables[new_name].name)
+    return to_graph.get_tensor_by_name(copied_variables[new_name].name)
 
   #If an instance of the same name exists, return appropriately
   try:
-    already_present = to_graph.as_graph_element(new_name,
-                                                allow_tensor=True,
-                                                allow_operation=True)
+    already_present = to_graph.as_graph_element(
+        new_name, allow_tensor=True, allow_operation=True)
     return already_present
   except:
     pass
@@ -184,20 +181,21 @@ def copy_op_to_graph(org_instance, to_graph, variables,
 
     #If it has an original_op parameter, copy it
     if op._original_op is not None:
-      new_original_op = copy_op_to_graph(op._original_op, to_graph,
-                                      variables, scope)
+      new_original_op = copy_op_to_graph(op._original_op, to_graph, variables,
+                                         scope)
     else:
       new_original_op = None
 
     #If it has control inputs, call this function recursively on each.
-    new_control_inputs = [copy_op_to_graph(x, to_graph, variables,
-                                        scope)
-                          for x in op.control_inputs]
+    new_control_inputs = [
+        copy_op_to_graph(x, to_graph, variables, scope)
+        for x in op.control_inputs
+    ]
 
     #If it has inputs, call this function recursively on each.
-    new_inputs = [copy_op_to_graph(x, to_graph, variables,
-                                scope)
-                  for x in op.inputs]
+    new_inputs = [
+        copy_op_to_graph(x, to_graph, variables, scope) for x in op.inputs
+    ]
 
     #Make a new node_def based on that of the original.
     #An instance of tensorflow.core.framework.node_def_pb2.NodeDef, it
@@ -216,13 +214,8 @@ def copy_op_to_graph(org_instance, to_graph, variables,
     op_def = deepcopy(op._op_def)
 
     #Initialize a new Operation instance
-    new_op = ops.Operation(new_node_def,
-                           to_graph,
-                           new_inputs,
-                           output_types,
-                           new_control_inputs,
-                           input_types,
-                           new_original_op,
+    new_op = ops.Operation(new_node_def, to_graph, new_inputs, output_types,
+                           new_control_inputs, input_types, new_original_op,
                            op_def)
     #Use Graph's hidden methods to add the op
     to_graph._add_op(new_op)  # pylint: disable=protected-access
@@ -233,10 +226,10 @@ def copy_op_to_graph(org_instance, to_graph, variables,
     return new_op
 
   else:
-    raise TypeError("Could not copy instance: " + str(org_instance))
+    raise TypeError('Could not copy instance: ' + str(org_instance))
 
 
-def get_copied_op(org_instance, graph, scope=""):
+def get_copied_op(org_instance, graph, scope=''):
   """Given an `Operation` instance from some `Graph`, returns
   its namesake from `graph`, under the specified scope
   (default `""`).
@@ -259,5 +252,5 @@ def get_copied_op(org_instance, graph, scope=""):
   else:
     new_name = org_instance.name
 
-  return graph.as_graph_element(new_name, allow_tensor=True,
-                                allow_operation=True)
+  return graph.as_graph_element(
+      new_name, allow_tensor=True, allow_operation=True)
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index 2eaea23177..fc8f153fe3 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -221,8 +221,8 @@ class FeatureColumnTest(test.TestCase):
     weighted_sparse_col = fc.weighted_sparse_column(ids, "weights")
     self.assertEqual(weighted_sparse_col.name, "ids_weighted_by_weights")
 
-    b = fc.shared_embedding_columns([sparse_col, weighted_sparse_col],
-                                    dimension=4, combiner="mean")
+    b = fc.shared_embedding_columns(
+        [sparse_col, weighted_sparse_col], dimension=4, combiner="mean")
     self.assertEqual(len(b), 2)
     self.assertEqual(b[0].shared_embedding_name,
                      "a1_ids_weighted_by_weights_shared_embedding")
@@ -230,8 +230,8 @@ class FeatureColumnTest(test.TestCase):
                      "a1_ids_weighted_by_weights_shared_embedding")
 
     # Tries reversing order to check compatibility condition.
-    b = fc.shared_embedding_columns([weighted_sparse_col, sparse_col],
-                                    dimension=4, combiner="mean")
+    b = fc.shared_embedding_columns(
+        [weighted_sparse_col, sparse_col], dimension=4, combiner="mean")
     self.assertEqual(len(b), 2)
     self.assertEqual(b[0].shared_embedding_name,
                      "a1_ids_weighted_by_weights_shared_embedding")
@@ -240,18 +240,17 @@ class FeatureColumnTest(test.TestCase):
 
     # Tries adding two weighted columns to check compatibility between them.
     weighted_sparse_col_2 = fc.weighted_sparse_column(ids, "weights_2")
-    b = fc.shared_embedding_columns([weighted_sparse_col,
-                                     weighted_sparse_col_2],
-                                    dimension=4, combiner="mean")
+    b = fc.shared_embedding_columns(
+        [weighted_sparse_col, weighted_sparse_col_2],
+        dimension=4,
+        combiner="mean")
     self.assertEqual(len(b), 2)
     self.assertEqual(
         b[0].shared_embedding_name,
-        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding"
-    )
+        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding")
     self.assertEqual(
         b[1].shared_embedding_name,
-        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding"
-    )
+        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding")
 
   def testSharedEmbeddingColumnDeterminism(self):
     # Tests determinism in auto-generated shared_embedding_name.
@@ -286,10 +285,10 @@ class FeatureColumnTest(test.TestCase):
     columns = fc.shared_embedding_columns(
         [a1, a2], dimension=4, combiner="mean")
     columns_copy = copy.deepcopy(columns)
-    self.assertEqual(
-        columns_copy[0].shared_embedding_name, "a1_a2_shared_embedding")
-    self.assertEqual(
-        columns_copy[1].shared_embedding_name, "a1_a2_shared_embedding")
+    self.assertEqual(columns_copy[0].shared_embedding_name,
+                     "a1_a2_shared_embedding")
+    self.assertEqual(columns_copy[1].shared_embedding_name,
+                     "a1_a2_shared_embedding")
 
   def testOneHotColumn(self):
     a = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
@@ -336,11 +335,11 @@ class FeatureColumnTest(test.TestCase):
     weighted_ids = fc.weighted_sparse_column(ids, "weights")
     one_hot = fc.one_hot_column(weighted_ids)
     features = {
-        'ids': constant_op.constant([['marlo', 'unknown', 'omar']]),
-        'weights': constant_op.constant([[2., 4., 6.]])
+        "ids": constant_op.constant([["marlo", "unknown", "omar"]]),
+        "weights": constant_op.constant([[2., 4., 6.]])
     }
     one_hot_tensor = feature_column_ops.input_from_feature_columns(
-      features, [one_hot])
+        features, [one_hot])
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
       sess.run(lookup_ops.tables_initializer())
@@ -349,11 +348,9 @@ class FeatureColumnTest(test.TestCase):
   def testMissingValueInOneHotColumnForSparseColumnWithKeys(self):
     ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
     one_hot = fc.one_hot_column(ids)
-    features = {
-      'ids': constant_op.constant([['marlo', 'unknown', 'omar']])
-    }
+    features = {"ids": constant_op.constant([["marlo", "unknown", "omar"]])}
     one_hot_tensor = feature_column_ops.input_from_feature_columns(
-      features, [one_hot])
+        features, [one_hot])
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
       sess.run(lookup_ops.tables_initializer())
@@ -379,8 +376,7 @@ class FeatureColumnTest(test.TestCase):
     self.assertEqual(d4.default_value, None)
     self.assertEqual(d4.is_sparse, True)
     # Default value is a list but dimension is None.
-    with self.assertRaisesRegexp(ValueError,
-                                 "Only scalar default value.*"):
+    with self.assertRaisesRegexp(ValueError, "Only scalar default value.*"):
       fc._real_valued_var_len_column("g5", default_value=[2., 3.])
 
   def testRealValuedVarLenColumnDtypes(self):
@@ -390,18 +386,19 @@ class FeatureColumnTest(test.TestCase):
             "rvc": parsing_ops.VarLenFeature(dtype=dtypes.float32)
         }, rvc.config)
 
-    rvc = fc._real_valued_var_len_column("rvc", default_value=0,
-                                         is_sparse=False)
-    self.assertDictEqual(
-        {
-            "rvc": parsing_ops.FixedLenSequenceFeature(shape=[],
-                                                       dtype=dtypes.float32,
-                                                       allow_missing=True,
-                                                       default_value=0.0)
-        }, rvc.config)
-
-    rvc = fc._real_valued_var_len_column("rvc", dtype=dtypes.int32,
-                                         default_value=0, is_sparse=True)
+    rvc = fc._real_valued_var_len_column(
+        "rvc", default_value=0, is_sparse=False)
+    self.assertDictEqual({
+        "rvc":
+            parsing_ops.FixedLenSequenceFeature(
+                shape=[],
+                dtype=dtypes.float32,
+                allow_missing=True,
+                default_value=0.0)
+    }, rvc.config)
+
+    rvc = fc._real_valued_var_len_column(
+        "rvc", dtype=dtypes.int32, default_value=0, is_sparse=True)
     self.assertDictEqual(
         {
             "rvc": parsing_ops.VarLenFeature(dtype=dtypes.int32)
@@ -409,8 +406,8 @@ class FeatureColumnTest(test.TestCase):
 
     with self.assertRaisesRegexp(TypeError,
                                  "dtype must be convertible to float"):
-      fc._real_valued_var_len_column("rvc", dtype=dtypes.string,
-                                     default_value="", is_sparse=True)
+      fc._real_valued_var_len_column(
+          "rvc", dtype=dtypes.string, default_value="", is_sparse=True)
 
   def testRealValuedColumn(self):
     a = fc.real_valued_column("aaa")
@@ -504,13 +501,13 @@ class FeatureColumnTest(test.TestCase):
     for output_rank in range(1, 3 + len(dimensions)):
       with variable_scope.variable_scope("output_rank_{}".format(output_rank)):
         real_valued_output = real_valued_column._to_dnn_input_layer(
-            constant_op.constant(
-                real_valued_input, dtype=dtypes.float32),
+            constant_op.constant(real_valued_input, dtype=dtypes.float32),
             output_rank=output_rank)
       with self.test_session() as sess:
         real_valued_eval = sess.run(real_valued_output)
-      expected_shape = (input_shape[:output_rank - 1] +
-                        [np.prod(input_shape[output_rank - 1:])])
+      expected_shape = (
+          input_shape[:output_rank - 1] +
+          [np.prod(input_shape[output_rank - 1:])])
       self.assertEquals(expected_shape, list(real_valued_eval.shape))
 
   def testRealValuedColumnDensification(self):
@@ -520,8 +517,7 @@ class FeatureColumnTest(test.TestCase):
         "sparse_real_valued1", is_sparse=True)
     sparse_tensor = sparse_tensor_lib.SparseTensor(
         values=[2.0, 5.0], indices=[[0, 0], [2, 0]], dense_shape=[3, 1])
-    with self.assertRaisesRegexp(
-        ValueError, "Set is_sparse to False"):
+    with self.assertRaisesRegexp(ValueError, "Set is_sparse to False"):
       real_valued_column._to_dnn_input_layer(sparse_tensor)
 
   def testRealValuedColumnDeepCopy(self):
@@ -549,9 +545,8 @@ class FeatureColumnTest(test.TestCase):
   def testBucketizedColumnRequiresRealValuedColumnDimension(self):
     with self.assertRaisesRegexp(
         TypeError, "source_column must be an instance of _RealValuedColumn.*"):
-      fc.bucketized_column(fc._real_valued_var_len_column("bbb",
-                                                          is_sparse=True),
-                           [0])
+      fc.bucketized_column(
+          fc._real_valued_var_len_column("bbb", is_sparse=True), [0])
 
   def testBucketizedColumnRequiresSortedBuckets(self):
     with self.assertRaisesRegexp(ValueError,
@@ -654,20 +649,14 @@ class FeatureColumnTest(test.TestCase):
 
   def testRealValuedColumnDtypes(self):
     rvc = fc.real_valued_column("rvc")
-    self.assertDictEqual(
-        {
-            "rvc": parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.float32)
-        },
-        rvc.config)
+    self.assertDictEqual({
+        "rvc": parsing_ops.FixedLenFeature([1], dtype=dtypes.float32)
+    }, rvc.config)
 
     rvc = fc.real_valued_column("rvc", dtype=dtypes.int32)
-    self.assertDictEqual(
-        {
-            "rvc": parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.int32)
-        },
-        rvc.config)
+    self.assertDictEqual({
+        "rvc": parsing_ops.FixedLenFeature([1], dtype=dtypes.int32)
+    }, rvc.config)
 
     with self.assertRaisesRegexp(ValueError,
                                  "dtype must be convertible to float"):
@@ -702,8 +691,9 @@ class FeatureColumnTest(test.TestCase):
     batch_size = 4
     dense_scalar_input = [1, 2, 3, 4]
     sparse_column = fc.sparse_column_with_integerized_feature("values", 10)
-    features = {"values":
-                constant_op.constant(dense_scalar_input, dtype=dtypes.int64)}
+    features = {
+        "values": constant_op.constant(dense_scalar_input, dtype=dtypes.int64)
+    }
     sparse_column.insert_transformed_feature(features)
     sparse_output = features[sparse_column]
     expected_shape = [batch_size, 1]
@@ -731,8 +721,7 @@ class FeatureColumnTest(test.TestCase):
 
   def testSparseColumnKeysDeepCopy(self):
     """Tests deepcopy of sparse_column_with_keys."""
-    column = fc.sparse_column_with_keys(
-        "a", keys=["key0", "key1", "key2"])
+    column = fc.sparse_column_with_keys("a", keys=["key0", "key1", "key2"])
     self.assertEqual("a", column.name)
     column_copy = copy.deepcopy(column)
     self.assertEqual("a", column_copy.name)
@@ -785,8 +774,9 @@ class FeatureColumnTest(test.TestCase):
     a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
     b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
     cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
-    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
-        "sparse_column_for_one_hot", hash_bucket_size=100))
+    one_hot_col = fc.one_hot_column(
+        fc.sparse_column_with_hash_bucket(
+            "sparse_column_for_one_hot", hash_bucket_size=100))
     scattered_embedding_col = fc.scattered_embedding_column(
         "scattered_embedding_column", size=100, dimension=10, hash_key=1)
     feature_columns = set([
@@ -809,17 +799,13 @@ class FeatureColumnTest(test.TestCase):
         "str_id_weights_column":
             parsing_ops.VarLenFeature(dtypes.float32),
         "real_valued_column1":
-            parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
         "real_valued_column2":
-            parsing_ops.FixedLenFeature(
-                [5], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
         "real_valued_column_for_bucketization1":
-            parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
         "real_valued_column_for_bucketization2":
-            parsing_ops.FixedLenFeature(
-                [4], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
         "cross_aaa":
             parsing_ops.VarLenFeature(dtypes.string),
         "cross_bbb":
@@ -849,11 +835,14 @@ class FeatureColumnTest(test.TestCase):
     real_valued_col0 = fc._real_valued_var_len_column(
         "real_valued_column0", is_sparse=True)
     real_valued_col1 = fc._real_valued_var_len_column(
-        "real_valued_column1", dtype=dtypes.int64, default_value=0,
+        "real_valued_column1",
+        dtype=dtypes.int64,
+        default_value=0,
         is_sparse=False)
     feature_columns = set([real_valued_col0, real_valued_col1])
     expected_config = {
-        "real_valued_column0": parsing_ops.VarLenFeature(dtype=dtypes.float32),
+        "real_valued_column0":
+            parsing_ops.VarLenFeature(dtype=dtypes.float32),
         "real_valued_column1":
             parsing_ops.FixedLenSequenceFeature(
                 [], dtype=dtypes.int64, allow_missing=True, default_value=0),
@@ -874,7 +863,9 @@ class FeatureColumnTest(test.TestCase):
     real_valued_col5 = fc._real_valued_var_len_column(
         "real_valued_column5", default_value=2, is_sparse=True)
     real_valued_col6 = fc._real_valued_var_len_column(
-        "real_valued_column6", dtype=dtypes.int64, default_value=1,
+        "real_valued_column6",
+        dtype=dtypes.int64,
+        default_value=1,
         is_sparse=False)
     feature_columns = [
         real_valued_col1, real_valued_col2, real_valued_col3, real_valued_col4,
@@ -902,8 +893,7 @@ class FeatureColumnTest(test.TestCase):
                 parsing_ops.VarLenFeature(dtype=dtypes.float32),
             "real_valued_column6":
                 parsing_ops.FixedLenSequenceFeature(
-                    [], dtype=dtypes.int64, allow_missing=True,
-                    default_value=1)
+                    [], dtype=dtypes.int64, allow_missing=True, default_value=1)
         },
         config)
 
@@ -1104,8 +1094,8 @@ class FeatureColumnTest(test.TestCase):
       # This will initialize the crossed column weights from provided checkpoint
       # and return a [4, 1] tensor which is same as weights variable. Since we
       # won't modify weights, this should be same as 'saved_col_weights'.
-      _, col_weights, _ = (feature_column_ops.weighted_sum_from_feature_columns(
-          {
+      _, col_weights, _ = (
+          feature_column_ops.weighted_sum_from_feature_columns({
               sparse_col_1.name: input_tensor,
               sparse_col_2.name: input_tensor
           }, [crossed_col_initialized], 1))
diff --git a/tensorflow/contrib/learn/python/learn/datasets/__init__.py b/tensorflow/contrib/learn/python/learn/datasets/__init__.py
index a3521b4109..7240b0de14 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/__init__.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Dataset utilities and synthetic/reference datasets."""
 
 from __future__ import absolute_import
@@ -46,11 +45,12 @@ DATASETS = {
 
 # List of all synthetic datasets
 SYNTHETIC = {
-  # All of these will return ['data', 'target'] -> base.Dataset
-  'circles': synthetic.circles,
-  'spirals': synthetic.spirals
+    # All of these will return ['data', 'target'] -> base.Dataset
+    'circles': synthetic.circles,
+    'spirals': synthetic.spirals
 }
 
+
 def load_dataset(name, size='small', test_with_fake_data=False):
   """Loads dataset by name.
 
@@ -83,23 +83,28 @@ def make_dataset(name, n_samples=100, noise=None, seed=42, *args, **kwargs):
     seed: int or None, seed for noise
 
   Returns:
-    Shuffled features and labels for given synthetic dataset of type `base.Dataset`
+    Shuffled features and labels for given synthetic dataset of type
+    `base.Dataset`
 
   Raises:
     ValueError: Raised if `name` not found
 
   Note:
-    - This is a generic synthetic data generator - individual generators might have more parameters!
+    - This is a generic synthetic data generator - individual generators might
+    have more parameters!
       See documentation for individual parameters
-    - Note that the `noise` parameter uses `numpy.random.normal` and depends on `numpy`'s seed
+    - Note that the `noise` parameter uses `numpy.random.normal` and depends on
+    `numpy`'s seed
 
   TODO:
     - Support multiclass datasets
-    - Need shuffling routine. Currently synthetic datasets are reshuffled to avoid train/test correlation,
+    - Need shuffling routine. Currently synthetic datasets are reshuffled to
+    avoid train/test correlation,
       but that hurts reprodusability
   """
   # seed = kwargs.pop('seed', None)
   if name not in SYNTHETIC:
     raise ValueError('Synthetic dataset not found or not implemeted: %s' % name)
   else:
-    return SYNTHETIC[name](n_samples=n_samples, noise=noise, seed=seed, *args, **kwargs)
+    return SYNTHETIC[name](
+        n_samples=n_samples, noise=noise, seed=seed, *args, **kwargs)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
index 907dc0f3df..649996c49c 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Synthetic dataset generators."""
 
 from __future__ import absolute_import
@@ -23,18 +22,27 @@ import numpy as np
 
 from tensorflow.contrib.learn.python.learn.datasets.base import Dataset
 
-def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args, **kwargs):
+
+def circles(n_samples=100,
+            noise=None,
+            seed=None,
+            factor=0.8,
+            n_classes=2,
+            *args,
+            **kwargs):
   """Create circles separated by some value
 
   Args:
     n_samples: int, number of datapoints to generate
     noise: float or None, standard deviation of the Gaussian noise added
     seed: int or None, seed for the noise
-    factor: float, size factor of the inner circles with respect to the outer ones
+    factor: float, size factor of the inner circles with respect to the outer
+      ones
     n_classes: int, number of classes to generate
 
   Returns:
-    Shuffled features and labels for 'circles' synthetic dataset of type `base.Dataset`
+    Shuffled features and labels for 'circles' synthetic dataset of type
+    `base.Dataset`
 
   Note:
     The multi-class support might not work as expected if `noise` is enabled
@@ -54,7 +62,7 @@ def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args
   if seed is not None:
     np.random.seed(seed)
   # Algo: 1) Generate initial circle, 2) For ever class generate a smaller radius circle
-  linspace = np.linspace(0, 2*np.pi, n_samples // n_classes)
+  linspace = np.linspace(0, 2 * np.pi, n_samples // n_classes)
   circ_x = np.empty(0, dtype=np.int32)
   circ_y = np.empty(0, dtype=np.int32)
   base_cos = np.cos(linspace)
@@ -66,12 +74,12 @@ def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args
     circ_y = np.append(circ_y, base_sin)
     base_cos *= factor
     base_sin *= factor
-    y = np.append(y, label*np.ones(n_samples // n_classes, dtype=np.int32))
+    y = np.append(y, label * np.ones(n_samples // n_classes, dtype=np.int32))
 
   # Add more points if n_samples is not divisible by n_classes (unbalanced!)
   extras = n_samples % n_classes
-  circ_x = np.append(circ_x, np.cos(np.random.rand(extras)*2*np.pi))
-  circ_y = np.append(circ_y, np.sin(np.random.rand(extras)*2*np.pi))
+  circ_x = np.append(circ_x, np.cos(np.random.rand(extras) * 2 * np.pi))
+  circ_y = np.append(circ_y, np.sin(np.random.rand(extras) * 2 * np.pi))
   y = np.append(y, np.zeros(extras, dtype=np.int32))
 
   # Reshape the features/labels
@@ -85,10 +93,13 @@ def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args
   return Dataset(data=X[indices], target=y[indices])
 
 
-def spirals(n_samples=100, noise=None, seed=None,
-            mode = 'archimedes',
-            n_loops = 2,
-            *args, **kwargs):
+def spirals(n_samples=100,
+            noise=None,
+            seed=None,
+            mode='archimedes',
+            n_loops=2,
+            *args,
+            **kwargs):
   """Create spirals
 
   Currently only binary classification is supported for spiral generation
@@ -104,7 +115,8 @@ def spirals(n_samples=100, noise=None, seed=None,
       'fermat': a spiral with branch distances decreasing (sqrt)
 
   Returns:
-    Shuffled features and labels for 'spirals' synthetic dataset of type `base.Dataset`
+    Shuffled features and labels for 'spirals' synthetic dataset of type
+    `base.Dataset`
 
   Raises:
     ValueError: If the generation `mode` is not valid
@@ -112,34 +124,35 @@ def spirals(n_samples=100, noise=None, seed=None,
   TODO:
     - Generation of unbalanced data
   """
-  n_classes = 2 # I am not sure how to make it multiclass
+  n_classes = 2  # I am not sure how to make it multiclass
 
   _modes = {
-    'archimedes': _archimedes_spiral,
-    'bernoulli': _bernoulli_spiral,
-    'fermat': _fermat_spiral
+      'archimedes': _archimedes_spiral,
+      'bernoulli': _bernoulli_spiral,
+      'fermat': _fermat_spiral
   }
 
   if mode is None or mode not in _modes:
-    raise ValueError("Cannot generate spiral with mode %s"%mode)
+    raise ValueError('Cannot generate spiral with mode %s' % mode)
 
   if seed is not None:
     np.random.seed(seed)
-  linspace = np.linspace(0, 2*n_loops*np.pi, n_samples // n_classes)
+  linspace = np.linspace(0, 2 * n_loops * np.pi, n_samples // n_classes)
   spir_x = np.empty(0, dtype=np.int32)
   spir_y = np.empty(0, dtype=np.int32)
 
   y = np.empty(0, dtype=np.int32)
   for label in range(n_classes):
-    base_cos, base_sin = _modes[mode](linspace, label*np.pi, *args, **kwargs)
+    base_cos, base_sin = _modes[mode](linspace, label * np.pi, *args, **kwargs)
     spir_x = np.append(spir_x, base_cos)
     spir_y = np.append(spir_y, base_sin)
-    y = np.append(y, label*np.ones(n_samples // n_classes, dtype=np.int32))
+    y = np.append(y, label * np.ones(n_samples // n_classes, dtype=np.int32))
 
   # Add more points if n_samples is not divisible by n_classes (unbalanced!)
   extras = n_samples % n_classes
   if extras > 0:
-    x_exrta, y_extra = _modes[mode](np.random.rand(extras)*2*np.pi, *args, **kwargs)
+    x_exrta, y_extra = _modes[mode](np.random.rand(extras) * 2 * np.pi, *args,
+                                    **kwargs)
     spir_x = np.append(spir_x, x_extra)
     spir_y = np.append(spir_y, y_extra)
     y = np.append(y, np.zeros(extras, dtype=np.int32))
@@ -162,7 +175,8 @@ def _archimedes_spiral(theta, theta_offset=0., *args, **kwargs):
     theta: array-like, angles from polar coordinates to be converted
     theta_offset: float, angle offset in radians (2*pi = 0)
   """
-  x, y = theta*np.cos(theta + theta_offset), theta*np.sin(theta + theta_offset)
+  x, y = theta * np.cos(theta + theta_offset), theta * np.sin(
+      theta + theta_offset)
   x_norm = np.max(np.abs(x))
   y_norm = np.max(np.abs(y))
   x, y = x / x_norm, y / y_norm
@@ -181,7 +195,8 @@ def _bernoulli_spiral(theta, theta_offset=0., *args, **kwargs):
   """
   exp_scale = kwargs.pop('exp_scale', 0.1)
 
-  x, y = np.exp(exp_scale*theta)*np.cos(theta + theta_offset), np.exp(exp_scale*theta)*np.sin(theta + theta_offset)
+  x, y = np.exp(exp_scale * theta) * np.cos(theta + theta_offset), np.exp(
+      exp_scale * theta) * np.sin(theta + theta_offset)
   x_norm = np.max(np.abs(x))
   y_norm = np.max(np.abs(y))
   x, y = x / x_norm, y / y_norm
@@ -195,7 +210,8 @@ def _fermat_spiral(theta, theta_offset=0., *args, **kwargs):
     theta: array-like, angles from polar coordinates to be converted
     theta_offset: float, angle offset in radians (2*pi = 0)
   """
-  x, y = np.sqrt(theta)*np.cos(theta + theta_offset), np.sqrt(theta)*np.sin(theta + theta_offset)
+  x, y = np.sqrt(theta) * np.cos(theta + theta_offset), np.sqrt(theta) * np.sin(
+      theta + theta_offset)
   x_norm = np.max(np.abs(x))
   y_norm = np.max(np.abs(y))
   x, y = x / x_norm, y / y_norm
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 50c74add86..8d59fe66d9 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Base Estimator class."""
 
 from __future__ import absolute_import
@@ -76,7 +75,6 @@ from tensorflow.python.util import compat
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
-
 AS_ITERABLE_DATE = '2016-09-15'
 AS_ITERABLE_INSTRUCTIONS = (
     'The default behavior of predict() is changing. The default value for\n'
@@ -223,8 +221,11 @@ def _get_replica_device_setter(config):
 
   if config.num_ps_replicas > 0:
     return device_setter.replica_device_setter(
-        ps_tasks=config.num_ps_replicas, worker_device=worker_device,
-        merge_devices=True, ps_ops=ps_ops, cluster=config.cluster_spec)
+        ps_tasks=config.num_ps_replicas,
+        worker_device=worker_device,
+        merge_devices=True,
+        ps_ops=ps_ops,
+        cluster=config.cluster_spec)
   else:
     return None
 
@@ -284,10 +285,10 @@ def _make_metrics_ops(metrics, features, labels, predictions):
         raise ValueError('Invalid metric for {}. It returned a tuple with '
                          'len {}, expected 2.'.format(name, len(name)))
       if not isinstance(predictions, dict):
-        raise ValueError(
-            'Metrics passed provide (name, prediction), '
-            'but predictions are not dict. '
-            'Metrics: %s, Predictions: %s.' % (metrics, predictions))
+        raise ValueError('Metrics passed provide (name, prediction), '
+                         'but predictions are not dict. '
+                         'Metrics: %s, Predictions: %s.' % (metrics,
+                                                            predictions))
       # Here are two options: labels are single Tensor or a dict.
       if isinstance(labels, dict) and name[1] in labels:
         # If labels are dict and the prediction name is in it, apply metric.
@@ -298,10 +299,10 @@ def _make_metrics_ops(metrics, features, labels, predictions):
     else:
       # Single head metrics.
       if isinstance(predictions, dict):
-        raise ValueError(
-            'Metrics passed provide only name, no prediction, '
-            'but predictions are dict. '
-            'Metrics: %s, Labels: %s.' % (metrics, labels_tensor_or_dict))
+        raise ValueError('Metrics passed provide only name, no prediction, '
+                         'but predictions are dict. '
+                         'Metrics: %s, Labels: %s.' % (metrics,
+                                                       labels_tensor_or_dict))
       result[name] = metric(predictions, labels_tensor_or_dict)
   return result
 
@@ -369,9 +370,8 @@ def _write_dict_to_summary(output_dir, dictionary, current_global_step):
       logging.info(
           'Summary for np.ndarray is not visible in Tensorboard by default. '
           'Consider using a Tensorboard plugin for visualization (see '
-          'https://github.com/tensorflow/tensorboard-plugin-example/blob/master/README.md '  # pylint:disable=line-too-long
-          'for more information).'
-      )
+          'https://github.com/tensorflow/tensorboard-plugin-example/blob/master/README.md'
+          ' for more information).')
     else:
       logging.warn(
           'Skipping summary for %s, must be a float, np.float32, np.int64, '
@@ -385,8 +385,8 @@ GraphRewriteSpec = collections.namedtuple('GraphRewriteSpec',
                                           ['tags', 'transforms'])
 
 
-class BaseEstimator(
-    sklearn.BaseEstimator, evaluable.Evaluable, trainable.Trainable):
+class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
+                    trainable.Trainable):
   """Abstract BaseEstimator class to train and evaluate TensorFlow models.
 
   Users should not instantiate or subclass this class. Instead, use an
@@ -428,7 +428,7 @@ class BaseEstimator(
         #                  necessary.
         # pylint: disable=g-doc-exception
         raise ValueError(
-            "model_dir are set both in constructor and RunConfig, but with "
+            'model_dir are set both in constructor and RunConfig, but with '
             "different values. In constructor: '{}', in RunConfig: "
             "'{}' ".format(model_dir, self._config.model_dir))
         # pylint: enable=g-doc-exception
@@ -457,12 +457,16 @@ class BaseEstimator(
     # TODO(wicke): make RunConfig immutable, and then return it without a copy.
     return copy.deepcopy(self._config)
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('y', None), ('batch_size', None)
-  )
-  def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
-          monitors=None, max_steps=None):
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('y', None), ('batch_size', None))
+  def fit(self,
+          x=None,
+          y=None,
+          input_fn=None,
+          steps=None,
+          batch_size=None,
+          monitors=None,
+          max_steps=None):
     # pylint: disable=g-doc-args,g-doc-return-or-yield
     """See `Trainable`.
 
@@ -494,13 +498,15 @@ class BaseEstimator(
     logging.info('Loss for final step: %s.', loss)
     return self
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('y', None), ('batch_size', None)
-  )
-  def partial_fit(
-      self, x=None, y=None, input_fn=None, steps=1, batch_size=None,
-      monitors=None):
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('y', None), ('batch_size', None))
+  def partial_fit(self,
+                  x=None,
+                  y=None,
+                  input_fn=None,
+                  steps=1,
+                  batch_size=None,
+                  monitors=None):
     """Incremental fit on a batch of samples.
 
     This method is expected to be called several times consecutively
@@ -536,13 +542,16 @@ class BaseEstimator(
     """
     logging.warning('The current implementation of partial_fit is not optimized'
                     ' for use in a loop. Consider using fit() instead.')
-    return self.fit(x=x, y=y, input_fn=input_fn, steps=steps,
-                    batch_size=batch_size, monitors=monitors)
+    return self.fit(
+        x=x,
+        y=y,
+        input_fn=input_fn,
+        steps=steps,
+        batch_size=batch_size,
+        monitors=monitors)
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('y', None), ('batch_size', None)
-  )
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('y', None), ('batch_size', None))
   def evaluate(self,
                x=None,
                y=None,
@@ -584,13 +593,14 @@ class BaseEstimator(
       eval_results.update({'global_step': global_step})
     return eval_results
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('batch_size', None), ('as_iterable', True)
-  )
-  def predict(
-      self, x=None, input_fn=None, batch_size=None, outputs=None,
-      as_iterable=True):
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('batch_size', None), ('as_iterable', True))
+  def predict(self,
+              x=None,
+              input_fn=None,
+              batch_size=None,
+              outputs=None,
+              as_iterable=True):
     """Returns predictions for given features.
 
     Args:
@@ -651,16 +661,17 @@ class BaseEstimator(
     return self._model_dir
 
   @deprecated('2017-03-25', 'Please use Estimator.export_savedmodel() instead.')
-  def export(self,
-             export_dir,
-             input_fn=export._default_input_fn,  # pylint: disable=protected-access
-             input_feature_key=None,
-             use_deprecated_input_fn=True,
-             signature_fn=None,
-             prediction_key=None,
-             default_batch_size=1,
-             exports_to_keep=None,
-             checkpoint_path=None):
+  def export(
+      self,
+      export_dir,
+      input_fn=export._default_input_fn,  # pylint: disable=protected-access
+      input_feature_key=None,
+      use_deprecated_input_fn=True,
+      signature_fn=None,
+      prediction_key=None,
+      default_batch_size=1,
+      exports_to_keep=None,
+      checkpoint_path=None):
     """Exports inference graph into given dir.
 
     Args:
@@ -798,8 +809,8 @@ class BaseEstimator(
       logging.debug('Setting feature info to %s.', str(self._features_info))
     if labels is not None:
       if self._labels_info is not None:
-        logging.debug('Given labels: %s, required signatures: %s.',
-                      str(labels), str(self._labels_info))
+        logging.debug('Given labels: %s, required signatures: %s.', str(labels),
+                      str(self._labels_info))
         if not tensor_signature.tensors_compatible(labels, self._labels_info):
           raise ValueError('Labels are incompatible with given information. '
                            'Given labels: %s, required signatures: %s.' %
@@ -850,13 +861,13 @@ class BaseEstimator(
     if not checkpoint_path:
       latest_path = saver.latest_checkpoint(self._model_dir)
       if not latest_path:
-        raise NotFittedError("Couldn't find trained model at %s."
-                             % self._model_dir)
+        raise NotFittedError(
+            "Couldn't find trained model at %s." % self._model_dir)
       checkpoint_path = latest_path
 
     # Setup output directory.
-    eval_dir = os.path.join(self._model_dir, 'eval' if not name else
-                            'eval_' + name)
+    eval_dir = os.path.join(self._model_dir, 'eval'
+                            if not name else 'eval_' + name)
 
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
@@ -879,8 +890,7 @@ class BaseEstimator(
                         'Use steps=None if intended.')
       if steps:
         hooks.append(
-            evaluation.StopAfterNEvalsHook(
-                steps, log_progress=log_progress))
+            evaluation.StopAfterNEvalsHook(steps, log_progress=log_progress))
 
       global_step_key = 'global_step'
       while global_step_key in eval_dict:
@@ -916,8 +926,8 @@ class BaseEstimator(
     # Check that model has been trained.
     checkpoint_path = saver.latest_checkpoint(self._model_dir)
     if not checkpoint_path:
-      raise NotFittedError("Couldn't find trained model at %s."
-                           % self._model_dir)
+      raise NotFittedError(
+          "Couldn't find trained model at %s." % self._model_dir)
 
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
@@ -979,7 +989,8 @@ class BaseEstimator(
     existing_keys = predictions.keys()
     predictions = {
         key: value
-        for key, value in six.iteritems(predictions) if key in outputs
+        for key, value in six.iteritems(predictions)
+        if key in outputs
     }
     if not predictions:
       raise ValueError('Expected to run at least one output from %s, '
@@ -1045,8 +1056,7 @@ class BaseEstimator(
           chief_only_hooks=chief_hooks + model_fn_ops.training_chief_hooks,
           save_checkpoint_secs=0,  # Saving is handled by a hook.
           save_summaries_steps=self._config.save_summary_steps,
-          config=self._session_config
-      ) as mon_sess:
+          config=self._session_config) as mon_sess:
         loss = None
         while not mon_sess.should_stop():
           _, loss = mon_sess.run([model_fn_ops.train_op, model_fn_ops.loss])
@@ -1137,8 +1147,7 @@ class Estimator(BaseEstimator):
       if params is not None and 'params' not in model_fn_args:
         raise ValueError('Estimator\'s model_fn (%s) does not have a params '
                          'argument, but params (%s) were passed to the '
-                         'Estimator\'s constructor.' %
-                         (model_fn, params))
+                         'Estimator\'s constructor.' % (model_fn, params))
       if params is None and 'params' in model_fn_args:
         logging.warning('Estimator\'s model_fn (%s) includes params '
                         'argument, but params are not passed to Estimator.',
@@ -1192,8 +1201,9 @@ class Estimator(BaseEstimator):
 
     # Custom metrics should overwrite defaults.
     if metrics:
-      model_fn_ops.eval_metric_ops.update(_make_metrics_ops(
-        metrics, features, labels, model_fn_ops.predictions))
+      model_fn_ops.eval_metric_ops.update(
+          _make_metrics_ops(metrics, features, labels,
+                            model_fn_ops.predictions))
 
     return model_fn_ops
 
@@ -1238,8 +1248,8 @@ class Estimator(BaseEstimator):
     Raises:
       ValueError: if `metrics` don't match `labels`.
     """
-    model_fn_ops = self._call_model_fn(
-        features, labels, model_fn_lib.ModeKeys.EVAL, metrics)
+    model_fn_ops = self._call_model_fn(features, labels,
+                                       model_fn_lib.ModeKeys.EVAL, metrics)
 
     if metric_key.MetricKey.LOSS not in model_fn_ops.eval_metric_ops:
       model_fn_ops.eval_metric_ops[metric_key.MetricKey.LOSS] = (
@@ -1263,14 +1273,16 @@ class Estimator(BaseEstimator):
         self._labels_info)
     return self._call_model_fn(features, labels, model_fn_lib.ModeKeys.INFER)
 
-  def export_savedmodel(
-      self, export_dir_base, serving_input_fn,
-      default_output_alternative_key=None,
-      assets_extra=None,
-      as_text=False,
-      checkpoint_path=None,
-      graph_rewrite_specs=(GraphRewriteSpec((tag_constants.SERVING,), ()),),
-      strip_default_attrs=False):
+  def export_savedmodel(self,
+                        export_dir_base,
+                        serving_input_fn,
+                        default_output_alternative_key=None,
+                        assets_extra=None,
+                        as_text=False,
+                        checkpoint_path=None,
+                        graph_rewrite_specs=(GraphRewriteSpec(
+                            (tag_constants.SERVING,), ()),),
+                        strip_default_attrs=False):
     # pylint: disable=line-too-long
     """Exports inference graph as a SavedModel into given dir.
 
@@ -1297,7 +1309,8 @@ class Estimator(BaseEstimator):
         default serving tag ("serve") and no rewriting.
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
-        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+        [Stripping Default-Valued
+          Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Returns:
       The string path to the exported directory.
@@ -1313,8 +1326,8 @@ class Estimator(BaseEstimator):
       # Locate the latest checkpoint
       checkpoint_path = saver.latest_checkpoint(self._model_dir)
     if not checkpoint_path:
-      raise NotFittedError("Couldn't find trained model at %s."
-                           % self._model_dir)
+      raise NotFittedError(
+          "Couldn't find trained model at %s." % self._model_dir)
 
     export_dir = saved_model_export_utils.get_timestamped_export_dir(
         export_dir_base)
@@ -1348,10 +1361,10 @@ class Estimator(BaseEstimator):
           saved_model_export_utils.get_output_alternatives(
               model_fn_ops, default_output_alternative_key))
 
-      init_op = control_flow_ops.group(
-          variables.local_variables_initializer(),
-          resources.initialize_resources(resources.shared_resources()),
-          lookup_ops.tables_initializer())
+      init_op = control_flow_ops.group(variables.local_variables_initializer(),
+                                       resources.initialize_resources(
+                                           resources.shared_resources()),
+                                       lookup_ops.tables_initializer())
 
       # Build the SignatureDefs from all pairs of input and output alternatives
       signature_def_map = saved_model_export_utils.build_all_signature_defs(
@@ -1381,10 +1394,10 @@ class Estimator(BaseEstimator):
 
         # TODO(soergel): switch to main_op or otherwise update when dust settles
         builder.add_meta_graph_and_variables(
-            session, untransformed_tags,
+            session,
+            untransformed_tags,
             signature_def_map=signature_def_map,
-            assets_collection=ops.get_collection(
-                ops.GraphKeys.ASSET_FILEPATHS),
+            assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS),
             legacy_init_op=init_op,
             strip_default_attrs=strip_default_attrs)
 
@@ -1395,12 +1408,16 @@ class Estimator(BaseEstimator):
     if graph_rewrite_specs[1:]:
       # Prepare the input_names and output_names needed for the
       # meta_graph_transform call below.
-      input_names = [tensor.name
-                     for input_dict in input_alternatives.values()
-                     for tensor in input_dict.values()]
-      output_names = [tensor.name
-                      for output_alternative in output_alternatives.values()
-                      for tensor in output_alternative[1].values()]
+      input_names = [
+          tensor.name
+          for input_dict in input_alternatives.values()
+          for tensor in input_dict.values()
+      ]
+      output_names = [
+          tensor.name
+          for output_alternative in output_alternatives.values()
+          for tensor in output_alternative[1].values()
+      ]
 
     # Write the additional MetaGraphDefs
     for graph_rewrite_spec in graph_rewrite_specs[1:]:
@@ -1419,11 +1436,11 @@ class Estimator(BaseEstimator):
 
     # Add the extra assets
     if assets_extra:
-      assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir),
-                                       compat.as_bytes('assets.extra'))
+      assets_extra_path = os.path.join(
+          compat.as_bytes(temp_export_dir), compat.as_bytes('assets.extra'))
       for dest_relative, source in assets_extra.items():
-        dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
-                                     compat.as_bytes(dest_relative))
+        dest_absolute = os.path.join(
+            compat.as_bytes(assets_extra_path), compat.as_bytes(dest_relative))
         dest_path = os.path.dirname(dest_absolute)
         gfile.MakeDirs(dest_path)
         gfile.Copy(source, dest_absolute)
@@ -1443,25 +1460,36 @@ class SKCompat(sklearn.BaseEstimator):
 
   def fit(self, x, y, batch_size=128, steps=None, max_steps=None,
           monitors=None):
-    input_fn, feed_fn = _get_input_fn(x, y, input_fn=None, feed_fn=None,
-                                      batch_size=batch_size, shuffle=True,
-                                      epochs=None)
+    input_fn, feed_fn = _get_input_fn(
+        x,
+        y,
+        input_fn=None,
+        feed_fn=None,
+        batch_size=batch_size,
+        shuffle=True,
+        epochs=None)
     all_monitors = []
     if feed_fn:
       all_monitors = [basic_session_run_hooks.FeedFnHook(feed_fn)]
     if monitors:
       all_monitors.extend(monitors)
 
-    self._estimator.fit(input_fn=input_fn,
-                        steps=steps,
-                        max_steps=max_steps,
-                        monitors=all_monitors)
+    self._estimator.fit(
+        input_fn=input_fn,
+        steps=steps,
+        max_steps=max_steps,
+        monitors=all_monitors)
     return self
 
   def score(self, x, y, batch_size=128, steps=None, metrics=None, name=None):
-    input_fn, feed_fn = _get_input_fn(x, y, input_fn=None,
-                                      feed_fn=None, batch_size=batch_size,
-                                      shuffle=False, epochs=1)
+    input_fn, feed_fn = _get_input_fn(
+        x,
+        y,
+        input_fn=None,
+        feed_fn=None,
+        batch_size=batch_size,
+        shuffle=False,
+        epochs=1)
     if metrics is not None and not isinstance(metrics, dict):
       raise ValueError('Metrics argument should be None or dict. '
                        'Got %s.' % metrics)
@@ -1477,8 +1505,13 @@ class SKCompat(sklearn.BaseEstimator):
 
   def predict(self, x, batch_size=128, outputs=None):
     input_fn, feed_fn = _get_input_fn(
-        x, None, input_fn=None, feed_fn=None, batch_size=batch_size,
-        shuffle=False, epochs=1)
+        x,
+        None,
+        input_fn=None,
+        feed_fn=None,
+        batch_size=batch_size,
+        shuffle=False,
+        epochs=1)
     results = list(
         self._estimator._infer_model(
             input_fn=input_fn,
@@ -1489,7 +1522,6 @@ class SKCompat(sklearn.BaseEstimator):
     if not isinstance(results[0], dict):
       return np.concatenate([output for output in results], axis=0)
     return {
-        key: np.concatenate(
-            [output[key] for output in results], axis=0)
+        key: np.concatenate([output[key] for output in results], axis=0)
         for key in results[0]
     }
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index 5f682838b7..d81a534b79 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -111,8 +111,8 @@ def boston_eval_fn():
       constant_op.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM])
   labels = array_ops.reshape(
       constant_op.constant(boston.target), [n_examples, 1])
-  return array_ops.concat([features, features], 0), array_ops.concat(
-      [labels, labels], 0)
+  return array_ops.concat([features, features],
+                          0), array_ops.concat([labels, labels], 0)
 
 
 def extract(data, key):
@@ -147,7 +147,10 @@ def linear_model_fn(features, labels, mode):
     (_, features), = features.items()
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return prediction, loss, train_op
 
 
@@ -157,7 +160,10 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
                   model_fn.ModeKeys.INFER)
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return model_fn.ModelFnOps(
       mode=mode, predictions=prediction, loss=loss, train_op=train_op)
 
@@ -168,7 +174,10 @@ def logistic_model_no_mode_fn(features, labels):
   labels = array_ops.one_hot(labels, 3, 1, 0)
   prediction, loss = (models.logistic_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return {
       'class': math_ops.argmax(prediction, 1),
       'prob': prediction
@@ -184,14 +193,12 @@ def _build_estimator_for_export_tests(tmpdir):
   def _input_fn():
     iris = base.load_iris()
     return {
-        'feature': constant_op.constant(
-            iris.data, dtype=dtypes.float32)
+        'feature': constant_op.constant(iris.data, dtype=dtypes.float32)
     }, constant_op.constant(
         iris.target, shape=[150], dtype=dtypes.int32)
 
   feature_columns = [
-      feature_column_lib.real_valued_column(
-          'feature', dimension=4)
+      feature_column_lib.real_valued_column('feature', dimension=4)
   ]
 
   est = linear.LinearRegressor(feature_columns)
@@ -291,8 +298,8 @@ class CheckCallsMonitor(monitors_lib.BaseMonitor):
             self.begin_calls == self.expect_calls)
 
 
-def _model_fn_ops(
-    expected_features, expected_labels, actual_features, actual_labels, mode):
+def _model_fn_ops(expected_features, expected_labels, actual_features,
+                  actual_labels, mode):
   assert_ops = tuple([
       check_ops.assert_equal(
           expected_features[k], actual_features[k], name='assert_%s' % k)
@@ -310,11 +317,11 @@ def _model_fn_ops(
 
 
 def _make_input_fn(features, labels):
+
   def _input_fn():
-    return {
-        k: constant_op.constant(v)
-        for k, v in six.iteritems(features)
-    }, constant_op.constant(labels)
+    return {k: constant_op.constant(v)
+            for k, v in six.iteritems(features)}, constant_op.constant(labels)
+
   return _input_fn
 
 
@@ -369,11 +376,13 @@ class EstimatorModelFnTest(test.TestCase):
       self.assertEqual(expected_params, params)
       self.assertTrue(config.i_am_test)
       return _model_fn_ops(features, labels, arg0, arg1, mode)
+
     partial_model_fn = functools.partial(
         _model_fn, foo=expected_foo, bar=expected_bar)
 
     est = estimator.Estimator(
-        model_fn=partial_model_fn, params=expected_params,
+        model_fn=partial_model_fn,
+        params=expected_params,
         config=expected_config)
     self.assertEqual(0, model_fn_call_count[0])
     est.fit(input_fn=_make_input_fn(features, labels), steps=1)
@@ -382,7 +391,12 @@ class EstimatorModelFnTest(test.TestCase):
   def testModelFnWithModelDir(self):
     expected_param = {'some_param': 'some_value'}
     expected_model_dir = tempfile.mkdtemp()
-    def _argument_checker(features, labels, mode, params, config=None,
+
+    def _argument_checker(features,
+                          labels,
+                          mode,
+                          params,
+                          config=None,
                           model_dir=None):
       _, _, _ = features, labels, config
       self.assertEqual(model_fn.ModeKeys.TRAIN, mode)
@@ -390,9 +404,11 @@ class EstimatorModelFnTest(test.TestCase):
       self.assertEqual(model_dir, expected_model_dir)
       return (constant_op.constant(0.), constant_op.constant(0.),
               training_util.get_global_step().assign_add(1))
-    est = estimator.Estimator(model_fn=_argument_checker,
-                              params=expected_param,
-                              model_dir=expected_model_dir)
+
+    est = estimator.Estimator(
+        model_fn=_argument_checker,
+        params=expected_param,
+        model_dir=expected_model_dir)
     est.fit(input_fn=boston_input_fn, steps=1)
 
   def testInvalidModelFn_no_train_op(self):
@@ -447,8 +463,7 @@ class EstimatorModelFnTest(test.TestCase):
       est.predict(input_fn=boston_input_fn)
     with self.assertRaisesRegexp(ValueError, 'Missing prediction'):
       est.predict(
-          input_fn=functools.partial(
-              boston_input_fn, num_epochs=1),
+          input_fn=functools.partial(boston_input_fn, num_epochs=1),
           as_iterable=True)
 
   def testModelFnScaffoldInTraining(self):
@@ -498,15 +513,17 @@ class EstimatorModelFnTest(test.TestCase):
     self.assertTrue(self.mock_saver.restore.called)
     est.predict(input_fn=input_fn)
     self.assertTrue(self.mock_saver.restore.called)
+
     def serving_input_fn():
-      serialized_tf_example = array_ops.placeholder(dtype=dtypes.string,
-                                                    shape=[None],
-                                                    name='input_example_tensor')
+      serialized_tf_example = array_ops.placeholder(
+          dtype=dtypes.string, shape=[None], name='input_example_tensor')
       features, labels = input_fn()
-      return input_fn_utils.InputFnOps(
-          features, labels, {'examples': serialized_tf_example})
+      return input_fn_utils.InputFnOps(features, labels, {
+          'examples': serialized_tf_example
+      })
 
-    est.export_savedmodel(os.path.join(est.model_dir, 'export'), serving_input_fn)
+    est.export_savedmodel(
+        os.path.join(est.model_dir, 'export'), serving_input_fn)
     self.assertTrue(self.mock_saver.restore.called)
 
 
@@ -550,33 +567,28 @@ class EstimatorTest(test.TestCase):
 
   def testRunConfigModelDir(self):
     config = run_config.RunConfig(model_dir='test_dir')
-    est = estimator.Estimator(model_fn=linear_model_fn,
-                              config=config)
+    est = estimator.Estimator(model_fn=linear_model_fn, config=config)
     self.assertEqual('test_dir', est.config.model_dir)
     self.assertEqual('test_dir', est.model_dir)
 
   def testModelDirAndRunConfigModelDir(self):
     config = run_config.RunConfig(model_dir='test_dir')
-    est = estimator.Estimator(model_fn=linear_model_fn,
-                              config=config,
-                              model_dir='test_dir')
+    est = estimator.Estimator(
+        model_fn=linear_model_fn, config=config, model_dir='test_dir')
     self.assertEqual('test_dir', est.config.model_dir)
 
     with self.assertRaisesRegexp(
-        ValueError,
-        'model_dir are set both in constructor and RunConfig, '
+        ValueError, 'model_dir are set both in constructor and RunConfig, '
         'but with different'):
-      estimator.Estimator(model_fn=linear_model_fn,
-                          config=config,
-                          model_dir='different_dir')
+      estimator.Estimator(
+          model_fn=linear_model_fn, config=config, model_dir='different_dir')
 
   def testModelDirIsCopiedToRunConfig(self):
     config = run_config.RunConfig()
     self.assertIsNone(config.model_dir)
 
-    est = estimator.Estimator(model_fn=linear_model_fn,
-                              model_dir='test_dir',
-                              config=config)
+    est = estimator.Estimator(
+        model_fn=linear_model_fn, model_dir='test_dir', config=config)
     self.assertEqual('test_dir', est.config.model_dir)
     self.assertEqual('test_dir', est.model_dir)
 
@@ -656,25 +668,27 @@ class EstimatorTest(test.TestCase):
     boston = base.load_boston()
     output_dir = tempfile.mkdtemp()
     est = estimator.SKCompat(
-        estimator.Estimator(
-            model_fn=linear_model_fn, model_dir=output_dir))
+        estimator.Estimator(model_fn=linear_model_fn, model_dir=output_dir))
     float64_labels = boston.target.astype(np.float64)
     est.fit(x=boston.data, y=float64_labels, steps=50)
     scores = est.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     del est
     # Create another estimator object with the same output dir.
     est2 = estimator.SKCompat(
-        estimator.Estimator(
-            model_fn=linear_model_fn, model_dir=output_dir))
+        estimator.Estimator(model_fn=linear_model_fn, model_dir=output_dir))
 
     # Check we can evaluate and predict.
     scores2 = est2.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     self.assertAllClose(scores['MSE'], scores2['MSE'])
     predictions = np.array(list(est2.predict(x=boston.data)))
     other_score = _sklearn.mean_squared_error(predictions, float64_labels)
@@ -685,14 +699,15 @@ class EstimatorTest(test.TestCase):
     scores3 = est2.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     self.assertLess(scores3['MSE'], scores['MSE'])
 
   def test_checkpoint_contains_relative_paths(self):
     tmpdir = tempfile.mkdtemp()
     est = estimator.Estimator(
-        model_dir=tmpdir,
-        model_fn=linear_model_fn_with_model_fn_ops)
+        model_dir=tmpdir, model_fn=linear_model_fn_with_model_fn_ops)
     est.fit(input_fn=boston_input_fn, steps=5)
 
     checkpoint_file_content = file_io.read_file_to_string(
@@ -700,22 +715,20 @@ class EstimatorTest(test.TestCase):
     ckpt = checkpoint_state_pb2.CheckpointState()
     text_format.Merge(checkpoint_file_content, ckpt)
     self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
-    self.assertAllEqual(
-        ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
+    self.assertAllEqual(['model.ckpt-1', 'model.ckpt-5'],
+                        ckpt.all_model_checkpoint_paths)
 
   def test_train_save_copy_reload(self):
     tmpdir = tempfile.mkdtemp()
     model_dir1 = os.path.join(tmpdir, 'model_dir1')
     est1 = estimator.Estimator(
-        model_dir=model_dir1,
-        model_fn=linear_model_fn_with_model_fn_ops)
+        model_dir=model_dir1, model_fn=linear_model_fn_with_model_fn_ops)
     est1.fit(input_fn=boston_input_fn, steps=5)
 
     model_dir2 = os.path.join(tmpdir, 'model_dir2')
     os.renames(model_dir1, model_dir2)
     est2 = estimator.Estimator(
-        model_dir=model_dir2,
-        model_fn=linear_model_fn_with_model_fn_ops)
+        model_dir=model_dir2, model_fn=linear_model_fn_with_model_fn_ops)
     self.assertEqual(5, est2.get_variable_value('global_step'))
     est2.fit(input_fn=boston_input_fn, steps=5)
     self.assertEqual(10, est2.get_variable_value('global_step'))
@@ -724,7 +737,9 @@ class EstimatorTest(test.TestCase):
     boston = base.load_boston()
     est = estimator.SKCompat(
         estimator.Estimator(
-            model_fn=linear_model_params_fn, params={'learning_rate': 0.01}))
+            model_fn=linear_model_params_fn, params={
+                'learning_rate': 0.01
+            }))
     est.fit(x=boston.data, y=boston.target, steps=100)
 
   def testHooksNotChanged(self):
@@ -824,11 +839,13 @@ class EstimatorTest(test.TestCase):
 
   def testMonitorsForFit(self):
     est = estimator.Estimator(model_fn=linear_model_fn)
-    est.fit(input_fn=boston_input_fn,
-            steps=21,
-            monitors=[CheckCallsMonitor(expect_calls=21)])
+    est.fit(
+        input_fn=boston_input_fn,
+        steps=21,
+        monitors=[CheckCallsMonitor(expect_calls=21)])
 
   def testHooksForEvaluate(self):
+
     class CheckCallHook(session_run_hook.SessionRunHook):
 
       def __init__(self):
@@ -874,7 +891,9 @@ class EstimatorTest(test.TestCase):
     est.evaluate(
         input_fn=boston_input_fn,
         steps=200,
-        metrics={'MSE': _streaming_mean_squared_error_histogram})
+        metrics={
+            'MSE': _streaming_mean_squared_error_histogram
+        })
     events = util_test.latest_events(est.model_dir + '/eval')
     output_values = {}
     for e in events:
@@ -903,7 +922,9 @@ class EstimatorTest(test.TestCase):
     est.evaluate(
         input_fn=boston_input_fn,
         steps=200,
-        metrics={'PMT': _streaming_precition_mean_tensor})
+        metrics={
+            'PMT': _streaming_precition_mean_tensor
+        })
     events = util_test.latest_events(est.model_dir + '/eval')
     output_values = {}
     for e in events:
@@ -956,8 +977,8 @@ class EstimatorTest(test.TestCase):
     self.assertTrue(
         gfile.Exists(
             os.path.join(
-                compat.as_bytes(export_dir), compat.as_bytes(
-                    'saved_model.pb'))))
+                compat.as_bytes(export_dir),
+                compat.as_bytes('saved_model.pb'))))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
@@ -1017,11 +1038,11 @@ class EstimatorTest(test.TestCase):
         self.assertTrue('input_example_tensor' in graph_ops)
         self.assertTrue('ParseExample/ParseExample' in graph_ops)
         self.assertTrue('linear/linear/feature/matmul' in graph_ops)
-        self.assertItemsEqual(
-          ['bogus_lookup', 'feature'],
-          [compat.as_str_any(x) for x in graph.get_collection(
-            constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS)])
-
+        self.assertItemsEqual(['bogus_lookup', 'feature'], [
+            compat.as_str_any(x)
+            for x in graph.get_collection(
+                constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS)
+        ])
 
     # cleanup
     gfile.DeleteRecursively(tmpdir)
@@ -1039,8 +1060,8 @@ class EstimatorTest(test.TestCase):
     self.assertTrue(
         gfile.Exists(
             os.path.join(
-                compat.as_bytes(export_dir), compat.as_bytes(
-                    'saved_model.pb'))))
+                compat.as_bytes(export_dir),
+                compat.as_bytes('saved_model.pb'))))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
@@ -1083,19 +1104,22 @@ class EstimatorTest(test.TestCase):
     export_dir_base = os.path.join(
         compat.as_bytes(tmpdir), compat.as_bytes('export'))
     export_dir = est.export_savedmodel(
-        export_dir_base, serving_input_fn, assets_extra=assets_extra,
+        export_dir_base,
+        serving_input_fn,
+        assets_extra=assets_extra,
         graph_rewrite_specs=[
             estimator.GraphRewriteSpec(['tag_1'], []),
             estimator.GraphRewriteSpec(['tag_2', 'tag_3'],
-                                       ['strip_unused_nodes'])])
+                                       ['strip_unused_nodes'])
+        ])
 
     self.assertTrue(gfile.Exists(export_dir_base))
     self.assertTrue(gfile.Exists(export_dir))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
-                compat.as_bytes(export_dir), compat.as_bytes(
-                    'saved_model.pb'))))
+                compat.as_bytes(export_dir),
+                compat.as_bytes('saved_model.pb'))))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
@@ -1208,18 +1232,15 @@ class InferRealValuedColumnsTest(test.TestCase):
     self.assertEqual(1, len(feature_columns))
     feature_column = feature_columns[0]
     self.assertEqual('', feature_column.name)
-    self.assertEqual(
-        {
-            '':
-                parsing_ops.FixedLenFeature(
-                    shape=expected_shape, dtype=expected_dtype)
-        },
-        feature_column.config)
+    self.assertEqual({
+        '':
+            parsing_ops.FixedLenFeature(
+                shape=expected_shape, dtype=expected_dtype)
+    }, feature_column.config)
 
   def testInt32Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.int32))
+        np.ones(shape=[7, 8], dtype=np.int32))
     self._assert_single_feature_column([8], dtypes.int32, feature_columns)
 
   def testInt32InputFn(self):
@@ -1229,8 +1250,7 @@ class InferRealValuedColumnsTest(test.TestCase):
 
   def testInt64Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.int64))
+        np.ones(shape=[7, 8], dtype=np.int64))
     self._assert_single_feature_column([8], dtypes.int64, feature_columns)
 
   def testInt64InputFn(self):
@@ -1240,8 +1260,7 @@ class InferRealValuedColumnsTest(test.TestCase):
 
   def testFloat32Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.float32))
+        np.ones(shape=[7, 8], dtype=np.float32))
     self._assert_single_feature_column([8], dtypes.float32, feature_columns)
 
   def testFloat32InputFn(self):
@@ -1251,8 +1270,7 @@ class InferRealValuedColumnsTest(test.TestCase):
 
   def testFloat64Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.float64))
+        np.ones(shape=[7, 8], dtype=np.float64))
     self._assert_single_feature_column([8], dtypes.float64, feature_columns)
 
   def testFloat64InputFn(self):
@@ -1271,8 +1289,8 @@ class InferRealValuedColumnsTest(test.TestCase):
         ValueError, 'on integer or non floating types are not supported'):
       # pylint: disable=g-long-lambda
       estimator.infer_real_valued_columns_from_input_fn(
-          lambda: (constant_op.constant(False, shape=[7, 8], dtype=dtypes.bool),
-                   None))
+          lambda: (constant_op.constant(False, shape=[7, 8], dtype=dtypes.bool), None)
+      )
 
   def testStringInput(self):
     with self.assertRaisesRegexp(
@@ -1309,8 +1327,9 @@ class ReplicaDeviceSetterTest(test.TestCase):
 
   def testVariablesAreOnPs(self):
     tf_config = {'cluster': {run_config.TaskType.PS: ['fake_ps_0']}}
-    with test.mock.patch.dict('os.environ',
-                              {'TF_CONFIG': json.dumps(tf_config)}):
+    with test.mock.patch.dict('os.environ', {
+        'TF_CONFIG': json.dumps(tf_config)
+    }):
       config = run_config.RunConfig()
 
     with ops.device(estimator._get_replica_device_setter(config)):
@@ -1337,14 +1356,14 @@ class ReplicaDeviceSetterTest(test.TestCase):
 
   def testMutableHashTableIsOnPs(self):
     tf_config = {'cluster': {run_config.TaskType.PS: ['fake_ps_0']}}
-    with test.mock.patch.dict('os.environ',
-                              {'TF_CONFIG': json.dumps(tf_config)}):
+    with test.mock.patch.dict('os.environ', {
+        'TF_CONFIG': json.dumps(tf_config)
+    }):
       config = run_config.RunConfig()
 
     with ops.device(estimator._get_replica_device_setter(config)):
       default_val = constant_op.constant([-1, -1], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
+      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
       input_string = constant_op.constant(['brain', 'salad', 'tank'])
       output = table.lookup(input_string)
     self.assertDeviceEqual('/job:ps/task:0', table._table_ref.device)
@@ -1354,8 +1373,7 @@ class ReplicaDeviceSetterTest(test.TestCase):
     with ops.device(
         estimator._get_replica_device_setter(run_config.RunConfig())):
       default_val = constant_op.constant([-1, -1], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
+      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
       input_string = constant_op.constant(['brain', 'salad', 'tank'])
       output = table.lookup(input_string)
     self.assertDeviceEqual('', table._table_ref.device)
@@ -1371,8 +1389,9 @@ class ReplicaDeviceSetterTest(test.TestCase):
             'index': 3
         }
     }
-    with test.mock.patch.dict('os.environ',
-                              {'TF_CONFIG': json.dumps(tf_config)}):
+    with test.mock.patch.dict('os.environ', {
+        'TF_CONFIG': json.dumps(tf_config)
+    }):
       config = run_config.RunConfig()
 
     with ops.device(estimator._get_replica_device_setter(config)):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
index 8131e0fde6..2113fae394 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
@@ -72,9 +72,11 @@ class FeatureEngineeringFunctionTest(test.TestCase):
     # predictions = transformed_x (9)
     self.assertEqual(9., prediction)
     metrics = estimator.evaluate(
-        input_fn=input_fn, steps=1,
-        metrics={"label":
-                 metric_spec.MetricSpec(lambda predictions, labels: labels)})
+        input_fn=input_fn,
+        steps=1,
+        metrics={
+            "label": metric_spec.MetricSpec(lambda predictions, labels: labels)
+        })
     # labels = transformed_y (99)
     self.assertEqual(99., metrics["label"])
 
@@ -82,10 +84,10 @@ class FeatureEngineeringFunctionTest(test.TestCase):
 
     def input_fn():
       return {
-               "x": constant_op.constant(["9."])
-             }, {
-               "y": constant_op.constant(["99."])
-             }
+          "x": constant_op.constant(["9."])
+      }, {
+          "y": constant_op.constant(["99."])
+      }
 
     def feature_engineering_fn(features, labels):
       # Github #12205: raise a TypeError if called twice.
@@ -104,15 +106,17 @@ class FeatureEngineeringFunctionTest(test.TestCase):
       return predictions, loss, update_global_step
 
     estimator = estimator_lib.Estimator(
-      model_fn=model_fn, feature_engineering_fn=feature_engineering_fn)
+        model_fn=model_fn, feature_engineering_fn=feature_engineering_fn)
     estimator.fit(input_fn=input_fn, steps=1)
     prediction = next(estimator.predict(input_fn=input_fn, as_iterable=True))
     # predictions = transformed_x (9)
     self.assertEqual(9., prediction)
     metrics = estimator.evaluate(
-      input_fn=input_fn, steps=1,
-      metrics={"label":
-                 metric_spec.MetricSpec(lambda predictions, labels: labels)})
+        input_fn=input_fn,
+        steps=1,
+        metrics={
+            "label": metric_spec.MetricSpec(lambda predictions, labels: labels)
+        })
     # labels = transformed_y (99)
     self.assertEqual(99., metrics["label"])
 
@@ -150,12 +154,10 @@ class FeatureEngineeringFunctionTest(test.TestCase):
 
     # predictions = x
     prediction_with_fe_fn = next(
-        estimator_with_fe_fn.predict(
-            input_fn=input_fn, as_iterable=True))
+        estimator_with_fe_fn.predict(input_fn=input_fn, as_iterable=True))
     self.assertEqual(9., prediction_with_fe_fn)
     prediction_without_fe_fn = next(
-        estimator_without_fe_fn.predict(
-            input_fn=input_fn, as_iterable=True))
+        estimator_without_fe_fn.predict(input_fn=input_fn, as_iterable=True))
     self.assertEqual(1., prediction_without_fe_fn)
 
 
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index 3e0b1ad21a..0948dee7e2 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Monitors instrument the training process.
 
 @@get_default_monitors
@@ -151,8 +150,8 @@ class BaseMonitor(object):
       ValueError: if we've not begun an epoch, or `epoch` number does not match.
     """
     if self._current_epoch != epoch:
-      raise ValueError(
-          "epoch_end expected %s but got %s.", self._current_epoch, epoch)
+      raise ValueError("epoch_end expected %s but got %s.", self._current_epoch,
+                       epoch)
     self._current_epoch = None
 
   def step_begin(self, step):
@@ -171,8 +170,8 @@ class BaseMonitor(object):
       ValueError: if we've already begun a step, or `step` < 0, or
           `step` > `max_steps`.
     """
-    if (step < 0) or (
-        (self._max_steps is not None) and (step > self._max_steps)):
+    if (step < 0) or ((self._max_steps is not None) and
+                      (step > self._max_steps)):
       raise ValueError("Invalid step %s." % step)
     self._current_step = step
     return []
@@ -203,8 +202,8 @@ class BaseMonitor(object):
       ValueError: if we've not begun a step, or `step` number does not match.
     """
     if self._current_step != step:
-      raise ValueError(
-          "step_end expected %s but got %s.", self._current_step, step)
+      raise ValueError("step_end expected %s but got %s.", self._current_step,
+                       step)
     self._current_step = None
     return False
 
@@ -253,6 +252,7 @@ class EveryN(BaseMonitor):
   treatment.
 
   """
+
   # TODO(ipolosukhin): Add also every n seconds.
 
   def __init__(self, every_n_steps=100, first_n_steps=1):
@@ -475,8 +475,8 @@ class LoggingTrainable(EveryN):
     super(LoggingTrainable, self).every_n_step_begin(step)
     # Get a list of trainable variables at the beginning of every N steps.
     # We cannot get this in __init__ because train_op has not been generated.
-    trainables = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,
-                                    scope=self._scope)
+    trainables = ops.get_collection(
+        ops.GraphKeys.TRAINABLE_VARIABLES, scope=self._scope)
     self._names = {}
     for var in trainables:
       self._names[var.name] = var.value().name
@@ -561,12 +561,19 @@ class ValidationMonitor(EveryN):
   provided.
   """
 
-  def __init__(self, x=None, y=None, input_fn=None, batch_size=None,
+  def __init__(self,
+               x=None,
+               y=None,
+               input_fn=None,
+               batch_size=None,
                eval_steps=None,
-               every_n_steps=100, metrics=None, hooks=None,
+               every_n_steps=100,
+               metrics=None,
+               hooks=None,
                early_stopping_rounds=None,
                early_stopping_metric="loss",
-               early_stopping_metric_minimize=True, name=None):
+               early_stopping_metric_minimize=True,
+               name=None):
     """Initializes a ValidationMonitor.
 
     Args:
@@ -597,8 +604,8 @@ class ValidationMonitor(EveryN):
     Raises:
       ValueError: If both x and input_fn are provided.
     """
-    super(ValidationMonitor, self).__init__(every_n_steps=every_n_steps,
-                                            first_n_steps=-1)
+    super(ValidationMonitor, self).__init__(
+        every_n_steps=every_n_steps, first_n_steps=-1)
     # TODO(mdan): Checks like this are already done by evaluate.
     if x is None and input_fn is None:
       raise ValueError("Either x or input_fn should be provided.")
@@ -654,20 +661,27 @@ class ValidationMonitor(EveryN):
 
   def _evaluate_estimator(self):
     if isinstance(self._estimator, core_estimator.Estimator):
-      if any((x is not None for x in
-              [self.x, self.y, self.batch_size, self.metrics])):
+      if any((x is not None
+              for x in [self.x, self.y, self.batch_size, self.metrics])):
         raise ValueError(
             "tf.estimator.Estimator does not support following "
             "arguments: x, y, batch_size, metrics. Should set as `None` "
             "in ValidationMonitor")
       return self._estimator.evaluate(
-          input_fn=self.input_fn, steps=self.eval_steps, hooks=self.hooks,
+          input_fn=self.input_fn,
+          steps=self.eval_steps,
+          hooks=self.hooks,
           name=self.name)
     else:
       return self._estimator.evaluate(
-          x=self.x, y=self.y, input_fn=self.input_fn,
-          batch_size=self.batch_size, steps=self.eval_steps,
-          metrics=self.metrics, hooks=self.hooks, name=self.name)
+          x=self.x,
+          y=self.y,
+          input_fn=self.input_fn,
+          batch_size=self.batch_size,
+          steps=self.eval_steps,
+          metrics=self.metrics,
+          hooks=self.hooks,
+          name=self.name)
 
   def every_n_step_end(self, step, outputs):
     super(ValidationMonitor, self).every_n_step_end(step, outputs)
@@ -700,8 +714,9 @@ class ValidationMonitor(EveryN):
     # Early stopping logic.
     if self.early_stopping_rounds is not None:
       if self.early_stopping_metric not in validation_outputs:
-        raise ValueError("Metric %s missing from outputs %s." % (
-            self.early_stopping_metric, set(validation_outputs.keys())))
+        raise ValueError("Metric %s missing from outputs %s." %
+                         (self.early_stopping_metric,
+                          set(validation_outputs.keys())))
       current_value = validation_outputs[self.early_stopping_metric]
       if (self._best_value is None or (self.early_stopping_metric_minimize and
                                        (current_value < self._best_value)) or
@@ -712,9 +727,9 @@ class ValidationMonitor(EveryN):
         self._best_value_step = step
       stop_now = (step - self._best_value_step >= self.early_stopping_rounds)
       if stop_now:
-        logging.info("Stopping. Best step: {} with {} = {}."
-                     .format(self._best_value_step,
-                             self.early_stopping_metric, self._best_value))
+        logging.info("Stopping. Best step: {} with {} = {}.".format(
+            self._best_value_step, self.early_stopping_metric,
+            self._best_value))
         self._early_stopped = True
         return True
     return False
@@ -763,8 +778,11 @@ class CaptureVariable(EveryN):
     self._var_values[step] = _extract_output(outputs, self._var_name)
 
 
-def get_default_monitors(loss_op=None, summary_op=None, save_summary_steps=100,
-                         output_dir=None, summary_writer=None):
+def get_default_monitors(loss_op=None,
+                         summary_op=None,
+                         save_summary_steps=100,
+                         output_dir=None,
+                         summary_writer=None):
   """Returns a default set of typically-used monitors.
 
   Args:
@@ -782,9 +800,12 @@ def get_default_monitors(loss_op=None, summary_op=None, save_summary_steps=100,
   if loss_op is not None:
     monitors.append(PrintTensor(tensor_names={"loss": loss_op.name}))
   if summary_op is not None:
-    monitors.append(SummarySaver(summary_op, save_steps=save_summary_steps,
-                                 output_dir=output_dir,
-                                 summary_writer=summary_writer))
+    monitors.append(
+        SummarySaver(
+            summary_op,
+            save_steps=save_summary_steps,
+            output_dir=output_dir,
+            summary_writer=summary_writer))
   return monitors
 
 
@@ -794,8 +815,10 @@ class GraphDump(BaseMonitor):
   Note, this is very expensive, prefer `PrintTensor` in production.
   """
 
-  IGNORE_OPS = ["Const", "Assign", "Identity", "Placeholder",
-                "RandomUniform", "Cast", "RestoreSlice"]
+  IGNORE_OPS = [
+      "Const", "Assign", "Identity", "Placeholder", "RandomUniform", "Cast",
+      "RestoreSlice"
+  ]
 
   def __init__(self, ignore_ops=None):
     """Initializes GraphDump monitor.
@@ -881,8 +904,8 @@ class ExportMonitor(EveryN):
   """Monitor that exports Estimator every N steps."""
 
   @deprecation.deprecated("2017-03-25",
-              "ExportMonitor is deprecated. Please pass an "
-              "ExportStrategy to Experiment instead.")
+                          "ExportMonitor is deprecated. Please pass an "
+                          "ExportStrategy to Experiment instead.")
   def __init__(self,
                every_n_steps,
                export_dir,
@@ -1088,8 +1111,7 @@ class CheckpointSaver(BaseMonitor):
 class StepCounter(EveryN):
   """Steps per second monitor."""
 
-  def __init__(self, every_n_steps=100, output_dir=None,
-               summary_writer=None):
+  def __init__(self, every_n_steps=100, output_dir=None, summary_writer=None):
     super(StepCounter, self).__init__(every_n_steps=every_n_steps)
     self._summary_tag = "global_step/sec"
     self._last_reported_step = None
@@ -1101,7 +1123,8 @@ class StepCounter(EveryN):
   def set_estimator(self, estimator):
     super(StepCounter, self).set_estimator(estimator)
     if self._summary_writer is None:
-      self._summary_writer = core_summary.FileWriterCache.get(estimator.model_dir)
+      self._summary_writer = core_summary.FileWriterCache.get(
+          estimator.model_dir)
 
   def every_n_step_end(self, current_step, outputs):
     current_time = time.time()
@@ -1109,8 +1132,9 @@ class StepCounter(EveryN):
       added_steps = current_step - self._last_reported_step
       elapsed_time = current_time - self._last_reported_time
       steps_per_sec = added_steps / elapsed_time
-      summary = Summary(value=[Summary.Value(tag=self._summary_tag,
-                                             simple_value=steps_per_sec)])
+      summary = Summary(value=[
+          Summary.Value(tag=self._summary_tag, simple_value=steps_per_sec)
+      ])
       self._summary_writer.add_summary(summary, current_step)
     self._last_reported_step = current_step
     self._last_reported_time = current_time
diff --git a/tensorflow/contrib/learn/python/learn/utils/export_test.py b/tensorflow/contrib/learn/python/learn/utils/export_test.py
index 95070ada3b..9bfb1fc952 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export_test.py
@@ -50,6 +50,7 @@ def _training_input_fn():
 
 
 class ExportTest(test.TestCase):
+
   def _get_default_signature(self, export_meta_filename):
     """ Gets the default signature from the export.meta file. """
     with session.Session():
@@ -69,18 +70,18 @@ class ExportTest(test.TestCase):
     # Only the written checkpoints are exported.
     self.assertTrue(
         saver.checkpoint_exists(os.path.join(export_dir, '00000001', 'export')),
-        'Exported checkpoint expected but not found: %s' %
-        os.path.join(export_dir, '00000001', 'export'))
+        'Exported checkpoint expected but not found: %s' % os.path.join(
+            export_dir, '00000001', 'export'))
     self.assertTrue(
         saver.checkpoint_exists(os.path.join(export_dir, '00000010', 'export')),
-        'Exported checkpoint expected but not found: %s' %
-        os.path.join(export_dir, '00000010', 'export'))
+        'Exported checkpoint expected but not found: %s' % os.path.join(
+            export_dir, '00000010', 'export'))
     self.assertEquals(
         six.b(os.path.join(export_dir, '00000010')),
         export_monitor.last_export_dir)
     # Validate the signature
     signature = self._get_default_signature(
-      os.path.join(export_dir, '00000010', 'export.meta'))
+        os.path.join(export_dir, '00000010', 'export.meta'))
     self.assertTrue(signature.HasField(expected_signature))
 
   def testExportMonitor_EstimatorProvidesSignature(self):
@@ -116,8 +117,7 @@ class ExportTest(test.TestCase):
     def _serving_input_fn():
       return {
           _X_KEY:
-              random_ops.random_uniform(
-                  shape=(1,), minval=0.0, maxval=1000.0)
+              random_ops.random_uniform(shape=(1,), minval=0.0, maxval=1000.0)
       }, None
 
     input_feature_key = 'my_example_key'
@@ -160,8 +160,7 @@ class ExportTest(test.TestCase):
           input_feature_key:
               None,
           _X_KEY:
-              random_ops.random_uniform(
-                  shape=(1,), minval=0.0, maxval=1000.0)
+              random_ops.random_uniform(shape=(1,), minval=0.0, maxval=1000.0)
       }, None
 
     monitor = learn.monitors.ExportMonitor(
@@ -182,8 +181,7 @@ class ExportTest(test.TestCase):
     def _serving_input_fn():
       return {
           input_feature_key:
-              array_ops.placeholder(
-                  dtype=dtypes.string, shape=(1,))
+              array_ops.placeholder(dtype=dtypes.string, shape=(1,))
       }, None
 
     monitor = learn.monitors.ExportMonitor(
@@ -204,11 +202,9 @@ class ExportTest(test.TestCase):
     def _serving_input_fn():
       return {
           input_feature_key:
-              array_ops.placeholder(
-                  dtype=dtypes.string, shape=(1,)),
+              array_ops.placeholder(dtype=dtypes.string, shape=(1,)),
           _X_KEY:
-              random_ops.random_uniform(
-                  shape=(1,), minval=0.0, maxval=1000.0)
+              random_ops.random_uniform(shape=(1,), minval=0.0, maxval=1000.0)
       }, None
 
     export_dir = os.path.join(tempfile.mkdtemp(), 'export')
@@ -227,8 +223,8 @@ class ExportTest(test.TestCase):
 
     def _regression_signature(examples, unused_features, predictions):
       signatures = {}
-      signatures['regression'] = (exporter.regression_signature(examples,
-                                                                predictions))
+      signatures['regression'] = (
+          exporter.regression_signature(examples, predictions))
       return signatures['regression'], signatures
 
     random.seed(42)
@@ -248,10 +244,10 @@ class ExportTest(test.TestCase):
     with self.assertRaises(errors.NotFoundError):
       saver.checkpoint_exists(os.path.join(export_dir, '00000000', 'export'))
     self.assertTrue(
-      saver.checkpoint_exists(os.path.join(export_dir, '00000010', 'export')))
+        saver.checkpoint_exists(os.path.join(export_dir, '00000010', 'export')))
     # Validate the signature
     signature = self._get_default_signature(
-      os.path.join(export_dir, '00000010', 'export.meta'))
+        os.path.join(export_dir, '00000010', 'export.meta'))
     self.assertTrue(signature.HasField('regression_signature'))
 
 
diff --git a/tensorflow/contrib/learn/python/learn/utils/gc_test.py b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
index 76cfd88e1d..e7d091e18a 100644
--- a/tensorflow/contrib/learn/python/learn/utils/gc_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
@@ -34,12 +34,13 @@ def _create_parser(base_dir):
   # create a simple parser that pulls the export_version from the directory.
   def parser(path):
     # Modify the path object for RegEx match for Windows Paths
-    if os.name == 'nt':
-      match = re.match("^" + compat.as_str_any(base_dir).replace('\\','/') + "/(\\d+)$",
-                      compat.as_str_any(path.path).replace('\\','/'))
+    if os.name == "nt":
+      match = re.match(
+          "^" + compat.as_str_any(base_dir).replace("\\", "/") + "/(\\d+)$",
+          compat.as_str_any(path.path).replace("\\", "/"))
     else:
       match = re.match("^" + compat.as_str_any(base_dir) + "/(\\d+)$",
-                      compat.as_str_any(path.path))
+                       compat.as_str_any(path.path))
     if not match:
       return None
     return path._replace(export_version=int(match.group(1)))
@@ -63,7 +64,9 @@ class GcTest(test_util.TensorFlowTestCase):
 
   def testModExportVersion(self):
     paths = [
-        gc.Path("/foo", 4), gc.Path("/foo", 5), gc.Path("/foo", 6),
+        gc.Path("/foo", 4),
+        gc.Path("/foo", 5),
+        gc.Path("/foo", 6),
         gc.Path("/foo", 9)
     ]
     mod = gc.mod_export_version(2)
@@ -73,14 +76,21 @@ class GcTest(test_util.TensorFlowTestCase):
 
   def testOneOfEveryNExportVersions(self):
     paths = [
-        gc.Path("/foo", 0), gc.Path("/foo", 1), gc.Path("/foo", 3),
-        gc.Path("/foo", 5), gc.Path("/foo", 6), gc.Path("/foo", 7),
-        gc.Path("/foo", 8), gc.Path("/foo", 33)
+        gc.Path("/foo", 0),
+        gc.Path("/foo", 1),
+        gc.Path("/foo", 3),
+        gc.Path("/foo", 5),
+        gc.Path("/foo", 6),
+        gc.Path("/foo", 7),
+        gc.Path("/foo", 8),
+        gc.Path("/foo", 33)
     ]
     one_of = gc.one_of_every_n_export_versions(3)
     self.assertEqual(
         one_of(paths), [
-            gc.Path("/foo", 3), gc.Path("/foo", 6), gc.Path("/foo", 8),
+            gc.Path("/foo", 3),
+            gc.Path("/foo", 6),
+            gc.Path("/foo", 8),
             gc.Path("/foo", 33)
         ])
 
@@ -98,13 +108,19 @@ class GcTest(test_util.TensorFlowTestCase):
     f = gc.union(gc.largest_export_versions(3), gc.mod_export_version(3))
     self.assertEqual(
         f(paths), [
-            gc.Path("/foo", 0), gc.Path("/foo", 3), gc.Path("/foo", 6),
-            gc.Path("/foo", 7), gc.Path("/foo", 8), gc.Path("/foo", 9)
+            gc.Path("/foo", 0),
+            gc.Path("/foo", 3),
+            gc.Path("/foo", 6),
+            gc.Path("/foo", 7),
+            gc.Path("/foo", 8),
+            gc.Path("/foo", 9)
         ])
 
   def testNegation(self):
     paths = [
-        gc.Path("/foo", 4), gc.Path("/foo", 5), gc.Path("/foo", 6),
+        gc.Path("/foo", 4),
+        gc.Path("/foo", 5),
+        gc.Path("/foo", 6),
         gc.Path("/foo", 9)
     ]
     mod = gc.negation(gc.mod_export_version(2))
@@ -121,8 +137,7 @@ class GcTest(test_util.TensorFlowTestCase):
     gfile.MakeDirs(os.path.join(base_dir, "ignore"))
 
     self.assertEqual(
-        gc.get_paths(base_dir, _create_parser(base_dir)),
-        [
+        gc.get_paths(base_dir, _create_parser(base_dir)), [
             gc.Path(os.path.join(base_dir, "0"), 0),
             gc.Path(os.path.join(base_dir, "1"), 1),
             gc.Path(os.path.join(base_dir, "2"), 2)
@@ -131,10 +146,10 @@ class GcTest(test_util.TensorFlowTestCase):
   def testMixedStrTypes(self):
     temp_dir = compat.as_bytes(test.get_temp_dir())
 
-    for sub_dir in ['str', b'bytes', u'unicode']:
+    for sub_dir in ["str", b"bytes", u"unicode"]:
       base_dir = os.path.join(
-          (temp_dir if isinstance(sub_dir, bytes) else temp_dir.decode()),
-          sub_dir)
+          (temp_dir
+           if isinstance(sub_dir, bytes) else temp_dir.decode()), sub_dir)
       self.assertFalse(gfile.Exists(base_dir))
       gfile.MakeDirs(os.path.join(compat.as_str_any(base_dir), "42"))
       gc.get_paths(base_dir, _create_parser(base_dir))
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index c3de1c4c62..55946c128b 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -339,9 +339,9 @@ def streaming_mean_tensor(values,
       name=name)
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.accuracy. Note that the order of the '
-    'labels and predictions arguments has been switched.')
+@deprecated(None,
+            'Please switch to tf.metrics.accuracy. Note that the order of the '
+            'labels and predictions arguments has been switched.')
 def streaming_accuracy(predictions,
                        labels,
                        weights=None,
@@ -936,8 +936,9 @@ def streaming_curve_points(labels=None,
     if curve != 'ROC' and curve != 'PR':
       raise ValueError('curve must be either ROC or PR, %s unknown' % (curve))
     kepsilon = _EPSILON  # to account for floating point imprecisions
-    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                  for i in range(num_thresholds - 2)]
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
     thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
 
     values, update_ops = _streaming_confusion_matrix_at_thresholds(
@@ -973,9 +974,8 @@ def streaming_curve_points(labels=None,
     return points, update_op
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.auc. Note that the order of the '
-    'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of the '
+            'labels and predictions arguments has been switched.')
 def streaming_auc(predictions,
                   labels,
                   weights=None,
@@ -1105,8 +1105,7 @@ def _compute_dynamic_auc(labels, predictions, curve='ROC'):
       # For conformance, set precision to 1 when the number of positive
       # classifications is 0.
       y_axis_values = array_ops.where(
-          math_ops.greater(splits, 0),
-          math_ops.truediv(true_positives, splits),
+          math_ops.greater(splits, 0), math_ops.truediv(true_positives, splits),
           array_ops.ones_like(true_positives, dtype=dtypes.float64))
 
     # Calculate trapezoid areas.
@@ -1119,9 +1118,8 @@ def _compute_dynamic_auc(labels, predictions, curve='ROC'):
   # exception seems excessive) so we return 0, otherwise we finish computing.
   return control_flow_ops.cond(
       math_ops.logical_or(
-          math_ops.equal(total_positive, 0),
-          math_ops.equal(total_positive, size)
-      ),
+          math_ops.equal(total_positive, 0), math_ops.equal(
+              total_positive, size)),
       true_fn=lambda: array_ops.constant(0, dtypes.float64),
       false_fn=continue_computing_dynamic_auc)
 
@@ -1185,10 +1183,10 @@ def streaming_dynamic_auc(labels,
             array_ops.ones_like(labels, dtypes.int64),
             message='labels must be 0 or 1, at least one is >1')
     ]):
-      preds_accum, update_preds = streaming_concat(predictions,
-                                                   name='concat_preds')
-      labels_accum, update_labels = streaming_concat(labels,
-                                                     name='concat_labels')
+      preds_accum, update_preds = streaming_concat(
+          predictions, name='concat_preds')
+      labels_accum, update_labels = streaming_concat(
+          labels, name='concat_labels')
       update_op = control_flow_ops.group(update_labels, update_preds)
       auc = _compute_dynamic_auc(labels_accum, preds_accum, curve=curve)
       if updates_collections:
@@ -1571,9 +1569,9 @@ def streaming_precision_at_thresholds(predictions,
       name=name)
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.recall_at_thresholds. Note that the '
-    'order of the labels and predictions arguments has been switched.')
+@deprecated(None,
+            'Please switch to tf.metrics.recall_at_thresholds. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_recall_at_thresholds(predictions,
                                    labels,
                                    thresholds,
@@ -3299,8 +3297,13 @@ def count(values,
     return count_, update_op
 
 
-def cohen_kappa(labels, predictions_idx, num_classes, weights=None,
-                metrics_collections=None, updates_collections=None, name=None):
+def cohen_kappa(labels,
+                predictions_idx,
+                num_classes,
+                weights=None,
+                metrics_collections=None,
+                updates_collections=None,
+                name=None):
   """Calculates Cohen's kappa.
 
   [Cohen's kappa](https://en.wikipedia.org/wiki/Cohen's_kappa) is a statistic
@@ -3367,14 +3370,15 @@ def cohen_kappa(labels, predictions_idx, num_classes, weights=None,
       labels = array_ops.squeeze(labels, axis=[-1])
     predictions_idx, labels, weights = (
         metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
-            predictions=predictions_idx, labels=labels, weights=weights))
+            predictions=predictions_idx,
+            labels=labels,
+            weights=weights))
     predictions_idx.get_shape().assert_is_compatible_with(labels.get_shape())
 
-    stat_dtype = (dtypes.int64
-                  if weights is None or weights.dtype.is_integer
-                  else dtypes.float32)
-    po = metrics_impl.metric_variable(
-        (num_classes,), stat_dtype, name='po')
+    stat_dtype = (
+        dtypes.int64
+        if weights is None or weights.dtype.is_integer else dtypes.float32)
+    po = metrics_impl.metric_variable((num_classes,), stat_dtype, name='po')
     pe_row = metrics_impl.metric_variable(
         (num_classes,), stat_dtype, name='pe_row')
     pe_col = metrics_impl.metric_variable(
@@ -3382,9 +3386,12 @@ def cohen_kappa(labels, predictions_idx, num_classes, weights=None,
 
     # Table of the counts of agreement:
     counts_in_table = confusion_matrix.confusion_matrix(
-      labels, predictions_idx,
-      num_classes=num_classes, weights=weights,
-      dtype=stat_dtype, name="counts_in_table")
+        labels,
+        predictions_idx,
+        num_classes=num_classes,
+        weights=weights,
+        dtype=stat_dtype,
+        name='counts_in_table')
 
     po_t = array_ops.diag_part(counts_in_table)
     pe_row_t = math_ops.reduce_sum(counts_in_table, axis=0)
@@ -3404,12 +3411,14 @@ def cohen_kappa(labels, predictions_idx, num_classes, weights=None,
                                math_ops.to_double(total))
       # kappa = (po - pe) / (N - pe)
       k = metrics_impl._safe_scalar_div(  # pylint: disable=protected-access
-          po_sum - pe_sum, total - pe_sum, name=name)
+          po_sum - pe_sum,
+          total - pe_sum,
+          name=name)
       return k
 
     kappa = _calculate_k(po, pe_row, pe_col, name='value')
-    update_op = _calculate_k(update_po, update_pe_row, update_pe_col,
-                             name='update_op')
+    update_op = _calculate_k(
+        update_po, update_pe_row, update_pe_col, name='update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, kappa)
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 89aa29f711..e067f08bab 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -46,8 +46,7 @@ def _enqueue_vector(sess, queue, values, shape=None):
     shape = (1, len(values))
   dtype = queue.dtypes[0]
   sess.run(
-      queue.enqueue(constant_op.constant(
-          values, dtype=dtype, shape=shape)))
+      queue.enqueue(constant_op.constant(values, dtype=dtype, shape=shape)))
 
 
 def _binary_2d_label_to_sparse_value(labels):
@@ -79,8 +78,8 @@ def _binary_2d_label_to_sparse_value(labels):
     batch += 1
   shape = [len(labels), len(labels[0])]
   return sparse_tensor.SparseTensorValue(
-      np.array(indices, np.int64),
-      np.array(values, np.int64), np.array(shape, np.int64))
+      np.array(indices, np.int64), np.array(values, np.int64),
+      np.array(shape, np.int64))
 
 
 def _binary_2d_label_to_sparse(labels):
@@ -125,8 +124,8 @@ def _binary_3d_label_to_sparse_value(labels):
           assert label == 0
   shape = [len(labels), len(labels[0]), len(labels[0][0])]
   return sparse_tensor.SparseTensorValue(
-      np.array(indices, np.int64),
-      np.array(values, np.int64), np.array(shape, np.int64))
+      np.array(indices, np.int64), np.array(values, np.int64),
+      np.array(shape, np.int64))
 
 
 def _binary_3d_label_to_sparse(labels):
@@ -669,20 +668,18 @@ class StreamingTruePositivesTest(test.TestCase):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          tp, tp_update_op = metrics.streaming_true_positives(predictions,
-                                                              labels)
+          tp, tp_update_op = metrics.streaming_true_positives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -692,14 +689,12 @@ class StreamingTruePositivesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       tp, tp_update_op = metrics.streaming_true_positives(
           predictions, labels, weights=37.0)
 
@@ -717,28 +712,25 @@ class StreamingFalseNegativesTest(test.TestCase):
     ops.reset_default_graph()
 
   def testVars(self):
-    metrics.streaming_false_negatives((0, 1, 0),
-                                      (0, 1, 1))
+    metrics.streaming_false_negatives((0, 1, 0), (0, 1, 1))
     _assert_metric_variables(self, ('false_negatives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          fn, fn_update_op = metrics.streaming_false_negatives(predictions,
-                                                               labels)
+          fn, fn_update_op = metrics.streaming_false_negatives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -748,14 +740,12 @@ class StreamingFalseNegativesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       fn, fn_update_op = metrics.streaming_false_negatives(
           predictions, labels, weights=((3.0,), (5.0,), (7.0,)))
 
@@ -773,28 +763,25 @@ class StreamingFalsePositivesTest(test.TestCase):
     ops.reset_default_graph()
 
   def testVars(self):
-    metrics.streaming_false_positives((0, 1, 0),
-                                      (0, 1, 1))
+    metrics.streaming_false_positives((0, 1, 0), (0, 1, 1))
     _assert_metric_variables(self, ('false_positives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          fp, fp_update_op = metrics.streaming_false_positives(predictions,
-                                                               labels)
+          fp, fp_update_op = metrics.streaming_false_positives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -804,20 +791,17 @@ class StreamingFalsePositivesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       fp, fp_update_op = metrics.streaming_false_positives(
           predictions,
           labels,
-          weights=((1.0, 2.0, 3.0, 5.0),
-                   (7.0, 11.0, 13.0, 17.0),
-                   (19.0, 23.0, 29.0, 31.0)))
+          weights=((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0), (19.0, 23.0,
+                                                                   29.0, 31.0)))
 
       with self.test_session() as sess:
         sess.run(variables.local_variables_initializer())
@@ -833,28 +817,25 @@ class StreamingTrueNegativesTest(test.TestCase):
     ops.reset_default_graph()
 
   def testVars(self):
-    metrics.streaming_true_negatives((0, 1, 0),
-                                     (0, 1, 1))
+    metrics.streaming_true_negatives((0, 1, 0), (0, 1, 1))
     _assert_metric_variables(self, ('true_negatives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          tn, tn_update_op = metrics.streaming_true_negatives(predictions,
-                                                              labels)
+          tn, tn_update_op = metrics.streaming_true_negatives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -864,14 +845,12 @@ class StreamingTrueNegativesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       tn, tn_update_op = metrics.streaming_true_negatives(
           predictions, labels, weights=((0.0, 2.0, 3.0, 5.0),))
 
@@ -894,12 +873,9 @@ class StreamingTruePositivesAtThresholdsTest(test.TestCase):
     _assert_metric_variables(self, ('true_positives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tp, tp_update_op = metrics.streaming_true_positives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -910,12 +886,9 @@ class StreamingTruePositivesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((3, 1, 0), tp.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tp, tp_update_op = metrics.streaming_true_positives_at_thresholds(
         predictions, labels, weights=37.0, thresholds=(0.15, 0.5, 0.85))
 
@@ -937,16 +910,14 @@ class StreamingFalseNegativesAtThresholdsTest(test.TestCase):
         (0.0, 1.0, 0.0), (0, 1, 1), thresholds=(
             0.15,
             0.5,
-            0.85,))
+            0.85,
+        ))
     _assert_metric_variables(self, ('false_negatives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fn, fn_update_op = metrics.streaming_false_negatives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -957,12 +928,9 @@ class StreamingFalseNegativesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((0, 2, 3), fn.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fn, fn_update_op = metrics.streaming_false_negatives_at_thresholds(
         predictions,
         labels,
@@ -988,12 +956,9 @@ class StreamingFalsePositivesAtThresholdsTest(test.TestCase):
     _assert_metric_variables(self, ('false_positives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fp, fp_update_op = metrics.streaming_false_positives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -1004,18 +969,14 @@ class StreamingFalsePositivesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((7, 4, 2), fp.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fp, fp_update_op = metrics.streaming_false_positives_at_thresholds(
         predictions,
         labels,
-        weights=((1.0, 2.0, 3.0, 5.0),
-                 (7.0, 11.0, 13.0, 17.0),
-                 (19.0, 23.0, 29.0, 31.0)),
+        weights=((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0), (19.0, 23.0,
+                                                                 29.0, 31.0)),
         thresholds=(0.15, 0.5, 0.85))
 
     with self.test_session() as sess:
@@ -1037,12 +998,9 @@ class StreamingTrueNegativesAtThresholdsTest(test.TestCase):
     _assert_metric_variables(self, ('true_negatives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tn, tn_update_op = metrics.streaming_true_negatives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -1053,12 +1011,9 @@ class StreamingTrueNegativesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((2, 5, 7), tn.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tn, tn_update_op = metrics.streaming_true_negatives_at_thresholds(
         predictions,
         labels,
@@ -1393,8 +1348,7 @@ class StreamingFPRTest(test.TestCase):
         (10, 3), maxval=1, dtype=dtypes_lib.int64, seed=1)
     labels = random_ops.random_uniform(
         (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1413,8 +1367,7 @@ class StreamingFPRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(np_inputs)
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1424,8 +1377,7 @@ class StreamingFPRTest(test.TestCase):
   def testSomeCorrect(self):
     predictions = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
     labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1467,8 +1419,7 @@ class StreamingFPRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(1 - np_inputs)
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1478,8 +1429,7 @@ class StreamingFPRTest(test.TestCase):
   def testZeroFalsePositivesAndTrueNegativesGivesZeroFPR(self):
     predictions = array_ops.ones((1, 4))
     labels = array_ops.ones((1, 4))
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1521,8 +1471,7 @@ class StreamingFNRTest(test.TestCase):
         (10, 3), maxval=1, dtype=dtypes_lib.int64, seed=1)
     labels = random_ops.random_uniform(
         (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1541,8 +1490,7 @@ class StreamingFNRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(np_inputs)
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1552,8 +1500,7 @@ class StreamingFNRTest(test.TestCase):
   def testSomeCorrect(self):
     predictions = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
     labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1595,8 +1542,7 @@ class StreamingFNRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(1 - np_inputs)
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1606,8 +1552,7 @@ class StreamingFNRTest(test.TestCase):
   def testZeroFalseNegativesAndTruePositivesGivesZeroFNR(self):
     predictions = array_ops.zeros((1, 4))
     labels = array_ops.zeros((1, 4))
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1944,16 +1889,17 @@ class StreamingAUCTest(test.TestCase):
         enqueue_ops[i].append(x_queue.enqueue(x_batches[i, :]))
       return x_queue.dequeue()
 
-    for weights in (None, np.ones(num_samples), np.random.exponential(
-        scale=1.0, size=num_samples)):
+    for weights in (None, np.ones(num_samples),
+                    np.random.exponential(scale=1.0, size=num_samples)):
       expected_auc = _np_auc(predictions, labels, weights)
 
       with self.test_session() as sess:
         enqueue_ops = [[] for i in range(num_batches)]
         tf_predictions = _enqueue_as_batches(predictions, enqueue_ops)
         tf_labels = _enqueue_as_batches(labels, enqueue_ops)
-        tf_weights = (_enqueue_as_batches(weights, enqueue_ops) if
-                      weights is not None else None)
+        tf_weights = (
+            _enqueue_as_batches(weights, enqueue_ops)
+            if weights is not None else None)
 
         for i in range(num_batches):
           sess.run(enqueue_ops[i])
@@ -1985,17 +1931,18 @@ class StreamingDynamicAUCTest(test.TestCase):
   def testUnknownCurve(self):
     with self.assertRaisesRegexp(
         ValueError, 'curve must be either ROC or PR, TEST_CURVE unknown'):
-      metrics.streaming_dynamic_auc(labels=array_ops.ones((10, 1)),
-                                    predictions=array_ops.ones((10, 1)),
-                                    curve='TEST_CURVE')
+      metrics.streaming_dynamic_auc(
+          labels=array_ops.ones((10, 1)),
+          predictions=array_ops.ones((10, 1)),
+          curve='TEST_CURVE')
 
   def testVars(self):
     metrics.streaming_dynamic_auc(
         labels=array_ops.ones((10, 1)), predictions=array_ops.ones((10, 1)))
-    _assert_metric_variables(self, ['dynamic_auc/concat_labels/array:0',
-                                    'dynamic_auc/concat_labels/size:0',
-                                    'dynamic_auc/concat_preds/array:0',
-                                    'dynamic_auc/concat_preds/size:0'])
+    _assert_metric_variables(self, [
+        'dynamic_auc/concat_labels/array:0', 'dynamic_auc/concat_labels/size:0',
+        'dynamic_auc/concat_preds/array:0', 'dynamic_auc/concat_preds/size:0'
+    ])
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -2049,8 +1996,8 @@ class StreamingDynamicAUCTest(test.TestCase):
 
   def testNonZeroOnePredictions(self):
     with self.test_session() as sess:
-      predictions = constant_op.constant([2.5, -2.5, 2.5, -2.5],
-                                         dtype=dtypes_lib.float32)
+      predictions = constant_op.constant(
+          [2.5, -2.5, 2.5, -2.5], dtype=dtypes_lib.float32)
       labels = constant_op.constant([1, 0, 1, 0])
       auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
       sess.run(variables.local_variables_initializer())
@@ -2122,9 +2069,10 @@ class StreamingDynamicAUCTest(test.TestCase):
     num_batches = 100
     labels = np.array([])
     predictions = np.array([])
-    tf_labels = variables.Variable(array_ops.ones(batch_size, dtypes_lib.int32),
-                                   collections=[ops.GraphKeys.LOCAL_VARIABLES],
-                                   dtype=dtypes_lib.int32)
+    tf_labels = variables.Variable(
+        array_ops.ones(batch_size, dtypes_lib.int32),
+        collections=[ops.GraphKeys.LOCAL_VARIABLES],
+        dtype=dtypes_lib.int32)
     tf_predictions = variables.Variable(
         array_ops.ones(batch_size),
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -2195,8 +2143,7 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
       gotten_result: A PrecisionRecallData object.
     """
     gotten_dict = {k: t.eval() for k, t in gotten_result._asdict().items()}
-    self.assertItemsEqual(
-        list(expected_dict.keys()), list(gotten_dict.keys()))
+    self.assertItemsEqual(list(expected_dict.keys()), list(gotten_dict.keys()))
 
     for key, expected_values in expected_dict.items():
       self.assertAllClose(expected_values, gotten_dict[key])
@@ -2261,60 +2208,65 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
         sess.run(update_op)
 
       # Then verify idempotency.
-      initial_result = {k: value.eval().tolist() for k, value in
-                        result._asdict().items()}
+      initial_result = {
+          k: value.eval().tolist()
+          for k, value in result._asdict().items()
+      }
       for _ in range(3):
         self._testResultsEqual(initial_result, result)
 
   def testAllTruePositives(self):
-    self._testCase([[1]], [[True]], {
-        'tp': [1, 1, 1],
-        'fp': [0, 0, 0],
-        'tn': [0, 0, 0],
-        'fn': [0, 0, 0],
-        'precision': [1.0, 1.0, 1.0],
-        'recall': [1.0, 1.0, 1.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[1]], [[True]], {
+            'tp': [1, 1, 1],
+            'fp': [0, 0, 0],
+            'tn': [0, 0, 0],
+            'fn': [0, 0, 0],
+            'precision': [1.0, 1.0, 1.0],
+            'recall': [1.0, 1.0, 1.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testAllTrueNegatives(self):
-    self._testCase([[0]], [[False]], {
-        'tp': [0, 0, 0],
-        'fp': [1, 0, 0],
-        'tn': [0, 1, 1],
-        'fn': [0, 0, 0],
-        'precision': [0.0, 0.0, 0.0],
-        'recall': [0.0, 0.0, 0.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[0]], [[False]], {
+            'tp': [0, 0, 0],
+            'fp': [1, 0, 0],
+            'tn': [0, 1, 1],
+            'fn': [0, 0, 0],
+            'precision': [0.0, 0.0, 0.0],
+            'recall': [0.0, 0.0, 0.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testAllFalsePositives(self):
-    self._testCase([[1]], [[False]], {
-        'tp': [0, 0, 0],
-        'fp': [1, 1, 1],
-        'tn': [0, 0, 0],
-        'fn': [0, 0, 0],
-        'precision': [0.0, 0.0, 0.0],
-        'recall': [0.0, 0.0, 0.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[1]], [[False]], {
+            'tp': [0, 0, 0],
+            'fp': [1, 1, 1],
+            'tn': [0, 0, 0],
+            'fn': [0, 0, 0],
+            'precision': [0.0, 0.0, 0.0],
+            'recall': [0.0, 0.0, 0.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testAllFalseNegatives(self):
-    self._testCase([[0]], [[True]], {
-        'tp': [1, 0, 0],
-        'fp': [0, 0, 0],
-        'tn': [0, 0, 0],
-        'fn': [0, 1, 1],
-        'precision': [1.0, 0.0, 0.0],
-        'recall': [1.0, 0.0, 0.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[0]], [[True]], {
+            'tp': [1, 0, 0],
+            'fp': [0, 0, 0],
+            'tn': [0, 0, 0],
+            'fn': [0, 1, 1],
+            'precision': [1.0, 0.0, 0.0],
+            'recall': [1.0, 0.0, 0.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testManyValues(self):
     self._testCase(
         [[0.2, 0.3, 0.4, 0.6, 0.7, 0.8]],
-        [[True, False, False, True, True, True]],
-        {
+        [[True, False, False, True, True, True]], {
             'tp': [4, 3, 0],
             'fp': [2, 0, 0],
             'tn': [0, 2, 2],
@@ -2327,8 +2279,7 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
   def testManyValuesWithWeights(self):
     self._testCase(
         [[0.2, 0.3, 0.4, 0.6, 0.7, 0.8]],
-        [[True, False, False, True, True, True]],
-        {
+        [[True, False, False, True, True, True]], {
             'tp': [1.5, 1.5, 0.0],
             'fp': [2.5, 0.0, 0.0],
             'tn': [0.0, 2.5, 2.5],
@@ -2644,11 +2595,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
     labels = random_ops.random_uniform(
         (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
     thresholds = [0, 0.5, 1.0]
-    prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                              labels,
-                                                              thresholds)
-    rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                         thresholds)
+    prec, prec_op = metrics.streaming_precision_at_thresholds(
+        predictions, labels, thresholds)
+    rec, rec_op = metrics.streaming_recall_at_thresholds(
+        predictions, labels, thresholds)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -2672,11 +2622,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
       labels = constant_op.constant(inputs)
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2690,11 +2639,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
           [1, 0, 1, 0], shape=(1, 4), dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2709,11 +2657,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
       labels = constant_op.constant(1 - inputs, dtype=dtypes_lib.float32)
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2779,11 +2726,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
           [1, 0, 1, 0], shape=(1, 4), dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 1, 1, 1], shape=(1, 4))
       thresholds = [-1.0, 2.0]  # lower/higher than any values
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       prec_low = prec[0]
       prec_high = prec[1]
@@ -2803,11 +2749,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       predictions = array_ops.zeros([4], dtype=dtypes_lib.float32)
       labels = array_ops.zeros([4])
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2872,12 +2817,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       tf_predictions = predictions_queue.dequeue()
       tf_labels = labels_queue.dequeue()
 
-      prec, prec_op = metrics.streaming_precision_at_thresholds(tf_predictions,
-                                                                tf_labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(tf_predictions,
-                                                           tf_labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          tf_predictions, tf_labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          tf_predictions, tf_labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       for _ in range(int(num_samples / batch_size)):
@@ -2921,8 +2864,7 @@ class StreamingFPRThresholdsTest(test.TestCase):
         labels=array_ops.ones((10, 1)),
         thresholds=[0, 0.5, 1.0],
         updates_collections=[my_collection_name])
-    self.assertListEqual(
-        ops.get_collection(my_collection_name), [update_op])
+    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
@@ -3271,8 +3213,7 @@ class StreamingFNRThresholdsTest(test.TestCase):
         labels=array_ops.ones((10, 1)),
         thresholds=[0, 0.5, 1.0],
         updates_collections=[my_collection_name])
-    self.assertListEqual(
-        ops.get_collection(my_collection_name), [update_op])
+    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
@@ -3492,8 +3433,7 @@ class StreamingRecallAtKTest(test.TestCase):
   def testVars(self):
     metrics.streaming_recall_at_k(
         predictions=array_ops.ones((self._batch_size, self._num_classes)),
-        labels=array_ops.ones(
-            (self._batch_size,), dtype=dtypes_lib.int32),
+        labels=array_ops.ones((self._batch_size,), dtype=dtypes_lib.int32),
         k=1)
     _assert_metric_variables(self,
                              ('recall_at_1/count:0', 'recall_at_1/total:0'))
@@ -3502,8 +3442,7 @@ class StreamingRecallAtKTest(test.TestCase):
     my_collection_name = '__metrics__'
     mean, _ = metrics.streaming_recall_at_k(
         predictions=array_ops.ones((self._batch_size, self._num_classes)),
-        labels=array_ops.ones(
-            (self._batch_size,), dtype=dtypes_lib.int32),
+        labels=array_ops.ones((self._batch_size,), dtype=dtypes_lib.int32),
         k=1,
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
@@ -3512,8 +3451,7 @@ class StreamingRecallAtKTest(test.TestCase):
     my_collection_name = '__updates__'
     _, update_op = metrics.streaming_recall_at_k(
         predictions=array_ops.ones((self._batch_size, self._num_classes)),
-        labels=array_ops.ones(
-            (self._batch_size,), dtype=dtypes_lib.int32),
+        labels=array_ops.ones((self._batch_size,), dtype=dtypes_lib.int32),
         k=1,
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
@@ -3715,9 +3653,17 @@ class StreamingSparsePrecisionTest(test.TestCase):
       # top_k_predictions has rank < 2.
       top_k_predictions = [9, 4, 6, 2, 0]
       sp_labels = sparse_tensor.SparseTensorValue(
-          indices=np.array([[0,], [1,], [2,]], np.int64),
+          indices=np.array([[
+              0,
+          ], [
+              1,
+          ], [
+              2,
+          ]], np.int64),
           values=np.array([2, 7, 8], np.int64),
-          dense_shape=np.array([10,], np.int64))
+          dense_shape=np.array([
+              10,
+          ], np.int64))
 
       with self.assertRaises(ValueError):
         precision, _ = metrics.streaming_sparse_precision_at_top_k(
@@ -3774,8 +3720,9 @@ class StreamingSparsePrecisionTest(test.TestCase):
     # average of the 2 examples.
     labels = np.array([labels_ex1, labels_ex2], dtype=np.int64)
     predictions = (predictions_ex1, predictions_ex2)
-    streaming_precision = [(ex1 + ex2) / 2
-                           for ex1, ex2 in zip(precision_ex1, precision_ex2)]
+    streaming_precision = [
+        (ex1 + ex2) / 2 for ex1, ex2 in zip(precision_ex1, precision_ex2)
+    ]
     streaming_average_precision = [
         (ex1 + ex2) / 2
         for ex1, ex2 in zip(avg_precision_ex1, avg_precision_ex2)
@@ -3835,29 +3782,29 @@ class StreamingSparsePrecisionTest(test.TestCase):
           (predictions_top_k_ex1[:k],), labels, expected=avg_precision_ex1[i])
 
   def test_average_precision_at_top_k_static_shape_check(self):
-    predictions_top_k = array_ops.placeholder(shape=(2, None),
-                                              dtype=dtypes_lib.int64)
+    predictions_top_k = array_ops.placeholder(
+        shape=(2, None), dtype=dtypes_lib.int64)
     labels = np.array(((1,), (2,)), dtype=np.int64)
     # Fails due to non-static predictions_idx shape.
     with self.assertRaises(ValueError):
-      metric_ops.streaming_sparse_average_precision_at_top_k(predictions_top_k,
-                                                             labels)
+      metric_ops.streaming_sparse_average_precision_at_top_k(
+          predictions_top_k, labels)
 
     predictions_top_k = (2, 1)
     # Fails since rank of predictions_idx is less than one.
     with self.assertRaises(ValueError):
-      metric_ops.streaming_sparse_average_precision_at_top_k(predictions_top_k,
-                                                             labels)
+      metric_ops.streaming_sparse_average_precision_at_top_k(
+          predictions_top_k, labels)
     predictions_top_k = ((2,), (1,))
     # Valid static shape.
-    metric_ops.streaming_sparse_average_precision_at_top_k(predictions_top_k,
-                                                           labels)
+    metric_ops.streaming_sparse_average_precision_at_top_k(
+        predictions_top_k, labels)
 
   def test_one_label_at_k1_nan(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -3871,8 +3818,8 @@ class StreamingSparsePrecisionTest(test.TestCase):
   def test_one_label_at_k1(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -3971,8 +3918,8 @@ class StreamingSparsePrecisionTest(test.TestCase):
         [5, 7, 2, 9, 6],
     ]
     sp_labels = sparse_tensor.SparseTensorValue(
-        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2],
-                 [1, 3]],
+        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2], [1,
+                                                                          3]],
         # values -1 and 10 are outside the [0, n_classes) range and are ignored.
         values=np.array([2, 7, -1, 8, 1, 2, 5, 10], np.int64),
         dense_shape=[2, 4])
@@ -4324,8 +4271,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1_nan(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     # Classes 0,1 have 0 labels, 0 predictions, classes -1 and 4 are out of
@@ -4340,8 +4287,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1_no_predictions(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -4354,8 +4301,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -4374,8 +4321,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1_weighted(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -4647,8 +4594,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [5, 7, 2, 9, 6],
     ]
     sp_labels = sparse_tensor.SparseTensorValue(
-        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2],
-                 [1, 3]],
+        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2], [1,
+                                                                          3]],
         # values -1 and 10 are outside the [0, n_classes) range.
         values=np.array([2, 7, -1, 8, 1, 2, 5, 10], np.int64),
         dense_shape=[2, 4])
@@ -4661,10 +4608,7 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=2.0 / 2,
         class_id=2)
     self._test_sparse_recall_at_top_k(
-        sp_labels,
-        top_k_predictions,
-        expected=2.0 / 2,
-        class_id=2)
+        sp_labels, top_k_predictions, expected=2.0 / 2, class_id=2)
 
     # Class 5: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -4674,10 +4618,7 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=1.0 / 1,
         class_id=5)
     self._test_sparse_recall_at_top_k(
-        sp_labels,
-        top_k_predictions,
-        expected=1.0 / 1,
-        class_id=5)
+        sp_labels, top_k_predictions, expected=1.0 / 1, class_id=5)
 
     # Class 7: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -4687,10 +4628,7 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=0.0 / 1,
         class_id=7)
     self._test_sparse_recall_at_top_k(
-        sp_labels,
-        top_k_predictions,
-        expected=0.0 / 1,
-        class_id=7)
+        sp_labels, top_k_predictions, expected=0.0 / 1, class_id=7)
 
     # All classes: 8 labels, 3 correct.
     self._test_streaming_sparse_recall_at_k(
@@ -4740,10 +4678,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     sparse_labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 0, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 1, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 1, 1, 0]]])
     dense_labels = np.array(
         [[[2, 7, 8], [1, 2, 5]], [
             [1, 2, 5],
@@ -4771,10 +4707,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
 
     # Class 2: 4 labels, all correct.
     self._test_streaming_sparse_recall_at_k(
@@ -4813,10 +4747,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
 
     for class_id in xrange(10):
       self._test_streaming_sparse_recall_at_k(
@@ -4867,10 +4799,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
 
     # Class 2: 2 labels, both correct.
     self._test_streaming_sparse_recall_at_k(
@@ -4963,10 +4893,8 @@ class StreamingSparseRecallTest(test.TestCase):
         weights=[[0, 1], [0, 1]])
 
   def test_sparse_tensor_value(self):
-    predictions = [[0.1, 0.3, 0.2, 0.4],
-                   [0.1, 0.2, 0.3, 0.4]]
-    labels = [[0, 0, 1, 0],
-              [0, 0, 0, 1]]
+    predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    labels = [[0, 0, 1, 0], [0, 0, 0, 1]]
     expected_recall = 0.5
     with self.test_session():
       _, recall = metrics.streaming_sparse_recall_at_k(
@@ -5009,8 +4937,8 @@ class StreamingMeanAbsoluteErrorTest(test.TestCase):
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
-    error, update_op = metrics.streaming_mean_absolute_error(predictions,
-                                                             labels)
+    error, update_op = metrics.streaming_mean_absolute_error(
+        predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5031,8 +4959,8 @@ class StreamingMeanAbsoluteErrorTest(test.TestCase):
         [1, 3, 2, 3], shape=(1, 4), dtype=dtypes_lib.float32)
     weights = constant_op.constant([0, 1, 0, 1], shape=(1, 4))
 
-    error, update_op = metrics.streaming_mean_absolute_error(predictions,
-                                                             labels, weights)
+    error, update_op = metrics.streaming_mean_absolute_error(
+        predictions, labels, weights)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5075,8 +5003,8 @@ class StreamingMeanRelativeErrorTest(test.TestCase):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
     normalizer = random_ops.random_normal((10, 3), seed=3)
-    error, update_op = metrics.streaming_mean_relative_error(predictions,
-                                                             labels, normalizer)
+    error, update_op = metrics.streaming_mean_relative_error(
+        predictions, labels, normalizer)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5200,8 +5128,8 @@ class StreamingMeanSquaredErrorTest(test.TestCase):
         [1, 3, 2, 3], shape=(1, 4), dtype=dtypes_lib.float32)
     weights = constant_op.constant([0, 1, 0, 1], shape=(1, 4))
 
-    error, update_op = metrics.streaming_mean_squared_error(predictions, labels,
-                                                            weights)
+    error, update_op = metrics.streaming_mean_squared_error(
+        predictions, labels, weights)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5224,8 +5152,8 @@ class StreamingMeanSquaredErrorTest(test.TestCase):
       _enqueue_vector(sess, labels_queue, [2, 4, 6])
       labels = labels_queue.dequeue()
 
-      error, update_op = metrics.streaming_mean_squared_error(predictions,
-                                                              labels)
+      error, update_op = metrics.streaming_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       sess.run(update_op)
@@ -5292,10 +5220,10 @@ class StreamingMeanSquaredErrorTest(test.TestCase):
       _enqueue_vector(sess, labels_queue, [2, 4, 6])
       labels = labels_queue.dequeue()
 
-      mae, ma_update_op = metrics.streaming_mean_absolute_error(predictions,
-                                                                labels)
-      mse, ms_update_op = metrics.streaming_mean_squared_error(predictions,
-                                                               labels)
+      mae, ma_update_op = metrics.streaming_mean_absolute_error(
+          predictions, labels)
+      mse, ms_update_op = metrics.streaming_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       sess.run([ma_update_op, ms_update_op])
@@ -5336,8 +5264,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
-    error, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                 labels)
+    error, update_op = metrics.streaming_root_mean_squared_error(
+        predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5357,8 +5285,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
           0.0, shape=(1, 3), dtype=dtypes_lib.float32)
       labels = constant_op.constant(0.0, shape=(1, 3), dtype=dtypes_lib.float32)
 
-      rmse, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                  labels)
+      rmse, update_op = metrics.streaming_root_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       self.assertEqual(0, sess.run(update_op))
@@ -5372,8 +5300,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
       labels = constant_op.constant(
           [1, 3, 2], shape=(1, 3), dtype=dtypes_lib.float32)
 
-      rmse, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                  labels)
+      rmse, update_op = metrics.streaming_root_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(math.sqrt(6), update_op.eval(), 5)
@@ -5387,9 +5315,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
           [1, 3, 2, 3], shape=(1, 4), dtype=dtypes_lib.float32)
       weights = constant_op.constant([0, 1, 0, 1], shape=(1, 4))
 
-      rmse, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                  labels,
-                                                                  weights)
+      rmse, update_op = metrics.streaming_root_mean_squared_error(
+          predictions, labels, weights)
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(math.sqrt(13), sess.run(update_op))
@@ -5404,8 +5331,8 @@ class StreamingCovarianceTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]))
     _assert_metric_variables(self, (
         'covariance/comoment:0',
@@ -5417,8 +5344,8 @@ class StreamingCovarianceTest(test.TestCase):
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     cov, _ = metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [cov])
@@ -5426,8 +5353,8 @@ class StreamingCovarianceTest(test.TestCase):
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
@@ -5487,9 +5414,8 @@ class StreamingCovarianceTest(test.TestCase):
       cov, update_op = metrics.streaming_covariance(
           predictions, labels, weights=weights)
 
-      expected_cov = np.cov([2, 4, 6, 8],
-                            [1, 3, 2, 7],
-                            fweights=[0, 1, 3, 1])[0, 1]
+      expected_cov = np.cov(
+          [2, 4, 6, 8], [1, 3, 2, 7], fweights=[0, 1, 3, 1])[0, 1]
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(expected_cov, sess.run(update_op))
       self.assertAlmostEqual(expected_cov, cov.eval())
@@ -5514,17 +5440,18 @@ class StreamingCovarianceTest(test.TestCase):
             predictions_t: predictions[stride * i:stride * (i + 1)],
             labels_t: labels[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_cov),
-                         np.isnan(sess.run(cov, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_cov),
+            np.isnan(sess.run(cov, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_cov):
-          self.assertAlmostEqual(
-              prev_expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(prev_expected_cov,
+                                 sess.run(cov, feed_dict=feed_dict), 5)
         expected_cov = np.cov(predictions[:stride * (i + 1)],
                               labels[:stride * (i + 1)])[0, 1]
-        self.assertAlmostEqual(
-            expected_cov, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_cov,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_cov, sess.run(cov, feed_dict=feed_dict),
+                               5)
         prev_expected_cov = expected_cov
 
   def testMultiUpdateWithErrorAndWeights(self):
@@ -5552,18 +5479,20 @@ class StreamingCovarianceTest(test.TestCase):
             labels_t: labels[stride * i:stride * (i + 1)],
             weights_t: weights[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_cov),
-                         np.isnan(sess.run(cov, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_cov),
+            np.isnan(sess.run(cov, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_cov):
-          self.assertAlmostEqual(
-              prev_expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
-        expected_cov = np.cov(predictions[:stride * (i + 1)],
-                              labels[:stride * (i + 1)],
-                              fweights=weights[:stride * (i + 1)])[0, 1]
-        self.assertAlmostEqual(
-            expected_cov, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(prev_expected_cov,
+                                 sess.run(cov, feed_dict=feed_dict), 5)
+        expected_cov = np.cov(
+            predictions[:stride * (i + 1)],
+            labels[:stride * (i + 1)],
+            fweights=weights[:stride * (i + 1)])[0, 1]
+        self.assertAlmostEqual(expected_cov,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_cov, sess.run(cov, feed_dict=feed_dict),
+                               5)
         prev_expected_cov = expected_cov
 
 
@@ -5574,8 +5503,8 @@ class StreamingPearsonRTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]))
     _assert_metric_variables(self, (
         'pearson_r/covariance/comoment:0',
@@ -5595,8 +5524,8 @@ class StreamingPearsonRTest(test.TestCase):
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     pearson_r, _ = metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [pearson_r])
@@ -5604,8 +5533,8 @@ class StreamingPearsonRTest(test.TestCase):
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
@@ -5613,8 +5542,8 @@ class StreamingPearsonRTest(test.TestCase):
   def testValueTensorIsIdempotent(self):
     labels = random_ops.random_normal((10, 3), seed=2)
     predictions = labels * 0.5 + random_ops.random_normal((10, 3), seed=1) * 0.5
-    pearson_r, update_op = metrics.streaming_pearson_correlation(predictions,
-                                                                 labels)
+    pearson_r, update_op = metrics.streaming_pearson_correlation(
+        predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5633,8 +5562,8 @@ class StreamingPearsonRTest(test.TestCase):
       predictions = math_ops.to_float(math_ops.range(10))
       labels = math_ops.to_float(math_ops.range(10))
 
-      pearson_r, update_op = metrics.streaming_pearson_correlation(predictions,
-                                                                   labels)
+      pearson_r, update_op = metrics.streaming_pearson_correlation(
+          predictions, labels)
 
       expected_r = np.corrcoef(np.arange(10), np.arange(10))[0, 1]
       sess.run(variables.local_variables_initializer())
@@ -5648,8 +5577,8 @@ class StreamingPearsonRTest(test.TestCase):
       labels = constant_op.constant(
           [1, 3, 2], shape=(1, 3), dtype=dtypes_lib.float32)
 
-      pearson_r, update_op = metrics.streaming_pearson_correlation(predictions,
-                                                                   labels)
+      pearson_r, update_op = metrics.streaming_pearson_correlation(
+          predictions, labels)
 
       expected_r = np.corrcoef([2, 4, 6], [1, 3, 2])[0, 1]
       sess.run(variables.local_variables_initializer())
@@ -5698,17 +5627,18 @@ class StreamingPearsonRTest(test.TestCase):
             predictions_t: predictions[stride * i:stride * (i + 1)],
             labels_t: labels[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_r),
-                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_r),
+            np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_r):
-          self.assertAlmostEqual(
-              prev_expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(prev_expected_r,
+                                 sess.run(pearson_r, feed_dict=feed_dict), 5)
         expected_r = np.corrcoef(predictions[:stride * (i + 1)],
                                  labels[:stride * (i + 1)])[0, 1]
-        self.assertAlmostEqual(
-            expected_r, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(pearson_r, feed_dict=feed_dict), 5)
         prev_expected_r = expected_r
 
   def testMultiUpdateWithErrorAndWeights(self):
@@ -5736,19 +5666,21 @@ class StreamingPearsonRTest(test.TestCase):
             labels_t: labels[stride * i:stride * (i + 1)],
             weights_t: weights[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_r),
-                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_r),
+            np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_r):
-          self.assertAlmostEqual(
-              prev_expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
-        cmat = np.cov(predictions[:stride * (i + 1)],
-                      labels[:stride * (i + 1)],
-                      fweights=weights[:stride * (i + 1)])
+          self.assertAlmostEqual(prev_expected_r,
+                                 sess.run(pearson_r, feed_dict=feed_dict), 5)
+        cmat = np.cov(
+            predictions[:stride * (i + 1)],
+            labels[:stride * (i + 1)],
+            fweights=weights[:stride * (i + 1)])
         expected_r = cmat[0, 1] / np.sqrt(cmat[0, 0] * cmat[1, 1])
-        self.assertAlmostEqual(
-            expected_r, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(pearson_r, feed_dict=feed_dict), 5)
         prev_expected_r = expected_r
 
   def testMultiUpdateWithErrorAndSingletonBatches(self):
@@ -5758,7 +5690,7 @@ class StreamingPearsonRTest(test.TestCase):
       predictions = np.random.randn(n)
       labels = 0.5 * predictions + np.random.randn(n)
       stride = 10
-      weights = (np.arange(n).reshape(n//stride, stride) % stride == 0)
+      weights = (np.arange(n).reshape(n // stride, stride) % stride == 0)
       for row in weights:
         np.random.shuffle(row)
       # Now, weights is one-hot by row - one item per batch has non-zero weight.
@@ -5778,19 +5710,20 @@ class StreamingPearsonRTest(test.TestCase):
             labels_t: labels[stride * i:stride * (i + 1)],
             weights_t: weights[stride * i:stride * (i + 1)]
         }
-        cmat = np.cov(predictions[:stride * (i + 1)],
-                      labels[:stride * (i + 1)],
-                      fweights=weights[:stride * (i + 1)])
+        cmat = np.cov(
+            predictions[:stride * (i + 1)],
+            labels[:stride * (i + 1)],
+            fweights=weights[:stride * (i + 1)])
         expected_r = cmat[0, 1] / np.sqrt(cmat[0, 0] * cmat[1, 1])
         actual_r = sess.run(update_op, feed_dict=feed_dict)
         self.assertEqual(np.isnan(expected_r), np.isnan(actual_r))
-        self.assertEqual(np.isnan(expected_r),
-                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(expected_r),
+            np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
         if not np.isnan(expected_r):
-          self.assertAlmostEqual(
-              expected_r, actual_r, 5)
-          self.assertAlmostEqual(
-              expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(expected_r, actual_r, 5)
+          self.assertAlmostEqual(expected_r,
+                                 sess.run(pearson_r, feed_dict=feed_dict), 5)
 
 
 class StreamingMeanCosineDistanceTest(test.TestCase):
@@ -6191,20 +6124,14 @@ class StreamingMeanIOUTest(test.TestCase):
       self.assertAlmostEqual(desired_output, miou.eval())
 
   def testUpdateOpEvalIsAccumulatedConfusionMatrix(self):
-    predictions = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[5]), constant_op.constant(
-                    1, shape=[5])
-        ],
-        0)
-    labels = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[3]), constant_op.constant(
-                    1, shape=[7])
-        ],
-        0)
+    predictions = array_ops.concat([
+        constant_op.constant(0, shape=[5]),
+        constant_op.constant(1, shape=[5])
+    ], 0)
+    labels = array_ops.concat([
+        constant_op.constant(0, shape=[3]),
+        constant_op.constant(1, shape=[7])
+    ], 0)
     num_classes = 2
     with self.test_session() as sess:
       miou, update_op = metrics.streaming_mean_iou(predictions, labels,
@@ -6238,29 +6165,20 @@ class StreamingMeanIOUTest(test.TestCase):
       self.assertEqual(0., miou.eval())
 
   def testResultsWithSomeMissing(self):
-    predictions = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[5]), constant_op.constant(
-                    1, shape=[5])
-        ],
-        0)
-    labels = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[3]), constant_op.constant(
-                    1, shape=[7])
-        ],
-        0)
+    predictions = array_ops.concat([
+        constant_op.constant(0, shape=[5]),
+        constant_op.constant(1, shape=[5])
+    ], 0)
+    labels = array_ops.concat([
+        constant_op.constant(0, shape=[3]),
+        constant_op.constant(1, shape=[7])
+    ], 0)
     num_classes = 2
-    weights = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[1]), constant_op.constant(
-                    1, shape=[8]), constant_op.constant(
-                        0, shape=[1])
-        ],
-        0)
+    weights = array_ops.concat([
+        constant_op.constant(0, shape=[1]),
+        constant_op.constant(1, shape=[8]),
+        constant_op.constant(0, shape=[1])
+    ], 0)
     with self.test_session() as sess:
       miou, update_op = metrics.streaming_mean_iou(
           predictions, labels, num_classes, weights=weights)
@@ -6270,56 +6188,45 @@ class StreamingMeanIOUTest(test.TestCase):
       self.assertAlmostEqual(desired_miou, miou.eval())
 
   def testMissingClassInLabels(self):
-    labels = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 0, 0, 0, 0, 1]],
-      [[1, 1, 1, 1, 1, 1],
-       [0, 0, 0, 0, 0, 0]]])
-    predictions = constant_op.constant([
-      [[0, 0, 2, 1, 1, 0],
-       [0, 1, 2, 2, 0, 1]],
-      [[0, 0, 2, 1, 1, 1],
-       [1, 1, 2, 0, 0, 0]]])
+    labels = constant_op.constant([[[0, 0, 1, 1, 0, 0], [1, 0, 0, 0, 0, 1]],
+                                   [[1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0]]])
+    predictions = constant_op.constant(
+        [[[0, 0, 2, 1, 1, 0], [0, 1, 2, 2, 0, 1]], [[0, 0, 2, 1, 1, 1],
+                                                    [1, 1, 2, 0, 0, 0]]])
     num_classes = 3
     with self.test_session() as sess:
-      miou, update_op = metrics.streaming_mean_iou(
-          predictions, labels, num_classes)
+      miou, update_op = metrics.streaming_mean_iou(predictions, labels,
+                                                   num_classes)
       sess.run(variables.local_variables_initializer())
       self.assertAllEqual([[7, 4, 3], [3, 5, 2], [0, 0, 0]], update_op.eval())
-      self.assertAlmostEqual(
-        1 / 3 * (7 / (7 + 3 + 7) + 5 / (5 + 4 + 5) + 0 / (0 + 5 + 0)),
-        miou.eval())
+      self.assertAlmostEqual(1 / 3 * (7 / (7 + 3 + 7) + 5 / (5 + 4 + 5) + 0 /
+                                      (0 + 5 + 0)), miou.eval())
 
   def testMissingClassOverallSmall(self):
     labels = constant_op.constant([0])
     predictions = constant_op.constant([0])
     num_classes = 2
     with self.test_session() as sess:
-      miou, update_op = metrics.streaming_mean_iou(
-          predictions, labels, num_classes)
+      miou, update_op = metrics.streaming_mean_iou(predictions, labels,
+                                                   num_classes)
       sess.run(variables.local_variables_initializer())
       self.assertAllEqual([[1, 0], [0, 0]], update_op.eval())
       self.assertAlmostEqual(1, miou.eval())
 
   def testMissingClassOverallLarge(self):
-    labels = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 0, 0, 0, 0, 1]],
-      [[1, 1, 1, 1, 1, 1],
-       [0, 0, 0, 0, 0, 0]]])
-    predictions = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 1, 0, 0, 1, 1]],
-      [[0, 0, 0, 1, 1, 1],
-       [1, 1, 1, 0, 0, 0]]])
+    labels = constant_op.constant([[[0, 0, 1, 1, 0, 0], [1, 0, 0, 0, 0, 1]],
+                                   [[1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0]]])
+    predictions = constant_op.constant(
+        [[[0, 0, 1, 1, 0, 0], [1, 1, 0, 0, 1, 1]], [[0, 0, 0, 1, 1, 1],
+                                                    [1, 1, 1, 0, 0, 0]]])
     num_classes = 3
     with self.test_session() as sess:
-      miou, update_op = metrics.streaming_mean_iou(
-          predictions, labels, num_classes)
+      miou, update_op = metrics.streaming_mean_iou(predictions, labels,
+                                                   num_classes)
       sess.run(variables.local_variables_initializer())
       self.assertAllEqual([[9, 5, 0], [3, 7, 0], [0, 0, 0]], update_op.eval())
-      self.assertAlmostEqual(
-        1 / 2 * (9 / (9 + 3 + 5) + 7 / (7 + 5 + 3)), miou.eval())
+      self.assertAlmostEqual(1 / 2 * (9 / (9 + 3 + 5) + 7 / (7 + 5 + 3)),
+                             miou.eval())
 
 
 class StreamingConcatTest(test.TestCase):
@@ -6683,7 +6590,8 @@ class CohenKappaTest(test.TestCase):
     _assert_metric_variables(self, (
         'cohen_kappa/po:0',
         'cohen_kappa/pe_row:0',
-        'cohen_kappa/pe_col:0',))
+        'cohen_kappa/pe_col:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -6705,9 +6613,9 @@ class CohenKappaTest(test.TestCase):
 
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
-      (10, 1), maxval=3, dtype=dtypes_lib.int64, seed=1)
+        (10, 1), maxval=3, dtype=dtypes_lib.int64, seed=1)
     labels = random_ops.random_uniform(
-      (10, 1), maxval=3, dtype=dtypes_lib.int64, seed=2)
+        (10, 1), maxval=3, dtype=dtypes_lib.int64, seed=2)
     kappa, update_op = metrics.cohen_kappa(labels, predictions, 3)
 
     with self.test_session() as sess:
@@ -6723,10 +6631,7 @@ class CohenKappaTest(test.TestCase):
         self.assertAlmostEqual(initial_kappa, kappa.eval(), 5)
 
   def testBasic(self):
-    confusion_matrix = np.array([
-      [9, 3, 1],
-      [4, 8, 2],
-      [2, 1, 6]])
+    confusion_matrix = np.array([[9, 3, 1], [4, 8, 2], [2, 1, 6]])
     # overall total = 36
     # po = [9, 8, 6], sum(po) = 23
     # pe_row = [15, 12, 9], pe_col = [13, 14, 9], so pe = [5.42, 4.67, 2.25]
@@ -6738,8 +6643,10 @@ class CohenKappaTest(test.TestCase):
     labels, predictions = self._confusion_matrix_to_samples(confusion_matrix)
 
     dtypes = [dtypes_lib.int16, dtypes_lib.int32, dtypes_lib.int64]
-    shapes = [(len(labels,)),  # 1-dim
-              (len(labels), 1)]  # 2-dim
+    shapes = [
+        (len(labels,)),  # 1-dim
+        (len(labels), 1)
+    ]  # 2-dim
     weights = [None, np.ones_like(labels)]
 
     for dtype in dtypes:
@@ -6795,10 +6702,7 @@ class CohenKappaTest(test.TestCase):
       self.assertAlmostEqual(expect, kappa.eval(), 5)
 
   def testWeighted(self):
-    confusion_matrix = np.array([
-      [9, 3, 1],
-      [4, 8, 2],
-      [2, 1, 6]])
+    confusion_matrix = np.array([[9, 3, 1], [4, 8, 2], [2, 1, 6]])
     labels, predictions = self._confusion_matrix_to_samples(confusion_matrix)
     num_samples = np.sum(confusion_matrix, dtype=np.int32)
     weights = (np.arange(0, num_samples) % 5) / 5.0
@@ -6809,31 +6713,26 @@ class CohenKappaTest(test.TestCase):
     with self.test_session() as sess:
       predictions = constant_op.constant(predictions, dtype=dtypes_lib.float32)
       labels = constant_op.constant(labels)
-      kappa, update_op = metrics.cohen_kappa(labels, predictions, 4,
-                                             weights=weights)
+      kappa, update_op = metrics.cohen_kappa(
+          labels, predictions, 4, weights=weights)
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(expect, sess.run(update_op), 5)
       self.assertAlmostEqual(expect, kappa.eval(), 5)
 
   def testWithMultipleUpdates(self):
-    confusion_matrix = np.array([
-      [90, 30, 10, 20],
-      [40, 80, 20, 30],
-      [20, 10, 60, 35],
-      [15, 25, 30, 25]])
+    confusion_matrix = np.array([[90, 30, 10, 20], [40, 80, 20, 30],
+                                 [20, 10, 60, 35], [15, 25, 30, 25]])
     labels, predictions = self._confusion_matrix_to_samples(confusion_matrix)
     num_samples = np.sum(confusion_matrix, dtype=np.int32)
     weights = (np.arange(0, num_samples) % 5) / 5.0
     num_classes = confusion_matrix.shape[0]
 
     batch_size = num_samples // 10
-    predictions_t = array_ops.placeholder(dtypes_lib.float32,
-                                          shape=(batch_size,))
-    labels_t = array_ops.placeholder(dtypes_lib.int32,
-                                     shape=(batch_size,))
-    weights_t = array_ops.placeholder(dtypes_lib.float32,
-                                      shape=(batch_size,))
+    predictions_t = array_ops.placeholder(
+        dtypes_lib.float32, shape=(batch_size,))
+    labels_t = array_ops.placeholder(dtypes_lib.int32, shape=(batch_size,))
+    weights_t = array_ops.placeholder(dtypes_lib.float32, shape=(batch_size,))
     kappa, update_op = metrics.cohen_kappa(
         labels_t, predictions_t, num_classes, weights=weights_t)
     with self.test_session() as sess:
@@ -6841,10 +6740,13 @@ class CohenKappaTest(test.TestCase):
 
       for idx in range(0, num_samples, batch_size):
         batch_start, batch_end = idx, idx + batch_size
-        sess.run(update_op,
-                 feed_dict={labels_t: labels[batch_start:batch_end],
-                            predictions_t: predictions[batch_start:batch_end],
-                            weights_t: weights[batch_start:batch_end]})
+        sess.run(
+            update_op,
+            feed_dict={
+                labels_t: labels[batch_start:batch_end],
+                predictions_t: predictions[batch_start:batch_end],
+                weights_t: weights[batch_start:batch_end]
+            })
       # Calculated by v0.19: sklearn.metrics.cohen_kappa_score(
       #                          labels_np, predictions_np, sample_weight=weights_np)
       expect = 0.289965397924
@@ -6862,7 +6764,8 @@ class CohenKappaTest(test.TestCase):
     with self.assertRaises(ValueError):
       metrics.cohen_kappa(invalid_labels, predictions, 3)
 
-    invalid_predictions = array_ops.placeholder(dtypes_lib.float32, shape=(4, 2))
+    invalid_predictions = array_ops.placeholder(
+        dtypes_lib.float32, shape=(4, 2))
     labels = array_ops.placeholder(dtypes_lib.int32, shape=(4, 1))
     with self.assertRaises(ValueError):
       metrics.cohen_kappa(labels, invalid_predictions, 3)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
index 0d1de869f6..73dd56398c 100644
--- a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
@@ -54,10 +54,10 @@ BATCH_SIZE = 128
 DATA_DIR = '/tmp/cifar10_data'
 
 # Constants describing the training process.
-MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
-NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
+MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
+NUM_EPOCHS_PER_DECAY = 350.0  # Epochs after which learning rate decays.
 LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
-INITIAL_LEARNING_RATE = 0.1       # Initial learning rate.
+INITIAL_LEARNING_RATE = 0.1  # Initial learning rate.
 
 # If a model is trained with multiple GPUs, prefix all Op names with tower_name
 # to differentiate the operations. Note that this prefix is removed from the
@@ -82,8 +82,7 @@ def _activation_summary(x):
   # session. This helps the clarity of presentation on tensorboard.
   tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
   tf.summary.histogram(tensor_name + '/activations', x)
-  tf.summary.scalar(tensor_name + '/sparsity',
-                                       tf.nn.zero_fraction(x))
+  tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
 
 
 def _variable_on_cpu(name, shape, initializer):
@@ -120,10 +119,9 @@ def _variable_with_weight_decay(name, shape, stddev, wd):
     Variable Tensor
   """
   dtype = tf.float32
-  var = _variable_on_cpu(
-      name,
-      shape,
-      tf.truncated_normal_initializer(stddev=stddev, dtype=dtype))
+  var = _variable_on_cpu(name, shape,
+                         tf.truncated_normal_initializer(
+                             stddev=stddev, dtype=dtype))
   if wd is not None:
     weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
     tf.add_to_collection('losses', weight_decay)
@@ -188,10 +186,8 @@ def inference(images):
   # Note that the masks are applied only to the weight tensors
   # conv1
   with tf.variable_scope('conv1') as scope:
-    kernel = _variable_with_weight_decay('weights',
-                                         shape=[5, 5, 3, 64],
-                                         stddev=5e-2,
-                                         wd=0.0)
+    kernel = _variable_with_weight_decay(
+        'weights', shape=[5, 5, 3, 64], stddev=5e-2, wd=0.0)
 
     conv = tf.nn.conv2d(
         images, pruning.apply_mask(kernel, scope), [1, 1, 1, 1], padding='SAME')
@@ -201,18 +197,20 @@ def inference(images):
     _activation_summary(conv1)
 
   # pool1
-  pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
-                         padding='SAME', name='pool1')
+  pool1 = tf.nn.max_pool(
+      conv1,
+      ksize=[1, 3, 3, 1],
+      strides=[1, 2, 2, 1],
+      padding='SAME',
+      name='pool1')
   # norm1
-  norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
-                    name='norm1')
+  norm1 = tf.nn.lrn(
+      pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm1')
 
   # conv2
   with tf.variable_scope('conv2') as scope:
-    kernel = _variable_with_weight_decay('weights',
-                                         shape=[5, 5, 64, 64],
-                                         stddev=5e-2,
-                                         wd=0.0)
+    kernel = _variable_with_weight_decay(
+        'weights', shape=[5, 5, 64, 64], stddev=5e-2, wd=0.0)
     conv = tf.nn.conv2d(
         norm1, pruning.apply_mask(kernel, scope), [1, 1, 1, 1], padding='SAME')
     biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
@@ -221,19 +219,23 @@ def inference(images):
     _activation_summary(conv2)
 
   # norm2
-  norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
-                    name='norm2')
+  norm2 = tf.nn.lrn(
+      conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm2')
   # pool2
-  pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
-                         strides=[1, 2, 2, 1], padding='SAME', name='pool2')
+  pool2 = tf.nn.max_pool(
+      norm2,
+      ksize=[1, 3, 3, 1],
+      strides=[1, 2, 2, 1],
+      padding='SAME',
+      name='pool2')
 
   # local3
   with tf.variable_scope('local3') as scope:
     # Move everything into depth so we can perform a single matrix multiply.
     reshape = tf.reshape(pool2, [BATCH_SIZE, -1])
     dim = reshape.get_shape()[1].value
-    weights = _variable_with_weight_decay('weights', shape=[dim, 384],
-                                          stddev=0.04, wd=0.004)
+    weights = _variable_with_weight_decay(
+        'weights', shape=[dim, 384], stddev=0.04, wd=0.004)
     biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
     local3 = tf.nn.relu(
         tf.matmul(reshape, pruning.apply_mask(weights, scope)) + biases,
@@ -242,8 +244,8 @@ def inference(images):
 
   # local4
   with tf.variable_scope('local4') as scope:
-    weights = _variable_with_weight_decay('weights', shape=[384, 192],
-                                          stddev=0.04, wd=0.004)
+    weights = _variable_with_weight_decay(
+        'weights', shape=[384, 192], stddev=0.04, wd=0.004)
     biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
     local4 = tf.nn.relu(
         tf.matmul(local3, pruning.apply_mask(weights, scope)) + biases,
@@ -255,8 +257,8 @@ def inference(images):
   # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
   # and performs the softmax internally for efficiency.
   with tf.variable_scope('softmax_linear') as scope:
-    weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
-                                          stddev=1/192.0, wd=0.0)
+    weights = _variable_with_weight_decay(
+        'weights', [192, NUM_CLASSES], stddev=1 / 192.0, wd=0.0)
     biases = _variable_on_cpu('biases', [NUM_CLASSES],
                               tf.constant_initializer(0.0))
     softmax_linear = tf.add(
@@ -337,11 +339,12 @@ def train(total_loss, global_step):
   decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
 
   # Decay the learning rate exponentially based on the number of steps.
-  lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
-                                  global_step,
-                                  decay_steps,
-                                  LEARNING_RATE_DECAY_FACTOR,
-                                  staircase=True)
+  lr = tf.train.exponential_decay(
+      INITIAL_LEARNING_RATE,
+      global_step,
+      decay_steps,
+      LEARNING_RATE_DECAY_FACTOR,
+      staircase=True)
   tf.summary.scalar('learning_rate', lr)
 
   # Generate moving averages of all losses and associated summaries.
@@ -365,8 +368,8 @@ def train(total_loss, global_step):
       tf.summary.histogram(var.op.name + '/gradients', grad)
 
   # Track the moving averages of all trainable variables.
-  variable_averages = tf.train.ExponentialMovingAverage(
-      MOVING_AVERAGE_DECAY, global_step)
+  variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY,
+                                                        global_step)
   variables_averages_op = variable_averages.apply(tf.trainable_variables())
 
   with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
@@ -383,10 +386,13 @@ def maybe_download_and_extract():
   filename = DATA_URL.split('/')[-1]
   filepath = os.path.join(dest_directory, filename)
   if not os.path.exists(filepath):
+
     def _progress(count, block_size, total_size):
-      sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
-          float(count * block_size) / float(total_size) * 100.0))
+      sys.stdout.write('\r>> Downloading %s %.1f%%' %
+                       (filename,
+                        float(count * block_size) / float(total_size) * 100.0))
       sys.stdout.flush()
+
     filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
     print()
     statinfo = os.stat(filepath)
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.py b/tensorflow/contrib/mpi_collectives/mpi_ops.py
index 81567cc688..bd7096d9ce 100644
--- a/tensorflow/contrib/mpi_collectives/mpi_ops.py
+++ b/tensorflow/contrib/mpi_collectives/mpi_ops.py
@@ -46,17 +46,16 @@ def _load_library(name, op_list=None):
         if lib_op.name == expected_op:
           break
       else:
-        raise NameError(
-          'Could not find operator %s in dynamic library %s' %
-          (expected_op, name))
+        raise NameError('Could not find operator %s in dynamic library %s' %
+                        (expected_op, name))
     return library
   except errors.NotFoundError:
     logging.warning('%s file could not be loaded.', name)
 
 
-MPI_LIB = _load_library('mpi_collectives.so', ['MPISize', 'MPIRank',
-                                               'MPILocalRank', 'MPIAllgather',
-                                               'MPIAllreduce'])
+MPI_LIB = _load_library(
+    'mpi_collectives.so',
+    ['MPISize', 'MPIRank', 'MPILocalRank', 'MPIAllgather', 'MPIAllreduce'])
 
 
 def size(name=None):
@@ -151,15 +150,14 @@ def allgather(tensor, name=None):
   """
   # Specify that first allgather is to collect the tensor gather sizes,
   # indicated by passing in a scalar (0-D tensor) of value 0
-  sizes_flag = tf.constant(0, dtype=tf.int64, name="size_flag_const")
-  my_size = tf.slice(tf.shape(tensor, out_type=tf.int64), [0], [1], name="size_slice")
+  sizes_flag = tf.constant(0, dtype=tf.int64, name='size_flag_const')
+  my_size = tf.slice(
+      tf.shape(tensor, out_type=tf.int64), [0], [1], name='size_slice')
   if name is None:
-    name = "allgather"
-  sizing_name = "{}_sizing".format(name)
+    name = 'allgather'
+  sizing_name = '{}_sizing'.format(name)
   sizes = MPI_LIB.mpi_allgather(my_size, sizes_flag, name=sizing_name)
   return MPI_LIB.mpi_allgather(tensor, sizes, name=name)
 
 
 ops.NotDifferentiable('MPIAllgather')
-
-
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
index 6132cba1f5..716ee9cdf7 100644
--- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Wrapper optimizer for Elastic Average SGD """
 from __future__ import absolute_import
 from __future__ import division
@@ -78,23 +77,24 @@ class ElasticAverageCustomGetter(object):
   def __call__(self, getter, name, trainable, collections, *args, **kwargs):
     if trainable:
       with ops.device(self._worker_device):
-        local_var = getter(name, trainable=True,
-                           collections=[ops.GraphKeys.LOCAL_VARIABLES],
-                           *args, **kwargs)
+        local_var = getter(
+            name,
+            trainable=True,
+            collections=[ops.GraphKeys.LOCAL_VARIABLES],
+            *args,
+            **kwargs)
       global_center_variable = variable_scope.variable(
-        name='%s/%s' %
-             (GLOBAL_VARIABLE_NAME,
-              name),
-        initial_value=local_var.initialized_value(),
-        trainable=False,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+          name='%s/%s' % (GLOBAL_VARIABLE_NAME, name),
+          initial_value=local_var.initialized_value(),
+          trainable=False,
+          collections=[ops.GraphKeys.GLOBAL_VARIABLES])
 
       with ops.device(self._worker_device):
         local_center_variable = variable_scope.variable(
-          name='%s/%s' % (LOCAL_VARIABLE_NAME, name),
-          initial_value=local_var.initialized_value(),
-          trainable=False,
-          collections=[ops.GraphKeys.LOCAL_VARIABLES])
+            name='%s/%s' % (LOCAL_VARIABLE_NAME, name),
+            initial_value=local_var.initialized_value(),
+            trainable=False,
+            collections=[ops.GraphKeys.LOCAL_VARIABLES])
 
       self._local_map[local_var] = local_center_variable
       self._global_map[local_var] = global_center_variable
@@ -117,16 +117,15 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
   # Default value as paper described
   BETA = 0.9
 
-  def __init__(
-      self,
-      opt,
-      num_worker,
-      ea_custom_getter,
-      communication_period=10,
-      moving_rate=None,
-      rho=None,
-      use_locking=True,
-      name="ElasticAverageOptimizer"):
+  def __init__(self,
+               opt,
+               num_worker,
+               ea_custom_getter,
+               communication_period=10,
+               moving_rate=None,
+               rho=None,
+               use_locking=True,
+               name='ElasticAverageOptimizer'):
     """Construct a new gradient descent optimizer.
 
     Args:
@@ -160,13 +159,15 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
       self._rho = rho
 
     self._local_step = variable_scope.get_variable(
-      initializer=0,
-      trainable=False,
-      collections=[ops.GraphKeys.LOCAL_VARIABLES],
-      name="local_step")
+        initializer=0,
+        trainable=False,
+        collections=[ops.GraphKeys.LOCAL_VARIABLES],
+        name='local_step')
     self._opt._prepare()
 
-  def compute_gradients(self, loss, var_list=None,
+  def compute_gradients(self,
+                        loss,
+                        var_list=None,
                         gate_gradients=optimizer.Optimizer.GATE_OP,
                         aggregation_method=None,
                         colocate_gradients_with_ops=False,
@@ -204,16 +205,18 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     if not var_list:
       var_list = variables.trainable_variables()
 
-    elastic_difference = [math_ops.subtract(v, lv) for v, lv in zip(
-      variables.trainable_variables(),
-      [self._local_map[var] for var in var_list])]
+    elastic_difference = [
+        math_ops.subtract(v, lv)
+        for v, lv in zip(variables.trainable_variables(),
+                         [self._local_map[var] for var in var_list])
+    ]
 
     distance_loss = self._rho * math_ops.add_n(
-                      [gen_nn_ops.l2_loss(ed) for ed in elastic_difference])
+        [gen_nn_ops.l2_loss(ed) for ed in elastic_difference])
 
     total_loss = loss + distance_loss
-    return self._opt.compute_gradients(total_loss, var_list,
-                                       gate_gradients, aggregation_method,
+    return self._opt.compute_gradients(total_loss, var_list, gate_gradients,
+                                       aggregation_method,
                                        colocate_gradients_with_ops, grad_loss)
 
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
@@ -241,7 +244,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     apply_updates = self._opt.apply_gradients(grads_and_vars)
     with ops.control_dependencies([apply_updates]):
       local_update = state_ops.assign_add(
-        self._local_step, 1, name='local_step_update').op
+          self._local_step, 1, name='local_step_update').op
 
     # update global variables.
     def _Update_global_variables():
@@ -259,12 +262,16 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
             differences.append(math_ops.subtract(v, lv))
         for lvar, diff in zip(local_vars, differences):
           with ops.device(lvar.device):
-            update_ops.append(state_ops.assign_sub(lvar, math_ops.multiply(
-              self._moving_rate, diff)))
+            update_ops.append(
+                state_ops.assign_sub(lvar,
+                                     math_ops.multiply(self._moving_rate,
+                                                       diff)))
         for var, diff in zip(global_center_vars, differences):
           with ops.device(var.device):
-            update_ops.append(state_ops.assign_add(var, math_ops.multiply(
-              self._moving_rate, diff)))
+            update_ops.append(
+                state_ops.assign_add(var,
+                                     math_ops.multiply(self._moving_rate,
+                                                       diff)))
         if global_step:
           with ops.colocate_with(global_step):
             update_ops.append(state_ops.assign_add(global_step, 1))
@@ -272,10 +279,10 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
       return variable_update
 
     with ops.control_dependencies([local_update]):
-      condition = math_ops.equal(math_ops.mod(
-        self._local_step, self._period), 0)
+      condition = math_ops.equal(
+          math_ops.mod(self._local_step, self._period), 0)
       conditional_update = control_flow_ops.cond(
-        condition, _Update_global_variables, control_flow_ops.no_op)
+          condition, _Update_global_variables, control_flow_ops.no_op)
     return conditional_update
 
   def get_init_op(self, task_index):
@@ -285,10 +292,12 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     def _Add_sync_queues_and_barrier(enqueue_after_list):
       """Adds ops to enqueu on all worker queues"""
       sync_queues = [
-        data_flow_ops.FIFOQueue(self._num_worker, [dtypes.bool], shapes=[[]],
-                                shared_name='%s%s' % (
-                                  'variable_init_sync_queue', i)) for i in
-        range(self._num_worker)]
+          data_flow_ops.FIFOQueue(
+              self._num_worker, [dtypes.bool],
+              shapes=[[]],
+              shared_name='%s%s' % ('variable_init_sync_queue', i))
+          for i in range(self._num_worker)
+      ]
       queue_ops = []
       # For each other worker, add an entry in a queue
       token = constant_op.constant(False)
@@ -299,7 +308,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
           else:
             queue_ops.append(q.enqueue(token))
       queue_ops.append(
-        sync_queues[task_index].dequeue_many(len(sync_queues) - 1))
+          sync_queues[task_index].dequeue_many(len(sync_queues) - 1))
       return control_flow_ops.group(*queue_ops)
 
     init_ops = []
@@ -307,11 +316,10 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     global_center_vars = [self._global_map[var] for var in local_vars]
     local_center_vars = [self._local_map[var] for var in local_vars]
     if not (local_vars and global_center_vars and local_center_vars):
-      raise ValueError(
-        'The lists of local_variables, global_center_variables, '
-        'local_center_variables should not be empty  ')
-    for lvar, gc_var, lc_var in zip(
-        local_vars, global_center_vars, local_center_vars):
+      raise ValueError('The lists of local_variables, global_center_variables, '
+                       'local_center_variables should not be empty  ')
+    for lvar, gc_var, lc_var in zip(local_vars, global_center_vars,
+                                    local_center_vars):
       init_ops.append(state_ops.assign(lvar, gc_var))
       init_ops.append(state_ops.assign(lc_var, gc_var))
 
@@ -325,6 +333,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
 
 
 class _ElasticAverageOptimizerHook(session_run_hook.SessionRunHook):
+
   def __init__(self, ea_optimizer, is_chief, task_index):
     """Creates hook to handle ElasticAverageOptimizer initialization ops.
 
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
index 446e91018d..37539b9599 100644
--- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
@@ -38,20 +38,20 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc"):
   worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
   ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
   cluster_dict = {
-    "worker": ["localhost:%s" % port for port in worker_ports],
-    "ps": ["localhost:%s" % port for port in ps_ports]
+      "worker": ["localhost:%s" % port for port in worker_ports],
+      "ps": ["localhost:%s" % port for port in ps_ports]
   }
   cs = server_lib.ClusterSpec(cluster_dict)
 
   workers = [
-    server_lib.Server(
-      cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
-    for ix in range(num_workers)
+      server_lib.Server(
+          cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_workers)
   ]
   ps_servers = [
-    server_lib.Server(
-      cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
-    for ix in range(num_ps)
+      server_lib.Server(
+          cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_ps)
   ]
 
   return cluster_dict, workers, ps_servers
@@ -68,15 +68,14 @@ def _get_workers(num_workers, period, workers, moving_rate):
     is_chief = (worker_id == 0)
     with graph.as_default():
       worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
-      ea_coustom = ElasticAverageCustomGetter(
-        worker_device=worker_device)
-      with variable_scope.variable_scope('',
-                                         custom_getter=ea_coustom), ops.device(
-        device_setter.replica_device_setter(worker_device=worker_device,
-                                            ps_device="/job:ps/task:0/cpu:0",
-                                            ps_tasks=1)):
-        global_step = variables.Variable(0, name='global_step',
-                                         trainable=False)
+      ea_coustom = ElasticAverageCustomGetter(worker_device=worker_device)
+      with variable_scope.variable_scope(
+          "", custom_getter=ea_coustom), ops.device(
+              device_setter.replica_device_setter(
+                  worker_device=worker_device,
+                  ps_device="/job:ps/task:0/cpu:0",
+                  ps_tasks=1)):
+        global_step = variables.Variable(0, name="global_step", trainable=False)
         var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
         var_1 = variable_scope.get_variable(initializer=1.0, name="v1")
 
@@ -86,21 +85,19 @@ def _get_workers(num_workers, period, workers, moving_rate):
 
         sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
         opt = ElasticAverageOptimizer(
-          opt=sgd_opt,
-          num_worker=num_workers,
-          moving_rate=moving_rate,
-          communication_period=period,
-          ea_custom_getter=ea_coustom
-        )
+            opt=sgd_opt,
+            num_worker=num_workers,
+            moving_rate=moving_rate,
+            communication_period=period,
+            ea_custom_getter=ea_coustom)
         train_op = [
-          opt.apply_gradients(
-            ([grads_0, var_0],
-             [grads_1, var_1]), global_step)
+            opt.apply_gradients(([grads_0, var_0], [grads_1, var_1]),
+                                global_step)
         ]
         easgd_hook = opt.make_session_run_hook(is_chief, worker_id)
       # Creates MonitoredSession
-      sess = training.MonitoredTrainingSession(workers[worker_id].target,
-                                               hooks=[easgd_hook])
+      sess = training.MonitoredTrainingSession(
+          workers[worker_id].target, hooks=[easgd_hook])
 
     sessions.append(sess)
     graphs.append(graph)
@@ -110,6 +107,7 @@ def _get_workers(num_workers, period, workers, moving_rate):
 
 
 class ElasticAverageOptimizerTest(test.TestCase):
+
   def _run(self, train_op, sess):
     sess.run(train_op)
 
@@ -117,15 +115,14 @@ class ElasticAverageOptimizerTest(test.TestCase):
     num_workers = 1
     communication_period = 2
     num_ps = 1
-    cluster, workers, _ = create_local_cluster(num_workers=num_workers,
-                                               num_ps=num_ps)
+    cluster, workers, _ = create_local_cluster(
+        num_workers=num_workers, num_ps=num_ps)
 
-    sessions, graphs, train_ops = _get_workers(num_workers,
-                                               communication_period,
-                                               workers, 1.0)
+    sessions, graphs, train_ops = _get_workers(
+        num_workers, communication_period, workers, 1.0)
 
-    var_0 = graphs[0].get_tensor_by_name('v0:0')
-    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    var_0 = graphs[0].get_tensor_by_name("v0:0")
+    var_1 = graphs[0].get_tensor_by_name("v1:0")
     global_step = training_util.get_global_step(graphs[0])
     var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
     var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
@@ -166,18 +163,17 @@ class ElasticAverageOptimizerTest(test.TestCase):
     num_workers = 2
     communication_period = 1
     num_ps = 2
-    cluster, workers, _ = create_local_cluster(num_workers=num_workers,
-                                               num_ps=num_ps)
+    cluster, workers, _ = create_local_cluster(
+        num_workers=num_workers, num_ps=num_ps)
 
-    sessions, graphs, train_ops = _get_workers(num_workers,
-                                               communication_period,
-                                               workers, 0.5)
+    sessions, graphs, train_ops = _get_workers(
+        num_workers, communication_period, workers, 0.5)
 
-    var_0 = graphs[0].get_tensor_by_name('v0:0')
-    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    var_0 = graphs[0].get_tensor_by_name("v0:0")
+    var_1 = graphs[0].get_tensor_by_name("v1:0")
 
-    var_0_1 = graphs[1].get_tensor_by_name('v0:0')
-    var_1_1 = graphs[1].get_tensor_by_name('v1:0')
+    var_0_1 = graphs[1].get_tensor_by_name("v0:0")
+    var_1_1 = graphs[1].get_tensor_by_name("v1:0")
 
     var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
     var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
@@ -201,25 +197,24 @@ class ElasticAverageOptimizerTest(test.TestCase):
 
   def testPS2TasksWithClusterSpecClass(self):
     cluster_spec = server_lib.ClusterSpec({
-      "ps": ["ps0:2222", "ps1:2222"],
-      "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
     })
-    ea_coustom = ElasticAverageCustomGetter(
-      worker_device="/job:worker/task:0")
+    ea_coustom = ElasticAverageCustomGetter(worker_device="/job:worker/task:0")
     from tensorflow.python.training import device_setter
     with ops.device(
         device_setter.replica_device_setter(cluster=cluster_spec,
                                             worker_device="/job:worker/task:0",
                                             ps_device="/job:ps")), \
-         variable_scope.variable_scope('', custom_getter=ea_coustom):
+         variable_scope.variable_scope("", custom_getter=ea_coustom):
       v = variable_scope.get_variable(initializer=[1, 2], name="v")
-      w = variable_scope.get_variable(initializer=[2, 1], name='w')
-      v_g, w_g = ea_coustom._global_map[v],ea_coustom._global_map[w]
+      w = variable_scope.get_variable(initializer=[2, 1], name="w")
+      v_g, w_g = ea_coustom._global_map[v], ea_coustom._global_map[w]
       self.assertDeviceEqual("/job:worker/task:0", v.device)
       self.assertDeviceEqual("job:ps/task:0", v_g.device)
       self.assertDeviceEqual("/job:worker/task:0", w.device)
       self.assertDeviceEqual("job:ps/task:1", w_g.device)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py
index e8443e718d..578d9424b2 100644
--- a/tensorflow/contrib/predictor/predictor_factories_test.py
+++ b/tensorflow/contrib/predictor/predictor_factories_test.py
@@ -50,8 +50,8 @@ class PredictorFactoriesTest(test.TestCase):
   def testFromContribEstimator(self):
     estimator = testing_common.get_arithmetic_estimator(core=False)
     input_fn = testing_common.get_arithmetic_input_fn(core=False)
-    predictor_factories.from_contrib_estimator(estimator, input_fn,
-        output_alternative_key='sum')
+    predictor_factories.from_contrib_estimator(
+        estimator, input_fn, output_alternative_key='sum')
 
   def testFromContribEstimatorWithCoreEstimatorRaises(self):
     estimator = testing_common.get_arithmetic_estimator(core=True)
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
index c8b4e472c9..360e7dbe75 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
@@ -105,8 +105,8 @@ class SparsemaxLossTest(test.TestCase):
     tf_loss_op, tf_loss_out = self._tf_sparsemax_loss(z, q, dtype, use_gpu)
     np_loss = self._np_sparsemax_loss(z, q).astype(dtype)
 
-    self.assertAllCloseAccordingToType(np_loss, tf_loss_out,
-                                       half_atol=1e-2, half_rtol=5e-3)
+    self.assertAllCloseAccordingToType(
+        np_loss, tf_loss_out, half_atol=1e-2, half_rtol=5e-3)
     self.assertShapeEqual(np_loss, tf_loss_op)
 
   def _test_constant_add(self, dtype, random, use_gpu):
@@ -116,17 +116,17 @@ class SparsemaxLossTest(test.TestCase):
     q = np.zeros((test_obs, 10))
     q[np.arange(0, test_obs), np.random.randint(0, 10, size=test_obs)] = 1
 
-    _, tf_loss_zpc = self._tf_sparsemax_loss(
-        z + c, q, dtype, use_gpu
-    )
+    _, tf_loss_zpc = self._tf_sparsemax_loss(z + c, q, dtype, use_gpu)
 
-    _, tf_loss_z = self._tf_sparsemax_loss(
-        z, q, dtype, use_gpu
-    )
+    _, tf_loss_z = self._tf_sparsemax_loss(z, q, dtype, use_gpu)
 
-    self.assertAllCloseAccordingToType(tf_loss_zpc, tf_loss_z,
-                                       float_atol=5e-6, float_rtol=5e-6,
-                                       half_atol=1e-2, half_rtol=1e-2)
+    self.assertAllCloseAccordingToType(
+        tf_loss_zpc,
+        tf_loss_z,
+        float_atol=5e-6,
+        float_rtol=5e-6,
+        half_atol=1e-2,
+        half_rtol=1e-2)
 
   def _test_sparsemax_loss_positive(self, dtype, random, use_gpu):
     """check sparsemax-loss proposition 4"""
@@ -170,10 +170,7 @@ class SparsemaxLossTest(test.TestCase):
 
     with self.test_session(use_gpu=use_gpu):
       err = gradient_checker.compute_gradient_error(
-        logits, z.shape,
-        loss_op, (test_obs, ),
-        x_init_value=z, delta=1e-9
-      )
+          logits, z.shape, loss_op, (test_obs,), x_init_value=z, delta=1e-9)
 
     self.assertLess(err, 1e-4)
 
@@ -192,8 +189,8 @@ class SparsemaxLossTest(test.TestCase):
       tf_grad = loss_grad_op.eval()
       np_grad = self._np_sparsemax_loss_grad(z, q).astype(dtype)
 
-      self.assertAllCloseAccordingToType(np_grad, tf_grad,
-                                         half_atol=1e-2, half_rtol=5e-3)
+      self.assertAllCloseAccordingToType(
+          np_grad, tf_grad, half_atol=1e-2, half_rtol=5e-3)
       self.assertShapeEqual(np_grad, loss_grad_op)
 
   def _test_dtype(self, dtype):
@@ -220,5 +217,6 @@ class SparsemaxLossTest(test.TestCase):
   def testDouble(self):
     self._test_dtype('float64')
 
-if __name__ == "__main__":
+
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
index 82d36ee9cb..259e62bd86 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
@@ -83,8 +83,8 @@ class SparsemaxTest(test.TestCase):
     tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(z, dtype, use_gpu)
     p_sparemax = self._np_sparsemax(z).astype(dtype)
 
-    self.assertAllCloseAccordingToType(p_sparemax, tf_sparsemax_out,
-                                       half_atol=5e-3)
+    self.assertAllCloseAccordingToType(
+        p_sparemax, tf_sparsemax_out, half_atol=5e-3)
     self.assertShapeEqual(p_sparemax, tf_sparsemax_op)
 
   def _test_sparsemax_of_zero(self, dtype, random, use_gpu):
@@ -111,9 +111,8 @@ class SparsemaxTest(test.TestCase):
     p_expected = np.zeros((test_obs, 10), dtype=dtype)
     p_expected[np.arange(0, test_obs), z_sort_arg[:, 0]] = 1
 
-    tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(
-        (1 / epsilon) * z, dtype, use_gpu
-    )
+    tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax((1 / epsilon) * z,
+                                                           dtype, use_gpu)
 
     self.assertAllCloseAccordingToType(p_expected, tf_sparsemax_out)
     self.assertShapeEqual(p_expected, tf_sparsemax_op)
@@ -123,16 +122,12 @@ class SparsemaxTest(test.TestCase):
     z = random.uniform(low=-3, high=3, size=(test_obs, 10)).astype(dtype)
     c = random.uniform(low=-3, high=3, size=(test_obs, 1)).astype(dtype)
 
-    _, tf_sparsemax_zpc = self._tf_sparsemax(
-        z + c, dtype, use_gpu
-    )
+    _, tf_sparsemax_zpc = self._tf_sparsemax(z + c, dtype, use_gpu)
 
-    _, tf_sparsemax_z = self._tf_sparsemax(
-        z, dtype, use_gpu
-    )
+    _, tf_sparsemax_z = self._tf_sparsemax(z, dtype, use_gpu)
 
-    self.assertAllCloseAccordingToType(tf_sparsemax_zpc, tf_sparsemax_z,
-                                       half_atol=5e-3)
+    self.assertAllCloseAccordingToType(
+        tf_sparsemax_zpc, tf_sparsemax_z, half_atol=5e-3)
 
   def _test_permutation(self, dtype, random, use_gpu):
     """check sparsemax proposition 3"""
@@ -143,12 +138,11 @@ class SparsemaxTest(test.TestCase):
       per = random.permutation(10)
 
       tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(
-        z[i, per].reshape(1, -1), dtype, use_gpu
-      )
+          z[i, per].reshape(1, -1), dtype, use_gpu)
       p_expected = p[i, per].reshape(1, -1)
 
-      self.assertAllCloseAccordingToType(p_expected, tf_sparsemax_out,
-                                         half_atol=5e-3)
+      self.assertAllCloseAccordingToType(
+          p_expected, tf_sparsemax_out, half_atol=5e-3)
       self.assertShapeEqual(p_expected, tf_sparsemax_op)
 
   def _test_diffrence(self, dtype, random, use_gpu):
@@ -166,18 +160,14 @@ class SparsemaxTest(test.TestCase):
             continue
 
           self.assertTrue(
-            0 <= p[val, j] - p[val, i] <= z[val, j] - z[val, i] + etol,
-            "0 <= %.10f <= %.10f" % (
-              p[val, j] - p[val, i], z[val, j] - z[val, i] + etol
-            )
-          )
+              0 <= p[val, j] - p[val, i] <= z[val, j] - z[val, i] + etol,
+              '0 <= %.10f <= %.10f' % (p[val, j] - p[val, i],
+                                       z[val, j] - z[val, i] + etol))
 
   def _test_two_dimentional(self, dtype, random, use_gpu):
     """check two dimentation sparsemax case"""
     t = np.linspace(-2, 2, test_obs, dtype=dtype)
-    z = np.vstack([
-      t, np.zeros(test_obs, dtype=dtype)
-    ]).T
+    z = np.vstack([t, np.zeros(test_obs, dtype=dtype)]).T
 
     tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(z, dtype, use_gpu)
 
@@ -196,10 +186,7 @@ class SparsemaxTest(test.TestCase):
 
     with self.test_session(use_gpu=use_gpu):
       err = gradient_checker.compute_gradient_error(
-        logits, z.shape,
-        sparsemax_op, z.shape,
-        x_init_value=z, delta=1e-9
-      )
+          logits, z.shape, sparsemax_op, z.shape, x_init_value=z, delta=1e-9)
 
     self.assertLess(err, 1e-4)
 
@@ -248,5 +235,6 @@ class SparsemaxTest(test.TestCase):
   def testDouble(self):
     self._test_dtype('float64')
 
-if __name__ == "__main__":
+
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index fa4c1c0da5..461fb1c517 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Train and Eval the MNIST network.
 
 This version is like fully_connected_feed.py but uses data converted
@@ -65,6 +64,7 @@ def decode(serialized_example):
 
   return image, label
 
+
 def augment(image, label):
   # OPTIONAL: Could reshape into a 28x28 image and apply distortions
   # here.  Since we are not applying any distortions in this
@@ -72,12 +72,14 @@ def augment(image, label):
   # into a vector, we don't bother.
   return image, label
 
+
 def normalize(image, label):
   # Convert from [0, 255] -> [-0.5, 0.5] floats.
   image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
 
   return image, label
 
+
 def inputs(train, batch_size, num_epochs):
   """Reads input data num_epochs times.
 
@@ -98,9 +100,10 @@ def inputs(train, batch_size, num_epochs):
     over the dataset once. On the other hand there is no special initialization
     required.
   """
-  if not num_epochs: num_epochs = None
-  filename = os.path.join(FLAGS.train_dir,
-                          TRAIN_FILE if train else VALIDATION_FILE)
+  if not num_epochs:
+    num_epochs = None
+  filename = os.path.join(FLAGS.train_dir, TRAIN_FILE
+                          if train else VALIDATION_FILE)
 
   with tf.name_scope('input'):
     # TFRecordDataset opens a protobuf and reads entries line by line
@@ -127,13 +130,11 @@ def run_training():
   # Tell TensorFlow that the model will be built into the default Graph.
   with tf.Graph().as_default():
     # Input images and labels.
-    image_batch, label_batch = inputs(train=True, batch_size=FLAGS.batch_size,
-                               num_epochs=FLAGS.num_epochs)
+    image_batch, label_batch = inputs(
+        train=True, batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs)
 
     # Build a Graph that computes predictions from the inference model.
-    logits = mnist.inference(image_batch,
-                             FLAGS.hidden1,
-                             FLAGS.hidden2)
+    logits = mnist.inference(image_batch, FLAGS.hidden1, FLAGS.hidden2)
 
     # Add to the Graph the loss calculation.
     loss = mnist.loss(logits, label_batch)
@@ -152,7 +153,7 @@ def run_training():
       sess.run(init_op)
       try:
         step = 0
-        while True: #train until OutOfRangeError
+        while True:  #train until OutOfRangeError
           start_time = time.time()
 
           # Run one step of the model.  The return values are
@@ -168,10 +169,12 @@ def run_training():
           # Print an overview fairly often.
           if step % 100 == 0:
             print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
-                                                     duration))
+                                                       duration))
           step += 1
       except tf.errors.OutOfRangeError:
-        print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
+        print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs,
+                                                          step))
+
 
 def main(_):
   run_training()
@@ -183,37 +186,27 @@ if __name__ == '__main__':
       '--learning_rate',
       type=float,
       default=0.01,
-      help='Initial learning rate.'
-  )
+      help='Initial learning rate.')
   parser.add_argument(
       '--num_epochs',
       type=int,
       default=2,
-      help='Number of epochs to run trainer.'
-  )
+      help='Number of epochs to run trainer.')
   parser.add_argument(
       '--hidden1',
       type=int,
       default=128,
-      help='Number of units in hidden layer 1.'
-  )
+      help='Number of units in hidden layer 1.')
   parser.add_argument(
       '--hidden2',
       type=int,
       default=32,
-      help='Number of units in hidden layer 2.'
-  )
-  parser.add_argument(
-      '--batch_size',
-      type=int,
-      default=100,
-      help='Batch size.'
-  )
+      help='Number of units in hidden layer 2.')
+  parser.add_argument('--batch_size', type=int, default=100, help='Batch size.')
   parser.add_argument(
       '--train_dir',
       type=str,
       default='/tmp/data',
-      help='Directory with the training data.'
-  )
+      help='Directory with the training data.')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/label_image/label_image.py b/tensorflow/examples/label_image/label_image.py
index d62b73384c..1c1bd57d71 100644
--- a/tensorflow/examples/label_image/label_image.py
+++ b/tensorflow/examples/label_image/label_image.py
@@ -23,6 +23,7 @@ import sys
 import numpy as np
 import tensorflow as tf
 
+
 def load_graph(model_file):
   graph = tf.Graph()
   graph_def = tf.GraphDef()
@@ -34,22 +35,26 @@ def load_graph(model_file):
 
   return graph
 
-def read_tensor_from_image_file(file_name, input_height=299, input_width=299,
-				input_mean=0, input_std=255):
+
+def read_tensor_from_image_file(file_name,
+                                input_height=299,
+                                input_width=299,
+                                input_mean=0,
+                                input_std=255):
   input_name = "file_reader"
   output_name = "normalized"
   file_reader = tf.read_file(file_name, input_name)
   if file_name.endswith(".png"):
-    image_reader = tf.image.decode_png(file_reader, channels = 3,
-                                       name='png_reader')
+    image_reader = tf.image.decode_png(
+        file_reader, channels=3, name="png_reader")
   elif file_name.endswith(".gif"):
-    image_reader = tf.squeeze(tf.image.decode_gif(file_reader,
-                                                  name='gif_reader'))
+    image_reader = tf.squeeze(
+        tf.image.decode_gif(file_reader, name="gif_reader"))
   elif file_name.endswith(".bmp"):
-    image_reader = tf.image.decode_bmp(file_reader, name='bmp_reader')
+    image_reader = tf.image.decode_bmp(file_reader, name="bmp_reader")
   else:
-    image_reader = tf.image.decode_jpeg(file_reader, channels = 3,
-                                        name='jpeg_reader')
+    image_reader = tf.image.decode_jpeg(
+        file_reader, channels=3, name="jpeg_reader")
   float_caster = tf.cast(image_reader, tf.float32)
   dims_expander = tf.expand_dims(float_caster, 0)
   resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width])
@@ -59,6 +64,7 @@ def read_tensor_from_image_file(file_name, input_height=299, input_width=299,
 
   return result
 
+
 def load_labels(label_file):
   label = []
   proto_as_ascii_lines = tf.gfile.GFile(label_file).readlines()
@@ -66,6 +72,7 @@ def load_labels(label_file):
     label.append(l.rstrip())
   return label
 
+
 if __name__ == "__main__":
   file_name = "tensorflow/examples/label_image/data/grace_hopper.jpg"
   model_file = \
@@ -110,11 +117,12 @@ if __name__ == "__main__":
     output_layer = args.output_layer
 
   graph = load_graph(model_file)
-  t = read_tensor_from_image_file(file_name,
-                                  input_height=input_height,
-                                  input_width=input_width,
-                                  input_mean=input_mean,
-                                  input_std=input_std)
+  t = read_tensor_from_image_file(
+      file_name,
+      input_height=input_height,
+      input_width=input_width,
+      input_mean=input_mean,
+      input_std=input_std)
 
   input_name = "import/" + input_layer
   output_name = "import/" + output_layer
@@ -122,8 +130,9 @@ if __name__ == "__main__":
   output_operation = graph.get_operation_by_name(output_name)
 
   with tf.Session(graph=graph) as sess:
-    results = sess.run(output_operation.outputs[0],
-                      {input_operation.outputs[0]: t})
+    results = sess.run(output_operation.outputs[0], {
+        input_operation.outputs[0]: t
+    })
   results = np.squeeze(results)
 
   top_k = results.argsort()[-5:][::-1]
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 1481a4d035..e6f94396b8 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A client interface for TensorFlow."""
 
 from __future__ import absolute_import
@@ -71,8 +70,9 @@ def _get_indexed_slices_value_from_fetches(fetched_vals):
 
 
 def _get_feeds_for_indexed_slices(feed, feed_val):
-  return list(zip([feed.values, feed.indices] if feed.dense_shape is None else
-                  [feed.values, feed.indices, feed.dense_shape], feed_val))
+  return list(
+      zip([feed.values, feed.indices] if feed.dense_shape is None else
+          [feed.values, feed.indices, feed.dense_shape], feed_val))
 
 
 # List of extensions supported to convert run arguments into actual fetches and
@@ -124,6 +124,7 @@ _REGISTERED_EXPANSIONS = [
      lambda fetch: ([fetch], lambda fetched_vals: fetched_vals[0]),
      lambda feed, feed_val: [(feed, feed_val)],
      lambda feed: [feed])]
+
 # pylint: enable=g-long-lambda
 
 
@@ -132,8 +133,11 @@ def _convert_to_numpy_obj(numpy_dtype, obj):
   return numpy_dtype(obj) if numpy_dtype is not object else str(obj)
 
 
-def register_session_run_conversion_functions(tensor_type, fetch_function,
-    feed_function=None, feed_function_for_partial_run=None):
+def register_session_run_conversion_functions(
+    tensor_type,
+    fetch_function,
+    feed_function=None,
+    feed_function_for_partial_run=None):
   """Register fetch and feed conversion functions for `tf.Session.run()`.
 
   This function registers a triple of conversion functions for fetching and/or
@@ -174,11 +178,11 @@ def register_session_run_conversion_functions(tensor_type, fetch_function,
   """
   for conversion_function in _REGISTERED_EXPANSIONS:
     if issubclass(conversion_function[0], tensor_type):
-      raise ValueError(
-          '%s has already been registered so ignore it.', tensor_type)
+      raise ValueError('%s has already been registered so ignore it.',
+                       tensor_type)
       return
-  _REGISTERED_EXPANSIONS.insert(0,
-    (tensor_type, fetch_function, feed_function, feed_function_for_partial_run))
+  _REGISTERED_EXPANSIONS.insert(0, (tensor_type, fetch_function, feed_function,
+                                    feed_function_for_partial_run))
 
 
 class _FetchMapper(object):
@@ -233,8 +237,8 @@ class _FetchMapper(object):
       An instance of a subclass of `_FetchMapper` that handles the shape.
     """
     if fetch is None:
-      raise TypeError('Fetch argument %r has invalid type %r' %
-                      (fetch, type(fetch)))
+      raise TypeError('Fetch argument %r has invalid type %r' % (fetch,
+                                                                 type(fetch)))
     elif isinstance(fetch, (list, tuple)):
       # NOTE(touts): This is also the code path for namedtuples.
       return _ListFetchMapper(fetch)
@@ -247,8 +251,8 @@ class _FetchMapper(object):
           fetches, contraction_fn = fetch_fn(fetch)
           return _ElementFetchMapper(fetches, contraction_fn)
     # Did not find anything.
-    raise TypeError('Fetch argument %r has invalid type %r' %
-                    (fetch, type(fetch)))
+    raise TypeError('Fetch argument %r has invalid type %r' % (fetch,
+                                                               type(fetch)))
 
 
 class _ElementFetchMapper(_FetchMapper):
@@ -277,8 +281,8 @@ class _ElementFetchMapper(_FetchMapper):
             fetch, allow_tensor=True, allow_operation=True))
       except TypeError as e:
         raise TypeError('Fetch argument %r has invalid type %r, '
-                        'must be a string or Tensor. (%s)'
-                        % (fetch, type(fetch), str(e)))
+                        'must be a string or Tensor. (%s)' %
+                        (fetch, type(fetch), str(e)))
       except ValueError as e:
         raise ValueError('Fetch argument %r cannot be interpreted as a '
                          'Tensor. (%s)' % (fetch, str(e)))
@@ -376,8 +380,9 @@ class _DictFetchMapper(_FetchMapper):
     """
     self._fetch_type = type(fetches)
     self._keys = fetches.keys()
-    self._mappers = [_FetchMapper.for_fetch(fetch)
-                     for fetch in fetches.values()]
+    self._mappers = [
+        _FetchMapper.for_fetch(fetch) for fetch in fetches.values()
+    ]
     self._unique_fetches, self._value_indices = _uniquify_fetches(self._mappers)
 
   def unique_fetches(self):
@@ -401,6 +406,7 @@ class _FetchHandler(object):
   result structure matching the user-provided structure for fetches, but
   containing the corresponding results.
   """
+
   # TODO(touts): Make this class also take care of destructuring the feed
   # dict instead of doing it in the callers.
 
@@ -551,8 +557,11 @@ class _DeviceAttributes(object):
     return self._memory_limit_bytes
 
   def __repr__(self):
-    return '_DeviceAttributes(%s, %s, %d)' % (self.name, self.device_type,
-                                              self.memory_limit_bytes,)
+    return '_DeviceAttributes(%s, %s, %d)' % (
+        self.name,
+        self.device_type,
+        self.memory_limit_bytes,
+    )
 
 
 class BaseSession(SessionInterface):
@@ -601,8 +610,8 @@ class BaseSession(SessionInterface):
 
     if config is not None:
       if not isinstance(config, config_pb2.ConfigProto):
-        raise TypeError('config must be a tf.ConfigProto, but got %s'
-                        % type(config))
+        raise TypeError(
+            'config must be a tf.ConfigProto, but got %s' % type(config))
       self._config = config
       self._add_shapes = config.graph_options.infer_shapes
     else:
@@ -976,8 +985,8 @@ class BaseSession(SessionInterface):
       for tensor_type, _, _, feed_fn in _REGISTERED_EXPANSIONS:
         if isinstance(feed, tensor_type):
           return feed_fn(feed)
-      raise TypeError('Feed argument %r has invalid type %r'
-                      % (feed, type(feed)))
+      raise TypeError('Feed argument %r has invalid type %r' % (feed,
+                                                                type(feed)))
 
     # Check session.
     if self._closed:
@@ -998,8 +1007,8 @@ class BaseSession(SessionInterface):
     for feed in feeds:
       for subfeed in _feed_fn(feed):
         try:
-          subfeed_t = self.graph.as_graph_element(subfeed, allow_tensor=True,
-                                                  allow_operation=False)
+          subfeed_t = self.graph.as_graph_element(
+              subfeed, allow_tensor=True, allow_operation=False)
           if self._created_with_new_api:
             # pylint: disable=protected-access
             feed_list.append(subfeed_t._as_tf_output())
@@ -1007,8 +1016,7 @@ class BaseSession(SessionInterface):
           else:
             feed_list.append(compat.as_bytes(subfeed_t.name))
         except Exception as e:
-          e.message = ('Cannot interpret feed_list key as Tensor: '
-                       + e.message)
+          e.message = ('Cannot interpret feed_list key as Tensor: ' + e.message)
           e.args = (e.message,)
           raise e
 
@@ -1041,12 +1049,13 @@ class BaseSession(SessionInterface):
 
   def _run(self, handle, fetches, feed_dict, options, run_metadata):
     """Perform either run or partial_run, depending the presence of `handle`."""
+
     def _feed_fn(feed, feed_val):
       for tensor_type, _, feed_fn, _ in _REGISTERED_EXPANSIONS:
         if isinstance(feed, tensor_type):
           return feed_fn(feed, feed_val)
-      raise TypeError('Feed argument %r has invalid type %r'
-                      % (feed, type(feed)))
+      raise TypeError('Feed argument %r has invalid type %r' % (feed,
+                                                                type(feed)))
 
     # Check session.
     if self._closed:
@@ -1066,11 +1075,11 @@ class BaseSession(SessionInterface):
       for feed, feed_val in feed_dict.items():
         for subfeed, subfeed_val in _feed_fn(feed, feed_val):
           try:
-            subfeed_t = self.graph.as_graph_element(subfeed, allow_tensor=True,
-                                                    allow_operation=False)
+            subfeed_t = self.graph.as_graph_element(
+                subfeed, allow_tensor=True, allow_operation=False)
           except Exception as e:
-            raise TypeError('Cannot interpret feed_dict key as Tensor: '
-                            + e.args[0])
+            raise TypeError(
+                'Cannot interpret feed_dict key as Tensor: ' + e.args[0])
 
           if isinstance(subfeed_val, ops.Tensor):
             raise TypeError('The value of a feed cannot be a tf.Tensor object. '
@@ -1081,10 +1090,9 @@ class BaseSession(SessionInterface):
           if isinstance(subfeed_val, int) and _convert_to_numpy_obj(
               subfeed_dtype, subfeed_val) != subfeed_val:
             raise TypeError(
-                'Type of feed value ' + str(subfeed_val) + ' with type ' +
-                str(type(subfeed_val)) +
-                ' is not compatible with Tensor type ' +
-                str(subfeed_dtype) +
+                'Type of feed value ' + str(subfeed_val) + ' with type ' + str(
+                    type(subfeed_val)) +
+                ' is not compatible with Tensor type ' + str(subfeed_dtype) +
                 '. Try explicitly setting the type of the feed tensor'
                 ' to a larger type (e.g. int64).')
 
@@ -1098,10 +1106,10 @@ class BaseSession(SessionInterface):
 
           if (not is_tensor_handle_feed and
               not subfeed_t.get_shape().is_compatible_with(np_val.shape)):
-            raise ValueError(
-                'Cannot feed value of shape %r for Tensor %r, '
-                'which has shape %r'
-                % (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
+            raise ValueError('Cannot feed value of shape %r for Tensor %r, '
+                             'which has shape %r' %
+                             (np_val.shape, subfeed_t.name,
+                              str(subfeed_t.get_shape())))
           if not self.graph.is_feedable(subfeed_t):
             raise ValueError('Tensor %s may not be fed.' % subfeed_t)
 
@@ -1130,10 +1138,7 @@ class BaseSession(SessionInterface):
       results = []
     return fetch_handler.build_results(self, results)
 
-  def make_callable(self,
-                    fetches,
-                    feed_list=None,
-                    accept_options=False):
+  def make_callable(self, fetches, feed_list=None, accept_options=False):
     """Returns a Python callable that runs a particular step.
 
     The returned callable will take `len(feed_list)` arguments whose types
@@ -1176,9 +1181,12 @@ class BaseSession(SessionInterface):
       # `Session._run()` so that we can convert the feeds to a list of
       # strings here.
       def _generic_run(*feed_args, **kwargs):
-        feed_dict = {feed: feed_val
-                     for feed, feed_val in zip(feed_list, feed_args)}
+        feed_dict = {
+            feed: feed_val
+            for feed, feed_val in zip(feed_list, feed_args)
+        }
         return self.run(fetches, feed_dict=feed_dict, **kwargs)
+
       return _generic_run
 
     # Ensure any changes to the graph are reflected in the runtime.
@@ -1198,12 +1206,11 @@ class BaseSession(SessionInterface):
       fetch_list = _name_list(fetch_handler.fetches())
       target_list = _name_list(fetch_handler.targets())
 
-    def _callable_template_with_options_and_metadata(
-        fetch_list,
-        target_list,
-        fetch_handler,
-        options=None,
-        run_metadata=None):
+    def _callable_template_with_options_and_metadata(fetch_list,
+                                                     target_list,
+                                                     fetch_handler,
+                                                     options=None,
+                                                     run_metadata=None):
       """Template callable that accepts RunOptions and RunMetadata."""
       options_ptr = tf_session.TF_NewBufferFromString(
           compat.as_bytes(options.SerializeToString())) if options else None
@@ -1215,9 +1222,9 @@ class BaseSession(SessionInterface):
                 self._session, options_ptr, {}, fetch_list, target_list,
                 run_metadata_ptr, status)
           else:
-            results = tf_session.TF_Run(
-                self._session, options_ptr, {}, fetch_list, target_list, status,
-                run_metadata_ptr)
+            results = tf_session.TF_Run(self._session, options_ptr, {},
+                                        fetch_list, target_list, status,
+                                        run_metadata_ptr)
           if fetch_handler:
             results = fetch_handler.build_results(self, results)
           else:
@@ -1233,37 +1240,40 @@ class BaseSession(SessionInterface):
       return results
 
     if accept_options:
-      return functools.partial(
-          _callable_template_with_options_and_metadata, fetch_list,
-          target_list, fetch_handler)
+      return functools.partial(_callable_template_with_options_and_metadata,
+                               fetch_list, target_list, fetch_handler)
     elif isinstance(fetches, ops.Operation):
       # Special case for fetching a single operation, because the
       # function will have no return value.
       assert not fetch_list
       assert len(target_list) == 1
+
       def _single_operation_run():
         with errors.raise_exception_on_not_ok_status() as status:
           if self._created_with_new_api:
-            tf_session.TF_SessionRun_wrapper(
-                self._session, None, {}, [], target_list, None, status)
+            tf_session.TF_SessionRun_wrapper(self._session, None, {}, [],
+                                             target_list, None, status)
           else:
-            tf_session.TF_Run(
-                self._session, None, {}, [], target_list, status, None)
+            tf_session.TF_Run(self._session, None, {}, [], target_list, status,
+                              None)
+
       return _single_operation_run
     elif isinstance(fetches, ops.Tensor):
       # Special case for fetching a single tensor, because the
       # function can return the result of `TF_Run()` directly.
       assert len(fetch_list) == 1
       assert not target_list
+
       def _single_tensor_run():
         with errors.raise_exception_on_not_ok_status() as status:
           if self._created_with_new_api:
             results = tf_session.TF_SessionRun_wrapper(
                 self._session, None, {}, fetch_list, [], None, status)
           else:
-            results = tf_session.TF_Run(
-                self._session, None, {}, fetch_list, [], status, None)
+            results = tf_session.TF_Run(self._session, None, {}, fetch_list, [],
+                                        status, None)
         return results[0]
+
       return _single_tensor_run
     else:
       # In all other cases, we must use `fetch_handler` to build the
@@ -1274,16 +1284,17 @@ class BaseSession(SessionInterface):
             results = tf_session.TF_SessionRun_wrapper(
                 self._session, None, {}, fetch_list, target_list, None, status)
           else:
-            results = tf_session.TF_Run(
-                self._session, None, {}, fetch_list, target_list, status, None)
+            results = tf_session.TF_Run(self._session, None, {}, fetch_list,
+                                        target_list, status, None)
         return fetch_handler.build_results(self, results)
+
       return _fetch_handler_run
 
   # Captures the name of a node in an error status.
   _NODEDEF_NAME_RE = re.compile(r'\[\[Node: ([^ ]*?) =')
 
-  def _do_run(self, handle, target_list, fetch_list, feed_dict,
-              options, run_metadata):
+  def _do_run(self, handle, target_list, fetch_list, feed_dict, options,
+              run_metadata):
     """Runs a step based on the given fetches and feeds.
 
     Args:
@@ -1320,13 +1331,12 @@ class BaseSession(SessionInterface):
       self._extend_graph()
       with errors.raise_exception_on_not_ok_status() as status:
         if self._created_with_new_api:
-          return tf_session.TF_SessionRun_wrapper(
-              session, options, feed_dict, fetch_list, target_list,
-              run_metadata, status)
+          return tf_session.TF_SessionRun_wrapper(session, options, feed_dict,
+                                                  fetch_list, target_list,
+                                                  run_metadata, status)
         else:
-          return tf_session.TF_Run(session, options,
-                                   feed_dict, fetch_list, target_list,
-                                   status, run_metadata)
+          return tf_session.TF_Run(session, options, feed_dict, fetch_list,
+                                   target_list, status, run_metadata)
 
     def _prun_fn(session, handle, feed_dict, fetch_list):
       if target_list:
@@ -1365,20 +1375,20 @@ class BaseSession(SessionInterface):
   def _extend_graph(self):
     # Nothing to do if we're using the new session interface
     # TODO(skyewm): remove this function altogether eventually
-    if self._created_with_new_api: return
+    if self._created_with_new_api:
+      return
 
     # Ensure any changes to the graph are reflected in the runtime.
     with self._extend_lock:
       if self._graph.version > self._current_version:
         # pylint: disable=protected-access
         graph_def, self._current_version = self._graph._as_graph_def(
-            from_version=self._current_version,
-            add_shapes=self._add_shapes)
+            from_version=self._current_version, add_shapes=self._add_shapes)
         # pylint: enable=protected-access
 
         with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.TF_ExtendGraph(
-              self._session, graph_def.SerializeToString(), status)
+          tf_session.TF_ExtendGraph(self._session,
+                                    graph_def.SerializeToString(), status)
         self._opened = True
 
   # The threshold to run garbage collection to delete dead tensors.
@@ -1398,9 +1408,8 @@ class BaseSession(SessionInterface):
       feeds = {}
       fetches = []
       for deleter_key, tensor_handle in enumerate(tensors_to_delete):
-        holder, deleter = session_ops._get_handle_deleter(self.graph,
-                                                          deleter_key,
-                                                          tensor_handle)
+        holder, deleter = session_ops._get_handle_deleter(
+            self.graph, deleter_key, tensor_handle)
         feeds[holder] = tensor_handle
         fetches.append(deleter)
       self.run(fetches, feed_dict=feeds)
@@ -1471,7 +1480,8 @@ class Session(BaseSession):
     sess.run(...)
   ```
 
-  The [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+  The
+  [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
   protocol buffer exposes various configuration options for a
   session. For example, to create a session that uses soft constraints
   for device placement, and log the resulting placement decisions,
@@ -1502,7 +1512,8 @@ class Session(BaseSession):
         @{$distributed$Distributed TensorFlow}
         for more examples.
       graph: (Optional.) The `Graph` to be launched (described above).
-      config: (Optional.) A [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+      config: (Optional.) A
+        [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
         protocol buffer with configuration options for the session.
 
     """
@@ -1526,8 +1537,8 @@ class Session(BaseSession):
   def __exit__(self, exec_type, exec_value, exec_tb):
     if exec_type is errors.OpError:
       logging.error('Session closing due to OpError: %s', (exec_value,))
-    self._default_session_context_manager.__exit__(
-        exec_type, exec_value, exec_tb)
+    self._default_session_context_manager.__exit__(exec_type, exec_value,
+                                                   exec_tb)
     self._default_graph_context_manager.__exit__(exec_type, exec_value, exec_tb)
 
     self._default_session_context_manager = None
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index c579fba339..768a5db88a 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Tests for tensorflow.python.client.session.Session."""
 from __future__ import absolute_import
 from __future__ import division
@@ -57,7 +56,6 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
-
 # NOTE(mrry): Dummy shape registration for ops used in the tests, since they
 # don't have C++ op registrations on which to attach C++ shape fns.
 ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
@@ -95,14 +93,18 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(arr, copy_val)
       # Test without feed.
       copy_val = copy.eval()
-      self.assertAllEqual(np.asarray([[10.0, 10.0, 10.0], [10.0, 10.0, 10.0]],
-                                     dtype=np.float32), copy_val)
+      self.assertAllEqual(
+          np.asarray(
+              [[10.0, 10.0, 10.0], [10.0, 10.0, 10.0]], dtype=np.float32),
+          copy_val)
 
   def testManyCPUs(self):
     # TODO(keveman): Implement ListDevices and test for the number of
     # devices returned by ListDevices.
     with session.Session(
-        config=config_pb2.ConfigProto(device_count={'CPU': 2})):
+        config=config_pb2.ConfigProto(device_count={
+            'CPU': 2
+        })):
       inp = constant_op.constant(10.0, name='W1')
       self.assertAllEqual(inp.eval(), 10.0)
 
@@ -161,20 +163,23 @@ class SessionTest(test_util.TensorFlowTestCase):
       def exc_predicate(e):
         return (e.op is None and e.node_def is None and
                 e.error_code == error_codes_pb2.INVALID_ARGUMENT)
+
       with self.assertRaisesOpError(exc_predicate):
         # Run with a bogus handle.
         s.partial_run('foo', r1, feed_dict={a: 1, b: 2})
 
   def testOpConstructionErrorPayload(self):
-    if ops._USE_C_API: return  # No shape registration for 'ConstructionFails'
+    if ops._USE_C_API:
+      return  # No shape registration for 'ConstructionFails'
 
     with session.Session():
       failing_op = ops.get_default_graph().create_op(
           'ConstructionFails', [], [], name='f')
 
       def exc_predicate(e):
-        return (e.op == failing_op
-                and e.error_code == error_codes_pb2.INVALID_ARGUMENT)
+        return (e.op == failing_op and
+                e.error_code == error_codes_pb2.INVALID_ARGUMENT)
+
       with self.assertRaisesOpError(exc_predicate):
         failing_op.run()
 
@@ -191,9 +196,9 @@ class SessionTest(test_util.TensorFlowTestCase):
       # pylint: enable=protected-access
 
       def exc_predicate(e):
-        return (e.op == c.op
-                and e.op._original_op == b.op
-                and e.op._original_op._original_op == a.op)
+        return (e.op == c.op and e.op._original_op == b.op and
+                e.op._original_op._original_op == a.op)
+
       with self.assertRaisesOpError(exc_predicate):
         c.eval()
 
@@ -341,8 +346,12 @@ class SessionTest(test_util.TensorFlowTestCase):
       b = control_flow_ops.no_op()  # An op, not a tensor.
       c = constant_op.constant(c_val)
       # List of lists, tuples, namedtuple, and dict
-      res = sess.run([[a, b, c], (a, b, c), ABC(a=a, b=b, c=c),
-                      {'a': a.name, 'c': c, 'b': b}])
+      res = sess.run([[a, b, c], (a, b, c),
+                      ABC(a=a, b=b, c=c), {
+                          'a': a.name,
+                          'c': c,
+                          'b': b
+                      }])
       self.assertTrue(isinstance(res, list))
       self.assertEqual(4, len(res))
       self.assertTrue(isinstance(res[0], list))
@@ -365,8 +374,11 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(b_val, res[3]['b'])
       self.assertEqual(c_val, res[3]['c'])
       # Tuple of lists, tuples, namedtuple, and dict
-      res = sess.run(([a, b, c], (a.name, b, c), ABC(a=a, b=b, c=c),
-                      {'a': a, 'c': c, 'b': b}))
+      res = sess.run(([a, b, c], (a.name, b, c), ABC(a=a, b=b, c=c), {
+          'a': a,
+          'c': c,
+          'b': b
+      }))
       self.assertTrue(isinstance(res, tuple))
       self.assertEqual(4, len(res))
       self.assertTrue(isinstance(res[0], list))
@@ -389,10 +401,16 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(b_val, res[3]['b'])
       self.assertEqual(c_val, res[3]['c'])
       # Namedtuple of lists, tuples, namedtuples, and dict
-      res = sess.run(DEFG(d=[a, b, c],
-                          e=(a, b, c),
-                          f=ABC(a=a.name, b=b, c=c),
-                          g={'a': a, 'c': c, 'b': b}))
+      res = sess.run(
+          DEFG(
+              d=[a, b, c],
+              e=(a, b, c),
+              f=ABC(a=a.name, b=b, c=c),
+              g={
+                  'a': a,
+                  'c': c,
+                  'b': b
+              }))
       self.assertTrue(isinstance(res, DEFG))
       self.assertTrue(isinstance(res.d, list))
       self.assertEqual(3, len(res.d))
@@ -414,10 +432,16 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(b_val, res.g['b'])
       self.assertEqual(c_val, res.g['c'])
       # Dict of lists, tuples, namedtuples, and dict
-      res = sess.run({'d': [a, b, c],
-                      'e': (a, b, c),
-                      'f': ABC(a=a, b=b, c=c),
-                      'g': {'a': a.name, 'c': c, 'b': b}})
+      res = sess.run({
+          'd': [a, b, c],
+          'e': (a, b, c),
+          'f': ABC(a=a, b=b, c=c),
+          'g': {
+              'a': a.name,
+              'c': c,
+              'b': b
+          }
+      })
       self.assertTrue(isinstance(res, dict))
       self.assertEqual(4, len(res))
       self.assertTrue(isinstance(res['d'], list))
@@ -516,8 +540,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       values = np.array([1.0, 2.0]).astype(np.float32)
       shape = np.array([7, 9, 2]).astype(np.int64)
       sp = sparse_tensor.SparseTensor(
-          constant_op.constant(indices),
-          constant_op.constant(values),
+          constant_op.constant(indices), constant_op.constant(values),
           constant_op.constant(shape))
       # Single fetch, use as tuple
       sp_out = s.run(sp)
@@ -587,14 +610,17 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp = sparse_tensor.SparseTensor(
           array_ops.placeholder(dtype=np.int64, shape=(2, 3)),
           array_ops.placeholder(dtype=np.float32, shape=(2,)),
-          array_ops.placeholder(dtype=np.int64, shape=(3,)),)
+          array_ops.placeholder(dtype=np.int64, shape=(3,)),
+      )
       sp_indices = array_ops.identity(sp.indices)
       sp_values = array_ops.identity(sp.values)
       sp_shape = array_ops.identity(sp.dense_shape)
       sp2 = sparse_tensor.SparseTensor(sp_indices, sp_values, sp_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
@@ -605,20 +631,23 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(sp_out.dense_shape, shape)
       # Feed with SparseTensorValue
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape],
-          {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue, fetch SparseTensorValue
-      sp2_out = s.run(
-          sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp2_out = s.run(sp2, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp2_out.indices, indices)
       self.assertAllEqual(sp2_out.values, values)
       self.assertAllEqual(sp2_out.dense_shape, shape)
       # Feed SparseTensorValue and fetch sp directly.
-      sp_out = s.run(
-          sp, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp_out = s.run(sp, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp_out.indices, indices)
       self.assertAllEqual(sp_out.values, values)
       self.assertAllEqual(sp_out.dense_shape, shape)
@@ -635,20 +664,24 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp2 = sparse_tensor.SparseTensor(sp_indices, sp_values, sp_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape],
-          {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue, fetch SparseTensorValue
-      sp2_out = s.run(
-          sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp2_out = s.run(sp2, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp2_out.indices, indices)
       self.assertAllEqual(sp2_out.values, values)
       self.assertAllEqual(sp2_out.dense_shape, shape)
@@ -666,20 +699,24 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp2 = sparse_tensor.SparseTensor(sp_indices, sp_values, sp_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape],
-          {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue, fetch SparseTensorValue
-      sp2_out = s.run(
-          sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp2_out = s.run(sp2, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp2_out.indices, indices)
       self.assertAllEqual(sp2_out.values, values)
       self.assertAllEqual(sp2_out.dense_shape, shape)
@@ -689,9 +726,8 @@ class SessionTest(test_util.TensorFlowTestCase):
       indices = np.array([[3, 2, 0], [4, 5, 1]]).astype(np.int64)
       values = np.array([1.0, 2.0]).astype(np.float32)
       shape = np.array([7, 9, 2]).astype(np.int64)
-      sp = array_ops.sparse_placeholder(dtype=np.float32,
-                                        shape=shape,
-                                        name='placeholder1')
+      sp = array_ops.sparse_placeholder(
+          dtype=np.float32, shape=shape, name='placeholder1')
       self.assertAllEqual(sp.dense_shape.eval(session=s), shape)
       self.assertAllEqual(tensor_util.constant_value(sp.dense_shape), shape)
       sp_indices = array_ops.identity(sp.indices)
@@ -699,7 +735,9 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp_shape = array_ops.identity(sp.dense_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
@@ -745,33 +783,34 @@ class SessionTest(test_util.TensorFlowTestCase):
       indices = np.array([[3, 2, 0], [4, 5, 1]]).astype(np.int64)
       dense_shape = np.array([7, 9, 2]).astype(np.int64)
       ind = ops.IndexedSlices(
-          array_ops.placeholder(dtype=np.float32,
-                                shape=(2,)),
-          array_ops.placeholder(dtype=np.int64,
-                                shape=(2, 3)),
-          array_ops.placeholder(dtype=np.int64,
-                                shape=(3,)),)
+          array_ops.placeholder(dtype=np.float32, shape=(2,)),
+          array_ops.placeholder(dtype=np.int64, shape=(2, 3)),
+          array_ops.placeholder(dtype=np.int64, shape=(3,)),
+      )
       ind_values = array_ops.identity(ind.values)
       ind_indices = array_ops.identity(ind.indices)
       ind_dense_shape = array_ops.identity(ind.dense_shape)
       ind2 = ops.IndexedSlices(ind_values, ind_indices, ind_dense_shape)
       # Feed with tuple
       values_out, indices_out, dense_shape_out = s.run(
-          [ind_values, ind_indices, ind_dense_shape],
-          {ind: (values, indices, dense_shape)})
+          [ind_values, ind_indices, ind_dense_shape], {
+              ind: (values, indices, dense_shape)
+          })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(dense_shape_out, dense_shape)
       # Feed with IndexedSlicesValue
       values_out, indices_out, dense_shape_out = s.run(
-          [ind_values, ind_indices, ind_dense_shape],
-          {ind: ops.IndexedSlicesValue(values, indices, dense_shape)})
+          [ind_values, ind_indices, ind_dense_shape], {
+              ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+          })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(dense_shape_out, dense_shape)
       # Feed with IndexedSlicesValue, fetch IndexedSlicesValue
-      ind2_out = s.run(ind2, {ind: ops.IndexedSlicesValue(values, indices,
-                                                          dense_shape)})
+      ind2_out = s.run(ind2, {
+          ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+      })
       self.assertAllEqual(ind2_out.values, values)
       self.assertAllEqual(ind2_out.indices, indices)
       self.assertAllEqual(ind2_out.dense_shape, dense_shape)
@@ -816,28 +855,27 @@ class SessionTest(test_util.TensorFlowTestCase):
       indices = np.array([[3, 2, 0], [4, 5, 1]]).astype(np.int64)
       dense_shape = None
       ind = ops.IndexedSlices(
-          array_ops.placeholder(dtype=np.float32,
-                                shape=(2,)),
-          array_ops.placeholder(dtype=np.int64,
-                                shape=(2, 3)),
-          None)
+          array_ops.placeholder(dtype=np.float32, shape=(2,)),
+          array_ops.placeholder(dtype=np.int64, shape=(2, 3)), None)
       ind_values = array_ops.identity(ind.values)
       ind_indices = array_ops.identity(ind.indices)
       ind2 = ops.IndexedSlices(ind_values, ind_indices)
       # Feed with tuple
-      values_out, indices_out = s.run(
-          [ind_values, ind_indices], {ind: (values, indices)})
+      values_out, indices_out = s.run([ind_values, ind_indices], {
+          ind: (values, indices)
+      })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       # Feed with IndexedSlicesValue
-      values_out, indices_out = s.run(
-          [ind_values, ind_indices],
-          {ind: ops.IndexedSlicesValue(values, indices, dense_shape)})
+      values_out, indices_out = s.run([ind_values, ind_indices], {
+          ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+      })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       # Feed with IndexedSlicesValue, fetch IndexedSlicesValue
-      ind2_out = s.run(ind2, {ind: ops.IndexedSlicesValue(values, indices,
-                                                          dense_shape)})
+      ind2_out = s.run(ind2, {
+          ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+      })
       self.assertAllEqual(ind2_out.values, values)
       self.assertAllEqual(ind2_out.indices, indices)
       self.assertAllEqual(ind2_out.dense_shape, dense_shape)
@@ -986,8 +1024,9 @@ class SessionTest(test_util.TensorFlowTestCase):
     constructed_events = [threading.Event() for _ in range(10)]
     continue_event = threading.Event()
     for i, constructed_event in enumerate(constructed_events):
-      t = self.checkedThread(target=self._testDefaultGraphInThread,
-                             args=(constructed_event, continue_event, i))
+      t = self.checkedThread(
+          target=self._testDefaultGraphInThread,
+          args=(constructed_event, continue_event, i))
       threads.append(t)
     for t in threads:
       t.start()
@@ -1006,6 +1045,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         ev.wait()
         val = c.eval(session=sess)
         self.assertEqual(val, 5.0)
+
       threads = [self.checkedThread(target=run_step) for _ in range(100)]
       for t in threads:
         t.start()
@@ -1038,11 +1078,10 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testGraphDef(self):
     with session.Session() as sess:
-      self.assertProtoEquals(
-          'versions { producer: %d min_consumer: %d }' % (
-              versions.GRAPH_DEF_VERSION,
-              versions.GRAPH_DEF_VERSION_MIN_CONSUMER),
-          sess.graph_def)
+      self.assertProtoEquals('versions { producer: %d min_consumer: %d }' %
+                             (versions.GRAPH_DEF_VERSION,
+                              versions.GRAPH_DEF_VERSION_MIN_CONSUMER),
+                             sess.graph_def)
       c = constant_op.constant(5.0, name='c')
       self.assertEquals(len(sess.graph_def.node), 1)
       d = constant_op.constant(6.0, name='d')
@@ -1072,6 +1111,7 @@ class SessionTest(test_util.TensorFlowTestCase):
             lambda e: 'Attempted to use a closed Session.' in str(e)):
           while True:
             sess.run(c)
+
       t = threading.Thread(target=update_thread)
       t.start()
       time.sleep(0.1)
@@ -1177,17 +1217,11 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testFeedAndFetch(self):
     with session.Session() as sess:
-      for dtype in [dtypes.float16,
-                    dtypes.float32,
-                    dtypes.float64,
-                    dtypes.int32,
-                    dtypes.uint8,
-                    dtypes.int16,
-                    dtypes.int8,
-                    dtypes.int64,
-                    dtypes.bool,
-                    dtypes.complex64,
-                    dtypes.complex128]:
+      for dtype in [
+          dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+          dtypes.uint8, dtypes.int16, dtypes.int8, dtypes.int64, dtypes.bool,
+          dtypes.complex64, dtypes.complex128
+      ]:
         for shape in [(32, 4, 128), (37,), (2, 0, 6), (0, 0, 0)]:
           np_dtype = dtype.as_numpy_dtype
 
@@ -1206,13 +1240,19 @@ class SessionTest(test_util.TensorFlowTestCase):
             np_array = np_array.astype(np_dtype)
 
           self.assertAllEqual(np_array,
-                              sess.run(out_t, feed_dict={feed_t: np_array}))
+                              sess.run(out_t, feed_dict={
+                                  feed_t: np_array
+                              }))
           # Check that we can also get the feed back.
           self.assertAllEqual(np_array,
-                              sess.run(feed_t, feed_dict={feed_t: np_array}))
+                              sess.run(feed_t, feed_dict={
+                                  feed_t: np_array
+                              }))
           # Also check that we can get both back.
-          out_v, feed_v = sess.run([out_t, feed_t],
-                                   feed_dict={feed_t: np_array})
+          out_v, feed_v = sess.run(
+              [out_t, feed_t], feed_dict={
+                  feed_t: np_array
+              })
           self.assertAllEqual(np_array, out_v)
           self.assertAllEqual(np_array, feed_v)
 
@@ -1257,9 +1297,11 @@ class SessionTest(test_util.TensorFlowTestCase):
           trace_level=config_pb2.RunOptions.FULL_TRACE)
       run_metadata = config_pb2.RunMetadata()
       self.assertEqual(0, len(run_metadata.step_stats.dev_stats))
-      self.assertAllClose(
-          42.0,
-          tensor_runner(41.0, options=run_options, run_metadata=run_metadata))
+      self.assertAllClose(42.0,
+                          tensor_runner(
+                              41.0,
+                              options=run_options,
+                              run_metadata=run_metadata))
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
   def testFeedError(self):
@@ -1296,8 +1338,9 @@ class SessionTest(test_util.TensorFlowTestCase):
         size = 1
         for s in shape:
           size *= s
-        c_list = np.array([compat.as_bytes(str(i)) for i in xrange(size)],
-                          dtype=np.object).reshape(shape) if size > 0 else []
+        c_list = np.array(
+            [compat.as_bytes(str(i)) for i in xrange(size)],
+            dtype=np.object).reshape(shape) if size > 0 else []
         c = constant_op.constant(c_list)
         self.assertAllEqual(c.eval(), c_list)
 
@@ -1307,13 +1350,16 @@ class SessionTest(test_util.TensorFlowTestCase):
         size = 1
         for s in shape:
           size *= s
-        c_list = np.array([compat.as_bytes(str(i)) for i in xrange(size)],
-                          dtype=np.object).reshape(shape)
+        c_list = np.array(
+            [compat.as_bytes(str(i)) for i in xrange(size)],
+            dtype=np.object).reshape(shape)
         feed_t = array_ops.placeholder(dtype=dtypes.string, shape=shape)
         c = array_ops.identity(feed_t)
         self.assertAllEqual(sess.run(c, feed_dict={feed_t: c_list}), c_list)
-        self.assertAllEqual(sess.run(feed_t, feed_dict={feed_t: c_list}),
-                            c_list)
+        self.assertAllEqual(
+            sess.run(feed_t, feed_dict={
+                feed_t: c_list
+            }), c_list)
         c_v, feed_v = sess.run([c, feed_t], feed_dict={feed_t: c_list})
         self.assertAllEqual(c_v, c_list)
         self.assertAllEqual(feed_v, c_list)
@@ -1329,8 +1375,10 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testStringFeedWithUnicode(self):
     with session.Session():
-      c_list = [u'\n\x01\x00', u'\n\x00\x01',
-                u'\u26a3 unicode', u'\U0001f60e deal with it']
+      c_list = [
+          u'\n\x01\x00', u'\n\x00\x01', u'\u26a3 unicode',
+          u'\U0001f60e deal with it'
+      ]
       feed_t = array_ops.placeholder(dtype=dtypes.string, shape=[len(c_list)])
       c = array_ops.identity(feed_t)
 
@@ -1423,9 +1471,10 @@ class SessionTest(test_util.TensorFlowTestCase):
         sess.run(constant_op.constant(1.0), run_metadata=run_metadata)
         self.assertTrue(not run_metadata.HasField('step_stats'))
 
-        sess.run(constant_op.constant(1.0),
-                 options=run_options,
-                 run_metadata=run_metadata)
+        sess.run(
+            constant_op.constant(1.0),
+            options=run_options,
+            run_metadata=run_metadata)
 
         self.assertTrue(run_metadata.HasField('step_stats'))
         self.assertEquals(len(run_metadata.step_stats.dev_stats), 1)
@@ -1439,23 +1488,26 @@ class SessionTest(test_util.TensorFlowTestCase):
       with session.Session() as sess:
         # all combinations are valid
         sess.run(constant_op.constant(1.0), options=None, run_metadata=None)
-        sess.run(constant_op.constant(1.0), options=None,
-                 run_metadata=run_metadata)
+        sess.run(
+            constant_op.constant(1.0), options=None, run_metadata=run_metadata)
         self.assertTrue(not run_metadata.HasField('step_stats'))
 
-        sess.run(constant_op.constant(1.0), options=run_options,
-                 run_metadata=None)
+        sess.run(
+            constant_op.constant(1.0), options=run_options, run_metadata=None)
         self.assertTrue(not run_metadata.HasField('step_stats'))
 
-        sess.run(constant_op.constant(1.0), options=run_options,
-                 run_metadata=run_metadata)
+        sess.run(
+            constant_op.constant(1.0),
+            options=run_options,
+            run_metadata=run_metadata)
 
         self.assertTrue(run_metadata.HasField('step_stats'))
         self.assertEquals(len(run_metadata.step_stats.dev_stats), 1)
 
   def testFeedShapeCompatibility(self):
     # TODO(nolivia): C API doesn't yet handle marking nodes as not feedable.
-    if ops._USE_C_API: return
+    if ops._USE_C_API:
+      return
 
     with session.Session() as sess:
       some_tensor = constant_op.constant([2.0, 2.0, 2.0, 2.0])
@@ -1499,8 +1551,11 @@ class SessionTest(test_util.TensorFlowTestCase):
         d = math_ops.multiply(c, c)
       for step in xrange(120):
         run_metadata = config_pb2.RunMetadata()
-        sess.run(d, feed_dict={a: 1.0},
-                 options=run_options, run_metadata=run_metadata)
+        sess.run(
+            d,
+            feed_dict={a: 1.0},
+            options=run_options,
+            run_metadata=run_metadata)
         if step == 99:
           self.assertTrue(run_metadata.HasField('cost_graph'))
         else:
@@ -1569,8 +1624,7 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testTimeoutWithShortOperations(self):
     num_epochs = 5
-    q = data_flow_ops.FIFOQueue(
-        capacity=50, dtypes=[dtypes.int32], shapes=[()])
+    q = data_flow_ops.FIFOQueue(capacity=50, dtypes=[dtypes.int32], shapes=[()])
     enqueue_op = q.enqueue_many(constant_op.constant([1, 2]))
 
     # Use a 10-second timeout, which should be longer than any
@@ -1582,7 +1636,9 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(q.size()), num_epochs * 2)
 
   def testRegisterFetchAndFeedConversionFunctions(self):
+
     class SquaredTensor(object):
+
       def __init__(self, tensor):
         self.sq = math_ops.square(tensor)
 
@@ -1591,24 +1647,27 @@ class SessionTest(test_util.TensorFlowTestCase):
     feed_fn2 = lambda feed: [feed.sq]
 
     session.register_session_run_conversion_functions(SquaredTensor, fetch_fn,
-        feed_fn1, feed_fn2)
+                                                      feed_fn1, feed_fn2)
     with self.assertRaises(ValueError):
-      session.register_session_run_conversion_functions(SquaredTensor,
-          fetch_fn, feed_fn1, feed_fn2)
+      session.register_session_run_conversion_functions(SquaredTensor, fetch_fn,
+                                                        feed_fn1, feed_fn2)
     with self.test_session() as sess:
       np1 = np.array([1.0, 1.5, 2.0, 2.5])
       np2 = np.array([3.0, 3.5, 4.0, 4.5])
       squared_tensor = SquaredTensor(np2)
       squared_eval = sess.run(squared_tensor)
       self.assertAllClose(np2 * np2, squared_eval)
-      squared_eval = sess.run(squared_tensor, feed_dict={
-        squared_tensor : np1 * np1})
+      squared_eval = sess.run(
+          squared_tensor, feed_dict={
+              squared_tensor: np1 * np1
+          })
       self.assertAllClose(np1 * np1, squared_eval)
       partial_run = sess.partial_run_setup([squared_tensor], [])
       squared_eval = sess.partial_run(partial_run, squared_tensor)
       self.assertAllClose(np2 * np2, squared_eval)
 
   def testDefaultLogDevicePlacement(self):
+
     class CaptureStderr(str):
       """Class to capture stderr from C++ shared library."""
 
@@ -1719,6 +1778,7 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def runTestAddFunctionToSession(self, target=''):
     """Add a function to a session after the graph has already been run."""
+
     @function.Defun(dtypes.float32)
     def foo(x):
       return x + 1
@@ -1753,6 +1813,7 @@ class SessionTest(test_util.TensorFlowTestCase):
           TypeError, 'Type of feed value 1 with type <(\w+) \'int\'> is not'):
         sess.run(a, feed_dict={a: 1})
 
+
 class GraphMutationTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -1803,8 +1864,7 @@ class GraphMutationTest(test_util.TensorFlowTestCase):
 
     with session.Session(graph=g) as sess:
       self.assertAllEqual(1.0, sess.run(b))
-      b.op._set_attr('DstT',
-                     attr_value_pb2.AttrValue(type=types_pb2.DT_FLOAT))
+      b.op._set_attr('DstT', attr_value_pb2.AttrValue(type=types_pb2.DT_FLOAT))
       with self.assertRaisesRegexp(
           errors.FailedPreconditionError,
           'Cast.*was changed by setting attribute after it was run'):
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index 75c0e61d47..8e5d8141a1 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -47,10 +47,9 @@ except ImportError:
 
 
 def _fill_array(arr, seq, fillvalue=0):
-  """
-  Recursively fills padded arr with elements from seq.
-  If length of seq is less than arr padded length, fillvalue used.
+  """Recursively fills padded arr with elements from seq.
 
+  If length of seq is less than arr padded length, fillvalue used.
   Args:
     arr: Padded tensor of shape [batch_size, ..., max_padded_dim_len].
     seq: Non-padded list of data sampels of shape
@@ -84,28 +83,30 @@ def _pad_if_needed(batch_key_item, fillvalue=0):
   Raises:
     ValueError if data samples have different shapes (except last padded dim).
   """
-  shapes = [seq.shape[:-1] if len(seq.shape) > 0 else -1
-            for seq in batch_key_item]
+  shapes = [
+      seq.shape[:-1] if len(seq.shape) > 0 else -1 for seq in batch_key_item
+  ]
   if not all(shapes[0] == x for x in shapes):
     raise ValueError("Array shapes must match.")
 
-  last_length = [seq.shape[-1] if len(seq.shape) > 0 else 0
-                 for seq in batch_key_item]
+  last_length = [
+      seq.shape[-1] if len(seq.shape) > 0 else 0 for seq in batch_key_item
+  ]
   if all([x == last_length[0] for x in last_length]):
     return batch_key_item
 
   batch_size = len(batch_key_item)
   max_sequence_length = max(last_length)
   result_batch = np.zeros(
-    shape=[batch_size] + list(shapes[0]) + [max_sequence_length],
-    dtype=batch_key_item[0].dtype)
+      shape=[batch_size] + list(shapes[0]) + [max_sequence_length],
+      dtype=batch_key_item[0].dtype)
   _fill_array(result_batch, batch_key_item, fillvalue)
   return result_batch
 
 
-def _get_integer_indices_for_next_batch(
-    batch_indices_start, batch_size, epoch_end, array_length,
-    current_epoch, total_epochs):
+def _get_integer_indices_for_next_batch(batch_indices_start, batch_size,
+                                        epoch_end, array_length, current_epoch,
+                                        total_epochs):
   """Returns the integer indices for next batch.
 
   If total epochs is not None and current epoch is the final epoch, the end
@@ -135,8 +136,9 @@ def _get_integer_indices_for_next_batch(
                                  "Already emitted %s epochs." % current_epoch)
 
   batch_indices_end = batch_indices_start + batch_size
-  batch_indices = [j % array_length for j in
-                   range(batch_indices_start, batch_indices_end)]
+  batch_indices = [
+      j % array_length for j in range(batch_indices_start, batch_indices_end)
+  ]
   epoch_end_indices = [i for i, x in enumerate(batch_indices) if x == epoch_end]
   current_epoch += len(epoch_end_indices)
 
@@ -320,16 +322,20 @@ class _GeneratorFeedFn(object):
           raise KeyError("key mismatch between dicts emitted by GenFun "
                          "Expected {} keys; got {}".format(
                              self._keys, data_row.keys()))
-        list_dict.setdefault(self._col_placeholders[index],
-                             list()).append(data_row[key])
+        list_dict.setdefault(self._col_placeholders[index], list()).append(
+            data_row[key])
         list_dict_size += 1
 
     if self._pad_value is not None:
-      feed_dict = {key: np.asarray(_pad_if_needed(item, self._pad_value))
-                   for key, item in list(list_dict.items())}
+      feed_dict = {
+          key: np.asarray(_pad_if_needed(item, self._pad_value))
+          for key, item in list(list_dict.items())
+      }
     else:
-      feed_dict = {key: np.asarray(item)
-                   for key, item in list(list_dict.items())}
+      feed_dict = {
+          key: np.asarray(item)
+          for key, item in list(list_dict.items())
+      }
     return feed_dict
 
 
@@ -382,9 +388,8 @@ def _enqueue_data(data,
       queue_shapes = [(), data.shape[1:]]
       get_feed_fn = _ArrayFeedFn
     elif isinstance(data, collections.OrderedDict):
-      types = [dtypes.int64] + [
-          dtypes.as_dtype(col.dtype) for col in data.values()
-      ]
+      types = [dtypes.int64
+              ] + [dtypes.as_dtype(col.dtype) for col in data.values()]
       queue_shapes = [()] + [col.shape[1:] for col in data.values()]
       get_feed_fn = _OrderedDictNumpyFeedFn
     elif isinstance(data, tp.FunctionType):
@@ -447,11 +452,11 @@ def _enqueue_data(data,
           seed=seed)
     elif pad_data:
       min_after_dequeue = 0  # just for the summary text
-      queue_shapes = list(map(
-        lambda x: tuple(list(x[:-1]) + [None]) if len(x) > 0 else x,
-        queue_shapes))
+      queue_shapes = list(
+          map(lambda x: tuple(list(x[:-1]) + [None]) if len(x) > 0 else x,
+              queue_shapes))
       queue = data_flow_ops.PaddingFIFOQueue(
-        capacity, dtypes=types, shapes=queue_shapes)
+          capacity, dtypes=types, shapes=queue_shapes)
     else:
       min_after_dequeue = 0  # just for the summary text
       queue = data_flow_ops.FIFOQueue(
@@ -470,31 +475,35 @@ def _enqueue_data(data,
 
       if not pad_data:
         feed_fns.append(
-          get_feed_fn(
-              placeholders,
-              data,
-              enqueue_size,
-              random_start=shuffle,
-              seed=seed_i,
-              num_epochs=num_epochs))
+            get_feed_fn(
+                placeholders,
+                data,
+                enqueue_size,
+                random_start=shuffle,
+                seed=seed_i,
+                num_epochs=num_epochs))
       else:
         feed_fns.append(
-          get_feed_fn(
-              placeholders,
-              data,
-              enqueue_size,
-              random_start=shuffle,
-              seed=seed_i,
-              num_epochs=num_epochs,
-              pad_value=pad_value))
+            get_feed_fn(
+                placeholders,
+                data,
+                enqueue_size,
+                random_start=shuffle,
+                seed=seed_i,
+                num_epochs=num_epochs,
+                pad_value=pad_value))
 
     runner = fqr._FeedingQueueRunner(  # pylint: disable=protected-access
-        queue=queue, enqueue_ops=enqueue_ops, feed_fns=feed_fns)
+        queue=queue,
+        enqueue_ops=enqueue_ops,
+        feed_fns=feed_fns)
     queue_runner.add_queue_runner(runner)
 
-    full = (math_ops.cast(
-        math_ops.maximum(0, queue.size() - min_after_dequeue),
-        dtypes.float32) * (1. / (capacity - min_after_dequeue)))
+    full = (
+        math_ops.cast(
+            math_ops.maximum(0,
+                             queue.size() - min_after_dequeue), dtypes.float32)
+        * (1. / (capacity - min_after_dequeue)))
     # Note that name contains a '/' at the end so we intentionally do not place
     # a '/' after %s below.
     summary_name = ("queue/%sfraction_over_%d_of_%d_full" %
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index ec6184aacd..a96b88d96f 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -82,7 +82,9 @@ class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
       matrix_ph = array_ops.placeholder(dtypes.int32)
       transposed = array_ops.matrix_transpose(matrix_ph)
       self.assertAllEqual(
-          expected_transposed, transposed.eval(feed_dict={matrix_ph: matrix}))
+          expected_transposed, transposed.eval(feed_dict={
+              matrix_ph: matrix
+          }))
 
   def testBatchMatrixDynamicallyDefined(self):
     matrix_0 = [[1, 2, 3], [4, 5, 6]]
@@ -96,7 +98,9 @@ class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
       transposed = array_ops.matrix_transpose(batch_matrix_ph)
       self.assertAllEqual(
           expected_transposed,
-          transposed.eval(feed_dict={batch_matrix_ph: batch_matrix}))
+          transposed.eval(feed_dict={
+              batch_matrix_ph: batch_matrix
+          }))
 
   def testTensorWithStaticRankLessThanTwoRaisesBecauseNotAMatrix(self):
     vector = [1, 2, 3]
@@ -203,8 +207,10 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
 
       masked_tensor = sess.run(
           array_ops.boolean_mask(ph_tensor, ph_mask),
-          feed_dict={ph_tensor: arr,
-                     ph_mask: mask})
+          feed_dict={
+              ph_tensor: arr,
+              ph_mask: mask
+          })
       np.testing.assert_allclose(masked_tensor, arr[mask])
 
   def testMaskDimensionsSetToNoneRaises(self):
@@ -280,7 +286,8 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
       for axis_dtype in [dtypes.int32, dtypes.int64]:
         with self.test_session(use_gpu=use_gpu):
           x_tf = array_ops.reverse_v2(x_np,
-              constant_op.constant([0], dtype=axis_dtype)).eval()
+                                      constant_op.constant(
+                                          [0], dtype=axis_dtype)).eval()
           self.assertAllEqual(x_tf, np.asarray(x_np)[::-1])
 
   def _reverse2DimAuto(self, np_dtype):
@@ -290,16 +297,17 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
       for use_gpu in [False, True]:
         for axis_dtype in [dtypes.int32, dtypes.int64]:
           with self.test_session(use_gpu=use_gpu):
-            x_tf_1 = reverse_f(x_np,
-                constant_op.constant([0], dtype=axis_dtype)).eval()
-            x_tf_2 = reverse_f(x_np,
-                constant_op.constant([-2], dtype=axis_dtype)).eval()
-            x_tf_3 = reverse_f(x_np,
-                constant_op.constant([1], dtype=axis_dtype)).eval()
-            x_tf_4 = reverse_f(x_np,
-                constant_op.constant([-1], dtype=axis_dtype)).eval()
+            x_tf_1 = reverse_f(x_np, constant_op.constant(
+                [0], dtype=axis_dtype)).eval()
+            x_tf_2 = reverse_f(x_np, constant_op.constant(
+                [-2], dtype=axis_dtype)).eval()
+            x_tf_3 = reverse_f(x_np, constant_op.constant(
+                [1], dtype=axis_dtype)).eval()
+            x_tf_4 = reverse_f(x_np, constant_op.constant(
+                [-1], dtype=axis_dtype)).eval()
             x_tf_5 = reverse_f(x_np,
-                constant_op.constant([1, 0], dtype=axis_dtype)).eval()
+                               constant_op.constant([1, 0],
+                                                    dtype=axis_dtype)).eval()
             self.assertAllEqual(x_tf_1, np.asarray(x_np)[::-1, :])
             self.assertAllEqual(x_tf_2, np.asarray(x_np)[::-1, :])
             self.assertAllEqual(x_tf_3, np.asarray(x_np)[:, ::-1])
@@ -324,18 +332,16 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
 
   def testReverse1DimAuto(self):
     for dtype in [
-        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64,
-        np.bool, np.float16, np.float32,
-        np.float64, np.complex64, np.complex128,
+        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.bool,
+        np.float16, np.float32, np.float64, np.complex64, np.complex128,
         np.array(b"").dtype.type
     ]:
       self._reverse1DimAuto(dtype)
 
   def testReverse2DimAuto(self):
     for dtype in [
-        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64,
-        np.bool, np.float16, np.float32,
-        np.float64, np.complex64, np.complex128,
+        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.bool,
+        np.float16, np.float32, np.float64, np.complex64, np.complex128,
         np.array(b"").dtype.type
     ]:
       self._reverse2DimAuto(dtype)
@@ -711,8 +717,8 @@ class GradSliceChecker(object):
     slice_val_grad2, = gradients_impl.gradients(
         slice_val_grad, dy, grad_ys=self.var)
     self.sess.run(assign)
-    slice_val_grad_evaled, slice_val_grad2_evaled = (self.sess.run(
-        [slice_val_grad, slice_val_grad2]))
+    slice_val_grad_evaled, slice_val_grad2_evaled = (
+        self.sess.run([slice_val_grad, slice_val_grad2]))
     analytic_grad2_evaled = analytic_grad2.eval()
     self.test.assertAllEqual(slice_val_grad2_evaled, analytic_grad2_evaled)
 
@@ -987,9 +993,10 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
     with self.test_session():
       res = array_ops.sequence_mask(constant_op.constant([1, 3, 2]), 5)
       self.assertAllEqual(res.get_shape(), [3, 5])
-      self.assertAllEqual(res.eval(), [[True, False, False, False, False],
-                                       [True, True, True, False, False],
-                                       [True, True, False, False, False]])
+      self.assertAllEqual(
+          res.eval(),
+          [[True, False, False, False, False], [True, True, True, False, False],
+           [True, True, False, False, False]])
 
       # test dtype and default maxlen:
       res = array_ops.sequence_mask(
@@ -998,17 +1005,17 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(res.get_shape().as_list(), [3, 4])
       else:
         self.assertAllEqual(res.get_shape().as_list(), [3, None])
-      self.assertAllEqual(res.eval(), [[0.0, 0.0, 0.0,
-                                        0.0], [1.0, 0.0, 0.0, 0.0],
-                                       [1.0, 1.0, 1.0, 1.0]])
+      self.assertAllEqual(
+          res.eval(),
+          [[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
 
   def testTwoDimensional(self):
     with self.test_session():
       res = array_ops.sequence_mask(constant_op.constant([[1, 3, 2]]), 5)
       self.assertAllEqual(res.get_shape(), [1, 3, 5])
-      self.assertAllEqual(res.eval(), [[[True, False, False, False, False],
-                                        [True, True, True, False, False],
-                                        [True, True, False, False, False]]])
+      self.assertAllEqual(res.eval(), [[[True, False, False, False, False], [
+          True, True, True, False, False
+      ], [True, True, False, False, False]]])
 
       # test dtype and default maxlen:
       res = array_ops.sequence_mask(
@@ -1017,12 +1024,10 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(res.get_shape().as_list(), [2, 3, 4])
       else:
         self.assertAllEqual(res.get_shape().as_list(), [2, 3, None])
-      self.assertAllEqual(res.eval(), [[[0.0, 0.0, 0.0, 0.0],
-                                        [1.0, 0.0, 0.0, 0.0],
-                                        [1.0, 1.0, 1.0, 1.0]],
-                                       [[1.0, 0.0, 0.0, 0.0],
-                                        [1.0, 1.0, 0.0, 0.0],
-                                        [1.0, 1.0, 1.0, 0.0]]])
+      self.assertAllEqual(
+          res.eval(),
+          [[[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]],
+           [[1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0]]])
 
   def testDtypes(self):
 
@@ -1031,9 +1036,10 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           constant_op.constant([1, 3, 2], dtype=lengths_dtype),
           constant_op.constant(5, dtype=maxlen_dtype))
       self.assertAllEqual(res.get_shape(), [3, 5])
-      self.assertAllEqual(res.eval(), [[True, False, False, False, False],
-                                       [True, True, True, False, False],
-                                       [True, True, False, False, False]])
+      self.assertAllEqual(
+          res.eval(),
+          [[True, False, False, False, False], [True, True, True, False, False],
+           [True, True, False, False, False]])
 
     with self.test_session():
       check_dtypes(dtypes.int32, dtypes.int32)
@@ -1088,13 +1094,14 @@ class PadTest(test_util.TensorFlowTestCase):
   def testEager(self):
     with context.eager_mode():
       t = constant_op.constant([[1, 2, 3], [4, 5, 6]])
-      paddings = constant_op.constant([[1, 1,], [2, 2]])
+      paddings = constant_op.constant([[
+          1,
+          1,
+      ], [2, 2]])
       padded = array_ops.pad(t, paddings, "CONSTANT")
       self.assertAllEqual(padded.numpy(),
-                          [[0, 0, 0, 0, 0, 0, 0],
-                           [0, 0, 1, 2, 3, 0, 0],
-                           [0, 0, 4, 5, 6, 0, 0],
-                           [0, 0, 0, 0, 0, 0, 0]])
+                          [[0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 3, 0, 0],
+                           [0, 0, 4, 5, 6, 0, 0], [0, 0, 0, 0, 0, 0, 0]])
 
 
 class InvertPermutationTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index 6cfa9b37fe..0825d8fc6b 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -84,11 +84,8 @@ class MatrixSetDiagTest(test.TestCase):
   def testSquare(self):
     with self.test_session(use_gpu=True):
       v = np.array([1.0, 2.0, 3.0])
-      mat = np.array([[0.0, 1.0, 0.0],
-                      [1.0, 0.0, 1.0],
-                      [1.0, 1.0, 1.0]])
-      mat_set_diag = np.array([[1.0, 1.0, 0.0],
-                               [1.0, 2.0, 1.0],
+      mat = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]])
+      mat_set_diag = np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0],
                                [1.0, 1.0, 3.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((3, 3), output.get_shape())
@@ -135,19 +132,12 @@ class MatrixSetDiagTest(test.TestCase):
 
   def testRectangularBatch(self):
     with self.test_session(use_gpu=True):
-      v_batch = np.array([[-1.0, -2.0],
-                          [-4.0, -5.0]])
-      mat_batch = np.array(
-          [[[1.0, 0.0, 3.0],
-            [0.0, 2.0, 0.0]],
-           [[4.0, 0.0, 4.0],
-            [0.0, 5.0, 0.0]]])
-
-      mat_set_diag_batch = np.array(
-          [[[-1.0, 0.0, 3.0],
-            [0.0, -2.0, 0.0]],
-           [[-4.0, 0.0, 4.0],
-            [0.0, -5.0, 0.0]]])
+      v_batch = np.array([[-1.0, -2.0], [-4.0, -5.0]])
+      mat_batch = np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]],
+                            [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]])
+
+      mat_set_diag_batch = np.array([[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0]],
+                                     [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]])
       output = array_ops.matrix_set_diag(mat_batch, v_batch)
       self.assertEqual((2, 2, 3), output.get_shape())
       self.assertAllEqual(mat_set_diag_batch, output.eval())
@@ -178,10 +168,14 @@ class MatrixSetDiagTest(test.TestCase):
             np.random.rand(*diag_shape), dtype=dtypes_lib.float32)
         y = array_ops.matrix_set_diag(x, x_diag)
         error_x = gradient_checker.compute_gradient_error(
-            x, x.get_shape().as_list(), y, y.get_shape().as_list())
+            x,
+            x.get_shape().as_list(), y,
+            y.get_shape().as_list())
         self.assertLess(error_x, 1e-4)
         error_x_diag = gradient_checker.compute_gradient_error(
-            x_diag, x_diag.get_shape().as_list(), y, y.get_shape().as_list())
+            x_diag,
+            x_diag.get_shape().as_list(), y,
+            y.get_shape().as_list())
         self.assertLess(error_x_diag, 1e-4)
 
   def testGradWithNoShapeInformation(self):
@@ -192,12 +186,13 @@ class MatrixSetDiagTest(test.TestCase):
       output = array_ops.matrix_set_diag(mat, v)
       grads = gradients_impl.gradients(output, [mat, v], grad_ys=grad_input)
       grad_input_val = np.random.rand(3, 3).astype(np.float32)
-      grad_vals = sess.run(grads,
-                           feed_dict={
-                               v: 2 * np.ones(3),
-                               mat: np.ones((3, 3)),
-                               grad_input: grad_input_val
-                           })
+      grad_vals = sess.run(
+          grads,
+          feed_dict={
+              v: 2 * np.ones(3),
+              mat: np.ones((3, 3)),
+              grad_input: grad_input_val
+          })
       self.assertAllEqual(np.diag(grad_input_val), grad_vals[1])
       self.assertAllEqual(grad_input_val - np.diag(np.diag(grad_input_val)),
                           grad_vals[0])
@@ -242,13 +237,9 @@ class MatrixDiagPartTest(test.TestCase):
 
   def testRectangularBatch(self):
     with self.test_session(use_gpu=True):
-      v_batch = np.array([[1.0, 2.0],
-                          [4.0, 5.0]])
-      mat_batch = np.array(
-          [[[1.0, 0.0, 0.0],
-            [0.0, 2.0, 0.0]],
-           [[4.0, 0.0, 0.0],
-            [0.0, 5.0, 0.0]]])
+      v_batch = np.array([[1.0, 2.0], [4.0, 5.0]])
+      mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 2.0, 0.0]],
+                            [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0]]])
       self.assertEqual(mat_batch.shape, (2, 2, 3))
       mat_batch_diag = array_ops.matrix_diag_part(mat_batch)
       self.assertEqual((2, 2), mat_batch_diag.get_shape())
@@ -301,19 +292,13 @@ class DiagTest(test.TestCase):
 
   def testRankOneIntTensor(self):
     x = np.array([1, 2, 3])
-    expected_ans = np.array(
-        [[1, 0, 0],
-         [0, 2, 0],
-         [0, 0, 3]])
+    expected_ans = np.array([[1, 0, 0], [0, 2, 0], [0, 0, 3]])
     self.diagOp(x, np.int32, expected_ans)
     self.diagOp(x, np.int64, expected_ans)
 
   def testRankOneFloatTensor(self):
     x = np.array([1.1, 2.2, 3.3])
-    expected_ans = np.array(
-        [[1.1, 0, 0],
-         [0, 2.2, 0],
-         [0, 0, 3.3]])
+    expected_ans = np.array([[1.1, 0, 0], [0, 2.2, 0], [0, 0, 3.3]])
     self.diagOp(x, np.float32, expected_ans)
     self.diagOp(x, np.float64, expected_ans)
 
@@ -321,123 +306,105 @@ class DiagTest(test.TestCase):
     for dtype in [np.complex64, np.complex128]:
       x = np.array([1.1 + 1.1j, 2.2 + 2.2j, 3.3 + 3.3j], dtype=dtype)
       expected_ans = np.array(
-          [[1.1 + 1.1j, 0 + 0j, 0 + 0j],
-           [0 + 0j, 2.2 + 2.2j, 0 + 0j],
-           [0 + 0j, 0 + 0j, 3.3 + 3.3j]], dtype=dtype)
+          [[1.1 + 1.1j, 0 + 0j, 0 + 0j], [0 + 0j, 2.2 + 2.2j, 0 + 0j],
+           [0 + 0j, 0 + 0j, 3.3 + 3.3j]],
+          dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testRankTwoIntTensor(self):
     x = np.array([[1, 2, 3], [4, 5, 6]])
-    expected_ans = np.array(
-        [[[[1, 0, 0], [0, 0, 0]],
-          [[0, 2, 0], [0, 0, 0]],
-          [[0, 0, 3], [0, 0, 0]]],
-         [[[0, 0, 0], [4, 0, 0]],
-          [[0, 0, 0], [0, 5, 0]],
-          [[0, 0, 0], [0, 0, 6]]]])
+    expected_ans = np.array([[[[1, 0, 0], [0, 0, 0]], [[0, 2, 0], [0, 0, 0]],
+                              [[0, 0, 3], [0, 0, 0]]],
+                             [[[0, 0, 0], [4, 0, 0]], [[0, 0, 0], [0, 5, 0]],
+                              [[0, 0, 0], [0, 0, 6]]]])
     self.diagOp(x, np.int32, expected_ans)
     self.diagOp(x, np.int64, expected_ans)
 
   def testRankTwoFloatTensor(self):
     x = np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]])
     expected_ans = np.array(
-        [[[[1.1, 0, 0], [0, 0, 0]],
-          [[0, 2.2, 0], [0, 0, 0]],
-          [[0, 0, 3.3], [0, 0, 0]]],
-         [[[0, 0, 0], [4.4, 0, 0]],
-          [[0, 0, 0], [0, 5.5, 0]],
-          [[0, 0, 0], [0, 0, 6.6]]]])
+        [[[[1.1, 0, 0], [0, 0, 0]], [[0, 2.2, 0], [0, 0, 0]],
+          [[0, 0, 3.3], [0, 0, 0]]], [[[0, 0, 0], [4.4, 0, 0]],
+                                      [[0, 0, 0], [0, 5.5, 0]], [[0, 0, 0],
+                                                                 [0, 0, 6.6]]]])
     self.diagOp(x, np.float32, expected_ans)
     self.diagOp(x, np.float64, expected_ans)
 
   def testRankTwoComplexTensor(self):
     for dtype in [np.complex64, np.complex128]:
-      x = np.array([[1.1 + 1.1j, 2.2 + 2.2j, 3.3 + 3.3j],
-                    [4.4 + 4.4j, 5.5 + 5.5j, 6.6 + 6.6j]], dtype=dtype)
+      x = np.array(
+          [[1.1 + 1.1j, 2.2 + 2.2j, 3.3 + 3.3j],
+           [4.4 + 4.4j, 5.5 + 5.5j, 6.6 + 6.6j]],
+          dtype=dtype)
       expected_ans = np.array(
-          [[[[1.1 + 1.1j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]],
-            [[0 + 0j, 2.2 + 2.2j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]],
-            [[0 + 0j, 0 + 0j, 3.3 + 3.3j], [0 + 0j, 0 + 0j, 0 + 0j]]],
-           [[[0 + 0j, 0 + 0j, 0 + 0j], [4.4 + 4.4j, 0 + 0j, 0 + 0j]],
-            [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 5.5 + 5.5j, 0 + 0j]],
-            [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 6.6 + 6.6j]]]],
-           dtype=dtype)
+          [[[[1.1 + 1.1j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]], [
+              [0 + 0j, 2.2 + 2.2j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]
+          ], [[0 + 0j, 0 + 0j, 3.3 + 3.3j], [0 + 0j, 0 + 0j, 0 + 0j]]], [[
+              [0 + 0j, 0 + 0j, 0 + 0j], [4.4 + 4.4j, 0 + 0j, 0 + 0j]
+          ], [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 5.5 + 5.5j, 0 + 0j]
+             ], [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 6.6 + 6.6j]]]],
+          dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testRankThreeFloatTensor(self):
-    x = np.array([[[1.1, 2.2], [3.3, 4.4]],
-                  [[5.5, 6.6], [7.7, 8.8]]])
-    expected_ans = np.array(
-        [[[[[[1.1, 0], [0, 0]], [[0, 0], [0, 0]]],
-           [[[0, 2.2], [0, 0]], [[0, 0], [0, 0]]]],
-          [[[[0, 0], [3.3, 0]], [[0, 0], [0, 0]]],
-           [[[0, 0], [0, 4.4]], [[0, 0], [0, 0]]]]],
-         [[[[[0, 0], [0, 0]], [[5.5, 0], [0, 0]]],
-           [[[0, 0], [0, 0]], [[0, 6.6], [0, 0]]]],
-          [[[[0, 0], [0, 0]], [[0, 0], [7.7, 0]]],
-           [[[0, 0], [0, 0]], [[0, 0], [0, 8.8]]]]]])
+    x = np.array([[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8]]])
+    expected_ans = np.array([[[[[[1.1, 0], [0, 0]], [[0, 0], [0, 0]]],
+                               [[[0, 2.2], [0, 0]], [[0, 0], [0, 0]]]],
+                              [[[[0, 0], [3.3, 0]], [[0, 0], [0, 0]]],
+                               [[[0, 0], [0, 4.4]], [[0, 0], [0, 0]]]]],
+                             [[[[[0, 0], [0, 0]], [[5.5, 0], [0, 0]]],
+                               [[[0, 0], [0, 0]], [[0, 6.6], [0, 0]]]],
+                              [[[[0, 0], [0, 0]], [[0, 0], [7.7, 0]]],
+                               [[[0, 0], [0, 0]], [[0, 0], [0, 8.8]]]]]])
     self.diagOp(x, np.float32, expected_ans)
     self.diagOp(x, np.float64, expected_ans)
 
   def testRankThreeComplexTensor(self):
     for dtype in [np.complex64, np.complex128]:
-      x = np.array([[[1.1 + 1.1j, 2.2 + 2.2j], [3.3 + 3.3j, 4.4 + 4.4j]],
-                    [[5.5 + 5.5j, 6.6 + 6.6j], [7.7 + 7.7j, 8.8 + 8.8j]]],
-                    dtype=dtype)
+      x = np.array(
+          [[[1.1 + 1.1j, 2.2 + 2.2j], [3.3 + 3.3j, 4.4 + 4.4j]],
+           [[5.5 + 5.5j, 6.6 + 6.6j], [7.7 + 7.7j, 8.8 + 8.8j]]],
+          dtype=dtype)
       expected_ans = np.array(
-          [[[[[[1.1 + 1.1j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-             [[[0 + 0j, 2.2 + 2.2j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]]],
-            [[[[0 + 0j, 0 + 0j], [3.3 + 3.3j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-             [[[0 + 0j, 0 + 0j], [0 + 0j, 4.4 + 4.4j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]]]],
-           [[[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[5.5 + 5.5j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-             [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 6.6 + 6.6j], [0 + 0j, 0 + 0j]]]],
-            [[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [7.7 + 7.7j, 0 + 0j]]],
-             [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 8.8 + 8.8j]]]]]],
+          [[[[[[1.1 + 1.1j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]], [[[0 + 0j, 2.2 + 2.2j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]]], [[[[0 + 0j, 0 + 0j], [3.3 + 3.3j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]], [[[0 + 0j, 0 + 0j], [0 + 0j, 4.4 + 4.4j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]]]], [[[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [
+              [5.5 + 5.5j, 0 + 0j], [0 + 0j, 0 + 0j]
+          ]], [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 6.6 + 6.6j], [
+              0 + 0j, 0 + 0j
+          ]]]], [[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              7.7 + 7.7j, 0 + 0j
+          ]]], [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
+                [[0 + 0j, 0 + 0j], [0 + 0j, 8.8 + 8.8j]]]]]],
           dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testRankFourNumberTensor(self):
     for dtype in [np.float32, np.float64, np.int64, np.int32]:
       # Input with shape [2, 1, 2, 3]
-      x = np.array([[[[ 1,  2,  3],
-                      [ 4,  5,  6]]],
-                    [[[ 7,  8,  9],
-                      [10, 11, 12]]]], dtype=dtype)
+      x = np.array(
+          [[[[1, 2, 3], [4, 5, 6]]], [[[7, 8, 9], [10, 11, 12]]]], dtype=dtype)
       # Output with shape [2, 1, 2, 3, 2, 1, 2, 3]
       expected_ans = np.array(
-          [[[[[[[[1, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 2, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 3], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]]],
-             [[[[[0, 0, 0], [4, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 5, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 6]]],
-               [[[0, 0, 0], [0, 0, 0]]]]]]],
-
-           [[[[[[[0, 0, 0], [0, 0, 0]]],
-               [[[7, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 8, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 9], [0, 0, 0]]]]],
-             [[[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [10, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 11, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 12]]]]]]]], dtype=dtype)
+          [[[[[[[[1, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]], [
+              [[[0, 2, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]
+          ], [[[[0, 0, 3], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]]], [[
+              [[[0, 0, 0], [4, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]
+          ], [[[[0, 0, 0], [0, 5, 0]]], [[[0, 0, 0], [0, 0, 0]]]], [
+              [[[0, 0, 0], [0, 0, 6]]], [[[0, 0, 0], [0, 0, 0]]]
+          ]]]], [[[[[[[0, 0, 0], [0, 0, 0]]], [[[7, 0, 0], [0, 0, 0]]]], [
+              [[[0, 0, 0], [0, 0, 0]]], [[[0, 8, 0], [0, 0, 0]]]
+          ], [[[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 9], [0, 0, 0]]]]], [[
+              [[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [10, 0, 0]]]
+          ], [[[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 11, 0]]]
+             ], [[[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 12]]]]]]]],
+          dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testInvalidRank(self):
@@ -537,7 +504,9 @@ class DiagGradOpTest(test.TestCase):
           x1 = constant_op.constant(np.random.rand(*shape), dtype=dtype)
           y = array_ops.diag(x1)
           error = gradient_checker.compute_gradient_error(
-              x1, x1.get_shape().as_list(), y, y.get_shape().as_list())
+              x1,
+              x1.get_shape().as_list(), y,
+              y.get_shape().as_list())
           tf_logging.info("error = %f", error)
           self.assertLess(error, 1e-4)
 
@@ -555,7 +524,9 @@ class DiagGradPartOpTest(test.TestCase):
           x1 = constant_op.constant(np.random.rand(*shape), dtype=dtype)
           y = array_ops.diag_part(x1)
           error = gradient_checker.compute_gradient_error(
-              x1, x1.get_shape().as_list(), y, y.get_shape().as_list())
+              x1,
+              x1.get_shape().as_list(), y,
+              y.get_shape().as_list())
           tf_logging.info("error = %f", error)
           self.assertLess(error, 1e-4)
 
diff --git a/tensorflow/python/kernel_tests/map_stage_op_test.py b/tensorflow/python/kernel_tests/map_stage_op_test.py
index 8b66945059..acfafde9e0 100644
--- a/tensorflow/python/kernel_tests/map_stage_op_test.py
+++ b/tensorflow/python/kernel_tests/map_stage_op_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.platform import test
 
 TIMEOUT = 1
 
+
 class MapStageTest(test.TestCase):
 
   def testSimple(self):
@@ -83,7 +84,7 @@ class MapStageTest(test.TestCase):
             [dtypes.float32, dtypes.float32],
             shapes=[[], [128, 128]],
             names=['x', 'v'])
-        stage = stager.put(pi,{'x': x, 'v': v})
+        stage = stager.put(pi, {'x': x, 'v': v})
         key, ret = stager.get(gi)
         z = ret['x']
         y = ret['v']
@@ -128,8 +129,11 @@ class MapStageTest(test.TestCase):
         gi = array_ops.placeholder(dtypes.int64)
         p = array_ops.placeholder(dtypes.int32, name='p')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.int32, ], shapes=[[]])
-        stage = stager.put(pi,[x], [0])
+        stager = data_flow_ops.MapStagingArea(
+            [
+                dtypes.int32,
+            ], shapes=[[]])
+        stage = stager.put(pi, [x], [0])
         peek = stager.peek(gi)
         size = stager.size()
 
@@ -158,7 +162,7 @@ class MapStageTest(test.TestCase):
             [dtypes.float32, dtypes.float32],
             shapes=[[], [128, 128]],
             names=['x', 'v'])
-        stage = stager.put(pi,{'x': x, 'v': v})
+        stage = stager.put(pi, {'x': x, 'v': v})
         size = stager.size()
         clear = stager.clear()
 
@@ -172,7 +176,6 @@ class MapStageTest(test.TestCase):
       sess.run(clear)
       self.assertEqual(sess.run(size), 0)
 
-
   def testCapacity(self):
     capacity = 3
 
@@ -182,8 +185,10 @@ class MapStageTest(test.TestCase):
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.int32, ],
-          capacity=capacity, shapes=[[]])
+        stager = data_flow_ops.MapStagingArea(
+            [
+                dtypes.int32,
+            ], capacity=capacity, shapes=[[]])
 
       stage = stager.put(pi, [x], [0])
       get = stager.get()
@@ -222,9 +227,8 @@ class MapStageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -236,8 +240,8 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run(size) == 0)
 
   def testMemoryLimit(self):
-    memory_limit = 512*1024  # 512K
-    chunk = 200*1024 # 256K
+    memory_limit = 512 * 1024  # 512K
+    chunk = 200 * 1024  # 256K
     capacity = memory_limit // chunk
 
     with ops.Graph().as_default() as G:
@@ -246,8 +250,8 @@ class MapStageTest(test.TestCase):
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.uint8],
-          memory_limit=memory_limit, shapes=[[]])
+        stager = data_flow_ops.MapStagingArea(
+            [dtypes.uint8], memory_limit=memory_limit, shapes=[[]])
         stage = stager.put(pi, [x], [0])
         get = stager.get()
         size = stager.size()
@@ -287,9 +291,8 @@ class MapStageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -310,8 +313,10 @@ class MapStageTest(test.TestCase):
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.int32, ],
-          shapes=[[]], ordered=True)
+        stager = data_flow_ops.MapStagingArea(
+            [
+                dtypes.int32,
+            ], shapes=[[]], ordered=True)
         stage = stager.put(pi, [x], [0])
         get = stager.get()
         size = stager.size()
@@ -349,7 +354,7 @@ class MapStageTest(test.TestCase):
         stager = data_flow_ops.MapStagingArea(
             [dtypes.float32, dtypes.float32, dtypes.float32],
             names=['x', 'v', 'f'])
-        stage_xf = stager.put(pi,{'x': x, 'f': f})
+        stage_xf = stager.put(pi, {'x': x, 'f': f})
         stage_v = stager.put(pi, {'v': v})
         key, ret = stager.get(gi)
         size = stager.size()
@@ -373,12 +378,13 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run([size, isize]) == [1, 1])
       # We can now obtain tuple associated with key 0
       self.assertTrue(
-          sess.run([key, ret],
-                   feed_dict={gi: 0}) == [0, {
-                       'x': 1,
-                       'f': 2,
-                       'v': 1
-                   }])
+          sess.run([key, ret], feed_dict={
+              gi: 0
+          }) == [0, {
+              'x': 1,
+              'f': 2,
+              'v': 1
+          }])
 
       # 0 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [0, 1])
@@ -386,12 +392,13 @@ class MapStageTest(test.TestCase):
       sess.run(stage_v, feed_dict={pi: 1, v: 3})
       # We can now obtain tuple associated with key 1
       self.assertTrue(
-          sess.run([key, ret],
-                   feed_dict={gi: 1}) == [1, {
-                       'x': 1,
-                       'f': 2,
-                       'v': 3
-                   }])
+          sess.run([key, ret], feed_dict={
+              gi: 1
+          }) == [1, {
+              'x': 1,
+              'f': 2,
+              'v': 3
+          }])
 
   def testPartialIndexInsert(self):
     with ops.Graph().as_default() as G:
@@ -450,7 +457,7 @@ class MapStageTest(test.TestCase):
         stager = data_flow_ops.MapStagingArea(
             [dtypes.float32, dtypes.float32, dtypes.float32],
             names=['x', 'v', 'f'])
-        stage_xf = stager.put(pi,{'x': x, 'f': f})
+        stage_xf = stager.put(pi, {'x': x, 'f': f})
         stage_v = stager.put(pi, {'v': v})
         peek_xf = stager.peek(pei, ['x', 'f'])
         peek_v = stager.peek(pei, ['v'])
@@ -487,11 +494,12 @@ class MapStageTest(test.TestCase):
 
       # We can now obtain 'x' and 'f' values associated with key 0
       self.assertTrue(
-          sess.run([key_xf, get_xf],
-                   feed_dict={gi: 0}) == [0, {
-                       'x': 1,
-                       'f': 2
-                   }])
+          sess.run([key_xf, get_xf], feed_dict={
+              gi: 0
+          }) == [0, {
+              'x': 1,
+              'f': 2
+          }])
       # Still have 1 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [1, 1])
 
@@ -499,14 +507,15 @@ class MapStageTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError) as cm:
         sess.run([key_xf, get_xf], feed_dict={gi: 0})
 
-      exc_str = ("Tensor at index '0' for key '0' "
-                "has already been removed.")
+      exc_str = ("Tensor at index '0' for key '0' " 'has already been removed.')
 
       self.assertTrue(exc_str in cm.exception.message)
 
       # Obtain 'v' value associated with key 0
       self.assertTrue(
-          sess.run([key_v, get_v], feed_dict={gi: 0}) == [0, {
+          sess.run([key_v, get_v], feed_dict={
+              gi: 0
+          }) == [0, {
               'v': 1
           }])
       # 0 complete and 1 incomplete entry
@@ -523,7 +532,9 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run([size, isize]) == [1, 0])
       # We can now obtain 'x' and 'f' values associated with key 1
       self.assertTrue(
-          sess.run([pop_key_v, pop_v], feed_dict={pi: 1}) == [1, {
+          sess.run([pop_key_v, pop_v], feed_dict={
+              pi: 1
+          }) == [1, {
               'v': 1
           }])
       # Nothing is left
@@ -557,18 +568,20 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run([size, isize]) == [1, 0])
 
       # Partial get using indices
-      self.assertTrue(sess.run([key_xf, get_xf],
-            feed_dict={gi: 0}) == [0, [1, 2]])
+      self.assertTrue(
+          sess.run([key_xf, get_xf], feed_dict={
+              gi: 0
+          }) == [0, [1, 2]])
 
       # Still some of key 0 left
       self.assertTrue(sess.run([size, isize]) == [1, 0])
 
       # Partial get of remaining index
-      self.assertTrue(sess.run([key_v, get_v],
-            feed_dict={gi: 0}) == [0, [3]])
+      self.assertTrue(sess.run([key_v, get_v], feed_dict={gi: 0}) == [0, [3]])
 
       # All gone
       self.assertTrue(sess.run([size, isize]) == [0, 0])
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 5c0ea8ec8e..3263ed1a60 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -159,8 +159,10 @@ class PoolingTest(test.TestCase):
       elif data_format == "NCHW":
         t = test_util.NCHWToNHWC(t)
       if v2:
-        actual = t.eval(feed_dict={ksize_placeholder: ksize,
-                                   strides_placeholder: strides})
+        actual = t.eval(feed_dict={
+            ksize_placeholder: ksize,
+            strides_placeholder: strides
+        })
       else:
         actual = t.eval()
         self.assertShapeEqual(actual, t)
@@ -195,8 +197,15 @@ class PoolingTest(test.TestCase):
       self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
                           data_format, dtypes.float16, expected, use_gpu, v2)
 
-  def _VerifyValues(self, pool_func, input_sizes, ksize, strides, padding,
-                    expected, use_gpu, v2=False):
+  def _VerifyValues(self,
+                    pool_func,
+                    input_sizes,
+                    ksize,
+                    strides,
+                    padding,
+                    expected,
+                    use_gpu,
+                    v2=False):
     """Verifies the output values of the pooling function.
 
     Args:
@@ -1148,16 +1157,16 @@ class PoolingTest(test.TestCase):
   def _testMaxPoolGradSamePadding3_1(self, data_format, use_gpu):
     for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
-        pool_func,
-        input_sizes=[1, 7, 7, 1],
-        output_sizes=[1, 7, 7, 1],
-        window_rows=3,
-        window_cols=3,
-        row_stride=1,
-        col_stride=1,
-        padding="SAME",
-        data_format=data_format,
-        use_gpu=use_gpu)
+          pool_func,
+          input_sizes=[1, 7, 7, 1],
+          output_sizes=[1, 7, 7, 1],
+          window_rows=3,
+          window_cols=3,
+          row_stride=1,
+          col_stride=1,
+          padding="SAME",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def testMaxPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
@@ -1202,17 +1211,14 @@ class PoolingTest(test.TestCase):
     pool_func = gen_nn_ops._max_pool_v2 if v2 else nn_ops.max_pool
     with self.test_session(use_gpu=use_gpu):
       input_tensor = constant_op.constant(input_data, shape=input_sizes)
-      output_tensor = pool_func(input_tensor,
-                                [1, window_rows, window_cols, 1],
+      output_tensor = pool_func(input_tensor, [1, window_rows, window_cols, 1],
                                 [1, row_stride, col_stride, 1], padding)
       output_backprop_tensor = constant_op.constant(
           output_backprop, shape=output_sizes)
 
-      input_backprop_tensor = self._MaxPoolGrad(input_tensor, output_tensor,
-                                                output_backprop_tensor,
-                                                window_rows, window_cols,
-                                                row_stride, col_stride,
-                                                padding, v2)
+      input_backprop_tensor = self._MaxPoolGrad(
+          input_tensor, output_tensor, output_backprop_tensor, window_rows,
+          window_cols, row_stride, col_stride, padding, v2)
 
       actual_input_backprop = input_backprop_tensor.eval()
       self.assertShapeEqual(actual_input_backprop, input_backprop_tensor)
@@ -1414,13 +1420,15 @@ class PoolingTest(test.TestCase):
   def _testMaxPoolGradDirectWithNans2_2(self):
     input_data = [float("nan")] * 16
     output_backprop = [
-        float("nan"), 12.0, 13.0, 15.0, float("nan"), 17.0, 19.0, 20.0,
+        float("nan"), 12.0, 13.0, 15.0,
+        float("nan"), 17.0, 19.0, 20.0,
         float("nan")
     ]
     # Test the CPU implementation, which propagates diffs in case of NaN
     expected_input_backprop_tf_cpu = [
-        float("nan"), 12.0, 13.0, 0.0, 15.0, float("nan"), 17.0, 0.0, 19.0,
-        20.0, float("nan"), 0.0, 0.0, 0.0, 0.0, 0.0
+        float("nan"), 12.0, 13.0, 0.0, 15.0,
+        float("nan"), 17.0, 0.0, 19.0, 20.0,
+        float("nan"), 0.0, 0.0, 0.0, 0.0, 0.0
     ]
     for v2 in [True, False]:
       self._testMaxPoolGradDirect(
@@ -1636,10 +1644,9 @@ class PoolingTest(test.TestCase):
     Returns:
       A Tensor.
     """
-    return gen_nn_ops._max_pool_grad_grad(orig_input, orig_output, grad,
-                                          [1, window_rows, window_cols,
-                                           1], [1, row_stride, col_stride,
-                                                1], padding)
+    return gen_nn_ops._max_pool_grad_grad(
+        orig_input, orig_output, grad, [1, window_rows, window_cols, 1],
+        [1, row_stride, col_stride, 1], padding)
 
   def testAvgPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
@@ -1793,8 +1800,7 @@ class PoolingTest(test.TestCase):
     ]:
       with self.assertRaises(ValueError):
         pool_func(
-            array_ops.placeholder(
-                dtypes.float32, shape=[1, 3]),
+            array_ops.placeholder(dtypes.float32, shape=[1, 3]),
             ksize=[1, 1, 1, 1],
             strides=[1, 1, 1, 1],
             padding="SAME")
@@ -1820,15 +1826,13 @@ class PoolingTest(test.TestCase):
         with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
           sess.run(
               pool_func(
-                  array_ops.placeholder(
-                      dtypes.float32, shape=[32, 20, 20, 3]),
+                  array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
                   ksize=[1, 20, 21, 1],
                   strides=[1, 1, 1, 1],
                   padding="VALID"))
         with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
           pool_func(
-              array_ops.placeholder(
-                  dtypes.float32, shape=[32, 20, 20, 3]),
+              array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
               ksize=[1, 21, 20, 1],
               strides=[1, 1, 1, 1],
               padding="VALID")
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 223a4b2c87..82a27eebee 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -428,7 +428,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     for i in range(self._num_files):
       fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
       filenames.append(fn)
-      with open(fn+".tmp", "wb") as f:
+      with open(fn + ".tmp", "wb") as f:
         f.write(b"H" * self._header_bytes)
         if num_records > 0:
           f.write(self._Record(i, 0))
@@ -437,7 +437,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
             f.write(b"G" * gap_bytes)
           f.write(self._Record(i, j))
         f.write(b"F" * self._footer_bytes)
-      with open(fn+".tmp", "rb") as f:
+      with open(fn + ".tmp", "rb") as f:
         cdata = zlib.compress(f.read())
         with open(fn, "wb") as zf:
           zf.write(cdata)
@@ -455,7 +455,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
           all_records_str = "".join([
               str(i)[0]
               for i in range(self._record_bytes + self._hop_bytes *
-                           (num_overlapped_records - 1))
+                             (num_overlapped_records - 1))
           ])
           f.write(compat.as_bytes(all_records_str))
         f.write(b"F" * self._footer_bytes)
@@ -467,7 +467,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
       fn = os.path.join(self.get_temp_dir(),
                         "fixed_length_overlapped_record.%d.txt" % i)
       filenames.append(fn)
-      with open(fn+".tmp", "wb") as f:
+      with open(fn + ".tmp", "wb") as f:
         f.write(b"H" * self._header_bytes)
         if num_overlapped_records > 0:
           all_records_str = "".join([
@@ -477,7 +477,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
           ])
           f.write(compat.as_bytes(all_records_str))
         f.write(b"F" * self._footer_bytes)
-      with open(fn+".tmp", "rb") as f:
+      with open(fn + ".tmp", "rb") as f:
         cdata = zlib.compress(f.read())
         with open(fn, "wb") as zf:
           zf.write(cdata)
@@ -509,7 +509,10 @@ class FixedLengthRecordReaderTest(test.TestCase):
                                     "\\(requested 1, current size 0\\)"):
         k, v = sess.run([key, value])
 
-  def _TestOneEpochWithHopBytes(self, files, num_overlapped_records, encoding=None):
+  def _TestOneEpochWithHopBytes(self,
+                                files,
+                                num_overlapped_records,
+                                encoding=None):
     with self.test_session() as sess:
       reader = io_ops.FixedLengthRecordReader(
           header_bytes=self._header_bytes,
@@ -565,13 +568,15 @@ class FixedLengthRecordReaderTest(test.TestCase):
 
   def testGzipOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
-      files = self._CreateGzipOverlappedRecordFiles(num_overlapped_records, )
-      self._TestOneEpochWithHopBytes(files, num_overlapped_records, encoding="GZIP")
+      files = self._CreateGzipOverlappedRecordFiles(num_overlapped_records,)
+      self._TestOneEpochWithHopBytes(
+          files, num_overlapped_records, encoding="GZIP")
 
   def testZlibOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
       files = self._CreateZlibOverlappedRecordFiles(num_overlapped_records)
-      self._TestOneEpochWithHopBytes(files, num_overlapped_records, encoding="ZLIB")
+      self._TestOneEpochWithHopBytes(
+          files, num_overlapped_records, encoding="ZLIB")
 
 
 class TFRecordReaderTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index dd11ba700d..6b4091ae5d 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -48,8 +48,8 @@ class ReluTest(test.TestCase):
     self.assertAllClose(
         np.array([[0.0, 0.7, 0.0, 0.3, 0.0], [0.1, 0.0, 0.5, 0.0, 0.9]]),
         self._npRelu(
-            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
+                                                     0.9]])))
 
   def _testRelu(self, np_features, use_gpu=False):
     np_relu = self._npRelu(np_features)
@@ -163,8 +163,8 @@ class Relu6Test(test.TestCase):
     self.assertAllClose(
         np.array([[0.0, 0.7, 0.0, 0.3, 6.0], [0.1, 0.0, 6.0, 0.0, 0.9]]),
         self._npRelu6(
-            np.array([[-0.9, 0.7, -0.5, 0.3, 6.0], [0.1, -0.3, 6.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, 6.0], [0.1, -0.3, 6.5, -0.7,
+                                                    0.9]])))
 
   def _testRelu6(self, np_features, use_gpu=False):
     np_relu6 = self._npRelu6(np_features)
@@ -231,8 +231,8 @@ class EluTest(test.TestCase):
         np.array([[-0.59343034025, 0.7, -0.39346934028, 0.3, -0.09516258196],
                   [0.1, -0.25918177931, 0.5, -0.5034146962, 0.9]]),
         self._npElu(
-            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
+                                                     0.9]])))
 
   def _testElu(self, np_features, use_gpu=False):
     np_elu = self._npElu(np_features)
@@ -330,11 +330,11 @@ class SeluTest(test.TestCase):
 
   def testNpSelu(self):
     self.assertAllClose(
-        np.array([[-1.0433095, 0.73549069, -0.6917582, 0.3152103 , -0.16730527],
-                 [0.1050701 , -0.45566732, 0.5253505, -0.88505305, 0.9456309]]),
+        np.array([[-1.0433095, 0.73549069, -0.6917582, 0.3152103, -0.16730527],
+                  [0.1050701, -0.45566732, 0.5253505, -0.88505305, 0.9456309]]),
         self._npSelu(
-            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
+                                                     0.9]])))
 
   def _testSelu(self, np_features, use_gpu=False):
     np_selu = self._npSelu(np_features)
diff --git a/tensorflow/python/kernel_tests/sparse_slice_op_test.py b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
index 762e400447..da116601f8 100644
--- a/tensorflow/python/kernel_tests/sparse_slice_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
@@ -32,11 +32,12 @@ class SparseSliceOpTest(test.TestCase):
     # [  |11|  |13|14|  ]
     # [20|  |  |23|  |25]
     # [30|  |32|33|  |35]
-    ind = np.array([[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4],
-                    [2, 0], [2, 3], [2, 5], [3, 0], [3, 2], [3, 3],
-                    [3, 5]]).astype(np.int64)
-    val = np.array(
-        [0, 2, 4, 5, 11, 13, 14, 20, 23, 25, 30, 32, 33, 35]).astype(np.int64)
+    ind = np.array([[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1,
+                                                                     4], [2, 0],
+                    [2, 3], [2, 5], [3, 0], [3, 2], [3, 3], [3, 5]]).astype(
+                        np.int64)
+    val = np.array([0, 2, 4, 5, 11, 13, 14, 20, 23, 25, 30, 32, 33, 35]).astype(
+        np.int64)
     shape = np.array([4, 6]).astype(np.int64)
     return sparse_tensor.SparseTensor(ind, val, shape)
 
@@ -65,50 +66,49 @@ class SparseSliceOpTest(test.TestCase):
     #  [    |'c1'|    |'d1']
     #  [    |    |'e1'|    ]
     ind = np.array([[0, 0, 0], [0, 0, 1], [0, 2, 0], [0, 2, 1], [1, 1, 0],
-                    [1, 1, 1], [1, 3, 0], [1, 3, 1], [2, 2, 0],
-                    [2, 2, 1]]).astype(np.int64)
+                    [1, 1, 1], [1, 3, 0], [1, 3, 1], [2, 2, 0], [2, 2,
+                                                                 1]]).astype(
+                                                                     np.int64)
     val = np.array(['a0', 'a1', 'b0', 'b1', 'c0', 'c1', 'd0', 'd1', 'e0', 'e1'])
     shape = np.array([3, 4, 2]).astype(np.int64)
     return sparse_tensor.SparseTensorValue(ind, val, shape)
 
   def _SparseTensor_3x4x2(self):
-    return sparse_tensor.SparseTensor.from_value(self._SparseTensorValue_3x4x2(
-    ))
+    return sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_3x4x2())
 
   def testSliceMatrixRows(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [2, 6])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [2, 0], [3, 7])
-      self.assertAllEqual(sp_tensor0.indices.eval(), [[0, 0], [0, 2], [0, 4],
-                                                      [0, 5], [1, 1], [1, 3],
-                                                      [1, 4]])
+      self.assertAllEqual(
+          sp_tensor0.indices.eval(),
+          [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4]])
       self.assertAllEqual(sp_tensor0.values.eval(), [0, 2, 4, 5, 11, 13, 14])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [2, 6])
-      self.assertAllEqual(sp_tensor1.indices.eval(), [[0, 0], [0, 3], [0, 5],
-                                                      [1, 0], [1, 2], [1, 3],
-                                                      [1, 5]])
+      self.assertAllEqual(
+          sp_tensor1.indices.eval(),
+          [[0, 0], [0, 3], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5]])
       self.assertAllEqual(sp_tensor1.values.eval(),
                           [20, 23, 25, 30, 32, 33, 35])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 6])
 
   def testSliceMatrixUnevenCols(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_5x7()
+      sp_input = self._SparseTensor_5x7()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [5, 3])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 3], [5, 2])
       sp_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 5], [5, 2])
 
-      self.assertAllEqual(sp_tensor0.indices.eval(),
-                          [[0, 0], [0, 2], [1, 1], [2, 0], [3, 0], [3, 2],
-                           [4, 1]])
-      self.assertAllEqual(sp_tensor0.values.eval(),
-                          [0, 2, 11, 20, 30, 32, 41])
+      self.assertAllEqual(
+          sp_tensor0.indices.eval(),
+          [[0, 0], [0, 2], [1, 1], [2, 0], [3, 0], [3, 2], [4, 1]])
+      self.assertAllEqual(sp_tensor0.values.eval(), [0, 2, 11, 20, 30, 32, 41])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [5, 3])
       self.assertAllEqual(sp_tensor1.indices.eval(),
                           [[0, 1], [1, 0], [1, 1], [2, 0], [3, 0], [4, 1]])
-      self.assertAllEqual(sp_tensor1.values.eval(),
-                          [4, 13, 14, 23, 33, 44])
+      self.assertAllEqual(sp_tensor1.values.eval(), [4, 13, 14, 23, 33, 44])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [5, 2])
       self.assertAllEqual(sp_tensor2.indices.eval(),
                           [[0, 0], [1, 1], [2, 0], [3, 0], [4, 1]])
@@ -137,7 +137,7 @@ class SparseSliceOpTest(test.TestCase):
 
   def testSliceMatrixUnevenRows(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_5x7()
+      sp_input = self._SparseTensor_5x7()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [3, 7])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [3, 0], [3, 7])
       self.assertAllEqual(sp_tensor0.indices.eval(),
@@ -146,9 +146,9 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor0.values.eval(),
                           [0, 2, 4, 5, 11, 13, 14, 16, 20, 23, 25])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [3, 7])
-      self.assertAllEqual(sp_tensor1.indices.eval(),
-                          [[0, 0], [0, 2], [0, 3], [0, 5], [1, 1], [1, 4],
-                           [1, 6]])
+      self.assertAllEqual(
+          sp_tensor1.indices.eval(),
+          [[0, 0], [0, 2], [0, 3], [0, 5], [1, 1], [1, 4], [1, 6]])
       self.assertAllEqual(sp_tensor1.values.eval(),
                           [30, 32, 33, 35, 41, 44, 46])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 7])
@@ -156,9 +156,9 @@ class SparseSliceOpTest(test.TestCase):
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [2, 7])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [2, 0], [2, 7])
       sp_tensor2 = sparse_ops.sparse_slice(sp_input, [4, 0], [2, 7])
-      self.assertAllEqual(sp_tensor0.indices.eval(),
-                          [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3],
-                           [1, 4], [1, 6]])
+      self.assertAllEqual(
+          sp_tensor0.indices.eval(),
+          [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4], [1, 6]])
       self.assertAllEqual(sp_tensor0.values.eval(),
                           [0, 2, 4, 5, 11, 13, 14, 16])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [2, 7])
@@ -166,45 +166,42 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor1.values.eval(),
                           [20, 23, 25, 30, 32, 33, 35])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 7])
-      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 1], [0, 4],
-                                                           [0, 6]])
+      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 1], [0, 4], [0, 6]])
       self.assertAllEqual(sp_tensor2.values.eval(), [41, 44, 46])
       self.assertAllEqual(sp_tensor2.dense_shape.eval(), [1, 7])
     return
 
   def testSliceAllRows(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [1, 6])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [1, 0], [1, 6])
       sp_tensor2 = sparse_ops.sparse_slice(sp_input, [2, 0], [1, 7])
       sp_tensor3 = sparse_ops.sparse_slice(sp_input, [3, 0], [2, 7])
-      self.assertAllEqual(sp_tensor0.indices.eval(), [[0, 0], [0, 2], [0, 4],
-                                                         [0, 5]])
+      self.assertAllEqual(sp_tensor0.indices.eval(),
+                          [[0, 0], [0, 2], [0, 4], [0, 5]])
       self.assertAllEqual(sp_tensor0.values.eval(), [0, 2, 4, 5])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [1, 6])
-      self.assertAllEqual(sp_tensor1.indices.eval(), [[0, 1], [0, 3], [0,
-                                                                          4]])
+      self.assertAllEqual(sp_tensor1.indices.eval(), [[0, 1], [0, 3], [0, 4]])
       self.assertAllEqual(sp_tensor1.values.eval(), [11, 13, 14])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [1, 6])
-      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 0], [0, 3], [0,
-                                                                          5]])
+      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 0], [0, 3], [0, 5]])
       self.assertAllEqual(sp_tensor2.values.eval(), [20, 23, 25])
       self.assertAllEqual(sp_tensor2.dense_shape.eval(), [1, 6])
-      self.assertAllEqual(sp_tensor3.indices.eval(), [[0, 0], [0, 2], [0, 3],
-                                                         [0, 5]])
+      self.assertAllEqual(sp_tensor3.indices.eval(),
+                          [[0, 0], [0, 2], [0, 3], [0, 5]])
       self.assertAllEqual(sp_tensor3.values.eval(), [30, 32, 33, 35])
       self.assertAllEqual(sp_tensor3.dense_shape.eval(), [1, 6])
 
   def testSliceColumns(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sparse_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [4, 2])
       sparse_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 2], [5, 2])
       sparse_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 4], [5, 3])
 
-      self.assertAllEqual(sparse_tensor0.indices.eval(), [[0, 0], [1, 1],
-                                                             [2, 0], [3, 0]])
+      self.assertAllEqual(sparse_tensor0.indices.eval(),
+                          [[0, 0], [1, 1], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor0.values.eval(), [0, 11, 20, 30])
       self.assertAllEqual(sparse_tensor0.dense_shape.eval(), [4, 2])
       self.assertAllEqual(sparse_tensor1.indices.eval(),
@@ -218,15 +215,15 @@ class SparseSliceOpTest(test.TestCase):
 
   def testSliceAllColumns(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sparse_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [4, 1])
       sparse_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 1], [4, 1])
       sparse_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 2], [4, 1])
       sparse_tensor3 = sparse_ops.sparse_slice(sp_input, [0, 3], [4, 1])
       sparse_tensor4 = sparse_ops.sparse_slice(sp_input, [0, 4], [5, 1])
       sparse_tensor5 = sparse_ops.sparse_slice(sp_input, [0, 5], [6, 3])
-      self.assertAllEqual(sparse_tensor0.indices.eval(), [[0, 0], [2, 0],
-                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor0.indices.eval(),
+                          [[0, 0], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor0.values.eval(), [0, 20, 30])
       self.assertAllEqual(sparse_tensor0.dense_shape.eval(), [4, 1])
       self.assertAllEqual(sparse_tensor1.indices.eval(), [[1, 0]])
@@ -235,17 +232,18 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sparse_tensor2.indices.eval(), [[0, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor2.values.eval(), [2, 32])
       self.assertAllEqual(sparse_tensor2.dense_shape.eval(), [4, 1])
-      self.assertAllEqual(sparse_tensor3.indices.eval(), [[1, 0], [2, 0],
-                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor3.indices.eval(),
+                          [[1, 0], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor3.dense_shape.eval(), [4, 1])
       self.assertAllEqual(sparse_tensor3.values.eval(), [13, 23, 33])
       self.assertAllEqual(sparse_tensor4.indices.eval(), [[0, 0], [1, 0]])
       self.assertAllEqual(sparse_tensor4.values.eval(), [4, 14])
       self.assertAllEqual(sparse_tensor4.dense_shape.eval(), [4, 1])
-      self.assertAllEqual(sparse_tensor5.indices.eval(), [[0, 0], [2, 0],
-                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor5.indices.eval(),
+                          [[0, 0], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor5.values.eval(), [5, 25, 35])
       self.assertAllEqual(sparse_tensor5.dense_shape.eval(), [4, 1])
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index 64b3388c5c..dd06d30391 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -25,8 +25,8 @@ from tensorflow.python.platform import test
 
 TIMEOUT = 1
 
-class StageTest(test.TestCase):
 
+class StageTest(test.TestCase):
 
   def testSimple(self):
     with ops.Graph().as_default() as G:
@@ -116,7 +116,10 @@ class StageTest(test.TestCase):
         x = array_ops.placeholder(dtypes.int32, name='x')
         p = array_ops.placeholder(dtypes.int32, name='p')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.StagingArea([dtypes.int32, ], shapes=[[]])
+        stager = data_flow_ops.StagingArea(
+            [
+                dtypes.int32,
+            ], shapes=[[]])
         stage = stager.put([x])
         peek = stager.peek(p)
         ret = stager.get()
@@ -162,8 +165,10 @@ class StageTest(test.TestCase):
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.int32, name='x')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.StagingArea([dtypes.int32, ],
-          capacity=capacity, shapes=[[]])
+        stager = data_flow_ops.StagingArea(
+            [
+                dtypes.int32,
+            ], capacity=capacity, shapes=[[]])
         stage = stager.put([x])
         ret = stager.get()
         size = stager.size()
@@ -201,9 +206,8 @@ class StageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -216,16 +220,18 @@ class StageTest(test.TestCase):
       self.assertTrue(sess.run(size) == 0)
 
   def testMemoryLimit(self):
-    memory_limit = 512*1024  # 512K
-    chunk = 200*1024 # 256K
+    memory_limit = 512 * 1024  # 512K
+    chunk = 200 * 1024  # 256K
     capacity = memory_limit // chunk
 
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.uint8, name='x')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.StagingArea([dtypes.uint8, ],
-          memory_limit=memory_limit, shapes=[[]])
+        stager = data_flow_ops.StagingArea(
+            [
+                dtypes.uint8,
+            ], memory_limit=memory_limit, shapes=[[]])
         stage = stager.put([x])
         ret = stager.get()
         size = stager.size()
@@ -264,9 +270,8 @@ class StageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -277,5 +282,6 @@ class StageTest(test.TestCase):
 
       self.assertTrue(sess.run(size) == 0)
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/layers/maxout.py b/tensorflow/python/layers/maxout.py
index ed048845a0..20ce6c9770 100644
--- a/tensorflow/python/layers/maxout.py
+++ b/tensorflow/python/layers/maxout.py
@@ -31,15 +31,18 @@ from tensorflow.python.layers import base
 def maxout(inputs, num_units, axis=-1, name=None):
   """Adds a maxout op from https://arxiv.org/abs/1302.4389
 
-  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville,
+  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
+  Courville,
    Yoshua Bengio
 
-   Usually the operation is performed in the filter/channel dimension. This can also be
+   Usually the operation is performed in the filter/channel dimension. This can
+   also be
    used after fully-connected layers to reduce number of features.
 
    Arguments:
    inputs: Tensor input
-   num_units: Specifies how many features will remain after maxout in the `axis` dimension
+   num_units: Specifies how many features will remain after maxout in the `axis`
+     dimension
          (usually channel). This must be multiple of number of `axis`.
    axis: The dimension where max pooling will be performed. Default is the
    last dimension.
@@ -57,15 +60,18 @@ def maxout(inputs, num_units, axis=-1, name=None):
 class MaxOut(base.Layer):
   """Adds a maxout op from https://arxiv.org/abs/1302.4389
 
-  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, Yoshua
+  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
+  Courville, Yoshua
   Bengio
 
-  Usually the operation is performed in the filter/channel dimension. This can also be
+  Usually the operation is performed in the filter/channel dimension. This can
+  also be
   used after fully-connected layers to reduce number of features.
 
   Arguments:
     inputs: Tensor input
-    num_units: Specifies how many features will remain after maxout in the `axis` dimension
+    num_units: Specifies how many features will remain after maxout in the
+      `axis` dimension
          (usually channel).
     This must be multiple of number of `axis`.
     axis: The dimension where max pooling will be performed. Default is the
@@ -79,13 +85,8 @@ class MaxOut(base.Layer):
     ValueError: if num_units is not multiple of number of features.
   """
 
-  def __init__(self,
-         num_units,
-         axis=-1,
-         name=None,
-         **kwargs):
-    super(MaxOut, self).__init__(
-      name=name, trainable=False, **kwargs)
+  def __init__(self, num_units, axis=-1, name=None, **kwargs):
+    super(MaxOut, self).__init__(name=name, trainable=False, **kwargs)
     self.axis = axis
     self.num_units = num_units
 
@@ -95,8 +96,8 @@ class MaxOut(base.Layer):
     num_channels = shape[self.axis]
     if num_channels % self.num_units:
       raise ValueError('number of features({}) is not '
-               'a multiple of num_units({})'
-               .format(num_channels, self.num_units))
+                       'a multiple of num_units({})'.format(
+                           num_channels, self.num_units))
     shape[self.axis] = -1
     shape += [num_channels // self.num_units]
 
@@ -104,6 +105,7 @@ class MaxOut(base.Layer):
     for i in range(len(shape)):
       if shape[i] is None:
         shape[i] = gen_array_ops.shape(inputs)[i]
-    outputs = math_ops.reduce_max(gen_array_ops.reshape(inputs, shape), -1, keep_dims=False)
+    outputs = math_ops.reduce_max(
+        gen_array_ops.reshape(inputs, shape), -1, keep_dims=False)
 
     return outputs
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 55cae0bcbf..c9292184e6 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Gradients for operators defined in array_ops.py."""
 
 from __future__ import absolute_import
@@ -131,8 +130,8 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
         # extract the size of each input along the concat dimension
         sizes = array_ops.squeeze(
             array_ops.slice(
-                array_ops.stack(
-                    sizes, axis=1), [non_neg_concat_dim, 0], [1, -1]))
+                array_ops.stack(sizes, axis=1), [non_neg_concat_dim, 0],
+                [1, -1]))
         out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
       else:
         offset = gen_array_ops._concat_offset(non_neg_concat_dim, sizes)
@@ -167,8 +166,7 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
         new_values = array_ops.slice(
             grad.values, begin,
             array_ops.concat([[-1], array_ops.slice(size, [1], [-1])], 0))
-        out_grads.append(
-            ops.IndexedSlices(new_values, grad.indices, size))
+        out_grads.append(ops.IndexedSlices(new_values, grad.indices, size))
         # Lint complains begin = begin + ...
         begin = math_ops.add(begin, size * mask)
     else:
@@ -178,30 +176,33 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
       for size in sizes:
         size_concat_dim = array_ops.gather(size, non_neg_concat_dim)
         if size_concat_dim.dtype != grad.indices.dtype:
-          size_concat_dim = math_ops.cast(size_concat_dim,
-                                          dtype=grad.indices.dtype)
+          size_concat_dim = math_ops.cast(
+              size_concat_dim, dtype=grad.indices.dtype)
         end = start + size_concat_dim
         # Compute the 1-D Tensor of indices relevant for this input.
         indices_to_select = array_ops.squeeze(
-            array_ops.where(math_ops.logical_and(grad.indices >= start,
-                                                 grad.indices < end)),
+            array_ops.where(
+                math_ops.logical_and(grad.indices >= start,
+                                     grad.indices < end)),
             squeeze_dims=[1])
         new_indices = array_ops.gather(grad.indices, indices_to_select) - start
         new_values = array_ops.gather(grad.values, indices_to_select)
-        out_grads.append(
-            ops.IndexedSlices(new_values, new_indices, size))
+        out_grads.append(ops.IndexedSlices(new_values, new_indices, size))
         start = end
   else:
     raise TypeError("Expected Tensor or IndexedSlices, got %s" % type(grad))
 
-  return (out_grads + [None] if end_value_index <= dim_index
-          else [None] + out_grads)
+  return (out_grads + [None]
+          if end_value_index <= dim_index else [None] + out_grads)
 
 
 @ops.RegisterGradient("Concat")
 def _ConcatGrad(op, grad):
   return _ConcatGradHelper(
-      op, grad, start_value_index=1, end_value_index=len(op.inputs),
+      op,
+      grad,
+      start_value_index=1,
+      end_value_index=len(op.inputs),
       dim_index=0)
 
 
@@ -287,9 +288,13 @@ def _SplitGrad(op, *grads):
 @ops.RegisterGradient("SplitV")
 def _SplitVGrad(op, *grads):
   returnval = array_ops.concat(list(grads), op.inputs[2])
-  returnval = [returnval] + [None,] * (len(op.inputs) - 1)
+  returnval = [returnval] + [
+      None,
+  ] * (
+      len(op.inputs) - 1)
   return returnval
 
+
 ops.NotDifferentiable("Const")
 
 
@@ -334,9 +339,9 @@ def _MatrixSetDiagGrad(op, grad):
       matrix_shape = array_ops.slice(grad_shape, [grad_rank - 2], [2])
       min_dim = math_ops.reduce_min(matrix_shape)
       diag_shape = array_ops.concat([batch_shape, [min_dim]], 0)
-  grad_input = array_ops.matrix_set_diag(
-      grad, array_ops.zeros(
-          diag_shape, dtype=grad.dtype))
+  grad_input = array_ops.matrix_set_diag(grad,
+                                         array_ops.zeros(
+                                             diag_shape, dtype=grad.dtype))
   grad_diag = array_ops.matrix_diag_part(grad)
   return (grad_input, grad_diag)
 
@@ -444,8 +449,8 @@ def _GatherV2Grad(op, grad):
   values_transpose = array_ops.transpose(values, transpose_dims)
   num_segments = params_shape[axis]
 
-  params_grad = math_ops.unsorted_segment_sum(
-      values_transpose, indices, num_segments)
+  params_grad = math_ops.unsorted_segment_sum(values_transpose, indices,
+                                              num_segments)
 
   # Inverts the above transpose by moving dimension 0 back to its original
   # position.
@@ -536,13 +541,10 @@ def _ConjugateTransposeGrad(op, grad):
 
 ops.NotDifferentiable("Shape")
 
-
 ops.NotDifferentiable("ShapeN")
 
-
 ops.NotDifferentiable("Rank")
 
-
 ops.NotDifferentiable("Size")
 
 
@@ -590,6 +592,7 @@ def _PadGrad(op, grad):
   else:
     return x_grad, None
 
+
 ops.RegisterGradient("Pad")(_PadGrad)
 ops.RegisterGradient("PadV2")(_PadGrad)
 
@@ -625,30 +628,34 @@ def _ReverseV2Grad(op, grad):
 def _SpaceToBatchGrad(op, grad):
   # Its gradient is the opposite op: BatchToSpace.
   block_size = op.get_attr("block_size")
-  return [array_ops.batch_to_space(grad, op.inputs[1], block_size=block_size),
-          None]
+  return [
+      array_ops.batch_to_space(grad, op.inputs[1], block_size=block_size), None
+  ]
 
 
 @ops.RegisterGradient("SpaceToBatchND")
 def _SpaceToBatchNDGrad(op, grad):
   # Its gradient is the opposite op: BatchToSpaceND.
-  return [array_ops.batch_to_space_nd(grad, op.inputs[1], op.inputs[2]),
-          None, None]
+  return [
+      array_ops.batch_to_space_nd(grad, op.inputs[1], op.inputs[2]), None, None
+  ]
 
 
 @ops.RegisterGradient("BatchToSpace")
 def _BatchToSpaceGrad(op, grad):
   # Its gradient is the opposite op: SpaceToBatch.
   block_size = op.get_attr("block_size")
-  return [array_ops.space_to_batch(grad, op.inputs[1], block_size=block_size),
-          None]
+  return [
+      array_ops.space_to_batch(grad, op.inputs[1], block_size=block_size), None
+  ]
 
 
 @ops.RegisterGradient("BatchToSpaceND")
 def _BatchToSpaceNDGrad(op, grad):
   # Its gradient is the opposite op: SpaceToBatchND.
-  return [array_ops.space_to_batch_nd(grad, op.inputs[1], op.inputs[2]),
-          None, None]
+  return [
+      array_ops.space_to_batch_nd(grad, op.inputs[1], op.inputs[2]), None, None
+  ]
 
 
 @ops.RegisterGradient("SpaceToDepth")
@@ -712,30 +719,28 @@ def _QuantizeAndDequantizeV3Grad(_, grad):
 def _ExtractImagePatchesGrad(op, grad):
 
   batch_size, rows_in, cols_in, channels = [
-    dim.value for dim in op.inputs[0].get_shape()
+      dim.value for dim in op.inputs[0].get_shape()
   ]
   input_bhwc = array_ops.shape(op.inputs[0])
   batch_size = input_bhwc[0]
   channels = input_bhwc[3]
 
-  _, rows_out, cols_out, _ = [
-    dim.value for dim in op.outputs[0].get_shape()
-  ]
-  _, ksize_r, ksize_c, _ = op.get_attr('ksizes')
-  _, stride_r, stride_h, _ = op.get_attr('strides')
-  _, rate_r, rate_c, _ = op.get_attr('rates')
-  padding = op.get_attr('padding')
+  _, rows_out, cols_out, _ = [dim.value for dim in op.outputs[0].get_shape()]
+  _, ksize_r, ksize_c, _ = op.get_attr("ksizes")
+  _, stride_r, stride_h, _ = op.get_attr("strides")
+  _, rate_r, rate_c, _ = op.get_attr("rates")
+  padding = op.get_attr("padding")
 
   ksize_r_eff = ksize_r + (ksize_r - 1) * (rate_r - 1)
   ksize_c_eff = ksize_c + (ksize_c - 1) * (rate_c - 1)
 
-  if padding == b'SAME':
+  if padding == b"SAME":
     rows_out = int(ceil(rows_in / stride_r))
     cols_out = int(ceil(cols_in / stride_h))
     pad_rows = ((rows_out - 1) * stride_r + ksize_r_eff - rows_in) // 2
     pad_cols = ((cols_out - 1) * stride_h + ksize_c_eff - cols_in) // 2
 
-  elif padding == b'VALID':
+  elif padding == b"VALID":
     rows_out = int(ceil((rows_in - ksize_r_eff + 1) / stride_r))
     cols_out = int(ceil((cols_in - ksize_c_eff + 1) / stride_h))
     pad_rows = (rows_out - 1) * stride_r + ksize_r_eff - rows_in
@@ -744,10 +749,9 @@ def _ExtractImagePatchesGrad(op, grad):
   pad_rows, pad_cols = max(0, pad_rows), max(0, pad_cols)
 
   grad_expanded = array_ops.transpose(
-    array_ops.reshape(grad, (batch_size, rows_out,
-                             cols_out, ksize_r, ksize_c, channels)),
-    (1, 2, 3, 4, 0, 5)
-  )
+      array_ops.reshape(
+          grad, (batch_size, rows_out, cols_out, ksize_r, ksize_c, channels)),
+      (1, 2, 3, 4, 0, 5))
   grad_flat = array_ops.reshape(grad_expanded, (-1, batch_size * channels))
 
   row_steps = range(0, rows_out * stride_r, stride_r)
@@ -759,29 +763,21 @@ def _ExtractImagePatchesGrad(op, grad):
       r_low, c_low = row_steps[i] - pad_rows, col_steps[j] - pad_cols
       r_high, c_high = r_low + ksize_r_eff, c_low + ksize_c_eff
 
-      idx.extend([(r * (cols_in) + c,
-                   i * (cols_out * ksize_r * ksize_c) +
-                   j * (ksize_r * ksize_c) +
-                   ri * (ksize_c) + ci)
+      idx.extend([(r * (cols_in) + c, i * (cols_out * ksize_r * ksize_c) + j *
+                   (ksize_r * ksize_c) + ri * (ksize_c) + ci)
                   for (ri, r) in enumerate(range(r_low, r_high, rate_r))
                   for (ci, c) in enumerate(range(c_low, c_high, rate_c))
-                  if 0 <= r and r < rows_in and 0 <= c and c < cols_in
-      ])
+                  if 0 <= r and r < rows_in and 0 <= c and c < cols_in])
 
-  sp_shape = (rows_in * cols_in,
-              rows_out * cols_out * ksize_r * ksize_c)
+  sp_shape = (rows_in * cols_in, rows_out * cols_out * ksize_r * ksize_c)
 
   sp_mat = sparse_tensor.SparseTensor(
-    array_ops.constant(idx, dtype=ops.dtypes.int64),
-    array_ops.ones((len(idx),), dtype=ops.dtypes.float32),
-    sp_shape
-  )
+      array_ops.constant(idx, dtype=ops.dtypes.int64),
+      array_ops.ones((len(idx),), dtype=ops.dtypes.float32), sp_shape)
 
   jac = sparse_ops.sparse_tensor_dense_matmul(sp_mat, grad_flat)
 
-  grad_out = array_ops.reshape(
-    jac, (rows_in, cols_in, batch_size, channels)
-  )
+  grad_out = array_ops.reshape(jac, (rows_in, cols_in, batch_size, channels))
   grad_out = array_ops.transpose(grad_out, (2, 0, 1, 3))
 
   return [grad_out]
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index d379eccc20..49191c647d 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Control Flow Operations.
 
 See the @{$python/control_flow_ops} guide.
@@ -84,7 +83,6 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.tf_export import tf_export
 
-
 # We override the 'tuple' for a control flow op, so we keep python's
 # existing 'tuple' for later use in this module.
 _basetuple = tuple
@@ -156,9 +154,10 @@ def Assert(condition, data, summarize=None, name=None):
       xs = ops.convert_n_to_tensor(data)
       data_str = [_summarize_eager(x, summarize) for x in xs]
       raise errors.InvalidArgumentError(
-          node_def=None, op=None,
-          message="Expected '%s' to be true. Summarized data: %s" % (
-              condition, "\n".join(data_str)))
+          node_def=None,
+          op=None,
+          message="Expected '%s' to be true. Summarized data: %s" %
+          (condition, "\n".join(data_str)))
     return
 
   with ops.name_scope(name, "Assert", [condition, data]) as name:
@@ -167,15 +166,15 @@ def Assert(condition, data, summarize=None, name=None):
       # As a simple heuristic, we assume that string and int32 are
       # on host to avoid the need to use cond. If it is not case,
       # we will pay the price copying the tensor to host memory.
-      return gen_logging_ops._assert(
-          condition, data, summarize, name="Assert")
+      return gen_logging_ops._assert(condition, data, summarize, name="Assert")
     else:
       condition = ops.convert_to_tensor(condition, name="Condition")
+
       def true_assert():
         return gen_logging_ops._assert(
             condition, data, summarize, name="Assert")
-      guarded_assert = cond(
-          condition, no_op, true_assert, name="AssertGuard")
+
+      guarded_assert = cond(condition, no_op, true_assert, name="AssertGuard")
       if context.in_eager_mode():
         return
       return guarded_assert.op
@@ -215,7 +214,7 @@ def _Identity(data, name=None):
 def _NextIteration(data, name=None):
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
-    if data.dtype._is_ref_dtype:   # pylint: disable=protected-access
+    if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return ref_next_iteration(data, name=name)
     else:
       return next_iteration(data, name=name)
@@ -234,8 +233,13 @@ def _NextIteration(data, name=None):
       return sparse_tensor.SparseTensor(indices, values, dense_shape)
 
 
-def _Enter(data, frame_name, is_constant=False, parallel_iterations=10,
-           use_ref=True, use_input_shape=True, name=None):
+def _Enter(data,
+           frame_name,
+           is_constant=False,
+           parallel_iterations=10,
+           use_ref=True,
+           use_input_shape=True,
+           name=None):
   """Creates or finds a child frame, and makes `data` available to it.
 
   The unique `frame_name` is used by the `Executor` to identify frames. If
@@ -257,35 +261,51 @@ def _Enter(data, frame_name, is_constant=False, parallel_iterations=10,
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype and use_ref:  # pylint: disable=protected-access
-      result = ref_enter(data, frame_name, is_constant, parallel_iterations,
-                         name=name)
+      result = ref_enter(
+          data, frame_name, is_constant, parallel_iterations, name=name)
     else:
-      result = enter(data, frame_name, is_constant, parallel_iterations,
-                     name=name)
+      result = enter(
+          data, frame_name, is_constant, parallel_iterations, name=name)
     if use_input_shape:
       result.set_shape(data.get_shape())
     return result
   else:
     if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
       raise TypeError("Type %s not supported" % type(data))
-    values = _Enter(data.values, frame_name, is_constant,
-                    parallel_iterations=parallel_iterations,
-                    use_input_shape=use_input_shape, name=name)
-    indices = enter(data.indices, frame_name, is_constant,
-                    parallel_iterations, name="indices")
+    values = _Enter(
+        data.values,
+        frame_name,
+        is_constant,
+        parallel_iterations=parallel_iterations,
+        use_input_shape=use_input_shape,
+        name=name)
+    indices = enter(
+        data.indices,
+        frame_name,
+        is_constant,
+        parallel_iterations,
+        name="indices")
     if use_input_shape:
       indices.set_shape(data.indices.get_shape())
     if isinstance(data, ops.IndexedSlices):
       dense_shape = data.dense_shape
       if dense_shape is not None:
-        dense_shape = enter(dense_shape, frame_name, is_constant,
-                            parallel_iterations, name="dense_shape")
+        dense_shape = enter(
+            dense_shape,
+            frame_name,
+            is_constant,
+            parallel_iterations,
+            name="dense_shape")
         if use_input_shape:
           dense_shape.set_shape(data.dense_shape.get_shape())
       return ops.IndexedSlices(values, indices, dense_shape)
     else:
-      dense_shape = enter(data.dense_shape, frame_name, is_constant,
-                          parallel_iterations, name="dense_shape")
+      dense_shape = enter(
+          data.dense_shape,
+          frame_name,
+          is_constant,
+          parallel_iterations,
+          name="dense_shape")
       if use_input_shape:
         dense_shape.set_shape(data.dense_shape.get_shape())
       return sparse_tensor.SparseTensor(indices, values, dense_shape)
@@ -444,8 +464,10 @@ def merge(inputs, name=None):
   if any([inp is None for inp in inputs]):
     raise ValueError("At least one of the merge inputs is None: %s" % inputs)
   with ops.name_scope(name, "Merge", inputs) as name:
-    inputs = [ops.internal_convert_to_tensor_or_indexed_slices(inp, as_ref=True)
-              for inp in inputs]
+    inputs = [
+        ops.internal_convert_to_tensor_or_indexed_slices(inp, as_ref=True)
+        for inp in inputs
+    ]
     if all([isinstance(v, ops.Tensor) for v in inputs]):
       if all([v.dtype._is_ref_dtype for v in inputs]):  # pylint: disable=protected-access
         return gen_control_flow_ops._ref_merge(inputs, name)
@@ -475,6 +497,8 @@ def merge(inputs, name=None):
       else:
         dense_shape = None
       return ops.IndexedSlices(values, indices, dense_shape), chosen_index
+
+
 # pylint: enable=protected-access
 
 
@@ -488,7 +512,9 @@ def _convert_tensorarray_to_flow(tensor_or_tensor_array):
 def _make_tensor_array(ta, t_or_flow):
   # pylint: disable=protected-access
   new_ta = tensor_array_ops.TensorArray(
-      dtype=ta.dtype, handle=ta.handle, flow=t_or_flow,
+      dtype=ta.dtype,
+      handle=ta.handle,
+      flow=t_or_flow,
       infer_shape=ta._infer_shape,
       colocate_with_first_write_call=ta._colocate_with_first_write_call)
   new_ta._colocate_with = ta._colocate_with
@@ -500,13 +526,13 @@ def _make_tensor_array(ta, t_or_flow):
 def _convert_flows_to_tensorarrays(tensors_or_tensorarrays, tensors_or_flows):
   if len(tensors_or_tensorarrays) != len(tensors_or_flows):
     raise ValueError(
-        "Lengths of original Tensor list and new list do not match: %d vs. %d"
-        % (len(tensors_or_tensorarrays), len(tensors_or_flows)))
+        "Lengths of original Tensor list and new list do not match: %d vs. %d" %
+        (len(tensors_or_tensorarrays), len(tensors_or_flows)))
   return [
       _make_tensor_array(ta, t_or_flow)
-      if isinstance(ta, tensor_array_ops.TensorArray)
-      else t_or_flow
-      for (ta, t_or_flow) in zip(tensors_or_tensorarrays, tensors_or_flows)]
+      if isinstance(ta, tensor_array_ops.TensorArray) else t_or_flow
+      for (ta, t_or_flow) in zip(tensors_or_tensorarrays, tensors_or_flows)
+  ]
 
 
 def _ShapeLessThanOrEqual(shape1, shape2):
@@ -545,8 +571,8 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
         raise ValueError(
             "The shape invariant specified for %s is not compatible with "
             "the initial shape of the loop variable. It enters the loop "
-            "with shape %s, but the specified shape invariant is %s."
-            % (inp.name, inp.get_shape(), shape))
+            "with shape %s, but the specified shape invariant is %s." %
+            (inp.name, inp.get_shape(), shape))
       var.set_shape(shape)
     else:
       if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
@@ -557,8 +583,8 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
               "The shape invariant specified for %s is not compatible with "
               "the initial shape of the values tensor of this IndexedSlices. "
               "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s."
-              % (inp.values.name, inp.values.get_shape(), shape))
+              "invariant is %s." % (inp.values.name, inp.values.get_shape(),
+                                    shape))
         var.values.set_shape(shape)
         var.indices.set_shape(tensor_shape.TensorShape([shape[0]]))
         if var.dense_shape is not None:
@@ -569,8 +595,8 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
               "The shape invariant specified for %s is not compatible with "
               "the initial shape of the shape tensor of this SparseTensor. "
               "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s."
-              % (inp.dense_shape.name, inp.dense_shape.get_shape(), shape))
+              "invariant is %s." % (inp.dense_shape.name,
+                                    inp.dense_shape.get_shape(), shape))
         var.values.set_shape(tensor_shape.TensorShape([None]))
         var.indices.set_shape(tensor_shape.TensorShape([None, shape.ndims]))
         var.dense_shape.set_shape(shape)
@@ -599,8 +625,8 @@ def _EnforceShapeInvariant(merge_var, next_var):
           "The shape for %s is not an invariant for the loop. It enters "
           "the loop with shape %s, but has shape %s after one iteration. "
           "Provide shape invariants using either the `shape_invariants` "
-          "argument of tf.while_loop or set_shape() on the loop variables."
-          % (merge_var.name, m_shape, n_shape))
+          "argument of tf.while_loop or set_shape() on the loop variables." %
+          (merge_var.name, m_shape, n_shape))
   else:
     if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
       raise TypeError("Type %s not supported" % type(var))
@@ -623,9 +649,9 @@ def _EnforceShapeInvariant(merge_var, next_var):
               "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
               "after one iteration. Provide shape invariants using either the "
               "`shape_invariants` argument of tf.while_loop or set_shape() "
-              "on the loop variables."
-              % (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
-                 n_values_shape, n_indices_shape, n_shape_shape))
+              "on the loop variables." %
+              (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
+               n_values_shape, n_indices_shape, n_shape_shape))
     else:
       m_values_shape = merge_var.values.get_shape()
       m_indices_shape = merge_var.indices.get_shape()
@@ -637,12 +663,12 @@ def _EnforceShapeInvariant(merge_var, next_var):
           not _ShapeLessThanOrEqual(n_indices_shape, m_indices_shape) or
           not _ShapeLessThanOrEqual(n_shape_shape, m_shape_shape)):
         raise ValueError(
-          "The shape for %s is not an invariant for the loop. It enters "
-          "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
-          "after one iteration. Provide shape invariants using either "
-          "the `shape_invariants` argument of tf.while_loop or set_shape() "
-          "on the loop variables."
-          % (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
+            "The shape for %s is not an invariant for the loop. It enters "
+            "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
+            "after one iteration. Provide shape invariants using either "
+            "the `shape_invariants` argument of tf.while_loop or set_shape() "
+            "on the loop variables." %
+            (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
              n_values_shape, n_indices_shape, n_shape_shape))
 
 
@@ -657,7 +683,7 @@ def _AddNextAndBackEdge(m, v, enforce_shape_invariant=True):
       # the types don't match.
       # TODO(skyewm): call this for other cases below (needs testing)
       _EnforceShapeInvariant(m, v)
-    m.op._update_input(1, v)   # pylint: disable=protected-access
+    m.op._update_input(1, v)  # pylint: disable=protected-access
   elif isinstance(m, ops.IndexedSlices):
     # pylint: disable=protected-access
     v = math_ops._as_indexed_slices(v, optimize=False)
@@ -720,8 +746,7 @@ def GetMaxSizeFromNestedMaximumIterations(value, while_ctxt):
       raise ValueError(
           "Cannot create a gradient accumulator for tensor '%s' inside "
           "XLA while_loop because maximum_iterations was not passed to "
-          "the tf.while_loop call ('%s')."
-          % (value_name, while_ctxt.name))
+          "the tf.while_loop call ('%s')." % (value_name, while_ctxt.name))
 
     # pylint: disable=protected-access
     max_iter_ctxt = max_iter.op._get_control_flow_context()
@@ -742,9 +767,9 @@ def GetMaxSizeFromNestedMaximumIterations(value, while_ctxt):
             "while_loop. maximum_iterations tensor '%s' for while_loop context "
             "'%s' must be statically known (e.g. a constant value or known "
             "shape dimension), or be defined at or outside the while loop "
-            "context '%s' (currently defined in '%s')." % (
-                value_name, max_iter.name, while_ctxt.name,
-                curr_ctxt_name, max_iter_ctxt.name))
+            "context '%s' (currently defined in '%s')." %
+            (value_name, max_iter.name, while_ctxt.name, curr_ctxt_name,
+             max_iter_ctxt.name))
       max_size *= const_max_iter
 
     # Find the next outer WhileContext (or stop if we reach the
@@ -808,9 +833,11 @@ class GradLoopState(object):
       outer_forward_ctxt = forward_ctxt.outer_context
 
     # Add the forward loop counter.
-    if outer_forward_ctxt: outer_forward_ctxt.Enter()
+    if outer_forward_ctxt:
+      outer_forward_ctxt.Enter()
     cnt, forward_index = forward_ctxt.AddForwardLoopCounter(outer_grad_state)
-    if outer_forward_ctxt: outer_forward_ctxt.Exit()
+    if outer_forward_ctxt:
+      outer_forward_ctxt.Exit()
     self._forward_context = forward_ctxt
     self._forward_index = forward_index
 
@@ -835,7 +862,8 @@ class GradLoopState(object):
           real_cnt, outer_grad_state)
       outer_grad_ctxt.Exit()
     else:
-      if outer_forward_ctxt: outer_forward_ctxt.Enter()
+      if outer_forward_ctxt:
+        outer_forward_ctxt.Enter()
       self._grad_context = WhileContext(
           maximum_iterations=forward_ctxt.maximum_iterations,
           parallel_iterations=forward_ctxt.parallel_iterations,
@@ -845,7 +873,8 @@ class GradLoopState(object):
           grad_state=self)
       self._grad_index = self._grad_context.AddBackpropLoopCounter(
           cnt, outer_grad_state)
-      if outer_forward_ctxt: outer_forward_ctxt.Exit()
+      if outer_forward_ctxt:
+        outer_forward_ctxt.Exit()
 
   @property
   def outer_grad_state(self):
@@ -973,7 +1002,8 @@ class GradLoopState(object):
     # curr_ctxt is the context that tf.gradients was called in.
     curr_ctxt = ops.get_default_graph()._get_control_flow_context()  # pylint: disable=protected-access
     with ops.control_dependencies(None):
-      if curr_ctxt: curr_ctxt.Enter()
+      if curr_ctxt:
+        curr_ctxt.Enter()
       with ops.colocate_with(value):
         # We only need to pass maximum_iterations to the stack if
         # we're inside an XLA context.
@@ -984,11 +1014,10 @@ class GradLoopState(object):
               value, self.forward_context)
         # pylint: disable=protected-access
         acc = gen_data_flow_ops._stack_v2(
-            max_size=max_size,
-            elem_type=value.dtype.base_dtype,
-            name="f_acc")
+            max_size=max_size, elem_type=value.dtype.base_dtype, name="f_acc")
         # pylint: enable=protected-access
-      if curr_ctxt: curr_ctxt.Exit()
+      if curr_ctxt:
+        curr_ctxt.Exit()
 
       # Make acc available in the forward context.
       enter_acc = self.forward_context.AddValue(acc)
@@ -1009,8 +1038,7 @@ class GradLoopState(object):
       else:
         # value is in a cond context within the forward context.
         if not isinstance(value_ctxt, CondContext):
-          raise TypeError(
-              "value_ctxt is not a CondContext: %s" % value_ctxt)
+          raise TypeError("value_ctxt is not a CondContext: %s" % value_ctxt)
         if dead_branch:
           # The special case for creating a zero tensor for a dead
           # branch of a switch. See ControlFlowState.ZerosLike().
@@ -1134,8 +1162,8 @@ class GradLoopState(object):
 
       if real_value is None:
         # Add the stack pop op in the grad context.
-        real_value = cur_grad_state.AddBackpropAccumulatedValue(history_value,
-                                                                cur_value)
+        real_value = cur_grad_state.AddBackpropAccumulatedValue(
+            history_value, cur_value)
         if cur_grad_state != self:
           real_value = self._grad_context.AddValue(real_value)
       self._history_map[value.name] = real_value
@@ -1154,7 +1182,7 @@ class ControlFlowState(object):
   """Maintain the mapping from the loops to their grad states."""
 
   def __init__(self):
-    self._map = {}   # maps forward loop context to GradLoopState
+    self._map = {}  # maps forward loop context to GradLoopState
 
   def GetGradState(self, op, before):
     """Return the grad state for this op if it's in a forward loop context."""
@@ -1318,7 +1346,8 @@ class ControlFlowState(object):
     Returns:
       A zero tensor of the same shape of op.outputs[index].
     """
-    if util.IsLoopSwitch(op): return None
+    if util.IsLoopSwitch(op):
+      return None
     dead_branch = util.IsSwitch(op)
     forward_ctxt = _GetWhileContext(op)
     grad_state = self._map.get(forward_ctxt)
@@ -1361,8 +1390,8 @@ class ControlFlowState(object):
       grad_state.grad_context.Enter()
 
       # Create a zero tensor with the right shape.
-      shape = grad_state.AddBackpropAccumulatedValue(
-          history_zeros_shape, zeros_shape, dead_branch)
+      shape = grad_state.AddBackpropAccumulatedValue(history_zeros_shape,
+                                                     zeros_shape, dead_branch)
       result = array_ops.zeros(shape, val.dtype)
     return result
 
@@ -1393,12 +1422,14 @@ class ControlFlowState(object):
           else:
             # Create a zeros in the outer grad context.
             outer_grad_ctxt = grad_state.grad_context.outer_context
-            if outer_grad_ctxt: outer_grad_ctxt.Enter()
+            if outer_grad_ctxt:
+              outer_grad_ctxt.Enter()
             enter_grad_op = b_merge.op.inputs[0].op
             enter_grad = enter_grad_op.inputs[0]
             grad_shape = array_ops.shape_internal(enter_grad, optimize=False)
             grad_val = array_ops.zeros(grad_shape)
-            if outer_grad_ctxt: outer_grad_ctxt.Exit()
+            if outer_grad_ctxt:
+              outer_grad_ctxt.Exit()
             # Use the zeros for iterations > 0.
             grad_state.grad_context.Enter()
             next_grad_val = _NextIteration(grad_val)
@@ -1470,8 +1501,7 @@ class ControlFlowContext(object):
     self._outer_context = ops.get_default_graph()._get_control_flow_context()
     self._context_stack = []
     if values_def:
-      self._init_values_from_proto(values_def,
-                                   import_scope=import_scope)
+      self._init_values_from_proto(values_def, import_scope=import_scope)
     else:
       # Values that have been already seen in this context.
       self._values = set()
@@ -1532,19 +1562,16 @@ class ControlFlowContext(object):
     """
     values_def = control_flow_pb2.ValuesDef()
     values_def.values.extend(
-        [ops.strip_name_scope(v, export_scope)
-         for v in sorted(self._values)])
+        [ops.strip_name_scope(v, export_scope) for v in sorted(self._values)])
     for k, v in self._external_values.items():
       k = ops.strip_name_scope(k, export_scope)
-      values_def.external_values[k] = ops.strip_name_scope(
-          v.name, export_scope)
+      values_def.external_values[k] = ops.strip_name_scope(v.name, export_scope)
     return values_def
 
   @staticmethod
   def _from_proto(values_def, import_scope=None):
     """Returns a `ControlFlowContext` created from `values_def`."""
-    return ControlFlowContext(values_def=values_def,
-                              import_scope=import_scope)
+    return ControlFlowContext(values_def=values_def, import_scope=import_scope)
 
   def AddName(self, name):
     self._values.add(name)
@@ -1599,6 +1626,7 @@ class ControlFlowContext(object):
       op._remove_all_control_inputs()
       op._add_control_inputs(internal_control_inputs)
     return internal_control_inputs
+
   # pylint: enable=protected-access
 
   def AddInnerOp(self, op):
@@ -1626,8 +1654,13 @@ class ControlFlowContext(object):
 class CondContext(ControlFlowContext):
   """The context for the conditional construct."""
 
-  def __init__(self, pred=None, pivot=None, branch=None,
-               name="cond_text", context_def=None, import_scope=None):
+  def __init__(self,
+               pred=None,
+               pivot=None,
+               branch=None,
+               name="cond_text",
+               context_def=None,
+               import_scope=None):
     """Creates a `CondContext`.
 
     Args:
@@ -1647,9 +1680,9 @@ class CondContext(ControlFlowContext):
     else:
       # Initializes the default fields.
       ControlFlowContext.__init__(self)
-      self._pred = pred         # The boolean tensor for the cond predicate
-      self._pivot = pivot       # The predicate tensor in this branch
-      self._branch = branch     # 0 or 1 representing this branch
+      self._pred = pred  # The boolean tensor for the cond predicate
+      self._pivot = pivot  # The predicate tensor in this branch
+      self._branch = branch  # 0 or 1 representing this branch
 
       # Values considered to have been already seen in this context.
       self._values.add(pred.name)
@@ -1665,15 +1698,14 @@ class CondContext(ControlFlowContext):
     assert isinstance(context_def, control_flow_pb2.CondContextDef)
     # Create from context_def.
     g = ops.get_default_graph()
-    self._name = ops.prepend_name_scope(
-        context_def.context_name, import_scope)
-    self._pred = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pred_name, import_scope))
-    self._pivot = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pivot_name, import_scope))
+    self._name = ops.prepend_name_scope(context_def.context_name, import_scope)
+    self._pred = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pred_name, import_scope))
+    self._pivot = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pivot_name, import_scope))
     self._branch = context_def.branch
-    super(CondContext, self).__init__(values_def=context_def.values_def,
-                                      import_scope=import_scope)
+    super(CondContext, self).__init__(
+        values_def=context_def.values_def, import_scope=import_scope)
 
   @property
   def pred(self):
@@ -1711,18 +1743,16 @@ class CondContext(ControlFlowContext):
     Returns:
       A `CondContextDef` protocol buffer.
     """
-    if (export_scope is None or
-        self.name.startswith(export_scope)):
+    if (export_scope is None or self.name.startswith(export_scope)):
       context_def = control_flow_pb2.CondContextDef()
-      context_def.context_name = ops.strip_name_scope(
-          self.name, export_scope)
-      context_def.pred_name = ops.strip_name_scope(
-          self._pred.name, export_scope)
-      context_def.pivot_name = ops.strip_name_scope(
-          self._pivot.name, export_scope)
+      context_def.context_name = ops.strip_name_scope(self.name, export_scope)
+      context_def.pred_name = ops.strip_name_scope(self._pred.name,
+                                                   export_scope)
+      context_def.pivot_name = ops.strip_name_scope(self._pivot.name,
+                                                    export_scope)
       context_def.branch = self._branch
-      context_def.values_def.MergeFrom(super(CondContext, self)._to_proto(
-          export_scope))
+      context_def.values_def.MergeFrom(
+          super(CondContext, self)._to_proto(export_scope))
 
       return context_def
     else:
@@ -1731,8 +1761,7 @@ class CondContext(ControlFlowContext):
   @staticmethod
   def from_proto(context_def, import_scope=None):
     """Returns a `CondContext` object created from `context_def`."""
-    return CondContext(context_def=context_def,
-                       import_scope=import_scope)
+    return CondContext(context_def=context_def, import_scope=import_scope)
 
   def AddValue(self, val):
     """Add `val` to the current context and its outer context recursively."""
@@ -1846,8 +1875,8 @@ class CondContext(ControlFlowContext):
         if original_result is None:
           return no_op(), None
         else:
-          original_result = nest.map_structure(
-              array_ops.identity, original_result)
+          original_result = nest.map_structure(array_ops.identity,
+                                               original_result)
     if original_result is None:
       return None, None
 
@@ -1871,11 +1900,15 @@ def _UnpackIfSingleton(res):
 # pylint: disable=g-doc-args
 @tf_export("cond")
 @deprecation.deprecated_args(
-    None,
-    "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.",
+    None, "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.",
     "fn1", "fn2")
-def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
-         fn1=None, fn2=None):
+def cond(pred,
+         true_fn=None,
+         false_fn=None,
+         strict=False,
+         name=None,
+         fn1=None,
+         fn2=None):
   """Return `true_fn()` if the predicate `pred` is true else `false_fn()`.
 
   `true_fn` and `false_fn` both return lists of output tensors. `true_fn` and
@@ -2044,6 +2077,8 @@ def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
     if not strict:
       merges = _UnpackIfSingleton(merges)
     return merges
+
+
 # pylint: enable=g-doc-args
 # pylint: enable=redefined-outer-name
 
@@ -2139,8 +2174,7 @@ class WhileContext(ControlFlowContext):
     assert isinstance(context_def, control_flow_pb2.WhileContextDef)
     # Create from context_def.
     g = ops.get_default_graph()
-    self._name = ops.prepend_name_scope(
-        context_def.context_name, import_scope)
+    self._name = ops.prepend_name_scope(context_def.context_name, import_scope)
     if context_def.maximum_iterations_name:
       self._maximum_iterations = g.as_graph_element(
           ops.prepend_name_scope(context_def.maximum_iterations_name,
@@ -2150,25 +2184,27 @@ class WhileContext(ControlFlowContext):
     self._parallel_iterations = context_def.parallel_iterations
     self._back_prop = context_def.back_prop
     self._swap_memory = context_def.swap_memory
-    self._pivot_for_pred = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pivot_for_pred_name, import_scope))
+    self._pivot_for_pred = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pivot_for_pred_name, import_scope))
     # We use this node to control constants created by the body lambda.
-    self._pivot_for_body = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pivot_for_body_name, import_scope))
+    self._pivot_for_body = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pivot_for_body_name, import_scope))
     # The boolean tensor for loop termination condition. Used in code
     # generation for gradient computation.
     self._pivot = g.as_graph_element(
         ops.prepend_name_scope(context_def.pivot_name, import_scope))
     # The list of exit tensors for loop variables.
-    self._loop_exits = [g.as_graph_element(
-        ops.prepend_name_scope(exit_name, import_scope))
-                        for exit_name in context_def.loop_exit_names]
+    self._loop_exits = [
+        g.as_graph_element(ops.prepend_name_scope(exit_name, import_scope))
+        for exit_name in context_def.loop_exit_names
+    ]
     # The list of enter tensors for loop variables.
-    self._loop_enters = [g.as_graph_element(
-        ops.prepend_name_scope(enter_name, import_scope))
-                         for enter_name in context_def.loop_enter_names]
-    super(WhileContext, self).__init__(values_def=context_def.values_def,
-                                       import_scope=import_scope)
+    self._loop_enters = [
+        g.as_graph_element(ops.prepend_name_scope(enter_name, import_scope))
+        for enter_name in context_def.loop_enter_names
+    ]
+    super(WhileContext, self).__init__(
+        values_def=context_def.values_def, import_scope=import_scope)
 
   @property
   def maximum_iterations(self):
@@ -2219,11 +2255,9 @@ class WhileContext(ControlFlowContext):
     Returns:
       A `WhileContextDef` protocol buffer.
     """
-    if (export_scope is None or
-        self.name.startswith(export_scope)):
+    if (export_scope is None or self.name.startswith(export_scope)):
       context_def = control_flow_pb2.WhileContextDef()
-      context_def.context_name = ops.strip_name_scope(
-          self.name, export_scope)
+      context_def.context_name = ops.strip_name_scope(self.name, export_scope)
       context_def.parallel_iterations = self._parallel_iterations
       if self._maximum_iterations is not None:
         context_def.maximum_iterations_name = ops.strip_name_scope(
@@ -2234,17 +2268,16 @@ class WhileContext(ControlFlowContext):
           self._pivot_for_pred.name, export_scope)
       context_def.pivot_for_body_name = ops.strip_name_scope(
           self._pivot_for_body.name, export_scope)
-      context_def.pivot_name = ops.strip_name_scope(
-          self._pivot.name, export_scope)
-      context_def.loop_exit_names.extend(
-          [ops.strip_name_scope(l.name, export_scope)
-           for l in self._loop_exits])
-      context_def.loop_enter_names.extend(
-          [ops.strip_name_scope(l.name, export_scope)
-           for l in self._loop_enters])
+      context_def.pivot_name = ops.strip_name_scope(self._pivot.name,
+                                                    export_scope)
+      context_def.loop_exit_names.extend([
+          ops.strip_name_scope(l.name, export_scope) for l in self._loop_exits
+      ])
+      context_def.loop_enter_names.extend([
+          ops.strip_name_scope(l.name, export_scope) for l in self._loop_enters
+      ])
       context_def.values_def.MergeFrom(
-          super(WhileContext, self)._to_proto(
-              export_scope=export_scope))
+          super(WhileContext, self)._to_proto(export_scope=export_scope))
 
       return context_def
     else:
@@ -2261,8 +2294,7 @@ class WhileContext(ControlFlowContext):
     Returns:
       A `WhileContext` Python object.
     """
-    return WhileContext(context_def=context_def,
-                        import_scope=import_scope)
+    return WhileContext(context_def=context_def, import_scope=import_scope)
 
   def GetWhileContext(self):
     return self
@@ -2299,8 +2331,11 @@ class WhileContext(ControlFlowContext):
         result = self._outer_context.AddValue(val)
       # Create an Enter to make `result` known to this loop context.
       with ops.control_dependencies(None):
-        enter = _Enter(result, self._name, is_constant=True,
-                       parallel_iterations=self._parallel_iterations)
+        enter = _Enter(
+            result,
+            self._name,
+            is_constant=True,
+            parallel_iterations=self._parallel_iterations)
         enter.graph.prevent_feeding(enter)
         if self._outer_context:
           self._outer_context.AddInnerOp(enter.op)
@@ -2378,6 +2413,7 @@ class WhileContext(ControlFlowContext):
 
   def _MaybeAddControlDependency(self, op):
     """Add a control input to the op if it only depends on loop invariants."""
+
     def _IsOpFree(op):
       """Determines if `op` needs a control dependency."""
       if op.control_inputs:
@@ -2390,6 +2426,7 @@ class WhileContext(ControlFlowContext):
         if not util.IsLoopConstantEnter(x.op):
           return False
       return True
+
     if _IsOpFree(op):
       # pylint: disable=protected-access
       op._add_control_input(self.GetControlPivot().op)
@@ -2423,9 +2460,12 @@ class WhileContext(ControlFlowContext):
 
     self.Enter()
     self.AddName(n.name)
-    enter_n = _Enter(n, self._name, is_constant=False,
-                     parallel_iterations=self._parallel_iterations,
-                     name="f_count")
+    enter_n = _Enter(
+        n,
+        self._name,
+        is_constant=False,
+        parallel_iterations=self._parallel_iterations,
+        name="f_count")
     self.loop_enters.append(enter_n)
 
     merge_n = merge([enter_n, enter_n])[0]
@@ -2465,9 +2505,12 @@ class WhileContext(ControlFlowContext):
 
     self.Enter()
     self.AddName(count.name)
-    enter_count = _Enter(count, self._name, is_constant=False,
-                         parallel_iterations=self._parallel_iterations,
-                         name="b_count")
+    enter_count = _Enter(
+        count,
+        self._name,
+        is_constant=False,
+        parallel_iterations=self._parallel_iterations,
+        name="b_count")
     self.loop_enters.append(enter_count)
 
     merge_count = merge([enter_count, enter_count])[0]
@@ -2525,9 +2568,11 @@ class WhileContext(ControlFlowContext):
     # without running any iterations.
     shape = grad.get_shape()
     if shape.is_fully_defined():
-      if self.outer_context: self.outer_context.Enter()
+      if self.outer_context:
+        self.outer_context.Enter()
       acc = constant_op.constant(0, grad.dtype, shape=shape, name="b_acc")
-      if self.outer_context: self.outer_context.Exit()
+      if self.outer_context:
+        self.outer_context.Exit()
     else:
       value = op.inputs[0]
       if (isinstance(self.outer_context, WhileContext) and
@@ -2546,16 +2591,21 @@ class WhileContext(ControlFlowContext):
         acc = array_ops.zeros(real_shape, grad.dtype)
         self.outer_context.Exit()
       else:
-        if self.outer_context: self.outer_context.Enter()
+        if self.outer_context:
+          self.outer_context.Enter()
         zeros_shape = array_ops.shape_internal(value, optimize=False)
         acc = array_ops.zeros(zeros_shape, grad.dtype)
-        if self.outer_context: self.outer_context.Exit()
+        if self.outer_context:
+          self.outer_context.Exit()
 
     self.Enter()
     self.AddName(acc.name)
-    enter_acc = _Enter(acc, self._name, is_constant=False,
-                       parallel_iterations=self._parallel_iterations,
-                       name="b_acc")
+    enter_acc = _Enter(
+        acc,
+        self._name,
+        is_constant=False,
+        parallel_iterations=self._parallel_iterations,
+        name="b_acc")
     self.loop_enters.append(enter_acc)
 
     merge_acc = merge([enter_acc, enter_acc], name="b_acc")[0]
@@ -2588,14 +2638,17 @@ class WhileContext(ControlFlowContext):
     dense_shape = grad.dense_shape
 
     self.Exit()
-    if self.outer_context: self.outer_context.Enter()
+    if self.outer_context:
+      self.outer_context.Enter()
     if values.get_shape().is_fully_defined():
       values_shape = tensor_shape.TensorShape(
           [tensor_shape.Dimension(1)] + values.get_shape().dims[1:])
-      if self.outer_context: self.outer_context.Enter()
-      values_acc = constant_op.constant(0, values.dtype, shape=values_shape,
-                                        name="b_acc")
-      if self.outer_context: self.outer_context.Exit()
+      if self.outer_context:
+        self.outer_context.Enter()
+      values_acc = constant_op.constant(
+          0, values.dtype, shape=values_shape, name="b_acc")
+      if self.outer_context:
+        self.outer_context.Exit()
     else:
       values_shape = _resource_safe_shape(op.inputs[0])[1:]
       values_shape = array_ops.concat([[1], values_shape], 0)
@@ -2604,16 +2657,19 @@ class WhileContext(ControlFlowContext):
     shape_acc = None
     if dense_shape is not None:
       if dense_shape.get_shape().is_fully_defined():
-        if self.outer_context: self.outer_context.Enter()
-        shape_acc = constant_op.constant(0, dense_shape.dtype,
-                                         shape=dense_shape.get_shape())
-        if self.outer_context: self.outer_context.Exit()
+        if self.outer_context:
+          self.outer_context.Enter()
+        shape_acc = constant_op.constant(
+            0, dense_shape.dtype, shape=dense_shape.get_shape())
+        if self.outer_context:
+          self.outer_context.Exit()
       else:
         shape_acc = array_ops.zeros_like(
             array_ops.shape_internal(op.inputs[0], optimize=False),
             optimize=False)
 
-    if self.outer_context: self.outer_context.Exit()
+    if self.outer_context:
+      self.outer_context.Exit()
 
     self.Enter()
     self.AddName(values_acc.name)
@@ -2626,9 +2682,15 @@ class WhileContext(ControlFlowContext):
     # Set use_input_shape=False since the accumulator tensors will grow in
     # size. If use_input_shape=True, the _update_input call below will result in
     # incompatible shapes.
-    enter_acc = [_Enter(x, self._name, is_constant=False,
-                        parallel_iterations=self._parallel_iterations,
-                        use_input_shape=False, name="b_acc") for x in init_acc]
+    enter_acc = [
+        _Enter(
+            x,
+            self._name,
+            is_constant=False,
+            parallel_iterations=self._parallel_iterations,
+            use_input_shape=False,
+            name="b_acc") for x in init_acc
+    ]
     # Manually set appropriate partial shapes.
     enter_acc[0].set_shape([None])
     if values_acc.shape.dims is not None:
@@ -2645,8 +2707,7 @@ class WhileContext(ControlFlowContext):
     ]
     if shape_acc is not None:
       # For the shape we just keep the maximum
-      acc_indexed_slices.append(
-          math_ops.maximum(dense_shape, switch_acc[2][1]))
+      acc_indexed_slices.append(math_ops.maximum(dense_shape, switch_acc[2][1]))
 
     next_acc = [_NextIteration(x) for x in acc_indexed_slices]
     for xm, xn in zip(merge_acc, next_acc):
@@ -2657,7 +2718,8 @@ class WhileContext(ControlFlowContext):
 
     self.ExitResult(exit_acc)
     return ops.IndexedSlices(
-        indices=exit_acc[0], values=exit_acc[1],
+        indices=exit_acc[0],
+        values=exit_acc[1],
         dense_shape=exit_acc[2] if shape_acc is not None else None)
 
   def _InitializeValues(self, values):
@@ -2690,10 +2752,14 @@ class WhileContext(ControlFlowContext):
     if self._outer_context:
       real_vars = [self._outer_context.AddValue(x) for x in loop_vars]
     with ops.control_dependencies(None):
-      enter_vars = [_Enter(x, self._name, is_constant=False,
-                           parallel_iterations=self._parallel_iterations,
-                           use_input_shape=(shape_invariants is None))
-                    for x in real_vars]
+      enter_vars = [
+          _Enter(
+              x,
+              self._name,
+              is_constant=False,
+              parallel_iterations=self._parallel_iterations,
+              use_input_shape=(shape_invariants is None)) for x in real_vars
+      ]
       for x in enter_vars:
         x.graph.prevent_feeding(x)
         if self._outer_context:
@@ -2754,11 +2820,13 @@ class WhileContext(ControlFlowContext):
       summary_ref = ops.get_collection_ref(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
       summary_ref[:] = pre_summaries
       with ops.control_dependencies(new_summaries):
+
         def map_fn(x):
           # TODO(apassos) figure out how to trigger with tensor arrays as well
           if isinstance(x, tensor_array_ops.TensorArray):
             return x
           return array_ops.identity(x)
+
         body_result = nest.map_structure(map_fn, body_result)
 
     # Compare the structure types of input and output of body.
@@ -2815,8 +2883,7 @@ class WhileContext(ControlFlowContext):
     packed_exit_vars = nest.pack_sequence_as(
         structure=original_body_result,
         flat_sequence=exit_vars_with_tensor_arrays)
-    return (packed_exit_vars[0] if len(exit_vars) == 1
-            else packed_exit_vars)
+    return (packed_exit_vars[0] if len(exit_vars) == 1 else packed_exit_vars)
 
   def _FixControlInputsAndContext(self, enters):
     graph = ops.get_default_graph()
@@ -2834,8 +2901,9 @@ class WhileContext(ControlFlowContext):
       for x in xs:
         inp_op = x.op.inputs[0].op
         control_inputs = graph._control_dependencies_for_inputs([inp_op])
-        outer_control_inputs = [op for op in control_inputs
-                                if self._IsInOuterContext(op)]
+        outer_control_inputs = [
+            op for op in control_inputs if self._IsInOuterContext(op)
+        ]
         x.op._set_control_flow_context(self)
         x.op._add_control_inputs(outer_control_inputs)
         graph._record_op_seen_by_control_dependencies(x.op)
@@ -2847,9 +2915,15 @@ class WhileContext(ControlFlowContext):
 
 # pylint: disable=redefined-outer-name
 @tf_export("while_loop")
-def while_loop(cond, body, loop_vars, shape_invariants=None,
-               parallel_iterations=10, back_prop=True, swap_memory=False,
-               name=None, maximum_iterations=None):
+def while_loop(cond,
+               body,
+               loop_vars,
+               shape_invariants=None,
+               parallel_iterations=10,
+               back_prop=True,
+               swap_memory=False,
+               name=None,
+               maximum_iterations=None):
   """Repeat `body` while the condition `cond` is true.
 
   `cond` is a callable returning a boolean scalar tensor. `body` is a callable
@@ -3024,6 +3098,8 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
       return result[1]
     else:
       return result
+
+
 # pylint: enable=redefined-outer-name
 
 
@@ -3051,8 +3127,9 @@ def _AsTensorList(x, p):
     if isinstance(v, ops.Tensor):
       l.append(array_ops.identity(v))
     else:
-      l.append(ops.IndexedSlices(array_ops.identity(v.values),
-                                 array_ops.identity(v.indices)))
+      l.append(
+          ops.IndexedSlices(
+              array_ops.identity(v.values), array_ops.identity(v.indices)))
   return l
 
 
@@ -3062,8 +3139,7 @@ def _CheckResults(a, b):
   for x, y in zip(a, b):
     assert x.dtype == y.dtype, (
         "Values returned by a() [%s] and b() [%s] must have "
-        "the same type: %s, %s." %
-        (x.name, y.name, x.dtype.name, y.dtype.name))
+        "the same type: %s, %s." % (x.name, y.name, x.dtype.name, y.dtype.name))
 
 
 def with_dependencies(dependencies, output_tensor, name=None):
@@ -3099,9 +3175,9 @@ def with_dependencies(dependencies, output_tensor, name=None):
         if isinstance(output_tensor, ops.Tensor):
           return _Identity(output_tensor, name=name)
         else:
-          return ops.IndexedSlices(_Identity(output_tensor.values, name=name),
-                                   output_tensor.indices,
-                                   output_tensor.dense_shape)
+          return ops.IndexedSlices(
+              _Identity(output_tensor.values, name=name), output_tensor.indices,
+              output_tensor.dense_shape)
 
 
 def _GroupControlDeps(dev, deps, name=None):
@@ -3173,6 +3249,7 @@ def group(*inputs, **kwargs):
     def device_key(dev):
       """A sort key that allows None to be compared to strings."""
       return "" if dev is None else dev
+
     for dev in sorted(six.iterkeys(ops_on_device), key=device_key):
       deps.append(_GroupControlDeps(dev, ops_on_device[dev]))
 
@@ -3463,12 +3540,14 @@ class XLAControlFlowContext(ControlFlowContext):
     return x
 
 
-ops.register_proto_function(ops.GraphKeys.COND_CONTEXT,
-                            proto_type=control_flow_pb2.CondContextDef,
-                            to_proto=CondContext.to_proto,
-                            from_proto=CondContext.from_proto)
+ops.register_proto_function(
+    ops.GraphKeys.COND_CONTEXT,
+    proto_type=control_flow_pb2.CondContextDef,
+    to_proto=CondContext.to_proto,
+    from_proto=CondContext.from_proto)
 
-ops.register_proto_function(ops.GraphKeys.WHILE_CONTEXT,
-                            proto_type=control_flow_pb2.WhileContextDef,
-                            to_proto=WhileContext.to_proto,
-                            from_proto=WhileContext.from_proto)
+ops.register_proto_function(
+    ops.GraphKeys.WHILE_CONTEXT,
+    proto_type=control_flow_pb2.WhileContextDef,
+    to_proto=WhileContext.to_proto,
+    from_proto=WhileContext.from_proto)
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 34f0bf7b78..95e45bff06 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #==============================================================================
-
 """Data Flow Operations."""
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -40,6 +39,7 @@ from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_data_flow_ops import *
 from tensorflow.python.util.tf_export import tf_export
+
 # pylint: enable=wildcard-import
 
 
@@ -54,17 +54,19 @@ def _as_type_list(dtypes):
     return list(dtypes)
 
 
-def _as_shape_list(shapes, dtypes, unknown_dim_allowed=False,
+def _as_shape_list(shapes,
+                   dtypes,
+                   unknown_dim_allowed=False,
                    unknown_rank_allowed=False):
   """Convert shapes to a list of tuples of int (or None)."""
   del dtypes
   if unknown_dim_allowed:
-    if (not isinstance(shapes, collections.Sequence)
-        or not shapes
-        or any(shape is None or isinstance(shape, int) for shape in shapes)):
+    if (not isinstance(shapes, collections.Sequence) or not shapes or
+        any(shape is None or isinstance(shape, int) for shape in shapes)):
       raise ValueError(
           "When providing partial shapes, a list of shapes must be provided.")
-  if shapes is None: return None
+  if shapes is None:
+    return None
   if isinstance(shapes, tensor_shape.TensorShape):
     shapes = [shapes]
   if not isinstance(shapes, (tuple, list)):
@@ -103,7 +105,8 @@ def _shape_common(s1, s2):
     return tensor_shape.unknown_shape()
   d = [
       d1 if d1 is not None and d1 == d2 else None
-      for (d1, d2) in zip(s1.as_list(), s2.as_list())]
+      for (d1, d2) in zip(s1.as_list(), s2.as_list())
+  ]
   return tensor_shape.TensorShape(d)
 
 
@@ -195,8 +198,7 @@ class QueueBase(object):
       TypeError: When `queues` is not a list of `QueueBase` objects,
         or when the data types of `queues` are not all the same.
     """
-    if ((not queues) or
-        (not isinstance(queues, list)) or
+    if ((not queues) or (not isinstance(queues, list)) or
         (not all(isinstance(x, QueueBase) for x in queues))):
       raise TypeError("A list of queues expected")
 
@@ -210,12 +212,16 @@ class QueueBase(object):
 
     queue_shapes = [q.shapes for q in queues]
     reduced_shapes = [
-        six.moves.reduce(_shape_common, s) for s in zip(*queue_shapes)]
+        six.moves.reduce(_shape_common, s) for s in zip(*queue_shapes)
+    ]
 
     queue_refs = array_ops.stack([x.queue_ref for x in queues])
     selected_queue = array_ops.gather(queue_refs, index)
-    return QueueBase(dtypes=dtypes, shapes=reduced_shapes, names=names,
-                     queue_ref=selected_queue)
+    return QueueBase(
+        dtypes=dtypes,
+        shapes=reduced_shapes,
+        names=names,
+        queue_ref=selected_queue)
 
   @property
   def queue_ref(self):
@@ -282,8 +288,8 @@ class QueueBase(object):
 
     tensors = []
     for i, (val, dtype) in enumerate(zip(vals, self._dtypes)):
-      tensors.append(ops.convert_to_tensor(val, dtype=dtype,
-                                           name="component_%d" % i))
+      tensors.append(
+          ops.convert_to_tensor(val, dtype=dtype, name="component_%d" % i))
 
     return tensors
 
@@ -555,11 +561,13 @@ class QueueBase(object):
       name = "%s_Close" % self._name
     if self._queue_ref.dtype == _dtypes.resource:
       return gen_data_flow_ops._queue_close_v2(
-          self._queue_ref, cancel_pending_enqueues=cancel_pending_enqueues,
+          self._queue_ref,
+          cancel_pending_enqueues=cancel_pending_enqueues,
           name=name)
     else:
       return gen_data_flow_ops._queue_close(
-          self._queue_ref, cancel_pending_enqueues=cancel_pending_enqueues,
+          self._queue_ref,
+          cancel_pending_enqueues=cancel_pending_enqueues,
           name=name)
 
   def is_closed(self, name=None):
@@ -577,9 +585,9 @@ class QueueBase(object):
     if name is None:
       name = "%s_Is_Closed" % self._name
     if self._queue_ref.dtype == _dtypes.resource:
-      return gen_data_flow_ops.queue_is_closed_v2(self._queue_ref,name=name)
+      return gen_data_flow_ops.queue_is_closed_v2(self._queue_ref, name=name)
     else:
-      return gen_data_flow_ops.queue_is_closed_(self._queue_ref,name=name)
+      return gen_data_flow_ops.queue_is_closed_(self._queue_ref, name=name)
 
   def size(self, name=None):
     """Compute the number of elements in this queue.
@@ -611,8 +619,14 @@ class RandomShuffleQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, min_after_dequeue, dtypes, shapes=None,
-               names=None, seed=None, shared_name=None,
+  def __init__(self,
+               capacity,
+               min_after_dequeue,
+               dtypes,
+               shapes=None,
+               names=None,
+               seed=None,
+               shared_name=None,
                name="random_shuffle_queue"):
     """Create a queue that dequeues elements in a random order.
 
@@ -670,9 +684,14 @@ class RandomShuffleQueue(QueueBase):
       string = (str(seed1) + shared_name).encode("utf-8")
       seed2 = int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF
     queue_ref = gen_data_flow_ops._random_shuffle_queue_v2(
-        component_types=dtypes, shapes=shapes, capacity=capacity,
-        min_after_dequeue=min_after_dequeue, seed=seed1, seed2=seed2,
-        shared_name=shared_name, name=name)
+        component_types=dtypes,
+        shapes=shapes,
+        capacity=capacity,
+        min_after_dequeue=min_after_dequeue,
+        seed=seed1,
+        seed2=seed2,
+        shared_name=shared_name,
+        name=name)
 
     super(RandomShuffleQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
@@ -690,8 +709,13 @@ class FIFOQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, dtypes, shapes=None, names=None,
-               shared_name=None, name="fifo_queue"):
+  def __init__(self,
+               capacity,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               name="fifo_queue"):
     """Creates a queue that dequeues elements in a first-in first-out order.
 
     A `FIFOQueue` has bounded capacity; supports multiple concurrent
@@ -725,8 +749,11 @@ class FIFOQueue(QueueBase):
     shapes = _as_shape_list(shapes, dtypes)
     names = _as_name_list(names, dtypes)
     queue_ref = gen_data_flow_ops._fifo_queue_v2(
-        component_types=dtypes, shapes=shapes, capacity=capacity,
-        shared_name=shared_name, name=name)
+        component_types=dtypes,
+        shapes=shapes,
+        capacity=capacity,
+        shared_name=shared_name,
+        name=name)
 
     super(FIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
@@ -747,7 +774,12 @@ class PaddingFIFOQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, dtypes, shapes, names=None, shared_name=None,
+  def __init__(self,
+               capacity,
+               dtypes,
+               shapes,
+               names=None,
+               shared_name=None,
                name="padding_fifo_queue"):
     """Creates a queue that dequeues elements in a first-in first-out order.
 
@@ -792,12 +824,15 @@ class PaddingFIFOQueue(QueueBase):
     names = _as_name_list(names, dtypes)
     if len(dtypes) != len(shapes):
       raise ValueError("Shapes must be provided for all components, "
-                       "but received %d dtypes and %d shapes."
-                       % (len(dtypes), len(shapes)))
+                       "but received %d dtypes and %d shapes." % (len(dtypes),
+                                                                  len(shapes)))
 
     queue_ref = gen_data_flow_ops._padding_fifo_queue_v2(
-        component_types=dtypes, shapes=shapes, capacity=capacity,
-        shared_name=shared_name, name=name)
+        component_types=dtypes,
+        shapes=shapes,
+        capacity=capacity,
+        shared_name=shared_name,
+        name=name)
 
     super(PaddingFIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
@@ -815,7 +850,12 @@ class PriorityQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, types, shapes=None, names=None, shared_name=None,
+  def __init__(self,
+               capacity,
+               types,
+               shapes=None,
+               names=None,
+               shared_name=None,
                name="priority_queue"):
     """Creates a queue that dequeues elements in a first-in first-out order.
 
@@ -856,14 +896,17 @@ class PriorityQueue(QueueBase):
     shapes = _as_shape_list(shapes, types)
 
     queue_ref = gen_data_flow_ops._priority_queue_v2(
-        component_types=types, shapes=shapes, capacity=capacity,
-        shared_name=shared_name, name=name)
+        component_types=types,
+        shapes=shapes,
+        capacity=capacity,
+        shared_name=shared_name,
+        name=name)
 
     priority_dtypes = [_dtypes.int64] + types
     priority_shapes = [()] + shapes if shapes else shapes
 
-    super(PriorityQueue, self).__init__(
-        priority_dtypes, priority_shapes, names, queue_ref)
+    super(PriorityQueue, self).__init__(priority_dtypes, priority_shapes, names,
+                                        queue_ref)
 
 
 # TODO(josh11b): class BatchQueue(QueueBase):
@@ -943,8 +986,10 @@ class Barrier(object):
       self._shapes = [tensor_shape.unknown_shape() for _ in self._types]
 
     self._barrier_ref = gen_data_flow_ops._barrier(
-        component_types=self._types, shapes=self._shapes,
-        shared_name=shared_name, name=name)
+        component_types=self._types,
+        shapes=self._shapes,
+        shared_name=shared_name,
+        name=name)
     if context.in_graph_mode():
       self._name = self._barrier_ref.op.name.split("/")[-1]
     else:
@@ -1028,12 +1073,13 @@ class Barrier(object):
     """
     if name is None:
       name = "%s_BarrierTakeMany" % self._name
-    ret = gen_data_flow_ops._barrier_take_many(self._barrier_ref,
-                                               num_elements,
-                                               self._types,
-                                               allow_small_batch,
-                                               timeout,
-                                               name=name)
+    ret = gen_data_flow_ops._barrier_take_many(
+        self._barrier_ref,
+        num_elements,
+        self._types,
+        allow_small_batch,
+        timeout,
+        name=name)
 
     # NOTE(mrry): Not using a shape function because we need access to
     # the Barrier object.
@@ -1048,8 +1094,7 @@ class Barrier(object):
       op.outputs[1].set_shape(tensor_shape.vector(batch_dim))  # keys
       for output, shape in zip(op.outputs[2:], self._shapes):  # value_list
         output.set_shape(
-            tensor_shape.TensorShape([batch_dim]).concatenate(
-                shape))
+            tensor_shape.TensorShape([batch_dim]).concatenate(shape))
 
     return ret
 
@@ -1298,8 +1343,8 @@ class SparseConditionalAccumulator(ConditionalAccumulatorBase):
                name="sparse_conditional_accumulator"):
     accumulator_ref = gen_data_flow_ops.sparse_conditional_accumulator(
         dtype=dtype, shape=shape, shared_name=shared_name, name=name)
-    super(SparseConditionalAccumulator,
-          self).__init__(dtype, shape, accumulator_ref)
+    super(SparseConditionalAccumulator, self).__init__(dtype, shape,
+                                                       accumulator_ref)
 
   def apply_indexed_slices_grad(self, grad, local_step=0, name=None):
     """Attempts to apply a gradient to the accumulator.
@@ -1368,8 +1413,8 @@ class SparseConditionalAccumulator(ConditionalAccumulatorBase):
         local_step=local_step,
         gradient_indices=math_ops.to_int64(grad_indices),
         gradient_values=grad_values,
-        gradient_shape=math_ops.to_int64([] if grad_shape is None else
-                                         grad_shape),
+        gradient_shape=math_ops.to_int64([]
+                                         if grad_shape is None else grad_shape),
         has_known_shape=(grad_shape is not None),
         name=name)
 
@@ -1431,11 +1476,16 @@ class BaseStagingArea(object):
   _identifier = 0
   _lock = threading.Lock()
 
-  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
-                  capacity=0, memory_limit=0):
+  def __init__(self,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               capacity=0,
+               memory_limit=0):
     if shared_name is None:
-      self._name = (ops.get_default_graph()
-                       .unique_name(self.__class__.__name__))
+      self._name = (
+          ops.get_default_graph().unique_name(self.__class__.__name__))
     elif isinstance(shared_name, six.string_types):
       self._name = shared_name
     else:
@@ -1532,8 +1582,9 @@ class BaseStagingArea(object):
                          (sorted(vals.keys()), sorted(self._names)))
       # The order of values in `self._names` indicates the order in which the
       # tensors in the dictionary `vals` must be listed.
-      vals, indices, n = zip(*[(vals[k], i, k) for i, k in enumerate(self._names)
-                                                  if k in vals])
+      vals, indices, n = zip(*[(vals[k], i, k)
+                               for i, k in enumerate(self._names)
+                               if k in vals])
     else:
       if self._names:
         raise ValueError("You must enqueue a dictionary in a staging area "
@@ -1541,7 +1592,7 @@ class BaseStagingArea(object):
 
       if indices is None:
         raise ValueError("Indices must be supplied when inserting a list "
-                        "of tensors")
+                         "of tensors")
 
       if len(indices) != len(vals):
         raise ValueError("Number of indices '%s' doesn't match "
@@ -1553,8 +1604,8 @@ class BaseStagingArea(object):
 
     # Sanity check number of values
     if not len(vals) <= len(self._dtypes):
-      raise ValueError("Unexpected number of inputs '%s' vs '%s'" % (
-                          len(vals), len(self._dtypes)))
+      raise ValueError("Unexpected number of inputs '%s' vs '%s'" %
+                       (len(vals), len(self._dtypes)))
 
     tensors = []
 
@@ -1562,14 +1613,14 @@ class BaseStagingArea(object):
       dtype, shape = self._dtypes[i], self._shapes[i]
       # Check dtype
       if not val.dtype == dtype:
-        raise ValueError("Datatypes do not match. '%s' != '%s'" %(
-                        str(val.dtype), str(dtype)))
+        raise ValueError("Datatypes do not match. '%s' != '%s'" %
+                         (str(val.dtype), str(dtype)))
 
       # Check shape
       val.get_shape().assert_is_compatible_with(shape)
 
-      tensors.append(ops.convert_to_tensor(val, dtype=dtype,
-                                          name="component_%d" % i))
+      tensors.append(
+          ops.convert_to_tensor(val, dtype=dtype, name="component_%d" % i))
 
     return tensors, indices
 
@@ -1632,6 +1683,7 @@ class BaseStagingArea(object):
     else:
       return [vals]
 
+
 class StagingArea(BaseStagingArea):
   """Class for staging inputs. No ordering guarantees.
 
@@ -1666,8 +1718,13 @@ class StagingArea(BaseStagingArea):
 
   """
 
-  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
-                  capacity=0, memory_limit=0):
+  def __init__(self,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               capacity=0,
+               memory_limit=0):
     """Constructs a staging area object.
 
     The two optional lists, `shapes` and `names`, must be of the same length
@@ -1702,9 +1759,8 @@ class StagingArea(BaseStagingArea):
       ValueError: If one of the arguments is invalid.
     """
 
-    super(StagingArea, self).__init__(dtypes, shapes,
-                                          names, shared_name,
-                                          capacity, memory_limit)
+    super(StagingArea, self).__init__(dtypes, shapes, names, shared_name,
+                                      capacity, memory_limit)
 
   def put(self, values, name=None):
     """Create an op that places a value into the staging area.
@@ -1726,14 +1782,18 @@ class StagingArea(BaseStagingArea):
                         self._scope_vals(values)) as scope:
 
       # Hard-code indices for this staging area
-      indices = (list(six.moves.range(len(values)))
-                  if isinstance(values, (list, tuple)) else None)
+      indices = (
+          list(six.moves.range(len(values)))
+          if isinstance(values, (list, tuple)) else None)
       vals, _ = self._check_put_dtypes(values, indices)
 
       with ops.colocate_with(self._coloc_op):
-        op = gen_data_flow_ops.stage(values=vals, shared_name=self._name,
-                                     name=scope, capacity=self._capacity,
-                                     memory_limit=self._memory_limit)
+        op = gen_data_flow_ops.stage(
+            values=vals,
+            shared_name=self._name,
+            name=scope,
+            capacity=self._capacity,
+            memory_limit=self._memory_limit)
 
       return op
 
@@ -1741,7 +1801,7 @@ class StagingArea(BaseStagingArea):
     with ops.colocate_with(self._coloc_op):
       ret = get_fn()
 
-    indices = list(six.moves.range(len(self._dtypes))) # Hard coded
+    indices = list(six.moves.range(len(self._dtypes)))  # Hard coded
     return self._get_return_value(ret, indices)
 
   def get(self, name=None):
@@ -1769,10 +1829,12 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_get" % self._name
 
+    # pylint: disable=bad-continuation
     fn = lambda: gen_data_flow_ops.unstage(dtypes=self._dtypes,
                     shared_name=self._name, name=name,
                     capacity=self._capacity,
                     memory_limit=self._memory_limit)
+    # pylint: enable=bad-continuation
 
     return self.__internal_get(fn, name)
 
@@ -1797,10 +1859,12 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_peek" % self._name
 
+    # pylint: disable=bad-continuation
     fn = lambda: gen_data_flow_ops.stage_peek(index,
                     dtypes=self._dtypes, shared_name=self._name,
                     name=name, capacity=self._capacity,
                     memory_limit=self._memory_limit)
+    # pylint: enable=bad-continuation
 
     return self.__internal_get(fn, name)
 
@@ -1816,9 +1880,12 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_size" % self._name
 
-    return gen_data_flow_ops.stage_size(name=name, shared_name=self._name,
-                        dtypes=self._dtypes, capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return gen_data_flow_ops.stage_size(
+        name=name,
+        shared_name=self._name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
   def clear(self, name=None):
     """Clears the staging area.
@@ -1832,14 +1899,16 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_clear" % self._name
 
-    return gen_data_flow_ops.stage_clear(name=name, shared_name=self._name,
-                        dtypes=self._dtypes, capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return gen_data_flow_ops.stage_clear(
+        name=name,
+        shared_name=self._name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
+
 
 class MapStagingArea(BaseStagingArea):
-  """
-  A `MapStagingArea` is a TensorFlow data structure that stores tensors across
-  multiple steps, and exposes operations that can put and get tensors.
+  """A `MapStagingArea` is a TensorFlow data structure that stores tensors across multiple steps, and exposes operations that can put and get tensors.
 
   Each `MapStagingArea` element is a (key, value) pair.
   Only int64 keys are supported, other types should be
@@ -1852,7 +1921,8 @@ class MapStagingArea(BaseStagingArea):
   It supports multiple concurrent producers and consumers; and
   provides exactly-once delivery.
 
-  Each value tuple of a `MapStagingArea` is a fixed-length tuple of tensors whose
+  Each value tuple of a `MapStagingArea` is a fixed-length tuple of tensors
+  whose
   dtypes are described by `dtypes`, and whose shapes are optionally described
   by the `shapes` argument.
 
@@ -1896,10 +1966,16 @@ class MapStagingArea(BaseStagingArea):
   associated with it are removed.
   """
 
-  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
-                      ordered=False, capacity=0, memory_limit=0):
-    """
-    Args:
+  def __init__(self,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               ordered=False,
+               capacity=0,
+               memory_limit=0):
+    """Args:
+
       dtypes:  A list of types.  The length of dtypes must equal the number
         of tensors in each element.
       capacity: (Optional.) Maximum number of elements.
@@ -1925,9 +2001,8 @@ class MapStagingArea(BaseStagingArea):
 
     """
 
-    super(MapStagingArea, self).__init__(dtypes, shapes,
-                                      names, shared_name,
-                                      capacity, memory_limit)
+    super(MapStagingArea, self).__init__(dtypes, shapes, names, shared_name,
+                                         capacity, memory_limit)
 
     # Defer to different methods depending if the map is ordered
     self._ordered = ordered
@@ -1950,8 +2025,7 @@ class MapStagingArea(BaseStagingArea):
       self._clear_fn = gen_data_flow_ops.map_clear
 
   def put(self, key, vals, indices=None, name=None):
-    """
-    Create an op that stores the (key, vals) pair in the staging area.
+    """Create an op that stores the (key, vals) pair in the staging area.
 
     Incomplete puts are possible, preferably using a dictionary for vals
     as the appropriate dtypes and shapes can be inferred from the value names
@@ -1973,7 +2047,8 @@ class MapStagingArea(BaseStagingArea):
         The created op
 
     Raises:
-        ValueError: If the number or type of inputs don't match the staging area.
+        ValueError: If the number or type of inputs don't match the staging
+        area.
     """
 
     with ops.name_scope(name, "%s_put" % self._name,
@@ -1982,10 +2057,15 @@ class MapStagingArea(BaseStagingArea):
       vals, indices = self._check_put_dtypes(vals, indices)
 
       with ops.colocate_with(self._coloc_op):
-        op = self._put_fn(key, indices, vals, dtypes=self._dtypes,
-                             shared_name=self._name, name=scope,
-                             capacity=self._capacity,
-                             memory_limit=self._memory_limit)
+        op = self._put_fn(
+            key,
+            indices,
+            vals,
+            dtypes=self._dtypes,
+            shared_name=self._name,
+            name=scope,
+            capacity=self._capacity,
+            memory_limit=self._memory_limit)
     return op
 
   def _get_indices_and_dtypes(self, indices=None):
@@ -2001,13 +2081,13 @@ class MapStagingArea(BaseStagingArea):
     if all(isinstance(i, str) for i in indices):
       if self._names is None:
         raise ValueError("String indices provided '%s', but this Staging Area "
-                        "was not created with names." % indices)
+                         "was not created with names." % indices)
 
       try:
         indices = [self._names.index(n) for n in indices]
       except ValueError:
         raise ValueError("Named index '%s' not in "
-                        "Staging Area names '%s'" % (n, self._names))
+                         "Staging Area names '%s'" % (n, self._names))
     elif all(isinstance(i, int) for i in indices):
       pass
     else:
@@ -2018,10 +2098,8 @@ class MapStagingArea(BaseStagingArea):
 
     return indices, dtypes
 
-
   def peek(self, key, indices=None, name=None):
-    """
-    Peeks at staging area data associated with the key.
+    """Peeks at staging area data associated with the key.
 
     If the key is not in the staging area, it will block
     until the associated (key, value) is inserted.
@@ -2044,22 +2122,22 @@ class MapStagingArea(BaseStagingArea):
     indices, dtypes = self._get_indices_and_dtypes(indices)
 
     with ops.colocate_with(self._coloc_op):
-      result = self._peek_fn(key, shared_name=self._name,
-                      indices=indices,
-                      dtypes=dtypes,
-                      name=name,
-                      capacity=self._capacity,
-                      memory_limit=self._memory_limit)
+      result = self._peek_fn(
+          key,
+          shared_name=self._name,
+          indices=indices,
+          dtypes=dtypes,
+          name=name,
+          capacity=self._capacity,
+          memory_limit=self._memory_limit)
 
     return self._get_return_value(result, indices)
 
   def get(self, key=None, indices=None, name=None):
-    """
-    If the key is provided, the associated (key, value)
-    is returned from the staging area. If the key is not
-    in the staging area, this method will block until
-    the associated (key, value) is inserted.
+    """If the key is provided, the associated (key, value) is returned from the staging area.
 
+    If the key is not in the staging area, this method will block until
+    the associated (key, value) is inserted.
     If no key is provided and the staging area is ordered,
     the (key, value) with the smallest key will be returned.
     Otherwise, a random (key, value) will be returned.
@@ -2084,12 +2162,10 @@ class MapStagingArea(BaseStagingArea):
       return self._pop(key, indices=indices, name=name)
 
   def _pop(self, key, indices=None, name=None):
-    """
-    Remove and return the associated (key, value)
-    is returned from the staging area. If the key is not
-    in the staging area, this method will block until
-    the associated (key, value) is inserted.
+    """Remove and return the associated (key, value) is returned from the staging area.
 
+    If the key is not in the staging area, this method will block until
+    the associated (key, value) is inserted.
     Args:
         key: Key associated with the required data
         indices: Partial list of tensors to retrieve (optional).
@@ -2107,21 +2183,21 @@ class MapStagingArea(BaseStagingArea):
     indices, dtypes = self._get_indices_and_dtypes(indices)
 
     with ops.colocate_with(self._coloc_op):
-      result = self._pop_fn(key, shared_name=self._name,
-                      indices=indices,
-                      dtypes=dtypes,
-                      name=name,
-                      capacity=self._capacity,
-                      memory_limit=self._memory_limit)
+      result = self._pop_fn(
+          key,
+          shared_name=self._name,
+          indices=indices,
+          dtypes=dtypes,
+          name=name,
+          capacity=self._capacity,
+          memory_limit=self._memory_limit)
 
     return key, self._get_return_value(result, indices)
 
   def _popitem(self, indices=None, name=None):
-    """
-    If the staging area is ordered,
-    the (key, value) with the smallest key will be returned.
-    Otherwise, a random (key, value) will be returned.
+    """If the staging area is ordered, the (key, value) with the smallest key will be returned.
 
+    Otherwise, a random (key, value) will be returned.
     If the staging area is empty when this operation executes,
     it will block until there is an element to dequeue.
 
@@ -2142,12 +2218,13 @@ class MapStagingArea(BaseStagingArea):
     indices, dtypes = self._get_indices_and_dtypes(indices)
 
     with ops.colocate_with(self._coloc_op):
-      key, result = self._popitem_fn(shared_name=self._name,
-                              indices=indices,
-                              dtypes=dtypes,
-                              name=name,
-                              capacity=self._capacity,
-                              memory_limit=self._memory_limit)
+      key, result = self._popitem_fn(
+          shared_name=self._name,
+          indices=indices,
+          dtypes=dtypes,
+          name=name,
+          capacity=self._capacity,
+          memory_limit=self._memory_limit)
 
     # Separate keys and results out from
     # underlying namedtuple
@@ -2157,8 +2234,7 @@ class MapStagingArea(BaseStagingArea):
     return key, result
 
   def size(self, name=None):
-    """
-    Returns the number of elements in the staging area.
+    """Returns the number of elements in the staging area.
 
     Args:
         name: A name for the operation (optional)
@@ -2169,14 +2245,15 @@ class MapStagingArea(BaseStagingArea):
     if name is None:
       name = "%s_size" % self._name
 
-    return self._size_fn(shared_name=self._name,
-                        name=name, dtypes=self._dtypes,
-                        capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return self._size_fn(
+        shared_name=self._name,
+        name=name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
   def incomplete_size(self, name=None):
-    """
-    Returns the number of incomplete elements in the staging area.
+    """Returns the number of incomplete elements in the staging area.
 
     Args:
         name: A name for the operation (optional)
@@ -2187,16 +2264,15 @@ class MapStagingArea(BaseStagingArea):
     if name is None:
       name = "%s_incomplete_size" % self._name
 
-    return self._incomplete_size_fn(shared_name=self._name,
-                        name=name, dtypes=self._dtypes,
-                        capacity=self._capacity,
-                        memory_limit=self._memory_limit)
-
-
+    return self._incomplete_size_fn(
+        shared_name=self._name,
+        name=name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
   def clear(self, name=None):
-    """
-    Clears the staging area.
+    """Clears the staging area.
 
     Args:
         name: A name for the operation (optional)
@@ -2207,10 +2283,12 @@ class MapStagingArea(BaseStagingArea):
     if name is None:
       name = "%s_clear" % self._name
 
-    return self._clear_fn(shared_name=self._name,
-                        name=name, dtypes=self._dtypes,
-                        capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return self._clear_fn(
+        shared_name=self._name,
+        name=name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
 
 class RecordInput(object):
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 5d4b9ecd8b..314726ede6 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -52,7 +52,6 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
-
 # Warn the user if we convert a sparse representation to dense with at
 # least this number of elements.
 _LARGE_SPARSE_NUM_ELEMENTS = 100000000
@@ -235,9 +234,10 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
           raise TypeError(
               "Gradients of complex tensors must set grad_ys (y.dtype = %r)" %
               y.dtype)
-        new_grad_ys.append(array_ops.fill(
-            array_ops.shape(y), constant_op.constant(
-                1, dtype=y.dtype, name="grad_ys_%d" % i)))
+        new_grad_ys.append(
+            array_ops.fill(
+                array_ops.shape(y),
+                constant_op.constant(1, dtype=y.dtype, name="grad_ys_%d" % i)))
         continue
       if y.dtype.is_floating or y.dtype.is_integer:
         if not grad_y.dtype.is_floating and not grad_y.dtype.is_integer:
@@ -492,11 +492,12 @@ def gradients(ys,
       name, "gradients",
       list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
     ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
-    xs = [x.handle if isinstance(x, resource_variable_ops.ResourceVariable)
-          else x
-          for x in xs]
-    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs, name="x",
-                                                            as_ref=True)
+    xs = [
+        x.handle if isinstance(x, resource_variable_ops.ResourceVariable) else x
+        for x in xs
+    ]
+    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
+        xs, name="x", as_ref=True)
     grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)
 
     # The approach we take here is as follows: Create a list of all ops in the
@@ -513,9 +514,8 @@ def gradients(ys,
     to_ops = [t.op for t in ys]
     from_ops = [t.op for t in xs]
     stop_gradient_ops = [t.op for t in stop_gradients]
-    pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops,
-                                              from_ops,
-                                              colocate_gradients_with_ops)
+    pending_count, loop_state = _PendingCount(
+        ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops)
 
     # Iterate over the collected ops.
     #
@@ -588,9 +588,8 @@ def gradients(ys,
           # output, it means that the cost does not depend on output[i],
           # therefore dC/doutput[i] is 0.
           for i, out_grad in enumerate(out_grads):
-            if (not isinstance(out_grad, ops.Tensor) and
-                not out_grad) and ((not grad_fn and is_func_call) or
-                                   _IsTrainable(op.outputs[i])):
+            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
+                (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])):
               # Only trainable outputs or outputs for a function call that
               # will use SymbolicGradient get a zero gradient. Gradient
               # functions should ignore the gradient for other outputs.
@@ -607,17 +606,17 @@ def gradients(ys,
               if grad_fn:
                 # If grad_fn was found, do not use SymbolicGradient even for
                 # functions.
-                in_grads = _MaybeCompile(
-                    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
+                in_grads = _MaybeCompile(grad_scope, op, func_call,
+                                         lambda: grad_fn(op, *out_grads))
               else:
                 # For function call ops, we add a 'SymbolicGradient'
                 # node to the graph to compute gradients.
-                in_grads = _MaybeCompile(
-                    grad_scope, op, func_call, lambda: _SymGrad(op, out_grads))
+                in_grads = _MaybeCompile(grad_scope, op, func_call,
+                                         lambda: _SymGrad(op, out_grads))
               in_grads = _AsList(in_grads)
               _VerifyGeneratedGradients(in_grads, op)
-              if gate_gradients and len(
-                  [x for x in in_grads if x is not None]) > 1:
+              if gate_gradients and len([x for x in in_grads
+                                         if x is not None]) > 1:
                 with ops.device(None):
                   with ops.colocate_with(None, ignore_existing=True):
                     in_grads = control_flow_ops.tuple(in_grads)
@@ -637,8 +636,8 @@ def gradients(ys,
                     "Incompatible shapes between op input and calculated "
                     "input gradient.  Forward operation: %s.  Input index: %d. "
                     "Original input shape: %s.  "
-                    "Calculated input gradient shape: %s"
-                    % (op.name, i, t_in.shape, in_grad.shape))
+                    "Calculated input gradient shape: %s" %
+                    (op.name, i, t_in.shape, in_grad.shape))
             _SetGrad(grads, t_in, in_grad)
         if loop_state:
           loop_state.ExitGradWhileContext(op, before=False)
@@ -670,8 +669,8 @@ def _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state):
     pending_count[x.op._id] -= 1
     ready = (pending_count[x.op._id] == 0)
     if loop_state and not ready:
-      ready = (pending_count[x.op._id] > 0 and
-               control_flow_util.IsLoopSwitch(x.op))
+      ready = (
+          pending_count[x.op._id] > 0 and control_flow_util.IsLoopSwitch(x.op))
     # pylint: enable=protected-access
     if ready:
       if control_flow_util.IsLoopExit(x.op):
@@ -725,8 +724,8 @@ def _GetGrad(grads, t):
   if not op_grads:
     return None
   t_grad = op_grads[t.value_index]
-  assert not isinstance(t_grad, list), (
-      "gradients list should have been aggregated by now.")
+  assert not isinstance(
+      t_grad, list), ("gradients list should have been aggregated by now.")
   return t_grad
 
 
@@ -745,9 +744,8 @@ def _HandleNestedIndexedSlices(grad):
   else:
     assert isinstance(grad.values, ops.IndexedSlices)
     g = _HandleNestedIndexedSlices(grad.values)
-    return ops.IndexedSlices(g.values,
-                             array_ops.gather(grad.indices, g.indices),
-                             g.dense_shape)
+    return ops.IndexedSlices(g.values, array_ops.gather(
+        grad.indices, g.indices), g.dense_shape)
 
 
 def _AccumulatorShape(inputs):
@@ -849,8 +847,8 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
       AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
       AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
   ]:
-    raise ValueError("Invalid aggregation_method specified %s." %
-                     aggregation_method)
+    raise ValueError(
+        "Invalid aggregation_method specified %s." % aggregation_method)
   out_grads = _GetGrads(grads, op)
   for i, out_grad in enumerate(out_grads):
     if loop_state:
@@ -859,7 +857,8 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
         continue
     # Grads have to be Tensors or IndexedSlices
     if (isinstance(out_grad, collections.Sequence) and not all([
-        isinstance(g, (ops.Tensor, ops.IndexedSlices)) for g in out_grad
+        isinstance(g, (ops.Tensor, ops.IndexedSlices))
+        for g in out_grad
         if g is not None
     ])):
       raise TypeError("gradients have to be either all Tensors "
@@ -903,8 +902,8 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
         else:
           used = "add_n"
           out_grads[i] = _MultiDeviceAddN(out_grad)
-        logging.vlog(2, "  _AggregatedGrads %d x %s using %s",
-                     len(out_grad), tensor_shape, used)
+        logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
+                     tensor_shape, used)
       else:
         out_grad = math_ops._as_indexed_slices_list(
             [g for g in out_grad if g is not None])
@@ -967,7 +966,8 @@ def _hessian_vector_product(ys, xs, v):
   assert len(grads) == length
   elemwise_products = [
       math_ops.multiply(grad_elem, array_ops.stop_gradient(v_elem))
-      for grad_elem, v_elem in zip(grads, v) if grad_elem is not None
+      for grad_elem, v_elem in zip(grads, v)
+      if grad_elem is not None
   ]
 
   # Second backprop
@@ -975,8 +975,12 @@ def _hessian_vector_product(ys, xs, v):
 
 
 @tf_export("hessians")
-def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
-            gate_gradients=False, aggregation_method=None):
+def hessians(ys,
+             xs,
+             name="hessians",
+             colocate_gradients_with_ops=False,
+             gate_gradients=False,
+             aggregation_method=None):
   """Constructs the Hessian of sum of `ys` with respect to `x` in `xs`.
 
   `hessians()` adds ops to the graph to output the Hessian matrix of `ys`
@@ -1004,9 +1008,9 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
   """
   xs = _AsList(xs)
   kwargs = {
-      'colocate_gradients_with_ops': colocate_gradients_with_ops,
-      'gate_gradients': gate_gradients,
-      'aggregation_method': aggregation_method
+      "colocate_gradients_with_ops": colocate_gradients_with_ops,
+      "gate_gradients": gate_gradients,
+      "aggregation_method": aggregation_method
   }
   # Compute first-order derivatives and iterate for each x in xs.
   hessians = []
@@ -1031,8 +1035,7 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
     )
 
     _shape = array_ops.shape(x)
-    _reshaped_hessian = array_ops.reshape(
-        hessian.stack(), array_ops.concat((_shape, _shape), 0)
-    )
+    _reshaped_hessian = array_ops.reshape(hessian.stack(),
+                                          array_ops.concat((_shape, _shape), 0))
     hessians.append(_reshaped_hessian)
   return hessians
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index cfff73774b..5e6cafd6aa 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -89,52 +89,63 @@ def _Conv2DBackpropFilterGrad(op, grad):
 @ops.RegisterGradient("Conv3D")
 def _Conv3DGrad(op, grad):
   data_format = op.get_attr("data_format")
-  return [nn_ops.conv3d_backprop_input_v2(array_ops.shape(op.inputs[0]),
-                                          op.inputs[1],
-                                          grad,
-                                          strides=op.get_attr("strides"),
-                                          padding=op.get_attr("padding"),
-                                          data_format=data_format),
-          nn_ops.conv3d_backprop_filter_v2(op.inputs[0],
-                                           array_ops.shape(op.inputs[1]),
-                                           grad,
-                                           strides=op.get_attr("strides"),
-                                           padding=op.get_attr("padding"),
-                                           data_format=data_format)]
+  return [
+      nn_ops.conv3d_backprop_input_v2(
+          array_ops.shape(op.inputs[0]),
+          op.inputs[1],
+          grad,
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format),
+      nn_ops.conv3d_backprop_filter_v2(
+          op.inputs[0],
+          array_ops.shape(op.inputs[1]),
+          grad,
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("Conv3DBackpropInputV2")
 def _Conv3DBackpropInputGrad(op, grad):
   data_format = op.get_attr("data_format")
-  return [None,
-          nn_ops.conv3d_backprop_filter_v2(grad,
-                                           array_ops.shape(op.inputs[1]),
-                                           op.inputs[2],
-                                           strides=op.get_attr("strides"),
-                                           padding=op.get_attr("padding"),
-                                           data_format=data_format),
-          nn_ops.conv3d(grad,
-                        op.inputs[1],
-                        strides=op.get_attr("strides"),
-                        padding=op.get_attr("padding"),
-                        data_format=data_format)]
+  return [
+      None,
+      nn_ops.conv3d_backprop_filter_v2(
+          grad,
+          array_ops.shape(op.inputs[1]),
+          op.inputs[2],
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format),
+      nn_ops.conv3d(
+          grad,
+          op.inputs[1],
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("Conv3DBackpropFilterV2")
 def _Conv3DBackpropFilterGrad(op, grad):
   data_format = op.get_attr("data_format")
-  return [nn_ops.conv3d_backprop_input_v2(array_ops.shape(op.inputs[0]),
-                                          grad,
-                                          op.inputs[2],
-                                          strides=op.get_attr("strides"),
-                                          padding=op.get_attr("padding"),
-                                          data_format=data_format),
-          None,
-          nn_ops.conv3d(op.inputs[0],
-                        grad,
-                        strides=op.get_attr("strides"),
-                        padding=op.get_attr("padding"),
-                        data_format=data_format)]
+  return [
+      nn_ops.conv3d_backprop_input_v2(
+          array_ops.shape(op.inputs[0]),
+          grad,
+          op.inputs[2],
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format), None,
+      nn_ops.conv3d(
+          op.inputs[0],
+          grad,
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("AvgPool3D")
@@ -150,12 +161,13 @@ def _AvgPool3DGrad(op, grad):
 
 @ops.RegisterGradient("AvgPool3DGrad")
 def _AvgPool3DGradGrad(op, grad):
-  return (array_ops.stop_gradient(op.inputs[0]), gen_nn_ops.avg_pool3d(
-      grad,
-      op.get_attr("ksize"),
-      op.get_attr("strides"),
-      op.get_attr("padding"),
-      data_format=op.get_attr("data_format")))
+  return (array_ops.stop_gradient(op.inputs[0]),
+          gen_nn_ops.avg_pool3d(
+              grad,
+              op.get_attr("ksize"),
+              op.get_attr("strides"),
+              op.get_attr("padding"),
+              data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("MaxPool3D")
@@ -173,9 +185,9 @@ def _MaxPool3DGrad(op, grad):
 @ops.RegisterGradient("MaxPool3DGrad")
 def _MaxPool3DGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool3d_grad_grad(
               op.inputs[0],
               op.inputs[1],
@@ -189,9 +201,9 @@ def _MaxPool3DGradGrad(op, grad):
 @ops.RegisterGradient("MaxPool3DGradGrad")
 def _MaxPool3DGradGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool3d_grad(
               op.inputs[0],
               op.inputs[1],
@@ -272,8 +284,9 @@ def _BiasAddGrad(op, received_grad):
     data_format = op.get_attr("data_format")
   except ValueError:
     data_format = None
-  return (received_grad, gen_nn_ops.bias_add_grad(out_backprop=received_grad,
-                                                  data_format=data_format))
+  return (received_grad,
+          gen_nn_ops.bias_add_grad(
+              out_backprop=received_grad, data_format=data_format))
 
 
 @ops.RegisterGradient("BiasAddGrad")
@@ -346,10 +359,9 @@ def _ReluGrad(op, grad):
 def _EluGradGrad(op, grad):
   elu_x = op.inputs[1]
   return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
-          array_ops.where(elu_x < 0,
-                          grad * op.inputs[0],
-                          array_ops.zeros(shape=array_ops.shape(elu_x),
-                                          dtype=elu_x.dtype)))
+          array_ops.where(elu_x < 0, grad * op.inputs[0],
+                          array_ops.zeros(
+                              shape=array_ops.shape(elu_x), dtype=elu_x.dtype)))
 
 
 @ops.RegisterGradient("SeluGrad")
@@ -357,9 +369,11 @@ def _SeluGradGrad(op, grad):
   x = op.inputs[1]
   scale_alpha = 1.7580993408473768599402175208123
   return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
-          array_ops.where(
-              x < 0., gen_nn_ops._elu_grad(grad, op.outputs[0] + scale_alpha),
-              array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype)))
+          array_ops.where(x < 0.,
+                          gen_nn_ops._elu_grad(grad,
+                                               op.outputs[0] + scale_alpha),
+                          array_ops.zeros(
+                              shape=array_ops.shape(x), dtype=x.dtype)))
 
 
 @ops.RegisterGradient("Relu6")
@@ -370,8 +384,8 @@ def _Relu6Grad(op, grad):
 @ops.RegisterGradient("Relu6Grad")
 def _Relu6GradGrad(op, grad):
   x = op.inputs[1]
-  return (gen_nn_ops._relu6_grad(grad, x), array_ops.zeros(
-      shape=array_ops.shape(x), dtype=x.dtype))
+  return (gen_nn_ops._relu6_grad(grad, x),
+          array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
 
 
 @ops.RegisterGradient("Elu")
@@ -410,8 +424,8 @@ def _SoftsignGrad(op, grad):
 @ops.RegisterGradient("ReluGrad")
 def _ReluGradGrad(op, grad):
   x = op.inputs[1]
-  return (gen_nn_ops._relu_grad(grad, x), array_ops.zeros(
-      shape=array_ops.shape(x), dtype=x.dtype))
+  return (gen_nn_ops._relu_grad(grad, x),
+          array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
 
 
 def _BroadcastMul(vec, mat):
@@ -455,8 +469,8 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
     softmax = nn_ops.softmax(logits)
 
     grad += ((grad_grad - array_ops.squeeze(
-        math_ops.matmul(grad_grad[:, None, :],
-                        softmax[:, :, None]), axis=1)) * softmax)
+        math_ops.matmul(grad_grad[:, None, :], softmax[:, :, None]), axis=1)) *
+             softmax)
 
   return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
 
@@ -473,7 +487,8 @@ def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _):
   # so we make sure we prevent silently incorrect results by raising
   # an error if the second derivative is requested via prevent_gradient.
   sparse_softmax_grad_without_gradient = array_ops.prevent_gradient(
-      op.outputs[1], message="Currently there is no way to take the second "
+      op.outputs[1],
+      message="Currently there is no way to take the second "
       "derivative of sparse_softmax_cross_entropy_with_logits due to the fused "
       "implementation's interaction with tf.gradients()")
   return _BroadcastMul(grad_0, sparse_softmax_grad_without_gradient), None
@@ -531,14 +546,16 @@ def _DepthwiseConv2dNativeGrad(op, grad):
 
 @ops.RegisterGradient("Dilation2D")
 def _Dilation2DGrad(op, grad):
-  return [nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad,
-                                           op.get_attr("strides"),
-                                           op.get_attr("rates"),
-                                           op.get_attr("padding")),
-          nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad,
-                                            op.get_attr("strides"),
-                                            op.get_attr("rates"),
-                                            op.get_attr("padding"))]
+  return [
+      nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad,
+                                       op.get_attr("strides"),
+                                       op.get_attr("rates"),
+                                       op.get_attr("padding")),
+      nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad,
+                                        op.get_attr("strides"),
+                                        op.get_attr("rates"),
+                                        op.get_attr("padding"))
+  ]
 
 
 @ops.RegisterGradient("LRN")
@@ -547,8 +564,10 @@ def _LRNGrad(op, grad):
   bias = op.get_attr("bias")
   alpha = op.get_attr("alpha")
   beta = op.get_attr("beta")
-  return [gen_nn_ops._lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius,
-                               bias, alpha, beta)]
+  return [
+      gen_nn_ops._lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius,
+                           bias, alpha, beta)
+  ]
 
 
 @ops.RegisterGradient("AvgPool")
@@ -564,54 +583,58 @@ def _AvgPoolGrad(op, grad):
 
 @ops.RegisterGradient("AvgPoolGrad")
 def _AvgPoolGradGrad(op, grad):
-  return (array_ops.stop_gradient(op.inputs[0]), gen_nn_ops._avg_pool(
-      grad,
-      op.get_attr("ksize"),
-      op.get_attr("strides"),
-      op.get_attr("padding"),
-      data_format=op.get_attr("data_format")))
+  return (array_ops.stop_gradient(op.inputs[0]),
+          gen_nn_ops._avg_pool(
+              grad,
+              op.get_attr("ksize"),
+              op.get_attr("strides"),
+              op.get_attr("padding"),
+              data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("MaxPool")
 def _MaxPoolGrad(op, grad):
-  return gen_nn_ops._max_pool_grad(op.inputs[0],
-                                   op.outputs[0],
-                                   grad,
-                                   op.get_attr("ksize"),
-                                   op.get_attr("strides"),
-                                   padding=op.get_attr("padding"),
-                                   data_format=op.get_attr("data_format"))
+  return gen_nn_ops._max_pool_grad(
+      op.inputs[0],
+      op.outputs[0],
+      grad,
+      op.get_attr("ksize"),
+      op.get_attr("strides"),
+      padding=op.get_attr("padding"),
+      data_format=op.get_attr("data_format"))
 
 
 @ops.RegisterGradient("MaxPoolV2")
 def _MaxPoolGradV2(op, grad):
   ksize = op.inputs[1]
   strides = op.inputs[2]
-  return gen_nn_ops.max_pool_grad_v2(op.inputs[0],
-                                     op.outputs[0],
-                                     grad,
-                                     ksize,
-                                     strides,
-                                     padding=op.get_attr("padding"),
-                                     data_format=op.get_attr("data_format")), None, None
+  return gen_nn_ops.max_pool_grad_v2(
+      op.inputs[0],
+      op.outputs[0],
+      grad,
+      ksize,
+      strides,
+      padding=op.get_attr("padding"),
+      data_format=op.get_attr("data_format")), None, None
 
 
 @ops.RegisterGradient("MaxPoolWithArgmax")
 def _MaxPoolGradWithArgmax(op, grad, unused_argmax_grad):
-  return gen_nn_ops._max_pool_grad_with_argmax(op.inputs[0],
-                                               grad,
-                                               op.outputs[1],
-                                               op.get_attr("ksize"),
-                                               op.get_attr("strides"),
-                                               padding=op.get_attr("padding"))
+  return gen_nn_ops._max_pool_grad_with_argmax(
+      op.inputs[0],
+      grad,
+      op.outputs[1],
+      op.get_attr("ksize"),
+      op.get_attr("strides"),
+      padding=op.get_attr("padding"))
 
 
 @ops.RegisterGradient("MaxPoolGrad")
 def _MaxPoolGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool_grad_grad(
               op.inputs[0],
               op.inputs[1],
@@ -627,9 +650,9 @@ def _MaxPoolGradGradV2(op, grad):
   ksize = op.inputs[3]
   strides = op.inputs[4]
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops.max_pool_grad_grad_v2(
               op.inputs[0],
               op.inputs[1],
@@ -643,9 +666,9 @@ def _MaxPoolGradGradV2(op, grad):
 @ops.RegisterGradient("MaxPoolGradGrad")
 def _MaxPoolGradGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool_grad(
               op.inputs[0],
               op.inputs[1],
@@ -674,10 +697,9 @@ def _FractionalMaxPoolGrad(op, grad_0, unused_grad_1, unused_grad_2):
     Input backprop for FractionalMaxPool op.
   """
   # pylint: disable=protected-access
-  return gen_nn_ops._fractional_max_pool_grad(op.inputs[0], op.outputs[0],
-                                              grad_0, op.outputs[1],
-                                              op.outputs[2],
-                                              op.get_attr("overlapping"))
+  return gen_nn_ops._fractional_max_pool_grad(
+      op.inputs[0], op.outputs[0], grad_0, op.outputs[1], op.outputs[2],
+      op.get_attr("overlapping"))
 
 
 @ops.RegisterGradient("FractionalAvgPool")
@@ -761,8 +783,9 @@ def _BaseFusedBatchNormGrad(op, use_v2, *grad):
   epsilon = op.get_attr("epsilon")
   data_format = op.get_attr("data_format")
   is_training = op.get_attr("is_training")
-  grad_fun = (gen_nn_ops.fused_batch_norm_grad_v2 if use_v2
-              else gen_nn_ops.fused_batch_norm_grad)
+  grad_fun = (
+      gen_nn_ops.fused_batch_norm_grad_v2
+      if use_v2 else gen_nn_ops.fused_batch_norm_grad)
   if is_training:
     return grad_fun(
         grad_y,
@@ -786,7 +809,7 @@ def _BaseFusedBatchNormGrad(op, use_v2, *grad):
         pop_mean,
         pop_var,
         epsilon=epsilon,
-        data_format='NHWC',
+        data_format="NHWC",
         is_training=is_training)
     if data_format == b"NCHW":
       dx = array_ops.transpose(dx, [0, 3, 1, 2])
@@ -803,18 +826,28 @@ def _FusedBatchNormV2Grad(op, *grad):
   return _BaseFusedBatchNormGrad(op, True, *grad)
 
 
-def _BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training=True):
+def _BatchNormGrad(grad_y,
+                   x,
+                   scale,
+                   pop_mean,
+                   pop_var,
+                   epsilon,
+                   data_format,
+                   is_training=True):
   """Returns the gradients for the 3 inputs of BatchNorm.
 
   Args:
     grad_y: A `Tensor` of 4 dimensions for gradient for y.
     x: A `Tensor` of 4 dimensions for x.
     scale: A `Tensor` of 1 dimension for scaling.
-    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when is_training=False.
-    pop_var: A `Tensor` of 1 dimension for the population variance. Only used when is_training=False.
+    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
+      is_training=False.
+    pop_var: A `Tensor` of 1 dimension for the population variance. Only used
+      when is_training=False.
     epsilon: A small float number added to the variance of x.
     data_format: The data format for input. Either b"NHWC" or b"NCHW".
-    is_training: A bool value to indicate the operation is for training (default)
+    is_training: A bool value to indicate the operation is for training
+      (default)
         or inference.
 
   Returns:
@@ -900,7 +933,7 @@ def _FusedBatchNormGradGrad(op, *grad):
   grad_grad_scale = grad[1]
   grad_grad_offset = grad[2]
   grad_x, grad_scale, grad_offset = _BatchNormGrad(
-        grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
+      grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
   grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
   grad_grad_y, grad_x, grad_scale = gradients_impl.gradients(
       [grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial)
@@ -954,14 +987,15 @@ def _TopKGrad(op, grad, _):
 
   # Substitute grad to appropriate locations and fill the rest with zeros,
   # finally reshaping it to the original input shape.
-  return [array_ops.reshape(
-      sparse_ops.sparse_to_dense(ind,
-                                 array_ops.reshape(
-                                     math_ops.reduce_prod(in_shape), [1]),
-                                 array_ops.reshape(grad, [-1]),
-                                 validate_indices=False),
-      in_shape), array_ops.zeros(
-          [], dtype=dtypes.int32)]
+  return [
+      array_ops.reshape(
+          sparse_ops.sparse_to_dense(
+              ind,
+              array_ops.reshape(math_ops.reduce_prod(in_shape), [1]),
+              array_ops.reshape(grad, [-1]),
+              validate_indices=False), in_shape),
+      array_ops.zeros([], dtype=dtypes.int32)
+  ]
 
 
 @ops.RegisterGradient("NthElement")
@@ -983,11 +1017,9 @@ def _NthElementGrad(op, grad):
   # dimension. If there are multiple elements then the gradient will be
   # divided between them.
   indicators = math_ops.cast(
-      math_ops.equal(array_ops.expand_dims(output, -1), input),
-      grad.dtype)
+      math_ops.equal(array_ops.expand_dims(output, -1), input), grad.dtype)
 
   grad = array_ops.expand_dims(grad, -1)
-  num_selected = array_ops.expand_dims(
-      math_ops.reduce_sum(indicators, -1), -1)
+  num_selected = array_ops.expand_dims(math_ops.reduce_sum(indicators, -1), -1)
 
   return [math_ops.div(indicators, num_selected) * grad, None]
diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py
index f7541c0e89..aa7539ae9f 100644
--- a/tensorflow/python/ops/nn_grad_test.py
+++ b/tensorflow/python/ops/nn_grad_test.py
@@ -30,17 +30,20 @@ from tensorflow.python.platform import test
 
 
 class Relu6OpTest(test.TestCase):
+
   def testRelu6GradGrad(self):
-    inputs = constant_op.constant([[-2, -1, 1, 3], [5, 7, 8, 9]],
-                                  dtype=dtypes.float32)
+    inputs = constant_op.constant(
+        [[-2, -1, 1, 3], [5, 7, 8, 9]], dtype=dtypes.float32)
     x_init_value = np.array([[-3.5, -1.5, 2, 4], [4.5, 7.5, 8.5, 11]])
     r = nn_ops.relu6(inputs)
     r_g = gradients_impl.gradients(r, inputs)[0]
     with self.test_session():
       error = gradient_checker.compute_gradient_error(
-        inputs, inputs.get_shape().as_list(),
-        r_g, r_g.get_shape().as_list(),
-        x_init_value=x_init_value)
+          inputs,
+          inputs.get_shape().as_list(),
+          r_g,
+          r_g.get_shape().as_list(),
+          x_init_value=x_init_value)
       self.assertLess(error, 1e-4)
 
 
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index 1990087072..15127862a4 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -155,27 +155,24 @@ def einsum(equation, *inputs, **kwargs):
         indices in its subscript, or
       - the input shapes are inconsistent along a particular axis.
   """
-  name = kwargs.pop("name", None)
+  name = kwargs.pop('name', None)
   if kwargs:
-    raise TypeError("invalid keyword arguments for this function: " +
-                    ", ".join([format(key)
-                               for key in sorted(list(kwargs.keys()))]))
-  with ops.name_scope(name, "einsum", [equation, inputs]) as name:
+    raise TypeError('invalid keyword arguments for this function: ' + ', '.join(
+        [format(key) for key in sorted(list(kwargs.keys()))]))
+  with ops.name_scope(name, 'einsum', [equation, inputs]) as name:
     if '...' in equation:
       raise ValueError('Subscripts with ellipses are not yet supported.')
 
     match = re.match('([a-z,]+)(->[a-z]*)?', equation)
     if not match:
-      raise ValueError(
-          'Indices have incorrect format: %s' % equation
-      )
+      raise ValueError('Indices have incorrect format: %s' % equation)
 
     inputs = list(inputs)
     input_axis_labels = match.group(1).split(',')
 
     if len(inputs) != len(input_axis_labels):
-      raise ValueError('Got %d arguments for equation "%s", expecting %d' % (
-          len(inputs), equation, len(input_axis_labels)))
+      raise ValueError('Got %d arguments for equation "%s", expecting %d' %
+                       (len(inputs), equation, len(input_axis_labels)))
 
     axis_labels = set(''.join(input_axis_labels))
     if match.group(2):
@@ -188,10 +185,8 @@ def einsum(equation, *inputs, **kwargs):
         for ax in axes_:
           counts[ax] += 1
 
-      output_axis_labels = ''.join(sorted(
-          ax for ax in indices
-          if counts[ax] == 1
-      ))
+      output_axis_labels = ''.join(
+          sorted(ax for ax in indices if counts[ax] == 1))
 
     for a in axis_labels:
       input_count = sum(1 for s in input_axis_labels if a in s)
@@ -203,22 +198,23 @@ def einsum(equation, *inputs, **kwargs):
 
     temp = inputs[0]
     temp_axis_labels = input_axis_labels[0]
-    for i in xrange(len(inputs)-1):
-      axes_to_sum = (set(temp_axis_labels) & set(input_axis_labels[i+1])
-                     - set(output_axis_labels))
-      temp, temp_axis_labels = _einsum_reduction(temp,
-                                                 temp_axis_labels,
-                                                 inputs[i+1],
-                                                 input_axis_labels[i+1],
-                                                 axes_to_sum)
+    for i in xrange(len(inputs) - 1):
+      axes_to_sum = (
+          set(temp_axis_labels) &
+          set(input_axis_labels[i + 1]) - set(output_axis_labels))
+      temp, temp_axis_labels = _einsum_reduction(
+          temp, temp_axis_labels, inputs[i + 1], input_axis_labels[i + 1],
+          axes_to_sum)
 
     missing_indices = set(temp_axis_labels) - set(output_axis_labels)
     if missing_indices:
-      reduction_indices = [i for i, a in enumerate(temp_axis_labels)
-                           if a not in output_axis_labels]
+      reduction_indices = [
+          i for i, a in enumerate(temp_axis_labels)
+          if a not in output_axis_labels
+      ]
       temp = math_ops.reduce_sum(temp, reduction_indices=reduction_indices)
-      temp_axis_labels = ''.join(a for a in temp_axis_labels
-                                 if a in output_axis_labels)
+      temp_axis_labels = ''.join(
+          a for a in temp_axis_labels if a in output_axis_labels)
 
     if sorted(temp_axis_labels) != sorted(output_axis_labels):
       raise ValueError('Invalid equation: %s' % equation)
@@ -296,8 +292,10 @@ def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
       return (1, a)
 
   axis_labels = [t0_axis_labels, t1_axis_labels]
-  sorted_axes = [sorted(sym_list, key=lambda a: sort_key(i, a))
-                 for i, sym_list in enumerate(axis_labels)]
+  sorted_axes = [
+      sorted(sym_list, key=lambda a: sort_key(i, a))
+      for i, sym_list in enumerate(axis_labels)
+  ]
   inputs = [t0, t1]
   for i, axes_str in enumerate(axis_labels):
     perm = [axes_str.find(a) for a in sorted_axes[i]]
@@ -325,30 +323,30 @@ def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
     num_broadcast_elements_t0 = _total_size(
         t0_shape[len(preserved_axes):-len(axes_to_sum)])
     num_summed_elements = _total_size(t0_shape[-len(axes_to_sum):])
-    new_shape = (t0_shape[:len(preserved_axes)]
-                 + [num_broadcast_elements_t0, num_summed_elements])
+    new_shape = (
+        t0_shape[:len(preserved_axes)] +
+        [num_broadcast_elements_t0, num_summed_elements])
     t0 = _reshape_if_necessary(t0, new_shape)
 
     t1_shape = _get_shape(t1)
     num_broadcast_elements_t1 = _total_size(
-        t1_shape[len(preserved_axes)+len(axes_to_sum):])
-    new_shape = (t1_shape[:len(preserved_axes)]
-                 + [num_summed_elements, num_broadcast_elements_t1])
+        t1_shape[len(preserved_axes) + len(axes_to_sum):])
+    new_shape = (
+        t1_shape[:len(preserved_axes)] +
+        [num_summed_elements, num_broadcast_elements_t1])
     t1 = _reshape_if_necessary(t1, new_shape)
 
     product = math_ops.matmul(t0, t1)
 
     # Undo compaction of broadcast axes
     uncompacted_shape = (
-        t0_shape[:len(preserved_axes)+len(broadcast_axes[0])]
-        + t1_shape[len(t1_shape)-len(broadcast_axes[1]):]
-    )
+        t0_shape[:len(preserved_axes) + len(broadcast_axes[0])] +
+        t1_shape[len(t1_shape) - len(broadcast_axes[1]):])
     product = _reshape_if_necessary(product, uncompacted_shape)
 
     product_axes = (
-        sorted_axes[0][:len(preserved_axes)+len(broadcast_axes[0])] +
-        sorted_axes[1][len(sorted_axes[1])-len(broadcast_axes[1]):]
-    )
+        sorted_axes[0][:len(preserved_axes) + len(broadcast_axes[0])] +
+        sorted_axes[1][len(sorted_axes[1]) - len(broadcast_axes[1]):])
 
     return product, ''.join(product_axes)
 
@@ -402,13 +400,11 @@ def _total_size(shape_values):
 def _exponential_space_einsum(equation, *inputs):
   """Fallback implementation that supports summing an index over > 2 inputs."""
   if '...' in equation:
-    raise ValueError("Subscripts with ellipses are not yet supported.")
+    raise ValueError('Subscripts with ellipses are not yet supported.')
 
   match = re.match('([a-z,]+)(->[a-z]*)?', equation)
   if not match:
-    raise ValueError(
-        'Indices have incorrect format: %s' % equation
-    )
+    raise ValueError('Indices have incorrect format: %s' % equation)
 
   inputs = list(inputs)
   idx_in = match.group(1).split(',')
@@ -425,21 +421,15 @@ def _exponential_space_einsum(equation, *inputs):
       for ax in axes_:
         counts[ax] += 1
 
-    idx_out = ''.join(sorted(
-        ax for ax in indices
-        if counts[ax] == 1
-    ))
+    idx_out = ''.join(sorted(ax for ax in indices if counts[ax] == 1))
 
   if len(idx_in) != len(inputs):
-    raise ValueError(
-        'Expected %d inputs but got %d' % (len(idx_in), len(inputs))
-    )
+    raise ValueError('Expected %d inputs but got %d' % (len(idx_in),
+                                                        len(inputs)))
 
   missing_idx = set(idx_out).difference(idx_all)
   if missing_idx:
-    raise ValueError(
-        'Unknown output axes: %s' % missing_idx
-    )
+    raise ValueError('Unknown output axes: %s' % missing_idx)
 
   axis_order = {}
   for ax in indices:
@@ -452,18 +442,17 @@ def _exponential_space_einsum(equation, *inputs):
   for i, (input_, axes_) in enumerate(zip(inputs, idx_in)):
     if input_.get_shape().ndims != len(axes_):
       raise ValueError(
-        'Input %d with axes %s has incorrect' \
-        ' number of dimensions (expected %d, got %d)' % (
-          i, axes_, len(axes_), input_.get_shape().ndims
-        )
+          'Input %d with axes %s has incorrect' \
+          ' number of dimensions (expected %d, got %d)' % (
+              i, axes_, len(axes_), input_.get_shape().ndims
+          )
       )
 
     sorted_idx = sorted(axes_, key=axis_order.get)
 
     if len(set(axes_)) != len(axes_):
       raise ValueError(
-          'Subscript not supported: an axis appears more than once: %s' % axes_
-      )
+          'Subscript not supported: an axis appears more than once: %s' % axes_)
 
     if list(axes_) != sorted_idx:
       permuted = [axes_.find(ax) for ax in sorted_idx]
@@ -487,16 +476,15 @@ def _exponential_space_einsum(equation, *inputs):
           dims.append(dim)
 
     if len(set(dims)) > 1:
-      raise ValueError(
-          'Dimension mismatch on axis: %s' % ax
-      )
+      raise ValueError('Dimension mismatch on axis: %s' % ax)
 
     if ax not in idx_out:
       reduction_idx.append(j)
 
   # reshape, multiply
-  expanded_inputs = [array_ops.reshape(input_, shape)
-                     for input_, shape in zip(inputs, shapes)]
+  expanded_inputs = [
+      array_ops.reshape(input_, shape) for input_, shape in zip(inputs, shapes)
+  ]
   expanded_output = 1
   for input_ in expanded_inputs:
     expanded_output *= input_
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index c1a66717d8..2c212f4548 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -39,8 +39,9 @@ class LBetaTest(test.TestCase):
     x_one_half = [2, 1.]
     with self.test_session(use_gpu=True):
       self.assertAllClose(1, math_ops.exp(special_math_ops.lbeta(x_one)).eval())
-      self.assertAllClose(
-          0.5, math_ops.exp(special_math_ops.lbeta(x_one_half)).eval())
+      self.assertAllClose(0.5,
+                          math_ops.exp(
+                              special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual([], special_math_ops.lbeta(x_one).get_shape())
 
   def test_one_dimensional_arg_dynamic(self):
@@ -70,8 +71,9 @@ class LBetaTest(test.TestCase):
     # Should evaluate to 1/2.
     x_one_half = [[2, 1.], [2, 1.]]
     with self.test_session(use_gpu=True):
-      self.assertAllClose(
-          [0.5, 0.5], math_ops.exp(special_math_ops.lbeta(x_one_half)).eval())
+      self.assertAllClose([0.5, 0.5],
+                          math_ops.exp(
+                              special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual((2,), special_math_ops.lbeta(x_one_half).get_shape())
 
   def test_two_dimensional_arg_dynamic(self):
@@ -86,10 +88,12 @@ class LBetaTest(test.TestCase):
     # Should evaluate to 1/2.
     x_one_half = [[2, 1.], [2, 1.]]
     with self.test_session(use_gpu=True):
-      self.assertAllClose(
-          [0.5, 0.5], math_ops.exp(special_math_ops.lbeta(x_one_half)).eval())
+      self.assertAllClose([0.5, 0.5],
+                          math_ops.exp(
+                              special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual(
-          (2,), array_ops.shape(special_math_ops.lbeta(x_one_half)).eval())
+          (2,),
+          array_ops.shape(special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual(
           tensor_shape.TensorShape([2]),
           special_math_ops.lbeta(x_one_half).get_shape())
@@ -97,8 +101,8 @@ class LBetaTest(test.TestCase):
   def test_complicated_shape(self):
     with self.test_session(use_gpu=True):
       x = ops.convert_to_tensor(np.random.rand(3, 2, 2))
-      self.assertAllEqual(
-          (3, 2), array_ops.shape(special_math_ops.lbeta(x)).eval())
+      self.assertAllEqual((3, 2),
+                          array_ops.shape(special_math_ops.lbeta(x)).eval())
       self.assertEqual(
           tensor_shape.TensorShape([3, 2]),
           special_math_ops.lbeta(x).get_shape())
@@ -155,7 +159,6 @@ class EinsumTest(test.TestCase):
       'ijk->i',
       'ijk->kji',
       'ji,kj->ik',
-
       'ikl,kji->kl',
       'klj,lki->ij',
       'ijk,ilj->kli',
@@ -164,7 +167,6 @@ class EinsumTest(test.TestCase):
       'i,ijk,j->k',
       'ij,ij,jk,kl->il',
       'ij,kj,il,jm->ml',
-
       'a,ab,abc->abc',
       'a,b,ab->ab',
       'ab,ab,c->',
@@ -173,25 +175,21 @@ class EinsumTest(test.TestCase):
       'ab,ab,cd,cd->ac',
       'ab,ab,cd,cd->cd',
       'ab,ab,cd,cd,ef,ef->',
-
       'ab,cd,ef->abcdef',
       'ab,cd,ef->acdf',
       'ab,cd,de->abcde',
       'ab,cd,de->be',
       'ab,bcd,cd->abcd',
       'ab,bcd,cd->abd',
-
       'eb,cb,fb->cef',
       'abcd,ad',
       'bd,db,eac->ace',
       'ba,ac,da->bcd',
-
       'ab,ab',
       'ab,ba',
       'abc,abc',
       'abc,bac',
       'abc,cba',
-
       'dba,ead,cad->bce',
       'aef,fbc,dca->bde',
   ]
@@ -234,10 +232,8 @@ class EinsumTest(test.TestCase):
   def test_invalid(self):
     for axes in self.invalid_cases:
       inputs = [
-          array_ops.placeholder(
-              dtypes.float32, shape=(3, 4)),
-          array_ops.placeholder(
-              dtypes.float32, shape=(3, 4)),
+          array_ops.placeholder(dtypes.float32, shape=(3, 4)),
+          array_ops.placeholder(dtypes.float32, shape=(3, 4)),
       ]
       with self.assertRaises(ValueError):
         _ = special_math_ops.einsum(axes, *inputs)
@@ -245,16 +241,22 @@ class EinsumTest(test.TestCase):
   def test_invalid_keyword_arguments(self):
     m0 = array_ops.placeholder(dtypes.int32, shape=(1, None))
     m1 = array_ops.placeholder(dtypes.int32, shape=(None, 1))
-    with self.assertRaisesRegexp(TypeError,
+    with self.assertRaisesRegexp(
+        TypeError,
         'invalid keyword arguments for this function: invalid1, invalid2'):
-      _ = special_math_ops.einsum('ij,jk->ik', m0, m1, name="name",
-                                  invalid1="value1", invalid2="value2")
+      _ = special_math_ops.einsum(
+          'ij,jk->ik',
+          m0,
+          m1,
+          name='name',
+          invalid1='value1',
+          invalid2='value2')
 
   def test_dim_mismatch(self):
     for axes, input_shapes in self.dim_mismatch_cases:
       inputs = [
-          array_ops.placeholder(
-              dtypes.float32, shape=shape) for shape in input_shapes
+          array_ops.placeholder(dtypes.float32, shape=shape)
+          for shape in input_shapes
       ]
       with self.assertRaises(ValueError):
         _ = special_math_ops.einsum(axes, *inputs)
@@ -291,8 +293,8 @@ class EinsumTest(test.TestCase):
             m0: [[1, 2, 3]],
             m1: [[2], [1], [1]],
         }
-        np.testing.assert_almost_equal(
-            [[7]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[7]], sess.run(
+            out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, 3))
@@ -312,11 +314,11 @@ class EinsumTest(test.TestCase):
       out = special_math_ops.einsum('ijk,kl->ijl', m0, m1)
       with session.Session() as sess:
         feed_dict = {
-            m0: [[[1,2]]],
+            m0: [[[1, 2]]],
             m1: [[3], [2]],
         }
-        np.testing.assert_almost_equal(
-            [[[7]]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[[7]]],
+                                       sess.run(out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(2, 1))
@@ -325,10 +327,10 @@ class EinsumTest(test.TestCase):
       with session.Session() as sess:
         feed_dict = {
             m0: [[3], [2]],
-            m1: [[[1,2]]],
+            m1: [[[1, 2]]],
         }
-        np.testing.assert_almost_equal(
-            [[[7]]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[[7]]],
+                                       sess.run(out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, None, 2))
@@ -339,8 +341,8 @@ class EinsumTest(test.TestCase):
             m0: [[[1, 2]]],
             m1: [3, 2],
         }
-        np.testing.assert_almost_equal(
-           [[7]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[7]], sess.run(
+            out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, 2, None, 2))
@@ -351,8 +353,8 @@ class EinsumTest(test.TestCase):
             m0: [[[[1, 2]], [[2, 1]]]],
             m1: [[3, 2]],
         }
-        np.testing.assert_almost_equal(
-            [[[7, 8]]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[[7, 8]]],
+                                       sess.run(out, feed_dict=feed_dict))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/tools/inspect_checkpoint.py b/tensorflow/python/tools/inspect_checkpoint.py
index 8716058e61..dd876cbe7f 100644
--- a/tensorflow/python/tools/inspect_checkpoint.py
+++ b/tensorflow/python/tools/inspect_checkpoint.py
@@ -97,8 +97,9 @@ def parse_numpy_printoption(kv_str):
     raise argparse.ArgumentTypeError(
         "Setting '%s' from the command line is not supported." % k)
   try:
-    v = (v_type(v_str) if v_type is not bool
-         else flags.BooleanParser().parse(v_str))
+    v = (
+        v_type(v_str)
+        if v_type is not bool else flags.BooleanParser().parse(v_str))
   except ValueError as e:
     raise argparse.ArgumentTypeError(e.message)
   np.set_printoptions(**{k: v})
@@ -121,9 +122,12 @@ if __name__ == "__main__":
   parser = argparse.ArgumentParser()
   parser.register("type", "bool", lambda v: v.lower() == "true")
   parser.add_argument(
-      "--file_name", type=str, default="", help="Checkpoint filename. "
-                    "Note, if using Checkpoint V2 format, file_name is the "
-                    "shared prefix between all files in the checkpoint.")
+      "--file_name",
+      type=str,
+      default="",
+      help="Checkpoint filename. "
+      "Note, if using Checkpoint V2 format, file_name is the "
+      "shared prefix between all files in the checkpoint.")
   parser.add_argument(
       "--tensor_name",
       type=str,
diff --git a/tensorflow/python/training/coordinator_test.py b/tensorflow/python/training/coordinator_test.py
index 149d3eed41..3e4ac1dfff 100644
--- a/tensorflow/python/training/coordinator_test.py
+++ b/tensorflow/python/training/coordinator_test.py
@@ -85,8 +85,8 @@ class CoordinatorTest(test.TestCase):
     self.assertFalse(coord.wait_for_stop(0.1))
     wait_for_stop_ev = threading.Event()
     has_stopped_ev = threading.Event()
-    t = threading.Thread(target=StopOnEvent,
-                         args=(coord, wait_for_stop_ev, has_stopped_ev))
+    t = threading.Thread(
+        target=StopOnEvent, args=(coord, wait_for_stop_ev, has_stopped_ev))
     t.start()
     self.assertFalse(coord.should_stop())
     self.assertFalse(coord.wait_for_stop(0.01))
@@ -100,7 +100,8 @@ class CoordinatorTest(test.TestCase):
     threads = [
         threading.Thread(target=SleepABit, args=(0.01,)),
         threading.Thread(target=SleepABit, args=(0.02,)),
-        threading.Thread(target=SleepABit, args=(0.01,))]
+        threading.Thread(target=SleepABit, args=(0.01,))
+    ]
     for t in threads:
       t.start()
     coord.join(threads)
@@ -112,7 +113,8 @@ class CoordinatorTest(test.TestCase):
     threads = [
         threading.Thread(target=SleepABit, args=(0.01, coord)),
         threading.Thread(target=SleepABit, args=(0.02, coord)),
-        threading.Thread(target=SleepABit, args=(0.01, coord))]
+        threading.Thread(target=SleepABit, args=(0.01, coord))
+    ]
     for t in threads:
       t.start()
     WaitForThreadsToRegister(coord, 3)
@@ -125,7 +127,8 @@ class CoordinatorTest(test.TestCase):
     threads = [
         threading.Thread(target=SleepABit, args=(0.01, coord)),
         threading.Thread(target=SleepABit, args=(0.02,)),
-        threading.Thread(target=SleepABit, args=(0.01, coord))]
+        threading.Thread(target=SleepABit, args=(0.01, coord))
+    ]
     for t in threads:
       t.start()
     WaitForThreadsToRegister(coord, 2)
@@ -135,14 +138,17 @@ class CoordinatorTest(test.TestCase):
       self.assertFalse(t.is_alive())
 
   def testJoinGraceExpires(self):
+
     def TestWithGracePeriod(stop_grace_period):
       coord = coordinator.Coordinator()
       wait_for_stop_ev = threading.Event()
       has_stopped_ev = threading.Event()
       threads = [
-          threading.Thread(target=StopOnEvent,
-                           args=(coord, wait_for_stop_ev, has_stopped_ev)),
-          threading.Thread(target=SleepABit, args=(10.0,))]
+          threading.Thread(
+              target=StopOnEvent,
+              args=(coord, wait_for_stop_ev, has_stopped_ev)),
+          threading.Thread(target=SleepABit, args=(10.0,))
+      ]
       for t in threads:
         t.daemon = True
         t.start()
@@ -150,6 +156,7 @@ class CoordinatorTest(test.TestCase):
       has_stopped_ev.wait()
       with self.assertRaisesRegexp(RuntimeError, "threads still running"):
         coord.join(threads, stop_grace_period_secs=stop_grace_period)
+
     TestWithGracePeriod(1e-10)
     TestWithGracePeriod(0.002)
     TestWithGracePeriod(1.0)
@@ -159,16 +166,16 @@ class CoordinatorTest(test.TestCase):
     wait_for_stop_ev = threading.Event()
     has_stopped_ev = threading.Event()
     threads = [
-        threading.Thread(target=StopOnEvent,
-                         args=(coord, wait_for_stop_ev, has_stopped_ev)),
-        threading.Thread(target=SleepABit, args=(10.0,))]
+        threading.Thread(
+            target=StopOnEvent, args=(coord, wait_for_stop_ev, has_stopped_ev)),
+        threading.Thread(target=SleepABit, args=(10.0,))
+    ]
     for t in threads:
       t.daemon = True
       t.start()
     wait_for_stop_ev.set()
     has_stopped_ev.wait()
-    coord.join(
-        threads, stop_grace_period_secs=1., ignore_live_threads=True)
+    coord.join(threads, stop_grace_period_secs=1., ignore_live_threads=True)
 
   def testJoinRaiseReportExcInfo(self):
     coord = coordinator.Coordinator()
@@ -180,7 +187,8 @@ class CoordinatorTest(test.TestCase):
             args=(coord, ev_1, ev_2, RuntimeError("First"), False)),
         threading.Thread(
             target=RaiseOnEvent,
-            args=(coord, ev_2, None, RuntimeError("Too late"), False))]
+            args=(coord, ev_2, None, RuntimeError("Too late"), False))
+    ]
     for t in threads:
       t.start()
 
@@ -199,7 +207,8 @@ class CoordinatorTest(test.TestCase):
             args=(coord, ev_1, ev_2, RuntimeError("First"), True)),
         threading.Thread(
             target=RaiseOnEvent,
-            args=(coord, ev_2, None, RuntimeError("Too late"), True))]
+            args=(coord, ev_2, None, RuntimeError("Too late"), True))
+    ]
     for t in threads:
       t.start()
 
@@ -214,9 +223,8 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None,
-                  errors_impl.OutOfRangeError(None, None, "First"),
-                  True))
-        ]
+                  errors_impl.OutOfRangeError(None, None, "First"), True))
+    ]
     for t in threads:
       t.start()
 
@@ -230,7 +238,7 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None, ValueError("Clean stop"), True))
-        ]
+    ]
     for t in threads:
       t.start()
 
@@ -247,7 +255,8 @@ class CoordinatorTest(test.TestCase):
             args=(coord, ev_1, ev_2, RuntimeError("First"))),
         threading.Thread(
             target=RaiseOnEventUsingContextHandler,
-            args=(coord, ev_2, None, RuntimeError("Too late")))]
+            args=(coord, ev_2, None, RuntimeError("Too late")))
+    ]
     for t in threads:
       t.start()
 
@@ -262,7 +271,7 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None, RuntimeError("First"), True)),
-        ]
+    ]
     for t in threads:
       t.start()
 
@@ -274,7 +283,7 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None, RuntimeError("Second"), True)),
-        ]
+    ]
     for t in threads:
       t.start()
     with self.assertRaisesRegexp(RuntimeError, "Second"):
@@ -337,24 +346,29 @@ class LooperTest(test.TestCase):
   def testTargetArgs(self):
     n = [3]
     coord = coordinator.Coordinator()
-    thread = coordinator.LooperThread.loop(coord, 0, target=_StopAt0,
-                                        args=(coord, n))
+    thread = coordinator.LooperThread.loop(
+        coord, 0, target=_StopAt0, args=(coord, n))
     coord.join([thread])
     self.assertEqual(0, n[0])
 
   def testTargetKwargs(self):
     n = [3]
     coord = coordinator.Coordinator()
-    thread = coordinator.LooperThread.loop(coord, 0, target=_StopAt0,
-                                        kwargs={"coord": coord, "n": n})
+    thread = coordinator.LooperThread.loop(
+        coord, 0, target=_StopAt0, kwargs={
+            "coord": coord,
+            "n": n
+        })
     coord.join([thread])
     self.assertEqual(0, n[0])
 
   def testTargetMixedArgs(self):
     n = [3]
     coord = coordinator.Coordinator()
-    thread = coordinator.LooperThread.loop(coord, 0, target=_StopAt0,
-                                        args=(coord,), kwargs={"n": n})
+    thread = coordinator.LooperThread.loop(
+        coord, 0, target=_StopAt0, args=(coord,), kwargs={
+            "n": n
+        })
     coord.join([thread])
     self.assertEqual(0, n[0])
 
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index aa341b144c..27fa1b89ce 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -177,7 +177,13 @@ do_pylint() {
   echo "pylint took $((PYLINT_END_TIME - PYLINT_START_TIME)) s"
   echo ""
 
-  grep -E '(\[E|\[W0311|\[W0312)' ${OUTPUT_FILE} > ${ERRORS_FILE}
+  # Report only what we care about
+  # Ref https://pylint.readthedocs.io/en/latest/technical_reference/features.html
+  # E: all errors
+  # W0311 bad-indentation
+  # W0312 mixed-indentation
+  # C0330 bad-continuation
+  grep -E '(\[E|\[W0311|\[W0312|\[C0330)' ${OUTPUT_FILE} > ${ERRORS_FILE}
 
   N_ERRORS=0
   while read -r LINE; do
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
index e7e4c91692..23cc4a21a9 100644
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ b/tensorflow/tools/compatibility/ast_edits.py
@@ -45,8 +45,9 @@ class APIChangeSpec(object):
   """
 
 
-class _FileEditTuple(collections.namedtuple(
-    "_FileEditTuple", ["comment", "line", "start", "old", "new"])):
+class _FileEditTuple(
+    collections.namedtuple("_FileEditTuple",
+                           ["comment", "line", "start", "old", "new"])):
   """Each edit that is recorded by a _FileEditRecorder.
 
   Fields:
@@ -178,8 +179,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
     function_renames = self._api_change_spec.function_renames
     try:
       new_name = function_renames[full_name]
-      self._file_edit.add("Renamed function %r to %r" % (full_name,
-                                                         new_name),
+      self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
                           node.lineno, node.col_offset, full_name, new_name)
     except KeyError:
       pass
@@ -226,7 +226,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
       # loop over lines
       while 1:
         # Reverse the text to and regular expression search for whitespace
-        text = self._lines[line-1]
+        text = self._lines[line - 1]
         reversed_preceding_text = text[:col][::-1]
         # First find if a [ can be found with only whitespace between it and
         # col.
@@ -235,8 +235,8 @@ class _ASTCallVisitor(ast.NodeVisitor):
           new_col_offset = col - m.start(1) - 1
           return line, new_col_offset
         else:
-          if (reversed_preceding_text=="" or
-             reversed_preceding_text.isspace()):
+          if (reversed_preceding_text == "" or
+              reversed_preceding_text.isspace()):
             line = line - 1
             prev_line = self._lines[line - 1]
             # TODO(aselle):
@@ -247,8 +247,8 @@ class _ASTCallVisitor(ast.NodeVisitor):
             # node ranges to filter out spurious #'s that appear in string
             # literals.
             comment_start = prev_line.find("#")
-            if comment_start ==  -1:
-              col = len(prev_line) -1
+            if comment_start == -1:
+              col = len(prev_line) - 1
             elif find_string_chars.search(prev_line[comment_start:]) is None:
               col = comment_start
             else:
@@ -259,7 +259,6 @@ class _ASTCallVisitor(ast.NodeVisitor):
     # it is not possible to use that in an argument.
     return node.lineno, node.col_offset
 
-
   def visit_Call(self, node):  # pylint: disable=invalid-name
     """Handle visiting a call node in the AST.
 
@@ -267,7 +266,6 @@ class _ASTCallVisitor(ast.NodeVisitor):
       node: Current Node
     """
 
-
     # Find a simple attribute name path e.g. "tf.foo.bar"
     full_name = self._get_attribute_full_path(node.func)
 
@@ -292,18 +290,21 @@ class _ASTCallVisitor(ast.NodeVisitor):
           lineno, col_offset = self._find_true_position(arg)
           if lineno is None or col_offset is None:
             self._file_edit.add(
-                "Failed to add keyword %r to reordered function %r"
-                % (reordered[idx], full_name), arg.lineno, arg.col_offset,
-                "", "",
+                "Failed to add keyword %r to reordered function %r" %
+                (reordered[idx], full_name),
+                arg.lineno,
+                arg.col_offset,
+                "",
+                "",
                 error="A necessary keyword argument failed to be inserted.")
           else:
             keyword_arg = reordered[idx]
             if (full_name in function_keyword_renames and
                 keyword_arg in function_keyword_renames[full_name]):
               keyword_arg = function_keyword_renames[full_name][keyword_arg]
-            self._file_edit.add("Added keyword %r to reordered function %r"
-                                % (reordered[idx], full_name), lineno,
-                                col_offset, "", keyword_arg + "=")
+            self._file_edit.add("Added keyword %r to reordered function %r" %
+                                (reordered[idx], full_name), lineno, col_offset,
+                                "", keyword_arg + "=")
 
       # Examine each keyword argument and convert it to the final renamed form
       renamed_keywords = ({} if full_name not in function_keyword_renames else
@@ -321,11 +322,11 @@ class _ASTCallVisitor(ast.NodeVisitor):
             # value.
             key_start = argval_col_offset - len(argkey) - 1
             key_end = key_start + len(argkey) + 1
-            if (self._lines[argval_lineno - 1][key_start:key_end] ==
-                argkey + "="):
+            if (self._lines[argval_lineno - 1][key_start:key_end] == argkey +
+                "="):
               self._file_edit.add("Renamed keyword argument from %r to %r" %
-                                  (argkey, renamed_keywords[argkey]),
-                                  argval_lineno,
+                                  (argkey,
+                                   renamed_keywords[argkey]), argval_lineno,
                                   argval_col_offset - len(argkey) - 1,
                                   argkey + "=", renamed_keywords[argkey] + "=")
               continue
@@ -334,7 +335,8 @@ class _ASTCallVisitor(ast.NodeVisitor):
               (argkey, renamed_keywords[argkey]),
               argval.lineno,
               argval.col_offset - len(argkey) - 1,
-              "", "",
+              "",
+              "",
               error="Failed to find keyword lexographically. Fix manually.")
 
     ast.NodeVisitor.generic_visit(self, node)
@@ -351,7 +353,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
     if full_name in self._api_change_spec.change_to_function:
       if not hasattr(node, "is_function_for_call"):
         new_text = full_name + "()"
-        self._file_edit.add("Changed %r to %r"%(full_name, new_text),
+        self._file_edit.add("Changed %r to %r" % (full_name, new_text),
                             node.lineno, node.col_offset, full_name, new_text)
 
     ast.NodeVisitor.generic_visit(self, node)
@@ -379,8 +381,8 @@ class ASTCodeUpgrader(object):
     # Write to a temporary file, just in case we are doing an implace modify.
     with open(in_filename, "r") as in_file, \
         tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
-      ret = self.process_opened_file(
-          in_filename, in_file, out_filename, temp_file)
+      ret = self.process_opened_file(in_filename, in_file, out_filename,
+                                     temp_file)
 
     shutil.move(temp_file.name, out_filename)
     return ret
@@ -423,6 +425,7 @@ class ASTCodeUpgrader(object):
         out_file.write(out_text)
     text += "\n"
     return 1, text, process_errors
+
   # pylint: enable=broad-except
 
   def process_tree(self, root_directory, output_root_directory,
@@ -443,16 +446,16 @@ class ASTCodeUpgrader(object):
 
     # make sure output directory doesn't exist
     if output_root_directory and os.path.exists(output_root_directory):
-      print("Output directory %r must not already exist." % (
-          output_root_directory))
+      print("Output directory %r must not already exist." %
+            (output_root_directory))
       sys.exit(1)
 
     # make sure output directory does not overlap with root_directory
     norm_root = os.path.split(os.path.normpath(root_directory))
     norm_output = os.path.split(os.path.normpath(output_root_directory))
     if norm_root == norm_output:
-      print("Output directory %r same as input directory %r" % (
-          root_directory, output_root_directory))
+      print("Output directory %r same as input directory %r" %
+            (root_directory, output_root_directory))
       sys.exit(1)
 
     # Collect list of files to process (we do this to correctly handle if the
@@ -464,14 +467,16 @@ class ASTCodeUpgrader(object):
       copy_files = [f for f in file_list if not f.endswith(".py")]
       for filename in py_files:
         fullpath = os.path.join(dir_name, filename)
-        fullpath_output = os.path.join(
-            output_root_directory, os.path.relpath(fullpath, root_directory))
+        fullpath_output = os.path.join(output_root_directory,
+                                       os.path.relpath(fullpath,
+                                                       root_directory))
         files_to_process.append((fullpath, fullpath_output))
       if copy_other_files:
         for filename in copy_files:
           fullpath = os.path.join(dir_name, filename)
-          fullpath_output = os.path.join(
-              output_root_directory, os.path.relpath(fullpath, root_directory))
+          fullpath_output = os.path.join(output_root_directory,
+                                         os.path.relpath(
+                                             fullpath, root_directory))
           files_to_copy.append((fullpath, fullpath_output))
 
     file_count = 0
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index fa1cc73905..f678681dac 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -236,8 +236,8 @@ class _ASTCallVisitor(ast.NodeVisitor):
           new_col_offset = col - m.start(1) - 1
           return line, new_col_offset
         else:
-          if (reversed_preceding_text=="" or
-             reversed_preceding_text.isspace()):
+          if (reversed_preceding_text == "" or
+              reversed_preceding_text.isspace()):
             line = line - 1
             prev_line = self._lines[line - 1]
             # TODO(aselle):
-- 
GitLab


From 111a466ea14cb3ca9cd2b9c76b459bf8472ca98f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 12:24:58 -0800
Subject: [PATCH 172/966] Automated g4 rollback of changelist 183251689

PiperOrigin-RevId: 183273334
---
 tensorflow/python/ops/rnn.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index a10e1963d1..a1008f1c83 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -812,10 +812,7 @@ def _dynamic_rnn_loop(cell,
     return (time + 1, output_ta_t, new_state)
 
   if in_graph_mode:
-    # Make sure that we run at least 1 step, if necessary, to ensure
-    # the TensorArrays pick up the dynamic shape.
-    loop_bound = math_ops.minimum(
-        time_steps, math_ops.maximum(1, max_sequence_length))
+    loop_bound = max_sequence_length
   else:
     # Using max_sequence_length isn't currently supported in the Eager branch.
     loop_bound = time_steps
-- 
GitLab


From dce14f5d38a19b3a19c834a877acc0042f1e9fbd Mon Sep 17 00:00:00 2001
From: Viraj Navkal <vnavkal0@gmail.com>
Date: Thu, 25 Jan 2018 13:11:40 -0800
Subject: [PATCH 173/966] Remove calculation of unnecessary matrix columns in
 SVD gradient (#15801)

* remove calculation of unnecessary matrix columns in svd gradient

* simplify expression for svd gradient when compute_uv=False

* assign Operation attribute to local variable
---
 tensorflow/python/ops/linalg_grad.py | 59 ++++++++++++----------------
 1 file changed, 26 insertions(+), 33 deletions(-)

diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py
index 13a32c83d9..3cbbf3412a 100644
--- a/tensorflow/python/ops/linalg_grad.py
+++ b/tensorflow/python/ops/linalg_grad.py
@@ -277,20 +277,28 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
   # https://j-towns.github.io/papers/svd-derivative.pdf
   a = op.inputs[0]
   a_shape = a.get_shape().with_rank_at_least(2)
+  grad_s_mat = array_ops.matrix_diag(grad_s)
 
-  if op.get_attr("compute_uv"):
-    # TODO(rmlarsen): Make this work with complex types.
-    if a.dtype.is_complex:
-      raise NotImplementedError(
-          "SVD gradient is not implemented for complex types and "
-          "compute_uv=True.")
-    grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
-    grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
-    m = a_shape[-2].merge_with(grad_u_shape[-2])
-    n = a_shape[-1].merge_with(grad_v_shape[-2])
-    batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
-        grad_v_shape[:-2])
-    a_shape = batch_shape.concatenate([m, n])
+  if not op.get_attr("compute_uv"):
+    s, u, v = linalg_ops.svd(a, compute_uv=True)
+    grad_a = math_ops.matmul(u, math_ops.matmul(grad_s_mat, v, adjoint_b=True))
+    grad_a.set_shape(a_shape)
+    return grad_a
+
+  full_matrices = op.get_attr("full_matrices")
+
+  # TODO(rmlarsen): Make this work with complex types.
+  if a.dtype.is_complex:
+    raise NotImplementedError(
+        "SVD gradient is not implemented for complex types and "
+        "compute_uv=True.")
+  grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
+  grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
+  m = a_shape[-2].merge_with(grad_u_shape[-2])
+  n = a_shape[-1].merge_with(grad_v_shape[-2])
+  batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
+      grad_v_shape[:-2])
+  a_shape = batch_shape.concatenate([m, n])
 
   m = a_shape[-2].value
   n = a_shape[-1].value
@@ -300,12 +308,9 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
         "SVD gradient has not been implemented for input with unknown "
         "inner matrix shape.")
 
-  if not op.get_attr("compute_uv"):
-    s, u, v = linalg_ops.svd(a, compute_uv=True, full_matrices=True)
-  else:
-    s = op.outputs[0]
-    u = op.outputs[1]
-    v = op.outputs[2]
+  s = op.outputs[0]
+  u = op.outputs[1]
+  v = op.outputs[2]
 
   use_adjoint = False
   if m > n:
@@ -317,19 +322,7 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
     grad_u, grad_v = grad_v, grad_u
 
   with ops.control_dependencies([grad_s, grad_u, grad_v]):
-    grad_s_mat = array_ops.matrix_diag(grad_s)
-    if not op.get_attr("compute_uv"):
-      if use_adjoint:
-        grad_a = math_ops.matmul(
-            v[..., :, :m], math_ops.matmul(u, grad_s_mat), adjoint_b=True)
-      else:
-        grad_a = math_ops.matmul(u,
-                                 math_ops.matmul(
-                                     grad_s_mat, v[..., :, :m], adjoint_b=True))
-      grad_a.set_shape(a_shape)
-      return grad_a
-
-    if op.get_attr("full_matrices") and abs(m - n) > 1:
+    if full_matrices and abs(m - n) > 1:
       raise NotImplementedError(
           "svd gradient is not implemented for abs(m - n) > 1 "
           "when full_matrices is True")
@@ -371,7 +364,7 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
       gv1t_v1 = math_ops.matmul(gv1t, v1)
       term2_nous = gv1t - math_ops.matmul(gv1t_v1, v1, adjoint_b=True)
 
-      if op.get_attr("full_matrices"):
+      if full_matrices:
         v2 = v[..., :, m:n]
         grad_v2 = grad_v[..., :, m:n]
 
-- 
GitLab


From 97cdb0625f45b02d2f1eb71da8ac001141851438 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Thu, 25 Jan 2018 13:21:24 -0800
Subject: [PATCH 174/966] VLOG shape inference and annotation return status.

PiperOrigin-RevId: 183280222
---
 tensorflow/core/grappler/optimizers/layout_optimizer.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 50e6ba4a64..735d78e7ee 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -2076,6 +2076,7 @@ Status LayoutOptimizer::Tune(const GrapplerItem& item,
                              const TuningConfig& config, GraphDef* output) {
   auto status = graph_properties.AnnotateOutputShapes(output);
   if (!status.ok()) {
+    VLOG(1) << "Annotate shape return status: " << status.ToString();
     *output = item.graph;
     return status;
   }
@@ -2100,6 +2101,7 @@ Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   GraphProperties graph_properties(item);
   auto status = graph_properties.InferStatically(false);
   if (!status.ok()) {
+    VLOG(1) << "Infer shape return status: " << status.ToString();
     *output = item.graph;
     return status;
   }
-- 
GitLab


From dbfa5f559448a9466187be472d8a54eee1f69b46 Mon Sep 17 00:00:00 2001
From: Nick Felt <nickfelt@google.com>
Date: Thu, 25 Jan 2018 13:11:07 -0800
Subject: [PATCH 175/966] Update tensorboard dep to >= 1.5.0, < 1.6.0

---
 tensorflow/tools/pip_package/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index cc5c6f2afc..45159fbfc9 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -36,7 +36,7 @@ REQUIRED_PACKAGES = [
     'numpy >= 1.12.1',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'tensorflow-tensorboard >= 0.4.0',
+    'tensorflow-tensorboard >= 1.5.0, < 1.6.0',
 ]
 
 project_name = 'tensorflow'
-- 
GitLab


From 45a2fe1e98c679e02ba037b4fb1933e16a1e3256 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 13:28:55 -0800
Subject: [PATCH 176/966] Windows: Add missing dependencies in
 lib_proto_parsing

---
 tensorflow/core/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 497281041f..1eb51370f3 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -274,6 +274,7 @@ cc_library(
         "platform/platform.h",
         "platform/protobuf.h",
         "platform/types.h",
+        "platform/windows/cpu_info.h",
         "lib/bfloat16/bfloat16.h",
     ] + tf_additional_proto_hdrs() + glob(tf_env_time_hdrs()),
     copts = tf_copts(),
-- 
GitLab


From 48d6280da20836f253f50485ce52eef91d4b5f50 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 13:46:54 -0800
Subject: [PATCH 177/966] Remove no longer used param fields from
 TfLiteResizeBilinerParams

PiperOrigin-RevId: 183284937
---
 tensorflow/contrib/lite/builtin_op_data.h | 2 --
 tensorflow/contrib/lite/model.cc          | 2 --
 2 files changed, 4 deletions(-)

diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 0b48ef4741..8338fde8ac 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -167,8 +167,6 @@ typedef struct {
 } TfLiteLSTMParams;
 
 typedef struct {
-  int new_height;
-  int new_width;
 } TfLiteResizeBilinearParams;
 
 typedef struct {
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 95949be9e6..ba29a2f4d1 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -479,8 +479,6 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       auto* params = MallocPOD<TfLiteResizeBilinearParams>();
       if (auto* schema_params =
               op->builtin_options_as_ResizeBilinearOptions()) {
-        params->new_height = schema_params->new_height();
-        params->new_width = schema_params->new_width();
       }
       builtin_data = reinterpret_cast<void*>(params);
       break;
-- 
GitLab


From 6124dfd8f54fbc362914b562603b2b1c8a411df2 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Thu, 25 Jan 2018 14:18:45 -0800
Subject: [PATCH 178/966] Deprecation warning on Variables's += methods

PiperOrigin-RevId: 183290246
---
 .../python/ops/resource_variable_ops.py       | 34 ++++++++---
 tensorflow/python/ops/variables.py            | 56 +++++++++++++++++++
 2 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 879c206313..f727a29233 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -835,25 +835,45 @@ class ResourceVariable(variables.Variable):
       return self.value()
 
   def __iadd__(self, unused_other):
-    raise RuntimeError("Variable += value not supported.")
+    raise RuntimeError("Variable += value not supported. Use "
+                       "variable.assign_add(value) to modify the variable "
+                       "value and variable = variable + value to get a new "
+                       "Tensor object.")
 
   def __isub__(self, unused_other):
-    raise RuntimeError("Variable -= value not supported.")
+    raise RuntimeError("Variable -= value not supported. Use "
+                       "variable.assign_sub(value) to modify the variable "
+                       "value and variable = variable - value to get a new "
+                       "Tensor object.")
 
   def __imul__(self, unused_other):
-    raise RuntimeError("Variable *= value not supported.")
+    raise RuntimeError("Variable *= value not supported. Use "
+                       "variable.assign_mul(value) to modify the variable "
+                       "value and variable = variable * value to get a new "
+                       "Tensor object.")
 
   def __idiv__(self, unused_other):
-    raise RuntimeError("Variable /= value not supported.")
+    raise RuntimeError("Variable /= value not supported. Use "
+                       "variable.assign_div(value) to modify the variable "
+                       "value and variable = variable / value to get a new "
+                       "Tensor object.")
 
   def __itruediv__(self, unused_other):
-    raise RuntimeError("Variable /= value not supported.")
+    raise RuntimeError("Variable /= value not supported. Use "
+                       "variable.assign_div(value) to modify the variable "
+                       "value and variable = variable / value to get a new "
+                       "Tensor object.")
 
   def __irealdiv__(self, unused_other):
-    raise RuntimeError("Variable /= value not supported.")
+    raise RuntimeError("Variable /= value not supported. Use "
+                       "variable.assign_div(value) to modify the variable "
+                       "value and variable = variable / value to get a new "
+                       "Tensor object.")
 
   def __ipow__(self, unused_other):
-    raise RuntimeError("Variable **= value not supported.")
+    raise RuntimeError("Variable **= value not supported. Use "
+                       "value and variable = variable ** value to get a new "
+                       "Tensor object.")
 
 
 def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False):
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 7d7fa646c0..ff19612383 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.deprecation import deprecated
@@ -1021,6 +1022,61 @@ class Variable(object):
     return Variable(variable_def=variable_def,
                     import_scope=import_scope)
 
+  def __iadd__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable += will be deprecated. Use variable.assign_add"
+        " if you want assignment to the variable value or 'x = x + y'"
+        " if you want a new python Tensor object.", 1)
+    return self + other
+
+  def __isub__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable -= will be deprecated. Use variable.assign_sub"
+        " if you want assignment to the variable value or 'x = x - y'"
+        " if you want a new python Tensor object.", 1)
+    return self - other
+
+  def __imul__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable *= will be deprecated. Use variable.assign_mul"
+        " if you want assignment to the variable value or 'x = x * y'"
+        " if you want a new python Tensor object.", 1)
+    return self * other
+
+  def __idiv__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable /= will be deprecated. Use variable.assign_div"
+        " if you want assignment to the variable value or 'x = x / y'"
+        " if you want a new python Tensor object.", 1)
+    return self / other
+
+  def __itruediv__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable /= will be deprecated. Use variable.assign_div"
+        " if you want assignment to the variable value or 'x = x / y'"
+        " if you want a new python Tensor object.", 1)
+    return self / other
+
+  def __irealdiv__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable /= will be deprecated. Use variable.assign_div"
+        " if you want assignment to the variable value or 'x = x / y'"
+        " if you want a new python Tensor object.", 1)
+    return self / other
+
+  def __ipow__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable **= will be deprecated. Use 'x = x ** y'"
+        " if you want a new python Tensor object.", 1)
+    return self ** other
+
   class SaveSliceInfo(object):
     """Information on how to save this Variable as a slice.
 
-- 
GitLab


From 119b993e00a2a138d1ef2f4886e39ca528bad1c3 Mon Sep 17 00:00:00 2001
From: "freedom\" Koan-Sin Tan" <koansin.tan@gmail.com>
Date: Fri, 26 Jan 2018 06:24:23 +0800
Subject: [PATCH 179/966] make label_image for tflite build again (#16206)

* make label_image for tflite build again

1. add namespace to label_image.h to make label_image for tflite build again
2. add --config monolithic and mention NDK settings in label_image.md
3. fix a typo in display_usage()

* relies on tensor type info to switch types

use tensor types of input and output tensors instead of command
line flag from user

* reformatted according to review
---
 .../lite/examples/label_image/label_image.cc  | 55 +++++++++++--------
 .../lite/examples/label_image/label_image.h   |  7 ++-
 .../lite/examples/label_image/label_image.md  | 12 ++--
 3 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc
index 4d2e1ce0bc..d7f49ad875 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.cc
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc
@@ -148,14 +148,22 @@ void RunInference(Settings* s) {
   int wanted_width = dims->data[2];
   int wanted_channels = dims->data[3];
 
-  if (s->input_floating) {
-    downsize<float>(interpreter->typed_tensor<float>(input), in, image_height,
-                    image_width, image_channels, wanted_height, wanted_width,
-                    wanted_channels, s);
-  } else {
-    downsize<uint8_t>(interpreter->typed_tensor<uint8_t>(input), in,
-                      image_height, image_width, image_channels, wanted_height,
-                      wanted_width, wanted_channels, s);
+  switch (interpreter->tensor(input)->type) {
+    case kTfLiteFloat32:
+      s->input_floating = true;
+      downsize<float>(interpreter->typed_tensor<float>(input), in,
+                      image_height, image_width, image_channels,
+                      wanted_height, wanted_width, wanted_channels, s);
+      break;
+    case kTfLiteUInt8:
+      downsize<uint8_t>(interpreter->typed_tensor<uint8_t>(input), in,
+                        image_height, image_width, image_channels,
+                        wanted_height, wanted_width, wanted_channels, s);
+      break;
+    default:
+      LOG(FATAL) << "cannot handle input type "
+                 << interpreter->tensor(input)->type << " yet";
+      exit(-1);
   }
 
   struct timeval start_time, stop_time;
@@ -177,13 +185,22 @@ void RunInference(Settings* s) {
 
   std::vector<std::pair<float, int>> top_results;
 
-  if (s->input_floating) {
-    get_top_n<float>(interpreter->typed_output_tensor<float>(0), output_size,
-                     num_results, threshold, &top_results, s->input_floating);
-  } else {
-    get_top_n<uint8_t>(interpreter->typed_output_tensor<uint8_t>(0),
+  int output = interpreter->outputs()[0];
+  switch (interpreter->tensor(output)->type) {
+    case kTfLiteFloat32:
+      get_top_n<float>(interpreter->typed_output_tensor<float>(0),
                        output_size, num_results, threshold, &top_results,
-                       s->input_floating);
+                       true);
+      break;
+    case kTfLiteUInt8:
+      get_top_n<uint8_t>(interpreter->typed_output_tensor<uint8_t>(0),
+                         output_size, num_results, threshold, &top_results,
+                         false);
+      break;
+    default:
+      LOG(FATAL) << "cannot handle output type "
+                 << interpreter->tensor(input)->type << " yet";
+      exit(-1);
   }
 
   std::vector<string> labels;
@@ -203,13 +220,11 @@ void display_usage() {
   LOG(INFO) << "label_image\n"
             << "--accelerated, -a: [0|1], use Android NNAPI or note\n"
             << "--count, -c: loop interpreter->Invoke() for certain times\n"
-            << "--input_floating, -f: [0|1] type of input layer is floating "
-               "point numbers\n"
             << "--input_mean, -b: input mean\n"
             << "--input_std, -s: input standard deviation\n"
             << "--image, -i: image_name.bmp\n"
             << "--labels, -l: labels for the model\n"
-            << "--tflite_mode, -m: model_name.tflite\n"
+            << "--tflite_model, -m: model_name.tflite\n"
             << "--threads, -t: number of threads\n"
             << "--verbose, -v: [0|1] print more information\n"
             << "\n";
@@ -223,7 +238,6 @@ int Main(int argc, char** argv) {
     static struct option long_options[] = {
         {"accelerated", required_argument, 0, 'a'},
         {"count", required_argument, 0, 'c'},
-        {"input_floating", required_argument, 0, 'f'},
         {"verbose", required_argument, 0, 'v'},
         {"image", required_argument, 0, 'i'},
         {"labels", required_argument, 0, 'l'},
@@ -254,11 +268,6 @@ int Main(int argc, char** argv) {
         s.loop_count = strtol(  // NOLINT(runtime/deprecated_fn)
             optarg, (char**)NULL, 10);
         break;
-      case 'f':
-        s.input_floating = strtol(  // NOLINT(runtime/deprecated_fn)
-            optarg, (char**)NULL, 10);
-        s.input_layer_type = "float";
-        break;
       case 'i':
         s.input_bmp_name = optarg;
         break;
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.h b/tensorflow/contrib/lite/examples/label_image/label_image.h
index ce98e06fc1..4de32e33fb 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.h
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.h
@@ -16,9 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
 #define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
 
-#include <string>
 #include "tensorflow/contrib/lite/string.h"
 
+namespace tflite {
+namespace label_image {
+
 struct Settings {
   bool verbose = false;
   bool accel = false;
@@ -33,4 +35,7 @@ struct Settings {
   int number_of_threads = 4;
 };
 
+}  // namespace label_image
+}  // namespace tflite
+
 #endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.md b/tensorflow/contrib/lite/examples/label_image/label_image.md
index d6019d673f..9ce32cf101 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.md
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.md
@@ -1,8 +1,12 @@
 label_image for TensorFlow Lite inspired by TensorFlow's label_image.
+
+To build label_image for Android, run $TENSORFLOW_ROOT/configure 
+and set Android NDK or configure NDK setting in 
+$TENSORFLOW_ROOT/WORKSPACE first.
  
 To build it for android ARMv8:
 ```
-> bazel build --cxxopt=-std=c++11 \
+> bazel build --config monolithic --cxxopt=-std=c++11 \
   --crosstool_top=//external:android/crosstool \
   --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
   --cpu=arm64-v8a \
@@ -10,13 +14,13 @@ To build it for android ARMv8:
 ```
 or
 ```
-> bazel build --config android_arm64 --cxxopt=-std=c++11 \
+> bazel build --config android_arm64 --config monolithic --cxxopt=-std=c++11 \
   //tensorflow/contrib/lite/examples/label_image:label_image
 ```
 
 To build it for android arm-v7a:
 ```
-> bazel build --cxxopt=-std=c++11 \
+> bazel build --config monolithic --cxxopt=-std=c++11 \
   --crosstool_top=//external:android/crosstool \
   --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
   --cpu=armeabi-v7a \
@@ -24,7 +28,7 @@ To build it for android arm-v7a:
 ```
 or
 ```
-> bazel build --config android_arm --cxxopt=-std=c++11 \
+> bazel build --config android_arm --config monolithic --cxxopt=-std=c++11 \
   //tensorflow/contrib/lite/examples/label_image:label_image
 ```
 
-- 
GitLab


From 9019ace82139058538fcbee1bbcea765881d5724 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Thu, 25 Jan 2018 14:28:41 -0800
Subject: [PATCH 180/966] Automated g4 rollback of changelist 183273334

PiperOrigin-RevId: 183291956
---
 tensorflow/python/ops/rnn.py                      |  5 ++++-
 tensorflow/python/profiler/model_analyzer_test.py | 10 +++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index a1008f1c83..a10e1963d1 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -812,7 +812,10 @@ def _dynamic_rnn_loop(cell,
     return (time + 1, output_ta_t, new_state)
 
   if in_graph_mode:
-    loop_bound = max_sequence_length
+    # Make sure that we run at least 1 step, if necessary, to ensure
+    # the TensorArrays pick up the dynamic shape.
+    loop_bound = math_ops.minimum(
+        time_steps, math_ops.maximum(1, max_sequence_length))
   else:
     # Using max_sequence_length isn't currently supported in the Eager branch.
     loop_bound = time_steps
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 9153855588..04ba28c219 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -224,15 +224,15 @@ class PrintModelAnalysisTest(test.TestCase):
         # pylint: disable=line-too-long
         with gfile.Open(outfile, 'r') as f:
           lines = f.read().split('\n')
+          self.assertGreater(len(lines), 5)
           result = '\n'.join([l[:min(len(l), 80)] for l in lines])
-          self.assertEqual(
-              compat.as_bytes(
-                  'node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/168.86k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (0/1.80k params, 0/45.37k flops)\n    model_analyzer_testlib.py:40:BuildSmallModel (0/0 params, 0/0 flops)\n    model_analyzer_testlib.py:44:BuildSmallModel (0/4 params, 0/8 flops)\n    model_analyzer_testlib.py:48:BuildSmallModel (0/648 params, 0/1.30k flops)\n    model_analyzer_testlib.py:49:BuildSmallModel (0/0 params, 0/23.33k flops)\n    model_analyzer_testlib.py:53:BuildSmallModel (0/1.15k params, 0/2.30k flops)\n    model_analyzer_testlib.py:54:BuildSmallModel (0/0 params, 0/18.43k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (gradient) (0/0 params, 0/67.39k f\n    model_analyzer_testlib.py:49:BuildSmallModel (gradient) (0/0 params, 0/46.66\n    model_analyzer_testlib.py:54:BuildSmallModel (gradient) (0/0 params, 0/20.74\n  model_analyzer_testlib.py:67:BuildFullModel (0/1.04k params, 0/18.58k flops)\n  model_analyzer_testlib.py:67:BuildFullModel (gradient) (0/0 params, 0/37.00k f\n  model_analyzer_testlib.py:69:BuildFullModel (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (0/0 params, 0/258 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (gradient) (0/0 params, 0/129 flop\n  model_analyzer_testlib.py:72:BuildFullModel (0/0 params, 0/141 flops)\n'
-              ), compat.as_bytes(lib.CheckAndRemoveDoc(result)))
+          self.assertTrue(
+              compat.as_text(lib.CheckAndRemoveDoc(result))
+              .startswith('node name | # parameters | # float_ops'))
 
         self.assertLess(0, tfprof_node.total_exec_micros)
         self.assertEqual(2844, tfprof_node.total_parameters)
-        self.assertEqual(168863, tfprof_node.total_float_ops)
+        self.assertLess(168800, tfprof_node.total_float_ops)
         self.assertEqual(8, len(tfprof_node.children))
         self.assertEqual('_TFProfRoot', tfprof_node.name)
         self.assertEqual(
-- 
GitLab


From 46a78c429b689e08fa25fcebe6656884bcd6a45e Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Thu, 25 Jan 2018 14:30:15 -0800
Subject: [PATCH 181/966] Make select_and_scatter_test optonly

- Make select_and_scatter_test optonly.
- Reduce number of shards.

PiperOrigin-RevId: 183292230
---
 tensorflow/compiler/xla/tests/BUILD | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index ac11081699..85ac28533d 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1035,8 +1035,10 @@ xla_test(
     name = "select_and_scatter_test",
     timeout = "long",
     srcs = ["select_and_scatter_test.cc"],
-    shard_count = 40,
-    tags = ["enable_for_xla_interpreter"],
+    tags = [
+        "enable_for_xla_interpreter",
+        "optonly",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
-- 
GitLab


From f4b5d26f7d22d710c5bf8815ea97cdc00d979fce Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 14:38:57 -0800
Subject: [PATCH 182/966] Add OPENSOURCE extension to schema_generated.h

PiperOrigin-RevId: 183293637
---
 tensorflow/contrib/lite/schema/schema_generated.h | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 tensorflow/contrib/lite/schema/schema_generated.h

diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
old mode 100644
new mode 100755
-- 
GitLab


From 73b4b1502924acd461013d4ecf9825aedd3a3968 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Thu, 25 Jan 2018 14:39:44 -0800
Subject: [PATCH 183/966] Use LookupOrCreateResource when creating summary
 writers. The resource may already exist because it's associated with the
 device.

PiperOrigin-RevId: 183293761
---
 tensorflow/core/kernels/summary_kernels.cc | 41 ++++++++++++++--------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index 41cbece1d6..da3644779d 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -42,11 +42,16 @@ class CreateSummaryFileWriterOp : public OpKernel {
     const int32 flush_millis = tmp->scalar<int32>()();
     OP_REQUIRES_OK(ctx, ctx->input("filename_suffix", &tmp));
     const string filename_suffix = tmp->scalar<string>()();
-    SummaryWriterInterface* s;
-    OP_REQUIRES_OK(ctx,
-                   CreateSummaryFileWriter(max_queue, flush_millis, logdir,
-                                           filename_suffix, ctx->env(), &s));
-    OP_REQUIRES_OK(ctx, CreateResource(ctx, HandleFromInput(ctx, 0), s));
+
+    SummaryWriterInterface* s = nullptr;
+    OP_REQUIRES_OK(ctx, LookupOrCreateResource<SummaryWriterInterface>(
+                            ctx, HandleFromInput(ctx, 0), &s,
+                            [max_queue, flush_millis, logdir, filename_suffix,
+                             ctx](SummaryWriterInterface** s) {
+                              return CreateSummaryFileWriter(
+                                  max_queue, flush_millis, logdir,
+                                  filename_suffix, ctx->env(), s);
+                            }));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryFileWriter").Device(DEVICE_CPU),
@@ -66,17 +71,23 @@ class CreateSummaryDbWriterOp : public OpKernel {
     const string run_name = tmp->scalar<string>()();
     OP_REQUIRES_OK(ctx, ctx->input("user_name", &tmp));
     const string user_name = tmp->scalar<string>()();
-    SummaryWriterInterface* s;
-    Sqlite* db;
-    OP_REQUIRES_OK(ctx, Sqlite::Open(db_uri,
-                                     SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE,
-                                     &db));
-    core::ScopedUnref unref(db);
-    OP_REQUIRES_OK(ctx, SetupTensorboardSqliteDb(db));
+
+    SummaryWriterInterface* s = nullptr;
     OP_REQUIRES_OK(
-        ctx, CreateSummaryDbWriter(db, experiment_name,
-                                   run_name, user_name, ctx->env(), &s));
-    OP_REQUIRES_OK(ctx, CreateResource(ctx, HandleFromInput(ctx, 0), s));
+        ctx,
+        LookupOrCreateResource<SummaryWriterInterface>(
+            ctx, HandleFromInput(ctx, 0), &s,
+            [db_uri, experiment_name, run_name, user_name,
+             ctx](SummaryWriterInterface** s) {
+              Sqlite* db;
+              TF_RETURN_IF_ERROR(Sqlite::Open(
+                  db_uri, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, &db));
+              core::ScopedUnref unref(db);
+              TF_RETURN_IF_ERROR(SetupTensorboardSqliteDb(db));
+              TF_RETURN_IF_ERROR(CreateSummaryDbWriter(
+                  db, experiment_name, run_name, user_name, ctx->env(), s));
+              return Status::OK();
+            }));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryDbWriter").Device(DEVICE_CPU),
-- 
GitLab


From b998b7b456066530dd27ef532dae195d27505266 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 14:42:03 -0800
Subject: [PATCH 184/966] Drop the manually_create field from RnnState.

Initially, I thought that the shape of RNN state arrays could always be
determined by shape propagation. Then I came across some graphs where this
wasn't so easy to infer, so I introduced manually_create thinking of it
as a hack. Today I took another look at dropping that hack, and had a
"D'oh" moment when I realized that the cyclic nature of RNN graphs makes
it impossible to infer the shapes of all arrays by usual propagation.
For example, in a LSTM cell, the input array is concatenated with
a state array, so if we don't already know the shape of that state array,
shape propagation stops there.

Thus, this change removes manually_create by making toco always behave as
if manually_create=true, i.e. early-creating all RNN state arrays with
the shape explicitly specified by the user. The next TODO item here
(see model_flags.proto) is to introduce a generic 'shape' field, so far
the current 'size' field only allows specifying 1-D shapes.

PiperOrigin-RevId: 183294102
---
 .../lite/toco/allocate_transient_arrays.cc        |  4 +---
 .../contrib/lite/toco/model_cmdline_flags.cc      |  3 ---
 tensorflow/contrib/lite/toco/model_flags.proto    | 15 +++------------
 tensorflow/contrib/lite/toco/tooling_util.cc      | 13 ++++++-------
 4 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 5961d30bf5..49cc1fc2aa 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -158,9 +158,7 @@ std::size_t TransientArraySize(const Model& model, const string& array_name,
         LOG(FATAL)
             << "A RNN state array, " << array_name << ", still does not "
             << "have a known data type after all graph transformations have "
-            << "run. That's mostly a toco bug --- sorry. For now, you can "
-            << "work around this issue by adding manually_create:true in the "
-            << "--rnn_state description of this RNN state.";
+            << "run.";
       }
     }
     LOG(FATAL) << "An array, " << array_name << ", still does not "
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 790b3443ce..36520d9c55 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -327,9 +327,6 @@ void ReadModelFlagsFromCommandLineFlags(
         CHECK(absl::SimpleAtoi(value, &size));
         CHECK_GT(size, 0);
         rnn_state_proto->set_size(size);
-      } else if (key == "manually_create") {
-        CHECK_EQ(absl::AsciiStrToLower(value), "true");
-        rnn_state_proto->set_manually_create(true);
       } else {
         LOG(FATAL) << "Unknown key '" << key << "' in --rnn_states";
       }
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index 13fea29a07..9070ddc883 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -81,19 +81,10 @@ message RnnState {
   optional string state_array = 1;
   optional string back_edge_source_array = 2;
   optional bool discardable = 5;
-  // TODO(benoitjacob): drop the 'size' field. Should be redundant with
-  // --input_shapes and shapes propagation.
+  // size allows to specify a 1-D shape for the RNN state array.
+  // Will be expanded with 1's to fit the model.
+  // TODO(benoitjacob): should allow a generic, explicit shape.
   optional int32 size = 3;
-  // TODO(benoitjacob): manually_create is a temporary hack:
-  // due to discrepancies between the current toco dims tracking and
-  // TensorFlow shapes, for some models we need to manually create RNN state
-  // arrays with a specified shape.
-  // Maybe we should actually implement back-edges as operators of their own,
-  // which would remove the need for much special-casing, including here,
-  // we could probably consistently let PropagateFixedSizes handle state
-  // arrays.
-  // TODO(benoitjacob): should really drop manually_create now.
-  optional bool manually_create = 4;
 }
 
 // ModelFlags encodes properties of a model that, depending on the file
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 99a54a300b..df785a5102 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -958,7 +958,9 @@ void CheckModelCounts(const Model& model) {
 void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
                    std::vector<int>* out_dims) {
   CHECK(out_dims->empty());
-  if (num_dims == 1) {
+  if (num_dims == 0) {
+    return;
+  } else if (num_dims == 1) {
     CHECK_EQ(batch, 1);
     *out_dims = {depth};
   } else if (num_dims == 2) {
@@ -990,13 +992,13 @@ void CreateOrCheckRnnStateArray(const string& name, int size, Model* model) {
   if (array.has_shape()) {
     num_dims = array.shape().dimensions_count();
   }
-  std::vector<int> dims;
-  MakeArrayDims(num_dims, batch, 1, 1, size, &dims);
   CHECK(array.data_type == ArrayDataType::kFloat ||
         array.data_type == ArrayDataType::kNone);
   array.data_type = ArrayDataType::kFloat;
-  if (!array.has_shape()) {
+  if (!array.has_shape() && num_dims >= 0) {
     Shape* shape = array.mutable_shape();
+    std::vector<int> dims;
+    MakeArrayDims(num_dims, batch, 1, 1, size, &dims);
     *shape->mutable_dims() = dims;
   }
 }
@@ -1185,9 +1187,6 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
   }
   // Creation of the RNN state arrays
   for (const auto& rnn_state : model->flags.rnn_states()) {
-    if (!rnn_state.manually_create()) {
-      continue;
-    }
     CreateOrCheckRnnStateArray(rnn_state.state_array(), rnn_state.size(),
                                model);
   }
-- 
GitLab


From ad5c04c9e1151c4de71288520d45f3b3142299fb Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 25 Jan 2018 14:53:37 -0800
Subject: [PATCH 185/966] Whitelist "bool" as a valid TPU infeed type.

PiperOrigin-RevId: 183296017
---
 tensorflow/contrib/tpu/python/ops/tpu_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index a49a3dcf29..1c970655d0 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -47,7 +47,7 @@ if platform.system() != "Windows":
   # types are supported.
 
   _SUPPORTED_INFEED_DTYPES = set([
-      dtypes.int32, dtypes.bfloat16, dtypes.float32
+      dtypes.bool, dtypes.int32, dtypes.bfloat16, dtypes.float32
   ])
 
   def infeed_dequeue(dtype, shape, name=None):
-- 
GitLab


From 022890f6ac03bb87cc7b4f1a5b722cd6b058e616 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 25 Jan 2018 14:56:38 -0800
Subject: [PATCH 186/966] [XLA] Add HLO matcher for CustomCall that accepts a
 call target.

PiperOrigin-RevId: 183296506
---
 .../compiler/xla/service/hlo_matchers.cc      | 24 +++++++++
 .../compiler/xla/service/hlo_matchers.h       | 53 +++++++++++++++++--
 .../compiler/xla/service/hlo_matchers_test.cc | 33 ++++++++++++
 3 files changed, 107 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index 4255d60866..fe1bf61e97 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -102,6 +102,30 @@ bool HloGetTupleElementMatcher::MatchAndExplain(
   return true;
 }
 
+void HloCustomCallMatcher::DescribeTo(std::ostream* os) const {
+  HloMatcher::DescribeTo(os);
+  *os << " with call target that "
+      << ::testing::DescribeMatcher<string>(call_target_matcher_);
+}
+
+bool HloCustomCallMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (!HloMatcher::MatchAndExplain(instruction, listener)) {
+    return false;
+  }
+  ::testing::StringMatchResultListener sub_listener;
+  bool result = ExplainMatchResult(
+      call_target_matcher_, instruction->custom_call_target(), &sub_listener);
+  if (sub_listener.str().empty()) {
+    sub_listener << " that "
+                 << ::testing::DescribeMatcher<string>(call_target_matcher_,
+                                                       /*negation=*/!result);
+  }
+  *listener << "custom-call with call target" << sub_listener.str();
+  return result;
+}
+
 }  // namespace testing
 
 void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 9206cdac05..103f04a2cb 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -56,8 +56,8 @@ class HloParameterMatcher : public HloMatcher {
 // index to match.
 class HloGetTupleElementMatcher : public HloMatcher {
  public:
-  explicit HloGetTupleElementMatcher(
-      ::testing::Matcher<const HloInstruction*> operand, int64 tuple_index)
+  HloGetTupleElementMatcher(::testing::Matcher<const HloInstruction*> operand,
+                            int64 tuple_index)
       : HloMatcher(HloOpcode::kGetTupleElement, /*operands=*/{operand}),
         tuple_index_(tuple_index) {}
 
@@ -68,6 +68,24 @@ class HloGetTupleElementMatcher : public HloMatcher {
   int64 tuple_index_;
 };
 
+// Custom matcher for custom-call instructions, which accepts a matcher for its
+// call target.
+class HloCustomCallMatcher : public HloMatcher {
+ public:
+  HloCustomCallMatcher(
+      ::testing::Matcher<string> call_target_matcher,
+      std::vector<::testing::Matcher<const HloInstruction*>> operands)
+      : HloMatcher(HloOpcode::kCustomCall, operands),
+        call_target_matcher_(call_target_matcher) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  ::testing::Matcher<string> call_target_matcher_;
+};
+
 // HloInstruction* matchers for opcode and operands. Example:
 //   namespace op = xla::opcode_matchers;
 //   EXPECT_THAT(instruction,
@@ -94,7 +112,6 @@ HLO_MATCHER(Convert);
 HLO_MATCHER(Convolution);
 HLO_MATCHER(Copy);
 HLO_MATCHER(CrossReplicaSum);
-HLO_MATCHER(CustomCall);
 HLO_MATCHER(Divide);
 HLO_MATCHER(Dot);
 HLO_MATCHER(DynamicSlice);
@@ -184,6 +201,36 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> GetTupleElement() {
       new ::xla::testing::HloMatcher(HloOpcode::kGetTupleElement, {}));
 }
 
+// - CustomCall(T, operand1, ..., operandN) matches a CustomCall with call
+//   target T and the given operands.
+//
+// - CustomCall(operand1, ..., operandN) matches any CustomCall HLO with the
+//   given operands.
+//
+// - CustomCall() matches any CustomCall HLO at all.
+template <typename... M>
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall(
+    ::testing::Matcher<string> call_target_matcher, M... operands) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloCustomCallMatcher(
+      call_target_matcher, {operands...}));
+}
+// This overload of CustomCall(A, B, C, ...) exists iff A is not convertible to
+// ::testing::Matcher<string>.  In that case, we want to prefer the overload
+// above.
+template <typename FirstM, typename... M,
+          typename Dummy = typename std::enable_if<
+              !std::is_convertible<FirstM, ::testing::Matcher<string>>::value,
+              void>::type*>
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall(
+    FirstM operands_first, M... operands_rest) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloMatcher(
+      HloOpcode::kCustomCall, {operands_first, operands_rest...}));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall() {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {}));
+}
+
 #undef HLO_MATCHER
 }  // namespace opcode_matchers
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index 1465d1cacd..1c21703a45 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -23,6 +23,12 @@ using ::testing::Eq;
 namespace xla {
 namespace {
 
+string DescribeHloMatcher(const ::testing::Matcher<const HloInstruction*>& m) {
+  std::stringstream ss;
+  m.DescribeTo(&ss);
+  return ss.str();
+}
+
 template <typename M, typename T>
 string Explain(const T& t, const M& m) {
   ::testing::StringMatchResultListener listener;
@@ -67,5 +73,32 @@ TEST(HloMatchersTest, Test) {
          "add"));
 }
 
+TEST(HloMatchersTest, CustomCallMatcher) {
+  auto c1 = HloInstruction::CreateConstant(Literal::CreateR1<float>({1, 2, 3}));
+  auto c2 = HloInstruction::CreateConstant(Literal::CreateR1<int32>({1, 2, 3}));
+  auto call = HloInstruction::CreateCustomCall(
+      ShapeUtil::MakeShape(F32, {1}), {c1.get(), c2.get()}, "foo_target");
+
+  EXPECT_THAT(call.get(), op::CustomCall());
+  EXPECT_THAT(call.get(), op::CustomCall(c1.get(), c2.get()));
+  EXPECT_THAT(call.get(), op::CustomCall("foo_target"));
+  EXPECT_THAT(call.get(), op::CustomCall("foo_target", c1.get(), c2.get()));
+  EXPECT_THAT(call.get(), op::CustomCall(::testing::StartsWith("foo")));
+  EXPECT_THAT(call.get(),
+              op::CustomCall(::testing::Not(::testing::StartsWith("bar"))));
+
+  // Wrong number of operands.
+  EXPECT_THAT(call.get(), ::testing::Not(op::CustomCall(c1.get())));
+
+  // Call target does not match.
+  EXPECT_THAT(call.get(),
+              ::testing::Not(op::CustomCall(::testing::StartsWith("bar"))));
+
+  EXPECT_THAT(Explain(call.get(), op::CustomCall("bar")),
+              R"(custom-call with call target that isn't equal to "bar")");
+  EXPECT_THAT(DescribeHloMatcher(op::CustomCall("foo_target")),
+              R"(custom-call with call target that is equal to "foo_target")");
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From dfb59da4ede1daf163a167da590ac70c447eb41a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 15:01:30 -0800
Subject: [PATCH 187/966] [XLA:GPU] Implement conditional as a sequence of
 thunks in the GPU backend.

This also includes the following fixes:

(1) Update buffer assignment for conditionals so that the buffers corresponding to the true operand and the true computation parameter are colocated, and similarly, the buffers corresponding to the false operand and the false computation parameter are colocated.

(2) Update GPU copy insertion pass to insert copies when constants appear as operands of conditional instructions.

PiperOrigin-RevId: 183297282
---
 .../compiler/xla/service/buffer_assignment.cc |  37 ++++++
 tensorflow/compiler/xla/service/gpu/BUILD     |   2 +
 .../xla/service/gpu/conditional_thunk.cc      |  72 +++++++++++
 .../xla/service/gpu/conditional_thunk.h       |  65 ++++++++++
 .../xla/service/gpu/gpu_copy_insertion.cc     |  43 ++++---
 .../xla/service/gpu/gpu_copy_insertion.h      |   4 +-
 .../compiler/xla/service/gpu/ir_emitter.cc    |  31 -----
 .../compiler/xla/service/gpu/ir_emitter.h     |   6 +-
 .../xla/service/gpu/ir_emitter_unnested.cc    | 120 ++++++++++++++----
 tensorflow/compiler/xla/service/gpu/thunk.h   |   1 +
 .../compiler/xla/tests/conditional_test.cc    |  36 +++++-
 11 files changed, 334 insertions(+), 83 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
 create mode 100644 tensorflow/compiler/xla/service/gpu/conditional_thunk.h

diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 323620c131..d5594dc07c 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -1358,6 +1358,43 @@ void BufferAssigner::BuildColocatedBufferSets(
                   index, points_to_analysis, &colocated_set);
               AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
             });
+
+        // Add true_operand and conditional.true_computation.parameter(0) as a
+        // colocated buffer set. Note that this has to be done for each subshape
+        // in the true_operand of the conditional.
+        ShapeUtil::ForEachSubshape(
+            conditional_hlo->operand(1)->shape(),
+            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
+                const Shape& /*subshape*/, const ShapeIndex& index) {
+              std::vector<const LogicalBuffer*> true_set;
+              // Add conditional.true_operand.
+              AddBufferToColocatedSet(conditional_hlo->operand(1), index,
+                                      points_to_analysis, &true_set);
+              // Add conditional.true_computation.parameter_instruction(0).
+              AddBufferToColocatedSet(
+                  conditional_hlo->true_computation()->parameter_instruction(0),
+                  index, points_to_analysis, &true_set);
+              AddSetToColocatedBufferSets(true_set, colocated_buffer_sets);
+            });
+
+        // Add false_operand and conditional.false_computation.parameter(0) as a
+        // colocated buffer set. Note that this has to be done for each subshape
+        // in the false_operand of the conditional.
+        ShapeUtil::ForEachSubshape(
+            conditional_hlo->operand(2)->shape(),
+            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
+                const Shape& /*subshape*/, const ShapeIndex& index) {
+              std::vector<const LogicalBuffer*> false_set;
+              // Add conditional.false_operand.
+              AddBufferToColocatedSet(conditional_hlo->operand(2), index,
+                                      points_to_analysis, &false_set);
+              // Add conditional.false_computation.parameter_instruction(0).
+              AddBufferToColocatedSet(
+                  conditional_hlo->false_computation()->parameter_instruction(
+                      0),
+                  index, points_to_analysis, &false_set);
+              AddSetToColocatedBufferSets(false_set, colocated_buffer_sets);
+            });
       }
     }
   }
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index df5e2e35f8..3c3328b9cd 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -228,6 +228,7 @@ cc_library(
 cc_library(
     name = "gpu_executable",
     srcs = [
+        "conditional_thunk.cc",
         "convolution_thunk.cc",
         "copy_thunk.cc",
         "cudnn_batchnorm_thunk.cc",
@@ -243,6 +244,7 @@ cc_library(
         "while_thunk.cc",
     ],
     hdrs = [
+        "conditional_thunk.h",
         "convolution_thunk.h",
         "copy_thunk.h",
         "cudnn_batchnorm_thunk.h",
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
new file mode 100644
index 0000000000..790ca535b1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
+
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+namespace gpu {
+
+ConditionalThunk::ConditionalThunk(
+    const BufferAllocation::Slice& predicate_buffer_index,
+    const BufferAllocation::Slice& true_operand_buffer_index,
+    const BufferAllocation::Slice& false_operand_buffer_index,
+    ThunkSequence true_thunk_sequence, ThunkSequence false_thunk_sequence,
+    const HloInstruction* hlo)
+    : Thunk(Kind::kConditional, hlo),
+      predicate_buffer_index_(predicate_buffer_index),
+      true_operand_buffer_index_(true_operand_buffer_index),
+      false_operand_buffer_index_(false_operand_buffer_index),
+      true_thunk_(std::move(true_thunk_sequence), hlo),
+      false_thunk_(std::move(false_thunk_sequence), hlo) {}
+
+Status ConditionalThunk::Initialize(const GpuExecutable& executable) {
+  TF_RETURN_IF_ERROR(true_thunk_.Initialize(executable));
+  TF_RETURN_IF_ERROR(false_thunk_.Initialize(executable));
+  return Status::OK();
+}
+
+Status ConditionalThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations,
+    perftools::gputools::Stream* stream) {
+  // Copy the predicate value from device.
+  bool predicate;
+  perftools::gputools::DeviceMemoryBase predicate_address =
+      buffer_allocations.GetDeviceAddress(predicate_buffer_index_);
+  stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool));
+
+  Status block_status = stream->BlockHostUntilDone();
+  if (!block_status.ok()) {
+    return InternalError("Failed to retrieve predicate value on stream %p: %s.",
+                         stream, block_status.error_message().c_str());
+  }
+
+  // Execute the true or the false computation depending on the value of the
+  // predicate.
+  if (predicate) {
+    TF_RETURN_IF_ERROR(true_thunk_.ExecuteOnStream(buffer_allocations, stream));
+  } else {
+    TF_RETURN_IF_ERROR(
+        false_thunk_.ExecuteOnStream(buffer_allocations, stream));
+  }
+
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
new file mode 100644
index 0000000000..7725c46a3b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_
+
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// ConditionalThunk implements the conditional instruction on GPU by reading the
+// predicate of the conditional and executing the true or the false computation
+// depending on the value of the predicate.
+//
+// ConditionalThunk assumes that the buffers of the conditional result and the
+// result of the true and false computations share the same allocation. Also,
+// the buffers of the true operand of the conditional and that of the parameter
+// instruction of the true computation share the same allocation. Similarly, the
+// buffers of the false operand and that of the parameter instruction of the
+// false computation share the same allocation.
+class ConditionalThunk : public Thunk {
+ public:
+  ConditionalThunk(const BufferAllocation::Slice& predicate_buffer_index,
+                   const BufferAllocation::Slice& true_operand_buffer_index,
+                   const BufferAllocation::Slice& false_operand_buffer_index,
+                   ThunkSequence true_thunk_sequence,
+                   ThunkSequence false_thunk_sequence,
+                   const HloInstruction* hlo);
+
+  ConditionalThunk(const ConditionalThunk&) = delete;
+  ConditionalThunk& operator=(const ConditionalThunk&) = delete;
+
+  Status Initialize(const GpuExecutable& executable) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
+
+ private:
+  BufferAllocation::Slice predicate_buffer_index_;
+  BufferAllocation::Slice true_operand_buffer_index_;
+  BufferAllocation::Slice false_operand_buffer_index_;
+  SequentialThunk true_thunk_;
+  SequentialThunk false_thunk_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
index e67087d822..e3b493c663 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
@@ -36,7 +36,7 @@ namespace gpu {
 
 StatusOr<HloInstruction*> GpuCopyInsertion::FindOrInsertCopy(
     HloInstruction* hlo) {
-  HloInstruction*& copy = inserted_copies_[hlo];
+  HloInstruction*& copy = hlo_to_copy_map_[hlo];
   if (copy == nullptr) {
     TF_ASSIGN_OR_RETURN(copy, hlo->parent()->DeepCopyInstruction(hlo));
   }
@@ -86,27 +86,34 @@ StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
     }
   }
 
-  // Init values of a while node cannot be constants. Insert copies for any
-  // constants found at the operand of a while.
-  tensorflow::gtl::FlatSet<HloInstruction*> copied_constants;
+  // Init values of while and conditional nodes cannot be constants. Insert
+  // copies for any constants found at the operands of these nodes.
+  tensorflow::gtl::FlatSet<HloInstruction*> inserted_copies;
   for (HloComputation* computation : module->computations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() != HloOpcode::kWhile) {
+      if (instruction->opcode() != HloOpcode::kWhile &&
+          instruction->opcode() != HloOpcode::kConditional) {
         continue;
       }
-      for (auto& pair :
-               dataflow->GetInstructionValueSet(instruction->operand(0))) {
-        const HloValueSet& value_set = pair.second;
-        for (const HloValue* value : value_set.values()) {
-          if (value->defining_instruction()->opcode() ==
-              HloOpcode::kConstant &&
-              !ContainsKey(copied_constants, value->defining_instruction())) {
-            HloInstruction* constant = value->defining_instruction();
-            TF_ASSIGN_OR_RETURN(HloInstruction * copy,
-                                FindOrInsertCopy(constant));
-            TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy));
-            copied_constants.insert(constant);
-            changed = true;
+      for (auto operand : instruction->operands()) {
+        // Skip the operands that have already been replaced with a copy in a
+        // previous iteration (which is possible when a constant is used as an
+        // operand in multiple places).
+        if (ContainsKey(inserted_copies, operand)) {
+          continue;
+        }
+        for (auto& pair : dataflow->GetInstructionValueSet(operand)) {
+          const HloValueSet& value_set = pair.second;
+          for (const HloValue* value : value_set.values()) {
+            if (value->defining_instruction()->IsConstant() &&
+                !ContainsKey(hlo_to_copy_map_, value->defining_instruction())) {
+              HloInstruction* constant = value->defining_instruction();
+              TF_ASSIGN_OR_RETURN(HloInstruction * copy,
+                                  FindOrInsertCopy(constant));
+              TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy));
+              inserted_copies.insert(copy);
+              changed = true;
+            }
           }
         }
       }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
index 4d77f337e6..0c6f9b511f 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
@@ -32,13 +32,13 @@ class GpuCopyInsertion : public HloPassInterface {
   StatusOr<bool> Run(HloModule* module) override;
 
  protected:
-  // Returns a copy of `hlo`. Looks in inserted_copies_ first to avoid making
+  // Returns a copy of `hlo`. Looks in hlo_to_copy_map_ first to avoid making
   // duplicate copies.
   StatusOr<HloInstruction*> FindOrInsertCopy(HloInstruction* hlo);
 
   // A map containing all copies inserted to materialize operands of library
   // calls. The key is the copied instruction and the value is the copy.
-  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> inserted_copies_;
+  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> hlo_to_copy_map_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 095c3df3bf..23b72c3f71 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -758,37 +758,6 @@ Status IrEmitter::HandleBatchNormGrad(HloInstruction*) {
       "to a cudnn CustomCall using CudnnBatchNormRewriter.");
 }
 
-Status IrEmitter::HandleConditional(HloInstruction* conditional) {
-  auto pred = conditional->operand(0);
-  auto true_arg = conditional->operand(1);
-  auto false_arg = conditional->operand(2);
-
-  llvm::Value* conditional_result = GetBasePointer(*conditional);
-
-  llvm::LoadInst* pred_value = ir_builder_.CreateLoad(
-      GetBasePointer(*pred),
-      llvm_ir::AsStringRef(IrName(conditional, "load_predicate_value")));
-  llvm::Value* pred_cond = ir_builder_.CreateICmpNE(
-      pred_value,
-      llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
-      llvm_ir::AsStringRef(IrName(conditional, "boolean_predicate")));
-  llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
-      pred_cond, IrName(conditional, "if_then_else"), &ir_builder_);
-
-  SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
-  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-      *conditional->true_computation(), {GetBasePointer(*true_arg)},
-      conditional_result));
-
-  SetToFirstInsertPoint(if_data.false_block, &ir_builder_);
-  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-      *conditional->false_computation(), {GetBasePointer(*false_arg)},
-      conditional_result));
-
-  SetToFirstInsertPoint(if_data.after_block, &ir_builder_);
-  return Status::OK();
-}
-
 llvm_ir::IrArray::Index IrEmitter::EmitOperandArrayLoopNest(
     const llvm_ir::IrArray& operand_array, int64 reduction_dimension,
     tensorflow::StringPiece name_suffix, llvm_ir::ForLoopNest* loop_nest) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 39bafaa346..3aa178410f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -96,7 +96,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction* custom_call) override;
   Status HandleRng(HloInstruction* random) override;
-  Status HandleConditional(HloInstruction* conditional) override;
   Status HandleBatchNormInference(HloInstruction* batch_norm) override;
   Status HandleBatchNormTraining(HloInstruction* batch_norm) override;
   Status HandleBatchNormGrad(HloInstruction* batch_norm) override;
@@ -367,6 +366,11 @@ class IrEmitterUnnested : public IrEmitter {
   std::unique_ptr<Thunk> BuildForThunk(const HloInstruction* hlo,
                                        const int64 loop_limit);
 
+  // Returns a ConditionalThunk that executes the thunk sequence for
+  // 'true_computation' or 'false_computation' depending on the value of the
+  // predicate in the given conditional instruction.
+  std::unique_ptr<Thunk> BuildConditionalThunk(const HloInstruction* hlo);
+
   Status Postprocess(HloInstruction* hlo) override;
 
   // Returns the last generated thunk.
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index be35351e87..fc8783e753 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
+#include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h"
@@ -272,8 +273,8 @@ Status IrEmitterUnnested::HandleDot(HloInstruction* dot) {
 }
 
 Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) {
-  thunk_sequence_->push_back(BuildKernelThunk(conditional));
-  return IrEmitter::HandleConditional(conditional);
+  thunk_sequence_->emplace_back(BuildConditionalThunk(conditional));
+  return Status::OK();
 }
 
 Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution) {
@@ -2102,6 +2103,24 @@ Status IrEmitterUnnested::EmitInitializer(const HloInstruction* hlo,
 
 namespace {
 
+// Checks that the buffers corresponding to the given two HLOs share the same
+// allocation.
+Status CheckHloBuffersShareAllocation(
+    const HloInstruction* a, const HloInstruction* b, const ShapeIndex& index,
+    const BufferAssignment& buffer_assignment) {
+  const BufferAllocation::Slice slice_a =
+      buffer_assignment.GetUniqueSlice(a, index).ConsumeValueOrDie();
+  const BufferAllocation::Slice slice_b =
+      buffer_assignment.GetUniqueSlice(b, index).ConsumeValueOrDie();
+  if (slice_a != slice_b) {
+    return InternalError(
+        "instruction %s %s does not share allocation with instruction %s %s",
+        a->ToString().c_str(), slice_a.ToString().c_str(),
+        b->ToString().c_str(), slice_b.ToString().c_str());
+  }
+  return Status::OK();
+}
+
 // Checks that all buffers used during while loop iteration share the same
 // buffer allocation. This includes buffers for while result, while init
 // operand, condition parameter, body parameter and body result.
@@ -2111,37 +2130,65 @@ Status CheckWhileBuffersShareAllocation(
     const BufferAssignment& buffer_assignment) {
   return ShapeUtil::ForEachSubshapeWithStatus(
       xla_while->shape(),
-      [&buffer_assignment, &xla_while](const Shape& /*subshape*/,
-                                       const ShapeIndex& index) -> Status {
-        auto check = [&buffer_assignment](const HloInstruction* a,
-                                          const HloInstruction* b,
-                                          const ShapeIndex& index) -> Status {
-          const BufferAllocation::Slice slice_a =
-              buffer_assignment.GetUniqueSlice(a, index).ConsumeValueOrDie();
-          const BufferAllocation::Slice slice_b =
-              buffer_assignment.GetUniqueSlice(b, index).ConsumeValueOrDie();
-          if (slice_a != slice_b) {
-            return InternalError(
-                "instruction %s %s does not share allocation with "
-                "instruction %s %s",
-                a->ToString().c_str(), slice_a.ToString().c_str(),
-                b->ToString().c_str(), slice_b.ToString().c_str());
-          }
-          return Status::OK();
-        };
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
         const HloInstruction* condition_parameter =
             xla_while->while_condition()->parameter_instruction(0);
         const HloComputation* body = xla_while->while_body();
         const HloInstruction* body_parameter = body->parameter_instruction(0);
         const HloInstruction* body_result = body->root_instruction();
-        TF_RETURN_IF_ERROR(check(xla_while, xla_while->operand(0), index));
-        TF_RETURN_IF_ERROR(check(xla_while, condition_parameter, index));
-        TF_RETURN_IF_ERROR(check(xla_while, body_parameter, index));
-        TF_RETURN_IF_ERROR(check(xla_while, body_result, index));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, xla_while->operand(0), index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, condition_parameter, index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, body_parameter, index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, body_result, index, buffer_assignment));
         return Status::OK();
       });
 }
 
+// Checks that the buffers used in a conditional instruction are shared with the
+// operands and result as follows:
+//   * The result buffer of the conditional should share the allocation with the
+//     result buffers of the true and false computations.
+//   * The buffer of operand 1 should share the allocation with the buffer of
+//     the parameter 0 instruction of the true computation.
+//   * The buffer of operand 2 should share the allocation with the buffer of
+//     the parameter 0 instruction of the false computation.
+Status CheckConditionalBuffersShareAllocation(
+    const HloInstruction* conditional,
+    const BufferAssignment& buffer_assignment) {
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      conditional->shape(),
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            conditional, conditional->true_computation()->root_instruction(),
+            index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            conditional, conditional->false_computation()->root_instruction(),
+            index, buffer_assignment));
+        return Status::OK();
+      }));
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      conditional->operand(1)->shape(),
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
+        return CheckHloBuffersShareAllocation(
+            conditional->operand(1),
+            conditional->true_computation()->parameter_instruction(0), index,
+            buffer_assignment);
+      }));
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      conditional->operand(2)->shape(),
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
+        return CheckHloBuffersShareAllocation(
+            conditional->operand(2),
+            conditional->false_computation()->parameter_instruction(0), index,
+            buffer_assignment);
+      }));
+  return Status::OK();
+}
+
 }  // namespace
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildWhileThunk(
@@ -2184,6 +2231,31 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildForThunk(
                               ir_emitter_body.ConsumeThunkSequence(), hlo);
 }
 
+std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
+    const HloInstruction* hlo) {
+  // Check that the buffers used in conditional are shared with the operands and
+  // result appropriately.
+  TF_CHECK_OK(CheckConditionalBuffersShareAllocation(
+      hlo, ir_emitter_context_->buffer_assignment()));
+
+  HloComputation* true_computation = hlo->true_computation();
+  IrEmitterUnnested ir_emitter_true(hlo_module_config_, true_computation,
+                                    ir_emitter_context_);
+  TF_CHECK_OK(true_computation->root_instruction()->Accept(&ir_emitter_true));
+
+  HloComputation* false_computation = hlo->false_computation();
+  IrEmitterUnnested ir_emitter_false(hlo_module_config_, false_computation,
+                                     ir_emitter_context_);
+  TF_CHECK_OK(false_computation->root_instruction()->Accept(&ir_emitter_false));
+
+  return MakeUnique<ConditionalThunk>(
+      GetAllocationSlice(*hlo->operand(0)),
+      GetAllocationSlice(*hlo->operand(1)),
+      GetAllocationSlice(*hlo->operand(2)),
+      std::move(*ir_emitter_true.ConsumeThunkSequence()),
+      std::move(*ir_emitter_false.ConsumeThunkSequence()), hlo);
+}
+
 Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator, KernelThunk* thunk) {
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 625c3f8bea..2c3032d79b 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -41,6 +41,7 @@ class GpuExecutable;
 class Thunk {
  public:
   enum class Kind {
+    kConditional,
     kConvolution,
     kCopy,
     kCudnnBatchNormBackward,
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
index 0016b6cc61..bc82167482 100644
--- a/tensorflow/compiler/xla/tests/conditional_test.cc
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -355,8 +355,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) {
 }
 
 // Test true and false computations that return a tuple of arrays.
-// TODO(b/71715476): Returning tuples from Conditional fails in GPU backend.
-XLA_TEST_F(ConditionalOpTest, DISABLED_ON_GPU(ReturnTupleOfArrays)) {
+XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) {
   ComputationBuilder builder(client_, TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({builder.ConstantR1<float>({12.2f, 15.8f}),
@@ -373,9 +372,7 @@ XLA_TEST_F(ConditionalOpTest, DISABLED_ON_GPU(ReturnTupleOfArrays)) {
 
 // Test true and false computations that return a tuple of a predicate, a
 // scalar, and an array.
-// TODO(b/71715476): Returning tuples from Conditional fails in GPU backend.
-XLA_TEST_F(ConditionalOpTest,
-           DISABLED_ON_GPU(ReturnTupleofPredicateScalarArray)) {
+XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
   ComputationBuilder true_builder(client_, TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
@@ -413,8 +410,7 @@ XLA_TEST_F(ConditionalOpTest,
 }
 
 // Test true and false computations that return a nested tuple.
-// TODO(b/71715476): Returning tuples from Conditional fails in GPU backend.
-XLA_TEST_F(ConditionalOpTest, DISABLED_ON_GPU(ReturnNestedTuple)) {
+XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
   ComputationBuilder true_builder(client_, TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
@@ -532,6 +528,32 @@ XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
 
+XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
+  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  {
+    Shape r0bool = ShapeUtil::MakeShape(PRED, {});
+    Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
+    auto param0 = inner_builder.Parameter(0, tuple_shape, "param0");
+    auto pred_cond = inner_builder.GetTupleElement(param0, 0);
+    auto true_operand = inner_builder.GetTupleElement(param0, 1);
+    auto false_operand = inner_builder.GetTupleElement(param0, 2);
+    inner_builder.Conditional(pred_cond, true_operand,
+                              CreateR0CeilComputation(), false_operand,
+                              CreateR0FloorComputation());
+  }
+  auto inner_builder_result = inner_builder.Build();
+  EXPECT_IS_OK(inner_builder_result.status());
+
+  ComputationBuilder builder(client_, TestName());
+  auto pred2 = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(1.1f);
+  auto operand2 = builder.ConstantR0<float>(12.2f);
+  auto tuple_operand = builder.Tuple({pred2, operand1, operand2});
+  builder.Call(inner_builder_result.ConsumeValueOrDie(), {tuple_operand});
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
 // Test a mismatch in the shape of the true operand and true computation.
 XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
   ComputationBuilder builder(client_, TestName());
-- 
GitLab


From 3942de820958e75792c86a50084f9312b5edd3ba Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 15:25:58 -0800
Subject: [PATCH 188/966] Adds loss_fn argument in remaining heads.

PiperOrigin-RevId: 183301479
---
 tensorflow/contrib/estimator/BUILD            |   3 +-
 .../estimator/python/estimator/head.py        |  89 +++----
 .../estimator/python/estimator/head_test.py   |   4 +-
 tensorflow/python/estimator/BUILD             |   1 +
 tensorflow/python/estimator/canned/head.py    | 134 +++++++++-
 .../python/estimator/canned/head_test.py      | 251 ++++++++++++++++++
 6 files changed, 412 insertions(+), 70 deletions(-)

diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index cdbe05e4d2..6cdbed5b89 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -163,7 +163,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:check_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lookup_ops",
@@ -177,7 +177,6 @@ py_library(
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:prediction_keys",
-        "//tensorflow/python/estimator:util",
         "//tensorflow/python/ops/losses",
         "//tensorflow/python/saved_model:signature_constants",
     ],
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index fd0994490a..238cf287b7 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator import util
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.canned import prediction_keys
@@ -29,7 +28,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
@@ -45,6 +43,7 @@ def multi_class_head(n_classes,
                      weight_column=None,
                      label_vocabulary=None,
                      loss_reduction=losses.Reduction.SUM,
+                     loss_fn=None,
                      name=None):
   """Creates a `_Head` for multi class classification.
 
@@ -65,6 +64,12 @@ def multi_class_head(n_classes,
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support integer `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     n_classes: Number of classes, must be greater than 2 (for 2 classes, use
       `binary_classification_head`).
@@ -79,6 +84,7 @@ def multi_class_head(n_classes,
       `label_vocabulary` is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -94,12 +100,17 @@ def multi_class_head(n_classes,
       weight_column=weight_column,
       label_vocabulary=label_vocabulary,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
 def binary_classification_head(
-    weight_column=None, thresholds=None, label_vocabulary=None,
-    loss_reduction=losses.Reduction.SUM, name=None):
+    weight_column=None,
+    thresholds=None,
+    label_vocabulary=None,
+    loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
+    name=None):
   """Creates a `_Head` for single label binary classification.
 
   This head uses `sigmoid_cross_entropy_with_logits` loss.
@@ -119,6 +130,12 @@ def binary_classification_head(
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support float `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -136,6 +153,7 @@ def binary_classification_head(
       is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -151,12 +169,14 @@ def binary_classification_head(
       thresholds=thresholds,
       label_vocabulary=label_vocabulary,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
 def regression_head(weight_column=None,
                     label_dimension=1,
                     loss_reduction=losses.Reduction.SUM,
+                    loss_fn=None,
                     name=None):
   """Creates a `_Head` for regression using the `mean_squared_error` loss.
 
@@ -175,6 +195,10 @@ def regression_head(weight_column=None,
   `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
   `[D0, D1, ... DN, label_dimension]`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, label_dimension]`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -185,6 +209,7 @@ def regression_head(weight_column=None,
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -198,6 +223,7 @@ def regression_head(weight_column=None,
       weight_column=weight_column,
       label_dimension=label_dimension,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
@@ -287,7 +313,7 @@ def multi_label_head(n_classes,
           'Length of label_vocabulary must be n_classes ({}). '
           'Given: {}'.format(n_classes, len(label_vocabulary)))
   if loss_fn:
-    _validate_loss_fn_args(loss_fn)
+    head_lib._validate_loss_fn_args(loss_fn)  # pylint:disable=protected-access
   if (loss_reduction not in losses.Reduction.all() or
       loss_reduction == losses.Reduction.NONE):
     raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
@@ -371,9 +397,9 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         labels=processed_labels, logits=logits,
         expected_labels_dimension=self.logits_dimension)
     if self._loss_fn:
-      unweighted_loss = _call_loss_fn(
+      unweighted_loss = head_lib._call_loss_fn(  # pylint:disable=protected-access
           loss_fn=self._loss_fn, labels=processed_labels, logits=logits,
-          features=features)
+          features=features, expected_loss_dim=1)
     else:
       unweighted_loss = losses.sigmoid_cross_entropy(
           multi_class_labels=processed_labels, logits=logits,
@@ -555,52 +581,3 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                 threshold=threshold,
                 name=recall_key))
     return metric_ops
-
-
-def _validate_loss_fn_args(loss_fn):
-  """Validates loss_fn arguments.
-
-  Required arguments: labels, logits.
-  Optional arguments: features.
-
-  Args:
-    loss_fn: The loss function.
-  Raises:
-    ValueError: If the signature is unexpected.
-  """
-  loss_fn_args = util.fn_args(loss_fn)
-  for required_arg in ['labels', 'logits']:
-    if required_arg not in loss_fn_args:
-      raise ValueError(
-          'loss_fn must contain argument: {}. '
-          'Given arguments: {}'.format(required_arg, loss_fn_args))
-  invalid_args = list(set(loss_fn_args) - set(['labels', 'logits', 'features']))
-  if invalid_args:
-    raise ValueError('loss_fn has unexpected args: {}'.format(invalid_args))
-
-
-def _call_loss_fn(loss_fn, labels, logits, features):
-  """Calls loss_fn and checks the returned shape.
-
-  Args:
-    loss_fn: The loss function.
-    labels: Processed labels Tensor.
-    logits: Logits Tensor of shape [batch_size, logits_dimension].
-    features: Features dict.
-  Returns:
-    Loss Tensor with shape [batch_size, 1].
-  """
-  loss_fn_args = util.fn_args(loss_fn)
-  kwargs = {}
-  if 'features' in loss_fn_args:
-    kwargs['features'] = features
-  unweighted_loss = loss_fn(labels=labels, logits=logits, **kwargs)
-  batch_size = array_ops.shape(logits)[0]
-  loss_shape = array_ops.shape(unweighted_loss)
-  check_shape_op = control_flow_ops.Assert(
-      math_ops.reduce_all(math_ops.equal(loss_shape, [batch_size, 1])),
-      data=[
-          'loss_fn must return Tensor of shape [batch_size, 1]. Given: ',
-          loss_shape])
-  with ops.control_dependencies([check_shape_op]):
-    return array_ops.identity(unweighted_loss)
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index 1adbd6f0fe..43cdfec968 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -381,8 +381,8 @@ class MultiLabelHead(test.TestCase):
       _initialize_variables(self, monitored_session.Scaffold())
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
-          r'loss_fn must return Tensor of shape \[batch_size, 1\]\. '
-          r'Given: \] \[2\]'):
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
+          r'\[logits_shape: \] \[2 2\] \[loss_shape: \] \[2\]'):
         actual_training_loss.eval()
 
   def test_eval_labels_none(self):
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 41f55b12af..c519fd557a 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -604,6 +604,7 @@ py_library(
         ":metric_keys",
         ":model_fn",
         ":prediction_keys",
+        ":util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index 94a5d3a342..cb9e3fc6ca 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -24,6 +24,7 @@ import collections
 import six
 
 from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator import util
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export_output
@@ -371,6 +372,64 @@ def _check_logits_final_dim(logits, expected_logits_dimension):
         return array_ops.identity(logits, name=scope)
 
 
+def _validate_loss_fn_args(loss_fn):
+  """Validates loss_fn arguments.
+
+  Required arguments: labels, logits.
+  Optional arguments: features.
+
+  Args:
+    loss_fn: The loss function.
+  Raises:
+    ValueError: If the signature is unexpected.
+  """
+  loss_fn_args = util.fn_args(loss_fn)
+  for required_arg in ['labels', 'logits']:
+    if required_arg not in loss_fn_args:
+      raise ValueError(
+          'loss_fn must contain argument: {}. '
+          'Given arguments: {}'.format(required_arg, loss_fn_args))
+  invalid_args = list(set(loss_fn_args) - set(['labels', 'logits', 'features']))
+  if invalid_args:
+    raise ValueError('loss_fn has unexpected args: {}'.format(invalid_args))
+
+
+def _call_loss_fn(loss_fn, labels, logits, features, expected_loss_dim=1):
+  """Calls loss_fn and checks the returned shape.
+
+  Args:
+    loss_fn: The loss function.
+    labels: Processed labels Tensor.
+    logits: Logits Tensor of shape [D0, D1, ... DN, logits_dimension].
+    features: Features dict.
+    expected_loss_dim: The expected last dimension of loss Tensor.
+  Returns:
+    Loss Tensor with shape [D0, D1, ... DN, expected_loss_dim].
+  """
+  loss_fn_args = util.fn_args(loss_fn)
+  kwargs = {}
+  if 'features' in loss_fn_args:
+    kwargs['features'] = features
+  with ops.name_scope(
+      None, 'call_loss_fn',
+      values=[labels, logits] + list(six.itervalues(features))):
+    unweighted_loss = loss_fn(labels=labels, logits=logits, **kwargs)
+    logits_shape = array_ops.shape(logits, name='logits_shape')
+    expected_loss_shape = array_ops.concat(
+        [logits_shape[:-1], [expected_loss_dim]], axis=0,
+        name='expected_loss_shape')
+    loss_shape = array_ops.shape(unweighted_loss, name='loss_shape')
+    check_loss_shape_op = control_flow_ops.Assert(
+        math_ops.reduce_all(math_ops.equal(loss_shape, expected_loss_shape)),
+        data=[
+            'loss_fn must return Tensor of shape '
+            '[D0, D1, ... DN, {}]. '.format(expected_loss_dim),
+            'logits_shape: ', logits_shape, 'loss_shape: ', loss_shape],
+        name='check_loss_shape')
+    with ops.control_dependencies([check_loss_shape_op]):
+      return array_ops.identity(unweighted_loss)
+
+
 def _indicator_labels_mean(labels, weights=None, name=None):
   with ops.name_scope(name, 'labels_mean', (labels, weights)) as scope:
     labels = math_ops.to_float(labels, name='labels')
@@ -467,6 +526,7 @@ def _multi_class_head_with_softmax_cross_entropy_loss(
     weight_column=None,
     label_vocabulary=None,
     loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
     name=None):
   """Creates a '_Head' for multi class classification.
 
@@ -485,6 +545,12 @@ def _multi_class_head_with_softmax_cross_entropy_loss(
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support integer `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     n_classes: Number of classes, must be greater than 2 (for 2 classes, use
       `_BinaryLogisticHeadWithSigmoidCrossEntropyLoss`).
@@ -499,6 +565,7 @@ def _multi_class_head_with_softmax_cross_entropy_loss(
       `label_vocabulary` is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -517,11 +584,14 @@ def _multi_class_head_with_softmax_cross_entropy_loss(
   if (loss_reduction not in losses.Reduction.all() or
       loss_reduction == losses.Reduction.NONE):
     raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
+  if loss_fn:
+    _validate_loss_fn_args(loss_fn)
   return _MultiClassHeadWithSoftmaxCrossEntropyLoss(
       n_classes=n_classes,
       weight_column=weight_column,
       label_vocabulary=label_vocabulary,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
@@ -533,6 +603,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
                weight_column=None,
                label_vocabulary=None,
                loss_reduction=losses.Reduction.SUM,
+               loss_fn=None,
                name=None):
     if (n_classes is None) or (n_classes <= 2):
       raise ValueError('n_classes must be > 2: %s.' % n_classes)
@@ -540,6 +611,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
     self._weight_column = weight_column
     self._label_vocabulary = label_vocabulary
     self._loss_reduction = loss_reduction
+    self._loss_fn = loss_fn
     self._name = name
 
   @property
@@ -602,10 +674,15 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
     labels = _check_dense_labels_match_logits_and_reshape(
         labels=labels, logits=logits, expected_labels_dimension=1)
     label_ids = self._label_ids(labels)
-    unweighted_loss = losses.sparse_softmax_cross_entropy(
-        labels=label_ids, logits=logits, reduction=losses.Reduction.NONE)
-    # Restore the squeezed dim, so unweighted_loss matches the weights shape.
-    unweighted_loss = array_ops.expand_dims(unweighted_loss, axis=-1)
+    if self._loss_fn:
+      unweighted_loss = _call_loss_fn(
+          loss_fn=self._loss_fn, labels=label_ids, logits=logits,
+          features=features, expected_loss_dim=1)
+    else:
+      unweighted_loss = losses.sparse_softmax_cross_entropy(
+          labels=label_ids, logits=logits, reduction=losses.Reduction.NONE)
+      # Restore the squeezed dim, so unweighted_loss matches the weights shape.
+      unweighted_loss = array_ops.expand_dims(unweighted_loss, axis=-1)
     weights = _get_weights_and_check_match_logits(
         features=features, weight_column=self._weight_column, logits=logits)
     training_loss = losses.compute_weighted_loss(
@@ -734,8 +811,12 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
 
 
 def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
-    weight_column=None, thresholds=None, label_vocabulary=None,
-    loss_reduction=losses.Reduction.SUM, name=None):
+    weight_column=None,
+    thresholds=None,
+    label_vocabulary=None,
+    loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
+    name=None):
   """Creates a `_Head` for single label binary classification.
 
   This head uses `sigmoid_cross_entropy_with_logits` loss.
@@ -755,6 +836,12 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support float `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -772,6 +859,7 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
       is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -795,11 +883,14 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
   if (loss_reduction not in losses.Reduction.all() or
       loss_reduction == losses.Reduction.NONE):
     raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
+  if loss_fn:
+    _validate_loss_fn_args(loss_fn)
   return _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(
       weight_column=weight_column,
       thresholds=thresholds,
       label_vocabulary=label_vocabulary,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
@@ -811,11 +902,13 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
                thresholds=None,
                label_vocabulary=None,
                loss_reduction=losses.Reduction.SUM,
+               loss_fn=None,
                name=None):
     self._weight_column = weight_column
     self._thresholds = thresholds
     self._label_vocabulary = label_vocabulary
     self._loss_reduction = loss_reduction
+    self._loss_fn = loss_fn
     self._name = name
 
   @property
@@ -916,8 +1009,13 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
           name='class_id_lookup').lookup(labels)
     labels = math_ops.to_float(labels)
     labels = _assert_range(labels, 2)
-    unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
-        labels=labels, logits=logits)
+    if self._loss_fn:
+      unweighted_loss = _call_loss_fn(
+          loss_fn=self._loss_fn, labels=labels, logits=logits,
+          features=features, expected_loss_dim=1)
+    else:
+      unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
+          labels=labels, logits=logits)
     weights = _get_weights_and_check_match_logits(
         features=features, weight_column=self._weight_column, logits=logits)
     training_loss = losses.compute_weighted_loss(
@@ -1057,6 +1155,7 @@ def _regression_head_with_mean_squared_error_loss(
     weight_column=None,
     label_dimension=1,
     loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
     name=None):
   """Creates a `_Head` for regression using the `mean_squared_error` loss.
 
@@ -1075,6 +1174,10 @@ def _regression_head_with_mean_squared_error_loss(
   `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
   `[D0, D1, ... DN, label_dimension]`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, label_dimension]`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -1085,6 +1188,7 @@ def _regression_head_with_mean_squared_error_loss(
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
       reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -1097,10 +1201,13 @@ def _regression_head_with_mean_squared_error_loss(
   if (loss_reduction not in losses.Reduction.all() or
       loss_reduction == losses.Reduction.NONE):
     raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
+  if loss_fn:
+    _validate_loss_fn_args(loss_fn)
   return _RegressionHeadWithMeanSquaredErrorLoss(
       weight_column=weight_column,
       label_dimension=label_dimension,
       loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
@@ -1112,6 +1219,7 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
       label_dimension,
       weight_column=None,
       loss_reduction=losses.Reduction.SUM,
+      loss_fn=None,
       name=None):
     """`Head` for regression."""
     if label_dimension < 1:
@@ -1119,6 +1227,7 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
     self._logits_dimension = label_dimension
     self._weight_column = weight_column
     self._loss_reduction = loss_reduction
+    self._loss_fn = loss_fn
     self._name = name
 
   @property
@@ -1137,8 +1246,13 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         labels=labels, logits=logits,
         expected_labels_dimension=self._logits_dimension)
     labels = math_ops.to_float(labels)
-    unweighted_loss = losses.mean_squared_error(
-        labels=labels, predictions=logits, reduction=losses.Reduction.NONE)
+    if self._loss_fn:
+      unweighted_loss = _call_loss_fn(
+          loss_fn=self._loss_fn, labels=labels, logits=logits,
+          features=features, expected_loss_dim=self._logits_dimension)
+    else:
+      unweighted_loss = losses.mean_squared_error(
+          labels=labels, predictions=logits, reduction=losses.Reduction.NONE)
     weights = _get_weights_and_check_match_logits(
         features=features, weight_column=self._weight_column, logits=logits,
         allow_per_logit_weights=True)
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index 4e871e8f37..3a03770af4 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -111,6 +111,41 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
       head_lib._multi_class_head_with_softmax_cross_entropy_loss(
           n_classes=3, loss_reduction=losses.Reduction.NONE)
 
+  def test_loss_fn_arg_labels_missing(self):
+    def _loss_fn(logits):
+      del logits  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: labels\. '
+        r'Given arguments: \(\'logits\',\)'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=3, loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_logits_missing(self):
+    def _loss_fn(labels):
+      del labels  # unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: logits\. '
+        r'Given arguments: \(\'labels\',\)'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=3, loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_features_ok(self):
+    def _loss_fn(labels, logits, features):
+      del labels, logits, features  # Unused
+    head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_invalid(self):
+    def _loss_fn(labels, logits, name=None):
+      del labels, logits, name  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn has unexpected args: \[\'name\'\]'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=3, loss_fn=_loss_fn)
+
   def test_invalid_logits_shape(self):
     n_classes = 3
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
@@ -406,6 +441,56 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
       self.assertAllClose(
           expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
 
+  def test_eval_create_loss_loss_fn(self):
+    """Tests head.create_loss for eval mode and custom loss_fn."""
+    loss = np.array([[1.], [2.]], dtype=np.float32)
+    logits_input = np.array([[-10., 10., 0.], [-15., 10., 0]], dtype=np.float32)
+    labels_input = np.array([[1], [2]], dtype=np.int64)
+    def _loss_fn(labels, logits):
+      check_labels = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
+          data=[labels])
+      check_logits = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
+          data=[logits])
+      with ops.control_dependencies([check_labels, check_logits]):
+        return constant_op.constant(loss)
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_fn=_loss_fn)
+
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_input,
+        labels=labels_input)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
+
+  def test_eval_create_loss_loss_fn_wrong_shape(self):
+    """Tests custom loss_fn that returns Tensor of unexpected shape."""
+    loss = np.array([1., 2.], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      del labels, logits  # Unused
+      return constant_op.constant(loss)
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_fn=_loss_fn)
+
+    logits = np.array([[-10., 10., 0.], [-15., 10., 0.]], dtype=np.float32)
+    labels = np.array([[1], [2]], dtype=np.int64)
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
+          r'\[logits_shape: \] \[2 3\] \[loss_shape: \] \[2\]'):
+        actual_training_loss.eval()
+
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
@@ -1204,6 +1289,41 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
       head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
           loss_reduction=losses.Reduction.NONE)
 
+  def test_loss_fn_arg_labels_missing(self):
+    def _loss_fn(logits):
+      del logits  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: labels\. '
+        r'Given arguments: \(\'logits\',\)'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_logits_missing(self):
+    def _loss_fn(labels):
+      del labels  # unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: logits\. '
+        r'Given arguments: \(\'labels\',\)'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_features_ok(self):
+    def _loss_fn(labels, logits, features):
+      del labels, logits, features  # Unused
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_invalid(self):
+    def _loss_fn(labels, logits, name=None):
+      del labels, logits, name  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn has unexpected args: \[\'name\'\]'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
   def test_invalid_logits_shape(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
     self.assertEqual(1, head.logits_dimension)
@@ -1699,6 +1819,56 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
       self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
       self.assertAllClose(expected_weights, actual_weights)
 
+  def test_eval_create_loss_loss_fn(self):
+    """Tests head.create_loss for eval mode and custom loss_fn."""
+    loss = np.array([[1.], [2.]], dtype=np.float32)
+    logits_input = np.array([[-10.], [10.]], dtype=np.float32)
+    labels_input = np.array([[1], [0]], dtype=np.int64)
+    def _loss_fn(labels, logits):
+      check_labels = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
+          data=[labels])
+      check_logits = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
+          data=[logits])
+      with ops.control_dependencies([check_labels, check_logits]):
+        return constant_op.constant(loss)
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_fn=_loss_fn)
+
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_input,
+        labels=labels_input)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
+
+  def test_eval_create_loss_loss_fn_wrong_shape(self):
+    """Tests custom loss_fn that returns Tensor of unexpected shape."""
+    loss = np.array([1., 2.], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      del labels, logits  # Unused
+      return constant_op.constant(loss)
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_fn=_loss_fn)
+
+    logits = np.array([[-10.], [10.]], dtype=np.float32)
+    labels = np.array([[1], [0]], dtype=np.int64)
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
+          r'\[logits_shape: \] \[2 1\] \[loss_shape: \] \[2\]'):
+        actual_training_loss.eval()
+
   def test_train_labels_none(self):
     """Tests that error is raised when labels is None."""
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
@@ -2355,6 +2525,37 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
       head_lib._regression_head_with_mean_squared_error_loss(
           loss_reduction=losses.Reduction.NONE)
 
+  def test_loss_fn_arg_labels_missing(self):
+    def _loss_fn(logits):
+      del logits  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: labels\. '
+        r'Given arguments: \(\'logits\',\)'):
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_logits_missing(self):
+    def _loss_fn(labels):
+      del labels  # unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: logits\. '
+        r'Given arguments: \(\'labels\',\)'):
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_features_ok(self):
+    def _loss_fn(labels, logits, features):
+      del labels, logits, features  # Unused
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_invalid(self):
+    def _loss_fn(labels, logits, name=None):
+      del labels, logits, name  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn has unexpected args: \[\'name\'\]'):
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
   def test_invalid_logits(self):
     head = head_lib._regression_head_with_mean_squared_error_loss(
         label_dimension=3)
@@ -2530,6 +2731,56 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
       # loss = [(43-45)^2, (44-41)] = [4, 9]
       self.assertAllClose(13., training_loss.eval())
 
+  def test_eval_create_loss_loss_fn(self):
+    """Tests head.create_loss for eval mode and custom loss_fn."""
+    loss = np.array([[0., 1.], [2., 3.]], dtype=np.float32)
+    logits_input = np.array([[-1., 1.], [-2., 2.]], dtype=np.float32)
+    labels_input = np.array([[1., 0.], [2., -1.]], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      check_labels = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
+          data=[labels])
+      check_logits = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
+          data=[logits])
+      with ops.control_dependencies([check_labels, check_logits]):
+        return constant_op.constant(loss)
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        label_dimension=2, loss_fn=_loss_fn)
+
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_input,
+        labels=labels_input)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
+
+  def test_eval_create_loss_loss_fn_wrong_shape(self):
+    """Tests custom loss_fn that returns Tensor of unexpected shape."""
+    loss = np.array([[1.], [2.]], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      del labels, logits  # Unused
+      return constant_op.constant(loss)
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        label_dimension=2, loss_fn=_loss_fn)
+
+    logits = np.array([[-1., 1.], [-2., 2.]], dtype=np.float32)
+    labels = np.array([[1., 0.], [2., -1.]], dtype=np.float32)
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 2\]\. \] '
+          r'\[logits_shape: \] \[2 2\] \[loss_shape: \] \[2 1\]'):
+        actual_training_loss.eval()
+
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
     head = head_lib._regression_head_with_mean_squared_error_loss()
-- 
GitLab


From 11905a5da2785dba666c23762bd82c1f0f9d583b Mon Sep 17 00:00:00 2001
From: Ashish Kumar Ram <ashish.ram88@gmail.com>
Date: Fri, 26 Jan 2018 00:31:25 +0100
Subject: [PATCH 189/966] Add missing library in Dockerfile (#16417)

The local Dockerfile does not have all the dependencies for running the exercise notebooks in udacity assignments.
---
 tensorflow/examples/udacity/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/examples/udacity/Dockerfile b/tensorflow/examples/udacity/Dockerfile
index 3ca58566c1..00eb853e52 100644
--- a/tensorflow/examples/udacity/Dockerfile
+++ b/tensorflow/examples/udacity/Dockerfile
@@ -8,7 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN pip install scikit-learn pyreadline Pillow
+RUN pip install scikit-learn pyreadline Pillow imageio
 RUN rm -rf /notebooks/*
 ADD *.ipynb /notebooks/
 WORKDIR /notebooks
-- 
GitLab


From 8defb1047db9dc0444d30f3e4be7ac2580f426c2 Mon Sep 17 00:00:00 2001
From: Jian Lin <lj@linjian.org>
Date: Fri, 26 Jan 2018 07:32:10 +0800
Subject: [PATCH 190/966] add URLEncode for the CopyObjectRequest of S3 Rename
 function (#16415)

---
 tensorflow/core/platform/s3/s3_file_system.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 2c0babe098..1e89fa77c1 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <aws/core/utils/FileSystemUtils.h>
 #include <aws/core/utils/logging/AWSLogging.h>
 #include <aws/core/utils/logging/LogSystemInterface.h>
+#include <aws/core/utils/StringUtils.h>
 #include <aws/s3/S3Client.h>
 #include <aws/s3/S3Errors.h>
 #include <aws/s3/model/CopyObjectRequest.h>
@@ -607,7 +608,8 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
       Aws::String src_key = object.GetKey();
       Aws::String target_key = src_key;
       target_key.replace(0, src_object.length(), target_object.c_str());
-      Aws::String source = Aws::String(src_bucket.c_str()) + "/" + src_key;
+      Aws::String source = Aws::String(src_bucket.c_str()) + "/"
+          + Aws::Utils::StringUtils::URLEncode(src_key.c_str());
 
       copyObjectRequest.SetBucket(target_bucket.c_str());
       copyObjectRequest.SetKey(target_key);
-- 
GitLab


From aa1e754d4c3f7f03abf6a1e13ce92e71c1ff80c4 Mon Sep 17 00:00:00 2001
From: Sayed Hadi Hashemi <info@xldrx.com>
Date: Thu, 25 Jan 2018 17:32:44 -0600
Subject: [PATCH 191/966] Implement LoggingAsync for GRPC Worker Services
 (#14604)

* Implement LoggingAsync for GRPC Worker.

* Add nullptr checks

* Fix BUILD file format

* Change LoggingAsync implementation

- Revert changes to *_rendezvous_mgr
- Implement logging primitives in session_mgr instead
- Implement ClearLogs
- Fixed C++ formating

* Check for nullptr

* Better handling of the case when both "clear" and "retrieve" flags are sent to "LogAsync."
nullptr check on default_worker_cache_.

* Fix formatting

* Updata session_mgr.cc to address changes in 619792f
---
 tensorflow/core/distributed_runtime/BUILD     |  1 +
 .../distributed_runtime/master_session.cc     |  4 +
 .../rpc/grpc_worker_service.cc                | 18 ++++
 .../rpc/grpc_worker_service.h                 |  3 +
 .../core/distributed_runtime/session_mgr.cc   | 82 ++++++++++++++++++-
 .../core/distributed_runtime/session_mgr.h    |  9 ++
 6 files changed, 115 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index f4ee841032..9e152aa082 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -145,6 +145,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:worker_proto_cc",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index dcc25e4426..9d4a1eb8a1 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -1448,6 +1448,8 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
     const auto count = run_state->count;
     pss.collect_timeline =
         req.options().trace_level() == RunOptions::FULL_TRACE;
+    pss.collect_rpcs =
+            req.options().trace_level() == RunOptions::FULL_TRACE;
     pss.report_tensor_allocations_upon_oom =
         req.options().report_tensor_allocations_upon_oom();
 
@@ -1610,6 +1612,8 @@ Status MasterSession::DoRunWithLocalExecution(
   TRACEPRINTF("stepid %llu", step_id);
 
   pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
+  pss.collect_rpcs =
+          req.options().trace_level() == RunOptions::FULL_TRACE;
   pss.report_tensor_allocations_upon_oom =
       req.options().report_tensor_allocations_upon_oom();
   // Build the cost model every 'build_cost_model_every' steps after skipping an
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index 95811476f7..b20e744a97 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -444,6 +444,24 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
       });
 }
 
+void GrpcWorker::LoggingAsync(const LoggingRequest* request,
+                              LoggingResponse* response, StatusCallback done) {
+  auto env = this->env();
+  if (env) {
+    auto session_mgr = (SessionMgr*)env->session_mgr;
+    if (session_mgr) {
+      session_mgr->SetLogging(request->rpc_logging());
+      for (const auto& step_id : request->fetch_step_id()) {
+        session_mgr->RetrieveLogs(step_id, response);
+      }
+      if (request->clear()) {
+        session_mgr->ClearLogs();
+      }
+    }
+  }
+  done(Status::OK());
+}
+
 WorkerEnv* GrpcWorker::env() { return env_; }
 
 std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* env) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index 78a21fd9f6..3954af8ad8 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -40,6 +40,9 @@ class GrpcWorker : public Worker {
                                    ::grpc::ByteBuffer* response,
                                    StatusCallback done);
 
+  virtual void LoggingAsync(const LoggingRequest* request,
+                    LoggingResponse* response, StatusCallback done);
+
   WorkerEnv* env();
 
  private:
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 8db49e7f15..51b9547f53 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -43,8 +43,8 @@ SessionMgr::SessionMgr(
       worker_cache_factory_(std::move(worker_cache_factory)) {}
 
 string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
-  return strings::StrCat("/job:", server_def.job_name(),
-                         "/replica:0/task:", server_def.task_index());
+  return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:",
+                         server_def.task_index());
 }
 
 Status SessionMgr::CreateSession(const string& session,
@@ -64,8 +64,13 @@ Status SessionMgr::CreateSession(const string& session,
     TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
   }
 
+  if (worker_cache != nullptr & default_worker_cache_.get() != nullptr) {
+    worker_cache->SetLogging(this->is_logging_active_);
+  }
+
   CHECK(!worker_env_->local_devices.empty())
       << "The WorkerEnv must have at least one device in `local_devices`.";
+
   std::vector<Device*> renamed_devices;
   for (Device* d : worker_env_->local_devices) {
     renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
@@ -113,4 +118,77 @@ std::shared_ptr<WorkerSession> SessionMgr::LegacySession() {
   return legacy_session_;
 }
 
+void SessionMgr::SetLogging(bool active) {
+  mutex_lock l(mu_);
+  this->is_logging_active_ = active;
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      worker_cache->SetLogging(active);
+    }
+  }
+
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        worker_cache->SetLogging(active);
+      }
+    }
+  }
+}
+
+void SessionMgr::RetrieveLogs(tensorflow::int64 step_id,
+                              LoggingResponse* response) {
+  mutex_lock l(mu_);
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      auto step_stats = StepStats();
+      if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
+        auto* labeled_step_stats = response->add_step();
+        labeled_step_stats->set_step_id(step_id);
+        labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
+      }
+    }
+  }
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        auto step_stats = StepStats();
+        if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
+          auto* labeled_step_stats = response->add_step();
+          labeled_step_stats->set_step_id(step_id);
+          labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
+        }
+      }
+    }
+  }
+}
+
+void SessionMgr::ClearLogs() {
+  mutex_lock l(mu_);
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      worker_cache->ClearLogs();
+    }
+  }
+
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        worker_cache->ClearLogs();
+      }
+    }
+  }
+}
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 3ce260d12e..4c9702d522 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
 
 namespace tensorflow {
 
@@ -56,6 +57,12 @@ class SessionMgr {
 
   static string WorkerNameFromServerDef(const ServerDef& server_def);
 
+  void SetLogging(bool active);
+
+  void RetrieveLogs(tensorflow::int64 step_id, LoggingResponse* response);
+
+  void ClearLogs();
+
  private:
   const WorkerEnv* const worker_env_;  // Not owned.
 
@@ -75,6 +82,8 @@ class SessionMgr {
   std::unique_ptr<WorkerCacheInterface> default_worker_cache_;
   std::shared_ptr<WorkerSession> legacy_session_;
 
+  bool is_logging_active_ = false;
+
   const WorkerCacheFactory worker_cache_factory_;
 
   std::shared_ptr<WorkerSession> WorkerSessionForSessionUnlocked(
-- 
GitLab


From 4f9ef6c625913947b8c83cdaef957b23b0bada62 Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Thu, 25 Jan 2018 15:59:12 -0800
Subject: [PATCH 192/966] In the TF cost model, ensure when id grows, records
 for each op's outputs match the number of outputs.

PiperOrigin-RevId: 183306267
---
 tensorflow/core/graph/costmodel.cc | 53 +++++++++++++++++++-----------
 tensorflow/core/graph/costmodel.h  |  4 +--
 2 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc
index b1e6cf64e8..4118f14f8b 100644
--- a/tensorflow/core/graph/costmodel.cc
+++ b/tensorflow/core/graph/costmodel.cc
@@ -57,10 +57,10 @@ void CostModel::MergeFromLocal(const Graph& g, const CostModel& cm) {
     const int local_id = cm.Id(n);
     const int global_id = Id(n);
     if (local_id < 0 || global_id < 0) continue;
-    Ensure(global_id);
+    int num_slots = cm.slot_bytes_[local_id].size();
+    Ensure(global_id, num_slots);
     count_[global_id] += cm.count_[local_id];
     time_[global_id] += cm.time_[local_id];
-    int num_slots = cm.slot_bytes_[local_id].size();
     if (num_slots > 0) {
       if (slot_bytes_[global_id].empty()) {
         slot_bytes_[global_id].resize(num_slots);
@@ -78,11 +78,11 @@ void CostModel::MergeFromGlobal(const CostModel& cm) {
   CHECK(is_global_);
   CHECK_EQ(true, cm.is_global());
   const int num_nodes = cm.count_.size();
-  Ensure(num_nodes);
-  for (int i = 0; i < num_nodes; ++i) {
+  for (int i = num_nodes - 1; i >= 0; --i) {
     count_[i] += cm.count_[i];
     time_[i] += cm.time_[i];
     int num_slots = cm.slot_bytes_[i].size();
+    Ensure(i, num_slots);
     if (num_slots > 0) {
       if (slot_bytes_[i].empty()) {
         slot_bytes_[i].resize(num_slots);
@@ -106,7 +106,7 @@ void CostModel::MergeFromStats(const NodeNameToCostIdMap& map,
       // copy/send/recv nodes, feed/fetch, etc.
       if (iter == map.end()) continue;
       int32 global_id = iter->second;
-      Ensure(global_id);
+      Ensure(global_id, ns.output_size());
       int64 elapsed_micros = ns.op_end_rel_micros() - ns.op_start_rel_micros();
       count_[global_id]++;
       time_[global_id] += elapsed_micros;
@@ -122,7 +122,7 @@ void CostModel::MergeFromStats(const NodeNameToCostIdMap& map,
   }
 }
 
-void CostModel::Ensure(int id) {
+void CostModel::Ensure(int id, int num_outputs) {
   if (slot_bytes_.size() <= static_cast<size_t>(id)) {
     slot_bytes_.resize(id + 1);
     count_.resize(id + 1);
@@ -131,25 +131,37 @@ void CostModel::Ensure(int id) {
     max_exec_time_.resize(id + 1);
     output_port_alloc_ids_.resize(id + 1);
   }
+  if (num_outputs > 0) {
+    auto perslot = &slot_bytes_[id];
+    auto output_port_alloc_ids = &output_port_alloc_ids_[id];
+    auto max_mem_usage = &max_mem_usage_[id];
+
+    CHECK_LE(perslot->size(), num_outputs);
+    DCHECK_EQ(output_port_alloc_ids->size(), perslot->size());
+    DCHECK_EQ(max_mem_usage->output_port_mem.size(), perslot->size());
+    DCHECK_EQ(max_mem_usage->output_port_shape.size(), perslot->size());
+    DCHECK_EQ(max_mem_usage->output_port_type.size(), perslot->size());
+
+    perslot->resize(num_outputs, Bytes(-1));
+    output_port_alloc_ids->resize(num_outputs, -1);
+    max_mem_usage->output_port_mem.resize(num_outputs, Bytes(-1));
+    max_mem_usage->output_port_shape.resize(num_outputs, unknown_shape_);
+    max_mem_usage->output_port_type.resize(num_outputs, DT_INVALID);
+  }
 }
 
 void CostModel::SetNumOutputs(const Node* node, int num_outputs) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  // Do not resize the number of slots before checking its existing number of
+  // slots.
+  Ensure(id, 0);
   auto perslot = &slot_bytes_[id];
-  auto max_mem_usage = &max_mem_usage_[id];
-  auto output_port_alloc_ids = &output_port_alloc_ids_[id];
   if (!perslot->empty()) {
     CHECK_EQ(num_outputs, perslot->size()) << "Cannot resize slot_bytes, node="
                                            << node->name();
-  } else {
-    perslot->resize(num_outputs, Bytes(-1));
-    output_port_alloc_ids->resize(num_outputs, -1);
-    max_mem_usage->output_port_mem.resize(num_outputs, Bytes(-1));
-    max_mem_usage->output_port_shape.resize(num_outputs, unknown_shape_);
-    max_mem_usage->output_port_type.resize(num_outputs, DT_INVALID);
   }
+  Ensure(id, num_outputs);
 }
 
 void CostModel::RecordCount(const Node* node, int count) {
@@ -198,7 +210,7 @@ void CostModel::RecordTime(const Node* node, Microseconds time) {
   const int id = Id(node);
   if (id < 0) return;
   DCHECK(node->IsOp()) << node->DebugString();
-  Ensure(id);
+  Ensure(id, node->num_outputs());
   time_[id] += time;
 }
 
@@ -240,7 +252,10 @@ void CostModel::RecordMaxMemorySize(const Node* node, int output_slot,
                                     const DataType& dtype) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  CHECK_LT(output_slot, node->num_outputs())
+      << "Unexpected output slot for node " << node->DebugString() << ". Got "
+      << output_slot << " but its num_outputs is " << node->num_outputs();
+  Ensure(id, node->num_outputs());
   auto& current_max = max_mem_usage_[id].output_port_mem[output_slot];
   // If the memory allocator doesn't track memory usage, let's infer a lower
   // bound from the tensor shape and its data type.
@@ -316,7 +331,7 @@ void CostModel::RecordMemoryStats(const Node* node,
 void CostModel::RecordMaxExecutionTime(const Node* node, Microseconds time) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  Ensure(id, node->num_outputs());
   max_exec_time_[id] = std::max(max_exec_time_[id], time);
 }
 
@@ -332,7 +347,7 @@ void CostModel::RecordAllocationId(const Node* node, int output_slot,
                                    int64 alloc_id) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  Ensure(id, node->num_outputs());
   output_port_alloc_ids_[id][output_slot] = alloc_id;
 }
 
diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h
index 081eb2ff4c..c60a946c2c 100644
--- a/tensorflow/core/graph/costmodel.h
+++ b/tensorflow/core/graph/costmodel.h
@@ -183,8 +183,8 @@ class CostModel {
 
   const bool is_global_;
 
-  // Resizes vectors so that they are large enough for "id".
-  void Ensure(int id);
+  // Resizes vectors so that they are large enough for "id" and id's outputs.
+  void Ensure(int id, int num_outputs);
 
   // Nodes and Edges whose count is < this value
   // get type/byte estimates of 0.
-- 
GitLab


From 9581462f8743ee92f39d46263d68fc1283082b44 Mon Sep 17 00:00:00 2001
From: Max Galkin <maxgalkin@google.com>
Date: Thu, 25 Jan 2018 16:08:13 -0800
Subject: [PATCH 193/966] Show friendlier error message on failure in
 tf_optimizer.i

Without it we trigger a segmentation fault, but later in a different stack, which is not so helpful.

PiperOrigin-RevId: 183307729
---
 tensorflow/python/grappler/tf_optimizer.i | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
index f0dd4483a6..1b657983a4 100644
--- a/tensorflow/python/grappler/tf_optimizer.i
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -103,6 +103,11 @@ PyObject* TF_OptimizeGraph(
     std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
         tensorflow::grappler::GrapplerItemFromMetaGraphDef(graph_id, metagraph, item_config);
 
+    if (!grappler_item) {
+      TF_SetStatus(out_status, TF_INVALID_ARGUMENT, "Failed to import metagraph, check error log for more info.");
+      return nullptr;
+    }
+
     tensorflow::DeviceBase* cpu_device = nullptr;
     tensorflow::GraphDef out_graph;
     tensorflow::grappler::MetaOptimizer optimizer(cpu_device, rewriter_config);
-- 
GitLab


From 2564d5e90e12b4ec3dbd01b442b42a2a8ac7f8f6 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Thu, 25 Jan 2018 16:15:50 -0800
Subject: [PATCH 194/966] Make batch_sequences_with_states_test.py work with C
 API enabled, take 2.

This fixes the original rollback by using placeholders for the
SparseTensor shapes. The flakiness was caused by the nondeterministic
ordering of the sequences dict.

PiperOrigin-RevId: 183308774
---
 .../batch_sequences_with_states_test.py       | 30 +++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index 2a0ef0e6b3..dbdbb08a82 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -53,7 +53,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
     sp_tensor1 = sparse_tensor.SparseTensor(
         array_ops.constant(ind1, dtypes.int64),
         array_ops.constant(val1, dtypes.int64),
-        array_ops.constant(shape1, dtypes.int64))
+        array_ops.placeholder_with_default(shape1, shape=[2]))
     ind2 = np.array([
         [0, 0, 1],
         [0, 1, 0],
@@ -68,7 +68,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
     sp_tensor2 = sparse_tensor.SparseTensor(
         array_ops.constant(ind2, dtypes.int64),
         array_ops.constant(val2, dtypes.int64),
-        array_ops.constant(shape2, dtypes.int64))
+        array_ops.placeholder_with_default(shape2, shape=[3]))
     sp_tensor3 = sparse_tensor.SparseTensor(
         array_ops.constant([[1, 9], [2, 2], [2, 10]], dtypes.int64),
         array_ops.constant([7, 15, 2], dtypes.int64),
@@ -320,6 +320,18 @@ class BatchSequencesWithStatesTest(test.TestCase):
   def testNotAMultiple(self):
     num_unroll = 3  # Not a divisor of value_length -
     # so padding would have been necessary.
+
+    # Use placeholder_with_default in sequences to make sure we get runtime
+    # error instead of shape inference error
+    sequences = {
+        "seq1": array_ops.placeholder_with_default(self.sequences["seq1"],
+                                                   shape=(None, 5)),
+        "seq2": array_ops.placeholder_with_default(self.sequences["seq2"],
+                                                   shape=(None, 4, 2)),
+        "seq3": self.sequences["seq3"],
+        "seq4": self.sequences["seq4"],
+    }
+
     with self.test_session() as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    ".*should be a multiple of: 3, but saw "
@@ -330,7 +342,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
           with coord.stop_on_exception():
             next_batch = sqss.batch_sequences_with_states(
                 input_key=self.key,
-                input_sequences=self.sequences,
+                input_sequences=sequences,
                 input_context=self.context,
                 input_length=3,
                 initial_states=self.initial_states,
@@ -493,6 +505,18 @@ class BatchSequencesWithStatesTest(test.TestCase):
         expected_seq4_batch2=expected_seq4_batch2)
 
 
+class BatchSequencesWithStatesTestWithCApi(BatchSequencesWithStatesTest):
+
+  def setUp(self):
+    self._prev_value = ops._USE_C_API
+    ops._USE_C_API = True
+    super(BatchSequencesWithStatesTestWithCApi, self).setUp()
+
+  def tearDown(self):
+    super(BatchSequencesWithStatesTestWithCApi, self).tearDown()
+    ops._USE_C_API = self._prev_value
+
+
 class PaddingTest(test.TestCase):
 
   def testPaddingInvalidLengths(self):
-- 
GitLab


From 8a87518ae7074d5a0da779089e7024cd0920bba4 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Fri, 26 Jan 2018 01:31:41 +0100
Subject: [PATCH 195/966] import tensorflow as tf (#16318)

* import tensorflow as tf

* import tensorflow as tf

* from contextlib import contextmanager

* remove the last remaining change to py2tf
---
 tensorflow/compiler/tests/binary_ops_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 16856bd736..c95fb1c515 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -774,15 +774,15 @@ class BinaryOpsTest(XLATestCase):
   def DISABLED_testSparseMatMul(self):
     # Binary wrappers for sparse_matmul with different hints
     def SparseMatmulWrapperTF(a, b):
-      return tf.sparse_matmul(a, b, a_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, a_is_sparse=True)
 
     def SparseMatmulWrapperFT(a, b):
-      return tf.sparse_matmul(a, b, b_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, b_is_sparse=True)
 
     def SparseMatmulWrapperTT(a, b):
-      return tf.sparse_matmul(a, b, a_is_sparse=True, b_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, a_is_sparse=True, b_is_sparse=True)
 
-    self._testMatMul(tf.sparse_matmul)
+    self._testMatMul(math_ops.sparse_matmul)
     self._testMatMul(SparseMatmulWrapperTF)
     self._testMatMul(SparseMatmulWrapperFT)
     self._testMatMul(SparseMatmulWrapperTT)
-- 
GitLab


From 1b44a76921295fa5dfa7271f3a486b0ceaa8b3e1 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 25 Jan 2018 16:45:04 -0800
Subject: [PATCH 196/966] Deleted unused data fields

PiperOrigin-RevId: 183312596
---
 tensorflow/core/grappler/costs/cost_estimator.h            | 3 ---
 tensorflow/core/grappler/costs/measuring_cost_estimator.cc | 4 ----
 2 files changed, 7 deletions(-)

diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index 852e69737b..b7eaf8dc63 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -85,10 +85,7 @@ struct Costs {
   typedef NanoSeconds Duration;
 
   // Overall cost of running the graph; latency.
-  // Mean
   Duration execution_time;
-  Duration min_execution_time;
-  Duration max_execution_time;
 
   // Computation cost of running the graph.
   Duration compute_time;
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
index 8fd1801863..ea4320687a 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -117,8 +117,6 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     LOG(ERROR) << "Failed to measure graph performance: "
                << status.error_message();
     costs->execution_time = Costs::Duration::max();
-    costs->max_execution_time = Costs::Duration::max();
-    costs->min_execution_time = 0;
     return status;
   }
 
@@ -126,8 +124,6 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
   // to filter out outliers.
   RobustStats stats(times);
   costs->execution_time = Costs::Duration(stats.mean());
-  costs->max_execution_time = Costs::Duration(stats.hi());
-  costs->min_execution_time = Costs::Duration(stats.lo());
 
   return Status::OK();
 }
-- 
GitLab


From 879ea264780e6867ac48f507cddcdd9cb1e9b544 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 25 Jan 2018 16:45:52 -0800
Subject: [PATCH 197/966] Automated g4 rollback of changelist 183296506

PiperOrigin-RevId: 183312680
---
 .../compiler/xla/service/hlo_matchers.cc      | 24 ---------
 .../compiler/xla/service/hlo_matchers.h       | 53 ++-----------------
 .../compiler/xla/service/hlo_matchers_test.cc | 33 ------------
 3 files changed, 3 insertions(+), 107 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index fe1bf61e97..4255d60866 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -102,30 +102,6 @@ bool HloGetTupleElementMatcher::MatchAndExplain(
   return true;
 }
 
-void HloCustomCallMatcher::DescribeTo(std::ostream* os) const {
-  HloMatcher::DescribeTo(os);
-  *os << " with call target that "
-      << ::testing::DescribeMatcher<string>(call_target_matcher_);
-}
-
-bool HloCustomCallMatcher::MatchAndExplain(
-    const HloInstruction* instruction,
-    ::testing::MatchResultListener* listener) const {
-  if (!HloMatcher::MatchAndExplain(instruction, listener)) {
-    return false;
-  }
-  ::testing::StringMatchResultListener sub_listener;
-  bool result = ExplainMatchResult(
-      call_target_matcher_, instruction->custom_call_target(), &sub_listener);
-  if (sub_listener.str().empty()) {
-    sub_listener << " that "
-                 << ::testing::DescribeMatcher<string>(call_target_matcher_,
-                                                       /*negation=*/!result);
-  }
-  *listener << "custom-call with call target" << sub_listener.str();
-  return result;
-}
-
 }  // namespace testing
 
 void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 103f04a2cb..9206cdac05 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -56,8 +56,8 @@ class HloParameterMatcher : public HloMatcher {
 // index to match.
 class HloGetTupleElementMatcher : public HloMatcher {
  public:
-  HloGetTupleElementMatcher(::testing::Matcher<const HloInstruction*> operand,
-                            int64 tuple_index)
+  explicit HloGetTupleElementMatcher(
+      ::testing::Matcher<const HloInstruction*> operand, int64 tuple_index)
       : HloMatcher(HloOpcode::kGetTupleElement, /*operands=*/{operand}),
         tuple_index_(tuple_index) {}
 
@@ -68,24 +68,6 @@ class HloGetTupleElementMatcher : public HloMatcher {
   int64 tuple_index_;
 };
 
-// Custom matcher for custom-call instructions, which accepts a matcher for its
-// call target.
-class HloCustomCallMatcher : public HloMatcher {
- public:
-  HloCustomCallMatcher(
-      ::testing::Matcher<string> call_target_matcher,
-      std::vector<::testing::Matcher<const HloInstruction*>> operands)
-      : HloMatcher(HloOpcode::kCustomCall, operands),
-        call_target_matcher_(call_target_matcher) {}
-
-  bool MatchAndExplain(const HloInstruction* instruction,
-                       ::testing::MatchResultListener* listener) const override;
-  void DescribeTo(std::ostream* os) const override;
-
- private:
-  ::testing::Matcher<string> call_target_matcher_;
-};
-
 // HloInstruction* matchers for opcode and operands. Example:
 //   namespace op = xla::opcode_matchers;
 //   EXPECT_THAT(instruction,
@@ -112,6 +94,7 @@ HLO_MATCHER(Convert);
 HLO_MATCHER(Convolution);
 HLO_MATCHER(Copy);
 HLO_MATCHER(CrossReplicaSum);
+HLO_MATCHER(CustomCall);
 HLO_MATCHER(Divide);
 HLO_MATCHER(Dot);
 HLO_MATCHER(DynamicSlice);
@@ -201,36 +184,6 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> GetTupleElement() {
       new ::xla::testing::HloMatcher(HloOpcode::kGetTupleElement, {}));
 }
 
-// - CustomCall(T, operand1, ..., operandN) matches a CustomCall with call
-//   target T and the given operands.
-//
-// - CustomCall(operand1, ..., operandN) matches any CustomCall HLO with the
-//   given operands.
-//
-// - CustomCall() matches any CustomCall HLO at all.
-template <typename... M>
-inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall(
-    ::testing::Matcher<string> call_target_matcher, M... operands) {
-  return ::testing::MakeMatcher(new ::xla::testing::HloCustomCallMatcher(
-      call_target_matcher, {operands...}));
-}
-// This overload of CustomCall(A, B, C, ...) exists iff A is not convertible to
-// ::testing::Matcher<string>.  In that case, we want to prefer the overload
-// above.
-template <typename FirstM, typename... M,
-          typename Dummy = typename std::enable_if<
-              !std::is_convertible<FirstM, ::testing::Matcher<string>>::value,
-              void>::type*>
-inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall(
-    FirstM operands_first, M... operands_rest) {
-  return ::testing::MakeMatcher(new ::xla::testing::HloMatcher(
-      HloOpcode::kCustomCall, {operands_first, operands_rest...}));
-}
-inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall() {
-  return ::testing::MakeMatcher(
-      new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {}));
-}
-
 #undef HLO_MATCHER
 }  // namespace opcode_matchers
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index 1c21703a45..1465d1cacd 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -23,12 +23,6 @@ using ::testing::Eq;
 namespace xla {
 namespace {
 
-string DescribeHloMatcher(const ::testing::Matcher<const HloInstruction*>& m) {
-  std::stringstream ss;
-  m.DescribeTo(&ss);
-  return ss.str();
-}
-
 template <typename M, typename T>
 string Explain(const T& t, const M& m) {
   ::testing::StringMatchResultListener listener;
@@ -73,32 +67,5 @@ TEST(HloMatchersTest, Test) {
          "add"));
 }
 
-TEST(HloMatchersTest, CustomCallMatcher) {
-  auto c1 = HloInstruction::CreateConstant(Literal::CreateR1<float>({1, 2, 3}));
-  auto c2 = HloInstruction::CreateConstant(Literal::CreateR1<int32>({1, 2, 3}));
-  auto call = HloInstruction::CreateCustomCall(
-      ShapeUtil::MakeShape(F32, {1}), {c1.get(), c2.get()}, "foo_target");
-
-  EXPECT_THAT(call.get(), op::CustomCall());
-  EXPECT_THAT(call.get(), op::CustomCall(c1.get(), c2.get()));
-  EXPECT_THAT(call.get(), op::CustomCall("foo_target"));
-  EXPECT_THAT(call.get(), op::CustomCall("foo_target", c1.get(), c2.get()));
-  EXPECT_THAT(call.get(), op::CustomCall(::testing::StartsWith("foo")));
-  EXPECT_THAT(call.get(),
-              op::CustomCall(::testing::Not(::testing::StartsWith("bar"))));
-
-  // Wrong number of operands.
-  EXPECT_THAT(call.get(), ::testing::Not(op::CustomCall(c1.get())));
-
-  // Call target does not match.
-  EXPECT_THAT(call.get(),
-              ::testing::Not(op::CustomCall(::testing::StartsWith("bar"))));
-
-  EXPECT_THAT(Explain(call.get(), op::CustomCall("bar")),
-              R"(custom-call with call target that isn't equal to "bar")");
-  EXPECT_THAT(DescribeHloMatcher(op::CustomCall("foo_target")),
-              R"(custom-call with call target that is equal to "foo_target")");
-}
-
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 68f55a1b061b821866ae4d4aa8b43da628d0e18a Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Thu, 25 Jan 2018 16:50:24 -0800
Subject: [PATCH 198/966] Record requested cpu cores in OpPerformance.

PiperOrigin-RevId: 183313321
---
 tensorflow/core/grappler/costs/op_performance_data.proto | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/core/grappler/costs/op_performance_data.proto b/tensorflow/core/grappler/costs/op_performance_data.proto
index 1d623b8db8..37f9ebd6a1 100644
--- a/tensorflow/core/grappler/costs/op_performance_data.proto
+++ b/tensorflow/core/grappler/costs/op_performance_data.proto
@@ -58,11 +58,18 @@ message LogNormalDistribution {
   double sigma = 2;
 }
 
+message SessionInfo {
+  int64 intra_op_parallelism = 1;
+}
+
 // Performance data for tensorflow operations
 message OpPerformance {
   // The op
   OpInfo op = 1;
 
+  // Information about the session configs.
+  SessionInfo session_info = 12;
+
   // The node name (optional). Makes it easier to associate the performance data
   // with a specific graph node.
   string node = 5;
-- 
GitLab


From c89c452cd7a1675a6e2332d09379469320197a8c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 17:08:50 -0800
Subject: [PATCH 199/966] [XLA] Disable half_test_cpu as it is flaky

PiperOrigin-RevId: 183315762
---
 tensorflow/compiler/xla/tests/BUILD | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 85ac28533d..4410647f84 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -849,7 +849,8 @@ xla_test(
     name = "half_test",
     srcs = ["half_test.cc"],
     backends = [
-        "cpu",
+        # TODO(b/72509305): Flaky (fails with SEGV) as of 2018-01-25
+        # "cpu",
         "gpu",
     ],
     deps = [
-- 
GitLab


From fbd3e8a2c01d83a6aa6cca044fe5678d20035451 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Thu, 25 Jan 2018 17:20:07 -0800
Subject: [PATCH 200/966] Make kernel_tests/scalar_test.py work with the C API
 enabled.

This also moves the set_producer_version function from a specific test
file to test_util.py, since it's needed in two test files now.

PiperOrigin-RevId: 183316990
---
 tensorflow/python/framework/test_util.py      | 12 ++++++++++++
 tensorflow/python/kernel_tests/scalar_test.py |  4 +++-
 tensorflow/python/ops/nn_batchnorm_test.py    | 15 ++-------------
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 0133318456..6a7e1d0c89 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -53,6 +53,7 @@ from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import versions
@@ -1460,3 +1461,14 @@ def get_node_def_from_graph(node_name, graph_def):
     if node_def.name == node_name:
       return node_def
   return None
+
+
+def set_producer_version(graph, producer_version):
+  """Sets graph.graph_def_versions.producer to `producer_version`."""
+  # The C API doesn't expose altering GraphDefVersions. We can indirectly set
+  # it via import_graph_def though.
+  graph_def = graph_pb2.GraphDef()
+  graph_def.versions.producer = producer_version
+  with graph.as_default():
+    importer.import_graph_def(graph_def)
+  assert graph.graph_def_versions.producer, producer_version
diff --git a/tensorflow/python/kernel_tests/scalar_test.py b/tensorflow/python/kernel_tests/scalar_test.py
index b34426cc21..e65241981e 100644
--- a/tensorflow/python/kernel_tests/scalar_test.py
+++ b/tensorflow/python/kernel_tests/scalar_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
@@ -30,6 +31,7 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class ScalarTest(test.TestCase):
 
   def check(self, op, args, error, correct=None):
@@ -51,7 +53,7 @@ class ScalarTest(test.TestCase):
     # Test various GraphDef versions
     for version in strict + lenient:
       with ops.Graph().as_default() as g:
-        g.graph_def_versions.producer = version
+        test_util.set_producer_version(g, version)
         with self.test_session(graph=g) as sess:
           feed = {}
           xs = placeholders(args, feed)
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index fc013b565b..eebfb17085 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -21,10 +21,8 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.core.framework import graph_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -40,15 +38,6 @@ from tensorflow.python.platform import test
 @test_util.with_c_api
 class BatchNormalizationTest(test.TestCase):
 
-  def SetProducerVersion(self, graph, producer_version):
-    # The C API doesn't expose altering GraphDefVersions. We can indirectly set
-    # it via import_graph_def though.
-    graph_def = graph_pb2.GraphDef()
-    graph_def.versions.producer = producer_version
-    with graph.as_default():
-      importer.import_graph_def(graph_def)
-    assert graph.graph_def_versions.producer, producer_version
-
   def _npBatchNorm(self, x, m, v, beta, gamma, epsilon,
                    scale_after_normalization, shift_after_normalization):
     y = (x - m) / np.sqrt(v + epsilon)
@@ -65,7 +54,7 @@ class BatchNormalizationTest(test.TestCase):
   def _tfBatchNormV1(self, x, m, v, beta, gamma, epsilon,
                      scale_after_normalization):
     """Original implementation."""
-    self.SetProducerVersion(ops.get_default_graph(), 8)
+    test_util.set_producer_version(ops.get_default_graph(), 8)
     return gen_nn_ops._batch_norm_with_global_normalization(
         x, m, v, beta, gamma, epsilon, scale_after_normalization)
     # pylint: enable=protected-access
@@ -233,7 +222,7 @@ class BatchNormalizationTest(test.TestCase):
         epsilon = 0.001
         for scale_after_normalization in [True, False]:
           # _batch_norm_with_global_normalization_grad is deprecated in v9
-          self.SetProducerVersion(ops.get_default_graph(), 8)
+          test_util.set_producer_version(ops.get_default_graph(), 8)
           grad = gen_nn_ops._batch_norm_with_global_normalization_grad(
               x, m, v, gamma, backprop, epsilon, scale_after_normalization)
           dx, dm, dv, db, dg = grad
-- 
GitLab


From 8220c228ab066d23ddf506a58bb08b1694239a34 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Thu, 25 Jan 2018 17:27:42 -0800
Subject: [PATCH 201/966] Add an option to input a GraphDef.

PiperOrigin-RevId: 183317862
---
 .../python/grappler/cost_analyzer_tool.py     | 41 +++++++++++++++++--
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/grappler/cost_analyzer_tool.py b/tensorflow/python/grappler/cost_analyzer_tool.py
index 146bb4311c..61dc4e2afb 100644
--- a/tensorflow/python/grappler/cost_analyzer_tool.py
+++ b/tensorflow/python/grappler/cost_analyzer_tool.py
@@ -23,18 +23,33 @@ import sys
 
 from google.protobuf import text_format
 
+from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
 from tensorflow.python.grappler import cost_analyzer
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.platform import app
 from tensorflow.python.platform import gfile
+from tensorflow.python.training import saver
 
 
 def main(_):
-  with gfile.GFile(FLAGS.input) as input_file:
-    metagraph = meta_graph_pb2.MetaGraphDef()
-    metagraph.ParseFromString(input_file.read())
+  if FLAGS.metagraphdef:
+    with gfile.GFile(FLAGS.metagraphdef) as meta_file:
+      metagraph = meta_graph_pb2.MetaGraphDef()
+      metagraph.ParseFromString(meta_file.read())
+  else:
+    with gfile.GFile(FLAGS.graphdef) as graph_file:
+      graph_def = graph_pb2.GraphDef()
+      graph_def.ParseFromString(graph_file.read())
+      importer.import_graph_def(graph_def, name="")
+      graph = ops.get_default_graph()
+      fetch = graph.get_operation_by_name(FLAGS.fetch)
+      graph.add_to_collection("train_op", fetch)
+      metagraph = saver.export_meta_graph(
+          graph_def=graph.as_graph_def(), graph=graph)
 
   if FLAGS.rewriter_config is not None:
     rewriter_config = rewriter_config_pb2.RewriterConfig()
@@ -49,7 +64,25 @@ def main(_):
 if __name__ == "__main__":
   parser = argparse.ArgumentParser()
   parser.add_argument(
-      "--input", type=str, default=None, help="Input .meta file path.")
+      "--metagraphdef",
+      type=str,
+      default=None,
+      help="Input .meta MetaGraphDef file path.")
+  parser.add_argument(
+      "--graphdef",
+      type=str,
+      default=None,
+      help="Input .pb GraphDef file path.")
+  # Consider making flag fetch work together with flag metagraphdef. As some
+  # MetaGraphDef files don't have collection train_op.
+  parser.add_argument(
+      "--fetch",
+      type=str,
+      default=None,
+      help=
+      "The name of the fetch node. This flag is ignored if flag "
+      "metagraphdef is used."
+  )
   parser.add_argument(
       "--rewriter_config",
       type=str,
-- 
GitLab


From 32a6eb80dcde4d107f21e41097fddd8341a725b9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 17:32:12 -0800
Subject: [PATCH 202/966] Move flatbuffer verifier to a separate lib

PiperOrigin-RevId: 183318384
---
 tensorflow/contrib/lite/model.cc              |  15 +-
 tensorflow/contrib/lite/model_test.cc         |   9 --
 tensorflow/contrib/lite/tools/BUILD           |  25 ++++
 tensorflow/contrib/lite/tools/verifier.cc     |  43 ++++++
 tensorflow/contrib/lite/tools/verifier.h      |  31 ++++
 .../contrib/lite/tools/verifier_test.cc       | 136 ++++++++++++++++++
 6 files changed, 237 insertions(+), 22 deletions(-)
 create mode 100644 tensorflow/contrib/lite/tools/verifier.cc
 create mode 100644 tensorflow/contrib/lite/tools/verifier.h
 create mode 100644 tensorflow/contrib/lite/tools/verifier_test.cc

diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index ba29a2f4d1..415d984ad8 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -30,17 +30,6 @@ limitations under the License.
 
 namespace tflite {
 
-namespace {
-inline const tflite::Model* VerifyAndGetModel(const void* buf, size_t len) {
-  ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
-  if (VerifyModelBuffer(verifier)) {
-    return ::tflite::GetModel(buf);
-  } else {
-    return nullptr;
-  }
-}
-}  // namespace
-
 const char* kEmptyTensorName = "";
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
@@ -82,7 +71,7 @@ FlatBufferModel::FlatBufferModel(const char* filename, bool mmap_file,
   }
   if (!allocation_->valid() || !CheckModelIdentifier()) return;
 
-  model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes());
+  model_ = ::tflite::GetModel(allocation_->base());
 }
 
 bool FlatBufferModel::CheckModelIdentifier() const {
@@ -103,7 +92,7 @@ FlatBufferModel::FlatBufferModel(const char* ptr, size_t num_bytes,
   allocation_ = new MemoryAllocation(ptr, num_bytes, error_reporter);
   if (!allocation_->valid()) return;
 
-  model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes());
+  model_ = ::tflite::GetModel(allocation_->base());
 }
 
 FlatBufferModel::FlatBufferModel(const Model* model,
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
index 5330c8f594..66f22fd66a 100644
--- a/tensorflow/contrib/lite/model_test.cc
+++ b/tensorflow/contrib/lite/model_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
-#include <string>
 
 #include "tensorflow/contrib/lite/model.h"
 
@@ -247,14 +246,6 @@ TEST(BasicFlatBufferModel, TestNullErrorReporter) {
   ASSERT_NE(interpreter->Invoke(), kTfLiteOk);
 }
 
-// Test what happens if we cannot bind any of the ops.
-TEST(BasicFlatBufferModel, TestBuildModelFromCorruptedData) {
-  std::string corrupted_data = "123";
-  auto model = FlatBufferModel::BuildFromBuffer(corrupted_data.c_str(),
-                                                corrupted_data.length());
-  ASSERT_FALSE(model);
-}
-
 // Test that loading model directly from a Model flatbuffer works.
 TEST(BasicFlatBufferModel, TestBuildFromModel) {
   TestErrorReporter reporter;
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 20df905270..1bffcfb987 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -93,3 +93,28 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+cc_library(
+    name = "verifier",
+    srcs = ["verifier.cc"],
+    hdrs = ["verifier.h"],
+    deps = [
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+    ],
+)
+
+cc_test(
+    name = "verifier_test",
+    size = "small",
+    srcs = ["verifier_test.cc"],
+    deps = [
+        ":verifier",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/contrib/lite/tools/verifier.cc
new file mode 100644
index 0000000000..95a0895379
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/verifier.cc
@@ -0,0 +1,43 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/verifier.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/version.h"
+
+namespace tflite {
+
+namespace {
+
+const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) {
+  ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
+  if (VerifyModelBuffer(verifier)) {
+    return ::tflite::GetModel(buf);
+  } else {
+    return nullptr;
+  }
+}
+
+}  // namespace
+
+bool Verify(const void* buf, size_t len) {
+  const Model* model = VerifyFlatbufferAndGetModel(buf, len);
+  if (model == nullptr) {
+    return false;
+  }
+
+  return model->version() == TFLITE_SCHEMA_VERSION;
+}
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/contrib/lite/tools/verifier.h
new file mode 100644
index 0000000000..03e1f22b7e
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/verifier.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
+
+#include <stdio.h>
+
+namespace tflite {
+
+// Verifies the integrity of a Tensorflow Lite flatbuffer model file.
+// Currently, it verifies:
+// * The file is following a legit flatbuffer schema.
+// * The model is in supported version.
+bool Verify(const void* buf, size_t len);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
new file mode 100644
index 0000000000..0481a55a78
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -0,0 +1,136 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/tools/verifier.h"
+#include "flatbuffers/flatbuffers.h"
+#include "flatbuffers/util.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/contrib/lite/version.h"
+
+namespace tflite {
+
+using flatbuffers::FlatBufferBuilder;
+using flatbuffers::Offset;
+using flatbuffers::Vector;
+
+// Class that abstracts the list of buffers at the end of the TF Lite structure
+class DeferredBufferWriter {
+ public:
+  DeferredBufferWriter() {
+    data_.push_back({});  // sentinel empty buffer.
+  }
+
+  Offset<Vector<Offset<Buffer>>> BuildBuffers(FlatBufferBuilder *builder) {
+    std::vector<Offset<Buffer>> buffer_vector;
+    for (const auto &vec : data_) {
+      auto data_buffer = builder->CreateVector(vec.data(), vec.size());
+      buffer_vector.push_back(tflite::CreateBuffer(*builder, data_buffer));
+    }
+    return builder->CreateVector(buffer_vector);
+  }
+
+  // Registers a buffer index and takes ownership of the data to write to it.
+  int Record(std::vector<uint8_t> data) {
+    int buffer_index = data_.size();
+    data_.emplace_back(std::move(data));
+    return buffer_index;
+  }
+
+ private:
+  std::vector<std::vector<unsigned char>> data_;
+};
+
+TEST(VerifyModel, TestEmptyModel) {
+  FlatBufferBuilder builder;
+  auto model = CreateModel(builder, /*version=*/TFLITE_SCHEMA_VERSION,
+                           /*operator_codes=*/0, /*subgraphs=*/0,
+                           /*description=*/0, /*buffers=*/0);
+  ::tflite::FinishModelBuffer(builder, model);
+
+  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize()));
+}
+
+TEST(VerifyModel, TestSimpleModel) {
+  FlatBufferBuilder builder;
+  auto inputs = builder.CreateVector<int32_t>({0});
+  auto outputs = builder.CreateVector<int32_t>({1});
+  auto operator_codes = builder.CreateVector(std::vector<Offset<OperatorCode>>{
+      CreateOperatorCodeDirect(builder, BuiltinOperator_CUSTOM, "test")});
+  auto operators =
+      builder.CreateVector(std::vector<Offset<Operator>>{CreateOperator(
+          builder, /*opcode_index=*/0,
+          /*inputs=*/builder.CreateVector<int32_t>({0}),
+          /*outputs=*/builder.CreateVector<int32_t>({1}), BuiltinOptions_NONE,
+          /*builtin_options=*/0,
+          /*custom_options=*/0, ::tflite::CustomOptionsFormat_FLEXBUFFERS)});
+  std::vector<int> shape;
+  auto tensors = builder.CreateVector(std::vector<Offset<Tensor>>{
+      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/0,
+                         "input", /*quantization=*/0),
+      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/0,
+                         "output", /*quantization=*/0)});
+  auto subgraph = std::vector<Offset<SubGraph>>(
+      {CreateSubGraph(builder, tensors, inputs, outputs, operators,
+                      builder.CreateString("Main"))});
+
+  auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, operator_codes,
+                           builder.CreateVector(subgraph),
+                           builder.CreateString("SmartReply"), /*buffers=*/0);
+
+  ::tflite::FinishModelBuffer(builder, model);
+  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize()));
+}
+
+TEST(VerifyModel, TestCorruptedData) {
+  string model = "123";
+  ASSERT_FALSE(Verify(model.data(), model.size()));
+}
+
+TEST(VerifyModel, TestUnsupportedVersion) {
+  FlatBufferBuilder builder;
+  auto model = CreateModel(builder, /*version=*/1, /*operator_codes=*/0,
+                           /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
+  ::tflite::FinishModelBuffer(builder, model);
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize()));
+}
+
+TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
+  FlatBufferBuilder builder;
+  auto model = CreateModel(builder, /*version=*/TFLITE_SCHEMA_VERSION,
+                           /*operator_codes=*/0,
+                           /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
+  ::tflite::FinishModelBuffer(builder, model);
+
+  string model_content(reinterpret_cast<char *>(builder.GetBufferPointer()),
+                       builder.GetSize());
+  for (int i = 0; i < model_content.size(); i++) {
+    model_content[i] = (model_content[i] + 137) % 255;
+    EXPECT_FALSE(Verify(model_content.data(), model_content.size()))
+        << "Fail at position: " << i;
+  }
+}
+
+// TODO(yichengfan): make up malicious files to test with.
+
+}  // namespace tflite
+
+int main(int argc, char **argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
-- 
GitLab


From c8f1a34ba6f9bbde7e3fbb106158661ded98f0a0 Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Thu, 25 Jan 2018 17:50:40 -0800
Subject: [PATCH 203/966] Replace instances of `control_dependencies(None)`
 with `init_scope` when initializing variables.

Today, when variables are constructed (Resource and otherwise), we lift certain
operations, including the VarHandleOp and initialization ops, out of all
control flow contexts; the mechanism for doing so is entering the context
manager returned by `control_dependencies(None)`. This change replaces various
instances of this mechanism with `init_scope`, which clears control dependencies, lifts ops out of function-building graphs, and pauses the gradient tape. As a result,
variables that are created inside graph functions will be automatically hoisted
into an outer context.

PiperOrigin-RevId: 183320576
---
 tensorflow/python/layers/base.py              | 118 +++++++++---------
 .../python/ops/resource_variable_ops.py       |   4 +-
 tensorflow/python/ops/variable_scope.py       |   4 +-
 tensorflow/python/ops/variables.py            |   7 +-
 tensorflow/python/training/moving_averages.py |   4 +-
 tensorflow/python/training/optimizer.py       |   2 +-
 6 files changed, 73 insertions(+), 66 deletions(-)

diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 00faf3faa1..d892654ebe 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -500,13 +500,30 @@ class Layer(object):
       instance is returned.
 
     Raises:
-      RuntimeError: If called in Eager mode with partioned variable
-        regularization.
+      RuntimeError: If called with partioned variable regularization and
+        eager execution is enabled.
     """
 
-    in_graph_mode = context.in_graph_mode()
-    if in_graph_mode:
-      existing_variables = set(tf_variables.global_variables())
+    # `init_graph` should point to the graph in which variable initialization
+    # will occur; it should be None if and only if initialization will take
+    # place in the eager context.
+    init_graph = None
+    if context.in_graph_mode():
+      default_graph = ops.get_default_graph()
+      if default_graph.building_function:
+        with ops.init_scope():
+          # Retrieve the variables from the graph into which variables
+          # will be lifted; if initialization ops will be lifted into
+          # the eager context, then there is nothing to retrieve, since variable
+          # collections are not supported when eager execution is enabled.
+          if context.in_graph_mode():
+            init_graph = ops.get_default_graph()
+            existing_variables = set(tf_variables.global_variables())
+      else:
+        # Initialization ops will not be lifted out of the default graph.
+        init_graph = default_graph
+        existing_variables = set(tf_variables.global_variables())
+
     if dtype is None:
       dtype = self.dtype or dtypes.float32
 
@@ -523,54 +540,51 @@ class Layer(object):
                                    trainable=trainable and self.trainable,
                                    partitioner=partitioner)
 
-        if in_graph_mode:
-          if (trainable and self.trainable
-              and variable not in tf_variables.trainable_variables()):
-            # A custom getter / variable scope overrode the trainable flag.
-            trainable = False
+        if init_graph is not None:  # pylint: disable=protected-access
+          # The variable was created and initialized in a graph.
+
           if variable in existing_variables:
             # To match the behavior of tf.get_variable(), we only apply
             # regularization if the variable is newly created.
             return variable
 
-        if regularizer:
-          def regularizer_factory():
-            if context.in_graph_mode():
-              with vs.variable_scope(scope, reuse=reuse,
-                                     auxiliary_name_scope=False):
-                with ops.name_scope(self._name_scope_name(scope)):
-                  if isinstance(variable, tf_variables.PartitionedVariable):
-                    for v in variable:
-                      with ops.colocate_with(v.op):
-                        with ops.name_scope(name + '/Regularizer'):
-                          regularization = regularizer(v)
-                      if regularization is not None:
-                        self.add_loss(regularization)
-                  else:
-                    with ops.colocate_with(variable.op):
-                      with ops.name_scope(name + '/Regularizer'):
-                        regularization = regularizer(variable)
-                    if regularization is not None:
-                      self.add_loss(regularization)
+          with init_graph.as_default():
+            trainable_variables = tf_variables.trainable_variables()
+          if (trainable and self.trainable and
+              variable not in trainable_variables):
+            # A custom getter / variable scope overrode the trainable flag.
+            trainable = False
+
+          if regularizer:
+            if isinstance(variable, tf_variables.PartitionedVariable):
+              for v in variable:
+                with ops.colocate_with(v.op):
+                  with ops.name_scope(name + '/Regularizer'):
+                    regularization = regularizer(v)
+                if regularization is not None:
+                  self.add_loss(regularization)
             else:
-              if isinstance(variable, tf_variables.PartitionedVariable):
-                raise RuntimeError(
-                    'Partitioned variable regularization is not yet '
-                    'supported when executing eagerly. File a feature request'
-                    'if this is important to you.')
-              # Save a zero-argument lambda which runs the regularizer on the
-              # variable, to be executed when `Layer.losses` is requested.
-              # This makes losses responsive to variable updates when
-              # executing eagerly.
-              self._losses.append(lambda: regularizer(variable))
-
-          if hasattr(self, '_defer_regularizers') and self._defer_regularizers:
-            # _defer_regularizers exists and is set to True if `build` was
-            # invoked in `__call__`: deferring regularizer construction
-            # prevents the regularizer from being created in an `init_scope`.
-            self._get_regularizer_factories().append(regularizer_factory)
-          else:
-            regularizer_factory()
+              with ops.colocate_with(variable.op):
+                with ops.name_scope(name + '/Regularizer'):
+                  regularization = regularizer(variable)
+              if regularization is not None:
+                self.add_loss(regularization)
+        elif regularizer:  # and initialization took place in an eager context
+          if isinstance(variable, tf_variables.PartitionedVariable):
+            raise RuntimeError(
+                'Partitioned variable regularization is not yet '
+                'supported when executing eagerly. File a feature request'
+                'if this is important to you.')
+          # Save a zero-argument lambda which runs the regularizer on the
+          # variable, to be executed when `Layer.losses` is requested.
+          # This makes losses responsive to variable updates when executing
+          # eagerly.
+          #
+          # TODO(akshayka): Do the same for graphs as well, so that losses
+          # collected in a while_loop can be run outside its control flow
+          # context and so that losses won't be swallowed up by graph functions
+          # (i.e., `.losses()` should always create regularizers).
+          self._losses.append(lambda: regularizer(variable))
 
     if trainable:
       self._trainable_weights.append(variable)
@@ -670,15 +684,7 @@ class Layer(object):
             except AttributeError:
               pass
           input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
-
-          # Signal to `add_variable` that regularizer construction should be
-          # deferred.
-          self._defer_regularizers = True
-          with ops.init_scope():
-            self.build(input_shapes)
-          # Create any regularizers added by `build`.
-          self._maybe_create_variable_regularizers()
-          self._defer_regularizers = False
+          self.build(input_shapes)
         try:
           # Note: not all sub-classes of Layer call Layer.__init__ (especially
           # the ones under tensorflow/python/keras). Hence we recompute this
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index f727a29233..bdf41cd75d 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -348,11 +348,11 @@ class ResourceVariable(variables.Variable):
     if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
       collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
     self._save_slice_info = None
-    self._in_graph_mode = context.in_graph_mode()
     # Save the graph's container prefix for error checking. Reading the value of
     # the ResourceVariable from another Graph in Eager mode is an error.
     self._container_prefix = ops.get_default_graph()._container_prefix  # pylint: disable=protected-access
-    with ops.control_dependencies(None):
+    with ops.init_scope():
+      self._in_graph_mode = context.in_graph_mode()
       with ops.name_scope(name, "Variable", []
                           if init_from_fn else [initial_value]) as name:
         # pylint: disable=protected-access
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index db594ac6a0..81565a6377 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -771,8 +771,8 @@ class _VariableStore(object):
     if initializer is None:
       initializer, initializing_from_value = self._get_default_initializer(
           name=name, shape=shape, dtype=dtype)
-    # Clear control dependencies while creating the initializer.
-    with ops.control_dependencies(None):
+    # Enter an init scope when creating the initializer.
+    with ops.init_scope():
       if initializing_from_value:
         init_val = initializer
         variable_dtype = None
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index ff19612383..19e3298e40 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -212,6 +212,7 @@ class Variable(object):
     if not context.in_graph_mode():
       raise RuntimeError("tf.Variable not supported in Eager mode. "
                          "Please use tfe.Variable instead")
+    self._in_graph_mode = context.in_graph_mode()
     if variable_def:
       # If variable_def is provided, recreates the variable from its fields.
       if initial_value:
@@ -307,7 +308,7 @@ class Variable(object):
 
     if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
       collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
-    with ops.control_dependencies(None):
+    with ops.init_scope():
       with ops.name_scope(name, "Variable", [] if init_from_fn else
                           [initial_value]) as name:
 
@@ -378,8 +379,8 @@ class Variable(object):
         else:
           with ops.colocate_with(self._variable.op):
             self._snapshot = array_ops.identity(self._variable, name="read")
+      ops.add_to_collections(collections, self)
 
-    ops.add_to_collections(collections, self)
     self._caching_device = caching_device
     self._save_slice_info = None
     self._constraint = constraint
@@ -553,7 +554,7 @@ class Variable(object):
       A `Tensor` holding the value of this variable after its initializer
       has run.
     """
-    with ops.control_dependencies(None):
+    with ops.init_scope():
       return control_flow_ops.cond(is_variable_initialized(self),
                                    self.read_value,
                                    lambda: self.initial_value)
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index e34c759e89..43ed1ac170 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -187,7 +187,7 @@ def _zero_debias(unbiased_var, value, decay):
   with variable_scope.variable_scope(
       unbiased_var.op.name, values=[unbiased_var, value, decay]) as scope:
     with ops.colocate_with(unbiased_var):
-      with ops.control_dependencies(None):
+      with ops.init_scope():
         biased_initializer = init_ops.zeros_initializer(
             dtype=unbiased_var.dtype)(unbiased_var.get_shape())
         local_step_initializer = init_ops.zeros_initializer()
@@ -385,7 +385,7 @@ class ExponentialMovingAverage(object):
       # For variables: to lower communication bandwidth across devices we keep
       # the moving averages on the same device as the variables. For other
       # tensors, we rely on the existing device allocation mechanism.
-      with ops.control_dependencies(None):
+      with ops.init_scope():
         if isinstance(var, variables.Variable):
           avg = slot_creator.create_slot(var,
                                          var.initialized_value(),
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 038469b1ba..719b83e5ca 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -514,7 +514,7 @@ class Optimizer(object):
     if not var_list:
       raise ValueError("No gradients provided for any variable: %s." %
                        ([str(v) for _, _, v in converted_grads_and_vars],))
-    with ops.control_dependencies(None):
+    with ops.init_scope():
       self._create_slots([_get_variable_for(v) for v in var_list])
     update_ops = []
     with ops.name_scope(name, self._name) as name:
-- 
GitLab


From 642454bd3296959f0025e1fb1730cdd95c36713f Mon Sep 17 00:00:00 2001
From: Adam Roberts <adarob@google.com>
Date: Thu, 25 Jan 2018 17:58:48 -0800
Subject: [PATCH 204/966] Add input_shape to seq2seq helpers.

PiperOrigin-RevId: 183321394
---
 .../contrib/seq2seq/python/ops/helper.py      | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index ef3722ee41..6d8f786223 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -72,6 +72,14 @@ class Helper(object):
     """
     raise NotImplementedError("batch_size has not been implemented")
 
+  @abc.abstractproperty
+  def input_shape(self):
+    """Shape of each input element in batch.
+
+    Returns a `TensorShape`.
+    """
+    raise NotImplementedError("input_shape has not been implemented")
+
   @abc.abstractproperty
   def sample_ids_shape(self):
     """Shape of tensor returned by `sample`, excluding the batch dimension.
@@ -127,6 +135,7 @@ class CustomHelper(Helper):
     self._sample_fn = sample_fn
     self._next_inputs_fn = next_inputs_fn
     self._batch_size = None
+    self._input_shape = None
     self._sample_ids_shape = tensor_shape.TensorShape(sample_ids_shape or [])
     self._sample_ids_dtype = sample_ids_dtype or dtypes.int32
 
@@ -149,6 +158,8 @@ class CustomHelper(Helper):
       (finished, next_inputs) = self._initialize_fn()
       if self._batch_size is None:
         self._batch_size = array_ops.size(finished)
+      if self._input_shape is None:
+        self._input_shape = next_inputs.shape[1:]
     return (finished, next_inputs)
 
   def sample(self, time, outputs, state, name=None):
@@ -184,6 +195,7 @@ class TrainingHelper(Helper):
     """
     with ops.name_scope(name, "TrainingHelper", [inputs, sequence_length]):
       inputs = ops.convert_to_tensor(inputs, name="inputs")
+      self._inputs = inputs
       if not time_major:
         inputs = nest.map_structure(_transpose_batch_time, inputs)
 
@@ -199,11 +211,16 @@ class TrainingHelper(Helper):
           lambda inp: array_ops.zeros_like(inp[0, :]), inputs)
 
       self._batch_size = array_ops.size(sequence_length)
+      self._input_shape = inputs.shape[2:]
 
   @property
   def batch_size(self):
     return self._batch_size
 
+  @property
+  def input_shape(self):
+    return self._input_shape
+
   @property
   def sample_ids_shape(self):
     return tensor_shape.TensorShape([])
@@ -212,6 +229,14 @@ class TrainingHelper(Helper):
   def sample_ids_dtype(self):
     return dtypes.int32
 
+  @property
+  def inputs(self):
+    return self._inputs
+
+  @property
+  def sequence_length(self):
+    return self._sequence_length
+
   def initialize(self, name=None):
     with ops.name_scope(name, "TrainingHelperInitialize"):
       finished = math_ops.equal(0, self._sequence_length)
@@ -516,11 +541,16 @@ class GreedyEmbeddingHelper(Helper):
     if self._end_token.get_shape().ndims != 0:
       raise ValueError("end_token must be a scalar")
     self._start_inputs = self._embedding_fn(self._start_tokens)
+    self._input_shape = self._start_inputs.shape[1:]
 
   @property
   def batch_size(self):
     return self._batch_size
 
+  @property
+  def input_shape(self):
+    return self._input_shape
+
   @property
   def sample_ids_shape(self):
     return tensor_shape.TensorShape([])
@@ -632,6 +662,8 @@ class InferenceHelper(Helper):
     self._sample_dtype = sample_dtype
     self._next_inputs_fn = next_inputs_fn
     self._batch_size = array_ops.shape(start_inputs)[0]
+    self._input_shape = start_inputs.shape[1:]
+
     self._start_inputs = ops.convert_to_tensor(
         start_inputs, name="start_inputs")
 
@@ -639,6 +671,10 @@ class InferenceHelper(Helper):
   def batch_size(self):
     return self._batch_size
 
+  @property
+  def input_shape(self):
+    return self._input_shape
+
   @property
   def sample_ids_shape(self):
     return self._sample_shape
-- 
GitLab


From 2b7d03c91d092cda88e6db345705fff3cd5b7b77 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 18:42:44 -0800
Subject: [PATCH 205/966] Allow passing dummy/custom minmax information on a
 per-array basis, unlike the existing --default_ranges_{min,max} flags which
 only allowed to set a single global value for all arrays.

This takes the form of a new embedded message in ModelFlags, which is
its own message so that it can be serialized separately. The command-line
interface is --arrays_extra_info_file=some_proto.pbtxt, i.e. we don't
try to make a command-line-flags-only interface, we mandate putting the info
in a file. The rationale is that users may want to specify custom minmax
for hundreds of arrays, so it would be cumbersome to have that all in a
command line.

This should be considered an experimental feature, in the sense that
in properly quantized models, minmax information is already embedded
in the graph (e.g. in FakeQuant nodes). This is an extension of the
existing --default_ranges_{min,max} feature which had turned out to be
too restrictive for many users.

PiperOrigin-RevId: 183326000
---
 .../internal/optimized/depthwiseconv_uint8.h  |  2 +-
 tensorflow/contrib/lite/toco/BUILD            |  1 +
 tensorflow/contrib/lite/toco/args.h           |  1 +
 .../contrib/lite/toco/model_cmdline_flags.cc  | 15 +++++++++++++
 .../contrib/lite/toco/model_flags.proto       | 22 ++++++++++++++++++-
 tensorflow/contrib/lite/toco/toco_port.h      | 21 ++++++++++++++++++
 tensorflow/contrib/lite/toco/toco_tooling.cc  |  2 ++
 tensorflow/contrib/lite/toco/tooling_util.cc  | 14 ++++++++++++
 tensorflow/contrib/lite/toco/tooling_util.h   | 22 ++-----------------
 9 files changed, 78 insertions(+), 22 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index f993fd6a00..fc58978964 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -1504,7 +1504,7 @@ inline void QuantizedDepthwiseConvAccumRowGeneric(
       << "*\n"
       << "* If you would like to carry on with the slow code, compile\n"
       << "* with this preprocessor token defined:\n"
-      << "* TFLITE_ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
+      << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
       << "*\n"
       << "* The right thing to do, if you care about performance, is to add\n"
       << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 041e248790..6fc7e5e3fd 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -160,6 +160,7 @@ cc_library(
     ],
     deps = [
         # Placeholder for internal file dependency.
+        "@protobuf_archive//:protobuf_headers",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 8004a1a37a..b97a4720a7 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -208,6 +208,7 @@ struct ParsedModelFlags {
   Arg<bool> dump_graphviz_video = Arg<bool>(false);
   Arg<bool> allow_nonexistent_arrays = Arg<bool>(false);
   Arg<bool> allow_nonascii_arrays = Arg<bool>(false);
+  Arg<string> arrays_extra_info_file;
 };
 
 // Flags that describe the operation you would like to do (what conversion
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 36520d9c55..4e2dec15a5 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -148,6 +148,12 @@ bool ParseModelFlagsFromCommandLineFlags(
            "ranging from 32 to 127. This is disallowed by default so as to "
            "catch common copy-and-paste issues where invisible unicode "
            "characters are unwittingly added to these strings."),
+      Flag(
+          "arrays_extra_info_file", parsed_flags.arrays_extra_info_file.bind(),
+          parsed_flags.arrays_extra_info_file.default_value(),
+          "Path to an optional file containing a serialized ArraysExtraInfo "
+          "proto allowing to pass extra information about arrays not specified "
+          "in the input model file, such as extra MinMax information."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -365,6 +371,15 @@ void ReadModelFlagsFromCommandLineFlags(
       parsed_model_flags.allow_nonascii_arrays.value());
   model_flags->set_allow_nonexistent_arrays(
       parsed_model_flags.allow_nonexistent_arrays.value());
+
+  if (parsed_model_flags.arrays_extra_info_file.specified()) {
+    string arrays_extra_info_file_contents;
+    port::file::GetContents(parsed_model_flags.arrays_extra_info_file.value(),
+                            &arrays_extra_info_file_contents,
+                            port::file::Defaults());
+    ParseFromStringEitherTextOrBinary(arrays_extra_info_file_contents,
+                                      model_flags->mutable_arrays_extra_info());
+  }
 }
 
 ParsedModelFlags* UncheckedGlobalParsedModelFlags(bool must_already_exist) {
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index 9070ddc883..e4b39b34e8 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -87,6 +87,22 @@ message RnnState {
   optional int32 size = 3;
 }
 
+// An ArraysExtraInfo message stores a collection of additional Information
+// about arrays in a model, complementing the information in the model itself.
+// It is intentionally a separate message so that it may be serialized and
+// passed separately from the model. See --arrays_extra_info_file.
+//
+// A typical use case is to manually specify MinMax for specific arrays in a
+// model that does not already contain such MinMax information.
+message ArraysExtraInfo {
+  message Entry {
+    optional string name = 1;
+    optional float min = 2;
+    optional float max = 3;
+  }
+  repeated Entry entries = 1;
+}
+
 // ModelFlags encodes properties of a model that, depending on the file
 // format, may or may not be recorded in the model file. The purpose of
 // representing these properties in ModelFlags is to allow passing them
@@ -108,7 +124,7 @@ message RnnState {
 //   optional int32 input_dims = 11 [ default = 4];
 //   repeated int32 input_shape = 13;
 //
-// Next ID to USE: 18.
+// Next ID to USE: 19.
 message ModelFlags {
   // Information about the input arrays, i.e. the arrays from which input
   // activations will be read.
@@ -151,4 +167,8 @@ message ModelFlags {
   // catch common copy-and-paste issues where invisible unicode
   // characters are unwittingly added to these strings.
   optional bool allow_nonascii_arrays = 17;
+
+  // If set, this ArraysExtraInfo allows to pass extra information about arrays
+  // not specified in the input model file, such as extra MinMax information.
+  optional ArraysExtraInfo arrays_extra_info = 18;
 }
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h
index 0572848cb5..4be3b5a0bf 100644
--- a/tensorflow/contrib/lite/toco/toco_port.h
+++ b/tensorflow/contrib/lite/toco/toco_port.h
@@ -19,6 +19,7 @@ limitations under the License.
 // can build and use on google internal environments and on OSX.
 
 #include <string>
+#include "google/protobuf/text_format.h"
 #include "tensorflow/contrib/lite/toco/format_port.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/platform.h"
@@ -75,6 +76,26 @@ void CopyToBuffer(const ::Cord& src, char* dest);
 #endif  // PLATFORM_GOOGLE
 void CopyToBuffer(const string& src, char* dest);
 }  // namespace port
+
+inline bool ParseFromStringOverload(const std::string& in,
+                                    TFLITE_PROTO_NS::Message* proto) {
+  return TFLITE_PROTO_NS::TextFormat::ParseFromString(in, proto);
+}
+
+template <typename Proto>
+bool ParseFromStringEitherTextOrBinary(const std::string& input_file_contents,
+                                       Proto* proto) {
+  if (proto->ParseFromString(input_file_contents)) {
+    return true;
+  }
+
+  if (ParseFromStringOverload(input_file_contents, proto)) {
+    return true;
+  }
+
+  return false;
+}
+
 }  // namespace toco
 
 #endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 720c33777d..727df1cc76 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -193,6 +193,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   }
 
   SetFinalDataTypeOnInputs(toco_flags, model);
+  UseArraysExtraInfo(model);
 
   // Remove unused ops before performing any other optimizations. This is to
   // stop optimizations from crossing the input/output boundaries. For example
@@ -232,6 +233,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   transformations.Add(new ResolveConstantConcatenation);
   RunGraphTransformations(model, "general graph transformations",
                           transformations);
+
   if (quantize_output) {
     RunGraphTransformations(model, "pre-quantization graph transformations",
                             {new HardcodeMinMax, new DropFakeQuant});
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index df785a5102..3728d48659 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1200,6 +1200,9 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
   model->flags.set_allow_nonascii_arrays(model_flags.allow_nonascii_arrays());
   model->flags.set_allow_nonexistent_arrays(
       model_flags.allow_nonexistent_arrays());
+
+  CHECK(!model->flags.has_arrays_extra_info());
+  *model->flags.mutable_arrays_extra_info() = model_flags.arrays_extra_info();
 }
 
 void CheckIsReadyForQuantization(const Model& model) {
@@ -1711,4 +1714,15 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type) {
   }
 }
 
+void UseArraysExtraInfo(Model* model) {
+  for (const auto& entry : model->flags.arrays_extra_info().entries()) {
+    QCHECK(model->HasArray(entry.name()))
+        << "ArraysExtraInfo refers to non-existent array name: "
+        << entry.name();
+    auto& minmax = model->GetArray(entry.name()).GetOrCreateMinMax();
+    minmax.min = entry.min();
+    minmax.max = entry.max();
+  }
+}
+
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 5986d63649..2ac51c7e5b 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -23,7 +23,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "google/protobuf/text_format.h"
 #include "tensorflow/core/platform/logging.h"
 #if TOCO_SUPPORT_PORTABLE_PROTOS
 #include "third_party/protobuf/src/google/protobuf/text_format.h"
@@ -84,25 +83,6 @@ void DumpGraphvizVideoFrame(const Model& model);
 void LogDump(int log_level, const string& message, const Model& model);
 void LogSummary(int log_level, const string& message, const Model& model);
 
-inline bool ParseFromStringOverload(const std::string& in,
-                                    TFLITE_PROTO_NS::Message* proto) {
-  return TFLITE_PROTO_NS::TextFormat::ParseFromString(in, proto);
-}
-
-template <typename Proto>
-bool ParseFromStringEitherTextOrBinary(const std::string& input_file_contents,
-                                       Proto* proto) {
-  if (proto->ParseFromString(input_file_contents)) {
-    return true;
-  }
-
-  if (ParseFromStringOverload(input_file_contents, proto)) {
-    return true;
-  }
-
-  return false;
-}
-
 // TODO(b/36075966): Clean up when dims superseded by array shape.
 void ExtendShape(Shape* shape, int new_shape_size);
 
@@ -298,6 +278,8 @@ void CheckFinalDataTypesSatisfied(const Model& model);
 
 ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
 
+void UseArraysExtraInfo(Model* model);
+
 }  // namespace toco
 
 #endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
-- 
GitLab


From c2ea7a710c5ec478fcc0150ec0117250c54e601e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 18:46:51 -0800
Subject: [PATCH 206/966] Set size of test
 //third_party/tensorflow/python/data/kernel_tests:dataset_from_generator_op_test
 to medium

It sometimes takes longer than a minute, and thus gets flaky timeouts.

PiperOrigin-RevId: 183326334
---
 tensorflow/python/data/kernel_tests/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 5fb389cf92..43cbde69d9 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -59,7 +59,7 @@ tf_py_test(
 
 tf_py_test(
     name = "dataset_from_generator_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["dataset_from_generator_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
-- 
GitLab


From 8dba939e34416b15f21dfcb41e9db30b6d46bcb2 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Fri, 26 Jan 2018 03:51:55 +0100
Subject: [PATCH 207/966] import contextmanager in side_effect_guards.py
 (#16426)

---
 tensorflow/contrib/py2tf/converters/side_effect_guards.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
index 1f25303fba..83d0720b6b 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
@@ -34,6 +34,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from contextlib import contextmanager
+
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
@@ -123,7 +125,7 @@ class SideEffectGuardTransformer(gast.NodeTransformer):
             temp_result = (temp_result,)
           ctx = tf.control_dependencies(temp_result)  # pylint:disable=undefined-variable
         else:
-          ctx = contextmanager(lambda: (yield))()  # pylint:disable=undefined-variable
+          ctx = contextmanager(lambda: (yield))()
         with ctx:
           # TODO(mdan): Also insert ops to re-fetch if variables are involved.
           pass  # Will be removed below.
-- 
GitLab


From b0048a926e35311a107d93dd4df3132d16271d66 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Fri, 26 Jan 2018 11:52:18 +0900
Subject: [PATCH 208/966] Fix typo (#16425)

* fix typos

* fix typos
---
 tensorflow/core/platform/profile_utils/cpu_utils.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h
index 5d215b4804..e95843b80a 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.h
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.h
@@ -42,7 +42,7 @@ namespace profile_utils {
 class CpuUtils {
  public:
   // Constant for invalid frequency.
-  // This value is returned when the furequency is not obtained somehow.
+  // This value is returned when the frequency is not obtained somehow.
   static constexpr int64 INVALID_FREQUENCY = -1;
   static constexpr uint64 DUMMY_CYCLE_CLOCK = 1;
 
@@ -103,7 +103,7 @@ class CpuUtils {
      static int64 GetCycleCounterFrequency();
   #endif
 
-  // Return micro secound per each clock
+  // Return micro second per each clock
   // As this method caches the cpu frequency internally,
   // the first call will incur overhead, but not subsequent calls.
   static double GetMicroSecPerClock();
-- 
GitLab


From 81c36a6987bc074a4cbcbeeb7baa93b31c960b9e Mon Sep 17 00:00:00 2001
From: Stanislav Levental <stanislav.levental@gmail.com>
Date: Thu, 25 Jan 2018 18:52:59 -0800
Subject: [PATCH 209/966] Including common.h with NEON_2_SSE.h (#15743)

* Including common.h with NEON_2_SSE.h

Including common.h to make sure that USE_NEON is defined in case of NEON_2_SSE.h is used; otherwise USE_NEON will not be propagated to this file and `portable_tensor_utils.h` will be used

* Removing spaces

* Using neon is USE_NEON is defined

* Using common.h to set USE_NEON if applicable

* Adding required dependencies

Since tests are more likely compiled on sse4-enabled machine - if they are run with common.h - it will enable sse4, there are two ways - just enable it and let tests using sse4, or add extra flag check which would require USE_SSE4 flag to be defined. I've added all required dependencies for tests to pass, so it could work.

* Rearranging dependencies according to linter

* Fixing sanity check

* trying to rerun, could be cached

* Triggering build

* triggering build - looks like it stuck

no status for more than 24h

* including x86_64 since NEON_2_SSE4 convertion will allow neon to run on x86 with SSE3,4

* Fixing name

* using darwing for MacOS

* Fixing header inclusion rules

* Fix syntax

* Depend on includes

We depend on both types and compatibility. That's in the "types" rule.
---
 .../contrib/lite/kernels/internal/BUILD       | 21 +++++++++++++++++++
 .../kernels/internal/optimized/cpu_check.h    |  2 +-
 .../internal/optimized/neon_tensor_utils.cc   |  2 +-
 .../lite/kernels/internal/tensor_utils.cc     |  1 +
 4 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 21118fc96d..38b032c6de 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -267,6 +267,8 @@ cc_library(
         "optimized/neon_tensor_utils.cc",
     ],
     hdrs = [
+        "common.h",
+        "optimized/cpu_check.h",
         "optimized/neon_tensor_utils.h",
         "optimized/tensor_utils_impl.h",
     ],
@@ -274,8 +276,11 @@ cc_library(
     deps = [
         ":cpu_check",
         ":portable_tensor_utils",
+        ":types",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite/kernels:activation_functor",
+        "@arm_neon_2_x86_sse",
+        "@gemmlowp//:gemmlowp",
     ],
 )
 
@@ -285,14 +290,21 @@ cc_library(
         "tensor_utils.cc",
     ],
     hdrs = [
+        "common.h",
+        "compatibility.h",
+        "optimized/cpu_check.h",
+        "optimized/neon_tensor_utils.h",
         "optimized/tensor_utils_impl.h",
         "reference/portable_tensor_utils.h",
         "tensor_utils.h",
+        "types.h",
     ],
     copts = NEON_FLAGS_IF_APPLICABLE,
     deps = [
         "//tensorflow/contrib/lite/kernels:activation_functor",
         "//tensorflow/contrib/lite:builtin_op_data",
+        "@arm_neon_2_x86_sse",
+        "@gemmlowp//:gemmlowp",
     ] + select({
         ":arm": [
             ":neon_tensor_utils",
@@ -312,6 +324,15 @@ cc_library(
         ":ios_arm64": [
             ":neon_tensor_utils",
         ],
+        ":x86_64": [
+            ":neon_tensor_utils",
+        ],
+        ":x86": [
+            ":neon_tensor_utils",
+        ],
+        ":darwin": [
+            ":neon_tensor_utils",
+        ],
         "//conditions:default": [
             ":portable_tensor_utils",
         ],
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
index dea46cc120..629783d7e5 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
@@ -34,7 +34,7 @@ inline bool TestCPUFeatureNeon() {
 #endif  // __aarch64__
 }
 
-#elif __ARM_NEON
+#elif defined USE_NEON || defined __ARM_NEON 
 
 inline bool TestCPUFeatureNeon() {
   return true;
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
index bf0bdfb1fb..ea8502ae33 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -15,12 +15,12 @@ limitations under the License.
 #include <string.h>
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
 
 #ifdef USE_NEON
 
-#include <arm_neon.h>
 #define kFloatWeightsPerNeonLane 4
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
index 904a97803a..f4181b18a8 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
 
 #ifndef USE_NEON
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
-- 
GitLab


From 0bd0bf02aa15a3238b77053a2f0ad6fe373c7d1c Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Thu, 25 Jan 2018 19:05:28 -0800
Subject: [PATCH 210/966] Update tf.keras to the Keras 2.1.3 API.

PiperOrigin-RevId: 183328052
---
 tensorflow/contrib/cmake/python_modules.txt   |   2 +
 tensorflow/python/keras/BUILD                 |  32 +-
 .../python/keras/_impl/keras/__init__.py      |   2 +-
 .../python/keras/_impl/keras/activations.py   |  10 +-
 .../_impl/keras/applications/__init__.py      |   5 +
 .../_impl/keras/applications/densenet.py      | 346 ++++++++
 .../_impl/keras/applications/densenet_test.py | 101 +++
 .../keras/applications/imagenet_utils.py      | 156 ++--
 .../keras/applications/inception_resnet_v2.py |  19 +-
 .../_impl/keras/applications/inception_v3.py  |  21 +-
 .../_impl/keras/applications/mobilenet.py     |  70 +-
 .../keras/_impl/keras/applications/nasnet.py  | 783 ++++++++++++++++++
 .../_impl/keras/applications/nasnet_test.py   |  76 ++
 .../_impl/keras/applications/resnet50.py      |  51 +-
 .../keras/_impl/keras/applications/vgg16.py   |  60 +-
 .../keras/_impl/keras/applications/vgg19.py   |  73 +-
 .../_impl/keras/applications/xception.py      |  90 +-
 .../python/keras/_impl/keras/backend.py       | 289 +++++--
 .../python/keras/_impl/keras/backend_test.py  |  12 -
 .../python/keras/_impl/keras/callbacks.py     | 132 +--
 .../python/keras/_impl/keras/constraints.py   |  22 +-
 .../_impl/keras/datasets/boston_housing.py    |  10 +-
 .../keras/_impl/keras/datasets/cifar.py       |   3 +-
 .../keras/_impl/keras/datasets/cifar10.py     |   4 +-
 .../keras/_impl/keras/datasets/cifar100.py    |   6 +-
 .../_impl/keras/datasets/fashion_mnist.py     |   7 +-
 .../python/keras/_impl/keras/datasets/imdb.py |  63 +-
 .../keras/_impl/keras/datasets/mnist.py       |  10 +-
 .../keras/_impl/keras/datasets/reuters.py     |  58 +-
 .../keras/_impl/keras/engine/topology.py      |  61 +-
 .../keras/_impl/keras/engine/training.py      | 588 +++++++------
 .../keras/_impl/keras/engine/training_test.py |  87 +-
 .../keras/layers/advanced_activations.py      |  68 +-
 .../keras/layers/advanced_activations_test.py |   6 +
 .../keras/_impl/keras/layers/convolutional.py | 139 ++++
 .../keras/layers/convolutional_recurrent.py   |  44 +-
 .../_impl/keras/layers/convolutional_test.py  |  66 ++
 .../keras/_impl/keras/layers/embeddings.py    |  26 +-
 .../python/keras/_impl/keras/layers/local.py  | 110 ++-
 .../python/keras/_impl/keras/layers/merge.py  | 110 ++-
 .../python/keras/_impl/keras/layers/noise.py  |  49 +-
 .../keras/_impl/keras/layers/recurrent.py     | 650 ++++++++-------
 .../_impl/keras/layers/recurrent_test.py      |  99 +++
 .../keras/_impl/keras/layers/wrappers.py      |  58 +-
 .../keras/_impl/keras/layers/wrappers_test.py | 125 +++
 tensorflow/python/keras/_impl/keras/losses.py |  20 +-
 .../python/keras/_impl/keras/metrics.py       |   9 +-
 tensorflow/python/keras/_impl/keras/models.py |   4 +-
 .../python/keras/_impl/keras/models_test.py   |  29 +
 .../python/keras/_impl/keras/optimizers.py    | 125 +--
 .../keras/_impl/keras/optimizers_test.py      |   1 +
 .../keras/_impl/keras/preprocessing/image.py  | 164 ++--
 .../_impl/keras/preprocessing/sequence.py     |  25 +-
 .../keras/_impl/keras/preprocessing/text.py   |  51 +-
 .../_impl/keras/preprocessing/text_test.py    |  16 +
 .../python/keras/_impl/keras/regularizers.py  |   2 +-
 .../keras/_impl/keras/utils/data_utils.py     | 213 +++--
 .../keras/_impl/keras/utils/generic_utils.py  |   6 +-
 .../keras/_impl/keras/utils/io_utils.py       |  18 +-
 .../keras/_impl/keras/utils/layer_utils.py    |  35 +-
 .../keras/_impl/keras/utils/np_utils.py       |   5 +-
 .../keras/_impl/keras/utils/vis_utils.py      |  35 +-
 .../_impl/keras/wrappers/scikit_learn.py      |  71 +-
 .../python/keras/applications/__init__.py     |   7 +
 .../keras/applications/densenet/__init__.py   |  29 +
 .../keras/applications/nasnet/__init__.py     |  28 +
 tensorflow/python/keras/layers/__init__.py    |   3 +
 tensorflow/python/layers/base.py              |  21 +
 tensorflow/python/layers/network.py           |   7 +
 .../api/golden/tensorflow.keras.-model.pbtxt  |   8 +-
 ...nsorflow.keras.applications.densenet.pbtxt |  23 +
 ...tensorflow.keras.applications.nasnet.pbtxt |  19 +
 .../tensorflow.keras.applications.pbtxt       |  28 +
 .../api/golden/tensorflow.keras.backend.pbtxt |   2 +-
 ...s.callbacks.-learning-rate-scheduler.pbtxt |   2 +-
 ...orflow.keras.datasets.boston_housing.pbtxt |   2 +-
 .../tensorflow.keras.datasets.imdb.pbtxt      |   2 +-
 .../tensorflow.keras.datasets.reuters.pbtxt   |   2 +-
 .../golden/tensorflow.keras.layers.-add.pbtxt |   4 +-
 ...nsorflow.keras.layers.-alpha-dropout.pbtxt |   2 +-
 .../tensorflow.keras.layers.-average.pbtxt    |   4 +-
 ...nsorflow.keras.layers.-bidirectional.pbtxt |   4 +-
 ...tensorflow.keras.layers.-concatenate.pbtxt |   4 +-
 ...orflow.keras.layers.-conv-l-s-t-m2-d.pbtxt |   4 +-
 .../golden/tensorflow.keras.layers.-dot.pbtxt |   4 +-
 .../tensorflow.keras.layers.-e-l-u.pbtxt      |   2 +-
 .../tensorflow.keras.layers.-embedding.pbtxt  |   4 +-
 .../tensorflow.keras.layers.-g-r-u-cell.pbtxt |   2 +-
 .../tensorflow.keras.layers.-g-r-u.pbtxt      |   4 +-
 ...rflow.keras.layers.-gaussian-dropout.pbtxt |   2 +-
 ...sorflow.keras.layers.-gaussian-noise.pbtxt |   2 +-
 ...ensorflow.keras.layers.-l-s-t-m-cell.pbtxt |   2 +-
 .../tensorflow.keras.layers.-l-s-t-m.pbtxt    |   4 +-
 ...ensorflow.keras.layers.-leaky-re-l-u.pbtxt |   2 +-
 ...w.keras.layers.-locally-connected1-d.pbtxt |   4 +-
 ...w.keras.layers.-locally-connected2-d.pbtxt |   4 +-
 .../tensorflow.keras.layers.-maximum.pbtxt    |   4 +-
 .../tensorflow.keras.layers.-multiply.pbtxt   |   4 +-
 .../tensorflow.keras.layers.-p-re-l-u.pbtxt   |   4 +-
 .../tensorflow.keras.layers.-r-n-n.pbtxt      |   6 +-
 ...flow.keras.layers.-separable-conv1-d.pbtxt | 186 +++++
 ...ras.layers.-separable-convolution1-d.pbtxt | 186 +++++
 ...flow.keras.layers.-simple-r-n-n-cell.pbtxt |   2 +-
 ...ensorflow.keras.layers.-simple-r-n-n.pbtxt |   4 +-
 .../tensorflow.keras.layers.-softmax.pbtxt    | 183 ++++
 ...ow.keras.layers.-stacked-r-n-n-cells.pbtxt |   2 +-
 ...low.keras.layers.-thresholded-re-l-u.pbtxt |   2 +-
 .../api/golden/tensorflow.keras.layers.pbtxt  |  12 +
 .../tensorflow.keras.models.-model.pbtxt      |   8 +-
 ...ensorflow.keras.optimizers.-adadelta.pbtxt |   2 +-
 ...tensorflow.keras.optimizers.-adagrad.pbtxt |   2 +-
 .../tensorflow.keras.optimizers.-adam.pbtxt   |   2 +-
 .../tensorflow.keras.optimizers.-adamax.pbtxt |   2 +-
 .../tensorflow.keras.optimizers.-nadam.pbtxt  |   2 +-
 ...nsorflow.keras.optimizers.-r-m-sprop.pbtxt |   2 +-
 ....keras.preprocessing.text.-tokenizer.pbtxt |   2 +-
 116 files changed, 4870 insertions(+), 1616 deletions(-)
 create mode 100644 tensorflow/python/keras/_impl/keras/applications/densenet.py
 create mode 100644 tensorflow/python/keras/_impl/keras/applications/densenet_test.py
 create mode 100644 tensorflow/python/keras/_impl/keras/applications/nasnet.py
 create mode 100644 tensorflow/python/keras/_impl/keras/applications/nasnet_test.py
 create mode 100644 tensorflow/python/keras/applications/densenet/__init__.py
 create mode 100644 tensorflow/python/keras/applications/nasnet/__init__.py
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt

diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 7db454bd83..9ce8b3cc9c 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -33,9 +33,11 @@ tensorflow/python/grappler
 tensorflow/python/keras
 tensorflow/python/keras/activations
 tensorflow/python/keras/applications
+tensorflow/python/keras/applications/densenet
 tensorflow/python/keras/applications/inception_resnet_v2
 tensorflow/python/keras/applications/inception_v3
 tensorflow/python/keras/applications/mobilenet
+tensorflow/python/keras/applications/nasnet
 tensorflow/python/keras/applications/resnet50
 tensorflow/python/keras/applications/vgg16
 tensorflow/python/keras/applications/vgg19
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 1f20b3ae0e..6125755775 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -14,10 +14,12 @@ py_library(
         "_impl/keras/__init__.py",
         "_impl/keras/activations.py",
         "_impl/keras/applications/__init__.py",
+        "_impl/keras/applications/densenet.py",
         "_impl/keras/applications/imagenet_utils.py",
         "_impl/keras/applications/inception_resnet_v2.py",
         "_impl/keras/applications/inception_v3.py",
         "_impl/keras/applications/mobilenet.py",
+        "_impl/keras/applications/nasnet.py",
         "_impl/keras/applications/resnet50.py",
         "_impl/keras/applications/vgg16.py",
         "_impl/keras/applications/vgg19.py",
@@ -76,9 +78,11 @@ py_library(
         "_impl/keras/wrappers/scikit_learn.py",
         "activations/__init__.py",
         "applications/__init__.py",
+        "applications/densenet/__init__.py",
         "applications/inception_resnet_v2/__init__.py",
         "applications/inception_v3/__init__.py",
         "applications/mobilenet/__init__.py",
+        "applications/nasnet/__init__.py",
         "applications/resnet50/__init__.py",
         "applications/vgg16/__init__.py",
         "applications/vgg19/__init__.py",
@@ -256,6 +260,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "densenet_test",
+    size = "large",
+    srcs = ["_impl/keras/applications/densenet_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "inception_resnet_v2_test",
     size = "medium",
@@ -292,6 +308,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "nasnet_test",
+    size = "large",
+    srcs = ["_impl/keras/applications/nasnet_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "resnet50_test",
     size = "small",
@@ -504,7 +532,7 @@ py_test(
 
 py_test(
     name = "recurrent_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/layers/recurrent_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -527,7 +555,7 @@ py_test(
 
 py_test(
     name = "wrappers_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/layers/wrappers_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
diff --git a/tensorflow/python/keras/_impl/keras/__init__.py b/tensorflow/python/keras/_impl/keras/__init__.py
index a70250d796..7311353932 100644
--- a/tensorflow/python/keras/_impl/keras/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/__init__.py
@@ -40,4 +40,4 @@ from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.models import Sequential
 
-__version__ = '2.1.2-tf'
+__version__ = '2.1.3-tf'
diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index f017d2ae85..4852b8c36a 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras built-in activation functions.
+"""Built-in activation functions.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -61,10 +61,12 @@ def selu(x):
       x: A tensor or variable to compute the activation function for.
 
   Returns:
-    Tensor with the same shape and dtype as `x`.
+      Tensor with the same shape and dtype as `x`.
+
+  # Note
+      - To be used together with the initialization "lecun_normal".
+      - To be used together with the dropout variant "AlphaDropout".
 
-  References:
-      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
   """
   alpha = 1.6732632423543772848170429916717
   scale = 1.0507009873554804934193349852946
diff --git a/tensorflow/python/keras/_impl/keras/applications/__init__.py b/tensorflow/python/keras/_impl/keras/applications/__init__.py
index c11c52b71e..206a769b37 100644
--- a/tensorflow/python/keras/_impl/keras/applications/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/applications/__init__.py
@@ -18,9 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet121
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet169
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet201
 from tensorflow.python.keras._impl.keras.applications.inception_resnet_v2 import InceptionResNetV2
 from tensorflow.python.keras._impl.keras.applications.inception_v3 import InceptionV3
 from tensorflow.python.keras._impl.keras.applications.mobilenet import MobileNet
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetLarge
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetMobile
 from tensorflow.python.keras._impl.keras.applications.resnet50 import ResNet50
 from tensorflow.python.keras._impl.keras.applications.vgg16 import VGG16
 from tensorflow.python.keras._impl.keras.applications.vgg19 import VGG19
diff --git a/tensorflow/python/keras/_impl/keras/applications/densenet.py b/tensorflow/python/keras/_impl/keras/applications/densenet.py
new file mode 100644
index 0000000000..9e40d34930
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/densenet.py
@@ -0,0 +1,346 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
+"""DenseNet models for Keras.
+
+# Reference paper
+
+- [Densely Connected Convolutional Networks]
+  (https://arxiv.org/abs/1608.06993) (CVPR 2017 Best Paper Award)
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.applications import imagenet_utils
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.layers import Activation
+from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import BatchNormalization
+from tensorflow.python.keras._impl.keras.layers import Concatenate
+from tensorflow.python.keras._impl.keras.layers import Conv2D
+from tensorflow.python.keras._impl.keras.layers import Dense
+from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import Input
+from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
+from tensorflow.python.keras._impl.keras.models import Model
+from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+
+
+DENSENET121_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet121_weights_tf_dim_ordering_tf_kernels.h5'
+DENSENET121_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5'
+DENSENET169_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet169_weights_tf_dim_ordering_tf_kernels.h5'
+DENSENET169_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5'
+DENSENET201_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet201_weights_tf_dim_ordering_tf_kernels.h5'
+DENSENET201_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5'
+
+
+def dense_block(x, blocks, name):
+  """A dense block.
+
+  Arguments:
+      x: input tensor.
+      blocks: integer, the number of building blocks.
+      name: string, block label.
+
+  Returns:
+      output tensor for the block.
+  """
+  for i in range(blocks):
+    x = conv_block(x, 32, name=name + '_block' + str(i + 1))
+  return x
+
+
+def transition_block(x, reduction, name):
+  """A transition block.
+
+  Arguments:
+      x: input tensor.
+      reduction: float, compression rate at transition layers.
+      name: string, block label.
+
+  Returns:
+      output tensor for the block.
+  """
+  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
+  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name=name + '_bn')(x)
+  x = Activation('relu', name=name + '_relu')(x)
+  x = Conv2D(
+      int(K.int_shape(x)[bn_axis] * reduction),
+      1,
+      use_bias=False,
+      name=name + '_conv')(
+          x)
+  x = AveragePooling2D(2, strides=2, name=name + '_pool')(x)
+  return x
+
+
+def conv_block(x, growth_rate, name):
+  """A building block for a dense block.
+
+  Arguments:
+      x: input tensor.
+      growth_rate: float, growth rate at dense layers.
+      name: string, block label.
+
+  Returns:
+      output tensor for the block.
+  """
+  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
+  x1 = BatchNormalization(
+      axis=bn_axis, epsilon=1.001e-5, name=name + '_0_bn')(
+          x)
+  x1 = Activation('relu', name=name + '_0_relu')(x1)
+  x1 = Conv2D(4 * growth_rate, 1, use_bias=False, name=name + '_1_conv')(x1)
+  x1 = BatchNormalization(
+      axis=bn_axis, epsilon=1.001e-5, name=name + '_1_bn')(
+          x1)
+  x1 = Activation('relu', name=name + '_1_relu')(x1)
+  x1 = Conv2D(
+      growth_rate, 3, padding='same', use_bias=False, name=name + '_2_conv')(
+          x1)
+  x = Concatenate(axis=bn_axis, name=name + '_concat')([x, x1])
+  return x
+
+
+def DenseNet(blocks,
+             include_top=True,
+             weights='imagenet',
+             input_tensor=None,
+             input_shape=None,
+             pooling=None,
+             classes=1000):
+  """Instantiates the DenseNet architecture.
+
+  Optionally loads weights pre-trained
+  on ImageNet. Note that when using TensorFlow,
+  for best performance you should set
+  `image_data_format='channels_last'` in your Keras config
+  at ~/.keras/keras.json.
+
+  The model and the weights are compatible with
+  TensorFlow, Theano, and CNTK. The data format
+  convention used by the model is the one
+  specified in your Keras config file.
+
+  Arguments:
+      blocks: numbers of building blocks for the four dense layers.
+      include_top: whether to include the fully-connected
+          layer at the top of the network.
+      weights: one of `None` (random initialization),
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
+      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+          to use as image input for the model.
+      input_shape: optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(224, 224, 3)` (with `channels_last` data format)
+          or `(3, 224, 224)` (with `channels_first` data format).
+          It should have exactly 3 inputs channels.
+      pooling: optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model will be
+              the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a 2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+  """
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as imagenet with `include_top`'
+                     ' as true, `classes` should be 1000')
+
+  # Determine proper input shape
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=224,
+      min_size=221,
+      data_format=K.image_data_format(),
+      require_flatten=include_top,
+      weights=weights)
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
+
+  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
+
+  x = ZeroPadding2D(padding=((3, 3), (3, 3)))(img_input)
+  x = Conv2D(64, 7, strides=2, use_bias=False, name='conv1/conv')(x)
+  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name='conv1/bn')(x)
+  x = Activation('relu', name='conv1/relu')(x)
+  x = ZeroPadding2D(padding=((1, 1), (1, 1)))(x)
+  x = MaxPooling2D(3, strides=2, name='pool1')(x)
+
+  x = dense_block(x, blocks[0], name='conv2')
+  x = transition_block(x, 0.5, name='pool2')
+  x = dense_block(x, blocks[1], name='conv3')
+  x = transition_block(x, 0.5, name='pool3')
+  x = dense_block(x, blocks[2], name='conv4')
+  x = transition_block(x, 0.5, name='pool4')
+  x = dense_block(x, blocks[3], name='conv5')
+
+  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name='bn')(x)
+
+  if include_top:
+    x = GlobalAveragePooling2D(name='avg_pool')(x)
+    x = Dense(classes, activation='softmax', name='fc1000')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D(name='avg_pool')(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D(name='max_pool')(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+
+  # Create model.
+  if blocks == [6, 12, 24, 16]:
+    model = Model(inputs, x, name='densenet121')
+  elif blocks == [6, 12, 32, 32]:
+    model = Model(inputs, x, name='densenet169')
+  elif blocks == [6, 12, 48, 32]:
+    model = Model(inputs, x, name='densenet201')
+  else:
+    model = Model(inputs, x, name='densenet')
+
+  # Load weights.
+  if weights == 'imagenet':
+    if include_top:
+      if blocks == [6, 12, 24, 16]:
+        weights_path = get_file(
+            'densenet121_weights_tf_dim_ordering_tf_kernels.h5',
+            DENSENET121_WEIGHT_PATH,
+            cache_subdir='models',
+            file_hash='0962ca643bae20f9b6771cb844dca3b0')
+      elif blocks == [6, 12, 32, 32]:
+        weights_path = get_file(
+            'densenet169_weights_tf_dim_ordering_tf_kernels.h5',
+            DENSENET169_WEIGHT_PATH,
+            cache_subdir='models',
+            file_hash='bcf9965cf5064a5f9eb6d7dc69386f43')
+      elif blocks == [6, 12, 48, 32]:
+        weights_path = get_file(
+            'densenet201_weights_tf_dim_ordering_tf_kernels.h5',
+            DENSENET201_WEIGHT_PATH,
+            cache_subdir='models',
+            file_hash='7bb75edd58cb43163be7e0005fbe95ef')
+    else:
+      if blocks == [6, 12, 24, 16]:
+        weights_path = get_file(
+            'densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5',
+            DENSENET121_WEIGHT_PATH_NO_TOP,
+            cache_subdir='models',
+            file_hash='4912a53fbd2a69346e7f2c0b5ec8c6d3')
+      elif blocks == [6, 12, 32, 32]:
+        weights_path = get_file(
+            'densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5',
+            DENSENET169_WEIGHT_PATH_NO_TOP,
+            cache_subdir='models',
+            file_hash='50662582284e4cf834ce40ab4dfa58c6')
+      elif blocks == [6, 12, 48, 32]:
+        weights_path = get_file(
+            'densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5',
+            DENSENET201_WEIGHT_PATH_NO_TOP,
+            cache_subdir='models',
+            file_hash='1c2de60ee40562448dbac34a0737e798')
+    model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
+
+  return model
+
+
+def DenseNet121(include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                input_shape=None,
+                pooling=None,
+                classes=1000):
+  return DenseNet([6, 12, 24, 16], include_top, weights, input_tensor,
+                  input_shape, pooling, classes)
+
+
+def DenseNet169(include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                input_shape=None,
+                pooling=None,
+                classes=1000):
+  return DenseNet([6, 12, 32, 32], include_top, weights, input_tensor,
+                  input_shape, pooling, classes)
+
+
+def DenseNet201(include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                input_shape=None,
+                pooling=None,
+                classes=1000):
+  return DenseNet([6, 12, 48, 32], include_top, weights, input_tensor,
+                  input_shape, pooling, classes)
+
+
+def preprocess_input(x, data_format=None):
+  """Preprocesses a numpy array encoding a batch of images.
+
+  Arguments:
+      x: a 3D or 4D numpy array consists of RGB values within [0, 255].
+      data_format: data format of the image tensor.
+
+  Returns:
+      Preprocessed array.
+  """
+  return imagenet_utils.preprocess_input(x, data_format, mode='torch')
+
+
+setattr(DenseNet121, '__doc__', DenseNet.__doc__)
+setattr(DenseNet169, '__doc__', DenseNet.__doc__)
+setattr(DenseNet201, '__doc__', DenseNet.__doc__)
diff --git a/tensorflow/python/keras/_impl/keras/applications/densenet_test.py b/tensorflow/python/keras/_impl/keras/applications/densenet_test.py
new file mode 100644
index 0000000000..3b92287a1e
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/densenet_test.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DenseNet application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.platform import test
+
+
+class DenseNet121Test(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.DenseNet121(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.DenseNet121(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1024))
+
+  def test_with_pooling(self):
+    model = keras.applications.DenseNet121(weights=None,
+                                           include_top=False,
+                                           pooling='avg')
+    self.assertEqual(model.output_shape, (None, 1024))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet121(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet121(weights='imagenet',
+                                     classes=2000)
+
+
+class DenseNet169Test(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.DenseNet169(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.DenseNet169(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1664))
+
+  def test_with_pooling(self):
+    model = keras.applications.DenseNet169(weights=None,
+                                           include_top=False,
+                                           pooling='max')
+    self.assertEqual(model.output_shape, (None, 1664))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet169(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet169(weights='imagenet',
+                                     classes=2000)
+
+
+class DenseNet201(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.DenseNet201(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.DenseNet201(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1920))
+
+  def test_with_pooling(self):
+    model = keras.applications.DenseNet201(weights=None,
+                                           include_top=False,
+                                           pooling='avg')
+    self.assertEqual(model.output_shape, (None, 1920))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet201(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet201(weights='imagenet',
+                                     classes=2000)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
index 63ee83cb51..f1f20f12a8 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities used by models pre-trained on ImageNet.
+"""Utilities for ImageNet data preprocessing & prediction decoding.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -35,63 +35,92 @@ _IMAGENET_MEAN = None
 
 
 def _preprocess_numpy_input(x, data_format, mode):
-  """Preprocesses a image tensor as a Numpy array.
+  """Preprocesses a Numpy array encoding a batch of images.
 
   Arguments:
-      x: input Numpy, 3D or 4D.
-      data_format: data format of the image tensor.
-      mode: One of "caffe", "tf".
+      x: Input array, 3D or 4D.
+      data_format: Data format of the image array.
+      mode: One of "caffe", "tf" or "torch".
           - caffe: will convert the images from RGB to BGR,
               then will zero-center each color channel with
               respect to the ImageNet dataset,
               without scaling.
           - tf: will scale pixels between -1 and 1,
               sample-wise.
+          - torch: will scale pixels between 0 and 1 and then
+              will normalize each channel with respect to the
+              ImageNet dataset.
 
   Returns:
-      Preprocessed array.
+      Preprocessed Numpy array.
   """
   if mode == 'tf':
     x /= 127.5
     x -= 1.
     return x
 
+  if mode == 'torch':
+    x /= 255.
+    mean = [0.485, 0.456, 0.406]
+    std = [0.229, 0.224, 0.225]
+  else:
+    if data_format == 'channels_first':
+      # 'RGB'->'BGR'
+      if x.ndim == 3:
+        x = x[::-1, ...]
+      else:
+        x = x[:, ::-1, ...]
+    else:
+      # 'RGB'->'BGR'
+      x = x[..., ::-1]
+    mean = [103.939, 116.779, 123.68]
+    std = None
+
+  # Zero-center by mean pixel
   if data_format == 'channels_first':
     if x.ndim == 3:
-      # 'RGB'->'BGR'
-      x = x[::-1, ...]
-      # Zero-center by mean pixel
-      x[0, :, :] -= 103.939
-      x[1, :, :] -= 116.779
-      x[2, :, :] -= 123.68
+      x[0, :, :] -= mean[0]
+      x[1, :, :] -= mean[1]
+      x[2, :, :] -= mean[2]
+      if std is not None:
+        x[0, :, :] /= std[0]
+        x[1, :, :] /= std[1]
+        x[2, :, :] /= std[2]
     else:
-      x = x[:, ::-1, ...]
-      x[:, 0, :, :] -= 103.939
-      x[:, 1, :, :] -= 116.779
-      x[:, 2, :, :] -= 123.68
+      x[:, 0, :, :] -= mean[0]
+      x[:, 1, :, :] -= mean[1]
+      x[:, 2, :, :] -= mean[2]
+      if std is not None:
+        x[:, 0, :, :] /= std[0]
+        x[:, 1, :, :] /= std[1]
+        x[:, 2, :, :] /= std[2]
   else:
-    # 'RGB'->'BGR'
-    x = x[..., ::-1]
-    # Zero-center by mean pixel
-    x[..., 0] -= 103.939
-    x[..., 1] -= 116.779
-    x[..., 2] -= 123.68
+    x[..., 0] -= mean[0]
+    x[..., 1] -= mean[1]
+    x[..., 2] -= mean[2]
+    if std is not None:
+      x[..., 0] /= std[0]
+      x[..., 1] /= std[1]
+      x[..., 2] /= std[2]
   return x
 
 
 def _preprocess_symbolic_input(x, data_format, mode):
-  """Preprocesses a symbolic image tensor.
+  """Preprocesses a tensor encoding a batch of images.
 
   Arguments:
-      x: symoblic tensor, 3D or 4D.
-      data_format: data format of the image tensor.
-      mode: One of "caffe", "tf".
+      x: Input tensor, 3D or 4D.
+      data_format: Data format of the image tensor.
+      mode: One of "caffe", "tf" or "torch".
           - caffe: will convert the images from RGB to BGR,
               then will zero-center each color channel with
               respect to the ImageNet dataset,
               without scaling.
           - tf: will scale pixels between -1 and 1,
               sample-wise.
+          - torch: will scale pixels between 0 and 1 and then
+              will normalize each channel with respect to the
+              ImageNet dataset.
 
   Returns:
       Preprocessed tensor.
@@ -103,32 +132,42 @@ def _preprocess_symbolic_input(x, data_format, mode):
     x -= 1.
     return x
 
-  if data_format == 'channels_first':
-    # 'RGB'->'BGR'
-    if K.ndim(x) == 3:
-      x = x[::-1, ...]
-    else:
-      x = x[:, ::-1, ...]
+  if mode == 'torch':
+    x /= 255.
+    mean = [0.485, 0.456, 0.406]
+    std = [0.229, 0.224, 0.225]
   else:
-    # 'RGB'->'BGR'
-    x = x[..., ::-1]
+    if data_format == 'channels_first':
+      # 'RGB'->'BGR'
+      if K.ndim(x) == 3:
+        x = x[::-1, ...]
+      else:
+        x = x[:, ::-1, ...]
+    else:
+      # 'RGB'->'BGR'
+      x = x[..., ::-1]
+    mean = [103.939, 116.779, 123.68]
+    std = None
 
   if _IMAGENET_MEAN is None:
-    _IMAGENET_MEAN = K.constant(-np.array([103.939, 116.779, 123.68]))
+    _IMAGENET_MEAN = K.constant(-np.array(mean))
+
   # Zero-center by mean pixel
   if K.dtype(x) != K.dtype(_IMAGENET_MEAN):
     x = K.bias_add(x, K.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
   else:
     x = K.bias_add(x, _IMAGENET_MEAN, data_format)
+  if std is not None:
+    x /= std
   return x
 
 
 def preprocess_input(x, data_format=None, mode='caffe'):
-  """Preprocesses a tensor encoding a batch of images.
+  """Preprocesses a tensor or Numpy array encoding a batch of images.
 
   Arguments:
-      x: input Numpy or symoblic tensor, 3D or 4D.
-      data_format: data format of the image tensor.
+      x: Input Numpy or symbolic tensor, 3D or 4D.
+      data_format: Data format of the image tensor/array.
       mode: One of "caffe", "tf".
           - caffe: will convert the images from RGB to BGR,
               then will zero-center each color channel with
@@ -138,10 +177,10 @@ def preprocess_input(x, data_format=None, mode='caffe'):
               sample-wise.
 
   Returns:
-      Preprocessed tensor.
+      Preprocessed tensor or Numpy array.
 
   Raises:
-      ValueError: in case of incorrect data_format.
+      ValueError: In case of unknown `data_format` argument.
   """
   if data_format is None:
     data_format = K.image_data_format()
@@ -159,7 +198,7 @@ def decode_predictions(preds, top=5):
 
   Arguments:
       preds: Numpy tensor encoding a batch of predictions.
-      top: integer, how many top-guesses to return.
+      top: Integer, how many top-guesses to return.
 
   Returns:
       A list of lists of top class prediction tuples
@@ -167,7 +206,7 @@ def decode_predictions(preds, top=5):
       One list of tuples per sample in batch input.
 
   Raises:
-      ValueError: in case of invalid shape of the `pred` array
+      ValueError: In case of invalid shape of the `pred` array
           (must be 2D).
   """
   global CLASS_INDEX
@@ -177,10 +216,11 @@ def decode_predictions(preds, top=5):
                      '(i.e. a 2D array of shape (samples, 1000)). '
                      'Found array with shape: ' + str(preds.shape))
   if CLASS_INDEX is None:
-    fpath = get_file('imagenet_class_index.json',
-                     CLASS_INDEX_PATH,
-                     cache_subdir='models',
-                     file_hash='c2c37ea517e94d9795004a39431a14cb')
+    fpath = get_file(
+        'imagenet_class_index.json',
+        CLASS_INDEX_PATH,
+        cache_subdir='models',
+        file_hash='c2c37ea517e94d9795004a39431a14cb')
     CLASS_INDEX = json.load(open(fpath))
   results = []
   for pred in preds:
@@ -197,17 +237,17 @@ def _obtain_input_shape(input_shape,
                         data_format,
                         require_flatten,
                         weights=None):
-  """Internal utility to compute/validate an ImageNet model's input shape.
+  """Internal utility to compute/validate a model's input shape.
 
   Arguments:
-      input_shape: either None (will return the default network input shape),
+      input_shape: Either None (will return the default network input shape),
           or a user-provided shape to be validated.
-      default_size: default input width/height for the model.
-      min_size: minimum input width/height accepted by the model.
-      data_format: image data format to use.
-      require_flatten: whether the model is expected to
+      default_size: Default input width/height for the model.
+      min_size: Minimum input width/height accepted by the model.
+      data_format: Image data format to use.
+      require_flatten: Whether the model is expected to
           be linked to a classifier via a Flatten layer.
-      weights: one of `None` (random initialization)
+      weights: One of `None` (random initialization)
           or 'imagenet' (pre-training on ImageNet).
           If weights='imagenet' input channels must be equal to 3.
 
@@ -215,7 +255,7 @@ def _obtain_input_shape(input_shape,
       An integer shape tuple (may include None entries).
 
   Raises:
-      ValueError: in case of invalid argument values.
+      ValueError: In case of invalid argument values.
   """
   if weights != 'imagenet' and input_shape and len(input_shape) == 3:
     if data_format == 'channels_first':
@@ -252,8 +292,8 @@ def _obtain_input_shape(input_shape,
                            '`input_shape=' + str(input_shape) + '`')
         if ((input_shape[1] is not None and input_shape[1] < min_size) or
             (input_shape[2] is not None and input_shape[2] < min_size)):
-          raise ValueError('Input size must be at least ' + str(min_size) + 'x'
-                           + str(min_size) + '; got '
+          raise ValueError('Input size must be at least ' + str(min_size) +
+                           'x' + str(min_size) + '; got '
                            '`input_shape=' + str(input_shape) + '`')
     else:
       if input_shape is not None:
@@ -264,8 +304,8 @@ def _obtain_input_shape(input_shape,
                            '`input_shape=' + str(input_shape) + '`')
         if ((input_shape[0] is not None and input_shape[0] < min_size) or
             (input_shape[1] is not None and input_shape[1] < min_size)):
-          raise ValueError('Input size must be at least ' + str(min_size) + 'x'
-                           + str(min_size) + '; got '
+          raise ValueError('Input size must be at least ' + str(min_size) +
+                           'x' + str(min_size) + '; got '
                            '`input_shape=' + str(input_shape) + '`')
   else:
     if require_flatten:
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
index 2e73cefb6c..1dc15b5b34 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
 """Inception-ResNet V2 model for Keras.
 
 # Reference
@@ -28,7 +30,7 @@ import os
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
@@ -43,6 +45,8 @@ from tensorflow.python.keras._impl.keras.layers import Lambda
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+
 
 BASE_WEIGHT_URL = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.7/'
 
@@ -116,7 +120,8 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
       scale: scaling factor to scale the residuals (i.e., the output of
           passing `x` through an inception module) before adding them
           to the shortcut branch. Let `r` be the output from the residual
-          branch, the output of this block will be `x + scale * r`.
+            branch,
+          the output of this block will be `x + scale * r`.
       block_type: `'block35'`, `'block17'` or `'block8'`, determines
           the network structure in the residual branch.
       block_idx: an `int` used for generating layer names. The Inception-ResNet
@@ -128,8 +133,7 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
           will have `block_type='block35', block_idx=0`, ane the layer names
             will have
           a common prefix `'block35_0'`.
-      activation: activation function to use at the end of the block
-          (see [activations](../activations.md)).
+      activation: activation function to use at the end of the block.
           When `activation=None`, no activation is applied
           (i.e., "linear" activation: `a(x) = x`).
 
@@ -178,6 +182,7 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
 
   x = Lambda(
       lambda inputs, scale: inputs[0] + inputs[1] * scale,
+      output_shape=K.int_shape(x)[1:],
       arguments={'scale': scale},
       name=block_name)([x, up])
   if activation is not None:
@@ -185,7 +190,7 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
   return x
 
 
-def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
+def InceptionResNetV2(include_top=True,
                       weights='imagenet',
                       input_tensor=None,
                       input_shape=None,
@@ -211,8 +216,8 @@ def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
       include_top: whether to include the fully-connected
           layer at the top of the network.
       weights: one of `None` (random initialization),
-          'imagenet' (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
index 4424b92804..ff57116f2d 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """Inception V3 model for Keras.
 
 Note that the input image format for this model is different than for
@@ -35,7 +36,7 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
@@ -48,6 +49,7 @@ from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels.h5'
@@ -92,7 +94,8 @@ def conv2d_bn(x,
       strides=strides,
       padding=padding,
       use_bias=False,
-      name=conv_name)(x)
+      name=conv_name)(
+          x)
   x = BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x)
   x = Activation('relu', name=name)(x)
   return x
@@ -109,7 +112,7 @@ def InceptionV3(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
   The model and the weights are compatible with both
   TensorFlow and Theano. The data format
@@ -121,15 +124,15 @@ def InceptionV3(include_top=True,
       include_top: whether to include the fully-connected
           layer at the top of the network.
       weights: one of `None` (random initialization),
-          "imagenet" (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(299, 299, 3)` (with `channels_last` data format)
           or `(3, 299, 299)` (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 139.
           E.g. `(150, 150, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -176,7 +179,10 @@ def InceptionV3(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
 
   if K.image_data_format() == 'channels_first':
     channel_axis = 1
@@ -389,6 +395,7 @@ def InceptionV3(include_top=True,
     model.load_weights(weights_path)
   elif weights is not None:
     model.load_weights(weights)
+
   return model
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index 5f97c138fc..790bf8cead 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
 """MobileNet v1 models for Keras.
 
 MobileNet is a general architecture and can be used for multiple use cases.
@@ -56,7 +58,7 @@ the 100 % MobileNet on various input sizes:
 ------------------------------------------------------------------------
 
 The weights for all 16 models are obtained and translated
-from Tensorflow checkpoints found at
+from TensorFlow checkpoints found at
 https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md
 
 # Reference
@@ -75,9 +77,10 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
 from tensorflow.python.keras._impl.keras.layers import Conv2D
@@ -91,6 +94,7 @@ from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
 
+
 BASE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.6/'
 
 
@@ -130,7 +134,7 @@ class DepthwiseConv2D(Conv2D):
           all spatial dimensions.
           Specifying any stride value != 1 is incompatible with specifying
           any `dilation_rate` value != 1.
-      padding: one of `"valid"` or `"same"` (case-insensitive).
+      padding: one of `'valid'` or `'same'` (case-insensitive).
       depth_multiplier: The number of depthwise convolution output channels
           for each input channel.
           The total number of depthwise convolution output
@@ -144,29 +148,21 @@ class DepthwiseConv2D(Conv2D):
           `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      activation: Activation function to use
-          (see [activations](../activations.md)).
+          If you never set it, then it will be 'channels_last'.
+      activation: Activation function to use.
           If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
+          (ie. 'linear' activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
-      depthwise_initializer: Initializer for the depthwise kernel matrix
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      depthwise_initializer: Initializer for the depthwise kernel matrix.
+      bias_initializer: Initializer for the bias vector.
       depthwise_regularizer: Regularizer function applied to
-          the depthwise kernel matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the depthwise kernel matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its 'activation')..
       depthwise_constraint: Constraint function applied to
-          the depthwise kernel matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the depthwise kernel matrix.
+      bias_constraint: Constraint function applied to the bias vector.
 
   Input shape:
       4D tensor with shape:
@@ -216,6 +212,7 @@ class DepthwiseConv2D(Conv2D):
     self.depthwise_constraint = constraints.get(depthwise_constraint)
     self.bias_initializer = initializers.get(bias_initializer)
 
+  @shape_type_conversion
   def build(self, input_shape):
     if len(input_shape) < 4:
       raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
@@ -269,6 +266,7 @@ class DepthwiseConv2D(Conv2D):
 
     return outputs
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if self.data_format == 'channels_first':
       rows = input_shape[2]
@@ -305,7 +303,7 @@ class DepthwiseConv2D(Conv2D):
     return config
 
 
-def MobileNet(input_shape=None,  # pylint: disable=invalid-name
+def MobileNet(input_shape=None,
               alpha=1.0,
               depth_multiplier=1,
               dropout=1e-3,
@@ -334,7 +332,7 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
           or (3, 224, 224) (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 32.
           E.g. `(200, 200, 3)` would be one valid value.
       alpha: controls the width of the network.
@@ -350,8 +348,8 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       include_top: whether to include the fully-connected
           layer at the top of the network.
       weights: one of `None` (random initialization),
-          'imagenet' (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of
           `layers.Input()`)
           to use as image input for the model.
@@ -380,6 +378,12 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       RuntimeError: If attempting to run this model with a
           backend that does not support separable convolutions.
   """
+
+  if K.backend() != 'tensorflow':
+    raise RuntimeError('Only TensorFlow backend is currently supported, '
+                       'as other backends do not support '
+                       'depthwise convolution.')
+
   if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
                      '`None` (random initialization), `imagenet` '
@@ -390,7 +394,7 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
     raise ValueError('If using `weights` as ImageNet with `include_top` '
                      'as true, `classes` should be 1000')
 
-  # Determine proper input shape.
+  # Determine proper input shape and default size.
   if input_shape is None:
     default_size = 224
   else:
@@ -400,10 +404,12 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
     else:
       rows = input_shape[0]
       cols = input_shape[1]
+
     if rows == cols and rows in [128, 160, 192, 224]:
       default_size = rows
     else:
       default_size = 224
+
   input_shape = _obtain_input_shape(
       input_shape,
       default_size=default_size,
@@ -411,6 +417,7 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       data_format=K.image_data_format(),
       require_flatten=include_top,
       weights=weights)
+
   if K.image_data_format() == 'channels_last':
     row_axis, col_axis = (0, 1)
   else:
@@ -536,8 +543,6 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
 
   if old_data_format:
     K.set_image_data_format(old_data_format)
-  elif weights is not None:
-    model.load_weights(weights)
   return model
 
 
@@ -595,7 +600,8 @@ def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
       padding='same',
       use_bias=False,
       strides=strides,
-      name='conv1')(inputs)
+      name='conv1')(
+          inputs)
   x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
   return Activation(relu6, name='conv1_relu')(x)
 
@@ -662,7 +668,8 @@ def _depthwise_conv_block(inputs,
       depth_multiplier=depth_multiplier,
       strides=strides,
       use_bias=False,
-      name='conv_dw_%d' % block_id)(inputs)
+      name='conv_dw_%d' % block_id)(
+          inputs)
   x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
   x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)
 
@@ -671,6 +678,7 @@ def _depthwise_conv_block(inputs,
       padding='same',
       use_bias=False,
       strides=(1, 1),
-      name='conv_pw_%d' % block_id)(x)
+      name='conv_pw_%d' % block_id)(
+          x)
   x = BatchNormalization(axis=channel_axis, name='conv_pw_%d_bn' % block_id)(x)
   return Activation(relu6, name='conv_pw_%d_relu' % block_id)(x)
diff --git a/tensorflow/python/keras/_impl/keras/applications/nasnet.py b/tensorflow/python/keras/_impl/keras/applications/nasnet.py
new file mode 100644
index 0000000000..5dd038c096
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/nasnet.py
@@ -0,0 +1,783 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
+"""NASNet-A models for Keras.
+
+NASNet refers to Neural Architecture Search Network, a family of models
+that were designed automatically by learning the model architectures
+directly on the dataset of interest.
+
+Here we consider NASNet-A, the highest performance model that was found
+for the CIFAR-10 dataset, and then extended to ImageNet 2012 dataset,
+obtaining state of the art performance on CIFAR-10 and ImageNet 2012.
+Only the NASNet-A models, and their respective weights, which are suited
+for ImageNet 2012 are provided.
+
+The below table describes the performance on ImageNet 2012:
+--------------------------------------------------------------------------------
+      Architecture       | Top-1 Acc | Top-5 Acc |  Multiply-Adds |  Params (M)
+--------------------------------------------------------------------------------
+|   NASNet-A (4 @ 1056)  |   74.0 %  |   91.6 %  |       564 M    |     5.3    |
+|   NASNet-A (6 @ 4032)  |   82.7 %  |   96.2 %  |      23.8 B    |    88.9    |
+--------------------------------------------------------------------------------
+
+References:
+ - [Learning Transferable Architectures for Scalable Image Recognition]
+    (https://arxiv.org/abs/1707.07012)
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.inception_v3 import preprocess_input
+from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.layers import Activation
+from tensorflow.python.keras._impl.keras.layers import add
+from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import BatchNormalization
+from tensorflow.python.keras._impl.keras.layers import concatenate
+from tensorflow.python.keras._impl.keras.layers import Conv2D
+from tensorflow.python.keras._impl.keras.layers import Cropping2D
+from tensorflow.python.keras._impl.keras.layers import Dense
+from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import Input
+from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import SeparableConv2D
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
+from tensorflow.python.keras._impl.keras.models import Model
+from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+
+
+NASNET_MOBILE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-mobile.h5'
+NASNET_MOBILE_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-mobile-no-top.h5'
+NASNET_LARGE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-large.h5'
+NASNET_LARGE_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-large-no-top.h5'
+
+
+def NASNet(input_shape=None,
+           penultimate_filters=4032,
+           num_blocks=6,
+           stem_block_filters=96,
+           skip_reduction=True,
+           filter_multiplier=2,
+           include_top=True,
+           weights=None,
+           input_tensor=None,
+           pooling=None,
+           classes=1000,
+           default_size=None):
+  """Instantiates a NASNet model.
+
+  Note that only TensorFlow is supported for now,
+  therefore it only works with the data format
+  `image_data_format='channels_last'` in your Keras config
+  at `~/.keras/keras.json`.
+
+  Arguments:
+      input_shape: Optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(331, 331, 3)` for NASNetLarge or
+          `(224, 224, 3)` for NASNetMobile
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 32.
+          E.g. `(224, 224, 3)` would be one valid value.
+      penultimate_filters: Number of filters in the penultimate layer.
+          NASNet models use the notation `NASNet (N @ P)`, where:
+              -   N is the number of blocks
+              -   P is the number of penultimate filters
+      num_blocks: Number of repeated blocks of the NASNet model.
+          NASNet models use the notation `NASNet (N @ P)`, where:
+              -   N is the number of blocks
+              -   P is the number of penultimate filters
+      stem_block_filters: Number of filters in the initial stem block
+      skip_reduction: Whether to skip the reduction step at the tail
+          end of the network. Set to `False` for CIFAR models.
+      filter_multiplier: Controls the width of the network.
+          - If `filter_multiplier` < 1.0, proportionally decreases the number
+              of filters in each layer.
+          - If `filter_multiplier` > 1.0, proportionally increases the number
+              of filters in each layer.
+          - If `filter_multiplier` = 1, default number of filters from the
+               paper are used at each layer.
+      include_top: Whether to include the fully-connected
+          layer at the top of the network.
+      weights: `None` (random initialization) or
+          `imagenet` (ImageNet weights)
+      input_tensor: Optional Keras tensor (i.e. output of
+          `layers.Input()`)
+          to use as image input for the model.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model
+              will be the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a
+              2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: Optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+      default_size: Specifies the default image size of the model
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: In case of invalid argument for `weights`,
+          invalid input shape or invalid `penultimate_filters` value.
+      RuntimeError: If attempting to run this model with a
+          backend that does not support separable convolutions.
+  """
+  if K.backend() != 'tensorflow':
+    raise RuntimeError('Only Tensorflow backend is currently supported, '
+                       'as other backends do not support '
+                       'separable convolution.')
+
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as ImageNet with `include_top` '
+                     'as true, `classes` should be 1000')
+
+  if default_size is None:
+    default_size = 331
+
+  # Determine proper input shape and default size.
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=default_size,
+      min_size=32,
+      data_format=K.image_data_format(),
+      require_flatten=include_top or weights,
+      weights=weights)
+
+  if K.image_data_format() != 'channels_last':
+    logging.warning('The NASNet family of models is only available '
+                    'for the input data format "channels_last" '
+                    '(width, height, channels). '
+                    'However your settings specify the default '
+                    'data format "channels_first" (channels, width, height).'
+                    ' You should set `image_data_format="channels_last"` '
+                    'in your Keras config located at ~/.keras/keras.json. '
+                    'The model being returned right now will expect inputs '
+                    'to follow the "channels_last" data format.')
+    K.set_image_data_format('channels_last')
+    old_data_format = 'channels_first'
+  else:
+    old_data_format = None
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
+
+  if penultimate_filters % 24 != 0:
+    raise ValueError(
+        'For NASNet-A models, the value of `penultimate_filters` '
+        'needs to be divisible by 24. Current value: %d' % penultimate_filters)
+
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+  filters = penultimate_filters // 24
+
+  if not skip_reduction:
+    x = Conv2D(
+        stem_block_filters, (3, 3),
+        strides=(2, 2),
+        padding='valid',
+        use_bias=False,
+        name='stem_conv1',
+        kernel_initializer='he_normal')(
+            img_input)
+  else:
+    x = Conv2D(
+        stem_block_filters, (3, 3),
+        strides=(1, 1),
+        padding='same',
+        use_bias=False,
+        name='stem_conv1',
+        kernel_initializer='he_normal')(
+            img_input)
+
+  x = BatchNormalization(
+      axis=channel_dim, momentum=0.9997, epsilon=1e-3, name='stem_bn1')(
+          x)
+
+  p = None
+  if not skip_reduction:  # imagenet / mobile mode
+    x, p = _reduction_a_cell(
+        x, p, filters // (filter_multiplier**2), block_id='stem_1')
+    x, p = _reduction_a_cell(
+        x, p, filters // filter_multiplier, block_id='stem_2')
+
+  for i in range(num_blocks):
+    x, p = _normal_a_cell(x, p, filters, block_id='%d' % (i))
+
+  x, p0 = _reduction_a_cell(
+      x, p, filters * filter_multiplier, block_id='reduce_%d' % (num_blocks))
+
+  p = p0 if not skip_reduction else p
+
+  for i in range(num_blocks):
+    x, p = _normal_a_cell(
+        x, p, filters * filter_multiplier, block_id='%d' % (num_blocks + i + 1))
+
+  x, p0 = _reduction_a_cell(
+      x,
+      p,
+      filters * filter_multiplier**2,
+      block_id='reduce_%d' % (2 * num_blocks))
+
+  p = p0 if not skip_reduction else p
+
+  for i in range(num_blocks):
+    x, p = _normal_a_cell(
+        x,
+        p,
+        filters * filter_multiplier**2,
+        block_id='%d' % (2 * num_blocks + i + 1))
+
+  x = Activation('relu')(x)
+
+  if include_top:
+    x = GlobalAveragePooling2D()(x)
+    x = Dense(classes, activation='softmax', name='predictions')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D()(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D()(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+
+  model = Model(inputs, x, name='NASNet')
+
+  # load weights
+  if weights == 'imagenet':
+    if default_size == 224:  # mobile version
+      if include_top:
+        weight_path = NASNET_MOBILE_WEIGHT_PATH
+        model_name = 'nasnet_mobile.h5'
+      else:
+        weight_path = NASNET_MOBILE_WEIGHT_PATH_NO_TOP
+        model_name = 'nasnet_mobile_no_top.h5'
+
+      weights_file = get_file(model_name, weight_path, cache_subdir='models')
+      model.load_weights(weights_file)
+
+    elif default_size == 331:  # large version
+      if include_top:
+        weight_path = NASNET_LARGE_WEIGHT_PATH
+        model_name = 'nasnet_large.h5'
+      else:
+        weight_path = NASNET_LARGE_WEIGHT_PATH_NO_TOP
+        model_name = 'nasnet_large_no_top.h5'
+
+      weights_file = get_file(model_name, weight_path, cache_subdir='models')
+      model.load_weights(weights_file)
+    else:
+      raise ValueError('ImageNet weights can only be loaded with NASNetLarge'
+                       ' or NASNetMobile')
+  elif weights is not None:
+    model.load_weights(weights)
+
+  if old_data_format:
+    K.set_image_data_format(old_data_format)
+
+  return model
+
+
+def NASNetLarge(input_shape=None,
+                include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                pooling=None,
+                classes=1000):
+  """Instantiates a NASNet model in ImageNet mode.
+
+  Note that only TensorFlow is supported for now,
+  therefore it only works with the data format
+  `image_data_format='channels_last'` in your Keras config
+  at `~/.keras/keras.json`.
+
+  Arguments:
+      input_shape: Optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(331, 331, 3)` for NASNetLarge.
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 32.
+          E.g. `(224, 224, 3)` would be one valid value.
+      include_top: Whether to include the fully-connected
+          layer at the top of the network.
+      weights: `None` (random initialization) or
+          `imagenet` (ImageNet weights)
+      input_tensor: Optional Keras tensor (i.e. output of
+          `layers.Input()`)
+          to use as image input for the model.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model
+              will be the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a
+              2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: Optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+      RuntimeError: If attempting to run this model with a
+          backend that does not support separable convolutions.
+  """
+  return NASNet(
+      input_shape,
+      penultimate_filters=4032,
+      num_blocks=6,
+      stem_block_filters=96,
+      skip_reduction=False,
+      filter_multiplier=2,
+      include_top=include_top,
+      weights=weights,
+      input_tensor=input_tensor,
+      pooling=pooling,
+      classes=classes,
+      default_size=331)
+
+
+def NASNetMobile(input_shape=None,
+                 include_top=True,
+                 weights='imagenet',
+                 input_tensor=None,
+                 pooling=None,
+                 classes=1000):
+  """Instantiates a Mobile NASNet model in ImageNet mode.
+
+  Note that only TensorFlow is supported for now,
+  therefore it only works with the data format
+  `image_data_format='channels_last'` in your Keras config
+  at `~/.keras/keras.json`.
+
+  Arguments:
+      input_shape: Optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(224, 224, 3)` for NASNetMobile
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 32.
+          E.g. `(224, 224, 3)` would be one valid value.
+      include_top: Whether to include the fully-connected
+          layer at the top of the network.
+      weights: `None` (random initialization) or
+          `imagenet` (ImageNet weights)
+      input_tensor: Optional Keras tensor (i.e. output of
+          `layers.Input()`)
+          to use as image input for the model.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model
+              will be the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a
+              2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: Optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: In case of invalid argument for `weights`,
+          or invalid input shape.
+      RuntimeError: If attempting to run this model with a
+          backend that does not support separable convolutions.
+  """
+  return NASNet(
+      input_shape,
+      penultimate_filters=1056,
+      num_blocks=4,
+      stem_block_filters=32,
+      skip_reduction=False,
+      filter_multiplier=2,
+      include_top=include_top,
+      weights=weights,
+      input_tensor=input_tensor,
+      pooling=pooling,
+      classes=classes,
+      default_size=224)
+
+
+def _separable_conv_block(ip,
+                          filters,
+                          kernel_size=(3, 3),
+                          strides=(1, 1),
+                          block_id=None):
+  """Adds 2 blocks of [relu-separable conv-batchnorm].
+
+  Arguments:
+      ip: Input tensor
+      filters: Number of output filters per layer
+      kernel_size: Kernel size of separable convolutions
+      strides: Strided convolution for downsampling
+      block_id: String block_id
+
+  Returns:
+      A Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+  with K.name_scope('separable_conv_block_%s' % block_id):
+    x = Activation('relu')(ip)
+    x = SeparableConv2D(
+        filters,
+        kernel_size,
+        strides=strides,
+        name='separable_conv_1_%s' % block_id,
+        padding='same',
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            x)
+    x = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='separable_conv_1_bn_%s' % (block_id))(
+            x)
+    x = Activation('relu')(x)
+    x = SeparableConv2D(
+        filters,
+        kernel_size,
+        name='separable_conv_2_%s' % block_id,
+        padding='same',
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            x)
+    x = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='separable_conv_2_bn_%s' % (block_id))(
+            x)
+  return x
+
+
+def _adjust_block(p, ip, filters, block_id=None):
+  """Adjusts the input `previous path` to match the shape of the `input`.
+
+  Used in situations where the output number of filters needs to be changed.
+
+  Arguments:
+      p: Input tensor which needs to be modified
+      ip: Input tensor whose shape needs to be matched
+      filters: Number of output filters to be matched
+      block_id: String block_id
+
+  Returns:
+      Adjusted Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+  img_dim = 2 if K.image_data_format() == 'channels_first' else -2
+
+  ip_shape = K.int_shape(ip)
+
+  if p is not None:
+    p_shape = K.int_shape(p)
+
+  with K.name_scope('adjust_block'):
+    if p is None:
+      p = ip
+
+    elif p_shape[img_dim] != ip_shape[img_dim]:
+      with K.name_scope('adjust_reduction_block_%s' % block_id):
+        p = Activation('relu', name='adjust_relu_1_%s' % block_id)(p)
+
+        p1 = AveragePooling2D(
+            (1, 1),
+            strides=(2, 2),
+            padding='valid',
+            name='adjust_avg_pool_1_%s' % block_id)(
+                p)
+        p1 = Conv2D(
+            filters // 2, (1, 1),
+            padding='same',
+            use_bias=False,
+            name='adjust_conv_1_%s' % block_id,
+            kernel_initializer='he_normal')(
+                p1)
+
+        p2 = ZeroPadding2D(padding=((0, 1), (0, 1)))(p)
+        p2 = Cropping2D(cropping=((1, 0), (1, 0)))(p2)
+        p2 = AveragePooling2D(
+            (1, 1),
+            strides=(2, 2),
+            padding='valid',
+            name='adjust_avg_pool_2_%s' % block_id)(
+                p2)
+        p2 = Conv2D(
+            filters // 2, (1, 1),
+            padding='same',
+            use_bias=False,
+            name='adjust_conv_2_%s' % block_id,
+            kernel_initializer='he_normal')(
+                p2)
+
+        p = concatenate([p1, p2], axis=channel_dim)
+        p = BatchNormalization(
+            axis=channel_dim,
+            momentum=0.9997,
+            epsilon=1e-3,
+            name='adjust_bn_%s' % block_id)(
+                p)
+
+    elif p_shape[channel_dim] != filters:
+      with K.name_scope('adjust_projection_block_%s' % block_id):
+        p = Activation('relu')(p)
+        p = Conv2D(
+            filters, (1, 1),
+            strides=(1, 1),
+            padding='same',
+            name='adjust_conv_projection_%s' % block_id,
+            use_bias=False,
+            kernel_initializer='he_normal')(
+                p)
+        p = BatchNormalization(
+            axis=channel_dim,
+            momentum=0.9997,
+            epsilon=1e-3,
+            name='adjust_bn_%s' % block_id)(
+                p)
+  return p
+
+
+def _normal_a_cell(ip, p, filters, block_id=None):
+  """Adds a Normal cell for NASNet-A (Fig. 4 in the paper).
+
+  Arguments:
+      ip: Input tensor `x`
+      p: Input tensor `p`
+      filters: Number of output filters
+      block_id: String block_id
+
+  Returns:
+      A Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+  with K.name_scope('normal_A_block_%s' % block_id):
+    p = _adjust_block(p, ip, filters, block_id)
+
+    h = Activation('relu')(ip)
+    h = Conv2D(
+        filters, (1, 1),
+        strides=(1, 1),
+        padding='same',
+        name='normal_conv_1_%s' % block_id,
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            h)
+    h = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='normal_bn_1_%s' % block_id)(
+            h)
+
+    with K.name_scope('block_1'):
+      x1_1 = _separable_conv_block(
+          h, filters, kernel_size=(5, 5), block_id='normal_left1_%s' % block_id)
+      x1_2 = _separable_conv_block(
+          p, filters, block_id='normal_right1_%s' % block_id)
+      x1 = add([x1_1, x1_2], name='normal_add_1_%s' % block_id)
+
+    with K.name_scope('block_2'):
+      x2_1 = _separable_conv_block(
+          p, filters, (5, 5), block_id='normal_left2_%s' % block_id)
+      x2_2 = _separable_conv_block(
+          p, filters, (3, 3), block_id='normal_right2_%s' % block_id)
+      x2 = add([x2_1, x2_2], name='normal_add_2_%s' % block_id)
+
+    with K.name_scope('block_3'):
+      x3 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='normal_left3_%s' % (block_id))(
+              h)
+      x3 = add([x3, p], name='normal_add_3_%s' % block_id)
+
+    with K.name_scope('block_4'):
+      x4_1 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='normal_left4_%s' % (block_id))(
+              p)
+      x4_2 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='normal_right4_%s' % (block_id))(
+              p)
+      x4 = add([x4_1, x4_2], name='normal_add_4_%s' % block_id)
+
+    with K.name_scope('block_5'):
+      x5 = _separable_conv_block(
+          h, filters, block_id='normal_left5_%s' % block_id)
+      x5 = add([x5, h], name='normal_add_5_%s' % block_id)
+
+    x = concatenate(
+        [p, x1, x2, x3, x4, x5],
+        axis=channel_dim,
+        name='normal_concat_%s' % block_id)
+  return x, ip
+
+
+def _reduction_a_cell(ip, p, filters, block_id=None):
+  """Adds a Reduction cell for NASNet-A (Fig. 4 in the paper).
+
+  Arguments:
+      ip: Input tensor `x`
+      p: Input tensor `p`
+      filters: Number of output filters
+      block_id: String block_id
+
+  Returns:
+      A Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+  with K.name_scope('reduction_A_block_%s' % block_id):
+    p = _adjust_block(p, ip, filters, block_id)
+
+    h = Activation('relu')(ip)
+    h = Conv2D(
+        filters, (1, 1),
+        strides=(1, 1),
+        padding='same',
+        name='reduction_conv_1_%s' % block_id,
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            h)
+    h = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='reduction_bn_1_%s' % block_id)(
+            h)
+
+    with K.name_scope('block_1'):
+      x1_1 = _separable_conv_block(
+          h,
+          filters, (5, 5),
+          strides=(2, 2),
+          block_id='reduction_left1_%s' % block_id)
+      x1_2 = _separable_conv_block(
+          p,
+          filters, (7, 7),
+          strides=(2, 2),
+          block_id='reduction_1_%s' % block_id)
+      x1 = add([x1_1, x1_2], name='reduction_add_1_%s' % block_id)
+
+    with K.name_scope('block_2'):
+      x2_1 = MaxPooling2D(
+          (3, 3),
+          strides=(2, 2),
+          padding='same',
+          name='reduction_left2_%s' % block_id)(
+              h)
+      x2_2 = _separable_conv_block(
+          p,
+          filters, (7, 7),
+          strides=(2, 2),
+          block_id='reduction_right2_%s' % block_id)
+      x2 = add([x2_1, x2_2], name='reduction_add_2_%s' % block_id)
+
+    with K.name_scope('block_3'):
+      x3_1 = AveragePooling2D(
+          (3, 3),
+          strides=(2, 2),
+          padding='same',
+          name='reduction_left3_%s' % block_id)(
+              h)
+      x3_2 = _separable_conv_block(
+          p,
+          filters, (5, 5),
+          strides=(2, 2),
+          block_id='reduction_right3_%s' % block_id)
+      x3 = add([x3_1, x3_2], name='reduction_add3_%s' % block_id)
+
+    with K.name_scope('block_4'):
+      x4 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='reduction_left4_%s' % block_id)(
+              x1)
+      x4 = add([x2, x4])
+
+    with K.name_scope('block_5'):
+      x5_1 = _separable_conv_block(
+          x1, filters, (3, 3), block_id='reduction_left4_%s' % block_id)
+      x5_2 = MaxPooling2D(
+          (3, 3),
+          strides=(2, 2),
+          padding='same',
+          name='reduction_right5_%s' % block_id)(
+              h)
+      x5 = add([x5_1, x5_2], name='reduction_add4_%s' % block_id)
+
+    x = concatenate(
+        [x2, x3, x4, x5],
+        axis=channel_dim,
+        name='reduction_concat_%s' % block_id)
+    return x, ip
diff --git a/tensorflow/python/keras/_impl/keras/applications/nasnet_test.py b/tensorflow/python/keras/_impl/keras/applications/nasnet_test.py
new file mode 100644
index 0000000000..aa1dec670c
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/nasnet_test.py
@@ -0,0 +1,76 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Nasnet application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.platform import test
+
+
+class NASNetMobileTest(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.NASNetMobile(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.NASNetMobile(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1056))
+
+  def test_with_pooling(self):
+    model = keras.applications.NASNetMobile(weights=None,
+                                            include_top=False,
+                                            pooling='avg')
+    self.assertEqual(model.output_shape, (None, 1056))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetMobile(weights='unknown',
+                                      include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetMobile(weights='imagenet',
+                                      classes=2000)
+
+
+class NASNetLargeTest(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.NASNetLarge(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.NASNetLarge(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 4032))
+
+  def test_with_pooling(self):
+    model = keras.applications.NASNetLarge(weights=None,
+                                           include_top=False,
+                                           pooling='avg')
+    self.assertEqual(model.output_shape, (None, 4032))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetLarge(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetLarge(weights='imagenet',
+                                     classes=2000)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index 8ab46693aa..5705b3481a 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """ResNet50 model for Keras.
 
 # Reference:
@@ -31,8 +32,8 @@ import os
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
@@ -45,7 +46,9 @@ from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
+from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5'
@@ -78,7 +81,8 @@ def identity_block(input_tensor, kernel_size, filters, stage, block):
   x = Activation('relu')(x)
 
   x = Conv2D(
-      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(x)
+      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(
+          x)
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
   x = Activation('relu')(x)
 
@@ -92,7 +96,7 @@ def identity_block(input_tensor, kernel_size, filters, stage, block):
 
 def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
                                                                           2)):
-  """conv_block is the block that has a conv layer at shortcut.
+  """A block that has a conv layer at shortcut.
 
   Arguments:
       input_tensor: input tensor
@@ -100,14 +104,14 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
       filters: list of integers, the filters of 3 conv layer at main path
       stage: integer, current stage label, used for generating layer names
       block: 'a','b'..., current block label, used for generating layer names
-      strides: Tuple of integers.
+      strides: Strides for the first conv layer in the block.
 
   Returns:
       Output tensor for the block.
 
-  Note that from stage 3, the first conv layer at main path is with
-  strides=(2,2)
-  And the shortcut should have strides=(2,2) as well
+  Note that from stage 3,
+  the first conv layer at main path is with strides=(2, 2)
+  And the shortcut should have strides=(2, 2) as well
   """
   filters1, filters2, filters3 = filters
   if K.image_data_format() == 'channels_last':
@@ -118,13 +122,14 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
   bn_name_base = 'bn' + str(stage) + block + '_branch'
 
   x = Conv2D(
-      filters1, (1, 1), strides=strides,
-      name=conv_name_base + '2a')(input_tensor)
+      filters1, (1, 1), strides=strides, name=conv_name_base + '2a')(
+          input_tensor)
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
   x = Activation('relu')(x)
 
   x = Conv2D(
-      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(x)
+      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(
+          x)
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
   x = Activation('relu')(x)
 
@@ -132,8 +137,8 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
 
   shortcut = Conv2D(
-      filters3, (1, 1), strides=strides,
-      name=conv_name_base + '1')(input_tensor)
+      filters3, (1, 1), strides=strides, name=conv_name_base + '1')(
+          input_tensor)
   shortcut = BatchNormalization(axis=bn_axis, name=bn_name_base + '1')(shortcut)
 
   x = layers.add([x, shortcut])
@@ -152,7 +157,7 @@ def ResNet50(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
 
   The model and the weights are compatible with both
@@ -164,15 +169,15 @@ def ResNet50(include_top=True,
       include_top: whether to include the fully-connected
           layer at the top of the network.
       weights: one of `None` (random initialization),
-          'imagenet' (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
           or `(3, 224, 224)` (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 197.
           E.g. `(200, 200, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -219,15 +224,18 @@ def ResNet50(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
-
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
   if K.image_data_format() == 'channels_last':
     bn_axis = 3
   else:
     bn_axis = 1
 
-  x = Conv2D(64, (7, 7),
-             strides=(2, 2), padding='same', name='conv1')(img_input)
+  x = Conv2D(
+      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(
+          img_input)
   x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
   x = Activation('relu')(x)
   x = MaxPooling2D((3, 3), strides=(2, 2))(x)
@@ -289,4 +297,5 @@ def ResNet50(include_top=True,
     model.load_weights(weights_path)
   elif weights is not None:
     model.load_weights(weights)
+
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg16.py b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
index 38dbbdc809..c91c24e6fb 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """VGG16 model for Keras.
 
 # Reference
@@ -29,8 +30,8 @@ import os
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Conv2D
 from tensorflow.python.keras._impl.keras.layers import Dense
@@ -42,6 +43,7 @@ from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5'
@@ -59,7 +61,7 @@ def VGG16(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
 
   The model and the weights are compatible with both
@@ -71,8 +73,8 @@ def VGG16(include_top=True,
       include_top: whether to include the 3 fully-connected
           layers at the top of the network.
       weights: one of `None` (random initialization),
-          'imagenet' (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -125,48 +127,62 @@ def VGG16(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
-
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
   # Block 1
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same',
-      name='block1_conv1')(img_input)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv1')(
+          img_input)
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
 
   # Block 2
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(
+          x)
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
 
   # Block 3
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
 
   # Block 4
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
 
   # Block 5
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)
 
   if include_top:
@@ -215,6 +231,8 @@ def VGG16(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
+
   elif weights is not None:
     model.load_weights(weights)
+
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg19.py b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
index 126c64260b..223cd79d7b 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """VGG19 model for Keras.
 
 # Reference
@@ -29,8 +30,8 @@ import os
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Conv2D
 from tensorflow.python.keras._impl.keras.layers import Dense
@@ -42,6 +43,7 @@ from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels.h5'
@@ -59,7 +61,7 @@ def VGG19(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
 
   The model and the weights are compatible with both
@@ -71,15 +73,15 @@ def VGG19(include_top=True,
       include_top: whether to include the 3 fully-connected
           layers at the top of the network.
       weights: one of `None` (random initialization),
-         'imagenet' (pre-training on ImageNet),
-         or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
           or `(3, 224, 224)` (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 48.
           E.g. `(200, 200, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -125,54 +127,71 @@ def VGG19(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
-
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
   # Block 1
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same',
-      name='block1_conv1')(img_input)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv1')(
+          img_input)
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
 
   # Block 2
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(
+          x)
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
 
   # Block 3
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv4')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv4')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
 
   # Block 4
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv4')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv4')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
 
   # Block 5
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv4')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv4')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)
 
   if include_top:
@@ -211,6 +230,8 @@ def VGG19(include_top=True,
           cache_subdir='models',
           file_hash='253f8cb515780f3b799900260a226db6')
     model.load_weights(weights_path)
+    if K.backend() == 'theano':
+      layer_utils.convert_all_kernels_in_model(model)
 
     if K.image_data_format() == 'channels_first':
       if include_top:
@@ -219,6 +240,8 @@ def VGG19(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
+
   elif weights is not None:
     model.load_weights(weights)
+
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/xception.py b/tensorflow/python/keras/_impl/keras/applications/xception.py
index 8219831408..0a6eb4953a 100644
--- a/tensorflow/python/keras/_impl/keras/applications/xception.py
+++ b/tensorflow/python/keras/_impl/keras/applications/xception.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """Xception V1 model for Keras.
 
 On ImageNet, this model gets to a top-1 validation accuracy of 0.790
@@ -42,7 +43,7 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
@@ -74,7 +75,7 @@ def Xception(include_top=True,
   on ImageNet. This model is available for TensorFlow only,
   and can only be used with inputs following the TensorFlow
   data format `(width, height, channels)`.
-  You should set `image_data_format="channels_last"` in your Keras config
+  You should set `image_data_format='channels_last'` in your Keras config
   located at ~/.keras/keras.json.
 
   Note that the default input image size for this model is 299x299.
@@ -83,14 +84,14 @@ def Xception(include_top=True,
       include_top: whether to include the fully-connected
           layer at the top of the network.
       weights: one of `None` (random initialization),
-          'imagenet' (pre-training on ImageNet),
-          or the path to the weights file to be loaded.
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(299, 299, 3)`.
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 71.
           E.g. `(150, 150, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -155,11 +156,14 @@ def Xception(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
 
   x = Conv2D(
-      32, (3, 3), strides=(2, 2), use_bias=False,
-      name='block1_conv1')(img_input)
+      32, (3, 3), strides=(2, 2), use_bias=False, name='block1_conv1')(
+          img_input)
   x = BatchNormalization(name='block1_conv1_bn')(x)
   x = Activation('relu', name='block1_conv1_act')(x)
   x = Conv2D(64, (3, 3), use_bias=False, name='block1_conv2')(x)
@@ -167,53 +171,65 @@ def Xception(include_top=True,
   x = Activation('relu', name='block1_conv2_act')(x)
 
   residual = Conv2D(
-      128, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      128, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = SeparableConv2D(
-      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv1')(x)
+      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv1')(
+          x)
   x = BatchNormalization(name='block2_sepconv1_bn')(x)
   x = Activation('relu', name='block2_sepconv2_act')(x)
   x = SeparableConv2D(
-      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv2')(x)
+      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv2')(
+          x)
   x = BatchNormalization(name='block2_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block2_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block2_pool')(
+          x)
   x = layers.add([x, residual])
 
   residual = Conv2D(
-      256, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      256, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = Activation('relu', name='block3_sepconv1_act')(x)
   x = SeparableConv2D(
-      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv1')(x)
+      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv1')(
+          x)
   x = BatchNormalization(name='block3_sepconv1_bn')(x)
   x = Activation('relu', name='block3_sepconv2_act')(x)
   x = SeparableConv2D(
-      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv2')(x)
+      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv2')(
+          x)
   x = BatchNormalization(name='block3_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block3_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block3_pool')(
+          x)
   x = layers.add([x, residual])
 
   residual = Conv2D(
-      728, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      728, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = Activation('relu', name='block4_sepconv1_act')(x)
   x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv1')(x)
+      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv1')(
+          x)
   x = BatchNormalization(name='block4_sepconv1_bn')(x)
   x = Activation('relu', name='block4_sepconv2_act')(x)
   x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv2')(x)
+      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv2')(
+          x)
   x = BatchNormalization(name='block4_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block4_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block4_pool')(
+          x)
   x = layers.add([x, residual])
 
   for i in range(8):
@@ -222,46 +238,52 @@ def Xception(include_top=True,
 
     x = Activation('relu', name=prefix + '_sepconv1_act')(x)
     x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False,
-        name=prefix + '_sepconv1')(x)
+        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv1')(
+            x)
     x = BatchNormalization(name=prefix + '_sepconv1_bn')(x)
     x = Activation('relu', name=prefix + '_sepconv2_act')(x)
     x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False,
-        name=prefix + '_sepconv2')(x)
+        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv2')(
+            x)
     x = BatchNormalization(name=prefix + '_sepconv2_bn')(x)
     x = Activation('relu', name=prefix + '_sepconv3_act')(x)
     x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False,
-        name=prefix + '_sepconv3')(x)
+        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv3')(
+            x)
     x = BatchNormalization(name=prefix + '_sepconv3_bn')(x)
 
     x = layers.add([x, residual])
 
   residual = Conv2D(
-      1024, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      1024, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = Activation('relu', name='block13_sepconv1_act')(x)
   x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block13_sepconv1')(x)
+      728, (3, 3), padding='same', use_bias=False, name='block13_sepconv1')(
+          x)
   x = BatchNormalization(name='block13_sepconv1_bn')(x)
   x = Activation('relu', name='block13_sepconv2_act')(x)
   x = SeparableConv2D(
-      1024, (3, 3), padding='same', use_bias=False, name='block13_sepconv2')(x)
+      1024, (3, 3), padding='same', use_bias=False, name='block13_sepconv2')(
+          x)
   x = BatchNormalization(name='block13_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block13_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block13_pool')(
+          x)
   x = layers.add([x, residual])
 
   x = SeparableConv2D(
-      1536, (3, 3), padding='same', use_bias=False, name='block14_sepconv1')(x)
+      1536, (3, 3), padding='same', use_bias=False, name='block14_sepconv1')(
+          x)
   x = BatchNormalization(name='block14_sepconv1_bn')(x)
   x = Activation('relu', name='block14_sepconv1_act')(x)
 
   x = SeparableConv2D(
-      2048, (3, 3), padding='same', use_bias=False, name='block14_sepconv2')(x)
+      2048, (3, 3), padding='same', use_bias=False, name='block14_sepconv2')(
+          x)
   x = BatchNormalization(name='block14_sepconv2_bn')(x)
   x = Activation('relu', name='block14_sepconv2_act')(x)
 
@@ -303,8 +325,6 @@ def Xception(include_top=True,
 
   if old_data_format:
     K.set_image_data_format(old_data_format)
-  elif weights is not None:
-    model.load_weights(weights)
   return model
 
 
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 9476085bd8..460c0dc5f3 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -85,7 +85,7 @@ _MANUAL_VAR_INIT = False
 _FLOATX = 'float32'
 
 # Epsilon fuzz factor used throughout the codebase.
-_EPSILON = 10e-8
+_EPSILON = 1e-7
 
 # Default image data format, one of "channels_last", "channels_first".
 _IMAGE_DATA_FORMAT = 'channels_last'
@@ -116,7 +116,7 @@ def epsilon():
   Example:
   ```python
       >>> keras.backend.epsilon()
-      1e-08
+      1e-07
   ```
   """
   return _EPSILON
@@ -132,7 +132,7 @@ def set_epsilon(value):
   ```python
       >>> from keras import backend as K
       >>> K.epsilon()
-      1e-08
+      1e-07
       >>> K.set_epsilon(1e-05)
       >>> K.epsilon()
       1e-05
@@ -295,7 +295,8 @@ def clear_session():
   ops.reset_default_graph()
   reset_uids()
   _SESSION = None
-  phase = array_ops.placeholder(dtype='bool', name='keras_learning_phase')
+  phase = array_ops.placeholder_with_default(
+      False, shape=(), name='keras_learning_phase')
   _GRAPH_LEARNING_PHASES = {}
   _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = phase
 
@@ -328,7 +329,8 @@ def learning_phase():
   """
   graph = ops.get_default_graph()
   if graph not in _GRAPH_LEARNING_PHASES:
-    phase = array_ops.placeholder(dtype='bool', name='keras_learning_phase')
+    phase = array_ops.placeholder_with_default(
+        False, shape=(), name='keras_learning_phase')
     _GRAPH_LEARNING_PHASES[graph] = phase
   return _GRAPH_LEARNING_PHASES[graph]
 
@@ -876,6 +878,8 @@ def zeros(shape, dtype=None, name=None):
 
   Returns:
       A variable (including Keras metadata), filled with `0.0`.
+      Note that if `shape` was symbolic, we cannot return a variable,
+      and will return a dynamically-shaped tensor instead.
 
   Example:
   ```python
@@ -890,12 +894,14 @@ def zeros(shape, dtype=None, name=None):
   if dtype is None:
     dtype = floatx()
   tf_dtype = dtypes_module.as_dtype(dtype)
-  return variable(
-      init_ops.constant_initializer(0., dtype=tf_dtype)(shape), dtype, name)
+  v = array_ops.zeros(shape=shape, dtype=tf_dtype, name=name)
+  if py_all(v.get_shape().as_list()):
+    return variable(v, dtype=dtype, name=name)
+  return v
 
 
 def ones(shape, dtype=None, name=None):
-  """Instantiates an all-ones tensor variable and returns it.
+  """Instantiates an all-ones variable and returns it.
 
   Arguments:
       shape: Tuple of integers, shape of returned Keras variable.
@@ -904,6 +910,8 @@ def ones(shape, dtype=None, name=None):
 
   Returns:
       A Keras variable, filled with `1.0`.
+      Note that if `shape` was symbolic, we cannot return a variable,
+      and will return a dynamically-shaped tensor instead.
 
   Example:
   ```python
@@ -918,8 +926,10 @@ def ones(shape, dtype=None, name=None):
   if dtype is None:
     dtype = floatx()
   tf_dtype = dtypes_module.as_dtype(dtype)
-  return variable(
-      init_ops.constant_initializer(1., dtype=tf_dtype)(shape), dtype, name)
+  v = array_ops.ones(shape=shape, dtype=tf_dtype, name=name)
+  if py_all(v.get_shape().as_list()):
+    return variable(v, dtype=dtype, name=name)
+  return v
 
 
 def eye(size, dtype=None, name=None):
@@ -1185,7 +1195,7 @@ def moving_average_update(x, value, momentum):
       An Operation to update the variable.
   """
   return moving_averages.assign_moving_average(
-      x, value, momentum, zero_debias=False)
+      x, value, momentum, zero_debias=True)
 
 
 # LINEAR ALGEBRA
@@ -1419,7 +1429,7 @@ def max(x, axis=None, keepdims=False):
   Returns:
       A tensor with maximum values of `x`.
   """
-  return math_ops.reduce_max(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_max(x, axis, keepdims)
 
 
 def min(x, axis=None, keepdims=False):
@@ -1436,7 +1446,7 @@ def min(x, axis=None, keepdims=False):
   Returns:
       A tensor with miminum values of `x`.
   """
-  return math_ops.reduce_min(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_min(x, axis, keepdims)
 
 
 def sum(x, axis=None, keepdims=False):
@@ -1453,7 +1463,7 @@ def sum(x, axis=None, keepdims=False):
   Returns:
       A tensor with sum of `x`.
   """
-  return math_ops.reduce_sum(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_sum(x, axis, keepdims)
 
 
 def prod(x, axis=None, keepdims=False):
@@ -1470,7 +1480,7 @@ def prod(x, axis=None, keepdims=False):
   Returns:
       A tensor with the product of elements of `x`.
   """
-  return math_ops.reduce_prod(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_prod(x, axis, keepdims)
 
 
 def cumsum(x, axis=0):
@@ -1515,10 +1525,10 @@ def var(x, axis=None, keepdims=False):
   """
   if x.dtype.base_dtype == dtypes_module.bool:
     x = math_ops.cast(x, floatx())
-  m = math_ops.reduce_mean(x, axis=axis, keep_dims=True)
+  m = math_ops.reduce_mean(x, axis, True)
   devs_squared = math_ops.square(x - m)
   return math_ops.reduce_mean(
-      devs_squared, axis=axis, keep_dims=keepdims)
+      devs_squared, axis, keepdims)
 
 
 def std(x, axis=None, keepdims=False):
@@ -1546,7 +1556,7 @@ def mean(x, axis=None, keepdims=False):
       axis: A list of integer. Axes to compute the mean.
       keepdims: A boolean, whether to keep the dimensions or not.
           If `keepdims` is `False`, the rank of the tensor is reduced
-          by 1 for each entry in `axis`. If `keep_dims` is `True`,
+          by 1 for each entry in `axis`. If `keepdims` is `True`,
           the reduced dimensions are retained with length 1.
 
   Returns:
@@ -1554,7 +1564,7 @@ def mean(x, axis=None, keepdims=False):
   """
   if x.dtype.base_dtype == dtypes_module.bool:
     x = math_ops.cast(x, floatx())
-  return math_ops.reduce_mean(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_mean(x, axis, keepdims)
 
 
 def any(x, axis=None, keepdims=False):
@@ -1569,7 +1579,7 @@ def any(x, axis=None, keepdims=False):
       A uint8 tensor (0s and 1s).
   """
   x = math_ops.cast(x, dtypes_module.bool)
-  return math_ops.reduce_any(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_any(x, axis, keepdims)
 
 
 def all(x, axis=None, keepdims=False):
@@ -1584,7 +1594,7 @@ def all(x, axis=None, keepdims=False):
       A uint8 tensor (0s and 1s).
   """
   x = math_ops.cast(x, dtypes_module.bool)
-  return math_ops.reduce_all(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_all(x, axis, keepdims)
 
 
 def argmax(x, axis=-1):
@@ -1694,7 +1704,7 @@ def logsumexp(x, axis=None, keepdims=False):
   Returns:
       The reduced tensor.
   """
-  return math_ops.reduce_logsumexp(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_logsumexp(x, axis, keepdims)
 
 
 def round(x):
@@ -1884,6 +1894,108 @@ def cos(x):
   return math_ops.cos(x)
 
 
+def _regular_normalize_batch_in_training(x,
+                                         gamma,
+                                         beta,
+                                         reduction_axes,
+                                         epsilon=1e-3):
+  """Non-fused version of `normalize_batch_in_training`.
+
+  Arguments:
+      x: Input tensor or variable.
+      gamma: Tensor by which to scale the input.
+      beta: Tensor with which to center the input.
+      reduction_axes: iterable of integers,
+          axes over which to normalize.
+      epsilon: Fuzz factor.
+
+  Returns:
+      A tuple length of 3, `(normalized_tensor, mean, variance)`.
+  """
+  mean, var = nn.moments(x, reduction_axes, None, None, False)
+  normed = nn.batch_normalization(x, mean, var, beta, gamma, epsilon)
+  return normed, mean, var
+
+
+def _broadcast_normalize_batch_in_training(x,
+                                           gamma,
+                                           beta,
+                                           reduction_axes,
+                                           epsilon=1e-3):
+  """Non-fused, broadcast version of `normalize_batch_in_training`.
+
+  Arguments:
+      x: Input tensor or variable.
+      gamma: Tensor by which to scale the input.
+      beta: Tensor with which to center the input.
+      reduction_axes: iterable of integers,
+          axes over which to normalize.
+      epsilon: Fuzz factor.
+
+  Returns:
+      A tuple length of 3, `(normalized_tensor, mean, variance)`.
+  """
+  mean, var = nn.moments(x, reduction_axes, None, None, False)
+  target_shape = []
+  for axis in range(ndim(x)):
+    if axis in reduction_axes:
+      target_shape.append(1)
+    else:
+      target_shape.append(array_ops.shape(x)[axis])
+  target_shape = array_ops.stack(target_shape)
+
+  broadcast_mean = array_ops.reshape(mean, target_shape)
+  broadcast_var = array_ops.reshape(var, target_shape)
+  if gamma is None:
+    broadcast_gamma = None
+  else:
+    broadcast_gamma = array_ops.reshape(gamma, target_shape)
+  if beta is None:
+    broadcast_beta = None
+  else:
+    broadcast_beta = array_ops.reshape(beta, target_shape)
+
+  normed = nn.batch_normalization(x, broadcast_mean, broadcast_var,
+                                  broadcast_beta, broadcast_gamma, epsilon)
+  return normed, mean, var
+
+
+def _fused_normalize_batch_in_training(x,
+                                       gamma,
+                                       beta,
+                                       reduction_axes,
+                                       epsilon=1e-3):
+  """Fused version of `normalize_batch_in_training`.
+
+  Arguments:
+      x: Input tensor or variable.
+      gamma: Tensor by which to scale the input.
+      beta: Tensor with which to center the input.
+      reduction_axes: iterable of integers,
+          axes over which to normalize.
+      epsilon: Fuzz factor.
+
+  Returns:
+      A tuple length of 3, `(normalized_tensor, mean, variance)`.
+  """
+  if list(reduction_axes) == [0, 1, 2]:
+    normalization_axis = 3
+    tf_data_format = 'NHWC'
+  else:
+    normalization_axis = 1
+    tf_data_format = 'NCHW'
+
+  if gamma is None:
+    gamma = constant_op.constant(
+        1.0, dtype=x.dtype, shape=[x.get_shape()[normalization_axis]])
+  if beta is None:
+    beta = constant_op.constant(
+        0.0, dtype=x.dtype, shape=[x.get_shape()[normalization_axis]])
+
+  return nn.fused_batch_norm(
+      x, gamma, beta, epsilon=epsilon, data_format=tf_data_format)
+
+
 def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
   """Computes mean and std for batch then apply batch_normalization on batch.
 
@@ -1898,33 +2010,19 @@ def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
   Returns:
       A tuple length of 3, `(normalized_tensor, mean, variance)`.
   """
-  mean, var = nn.moments(
-      x, reduction_axes, shift=None, name=None, keep_dims=False)
-  if sorted(reduction_axes) == list(range(ndim(x)))[:-1]:
-    normed = nn.batch_normalization(x, mean, var, beta, gamma, epsilon)
+  if ndim(x) == 4 and list(reduction_axes) in [[0, 1, 2], [0, 2, 3]]:
+    if not _has_nchw_support() and list(reduction_axes) == [0, 2, 3]:
+      return _broadcast_normalize_batch_in_training(
+          x, gamma, beta, reduction_axes, epsilon=epsilon)
+    return _fused_normalize_batch_in_training(
+        x, gamma, beta, reduction_axes, epsilon=epsilon)
   else:
-    # need broadcasting
-    target_shape = []
-    for axis in range(ndim(x)):
-      if axis in reduction_axes:
-        target_shape.append(1)
-      else:
-        target_shape.append(array_ops.shape(x)[axis])
-    target_shape = array_ops.stack(target_shape)
-
-    broadcast_mean = array_ops.reshape(mean, target_shape)
-    broadcast_var = array_ops.reshape(var, target_shape)
-    if gamma is None:
-      broadcast_gamma = None
+    if sorted(reduction_axes) == list(range(ndim(x)))[:-1]:
+      return _regular_normalize_batch_in_training(
+          x, gamma, beta, reduction_axes, epsilon=epsilon)
     else:
-      broadcast_gamma = array_ops.reshape(gamma, target_shape)
-    if beta is None:
-      broadcast_beta = None
-    else:
-      broadcast_beta = array_ops.reshape(beta, target_shape)
-    normed = nn.batch_normalization(x, broadcast_mean, broadcast_var,
-                                    broadcast_beta, broadcast_gamma, epsilon)
-  return normed, mean, var
+      return _broadcast_normalize_batch_in_training(
+          x, gamma, beta, reduction_axes, epsilon=epsilon)
 
 
 def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
@@ -2619,7 +2717,8 @@ def rnn(step_function,
         go_backwards=False,
         mask=None,
         constants=None,
-        unroll=False):
+        unroll=False,
+        input_length=None):
   """Iterates over the time dimension of a tensor.
 
   Arguments:
@@ -2648,6 +2747,7 @@ def rnn(step_function,
       constants: a list of constant values passed at each step.
       unroll: whether to unroll the RNN or to use a symbolic loop
           (`while_loop` or `scan` depending on backend).
+      input_length: Unused; exists for API compatibility.
 
   Returns:
       A tuple, `(last_output, outputs, new_states)`.
@@ -2665,6 +2765,7 @@ def rnn(step_function,
       ValueError: if `mask` is provided (not `None`) but states is not provided
           (`len(states)` == 0).
   """
+  del input_length
   ndim = len(inputs.get_shape())
   if ndim < 3:
     raise ValueError('Input should be at least 3D.')
@@ -3016,7 +3117,7 @@ def elu(x, alpha=1.):
 
   Arguments:
       x: A tensor or variable to compute the activation function for.
-      alpha: A scalar, slope of positive section.
+      alpha: A scalar, slope of negative section.
 
   Returns:
       A tensor.
@@ -3083,7 +3184,7 @@ def categorical_crossentropy(target, output, from_logits=False):
   if not from_logits:
     # scale preds so that the class probas of each sample sum to 1
     output /= math_ops.reduce_sum(
-        output, axis=len(output.get_shape()) - 1, keep_dims=True)
+        output, len(output.get_shape()) - 1, True)
     # manual computation of crossentropy
     epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
     output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
@@ -3248,6 +3349,25 @@ def in_top_k(predictions, targets, k):
 # CONVOLUTIONS
 
 
+def _preprocess_conv1d_input(x, data_format):
+  """Transpose and cast the input before the conv1d.
+
+  Arguments:
+      x: input tensor.
+      data_format: string, `"channels_last"` or `"channels_first"`.
+
+  Returns:
+      A tensor.
+  """
+  tf_data_format = 'NHWC'  # to pass TF Conv2dNative operations
+  if data_format == 'channels_first':
+    if not _has_nchw_support():
+      x = array_ops.transpose(x, (0, 2, 1))  # NCW -> NWC
+    else:
+      tf_data_format = 'NCHW'
+  return x, tf_data_format
+
+
 def _preprocess_conv2d_input(x, data_format):
   """Transpose and cast the input before the conv2d.
 
@@ -3461,6 +3581,66 @@ def conv2d_transpose(x,
   return x
 
 
+def separable_conv1d(x,
+                     depthwise_kernel,
+                     pointwise_kernel,
+                     strides=1,
+                     padding='valid',
+                     data_format=None,
+                     dilation_rate=1):
+  """1D convolution with separable filters.
+
+  Arguments:
+      x: input tensor
+      depthwise_kernel: convolution kernel for the depthwise convolution.
+      pointwise_kernel: kernel for the 1x1 convolution.
+      strides: stride integer.
+      padding: string, `"same"` or `"valid"`.
+      data_format: string, `"channels_last"` or `"channels_first"`.
+      dilation_rate: integer dilation rate.
+
+  Returns:
+      Output tensor.
+
+  Raises:
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  x, tf_data_format = _preprocess_conv1d_input(x, data_format)
+  padding = _preprocess_padding(padding)
+  if tf_data_format == 'NHWC':
+    spatial_start_dim = 1
+    strides = (1, 1) + strides + (1,)
+  else:
+    spatial_start_dim = 2
+    strides = (1, 1, 1) + strides
+  x = array_ops.expand_dims(x, spatial_start_dim)
+  depthwise_kernel = array_ops.expand_dims(depthwise_kernel, 0)
+  pointwise_kernel = array_ops.expand_dims(pointwise_kernel, 0)
+  dilation_rate = (1,) + dilation_rate
+
+  x = nn.separable_conv2d(
+      x,
+      depthwise_kernel,
+      pointwise_kernel,
+      strides=strides,
+      padding=padding,
+      rate=dilation_rate,
+      data_format=tf_data_format)
+
+  x = array_ops.squeeze(x, [spatial_start_dim])
+
+  if data_format == 'channels_first' and tf_data_format == 'NHWC':
+    x = array_ops.transpose(x, (0, 2, 1))  # NWC -> NCW
+
+  return x
+
+
 def separable_conv2d(x,
                      depthwise_kernel,
                      pointwise_kernel,
@@ -3921,7 +4101,10 @@ def bias_add(x, bias, data_format=None):
   elif ndim(x) == 4:
     if data_format == 'channels_first':
       if len(bias_shape) == 1:
-        x += reshape(bias, (1, bias_shape[0], 1, 1))
+        if _has_nchw_support():
+          x = nn.bias_add(x, bias, data_format='NCHW')
+        else:
+          x += reshape(bias, (1, bias_shape[0], 1, 1))
       else:
         x += reshape(bias, (1, bias_shape[2]) + bias_shape[:2])
     elif data_format == 'channels_last':
@@ -4113,7 +4296,7 @@ def ctc_batch_cost(y_true, y_pred, input_length, label_length):
   sparse_labels = math_ops.to_int32(
       ctc_label_dense_to_sparse(y_true, label_length))
 
-  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + epsilon())
 
   return array_ops.expand_dims(
       ctc.ctc_loss(
@@ -4148,7 +4331,7 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
           Tensor `(top_paths, )` that contains
               the log probability of each decoded sequence.
   """
-  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + epsilon())
   input_length = math_ops.to_int32(input_length)
 
   if greedy:
diff --git a/tensorflow/python/keras/_impl/keras/backend_test.py b/tensorflow/python/keras/_impl/keras/backend_test.py
index e34f1b6926..27833e368d 100644
--- a/tensorflow/python/keras/_impl/keras/backend_test.py
+++ b/tensorflow/python/keras/_impl/keras/backend_test.py
@@ -954,7 +954,6 @@ class BackendNNOpsTest(test.TestCase):
     x = keras.backend.variable(val)
     reduction_axes = (0, 2, 3)
 
-    # case: need broadcasting
     g_val = np.random.random((3,))
     b_val = np.random.random((3,))
     gamma = keras.backend.variable(g_val)
@@ -965,17 +964,6 @@ class BackendNNOpsTest(test.TestCase):
     self.assertEqual(mean.get_shape().as_list(), [3,])
     self.assertEqual(var.get_shape().as_list(), [3,])
 
-    # case: doesn't need broadcasting
-    g_val = np.random.random((1, 3, 1, 1))
-    b_val = np.random.random((1, 3, 1, 1))
-    gamma = keras.backend.variable(g_val)
-    beta = keras.backend.variable(b_val)
-    normed, mean, var = keras.backend.normalize_batch_in_training(
-        x, gamma, beta, reduction_axes, epsilon=1e-3)
-    self.assertEqual(normed.get_shape().as_list(), [10, 3, 10, 10])
-    self.assertEqual(mean.get_shape().as_list(), [3,])
-    self.assertEqual(var.get_shape().as_list(), [3,])
-
     # case: gamma=None
     gamma = None
     normed, mean, var = keras.backend.normalize_batch_in_training(
diff --git a/tensorflow/python/keras/_impl/keras/callbacks.py b/tensorflow/python/keras/_impl/keras/callbacks.py
index 8da3b85718..f0d9e0b0f5 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks.py
+++ b/tensorflow/python/keras/_impl/keras/callbacks.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras callbacks: utilities called at certain points during model training.
+# pylint: disable=g-import-not-at-top
+"""Callbacks: utilities called at certain points during model training.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -36,12 +37,10 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary as tf_summary
 
 
-# pylint: disable=g-import-not-at-top
 try:
   import requests
 except ImportError:
   requests = None
-# pylint: enable=g-import-not-at-top
 
 
 class CallbackList(object):
@@ -109,9 +108,9 @@ class CallbackList(object):
     delta_t_median = np.median(self._delta_ts_batch_begin)
     if (self._delta_t_batch > 0. and
         delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
-      logging.warning(
-          'Method on_batch_begin() is slow compared '
-          'to the batch update (%f). Check your callbacks.' % delta_t_median)
+      logging.warning('Method on_batch_begin() is slow compared '
+                      'to the batch update (%f). Check your callbacks.',
+                      delta_t_median)
     self._t_enter_batch = time.time()
 
   def on_batch_end(self, batch, logs=None):
@@ -132,9 +131,9 @@ class CallbackList(object):
     delta_t_median = np.median(self._delta_ts_batch_end)
     if (self._delta_t_batch > 0. and
         (delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1)):
-      logging.warning(
-          'Method on_batch_end() is slow compared '
-          'to the batch update (%f). Check your callbacks.' % delta_t_median)
+      logging.warning('Method on_batch_end() is slow compared '
+                      'to the batch update (%f). Check your callbacks.',
+                      delta_t_median)
 
   def on_train_begin(self, logs=None):
     """Called at the beginning of training.
@@ -246,7 +245,8 @@ class BaseLogger(Callback):
 
 
 class TerminateOnNaN(Callback):
-  """Callback that terminates training when a NaN loss is encountered."""
+  """Callback that terminates training when a NaN loss is encountered.
+  """
 
   def __init__(self):
     super(TerminateOnNaN, self).__init__()
@@ -396,7 +396,7 @@ class ModelCheckpoint(Callback):
 
     if mode not in ['auto', 'min', 'max']:
       logging.warning('ModelCheckpoint mode %s is unknown, '
-                      'fallback to auto mode.' % mode)
+                      'fallback to auto mode.', (mode), RuntimeWarning)
       mode = 'auto'
 
     if mode == 'min':
@@ -423,11 +423,11 @@ class ModelCheckpoint(Callback):
         current = logs.get(self.monitor)
         if current is None:
           logging.warning('Can save best model only with %s available, '
-                          'skipping.' % (self.monitor))
+                          'skipping.', self.monitor, RuntimeWarning)
         else:
           if self.monitor_op(current, self.best):
             if self.verbose > 0:
-              print('Epoch %05d: %s improved from %0.5f to %0.5f,'
+              print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
                     ' saving model to %s' % (epoch + 1, self.monitor, self.best,
                                              current, filepath))
             self.best = current
@@ -437,11 +437,11 @@ class ModelCheckpoint(Callback):
               self.model.save(filepath, overwrite=True)
           else:
             if self.verbose > 0:
-              print('Epoch %05d: %s did not improve' % (epoch + 1,
-                                                        self.monitor))
+              print('\nEpoch %05d: %s did not improve' % (epoch + 1,
+                                                          self.monitor))
       else:
         if self.verbose > 0:
-          print('Epoch %05d: saving model to %s' % (epoch + 1, filepath))
+          print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
         if self.save_weights_only:
           self.model.save_weights(filepath, overwrite=True)
         else:
@@ -486,7 +486,7 @@ class EarlyStopping(Callback):
 
     if mode not in ['auto', 'min', 'max']:
       logging.warning('EarlyStopping mode %s is unknown, '
-                      'fallback to auto mode.' % mode)
+                      'fallback to auto mode.', mode, RuntimeWarning)
       mode = 'auto'
 
     if mode == 'min':
@@ -514,8 +514,8 @@ class EarlyStopping(Callback):
     current = logs.get(self.monitor)
     if current is None:
       logging.warning('Early stopping conditioned on metric `%s` '
-                      'which is not available. Available metrics are: %s' %
-                      (self.monitor, ','.join(list(logs.keys()))))
+                      'which is not available. Available metrics are: %s',
+                      self.monitor, ','.join(list(logs.keys())), RuntimeWarning)
       return
     if self.monitor_op(current - self.min_delta, self.best):
       self.best = current
@@ -544,8 +544,6 @@ class RemoteMonitor(Callback):
       path: String; path relative to `root` to which the events will be sent.
       field: String; JSON field under which the data will be stored.
       headers: Dictionary; optional custom HTTP headers.
-          Defaults to:
-          `{'Accept': 'application/json', 'Content-Type': 'application/json'}`
   """
 
   def __init__(self,
@@ -554,11 +552,7 @@ class RemoteMonitor(Callback):
                field='data',
                headers=None):
     super(RemoteMonitor, self).__init__()
-    if headers is None:
-      headers = {
-          'Accept': 'application/json',
-          'Content-Type': 'application/json'
-      }
+
     self.root = root
     self.path = path
     self.field = field
@@ -588,11 +582,13 @@ class LearningRateScheduler(Callback):
       schedule: a function that takes an epoch index as input
           (integer, indexed from 0) and returns a new
           learning rate as output (float).
+      verbose: int. 0: quiet, 1: update messages.
   """
 
-  def __init__(self, schedule):
+  def __init__(self, schedule, verbose=0):
     super(LearningRateScheduler, self).__init__()
     self.schedule = schedule
+    self.verbose = verbose
 
   def on_epoch_begin(self, epoch, logs=None):
     if not hasattr(self.model.optimizer, 'lr'):
@@ -602,6 +598,9 @@ class LearningRateScheduler(Callback):
       raise ValueError('The output of the "schedule" function '
                        'should be float.')
     K.set_value(self.model.optimizer.lr, lr)
+    if self.verbose > 0:
+      print('\nEpoch %05d: LearningRateScheduler reducing learning '
+            'rate to %s.' % (epoch + 1, lr))
 
 
 class TensorBoard(Callback):
@@ -842,7 +841,7 @@ class ReduceLROnPlateau(Callback):
     """
     if self.mode not in ['auto', 'min', 'max']:
       logging.warning('Learning Rate Plateau Reducing mode %s is unknown, '
-                      'fallback to auto mode.' % (self.mode))
+                      'fallback to auto mode.', self.mode, RuntimeWarning)
       self.mode = 'auto'
     if (self.mode == 'min' or
         (self.mode == 'auto' and 'acc' not in self.monitor)):
@@ -853,7 +852,6 @@ class ReduceLROnPlateau(Callback):
       self.best = -np.Inf
     self.cooldown_counter = 0
     self.wait = 0
-    self.lr_epsilon = self.min_lr * 1e-4
 
   def on_train_begin(self, logs=None):
     self._reset()
@@ -864,8 +862,9 @@ class ReduceLROnPlateau(Callback):
     current = logs.get(self.monitor)
     if current is None:
       logging.warning('Reduce LR on plateau conditioned on metric `%s` '
-                      'which is not available. Available metrics are: %s' %
-                      (self.monitor, ','.join(list(logs.keys()))))
+                      'which is not available. Available metrics are: %s',
+                      self.monitor, ','.join(list(logs.keys())), RuntimeWarning)
+
     else:
       if self.in_cooldown():
         self.cooldown_counter -= 1
@@ -877,13 +876,13 @@ class ReduceLROnPlateau(Callback):
       elif not self.in_cooldown():
         if self.wait >= self.patience:
           old_lr = float(K.get_value(self.model.optimizer.lr))
-          if old_lr > self.min_lr + self.lr_epsilon:
+          if old_lr > self.min_lr:
             new_lr = old_lr * self.factor
             new_lr = max(new_lr, self.min_lr)
             K.set_value(self.model.optimizer.lr, new_lr)
             if self.verbose > 0:
-              print('\nEpoch %05d: reducing learning rate to %s.' % (epoch,
-                                                                     new_lr))
+              print('\nEpoch %05d: ReduceLROnPlateau reducing learning '
+                    'rate to %s.' % (epoch + 1, new_lr))
             self.cooldown_counter = self.cooldown
             self.wait = 0
         self.wait += 1
@@ -899,10 +898,11 @@ class CSVLogger(Callback):
   including 1D iterables such as np.ndarray.
 
   Example:
-      ```python
-      csv_logger = CSVLogger('training.log')
-      model.fit(X_train, Y_train, callbacks=[csv_logger])
-      ```
+
+  ```python
+  csv_logger = CSVLogger('training.log')
+  model.fit(X_train, Y_train, callbacks=[csv_logger])
+  ```
 
   Arguments:
       filename: filename of the csv file, e.g. 'run/log.csv'.
@@ -942,12 +942,14 @@ class CSVLogger(Callback):
       else:
         return k
 
+    if self.keys is None:
+      self.keys = sorted(logs.keys())
+
     if self.model.stop_training:
       # We set NA so that csv parsers do not fail for this last epoch.
       logs = dict([(k, logs[k]) if k in logs else (k, 'NA') for k in self.keys])
 
     if not self.writer:
-      self.keys = sorted(logs.keys())
 
       class CustomDialect(csv.excel):
         delimiter = self.sep
@@ -993,32 +995,32 @@ class LambdaCallback(Callback):
 
   Example:
 
-      ```python
-      # Print the batch number at the beginning of every batch.
-      batch_print_callback = LambdaCallback(
-          on_batch_begin=lambda batch,logs: print(batch))
-
-      # Stream the epoch loss to a file in JSON format. The file content
-      # is not well-formed JSON but rather has a JSON object per line.
-      import json
-      json_log = open('loss_log.json', mode='wt', buffering=1)
-      json_logging_callback = LambdaCallback(
-          on_epoch_end=lambda epoch, logs: json_log.write(
-              json.dumps({'epoch': epoch, 'loss': logs['loss']}) + '\n'),
-          on_train_end=lambda logs: json_log.close()
-      )
-
-      # Terminate some processes after having finished model training.
-      processes = ...
-      cleanup_callback = LambdaCallback(
-          on_train_end=lambda logs: [
-              p.terminate() for p in processes if p.is_alive()])
-
-      model.fit(...,
-                callbacks=[batch_print_callback,
-                           json_logging_callback,
-                           cleanup_callback])
-      ```
+  ```python
+  # Print the batch number at the beginning of every batch.
+  batch_print_callback = LambdaCallback(
+      on_batch_begin=lambda batch,logs: print(batch))
+
+  # Stream the epoch loss to a file in JSON format. The file content
+  # is not well-formed JSON but rather has a JSON object per line.
+  import json
+  json_log = open('loss_log.json', mode='wt', buffering=1)
+  json_logging_callback = LambdaCallback(
+      on_epoch_end=lambda epoch, logs: json_log.write(
+          json.dumps({'epoch': epoch, 'loss': logs['loss']}) + '\n'),
+      on_train_end=lambda logs: json_log.close()
+  )
+
+  # Terminate some processes after having finished model training.
+  processes = ...
+  cleanup_callback = LambdaCallback(
+      on_train_end=lambda logs: [
+          p.terminate() for p in processes if p.is_alive()])
+
+  model.fit(...,
+            callbacks=[batch_print_callback,
+                       json_logging_callback,
+                       cleanup_callback])
+  ```
   """
 
   def __init__(self,
diff --git a/tensorflow/python/keras/_impl/keras/constraints.py b/tensorflow/python/keras/_impl/keras/constraints.py
index e58e3b0377..4b051c93f3 100644
--- a/tensorflow/python/keras/_impl/keras/constraints.py
+++ b/tensorflow/python/keras/_impl/keras/constraints.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Constraints: functions that impose constraints on weights values.
+# pylint: disable=invalid-name
+"""Constraints: functions that impose constraints on weight values.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -54,10 +55,6 @@ class MaxNorm(Constraint):
           to constrain the weights of each filter tensor of size
           `(rows, cols, input_depth)`.
 
-  References:
-      - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting
-        Srivastava, Hinton, et al.
-        2014](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)
   """
 
   def __init__(self, max_value=2, axis=0):
@@ -79,7 +76,7 @@ class NonNeg(Constraint):
   """
 
   def __call__(self, w):
-    w *= K.cast(w >= 0., K.floatx())
+    w *= K.cast(K.greater_equal(w, 0.), K.floatx())
     return w
 
 
@@ -132,7 +129,7 @@ class MinMaxNorm(Constraint):
           has shape `(input_dim, output_dim)`,
           set `axis` to `0` to constrain each weight vector
           of length `(input_dim,)`.
-          In a `Conv2D` layer with `dim_ordering="channels_last"`,
+          In a `Conv2D` layer with `data_format="channels_last"`,
           the weight tensor has shape
           `(rows, cols, input_depth, output_depth)`,
           set `axis` to `[0, 1, 2]`
@@ -148,8 +145,9 @@ class MinMaxNorm(Constraint):
 
   def __call__(self, w):
     norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
-    desired = (self.rate * K.clip(norms, self.min_value, self.max_value) +
-               (1 - self.rate) * norms)
+    desired = (
+        self.rate * K.clip(norms, self.min_value, self.max_value) +
+        (1 - self.rate) * norms)
     w *= (desired / (K.epsilon() + norms))
     return w
 
@@ -164,13 +162,15 @@ class MinMaxNorm(Constraint):
 
 # Aliases.
 
-# pylint: disable=invalid-name
 max_norm = MaxNorm
 non_neg = NonNeg
 unit_norm = UnitNorm
 min_max_norm = MinMaxNorm
 
-# pylint: enable=invalid-name
+# Legacy aliases.
+maxnorm = max_norm
+nonneg = non_neg
+unitnorm = unit_norm
 
 
 def serialize(constraint):
diff --git a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
index 0570e9bc0c..cfd7df61d5 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
@@ -21,29 +21,27 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.datasets.boston_housing.load_data')
-def load_data(path='boston_housing.npz', seed=113, test_split=0.2):
+def load_data(path='boston_housing.npz', test_split=0.2, seed=113):
   """Loads the Boston Housing dataset.
 
   Arguments:
       path: path where to cache the dataset locally
           (relative to ~/.keras/datasets).
+      test_split: fraction of the data to reserve as test set.
       seed: Random seed for shuffling the data
           before computing the test split.
-      test_split: fraction of the data to reserve as test set.
 
   Returns:
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
   """
   assert 0 <= test_split < 1
-  fh = 'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5'
   path = get_file(
       path,
       origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz',
-      file_hash=fh)
+      file_hash=
+      'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5')
   f = np.load(path)
   x = f['x']
   y = f['y']
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar.py b/tensorflow/python/keras/_impl/keras/datasets/cifar.py
index 564709c0ee..7ada3340a5 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities used by the CIFAR10 and CIFAR100 datasets.
+"""Utilities common to CIFAR10 and CIFAR100 datasets.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py b/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
index 1971f434b9..fb9d98d42c 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""CIFAR10 small image classification dataset.
+"""CIFAR10 small images classification dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -25,10 +25,8 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.datasets.cifar import load_batch
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.datasets.cifar10.load_data')
 def load_data():
   """Loads CIFAR10 dataset.
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py b/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
index f4039e9350..95aace599a 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""CIFAR100 small image classification dataset.
+"""CIFAR100 small images classification dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -25,10 +25,8 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.datasets.cifar import load_batch
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.datasets.cifar100.load_data')
 def load_data(label_mode='fine'):
   """Loads CIFAR100 dataset.
 
@@ -42,7 +40,7 @@ def load_data(label_mode='fine'):
       ValueError: in case of invalid `label_mode`.
   """
   if label_mode not in ['fine', 'coarse']:
-    raise ValueError('label_mode must be one of "fine" "coarse".')
+    raise ValueError('`label_mode` must be one of `"fine"`, `"coarse"`.')
 
   dirname = 'cifar-100-python'
   origin = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
diff --git a/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py b/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
index 17be684e4f..b9ae41a0d4 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
@@ -20,7 +20,9 @@ from __future__ import print_function
 
 import gzip
 import os
+
 import numpy as np
+
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 
 
@@ -38,9 +40,8 @@ def load_data():
   ]
 
   paths = []
-  for given_file in files:
-    paths.append(
-        get_file(given_file, origin=base + given_file, cache_subdir=dirname))
+  for fname in files:
+    paths.append(get_file(fname, origin=base + fname, cache_subdir=dirname))
 
   with gzip.open(paths[0], 'rb') as lbpath:
     y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8)
diff --git a/tensorflow/python/keras/_impl/keras/datasets/imdb.py b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
index 7946c46960..880c9c821b 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/imdb.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""IMDB movie review sentiment classification dataset.
+"""IMDB sentiment classification dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -21,13 +21,12 @@ from __future__ import print_function
 import json
 
 import numpy as np
-from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.keras._impl.keras.preprocessing.sequence import _remove_long_seq
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.platform import tf_logging as logging
 
 
-@tf_export('keras.datasets.imdb.load_data')
 def load_data(path='imdb.npz',
               num_words=None,
               skip_top=0,
@@ -35,7 +34,8 @@ def load_data(path='imdb.npz',
               seed=113,
               start_char=1,
               oov_char=2,
-              index_from=3):
+              index_from=3,
+              **kwargs):
   """Loads the IMDB dataset.
 
   Arguments:
@@ -52,6 +52,7 @@ def load_data(path='imdb.npz',
       oov_char: words that were cut out because of the `num_words`
           or `skip_top` limit will be replaced with this character.
       index_from: index actual words with this index and higher.
+      **kwargs: Used for backwards compatibility.
 
   Returns:
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
@@ -66,14 +67,21 @@ def load_data(path='imdb.npz',
   Words that were not seen in the training set but are in the test set
   have simply been skipped.
   """
+  # Legacy support
+  if 'nb_words' in kwargs:
+    logging.warning('The `nb_words` argument in `load_data` '
+                    'has been renamed `num_words`.')
+    num_words = kwargs.pop('nb_words')
+  if kwargs:
+    raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
   path = get_file(
       path,
       origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
       file_hash='599dadb1135973df5b59232a0e9a887c')
-  f = np.load(path)
-  x_train, labels_train = f['x_train'], f['y_train']
-  x_test, labels_test = f['x_test'], f['y_test']
-  f.close()
+  with np.load(path) as f:
+    x_train, labels_train = f['x_train'], f['y_train']
+    x_test, labels_test = f['x_test'], f['y_test']
 
   np.random.seed(seed)
   indices = np.arange(len(x_train))
@@ -95,14 +103,7 @@ def load_data(path='imdb.npz',
     xs = [[w + index_from for w in x] for x in xs]
 
   if maxlen:
-    new_xs = []
-    new_labels = []
-    for x, y in zip(xs, labels):
-      if len(x) < maxlen:
-        new_xs.append(x)
-        new_labels.append(y)
-    xs = new_xs
-    labels = new_labels
+    xs, labels = _remove_long_seq(maxlen, xs, labels)
     if not xs:
       raise ValueError('After filtering for sequences shorter than maxlen=' +
                        str(maxlen) + ', no sequence was kept. '
@@ -114,28 +115,19 @@ def load_data(path='imdb.npz',
   # reserve 'index_from' (=3 by default) characters:
   # 0 (padding), 1 (start), 2 (OOV)
   if oov_char is not None:
-    xs = [[oov_char if (w >= num_words or w < skip_top) else w for w in x]
-          for x in xs]
+    xs = [
+        [w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs
+    ]
   else:
-    new_xs = []
-    for x in xs:
-      nx = []
-      for w in x:
-        if skip_top <= w < num_words:
-          nx.append(w)
-      new_xs.append(nx)
-    xs = new_xs
-
-  x_train = np.array(xs[:len(x_train)])
-  y_train = np.array(labels[:len(x_train)])
+    xs = [[w for w in x if skip_top <= w < num_words] for x in xs]
 
-  x_test = np.array(xs[len(x_train):])
-  y_test = np.array(labels[len(x_train):])
+  idx = len(x_train)
+  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
+  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
 
   return (x_train, y_train), (x_test, y_test)
 
 
-@tf_export('keras.datasets.imdb.get_word_index')
 def get_word_index(path='imdb_word_index.json'):
   """Retrieves the dictionary mapping word indices back to words.
 
@@ -147,7 +139,8 @@ def get_word_index(path='imdb_word_index.json'):
   """
   path = get_file(
       path,
-      origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json')
+      origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json',
+      file_hash='bfafd718b763782e994055a2d397834f')
   f = open(path)
   data = json.load(f)
   f.close()
diff --git a/tensorflow/python/keras/_impl/keras/datasets/mnist.py b/tensorflow/python/keras/_impl/keras/datasets/mnist.py
index e9f5348015..ec12a31dcf 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/mnist.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/mnist.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""MNIST handwritten digits classification dataset.
+"""MNIST handwritten digits dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -21,10 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.datasets.mnist.load_data')
 def load_data(path='mnist.npz'):
   """Loads the MNIST dataset.
 
@@ -40,9 +38,7 @@ def load_data(path='mnist.npz'):
       origin='https://s3.amazonaws.com/img-datasets/mnist.npz',
       file_hash='8a61469f7ea1b51cbae51d4f78837e45')
   f = np.load(path)
-  x_train = f['x_train']
-  y_train = f['y_train']
-  x_test = f['x_test']
-  y_test = f['y_test']
+  x_train, y_train = f['x_train'], f['y_train']
+  x_test, y_test = f['x_test'], f['y_test']
   f.close()
   return (x_train, y_train), (x_test, y_test)
diff --git a/tensorflow/python/keras/_impl/keras/datasets/reuters.py b/tensorflow/python/keras/_impl/keras/datasets/reuters.py
index 6da5aa4b5e..95cf8852a9 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/reuters.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/reuters.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Reuters newswire topic classification dataset.
+"""Reuters topic classification dataset.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -22,13 +21,12 @@ from __future__ import print_function
 import json
 
 import numpy as np
-from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.keras._impl.keras.preprocessing.sequence import _remove_long_seq
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.platform import tf_logging as logging
 
 
-@tf_export('keras.datasets.reuters.load_data')
 def load_data(path='reuters.npz',
               num_words=None,
               skip_top=0,
@@ -37,7 +35,8 @@ def load_data(path='reuters.npz',
               seed=113,
               start_char=1,
               oov_char=2,
-              index_from=3):
+              index_from=3,
+              **kwargs):
   """Loads the Reuters newswire classification dataset.
 
   Arguments:
@@ -55,6 +54,7 @@ def load_data(path='reuters.npz',
       oov_char: words that were cut out because of the `num_words`
           or `skip_top` limit will be replaced with this character.
       index_from: index actual words with this index and higher.
+      **kwargs: Used for backwards compatibility.
 
   Returns:
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
@@ -65,14 +65,20 @@ def load_data(path='reuters.npz',
   Words that were not seen in the training set but are in the test set
   have simply been skipped.
   """
+  # Legacy support
+  if 'nb_words' in kwargs:
+    logging.warning('The `nb_words` argument in `load_data` '
+                    'has been renamed `num_words`.')
+    num_words = kwargs.pop('nb_words')
+  if kwargs:
+    raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
   path = get_file(
       path,
       origin='https://s3.amazonaws.com/text-datasets/reuters.npz',
       file_hash='87aedbeb0cb229e378797a632c1997b6')
-  npzfile = np.load(path)
-  xs = npzfile['x']
-  labels = npzfile['y']
-  npzfile.close()
+  with np.load(path) as f:
+    xs, labels = f['x'], f['y']
 
   np.random.seed(seed)
   indices = np.arange(len(xs))
@@ -80,22 +86,13 @@ def load_data(path='reuters.npz',
   xs = xs[indices]
   labels = labels[indices]
 
-  np.random.shuffle(labels)
-
   if start_char is not None:
     xs = [[start_char] + [w + index_from for w in x] for x in xs]
   elif index_from:
     xs = [[w + index_from for w in x] for x in xs]
 
   if maxlen:
-    new_xs = []
-    new_labels = []
-    for x, y in zip(xs, labels):
-      if len(x) < maxlen:
-        new_xs.append(x)
-        new_labels.append(y)
-    xs = new_xs
-    labels = new_labels
+    xs, labels = _remove_long_seq(maxlen, xs, labels)
 
   if not num_words:
     num_words = max([max(x) for x in xs])
@@ -104,28 +101,17 @@ def load_data(path='reuters.npz',
   # reserve 'index_from' (=3 by default) characters:
   # 0 (padding), 1 (start), 2 (OOV)
   if oov_char is not None:
-    xs = [[oov_char if (w >= num_words or w < skip_top) else w for w in x]
-          for x in xs]
+    xs = [[w if skip_top <= w < num_words else oov_char for w in x] for x in xs]
   else:
-    new_xs = []
-    for x in xs:
-      nx = []
-      for w in x:
-        if skip_top <= w < num_words:
-          nx.append(w)
-      new_xs.append(nx)
-    xs = new_xs
-
-  x_train = np.array(xs[:int(len(xs) * (1 - test_split))])
-  y_train = np.array(labels[:int(len(xs) * (1 - test_split))])
+    xs = [[w for w in x if skip_top <= w < num_words] for x in xs]
 
-  x_test = np.array(xs[int(len(xs) * (1 - test_split)):])
-  y_test = np.array(labels[int(len(xs) * (1 - test_split)):])
+  idx = int(len(xs) * (1 - test_split))
+  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
+  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
 
   return (x_train, y_train), (x_test, y_test)
 
 
-@tf_export('keras.datasets.reuters.get_word_index')
 def get_word_index(path='reuters_word_index.json'):
   """Retrieves the dictionary mapping word indices back to words.
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py
index d6e0be8e43..64aa868f38 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology.py
@@ -27,6 +27,7 @@ import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
@@ -712,8 +713,8 @@ class Network(tf_network.GraphNetwork, Layer):
     for layer in self._output_layers:
       self.output_names.append(layer.name)
 
-    self.internal_input_shapes = [K.int_shape(x) for x in self.inputs]
-    self.internal_output_shapes = [K.int_shape(x) for x in self.outputs]
+    self._internal_input_shapes = [K.int_shape(x) for x in self.inputs]
+    self._internal_output_shapes = [K.int_shape(x) for x in self.outputs]
 
   @property
   def uses_learning_phase(self):
@@ -1303,18 +1304,17 @@ def preprocess_weights_for_loading(layer,
   Returns:
       A list of weights values (Numpy arrays).
   """
-  if original_keras_version == '1':
-    if layer.__class__.__name__ == 'Bidirectional':
-      num_weights_per_layer = len(weights) // 2
-
-      forward_weights = preprocess_weights_for_loading(
-          layer.forward_layer, weights[:num_weights_per_layer],
-          original_keras_version, original_backend)
-      backward_weights = preprocess_weights_for_loading(
-          layer.backward_layer, weights[num_weights_per_layer:],
-          original_keras_version, original_backend)
-      weights = forward_weights + backward_weights
+  if layer.__class__.__name__ == 'Bidirectional':
+    num_weights_per_layer = len(weights) // 2
+    forward_weights = preprocess_weights_for_loading(
+        layer.forward_layer, weights[:num_weights_per_layer],
+        original_keras_version, original_backend)
+    backward_weights = preprocess_weights_for_loading(
+        layer.backward_layer, weights[num_weights_per_layer:],
+        original_keras_version, original_backend)
+    weights = forward_weights + backward_weights
 
+  if original_keras_version == '1':
     if layer.__class__.__name__ == 'TimeDistributed':
       weights = preprocess_weights_for_loading(
           layer.layer, weights, original_keras_version, original_backend)
@@ -1418,7 +1418,7 @@ def preprocess_weights_for_loading(layer,
 
   conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
   if layer.__class__.__name__ in conv_layers:
-    if original_backend and K.backend() != original_backend:
+    if original_backend == 'theano':
       weights[0] = conv_utils.convert_kernel(weights[0])
       if layer.__class__.__name__ == 'ConvLSTM2D':
         weights[1] = conv_utils.convert_kernel(weights[1])
@@ -1427,10 +1427,9 @@ def preprocess_weights_for_loading(layer,
       if layer.__class__.__name__ == 'ConvLSTM2D':
         weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
 
-  # convert the weights of CuDNNLSTM so that they could be loaded into LSTM
+  # Convert the weights of CuDNNLSTM so that they could be loaded into LSTM
   if layer.__class__.__name__ == 'LSTM' and len(weights) == 3:
-    # determine if we're loading a CuDNNLSTM layer from the number of bias
-    # weights:
+    # Determine if loading a CuDNNLSTM layer from the number of bias weights:
     # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4)
     # if there's no bias weight in the file, skip this conversion
     units = weights[1].shape[0]
@@ -1572,3 +1571,31 @@ def load_weights_from_hdf5_group_by_name(f, layers):
       for i in range(len(weight_values)):
         weight_value_tuples.append((symbolic_weights[i], weight_values[i]))
   K.batch_set_value(weight_value_tuples)
+
+
+def shape_type_conversion(fn):
+  """Decorator that handles tuple/TensorShape conversion.
+
+  Used in `compute_output_shape` and `build`.
+
+  Arguments:
+    fn: function to wrap.
+
+  Returns:
+    Wrapped function.
+  """
+
+  def wrapper(instance, input_shape):
+    if input_shape is not None:
+      if isinstance(input_shape, list):
+        input_shape = [
+            tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
+      else:
+        input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+    output_shape = fn(instance, input_shape)
+    if output_shape is not None:
+      if isinstance(output_shape, list):
+        return [tensor_shape.TensorShape(x) for x in output_shape]
+      return tensor_shape.TensorShape(output_shape)
+
+  return wrapper
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index debea2503e..699ae2edf0 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras training and evaluation routines.
+"""Training-related part of the Keras engine.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -35,6 +34,11 @@ from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
 from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
 from tensorflow.python.platform import tf_logging as logging
 
+try:
+  from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
+except ImportError:
+  issparse = None
+
 
 def _standardize_input_data(data,
                             names,
@@ -70,89 +74,72 @@ def _standardize_input_data(data,
     return []
   if data is None:
     return [None for _ in range(len(names))]
+
   if isinstance(data, dict):
-    for key, value in data.items():
-      if value.__class__.__name__ == 'DataFrame':
-        data[key] = value.values
-    arrays = []
-    for name in names:
-      if name not in data:
-        raise ValueError('No data provided for "' + name +
-                         '". Need data for each key in: ' + str(names))
-      arrays.append(data[name])
+    try:
+      data = [
+          data[x].values
+          if data[x].__class__.__name__ == 'DataFrame' else data[x]
+          for x in names
+      ]
+      data = [np.expand_dims(x, 1) if x.ndim == 1 else x for x in data]
+    except KeyError as e:
+      raise ValueError('No data provided for "' + e.args[0] + '". Need data '
+                       'for each key in: ' + str(names))
   elif isinstance(data, list):
-    for key, value in enumerate(data):
-      if value.__class__.__name__ == 'DataFrame':
-        data[key] = value.values
-    if len(data) != len(names):
-      if data and hasattr(data[0], 'shape'):
-        raise ValueError(
-            'Error when checking model ' + exception_prefix +
-            ': the list of Numpy arrays '
-            'that you are passing to your model '
-            'is not the size the model expected. '
-            'Expected to see ' + str(len(names)) + ' array(s), but instead got '
-            'the following list of ' + str(len(data)) + ' arrays: ' +
-            str(data)[:200] + '...')
-      else:
-        if len(names) == 1:
-          data = [np.asarray(data)]
-        else:
-          raise ValueError('Error when checking model ' + exception_prefix +
-                           ': you are passing a list as '
-                           'input to your model, '
-                           'but the model expects '
-                           'a list of ' + str(len(names)) +
-                           ' Numpy arrays instead. '
-                           'The list you passed was: ' + str(data)[:200])
-    arrays = data
-  elif data.__class__.__name__ == 'DataFrame':
-    # test if data is a DataFrame, without pandas installed
-    arrays = data.values
+    data = [
+        x.values if x.__class__.__name__ == 'DataFrame' else x for x in data
+    ]
+    data = [
+        np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x
+        for x in data
+    ]
   else:
-    if not hasattr(data, 'shape'):
+    data = data.values if data.__class__.__name__ == 'DataFrame' else data
+    data = [np.expand_dims(data, 1)] if data.ndim == 1 else [data]
+
+  if len(data) != len(names):
+    if data and hasattr(data[0], 'shape'):
+      raise ValueError('Error when checking model ' + exception_prefix +
+                       ': the list of Numpy arrays that you are passing to '
+                       'your model is not the size the model expected. '
+                       'Expected to see ' + str(len(names)) + ' array(s), '
+                       'but instead got the following list of ' +
+                       str(len(data)) + ' arrays: ' + str(data)[:200] + '...')
+    elif len(names) > 1:
+      raise ValueError(
+          'Error when checking model ' + exception_prefix +
+          ': you are passing a list as input to your model, '
+          'but the model expects a list of ' + str(len(names)) +
+          ' Numpy arrays instead. The list you passed was: ' + str(data)[:200])
+    elif len(data) == 1 and not hasattr(data[0], 'shape'):
       raise TypeError('Error when checking model ' + exception_prefix +
-                      ': data should be a Numpy array, '
-                      'or list/dict of Numpy arrays. '
-                      'Found: ' + str(data)[:200] + '...')
-    if len(names) > 1:
-      # Case: model expects multiple inputs but only received
-      # a single Numpy array.
-      raise ValueError('The model expects ' + str(len(names)) + ' ' +
-                       exception_prefix +
-                       ' arrays, but only received one array. '
-                       'Found: array with shape ' + str(data.shape))
-    arrays = [data]
-
-  # Make arrays at least 2D.
-  for i in range(len(names)):
-    array = arrays[i]
-    if len(array.shape) == 1:
-      array = np.expand_dims(array, 1)
-      arrays[i] = array
+                      ': data should be a Numpy array, or list/dict of '
+                      'Numpy arrays. Found: ' + str(data)[:200] + '...')
+    elif len(names) == 1:
+      data = [np.asarray(data)]
 
   # Check shapes compatibility.
   if shapes:
     for i in range(len(names)):
-      if shapes[i] is None:
-        continue
-      array = arrays[i]
-      if len(array.shape) != len(shapes[i]):
-        raise ValueError(
-            'Error when checking ' + exception_prefix + ': expected ' + names[i]
-            + ' to have ' + str(len(shapes[i])) +
-            ' dimensions, but got array with shape ' + str(array.shape))
-      for j, (dim, ref_dim) in enumerate(zip(array.shape, shapes[i])):
-        if not j and not check_batch_axis:
-          # skip the first axis
-          continue
-        if ref_dim:
-          if ref_dim != dim:
-            raise ValueError('Error when checking ' + exception_prefix +
-                             ': expected ' + names[i] + ' to have shape ' +
-                             str(shapes[i]) + ' but got array with shape ' +
-                             str(array.shape))
-  return arrays
+      if shapes[i] is not None:
+        data_shape = data[i].shape
+        shape = shapes[i]
+        if data[i].ndim != len(shape):
+          raise ValueError('Error when checking ' + exception_prefix +
+                           ': expected ' + names[i] + ' to have ' +
+                           str(len(shape)) + ' dimensions, but got array '
+                           'with shape ' + str(data_shape))
+        if not check_batch_axis:
+          data_shape = data_shape[1:]
+          shape = shape[1:]
+        for dim, ref_dim in zip(data_shape, shape):
+          if ref_dim != dim and ref_dim:
+            raise ValueError(
+                'Error when checking ' + exception_prefix + ': expected ' +
+                names[i] + ' to have shape ' + str(shape) +
+                ' but got array with shape ' + str(data_shape))
+  return data
 
 
 def _standardize_sample_or_class_weights(x_weight, output_names, weight_type):
@@ -193,10 +180,10 @@ def _standardize_sample_or_class_weights(x_weight, output_names, weight_type):
       x_weights.append(x_weight.get(name))
     return x_weights
   else:
-    raise TypeError('The model has multiple outputs, so `' + weight_type + '` '
-                    'should be either a list or a dict. '
-                    'Provided `' + weight_type + '` type not understood: ' +
-                    str(x_weight))
+    raise TypeError(
+        'The model has multiple outputs, so `' + weight_type + '` '
+        'should be either a list or a dict. '
+        'Provided `' + weight_type + '` type not understood: ' + str(x_weight))
 
 
 def _standardize_class_weights(class_weight, output_names):
@@ -234,12 +221,12 @@ def _check_array_lengths(inputs, targets, weights=None):
   set_w = set_of_lengths(weights)
   if len(set_x) > 1:
     raise ValueError('All input arrays (x) should have '
-                     'the same number of samples. Got array shapes: ' + str(
-                         [x.shape for x in inputs]))
+                     'the same number of samples. Got array shapes: ' +
+                     str([x.shape for x in inputs]))
   if len(set_y) > 1:
     raise ValueError('All target arrays (y) should have '
-                     'the same number of samples. Got array shapes: ' + str(
-                         [y.shape for y in targets]))
+                     'the same number of samples. Got array shapes: ' +
+                     str([y.shape for y in targets]))
   if set_x and set_y and list(set_x)[0] != list(set_y)[0]:
     raise ValueError('Input arrays should have '
                      'the same number of samples as target arrays. '
@@ -247,8 +234,8 @@ def _check_array_lengths(inputs, targets, weights=None):
                      'and ' + str(list(set_y)[0]) + ' target samples.')
   if len(set_w) > 1:
     raise ValueError('All sample_weight arrays should have '
-                     'the same number of samples. Got array shapes: ' + str(
-                         [w.shape for w in weights]))
+                     'the same number of samples. Got array shapes: ' +
+                     str([w.shape for w in weights]))
   if set_y and set_w and list(set_y)[0] != list(set_w)[0]:
     raise ValueError('Sample_weight arrays should have '
                      'the same number of samples as target arrays. Got ' +
@@ -528,16 +515,16 @@ def _standardize_weights(y,
 
   if sample_weight is not None:
     if len(sample_weight.shape) > len(y.shape):
-      raise ValueError('Found a sample_weight with shape' +
-                       str(sample_weight.shape) + '.'
-                       'Expected sample_weight with rank '
-                       'less than or equal to ' + str(len(y.shape)))
+      raise ValueError(
+          'Found a sample_weight with shape' + str(sample_weight.shape) + '.'
+          'Expected sample_weight with rank '
+          'less than or equal to ' + str(len(y.shape)))
 
     if y.shape[:sample_weight.ndim] != sample_weight.shape:
-      raise ValueError('Found a sample_weight array with shape ' +
-                       str(sample_weight.shape) + ' for an input with shape ' +
-                       str(y.shape) + '. '
-                       'sample_weight cannot be broadcast.')
+      raise ValueError(
+          'Found a sample_weight array with shape ' + str(sample_weight.shape) +
+          ' for an input with shape ' + str(y.shape) + '. '
+          'sample_weight cannot be broadcast.')
     return sample_weight
   elif isinstance(class_weight, dict):
     if len(y.shape) > 2:
@@ -632,7 +619,6 @@ class Model(Network):
     """
     loss = loss or {}
     self.optimizer = optimizers.get(optimizer)
-    self.sample_weight_mode = sample_weight_mode
     self.loss = loss
     self.loss_weights = loss_weights
     self.sample_weight_mode = sample_weight_mode
@@ -641,10 +627,10 @@ class Model(Network):
     if isinstance(loss, dict):
       for name in loss:
         if name not in self.output_names:
-          raise ValueError('Unknown entry in loss '
-                           'dictionary: "' + name + '". '
-                           'Only expected the following keys: ' +
-                           str(self.output_names))
+          raise ValueError(
+              'Unknown entry in loss '
+              'dictionary: "' + name + '". '
+              'Only expected the following keys: ' + str(self.output_names))
       loss_functions = []
       for name in self.output_names:
         if name not in loss:
@@ -657,7 +643,7 @@ class Model(Network):
     elif isinstance(loss, list):
       if len(loss) != len(self.outputs):
         raise ValueError('When passing a list as loss, '
-                         'it should have one entry per model output. '
+                         'it should have one entry per model outputs. '
                          'The model has ' + str(len(self.outputs)) +
                          ' outputs, but you passed loss=' + str(loss))
       loss_functions = [losses.get(l) for l in loss]
@@ -690,20 +676,20 @@ class Model(Network):
     elif isinstance(loss_weights, dict):
       for name in loss_weights:
         if name not in self.output_names:
-          raise ValueError('Unknown entry in loss_weights '
-                           'dictionary: "' + name + '". '
-                           'Only expected the following keys: ' +
-                           str(self.output_names))
+          raise ValueError(
+              'Unknown entry in loss_weights '
+              'dictionary: "' + name + '". '
+              'Only expected the following keys: ' + str(self.output_names))
       loss_weights_list = []
       for name in self.output_names:
         loss_weights_list.append(loss_weights.get(name, 1.))
     elif isinstance(loss_weights, list):
       if len(loss_weights) != len(self.outputs):
-        raise ValueError('When passing a list as loss_weights, '
-                         'it should have one entry per model output. '
-                         'The model has ' + str(len(self.outputs)) +
-                         ' outputs, but you passed loss_weights=' +
-                         str(loss_weights))
+        raise ValueError(
+            'When passing a list as loss_weights, '
+            'it should have one entry per model output. '
+            'The model has ' + str(len(self.outputs)) +
+            ' outputs, but you passed loss_weights=' + str(loss_weights))
       loss_weights_list = loss_weights
     else:
       raise TypeError('Could not interpret loss_weights argument: ' +
@@ -715,22 +701,22 @@ class Model(Network):
     if target_tensors is not None:
       if isinstance(target_tensors, list):
         if len(target_tensors) != len(self.outputs):
-          raise ValueError('When passing a list as `target_tensors`, '
-                           'it should have one entry per model output. '
-                           'The model has ' + str(len(self.outputs)) +
-                           ' outputs, but you passed target_tensors=' +
-                           str(target_tensors))
+          raise ValueError(
+              'When passing a list as `target_tensors`, '
+              'it should have one entry per model output. '
+              'The model has ' + str(len(self.outputs)) +
+              ' outputs, but you passed target_tensors=' + str(target_tensors))
       elif isinstance(target_tensors, dict):
         for name in target_tensors:
           if name not in self.output_names:
-            raise ValueError('Unknown entry in `target_tensors` '
-                             'dictionary: "' + name + '". '
-                             'Only expected the following keys: ' +
-                             str(self.output_names))
-        target_tensors_ = []
+            raise ValueError(
+                'Unknown entry in `target_tensors` '
+                'dictionary: "' + name + '". '
+                'Only expected the following keys: ' + str(self.output_names))
+        tmp_target_tensors = []
         for name in self.output_names:
-          target_tensors_.append(target_tensors.get(name, None))
-        target_tensors = target_tensors_
+          tmp_target_tensors.append(target_tensors.get(name, None))
+        target_tensors = tmp_target_tensors
       else:
         raise TypeError('Expected `target_tensors` to be '
                         'a list or dict, but got:', target_tensors)
@@ -738,7 +724,7 @@ class Model(Network):
       if i in skip_target_indices:
         self.targets.append(None)
       else:
-        shape = self.internal_output_shapes[i]
+        shape = self._internal_output_shapes[i]
         name = self.output_names[i]
         if target_tensors is not None:
           target = target_tensors[i]
@@ -766,19 +752,19 @@ class Model(Network):
     if isinstance(sample_weight_mode, dict):
       for name in sample_weight_mode:
         if name not in self.output_names:
-          raise ValueError('Unknown entry in '
-                           'sample_weight_mode dictionary: "' + name + '". '
-                           'Only expected the following keys: ' +
-                           str(self.output_names))
+          raise ValueError(
+              'Unknown entry in '
+              'sample_weight_mode dictionary: "' + name + '". '
+              'Only expected the following keys: ' + str(self.output_names))
       for i, name in enumerate(self.output_names):
         if i in skip_target_weighing_indices:
           weight = None
           sample_weight_modes.append(None)
         else:
           if name not in sample_weight_mode:
-            raise ValueError('Output "' + name +
-                             '" missing from sample_weight_modes '
-                             'dictionary')
+            raise ValueError(
+                'Output "' + name + '" missing from sample_weight_modes '
+                'dictionary')
           if sample_weight_mode.get(name) == 'temporal':
             weight = K.placeholder(ndim=2, name=name + '_sample_weights')
             sample_weight_modes.append('temporal')
@@ -894,23 +880,36 @@ class Model(Network):
           metric_name_prefix = 'weighted_' if weights is not None else ''
 
           for metric in metrics:
-            if metric == 'accuracy' or metric == 'acc':
-              # custom handling of accuracy
+            if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
+              # custom handling of accuracy/crossentropy
               # (because of class mode duality)
-              output_shape = self.internal_output_shapes[i]
+              output_shape = self._internal_output_shapes[i]
               if (output_shape[-1] == 1 or
                   self.loss_functions[i] == losses.binary_crossentropy):
-                # case: binary accuracy
-                acc_fn = metrics_module.binary_accuracy
+                # case: binary accuracy/crossentropy
+                if metric in ('accuracy', 'acc'):
+                  acc_fn = metrics_module.binary_accuracy
+                elif metric in ('crossentropy', 'ce'):
+                  acc_fn = metrics_module.binary_crossentropy
               elif self.loss_functions[
                   i] == losses.sparse_categorical_crossentropy:
-                # case: categorical accuracy with sparse targets
-                acc_fn = metrics_module.sparse_categorical_accuracy
+                # case: categorical accuracy/crossentropy with sparse targets
+                if metric in ('accuracy', 'acc'):
+                  acc_fn = metrics_module.sparse_categorical_accuracy
+                elif metric in ('crossentropy', 'ce'):
+                  acc_fn = metrics_module.sparse_categorical_crossentropy
               else:
-                acc_fn = metrics_module.categorical_accuracy
-
+                # case: categorical accuracy/crossentropy
+                if metric in ('accuracy', 'acc'):
+                  acc_fn = metrics_module.categorical_accuracy
+                elif metric in ('crossentropy', 'ce'):
+                  acc_fn = metrics_module.categorical_crossentropy
+              if metric in ('accuracy', 'acc'):
+                suffix = 'acc'
+              elif metric in ('crossentropy', 'ce'):
+                suffix = 'ce'
               weighted_metric_fn = _weighted_masked_objective(acc_fn)
-              metric_name = metric_name_prefix + 'acc'
+              metric_name = metric_name_prefix + suffix
             else:
               metric_fn = metrics_module.get(metric)
               weighted_metric_fn = _weighted_masked_objective(metric_fn)
@@ -949,7 +948,7 @@ class Model(Network):
     """Check trainable weights count consistency.
 
     This will raise a warning if `trainable_weights` and
-    `_collected_trainable_weights` are consistent (i.e. have the same
+    `_collected_trainable_weights` are inconsistent (i.e. have different
     number of parameters).
     Inconsistency will typically arise when one modifies `model.trainable`
     without calling `model.compile` again.
@@ -959,9 +958,10 @@ class Model(Network):
 
     if len(self.trainable_weights) != len(self._collected_trainable_weights):
       logging.warning(
-          'Discrepancy between trainable weights and collected trainable'
-          ' weights, did you set `model.trainable` without calling'
-          ' `model.compile` after ?')
+          UserWarning(
+              'Discrepancy between trainable weights and collected trainable'
+              ' weights, did you set `model.trainable` without calling'
+              ' `model.compile` after ?'))
 
   def _make_train_function(self):
     if not hasattr(self, 'train_function'):
@@ -1050,18 +1050,21 @@ class Model(Network):
         processed based on the size of the first dimension of the
         first input numpy array. When steps is not `None` and
         `batch_size` is `None`, returns `None`.
+
+    Raises:
+        ValueError: In case of invalid arguments.
     """
     if steps is not None:
       num_samples = None
       if batch_size is not None:
-        raise ValueError('If ' + steps_name +
-                         ' is set, the `batch_size` must be None.')
+        raise ValueError(
+            'If ' + steps_name + ' is set, the `batch_size` must be None.')
     elif ins and hasattr(ins[0], 'shape'):
       num_samples = ins[0].shape[0]
     else:
-      raise ValueError('Either the input data should have '
-                       'a defined shape, or ' + steps_name +
-                       ' should be specified.')
+      raise ValueError(
+          'Either the input data should have '
+          'a defined shape, or ' + steps_name + ' should be specified.')
     return num_samples
 
   def _fit_loop(self,
@@ -1104,31 +1107,33 @@ class Model(Network):
         steps_per_epoch: Total number of steps (batches of samples)
             before declaring one epoch finished and starting the
             next epoch. Ignored with the default value of `None`.
-        validation_steps: Number of steps to run validation for (only if doing
-          validation from data tensors). Ignored with default value of `None`.
+        validation_steps: Number of steps to run validation for
+            (only if doing validation from data tensors).
+            Ignored with the default value of `None`.
 
     Returns:
         `History` object.
 
     Raises:
-      ValueError: In case of invalid argument values.
+        ValueError: in case of invalid arguments.
     """
     do_validation = False
     if val_f and val_ins:
       do_validation = True
-      if (verbose and ins and
-          hasattr(ins[0], 'shape') and hasattr(val_ins[0], 'shape')):
+      if verbose and ins and hasattr(ins[0], 'shape') and hasattr(
+          val_ins[0], 'shape'):
         print('Train on %d samples, validate on %d samples' %
               (ins[0].shape[0], val_ins[0].shape[0]))
     if validation_steps:
-      if steps_per_epoch is None:
-        raise ValueError('Can only use `validation_steps` when doing step-wise '
-                         'training, i.e. `steps_per_epoch` must be set.')
       do_validation = True
+      if steps_per_epoch is None:
+        raise ValueError('Can only use `validation_steps` '
+                         'when doing step-wise '
+                         'training, i.e. `steps_per_epoch` '
+                         'must be set.')
 
     num_train_samples = self._check_num_samples(
         ins, batch_size, steps_per_epoch, 'steps_per_epoch')
-
     if num_train_samples is not None:
       index_array = np.arange(num_train_samples)
 
@@ -1165,6 +1170,13 @@ class Model(Network):
     for cbk in callbacks:
       cbk.validation_data = val_ins
 
+    # To prevent a slowdown, we find beforehand the arrays that need conversion.
+    feed = self._feed_inputs + self._feed_targets + self._feed_sample_weights
+    indices_for_conversion_to_dense = []
+    for i in range(len(feed)):
+      if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
+        indices_for_conversion_to_dense.append(i)
+
     for epoch in range(initial_epoch, epochs):
       callbacks.on_epoch_begin(epoch)
       epoch_logs = {}
@@ -1220,6 +1232,9 @@ class Model(Network):
           batch_logs['batch'] = batch_index
           batch_logs['size'] = len(batch_ids)
           callbacks.on_batch_begin(batch_index, batch_logs)
+          for i in indices_for_conversion_to_dense:
+            ins_batch[i] = ins_batch[i].toarray()
+
           outs = f(ins_batch)
           if not isinstance(outs, list):
             outs = [outs]
@@ -1268,6 +1283,13 @@ class Model(Network):
         progbar = Progbar(target=steps)
       else:
         progbar = Progbar(target=num_samples)
+
+    indices_for_conversion_to_dense = []
+    for i in range(len(self._feed_inputs)):
+      if (issparse is not None and issparse(ins[i]) and
+          not K.is_sparse(self._feed_inputs[i])):
+        indices_for_conversion_to_dense.append(i)
+
     if steps is not None:
       # Step-based predictions.
       # Since we do not know how many samples
@@ -1305,6 +1327,9 @@ class Model(Network):
           ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
         else:
           ins_batch = _slice_arrays(ins, batch_ids)
+        for i in indices_for_conversion_to_dense:
+          ins_batch[i] = ins_batch[i].toarray()
+
         batch_outs = f(ins_batch)
         if not isinstance(batch_outs, list):
           batch_outs = [batch_outs]
@@ -1341,12 +1366,19 @@ class Model(Network):
     """
     num_samples = self._check_num_samples(ins, batch_size, steps, 'steps')
     outs = []
-
     if verbose == 1:
       if steps is not None:
         progbar = Progbar(target=steps)
       else:
         progbar = Progbar(target=num_samples)
+
+    # To prevent a slowdown, we find beforehand the arrays that need conversion.
+    feed = self._feed_inputs + self._feed_targets + self._feed_sample_weights
+    indices_for_conversion_to_dense = []
+    for i in range(len(feed)):
+      if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
+        indices_for_conversion_to_dense.append(i)
+
     if steps is not None:
       for step in range(steps):
         batch_outs = f(ins)
@@ -1365,8 +1397,6 @@ class Model(Network):
       for i in range(len(outs)):
         outs[i] /= steps
     else:
-      if verbose == 1:
-        progbar = Progbar(target=num_samples)
       batches = _make_batches(num_samples, batch_size)
       index_array = np.arange(num_samples)
       for batch_index, (batch_start, batch_end) in enumerate(batches):
@@ -1376,6 +1406,8 @@ class Model(Network):
           ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
         else:
           ins_batch = _slice_arrays(ins, batch_ids)
+        for i in indices_for_conversion_to_dense:
+          ins_batch[i] = ins_batch[i].toarray()
 
         batch_outs = f(ins_batch)
         if isinstance(batch_outs, list):
@@ -1484,7 +1516,8 @@ class Model(Network):
           sample_weight=None,
           initial_epoch=0,
           steps_per_epoch=None,
-          validation_steps=None):
+          validation_steps=None,
+          **kwargs):
     """Trains the model for a fixed number of epochs (iterations on a dataset).
 
     Arguments:
@@ -1501,10 +1534,9 @@ class Model(Network):
             dictionary mapping output names to Numpy arrays.
             `y` can be `None` (default) if feeding from
             TensorFlow data tensors.
-            Can be `None` (default) if feeding from framework-native tensors.
         batch_size: Integer or `None`.
             Number of samples per gradient update.
-            If unspecified, it will default to 32.
+            If unspecified, `batch_size` will default to 32.
         epochs: Integer. Number of epochs to train the model.
             An epoch is an iteration over the entire `x` and `y`
             data provided.
@@ -1513,7 +1545,7 @@ class Model(Network):
             The model is not trained for a number of iterations
             given by `epochs`, but merely until the epoch
             of index `epochs` is reached.
-        verbose: 0, 1, or 2. Verbosity mode.
+        verbose: Integer. 0, 1, or 2. Verbosity mode.
             0 = silent, 1 = progress bar, 2 = one line per epoch.
         callbacks: List of `keras.callbacks.Callback` instances.
             List of callbacks to apply during training.
@@ -1530,7 +1562,7 @@ class Model(Network):
             `(x_val, y_val, val_sample_weights)` on which to evaluate
             the loss and any model metrics at the end of each epoch.
             The model will not be trained on this data.
-            This will override `validation_split`.
+            `validation_data` will override `validation_split`.
         shuffle: Boolean (whether to shuffle the training data
             before each epoch) or str (for 'batch').
             'batch' is a special option for dealing with the
@@ -1553,17 +1585,20 @@ class Model(Network):
             to apply a different weight to every timestep of every sample.
             In this case you should make sure to specify
             `sample_weight_mode="temporal"` in `compile()`.
-        initial_epoch: Epoch at which to start training
+        initial_epoch: Integer.
+            Epoch at which to start training
             (useful for resuming a previous training run).
-        steps_per_epoch: Total number of steps (batches of samples)
+        steps_per_epoch: Integer or `None`.
+            Total number of steps (batches of samples)
             before declaring one epoch finished and starting the
             next epoch. When training with input tensors such as
             TensorFlow data tensors, the default `None` is equal to
-            the number of unique samples in your dataset divided by
+            the number of samples in your dataset divided by
             the batch size, or 1 if that cannot be determined.
         validation_steps: Only relevant if `steps_per_epoch`
             is specified. Total number of steps (batches of samples)
             to validate before stopping.
+        **kwargs: Used for backwards compatibility.
 
     Returns:
         A `History` object. Its `History.history` attribute is
@@ -1572,12 +1607,21 @@ class Model(Network):
         and validation metrics values (if applicable).
 
     Raises:
+        RuntimeError: If the model was never compiled.
         ValueError: In case of mismatch between the provided input data
             and what the model expects.
     """
     # Backwards compatibility
     if batch_size is None and steps_per_epoch is None:
       batch_size = 32
+    # Legacy support
+    if 'nb_epoch' in kwargs:
+      logging.warning(
+          'The `nb_epoch` argument in `fit` '
+          'has been renamed `epochs`.')
+      epochs = kwargs.pop('nb_epoch')
+    if kwargs:
+      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
     if x is None and y is None and steps_per_epoch is None:
       raise ValueError('If fitting from data tensors, '
                        'you should specify the `steps_per_epoch` '
@@ -1590,10 +1634,8 @@ class Model(Network):
         class_weight=class_weight,
         check_batch_axis=False,
         batch_size=batch_size)
-
     # Prepare validation data.
     do_validation = False
-    val_ins = []
     if validation_data:
       do_validation = True
       if len(validation_data) == 2:
@@ -1657,8 +1699,9 @@ class Model(Network):
           'val_' + n for n in out_labels
       ]
     else:
-      val_f = None
       callback_metrics = copy.copy(out_labels)
+      val_f = None
+      val_ins = []
 
     # Delegate logic to `_fit_loop`.
     return self._fit_loop(
@@ -1694,14 +1737,14 @@ class Model(Network):
             If input layers in the model are named, you can also pass a
             dictionary mapping input names to Numpy arrays.
             `x` can be `None` (default) if feeding from
-            framework-native tensors (e.g. TensorFlow data tensors).
+            TensorFlow data tensors.
         y: Numpy array of target (label) data
             (if the model has a single output),
             or list of Numpy arrays (if the model has multiple outputs).
             If output layers in the model are named, you can also pass a
             dictionary mapping output names to Numpy arrays.
             `y` can be `None` (default) if feeding from
-            framework-native tensors (e.g. TensorFlow data tensors).
+            TensorFlow data tensors.
         batch_size: Integer or `None`.
             Number of samples per evaluation step.
             If unspecified, `batch_size` will default to 32.
@@ -1721,8 +1764,7 @@ class Model(Network):
         steps: Integer or `None`.
             Total number of steps (batches of samples)
             before declaring the evaluation round finished.
-            The default `None` is equal to the number of unique samples in
-            your dataset divided by the batch size.
+            Ignored with the default value of `None`.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -1731,7 +1773,7 @@ class Model(Network):
         the display labels for the scalar outputs.
 
     Raises:
-      ValueError: In case of invalid arguments.
+        ValueError: in case of invalid arguments.
     """
     # Backwards compatibility.
     if batch_size is None and steps is None:
@@ -1890,6 +1932,9 @@ class Model(Network):
         or list of scalars (if the model has multiple outputs
         and/or metrics). The attribute `model.metrics_names` will give you
         the display labels for the scalar outputs.
+
+    Raises:
+        ValueError: in case of invalid arguments.
     """
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight, check_batch_axis=True)
@@ -1937,8 +1982,7 @@ class Model(Network):
                     workers=1,
                     use_multiprocessing=False,
                     shuffle=True,
-                    initial_epoch=0,
-                    **kwargs):
+                    initial_epoch=0):
     """Fits the model on data yielded batch-by-batch by a Python generator.
 
     The generator is run in parallel to the model, for efficiency.
@@ -1950,22 +1994,31 @@ class Model(Network):
     using `use_multiprocessing=True`.
 
     Arguments:
-        generator: A generator or an instance of Sequence (keras.utils.Sequence)
-            object in order to avoid duplicate data when using multiprocessing.
+        generator: A generator or an instance of `Sequence`
+          (`keras.utils.Sequence`)
+            object in order to avoid duplicate data
+            when using multiprocessing.
             The output of the generator must be either
-            - a tuple (inputs, targets)
-            - a tuple (inputs, targets, sample_weights).
-            All arrays should contain the same number of samples.
+            - a tuple `(inputs, targets)`
+            - a tuple `(inputs, targets, sample_weights)`.
+            This tuple (a single output of the generator) makes a single batch.
+            Therefore, all arrays in this tuple must have the same length (equal
+            to the size of this batch). Different batches may have different
+              sizes.
+            For example, the last batch of the epoch is commonly smaller than
+              the
+            others, if the size of the dataset is not divisible by the batch
+              size.
             The generator is expected to loop over its data
             indefinitely. An epoch finishes when `steps_per_epoch`
             batches have been seen by the model.
         steps_per_epoch: Total number of steps (batches of samples)
             to yield from `generator` before declaring one epoch
             finished and starting the next epoch. It should typically
-            be equal to the number of unique samples of your dataset
+            be equal to the number of samples of your dataset
             divided by the batch size.
             Optional for `Sequence`: if unspecified, will use
-            `len(generator)` as a number of steps.
+            the `len(generator)` as a number of steps.
         epochs: Integer, total number of iterations on the data.
         verbose: Verbosity mode, 0, 1, or 2.
         callbacks: List of callbacks to be called during training.
@@ -1977,27 +2030,28 @@ class Model(Network):
             is a generator. Total number of steps (batches of samples)
             to yield from `generator` before stopping.
             Optional for `Sequence`: if unspecified, will use
-            `len(generator)` as a number of steps.
+            the `len(validation_data)` as a number of steps.
         class_weight: Dictionary mapping class indices to a weight
             for the class.
-        max_queue_size: Maximum size for the generator queue.
+        max_queue_size: Integer. Maximum size for the generator queue.
+            If unspecified, `max_queue_size` will default to 10.
         workers: Integer. Maximum number of processes to spin up
             when using process based threading.
             If unspecified, `workers` will default to 1. If 0, will
             execute the generator on the main thread.
-        use_multiprocessing: If True, use process based threading.
+        use_multiprocessing: Boolean. If True, use process based threading.
+            If unspecified, `workers` will default to False.
             Note that because
             this implementation relies on multiprocessing,
             you should not pass
             non picklable arguments to the generator
             as they can't be passed
             easily to children processes.
-        shuffle: Whether to shuffle the data at the beginning of each
-            epoch. Only used with instances of `Sequence`
-            (`keras.utils.Sequence`).
+        shuffle: Whether to shuffle the order of the batches at
+            the beginning of each epoch. Only used with instances
+            of `Sequence` (keras.utils.Sequence).
         initial_epoch: Epoch at which to start training
             (useful for resuming a previous training run)
-        **kwargs: support for legacy arguments.
 
     Returns:
         A `History` object.
@@ -2023,19 +2077,6 @@ class Model(Network):
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
-
     wait_time = 0.01  # in seconds
     epoch = initial_epoch
 
@@ -2046,10 +2087,11 @@ class Model(Network):
 
     is_sequence = isinstance(generator, Sequence)
     if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warning('Using a generator with `use_multiprocessing=True`'
+      logging.warning(
+          UserWarning('Using a generator with `use_multiprocessing=True`'
                       ' and multiple workers may duplicate your data.'
                       ' Please consider using the`keras.utils.Sequence'
-                      ' class.')
+                      ' class.'))
     if steps_per_epoch is None:
       if is_sequence:
         steps_per_epoch = len(generator)
@@ -2098,26 +2140,47 @@ class Model(Network):
     })
     callbacks.on_train_begin()
 
-    if do_validation and not val_gen:
-      if len(validation_data) == 2:
-        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-        val_sample_weight = None
-      elif len(validation_data) == 3:
-        val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
-      else:
-        raise ValueError('`validation_data` should be a tuple '
-                         '`(val_x, val_y, val_sample_weight)` '
-                         'or `(val_x, val_y)`. Found: ' + str(validation_data))
-      val_x, val_y, val_sample_weights = self._standardize_user_data(
-          val_x, val_y, val_sample_weight)
-      val_data = val_x + val_y + val_sample_weights
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        val_data += [0.]
-      for cbk in callbacks:
-        cbk.validation_data = val_data
     enqueuer = None
+    val_enqueuer = None
 
     try:
+      if do_validation:
+        if val_gen:
+          if workers > 0:
+            if isinstance(validation_data, Sequence):
+              val_enqueuer = OrderedEnqueuer(
+                  validation_data, use_multiprocessing=use_multiprocessing)
+              if validation_steps is None:
+                validation_steps = len(validation_data)
+            else:
+              val_enqueuer = GeneratorEnqueuer(
+                  validation_data,
+                  use_multiprocessing=use_multiprocessing,
+                  wait_time=wait_time)
+            val_enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+            validation_generator = val_enqueuer.get()
+          else:
+            validation_generator = validation_data
+        else:
+          if len(validation_data) == 2:
+            val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
+            val_sample_weight = None
+          elif len(validation_data) == 3:
+            val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
+          else:
+            raise ValueError(
+                '`validation_data` should be a tuple '
+                '`(val_x, val_y, val_sample_weight)` '
+                'or `(val_x, val_y)`. Found: ' + str(validation_data))
+          val_x, val_y, val_sample_weights = self._standardize_user_data(
+              val_x, val_y, val_sample_weight)
+          val_data = val_x + val_y + val_sample_weights
+          if self.uses_learning_phase and not isinstance(
+              K.learning_phase(), int):
+            val_data += [0.]
+          for cbk in callbacks:
+            cbk.validation_data = val_data
+
       if workers > 0:
         if is_sequence:
           enqueuer = OrderedEnqueuer(
@@ -2135,6 +2198,8 @@ class Model(Network):
         output_generator = generator
 
       callback_model.stop_training = False
+      # Construct epoch logs.
+      epoch_logs = {}
       while epoch < epochs:
         callbacks.on_epoch_begin(epoch)
         steps_done = 0
@@ -2178,8 +2243,6 @@ class Model(Network):
 
           callbacks.on_batch_end(batch_index, batch_logs)
 
-          # Construct epoch logs.
-          epoch_logs = {}
           batch_index += 1
           steps_done += 1
 
@@ -2187,11 +2250,7 @@ class Model(Network):
           if steps_done >= steps_per_epoch and do_validation:
             if val_gen:
               val_outs = self.evaluate_generator(
-                  validation_data,
-                  validation_steps,
-                  max_queue_size=max_queue_size,
-                  workers=workers,
-                  use_multiprocessing=use_multiprocessing)
+                  validation_generator, validation_steps, workers=0)
             else:
               # No need for try/except because
               # data has already been validated.
@@ -2216,8 +2275,12 @@ class Model(Network):
           break
 
     finally:
-      if enqueuer is not None:
-        enqueuer.stop()
+      try:
+        if enqueuer is not None:
+          enqueuer.stop()
+      finally:
+        if val_enqueuer is not None:
+          val_enqueuer.stop()
 
     callbacks.on_train_end()
     return self.history
@@ -2227,8 +2290,7 @@ class Model(Network):
                          steps=None,
                          max_queue_size=10,
                          workers=1,
-                         use_multiprocessing=False,
-                         **kwargs):
+                         use_multiprocessing=False):
     """Evaluates the model on a data generator.
 
     The generator should return the same kind of data
@@ -2256,7 +2318,6 @@ class Model(Network):
             non picklable arguments to the generator
             as they can't be passed
             easily to children processes.
-        **kwargs: support for legacy arguments.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -2264,23 +2325,13 @@ class Model(Network):
         and/or metrics). The attribute `model.metrics_names` will give you
         the display labels for the scalar outputs.
 
+    Raises:
+        ValueError: in case of invalid arguments.
+
     Raises:
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
-
     self._make_test_function()
 
     steps_done = 0
@@ -2289,10 +2340,11 @@ class Model(Network):
     batch_sizes = []
     is_sequence = isinstance(generator, Sequence)
     if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warning('Using a generator with `use_multiprocessing=True`'
+      logging.warning(
+          UserWarning('Using a generator with `use_multiprocessing=True`'
                       ' and multiple workers may duplicate your data.'
                       ' Please consider using the`keras.utils.Sequence'
-                      ' class.')
+                      ' class.'))
     if steps is None:
       if is_sequence:
         steps = len(generator)
@@ -2368,8 +2420,7 @@ class Model(Network):
                         max_queue_size=10,
                         workers=1,
                         use_multiprocessing=False,
-                        verbose=0,
-                        **kwargs):
+                        verbose=0):
     """Generates predictions for the input samples from a data generator.
 
     The generator should return the same kind of data as accepted by
@@ -2377,9 +2428,9 @@ class Model(Network):
 
     Arguments:
         generator: Generator yielding batches of input samples
-                or an instance of Sequence (keras.utils.Sequence)
-                object in order to avoid duplicate data
-                when using multiprocessing.
+            or an instance of Sequence (keras.utils.Sequence)
+            object in order to avoid duplicate data
+            when using multiprocessing.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
             Optional for `Sequence`: if unspecified, will use
@@ -2397,7 +2448,6 @@ class Model(Network):
             as they can't be passed
             easily to children processes.
         verbose: verbosity mode, 0 or 1.
-        **kwargs: support for legacy arguments.
 
     Returns:
         Numpy array(s) of predictions.
@@ -2406,17 +2456,6 @@ class Model(Network):
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-
     self._make_predict_function()
 
     steps_done = 0
@@ -2424,10 +2463,11 @@ class Model(Network):
     all_outs = []
     is_sequence = isinstance(generator, Sequence)
     if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warn('Using a generator with `use_multiprocessing=True`'
-                   ' and multiple workers may duplicate your data.'
-                   ' Please consider using the`keras.utils.Sequence'
-                   ' class.')
+      logging.warning(
+          UserWarning('Using a generator with `use_multiprocessing=True`'
+                      ' and multiple workers may duplicate your data.'
+                      ' Please consider using the`keras.utils.Sequence'
+                      ' class.'))
     if steps is None:
       if is_sequence:
         steps = len(generator)
@@ -2498,6 +2538,6 @@ class Model(Network):
       else:
         return np.concatenate(all_outs[0])
     if steps_done == 1:
-      return [out for out in all_outs]
+      return [out[0] for out in all_outs]
     else:
       return [np.concatenate(out) for out in all_outs]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index 7650bfb6e8..5a033a04ad 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -28,6 +28,11 @@ from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.engine.training import _weighted_masked_objective
 from tensorflow.python.platform import test
 
+try:
+  import scipy.sparse as scipy_sparse  # pylint: disable=g-import-not-at-top
+except ImportError:
+  scipy_sparse = None
+
 
 class TrainingTest(test.TestCase):
 
@@ -169,7 +174,7 @@ class TrainingTest(test.TestCase):
       with self.assertRaises(ValueError):
         model.train_on_batch({'input_a': input_a_np},
                              [output_d_np, output_e_np])
-      with self.assertRaises(TypeError):
+      with self.assertRaises(AttributeError):
         model.fit(
             [input_a_np, input_b_np], [output_d_np, output_e_np],
             epochs=1,
@@ -177,7 +182,7 @@ class TrainingTest(test.TestCase):
             verbose=0)
       with self.assertRaises(ValueError):
         model.train_on_batch([input_a_np], [output_d_np, output_e_np])
-      with self.assertRaises(TypeError):
+      with self.assertRaises(AttributeError):
         model.train_on_batch(1, [output_d_np, output_e_np])
       with self.assertRaises(ValueError):
         model.train_on_batch(input_a_np, [output_d_np, output_e_np])
@@ -312,6 +317,63 @@ class TrainingTest(test.TestCase):
         model.compile(loss=None,
                       optimizer='rmsprop')
 
+  def test_training_on_sparse_data_with_dense_placeholders(self):
+    if scipy_sparse is None:
+      return
+
+    test_inputs = [
+        scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)]
+    test_outputs = [
+        scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)]
+    in1 = keras.layers.Input(shape=(3,))
+    in2 = keras.layers.Input(shape=(3,))
+    out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
+    out2 = keras.layers.Dense(4, name='dense_1')(in2)
+    model = keras.Model([in1, in2], [out1, out2])
+    model.predict(test_inputs, batch_size=2)
+    model.compile('rmsprop', 'mse')
+    model.fit(test_inputs, test_outputs,
+              epochs=1, batch_size=2, validation_split=0.5)
+    model.evaluate(test_inputs, test_outputs, batch_size=2)
+
+  def test_that_trainable_disables_updates(self):
+    val_a = np.random.random((10, 4))
+    val_out = np.random.random((10, 4))
+
+    with self.test_session():
+      a = keras.layers.Input(shape=(4,))
+      layer = keras.layers.BatchNormalization(input_shape=(4,))
+      b = layer(a)
+      model = keras.Model(a, b)
+
+      model.trainable = False
+      assert not model.updates
+
+      model.compile('sgd', 'mse')
+      assert not model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
+      model.trainable = True
+      model.compile('sgd', 'mse')
+      assert model.updates
+
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      assert np.abs(np.sum(x1 - x2)) > 1e-5
+
+      layer.trainable = False
+      model.compile('sgd', 'mse')
+      assert not model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
 
 class LossWeightingTest(test.TestCase):
 
@@ -869,25 +931,6 @@ class TestGeneratorMethods(test.TestCase):
                                  use_multiprocessing=False,
                                  workers=0)
 
-        # Test legacy API
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
-                            max_q_size=10,
-                            workers=4,
-                            pickle_safe=True)
-        model.predict_generator(custom_generator(),
-                                steps=5,
-                                max_q_size=10,
-                                workers=2,
-                                pickle_safe=True)
-        model.evaluate_generator(custom_generator(),
-                                 steps=5,
-                                 max_q_size=10,
-                                 workers=2,
-                                 pickle_safe=True)
-
   def test_generator_methods_with_sample_weights(self):
     arr_data = np.random.random((50, 2))
     arr_labels = np.random.random((50,))
@@ -960,7 +1003,7 @@ class TestGeneratorMethods(test.TestCase):
                             use_multiprocessing=False,
                             validation_data=custom_generator(),
                             validation_steps=10)
-      with self.assertRaises(TypeError):
+      with self.assertRaises(AttributeError):
         model.predict_generator(custom_generator(),
                                 steps=5,
                                 max_queue_size=10,
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
index e4b9afd38a..ffbf77c4b8 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
@@ -14,18 +14,18 @@
 # ==============================================================================
 """Layers that act as activation functions.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 
 
 class LeakyReLU(Layer):
@@ -61,6 +61,7 @@ class LeakyReLU(Layer):
     base_config = super(LeakyReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
@@ -114,9 +115,9 @@ class PReLU(Layer):
     else:
       self.shared_axes = list(shared_axes)
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    param_shape = input_shape[1:]
+    param_shape = list(input_shape[1:])
     self.param_broadcast = [False] * len(param_shape)
     if self.shared_axes is not None:
       for i in self.shared_axes:
@@ -140,15 +141,13 @@ class PReLU(Layer):
   def call(self, inputs, mask=None):
     pos = K.relu(inputs)
     if K.backend() == 'theano':
-      neg = (K.pattern_broadcast(self.alpha, self.param_broadcast) *
-             (inputs - K.abs(inputs)) * 0.5)
+      neg = (
+          K.pattern_broadcast(self.alpha, self.param_broadcast) *
+          (inputs - K.abs(inputs)) * 0.5)
     else:
       neg = -self.alpha * K.relu(-inputs)
     return pos + neg
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
   def get_config(self):
     config = {
         'alpha_initializer': initializers.serialize(self.alpha_initializer),
@@ -159,6 +158,10 @@ class PReLU(Layer):
     base_config = super(PReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
 class ELU(Layer):
   """Exponential Linear Unit.
@@ -188,14 +191,15 @@ class ELU(Layer):
   def call(self, inputs):
     return K.elu(inputs, self.alpha)
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
   def get_config(self):
     config = {'alpha': float(self.alpha)}
     base_config = super(ELU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
 class ThresholdedReLU(Layer):
   """Thresholded Rectified Linear Unit.
@@ -223,12 +227,46 @@ class ThresholdedReLU(Layer):
     self.theta = K.cast_to_floatx(theta)
 
   def call(self, inputs, mask=None):
-    return inputs * K.cast(inputs > self.theta, K.floatx())
+    return inputs * K.cast(K.greater(inputs, self.theta), K.floatx())
+
+  def get_config(self):
+    config = {'theta': float(self.theta)}
+    base_config = super(ThresholdedReLU, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     return input_shape
 
+
+class Softmax(Layer):
+  """Softmax activation function.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as the input.
+
+  Arguments:
+      axis: Integer, axis along which the softmax normalization is applied.
+  """
+
+  def __init__(self, axis=-1, **kwargs):
+    super(Softmax, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.axis = axis
+
+  def call(self, inputs):
+    return activations.softmax(inputs, axis=self.axis)
+
   def get_config(self):
-    config = {'theta': float(self.theta)}
-    base_config = super(ThresholdedReLU, self).get_config()
+    config = {'axis': self.axis}
+    base_config = super(Softmax, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py
index 91efab30ed..343b7949ac 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py
@@ -56,6 +56,12 @@ class AdvancedActivationsTest(test.TestCase):
                                kwargs={'theta': 0.5},
                                input_shape=(2, 3, 4))
 
+  def test_softmax(self):
+    with self.test_session():
+      testing_utils.layer_test(keras.layers.Softmax,
+                               kwargs={'axis': 1},
+                               input_shape=(2, 3, 4))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 22496e8a76..b2ad4c4b65 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -711,6 +711,144 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
+  """Depthwise separable 1D convolution.
+
+  This layer performs a depthwise convolution that acts separately on
+  channels, followed by a pointwise convolution that mixes channels.
+  If `use_bias` is True and a bias initializer is provided,
+  it adds a bias vector to the output.
+  It then optionally applies an activation function to produce the final output.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: A single integer specifying the spatial
+      dimensions of the filters.
+    strides: A single integer specifying the strides
+      of the convolution.
+      Specifying any `stride` value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    dilation_rate: A single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    depth_multiplier: The number of depthwise convolution output channels for
+      each input channel. The total number of depthwise convolution output
+      channels will be equal to `num_filters_in * depth_multiplier`.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    depthwise_initializer: An initializer for the depthwise convolution kernel.
+    pointwise_initializer: An initializer for the pointwise convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    depthwise_regularizer: Optional regularizer for the depthwise
+      convolution kernel.
+    pointwise_regularizer: Optional regularizer for the pointwise
+      convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    depthwise_constraint: Optional projection function to be applied to the
+        depthwise kernel after being updated by an `Optimizer` (e.g. used for
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    pointwise_constraint: Optional projection function to be applied to the
+        pointwise kernel after being updated by an `Optimizer`.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               depth_multiplier=1,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               pointwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               pointwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               pointwise_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    if data_format is None:
+      data_format = K.image_data_format()
+    super(SeparableConv1D, self).__init__(
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activations.get(activation),
+        use_bias=use_bias,
+        depthwise_initializer=initializers.get(depthwise_initializer),
+        pointwise_initializer=initializers.get(pointwise_initializer),
+        bias_initializer=initializers.get(bias_initializer),
+        depthwise_regularizer=regularizers.get(depthwise_regularizer),
+        pointwise_regularizer=regularizers.get(pointwise_regularizer),
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
+        depthwise_constraint=constraints.get(depthwise_constraint),
+        pointwise_constraint=constraints.get(pointwise_constraint),
+        bias_constraint=constraints.get(bias_constraint),
+        **kwargs)
+
+  def get_config(self):
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'depthwise_initializer':
+            initializers.serialize(self.depthwise_initializer),
+        'pointwise_initializer':
+            initializers.serialize(self.pointwise_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'depthwise_regularizer':
+            regularizers.serialize(self.depthwise_regularizer),
+        'pointwise_regularizer':
+            regularizers.serialize(self.pointwise_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'depthwise_constraint':
+            constraints.serialize(self.depthwise_constraint),
+        'pointwise_constraint':
+            constraints.serialize(self.pointwise_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(SeparableConv1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
   """Depthwise separable 2D convolution.
 
@@ -1663,6 +1801,7 @@ class Cropping3D(Layer):
 Convolution1D = Conv1D
 Convolution2D = Conv2D
 Convolution3D = Conv3D
+SeparableConvolution1D = SeparableConv1D
 SeparableConvolution2D = SeparableConv2D
 Convolution2DTranspose = Conv2DTranspose
 Convolution3DTranspose = Conv3DTranspose
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index 4f0e9fc691..565db19e41 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.layers.recurrent import Recurrent
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 
@@ -127,10 +127,10 @@ class ConvRecurrent2D(Recurrent):
     self.input_spec = [InputSpec(ndim=5)]
     self.state_spec = None
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       rows = input_shape[3]
       cols = input_shape[4]
@@ -151,30 +151,28 @@ class ConvRecurrent2D(Recurrent):
         dilation=self.dilation_rate[1])
     if self.return_sequences:
       if self.data_format == 'channels_first':
-        output_shape = [input_shape[0], input_shape[1],
-                        self.filters, rows, cols]
+        output_shape = (input_shape[0], input_shape[1], self.filters, rows,
+                        cols)
       elif self.data_format == 'channels_last':
-        output_shape = [input_shape[0], input_shape[1],
-                        rows, cols, self.filters]
+        output_shape = (input_shape[0], input_shape[1], rows, cols,
+                        self.filters)
     else:
       if self.data_format == 'channels_first':
-        output_shape = [input_shape[0], self.filters, rows, cols]
+        output_shape = (input_shape[0], self.filters, rows, cols)
       elif self.data_format == 'channels_last':
-        output_shape = [input_shape[0], rows, cols, self.filters]
+        output_shape = (input_shape[0], rows, cols, self.filters)
 
     if self.return_state:
       if self.data_format == 'channels_first':
-        output_shapes = [output_shape] + [(input_shape[0],
-                                           self.filters,
-                                           rows,
-                                           cols) for _ in range(2)]
+        output_shape = [output_shape] + [
+            (input_shape[0], self.filters, rows, cols) for _ in range(2)
+        ]
       elif self.data_format == 'channels_last':
-        output_shapes = [output_shape] + [(input_shape[0],
-                                           rows,
-                                           cols,
-                                           self.filters) for _ in range(2)]
-      return [tensor_shape.TensorShape(shape) for shape in output_shapes]
-    return tensor_shape.TensorShape(output_shape)
+        output_shape = [output_shape] + [
+            (input_shape[0], rows, cols, self.filters) for _ in range(2)
+        ]
+
+    return output_shape
 
   def get_config(self):
     config = {
@@ -294,11 +292,6 @@ class ConvLSTM2D(ConvRecurrent2D):
   Raises:
       ValueError: in case of invalid constructor arguments.
 
-  References:
-      - [Convolutional LSTM Network: A Machine Learning Approach for
-      Precipitation Nowcasting](http://arxiv.org/abs/1506.04214v1)
-      The current implementation does not include the feedback loop on the
-      cells output
   """
 
   def __init__(self,
@@ -338,7 +331,6 @@ class ConvLSTM2D(ConvRecurrent2D):
         return_sequences=return_sequences,
         go_backwards=go_backwards,
         stateful=stateful,
-        activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
     self.activation = activations.get(activation)
     self.recurrent_activation = activations.get(recurrent_activation)
@@ -352,6 +344,7 @@ class ConvLSTM2D(ConvRecurrent2D):
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
 
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.recurrent_constraint = constraints.get(recurrent_constraint)
@@ -361,13 +354,12 @@ class ConvLSTM2D(ConvRecurrent2D):
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.state_spec = [InputSpec(ndim=4), InputSpec(ndim=4)]
 
+  @shape_type_conversion
   def build(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
     batch_size = input_shape[0] if self.stateful else None
     self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:])
-
     if self.stateful:
       self.reset_states()
     else:
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
index be7da6f2b4..39c9d4f0fb 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
@@ -311,6 +311,72 @@ class Conv3DTransposeTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
+class SeparableConv1DTest(test.TestCase):
+
+  def test_separable_conv_1d(self):
+    num_samples = 2
+    filters = 6
+    stack_size = 3
+    length = 7
+    strides = 1
+
+    for padding in ['valid', 'same']:
+      for multiplier in [1, 2]:
+        if padding == 'same' and strides != 1:
+          continue
+
+        with self.test_session(use_gpu=True):
+          testing_utils.layer_test(
+              keras.layers.SeparableConv1D,
+              kwargs={
+                  'filters': filters,
+                  'kernel_size': 3,
+                  'padding': padding,
+                  'strides': strides,
+                  'depth_multiplier': multiplier
+              },
+              input_shape=(num_samples, length, stack_size))
+
+  def test_separable_conv1d_regularizers(self):
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'depthwise_regularizer': 'l2',
+        'pointwise_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.test_session(use_gpu=True):
+      layer = keras.layers.SeparableConv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones((1, 5, 2))))
+      self.assertEqual(len(layer.losses), 4)
+
+  def test_separable_conv1d_constraints(self):
+    d_constraint = lambda x: x
+    p_constraint = lambda x: x
+    b_constraint = lambda x: x
+
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'pointwise_constraint': p_constraint,
+        'depthwise_constraint': d_constraint,
+        'bias_constraint': b_constraint,
+        'strides': 1
+    }
+    with self.test_session(use_gpu=True):
+      layer = keras.layers.SeparableConv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
+      self.assertEqual(layer.pointwise_kernel.constraint, p_constraint)
+      self.assertEqual(layer.bias.constraint, b_constraint)
+
+
 class SeparableConv2DTest(test.TestCase):
 
   def test_separable_conv_2d(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 51c520be38..f8e31068f8 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -18,12 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 
 
 class Embedding(Layer):
@@ -58,13 +58,13 @@ class Embedding(Layer):
     output_dim: int >= 0. Dimension of the dense embedding.
     embeddings_initializer: Initializer for the `embeddings` matrix.
     embeddings_regularizer: Regularizer function applied to
-          the `embeddings` matrix.
+        the `embeddings` matrix.
     embeddings_constraint: Constraint function applied to
-          the `embeddings` matrix.
+        the `embeddings` matrix.
     mask_zero: Whether or not the input value 0 is a special "padding"
         value that should be masked out.
-        This is useful when using recurrent layers,
-        which may take variable length inputs.
+        This is useful when using recurrent layers
+        which may take variable length input.
         If this is `True` then all subsequent layers
         in the model need to support masking or an exception will be raised.
         If mask_zero is set to True, as a consequence, index 0 cannot be
@@ -81,9 +81,6 @@ class Embedding(Layer):
   Output shape:
       3D tensor with shape: `(batch_size, sequence_length, output_dim)`.
 
-  References:
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural
-        Networks](http://arxiv.org/abs/1512.05287)
   """
 
   def __init__(self,
@@ -101,19 +98,19 @@ class Embedding(Layer):
         kwargs['input_shape'] = (input_length,)
       else:
         kwargs['input_shape'] = (None,)
-    super(Embedding, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(Embedding, self).__init__(**kwargs)
 
     self.input_dim = input_dim
     self.output_dim = output_dim
     self.embeddings_initializer = initializers.get(embeddings_initializer)
     self.embeddings_regularizer = regularizers.get(embeddings_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
     self.embeddings_constraint = constraints.get(embeddings_constraint)
     self.mask_zero = mask_zero
     self.input_length = input_length
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     self.embeddings = self.add_weight(
         shape=(self.input_dim, self.output_dim),
         initializer=self.embeddings_initializer,
@@ -129,10 +126,10 @@ class Embedding(Layer):
     else:
       return K.not_equal(inputs, 0)
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.input_length is None:
-      return tensor_shape.TensorShape(input_shape + [self.output_dim])
+      return input_shape + (self.output_dim,)
     else:
       # input_length can be tuple if input is 3D or higher
       if isinstance(self.input_length, (list, tuple)):
@@ -149,8 +146,7 @@ class Embedding(Layer):
                        (str(self.input_length), str(input_shape)))
           elif s1 is None:
             in_lens[i] = s2
-      return tensor_shape.TensorShape(
-          (input_shape[0],) + tuple(in_lens) + (self.output_dim,))
+      return (input_shape[0],) + tuple(in_lens) + (self.output_dim,)
 
   def call(self, inputs):
     if K.dtype(inputs) != 'int32':
diff --git a/tensorflow/python/keras/_impl/keras/layers/local.py b/tensorflow/python/keras/_impl/keras/layers/local.py
index 0a31b87fb5..b844b071e0 100644
--- a/tensorflow/python/keras/_impl/keras/layers/local.py
+++ b/tensorflow/python/keras/_impl/keras/layers/local.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
@@ -26,6 +25,7 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 
 
@@ -98,8 +98,7 @@ class LocallyConnected1D(Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    super(LocallyConnected1D, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(LocallyConnected1D, self).__init__(**kwargs)
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(kernel_size, 1, 'kernel_size')
     self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
@@ -114,12 +113,13 @@ class LocallyConnected1D(Layer):
     self.bias_initializer = initializers.get(bias_initializer)
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
     self.input_spec = InputSpec(ndim=3)
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     input_dim = input_shape[2]
     if input_dim is None:
       raise ValueError('Axis 2 of input should be fully-defined. '
@@ -146,15 +146,14 @@ class LocallyConnected1D(Layer):
     self.input_spec = InputSpec(ndim=3, axes={2: input_dim})
     self.built = True
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     length = conv_utils.conv_output_length(input_shape[1], self.kernel_size[0],
                                            self.padding, self.strides[0])
-    return tensor_shape.TensorShape([input_shape[0], length, self.filters])
+    return (input_shape[0], length, self.filters)
 
   def call(self, inputs):
     output = K.local_conv1d(inputs, self.kernel, self.kernel_size, self.strides)
-
     if self.use_bias:
       output = K.bias_add(output, self.bias)
     if self.activation is not None:
@@ -163,20 +162,32 @@ class LocallyConnected1D(Layer):
 
   def get_config(self):
     config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'filters':
+            self.filters,
+        'kernel_size':
+            self.kernel_size,
+        'strides':
+            self.strides,
+        'padding':
+            self.padding,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
     }
     base_config = super(LocallyConnected1D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -273,8 +284,7 @@ class LocallyConnected2D(Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    super(LocallyConnected2D, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(LocallyConnected2D, self).__init__(**kwargs)
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
     self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
@@ -289,12 +299,13 @@ class LocallyConnected2D(Layer):
     self.bias_initializer = initializers.get(bias_initializer)
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
     self.input_spec = InputSpec(ndim=4)
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_last':
       input_row, input_col = input_shape[1:-1]
       input_filter = input_shape[3]
@@ -306,7 +317,6 @@ class LocallyConnected2D(Layer):
                        ' a LocallyConnected2D layer '
                        'should be fully-defined, but layer received '
                        'the inputs shape ' + str(input_shape))
-
     output_row = conv_utils.conv_output_length(input_row, self.kernel_size[0],
                                                self.padding, self.strides[0])
     output_col = conv_utils.conv_output_length(input_col, self.kernel_size[1],
@@ -337,33 +347,30 @@ class LocallyConnected2D(Layer):
       self.input_spec = InputSpec(ndim=4, axes={-1: input_filter})
     self.built = True
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       rows = input_shape[2]
       cols = input_shape[3]
     elif self.data_format == 'channels_last':
       rows = input_shape[1]
       cols = input_shape[2]
+
     rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
                                          self.padding, self.strides[0])
     cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
                                          self.padding, self.strides[1])
 
     if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], self.filters, rows, cols])
+      return (input_shape[0], self.filters, rows, cols)
     elif self.data_format == 'channels_last':
-      return tensor_shape.TensorShape(
-          [input_shape[0], rows, cols, self.filters])
+      return (input_shape[0], rows, cols, self.filters)
 
   def call(self, inputs):
-    output = K.local_conv2d(inputs,
-                            self.kernel,
-                            self.kernel_size,
-                            self.strides,
+    output = K.local_conv2d(inputs, self.kernel, self.kernel_size, self.strides,
                             (self.output_row, self.output_col),
                             self.data_format)
+
     if self.use_bias:
       output = K.bias_add(output, self.bias, data_format=self.data_format)
 
@@ -372,21 +379,34 @@ class LocallyConnected2D(Layer):
 
   def get_config(self):
     config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'filters':
+            self.filters,
+        'kernel_size':
+            self.kernel_size,
+        'strides':
+            self.strides,
+        'padding':
+            self.padding,
+        'data_format':
+            self.data_format,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
     }
     base_config = super(LocallyConnected2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py
index 76eb03cf27..38b0b30297 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge.py
@@ -14,15 +14,15 @@
 # ==============================================================================
 # pylint: disable=not-callable
 # pylint: disable=redefined-builtin
-"""Layers can merge several input tensors into a single output tensor.
+"""Layers that can merge several inputs into one.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine.topology import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 
 
 class _Merge(Layer):
@@ -73,12 +73,13 @@ class _Merge(Layer):
         output_shape.append(i)
       else:
         if i != j:
-          raise ValueError('Operands could not be broadcast '
-                           'together with shapes ' + str(shape1) + ' ' +
-                           str(shape2))
+          raise ValueError(
+              'Operands could not be broadcast '
+              'together with shapes ' + str(shape1) + ' ' + str(shape2))
         output_shape.append(i)
     return tuple(output_shape)
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list):
@@ -87,14 +88,13 @@ class _Merge(Layer):
       raise ValueError('A merge layer should be called '
                        'on a list of at least 2 inputs. '
                        'Got ' + str(len(input_shape)) + ' inputs.')
-    input_shape = [tensor_shape.TensorShape(s).as_list() for s in input_shape]
     batch_sizes = [s[0] for s in input_shape if s is not None]
     batch_sizes = set(batch_sizes)
     batch_sizes -= set([None])
     if len(batch_sizes) > 1:
-      raise ValueError('Can not merge tensors with different '
-                       'batch sizes. Got tensors with shapes : ' +
-                       str(input_shape))
+      raise ValueError(
+          'Can not merge tensors with different '
+          'batch sizes. Got tensors with shapes : ' + str(input_shape))
     if input_shape[0] is None:
       output_shape = None
     else:
@@ -111,9 +111,10 @@ class _Merge(Layer):
       self._reshape_required = False
     else:
       self._reshape_required = True
-    self.built = True
 
   def call(self, inputs):
+    if not isinstance(inputs, list):
+      raise ValueError('A merge layer should be called ' 'on a list of inputs.')
     if self._reshape_required:
       reshaped_inputs = []
       input_ndims = list(map(K.ndim, inputs))
@@ -172,6 +173,7 @@ class _Merge(Layer):
     else:
       return self._merge_function(inputs)
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if input_shape[0] is None:
       output_shape = None
@@ -214,6 +216,22 @@ class Add(_Merge):
   It takes as input a list of tensors,
   all of the same shape, and returns
   a single tensor (also of the same shape).
+
+  Examples:
+
+  ```python
+      import keras
+
+      input1 = keras.layers.Input(shape=(16,))
+      x1 = keras.layers.Dense(8, activation='relu')(input1)
+      input2 = keras.layers.Input(shape=(32,))
+      x2 = keras.layers.Dense(8, activation='relu')(input2)
+      added = keras.layers.Add()([x1, x2])  # equivalent to added =
+      keras.layers.add([x1, x2])
+
+      out = keras.layers.Dense(4)(added)
+      model = keras.models.Model(inputs=[input1, input2], outputs=out)
+  ```
   """
 
   def _merge_function(self, inputs):
@@ -247,10 +265,17 @@ class Subtract(_Merge):
   ```
   """
 
+  @shape_type_conversion
+  def build(self, input_shape):
+    super(Subtract, self).build(input_shape)
+    if len(input_shape) != 2:
+      raise ValueError('A `Subtract` layer should be called '
+                       'on exactly 2 inputs')
+
   def _merge_function(self, inputs):
     if len(inputs) != 2:
-      raise ValueError('`Subtract` layer should be called '
-                       'on exactly 2 inputs. Received: %s' % inputs)
+      raise ValueError('A `Subtract` layer should be called '
+                       'on exactly 2 inputs')
     return inputs[0] - inputs[1]
 
 
@@ -330,47 +355,43 @@ class Concatenate(_Merge):
     super(Concatenate, self).__init__(**kwargs)
     self.axis = axis
     self.supports_masking = True
+    self._reshape_required = False
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
-    if not (isinstance(input_shape, list) and len(input_shape) > 1):
-      raise ValueError('`Concatenate` layer should be called '
-                       'on a list containing at least two inputs')
+    if not isinstance(input_shape, list) or len(input_shape) < 2:
+      raise ValueError('A `Concatenate` layer should be called '
+                       'on a list of at least 2 inputs')
     if all([shape is None for shape in input_shape]):
       return
-    reduced_inputs_shapes = [
-        tensor_shape.TensorShape(shape).as_list() for shape in input_shape
-    ]
+    reduced_inputs_shapes = [list(shape) for shape in input_shape]
     shape_set = set()
     for i in range(len(reduced_inputs_shapes)):
       del reduced_inputs_shapes[i][self.axis]
       shape_set.add(tuple(reduced_inputs_shapes[i]))
     if len(shape_set) > 1:
-      raise ValueError('`Concatenate` layer requires '
+      raise ValueError('A `Concatenate` layer requires '
                        'inputs with matching shapes '
                        'except for the concat axis. '
                        'Got inputs shapes: %s' % (input_shape))
-    self.built = True
 
-  def call(self, inputs):
-    if not isinstance(inputs, list):
-      raise ValueError('A `Concatenate` layer should be called '
-                       'on a list of inputs.')
+  def _merge_function(self, inputs):
     return K.concatenate(inputs, axis=self.axis)
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if not isinstance(input_shape, list):
       raise ValueError('A `Concatenate` layer should be called '
                        'on a list of inputs.')
     input_shapes = input_shape
-    output_shape = tensor_shape.TensorShape(input_shapes[0]).as_list()
+    output_shape = list(input_shapes[0])
     for shape in input_shapes[1:]:
-      shape = tensor_shape.TensorShape(shape).as_list()
       if output_shape[self.axis] is None or shape[self.axis] is None:
         output_shape[self.axis] = None
         break
       output_shape[self.axis] += shape[self.axis]
-    return tensor_shape.TensorShape(output_shape)
+    return tuple(output_shape)
 
   def compute_mask(self, inputs, mask=None):
     if mask is None:
@@ -390,7 +411,7 @@ class Concatenate(_Merge):
     masks = []
     for input_i, mask_i in zip(inputs, mask):
       if mask_i is None:
-        # Input is unmasked. Append all 1s to masks
+        # Input is unmasked. Append all 1s to masks,
         masks.append(K.ones_like(input_i, dtype='bool'))
       elif K.ndim(mask_i) < K.ndim(input_i):
         # Mask is smaller than the input, expand it
@@ -441,14 +462,16 @@ class Dot(_Merge):
     self.axes = axes
     self.normalize = normalize
     self.supports_masking = True
+    self._reshape_required = False
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list) or len(input_shape) != 2:
       raise ValueError('A `Dot` layer should be called '
                        'on a list of 2 inputs.')
-    shape1 = tensor_shape.TensorShape(input_shape[0]).as_list()
-    shape2 = tensor_shape.TensorShape(input_shape[1]).as_list()
+    shape1 = input_shape[0]
+    shape2 = input_shape[1]
     if shape1 is None or shape2 is None:
       return
     if isinstance(self.axes, int):
@@ -462,9 +485,10 @@ class Dot(_Merge):
       raise ValueError('Dimension incompatibility '
                        '%s != %s. ' % (shape1[axes[0]], shape2[axes[1]]) +
                        'Layer shapes: %s, %s' % (shape1, shape2))
-    self.built = True
 
-  def call(self, inputs):
+  def _merge_function(self, inputs):
+    if len(inputs) != 2:
+      raise ValueError('A `Dot` layer should be called ' 'on exactly 2 inputs')
     x1 = inputs[0]
     x2 = inputs[1]
     if isinstance(self.axes, int):
@@ -485,12 +509,13 @@ class Dot(_Merge):
     output = K.batch_dot(x1, x2, axes)
     return output
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if not isinstance(input_shape, list) or len(input_shape) != 2:
       raise ValueError('A `Dot` layer should be called '
                        'on a list of 2 inputs.')
-    shape1 = tensor_shape.TensorShape(input_shape[0]).as_list()
-    shape2 = tensor_shape.TensorShape(input_shape[1]).as_list()
+    shape1 = list(input_shape[0])
+    shape2 = list(input_shape[1])
     if isinstance(self.axes, int):
       if self.axes < 0:
         axes = [self.axes % len(shape1), self.axes % len(shape2)]
@@ -504,7 +529,7 @@ class Dot(_Merge):
     output_shape = shape1 + shape2
     if len(output_shape) == 1:
       output_shape += [1]
-    return tensor_shape.TensorShape(output_shape)
+    return tuple(output_shape)
 
   def compute_mask(self, inputs, mask=None):
     return None
@@ -527,6 +552,21 @@ def add(inputs, **kwargs):
 
   Returns:
       A tensor, the sum of the inputs.
+
+  Examples:
+
+  ```python
+      import keras
+
+      input1 = keras.layers.Input(shape=(16,))
+      x1 = keras.layers.Dense(8, activation='relu')(input1)
+      input2 = keras.layers.Input(shape=(32,))
+      x2 = keras.layers.Dense(8, activation='relu')(input2)
+      added = keras.layers.add([x1, x2])
+
+      out = keras.layers.Dense(4)(added)
+      model = keras.models.Model(inputs=[input1, input2], outputs=out)
+  ```
   """
   return Add(**kwargs)(inputs)
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/_impl/keras/layers/noise.py
index 459f13145f..04fffcc384 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise.py
+++ b/tensorflow/python/keras/_impl/keras/layers/noise.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Layers for regularization models via the addition of noise.
+"""Layers that operate regularization via the addition of noise.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 
 
 class GaussianNoise(Layer):
@@ -59,14 +60,15 @@ class GaussianNoise(Layer):
 
     return K.in_train_phase(noised, inputs, training=training)
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
   def get_config(self):
     config = {'stddev': self.stddev}
     base_config = super(GaussianNoise, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
 class GaussianDropout(Layer):
   """Apply multiplicative 1-centered Gaussian noise.
@@ -86,10 +88,6 @@ class GaussianDropout(Layer):
   Output shape:
       Same shape as input.
 
-  References:
-      - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting
-        Srivastava, Hinton, et al.
-        2014](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)
   """
 
   def __init__(self, rate, **kwargs):
@@ -108,14 +106,15 @@ class GaussianDropout(Layer):
       return K.in_train_phase(noised, inputs, training=training)
     return inputs
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
   def get_config(self):
     config = {'rate': self.rate}
     base_config = super(GaussianDropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
 class AlphaDropout(Layer):
   """Applies Alpha Dropout to the input.
@@ -140,8 +139,6 @@ class AlphaDropout(Layer):
   Output shape:
       Same shape as input.
 
-  References:
-      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
@@ -157,26 +154,34 @@ class AlphaDropout(Layer):
   def call(self, inputs, training=None):
     if 0. < self.rate < 1.:
       noise_shape = self._get_noise_shape(inputs)
-      alpha = 1.6732632423543772848170429916717
-      scale = 1.0507009873554804934193349852946
 
-      def dropped_inputs(inputs=inputs, rate=self.rate, seed=self.seed):
+      def dropped_inputs(inputs=inputs, rate=self.rate, seed=self.seed):  # pylint: disable=missing-docstring
+        alpha = 1.6732632423543772848170429916717
+        scale = 1.0507009873554804934193349852946
         alpha_p = -alpha * scale
-        kept_idx = K.greater_equal(K.random_uniform(noise_shape, seed=seed),
-                                   rate)
+
+        kept_idx = K.greater_equal(
+            K.random_uniform(noise_shape, seed=seed), rate)
         kept_idx = K.cast(kept_idx, K.floatx())
-        a = ((1 - rate) * (1 + rate * alpha_p ** 2)) ** -0.5
+
+        # Get affine transformation params
+        a = ((1 - rate) * (1 + rate * alpha_p**2))**-0.5
         b = -a * alpha_p * rate
+
+        # Apply mask
         x = inputs * kept_idx + alpha_p * (1 - kept_idx)
+
+        # Do affine transformation
         return a * x + b
 
       return K.in_train_phase(dropped_inputs, inputs, training=training)
     return inputs
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
   def get_config(self):
     config = {'rate': self.rate}
     base_config = super(AlphaDropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 9ea21c9c36..1b0f6cb6cf 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=protected-access
-"""Recurrent layers.
+"""Recurrent layers and their base classes.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -29,6 +29,7 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.platform import tf_logging as logging
 
@@ -109,6 +110,7 @@ class StackedRNNCells(Layer):
       states += cell_states
     return inputs, states
 
+  @shape_type_conversion
   def build(self, input_shape):
     for cell in self.cells:
       if isinstance(cell, Layer):
@@ -117,7 +119,7 @@ class StackedRNNCells(Layer):
         output_dim = cell.state_size[0]
       else:
         output_dim = cell.state_size
-      input_shape = (input_shape[0], input_shape[1], output_dim)
+      input_shape = (input_shape[0], output_dim)
     self.built = True
 
   def get_config(self):
@@ -262,8 +264,7 @@ class RNN(Layer):
           (e.g. via the `input_shape` argument)
 
   Input shape:
-      3D tensor with shape `(batch_size, timesteps, input_dim)`,
-      (Optional) 2D tensors with shape `(batch_size, output_dim)`.
+      3D tensor with shape `(batch_size, timesteps, input_dim)`.
 
   Output shape:
       - if `return_state`: a list of tensors. The first tensor is
@@ -370,7 +371,6 @@ class RNN(Layer):
                go_backwards=False,
                stateful=False,
                unroll=False,
-               activity_regularizer=None,
                **kwargs):
     if isinstance(cell, (list, tuple)):
       cell = StackedRNNCells(cell)
@@ -382,8 +382,7 @@ class RNN(Layer):
                        'an attribute `state_size` '
                        '(tuple of integers, '
                        'one integer per RNN state).')
-    super(RNN, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(RNN, self).__init__(**kwargs)
     self.cell = cell
     self.return_sequences = return_sequences
     self.return_state = return_state
@@ -412,15 +411,16 @@ class RNN(Layer):
   def states(self, states):
     self._states = states
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
 
     if hasattr(self.cell.state_size, '__len__'):
-      output_dim = self.cell.state_size[0]
+      state_size = self.cell.state_size
     else:
-      output_dim = self.cell.state_size
+      state_size = [self.cell.state_size]
+    output_dim = state_size[0]
 
     if self.return_sequences:
       output_shape = (input_shape[0], input_shape[1], output_dim)
@@ -428,11 +428,10 @@ class RNN(Layer):
       output_shape = (input_shape[0], output_dim)
 
     if self.return_state:
-      state_shape = [(input_shape[0], output_dim) for _ in self.states]
-      output_shape = [output_shape] + state_shape
+      state_shape = [(input_shape[0], dim) for dim in state_size]
+      return [output_shape] + state_shape
     else:
-      output_shape = output_shape
-    return tensor_shape.TensorShape(output_shape)
+      return output_shape
 
   def compute_mask(self, inputs, mask):
     if isinstance(mask, list):
@@ -444,6 +443,7 @@ class RNN(Layer):
     else:
       return output_mask
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Note input_shape will be list of shapes of initial states and
     # constants if these are passed in __call__.
@@ -454,7 +454,6 @@ class RNN(Layer):
 
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
 
     batch_size = input_shape[0] if self.stateful else None
     input_dim = input_shape[-1]
@@ -478,9 +477,9 @@ class RNN(Layer):
       # initial_state was passed in call, check compatibility
       if [spec.shape[-1] for spec in self.state_spec] != state_size:
         raise ValueError(
-            'An initial_state was passed that is not compatible with '
+            'An `initial_state` was passed that is not compatible with '
             '`cell.state_size`. Received `state_spec`={}; '
-            'However `cell.state_size` is '
+            'however `cell.state_size` is '
             '{}'.format(self.state_spec, self.cell.state_size))
     else:
       self.state_spec = [InputSpec(shape=(None, dim)) for dim in state_size]
@@ -610,7 +609,8 @@ class RNN(Layer):
         constants=constants,
         go_backwards=self.go_backwards,
         mask=mask,
-        unroll=self.unroll)
+        unroll=self.unroll,
+        input_length=timesteps)
     if self.stateful:
       updates = []
       for i in range(len(states)):
@@ -625,6 +625,8 @@ class RNN(Layer):
     # Properly set learning phase
     if getattr(last_output, '_uses_learning_phase', False):
       output._uses_learning_phase = True
+      for state in states:
+        state._uses_learning_phase = True
 
     if self.return_state:
       if not isinstance(states, (list, tuple)):
@@ -636,7 +638,7 @@ class RNN(Layer):
       return output
 
   def _standardize_args(self, inputs, initial_state, constants):
-    """Standardize `__call__` arguments to a single list of tensor inputs.
+    """Standardize `__call__` to a single list of tensor inputs.
 
     When running a model loaded from file, the input tensors
     `initial_state` and `constants` can be passed to `RNN.__call__` as part
@@ -688,7 +690,7 @@ class RNN(Layer):
                        'a `batch_input_shape` '
                        'argument to your first layer.\n'
                        '- If using the functional API, specify '
-                       'the time dimension by passing a '
+                       'the batch size by passing a '
                        '`batch_shape` argument to your Input layer.')
     # initialize state if None
     if self.states[0] is None:
@@ -788,37 +790,26 @@ class SimpleRNNCell(Layer):
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -866,6 +857,7 @@ class SimpleRNNCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
+  @shape_type_conversion
   def build(self, input_shape):
     self.kernel = self.add_weight(
         shape=(input_shape[-1], self.units),
@@ -890,33 +882,21 @@ class SimpleRNNCell(Layer):
       self.bias = None
     self.built = True
 
-  def _generate_dropout_mask(self, inputs, training=None):
-    if 0 < self.dropout < 1:
-      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._dropout_mask = K.in_train_phase(
-          dropped_inputs, ones, training=training)
-    else:
-      self._dropout_mask = None
-
-  def _generate_recurrent_dropout_mask(self, inputs, training=None):
-    if 0 < self.recurrent_dropout < 1:
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, self.units))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._recurrent_dropout_mask = K.in_train_phase(
-          dropped_inputs, ones, training=training)
-    else:
-      self._recurrent_dropout_mask = None
-
   def call(self, inputs, states, training=None):
     prev_output = states[0]
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs,
+                                 K.shape(inputs)[-1]),
+          self.dropout,
+          training=training)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs, self.units),
+          self.recurrent_dropout,
+          training=training)
+
     dp_mask = self._dropout_mask
     rec_dp_mask = self._recurrent_dropout_mask
 
@@ -939,46 +919,68 @@ class SimpleRNNCell(Layer):
         output._uses_learning_phase = True
     return output, [output]
 
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout
+    }
+    base_config = super(SimpleRNNCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 class SimpleRNN(RNN):
   """Fully-connected RNN where the output is to be fed back to input.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1052,12 +1054,12 @@ class SimpleRNN(RNN):
         go_backwards=go_backwards,
         stateful=stateful,
         unroll=unroll,
-        activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._generate_dropout_mask(inputs, training=training)
-    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    self.cell._dropout_mask = None
+    self.cell._recurrent_dropout_mask = None
     return super(SimpleRNN, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -1119,25 +1121,36 @@ class SimpleRNN(RNN):
 
   def get_config(self):
     config = {
-        'units': self.units,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint),
-        'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout
     }
     base_config = super(SimpleRNN, self).get_config()
     del base_config['cell']
@@ -1155,43 +1168,28 @@ class GRUCell(Layer):
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1249,6 +1247,7 @@ class GRUCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
+  @shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[-1]
     self.kernel = self.add_weight(
@@ -1292,38 +1291,24 @@ class GRUCell(Layer):
       self.bias_h = None
     self.built = True
 
-  def _generate_dropout_mask(self, inputs, training=None):
-    if 0 < self.dropout < 1:
-      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(3)
-      ]
-    else:
-      self._dropout_mask = None
-
-  def _generate_recurrent_dropout_mask(self, inputs, training=None):
-    if 0 < self.recurrent_dropout < 1:
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, self.units))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._recurrent_dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(3)
-      ]
-    else:
-      self._recurrent_dropout_mask = None
-
   def call(self, inputs, states, training=None):
     h_tm1 = states[0]  # previous memory
 
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs,
+                                 K.shape(inputs)[-1]),
+          self.dropout,
+          training=training,
+          count=3)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs, self.units),
+          self.recurrent_dropout,
+          training=training,
+          count=3)
+
     # dropout matrices for input units
     dp_mask = self._dropout_mask
     # dropout matrices for recurrent units
@@ -1387,55 +1372,76 @@ class GRUCell(Layer):
         h._uses_learning_phase = True
     return h, [h]
 
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'recurrent_activation':
+            activations.serialize(self.recurrent_activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
+    }
+    base_config = super(GRUCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 class GRU(RNN):
-  # pylint: disable=line-too-long
   """Gated Recurrent Unit - Cho et al.
 
   2014.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1465,12 +1471,7 @@ class GRU(RNN):
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
 
-  References:
-      - [On the Properties of Neural Machine Translation: Encoder-Decoder Approaches](https://arxiv.org/abs/1409.1259)
-      - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling](http://arxiv.org/abs/1412.3555v1)
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
   """
-  # pylint: enable=line-too-long
 
   def __init__(self,
                units,
@@ -1528,8 +1529,8 @@ class GRU(RNN):
     self.activity_regularizer = regularizers.get(activity_regularizer)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._generate_dropout_mask(inputs, training=training)
-    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    self.cell._dropout_mask = None
+    self.cell._recurrent_dropout_mask = None
     return super(GRU, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -1599,28 +1600,40 @@ class GRU(RNN):
 
   def get_config(self):
     config = {
-        'units': self.units,
-        'activation': activations.serialize(self.activation),
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
         'recurrent_activation':
             activations.serialize(self.recurrent_activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint),
-        'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout,
-        'implementation': self.implementation
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
     }
     base_config = super(GRU, self).get_config()
     del base_config['cell']
@@ -1638,48 +1651,33 @@ class LSTMCell(Layer):
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       unit_forget_bias: Boolean.
           If True, add 1 to the bias of the forget gate at initialization.
           Setting it to true will also force `bias_initializer="zeros"`.
           This is recommended in [Jozefowicz et
             al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1739,6 +1737,7 @@ class LSTMCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
+  @shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[-1]
     self.kernel = self.add_weight(
@@ -1798,36 +1797,22 @@ class LSTMCell(Layer):
       self.bias_o = None
     self.built = True
 
-  def _generate_dropout_mask(self, inputs, training=None):
-    if 0 < self.dropout < 1:
-      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-    else:
-      self._dropout_mask = None
-
-  def _generate_recurrent_dropout_mask(self, inputs, training=None):
-    if 0 < self.recurrent_dropout < 1:
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, self.units))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._recurrent_dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-    else:
-      self._recurrent_dropout_mask = None
-
   def call(self, inputs, states, training=None):
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs,
+                                 K.shape(inputs)[-1]),
+          self.dropout,
+          training=training,
+          count=4)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs, self.units),
+          self.recurrent_dropout,
+          training=training,
+          count=4)
+
     # dropout matrices for input units
     dp_mask = self._dropout_mask
     # dropout matrices for recurrent units
@@ -1901,59 +1886,81 @@ class LSTMCell(Layer):
         h._uses_learning_phase = True
     return h, [h, c]
 
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'recurrent_activation':
+            activations.serialize(self.recurrent_activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'unit_forget_bias':
+            self.unit_forget_bias,
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
+    }
+    base_config = super(LSTMCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 class LSTM(RNN):
-  # pylint: disable=line-too-long
   """Long-Short Term Memory layer - Hochreiter 1997.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          If you pass `None`, no activation is applied
+      activation: Activation function to use.
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
-          Default: hyperbolic tangent (`tanh`).
-          Default: hard sigmoid (`hard_sigmoid`).
-          If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
+          used for the linear transformation of the inputs..
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
-          used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+          used for the linear transformation of the recurrent state..
+      bias_initializer: Initializer for the bias vector.
       unit_forget_bias: Boolean.
           If True, add 1 to the bias of the forget gate at initialization.
           Setting it to true will also force `bias_initializer="zeros"`.
           This is recommended in [Jozefowicz et
             al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1983,13 +1990,7 @@ class LSTM(RNN):
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
 
-  References:
-      - [Long short-term memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
-      - [Learning to forget: Continual prediction with LSTM](http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015)
-      - [Supervised sequence labeling with recurrent neural networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
   """
-  # pylint: enable=line-too-long
 
   def __init__(self,
                units,
@@ -2049,8 +2050,8 @@ class LSTM(RNN):
     self.activity_regularizer = regularizers.get(activity_regularizer)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._generate_dropout_mask(inputs, training=training)
-    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    self.cell._dropout_mask = None
+    self.cell._recurrent_dropout_mask = None
     return super(LSTM, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -2124,29 +2125,42 @@ class LSTM(RNN):
 
   def get_config(self):
     config = {
-        'units': self.units,
-        'activation': activations.serialize(self.activation),
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
         'recurrent_activation':
             activations.serialize(self.recurrent_activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'unit_forget_bias': self.unit_forget_bias,
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'unit_forget_bias':
+            self.unit_forget_bias,
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint),
-        'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout,
-        'implementation': self.implementation
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
     }
     base_config = super(LSTM, self).get_config()
     del base_config['cell']
@@ -2159,6 +2173,23 @@ class LSTM(RNN):
     return cls(**config)
 
 
+def _generate_dropout_ones(inputs, dims):
+  return K.ones((K.shape(inputs)[0], dims))
+
+
+def _generate_dropout_mask(ones, rate, training=None, count=1):
+
+  def dropped_inputs():
+    return K.dropout(ones, rate)
+
+  if count > 1:
+    return [
+        K.in_train_phase(dropped_inputs, ones, training=training)
+        for _ in range(count)
+    ]
+  return K.in_train_phase(dropped_inputs, ones, training=training)
+
+
 class Recurrent(Layer):
   """Deprecated abstract base class for recurrent layers.
 
@@ -2285,6 +2316,7 @@ class Recurrent(Layer):
     self.dropout = 0
     self.recurrent_dropout = 0
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index 7dc4c1db9b..a1407a24ea 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -392,6 +392,105 @@ class RNNTest(test.TestCase):
     self.assertEqual(len(layer.trainable_weights), 3)
     self.assertEqual(len(layer.non_trainable_weights), 0)
 
+  def test_state_reuse_with_dropout(self):
+    layer_class = keras.layers.SimpleRNN
+    embedding_dim = 4
+    units = 3
+    timesteps = 2
+    num_samples = 2
+
+    with self.test_session():
+      input1 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+      layer = layer_class(units,
+                          return_state=True,
+                          return_sequences=True,
+                          dropout=0.2)
+      state = layer(input1)[1:]
+
+      input2 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+      output = layer_class(units)(input2, initial_state=state)
+      model = keras.Model([input1, input2], output)
+
+      inputs = [np.random.random((num_samples, timesteps, embedding_dim)),
+                np.random.random((num_samples, timesteps, embedding_dim))]
+      model.predict(inputs)
+
+  def test_builtin_rnn_cell_serialization(self):
+    for cell_class in [keras.layers.SimpleRNNCell,
+                       keras.layers.GRUCell,
+                       keras.layers.LSTMCell]:
+      with self.test_session():
+        # Test basic case.
+        x = keras.Input((None, 5))
+        cell = cell_class(32)
+        layer = keras.layers.RNN(cell)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.compile(optimizer='rmsprop', loss='mse')
+
+        # Test basic case serialization.
+        x_np = np.random.random((6, 5, 5))
+        y_np = model.predict(x_np)
+        weights = model.get_weights()
+        config = layer.get_config()
+        layer = keras.layers.RNN.from_config(config)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.set_weights(weights)
+        y_np_2 = model.predict(x_np)
+        self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+        # Test stacking.
+        cells = [cell_class(8),
+                 cell_class(12),
+                 cell_class(32)]
+        layer = keras.layers.RNN(cells)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.compile(optimizer='rmsprop', loss='mse')
+
+        # Test stacked RNN serialization.
+        x_np = np.random.random((6, 5, 5))
+        y_np = model.predict(x_np)
+        weights = model.get_weights()
+        config = layer.get_config()
+        layer = keras.layers.RNN.from_config(config)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.set_weights(weights)
+        y_np_2 = model.predict(x_np)
+        self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+  def test_stacked_rnn_dropout(self):
+    cells = [keras.layers.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1),
+             keras.layers.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1)]
+    layer = keras.layers.RNN(cells)
+
+    with self.test_session():
+      x = keras.Input((None, 5))
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile('sgd', 'mse')
+      x_np = np.random.random((6, 5, 5))
+      y_np = np.random.random((6, 3))
+      model.train_on_batch(x_np, y_np)
+
+  def test_stacked_rnn_compute_output_shape(self):
+    cells = [keras.layers.LSTMCell(3),
+             keras.layers.LSTMCell(6)]
+    embedding_dim = 4
+    timesteps = 2
+    layer = keras.layers.RNN(cells, return_state=True, return_sequences=True)
+    output_shape = layer.compute_output_shape((None, timesteps, embedding_dim))
+    expected_output_shape = [(None, timesteps, 6),
+                             (None, 6),
+                             (None, 6),
+                             (None, 3),
+                             (None, 3)]
+    self.assertEqual(
+        [tuple(o.as_list()) for o in output_shape],
+        expected_output_shape)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index 452801b656..3667956f80 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.layers import utils as tf_layers_util
 
@@ -291,6 +292,7 @@ class Bidirectional(Wrapper):
       self.backward_layer.initial_weights = weights[nw // 2:]
     self.stateful = layer.stateful
     self.return_sequences = layer.return_sequences
+    self.return_state = layer.return_state
     self.supports_masking = True
 
   def get_weights(self):
@@ -301,27 +303,54 @@ class Bidirectional(Wrapper):
     self.forward_layer.set_weights(weights[:nw // 2])
     self.backward_layer.set_weights(weights[nw // 2:])
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
-    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
-    if self.merge_mode in ['sum', 'ave', 'mul']:
-      return self.forward_layer.compute_output_shape(input_shape)
-    elif self.merge_mode == 'concat':
-      shape = self.forward_layer.compute_output_shape(input_shape).as_list()
-      shape[-1] *= 2
-      return tensor_shape.TensorShape(shape)
+    output_shape = tuple(self.forward_layer.compute_output_shape(
+        input_shape).as_list())
+    if self.return_state:
+      state_shape = output_shape[1:]
+      output_shape = output_shape[0]
+
+    if self.merge_mode == 'concat':
+      output_shape = list(output_shape)
+      output_shape[-1] *= 2
+      output_shape = tuple(output_shape)
     elif self.merge_mode is None:
-      shape = self.forward_layer.compute_output_shape(input_shape)
-      return [shape, copy.copy(shape)]
+      output_shape = [output_shape, copy.copy(output_shape)]
 
-  def call(self, inputs, training=None, mask=None):
+    if self.return_state:
+      if self.merge_mode is None:
+        return output_shape + state_shape + copy.copy(state_shape)
+      return [output_shape] + state_shape + copy.copy(state_shape)
+    return output_shape
+
+  def call(self, inputs, training=None, mask=None, initial_state=None):
     kwargs = {}
     if has_arg(self.layer.call, 'training'):
       kwargs['training'] = training
     if has_arg(self.layer.call, 'mask'):
       kwargs['mask'] = mask
 
-    y = self.forward_layer.call(inputs, **kwargs)
-    y_rev = self.backward_layer.call(inputs, **kwargs)
+    if initial_state is not None and has_arg(self.layer.call, 'initial_state'):
+      if not isinstance(initial_state, list):
+        raise ValueError(
+            'When passing `initial_state` to a Bidirectional RNN, the state '
+            'should be a list containing the states of the underlying RNNs. '
+            'Found: ' + str(initial_state))
+      forward_state = initial_state[:len(initial_state) // 2]
+      backward_state = initial_state[len(initial_state) // 2:]
+      y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs)
+      y_rev = self.backward_layer.call(
+          inputs, initial_state=backward_state, **kwargs)
+    else:
+      y = self.forward_layer.call(inputs, **kwargs)
+      y_rev = self.backward_layer.call(inputs, **kwargs)
+
+    if self.return_state:
+      states = y[1:] + y_rev[1:]
+      y = y[0]
+      y_rev = y_rev[0]
+
     if self.return_sequences:
       y_rev = K.reverse(y_rev, 1)
     if self.merge_mode == 'concat':
@@ -343,6 +372,11 @@ class Bidirectional(Wrapper):
           out._uses_learning_phase = True
       else:
         output._uses_learning_phase = True
+
+    if self.return_state:
+      if self.merge_mode is None:
+        return output + states
+      return [output] + states
     return output
 
   def reset_states(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py b/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
index 0866c4b0ae..f48c8919a1 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
@@ -238,6 +238,131 @@ class BidirectionalTest(test.TestCase):
       model.compile(loss='mse', optimizer='sgd')
       model.fit(x, y, epochs=1, batch_size=1)
 
+  def test_Bidirectional_merged_value(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+    x = [np.random.rand(samples, timesteps, dim)]
+
+    with self.test_session():
+      for merge_mode in ['sum', 'mul', 'ave', 'concat', None]:
+        if merge_mode == 'sum':
+          merge_func = lambda y, y_rev: y + y_rev
+        elif merge_mode == 'mul':
+          merge_func = lambda y, y_rev: y * y_rev
+        elif merge_mode == 'ave':
+          merge_func = lambda y, y_rev: (y + y_rev) / 2
+        elif merge_mode == 'concat':
+          merge_func = lambda y, y_rev: np.concatenate((y, y_rev), axis=-1)
+        else:
+          merge_func = lambda y, y_rev: [y, y_rev]
+
+        # basic case
+        inputs = keras.Input((timesteps, dim))
+        layer = keras.layers.Bidirectional(
+            rnn(units, return_sequences=True), merge_mode=merge_mode)
+        f_merged = keras.backend.function([inputs], _to_list(layer(inputs)))
+        f_forward = keras.backend.function([inputs],
+                                           [layer.forward_layer.call(inputs)])
+        f_backward = keras.backend.function(
+            [inputs],
+            [keras.backend.reverse(layer.backward_layer.call(inputs), 1)])
+
+        y_merged = f_merged(x)
+        y_expected = _to_list(merge_func(f_forward(x)[0], f_backward(x)[0]))
+        assert len(y_merged) == len(y_expected)
+        for x1, x2 in zip(y_merged, y_expected):
+          self.assertAllClose(x1, x2, atol=1e-5)
+
+        # test return_state
+        inputs = keras.Input((timesteps, dim))
+        layer = keras.layers.Bidirectional(
+            rnn(units, return_state=True), merge_mode=merge_mode)
+        f_merged = keras.backend.function([inputs], layer(inputs))
+        f_forward = keras.backend.function([inputs],
+                                           layer.forward_layer.call(inputs))
+        f_backward = keras.backend.function([inputs],
+                                            layer.backward_layer.call(inputs))
+        n_states = len(layer.layer.states)
+
+        y_merged = f_merged(x)
+        y_forward = f_forward(x)
+        y_backward = f_backward(x)
+        y_expected = _to_list(merge_func(y_forward[0], y_backward[0]))
+        assert len(y_merged) == len(y_expected) + n_states * 2
+        for x1, x2 in zip(y_merged, y_expected):
+          self.assertAllClose(x1, x2, atol=1e-5)
+
+        y_merged = y_merged[-n_states * 2:]
+        y_forward = y_forward[-n_states:]
+        y_backward = y_backward[-n_states:]
+        for state_birnn, state_inner in zip(y_merged, y_forward + y_backward):
+          self.assertAllClose(state_birnn, state_inner, atol=1e-5)
+
+  def test_Bidirectional_dropout(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+    merge_mode = 'sum'
+    x = [np.random.rand(samples, timesteps, dim)]
+
+    with self.test_session():
+      inputs = keras.Input((timesteps, dim))
+      wrapped = keras.layers.Bidirectional(
+          rnn(units, dropout=0.2, recurrent_dropout=0.2), merge_mode=merge_mode)
+      outputs = _to_list(wrapped(inputs, training=True))
+      assert all(not getattr(x, '_uses_learning_phase') for x in outputs)
+
+      inputs = keras.Input((timesteps, dim))
+      wrapped = keras.layers.Bidirectional(
+          rnn(units, dropout=0.2, return_state=True), merge_mode=merge_mode)
+      outputs = _to_list(wrapped(inputs))
+      assert all(x._uses_learning_phase for x in outputs)
+
+      model = keras.Model(inputs, outputs)
+      assert model.uses_learning_phase
+      y1 = _to_list(model.predict(x))
+      y2 = _to_list(model.predict(x))
+      for x1, x2 in zip(y1, y2):
+        self.assertAllClose(x1, x2, atol=1e-5)
+
+  def test_Bidirectional_state_reuse(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+
+    with self.test_session():
+      inputs = keras.Input((timesteps, dim))
+      layer = keras.layers.Bidirectional(
+          rnn(units, return_state=True, return_sequences=True))
+      outputs = layer(inputs)
+      output, state = outputs[0], outputs[1:]
+
+      # test passing invalid initial_state: passing a tensor
+      with self.assertRaises(ValueError):
+        output = keras.layers.Bidirectional(
+            rnn(units))(output, initial_state=state[0])
+
+      # test valid usage: passing a list
+      output = keras.layers.Bidirectional(
+          rnn(units))(output, initial_state=state)
+      model = keras.Model(inputs, output)
+      inputs = np.random.rand(samples, timesteps, dim)
+      outputs = model.predict(inputs)
+
+
+def _to_list(ls):
+  if isinstance(ls, list):
+    return ls
+  else:
+    return [ls]
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/_impl/keras/losses.py
index 1d6319abb1..fe0ef54360 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/_impl/keras/losses.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Built-in Keras loss functions.
+# pylint: disable=unused-import
+"""Built-in loss functions.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -34,7 +35,6 @@ def mean_absolute_error(y_true, y_pred):
 
 
 def mean_absolute_percentage_error(y_true, y_pred):
-  # Equivalent to MAE, but sometimes easier to interpret.
   diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None))
   return 100. * K.mean(diff, axis=-1)
 
@@ -56,10 +56,24 @@ def hinge(y_true, y_pred):
 def categorical_hinge(y_true, y_pred):
   pos = K.sum(y_true * y_pred, axis=-1)
   neg = K.max((1. - y_true) * y_pred, axis=-1)
-  return K.maximum(neg - pos + 1., 0.)
+  return K.maximum(0., neg - pos + 1.)
 
 
 def logcosh(y_true, y_pred):
+  """Logarithm of the hyperbolic cosine of the prediction error.
+
+  `log(cosh(x))` is approximately equal to `(x ** 2) / 2` for small `x` and
+  to `abs(x) - log(2)` for large `x`. This means that 'logcosh' works mostly
+  like the mean squared error, but will not be so strongly affected by the
+  occasional wildly incorrect prediction.
+
+  Arguments:
+      y_true: tensor of true targets.
+      y_pred: tensor of predicted targets.
+
+  Returns:
+      Tensor with one scalar loss entry per sample.
+  """
 
   def _logcosh(x):
     return x + K.softplus(-2. * x) - K.log(2.)
diff --git a/tensorflow/python/keras/_impl/keras/metrics.py b/tensorflow/python/keras/_impl/keras/metrics.py
index 202048f26d..3c18e68260 100644
--- a/tensorflow/python/keras/_impl/keras/metrics.py
+++ b/tensorflow/python/keras/_impl/keras/metrics.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Built-in Keras metrics functions.
+# pylint: disable=unused-import
+"""Built-in metrics.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -21,7 +22,6 @@ from __future__ import print_function
 import six
 
 from tensorflow.python.keras._impl.keras import backend as K
-# pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.losses import binary_crossentropy
 from tensorflow.python.keras._impl.keras.losses import categorical_crossentropy
 from tensorflow.python.keras._impl.keras.losses import cosine_proximity
@@ -35,7 +35,6 @@ from tensorflow.python.keras._impl.keras.losses import mean_squared_logarithmic_
 from tensorflow.python.keras._impl.keras.losses import poisson
 from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crossentropy
 from tensorflow.python.keras._impl.keras.losses import squared_hinge
-# pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 
 
@@ -60,8 +59,8 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5):
 
 
 def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
-  return K.mean(K.in_top_k(y_pred,
-                           K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
+  return K.mean(
+      K.in_top_k(y_pred, K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
 
 
 # Aliases
diff --git a/tensorflow/python/keras/_impl/keras/models.py b/tensorflow/python/keras/_impl/keras/models.py
index e262cc8c8e..9cd547200d 100644
--- a/tensorflow/python/keras/_impl/keras/models.py
+++ b/tensorflow/python/keras/_impl/keras/models.py
@@ -492,13 +492,13 @@ class Sequential(Model):
         # to the input layer we just created.
         layer(x)
 
-      if len(layer.inbound_nodes[-1].output_tensors) != 1:
+      if len(layer._inbound_nodes[-1].output_tensors) != 1:
         raise ValueError('All layers in a Sequential model '
                          'should have a single output tensor. '
                          'For multi-output layers, '
                          'use the functional API.')
 
-      self.outputs = [layer.inbound_nodes[-1].output_tensors[0]]
+      self.outputs = [layer._inbound_nodes[-1].output_tensors[0]]
       self.inputs = topology.get_source_inputs(self.outputs[0])
 
       # We create an input node, which we will keep updated
diff --git a/tensorflow/python/keras/_impl/keras/models_test.py b/tensorflow/python/keras/_impl/keras/models_test.py
index edfc0ce0eb..04017e4b28 100644
--- a/tensorflow/python/keras/_impl/keras/models_test.py
+++ b/tensorflow/python/keras/_impl/keras/models_test.py
@@ -340,6 +340,35 @@ class TestSequential(test.TestCase):
     inner_model.trainable = True
     self.assertEqual(len(model.trainable_weights), 4)
 
+  def test_sequential_update_disabling(self):
+    val_a = np.random.random((10, 4))
+    val_out = np.random.random((10, 4))
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.BatchNormalization(input_shape=(4,)))
+
+      model.trainable = False
+      assert not model.updates
+
+      model.compile('sgd', 'mse')
+      assert not model.updates
+      assert not model.model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
+      model.trainable = True
+      model.compile('sgd', 'mse')
+      assert model.updates
+      assert model.model.updates
+
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      assert np.abs(np.sum(x1 - x2)) > 1e-5
+
 
 class TestModelCloning(test.TestCase):
 
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py
index a08073fa86..e47987aadc 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras optimizer classes (will eventually be replaced with core optimizers).
+# pylint: disable=invalid-name
+"""Built-in optimizer classes.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -121,9 +122,9 @@ class Optimizer(object):
     param_values = K.batch_get_value(params)
     for pv, p, w in zip(param_values, params, weights):
       if pv.shape != w.shape:
-        raise ValueError('Optimizer weight shape ' + str(pv.shape) +
-                         ' not compatible with '
-                         'provided weight shape ' + str(w.shape))
+        raise ValueError(
+            'Optimizer weight shape ' + str(pv.shape) + ' not compatible with '
+            'provided weight shape ' + str(w.shape))
       weight_value_tuples.append((p, w))
     K.batch_set_value(weight_value_tuples)
 
@@ -156,7 +157,8 @@ class SGD(Optimizer):
 
   Arguments:
       lr: float >= 0. Learning rate.
-      momentum: float >= 0. Parameter updates momentum.
+      momentum: float >= 0. Parameter that accelerates SGD
+          in the relevant direction and dampens oscillations.
       decay: float >= 0. Learning rate decay over each update.
       nesterov: boolean. Whether to apply Nesterov momentum.
   """
@@ -177,9 +179,8 @@ class SGD(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
-
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
     # momentum
     shapes = [K.int_shape(p) for p in params]
     moments = [K.zeros(shape) for shape in shapes]
@@ -224,32 +225,33 @@ class RMSprop(Optimizer):
   Arguments:
       lr: float >= 0. Learning rate.
       rho: float >= 0.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
+
   """
 
-  def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0., **kwargs):
+  def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs):
     super(RMSprop, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
       self.lr = K.variable(lr, name='lr')
       self.rho = K.variable(rho, name='rho')
       self.decay = K.variable(decay, name='decay')
       self.iterations = K.variable(0, dtype='int64', name='iterations')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    accumulators = [
-        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
-    ]
+    accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     self.weights = accumulators
     self.updates = [K.update_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       # update accumulator
@@ -283,20 +285,19 @@ class Adagrad(Optimizer):
 
   Arguments:
       lr: float >= 0. Learning rate.
-      epsilon: float >= 0.
+      epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
 
-  References:
-      - [Adaptive Subgradient Methods for Online Learning and Stochastic
-        Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
   """
 
-  def __init__(self, lr=0.01, epsilon=1e-8, decay=0., **kwargs):
+  def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
     super(Adagrad, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
       self.lr = K.variable(lr, name='lr')
       self.decay = K.variable(decay, name='decay')
       self.iterations = K.variable(0, dtype='int64', name='iterations')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
 
@@ -309,8 +310,8 @@ class Adagrad(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       new_a = a + K.square(g)  # update accumulator
@@ -344,20 +345,19 @@ class Adadelta(Optimizer):
       lr: float >= 0. Learning rate.
           It is recommended to leave it at the default value.
       rho: float >= 0.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
 
-  References:
-      - [Adadelta - an adaptive learning rate
-        method](http://arxiv.org/abs/1212.5701)
   """
 
-  def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0., **kwargs):
+  def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
     super(Adadelta, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
       self.lr = K.variable(lr, name='lr')
       self.decay = K.variable(decay, name='decay')
       self.iterations = K.variable(0, dtype='int64', name='iterations')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.rho = rho
     self.epsilon = epsilon
     self.initial_decay = decay
@@ -372,8 +372,8 @@ class Adadelta(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
 
     for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
       # update accumulator
@@ -415,20 +415,21 @@ class Adam(Optimizer):
       lr: float >= 0. Learning rate.
       beta_1: float, 0 < beta < 1. Generally close to 1.
       beta_2: float, 0 < beta < 1. Generally close to 1.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
+      amsgrad: boolean. Whether to apply the AMSGrad variant of this
+          algorithm from the paper "On the Convergence of Adam and
+          Beyond".
 
-  References:
-      - [Adam - A Method for Stochastic
-        Optimization](http://arxiv.org/abs/1412.6980v8)
   """
 
   def __init__(self,
                lr=0.001,
                beta_1=0.9,
                beta_2=0.999,
-               epsilon=1e-8,
+               epsilon=None,
                decay=0.,
+               amsgrad=False,
                **kwargs):
     super(Adam, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
@@ -437,8 +438,11 @@ class Adam(Optimizer):
       self.beta_1 = K.variable(beta_1, name='beta_1')
       self.beta_2 = K.variable(beta_2, name='beta_2')
       self.decay = K.variable(decay, name='decay')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
+    self.amsgrad = amsgrad
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
@@ -446,21 +450,30 @@ class Adam(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
 
     t = K.cast(self.iterations, K.floatx()) + 1
-    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
-                 (1. - K.pow(self.beta_1, t)))
+    lr_t = lr * (
+        K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))
 
     ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
-    self.weights = [self.iterations] + ms + vs
+    if self.amsgrad:
+      vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
+    else:
+      vhats = [K.zeros(1) for _ in params]
+    self.weights = [self.iterations] + ms + vs + vhats
 
-    for p, g, m, v in zip(params, grads, ms, vs):
+    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
       m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
       v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
-      p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
+      if self.amsgrad:
+        vhat_t = K.maximum(vhat, v_t)
+        p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
+        self.updates.append(K.update(vhat, vhat_t))
+      else:
+        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
 
       self.updates.append(K.update(m, m_t))
       self.updates.append(K.update(v, v_t))
@@ -479,7 +492,8 @@ class Adam(Optimizer):
         'beta_1': float(K.get_value(self.beta_1)),
         'beta_2': float(K.get_value(self.beta_2)),
         'decay': float(K.get_value(self.decay)),
-        'epsilon': self.epsilon
+        'epsilon': self.epsilon,
+        'amsgrad': self.amsgrad
     }
     base_config = super(Adam, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -494,19 +508,16 @@ class Adamax(Optimizer):
   Arguments:
       lr: float >= 0. Learning rate.
       beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
 
-  References:
-      - [Adam - A Method for Stochastic
-        Optimization](http://arxiv.org/abs/1412.6980v8)
   """
 
   def __init__(self,
                lr=0.002,
                beta_1=0.9,
                beta_2=0.999,
-               epsilon=1e-8,
+               epsilon=None,
                decay=0.,
                **kwargs):
     super(Adamax, self).__init__(**kwargs)
@@ -516,6 +527,8 @@ class Adamax(Optimizer):
       self.beta_1 = K.variable(beta_1, name='beta_1')
       self.beta_2 = K.variable(beta_2, name='beta_2')
       self.decay = K.variable(decay, name='decay')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
 
@@ -525,8 +538,8 @@ class Adamax(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr *= (1. /
+             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
 
     t = K.cast(self.iterations, K.floatx()) + 1
     lr_t = lr / (1. - K.pow(self.beta_1, t))
@@ -580,19 +593,15 @@ class Nadam(Optimizer):
   Arguments:
       lr: float >= 0. Learning rate.
       beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
 
-  References:
-      - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf)
-      - [On the importance of initialization and momentum in deep
-        learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf)
   """
 
   def __init__(self,
                lr=0.002,
                beta_1=0.9,
                beta_2=0.999,
-               epsilon=1e-8,
+               epsilon=None,
                schedule_decay=0.004,
                **kwargs):
     super(Nadam, self).__init__(**kwargs)
@@ -602,12 +611,15 @@ class Nadam(Optimizer):
       self.lr = K.variable(lr, name='lr')
       self.beta_1 = K.variable(beta_1, name='beta_1')
       self.beta_2 = K.variable(beta_2, name='beta_2')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.schedule_decay = schedule_decay
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
     self.updates = [K.update_add(self.iterations, 1)]
+
     t = K.cast(self.iterations, K.floatx()) + 1
 
     # Due to the recommendations in [2], i.e. warming momentum schedule
@@ -691,7 +703,6 @@ class TFOptimizer(Optimizer):
 
 # Aliases.
 
-# pylint: disable=invalid-name
 sgd = SGD
 rmsprop = RMSprop
 adagrad = Adagrad
@@ -700,8 +711,6 @@ adam = Adam
 adamax = Adamax
 nadam = Nadam
 
-# pylint: enable=invalid-name
-
 
 def serialize(optimizer):
   return serialize_keras_object(optimizer)
diff --git a/tensorflow/python/keras/_impl/keras/optimizers_test.py b/tensorflow/python/keras/_impl/keras/optimizers_test.py
index 6e9e4e6c99..57636afbf0 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers_test.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers_test.py
@@ -102,6 +102,7 @@ class KerasOptimizersTest(test.TestCase):
     with self.test_session():
       _test_optimizer(keras.optimizers.Adam())
       _test_optimizer(keras.optimizers.Adam(decay=1e-3))
+      _test_optimizer(keras.optimizers.Adam(amsgrad=True))
 
   def test_adamax(self):
     with self.test_session():
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image.py b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
index 82441de592..db1fdd4e6b 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=g-import-not-at-top
 """Fairly basic set of tools for real-time data augmentation on image data.
 
 Can easily be extended to include new transformations,
@@ -28,25 +29,22 @@ import re
 import threading
 
 import numpy as np
-from six.moves import range  # pylint: disable=redefined-builtin
-
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
 from tensorflow.python.platform import tf_logging as logging
 
-
-# pylint: disable=g-import-not-at-top
-try:
-  from PIL import Image as pil_image
-except ImportError:
-  pil_image = None
 try:
   from scipy import linalg
   import scipy.ndimage as ndi
 except ImportError:
   linalg = None
   ndi = None
-# pylint: enable=g-import-not-at-top
+
+
+try:
+  from PIL import Image as pil_image
+except ImportError:
+  pil_image = None
 
 if pil_image is not None:
   _PIL_INTERPOLATION_METHODS = {
@@ -88,7 +86,7 @@ def random_rotation(x,
   Returns:
       Rotated Numpy image tensor.
   """
-  theta = np.pi / 180 * np.random.uniform(-rg, rg)
+  theta = np.deg2rad(np.random.uniform(-rg, rg))
   rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
                               [np.sin(theta), np.cos(theta), 0], [0, 0, 1]])
 
@@ -145,7 +143,7 @@ def random_shear(x,
 
   Arguments:
       x: Input tensor. Must be 3D.
-      intensity: Transformation intensity.
+      intensity: Transformation intensity in degrees.
       row_axis: Index of axis for rows in the input tensor.
       col_axis: Index of axis for columns in the input tensor.
       channel_axis: Index of axis for channels in the input tensor.
@@ -158,7 +156,7 @@ def random_shear(x,
   Returns:
       Sheared Numpy image tensor.
   """
-  shear = np.random.uniform(-intensity, intensity)
+  shear = np.deg2rad(np.random.uniform(-intensity, intensity))
   shear_matrix = np.array([[1, -np.sin(shear), 0], [0, np.cos(shear), 0],
                            [0, 0, 1]])
 
@@ -188,8 +186,10 @@ def random_zoom(x,
           (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
       cval: Value used for points outside the boundaries
           of the input if `mode='constant'`.
+
   Returns:
       Zoomed Numpy image tensor.
+
   Raises:
       ValueError: if `zoom_range` isn't a tuple.
   """
@@ -366,7 +366,7 @@ def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
       grayscale: Boolean, whether to load the image as grayscale.
       target_size: Either `None` (default to original size)
           or tuple of ints `(img_height, img_width)`.
-     interpolation: Interpolation method used to resample the image if the
+      interpolation: Interpolation method used to resample the image if the
           target size is different from that of the loaded image.
           Supported methods are "nearest", "bilinear", and "bicubic".
           If PIL version 1.1.3 or newer is installed, "lanczos" is also
@@ -394,11 +394,9 @@ def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
     width_height_tuple = (target_size[1], target_size[0])
     if img.size != width_height_tuple:
       if interpolation not in _PIL_INTERPOLATION_METHODS:
-        raise ValueError(
-            'Invalid interpolation method {} specified. Supported '
-            'methods are {}'.format(
-                interpolation,
-                ', '.join(_PIL_INTERPOLATION_METHODS.keys())))
+        raise ValueError('Invalid interpolation method {} specified. Supported '
+                         'methods are {}'.format(interpolation, ', '.join(
+                             _PIL_INTERPOLATION_METHODS.keys())))
       resample = _PIL_INTERPOLATION_METHODS[interpolation]
       img = img.resize(width_height_tuple, resample)
   return img
@@ -407,7 +405,8 @@ def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
 def list_pictures(directory, ext='jpg|jpeg|bmp|png|ppm'):
   return [
       os.path.join(root, f)
-      for root, _, files in os.walk(directory) for f in files
+      for root, _, files in os.walk(directory)
+      for f in files
       if re.match(r'([\w]+\.(?:' + ext + '))', f)
   ]
 
@@ -423,9 +422,9 @@ class ImageDataGenerator(object):
       zca_whitening: apply ZCA whitening.
       zca_epsilon: epsilon for ZCA whitening. Default is 1e-6.
       rotation_range: degrees (0 to 180).
-      width_shift_range: fraction of total width.
-      height_shift_range: fraction of total height.
-      shear_range: shear intensity (shear angle in radians).
+      width_shift_range: fraction of total width, if < 1, or pixels if >= 1.
+      height_shift_range: fraction of total height, if < 1, or pixels if >= 1.
+      shear_range: shear intensity (shear angle in degrees).
       zoom_range: amount of zoom. if scalar z, zoom will be randomly picked
           in the range [1-z, 1+z]. A sequence of two can be passed instead
           to select this range.
@@ -433,6 +432,12 @@ class ImageDataGenerator(object):
       fill_mode: points outside the boundaries are filled according to the
           given mode ('constant', 'nearest', 'reflect' or 'wrap'). Default
           is 'nearest'.
+          Points outside the boundaries of the input are filled according to the
+            given mode:
+              'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k)
+              'nearest':  aaaaaaaa|abcd|dddddddd
+              'reflect':  abcddcba|abcd|dcbaabcd
+              'wrap':  abcdabcd|abcd|abcdabcd
       cval: value used for points outside the boundaries when fill_mode is
           'constant'. Default is 0.
       horizontal_flip: whether to randomly flip images horizontally.
@@ -522,6 +527,32 @@ class ImageDataGenerator(object):
       raise ValueError('`zoom_range` should be a float or '
                        'a tuple or list of two floats. '
                        'Received arg: ', zoom_range)
+    if zca_whitening:
+      if not featurewise_center:
+        self.featurewise_center = True
+        logging.warning('This ImageDataGenerator specifies '
+                        '`zca_whitening`, which overrides '
+                        'setting of `featurewise_center`.')
+      if featurewise_std_normalization:
+        self.featurewise_std_normalization = False
+        logging.warning('This ImageDataGenerator specifies '
+                        '`zca_whitening` '
+                        'which overrides setting of'
+                        '`featurewise_std_normalization`.')
+    if featurewise_std_normalization:
+      if not featurewise_center:
+        self.featurewise_center = True
+        logging.warning('This ImageDataGenerator specifies '
+                        '`featurewise_std_normalization`, '
+                        'which overrides setting of '
+                        '`featurewise_center`.')
+    if samplewise_std_normalization:
+      if not samplewise_center:
+        self.samplewise_center = True
+        logging.warning('This ImageDataGenerator specifies '
+                        '`samplewise_std_normalization`, '
+                        'which overrides setting of '
+                        '`samplewise_center`.')
 
   def flow(self,
            x,
@@ -591,7 +622,7 @@ class ImageDataGenerator(object):
     if self.samplewise_center:
       x -= np.mean(x, keepdims=True)
     if self.samplewise_std_normalization:
-      x /= np.std(x, keepdims=True) + 1e-7
+      x /= (np.std(x, keepdims=True) + K.epsilon())
 
     if self.featurewise_center:
       if self.mean is not None:
@@ -603,7 +634,7 @@ class ImageDataGenerator(object):
                         'first by calling `.fit(numpy_data)`.')
     if self.featurewise_std_normalization:
       if self.std is not None:
-        x /= (self.std + 1e-7)
+        x /= (self.std + K.epsilon())
       else:
         logging.warning('This ImageDataGenerator specifies '
                         '`featurewise_std_normalization`, but it hasn\'t '
@@ -636,7 +667,6 @@ class ImageDataGenerator(object):
     """
     if ndi is None:
       raise ImportError('Scipy is required for image transformations.')
-
     # x is a single image, so it doesn't have image number at index 0
     img_row_axis = self.row_axis - 1
     img_col_axis = self.col_axis - 1
@@ -648,25 +678,27 @@ class ImageDataGenerator(object):
     # use composition of homographies
     # to generate final transform that needs to be applied
     if self.rotation_range:
-      theta = np.pi / 180 * np.random.uniform(-self.rotation_range,
-                                              self.rotation_range)
+      theta = np.deg2rad(
+          np.random.uniform(-self.rotation_range, self.rotation_range))
     else:
       theta = 0
 
     if self.height_shift_range:
-      tx = np.random.uniform(-self.height_shift_range,
-                             self.height_shift_range) * x.shape[img_row_axis]
+      tx = np.random.uniform(-self.height_shift_range, self.height_shift_range)
+      if self.height_shift_range < 1:
+        tx *= x.shape[img_row_axis]
     else:
       tx = 0
 
     if self.width_shift_range:
-      ty = np.random.uniform(-self.width_shift_range,
-                             self.width_shift_range) * x.shape[img_col_axis]
+      ty = np.random.uniform(-self.width_shift_range, self.width_shift_range)
+      if self.width_shift_range < 1:
+        ty *= x.shape[img_col_axis]
     else:
       ty = 0
 
     if self.shear_range:
-      shear = np.random.uniform(-self.shear_range, self.shear_range)
+      shear = np.deg2rad(np.random.uniform(-self.shear_range, self.shear_range))
     else:
       shear = 0
 
@@ -744,7 +776,7 @@ class ImageDataGenerator(object):
     if x.ndim != 4:
       raise ValueError('Input to `.fit()` should have rank 4. '
                        'Got array with shape: ' + str(x.shape))
-    if x.shape[self.channel_axis] not in {3, 4}:
+    if x.shape[self.channel_axis] not in {1, 3, 4}:
       logging.warning(
           'Expected input to be images (as Numpy array) '
           'following the data format convention "' + self.data_format + '" '
@@ -784,10 +816,12 @@ class ImageDataGenerator(object):
         raise ImportError('Scipy is required for zca_whitening.')
 
       flat_x = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]))
-      sigma = np.dot(flat_x.T, flat_x) / flat_x.shape[0]
-      u, s, _ = linalg.svd(sigma)
-      self.principal_components = np.dot(
-          np.dot(u, np.diag(1. / np.sqrt(s + self.zca_epsilon))), u.T)
+      num_examples = flat_x.shape[0]
+      _, s, vt = linalg.svd(flat_x / np.sqrt(num_examples))
+      s_expand = np.hstack(
+          (s, np.zeros(vt.shape[0] - num_examples, dtype=flat_x.dtype)))
+      self.principal_components = (
+          vt.T / np.sqrt(s_expand**2 + self.zca_epsilon)).dot(vt)
 
 
 class Iterator(Sequence):
@@ -797,10 +831,10 @@ class Iterator(Sequence):
   method.
 
   Arguments:
-    n: Integer, total number of samples in the dataset to loop over.
-    batch_size: Integer, size of a batch.
-    shuffle: Boolean, whether to shuffle the data between epochs.
-    seed: Random seeding for data shuffling.
+      n: Integer, total number of samples in the dataset to loop over.
+      batch_size: Integer, size of a batch.
+      shuffle: Boolean, whether to shuffle the data between epochs.
+      seed: Random seeding for data shuffling.
   """
 
   def __init__(self, n, batch_size, shuffle, seed):
@@ -823,15 +857,14 @@ class Iterator(Sequence):
     if idx >= len(self):
       raise ValueError('Asked to retrieve element {idx}, '
                        'but the Sequence '
-                       'has length {length}'.format(idx=idx,
-                                                    length=len(self)))
+                       'has length {length}'.format(idx=idx, length=len(self)))
     if self.seed is not None:
       np.random.seed(self.seed + self.total_batches_seen)
     self.total_batches_seen += 1
     if self.index_array is None:
       self._set_index_array()
-    index_array = self.index_array[self.batch_size * idx:self.batch_size *
-                                   (idx + 1)]
+    index_array = self.index_array[self.batch_size * idx:self.batch_size * (
+        idx + 1)]
     return self._get_batches_of_transformed_samples(index_array)
 
   def __len__(self):
@@ -873,6 +906,7 @@ class Iterator(Sequence):
 
     Arguments:
         index_array: array of sample indices to include in batch.
+
     Returns:
         A batch of transformed samples.
     """
@@ -948,8 +982,8 @@ class NumpyArrayIterator(Iterator):
                                              seed)
 
   def _get_batches_of_transformed_samples(self, index_array):
-    batch_x = np.zeros(tuple([len(index_array)] + list(self.x.shape)[1:]),
-                       dtype=K.floatx())
+    batch_x = np.zeros(
+        tuple([len(index_array)] + list(self.x.shape)[1:]), dtype=K.floatx())
     for i, j in enumerate(index_array):
       x = self.x[j]
       x = self.image_data_generator.random_transform(x.astype(K.floatx()))
@@ -959,7 +993,9 @@ class NumpyArrayIterator(Iterator):
       for i, j in enumerate(index_array):
         img = array_to_img(batch_x[i], self.data_format, scale=True)
         fname = '{prefix}_{index}_{hash}.{format}'.format(
-            prefix=self.save_prefix, index=j, hash=np.random.randint(1e4),
+            prefix=self.save_prefix,
+            index=j,
+            hash=np.random.randint(1e4),
             format=self.save_format)
         img.save(os.path.join(self.save_to_dir, fname))
     if self.y is None:
@@ -984,10 +1020,11 @@ class NumpyArrayIterator(Iterator):
 
 def _count_valid_files_in_directory(directory, white_list_formats,
                                     follow_links):
-  """Count files with extension in `white_list_formats` in a directory.
+  """Count files with extension in `white_list_formats` contained in directory.
 
   Arguments:
-      directory: absolute path to the directory containing files to be counted
+      directory: absolute path to the directory
+          containing files to be counted
       white_list_formats: set of strings containing allowed extensions for
           the files to be counted.
       follow_links: boolean.
@@ -1003,7 +1040,7 @@ def _count_valid_files_in_directory(directory, white_list_formats,
 
   samples = 0
   for _, _, files in _recursive_list(directory):
-    for fname in sorted(files):
+    for fname in files:
       is_valid = False
       for extension in white_list_formats:
         if fname.lower().endswith('.' + extension):
@@ -1043,7 +1080,7 @@ def _list_valid_filenames_in_directory(directory, white_list_formats,
   subdir = os.path.basename(directory)
   basedir = os.path.dirname(directory)
   for root, _, files in _recursive_list(directory):
-    for fname in files:
+    for fname in sorted(files):
       is_valid = False
       for extension in white_list_formats:
         if fname.lower().endswith('.' + extension):
@@ -1167,8 +1204,8 @@ class DirectoryIterator(Iterator):
         white_list_formats=white_list_formats,
         follow_links=follow_links)
     self.samples = sum(
-        pool.map(function_partial, (os.path.join(directory, subdir)
-                                    for subdir in classes)))
+        pool.map(function_partial,
+                 (os.path.join(directory, subdir) for subdir in classes)))
 
     print('Found %d images belonging to %d classes.' % (self.samples,
                                                         self.num_classes))
@@ -1181,8 +1218,9 @@ class DirectoryIterator(Iterator):
     i = 0
     for dirpath in (os.path.join(directory, subdir) for subdir in classes):
       results.append(
-          pool.apply_async(_list_valid_filenames_in_directory, (
-              dirpath, white_list_formats, self.class_indices, follow_links)))
+          pool.apply_async(
+              _list_valid_filenames_in_directory,
+              (dirpath, white_list_formats, self.class_indices, follow_links)))
     for res in results:
       classes, filenames = res.get()
       self.classes[i:i + len(classes)] = classes
@@ -1199,10 +1237,11 @@ class DirectoryIterator(Iterator):
     # build batch of image data
     for i, j in enumerate(index_array):
       fname = self.filenames[j]
-      img = load_img(os.path.join(self.directory, fname),
-                     grayscale=grayscale,
-                     target_size=self.target_size,
-                     interpolation=self.interpolation)
+      img = load_img(
+          os.path.join(self.directory, fname),
+          grayscale=grayscale,
+          target_size=self.target_size,
+          interpolation=self.interpolation)
       x = img_to_array(img, data_format=self.data_format)
       x = self.image_data_generator.random_transform(x)
       x = self.image_data_generator.standardize(x)
@@ -1212,7 +1251,9 @@ class DirectoryIterator(Iterator):
       for i, j in enumerate(index_array):
         img = array_to_img(batch_x[i], self.data_format, scale=True)
         fname = '{prefix}_{index}_{hash}.{format}'.format(
-            prefix=self.save_prefix, index=j, hash=np.random.randint(1e7),
+            prefix=self.save_prefix,
+            index=j,
+            hash=np.random.randint(1e7),
             format=self.save_format)
         img.save(os.path.join(self.save_to_dir, fname))
     # build batch of labels
@@ -1241,4 +1282,3 @@ class DirectoryIterator(Iterator):
     # The transformation of images is not under thread lock
     # so it can be done in parallel
     return self._get_batches_of_transformed_samples(index_array)
-
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
index 642f4f2fac..4d59250af0 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Preprocessing utilities for sequence data.
+"""Utilities for preprocessing sequence data.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -129,7 +129,7 @@ def make_sampling_table(size, sampling_factor=1e-5):
       is the probability that a word of rank i should be sampled.
   """
   gamma = 0.577
-  rank = np.array(list(range(size)))
+  rank = np.arange(size)
   rank[0] = 1
   inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1. / (12. * rank)
   f = sampling_factor * inv_fq
@@ -170,7 +170,7 @@ def skipgrams(sequence,
           if True labels will be categorical eg. [[1,0],[0,1],[0,1] .. ]
       sampling_table: 1D array of size `vocabulary_size` where the entry i
           encodes the probability to sample a word of rank i.
-      seed: Random seed.
+      seed: random seed.
 
   Returns:
       couples, labels: where `couples` are int pairs and
@@ -224,3 +224,22 @@ def skipgrams(sequence,
     random.shuffle(labels)
 
   return couples, labels
+
+
+def _remove_long_seq(maxlen, seq, label):
+  """Removes sequences that exceed the maximum length.
+
+  Arguments:
+      maxlen: int, maximum length
+      seq: list of lists where each sublist is a sequence
+      label: list where each element is an integer
+
+  Returns:
+      new_seq, new_label: shortened lists for `seq` and `label`.
+  """
+  new_seq, new_label = [], []
+  for x, y in zip(seq, label):
+    if len(x) < maxlen:
+      new_seq.append(x)
+      new_label.append(y)
+  return new_seq, new_label
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text.py b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
index 47e5aa064f..8f7f25dc0a 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 # ==============================================================================
 """Utilities for text input preprocessing.
-
-May benefit from a fast Cython rewrite.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -29,6 +27,9 @@ import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.platform import tf_logging as logging
+
+
 if sys.version_info < (3,):
   maketrans = string.maketrans
 else:
@@ -68,6 +69,21 @@ def one_hot(text,
             filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
             lower=True,
             split=' '):
+  """One-hot encodes a text into a list of word indexes of size n.
+
+  This is a wrapper to the `hashing_trick` function using `hash` as the
+  hashing function; unicity of word to index mapping non-guaranteed.
+
+  Arguments:
+      text: Input text (string).
+      n: Dimension of the hashing space.
+      filters: Sequence of characters to filter out.
+      lower: Whether to convert the input to lowercase.
+      split: Sentence split marker (string).
+
+  Returns:
+      A list of integer word indices (unicity non-guaranteed).
+  """
   return hashing_trick(
       text, n, hash_function=hash, filters=filters, lower=lower, split=split)
 
@@ -99,6 +115,10 @@ def hashing_trick(text,
 
   Two or more words may be assigned to the same index, due to possible
   collisions by the hashing function.
+  The
+  probability
+  of a collision is in relation to the dimension of the hashing space and
+  the number of distinct objects.
   """
   if hash_function is None:
     hash_function = hash
@@ -127,6 +147,8 @@ class Tokenizer(object):
       lower: boolean. Whether to convert the texts to lowercase.
       split: character or string to use for token splitting.
       char_level: if True, every character will be treated as a token.
+      oov_token: if given, it will be added to word_index and used to
+          replace out-of-vocabulary words during text_to_sequence calls
 
   By default, all punctuation is removed, turning the texts into
   space-separated sequences of words
@@ -141,7 +163,17 @@ class Tokenizer(object):
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=' ',
-               char_level=False):
+               char_level=False,
+               oov_token=None,
+               **kwargs):
+    # Legacy support
+    if 'nb_words' in kwargs:
+      logging.warning('The `nb_words` argument in `Tokenizer` '
+                      'has been renamed `num_words`.')
+      num_words = kwargs.pop('nb_words')
+    if kwargs:
+      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
     self.word_counts = OrderedDict()
     self.word_docs = {}
     self.filters = filters
@@ -150,6 +182,7 @@ class Tokenizer(object):
     self.num_words = num_words
     self.document_count = 0
     self.char_level = char_level
+    self.oov_token = oov_token
 
   def fit_on_texts(self, texts):
     """Updates internal vocabulary based on a list of texts.
@@ -181,7 +214,13 @@ class Tokenizer(object):
     sorted_voc = [wc[0] for wc in wcounts]
     # note that index 0 is reserved, never assigned to an existing word
     self.word_index = dict(
-        list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))
+        list(zip(sorted_voc, list(range(1,
+                                        len(sorted_voc) + 1)))))
+
+    if self.oov_token is not None:
+      i = self.word_index.get(self.oov_token)
+      if i is None:
+        self.word_index[self.oov_token] = len(self.word_index) + 1
 
     self.index_docs = {}
     for w, c in list(self.word_docs.items()):
@@ -248,6 +287,10 @@ class Tokenizer(object):
             continue
           else:
             vect.append(i)
+        elif self.oov_token is not None:
+          i = self.word_index.get(self.oov_token)
+          if i is not None:
+            vect.append(i)
       yield vect
 
   def texts_to_matrix(self, texts, mode='binary'):
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py b/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
index 17ab48ba3f..a934e331c4 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
@@ -76,6 +76,22 @@ class TestText(test.TestCase):
     self.assertLessEqual(np.max(encoded), 4)
     self.assertGreaterEqual(np.min(encoded), 1)
 
+  def test_tokenizer_oov_flag(self):
+    x_train = ['This text has only known words']
+    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown
+
+    # Defalut, without OOV flag
+    tokenizer = keras.preprocessing.text.Tokenizer()
+    tokenizer.fit_on_texts(x_train)
+    x_test_seq = tokenizer.texts_to_sequences(x_test)
+    assert len(x_test_seq[0]) == 4  # discards 2 OOVs
+
+    # With OOV feature
+    tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<unk>')
+    tokenizer.fit_on_texts(x_train)
+    x_test_seq = tokenizer.texts_to_sequences(x_test)
+    assert len(x_test_seq[0]) == 6  # OOVs marked in place
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/regularizers.py b/tensorflow/python/keras/_impl/keras/regularizers.py
index 161ff9bf5b..c53ee8a1ae 100644
--- a/tensorflow/python/keras/_impl/keras/regularizers.py
+++ b/tensorflow/python/keras/_impl/keras/regularizers.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras built-in regularizers.
+"""Built-in regularizers.
 """
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils.py b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
index d9e8f37e36..fcee9fbcc3 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,12 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=g-import-not-at-top
 """Utilities for file download and caching."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from abc import abstractmethod
+from contextlib import closing
 import hashlib
 import multiprocessing
 from multiprocessing.pool import ThreadPool
@@ -38,12 +40,12 @@ from six.moves.urllib.error import URLError
 from six.moves.urllib.request import urlopen
 
 from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
-from tensorflow.python.util.tf_export import tf_export
+
 
 try:
-  import queue  # pylint:disable=g-import-not-at-top
+  import queue
 except ImportError:
-  import Queue as queue  # pylint:disable=g-import-not-at-top
+  import Queue as queue
 
 
 if sys.version_info[0] == 2:
@@ -87,7 +89,7 @@ if sys.version_info[0] == 2:
       for chunk in chunk_read(response, reporthook=reporthook):
         fd.write(chunk)
 else:
-  from six.moves.urllib.request import urlretrieve  # pylint: disable=g-import-not-at-top
+  from six.moves.urllib.request import urlretrieve
 
 
 def _extract_archive(file_path, path='.', archive_format='auto'):
@@ -136,7 +138,6 @@ def _extract_archive(file_path, path='.', archive_format='auto'):
   return False
 
 
-@tf_export('keras.utils.get_file')
 def get_file(fname,
              origin,
              untar=False,
@@ -188,7 +189,7 @@ def get_file(fname,
       Path to the downloaded file
   """
   if cache_dir is None:
-    cache_dir = os.path.expanduser(os.path.join('~', '.keras'))
+    cache_dir = os.path.join(os.path.expanduser('~'), '.keras')
   if md5_hash is not None and file_hash is None:
     file_hash = md5_hash
     hash_algorithm = 'md5'
@@ -317,37 +318,46 @@ def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535):
     return False
 
 
-@tf_export('keras.utils.Sequence')
 class Sequence(object):
   """Base object for fitting to a sequence of data, such as a dataset.
 
   Every `Sequence` must implements the `__getitem__` and the `__len__` methods.
   If you want to modify your dataset between epochs you may implement
-  `on_epoch_end`. The method `__getitem__` should return a complete batch.
+  `on_epoch_end`.
+  The method `__getitem__` should return a complete batch.
+
+  # Notes
 
-  Notes:
   `Sequence` are a safer way to do multiprocessing. This structure guarantees
-   that the network will only train once on each sample per epoch which is not
-   the case with generators.
+  that the network will only train once
+   on each sample per epoch which is not the case with generators.
+
   Examples:
+
   ```python
       from skimage.io import imread
       from skimage.transform import resize
       import numpy as np
       import math
+
       # Here, `x_set` is list of path to the images
       # and `y_set` are the associated classes.
+
       class CIFAR10Sequence(Sequence):
+
           def __init__(self, x_set, y_set, batch_size):
               self.x, self.y = x_set, y_set
               self.batch_size = batch_size
+
           def __len__(self):
               return math.ceil(len(self.x) / self.batch_size)
+
           def __getitem__(self, idx):
               batch_x = self.x[idx * self.batch_size:(idx + 1) *
-                        self.batch_size]
+              self.batch_size]
               batch_y = self.y[idx * self.batch_size:(idx + 1) *
-                        self.batch_size]
+              self.batch_size]
+
               return np.array([
                   resize(imread(file_name), (200, 200))
                      for file_name in batch_x]), np.array(batch_y)
@@ -375,7 +385,6 @@ class Sequence(object):
     """
     raise NotImplementedError
 
-  @abstractmethod
   def on_epoch_end(self):
     """Method called at the end of every epoch.
     """
@@ -405,7 +414,6 @@ def get_index(uid, i):
   return _SHARED_SEQUENCES[uid][i]
 
 
-@tf_export('keras.utils.SequenceEnqueuer')
 class SequenceEnqueuer(object):
   """Base class to enqueue inputs.
 
@@ -474,35 +482,36 @@ class OrderedEnqueuer(SequenceEnqueuer):
 
   Arguments:
       sequence: A `keras.utils.data_utils.Sequence` object.
-      use_multiprocessing: Use multiprocessing if True, otherwise threading
-      shuffle: Whether to shuffle the data at the beginning of each epoch
+      use_multiprocessing: use multiprocessing if True, otherwise threading
+      shuffle: whether to shuffle the data at the beginning of each epoch
   """
 
   def __init__(self, sequence, use_multiprocessing=False, shuffle=False):
     self.sequence = sequence
     self.use_multiprocessing = use_multiprocessing
 
-    # Doing Multiprocessing.Value += x is not process-safe.
     global _SEQUENCE_COUNTER
     if _SEQUENCE_COUNTER is None:
-      if self.use_multiprocessing:
+      try:
         _SEQUENCE_COUNTER = multiprocessing.Value('i', 0)
-      else:
+      except OSError:
+        # In this case the OS does not allow us to use
+        # multiprocessing. We resort to an int
+        # for enqueuer indexing.
         _SEQUENCE_COUNTER = 0
 
-    if self.use_multiprocessing:
+    if isinstance(_SEQUENCE_COUNTER, int):
+      self.uid = _SEQUENCE_COUNTER
+      _SEQUENCE_COUNTER += 1
+    else:
+      # Doing Multiprocessing.Value += x is not process-safe.
       with _SEQUENCE_COUNTER.get_lock():
         self.uid = _SEQUENCE_COUNTER.value
         _SEQUENCE_COUNTER.value += 1
-    else:
-      self.uid = _SEQUENCE_COUNTER
-      if isinstance(_SEQUENCE_COUNTER, int):
-        _SEQUENCE_COUNTER += 1
-      else:
-        _SEQUENCE_COUNTER.value += 1
+
     self.shuffle = shuffle
     self.workers = 0
-    self.executor = None
+    self.executor_fn = None
     self.queue = None
     self.run_thread = None
     self.stop_signal = None
@@ -519,9 +528,9 @@ class OrderedEnqueuer(SequenceEnqueuer):
             (when full, workers could block on `put()`)
     """
     if self.use_multiprocessing:
-      self.executor = multiprocessing.Pool(workers)
+      self.executor_fn = lambda: multiprocessing.Pool(workers)
     else:
-      self.executor = ThreadPool(workers)
+      self.executor_fn = lambda: ThreadPool(workers)
     self.workers = workers
     self.queue = queue.Queue(max_queue_size)
     self.stop_signal = threading.Event()
@@ -537,24 +546,26 @@ class OrderedEnqueuer(SequenceEnqueuer):
         return
 
   def _run(self):
-    """Function to submit request to the executor & queue `Future` objects."""
+    """Submits request to the executor and queue the `Future` objects."""
     sequence = list(range(len(self.sequence)))
     self._send_sequence()  # Share the initial sequence
     while True:
       if self.shuffle:
         random.shuffle(sequence)
-      for i in sequence:
-        if self.stop_signal.is_set():
-          return
-        self.queue.put(
-            self.executor.apply_async(get_index, (self.uid, i)), block=True)
 
-      # Done with the current epoch, waiting for the final batches
-      self._wait_queue()
+      with closing(self.executor_fn()) as executor:
+        for i in sequence:
+          if self.stop_signal.is_set():
+            return
+          self.queue.put(
+              executor.apply_async(get_index, (self.uid, i)), block=True)
 
-      if self.stop_signal.is_set():
-        # We're done
-        return
+        # Done with the current epoch, waiting for the final batches
+        self._wait_queue()
+
+        if self.stop_signal.is_set():
+          # We're done
+          return
 
       # Call the internal on epoch end.
       self.sequence.on_epoch_end()
@@ -566,8 +577,9 @@ class OrderedEnqueuer(SequenceEnqueuer):
     Skip the data if it is `None`.
 
     Yields:
-        Tuples (inputs, targets)
-        or (inputs, targets, sample_weights)
+        The next element in the queue, i.e. a tuple
+        `(inputs, targets)` or
+        `(inputs, targets, sample_weights)`.
     """
     try:
       while self.is_running():
@@ -581,14 +593,8 @@ class OrderedEnqueuer(SequenceEnqueuer):
 
   def _send_sequence(self):
     """Send current Sequence to all workers."""
-    _SHARED_SEQUENCES[
-        self.uid] = self.sequence  # For new processes that may spawn
-
-    self._close_pool()
-    if self.use_multiprocessing:
-      self.executor = multiprocessing.Pool(self.workers)
-    else:
-      self.executor = ThreadPool(self.workers)
+    # For new processes that may spawn
+    _SHARED_SEQUENCES[self.uid] = self.sequence
 
   def stop(self, timeout=None):
     """Stops running threads and wait for them to exit, if necessary.
@@ -603,16 +609,10 @@ class OrderedEnqueuer(SequenceEnqueuer):
       self.queue.queue.clear()
       self.queue.unfinished_tasks = 0
       self.queue.not_full.notify()
-    self._close_pool()
     self.run_thread.join(timeout)
     _SHARED_SEQUENCES[self.uid] = None
 
-  def _close_pool(self):
-    self.executor.close()
-    self.executor.join()
 
-
-@tf_export('keras.utils.GeneratorEnqueuer')
 class GeneratorEnqueuer(SequenceEnqueuer):
   """Builds a queue out of a data generator.
 
@@ -636,26 +636,53 @@ class GeneratorEnqueuer(SequenceEnqueuer):
                seed=None):
     self.wait_time = wait_time
     self._generator = generator
-    self._use_multiprocessing = use_multiprocessing
+    if os.name is 'nt' and use_multiprocessing is True:
+      # On Windows, avoid **SYSTEMATIC** error in `multiprocessing`:
+      # `TypeError: can't pickle generator objects`
+      # => Suggest multithreading instead of multiprocessing on Windows
+      raise ValueError('Using a generator with `use_multiprocessing=True`'
+                       ' is not supported on Windows (no marshalling of'
+                       ' generators across process boundaries). Instead,'
+                       ' use single thread/process or multithreading.')
+    else:
+      self._use_multiprocessing = use_multiprocessing
     self._threads = []
     self._stop_event = None
     self._manager = None
     self.queue = None
     self.seed = seed
 
-  def start(self, workers=1, max_queue_size=10):
-    """Kicks off threads which add data from the generator into the queue.
-
-    Arguments:
-        workers: number of worker threads
-        max_queue_size: queue size
-            (when full, threads could block on `put()`)
-    """
-
-    def data_generator_task():
+  def _data_generator_task(self):
+    if self._use_multiprocessing is False:
+      while not self._stop_event.is_set():
+        with self.genlock:
+          try:
+            if (self.queue is not None and
+                self.queue.qsize() < self.max_queue_size):
+              # On all OSes, avoid **SYSTEMATIC** error
+              # in multithreading mode:
+              # `ValueError: generator already executing`
+              # => Serialize calls to
+              # infinite iterator/generator's next() function
+              generator_output = next(self._generator)
+              self.queue.put((True, generator_output))
+            else:
+              time.sleep(self.wait_time)
+          except StopIteration:
+            break
+          except Exception as e:  # pylint: disable=broad-except
+            # Can't pickle tracebacks.
+            # As a compromise, print the traceback and pickle None instead.
+            if not hasattr(e, '__traceback__'):
+              setattr(e, '__traceback__', sys.exc_info()[2])
+            self.queue.put((False, e))
+            self._stop_event.set()
+            break
+    else:
       while not self._stop_event.is_set():
         try:
-          if self._use_multiprocessing or self.queue.qsize() < max_queue_size:
+          if (self.queue is not None and
+              self.queue.qsize() < self.max_queue_size):
             generator_output = next(self._generator)
             self.queue.put((True, generator_output))
           else:
@@ -663,24 +690,34 @@ class GeneratorEnqueuer(SequenceEnqueuer):
         except StopIteration:
           break
         except Exception as e:  # pylint: disable=broad-except
-          # Can't pick tracebacks.
+          # Can't pickle tracebacks.
           # As a compromise, print the traceback and pickle None instead.
-          if self._use_multiprocessing:
-            traceback.print_exc()
-            setattr(e, '__traceback__', None)
-          elif not hasattr(e, '__traceback__'):
-            setattr(e, '__traceback__', sys.exc_info()[2])
+          traceback.print_exc()
+          setattr(e, '__traceback__', None)
           self.queue.put((False, e))
           self._stop_event.set()
           break
 
+  def start(self, workers=1, max_queue_size=10):
+    """Kicks off threads which add data from the generator into the queue.
+
+    Arguments:
+        workers: number of worker threads
+        max_queue_size: queue size
+            (when full, threads could block on `put()`)
+    """
     try:
+      self.max_queue_size = max_queue_size
       if self._use_multiprocessing:
         self._manager = multiprocessing.Manager()
         self.queue = self._manager.Queue(maxsize=max_queue_size)
         self._stop_event = multiprocessing.Event()
       else:
-        self.queue = queue.Queue()
+        # On all OSes, avoid **SYSTEMATIC** error in multithreading mode:
+        # `ValueError: generator already executing`
+        # => Serialize calls to infinite iterator/generator's next() function
+        self.genlock = threading.Lock()
+        self.queue = queue.Queue(maxsize=max_queue_size)
         self._stop_event = threading.Event()
 
       for _ in range(workers):
@@ -688,12 +725,12 @@ class GeneratorEnqueuer(SequenceEnqueuer):
           # Reset random seed else all children processes
           # share the same seed
           np.random.seed(self.seed)
-          thread = multiprocessing.Process(target=data_generator_task)
+          thread = multiprocessing.Process(target=self._data_generator_task)
           thread.daemon = True
           if self.seed is not None:
             self.seed += 1
         else:
-          thread = threading.Thread(target=data_generator_task)
+          thread = threading.Thread(target=self._data_generator_task)
         self._threads.append(thread)
         thread.start()
     except:
@@ -715,11 +752,15 @@ class GeneratorEnqueuer(SequenceEnqueuer):
       self._stop_event.set()
 
     for thread in self._threads:
-      if thread.is_alive():
-        if self._use_multiprocessing:
+      if self._use_multiprocessing:
+        if thread.is_alive():
           thread.terminate()
-        else:
-          thread.join(timeout)
+      else:
+        # The thread.is_alive() test is subject to a race condition:
+        # the thread could terminate right after the test and before the
+        # join, rendering this test meaningless -> Call thread.join()
+        # always, which is ok no matter what the status of the thread.
+        thread.join(timeout)
 
     if self._manager:
       self._manager.shutdown()
@@ -734,7 +775,9 @@ class GeneratorEnqueuer(SequenceEnqueuer):
     Skip the data if it is `None`.
 
     Yields:
-        Data arrays.
+        The next element in the queue, i.e. a tuple
+        `(inputs, targets)` or
+        `(inputs, targets, sample_weights)`.
     """
     while self.is_running():
       if not self.queue.empty():
@@ -752,7 +795,7 @@ class GeneratorEnqueuer(SequenceEnqueuer):
         else:
           time.sleep(self.wait_time)
 
-      # Make sure to rethrow the first exception in the queue, if any
+    # Make sure to rethrow the first exception in the queue, if any
     while not self.queue.empty():
       success, value = self.queue.get()
       if not success:
diff --git a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
index a805315c94..adbe6c3288 100644
--- a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import binascii
 import codecs
 import marshal
 import os
@@ -255,7 +256,10 @@ def func_load(code, defaults=None, closure=None, globs=None):
 
   if closure is not None:
     closure = tuple(ensure_value_to_cell(_) for _ in closure)
-  raw_code = codecs.decode(code.encode('ascii'), 'base64')
+  try:
+    raw_code = codecs.decode(code.encode('ascii'), 'base64')
+  except (UnicodeEncodeError, binascii.Error):
+    raw_code = code.encode('raw_unicode_escape')
   code = marshal.loads(raw_code)
   if globs is None:
     globs = globals()
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
index e123339f5a..b36c769843 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=g-import-not-at-top
 """Utilities related to disk I/O."""
 from __future__ import absolute_import
 from __future__ import division
@@ -21,16 +22,14 @@ from collections import defaultdict
 import sys
 
 import numpy as np
-from tensorflow.python.util.tf_export import tf_export
 
 
 try:
-  import h5py  # pylint:disable=g-import-not-at-top
+  import h5py
 except ImportError:
   h5py = None
 
 
-@tf_export('keras.utils.HDF5Matrix')
 class HDF5Matrix(object):
   """Representation of HDF5 dataset to be used instead of a Numpy array.
 
@@ -65,11 +64,11 @@ class HDF5Matrix(object):
                         'HDF5 and h5py installed.')
 
     if datapath not in list(self.refs.keys()):
-      self._f = h5py.File(datapath)
-      self.refs[datapath] = self._f
+      f = h5py.File(datapath)
+      self.refs[datapath] = f
     else:
-      self._f = self.refs[datapath]
-    self.data = self._f[dataset]
+      f = self.refs[datapath]
+    self.data = f[dataset]
     self.start = start
     if end is None:
       self.end = self.data.shape[0]
@@ -80,9 +79,6 @@ class HDF5Matrix(object):
   def __len__(self):
     return self.end - self.start
 
-  def __del__(self):
-    self._f.close()
-
   def __getitem__(self, key):
     if isinstance(key, slice):
       start, stop = key.start, key.stop
diff --git a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
index 30af285cbf..a2d32424b5 100644
--- a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities related to Keras layers.
+# pylint: disable=protected-access
+"""Utilities related to layer/model functionality.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -22,17 +23,16 @@ import numpy as np
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.conv_utils import convert_kernel
-from tensorflow.python.util.tf_export import tf_export
 
 
 def count_params(weights):
   """Count the total number of scalars composing the weights.
 
   Arguments:
-    weights: An iterable containing the weights on which to compute params
+      weights: An iterable containing the weights on which to compute params
 
   Returns:
-    The total number of scalars composing the weights
+      The total number of scalars composing the weights
   """
   return int(np.sum([K.count_params(p) for p in set(weights)]))
 
@@ -47,10 +47,11 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
           terminal window sizes).
       positions: Relative or absolute positions of log elements in each line.
           If not provided, defaults to `[.33, .55, .67, 1.]`.
-      print_fn: Print function to use (defaults to `print`).
+      print_fn: Print function to use.
           It will be called on each line of the summary.
           You can set it to a custom function
           in order to capture the string summary.
+          It defaults to `print` (prints to stdout).
   """
   if print_fn is None:
     print_fn = print
@@ -59,12 +60,13 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     sequential_like = True
   else:
     sequential_like = True
-    nodes_by_depth = model._nodes_by_depth.values()  # pylint: disable=protected-access
+    nodes_by_depth = model._nodes_by_depth.values()
     nodes = []
     for v in nodes_by_depth:
       if (len(v) > 1) or (len(v) == 1 and len(v[0].inbound_layers) > 1):
-        # If the model has multiple nodes or if the nodes have
-        # multiple inbound_layers, the model is no longer sequential.
+        # if the model has multiple nodes
+        # or if the nodes have multiple inbound_layers
+        # the model is no longer sequential
         sequential_like = False
         break
       nodes += v
@@ -72,7 +74,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
       # search for shared layers
       for layer in model.layers:
         flag = False
-        for node in layer.inbound_nodes:
+        for node in layer._inbound_nodes:
           if node in nodes:
             if flag:
               sequential_like = False
@@ -97,7 +99,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     # header names for the different log elements
     to_display = ['Layer (type)', 'Output Shape', 'Param #', 'Connected to']
     relevant_nodes = []
-    for v in model._nodes_by_depth.values():  # pylint: disable=protected-access
+    for v in model._nodes_by_depth.values():
       relevant_nodes += v
 
   def print_row(fields, positions):
@@ -135,7 +137,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     except AttributeError:
       output_shape = 'multiple'
     connections = []
-    for node in layer._inbound_nodes:  # pylint: disable=protected-access
+    for node in layer._inbound_nodes:
       if relevant_nodes and node not in relevant_nodes:
         # node is not part of the current network
         continue
@@ -143,8 +145,8 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
         inbound_layer = node.inbound_layers[i].name
         inbound_node_index = node.node_indices[i]
         inbound_tensor_index = node.tensor_indices[i]
-        connections.append(inbound_layer + '[' + str(inbound_node_index) + ']['
-                           + str(inbound_tensor_index) + ']')
+        connections.append(inbound_layer + '[' + str(inbound_node_index) +
+                           '][' + str(inbound_tensor_index) + ']')
 
     name = layer.name
     cls_name = layer.__class__.__name__
@@ -173,9 +175,9 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     else:
       print_fn('_' * line_length)
 
-  model._check_trainable_weights_consistency()  # pylint: disable=protected-access
+  model._check_trainable_weights_consistency()
   if hasattr(model, '_collected_trainable_weights'):
-    trainable_count = count_params(model._collected_trainable_weights)  # pylint: disable=protected-access
+    trainable_count = count_params(model._collected_trainable_weights)
   else:
     trainable_count = count_params(model.trainable_weights)
 
@@ -188,7 +190,6 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
   print_fn('_' * line_length)
 
 
-@tf_export('keras.utils.convert_all_kernels_in_model')
 def convert_all_kernels_in_model(model):
   """Converts all convolution kernels in a model from Theano to TensorFlow.
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/np_utils.py b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
index 3dddb99191..231833e776 100644
--- a/tensorflow/python/keras/_impl/keras/utils/np_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,10 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.utils.to_categorical')
 def to_categorical(y, num_classes=None):
   """Converts a class vector (integers) to binary class matrix.
 
@@ -50,7 +48,6 @@ def to_categorical(y, num_classes=None):
   return categorical
 
 
-@tf_export('keras.utils.normalize')
 def normalize(x, axis=-1, order=2):
   """Normalizes a Numpy array.
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
index 1ec8e3a2bf..0c5f2c19c7 100644
--- a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,31 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=protected-access
+# pylint: disable=g-import-not-at-top
 """Utilities related to model visualization."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import os
-import sys
-from tensorflow.python.util.tf_export import tf_export
+
 
 try:
   # pydot-ng is a fork of pydot that is better maintained.
-  import pydot_ng as pydot  # pylint: disable=g-import-not-at-top
+  import pydot_ng as pydot
 except ImportError:
-  # Fall back on pydot if necessary.
-  # Silence a `print` statement that occurs in case of import error,
-  # by temporarily replacing sys.stdout.
-  _stdout = sys.stdout
-  sys.stdout = sys.stderr
+  # pydotplus is an improved version of pydot
   try:
-    import pydot  # pylint: disable=g-import-not-at-top
+    import pydotplus as pydot
   except ImportError:
-    pydot = None
-  finally:
-    # Restore sys.stdout.
-    sys.stdout = _stdout
+    # Fall back on pydot if necessary.
+    try:
+      import pydot
+    except ImportError:
+      pydot = None
 
 
 def _check_pydot():
@@ -66,8 +64,8 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   Returns:
       A `pydot.Dot` instance representing the Keras model.
   """
-  from tensorflow.python.keras._impl.keras.layers.wrappers import Wrapper  # pylint: disable=g-import-not-at-top
-  from tensorflow.python.keras._impl.keras.models import Sequential  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.keras._impl.keras.layers.wrappers import Wrapper
+  from tensorflow.python.keras._impl.keras.models import Sequential
 
   _check_pydot()
   dot = pydot.Dot()
@@ -119,9 +117,9 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   # Connect nodes with edges.
   for layer in layers:
     layer_id = str(id(layer))
-    for i, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
+    for i, node in enumerate(layer._inbound_nodes):
       node_key = layer.name + '_ib-' + str(i)
-      if node_key in model._network_nodes:  # pylint: disable=protected-access
+      if node_key in model._container_nodes:
         for inbound_layer in node.inbound_layers:
           inbound_layer_id = str(id(inbound_layer))
           layer_id = str(id(layer))
@@ -129,7 +127,6 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   return dot
 
 
-@tf_export('keras.utils.plot_model')
 def plot_model(model,
                to_file='model.png',
                show_shapes=False,
diff --git a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
index bc788d874f..223ceac3de 100644
--- a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
+++ b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""API wrapper allowing to use certain Keras models with the Scikit-Learn API.
+"""Wrapper for using the Scikit-Learn API with Keras models.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -24,8 +24,8 @@ import types
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.models import Sequential
+from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
-from tensorflow.python.util import tf_inspect
 
 
 class BaseWrapper(object):
@@ -75,7 +75,7 @@ class BaseWrapper(object):
     self.check_params(sk_params)
 
   def check_params(self, params):
-    """Checks for user typos in "params".
+    """Checks for user typos in `params`.
 
     Arguments:
         params: dictionary; the parameters to be checked
@@ -95,13 +95,11 @@ class BaseWrapper(object):
     else:
       legal_params_fns.append(self.build_fn)
 
-    legal_params = []
-    for fn in legal_params_fns:
-      legal_params += tf_inspect.getargspec(fn)[0]
-    legal_params = set(legal_params)
-
     for params_name in params:
-      if params_name not in legal_params:
+      for fn in legal_params_fns:
+        if has_arg(fn, params_name):
+          break
+      else:
         if params_name != 'nb_epoch':
           raise ValueError('{} is not a legal parameter'.format(params_name))
 
@@ -136,10 +134,10 @@ class BaseWrapper(object):
 
     Arguments:
         x : array-like, shape `(n_samples, n_features)`
-            Training samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Training samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
-            True labels for X.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.fit`
 
@@ -170,21 +168,20 @@ class BaseWrapper(object):
     return history
 
   def filter_sk_params(self, fn, override=None):
-    """Filters `sk_params` and return those in `fn`'s arguments.
+    """Filters `sk_params` and returns those in `fn`'s arguments.
 
     Arguments:
         fn : arbitrary function
-        override: dictionary, values to override sk_params
+        override: dictionary, values to override `sk_params`
 
     Returns:
-        res : dictionary dictionary containing variables
-            in both sk_params and fn's arguments.
+        res : dictionary containing variables
+            in both `sk_params` and `fn`'s arguments.
     """
     override = override or {}
     res = {}
-    fn_args = tf_inspect.getargspec(fn)[0]
     for name, value in self.sk_params.items():
-      if name in fn_args:
+      if has_arg(fn, name):
         res.update({name: value})
     res.update(override)
     return res
@@ -199,10 +196,10 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x : array-like, shape `(n_samples, n_features)`
-            Training samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Training samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
-            True labels for X.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.fit`
 
@@ -229,8 +226,8 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         **kwargs: dictionary arguments
             Legal arguments are the arguments
             of `Sequential.predict_classes`.
@@ -248,8 +245,8 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         **kwargs: dictionary arguments
             Legal arguments are the arguments
             of `Sequential.predict_classes`.
@@ -258,8 +255,8 @@ class KerasClassifier(BaseWrapper):
         proba: array-like, shape `(n_samples, n_outputs)`
             Class probability estimates.
             In the case of binary classification,
-            tp match the scikit-learn API,
-            will return an array of shape '(n_samples, 2)'
+            to match the scikit-learn API,
+            will return an array of shape `(n_samples, 2)`
             (instead of `(n_sample, 1)` as in Keras).
     """
     kwargs = self.filter_sk_params(Sequential.predict_proba, kwargs)
@@ -276,16 +273,16 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y: array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
-            True labels for x.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.evaluate`.
 
     Returns:
         score: float
-            Mean accuracy of predictions on X wrt. y.
+            Mean accuracy of predictions on `x` wrt. `y`.
 
     Raises:
         ValueError: If the underlying model isn't configured to
@@ -321,8 +318,8 @@ class KerasRegressor(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.predict`.
 
@@ -338,16 +335,16 @@ class KerasRegressor(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y: array-like, shape `(n_samples,)`
-            True labels for X.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.evaluate`.
 
     Returns:
         score: float
-            Mean accuracy of predictions on X wrt. y.
+            Mean accuracy of predictions on `x` wrt. `y`.
     """
     kwargs = self.filter_sk_params(Sequential.evaluate, kwargs)
     loss = self.model.evaluate(x, y, **kwargs)
diff --git a/tensorflow/python/keras/applications/__init__.py b/tensorflow/python/keras/applications/__init__.py
index 34f1435ffb..fccedf919a 100644
--- a/tensorflow/python/keras/applications/__init__.py
+++ b/tensorflow/python/keras/applications/__init__.py
@@ -18,16 +18,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras.applications import densenet
 from tensorflow.python.keras.applications import inception_resnet_v2
 from tensorflow.python.keras.applications import inception_v3
 from tensorflow.python.keras.applications import mobilenet
+from tensorflow.python.keras.applications import nasnet
 from tensorflow.python.keras.applications import resnet50
 from tensorflow.python.keras.applications import vgg16
 from tensorflow.python.keras.applications import vgg19
 from tensorflow.python.keras.applications import xception
+from tensorflow.python.keras.applications.densenet import DenseNet121
+from tensorflow.python.keras.applications.densenet import DenseNet169
+from tensorflow.python.keras.applications.densenet import DenseNet201
 from tensorflow.python.keras.applications.inception_resnet_v2 import InceptionResNetV2
 from tensorflow.python.keras.applications.inception_v3 import InceptionV3
 from tensorflow.python.keras.applications.mobilenet import MobileNet
+from tensorflow.python.keras.applications.nasnet import NASNetLarge
+from tensorflow.python.keras.applications.nasnet import NASNetMobile
 from tensorflow.python.keras.applications.resnet50 import ResNet50
 from tensorflow.python.keras.applications.vgg16 import VGG16
 from tensorflow.python.keras.applications.vgg19 import VGG19
diff --git a/tensorflow/python/keras/applications/densenet/__init__.py b/tensorflow/python/keras/applications/densenet/__init__.py
new file mode 100644
index 0000000000..6b8ea83920
--- /dev/null
+++ b/tensorflow/python/keras/applications/densenet/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""DenseNet Keras applications."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl.keras.applications.densenet import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet121
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet169
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet201
+from tensorflow.python.keras._impl.keras.applications.densenet import preprocess_input
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/python/keras/applications/nasnet/__init__.py b/tensorflow/python/keras/applications/nasnet/__init__.py
new file mode 100644
index 0000000000..94eb145b85
--- /dev/null
+++ b/tensorflow/python/keras/applications/nasnet/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""NASNet Keras applications."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl.keras.applications.nasnet import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetLarge
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetMobile
+from tensorflow.python.keras._impl.keras.applications.nasnet import preprocess_input
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index b94bf8f0f6..84ee5040dc 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -30,6 +30,7 @@ from tensorflow.python.keras._impl.keras.layers.advanced_activations import Leak
 from tensorflow.python.keras._impl.keras.layers.advanced_activations import PReLU
 from tensorflow.python.keras._impl.keras.layers.advanced_activations import ELU
 from tensorflow.python.keras._impl.keras.layers.advanced_activations import ThresholdedReLU
+from tensorflow.python.keras._impl.keras.layers.advanced_activations import Softmax
 
 # Convolution layers.
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv1D
@@ -37,6 +38,7 @@ from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2D
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3D
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2DTranspose
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3DTranspose
+from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConv1D
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConv2D
 
 # Convolution layer aliases.
@@ -45,6 +47,7 @@ from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3D
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution2DTranspose
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3DTranspose
+from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution1D
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution2D
 
 # Image processing layers.
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index d892654ebe..5d9feb07b4 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -99,8 +99,16 @@ class Layer(object):
         raise TypeError('Keyword argument not understood:', kwarg)
 
     # Mutable properties
+    # Indicates whether the layer's weights are updated during training
+    # and whether the layer's updates are run during training
     self.trainable = trainable
+    # A stateful layer is a layer whose updates are run during inference too,
+    # for instance stateful RNNs.
+    self.stateful = False
+    # Indicates whether `build` needs to be called upon layer call, to create
+    # the layer's weights.
     self.built = False
+    # Provides information about which inputs are compatible with the layer.
     self.input_spec = None
 
     if activity_regularizer and context.in_eager_mode():
@@ -223,6 +231,8 @@ class Layer(object):
   def updates(self):
     if context.in_eager_mode():
       raise RuntimeError('Layer.updates not supported in Eager mode.')
+    if not self.trainable and not self.stateful:
+      return []
     return self._updates
 
   def add_update(self, updates, inputs=None):
@@ -284,6 +294,8 @@ class Layer(object):
     """
     if context.in_eager_mode():
       raise RuntimeError('Layer.get_updates_for not supported in Eager mode.')
+    if not self.trainable and not self.stateful:
+      return []
     if inputs is not None:
       inputs = nest.flatten(inputs)
     if not inputs:
@@ -1269,6 +1281,15 @@ class InputSpec(object):
     self.min_ndim = min_ndim
     self.axes = axes or {}
 
+  def __repr__(self):
+    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
+            ('shape=' + str(self.shape)) if self.shape else '',
+            ('ndim=' + str(self.ndim)) if self.ndim else '',
+            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
+            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
+            ('axes=' + str(self.axes)) if self.axes else '']
+    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+
 
 class Node(object):
   """A `Node` describes the connectivity between two layers.
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py
index ade57da411..0a5dd57621 100644
--- a/tensorflow/python/layers/network.py
+++ b/tensorflow/python/layers/network.py
@@ -574,6 +574,11 @@ class GraphNetwork(base.Layer):
         return layer
     raise ValueError('No such layer: ' + name)
 
+  @property
+  def stateful(self):
+    return any([(hasattr(layer, 'stateful') and layer.stateful)
+                for layer in self.layers])
+
   @property
   def updates(self):
     """Retrieve the network's updates.
@@ -586,6 +591,8 @@ class GraphNetwork(base.Layer):
     Returns:
         A list of update ops.
     """
+    if not self.trainable and not self.stateful:
+      return []
     updates = []
     for layer in self.layers:
       if hasattr(layer, 'updates'):
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index 7fe3e2db09..2bf584fa29 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -160,15 +160,15 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -228,7 +228,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt
new file mode 100644
index 0000000000..42cb914450
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.applications.densenet"
+tf_module {
+  member_method {
+    name: "DenseNet121"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet169"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet201"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "decode_predictions"
+    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+  member_method {
+    name: "preprocess_input"
+    argspec: "args=[\'x\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt
new file mode 100644
index 0000000000..cd75b87540
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.applications.nasnet"
+tf_module {
+  member_method {
+    name: "NASNetLarge"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "NASNetMobile"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "decode_predictions"
+    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+  member_method {
+    name: "preprocess_input"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
index daeb5aad41..9fc086eb8e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.applications"
 tf_module {
+  member {
+    name: "densenet"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "inception_resnet_v2"
     mtype: "<type \'module\'>"
@@ -12,6 +16,10 @@ tf_module {
     name: "mobilenet"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "nasnet"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "resnet50"
     mtype: "<type \'module\'>"
@@ -28,6 +36,18 @@ tf_module {
     name: "xception"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "DenseNet121"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet169"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet201"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
   member_method {
     name: "InceptionResNetV2"
     argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
@@ -40,6 +60,14 @@ tf_module {
     name: "MobileNet"
     argspec: "args=[\'input_shape\', \'alpha\', \'depth_multiplier\', \'dropout\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'1\', \'0.001\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
   }
+  member_method {
+    name: "NASNetLarge"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "NASNetMobile"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
   member_method {
     name: "ResNet50"
     argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
index 44fbe0f7a0..ba2d083a75 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
@@ -398,7 +398,7 @@ tf_module {
   }
   member_method {
     name: "rnn"
-    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\'], "
+    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "round"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index 8719c07ca3..d4c85a4519 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'schedule\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'schedule\', \'verbose\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
     name: "on_batch_begin"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt
index ef08f9b20f..bda31751d4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt
@@ -2,6 +2,6 @@ path: "tensorflow.keras.datasets.boston_housing"
 tf_module {
   member_method {
     name: "load_data"
-    argspec: "args=[\'path\', \'seed\', \'test_split\'], varargs=None, keywords=None, defaults=[\'boston_housing.npz\', \'113\', \'0.2\'], "
+    argspec: "args=[\'path\', \'test_split\', \'seed\'], varargs=None, keywords=None, defaults=[\'boston_housing.npz\', \'0.2\', \'113\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt
index 8b1c17e9da..ff962876b6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt
@@ -6,6 +6,6 @@ tf_module {
   }
   member_method {
     name: "load_data"
-    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=None, defaults=[\'imdb.npz\', \'None\', \'0\', \'None\', \'113\', \'1\', \'2\', \'3\'], "
+    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=kwargs, defaults=[\'imdb.npz\', \'None\', \'0\', \'None\', \'113\', \'1\', \'2\', \'3\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt
index 6b3ed1e9af..2da4a13067 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt
@@ -6,6 +6,6 @@ tf_module {
   }
   member_method {
     name: "load_data"
-    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'test_split\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=None, defaults=[\'reuters.npz\', \'None\', \'0\', \'None\', \'0.2\', \'113\', \'1\', \'2\', \'3\'], "
+    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'test_split\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=kwargs, defaults=[\'reuters.npz\', \'None\', \'0\', \'None\', \'0.2\', \'113\', \'1\', \'2\', \'3\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
index a32151e22f..770a107b66 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 46b1713196..0ce42b706e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
index 9bfaf27562..b371ad148c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
index 2b8ac4f1f4..2f5e65a0c5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -123,7 +123,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
@@ -131,7 +131,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
index c9a0b88725..ff08def0a0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index b847e224d6..6db22ca032 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -128,7 +128,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
index 86578d958e..07d3f023e5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
index 348012dcde..92b9760d53 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
index 0419251083..83c528b401 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 337e85e812..b329f1c46b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index 1357dc0f0d..d0f6d2a14f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -183,7 +183,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -195,7 +195,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index b71a08f6c3..57596badf1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
index a01a6067ef..3829353cc3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 0dbbdf2838..3b171b137a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 964ef89c2e..0036d6805b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -187,7 +187,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -199,7 +199,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 6a7b23c540..8134fb7386 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 324745e5a3..c5d4523009 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index e12ae05054..bcbed9241b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
index 9e889ca863..ff0db15f19 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
index 932680941d..1d3f33f045 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -127,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
index db644f958f..c86bc49b22 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
index 74fa1db020..b29f65d79d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -94,7 +94,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -130,7 +130,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
new file mode 100644
index 0000000000..dd67b76523
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -0,0 +1,186 @@
+path: "tensorflow.keras.layers.SeparableConv1D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
new file mode 100644
index 0000000000..bf62c095e7
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -0,0 +1,186 @@
+path: "tensorflow.keras.layers.SeparableConvolution1D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 3414810db4..6e3cde3e3e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index cf34034ef0..b875898a81 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -175,7 +175,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -187,7 +187,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
new file mode 100644
index 0000000000..ee4b2fa39e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
@@ -0,0 +1,183 @@
+path: "tensorflow.keras.layers.Softmax"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.Softmax\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\'], varargs=None, keywords=kwargs, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index b76499658d..db9f90caef 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index 2376d815a6..ef31c5443e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -126,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
index fe336c4be5..088c8e88e2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
@@ -292,10 +292,18 @@ tf_module {
     name: "Reshape"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SeparableConv1D"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SeparableConv2D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SeparableConvolution1D"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SeparableConvolution2D"
     mtype: "<type \'type\'>"
@@ -308,6 +316,10 @@ tf_module {
     name: "SimpleRNNCell"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Softmax"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SpatialDropout1D"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index d239098b0b..0b816b5863 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -160,15 +160,15 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -228,7 +228,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
index ed040c1586..32667cf31e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'0.95\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'0.95\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
index a24651429a..efca59e8e4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
index a0d978fded..5546e2067a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\', \'amsgrad\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'None\', \'0.0\', \'False\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
index 1b70c93ad5..aaa54a1060 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
index b49dbe5cf8..1fada7fd9c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'schedule_decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'1e-08\', \'0.004\'], "
+    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'schedule_decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.004\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index c8860d80d4..fd3f97f35d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
index 5bc8c40120..ce91caa1af 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_words\', \'filters\', \'lower\', \'split\', \'char_level\'], varargs=None, keywords=None, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \', \'False\'], "
+    argspec: "args=[\'self\', \'num_words\', \'filters\', \'lower\', \'split\', \'char_level\', \'oov_token\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \', \'False\', \'None\'], "
   }
   member_method {
     name: "fit_on_sequences"
-- 
GitLab


From 6e46b1abdb7f6077d2e3168008cd0ecbc6774b2a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 19:10:13 -0800
Subject: [PATCH 211/966]   [tpu:profiler] Add infeed enqueue operation data to
 tf_op_stats.proto.

PiperOrigin-RevId: 183328456
---
 tensorflow/contrib/tpu/profiler/tf_op_stats.proto | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
index 5440bbbfdd..2094294baa 100644
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -61,6 +61,11 @@ message OpMetricsResult {
 message OpMetricsDbResult {
   // A bunch of OpMetricsResults.
   repeated OpMetricsResult metrics_db = 1;
+  // The total host infeed-enqueue duration in picoseconds.
+  optional uint64 total_host_infeed_enq_duration_ps = 2;
+  // The total of the difference between the start times of two
+  // consecutive infeed-enqueues (per host) in picoseconds.
+  optional uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3;
 }
 
 // Result proto for StepInfo.
-- 
GitLab


From 1fde827acd1ac75e1086fba0cd4e1f162c8d2cc0 Mon Sep 17 00:00:00 2001
From: Chris Leary <leary@google.com>
Date: Thu, 25 Jan 2018 19:50:17 -0800
Subject: [PATCH 212/966] [XLA] Add source mapping utility translation unit,
 use it in the local client.

PiperOrigin-RevId: 183331075
---
 tensorflow/compiler/xla/client/BUILD          |  1 +
 .../compiler/xla/client/local_client.cc       | 10 ++-
 tensorflow/compiler/xla/python/xla_client.py  | 25 ++++---
 tensorflow/compiler/xla/service/BUILD         | 13 ++++
 .../xla/service/compile_only_service.cc       |  2 +-
 .../compiler/xla/service/local_service.cc     |  3 +-
 tensorflow/compiler/xla/service/service.cc    | 33 ++++++----
 tensorflow/compiler/xla/service/service.h     |  6 +-
 .../compiler/xla/service/source_map_util.cc   | 66 +++++++++++++++++++
 .../compiler/xla/service/source_map_util.h    | 46 +++++++++++++
 .../xla/tests/check_execution_arity_test.cc   |  9 ++-
 tensorflow/compiler/xla/util.cc               | 15 +++--
 tensorflow/compiler/xla/util.h                | 10 +++
 13 files changed, 202 insertions(+), 37 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/source_map_util.cc
 create mode 100644 tensorflow/compiler/xla/service/source_map_util.h

diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index d6b4ebfc39..952109dde2 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -98,6 +98,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/compiler/xla/service:source_map_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "@llvm//:support",
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 523169fdd2..fbeedfcecd 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -21,10 +21,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
+#include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
 namespace se = ::perftools::gputools;
 
+using xla::source_map_util::InvalidParameterArgument;
+
 namespace xla {
 
 ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
@@ -79,9 +82,10 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
   for (int i = 0; i < arguments.size(); ++i) {
     if (!computation_layout.parameter_layout(i).MatchesLayoutInShape(
             arguments[i]->on_host_shape())) {
-      return InvalidArgument(
-          "argument does not match shape or layout of computation parameter "
-          "%d: expected %s, got %s",
+      return InvalidParameterArgument(
+          executable_.get(), i,
+          "Argument does not match shape or layout of computation parameter "
+          "%d: want %s, got %s",
           i,
           ShapeUtil::HumanString(computation_layout.parameter_layout(i).shape())
               .c_str(),
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 9cfe1249f5..66ace613a0 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -36,15 +36,22 @@ from tensorflow.compiler.xla.python import pywrap_xla as c_api
 # pylint: disable=invalid-name
 
 
-OpMetadata = collections.namedtuple(
-    'OpMetadata',
-    [
-        'op_type',
-        'op_name',
-        'source_file',
-        'source_line',
-    ],
-)
+_OP_METADATA_FIELDS = [
+    'op_type',
+    'op_name',
+    'source_file',
+    'source_line',
+]
+OpMetadata = collections.namedtuple('OpMetadata', _OP_METADATA_FIELDS)
+
+
+def OpMetadataToProto(pyobj):
+  proto = xla_data_pb2.OpMetadata()
+  for field in _OP_METADATA_FIELDS:
+    attr = getattr(pyobj, field)
+    if attr is not None:
+      setattr(proto, field, attr)
+  return proto
 
 
 def CurrentSourceInfoMetadata(op_type=None, op_name=None, skip_frames=1):
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 2c5f3ea1dd..031d077c6a 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -462,6 +462,7 @@ cc_library(
         ":hlo_proto_util",
         ":platform_util",
         ":session_proto",
+        ":source_map_util",
         ":transfer_manager",
         ":user_computation",
         ":versioned_computation_handle",
@@ -2350,6 +2351,18 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "source_map_util",
+    srcs = ["source_map_util.cc"],
+    hdrs = ["source_map_util.h"],
+    deps = [
+        ":executable",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index b9306a8bb0..dab73596e1 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -101,7 +101,7 @@ CompileOnlyService::CompileAheadOfTime(
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
         CreateModuleConfig(*program_shape, instance.argument_layouts,
-                           &execution_options));
+                           &execution_options, *user_computation));
 
     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
                         computation_tracker_.BuildHloModule(
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 2194d24257..f30530db08 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -128,7 +128,8 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
   }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, argument_layouts, &execution_options));
+      CreateModuleConfig(*program_shape, argument_layouts, &execution_options,
+                         *user_computation));
 
   TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                       execute_backend_->stream_executor(device_ordinal));
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 926ebbe314..849df1d8e6 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
+#include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -56,6 +57,7 @@ namespace se = ::perftools::gputools;
 
 using ::tensorflow::strings::Printf;
 using ::tensorflow::strings::StrCat;
+using ::xla::source_map_util::InvalidParameterArgument;
 
 namespace xla {
 
@@ -261,7 +263,8 @@ StatusOr<std::vector<const ShapedBuffer*>> Service::ResolveAndValidateArguments(
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
-    const ExecutionOptions* execution_options) {
+    const ExecutionOptions* execution_options,
+    const UserComputation& user_computation) {
   auto config = MakeUnique<HloModuleConfig>(program_shape);
   auto* computation_layout = config->mutable_entry_computation_layout();
 
@@ -275,8 +278,10 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     // ProgramShape.
     if (!ShapeUtil::Compatible(*argument_shapes[i],
                                program_shape.parameters(i))) {
-      return InvalidArgument(
-          "computation expects parameter %d to have shape %s, given shape %s",
+      return InvalidParameterArgument(
+          *user_computation.ParameterMetadata(i).value(),
+          "Argument does not match shape of computation parameter %d: want %s, "
+          "got %s",
           i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
           ShapeUtil::HumanString(*argument_shapes[i]).c_str());
     }
@@ -318,12 +323,14 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const ExecutionOptions& execution_options) {
+    const ExecutionOptions& execution_options,
+    const UserComputation& user_computation) {
   std::vector<const Shape*> argument_shapes;
   for (const auto* arg : arguments) {
     argument_shapes.push_back(&arg->on_host_shape());
   }
-  return CreateModuleConfig(program_shape, argument_shapes, &execution_options);
+  return CreateModuleConfig(program_shape, argument_shapes, &execution_options,
+                            user_computation);
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
@@ -742,9 +749,10 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
 
     // Create an HloModuleConfig object for the computation, given the shape of
     // the program and the argument allocations.
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                        CreateModuleConfig(*program_shape, arguments,
-                                           request.execution_options()));
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModuleConfig> module_config,
+        CreateModuleConfig(*program_shape, arguments,
+                           request.execution_options(), *user_computation));
     VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
             << module_config->entry_computation_layout().ToString();
 
@@ -852,7 +860,8 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, arguments, arg->execution_options()));
+      CreateModuleConfig(*program_shape, arguments, arg->execution_options(),
+                         *user_computation));
 
   VLOG(3) << "Execute created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -916,7 +925,8 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, arguments, arg->execution_options()));
+      CreateModuleConfig(*program_shape, arguments, arg->execution_options(),
+                         *user_computation));
 
   VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -1236,7 +1246,8 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
   }
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(program_shape, {}, execution_options));
+                      CreateModuleConfig(program_shape, {}, execution_options,
+                                         *user_computation));
 
   // Exclude dead parameter instructions for the purpose of computing constants.
   TF_ASSIGN_OR_RETURN(
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 0a7d0b3a7d..ca77e8fe3a 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -251,7 +251,8 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      const ExecutionOptions& execution_options);
+      const ExecutionOptions& execution_options,
+      const UserComputation& user_computation);
 
  protected:
   friend class LocalExecutable;
@@ -275,7 +276,8 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
-      const ExecutionOptions* execution_options);
+      const ExecutionOptions* execution_options,
+      const UserComputation& user_computation);
 
   // Builds an Executable for the given parameters.
   StatusOr<std::unique_ptr<Executable>> BuildExecutable(
diff --git a/tensorflow/compiler/xla/service/source_map_util.cc b/tensorflow/compiler/xla/service/source_map_util.cc
new file mode 100644
index 0000000000..8cbaac7b37
--- /dev/null
+++ b/tensorflow/compiler/xla/service/source_map_util.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/source_map_util.h"
+
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+namespace source_map_util {
+namespace {
+
+Status InvalidParameterArgumentV(const OpMetadata& op_metadata,
+                                 const char* format, va_list args) {
+  string message;
+  tensorflow::strings::Appendv(&message, format, args);
+  if (!op_metadata.source_file().empty()) {
+    tensorflow::strings::Appendf(&message, " (%s:%d)",
+                                 op_metadata.source_file().c_str(),
+                                 op_metadata.source_line());
+  }
+  return InvalidArgument("%s", message.c_str());
+}
+
+}  // namespace
+
+Status InvalidParameterArgument(const OpMetadata& op_metadata,
+                                const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  Status result = InvalidParameterArgumentV(op_metadata, format, args);
+  va_end(args);
+  return result;
+}
+
+Status InvalidParameterArgument(Executable* executable, int parameter_number,
+                                const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  if (executable != nullptr && executable->has_module()) {
+    const HloModule& module = executable->module();
+    const HloComputation& computation = *module.entry_computation();
+    HloInstruction* param = computation.parameter_instruction(parameter_number);
+    const OpMetadata& metadata = param->metadata();
+    Status result = InvalidParameterArgumentV(metadata, format, args);
+    va_end(args);
+    return result;
+  }
+  Status result = InvalidArgumentV(format, args);
+  va_end(args);
+  return result;
+}
+
+}  // namespace source_map_util
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/source_map_util.h b/tensorflow/compiler/xla/service/source_map_util.h
new file mode 100644
index 0000000000..a776d745f4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/source_map_util.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
+
+#include "tensorflow/compiler/xla/service/executable.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+namespace source_map_util {
+
+// Creates an INVALID_ARUGMENT status with the given format string.
+//
+// Also, attempts to extract the OpMetadata for parameter_number on executable
+// and append it to the status message for source mapping to user code.
+//
+// executable may be nullptr, but parameter_number should not be out of bounds
+// or a CHECK-failure may occur.
+Status InvalidParameterArgument(Executable* executable, int parameter_number,
+                                const char* format, ...)
+    TF_PRINTF_ATTRIBUTE(3, 4);
+
+// As above, but takes the parameter metadata directly instead of extracting it
+// from the executable.
+Status InvalidParameterArgument(const OpMetadata& op_metadata,
+                                const char* format, ...)
+    TF_PRINTF_ATTRIBUTE(2, 3);
+
+}  // namespace source_map_util
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
diff --git a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
index 659660d91e..f594cc10ac 100644
--- a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
+++ b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
@@ -104,7 +104,8 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
-              ContainsRegex("expects parameter 0"));
+              ContainsRegex(
+                  "Argument does not match shape of computation parameter 0"));
 
   // Shape mismatch in parameter 1 (rank)
   status = client_->Execute(computation, {f32_data.get(), f32_data.get()},
@@ -112,7 +113,8 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
-              ContainsRegex("expects parameter 1"));
+              ContainsRegex(
+                  "Argument does not match shape of computation parameter 1"));
 
   // Shape mismatch in parameter 1 (element type)
   status = client_->Execute(computation, {f32_data.get(), u8_4_data.get()},
@@ -120,7 +122,8 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
-              ContainsRegex("expects parameter 1"));
+              ContainsRegex(
+                  "Argument does not match shape of computation parameter 1"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index fe5d29a6b6..b020905035 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -30,9 +30,7 @@ limitations under the License.
 #include "tensorflow/core/platform/stacktrace.h"
 
 namespace xla {
-namespace {
 
-// Logs the provided status message with a backtrace.
 Status WithLogBacktrace(const Status& status) {
   CHECK(!status.ok());
   VLOG(1) << status.ToString();
@@ -40,8 +38,6 @@ Status WithLogBacktrace(const Status& status) {
   return status;
 }
 
-}  // namespace
-
 ScopedLoggingTimer::ScopedLoggingTimer(const string& label, bool enabled)
     : enabled(enabled), label(label) {
   if (enabled) {
@@ -74,13 +70,18 @@ Status AppendStatus(Status prior, tensorflow::StringPiece context) {
 // Implementation note: we can't common these out (without using macros) because
 // they all need to va_start/va_end their varargs in their frame.
 
-Status InvalidArgument(const char* format, ...) {
+Status InvalidArgumentV(const char* format, va_list args) {
   string message;
+  tensorflow::strings::Appendv(&message, format, args);
+  return WithLogBacktrace(tensorflow::errors::InvalidArgument(message));
+}
+
+Status InvalidArgument(const char* format, ...) {
   va_list args;
   va_start(args, format);
-  tensorflow::strings::Appendv(&message, format, args);
+  Status result = InvalidArgumentV(format, args);
   va_end(args);
-  return WithLogBacktrace(tensorflow::errors::InvalidArgument(message));
+  return result;
 }
 
 Status Unimplemented(const char* format, ...) {
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 1d7dd34449..4bc2d632cd 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -40,6 +40,13 @@ limitations under the License.
 
 namespace xla {
 
+// Logs the provided status message with a backtrace.
+//
+// For use by Status-factories, logs a backtrace at the point where the status
+// is created, such that we can use --vmodule=util=1 to see all status
+// creation backtraces.
+Status WithLogBacktrace(const Status& status);
+
 // Ranks greater than 8 are very rare, so use InlinedVector<int64, 8> to store
 // the bounds and indices. And for the rare cases of ranks greater than 8,
 // the InlinedVector will just behave like an std::vector<> and allocate the
@@ -207,6 +214,9 @@ Status ResourceExhausted(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 Status NotFound(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 
+// Passed-varargs variant of the InvalidArgument factory above.
+Status InvalidArgumentV(const char* format, va_list args);
+
 // Splits the lines of the original, replaces leading whitespace with the prefix
 // given by "indentation", and returns the string joined by newlines again. As a
 // side effect, any additional trailing whitespace is removed.
-- 
GitLab


From d3d9dc68ec625ed853b6356757210b063302f396 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 20:26:35 -0800
Subject: [PATCH 213/966] internal change

PiperOrigin-RevId: 183333411
---
 .../contrib/tpu/profiler/capture_tpu_profile.cc     | 11 +++++++++--
 tensorflow/contrib/tpu/profiler/tpu_profiler.proto  | 13 ++++++++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 1cded9f8cf..7373d0e17c 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -47,12 +47,14 @@ string GetCurrentTimeStampAsString() {
   return s;
 }
 
-ProfileResponse Profile(const string& service_addr, int duration_ms) {
+ProfileResponse Profile(const string& service_addr, int duration_ms,
+                        const ProfileOptions& opts) {
   ProfileRequest request;
   request.set_duration_ms(duration_ms);
   request.set_max_events(kMaxEvents);
   request.add_tools("input_pipeline");
   request.add_tools("overview_page");
+  *request.mutable_opts() = opts;
   std::cout << "Limiting the number of trace events to " << kMaxEvents
             << std::endl;
   ::grpc::ClientContext context;
@@ -76,6 +78,7 @@ int main(int argc, char** argv) {
   tensorflow::string FLAGS_service_addr;
   tensorflow::string FLAGS_logdir;
   int FLAGS_duration_ms = 2000;
+  bool FLAGS_include_dataset_ops = true;
   std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("service_addr", &FLAGS_service_addr,
                        "Address of TPU profiler service e.g. localhost:8466"),
@@ -83,6 +86,8 @@ int main(int argc, char** argv) {
                        "Path of TensorBoard log directory e.g. /tmp/tb_log"),
       tensorflow::Flag("duration_ms", &FLAGS_duration_ms,
                        "Duration of tracing in ms. Default is 2000ms."),
+      tensorflow::Flag("include_dataset_ops", &FLAGS_include_dataset_ops,
+                       "Set to false to profile longer TPU device traces."),
   };
 
   std::cout << "Welcome to the Cloud TPU Profiler v" << TPU_PROFILER_VERSION
@@ -97,8 +102,10 @@ int main(int argc, char** argv) {
   tensorflow::port::InitMain(argv[0], &argc, &argv);
 
   int duration_ms = FLAGS_duration_ms;
+  tensorflow::ProfileOptions opts;
+  opts.set_include_dataset_ops(FLAGS_include_dataset_ops);
   tensorflow::ProfileResponse response =
-      tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms);
+      tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms, opts);
   // Use the current timestamp as the run name.
   tensorflow::string run = tensorflow::tpu::GetCurrentTimeStampAsString();
   TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index bf30d2ce09..f3f3302ceb 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -13,6 +13,14 @@ service TPUProfiler {
   }
 }
 
+message ProfileOptions {
+  // We don't collect the dataset ops by default for better trace-viewer
+  // scalability. The caller can mannually set this field to include the ops.
+  bool include_dataset_ops = 1;
+
+  // next-field: 2
+}
+
 message ProfileRequest {
   // In future, the caller will be able to customize when profiling starts and
   // stops. For now, it collects `duration_ms` milliseconds worth of data.
@@ -25,10 +33,13 @@ message ProfileRequest {
   // required profiling tools name such as "input_pipeline_analyzer" etc
   repeated string tools = 3;
 
+  // Optional profiling options that control how a TF session will be profiled.
+  ProfileOptions opts = 4;
+
   // In future, the caller will indicate which TF session is being profiled, and
   // only data relating to that program will be returned. For now, we assume
   // all activity during the profiling period is relevant.
-  // next-field: 4
+  // next-field: 5
 }
 
 message ProfileToolData {
-- 
GitLab


From e35d8a1752538197a48ad34eb256371260d97e1c Mon Sep 17 00:00:00 2001
From: Max Galkin <maxgalkin@google.com>
Date: Thu, 25 Jan 2018 20:27:11 -0800
Subject: [PATCH 214/966] Log more info about the ill-formed node in
 ComputeTransitiveFanin.

PiperOrigin-RevId: 183333452
---
 tensorflow/core/grappler/grappler_item.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 149f6fc735..2f8549cf39 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -134,6 +134,7 @@ std::vector<const NodeDef*> ComputeTransitiveFanin(
     const NodeDef* node = name_to_node[NodeName(root)];
     if (!node) {
       *ill_formed = true;
+      VLOG(2) << "ComputeTransitiveFanin: problem with root node: " << root;
       return {};
     }
     queue.push_back(node);
@@ -153,6 +154,7 @@ std::vector<const NodeDef*> ComputeTransitiveFanin(
     for (const string& input : node->input()) {
       const NodeDef* in = name_to_node[NodeName(input)];
       if (!in) {
+        VLOG(2) << "ComputeTransitiveFanin: problem with node: " << input;
         *ill_formed = true;
         return {};
       }
-- 
GitLab


From 4383f3d002ddb0712a7aac3303cde6e599de65eb Mon Sep 17 00:00:00 2001
From: Joel Shor <joel-shor@users.noreply.github.com>
Date: Thu, 25 Jan 2018 21:30:45 -0800
Subject: [PATCH 215/966] Increase tolerance on TFGAN losses test. fixes #16238
 (#16435)

---
 tensorflow/contrib/gan/python/losses/python/losses_impl_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index 7d2a7a254f..56ac45554d 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -620,7 +620,7 @@ class CombineAdversarialLossTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       for _ in range(10):  # spot check closeness on more than one sample.
         gnorm_np, precond_gnorm_np = sess.run([gnorm, precond_gnorm])
-        self.assertNear(gnorm_np, precond_gnorm_np, 1e-5)
+        self.assertNear(gnorm_np, precond_gnorm_np, 1e-4)
 
 
 class CycleConsistencyLossTest(test.TestCase):
-- 
GitLab


From 4e68568923332e14b45b5062391b30016ca3c607 Mon Sep 17 00:00:00 2001
From: Taehoon Lee <me@taehoonlee.com>
Date: Fri, 26 Jan 2018 14:30:55 +0900
Subject: [PATCH 216/966] Fix docstrings in `scan` (#16432)

---
 tensorflow/python/ops/functional_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 7dbccf1caf..ac03d30fcd 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -458,7 +458,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
 
   For example, if `elems` is `(t1, [t2, t3])` and `initializer` is
   `[i1, i2]` then an appropriate signature for `fn` in `python2` is:
-  `fn = lambda (acc_p1, acc_p2), (t1 [t2, t3]):` and `fn` must return a list,
+  `fn = lambda (acc_p1, acc_p2), (t1, [t2, t3]):` and `fn` must return a list,
   `[acc_n1, acc_n2]`.  An alternative correct signature for `fn`, and the
    one that works in `python3`, is:
   `fn = lambda a, t:`, where `a` and `t` correspond to the input tuples.
-- 
GitLab


From 0086c55a0faf41ff9e2a66947e4e94531b5d2cac Mon Sep 17 00:00:00 2001
From: Luke Schaefer <las26@njit.edu>
Date: Fri, 26 Jan 2018 00:31:39 -0500
Subject: [PATCH 217/966] Updating error handling in normalize_tuple (#15822)

On line 83 we test to see if single_value is an int (or able to be cast to an int).

ValueError is fired if `int()` is called with an input like 'asdf' - this is caught and gives a helpful error, using the 'name' param to provide more context.

However, when given an other than a string or int, this is *not* caught - making error messages much more esoteric than the helpful one written out here.

For example, before, I was getting an error:
> line 83, in normalize_tuple
>   int(single_value)
>  TypeError: int() argument must be a string or a number, not 'tuple'


Now I get the more useful:
> ValueError: The `kernel_size` argument must be a tuple of 2 integers.
>  Received: ((0, 3), 50) including element (0, 3) of type <type 'tuple'>
---
 tensorflow/python/layers/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py
index e8be347799..7407d9a7b3 100644
--- a/tensorflow/python/layers/utils.py
+++ b/tensorflow/python/layers/utils.py
@@ -81,7 +81,7 @@ def normalize_tuple(value, n, name):
     for single_value in value_tuple:
       try:
         int(single_value)
-      except ValueError:
+      except (ValueError, TypeError):
         raise ValueError('The `' + name + '` argument must be a tuple of ' +
                          str(n) + ' integers. Received: ' + str(value) + ' '
                          'including element ' + str(single_value) + ' of type' +
-- 
GitLab


From 73cf824a24e46766a1674c7879d8c48bd0728083 Mon Sep 17 00:00:00 2001
From: Armando Fandango <armando-fandango@users.noreply.github.com>
Date: Fri, 26 Jan 2018 00:31:56 -0500
Subject: [PATCH 218/966] Hotfix/fix android example for focus mode continuous
 picture - #15487 (#15489)

* fixed #15487

* fixed #15487

* simplify and tweak formatting to TF style
---
 .../tensorflow/demo/LegacyCameraConnectionFragment.java    | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java b/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
index a317273acd..bc0c738e53 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
@@ -81,8 +81,11 @@ public class LegacyCameraConnectionFragment extends Fragment {
 
           try {
             Camera.Parameters parameters = camera.getParameters();
-            parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
-
+            List<String> focusModes = parameters.getSupportedFocusModes();
+            if (focusModes != null 
+                && focusModes.contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE)) {
+              parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
+            }
             List<Camera.Size> cameraSizes = parameters.getSupportedPreviewSizes();
             Size[] sizes = new Size[cameraSizes.size()];
             int i = 0;
-- 
GitLab


From b6a4bc9d79cd4df561da3326ac921477e6fe301c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 21:57:50 -0800
Subject: [PATCH 219/966] Clarified documentation on
 resize_images.align_corners parameter.

PiperOrigin-RevId: 183339087
---
 .../api_def_FusedResizeAndPadConv2D.pbtxt     |  5 +-
 .../api_def_QuantizedResizeBilinear.pbtxt     |  5 +-
 .../api_def/base_api/api_def_ResizeArea.pbtxt |  5 +-
 .../base_api/api_def_ResizeBicubic.pbtxt      |  5 +-
 .../base_api/api_def_ResizeBicubicGrad.pbtxt  |  5 +-
 .../base_api/api_def_ResizeBilinear.pbtxt     |  5 +-
 .../base_api/api_def_ResizeBilinearGrad.pbtxt |  5 +-
 .../api_def_ResizeNearestNeighbor.pbtxt       |  5 +-
 .../api_def_ResizeNearestNeighborGrad.pbtxt   |  5 +-
 tensorflow/go/op/wrappers.go                  | 69 +++++++++----------
 tensorflow/python/ops/image_ops_impl.py       |  5 +-
 11 files changed, 54 insertions(+), 65 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt
index a72f2bfe5f..118d0e2178 100644
--- a/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt
@@ -30,9 +30,8 @@ END
   attr {
     name: "resize_align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1),
-which exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt
index 6b3ba72e53..a08ed710b7 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize quantized `images` to `size` using quantized bilinear interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt
index 6dc321a544..317ad263cc 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using area interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt
index 06e645e3ee..d4f8233d25 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using bicubic interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt
index bf5201d82e..eeb0680ab8 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt
@@ -25,9 +25,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale grads by (orig_height - 1) / (height - 1), which
-exactly aligns the 4 corners of grads and original_image. If false, rescale by
-orig_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and grad tensors are
+aligned. Defaults to false.
 END
   }
   summary: "Computes the gradient of bicubic interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt
index 0768e437fa..0673baa703 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using bilinear interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt
index fba64203c2..9a1a5fb69a 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt
@@ -25,9 +25,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale grads by (orig_height - 1) / (height - 1), which
-exactly aligns the 4 corners of grads and original_image. If false, rescale by
-orig_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and grad tensors are
+aligned. Defaults to false.
 END
   }
   summary: "Computes the gradient of bilinear interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt
index a74db4c9dc..e6f8dc1941 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using nearest neighbor interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt
index 4ef1547eb4..8d52ca8334 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt
@@ -24,9 +24,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale grads by (orig_height - 1) / (height - 1), which
-exactly aligns the 4 corners of grads and original_image. If false, rescale by
-orig_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and grad tensors are
+aligned. Defaults to false.
 END
   }
   summary: "Computes the gradient of nearest neighbor interpolation."
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 5b19c90238..6d54429186 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2589,10 +2589,10 @@ type ResizeBicubicAttr func(optionalAttr)
 
 // ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
+// value: If true, the centers of the 4 corner pixels of the input and output images are
+// aligned, preserving the values at the corner pixels. If false, rescale by
+// new_height / height, new_width / width.
+// If not specified, defaults to false.
 func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -5817,10 +5817,9 @@ type ResizeBilinearGradAttr func(optionalAttr)
 
 // ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
+// value: If true, the centers of the 4 corner pixels of the grads and the original_image are
+// aligned. If false, rescale by new_height / height, new_width / width.
+// If not specified, defaults to false.
 func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -6387,10 +6386,10 @@ type ResizeBilinearAttr func(optionalAttr)
 
 // ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
+// value: If true, the centers of the 4 corner pixels of the input and output images are
+// aligned, preserving the values at the corner pixels. If false, rescale by
+// new_height / height, new_width / width.
+// If not specified, defaults to false.
 func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -7382,10 +7381,10 @@ type ResizeAreaAttr func(optionalAttr)
 
 // ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
+// value: If true, the centers of the 4 corner pixels of the input and output images are
+// aligned, preserving the values at the corner pixels. If false, rescale by
+// new_height / height, new_width / width.
+// If not specified, defaults to false.
 func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -13688,10 +13687,10 @@ type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
 // FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1),
-// which exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
+// value: If true, the centers of the 4 corner pixels of the input and output images are
+// aligned, preserving the values at the corner pixels. If false, rescale by
+// new_height / height, new_width / width.
+// If not specified, defaults to false.
 func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
 	return func(m optionalAttr) {
 		m["resize_align_corners"] = value
@@ -13833,10 +13832,10 @@ type QuantizedResizeBilinearAttr func(optionalAttr)
 
 // QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
+// value: If true, the centers of the 4 corner pixels of the input and output images are
+// aligned, preserving the values at the corner pixels. If false, rescale by
+// new_height / height, new_width / width.
+// If not specified, defaults to false.
 func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -18668,10 +18667,9 @@ type ResizeBicubicGradAttr func(optionalAttr)
 
 // ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
+// value: If true, the centers of the 4 corner pixels of the grads and the original_image are
+// aligned. If false, rescale by new_height / height, new_width / width.
+// If not specified, defaults to false.
 func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -18712,10 +18710,10 @@ type ResizeNearestNeighborAttr func(optionalAttr)
 
 // ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
+// value: If true, the centers of the 4 corner pixels of the input and output images are
+// aligned, preserving the values at the corner pixels. If false, rescale by
+// new_height / height, new_width / width.
+// If not specified, defaults to false.
 func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -18755,10 +18753,9 @@ type ResizeNearestNeighborGradAttr func(optionalAttr)
 
 // ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
+// value: If true, the centers of the 4 corner pixels of the grads and the original_image are
+// aligned. If false, rescale by new_height / height, new_width / width.
+// If not specified, defaults to false.
 func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 76da3bed31..2c231ef56c 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -770,8 +770,9 @@ def resize_images(images,
     size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
           new size for the images.
     method: ResizeMethod.  Defaults to `ResizeMethod.BILINEAR`.
-    align_corners: bool. If true, exactly align all 4 corners of the input and
-                   output. Defaults to `false`.
+    align_corners: bool.  If True, the centers of the 4 corner pixels of the
+        input and output tensors are aligned, preserving the values at the
+        corner pixels. Defaults to `False`.
 
   Raises:
     ValueError: if the shape of `images` is incompatible with the
-- 
GitLab


From 83ea281804d90f5487b18f6af5eb9dd30b14d9fb Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Thu, 25 Jan 2018 22:32:15 -0800
Subject: [PATCH 220/966] For windows cmake build turn on
 CMAKE_SUPPRESS_REGENERATION to avoid flaky build failures.

PiperOrigin-RevId: 183341561
---
 tensorflow/contrib/cmake/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 817e96f5da..12bfd3c62b 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -134,6 +134,9 @@ if(WIN32)
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /D_ITERATOR_DEBUG_LEVEL=0")
+
+  # Try to avoid flaky failures due to failed generation of generate.stamp files.
+  set(CMAKE_SUPPRESS_REGENERATION ON)
 endif()
 
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-- 
GitLab


From e4912296bc67c13e3f53bf599b5ba5af4eea7b06 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 25 Jan 2018 22:45:59 -0800
Subject: [PATCH 221/966] Go: Update generated wrapper functions for TensorFlow
 ops.

PiperOrigin-RevId: 183342483
---
 tensorflow/go/op/wrappers.go | 69 +++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 33 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 6d54429186..5b19c90238 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2589,10 +2589,10 @@ type ResizeBicubicAttr func(optionalAttr)
 
 // ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output images are
-// aligned, preserving the values at the corner pixels. If false, rescale by
-// new_height / height, new_width / width.
-// If not specified, defaults to false.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
 func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -5817,9 +5817,10 @@ type ResizeBilinearGradAttr func(optionalAttr)
 
 // ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the grads and the original_image are
-// aligned. If false, rescale by new_height / height, new_width / width.
-// If not specified, defaults to false.
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
 func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -6386,10 +6387,10 @@ type ResizeBilinearAttr func(optionalAttr)
 
 // ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output images are
-// aligned, preserving the values at the corner pixels. If false, rescale by
-// new_height / height, new_width / width.
-// If not specified, defaults to false.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
 func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -7381,10 +7382,10 @@ type ResizeAreaAttr func(optionalAttr)
 
 // ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output images are
-// aligned, preserving the values at the corner pixels. If false, rescale by
-// new_height / height, new_width / width.
-// If not specified, defaults to false.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
 func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -13687,10 +13688,10 @@ type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
 // FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output images are
-// aligned, preserving the values at the corner pixels. If false, rescale by
-// new_height / height, new_width / width.
-// If not specified, defaults to false.
+// value: If true, rescale input by (new_height - 1) / (height - 1),
+// which exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
 func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
 	return func(m optionalAttr) {
 		m["resize_align_corners"] = value
@@ -13832,10 +13833,10 @@ type QuantizedResizeBilinearAttr func(optionalAttr)
 
 // QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output images are
-// aligned, preserving the values at the corner pixels. If false, rescale by
-// new_height / height, new_width / width.
-// If not specified, defaults to false.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
 func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -18667,9 +18668,10 @@ type ResizeBicubicGradAttr func(optionalAttr)
 
 // ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the grads and the original_image are
-// aligned. If false, rescale by new_height / height, new_width / width.
-// If not specified, defaults to false.
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
 func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -18710,10 +18712,10 @@ type ResizeNearestNeighborAttr func(optionalAttr)
 
 // ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output images are
-// aligned, preserving the values at the corner pixels. If false, rescale by
-// new_height / height, new_width / width.
-// If not specified, defaults to false.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
 func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
@@ -18753,9 +18755,10 @@ type ResizeNearestNeighborGradAttr func(optionalAttr)
 
 // ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the grads and the original_image are
-// aligned. If false, rescale by new_height / height, new_width / width.
-// If not specified, defaults to false.
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
 func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
 	return func(m optionalAttr) {
 		m["align_corners"] = value
-- 
GitLab


From 3cf36b440e7866895e5fc20f2c15ac56f0d96cbc Mon Sep 17 00:00:00 2001
From: Koan-Sin Tan <koansin.tan@gmail.com>
Date: Fri, 26 Jan 2018 14:45:27 +0800
Subject: [PATCH 222/966] use bilinear op to resize input of label_image

replace previous naive downsize() function with TF Lite
RESIZE_BILINEAR operator
---
 .../contrib/lite/examples/label_image/BUILD   |  5 +-
 .../examples/label_image/bitmap_helpers.h     | 14 ++--
 .../label_image/bitmap_helpers_impl.h         | 80 ++++++++++++++-----
 .../lite/examples/label_image/label_image.cc  | 12 +--
 4 files changed, 79 insertions(+), 32 deletions(-)

diff --git a/tensorflow/contrib/lite/examples/label_image/BUILD b/tensorflow/contrib/lite/examples/label_image/BUILD
index 476d85c031..d216cdf69b 100644
--- a/tensorflow/contrib/lite/examples/label_image/BUILD
+++ b/tensorflow/contrib/lite/examples/label_image/BUILD
@@ -42,7 +42,10 @@ cc_library(
         "bitmap_helpers_impl.h",
         "label_image.h",
     ],
-    deps = ["//tensorflow/contrib/lite:string"],
+    deps = [
+        "//tensorflow/contrib/lite:string",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+    ],
 )
 
 # TODO(ahentz): Test disabled as it has a memory leek from read_bmp
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
index 860e27e5ba..471fda2ba4 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
@@ -26,15 +26,15 @@ uint8_t* read_bmp(const std::string& input_bmp_name, int* width, int* height,
                   int* channels, Settings* s);
 
 template <class T>
-void downsize(T* out, uint8_t* in, int image_height, int image_width,
-              int image_channels, int wanted_height, int wanted_width,
-              int wanted_channels, Settings* s);
+void resize(T* out, uint8_t* in, int image_height, int image_width,
+            int image_channels, int wanted_height, int wanted_width,
+            int wanted_channels, Settings* s);
 
 // explicit instantiation
-template void downsize<uint8_t>(uint8_t*, unsigned char*, int, int, int, int,
-                                int, int, Settings*);
-template void downsize<float>(float*, unsigned char*, int, int, int, int, int,
-                              int, Settings*);
+template void resize<uint8_t>(uint8_t*, unsigned char*, int, int, int, int,
+                              int, int, Settings*);
+template void resize<float>(float*, unsigned char*, int, int, int, int, int,
+                            int, Settings*);
 
 }  // namespace label_image
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
index 64a931082b..942906e269 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
@@ -16,30 +16,74 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H
 #define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H
 
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/version.h"
+
 #include "tensorflow/contrib/lite/examples/label_image/label_image.h"
 
 namespace tflite {
 namespace label_image {
 
 template <class T>
-void downsize(T* out, uint8_t* in, int image_height, int image_width,
-              int image_channels, int wanted_height, int wanted_width,
-              int wanted_channels, Settings* s) {
-  for (int y = 0; y < wanted_height; ++y) {
-    const int in_y = (y * image_height) / wanted_height;
-    uint8_t* in_row = in + (in_y * image_width * image_channels);
-    T* out_row = out + (y * wanted_width * wanted_channels);
-    for (int x = 0; x < wanted_width; ++x) {
-      const int in_x = (x * image_width) / wanted_width;
-      uint8_t* in_pixel = in_row + (in_x * image_channels);
-      T* out_pixel = out_row + (x * wanted_channels);
-      for (int c = 0; c < wanted_channels; ++c) {
-        if (s->input_floating)
-          out_pixel[c] = (in_pixel[c] - s->input_mean) / s->input_std;
-        else
-          out_pixel[c] = in_pixel[c];
-      }
-    }
+void resize(T* out, uint8_t* in, int image_height, int image_width,
+            int image_channels, int wanted_height, int wanted_width,
+            int wanted_channels, Settings* s) {
+
+  int number_of_pixels = image_height * image_width * image_channels;
+  std::unique_ptr<Interpreter> interpreter(new Interpreter);
+
+  int base_index = 0;
+
+  // two inputs: input and new_sizes
+  interpreter->AddTensors(2, &base_index);
+  // one output
+  interpreter->AddTensors(1, &base_index);
+  // set input and output tensors
+  interpreter->SetInputs({0, 1});
+  interpreter->SetOutputs({2});
+
+  // set paramters of tensors
+  TfLiteQuantizationParams quant;
+  interpreter->SetTensorParametersReadWrite(
+      0, kTfLiteFloat32, "input",
+      {1, image_height, image_width, image_channels}, quant);
+  interpreter->SetTensorParametersReadWrite(1, kTfLiteInt32, "new_size", {2},
+                                            quant);
+  interpreter->SetTensorParametersReadWrite(
+      2, kTfLiteFloat32, "output",
+      {1, wanted_height, wanted_width, wanted_channels}, quant);
+
+  ops::builtin::BuiltinOpResolver resolver;
+  TfLiteRegistration* resize_op =
+      resolver.FindOp(BuiltinOperator_RESIZE_BILINEAR);
+  interpreter->AddNodeWithParameters({0, 1}, {2}, nullptr, 0, nullptr,
+                                     resize_op, nullptr);
+
+  interpreter->AllocateTensors();
+
+  // fill input image
+  // in[] are integers, cannot do memcpy() directly
+  auto input = interpreter->typed_tensor<float>(0);
+  for (int i = 0; i < number_of_pixels; i++) input[i] = in[i];
+
+  // fill new_sizes
+  interpreter->typed_tensor<int>(1)[0] = wanted_height;
+  interpreter->typed_tensor<int>(1)[1] = wanted_width;
+
+  interpreter->Invoke();
+
+  auto output = interpreter->typed_tensor<float>(2);
+  auto output_number_of_pixels =
+      wanted_height * wanted_height * wanted_channels;
+
+  for (int i = 0; i < output_number_of_pixels; i++) {
+    if (s->input_floating)
+      out[i] = (output[i] - s->input_mean) / s->input_std;
+    else
+      out[i] = (uint8_t)output[i];
   }
 }
 
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc
index d7f49ad875..a78900122e 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.cc
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc
@@ -151,14 +151,14 @@ void RunInference(Settings* s) {
   switch (interpreter->tensor(input)->type) {
     case kTfLiteFloat32:
       s->input_floating = true;
-      downsize<float>(interpreter->typed_tensor<float>(input), in,
-                      image_height, image_width, image_channels,
-                      wanted_height, wanted_width, wanted_channels, s);
+      resize<float>(interpreter->typed_tensor<float>(input), in,
+                    image_height, image_width, image_channels,
+                    wanted_height, wanted_width, wanted_channels, s);
       break;
     case kTfLiteUInt8:
-      downsize<uint8_t>(interpreter->typed_tensor<uint8_t>(input), in,
-                        image_height, image_width, image_channels,
-                        wanted_height, wanted_width, wanted_channels, s);
+      resize<uint8_t>(interpreter->typed_tensor<uint8_t>(input), in,
+                      image_height, image_width, image_channels,
+                      wanted_height, wanted_width, wanted_channels, s);
       break;
     default:
       LOG(FATAL) << "cannot handle input type "
-- 
GitLab


From ffda6079ed619df8fd3edb4db71ffc7d005c2430 Mon Sep 17 00:00:00 2001
From: Tayo Oguntebi <tayo@google.com>
Date: Thu, 25 Jan 2018 23:37:20 -0800
Subject: [PATCH 223/966]   Adds R1 test for ReduceWindow.

PiperOrigin-RevId: 183345779
---
 .../compiler/xla/tests/reduce_window_test.cc  | 76 ++++++++++++-------
 1 file changed, 49 insertions(+), 27 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 73b37e201a..7f3c72671d 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -1016,37 +1016,39 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
                                ::testing::tuple<R2ReduceWindowTestData, bool>> {
  protected:
   R2ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
-};
 
-TEST_P(R2ReduceWindowTest, Add) {
-  ComputationBuilder b(client_, TestName());
-  const auto& param = ::testing::get<0>(GetParam());
-  CHECK(param.reducer == kAdd);
-
-  const float kInitValue = 0.0f;
-  Array2D<float> input(param.base_bounds[0], param.base_bounds[1], 1.0f);
-  std::unique_ptr<Literal> input_literal =
-      Literal::CreateR2FromArray2DWithLayout(
-          input, LayoutUtil::MakeLayout(param.layout));
+  void DoIt() {
+    ComputationBuilder b(client_, TestName());
+    const auto& param = ::testing::get<0>(GetParam());
+    CHECK(param.reducer == kAdd);
 
-  ComputationDataHandle parameter;
-  auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
-                                                     &b, &parameter);
-  auto init_value =
-      CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
-  b.ReduceWindow(/*operand=*/parameter,
-                 /*init_value=*/init_value,
-                 /*computation=*/CreateScalarAddComputation(FloatType(), &b),
-                 /*window_dimensions=*/param.window_bounds,
-                 /*window_strides=*/param.strides, /*padding=*/param.padding);
+    const float kInitValue = 0.0f;
+    Array2D<float> input(param.base_bounds[0], param.base_bounds[1], 1.0f);
+    std::unique_ptr<Literal> input_literal =
+        Literal::CreateR2FromArray2DWithLayout(
+            input, LayoutUtil::MakeLayout(param.layout));
 
-  auto expected = ReferenceUtil::ReduceWindow2DAdd(
-      /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds,
-      /*stride=*/param.strides, /*padding=*/param.padding);
+    ComputationDataHandle parameter;
+    auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                       &b, &parameter);
+    auto init_value =
+        CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
+    b.ReduceWindow(/*operand=*/parameter,
+                   /*init_value=*/init_value,
+                   /*computation=*/CreateScalarAddComputation(FloatType(), &b),
+                   /*window_dimensions=*/param.window_bounds,
+                   /*window_strides=*/param.strides, /*padding=*/param.padding);
+
+    auto expected = ReferenceUtil::ReduceWindow2DAdd(
+        /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds,
+        /*stride=*/param.strides, /*padding=*/param.padding);
+
+    ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected),
+                             {input_arg.get()}, DefaultErrorSpec());
+  }
+};
 
-  ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected),
-                           {input_arg.get()}, DefaultErrorSpec());
-}
+TEST_P(R2ReduceWindowTest, DoIt) { DoIt(); }
 
 INSTANTIATE_TEST_CASE_P(
     R2ReduceWindowTestInstantiation, R2ReduceWindowTest,
@@ -1054,6 +1056,26 @@ INSTANTIATE_TEST_CASE_P(
                        ::testing::ValuesIn(use_bfloat16_params)),
     R2ReduceWindowTestDataToString);
 
+class R2ReduceWindowFailingCpuGpuBf16Test : public R2ReduceWindowTest {};
+
+// TODO(b/72234705): Fix the test cases failed on CPU and GPU.
+XLA_TEST_P(R2ReduceWindowFailingCpuGpuBf16Test,
+           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) {
+  DoIt();
+}
+
+const R2ReduceWindowTestData kR2FailingValuesCpuGpuBf16Test[] = {
+    {/*base_bounds=*/{8, 128}, /*window_bounds=*/{8, 128},
+     /*strides=*/{1, 1}, /*layout=*/{1, 0},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+};
+
+INSTANTIATE_TEST_CASE_P(
+    R2ReduceWindowFailingInstantiation, R2ReduceWindowFailingCpuGpuBf16Test,
+    ::testing::Combine(::testing::ValuesIn(kR2FailingValuesCpuGpuBf16Test),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R2ReduceWindowTestDataToString);
+
 struct R1ReduceWindowTestData {
   int64 base_bounds[1];
   int64 window_bounds[1];
-- 
GitLab


From 76f6938bafeb81a4ca41b8dac2b9c83e1286fa95 Mon Sep 17 00:00:00 2001
From: Guangda Lai <laigd@google.com>
Date: Thu, 25 Jan 2018 23:59:19 -0800
Subject: [PATCH 224/966] Set up TensorRT configurations for external use, and
 add a test.

PiperOrigin-RevId: 183347199
---
 configure.py                                 | 119 ++++++++++
 tensorflow/BUILD                             |   9 +
 tensorflow/contrib/tensorrt/BUILD            |  45 ++++
 tensorflow/contrib/tensorrt/tensorrt_test.cc | 159 +++++++++++++
 tensorflow/tensorflow.bzl                    |  16 +-
 tensorflow/workspace.bzl                     |   2 +
 third_party/gpus/cuda_configure.bzl          | 104 +++++----
 third_party/tensorrt/BUILD                   |   0
 third_party/tensorrt/BUILD.tpl               |  67 ++++++
 third_party/tensorrt/build_defs.bzl.tpl      |   7 +
 third_party/tensorrt/tensorrt_configure.bzl  | 224 +++++++++++++++++++
 11 files changed, 701 insertions(+), 51 deletions(-)
 create mode 100644 tensorflow/contrib/tensorrt/BUILD
 create mode 100644 tensorflow/contrib/tensorrt/tensorrt_test.cc
 create mode 100644 third_party/tensorrt/BUILD
 create mode 100644 third_party/tensorrt/BUILD.tpl
 create mode 100644 third_party/tensorrt/build_defs.bzl.tpl
 create mode 100644 third_party/tensorrt/tensorrt_configure.bzl

diff --git a/configure.py b/configure.py
index cf16ef4837..083fed1710 100644
--- a/configure.py
+++ b/configure.py
@@ -43,6 +43,7 @@ _DEFAULT_CUDA_PATH = '/usr/local/cuda'
 _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
 _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
                           'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION)
+_DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/x86_64-linux-gnu'
 _TF_OPENCL_VERSION = '1.2'
 _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
 _DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
@@ -959,6 +960,119 @@ def set_tf_cudnn_version(environ_cp):
   write_action_env_to_bazelrc('TF_CUDNN_VERSION', tf_cudnn_version)
 
 
+def set_tf_tensorrt_install_path(environ_cp):
+  """Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION.
+
+  Adapted from code contributed by Sami Kama (https://github.com/samikama).
+
+  Args:
+    environ_cp: copy of the os.environ.
+
+  Raises:
+    ValueError: if this method was called under non-Linux platform.
+    UserInputError: if user has provided invalid input multiple times.
+  """
+  if not is_linux():
+    raise ValueError('Currently TensorRT is only supported on Linux platform.')
+
+  # Ask user whether to add TensorRT support.
+  if str(int(get_var(
+      environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False))) != '1':
+    return
+
+  for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
+    ask_tensorrt_path = (r'Please specify the location where TensorRT is '
+                         'installed. [Default is %s]:') % (
+                             _DEFAULT_TENSORRT_PATH_LINUX)
+    trt_install_path = get_from_env_or_user_or_default(
+        environ_cp, 'TENSORRT_INSTALL_PATH', ask_tensorrt_path,
+        _DEFAULT_TENSORRT_PATH_LINUX)
+
+    # Result returned from "read" will be used unexpanded. That make "~"
+    # unusable. Going through one more level of expansion to handle that.
+    trt_install_path = os.path.realpath(
+        os.path.expanduser(trt_install_path))
+
+    def find_libs(search_path):
+      """Search for libnvinfer.so in "search_path"."""
+      fl = set()
+      if os.path.exists(search_path) and os.path.isdir(search_path):
+        fl.update([os.path.realpath(os.path.join(search_path, x))
+                   for x in os.listdir(search_path) if 'libnvinfer.so' in x])
+      return fl
+
+    possible_files = find_libs(trt_install_path)
+    possible_files.update(find_libs(os.path.join(trt_install_path, 'lib')))
+    possible_files.update(find_libs(os.path.join(trt_install_path, 'lib64')))
+
+    def is_compatible(tensorrt_lib, cuda_ver, cudnn_ver):
+      """Check the compatibility between tensorrt and cudnn/cudart libraries."""
+      ldd_bin = which('ldd') or '/usr/bin/ldd'
+      ldd_out = run_shell([ldd_bin, tensorrt_lib]).split(os.linesep)
+      cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$')
+      cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$')
+      cudnn = None
+      cudart = None
+      for line in ldd_out:
+        if 'libcudnn.so' in line:
+          cudnn = cudnn_pattern.search(line)
+        elif 'libcudart.so' in line:
+          cudart = cuda_pattern.search(line)
+      if cudnn and len(cudnn.group(1)):
+        cudnn = convert_version_to_int(cudnn.group(1))
+      if cudart and len(cudart.group(1)):
+        cudart = convert_version_to_int(cudart.group(1))
+      return (cudnn == cudnn_ver) and (cudart == cuda_ver)
+
+    cuda_ver = convert_version_to_int(environ_cp['TF_CUDA_VERSION'])
+    cudnn_ver = convert_version_to_int(environ_cp['TF_CUDNN_VERSION'])
+    nvinfer_pattern = re.compile('.*libnvinfer.so.?(.*)$')
+    highest_ver = [0, None, None]
+
+    for lib_file in possible_files:
+      if is_compatible(lib_file, cuda_ver, cudnn_ver):
+        ver_str = nvinfer_pattern.search(lib_file).group(1)
+        ver = convert_version_to_int(ver_str) if len(ver_str) else 0
+        if ver > highest_ver[0]:
+          highest_ver = [ver, ver_str, lib_file]
+    if highest_ver[1] is not None:
+      trt_install_path = os.path.dirname(highest_ver[2])
+      tf_tensorrt_version = highest_ver[1]
+      break
+
+    # Try another alternative from ldconfig.
+    ldconfig_bin = which('ldconfig') or '/sbin/ldconfig'
+    ldconfig_output = run_shell([ldconfig_bin, '-p'])
+    search_result = re.search(
+        '.*libnvinfer.so\\.?([0-9.]*).* => (.*)', ldconfig_output)
+    if search_result:
+      libnvinfer_path_from_ldconfig = search_result.group(2)
+      if os.path.exists(libnvinfer_path_from_ldconfig):
+        if is_compatible(libnvinfer_path_from_ldconfig, cuda_ver, cudnn_ver):
+          trt_install_path = os.path.dirname(libnvinfer_path_from_ldconfig)
+          tf_tensorrt_version = search_result.group(1)
+          break
+
+    # Reset and Retry
+    print('Invalid path to TensorRT. None of the following files can be found:')
+    print(trt_install_path)
+    print(os.path.join(trt_install_path, 'lib'))
+    print(os.path.join(trt_install_path, 'lib64'))
+    if search_result:
+      print(libnvinfer_path_from_ldconfig)
+
+  else:
+    raise UserInputError('Invalid TF_TENSORRT setting was provided %d '
+                         'times in a row. Assuming to be a scripting mistake.' %
+                         _DEFAULT_PROMPT_ASK_ATTEMPTS)
+
+  # Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION
+  environ_cp['TENSORRT_INSTALL_PATH'] = trt_install_path
+  write_action_env_to_bazelrc('TENSORRT_INSTALL_PATH', trt_install_path)
+  environ_cp['TF_TENSORRT_VERSION'] = tf_tensorrt_version
+  write_action_env_to_bazelrc('TF_TENSORRT_VERSION', tf_tensorrt_version)
+
+
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
@@ -1244,9 +1358,11 @@ def main():
     environ_cp['TF_NEED_COMPUTECPP'] = '0'
     environ_cp['TF_NEED_OPENCL'] = '0'
     environ_cp['TF_CUDA_CLANG'] = '0'
+    environ_cp['TF_NEED_TENSORRT'] = '0'
 
   if is_macos():
     environ_cp['TF_NEED_JEMALLOC'] = '0'
+    environ_cp['TF_NEED_TENSORRT'] = '0'
 
   set_build_var(environ_cp, 'TF_NEED_JEMALLOC', 'jemalloc as malloc',
                 'with_jemalloc', True)
@@ -1278,6 +1394,8 @@ def main():
       'TF_CUDA_CONFIG_REPO' not in environ_cp):
     set_tf_cuda_version(environ_cp)
     set_tf_cudnn_version(environ_cp)
+    if is_linux():
+      set_tf_tensorrt_install_path(environ_cp)
     set_tf_cuda_compute_capabilities(environ_cp)
 
     set_tf_cuda_clang(environ_cp)
@@ -1332,6 +1450,7 @@ def main():
         'more details.')
   config_info_line('mkl', 'Build with MKL support.')
   config_info_line('monolithic', 'Config for mostly static monolithic build.')
+  config_info_line('tensorrt', 'Build with TensorRT support.')
 
 if __name__ == '__main__':
   main()
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 9099463c4f..3d2411a266 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -370,6 +370,14 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+# TODO(laigd): consider removing this option and make TensorRT enabled
+# automatically when CUDA is enabled.
+config_setting(
+    name = "with_tensorrt_support",
+    values = {"define": "with_tensorrt_support=true"},
+    visibility = ["//visibility:public"],
+)
+
 package_group(
     name = "internal",
     packages = [
@@ -558,6 +566,7 @@ filegroup(
         "//tensorflow/contrib/tensor_forest/proto:all_files",
         "//tensorflow/contrib/tensorboard:all_files",
         "//tensorflow/contrib/tensorboard/db:all_files",
+        "//tensorflow/contrib/tensorrt:all_files",
         "//tensorflow/contrib/testing:all_files",
         "//tensorflow/contrib/text:all_files",
         "//tensorflow/contrib/tfprof:all_files",
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
new file mode 100644
index 0000000000..28f571e1f0
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -0,0 +1,45 @@
+# Description:
+#   Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow.
+#   APIs are meant to change over time.
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
+
+tf_cuda_cc_test(
+    name = "tensorrt_test_cc",
+    size = "small",
+    srcs = ["tensorrt_test.cc"],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/tensorrt/tensorrt_test.cc b/tensorflow/contrib/tensorrt/tensorrt_test.cc
new file mode 100644
index 0000000000..e11522ea5b
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/tensorrt_test.cc
@@ -0,0 +1,159 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda.h"
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace {
+
+class Logger : public nvinfer1::ILogger {
+ public:
+  void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
+    switch (severity) {
+      case Severity::kINFO:
+        LOG(INFO) << msg;
+        break;
+      case Severity::kWARNING:
+        LOG(WARNING) << msg;
+        break;
+      case Severity::kINTERNAL_ERROR:
+      case Severity::kERROR:
+        LOG(ERROR) << msg;
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+class ScopedWeights {
+ public:
+  ScopedWeights(float value) : value_(value) {
+    w.type = nvinfer1::DataType::kFLOAT;
+    w.values = &value_;
+    w.count = 1;
+  }
+  const nvinfer1::Weights& get() { return w; }
+
+ private:
+  float value_;
+  nvinfer1::Weights w;
+};
+
+const char* kInputTensor = "input";
+const char* kOutputTensor = "output";
+
+// Creates a network to compute y=2x+3.
+nvinfer1::IHostMemory* CreateNetwork() {
+  Logger logger;
+  nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
+  ScopedWeights weights(2.0);
+  ScopedWeights bias(3.0);
+
+  nvinfer1::INetworkDefinition* network = builder->createNetwork();
+  // Add the input.
+  auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT,
+                                 nvinfer1::DimsCHW{1, 1, 1});
+  EXPECT_NE(input, nullptr);
+  // Add the hidden layer.
+  auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get());
+  EXPECT_NE(layer, nullptr);
+  // Mark the output.
+  auto output = layer->getOutput(0);
+  output->setName(kOutputTensor);
+  network->markOutput(*output);
+  // Build the engine
+  builder->setMaxBatchSize(1);
+  builder->setMaxWorkspaceSize(1 << 10);
+  auto engine = builder->buildCudaEngine(*network);
+  EXPECT_NE(engine, nullptr);
+  // Serialize the engine to create a model, then close everything.
+  nvinfer1::IHostMemory* model = engine->serialize();
+  network->destroy();
+  engine->destroy();
+  builder->destroy();
+  return model;
+}
+
+// Executes the network.
+void Execute(nvinfer1::IExecutionContext& context, const float* input,
+             float* output) {
+  const nvinfer1::ICudaEngine& engine = context.getEngine();
+
+  // We have two bindings: input and output.
+  ASSERT_EQ(engine.getNbBindings(), 2);
+  const int input_index = engine.getBindingIndex(kInputTensor);
+  const int output_index = engine.getBindingIndex(kOutputTensor);
+
+  // Create GPU buffers and a stream
+  void* buffers[2];
+  ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float)));
+  ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float)));
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
+
+  // Copy the input to the GPU, execute the network, and copy the output back.
+  //
+  // Note that since the host buffer was not created as pinned memory, these
+  // async copies are turned into sync copies. So the following synchronization
+  // could be removed.
+  ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
+                               cudaMemcpyHostToDevice, stream));
+  context.enqueue(1, buffers, stream, nullptr);
+  ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
+                               cudaMemcpyDeviceToHost, stream));
+  cudaStreamSynchronize(stream);
+
+  // Release the stream and the buffers
+  cudaStreamDestroy(stream);
+  ASSERT_EQ(0, cudaFree(buffers[input_index]));
+  ASSERT_EQ(0, cudaFree(buffers[output_index]));
+}
+
+TEST(TensorrtTest, BasicFunctions) {
+  // Create the network model.
+  nvinfer1::IHostMemory* model = CreateNetwork();
+  // Use the model to create an engine and then an execution context.
+  Logger logger;
+  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
+  nvinfer1::ICudaEngine* engine =
+      runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
+  model->destroy();
+  nvinfer1::IExecutionContext* context = engine->createExecutionContext();
+
+  // Execute the network.
+  float input = 1234;
+  float output;
+  Execute(*context, &input, &output);
+  EXPECT_EQ(output, input * 2 + 3);
+
+  // Destroy the engine.
+  context->destroy();
+  engine->destroy();
+  runtime->destroy();
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 383c97344a..7fe9c98726 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -10,6 +10,10 @@ load(
     "tf_additional_xla_deps_py",
     "if_static",
 )
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "if_cuda",
@@ -197,6 +201,7 @@ def tf_copts(android_optimization_level_override="-O2", is_external=False):
           "-fno-exceptions",
           "-ftemplate-depth=900"])
       + if_cuda(["-DGOOGLE_CUDA=1"])
+      + if_tensorrt(["-DGOOGLE_TENSORRT=1"])
       + if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML", "-fopenmp",])
       + if_android_arm(["-mfpu=neon"])
       + if_linux_x86_64(["-msse3"])
@@ -861,9 +866,11 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
 
   When the library is built with --config=cuda:
 
-  - both deps and cuda_deps are used as dependencies
-  - the cuda runtime is added as a dependency (if necessary)
-  - The library additionally passes -DGOOGLE_CUDA=1 to the list of copts
+  - Both deps and cuda_deps are used as dependencies.
+  - The cuda runtime is added as a dependency (if necessary).
+  - The library additionally passes -DGOOGLE_CUDA=1 to the list of copts.
+  - In addition, when the library is also built with TensorRT enabled, it
+      additionally passes -DGOOGLE_TENSORRT=1 to the list of copts.
 
   Args:
   - cuda_deps: BUILD dependencies which will be linked if and only if:
@@ -882,7 +889,8 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
           clean_dep("//tensorflow/core:cuda"),
           "@local_config_cuda//cuda:cuda_headers"
       ]),
-      copts=copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]),
+      copts=(copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) +
+             if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
       **kwargs)
 
 register_extension_info(
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 9145d9e58a..f7d9075032 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -1,6 +1,7 @@
 # TensorFlow external dependencies that can be loaded in WORKSPACE files.
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
+load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
 load("//third_party/git:git_configure.bzl", "git_configure")
 load("//third_party/py:python_configure.bzl", "python_configure")
@@ -68,6 +69,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   check_bazel_version_at_least("0.5.4")
   clang6_configure(name="local_config_clang6")
   cuda_configure(name="local_config_cuda")
+  tensorrt_configure(name="local_config_tensorrt")
   git_configure(name="local_config_git")
   sycl_configure(name="local_config_sycl")
   python_configure(name="local_config_python")
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 2727fa5efe..8e1dd8a54f 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -236,7 +236,7 @@ def _cudnn_install_basedir(repository_ctx):
   return cudnn_install_path
 
 
-def _matches_version(environ_version, detected_version):
+def matches_version(environ_version, detected_version):
   """Checks whether the user-specified version matches the detected version.
 
   This function performs a weak matching so that if the user specifies only the
@@ -317,7 +317,7 @@ def _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value):
   environ_version = ""
   if _TF_CUDA_VERSION in repository_ctx.os.environ:
     environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
-  if environ_version and not _matches_version(environ_version, full_version):
+  if environ_version and not matches_version(environ_version, full_version):
     auto_configure_fail(
         ("CUDA version detected from nvcc (%s) does not match " +
          "TF_CUDA_VERSION (%s)") % (full_version, environ_version))
@@ -338,35 +338,49 @@ _DEFINE_CUDNN_MINOR = "#define CUDNN_MINOR"
 _DEFINE_CUDNN_PATCHLEVEL = "#define CUDNN_PATCHLEVEL"
 
 
-def _find_cuda_define(repository_ctx, cudnn_header_dir, define):
-  """Returns the value of a #define in cudnn.h
+def find_cuda_define(repository_ctx, header_dir, header_file, define):
+  """Returns the value of a #define in a header file.
 
-  Greps through cudnn.h and returns the value of the specified #define. If the
-  #define is not found, then raise an error.
+  Greps through a header file and returns the value of the specified #define.
+  If the #define is not found, then raise an error.
 
   Args:
     repository_ctx: The repository context.
-    cudnn_header_dir: The directory containing the cuDNN header.
+    header_dir: The directory containing the header file.
+    header_file: The header file name.
     define: The #define to search for.
 
   Returns:
-    The value of the #define found in cudnn.h.
+    The value of the #define found in the header.
   """
-  # Confirm location of cudnn.h and grep for the line defining CUDNN_MAJOR.
-  cudnn_h_path = repository_ctx.path("%s/cudnn.h" % cudnn_header_dir)
-  if not cudnn_h_path.exists:
-    auto_configure_fail("Cannot find cudnn.h at %s" % str(cudnn_h_path))
-  result = repository_ctx.execute(["grep", "--color=never", "-E", define, str(cudnn_h_path)])
+  # Confirm location of the header and grep for the line defining the macro.
+  h_path = repository_ctx.path("%s/%s" % (header_dir, header_file))
+  if not h_path.exists:
+    auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
+  result = repository_ctx.execute(
+      # Grep one more lines as some #defines are splitted into two lines.
+      ["grep", "--color=never", "-A1", "-E", define, str(h_path)])
   if result.stderr:
-    auto_configure_fail("Error reading %s: %s" %
-                        (result.stderr, str(cudnn_h_path)))
+    auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
 
-  # Parse the cuDNN major version from the line defining CUDNN_MAJOR
-  lines = result.stdout.splitlines()
-  if len(lines) == 0 or lines[0].find(define) == -1:
+  # Parse the version from the line defining the macro.
+  if result.stdout.find(define) == -1:
     auto_configure_fail("Cannot find line containing '%s' in %s" %
-                        (define, str(cudnn_h_path)))
-  return lines[0].replace(define, "").strip()
+                        (define, h_path))
+  version = result.stdout
+  # Remove the new line and '\' character if any.
+  version = version.replace("\\", " ")
+  version = version.replace("\n", " ")
+  version = version.replace(define, "").lstrip()
+  # Remove the code after the version number.
+  version_end = version.find(" ")
+  if version_end != -1:
+    if version_end == 0:
+      auto_configure_fail(
+          "Cannot extract the version from line containing '%s' in %s" %
+          (define, str(h_path)))
+    version = version[:version_end].strip()
+  return version
 
 
 def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
@@ -382,12 +396,12 @@ def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
   """
   cudnn_header_dir = _find_cudnn_header_dir(repository_ctx,
                                             cudnn_install_basedir)
-  major_version = _find_cuda_define(repository_ctx, cudnn_header_dir,
-                                    _DEFINE_CUDNN_MAJOR)
-  minor_version = _find_cuda_define(repository_ctx, cudnn_header_dir,
-                                    _DEFINE_CUDNN_MINOR)
-  patch_version = _find_cuda_define(repository_ctx, cudnn_header_dir,
-                                    _DEFINE_CUDNN_PATCHLEVEL)
+  major_version = find_cuda_define(
+      repository_ctx, cudnn_header_dir, "cudnn.h", _DEFINE_CUDNN_MAJOR)
+  minor_version = find_cuda_define(
+      repository_ctx, cudnn_header_dir, "cudnn.h", _DEFINE_CUDNN_MINOR)
+  patch_version = find_cuda_define(
+      repository_ctx, cudnn_header_dir, "cudnn.h", _DEFINE_CUDNN_PATCHLEVEL)
   full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
 
   # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
@@ -395,7 +409,7 @@ def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
   environ_version = ""
   if _TF_CUDNN_VERSION in repository_ctx.os.environ:
     environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
-  if environ_version and not _matches_version(environ_version, full_version):
+  if environ_version and not matches_version(environ_version, full_version):
     cudnn_h_path = repository_ctx.path("%s/include/cudnn.h" %
                                        cudnn_install_basedir)
     auto_configure_fail(
@@ -427,7 +441,7 @@ def _compute_capabilities(repository_ctx):
   return capabilities
 
 
-def _cpu_value(repository_ctx):
+def get_cpu_value(repository_ctx):
   """Returns the name of the host operating system.
 
   Args:
@@ -447,7 +461,7 @@ def _cpu_value(repository_ctx):
 
 def _is_windows(repository_ctx):
   """Returns true if the host operating system is windows."""
-  return _cpu_value(repository_ctx) == "Windows"
+  return get_cpu_value(repository_ctx) == "Windows"
 
 def _lib_name(lib, cpu_value, version="", static=False):
   """Constructs the platform-specific name of a library.
@@ -582,11 +596,8 @@ def _find_libs(repository_ctx, cuda_config):
     cuda_config: The CUDA config as returned by _get_cuda_config
 
   Returns:
-    Map of library names to structs of filename and path as returned by
-    _find_cuda_lib and _find_cupti_lib.
+    Map of library names to structs of filename and path.
   """
-  cudnn_version = cuda_config.cudnn_version
-  cudnn_ext = ".%s" % cudnn_version if cudnn_version else ""
   cpu_value = cuda_config.cpu_value
   return {
       "cuda": _find_cuda_lib("cuda", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path),
@@ -611,7 +622,7 @@ def _find_libs(repository_ctx, cuda_config):
       "cudnn": _find_cuda_lib(
           "cudnn", repository_ctx, cpu_value, cuda_config.cudnn_install_basedir,
           cuda_config.cudnn_version),
-      "cupti": _find_cupti_lib(repository_ctx, cuda_config),
+      "cupti": _find_cupti_lib(repository_ctx, cuda_config)
   }
 
 
@@ -654,7 +665,7 @@ def _get_cuda_config(repository_ctx):
       compute_capabilities: A list of the system's CUDA compute capabilities.
       cpu_value: The name of the host operating system.
   """
-  cpu_value = _cpu_value(repository_ctx)
+  cpu_value = get_cpu_value(repository_ctx)
   cuda_toolkit_path = _cuda_toolkit_path(repository_ctx)
   cuda_version = _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value)
   cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
@@ -712,13 +723,13 @@ error_gpu_disabled()
 
 
 def _create_dummy_repository(repository_ctx):
-  cpu_value = _cpu_value(repository_ctx)
+  cpu_value = get_cpu_value(repository_ctx)
 
   # Set up BUILD file for cuda/.
   _tpl(repository_ctx, "cuda:build_defs.bzl",
        {
            "%{cuda_is_configured}": "False",
-           "%{cuda_extra_copts}": "[]"
+           "%{cuda_extra_copts}": "[]",
        })
   _tpl(repository_ctx, "cuda:BUILD",
        {
@@ -805,8 +816,8 @@ def _norm_path(path):
   return path
 
 
-def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
-    src_files = [], dest_files = []):
+def symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
+                            src_files = [], dest_files = []):
   """Returns a genrule to symlink(or copy if on Windows) a set of files.
 
   If src_dir is passed, files will be read from the given directory; otherwise
@@ -913,11 +924,11 @@ def _create_local_cuda_repository(repository_ctx):
   # cuda_toolkit_path
   cuda_toolkit_path = cuda_config.cuda_toolkit_path
   cuda_include_path = cuda_toolkit_path + "/include"
-  genrules = [_symlink_genrule_for_dir(repository_ctx,
+  genrules = [symlink_genrule_for_dir(repository_ctx,
       cuda_include_path, "cuda/include", "cuda-include")]
-  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+  genrules.append(symlink_genrule_for_dir(repository_ctx,
       cuda_toolkit_path + "/nvvm", "cuda/nvvm", "cuda-nvvm"))
-  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+  genrules.append(symlink_genrule_for_dir(repository_ctx,
       cuda_toolkit_path + "/extras/CUPTI/include",
       "cuda/extras/CUPTI/include", "cuda-extras"))
 
@@ -927,15 +938,15 @@ def _create_local_cuda_repository(repository_ctx):
   for lib in cuda_libs.values():
     cuda_lib_src.append(lib.path)
     cuda_lib_dest.append("cuda/lib/" + lib.file_name)
-  genrules.append(_symlink_genrule_for_dir(repository_ctx, None, "", "cuda-lib",
-                                       cuda_lib_src, cuda_lib_dest))
+  genrules.append(symlink_genrule_for_dir(repository_ctx, None, "", "cuda-lib",
+                                          cuda_lib_src, cuda_lib_dest))
 
-  # Set up the symbolic links for cudnn if cudnn was was not installed to
+  # Set up the symbolic links for cudnn if cndnn was not installed to
   # CUDA_TOOLKIT_PATH.
   included_files = _read_dir(repository_ctx, cuda_include_path).replace(
       cuda_include_path, '').splitlines()
   if '/cudnn.h' not in included_files:
-    genrules.append(_symlink_genrule_for_dir(repository_ctx, None,
+    genrules.append(symlink_genrule_for_dir(repository_ctx, None,
         "cuda/include/", "cudnn-include", [cudnn_header_dir + "/cudnn.h"],
         ["cudnn.h"]))
   else:
@@ -952,7 +963,6 @@ def _create_local_cuda_repository(repository_ctx):
            "%{cuda_is_configured}": "True",
            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
                repository_ctx, cuda_config.compute_capabilities),
-
        })
   _tpl(repository_ctx, "cuda:BUILD",
        {
diff --git a/third_party/tensorrt/BUILD b/third_party/tensorrt/BUILD
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
new file mode 100644
index 0000000000..feaeb0bea6
--- /dev/null
+++ b/third_party/tensorrt/BUILD.tpl
@@ -0,0 +1,67 @@
+# NVIDIA TensorRT
+# A high-performance deep learning inference optimizer and runtime.
+
+licenses(["notice"])
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "tensorrt_headers",
+    hdrs = [%{tensorrt_headers}],
+    includes = [
+        "include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "nv_infer",
+    srcs = [%{nv_infer}],
+    data = [%{nv_infer}],
+    includes = [
+        "include",
+    ],
+    copts= cuda_default_copts(),
+    deps = [
+        "@local_config_cuda//cuda:cuda",
+        ":tensorrt_headers",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "nv_infer_plugin",
+    srcs = [%{nv_infer_plugin}],
+    data = [%{nv_infer_plugin}],
+    includes = [
+        "include",
+    ],
+    copts= cuda_default_copts(),
+    deps = [
+        "@local_config_cuda//cuda:cuda",
+        ":nv_infer",
+        ":tensorrt_headers",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "nv_parsers",
+    srcs = [%{nv_parsers}],
+    data = [%{nv_parsers}],
+    includes = [
+        "include",
+    ],
+    copts= cuda_default_copts(),
+    deps = [
+        ":tensorrt_headers",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+%{tensorrt_genrules}
diff --git a/third_party/tensorrt/build_defs.bzl.tpl b/third_party/tensorrt/build_defs.bzl.tpl
new file mode 100644
index 0000000000..0dc3a7ba2d
--- /dev/null
+++ b/third_party/tensorrt/build_defs.bzl.tpl
@@ -0,0 +1,7 @@
+# Build configurations for TensorRT.
+
+def if_tensorrt(if_true, if_false=[]):
+  """Tests whether TensorRT was enabled during the configure process."""
+  if %{tensorrt_is_configured}:
+    return if_true
+  return if_false
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
new file mode 100644
index 0000000000..8aa0f28f39
--- /dev/null
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -0,0 +1,224 @@
+# -*- Python -*-
+"""Repository rule for TensorRT configuration.
+
+`tensorrt_configure` depends on the following environment variables:
+
+  * `TF_TENSORRT_VERSION`: The TensorRT libnvinfer version.
+  * `TENSORRT_INSTALL_PATH`: The installation path of the TensorRT library.
+"""
+
+load(
+    "//third_party/gpus:cuda_configure.bzl",
+    "auto_configure_fail",
+    "get_cpu_value",
+    "find_cuda_define",
+    "matches_version",
+    "symlink_genrule_for_dir",
+)
+
+_TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
+_TF_TENSORRT_VERSION = "TF_TENSORRT_VERSION"
+
+_TF_TENSORRT_LIBS = ["nvinfer", "nvinfer_plugin", "nvparsers"]
+_TF_TENSORRT_HEADERS = [
+    "NvInfer.h", "NvInferPlugin.h", "NvCaffeParser.h", "NvUffParser.h",
+    "NvUtils.h"
+]
+
+_DEFINE_TENSORRT_SONAME_MAJOR = "#define NV_TENSORRT_SONAME_MAJOR"
+_DEFINE_TENSORRT_SONAME_MINOR = "#define NV_TENSORRT_SONAME_MINOR"
+_DEFINE_TENSORRT_SONAME_PATCH = "#define NV_TENSORRT_SONAME_PATCH"
+
+
+def _headers_exist(repository_ctx, path):
+  """Returns whether all TensorRT header files could be found in 'path'.
+
+  Args:
+    repository_ctx: The repository context.
+    path: The TensorRT include path to check.
+
+  Returns:
+    True if all TensorRT header files can be found in the path.
+  """
+  for h in _TF_TENSORRT_HEADERS:
+    if not repository_ctx.path("%s/%s" % (path, h)).exists:
+      return False
+  return True
+
+
+def _find_trt_header_dir(repository_ctx, trt_install_path):
+  """Returns the path to the directory containing headers of TensorRT.
+
+  Args:
+    repository_ctx: The repository context.
+    trt_install_path: The TensorRT library install directory.
+
+  Returns:
+    The path of the directory containing the TensorRT header.
+  """
+  if trt_install_path == "/usr/lib/x86_64-linux-gnu":
+    path = "/usr/include/x86_64-linux-gnu"
+    if _headers_exist(repository_ctx, path):
+      return path
+  path = str(repository_ctx.path("%s/../include" % trt_install_path).realpath)
+  if _headers_exist(repository_ctx, path):
+    return path
+  auto_configure_fail(
+      "Cannot find NvInfer.h with TensorRT install path %s" % trt_install_path)
+
+
+def _trt_lib_version(repository_ctx, trt_install_path):
+  """Detects the library (e.g. libnvinfer) version of TensorRT.
+
+  Args:
+    repository_ctx: The repository context.
+    trt_install_path: The TensorRT library install directory.
+
+  Returns:
+    A string containing the library version of TensorRT.
+  """
+  trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
+  major_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
+                                   _DEFINE_TENSORRT_SONAME_MAJOR)
+  minor_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
+                                   _DEFINE_TENSORRT_SONAME_MINOR)
+  patch_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
+                                   _DEFINE_TENSORRT_SONAME_PATCH)
+  full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+  environ_version = repository_ctx.os.environ[_TF_TENSORRT_VERSION].strip()
+  if not matches_version(environ_version, full_version):
+    auto_configure_fail(
+        ("TensorRT library version detected from %s/%s (%s) does not match " +
+         "TF_TENSORRT_VERSION (%s). To fix this rerun configure again.") %
+        (trt_header_dir, "NvInfer.h", full_version, environ_version))
+  return environ_version
+
+
+def _find_trt_libs(repository_ctx, trt_install_path, trt_lib_version):
+  """Finds the given TensorRT library on the system.
+
+  Adapted from code contributed by Sami Kama (https://github.com/samikama).
+
+  Args:
+    repository_ctx: The repository context.
+    trt_install_path: The TensorRT library installation directory.
+    trt_lib_version: The version of TensorRT library files as returned
+      by _trt_lib_version.
+
+  Returns:
+    Map of library names to structs with the following fields:
+      src_file_path: The full path to the library found on the system.
+      dst_file_name: The basename of the target library.
+  """
+  objdump = repository_ctx.which("objdump")
+  result = {}
+  for lib in _TF_TENSORRT_LIBS:
+    dst_file_name = "lib%s.so.%s" % (lib, trt_lib_version)
+    src_file_path = repository_ctx.path("%s/%s" % (trt_install_path,
+                                                   dst_file_name))
+    if not src_file_path.exists:
+      auto_configure_fail(
+          "Cannot find TensorRT library %s" % str(src_file_path))
+    if objdump != None:
+      objdump_out = repository_ctx.execute([objdump, "-p", str(src_file_path)])
+      for line in objdump_out.stdout.splitlines():
+        if "SONAME" in line:
+          dst_file_name = line.strip().split(" ")[-1]
+    result.update({
+        lib:
+            struct(
+                dst_file_name=dst_file_name,
+                src_file_path=str(src_file_path.realpath))
+    })
+  return result
+
+
+def _tpl(repository_ctx, tpl, substitutions):
+  repository_ctx.template(tpl, Label("//third_party/tensorrt:%s.tpl" % tpl),
+                          substitutions)
+
+
+def _create_dummy_repository(repository_ctx):
+  """Create a dummy TensorRT repository."""
+  _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "False"})
+  substitutions = {
+      "%{tensorrt_genrules}": "",
+      "%{tensorrt_headers}": "",
+  }
+  for lib in _TF_TENSORRT_LIBS:
+    k = "%%{%s}" % lib.replace("nv", "nv_")
+    substitutions.update({k: ""})
+  _tpl(repository_ctx, "BUILD", substitutions)
+
+
+def _tensorrt_configure_impl(repository_ctx):
+  """Implementation of the tensorrt_configure repository rule."""
+  if _TENSORRT_INSTALL_PATH not in repository_ctx.os.environ:
+    _create_dummy_repository(repository_ctx)
+    return
+
+  if (get_cpu_value(repository_ctx) != "Linux"):
+    auto_configure_fail("TensorRT is supported only on Linux.")
+  if _TF_TENSORRT_VERSION not in repository_ctx.os.environ:
+    auto_configure_fail("TensorRT library (libnvinfer) version is not set.")
+  trt_install_path = repository_ctx.os.environ[_TENSORRT_INSTALL_PATH].strip()
+  if not repository_ctx.path(trt_install_path).exists:
+    auto_configure_fail(
+        "Cannot find TensorRT install path %s." % trt_install_path)
+
+  # Set up the symbolic links for the library files.
+  trt_lib_version = _trt_lib_version(repository_ctx, trt_install_path)
+  trt_libs = _find_trt_libs(repository_ctx, trt_install_path, trt_lib_version)
+  trt_lib_src = []
+  trt_lib_dest = []
+  for lib in trt_libs.values():
+    trt_lib_src.append(lib.src_file_path)
+    trt_lib_dest.append(lib.dst_file_name)
+  genrules = [
+      symlink_genrule_for_dir(repository_ctx, None, "tensorrt/lib/",
+                              "tensorrt_lib", trt_lib_src, trt_lib_dest)
+  ]
+
+  # Set up the symbolic links for the header files.
+  trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
+  src_files = [
+      "%s/%s" % (trt_header_dir, header) for header in _TF_TENSORRT_HEADERS
+  ]
+  dest_files = _TF_TENSORRT_HEADERS
+  genrules.append(
+      symlink_genrule_for_dir(repository_ctx, None, "tensorrt/include/",
+                              "tensorrt_include", src_files, dest_files))
+
+  # Set up config file.
+  _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "True"})
+
+  # Set up BUILD file.
+  substitutions = {
+      "%{tensorrt_genrules}": "\n".join(genrules),
+      "%{tensorrt_headers}": '":tensorrt_include"',
+  }
+  for lib in _TF_TENSORRT_LIBS:
+    k = "%%{%s}" % lib.replace("nv", "nv_")
+    v = '"tensorrt/lib/%s"' % trt_libs[lib].dst_file_name
+    substitutions.update({k: v})
+  _tpl(repository_ctx, "BUILD", substitutions)
+
+
+tensorrt_configure = repository_rule(
+    implementation=_tensorrt_configure_impl,
+    environ=[
+        _TENSORRT_INSTALL_PATH,
+        _TF_TENSORRT_VERSION,
+    ],
+)
+"""Detects and configures the local CUDA toolchain.
+
+Add the following to your WORKSPACE FILE:
+
+```python
+tensorrt_configure(name = "local_config_tensorrt")
+```
+
+Args:
+  name: A unique name for this workspace rule.
+"""
-- 
GitLab


From 96cfcf190b900833b2d9a9c3f84c839e54cfb735 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 01:17:13 -0800
Subject: [PATCH 225/966] Fix checkpoint_utils.init_from_checkpoint() to be
 deterministic.

PiperOrigin-RevId: 183354193
---
 tensorflow/python/training/checkpoint_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 5054873bc1..b5d3e78797 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -176,7 +176,8 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
   ckpt_file = _get_checkpoint_filename(ckpt_dir_or_file)
   reader = load_checkpoint(ckpt_dir_or_file)
   variable_map = reader.get_variable_to_shape_map()
-  for tensor_name_in_ckpt, current_var_or_name in six.iteritems(assignment_map):
+  for tensor_name_in_ckpt, current_var_or_name in sorted(
+      six.iteritems(assignment_map)):
     var = None
     # Check if this is Variable object or list of Variable objects (in case of
     # partitioned variables).
@@ -233,7 +234,7 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
           if "/part_" in var_name:
             var_name = var_name[:var_name.index("/part_")]
           scope_variables.add(var_name)
-      for var_name in scope_variables:
+      for var_name in sorted(scope_variables):
         # Lookup name with specified prefix and suffix from current variable.
         # If tensor_name given is '/' (root), don't use it for full name.
         full_tensor_name = var_name[len(scopes):]
-- 
GitLab


From 7578785dff668c63ba6b5423a6bf2a5984c7b409 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 03:50:58 -0800
Subject: [PATCH 226/966] Fix override annotations

PiperOrigin-RevId: 183367326
---
 tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h | 4 ++--
 tensorflow/core/grappler/costs/virtual_scheduler.h       | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
index f938d08c84..02c0fc687f 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
@@ -316,7 +316,7 @@ class DenseClassificationGrowStats : public ClassificationStats {
   void PackToProto(FertileSlot* slot) const override;
 
   void InitLeafClassStats(int best_split_index, LeafStat* left_stats,
-                          LeafStat* right_stats) const;
+                          LeafStat* right_stats) const override;
 
  protected:
   void ClassificationAddSplitStats() override {
@@ -383,7 +383,7 @@ class SparseClassificationGrowStats : public ClassificationStats {
   void PackToProto(FertileSlot* slot) const override;
 
   void InitLeafClassStats(int best_split_index, LeafStat* left_stats,
-                          LeafStat* right_stats) const;
+                          LeafStat* right_stats) const override;
 
  protected:
   void ClassificationAddSplitStats() override {
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 8ccc51f545..9db6d46266 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -139,8 +139,8 @@ class FIFOManager : public ReadyNodeManager {
  public:
   FIFOManager() : ReadyNodeManager() {}
   ~FIFOManager() override {}
-  virtual void Init(
-      const std::unordered_map<const NodeDef*, NodeState>* node_state) {}
+  void Init(const std::unordered_map<const NodeDef*, NodeState>* node_state)
+      override {}
   void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
   const NodeDef* GetCurrNode() override {
     CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
-- 
GitLab


From c8c2e4932afccb594bfe05e22facea1aba9dd454 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 05:14:47 -0800
Subject: [PATCH 227/966] Remove dead code

PiperOrigin-RevId: 183374040
---
 .../cloud/kernels/bigquery_table_accessor.cc  | 28 -------------
 .../ops/fused_conv2d_bias_activation_op.cc    |  7 ----
 .../tensor_forest/kernels/v4/input_data.cc    |  2 -
 tensorflow/core/framework/op_gen_lib.cc       | 29 -------------
 tensorflow/core/framework/op_kernel.cc        |  7 ----
 tensorflow/core/kernels/summary_kernels.cc    |  2 -
 tensorflow/core/ops/data_flow_ops.cc          | 19 ---------
 tensorflow/core/ops/image_ops.cc              | 36 ----------------
 tensorflow/core/ops/training_ops.cc           | 42 -------------------
 .../core/profiler/internal/tfprof_timeline.h  |  1 -
 .../core/profiler/internal/tfprof_utils.cc    |  3 --
 11 files changed, 176 deletions(-)

diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
index deb324634b..1bfd27305d 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 
 namespace tensorflow {
-
 namespace {
 
 constexpr size_t kBufferSize = 1024 * 1024;  // In bytes.
@@ -40,33 +39,6 @@ Status ParseJson(StringPiece json, Json::Value* result) {
   return Status::OK();
 }
 
-string ColumnTypeToString(BigQueryTableAccessor::ColumnType enum_type) {
-  switch (enum_type) {
-    case BigQueryTableAccessor::ColumnType::kRecord:
-      return "RECORD";
-    case BigQueryTableAccessor::ColumnType::kString:
-      return "STRING";
-    case BigQueryTableAccessor::ColumnType::kBytes:
-      return "BYTES";
-    case BigQueryTableAccessor::ColumnType::kInteger:
-      return "INTEGER";
-    case BigQueryTableAccessor::ColumnType::kFloat:
-      return "FLOAT";
-    case BigQueryTableAccessor::ColumnType::kBoolean:
-      return "BOOLEAN";
-    case BigQueryTableAccessor::ColumnType::kTimestamp:
-      return "TIMESTAMP";
-    case BigQueryTableAccessor::ColumnType::kDate:
-      return "DATE";
-    case BigQueryTableAccessor::ColumnType::kTime:
-      return "TIME";
-    case BigQueryTableAccessor::ColumnType::kDatetime:
-      return "DATETIME";
-    case BigQueryTableAccessor::ColumnType::kNone:
-      return "NONE";
-  }
-}
-
 Status ParseColumnType(const string& type,
                        BigQueryTableAccessor::ColumnType* enum_type) {
   if (type == "RECORD") {
diff --git a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
index 6a56237f67..bafd1d5941 100644
--- a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
@@ -25,13 +25,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace {
-// Return the string containing the list of valid activation modes, that can be
-// used as an Attr() in REGISTER_OP.
-string GetAllActivationModeAttrString() { return "activation_mode: {'Relu'}"; }
-
-}  // namespace
-
 // --------------------------------------------------------------------------
 
 // TODO(pauldonnelly): Add support for double inputs and scales to this Op,
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
index 14cb19d36f..bf0fb92450 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
@@ -21,8 +21,6 @@ namespace tensorflow {
 namespace tensorforest {
 namespace {
 
-const int32 SPARSE_DEFAULT = 0;
-
 bool DecideInequalityTest(const decision_trees::InequalityTest& test,
                           float value) {
   float bias = test.threshold().float_value();
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index e78b6ab5d9..870bbb141b 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -266,35 +266,6 @@ static void StringReplace(const string& from, const string& to, string* s) {
   *s = str_util::Join(split, to.c_str());
 }
 
-static void RenameInDocs(const string& from, const string& to, OpDef* op_def) {
-  const string from_quoted = strings::StrCat("`", from, "`");
-  const string to_quoted = strings::StrCat("`", to, "`");
-  for (int i = 0; i < op_def->input_arg_size(); ++i) {
-    if (!op_def->input_arg(i).description().empty()) {
-      StringReplace(from_quoted, to_quoted,
-                    op_def->mutable_input_arg(i)->mutable_description());
-    }
-  }
-  for (int i = 0; i < op_def->output_arg_size(); ++i) {
-    if (!op_def->output_arg(i).description().empty()) {
-      StringReplace(from_quoted, to_quoted,
-                    op_def->mutable_output_arg(i)->mutable_description());
-    }
-  }
-  for (int i = 0; i < op_def->attr_size(); ++i) {
-    if (!op_def->attr(i).description().empty()) {
-      StringReplace(from_quoted, to_quoted,
-                    op_def->mutable_attr(i)->mutable_description());
-    }
-  }
-  if (!op_def->summary().empty()) {
-    StringReplace(from_quoted, to_quoted, op_def->mutable_summary());
-  }
-  if (!op_def->description().empty()) {
-    StringReplace(from_quoted, to_quoted, op_def->mutable_description());
-  }
-}
-
 static void RenameInDocs(const string& from, const string& to,
                          ApiDef* api_def) {
   const string from_quoted = strings::StrCat("`", from, "`");
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index aee3a0afbc..16bf5c256f 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -943,13 +943,6 @@ Status FindKernelRegistration(const DeviceType& device_type,
   return Status::OK();
 }
 
-Status FindKernelRegistration(const DeviceType& device_type, const Node& node,
-                              const KernelRegistration** reg,
-                              bool* was_attr_mismatch) {
-  return FindKernelRegistration(device_type, node.def(), reg,
-                                was_attr_mismatch);
-}
-
 }  // namespace
 
 // TODO(irving): Change const NodeDef& to const Node&
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index da3644779d..d317a8d33d 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -278,8 +278,6 @@ class WriteAudioSummaryOp : public OpKernel {
 
  private:
   int max_outputs_;
-  bool has_sample_rate_attr_;
-  float sample_rate_attr_;
 };
 REGISTER_KERNEL_BUILDER(Name("WriteAudioSummary").Device(DEVICE_CPU),
                         WriteAudioSummaryOp);
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index 12c27c7984..4f946fb3ca 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -171,29 +171,10 @@ Status TwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
   return Status::OK();
 }
 
-Status ScalarAndTwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
-  ShapeHandle handle;
-  DimensionHandle unused_handle;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
-  for (int i = 1; i < c->num_inputs(); ++i) {
-    TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle));
-    TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle));
-  }
-  for (int i = 0; i < c->num_outputs(); ++i) {
-    c->set_output(i, c->Scalar());
-  }
-  return Status::OK();
-}
-
 Status TwoElementOutput(InferenceContext* c) {
   c->set_output(0, c->Vector(2));
   return Status::OK();
 }
-
-Status ScalarOutput(InferenceContext* c) {
-  c->set_output(0, c->Scalar());
-  return Status::OK();
-}
 }  // namespace
 
 REGISTER_OP("RandomShuffleQueue")
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 7484ebb078..ef2ac267cc 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -25,42 +25,6 @@ using shape_inference::ShapeHandle;
 
 namespace {
 
-const char kDecodeJpegCommonDocStr[] = R"doc(
-The attr `channels` indicates the desired number of color channels for the
-decoded image.
-
-Accepted values are:
-
-*   0: Use the number of channels in the JPEG-encoded image.
-*   1: output a grayscale image.
-*   3: output an RGB image.
-
-If needed, the JPEG-encoded image is transformed to match the requested number
-of color channels.
-
-The attr `ratio` allows downscaling the image by an integer factor during
-decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-downscaling the image later.
-
-)doc";
-
-const char kDecodeJpegCommonParamsDocStr[] = R"doc(
-channels: Number of color channels for the decoded image.
-ratio: Downscaling ratio.
-fancy_upscaling: If true use a slower but nicer upscaling of the
-  chroma planes (yuv420/422 only).
-try_recover_truncated:  If true try to recover an image from truncated input.
-acceptable_fraction: The minimum required fraction of lines before a truncated
-  input is accepted.
-dct_method: string specifying a hint about the algorithm used for
-  decompression.  Defaults to "" which maps to a system-specific
-  default.  Currently valid values are ["INTEGER_FAST",
-  "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-  jpeg library changes to a version that does not have that specific
-  option.)
-image: 3-D with shape `[height, width, channels]`..
-)doc";
-
 // Sets output[0] to shape [batch_dim,height,width,channel_dim], where
 // height and width come from the size_tensor.
 Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim,
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index e8d03877c9..6ce9595fb6 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -22,48 +22,6 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
-const char kAddSignCommonDocStr[] = R"doc(
-Update '*var' according to the AddSign update.
-
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-variable <- variable - lr_t * update
-
-var: Should be from a Variable().
-m: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-sign_decay: Must be a scalar.
-alpha: Must be a scalar.
-beta: Must be a scalar.
-grad: The gradient.
-)doc";
-
-const char kPowerSignCommonDocStr[] = R"doc(
-Update '*var' according to the AddSign update.
-
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-variable <- variable - lr_t * update
-
-var: Should be from a Variable().
-m: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-logbase: Must be a scalar.
-sign_decay: Must be a scalar.
-beta: Must be a scalar.
-grad: The gradient.
-)doc";
-
-const char kOutDocStr[] = R"doc(
-out: Same as "var".
-)doc";
-
-const char kLockDocStr[] = R"doc(
-use_locking: If `True`, updating of the var and m tensors is
-  protected by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc";
-
 static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) {
   auto* handle_data = c->input_handle_shapes_and_types(input);
   if (handle_data != nullptr && !handle_data->empty() &&
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.h b/tensorflow/core/profiler/internal/tfprof_timeline.h
index 4428ab571f..651ad3f0c1 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.h
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.h
@@ -178,7 +178,6 @@ class Timeline {
   int64 step_;
   const string outfile_;
   int64 next_pid_ = 0;
-  int64 allocator_pid_ = -1;
   MemoryTracker mem_tracker_;
   ChromeTraceFormatter chrome_formatter_;
   std::map<string, int64> device_pids_;
diff --git a/tensorflow/core/profiler/internal/tfprof_utils.cc b/tensorflow/core/profiler/internal/tfprof_utils.cc
index 2813bb46fa..7712ebd926 100644
--- a/tensorflow/core/profiler/internal/tfprof_utils.cc
+++ b/tensorflow/core/profiler/internal/tfprof_utils.cc
@@ -355,9 +355,6 @@ static const char* const kOpTypes =
 static const char* const kScope =
     "scope: The nodes in the model graph are organized by their names, which "
     "is hierarchical like filesystem.";
-static const char* const kGraph =
-    "graph: The nodes in the model graph are organized by their operation "
-    "input and output.";
 static const char* const kCode =
     "code: When python trace is available, the nodes are python lines and "
     "their are organized by the python call stack.";
-- 
GitLab


From abdc62aee1eeba32be56d761a2f9988306356084 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 05:15:18 -0800
Subject: [PATCH 228/966] Roll CL 179861781 forward with fix: Wrappers for CUDA
 9 warp-synchronous intrinsics.

PiperOrigin-RevId: 183374082
---
 .../kernels/reduce_slice_ops_gpu.cu.cc        |  11 +-
 tensorflow/core/BUILD                         |   7 +
 tensorflow/core/kernels/bias_op_gpu.cu.cc     |  18 +-
 .../core/kernels/depthwise_conv_op_gpu.cu.cc  |  13 +-
 .../core/kernels/scatter_nd_op_gpu.cu.cc      |  21 +
 tensorflow/core/kernels/svd_op_gpu.cu.cc      |   4 +-
 tensorflow/core/util/cuda_device_functions.h  | 499 ++++++++++
 tensorflow/core/util/cuda_kernel_helper.h     | 857 +++---------------
 .../core/util/cuda_kernel_helper_test.cu.cc   |  60 +-
 tensorflow/core/util/cuda_launch_config.h     | 284 ++++++
 10 files changed, 988 insertions(+), 786 deletions(-)
 create mode 100644 tensorflow/core/util/cuda_device_functions.h
 create mode 100644 tensorflow/core/util/cuda_launch_config.h

diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
index 8e6870fadd..501cddb8c8 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
@@ -34,9 +34,9 @@ namespace functor {
   __global__ void ReduceSliceDeviceKernel##reduceop(                           \
       Cuda3DLaunchConfig config, Index indices_width, Index bound,             \
       const T begin, const Index *indices, const T *input, T *out) {           \
-    CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {                 \
-      CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {               \
-        CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {             \
+    CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {               \
+      CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {             \
+        CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {           \
           Index outidx = x * config.virtual_thread_count.y *                   \
                              config.virtual_thread_count.z +                   \
                          y * config.virtual_thread_count.z + z;                \
@@ -68,8 +68,9 @@ namespace functor {
       if (sizex * sizey * sizez == 0) {                                        \
         return;                                                                \
       }                                                                        \
-      Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(sizex, sizey, sizez, d,\
-          ReduceSliceDeviceKernel##reduceop<T, Index>, 0, 0);                  \
+      Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(                       \
+          sizex, sizey, sizez, d, ReduceSliceDeviceKernel##reduceop<T, Index>, \
+          0, 0);                                                               \
                                                                                \
       ReduceSliceDeviceKernel##reduceop<T, Index>                              \
           <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(    \
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 94973a0e52..29c515121e 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1896,6 +1896,13 @@ cc_library(
     ],
 )
 
+tf_cuda_library(
+    name = "cuda_device_functions",
+    hdrs = ["util/cuda_device_functions.h"],
+    visibility = ["//visibility:public"],
+    deps = [":framework_lite"],
+)
+
 # TODO(josh11b): Is this needed, or can we just use ":protos_all_cc"?
 cc_library(
     name = "protos_cc",
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index 42f3db1d79..2ca194a77f 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -173,19 +173,13 @@ __global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop,
   // Accumulate the results in the shared memory into the first element.
   // No syncthreads is needed since this is only in the same warp.
   int32 thread_index = threadIdx.x;
-  if (thread_index < 16) {
-    s_data[thread_index] += s_data[thread_index + 16];
-    __syncwarp(0xFFFF);
-    if (thread_index < 8) s_data[thread_index] += s_data[thread_index + 8];
-    __syncwarp(0xFF);
-    if (thread_index < 4) s_data[thread_index] += s_data[thread_index + 4];
-    __syncwarp(0xF);
-    if (thread_index < 2) s_data[thread_index] += s_data[thread_index + 2];
-    __syncwarp(0x3);
+  if (thread_index < 32) {
+    AccT data = s_data[thread_index];
+    for (int32 delta = warpSize / 2; delta > 0; delta /= 2) {
+      data += CudaShuffleXorSync(kCudaWarpAll, data, delta);
+    }
     if (thread_index == 0) {
-      T val = T(s_data[0] + s_data[1]);
-      // The first thread writes out the accumulated result to global location.
-      CudaAtomicAdd(bias_backprop + bias_index, val);
+      CudaAtomicAdd(bias_backprop + bias_index, T(data));
     }
   }
 }
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 903aac5d68..5493e33532 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -34,6 +34,7 @@ limitations under the License.
 
 namespace tensorflow {
 
+typedef Eigen::GpuDevice GPUDevice;
 using Eigen::GpuDevice;
 
 // Returns whether depthwise convolution forward or backward input pass can be
@@ -1028,7 +1029,7 @@ __device__ __forceinline__ T WarpSumReduce(T val) {
   int zeros = sub_warp * kWidth;
   unsigned mask = ((1UL << kWidth) - 1) << zeros;
   for (int delta = kWidth / 2; delta > 0; delta /= 2) {
-    val += CudaShuffleXor(mask, val, delta);
+    val += CudaShuffleXorSync(mask, val, delta);
   }
   return val;
 }
@@ -1145,7 +1146,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
 
     // Note: the condition to reach this is uniform across the entire block.
     __syncthreads();
-    unsigned active_threads = CudaBallot(CUDA_WARP_ALL, depth_in_range);
+    unsigned active_threads = CudaBallotSync(kCudaWarpAll, depth_in_range);
 
     if (depth_in_range) {
       const T* const out_ptr = inout_offset + output;
@@ -1159,7 +1160,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
           T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
           // Warp-accumulate pixels of the same depth and write to accumulator.
           for (int delta = 16; delta >= kBlockSlices; delta /= 2) {
-            val += CudaShuffleDown(active_threads, val, delta);
+            val += CudaShuffleXorSync(active_threads, val, delta);
           }
           if (!(thread_idx & 32 - kBlockSlices) /* lane_idx < kBlockSlices */) {
             *accum_ptr = val;
@@ -1399,7 +1400,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
 
     // Note: the condition to reach this is uniform across the entire block.
     __syncthreads();
-    unsigned active_threads = CudaBallot(CUDA_WARP_ALL, slice_in_range);
+    unsigned active_threads = CudaBallotSync(kCudaWarpAll, slice_in_range);
 
     if (slice_in_range) {
       const T* const out_ptr = inout_offset + output;
@@ -1413,10 +1414,10 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
           T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
           // Warp-accumulate pixels of the same depth and write to accumulator.
           for (int delta = 16 / kBlockSlices; delta > 0; delta /= 2) {
-            val += CudaShuffleDown(active_threads, val, delta);
+            val += CudaShuffleXorSync(active_threads, val, delta);
           }
           if (!(thread_idx & 32 / kBlockSlices - 1)) {
-            *accum_ptr = val;
+            *accum_ptr = val;  // kBlockSlices threads per warp.
           }
           ++shared_offset;
           accum_ptr += accum_increment;
diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
index 31f74671ca..a3c21edc15 100644
--- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
@@ -55,6 +55,27 @@ struct LeftUpdate<T, scatter_nd_op::UpdateOp::SUB> {
   }
 };
 
+// Specializations for std::complex, updating real and imaginary part
+// individually. Even though this is not an atomic op anymore, it is safe
+// because there is only one type of op per kernel.
+template <typename T>
+struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
+      std::complex<T>* out, const std::complex<T>& val) {
+    T* ptr = reinterpret_cast<T*>(out);
+    CudaAtomicAdd(ptr, val.real());
+    CudaAtomicAdd(ptr, val.imag());
+  }
+};
+
+template <typename T>
+struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::SUB> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
+      std::complex<T>* out, const std::complex<T>& val) {
+    LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD>()(out, -val);
+  }
+};
+
 }  // namespace
 
 template <typename T, typename Index, scatter_nd_op::UpdateOp op, int IXDIM>
diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc
index dedc2da60b..8c3a58b108 100644
--- a/tensorflow/core/kernels/svd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc
@@ -63,8 +63,8 @@ __global__ void ComputeValueOfVKernel(Cuda2DLaunchConfig config, int64 m,
                                       int64 ldu, const Scalar* M,
                                       const Scalar* U, const Scalar* S,
                                       Scalar* V) {
-  CUDA_AXIS_KERNEL_LOOP(batch, config.virtual_thread_count, x) {
-    CUDA_AXIS_KERNEL_LOOP(i, config.virtual_thread_count, y) {
+  CUDA_AXIS_KERNEL_LOOP(batch, config.virtual_thread_count.x, X) {
+    CUDA_AXIS_KERNEL_LOOP(i, config.virtual_thread_count.y, Y) {
       Scalar v = M[i + m * batch] * U[ldu * (i + m * batch)] * S[batch];
       CudaAtomicAdd(V + batch, v);
     }
diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h
new file mode 100644
index 0000000000..f787687f66
--- /dev/null
+++ b/tensorflow/core/util/cuda_device_functions.h
@@ -0,0 +1,499 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_
+#define TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_
+
+/**
+ * Wrappers and helpers for CUDA device code.
+ *
+ * Wraps the warp-cooperative intrinsics introduced in CUDA 9 to provide
+ * backwards compatibility, see go/volta-porting for details.
+ * Provides atomic operations on types that aren't natively supported.
+ */
+
+#if GOOGLE_CUDA
+
+#include <algorithm>
+#include <complex>
+#include "cuda/include/cuda.h"
+#include "cuda/include/device_functions.h"
+#include "tensorflow/core/platform/types.h"
+
+#if CUDA_VERSION >= 7050
+#include "cuda/include/cuda_fp16.h"
+#endif  // CUDA_VERSION >= 7050
+
+namespace tensorflow {
+
+namespace detail {
+
+// Helper for range-based for loop using 'delta' increments.
+// Usage: see CudaGridRange?() functions below.
+template <typename T>
+class CudaGridRange {
+  struct Iterator {
+    __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
+    __device__ T operator*() const { return index_; }
+    __device__ Iterator& operator++() {
+      index_ += delta_;
+      return *this;
+    }
+    __device__ bool operator!=(const Iterator& other) const {
+      bool greater = index_ > other.index_;
+      bool less = index_ < other.index_;
+      // Anything past an end iterator (delta_ == 0) is equal.
+      // In range-based for loops, this optimizes to 'return less'.
+      if (!other.delta_) {
+        return less;
+      }
+      if (!delta_) {
+        return greater;
+      }
+      return less || greater;
+    }
+
+   private:
+    T index_;
+    const T delta_;
+  };
+
+ public:
+  __device__ CudaGridRange(T begin, T delta, T end)
+      : begin_(begin), delta_(delta), end_(end) {}
+
+  __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
+  __device__ Iterator end() const { return Iterator{end_, 0}; }
+
+ private:
+  T begin_;
+  T delta_;
+  T end_;
+};
+
+}  // namespace detail
+
+// Helper to visit indices in the range 0 <= i < count, using the x-coordinate
+// of the global thread index. That is, each index i is visited by all threads
+// with the same x-coordinate.
+// Usage: for(int i : CudaGridRangeX(count)) { visit(i); }
+template <typename T>
+__device__ detail::CudaGridRange<T> CudaGridRangeX(T count) {
+  return detail::CudaGridRange<T>(blockIdx.x * blockDim.x + threadIdx.x,
+                                  gridDim.x * blockDim.x, count);
+}
+
+// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
+// Usage: for(int i : CudaGridRangeY(count)) { visit(i); }
+template <typename T>
+__device__ detail::CudaGridRange<T> CudaGridRangeY(T count) {
+  return detail::CudaGridRange<T>(blockIdx.y * blockDim.y + threadIdx.y,
+                                  gridDim.y * blockDim.y, count);
+}
+
+// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
+// Usage: for(int i : CudaGridRangeZ(count)) { visit(i); }
+template <typename T>
+__device__ detail::CudaGridRange<T> CudaGridRangeZ(T count) {
+  return detail::CudaGridRange<T>(blockIdx.z * blockDim.z + threadIdx.z,
+                                  gridDim.z * blockDim.z, count);
+}
+
+// Mask for all 32 threads in a warp.
+const unsigned kCudaWarpAll = 0xffffffff;
+
+// Returns the warp lane ID of the calling thread
+__device__ inline unsigned CudaLaneId() {
+  unsigned int lane_id;
+  asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
+  return lane_id;
+}
+
+namespace detail {
+// Returns true if mask is a valid parameter for __shfl*sync to return a well
+// defined value, assuming the calling lane will read from src_lane as part of
+// the shuffle operation.
+//
+// Specifically, returns true iff mask has the calling lane bit and the src_lane
+// bit set, and the src_lane calls this function with the same mask value
+// (required for the two threads to wait for each other).
+//
+// On Volta, for some invalid masks, this function hangs or returns false
+// positives, because the implementation shuffles with the same mask that
+// we are validating. Run on Pascal if you suspect that the mask is incorrect.
+__device__ inline bool CudaValidateShuffleSyncMask(unsigned mask,
+                                                   unsigned src_lane) {
+  unsigned src_dst_mask = 1u << CudaLaneId() | 1u << src_lane;
+#if CUDA_VERSION >= 9000
+  unsigned src_lane_mask = __shfl_sync(mask, mask, src_lane);
+#else
+  unsigned src_lane_mask = __shfl(mask, src_lane);
+#endif
+  return (src_dst_mask & ~mask) == 0 && src_lane_mask == mask;
+}
+
+// Returns the actual source lane for shuffle.
+__device__ inline unsigned CudaShuffleGetSrcLane(int src_lane, int width) {
+  int lane_id = CudaLaneId();
+  int lane_base = lane_id & ~width + 1;
+  int lane_offset = src_lane & width - 1;
+  return lane_base + lane_offset;
+}
+
+// Returns the source lane for shuffle up.
+__device__ inline unsigned CudaShuffleUpGetSrcLane(unsigned delta, int width) {
+  unsigned lane_id = CudaLaneId();
+  if ((lane_id & width - 1) < delta) {
+    return lane_id;
+  }
+  return lane_id - delta;
+}
+
+// Returns the source lane for shuffle down.
+__device__ inline unsigned CudaShuffleDownGetSrcLane(unsigned delta,
+                                                     int width) {
+  unsigned lane_id = CudaLaneId();
+  if ((lane_id & width - 1) + delta >= width) {
+    return lane_id;
+  }
+  return lane_id + delta;
+}
+
+// Returns the source lane for shuffle xor.
+__device__ inline unsigned CudaShuffleXorGetSrcLane(int lane_mask, int width) {
+  int lane_id = CudaLaneId();
+  int src_lane = lane_id ^ lane_mask;
+  if (src_lane > (lane_id | width - 1)) {
+    return lane_id;
+  }
+  return src_lane;
+}
+}  // namespace detail
+
+// For all *_sync wrappers below, it is illegal to synchronize threads from
+// different program locations, because that is not supported before sm_70.
+// In other words, all threads in 'mask' must call the functions in convergence.
+// Code that requires sm_70 (and CUDA 9) may use the intrinsic directly.
+//
+// It is also illegal to shuffle with a mask that produces an undefined result
+// for any of the threads. Specifically, all source threads of the shuffle
+// must have their corresponding bit in 'mask' set.
+
+// Wrapper for __syncwarp. No-op for CUDA 8 and earlier.
+__device__ inline void CudaSyncWarp(unsigned mask = kCudaWarpAll) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  __syncwarp(mask);
+#endif
+}
+
+// Wrapper for __ballot_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+__device__ inline unsigned CudaBallotSync(unsigned mask, int pred) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  return __ballot_sync(mask, pred);
+#else
+  return __ballot(pred) & mask;  // Apply mask to match __ballot_sync's spec.
+#endif
+}
+
+// Wrapper for __any_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+__device__ inline int CudaAnySync(unsigned mask, int pred) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  return __any_sync(mask, pred);
+#else
+  return __any(pred);
+#endif
+}
+
+// Wrapper for __all_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+__device__ inline int CudaAllSync(unsigned mask, int pred) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  return __all_sync(mask, pred);
+#else
+  return __all(pred);
+#endif
+}
+
+// Wrapper for __shfl_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+template <typename T>
+__device__ T CudaShuffleSync(unsigned mask, T value, int src_lane,
+                             int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleGetSrcLane(src_lane, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_sync(mask, value, src_lane, width);
+#else
+  return __shfl(value, src_lane, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleSync(unsigned mask, double value,
+                                         int src_lane, int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleSync(mask, hi, src_lane, width);
+  lo = CudaShuffleSync(mask, lo, src_lane, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __shfl_up_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+template <typename T>
+__device__ inline T CudaShuffleUpSync(unsigned mask, T value, unsigned delta,
+                                      int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleUpGetSrcLane(delta, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_up_sync(mask, value, delta, width);
+#else
+  return __shfl_up(value, delta, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleUpSync(unsigned mask, double value,
+                                           unsigned delta,
+                                           int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleUpSync(mask, hi, delta, width);
+  lo = CudaShuffleUpSync(mask, lo, delta, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
+// in convergence, see comment above for details.
+template <typename T>
+__device__ inline T CudaShuffleDownSync(unsigned mask, T value, unsigned delta,
+                                        int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleDownGetSrcLane(delta, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_down_sync(mask, value, delta, width);
+#else
+  return __shfl_down(value, delta, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleDownSync(unsigned mask, double value,
+                                             unsigned delta,
+                                             int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleDownSync(mask, hi, delta, width);
+  lo = CudaShuffleDownSync(mask, lo, delta, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+template <typename T>
+__device__ T CudaShuffleXorSync(unsigned mask, T value, int lane_mask,
+                                int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleXorGetSrcLane(lane_mask, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_xor_sync(mask, value, lane_mask, width);
+#else
+  return __shfl_xor(value, lane_mask, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleXorSync(unsigned mask, double value,
+                                            int lane_mask,
+                                            int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleXorSync(mask, hi, lane_mask, width);
+  lo = CudaShuffleXorSync(mask, lo, lane_mask, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __ldg.
+template <typename T>
+__host__ __device__ T CudaLdg(const T* address) {
+#if __CUDA_ARCH__ >= 350
+  return __ldg(address);
+#else
+  return *address;
+#endif
+}
+
+__host__ __device__ inline bool CudaLdg(const bool* address) {
+  return CudaLdg(reinterpret_cast<const char*>(address)) != 0;
+}
+
+__host__ __device__ inline std::complex<float> CudaLdg(
+    const std::complex<float>* address) {
+#if __CUDA_ARCH__ >= 350
+  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
+  return std::complex<float>(mem.x, mem.y);
+#else
+  return *address;
+#endif
+}
+
+__host__ __device__ inline std::complex<double> CudaLdg(
+    const std::complex<double>* address) {
+#if __CUDA_ARCH__ >= 350
+  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
+  return std::complex<double>(mem.x, mem.y);
+#else
+  return *address;
+#endif
+}
+
+// Zeroes count elements starting at ptr using all threads of a 1-D grid.
+// Note: this function does not synchronize, and therefore the memory range is
+// not guaranteed to be zero until the next kernel launch.
+template <typename T>
+__global__ void SetZero(const int count, T* ptr) {
+  // Check that the grid is one dimensional and index doesn't overflow.
+  assert(blockDim.y == 1 && blockDim.z == 1);
+  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
+  for (int i : CudaGridRangeX(count)) {
+    ptr[i] = T(0);
+  }
+}
+
+namespace detail {
+// Helper function for atomic accumulation implemented as CAS.
+template <typename T, typename F>
+__device__ T CudaAtomicCasHelper(T* ptr, F accumulate) {
+  T old = *ptr;
+  T assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(ptr, assumed, accumulate(assumed));
+  } while (assumed != old);
+  return old;
+}
+
+// Overload for floating point (using integer comparison to handle NaN
+// correctly).
+template <typename F>
+__device__ float CudaAtomicCasHelper(float* ptr, F accumulate) {
+  return __float_as_int(
+      CudaAtomicCasHelper(reinterpret_cast<int32*>(ptr), [accumulate](int32 a) {
+        return __float_as_int(accumulate(__int_as_float(a)));
+      }));
+}
+template <typename F>
+__device__ double CudaAtomicCasHelper(double* ptr, F accumulate) {
+  return __longlong_as_double(CudaAtomicCasHelper(
+      reinterpret_cast<tensorflow::uint64*>(ptr),
+      [accumulate](tensorflow::uint64 a) {
+        return __double_as_longlong(accumulate(__longlong_as_double(a)));
+      }));
+}
+
+template <typename From, typename To>
+using ToTypeIfConvertible =
+    typename std::enable_if<std::is_convertible<From, To>::value, To>::type;
+
+}  // namespace detail
+
+// CUDA provides atomic ops, but not for all types.  We provide wrappers
+// for some ops and provide implementation for all reasonable types.
+
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicAdd(T* ptr, U value) {
+  return atomicAdd(ptr, value);
+}
+#if __CUDA_ARCH__ < 600
+__device__ inline double CudaAtomicAdd(double* ptr, double value) {
+  return detail::CudaAtomicCasHelper(ptr,
+                                     [value](double a) { return a + value; });
+}
+#elif __clang__
+// Clang cannot compile __nvvm_atom_add_gen_d builtin yet, use inline PTX.
+// see https://reviews.llvm.org/D39638
+__device__ inline double CudaAtomicAdd(double* ptr, double value) {
+  double result;
+  asm volatile("atom.add.f64 %0, [%1], %2;"
+               : "=d"(result)
+               : "l"(ptr), "d"(value)
+               : "memory");
+  return result;
+}
+#endif
+
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicSub(T* ptr, U value) {
+  return atomicSub(ptr, value);
+}
+// Specializations of substraction which add the negative value.
+__device__ inline float CudaAtomicSub(float* ptr, float value) {
+  return CudaAtomicAdd(ptr, -value);
+}
+__device__ inline double CudaAtomicSub(double* ptr, double value) {
+  return CudaAtomicAdd(ptr, -value);
+}
+__device__ inline tensorflow::uint64 CudaAtomicSub(tensorflow::uint64* ptr,
+                                                   tensorflow::uint64 value) {
+  return CudaAtomicAdd(ptr, -value);
+}
+
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMax(T* ptr, U value) {
+  return atomicMax(ptr, value);
+}
+#if __CUDA_ARCH__ < 320
+__device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
+                                                   tensorflow::uint64 value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](tensorflow::uint64 a) { return max(a, value); });
+}
+#endif
+
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMul(T* ptr, U value) {
+  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a * value; });
+}
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicDiv(T* ptr, U value) {
+  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a / value; });
+}
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 3e32ec7973..18a4c008f1 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -18,299 +18,133 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 
-#include <algorithm>
+#include "tensorflow/core/util/cuda_device_functions.h"
+#include "tensorflow/core/util/cuda_launch_config.h"
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "cuda/include/cuda.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/stream_executor.h"
-#include "tensorflow/core/platform/types.h"
+// Deprecated, use 'for(int i : CudaGridRangeX(n))' instead.
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+  for (int i : ::tensorflow::CudaGridRangeX<int>(n))
+// Deprecated, use 'for(int i : CudaGridRange?(n))' instead.
+#define CUDA_AXIS_KERNEL_LOOP(i, n, axis) \
+  for (int i : ::tensorflow::CudaGridRange##axis<int>(n))
 
-// Mask for all 32 threads in a warp.
-#define CUDA_WARP_ALL 0xFFFFFFFF
-
-#if defined(CUDA_VERSION) && CUDA_VERSION < 9000
-// CUDA 9.0 introduces a new, light-weight barrier synchronization primitive
-// that operates at the warp-scope. This is required to ensure visibility of
-// reads/writes among threads that can make indepenent progress on Volta.
-// For previous CUDA versions these synchronizations not necessary, and we
-// define an empty function as a convenience for backward compatibility.
-__device__ inline void __syncwarp(unsigned mask = CUDA_WARP_ALL) {}
-
-// CUDA 9.0 deprecates the warp-intrinsic functions (shfl, ballot, etc.) in
-// favor of synchronizing versions. These ensure that all warp lanes specified
-// in mask execute the intrinsic in convergence. Here we provide legacy mappings
-// to the less-verbose routines provided in previous versions of CUDA.
-#define __ballot_sync(mask, predicate) __ballot(predicate)
-#define __shfl_sync(mask, val, srcLane, width) __shfl(val, srcLane, width)
-#define __shfl_down_sync(mask, val, delta, width) __shfl_down(val, delta, width)
-#define __shfl_up_sync(mask, val, delta, width) __shfl_up(val, delta, width)
-#define __shfl_xor_sync(mask, val, laneMask, width) \
-  __shfl_xor(val, laneMask, width)
-#endif
-
-// Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and
-// GetCuda3DLaunchConfig:
-//
-// There are two versions of GetCudaLaunchConfig and GetCuda2DLaunchConfig, one
-// version uses heuristics without any knowledge of the device kernel, the other
-// version uses cudaOccupancyMaxPotentialBlockSize to determine the theoretical
-// launch parameters that maximize occupancy. Currently, only the maximum
-// occupancy version of GetCuda3DLaunchConfig is available.
-//
-// For large number of work elements, the convention is that each kernel would
-// iterate through its assigned range. The return value of GetCudaLaunchConfig
-// is struct CudaLaunchConfig, which contains all the information needed for the
-// kernel launch, including: virtual number of threads, the number of threads
-// per block and number of threads per block used inside <<< >>> of a kernel
-// launch. GetCuda2DLaunchConfig and GetCuda3DLaunchConfig does the same thing
-// as CudaLaunchConfig. The only difference is the dimension. The macros
-// CUDA_1D_KERNEL_LOOP and CUDA_AXIS_KERNEL_LOOP might be used to do inner loop.
-//
-/* Sample code:
-
-__global__ void MyKernel1D(CudaLaunchConfig config, other_args...) {
-  CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
-    do_your_job_here;
-  }
+namespace tensorflow {
+__host__ __device__ inline tensorflow::bfloat16 CudaLdg(
+    const tensorflow::bfloat16* address) {
+  tensorflow::bfloat16 return_value;
+  return_value.value = CudaLdg(reinterpret_cast<const uint16_t*>(address));
+  return return_value;
 }
 
-__global__ void MyKernel2D(Cuda2DLaunchConfig config, other_args...) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
-      do_your_job_here;
-    }
-  }
+template <typename T>
+__host__ __device__ inline T ldg(const T* ptr) {
+  return CudaLdg(ptr);
 }
 
-__global__ void MyKernel3D(Cuda3DLaunchConfig config, other_args...) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
-      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
-        do_your_job_here;
-      }
-    }
-  }
+template <typename T>
+__host__ __device__ inline const T& tf_min(const T& x, const T& y) {
+  return x < y ? x : y;
 }
 
-void MyDriverFunc(const GPUDevice &d) {
-  // use heuristics
-  CudaLaunchConfig cfg1 = GetCudaLaunchConfig(10240, d);
-  MyKernel1D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg1, other_args...);
-  Cuda2DLaunchConfig cfg2 = GetCuda2DLaunchConfig(10240, 10240, d);
-  MyKernel2D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg2, other_args...);
-  Cuda3DLaunchConfig cfg3 = GetCuda3DLaunchConfig(4096, 4096, 100, d);
-  MyKernel3D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg3, other_args...);
-
-  // maximize occupancy
-  CudaLaunchConfig cfg4 = GetCudaLaunchConfig(10240, d, MyKernel1D, 0, 0 );
-  MyKernel1D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg4, other_args...);
-  Cuda2DLaunchConfig cfg5 = GetCuda2DLaunchConfig(10240, 10240, d,
-                                                  MyKernel1D, 0, 0);
-  MyKernel2D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg5, other_args...);
-  Cuda3DLaunchConfig cfg6 = GetCuda3DLaunchConfig(4096, 4096, 100, d,
-                                                  MyKernel1D, 0, 0);
-  MyKernel3D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg6, other_args...);
+template <typename T>
+__host__ __device__ inline const T& tf_max(const T& x, const T& y) {
+  return x < y ? y : x;
 }
 
-// See the test for this for more example:
-//
-https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
-
-*/
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                            \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
-       i += blockDim.x * gridDim.x)
-
-#define CUDA_AXIS_KERNEL_LOOP(i, n, axis)                                  \
-  for (int i = blockIdx.axis * blockDim.axis + threadIdx.axis; i < n.axis; \
-       i += blockDim.axis * gridDim.axis)
-
-#define DIV_UP(a, b) (((a) + (b)-1) / (b))
-
-namespace tensorflow {
-
-typedef Eigen::GpuDevice GPUDevice;
-
-struct CudaLaunchConfig {
-  // Logical number of thread that works on the elements. If each logical
-  // thread works on exactly a single element, this is the same as the working
-  // element count.
-  int virtual_thread_count = -1;
-  // Number of threads per block.
-  int thread_per_block = -1;
-  // Number of blocks for Cuda kernel launch.
-  int block_count = -1;
-};
-
-// Calculate the Cuda launch config we should use for a kernel launch.
-// This is assuming the kernel is quite simple and will largely be
-// memory-limited.
-// REQUIRES: work_element_count > 0.
-inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                            const GPUDevice& d) {
-  CHECK_GT(work_element_count, 0);
-  CudaLaunchConfig config;
-  const int virtual_thread_count = work_element_count;
-  const int physical_thread_count = std::min(
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
-      virtual_thread_count);
-  const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
-  const int block_count =
-      std::min(DIV_UP(physical_thread_count, thread_per_block),
-               d.getNumCudaMultiProcessors());
-
-  config.virtual_thread_count = virtual_thread_count;
-  config.thread_per_block = thread_per_block;
-  config.block_count = block_count;
-  return config;
+// Overloads of the above functions for float and double.
+__host__ __device__ inline float tf_min(float x, float y) {
+  return fminf(x, y);
 }
-
-// Calculate the Cuda launch config we should use for a kernel launch. This
-// variant takes the resource limits of func into account to maximize occupancy.
-// REQUIRES: work_element_count > 0.
-template <typename DeviceFunc>
-inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                            const GPUDevice& d, DeviceFunc func,
-                                            size_t dynamic_shared_memory_size,
-                                            int block_size_limit) {
-  CHECK_GT(work_element_count, 0);
-  CudaLaunchConfig config;
-  int block_count = 0;
-  int thread_per_block = 0;
-
-  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
-      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-      block_size_limit);
-  CHECK_EQ(err, cudaSuccess);
-
-  block_count =
-      std::min(block_count, DIV_UP(work_element_count, thread_per_block));
-
-  config.virtual_thread_count = work_element_count;
-  config.thread_per_block = thread_per_block;
-  config.block_count = block_count;
-  return config;
+__host__ __device__ inline double tf_min(double x, double y) {
+  return fmin(x, y);
+}
+__host__ __device__ inline float tf_max(float x, float y) {
+  return fmaxf(x, y);
+}
+__host__ __device__ inline double tf_max(double x, double y) {
+  return fmax(x, y);
 }
 
-struct Cuda2DLaunchConfig {
-  dim3 virtual_thread_count = dim3(0, 0, 0);
-  dim3 thread_per_block = dim3(0, 0, 0);
-  dim3 block_count = dim3(0, 0, 0);
-};
-
-inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
-                                                const GPUDevice& d) {
-  Cuda2DLaunchConfig config;
-
-  if (xdim <= 0 || ydim <= 0) {
-    return config;
-  }
-
-  const int kThreadsPerBlock = 256;
-  int block_cols = std::min(xdim, kThreadsPerBlock);
-  // ok to round down here and just do more loops in the kernel
-  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
-
-  const int physical_thread_count =
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
-
-  const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
-
-  config.virtual_thread_count = dim3(xdim, ydim, 1);
-  config.thread_per_block = dim3(block_cols, block_rows, 1);
-
-  int grid_x = std::min(DIV_UP(xdim, block_cols), max_blocks);
+__device__ inline Eigen::half CudaShuffleSync(unsigned mask, Eigen::half value,
+                                              int src_lane,
+                                              int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleSync(mask, static_cast<uint16>(value), src_lane, width));
+}
 
-  config.block_count = dim3(
-      grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1);
-  return config;
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleUpSync(
+    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleUpSync(mask, static_cast<uint16>(value), delta, width));
 }
 
-// Calculate the Cuda 2D and 3D launch config we should use for a kernel launch.
-// This variant takes the resource limits of func into account to maximize
-// occupancy.
-using Cuda3DLaunchConfig = Cuda2DLaunchConfig;
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleDownSync(
+    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleDownSync(mask, static_cast<uint16>(value), delta, width));
+}
 
-template <typename DeviceFunc>
-inline Cuda3DLaunchConfig GetCuda3DLaunchConfig(
-    int xdim, int ydim, int zdim, const GPUDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int block_size_limit) {
-  Cuda3DLaunchConfig config;
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXorSync(
+    unsigned mask, Eigen::half value, int lane_mask, int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleXorSync(mask, static_cast<uint16>(value), lane_mask, width));
+}
 
-  if (xdim <= 0 || ydim <= 0 || zdim <= 0) {
-    return config;
+namespace detail {
+// Overload of above function for half. Note that we don't have
+// atomicCAS() for anything less than 32 bits, so we need to include the
+// other 16 bits in the operation.
+//
+// This version is going to be very slow
+// under high concurrency, since most threads will be spinning on failing
+// their compare-and-swap tests. (The fact that we get false sharing on the
+// neighboring fp16 makes this even worse.) If you are doing a large reduction,
+// you are much better off with doing the intermediate steps in fp32 and then
+// switching to fp16 as late as you can in the calculations.
+//
+// Note: Assumes little endian.
+template <typename F>
+__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
+  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
+#endif
+  namespace half_impl = Eigen::half_impl;
+  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
+  assert(!(intptr & 0x1));  // should be 2-aligned.
+  if (intptr & 0x2) {
+    // The half is in the second part of the uint32 (upper 16 bits).
+    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
+    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+      unsigned short high = static_cast<unsigned short>(arg >> 16);
+      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
+      return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
+    });
+    return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
+  } else {
+    // The half is in the first part of the uint32 (lower 16 bits).
+    uint32* address = reinterpret_cast<uint32*>(intptr);
+    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
+      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
+      return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
+    });
+    return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
   }
-
-  int dev;
-  cudaGetDevice(&dev);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, dev);
-  int xthreadlimit = deviceProp.maxThreadsDim[0];
-  int ythreadlimit = deviceProp.maxThreadsDim[1];
-  int zthreadlimit = deviceProp.maxThreadsDim[2];
-  int xgridlimit = deviceProp.maxGridSize[0];
-  int ygridlimit = deviceProp.maxGridSize[1];
-  int zgridlimit = deviceProp.maxGridSize[2];
-
-  int block_count = 0;
-  int thread_per_block = 0;
-  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
-      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-      block_size_limit);
-  CHECK_EQ(err, cudaSuccess);
-
-#define MIN3(a, b, c) std::min((a), std::min((b), (c)))
-  int threadsx = MIN3(xdim, thread_per_block, xthreadlimit);
-  int threadsy =
-      MIN3(ydim, std::max(thread_per_block / threadsx, 1), ythreadlimit);
-  int threadsz =
-      MIN3(zdim, std::max(thread_per_block / (threadsx * threadsy), 1),
-           zthreadlimit);
-
-  int blocksx = MIN3(block_count, DIV_UP(xdim, threadsx), xgridlimit);
-  int blocksy =
-      MIN3(DIV_UP(block_count, blocksx), DIV_UP(ydim, threadsy), ygridlimit);
-  int blocksz = MIN3(DIV_UP(block_count, (blocksx * blocksy)),
-                     DIV_UP(zdim, threadsz), zgridlimit);
-#undef MIN3
-
-  config.virtual_thread_count = dim3(xdim, ydim, zdim);
-  config.thread_per_block = dim3(threadsx, threadsy, threadsz);
-  config.block_count = dim3(blocksx, blocksy, blocksz);
-  return config;
 }
+}  // namespace detail
 
-template <typename DeviceFunc>
-inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(
-    int xdim, int ydim, const GPUDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int block_size_limit) {
-  return GetCuda3DLaunchConfig(xdim, ydim, 1, d, func,
-                               dynamic_shared_memory_size, block_size_limit);
+__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return a + value; });
 }
-
-// Returns a raw reference to the current cuda stream.  Required by a
-// number of kernel calls (for which StreamInterface* does not work), i.e.
-// CUB and certain cublas primitives.
-inline const cudaStream_t& GetCudaStream(OpKernelContext* context) {
-  const cudaStream_t* ptr = CHECK_NOTNULL(
-      reinterpret_cast<const cudaStream_t*>(context->op_device_context()
-                                                ->stream()
-                                                ->implementation()
-                                                ->CudaStreamMemberHack()));
-  return *ptr;
+__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return a - value; });
 }
 
 namespace cuda_helper {
-
 template <typename IntType>
 __device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
   IntType* orig = first;
@@ -330,495 +164,8 @@ __device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
 
   return first - orig;
 }
-
 }  // namespace cuda_helper
-
-template <typename T>
-__device__ __host__ inline T ldg(const T* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg(address);
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline std::complex<float> ldg(
-    const std::complex<float>* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
-  return std::complex<float>(mem.x, mem.y);
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline std::complex<double> ldg(
-    const std::complex<double>* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
-  return std::complex<double>(mem.x, mem.y);
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline Eigen::half ldg(const Eigen::half* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return Eigen::half_impl::raw_uint16_to_half(
-      __ldg(reinterpret_cast<const uint16_t*>(address)));
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline tensorflow::bfloat16 ldg(
-    const tensorflow::bfloat16* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  tensorflow::bfloat16 return_value;
-  asm volatile("ld.global.nc.u16 %0, [%1];"
-               : "=h"(return_value.value)
-               : "l"(address));
-  return return_value;
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline bool ldg(const bool* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return *reinterpret_cast<const bool*>(
-      __ldg(reinterpret_cast<const char*>(address)));
-#else
-  return *address;
-#endif
-}
-
-// CUDA provides atomic ops, but not for all types.  We provide wrappers
-// for some ops and provide implementation for all reasonable types.
-#define CUDA_ATOMIC_WRAPPER(op, T) \
-  __device__ __forceinline__ T CudaAtomic##op(T* address, T val)
-
-#define USE_CUDA_ATOMIC(op, T) \
-  CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
-
-// For atomicAdd.
-USE_CUDA_ATOMIC(Add, int32);
-USE_CUDA_ATOMIC(Add, uint32);
-USE_CUDA_ATOMIC(Add, uint64);
-USE_CUDA_ATOMIC(Add, float);
-
-// For atomicMax.
-USE_CUDA_ATOMIC(Max, int32);
-USE_CUDA_ATOMIC(Max, uint32);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-USE_CUDA_ATOMIC(Max, uint64);
-#else
-// The uint64 overload of atomicMax() is only available for __CUDA_ARCH__ >=
-// 350.  If not satisfied, we provide a custom implementation using atomicCAS().
-CUDA_ATOMIC_WRAPPER(Max, uint64) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed, max(val, assumed));
-  } while (assumed != old);
-
-  return old;
-}
-#endif
-
-// Custom implementation of atomicAdd for double.
-// This implementation is copied from CUDA manual.
-CUDA_ATOMIC_WRAPPER(Add, double) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val + __longlong_as_double(assumed)));
-
-    // Note: uses integer comparison to avoid hang in case of NaN
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
-
-// Custom implementation of atomicAdd for std::complex<float>.
-// This implementation performs to atomic additions on the components.
-CUDA_ATOMIC_WRAPPER(Add, std::complex<float>) {
-#if defined(__CUDA_ARCH__)
-#if __CUDA_ARCH__ >= 350
-  float2* addr_as_float2 = reinterpret_cast<float2*>(address);
-  float2* val_as_float2 = reinterpret_cast<float2*>(&val);
-  CudaAtomicAdd(&(addr_as_float2->x), val_as_float2->x);
-  CudaAtomicAdd(&(addr_as_float2->y), val_as_float2->y);
-#else
-  static_assert(sizeof(std::complex<float>) == 2 * sizeof(float),
-                "Unable to compile CudaAtomicAdd for complex64 because "
-                "sizeof(complex64) != 2*sizeof(float32)");
-  float* addr_as_float = reinterpret_cast<float*>(address);
-  float* val_as_float = reinterpret_cast<float*>(&val);
-  CudaAtomicAdd(addr_as_float, *val_as_float);
-  CudaAtomicAdd(addr_as_float + 1, *(val_as_float + 1));
-#endif
-#endif
-  return *address;
-}
-
-// Custom implementation of atomicAdd for std::complex<double>.
-// This implementation performs to atomic additions on the components
-// using the double atomic wrapper above.
-CUDA_ATOMIC_WRAPPER(Add, complex128) {
-#if defined(__CUDA_ARCH__)
-#if __CUDA_ARCH__ >= 350
-  double2* addr_as_double2 = reinterpret_cast<double2*>(address);
-  double2* val_as_double2 = reinterpret_cast<double2*>(&val);
-  CudaAtomicAdd(&(addr_as_double2->x), val_as_double2->x);
-  CudaAtomicAdd(&(addr_as_double2->y), val_as_double2->y);
-#else
-  static_assert(sizeof(std::complex<double>) == 2 * sizeof(double),
-                "Unable to compile CudaAtomicAdd for complex128 because "
-                "sizeof(complex128) != 2*sizeof(float64)");
-  double* addr_as_double = reinterpret_cast<double*>(address);
-  double* val_as_double = reinterpret_cast<double*>(&val);
-  CudaAtomicAdd(addr_as_double, *val_as_double);
-  CudaAtomicAdd(addr_as_double + 1, *(val_as_double + 1));
-#endif
-#endif
-  return *address;
-}
-
-// Helper functions for CudaAtomicAdd(half*, half), below.
-//
-// Note that if __CUDA_ARCH__ >= 530, we could probably use __hadd2()
-// for a more efficient implementation, assuming that adding -0.0
-// will never harm the neighboring value. In this version, we take special
-// care to guarantee the bits of the untouched value are unchanged.
-inline __device__ uint32 add_to_low_half(uint32 val, float x) {
-  Eigen::half low_half;
-  low_half.x = static_cast<uint16>(val & 0xffffu);
-  low_half = static_cast<Eigen::half>(static_cast<float>(low_half) + x);
-  return (val & 0xffff0000u) | low_half.x;
-}
-
-inline __device__ uint32 add_to_high_half(uint32 val, float x) {
-  Eigen::half high_half;
-  high_half.x = static_cast<uint16>(val >> 16);
-  high_half = static_cast<Eigen::half>(static_cast<float>(high_half) + x);
-  return (val & 0xffffu) | (high_half.x << 16);
-}
-
-// Custom implementation of atomicAdd for half. Note that we don't have
-// atomicCAS() for anything less than 32 bits, so we need to include the
-// other 16 bits in the operation.
-//
-// Unlike the other atomic adds, this version is going to be very slow
-// under high concurrency, since most threads will be spinning on failing
-// their compare-and-swap tests. (The fact that we get false sharing on the
-// neighboring fp16 makes this even worse.) If you are doing a large reduction,
-// you are much better off with doing the intermediate steps in fp32 and then
-// switching to fp16 as late as you can in the calculations.
-//
-// Note: Assumes little endian.
-CUDA_ATOMIC_WRAPPER(Add, Eigen::half) {
-  float val_as_float(val);
-  intptr_t address_int = reinterpret_cast<intptr_t>(address);
-  if ((address_int & 0x2) == 0) {
-    // The half is in the first part of the uint32 (lower 16 bits).
-    uint32* address_as_uint32 = reinterpret_cast<uint32*>(address);
-    assert(((intptr_t)address_as_uint32 & 0x3) == 0);
-    uint32 old = *address_as_uint32, assumed;
-
-    do {
-      assumed = old;
-      old = atomicCAS(address_as_uint32, assumed,
-                      add_to_low_half(assumed, val_as_float));
-
-      // Note: uses integer comparison to avoid hang in case of NaN
-    } while (assumed != old);
-
-    Eigen::half ret;
-    ret.x = old & 0xffffu;
-    return ret;
-  } else {
-    // The half is in the second part of the uint32 (upper 16 bits).
-    uint32* address_as_uint32 = reinterpret_cast<uint32*>(address_int - 2);
-    assert(((intptr_t)address_as_uint32 & 0x3) == 0);
-    uint32 old = *address_as_uint32, assumed;
-
-    do {
-      assumed = old;
-      old = atomicCAS(address_as_uint32, assumed,
-                      add_to_high_half(assumed, val_as_float));
-
-      // Note: uses integer comparison to avoid hang in case of NaN
-    } while (assumed != old);
-
-    Eigen::half ret;
-    ret.x = old >> 16;
-    return ret;
-  }
-}
-
-template <typename T>
-__global__ void SetZero(const int nthreads, T* bottom_diff) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) { *(bottom_diff + index) = T(0); }
-}
-
-// For atomicSub.
-
-// Custom implementation for sub by just negating the value.
-#define WRAPPED_ATOMIC_SUB(T) \
-  CUDA_ATOMIC_WRAPPER(Sub, T) { return CudaAtomicAdd(address, -val); }
-
-WRAPPED_ATOMIC_SUB(uint64);
-WRAPPED_ATOMIC_SUB(int32);
-WRAPPED_ATOMIC_SUB(uint32);
-WRAPPED_ATOMIC_SUB(Eigen::half);
-WRAPPED_ATOMIC_SUB(float);
-WRAPPED_ATOMIC_SUB(double);
-
-CUDA_ATOMIC_WRAPPER(Sub, complex64) {
-  const std::complex<float> Tneg(-val.real(), -val.imag());
-  return CudaAtomicAdd(address, Tneg);
-}
-
-CUDA_ATOMIC_WRAPPER(Sub, complex128) {
-  const std::complex<double> Tneg(-val.real(), -val.imag());
-  return CudaAtomicAdd(address, Tneg);
-}
-
-#undef WRAPPED_ATOMIC_SUB
-
-// For atomicMul.
-CUDA_ATOMIC_WRAPPER(Mul, int32) {
-  int32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, val * assumed);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, uint32) {
-  uint32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, val * assumed);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, uint64) {
-  uint64 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, val * assumed);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, float) {
-  int32* address_as_int = reinterpret_cast<int32*>(address);
-  int32 old = *address_as_int, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_int, assumed,
-                    __float_as_int(val * __int_as_float(assumed)));
-  } while (assumed != old);
-  return __int_as_float(old);
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, double) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val * __longlong_as_double(assumed)));
-  } while (assumed != old);
-  return __longlong_as_double(old);
-}
-
-// For atomicDiv.
-CUDA_ATOMIC_WRAPPER(Div, int32) {
-  int32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, assumed / val);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Div, uint32) {
-  uint32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, assumed / val);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Div, uint64) {
-  uint64 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, assumed / val);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Div, float) {
-  int32* address_as_int = reinterpret_cast<int32*>(address);
-  int32 old = *address_as_int, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_int, assumed,
-                    __float_as_int(__int_as_float(assumed) / val));
-  } while (assumed != old);
-  return __int_as_float(old);
-}
-
-CUDA_ATOMIC_WRAPPER(Div, double) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(__longlong_as_double(assumed) / val));
-  } while (assumed != old);
-  return __longlong_as_double(old);
-}
-
-#undef USE_CUDA_ATOMIC
-#undef CUDA_ATOMIC_WRAPPER
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tf_min(const T& x, const T& y) {
-  return x > y ? y : x;
-}
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tf_max(const T& x, const T& y) {
-  return x < y ? y : x;
-}
-
-__device__ EIGEN_ALWAYS_INLINE unsigned CudaBallot(unsigned mask,
-                                                   int predicate) {
-  return __ballot_sync(mask, predicate);
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffle(unsigned mask, T value,
-                                             int srcLane,
-                                             int width = warpSize) {
-  return __shfl_sync(mask, value, srcLane, width);
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffle(unsigned mask, double value,
-                                                  int srcLane,
-                                                  int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_sync(mask, hi, srcLane, width);
-  lo = __shfl_sync(mask, lo, srcLane, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleUp(unsigned mask, T value,
-                                               int delta,
-                                               int width = warpSize) {
-  return __shfl_up_sync(mask, value, delta, width);
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleUp(unsigned mask, double value,
-                                                    int delta,
-                                                    int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_up_sync(mask, hi, delta, width);
-  lo = __shfl_up_sync(mask, lo, delta, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleDown(unsigned mask, T value,
-                                                 int delta,
-                                                 int width = warpSize) {
-  return __shfl_down_sync(mask, value, delta, width);
-}
-
-__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleDown(
-    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
-  return Eigen::half(
-      __shfl_down_sync(mask, static_cast<uint16>(value), delta, width));
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleDown(unsigned mask,
-                                                      double value, int delta,
-                                                      int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_down_sync(mask, hi, delta, width);
-  lo = __shfl_down_sync(mask, lo, delta, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleXor(unsigned mask, T value,
-                                                int laneMask,
-                                                int width = warpSize) {
-  return __shfl_xor_sync(mask, value, laneMask, width);
-}
-
-__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXor(
-    unsigned mask, Eigen::half value, int laneMask, int width = warpSize) {
-  return Eigen::half(
-      __shfl_xor_sync(mask, static_cast<uint16>(value), laneMask, width));
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleXor(unsigned mask,
-                                                     double value, int laneMask,
-                                                     int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_xor_sync(mask, hi, laneMask, width);
-  lo = __shfl_xor_sync(mask, lo, laneMask, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
 }  // namespace tensorflow
 
-#undef DIV_UP
-
 #endif  // GOOGLE_CUDA
-
 #endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
diff --git a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
index 6991554eff..bd4c356ea0 100644
--- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
@@ -52,11 +52,11 @@ __global__ void Count1D(CudaLaunchConfig config, int bufsize, int* outbuf) {
   }
 }
 __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     if (x < 0) {  // x might overflow when testing extreme case
       break;
     }
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
       if (y < 0) {  // y might overflow when testing extreme case
         break;
       }
@@ -66,15 +66,15 @@ __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
   }
 }
 __global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     if (x < 0) {  // x might overflow when testing extreme case
       break;
     }
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
       if (y < 0) {  // y might overflow when testing extreme case
         break;
       }
-      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
+      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {
         if (z < 0) {  // z might overflow when testing extreme case
           break;
         }
@@ -87,6 +87,44 @@ __global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
   }
 }
 
+__global__ void CudaShuffleGetSrcLaneTest(unsigned* failure_count) {
+  unsigned lane_id = CudaLaneId();
+  for (int width = warpSize; width > 1; width /= 2) {
+    auto check_result = [&](const char* op_name, int param, unsigned actual,
+                            unsigned expected) {
+      if (actual != expected) {
+        printf("Cuda%sGetSrcLane(%d, %d) for lane %d returned %d, not %d\n",
+               op_name, param, width, lane_id, actual, expected);
+        CudaAtomicAdd(failure_count, 1);
+      }
+    };
+    for (int src_lane = -warpSize; src_lane <= warpSize; ++src_lane) {
+      unsigned actual_lane = detail::CudaShuffleGetSrcLane(src_lane, width);
+      unsigned expect_lane =
+          CudaShuffleSync(kCudaWarpAll, lane_id, src_lane, width);
+      check_result("Shuffle", src_lane, actual_lane, expect_lane);
+    }
+    for (unsigned delta = 0; delta <= warpSize; ++delta) {
+      unsigned actual_lane = detail::CudaShuffleUpGetSrcLane(delta, width);
+      unsigned expect_lane =
+          CudaShuffleUpSync(kCudaWarpAll, lane_id, delta, width);
+      check_result("ShuffleUp", delta, actual_lane, expect_lane);
+    }
+    for (unsigned delta = 0; delta <= warpSize; ++delta) {
+      unsigned actual_lane = detail::CudaShuffleDownGetSrcLane(delta, width);
+      unsigned expect_lane =
+          CudaShuffleDownSync(kCudaWarpAll, lane_id, delta, width);
+      check_result("ShuffleDown", delta, actual_lane, expect_lane);
+    }
+    for (int lane_lane = warpSize; lane_lane > 0; lane_lane /= 2) {
+      unsigned actual_lane = detail::CudaShuffleXorGetSrcLane(lane_lane, width);
+      unsigned expect_lane =
+          CudaShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width);
+      check_result("ShuffleXor", lane_lane, actual_lane, expect_lane);
+    }
+  }
+}
+
 }  // namespace
 
 class CudaLaunchConfigTest : public ::testing::Test {
@@ -94,7 +132,7 @@ class CudaLaunchConfigTest : public ::testing::Test {
   const int bufsize = 1024;
   int* outbuf = nullptr;
   Eigen::CudaStreamDevice stream;
-  GPUDevice d = GPUDevice(&stream);
+  Eigen::GpuDevice d = Eigen::GpuDevice(&stream);
 
   virtual void SetUp() {
     cudaError_t err = cudaMallocManaged(&outbuf, sizeof(int) * bufsize);
@@ -229,6 +267,16 @@ TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
   #undef TEST_LAUNCH_PARAMETER
 }
 
+TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
+  unsigned* failure_count;
+  ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
+  *failure_count = 0;
+  CudaShuffleGetSrcLaneTest<<<1, 32>>>(failure_count);
+  ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+  ASSERT_EQ(*failure_count, 0);
+  cudaFree(failure_count);
+}
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/util/cuda_launch_config.h b/tensorflow/core/util/cuda_launch_config.h
new file mode 100644
index 0000000000..3ea33ee6cf
--- /dev/null
+++ b/tensorflow/core/util/cuda_launch_config.h
@@ -0,0 +1,284 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_CUDA_LAUNCH_CONFIG_H_
+#define TENSORFLOW_CORE_UTIL_CUDA_LAUNCH_CONFIG_H_
+
+#if GOOGLE_CUDA
+
+#include <algorithm>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "cuda/include/cuda.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/platform/types.h"
+
+// Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and
+// GetCuda3DLaunchConfig:
+//
+// There are two versions of GetCudaLaunchConfig and GetCuda2DLaunchConfig, one
+// version uses heuristics without any knowledge of the device kernel, the other
+// version uses cudaOccupancyMaxPotentialBlockSize to determine the theoretical
+// launch parameters that maximize occupancy. Currently, only the maximum
+// occupancy version of GetCuda3DLaunchConfig is available.
+//
+// For large number of work elements, the convention is that each kernel would
+// iterate through its assigned range. The return value of GetCudaLaunchConfig
+// is struct CudaLaunchConfig, which contains all the information needed for the
+// kernel launch, including: virtual number of threads, the number of threads
+// per block and number of threads per block used inside <<< >>> of a kernel
+// launch. GetCuda2DLaunchConfig and GetCuda3DLaunchConfig does the same thing
+// as CudaLaunchConfig. The only difference is the dimension. The macros
+// CUDA_1D_KERNEL_LOOP and CUDA_AXIS_KERNEL_LOOP might be used to do inner loop.
+//
+/* Sample code:
+
+__global__ void MyKernel1D(CudaLaunchConfig config, other_args...) {
+  CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
+    do_your_job_here;
+  }
+}
+
+__global__ void MyKernel2D(Cuda2DLaunchConfig config, other_args...) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+      do_your_job_here;
+    }
+  }
+}
+
+__global__ void MyKernel3D(Cuda3DLaunchConfig config, other_args...) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
+        do_your_job_here;
+      }
+    }
+  }
+}
+
+void MyDriverFunc(const Eigen::GpuDevice &d) {
+  // use heuristics
+  CudaLaunchConfig cfg1 = GetCudaLaunchConfig(10240, d);
+  MyKernel1D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg1, other_args...);
+  Cuda2DLaunchConfig cfg2 = GetCuda2DLaunchConfig(10240, 10240, d);
+  MyKernel2D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg2, other_args...);
+  Cuda3DLaunchConfig cfg3 = GetCuda3DLaunchConfig(4096, 4096, 100, d);
+  MyKernel3D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg3, other_args...);
+
+  // maximize occupancy
+  CudaLaunchConfig cfg4 = GetCudaLaunchConfig(10240, d, MyKernel1D, 0, 0 );
+  MyKernel1D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg4, other_args...);
+  Cuda2DLaunchConfig cfg5 = GetCuda2DLaunchConfig(10240, 10240, d,
+                                                  MyKernel1D, 0, 0);
+  MyKernel2D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg5, other_args...);
+  Cuda3DLaunchConfig cfg6 = GetCuda3DLaunchConfig(4096, 4096, 100, d,
+                                                  MyKernel1D, 0, 0);
+  MyKernel3D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg6, other_args...);
+}
+
+// See the test for this for more example:
+//
+https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+
+*/
+
+namespace tensorflow {
+
+inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+
+struct CudaLaunchConfig {
+  // Logical number of thread that works on the elements. If each logical
+  // thread works on exactly a single element, this is the same as the working
+  // element count.
+  int virtual_thread_count = -1;
+  // Number of threads per block.
+  int thread_per_block = -1;
+  // Number of blocks for Cuda kernel launch.
+  int block_count = -1;
+};
+
+// Calculate the Cuda launch config we should use for a kernel launch.
+// This is assuming the kernel is quite simple and will largely be
+// memory-limited.
+// REQUIRES: work_element_count > 0.
+inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
+                                            const Eigen::GpuDevice& d) {
+  CHECK_GT(work_element_count, 0);
+  CudaLaunchConfig config;
+  const int virtual_thread_count = work_element_count;
+  const int physical_thread_count = std::min(
+      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
+      virtual_thread_count);
+  const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
+  const int block_count =
+      std::min(DivUp(physical_thread_count, thread_per_block),
+               d.getNumCudaMultiProcessors());
+
+  config.virtual_thread_count = virtual_thread_count;
+  config.thread_per_block = thread_per_block;
+  config.block_count = block_count;
+  return config;
+}
+
+// Calculate the Cuda launch config we should use for a kernel launch. This
+// variant takes the resource limits of func into account to maximize occupancy.
+// REQUIRES: work_element_count > 0.
+template <typename DeviceFunc>
+inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
+                                            const Eigen::GpuDevice& d,
+                                            DeviceFunc func,
+                                            size_t dynamic_shared_memory_size,
+                                            int block_size_limit) {
+  CHECK_GT(work_element_count, 0);
+  CudaLaunchConfig config;
+  int block_count = 0;
+  int thread_per_block = 0;
+
+  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
+      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
+      block_size_limit);
+  CHECK_EQ(err, cudaSuccess);
+
+  block_count =
+      std::min(block_count, DivUp(work_element_count, thread_per_block));
+
+  config.virtual_thread_count = work_element_count;
+  config.thread_per_block = thread_per_block;
+  config.block_count = block_count;
+  return config;
+}
+
+struct Cuda2DLaunchConfig {
+  dim3 virtual_thread_count = dim3(0, 0, 0);
+  dim3 thread_per_block = dim3(0, 0, 0);
+  dim3 block_count = dim3(0, 0, 0);
+};
+
+inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
+                                                const Eigen::GpuDevice& d) {
+  Cuda2DLaunchConfig config;
+
+  if (xdim <= 0 || ydim <= 0) {
+    return config;
+  }
+
+  const int kThreadsPerBlock = 256;
+  int block_cols = std::min(xdim, kThreadsPerBlock);
+  // ok to round down here and just do more loops in the kernel
+  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
+
+  const int physical_thread_count =
+      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
+
+  const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
+
+  config.virtual_thread_count = dim3(xdim, ydim, 1);
+  config.thread_per_block = dim3(block_cols, block_rows, 1);
+
+  int grid_x = std::min(DivUp(xdim, block_cols), max_blocks);
+
+  config.block_count = dim3(
+      grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1);
+  return config;
+}
+
+// Calculate the Cuda 2D and 3D launch config we should use for a kernel launch.
+// This variant takes the resource limits of func into account to maximize
+// occupancy.
+using Cuda3DLaunchConfig = Cuda2DLaunchConfig;
+
+template <typename DeviceFunc>
+inline Cuda3DLaunchConfig GetCuda3DLaunchConfig(
+    int xdim, int ydim, int zdim, const Eigen::GpuDevice& d, DeviceFunc func,
+    size_t dynamic_shared_memory_size, int block_size_limit) {
+  Cuda3DLaunchConfig config;
+
+  if (xdim <= 0 || ydim <= 0 || zdim <= 0) {
+    return config;
+  }
+
+  int dev;
+  cudaGetDevice(&dev);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, dev);
+  int xthreadlimit = deviceProp.maxThreadsDim[0];
+  int ythreadlimit = deviceProp.maxThreadsDim[1];
+  int zthreadlimit = deviceProp.maxThreadsDim[2];
+  int xgridlimit = deviceProp.maxGridSize[0];
+  int ygridlimit = deviceProp.maxGridSize[1];
+  int zgridlimit = deviceProp.maxGridSize[2];
+
+  int block_count = 0;
+  int thread_per_block = 0;
+  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
+      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
+      block_size_limit);
+  CHECK_EQ(err, cudaSuccess);
+
+  auto min3 = [](int a, int b, int c) { return std::min(a, std::min(b, c)); };
+
+  int threadsx = min3(xdim, thread_per_block, xthreadlimit);
+  int threadsy =
+      min3(ydim, std::max(thread_per_block / threadsx, 1), ythreadlimit);
+  int threadsz =
+      min3(zdim, std::max(thread_per_block / (threadsx * threadsy), 1),
+           zthreadlimit);
+
+  int blocksx = min3(block_count, DivUp(xdim, threadsx), xgridlimit);
+  int blocksy =
+      min3(DivUp(block_count, blocksx), DivUp(ydim, threadsy), ygridlimit);
+  int blocksz = min3(DivUp(block_count, (blocksx * blocksy)),
+                     DivUp(zdim, threadsz), zgridlimit);
+
+  config.virtual_thread_count = dim3(xdim, ydim, zdim);
+  config.thread_per_block = dim3(threadsx, threadsy, threadsz);
+  config.block_count = dim3(blocksx, blocksy, blocksz);
+  return config;
+}
+
+template <typename DeviceFunc>
+inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(
+    int xdim, int ydim, const Eigen::GpuDevice& d, DeviceFunc func,
+    size_t dynamic_shared_memory_size, int block_size_limit) {
+  return GetCuda3DLaunchConfig(xdim, ydim, 1, d, func,
+                               dynamic_shared_memory_size, block_size_limit);
+}
+
+// Returns a raw reference to the current cuda stream.  Required by a
+// number of kernel calls (for which StreamInterface* does not work), i.e.
+// CUB and certain cublas primitives.
+inline const cudaStream_t& GetCudaStream(OpKernelContext* context) {
+  const cudaStream_t* ptr = CHECK_NOTNULL(
+      reinterpret_cast<const cudaStream_t*>(context->op_device_context()
+                                                ->stream()
+                                                ->implementation()
+                                                ->CudaStreamMemberHack()));
+  return *ptr;
+}
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
-- 
GitLab


From c5c2eab10bdeb48c864ccb364cf08b4820a8c031 Mon Sep 17 00:00:00 2001
From: Keiji Ariyama <keiji@users.noreply.github.com>
Date: Sat, 27 Jan 2018 01:33:29 +0900
Subject: [PATCH 229/966] Fixing the url for the pip3. (#16447)

---
 tensorflow/docs_src/install/install_mac.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index e13ddadab7..555a6837d8 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -115,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -238,7 +238,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
-- 
GitLab


From 76f70f5d62f35b5cc95121e6dfffa63a8214b626 Mon Sep 17 00:00:00 2001
From: Santiago Castro <bryant@montevideo.com.uy>
Date: Fri, 26 Jan 2018 13:39:07 -0300
Subject: [PATCH 230/966] Update contrib/HVX readme (#16131)

* Update contrib/HVX readme

* Fix PR comments
---
 tensorflow/contrib/hvx/README.md              | 137 ++++++++++--------
 .../contrib/makefile/build_all_android.sh     |   4 +-
 .../build_and_run_inception_hexagon.sh        |   4 +-
 3 files changed, 84 insertions(+), 61 deletions(-)

diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md
index 5a6f2f3086..cb3a1087de 100644
--- a/tensorflow/contrib/hvx/README.md
+++ b/tensorflow/contrib/hvx/README.md
@@ -1,60 +1,67 @@
 # TensorFlow Runtime with HVX Acceleration
 
-## Description
+This README explain how to build and use the TensorFlow runtime with HVX Acceleration. HVX is an extension of Hexagon, a DSP provided by Qualcomm, which can compute vector calculations faster using less energy than ARM processors.
 
-This README explain how to build and use the TensorFlow Runtime with HVX Acceleration. HVX is an extension of Hexagon which is a DSP provided by qualcomm which can compute vector calculations faster using lower energy than ARM processors.
+## Dependencies
+
+* [Android SDK](https://developer.android.com/studio/index.html).
+* [Android NDK](https://developer.android.com/ndk/index.html). Save the path in `${NDK_ROOT}`.
+* A rooted Qualcomm-based Android device connected to the computer (preferably, a [Snapdragon Development Board](https://developer.qualcomm.com/hardware/additional-snapdragon), but it could be a rooted phone with a Qualcomm SoC, albeit this guide may not work with it). The device needs to be rooted for development and testing purposes, and shouldn't be needed in production. See [Behold, The Snapdragon MDP](https://developer.qualcomm.com/blog/behold-snapdragon-mdp) for more information.
+* [Hexagon SDK v3.0](https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools). Save the path in `${QUALCOMM_SDK}`.
+* The current directory should be TensorFlow source code (`git clone https://github.com/tensorflow/tensorflow.git && cd tensorflow`), and saved into `${TF_ROOT_DIR}`.
+
+You may also need to add a test signature in the device to run HVX-based binaries. Follow the instructions in `${QUALCOMM_SDK}/docs/Tools_Signing.html`, using Python 2.
+
+Note that if the device is not rooted, you may not be able to get the serial number, push the test signature and/or run binary files that call HVX libraries.
 
 ## Quick Start Guide
 
-We provides several tools to build and run inference with this runtime quickly.
+We provide several tools to build and run inference with this runtime quickly.
 
-#### All-in-one script to run inception model with prebuild hexagon library
-If you don’t need to build your own implementation of hexagon HVX, we provide a shortcut to execute graphs by using pre-compiled binaries.
+### Run inception model with a prebuilt Hexagon library
 
+If you don’t need to build your own implementation of Hexagon HVX, we provide a shortcut to execute graphs by using pre-compiled binaries.
+
+```shell
+./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh -p
 ```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-NDK_ROOT="/path/to/ndk" ./tensorflow/contrib/makefile/build_all_android.sh -X
-```
-(-X downloads dependencies to hexagon HVX and graphs, and copy all dependencies to android and execute a test)
 
-#### All-in-one script to run inception model by building entire libraries from source code
- If you want to build your own implementation of hexagon HVX, we provide a sample all-in-one script to execute graphs which downloads source and build everything for hexagon.
+The `-p` option makes the script download dependencies (i.e., Hexagon HVX binaries and graphs models), copy them to the Android device and execute a test.
 
-```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-QUALCOMM_SDK="/path/to/qualcomm/sdk" NDK_ROOT="/path/to/ndk" ./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
+### Run inception model by building all from the source code
+
+If you want to build your own implementation of Hexagon HVX, we provide a sample all-in-one script to execute graphs which downloads the source and builds everything that's necessary.
+
+```shell
+./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
 ```
 
 ## Building libraries
 
 If you've finished walking through the quick start guide, you may want to try building each binary manually.
 
-#### Build libhexagon_nn_skel.so
-Download hexagon nn library from codeaurora.org and build it.
+### Build libhexagon\_nn\_skel.so
 
-```
+Download Hexagon NN library from codeaurora.org and build it.
+
+```shell
 git clone https://source.codeaurora.org/quic/hexagon_nn/nnlib
 cd nnlib
 ```
 
-(Just follow instructions in README.HOW_TO_BUILD. You can find libhexagon_nn_skel.so in hexagon_Release_dynamic_toolv72_v60/ship)
-Then copy the generated binary to GEN_LIBS_DIR
+Just follow the instructions in `README.HOW_TO_BUILD`. You can find the file `libhexagon_nn_skel.so` in `hexagon_Release_dynamic_toolv72_v60/ship`.
+Then copy the generated binary to `${GEN_LIBS_DIR}`.
 
-```
+```shell
 GEN_LIBS_DIR="/path/to/a/dir/to/store/hexagon/libraries"
 cp -v "hexagon_Release_dynamic_toolv72_v60/ship/libhexagon_nn_skel.so" "${GEN_LIBS_DIR}"
 ```
 
-#### Build libhexagon_controller.so
+### Build libhexagon\_controller.so
+
 Download tensorflow and build hexagon controller.
 
-```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-TF_ROOT_DIR="$(pwd)"
-QUALCOMM_SDK="/path/to/qualcomm/sdk"
+```shell
 GENERATED_NNLIB_DIRECTORY="/path/to/nnlib"
 GENERATED_HEXAGON_CONTROLLER_DIRECTORY="${QUALCOMM_SDK}/examples/common/generated_hexagon_controller"
 rm -rf "${GENERATED_HEXAGON_CONTROLLER_DIRECTORY}"
@@ -70,12 +77,12 @@ make tree VERBOSE=1 V=android_Release
 cp -v "${GENERATED_HEXAGON_CONTROLLER_DIRECTORY}/android_Release/ship/libhexagon_controller.so" "${GEN_LIBS_DIR}"
 ```
 
-#### Build tensorflow linking hexagon library
-Build tensorflow with the build_all_android.sh with specifying -x option.
+### Build TensorFlow linking Hexagon library
 
-```
+Build TensorFlow with `build_all_android.sh` specifying the `-x` option.
+
+```shell
 BUILD_ALL_ANDROID_PATH="${TF_ROOT_DIR}/tensorflow/contrib/makefile/build_all_android.sh"
-NDK_ROOT="/path/to/ndk/root"
 
 CC_PREFIX=${CC_PREFIX} NDK_ROOT=${NDK_ROOT} "${BUILD_ALL_ANDROID_PATH}" \
 -x "${GEN_LIBS_DIR}" \
@@ -83,11 +90,11 @@ CC_PREFIX=${CC_PREFIX} NDK_ROOT=${NDK_ROOT} "${BUILD_ALL_ANDROID_PATH}" \
 -t hexagon_graph_execution
 ```
 
-#### Push binaries to your Android device
+### Push binaries to your Android device
 
 Before running tests on your Android device, you need to push several binaries to it.
 
-```
+```shell
 adb push "${GEN_LIBS_DIR}/libhexagon_controller.so" "/data/local/tmp"
 adb push "${GEN_LIBS_DIR}/libhexagon_nn_skel.so" "/vendor/lib/rfsa/adsp"
 adb push -p \
@@ -100,40 +107,54 @@ adb shell chmod "${ANDROID_EXEC_FILE_MODE}" \
 adb wait-for-device
 ```
 
-#### Run tests on the device
+### Run tests on the device
 
 Finally, you can run the inference tests on your device.
 
-```
+```shell
 adb shell 'LD_LIBRARY_PATH=/data/local/tmp:$LD_LIBRARY_PATH' \
 "/data/local/tmp/hexagon_graph_execution"
 ```
 
-#### Troubleshooting
-If you're using the Open-Q 820 Snapdragon development kit, you may run into an issue with running the executable due to a missing testsig library. From the Hexagon SDK documentation: *Dynamic shared objects are required to be digitally signed and then authenticated at runtime before they are allowed to be loaded and executed.* Generating a testsig library is necessary to run the unsigned sample library built from this project.
+### Troubleshooting
+
+#### Testsig issue
+
+If you're using the Open-Q 820 Snapdragon Development Kit, you may run into an issue with running the executable due to a missing `testsig` library. From the Hexagon SDK documentation: *Dynamic shared objects are required to be digitally signed and then authenticated at runtime before they are allowed to be loaded and executed.* Generating a testsig library is necessary to run the unsigned sample library built from this project.
 
-If the lack of a testsig library is your problem, you will see errors of the type:
+If the lack of a `testsig` library is your problem, you will see errors of the type:
 `vendor/qcom/proprietary/adsprpc/src/fastrpc_apps_user.c:169::error: -1: 0 == (nErr = remotectl_open(name, (int*)ph, dlerrstr, sizeof(dlerrstr), &dlerr))`
-appearing in adb logcat.
-
-There are several ways to create the testsig library, the only prerequisite is Python and the correct version of the Hexagon-SDK. The following steps is one way to create this library:
-1. Run adb as root: `adb root`
-2. Run the command `adb shell cat /sys/devices/soc0/serial_number`
-3. Convert the decimal number you get as output to hex
-4. Run the python script: `python ${QUALCOMM_SDK}/tools/elfsigner/elfsigner.py -t $(SERIAL_NUMBER_HEX_VALUE)`
-5. The output of the python script is a shared library stored in ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so
-6. Push the shared library to your device:
+appearing in `adb logcat` or ["Expected: (version) >= (1), actual: 0 vs 1" while running a binary from adb](https://github.com/tensorflow/tensorflow/issues/11210).
+
+You need to add a test signature, as described at the beginning of this README. After rebooting your device, you should be able to run the sample application.
+
+#### Qualcomm SDK Linux installation fails with "Malformed \uxxxx encoding"
+
+The installation file is based on LaunchAnywhere, which fails in Linux if the `PS1` env variable contains non-common Unicode chars:
+
 ```
-adb root
-adb wait-for-device
-adb remount
-adb wait-for-device
-adb shell mkdir /system/lib/rfsa
-adb shell mkdir /system/lib/rfsa/adsp
-adb push ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so /system/lib/rfsa/adsp/
+Preparing to install...
+Extracting the JRE from the installer archive...
+Unpacking the JRE...
+Extracting the installation resources from the installer archive...
+Configuring the installer for this system's environment...
+
+Launching installer...
+
+An internal LaunchAnywhere application error has occured and this application cannot proceed. (LAX)
+
+Stack Trace:
+java.lang.IllegalArgumentException: Malformed \uxxxx encoding.
+	at java.util.Properties.loadConvert(Properties.java:574)
+	at java.util.Properties.load0(Properties.java:391)
+	at java.util.Properties.load(Properties.java:317)
+	at com.zerog.common.java.util.PropertiesUtil.loadProperties(Unknown Source)
+	at com.zerog.lax.LAX.<init>(Unknown Source)
+	at com.zerog.lax.LAX.main(Unknown Source)
 ```
 
-After rebooting your device, you should be able to run the sample application.
+It can be solved by temporarily assigning the `PS1` environment variable to something simple, such as '$'.
+
+## Maintainers
 
-Maintainers:
-- Satoshi Kataoka (satok@google.com, github.com/satok16)
+* Satoshi Kataoka (satok@google.com, github.com/satok16)
diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index 980a44a595..281c4653c6 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -18,7 +18,7 @@
 set -e
 
 usage() {
-  echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-Es:t:Tx:a:X]"
+  echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-Es:t:Tx:a]"
   echo "-E enable experimental hexnn ops"
   echo "-s [sub_makefiles] sub makefiles separated by white space"
   echo "-t [build_target] build target for Android makefile [default=all]"
@@ -37,7 +37,7 @@ fi
 
 ARCH=armeabi-v7a
 
-while getopts "Es:t:Tx:a:" opt_name; do
+while getopts "Es:t:Tx:a" opt_name; do
   case "$opt_name" in
     E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";;
     s) SUB_MAKEFILES="${OPTARG}";;
diff --git a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
index 861bb885c7..203ff4f890 100755
--- a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
+++ b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
@@ -76,6 +76,8 @@ GEN_LIBS_DIR="${GEN_DIR}/libs"
 GEN_DOWNLOAD_DIR="${GEN_DIR}/downloads"
 URL_BASE="https://storage.googleapis.com/download.tensorflow.org"
 
+ARCH="armeabi-v7a"
+
 source "${SCRIPT_DIR}/../build_helper.subr"
 
 rm -rf "${GEN_DIR}"
@@ -219,7 +221,7 @@ if [[ "${BUILD_ONLY}" != "true" ]]; then
     adb push "${GEN_LIBS_DIR}/libhexagon_nn_skel.so" "/vendor/lib/rfsa/adsp"
 
     adb push -p \
-        "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/bin/hexagon_graph_execution" \
+        "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/bin/android_${ARCH}/hexagon_graph_execution" \
         "/data/local/tmp/"
     adb wait-for-device
     adb shell chmod "${ANDROID_EXEC_FILE_MODE}" \
-- 
GitLab


From 78021a9a70923f1fdaa65b41271ad0ea70cd7e67 Mon Sep 17 00:00:00 2001
From: namrata-ibm <bhavenamrata@gmail.com>
Date: Fri, 26 Jan 2018 22:10:04 +0530
Subject: [PATCH 231/966] Decoding contents of BMP file on big endian (#16145)

* Decoding contents of BMP file on big endian

* Updated as per review comments

* Update decode_bmp_op.cc

Corrected function name
---
 tensorflow/core/kernels/decode_bmp_op.cc | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc
index c778278e8f..b7d120a617 100644
--- a/tensorflow/core/kernels/decode_bmp_op.cc
+++ b/tensorflow/core/kernels/decode_bmp_op.cc
@@ -39,6 +39,13 @@ class DecodeBmpOp : public OpKernel {
         errors::InvalidArgument("channels must be 0, 1, 3 or 4, got ",
                                 channels_));
   }
+  inline int32 ByteSwapInt32ForBigEndian(int32 x) {
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return le32toh(x);
+#else
+    return x;
+#endif
+  }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& contents = context->input(0);
@@ -56,14 +63,18 @@ class DecodeBmpOp : public OpKernel {
                                         input.size(), " bytes"));
 
     const uint8* img_bytes = reinterpret_cast<const uint8*>(input.data());
-    const int32 header_size = internal::SubtleMustCopy(
+    int32 header_size_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 10)));
-    const int32 width = internal::SubtleMustCopy(
+    const int32 header_size = ByteSwapInt32ForBigEndian(header_size_);
+    int32 width_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 18)));
-    const int32 height = internal::SubtleMustCopy(
+    const int32 width = ByteSwapInt32ForBigEndian(width_);
+    int32 height_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 22)));
-    const int32 bpp = internal::SubtleMustCopy(
+    const int32 height = ByteSwapInt32ForBigEndian(height_);
+    int32 bpp_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 28)));
+    const int32 bpp = ByteSwapInt32ForBigEndian(bpp_);
 
     if (channels_) {
       OP_REQUIRES(context, (channels_ == bpp / 8),
-- 
GitLab


From 9efe775925ab61e938414e91cd3cf48f0c0efae2 Mon Sep 17 00:00:00 2001
From: namrata-ibm <bhavenamrata@gmail.com>
Date: Fri, 26 Jan 2018 22:18:26 +0530
Subject: [PATCH 232/966] Compare_and_bitpack function for bool for big endian
 (#16398)

---
 .../core/kernels/compare_and_bitpack_op.cc       | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.cc b/tensorflow/core/kernels/compare_and_bitpack_op.cc
index 9f626a274a..39e4f24ed5 100644
--- a/tensorflow/core/kernels/compare_and_bitpack_op.cc
+++ b/tensorflow/core/kernels/compare_and_bitpack_op.cc
@@ -110,7 +110,20 @@ struct ComputeShard<T,
       typename TTypes<bool>::ConstMatrix input,
       typename TTypes<uint8>::Matrix output, bool /*thresh*/, int64 start,
       int64 limit) {
-    // NOTE(ebrevdo): This assumes memory is little-endian.
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    for (int64 i = start; i < limit; ++i) {
+      uint8* out = output.data() + i;
+      const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i);
+      *out =
+          ((((block & (1LL << (7 * 8))) >> (7 * 8 - 7))) |
+           (((block & (1LL << (6 * 8))) >> (6 * 8 - 6))) |
+           (((block & (1LL << (5 * 8))) >> (5 * 8 - 5))) |
+           (((block & (1LL << (4 * 8))) >> (4 * 8 - 4))) |
+           (((block & (1LL << (3 * 8))) >> (3 * 8 - 3))) |
+           (((block & (1LL << (2 * 8))) >> (2 * 8 - 2))) |
+           (((block & (1LL << 8)) >> (1 * 8 - 1))) | (((block & (1LL)))));
+    }
+#else
     for (int64 i = start; i < limit; ++i) {
       uint8* out = output.data() + i;
       const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i);
@@ -123,6 +136,7 @@ struct ComputeShard<T,
            (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) |
            (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7)));
     }
+#endif
   }
 };
 
-- 
GitLab


From 7fc61bfb50aac4e2d0ff9dab9d99a6001aa5cccf Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Fri, 26 Jan 2018 08:57:10 -0800
Subject: [PATCH 233/966] Change `reduce_logsumexp` to internally use `reshape`
 rather than `squeeze` since the latter requires the `axis` arg to be a Python
 `list`.

PiperOrigin-RevId: 183396533
---
 tensorflow/python/ops/math_ops.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 9ad1031354..827e3caa36 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1841,12 +1841,11 @@ def reduce_logsumexp(input_tensor,
         reduce_sum(
             gen_math_ops.exp(input_tensor - my_max),
             axis,
-            keepdims=True,
-            reduction_indices=reduction_indices)) + my_max
+            keepdims=keepdims,
+            reduction_indices=reduction_indices))
     if not keepdims:
-      if isinstance(axis, int):
-        axis = [axis]
-      result = array_ops.squeeze(result, axis)
+      my_max = array_ops.reshape(my_max, array_ops.shape(result))
+    result += my_max
     return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result)
 
 
-- 
GitLab


From c4ace4e2abf6f19f34357e53ba4aebce5113af01 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 09:03:49 -0800
Subject: [PATCH 234/966] Kernel utils to support broadcast add and mul.

PiperOrigin-RevId: 183397494
---
 tensorflow/contrib/lite/kernels/BUILD         |  29 +++-
 .../contrib/lite/kernels/kernel_util.cc       |  26 +++
 tensorflow/contrib/lite/kernels/kernel_util.h |  17 ++
 .../contrib/lite/kernels/kernel_util_test.cc  | 150 ++++++++++++++++++
 4 files changed, 220 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/kernel_util_test.cc

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 4195e7553c..b5428d3246 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -71,6 +71,32 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "kernel_util",
+    srcs = [
+        "kernel_util.cc",
+    ],
+    hdrs = [
+        "kernel_util.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:context",
+        "//tensorflow/contrib/lite/kernels/internal:round",
+    ],
+)
+
+tf_cc_test(
+    name = "kernel_util_test",
+    size = "small",
+    srcs = ["kernel_util_test.cc"],
+    deps = [
+        ":kernel_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "builtin_ops",
     srcs = [
@@ -87,7 +113,6 @@ cc_library(
         "fully_connected.cc",
         "gather.cc",
         "hashtable_lookup.cc",
-        "kernel_util.cc",
         "l2norm.cc",
         "local_response_norm.cc",
         "lsh_projection.cc",
@@ -111,7 +136,6 @@ cc_library(
         "unidirectional_sequence_rnn.cc",
     ],
     hdrs = [
-        "kernel_util.h",
         "padding.h",
         "register.h",
     ],
@@ -125,6 +149,7 @@ cc_library(
     }),
     deps = [
         ":activation_functor",
+        ":kernel_util",
         ":op_macros",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc
index b0546c00cf..955e8c5764 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.cc
+++ b/tensorflow/contrib/lite/kernels/kernel_util.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
 #include <algorithm>
 #include <cmath>
+#include <memory>
+
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 
 namespace tflite {
@@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
   }
 }
 
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) {
+  return TfLiteIntArrayEqual(input1->dims, input2->dims);
+}
+
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape) {
+  int64_t dims1 = NumDimensions(input1);
+  int64_t dims2 = NumDimensions(input2);
+  int64_t out_dims = std::max(dims1, dims2);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
+  for (int i = 0; i < out_dims; ++i) {
+    int64_t d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
+    int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
+    TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1);
+    shape->data[out_dims - i - 1] = std::max(d1, d2);
+  }
+  *output_shape = shape.release();
+  return kTfLiteOk;
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
index bfdfba00f5..3cfa72615a 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
 inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
 inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
 
+inline int64_t NumElements(const TfLiteTensor* t) {
+  int64_t count = 1;
+  for (int i = 0; i < NumDimensions(t); ++i) {
+    count *= SizeOfDimension(t, i);
+  }
+  return count;
+}
+
 inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
                                             const TfLiteNode* node, int index) {
   const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
@@ -76,6 +84,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
                                    float* activation_min,
                                    float* activation_max);
 
+// Return true if the given tensors have the same shape.
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2);
+
+// Calculate the output_shape that is necessary for element-wise operations
+// with broadcasting involving the two input tensors.
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape);
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/kernel_util_test.cc b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
new file mode 100644
index 0000000000..63a317f338
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
@@ -0,0 +1,150 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace {
+
+void ReportError(TfLiteContext* context, const char* format, ...) {}
+
+class KernelUtilTest : public ::testing::Test {
+ public:
+  KernelUtilTest() {
+    context_.ReportError = ReportError;
+
+    tensor1_.dims = nullptr;
+    tensor2_.dims = nullptr;
+  }
+  ~KernelUtilTest() {
+    TfLiteTensorFree(&tensor1_);
+    TfLiteTensorFree(&tensor2_);
+  }
+
+  void SetShape(TfLiteTensor* tensor, std::initializer_list<int> dims) {
+    TfLiteTensorFree(tensor);
+    tensor->dims = TfLiteIntArrayCreate(dims.size());
+    int i = 0;
+    for (int d : dims) {
+      tensor->dims->data[i] = d;
+      ++i;
+    }
+  }
+
+  std::vector<int> GetShape(TfLiteIntArray* dims) {
+    std::vector<int> result;
+    for (int i = 0; i < dims->size; ++i) {
+      result.push_back(dims->data[i]);
+    }
+    return result;
+  }
+
+ protected:
+  TfLiteContext context_;
+  TfLiteTensor tensor1_;
+  TfLiteTensor tensor2_;
+};
+
+TEST_F(KernelUtilTest, SameShapeEmpty) {
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor1_, {1, 2, 3});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2, 3, 4});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2, 3});
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor1_, {});
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {1, 3});
+  EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_EQ(output, nullptr);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeOnes) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 1});
+  SetShape(&tensor2_, {1, 3});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {1, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  TfLiteIntArrayFree(output);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeScalars) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {});
+  SetShape(&tensor2_, {2});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2));
+  TfLiteIntArrayFree(output);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {3, 1, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {1, 2, 3, 4});
+  SetShape(&tensor2_, {1, 3, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4));
+  TfLiteIntArrayFree(output);
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
-- 
GitLab


From b773acad69605a2afdc0d7a08923484eb5e7f17a Mon Sep 17 00:00:00 2001
From: Sergii Khomenko <x-sam@brainscode.com>
Date: Fri, 26 Jan 2018 18:25:36 +0100
Subject: [PATCH 235/966] Fix a couple minor typos (#16459)

---
 tensorflow/python/data/util/nest.py   | 4 ++--
 tensorflow/python/data/util/sparse.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index 2455395635..df5498be5f 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -266,7 +266,7 @@ def map_structure(func, *structure, **check_types_dict):
   and the return value will contain the results in the same structure.
 
   Args:
-    func: A callable that acceps as many arguments are there are structures.
+    func: A callable that accepts as many arguments are there are structures.
     *structure: scalar, or tuple or list of constructed scalars and/or other
       tuples/lists, or scalars.  Note: numpy arrays are considered scalars.
     **check_types_dict: only valid keyword argument is `check_types`. If set to
@@ -479,7 +479,7 @@ def map_structure_up_to(shallow_tree, func, *inputs):
   The `inputs`, can be thought of as having the same structure as
   `shallow_tree`, but with leaf nodes that are themselves tree structures.
 
-  This function therefore will return something with the same base structure as
+  This function, therefore, will return something with the same base structure as
   `shallow_tree`.
 
   Examples:
diff --git a/tensorflow/python/data/util/sparse.py b/tensorflow/python/data/util/sparse.py
index 5ebcb4ea81..5e6d224709 100644
--- a/tensorflow/python/data/util/sparse.py
+++ b/tensorflow/python/data/util/sparse.py
@@ -141,7 +141,7 @@ def serialize_sparse_tensors(tensors):
     tensors: a tensor structure to serialize.
 
   Returns:
-    `tensors` with any sparse tensors replaced by the their serialized version.
+    `tensors` with any sparse tensors replaced by their serialized version.
   """
 
   ret = nest.pack_sequence_as(tensors, [
-- 
GitLab


From c1338b14149b6313280bea455ec1dec2a336bd31 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 09:47:58 -0800
Subject: [PATCH 236/966] Updating sparsify_gather.

PiperOrigin-RevId: 183402917
---
 .../tools/graph_transforms/sparsify_gather.cc | 109 ++++++++++++------
 .../graph_transforms/sparsify_gather_test.cc  |  40 ++++++-
 2 files changed, 110 insertions(+), 39 deletions(-)

diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 96324d0dea..593c654f9f 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <cmath>
 #include <memory>
+#include <unordered_map>
 
 #include "tensorflow/c/checkpoint_reader.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,9 +29,10 @@ limitations under the License.
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
-using strings::StrCat;
 using str_util::Join;
 using str_util::Split;
+using str_util::StringReplace;
+using strings::StrCat;
 
 namespace graph_transforms {
 
@@ -89,7 +91,7 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
                          string* shape_slice_string) {
   string restore_node_name;
   for (const auto& node : input_graph_def.node()) {
-    std::vector<string> node_name_parts = str_util::Split(node.name(), "/");
+    std::vector<string> node_name_parts = Split(node.name(), "/");
     if (node_name_parts.size() == 2 &&
         StringPiece(node_name_parts[0]).starts_with("save") &&
         StringPiece(node_name_parts[1]).starts_with("Assign") &&
@@ -119,13 +121,13 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
 }
 
 string GetMonolithicTensorKey(const string& tensor_slice_name) {
-  std::vector<string> names = str_util::Split(tensor_slice_name, "/");
+  std::vector<string> names = Split(tensor_slice_name, "/");
   CHECK_GE(names.size(), 2);
   CHECK(StringPiece(names[names.size() - 1]).starts_with("part_"));
 
   // Remove the "part_x" suffix
   names.pop_back();
-  return str_util::Join(names, "/");
+  return Join(names, "/");
 }
 
 Status ReadTensorFromCheckpoint(
@@ -193,6 +195,15 @@ Status SparsifyGatherInternal(
   GraphDef current_graph_def = input_graph_def;
   bool any_match_found = false;
 
+  // Populate references.
+  std::unordered_map<string, int> refs;
+  for (const auto& node : current_graph_def.node()) {
+    for (const auto& input : node.input()) {
+      auto parsed_input = StringReplace(input, "^", "", true);
+      refs[parsed_input] += 1;
+    }
+  }
+
   // The subgraphs may have overlapping components, therefore GraphMatcher
   // doesn't return all subgraphs in one round -- this has to be multi-round
   // update.
@@ -200,15 +211,15 @@ Status SparsifyGatherInternal(
     any_match_found = false;
     GraphDef replaced_graph_def = current_graph_def;
     std::vector<string> init_table_node_names;
-    std::vector<string> removed_variable_names;
+    std::vector<string> removed_node_names;
 
     TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
         current_graph_def, pattern,
         [&ckpt_reader, &any_match_found, &init_table_node_names,
-         &shapes_and_slices, &removed_variable_names](
-            const NodeMatch& match, const std::set<string>& input_nodes,
-            const std::set<string>& output_nodes,
-            std::vector<NodeDef>* new_nodes) {
+         &shapes_and_slices, &removed_node_names,
+         &refs](const NodeMatch& match, const std::set<string>& input_nodes,
+                const std::set<string>& output_nodes,
+                std::vector<NodeDef>* new_nodes) {
           any_match_found = true;
 
           // The captured subgraph should be of the following pattern:
@@ -291,8 +302,12 @@ Status SparsifyGatherInternal(
                 weights_node.name(), ckpt_reader,
                 (*shapes_and_slices)[weights_node.name()], &weight));
             // Add both both weight and identity node names.
-            removed_variable_names.push_back(weights_node.name());
-            removed_variable_names.push_back(match.inputs[0].node.name());
+            removed_node_names.push_back(weights_node.name());
+            removed_node_names.push_back(match.inputs[0].node.name());
+            for (auto input_node : match.inputs[0].node.input()) {
+              auto parsed_input = StringReplace(input_node, "^", "", true);
+              refs[parsed_input]--;
+            }
           }
           Tensor indices_tensor;
           Tensor values_tensor;
@@ -362,15 +377,23 @@ Status SparsifyGatherInternal(
 
           // Connect nodes
           AddNodeInput(hashtable_node.name(), &init_table_node);
+          refs[hashtable_node.name()]++;
           AddNodeInput(indices_node.name(), &init_table_node);
+          refs[indices_node.name()]++;
           AddNodeInput(values_node.name(), &init_table_node);
+          refs[values_node.name()]++;
 
           AddNodeInput(hashtable_node.name(), &lookup_node);
+          refs[hashtable_node.name()]++;
           AddNodeInput(gather_node.input(1), &lookup_node);
+          refs[gather_node.input(1)]++;
           AddNodeInput(default_value_node.name(), &lookup_node);
+          refs[default_value_node.name()]++;
 
           AddNodeInput(lookup_node.name(), &expand_dims_node);
+          refs[lookup_node.name()]++;
           AddNodeInput(dim_idx_node.name(), &expand_dims_node);
+          refs[dim_idx_node.name()]++;
 
           // Copy 'ids' input of original 'Gather'
           new_nodes->push_back(match.inputs[1].node);
@@ -404,22 +427,44 @@ Status SparsifyGatherInternal(
     for (const string& name : init_table_node_names) {
       // Add control dependence from init_table_node to group_deps_node
       AddNodeInput(StrCat("^", name), init_op);
+      refs[name]++;
+    }
+
+    // Erase inputs and outputs as they are not considered for deletion.
+    for (const auto& output : context.output_names) {
+      refs.erase(output);
+    }
+
+    for (const auto& input : context.input_names) {
+      refs.erase(input);
     }
 
-    // Remove all dependencies associated with removed variables.
-    while (!removed_variable_names.empty()) {
-      auto name = removed_variable_names.back();
-      removed_variable_names.pop_back();
+    // Add nodes with a reference count of 0 for deletion.
+    for (auto entry : refs) {
+      if (entry.second == 0) {
+        removed_node_names.push_back(entry.first);
+      }
+    }
+
+    while (!removed_node_names.empty()) {
+      auto name = removed_node_names.back();
+      removed_node_names.pop_back();
+
       int i = 0;
       while (i < replaced_graph_def.node_size()) {
-        if (!replaced_graph_def.node(i).input_size()) {
-          if (replaced_graph_def.node(i).name() == name) {
-            replaced_graph_def.mutable_node()->SwapElements(
-                i, replaced_graph_def.node_size() - 1);
-            replaced_graph_def.mutable_node()->RemoveLast();
-            continue;
+        // Revisit this to see if we can safely remove RestoreV2 nodes.
+        if ((replaced_graph_def.node(i).name() == name) &&
+            (replaced_graph_def.node(i).op() != "RestoreV2")) {
+          for (const auto& input : replaced_graph_def.node(i).input()) {
+            auto parsed_input = StringReplace(input, "^", "", true);
+            refs[parsed_input] -= 1;
+            if (refs[parsed_input] == 0) {
+              removed_node_names.push_back(parsed_input);
+            }
           }
-          i++;
+          replaced_graph_def.mutable_node()->SwapElements(
+              i, replaced_graph_def.node_size() - 1);
+          replaced_graph_def.mutable_node()->RemoveLast();
           continue;
         }
         int j = 0;
@@ -433,18 +478,16 @@ Status SparsifyGatherInternal(
           }
           j++;
         }
-        if ((replaced_graph_def.node(i).input_size() == 0) ||
-            (replaced_graph_def.node(i).op() == "Assign" &&
-             replaced_graph_def.node(i).input_size() == 1)) {
-          removed_variable_names.push_back(replaced_graph_def.node(i).name());
-          if (replaced_graph_def.node(i).input_size() == 1) {
-            removed_variable_names.push_back(
-                replaced_graph_def.node(i).input(0));
+        if (!replaced_graph_def.node(i).input_size()) {
+          if ((refs.find(replaced_graph_def.node(i).name()) != refs.end()) &&
+              (refs[replaced_graph_def.node(i).name()] == 0)) {
+            removed_node_names.push_back(replaced_graph_def.node(i).name());
           }
-          replaced_graph_def.mutable_node()->SwapElements(
-              i, replaced_graph_def.node_size() - 1);
-          replaced_graph_def.mutable_node()->RemoveLast();
-          continue;
+        }
+
+        if (replaced_graph_def.node(i).op() == "Assign" &&
+            replaced_graph_def.node(i).input_size() == 1) {
+          removed_node_names.push_back(replaced_graph_def.node(i).name());
         }
         i++;
       }
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
index 000568a0cc..6627df1331 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
@@ -80,6 +80,8 @@ class SparsifyGatherTest : public ::testing::Test {
     // Build the graph.
     NodeDef* input_node = CreateNode("ids", "Const", {}, &graph_def);
     NodeDef* w_node;
+    NodeDef* zeros_const;
+    NodeDef* zeros_shape;
     NodeDef* zeros_node;
     NodeDef* assign_node;
 
@@ -92,8 +94,12 @@ class SparsifyGatherTest : public ::testing::Test {
     } else {
       w_node = CreateNode("w/part_1", "VariableV2", {}, &graph_def);
 
-      zeros_node =
-          CreateNode("w/part_1/Initializer/zeros", "Const", {}, &graph_def);
+      zeros_shape = CreateNode("w/part_1/Initializer/zeros/shape_as_tensor",
+                               "Const", {}, &graph_def);
+      zeros_const = CreateNode("w/part_1/Initializer/zeros/Const", "Const", {},
+                               &graph_def);
+      zeros_node = CreateNode("w/part_1/Initializer/zeros", "Fill",
+                              {zeros_shape, zeros_const}, &graph_def);
       assign_node = CreateNode("w/part_1/Assign", "Assign",
                                {w_node, zeros_node}, &graph_def);
 
@@ -151,6 +157,9 @@ class SparsifyGatherTest : public ::testing::Test {
     MapNamesToNodes(result, &node_lookup);
 
     // Check nodes.
+    EXPECT_EQ(0,
+              node_lookup.count("w/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w/part_1/Assign"));
 
@@ -247,7 +256,11 @@ class SparsifyGatherTest : public ::testing::Test {
     // Two partitions
     NodeDef* w_node1;
     NodeDef* w_node2;
+    NodeDef* zeros_const1;
+    NodeDef* zeros_shape1;
     NodeDef* zeros_node1;
+    NodeDef* zeros_const2;
+    NodeDef* zeros_shape2;
     NodeDef* zeros_node2;
     NodeDef* assign_node1;
     NodeDef* assign_node2;
@@ -261,8 +274,13 @@ class SparsifyGatherTest : public ::testing::Test {
       SetNodeTensorAttr<float>("value", weights, w_node2);
     } else {
       w_node1 = CreateNode("w1/part_1", "VariableV2", {}, &graph_def);
-      zeros_node1 =
-          CreateNode("w1/part_1/Initializer/zeros", "Const", {}, &graph_def);
+
+      zeros_shape1 = CreateNode("w1/part_1/Initializer/zeros/shape_as_tensor",
+                                "Const", {}, &graph_def);
+      zeros_const1 = CreateNode("w1/part_1/Initializer/zeros/Const", "Const",
+                                {}, &graph_def);
+      zeros_node1 = CreateNode("w1/part_1/Initializer/zeros", "Fill",
+                               {zeros_shape1, zeros_const1}, &graph_def);
       assign_node1 = CreateNode("w1/part_1/Assign", "Assign",
                                 {w_node1, zeros_node1}, &graph_def);
 
@@ -285,8 +303,12 @@ class SparsifyGatherTest : public ::testing::Test {
       CreateNode("save/Assign", "Assign", {w_node1, restore_node1}, &graph_def);
 
       w_node2 = CreateNode("w2/part_1", "VariableV2", {}, &graph_def);
-      zeros_node2 =
-          CreateNode("w2/part_1/Initializer/zeros", "Const", {}, &graph_def);
+      zeros_shape2 = CreateNode("w2/part_1/Initializer/zeros/shape_as_tensor",
+                                "Const", {}, &graph_def);
+      zeros_const2 = CreateNode("w2/part_1/Initializer/zeros/Const", "Const",
+                                {}, &graph_def);
+      zeros_node2 = CreateNode("w2/part_1/Initializer/zeros", "Fill",
+                               {zeros_shape2, zeros_const2}, &graph_def);
       assign_node2 = CreateNode("w2/part_1/Assign", "Assign",
                                 {w_node2, zeros_node2}, &graph_def);
 
@@ -350,8 +372,14 @@ class SparsifyGatherTest : public ::testing::Test {
     MapNamesToNodes(result, &node_lookup);
 
     // Check nodes.
+    EXPECT_EQ(0,
+              node_lookup.count("w1/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w1/part_1/Assign"));
+    EXPECT_EQ(0,
+              node_lookup.count("w2/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w2/part_1/Assign"));
     EXPECT_EQ(1, node_lookup.count("ids"));
-- 
GitLab


From 72e0eaa9c2e408ae720bff55d6f31a75d6accf29 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Fri, 26 Jan 2018 19:10:55 +0100
Subject: [PATCH 237/966] from six.moves import xrange (#16438)

---
 .../cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py     | 1 +
 .../contrib/eager/python/examples/resnet50/resnet50_test.py      | 1 +
 tensorflow/contrib/eager/python/examples/spinn/spinn_test.py     | 1 +
 tensorflow/python/client/session_benchmark.py                    | 1 +
 tensorflow/python/kernel_tests/conv_ops_test.py                  | 1 +
 tensorflow/python/kernel_tests/decode_jpeg_op_test.py            | 1 +
 tensorflow/python/kernel_tests/rnn_test.py                       | 1 +
 7 files changed, 7 insertions(+)

diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
index 4fc5ff1bd1..56c562a3ba 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import time
 
+from six.moves import xrange
 from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 from tensorflow.contrib.rnn.python.ops import lstm_ops
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index 76e06269b6..1f7beee685 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -22,6 +22,7 @@ import gc
 import tempfile
 import time
 
+from six.moves import xrange
 import tensorflow as tf
 
 import tensorflow.contrib.eager as tfe
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index 84e25cf81a..19b0104c80 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -26,6 +26,7 @@ import tempfile
 import time
 
 import numpy as np
+from six.moves import xrange
 import tensorflow as tf
 
 # pylint: disable=g-bad-import-order
diff --git a/tensorflow/python/client/session_benchmark.py b/tensorflow/python/client/session_benchmark.py
index 721bca91b7..06e9a09926 100644
--- a/tensorflow/python/client/session_benchmark.py
+++ b/tensorflow/python/client/session_benchmark.py
@@ -22,6 +22,7 @@ import time
 
 import numpy as np
 
+from six.moves import xrange
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 3e9bd3dade..c5446326ba 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -24,6 +24,7 @@ import time
 
 import numpy as np
 
+from six.moves import xrange
 from tensorflow.contrib import layers
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
diff --git a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
index ead55cd03b..89fd26c544 100644
--- a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 import time
 
+from six.moves import xrange
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 0c77d1db92..a86b65affe 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -23,6 +23,7 @@ import timeit
 
 import numpy as np
 
+from six.moves import xrange
 from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
-- 
GitLab


From 8bab27948aa53db37cb2bbbd89d0009ed2b90014 Mon Sep 17 00:00:00 2001
From: Jerome <jvgonzalvo@up.edu.ph>
Date: Sat, 27 Jan 2018 02:11:11 +0800
Subject: [PATCH 238/966]  Imported lstm1d and lstm2d in ndlstm __init__.py.
 (#16434)

* Added ctc_loss_dense_labels. This does the conversion of dense labels into sparse ones to be passed into the core ctc_loss function.

* Removed constant_op from the import.

* Matched ctc_loss_dense_labels with the other layers ops.

* Added ctc_loss_dense_labels to contrib.layers __init__.py file

* Added missing comma to list of ops.

* Reordred arguments for ctc_loss_dense_labels

Labels should be first then inputs for ctc_loss.

* Removed ctc_loss_dense_labels.

Replaced it with dense_to_sparse instead so that there'll be only one ctc_loss function.

* Replaced ctc_loss_dense_labels with dense_to_sparse

* Fixed dense_to_sparse. Some of the names of the variables did not match with that of the parameters.

* Updated documentation for dense_to_sparse since it can accept a tensor of any shape.

* Added test case for dense_to_sparse.

* Updated documentation. Dense to sparse accepts int tensors.

* Fixed testDenseFromConstantToSparse.

The sparse_to_dense order of arguments in the test are wrong and the expected constant should be of int64.

* Modified implementation of ndlstm_base_dynamic.

It now uses a BasicLSTMCell that has state_is_tuple=True to address deprecation. Right now it is still unknown why it was set to false in the first place.

* Imported lstm1d and lstm2d in ndlstm __init__.py.

Makes importing ndlstm modules easier.

* Added testGetBlocks in lstm2d_test.

* Removed testGetBlocks.py
---
 tensorflow/contrib/ndlstm/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/contrib/ndlstm/__init__.py b/tensorflow/contrib/ndlstm/__init__.py
index 52e83069cb..da89bb4ab6 100644
--- a/tensorflow/contrib/ndlstm/__init__.py
+++ b/tensorflow/contrib/ndlstm/__init__.py
@@ -16,3 +16,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+from tensorflow.contrib.ndlstm.python import lstm2d
+from tensorflow.contrib.ndlstm.python import lstm1d
-- 
GitLab


From d623b8240687944483ab2b7df870fd2f1435cec9 Mon Sep 17 00:00:00 2001
From: Shengpeng Liu <lspvic@qq.com>
Date: Sat, 27 Jan 2018 02:11:30 +0800
Subject: [PATCH 239/966] Make raw_rnn accept scalar or TensorArray values for
 state. (#16441)

---
 tensorflow/python/ops/rnn.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index a10e1963d1..e0052b8869 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -1125,6 +1125,12 @@ def raw_rnn(cell, loop_fn,
       def _copy_some_through(current, candidate):
         """Copy some tensors through via array_ops.where."""
         def copy_fn(cur_i, cand_i):
+          # TensorArray and scalar get passed through.
+          if isinstance(cur_i, tensor_array_ops.TensorArray):
+            return cand_i
+          if cur_i.shape.ndims == 0:
+            return cand_i
+          # Otherwise propagate the old or the new value.
           with ops.colocate_with(cand_i):
             return array_ops.where(elements_finished, cur_i, cand_i)
         return nest.map_structure(copy_fn, current, candidate)
-- 
GitLab


From 4078ecd4e107cb61da8e4a3ce0293bbb8d9dbb62 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 26 Jan 2018 10:11:50 -0800
Subject: [PATCH 240/966] Improve profiler error message when graph_path is not
 available. (#16463)

This fix tries to address the issue raised in 16451 to provide
a better error message when graph_path is not available for profiler.

Previously if graph_path is not available, the process will crash
with not very imformative message and a core dump:
```
2018-01-26 01:43:29.458032: F tensorflow/core/profiler/profiler.cc:206] Non-OK-status: ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false) status: Not found: ; No such file or directory
Aborted (core dumped)
```

With this fix, the error message is improved to:
```
Failed to read graph_path: Invalid argument: Cannot parse proto file.
```
and the process exit with 1.

This fix fixes 16451.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/profiler/profiler.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index 2cc212d589..808e3c853b 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -206,8 +206,12 @@ int Run(int argc, char** argv) {
         "graph_path,op_log_path,run_meta_path\n");
     std::unique_ptr<GraphDef> graph(new GraphDef());
     if (!FLAGS_graph_path.empty()) {
-      TF_CHECK_OK(
-          ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false));
+      s = ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false);
+      if (!s.ok()) {
+        fprintf(stderr, "Failed to read graph_path: %s\n",
+                s.ToString().c_str());
+        return 1;
+      }
     }
 
     std::unique_ptr<OpLogProto> op_log(new OpLogProto());
-- 
GitLab


From a7d4f82660e1e0bfb8b4bd3a4378e389795b7f9e Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 26 Jan 2018 10:09:52 -0800
Subject: [PATCH 241/966] [tf.data] Move slow-path-related code into the slow
 path in IteratorHandleOp::Compute().

This slightly reduces the amount of work performed when an iterator is accessed (after the first access), and potentially reduces contention if concurrent steps are accessing the same iterator.

PiperOrigin-RevId: 183406221
---
 tensorflow/core/kernels/data/iterator_ops.cc | 95 +++++++++-----------
 1 file changed, 43 insertions(+), 52 deletions(-)

diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 56044a3d41..ca22f10a85 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -430,13 +430,10 @@ class IteratorStateVariant {
 REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant,
                                        kIteratorVariantTypeName);
 
-// TODO(mrry): Can we simply use the template kernel here?
 class IteratorHandleOp : public OpKernel {
  public:
   explicit IteratorHandleOp(OpKernelConstruction* ctx)
       : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_STRING, TensorShape({2}),
-                                                 &handle_, nullptr));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
@@ -460,56 +457,51 @@ class IteratorHandleOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
-    FunctionLibraryRuntime* lib = context->function_library();
-    std::unique_ptr<DeviceMgr> device_mgr(nullptr);
-    std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
-    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
-    // If the iterator is shared then we construct a new FLR, and pass that in.
-    // NOTE(mrry,rohanj): In this case it is not possible to call remote
-    // functions from the iterator. We may add this functionality if there
-    // is sufficient demand, but it will require a significant refactoring.
-    if (!name_.empty()) {
-      lib = CreateFLR(context, &device_mgr, &flib_def, &pflr);
-    }
+    {
+      mutex_lock l(mu_);
+      if (resource_ == nullptr) {
+        FunctionLibraryRuntime* lib = context->function_library();
+        std::unique_ptr<DeviceMgr> device_mgr(nullptr);
+        std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+        std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
+        // If the iterator is shared then we construct a new FLR, and pass that
+        // in. NOTE(mrry,rohanj): In this case it is not possible to call remote
+        // functions from the iterator. We may add this functionality if there
+        // is sufficient demand, but it will require a significant refactoring.
+        if (!name_.empty()) {
+          lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr);
+        }
 
-    if (resource_ == nullptr) {
-      ResourceMgr* mgr = context->resource_manager();
-      OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+        ResourceMgr* mgr = context->resource_manager();
+        OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+
+        IteratorResource* resource;
+        OP_REQUIRES_OK(
+            context,
+            mgr->LookupOrCreate<IteratorResource>(
+                cinfo_.container(), cinfo_.name(), &resource,
+                [lib, &device_mgr, &flib_def, &pflr,
+                 this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                  *ret = new IteratorResource(
+                      output_dtypes_, output_shapes_, graph_def_version_,
+                      std::move(device_mgr), std::move(flib_def),
+                      std::move(pflr), lib);
+                  return Status::OK();
+                }));
+
+        Status s = VerifyResource(resource);
+        if (TF_PREDICT_FALSE(!s.ok())) {
+          resource->Unref();
+          context->SetStatus(s);
+          return;
+        }
 
-      IteratorResource* resource;
-      OP_REQUIRES_OK(
-          context,
-          mgr->LookupOrCreate<IteratorResource>(
-              cinfo_.container(), cinfo_.name(), &resource,
-              [lib, &device_mgr, &flib_def, &pflr, this](IteratorResource** ret)
-                  EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                    *ret = new IteratorResource(
-                        output_dtypes_, output_shapes_, graph_def_version_,
-                        std::move(device_mgr), std::move(flib_def),
-                        std::move(pflr), lib);
-                    return Status::OK();
-                  }));
-
-      Status s = VerifyResource(resource);
-      if (TF_PREDICT_FALSE(!s.ok())) {
-        resource->Unref();
-        context->SetStatus(s);
-        return;
+        resource_ = resource;
       }
-
-      auto h = handle_.AccessTensor(context)->template flat<string>();
-      h(0) = cinfo_.container();
-      h(1) = cinfo_.name();
-      resource_ = resource;
-    }
-    if (context->expected_output_dtype(0) == DT_RESOURCE) {
-      OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
-                                  context, 0, cinfo_.container(), cinfo_.name(),
-                                  MakeTypeIndex<IteratorResource>()));
-    } else {
-      context->set_output_ref(0, &mu_, handle_.AccessTensor(context));
     }
+    OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
+                                context, 0, cinfo_.container(), cinfo_.name(),
+                                MakeTypeIndex<IteratorResource>()));
   }
 
  private:
@@ -526,7 +518,7 @@ class IteratorHandleOp : public OpKernel {
     return Status::OK();
   }
 
-  FunctionLibraryRuntime* CreateFLR(
+  FunctionLibraryRuntime* CreatePrivateFLR(
       OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
       std::unique_ptr<FunctionLibraryDefinition>* flib_def,
       std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) {
@@ -546,9 +538,8 @@ class IteratorHandleOp : public OpKernel {
   }
 
   mutex mu_;
-  ContainerInfo cinfo_ GUARDED_BY(mu_);
+  ContainerInfo cinfo_;  // Written once under mu_ then constant afterwards.
   IteratorResource* resource_ GUARDED_BY(mu_) = nullptr;
-  PersistentTensor handle_ GUARDED_BY(mu_);
   DataTypeVector output_dtypes_;
   std::vector<PartialTensorShape> output_shapes_;
   const int graph_def_version_;
-- 
GitLab


From 3222a8bccc20bd43e657d700c6f9d95a2caf8c1e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 10:11:09 -0800
Subject: [PATCH 242/966] Cleanup: Ran clang-format on all *.{cc,h} in under
 grappler.

PiperOrigin-RevId: 183406440
---
 tensorflow/core/grappler/clusters/cluster.cc        | 3 +--
 tensorflow/core/grappler/costs/virtual_scheduler.h  | 2 +-
 tensorflow/core/grappler/optimizers/auto_parallel.h | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index 01a618ed77..39bfca244e 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) {
   DisableDetailedStats(false);
 }
 
-Cluster::~Cluster() {
-}
+Cluster::~Cluster() {}
 
 void Cluster::AllowSoftPlacement(bool soft_placement_state) {
   options_.config.set_allow_soft_placement(soft_placement_state);
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 9db6d46266..5116c8183c 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -325,7 +325,7 @@ class VirtualScheduler {
   // Boolean field for whether the cost is accurate.
   std::map<string, std::pair<int, bool>> op_costs_;
 
-  Costs graph_costs_;                // Graph cost.
+  Costs graph_costs_;                   // Graph cost.
   std::map<string, Costs> op_to_cost_;  // Per-op cost.
 
   // Auxilliary data structures for constructing NodeState and DeviceState.
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h
index c5d2d47782..8d1098d877 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.h
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 
-#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/framework/variable.pb.h"
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
-- 
GitLab


From a12bb2e5719e3cd17916690cc8caef0ec7bcdf4d Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Fri, 26 Jan 2018 19:45:30 +0100
Subject: [PATCH 243/966] long was removed in Python 3 (#16439)

---
 tensorflow/python/tools/saved_model_cli.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index 21e8e803fc..5b0a584c10 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -31,6 +31,7 @@ import warnings
 
 import numpy as np
 
+from six import integer_types
 from tensorflow.contrib.saved_model.python.saved_model import reader
 from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
 from tensorflow.core.example import example_pb2
@@ -440,7 +441,7 @@ def _create_example_string(example_dict):
     elif isinstance(feature_list[0], str):
       example.features.feature[feature_name].bytes_list.value.extend(
           feature_list)
-    elif isinstance(feature_list[0], (int, long)):
+    elif isinstance(feature_list[0], integer_types):
       example.features.feature[feature_name].int64_list.value.extend(
           feature_list)
     else:
-- 
GitLab


From c33286926af522f6fe16a0b0d8e76d83d120bdfb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 11:00:37 -0800
Subject: [PATCH 244/966] Increase shard count of
 //third_party/tensorflow/python:nn_batchnorm_test to avoid timeouts

When run under asan, the test runs for about 5 minutes, and sometimes
longer, causing frequent timeouts.

This change increases the shard count of the test to 4, which brings the run time
of the longest running shard under asan to about 2 minutes.

PiperOrigin-RevId: 183414888
---
 tensorflow/python/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 01b3e92d2d..a323d5bc39 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2668,6 +2668,7 @@ cuda_py_test(
         ":nn_ops_gen",
         "//third_party/py/numpy",
     ],
+    shard_count = 4,
     tags = ["no_windows"],
 )
 
-- 
GitLab


From a0a524e2b6e0e34ab79595a1cba80f5357ee8724 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Fri, 26 Jan 2018 11:04:47 -0800
Subject: [PATCH 245/966] Add available choices to toco flags and fix minor
 formatting issues.

PiperOrigin-RevId: 183415713
---
 .../contrib/lite/toco/toco_cmdline_flags.cc   | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index f8281f3a57..c5a62fdb62 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
            "For Protobuf formats, the binary format will be used."),
       Flag("input_format", parsed_flags.input_format.bind(),
            parsed_flags.input_format.default_value(),
-           "Input file format. One of: tensorflow_graphdef, "),
+           "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."),
       Flag("output_format", parsed_flags.output_format.bind(),
-           parsed_flags.output_format.default_value(), "Output file format."),
+           parsed_flags.output_format.default_value(),
+           "Output file format. "
+           "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."),
       Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
            parsed_flags.default_ranges_min.default_value(),
            "If defined, will be used as the default value for the min bound "
@@ -58,11 +60,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
       Flag("inference_type", parsed_flags.inference_type.bind(),
            parsed_flags.inference_type.default_value(),
            "Target data type of arrays in the output file (for input_arrays, "
-           "this may be overridden by inference_input_type)."),
+           "this may be overridden by inference_input_type). "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("inference_input_type", parsed_flags.inference_input_type.bind(),
            parsed_flags.inference_input_type.default_value(),
-           "Target data type of input arrays. If not specified, inference_type "
-           "is used."),
+           "Target data type of input arrays. "
+           "If not specified, inference_type is used. "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("input_type", parsed_flags.input_type.bind(),
            parsed_flags.input_type.default_value(),
            "Deprecated ambiguous flag that set both --input_data_types and "
@@ -76,35 +80,31 @@ bool ParseTocoFlagsFromCommandLineFlags(
 
       Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(),
            parsed_flags.drop_fake_quant.default_value(),
-           "Ignore and discard FakeQuant nodes. For instance, that can be used "
-           "to "
+           "Ignore and discard FakeQuant nodes. For instance, to "
            "generate plain float code without fake-quantization from a "
-           "quantized "
-           "graph."),
+           "quantized graph."),
       Flag(
           "reorder_across_fake_quant",
           parsed_flags.reorder_across_fake_quant.bind(),
           parsed_flags.reorder_across_fake_quant.default_value(),
           "Normally, FakeQuant nodes must be strict boundaries for graph "
           "transformations, in order to ensure that quantized inference has "
-          "the "
-          "exact same arithmetic behavior as quantized training --- which is "
-          "the "
-          "whole point of quantized training and of FakeQuant nodes in the "
-          "first "
-          "place. However, that entails subtle requirements on where exactly "
+          "the exact same arithmetic behavior as quantized training --- which "
+          "is the whole point of quantized training and of FakeQuant nodes in "
+          "the first place. "
+          "However, that entails subtle requirements on where exactly "
           "FakeQuant nodes must be placed in the graph. Some quantized graphs "
           "have FakeQuant nodes at unexpected locations, that prevent graph "
           "transformations that are necessary in order to generate inference "
           "code for these graphs. Such graphs should be fixed, but as a "
           "temporary work-around, setting this reorder_across_fake_quant flag "
-          "allows toco to perform necessary graph transformaitons on them, "
+          "allows TOCO to perform necessary graph transformaitons on them, "
           "at the cost of no longer faithfully matching inference and training "
           "arithmetic."),
       Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
            parsed_flags.allow_custom_ops.default_value(),
-           "If true, allow TOCO to create TF Lite Custom operators for all the"
-           "unsupported Tensorflow ops."),
+           "If true, allow TOCO to create TF Lite Custom operators for all the "
+           "unsupported TensorFlow ops."),
       Flag(
           "drop_control_dependency",
           parsed_flags.drop_control_dependency.bind(),
-- 
GitLab


From c99daaf104105a4e711d91dca8c73c1badecbe5f Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Fri, 26 Jan 2018 11:23:02 -0800
Subject: [PATCH 246/966] Performance improvements to some GPU code to use
 shared locks instead of unique locks for some hotspot cases.

PiperOrigin-RevId: 183418559
---
 .../core/common_runtime/gpu/process_state.cc  | 18 ++++++++++++++-
 tensorflow/stream_executor/executor_cache.cc  | 23 +++++++++++++++----
 .../stream_executor/multi_platform_manager.cc |  4 ++--
 3 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 995fd1253f..2f13cf8bd7 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -230,8 +230,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   // TODO(tucker): actually maintain separate CPUAllocators for
   // different numa_nodes.  For now, just one.
   numa_node = 0;
-  mutex_lock lock(mu_);
 
+  {
+    // Here we optimize the most common use case where cuda_host_allocators_
+    // and cuda_al_ have already been populated and since we're only reading
+    // these vectors, we can get by with a shared lock. In the slower case,
+    // we take a unique lock and populate these vectors.
+    tf_shared_lock lock(mu_);
+
+    if (FLAGS_brain_gpu_record_mem_types &&
+        static_cast<int>(cuda_al_.size()) > 0) {
+      return cuda_al_[0];
+    }
+    if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
+      return cuda_host_allocators_[0];
+    }
+  }
+
+  mutex_lock lock(mu_);
   // Find the first valid StreamExecutor to request CUDA host memory
   // through, since any will work.
   //
diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc
index a23d6a70ba..d1a8aae167 100644
--- a/tensorflow/stream_executor/executor_cache.cc
+++ b/tensorflow/stream_executor/executor_cache.cc
@@ -23,6 +23,14 @@ namespace gputools {
 port::StatusOr<StreamExecutor*> ExecutorCache::GetOrCreate(
     const StreamExecutorConfig& config,
     const std::function<ExecutorFactory>& factory) {
+  // In the fast path case, the cache already has an entry and we can just
+  // return after Get() which only takes a shared lock and not a unique lock.
+  // If we need to create, we take a unique lock on cache_.
+  auto fast_result = Get(config);
+  if (fast_result.ok()) {
+    return fast_result;
+  }
+
   Entry* entry = nullptr;
   {
     mutex_lock lock{mutex_};
@@ -59,12 +67,17 @@ port::StatusOr<StreamExecutor*> ExecutorCache::Get(
     const StreamExecutorConfig& config) {
   Entry* entry = nullptr;
   {
-    mutex_lock lock{mutex_};
-    entry = &cache_[config.ordinal];
-    // Release the map lock; the address of 'entry' is stable because
-    // std::map guarantees reference stability.
+    tf_shared_lock lock{mutex_};
+    auto it = cache_.find(config.ordinal);
+    if (it != cache_.end()) {
+      entry = &it->second;
+    } else {
+      return port::Status(port::error::NOT_FOUND,
+                          port::Printf("No executors registered for ordinal %d",
+                                       config.ordinal));
+    }
   }
-  mutex_lock lock{entry->configurations_mutex};
+  tf_shared_lock lock{entry->configurations_mutex};
   if (entry->configurations.empty()) {
     return port::Status(
         port::error::NOT_FOUND,
diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc
index cc32a6beaa..f23224ae77 100644
--- a/tensorflow/stream_executor/multi_platform_manager.cc
+++ b/tensorflow/stream_executor/multi_platform_manager.cc
@@ -45,7 +45,7 @@ namespace gputools {
 
 /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithName(
     const string& target) {
-  mutex_lock lock(GetPlatformsMutex());
+  tf_shared_lock lock(GetPlatformsMutex());
   auto it = GetPlatformMap()->find(port::Lowercase(target));
 
   if (it == GetPlatformMap()->end()) {
@@ -59,7 +59,7 @@ namespace gputools {
 
 /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithId(
     const Platform::Id& id) {
-  mutex_lock lock(GetPlatformsMutex());
+  tf_shared_lock lock(GetPlatformsMutex());
   auto it = GetPlatformByIdMap()->find(id);
   if (it == GetPlatformByIdMap()->end()) {
     return port::Status(
-- 
GitLab


From a169db5d066efd696d80fd989091a54cc4fe5baf Mon Sep 17 00:00:00 2001
From: Mikalai Drabovich <nick.drabovich@amd.com>
Date: Fri, 26 Jan 2018 11:27:28 -0800
Subject: [PATCH 247/966] Update README.md (#16437)

* Update README.md

-png, --sample_index options are not available in google-perftools (2.4-0ubuntu5.16.04.1).
Also, since Ubuntu 16.04 wrongly recommends to install pprof from 'tau' package
"
The program 'pprof' is currently not installed. You can install it by typing:
sudo apt install tau
"
the typical user command should probably be

google-pprof --pdf --nodecount=100 <filename>

* Add `google-perftools` installation Note.
---
 tensorflow/core/profiler/README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 9e628b1065..7997bdfa05 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -240,8 +240,9 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file.
 # can also generate memory profile using `-select bytes`
 tfprof> code -select accelerator_micros -max_depth 100000 -output pprof:outfile=<filename>  -trim_name_regexes .*apply_op.*
 
-# Use pprof to visualize the generated file.
-pprof -png --nodecount=100 --sample_index=1 <filename>
+# Use google-pprof, from the google-perftools package to visualize the generated file.
+# On Ubuntu you can install it with `apt-get install it google-perftools`.
+google-pprof --pdf --nodecount=100 <filename>
 ```
 
 ![PprofGraph](g3doc/pprof.jpg)
-- 
GitLab


From 62b54bfc4bf112eab2a60034fe0ff7b6cfb2ddd7 Mon Sep 17 00:00:00 2001
From: Shrinidhi KL <klshrinidhi@users.noreply.github.com>
Date: Fri, 26 Jan 2018 14:28:34 -0500
Subject: [PATCH 248/966] Fix build errors in contrib/mpi introduced by commit
 6042b5d267f (#16427)

* Add missing header.

* Fix typo. Should refer to incoming argument.
---
 tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc | 2 +-
 tensorflow/contrib/mpi/mpi_rendezvous_mgr.h  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
index 8d14a3ef04..0252bc7992 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
@@ -152,7 +152,7 @@ MPIRemoteRendezvous::~MPIRemoteRendezvous() {}
 void MPIRendezvousMgr::AddRequest(RecvTensorRequest request,
                                   const int mpi_dst) {
   TF_CHECK_OK(recv_tensor_recent_request_ids_.TrackUnique(
-      req.request_id(), "RecvTensor (MPIRendezvousMgr)", req));
+      request.request_id(), "RecvTensor (MPIRendezvousMgr)", request));
   const int64 step_id = request.step_id();
   const std::string& key = request.rendezvous_key();
   Rendezvous::ParsedKey parsed;
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
index ca42ee2f6d..d35e65363f 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/contrib/mpi/mpi_msg.pb.h"
 #include "tensorflow/contrib/mpi/mpi_utils.h"
 #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/recent_request_ids.h"
 #include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
-- 
GitLab


From de946383a20ca5aada87dd4ca956371ec4a743b2 Mon Sep 17 00:00:00 2001
From: Chris Leary <leary@google.com>
Date: Fri, 26 Jan 2018 11:32:38 -0800
Subject: [PATCH 249/966] [XLA] Improve error message for bad slices.

PiperOrigin-RevId: 183420038
---
 .../compiler/xla/service/shape_inference.cc   | 55 +++++++++++--------
 .../xla/service/shape_inference_test.cc       | 15 +++++
 2 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index a6d6c8b27f..4ba6da6ccc 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -37,6 +37,9 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
+using tensorflow::str_util::Join;
+using tensorflow::strings::Printf;
+
 namespace xla {
 
 namespace {
@@ -934,7 +937,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}",
       BinaryOperation_Name(operation).c_str(),
       ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(),
-      tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str());
+      Join(broadcast_dimensions, ", ").c_str());
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
 
@@ -1097,7 +1100,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Map operation requires all operands to have the same shape; got: "
         "%s",
-        tensorflow::str_util::Join(pieces, ", ").c_str());
+        Join(pieces, ", ").c_str());
   }
 
   // Check that dimensions.size == arg_shape.dimensions_size() (we currently
@@ -1114,7 +1117,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     if (dimensions[i] != i) {
       return InvalidArgument(
           "Map requires monotonically increasing dimension numbers, found: %s ",
-          tensorflow::str_util::Join(dimensions, ", ").c_str());
+          Join(dimensions, ", ").c_str());
     }
   }
 
@@ -1914,21 +1917,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
     tensorflow::gtl::ArraySlice<int64> limits,
     tensorflow::gtl::ArraySlice<int64> strides) {
+  auto error = [&](const string& message) {
+    return InvalidArgument(
+        "%s in slice operation; argument shape: %s; starts: {%s}; limits: "
+        "{%s}; strides: {%s}",
+        message.c_str(), ShapeUtil::HumanString(arg).c_str(),
+        Join(starts, ",").c_str(), Join(limits, ",").c_str(),
+        Join(strides, ",").c_str());
+  };
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
   VLOG(2) << tensorflow::strings::Printf(
       "slicing shape %s starts={%s} limits={%s}",
-      ShapeUtil::HumanString(arg).c_str(),
-      tensorflow::str_util::Join(starts, ", ").c_str(),
-      tensorflow::str_util::Join(limits, ", ").c_str());
+      ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(),
+      Join(limits, ", ").c_str());
 
   if (starts.size() != limits.size()) {
-    return InvalidArgument("slice start and limit sizes differ: %zu vs %zu",
-                           starts.size(), limits.size());
+    return error(Printf("slice start and limit sizes differ: %zu vs %zu",
+                        starts.size(), limits.size()));
   }
 
   if (starts.size() != strides.size()) {
-    return InvalidArgument("slice start and strides sizes differ: %zu vs %zu",
-                           starts.size(), strides.size());
+    return error(Printf("slice start and strides sizes differ: %zu vs %zu",
+                        starts.size(), strides.size()));
   }
 
   if (starts.size() != ShapeUtil::Rank(arg)) {
@@ -1947,20 +1957,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                              start_index);
     }
     if (limit_index > arg.dimensions(dimension)) {
-      return InvalidArgument(
-          "limit index (%lld) must be less than or equal to dimension "
-          "size (%lld)",
-          limit_index, arg.dimensions(dimension));
+      return error(
+          Printf("limit index (%lld) must be less than or equal to dimension "
+                 "size (%lld)",
+                 limit_index, arg.dimensions(dimension)));
     }
     VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
                                            start_index);
     VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
                                            limit_index);
     if (start_index > limit_index) {
-      return InvalidArgument(
-          "limit index (%lld) must be greater or equal to "
-          "start index (%lld) in slice with positive stride",
-          limit_index, start_index);
+      return error(
+          Printf("limit index (%lld) must be greater or equal to "
+                 "start index (%lld) in slice with positive stride",
+                 limit_index, start_index));
     }
     if (stride <= 0) {
       return InvalidArgument("stride (%lld) must be positive", stride);
@@ -1983,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
       ShapeUtil::HumanString(operand_shape).c_str(),
       ShapeUtil::HumanString(start_indices_shape).c_str(),
-      tensorflow::str_util::Join(slice_sizes, ", ").c_str());
+      Join(slice_sizes, ", ").c_str());
 
   if (ShapeUtil::Rank(start_indices_shape) != 1) {
     return InvalidArgument(
@@ -2280,8 +2290,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Reshape dimensions [%s] are not a permutation of the operand "
         "dimensions (operand shape is %s).",
-        tensorflow::str_util::Join(dimensions, ",").c_str(),
-        ShapeUtil::HumanString(operand).c_str());
+        Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str());
   }
 
   return inferred_shape;
@@ -2373,8 +2382,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   // The applied function's arity equals the number of arguments.
   if (arg_shapes.size() != to_apply.parameters_size()) {
     string computation_signature = ShapeUtil::HumanString(to_apply);
-    string argument_shapes = tensorflow::str_util::Join(
-        arg_shapes, ", ", [](string* out, const Shape* shape) {
+    string argument_shapes =
+        Join(arg_shapes, ", ", [](string* out, const Shape* shape) {
           tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape));
         });
     return InvalidArgument(
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 99d87f3b55..026c021165 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -1512,5 +1512,20 @@ TEST_F(ShapeInferenceTest, Conditional) {
                         "must have the same shape"));
 }
 
+TEST_F(ShapeInferenceTest, BadSlice) {
+  auto arg = ShapeUtil::MakeShape(F32, {4});
+  StatusOr<Shape> statusor =
+      ShapeInference::InferSliceShape(arg, {0}, {5}, {1});
+  ASSERT_FALSE(statusor.ok());
+
+  LOG(INFO) << statusor.status();
+
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("less than or equal to dimension size"))
+      << statusor.status();
+  EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape"))
+      << statusor.status();
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 8d5019f7e11d51234a45b3e77541c9f3879bfd93 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Fri, 26 Jan 2018 11:46:21 -0800
Subject: [PATCH 250/966] Fix py3 build rules for all py tests under py2tf.

PiperOrigin-RevId: 183422144
---
 tensorflow/contrib/py2tf/BUILD                      | 3 +++
 tensorflow/contrib/py2tf/converters/BUILD           | 8 ++++++++
 tensorflow/contrib/py2tf/pyct/BUILD                 | 5 +++++
 tensorflow/contrib/py2tf/pyct/static_analysis/BUILD | 3 +++
 4 files changed, 19 insertions(+)

diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/py2tf/BUILD
index d395de986d..3e846aefeb 100644
--- a/tensorflow/contrib/py2tf/BUILD
+++ b/tensorflow/contrib/py2tf/BUILD
@@ -57,6 +57,7 @@ py_library(
 py_test(
     name = "api_test",
     srcs = ["api_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",
@@ -66,6 +67,7 @@ py_test(
 py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",
@@ -76,6 +78,7 @@ py_test(
 py_test(
     name = "naming_test",
     srcs = ["naming_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
index 2b0a1234e6..4f90f94e09 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -52,6 +52,7 @@ py_library(
 py_test(
     name = "break_canonicalization_test",
     srcs = ["break_canonicalization_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -62,6 +63,7 @@ py_test(
 py_test(
     name = "call_trees_test",
     srcs = ["call_trees_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -72,6 +74,7 @@ py_test(
 py_test(
     name = "continue_canonicalization_test",
     srcs = ["continue_canonicalization_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -82,6 +85,7 @@ py_test(
 py_test(
     name = "control_flow_test",
     srcs = ["control_flow_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -92,6 +96,7 @@ py_test(
 py_test(
     name = "builtin_functions_test",
     srcs = ["builtin_functions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -112,6 +117,7 @@ py_test(
 py_test(
     name = "logical_expressions_test",
     srcs = ["logical_expressions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -122,6 +128,7 @@ py_test(
 py_test(
     name = "print_functions_test",
     srcs = ["print_functions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -133,6 +140,7 @@ py_test(
 py_test(
     name = "side_effect_guards_test",
     srcs = ["side_effect_guards_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD
index e0331dbc97..88902dea84 100644
--- a/tensorflow/contrib/py2tf/pyct/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/BUILD
@@ -38,6 +38,7 @@ py_library(
 py_test(
     name = "anno_test",
     srcs = ["anno_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -47,6 +48,7 @@ py_test(
 py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -57,6 +59,7 @@ py_test(
 py_test(
     name = "parser_test",
     srcs = ["parser_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -66,6 +69,7 @@ py_test(
 py_test(
     name = "pretty_printer_test",
     srcs = ["pretty_printer_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -75,6 +79,7 @@ py_test(
 py_test(
     name = "templates_test",
     srcs = ["templates_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
index abaf953678..32e2954fff 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
@@ -32,6 +32,7 @@ py_library(
 py_test(
     name = "access_test",
     srcs = ["access_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
@@ -43,6 +44,7 @@ py_test(
 py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
@@ -53,6 +55,7 @@ py_test(
 py_test(
     name = "type_info_test",
     srcs = ["type_info_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
-- 
GitLab


From 05eb6df4c5c28bd24606b6ddf971a7c4b3fa3e78 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Fri, 26 Jan 2018 11:46:44 -0800
Subject: [PATCH 251/966] Fix bug with Operation._control_inputs setter.

PiperOrigin-RevId: 183422192
---
 tensorflow/python/framework/ops.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index b107670275..e3a52141a0 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -2103,6 +2103,10 @@ class Operation(object):
     logging.warning("Operation._control_inputs is private, use "
                     "Operation.control_inputs instead. "
                     "Operation._control_inputs will eventually be removed.")
+    # Copy value because it may be self._control_inputs_val (in particular if
+    # this is called from self._control_inputs += ...), and we don't want to
+    # clear value below.
+    value = copy.copy(value)
     self._remove_all_control_inputs()
     self._add_control_inputs(value)
 
-- 
GitLab


From ff6463f4277f412b98d6e6bb1283841ff66902de Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Fri, 26 Jan 2018 11:51:03 -0800
Subject: [PATCH 252/966] Make softmax_op_test.py work with C API enabled.

PiperOrigin-RevId: 183422829
---
 tensorflow/python/kernel_tests/softmax_op_test.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index be72c19407..bb3f6970e4 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -25,11 +25,13 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class SoftmaxTest(test.TestCase):
 
   def _npSoftmax(self, features, dim=-1, log=False):
@@ -174,8 +176,11 @@ class SoftmaxTest(test.TestCase):
 
   def testDimTooLarge(self):
     with self.test_session():
+      # Use placeholder to make sure we get runtime error instead of shape
+      # inference error.
+      dim = array_ops.placeholder_with_default(100, shape=[])
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        nn_ops.softmax([1., 2., 3., 4.], dim=100).eval()
+        nn_ops.softmax([1., 2., 3., 4.], dim=dim).eval()
 
   def testLargeDims(self):
     # Make sure that we properly handle large inputs. See
-- 
GitLab


From 2c505193696b106e5e705c2e7e7bcc6e9bb2a566 Mon Sep 17 00:00:00 2001
From: Yoni Tsafir <yoni@joytunes.com>
Date: Fri, 26 Jan 2018 22:01:56 +0200
Subject: [PATCH 253/966] Add a way to provide target nodes in Android (#14722)

* Add a way to provide target nodes in Android

This is required when running some models as a step for initializing the graph etc.

* Fix enableStats mistake on overload

Falsely passed enableStats as false instead of the parameter for the non overloaded version.
---
 .../android/TensorFlowInferenceInterface.java        | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
index dc5b9fb887..e51e3f747b 100644
--- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
+++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
@@ -194,6 +194,13 @@ public class TensorFlowInferenceInterface {
    * @param outputNames A list of output nodes which should be filled by the inference pass.
    */
   public void run(String[] outputNames, boolean enableStats) {
+    run(outputNames, enableStats, new String[] {});
+  }
+
+  /**
+   * An overloaded version of runInference that allows supplying targetNodeNames as well
+   */
+  public void run(String[] outputNames, boolean enableStats, String[] targetNodeNames) {
     // Release any Tensors from the previous run calls.
     closeFetches();
 
@@ -204,6 +211,11 @@ public class TensorFlowInferenceInterface {
       runner.fetch(tid.name, tid.outputIndex);
     }
 
+	// Add targets.
+    for (String t : targetNodeNames) {
+      runner.addTarget(t);
+    }
+
     // Run the session.
     try {
       if (enableStats) {
-- 
GitLab


From 0f65c8f572201f8838189f3e3c3e455759112c14 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 11:59:56 -0800
Subject: [PATCH 254/966] Cleanup: Ran clang-format on all *.{cc,h} files in
 tensorflow/core/kernels.

PiperOrigin-RevId: 183423961
---
 tensorflow/core/kernels/adjust_contrast_op.cc |   4 +-
 .../core/kernels/adjust_contrast_op_test.cc   |   3 +-
 .../core/kernels/adjust_saturation_op.cc      |   5 +-
 tensorflow/core/kernels/aggregate_ops_cpu.h   |  12 +-
 tensorflow/core/kernels/attention_ops.cc      |   5 +-
 tensorflow/core/kernels/avgpooling_op.h       |   5 +-
 .../core/kernels/avgpooling_op_gpu.cu.cc      |  14 +-
 tensorflow/core/kernels/barrier_ops.cc        |  34 +-
 tensorflow/core/kernels/batch_kernels.cc      |   4 +-
 .../core/kernels/batch_matmul_op_impl.h       |  27 +-
 .../core/kernels/batch_matmul_op_real.cc      |   2 +-
 .../core/kernels/batch_matmul_op_test.cc      |   7 +-
 tensorflow/core/kernels/batch_norm_op.cc      |   2 +-
 tensorflow/core/kernels/batch_norm_op_test.cc |   2 +-
 tensorflow/core/kernels/batchtospace_op.cc    |   7 +-
 tensorflow/core/kernels/bcast_ops.cc          |   2 +-
 tensorflow/core/kernels/bias_op_gpu.cu.cc     |  43 +-
 tensorflow/core/kernels/bounds_check.h        |   2 +-
 .../core/kernels/candidate_sampler_ops.cc     |  17 +-
 tensorflow/core/kernels/cast_op.cc            |  17 +-
 tensorflow/core/kernels/cast_op.h             |   3 +-
 tensorflow/core/kernels/cast_op_impl.h        |  28 +-
 tensorflow/core/kernels/cast_op_test.cc       |  12 +-
 tensorflow/core/kernels/colorspace_op.cc      |  65 +-
 tensorflow/core/kernels/colorspace_op.h       |   7 +-
 .../core/kernels/colorspace_op_gpu.cu.cc      |   4 +-
 tensorflow/core/kernels/colorspace_op_test.cc |  56 +-
 tensorflow/core/kernels/concat_lib.h          |  20 +-
 tensorflow/core/kernels/concat_lib_cpu.cc     |  32 +-
 tensorflow/core/kernels/concat_lib_cpu.h      |   6 +-
 tensorflow/core/kernels/concat_op.cc          |  21 +-
 tensorflow/core/kernels/concat_op_test.cc     |   3 +-
 .../kernels/conditional_accumulator_base.h    |   2 +-
 .../kernels/conditional_accumulator_op.cc     |   8 +-
 tensorflow/core/kernels/constant_op.cc        |   1 -
 tensorflow/core/kernels/control_flow_ops.cc   |  96 +--
 .../core/kernels/control_flow_ops_test.cc     |   1 +
 tensorflow/core/kernels/conv_ops.cc           |   2 +-
 tensorflow/core/kernels/conv_ops_fused.cc     |  78 ++-
 tensorflow/core/kernels/conv_ops_gpu.h        |   1 -
 tensorflow/core/kernels/conv_ops_gpu_3.cu.cc  |  31 +-
 .../core/kernels/conv_ops_using_gemm.cc       |  29 +-
 tensorflow/core/kernels/cross_op_gpu.cu.cc    |   2 +-
 tensorflow/core/kernels/ctc_decoder_ops.cc    |  11 +-
 tensorflow/core/kernels/ctc_loss_op.cc        |   4 +-
 tensorflow/core/kernels/cwise_op_abs.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_acos.cc      |   2 +-
 tensorflow/core/kernels/cwise_op_acosh.cc     |   6 +-
 tensorflow/core/kernels/cwise_op_add_1.cc     |   3 +-
 tensorflow/core/kernels/cwise_op_add_2.cc     |   4 +-
 tensorflow/core/kernels/cwise_op_asin.cc      |   2 +-
 tensorflow/core/kernels/cwise_op_asinh.cc     |   8 +-
 tensorflow/core/kernels/cwise_op_atan.cc      |   2 +-
 tensorflow/core/kernels/cwise_op_atanh.cc     |   4 +-
 tensorflow/core/kernels/cwise_op_ceil.cc      |   2 +-
 tensorflow/core/kernels/cwise_op_cos.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_cosh.cc      |  16 +-
 tensorflow/core/kernels/cwise_op_div.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_exp.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_expm1.cc     |   2 +-
 tensorflow/core/kernels/cwise_op_floor.cc     |   2 +-
 tensorflow/core/kernels/cwise_op_floor_div.cc |   2 +-
 tensorflow/core/kernels/cwise_op_floor_mod.cc |   2 +-
 .../core/kernels/cwise_op_gpu_conj.cu.cc      |   4 +-
 .../core/kernels/cwise_op_gpu_equal_to.cu.cc  |   2 +-
 .../core/kernels/cwise_op_gpu_select.cu.cc    |  24 +-
 tensorflow/core/kernels/cwise_op_greater.cc   |   2 +-
 .../core/kernels/cwise_op_greater_equal.cc    |   5 +-
 tensorflow/core/kernels/cwise_op_invert.cc    |   2 +-
 tensorflow/core/kernels/cwise_op_isfinite.cc  |   2 +-
 tensorflow/core/kernels/cwise_op_isinf.cc     |   2 +-
 tensorflow/core/kernels/cwise_op_isnan.cc     |   2 +-
 tensorflow/core/kernels/cwise_op_less.cc      |   2 +-
 .../core/kernels/cwise_op_less_equal.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_log.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_log1p.cc     |   2 +-
 tensorflow/core/kernels/cwise_op_maximum.cc   |   2 +-
 tensorflow/core/kernels/cwise_op_minimum.cc   |   2 +-
 tensorflow/core/kernels/cwise_op_mul_1.cc     |   8 +-
 tensorflow/core/kernels/cwise_op_mul_2.cc     |   6 +-
 tensorflow/core/kernels/cwise_op_neg.cc       |   2 +-
 .../core/kernels/cwise_op_not_equal_to_1.cc   |   2 +-
 .../core/kernels/cwise_op_not_equal_to_2.cc   |   2 +-
 .../core/kernels/cwise_op_reciprocal.cc       |   4 +-
 tensorflow/core/kernels/cwise_op_select.cc    |  30 +-
 tensorflow/core/kernels/cwise_op_sigmoid.cc   |   4 +-
 tensorflow/core/kernels/cwise_op_sign.cc      |   2 +-
 tensorflow/core/kernels/cwise_op_sin.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_sinh.cc      |  16 +-
 tensorflow/core/kernels/cwise_op_sqrt.cc      |   4 +-
 tensorflow/core/kernels/cwise_op_square.cc    |   2 +-
 tensorflow/core/kernels/cwise_op_sub.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_tan.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_tanh.cc      |   2 +-
 tensorflow/core/kernels/cwise_ops_common.cc   |   6 +-
 .../core/kernels/cwise_ops_gpu_gradients.cu.h |  14 +-
 tensorflow/core/kernels/cwise_ops_gradients.h |   3 +-
 .../core/kernels/cwise_ops_sycl_common.h      |  10 +-
 tensorflow/core/kernels/cwise_ops_test.cc     |  42 +-
 tensorflow/core/kernels/debug_ops.cc          |   6 +-
 tensorflow/core/kernels/debug_ops.h           |   4 +-
 tensorflow/core/kernels/decode_csv_op.cc      |  40 +-
 tensorflow/core/kernels/decode_image_op.cc    |  32 +-
 tensorflow/core/kernels/deep_conv2d.cc        |   6 +-
 tensorflow/core/kernels/dense_update_ops.cc   |  16 +-
 .../core/kernels/depthwise_conv_grad_op.cc    |   4 +-
 tensorflow/core/kernels/depthwise_conv_op.cc  |  15 +-
 .../core/kernels/depthwise_conv_op_gpu.cu.cc  |   4 +-
 tensorflow/core/kernels/diag_op.cc            |  56 +-
 tensorflow/core/kernels/diag_op.h             |   8 +-
 tensorflow/core/kernels/diag_op_gpu.cu.cc     |  52 +-
 tensorflow/core/kernels/diag_op_test.cc       |   5 +-
 tensorflow/core/kernels/dilation_ops.cc       |  26 +-
 .../core/kernels/dilation_ops_gpu.cu.cc       |  15 +-
 .../core/kernels/draw_bounding_box_op.cc      |  28 +-
 .../core/kernels/dynamic_partition_op.cc      |   6 +-
 .../kernels/dynamic_partition_op_gpu.cu.cc    |  57 +-
 tensorflow/core/kernels/eigen_activations.h   |  38 +-
 .../core/kernels/eigen_activations_test.cc    |   2 +-
 tensorflow/core/kernels/eigen_attention.h     |  83 ++-
 .../core/kernels/eigen_attention_test.cc      |   2 +-
 .../eigen_backward_spatial_convolutions.h     |  60 +-
 ...igen_backward_spatial_convolutions_test.cc |   2 +-
 tensorflow/core/kernels/eigen_pooling.h       |   8 +-
 tensorflow/core/kernels/eigen_pooling_test.cc |   2 +-
 tensorflow/core/kernels/eigen_softmax.h       |  61 +-
 tensorflow/core/kernels/eigen_softmax_test.cc |   2 +-
 .../core/kernels/eigen_spatial_convolutions.h |  32 +-
 tensorflow/core/kernels/encode_jpeg_op.cc     |  16 +-
 .../core/kernels/example_parsing_ops.cc       |  29 +-
 tensorflow/core/kernels/fact_op.cc            |  12 +-
 .../core/kernels/fake_quant_ops_test.cc       |  44 +-
 tensorflow/core/kernels/fifo_queue.cc         | 169 +++--
 tensorflow/core/kernels/fill_functor.cc       |  10 +-
 .../core/kernels/fractional_avg_pool_op.cc    |   5 +-
 tensorflow/core/kernels/function_ops.cc       |  31 +-
 .../core/kernels/fused_batch_norm_op.cu.cc    |   3 +-
 tensorflow/core/kernels/gather_functor.cc     |  10 +-
 tensorflow/core/kernels/gather_functor.h      |  52 +-
 tensorflow/core/kernels/gather_op.cc          |   3 +-
 tensorflow/core/kernels/hinge-loss.h          |   5 +-
 .../core/kernels/histogram_op_gpu.cu.cc       |   8 +-
 tensorflow/core/kernels/image_resizer_state.h |  10 +-
 tensorflow/core/kernels/in_topk_op.cc         |  64 +-
 tensorflow/core/kernels/inplace_ops.cc        |  10 +-
 tensorflow/core/kernels/l2loss_op.cc          |   2 +-
 tensorflow/core/kernels/linalg_ops_common.cc  |   1 -
 tensorflow/core/kernels/lmdb_reader_op.cc     |  15 +-
 tensorflow/core/kernels/logistic-loss.h       |   7 +-
 tensorflow/core/kernels/loss_test.cc          | 201 +++---
 tensorflow/core/kernels/lrn_op.cc             |  25 +-
 tensorflow/core/kernels/matching_files_op.cc  |  11 +-
 tensorflow/core/kernels/matmul_op.cc          |   4 +-
 tensorflow/core/kernels/matmul_op.h           |   3 +-
 .../core/kernels/matrix_exponential_op.cc     |  12 +-
 .../core/kernels/matrix_logarithm_op.cc       |  12 +-
 tensorflow/core/kernels/matrix_set_diag_op.cc |   4 +-
 tensorflow/core/kernels/maxpooling_op.cc      |   6 +-
 .../core/kernels/maxpooling_op_gpu.cu.cc      |   4 +-
 tensorflow/core/kernels/meta_support.cc       |   6 +-
 tensorflow/core/kernels/mfcc.cc               |  26 +-
 tensorflow/core/kernels/mfcc.h                |   5 +-
 .../core/kernels/mfcc_mel_filterbank.cc       |  46 +-
 tensorflow/core/kernels/mfcc_mel_filterbank.h |   8 +-
 .../core/kernels/mfcc_mel_filterbank_test.cc  |  15 +-
 tensorflow/core/kernels/mfcc_test.cc          |   9 +-
 tensorflow/core/kernels/mirror_pad_op.cc      |   8 +-
 tensorflow/core/kernels/mkl_avgpooling_op.cc  | 279 ++++----
 .../core/kernels/mkl_batch_matmul_op.cc       |   1 -
 tensorflow/core/kernels/mkl_concat_op.cc      | 132 ++--
 .../core/kernels/mkl_conv_grad_bias_ops.cc    |   2 +-
 .../core/kernels/mkl_conv_grad_filter_ops.cc  | 182 ++---
 .../core/kernels/mkl_conv_grad_input_ops.cc   | 107 ++-
 tensorflow/core/kernels/mkl_conv_ops.cc       | 252 +++----
 tensorflow/core/kernels/mkl_conv_ops.h        | 230 +++----
 .../core/kernels/mkl_fused_batch_norm_op.cc   | 506 ++++++--------
 .../core/kernels/mkl_input_conversion_op.cc   |  10 +-
 tensorflow/core/kernels/mkl_lrn_op.cc         | 647 +++++++++---------
 tensorflow/core/kernels/mkl_maxpooling_op.cc  | 497 +++++++-------
 .../core/kernels/mkl_pooling_ops_common.cc    |  14 +-
 .../core/kernels/mkl_pooling_ops_common.h     | 328 +++++----
 tensorflow/core/kernels/mkl_relu_op.cc        | 212 +++---
 tensorflow/core/kernels/mkl_reshape_op.cc     | 105 ++-
 tensorflow/core/kernels/mkl_tfconv_op.cc      |   2 +-
 .../core/kernels/non_max_suppression_op.cc    |  24 +-
 .../kernels/non_max_suppression_op_test.cc    |  30 +-
 tensorflow/core/kernels/nth_element_op.cc     |  39 +-
 tensorflow/core/kernels/nth_element_op.h      |   6 +-
 tensorflow/core/kernels/one_hot_op_gpu.cu.cc  |   6 +-
 tensorflow/core/kernels/ops_util_test.cc      |  13 +-
 tensorflow/core/kernels/pack_op.cc            |   4 +-
 .../parameterized_truncated_normal_op.cc      |  40 +-
 ...arameterized_truncated_normal_op_gpu.cu.cc |  13 +-
 tensorflow/core/kernels/parse_tensor_op.cc    |   1 -
 tensorflow/core/kernels/pooling_ops_3d.cc     |   6 +-
 tensorflow/core/kernels/pooling_ops_3d_sycl.h |  17 +-
 tensorflow/core/kernels/pooling_ops_common.h  |   2 -
 .../core/kernels/quantization_utils_test.cc   |   8 +-
 .../core/kernels/quantize_and_dequantize_op.h |   3 +-
 tensorflow/core/kernels/quantize_op_test.cc   |   3 +-
 .../core/kernels/quantized_batch_norm_op.cc   |   2 +-
 .../core/kernels/quantized_concat_op.cc       |   8 +-
 tensorflow/core/kernels/quantized_conv_ops.cc |   7 +-
 .../core/kernels/quantized_instance_norm.cc   |   8 +-
 .../core/kernels/quantized_matmul_op.cc       |   6 +-
 .../core/kernels/quantized_matmul_op_test.cc  |  39 +-
 tensorflow/core/kernels/quantized_mul_op.cc   |   5 +-
 .../core/kernels/quantized_mul_op_test.cc     |  11 +-
 tensorflow/core/kernels/queue_base.cc         |   4 +-
 tensorflow/core/kernels/queue_ops.cc          |  11 +-
 tensorflow/core/kernels/random_crop_op.cc     |   8 +-
 tensorflow/core/kernels/random_op.cc          | 253 ++++---
 tensorflow/core/kernels/random_op_gpu.cu.cc   |   5 +-
 tensorflow/core/kernels/random_poisson_op.cc  |   2 +-
 .../core/kernels/random_shuffle_queue_op.cc   | 169 +++--
 .../core/kernels/reduction_gpu_kernels.cu.h   |   6 +-
 tensorflow/core/kernels/relu_op.cc            |  13 +-
 tensorflow/core/kernels/relu_op_functor.h     |  11 +-
 tensorflow/core/kernels/resize_bicubic_op.cc  |   2 +-
 .../core/kernels/resize_bicubic_op_test.cc    |  15 +-
 .../core/kernels/resize_bilinear_op_gpu.cu.cc |  20 +-
 tensorflow/core/kernels/reverse_op.cc         |   8 +-
 tensorflow/core/kernels/reverse_op_gpu.cu.cc  |  16 +-
 .../core/kernels/reverse_sequence_op.cc       |  68 +-
 .../kernels/reverse_sequence_op_gpu.cu.cc     |  12 +-
 .../core/kernels/save_restore_tensor.cc       |  10 +-
 tensorflow/core/kernels/scatter_functor.h     |  16 +-
 .../core/kernels/scatter_functor_gpu.cu.h     |  11 +-
 .../core/kernels/scatter_nd_op_cpu_impl.h     |   4 +-
 tensorflow/core/kernels/scatter_op.cc         |  30 +-
 tensorflow/core/kernels/sdca_internal.cc      |  45 +-
 tensorflow/core/kernels/sdca_internal.h       |   3 +-
 tensorflow/core/kernels/sdca_ops.cc           |  14 +-
 .../core/kernels/segment_reduction_ops.cc     |  30 +-
 .../core/kernels/segment_reduction_ops.h      |  19 +-
 .../kernels/segment_reduction_ops_gpu.cu.cc   |  12 +-
 .../core/kernels/self_adjoint_eig_op.cc       |   1 -
 tensorflow/core/kernels/sendrecv_ops.cc       |   6 +-
 tensorflow/core/kernels/sequence_ops.cc       |   8 +-
 tensorflow/core/kernels/session_ops.cc        |   2 +-
 tensorflow/core/kernels/shape_ops.h           |   8 +-
 tensorflow/core/kernels/slice_op.cc           | 168 +++--
 tensorflow/core/kernels/slice_op.h            |   1 -
 tensorflow/core/kernels/slice_op_cpu_impl.h   |   2 +-
 tensorflow/core/kernels/softmax_op.cc         |   6 +-
 .../kernels/spacetobatch_benchmark_test.cc    |   2 +-
 .../core/kernels/spacetobatch_functor.cc      |   2 +-
 .../core/kernels/spacetobatch_functor.h       |   2 +-
 .../kernels/spacetobatch_functor_gpu.cu.cc    |  10 +-
 tensorflow/core/kernels/spacetobatch_op.cc    |   7 +-
 tensorflow/core/kernels/sparse_add_grad_op.cc |  12 +-
 tensorflow/core/kernels/sparse_add_op.cc      |  15 +-
 tensorflow/core/kernels/sparse_add_op_test.cc |   4 +-
 .../sparse_conditional_accumulator_op.cc      |   5 +-
 tensorflow/core/kernels/sparse_cross_op.cc    |  16 +-
 .../kernels/sparse_dense_binary_op_shared.cc  |  10 +-
 .../sparse_dense_binary_op_shared_test.cc     |  16 +-
 tensorflow/core/kernels/sparse_matmul_op.cc   |  24 +-
 tensorflow/core/kernels/sparse_matmul_op.h    |   8 +-
 .../core/kernels/sparse_matmul_op_test.cc     |   8 +-
 .../core/kernels/sparse_reduce_sum_op_test.cc |   8 +-
 tensorflow/core/kernels/sparse_softmax_op.cc  |   5 +-
 .../kernels/sparse_sparse_binary_op_shared.cc |  15 +-
 tensorflow/core/kernels/sparse_split_op.cc    |  26 +-
 tensorflow/core/kernels/sparse_to_dense_op.cc |   5 +-
 .../core/kernels/sparse_to_dense_op_test.cc   |   1 -
 tensorflow/core/kernels/sparse_xent_op.cc     |   8 +-
 .../core/kernels/sparse_xent_op_test.cc       |   6 +-
 tensorflow/core/kernels/split_lib.h           |   2 +-
 tensorflow/core/kernels/split_lib_cpu.cc      |   4 +-
 tensorflow/core/kernels/split_op.cc           |  37 +-
 tensorflow/core/kernels/split_v_op.cc         |  14 +-
 tensorflow/core/kernels/stack_ops.cc          |  18 +-
 tensorflow/core/kernels/stage_op.cc           |  74 +-
 tensorflow/core/kernels/strided_slice_op.cc   |   2 +-
 .../core/kernels/strided_slice_op_impl.h      |   2 +-
 tensorflow/core/kernels/string_join_op.cc     |   6 +-
 tensorflow/core/kernels/substr_op.cc          |   6 +-
 tensorflow/core/kernels/summary_image_op.cc   |  16 +-
 tensorflow/core/kernels/summary_op.cc         |  11 +-
 tensorflow/core/kernels/tile_functor_cpu.cc   |   2 +-
 tensorflow/core/kernels/tile_ops_cpu_impl.h   |   2 +-
 tensorflow/core/kernels/training_ops.cc       |  42 +-
 .../core/kernels/training_ops_gpu.cu.cc       |  24 +-
 tensorflow/core/kernels/training_ops_test.cc  |   5 +-
 tensorflow/core/kernels/transpose_op.cc       |   7 +-
 tensorflow/core/kernels/typed_queue.h         |   6 +-
 tensorflow/core/kernels/unpack_op.cc          |   7 +-
 tensorflow/core/kernels/word2vec_kernels.cc   |   6 +-
 tensorflow/core/kernels/xent_op.cc            |  14 +-
 tensorflow/core/kernels/xsmm_conv2d_test.cc   | 344 +++++-----
 291 files changed, 4072 insertions(+), 4227 deletions(-)

diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index 37976f7183..72155fd037 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice;
 template <typename Device, typename T>
 class AdjustContrastOp : public OpKernel {
  public:
-  explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) {
-  }
+  explicit AdjustContrastOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
diff --git a/tensorflow/core/kernels/adjust_contrast_op_test.cc b/tensorflow/core/kernels/adjust_contrast_op_test.cc
index 0fc03b5a23..7522b32040 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_test.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc
@@ -29,8 +29,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-class AdjustContrastOpTest : public OpsTestBase {
-};
+class AdjustContrastOpTest : public OpsTestBase {};
 
 TEST_F(AdjustContrastOpTest, Simple_1113) {
   TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2")
diff --git a/tensorflow/core/kernels/adjust_saturation_op.cc b/tensorflow/core/kernels/adjust_saturation_op.cc
index 4643d4e6ef..f0c6ae499d 100644
--- a/tensorflow/core/kernels/adjust_saturation_op.cc
+++ b/tensorflow/core/kernels/adjust_saturation_op.cc
@@ -192,8 +192,9 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
-          kCostPerChannel, [channel_count, &input_data, &output_data, scale_h](
-                               int64 start_channel, int64 end_channel) {
+          kCostPerChannel,
+          [channel_count, &input_data, &output_data, scale_h](
+              int64 start_channel, int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {
diff --git a/tensorflow/core/kernels/aggregate_ops_cpu.h b/tensorflow/core/kernels/aggregate_ops_cpu.h
index dfa3fe585e..aa1cead928 100644
--- a/tensorflow/core/kernels/aggregate_ops_cpu.h
+++ b/tensorflow/core/kernels/aggregate_ops_cpu.h
@@ -25,7 +25,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace tensorflow {
 
@@ -201,7 +201,7 @@ struct Add7Functor<SYCLDevice, T> {
                   typename TTypes<T>::ConstFlat in6,
                   typename TTypes<T>::ConstFlat in7) {
     Add7EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7);
+                                          in7);
   }
 };
 
@@ -214,7 +214,7 @@ struct Add8Functor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
     Add8EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7, in8);
+                                          in7, in8);
   }
 };
 
@@ -227,7 +227,7 @@ struct Add8pFunctor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
     Add8pEigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                          in7, in8);
+                                           in7, in8);
   }
 };
 
@@ -241,10 +241,10 @@ struct Add9Functor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
       typename TTypes<T>::ConstFlat in9) {
     Add9EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7, in8, in9);
+                                          in7, in8, in9);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc
index cc8f122cab..ce2fce92e4 100644
--- a/tensorflow/core/kernels/attention_ops.cc
+++ b/tensorflow/core/kernels/attention_ops.cc
@@ -52,8 +52,9 @@ class ExtractGlimpseOp : public OpKernel {
     const int64 batch_size = input_shape.dim_size(0);
 
     const Tensor& window_size = context->input(1);
-    OP_REQUIRES(context, (window_size.shape().dims() == 1) &&
-                             window_size.shape().dim_size(0) == 2,
+    OP_REQUIRES(context,
+                (window_size.shape().dims() == 1) &&
+                    window_size.shape().dim_size(0) == 2,
                 errors::InvalidArgument(
                     "input must be a vector of size 2 (height, width)",
                     window_size.shape().DebugString()));
diff --git a/tensorflow/core/kernels/avgpooling_op.h b/tensorflow/core/kernels/avgpooling_op.h
index dea2683184..f5e81dbc09 100644
--- a/tensorflow/core/kernels/avgpooling_op.h
+++ b/tensorflow/core/kernels/avgpooling_op.h
@@ -48,9 +48,8 @@ struct SpatialAvgPooling {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation
-// that works NHWC data formats.
-// Arguments:
+// Launch a custom GPU kernels from Yanqing for the avgpooling backward
+// operation that works NHWC data formats. Arguments:
 //   top_diff: backprop to the output of the pooling layer
 //   num: number of input batches
 //   height: input height
diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
index 2be330d142..6537b42f1e 100644
--- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
@@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
         hstart = max(hstart, 0);
         wstart = max(wstart, 0);
         int pool_size = (hend - hstart) * (wend - wstart);
-        gradient +=
-            top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size);
+        gradient += top_diff_slice[(ph * pooled_width + pw) * channels] /
+                    dtype(pool_size);
       }
     }
     bottom_diff[index] = gradient;
@@ -90,11 +90,11 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
                             const GPUDevice& d) {
   int x_size = num * height * width * channels;
   CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-  AvePoolBackwardNHWC<
-      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      config.virtual_thread_count, top_diff, num, height, width, channels,
-      pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
-      pad_t, pad_t, bottom_diff);
+  AvePoolBackwardNHWC<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          config.virtual_thread_count, top_diff, num, height, width, channels,
+          pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
+          pad_t, pad_t, bottom_diff);
 
   return d.ok();
 }
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index d0bbea9fe2..944564dfba 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -111,13 +111,14 @@ class Barrier : public ResourceBase {
       mutex_lock lock(mu_);
       if (closed_) {
         OP_REQUIRES_ASYNC(
-            ctx, !cancel_pending_enqueues_ &&
-                     (num_inserted == 0 || !incomplete_.empty()),
+            ctx,
+            !cancel_pending_enqueues_ &&
+                (num_inserted == 0 || !incomplete_.empty()),
             errors::Cancelled(
                 "Barrier ", name_, " is closed.  Pending enqueues cancelled: ",
-                cancel_pending_enqueues_, ".  Number of new insertions: ",
-                num_inserted, ".  Number of incomplete keys: ",
-                incomplete_.size(), "."),
+                cancel_pending_enqueues_,
+                ".  Number of new insertions: ", num_inserted,
+                ".  Number of incomplete keys: ", incomplete_.size(), "."),
             callback);
       }
 
@@ -128,9 +129,10 @@ class Barrier : public ResourceBase {
 
       for (int i = 0; i < num_inserted; ++i) {
         OP_REQUIRES_OK_ASYNC(
-            ctx, InsertOneLocked<T>(ctx, keys, values, element_shape,
-                                    component_index, i, &ready_tuples,
-                                    &new_elements),
+            ctx,
+            InsertOneLocked<T>(ctx, keys, values, element_shape,
+                               component_index, i, &ready_tuples,
+                               &new_elements),
             callback);
       }
 
@@ -317,8 +319,9 @@ class Barrier : public ResourceBase {
         return errors::Cancelled(
             "Barrier ", name_,
             " is closed, but attempted to insert a brand new key: ",
-            keys_vec(i), ".  Pending enqueues cancelled: ",
-            cancel_pending_enqueues_, ".  Insertion index: ", i,
+            keys_vec(i),
+            ".  Pending enqueues cancelled: ", cancel_pending_enqueues_,
+            ".  Insertion index: ", i,
             ".  Number of incomplete keys: ", incomplete_.size(), ".");
       }
     } else {
@@ -532,13 +535,14 @@ class InsertManyOp : public BarrierOpKernel {
     OP_REQUIRES_ASYNC(
         ctx, component_index_ < barrier->num_components(),
         errors::InvalidArgument("The component ID is out of range ",
-                                component_index_, " > num_components", " (= ",
-                                barrier->num_components(), ")"),
+                                component_index_, " > num_components",
+                                " (= ", barrier->num_components(), ")"),
         callback);
     OP_REQUIRES_OK_ASYNC(
-        ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING,
-                                  barrier->component_type(component_index_)},
-                                 {}),
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_STRING,
+                             barrier->component_type(component_index_)},
+                            {}),
         callback);
 
     const Tensor* keys;
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index 5b4e1a809f..c447db842d 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -13,22 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/batching_util/periodic_function.h"
+#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/macros.h"
 
-
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 93c3918319..43e716c542 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -41,7 +41,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace {
 
@@ -429,14 +429,13 @@ template <typename Scalar>
 struct LaunchBatchMatMul<SYCLDevice, Scalar> {
   static void Launch(OpKernelContext* context, const Tensor& in_x,
                      const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-
-  // Number of matrix multiplies i.e. size of the batch.
-  const int64 batch_size = in_x.dim_size(0);
-  ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y, out,
-                                        0, batch_size);
+    // Number of matrix multiplies i.e. size of the batch.
+    const int64 batch_size = in_x.dim_size(0);
+    ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y,
+                                          out, 0, batch_size);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename Scalar>
 class BatchMatMul : public OpKernel {
@@ -462,10 +461,10 @@ class BatchMatMul : public OpKernel {
     TensorShape out_shape;
     for (int i = 0; i < ndims - 2; ++i) {
       OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
-                  errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(",
-                                          i, ") must be the same: ",
-                                          in0.shape().DebugString(), " vs ",
-                                          in1.shape().DebugString()));
+                  errors::InvalidArgument(
+                      "In[0].dim(", i, ") and In[1].dim(", i,
+                      ") must be the same: ", in0.shape().DebugString(), " vs ",
+                      in1.shape().DebugString()));
       out_shape.AddDim(in0.dim_size(i));
     }
     auto n = (ndims == 2) ? 1 : out_shape.num_elements();
@@ -507,12 +506,12 @@ class BatchMatMul : public OpKernel {
   bool adj_y_;
 };
 
-#define REGISTER_BATCH_MATMUL_CPU(TYPE)                                              \
+#define REGISTER_BATCH_MATMUL_CPU(TYPE)                                 \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<CPUDevice, TYPE>)
 
-#define REGISTER_BATCH_MATMUL_GPU(TYPE)                                              \
+#define REGISTER_BATCH_MATMUL_GPU(TYPE)                                 \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<GPUDevice, TYPE>)
@@ -522,5 +521,5 @@ class BatchMatMul : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                               \
       Name("BatchMatMul").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
       BatchMatMul<SYCLDevice, TYPE>)
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc
index 8d155ca62b..7e1e2aa4ec 100644
--- a/tensorflow/core/kernels/batch_matmul_op_real.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_real.cc
@@ -35,5 +35,5 @@ TF_CALL_half(REGISTER_BATCH_MATMUL_GPU);
 #ifdef TENSORFLOW_USE_SYCL
 TF_CALL_float(REGISTER_BATCH_MATMUL_SYCL);
 TF_CALL_double(REGISTER_BATCH_MATMUL_SYCL);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_matmul_op_test.cc b/tensorflow/core/kernels/batch_matmul_op_test.cc
index 7923f34155..c3932cd7b9 100644
--- a/tensorflow/core/kernels/batch_matmul_op_test.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_test.cc
@@ -53,9 +53,10 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a,
 /* Uncomment to enable benchmarks for double & complex types: */
 // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64,
 // gpu);
-// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu);                    \
-// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);  \
-// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu);                    \
+// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \
+// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);
+// \
+// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \
 // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu);
 
 // Typical fully connected layers
diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc
index d3ed617f71..c34ea14bf6 100644
--- a/tensorflow/core/kernels/batch_norm_op.cc
+++ b/tensorflow/core/kernels/batch_norm_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class BatchNormOp : public OpKernel {
diff --git a/tensorflow/core/kernels/batch_norm_op_test.cc b/tensorflow/core/kernels/batch_norm_op_test.cc
index 5e3fcd2114..45ddc85329 100644
--- a/tensorflow/core/kernels/batch_norm_op_test.cc
+++ b/tensorflow/core/kernels/batch_norm_op_test.cc
@@ -54,7 +54,7 @@ TEST_F(BatchNormOpTest, Simple) {
   Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
   test::FillValues<float>(
       &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f,
-                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f });
+                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
 }
 
diff --git a/tensorflow/core/kernels/batchtospace_op.cc b/tensorflow/core/kernels/batchtospace_op.cc
index c1c0d6d329..b07c5fd718 100644
--- a/tensorflow/core/kernels/batchtospace_op.cc
+++ b/tensorflow/core/kernels/batchtospace_op.cc
@@ -56,9 +56,10 @@ static void BatchToSpaceOpCompute(OpKernelContext* context,
       errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
                               " instead of ", orig_input_tensor.dims()));
 
-  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
-                           block_dims == orig_crops.dim_size(0) &&
-                           2 == orig_crops.dim_size(1),
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
+                  block_dims == orig_crops.dim_size(0) &&
+                  2 == orig_crops.dim_size(1),
               errors::InvalidArgument("crops should have shape [", block_dims,
                                       ", 2] instead of ",
                                       orig_crops.shape().DebugString()));
diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc
index 7fc4b1762d..8e4f08e473 100644
--- a/tensorflow/core/kernels/bcast_ops.cc
+++ b/tensorflow/core/kernels/bcast_ops.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/util/bcast.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index 2ca194a77f..754b93b073 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -77,14 +77,14 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
   }
   CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
   if (data_format == FORMAT_NHWC) {
-    BiasNHWCKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size);
+    BiasNHWCKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size);
   } else {
-    BiasNCHWKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size,
-        image_size);
+    BiasNCHWKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size,
+            image_size);
   }
 }
 
@@ -206,10 +206,10 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
   // Check if we have enough shared memory.
   if (shared_memory_size <= max_shared_memory_size) {
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, shared_memory_size,
-               d.stream()>>>(total_count, output_backprop, bias_backprop,
-                             bias_size);
+      BiasGradNHWC_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, shared_memory_size,
+             d.stream()>>>(total_count, output_backprop, bias_backprop,
+                           bias_size);
     } else {
       // Round up the block count to multiple of bias_size.
       int group_size = (config.block_count + bias_size - 1) / bias_size;
@@ -217,23 +217,24 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
       if (config.thread_per_block < kWarpSize) {
         config.thread_per_block = kWarpSize;
       }
-      BiasGradNCHW_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          output_backprop, bias_backprop, batch, bias_size, image_size,
-          group_size);
+      BiasGradNCHW_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              output_backprop, bias_backprop, batch, bias_size, image_size,
+              group_size);
     }
   } else {
     // Note that even if we don't have enough shared memory to fit the entire
     // output block, it is possible to process one group of elements at a time.
     // But for now, we simply fall back to the naive implementation.
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size);
+      BiasGradNHWC_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size);
     } else {
-      BiasGradNCHW_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size, image_size);
+      BiasGradNCHW_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size,
+              image_size);
     }
   }
 }
diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/kernels/bounds_check.h
index e35f42ad41..c8c60c5524 100644
--- a/tensorflow/core/kernels/bounds_check.h
+++ b/tensorflow/core/kernels/bounds_check.h
@@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
   auto *to_x = reinterpret_cast<const volatile T *>(&x);
   return *to_x;
 }
-}  // namespace tensorflow::internal
+}  // namespace internal
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_UTIL_BOUNDS_CHECK_H_
diff --git a/tensorflow/core/kernels/candidate_sampler_ops.cc b/tensorflow/core/kernels/candidate_sampler_ops.cc
index e937c4f11b..654d99301a 100644
--- a/tensorflow/core/kernels/candidate_sampler_ops.cc
+++ b/tensorflow/core/kernels/candidate_sampler_ops.cc
@@ -126,13 +126,13 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU),
                         SimpleCandidateSamplerOp<LogUniformSampler>);
 
-REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<UnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<UnigramSampler>);
 
-REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
 
 class AllCandidateSamplerOp : public BaseCandidateSamplerOp {
  public:
@@ -197,8 +197,9 @@ class ComputeAccidentalHitsOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& in_true_candidates = context->input(0);
     const TensorShape& in_true_candidates_shape = in_true_candidates.shape();
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
-                             in_true_candidates_shape.dim_size(1) == num_true_,
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
+                    in_true_candidates_shape.dim_size(1) == num_true_,
                 errors::InvalidArgument(
                     "true_candidates must be a batch_size * num_true matrix"));
 
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index f16abb2b79..626db9131a 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -36,7 +36,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define CURRY_TYPES2(FN, arg0)   \
   FN(arg0, bool);                \
@@ -223,11 +223,11 @@ class SyclCastOp : public CastOpBase {
   }
 };
 
-#define REGISTER_CAST_SYCL(srctype, dsttype)                    \
-  REGISTER_KERNEL_BUILDER(Name("Cast")                          \
-                              .TypeConstraint<srctype>("SrcT")  \
-                              .TypeConstraint<dsttype>("DstT")  \
-                              .Device(DEVICE_SYCL),             \
+#define REGISTER_CAST_SYCL(srctype, dsttype)                   \
+  REGISTER_KERNEL_BUILDER(Name("Cast")                         \
+                              .TypeConstraint<srctype>("SrcT") \
+                              .TypeConstraint<dsttype>("DstT") \
+                              .Device(DEVICE_SYCL),            \
                           SyclCastOp)
 CURRY_TYPES2(REGISTER_CAST_SYCL, bool);
 CURRY_TYPES2(REGISTER_CAST_SYCL, int32);
@@ -237,7 +237,7 @@ CURRY_TYPES2(REGISTER_CAST_SYCL, double);
 
 #undef REGISTER_CAST_SYCL
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #undef CURRY_TYPES2
 
@@ -250,6 +250,5 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"),
     CpuCastOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index 8fedf2c271..fd4e75d26f 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> {
     p[0] = a.value;
     p[1] = 0;
 #else
-    static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!");
+    static_assert(::tensorflow::port::kLittleEndian,
+                  "Not a little endian system!");
     p[0] = 0;
     p[1] = a.value;
 #endif
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index 470e9e0804..3ae9f2ab4d 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -41,25 +41,25 @@ struct CastFunctor<Eigen::SyclDevice, O, I> {
     o.device(d) = i.template cast<O>();
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
-#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1)   \
-  FN(arg0, arg1, bool);                        \
-  FN(arg0, arg1, uint8);                       \
-  FN(arg0, arg1, int8);                        \
-  FN(arg0, arg1, uint16);                      \
-  FN(arg0, arg1, int16);                       \
-  FN(arg0, arg1, int32);                       \
-  FN(arg0, arg1, int64);                       \
-  FN(arg0, arg1, float);                       \
-  FN(arg0, arg1, double);                      \
-  FN(arg0, arg1, std::complex<float>);         \
+#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
+  FN(arg0, arg1, bool);                      \
+  FN(arg0, arg1, uint8);                     \
+  FN(arg0, arg1, int8);                      \
+  FN(arg0, arg1, uint16);                    \
+  FN(arg0, arg1, int16);                     \
+  FN(arg0, arg1, int32);                     \
+  FN(arg0, arg1, int64);                     \
+  FN(arg0, arg1, float);                     \
+  FN(arg0, arg1, double);                    \
+  FN(arg0, arg1, std::complex<float>);       \
   FN(arg0, arg1, std::complex<double>)
 
-#define CURRY_TYPES3(FN, arg0, arg1)           \
-  CURRY_TYPES3_NO_HALF(FN, arg0, arg1)         \
+#define CURRY_TYPES3(FN, arg0, arg1)   \
+  CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
   FN(arg0, arg1, Eigen::half);
 
 #define CAST_CASE(DEVICE, IN, OUT)                                         \
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
index a106f287c1..057e209a71 100644
--- a/tensorflow/core/kernels/cast_op_test.cc
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -107,10 +107,10 @@ static void BM_gpu_float_int64(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
   test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }
 BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -130,10 +130,10 @@ static void BM_gpu_bool_float(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
   test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }
 BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -180,7 +180,7 @@ static void BM_gpu_float_half(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 }
 BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -191,7 +191,7 @@ static void BM_gpu_half_float(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 }
 BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
 
diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc
index ba100b32e7..9cc2e67bbe 100644
--- a/tensorflow/core/kernels/colorspace_op.cc
+++ b/tensorflow/core/kernels/colorspace_op.cc
@@ -107,14 +107,14 @@ class HSVToRGBOp : public OpKernel {
   }
 };
 
-#define REGISTER_CPU(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \
-                              .TypeConstraint<T>("T"),        \
-                          RGBToHSVOp<CPUDevice, T>);          \
-  template class RGBToHSVOp<CPUDevice, T>;                    \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \
-                              .TypeConstraint<T>("T"),        \
-                          HSVToRGBOp<CPUDevice, T>);          \
+#define REGISTER_CPU(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      RGBToHSVOp<CPUDevice, T>);                                  \
+  template class RGBToHSVOp<CPUDevice, T>;                        \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      HSVToRGBOp<CPUDevice, T>);                                  \
   template class HSVToRGBOp<CPUDevice, T>;
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
@@ -123,40 +123,39 @@ TF_CALL_double(REGISTER_CPU);
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
-#define DECLARE_GPU(T)                                        \
-  template <>                                                 \
-  void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d, \
-      TTypes<T, 2>::ConstTensor input_data,                   \
-      TTypes<T, 1>::Tensor range,                             \
-      TTypes<T, 2>::Tensor output_data);                      \
-  extern template struct RGBToHSV<GPUDevice, T>;              \
-  template <>                                                 \
-  void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d, \
-      TTypes<T, 2>::ConstTensor input_data,                   \
-      TTypes<T, 2>::Tensor output_data);                      \
+#define DECLARE_GPU(T)                                               \
+  template <>                                                        \
+  void RGBToHSV<GPUDevice, T>::operator()(                           \
+      const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data,      \
+      TTypes<T, 1>::Tensor range, TTypes<T, 2>::Tensor output_data); \
+  extern template struct RGBToHSV<GPUDevice, T>;                     \
+  template <>                                                        \
+  void HSVToRGB<GPUDevice, T>::operator()(                           \
+      const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data,      \
+      TTypes<T, 2>::Tensor output_data);                             \
   extern template struct HSVToRGB<GPUDevice, T>;
 TF_CALL_float(DECLARE_GPU);
 TF_CALL_double(DECLARE_GPU);
 }  // namespace functor
-#define REGISTER_GPU(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \
-                              .TypeConstraint<T>("T"),        \
-                          RGBToHSVOp<GPUDevice, T>);          \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \
-                              .TypeConstraint<T>("T"),        \
-                          HSVToRGBOp<GPUDevice, T>);
+#define REGISTER_GPU(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      RGBToHSVOp<GPUDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      HSVToRGBOp<GPUDevice, T>);
 TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \
-                              .TypeConstraint<T>("T"),         \
-                          RGBToHSVOp<SYCLDevice, T>);          \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \
-                              .TypeConstraint<T>("T"),         \
-                          HSVToRGBOp<SYCLDevice, T>);
+#define REGISTER_SYCL(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
+      RGBToHSVOp<SYCLDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
+      HSVToRGBOp<SYCLDevice, T>);
 TF_CALL_float(REGISTER_SYCL);
 TF_CALL_double(REGISTER_SYCL);
 #endif
diff --git a/tensorflow/core/kernels/colorspace_op.h b/tensorflow/core/kernels/colorspace_op.h
index c5721ef6dd..90bfce1419 100644
--- a/tensorflow/core/kernels/colorspace_op.h
+++ b/tensorflow/core/kernels/colorspace_op.h
@@ -54,10 +54,9 @@ struct RGBToHSV {
     // TODO(wicke): all these assignments are only necessary because a combined
     // expression is larger than kernel parameter space. A custom kernel is
     // probably in order.
-    H.device(d) = (R == V).select(norm * (G - B),
-                                  (G == V).select(
-                                      norm * (B - R) + T(2) / T(6),
-                                      norm * (R - G) + T(4) / T(6)));
+    H.device(d) = (R == V).select(
+        norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6),
+                                        norm * (R - G) + T(4) / T(6)));
     H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
     H.device(d) = (H < T(0)).select(H + T(1), H);
   }
diff --git a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
index e19d0b14d5..61f9ba44c4 100644
--- a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/colorspace_op.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/colorspace_op.h"
 
 namespace tensorflow {
 
@@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice;
   template class functor::HSVToRGB<GPUDevice, T>;
 TF_CALL_float(INSTANTIATE_GPU);
 TF_CALL_double(INSTANTIATE_GPU);
-}
+}  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/colorspace_op_test.cc b/tensorflow/core/kernels/colorspace_op_test.cc
index 8c6fb732ab..bd82826770 100644
--- a/tensorflow/core/kernels/colorspace_op_test.cc
+++ b/tensorflow/core/kernels/colorspace_op_test.cc
@@ -224,34 +224,34 @@ class HSVToRGBOpTest : public OpsTestBase {
   }
 };
 
-#define TEST_COLORSPACE(test, dt)                               \
-  TEST_F(test, CheckBlack) {                                    \
-    MakeOp(dt);                                                 \
-    CheckBlack(dt);                                             \
-  }                                                             \
-  TEST_F(test, CheckGray) {                                     \
-    MakeOp(dt);                                                 \
-    CheckGray(dt);                                              \
-  }                                                             \
-  TEST_F(test, CheckWhite) {                                    \
-    MakeOp(dt);                                                 \
-    CheckWhite(dt);                                             \
-  }                                                             \
-  TEST_F(test, CheckRedMax) {                                   \
-    MakeOp(dt);                                                 \
-    CheckRedMax(dt);                                            \
-  }                                                             \
-  TEST_F(test, CheckGreenMax) {                                 \
-    MakeOp(dt);                                                 \
-    CheckGreenMax(dt);                                          \
-  }                                                             \
-  TEST_F(test, CheckBlueMax) {                                  \
-    MakeOp(dt);                                                 \
-    CheckBlueMax(dt);                                           \
-  }                                                             \
-  TEST_F(test, CheckNegativeDifference) {                       \
-    MakeOp(dt);                                                 \
-    CheckNegativeDifference(dt);                                \
+#define TEST_COLORSPACE(test, dt)         \
+  TEST_F(test, CheckBlack) {              \
+    MakeOp(dt);                           \
+    CheckBlack(dt);                       \
+  }                                       \
+  TEST_F(test, CheckGray) {               \
+    MakeOp(dt);                           \
+    CheckGray(dt);                        \
+  }                                       \
+  TEST_F(test, CheckWhite) {              \
+    MakeOp(dt);                           \
+    CheckWhite(dt);                       \
+  }                                       \
+  TEST_F(test, CheckRedMax) {             \
+    MakeOp(dt);                           \
+    CheckRedMax(dt);                      \
+  }                                       \
+  TEST_F(test, CheckGreenMax) {           \
+    MakeOp(dt);                           \
+    CheckGreenMax(dt);                    \
+  }                                       \
+  TEST_F(test, CheckBlueMax) {            \
+    MakeOp(dt);                           \
+    CheckBlueMax(dt);                     \
+  }                                       \
+  TEST_F(test, CheckNegativeDifference) { \
+    MakeOp(dt);                           \
+    CheckNegativeDifference(dt);          \
   }
 
 typedef RGBToHSVOpTest<float> rgb_to_hsv_float;
diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h
index 526f9420d7..16784c4770 100644
--- a/tensorflow/core/kernels/concat_lib.h
+++ b/tensorflow/core/kernels/concat_lib.h
@@ -41,10 +41,11 @@ namespace tensorflow {
 
 // Assumes all inputs are nonempty
 template <typename T>
-void ConcatCPU(DeviceBase* d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output);
+void ConcatCPU(
+    DeviceBase* d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output);
 #if GOOGLE_CUDA
 template <typename T>
 void ConcatGPU(
@@ -57,11 +58,12 @@ void ConcatGPU(
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-void ConcatSYCL(const Eigen::SyclDevice& d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output);
-#endif // TENSORFLOW_USE_SYCL
+void ConcatSYCL(
+    const Eigen::SyclDevice& d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output);
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_CONCAT_LIB_H_
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index 43731114c0..fc5a3e6288 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -48,10 +48,11 @@ struct MemCpyCopier<ResourceHandle> {
 }  // namespace
 
 template <typename T>
-void ConcatCPU(DeviceBase* d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output) {
+void ConcatCPU(
+    DeviceBase* d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output) {
   if (std::is_same<T, string>::value) {
     // use a large cost here to force strings to be handled by separate threads
     ConcatCPUImpl<T>(d, inputs, 100000, MemCpyCopier<T>(), output);
@@ -86,21 +87,22 @@ TF_CALL_variant(REGISTER)
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-void ConcatSYCL(const Eigen::SyclDevice& d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output) {
+void ConcatSYCL(
+    const Eigen::SyclDevice& d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output) {
   ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
-                   output);
+                    output);
 }
-#define REGISTER_SYCL(T)                                                      \
- template void ConcatSYCL<T>(                                                 \
-     const Eigen::SyclDevice&,                                                \
-     const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
-     typename TTypes<T, 2>::Matrix* output);
+#define REGISTER_SYCL(T)                                                       \
+  template void ConcatSYCL<T>(                                                 \
+      const Eigen::SyclDevice&,                                                \
+      const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
+      typename TTypes<T, 2>::Matrix* output);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL)
 
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h
index 6a933efde4..720b506537 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.h
+++ b/tensorflow/core/kernels/concat_lib_cpu.h
@@ -15,9 +15,9 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/concat_lib.h"
 #include <vector>
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -73,7 +73,7 @@ void ConcatCPUImpl(
 
   // Sharded mode.
   auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs](
-      int64 start, int64 end) {
+                  int64 start, int64 end) {
     int64 skipped_rows = start / row_size;
     T* out = output->data() + skipped_rows * row_size;
     T* out_start = output->data() + start;
@@ -160,5 +160,5 @@ void ConcatSYCLImpl(
     }
   }
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index ae1b5da32e..7011550f7e 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -37,7 +37,7 @@ typedef Eigen::GpuDevice GPUDevice;
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
 
@@ -71,8 +71,9 @@ class ConcatBaseOp : public OpKernel {
     const TensorShape& input_shape = values[0].shape();
 
     int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
-    OP_REQUIRES(c, (0 <= axis && axis < input_dims) ||
-                       (allow_legacy_scalars() && concat_dim == 0),
+    OP_REQUIRES(c,
+                (0 <= axis && axis < input_dims) ||
+                    (allow_legacy_scalars() && concat_dim == 0),
                 errors::InvalidArgument(
                     "ConcatOp : Expected concatenating dimensions in the range "
                     "[",
@@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel {
           c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
-              input_shape.DebugString(), " vs. shape[", i, "] = ",
-              in.shape().DebugString()));
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", in.shape().DebugString()));
       for (int j = 0; j < input_dims; ++j) {
         if (j == axis) {
           continue;
@@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel {
             c, in.dim_size(j) == input_shape.dim_size(j),
             errors::InvalidArgument(
                 "ConcatOp : Dimensions of inputs should match: shape[0] = ",
-                input_shape.DebugString(), " vs. shape[", i, "] = ",
-                in.shape().DebugString()));
+                input_shape.DebugString(), " vs. shape[", i,
+                "] = ", in.shape().DebugString()));
       }
       if (in.NumElements() > 0) {
         int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
@@ -142,7 +143,7 @@ class ConcatBaseOp : public OpKernel {
         ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
         return;
       }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
@@ -252,7 +253,7 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
                         ConcatV2Op<CPUDevice, int32>);
 
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class ConcatOffsetOp : public OpKernel {
  public:
@@ -347,5 +348,5 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
                             .HostMemory("shape")
                             .HostMemory("offset"),
                         ConcatOffsetOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc
index c5bded9daf..e3ba8ae9f6 100644
--- a/tensorflow/core/kernels/concat_op_test.cc
+++ b/tensorflow/core/kernels/concat_op_test.cc
@@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
 
 typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
-                         Eigen::Unaligned> EigenMap;
+                         Eigen::Unaligned>
+    EigenMap;
 static void MemcpyManyAlternative1(int iters, int dim2) {
   testing::StopTiming();
 
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index 794ac6fa6d..c7c7c98369 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -160,7 +160,7 @@ class ConditionalAccumulatorBase : public ResourceBase {
  * Modifications to convenience macros defined in core/framework/op_kernel.h.
  * The below macros return a boolean if the test fails, so that the calling
  * function can get an indication that a failure has occurred.
-*/
+ */
 #define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS)          \
   do {                                                 \
     if (!TF_PREDICT_TRUE(EXP)) {                       \
diff --git a/tensorflow/core/kernels/conditional_accumulator_op.cc b/tensorflow/core/kernels/conditional_accumulator_op.cc
index fa37916eab..e13bf8a4c6 100644
--- a/tensorflow/core/kernels/conditional_accumulator_op.cc
+++ b/tensorflow/core/kernels/conditional_accumulator_op.cc
@@ -99,9 +99,10 @@ class AccumulatorTakeGradientOp
                       ConditionalAccumulatorBase* accumulator,
                       DoneCallback callback) override {
     // Check signature
-    OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32},
-                                                  {accumulator->dtype()}),
-                         callback);
+    OP_REQUIRES_OK_ASYNC(
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}),
+        callback);
   }
 
  private:
@@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp
 REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU),
                         AccumulatorTakeGradientOp);
 
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 59f9f69315..920cd87858 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -146,7 +146,6 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
 
-
 template <typename Device, typename T, typename Index>
 class FillOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 8fe82d118a..7d5d54e5be 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -113,47 +113,47 @@ REGISTER_GPU_HOST_REF_KERNEL(string);
 #undef REGISTER_GPU_HOST_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_SWITCH(type)                       \
-  REGISTER_KERNEL_BUILDER(Name("Switch")                 \
-                              .Device(DEVICE_SYCL)       \
-                              .HostMemory("pred")        \
-                              .TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_SWITCH(type)                        \
+  REGISTER_KERNEL_BUILDER(Name("Switch")                  \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("pred")         \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH);
 
-#define REGISTER_SYCL_REF_SWITCH(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
-                              .Device(DEVICE_SYCL)         \
-                              .HostMemory("pred")          \
-                              .TypeConstraint<type>("T"),  \
+#define REGISTER_SYCL_REF_SWITCH(type)                    \
+  REGISTER_KERNEL_BUILDER(Name("RefSwitch")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("pred")         \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH);
 
 #undef REGISTER_SYCL_SWITCH
 #undef REGISTER_SYCL_REF_SWITCH
 
-#define REGISTER_SYCL_HOST_KERNEL(type)                  \
-  REGISTER_KERNEL_BUILDER(Name("Switch")                 \
-                              .Device(DEVICE_SYCL)       \
-                              .HostMemory("data")        \
-                              .HostMemory("pred")        \
-                              .HostMemory("output_false")\
-                              .HostMemory("output_true") \
-                              .TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_HOST_KERNEL(type)                   \
+  REGISTER_KERNEL_BUILDER(Name("Switch")                  \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("pred")         \
+                              .HostMemory("output_false") \
+                              .HostMemory("output_true")  \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 
 REGISTER_SYCL_HOST_KERNEL(bool);
 REGISTER_SYCL_HOST_KERNEL(string);
 REGISTER_SYCL_HOST_KERNEL(int32);
 
-#define REGISTER_SYCL_HOST_REF_KERNEL(type)                \
-  REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
-                              .Device(DEVICE_SYCL)         \
-                              .HostMemory("data")          \
-                              .HostMemory("pred")          \
-                              .HostMemory("output_false")  \
-                              .HostMemory("output_true")   \
-                              .TypeConstraint<type>("T"),  \
+#define REGISTER_SYCL_HOST_REF_KERNEL(type)               \
+  REGISTER_KERNEL_BUILDER(Name("RefSwitch")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("pred")         \
+                              .HostMemory("output_false") \
+                              .HostMemory("output_true")  \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 
 REGISTER_SYCL_HOST_REF_KERNEL(int32);
@@ -162,7 +162,7 @@ REGISTER_SYCL_HOST_REF_KERNEL(string);
 
 #undef REGISTER_SYCL_HOST_KERNEL
 #undef REGISTER_SYCL_HOST_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class RefSelectOp : public OpKernel {
  public:
@@ -282,7 +282,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
 
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -331,7 +331,7 @@ REGISTER_SYCL_HOST_KERNEL(string);
 REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 void EnterOp::Compute(OpKernelContext* context) {
   if (IsRefType(context->input_dtype(0))) {
@@ -360,14 +360,14 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)  \
-  REGISTER_KERNEL_BUILDER(          \
+#define REGISTER_SYCL_KERNEL(type) \
+  REGISTER_KERNEL_BUILDER(         \
       Name("Enter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 
-#define REGISTER_SYCL_REF_KERNEL(type)  \
-  REGISTER_KERNEL_BUILDER(              \
+#define REGISTER_SYCL_REF_KERNEL(type) \
+  REGISTER_KERNEL_BUILDER(             \
       Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
 REGISTER_SYCL_REF_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
@@ -398,7 +398,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_SYCL_HOST_KERNEL
 #undef REGISTER_SYCL_HOST_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -455,10 +455,10 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);   \
-  REGISTER_KERNEL_BUILDER(                                                   \
+#define REGISTER_SYCL_KERNEL(type)                                         \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp); \
+  REGISTER_KERNEL_BUILDER(                                                 \
       Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@@ -483,7 +483,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(string);
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -556,12 +556,12 @@ REGISTER_GPU_HOST_KERNEL(string);
 #undef REGISTER_GPU_HOST_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),   \
-      NextIterationOp);                                                      \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_KERNEL(type)                                            \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),    \
+      NextIterationOp);                                                       \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
       NextIterationOp)
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@@ -585,7 +585,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(string);
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // A LoopCond op has one input and one output. The input is a boolean
 // scalar representing the taken branches of the "pivot" Switch that
@@ -619,7 +619,7 @@ REGISTER_KERNEL_BUILDER(Name("LoopCond")
                             .HostMemory("input")
                             .HostMemory("output"),
                         LoopCondOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // ControlTrigger kernels
 REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU),
@@ -631,7 +631,7 @@ REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU),
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_SYCL),
                         ControlTriggerOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // When called, abort op will abort the current process. This can be used to
 // abort remote PSs when needed.
diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc
index affa0e8ca6..a2f7bd4069 100644
--- a/tensorflow/core/kernels/control_flow_ops_test.cc
+++ b/tensorflow/core/kernels/control_flow_ops_test.cc
@@ -91,6 +91,7 @@ class KilledBySignal {
  public:
   explicit KilledBySignal(int signum) : signum_(signum) {}
   bool operator()(int exit_status) const { return exit_status == signum_; }
+
  private:
   const int signum_;
 };
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 985586d626..dbddaf3dc6 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -688,7 +688,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
-      );
+  );
 
   int device_id = stream->parent()->device_ordinal();
   DataType dtype = input.dtype();
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index 291ebf2298..1b40ad81f4 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -679,8 +679,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
 
     const int dims = resized_shape.dims();
     OP_REQUIRES(
-        context, TensorShapeUtils::IsMatrix(paddings.shape()) &&
-                     paddings.dim_size(1) == 2,
+        context,
+        TensorShapeUtils::IsMatrix(paddings.shape()) &&
+            paddings.dim_size(1) == 2,
         errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
                                 paddings.shape().DebugString()));
     const int fixed_dims =
@@ -715,20 +716,22 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
       const int32 after =
           paddings_matrix(d, 1);  // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("paddings must be non-negative: ",
-                                          before, " ", after));
+                  errors::InvalidArgument(
+                      "paddings must be non-negative: ", before, " ", after));
       if (offset_ == 0) {  // SYMMETRIC mode.
         OP_REQUIRES(
-            context, before <= resized_shape.dim_size(d) &&
-                         after <= resized_shape.dim_size(d),
+            context,
+            before <= resized_shape.dim_size(d) &&
+                after <= resized_shape.dim_size(d),
             errors::InvalidArgument("paddings must be no greater "
                                     "than the dimension size: ",
                                     before, ", ", after, " greater than ",
                                     resized_shape.dim_size(d)));
       } else if (offset_ == 1) {  // REFLECT mode.
         OP_REQUIRES(
-            context, before < resized_shape.dim_size(d) &&
-                         after < resized_shape.dim_size(d),
+            context,
+            before < resized_shape.dim_size(d) &&
+                after < resized_shape.dim_size(d),
             errors::InvalidArgument("paddings must be less than"
                                     " the dimension size: ",
                                     before, ", ", after, " not less than ",
@@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // We only check the first three dims, since the depth is accessed as an
     // int64 below.
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     // The last dimension for input is in_depth. It must be the same as the
     // filter's in_depth.
     const int64 in_depth = padded_shape.dim_size(3);
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -786,9 +790,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // The second dimension for input is rows/height.
     // The first dimension for filter is rows/height.
     const int64 padded_rows_raw = padded_shape.dim_size(1);
-    OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int padded_rows = static_cast<int>(padded_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
     const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
@@ -796,9 +801,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // The third dimension for input is columns/width.
     // The second dimension for filter is columns/width.
     const int64 padded_cols_raw = padded_shape.dim_size(2);
-    OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int padded_cols = static_cast<int>(padded_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
     const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
@@ -864,24 +870,26 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
 };
 
-#define REGISTER_FUSED(T)                                                    \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("FusedResizeAndPadConv2D")                                        \
-          .Device(DEVICE_CPU)                                                \
-          .TypeConstraint<T>("T"),                                           \
-      FusedResizeConv2DUsingGemmOp<                                          \
-          T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                          BILINEAR>,                         \
+#define REGISTER_FUSED(T)                                                 \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedResizeAndPadConv2D")                                     \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<T>("T"),                                        \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       BILINEAR>,                         \
           true>);
 
 TF_CALL_float(REGISTER_FUSED);
 
-#define REGISTER_PAD_ONLY_FUSED(T)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),      \
-      FusedResizeConv2DUsingGemmOp<                                          \
-          T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                          NEAREST>,                          \
+#define REGISTER_PAD_ONLY_FUSED(T)                                        \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       NEAREST>,                          \
           false>);
 
 TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 57e196c67c..f0085be3a5 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -27,7 +27,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-
 // Get the Cudnn workspace limit from the environment variable, which is in MB.
 // Return the workspace memory limit in bytes. If no value is set, return the
 // default value.
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index af6013c974..e58f5f61f3 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -25,9 +25,9 @@ limitations under the License.
 #include "cuda/include/cuda.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "tensorflow/core/lib/math/math_util.h"
 
 namespace tensorflow {
 
@@ -252,11 +252,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   int x = threadIdx.x;
 
   Dimension<3> output_dims = {
-      input_dims[0], input_dims[2], input_dims[1],
+      input_dims[0],
+      input_dims[2],
+      input_dims[1],
   };
 
   Dimension<3> input_dims_in_tiles = {
-      input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI,
+      input_dims[0],
+      (input_dims[1] + TileSizeI - 1) / TileSizeI,
       (input_dims[2] + TileSizeJ - 1) / TileSizeJ,
   };
 
@@ -264,7 +267,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
       FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);
 
   Index<3> input_tile_origin = {
-      input_tile_index[0], input_tile_index[1] * TileSizeI,
+      input_tile_index[0],
+      input_tile_index[1] * TileSizeI,
       input_tile_index[2] * TileSizeJ,
   };
 
@@ -322,11 +326,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   __syncthreads();
 
   Index<3> output_tile_index = {
-      input_tile_index[0], input_tile_index[2], input_tile_index[1],
+      input_tile_index[0],
+      input_tile_index[2],
+      input_tile_index[1],
   };
 
   Index<3> output_tile_origin = {
-      output_tile_index[0], output_tile_index[1] * TileSizeJ,
+      output_tile_index[0],
+      output_tile_index[1] * TileSizeJ,
       output_tile_index[2] * TileSizeI,
   };
 
@@ -799,7 +806,7 @@ struct TransposeElemType<16> {
 // A helper function to make RunSwapDimension1And2InTensor3 concise. This
 // helper function looks at the data type and input matrix sizes and decides
 // the thread numbers and tile sizes to use.
-template <typename T, bool conjugate = false >
+template <typename T, bool conjugate = false>
 void SwapDimension1And2InTensor3WithNarrowMatrices(
     const GPUDevice& d, const T* input, const Dimension<3>& input_dims,
     T* output, const int kMinDimensionToUseTiles) {
@@ -902,19 +909,21 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
     constexpr int kNumThreads = 256;
 
     Dimension<3> input_dims_in_tiles = {
-        input_dims[0], MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
+        input_dims[0],
+        MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
         MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
     };
 
     int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
                             input_dims_in_tiles[2];
-    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize, conjugate>
+    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
+                                          conjugate>
         <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
                                                             output);
 
   } else if (narrow_matrix) {
-    SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(d, input, input_dims, output,
-                                                  kMinDimensionToUseTiles);
+    SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
+        d, input, input_dims, output, kMinDimensionToUseTiles);
   } else {
     int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
     CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc
index 20da77c36f..af0a9fa82e 100644
--- a/tensorflow/core/kernels/conv_ops_using_gemm.cc
+++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc
@@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
                                         filter.shape().DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     // The last dimension for input is in_depth. It must be the same as the
     // filter's in_depth.
     const int64 in_depth = GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -487,18 +488,20 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
     // The second dimension for input is rows/height.
     // The first dimension for filter is rows/height.
     const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
-    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int input_rows = static_cast<int>(input_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
 
     // The third dimension for input is columns/width.
     // The second dimension for filter is columns/width.
     const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
-    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int input_cols = static_cast<int>(input_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
 
diff --git a/tensorflow/core/kernels/cross_op_gpu.cu.cc b/tensorflow/core/kernels/cross_op_gpu.cu.cc
index 7ea0b3be0c..4a37f6cfbb 100644
--- a/tensorflow/core/kernels/cross_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/cross_op_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/cross_op.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/cross_op.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc
index 73ee310604..96bdb6a241 100644
--- a/tensorflow/core/kernels/ctc_decoder_ops.cc
+++ b/tensorflow/core/kernels/ctc_decoder_ops.cc
@@ -19,13 +19,13 @@ limitations under the License.
 
 #include <limits>
 
-#include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
@@ -80,16 +80,17 @@ class CTCDecodeHelper {
 
     if (!(batch_size == (*seq_len)->dim_size(0))) {
       return errors::FailedPrecondition(
-          "len(sequence_length) != batch_size.  ", "len(sequence_length):  ",
-          (*seq_len)->dim_size(0), " batch_size: ", batch_size);
+          "len(sequence_length) != batch_size.  ",
+          "len(sequence_length):  ", (*seq_len)->dim_size(0),
+          " batch_size: ", batch_size);
     }
 
     auto seq_len_t = (*seq_len)->vec<int32>();
 
     for (int b = 0; b < batch_size; ++b) {
       if (!(seq_len_t(b) <= max_time)) {
-        return errors::FailedPrecondition("sequence_length(", b, ") <= ",
-                                          max_time);
+        return errors::FailedPrecondition("sequence_length(", b,
+                                          ") <= ", max_time);
       }
     }
 
diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc
index fb03adb7a5..b38d838bf1 100644
--- a/tensorflow/core/kernels/ctc_loss_op.cc
+++ b/tensorflow/core/kernels/ctc_loss_op.cc
@@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel {
       const int64 batch_indices = g.group()[0];
       OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size),
                   errors::InvalidArgument("labels batch index must be between ",
-                                          0, " and ", batch_size, " but saw: ",
-                                          batch_indices));
+                                          0, " and ", batch_size,
+                                          " but saw: ", batch_indices));
 
       auto values = g.values<int32>();
       std::vector<int>* b_values = &labels_t[batch_indices];
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
index 5fd38d9dc2..1466f24202 100644
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -45,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("Abs")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::abs<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc
index 12cc6c8bdd..4919122607 100644
--- a/tensorflow/core/kernels/cwise_op_acos.cc
+++ b/tensorflow/core/kernels/cwise_op_acos.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acosh.cc b/tensorflow/core/kernels/cwise_op_acosh.cc
index 39c8814073..c2b355ab7f 100644
--- a/tensorflow/core/kernels/cwise_op_acosh.cc
+++ b/tensorflow/core/kernels/cwise_op_acosh.cc
@@ -17,12 +17,12 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc
index 608a6dce3d..bf32c8a54b 100644
--- a/tensorflow/core/kernels/cwise_op_add_1.cc
+++ b/tensorflow/core/kernels/cwise_op_add_1.cc
@@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
                         BinaryOp<CPUDevice, functor::add<int32>>);
 #endif
 
-
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_KERNEL(type)                          \
   REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \
@@ -66,5 +65,5 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::add<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_add_2.cc b/tensorflow/core/kernels/cwise_op_add_2.cc
index ac21ca06c9..e8acbac285 100644
--- a/tensorflow/core/kernels/cwise_op_add_2.cc
+++ b/tensorflow/core/kernels/cwise_op_add_2.cc
@@ -22,8 +22,8 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64,
-          uint8, complex128, string);
+REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8,
+          complex128, string);
 // Notice: String is excluded to allow marking AddV2 is_commutative and
 // is_aggregate.
 REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8,
diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc
index c28e27d95a..fe8dfea117 100644
--- a/tensorflow/core/kernels/cwise_op_asin.cc
+++ b/tensorflow/core/kernels/cwise_op_asin.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_asinh.cc b/tensorflow/core/kernels/cwise_op_asinh.cc
index 0aec6aac34..7cf0405f52 100644
--- a/tensorflow/core/kernels/cwise_op_asinh.cc
+++ b/tensorflow/core/kernels/cwise_op_asinh.cc
@@ -1,10 +1,10 @@
-  /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-  http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc
index 7d73de4810..09f0448874 100644
--- a/tensorflow/core/kernels/cwise_op_atan.cc
+++ b/tensorflow/core/kernels/cwise_op_atan.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_atanh.cc b/tensorflow/core/kernels/cwise_op_atanh.cc
index 7b688db4c5..6170683fa6 100644
--- a/tensorflow/core/kernels/cwise_op_atanh.cc
+++ b/tensorflow/core/kernels/cwise_op_atanh.cc
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc
index 0111e9d5fd..816eadc80e 100644
--- a/tensorflow/core/kernels/cwise_op_ceil.cc
+++ b/tensorflow/core/kernels/cwise_op_ceil.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc
index d4b3b0e393..71ad0ff0dc 100644
--- a/tensorflow/core/kernels/cwise_op_cos.cc
+++ b/tensorflow/core/kernels/cwise_op_cos.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_cosh.cc b/tensorflow/core/kernels/cwise_op_cosh.cc
index bca99a4f89..31b4bb3cad 100644
--- a/tensorflow/core/kernels/cwise_op_cosh.cc
+++ b/tensorflow/core/kernels/cwise_op_cosh.cc
@@ -16,20 +16,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
+          complex128);
 
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Cosh")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
+#define REGISTER_SYCL_KERNEL(TYPE)                                \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("Cosh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Cosh", functor::cosh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index d44c1bf473..c71c756e44 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -54,5 +54,5 @@ REGISTER_KERNEL_BUILDER(Name("Div")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_div<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc
index 66d7b7d22e..8f4ac98016 100644
--- a/tensorflow/core/kernels/cwise_op_exp.cc
+++ b/tensorflow/core/kernels/cwise_op_exp.cc
@@ -26,5 +26,5 @@ REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double,
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc
index 4f72308006..ce03ad5de6 100644
--- a/tensorflow/core/kernels/cwise_op_expm1.cc
+++ b/tensorflow/core/kernels/cwise_op_expm1.cc
@@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc
index 5a142b9ce9..d554d41c41 100644
--- a/tensorflow/core/kernels/cwise_op_floor.cc
+++ b/tensorflow/core/kernels/cwise_op_floor.cc
@@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc
index fa81ef0872..fecbf85989 100644
--- a/tensorflow/core/kernels/cwise_op_floor_div.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_div.cc
@@ -49,5 +49,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc
index 55f8a30461..29340b8850 100644
--- a/tensorflow/core/kernels/cwise_op_floor_mod.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc
@@ -40,5 +40,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
index e7dff5d0ac..77723b3169 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-  DEFINE_UNARY1(conj, complex64);
-  DEFINE_UNARY1(conj, complex128);
+DEFINE_UNARY1(conj, complex64);
+DEFINE_UNARY1(conj, complex128);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
index 3675398126..26748ef0e7 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
@@ -20,7 +20,7 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 DEFINE_BINARY10(equal_to, float, Eigen::half, double, uint8, int8, int16, int64,
-               complex64, complex128, bool);
+                complex64, complex128, bool);
 DEFINE_APPROXIMATE_EQUAL2(float, double);
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
index a54dbdfc24..627ecc8c80 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
@@ -15,8 +15,10 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 
-#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#define EIGEN_USE_GPU
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 
 namespace tensorflow {
 namespace functor {
@@ -38,19 +40,17 @@ struct SelectScalarFunctor<GPUDevice, T> {
                   typename TTypes<bool>::ConstScalar cond,
                   typename TTypes<T>::ConstFlat then_flat,
                   typename TTypes<T>::ConstFlat else_flat) {
-
 #if !defined(EIGEN_HAS_INDEX_LIST)
-  Eigen::array<int, 1> rank1{1};
+    Eigen::array<int, 1> rank1{1};
 #else
-  Eigen::IndexList<Eigen::type2index<1>> rank1;
+    Eigen::IndexList<Eigen::type2index<1> > rank1;
 #endif
-  const int size  = then_flat.dimension(0);
-  Eigen::array<int, 1> broadcast_dims{size};
-
-  To32Bit(out).device(d) = cond.reshape(rank1)
-                               .broadcast(broadcast_dims)
-                               .select(then_flat, else_flat);
+    const int size = then_flat.dimension(0);
+    Eigen::array<int, 1> broadcast_dims{size};
 
+    To32Bit(out).device(d) = cond.reshape(rank1)
+                                 .broadcast(broadcast_dims)
+                                 .select(then_flat, else_flat);
   }
 };
 
@@ -89,8 +89,8 @@ struct BatchSelectFunctor<GPUDevice, T> {
   }
 };
 
-#define SELECT_FUNCTOR(T)                      \
-  template struct SelectFunctor<GPUDevice, T>; \
+#define SELECT_FUNCTOR(T)                            \
+  template struct SelectFunctor<GPUDevice, T>;       \
   template struct SelectScalarFunctor<GPUDevice, T>; \
   template struct BatchSelectFunctor<GPUDevice, T>;
 
diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc
index ba89899fb3..a4ea408836 100644
--- a/tensorflow/core/kernels/cwise_op_greater.cc
+++ b/tensorflow/core/kernels/cwise_op_greater.cc
@@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::greater<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc
index 8f0c483aec..3f34d6269e 100644
--- a/tensorflow/core/kernels/cwise_op_greater_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc
@@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double);
+REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float,
+          double);
 
 REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
                             .Device(DEVICE_SYCL)
@@ -44,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::greater_equal<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc
index df2c02e42e..f5cafcc780 100644
--- a/tensorflow/core/kernels/cwise_op_invert.cc
+++ b/tensorflow/core/kernels/cwise_op_invert.cc
@@ -21,7 +21,7 @@ REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
-         uint8, uint16);
+          uint8, uint16);
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc
index 53ec1c1c63..ae1e590d24 100644
--- a/tensorflow/core/kernels/cwise_op_isfinite.cc
+++ b/tensorflow/core/kernels/cwise_op_isfinite.cc
@@ -26,5 +26,5 @@ REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc
index 4b34744304..f22ca21e1c 100644
--- a/tensorflow/core/kernels/cwise_op_isinf.cc
+++ b/tensorflow/core/kernels/cwise_op_isinf.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc
index ad2dd3f722..aa180c247e 100644
--- a/tensorflow/core/kernels/cwise_op_isnan.cc
+++ b/tensorflow/core/kernels/cwise_op_isnan.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
index 136c3666df..00cdecdbd1 100644
--- a/tensorflow/core/kernels/cwise_op_less.cc
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Less")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::less<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
index 97a2508d12..11806c5fc7 100644
--- a/tensorflow/core/kernels/cwise_op_less_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -44,5 +44,5 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::less_equal<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
index 7fdfdff0e3..98936e0f96 100644
--- a/tensorflow/core/kernels/cwise_op_log.cc
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc
index 25ad7b24bb..162ca9e07c 100644
--- a/tensorflow/core/kernels/cwise_op_log1p.cc
+++ b/tensorflow/core/kernels/cwise_op_log1p.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
index 87d54e380b..8c54f22f10 100644
--- a/tensorflow/core/kernels/cwise_op_maximum.cc
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::maximum<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
index 442171193b..dff83df828 100644
--- a/tensorflow/core/kernels/cwise_op_minimum.cc
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -43,6 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Minimum")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::minimum<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index 023eb07ca3..0e8d2e3735 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
-          uint8, int32);
+REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
+          int32);
 #if defined(__ANDROID_TYPES_SLIM__)
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the
@@ -28,7 +28,7 @@ REGISTER(BinaryOp, CPU, "Mul", functor::mul, int32);
 
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double,
-           uint8);
+          uint8);
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
@@ -50,5 +50,5 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::mul<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul_2.cc b/tensorflow/core/kernels/cwise_op_mul_2.cc
index 7be5857cc0..6aa8f88364 100644
--- a/tensorflow/core/kernels/cwise_op_mul_2.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_2.cc
@@ -22,11 +22,11 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "Mul", functor::mul,
-          int8, uint16, int16, int64, complex64, complex128);
+REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64,
+          complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64,
-           complex64, complex128);
+          complex64, complex128);
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc
index 536891b548..a136769b91 100644
--- a/tensorflow/core/kernels/cwise_op_neg.cc
+++ b/tensorflow/core/kernels/cwise_op_neg.cc
@@ -27,7 +27,7 @@ REGISTER_KERNEL_BUILDER(Name("Neg")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::neg<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64,
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
index 7bd81ee127..02cd298745 100644
--- a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
-           double, uint8, int8, int16);
+          double, uint8, int8, int16);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
           double, uint8);
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
index 7d4ecec59f..05bdea6636 100644
--- a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
@@ -30,5 +30,5 @@ REGISTER6(BinaryOp, GPU, "NotEqual", functor::not_equal_to, int8, int16, int64,
 
 #endif  // GOOGLE_CUDA
 
-#endif   // !defined(__ANDROID_TYPES_SLIM__)
+#endif  // !defined(__ANDROID_TYPES_SLIM__)
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_reciprocal.cc b/tensorflow/core/kernels/cwise_op_reciprocal.cc
index 8c0e21f9cf..aee25747b8 100644
--- a/tensorflow/core/kernels/cwise_op_reciprocal.cc
+++ b/tensorflow/core/kernels/cwise_op_reciprocal.cc
@@ -38,7 +38,7 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -48,5 +48,5 @@ REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index 3dd9de8d89..e259daaba4 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -30,7 +30,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SelectOp : public OpKernel {
@@ -185,7 +185,7 @@ REGISTER_SELECT_SYCL(double);
 REGISTER_SELECT_SYCL(int32);
 REGISTER_SELECT_SYCL(int64);
 #undef REGISTER_SELECT_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 
@@ -201,13 +201,11 @@ struct SelectFunctorBase {
 };
 
 template <typename T>
-struct SelectFunctor<CPUDevice, T>
-        : SelectFunctorBase<CPUDevice, T> {};
+struct SelectFunctor<CPUDevice, T> : SelectFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-struct SelectFunctor<SYCLDevice, T>
-        : SelectFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+struct SelectFunctor<SYCLDevice, T> : SelectFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 struct SelectScalarFunctorBase {
@@ -222,12 +220,12 @@ struct SelectScalarFunctorBase {
 // CPU Specializations of Select functors with scalar
 template <typename T>
 struct SelectScalarFunctor<CPUDevice, T>
-        : SelectScalarFunctorBase<CPUDevice, T> {};
+    : SelectScalarFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SelectScalarFunctor<SYCLDevice, T>
-        : SelectScalarFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+    : SelectScalarFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 struct BatchSelectFunctorBase {
@@ -240,8 +238,8 @@ struct BatchSelectFunctorBase {
     const Eigen::DenseIndex all_but_batch = then_flat_outer_dims.dimension(1);
 
 #if !defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{ 1, all_but_batch }};
-    Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{ batch, 1 }};
+    Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{1, all_but_batch}};
+    Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{batch, 1}};
 #else
     Eigen::IndexList<Eigen::type2index<1>, Eigen::DenseIndex> broadcast_dims;
     broadcast_dims.set(1, all_but_batch);
@@ -257,13 +255,13 @@ struct BatchSelectFunctorBase {
 };
 
 template <typename T>
-struct BatchSelectFunctor<CPUDevice, T>
-        : BatchSelectFunctorBase<CPUDevice, T> {};
+struct BatchSelectFunctor<CPUDevice, T> : BatchSelectFunctorBase<CPUDevice, T> {
+};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct BatchSelectFunctor<SYCLDevice, T>
-        : BatchSelectFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+    : BatchSelectFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc
index a76a088ac8..c132fdb63f 100644
--- a/tensorflow/core/kernels/cwise_op_sigmoid.cc
+++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc
@@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, Eigen::half,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(UnaryOp, SYCL, "Sigmoid", functor::sigmoid, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -35,6 +35,6 @@ REGISTER3(SimpleBinaryOp, GPU, "SigmoidGrad", functor::sigmoid_grad, float,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(SimpleBinaryOp, SYCL, "SigmoidGrad", functor::sigmoid_grad, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc
index a4084d5ad1..02915ff4ce 100644
--- a/tensorflow/core/kernels/cwise_op_sign.cc
+++ b/tensorflow/core/kernels/cwise_op_sign.cc
@@ -41,6 +41,6 @@ REGISTER_KERNEL_BUILDER(Name("Sign")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::sign<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc
index b91ff1ac30..16c6057864 100644
--- a/tensorflow/core/kernels/cwise_op_sin.cc
+++ b/tensorflow/core/kernels/cwise_op_sin.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Sin", functor::sin, float, double);
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sinh.cc b/tensorflow/core/kernels/cwise_op_sinh.cc
index 055f0b12e1..26b7a940aa 100644
--- a/tensorflow/core/kernels/cwise_op_sinh.cc
+++ b/tensorflow/core/kernels/cwise_op_sinh.cc
@@ -16,20 +16,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64,
+          complex128);
 
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Sinh")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
+#define REGISTER_SYCL_KERNEL(TYPE)                                \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("Sinh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Sinh", functor::sinh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
index 00efbb00f1..497756133d 100644
--- a/tensorflow/core/kernels/cwise_op_sqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Sqrt", functor::sqrt, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -36,5 +36,5 @@ REGISTER3(SimpleBinaryOp, GPU, "SqrtGrad", functor::sqrt_grad, float,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(SimpleBinaryOp, SYCL, "SqrtGrad", functor::sqrt_grad, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
index 07a4b0b084..7fc2f6bf08 100644
--- a/tensorflow/core/kernels/cwise_op_square.cc
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Square")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::square<int32>>);
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc
index 6adaecba04..025041946a 100644
--- a/tensorflow/core/kernels/cwise_op_sub.cc
+++ b/tensorflow/core/kernels/cwise_op_sub.cc
@@ -53,5 +53,5 @@ REGISTER_KERNEL_BUILDER(Name("Sub")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::sub<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index 7891b1183d..c1a25767d3 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Tan", functor::tan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc
index 8b3900892c..c5005f5ea8 100644
--- a/tensorflow/core/kernels/cwise_op_tanh.cc
+++ b/tensorflow/core/kernels/cwise_op_tanh.cc
@@ -26,7 +26,7 @@ REGISTER3(UnaryOp, GPU, "Tanh", functor::tanh, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float,
           Eigen::half, double, complex64, complex128);
diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc
index e561e59cf5..980edffceb 100644
--- a/tensorflow/core/kernels/cwise_ops_common.cc
+++ b/tensorflow/core/kernels/cwise_ops_common.cc
@@ -57,9 +57,9 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
       in1(ctx->input(1)),
       bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) {
   if (!bcast.IsValid()) {
-    ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ",
-                                           in0.shape().DebugString(), " vs. ",
-                                           in1.shape().DebugString()));
+    ctx->SetStatus(errors::InvalidArgument(
+        "Incompatible shapes: ", in0.shape().DebugString(), " vs. ",
+        in1.shape().DebugString()));
     return;
   }
   const TensorShape output_shape = BCast::ToShape(bcast.output_shape());
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
index 4394770708..e81b840a50 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
@@ -50,16 +50,16 @@ struct SimpleBinaryFunctor<GPUDevice, Functor> {
 
 // Macros to explicitly instantiate kernels on GPU for multiple types
 // (T0, T1, etc.) for SimpleBiaryFunctor (e.g., functor::tanh_grad).
-#define DEFINE_SIMPLE_BINARY1(F, T)                  \
+#define DEFINE_SIMPLE_BINARY1(F, T) \
   template struct SimpleBinaryFunctor<GPUDevice, F<T> >
-#define DEFINE_SIMPLE_BINARY2(F, T0, T1)             \
-  DEFINE_SIMPLE_BINARY1(F, T0);                      \
+#define DEFINE_SIMPLE_BINARY2(F, T0, T1) \
+  DEFINE_SIMPLE_BINARY1(F, T0);          \
   DEFINE_SIMPLE_BINARY1(F, T1)
-#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2)         \
-  DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
+#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2) \
+  DEFINE_SIMPLE_BINARY2(F, T0, T1);          \
   DEFINE_SIMPLE_BINARY1(F, T2)
-#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3)     \
-  DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
+#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3) \
+  DEFINE_SIMPLE_BINARY2(F, T0, T1);              \
   DEFINE_SIMPLE_BINARY2(F, T2, T3)
 #define DEFINE_SIMPLE_BINARY5(F, T0, T1, T2, T3, T4) \
   DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h
index 77b330f589..82cdae9a34 100644
--- a/tensorflow/core/kernels/cwise_ops_gradients.h
+++ b/tensorflow/core/kernels/cwise_ops_gradients.h
@@ -171,7 +171,6 @@ struct SimpleBinaryFunctor<CPUDevice, Functor> {
   }
 };
 
-
 #ifdef TENSORFLOW_USE_SYCL
 // Partial specialization of BinaryFunctor for SYCL devices
 typedef Eigen::SyclDevice SYCLDevice;
@@ -184,7 +183,7 @@ struct SimpleBinaryFunctor<SYCLDevice, Functor> {
   }
 };
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename T>
 struct tanh_grad : base<T, Eigen::internal::scalar_tanh_gradient_op<T>> {};
diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h
index 3f6ff7303d..3e107cee04 100644
--- a/tensorflow/core/kernels/cwise_ops_sycl_common.h
+++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h
@@ -51,7 +51,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
   void operator()(const SYCLDevice& d, typename Functor::tout_type out,
                   typename Functor::tin_type in0,
                   typename Functor::tin_type in1, bool* error) {
-    To32Bit(out).device(d) = To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
+    To32Bit(out).device(d) =
+        To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
   }
 
   void Left(const SYCLDevice& d, typename Functor::tout_type out,
@@ -61,7 +62,9 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
     constexpr int NumDims = Functor::tin_type::NumDimensions;
     static_assert(NumDims == 1, "Unexpected size");
     Eigen::Sizes<1> scalar_dim;
-    out.device(d) = scalar.reshape(scalar_dim).broadcast(in.dimensions()).binaryExpr(in, Binary());
+    out.device(d) = scalar.reshape(scalar_dim)
+                        .broadcast(in.dimensions())
+                        .binaryExpr(in, Binary());
   }
 
   void Right(const SYCLDevice& d, typename Functor::tout_type out,
@@ -71,7 +74,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
     constexpr int NumDims = Functor::tin_type::NumDimensions;
     static_assert(NumDims == 1, "Unexpected size");
     Eigen::Sizes<1> scalar_dim;
-    out.device(d) = in.binaryExpr(scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
+    out.device(d) = in.binaryExpr(
+        scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
   }
 
   void BCast(const SYCLDevice& d,
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
index bca0f1004d..39f497e716 100644
--- a/tensorflow/core/kernels/cwise_ops_test.cc
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -54,36 +54,36 @@ int ColsFromArg(int arg) { return (arg % kRows); }
 BM_UNARY(cpu, Floor, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Floor, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_UNARY(sycl, Floor, float, DT_FLOAT);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_UNARY(cpu, Floor, double, DT_DOUBLE);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Floor, double, DT_DOUBLE);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_UNARY(sycl, Floor, double, DT_DOUBLE);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 BM_UNARY(cpu, Rint, double, DT_DOUBLE);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Rint, double, DT_DOUBLE);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_UNARY(cpu, Rint, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Rint, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 // data func scalar.
 Graph* BinaryScalar(int num, const string& func) {
@@ -113,18 +113,18 @@ Graph* BinaryScalar(int num, const string& func) {
 BM_BINARY_SCALAR(cpu, Less);
 #if GOOGLE_CUDA
 BM_BINARY_SCALAR(gpu, Less);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Less);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_BINARY_SCALAR(cpu, Add);
 #if GOOGLE_CUDA
 BM_BINARY_SCALAR(gpu, Add);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Add);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BINARY_SCALAR
 
 template <class T>
@@ -163,11 +163,11 @@ using Eigen::half;
 BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #undef BM_BIAS_ADD_ALL
 #undef BM_BIAS_ADD
 
@@ -217,15 +217,15 @@ using Eigen::half;
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #undef BM_BIAS_ADD_GRAD_ALL
 #undef BM_BIAS_ADD_GRAD
 
@@ -265,10 +265,10 @@ Graph* BcastAdd(int rows, int cols, int dim) {
 BM_BCAST_ADD_ROW_ALL(cpu);
 #if GOOGLE_CUDA
 BM_BCAST_ADD_ROW_ALL(gpu);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_ROW_ALL(sycl);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BCAST_ADD_ROW_ALL
 #undef BM_BCAST_ADD_ROW
 
@@ -291,10 +291,10 @@ BM_BCAST_ADD_ROW_ALL(sycl);
 BM_BCAST_ADD_COL_ALL(cpu);
 #if GOOGLE_CUDA
 BM_BCAST_ADD_COL_ALL(gpu);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_COL_ALL(sycl);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BCAST_ADD_COL_ALL
 #undef BM_BCAST_ADD_COL
 
diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc
index 965a60c7e0..1b94ea0544 100644
--- a/tensorflow/core/kernels/debug_ops.cc
+++ b/tensorflow/core/kernels/debug_ops.cc
@@ -46,7 +46,7 @@ REGISTER_KERNEL_BUILDER(Name("CopyHost")
                             .HostMemory("input")
                             .HostMemory("output"),
                         CopyOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug identity (non-ref and ref) ops.
 REGISTER_KERNEL_BUILDER(Name("DebugIdentity").Device(DEVICE_CPU),
@@ -66,7 +66,7 @@ REGISTER_KERNEL_BUILDER(Name("DebugIdentity")
                             .HostMemory("input")
                             .HostMemory("output"),
                         DebugIdentityOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug NaN-counter (non-ref and ref) ops.
 #define REGISTER_DEBUG_NAN_COUNT(type)                                    \
@@ -98,7 +98,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double);
                           DebugNanCountOp<type>);
 REGISTER_GPU_DEBUG_NAN_COUNT(float);
 REGISTER_GPU_DEBUG_NAN_COUNT(double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug numeric summary ops.
 #define REGISTER_DEBUG_NUMERIC_SUMMARY_COUNT(type)        \
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index 381add3fb3..53a23b1306 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -21,7 +21,7 @@ limitations under the License.
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #include "tensorflow/core/debug/debug_io_utils.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -91,7 +91,7 @@ class CopyOp : public OpKernel {
       Device* device = static_cast<Device*>(context->device());
       // Determine if the input tensor is not on CPU (e.g., on GPU).
       const bool off_host_input = device->device_type() == DEVICE_SYCL &&
-                            !context->input_alloc_attr(0).on_host();
+                                  !context->input_alloc_attr(0).on_host();
 
       if (off_host_input) {
         SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor);
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
index c4555db453..0c42f63252 100644
--- a/tensorflow/core/kernels/decode_csv_op.cc
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -91,9 +91,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               int32 value;
               OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid int32: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid int32: ", fields[f]));
               output[f]->flat<int32>()(i) = value;
             }
             break;
@@ -111,9 +111,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               int64 value;
               OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid int64: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid int64: ", fields[f]));
               output[f]->flat<int64>()(i) = value;
             }
             break;
@@ -130,9 +130,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               float value;
               OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid float: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid float: ", fields[f]));
               output[f]->flat<float>()(i) = value;
             }
             break;
@@ -150,9 +150,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               double value;
               OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid double: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid double: ", fields[f]));
               output[f]->flat<double>()(i) = value;
             }
             break;
@@ -208,9 +208,10 @@ class DecodeCSVOp : public OpKernel {
         if (!quoted) {
           while (static_cast<size_t>(current_idx) < input.size() &&
                  input[current_idx] != delim_) {
-            OP_REQUIRES(ctx, (!use_quote_delim_ || input[current_idx] != '"') &&
-                                 input[current_idx] != '\n' &&
-                                 input[current_idx] != '\r',
+            OP_REQUIRES(ctx,
+                        (!use_quote_delim_ || input[current_idx] != '"') &&
+                            input[current_idx] != '\n' &&
+                            input[current_idx] != '\r',
                         errors::InvalidArgument(
                             "Unquoted fields cannot have quotes/CRLFs inside"));
             field += input[current_idx];
@@ -238,10 +239,11 @@ class DecodeCSVOp : public OpKernel {
           }
 
           OP_REQUIRES(
-              ctx, (static_cast<size_t>(current_idx) < input.size() &&
-                    input[current_idx] == '"' &&
-                    (static_cast<size_t>(current_idx) == input.size() - 1 ||
-                     input[current_idx + 1] == delim_)),
+              ctx,
+              (static_cast<size_t>(current_idx) < input.size() &&
+               input[current_idx] == '"' &&
+               (static_cast<size_t>(current_idx) == input.size() - 1 ||
+                input[current_idx + 1] == delim_)),
               errors::InvalidArgument("Quoted field has to end with quote "
                                       "followed by delim or end"));
 
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
index 44dcbf834c..912d04c153 100644
--- a/tensorflow/core/kernels/decode_image_op.cc
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -87,10 +87,11 @@ class DecodeImageOp : public OpKernel {
       channels_ = 3;
     } else {
       OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
-      OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 ||
-                               channels_ == 4,
-                  errors::InvalidArgument(
-                      "channels must be 0, 1, 3, or 4, got ", channels_));
+      OP_REQUIRES(
+          context,
+          channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4,
+          errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
+                                  channels_));
     }
     flags_.components = channels_;
 
@@ -114,8 +115,9 @@ class DecodeImageOp : public OpKernel {
 
     if (format_ == kJpgFormat) {
       OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
-      OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 ||
-                               flags_.ratio == 4 || flags_.ratio == 8,
+      OP_REQUIRES(context,
+                  flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 ||
+                      flags_.ratio == 8,
                   errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
                                           flags_.ratio));
       OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling",
@@ -130,8 +132,9 @@ class DecodeImageOp : public OpKernel {
       string dct_method;
       OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method));
       OP_REQUIRES(
-          context, (dct_method.empty() || dct_method == "INTEGER_FAST" ||
-                    dct_method == "INTEGER_ACCURATE"),
+          context,
+          (dct_method.empty() || dct_method == "INTEGER_FAST" ||
+           dct_method == "INTEGER_ACCURATE"),
           errors::InvalidArgument("dct_method must be one of "
                                   "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}"));
       if (dct_method == "INTEGER_FAST") {
@@ -157,9 +160,9 @@ class DecodeImageOp : public OpKernel {
         errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ",
                                 FileFormatString(magic, input)));
     OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
-                errors::InvalidArgument(FileFormatString(magic, input),
-                                        " contents are too large for int: ",
-                                        input.size()));
+                errors::InvalidArgument(
+                    FileFormatString(magic, input),
+                    " contents are too large for int: ", input.size()));
     OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8,
                 errors::InvalidArgument(FileFormatString(magic, input),
                                         " does not support uint16 output"));
@@ -212,9 +215,10 @@ class DecodeImageOp : public OpKernel {
             input.data(), input.size(), flags, nullptr /* nwarn */,
             [=, &output](int width, int height, int channels) -> uint8* {
               Status status(context->allocate_output(
-                  0, format_ == kGifFormat
-                         ? TensorShape({1, height, width, channels})
-                         : TensorShape({height, width, channels}),
+                  0,
+                  format_ == kGifFormat
+                      ? TensorShape({1, height, width, channels})
+                      : TensorShape({height, width, channels}),
                   &output));
               if (!status.ok()) {
                 VLOG(1) << status;
diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc
index 8e9b8a7e2e..829155fb31 100644
--- a/tensorflow/core/kernels/deep_conv2d.cc
+++ b/tensorflow/core/kernels/deep_conv2d.cc
@@ -120,9 +120,9 @@ bool CanUseDeepConv2D(int stride_rows, int stride_cols, int filter_rows,
 
   VLOG(2) << "CanUseDeepConv2D"
           << " deep_conv_cost: " << deep_conv_cost
-          << " direct_conv_cost: " << direct_conv_cost
-          << " deep_direct_ratio: " << (static_cast<float>(deep_conv_cost) /
-                                        static_cast<float>(direct_conv_cost))
+          << " direct_conv_cost: " << direct_conv_cost << " deep_direct_ratio: "
+          << (static_cast<float>(deep_conv_cost) /
+              static_cast<float>(direct_conv_cost))
           << " use_deep_conv: " << (deep_conv_cost < direct_conv_cost);
   return deep_conv_cost < direct_conv_cost;
 }
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
index 6d44a92fa3..6497c8f371 100644
--- a/tensorflow/core/kernels/dense_update_ops.cc
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -89,7 +89,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_KERNELS(type)                                     \
   REGISTER_KERNEL_BUILDER(                                         \
@@ -113,14 +113,14 @@ TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(type)                                \
-REGISTER_KERNEL_BUILDER(                                           \
-    Name("Assign").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
-    AssignOpT<SYCLDevice, type>);
+#define REGISTER_SYCL_KERNELS(type)                                 \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("Assign").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      AssignOpT<SYCLDevice, type>);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_KERNELS(type)                                        \
   REGISTER_KERNEL_BUILDER(                                            \
@@ -146,7 +146,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #endif  // end GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(type)                                         \
+#define REGISTER_SYCL_KERNELS(type)                                    \
   REGISTER_KERNEL_BUILDER(                                             \
       Name("AssignAdd").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
       DenseUpdateOp<SYCLDevice, type, DenseUpdateType::ADD>);          \
@@ -156,5 +156,5 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index 9347978d51..91a9587174 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -400,7 +400,7 @@ struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
 
     // Computes one shard of depthwise conv2d backprop input.
     auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
 
       const int64 input_image_size =
@@ -750,7 +750,7 @@ struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
 
     // Computes one shard of depthwise conv2d backprop filter.
     auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
       const int64 padded_out_depth_size =
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index a5fd07fbe1..c060b2e14d 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -308,10 +308,10 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
 
     // in_depth for input and filter must match.
     const int64 in_depth = GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is depth multiplier.
     const int32 depth_multiplier = filter.dim_size(3);
@@ -430,9 +430,10 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
 #endif
 
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(
-    Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
-    DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
+REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<Eigen::half>("T"),
+                        DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
 
 REGISTER_KERNEL_BUILDER(
     Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<float>("T"),
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 5493e33532..126b64f73d 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -17,12 +17,12 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "external/cub_archive/cub/util_ptx.cuh"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/depthwise_conv_op.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "external/cub_archive/cub/util_ptx.cuh"
 
 #if !defined(_MSC_VER)
 #define UNROLL _Pragma("unroll")
@@ -1021,7 +1021,7 @@ __global__ void __launch_bounds__(640, 2)
 
 // Device function to compute sub-warp sum reduction for a power-of-two group of
 // neighboring threads.
-template<int kWidth, typename T>
+template <int kWidth, typename T>
 __device__ __forceinline__ T WarpSumReduce(T val) {
   // support only power-of-two widths.
   assert(__popc(kWidth) == 1);
diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc
index 86fa7dce36..d228153d4c 100644
--- a/tensorflow/core/kernels/diag_op.cc
+++ b/tensorflow/core/kernels/diag_op.cc
@@ -29,8 +29,8 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -47,8 +47,9 @@ class DiagOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& diagonal = context->input(0);
     const int num_dims = diagonal.dims();
-    OP_REQUIRES(context, 0 != num_dims, errors::InvalidArgument(
-        "Input must be at least rank 1, got 0"));
+    OP_REQUIRES(
+        context, 0 != num_dims,
+        errors::InvalidArgument("Input must be at least rank 1, got 0"));
     TensorShape out_shape;
     for (int i = 0; i < num_dims; ++i) {
       out_shape.AddDim(diagonal.dim_size(i));
@@ -60,10 +61,9 @@ class DiagOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, out_shape, &output_tensor));
     functor::DiagFunctor<Device, T> diagFunc;
-    Status s = diagFunc(context,
-                        diagonal.NumElements(),
-                        diagonal.flat<T>().data(),
-                        output_tensor->flat<T>().data());
+    Status s =
+        diagFunc(context, diagonal.NumElements(), diagonal.flat<T>().data(),
+                 output_tensor->flat<T>().data());
     OP_REQUIRES_OK(context, s);
   }
 };
@@ -82,12 +82,12 @@ class DiagPartOp : public OpKernel {
                 errors::InvalidArgument("The rank of the tensor should be \
                                          even and positive, got shape ",
                                         tensor.shape().DebugString()));
-    for (int i = 0; i < out_dims; i++){
-      OP_REQUIRES(context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
-                  errors::InvalidArgument(
-                    "Invalid shape ", tensor.shape().DebugString(),
-                    ": dimensions ", i, " and ", i + out_dims, " do not match.")
-                  );
+    for (int i = 0; i < out_dims; i++) {
+      OP_REQUIRES(
+          context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
+          errors::InvalidArgument("Invalid shape ",
+                                  tensor.shape().DebugString(), ": dimensions ",
+                                  i, " and ", i + out_dims, " do not match."));
     }
 
     TensorShape out_shape;
@@ -96,13 +96,10 @@ class DiagPartOp : public OpKernel {
     }
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, out_shape, &output));
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
     functor::DiagPartFunctor<Device, T> diagPartFunc;
-    Status s = diagPartFunc(context,
-                            out_shape.num_elements(),
-                            tensor.flat<T>().data(),
-                            output->flat<T>().data());
+    Status s = diagPartFunc(context, out_shape.num_elements(),
+                            tensor.flat<T>().data(), output->flat<T>().data());
     OP_REQUIRES_OK(context, s);
   }
 };
@@ -129,9 +126,8 @@ class DiagPartOp : public OpKernel {
 namespace functor {
 template <typename T>
 struct DiagFunctor<CPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // This subprocess is responsible for writing values in index range
     // [start*size, limit*size)
     auto subDiag = [in, out, size](int64 start, int64 limit) {
@@ -143,17 +139,16 @@ struct DiagFunctor<CPUDevice, T> {
 
     // Here, 5 is a empirical factor of cost_per_unit.
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    Shard(worker_threads.num_threads, worker_threads.workers, size,
-        5 * size, subDiag);
+    Shard(worker_threads.num_threads, worker_threads.workers, size, 5 * size,
+          subDiag);
     return Status::OK();
   }
 };
 
 template <typename T>
 struct DiagPartFunctor<CPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // This subprocess is responsible for extracting values in index range
     // [start, limit)
     auto subDiagPart = [in, out, size](int64 start, int64 limit) {
@@ -164,14 +159,13 @@ struct DiagPartFunctor<CPUDevice, T> {
 
     // Here, 5 is a empirical factor of cost_per_unit.
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    Shard(worker_threads.num_threads, worker_threads.workers, size,
-        5, subDiagPart);
+    Shard(worker_threads.num_threads, worker_threads.workers, size, 5,
+          subDiagPart);
     return Status::OK();
   }
 };
 }  // namespace functor
 
-
 // Register the CPU kernels.
 #define REGISTER_DIAGOP(T)                                    \
   REGISTER_KERNEL_BUILDER(                                    \
@@ -250,6 +244,4 @@ TF_CALL_complex128(REGISTER_DIAGPARTOP_GPU);
 
 #endif  // GOOGLE_CUDA
 
-
 }  // namespace tensorflow
-
diff --git a/tensorflow/core/kernels/diag_op.h b/tensorflow/core/kernels/diag_op.h
index c6ca6a2047..baf16ddb4b 100644
--- a/tensorflow/core/kernels/diag_op.h
+++ b/tensorflow/core/kernels/diag_op.h
@@ -26,14 +26,14 @@ namespace functor {
 
 template <typename Device, typename T>
 struct DiagFunctor {
-  Status operator() (OpKernelContext* context, const int64 size,
-                     const T* in, T* out);
+  Status operator()(OpKernelContext* context, const int64 size, const T* in,
+                    T* out);
 };
 
 template <typename Device, typename T>
 struct DiagPartFunctor {
-  Status operator() (OpKernelContext* context, const int64 size,
-                     const T* in, T* out);
+  Status operator()(OpKernelContext* context, const int64 size, const T* in,
+                    T* out);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/diag_op_gpu.cu.cc b/tensorflow/core/kernels/diag_op_gpu.cu.cc
index d3c529d784..910f3093b2 100644
--- a/tensorflow/core/kernels/diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/diag_op_gpu.cu.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include <complex>
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/kernels/diag_op.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
 namespace functor {
@@ -28,10 +28,8 @@ namespace functor {
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T>
-__global__ void DiagCudaKernel(const int num_threads,
-                               const int64 size,
-                               const T* in,
-                               T* out) {
+__global__ void DiagCudaKernel(const int num_threads, const int64 size,
+                               const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     // Fill the diagonal elements or set to zero in other place.
     if (index % (1 + size) == 0) {
@@ -44,9 +42,8 @@ __global__ void DiagCudaKernel(const int num_threads,
 
 template <typename T>
 struct DiagFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -56,25 +53,22 @@ struct DiagFunctor<GPUDevice, T> {
     // so this may overflow for `size*size` in extreme cases,
     // here is checking the multiplication overflow for integer.
     if (size && (int(size * size) / size) != size) {
-      return errors::Internal(
-          "DiagOp got input size too large.");
+      return errors::Internal("DiagOp got input size too large.");
     }
     int virtual_thread_count = int(size * size);
 
     // Launch the GPU kernel.
     const GPUDevice& device = context->eigen_device<GPUDevice>();
-    CudaLaunchConfig diag_config = GetCudaLaunchConfig(
-        virtual_thread_count, device);
-    DiagCudaKernel<<<diag_config.block_count,
-                     diag_config.thread_per_block,
-                     0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    CudaLaunchConfig diag_config =
+        GetCudaLaunchConfig(virtual_thread_count, device);
+    DiagCudaKernel<<<diag_config.block_count, diag_config.thread_per_block, 0,
+                     device.stream()>>>(diag_config.virtual_thread_count, size,
+                                        in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
       return errors::Internal(
-          "Could not launch DiagOp kernel: ",
-          cudaGetErrorString(err), ".");
+          "Could not launch DiagOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
@@ -87,12 +81,9 @@ template struct DiagFunctor<GPUDevice, int64>;
 template struct DiagFunctor<GPUDevice, complex64>;
 template struct DiagFunctor<GPUDevice, complex128>;
 
-
 template <typename T>
-__global__ void DiagPartCudaKernel(const int num_threads,
-                                   const int64 size,
-                                   const T* in,
-                                   T* out) {
+__global__ void DiagPartCudaKernel(const int num_threads, const int64 size,
+                                   const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     out[index] = in[(1 + size) * index];
   }
@@ -100,9 +91,8 @@ __global__ void DiagPartCudaKernel(const int num_threads,
 
 template <typename T>
 struct DiagPartFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -111,16 +101,14 @@ struct DiagPartFunctor<GPUDevice, T> {
 
     // Extract the diagonal elements.
     CudaLaunchConfig diag_config = GetCudaLaunchConfig(size, device);
-    DiagPartCudaKernel<<<diag_config.block_count,
-                     diag_config.thread_per_block,
-                     0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    DiagPartCudaKernel<<<diag_config.block_count, diag_config.thread_per_block,
+                         0, device.stream()>>>(diag_config.virtual_thread_count,
+                                               size, in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
       return errors::Internal(
-          "Could not launch DiagPartOp kernel: ",
-          cudaGetErrorString(err), ".");
+          "Could not launch DiagPartOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/diag_op_test.cc b/tensorflow/core/kernels/diag_op_test.cc
index 2d1417854c..a708e53dd0 100644
--- a/tensorflow/core/kernels/diag_op_test.cc
+++ b/tensorflow/core/kernels/diag_op_test.cc
@@ -30,8 +30,8 @@ static Graph* Diag(int n, DataType type) {
   return g;
 }
 
-#define BM_DiagDev(N, T, TFTYPE, DEVICE)                           \
-  static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) {   \
+#define BM_DiagDev(N, T, TFTYPE, DEVICE)                        \
+  static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) {  \
     testing::UseRealTime();                                     \
     testing::ItemsProcessed(static_cast<int64>(iters) * N * N); \
     test::Benchmark(#DEVICE, Diag<T>(N, TFTYPE)).Run(iters);    \
@@ -51,4 +51,3 @@ BM_Diag(128);
 BM_Diag(512);
 
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/dilation_ops.cc b/tensorflow/core/kernels/dilation_ops.cc
index 6f5c0e9156..441a63465c 100644
--- a/tensorflow/core/kernels/dilation_ops.cc
+++ b/tensorflow/core/kernels/dilation_ops.cc
@@ -91,10 +91,10 @@ void ParseSizes(OpKernelContext* context, const std::vector<int32>& strides,
                                       filter.shape().DebugString()));
   const int filter_rows = filter.dim_size(0);
   const int filter_cols = filter.dim_size(1);
-  OP_REQUIRES(
-      context, depth == filter.dim_size(2),
-      errors::InvalidArgument("input and filter must have the same depth: ",
-                              depth, " vs ", filter.dim_size(2)));
+  OP_REQUIRES(context, depth == filter.dim_size(2),
+              errors::InvalidArgument(
+                  "input and filter must have the same depth: ", depth, " vs ",
+                  filter.dim_size(2)));
 
   // Effective filter size, after introducing rate - 1 zeros between each
   // non-zero filter element.
@@ -234,10 +234,11 @@ class DilationBackpropInputOp : public OpKernel {
     // [ batch, out_rows, out_cols, depth ]
     const int batch = input.dim_size(0);
     const int depth = input.dim_size(3);
-    OP_REQUIRES(context, batch == out_backprop.dim_size(0) &&
-                             out_rows == out_backprop.dim_size(1) &&
-                             out_cols == out_backprop.dim_size(2) &&
-                             depth == out_backprop.dim_size(3),
+    OP_REQUIRES(context,
+                batch == out_backprop.dim_size(0) &&
+                    out_rows == out_backprop.dim_size(1) &&
+                    out_cols == out_backprop.dim_size(2) &&
+                    depth == out_backprop.dim_size(3),
                 errors::InvalidArgument("out_backprop has incompatible size."));
 
     // The computed in_backprop has the same dimensions as the input:
@@ -353,10 +354,11 @@ class DilationBackpropFilterOp : public OpKernel {
     // [ batch, out_rows, out_cols, depth ]
     const int batch = input.dim_size(0);
     const int depth = input.dim_size(3);
-    OP_REQUIRES(context, batch == out_backprop.dim_size(0) &&
-                             out_rows == out_backprop.dim_size(1) &&
-                             out_cols == out_backprop.dim_size(2) &&
-                             depth == out_backprop.dim_size(3),
+    OP_REQUIRES(context,
+                batch == out_backprop.dim_size(0) &&
+                    out_rows == out_backprop.dim_size(1) &&
+                    out_cols == out_backprop.dim_size(2) &&
+                    depth == out_backprop.dim_size(3),
                 errors::InvalidArgument("out_backprop has incompatible size."));
 
     // The computed filter_backprop has the same dimensions as the filter:
diff --git a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
index ac0775fbef..c63806a7f6 100644
--- a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
@@ -61,9 +61,8 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr,
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
@@ -106,9 +105,8 @@ __global__ void DilationBackpropInputKernel(
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
@@ -156,9 +154,8 @@ __global__ void DilationBackpropFilterKernel(
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
diff --git a/tensorflow/core/kernels/draw_bounding_box_op.cc b/tensorflow/core/kernels/draw_bounding_box_op.cc
index a8818b7385..b5d5b880bb 100644
--- a/tensorflow/core/kernels/draw_bounding_box_op.cc
+++ b/tensorflow/core/kernels/draw_bounding_box_op.cc
@@ -29,8 +29,7 @@ template <class T>
 class DrawBoundingBoxesOp : public OpKernel {
  public:
   explicit DrawBoundingBoxesOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& images = context->input(0);
@@ -94,35 +93,28 @@ class DrawBoundingBoxesOp : public OpKernel {
         int64 color_index = bb % color_table_length;
         const int64 min_box_row =
             static_cast<float>(tboxes(b, bb, 0)) * (height - 1);
-        const int64 min_box_row_clamp =
-            std::max<int64>(min_box_row, 0);
+        const int64 min_box_row_clamp = std::max<int64>(min_box_row, 0);
         const int64 max_box_row =
             static_cast<float>(tboxes(b, bb, 2)) * (height - 1);
         const int64 max_box_row_clamp =
             std::min<int64>(max_box_row, height - 1);
         const int64 min_box_col =
             static_cast<float>(tboxes(b, bb, 1)) * (width - 1);
-        const int64 min_box_col_clamp =
-            std::max<int64>(min_box_col, 0);
+        const int64 min_box_col_clamp = std::max<int64>(min_box_col, 0);
         const int64 max_box_col =
             static_cast<float>(tboxes(b, bb, 3)) * (width - 1);
-        const int64 max_box_col_clamp =
-            std::min<int64>(max_box_col, width - 1);
+        const int64 max_box_col_clamp = std::min<int64>(max_box_col, width - 1);
 
         if (min_box_row > max_box_row || min_box_col > max_box_col) {
-          LOG(WARNING) << "Bounding box (" << min_box_row
-                       << "," << min_box_col
-                       << "," << max_box_row
-                       << "," << max_box_col
+          LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                       << "," << max_box_row << "," << max_box_col
                        << ") is inverted and will not be drawn.";
           continue;
         }
-        if (min_box_row >= height || max_box_row < 0 ||
-            min_box_col >= width || max_box_col < 0) {
-          LOG(WARNING) << "Bounding box (" << min_box_row
-                       << "," << min_box_col
-                       << "," << max_box_row
-                       << "," << max_box_col
+        if (min_box_row >= height || max_box_row < 0 || min_box_col >= width ||
+            max_box_col < 0) {
+          LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                       << "," << max_box_row << "," << max_box_col
                        << ") is completely outside the image"
                        << " and will not be drawn.";
           continue;
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
index 861e16b2fd..3c988db5e6 100644
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -103,7 +103,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
       // Walk through data and copy the data to the appropriate output tensor
       const auto data_flat = data->flat<T>();
       std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
-                                   Eigen::Aligned> > out_vec;
+                                   Eigen::Aligned> >
+          out_vec;
       out_vec.reserve(num_partitions_);
       for (int p = 0; p < num_partitions_; p++) {
         out_vec.push_back(outputs[p]->vec<T>());
@@ -124,7 +125,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
     } else {
       // If data has extra dimensions, use Eigen slices
       std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
-                                   Eigen::Aligned> > out_flat;
+                                   Eigen::Aligned> >
+          out_flat;
       out_flat.reserve(num_partitions_);
       for (int p = 0; p < num_partitions_; p++) {
         out_flat.push_back(outputs[p]->flat_outer_dims<T>());
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index 9bb58b13f3..9dfeccff0e 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -79,9 +79,9 @@ template <typename T>
 void RangeInit(const GPUDevice& d, const T start, const T delta,
                const int32 size, typename TTypes<T>::Flat out) {
   CudaLaunchConfig config = GetCudaLaunchConfig(size, d);
-  RangeInitKernel<
-      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      start, delta, size, out.data());
+  RangeInitKernel<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          start, delta, size, out.data());
 }
 
 // Given *num_runs pairs (key, value), this function moves the value
@@ -103,11 +103,10 @@ void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
                       T* out, int64 gather_dim_size, int64 indices_size,
                       int64 slice_size, int64 out_size) {
   CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
-  GatherOpKernel<
-      T, int32,
-      true><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      params, indices, out, gather_dim_size, indices_size, slice_size,
-      out_size);
+  GatherOpKernel<T, int32, true>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          params, indices, out, gather_dim_size, indices_size, slice_size,
+          out_size);
 }
 
 struct IdentityOp {
@@ -231,10 +230,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
 
     OP_REQUIRES_ASYNC(
         c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()),
-        errors::InvalidArgument("data.shape must start with partitions.shape, ",
-                                "got data.shape = ", data.shape().DebugString(),
-                                ", partitions.shape = ",
-                                partitions.shape().DebugString()),
+        errors::InvalidArgument(
+            "data.shape must start with partitions.shape, ",
+            "got data.shape = ", data.shape().DebugString(),
+            ", partitions.shape = ", partitions.shape().DebugString()),
         done);
 
     Tensor partition_count;
@@ -245,8 +244,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
       AllocatorAttributes alloc_attr;
       alloc_attr.set_on_host(true);
       OP_REQUIRES_OK_ASYNC(
-          c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                              &partition_count, alloc_attr),
+          c,
+          c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                           &partition_count, alloc_attr),
           done);
       auto e_part_count = partition_count.flat<int32>();
       for (int i = 0; i < num_partitions_; i++) e_part_count(i) = 0;
@@ -259,8 +259,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
 
     // Prepare for counting.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                            &partition_count),
+        c,
+        c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                         &partition_count),
         done);
     Tensor indices_out;
     // Count how many times each partition index occurs.
@@ -280,8 +281,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     alloc_attr.set_on_host(true);
     alloc_attr.set_gpu_compatible(true);
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(partition_count.dtype(), partition_count.shape(),
-                            &cpu_tensor, alloc_attr),
+        c,
+        c->allocate_temp(partition_count.dtype(), partition_count.shape(),
+                         &cpu_tensor, alloc_attr),
         done);
     perftools::gputools::DeviceMemoryBase wrapped(
         partition_count.flat<int32>().data(), num_partitions_ * sizeof(int32));
@@ -340,9 +342,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
         indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream);
     // Allocate temporary storage.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(
-               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-               &cub_temp_storage),
+        c,
+        c->allocate_temp(DT_INT8,
+                         TensorShape({static_cast<int64>(temp_storage_bytes)}),
+                         &cub_temp_storage),
         done);
     // Radix-sort the partition information.
     cub::DeviceRadixSort::SortPairs(
@@ -376,8 +379,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     zero_functor(device, partition_count->flat<int32>());
     // Allocate memory for aggregates_out.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                            &aggregates_out),
+        c,
+        c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                         &aggregates_out),
         done);
     // Obtain the pointers to inner buffers.
     int32* keys_in_ptr = partitions_out.flat<int32>().data();
@@ -408,9 +412,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
                                    num_runs_ptr, reduction_op, N, cu_stream);
     // Allocate temporary storage.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(
-               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-               &cub_temp_storage),
+        c,
+        c->allocate_temp(DT_INT8,
+                         TensorShape({static_cast<int64>(temp_storage_bytes)}),
+                         &cub_temp_storage),
         done);
     // Run reduce-by-key. The effect is that we count how many times
     // each index appears in partitions. The distinct indices are stored
diff --git a/tensorflow/core/kernels/eigen_activations.h b/tensorflow/core/kernels/eigen_activations.h
index 99b4b2abe6..302033e47c 100644
--- a/tensorflow/core/kernels/eigen_activations.h
+++ b/tensorflow/core/kernels/eigen_activations.h
@@ -21,13 +21,13 @@ limitations under the License.
 namespace Eigen {
 
 /** scalar_sigmoid_fast_derivative_op
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to compute the fast derivative of a sigmoid
-  *
-  * Input should be the backpropagated gradient.
-  *
-  * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative()
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to compute the fast derivative of a sigmoid
+ *
+ * Input should be the backpropagated gradient.
+ *
+ * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative()
+ */
 template <typename T>
 struct scalar_sigmoid_fast_derivative_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op)
@@ -55,13 +55,13 @@ struct functor_traits<scalar_sigmoid_fast_derivative_op<T> > {
 }  // namespace internal
 
 /** scalar_tanh_fast_derivative_op
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to compute the fast derivative of a tanh
-  *
-  * Input should be the backpropagated gradient.
-  *
-  * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative()
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to compute the fast derivative of a tanh
+ *
+ * Input should be the backpropagated gradient.
+ *
+ * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative()
+ */
 template <typename T>
 struct scalar_tanh_fast_derivative_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op)
@@ -89,11 +89,11 @@ struct functor_traits<scalar_tanh_fast_derivative_op<T> > {
 }  // namespace internal
 
 /**
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to clip the magnitude of the first scalar.
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::Clip
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to clip the magnitude of the first scalar.
+ *
+ * \sa class CwiseBinaryOp, MatrixBase::Clip
+ */
 template <typename Scalar>
 struct scalar_clip_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op)
diff --git a/tensorflow/core/kernels/eigen_activations_test.cc b/tensorflow/core/kernels/eigen_activations_test.cc
index 907233103d..34952f5abb 100644
--- a/tensorflow/core/kernels/eigen_activations_test.cc
+++ b/tensorflow/core/kernels/eigen_activations_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest, SigmoidFastDerivative) {
   const ptrdiff_t depth = 3;
diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h
index 3a94b8c993..4d86f9deb9 100644
--- a/tensorflow/core/kernels/eigen_attention.h
+++ b/tensorflow/core/kernels/eigen_attention.h
@@ -21,35 +21,47 @@ limitations under the License.
 namespace Eigen {
 
 /** ExtractGlimpses
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Extract glimpses from an input tensor.
-  *
-  * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch).
-  * The width and height parameters specify the extension of the returned glimpses.
-  * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension.
-  * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension.
-  * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center.
-  *
-  * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch).
-  * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size.
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Extract glimpses from an input tensor.
+ *
+ * The input parameter is expected to be a col-major tensor with a rank of 4
+ * (depth, x, y, and batch). The width and height parameters specify the
+ * extension of the returned glimpses. The offsets parameter specifies the x, y
+ * locations of the center of the glimpses relative to the center of the input
+ * image. The vector is expected to contain one IndexPair for each image in the
+ * batch dimension. The normalized boolean indicates if incoming coordinates are
+ * normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each
+ * height and width dimension. The centered boolean indicates if incoming
+ * coordinates are centered relative to the image, in which case -1.0 and 1.0
+ * correspond to minimum and maximum of each dimension while 0.0 corresponds to
+ * the center.
+ *
+ * The result can be assigned to a tensor of rank equal to that of the input.
+ * The result will be laid out in col-major order (depth, x, y, batch). The
+ * dimensions of the result will be equal to the dimensions of the input except
+ * for width and height which will be equal to the requested glimpse size.
+ */
 namespace {
 template <typename Index>
 struct GlimpseExtractionOp {
   GlimpseExtractionOp(const Index width, const Index height,
                       const std::vector<IndexPair<float> >& offsets,
-                      const bool normalized,
-                      const bool centered,
-                      const bool uniform_noise) :
-      width_(width), height_(height), offsets_(offsets),
-      normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { }
+                      const bool normalized, const bool centered,
+                      const bool uniform_noise)
+      : width_(width),
+        height_(height),
+        offsets_(offsets),
+        normalized_(normalized),
+        centered_(centered),
+        uniform_noise_(uniform_noise) {}
 
   template <typename Input>
   DSizes<Index, 4> dimensions(const Input& input) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
 
     DSizes<Index, 4> dims = in.dimensions();
@@ -62,12 +74,12 @@ struct GlimpseExtractionOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  EIGEN_DEVICE_FUNC
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  EIGEN_DEVICE_FUNC void eval(const Input& input, Output& output,
+                              const Device& device) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
     const Index num_channels = in.dimension(0);
     const Index input_width = in.dimension(1);
@@ -97,8 +109,8 @@ struct GlimpseExtractionOp {
       x -= width_ / 2.0f;
       y -= height_ / 2.0f;
 
-      const Index offset_x = (Index) x;
-      const Index offset_y = (Index) y;
+      const Index offset_x = (Index)x;
+      const Index offset_y = (Index)y;
       Index glimpse_width = width_;
       Index glimpse_height = height_;
       bool partial_overlap = false;
@@ -135,7 +147,7 @@ struct GlimpseExtractionOp {
         if (uniform_noise_) {
           // Initialize the glimpse with uniform noise.
           typedef typename internal::remove_const<
-            typename internal::traits<Input>::Scalar>::type Scalar;
+              typename internal::traits<Input>::Scalar>::type Scalar;
           TensorFixedSize<Scalar, Sizes<> > mini;
           mini.device(device) = input.template chip<3>(i).minimum();
           TensorFixedSize<float, Sizes<> > range;
@@ -215,21 +227,22 @@ struct GlimpseExtractionOp {
   const bool centered_;
   const bool uniform_noise_;
 };
-}
-
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const GlimpseExtractionOp<typename internal::traits<Input>::Index>, const Input>
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<
+    const GlimpseExtractionOp<typename internal::traits<Input>::Index>,
+    const Input>
 ExtractGlimpses(const Input& input,
                 const typename internal::traits<Input>::Index width,
                 const typename internal::traits<Input>::Index height,
                 const std::vector<IndexPair<float> >& offsets,
                 const bool normalized = true, const bool centered = true,
-                const bool uniform_noise = true)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+                const bool uniform_noise = true) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   typedef typename internal::traits<Input>::Index Index;
   const GlimpseExtractionOp<Index> op(width, height, offsets, normalized,
@@ -237,6 +250,6 @@ ExtractGlimpses(const Input& input,
   return input.customOp(op);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif  // TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
diff --git a/tensorflow/core/kernels/eigen_attention_test.cc b/tensorflow/core/kernels/eigen_attention_test.cc
index 3a2eeb0595..08f6187718 100644
--- a/tensorflow/core/kernels/eigen_attention_test.cc
+++ b/tensorflow/core/kernels/eigen_attention_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenAttentionTest, Simple) {
   const ptrdiff_t depth = 3;
diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
index aec7697810..099696105b 100644
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
@@ -21,29 +21,29 @@ limitations under the License.
 namespace Eigen {
 
 /** SpatialConvolutionBackwardInput
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the input of a 2D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 3 or
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Computes the backprop for the input of a 2D convolution.
+ *
+ * The output_backward parameter is expected to be a tensor with a rank of 3 or
  * more (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The output_backward and the kernel must both be in col-major layout. The
+ * The output_backward and the kernel must both be in col-major layout. The
  * result will also be in col-major layout.
-  *
-  * If row_in_stride, col_in_stride > 1, then applies convolution with holes
+ *
+ * If row_in_stride, col_in_stride > 1, then applies convolution with holes
  * (aka atrous convolution), sampling every row_in_stride, col_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * output_backward. The dimensions of the result will be filters, height, width
  * (and others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 #ifdef EIGEN_HAS_INDEX_LIST
 typedef IndexList<type2index<0>, type2index<0>, type2index<1>, type2index<1> >
     ReverseColMajor;
@@ -293,29 +293,29 @@ SpatialConvolutionBackwardInput(
 }
 
 /** SpatialConvolutionBackwardKernel
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the filter of a 2D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 3 or
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Computes the backprop for the filter of a 2D convolution.
+ *
+ * The output_backward parameter is expected to be a tensor with a rank of 3 or
  * more (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The output_backward and the kernel must both be in col-major layout. The
+ * The output_backward and the kernel must both be in col-major layout. The
  * result will also be in col-major layout.
-  *
-  * If row_in_stride, col_stride > 1, then applies convolution with holes (aka
+ *
+ * If row_in_stride, col_stride > 1, then applies convolution with holes (aka
  * atrous convolution), sampling every row_in_stride, col_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * output_backward. The dimensions of the result will be filters, height, width
  * (and others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 
 template <typename OutputBackward, typename Input>
 EIGEN_ALWAYS_INLINE static const typename internal::conditional<
diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
index 1758067829..2229ec9659 100644
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
@@ -25,7 +25,7 @@ void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
 static int ceil_div(int a, int b) { return (a + b - 1) / b; }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest,
      test_simple_spatial_convolution_backward_input_valid) {
diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 972036833f..896c995761 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -309,10 +309,10 @@ struct AvgPoolMeanReducer {
   _mm512_castsi512_ps( \
       _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1))
 
-// The ternarylogic function immediate determines the values in the result
-// In the case below, 0xd8 implies (false_mask) ? (b) : (a)
-// For details, refer to the vpternlogd instruction table at
-// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf
+  // The ternarylogic function immediate determines the values in the result
+  // In the case below, 0xd8 implies (false_mask) ? (b) : (a)
+  // For details, refer to the vpternlogd instruction table at
+  // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf
 
 #define psel(a, b, false_mask)                        \
   _mm512_castsi512_ps(_mm512_ternarylogic_epi32(      \
diff --git a/tensorflow/core/kernels/eigen_pooling_test.cc b/tensorflow/core/kernels/eigen_pooling_test.cc
index 9383972b9f..47b6665e68 100644
--- a/tensorflow/core/kernels/eigen_pooling_test.cc
+++ b/tensorflow/core/kernels/eigen_pooling_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenPoolingTest, Simple) {
   const int depth = 10;
diff --git a/tensorflow/core/kernels/eigen_softmax.h b/tensorflow/core/kernels/eigen_softmax.h
index a2930a726f..12148c54b3 100644
--- a/tensorflow/core/kernels/eigen_softmax.h
+++ b/tensorflow/core/kernels/eigen_softmax.h
@@ -21,19 +21,21 @@ limitations under the License.
 namespace Eigen {
 
 /** SoftMax
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a softmax
-  *
-  * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other).
-  *
-  * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order.
-  *
-*/
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a softmax
+ *
+ * The input parameter is expected to be a col-major tensor with a rank of 2
+ * (depth and other).
+ *
+ * The result can be assigned to a tensor of rank and dimensions equal to that
+ * of the input. The result will be laid out in col-major order.
+ *
+ */
 
 namespace {
 struct SoftmaxOp {
-  SoftmaxOp(const float beta) : beta_(beta) { }
+  SoftmaxOp(const float beta) : beta_(beta) {}
 
   template <typename Input>
   typename Input::Dimensions dimensions(const Input& input) const {
@@ -41,8 +43,7 @@ struct SoftmaxOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  void eval(const Input& input, Output& output, const Device& device) const {
 #if !defined(EIGEN_HAS_INDEX_LIST)
     // nvcc doesn't support cxx11
     Eigen::array<typename internal::traits<Input>::Index, 1> depth_dim;
@@ -56,35 +57,43 @@ struct SoftmaxOp {
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
-    Eigen::IndexList<Eigen::type2index<0>> depth_dim;
-    Eigen::IndexList<int, Eigen::type2index<1>> bcast;
+    Eigen::IndexList<Eigen::type2index<0> > depth_dim;
+    Eigen::IndexList<int, Eigen::type2index<1> > bcast;
     bcast.set(0, dimensions(input)[0]);
-    Eigen::IndexList<Eigen::type2index<1>, typename internal::traits<Input>::Index> dims2d;
+    Eigen::IndexList<Eigen::type2index<1>,
+                     typename internal::traits<Input>::Index>
+        dims2d;
     dims2d.set(1, dimensions(input)[1]);
 #endif
 
-    output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp();
-    output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+    output.device(device) =
+        ((input -
+          input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) *
+         beta_)
+            .exp();
+    output.device(device) =
+        output /
+        (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
   }
 
  private:
   const float beta_;
 };
-}
-
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const SoftmaxOp, const Input>
-SoftMax(const Input& input, const float beta)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<const SoftmaxOp,
+                                                     const Input>
+SoftMax(const Input& input, const float beta) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   const SoftmaxOp op(beta);
   return input.customOp(op);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
diff --git a/tensorflow/core/kernels/eigen_softmax_test.cc b/tensorflow/core/kernels/eigen_softmax_test.cc
index ba681d68ab..7f985d7136 100644
--- a/tensorflow/core/kernels/eigen_softmax_test.cc
+++ b/tensorflow/core/kernels/eigen_softmax_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenSoftmaxTest, Simple) {
   const int depth = 1024;
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 2fe64cd72a..1acbe3a658 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -877,29 +877,29 @@ struct gemm_pack_rhs<
 }  // end namespace internal
 
 /** SpatialConvolution
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a 2D convolution over a multichannel input image.
-  *
-  * The input parameter is expected to be a tensor with a rank of 3 or more
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a 2D convolution over a multichannel input image.
+ *
+ * The input parameter is expected to be a tensor with a rank of 3 or more
  * (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The input and the kernel must both be in col-major layout. The result will
+ * The input and the kernel must both be in col-major layout. The result will
  * also be in col-major layout.
-  *
-  * If col_in_stride, row_in_stride > 1, then applies convolution with holes
+ *
+ * If col_in_stride, row_in_stride > 1, then applies convolution with holes
  * (aka atrous convolution), sampling every col_in_stride, row_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * input. The dimensions of the result will be filters, height, width (and
  * others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 template <typename Input, typename Kernel>
 EIGEN_DEVICE_FUNC
     EIGEN_ALWAYS_INLINE static const typename internal::conditional<
@@ -993,7 +993,7 @@ EIGEN_DEVICE_FUNC
     default:
       // Initialize unused variables to avoid a compiler warning
       out_height = 0;
-      out_width  = 0;
+      out_width = 0;
       eigen_assert(false && "unexpected padding");
   }
 
diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc
index 4fcae25aa6..1a5b0f2b67 100644
--- a/tensorflow/core/kernels/encode_jpeg_op.cc
+++ b/tensorflow/core/kernels/encode_jpeg_op.cc
@@ -80,10 +80,11 @@ class EncodeJpegOp : public OpKernel {
                 errors::InvalidArgument("image must be 3-dimensional",
                                         image.shape().DebugString()));
 
-    OP_REQUIRES(context, FastBoundsCheck(image.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument(
-                    "Cannot encode images with >= max int32 elements"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(image.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument(
+            "Cannot encode images with >= max int32 elements"));
 
     const int32 dim_size0 = static_cast<int32>(image.dim_size(0));
     const int32 dim_size1 = static_cast<int32>(image.dim_size(1));
@@ -100,9 +101,10 @@ class EncodeJpegOp : public OpKernel {
       } else if (channels == 3) {
         adjusted_flags.format = jpeg::FORMAT_RGB;
       } else {
-        OP_REQUIRES(context, false, errors::InvalidArgument(
-                                        "image must have 1 or 3 channels, got ",
-                                        image.shape().DebugString()));
+        OP_REQUIRES(
+            context, false,
+            errors::InvalidArgument("image must have 1 or 3 channels, got ",
+                                    image.shape().DebugString()));
       }
     } else {
       if (flags_.format == jpeg::FORMAT_GRAYSCALE) {
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc
index 268a059275..83cd0e9b47 100644
--- a/tensorflow/core/kernels/example_parsing_ops.cc
+++ b/tensorflow/core/kernels/example_parsing_ops.cc
@@ -346,8 +346,9 @@ class SingleSequenceExampleParserOp : public OpKernel {
           feature_list_sparse_keys[di].scalar<string>()();
     }
     OP_REQUIRES(
-        ctx, TensorShapeUtils::IsVector(
-                 feature_list_dense_missing_assumed_empty->shape()),
+        ctx,
+        TensorShapeUtils::IsVector(
+            feature_list_dense_missing_assumed_empty->shape()),
         errors::InvalidArgument(
             "Expected feature_list_dense_missing_assumed_empty ",
             "to be a vector, got shape: ",
@@ -386,12 +387,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
       required[d] = (def_value.NumElements() == 0);  // No default provided.
 
       if (def_value.NumElements() > 0) {
-        OP_REQUIRES(
-            ctx, def_value.shape() == attrs_.context_dense_shapes[d],
-            errors::InvalidArgument(
-                "def_value[", d, "].shape() == ",
-                def_value.shape().DebugString(), " != context_dense_shapes_[",
-                d, "] == ", attrs_.context_dense_shapes[d].DebugString()));
+        OP_REQUIRES(ctx, def_value.shape() == attrs_.context_dense_shapes[d],
+                    errors::InvalidArgument(
+                        "def_value[", d,
+                        "].shape() == ", def_value.shape().DebugString(),
+                        " != context_dense_shapes_[", d,
+                        "] == ", attrs_.context_dense_shapes[d].DebugString()));
         OP_REQUIRES(
             ctx, def_value.dtype() == attrs_.context_dense_types[d],
             errors::InvalidArgument(
@@ -576,12 +577,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
         const Feature& f = fl.feature(t);
         bool types_match;
         OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
-        OP_REQUIRES(
-            ctx, types_match,
-            errors::InvalidArgument(
-                "Name: ", name, ", Feature list: ", key, ", Index: ", t,
-                ".  Data types don't match. ", "Expected type: ",
-                DataTypeString(dtype), "  Feature is: ", ProtoDebugString(f)));
+        OP_REQUIRES(ctx, types_match,
+                    errors::InvalidArgument(
+                        "Name: ", name, ", Feature list: ", key, ", Index: ", t,
+                        ".  Data types don't match. ",
+                        "Expected type: ", DataTypeString(dtype),
+                        "  Feature is: ", ProtoDebugString(f)));
         OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f,
                                              feature_list_dense_values[d]));
       }
diff --git a/tensorflow/core/kernels/fact_op.cc b/tensorflow/core/kernels/fact_op.cc
index 4fbf76d2d0..4a1aa433bc 100644
--- a/tensorflow/core/kernels/fact_op.cc
+++ b/tensorflow/core/kernels/fact_op.cc
@@ -122,13 +122,9 @@ static string D(const char* s) {
   return ret;
 }
 
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("Yoxmos").c_str()),
-                        FactOpKernel2);
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("yoxmos").c_str()),
-                        FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("Yoxmos").c_str()), FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("yoxmos").c_str()), FactOpKernel2);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc
index 5953db1476..af3a42135d 100644
--- a/tensorflow/core/kernels/fake_quant_ops_test.cc
+++ b/tensorflow/core/kernels/fake_quant_ops_test.cc
@@ -378,9 +378,8 @@ TEST_F(QuantOpsTest, WithArgsGradient_RegularRange) {
   Tensor* output = GetOutput(0);
   auto input_flat = GetInput(0).flat<float>();
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {0.0f, input_flat(1), input_flat(2),
-                     input_flat(3), input_flat(4), 0.0f});
+  FillValues<float>(&expected, {0.0f, input_flat(1), input_flat(2),
+                                input_flat(3), input_flat(4), 0.0f});
   ExpectClose(expected, *output);
 }
 
@@ -2167,21 +2166,19 @@ TEST_F(QuantOpsTest,
   Tensor* output_bprop_wrt_input = GetOutput(0);
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(&expected_bprop_wrt_input,
-                    {0.0f, grad_flat(1), grad_flat(2),
-                     grad_flat(3), grad_flat(4), 0.0f});
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
   Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_min,
-                    {grad_flat(0), 0.0f, 0.0f});
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
   ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
 
   Tensor* output_bprop_wrt_max = GetOutput(2);
   Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_max,
-                    {0.0f, 0.0f, grad_flat(5)});
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
@@ -2215,21 +2212,19 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedUp_4Bits_NarrowRange) {
   Tensor* output_bprop_wrt_input = GetOutput(0);
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(&expected_bprop_wrt_input,
-                    {0.0f, grad_flat(1), grad_flat(2),
-                     grad_flat(3), grad_flat(4), 0.0f});
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
   Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_min,
-                    {grad_flat(0), 0.0f, 0.0f});
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
   ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
 
   Tensor* output_bprop_wrt_max = GetOutput(2);
   Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_max,
-                    {0.0f, 0.0f, grad_flat(5)});
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
@@ -2270,14 +2265,13 @@ TEST_F(QuantOpsTest,
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
                                   TensorShape({1, 2, 3, 4}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(
-      &expected_bprop_wrt_input,
-      {0.0f, grad_flat(1), grad_flat(2), 0.0f,
-       0.0f, grad_flat(5), grad_flat(6), 0.0f,
-       0.0f, grad_flat(9), grad_flat(10), 0.0f,
-       0.0f, grad_flat(13), grad_flat(14), 0.0f,
-       0.0f, grad_flat(17), grad_flat(18), 0.0f,
-       0.0f, grad_flat(21), grad_flat(22), 0.0f});
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1),  grad_flat(2),  0.0f,
+                     0.0f, grad_flat(5),  grad_flat(6),  0.0f,
+                     0.0f, grad_flat(9),  grad_flat(10), 0.0f,
+                     0.0f, grad_flat(13), grad_flat(14), 0.0f,
+                     0.0f, grad_flat(17), grad_flat(18), 0.0f,
+                     0.0f, grad_flat(21), grad_flat(22), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc
index 82ec879119..479f7be4b5 100644
--- a/tensorflow/core/kernels/fifo_queue.cc
+++ b/tensorflow/core/kernels/fifo_queue.cc
@@ -255,97 +255,96 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
       // TODO(josh11b): This makes two copies of callback, avoid this if possible.
       dequeue_attempts_.emplace_back(
           num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
-          [callback, allow_small_batch, this](Attempt* attempt)
-              EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                int64 queue_size = queues_[0].size();
+          [callback, allow_small_batch,
+           this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+            int64 queue_size = queues_[0].size();
 
-                if (closed_ && queue_size < attempt->elements_requested) {
-                  // If we don't have enough for a full dequeue, we have
-                  // to reset the attempt tuple.
-                  if (!attempt->tuple.empty()) {
-                    // Restore already-dequeued elements to the front of the
-                    // queue.
-                    for (int64 i = attempt->tuple[0].dim_size(0) -
-                                   attempt->elements_requested - 1;
-                         i >= 0; --i) {
-                      for (int j = 0; j < num_components(); ++j) {
-                        PersistentTensor element;
-                        Status s = GetElementComponentFromBatch(
-                            attempt->tuple, i, j, attempt->context, &element);
-                        if (!s.ok()) {
-                          attempt->context->SetStatus(
-                              errors::DataLoss("Failed to restore element from "
-                                               "partially-dequeued batch "
-                                               "to FIFOQueue: ",
-                                               s.error_message()));
-                        }
-                        queues_[j].push_front(element);
-                      }
-                    }
-                  }
-                  if (allow_small_batch && !queues_[0].empty()) {
-                    // Request all remaining elements in the queue.
-                    queue_size = queues_[0].size();
-                    attempt->tuple.clear();
-                    attempt->elements_requested = queue_size;
-                  } else {
-                    if (allow_small_batch) {
-                      // There may be some other attempts containing
-                      // values.  If so, we'll yield and wait for them
-                      // to add elements to the queue.
-                      if (!enqueue_attempts_.empty()) return kProgress;
-                    }
-                    if (attempt->context->status().ok()) {
-                      attempt->context->SetStatus(errors::OutOfRange(
-                          "FIFOQueue '", name_, "' is closed and has ",
-                          "insufficient elements (requested ",
-                          attempt->elements_requested, ", current size ",
-                          queue_size, ")"));
+            if (closed_ && queue_size < attempt->elements_requested) {
+              // If we don't have enough for a full dequeue, we have
+              // to reset the attempt tuple.
+              if (!attempt->tuple.empty()) {
+                // Restore already-dequeued elements to the front of the
+                // queue.
+                for (int64 i = attempt->tuple[0].dim_size(0) -
+                               attempt->elements_requested - 1;
+                     i >= 0; --i) {
+                  for (int j = 0; j < num_components(); ++j) {
+                    PersistentTensor element;
+                    Status s = GetElementComponentFromBatch(
+                        attempt->tuple, i, j, attempt->context, &element);
+                    if (!s.ok()) {
+                      attempt->context->SetStatus(
+                          errors::DataLoss("Failed to restore element from "
+                                           "partially-dequeued batch "
+                                           "to FIFOQueue: ",
+                                           s.error_message()));
                     }
-                    return kComplete;
+                    queues_[j].push_front(element);
                   }
                 }
+              }
+              if (allow_small_batch && !queues_[0].empty()) {
+                // Request all remaining elements in the queue.
+                queue_size = queues_[0].size();
+                attempt->tuple.clear();
+                attempt->elements_requested = queue_size;
+              } else {
+                if (allow_small_batch) {
+                  // There may be some other attempts containing
+                  // values.  If so, we'll yield and wait for them
+                  // to add elements to the queue.
+                  if (!enqueue_attempts_.empty()) return kProgress;
+                }
+                if (attempt->context->status().ok()) {
+                  attempt->context->SetStatus(errors::OutOfRange(
+                      "FIFOQueue '", name_, "' is closed and has ",
+                      "insufficient elements (requested ",
+                      attempt->elements_requested, ", current size ",
+                      queue_size, ")"));
+                }
+                return kComplete;
+              }
+            }
 
-                RunResult result = kNoProgress;
-                for (; queue_size > 0; --queue_size) {
-                  if (attempt->tuple.empty()) {
-                    // Only allocate tuple when we have something to dequeue
-                    // so we don't use excessive memory when there are many
-                    // blocked dequeue attempts waiting.
-                    attempt->tuple.reserve(num_components());
-                    for (int i = 0; i < num_components(); ++i) {
-                      const TensorShape shape =
-                          ManyOutShape(i, attempt->elements_requested);
-                      Tensor element;
-                      attempt->context->SetStatus(
-                          attempt->context->allocate_temp(component_dtypes_[i],
-                                                          shape, &element));
-                      if (!attempt->context->status().ok()) return kComplete;
-                      attempt->tuple.emplace_back(element);
-                    }
-                  }
-                  result = kProgress;
-                  Tuple tuple;
-                  DequeueLocked(attempt->context, &tuple);
-                  const int64 index = attempt->tuple[0].dim_size(0) -
-                                      attempt->elements_requested;
-                  for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
-                        std::move(tuple[i]), &attempt->tuple[i], index));
-                    if (!attempt->context->status().ok()) return kComplete;
-                  }
-                  tuple.clear();
-                  --attempt->elements_requested;
-                  if (attempt->elements_requested == 0) {
-                    tuple = attempt->tuple;
-                    attempt->done_callback = [callback, tuple]() {
-                      callback(tuple);
-                    };
-                    return kComplete;
-                  }
+            RunResult result = kNoProgress;
+            for (; queue_size > 0; --queue_size) {
+              if (attempt->tuple.empty()) {
+                // Only allocate tuple when we have something to dequeue
+                // so we don't use excessive memory when there are many
+                // blocked dequeue attempts waiting.
+                attempt->tuple.reserve(num_components());
+                for (int i = 0; i < num_components(); ++i) {
+                  const TensorShape shape =
+                      ManyOutShape(i, attempt->elements_requested);
+                  Tensor element;
+                  attempt->context->SetStatus(attempt->context->allocate_temp(
+                      component_dtypes_[i], shape, &element));
+                  if (!attempt->context->status().ok()) return kComplete;
+                  attempt->tuple.emplace_back(element);
                 }
-                return result;
-              });
+              }
+              result = kProgress;
+              Tuple tuple;
+              DequeueLocked(attempt->context, &tuple);
+              const int64 index =
+                  attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+              for (int i = 0; i < num_components(); ++i) {
+                attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                    std::move(tuple[i]), &attempt->tuple[i], index));
+                if (!attempt->context->status().ok()) return kComplete;
+              }
+              tuple.clear();
+              --attempt->elements_requested;
+              if (attempt->elements_requested == 0) {
+                tuple = attempt->tuple;
+                attempt->done_callback = [callback, tuple]() {
+                  callback(tuple);
+                };
+                return kComplete;
+              }
+            }
+            return result;
+          });
     }
   }
   if (!already_cancelled) {
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index bde39770de..7090417dfd 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
 
@@ -60,7 +60,7 @@ DEFINE_SETZERO_CPU(Variant);
 template <typename T>
 void SetZeroFunctor<Eigen::SyclDevice, T>::operator()(
     const Eigen::SyclDevice& d, typename TTypes<T>::Flat out) {
-      To32Bit(out).device(d) = To32Bit(out).constant(T(0));
+  To32Bit(out).device(d) = To32Bit(out).constant(T(0));
 }
 
 #define DEFINE_SETZERO_SYCL(T) \
@@ -118,7 +118,8 @@ DEFINE_SETONE_SYCL(double);
 
 template <typename T>
 struct FillFunctor<Eigen::ThreadPoolDevice, T> {
-  void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes<T>::Flat out,
+  void operator()(const Eigen::ThreadPoolDevice& d,
+                  typename TTypes<T>::Flat out,
                   typename TTypes<T>::ConstScalar in) {
     out.device(d) = out.constant(in());
   }
@@ -150,8 +151,7 @@ struct FillFunctor<Eigen::SyclDevice, T> {
   }
 };
 
-#define DEFINE_FILL_SYCL(T) \
-  template struct FillFunctor<Eigen::SyclDevice, T>;
+#define DEFINE_FILL_SYCL(T) template struct FillFunctor<Eigen::SyclDevice, T>;
 DEFINE_FILL_SYCL(float);
 DEFINE_FILL_SYCL(double);
 TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL)
diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc
index 47f4189c30..135d002345 100644
--- a/tensorflow/core/kernels/fractional_avg_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc
@@ -232,8 +232,9 @@ class FractionalAvgPoolGradOp : public OpKernel {
 
     // Grab the inputs.
     const Tensor& orig_input_tensor_shape = context->input(0);
-    OP_REQUIRES(context, orig_input_tensor_shape.dims() == 1 &&
-                             orig_input_tensor_shape.NumElements() == 4,
+    OP_REQUIRES(context,
+                orig_input_tensor_shape.dims() == 1 &&
+                    orig_input_tensor_shape.NumElements() == 4,
                 errors::InvalidArgument("original input tensor shape must be"
                                         "1-dimensional and 4 elements"));
     const Tensor& out_backprop = context->input(1);
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index ef9e848413..9d4bc35ba8 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -253,22 +253,21 @@ class SymbolicGradientOp : public AsyncOpKernel {
       args.push_back(ctx->input(i));
     }
     std::vector<Tensor>* rets = new std::vector<Tensor>;
-    lib->Run(
-        opts, handle, args, rets, [ctx, done, rets](const Status& status) {
-          if (!status.ok()) {
-            ctx->SetStatus(status);
-          } else if (rets->size() != ctx->num_outputs()) {
-            ctx->SetStatus(errors::InvalidArgument(
-                "SymGrad expects to return ", ctx->num_outputs(),
-                " tensor(s), but get ", rets->size(), " tensor(s) instead."));
-          } else {
-            for (size_t i = 0; i < rets->size(); ++i) {
-              ctx->set_output(i, (*rets)[i]);
-            }
-          }
-          delete rets;
-          done();
-        });
+    lib->Run(opts, handle, args, rets, [ctx, done, rets](const Status& status) {
+      if (!status.ok()) {
+        ctx->SetStatus(status);
+      } else if (rets->size() != ctx->num_outputs()) {
+        ctx->SetStatus(errors::InvalidArgument(
+            "SymGrad expects to return ", ctx->num_outputs(),
+            " tensor(s), but get ", rets->size(), " tensor(s) instead."));
+      } else {
+        for (size_t i = 0; i < rets->size(); ++i) {
+          ctx->set_output(i, (*rets)[i]);
+        }
+      }
+      delete rets;
+      done();
+    });
   }
 
  private:
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
index a8484390b9..4a67b2b3a3 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
@@ -68,7 +68,8 @@ void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
 template <class T>
 void SetNanFunctor<T>::operator()(const Eigen::GpuDevice& d,
                                   typename TTypes<T>::Flat out) {
-  To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
+  To32Bit(out).device(d) =
+      To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
 }
 
 template class VarianceToInvVariance<float>;
diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc
index dde08b37ea..e6fefe643b 100644
--- a/tensorflow/core/kernels/gather_functor.cc
+++ b/tensorflow/core/kernels/gather_functor.cc
@@ -25,12 +25,12 @@ typedef Eigen::GpuDevice GPUDevice;
 namespace functor {
 
 // Forward declarations of the functor specializations for GPU.
-#define DECLARE_GPU_SPECS_INDEX(T, Index)                             \
-  template <>                                                         \
-  int64 GatherFunctor<GPUDevice, T, Index>::operator()(               \
+#define DECLARE_GPU_SPECS_INDEX(T, Index)                               \
+  template <>                                                           \
+  int64 GatherFunctor<GPUDevice, T, Index>::operator()(                 \
       OpKernelContext* ctx, typename TTypes<T, 3>::ConstTensor Tparams, \
-      typename TTypes<Index>::ConstFlat Tindices,                     \
-      typename TTypes<T, 3>::Tensor Tout);                            \
+      typename TTypes<Index>::ConstFlat Tindices,                       \
+      typename TTypes<T, 3>::Tensor Tout);                              \
   extern template struct GatherFunctor<GPUDevice, T, Index>;
 
 #define DECLARE_GPU_SPECS(T)         \
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 1e429a037e..16ccb03b85 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -18,12 +18,12 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/prefetch.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -52,21 +52,23 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
   const size_t slice_bytes = slice_elems * sizeof(T);
   auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
   mutex mu;
-  // Store the value of invalidate index for printing error information, it's a shared variable.
+  // Store the value of invalidate index for printing error information, it's a
+  // shared variable.
   SliceIndex result = -1;
-  auto work = [&] (int64 start, int64 end) {
+  auto work = [&](int64 start, int64 end) {
     SliceIndex batch_idx = static_cast<SliceIndex>(start / indices_size);
     SliceIndex indices_idx = static_cast<SliceIndex>(start % indices_size);
     SliceIndex batch_idx_end = static_cast<SliceIndex>(end / indices_size);
     SliceIndex indices_idx_end = static_cast<SliceIndex>(end % indices_size);
 
     while ((batch_idx < batch_idx_end) ||
-            (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) {
+           (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) {
       SliceIndex i_next = indices_idx + 1;
       SliceIndex b_next = batch_idx + 1;
       if ((batch_idx == batch_idx_end && i_next < indices_idx_end) ||
-              (i_next < indices_size)) {
-        port::prefetch<port::PREFETCH_HINT_T0>(&params(batch_idx, indices(i_next), 0));
+          (i_next < indices_size)) {
+        port::prefetch<port::PREFETCH_HINT_T0>(
+            &params(batch_idx, indices(i_next), 0));
         port::prefetch<port::PREFETCH_HINT_T0>(&out(batch_idx, i_next, 0));
         b_next = batch_idx;
       } else if (b_next <= batch_idx_end) {
@@ -85,11 +87,12 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
       // ahead-of-time compilation binary size).
       if (is_simple_type<T>::value) {
         // Avoid auto-promotion to Index from SliceIndex by casting.
-        memcpy(out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
-               params_base + (batch_idx * static_cast<SliceIndex>(limit) +
-                              static_cast<SliceIndex>(index)) *
-                             slice_elems,
-               slice_bytes);
+        memcpy(
+            out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
+            params_base + (batch_idx * static_cast<SliceIndex>(limit) +
+                           static_cast<SliceIndex>(index)) *
+                              slice_elems,
+            slice_bytes);
       } else {
         // For non-"simple" types (e.g. strings).
         out.template chip<1>(indices_idx) = params.template chip<1>(index);
@@ -99,8 +102,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
     }
   };
 
-  Shard(worker_threads->num_threads, worker_threads->workers, batch_size*indices_size,
-        slice_elems * sizeof(T), work);
+  Shard(worker_threads->num_threads, worker_threads->workers,
+        batch_size * indices_size, slice_elems * sizeof(T), work);
   return result;
 }
 
@@ -117,16 +120,16 @@ struct GatherFunctorCPU {
     bool use_large = (slice_size > std::numeric_limits<int32>::max() ||
                       params.size() > std::numeric_limits<int32>::max() ||
                       N > std::numeric_limits<int32>::max());
-#define CALL(elems)                                                        \
-  do {                                                                     \
-    if (use_large) {                                                       \
-      bad_i = HandleCopies<T, Index, int64, elems>(ctx, params, indices,   \
-                                                   slice_size, out);       \
-    } else {                                                               \
-      const int32 small_slice = static_cast<int32>(slice_size);            \
-      bad_i = HandleCopies<T, Index, int32, elems>(ctx, params, indices,   \
-                                                   small_slice, out);      \
-    }                                                                      \
+#define CALL(elems)                                                      \
+  do {                                                                   \
+    if (use_large) {                                                     \
+      bad_i = HandleCopies<T, Index, int64, elems>(ctx, params, indices, \
+                                                   slice_size, out);     \
+    } else {                                                             \
+      const int32 small_slice = static_cast<int32>(slice_size);          \
+      bad_i = HandleCopies<T, Index, int32, elems>(ctx, params, indices, \
+                                                   small_slice, out);    \
+    }                                                                    \
   } while (0)
 
     if (slice_size == 10)
@@ -143,7 +146,8 @@ struct GatherFunctorCPU {
 
 template <typename Device, typename T, typename Index>
 struct GatherFunctor {
-  int64 operator()(OpKernelContext* ctx, typename TTypes<T, 3>::ConstTensor params,
+  int64 operator()(OpKernelContext* ctx,
+                   typename TTypes<T, 3>::ConstTensor params,
                    typename TTypes<Index>::ConstFlat indices,
                    typename TTypes<T, 3>::Tensor out);
 };
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 239d5d2e99..d6cbcf1d93 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -106,8 +106,7 @@ class GatherOp : public OpKernel {
       auto out_flat = out->shaped<T, 3>({outer_size, N, inner_size});
 
       functor::GatherFunctor<Device, T, Index> functor;
-      int64 bad_i = functor(c, params_flat,
-                            indices_flat, out_flat);
+      int64 bad_i = functor(c, params_flat, indices_flat, out_flat);
 
       OP_REQUIRES(
           c, bad_i < 0,
diff --git a/tensorflow/core/kernels/hinge-loss.h b/tensorflow/core/kernels/hinge-loss.h
index 789a7ce7a3..d303e9c877 100644
--- a/tensorflow/core/kernels/hinge-loss.h
+++ b/tensorflow/core/kernels/hinge-loss.h
@@ -50,9 +50,8 @@ class HingeLossUpdater : public DualLossUpdater {
     // valid value for new dual = 0
     // c. new optimal value > 1.0. Then new optimal value should be set to 1.0.
     const double candidate_optimal_dual =
-        current_dual +
-        (label - wx) /
-            (num_loss_partitions * example_weight * weighted_example_norm);
+        current_dual + (label - wx) / (num_loss_partitions * example_weight *
+                                       weighted_example_norm);
     if (label * candidate_optimal_dual < 0) {
       return 0.0;
     }
diff --git a/tensorflow/core/kernels/histogram_op_gpu.cu.cc b/tensorflow/core/kernels/histogram_op_gpu.cu.cc
index c2bb958be8..a88e9b0ddc 100644
--- a/tensorflow/core/kernels/histogram_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/histogram_op_gpu.cu.cc
@@ -17,16 +17,16 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/histogram_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "external/cub_archive/cub/device/device_histogram.cuh"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/histogram_op.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -104,8 +104,8 @@ struct HistogramFixedWidthFunctor<GPUDevice, T, Tout> {
         /* num_samples */ num_samples,
         /* stream */ stream);
     if (err != cudaSuccess) {
-      return errors::Internal("Could not launch HistogramRange: ",
-                              cudaGetErrorString(err), ".");
+      return errors::Internal(
+          "Could not launch HistogramRange: ", cudaGetErrorString(err), ".");
     }
 
     return Status::OK();
diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h
index f088315ff5..faf997be05 100644
--- a/tensorflow/core/kernels/image_resizer_state.h
+++ b/tensorflow/core/kernels/image_resizer_state.h
@@ -109,8 +109,9 @@ struct ImageResizerState {
     ValidateAndCalculateOutputSize(context, input);
     if (!context->status().ok()) return;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({input.dim_size(0), out_height,
-                                                out_width, input.dim_size(3)}),
+                                0,
+                                TensorShape({input.dim_size(0), out_height,
+                                             out_width, input.dim_size(3)}),
                                 &output));
   }
 
@@ -168,8 +169,9 @@ struct ImageResizerGradientState {
         CalculateResizeScale(original_width, resized_width, align_corners_);
     output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({batch_size, original_height,
-                                                original_width, channels}),
+                                0,
+                                TensorShape({batch_size, original_height,
+                                             original_width, channels}),
                                 &output));
   }
 
diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
index e2861ae090..c37055239c 100644
--- a/tensorflow/core/kernels/in_topk_op.cc
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -98,36 +98,36 @@ class InTopK : public OpKernel {
   int k_;
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("InTopK").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("precision")
-    .TypeConstraint<int32>("T"),
-    InTopK<float, int32>);
-REGISTER_KERNEL_BUILDER(
-    Name("InTopK").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("precision")
-    .TypeConstraint<int64>("T"),
-    InTopK<float, int64>);
-
-REGISTER_KERNEL_BUILDER(
-    Name("InTopKV2").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("k")
-    .HostMemory("precision")
-    .TypeConstraint<int32>("T"),
-    InTopK<float, int32>);
-REGISTER_KERNEL_BUILDER(
-    Name("InTopKV2").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("k")
-    .HostMemory("precision")
-    .TypeConstraint<int64>("T"),
-    InTopK<float, int64>);
+REGISTER_KERNEL_BUILDER(Name("InTopK")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("precision")
+                            .TypeConstraint<int32>("T"),
+                        InTopK<float, int32>);
+REGISTER_KERNEL_BUILDER(Name("InTopK")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("precision")
+                            .TypeConstraint<int64>("T"),
+                        InTopK<float, int64>);
+
+REGISTER_KERNEL_BUILDER(Name("InTopKV2")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("k")
+                            .HostMemory("precision")
+                            .TypeConstraint<int32>("T"),
+                        InTopK<float, int32>);
+REGISTER_KERNEL_BUILDER(Name("InTopKV2")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("k")
+                            .HostMemory("precision")
+                            .TypeConstraint<int64>("T"),
+                        InTopK<float, int64>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 7728ba850c..a71d047ed1 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -27,13 +27,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SyclDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 
 template <typename Device, typename T>
-Status DoParallelConcatUpdate(const Device& d, const Tensor& value,
-                              int32 loc, Tensor* output) {
+Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc,
+                              Tensor* output) {
   auto Tvalue = value.shaped<T, 2>({1, value.NumElements()});
   auto Toutput = output->flat_outer_dims<T>();
   auto nrows = Toutput.dimension(0);
@@ -74,7 +74,7 @@ Status DoParallelConcat(const SyclDevice& d, const Tensor& value, int32 loc,
       return errors::InvalidArgument("Unsupported data type: ", value.dtype());
   }
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
 
@@ -207,7 +207,7 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
                         ParallelConcatUpdate<CPUDevice>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc
index f8ed935157..f561287f7a 100644
--- a/tensorflow/core/kernels/l2loss_op.cc
+++ b/tensorflow/core/kernels/l2loss_op.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/l2loss_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc
index 36907fb571..b58bcf5834 100644
--- a/tensorflow/core/kernels/linalg_ops_common.cc
+++ b/tensorflow/core/kernels/linalg_ops_common.cc
@@ -108,7 +108,6 @@ void LinearAlgebraOp<Scalar>::Compute(OpKernelContext* context) {
   auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
   Shard(worker_threads.num_threads, worker_threads.workers,
         batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard);
-
 }
 
 template <typename Scalar>
diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc
index 31a427f2c9..1335a95dce 100755
--- a/tensorflow/core/kernels/lmdb_reader_op.cc
+++ b/tensorflow/core/kernels/lmdb_reader_op.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/framework/reader_base.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 #include <sys/stat.h>
@@ -77,15 +77,13 @@ class LMDBReader : public ReaderBase {
         *at_end = true;
         return Status::OK();
       }
-    }
-    else {
+    } else {
       if (Seek(MDB_NEXT) == false) {
         *at_end = true;
         return Status::OK();
       }
     }
-    *key = string(static_cast<const char*>(mdb_key_.mv_data),
-                  mdb_key_.mv_size);
+    *key = string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
     *value = string(static_cast<const char*>(mdb_value_.mv_data),
                     mdb_value_.mv_size);
     *produced = true;
@@ -123,13 +121,10 @@ class LMDBReaderOp : public ReaderOpKernel {
   explicit LMDBReaderOp(OpKernelConstruction* context)
       : ReaderOpKernel(context) {
     Env* env = context->env();
-    SetReaderFactory([this, env]() {
-      return new LMDBReader(name(), env);
-    });
+    SetReaderFactory([this, env]() { return new LMDBReader(name(), env); });
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU),
-                        LMDBReaderOp);
+REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), LMDBReaderOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/logistic-loss.h b/tensorflow/core/kernels/logistic-loss.h
index 2765f42bbd..6479e6f5dc 100644
--- a/tensorflow/core/kernels/logistic-loss.h
+++ b/tensorflow/core/kernels/logistic-loss.h
@@ -122,10 +122,9 @@ class LogisticLossUpdater : public DualLossUpdater {
                              num_loss_partitions * weighted_example_norm *
                                  example_weight *
                                  (0.5 * (1 + tanhx) / label - current_dual);
-    const double denominator = -2 * label -
-                               num_loss_partitions * weighted_example_norm *
-                                   example_weight * (1 - tanhx * tanhx) * 0.5 /
-                                   label;
+    const double denominator =
+        -2 * label - num_loss_partitions * weighted_example_norm *
+                         example_weight * (1 - tanhx * tanhx) * 0.5 / label;
     return x - numerator / denominator;
   }
 };
diff --git a/tensorflow/core/kernels/loss_test.cc b/tensorflow/core/kernels/loss_test.cc
index 89f0677e1f..460d65c5c2 100644
--- a/tensorflow/core/kernels/loss_test.cc
+++ b/tensorflow/core/kernels/loss_test.cc
@@ -32,14 +32,17 @@ namespace {
 
 TEST(LogisticLoss, ComputePrimalLoss) {
   LogisticLossUpdater loss_updater;
-  EXPECT_NEAR(0.693147, loss_updater.ComputePrimalLoss(
-                            0 /* wx */, 1 /* label */, 1 /* example weight */),
+  EXPECT_NEAR(0.693147,
+              loss_updater.ComputePrimalLoss(0 /* wx */, 1 /* label */,
+                                             1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */,
-                                                  1 /* example weight */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */,
+                                             1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */,
-                                                  1 /* example weight */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */,
+                                             1 /* example weight */),
               1e-3);
 }
 
@@ -53,31 +56,35 @@ TEST(LogisticLoss, ComputeDualLoss) {
               loss_updater.ComputeDualLoss(1 /* current dual */, 1 /* label */,
                                            1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(-0.693147, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                      1 /* label */,
-                                                      1 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -0.693147,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1 /* label */,
+                                   1 /* example weight */),
+      1e-3);
 }
 
 TEST(LogisticLoss, ComputeUpdatedDual) {
   LogisticLossUpdater loss_updater;
-  EXPECT_NEAR(0.479, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.5 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.479,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.5 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.031, loss_updater.ComputeUpdatedDual(
-                          2 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, 0.1 /* current_dual */,
-                          -0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.031,
+              loss_updater.ComputeUpdatedDual(
+                  2 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, 0.1 /* current_dual */,
+                  -0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(SquaredLoss, ComputePrimalLoss) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(0.5,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(40.5,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -95,43 +102,50 @@ TEST(SquaredLoss, ComputePrimalLoss) {
 
 TEST(SquaredLoss, ComputeDualLoss) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(0.66, loss_updater.ComputeDualLoss(0.2 /* current dual */,
-                                                 -1.0 /* label */,
-                                                 3.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(-0.375, loss_updater.ComputeDualLoss(1.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   1.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      0.66,
+      loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      -0.375,
+      loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      -1.125,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(SquaredLoss, ComputeUpdatedDual) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.3 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.336,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.3 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual(
-                          5 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.427,
+              loss_updater.ComputeUpdatedDual(
+                  5 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(HingeLoss, ComputePrimalLoss) {
   HingeLossUpdater loss_updater;
-  EXPECT_NEAR(1.0, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(1.0,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(0.0,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -149,10 +163,11 @@ TEST(HingeLoss, ComputePrimalLoss) {
 
 TEST(HingeLoss, ComputeDualLoss) {
   HingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
   EXPECT_NEAR(
       std::numeric_limits<double>::max(),
       loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
@@ -163,10 +178,11 @@ TEST(HingeLoss, ComputeDualLoss) {
       loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
                                    1.0 /* example weight */),
       1e-3);
-  EXPECT_NEAR(-1.5, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                 1.0 /* label */,
-                                                 3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -1.5,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(HingeLoss, ConvertLabel) {
@@ -195,28 +211,31 @@ TEST(HingeLoss, ComputeUpdatedDual) {
   // weighted_example_norm=100.0, it turns out that the optimal value to update
   // the dual to is 0.507 which is within the permitted range and thus should be
   // the value returned.
-  EXPECT_NEAR(0.507, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.5 /* current_dual */,
-                         0.3 /* wx */, 100.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.507,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.5 /* current_dual */,
+                  0.3 /* wx */, 100.0 /* weighted_example_norm */),
               1e-3);
   // When label=-1.0, example_weight=1.0, current_dual=0.4, wx=0.6,
   // weighted_example_norm=10.0 and num_loss_partitions=10, it turns out that
   // the optimal value to update the dual to is 0.384 which is within the
   // permitted range and thus should be the value returned.
-  EXPECT_NEAR(-0.416, loss_updater.ComputeUpdatedDual(
-                          10 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.6 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.416,
+              loss_updater.ComputeUpdatedDual(
+                  10 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.6 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
   // When label=1.0, example_weight=1.0, current_dual=-0.5, wx=0.3 and
   // weighted_example_norm=10.0, it turns out that the optimal value to update
   // the dual to is -0.43. However, this is outside the allowed [0.0, 1.0] range
   // and hence the closest permitted value (0.0) should be returned instead.
-  EXPECT_NEAR(0.0, loss_updater.ComputeUpdatedDual(
-                       1 /* num partitions */, 1.0 /* label */,
-                       1.0 /* example weight */, -0.5 /* current_dual */,
-                       0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, -0.5 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
   // When label=-1.0, example_weight=2.0, current_dual=-1.0, wx=0.3 and
@@ -224,17 +243,19 @@ TEST(HingeLoss, ComputeUpdatedDual) {
   // the dual to is -1.065. However, this is outside the allowed [-1.0, 0.0]
   // range and hence the closest permitted value (-1.0) should be returned
   // instead.
-  EXPECT_NEAR(-1.0, loss_updater.ComputeUpdatedDual(
-                        1 /* num partitions */, -1.0 /* label */,
-                        2.0 /* example weight */, -1.0 /* current_dual */,
-                        0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-1.0,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, -1.0 /* label */,
+                  2.0 /* example weight */, -1.0 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(SmoothHingeLoss, ComputePrimalLoss) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(0.5,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(0.0,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -252,10 +273,11 @@ TEST(SmoothHingeLoss, ComputePrimalLoss) {
 
 TEST(SmoothHingeLoss, ComputeDualLoss) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
   EXPECT_NEAR(
       std::numeric_limits<double>::max(),
       loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
@@ -266,24 +288,27 @@ TEST(SmoothHingeLoss, ComputeDualLoss) {
       loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
                                    1.0 /* example weight */),
       1e-3);
-  EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -1.125,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(SmoothHingeLoss, ComputeUpdatedDual) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.3 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.336,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.3 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual(
-                          5 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.427,
+              loss_updater.ComputeUpdatedDual(
+                  5 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index c905ebc84a..c3a59c9576 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -229,10 +229,11 @@ class LRNOp : public OpKernel {
   explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     float tmp;
     OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
@@ -247,9 +248,10 @@ class LRNOp : public OpKernel {
     const Tensor& in = context->input(0);
     OP_REQUIRES(context, in.dims() == 4,
                 errors::InvalidArgument("in must be 4-dimensional"));
-    OP_REQUIRES(context, FastBoundsCheck(in.NumElements(),
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("argument to LRN too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(in.NumElements(), std::numeric_limits<int>::max()),
+        errors::InvalidArgument("argument to LRN too large"));
     // Cast to platform-specific int to avoid conversion warnings.
     const int batch = static_cast<int>(in.dim_size(0));
     const int rows = static_cast<int>(in.dim_size(1));
@@ -448,10 +450,11 @@ class LRNGradOp : public OpKernel {
   explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     float tmp;
     OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc
index 5eb060f664..cdff7bad5f 100644
--- a/tensorflow/core/kernels/matching_files_op.cc
+++ b/tensorflow/core/kernels/matching_files_op.cc
@@ -45,15 +45,14 @@ class MatchingFilesOp : public OpKernel {
     int num_files = 0;
     std::vector<std::vector<string>> all_fnames(num_patterns);
     for (int i = 0; i < num_patterns; i++) {
-      OP_REQUIRES_OK(
-          context,
-          context->env()->GetMatchingPaths(patterns(i), &all_fnames[i]));
+      OP_REQUIRES_OK(context, context->env()->GetMatchingPaths(patterns(i),
+                                                               &all_fnames[i]));
       num_files += all_fnames[i].size();
     }
     Tensor* output_t = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(
-                       "filenames", TensorShape({num_files}), &output_t));
+    OP_REQUIRES_OK(
+        context, context->allocate_output("filenames", TensorShape({num_files}),
+                                          &output_t));
     auto output = output_t->vec<string>();
     int index = 0;
     for (int i = 0; i < num_patterns; ++i) {
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index cb68690f28..f499ce6519 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -261,12 +261,12 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
       std::vector<int64>* algorithms, bool use_autotune, Tensor* out) {
     using perftools::gputools::blas::AlgorithmConfig;
     using perftools::gputools::blas::ComputationType;
-    using perftools::gputools::blas::ProfileResult;
-    using perftools::gputools::blas::Transpose;
     using perftools::gputools::blas::kDefaultAlgorithm;
     using perftools::gputools::blas::kDefaultBlasGemm;
     using perftools::gputools::blas::kDefaultBlasGemv;
     using perftools::gputools::blas::kNoAlgorithm;
+    using perftools::gputools::blas::ProfileResult;
+    using perftools::gputools::blas::Transpose;
     Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose};
     const uint64 m = a.dim_size(1 - dim_pair[0].first);
     const uint64 k = a.dim_size(dim_pair[0].first);
diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h
index 6398da2fb9..628895ca86 100644
--- a/tensorflow/core/kernels/matmul_op.h
+++ b/tensorflow/core/kernels/matmul_op.h
@@ -30,7 +30,8 @@ struct MatMulTypes {
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned>
       out_type;
   typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
-                           Eigen::Aligned> in_type;
+                           Eigen::Aligned>
+      in_type;
 };
 
 template <typename Device, typename In0, typename In1, typename Out,
diff --git a/tensorflow/core/kernels/matrix_exponential_op.cc b/tensorflow/core/kernels/matrix_exponential_op.cc
index 4cc3f32f7e..99db898301 100644
--- a/tensorflow/core/kernels/matrix_exponential_op.cc
+++ b/tensorflow/core/kernels/matrix_exponential_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-
 namespace tensorflow {
 
 template <class Scalar>
@@ -40,7 +39,8 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> {
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& input = inputs[0];
     if (input.rows() == 0) return;
-    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using Matrix =
+        Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
     Matrix tmp = input;
     outputs->at(0) = tmp.exp();
   }
@@ -51,9 +51,9 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> {
 
 REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<float>), float);
 REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<double>), double);
-REGISTER_LINALG_OP("MatrixExponential",
-                   (MatrixExponentialOp<complex64>), complex64);
-REGISTER_LINALG_OP("MatrixExponential",
-                   (MatrixExponentialOp<complex128>), complex128);
+REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex128>),
+                   complex128);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_logarithm_op.cc b/tensorflow/core/kernels/matrix_logarithm_op.cc
index cf0007b5b6..22ca094e24 100644
--- a/tensorflow/core/kernels/matrix_logarithm_op.cc
+++ b/tensorflow/core/kernels/matrix_logarithm_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-
 namespace tensorflow {
 
 template <class Scalar>
@@ -40,7 +39,8 @@ class MatrixLogarithmOp : public LinearAlgebraOp<Scalar> {
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& input = inputs[0];
     if (input.rows() == 0) return;
-    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using Matrix =
+        Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
     Matrix tmp = input;
     outputs->at(0) = tmp.log();
   }
@@ -53,9 +53,9 @@ class MatrixLogarithmOp : public LinearAlgebraOp<Scalar> {
 // logarithm. If all eigenvalues are positive, then this returns the correct
 // logarithm, however checking for positive definiteness adds significant
 // overhead. Therefore at present we only register this Op for complex types.
-REGISTER_LINALG_OP("MatrixLogarithm",
-                   (MatrixLogarithmOp<complex64>), complex64);
-REGISTER_LINALG_OP("MatrixLogarithm",
-                   (MatrixLogarithmOp<complex128>), complex128);
+REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp<complex128>),
+                   complex128);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc
index 9dd665392b..502d593474 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/core/kernels/matrix_set_diag_op.cc
@@ -69,8 +69,8 @@ class MatrixSetDiagOp : public OpKernel {
                 errors::InvalidArgument(
                     "must have diagonal.shape == input.shape[:-2] + "
                     "min(input.shape[-2:]), but received input shape: ",
-                    input_shape.DebugString(), " and diagonal shape: ",
-                    diag_shape.DebugString()));
+                    input_shape.DebugString(),
+                    " and diagonal shape: ", diag_shape.DebugString()));
 
     if (input.NumElements() == 0) {
       // This is a no-op.
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 2eefadad49..9be7408012 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/maxpooling_op.h"
 
 #include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -37,7 +38,6 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
@@ -89,7 +89,6 @@ static void SpatialMaxPoolWithArgMaxHelper(
   //    max value.
   auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
                 &output_arg_max, &out_backprop](int64 start, int64 limit) {
-
     const int32 depth = params.depth;
     const int32 in_rows = params.tensor_in_rows;
     const int32 in_cols = params.tensor_in_cols;
@@ -180,7 +179,6 @@ static void SpatialMaxPoolWithArgMaxHelper(
         input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
       }
     }
-
   };
 
   const int64 shard_cost = params.tensor_in_rows * params.tensor_in_cols *
@@ -567,7 +565,7 @@ class MaxPoolingGradGradOp : public OpKernel {
     //    tensor_out_as_matrix with the corresponding values in
     //    top_diff_as_matrix.
     auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       const int32 depth = params.depth;
       const int32 in_rows = params.tensor_in_rows;
       const int32 in_cols = params.tensor_in_cols;
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index f8daaca4c9..0c7a236b2f 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -450,10 +450,10 @@ bool MaxPoolBackwardWithArgmax<T>::operator()(
     T* bottom_diff, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
   SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
-    kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
+            kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
   MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                     kThreadsPerBlock, 0, d.stream()>>>(
-                                        output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
+      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
   return d.ok();
 }
 
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc
index 9fed01189f..39e60c9fce 100644
--- a/tensorflow/core/kernels/meta_support.cc
+++ b/tensorflow/core/kernels/meta_support.cc
@@ -98,9 +98,9 @@ typedef gemmlowp::meta::SimpleContext<gemmlowp::WorkersPool> LocalContext;
 template <typename Context, typename Params>
 void MultiThreadGemm(Context* context, const Params& params) {
   if (params.m <= 4) {
-      gemmlowp::meta::MultiThreadGemm<
-          Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params,
-          1, 8, 8>(context, params);
+    gemmlowp::meta::MultiThreadGemm<
+        Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, 1,
+        8, 8>(context, params);
   } else {
     if (params.m >= params.n) {
       gemmlowp::meta::MultiThreadGemm<
diff --git a/tensorflow/core/kernels/mfcc.cc b/tensorflow/core/kernels/mfcc.cc
index 2793005aa2..8c755e0df8 100644
--- a/tensorflow/core/kernels/mfcc.cc
+++ b/tensorflow/core/kernels/mfcc.cc
@@ -27,21 +27,19 @@ const double kFilterbankFloor = 1e-12;
 const int kDefaultFilterbankChannelCount = 40;
 const int kDefaultDCTCoefficientCount = 13;
 
-Mfcc::Mfcc() : initialized_(false),
-               lower_frequency_limit_(kDefaultLowerFrequencyLimit),
-               upper_frequency_limit_(kDefaultUpperFrequencyLimit),
-               filterbank_channel_count_(kDefaultFilterbankChannelCount),
-               dct_coefficient_count_(kDefaultDCTCoefficientCount) { }
+Mfcc::Mfcc()
+    : initialized_(false),
+      lower_frequency_limit_(kDefaultLowerFrequencyLimit),
+      upper_frequency_limit_(kDefaultUpperFrequencyLimit),
+      filterbank_channel_count_(kDefaultFilterbankChannelCount),
+      dct_coefficient_count_(kDefaultDCTCoefficientCount) {}
 
-bool Mfcc::Initialize(int input_length,
-                      double input_sample_rate) {
-  bool initialized = mel_filterbank_.Initialize(input_length,
-                                                input_sample_rate,
-                                                filterbank_channel_count_,
-                                                lower_frequency_limit_,
-                                                upper_frequency_limit_);
-  initialized &= dct_.Initialize(filterbank_channel_count_,
-                                 dct_coefficient_count_);
+bool Mfcc::Initialize(int input_length, double input_sample_rate) {
+  bool initialized = mel_filterbank_.Initialize(
+      input_length, input_sample_rate, filterbank_channel_count_,
+      lower_frequency_limit_, upper_frequency_limit_);
+  initialized &=
+      dct_.Initialize(filterbank_channel_count_, dct_coefficient_count_);
   initialized_ = initialized;
   return initialized;
 }
diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h
index 8268f47203..8eee76f7f0 100644
--- a/tensorflow/core/kernels/mfcc.h
+++ b/tensorflow/core/kernels/mfcc.h
@@ -20,18 +20,17 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/mfcc_dct.h"
 #include "tensorflow/core/kernels/mfcc_mel_filterbank.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
 
 class Mfcc {
  public:
   Mfcc();
-  bool Initialize(int input_length,
-                  double input_sample_rate);
+  bool Initialize(int input_length, double input_sample_rate);
 
   // Input is a single squared-magnitude spectrogram frame. The input spectrum
   // is converted to linear magnitude and weighted into bands using a
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.cc b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
index 630de8a5a3..3db3b51e8b 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.cc
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
@@ -38,13 +38,12 @@ namespace tensorflow {
 
 MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {}
 
-bool MfccMelFilterbank::Initialize(int input_length,
-                               double input_sample_rate,
-                               int output_channel_count,
-                               double lower_frequency_limit,
-                               double upper_frequency_limit) {
+bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate,
+                                   int output_channel_count,
+                                   double lower_frequency_limit,
+                                   double upper_frequency_limit) {
   num_channels_ = output_channel_count;
-  sample_rate_  = input_sample_rate;
+  sample_rate_ = input_sample_rate;
   input_length_ = input_length;
 
   if (num_channels_ < 1) {
@@ -85,10 +84,9 @@ bool MfccMelFilterbank::Initialize(int input_length,
   }
 
   // Always exclude DC; emulate HTK.
-  const double hz_per_sbin = 0.5 * sample_rate_ /
-      static_cast<double>(input_length_ - 1);
-  start_index_ = static_cast<int>(1.5 + (lower_frequency_limit /
-                                           hz_per_sbin));
+  const double hz_per_sbin =
+      0.5 * sample_rate_ / static_cast<double>(input_length_ - 1);
+  start_index_ = static_cast<int>(1.5 + (lower_frequency_limit / hz_per_sbin));
   end_index_ = static_cast<int>(upper_frequency_limit / hz_per_sbin);
 
   // Maps the input spectrum bin indices to filter bank channels/indices. For
@@ -121,12 +119,12 @@ bool MfccMelFilterbank::Initialize(int input_length,
       weights_[i] = 0.0;
     } else {
       if (channel >= 0) {
-        weights_[i] = (center_frequencies_[channel + 1] -
-                       FreqToMel(i * hz_per_sbin)) /
+        weights_[i] =
+            (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) /
             (center_frequencies_[channel + 1] - center_frequencies_[channel]);
       } else {
         weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) /
-            (center_frequencies_[0] - mel_low);
+                      (center_frequencies_[0] - mel_low);
       }
     }
   }
@@ -152,16 +150,16 @@ bool MfccMelFilterbank::Initialize(int input_length,
     }
   }
   if (!bad_channels.empty()) {
-    LOG(ERROR) << "Missing " << bad_channels.size() << " bands " <<
-        " starting at " << bad_channels[0] <<
-        " in mel-frequency design. " <<
-        "Perhaps too many channels or " <<
-        "not enough frequency resolution in spectrum. (" <<
-        "input_length: " << input_length <<
-        " input_sample_rate: " << input_sample_rate <<
-        " output_channel_count: " << output_channel_count <<
-        " lower_frequency_limit: " << lower_frequency_limit <<
-        " upper_frequency_limit: " << upper_frequency_limit;
+    LOG(ERROR) << "Missing " << bad_channels.size() << " bands "
+               << " starting at " << bad_channels[0]
+               << " in mel-frequency design. "
+               << "Perhaps too many channels or "
+               << "not enough frequency resolution in spectrum. ("
+               << "input_length: " << input_length
+               << " input_sample_rate: " << input_sample_rate
+               << " output_channel_count: " << output_channel_count
+               << " lower_frequency_limit: " << lower_frequency_limit
+               << " upper_frequency_limit: " << upper_frequency_limit;
   }
   initialized_ = true;
   return true;
@@ -171,7 +169,7 @@ bool MfccMelFilterbank::Initialize(int input_length,
 // square root, then summing FFT magnitudes under triangular integration windows
 // whose widths increase with frequency.
 void MfccMelFilterbank::Compute(const std::vector<double> &input,
-                            std::vector<double> *output) const {
+                                std::vector<double> *output) const {
   if (!initialized_) {
     LOG(ERROR) << "Mel Filterbank not initialized.";
     return;
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h
index 1bdc2dc93b..37c3936e80 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.h
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h
@@ -27,10 +27,8 @@ class MfccMelFilterbank {
  public:
   MfccMelFilterbank();
   bool Initialize(int input_length,  // Number of unique FFT bins fftsize/2+1.
-                  double input_sample_rate,
-                  int output_channel_count,
-                  double lower_frequency_limit,
-                  double upper_frequency_limit);
+                  double input_sample_rate, int output_channel_count,
+                  double lower_frequency_limit, double upper_frequency_limit);
 
   // Takes a squared-magnitude spectrogram slice as input, computes a
   // triangular-mel-weighted linear-magnitude filterbank, and places the result
@@ -56,7 +54,7 @@ class MfccMelFilterbank {
   // FFT bin i contributes to the upper side of mel channel band_mapper_[i]
   std::vector<int> band_mapper_;
   int start_index_;  // Lowest FFT bin used to calculate mel spectrum.
-  int end_index_;  // Highest FFT bin used to calculate mel spectrum.
+  int end_index_;    // Highest FFT bin used to calculate mel spectrum.
 
   TF_DISALLOW_COPY_AND_ASSIGN(MfccMelFilterbank);
 };
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
index 602dfeb4e5..54f31e1699 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
@@ -34,11 +34,9 @@ TEST(MfccMelFilterbankTest, AgreesWithPythonGoldenValues) {
     input.push_back(i + 1);
   }
   const int kChannelCount = 20;
-  filterbank.Initialize(input.size(),
-                        22050 /* sample rate */,
-                        kChannelCount /* channels */,
-                        20.0 /*  lower frequency limit */,
-                        4000.0 /* upper frequency limit */);
+  filterbank.Initialize(
+      input.size(), 22050 /* sample rate */, kChannelCount /* channels */,
+      20.0 /*  lower frequency limit */, 4000.0 /* upper frequency limit */);
 
   std::vector<double> output;
   filterbank.Compute(input, &output);
@@ -65,13 +63,10 @@ TEST(MfccMelFilterbankTest, IgnoresExistingContentOfOutputVector) {
   std::vector<double> input;
   std::vector<double> output;
 
-  filterbank.Initialize(kSampleCount,
-                        22050 /* sample rate */,
-                        20 /* channels */,
-                        20.0 /*  lower frequency limit */,
+  filterbank.Initialize(kSampleCount, 22050 /* sample rate */,
+                        20 /* channels */, 20.0 /*  lower frequency limit */,
                         4000.0 /* upper frequency limit */);
 
-
   // First call with nonzero input value, and an empty output vector,
   // will resize the output and fill it with the correct, nonzero outputs.
   input.assign(kSampleCount, 1.0);
diff --git a/tensorflow/core/kernels/mfcc_test.cc b/tensorflow/core/kernels/mfcc_test.cc
index cb32df8811..72c1d331d6 100644
--- a/tensorflow/core/kernels/mfcc_test.cc
+++ b/tensorflow/core/kernels/mfcc_test.cc
@@ -36,11 +36,10 @@ TEST(MfccTest, AgreesWithPythonGoldenValues) {
   std::vector<double> output;
   mfcc.Compute(input, &output);
 
-  std::vector<double> expected = {29.13970072, -6.41568601, -0.61903012,
-                             -0.96778652, -0.26819878, -0.40907028,
-                             -0.15614748, -0.23203119, -0.10481487,
-                             -0.1543029,  -0.0769791,  -0.10806114,
-                             -0.06047613};
+  std::vector<double> expected = {
+      29.13970072, -6.41568601, -0.61903012, -0.96778652, -0.26819878,
+      -0.40907028, -0.15614748, -0.23203119, -0.10481487, -0.1543029,
+      -0.0769791,  -0.10806114, -0.06047613};
 
   ASSERT_EQ(expected.size(), output.size());
   for (int i = 0; i < output.size(); ++i) {
diff --git a/tensorflow/core/kernels/mirror_pad_op.cc b/tensorflow/core/kernels/mirror_pad_op.cc
index fbdeaf43eb..26e1082989 100644
--- a/tensorflow/core/kernels/mirror_pad_op.cc
+++ b/tensorflow/core/kernels/mirror_pad_op.cc
@@ -87,8 +87,8 @@ class MirrorPadOp : public OpKernel {
       const Tpaddings before = paddings(d, 0);  // Pad before existing elements.
       const Tpaddings after = paddings(d, 1);   // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("paddings must be non-negative: ",
-                                          before, " ", after));
+                  errors::InvalidArgument(
+                      "paddings must be non-negative: ", before, " ", after));
       if (offset_ == 0) {  // SYMMETRIC mode.
         OP_REQUIRES(context,
                     before <= in0.dim_size(d) && after <= in0.dim_size(d),
@@ -296,8 +296,8 @@ class MirrorPadGradOp : public OpKernel {
       const Tpaddings before = paddings(d, 0);  // Pad before existing elements.
       const Tpaddings after = paddings(d, 1);   // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("Paddings must be non-negative: ",
-                                          before, ", ", after));
+                  errors::InvalidArgument(
+                      "Paddings must be non-negative: ", before, ", ", after));
 
       const int64 out_size = in0.dim_size(d) - (before + after);
       if (offset_ == 0) {  // SYMMETRIC mode.
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index d751a70fc8..a7c569ee05 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -26,14 +26,14 @@
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
-using mkldnn::memory;
+using mkldnn::algorithm;
+using mkldnn::engine;
 using mkldnn::error;
-using mkldnn::pooling_forward;
-using mkldnn::pooling_backward;
+using mkldnn::memory;
 using mkldnn::padding_kind;
-using mkldnn::engine;
+using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
 using mkldnn::prop_kind;
-using mkldnn::algorithm;
 #endif
 
 namespace tensorflow {
@@ -358,10 +358,11 @@ class MklAvgPoolingGradOp : public OpKernel {
       if (!outbackprop_in_mkl_format) {
         // For avgpooling, tensor_in_shape should have 1 dimension, and 4
         // elements.
-        OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
-                                 tensor_in_shape.NumElements() == 4,
-                    errors::InvalidArgument("original input shape must be "
-                                            "1-dimensional and 4 elements"));
+        OP_REQUIRES(
+            context,
+            tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
+            errors::InvalidArgument("original input shape must be "
+                                    "1-dimensional and 4 elements"));
 
         // For avgpooling, out_backprop should have 4 dimensions.
         OP_REQUIRES(context, out_backprop.dims() == 4,
@@ -428,14 +429,13 @@ class MklAvgPoolingGradOp : public OpKernel {
   TensorFormat data_format_;
 };  // MklAvgPoolingGradOp
 
-
 #else  // INTEL_MKL_DNN is defined
 
 template <typename Device, typename T>
 class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
  public:
   explicit MklAvgPoolingOp(OpKernelConstruction* context)
-  : MklPoolingForwardOpBase<T>(context) {
+      : MklPoolingForwardOpBase<T>(context) {
     // Workspace is an MKLDNN construct that is only used in Max Pooling.
     // So set workspace_enabled_ to false.
     this->workspace_enabled_ = false;
@@ -444,8 +444,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
   void Compute(OpKernelContext* context) override {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
-      const Tensor& input_tensor = MklGetInput(context,
-              this->kInputTensorIndexInput);
+      const Tensor& input_tensor =
+          MklGetInput(context, this->kInputTensorIndexInput);
       MklDnnShape dnn_shape_input;
       GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
       this->SanityCheckInput(context, input_tensor, dnn_shape_input);
@@ -457,9 +457,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
       // initialize variables for the pooling op
       MklPoolParameters pool_params;
       // Get the input tensor and initialize the pooling parameters
-      this->ConfigureInput(context, dnn_shape_input,
-                          input_tensor, &pool_params,
-                          &dnn_data_input);
+      this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params,
+                           &dnn_data_input);
       OP_REQUIRES_OK(context, context->status());
 
       // Declare output tensor
@@ -470,56 +469,52 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
       // If input is in Mkl layout, then just get the memory format from it
       // directly, instead of using input data_format to AvgPool.
       if (dnn_shape_input.IsMklTensor()) {
-        dnn_data_output.SetUsrMem(output_dims_mkl_order,
-                static_cast<memory::format>(dnn_data_input.GetUsrMemDesc()
-                    .data.format));
+        dnn_data_output.SetUsrMem(
+            output_dims_mkl_order,
+            static_cast<memory::format>(
+                dnn_data_input.GetUsrMemDesc().data.format));
 
       } else {
-          dnn_data_output.SetUsrMem(output_dims_mkl_order,
-              this->data_format_mkldnn_);
+        dnn_data_output.SetUsrMem(output_dims_mkl_order,
+                                  this->data_format_mkldnn_);
       }
 
-        // describe the memory layout
+      // describe the memory layout
       dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
 
       // 3. create a pooling primitive descriptor
-      auto pool_desc = pooling_forward::desc(prop_kind::forward,
-              algorithm::pooling_avg_exclude_padding,
-              dnn_data_input.GetUsrMemDesc(),
-              dnn_data_output.GetUsrMemDesc(),
-              memory::dims({  pool_params.row_stride,
-                              pool_params.col_stride}),
-              memory::dims({  pool_params.window_rows,
-                              pool_params.window_cols}),
-              memory::dims({  static_cast<int>(pool_params.pad_top),
-                              static_cast<int>(pool_params.pad_left)}),
-              memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                              static_cast<int>(pool_params.pad_right)}),
-              TFPaddingToMklDnnPadding(this->padding_));
-      auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc,
-                                                 cpu_engine);
+      auto pool_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_avg_exclude_padding,
+          dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(),
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_prim_desc =
+          pooling_forward::primitive_desc(pool_desc, cpu_engine);
 
       this->AllocateOutputTensor(context, pool_prim_desc, output_dims_mkl_order,
-                            this->data_format_mkldnn_, &output_tensor);
+                                 this->data_format_mkldnn_, &output_tensor);
       CHECK_NOTNULL(output_tensor);
 
       OP_REQUIRES_OK(context, context->status());
       dnn_data_output.SetUsrMemDataHandle(output_tensor);
 
-      this->PrepareAndExecuteNet(pool_prim_desc,
-                                &dnn_data_input,
-                                &dnn_data_output);
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-                        ", message: " + string(e.message) +
-                        ", in file " + string(__FILE__) + ":" +
-                        std::to_string(__LINE__);
-        OP_REQUIRES_OK(context,
-                        errors::Aborted("Operation received an exception:",
-                                         error_msg));
+      this->PrepareAndExecuteNet(pool_prim_desc, &dnn_data_input,
+                                 &dnn_data_output);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }  // Compute
-};  // MklAvgPoolingOp
+};   // MklAvgPoolingOp
 
 //-----------------------------------------------------------------------------
 
@@ -527,27 +522,23 @@ template <class Device, class T>
 class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
  public:
   explicit MklAvgPoolingGradOp(OpKernelConstruction* context)
-      : MklPoolingBackwardOpBase<T>(context) {
-  }
+      : MklPoolingBackwardOpBase<T>(context) {}
 
   void Compute(OpKernelContext* context) override {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
       MklDnnShape original_input_mkl_shape, input_gradient_mkl_shape;
-      const Tensor& tensor_in_shape = MklGetInput(context,
-          kInputTensorIndexInputShape);
-      const Tensor& input_gradient_tensor = MklGetInput(context,
-          kInputTensorIndexInputGradient);
+      const Tensor& tensor_in_shape =
+          MklGetInput(context, kInputTensorIndexInputShape);
+      const Tensor& input_gradient_tensor =
+          MklGetInput(context, kInputTensorIndexInputGradient);
       GetMklShape(context, kInputTensorIndexInputShape,
-            &original_input_mkl_shape);
+                  &original_input_mkl_shape);
       GetMklShape(context, kInputTensorIndexInputGradient,
-            &input_gradient_mkl_shape);
-
+                  &input_gradient_mkl_shape);
 
-      SanityCheckInputs(context, tensor_in_shape,
-                        input_gradient_tensor,
-                        original_input_mkl_shape,
-                        input_gradient_mkl_shape);
+      SanityCheckInputs(context, tensor_in_shape, input_gradient_tensor,
+                        original_input_mkl_shape, input_gradient_mkl_shape);
       if (!context->status().ok()) return;
 
       // Used to allocate output_diff_src/diff_src
@@ -562,90 +553,70 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
       MklPoolParameters pool_params;
       memory::dims output_dims_mkl_order, original_input_dims_nchw;
       // Configure the original input memory descriptor
-      memory::desc original_input_md = ConfigureOriginalInput(context,
-                                      tensor_in_shape,
-                                      original_input_mkl_shape,
-                                      &original_input_dims_nchw,
-                                      &pool_params,
-                                      &original_input_shape);
+      memory::desc original_input_md = ConfigureOriginalInput(
+          context, tensor_in_shape, original_input_mkl_shape,
+          &original_input_dims_nchw, &pool_params, &original_input_shape);
 
       // configure the original output memory descriptor
       // by definition, the shape of the original output is the same
       // as the shape of the gradient diff_dst
       memory::desc original_output_md = this->ConfigureOriginalOutput(
-                pool_params, input_gradient_mkl_shape, output_dims_mkl_order);
+          pool_params, input_gradient_mkl_shape, output_dims_mkl_order);
 
       memory::desc target_diff_dst_md = this->ConfigureInputGradient(
-                                    input_gradient_mkl_shape,
-                                    input_gradient_tensor,
-                                    &input_gradient_diff_dst,
-                                    original_output_md);
+          input_gradient_mkl_shape, input_gradient_tensor,
+          &input_gradient_diff_dst, original_output_md);
       // The shape of the output diff src needs to be the same shape as the
       // original input. But we will set its format to be same as the format of
       // input gradient. We won't use format of original input since it will
       // always be in Tensorflow layout (given that AvgPoolGrad gets shape of
       // the input rather than actual input).
-      output_diff_src.SetUsrMem(original_input_dims_nchw,
-                                static_cast<memory::format>(
-                                  target_diff_dst_md.data.format));
+      output_diff_src.SetUsrMem(
+          original_input_dims_nchw,
+          static_cast<memory::format>(target_diff_dst_md.data.format));
 
       // Create the forward pooling primitive descriptor so we can reference it
       // in the backward pooling primitive descriptor
-      auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
-              algorithm::pooling_avg_exclude_padding,
-              original_input_md,
-              original_output_md,
-              memory::dims({  pool_params.row_stride,
-                              pool_params.col_stride}),
-              memory::dims({  pool_params.window_rows,
-                              pool_params.window_cols}),
-              memory::dims({  static_cast<int>(pool_params.pad_top),
-                              static_cast<int>(pool_params.pad_left)}),
-              memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                              static_cast<int>(pool_params.pad_right)}),
-              TFPaddingToMklDnnPadding(this->padding_));
-      auto pool_fwd_prim_desc
-              = pooling_forward::primitive_desc(pool_fwd_desc,
-                                                  cpu_engine);
+      auto pool_fwd_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_avg_exclude_padding,
+          original_input_md, original_output_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_prim_desc =
+          pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine);
 
       auto pool_bkwd_desc = pooling_backward::desc(
-              algorithm::pooling_avg_exclude_padding,
-              output_diff_src.GetUsrMemDesc(),
-              target_diff_dst_md,
-              memory::dims({  pool_params.row_stride,
-                              pool_params.col_stride}),
-              memory::dims({  pool_params.window_rows,
-                              pool_params.window_cols}),
-              memory::dims({  static_cast<int>(pool_params.pad_top),
-                              static_cast<int>(pool_params.pad_left)}),
-              memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                              static_cast<int>(pool_params.pad_right)}),
-              TFPaddingToMklDnnPadding(this->padding_));
-      auto pool_bkwd_prim_desc
-                = pooling_backward::primitive_desc(pool_bkwd_desc,
-                                              cpu_engine,
-                                              pool_fwd_prim_desc);
-      this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
-                      original_input_dims_nchw,
-                      this->data_format_mkldnn_,
-                      &output_tensor_diff_src);
+          algorithm::pooling_avg_exclude_padding,
+          output_diff_src.GetUsrMemDesc(), target_diff_dst_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_bkwd_prim_desc = pooling_backward::primitive_desc(
+          pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc);
+      this->AllocateOutputTensor(
+          context, pool_bkwd_prim_desc, original_input_dims_nchw,
+          this->data_format_mkldnn_, &output_tensor_diff_src);
 
       output_diff_src.SetUsrMemDataHandle(output_tensor_diff_src);
 
-      this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
-                          &input_gradient_diff_dst,
-                          &output_diff_src,
-                          memory::primitive_desc(
-                              target_diff_dst_md,
-                              cpu_engine));
-    } catch (mkldnn::error &e) {
+      this->PrepareAndExecuteNet(
+          pool_bkwd_prim_desc, &input_gradient_diff_dst, &output_diff_src,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine));
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                      ", message: " + string(e.message) +
-                      ", in file " + string(__FILE__) + ":" +
-                      std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                      errors::Aborted("Compute received an exception:",
-                                      error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
     }
   }  // Compute
 
@@ -655,12 +626,11 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
   const int kInputTensorIndexInputShape = 0;
   const int kInputTensorIndexInputGradient = 1;
 
-  memory::desc ConfigureOriginalInput(OpKernelContext* context,
-        const Tensor& tensor_original_input_shape,
-        const MklDnnShape& original_input_mkl_shape,
-        memory::dims* original_input_dims_mkl_order,
-        MklPoolParameters* pool_params,
-        TensorShape* input_tensor_shape) {
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input_shape,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_mkl_order,
+      MklPoolParameters* pool_params, TensorShape* input_tensor_shape) {
     CHECK_NOTNULL(original_input_dims_mkl_order);
     CHECK_NOTNULL(pool_params);
     CHECK_NOTNULL(input_tensor_shape);
@@ -672,46 +642,42 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
     }
 
     return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
-                                              context,
-                                              tensor_original_input_shape,
-                                              original_input_mkl_shape,
-                                              original_input_dims_mkl_order,
-                                              pool_params,
-                                              *input_tensor_shape);
-}
+        context, tensor_original_input_shape, original_input_mkl_shape,
+        original_input_dims_mkl_order, pool_params, *input_tensor_shape);
+  }
 
   void SanityCheckInputs(OpKernelContext* context,
-                        const Tensor& tensor_in_shape,
-                        const Tensor& input_gradient_tensor,
-                        const MklDnnShape& original_input_mkl_shape,
-                        const MklDnnShape& input_gradient_mkl_shape) {
+                         const Tensor& tensor_in_shape,
+                         const Tensor& input_gradient_tensor,
+                         const MklDnnShape& original_input_mkl_shape,
+                         const MklDnnShape& input_gradient_mkl_shape) {
     if (!original_input_mkl_shape.IsMklTensor()) {
-      OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
-          tensor_in_shape.NumElements() == 4,
+      OP_REQUIRES(
+          context,
+          tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
           errors::InvalidArgument("original input shape must be "
-                "1-dimensional and 4 elements"));
+                                  "1-dimensional and 4 elements"));
     } else {
-      OP_REQUIRES(context, original_input_mkl_shape.GetDimension() == 1 &&
-          original_input_mkl_shape.DimSize(0) == 4,
-          errors::InvalidArgument("original input shape must be "
-                "1-dimensional and 4 elements"));
+      OP_REQUIRES(context,
+                  original_input_mkl_shape.GetDimension() == 1 &&
+                      original_input_mkl_shape.DimSize(0) == 4,
+                  errors::InvalidArgument("original input shape must be "
+                                          "1-dimensional and 4 elements"));
     }
 
     if (!input_gradient_mkl_shape.IsMklTensor()) {
       // For avgpooling, input_gradient_diff_dst should have 4 dimensions.
       OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
-          errors::InvalidArgument("Gradient shape must be "
-                              "4-dimensional"));
+                  errors::InvalidArgument("Gradient shape must be "
+                                          "4-dimensional"));
     } else {
       OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4,
-          errors::InvalidArgument("Gradient shape must be "
-                              "4-dimensional"));
+                  errors::InvalidArgument("Gradient shape must be "
+                                          "4-dimensional"));
     }
   }
 };  // MklAvgPoolingGradOp
 
-
-
 #endif  // INTEL_MKL_DNN
 
 REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
@@ -728,4 +694,3 @@ REGISTER_KERNEL_BUILDER(Name("_MklAvgPoolGrad")
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
-
diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
index 9fee94f946..d9713075be 100644
--- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
@@ -40,7 +40,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #define MKL_Complex8 tensorflow::complex64
 #define MKL_Complex16 tensorflow::complex128
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index d109bb6bcf..7da63604d2 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -33,8 +33,8 @@ limitations under the License.
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
 using mkldnn::concat;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
@@ -45,7 +45,6 @@ typedef std::vector<TensorShape> TensorShapeList;
 
 enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
 
-
 // TODO(intelft) Check if we can reuse existing EigenConcatOp using Mutable
 // reference inputs.
 // --------------------------------------------------------------------------
@@ -152,8 +151,8 @@ class EigenConcatBaseOp : public OpKernel {
 
 #else  // MKL_DNN
 
-void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
-                        const TensorShapeList& input_shapes) {
+  void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
+               const TensorShapeList& input_shapes) {
     const Tensor* concat_dim_tensor;
     const char* axis_attribute_name =
         AxisArgName == NAME_IS_AXIS
@@ -197,7 +196,8 @@ void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
       const auto in = values[i];
       const bool in_is_scalar = IsLegacyScalar(input_shapes[i]);
       OP_REQUIRES(
-          c, (input_shapes[i].dims() == input_dims) ||
+          c,
+          (input_shapes[i].dims() == input_dims) ||
               (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
@@ -208,8 +208,8 @@ void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
         inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
             in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
       }
-      output_concat_dim += input_shapes[i].dims() > 0 ?
-                           input_shapes[i].dim_size(axis) : 1;
+      output_concat_dim +=
+          input_shapes[i].dims() > 0 ? input_shapes[i].dim_size(axis) : 1;
     }
 
     TensorShape output_shape(input_shape);
@@ -418,7 +418,6 @@ class MklConcatOp : public OpKernel {
     OP_REQUIRES_OK(context, context->status());
   }
 
-
  private:
   typedef struct {
     TensorFormat data_format;
@@ -590,39 +589,45 @@ class MklConcatOp : public OpKernel {
       GetMklShapeList(context, "values", &input_shapes);
 
       const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM)
-                    ? MklGetInput(context, 0) : MklGetInput(context, N);
+                                            ? MklGetInput(context, 0)
+                                            : MklGetInput(context, N);
       // Sanity checks
-      OP_REQUIRES(context, IsLegacyScalar(concat_dim_tensor.shape()),
-        errors::InvalidArgument(
-            "Concat dim tensor should be a scalar integer, but got shape ",
-            concat_dim_tensor.shape().DebugString()));
-      int32 concat_dim = internal::SubtleMustCopy(
-                           concat_dim_tensor.scalar<int32>()());
+      OP_REQUIRES(
+          context, IsLegacyScalar(concat_dim_tensor.shape()),
+          errors::InvalidArgument(
+              "Concat dim tensor should be a scalar integer, but got shape ",
+              concat_dim_tensor.shape().DebugString()));
+      int32 concat_dim =
+          internal::SubtleMustCopy(concat_dim_tensor.scalar<int32>()());
 
       // check that ranks of all tensors match
       // and that their shapes match except for concat_dim.
       int i = 0;
       bool invoke_eigen = false;
       bool are_all_mkl_inputs = true, are_all_tf_inputs = true;
-      const TensorShape expected_shape = input_shapes[0].IsMklTensor() ?
-                                         input_shapes[0].GetTfShape() :
-                                         input_tensors[0].shape();
+      const TensorShape expected_shape = input_shapes[0].IsMklTensor()
+                                             ? input_shapes[0].GetTfShape()
+                                             : input_tensors[0].shape();
       size_t expected_dims = expected_shape.dims();
 
       if (concat_dim < 0) concat_dim = expected_dims + concat_dim;
 
       for (auto& s : input_shapes) {
-        if (s == expected_shape) {++i; continue;}
+        if (s == expected_shape) {
+          ++i;
+          continue;
+        }
 
-        TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
-                      input_tensors[i].shape();
+        TensorShape s_shape =
+            s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
         size_t s_dims = s_shape.dims();
 
-        OP_REQUIRES(context, s_dims == expected_dims,
-                  errors::InvalidArgument(
-                      "_MklConcatOp : Ranks of all input tensors should match:"
-                      " input dimensions = ",
-                      s_dims, " vs. expected rank = ", expected_dims));
+        OP_REQUIRES(
+            context, s_dims == expected_dims,
+            errors::InvalidArgument(
+                "_MklConcatOp : Ranks of all input tensors should match:"
+                " input dimensions = ",
+                s_dims, " vs. expected rank = ", expected_dims));
 
         for (int d = 0; d < expected_dims; ++d) {
           if (d == concat_dim) continue;
@@ -630,10 +635,11 @@ class MklConcatOp : public OpKernel {
           size_t expected_size = expected_shape.dim_size(d);
           size_t s_size = s_shape.dim_size(d);
           OP_REQUIRES(
-            context, expected_size == s_size,
-            errors::InvalidArgument("_MklConcatOp : Dimensions of inputs "
-                    "should match: shape[0][", d, "]= ", expected_size,
-                    " vs. shape[", i, "][", d, "] = ", s_size));
+              context, expected_size == s_size,
+              errors::InvalidArgument("_MklConcatOp : Dimensions of inputs "
+                                      "should match: shape[0][",
+                                      d, "]= ", expected_size, " vs. shape[", i,
+                                      "][", d, "] = ", s_size));
         }
 
         if (s.IsMklTensor())
@@ -657,8 +663,8 @@ class MklConcatOp : public OpKernel {
         TensorShapeList tf_input_shapes;
         i = 0;
         for (auto& s : input_shapes) {
-          TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
-                                input_tensors[i].shape();
+          TensorShape s_shape =
+              s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
           tf_input_shapes.push_back(s_shape);
           ++i;
         }
@@ -678,21 +684,22 @@ class MklConcatOp : public OpKernel {
       std::vector<memory::primitive_desc> srcs_pd;
       std::vector<MklDnnData<T>> srcs(N, MklDnnData<T>(&cpu_engine));
       int64 dst_concat_dim_size = 0;
-      for (int k =0; k < N; k++) {
+      for (int k = 0; k < N; k++) {
         bool is_mkl_tensor = input_shapes[k].IsMklTensor();
         memory::dims src_dims;
 
         // Same comment as dst_dims for src_dims.
-        src_dims = (is_mkl_tensor) ?
-                   TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) :
-                   TFShapeToMklDnnDims(input_tensors[k].shape());
+        src_dims = (is_mkl_tensor)
+                       ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape())
+                       : TFShapeToMklDnnDims(input_tensors[k].shape());
 
         dst_concat_dim_size += src_dims[concat_dim];
-        auto src_md = is_mkl_tensor ? input_shapes[k].GetMklLayout() :
-          // It does not matter what data format we use here (NHWC or NCHW).
-          // We just need to ensure that output of Concat uses same data format
-          // as input.
-                  memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
+        auto src_md =
+            is_mkl_tensor ? input_shapes[k].GetMklLayout() :
+                          // It does not matter what data format we use here
+                          // (NHWC or NCHW). We just need to ensure that output
+                          // of Concat uses same data format as input.
+                memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
 
         srcs[k].SetUsrMem(src_md, &input_tensors[k]);
         auto src_mpd = srcs[k].GetUsrMemPrimDesc();
@@ -707,14 +714,15 @@ class MklConcatOp : public OpKernel {
         // Since we are passing a specific format for destination,
         // we need to have dst_dims in MklDnn order (NCHW).
         auto orig_tf_format = input_shapes[0].GetTfDataFormat();
-        dst_dims_in_nchw = MklDnnDimsInNCHW(dst_dims,
-                               MklDnnDataFormatToTFDataFormat(orig_tf_format));
+        dst_dims_in_nchw = MklDnnDimsInNCHW(
+            dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format));
         // We will set the output in the same format as input to avoid layout
         // conversions.
         // Currently we are setting dst format same as input format.
         // See if we can make this choice in a better way.
-        dst_md = memory::desc(dst_dims_in_nchw, MklDnnType<T>(),
-                 (memory::format) input_shapes[0].GetMklLayout().data.format);
+        dst_md = memory::desc(
+            dst_dims_in_nchw, MklDnnType<T>(),
+            (memory::format)input_shapes[0].GetMklLayout().data.format);
       } else {
         // Again, format does not matter here. We just need to make it same as
         // input format.
@@ -722,7 +730,7 @@ class MklConcatOp : public OpKernel {
       }
 
       std::vector<primitive::at> inputs;
-      for (int k=0; k < input_tensors.size(); k++)
+      for (int k = 0; k < input_tensors.size(); k++)
         inputs.push_back(srcs[k].GetOpMem());
 
       // If all inputs are in MKL format, then meaning of concat_dim needs to
@@ -732,8 +740,7 @@ class MklConcatOp : public OpKernel {
       // But ifinput tensors are in NHWC order, then semantics need to change.
       // E.g., if we are concatinating over Channel (dimension 3 for NHWC),
       // then since MklDnn order is NCHW, concat_dim needs to be 1.
-      if (are_all_mkl_inputs)
-        concat_dim = input_shapes[0].TfDimIdx(concat_dim);
+      if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim);
 
       auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd);
 
@@ -752,24 +759,25 @@ class MklConcatOp : public OpKernel {
         dnn_shape_dst.SetMklTensor(false);
         tf_shape_dst = MklDnnDimsToTFShape(dst_dims);
       }
-      AllocateOutputSetMklShape(context, 0, &dst_tensor,
-                                tf_shape_dst, dnn_shape_dst);
+      AllocateOutputSetMklShape(context, 0, &dst_tensor, tf_shape_dst,
+                                dnn_shape_dst);
       CHECK_NOTNULL(dst_tensor);
 
-      dst_md = dnn_shape_dst.IsMklTensor() ?
-               dnn_shape_dst.GetMklLayout() : dst_md;
+      dst_md =
+          dnn_shape_dst.IsMklTensor() ? dnn_shape_dst.GetMklLayout() : dst_md;
       dst.SetUsrMem(dst_md, dst_tensor);
 
       auto concat_op = concat(concat_pd, inputs, dst.GetOpMem());
       std::vector<primitive> net;
       net.push_back(concat_op);
       stream(stream::kind::eager).submit(net).wait();
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-               ", message: " + string(e.message) + ", in file " +
-               string(__FILE__) + ":" + std::to_string(__LINE__);
-        OP_REQUIRES_OK(context, errors::Aborted(
-                "Operation received an exception:", error_msg));
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -790,11 +798,9 @@ class MklConcatOp : public OpKernel {
     dnn_shape_output.SetDimensions(4);
     Tensor* output_tensor = nullptr;
     TensorShape tf_shape_output;
-    tf_shape_output.AddDim(
-        dnn_shape_output.GetSerializeBufferSize());
-    context->allocate_output(
-        GetTensorMetaDataIndex(0, context->num_outputs()),
-        tf_shape_output, &output_tensor);
+    tf_shape_output.AddDim(dnn_shape_output.GetSerializeBufferSize());
+    context->allocate_output(GetTensorMetaDataIndex(0, context->num_outputs()),
+                             tf_shape_output, &output_tensor);
     dnn_shape_output.SerializeMklDnnShape(
         output_tensor->flat<uint8>().data(),
         output_tensor->flat<uint8>().size() * sizeof(uint8));
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index 0f1a218fe6..25c2573741 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -38,9 +38,9 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 54d4916d49..ef3f8cfec1 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -38,17 +38,17 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
-using mkldnn::prop_kind;
 using mkldnn::convolution_backward_weights;
 using mkldnn::memory;
+using mkldnn::prop_kind;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
@@ -360,8 +360,8 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
           (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
 
       const Tensor& out_backprop = MklGetInput(context, 2);
-      void* mkl_buf_out_backprop = const_cast<void*>(static_cast<const void*>(
-                                      out_backprop.flat<T>().data()));
+      void* mkl_buf_out_backprop = const_cast<void*>(
+          static_cast<const void*>(out_backprop.flat<T>().data()));
 
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
                                                 prim_conv_bwdfilter,
@@ -371,10 +371,11 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
           !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop);
       if (mkl_convert_out_backprop) {
         CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
-                      lt_out_backprop, mkl_lt_internal_out_backprop),
+                                         lt_out_backprop,
+                                         mkl_lt_internal_out_backprop),
                  E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor,
-            lt_out_backprop, &mkl_buf_convert_out_backprop);
+                       lt_out_backprop, &mkl_buf_convert_out_backprop);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
                                           mkl_buf_out_backprop,
                                           mkl_buf_convert_out_backprop),
@@ -428,18 +429,18 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-              MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+                          MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
 
 #else
 
 template <typename Device, class T, bool biasEnabled>
-class MklConv2DCustomBackpropFilterOp :
-  public MklConv2DBackpropCommonOp<Device, T> {
+class MklConv2DCustomBackpropFilterOp
+    : public MklConv2DBackpropCommonOp<Device, T> {
  public:
   explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
-      : MklConv2DBackpropCommonOp<Device, T>(context) { }
+      : MklConv2DBackpropCommonOp<Device, T>(context) {}
   ~MklConv2DCustomBackpropFilterOp() {}
 
  private:
@@ -447,7 +448,7 @@ class MklConv2DCustomBackpropFilterOp :
                          const MklDnnShape& filter_mkl_shape,
                          const MklDnnShape& obp_mkl_shape) {
     CHECK(!filter_mkl_shape.IsMklTensor())
-      << "Conv2DBackpropFilter: filter should not be in MKL Layout";
+        << "Conv2DBackpropFilter: filter should not be in MKL Layout";
   }
 
   size_t GetInputTensorIndexWithSizes() { return 1; /* filter index */ }
@@ -462,8 +463,10 @@ class MklConv2DCustomBackpropFilterOp :
                                 const Tensor& filter_tensor) {
     TensorShape filter_tf_shape;
     CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true);
-    CHECK_EQ(TensorShapeUtils::MakeShape(
-             filter_tensor.vec<int32>(), &filter_tf_shape).ok(), true);
+    CHECK_EQ(TensorShapeUtils::MakeShape(filter_tensor.vec<int32>(),
+                                         &filter_tf_shape)
+                 .ok(),
+             true);
     return filter_tf_shape;
   }
 
@@ -485,16 +488,13 @@ class MklConv2DCustomBackpropFilterOp :
     return memory::format::hwio;
   }
 
-  void CreatePrimitive(OpKernelContext* context,
-                       const engine& cpu_engine,
+  void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine,
                        const convolution_forward::primitive_desc& conv_fwd_pd,
                        MklDnnData<T>* input, MklDnnData<T>* filter,
                        MklDnnData<T>* outbackprop, MklDnnData<T>* output,
-                       Tensor** output_tensor,
-                       const memory::dims& strides,
+                       Tensor** output_tensor, const memory::dims& strides,
                        const memory::dims& padding_l,
-                       const memory::dims& padding_r,
-                       padding_kind padding,
+                       const memory::dims& padding_r, padding_kind padding,
                        const memory::dims& bwd_output_dims,
                        memory::format bwd_output_format) {
     CHECK_NOTNULL(context);
@@ -508,34 +508,35 @@ class MklConv2DCustomBackpropFilterOp :
     int depth = 0;
     if (biasEnabled) {
       // Data structure for bias_grad
-      bias_grad = new MklDnnData<T> (&cpu_engine);
+      bias_grad = new MklDnnData<T>(&cpu_engine);
       TensorShape obp_tf_shape = GetTfShape(context, 2);
-      depth = (MklConv2DBackpropCommonOp<Device, T>::GetTFDataFormat()
-                == FORMAT_NCHW) ?
-          obp_tf_shape.dim_size(1) : obp_tf_shape.dim_size(3);
+      depth = (MklConv2DBackpropCommonOp<Device, T>::GetTFDataFormat() ==
+               FORMAT_NCHW)
+                  ? obp_tf_shape.dim_size(1)
+                  : obp_tf_shape.dim_size(3);
       memory::dims bias_grad_dims = {depth};
       bias_grad->SetOpMemDesc(bias_grad_dims, memory::format::x);
     }
 
     // Create convolution backward weights primitive.
-    auto bwd_desc = (biasEnabled && (bias_grad != nullptr))?
-        convolution_backward_weights::desc(convolution_direct,
-                                input->GetOpMemDesc(), output->GetOpMemDesc(),
-                                bias_grad->GetOpMemDesc(),
-                                outbackprop->GetOpMemDesc(), strides, padding_l,
-                                padding_r, padding) :
-        convolution_backward_weights::desc(convolution_direct,
-                          input->GetOpMemDesc(), output->GetOpMemDesc(),
-                          outbackprop->GetOpMemDesc(), strides, padding_l,
-                          padding_r, padding);
-
-    auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
-                                                            cpu_engine,
-                                                            conv_fwd_pd);
+    auto bwd_desc =
+        (biasEnabled && (bias_grad != nullptr))
+            ? convolution_backward_weights::desc(
+                  convolution_direct, input->GetOpMemDesc(),
+                  output->GetOpMemDesc(), bias_grad->GetOpMemDesc(),
+                  outbackprop->GetOpMemDesc(), strides, padding_l, padding_r,
+                  padding)
+            : convolution_backward_weights::desc(
+                  convolution_direct, input->GetOpMemDesc(),
+                  output->GetOpMemDesc(), outbackprop->GetOpMemDesc(), strides,
+                  padding_l, padding_r, padding);
+
+    auto bwd_pd = convolution_backward_weights::primitive_desc(
+        bwd_desc, cpu_engine, conv_fwd_pd);
 
     // Allocate output tensor.
-    AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
-                         bwd_output_format, output_tensor);
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format,
+                         output_tensor);
 
     CHECK_NOTNULL(*output_tensor);
     // Set buffer handle using allocated output tensor.
@@ -548,8 +549,8 @@ class MklConv2DCustomBackpropFilterOp :
       AllocateBiasGradTensor(context, bias_grad_shape, &bias_grad_tensor);
       memory::dims bias_grad_dims = {depth};
       // Since Bias is 1D, we use format::x from MKLDNN to represent it.
-      auto bias_grad_md = memory::desc({bias_grad_dims}, MklDnnType<T>(),
-                                       memory::format::x);
+      auto bias_grad_md =
+          memory::desc({bias_grad_dims}, MklDnnType<T>(), memory::format::x);
       bias_grad->SetUsrMem(bias_grad_md, bias_grad_tensor);
       bias_grad->SetUsrMemDataHandle(bias_grad_tensor);
     }
@@ -562,28 +563,29 @@ class MklConv2DCustomBackpropFilterOp :
   }
 
   // Allocate output tensor.
-  void AllocateOutputTensor(OpKernelContext* context,
-                  const convolution_backward_weights::primitive_desc& conv_pd,
-                  const memory::dims& output_dims_mkl_order,
-                  memory::format output_tf_format, Tensor** output_tensor) {
-      CHECK_NOTNULL(output_tensor);
-
-      // For BackpropFilter, we convert the output tensor back in Tensorflow
-      // layout. Because typically, BackpropFilter is the last operator in the
-      // graph that emit filter gradient that is provided to ApplyGradient
-      // method to update the filter. But it may be possible to eliminate this
-      // by forwarding filter in MKL layout if we support ApplyGradient method
-      // for MKL layout propagation.
-      MklDnnShape output_mkl_shape;
-      output_mkl_shape.SetMklTensor(false);
-      // output_dims_mkl_order is in OIHW format.
-      // Allocate shape of TF tensor in HWIO format.
-      TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H],
-                                   output_dims_mkl_order[MklDnnDims::Dim_W],
-                                   output_dims_mkl_order[MklDnnDims::Dim_I],
-                                   output_dims_mkl_order[MklDnnDims::Dim_O]});
-      AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
-                                output_mkl_shape);
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const convolution_backward_weights::primitive_desc& conv_pd,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+
+    // For BackpropFilter, we convert the output tensor back in Tensorflow
+    // layout. Because typically, BackpropFilter is the last operator in the
+    // graph that emit filter gradient that is provided to ApplyGradient
+    // method to update the filter. But it may be possible to eliminate this
+    // by forwarding filter in MKL layout if we support ApplyGradient method
+    // for MKL layout propagation.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(false);
+    // output_dims_mkl_order is in OIHW format.
+    // Allocate shape of TF tensor in HWIO format.
+    TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H],
+                                 output_dims_mkl_order[MklDnnDims::Dim_W],
+                                 output_dims_mkl_order[MklDnnDims::Dim_I],
+                                 output_dims_mkl_order[MklDnnDims::Dim_O]});
+    AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                              output_mkl_shape);
   }
 
   // Allocate tensor for bias grad
@@ -600,9 +602,9 @@ class MklConv2DCustomBackpropFilterOp :
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
-                  const convolution_backward_weights::primitive_desc& conv_pd,
-                  MklDnnData<T>* input, MklDnnData<T>* obp,
-                  MklDnnData<T>* output, MklDnnData<T>* bias_grad = nullptr) {
+      const convolution_backward_weights::primitive_desc& conv_pd,
+      MklDnnData<T>* input, MklDnnData<T>* obp, MklDnnData<T>* output,
+      MklDnnData<T>* bias_grad = nullptr) {
     // Create reorders between user layout and MKL layout if it is needed and
     // add it to the net before convolution.
     std::vector<primitive> net;
@@ -612,15 +614,15 @@ class MklConv2DCustomBackpropFilterOp :
     // For BackpropFilter, we convert the output tensor back in Tensorflow
     // layout.
     bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
-                                      conv_pd.diff_weights_primitive_desc());
+        conv_pd.diff_weights_primitive_desc());
 
     if (biasEnabled && (bias_grad != nullptr)) {
-      net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
-                                      obp->GetOpMem(), output->GetOpMem(),
-                                      bias_grad->GetOpMem()));
+      net.push_back(convolution_backward_weights(
+          conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem(),
+          bias_grad->GetOpMem()));
     } else {
-      net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
-                                      obp->GetOpMem(), output->GetOpMem()));
+      net.push_back(convolution_backward_weights(
+          conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem()));
     }
 
     if (output_reorder_required) {
@@ -631,22 +633,24 @@ class MklConv2DCustomBackpropFilterOp :
   }
 };
 
-#define REGISTER_MKL_FILTER_KERNELS(T)                              \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-              MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);\
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias")  \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-              MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>); \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias")  \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-              MklDummyOp<CPUDevice, T>);
+#define REGISTER_MKL_FILTER_KERNELS(T)                                   \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilter")                                   \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);             \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilterWithBias")                           \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>);              \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<T>("T")                    \
+                              .Label(mkl_op_registry::kMklOpLabel),      \
+                          MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index ef6db58d31..a6745489f4 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 #include <algorithm>
 #include <vector>
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -41,15 +43,13 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
-using mkldnn::prop_kind;
 using mkldnn::convolution_backward_data;
+using mkldnn::prop_kind;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
@@ -359,16 +359,15 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
 #else
 
 template <typename Device, class T>
-class MklConv2DCustomBackpropInputOp :
-  public MklConv2DBackpropCommonOp<Device, T> {
+class MklConv2DCustomBackpropInputOp
+    : public MklConv2DBackpropCommonOp<Device, T> {
  public:
   explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
-      : MklConv2DBackpropCommonOp<Device, T>(context) { }
+      : MklConv2DBackpropCommonOp<Device, T>(context) {}
   ~MklConv2DCustomBackpropInputOp() {}
 
  private:
-  const int kInputIndex_Filter = 1,
-            kInputIndex_InputSizes = 0,
+  const int kInputIndex_Filter = 1, kInputIndex_InputSizes = 0,
             kInputIndex_OutBackProp = 2;
   void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
                          const MklDnnShape& filter_mkl_shape,
@@ -377,7 +376,7 @@ class MklConv2DCustomBackpropInputOp :
     // of the Tensor and never an actual tensor. So it will never be in MKL
     // layout.
     CHECK(!input_mkl_shape.IsMklTensor())
-      << "Conv2DBackpropInput: input should not be in MKL Layout";
+        << "Conv2DBackpropInput: input should not be in MKL Layout";
   }
 
   size_t GetInputTensorIndexWithSizes() { return kInputIndex_InputSizes; }
@@ -386,8 +385,10 @@ class MklConv2DCustomBackpropInputOp :
                                const Tensor& input_tensor) {
     TensorShape input_tf_shape;
     CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true);
-    CHECK_EQ(TensorShapeUtils::MakeShape(input_tensor.vec<int32>(),
-                                         &input_tf_shape).ok(), true);
+    CHECK_EQ(
+        TensorShapeUtils::MakeShape(input_tensor.vec<int32>(), &input_tf_shape)
+            .ok(),
+        true);
     return input_tf_shape;
   }
 
@@ -414,16 +415,13 @@ class MklConv2DCustomBackpropInputOp :
     return data_format;
   }
 
-  void CreatePrimitive(OpKernelContext* context,
-                       const engine& cpu_engine,
+  void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine,
                        const convolution_forward::primitive_desc& conv_fwd_pd,
                        MklDnnData<T>* input, MklDnnData<T>* filter,
                        MklDnnData<T>* outbackprop, MklDnnData<T>* output,
-                       Tensor** output_tensor,
-                       const memory::dims& strides,
+                       Tensor** output_tensor, const memory::dims& strides,
                        const memory::dims& padding_l,
-                       const memory::dims& padding_r,
-                       padding_kind padding,
+                       const memory::dims& padding_r, padding_kind padding,
                        const memory::dims& bwd_output_dims,
                        memory::format bwd_output_format) {
     CHECK_NOTNULL(context);
@@ -434,19 +432,16 @@ class MklConv2DCustomBackpropInputOp :
     CHECK_NOTNULL(output_tensor);
 
     // Create convolution backward data primitive.
-    auto bwd_desc = convolution_backward_data::desc(convolution_direct,
-                      output->GetOpMemDesc(), filter->GetOpMemDesc(),
-                      outbackprop->GetOpMemDesc(), strides, padding_l,
-                      padding_r, padding);
-
-    auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc,
-                                                          cpu_engine,
-                                                          conv_fwd_pd);
+    auto bwd_desc = convolution_backward_data::desc(
+        convolution_direct, output->GetOpMemDesc(), filter->GetOpMemDesc(),
+        outbackprop->GetOpMemDesc(), strides, padding_l, padding_r, padding);
 
+    auto bwd_pd = convolution_backward_data::primitive_desc(
+        bwd_desc, cpu_engine, conv_fwd_pd);
 
     // Allocate output tensor in TensorFlow and MKL layout.
-    AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
-                         bwd_output_format, output_tensor);
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format,
+                         output_tensor);
     CHECK_NOTNULL(*output_tensor);
     // Set buffer handle using allocated output tensor.
     output->SetUsrMemDataHandle(*output_tensor);
@@ -455,44 +450,44 @@ class MklConv2DCustomBackpropInputOp :
   }
 
   // Allocate output tensor.
-  void AllocateOutputTensor(OpKernelContext* context,
-                  const convolution_backward_data::primitive_desc& conv_pd,
-                  const memory::dims& output_dims_mkl_order,
-                  memory::format output_tf_format, Tensor** output_tensor) {
-      CHECK_NOTNULL(output_tensor);
-
-      // Output primitive descriptor for backward data is diff_src.
-      auto dst_pd = conv_pd.diff_src_primitive_desc();
-
-      // Allocate shape of Mkl tensor.
-      MklDnnShape output_mkl_shape;
-      output_mkl_shape.SetMklTensor(true);
-      output_mkl_shape.SetMklLayout(&dst_pd);
-      output_mkl_shape.SetElemType(MklDnnType<T>());
-      output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                                   output_dims_mkl_order, output_tf_format);
-
-      // Allocate shape of TF tensor.
-      TensorShape output_tf_shape;
-      output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T));
-
-      AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
-                                output_mkl_shape);
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const convolution_backward_data::primitive_desc& conv_pd,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+
+    // Output primitive descriptor for backward data is diff_src.
+    auto dst_pd = conv_pd.diff_src_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+
+    // Allocate shape of TF tensor.
+    TensorShape output_tf_shape;
+    output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T));
+
+    AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                              output_mkl_shape);
   }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
-                  const convolution_backward_data::primitive_desc& conv_pd,
-                  MklDnnData<T>* filter, MklDnnData<T>* obp,
-                  MklDnnData<T>* output) {
+      const convolution_backward_data::primitive_desc& conv_pd,
+      MklDnnData<T>* filter, MklDnnData<T>* obp, MklDnnData<T>* output) {
     // Create reorders between user layout and MKL layout if it is needed and
     // add it to the net before convolution.
     std::vector<primitive> net;
     filter->CheckReorderToOpMem(conv_pd.weights_primitive_desc(), &net);
     obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net);
 
-    net.push_back(convolution_backward_data(conv_pd, obp->GetOpMem(),
-                                    filter->GetOpMem(), output->GetOpMem()));
+    net.push_back(convolution_backward_data(
+        conv_pd, obp->GetOpMem(), filter->GetOpMem(), output->GetOpMem()));
 
     stream(stream::kind::eager).submit(net).wait();
   }
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 0e77b45993..e44fba754b 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <string.h>
 #include <map>
-#include <vector>
 #include <string>
+#include <vector>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -41,15 +41,14 @@ limitations under the License.
 
 #include "tensorflow/core/util/mkl_util.h"
 
-
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
 using mkldnn::prop_kind;
+using mkldnn::stream;
 
-using mkldnn::convolution_forward;
 using mkldnn::convolution_direct;
+using mkldnn::convolution_forward;
 #else
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -116,18 +115,19 @@ class MklConv2DOp : public OpKernel {
                                         filter.shape().DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     const int64 input_depth =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C')
                             : GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, input_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                input_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, input_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", input_depth,
+                    " vs ", filter.dim_size(2)));
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
 
@@ -136,9 +136,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_rows_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H')
                             : GetTensorDim(input, data_format_, 'H');
-    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int input_rows = static_cast<int>(input_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
 
@@ -147,9 +148,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_cols_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'W')
                             : GetTensorDim(input, data_format_, 'W');
-    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int input_cols = static_cast<int>(input_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
 
@@ -157,9 +159,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_batch_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N')
                             : GetTensorDim(input, data_format_, 'N');
-    OP_REQUIRES(context, FastBoundsCheck(input_batch_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("batch is too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_batch_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("batch is too large"));
     const int batch = static_cast<int>(input_batch_raw);
 
     // For now we take the stride from the second and third dimensions only (we
@@ -313,8 +316,7 @@ class MklConv2DOp : public OpKernel {
     // Temp tensor used to allocate tmp buffers
     Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
         mkl_tmp_bias_buf_tensor;
-    mkl_context.MklPrepareConvolutionInputs(context,
-                                            &mkl_tmp_input_buf_tensor,
+    mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
                                             &mkl_tmp_filter_buf_tensor,
                                             &mkl_tmp_bias_buf_tensor);
 
@@ -398,8 +400,9 @@ class MklConv2DOp : public OpKernel {
       mkl_convert_input =
           !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
       if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
-                 lt_input, mkl_lt_internal_input), E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
+                                         mkl_lt_internal_input),
+                 E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
                        &mkl_buf_convert_input);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
@@ -517,8 +520,8 @@ class MklConv2DOp : public OpKernel {
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
       OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false,
-            errors::InvalidArgument("Filter should not be in "
-            "Mkl Layout"));
+                  errors::InvalidArgument("Filter should not be in "
+                                          "Mkl Layout"));
 
       MklDnnData<T> src(&cpu_engine);
       MklDnnData<T> filter(&cpu_engine);
@@ -531,11 +534,10 @@ class MklConv2DOp : public OpKernel {
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
       auto src_tf_shape = GetTfShape(context, kInputIndex_Src);
       auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter);
-      conv_utl.GetConvFwdSizesInMklOrder(src_tf_shape, filter_tf_shape,
-                                         &src_dims, &filter_dims, &strides,
-                                         &output_dims_tf_order,
-                                         &output_dims_mkl_order, &padding_l,
-                                         &padding_r);
+      conv_utl.GetConvFwdSizesInMklOrder(
+          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
+          &output_dims_tf_order, &output_dims_mkl_order, &padding_l,
+          &padding_r);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
@@ -543,21 +545,20 @@ class MklConv2DOp : public OpKernel {
 
       // Corner cases: output with 0 elements and 0 batch size.
       Tensor* output_tensor = nullptr;
-      if (output_tf_shape.num_elements() == 0 ||
-          output_dims_tf_order[0] == 0) {
+      if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) {
         // TODO(jbobba): Verify correctness here
         //               Need semantics for Null MKL tensor
         MklDnnShape output_mkl_shape;
         output_mkl_shape.SetMklTensor(false);
         AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor,
-                                    src_tf_shape, output_mkl_shape);
+                                  src_tf_shape, output_mkl_shape);
 
         // MklConv2D also outputs converted filter as 2nd output of Conv2D.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
         AllocateOutputSetMklShape(context, kOutputIndex_Filter,
-                                  &output_filter_tensor,
-                                  filter_tf_shape, filter_mkl_shape);
+                                  &output_filter_tensor, filter_tf_shape,
+                                  filter_mkl_shape);
         return;
       }
 
@@ -570,14 +571,15 @@ class MklConv2DOp : public OpKernel {
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
       // layout (NHWC or NCHW depending on data format).
       auto src_md = src_mkl_shape.IsMklTensor()
-                    ? src_mkl_shape.GetMklLayout()
-                    : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
+                        ? src_mkl_shape.GetMklLayout()
+                        : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
       src.SetUsrMem(src_md, &src_tensor);
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
       auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
-                    ? filter_mkl_shape.GetMklLayout()
-          : memory::desc(filter_dims, MklDnnType<T>(), memory::format::hwio);
+                           ? filter_mkl_shape.GetMklLayout()
+                           : memory::desc(filter_dims, MklDnnType<T>(),
+                                          memory::format::hwio);
       filter.SetUsrMem(filter_md, &filter_tensor);
 
       // Set output shape (output_dims) required in MKL-DNN order.
@@ -601,34 +603,34 @@ class MklConv2DOp : public OpKernel {
         bias.SetOpMemDesc(bias_size, memory::format::any);
 
         // Create convolution primitive with Bias.
-        auto conv_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
-            bias.GetOpMemDesc(), output.GetOpMemDesc(), strides,
-            padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
-
-        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                cpu_engine);
-        AllocateOutputTensor(context, conv_prim_desc,
-                             output_dims_mkl_order, tf_fmt, &output_tensor);
+        auto conv_desc = convolution_forward::desc(
+            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
+            filter.GetOpMemDesc(), bias.GetOpMemDesc(), output.GetOpMemDesc(),
+            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+
+        auto conv_prim_desc =
+            convolution_forward::primitive_desc(conv_desc, cpu_engine);
+        AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
+                             tf_fmt, &output_tensor);
         // Set data handle for output.
         output.SetUsrMemDataHandle(output_tensor);
 
         Tensor* filter_out_tensor = nullptr;
         AllocateFilterOutputTensor(context, conv_prim_desc,
-                        TFShapeToMklDnnDims(filter_tf_shape),
-                        &filter_out_tensor);
+                                   TFShapeToMklDnnDims(filter_tf_shape),
+                                   &filter_out_tensor);
 
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter,
-                            &bias, &output, filter_out_tensor);
+        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output,
+                             filter_out_tensor);
       } else {
         // Create convolution primitive without Bias.
-        auto conv_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
-            output.GetOpMemDesc(), strides, padding_l, padding_r,
-            TFPaddingToMklDnnPadding(padding_));
+        auto conv_desc = convolution_forward::desc(
+            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
+            filter.GetOpMemDesc(), output.GetOpMemDesc(), strides, padding_l,
+            padding_r, TFPaddingToMklDnnPadding(padding_));
 
-        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                cpu_engine);
+        auto conv_prim_desc =
+            convolution_forward::primitive_desc(conv_desc, cpu_engine);
         AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
                              tf_fmt, &output_tensor);
         // Set data handle for output.
@@ -636,18 +638,18 @@ class MklConv2DOp : public OpKernel {
 
         Tensor* filter_out_tensor = nullptr;
         AllocateFilterOutputTensor(context, conv_prim_desc,
-                TFShapeToMklDnnDims(filter_tf_shape),
-                &filter_out_tensor);
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter,
-                            nullptr, &output, filter_out_tensor);
+                                   TFShapeToMklDnnDims(filter_tf_shape),
+                                   &filter_out_tensor);
+        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output,
+                             filter_out_tensor);
       }
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + std::string(e.message) +
-                       ", in file " + std::string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-        errors::Aborted("Operation received an exception:", error_msg));
+                         ", message: " + std::string(e.message) + ", in file " +
+                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -655,71 +657,67 @@ class MklConv2DOp : public OpKernel {
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
-  const int kInputIndex_Src = 0,
-            kInputIndex_Filter = 1,
-            kInputIndex_Bias = 2;
+  const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
 
   // Allocate output tensor.
   void AllocateOutputTensor(
-                  OpKernelContext* context,
-                  const convolution_forward::primitive_desc& conv_prim_desc,
-                  const memory::dims& output_dims_mkl_order,
-                  memory::format output_tf_format, Tensor** output_tensor) {
-      CHECK_NOTNULL(output_tensor);
-      auto dst_pd = conv_prim_desc.dst_primitive_desc();
-
-      // Allocate shape of Mkl tensor.
-      MklDnnShape output_mkl_shape;
-      output_mkl_shape.SetMklTensor(true);
-      output_mkl_shape.SetMklLayout(&dst_pd);
-      output_mkl_shape.SetElemType(MklDnnType<T>());
-      output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-              output_dims_mkl_order, output_tf_format);
-
-      // Allocate shape of TF tensor.
-      TensorShape output_tf_shape;
-      output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
-
-      AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor,
-                                output_tf_shape, output_mkl_shape);
+      OpKernelContext* context,
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    auto dst_pd = conv_prim_desc.dst_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+
+    // Allocate shape of TF tensor.
+    TensorShape output_tf_shape;
+    output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
+
+    AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor,
+                              output_tf_shape, output_mkl_shape);
   }
 
   // Allocate output tensor.
   void AllocateFilterOutputTensor(
-                  OpKernelContext* context,
-                  const convolution_forward::primitive_desc& conv_prim_desc,
-                  const memory::dims& filter_dims_tf_order,
-                  Tensor** filter_tensor) {
-      CHECK_NOTNULL(filter_tensor);
-      auto filter_pd = conv_prim_desc.weights_primitive_desc();
-
-      // Allocate shape of Mkl tensor.
-      MklDnnShape filter_mkl_shape;
-      filter_mkl_shape.SetMklTensor(true);
-      filter_mkl_shape.SetMklLayout(&filter_pd);
-      filter_mkl_shape.SetElemType(MklDnnType<T>());
-
-      // The format of the filter is actually OIhw8i8o, but TF doesn't support
-      // this format. Just use format::blocked for now because the layout
-      // is stored in the MKL data.
-      filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(),
-                  filter_dims_tf_order, memory::format::blocked);
-
-      // Allocate the data space for the filter to propagate as TF tensor.
-      TensorShape filter_tf_shape;
-      filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T)));
-
-      AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor,
-              filter_tf_shape, filter_mkl_shape);
+      OpKernelContext* context,
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      const memory::dims& filter_dims_tf_order, Tensor** filter_tensor) {
+    CHECK_NOTNULL(filter_tensor);
+    auto filter_pd = conv_prim_desc.weights_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape filter_mkl_shape;
+    filter_mkl_shape.SetMklTensor(true);
+    filter_mkl_shape.SetMklLayout(&filter_pd);
+    filter_mkl_shape.SetElemType(MklDnnType<T>());
+
+    // The format of the filter is actually OIhw8i8o, but TF doesn't support
+    // this format. Just use format::blocked for now because the layout
+    // is stored in the MKL data.
+    filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(),
+                                 filter_dims_tf_order, memory::format::blocked);
+
+    // Allocate the data space for the filter to propagate as TF tensor.
+    TensorShape filter_tf_shape;
+    filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T)));
+
+    AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor,
+                              filter_tf_shape, filter_mkl_shape);
   }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecuteNet(
-                  const convolution_forward::primitive_desc& conv_prim_desc,
-                  MklDnnData<T>* src, MklDnnData<T>* filter,
-                  MklDnnData<T>* bias, MklDnnData<T>* output,
-                  Tensor* filter_out_tensor) {
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      MklDnnData<T>* src, MklDnnData<T>* filter, MklDnnData<T>* bias,
+      MklDnnData<T>* output, Tensor* filter_out_tensor) {
     CHECK_NOTNULL(filter_out_tensor);
 
     // Create reorders between user layout and MKL layout if it is needed and
@@ -731,18 +729,20 @@ class MklConv2DOp : public OpKernel {
     // rather than re-order to a temp buffer, reorder directly to the
     // filter output tensor
     filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(),
-                    filter->GetTensorBuffer(filter_out_tensor), &net);
+                                filter->GetTensorBuffer(filter_out_tensor),
+                                &net);
 
     // Create convolution primitive and add it to net.
     if (bias) {
       CHECK_EQ(biasEnabled, true);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
-                                    filter->GetOpMem(), bias->GetOpMem(),
-                                    output->GetOpMem()));
+                                        filter->GetOpMem(), bias->GetOpMem(),
+                                        output->GetOpMem()));
     } else {
       CHECK_EQ(biasEnabled, false);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
-                                    filter->GetOpMem(), output->GetOpMem()));
+                                        filter->GetOpMem(),
+                                        output->GetOpMem()));
     }
 
     stream(stream::kind::eager).submit(net).wait();
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index c6456bd5c3..8b65eaea0d 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -16,9 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 
-#include <vector>
 #include <limits>
 #include <string>
+#include <vector>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -27,8 +27,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
+#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -43,11 +43,11 @@ limitations under the License.
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
 using mkldnn::prop_kind;
+using mkldnn::stream;
 
-using mkldnn::convolution_forward;
 using mkldnn::convolution_direct;
+using mkldnn::convolution_forward;
 #endif
 
 namespace tensorflow {
@@ -63,13 +63,13 @@ class MklDnnConvUtil {
 
  public:
   MklDnnConvUtil(OpKernelContext* context, const std::vector<int32>& strides,
-                 Padding pad, TensorFormat fm) : context_(context),
-    strides_(strides), padding_(pad), data_format_(fm) {}
+                 Padding pad, TensorFormat fm)
+      : context_(context), strides_(strides), padding_(pad), data_format_(fm) {}
 
   virtual ~MklDnnConvUtil() { context_ = nullptr; }
 
   // Calculate Convolution strides
-  virtual inline void GetStridesInMklOrder(memory::dims *strides) {
+  virtual inline void GetStridesInMklOrder(memory::dims* strides) {
     // For now we take the stride from the second and third dimensions only
     // (we do not support striding on the batch or depth dimension).
     CHECK_NOTNULL(strides);
@@ -82,14 +82,14 @@ class MklDnnConvUtil {
   // requires input in NCHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
-  virtual inline void
-  GetInputSizeInMklOrder(const TensorShape& input_shape,
-                         memory::dims *input_dims) {
-  #define CHECK_BOUNDS(val, err_msg) do {                     \
-    OP_REQUIRES(context_, FastBoundsCheck(val,                \
-                            std::numeric_limits<int>::max()), \
-                errors::InvalidArgument(err_msg));            \
-  }while(0)
+  virtual inline void GetInputSizeInMklOrder(const TensorShape& input_shape,
+                                             memory::dims* input_dims) {
+#define CHECK_BOUNDS(val, err_msg)                                     \
+  do {                                                                 \
+    OP_REQUIRES(context_,                                              \
+                FastBoundsCheck(val, std::numeric_limits<int>::max()), \
+                errors::InvalidArgument(err_msg));                     \
+  } while (0)
 
     CHECK_NOTNULL(input_dims);
 
@@ -112,7 +112,7 @@ class MklDnnConvUtil {
     CHECK_BOUNDS(input_batch_raw, "Input batch too large");
     int input_batch = static_cast<int>(input_batch_raw);
 
-  #undef CHECK_BOUNDS
+#undef CHECK_BOUNDS
 
     // MKL-DNN always requires input in NCHW format.
     std::vector<int> mkldnn_sizes(4, -1);
@@ -138,10 +138,9 @@ class MklDnnConvUtil {
   // forward gets actual tensor as input).
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
-  virtual inline void
-  GetFilterSizeInMklOrder(const TensorShape& input_shape,
-                          const TensorShape& filter_shape,
-                          memory::dims *filter_dims) {
+  virtual inline void GetFilterSizeInMklOrder(const TensorShape& input_shape,
+                                              const TensorShape& filter_shape,
+                                              memory::dims* filter_dims) {
     CHECK_NOTNULL(filter_dims);
 
     OP_REQUIRES(context_, filter_shape.dims() == 4,
@@ -149,17 +148,18 @@ class MklDnnConvUtil {
                                         filter_shape.DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context_, FastBoundsCheck(filter_shape.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(context_,
+                  FastBoundsCheck(filter_shape.dim_size(i),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("filter too large"));
     }
 
     int input_depth = GetTensorDim(input_shape, data_format_, 'C');
 
-    OP_REQUIRES(
-        context_, input_depth == filter_shape.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                input_depth, " vs ", filter_shape.dim_size(2)));
+    OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", input_depth,
+                    " vs ", filter_shape.dim_size(2)));
 
     // TF filter is always in (rows, cols, in_depth, out_depth) order.
     int filter_rows = static_cast<int>(filter_shape.dim_size(0));
@@ -182,25 +182,24 @@ class MklDnnConvUtil {
   // requires filter in OIHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
-  virtual inline void
-  GetFilterSizeInMklOrder(size_t src_index, size_t filter_index,
-                          memory::dims *filter_dims) {
+  virtual inline void GetFilterSizeInMklOrder(size_t src_index,
+                                              size_t filter_index,
+                                              memory::dims* filter_dims) {
     CHECK_NOTNULL(filter_dims);
     GetFilterSizeInMklOrder(GetTfShape(context_, src_index),
-                            GetTfShape(context_, filter_index),
-                            filter_dims);
+                            GetTfShape(context_, filter_index), filter_dims);
   }
 
   // Calculate Bias size for 2D Convolution. Function does not return
   // anything, but sets error in context status.
-  virtual inline void
-  GetBiasSizeInMklOrder(size_t bias_index, memory::dims *bias_dims) {
+  virtual inline void GetBiasSizeInMklOrder(size_t bias_index,
+                                            memory::dims* bias_dims) {
     const Tensor& bias = MklGetInput(context_, bias_index);
     OP_REQUIRES(context_, bias.dims() == 1,
                 errors::InvalidArgument("bias must be 1-dimensional: ",
                                         bias.shape().DebugString()));
 
-    *bias_dims = { static_cast<int>(bias.dim_size(0)) };
+    *bias_dims = {static_cast<int>(bias.dim_size(0))};
   }
 
   // Function to calculate output and padding size for 2D convolution.
@@ -212,13 +211,11 @@ class MklDnnConvUtil {
   // status is returned via context status.
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
-  virtual inline void
-  GetOutputAndPadSizeInMklOrder(const TensorShape& input_shape,
-                                const TensorShape& filter_shape,
-                                const memory::dims& strides,
-                                memory::dims *output_dims_tf_order,
-                                memory::dims *output_dims_mkl_order,
-                                memory::dims *pad_l, memory::dims *pad_r) {
+  virtual inline void GetOutputAndPadSizeInMklOrder(
+      const TensorShape& input_shape, const TensorShape& filter_shape,
+      const memory::dims& strides, memory::dims* output_dims_tf_order,
+      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
+      memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -244,16 +241,16 @@ class MklDnnConvUtil {
     int64 out_rows = 0, out_cols = 0;
     int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
 
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerbose(input_rows, filter_rows, stride_rows,
-                                 padding_, &out_rows, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerbose(input_cols, filter_cols, stride_cols,
-                                 padding_, &out_cols, &pad_left, &pad_right));
+    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                 input_rows, filter_rows, stride_rows, padding_,
+                                 &out_rows, &pad_top, &pad_bottom));
+    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                 input_cols, filter_cols, stride_cols, padding_,
+                                 &out_cols, &pad_left, &pad_right));
 
     // Tensorflow output is in data_format order. (NHWC or NCHW)
-    TensorShape out_shape = ShapeFromFormat(data_format_, out_batch,
-                                            out_rows, out_cols, out_depth);
+    TensorShape out_shape =
+        ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth);
     *output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
 
     // MKL-DNN always needs output in NCHW format.
@@ -273,12 +270,10 @@ class MklDnnConvUtil {
   // See comment on GetConvOutputAndPadSizeInMklOrder for parameters.
   //
   // Function does not return anything, but sets error in context status.
-  inline void
-  GetOutputAndPadSizeInMklOrder(size_t src_index, size_t filter_index,
-                                const memory::dims& strides,
-                                memory::dims *output_dims_tf_order,
-                                memory::dims *output_dims_mkl_order,
-                                memory::dims *pad_l, memory::dims *pad_r) {
+  inline void GetOutputAndPadSizeInMklOrder(
+      size_t src_index, size_t filter_index, const memory::dims& strides,
+      memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
+      memory::dims* pad_l, memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -289,11 +284,11 @@ class MklDnnConvUtil {
 
     OP_REQUIRES(context_, input_tf_shape.dims() == 4,
                 errors::InvalidArgument("input must be 4-dimensional",
-                                          input_tf_shape.DebugString()));
+                                        input_tf_shape.DebugString()));
 
-    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
-                                  strides, output_dims_tf_order,
-                                  output_dims_mkl_order, pad_l, pad_r);
+    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides,
+                                  output_dims_tf_order, output_dims_mkl_order,
+                                  pad_l, pad_r);
   }
 
   // Wrapper function to calculate input, filter, and output sizes of
@@ -302,15 +297,12 @@ class MklDnnConvUtil {
   // also calculates strides and paddings for 2D Convolution.
   //
   // Function does not return anything, but sets error in context status.
-  inline void GetConvFwdSizesInMklOrder(const TensorShape& input_shape,
-                                        const TensorShape& filter_shape,
-                                        memory::dims *input_dims,
-                                        memory::dims *filter_dims,
-                                        memory::dims *strides,
-                                        memory::dims *output_dims_tf_order,
-                                        memory::dims *output_dims_mkl_order,
-                                        memory::dims *pad_l,
-                                        memory::dims *pad_r) {
+  inline void GetConvFwdSizesInMklOrder(
+      const TensorShape& input_shape, const TensorShape& filter_shape,
+      memory::dims* input_dims, memory::dims* filter_dims,
+      memory::dims* strides, memory::dims* output_dims_tf_order,
+      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
+      memory::dims* pad_r) {
     CHECK_NOTNULL(input_dims);
     CHECK_NOTNULL(filter_dims);
     CHECK_NOTNULL(strides);
@@ -325,8 +317,7 @@ class MklDnnConvUtil {
     if (!context_->status().ok()) return;
     GetStridesInMklOrder(strides);
     GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides,
-                                  output_dims_tf_order,
-                                  output_dims_mkl_order,
+                                  output_dims_tf_order, output_dims_mkl_order,
                                   pad_l, pad_r);
     if (!context_->status().ok()) return;
   }
@@ -337,7 +328,7 @@ class MklDnnConvUtil {
 /////////////////////////////////////////////////////////////////////
 
 template <typename Device, class T>
-class MklConv2DBackpropCommonOp :  public OpKernel {
+class MklConv2DBackpropCommonOp : public OpKernel {
  public:
   ~MklConv2DBackpropCommonOp() {}
   explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context)
@@ -397,12 +388,11 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
           outbprop_tf_shape.num_elements() == 0) {
         MklDnnShape output_mkl_shape;
         output_mkl_shape.SetMklTensor(false);
-        TensorShape output_tf_shape = GetOutputTfShape(input_tf_shape,
-                                                       filter_tf_shape,
-                                                       outbprop_tf_shape);
+        TensorShape output_tf_shape = GetOutputTfShape(
+            input_tf_shape, filter_tf_shape, outbprop_tf_shape);
         const int kOutputIdx = 0;
         AllocateOutputSetMklShape(context, kOutputIdx, &output_tensor,
-                                    output_tf_shape, output_mkl_shape);
+                                  output_tf_shape, output_mkl_shape);
         CHECK_NOTNULL(output_tensor);
 
         // if output tensor has more than 0 elements, we need to 0 them out.
@@ -421,12 +411,10 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
       // Get forward convolution parameters.
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(input_tf_shape, filter_tf_shape,
-                                         &fwd_input_dims, &fwd_filter_dims,
-                                         &strides,
-                                         &fwd_output_dims_tf_order,
-                                         &fwd_output_dims,
-                                         &padding_l, &padding_r);
+      conv_utl.GetConvFwdSizesInMklOrder(
+          input_tf_shape, filter_tf_shape, &fwd_input_dims, &fwd_filter_dims,
+          &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l,
+          &padding_r);
       if (!context->status().ok()) return;
 
       // Create Convolution forward descriptor since Convolution backward
@@ -437,20 +425,22 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
       // construct input TF layout. For TF layout, although input shape
       // required is in MKL-DNN order, the layout is Tensorflow's layout
       // (NHWC or NCHW depending on data format).
-      auto fwd_input_md = input_mkl_shape.IsMklTensor() ?
-                          input_mkl_shape.GetMklLayout() :
-                       memory::desc(fwd_input_dims, MklDnnType<T>(), tf_fmt);
+      auto fwd_input_md =
+          input_mkl_shape.IsMklTensor()
+              ? input_mkl_shape.GetMklLayout()
+              : memory::desc(fwd_input_dims, MklDnnType<T>(), tf_fmt);
       // If filter is in MKL layout, then simply grab filter layout; otherwise
       // construct filter in TF layout. For TF layout, filter is in HWIO format.
-      auto fwd_filter_md = filter_mkl_shape.IsMklTensor() ?
-                          filter_mkl_shape.GetMklLayout() :
-                          memory::desc(fwd_filter_dims, MklDnnType<T>(),
-                                       memory::format::hwio);
+      auto fwd_filter_md = filter_mkl_shape.IsMklTensor()
+                               ? filter_mkl_shape.GetMklLayout()
+                               : memory::desc(fwd_filter_dims, MklDnnType<T>(),
+                                              memory::format::hwio);
       // Tensorflow Output of Conv2D is in data_format order.
       auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(), tf_fmt);
-      auto fwd_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, fwd_input_md, fwd_filter_md, fwd_out_md,
-            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+      auto fwd_desc = convolution_forward::desc(
+          prop_kind::forward, convolution_direct, fwd_input_md, fwd_filter_md,
+          fwd_out_md, strides, padding_l, padding_r,
+          TFPaddingToMklDnnPadding(padding_));
       auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
 
       // Create memory for user data. Describe how the inputs and outputs of
@@ -495,17 +485,16 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
       // Operator-specific call to create and execute primitive.
       CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter,
-                      &outbackprop, &output, &output_tensor,
-                      strides, padding_l, padding_r,
-                      TFPaddingToMklDnnPadding(padding_),
+                      &outbackprop, &output, &output_tensor, strides, padding_l,
+                      padding_r, TFPaddingToMklDnnPadding(padding_),
                       bwd_output_dims, bwd_output_format);
-    } catch (mkldnn::error &e) {
-     string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-     OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
-                                            error_msg));
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -523,11 +512,11 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
   /// Get TensorFlow shape of input tensor.
   virtual TensorShape MakeInputTfShape(OpKernelContext* context,
-                                      const Tensor& input_tensor) = 0;
+                                       const Tensor& input_tensor) = 0;
 
   /// Get TensorFlow shape of filter tensor.
   virtual TensorShape MakeFilterTfShape(OpKernelContext* context,
-                                       const Tensor& filter_tensor) = 0;
+                                        const Tensor& filter_tensor) = 0;
 
   /// Get the TensorFlow shape of output tensor.
   virtual TensorShape GetOutputTfShape(const TensorShape& input_shape,
@@ -536,9 +525,9 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
   /// Get shape of output in MKL-DNN order. Computes shape of output from
   /// input shape (fwd_input_dims) and filter shape (fwd_filter_dims).
-  virtual
-  const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
-                                    const memory::dims& fwd_filter_dims) = 0;
+  virtual const memory::dims& GetOutputDims(
+      const memory::dims& fwd_input_dims,
+      const memory::dims& fwd_filter_dims) = 0;
 
   /// Get data_format of output in MKL-DNN order. If output data format is
   /// same as input data format, then it simply returns value of data_format
@@ -546,17 +535,18 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
   virtual memory::format GetOutputFormat(const memory::format data_format) = 0;
 
   /// Create and execute the primitive storing output in the output_tensor.
-  virtual void CreatePrimitive(OpKernelContext* context,
-    const engine& cpu_engine,
-    const convolution_forward::primitive_desc& conv_fwd_pd,
-    MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
-    MklDnnData<T>* output, Tensor** output_tensor, const memory::dims& strides,
-    const memory::dims& padding_l, const memory::dims& padding_r,
-    padding_kind padding, const memory::dims& bwd_output_dims,
-    memory::format bwd_output_format) = 0;
+  virtual void CreatePrimitive(
+      OpKernelContext* context, const engine& cpu_engine,
+      const convolution_forward::primitive_desc& conv_fwd_pd,
+      MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
+      MklDnnData<T>* output, Tensor** output_tensor,
+      const memory::dims& strides, const memory::dims& padding_l,
+      const memory::dims& padding_r, padding_kind padding,
+      const memory::dims& bwd_output_dims,
+      memory::format bwd_output_format) = 0;
 
   // Get the data_format {NCHW, NHWC}
-  TensorFormat GetTFDataFormat () { return data_format_; }
+  TensorFormat GetTFDataFormat() { return data_format_; }
 
  private:
   std::vector<int32> strides_;
@@ -575,12 +565,12 @@ class MklDummyOp : public OpKernel {
  public:
   ~MklDummyOp() {}
 
-  explicit MklDummyOp(OpKernelConstruction* context) :
-    OpKernel(context) {}
+  explicit MklDummyOp(OpKernelConstruction* context) : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
-    TF_CHECK_OK(errors::Unimplemented("This is a dummy op."
-                                      "It should not have been invoked."));
+    TF_CHECK_OK(
+        errors::Unimplemented("This is a dummy op."
+                              "It should not have been invoked."));
   }
 };
 
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 8340a91d05..0b6d838e09 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -28,12 +28,12 @@ limitations under the License.
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
+using mkldnn::batch_normalization_backward;
+using mkldnn::batch_normalization_forward;
 using mkldnn::prop_kind;
-using mkldnn::use_scale_shift;
+using mkldnn::stream;
 using mkldnn::use_global_stats;
-using mkldnn::batch_normalization_forward;
-using mkldnn::batch_normalization_backward;
+using mkldnn::use_scale_shift;
 #endif
 
 // TODO(inteltf) Address comments from PR 8968.
@@ -601,7 +601,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
       mkl_res_batchnorm_bwd[dnnResourceSrc] =
           (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
 
-     bool mkl_convert_out_backprop;
+      bool mkl_convert_out_backprop;
       dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr;
       dnnLayout_t mkl_lt_internal_out_backprop = nullptr;
       void* mkl_buf_converted_out_backprop = nullptr;
@@ -709,12 +709,11 @@ class MklFusedBatchNormOp : public OpKernel {
       const size_t kMeanIndex = 3;      // index of est_mean tensor
       const size_t kVarianceIndex = 4;  // index of est_variance tensor
 
-      const Tensor& src_tensor          = MklGetInput(context, kSrcIndex);
-      const Tensor& scale_tensor        = MklGetInput(context, kScaleIndex);
-      const Tensor& shift_tensor        = MklGetInput(context, kShiftIndex);
-      const Tensor& est_mean_tensor     = MklGetInput(context, kMeanIndex);
-      const Tensor& est_variance_tensor = MklGetInput(context,
-                                                      kVarianceIndex);
+      const Tensor& src_tensor = MklGetInput(context, kSrcIndex);
+      const Tensor& scale_tensor = MklGetInput(context, kScaleIndex);
+      const Tensor& shift_tensor = MklGetInput(context, kShiftIndex);
+      const Tensor& est_mean_tensor = MklGetInput(context, kMeanIndex);
+      const Tensor& est_variance_tensor = MklGetInput(context, kVarianceIndex);
 
       TensorShape tf_shape_src;
       MklDnnShape dnn_shape_src;
@@ -723,37 +722,34 @@ class MklFusedBatchNormOp : public OpKernel {
       if (dnn_shape_src.IsMklTensor()) {
         tf_shape_src = dnn_shape_src.GetTfShape();
         OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       } else {
         tf_shape_src = src_tensor.shape();
         OP_REQUIRES(context, src_tensor.dims() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       }
       OP_REQUIRES(context, scale_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "scale must be 1-dimensional",
-                      scale_tensor.shape().DebugString()));
+                  errors::InvalidArgument("scale must be 1-dimensional",
+                                          scale_tensor.shape().DebugString()));
       OP_REQUIRES(context, shift_tensor.dims() == 1,
                   errors::InvalidArgument("offset must be 1-dimensional",
-                                        shift_tensor.shape().DebugString()));
-      OP_REQUIRES(context, est_mean_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "estimated_mean must be 1-dimensional",
-                      est_mean_tensor.shape().DebugString()));
-      OP_REQUIRES(context, est_variance_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "estimated_variance must be 1-dimensional",
-                      est_variance_tensor.shape().DebugString()));
+                                          shift_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, est_mean_tensor.dims() == 1,
+          errors::InvalidArgument("estimated_mean must be 1-dimensional",
+                                  est_mean_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, est_variance_tensor.dims() == 1,
+          errors::InvalidArgument("estimated_variance must be 1-dimensional",
+                                  est_variance_tensor.shape().DebugString()));
 
       if (is_training_) {
-        OP_REQUIRES(context, est_mean_tensor.dim_size(0) == 0,
-                    errors::InvalidArgument(
-                        "estimated_mean must be empty for training",
-                        est_mean_tensor.shape().DebugString()));
+        OP_REQUIRES(
+            context, est_mean_tensor.dim_size(0) == 0,
+            errors::InvalidArgument("estimated_mean must be empty for training",
+                                    est_mean_tensor.shape().DebugString()));
         OP_REQUIRES(context, est_variance_tensor.dim_size(0) == 0,
                     errors::InvalidArgument(
                         "estimated_variance must be empty for training",
@@ -763,11 +759,9 @@ class MklFusedBatchNormOp : public OpKernel {
       // special case: input with 0 element and 0 batch size
       Tensor* dst_tensor = nullptr;
       if (tf_shape_src.num_elements() == 0) {
-         HandleEmptyInput(context,
-                          tf_shape_src,
-                          scale_tensor.shape(),
-                          &dst_tensor);
-         return;
+        HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(),
+                         &dst_tensor);
+        return;
       }
 
       if (dnn_shape_src.IsMklTensor())
@@ -783,11 +777,8 @@ class MklFusedBatchNormOp : public OpKernel {
       Tensor* batch_variance_tensor = nullptr;
       Tensor* saved_mean_tensor = nullptr;
       Tensor* saved_variance_tensor = nullptr;
-      AllocateTFOutputs(context,
-                        scale_tensor.shape(),
-                        &batch_mean_tensor,
-                        &batch_variance_tensor,
-                        &saved_mean_tensor,
+      AllocateTFOutputs(context, scale_tensor.shape(), &batch_mean_tensor,
+                        &batch_variance_tensor, &saved_mean_tensor,
                         &saved_variance_tensor);
 
       if (is_training_)
@@ -815,69 +806,63 @@ class MklFusedBatchNormOp : public OpKernel {
         src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(),
                                              tensor_format_);
       } else {
-        src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
-                                             tensor_format_);
+        src_dims =
+            TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_);
       }
 
       auto src_md = dnn_shape_src.IsMklTensor()
-                    ? dnn_shape_src.GetMklLayout()
-                    : memory::desc(src_dims, MklDnnType<T>(), format_m);
+                        ? dnn_shape_src.GetMklLayout()
+                        : memory::desc(src_dims, MklDnnType<T>(), format_m);
       src.SetUsrMem(src_md, &src_tensor);
 
       // set weights primitive
       // MKL-DNN packs scale & shift as "weights":
       // <scale>...<scale><shift>...<shift>
-      auto weights_desc = memory::desc({2, depth_},
-                                       MklDnnType<T>(),
-                                       memory::format::nc);
+      auto weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
       auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
       auto weights_m = memory(weights_pd);
-      T* weights_data = reinterpret_cast<T*>(
-                        weights_m.get_data_handle());
-      T* scale_tf = reinterpret_cast<T*>(
-                    const_cast<T*>(scale_tensor.flat<T>().data()));
-      T* shift_tf = reinterpret_cast<T*>(
-                    const_cast<T*>(shift_tensor.flat<T>().data()));
-
-      for (int k=0; k < depth_; k++) {
+      T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
+      T* scale_tf =
+          reinterpret_cast<T*>(const_cast<T*>(scale_tensor.flat<T>().data()));
+      T* shift_tf =
+          reinterpret_cast<T*>(const_cast<T*>(shift_tensor.flat<T>().data()));
+
+      for (int k = 0; k < depth_; k++) {
         weights_data[k] = scale_tf[k];
         weights_data[k + depth_] = shift_tf[k];
       }
 
       // set mean primitive
-      auto mean_desc = memory::desc({1, depth_},
-                                    MklDnnType<T>(),
-                                    memory::format::nc);
+      auto mean_desc =
+          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
       auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine);
-      char* saved_mean_data_tf = reinterpret_cast<char*>
-                                 (saved_mean_tensor->flat<T>().data());
-      std::memcpy(saved_mean_data_tf,
-                  reinterpret_cast<char*>(mean_values_),
-                  depth_*sizeof(T));
-      auto mean_m = memory(mean_pd,
-                           reinterpret_cast<void*>(saved_mean_data_tf));
+      char* saved_mean_data_tf =
+          reinterpret_cast<char*>(saved_mean_tensor->flat<T>().data());
+      std::memcpy(saved_mean_data_tf, reinterpret_cast<char*>(mean_values_),
+                  depth_ * sizeof(T));
+      auto mean_m =
+          memory(mean_pd, reinterpret_cast<void*>(saved_mean_data_tf));
 
       // set variance primitive
-      auto variance_desc = memory::desc({1, depth_},
-                                    MklDnnType<T>(),
-                                    memory::format::nc);
+      auto variance_desc =
+          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
       auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine);
-      char* saved_variance_data_tf = reinterpret_cast<char*>
-                  (saved_variance_tensor->flat<T>().data());
+      char* saved_variance_data_tf =
+          reinterpret_cast<char*>(saved_variance_tensor->flat<T>().data());
       std::memcpy(saved_variance_data_tf,
                   reinterpret_cast<char*>(variance_values_),
-                  depth_*sizeof(T));
+                  depth_ * sizeof(T));
       auto variance_m = memory(variance_pd, saved_variance_data_tf);
 
-      prop_kind pk = (is_training_) ?
-                     prop_kind::forward_training :
-                     prop_kind::forward_scoring;
+      prop_kind pk = (is_training_) ? prop_kind::forward_training
+                                    : prop_kind::forward_scoring;
       auto bnrm_fwd_desc = batch_normalization_forward::desc(
-                               pk, src.GetUsrMemDesc(), epsilon_,
-                               is_training_ ? use_scale_shift :
-                               (use_scale_shift | use_global_stats));
+          pk, src.GetUsrMemDesc(), epsilon_,
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
       auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
-                             bnrm_fwd_desc, cpu_engine);
+          bnrm_fwd_desc, cpu_engine);
 
       // allocate dst tensor
       MklDnnShape dnn_shape_dst;
@@ -887,47 +872,39 @@ class MklFusedBatchNormOp : public OpKernel {
         auto dst_pd = bnrm_fwd_pd.dst_primitive_desc();
         dnn_shape_dst.SetMklLayout(&dst_pd);
         dnn_shape_dst.SetElemType(MklDnnType<T>());
-        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
-                                  src_dims, format_m);
-        tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), src_dims,
+                                  format_m);
+        tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_dst.SetMklTensor(false);
         tf_shape_dst = src_tensor.shape();
       }
-      AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor,
-                                tf_shape_dst, dnn_shape_dst);
+      AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor, tf_shape_dst,
+                                dnn_shape_dst);
 
       // Output of batchnorm has same shape as input.
       dst.SetUsrMem(src_md, dst_tensor);
 
       primitive bnrm_fwd_op;
       if (is_training_) {
-        bnrm_fwd_op = batch_normalization_forward(
-                          bnrm_fwd_pd,
-                          src.GetOpMem(),
-                          weights_m,
-                          dst.GetOpMem(),
-                          mean_m,
-                          variance_m);
+        bnrm_fwd_op =
+            batch_normalization_forward(bnrm_fwd_pd, src.GetOpMem(), weights_m,
+                                        dst.GetOpMem(), mean_m, variance_m);
       } else {
         bnrm_fwd_op = batch_normalization_forward(
-                          bnrm_fwd_pd,
-                          src.GetOpMem(),
-                          mean_m,
-                          variance_m,
-                          (const primitive::at) weights_m,
-                          dst.GetOpMem());
+            bnrm_fwd_pd, src.GetOpMem(), mean_m, variance_m,
+            (const primitive::at)weights_m, dst.GetOpMem());
       }
       std::vector<primitive> net;
       net.push_back(bnrm_fwd_op);
       stream(stream::kind::eager).submit(net).wait();
 
       // copy batch_mean data
-      T* batch_mean_data_tf = reinterpret_cast<T*>(
-                                batch_mean_tensor->flat<T>().data());
+      T* batch_mean_data_tf =
+          reinterpret_cast<T*>(batch_mean_tensor->flat<T>().data());
       std::memcpy(reinterpret_cast<char*>(batch_mean_data_tf),
                   reinterpret_cast<char*>(mean_m.get_data_handle()),
-                  depth_*sizeof(T));
+                  depth_ * sizeof(T));
 
       // copy batch_variance data with Bessel's correction
       // if training mode is on
@@ -937,18 +914,17 @@ class MklFusedBatchNormOp : public OpKernel {
         size_t adjust_size = orig_size - 1;
         adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
       }
-      for (int k=0; k < depth_; k++)
+      for (int k = 0; k < depth_; k++)
         batch_variance_tensor->flat<T>().data()[k] =
-            (reinterpret_cast<T*>(variance_m.get_data_handle()))[k]
-            * adjust_factor;
-    } catch (mkldnn::error &e) {
+            (reinterpret_cast<T*>(variance_m.get_data_handle()))[k] *
+            adjust_factor;
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + string(e.message) +
-                         ", in file " + string(__FILE__) + ":" +
-                         std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -958,7 +934,7 @@ class MklFusedBatchNormOp : public OpKernel {
   bool is_training_;
   T* mean_values_;
   T* variance_values_;
-  size_t depth_;          // batch normalization is done for per channel.
+  size_t depth_;  // batch normalization is done for per channel.
 
   void ExtractParams(OpKernelContext* context) {
     const Tensor& input = MklGetInput(context, 0);
@@ -966,23 +942,20 @@ class MklFusedBatchNormOp : public OpKernel {
   }
 
   void SetMeanVariance(const Tensor& mean, const Tensor& variance) {
-    mean_values_ = reinterpret_cast<T*>(
-                       const_cast<T*>(mean.flat<T>().data()));
-    variance_values_ = reinterpret_cast<T*>(
-                       const_cast<T*>(variance.flat<T>().data()));
+    mean_values_ = reinterpret_cast<T*>(const_cast<T*>(mean.flat<T>().data()));
+    variance_values_ =
+        reinterpret_cast<T*>(const_cast<T*>(variance.flat<T>().data()));
   }
 
-  void HandleEmptyInput(OpKernelContext* context,
-                        TensorShape tf_shape_src,
-                        TensorShape tf_shape_scale,
-                        Tensor** dst_tensor) {
+  void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src,
+                        TensorShape tf_shape_scale, Tensor** dst_tensor) {
     CHECK_NOTNULL(dst_tensor);
 
     const size_t kDstIndex = 0;
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, kDstIndex, dst_tensor,
-                              tf_shape_src, dnn_shape_dst);
+    AllocateOutputSetMklShape(context, kDstIndex, dst_tensor, tf_shape_src,
+                              dnn_shape_dst);
     CHECK_NOTNULL(*dst_tensor);
     memset(const_cast<char*>((*dst_tensor)->tensor_data().data()), 0,
            (*dst_tensor)->tensor_data().size());
@@ -991,15 +964,12 @@ class MklFusedBatchNormOp : public OpKernel {
     Tensor* batch_variance_tensor = nullptr;
     Tensor* saved_mean_tensor = nullptr;
     Tensor* saved_variance_tensor = nullptr;
-    AllocateTFOutputs(context, tf_shape_scale,
-                      &batch_mean_tensor,
-                      &batch_variance_tensor,
-                      &saved_mean_tensor,
+    AllocateTFOutputs(context, tf_shape_scale, &batch_mean_tensor,
+                      &batch_variance_tensor, &saved_mean_tensor,
                       &saved_variance_tensor);
   }
 
-  void AllocateTFOutputs(OpKernelContext* context,
-                         TensorShape tf_shape_scale,
+  void AllocateTFOutputs(OpKernelContext* context, TensorShape tf_shape_scale,
                          Tensor** batch_mean_tensor,
                          Tensor** batch_variance_tensor,
                          Tensor** saved_mean_tensor,
@@ -1017,51 +987,43 @@ class MklFusedBatchNormOp : public OpKernel {
     // allocate batch mean output tensor
     MklDnnShape mkl_shape_batch_mean;
     mkl_shape_batch_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context,
-                              kBatchMeanIndex,
-                              batch_mean_tensor,
-                              tf_shape_scale,
-                              mkl_shape_batch_mean);
+    AllocateOutputSetMklShape(context, kBatchMeanIndex, batch_mean_tensor,
+                              tf_shape_scale, mkl_shape_batch_mean);
     CHECK_NOTNULL(*batch_mean_tensor);
     // set NAN mean value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*batch_mean_tensor)->flat<T>().data()[k] = NAN;
 
     // allocate batch variance output tensor
     MklDnnShape mkl_shape_batch_variance;
     mkl_shape_batch_variance.SetMklTensor(false);
-    AllocateOutputSetMklShape(context,
-                              kBatchVarianceIndex,
-                              batch_variance_tensor,
-                              tf_shape_scale,
+    AllocateOutputSetMklShape(context, kBatchVarianceIndex,
+                              batch_variance_tensor, tf_shape_scale,
                               mkl_shape_batch_variance);
     CHECK_NOTNULL(*batch_variance_tensor);
     // set NAN variance value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*batch_variance_tensor)->flat<T>().data()[k] = NAN;
 
     // Mean and variance (without Bessel's correction) saved for backward
     // computation to serve as pre-computed mean and variance.
     MklDnnShape mkl_shape_saved_mean;
     mkl_shape_saved_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, kSavedMeanIndex,
-                              saved_mean_tensor,
-                              tf_shape_scale,
-                              mkl_shape_saved_mean);
+    AllocateOutputSetMklShape(context, kSavedMeanIndex, saved_mean_tensor,
+                              tf_shape_scale, mkl_shape_saved_mean);
     CHECK_NOTNULL(*saved_mean_tensor);
     // set NAN mean value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*saved_mean_tensor)->flat<T>().data()[k] = NAN;
 
     MklDnnShape mkl_shape_saved_variance;
     mkl_shape_saved_variance.SetMklTensor(false);
     AllocateOutputSetMklShape(context, kSavedVarianceIndex,
-                              saved_variance_tensor,
-                              tf_shape_scale,
+                              saved_variance_tensor, tf_shape_scale,
                               mkl_shape_saved_variance);
     CHECK_NOTNULL(*saved_variance_tensor);
     // set NAN variance value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*saved_variance_tensor)->flat<T>().data()[k] = NAN;
   }
 };
@@ -1093,8 +1055,8 @@ class MklFusedBatchNormGradOp : public OpKernel {
       const Tensor& src_tensor = MklGetInput(context, kSrcIndex);
       const Tensor& scale_tensor = MklGetInput(context, kScaleIndex);
       const Tensor& saved_mean_tensor = MklGetInput(context, kMeanIndex);
-      const Tensor& saved_variance_tensor = MklGetInput(context,
-                                            kVarianceIndex);
+      const Tensor& saved_variance_tensor =
+          MklGetInput(context, kVarianceIndex);
 
       MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
       GetMklShape(context, kSrcIndex, &dnn_shape_src);
@@ -1103,53 +1065,49 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       if (dnn_shape_diff_dst.IsMklTensor()) {
         tf_shape_diff_dst = dnn_shape_diff_dst.GetTfShape();
-        OP_REQUIRES(context, dnn_shape_diff_dst.GetDimension() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        diff_dst_tensor.shape().DebugString()));
+        OP_REQUIRES(
+            context, dnn_shape_diff_dst.GetDimension() == 4,
+            errors::InvalidArgument("input must be 4-dimensional",
+                                    diff_dst_tensor.shape().DebugString()));
       } else {
         tf_shape_diff_dst = diff_dst_tensor.shape();
-        OP_REQUIRES(context, diff_dst_tensor.dims() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        diff_dst_tensor.shape().DebugString()));
+        OP_REQUIRES(
+            context, diff_dst_tensor.dims() == 4,
+            errors::InvalidArgument("input must be 4-dimensional",
+                                    diff_dst_tensor.shape().DebugString()));
       }
 
       if (dnn_shape_src.IsMklTensor()) {
         tf_shape_src = dnn_shape_src.GetTfShape();
         OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                         src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       } else {
         tf_shape_src = src_tensor.shape();
         OP_REQUIRES(context, src_tensor.dims() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       }
 
       OP_REQUIRES(context, scale_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "scale must be 1-dimensional",
-                      scale_tensor.shape().DebugString()));
-      OP_REQUIRES(context, saved_mean_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "saved mean must be 1-dimensional",
-                       saved_mean_tensor.shape().DebugString()));
-
-      OP_REQUIRES(context, saved_variance_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "saved variance must be 1-dimensional",
-                      saved_variance_tensor.shape().DebugString()));
+                  errors::InvalidArgument("scale must be 1-dimensional",
+                                          scale_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, saved_mean_tensor.dims() == 1,
+          errors::InvalidArgument("saved mean must be 1-dimensional",
+                                  saved_mean_tensor.shape().DebugString()));
+
+      OP_REQUIRES(
+          context, saved_variance_tensor.dims() == 1,
+          errors::InvalidArgument("saved variance must be 1-dimensional",
+                                  saved_variance_tensor.shape().DebugString()));
 
       Tensor* diff_src_tensor = nullptr;
       if (tf_shape_src.num_elements() == 0 ||
           tf_shape_diff_dst.num_elements() == 0) {
-         HandleEmptyInput(context, tf_shape_src,
-                          scale_tensor.shape(),
-                          &diff_src_tensor);
-         return;
+        HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(),
+                         &diff_src_tensor);
+        return;
       }
 
       if (dnn_shape_src.IsMklTensor())
@@ -1175,20 +1133,18 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       memory::dims src_dims, diff_dst_dims;
       if (dnn_shape_src.IsMklTensor())
-        src_dims = TFShapeToMklDnnDimsInNCHW(
-                       dnn_shape_src.GetTfShape(), tensor_format_);
+        src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(),
+                                             tensor_format_);
       else
-        src_dims = TFShapeToMklDnnDimsInNCHW(
-                       src_tensor.shape(), tensor_format_);
+        src_dims =
+            TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_);
 
       if (dnn_shape_diff_dst.IsMklTensor())
         diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
-                            dnn_shape_diff_dst.GetTfShape(),
-                            tensor_format_);
+            dnn_shape_diff_dst.GetTfShape(), tensor_format_);
       else
-        diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
-                            diff_dst_tensor.shape(),
-                            tensor_format_);
+        diff_dst_dims =
+            TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), tensor_format_);
 
       // set src and diff_dst primitives
       memory::desc src_md({}, memory::data_undef, memory::format_undef);
@@ -1202,7 +1158,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
           src_md = diff_dst_md;
         }
       } else {
-        src_md =  memory::desc(src_dims, MklDnnType<T>(), format_m);
+        src_md = memory::desc(src_dims, MklDnnType<T>(), format_m);
         diff_dst_md = src_md;
       }
       src.SetUsrMem(src_md, &src_tensor);
@@ -1210,55 +1166,47 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       // weights -- DNN packs scales/shifts as weights in order of
       // scale, ..., scale, shift, ..., shift
-      auto weights_desc = memory::desc({2, depth_},
-                                       MklDnnType<T>(),
-                                       memory::format::nc);
+      auto weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
       auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
       auto weights_m = memory(weights_pd);
       T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
-      T* scale_tf = reinterpret_cast<T*>(const_cast<T*>
-                                        (scale_tensor.flat<T>().data()));
-      for (int k=0; k < depth_; k++) {
+      T* scale_tf =
+          reinterpret_cast<T*>(const_cast<T*>(scale_tensor.flat<T>().data()));
+      for (int k = 0; k < depth_; k++) {
         weights_data[k] = scale_tf[k];
         weights_data[k + depth_] = 0;
       }
 
       // set mean primitive
       memory::dims mv_dims = GetMeanVarianceDims();
-      mean.SetUsrMem(mv_dims,
-                     memory::format::nc,
-                     const_cast<void*>(static_cast<const void*>
-                     (saved_mean_tensor.flat<T>().data())));
+      mean.SetUsrMem(mv_dims, memory::format::nc,
+                     const_cast<void*>(static_cast<const void*>(
+                         saved_mean_tensor.flat<T>().data())));
       mean.SetOpMemDesc(mv_dims, memory::format::nc);
 
       // set variance primitive
-      variance.SetUsrMem(mv_dims,  memory::format::nc,
-                         const_cast<void*>(static_cast<const void*>
-                         (saved_variance_tensor.flat<T>().data())));
+      variance.SetUsrMem(mv_dims, memory::format::nc,
+                         const_cast<void*>(static_cast<const void*>(
+                             saved_variance_tensor.flat<T>().data())));
       variance.SetOpMemDesc(mv_dims, memory::format::nc);
 
       // set diff_weight primitive
-      auto diff_weights_desc = memory::desc(
-                                 {2, depth_},
-                                 MklDnnType<T>(),
-                                 memory::format::nc);
-      auto diff_weights_pd = memory::primitive_desc(
-                                diff_weights_desc,
-                                cpu_engine);
+      auto diff_weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto diff_weights_pd =
+          memory::primitive_desc(diff_weights_desc, cpu_engine);
       auto diff_weights_m = memory(diff_weights_pd);
 
       auto bnrm_fwd_desc = batch_normalization_forward::desc(
-                               prop_kind::forward_training,
-                               src.GetUsrMemDesc(),
-                               epsilon_,
-                               is_training_ ? use_scale_shift :
-                               (use_scale_shift | use_global_stats));
+          prop_kind::forward_training, src.GetUsrMemDesc(), epsilon_,
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
       auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
-                                bnrm_fwd_desc,
-                                cpu_engine);
+          bnrm_fwd_desc, cpu_engine);
 
       // Indices of output tensors
-      const size_t kDiffSrcIndex = 0;    // index of diff_src tensor
+      const size_t kDiffSrcIndex = 0;  // index of diff_src tensor
 
       // allocate diff_src tensor
       MklDnnShape dnn_shape_diff_src;
@@ -1268,14 +1216,11 @@ class MklFusedBatchNormGradOp : public OpKernel {
         auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc();
         dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
         dnn_shape_diff_src.SetElemType(MklDnnType<T>());
-        dnn_shape_diff_src.SetTfLayout(
-                              dnn_shape_src.GetDimension(),
-                              src_dims,
-                              format_m);
-        dnn_shape_diff_src.SetTfDimOrder(
-                              dnn_shape_src.GetDimension(),
-                              tensor_format_);
-        tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+        dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), src_dims,
+                                       format_m);
+        dnn_shape_diff_src.SetTfDimOrder(dnn_shape_src.GetDimension(),
+                                         tensor_format_);
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_diff_src.SetMklTensor(false);
         tf_shape_diff_src = src_tensor.shape();
@@ -1287,33 +1232,22 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       prop_kind pk = prop_kind::backward;
       auto bnrm_bwd_desc = batch_normalization_backward::desc(
-                               pk,
-                               diff_src.GetUsrMemDesc(),
-                               src.GetUsrMemDesc(),
-                               epsilon_,
-                               /* for inference, specify use_global_stats
-                                  1. on fwd prop, use mean and variance
-                                     provided as inputs
-                                  2. on bwd prop, mean and variance are
-                                     considered as constants. Thus, 
-                                     reduce the amout of MKL computations
-                               */
-                               is_training_ ? use_scale_shift :
-                               (use_scale_shift | use_global_stats));
+          pk, diff_src.GetUsrMemDesc(), src.GetUsrMemDesc(), epsilon_,
+          /* for inference, specify use_global_stats
+             1. on fwd prop, use mean and variance
+                provided as inputs
+             2. on bwd prop, mean and variance are
+                considered as constants. Thus,
+                reduce the amout of MKL computations
+          */
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
       auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc(
-                               bnrm_bwd_desc,
-                               cpu_engine,
-                               bnrm_fwd_pd);
+          bnrm_bwd_desc, cpu_engine, bnrm_fwd_pd);
 
       auto bnrm_bwd_op = batch_normalization_backward(
-                               bnrm_bwd_pd,
-                               src.GetOpMem(),
-                               mean.GetOpMem(),
-                               variance.GetOpMem(),
-                               diff_dst.GetOpMem(),
-                               weights_m,
-                               diff_src.GetOpMem(),
-                               diff_weights_m);
+          bnrm_bwd_pd, src.GetOpMem(), mean.GetOpMem(), variance.GetOpMem(),
+          diff_dst.GetOpMem(), weights_m, diff_src.GetOpMem(), diff_weights_m);
 
       std::vector<primitive> net;
       net.push_back(bnrm_bwd_op);
@@ -1322,43 +1256,39 @@ class MklFusedBatchNormGradOp : public OpKernel {
       // allocate 4 output TF tensors
       Tensor* diff_scale_tensor = nullptr;
       Tensor* diff_shift_tensor = nullptr;
-      AllocateTFOutputs(context, scale_tensor.shape(),
-                        &diff_scale_tensor,
+      AllocateTFOutputs(context, scale_tensor.shape(), &diff_scale_tensor,
                         &diff_shift_tensor);
 
       // copy data: diff_scale and diff_shift
-      T* diff_weights_data_dnn = reinterpret_cast<T*>
-                                 (diff_weights_m.get_data_handle());
+      T* diff_weights_data_dnn =
+          reinterpret_cast<T*>(diff_weights_m.get_data_handle());
       for (int i = 0; i < depth_; i++) {
-        diff_scale_tensor->flat<T>().data()[i] =
-                              diff_weights_data_dnn[i];
+        diff_scale_tensor->flat<T>().data()[i] = diff_weights_data_dnn[i];
         diff_shift_tensor->flat<T>().data()[i] =
-                              diff_weights_data_dnn[i + depth_];
+            diff_weights_data_dnn[i + depth_];
       }
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                          ", message: " + string(e.message) +
-                          ", in file " + string(__FILE__) + ":" +
-                          std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
  private:
   T epsilon_;
   TensorFormat tensor_format_;
-  int depth_;             // batch normalization is done for per channel.
+  int depth_;  // batch normalization is done for per channel.
   bool is_training_;
 
   void ExtractParams(OpKernelContext* context) {
-      const Tensor& input = MklGetInput(context, 0);
-      depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
+    const Tensor& input = MklGetInput(context, 0);
+    depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
   }
 
-  void HandleEmptyInput(OpKernelContext* context,
-                        TensorShape tf_shape_src,
+  void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src,
                         TensorShape tf_shape_scale_shift,
                         Tensor** diff_src_tensor) {
     const size_t kDiffSrcIndex = 0;
@@ -1366,22 +1296,20 @@ class MklFusedBatchNormGradOp : public OpKernel {
     MklDnnShape dnn_shape_diff_src;
     dnn_shape_diff_src.SetMklTensor(false);
     AllocateOutputSetMklShape(context, kDiffSrcIndex, diff_src_tensor,
-                                tf_shape_src, dnn_shape_diff_src);
-    for (size_t i=0; i < (*diff_src_tensor)->shape().num_elements(); i++)
-       (*diff_src_tensor)->flat<T>().data()[i] = 0;
+                              tf_shape_src, dnn_shape_diff_src);
+    for (size_t i = 0; i < (*diff_src_tensor)->shape().num_elements(); i++)
+      (*diff_src_tensor)->flat<T>().data()[i] = 0;
 
     Tensor* diff_scale_tensor = nullptr;
     Tensor* diff_shift_tensor = nullptr;
-    AllocateTFOutputs(context,
-                      tf_shape_scale_shift,
-                      &diff_scale_tensor,
+    AllocateTFOutputs(context, tf_shape_scale_shift, &diff_scale_tensor,
                       &diff_shift_tensor);
   }
 
   void AllocateTFOutputs(OpKernelContext* context,
-                        TensorShape tf_shape_scale_shift,
-                        Tensor** diff_scale_tensor,
-                        Tensor** diff_shift_tensor) {
+                         TensorShape tf_shape_scale_shift,
+                         Tensor** diff_scale_tensor,
+                         Tensor** diff_shift_tensor) {
     CHECK_NOTNULL(diff_scale_tensor);
     CHECK_NOTNULL(diff_shift_tensor);
 
@@ -1396,31 +1324,29 @@ class MklFusedBatchNormGradOp : public OpKernel {
     AllocateOutputSetMklShape(context, kDiffScaleIndex, diff_scale_tensor,
                               tf_shape_scale_shift, mkl_shape_diff_scale);
     CHECK_NOTNULL(*diff_scale_tensor);
-    for (size_t i=0; i < (*diff_scale_tensor)->shape().num_elements(); i++)
-       (*diff_scale_tensor)->flat<T>().data()[i] = 0;
+    for (size_t i = 0; i < (*diff_scale_tensor)->shape().num_elements(); i++)
+      (*diff_scale_tensor)->flat<T>().data()[i] = 0;
 
     MklDnnShape mkl_shape_diff_shift;
     mkl_shape_diff_shift.SetMklTensor(false);
     AllocateOutputSetMklShape(context, kDiffShiftIndex, diff_shift_tensor,
                               tf_shape_scale_shift, mkl_shape_diff_shift);
     CHECK_NOTNULL(*diff_shift_tensor);
-    for (size_t i=0; i < (*diff_shift_tensor)->shape().num_elements(); i++)
-       (*diff_shift_tensor)->flat<T>().data()[i] = 0;
+    for (size_t i = 0; i < (*diff_shift_tensor)->shape().num_elements(); i++)
+      (*diff_shift_tensor)->flat<T>().data()[i] = 0;
 
     // Placeholders for estimated_mean and estimated_variance, which are
     // used for inference and thus not needed here for gradient computation.
-    Tensor* p1_tensor = nullptr, *p2_tensor = nullptr;
+    Tensor *p1_tensor = nullptr, *p2_tensor = nullptr;
     MklDnnShape mkl_shape_p;
     mkl_shape_p.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, kP1Index, &p1_tensor,
-                              TensorShape({}), mkl_shape_p);
-    AllocateOutputSetMklShape(context, kP2Index, &p2_tensor,
-                              TensorShape({}), mkl_shape_p);
+    AllocateOutputSetMklShape(context, kP1Index, &p1_tensor, TensorShape({}),
+                              mkl_shape_p);
+    AllocateOutputSetMklShape(context, kP2Index, &p2_tensor, TensorShape({}),
+                              mkl_shape_p);
   }
 
-  memory::dims GetMeanVarianceDims() {
-    return memory::dims({1, depth_});
-  }
+  memory::dims GetMeanVarianceDims() { return memory::dims({1, depth_}); }
 };
 
 #endif
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 4b5f7b8310..73d41efce1 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -271,8 +271,8 @@ class MklInputConversionOp : public OpKernel {
     MklDnnShape input_shape_1;
     GetMklShape(context, 1, &input_shape_1);
 
-    bool tf_shapes_are_same = context->input(0).shape() ==
-                              context->input(1).shape();
+    bool tf_shapes_are_same =
+        context->input(0).shape() == context->input(1).shape();
 
     VLOG(1) << "MklInputConversionOp: Input shapes are "
             << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
@@ -400,9 +400,9 @@ class MklInputConversionOp : public OpKernel {
 
       // Create reorder between tensorflow layout and Mkl layout.
       std::vector<primitive> net;
-      CHECK_EQ(tf_input.CheckReorderToOpMem(memory::primitive_desc(
-                                            output_mkl_md, cpu_engine),
-                                            tensor_out, &net),
+      CHECK_EQ(tf_input.CheckReorderToOpMem(
+                   memory::primitive_desc(output_mkl_md, cpu_engine),
+                   tensor_out, &net),
                true);
       stream(stream::kind::eager).submit(net).wait();
 
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 95e0404ba8..a8b45004b7 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -22,6 +22,9 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 #include <vector>
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -30,9 +33,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
@@ -40,10 +40,10 @@ limitations under the License.
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
-using mkldnn::lrn_forward;
+using mkldnn::lrn_across_channels;
 using mkldnn::lrn_backward;
+using mkldnn::lrn_forward;
 using mkldnn::prop_kind;
-using mkldnn::lrn_across_channels;
 using mkldnn::stream;
 #endif
 
@@ -77,10 +77,11 @@ class MklLRNOp : public OpKernel {
   explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<size_t>(depth_radius64);
 
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
@@ -103,9 +104,10 @@ class MklLRNOp : public OpKernel {
                               : input.dims();
     OP_REQUIRES(context, mkl_context.in_dims == 4,
                 errors::InvalidArgument("input must be 4-dimensional"));
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("argument to LRN too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
+        errors::InvalidArgument("argument to LRN too large"));
 
     if (!input_in_mkl_format) {
       mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
@@ -339,17 +341,17 @@ class MklLRNOp : public OpKernel {
   float beta_;
 };
 
-
 template <typename T>
 class MklLRNGradOp : public OpKernel {
  public:
   explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
@@ -740,10 +742,11 @@ class MklLRNOp : public OpKernel {
   explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<size_t>(depth_radius64);
 
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
@@ -773,10 +776,10 @@ class MklLRNOp : public OpKernel {
       if (!src_dnn_shape.IsMklTensor()) {
         MklDefaultToEigen(context, src_tensor);
         return;
-      } else if (!src_dnn_shape.IsMklChannelDim(
-                  src_dnn_shape.GetDimension() - 1) ) {
+      } else if (!src_dnn_shape.IsMklChannelDim(src_dnn_shape.GetDimension() -
+                                                1)) {
         Tensor converted_tensor =
-          ConvertMklToTF<T>(context, src_tensor, src_dnn_shape);
+            ConvertMklToTF<T>(context, src_tensor, src_dnn_shape);
         MklDefaultToEigen(context, converted_tensor);
         return;
       }
@@ -807,18 +810,16 @@ class MklLRNOp : public OpKernel {
       // Create LRN primitive descriptor.
       // Tensorflow's normalization semantics is across channels.
       // MKL-DNN also supports normalization within channel.
-      auto lrn_desc = lrn_forward::desc(prop_kind::forward,
-                                        lrn_across_channels,
+      auto lrn_desc = lrn_forward::desc(prop_kind::forward, lrn_across_channels,
                                         src_dnn_data.GetUsrMemDesc(),
-                                        kernel_size,
-                                        new_alpha, beta_, bias_);
+                                        kernel_size, new_alpha, beta_, bias_);
       auto lrn_prim_desc = lrn_forward::primitive_desc(lrn_desc, cpu_engine);
 
       // Allocate output_dnn_data tensor.
       Tensor* output_tensor = nullptr;
       memory::format input_format = src_dnn_shape.GetTfDataFormat();
-      AllocateOutputTensor(context, lrn_prim_desc, input_dims,
-                    input_format, &output_tensor);
+      AllocateOutputTensor(context, lrn_prim_desc, input_dims, input_format,
+                           &output_tensor);
       OP_REQUIRES_OK(context, context->status());
       CHECK_NOTNULL(output_tensor);
       dst_dnn_data.SetUsrMemDataHandle(output_tensor);
@@ -827,25 +828,23 @@ class MklLRNOp : public OpKernel {
       AllocateWorkspaceTensor(context, lrn_prim_desc, &workspace_dnn_data);
       OP_REQUIRES_OK(context, context->status());
 
-      PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data,
-                          &dst_dnn_data, &workspace_dnn_data);
-    } catch (mkldnn::error &e) {
+      PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data, &dst_dnn_data,
+                           &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
  private:
-  void PrepareAndExecuteNet(
-                const lrn_forward::primitive_desc& lrn_fwd_desc,
-                MklDnnData<T>* src_dnn_data,
-                MklDnnData<T>* dst_dnn_data,
-                MklDnnData<uint8>* wksp_dnn_data = nullptr) {
+  void PrepareAndExecuteNet(const lrn_forward::primitive_desc& lrn_fwd_desc,
+                            MklDnnData<T>* src_dnn_data,
+                            MklDnnData<T>* dst_dnn_data,
+                            MklDnnData<uint8>* wksp_dnn_data = nullptr) {
     std::vector<primitive> net;
 
     // Check for input reorder
@@ -853,23 +852,21 @@ class MklLRNOp : public OpKernel {
 
     // Create pooling primitive and add it to net
     if (wksp_dnn_data != nullptr) {
-        net.push_back(lrn_forward(lrn_fwd_desc,
-                        src_dnn_data->GetOpMem(),
-                        wksp_dnn_data->GetOpMem(),
-                        dst_dnn_data->GetOpMem()));
+      net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(),
+                                wksp_dnn_data->GetOpMem(),
+                                dst_dnn_data->GetOpMem()));
     } else {
-        net.push_back(lrn_forward(lrn_fwd_desc,
-            src_dnn_data->GetOpMem(),
-            dst_dnn_data->GetOpMem()));
+      net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(),
+                                dst_dnn_data->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
     memory::primitive_desc dst_pd = lrn_fwd_prim_desc.dst_primitive_desc();
 
@@ -880,111 +877,106 @@ class MklLRNOp : public OpKernel {
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
     TensorShape output_tf_shape;
     // only allocate enough space for the elements we need.
     size_t num_bytes = dst_pd.get_size();
     CHECK_EQ(num_bytes % sizeof(T), 0);
     output_tf_shape.AddDim(num_bytes / sizeof(T));
-    AllocateOutputSetMklShape(context, kIdxOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
-    }
-
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context,
-                           const Tensor& input) {
-      const int batch = static_cast<int>(input.dim_size(0));
-      const int rows = static_cast<int>(input.dim_size(1));
-      const int cols = static_cast<int>(input.dim_size(2));
-      const int depth = static_cast<int>(input.dim_size(3));
-      const int nodes = cols * rows;
-
-      auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
-      // Multiplying the input with the band matrix has the effect of reducing
-      // the
-      // correct patch along the depth.
-      Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
-      GetBandMatrix<T>(depth, depth_radius_, &multiplier);
+    AllocateOutputSetMklShape(context, kIdxOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
+  }
 
-      Tensor *output_dnn_data = nullptr;
-      MklDnnShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
-                                input.shape(), mkl_output_mkl_shape);
-      CHECK_NOTNULL(output_dnn_data);
-
-      Tensor* workspace_tensor = nullptr;
-      MklDnnShape workspace_mkl_shape;
-      workspace_mkl_shape.SetMklTensor(false);
-      TensorShape workspace_tf_shape;
-      workspace_tf_shape.AddDim(0);
-      AllocateOutputSetMklShape(context, kIdxWorkspace,
-                              &workspace_tensor,
+  // Fallback implementation - Taken from lrn_op.cc
+  // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
+  // copy.
+  void MklDefaultToEigen(OpKernelContext* context, const Tensor& input) {
+    const int batch = static_cast<int>(input.dim_size(0));
+    const int rows = static_cast<int>(input.dim_size(1));
+    const int cols = static_cast<int>(input.dim_size(2));
+    const int depth = static_cast<int>(input.dim_size(3));
+    const int nodes = cols * rows;
+
+    auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
+    // Multiplying the input with the band matrix has the effect of reducing
+    // the
+    // correct patch along the depth.
+    Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
+    GetBandMatrix<T>(depth, depth_radius_, &multiplier);
+
+    Tensor* output_dnn_data = nullptr;
+    MklDnnShape mkl_output_mkl_shape;
+    mkl_output_mkl_shape.SetMklTensor(false);
+    mkl_output_mkl_shape.SetDimensions(4);
+    AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
+                              input.shape(), mkl_output_mkl_shape);
+    CHECK_NOTNULL(output_dnn_data);
+
+    Tensor* workspace_tensor = nullptr;
+    MklDnnShape workspace_mkl_shape;
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(0);
+    AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor,
                               workspace_tf_shape, workspace_mkl_shape);
-      CHECK_NOTNULL(workspace_tensor);
-
-      auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
-      Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
-      auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
-      if (beta_ == T(1)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.inverse();
-      } else if (beta_ == T(0.5)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.rsqrt();
-      } else {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * (tmp.log() * -beta_).exp();
-      }
+    CHECK_NOTNULL(workspace_tensor);
+
+    auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
+    Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+    auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
+    if (beta_ == T(1)) {
+      out_shaped.device(context->eigen_cpu_device()) =
+          in_shaped * tmp.inverse();
+    } else if (beta_ == T(0.5)) {
+      out_shaped.device(context->eigen_cpu_device()) = in_shaped * tmp.rsqrt();
+    } else {
+      out_shaped.device(context->eigen_cpu_device()) =
+          in_shaped * (tmp.log() * -beta_).exp();
     }
+  }
 
-    void AllocateWorkspaceTensor(OpKernelContext* context,
-                const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
-                MklDnnData<uint8>* dnn_data_wksp) {
-      CHECK_NOTNULL(dnn_data_wksp);
-      Tensor* workspace_tensor = nullptr;
-      memory::primitive_desc workspace_pd
-                  = lrn_fwd_prim_desc.workspace_primitive_desc();
-      size_t workspace_bytes = workspace_pd.get_size();
-      MklDnnShape workspace_mkl_shape;
-      // the workspace tensor is a uint8 tensor that has
-      // exactly the number of bytes necessary
-      workspace_mkl_shape.SetMklTensor(false);
-      TensorShape workspace_tf_shape;
-      workspace_tf_shape.AddDim(workspace_bytes);
-      AllocateOutputSetMklShape(context, kIdxWorkspace,
-                              &workspace_tensor,
+  void AllocateWorkspaceTensor(
+      OpKernelContext* context,
+      const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
+      MklDnnData<uint8>* dnn_data_wksp) {
+    CHECK_NOTNULL(dnn_data_wksp);
+    Tensor* workspace_tensor = nullptr;
+    memory::primitive_desc workspace_pd =
+        lrn_fwd_prim_desc.workspace_primitive_desc();
+    size_t workspace_bytes = workspace_pd.get_size();
+    MklDnnShape workspace_mkl_shape;
+    // the workspace tensor is a uint8 tensor that has
+    // exactly the number of bytes necessary
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(workspace_bytes);
+    AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor,
                               workspace_tf_shape, workspace_mkl_shape);
-      CHECK_NOTNULL(workspace_tensor);
-      dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
-    }
+    CHECK_NOTNULL(workspace_tensor);
+    dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
+  }
 
   void SanityCheckInputs(OpKernelContext* context) {
     const Tensor& src_tensor = MklGetInput(context, kIdxInput);
     MklDnnShape src_dnn_shape;
     GetMklShape(context, kIdxInput, &src_dnn_shape);
     if (src_dnn_shape.IsMklTensor()) {
-        OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4,
-                    errors::InvalidArgument("input must be 4-dimensional"));
-        OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(),
-                                            std::numeric_limits<int>::max()),
-                        errors::InvalidArgument("argument to LRN too large"));
+      OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional"));
+      OP_REQUIRES(context,
+                  FastBoundsCheck(src_tensor.NumElements(),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("argument to LRN too large"));
     } else {
-        OP_REQUIRES(context, src_tensor.dims() == 4,
-                    errors::InvalidArgument("input must be 4-dimensional"));
-        OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(),
-                                            std::numeric_limits<int>::max()),
-                        errors::InvalidArgument("argument to LRN too large"));
+      OP_REQUIRES(context, src_tensor.dims() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional"));
+      OP_REQUIRES(context,
+                  FastBoundsCheck(src_tensor.NumElements(),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("argument to LRN too large"));
     }
   }
-  const int kIdxInput = 0,
-            kIdxOutput = 0,
-            kIdxWorkspace = 1;
+  const int kIdxInput = 0, kIdxOutput = 0, kIdxWorkspace = 1;
 
   typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
   bool workspace_enabled_;
@@ -994,17 +986,17 @@ class MklLRNOp : public OpKernel {
   float beta_;
 };
 
-
 template <typename T>
 class MklLRNGradOp : public OpKernel {
  public:
   explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
@@ -1025,7 +1017,7 @@ class MklLRNGradOp : public OpKernel {
       MklDnnData<T> output_dnn_data(&cpu_engine);
 
       MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape,
-                  orig_output_dnn_shape;
+          orig_output_dnn_shape;
       GetMklShape(context, kIdxGradient, &input_grad_dnn_shape);
       GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape);
       GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape);
@@ -1037,16 +1029,16 @@ class MklLRNGradOp : public OpKernel {
                             orig_input_dnn_shape.IsMklTensor() &&
                             orig_output_dnn_shape.IsMklTensor() &&
                             input_grad_dnn_shape.IsMklChannelDim(
-                              input_grad_dnn_shape.GetDimension() - 1) &&
+                                input_grad_dnn_shape.GetDimension() - 1) &&
                             orig_input_dnn_shape.IsMklChannelDim(
-                              orig_input_dnn_shape.GetDimension() - 1) &&
+                                orig_input_dnn_shape.GetDimension() - 1) &&
                             orig_output_dnn_shape.IsMklChannelDim(
-                              orig_output_dnn_shape.GetDimension() - 1);
+                                orig_output_dnn_shape.GetDimension() - 1);
 
       if (!can_use_mkldnn) {
-          // Fallback to eigen
-          MklDefaultToEigen(context);
-          return;
+        // Fallback to eigen
+        MklDefaultToEigen(context);
+        return;
       }
       // At this point, we have the all clear to use MklDnn constructs
       // Naming: diff_dst is input_gradient_tensor; src is orig_input_tensor.
@@ -1059,13 +1051,11 @@ class MklLRNGradOp : public OpKernel {
       // NHWC format.
       memory::desc original_output_md = orig_output_dnn_shape.GetCurLayout();
       memory::desc target_diff_dst_md = ConfigureInputGradient(
-                                input_grad_tensor,
-                                input_grad_dnn_shape,
-                                &input_grad_dnn_data);
+          input_grad_tensor, input_grad_dnn_shape, &input_grad_dnn_data);
 
       memory::desc orig_input_md = orig_input_dnn_shape.GetCurLayout();
       memory::dims orig_input_dims =
-                orig_input_dnn_shape.GetSizesAsMklDnnDims();
+          orig_input_dnn_shape.GetSizesAsMklDnnDims();
       orig_input_dnn_data.SetUsrMem(orig_input_md, &orig_input_tensor);
       orig_input_dnn_data.SetOpMemDesc(orig_input_dims, memory::format::nhwc);
 
@@ -1079,27 +1069,21 @@ class MklLRNGradOp : public OpKernel {
 
       // Create LRN backward primitive descriptor. It requires LRN forward
       // primitive descriptor also.
-      auto lrn_fwd_desc = lrn_forward::desc(prop_kind::forward,
-                                        lrn_across_channels,
-                                        orig_input_md,
-                                        kernel_size,
-                                        new_alpha, beta_, bias_);
-      auto lrn_fwd_prim_desc = lrn_forward::primitive_desc(lrn_fwd_desc,
-                                                           cpu_engine);
-      auto lrn_bwd_desc = lrn_backward::desc(lrn_across_channels,
-                                        original_output_md,
-                                        target_diff_dst_md,
-                                        kernel_size,
-                                        new_alpha, beta_, bias_);
-      auto lrn_bwd_prim_desc = lrn_backward::primitive_desc(lrn_bwd_desc,
-                                                          cpu_engine,
-                                                          lrn_fwd_prim_desc);
+      auto lrn_fwd_desc = lrn_forward::desc(
+          prop_kind::forward, lrn_across_channels, orig_input_md, kernel_size,
+          new_alpha, beta_, bias_);
+      auto lrn_fwd_prim_desc =
+          lrn_forward::primitive_desc(lrn_fwd_desc, cpu_engine);
+      auto lrn_bwd_desc = lrn_backward::desc(
+          lrn_across_channels, original_output_md, target_diff_dst_md,
+          kernel_size, new_alpha, beta_, bias_);
+      auto lrn_bwd_prim_desc = lrn_backward::primitive_desc(
+          lrn_bwd_desc, cpu_engine, lrn_fwd_prim_desc);
 
       Tensor* output_tensor = nullptr;
-      memory::format orig_input_format
-                    = orig_input_dnn_shape.GetTfDataFormat();
-      AllocateOutputTensor(context, lrn_bwd_prim_desc,
-            orig_input_dims, orig_input_format, &output_tensor);
+      memory::format orig_input_format = orig_input_dnn_shape.GetTfDataFormat();
+      AllocateOutputTensor(context, lrn_bwd_prim_desc, orig_input_dims,
+                           orig_input_format, &output_tensor);
       OP_REQUIRES_OK(context, context->status());
       CHECK_NOTNULL(output_tensor);
       output_dnn_data.SetUsrMemDataHandle(output_tensor);
@@ -1110,35 +1094,32 @@ class MklLRNGradOp : public OpKernel {
       const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace);
       MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
       ConfigureWorkspace(workspace_tensor,
-                          lrn_fwd_prim_desc.workspace_primitive_desc(),
-                          &workspace_dnn_data);
-
-      PrepareAndExecuteNet(lrn_bwd_prim_desc,
-                lrn_fwd_prim_desc,
-                &orig_input_dnn_data,
-                &input_grad_dnn_data,
-                &output_dnn_data,
-                memory::primitive_desc(target_diff_dst_md, cpu_engine),
-                &workspace_dnn_data);
-    } catch (mkldnn::error &e) {
+                         lrn_fwd_prim_desc.workspace_primitive_desc(),
+                         &workspace_dnn_data);
+
+      PrepareAndExecuteNet(
+          lrn_bwd_prim_desc, lrn_fwd_prim_desc, &orig_input_dnn_data,
+          &input_grad_dnn_data, &output_dnn_data,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine),
+          &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const lrn_backward::primitive_desc& lrn_bkwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const lrn_backward::primitive_desc& lrn_bkwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
-    memory::primitive_desc dst_pd
-                = lrn_bkwd_prim_desc.diff_src_primitive_desc();
+    memory::primitive_desc dst_pd =
+        lrn_bkwd_prim_desc.diff_src_primitive_desc();
     MklDnnShape output_mkl_shape;
 
     // We assume that all outputs at this point are MKL Tensors
@@ -1146,170 +1127,153 @@ class MklLRNGradOp : public OpKernel {
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
 
     TensorShape output_tf_shape;
     size_t num_bytes = dst_pd.get_size();
     CHECK_EQ(num_bytes % sizeof(T), 0);
     output_tf_shape.AddDim(num_bytes / sizeof(T));
-    AllocateOutputSetMklShape(context, kIdxOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
+    AllocateOutputSetMklShape(context, kIdxOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
   }
 
   memory::desc ConfigureInputGradient(const Tensor& input_grad_tensor,
-                    const MklDnnShape& input_grad_dnn_shape,
-                    MklDnnData<T> *input_grad_dnn_data) {
+                                      const MklDnnShape& input_grad_dnn_shape,
+                                      MklDnnData<T>* input_grad_dnn_data) {
     CHECK_NOTNULL(input_grad_dnn_data);
     // This shouldn't be necessary at this point, but just in case
     CHECK_EQ(input_grad_dnn_shape.IsMklTensor(), true);
 
     memory::desc input_grad_md = input_grad_dnn_shape.GetCurLayout();
-    memory::dims orig_input_dims =
-              input_grad_dnn_shape.GetSizesAsMklDnnDims();
+    memory::dims orig_input_dims = input_grad_dnn_shape.GetSizesAsMklDnnDims();
     input_grad_dnn_data->SetUsrMem(input_grad_md, &input_grad_tensor);
     input_grad_dnn_data->SetOpMemDesc(orig_input_dims, memory::format::nhwc);
     return input_grad_md;
   }
 
   void PrepareAndExecuteNet(
-        const lrn_backward::primitive_desc& lrn_bkwd_desc,
-        const lrn_forward::primitive_desc& lrn_fwd_desc,
-        MklDnnData<T>* src_dnn_data,
-        MklDnnData<T>* input_gradient_diff_dst,
-        MklDnnData<T>* output_diff_src,
-        const memory::primitive_desc& target_diff_dst_pd,
-        const MklDnnData<uint8>* workspace_dnn_data = nullptr) {
+      const lrn_backward::primitive_desc& lrn_bkwd_desc,
+      const lrn_forward::primitive_desc& lrn_fwd_desc,
+      MklDnnData<T>* src_dnn_data, MklDnnData<T>* input_gradient_diff_dst,
+      MklDnnData<T>* output_diff_src,
+      const memory::primitive_desc& target_diff_dst_pd,
+      const MklDnnData<uint8>* workspace_dnn_data = nullptr) {
     std::vector<primitive> net;
 
     // Check for input reordering on the diff dst input
     input_gradient_diff_dst->CheckReorderToOpMem(
-                lrn_bkwd_desc.diff_dst_primitive_desc(), &net);
+        lrn_bkwd_desc.diff_dst_primitive_desc(), &net);
 
     // Check for input reordering on the original input
-    src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(),
-                                      &net);
+    src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(), &net);
     // Create pooling primitive and add it to net
     if (nullptr == workspace_dnn_data) {
-      net.push_back(lrn_backward(lrn_bkwd_desc,
-                              src_dnn_data->GetOpMem(),
-                              input_gradient_diff_dst->GetOpMem(),
-                              output_diff_src->GetOpMem()));
+      net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(),
+                                 input_gradient_diff_dst->GetOpMem(),
+                                 output_diff_src->GetOpMem()));
     } else {
-      net.push_back(lrn_backward(lrn_bkwd_desc,
-                                src_dnn_data->GetOpMem(),
-                                  input_gradient_diff_dst->GetOpMem(),
-                                  workspace_dnn_data->GetOpMem(),
-                                  output_diff_src->GetOpMem()));
+      net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(),
+                                 input_gradient_diff_dst->GetOpMem(),
+                                 workspace_dnn_data->GetOpMem(),
+                                 output_diff_src->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
 
   void ConfigureWorkspace(const Tensor& workspace_tensor,
-                    memory::primitive_desc workspace_pd,
-                    MklDnnData<uint8> *workspace_dnn_data) {
+                          memory::primitive_desc workspace_pd,
+                          MklDnnData<uint8>* workspace_dnn_data) {
     CHECK_NOTNULL(workspace_dnn_data);
 
     workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
   }
 
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context) {
-      Tensor input_gradient_tensor;
-      Tensor orig_input_tensor;
-      Tensor orig_output_tensor;
-
-      MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape,
-                  orig_output_dnn_shape;
-      GetMklShape(context, kIdxGradient, &input_grad_dnn_shape);
-      GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape);
-      GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape);
-
-      if (input_grad_dnn_shape.IsMklTensor()) {
-        input_gradient_tensor =
-            ConvertMklToTF<T>(context,
-                              MklGetInput(context, kIdxGradient),
-                              input_grad_dnn_shape);
-      } else {
-        input_gradient_tensor = MklGetInput(context, kIdxGradient);
-      }
-
-      if (orig_input_dnn_shape.IsMklTensor()) {
-        orig_input_tensor =
-            ConvertMklToTF<T>(context,
-                              MklGetInput(context, kIdxOrigInput),
-                              orig_input_dnn_shape);
-      } else {
-        orig_input_tensor = MklGetInput(context, kIdxOrigInput);
-      }
+  // Fallback implementation - Taken from lrn_op.cc
+  // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
+  // copy.
+  void MklDefaultToEigen(OpKernelContext* context) {
+    Tensor input_gradient_tensor;
+    Tensor orig_input_tensor;
+    Tensor orig_output_tensor;
+
+    MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape,
+        orig_output_dnn_shape;
+    GetMklShape(context, kIdxGradient, &input_grad_dnn_shape);
+    GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape);
+    GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape);
+
+    if (input_grad_dnn_shape.IsMklTensor()) {
+      input_gradient_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxGradient), input_grad_dnn_shape);
+    } else {
+      input_gradient_tensor = MklGetInput(context, kIdxGradient);
+    }
 
-      if (orig_output_dnn_shape.IsMklTensor()) {
-        orig_output_tensor =
-            ConvertMklToTF<T>(context,
-                              MklGetInput(context, kIdxOrigOutput),
-                              orig_output_dnn_shape);
-      } else {
-        orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
-      }
+    if (orig_input_dnn_shape.IsMklTensor()) {
+      orig_input_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxOrigInput), orig_input_dnn_shape);
+    } else {
+      orig_input_tensor = MklGetInput(context, kIdxOrigInput);
+    }
 
-      const int64 batch = static_cast<int64>(input_gradient_tensor.dim_size(0));
-      const int64 rows = static_cast<int64>(input_gradient_tensor.dim_size(1));
-      const int64 cols = static_cast<int64>(input_gradient_tensor.dim_size(2));
-      const int64 depth = static_cast<int64>(input_gradient_tensor.dim_size(3));
-      const auto nodes = cols * rows;
+    if (orig_output_dnn_shape.IsMklTensor()) {
+      orig_output_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxOrigOutput), orig_output_dnn_shape);
+    } else {
+      orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
+    }
 
-      auto grads_shaped =
-          input_gradient_tensor.shaped<T, 2>({nodes * batch, depth});
+    const int64 batch = static_cast<int64>(input_gradient_tensor.dim_size(0));
+    const int64 rows = static_cast<int64>(input_gradient_tensor.dim_size(1));
+    const int64 cols = static_cast<int64>(input_gradient_tensor.dim_size(2));
+    const int64 depth = static_cast<int64>(input_gradient_tensor.dim_size(3));
+    const auto nodes = cols * rows;
 
-      auto in_shaped = orig_input_tensor.shaped<T, 2>({nodes * batch, depth});
-      auto activations =
-              orig_output_tensor.shaped<T, 2>({nodes * batch, depth});
+    auto grads_shaped =
+        input_gradient_tensor.shaped<T, 2>({nodes * batch, depth});
 
-      Tensor* output_dnn_data;
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, kIdxOutput,
-                      &output_dnn_data,
-                      input_gradient_tensor.shape(),
-                      mkl_output_mkl_shape);
+    auto in_shaped = orig_input_tensor.shaped<T, 2>({nodes * batch, depth});
+    auto activations = orig_output_tensor.shaped<T, 2>({nodes * batch, depth});
 
-      auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
-      out_shaped.setZero();
-      auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
-                    depth](int64 begin, int64 end) {
-        for (int64 i = begin; i < end; ++i) {
-          for (int64 j = 0; j < depth; ++j) {
-            int64 depth_begin = std::max<int64>(0, j - depth_radius_);
-            int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+    Tensor* output_dnn_data;
+    MklShape mkl_output_mkl_shape;
+    mkl_output_mkl_shape.SetMklTensor(false);
+    mkl_output_mkl_shape.SetDimensions(4);
+    AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
+                              input_gradient_tensor.shape(),
+                              mkl_output_mkl_shape);
 
-            T norm(0);
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              norm += in_shaped(i, k) * in_shaped(i, k);
-            }
-            norm = alpha_ * norm + bias_;
-            DCHECK_GT(norm, T(1e-6));
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
-                      activations(i, j) / norm;
-              if (k == j) {
-                dyi += Eigen::numext::pow(norm, -beta_);
-              }
-              dyi *= grads_shaped(i, j);
-              const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) +=
-                  dyi;
+    auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
+    out_shaped.setZero();
+    auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
+                  depth](int64 begin, int64 end) {
+      for (int64 i = begin; i < end; ++i) {
+        for (int64 j = 0; j < depth; ++j) {
+          int64 depth_begin = std::max<int64>(0, j - depth_radius_);
+          int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+
+          T norm(0);
+          for (int64 k = depth_begin; k < depth_end; ++k) {
+            norm += in_shaped(i, k) * in_shaped(i, k);
+          }
+          norm = alpha_ * norm + bias_;
+          DCHECK_GT(norm, T(1e-6));
+          for (int64 k = depth_begin; k < depth_end; ++k) {
+            T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
+                    activations(i, j) / norm;
+            if (k == j) {
+              dyi += Eigen::numext::pow(norm, -beta_);
             }
+            dyi *= grads_shaped(i, j);
+            const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) += dyi;
           }
         }
-      };
-      auto worker_threads =
-          *(context->device()->tensorflow_cpu_worker_threads());
-      Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
-            depth * depth, shard);
-    }
+      }
+    };
+    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+    Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
+          depth * depth, shard);
+  }
 
   void SanityCheckInputs(OpKernelContext* context) {
     const Tensor& input_gradient_tensor = MklGetInput(context, kIdxGradient);
@@ -1317,59 +1281,59 @@ class MklLRNGradOp : public OpKernel {
     const Tensor& orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
     const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace);
     MklDnnShape in_grads_dnn_shape, in_image_dnn_shape, out_image_dnn_shape,
-                workspace_dnn_shape;
+        workspace_dnn_shape;
     GetMklShape(context, kIdxGradient, &in_grads_dnn_shape);
     GetMklShape(context, kIdxOrigInput, &in_image_dnn_shape);
     GetMklShape(context, kIdxOrigOutput, &out_image_dnn_shape);
     GetMklShape(context, kIdxWorkspace, &workspace_dnn_shape);
     if (in_grads_dnn_shape.IsMklTensor()) {
       OP_REQUIRES(context, in_grads_dnn_shape.GetDimension() == 4,
-                errors::InvalidArgument("Input gradient must be "
-                "4-dimensional"));
+                  errors::InvalidArgument("Input gradient must be "
+                                          "4-dimensional"));
     } else {
-      OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
-            errors::InvalidArgument("input gradient must be 4-dimensional"));
+      OP_REQUIRES(
+          context, input_gradient_tensor.dims() == 4,
+          errors::InvalidArgument("input gradient must be 4-dimensional"));
     }
 
     if (in_image_dnn_shape.IsMklTensor()) {
       OP_REQUIRES(context, in_image_dnn_shape.GetDimension() == 4,
-                errors::InvalidArgument("input images must be "
-                "4-dimensional"));
+                  errors::InvalidArgument("input images must be "
+                                          "4-dimensional"));
     } else {
       OP_REQUIRES(context, orig_input_tensor.dims() == 4,
                   errors::InvalidArgument("input images must be "
-                  "4-dimensional"));
+                                          "4-dimensional"));
     }
 
     if (out_image_dnn_shape.IsMklTensor()) {
       OP_REQUIRES(context, out_image_dnn_shape.GetDimension() == 4,
-                errors::InvalidArgument("Output image must be "
-                "4-dimensional"));
+                  errors::InvalidArgument("Output image must be "
+                                          "4-dimensional"));
     } else {
-      OP_REQUIRES(context, orig_output_tensor.dims() == 4,
-            errors::InvalidArgument("Output image must be 4-dimensional"));
+      OP_REQUIRES(
+          context, orig_output_tensor.dims() == 4,
+          errors::InvalidArgument("Output image must be 4-dimensional"));
     }
 
     if (workspace_enabled_) {
       if (workspace_dnn_shape.IsMklTensor()) {
-        OP_REQUIRES(context, workspace_dnn_shape.IsMklTensor() == false,
-              errors::InvalidArgument("Workspace should not be MKL Tensor."));
+        OP_REQUIRES(
+            context, workspace_dnn_shape.IsMklTensor() == false,
+            errors::InvalidArgument("Workspace should not be MKL Tensor."));
       } else {
         OP_REQUIRES(context, workspace_tensor.dims() == 1,
-              errors::InvalidArgument("Workspace must be 1-dimensional"));
+                    errors::InvalidArgument("Workspace must be 1-dimensional"));
       }
     }
   }
 
-// Input("input_grads: T")
-// Input("input_image: T")
-// Input("output_image: T")
-// Input("workspace: uint8")
-  const int kIdxGradient = 0,
-            kIdxOrigInput = 1,
-            kIdxOrigOutput = 2,
-            kIdxWorkspace = 3,
-            kIdxOutput = 0;
+  // Input("input_grads: T")
+  // Input("input_image: T")
+  // Input("output_image: T")
+  // Input("workspace: uint8")
+  const int kIdxGradient = 0, kIdxOrigInput = 1, kIdxOrigOutput = 2,
+            kIdxWorkspace = 3, kIdxOutput = 0;
 
   typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
   bool workspace_enabled_;
@@ -1393,7 +1357,6 @@ class MklLRNGradOp : public OpKernel {
                               .Label(mkl_op_registry::kMklOpLabel), \
                           MklLRNGradOp<T>);
 
-
 TF_CALL_float(REGISTER_MKL_LRN_CPU);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 82c5229bab..0de27ccd60 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -25,14 +25,14 @@ limitations under the License.
 #ifdef INTEL_MKL_DNN
 #include <algorithm>
 #include "mkldnn.hpp"
-using mkldnn::memory;
+using mkldnn::algorithm;
+using mkldnn::engine;
 using mkldnn::error;
-using mkldnn::pooling_forward;
-using mkldnn::pooling_backward;
+using mkldnn::memory;
 using mkldnn::padding_kind;
-using mkldnn::engine;
+using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
 using mkldnn::prop_kind;
-using mkldnn::algorithm;
 #endif
 
 namespace tensorflow {
@@ -397,18 +397,19 @@ class MklMaxPoolingGradOp : public OpKernel {
       if (workspace_enabled == false) {
         if (convert_input != nullptr) {
           if (input_in_mkl_format == false) {
-            CHECK_EQ(
-                dnnConversionExecute_F32(
-                    convert_input, const_cast<void*>(static_cast<const void*>(
-                                       tensor_in.flat<T>().data())),
-                    input_buf),
-                E_SUCCESS);
+            CHECK_EQ(dnnConversionExecute_F32(
+                         convert_input,
+                         const_cast<void*>(static_cast<const void*>(
+                             tensor_in.flat<T>().data())),
+                         input_buf),
+                     E_SUCCESS);
             CHECK_EQ(dnnDelete_F32(convert_input), E_SUCCESS);
             convert_input = nullptr;
           } else {
             input_shape.GetConvertedFlatData(
-                lt_input_prim, const_cast<void*>(static_cast<const void*>(
-                                   tensor_in.flat<T>().data())),
+                lt_input_prim,
+                const_cast<void*>(
+                    static_cast<const void*>(tensor_in.flat<T>().data())),
                 input_buf);
           }
           pooling_resfwd[dnnResourceSrc] = input_buf;
@@ -453,8 +454,9 @@ class MklMaxPoolingGradOp : public OpKernel {
           CHECK_EQ(dnnDelete_F32(convert_outbackprop), E_SUCCESS);
         } else {
           output_backprop_shape.GetConvertedFlatData(
-              lt_outbackprop_prim, const_cast<void*>(static_cast<const void*>(
-                                       out_backprop.flat<T>().data())),
+              lt_outbackprop_prim,
+              const_cast<void*>(
+                  static_cast<const void*>(out_backprop.flat<T>().data())),
               outbackprop_buf);
         }
         pooling_res[dnnResourceDiffDst] = outbackprop_buf;
@@ -499,7 +501,7 @@ template <typename Device, typename T>
 class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
  public:
   explicit MklMaxPoolingOp(OpKernelConstruction* context)
-            : MklPoolingForwardOpBase<T>(context) {
+      : MklPoolingForwardOpBase<T>(context) {
     // In Max Pooling, MKLDNN does not allow passing workspace as NULL.
     // So we set workspace_enabled_ to true.
     this->workspace_enabled_ = true;
@@ -508,8 +510,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
   void Compute(OpKernelContext* context) override {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
-      const Tensor& input_tensor = MklGetInput(context,
-                this->kInputTensorIndexInput);
+      const Tensor& input_tensor =
+          MklGetInput(context, this->kInputTensorIndexInput);
       MklDnnShape dnn_shape_input;
       GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
       this->SanityCheckInput(context, input_tensor, dnn_shape_input);
@@ -522,9 +524,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       // initialize variables for the pooling op
       MklPoolParameters pool_params;
       // Get the input tensor and initialize the pooling parameters
-      this->ConfigureInput(context, dnn_shape_input,
-                        input_tensor, &pool_params,
-                        &dnn_data_input);
+      this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params,
+                           &dnn_data_input);
       OP_REQUIRES_OK(context, context->status());
 
       // Declare output tensor
@@ -535,9 +536,10 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       // If input is in Mkl layout, then just get the memory format from it
       // directly, instead of using input data_format to MaxPool.
       if (dnn_shape_input.IsMklTensor()) {
-        dnn_data_output.SetUsrMem(output_dims_mkl_order,
-                                  static_cast<memory::format>(
-              dnn_data_input.GetUsrMemDesc().data.format));
+        dnn_data_output.SetUsrMem(
+            output_dims_mkl_order,
+            static_cast<memory::format>(
+                dnn_data_input.GetUsrMemDesc().data.format));
       } else {
         dnn_data_output.SetUsrMem(output_dims_mkl_order,
                                   this->data_format_mkldnn_);
@@ -546,24 +548,21 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       // describe the memory layout; let mkl-dnn choose the best for the op
       dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
 
-      auto pool_desc = pooling_forward::desc(prop_kind::forward,
-            algorithm::pooling_max,
-            dnn_data_input.GetUsrMemDesc(),
-            dnn_data_output.GetUsrMemDesc(),
-            memory::dims({  pool_params.row_stride,
-                            pool_params.col_stride}),
-            memory::dims({  pool_params.window_rows,
-                            pool_params.window_cols}),
-            memory::dims({  static_cast<int>(pool_params.pad_top),
-                            static_cast<int>(pool_params.pad_left)}),
-            memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                            static_cast<int>(pool_params.pad_right)}),
-            TFPaddingToMklDnnPadding(this->padding_));
-        auto pool_fwd_desc = pooling_forward::primitive_desc(pool_desc,
-            cpu_engine);
+      auto pool_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_max,
+          dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(),
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_desc =
+          pooling_forward::primitive_desc(pool_desc, cpu_engine);
 
       this->AllocateOutputTensor(context, pool_fwd_desc, output_dims_mkl_order,
-                            this->data_format_mkldnn_, &output_tensor);
+                                 this->data_format_mkldnn_, &output_tensor);
       OP_REQUIRES_OK(context, context->status());
       dnn_data_output.SetUsrMemDataHandle(output_tensor);
 
@@ -571,39 +570,38 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       OP_REQUIRES_OK(context, context->status());
 
       this->PrepareAndExecuteNet(pool_fwd_desc, &dnn_data_input,
-                        &dnn_data_output, &dnn_data_wksp);
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-                        ", message: " + string(e.message) +
-                        ", in file " + string(__FILE__) + ":" +
-                        std::to_string(__LINE__);
-        OP_REQUIRES_OK(context,
-                        errors::Aborted("Compute received an exception:",
-                                         error_msg));
+                                 &dnn_data_output, &dnn_data_wksp);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
     }
   }  // Compute
 
  private:
-    const int kOutputTensorIndexWorkspace = 1;
-
-    void AllocateWorkspaceTensor(OpKernelContext* context,
-                const pooling_forward::primitive_desc& pool_fwd_prim_desc,
-                MklDnnData<uint8>* dnn_data_wksp) {
-        CHECK_NOTNULL(dnn_data_wksp);
-        Tensor* workspace_tensor = nullptr;
-        memory::primitive_desc workspace_pd
-                    = pool_fwd_prim_desc.workspace_primitive_desc();
-        size_t workspace_bytes = workspace_pd.get_size();
-        MklDnnShape workspace_mkl_shape;
-        workspace_mkl_shape.SetMklTensor(false);
-        TensorShape workspace_tf_shape;
-        workspace_tf_shape.AddDim(workspace_bytes);
-        AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace,
-                                &workspace_tensor,
-                                workspace_tf_shape, workspace_mkl_shape);
-        CHECK_NOTNULL(workspace_tensor);
-        dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
-    }
+  const int kOutputTensorIndexWorkspace = 1;
+
+  void AllocateWorkspaceTensor(
+      OpKernelContext* context,
+      const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+      MklDnnData<uint8>* dnn_data_wksp) {
+    CHECK_NOTNULL(dnn_data_wksp);
+    Tensor* workspace_tensor = nullptr;
+    memory::primitive_desc workspace_pd =
+        pool_fwd_prim_desc.workspace_primitive_desc();
+    size_t workspace_bytes = workspace_pd.get_size();
+    MklDnnShape workspace_mkl_shape;
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(workspace_bytes);
+    AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace,
+                              &workspace_tensor, workspace_tf_shape,
+                              workspace_mkl_shape);
+    CHECK_NOTNULL(workspace_tensor);
+    dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
+  }
 };
 
 // The operation to compute MaxPool gradients.
@@ -616,218 +614,183 @@ template <class Device, class T>
 class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
  public:
   explicit MklMaxPoolingGradOp(OpKernelConstruction* context)
-      : MklPoolingBackwardOpBase<T>(context) {
-  }
+      : MklPoolingBackwardOpBase<T>(context) {}
 
   void Compute(OpKernelContext* context) override {
     try {
-        auto cpu_engine = engine(engine::cpu, 0);
-        const Tensor& orig_input_tensor = MklGetInput(context,
-            kInputTensorIndexOrigInput);
-        const Tensor& orig_output_tensor = MklGetInput(context,
-            kInputTensorIndexOrigOutput);
-        const Tensor& grad_tensor = MklGetInput(context,
-            kInputTensorIndexGradient);
-        const Tensor& workspace_tensor = MklGetInput(context,
-            kInputTensorIndexWorkspace);
-        MklDnnShape orig_input_mkl_shape,
-                    orig_output_mkl_shape,
-                    grad_mkl_shape,
-                    workspace_mkl_shape;
-        GetMklShape(context, kInputTensorIndexOrigInput,
-            &orig_input_mkl_shape);
-        GetMklShape(context, kInputTensorIndexOrigOutput,
-            &orig_output_mkl_shape);
-        GetMklShape(context, kInputTensorIndexGradient,
-            &grad_mkl_shape);
-        GetMklShape(context, kInputTensorIndexWorkspace,
-            &workspace_mkl_shape);
-
-        SanityCheckInputs(context,
-                            orig_input_tensor, orig_output_tensor,
-                            grad_tensor, workspace_tensor,
-                            orig_input_mkl_shape, orig_output_mkl_shape,
-                            grad_mkl_shape, workspace_mkl_shape);
-        if (!context->status().ok()) return;
-
-        MklDnnData<T> grad_dnn_data(&cpu_engine);
-        MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
-        MklDnnData<T> output_dnn_data(&cpu_engine);
-        Tensor* output_tensor = nullptr;
-        MklPoolParameters pool_params;
-        TensorShape orig_input_shape;
-        memory::dims output_dims_mkl_order, orig_input_dims_mkl_order;
-        memory::desc original_input_md = ConfigureOriginalInput(context,
-                                orig_input_tensor,
-                                orig_input_mkl_shape,
-                                &orig_input_dims_mkl_order,
-                                &pool_params,
-                                &orig_input_shape);
-
-        memory::desc original_output_md = this->ConfigureOriginalOutput(
-                                pool_params,
-                                orig_output_mkl_shape,
-                                output_dims_mkl_order);
-
-        memory::desc target_diff_dst_md =  this->ConfigureInputGradient(
-                                        grad_mkl_shape,
-                                        grad_tensor,
-                                        &grad_dnn_data,
-                                        original_output_md);
-
-        output_dnn_data.SetUsrMem(original_input_md);
-
-        // Create the forward pooling primitive descriptor so we can
-        // pass it as a hint to the backward pooling primitive descriptor
-        auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
-                algorithm::pooling_max,
-                original_input_md,
-                original_output_md,
-                memory::dims({  pool_params.row_stride,
-                                pool_params.col_stride}),
-                memory::dims({  pool_params.window_rows,
-                                pool_params.window_cols}),
-                memory::dims({  static_cast<int>(pool_params.pad_top),
-                                static_cast<int>(pool_params.pad_left)}),
-                memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                                static_cast<int>(pool_params.pad_right)}),
-                TFPaddingToMklDnnPadding(this->padding_));
-        auto pool_fwd_prim_desc
-                = pooling_forward::primitive_desc(pool_fwd_desc,
-                                                    cpu_engine);
-
-        auto pool_bkwd_desc = pooling_backward::desc(
-                algorithm::pooling_max,
-                output_dnn_data.GetUsrMemDesc(),
-                target_diff_dst_md,
-                memory::dims({  pool_params.row_stride,
-                                pool_params.col_stride}),
-                memory::dims({  pool_params.window_rows,
-                                pool_params.window_cols}),
-                memory::dims({  static_cast<int>(pool_params.pad_top),
-                                static_cast<int>(pool_params.pad_left)}),
-                memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                                static_cast<int>(pool_params.pad_right)}),
-                TFPaddingToMklDnnPadding(this->padding_));
-        auto pool_bkwd_prim_desc
-            = pooling_backward::primitive_desc(pool_bkwd_desc,
-                                                cpu_engine,
-                                                pool_fwd_prim_desc);
-
-        this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
-            orig_input_dims_mkl_order,
-            this->data_format_mkldnn_,
-            &output_tensor);
-        output_dnn_data.SetUsrMemDataHandle(output_tensor);
-
-        ConfigureWorkspace(workspace_tensor,
-                pool_fwd_prim_desc.workspace_primitive_desc(),
-                &workspace_dnn_data);
-        this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
-                            &grad_dnn_data,
-                            &output_dnn_data,
-                            memory::primitive_desc(
-                                target_diff_dst_md,
-                                cpu_engine),
-                            &workspace_dnn_data);
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-                        ", message: " + string(e.message) +
-                        ", in file " + string(__FILE__) + ":" +
-                        std::to_string(__LINE__);
-        OP_REQUIRES_OK(context,
-                        errors::Aborted("Compute received an exception:",
-                                         error_msg));
+      auto cpu_engine = engine(engine::cpu, 0);
+      const Tensor& orig_input_tensor =
+          MklGetInput(context, kInputTensorIndexOrigInput);
+      const Tensor& orig_output_tensor =
+          MklGetInput(context, kInputTensorIndexOrigOutput);
+      const Tensor& grad_tensor =
+          MklGetInput(context, kInputTensorIndexGradient);
+      const Tensor& workspace_tensor =
+          MklGetInput(context, kInputTensorIndexWorkspace);
+      MklDnnShape orig_input_mkl_shape, orig_output_mkl_shape, grad_mkl_shape,
+          workspace_mkl_shape;
+      GetMklShape(context, kInputTensorIndexOrigInput, &orig_input_mkl_shape);
+      GetMklShape(context, kInputTensorIndexOrigOutput, &orig_output_mkl_shape);
+      GetMklShape(context, kInputTensorIndexGradient, &grad_mkl_shape);
+      GetMklShape(context, kInputTensorIndexWorkspace, &workspace_mkl_shape);
+
+      SanityCheckInputs(context, orig_input_tensor, orig_output_tensor,
+                        grad_tensor, workspace_tensor, orig_input_mkl_shape,
+                        orig_output_mkl_shape, grad_mkl_shape,
+                        workspace_mkl_shape);
+      if (!context->status().ok()) return;
+
+      MklDnnData<T> grad_dnn_data(&cpu_engine);
+      MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
+      MklDnnData<T> output_dnn_data(&cpu_engine);
+      Tensor* output_tensor = nullptr;
+      MklPoolParameters pool_params;
+      TensorShape orig_input_shape;
+      memory::dims output_dims_mkl_order, orig_input_dims_mkl_order;
+      memory::desc original_input_md = ConfigureOriginalInput(
+          context, orig_input_tensor, orig_input_mkl_shape,
+          &orig_input_dims_mkl_order, &pool_params, &orig_input_shape);
+
+      memory::desc original_output_md = this->ConfigureOriginalOutput(
+          pool_params, orig_output_mkl_shape, output_dims_mkl_order);
+
+      memory::desc target_diff_dst_md = this->ConfigureInputGradient(
+          grad_mkl_shape, grad_tensor, &grad_dnn_data, original_output_md);
+
+      output_dnn_data.SetUsrMem(original_input_md);
+
+      // Create the forward pooling primitive descriptor so we can
+      // pass it as a hint to the backward pooling primitive descriptor
+      auto pool_fwd_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_max, original_input_md,
+          original_output_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_prim_desc =
+          pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine);
+
+      auto pool_bkwd_desc = pooling_backward::desc(
+          algorithm::pooling_max, output_dnn_data.GetUsrMemDesc(),
+          target_diff_dst_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_bkwd_prim_desc = pooling_backward::primitive_desc(
+          pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc);
+
+      this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
+                                 orig_input_dims_mkl_order,
+                                 this->data_format_mkldnn_, &output_tensor);
+      output_dnn_data.SetUsrMemDataHandle(output_tensor);
+
+      ConfigureWorkspace(workspace_tensor,
+                         pool_fwd_prim_desc.workspace_primitive_desc(),
+                         &workspace_dnn_data);
+      this->PrepareAndExecuteNet(
+          pool_bkwd_prim_desc, &grad_dnn_data, &output_dnn_data,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine),
+          &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
     }
   }  // Compute
 
  private:
-    // .Input("orig_input: T")
-    // .Input("orig_output: T")
-    // .Input("grad: T")
-    // .Input("workspace: T")
-    const int kInputTensorIndexOrigInput = 0;
-    const int kInputTensorIndexOrigOutput = 1;
-    const int kInputTensorIndexGradient = 2;
-    const int kInputTensorIndexWorkspace = 3;
-    //  Output("output: T") in Base Class
-
-    memory::desc ConfigureOriginalInput(OpKernelContext* context,
-                                const Tensor& tensor_original_input,
-                                const MklDnnShape& original_input_mkl_shape,
-                                memory::dims* original_input_dims_mkl_order,
-                                MklPoolParameters* pool_params,
-                                TensorShape* input_tensor_shape) {
-        *input_tensor_shape = tensor_original_input.shape();
-        return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
-                                        context,
-                                        tensor_original_input,
-                                        original_input_mkl_shape,
-                                        original_input_dims_mkl_order,
-                                        pool_params,
-                                        *input_tensor_shape);
-    }
+  // .Input("orig_input: T")
+  // .Input("orig_output: T")
+  // .Input("grad: T")
+  // .Input("workspace: T")
+  const int kInputTensorIndexOrigInput = 0;
+  const int kInputTensorIndexOrigOutput = 1;
+  const int kInputTensorIndexGradient = 2;
+  const int kInputTensorIndexWorkspace = 3;
+  //  Output("output: T") in Base Class
+
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_mkl_order,
+      MklPoolParameters* pool_params, TensorShape* input_tensor_shape) {
+    *input_tensor_shape = tensor_original_input.shape();
+    return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
+        context, tensor_original_input, original_input_mkl_shape,
+        original_input_dims_mkl_order, pool_params, *input_tensor_shape);
+  }
 
-    void ConfigureWorkspace(const Tensor& workspace_tensor,
-                        memory::primitive_desc workspace_pd,
-                        MklDnnData<uint8> *workspace_dnn_data) {
-        CHECK_NOTNULL(workspace_dnn_data);
+  void ConfigureWorkspace(const Tensor& workspace_tensor,
+                          memory::primitive_desc workspace_pd,
+                          MklDnnData<uint8>* workspace_dnn_data) {
+    CHECK_NOTNULL(workspace_dnn_data);
 
-        workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
-    }
+    workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
+  }
 
-    void SanityCheckInputs(OpKernelContext* context,
-                            const Tensor& orig_input_tensor,
-                            const Tensor& orig_output_tensor,
-                            const Tensor& grad_tensor,
-                            const Tensor& workspace_tensor,
-                            const MklDnnShape& orig_input_mkl_shape,
-                            const MklDnnShape& orig_output_mkl_shape,
-                            const MklDnnShape& grad_mkl_shape,
-                            const MklDnnShape& workspace_mkl_shape) {
-        if (!orig_input_mkl_shape.IsMklTensor()) {
-            OP_REQUIRES(context, orig_input_tensor.dims() == 4,
-                errors::InvalidArgument("Original input shape must be "
-                "4-dimensional"));
-        } else {
-            OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
-                    errors::InvalidArgument("Original input shape must be "
-                    "4-dimensional"));
-        }
-        if (!orig_output_mkl_shape.IsMklTensor()) {
-            OP_REQUIRES(context, orig_output_tensor.dims() == 4,
-                errors::InvalidArgument("Original output must be "
-                        "4-dimensional"));
-        } else {
-            OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
-                    errors::InvalidArgument("Original output must be "
-                    "4-dimensional"));
-        }
-        if (!grad_mkl_shape.IsMklTensor()) {
-            OP_REQUIRES(context, grad_tensor.dims() == 4,
-                errors::InvalidArgument("Gradient must be 4-dimensional"));
-        } else {
-            OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
-                    errors::InvalidArgument("Gradient must be "
-                    "4-dimensional"));
-        }
-        if (this->workspace_enabled_) {
-            // The workspace should not be an MKL tensor
-            OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
-                    errors::InvalidArgument("Workspace tensor should not"
-                                            " be an MKL Tensor."));
-            // It should only have one dimension
-            OP_REQUIRES(context, workspace_tensor.dims() == 1,
-                    errors::InvalidArgument("Workspace tensor must be "
-                                "1-dimensional"));
-        } else {
-            OP_REQUIRES(context, this->workspace_enabled_,
-                    errors::Unimplemented("MKL-DNN Max Pooling does not "
+  void SanityCheckInputs(OpKernelContext* context,
+                         const Tensor& orig_input_tensor,
+                         const Tensor& orig_output_tensor,
+                         const Tensor& grad_tensor,
+                         const Tensor& workspace_tensor,
+                         const MklDnnShape& orig_input_mkl_shape,
+                         const MklDnnShape& orig_output_mkl_shape,
+                         const MklDnnShape& grad_mkl_shape,
+                         const MklDnnShape& workspace_mkl_shape) {
+    if (!orig_input_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, orig_input_tensor.dims() == 4,
+                  errors::InvalidArgument("Original input shape must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Original input shape must be "
+                                          "4-dimensional"));
+    }
+    if (!orig_output_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, orig_output_tensor.dims() == 4,
+                  errors::InvalidArgument("Original output must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Original output must be "
+                                          "4-dimensional"));
+    }
+    if (!grad_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, grad_tensor.dims() == 4,
+                  errors::InvalidArgument("Gradient must be 4-dimensional"));
+    } else {
+      OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Gradient must be "
+                                          "4-dimensional"));
+    }
+    if (this->workspace_enabled_) {
+      // The workspace should not be an MKL tensor
+      OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
+                  errors::InvalidArgument("Workspace tensor should not"
+                                          " be an MKL Tensor."));
+      // It should only have one dimension
+      OP_REQUIRES(context, workspace_tensor.dims() == 1,
+                  errors::InvalidArgument("Workspace tensor must be "
+                                          "1-dimensional"));
+    } else {
+      OP_REQUIRES(
+          context, this->workspace_enabled_,
+          errors::Unimplemented("MKL-DNN Max Pooling does not "
                                 "yet support the use case "
                                 "where MaxPoolGrad is called without first"
                                 " calling MaxPool."));
-        }
     }
+  }
 };  // MklMaxPoolingGradOp
 
 #endif  // INTEL_MKL_DNN
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index f7cadffd39..ef8597b057 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #ifdef INTEL_MKL
 
-#include <vector>
-#include <limits>
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
+#include <limits>
+#include <vector>
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/kernels/bounds_check.h"
@@ -111,17 +111,17 @@ void MklPoolParameters::Init(OpKernelContext* context,
     // TF can work with int64, but mkldnn only supports int32
     // Fail if the height or width are greater than MAX_INT
 
-    OP_REQUIRES(context, FastBoundsCheck(out_height,
-                                         std::numeric_limits<int>::max()),
+    OP_REQUIRES(context,
+                FastBoundsCheck(out_height, std::numeric_limits<int>::max()),
                 errors::InvalidArgument("output height is too large"));
 
-    OP_REQUIRES(context, FastBoundsCheck(out_width,
-                                         std::numeric_limits<int>::max()),
+    OP_REQUIRES(context,
+                FastBoundsCheck(out_width, std::numeric_limits<int>::max()),
                 errors::InvalidArgument("output width is too large"));
 
 #endif
     out_depth = depth;  // output will have the same depth as the input
-  } else {  // we are pooling in the depth dimension
+  } else {              // we are pooling in the depth dimension
     // Our current version of depthwise max pooling does not support
     // any padding, and expects the depth_window to equal the depth
     // stride (no overlapping).
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index b974b2c59a..880e45ab1e 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -17,16 +17,16 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_
 
 #ifdef INTEL_MKL
-#include <vector>
 #include <string>
+#include <vector>
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 using mkldnn::memory;
-using mkldnn::pooling_forward;
 using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
 using mkldnn::stream;
 #endif
 
@@ -61,13 +61,25 @@ struct MklPoolParameters {
 
   TensorFormat data_format;
   MklPoolParameters()
-    : depth(0)
-    , tensor_in_cols(0), tensor_in_rows(0), tensor_in_batch(0)
-    , window_rows(0), window_cols(0), depth_window(0)
-    , row_stride(0), col_stride(0), depth_stride(0)
-    , out_height(0), out_width(0), out_depth(0)
-    , pad_left(0), pad_right(0), pad_top(0), pad_bottom(0), pad_depth(0)
-    , data_format(TensorFormat::FORMAT_NCHW) {}
+      : depth(0),
+        tensor_in_cols(0),
+        tensor_in_rows(0),
+        tensor_in_batch(0),
+        window_rows(0),
+        window_cols(0),
+        depth_window(0),
+        row_stride(0),
+        col_stride(0),
+        depth_stride(0),
+        out_height(0),
+        out_width(0),
+        out_depth(0),
+        pad_left(0),
+        pad_right(0),
+        pad_top(0),
+        pad_bottom(0),
+        pad_depth(0),
+        data_format(TensorFormat::FORMAT_NCHW) {}
 
   // Updates context->status if there is an invalid input.
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
@@ -96,33 +108,31 @@ template <class T>
 class MklPoolingOpBase : public OpKernel {
  public:
   explicit MklPoolingOpBase(OpKernelConstruction* context)
-            : OpKernel(context)
-            , workspace_enabled_(false) {
-      string data_format;
-      OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-      OP_REQUIRES(context,
-            FormatFromString(data_format, &this->data_format_tf_),
-            errors::InvalidArgument("Invalid data format"));
-      this->data_format_mkldnn_
-                = TFDataFormatToMklDnnDataFormat(this->data_format_tf_);
-      OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
-      OP_REQUIRES(context, this->ksize_.size() == 4,
-                  errors::InvalidArgument("Sliding window ksize field must "
-                                          "specify 4 dimensions"));
-      OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_));
-      OP_REQUIRES(context, this->stride_.size() == 4,
-                  errors::InvalidArgument("Sliding window strides field must "
-                                          "specify 4 dimensions"));
-      OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_));
-      OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1,
-                  errors::Unimplemented("Pooling is not yet supported on the "
-                                        "batch dimension."));
-
-      // We may not get this attribute for this node if it does not go through
-      // graph rewrite pass. So we do not check for error while retrieving this
-      // attribute value.
-      context->GetAttr("workspace_enabled", &this->workspace_enabled_);
-    }
+      : OpKernel(context), workspace_enabled_(false) {
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &this->data_format_tf_),
+                errors::InvalidArgument("Invalid data format"));
+    this->data_format_mkldnn_ =
+        TFDataFormatToMklDnnDataFormat(this->data_format_tf_);
+    OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
+    OP_REQUIRES(context, this->ksize_.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_));
+    OP_REQUIRES(context, this->stride_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_));
+    OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1,
+                errors::Unimplemented("Pooling is not yet supported on the "
+                                      "batch dimension."));
+
+    // We may not get this attribute for this node if it does not go through
+    // graph rewrite pass. So we do not check for error while retrieving this
+    // attribute value.
+    context->GetAttr("workspace_enabled", &this->workspace_enabled_);
+  }
   void Compute(OpKernelContext* context) override = 0;
 
  protected:
@@ -132,24 +142,24 @@ class MklPoolingOpBase : public OpKernel {
   // output height and output width to have already been int32
   // bounds-checked
   void GetOutputDims(const MklPoolParameters& mkl_pool_params,
-                    memory::dims* output_dims_mkl_order) {
+                     memory::dims* output_dims_mkl_order) {
     // MKL-DNN always needs output in NCHW format.
-    *output_dims_mkl_order = { mkl_pool_params.tensor_in_batch,
+    *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch,
                               mkl_pool_params.out_depth,
                               static_cast<int>(mkl_pool_params.out_height),
                               static_cast<int>(mkl_pool_params.out_width)};
   }
 
   void InitMklPoolParameters(OpKernelContext* context,
-                      MklPoolParameters* pool_params,
-                      const MklDnnShape& original_input_mkl_shape,
-                      const TensorShape& input_tensor_shape) {
+                             MklPoolParameters* pool_params,
+                             const MklDnnShape& original_input_mkl_shape,
+                             const TensorShape& input_tensor_shape) {
     if (!original_input_mkl_shape.IsMklTensor()) {
       pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
-          this->data_format_tf_, input_tensor_shape);
+                        this->data_format_tf_, input_tensor_shape);
     } else {
       pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
-          this->data_format_tf_, &original_input_mkl_shape);
+                        this->data_format_tf_, &original_input_mkl_shape);
     }
   }
 
@@ -159,13 +169,12 @@ class MklPoolingOpBase : public OpKernel {
   size_t GetNumTElements(const memory::primitive_desc& pd) {
     size_t num_bytes = pd.get_size();
     size_t ret_val = num_bytes / sizeof(T);
-    if ( num_bytes % sizeof(T) != 0 ) {
-        ret_val++;
+    if (num_bytes % sizeof(T) != 0) {
+      ret_val++;
     }
     return ret_val;
   }
 
-
   std::vector<int32> ksize_;
   std::vector<int32> stride_;
   Padding padding_;
@@ -183,30 +192,29 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
 
  protected:
   void ConfigureInput(OpKernelContext* context,
-                    const MklDnnShape& input_mkl_shape,
-                    const Tensor& input_tensor,
-                    MklPoolParameters* pool_params,
-                    MklDnnData<T>* dnn_data_input) {
+                      const MklDnnShape& input_mkl_shape,
+                      const Tensor& input_tensor,
+                      MklPoolParameters* pool_params,
+                      MklDnnData<T>* dnn_data_input) {
     CHECK_NOTNULL(pool_params);
     CHECK_NOTNULL(dnn_data_input);
     TensorShape input_tensor_shape = input_tensor.shape();
-    memory::desc input_md = input_mkl_shape.IsMklTensor()
-                        ? input_mkl_shape.GetMklLayout()
-                        : memory::desc(
-                              TFShapeToMklDnnDimsInNCHW(
-                                  input_tensor_shape, this->data_format_tf_),
-                              MklDnnType<T>(),
-                              this->data_format_mkldnn_);
+    memory::desc input_md =
+        input_mkl_shape.IsMklTensor()
+            ? input_mkl_shape.GetMklLayout()
+            : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
+                                                     this->data_format_tf_),
+                           MklDnnType<T>(), this->data_format_mkldnn_);
     dnn_data_input->SetUsrMem(input_md, &input_tensor);
-    this->InitMklPoolParameters(context, pool_params,
-                      input_mkl_shape, input_tensor_shape);
+    this->InitMklPoolParameters(context, pool_params, input_mkl_shape,
+                                input_tensor_shape);
   }
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const pooling_forward::primitive_desc& pool_fwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
     memory::primitive_desc dst_pd = pool_fwd_prim_desc.dst_primitive_desc();
 
@@ -215,50 +223,42 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
     TensorShape output_tf_shape;
 
     // only allocate enough space for the elements we need.
     output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
-    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
     CHECK_NOTNULL(*output_tensor);
   }
 
   void PrepareAndExecuteNet(
-                  const pooling_forward::primitive_desc& pool_fwd_desc,
-                  const MklDnnData<T>* src,
-                  MklDnnData<T>* dst,
-                  MklDnnData<uint8>* wksp = nullptr) {
+      const pooling_forward::primitive_desc& pool_fwd_desc,
+      const MklDnnData<T>* src, MklDnnData<T>* dst,
+      MklDnnData<uint8>* wksp = nullptr) {
     std::vector<primitive> net;
 
     // Create pooling primitive and add it to net
     if (wksp != nullptr) {
-        net.push_back(pooling_forward(pool_fwd_desc,
-                        src->GetOpMem(),
-                        dst->GetOpMem(),
-                        wksp->GetOpMem()));
+      net.push_back(pooling_forward(pool_fwd_desc, src->GetOpMem(),
+                                    dst->GetOpMem(), wksp->GetOpMem()));
     } else {
-        net.push_back(pooling_forward(pool_fwd_desc,
-            src->GetOpMem(),
-            dst->GetOpMem()));
+      net.push_back(
+          pooling_forward(pool_fwd_desc, src->GetOpMem(), dst->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
 
-
-  void SanityCheckInput(OpKernelContext* context,
-                  const Tensor& input_tensor,
-                  const MklDnnShape& input_mkl_shape) {
+  void SanityCheckInput(OpKernelContext* context, const Tensor& input_tensor,
+                        const MklDnnShape& input_mkl_shape) {
     if (!input_mkl_shape.IsMklTensor()) {
       OP_REQUIRES(context, input_tensor.dims() == 4,
-          errors::InvalidArgument("Input must be 4-dimensional"));
+                  errors::InvalidArgument("Input must be 4-dimensional"));
     } else {
-        OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4,
-                errors::InvalidArgument("Input shape must be "
-                "4-dimensional"));
+      OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Input shape must be "
+                                          "4-dimensional"));
     }
   }
   // .Input("value: T")
@@ -267,66 +267,58 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
   const int kOutputTensorIndexOutput = 0;
 };  // MklPoolingForwardBaseOp
 
-
 template <class T>
 class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
  public:
   explicit MklPoolingBackwardOpBase<T>(OpKernelConstruction* context)
-          : MklPoolingOpBase<T>(context) { }
+      : MklPoolingOpBase<T>(context) {}
   void Compute(OpKernelContext* context) override = 0;
 
  protected:
   const int kOutputTensorIndexOutput = 0;
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const pooling_backward::primitive_desc& pool_bkwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const pooling_backward::primitive_desc& pool_bkwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
-    memory::primitive_desc dst_pd
-                = pool_bkwd_prim_desc.diff_src_primitive_desc();
+    memory::primitive_desc dst_pd =
+        pool_bkwd_prim_desc.diff_src_primitive_desc();
     MklDnnShape output_mkl_shape;
     output_mkl_shape.SetMklTensor(true);
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
 
     TensorShape output_tf_shape;
     output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
-    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
     CHECK_NOTNULL(*output_tensor);
   }
 
   void PrepareAndExecuteNet(
-    const pooling_backward::primitive_desc& pool_bkwd_desc,
-    MklDnnData<T>* input_gradient_diff_dst,
-    MklDnnData<T>* output_diff_src,
-    const memory::primitive_desc& target_diff_dst_pd,
-    const MklDnnData<uint8>* workspace = nullptr) {
-
+      const pooling_backward::primitive_desc& pool_bkwd_desc,
+      MklDnnData<T>* input_gradient_diff_dst, MklDnnData<T>* output_diff_src,
+      const memory::primitive_desc& target_diff_dst_pd,
+      const MklDnnData<uint8>* workspace = nullptr) {
     std::vector<primitive> net;
 
     // If the input gradient isn't in the same format as the output
     // reorder it to the same format as the output
-    input_gradient_diff_dst->CheckReorderToOpMem(
-            target_diff_dst_pd,
-            &net);
+    input_gradient_diff_dst->CheckReorderToOpMem(target_diff_dst_pd, &net);
 
     // Create pooling primitive and add it to net
     if (nullptr == workspace) {
       net.push_back(pooling_backward(pool_bkwd_desc,
-                              input_gradient_diff_dst->GetOpMem(),
-                              output_diff_src->GetOpMem()));
+                                     input_gradient_diff_dst->GetOpMem(),
+                                     output_diff_src->GetOpMem()));
     } else {
-      net.push_back(pooling_backward(pool_bkwd_desc,
-                                  input_gradient_diff_dst->GetOpMem(),
-                                  workspace->GetOpMem(),
-                                  output_diff_src->GetOpMem()));
+      net.push_back(
+          pooling_backward(pool_bkwd_desc, input_gradient_diff_dst->GetOpMem(),
+                           workspace->GetOpMem(), output_diff_src->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
@@ -334,77 +326,73 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
   // Max Pooling and Avg Pooling have slightly different implementations
   // Takes the Tensor containing original input data and the original
   // mkl Dnn Shape and populates other data
-  memory::desc ConfigureOriginalInput(OpKernelContext* context,
-                              const Tensor& tensor_original_input_shape,
-                              const MklDnnShape& original_input_mkl_shape,
-                              memory::dims* original_input_dims_nchw,
-                              MklPoolParameters* pool_params,
-                              const TensorShape& input_tensor_shape) {
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input_shape,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_nchw, MklPoolParameters* pool_params,
+      const TensorShape& input_tensor_shape) {
     CHECK_NOTNULL(original_input_dims_nchw);
     CHECK_NOTNULL(pool_params);
-    this->InitMklPoolParameters(context, pool_params,
-                          original_input_mkl_shape,
-                          input_tensor_shape);
-
-    *original_input_dims_nchw
-          = original_input_mkl_shape.IsMklTensor()
-          ? original_input_mkl_shape.GetSizesAsMklDnnDims()
-          : TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
-        this->data_format_tf_);
-
-    return  original_input_mkl_shape.IsMklTensor()
-      ? original_input_mkl_shape.GetMklLayout()
-      : memory::desc(*original_input_dims_nchw,
-                      MklDnnType<T>(),
-                      this->data_format_mkldnn_);
+    this->InitMklPoolParameters(context, pool_params, original_input_mkl_shape,
+                                input_tensor_shape);
+
+    *original_input_dims_nchw =
+        original_input_mkl_shape.IsMklTensor()
+            ? original_input_mkl_shape.GetSizesAsMklDnnDims()
+            : TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
+                                        this->data_format_tf_);
+
+    return original_input_mkl_shape.IsMklTensor()
+               ? original_input_mkl_shape.GetMklLayout()
+               : memory::desc(*original_input_dims_nchw, MklDnnType<T>(),
+                              this->data_format_mkldnn_);
   }
 
-  memory::desc ConfigureOriginalOutput(const MklPoolParameters& pool_params,
-                                const MklDnnShape& original_output_mkl_shape,
-                                      memory::dims output_dims_mkl_order) {
+  memory::desc ConfigureOriginalOutput(
+      const MklPoolParameters& pool_params,
+      const MklDnnShape& original_output_mkl_shape,
+      memory::dims output_dims_mkl_order) {
     this->GetOutputDims(pool_params, &output_dims_mkl_order);
 
     return original_output_mkl_shape.IsMklTensor()
-            ? original_output_mkl_shape.GetMklLayout()
-            : memory::desc(output_dims_mkl_order,
-                         MklDnnType<T>(),
-                         this->data_format_mkldnn_);
+               ? original_output_mkl_shape.GetMklLayout()
+               : memory::desc(output_dims_mkl_order, MklDnnType<T>(),
+                              this->data_format_mkldnn_);
   }
 
   memory::desc ConfigureInputGradient(
-        const MklDnnShape& input_gradient_mkl_shape,
-        const Tensor& input_gradient_tensor,
-        MklDnnData<T>* input_gradient_dnn_data,
-        const memory::desc& original_output_md) {
+      const MklDnnShape& input_gradient_mkl_shape,
+      const Tensor& input_gradient_tensor,
+      MklDnnData<T>* input_gradient_dnn_data,
+      const memory::desc& original_output_md) {
     // Configure the gradient as is
-    memory::desc original_input_grad_md
-          = input_gradient_mkl_shape.IsMklTensor()
-          ? input_gradient_mkl_shape.GetMklLayout()
-          : memory::desc(TFShapeToMklDnnDimsInNCHW(
-                    input_gradient_tensor.shape(),
-                    this->data_format_tf_),
-                    MklDnnType<T>(), this->data_format_mkldnn_);
+    memory::desc original_input_grad_md =
+        input_gradient_mkl_shape.IsMklTensor()
+            ? input_gradient_mkl_shape.GetMklLayout()
+            : memory::desc(
+                  TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
+                                            this->data_format_tf_),
+                  MklDnnType<T>(), this->data_format_mkldnn_);
 
     input_gradient_dnn_data->SetUsrMem(original_input_grad_md,
-                &input_gradient_tensor);
+                                       &input_gradient_tensor);
 
     // Check to see if input grad diff dst is in the right format
     // Create a new memory descriptor with the same shape as the
     // original, but the format of the other tensors.
     memory::format original_output_format =
-            static_cast<memory::format>(original_output_md.data.format);
-    bool grad_reorder_needed = input_gradient_dnn_data->IsReorderNeeded(
-                                    original_output_format);
-    memory::dims diff_dst_dims = input_gradient_mkl_shape.IsMklTensor()
-        ? input_gradient_mkl_shape.GetSizesAsMklDnnDims()
-        : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
-                    this->data_format_tf_);
-    memory::desc target_diff_dst_md = memory::desc(diff_dst_dims,
-        MklDnnType<T>(), original_output_format);
-
-    return grad_reorder_needed
-            ? target_diff_dst_md
-            : original_input_grad_md;
+        static_cast<memory::format>(original_output_md.data.format);
+    bool grad_reorder_needed =
+        input_gradient_dnn_data->IsReorderNeeded(original_output_format);
+    memory::dims diff_dst_dims =
+        input_gradient_mkl_shape.IsMklTensor()
+            ? input_gradient_mkl_shape.GetSizesAsMklDnnDims()
+            : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
+                                        this->data_format_tf_);
+    memory::desc target_diff_dst_md =
+        memory::desc(diff_dst_dims, MklDnnType<T>(), original_output_format);
+
+    return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md;
   }
 };
 #endif  // INTEL_MKL_DNN
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index dc899d8c7e..873aca30ca 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -16,29 +16,29 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
-#include "tensorflow/core/platform/default/logging.h"
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/platform/default/logging.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
-using mkldnn::prop_kind;
 using mkldnn::algorithm;
-using mkldnn::relu_forward;
-using mkldnn::relu_backward;
-using mkldnn::eltwise_relu;
 using mkldnn::eltwise_elu;
+using mkldnn::eltwise_relu;
 using mkldnn::eltwise_tanh;
+using mkldnn::prop_kind;
+using mkldnn::relu_backward;
+using mkldnn::relu_forward;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
@@ -180,7 +180,6 @@ class MklReluOp : public OpKernel {
   } MklReluOpContext;
 };
 
-
 template <typename Device, typename T>
 class MklReluGradOp : public OpKernel {
  public:
@@ -214,10 +213,11 @@ class MklReluGradOp : public OpKernel {
       if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
                        &mkl_buffer_convert);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input,
-                   lt_grad), E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad),
+                 E_SUCCESS);
         CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
-                                          mkl_buffer_convert), E_SUCCESS);
+                                          mkl_buffer_convert),
+                 E_SUCCESS);
         relu_res[dnnResourceSrc] = mkl_buffer_convert;
         dnnDelete_F32(cv_input_to_grad);
       } else {
@@ -325,7 +325,8 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   float negative_slope = 0.0;
   CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
                                      mkl_context.lt_grad, mkl_context.lt_grad,
-                                     negative_slope), E_SUCCESS);
+                                     negative_slope),
+           E_SUCCESS);
   Tensor mkl_tmp_input_buf_tensor;
   mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
 
@@ -348,7 +349,8 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
     }
 
     tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                    mkl_context.output_shape.GetMklLayout())) / sizeof(T));
+                        mkl_context.output_shape.GetMklLayout())) /
+                    sizeof(T));
     AllocateOutputSetMklShape(context, 0, &output, tf_shape,
                               mkl_context.output_shape);
   } else {
@@ -361,13 +363,11 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   mkl_context.relu_res[dnnResourceDiffSrc] =
       static_cast<void*>(output->flat<T>().data());
 
-  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd,
-                          mkl_context.relu_res),
-                          E_SUCCESS);
+  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
+           E_SUCCESS);
   mkl_context.MklCleanup();
 }
 
-
 #else  // INTEL_MKL_DNN
 
 template <typename Device, typename T, algorithm alg_kind>
@@ -375,8 +375,7 @@ class MklReluOpBase : public OpKernel {
  public:
   ~MklReluOpBase() {}
 
-  explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {
-  }
+  explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) = 0;
 
@@ -413,12 +412,12 @@ class MklReluOpBase : public OpKernel {
 
       T alpha = 0, beta = 0;
       std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
-      auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
+      auto relu_fwd_desc = relu_forward::desc(
+          prop_kind::forward_training,
           // Operator memory descriptor is same as user memory descriptor.
-                                              alg_kind, src.GetUsrMemDesc(),
-                                              alpha, beta);
-      relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
-                                                         cpu_engine));
+          alg_kind, src.GetUsrMemDesc(), alpha, beta);
+      relu_fwd_pd.reset(
+          new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine));
 
       // allocate dst tensor
       MklDnnShape dnn_shape_dst;
@@ -431,7 +430,7 @@ class MklReluOpBase : public OpKernel {
         dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
                                   dnn_shape_src.GetSizesAsMklDnnDims(),
                                   dnn_shape_src.GetTfDataFormat());
-        tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+        tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_dst.SetMklTensor(false);
         tf_shape_dst = src_tensor.shape();
@@ -445,34 +444,32 @@ class MklReluOpBase : public OpKernel {
 
       // execute net
       std::vector<primitive> net;
-      auto relu_fwd = relu_forward(*relu_fwd_pd, src.GetOpMem(),
-                                   dst.GetOpMem());
+      auto relu_fwd =
+          relu_forward(*relu_fwd_pd, src.GetOpMem(), dst.GetOpMem());
       net.push_back(relu_fwd);
       stream(stream::kind::eager).submit(net).wait();
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + string(e.message) +
-                         ", in file " + string(__FILE__) + ":" +
-                         std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                        error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 };
 
-
 template <typename Device, typename T, algorithm alg_kind>
 class MklReluGradOpBase : public OpKernel {
  public:
   ~MklReluGradOpBase() {}
 
-  explicit MklReluGradOpBase(OpKernelConstruction* context) :
-    OpKernel(context) {}
+  explicit MklReluGradOpBase(OpKernelConstruction* context)
+      : OpKernel(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) = 0;
 
-  void Compute(OpKernelContext* context)  {
+  void Compute(OpKernelContext* context) {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
       MklDnnData<T> src(&cpu_engine);
@@ -483,9 +480,9 @@ class MklReluGradOpBase : public OpKernel {
       const size_t src_index = 1;       // index of src input tensor
       const size_t diff_src_index = 0;  // index of diff_src output tensor
 
-      const Tensor& src_tensor      = MklGetInput(context, src_index);
+      const Tensor& src_tensor = MklGetInput(context, src_index);
       const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
-      Tensor* diff_src_tensor       = nullptr;
+      Tensor* diff_src_tensor = nullptr;
 
       MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
       GetMklShape(context, src_index, &dnn_shape_src);
@@ -526,25 +523,25 @@ class MklReluGradOpBase : public OpKernel {
         src_md = dnn_shape_src.GetMklLayout();
 
         memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat();
-        auto src_tf_data_format = MklDnnDataFormatToTFDataFormat(
-                                    src_mkl_data_format);
+        auto src_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(src_mkl_data_format);
         auto diff_dst_dims = TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(),
                                                        src_tf_data_format);
-        diff_dst_md = memory::desc(diff_dst_dims, MklDnnType<T>(),
-                                   src_mkl_data_format);
+        diff_dst_md =
+            memory::desc(diff_dst_dims, MklDnnType<T>(), src_mkl_data_format);
       } else if (!dnn_shape_src.IsMklTensor() &&
-                  dnn_shape_diff_dst.IsMklTensor()) {
+                 dnn_shape_diff_dst.IsMklTensor()) {
         // Same comment as above.
         diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
 
         memory::format diff_dst_mkl_data_format =
-          dnn_shape_diff_dst.GetTfDataFormat();
-        auto diff_dst_tf_data_format = MklDnnDataFormatToTFDataFormat(
-                                          diff_dst_mkl_data_format);
+            dnn_shape_diff_dst.GetTfDataFormat();
+        auto diff_dst_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(diff_dst_mkl_data_format);
         auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
                                                   diff_dst_tf_data_format);
-        src_md = memory::desc(src_dims, MklDnnType<T>(),
-                              diff_dst_mkl_data_format);
+        src_md =
+            memory::desc(src_dims, MklDnnType<T>(), diff_dst_mkl_data_format);
       } else {
         // If both the inputs are in MKL format, we use Mkl layout of the input
         // tensors.
@@ -572,12 +569,12 @@ class MklReluGradOpBase : public OpKernel {
       std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
       auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
                                               alg_kind, src_md, alpha, beta);
-      relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
-                                                         cpu_engine));
-      auto relu_bwd_desc = relu_backward::desc(alg_kind, common_md, common_md,
-                                                alpha, beta);
-      auto relu_bwd_pd  = relu_backward::primitive_desc(relu_bwd_desc,
-                                                cpu_engine, *relu_fwd_pd);
+      relu_fwd_pd.reset(
+          new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine));
+      auto relu_bwd_desc =
+          relu_backward::desc(alg_kind, common_md, common_md, alpha, beta);
+      auto relu_bwd_pd = relu_backward::primitive_desc(
+          relu_bwd_desc, cpu_engine, *relu_fwd_pd);
 
       // allocate diff_src tensor
       MklDnnShape dnn_shape_diff_src;
@@ -590,33 +587,32 @@ class MklReluGradOpBase : public OpKernel {
         dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(),
                                        dnn_shape_src.GetSizesAsMklDnnDims(),
                                        dnn_shape_src.GetTfDataFormat());
-        tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_diff_src.SetMklTensor(false);
         tf_shape_diff_src = src_tensor.shape();
       }
       AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
-                                 tf_shape_diff_src, dnn_shape_diff_src);
+                                tf_shape_diff_src, dnn_shape_diff_src);
 
       // diff_src memory descriptor is same as memory descriptor for both
       // inputs.
       diff_src.SetUsrMem(common_md, diff_src_tensor);
 
       PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst);
-     } catch (mkldnn::error &e) {
-       string error_msg = "Status: " + std::to_string(e.status) +
-                          ", message: " + string(e.message) +
-                          ", in file " + string(__FILE__) + ":" +
-                          std::to_string(__LINE__);
-       OP_REQUIRES_OK(context,
-                      errors::Aborted("Operation received an exception:",
-                                      error_msg));
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
   void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc,
-                  MklDnnData<T>* src, MklDnnData<T>* diff_src, MklDnnData<T>*
-                  diff_dst) {
+                            MklDnnData<T>* src, MklDnnData<T>* diff_src,
+                            MklDnnData<T>* diff_dst) {
     std::vector<primitive> net;
 
     // Check if we need to reorder original input tensors into common_md layout
@@ -632,14 +628,13 @@ class MklReluGradOpBase : public OpKernel {
   }
 };
 
-
 template <typename Device, typename T>
 class MklReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
  public:
   ~MklReluOp() {}
 
-  explicit MklReluOp(OpKernelConstruction* context) :
-  MklReluOpBase<Device, T, eltwise_relu>(context) {}
+  explicit MklReluOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_relu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -649,15 +644,15 @@ class MklReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
     GetMklShape(context, src_index, &dnn_shape_src);
 
     Tensor* dst_tensor = nullptr;
-    void* user_i = static_cast<void*>(const_cast<T*>(
-                         src_tensor.flat<T>().data()));
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
     AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
                               src_tensor.shape(), dnn_shape_dst);
     void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
     (static_cast<T*>(out_o))[0] =
-              std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
+        std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
     return;
   }
 };
@@ -667,14 +662,14 @@ class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
  public:
   ~MklReluGradOp() {}
 
-  explicit MklReluGradOp(OpKernelConstruction* context) :
-  MklReluGradOpBase<Device, T, eltwise_relu>(context) {}
+  explicit MklReluGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_relu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
     const size_t src_index = 1;       // index of src input tensor
     const size_t diff_src_index = 0;  // index of diff_src output tensor
-    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& src_tensor = MklGetInput(context, src_index);
     const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
     Tensor* diff_src_tensor = nullptr;
 
@@ -687,11 +682,11 @@ class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
                               diff_dst_tensor.shape(), dnn_shape_diff_src);
     void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
     void* user_i =
-          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     void* user_g =
-          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
-    (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
-                                  ((static_cast<T*>(user_i))[0] > 0);
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] =
+        (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
     return;
   }
 };
@@ -701,8 +696,8 @@ class MklEluOp : public MklReluOpBase<Device, T, eltwise_elu> {
  public:
   ~MklEluOp() {}
 
-  explicit MklEluOp(OpKernelConstruction* context) :
-  MklReluOpBase<Device, T, eltwise_elu>(context) {}
+  explicit MklEluOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_elu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -712,8 +707,8 @@ class MklEluOp : public MklReluOpBase<Device, T, eltwise_elu> {
     GetMklShape(context, src_index, &dnn_shape_src);
 
     Tensor* dst_tensor = nullptr;
-    void* user_i = static_cast<void*>(const_cast<T*>(
-                         src_tensor.flat<T>().data()));
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
     AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
@@ -734,14 +729,14 @@ class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
  public:
   ~MklEluGradOp() {}
 
-  explicit MklEluGradOp(OpKernelConstruction* context) :
-  MklReluGradOpBase<Device, T, eltwise_elu>(context) {}
+  explicit MklEluGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_elu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
     const size_t src_index = 1;       // index of src input tensor
     const size_t diff_src_index = 0;  // index of diff_src output tensor
-    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& src_tensor = MklGetInput(context, src_index);
     const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
     Tensor* diff_src_tensor = nullptr;
 
@@ -754,9 +749,9 @@ class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
                               diff_dst_tensor.shape(), dnn_shape_diff_src);
     void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
     void* user_i =
-          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     void* user_g =
-          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
     // gradient of elu(x) = 1 if x > 0; elu(x) + 1 otherwise
     T feature = (static_cast<T*>(user_i))[0];
     if (feature > 0) {
@@ -773,8 +768,8 @@ class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
  public:
   ~MklTanhOp() {}
 
-  explicit MklTanhOp(OpKernelConstruction* context) :
-  MklReluOpBase<Device, T, eltwise_tanh>(context) {}
+  explicit MklTanhOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_tanh>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -784,8 +779,8 @@ class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
     GetMklShape(context, src_index, &dnn_shape_src);
 
     Tensor* dst_tensor = nullptr;
-    void* user_i = static_cast<void*>(const_cast<T*>(
-                         src_tensor.flat<T>().data()));
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
     AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
@@ -795,7 +790,7 @@ class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
     T feature = (static_cast<T*>(user_i))[0];
     T e1 = std::exp(feature);
     T e2 = std::exp(-feature);
-    (static_cast<T*>(out_o))[0] = (e1 - e2)/(e1 + e2);
+    (static_cast<T*>(out_o))[0] = (e1 - e2) / (e1 + e2);
     return;
   }
 };
@@ -805,14 +800,14 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
  public:
   ~MklTanhGradOp() {}
 
-  explicit MklTanhGradOp(OpKernelConstruction* context) :
-  MklReluGradOpBase<Device, T, eltwise_tanh>(context) {}
+  explicit MklTanhGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_tanh>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
     const size_t src_index = 1;       // index of src input tensor
     const size_t diff_src_index = 0;  // index of diff_src output tensor
-    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& src_tensor = MklGetInput(context, src_index);
     const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
     Tensor* diff_src_tensor = nullptr;
 
@@ -825,16 +820,16 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
                               diff_dst_tensor.shape(), dnn_shape_diff_src);
     void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
     void* user_i =
-          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     // gradient of tanh(x) = 1 - tanh(x)^2
     T feature = (static_cast<T*>(user_i))[0];
     T e1 = std::exp(feature);
     T e2 = std::exp(-feature);
-    T tanh = (e1 - e2)/(e1 + e2);
+    T tanh = (e1 - e2) / (e1 + e2);
     void* user_g =
-          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
-    (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
-                                  (1 - tanh * tanh);
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] =
+        (static_cast<T*>(user_g))[0] * (1 - tanh * tanh);
   }
 };
 
@@ -857,13 +852,13 @@ TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 #ifdef INTEL_MKL_DNN
 
 // register dnn kernels for supported operations and supported types
-#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
-  REGISTER_KERNEL_BUILDER(Name("_MklElu")                          \
+#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)              \
+  REGISTER_KERNEL_BUILDER(Name("_MklElu")                           \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<type>("T")            \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklEluOp<CPUDevice, type>);              \
-  REGISTER_KERNEL_BUILDER(Name("_MklEluGrad")                      \
+                          MklEluOp<CPUDevice, type>);               \
+  REGISTER_KERNEL_BUILDER(Name("_MklEluGrad")                       \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<type>("T")            \
                               .Label(mkl_op_registry::kMklOpLabel), \
@@ -888,4 +883,3 @@ TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES);
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
-
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index b41e529357..7d471e1e4c 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -166,9 +166,9 @@ class MklReshapeOp : public OpKernel {
     MklDnnShape mkl_shape_input;
     GetMklShape(context, kInputSlotIdx, &mkl_shape_input);
     bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
-    const int64 nelems = input_in_mkl_format ?
-                         mkl_shape_input.GetTfShape().num_elements()
-                         : input_tensor.NumElements();
+    const int64 nelems = input_in_mkl_format
+                             ? mkl_shape_input.GetTfShape().num_elements()
+                             : input_tensor.NumElements();
 
     // Preliminary validation of sizes.
     OP_REQUIRES(context, IsLegacyVector(sizes.shape()),
@@ -210,11 +210,11 @@ class MklReshapeOp : public OpKernel {
               product));
       shape.set_dim(unknown_index, missing);
     }
-    OP_REQUIRES(context, shape.num_elements() == nelems,
-                errors::InvalidArgument("Input to reshape is a tensor with ",
-                                        nelems,
-                                        " values, but the requested shape has ",
-                                        shape.num_elements()));
+    OP_REQUIRES(
+        context, shape.num_elements() == nelems,
+        errors::InvalidArgument("Input to reshape is a tensor with ", nelems,
+                                " values, but the requested shape has ",
+                                shape.num_elements()));
 
     if (input_in_mkl_format) {
       TensorShape& shape_to = shape;
@@ -237,38 +237,38 @@ class MklReshapeOp : public OpKernel {
           // need to update MklDnnShape object associated with the input
           // tensor to reflect the shape change expected by reshape.
           if (!SkipReorder(mkl_shape_input, shape_to)) {
-              // If dimensions that are being expanded or collapsed are not
-              // maintained contiguously by MKLDNN, then we use reorder.
-
-              // Get Mkl layout of input tensor.
-              auto input_mkl_md = mkl_shape_input.GetMklLayout();
-              // Set input Mkl layout as the user layout.
-              dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor);
-              // Get expected Tensorflow layout of input tensor.
-              auto output_tf_md = mkl_shape_input.GetTfLayout();
-              auto output_tf_pd = memory::primitive_desc(output_tf_md,
-                                                         cpu_engine);
-
-              Tensor* output_tensor = nullptr;
-              MklShape mkl_shape_output;
-              mkl_shape_output.SetMklTensor(false);
-              // We allocate output tensor in the shape expected by Reshape.
-              AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
-                                        shape_to, mkl_shape_output);
-
-              // Insert reorder between Mkl layout and TensorFlow layout if
-              // needed. If reorder is not needed but reshape is needed (since
-              // shape_from != shape_to), then we just copy input tensor to
-              // output tensor with target shape (we cannot forward Mkl layout
-              // in such case because shape has changed.)
-              std::vector<primitive> net;
-              if (dnn_data_input.CheckReorderToOpMem(output_tf_pd,
-                       output_tensor, &net)) {
-                stream(stream::kind::eager).submit(net).wait();
-              } else {
-                output_tensor->CopyFrom(input_tensor, shape_to);
-              }
-              return;
+            // If dimensions that are being expanded or collapsed are not
+            // maintained contiguously by MKLDNN, then we use reorder.
+
+            // Get Mkl layout of input tensor.
+            auto input_mkl_md = mkl_shape_input.GetMklLayout();
+            // Set input Mkl layout as the user layout.
+            dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor);
+            // Get expected Tensorflow layout of input tensor.
+            auto output_tf_md = mkl_shape_input.GetTfLayout();
+            auto output_tf_pd =
+                memory::primitive_desc(output_tf_md, cpu_engine);
+
+            Tensor* output_tensor = nullptr;
+            MklShape mkl_shape_output;
+            mkl_shape_output.SetMklTensor(false);
+            // We allocate output tensor in the shape expected by Reshape.
+            AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
+                                      shape_to, mkl_shape_output);
+
+            // Insert reorder between Mkl layout and TensorFlow layout if
+            // needed. If reorder is not needed but reshape is needed (since
+            // shape_from != shape_to), then we just copy input tensor to
+            // output tensor with target shape (we cannot forward Mkl layout
+            // in such case because shape has changed.)
+            std::vector<primitive> net;
+            if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, output_tensor,
+                                                   &net)) {
+              stream(stream::kind::eager).submit(net).wait();
+            } else {
+              output_tensor->CopyFrom(input_tensor, shape_to);
+            }
+            return;
           } else {
             // If dimensions that are being expanded or collapsed are
             // maintained contiguously by MKLDNN, then we skip reorder, just
@@ -276,10 +276,10 @@ class MklReshapeOp : public OpKernel {
             // Tensorflow tensor as it is to the output.
             auto output_dims = TFShapeToMklDnnDims(shape_to);
             auto output_strides = CalculateTFStrides(output_dims);
-            auto output_tf_md = MklDnnData<T>::CreateBlockedMemDesc(output_dims,
-                                                               output_strides);
-            auto output_tf_pd = memory::primitive_desc(output_tf_md,
-                                                       cpu_engine);
+            auto output_tf_md = MklDnnData<T>::CreateBlockedMemDesc(
+                output_dims, output_strides);
+            auto output_tf_pd =
+                memory::primitive_desc(output_tf_md, cpu_engine);
 
             // Set MklDnnShape
             MklDnnShape mkl_shape_output;
@@ -291,18 +291,17 @@ class MklReshapeOp : public OpKernel {
 
             // We now simply forward input Mkl tensor to output and change its
             // output MklDnnShape object.
-            ForwardMklTensorInToOutWithMklShape(context, kInputSlotIdx,
-                                              kOutputSlotIdx, mkl_shape_output);
+            ForwardMklTensorInToOutWithMklShape(
+                context, kInputSlotIdx, kOutputSlotIdx, mkl_shape_output);
             return;
           }
-        } catch (mkldnn::error &e) {
+        } catch (mkldnn::error& e) {
           string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-          OP_REQUIRES_OK(context,
-                   errors::Aborted("Operation received an exception:",
-                      error_msg));
+                             ", message: " + string(e.message) + ", in file " +
+                             string(__FILE__) + ":" + std::to_string(__LINE__);
+          OP_REQUIRES_OK(
+              context,
+              errors::Aborted("Operation received an exception:", error_msg));
         }
       }
     } else {
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
index b48c735d12..c35f857cfe 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ b/tensorflow/core/kernels/mkl_tfconv_op.cc
@@ -28,9 +28,9 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 64bdef0008..5d28b87e6b 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -92,13 +92,11 @@ static inline bool IOUGreaterThanThreshold(
   return iou > iou_threshold;
 }
 
-void DoNonMaxSuppressionOp(OpKernelContext* context,
-                           const Tensor& boxes,
-                           const Tensor& scores,
-                           const Tensor& max_output_size,
+void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
+                           const Tensor& scores, const Tensor& max_output_size,
                            const float iou_threshold) {
   OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1,
-      errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+              errors::InvalidArgument("iou_threshold must be in [0, 1]"));
 
   int num_boxes = 0;
   ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
@@ -106,10 +104,8 @@ void DoNonMaxSuppressionOp(OpKernelContext* context,
     return;
   }
 
-  const int output_size =
-      std::min(max_output_size.scalar<int>()(), num_boxes);
-  typename TTypes<float, 2>::ConstTensor boxes_data =
-      boxes.tensor<float, 2>();
+  const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
+  typename TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
 
   std::vector<float> scores_data(num_boxes);
   std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
@@ -181,8 +177,7 @@ template <typename Device>
 class NonMaxSuppressionV2Op : public OpKernel {
  public:
   explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     // boxes: [num_boxes, 4]
@@ -197,10 +192,9 @@ class NonMaxSuppressionV2Op : public OpKernel {
                                 max_output_size.shape().DebugString()));
     // iou_threshold: scalar
     const Tensor& iou_threshold = context->input(3);
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
-        errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
-                                iou_threshold.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                        iou_threshold.shape().DebugString()));
 
     const float iou_threshold_val = iou_threshold.scalar<float>()();
 
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index fdbcf05b89..67d9217b95 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -43,9 +43,10 @@ class NonMaxSuppressionOpTest : public OpsTestBase {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {3});
   TF_ASSERT_OK(RunOpKernel());
@@ -58,7 +59,7 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
 TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) {
   MakeOp(.5);
   AddInputFromArray<float>(TensorShape({6, 4}),
-                           {1, 1,  0, 0,  0, 0.1f,  1, 1.1f,  0, .9f,  1, -0.1f,
+                           {1, 1,  0, 0,  0, 0.1f,  1, 1.1f,  0, .9f, 1, -0.1f,
                             0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {3});
@@ -71,9 +72,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {2});
   TF_ASSERT_OK(RunOpKernel());
@@ -85,9 +87,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostThirtyBoxesFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {30});
   TF_ASSERT_OK(RunOpKernel());
@@ -134,9 +137,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromTenIdenticalBoxes) {
 
 TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f});
   AddInputFromArray<int>(TensorShape({}), {30});
   Status s = RunOpKernel();
diff --git a/tensorflow/core/kernels/nth_element_op.cc b/tensorflow/core/kernels/nth_element_op.cc
index da825e408c..7f12eb953a 100644
--- a/tensorflow/core/kernels/nth_element_op.cc
+++ b/tensorflow/core/kernels/nth_element_op.cc
@@ -16,15 +16,15 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #include "tensorflow/core/kernels/nth_element_op.h"
 
+#include <algorithm>
+#include <iostream>
+#include <vector>
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
-#include <vector>
-#include <algorithm>
-#include <iostream>
 
 namespace tensorflow {
 
@@ -54,8 +54,9 @@ class NthElementOp : public OpKernel {
                 errors::InvalidArgument("Input must be >= 1-D, got shape ",
                                         input_in.shape().DebugString()));
     // The last dimension of input tensor must be greater than N.
-    OP_REQUIRES(context, input_in.dim_size(num_dims-1) > n,
-                errors::InvalidArgument("Input must have at least n+1 columns"));
+    OP_REQUIRES(
+        context, input_in.dim_size(num_dims - 1) > n,
+        errors::InvalidArgument("Input must have at least n+1 columns"));
 
     // std::nth_element only support the nth-smallest selection.
     if (reverse_) {
@@ -64,7 +65,7 @@ class NthElementOp : public OpKernel {
 
     // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1].
     TensorShape out_shape;
-    for (int i = 0; i < num_dims-1; ++i) {
+    for (int i = 0; i < num_dims - 1; ++i) {
       out_shape.AddDim(input_in.dim_size(i));
     }
     Tensor* output_tensor = nullptr;
@@ -83,32 +84,28 @@ namespace functor {
 
 template <typename T>
 struct NthElementFunctor<CPUDevice, T> {
-  void operator() (OpKernelContext* context,
-                   const Tensor& input_tensor,
-                   Tensor& output_tensor,
-                   int n,
-                   bool reverse) {
+  void operator()(OpKernelContext* context, const Tensor& input_tensor,
+                  Tensor& output_tensor, int n, bool reverse) {
     const T* input = input_tensor.flat<T>().data();
     T* output = output_tensor.flat<T>().data();
 
     // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1],
     // then num_rows = d1*d2...dk-1, last_dim = dk.
     const int num_rows = output_tensor.NumElements();
-    const int last_dim = input_tensor.dim_size(input_tensor.dims()-1);
+    const int last_dim = input_tensor.dim_size(input_tensor.dims() - 1);
 
     // Allocate each row to different shard.
-    auto SubNthElement = [&, input, output, last_dim, n](int start,
-                                                         int limit) {
+    auto SubNthElement = [&, input, output, last_dim, n](int start, int limit) {
       // std::nth_element would rearrange the array, so we need a new buffer.
       std::vector<T> buf(last_dim);
 
       for (int b = start; b < limit; ++b) {
         // Copy from one row of elements to buffer
         const T* input_start = input + b * last_dim;
-        const T* input_end = input + (b+1) * last_dim;
+        const T* input_end = input + (b + 1) * last_dim;
         std::copy(input_start, input_end, buf.begin());
 
-        std::nth_element(buf.begin(), buf.begin()+n, buf.end());
+        std::nth_element(buf.begin(), buf.begin() + n, buf.end());
         // The element placed in the nth position is exactly the element that
         // would occur in this position if the range was fully sorted.
         output[b] = buf[n];
@@ -116,9 +113,9 @@ struct NthElementFunctor<CPUDevice, T> {
     };
 
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    // The average time complexity of partition-based nth_element (BFPRT) is O(n),
-    // althought the worst time complexity could be O(n^2).
-    // Here, 20 is a empirical factor of cost_per_unit.
+    // The average time complexity of partition-based nth_element (BFPRT) is
+    // O(n), althought the worst time complexity could be O(n^2). Here, 20 is a
+    // empirical factor of cost_per_unit.
     Shard(worker_threads.num_threads, worker_threads.workers, num_rows,
           20 * last_dim, SubNthElement);
   }
@@ -126,7 +123,6 @@ struct NthElementFunctor<CPUDevice, T> {
 
 }  // namespace functor
 
-
 #define REGISTER_NTHOP(T)                                           \
   REGISTER_KERNEL_BUILDER(                                          \
       Name("NthElement").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
@@ -136,4 +132,3 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_NTHOP);
 #undef REGISTER_NTHOP
 
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/nth_element_op.h b/tensorflow/core/kernels/nth_element_op.h
index 11a6c996b0..e7d25daecc 100644
--- a/tensorflow/core/kernels/nth_element_op.h
+++ b/tensorflow/core/kernels/nth_element_op.h
@@ -26,10 +26,8 @@ namespace functor {
 
 template <typename Device, typename T>
 struct NthElementFunctor {
-  void operator() (OpKernelContext* context,
-                   const Tensor& input_tensor,
-                   Tensor& output_tensor,
-                   int n);
+  void operator()(OpKernelContext* context, const Tensor& input_tensor,
+                  Tensor& output_tensor, int n);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
index 49fd4bdeba..647515ae38 100644
--- a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
@@ -19,16 +19,16 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/one_hot_op.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/one_hot_op.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define DEFINE_GPU_SPEC_INDEX(T, TI)                   \
-  template class generator::OneGenerator<T, TI>;       \
+#define DEFINE_GPU_SPEC_INDEX(T, TI)             \
+  template class generator::OneGenerator<T, TI>; \
   template struct functor::OneHot<GPUDevice, T, TI>;
 
 #define DEFINE_GPU_SPEC(T)         \
diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc
index 9d53882dee..13427d71ff 100644
--- a/tensorflow/core/kernels/ops_util_test.cc
+++ b/tensorflow/core/kernels/ops_util_test.cc
@@ -218,7 +218,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_2) {
 // in_size = 3, ksize = 3, stride = 2, pad_size = 0
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 2, 0}, {0, 3}}, {{1, 3, 3, 2, 0}, {2, 1}},
+      {{0, 3, 3, 2, 0}, {0, 3}},
+      {{1, 3, 3, 2, 0}, {2, 1}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -228,7 +229,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) {
 // in_size = 3, ksize = 3, stride = 2, pad_size = 1
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_1) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 2, 1}, {0, 2}}, {{1, 3, 3, 2, 1}, {1, 2}},
+      {{0, 3, 3, 2, 1}, {0, 2}},
+      {{1, 3, 3, 2, 1}, {1, 2}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -258,7 +260,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_0) {
 // in_size = 3, ksize = 3, stride = 3, pad_size = 1
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_1) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 3, 1}, {0, 2}}, {{1, 3, 3, 3, 1}, {2, 1}},
+      {{0, 3, 3, 3, 1}, {0, 2}},
+      {{1, 3, 3, 3, 1}, {2, 1}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -348,8 +351,8 @@ TEST_F(OpsUtilTest, Misaligned1DSlice) {
 
 TEST_F(OpsUtilTest, Aligned2DSliceOfDim0) {
 #if EIGEN_MAX_ALIGN_BYTES == 0
-  // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is nonzero,
-  // a multidimensional tensor is always aligned.
+  // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is
+  // nonzero, a multidimensional tensor is always aligned.
   Tensor t(DT_FLOAT, TensorShape({3, 4}));
   int64 start = 1;
   int64 end = 2;
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 2033fbf5dc..e0ae5de0f4 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -36,7 +36,7 @@ typedef Eigen::GpuDevice GPUDevice;
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // --------------------------------------------------------------------------
 template <typename Device, typename T>
@@ -123,7 +123,7 @@ class PackOp : public OpKernel {
         ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
         return;
       }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
index b232ba16a7..0ab9ff9f65 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
@@ -95,9 +95,10 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
         int64 sample = b * samples_per_batch;
 
         // On GPU, this check will just fill samples with NAN if it fails.
-        OP_REQUIRES(ctx, stddev > T(0) && minval < maxval &&
-                             (Eigen::numext::isfinite(minval) ||
-                              Eigen::numext::isfinite(maxval)),
+        OP_REQUIRES(ctx,
+                    stddev > T(0) && minval < maxval &&
+                        (Eigen::numext::isfinite(minval) ||
+                         Eigen::numext::isfinite(maxval)),
                     errors::InvalidArgument("Invalid parameters"));
 
         int numIterations = 0;
@@ -118,8 +119,9 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
         // Determine the method to use.
         const T sqrtFactor = Eigen::numext::sqrt((normMin * normMin) + T(4));
         const T cutoff =
-            T(2) * Eigen::numext::exp(
-                       T(0.5) + (normMin * (normMin - sqrtFactor)) / T(4)) /
+            T(2) *
+            Eigen::numext::exp(T(0.5) +
+                               (normMin * (normMin - sqrtFactor)) / T(4)) /
             (normMin + sqrtFactor);
         const T diff = normMax - normMin;
         if (diff < cutoff) {
@@ -309,30 +311,34 @@ class ParameterizedTruncatedNormalOp : public OpKernel {
     } else {
       // Parameters must be broadcastable to the shape [num_batches].
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(means_tensor.shape()) ||
-                   means_tensor.dim_size(0) == 1 ||
-                   means_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(means_tensor.shape()) ||
+              means_tensor.dim_size(0) == 1 ||
+              means_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input means should have length 1 or shape[0], got shape: ",
               means_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(stddevs_tensor.shape()) ||
-                   stddevs_tensor.dim_size(0) == 1 ||
-                   stddevs_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(stddevs_tensor.shape()) ||
+              stddevs_tensor.dim_size(0) == 1 ||
+              stddevs_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input stddevs should have length 1 or shape[0], got shape: ",
               stddevs_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(minvals_tensor.shape()) ||
-                   minvals_tensor.dim_size(0) == 1 ||
-                   minvals_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(minvals_tensor.shape()) ||
+              minvals_tensor.dim_size(0) == 1 ||
+              minvals_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input minvals should have length 1 or shape[0], got shape: ",
               minvals_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(maxvals_tensor.shape()) ||
-                   maxvals_tensor.dim_size(0) == 1 ||
-                   maxvals_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(maxvals_tensor.shape()) ||
+              maxvals_tensor.dim_size(0) == 1 ||
+              maxvals_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input maxvals should have length 1 or shape[0], got shape: ",
               maxvals_tensor.shape().DebugString()));
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
index 933de65c15..ddfeb1bb79 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
@@ -202,12 +202,13 @@ struct TruncatedNormalFunctor<GPUDevice, T> {
                   typename TTypes<T>::Flat output) {
     const auto config = GetCudaLaunchConfig(num_elements, d);
 
-    TruncatedNormalKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        gen, output.data(), num_batches, samples_per_batch, num_elements,
-        means.data(), means.dimension(0) == 1, stddevs.data(),
-        stddevs.dimension(0) == 1, minvals.data(), minvals.dimension(0) == 1,
-        maxvals.data(), maxvals.dimension(0) == 1, kMaxIterations);
+    TruncatedNormalKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            gen, output.data(), num_batches, samples_per_batch, num_elements,
+            means.data(), means.dimension(0) == 1, stddevs.data(),
+            stddevs.dimension(0) == 1, minvals.data(),
+            minvals.dimension(0) == 1, maxvals.data(),
+            maxvals.dimension(0) == 1, kMaxIterations);
   };
 };
 
diff --git a/tensorflow/core/kernels/parse_tensor_op.cc b/tensorflow/core/kernels/parse_tensor_op.cc
index 6b599612ad..dd41744f02 100644
--- a/tensorflow/core/kernels/parse_tensor_op.cc
+++ b/tensorflow/core/kernels/parse_tensor_op.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index a406317213..01bcfede1e 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -258,7 +258,7 @@ struct LaunchMaxPooling3dGradOp<CPUDevice, T> {
           Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
 #else
           Eigen::IndexList<Eigen::type2index<1>, int, int, int,
-                           Eigen::type2index<1> >
+                           Eigen::type2index<1>>
               bcast;
           bcast.set(1, csize);
           bcast.set(2, rsize);
@@ -431,7 +431,7 @@ struct LaunchAvgPooling3dGradOp<CPUDevice, T> {
           Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
 #else
           Eigen::IndexList<Eigen::type2index<1>, int, int, int,
-                           Eigen::type2index<1> >
+                           Eigen::type2index<1>>
               bcast;
           bcast.set(1, csize);
           bcast.set(2, rsize);
@@ -833,7 +833,7 @@ TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T)
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS)
+    TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS)
 #undef REGISTER_SYCL_KERNELS
 #endif  // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/pooling_ops_3d_sycl.h b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
index c1bc5af498..b4bead2456 100644
--- a/tensorflow/core/kernels/pooling_ops_3d_sycl.h
+++ b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
@@ -281,12 +281,11 @@ class MaxPool3DGradSYCL {
 
     const T* input_data_n =
         input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
-    const T* output_data_n =
-        output_data +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
-    const T* input_backprop_n =
-        input_backprop +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+    const T* output_data_n = output_data + n * p_.out_planes_ * p_.out_cols_ *
+                                               p_.out_rows_ * p_.depth_;
+    const T* input_backprop_n = input_backprop + n * p_.out_planes_ *
+                                                     p_.out_cols_ *
+                                                     p_.out_rows_ * p_.depth_;
     for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
       int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
       const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
@@ -678,9 +677,9 @@ class AvgPool3DGradSYCL {
     n /= p_.in_planes_;
 
     T gradient = T(0);
-    const T* input_backprop_n =
-        input_backprop +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+    const T* input_backprop_n = input_backprop + n * p_.out_planes_ *
+                                                     p_.out_cols_ *
+                                                     p_.out_rows_ * p_.depth_;
     for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
       int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
       const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
index e3131b804f..fc7cb437b8 100644
--- a/tensorflow/core/kernels/pooling_ops_common.h
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -195,7 +195,6 @@ class MaxPoolingOp : public OpKernel {
       //    and updates the corresponding column(s) in output_as_matrix with the
       //    max value.
       auto shard = [&params, &in_mat, &out_mat](int64 start, int64 limit) {
-
         const int32 in_rows = params.tensor_in_rows;
         const int32 in_cols = params.tensor_in_cols;
         const int32 pad_rows = params.pad_rows;
@@ -443,7 +442,6 @@ class MaxPoolingV2Op : public OpKernel {
       //    and updates the corresponding column(s) in output_as_matrix with the
       //    max value.
       auto shard = [&params, &in_mat, &out_mat](int64 start, int64 limit) {
-
         const int32 in_rows = params.tensor_in_rows;
         const int32 in_cols = params.tensor_in_cols;
         const int32 pad_rows = params.pad_rows;
diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc
index d148c9f78d..176720c22c 100644
--- a/tensorflow/core/kernels/quantization_utils_test.cc
+++ b/tensorflow/core/kernels/quantization_utils_test.cc
@@ -385,8 +385,12 @@ void TestQuantizedToFloatInPlaceUsingEigen(
   // These are the float values we're going to test the conversions on.
   typedef std::pair<float, float> FPair;
   for (FPair min_and_max : std::vector<FPair>{
-           FPair(-255.0f, 255.0f), FPair(-1.0f, 1.0f), FPair(-1.0f, 255.0f),
-           FPair(0.0f, 1e6), FPair(0.0f, 1.0f), FPair(-31.0f, 13.0f),
+           FPair(-255.0f, 255.0f),
+           FPair(-1.0f, 1.0f),
+           FPair(-1.0f, 255.0f),
+           FPair(0.0f, 1e6),
+           FPair(0.0f, 1.0f),
+           FPair(-31.0f, 13.0f),
            FPair(-5.89505e+08, 5.89505e+08),
        }) {
     const float f_min = min_and_max.first;
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h
index 1363c7e325..3b09ea2527 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.h
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h
@@ -71,7 +71,8 @@ struct QuantizeAndDequantizeOneScaleImpl {
         out.device(d) =
             ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) *
                  scale +
-             T(0.5)).floor() *
+             T(0.5))
+                    .floor() *
                 inverse_scale +
             min_range;
       } else {
diff --git a/tensorflow/core/kernels/quantize_op_test.cc b/tensorflow/core/kernels/quantize_op_test.cc
index d2cc55a94d..57982bdf76 100644
--- a/tensorflow/core/kernels/quantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_op_test.cc
@@ -250,7 +250,8 @@ TEST_F(QuantizedOpTest, QuantizeV2_32Bit) {
   Tensor expected(allocator(), DT_QINT32, TensorShape({element_count}));
   test::FillValues<qint32>(&expected,
                            {
-                               std::numeric_limits<int32>::min(), 0,
+                               std::numeric_limits<int32>::min(),
+                               0,
                                static_cast<int32>(1.0f * (1 << 23)),
                                static_cast<int32>(1.25f * (1 << 23)),
                                static_cast<int32>(1.75f * (1 << 23)),
diff --git a/tensorflow/core/kernels/quantized_batch_norm_op.cc b/tensorflow/core/kernels/quantized_batch_norm_op.cc
index 18d83b4149..b03da7ad17 100644
--- a/tensorflow/core/kernels/quantized_batch_norm_op.cc
+++ b/tensorflow/core/kernels/quantized_batch_norm_op.cc
@@ -16,11 +16,11 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc
index d67f1ab3ec..b03ac8e87d 100644
--- a/tensorflow/core/kernels/quantized_concat_op.cc
+++ b/tensorflow/core/kernels/quantized_concat_op.cc
@@ -135,8 +135,8 @@ class QuantizedConcatOp : public OpKernel {
           context, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
-              input_shape.DebugString(), " vs. shape[", i, "] = ",
-              in.shape().DebugString()));
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", in.shape().DebugString()));
       for (int j = 0; j < input_dims; ++j) {
         if (j == concat_dim) {
           continue;
@@ -145,8 +145,8 @@ class QuantizedConcatOp : public OpKernel {
             context, in.dim_size(j) == input_shape.dim_size(j),
             errors::InvalidArgument(
                 "ConcatOp : Dimensions of inputs should match: shape[0] = ",
-                input_shape.DebugString(), " vs. shape[", i, "] = ",
-                in.shape().DebugString()));
+                input_shape.DebugString(), " vs. shape[", i,
+                "] = ", in.shape().DebugString()));
       }
       if (in.NumElements() > 0) {
         int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc
index 1921b83d12..5b3570edff 100644
--- a/tensorflow/core/kernels/quantized_conv_ops.cc
+++ b/tensorflow/core/kernels/quantized_conv_ops.cc
@@ -278,10 +278,9 @@ class Im2ColConvFunctor {
           *resource = new Im2ColBufferResource<T1, chunk_value_count>();
           return Status::OK();
         };
-    OP_REQUIRES_OK(
-        context,
-        context->resource_manager()->LookupOrCreate(
-            "Conv2d", "im2col_buffer", &im2col_buffer_resource, creator));
+    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
+                                "Conv2d", "im2col_buffer",
+                                &im2col_buffer_resource, creator));
     // This means that multiple ops can't be run simultaneously on different
     // threads, because we have a single shared resource. The platforms this is
     // aimed at have intra-op parallelism as their focus though, so it shouldn't
diff --git a/tensorflow/core/kernels/quantized_instance_norm.cc b/tensorflow/core/kernels/quantized_instance_norm.cc
index c29f534f31..d62094cc9f 100644
--- a/tensorflow/core/kernels/quantized_instance_norm.cc
+++ b/tensorflow/core/kernels/quantized_instance_norm.cc
@@ -278,10 +278,10 @@ class QuantizedInstanceNorm : public OpKernel {
     float input_max = context->input(2).flat<float>()(0);
     float input_scale = (input_max - input_min) / 255.0f;
 
-    OP_REQUIRES(
-        context, input_min < input_max,
-        errors::InvalidArgument("input_min must be less than input_max : ",
-                                input_min, " >= ", input_max));
+    OP_REQUIRES(context, input_min < input_max,
+                errors::InvalidArgument(
+                    "input_min must be less than input_max : ", input_min,
+                    " >= ", input_max));
 
     auto input_tensor = input.tensor<quint8, 4>();
     auto N = input_tensor.dimension(0);
diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc
index afb30d5f62..da8c46dc51 100644
--- a/tensorflow/core/kernels/quantized_matmul_op.cc
+++ b/tensorflow/core/kernels/quantized_matmul_op.cc
@@ -104,9 +104,9 @@ class QuantizedMatMulOp : public OpKernel {
 
     OP_REQUIRES(context,
                 a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
-                errors::InvalidArgument("Matrix size-compatible: In[0]: ",
-                                        a.shape().DebugString(), ", In[1]: ",
-                                        b.shape().DebugString()));
+                errors::InvalidArgument(
+                    "Matrix size-compatible: In[0]: ", a.shape().DebugString(),
+                    ", In[1]: ", b.shape().DebugString()));
 
     OP_REQUIRES(context, ((shift_c >= 0) && (shift_c <= 31)),
                 errors::InvalidArgument("shift_c must be between 0 and 31, "
diff --git a/tensorflow/core/kernels/quantized_matmul_op_test.cc b/tensorflow/core/kernels/quantized_matmul_op_test.cc
index 535b5115c3..c9f05dbc10 100644
--- a/tensorflow/core/kernels/quantized_matmul_op_test.cc
+++ b/tensorflow/core/kernels/quantized_matmul_op_test.cc
@@ -206,17 +206,32 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) {
   // We have set the transpose_a flag to true, so the matrix is transposed, and
   // for filling the values the in-memory storage order is effectively
   // column major, rather than the default row-major.
-  AddInputFromArray<quint8>(TensorShape({a_rows, a_cols}),
-                            {
-                                11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
-                            });
+  AddInputFromArray<quint8>(TensorShape({a_rows, a_cols}), {
+                                                               11,
+                                                               10,
+                                                               9,
+                                                               8,
+                                                               7,
+                                                               6,
+                                                               5,
+                                                               4,
+                                                               3,
+                                                               2,
+                                                               1,
+                                                               0,
+                                                           });
 
   // The B matrix is:
   // |   1 |   4|
   // |   2 |   5|
   // |   3 |   6|
   AddInputFromArray<quint8>(TensorShape({b_rows, b_cols}), {
-                                                               1, 4, 2, 5, 3, 6,
+                                                               1,
+                                                               4,
+                                                               2,
+                                                               5,
+                                                               3,
+                                                               6,
                                                            });
   AddInputFromArray<float>(TensorShape({1}), {-12.0f});
   AddInputFromArray<float>(TensorShape({1}), {243.0f});
@@ -238,10 +253,16 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) {
   // |  -50 | -113 |
   // |  -56 | -128 |
   Tensor expected(allocator(), DT_QINT32, TensorShape({a_cols, b_cols}));
-  test::FillValues<qint32>(&expected,
-                           {
-                               -38, -83, -44, -98, -50, -113, -56, -128,
-                           });
+  test::FillValues<qint32>(&expected, {
+                                          -38,
+                                          -83,
+                                          -44,
+                                          -98,
+                                          -50,
+                                          -113,
+                                          -56,
+                                          -128,
+                                      });
   test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
 }
 
diff --git a/tensorflow/core/kernels/quantized_mul_op.cc b/tensorflow/core/kernels/quantized_mul_op.cc
index eaa5e667f7..3c7536e037 100644
--- a/tensorflow/core/kernels/quantized_mul_op.cc
+++ b/tensorflow/core/kernels/quantized_mul_op.cc
@@ -298,9 +298,8 @@ class QuantizedMulOp : public OpKernel {
       return;
     }
     Tensor* z;
-    OP_REQUIRES_OK(
-        context,
-        context->allocate_output(0, BCast::ToShape(bcast.output_shape()), &z));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, BCast::ToShape(bcast.output_shape()), &z));
 
     // Make sure that we have valid quantization ranges for the input buffers.
     // If the difference between the min and max is negative or zero, it makes
diff --git a/tensorflow/core/kernels/quantized_mul_op_test.cc b/tensorflow/core/kernels/quantized_mul_op_test.cc
index b0550c8260..a4e407c7a9 100644
--- a/tensorflow/core/kernels/quantized_mul_op_test.cc
+++ b/tensorflow/core/kernels/quantized_mul_op_test.cc
@@ -188,11 +188,12 @@ void TestManualScalar() {
       10.0f, {1}, {10.0f}, -100.0f, 100.0f, {10},
       {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f},
       3.0f);
-  TestMul({1}, {10.0f}, -100.0f, 100.0f, {10},
-          {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f,
-          10.0f, {10}, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f,
-                        90.0f, 100.0f},
-          3.0f);
+  TestMul(
+      {1}, {10.0f}, -100.0f, 100.0f, {10},
+      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f,
+      10.0f, {10},
+      {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f},
+      3.0f);
 }
 
 void TestScalar() {
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
index 330d161c32..de495c19cb 100644
--- a/tensorflow/core/kernels/queue_base.cc
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -39,8 +39,8 @@ Status HandleSliceToElement(const Tensor& parent, Tensor* element,
     return errors::Internal(
         "HandleSliceToElement Cannot copy slice: number of elements does not "
         "match.  Shapes are: [element]: ",
-        element->shape().DebugString(), ", [parent slice]: ",
-        chip_shape.DebugString());
+        element->shape().DebugString(),
+        ", [parent slice]: ", chip_shape.DebugString());
   }
   auto parent_as_matrix = parent.flat_outer_dims<T>();
   element->flat<T>() = parent_as_matrix.chip(index, 0);
diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc
index 17831b7437..46a02854d7 100644
--- a/tensorflow/core/kernels/queue_ops.cc
+++ b/tensorflow/core/kernels/queue_ops.cc
@@ -428,13 +428,14 @@ REGISTER_KERNEL_BUILDER(Name("QueueSizeV2").Device(DEVICE_CPU), QueueSizeOp);
 class QueueIsClosedOp : public QueueOpKernel {
  public:
   explicit QueueIsClosedOp(OpKernelConstruction* context)
-     : QueueOpKernel(context) {}
+      : QueueOpKernel(context) {}
 
  protected:
   void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
                     DoneCallback callback) override {
     Tensor* Tqueue_is_closed = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed));
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed));
     Tqueue_is_closed->flat<bool>().setConstant(queue->is_closed());
     callback();
   }
@@ -443,8 +444,10 @@ class QueueIsClosedOp : public QueueOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(QueueIsClosedOp);
 };
 
-REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU), QueueIsClosedOp);
-REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU), QueueIsClosedOp);
+REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU),
+                        QueueIsClosedOp);
+REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU),
+                        QueueIsClosedOp);
 
 class FakeQueueOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/random_crop_op.cc b/tensorflow/core/kernels/random_crop_op.cc
index ba94d6be5c..554909760a 100644
--- a/tensorflow/core/kernels/random_crop_op.cc
+++ b/tensorflow/core/kernels/random_crop_op.cc
@@ -68,10 +68,10 @@ class RandomCropOp : public OpKernel {
     // Edge case. The target dimensions are larger then the image, so
     // zero-pad the image. This guarantees that the image will *always*
     // be [target_height, target_width] in size.
-    OP_REQUIRES(
-        context, width >= target_width,
-        errors::FailedPrecondition("width must be >= target_width: width = ",
-                                   width, ", target_width = ", target_width));
+    OP_REQUIRES(context, width >= target_width,
+                errors::FailedPrecondition(
+                    "width must be >= target_width: width = ", width,
+                    ", target_width = ", target_width));
     OP_REQUIRES(context, height >= target_height,
                 errors::FailedPrecondition(
                     "height must be >= target_height: height = ", height,
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index 55a8b9c9b6..78ff7948fb 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -50,7 +50,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 using random::PhiloxRandom;
@@ -271,9 +271,10 @@ class RandomGammaOp : public OpKernel {
     const Tensor& shape_t = ctx->input(0);
     const Tensor& alpha_t = ctx->input(1);
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(shape_t.shape()) &&
-                         (shape_t.dtype() == DataType::DT_INT32 ||
-                          shape_t.dtype() == DataType::DT_INT64),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(shape_t.shape()) &&
+                    (shape_t.dtype() == DataType::DT_INT32 ||
+                     shape_t.dtype() == DataType::DT_INT64),
                 errors::InvalidArgument(
                     "shape must be a vector of {int32,int64}, got shape: ",
                     shape_t.DebugString()));
@@ -325,7 +326,7 @@ class RandomGammaOp : public OpKernel {
     // avoid a couple flops which can be done on a per-alpha basis.
 
     auto DoWork = [num_samples, num_alphas, &rng, samples_flat, alpha_flat](
-        int start_output, int limit_output) {
+                      int start_output, int limit_output) {
       using Eigen::numext::exp;
       using Eigen::numext::log;
       using Eigen::numext::pow;
@@ -448,40 +449,40 @@ class RandomGammaOp : public OpKernel {
 
 }  // namespace
 
-#define REGISTER(TYPE)                                                      \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice, random::UniformDistribution<random::PhiloxRandom, TYPE> >; \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice, random::NormalDistribution<random::PhiloxRandom, TYPE> >;  \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice,                                                            \
-      random::TruncatedNormalDistribution<                                  \
-          random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >;       \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomUniform")                                                 \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<CPUDevice, random::UniformDistribution<                \
-                                    random::PhiloxRandom, TYPE> >);         \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomStandardNormal")                                          \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<CPUDevice, random::NormalDistribution<                 \
-                                    random::PhiloxRandom, TYPE> >);         \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("TruncatedNormal")                                               \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<                                                       \
-          CPUDevice,                                                        \
-          random::TruncatedNormalDistribution<                              \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);  \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),     \
+#define REGISTER(TYPE)                                                         \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice, random::UniformDistribution<random::PhiloxRandom, TYPE>>;     \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice, random::NormalDistribution<random::PhiloxRandom, TYPE>>;      \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice,                                                               \
+      random::TruncatedNormalDistribution<                                     \
+          random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>;           \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<CPUDevice, random::UniformDistribution<                   \
+                                    random::PhiloxRandom, TYPE>>);             \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<CPUDevice,                                                \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          CPUDevice,                                                           \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);      \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),        \
       RandomGammaOp<TYPE>)
 
 #define REGISTER_INT(IntType)                                   \
@@ -504,33 +505,33 @@ TF_CALL_int64(REGISTER_INT);
 
 #if GOOGLE_CUDA
 
-#define REGISTER(TYPE)                                              \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("RandomUniform")                                         \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<GPUDevice, random::UniformDistribution<        \
-                                    random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("RandomStandardNormal")                                  \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<GPUDevice, random::NormalDistribution<         \
-                                    random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("TruncatedNormal")                                       \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<                                               \
-          GPUDevice,                                                \
-          random::TruncatedNormalDistribution<                      \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+#define REGISTER(TYPE)                                                         \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<GPUDevice, random::UniformDistribution<                   \
+                                    random::PhiloxRandom, TYPE>>);             \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<GPUDevice,                                                \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          GPUDevice,                                                           \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);
 
 #define REGISTER_INT(IntType)                                   \
   REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
@@ -565,13 +566,12 @@ struct FillPhiloxRandomKernel;
 template <class Distribution>
 struct FillPhiloxRandomKernel<Distribution, false> {
   typedef typename Distribution::ResultElementType T;
-  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write, sycl::access::target::global_buffer>;
+  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write,
+                                        sycl::access::target::global_buffer>;
 
-  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist)
-      : data_(data),
-        gen_(gen),
-        dist_(dist) {
-  }
+  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen,
+                         Distribution& dist)
+      : data_(data), gen_(gen), dist_(dist) {}
 
   void operator()(sycl::nd_item<1> item) {
     const size_t kGroupSize = Distribution::kResultElementCount;
@@ -597,7 +597,7 @@ struct FillPhiloxRandomKernel<Distribution, false> {
     const typename Distribution::ResultType samples = dist_(&gen_);
     for (size_t i = 0; i < kGroupSize; ++i) {
       if (offset >= size) {
-          return;
+        return;
       }
       data[offset] = samples[i];
       ++offset;
@@ -610,17 +610,15 @@ struct FillPhiloxRandomKernel<Distribution, false> {
   Distribution dist_;
 };
 
-
 template <class Distribution>
 struct FillPhiloxRandomKernel<Distribution, true> {
   typedef typename Distribution::ResultElementType T;
-  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write, sycl::access::target::global_buffer>;
+  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write,
+                                        sycl::access::target::global_buffer>;
 
-  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist)
-      : data_(data),
-        gen_(gen),
-        dist_(dist) {
-  }
+  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen,
+                         Distribution& dist)
+      : data_(data), gen_(gen), dist_(dist) {}
 
   void operator()(sycl::nd_item<1> item) {
     using random::PhiloxRandom;
@@ -628,9 +626,9 @@ struct FillPhiloxRandomKernel<Distribution, true> {
 
     const size_t kReservedSamplesPerOutput = 256;
     const size_t kGroupSize = Distribution::kResultElementCount;
-    const size_t kGeneratorSkipPerOutputGroup = kGroupSize *
-                                                kReservedSamplesPerOutput /
-                                                PhiloxRandom::kResultElementCount;
+    const size_t kGeneratorSkipPerOutputGroup =
+        kGroupSize * kReservedSamplesPerOutput /
+        PhiloxRandom::kResultElementCount;
 
     const size_t item_id = item.get_global(0);
     const size_t total_item_count = item.get_global_range();
@@ -674,10 +672,9 @@ class FillRandomKernel;
 // It splits the work into several tasks and run them in parallel
 template <class Distribution>
 void FillPhiloxRandom<SYCLDevice, Distribution>::operator()(
-    OpKernelContext* context, const SYCLDevice& device, random::PhiloxRandom gen,
-    typename Distribution::ResultElementType* data, int64 size,
-    Distribution dist) {
-
+    OpKernelContext* context, const SYCLDevice& device,
+    random::PhiloxRandom gen, typename Distribution::ResultElementType* data,
+    int64 size, Distribution dist) {
   const size_t group_size = device.maxSyclThreadsPerBlock();
   const size_t group_count = (size + group_size - 1) / group_size;
 
@@ -686,50 +683,52 @@ void FillPhiloxRandom<SYCLDevice, Distribution>::operator()(
   device.sycl_queue().submit([&](sycl::handler& cgh) {
     auto access = buffer.template get_access<sycl::access::mode::write>(cgh);
 
-    FillPhiloxRandomKernel<Distribution, Distribution::kVariableSamplesPerOutput> task(access, gen, dist);
+    FillPhiloxRandomKernel<Distribution,
+                           Distribution::kVariableSamplesPerOutput>
+        task(access, gen, dist);
     cgh.parallel_for<class FillRandomKernel<Distribution>>(
-      sycl::nd_range<1>(sycl::range<1>(group_count * group_size), sycl::range<1>(group_size)),
-      task
-    );
+        sycl::nd_range<1>(sycl::range<1>(group_count * group_size),
+                          sycl::range<1>(group_size)),
+        task);
   });
 }
 
-}
+}  // namespace functor
+
+#define REGISTER(TYPE)                                                         \
+  template struct functor::FillPhiloxRandom<                                   \
+      SYCLDevice, random::UniformDistribution<random::PhiloxRandom, TYPE>>;    \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<SYCLDevice, random::UniformDistribution<                  \
+                                     random::PhiloxRandom, TYPE>>);            \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<SYCLDevice,                                               \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          SYCLDevice,                                                          \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);
 
-#define REGISTER(TYPE)                                                       \
-  template struct functor::FillPhiloxRandom<                                 \
-      SYCLDevice, random::UniformDistribution<random::PhiloxRandom, TYPE> >; \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RandomUniform")                                                  \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<SYCLDevice, random::UniformDistribution<                \
-                                    random::PhiloxRandom, TYPE> >);          \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RandomStandardNormal")                                           \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<SYCLDevice, random::NormalDistribution<                 \
-                                    random::PhiloxRandom, TYPE> >);          \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("TruncatedNormal")                                                \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<                                                        \
-          SYCLDevice,                                                        \
-          random::TruncatedNormalDistribution<                               \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
-
-#define REGISTER_INT(IntType)                                    \
-  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")               \
-                              .Device(DEVICE_SYCL)               \
-                              .HostMemory("shape")               \
-                              .HostMemory("minval")              \
-                              .HostMemory("maxval")              \
-                              .TypeConstraint<IntType>("Tout"),  \
+#define REGISTER_INT(IntType)                                   \
+  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
+                              .Device(DEVICE_SYCL)              \
+                              .HostMemory("shape")              \
+                              .HostMemory("minval")             \
+                              .HostMemory("maxval")             \
+                              .TypeConstraint<IntType>("Tout"), \
                           RandomUniformIntOp<SYCLDevice, IntType>);
 
 TF_CALL_float(REGISTER);
@@ -740,6 +739,6 @@ TF_CALL_int64(REGISTER_INT);
 #undef REGISTER
 #undef REGISTER_INT
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
index 7afa6974c6..3393b39faf 100644
--- a/tensorflow/core/kernels/random_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -222,9 +222,8 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
       (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
       block_size;
 
-  FillPhiloxRandomKernelLaunch<
-      Distribution><<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size,
-                                                               dist);
+  FillPhiloxRandomKernelLaunch<Distribution>
+      <<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size, dist);
 };
 
 // Explicit instantiation of the GPU distributions functors
diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc
index bf1d83ec75..64fb4a5c22 100644
--- a/tensorflow/core/kernels/random_poisson_op.cc
+++ b/tensorflow/core/kernels/random_poisson_op.cc
@@ -103,7 +103,7 @@ struct PoissonFunctor<CPUDevice, T, U> {
     typedef random::UniformDistribution<random::PhiloxRandom, CT> Uniform;
 
     auto DoWork = [num_samples, num_rate, &rng, samples_flat, rate_flat](
-        int start_output, int limit_output) {
+                      int start_output, int limit_output) {
       // Capturing "rng" by value would only make a copy for the _shared_
       // lambda.  Since we want to let each worker have its own copy, we pass
       // "rng" by reference and explicitly do a copy assignment.
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index e9695cfde3..87fc943331 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -334,96 +334,95 @@ void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
       // TODO(josh11b): This makes two copies of callback, avoid this if possible.
       dequeue_attempts_.emplace_back(
           num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
-          [callback, allow_small_batch, this](Attempt* attempt)
-              EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                int32 queue_size = queues_[0].size();
-                if (closed_ && queue_size < attempt->elements_requested) {
-                  // If we don't have enough for a full dequeue, we have
-                  // to reset the attempt tuple.
-                  if (!attempt->tuple.empty()) {
-                    // Restore already-dequeued elements to the queue.
-                    for (int64 i = attempt->tuple[0].dim_size(0) -
-                                   attempt->elements_requested - 1;
-                         i >= 0; --i) {
-                      for (int j = 0; j < num_components(); ++j) {
-                        PersistentTensor element;
-                        Status s = GetElementComponentFromBatch(
-                            attempt->tuple, i, j, attempt->context, &element);
-                        if (!s.ok()) {
-                          attempt->context->SetStatus(
-                              errors::DataLoss("Failed to restore element from "
-                                               "partially-dequeued batch "
-                                               "to RandomShuffleQueue: ",
-                                               s.error_message()));
-                        }
-                        queues_[j].push_back(element);
-                      }
-                    }
-                  }
-                  if (allow_small_batch && !queues_[0].empty()) {
-                    // Request all remaining elements in the queue.
-                    queue_size = queues_[0].size();
-                    attempt->tuple.clear();
-                    attempt->elements_requested = queue_size;
-                  } else {
-                    if (allow_small_batch) {
-                      // There may be some other attempts containing
-                      // values.  If so, we'll yield and wait for them
-                      // to add elements to the queue.
-                      if (!enqueue_attempts_.empty()) return kProgress;
-                    }
-                    if (attempt->context->status().ok()) {
-                      attempt->context->SetStatus(errors::OutOfRange(
-                          "RandomShuffleQueue '", name_, "' is closed and has ",
-                          "insufficient elements (requested ",
-                          attempt->elements_requested, ", current size ",
-                          queue_size, ")"));
+          [callback, allow_small_batch,
+           this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+            int32 queue_size = queues_[0].size();
+            if (closed_ && queue_size < attempt->elements_requested) {
+              // If we don't have enough for a full dequeue, we have
+              // to reset the attempt tuple.
+              if (!attempt->tuple.empty()) {
+                // Restore already-dequeued elements to the queue.
+                for (int64 i = attempt->tuple[0].dim_size(0) -
+                               attempt->elements_requested - 1;
+                     i >= 0; --i) {
+                  for (int j = 0; j < num_components(); ++j) {
+                    PersistentTensor element;
+                    Status s = GetElementComponentFromBatch(
+                        attempt->tuple, i, j, attempt->context, &element);
+                    if (!s.ok()) {
+                      attempt->context->SetStatus(
+                          errors::DataLoss("Failed to restore element from "
+                                           "partially-dequeued batch "
+                                           "to RandomShuffleQueue: ",
+                                           s.error_message()));
                     }
-                    return kComplete;
+                    queues_[j].push_back(element);
                   }
                 }
+              }
+              if (allow_small_batch && !queues_[0].empty()) {
+                // Request all remaining elements in the queue.
+                queue_size = queues_[0].size();
+                attempt->tuple.clear();
+                attempt->elements_requested = queue_size;
+              } else {
+                if (allow_small_batch) {
+                  // There may be some other attempts containing
+                  // values.  If so, we'll yield and wait for them
+                  // to add elements to the queue.
+                  if (!enqueue_attempts_.empty()) return kProgress;
+                }
+                if (attempt->context->status().ok()) {
+                  attempt->context->SetStatus(errors::OutOfRange(
+                      "RandomShuffleQueue '", name_, "' is closed and has ",
+                      "insufficient elements (requested ",
+                      attempt->elements_requested, ", current size ",
+                      queue_size, ")"));
+                }
+                return kComplete;
+              }
+            }
 
-                RunResult result = kNoProgress;
-                if (!closed_) queue_size -= min_after_dequeue_;
-                for (; queue_size > 0; --queue_size) {
-                  if (attempt->tuple.empty()) {
-                    // Only allocate tuple when we have something to dequeue
-                    // so we don't use excessive memory when there are many
-                    // blocked dequeue attempts waiting.
-                    attempt->tuple.reserve(num_components());
-                    for (int i = 0; i < num_components(); ++i) {
-                      const TensorShape shape =
-                          ManyOutShape(i, attempt->elements_requested);
-                      Tensor element;
-                      attempt->context->SetStatus(
-                          attempt->context->allocate_temp(component_dtypes_[i],
-                                                          shape, &element));
-                      if (!attempt->context->status().ok()) return kComplete;
-                      attempt->tuple.emplace_back(element);
-                    }
-                  }
-                  result = kProgress;
-                  Tuple tuple;
-                  DequeueLocked(attempt->context, &tuple);
-                  const int index = attempt->tuple[0].dim_size(0) -
-                                    attempt->elements_requested;
-                  for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
-                        std::move(tuple[i]), &attempt->tuple[i], index));
-                    if (!attempt->context->status().ok()) return kComplete;
-                  }
-                  tuple.clear();
-                  --attempt->elements_requested;
-                  if (attempt->elements_requested == 0) {
-                    tuple = attempt->tuple;
-                    attempt->done_callback = [callback, tuple]() {
-                      callback(tuple);
-                    };
-                    return kComplete;
-                  }
+            RunResult result = kNoProgress;
+            if (!closed_) queue_size -= min_after_dequeue_;
+            for (; queue_size > 0; --queue_size) {
+              if (attempt->tuple.empty()) {
+                // Only allocate tuple when we have something to dequeue
+                // so we don't use excessive memory when there are many
+                // blocked dequeue attempts waiting.
+                attempt->tuple.reserve(num_components());
+                for (int i = 0; i < num_components(); ++i) {
+                  const TensorShape shape =
+                      ManyOutShape(i, attempt->elements_requested);
+                  Tensor element;
+                  attempt->context->SetStatus(attempt->context->allocate_temp(
+                      component_dtypes_[i], shape, &element));
+                  if (!attempt->context->status().ok()) return kComplete;
+                  attempt->tuple.emplace_back(element);
                 }
-                return result;
-              });
+              }
+              result = kProgress;
+              Tuple tuple;
+              DequeueLocked(attempt->context, &tuple);
+              const int index =
+                  attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+              for (int i = 0; i < num_components(); ++i) {
+                attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                    std::move(tuple[i]), &attempt->tuple[i], index));
+                if (!attempt->context->status().ok()) return kComplete;
+              }
+              tuple.clear();
+              --attempt->elements_requested;
+              if (attempt->elements_requested == 0) {
+                tuple = attempt->tuple;
+                attempt->done_callback = [callback, tuple]() {
+                  callback(tuple);
+                };
+                return kComplete;
+              }
+            }
+            return result;
+          });
     }
   }
   if (!already_cancelled) {
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 36ca7f834f..15ae4c1fc5 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -312,8 +312,7 @@ __global__ void ColumnReduceKernel(
   int col = blockIdx.x * 32 + threadIdx.x;
 
   value_type sum = initVal;
-  if (row < num_rows && col < num_cols)
-    sum = in[row * num_cols + col];
+  if (row < num_rows && col < num_cols) sum = in[row * num_cols + col];
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
@@ -366,8 +365,7 @@ __global__ void CleanupSegments(
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
 
   value_type val = initVal;
-  if (tid < segment_size * num_cols)
-    val = partial_sums[tid];
+  if (tid < segment_size * num_cols) val = partial_sums[tid];
 
   typedef cub::WarpReduce<value_type> WarpReduce;
 
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
index afad288cc0..d52358737f 100644
--- a/tensorflow/core/kernels/relu_op.cc
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -31,7 +31,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_RELU_KERNELS(type)                                   \
   REGISTER_KERNEL_BUILDER(                                            \
@@ -113,8 +113,7 @@ namespace functor {
                                                                                \
   template <>                                                                  \
   void Selu<GPUDevice, T>::operator()(                                         \
-      const GPUDevice& d,                                                      \
-      typename TTypes<T>::ConstTensor features,                                \
+      const GPUDevice& d, typename TTypes<T>::ConstTensor features,            \
       typename TTypes<T>::Tensor activations);                                 \
   extern template struct Selu<GPUDevice, T>;                                   \
                                                                                \
@@ -125,8 +124,6 @@ namespace functor {
       typename TTypes<T>::Tensor backprops);                                   \
   extern template struct SeluGrad<GPUDevice, T>;
 
-
-
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 }  // namespace functor
 
@@ -157,8 +154,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
       Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),  \
       SeluGradOp<GPUDevice, type>)
 
-
-
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 
@@ -192,10 +187,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
       Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
       SeluGradOp<SYCLDevice, type>)
 
-
-
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/relu_op_functor.h b/tensorflow/core/kernels/relu_op_functor.h
index 24b789c543..3bc5ba8a50 100644
--- a/tensorflow/core/kernels/relu_op_functor.h
+++ b/tensorflow/core/kernels/relu_op_functor.h
@@ -85,10 +85,9 @@ struct Relu6Grad {
     // make sure not to propagate the associated gradient
     // value. This allows "features" to be either the input or the output of
     // the relu6.
-    backprops.device(d) =
-        gradients *
-        ((features > static_cast<T>(0)) * (features < static_cast<T>(6)))
-            .template cast<T>();
+    backprops.device(d) = gradients * ((features > static_cast<T>(0)) *
+                                       (features < static_cast<T>(6)))
+                                          .template cast<T>();
   }
 };
 
@@ -161,8 +160,8 @@ struct SeluGrad {
     const auto scale = static_cast<T>(1.0507009873554804934193349852946);
     const auto scale_alpha = static_cast<T>(1.7580993408473768599402175208123);
     backprops.device(d) =
-        (activations < static_cast<T>(0)).select(
-            gradients * (activations + scale_alpha), gradients * scale);
+        (activations < static_cast<T>(0))
+            .select(gradients * (activations + scale_alpha), gradients * scale);
   }
 };
 
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
index 1a9cf4c640..86e61bbcef 100644
--- a/tensorflow/core/kernels/resize_bicubic_op.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <algorithm>
 #include <array>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/image_resizer_state.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc
index 9e10fec423..25a37d5e1a 100644
--- a/tensorflow/core/kernels/resize_bicubic_op_test.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc
@@ -286,13 +286,14 @@ BM_ResizeBicubicDev(32, 128, 3);
 BM_ResizeBicubicDev(32, 512, 3);
 BM_ResizeBicubicDev(32, 1024, 3);
 
-#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS)                          \
-  static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(int iters) { \
-    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * SIZE * SIZE *  \
-                            CHANNELS * 8 * 8);                                 \
-    test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8))         \
-        .Run(iters);                                                           \
-  }                                                                            \
+#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS)                         \
+  static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(         \
+      int iters) {                                                            \
+    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * SIZE * SIZE * \
+                            CHANNELS * 8 * 8);                                \
+    test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8))        \
+        .Run(iters);                                                          \
+  }                                                                           \
   BENCHMARK(BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS);
 
 BM_ResizeBicubicExpand(12, 48, 1);
diff --git a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
index a7da7a0777..f82c3fcd9f 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
@@ -164,11 +164,11 @@ struct ResizeBilinear<GPUDevice, T> {
     if (total_count == 0) return;
 
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
-    ResizeBilinearKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, images.data(), height_scale, width_scale,
-        batch, in_height, in_width, channels, out_height, out_width,
-        output.data());
+    ResizeBilinearKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, images.data(), height_scale,
+            width_scale, batch, in_height, in_width, channels, out_height,
+            out_width, output.data());
   }
 };
 
@@ -200,11 +200,11 @@ struct ResizeBilinearGrad<GPUDevice, T> {
     // Accumulate.
     total_count = batch * resized_height * resized_width * channels;
     config = GetCudaLaunchConfig(total_count, d);
-    ResizeBilinearGradKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input_grad.data(), height_scale,
-        width_scale, batch, original_height, original_width, channels,
-        resized_height, resized_width, output_grad.data());
+    ResizeBilinearGradKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input_grad.data(), height_scale,
+            width_scale, batch, original_height, original_width, channels,
+            resized_height, resized_width, output_grad.data());
   }
 };
 
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 8f82784d93..bb96c42f10 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -269,10 +269,10 @@ class ReverseV2Op : public OpKernel {
       OP_REQUIRES_OK(context,
                      context->allocate_output(0, input.shape(), &output));
 
-// TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse of
-// a single dimension to the dims=3 or dims=2 case, regardless of the number
-// of dimensions in the tensor. This would let some ops use faster
-// lower-dimension code (and use optimized versions).
+      // TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse
+      // of a single dimension to the dims=3 or dims=2 case, regardless of the
+      // number of dimensions in the tensor. This would let some ops use faster
+      // lower-dimension code (and use optimized versions).
 
 #define HANDLE_REVERSE(NDIMS)                                           \
   case NDIMS:                                                           \
diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
index b05a7c5550..3ee49db669 100644
--- a/tensorflow/core/kernels/reverse_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
@@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice;
 #define DEFINE_REVERSE(T, DIM) \
   template struct functor::Reverse<GPUDevice, T, DIM>;
 #define DEFINE_REVERSE_ALL_DIMS(T) \
-  DEFINE_REVERSE(T, 0) \
-  DEFINE_REVERSE(T, 1) \
-  DEFINE_REVERSE(T, 2) \
-  DEFINE_REVERSE(T, 3) \
-  DEFINE_REVERSE(T, 4) \
-  DEFINE_REVERSE(T, 5) \
-  DEFINE_REVERSE(T, 6) \
-  DEFINE_REVERSE(T, 7) \
+  DEFINE_REVERSE(T, 0)             \
+  DEFINE_REVERSE(T, 1)             \
+  DEFINE_REVERSE(T, 2)             \
+  DEFINE_REVERSE(T, 3)             \
+  DEFINE_REVERSE(T, 4)             \
+  DEFINE_REVERSE(T, 5)             \
+  DEFINE_REVERSE(T, 6)             \
+  DEFINE_REVERSE(T, 7)             \
   DEFINE_REVERSE(T, 8)
 
 TF_CALL_uint8(DEFINE_REVERSE_ALL_DIMS);
diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc
index d1980d4b65..15a707a9c6 100644
--- a/tensorflow/core/kernels/reverse_sequence_op.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op.cc
@@ -51,8 +51,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) {
 
   // Copy seq_len info down for validity checks
   context->eigen_device<Device>().memcpyDeviceToHost(
-      seq_lens_vec.data(), seq_lens_t.data(),
-      sizeof(Tlen) * seq_lens_t.size());
+      seq_lens_vec.data(), seq_lens_t.data(), sizeof(Tlen) * seq_lens_t.size());
 
   OP_REQUIRES(context, batch_dim != seq_dim,
               errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim));
@@ -76,8 +75,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) {
   }
 }
 
-void CheckErrorsGPU(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) {
   const Tensor& input = context->input(0);
   const Tensor& seq_lens = context->input(1);
 
@@ -98,13 +96,13 @@ void CheckErrorsGPU(OpKernelContext* context, int batch_dim,
 
 template <>
 void CheckErrors<GPUDevice, int32>(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+                                   int seq_dim) {
   CheckErrorsGPU(context, batch_dim, seq_dim);
 }
 
 template <>
 void CheckErrors<GPUDevice, int64>(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+                                   int seq_dim) {
   CheckErrorsGPU(context, batch_dim, seq_dim);
 }
 
@@ -164,14 +162,15 @@ class ReverseSequenceOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp);
 };
 
-#define REGISTER_REVERSE_SEQUENCE(type, len_type)                           \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint<type>("T"). \
-      TypeConstraint<len_type>("Tlen"),                                     \
-      ReverseSequenceOp<CPUDevice, type, len_type>);
+#define REGISTER_REVERSE_SEQUENCE(type, len_type)                \
+  REGISTER_KERNEL_BUILDER(Name("ReverseSequence")                \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<len_type>("Tlen"), \
+                          ReverseSequenceOp<CPUDevice, type, len_type>);
 
-#define REGISTER_REVERSE_SEQUENCE_LEN(type)    \
-  REGISTER_REVERSE_SEQUENCE(type, int32);      \
+#define REGISTER_REVERSE_SEQUENCE_LEN(type) \
+  REGISTER_REVERSE_SEQUENCE(type, int32);   \
   REGISTER_REVERSE_SEQUENCE(type, int64);
 
 TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_LEN);
@@ -181,23 +180,23 @@ TF_CALL_bool(REGISTER_REVERSE_SEQUENCE_LEN);
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T, Tlen, Dims)                                     \
-  template <>                                                               \
-  void ReverseSequence<GPUDevice, T, Tlen, Dims>::Compute(                  \
-      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,      \
-      int32 batch_dim, int32 seq_dim,                                       \
-      typename TTypes<Tlen>::ConstVec seq_lens,                             \
-      typename TTypes<T, Dims>::Tensor output);                             \
+#define DECLARE_GPU_SPEC(T, Tlen, Dims)                                \
+  template <>                                                          \
+  void ReverseSequence<GPUDevice, T, Tlen, Dims>::Compute(             \
+      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+      int32 batch_dim, int32 seq_dim,                                  \
+      typename TTypes<Tlen>::ConstVec seq_lens,                        \
+      typename TTypes<T, Dims>::Tensor output);                        \
   extern template struct ReverseSequence<GPUDevice, T, Tlen, Dims>;
 
-#define DECLARE_GPU_SPEC_LEN(T, Dims)    \
-  DECLARE_GPU_SPEC(T, int32, Dims);      \
+#define DECLARE_GPU_SPEC_LEN(T, Dims) \
+  DECLARE_GPU_SPEC(T, int32, Dims);   \
   DECLARE_GPU_SPEC(T, int64, Dims);
 
-#define DECLARE_GPU_SPECS(T)     \
-  DECLARE_GPU_SPEC_LEN(T, 2);    \
-  DECLARE_GPU_SPEC_LEN(T, 3);    \
-  DECLARE_GPU_SPEC_LEN(T, 4);    \
+#define DECLARE_GPU_SPECS(T)  \
+  DECLARE_GPU_SPEC_LEN(T, 2); \
+  DECLARE_GPU_SPEC_LEN(T, 3); \
+  DECLARE_GPU_SPEC_LEN(T, 4); \
   DECLARE_GPU_SPEC_LEN(T, 5);
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
@@ -206,14 +205,15 @@ TF_CALL_bool(DECLARE_GPU_SPECS);
 }  // namespace functor
 
 // Registration of the GPU implementations.
-#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type)                       \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint<type>("T"). \
-      TypeConstraint<len_type>("Tlen"),                                     \
-      ReverseSequenceOp<GPUDevice, type, len_type>);
-
-#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type)   \
-  REGISTER_REVERSE_SEQUENCE_GPU(type, int32);     \
+#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type)            \
+  REGISTER_KERNEL_BUILDER(Name("ReverseSequence")                \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<len_type>("Tlen"), \
+                          ReverseSequenceOp<GPUDevice, type, len_type>);
+
+#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type) \
+  REGISTER_REVERSE_SEQUENCE_GPU(type, int32);   \
   REGISTER_REVERSE_SEQUENCE_GPU(type, int64);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU_LEN);
diff --git a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
index cb49f14525..4a2136a2cd 100644
--- a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
@@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice;
   template class generator::ReverseGenerator<T, Tlen, dims>; \
   template struct functor::ReverseSequence<GPUDevice, T, Tlen, dims>;
 
-#define DEFINE_GPU_SPEC_LEN(T, dims)  \
-  DEFINE_GPU_SPEC(T, int32, dims);    \
+#define DEFINE_GPU_SPEC_LEN(T, dims) \
+  DEFINE_GPU_SPEC(T, int32, dims);   \
   DEFINE_GPU_SPEC(T, int64, dims);
 
-#define DEFINE_GPU_SPECS(T) \
-  DEFINE_GPU_SPEC_LEN(T, 2);    \
-  DEFINE_GPU_SPEC_LEN(T, 3);    \
-  DEFINE_GPU_SPEC_LEN(T, 4);    \
+#define DEFINE_GPU_SPECS(T)  \
+  DEFINE_GPU_SPEC_LEN(T, 2); \
+  DEFINE_GPU_SPEC_LEN(T, 3); \
+  DEFINE_GPU_SPEC_LEN(T, 4); \
   DEFINE_GPU_SPEC_LEN(T, 5);
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index df60eda759..990bd2bff9 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -106,11 +106,11 @@ void SaveTensors(
       OP_REQUIRES_OK(context, checkpoint::ParseShapeAndSlice(
                                   shape_spec, &shape, &slice, &slice_shape));
       OP_REQUIRES(context, slice_shape.IsSameSize(input.shape()),
-                  errors::InvalidArgument("Slice in shape_and_slice "
-                                          "specification does not match the "
-                                          "shape of the tensor to  save: ",
-                                          shape_spec, ", tensor: ",
-                                          input.shape().DebugString()));
+                  errors::InvalidArgument(
+                      "Slice in shape_and_slice "
+                      "specification does not match the "
+                      "shape of the tensor to  save: ",
+                      shape_spec, ", tensor: ", input.shape().DebugString()));
     }
 
 #define WRITER_ADD(T)                                           \
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index c6e35fe329..079f15e101 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -29,7 +29,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace scatter_op {
 
@@ -117,7 +117,7 @@ struct AssignSYCL<scatter_op::UpdateOp::DIV> {
     p.device(d) = p / u;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace internal
 }  // namespace scatter_op
@@ -156,7 +156,7 @@ struct ScatterFunctorBase {
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
-struct ScatterFunctorBase <SYCLDevice, T, Index, op> {
+struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
   Index operator()(OpKernelContext* c, const SYCLDevice& d,
                    typename TTypes<T>::Matrix params,
                    typename TTypes<T>::ConstMatrix updates,
@@ -171,13 +171,13 @@ struct ScatterFunctorBase <SYCLDevice, T, Index, op> {
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
-      scatter_op::internal::AssignSYCL<op>::Run(d, params.template chip<0>(index),
-                                            updates.template chip<0>(i));
+      scatter_op::internal::AssignSYCL<op>::Run(
+          d, params.template chip<0>(index), updates.template chip<0>(i));
     }
     return -1;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename T, typename Index>
 struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
@@ -217,7 +217,7 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
 
 template <typename T, typename Index, scatter_op::UpdateOp op>
 struct ScatterFunctor<CPUDevice, T, Index, op>
-        : ScatterFunctorBase<CPUDevice, T, Index, op>{};
+    : ScatterFunctorBase<CPUDevice, T, Index, op> {};
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
@@ -239,7 +239,7 @@ struct ScatterFunctorSYCL {
     return -1;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
index e116077d3c..be18658543 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
@@ -30,9 +30,10 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T, typename Index, scatter_op::UpdateOp op>
-__global__ void ScatterOpCustomKernel(
-    T* params, const T* updates, const Index* indices,
-    Index first_dim_size, Index updates_size, Index indices_size) {
+__global__ void ScatterOpCustomKernel(T* params, const T* updates,
+                                      const Index* indices,
+                                      Index first_dim_size, Index updates_size,
+                                      Index indices_size) {
   Index update_block = updates_size / indices_size;
   CUDA_1D_KERNEL_LOOP(i, updates_size) {
     int indices_i = i / update_block;
@@ -85,8 +86,8 @@ struct ScatterFunctor<GPUDevice, T, Index, op> {
     CudaLaunchConfig config = GetCudaLaunchConfig(updates_size, d);
     ScatterOpCustomKernel<T, Index, op>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            params.data(), updates.data(), indices.data(),
-            first_dim_size, updates_size, indices_size);
+            params.data(), updates.data(), indices.data(), first_dim_size,
+            updates_size, indices_size);
     return -1;
   }
 };
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index c6c9d4e658..e82660dcc1 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -40,7 +40,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class OpKernelContext;
 
@@ -251,7 +251,7 @@ REGISTER_SCATTER_ND_MATH_SYCL(int32);
 #undef REGISTER_SCATTER_ND_INDEX_SYCL
 #undef REGISTER_SCATTER_ND_FULL_SYCL
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
index 8607c7f95a..282165349f 100644
--- a/tensorflow/core/kernels/scatter_op.cc
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -25,7 +25,7 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_SYCL
 #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace tensorflow {
 
@@ -33,7 +33,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Check whether updates.shape = indices.shape + params.shape[1:]
 static bool ValidShapes(const Tensor& params, const Tensor& updates,
@@ -102,11 +102,12 @@ class ScatterUpdateOp : public OpKernel {
 
     // Check that we have enough index space
     const int64 N_big = indices.NumElements();
-    OP_REQUIRES(c, N_big <= std::numeric_limits<Index>::max(),
-                errors::InvalidArgument(
-                    "indices has too many elements for ",
-                    DataTypeString(DataTypeToEnum<Index>::v()), " indexing: ",
-                    N_big, " > ", std::numeric_limits<Index>::max()));
+    OP_REQUIRES(
+        c, N_big <= std::numeric_limits<Index>::max(),
+        errors::InvalidArgument("indices has too many elements for ",
+                                DataTypeString(DataTypeToEnum<Index>::v()),
+                                " indexing: ", N_big, " > ",
+                                std::numeric_limits<Index>::max()));
     const Index N = static_cast<Index>(indices.NumElements());
     OP_REQUIRES(
         c, params.dim_size(0) <= std::numeric_limits<Index>::max(),
@@ -137,7 +138,7 @@ class ScatterUpdateOp : public OpKernel {
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
-class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
+class ScatterUpdateOp<SYCLDevice, T, Index, op> : public OpKernel {
  public:
   explicit ScatterUpdateOp(OpKernelConstruction* c) : OpKernel(c) {
     OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_));
@@ -165,11 +166,12 @@ class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
 
     // Check that we have enough index space
     const int64 N_big = indices.NumElements();
-    OP_REQUIRES(c, N_big <= std::numeric_limits<Index>::max(),
-                errors::InvalidArgument(
-                    "indices has too many elements for ",
-                    DataTypeString(DataTypeToEnum<Index>::v()), " indexing: ",
-                    N_big, " > ", std::numeric_limits<Index>::max()));
+    OP_REQUIRES(
+        c, N_big <= std::numeric_limits<Index>::max(),
+        errors::InvalidArgument("indices has too many elements for ",
+                                DataTypeString(DataTypeToEnum<Index>::v()),
+                                " indexing: ", N_big, " > ",
+                                std::numeric_limits<Index>::max()));
     const Index N = static_cast<Index>(indices.NumElements());
     OP_REQUIRES(
         c, params.dim_size(0) <= std::numeric_limits<Index>::max(),
@@ -206,7 +208,7 @@ class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
     }
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_SCATTER_KERNEL_INDEX(type, index_type, dev, name, op) \
   REGISTER_KERNEL_BUILDER(Name(name)                                   \
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index 863c123b43..066a4b80a2 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -37,9 +37,8 @@ void FeatureWeightsDenseStorage::UpdateDenseDeltaWeights(
   const size_t num_weight_vectors = normalized_bounded_dual_delta.size();
   if (num_weight_vectors == 1) {
     deltas_.device(device) =
-        deltas_ +
-        dense_vector.RowAsMatrix() *
-            deltas_.constant(normalized_bounded_dual_delta[0]);
+        deltas_ + dense_vector.RowAsMatrix() *
+                      deltas_.constant(normalized_bounded_dual_delta[0]);
   } else {
     // Transform the dual vector into a column matrix.
     const Eigen::TensorMap<Eigen::Tensor<const double, 2, Eigen::RowMajor>>
@@ -61,9 +60,8 @@ void FeatureWeightsSparseStorage::UpdateSparseDeltaWeights(
     const Example::SparseFeatures& sparse_features,
     const std::vector<double>& normalized_bounded_dual_delta) {
   for (int64 k = 0; k < sparse_features.indices->size(); ++k) {
-    const double feature_value = sparse_features.values == nullptr
-                                     ? 1.0
-                                     : (*sparse_features.values)(k);
+    const double feature_value =
+        sparse_features.values == nullptr ? 1.0 : (*sparse_features.values)(k);
     auto it = indices_to_id_.find((*sparse_features.indices)(k));
     for (size_t l = 0; l < normalized_bounded_dual_delta.size(); ++l) {
       deltas_(l, it->second) +=
@@ -122,23 +120,24 @@ Status ModelWeights::Initialize(OpKernelContext* const context) {
   }
 
   // Reads in the weights, and allocates and initializes the delta weights.
-  const auto initialize_weights = [&](
-      const OpInputList& weight_inputs, OpOutputList* const weight_outputs,
-      std::vector<FeatureWeightsDenseStorage>* const feature_weights) {
-    for (int i = 0; i < weight_inputs.size(); ++i) {
-      Tensor* delta_t;
-      TF_RETURN_IF_ERROR(
-          weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t));
-      // Convert the input vector to a row matrix in internal representation.
-      auto deltas = delta_t->shaped<float, 2>({1, delta_t->NumElements()});
-      deltas.setZero();
-      feature_weights->emplace_back(
-          FeatureWeightsDenseStorage{weight_inputs[i].shaped<float, 2>(
-                                         {1, weight_inputs[i].NumElements()}),
-                                     deltas});
-    }
-    return Status::OK();
-  };
+  const auto initialize_weights =
+      [&](const OpInputList& weight_inputs, OpOutputList* const weight_outputs,
+          std::vector<FeatureWeightsDenseStorage>* const feature_weights) {
+        for (int i = 0; i < weight_inputs.size(); ++i) {
+          Tensor* delta_t;
+          TF_RETURN_IF_ERROR(
+              weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t));
+          // Convert the input vector to a row matrix in internal
+          // representation.
+          auto deltas = delta_t->shaped<float, 2>({1, delta_t->NumElements()});
+          deltas.setZero();
+          feature_weights->emplace_back(FeatureWeightsDenseStorage{
+              weight_inputs[i].shaped<float, 2>(
+                  {1, weight_inputs[i].NumElements()}),
+              deltas});
+        }
+        return Status::OK();
+      };
 
   return initialize_weights(dense_weights_inputs, &dense_weights_outputs,
                             &dense_weights_);
diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h
index 9f07270075..45915693ac 100644
--- a/tensorflow/core/kernels/sdca_internal.h
+++ b/tensorflow/core/kernels/sdca_internal.h
@@ -149,7 +149,8 @@ class Example {
   // 1.0f.
   struct SparseFeatures {
     std::unique_ptr<TTypes<const int64>::UnalignedConstVec> indices;
-    std::unique_ptr<TTypes<const float>::UnalignedConstVec> values;  // nullptr encodes optional.
+    std::unique_ptr<TTypes<const float>::UnalignedConstVec>
+        values;  // nullptr encodes optional.
   };
 
   // A dense vector which is a row-slice of the underlying matrix.
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc
index 0f5c2424b3..dbe0177dda 100644
--- a/tensorflow/core/kernels/sdca_ops.cc
+++ b/tensorflow/core/kernels/sdca_ops.cc
@@ -57,11 +57,11 @@ namespace tensorflow {
 
 namespace {
 
-using sdca::Regularizations;
 using sdca::Example;
 using sdca::Examples;
 using sdca::ExampleStatistics;
 using sdca::ModelWeights;
+using sdca::Regularizations;
 
 struct ComputeOptions {
   explicit ComputeOptions(OpKernelConstruction* const context) {
@@ -76,8 +76,9 @@ struct ComputeOptions {
     } else if (loss_type == "smooth_hinge_loss") {
       loss_updater.reset(new SmoothHingeLossUpdater);
     } else {
-      OP_REQUIRES(context, false, errors::InvalidArgument(
-                                      "Unsupported loss type: ", loss_type));
+      OP_REQUIRES(
+          context, false,
+          errors::InvalidArgument("Unsupported loss type: ", loss_type));
     }
     OP_REQUIRES_OK(context, context->GetAttr("adaptative", &adaptative));
     OP_REQUIRES_OK(
@@ -90,9 +91,10 @@ struct ComputeOptions {
         context, num_sparse_features + num_dense_features > 0,
         errors::InvalidArgument("Requires at least one feature to train."));
 
-    OP_REQUIRES(context, static_cast<int64>(num_sparse_features) +
-                                 static_cast<int64>(num_dense_features) <=
-                             std::numeric_limits<int>::max(),
+    OP_REQUIRES(context,
+                static_cast<int64>(num_sparse_features) +
+                        static_cast<int64>(num_dense_features) <=
+                    std::numeric_limits<int>::max(),
                 errors::InvalidArgument(
                     strings::Printf("Too many feature groups: %lld > %d",
                                     static_cast<int64>(num_sparse_features) +
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 3ef1cd1e06..27b8081eb8 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -115,7 +115,7 @@ class SegmentReductionOp : public OpKernel {
     Eigen::DSizes<Eigen::DenseIndex, 1> dims_to_reduce;
     dims_to_reduce[0] = 0;
 #else
-    Eigen::IndexList<Eigen::type2index<0>> dims_to_reduce;
+    Eigen::IndexList<Eigen::type2index<0> > dims_to_reduce;
 #endif
     Index start = 0, end = 1;
 
@@ -359,7 +359,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL);
 namespace functor {
 
 // UnsortedSegmentSumFunctor implementation for CPUDevice.
-// todo: Remove duplicate code in UnsortedSegmentSumFunctor and UnsortedSegmentMaxFunctor.
+// todo: Remove duplicate code in UnsortedSegmentSumFunctor and
+// UnsortedSegmentMaxFunctor.
 template <typename T, typename Index>
 struct UnsortedSegmentSumFunctor<CPUDevice, T, Index>
     : UnsortedSegmentBaseFunctor<CPUDevice, T, Index> {
@@ -461,9 +462,10 @@ class UnsortedSegmentBaseOp : public OpKernel {
 
     auto data_ptr = data.template flat<T>().data();
     reduction_functor_(context, context->template eigen_device<Device>(),
-                     output_rows, segment_ids.shape(), segment_flat,
-                     data.NumElements(), data_ptr, output_flat);
+                       output_rows, segment_ids.shape(), segment_flat,
+                       data.NumElements(), data_ptr, output_flat);
   }
+
  private:
   functor::UnsortedSegmentBaseFunctor<Device, T, Index>& reduction_functor_;
 };
@@ -472,22 +474,20 @@ template <typename Device, class T, class Index>
 class UnsortedSegmentSumOp : public UnsortedSegmentBaseOp<Device, T, Index> {
  public:
   explicit UnsortedSegmentSumOp(OpKernelConstruction* context)
-      : UnsortedSegmentBaseOp<Device, T, Index>(
-            context,
-            sum_functor_) {}
+      : UnsortedSegmentBaseOp<Device, T, Index>(context, sum_functor_) {}
+
  private:
-    functor::UnsortedSegmentSumFunctor<Device, T, Index> sum_functor_;
+  functor::UnsortedSegmentSumFunctor<Device, T, Index> sum_functor_;
 };
 
 template <typename Device, class T, class Index>
 class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp<Device, T, Index> {
  public:
   explicit UnsortedSegmentMaxOp(OpKernelConstruction* context)
-      : UnsortedSegmentBaseOp<Device, T, Index>(
-            context,
-            max_functor_) {}
+      : UnsortedSegmentBaseOp<Device, T, Index>(context, max_functor_) {}
+
  private:
-    functor::UnsortedSegmentMaxFunctor<Device, T, Index> max_functor_;
+  functor::UnsortedSegmentMaxFunctor<Device, T, Index> max_functor_;
 };
 
 #define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type)                  \
@@ -663,9 +663,9 @@ class SparseSegmentReductionOpBase : public OpKernel {
           Reduce(input_flat, indices_vec, start, end - start, out);
       OP_REQUIRES(context, bad_offset < 0,
                   errors::InvalidArgument(
-                      "Bad: indices[", start + bad_offset, "] == ",
-                      indices_vec(start + bad_offset), " out of range [0, ",
-                      input_flat.dimension(0), ")"));
+                      "Bad: indices[", start + bad_offset,
+                      "] == ", indices_vec(start + bad_offset),
+                      " out of range [0, ", input_flat.dimension(0), ")"));
 
       start = end;
       ++end;
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index bcdd42c80c..5c9cfe0906 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -51,13 +51,14 @@ struct SegmentSumFunctor {
 // BaseFunctor for definition of UnsorteSegmentReductionOp
 // for usage without templates.
 template <typename Device, typename T, typename Index>
-struct UnsortedSegmentBaseFunctor{
-  virtual ~UnsortedSegmentBaseFunctor(){}
+struct UnsortedSegmentBaseFunctor {
+  virtual ~UnsortedSegmentBaseFunctor() {}
   virtual void operator()(OpKernelContext* ctx, const Device& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
-                  typename TTypes<Index>::ConstFlat segment_ids,
-                  const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output){};
+                          const Index output_rows,
+                          const TensorShape& segment_ids_shape,
+                          typename TTypes<Index>::ConstFlat segment_ids,
+                          const Index data_size, const T* data,
+                          typename TTypes<T, 2>::Tensor output){};
 };
 
 // Functor for UnsortedSegmentSumOp.
@@ -70,7 +71,8 @@ struct UnsortedSegmentBaseFunctor{
 // data: input data tensor.
 // output: output reshaped to {output_rows, output.size/output_rows}
 template <typename Device, typename T, typename Index>
-struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
+struct UnsortedSegmentSumFunctor
+    : public UnsortedSegmentBaseFunctor<Device, T, Index> {
   void operator()(OpKernelContext* ctx, const Device& d,
                   const Index output_rows, const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
@@ -88,7 +90,8 @@ struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor<Device, T, I
 // data: input data tensor.
 // output: output reshaped to {output_rows, output.size/output_rows}
 template <typename Device, typename T, typename Index>
-struct UnsortedSegmentMaxFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
+struct UnsortedSegmentMaxFunctor
+    : public UnsortedSegmentBaseFunctor<Device, T, Index> {
   void operator()(OpKernelContext* ctx, const Device& d,
                   const Index output_rows, const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
index 159fada621..39d520698e 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
@@ -194,7 +194,8 @@ void SegmentSumFunctor<T, Index>::operator()(
 
 // UnsortedSegmentSumFunctor implementation for GPUDevice.
 template <typename T, typename Index>
-struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>: UnsortedSegmentBaseFunctor<GPUDevice, T, Index> {
+struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
+    : UnsortedSegmentBaseFunctor<GPUDevice, T, Index> {
   void operator()(OpKernelContext* ctx, const GPUDevice& d,
                   const Index output_rows, const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
@@ -221,11 +222,10 @@ struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>: UnsortedSegmentBaseFuncto
     const Index input_inner_dim_size = input_total_size / input_outer_dim_size;
 
     config = GetCudaLaunchConfig(input_total_size, d);
-    UnsortedSegmentSumCustomKernel<
-        T,
-        Index><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        input_outer_dim_size, input_inner_dim_size, output_rows,
-        segment_ids.data(), data, output.data());
+    UnsortedSegmentSumCustomKernel<T, Index>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            input_outer_dim_size, input_inner_dim_size, output_rows,
+            segment_ids.data(), data, output.data());
   }
 };
 
diff --git a/tensorflow/core/kernels/self_adjoint_eig_op.cc b/tensorflow/core/kernels/self_adjoint_eig_op.cc
index 9765780726..bcd8877390 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_op.cc
+++ b/tensorflow/core/kernels/self_adjoint_eig_op.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-
 namespace tensorflow {
 
 template <class Scalar>
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 206fd40fa6..688e61fcad 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -114,7 +114,7 @@ REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp);
 REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp);
 REGISTER_KERNEL_BUILDER(
     Name("_HostSend").Device(DEVICE_SYCL).HostMemory("tensor"), SendOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp);
 REGISTER_KERNEL_BUILDER(
@@ -198,7 +198,7 @@ REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp);
 REGISTER_KERNEL_BUILDER(
@@ -207,6 +207,6 @@ REGISTER_KERNEL_BUILDER(
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
     Name("_HostRecv").Device(DEVICE_SYCL).HostMemory("tensor"), RecvOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index e2e3758d87..9db0bd4d98 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -53,13 +53,13 @@ class RangeOp : public OpKernel {
     if (delta > 0) {
       OP_REQUIRES(
           context, start <= limit,
-          errors::InvalidArgument("Requires start <= limit when delta > 0: ",
-                                  start, "/", limit));
+          errors::InvalidArgument(
+              "Requires start <= limit when delta > 0: ", start, "/", limit));
     } else {
       OP_REQUIRES(
           context, start >= limit,
-          errors::InvalidArgument("Requires start >= limit when delta < 0: ",
-                                  start, "/", limit));
+          errors::InvalidArgument(
+              "Requires start >= limit when delta < 0: ", start, "/", limit));
     }
     int64 size = (std::is_integral<T>::value
                       ? ((std::abs(limit - start) + std::abs(delta) - 1) /
diff --git a/tensorflow/core/kernels/session_ops.cc b/tensorflow/core/kernels/session_ops.cc
index 185c5b248f..f2dd2812b5 100644
--- a/tensorflow/core/kernels/session_ops.cc
+++ b/tensorflow/core/kernels/session_ops.cc
@@ -144,7 +144,7 @@ REGISTER_GPU_KERNEL(bool);
 TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_KERNEL(bool);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class DeleteSessionTensorOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h
index 8d9d0ea846..55be308901 100644
--- a/tensorflow/core/kernels/shape_ops.h
+++ b/tensorflow/core/kernels/shape_ops.h
@@ -235,10 +235,10 @@ class SqueezeOp : public OpKernel {
       if (!wrapped_squeeze_dims.empty()) {
         if (wrapped_squeeze_dims.count(i) > 0) {
           OP_REQUIRES(ctx, existing_dim == 1,
-                      errors::InvalidArgument("Tried to explicitly squeeze "
-                                              "dimension ",
-                                              i, " but dimension was not 1: ",
-                                              existing_dim));
+                      errors::InvalidArgument(
+                          "Tried to explicitly squeeze "
+                          "dimension ",
+                          i, " but dimension was not 1: ", existing_dim));
         } else {
           // This dimension is not being squeezed.
           new_shape.push_back(existing_dim);
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index 82595de779..79369fd4a9 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -58,7 +58,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Shared code that is not dependent on the type of T.  We do this to reduce
 // code size by not duplicating all this for all T (float, double, int32, etc.)
@@ -72,10 +72,11 @@ static void SharedValidation(OpKernelContext* context,
   const Tensor& size_tensor = context->input(2);
 
   OP_REQUIRES(
-      context, context->op_kernel().IsLegacyVector(begin_tensor.shape()) &&
-                   context->op_kernel().IsLegacyVector(size_tensor.shape()) &&
-                   begin_tensor.NumElements() == input.dims() &&
-                   size_tensor.NumElements() == input.dims(),
+      context,
+      context->op_kernel().IsLegacyVector(begin_tensor.shape()) &&
+          context->op_kernel().IsLegacyVector(size_tensor.shape()) &&
+          begin_tensor.NumElements() == input.dims() &&
+          size_tensor.NumElements() == input.dims(),
       errors::InvalidArgument(
           "Expected begin and size arguments to be 1-D tensors of size ",
           input.dims(), ", but got shapes ", begin_tensor.shape().DebugString(),
@@ -125,8 +126,7 @@ static void SharedSliceCommonCases(OpKernelContext* context,
                                    TensorShape* output_shape,
                                    gtl::InlinedVector<int64, 4>* begin,
                                    gtl::InlinedVector<int64, 4>* size,
-                                   Tensor** result,
-                                   bool* done) {
+                                   Tensor** result, bool* done) {
   bool is_identity = true;
   bool slice_dim0 = true;
   *done = false;
@@ -142,8 +142,8 @@ static void SharedSliceCommonCases(OpKernelContext* context,
     return;
   }
 
-  if (slice_dim0 && IsDim0SliceAligned<T>(input.shape(), (*begin)[0],
-                                          (*size)[0])) {
+  if (slice_dim0 &&
+      IsDim0SliceAligned<T>(input.shape(), (*begin)[0], (*size)[0])) {
     VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
     CHECK_GE(input.dims(), 1);  // Otherwise, is_identity should be true.
     context->set_output(0, input.Slice((*begin)[0], (*begin)[0] + (*size)[0]));
@@ -154,7 +154,6 @@ static void SharedSliceCommonCases(OpKernelContext* context,
   OP_REQUIRES_OK(context, context->allocate_output(0, *output_shape, result));
 }
 
-
 template <typename Device, typename T>
 class SliceOp : public OpKernel {
  public:
@@ -206,8 +205,9 @@ class SliceOp : public OpKernel {
 
 #undef HANDLE_DIM
 
-      OP_REQUIRES(context, false, errors::Unimplemented(
-                                      "SliceOp : Unhandled input dimensions"));
+      OP_REQUIRES(
+          context, false,
+          errors::Unimplemented("SliceOp : Unhandled input dimensions"));
     }
   }
 
@@ -280,8 +280,9 @@ class MklSliceOp : public OpKernel {
 
 #undef HANDLE_DIM
 
-      OP_REQUIRES(context, false, errors::Unimplemented(
-                                      "SliceOp : Unhandled input dimensions"));
+      OP_REQUIRES(
+          context, false,
+          errors::Unimplemented("SliceOp : Unhandled input dimensions"));
     }
   }
 
@@ -292,9 +293,9 @@ class MklSliceOp : public OpKernel {
   // as the sizes of all the dimensions of the input except slice_dim, then
   // returns True. Otherwise, returns False.
   bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape,
-                          const gtl::ArraySlice<int64>& begin,
-                          const gtl::ArraySlice<int64>& size,
-                          int slice_dim) {
+                                          const gtl::ArraySlice<int64>& begin,
+                                          const gtl::ArraySlice<int64>& size,
+                                          int slice_dim) {
     for (int dim = 0; dim < 4; dim++) {
       if (dim != slice_dim &&
           (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) {
@@ -316,9 +317,9 @@ class MklSliceOp : public OpKernel {
   // Returns True if Slicing over a single dimension, and sets slice_dim
   // to the number of the dimension that satisfies criteria.
   bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape,
-                          const gtl::ArraySlice<int64>& begin,
-                          const gtl::ArraySlice<int64>& size,
-                          int* slice_dim) {
+                                    const gtl::ArraySlice<int64>& begin,
+                                    const gtl::ArraySlice<int64>& size,
+                                    int* slice_dim) {
     for (int dim = 0; dim < 4; dim++) {
       if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) {
         *slice_dim = dim;
@@ -329,8 +330,7 @@ class MklSliceOp : public OpKernel {
   }
 
   template <int NDIM>
-  void HandleCase(OpKernelContext* context,
-                  const gtl::ArraySlice<int64>& begin,
+  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
                   const gtl::ArraySlice<int64>& size, Tensor* result) {
     int slice_dim = -1;
     TensorShape in_shape = context->input(0).shape();
@@ -340,67 +340,63 @@ class MklSliceOp : public OpKernel {
     // format over channel dimension.
     if (NDIM == 4 &&
         DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
-        size_t in_strides[4] = { (size_t) in_shape.dim_size(1) *
-                                          in_shape.dim_size(2) *
-                                          in_shape.dim_size(3),
-                                 (size_t) in_shape.dim_size(2) *
-                                          in_shape.dim_size(3),
-                                 (size_t) in_shape.dim_size(3),
-                                 (size_t) 1
-                               };
-
-        size_t out_strides[4] = { (size_t) size[1] * size[2] * size[3],
-                                  (size_t) size[2] * size[3],
-                                  (size_t) size[3],
-                                  (size_t) 1 };
-
-        T *in_buf = const_cast<T*>(const_cast<const T*>(
-                    context->input(0).flat<T>().data()));
-        T *op_buf = result->flat<T>().data();
-
-        if (slice_dim == 1) {
-          /* data format = NCHW */
-
-          #pragma omp parallel for
-          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-              T *ip  = in_buf + (d0 * in_strides[0]);
-              T *op  = op_buf + ((d0 - begin[0]) * out_strides[0]);
-            #pragma omp parallel for
-            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-              T *ip1 = ip + (d1 * in_strides[1]);
-              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
-              // For NCHW, H and W will be contiguous. So we can copy
-              // both with one memcpy.
-              memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
-                     sizeof(T) * in_strides[1]);
-            }
+      size_t in_strides[4] = {
+          (size_t)in_shape.dim_size(1) * in_shape.dim_size(2) *
+              in_shape.dim_size(3),
+          (size_t)in_shape.dim_size(2) * in_shape.dim_size(3),
+          (size_t)in_shape.dim_size(3), (size_t)1};
+
+      size_t out_strides[4] = {(size_t)size[1] * size[2] * size[3],
+                               (size_t)size[2] * size[3], (size_t)size[3],
+                               (size_t)1};
+
+      T* in_buf = const_cast<T*>(
+          const_cast<const T*>(context->input(0).flat<T>().data()));
+      T* op_buf = result->flat<T>().data();
+
+      if (slice_dim == 1) {
+        /* data format = NCHW */
+
+#pragma omp parallel for
+        for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+          T* ip = in_buf + (d0 * in_strides[0]);
+          T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
+#pragma omp parallel for
+          for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+            T* ip1 = ip + (d1 * in_strides[1]);
+            T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
+            // For NCHW, H and W will be contiguous. So we can copy
+            // both with one memcpy.
+            memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
+                   sizeof(T) * in_strides[1]);
           }
-          return;
-        } else if (slice_dim == 3) {
-          /* data_format = NHWC */
-
-          #pragma omp parallel for
-          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-              T *ip = in_buf + (d0 * in_strides[0]);
-              T *op = op_buf + ((d0 - begin[0]) * out_strides[0]);
-            #pragma omp parallel for
-            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-              T *ip1 = ip + (d1 * in_strides[1]);
-              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
-              #pragma omp parallel for
-              for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
-                T *ip2 = ip1 + (d2 * in_strides[2]);
-                T *ip3 = ip2 + begin[3];
-                T *op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
-                T *op3 = op2;
-                memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
-                       sizeof(T) * size[3]);
-              }
+        }
+        return;
+      } else if (slice_dim == 3) {
+        /* data_format = NHWC */
+
+#pragma omp parallel for
+        for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+          T* ip = in_buf + (d0 * in_strides[0]);
+          T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
+#pragma omp parallel for
+          for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+            T* ip1 = ip + (d1 * in_strides[1]);
+            T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
+#pragma omp parallel for
+            for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
+              T* ip2 = ip1 + (d2 * in_strides[2]);
+              T* ip3 = ip2 + begin[3];
+              T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
+              T* op3 = op2;
+              memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
+                     sizeof(T) * size[3]);
             }
           }
-          return;
         }
-        // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
+        return;
+      }
+      // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
     }
 
     Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
@@ -535,13 +531,13 @@ REGISTER_KERNEL_BUILDER(Name("Slice")
 #ifdef TENSORFLOW_USE_SYCL
 // Forward declarations of the functor specializations for SYCL.
 namespace functor {
-#define DECLARE_SYCL_SPEC(T, NDIM)                                 \
-  template <>                                                      \
-  void Slice<SYCLDevice, T, NDIM>::operator()(                     \
-      const SYCLDevice& d, typename TTypes<T, NDIM>::Tensor output,\
-      typename TTypes<T, NDIM>::ConstTensor input,                 \
-      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,       \
-      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);        \
+#define DECLARE_SYCL_SPEC(T, NDIM)                                  \
+  template <>                                                       \
+  void Slice<SYCLDevice, T, NDIM>::operator()(                      \
+      const SYCLDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+      typename TTypes<T, NDIM>::ConstTensor input,                  \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,        \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);         \
   extern template struct Slice<SYCLDevice, T, NDIM>;
 
 #define DECLARE_FOR_N(T)   \
diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h
index 0362a02133..db7eded745 100644
--- a/tensorflow/core/kernels/slice_op.h
+++ b/tensorflow/core/kernels/slice_op.h
@@ -24,7 +24,6 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
-
 template <typename Device, typename T, int NDIMS>
 struct Slice {
   void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor output,
diff --git a/tensorflow/core/kernels/slice_op_cpu_impl.h b/tensorflow/core/kernels/slice_op_cpu_impl.h
index 47f1d5342a..64b6948190 100644
--- a/tensorflow/core/kernels/slice_op_cpu_impl.h
+++ b/tensorflow/core/kernels/slice_op_cpu_impl.h
@@ -43,7 +43,7 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_SYCL_KERNELS);
 DEFINE_SYCL_KERNELS(int32);
 
 #undef DEFINE_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
index 590f01c469..e1712ac239 100644
--- a/tensorflow/core/kernels/softmax_op.cc
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Partial specialization for a CPUDevice, that uses the Eigen implementation
 // from SoftmaxEigenImpl.
@@ -48,7 +48,7 @@ struct SoftmaxFunctor<CPUDevice, T> : SoftmaxFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SoftmaxFunctor<SYCLDevice, T> : SoftmaxFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace functor
 
 template <typename Device, typename T>
@@ -100,5 +100,5 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("Softmax").Device(DEVICE_SYCL).TypeConstraint<double>("T"),
     SoftmaxOp<SYCLDevice, double>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
index c25ce2d8bb..92ddf8edbf 100644
--- a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
+++ b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
@@ -70,7 +70,7 @@ static Graph* ConstructSpaceToBatchGraph(
   }                                                                                                     \
   BENCHMARK(                                                                                            \
       BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11);
-#define BM_SpaceToBatch(OP, ...)                      \
+#define BM_SpaceToBatch(OP, ...)                                 \
   BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \
   BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \
   BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_HALF, __VA_ARGS__));  \
diff --git a/tensorflow/core/kernels/spacetobatch_functor.cc b/tensorflow/core/kernels/spacetobatch_functor.cc
index 23d8a5f9ed..4c374b8d99 100644
--- a/tensorflow/core/kernels/spacetobatch_functor.cc
+++ b/tensorflow/core/kernels/spacetobatch_functor.cc
@@ -154,7 +154,7 @@ struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, B2S> {
 #define INSTANTIATE(NUM_BLOCK_DIMS, T)                                      \
   template struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, false>; \
   template struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, true>;  \
-/**/
+  /**/
 
 #define INSTANTIATE_FOR_T(T) \
   TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T)
diff --git a/tensorflow/core/kernels/spacetobatch_functor.h b/tensorflow/core/kernels/spacetobatch_functor.h
index 06813650c0..f46a84da1e 100644
--- a/tensorflow/core/kernels/spacetobatch_functor.h
+++ b/tensorflow/core/kernels/spacetobatch_functor.h
@@ -44,7 +44,7 @@ constexpr int kMaxSpaceToBatchBlockDims = 4;
   MACRO(2 /**/, ##__VA_ARGS__)                              \
   MACRO(3 /**/, ##__VA_ARGS__)                              \
   MACRO(4 /**/, ##__VA_ARGS__)                              \
-/**/
+  /**/
 
 namespace internal {
 namespace spacetobatch {
diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
index db8d419c38..5687141c9e 100644
--- a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
@@ -141,10 +141,10 @@ struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
     }
     CudaLaunchConfig config =
         GetCudaLaunchConfig(static_cast<int32>(total_count), d);
-    S2B<T, NUM_BLOCK_DIMS,
-        B2S><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, const_cast<T*>(space_tensor.data()), args,
-        const_cast<T*>(batch_tensor.data()));
+    S2B<T, NUM_BLOCK_DIMS, B2S>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, const_cast<T*>(space_tensor.data()),
+            args, const_cast<T*>(batch_tensor.data()));
     return Status::OK();
   }
 };
@@ -153,7 +153,7 @@ struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
 #define INSTANTIATE(NUM_BLOCK_DIMS, T)                                      \
   template struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, false>; \
   template struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, true>;  \
-/**/
+  /**/
 
 #define INSTANTIATE_FOR_T(T) \
   TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T)
diff --git a/tensorflow/core/kernels/spacetobatch_op.cc b/tensorflow/core/kernels/spacetobatch_op.cc
index 95c1f5e7e8..fdc08ec8e3 100644
--- a/tensorflow/core/kernels/spacetobatch_op.cc
+++ b/tensorflow/core/kernels/spacetobatch_op.cc
@@ -58,9 +58,10 @@ void SpaceToBatchOpCompute(OpKernelContext* context,
       errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
                               " instead of ", orig_input_tensor.dims()));
 
-  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_paddings.shape()) &&
-                           block_dims == orig_paddings.dim_size(0) &&
-                           2 == orig_paddings.dim_size(1),
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsMatrix(orig_paddings.shape()) &&
+                  block_dims == orig_paddings.dim_size(0) &&
+                  2 == orig_paddings.dim_size(1),
               errors::InvalidArgument("paddings should have shape [",
                                       block_dims, ", 2] instead of ",
                                       orig_paddings.shape().DebugString()));
diff --git a/tensorflow/core/kernels/sparse_add_grad_op.cc b/tensorflow/core/kernels/sparse_add_grad_op.cc
index d8ed0c6f0c..8597f3a8f7 100644
--- a/tensorflow/core/kernels/sparse_add_grad_op.cc
+++ b/tensorflow/core/kernels/sparse_add_grad_op.cc
@@ -35,9 +35,10 @@ class SparseAddGradOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices));
     OP_REQUIRES_OK(ctx, ctx->input("sum_indices", &sum_indices));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(b_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(sum_indices->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsMatrix(a_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(b_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(sum_indices->shape()),
                 errors::InvalidArgument(
                     "Input indices should be matrices but received shapes: ",
                     a_indices->shape().DebugString(), " and ",
@@ -49,8 +50,9 @@ class SparseAddGradOp : public OpKernel {
             "Input backprop_val_grad should be a vector but received shape: ",
             backprop_val_grad->shape().DebugString()));
     OP_REQUIRES(
-        ctx, a_indices->dim_size(1) == b_indices->dim_size(1) &&
-                 b_indices->dim_size(1) == sum_indices->dim_size(1),
+        ctx,
+        a_indices->dim_size(1) == b_indices->dim_size(1) &&
+            b_indices->dim_size(1) == sum_indices->dim_size(1),
         errors::InvalidArgument("The densified operands should have the same "
                                 "ndims; for A, B, sum got: ",
                                 a_indices->dim_size(1), b_indices->dim_size(1),
diff --git a/tensorflow/core/kernels/sparse_add_op.cc b/tensorflow/core/kernels/sparse_add_op.cc
index bd91dfdce6..d16317af67 100644
--- a/tensorflow/core/kernels/sparse_add_op.cc
+++ b/tensorflow/core/kernels/sparse_add_op.cc
@@ -34,8 +34,9 @@ class SparseAddOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, ctx->input("a_indices", &a_indices));
     OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(b_indices->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsMatrix(a_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(b_indices->shape()),
                 errors::InvalidArgument(
                     "Input indices should be matrices but received shapes: ",
                     a_indices->shape().DebugString(), " and ",
@@ -46,8 +47,9 @@ class SparseAddOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("a_values", &a_values_t));
     OP_REQUIRES_OK(ctx, ctx->input("b_values", &b_values_t));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_values_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_values_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_values_t->shape()),
                 errors::InvalidArgument(
                     "Input values should be vectors but received shapes: ",
                     a_values_t->shape().DebugString(), " and ",
@@ -62,8 +64,9 @@ class SparseAddOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, ctx->input("a_shape", &a_shape));
     OP_REQUIRES_OK(ctx, ctx->input("b_shape", &b_shape));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape->shape()) &&
-                         TensorShapeUtils::IsVector(b_shape->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_shape->shape()) &&
+                    TensorShapeUtils::IsVector(b_shape->shape()),
                 errors::InvalidArgument(
                     "Input shapes should be a vector but received shapes ",
                     a_shape->shape().DebugString(), " and ",
diff --git a/tensorflow/core/kernels/sparse_add_op_test.cc b/tensorflow/core/kernels/sparse_add_op_test.cc
index 4cad02bbee..1f08e6c5ce 100644
--- a/tensorflow/core/kernels/sparse_add_op_test.cc
+++ b/tensorflow/core/kernels/sparse_add_op_test.cc
@@ -61,9 +61,9 @@ TEST_F(SparseAddOpTest, TwoD_AddSparseTensorWithSelf) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
 #define ADD_TENSOR_INPUT()                                  \
diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
index c122616cf1..80bc1f1934 100644
--- a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
+++ b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
@@ -103,8 +103,9 @@ class SparseAccumulatorTakeGradientOp
                       DoneCallback callback) override {
     // Check signature
     OP_REQUIRES_OK_ASYNC(
-        ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32},
-                                 {DT_INT64, accumulator->dtype(), DT_INT64}),
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_INT32},
+                            {DT_INT64, accumulator->dtype(), DT_INT64}),
         callback);
   }
 
diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc
index 07d935d55f..7cd4532ad6 100644
--- a/tensorflow/core/kernels/sparse_cross_op.cc
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@@ -288,8 +288,7 @@ struct CrossTraits<true, int64> {
 template <bool HASHED_OUTPUT, typename InternalType>
 class SparseCrossOp : public OpKernel {
  public:
-  explicit SparseCrossOp(OpKernelConstruction* context)
-      : OpKernel(context) {
+  explicit SparseCrossOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
     // Read signed_hash_key_ as int64 since uint64 attributes are not
     // supported by REGISTER_OP.
@@ -316,8 +315,8 @@ class SparseCrossOp : public OpKernel {
         GenerateColumnsFromInput(indices_list_in, values_list_in,
                                  shapes_list_in, dense_list_in);
 
-    typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser
-        crosser(columns, num_buckets_, hash_key_);
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser crosser(
+        columns, num_buckets_, hash_key_);
     Tensor* indices_out;
     Tensor* values_out;
     Tensor* shape_out;
@@ -326,8 +325,8 @@ class SparseCrossOp : public OpKernel {
     CreateOutputTensors(columns, batch_size, context, &indices_out, &values_out,
                         &shape_out, &output_start_indices);
 
-    typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater
-        updater(output_start_indices, indices_out, values_out);
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater updater(
+        output_start_indices, indices_out, values_out);
     auto do_work = [this, &columns, crosser, updater](int64 begin, int64 end) {
       for (int b = begin; b < end; b++) {
         ProductIterator<InternalType> product_iterator(columns, b);
@@ -381,8 +380,9 @@ class SparseCrossOp : public OpKernel {
               "Input values should be a std::vector but received shape ",
               values_list_in[i].shape().DebugString(), " at position ", i));
       OP_REQUIRES(
-          context, indices_list_in[i].shape().dim_size(0) ==
-                       values_list_in[i].shape().dim_size(0),
+          context,
+          indices_list_in[i].shape().dim_size(0) ==
+              values_list_in[i].shape().dim_size(0),
           errors::InvalidArgument(
               "Expected size of values to be ",
               indices_list_in[i].shape().dim_size(0), " got ",
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
index cc0f86ce05..ac48202ada 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
@@ -70,8 +70,9 @@ class SparseDenseBinaryOpShared : public OpKernel {
                 errors::InvalidArgument(
                     "Input sp_indices should be a matrix but received shape: ",
                     indices_t->shape().DebugString()));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(values_t->shape()) &&
-                         TensorShapeUtils::IsVector(shape_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(values_t->shape()) &&
+                    TensorShapeUtils::IsVector(shape_t->shape()),
                 errors::InvalidArgument(
                     "Inputs sp_values and sp_shape should be vectors "
                     "but received shapes: ",
@@ -150,8 +151,9 @@ class SparseDenseBinaryOpShared : public OpKernel {
       CASE(4);
       CASE(5);
       default:
-        OP_REQUIRES(ctx, false, errors::InvalidArgument(
-                                    "Only tensors with ranks between 1 and 5 "
+        OP_REQUIRES(
+            ctx, false,
+            errors::InvalidArgument("Only tensors with ranks between 1 and 5 "
                                     "are currently supported.  Tensor rank: ",
                                     ndims));
 #undef CASE
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
index eaf1884243..fe198af7e6 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@@ -96,9 +96,9 @@ TEST_F(SparseDenseCDivTest, SameShape) {
   // [2    ]  cdiv [dense: same shape, all 1's]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   // Tensor dense(DT_FLOAT, TensorShape({3, 1}));
@@ -125,9 +125,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseSameDims) {
   // [2    ]  cdiv [dense: shape [3,1], all 1's]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({3, 1}));
@@ -152,9 +152,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseFewerDims) {
   // [2    ]  cdiv [dense: shape [2]]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({2}));
@@ -184,9 +184,9 @@ TEST_F(SparseDenseCMulTest, BroadcastDense) {
   // [1   ?]  where ? remains implicitly zero.
   // [1.5 0]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({2}));
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 8ab23b64d3..a1f9667b78 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -159,8 +159,8 @@ struct SparseSlice {
 
 template <typename T>
 template <bool Transpose>
-void SparseSlice<T>::Initialize(const typename SparseSlice<T>::ConstMatrixMap& mat,
-                                int col_offset) {
+void SparseSlice<T>::Initialize(
+    const typename SparseSlice<T>::ConstMatrixMap& mat, int col_offset) {
   const int mat_rows = Transpose ? mat.dimension(1) : mat.dimension(0);
   const int mat_cols = Transpose ? mat.dimension(0) : mat.dimension(1);
   DCHECK_LE(num_rows, mat_rows);
@@ -278,9 +278,9 @@ ALWAYS_INLINE float ConvertBfloat16ToFloat(const bfloat16* src) {
   float out = 0;
   auto tmp = reinterpret_cast<bfloat16*>(&out);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    tmp[0] = *src;
+  tmp[0] = *src;
 #else
-    tmp[1] = *src;
+  tmp[1] = *src;
 #endif
   return out;
 }
@@ -970,9 +970,9 @@ class SparseMatMulOp : public OpKernel {
     const int k2 = transpose_b_ ? b.dim_size(1) : b.dim_size(0);
 
     OP_REQUIRES(ctx, k == k2,
-                errors::InvalidArgument("Matrix size incompatible: a: ",
-                                        a.shape().DebugString(), ", b: ",
-                                        b.shape().DebugString()));
+                errors::InvalidArgument(
+                    "Matrix size incompatible: a: ", a.shape().DebugString(),
+                    ", b: ", b.shape().DebugString()));
     Tensor* output = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({m, n}), &output));
 
@@ -1224,8 +1224,9 @@ ALWAYS_INLINE void CopyAndMayBeInterleave(void* dst, const void* src,
 
 template <typename TL, typename TR>
 inline BlockingCounter* SparseMatMul<TL, TR>::ShuffleMatrix(
-    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int slice_row_start,
-    int slice_num_rows, int slice_col_start, int slice_num_cols, const int N,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat,
+    int slice_row_start, int slice_num_rows, int slice_col_start,
+    int slice_num_cols, const int N,
     const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer) {
   DCHECK_EQ(N % 2, 0);
   DCHECK_LE(kNumOperands * sizeof(float) / sizeof(TR), N);
@@ -1306,8 +1307,9 @@ inline std::unique_ptr<BlockingCounter> SparseMatMul<TL, TR>::CreateDenseSlices(
 template <typename TL, typename TR>
 inline void SparseMatMul<TL, TR>::ComputeBlockSizes(
     const typename SparseMatMul<TL, TR>::ConstMatrixMapL& left,
-    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left,
-    int num_threads, int* KR, int* NR, int* KL, int* JB, int* IB) {
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right,
+    bool transpose_left, int num_threads, int* KR, int* NR, int* KL, int* JB,
+    int* IB) {
   // Heuristics for calculating block sizes
   // Assume two hyperthreads per core.
   const int est_num_cores = std::max(1, (num_threads + 1) / 2);
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index cca52558ae..14ef2ed704 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -159,25 +159,25 @@ EIGEN_STRONG_INLINE Packet4f pload2bf16<Packet4f>(const float* from) {
 // Return a packet with the first value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_first<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 0);
+  return vec_splat(a, 0);
 }
 
 // Return a packet with the second value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_second<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 1);
+  return vec_splat(a, 1);
 }
 
 // Return a packet with the third value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_third<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 2);
+  return vec_splat(a, 2);
 }
 
 // Return a packet with the fourth value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 3);
+  return vec_splat(a, 3);
 }
 #endif
 
diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc
index f815ca9e34..ebc6d8fa4e 100644
--- a/tensorflow/core/kernels/sparse_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@@ -284,11 +284,11 @@ class SparseMatmulOpTest : public ::testing::Test {
       uint16_t* data3_bfloat16_p =
           reinterpret_cast<uint16_t*>(data3_bfloat16) + i;
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-            data3_p[1] = 0;
-            data3_bfloat16_p[0] = data3_p[0];
+      data3_p[1] = 0;
+      data3_bfloat16_p[0] = data3_p[0];
 #else
-            data3_p[0] = 0;
-            data3_bfloat16_p[0] = data3_p[1];
+      data3_p[0] = 0;
+      data3_bfloat16_p[0] = data3_p[1];
 #endif
     }
   }
diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
index 110376be42..96246c7a71 100644
--- a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
+++ b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
@@ -51,9 +51,9 @@ TEST_F(SparseReduceSumOpTest, SimpleReduce) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   AddInputFromArray<int64>(indices_shape, indices);
@@ -93,9 +93,9 @@ TEST_F(SparseReduceSumSparseOpTest, SimpleReduce) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   AddInputFromArray<int64>(indices_shape, indices);
diff --git a/tensorflow/core/kernels/sparse_softmax_op.cc b/tensorflow/core/kernels/sparse_softmax_op.cc
index 327a94b8a1..444a5f657a 100644
--- a/tensorflow/core/kernels/sparse_softmax_op.cc
+++ b/tensorflow/core/kernels/sparse_softmax_op.cc
@@ -50,8 +50,9 @@ class SparseSoftmaxOp : public OpKernel {
                 errors::InvalidArgument(
                     "Input sp_indices should be a matrix but received shape: ",
                     indices_t->shape().DebugString()));
-    OP_REQUIRES(context, TensorShapeUtils::IsVector(values_t->shape()) &&
-                             TensorShapeUtils::IsVector(shape_t->shape()),
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsVector(values_t->shape()) &&
+                    TensorShapeUtils::IsVector(shape_t->shape()),
                 errors::InvalidArgument(
                     "Inputs sp_values and sp_shape should be vectors "
                     "but received shapes: ",
diff --git a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
index b027adba6b..09cb2a6a71 100644
--- a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
@@ -132,14 +132,16 @@ class SparseSparseBinaryOpShared : public OpKernel {
 
     // Validations.
     OP_REQUIRES(
-        ctx, TensorShapeUtils::IsMatrix(a_indices_t->shape()) &&
-                 TensorShapeUtils::IsMatrix(b_indices_t->shape()),
+        ctx,
+        TensorShapeUtils::IsMatrix(a_indices_t->shape()) &&
+            TensorShapeUtils::IsMatrix(b_indices_t->shape()),
         errors::InvalidArgument("Inputs a_indices and b_indices should be "
                                 "matrices but received shapes: ",
                                 a_indices_t->shape().DebugString(), ", ",
                                 b_indices_t->shape().DebugString()));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_values_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_values_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_values_t->shape()),
                 errors::InvalidArgument(
                     "Inputs a_values and b_values should be vectors "
                     "but received shapes: ",
@@ -157,8 +159,9 @@ class SparseSparseBinaryOpShared : public OpKernel {
                                 " non-empty input values, got ",
                                 a_values.size(), " and ", b_values.size()));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_shape_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_shape_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_shape_t->shape()),
                 errors::InvalidArgument(
                     "Input shapes should be a vector but received shapes ",
                     a_shape_t->shape().DebugString(), " and ",
diff --git a/tensorflow/core/kernels/sparse_split_op.cc b/tensorflow/core/kernels/sparse_split_op.cc
index 6171b532aa..67dcf05a6c 100644
--- a/tensorflow/core/kernels/sparse_split_op.cc
+++ b/tensorflow/core/kernels/sparse_split_op.cc
@@ -48,18 +48,20 @@ class SparseSplitOp : public OpKernel {
                     "Input shape should be a vector but received shape ",
                     input_shape.shape().DebugString()));
 
-    OP_REQUIRES(context, input_shape.dim_size(0) &&
-                             split_dim < input_shape.vec<int64>().size(),
-                errors::InvalidArgument(
-                    "Input split_dim should be between 0 and rank (",
-                    input_shape.vec<int64>().size(), "), got ", split_dim));
-
-    OP_REQUIRES(context, num_split_ >= 1 &&
-                             num_split_ <= input_shape.vec<int64>()(split_dim),
-                errors::InvalidArgument("Input num_split should be between 1 "
-                                        "and the splitting dimension size (",
-                                        input_shape.vec<int64>()(split_dim),
-                                        "), got ", num_split_));
+    OP_REQUIRES(
+        context,
+        input_shape.dim_size(0) && split_dim < input_shape.vec<int64>().size(),
+        errors::InvalidArgument(
+            "Input split_dim should be between 0 and rank (",
+            input_shape.vec<int64>().size(), "), got ", split_dim));
+
+    OP_REQUIRES(
+        context,
+        num_split_ >= 1 && num_split_ <= input_shape.vec<int64>()(split_dim),
+        errors::InvalidArgument("Input num_split should be between 1 "
+                                "and the splitting dimension size (",
+                                input_shape.vec<int64>()(split_dim), "), got ",
+                                num_split_));
 
     sparse::SparseTensor sparse_tensor(input_indices, input_values,
                                        TensorShape(input_shape.vec<int64>()));
diff --git a/tensorflow/core/kernels/sparse_to_dense_op.cc b/tensorflow/core/kernels/sparse_to_dense_op.cc
index 6a6cc3d813..ba3da21a43 100644
--- a/tensorflow/core/kernels/sparse_to_dense_op.cc
+++ b/tensorflow/core/kernels/sparse_to_dense_op.cc
@@ -73,8 +73,9 @@ class SparseToDense : public OpKernel {
     // sparse_values
     const Tensor& sparse_values = c->input(2);
     const int64 num_values = sparse_values.NumElements();
-    OP_REQUIRES(c, sparse_values.dims() == 0 ||
-                       (sparse_values.dims() == 1 && num_values == num_elems),
+    OP_REQUIRES(c,
+                sparse_values.dims() == 0 ||
+                    (sparse_values.dims() == 1 && num_values == num_elems),
                 errors::InvalidArgument("sparse_values has incorrect shape ",
                                         sparse_values.shape().DebugString(),
                                         ", should be [] or [", num_elems, "]"));
diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
index f0d19da804..d8b0f93082 100644
--- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc
+++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
@@ -38,7 +38,6 @@ namespace {
 
 class SparseToDenseTest : public OpsTestBase {
  protected:
-
   void MakeOp(int dim, DataType index_type, DataType value_type) {
     TF_ASSERT_OK(NodeDefBuilder("sparsetodense", "SparseToDense")
                      .Input(FakeInput(index_type))
diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc
index c35ba42db2..f84ffd5323 100644
--- a/tensorflow/core/kernels/sparse_xent_op.cc
+++ b/tensorflow/core/kernels/sparse_xent_op.cc
@@ -39,10 +39,10 @@ Status CheckInvalidLabelIndex(const Tensor& labels, int64 max_index) {
   if (*min_max_dim_value.first < 0 || *min_max_dim_value.second >= max_index) {
     bad_index = (*min_max_dim_value.first < 0) ? *min_max_dim_value.first
                                                : *min_max_dim_value.second;
-    return errors::InvalidArgument("Received a label value of ", bad_index,
-                                   " which is outside the valid range of [0, ",
-                                   max_index, ").  Label values: ",
-                                   labels.SummarizeValue(labels.NumElements()));
+    return errors::InvalidArgument(
+        "Received a label value of ", bad_index,
+        " which is outside the valid range of [0, ", max_index,
+        ").  Label values: ", labels.SummarizeValue(labels.NumElements()));
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/kernels/sparse_xent_op_test.cc b/tensorflow/core/kernels/sparse_xent_op_test.cc
index b8ea0d2d7e..afb0bf7626 100644
--- a/tensorflow/core/kernels/sparse_xent_op_test.cc
+++ b/tensorflow/core/kernels/sparse_xent_op_test.cc
@@ -41,10 +41,10 @@ static Graph* SparseXent(int batch_size, int num_classes) {
   return g;
 }
 
-#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                                \
-  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) {       \
+#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                          \
+  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \
     testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \
-    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters);            \
+    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters);      \
   }                                                                     \
   BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE);
 
diff --git a/tensorflow/core/kernels/split_lib.h b/tensorflow/core/kernels/split_lib.h
index ff92ffeeb3..a08949e626 100644
--- a/tensorflow/core/kernels/split_lib.h
+++ b/tensorflow/core/kernels/split_lib.h
@@ -57,7 +57,7 @@ struct Split<Eigen::SyclDevice, T> {
                   const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
                   const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes);
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc
index 25026208d1..771c633b15 100644
--- a/tensorflow/core/kernels/split_lib_cpu.cc
+++ b/tensorflow/core/kernels/split_lib_cpu.cc
@@ -49,13 +49,13 @@ void Split<Eigen::SyclDevice, T>::operator()(
     typename TTypes<T, 3>::ConstTensor input,
     const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
     const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes) {
-    output.device(d) = input.slice(slice_indices, slice_sizes);
+  output.device(d) = input.slice(slice_indices, slice_sizes);
 }
 
 #define DEFINE_SYCL_KERNELS(T) template struct Split<Eigen::SyclDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DEFINE_SYCL_KERNELS);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index 78badde27e..85f529326d 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -39,7 +39,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SplitOpBase : public OpKernel {
@@ -142,8 +142,9 @@ class SplitOpCPU : public SplitOpBase<CPUDevice, T> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -245,10 +246,11 @@ class SplitOpGPU : public SplitOpBase<GPUDevice, T> {
     const int32 split_dim =
         split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
     const int32 num_split = Base::num_outputs();
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument("Split on GPU requires input size "
-                                        "< max int32"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument("Split on GPU requires input size "
+                                "< max int32"));
     int32 prefix_dim_size;
     int32 split_dim_size;
     int32 suffix_dim_size;
@@ -304,8 +306,9 @@ class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -342,14 +345,14 @@ class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
             {prefix_dim_size, split_dim_output_size, suffix_dim_size});
 
         functor::Split<SYCLDevice, T>()(context->eigen_device<SYCLDevice>(),
-                                       result_shaped, input_reshaped,
-                                       slice_indices, slice_sizes);
+                                        result_shaped, input_reshaped,
+                                        slice_indices, slice_sizes);
       }
       indices[1] += split_dim_output_size;
     }
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_SPLIT(type)                             \
   REGISTER_KERNEL_BUILDER(Name("Split")                  \
@@ -381,11 +384,11 @@ REGISTER_GPU(bfloat16);
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(type)                               \
-  REGISTER_KERNEL_BUILDER(Name("Split")                   \
-                              .Device(DEVICE_SYCL)        \
-                              .TypeConstraint<type>("T")  \
-                              .HostMemory("split_dim"),   \
+#define REGISTER_SYCL(type)                              \
+  REGISTER_KERNEL_BUILDER(Name("Split")                  \
+                              .Device(DEVICE_SYCL)       \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("split_dim"),  \
                           SplitOpSYCL<type>)
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL);
diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc
index f1078ac349..7ff5df47d7 100644
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@@ -197,8 +197,9 @@ class SplitVOpCPU : public SplitVOpBase<CPUDevice, T, Tlen> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -305,10 +306,11 @@ class SplitVOpGPU : public SplitVOpBase<GPUDevice, T, Tlen> {
     const int32 split_dim_orig = context->input(2).flat<int32>()(0);
     const int32 split_dim =
         split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument("Split on GPU requires input size "
-                                        "< max int32"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument("Split on GPU requires input size "
+                                "< max int32"));
 
     int32 prefix_dim_size;
     int32 split_dim_size;
diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc
index affe81a555..65296f61fd 100644
--- a/tensorflow/core/kernels/stack_ops.cc
+++ b/tensorflow/core/kernels/stack_ops.cc
@@ -42,7 +42,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class Stack : public ResourceBase {
  public:
@@ -242,7 +242,7 @@ REGISTER_KERNEL_BUILDER(Name("StackV2")
                             .HostMemory("max_size")
                             .HostMemory("handle"),
                         StackOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device>
 class StackPushOp : public AsyncOpKernel {
@@ -274,11 +274,11 @@ class StackPushOp : public AsyncOpKernel {
     static constexpr int kCopyThreshold = 2048;
     static constexpr double kOccupancy = 0.7;
     if (swap_memory_ && !alloc_attrs.on_host() &&
-        ( std::is_same<Device, GPUDevice>::value
+        (std::is_same<Device, GPUDevice>::value
 #ifdef TENSORFLOW_USE_SYCL
-          || std::is_same<Device, SYCLDevice>::value
-#endif // TENSORFLOW_USE_SYCL
-        ) &&
+         || std::is_same<Device, SYCLDevice>::value
+#endif  // TENSORFLOW_USE_SYCL
+         ) &&
         tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) {
       DeviceContext* device_ctxt = ctx->op_device_context();
       auto device = static_cast<tensorflow::Device*>(ctx->device());
@@ -391,7 +391,7 @@ REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(bool);
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StackPopOp : public AsyncOpKernel {
  public:
@@ -498,7 +498,7 @@ REGISTER_SYCL_HOST_KERNEL(bool);
 
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StackCloseOp : public OpKernel {
  public:
@@ -526,6 +526,6 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("StackCloseV2").Device(DEVICE_SYCL).HostMemory("handle"),
     StackCloseOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 0fae46dea6..03fc4467a1 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -70,12 +70,11 @@ class Buffer : public ResourceBase {
     return bytes + current_bytes_ > memory_limit_;
   }
 
-  std::size_t GetTupleBytes(const Tuple & tuple)
-  {
+  std::size_t GetTupleBytes(const Tuple& tuple) {
     return std::accumulate(tuple.begin(), tuple.end(), 0,
-      [](const std::size_t & lhs, const Tensor & rhs) {
-        return lhs + rhs.TotalBytes();
-    });
+                           [](const std::size_t& lhs, const Tensor& rhs) {
+                             return lhs + rhs.TotalBytes();
+                           });
   }
 
  public:
@@ -90,19 +89,22 @@ class Buffer : public ResourceBase {
     std::size_t tuple_bytes = GetTupleBytes(*tuple);
 
     // Sanity check so that we don't block for ever below
-    if(memory_limit_ > 0 && tuple_bytes > memory_limit_) {
-      return Status(errors::ResourceExhausted("Attempted to insert "
-        "tensors with combined size of '", tuple_bytes, "' bytes into "
-        "Staging Area with a memory limit of '", memory_limit_, "'."));
+    if (memory_limit_ > 0 && tuple_bytes > memory_limit_) {
+      return Status(
+          errors::ResourceExhausted("Attempted to insert "
+                                    "tensors with combined size of '",
+                                    tuple_bytes,
+                                    "' bytes into "
+                                    "Staging Area with a memory limit of '",
+                                    memory_limit_, "'."));
     }
 
-
     // If buffer capacity is bounded wait until elements have been removed
-    if(IsBounded()) {
+    if (IsBounded()) {
       full_cond_var_.wait(lock, [tuple_bytes, this]() {
         // If there's a memory limit, check if there's space for insertion
-        bool memory_limit_valid = memory_limit_ > 0 ?
-            !WouldExceedMemoryLimit(tuple_bytes) : true;
+        bool memory_limit_valid =
+            memory_limit_ > 0 ? !WouldExceedMemoryLimit(tuple_bytes) : true;
         // If we're configured for capacity check if there's space for insertion
         bool capacity_valid = capacity_ > 0 ? !IsCapacityFull() : true;
 
@@ -186,8 +188,7 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
   ContainerInfo cinfo;
 
   // Lambda for creating the Staging Area
-  auto create_fn = [&ndef](Buffer** ret) -> Status
-  {
+  auto create_fn = [&ndef](Buffer** ret) -> Status {
     int64 capacity;
     int64 memory_limit;
     TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "capacity", &capacity));
@@ -196,7 +197,6 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
     return Status::OK();
   };
 
-
   TF_RETURN_IF_ERROR(cinfo.Init(rm, ndef, true /* use name() */));
   TF_RETURN_IF_ERROR(rm->LookupOrCreate<Buffer>(cinfo.container(), cinfo.name(),
                                                 buf, create_fn));
@@ -228,7 +228,7 @@ REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_SYCL), StageOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class UnstageOp : public OpKernel {
  public:
@@ -244,7 +244,8 @@ class UnstageOp : public OpKernel {
 
     buf->Get(&tuple);
 
-    OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(),
+    OP_REQUIRES(
+        ctx, tuple.size() == (size_t)ctx->num_outputs(),
         errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
                                 " vs. ", ctx->num_outputs()));
 
@@ -260,7 +261,7 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StagePeekOp : public OpKernel {
  public:
@@ -278,7 +279,8 @@ class StagePeekOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, buf->Peek(index, &tuple));
 
-    OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(),
+    OP_REQUIRES(
+        ctx, tuple.size() == (size_t)ctx->num_outputs(),
         errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
                                 " vs. ", ctx->num_outputs()));
 
@@ -288,17 +290,15 @@ class StagePeekOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU),
-                                              StagePeekOp);
+REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), StagePeekOp);
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index").
-                            Device(DEVICE_GPU), StagePeekOp);
+REGISTER_KERNEL_BUILDER(
+    Name("StagePeek").HostMemory("index").Device(DEVICE_GPU), StagePeekOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index")
-                          .Device(DEVICE_SYCL), StagePeekOp);
-#endif // TENSORFLOW_USE_SYCL
-
+REGISTER_KERNEL_BUILDER(
+    Name("StagePeek").HostMemory("index").Device(DEVICE_SYCL), StagePeekOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 class StageSizeOp : public OpKernel {
  public:
@@ -312,9 +312,8 @@ class StageSizeOp : public OpKernel {
     core::ScopedUnref scope(buf);
 
     // Allocate size output tensor
-    Tensor * size = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}),
-                                                     &size));
+    Tensor* size = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &size));
 
     // Set it to the actual size
     size->scalar<int32>().setConstant(buf->Size());
@@ -323,13 +322,13 @@ class StageSizeOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("StageSize").Device(DEVICE_CPU), StageSizeOp);
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size")
-                        .Device(DEVICE_GPU), StageSizeOp);
+REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size").Device(DEVICE_GPU),
+                        StageSizeOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size")
-                        .Device(DEVICE_SYCL), StageSizeOp);
-#endif // TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("StageSize").HostMemory("size").Device(DEVICE_SYCL), StageSizeOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 class StageClearOp : public OpKernel {
  public:
@@ -352,7 +351,6 @@ REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_GPU), StageClearOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_SYCL), StageClearOp);
-#endif // TENSORFLOW_USE_SYCL
-
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 7c213e14d2..8f7f91c9df 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -541,5 +541,5 @@ REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")
                             .HostMemory("strides"),
                         StridedSliceAssignOp<CPUDevice, int32>)
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index a84ba38ef4..ac1259a9ac 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -302,7 +302,7 @@ DECLARE_FOR_N_SYCL(int32);
 DECLARE_FOR_N_SYCL(int64);
 
 #undef DECLARE_FOR_N_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #undef INSTANTIATE
 #undef DECLARE_FOR_N_CPU
diff --git a/tensorflow/core/kernels/string_join_op.cc b/tensorflow/core/kernels/string_join_op.cc
index 721702bec6..28cca9f448 100644
--- a/tensorflow/core/kernels/string_join_op.cc
+++ b/tensorflow/core/kernels/string_join_op.cc
@@ -50,9 +50,9 @@ class StringJoinOp : public OpKernel {
         } else {
           OP_REQUIRES(
               context, input_shape == input.shape(),
-              errors::InvalidArgument("Input shapes do not match: ",
-                                      input_shape.DebugString(), " vs. ",
-                                      input.shape().DebugString()));
+              errors::InvalidArgument(
+                  "Input shapes do not match: ", input_shape.DebugString(),
+                  " vs. ", input.shape().DebugString()));
         }
       }
     }
diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc
index 743f113150..e29f67297f 100644
--- a/tensorflow/core/kernels/substr_op.cc
+++ b/tensorflow/core/kernels/substr_op.cc
@@ -95,9 +95,9 @@ class SubstrOp : public OpKernel {
       // Create BCast helper with shape of input and pos/len
       BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(pos_shape));
       OP_REQUIRES(context, bcast.IsValid(),
-                  errors::InvalidArgument("Incompatible shapes: ",
-                                          input_shape.DebugString(), " vs. ",
-                                          pos_shape.DebugString()));
+                  errors::InvalidArgument(
+                      "Incompatible shapes: ", input_shape.DebugString(),
+                      " vs. ", pos_shape.DebugString()));
       TensorShape output_shape = BCast::ToShape(bcast.result_shape());
       int ndims = output_shape.dims();
       Tensor* output_tensor = nullptr;
diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc
index 233b824bcc..29b21ee735 100644
--- a/tensorflow/core/kernels/summary_image_op.cc
+++ b/tensorflow/core/kernels/summary_image_op.cc
@@ -54,18 +54,20 @@ class SummaryImageOp : public OpKernel {
     const Tensor& tensor = c->input(1);
     OP_REQUIRES(c, IsLegacyScalar(tags.shape()),
                 errors::InvalidArgument("Tags must be a scalar"));
-    OP_REQUIRES(c, tensor.dims() == 4 &&
-                       (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
-                        tensor.dim_size(3) == 4),
+    OP_REQUIRES(c,
+                tensor.dims() == 4 &&
+                    (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
+                     tensor.dim_size(3) == 4),
                 errors::InvalidArgument(
                     "Tensor must be 4-D with last dim 1, 3, or 4, not ",
                     tensor.shape().DebugString()));
     const string& base_tag = tags.scalar<string>()();
 
-    OP_REQUIRES(c, tensor.dim_size(0) < (1LL << 31) &&
-                       tensor.dim_size(1) < (1LL << 31) &&
-                       tensor.dim_size(2) < (1LL << 31) &&
-                       (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29),
+    OP_REQUIRES(c,
+                tensor.dim_size(0) < (1LL << 31) &&
+                    tensor.dim_size(1) < (1LL << 31) &&
+                    tensor.dim_size(2) < (1LL << 31) &&
+                    (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29),
                 errors::InvalidArgument("Tensor too large for summary ",
                                         tensor.shape().DebugString()));
 
diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc
index b818724ec2..1f4e3418f4 100644
--- a/tensorflow/core/kernels/summary_op.cc
+++ b/tensorflow/core/kernels/summary_op.cc
@@ -41,11 +41,12 @@ class SummaryScalarOp : public OpKernel {
     const Tensor& values = c->input(1);
 
     OP_REQUIRES(
-        c, tags.IsSameSize(values) ||
-               (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())),
-        errors::InvalidArgument("tags and values not the same shape: ",
-                                tags.shape().DebugString(), " != ",
-                                values.shape().DebugString(), SingleTag(tags)));
+        c,
+        tags.IsSameSize(values) ||
+            (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())),
+        errors::InvalidArgument(
+            "tags and values not the same shape: ", tags.shape().DebugString(),
+            " != ", values.shape().DebugString(), SingleTag(tags)));
     auto Ttags = tags.flat<string>();
     auto Tvalues = values.flat<T>();
     Summary s;
diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc
index b2fd669541..f814486701 100644
--- a/tensorflow/core/kernels/tile_functor_cpu.cc
+++ b/tensorflow/core/kernels/tile_functor_cpu.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/tile_functor.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/tile_functor.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h
index 054b31ef9e..df6a666cd4 100644
--- a/tensorflow/core/kernels/tile_ops_cpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h
@@ -63,7 +63,7 @@ TF_CALL_int64(DEFINE_TYPE);
 
 #undef DEFINE_DIM
 #undef DEFINE_TYPE
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 38e77ab60f..07befa27bc 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -3279,7 +3279,6 @@ REGISTER_KERNELS(double, int64);
 
 #undef REGISTER_KERNELS
 
-
 template <typename Device, typename T>
 class ApplyAddSignOp : public OpKernel {
  public:
@@ -3362,17 +3361,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                               \
-  template <>                                                             \
-  void ApplyAddSign<GPUDevice, T>::operator()(                            \
-      const GPUDevice& d,                                                 \
-      typename TTypes<T>::Flat var,                                       \
-      typename TTypes<T>::Flat m,                                         \
-      typename TTypes<T>::ConstScalar lr,                                 \
-      typename TTypes<T>::ConstScalar alpha,                              \
-      typename TTypes<T>::ConstScalar sign_decay,                         \
-      typename TTypes<T>::ConstScalar beta,                               \
-      typename TTypes<T>::ConstFlat grad);                                \
+#define DECLARE_GPU_SPEC(T)                                           \
+  template <>                                                         \
+  void ApplyAddSign<GPUDevice, T>::operator()(                        \
+      const GPUDevice& d, typename TTypes<T>::Flat var,               \
+      typename TTypes<T>::Flat m, typename TTypes<T>::ConstScalar lr, \
+      typename TTypes<T>::ConstScalar alpha,                          \
+      typename TTypes<T>::ConstScalar sign_decay,                     \
+      typename TTypes<T>::ConstScalar beta,                           \
+      typename TTypes<T>::ConstFlat grad);                            \
   extern template struct ApplyAddSign<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -3387,7 +3384,6 @@ REGISTER_KERNELS(GPU, double);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
-
 template <typename Device, typename T>
 class ApplyPowerSignOp : public OpKernel {
  public:
@@ -3470,17 +3466,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                               \
-  template <>                                                             \
-  void ApplyPowerSign<GPUDevice, T>::operator()(                          \
-      const GPUDevice& d,                                                 \
-      typename TTypes<T>::Flat var,                                       \
-      typename TTypes<T>::Flat m,                                         \
-      typename TTypes<T>::ConstScalar lr,                                 \
-      typename TTypes<T>::ConstScalar logbase,                            \
-      typename TTypes<T>::ConstScalar sign_decay,                         \
-      typename TTypes<T>::ConstScalar beta,                               \
-      typename TTypes<T>::ConstFlat grad);                                \
+#define DECLARE_GPU_SPEC(T)                                           \
+  template <>                                                         \
+  void ApplyPowerSign<GPUDevice, T>::operator()(                      \
+      const GPUDevice& d, typename TTypes<T>::Flat var,               \
+      typename TTypes<T>::Flat m, typename TTypes<T>::ConstScalar lr, \
+      typename TTypes<T>::ConstScalar logbase,                        \
+      typename TTypes<T>::ConstScalar sign_decay,                     \
+      typename TTypes<T>::ConstScalar beta,                           \
+      typename TTypes<T>::ConstFlat grad);                            \
   extern template struct ApplyPowerSign<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index d443a6b3c1..0376a3b2c6 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/training_ops.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/training_ops.h"
 
 namespace tensorflow {
 
@@ -115,13 +115,11 @@ struct ApplyAdam<GPUDevice, T> {
     Eigen::Sizes<1> single;
     const auto one = static_cast<T>(1.0);
     m.device(d) =
-        m +
-        (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
-            (grad - m);
+        m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+                (grad - m);
     v.device(d) =
-        v +
-        (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
-            (grad.square() - v);
+        v + (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
+                (grad.square() - v);
 
     if (use_nesterov) {
       var.device(d) -=
@@ -157,9 +155,9 @@ struct ApplyRMSProp<GPUDevice, T> {
     bcast[0] = grad.dimension(0);
     Eigen::Sizes<1> single;
     const auto one = static_cast<T>(1.0);
-    ms.device(d) = ms +
-                   (rho.constant(one) - rho).reshape(single).broadcast(bcast) *
-                       (grad.square() - ms);
+    ms.device(d) =
+        ms + (rho.constant(one) - rho).reshape(single).broadcast(bcast) *
+                 (grad.square() - ms);
     mom.device(d) =
         mom * momentum.reshape(single).broadcast(bcast) +
         lr.reshape(single).broadcast(bcast) * grad /
@@ -212,7 +210,7 @@ struct ApplyAddSign<GPUDevice, T> {
     auto beta_bcast = beta.reshape(single).broadcast(bcast);
     auto one_minus_beta =
         (beta.constant(one) - beta).reshape(single).broadcast(bcast);
-    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+    m.device(d) = m * beta_bcast + grad * one_minus_beta;
 
     // The following is the GPU equivalent of the CPU version:
     // var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad;
@@ -244,7 +242,7 @@ struct ApplyPowerSign<GPUDevice, T> {
     auto beta_bcast = beta.reshape(single).broadcast(bcast);
     auto one_minus_beta =
         (beta.constant(one) - beta).reshape(single).broadcast(bcast);
-    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+    m.device(d) = m * beta_bcast + grad * one_minus_beta;
 
     // The following is the GPU equivalent of the CPU version:
     // auto grad_scale = (logbase() * sign_decay() * sign_gm).exp();
@@ -253,7 +251,7 @@ struct ApplyPowerSign<GPUDevice, T> {
     auto lr_bcast = lr.reshape(single).broadcast(bcast);
     auto logbase_bcast = logbase.reshape(single).broadcast(bcast);
     auto sign_decay_bcast = sign_decay.reshape(single).broadcast(bcast);
-    auto grad_scale =  (logbase_bcast * sign_decay_bcast * sign_gm).exp();
+    auto grad_scale = (logbase_bcast * sign_decay_bcast * sign_gm).exp();
     var.device(d) -= lr_bcast * grad_scale * grad;
   }
 };
diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc
index ffa7f87c9e..2dcc4a500e 100644
--- a/tensorflow/core/kernels/training_ops_test.cc
+++ b/tensorflow/core/kernels/training_ops_test.cc
@@ -176,8 +176,9 @@ static void Adam(int32 n, Graph** init_g, Graph** train_g) {
     auto beta2 = Scalar(g, 0.99);
     auto epsilon = Scalar(g, 1e-8);
     auto grad = Random(g, n);
-    test::graph::Multi(g, "ApplyAdam", {var, m, v, beta1_power, beta2_power, lr,
-                                        beta1, beta2, epsilon, grad});
+    test::graph::Multi(
+        g, "ApplyAdam",
+        {var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad});
     *train_g = g;
   }
 }
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 2e0d18b634..7177ad7888 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -176,9 +176,10 @@ void TransposeOp::Compute(OpKernelContext* ctx) {
     }
   }
   for (int i = 0; i < dims; ++i) {
-    OP_REQUIRES(ctx, bits[i], errors::InvalidArgument(
-                                  i, " is missing from {",
-                                  str_util::Join(permutation, ","), "}."));
+    OP_REQUIRES(
+        ctx, bits[i],
+        errors::InvalidArgument(i, " is missing from {",
+                                str_util::Join(permutation, ","), "}."));
   }
 
   // 0-D, 1-D, and identity transposes do nothing.
diff --git a/tensorflow/core/kernels/typed_queue.h b/tensorflow/core/kernels/typed_queue.h
index 0d608d9b87..43dcb4cef7 100644
--- a/tensorflow/core/kernels/typed_queue.h
+++ b/tensorflow/core/kernels/typed_queue.h
@@ -58,9 +58,9 @@ Status TypedQueue<SubQueue>::Initialize() {
   if (!component_shapes_.empty() &&
       component_dtypes_.size() != component_shapes_.size()) {
     return errors::InvalidArgument(
-        "Different number of component types.  ", "Types: ",
-        DataTypeSliceString(component_dtypes_), ", Shapes: ",
-        ShapeListString(component_shapes_));
+        "Different number of component types.  ",
+        "Types: ", DataTypeSliceString(component_dtypes_),
+        ", Shapes: ", ShapeListString(component_shapes_));
   }
 
   mutex_lock lock(mu_);
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index 397bdd5670..764b6a252a 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -34,7 +34,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class UnpackOp : public OpKernel {
@@ -65,8 +65,9 @@ class UnpackOp : public OpKernel {
     output_shape.RemoveDim(axis);
     const int64 output_size = output_shape.num_elements();
     OP_REQUIRES(
-        context, FastBoundsCheck(output_size,
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(output_size,
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("output size must fit in Eigen DenseIndex"));
 
 // This optimization is currently not applicable for SYCL devices
diff --git a/tensorflow/core/kernels/word2vec_kernels.cc b/tensorflow/core/kernels/word2vec_kernels.cc
index 2d05d72bff..3477445197 100644
--- a/tensorflow/core/kernels/word2vec_kernels.cc
+++ b/tensorflow/core/kernels/word2vec_kernels.cc
@@ -188,9 +188,9 @@ class SkipgramOp : public OpKernel {
       ++corpus_size_;
     }
     if (corpus_size_ < window_size_ * 10) {
-      return errors::InvalidArgument("The text file ", filename,
-                                     " contains too little data: ",
-                                     corpus_size_, " words");
+      return errors::InvalidArgument(
+          "The text file ", filename,
+          " contains too little data: ", corpus_size_, " words");
     }
     typedef std::pair<string, int32> WordFreq;
     std::vector<WordFreq> ordered;
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index 0f8d027caa..a6a71fdfaf 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SoftmaxXentWithLogitsOp : public OpKernel {
@@ -44,8 +44,8 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
     OP_REQUIRES(context, logits_in.IsSameSize(labels_in),
                 errors::InvalidArgument(
                     "logits and labels must be same size: logits_size=",
-                    logits_in.shape().DebugString(), " labels_size=",
-                    labels_in.shape().DebugString()));
+                    logits_in.shape().DebugString(),
+                    " labels_size=", labels_in.shape().DebugString()));
     OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
                 errors::InvalidArgument("logits must be 2-dimensional"));
     // As we already tested that both inputs have the same shape no need to
@@ -72,7 +72,7 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
       functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
               labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
               back_out->matrix<T>());
-      }
+    }
   }
 };
 
@@ -87,7 +87,7 @@ struct XentFunctorBase {
                   typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
     XentEigenImpl<Device, T>::Compute(d, logits, labels, scratch, loss,
-                                         backprop);
+                                      backprop);
   }
 };
 
@@ -97,7 +97,7 @@ struct XentFunctor<CPUDevice, T> : XentFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct XentFunctor<SYCLDevice, T> : XentFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace functor
 
 #define REGISTER_CPU(T)                                         \
@@ -129,6 +129,6 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<float>("T"),
                         SoftmaxXentWithLogitsOp<SYCLDevice, float>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc
index e294701246..481f3b7ba4 100644
--- a/tensorflow/core/kernels/xsmm_conv2d_test.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc
@@ -13,18 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/conv_ops.h"
-#include "tensorflow/core/platform/test.h"
+#include "include/libxsmm.h"
+#include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
-#include "include/libxsmm.h"
-#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace {
 
-
 typedef struct {
   int nImg;
   int nIfm;
@@ -49,45 +48,41 @@ typedef struct {
   int stride_w;
 } naive_conv_t;
 
-
-LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor &nhwc, int N, int H, int W, int C)
-{
-  LIBXSMM_VLA_DECL(4, const float,  input, nchw, C, H, W);
+LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor& nhwc,
+                                            int N, int H, int W, int C) {
+  LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W);
   int n, h, w, c;
-  auto output =  nhwc.flat<float>();
-  for ( n = 0; n < N; n++ ) {
-    for ( h = 0; h < H; h++ ) {
-      for ( w = 0; w < W; w++ ) {
-        for ( c = 0; c < C; c++ ) {
-          output(n*H*W*C + h*W*C +w*C + c)  =
-          LIBXSMM_VLA_ACCESS(4,  input, n, c, h, w, C, H, W);
+  auto output = nhwc.flat<float>();
+  for (n = 0; n < N; n++) {
+    for (h = 0; h < H; h++) {
+      for (w = 0; w < W; w++) {
+        for (c = 0; c < C; c++) {
+          output(n * H * W * C + h * W * C + w * C + c) =
+              LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W);
         }
       }
     }
   }
 }
 
-
-LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor  &rsck, int R, int S, int C, int K)
-{
-  LIBXSMM_VLA_DECL(4, const float,  input, kcrs, C, R, S);
+LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor& rsck,
+                                            int R, int S, int C, int K) {
+  LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S);
   int r, s, c, k;
-  auto output =  rsck.flat<float>();
-
-  for ( r = 0; r < R; r++ ) {
-    for ( s = 0; s < S; s++ ) {
-      for ( c = 0; c < C; c++ ) {
-        for ( k = 0; k < K; k++ ) {
-          output(r*S*C*K + s*C*K + c*K + k) =
-          LIBXSMM_VLA_ACCESS(4,  input, k, c, r, s, C, R, S);
+  auto output = rsck.flat<float>();
+
+  for (r = 0; r < R; r++) {
+    for (s = 0; s < S; s++) {
+      for (c = 0; c < C; c++) {
+        for (k = 0; k < K; k++) {
+          output(r * S * C * K + s * C * K + c * K + k) =
+              LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S);
         }
       }
     }
   }
 }
 
-
-
 LIBXSMM_INLINE void zero_buf(float* buf, long size) {
   int i;
   for (i = 0; i < size; ++i) {
@@ -95,52 +90,53 @@ LIBXSMM_INLINE void zero_buf(float* buf, long size) {
   }
 }
 
-LIBXSMM_INLINE void copy_buf(Tensor &dst,float *src,long size) {
-  long  i;
-  auto output =  dst.flat<float>();
-  for (i = 0; i < size; ++i)
-          output(i) = src[i];
+LIBXSMM_INLINE void copy_buf(Tensor& dst, float* src, long size) {
+  long i;
+  auto output = dst.flat<float>();
+  for (i = 0; i < size; ++i) output(i) = src[i];
 }
 
-LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne)
-{
+LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne) {
   int i;
   zero_buf(buf, size);
   for (i = 0; i < size; ++i) {
-    buf[i] = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? drand48() : (0.05 - drand48()/10.0)));
+    buf[i] =
+        (float)((initOne != 0)
+                    ? 1.0
+                    : ((initPos != 0) ? drand48() : (0.05 - drand48() / 10.0)));
   }
 }
 
-
-
-LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float* output, const float* filter)
-{
-  int nImg      = param->nImg;
-  int nIfm      = param->nIfm;
-  int nOfm      = param->nOfm;
-  int ifhp      = param->ifhp;
-  int ifwp      = param->ifwp;
-  int ofhp      = param->ofhp;
-  int ofwp      = param->ofwp;
-  int ifh       = param->ifh;
-  int ifw       = param->ifw;
-  int ofh       = param->ofh;
-  int ofw       = param->ofw;
-  int pad_h     = param->pad_h;
-  int pad_w     = param->pad_w;
-  int pad_h_in  = param->pad_h_in;
-  int pad_w_in  = param->pad_w_in;
+LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input,
+                                  float* output, const float* filter) {
+  int nImg = param->nImg;
+  int nIfm = param->nIfm;
+  int nOfm = param->nOfm;
+  int ifhp = param->ifhp;
+  int ifwp = param->ifwp;
+  int ofhp = param->ofhp;
+  int ofwp = param->ofwp;
+  int ifh = param->ifh;
+  int ifw = param->ifw;
+  int ofh = param->ofh;
+  int ofw = param->ofw;
+  int pad_h = param->pad_h;
+  int pad_w = param->pad_w;
+  int pad_h_in = param->pad_h_in;
+  int pad_w_in = param->pad_w_in;
   int pad_h_out = param->pad_h_out;
   int pad_w_out = param->pad_w_out;
-  int kh        = param->kh;
-  int kw        = param->kw;
-  int stride_h  = param->stride_h;
-  int stride_w  = param->stride_w;
+  int kh = param->kh;
+  int kw = param->kw;
+  int stride_h = param->stride_h;
+  int stride_w = param->stride_w;
   /* loop counters */
   int img, ofm, ifm, oj, oi, ij, ii, kj, ki;
 
-  LIBXSMM_VLA_DECL(4,       float, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp);
-  LIBXSMM_VLA_DECL(4, const float,  input_t,  input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
+  LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out),
+                   nOfm, ofhp, ofwp);
+  LIBXSMM_VLA_DECL(4, const float, input_t,
+                   input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
   LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw);
 
   for (img = 0; img < nImg; ++img) {
@@ -151,12 +147,15 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float
           for (oi = 0; oi < ofw; ++oi) {
             ii = oi * stride_w - pad_w;
             for (kj = 0; kj < kh; ++kj) {
-              if(ij+kj < 0 || ij+kj >= ifh) continue;
+              if (ij + kj < 0 || ij + kj >= ifh) continue;
               for (ki = 0; ki < kw; ++ki) {
-                if(ii+ki < 0 || ii+ki >= ifw) continue;
-                LIBXSMM_VLA_ACCESS(  4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) +=
-                  LIBXSMM_VLA_ACCESS(4,  input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp)
-                * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw);
+                if (ii + ki < 0 || ii + ki >= ifw) continue;
+                LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp,
+                                   ofwp) +=
+                    LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki,
+                                       nIfm, ifhp, ifwp) *
+                    LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh,
+                                       kw);
               }
             }
           }
@@ -168,134 +167,118 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float
 
 void RunXsmmVsGeneric() {}
 
-
 class XsmmConv2DTest : public OpsTestBase {
  protected:
   void MakeOp(int stride) {
-
     TF_CHECK_OK(NodeDefBuilder("xsmm", "Conv2D")
-                      .Input(FakeInput(DT_FLOAT))
-                      .Input(FakeInput(DT_FLOAT))
-                      .Attr("strides", {1, stride,stride, 1})
-                      .Attr("padding", "VALID" )
-                      .Finalize(node_def()));
-
+                    .Input(FakeInput(DT_FLOAT))
+                    .Input(FakeInput(DT_FLOAT))
+                    .Attr("strides", {1, stride, stride, 1})
+                    .Attr("padding", "VALID")
+                    .Finalize(node_def()));
 
     TF_ASSERT_OK(InitOp());
   }
 };
 
 TEST_F(XsmmConv2DTest, Basic) {
-     MakeOp(1);
+  MakeOp(1);
 
-     // setup scoped allocator, which uses cpu_allocator() for this scope
-     const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
+  // setup scoped allocator, which uses cpu_allocator() for this scope
+  const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
 
-     int ifw = 14;           /* input width, "W" */
-     int ifh = 14;           /* input height, "H" */
-     int nImg = 32;          /* mini-batch size, "N" */
-     int nIfm = 64;         /* number of input feature maps, "C" */
-     int nOfm = 64;         /* number of output feature maps, "K" */
-     int kh = 3;             /* filter height, "R" */
-     int kw = 3;             /* filter width, "S" */
-     int pad = 0;            /* padding in output */
-     int stride = 1;         /* stride when accessing inputs */
+  int ifw = 14;   /* input width, "W" */
+  int ifh = 14;   /* input height, "H" */
+  int nImg = 32;  /* mini-batch size, "N" */
+  int nIfm = 64;  /* number of input feature maps, "C" */
+  int nOfm = 64;  /* number of output feature maps, "K" */
+  int kh = 3;     /* filter height, "R" */
+  int kw = 3;     /* filter width, "S" */
+  int pad = 0;    /* padding in output */
+  int stride = 1; /* stride when accessing inputs */
 
+  int stride_w = stride;
+  int stride_h = stride;
+  int pad_h = pad;
+  int pad_w = pad;
 
-     int stride_w = stride;
-     int stride_h = stride;
-     int pad_h = pad;
-     int pad_w = pad;
+  int pad_h_in = pad_h;
+  int pad_w_in = pad_w;
 
-     int pad_h_in = pad_h;
-     int pad_w_in = pad_w;
-
-     int pad_h_out = 0;
-     int pad_w_out = 0;
+  int pad_h_out = 0;
+  int pad_w_out = 0;
 
   /* deriving some values for naive code */
-     int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
-     int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
-     int ifhp = ifh + 2 * pad_h_in;
-     int ifwp = ifw + 2 * pad_w_in;
-     int ofhp = ofh + 2 * pad_h_out;
-     int ofwp = ofw + 2 * pad_w_out;
-
-
-    //Initialization of Filter and Image
-
-    /* allocate data */
-     float *naive_input           = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152);
-     float *naive_output          = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152);
-     float *naive_filter          = (float*)libxsmm_aligned_scratch( nOfm*nIfm*kh*kw*    sizeof(float), 2097152);
-     /* initialize data */
-     init_buf(naive_input,          nImg*nIfm*ifhp*ifwp, 0, 0);
-     zero_buf(naive_output,         nImg*nOfm*ofhp*ofwp);
-     init_buf(naive_filter,         nOfm*nIfm*kh*kw, 0, 0);
-
-
-     Tensor image(DT_FLOAT,
-                 {nImg, ifhp, ifwp, nIfm});
-
-
-     Tensor filter(DT_FLOAT, {kh,kw,nIfm,nOfm});
-
-
-     naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
-     naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm);
-
-
-    //Run naive convolution
-
-     naive_conv_t naive_param;
-
-     naive_param.nImg = nImg;
-     naive_param.nIfm = nIfm;
-     naive_param.nOfm = nOfm;
-     naive_param.ifhp = ifhp;
-     naive_param.ifwp = ifwp;
-     naive_param.ofhp = ofhp;
-     naive_param.ofwp = ofwp;
-     naive_param.ifh = ifh;
-     naive_param.ifw = ifw;
-     naive_param.ofh = ofh;
-     naive_param.ofw = ofw;
-     naive_param.pad_h = pad_h;
-     naive_param.pad_w = pad_w;
-     naive_param.pad_h_in = pad_h_in;
-     naive_param.pad_w_in = pad_w_in;
-     naive_param.pad_h_out = pad_h_out;
-     naive_param.pad_w_out = pad_w_out;
-     naive_param.kh = kh;
-     naive_param.kw = kw;
-     naive_param.stride_h = stride_h;
-     naive_param.stride_w = stride_w;
-
-
-     naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
-
-
-
-     AddInputFromArray<float>(image.shape(), image.flat<float>());
-     AddInputFromArray<float>(filter.shape(), filter.flat<float>());
-
-
-
-     //Run Op (TF)
-     TF_ASSERT_OK(RunOpKernel());
-
-     // Check the output.
-     Tensor expected(DT_FLOAT, {nImg,ofhp,ofwp, nOfm});
-     naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
-
-
-     test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
-     libxsmm_free(naive_input);
-     libxsmm_free(naive_output);
-     libxsmm_free(naive_filter);
-
-
-
+  int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
+  int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
+  int ifhp = ifh + 2 * pad_h_in;
+  int ifwp = ifw + 2 * pad_w_in;
+  int ofhp = ofh + 2 * pad_h_out;
+  int ofwp = ofw + 2 * pad_w_out;
+
+  // Initialization of Filter and Image
+
+  /* allocate data */
+  float* naive_input = (float*)libxsmm_aligned_scratch(
+      nImg * nIfm * ifhp * ifwp * sizeof(float), 2097152);
+  float* naive_output = (float*)libxsmm_aligned_scratch(
+      nImg * nOfm * ofhp * ofwp * sizeof(float), 2097152);
+  float* naive_filter = (float*)libxsmm_aligned_scratch(
+      nOfm * nIfm * kh * kw * sizeof(float), 2097152);
+  /* initialize data */
+  init_buf(naive_input, nImg * nIfm * ifhp * ifwp, 0, 0);
+  zero_buf(naive_output, nImg * nOfm * ofhp * ofwp);
+  init_buf(naive_filter, nOfm * nIfm * kh * kw, 0, 0);
+
+  Tensor image(DT_FLOAT, {nImg, ifhp, ifwp, nIfm});
+
+  Tensor filter(DT_FLOAT, {kh, kw, nIfm, nOfm});
+
+  naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
+  naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm);
+
+  // Run naive convolution
+
+  naive_conv_t naive_param;
+
+  naive_param.nImg = nImg;
+  naive_param.nIfm = nIfm;
+  naive_param.nOfm = nOfm;
+  naive_param.ifhp = ifhp;
+  naive_param.ifwp = ifwp;
+  naive_param.ofhp = ofhp;
+  naive_param.ofwp = ofwp;
+  naive_param.ifh = ifh;
+  naive_param.ifw = ifw;
+  naive_param.ofh = ofh;
+  naive_param.ofw = ofw;
+  naive_param.pad_h = pad_h;
+  naive_param.pad_w = pad_w;
+  naive_param.pad_h_in = pad_h_in;
+  naive_param.pad_w_in = pad_w_in;
+  naive_param.pad_h_out = pad_h_out;
+  naive_param.pad_w_out = pad_w_out;
+  naive_param.kh = kh;
+  naive_param.kw = kw;
+  naive_param.stride_h = stride_h;
+  naive_param.stride_w = stride_w;
+
+  naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
+
+  AddInputFromArray<float>(image.shape(), image.flat<float>());
+  AddInputFromArray<float>(filter.shape(), filter.flat<float>());
+
+  // Run Op (TF)
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(DT_FLOAT, {nImg, ofhp, ofwp, nOfm});
+  naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
+
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+  libxsmm_free(naive_input);
+  libxsmm_free(naive_output);
+  libxsmm_free(naive_filter);
 }
 
 /*
@@ -325,7 +308,8 @@ TEST(XsmmConv2DTest, Basic) {
     desc.threads = num_threads;
     desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
     desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
-    desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
+    desc.filter_format =
+LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
     desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
     desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
-- 
GitLab


From 995378c4c9ff156cae7a365cfdc1480a3ee6d0bf Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 26 Jan 2018 12:08:47 -0800
Subject: [PATCH 255/966] Switch over to max_pool_v2 in Python (#14983)

* Switch over to max_pool_v2 in Python

This fix is a follow up to 11875 so that MaxPool in Python
use v2 version. As 11875 has been merged some time ago,
this fix conforms to the deprecation policy.

This fix is realted to 11875 and 4746.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update test cases in contrib/specs/python/specs_test due to MaxPool -> MaxPoolV2

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update tensorflow/contrib/receptive_field

Update tensorflow/contrib/receptive_field
due to max_pool's strides and ksize from  attr -> input

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Remove const restriction for strides and ksize

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Register MaxPoolV2 with XLA

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Reformat with clang-format -i --style=Google

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../compiler/tf2xla/kernels/pooling_ops.cc    | 67 ++++++++++++++-----
 .../python/util/parse_layer_parameters.py     | 58 +++++++++++-----
 tensorflow/contrib/specs/python/specs_test.py | 14 ++--
 .../python/kernel_tests/pooling_ops_test.py   | 21 +++---
 tensorflow/python/ops/nn_ops.py               | 12 ++--
 5 files changed, 115 insertions(+), 57 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 0b5a38967a..d092e2e8d6 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -37,21 +37,23 @@ class PoolingOp : public XlaOpKernel {
  public:
   PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
-    std::vector<int32> ksize_int;
-    std::vector<int32> stride_int;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int));
-    OP_REQUIRES(ctx, ksize_int.size() == num_dims(),
-                errors::InvalidArgument("Sliding window ksize field must "
-                                        "specify ",
-                                        num_dims(), " dimensions"));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int));
-    OP_REQUIRES(ctx, stride_int.size() == num_dims(),
-                errors::InvalidArgument("Sliding window stride field must "
-                                        "specify ",
-                                        num_dims(), " dimensions"));
-    for (int i = 0; i < num_dims(); ++i) {
-      ksize_.push_back(ksize_int[i]);
-      stride_.push_back(stride_int[i]);
+    if (ctx->num_inputs() == 1) {
+      std::vector<int32> ksize_int;
+      std::vector<int32> stride_int;
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int));
+      OP_REQUIRES(ctx, ksize_int.size() == num_dims(),
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int));
+      OP_REQUIRES(ctx, stride_int.size() == num_dims(),
+                  errors::InvalidArgument("Sliding window stride field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      for (int i = 0; i < num_dims(); ++i) {
+        ksize_.push_back(ksize_int[i]);
+        stride_.push_back(stride_int[i]);
+      }
     }
     Padding padding;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding));
@@ -77,6 +79,33 @@ class PoolingOp : public XlaOpKernel {
     xla::ComputationDataHandle input = ctx->Input(0);
     const TensorShape input_shape = ctx->InputShape(0);
 
+    std::vector<int64> ksize = ksize_;
+    std::vector<int64> stride = stride_;
+    if (ctx->num_inputs() != 1) {
+      const TensorShape ksize_shape = ctx->InputShape(1);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape),
+                  errors::InvalidArgument("ksize must be a vector, not shape ",
+                                          ksize_shape.DebugString()));
+      OP_REQUIRES(ctx, ksize_shape.num_elements() == num_dims(),
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      ksize.clear();
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &ksize));
+
+      const TensorShape stride_shape = ctx->InputShape(2);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape),
+                  errors::InvalidArgument("stride must be a vector, not shape ",
+                                          stride_shape.DebugString()));
+      OP_REQUIRES(ctx, stride_shape.num_elements() == num_dims(),
+                  errors::InvalidArgument("Sliding window stride field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      stride.clear();
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride));
+    }
     OP_REQUIRES(ctx, input_shape.dims() == num_dims(),
                 errors::InvalidArgument("Input to ", type_string(),
                                         " operator must have ", num_dims(),
@@ -84,8 +113,8 @@ class PoolingOp : public XlaOpKernel {
 
     const DataType type = input_type(0);
     xla::ComputationDataHandle pooled = ctx->builder()->ReduceWindow(
-        input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize_,
-        stride_, padding_);
+        input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize,
+        stride, padding_);
     ctx->SetOutput(0, PostProcessOutput(ctx, pooled, type, input_shape));
   }
 
@@ -130,6 +159,10 @@ class MaxPool2DOp : public MaxPoolOp {
   }
 };
 REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp);
+REGISTER_XLA_OP(Name("MaxPoolV2")
+                    .CompileTimeConstInput("ksize")
+                    .CompileTimeConstInput("strides"),
+                MaxPool2DOp);
 
 class MaxPool3DOp : public MaxPoolOp {
  public:
diff --git a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
index 44998b3b65..69188a461b 100644
--- a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
+++ b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
@@ -35,7 +35,7 @@ _VALID_PADDING = ["VALID", b"VALID"]
 _SAME_PADDING = ["SAME", b"SAME"]
 
 
-def _stride_size(node):
+def _stride_size(node, name_to_node):
   """Computes stride size given a TF node.
 
   Args:
@@ -45,10 +45,20 @@ def _stride_size(node):
     stride_x: Stride size for horizontal direction (integer).
     stride_y: Stride size for vertical direction (integer).
   """
-  strides_attr = node.attr["strides"]
-  logging.vlog(4, "strides_attr = %s", strides_attr)
-  stride_y = strides_attr.list.i[1]
-  stride_x = strides_attr.list.i[2]
+  if node.op == "MaxPoolV2":
+    strides_input_name = node.input[2]
+    if not strides_input_name.endswith("/strides"):
+      raise ValueError("Strides name does not end with '/strides'")
+    strides_node = name_to_node[strides_input_name]
+    value = strides_node.attr["value"]
+    t = make_ndarray(value.tensor)
+    stride_y = t[1]
+    stride_x = t[2]
+  else:
+    strides_attr = node.attr["strides"]
+    logging.vlog(4, "strides_attr = %s", strides_attr)
+    stride_y = strides_attr.list.i[1]
+    stride_x = strides_attr.list.i[2]
   return stride_x, stride_y
 
 
@@ -144,7 +154,7 @@ def _padding_size_conv_pool(node, kernel_size, stride, input_resolution=None):
   return total_padding, padding
 
 
-def _pool_kernel_size(node):
+def _pool_kernel_size(node, name_to_node):
   """Computes kernel size given a TF pooling node.
 
   Args:
@@ -157,13 +167,27 @@ def _pool_kernel_size(node):
   Raises:
     ValueError: If pooling is invalid.
   """
-  ksize = node.attr["ksize"]
-  kernel_size_y = ksize.list.i[1]
-  kernel_size_x = ksize.list.i[2]
-  if ksize.list.i[0] != 1:
-    raise ValueError("pool ksize for first dim is not 1")
-  if ksize.list.i[3] != 1:
-    raise ValueError("pool ksize for last dim is not 1")
+  if node.op == "MaxPoolV2":
+    ksize_input_name = node.input[1]
+    if not ksize_input_name.endswith("/ksize"):
+      raise ValueError("Kernel size name does not end with '/ksize'")
+    ksize_node = name_to_node[ksize_input_name]
+    value = ksize_node.attr["value"]
+    t = make_ndarray(value.tensor)
+    kernel_size_y = t[1]
+    kernel_size_x = t[2]
+    if t[0] != 1:
+      raise ValueError("pool ksize for first dim is not 1")
+    if t[3] != 1:
+      raise ValueError("pool ksize for last dim is not 1")
+  else:
+    ksize = node.attr["ksize"]
+    kernel_size_y = ksize.list.i[1]
+    kernel_size_x = ksize.list.i[2]
+    if ksize.list.i[0] != 1:
+      raise ValueError("pool ksize for first dim is not 1")
+    if ksize.list.i[3] != 1:
+      raise ValueError("pool ksize for last dim is not 1")
   return kernel_size_x, kernel_size_y
 
 
@@ -243,7 +267,7 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
   logging.vlog(3, "node.op = %s", node.op)
   logging.vlog(4, "node = %s", node)
   if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative":
-    stride_x, stride_y = _stride_size(node)
+    stride_x, stride_y = _stride_size(node, name_to_node)
     kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_node)
     # Compute the padding for this node separately for each direction.
     total_padding_x, padding_x = _padding_size_conv_pool(
@@ -260,9 +284,9 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
     stride_y = 1
     total_padding_x, padding_x, total_padding_y, padding_y = (
         _padding_size_pad_layer(node, name_to_node))
-  elif node.op == "MaxPool" or node.op == "AvgPool":
-    stride_x, stride_y = _stride_size(node)
-    kernel_size_x, kernel_size_y = _pool_kernel_size(node)
+  elif node.op == "MaxPool" or node.op == "MaxPoolV2" or node.op == "AvgPool":
+    stride_x, stride_y = _stride_size(node, name_to_node)
+    kernel_size_x, kernel_size_y = _pool_kernel_size(node, name_to_node)
     # Compute the padding for this node separately for each direction.
     total_padding_x, padding_x = _padding_size_conv_pool(
         node, kernel_size_x, stride_x, input_resolution[1]
diff --git a/tensorflow/contrib/specs/python/specs_test.py b/tensorflow/contrib/specs/python/specs_test.py
index 41782a9fc9..d5f61d1b69 100644
--- a/tensorflow/contrib/specs/python/specs_test.py
+++ b/tensorflow/contrib/specs/python/specs_test.py
@@ -87,7 +87,7 @@ class SpecsTest(test.TestCase):
       self.assertEqual(tuple(result.shape), (1, 8, 8, 5))
       self.assertEqual(
           summaries.tf_spec_structure(spec, inputs),
-          "_ maxpool maxpool maxpool")
+          "_ _ _ maxpoolv2 _ _ maxpoolv2 _ _ maxpoolv2")
 
   def testAbbrevPower(self):
     with self.test_session():
@@ -100,10 +100,10 @@ class SpecsTest(test.TestCase):
       self.assertEqual(tuple(result.shape), (1, 8, 8, 5))
       self.assertEqual(
           summaries.tf_spec_structure(spec, inputs),
-          "_ variablev2 conv variablev2 biasadd relu maxpool"
+          "_ variablev2 conv variablev2 biasadd relu _ _ maxpoolv2"
           " variablev2 conv variablev2"
-          " biasadd relu maxpool variablev2 conv variablev2"
-          " biasadd relu maxpool")
+          " biasadd relu _ _ maxpoolv2 variablev2 conv variablev2"
+          " biasadd relu _ _ maxpoolv2")
 
   def testAbbrevPower2(self):
     with self.test_session():
@@ -117,10 +117,10 @@ class SpecsTest(test.TestCase):
       self.assertEqual(tuple(result.shape), (1, 8, 8, 5))
       self.assertEqual(
           summaries.tf_spec_structure(spec, inputs),
-          "_ variablev2 conv variablev2 biasadd relu maxpool"
+          "_ variablev2 conv variablev2 biasadd relu _ _ maxpoolv2"
           " variablev2 conv variablev2 biasadd relu"
-          " maxpool variablev2 conv variablev2 biasadd relu"
-          " maxpool")
+          " _ _ maxpoolv2 variablev2 conv variablev2 biasadd relu"
+          " _ _ maxpoolv2")
 
   def testConc(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 3263ed1a60..4466beeec9 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -1811,16 +1811,17 @@ class PoolingTest(test.TestCase):
       if test.is_gpu_available():
         pool_funcs.append(nn_ops.max_pool_with_argmax)
       for pool_func in pool_funcs:
-        # Illegal strides.
-        with self.assertRaisesRegexp(
-            errors_impl.UnimplementedError,
-            "Pooling is not yet supported on the batch"):
-          sess.run(
-              pool_func(
-                  array_ops.placeholder(dtypes.float32),
-                  ksize=[1, 1, 1, 1],
-                  strides=[2, 1, 1, 1],
-                  padding="SAME"))
+        if pool_func != nn_ops.max_pool:
+          # Illegal strides.
+          with self.assertRaisesRegexp(
+              errors_impl.UnimplementedError,
+              "Pooling is not yet supported on the batch"):
+            sess.run(
+                pool_func(
+                    array_ops.placeholder(dtypes.float32),
+                    ksize=[1, 1, 1, 1],
+                    strides=[2, 1, 1, 1],
+                    padding="SAME"))
 
         # Filter too large.
         with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 32b14f86b5..644bb3af8a 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -2070,12 +2070,12 @@ def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   """
   with ops.name_scope(name, "MaxPool", [value]) as name:
     value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops._max_pool(value,
-                                ksize=ksize,
-                                strides=strides,
-                                padding=padding,
-                                data_format=data_format,
-                                name=name)
+    return gen_nn_ops._max_pool_v2(value,
+                                  ksize=ksize,
+                                  strides=strides,
+                                  padding=padding,
+                                  data_format=data_format,
+                                  name=name)
 
 
 @ops.RegisterStatistics("Conv2D", "flops")
-- 
GitLab


From 4bfaa1135a52e45b592ea60a7691ee0b812e5220 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 12:12:26 -0800
Subject: [PATCH 256/966] Fix the documentation for the dense layer for how
 rank > 2 inputs are handled.

PiperOrigin-RevId: 183425868
---
 tensorflow/python/layers/core.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index e5b93a54f7..7bf62d45b8 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -49,9 +49,6 @@ class Dense(base.Layer):
   and `bias` is a bias vector created by the layer
   (only if `use_bias` is `True`).
 
-  Note: if the input to the layer has a rank greater than 2, then it is
-  flattened prior to the initial matrix multiply by `kernel`.
-
   Arguments:
     units: Integer or Long, dimensionality of the output space.
     activation: Activation function (callable). Set it to None to maintain a
@@ -199,9 +196,6 @@ def dense(
   and `bias` is a bias vector created by the layer
   (only if `use_bias` is `True`).
 
-  Note: if the `inputs` tensor has a rank greater than 2, then it is
-  flattened prior to the initial matrix multiply by `kernel`.
-
   Arguments:
     inputs: Tensor input.
     units: Integer or Long, dimensionality of the output space.
@@ -230,7 +224,8 @@ def dense(
       by the same name.
 
   Returns:
-    Output tensor.
+    Output tensor the same shape as `inputs` except the last dimension is of
+    size `units`.
 
   Raises:
     ValueError: if eager execution is enabled.
-- 
GitLab


From 095b1f13c64ddc815b7b47291adcd9553496fca6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 12:40:01 -0800
Subject: [PATCH 257/966] Cleanup: Ran clang-format on all *.{cc,h} in
 tensorflow/core/ops.

PiperOrigin-RevId: 183429339
---
 tensorflow/core/ops/array_ops.cc                | 17 +++++++++--------
 tensorflow/core/ops/array_ops_test.cc           |  9 +++++++--
 .../core/ops/candidate_sampling_ops_test.cc     |  9 ++++++---
 tensorflow/core/ops/functional_grad.cc          |  2 +-
 tensorflow/core/ops/math_ops.cc                 |  8 ++++----
 tensorflow/core/ops/nn_ops.cc                   | 12 ++++++------
 tensorflow/core/ops/sdca_ops.cc                 |  2 +-
 tensorflow/core/ops/string_ops.cc               |  6 +++---
 tensorflow/core/ops/training_ops_test.cc        |  2 +-
 9 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 279a5876f9..fb9e8ad50c 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -977,8 +977,8 @@ REGISTER_OP("GatherNd")
       if (c->Value(r_dim) > c->Rank(params)) {
         return errors::InvalidArgument(
             "indices.shape[-1] must be <= params.rank, but saw indices shape: ",
-            c->DebugString(indices), " and params shape: ",
-            c->DebugString(params));
+            c->DebugString(indices),
+            " and params shape: ", c->DebugString(params));
       }
 
       // Remove r_dim from indices to get output.
@@ -1252,12 +1252,12 @@ REGISTER_OP("ReverseSequence")
       // Validate batch_dim and seq_dim against input.
       const int32 input_rank = c->Rank(input);
       if (batch_dim >= input_rank) {
-        return errors::InvalidArgument("batch_dim must be < input rank: ",
-                                       batch_dim, " vs. ", input_rank);
+        return errors::InvalidArgument(
+            "batch_dim must be < input rank: ", batch_dim, " vs. ", input_rank);
       }
       if (seq_dim >= input_rank) {
-        return errors::InvalidArgument("seq_dim must be < input rank: ",
-                                       seq_dim, " vs. ", input_rank);
+        return errors::InvalidArgument(
+            "seq_dim must be < input rank: ", seq_dim, " vs. ", input_rank);
       }
 
       DimensionHandle batch_dim_dim = c->Dim(input, batch_dim);
@@ -2638,8 +2638,9 @@ Status ScatterNdShape(InferenceContext* c) {
       Status s = c->Merge(prefix_indices, prefix_updates, &unused);
       if (!s.ok()) {
         return errors::InvalidArgument(
-            "The outer ", outer_dims, " dimensions of indices.shape=",
-            c->DebugString(indices_shape), " must match the outer ", outer_dims,
+            "The outer ", outer_dims,
+            " dimensions of indices.shape=", c->DebugString(indices_shape),
+            " must match the outer ", outer_dims,
             " dimensions of updates.shape=", c->DebugString(updates_shape),
             ": ", s.error_message());
       }
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index a182fd1c47..86d64635f4 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -142,8 +142,13 @@ TEST(ArrayOpsTest, Const_ShapeFn) {
 
 TEST(ArrayOpsTest, UnchangedShapes_ShapeFn) {
   for (const char* op_name : {
-           "CheckNumerics", "Identity", "RefIdentity", "QuantizeAndDequantize",
-           "StopGradient", "ZerosLike", "OnesLike",
+           "CheckNumerics",
+           "Identity",
+           "RefIdentity",
+           "QuantizeAndDequantize",
+           "StopGradient",
+           "ZerosLike",
+           "OnesLike",
        }) {
     ShapeInferenceTestOp op(op_name);
     INFER_OK(op, "?", "in0");
diff --git a/tensorflow/core/ops/candidate_sampling_ops_test.cc b/tensorflow/core/ops/candidate_sampling_ops_test.cc
index c79b443914..f367371604 100644
--- a/tensorflow/core/ops/candidate_sampling_ops_test.cc
+++ b/tensorflow/core/ops/candidate_sampling_ops_test.cc
@@ -23,9 +23,12 @@ namespace tensorflow {
 
 TEST(CandidateSamplerOpsTest, CandidateSampler_ShapeFn) {
   for (const char* op_name : {
-           "AllCandidateSampler", "FixedUnigramCandidateSampler",
-           "LearnedUnigramCandidateSampler", "LogUniformCandidateSampler",
-           "ThreadUnsafeUnigramCandidateSampler", "UniformCandidateSampler",
+           "AllCandidateSampler",
+           "FixedUnigramCandidateSampler",
+           "LearnedUnigramCandidateSampler",
+           "LogUniformCandidateSampler",
+           "ThreadUnsafeUnigramCandidateSampler",
+           "UniformCandidateSampler",
        }) {
     ShapeInferenceTestOp op(op_name);
     TF_ASSERT_OK(NodeDefBuilder("test", op.name)
diff --git a/tensorflow/core/ops/functional_grad.cc b/tensorflow/core/ops/functional_grad.cc
index 6df3536795..eeccb72da6 100644
--- a/tensorflow/core/ops/functional_grad.cc
+++ b/tensorflow/core/ops/functional_grad.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/function.h"
 #include <vector>
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index dd484c3ee7..872ebe98c1 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1172,12 +1172,12 @@ Status RangeSize(const Tensor* start_t, const Tensor* limit_t,
   T limit = limit_t->scalar<T>()();
   T delta = delta_t->scalar<T>()();
   if (start > limit && delta > 0) {
-    return errors::InvalidArgument("Requires start <= limit when delta > 0: ",
-                                   start, "/", limit);
+    return errors::InvalidArgument(
+        "Requires start <= limit when delta > 0: ", start, "/", limit);
   }
   if (start < limit && delta < 0) {
-    return errors::InvalidArgument("Requires start >= limit when delta < 0: ",
-                                   start, "/", limit);
+    return errors::InvalidArgument(
+        "Requires start >= limit when delta < 0: ", start, "/", limit);
   }
   if (delta == 0) {
     return errors::InvalidArgument("Requires delta != 0");
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 3f72b41569..62661fe4bd 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1155,9 +1155,9 @@ Status TopKShapeFn(InferenceContext* c) {
   DimensionHandle last_dim = c->Dim(input, -1);
   if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) &&
       c->Value(last_dim) < c->Value(k_dim)) {
-    return errors::InvalidArgument("input must have last dimension >= k = ",
-                                   c->Value(k_dim), " but is ",
-                                   c->Value(last_dim));
+    return errors::InvalidArgument(
+        "input must have last dimension >= k = ", c->Value(k_dim), " but is ",
+        c->Value(last_dim));
   }
 
   // Replace last_dim with k_dim.
@@ -1211,9 +1211,9 @@ REGISTER_OP("NthElement")
       DimensionHandle last_dim = c->Dim(input, -1);
       if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) &&
           c->Value(last_dim) <= c->Value(n_dim)) {
-        return errors::InvalidArgument("Input must have last dimension > n = ",
-                                       c->Value(n_dim), " but is ",
-                                       c->Value(last_dim));
+        return errors::InvalidArgument(
+            "Input must have last dimension > n = ", c->Value(n_dim),
+            " but is ", c->Value(last_dim));
       }
 
       // Reduce last_dim for output tensor
diff --git a/tensorflow/core/ops/sdca_ops.cc b/tensorflow/core/ops/sdca_ops.cc
index e67d95fa8c..4025070adb 100644
--- a/tensorflow/core/ops/sdca_ops.cc
+++ b/tensorflow/core/ops/sdca_ops.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-using shape_inference::ShapeHandle;
 using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
 
 // --------------------------------------------------------------------------
 static Status ApplySdcaOptimizerShapeFn(InferenceContext* c) {
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 8beb28de0a..e4c5bcfb54 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -137,9 +137,9 @@ REGISTER_OP("Substr")
         DimensionHandle pos_dim = c->Dim(pos_shape, i);
         DimensionHandle len_dim = c->Dim(len_shape, i);
         if (c->Value(pos_dim) != c->Value(len_dim)) {
-          return errors::InvalidArgument("pos and len shapes must match: ",
-                                         c->DebugString(pos_shape), " vs. ",
-                                         c->DebugString(len_shape));
+          return errors::InvalidArgument(
+              "pos and len shapes must match: ", c->DebugString(pos_shape),
+              " vs. ", c->DebugString(len_shape));
         }
       }
       // c->input(0) is the ShapeHandle to input strings
diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc
index de4e3cd9e7..0f309c1f4e 100644
--- a/tensorflow/core/ops/training_ops_test.cc
+++ b/tensorflow/core/ops/training_ops_test.cc
@@ -24,7 +24,7 @@ static void TestGradAndIndicesErrorHandling(const ShapeInferenceTestOp& op,
                                             string shape_spec_middle,
                                             const string& shape_spec_end = "") {
   auto shape_spec = [&shape_spec_middle, shape_spec_end](
-      const char* var_spec, const char* grad_indices_spec) {
+                        const char* var_spec, const char* grad_indices_spec) {
     return strings::StrCat(var_spec, ";", shape_spec_middle, ";",
                            grad_indices_spec, shape_spec_end);
   };
-- 
GitLab


From 16a2a9f6bb9bf4421119b591fa56d58ee95c9a0e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 26 Jan 2018 12:45:35 -0800
Subject: [PATCH 258/966] Add KafkaReader for processing streaming data with
 Apache Kafka (#14098)

* Add KafkaReader for processing streaming data with Apache Kafka

Apache Kafka is a widely used distributed streaming platform in
open source community. The goal of this fix is to create a contrib
Reader ops (inherits ReaderBase and is similiar to
TextLineReader/TFRecordReader) so that it is possible to reader
Kafka streaming data from TensorFlow in a similiar fashion.

This fix uses a C/C++ Apache Kafka client library librdkafka which
is released under the 2-clause BSD license, and is widely used in
a number of Kafka bindings such as Go, Python, C#/.Net, etc.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add KafkaReader Python wrapper.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add BUILD file and op registration for KafkaReader.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add C++ Kernel for KafkaReader

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add librdkafka to third_party packages in Bazel

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add contrib/kafka to part of the contrib bazel file.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Update workspace.bzl

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Comment out clean_deps of `tensorflow/core:framework` and `tensorflow/core:lib`

so that it is possible to build with ReaderBase.

See 1419 for details.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add group id flag.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Sync offset

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test cases and scipt to start and stop Kafka server (with docker)

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Convert to KafkaConsumer from the legacy Consumer with librdkafka

so that thread join does not hang.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Only output offset as the key.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add timeout attr so that Kafka Consumer could use

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Build Kafka kernels by default, so that to get around the linkage issue.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Convert KafkaReader to KafkaDataset.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix workspace.bzl for kafka with tf_http_archive

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add public visibility

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Address review feedbacks

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Optionally select Kafka support through ./configure

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 configure.py                                  |   3 +
 tensorflow/BUILD                              |   6 +
 tensorflow/contrib/BUILD                      |   2 +
 tensorflow/contrib/kafka/BUILD                | 104 ++++++
 tensorflow/contrib/kafka/__init__.py          |  32 ++
 .../kafka/kernels/kafka_dataset_ops.cc        | 321 ++++++++++++++++++
 tensorflow/contrib/kafka/ops/kafka_ops.cc     |  44 +++
 .../kafka/python/kernel_tests/kafka_test.py   | 117 +++++++
 .../kafka/python/kernel_tests/kafka_test.sh   |  34 ++
 .../kafka/python/ops/kafka_dataset_ops.py     |  72 ++++
 .../core/platform/default/build_config.bzl    |   6 +
 tensorflow/workspace.bzl                      |  12 +
 third_party/kafka/BUILD                       | 147 ++++++++
 third_party/kafka/config.patch                |  44 +++
 14 files changed, 944 insertions(+)
 create mode 100644 tensorflow/contrib/kafka/BUILD
 create mode 100644 tensorflow/contrib/kafka/__init__.py
 create mode 100644 tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
 create mode 100644 tensorflow/contrib/kafka/ops/kafka_ops.cc
 create mode 100644 tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
 create mode 100644 tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
 create mode 100644 tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
 create mode 100644 third_party/kafka/BUILD
 create mode 100644 third_party/kafka/config.patch

diff --git a/configure.py b/configure.py
index 083fed1710..16763b8c0d 100644
--- a/configure.py
+++ b/configure.py
@@ -1354,6 +1354,7 @@ def main():
     environ_cp['TF_NEED_GCP'] = '0'
     environ_cp['TF_NEED_HDFS'] = '0'
     environ_cp['TF_NEED_JEMALLOC'] = '0'
+    environ_cp['TF_NEED_KAFKA'] = '0'
     environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
     environ_cp['TF_NEED_COMPUTECPP'] = '0'
     environ_cp['TF_NEED_OPENCL'] = '0'
@@ -1372,6 +1373,8 @@ def main():
                 'with_hdfs_support', True, 'hdfs')
   set_build_var(environ_cp, 'TF_NEED_S3', 'Amazon S3 File System',
                 'with_s3_support', True, 's3')
+  set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform',
+                'with_kafka_support', False, 'kafka')
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 False, 'xla')
   set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support',
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index b26c525525..9e69613c79 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -211,6 +211,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_kafka_support",
+    define_values = {"with_kafka_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
 # Crosses between platforms and file system libraries not supported on those
 # platforms due to limitations in nested select() statements.
 config_setting(
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index f1e54432fa..5ac5955626 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -48,6 +48,7 @@ py_library(
         "//tensorflow/contrib/image:single_image_random_dot_stereograms_py",
         "//tensorflow/contrib/input_pipeline:input_pipeline_py",
         "//tensorflow/contrib/integrate:integrate_py",
+        "//tensorflow/contrib/kafka",
         "//tensorflow/contrib/keras",
         "//tensorflow/contrib/kernel_methods",
         "//tensorflow/contrib/kfac",
@@ -139,6 +140,7 @@ cc_library(
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
+        "//tensorflow/contrib/kafka:kafka_ops_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
         "//tensorflow/contrib/nccl:nccl_ops_op_lib",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_op_lib",
diff --git a/tensorflow/contrib/kafka/BUILD b/tensorflow/contrib/kafka/BUILD
new file mode 100644
index 0000000000..f7593aa462
--- /dev/null
+++ b/tensorflow/contrib/kafka/BUILD
@@ -0,0 +1,104 @@
+package(
+    default_visibility = ["//visibility:private"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+tf_kernel_library(
+    name = "kafka_kernels",
+    srcs = ["kernels/kafka_dataset_ops.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:bounds_check_lib",
+        "//tensorflow/core/kernels:dataset",
+        "//third_party/eigen3",
+        "@kafka//:kafka",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["kafka_ops"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_kafka_ops",
+    out = "python/ops/gen_kafka_ops.py",
+    require_shape_functions = True,
+    deps = [":kafka_ops_op_lib"],
+)
+
+py_library(
+    name = "kafka",
+    srcs = [
+        "__init__.py",
+        "python/ops/kafka_dataset_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":gen_kafka_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+# The Kafka server has to be setup before running the test.
+# The Kafka server is setup through Docker so the Docker engine
+# has to be installed.
+#
+# Once the Docker engine is ready:
+# To setup the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh start kafka
+#
+# After the test is complete:
+# To team down the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh stop kafka
+tf_py_test(
+    name = "kafka_test",
+    srcs = ["python/kernel_tests/kafka_test.py"],
+    additional_deps = [
+        ":kafka",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = [
+        "manual",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/kafka/__init__.py b/tensorflow/contrib/kafka/__init__.py
new file mode 100644
index 0000000000..4d755c4056
--- /dev/null
+++ b/tensorflow/contrib/kafka/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kafka Dataset.
+
+@@KafkaDataset
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops.kafka_dataset_ops import KafkaDataset
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "KafkaDataset",
+]
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
new file mode 100644
index 0000000000..88ef5f3571
--- /dev/null
+++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
@@ -0,0 +1,321 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/tensor.h"
+
+#include "src-cpp/rdkafkacpp.h"
+
+namespace tensorflow {
+
+class KafkaDatasetOp : public DatasetOpKernel {
+ public:
+  using DatasetOpKernel::DatasetOpKernel;
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    const Tensor* topics_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("topics", &topics_tensor));
+    OP_REQUIRES(
+        ctx, topics_tensor->dims() <= 1,
+        errors::InvalidArgument("`topics` must be a scalar or a vector."));
+
+    std::vector<string> topics;
+    topics.reserve(topics_tensor->NumElements());
+    for (int i = 0; i < topics_tensor->NumElements(); ++i) {
+      topics.push_back(topics_tensor->flat<string>()(i));
+    }
+
+    std::string servers = "";
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<std::string>(ctx, "servers", &servers));
+    std::string group = "";
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<std::string>(ctx, "group", &group));
+    bool eof = false;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<bool>(ctx, "eof", &eof));
+    int64 timeout = -1;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "timeout", &timeout));
+    OP_REQUIRES(ctx, (timeout > 0),
+                errors::InvalidArgument(
+                    "Timeout value should be large than 0, got ", timeout));
+    *output = new Dataset(ctx, std::move(topics), servers, group, eof, timeout);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, std::vector<string> topics,
+            const string& servers, const string& group, const bool eof,
+            const int64 timeout)
+        : GraphDatasetBase(ctx),
+          topics_(std::move(topics)),
+          servers_(servers),
+          group_(group),
+          eof_(eof),
+          timeout_(timeout) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Kafka")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override { return "KafkaDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* topics = nullptr;
+      TF_RETURN_IF_ERROR(b->AddVector(topics_, &topics));
+      Node* servers = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(servers_, &servers));
+      Node* group = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(group_, &group));
+      Node* eof = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(eof_, &eof));
+      Node* timeout = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(timeout_, &timeout));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {topics, servers, group, eof, timeout}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          // We are currently processing a topic, so try to read the next line.
+          if (consumer_.get()) {
+            while (true) {
+              if (limit_ >= 0 &&
+                  (topic_partition_->offset() >= limit_ || offset_ >= limit_)) {
+                // EOF current topic
+                break;
+              }
+              std::unique_ptr<RdKafka::Message> message(
+                  consumer_->consume(dataset()->timeout_));
+              if (message->err() == RdKafka::ERR_NO_ERROR) {
+                // Produce the line as output.
+                Tensor line_tensor(cpu_allocator(), DT_STRING, {});
+                line_tensor.scalar<string>()() =
+                    std::string(static_cast<const char*>(message->payload()),
+                                message->len());
+                out_tensors->emplace_back(std::move(line_tensor));
+                *end_of_sequence = false;
+                // Sync offset
+                offset_ = message->offset();
+                return Status::OK();
+              }
+
+              if (message->err() == RdKafka::ERR__PARTITION_EOF &&
+                  dataset()->eof_) {
+                // EOF current topic
+                break;
+              }
+              if (message->err() != RdKafka::ERR__TIMED_OUT) {
+                return errors::Internal("Failed to consume:",
+                                        message->errstr());
+              }
+              message.reset(nullptr);
+              consumer_->poll(0);
+            }
+
+            // We have reached the end of the current topic, so maybe
+            // move on to next topic.
+            ResetStreamsLocked();
+            ++current_topic_index_;
+          }
+
+          // Iteration ends when there are no more topic to process.
+          if (current_topic_index_ == dataset()->topics_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+        } while (true);
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_topic_index"),
+                                               current_topic_index_));
+
+        // `consumer_` is empty if
+        // 1. GetNext has not been called even once.
+        // 2. All topics have been read and iterator has been exhausted.
+        if (consumer_.get()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("current_pos"), offset_));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        ResetStreamsLocked();
+        int64 current_topic_index;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("current_topic_index"),
+                                              &current_topic_index));
+        current_topic_index_ = size_t(current_topic_index);
+        // The key "current_pos" is written only if the iterator was saved
+        // with an open topic.
+        if (reader->Contains(full_name("current_pos"))) {
+          int64 current_pos;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("current_pos"), &current_pos));
+
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+          topic_partition_->set_offset(current_pos);
+          if (topic_partition_->offset() != current_pos) {
+            return errors::Internal("Failed to restore to offset ",
+                                    current_pos);
+          }
+          offset_ = current_pos;
+        }
+        return Status::OK();
+      }
+
+     private:
+      // Sets up Kafka streams to read from the topic at
+      // `current_topic_index_`.
+      Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (current_topic_index_ >= dataset()->topics_.size()) {
+          return errors::InvalidArgument(
+              "current_topic_index_:", current_topic_index_,
+              " >= topics_.size():", dataset()->topics_.size());
+        }
+
+        // Actually move on to next topic.
+        string entry = dataset()->topics_[current_topic_index_];
+
+        std::vector<string> parts = str_util::Split(entry, ":");
+        if (parts.size() < 1) {
+          return errors::InvalidArgument("Invalid parameters: ", entry);
+        }
+        string topic = parts[0];
+        int32 partition = 0;
+        if (parts.size() > 1) {
+          if (!strings::safe_strto32(parts[1], &partition)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+        int64 offset = 0;
+        if (parts.size() > 2) {
+          if (!strings::safe_strto64(parts[2], &offset)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        topic_partition_.reset(
+            RdKafka::TopicPartition::create(topic, partition, offset));
+
+        offset_ = topic_partition_->offset();
+        limit_ = -1;
+        if (parts.size() > 3) {
+          if (!strings::safe_strto64(parts[3], &limit_)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        std::unique_ptr<RdKafka::Conf> conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL));
+        std::unique_ptr<RdKafka::Conf> topic_conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_TOPIC));
+
+        std::string errstr;
+
+        RdKafka::Conf::ConfResult result =
+            conf->set("default_topic_conf", topic_conf.get(), errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set default_topic_conf:", errstr);
+        }
+
+        result = conf->set("bootstrap.servers", dataset()->servers_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set bootstrap.servers ",
+                                  dataset()->servers_, ":", errstr);
+        }
+        result = conf->set("group.id", dataset()->group_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set group.id ", dataset()->group_,
+                                  ":", errstr);
+        }
+
+        consumer_.reset(RdKafka::KafkaConsumer::create(conf.get(), errstr));
+        if (!consumer_.get()) {
+          return errors::Internal("Failed to create consumer:", errstr);
+        }
+
+        std::vector<RdKafka::TopicPartition*> partitions;
+        partitions.emplace_back(topic_partition_.get());
+        RdKafka::ErrorCode err = consumer_->assign(partitions);
+        if (err != RdKafka::ERR_NO_ERROR) {
+          return errors::Internal(
+              "Failed to assign partition [", topic_partition_->topic(), ", ",
+              topic_partition_->partition(), ", ", topic_partition_->offset(),
+              "]:", RdKafka::err2str(err));
+        }
+
+        return Status::OK();
+      }
+
+      // Resets all Kafka streams.
+      void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        consumer_->unassign();
+        consumer_->close();
+        consumer_.reset(nullptr);
+      }
+
+      mutex mu_;
+      size_t current_topic_index_ GUARDED_BY(mu_) = 0;
+      int64 offset_ GUARDED_BY(mu_) = 0;
+      int64 limit_ GUARDED_BY(mu_) = -1;
+      std::unique_ptr<RdKafka::TopicPartition> topic_partition_ GUARDED_BY(mu_);
+      std::unique_ptr<RdKafka::KafkaConsumer> consumer_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<string> topics_;
+    const std::string servers_;
+    const std::string group_;
+    const bool eof_;
+    const int64 timeout_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("KafkaDataset").Device(DEVICE_CPU),
+                        KafkaDatasetOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/ops/kafka_ops.cc b/tensorflow/contrib/kafka/ops/kafka_ops.cc
new file mode 100644
index 0000000000..8cdf16103b
--- /dev/null
+++ b/tensorflow/contrib/kafka/ops/kafka_ops.cc
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("KafkaDataset")
+    .Input("topics: string")
+    .Input("servers: string")
+    .Input("group: string")
+    .Input("eof: bool")
+    .Input("timeout: int64")
+    .Output("handle: variant")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that emits the messages of one or more Kafka topics.
+
+topics: A `tf.string` tensor containing one or more subscriptions,
+  in the format of [topic:partition:offset:length],
+  by default length is -1 for unlimited.
+servers: A list of bootstrap servers.
+group: The consumer group id.
+eof: If True, the kafka reader will stop on EOF.
+timeout: The timeout value for the Kafka Consumer to wait
+  (in millisecond).
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
new file mode 100644
index 0000000000..94cf6b5ace
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
@@ -0,0 +1,117 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for KafkaDataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import os
+
+from tensorflow.contrib.kafka.python.ops import kafka_dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.lib.io import python_io
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+class KafkaDatasetTest(test.TestCase):
+
+  def setUp(self):
+    # The Kafka server has to be setup before the test
+    # and tear down after the test manually.
+    # The docker engine has to be installed.
+    #
+    # To setup the Kafka server:
+    # $ bash kafka_test.sh start kafka
+    #
+    # To team down the Kafka server:
+    # $ bash kafka_test.sh stop kafka
+    pass
+
+  def testKafkaDataset(self):
+    topics = array_ops.placeholder(dtypes.string, shape=[None])
+    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_dataset = kafka_dataset_ops.KafkaDataset(
+        topics, group="test", eof=True).repeat(num_epochs)
+    batch_dataset = repeat_dataset.batch(batch_size)
+
+    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
+    init_op = iterator.make_initializer(repeat_dataset)
+    init_batch_op = iterator.make_initializer(batch_dataset)
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Basic test: read from topic 0.
+      sess.run(
+          init_op, feed_dict={topics: ["test:0:0:4"],
+                              num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D"+str(i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from topic 1.
+      sess.run(
+          init_op, feed_dict={topics: ["test:0:5:-1"],
+                              num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D"+str(i + 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from both topics.
+      sess.run(init_op, feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                                   num_epochs: 1})
+      for j in range(2):
+        for i in range(5):
+          self.assertEqual("D"+str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test repeated iteration through both files.
+      sess.run(init_op, feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                                   num_epochs: 10})
+      for _ in range(10):
+        for j in range(2):
+          for i in range(5):
+            self.assertEqual("D"+str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test batched and repeated iteration through both files.
+      sess.run(
+          init_batch_op,
+          feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                     num_epochs: 10,
+                     batch_size: 5})
+      for _ in range(10):
+        self.assertAllEqual(["D"+str(i) for i in range(5)],
+                            sess.run(get_next))
+        self.assertAllEqual(["D"+str(i + 5) for i in range(5)],
+                            sess.run(get_next))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
new file mode 100644
index 0000000000..7997c12731
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+set -e
+set -o pipefail
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 start|stop <kafka container name>" >&2
+  exit 1
+fi
+
+container=$2
+if [ "$1" == "start" ]; then
+    docker run -d --rm --net=host --name=$container spotify/kafka
+    echo Wait 5 secs until kafka is up and running
+    sleep 5
+    echo Create test topic
+    docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test'
+    echo Create test message
+    docker exec $container bash -c 'echo -e "D0\nD1\nD2\nD3\nD4\nD5\nD6\nD7\nD8\nD9" > /test'
+    echo Produce test message
+    docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-console-producer.sh --topic test --broker-list 127.0.0.1:9092 < /test'
+
+    echo Container $container started successfully
+elif [ "$1" == "stop" ]; then
+    docker rm -f $container
+
+    echo Container $container stopped successfully
+else
+  echo "Usage: $0 start|stop <kafka container name>" >&2
+  exit 1
+fi
+
+
+
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
new file mode 100644
index 0000000000..6590d86ebb
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -0,0 +1,72 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kafka Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops import gen_kafka_ops
+from tensorflow.contrib.util import loader
+from tensorflow.python.data.ops.readers import Dataset
+from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.platform import resource_loader
+
+class KafkaDataset(Dataset):
+  """A Kafka Dataset that consumes the message.
+  """
+
+  def __init__(self, topics, servers="localhost", group="", eof=False, timeout=1000):
+    """Create a KafkaReader.
+
+    Args:
+      topics: A `tf.string` tensor containing one or more subscriptions,
+              in the format of [topic:partition:offset:length],
+              by default length is -1 for unlimited.
+      servers: A list of bootstrap servers.
+      group: The consumer group id.
+      eof: If True, the kafka reader will stop on EOF.
+      timeout: The timeout value for the Kafka Consumer to wait
+               (in millisecond).
+    """
+    super(KafkaDataset, self).__init__()
+    self._topics = ops.convert_to_tensor(
+        topics, dtype=dtypes.string, name="topics")
+    self._servers = ops.convert_to_tensor(
+        servers, dtype=dtypes.string, name="servers")
+    self._group = ops.convert_to_tensor(
+        group, dtype=dtypes.string, name="group")
+    self._eof = ops.convert_to_tensor(
+        eof, dtype=dtypes.bool, name="eof")
+    self._timeout = ops.convert_to_tensor(
+        timeout, dtype=dtypes.int64, name="timeout")
+
+  def _as_variant_tensor(self):
+    return gen_kafka_ops.kafka_dataset(
+        self._topics, self._servers, self._group, self._eof, self._timeout)
+
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  @property
+  def output_types(self):
+    return dtypes.string
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 2102c5cca3..119ffa3d9e 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -489,6 +489,12 @@ def tf_additional_core_deps():
           "//tensorflow/core/platform/s3:s3_file_system",
       ],
       "//conditions:default": [],
+  }) + select({
+      "//tensorflow:with_kafka_support": [
+          "//tensorflow/contrib/kafka:kafka_kernels",
+          "//tensorflow/contrib/kafka:kafka_ops_op_lib",
+      ],
+      "//conditions:default": [],
   })
 
 # TODO(jart, jhseu): Delete when GCP is default on.
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index f7d9075032..f9c13e55e6 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -560,6 +560,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:nccl.BUILD")),
   )
 
+  tf_http_archive(
+      name = "kafka",
+      urls = [
+          "https://mirror.bazel.build/github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz",
+          "https://github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz",
+      ],
+      sha256 = "dd035d57c8f19b0b612dd6eefe6e5eebad76f506e302cccb7c2066f25a83585e",
+      strip_prefix = "librdkafka-0.11.1",
+      build_file = str(Label("//third_party:kafka/BUILD")),
+      patch_file = str(Label("//third_party/kafka:config.patch")),
+  )
+
   tf_http_archive(
       name = "aws",
       urls = [
diff --git a/third_party/kafka/BUILD b/third_party/kafka/BUILD
new file mode 100644
index 0000000000..a61a9e1f6c
--- /dev/null
+++ b/third_party/kafka/BUILD
@@ -0,0 +1,147 @@
+# Description:
+#   Kafka C/C++ (librdkafka) client library
+
+licenses(["notice"])  # 2-clause BSD license
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "kafka",
+    srcs = [
+        "config.h",
+        "src-cpp/ConfImpl.cpp",
+        "src-cpp/ConsumerImpl.cpp",
+        "src-cpp/HandleImpl.cpp",
+        "src-cpp/KafkaConsumerImpl.cpp",
+        "src-cpp/MessageImpl.cpp",
+        "src-cpp/MetadataImpl.cpp",
+        "src-cpp/QueueImpl.cpp",
+        "src-cpp/RdKafka.cpp",
+        "src-cpp/TopicImpl.cpp",
+        "src-cpp/TopicPartitionImpl.cpp",
+        "src/crc32c.c",
+        "src/crc32c.h",
+        "src/lz4.c",
+        "src/lz4.h",
+        "src/lz4frame.c",
+        "src/lz4frame.h",
+        "src/lz4frame_static.h",
+        "src/lz4hc.c",
+        "src/lz4hc.h",
+        "src/lz4opt.h",
+        "src/queue.h",
+        "src/rd.h",
+        "src/rdaddr.c",
+        "src/rdaddr.h",
+        "src/rdatomic.h",
+        "src/rdavg.h",
+        "src/rdavl.c",
+        "src/rdavl.h",
+        "src/rdbuf.c",
+        "src/rdbuf.h",
+        "src/rdcrc32.h",
+        "src/rddl.h",
+        "src/rdendian.h",
+        "src/rdgz.c",
+        "src/rdgz.h",
+        "src/rdinterval.h",
+        "src/rdkafka.c",
+        "src/rdkafka.h",
+        "src/rdkafka_assignor.c",
+        "src/rdkafka_assignor.h",
+        "src/rdkafka_broker.c",
+        "src/rdkafka_broker.h",
+        "src/rdkafka_buf.c",
+        "src/rdkafka_buf.h",
+        "src/rdkafka_cgrp.c",
+        "src/rdkafka_cgrp.h",
+        "src/rdkafka_conf.c",
+        "src/rdkafka_conf.h",
+        "src/rdkafka_event.h",
+        "src/rdkafka_feature.c",
+        "src/rdkafka_feature.h",
+        "src/rdkafka_int.h",
+        "src/rdkafka_interceptor.c",
+        "src/rdkafka_interceptor.h",
+        "src/rdkafka_lz4.c",
+        "src/rdkafka_lz4.h",
+        "src/rdkafka_metadata.c",
+        "src/rdkafka_metadata.h",
+        "src/rdkafka_metadata_cache.c",
+        "src/rdkafka_msg.c",
+        "src/rdkafka_msg.h",
+        "src/rdkafka_msgset.h",
+        "src/rdkafka_msgset_reader.c",
+        "src/rdkafka_msgset_writer.c",
+        "src/rdkafka_offset.c",
+        "src/rdkafka_offset.h",
+        "src/rdkafka_op.c",
+        "src/rdkafka_op.h",
+        "src/rdkafka_partition.c",
+        "src/rdkafka_partition.h",
+        "src/rdkafka_pattern.c",
+        "src/rdkafka_pattern.h",
+        "src/rdkafka_proto.h",
+        "src/rdkafka_queue.c",
+        "src/rdkafka_queue.h",
+        "src/rdkafka_range_assignor.c",
+        "src/rdkafka_request.c",
+        "src/rdkafka_request.h",
+        "src/rdkafka_roundrobin_assignor.c",
+        "src/rdkafka_sasl.c",
+        "src/rdkafka_sasl.h",
+        "src/rdkafka_sasl_int.h",
+        "src/rdkafka_sasl_plain.c",
+        "src/rdkafka_subscription.c",
+        "src/rdkafka_subscription.h",
+        "src/rdkafka_timer.c",
+        "src/rdkafka_timer.h",
+        "src/rdkafka_topic.c",
+        "src/rdkafka_topic.h",
+        "src/rdkafka_transport.c",
+        "src/rdkafka_transport.h",
+        "src/rdkafka_transport_int.h",
+        "src/rdlist.c",
+        "src/rdlist.h",
+        "src/rdlog.c",
+        "src/rdlog.h",
+        "src/rdports.c",
+        "src/rdports.h",
+        "src/rdposix.h",
+        "src/rdrand.c",
+        "src/rdrand.h",
+        "src/rdregex.c",
+        "src/rdregex.h",
+        "src/rdstring.c",
+        "src/rdstring.h",
+        "src/rdsysqueue.h",
+        "src/rdtime.h",
+        "src/rdtypes.h",
+        "src/rdunittest.c",
+        "src/rdunittest.h",
+        "src/rdvarint.c",
+        "src/rdvarint.h",
+        "src/snappy.c",
+        "src/snappy.h",
+        "src/tinycthread.c",
+        "src/tinycthread.h",
+        "src/xxhash.c",
+        "src/xxhash.h",
+    ],
+    hdrs = [
+        "config.h",
+    ],
+    defines = [
+    ],
+    includes = [
+        "src",
+        "src-cpp",
+    ],
+    linkopts = [
+        "-lpthread",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@boringssl//:ssl",
+    ],
+)
diff --git a/third_party/kafka/config.patch b/third_party/kafka/config.patch
new file mode 100644
index 0000000000..fa5c2d35b4
--- /dev/null
+++ b/third_party/kafka/config.patch
@@ -0,0 +1,44 @@
+diff -Naur a/config.h b/config.h
+--- a/config.h	1970-01-01 00:00:00.000000000 +0000
++++ b/config.h	2017-10-28 00:57:03.316957390 +0000
+@@ -0,0 +1,40 @@
++#pragma once
++#define WITHOUT_OPTIMIZATION 0
++#define ENABLE_DEVEL 0
++#define ENABLE_REFCNT_DEBUG 0
++#define ENABLE_SHAREDPTR_DEBUG 0
++
++#define HAVE_ATOMICS_32 1
++#define HAVE_ATOMICS_32_SYNC 1
++
++#if (HAVE_ATOMICS_32)
++# if (HAVE_ATOMICS_32_SYNC)
++#  define ATOMIC_OP32(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL)
++# else
++#  define ATOMIC_OP32(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST)
++# endif
++#endif
++
++#define HAVE_ATOMICS_64 1
++#define HAVE_ATOMICS_64_SYNC 1
++
++#if (HAVE_ATOMICS_64)
++# if (HAVE_ATOMICS_64_SYNC)
++#  define ATOMIC_OP64(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL)
++# else
++#  define ATOMIC_OP64(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST)
++# endif
++#endif
++
++
++#define WITH_ZLIB 1
++#define WITH_LIBDL 1
++#define WITH_PLUGINS 0
++#define WITH_SNAPPY 1
++#define WITH_SOCKEM 1
++#define WITH_SSL 1
++#define WITH_SASL 0
++#define WITH_SASL_SCRAM 0
++#define WITH_SASL_CYRUS 0
++#define HAVE_REGEX 1
++#define HAVE_STRNDUP 1
-- 
GitLab


From 12cfeb2c5291b1d2af55bf0905374043be599c5a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 12:41:27 -0800
Subject: [PATCH 259/966] Cleanup: Ran clang-format on all *.{h,cc} files in
 tensorflow/core/framework.

PiperOrigin-RevId: 183429540
---
 tensorflow/core/framework/bfloat16.cc         | 28 ++++++------
 tensorflow/core/framework/common_shape_fns.cc |  9 ++--
 tensorflow/core/framework/fake_input.cc       | 12 ++---
 tensorflow/core/framework/function.cc         |  4 +-
 tensorflow/core/framework/function.h          |  2 +-
 tensorflow/core/framework/graph_def_util.cc   | 10 ++---
 tensorflow/core/framework/op_def_util.cc      | 24 +++++-----
 tensorflow/core/framework/op_def_util_test.cc | 30 +++++++------
 tensorflow/core/framework/op_gen_lib.cc       |  1 -
 tensorflow/core/framework/op_gen_lib.h        |  1 -
 tensorflow/core/framework/op_kernel.cc        |  3 +-
 tensorflow/core/framework/op_kernel_test.cc   | 28 +++++-------
 tensorflow/core/framework/reader_base.cc      |  6 +--
 tensorflow/core/framework/register_types.h    | 10 ++---
 .../core/framework/register_types_traits.h    |  6 +--
 tensorflow/core/framework/rendezvous_test.cc  |  8 ++--
 tensorflow/core/framework/shape_inference.h   |  2 +-
 .../core/framework/shape_inference_test.cc    |  5 ++-
 .../core/framework/tensor_shape_test.cc       |  3 +-
 tensorflow/core/framework/tensor_testutil.cc  |  2 +-
 tensorflow/core/framework/tensor_types.h      | 44 ++++++++++++-------
 tensorflow/core/framework/types_test.cc       |  4 +-
 22 files changed, 127 insertions(+), 115 deletions(-)

diff --git a/tensorflow/core/framework/bfloat16.cc b/tensorflow/core/framework/bfloat16.cc
index 0efe43fde2..6025be5170 100644
--- a/tensorflow/core/framework/bfloat16.cc
+++ b/tensorflow/core/framework/bfloat16.cc
@@ -21,13 +21,13 @@ void FloatToBFloat16(const float* src, bfloat16* dst, int64 size) {
   const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
   uint16_t* q = reinterpret_cast<uint16_t*>(dst);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    for (; size != 0; p += 2, q++, size--) {
-      *q = p[0];
-    }
+  for (; size != 0; p += 2, q++, size--) {
+    *q = p[0];
+  }
 #else
-    for (; size != 0; p += 2, q++, size--) {
-     *q = p[1];
-    }
+  for (; size != 0; p += 2, q++, size--) {
+    *q = p[1];
+  }
 #endif
 }
 
@@ -35,15 +35,15 @@ void BFloat16ToFloat(const bfloat16* src, float* dst, int64 size) {
   const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
   uint16_t* q = reinterpret_cast<uint16_t*>(dst);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    for (; size != 0; p++, q += 2, size--) {
-      q[0] = *p;
-      q[1] = 0;
-    }
+  for (; size != 0; p++, q += 2, size--) {
+    q[0] = *p;
+    q[1] = 0;
+  }
 #else
-    for (; size != 0; p++, q += 2, size--) {
-      q[0] = 0;
-      q[1] = *p;
-    }
+  for (; size != 0; p++, q += 2, size--) {
+    q[0] = 0;
+    q[1] = *p;
+  }
 #endif
 }
 
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 7ab8e3ec18..8bb87483e1 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -1356,10 +1356,11 @@ Status ScatterNdUpdateShape(InferenceContext* c) {
       Status s = c->Merge(prefix_indices, prefix_updates, &unused);
       if (!s.ok()) {
         return errors::InvalidArgument(
-            "The outer ", num_outer_dims, " dimensions of indices.shape=",
-            c->DebugString(indices_shape), " must match the outer ",
-            num_outer_dims, " dimensions of updates.shape=",
-            c->DebugString(updates_shape), ": ", s.error_message());
+            "The outer ", num_outer_dims,
+            " dimensions of indices.shape=", c->DebugString(indices_shape),
+            " must match the outer ", num_outer_dims,
+            " dimensions of updates.shape=", c->DebugString(updates_shape),
+            ": ", s.error_message());
       }
 
       ShapeHandle input_suffix;
diff --git a/tensorflow/core/framework/fake_input.cc b/tensorflow/core/framework/fake_input.cc
index ad301a8aa4..70d1e20a17 100644
--- a/tensorflow/core/framework/fake_input.cc
+++ b/tensorflow/core/framework/fake_input.cc
@@ -104,8 +104,8 @@ Status FakeInputImpl::AddInputToBuilder() {
       Status status = GetNodeAttr(*node_def_, arg_->type_list_attr(), &dts);
       if (!status.ok()) {
         return errors::InvalidArgument(
-            "Could not infer list of types for input '", arg_->name(), "': ",
-            status.error_message());
+            "Could not infer list of types for input '", arg_->name(),
+            "': ", status.error_message());
       }
       SourceList(dts);
       return Status::OK();
@@ -131,8 +131,8 @@ Status FakeInputImpl::GetN(int* n) const {
     Status status = GetNodeAttr(*node_def_, arg_->number_attr(), n);
     if (!status.ok()) {
       return errors::InvalidArgument("Could not infer length of input '",
-                                     arg_->name(), "': ",
-                                     status.error_message());
+                                     arg_->name(),
+                                     "': ", status.error_message());
     }
   }
   return Status::OK();
@@ -153,8 +153,8 @@ Status FakeInputImpl::GetDataType(DataType* dt) const {
         *dt = attr->default_value().type();
       } else {
         return errors::InvalidArgument("Could not infer type for input '",
-                                       arg_->name(), "': ",
-                                       status.error_message());
+                                       arg_->name(),
+                                       "': ", status.error_message());
       }
     }
   } else {
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 0224f25227..d6b576166c 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -1264,8 +1264,8 @@ FunctionDef FunctionDefHelper::Define(const string& name,
     }
     for (const string& a : src.arg) {
       const auto iter = ret_index.find(a);
-      CHECK(iter != ret_index.end()) << "Node input '" << a << "' in '"
-                                     << src.ret[0] << "' of " << name;
+      CHECK(iter != ret_index.end())
+          << "Node input '" << a << "' in '" << src.ret[0] << "' of " << name;
       n->add_input(iter->second);
     }
     for (const string& d : src.dep) {
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 3bb5638cdf..b933ee0b0e 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -656,7 +656,7 @@ bool RegisterOp(const string& op, Creator func);
 // Returns OK the gradient creator for the "op" is found (may be
 // nullptr if REGISTER_OP_NO_GRADIENT is used.
 Status GetOpGradientCreator(const string& op, Creator* creator);
-};
+};  // namespace gradient
 
 // Declare explicit instantiations of GetAttr
 #define GET_ATTR(T)                                          \
diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc
index bd018b7243..1f670535d5 100644
--- a/tensorflow/core/framework/graph_def_util.cc
+++ b/tensorflow/core/framework/graph_def_util.cc
@@ -35,8 +35,8 @@ namespace tensorflow {
 
 string SummarizeGraphDef(const GraphDef& graph_def) {
   string ret;
-  strings::StrAppend(&ret, "versions = ",
-                     ProtoShortDebugString(graph_def.versions()), ";\n");
+  strings::StrAppend(
+      &ret, "versions = ", ProtoShortDebugString(graph_def.versions()), ";\n");
   for (const NodeDef& node : graph_def.node()) {
     strings::StrAppend(&ret, SummarizeNodeDef(node), ";\n");
   }
@@ -90,9 +90,9 @@ static Status RemoveNewDefaultAttrsFromNodeDef(
           FindAttr(attr.first, *producer_op_def);
       if (producer_attr_def == nullptr) {
         return errors::InvalidArgument(
-            "Attr '", attr.first, "' missing in producer's OpDef: ",
-            SummarizeOpDef(*producer_op_def), " but found in node: ",
-            SummarizeNodeDef(*node_def));
+            "Attr '", attr.first,
+            "' missing in producer's OpDef: ", SummarizeOpDef(*producer_op_def),
+            " but found in node: ", SummarizeNodeDef(*node_def));
       }
       // ...and it has the same value as the default in producer,
       if (producer_attr_def->has_default_value() &&
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index a4e8add6c4..2d035ab90d 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -170,20 +170,20 @@ const OpDef::ArgDef* FindInputArg(StringPiece name, const OpDef& op_def) {
   return nullptr;
 }
 
-#define VALIDATE(EXPR, ...)                                          \
-  do {                                                               \
-    if (!(EXPR)) {                                                   \
-      return errors::InvalidArgument(__VA_ARGS__, "; in OpDef: ",    \
-                                     ProtoShortDebugString(op_def)); \
-    }                                                                \
+#define VALIDATE(EXPR, ...)                                            \
+  do {                                                                 \
+    if (!(EXPR)) {                                                     \
+      return errors::InvalidArgument(                                  \
+          __VA_ARGS__, "; in OpDef: ", ProtoShortDebugString(op_def)); \
+    }                                                                  \
   } while (false)
 
 static Status ValidateArg(const OpDef::ArgDef& arg, const OpDef& op_def,
                           bool output, std::set<string>* names) {
   const string suffix = strings::StrCat(
       output ? " for output '" : " for input '", arg.name(), "'");
-  VALIDATE(gtl::InsertIfNotPresent(names, arg.name()), "Duplicate name: ",
-           arg.name());
+  VALIDATE(gtl::InsertIfNotPresent(names, arg.name()),
+           "Duplicate name: ", arg.name());
   VALIDATE(HasAttrStyleType(arg), "Missing type", suffix);
 
   if (!arg.number_attr().empty()) {
@@ -250,8 +250,8 @@ Status ValidateOpDef(const OpDef& op_def) {
   std::set<string> names;  // for detecting duplicate names
   for (const auto& attr : op_def.attr()) {
     // Validate name
-    VALIDATE(gtl::InsertIfNotPresent(&names, attr.name()), "Duplicate name: ",
-             attr.name());
+    VALIDATE(gtl::InsertIfNotPresent(&names, attr.name()),
+             "Duplicate name: ", attr.name());
     DataType dt;
     VALIDATE(!DataTypeFromString(attr.name(), &dt), "Attr can't have name ",
              attr.name(), " that matches a data type");
@@ -680,8 +680,8 @@ Status OpDefAddedDefaultsUnchanged(const OpDef& old_op,
     if (!penultimate_attr.has_default_value() ||
         !new_attr->has_default_value()) {
       return errors::InvalidArgument("Missing default for attr '",
-                                     penultimate_attr.name(), "' in op: ",
-                                     SummarizeOpDef(new_op));
+                                     penultimate_attr.name(),
+                                     "' in op: ", SummarizeOpDef(new_op));
     }
 
     // Actually test that the attr's default value hasn't changed.
diff --git a/tensorflow/core/framework/op_def_util_test.cc b/tensorflow/core/framework/op_def_util_test.cc
index 28809c11c5..2b9812d4fc 100644
--- a/tensorflow/core/framework/op_def_util_test.cc
+++ b/tensorflow/core/framework/op_def_util_test.cc
@@ -200,10 +200,11 @@ TEST_F(ValidateOpDefTest, BadAttrDefault) {
                           "default_value { list { s: ['foo'] } } }"),
                 "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op "
                 "'BadAttrDef'");
-  ExpectFailure(TestBuilder(OpDefBuilder("GoodAttrDef")
-                                .Attr("a: list(type) >=2 = [DT_STRING]")),
-                "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op "
-                "'GoodAttrDef'");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("GoodAttrDef").Attr("a: list(type) >=2 = [DT_STRING]")),
+      "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op "
+      "'GoodAttrDef'");
 }
 
 TEST_F(ValidateOpDefTest, NoRefTypes) {
@@ -213,9 +214,10 @@ TEST_F(ValidateOpDefTest, NoRefTypes) {
   ExpectFailure(
       TestBuilder(OpDefBuilder("BadAttrDef").Attr("T: type = DT_INT32_REF")),
       "AttrValue must not have reference type value of int32_ref");
-  ExpectFailure(TestBuilder(OpDefBuilder("BadAttrDef")
-                                .Attr("T: list(type) = [DT_STRING_REF]")),
-                "AttrValue must not have reference type value of string_ref");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("BadAttrDef").Attr("T: list(type) = [DT_STRING_REF]")),
+      "AttrValue must not have reference type value of string_ref");
 }
 
 TEST_F(ValidateOpDefTest, BadAttrMin) {
@@ -245,9 +247,10 @@ TEST_F(ValidateOpDefTest, BadAttrAllowed) {
   TF_EXPECT_OK(TestBuilder(
       OpDefBuilder("GoodAttrtude").Attr("x: numbertype = DT_INT32")));
   // Not in list of allowed types.
-  ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude")
-                                .Attr("x: numbertype = DT_STRING")),
-                "attr 'x' of string is not in the list of allowed values");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("BadAttrtude").Attr("x: numbertype = DT_STRING")),
+      "attr 'x' of string is not in the list of allowed values");
   ExpectFailure(
       TestBuilder(OpDefBuilder("BadAttrtude")
                       .Attr("x: list(realnumbertype) = [DT_COMPLEX64]")),
@@ -260,9 +263,10 @@ TEST_F(ValidateOpDefTest, BadAttrAllowed) {
   TF_EXPECT_OK(TestBuilder(
       OpDefBuilder("GoodAttrtude").Attr("x: {'foo', 'bar'} = 'bar'")));
   // Not in list of allowed strings.
-  ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude")
-                                .Attr("x: {'foo', 'bar'} = 'baz'")),
-                "attr 'x' of \"baz\" is not in the list of allowed values");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("BadAttrtude").Attr("x: {'foo', 'bar'} = 'baz'")),
+      "attr 'x' of \"baz\" is not in the list of allowed values");
   ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude")
                                 .Attr("x: list({'foo', 'bar'}) = ['baz']")),
                 "attr 'x' of \"baz\" is not in the list of allowed values");
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 870bbb141b..5f2eb9d99a 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -296,7 +296,6 @@ static void RenameInDocs(const string& from, const string& to,
   }
 }
 
-
 namespace {
 
 // Initializes given ApiDef with data in OpDef.
diff --git a/tensorflow/core/framework/op_gen_lib.h b/tensorflow/core/framework/op_gen_lib.h
index 94fe194a1a..ff38e4b221 100644
--- a/tensorflow/core/framework/op_gen_lib.h
+++ b/tensorflow/core/framework/op_gen_lib.h
@@ -47,7 +47,6 @@ string PBTxtToMultiline(StringPiece pbtxt,
                         const std::vector<string>& multi_line_fields);
 string PBTxtFromMultiline(StringPiece multiline_pbtxt);
 
-
 // Takes a list of files with ApiDefs text protos, and allows you to
 // look up the specific ApiDef for any given op.
 class ApiDefMap {
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 16bf5c256f..fd2d06be98 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -101,7 +101,8 @@ OpKernel::OpKernel(OpKernelConstruction* context)
 
   // Kernels executing on GPU/SYCL tie very few resources on the CPU where the
   // scheduler runs: we consider them as inexpensive.
-  expensive_ = context->device_type() != DeviceType(DEVICE_GPU) && context->device_type() != DeviceType(DEVICE_SYCL);
+  expensive_ = context->device_type() != DeviceType(DEVICE_GPU) &&
+               context->device_type() != DeviceType(DEVICE_SYCL);
 }
 
 OpKernel::~OpKernel() {}
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index 94a9d1335a..b53b877f28 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -510,10 +510,9 @@ TEST_F(OpKernelBuilderTest, BuilderBoth) {
 }
 
 REGISTER_OP("BuildTypeAttr").Attr("T: type");
-REGISTER_KERNEL_BUILDER(Name("BuildTypeAttr")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("BuildTypeAttr").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    DummyKernel);
 
 TEST_F(OpKernelBuilderTest, BuilderTypeAttr) {
   ExpectSuccess("BuildTypeAttr", DEVICE_CPU, {"T|type|DT_FLOAT"});
@@ -525,10 +524,9 @@ TEST_F(OpKernelBuilderTest, BuilderTypeAttr) {
 }
 
 REGISTER_OP("BuildTypeListAttr").Attr("T: list(type)");
-REGISTER_KERNEL_BUILDER(Name("BuildTypeListAttr")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<bool>("T"),
-                        DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("BuildTypeListAttr").Device(DEVICE_CPU).TypeConstraint<bool>("T"),
+    DummyKernel);
 
 TEST_F(OpKernelBuilderTest, BuilderTypeListAttr) {
   ExpectSuccess("BuildTypeListAttr", DEVICE_CPU, {"T|list(type)|[]"});
@@ -574,14 +572,12 @@ TEST_F(OpKernelBuilderTest, DuplicateKernel) {
 }
 
 REGISTER_OP("DuplicateKernelForT").Attr("T: type");
-REGISTER_KERNEL_BUILDER(Name("DuplicateKernelForT")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        DummyKernel);
-REGISTER_KERNEL_BUILDER(Name("DuplicateKernelForT")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("DuplicateKernelForT").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("DuplicateKernelForT").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    DummyKernel);
 
 TEST_F(OpKernelBuilderTest, DuplicateKernelForT) {
   const NodeDef ndef =
diff --git a/tensorflow/core/framework/reader_base.cc b/tensorflow/core/framework/reader_base.cc
index b8c771a0a1..f84ef0f953 100644
--- a/tensorflow/core/framework/reader_base.cc
+++ b/tensorflow/core/framework/reader_base.cc
@@ -178,9 +178,9 @@ void ReaderBase::Read(QueueInterface* queue, string* key, string* value,
           " must set *at_end=true, *produced=true, or return an error.");
     }
     if (!status.ok() && produced) {
-      status = errors::Internal("ReadLocked() for ", name(),
-                                " set *produced=true *and* returned an error: ",
-                                status.ToString());
+      status = errors::Internal(
+          "ReadLocked() for ", name(),
+          " set *produced=true *and* returned an error: ", status.ToString());
     }
     if (status.ok() && at_end) {
       status = OnWorkFinishedLocked();
diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index edc93aec7f..f88025fd33 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -211,14 +211,12 @@ limitations under the License.
 #define TF_CALL_SYCL_double(m)
 #else  // TENSORFLOW_SYCL_NO_DOUBLE
 #define TF_CALL_SYCL_double(m) TF_CALL_double(m)
-#endif // TENSORFLOW_SYCL_NO_DOUBLE
+#endif  // TENSORFLOW_SYCL_NO_DOUBLE
 
 #ifdef __ANDROID_TYPES_SLIM__
-#define TF_CALL_SYCL_NUMBER_TYPES(m)  TF_CALL_float(m)
+#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m)
 #else  // __ANDROID_TYPES_SLIM__
-#define TF_CALL_SYCL_NUMBER_TYPES(m)    \
-    TF_CALL_float(m)                    \
-    TF_CALL_SYCL_double(m)
-#endif // __ANDROID_TYPES_SLIM__
+#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m) TF_CALL_SYCL_double(m)
+#endif  // __ANDROID_TYPES_SLIM__
 
 #endif  // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_H_
diff --git a/tensorflow/core/framework/register_types_traits.h b/tensorflow/core/framework/register_types_traits.h
index c1fe5517c6..ab35c2f095 100644
--- a/tensorflow/core/framework/register_types_traits.h
+++ b/tensorflow/core/framework/register_types_traits.h
@@ -23,7 +23,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/platform/types.h"
@@ -79,7 +79,7 @@ template <>
 struct proxy_type_pod<SYCLDevice, 4> {
   typedef float type;
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 /// If POD we use proxy_type_pod, otherwise this maps to identiy.
 template <typename Device, typename T>
@@ -99,7 +99,7 @@ struct proxy_type {
 #ifdef TENSORFLOW_USE_SYCL
 #define TF_CALL_SYCL_PROXY_TYPES(m) \
   TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m)
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
diff --git a/tensorflow/core/framework/rendezvous_test.cc b/tensorflow/core/framework/rendezvous_test.cc
index 32b8ad784d..de148f0bd3 100644
--- a/tensorflow/core/framework/rendezvous_test.cc
+++ b/tensorflow/core/framework/rendezvous_test.cc
@@ -69,9 +69,7 @@ class LocalRendezvousTest : public ::testing::Test {
     rendez_ = NewLocalRendezvous();
   }
 
-  ~LocalRendezvousTest() override {
-    rendez_->Unref();
-  }
+  ~LocalRendezvousTest() override { rendez_->Unref(); }
 
   void SchedClosure(std::function<void()> fn) {
     threads_.Schedule(std::move(fn));
@@ -99,8 +97,8 @@ string V(const Tensor& tensor) {
 
 Rendezvous::ParsedKey MakeKey(const string& name) {
   string s = Rendezvous::CreateKey("/job:mnist/replica:1/task:2/CPU:0", 7890,
-                                   "/job:mnist/replica:1/task:2/device:GPU:0", name,
-                                   FrameAndIter(0, 0));
+                                   "/job:mnist/replica:1/task:2/device:GPU:0",
+                                   name, FrameAndIter(0, 0));
   Rendezvous::ParsedKey k;
   TF_EXPECT_OK(Rendezvous::ParseKey(s, &k));
   return k;
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index d552ec1693..e3cc848a16 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -32,7 +32,7 @@ class ShapeRefinerTest;
 namespace grappler {
 class GraphProperties;
 class SymbolicShapeManager;
-}
+}  // namespace grappler
 
 namespace shape_inference {
 
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index a9b63ca60e..f48a7b9c47 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -760,7 +760,10 @@ TEST_F(ShapeInferenceTest, MergePrefix) {
   NodeDef def;
   InferenceContext c(kVersion, &def, MakeOpDef(4, 2),
                      {
-                         Unknown(), S({-1, 2}), S({1, -1, 3}), S({2, 4}),
+                         Unknown(),
+                         S({-1, 2}),
+                         S({1, -1, 3}),
+                         S({2, 4}),
                      },
                      {}, {}, {});
 
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index d8a9c0bac5..d7517bb311 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -582,7 +582,8 @@ TEST(TensorShapeTest, Large) {
 TEST(TensorShapeTest, Overflow) {
   int64 one = 1;
   std::vector<std::vector<int64>> overflows = {
-      {1 << 30, 1 << 30, 1 << 30}, {1 << 5, (one << 60) + 1},
+      {1 << 30, 1 << 30, 1 << 30},
+      {1 << 5, (one << 60) + 1},
   };
   for (const auto& overflow : overflows) {
     TensorShapeProto proto;
diff --git a/tensorflow/core/framework/tensor_testutil.cc b/tensorflow/core/framework/tensor_testutil.cc
index a8d1412300..8f480d65f2 100644
--- a/tensorflow/core/framework/tensor_testutil.cc
+++ b/tensorflow/core/framework/tensor_testutil.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <cmath>
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include <cmath>
 
 namespace tensorflow {
 namespace test {
diff --git a/tensorflow/core/framework/tensor_types.h b/tensorflow/core/framework/tensor_types.h
index 921f88dc0b..a5c1a56bfc 100644
--- a/tensorflow/core/framework/tensor_types.h
+++ b/tensorflow/core/framework/tensor_types.h
@@ -25,7 +25,8 @@ template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex>
 struct TTypes {
   // Rank-<NDIMS> tensor of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Tensor;
+                           Eigen::Aligned>
+      Tensor;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstTensor;
@@ -33,35 +34,42 @@ struct TTypes {
   // Unaligned Rank-<NDIMS> tensor of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType> >
       UnalignedTensor;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor,
-                                         IndexType> > UnalignedConstTensor;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType> >
+      UnalignedConstTensor;
 
   typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>,
-                           Eigen::Aligned> Tensor32Bit;
+                           Eigen::Aligned>
+      Tensor32Bit;
 
   // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
   typedef Eigen::TensorMap<
       Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>,
-      Eigen::Aligned> Scalar;
+      Eigen::Aligned>
+      Scalar;
   typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>,
                                                   Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> ConstScalar;
+                           Eigen::Aligned>
+      ConstScalar;
 
   // Unaligned Scalar tensor of scalar type T.
-  typedef Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor, IndexType> > UnalignedScalar;
+  typedef Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType> >
+      UnalignedScalar;
   typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>,
                                                   Eigen::RowMajor, IndexType> >
       UnalignedConstScalar;
 
   // Rank-1 tensor (vector) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Flat;
+                           Eigen::Aligned>
+      Flat;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstFlat;
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Vec;
+                           Eigen::Aligned>
+      Vec;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstVec;
@@ -69,16 +77,19 @@ struct TTypes {
   // Unaligned Rank-1 tensor (vector) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> >
       UnalignedFlat;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor,
-                                         IndexType> > UnalignedConstFlat;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> >
+      UnalignedConstFlat;
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> >
       UnalignedVec;
   typedef Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> > UnalignedConstVec;
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> >
+      UnalignedConstVec;
 
   // Rank-2 tensor (matrix) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Matrix;
+                           Eigen::Aligned>
+      Matrix;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstMatrix;
@@ -86,8 +97,9 @@ struct TTypes {
   // Unaligned Rank-2 tensor (matrix) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType> >
       UnalignedMatrix;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor,
-                                         IndexType> > UnalignedConstMatrix;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType> >
+      UnalignedConstMatrix;
 };
 
 typedef typename TTypes<float, 1>::Tensor32Bit::Index Index32;
diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc
index 5ddc986563..60f2b4135a 100644
--- a/tensorflow/core/framework/types_test.cc
+++ b/tensorflow/core/framework/types_test.cc
@@ -70,8 +70,8 @@ TEST(TypesTest, kDataTypeRefOffset) {
       << "Extra reference enum "
       << enum_descriptor->FindValueByNumber(e_ref)->name()
       << " without corresponding base enum with value " << e;
-  ASSERT_LT(DataType_MAX, e_ref) << "Gap in reference types, missing value for "
-                                 << e_ref;
+  ASSERT_LT(DataType_MAX, e_ref)
+      << "Gap in reference types, missing value for " << e_ref;
 
   // Make sure there are no enums defined after the last regular type before
   // the first reference type.
-- 
GitLab


From f6a53e7abd54afdff4d1377535d61dbc1efd174c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 12:52:54 -0800
Subject: [PATCH 260/966] Make the graph generation of TFBT deterministic.

PiperOrigin-RevId: 183431139
---
 .../python/ops/batch_ops_utils.py              | 18 +++++++++---------
 .../python/training/functions/gbdt_batch.py    |  7 ++++---
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
index b281a4c6d1..7a5f329b7a 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
@@ -81,32 +81,32 @@ def _scheduled_stamp_resource_op_runner(batch, stamp):
   if not batch:
     return
   arg_keys = set(batch[0].args.keys())
-  grouped_args = collections.defaultdict(list)
+  grouped_args = collections.OrderedDict()
   resource_handles = []
   # Check that the set of arguments is the same across all the scheduled ops.
   for op in batch:
     if set(op.args.keys()) != arg_keys:
       raise ValueError("Mismatching arguments: %s, %s.", op.args, arg_keys)
     for key in arg_keys:
-      grouped_args[key].append(op.args[key])
+      grouped_args.setdefault(key, []).append(op.args[key])
     resource_handles.append(op.resource_handle)
   # Move all the inputs to the op device in one RPC.
-  grouped_args = {
-      k: _move_tensors(v, resource_handles[0].device)
-      for k, v in grouped_args.items()
-  }
+  grouped_args = collections.OrderedDict(
+      (k, _move_tensors(v, resource_handles[0].device))
+      for k, v in sorted(grouped_args.items()))
   with ops.device(resource_handles[0].device):
     return batch[0].op(resource_handles, stamp, **grouped_args)
 
 
 def run_handler_scheduled_ops(per_handler_ops, stamp, worker_device):
   """Given a dictionary of ops for each handler, runs them in batch."""
-  batched_ops = collections.defaultdict(list)
+  batched_ops = collections.OrderedDict()
   # Group the ops by their batching_key. Ops that share the same batching key
   # can be executed together.
-  for handler in sorted(per_handler_ops.keys()):
+  for handler in per_handler_ops.keys():
     for op in per_handler_ops[handler]:
-      batched_ops[(op.batching_key(), op.batch_runner_fn())].append(op)
+      key = (op.batching_key(), op.batch_runner_fn())
+      batched_ops.setdefault(key, []).append(op)
   op_results = {}
   for batch in batched_ops.values():
     # Run each of the batched ops using its runner.
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index b95956dae2..f0b66dcbbe 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import copy
 
 from tensorflow.contrib import learn
@@ -163,7 +164,7 @@ def extract_features(features, feature_columns):
     scope = "gbdt"
     with variable_scope.variable_scope(scope):
       feature_columns = list(feature_columns)
-      transformed_features = {}
+      transformed_features = collections.OrderedDict()
       for fc in feature_columns:
         # pylint: disable=protected-access
         if isinstance(fc, feature_column_lib._EmbeddingColumn):
@@ -681,13 +682,13 @@ class GradientBoostedDecisionTreeModel(object):
                               control_flow_ops.no_op))
 
     # Update handler stats.
-    handler_reads = {}
+    handler_reads = collections.OrderedDict()
     for handler in handlers:
       handler_reads[handler] = handler.scheduled_reads()
 
     handler_results = batch_ops_utils.run_handler_scheduled_ops(
         handler_reads, ensemble_stamp, worker_device)
-    per_handler_updates = {}
+    per_handler_updates = collections.OrderedDict()
     # Two values per handler. First one is if the handler is active for the
     # current layer. The second one is if the handler is going to be active
     # for the next layer.
-- 
GitLab


From f84623507b46d1226385bffbf115cafa9c25e892 Mon Sep 17 00:00:00 2001
From: ted chang <htchang@us.ibm.com>
Date: Fri, 26 Jan 2018 13:15:47 -0800
Subject: [PATCH 261/966] Fix a bug in PR #15906 (#16467)

---
 tensorflow/python/tools/freeze_graph.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index a2e86a1c43..fd78f44c99 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -251,7 +251,7 @@ def main(unused_args):
                FLAGS.output_graph, FLAGS.clear_devices, FLAGS.initializer_nodes,
                FLAGS.variable_names_whitelist, FLAGS.variable_names_blacklist,
                FLAGS.input_meta_graph, FLAGS.input_saved_model_dir,
-               FLAGS.saved_model_tags, checkpoint_version=checkpoint_version)
+               FLAGS.saved_model_tags, FLAGS.checkpoint_version)
 
 
 if __name__ == "__main__":
-- 
GitLab


From d1910fa9eb274717719c4dcff3247498ea30caa4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 13:20:42 -0800
Subject: [PATCH 262/966] Add more tests to validate the bucket boundaries for
 inputs with equal distributions.

PiperOrigin-RevId: 183435084
---
 .../python/kernel_tests/quantile_ops_test.py  | 30 +++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
index eefa7ef0dc..81f58de28c 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
@@ -183,11 +183,10 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(num_quantiles + 1, len(buckets))
       self.assertAllEqual([2030, 2040, 2050, 2060], buckets)
 
-  def _testStreamingQuantileBucketsHelper(self, inputs):
+  def _testStreamingQuantileBucketsHelper(
+      self, inputs, num_quantiles=3, expected_buckets=None):
     """Helper to test quantile buckets on different inputs."""
 
-    # Use 3 quantiles, 4 boundaries for simplicity.
-    num_quantiles = 3
     # set generate_quantiles to True since the test will generate fewer
     # boundaries otherwise.
     with self.test_session() as sess:
@@ -213,7 +212,10 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
       buckets, are_ready_flush = (sess.run(
           [buckets, are_ready_flush]))
       self.assertEqual(True, are_ready_flush)
+      # By default, use 3 quantiles, 4 boundaries for simplicity.
       self.assertEqual(num_quantiles + 1, len(buckets))
+      if expected_buckets:
+        self.assertAllEqual(buckets, expected_buckets)
 
   def testStreamingQuantileBucketsRepeatedSingleValue(self):
     inputs = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
@@ -231,6 +233,28 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     inputs = [5]
     self._testStreamingQuantileBucketsHelper(inputs)
 
+  def testStreamingQuantileBucketsEqualDistributionInSequence(self):
+    # Input pattern is of the form [1, 1, 1, 2, 2, 2, 3, 3, 3, ...]
+    ones = 100 * [1]
+    inputs = []
+    for i in range(1, 101):
+      inputs += [i * k for k in ones]
+    # Expect 100 equally spaced buckets.
+    expected_buckets = range(1, 101)
+    self._testStreamingQuantileBucketsHelper(
+        inputs, num_quantiles=99, expected_buckets=expected_buckets)
+
+  def testStreamingQuantileBucketsEqualDistributionInterleaved(self):
+    # Input pattern is of the form [1, 2, 3, 1, 2, 3, 1, 2, 3, ...]
+    sequence = range(1, 101)
+    inputs = []
+    for _ in range(1, 101):
+      inputs += sequence
+    # Expect 100 equally spaced buckets.
+    expected_buckets = range(1, 101)
+    self._testStreamingQuantileBucketsHelper(
+        inputs, num_quantiles=99, expected_buckets=expected_buckets)
+
   def testStreamingQuantileBuckets(self):
     """Sets up the quantile summary op test as follows.
 
-- 
GitLab


From 079a0e53311b4cf913be5ec5bd26bbb2b0649e93 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Fri, 26 Jan 2018 13:23:02 -0800
Subject: [PATCH 263/966] Improved heuristics for swapping

PiperOrigin-RevId: 183435438
---
 .../core/grappler/costs/cost_estimator.h      |   3 +
 tensorflow/core/grappler/optimizers/BUILD     |   1 +
 .../grappler/optimizers/memory_optimizer.cc   | 149 ++++++++++++------
 .../optimizers/memory_optimizer_test.cc       |   3 +-
 4 files changed, 108 insertions(+), 48 deletions(-)

diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index b7eaf8dc63..d442861339 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -78,6 +78,9 @@ struct Costs {
     MilliSeconds asMilliSeconds() const {
       return std::chrono::duration_cast<std::chrono::milliseconds>(*this);
     }
+    static NanoSeconds infinity() {
+      return NanoSeconds(std::chrono::nanoseconds::max());
+    }
   };
 
   // We store all our times in nanoseconds. If needs be, we can always switch to
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 791ad34bbe..68de03e81c 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -285,6 +285,7 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/costs:graph_memory",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:topological_sort",
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index f537ecc41b..6f95a00fa3 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/graph_memory.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/graph_view.h"
@@ -828,8 +829,7 @@ static NodeDef* FindSwapOutTrigger(
   const std::unordered_set<GraphView::InputPort, GraphView::HashPort>& fanout =
       view.GetFanout(generator);
   NodeDef* trigger = nullptr;
-  Costs::NanoSeconds earliest_fanout(
-      static_cast<double>(std::numeric_limits<int64>::max() >> 2));
+  Costs::NanoSeconds earliest_fanout(Costs::NanoSeconds::infinity());
 
   for (const auto& port : fanout) {
     if (port.node == node) {
@@ -861,6 +861,15 @@ static bool IsSwappable(GraphView::InputPort input) {
   return !IsRefType(dtype);
 }
 
+struct MemInfo {
+  GraphView::OutputPort port;
+  int64 memory_used;
+  std::vector<GraphView::InputPort> uses_left;
+  double fitness;
+
+  bool operator<(const MemInfo& other) const { return fitness < other.fitness; }
+};
+
 static bool IdentifySwappingCandidates(
     Cluster* cluster, GrapplerItem* item, std::unordered_set<string>* skip_list,
     std::unordered_map<NodeDef*, SwapInfo>* nodes_to_swap) {
@@ -890,31 +899,56 @@ static bool IdentifySwappingCandidates(
       continue;
     }
     int64 required_savings = mem_usage.used_memory - prop.memory_size();
-    // TODO(bsteiner): sort the tensors by how long they're live.
 
-    std::unordered_map<string, Costs::NanoSeconds> execution_times;
+    std::unordered_map<string, Costs::NanoSeconds> op_completion_times;
     {
-      std::unordered_map<const NodeDef*, Costs::NanoSeconds>
-          tmp_execution_times;
-      if (!EstimateEarliestExecutionTimes(*item, cluster, &tmp_execution_times)
-               .ok()) {
+      VirtualCluster vcluster(cluster->GetDevices());
+      if (!vcluster.Provision().ok()) {
         return false;
       }
-      for (const auto& exec_time : tmp_execution_times) {
-        execution_times.emplace(exec_time.first->name(), exec_time.second);
+      if (!vcluster.Initialize(*item).ok()) {
+        return false;
+      }
+      RunMetadata metadata;
+      Status s = vcluster.Run(item->graph, item->feed, item->fetch, &metadata);
+      if (!s.ok() && s.code() != error::RESOURCE_EXHAUSTED) {
+        return false;
+      }
+
+      for (const auto& dev_stats : metadata.step_stats().dev_stats()) {
+        for (const auto& node_stats : dev_stats.node_stats()) {
+          Costs::NanoSeconds exec_time =
+              Costs::NanoSeconds(1) +
+              Costs::MicroSeconds(node_stats.all_start_micros() +
+                                  node_stats.op_end_rel_micros());
+          op_completion_times.emplace(node_stats.node_name(), exec_time);
+        }
       }
     }
 
+    Costs::Duration peak_time = -1;
+    for (const auto& live_tensor : mem_usage.live_tensors) {
+      if (live_tensor.allocation_time > peak_time) {
+        peak_time = live_tensor.allocation_time;
+      }
+    }
+
+    std::vector<MemInfo> mem_state;
+
     GraphView graph(&item->graph);
     for (const auto& live_tensor : mem_usage.live_tensors) {
+      if (live_tensor.memory_used <= 1024) {
+        // Don't bother with small tensors.
+        continue;
+      }
       if (live_tensor.deallocation_time - live_tensor.allocation_time <=
           Costs::Duration(1e6)) {
         // Not enough time to swap.
         VLOG(1) << "Not enough time to swap: skipping " << live_tensor.node;
         continue;
       }
-      if (live_tensor.memory_used <= 1024) {
-        // Don't bother with small tensors.
+
+      if (skip_list->find(live_tensor.node) != skip_list->end()) {
         continue;
       }
       GraphView::OutputPort port =
@@ -922,56 +956,77 @@ static bool IdentifySwappingCandidates(
       if (!IsSwappable(graph, port)) {
         continue;
       }
-      Costs::NanoSeconds execution_time(-1);
-      GraphView::InputPort fanout_to_swap;
+      MemInfo mem_info;
+      mem_info.port = port;
+      mem_info.memory_used = live_tensor.memory_used;
+      Costs::Duration allocation_time = live_tensor.allocation_time;
+      Costs::Duration earliest_use(Costs::Duration::infinity());
+      bool valid = true;
       for (GraphView::InputPort input : graph.GetFanout(port)) {
-        if (skip_list->find(input.node->name()) != skip_list->end()) {
+        // Get execution time.
+        auto it = op_completion_times.find(input.node->name());
+        if (it == op_completion_times.end()) {
+          valid = false;
+          break;
+        }
+        if (it->second <= peak_time) {
           continue;
         }
+
+        if (skip_list->find(input.node->name()) != skip_list->end()) {
+          valid = false;
+          break;
+        }
         string input_name =
             strings::StrCat(input.node->name(), ":", input.port_id);
         if (skip_list->find(input_name) != skip_list->end()) {
-          continue;
+          valid = false;
+          break;
         }
         if (!IsSwappable(input)) {
-          continue;
-        }
-        auto it = execution_times.find(input.node->name());
-        if (it != execution_times.end()) {
-          if (it->second > execution_time) {
-            fanout_to_swap = input;
-            execution_time = it->second;
-          }
+          valid = false;
+          break;
         }
+
+        // Set earliest use time that's after peak.
+        mem_info.uses_left.emplace_back(input);
+        earliest_use = std::min(earliest_use, it->second);
       }
-      // Annotate the fanout to request the tensor to be swapped if it's not
-      // already been done.
-      bool found = false;
-      if (!fanout_to_swap.node) {
-        continue;
-      }
-      auto it = fanout_to_swap.node->attr().find("_swap_to_host");
-      if (it != fanout_to_swap.node->attr().end()) {
-        const AttrValue& val = it->second;
-        for (int port_id : val.list().i()) {
-          if (port_id == fanout_to_swap.port_id) {
-            found = true;
-            break;
-          }
-        }
+      if (valid && !mem_info.uses_left.empty()) {
+        // Compute the fitness: we need the tensor to be generated way away of
+        // the time of peak memory usage (to ensure there is enough time to swap
+        // it out). We also need to ensure it's used way after the peak time, to
+        // ensure that swapping the tensor back in won't recreate the memory
+        // bottleneck. Last but not least, we want the tensor to have as few
+        // remaining uses as possible.
+        mem_info.fitness = std::pow((earliest_use - peak_time).count(), 2);
+        mem_info.fitness /= std::pow(mem_info.uses_left.size(), 2);
+        mem_info.fitness += std::pow((allocation_time - peak_time).count(), 2);
+        mem_info.fitness = -mem_info.fitness;
+        mem_state.push_back(mem_info);
       }
-      if (!found) {
+    }
+
+    // Sort by fitness
+    std::sort(mem_state.begin(), mem_state.end());
+
+    for (const MemInfo& mem_info : mem_state) {
+      for (const GraphView::InputPort fanout_to_swap : mem_info.uses_left) {
+        VLOG(1) << "Will swap fanout " << fanout_to_swap.node->name() << ":"
+                << fanout_to_swap.port_id << " of tensor "
+                << mem_info.port.node->name() << ":" << mem_info.port.port_id
+                << " of size " << mem_info.memory_used;
+
         (*nodes_to_swap)[fanout_to_swap.node].inputs_to_swap.push_back(
             fanout_to_swap.port_id);
-        required_savings -= live_tensor.memory_used;
-        updated_graph = true;
-        if (required_savings < 0) {
-          break;
-        }
+      }
+      required_savings -= mem_info.memory_used;
+      updated_graph = true;
+      if (required_savings < 0) {
+        break;
       }
     }
   }
-
   return updated_graph;
 }
 
@@ -1011,7 +1066,7 @@ bool SwappingPass(RewriterConfig::MemOptType optimization_level,
   }
   for (auto& swap : nodes_to_swap) {
     const NodeDef* node = swap.first;
-    std::vector<OpInfo::TensorProperties> props =
+    const std::vector<OpInfo::TensorProperties>& props =
         properties.GetInputProperties(node->name());
     SwapInfo& swap_info = swap.second;
     int64 bytes_to_swap = 0;
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index dd2d20d8d6..f5d9c87992 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -337,8 +337,9 @@ TEST_F(MemoryOptimizerTest, UnswappableInputs) {
   for (const auto& node : output.node()) {
     if (node.name() == "e") {
       // The d node isn't swappable.
-      EXPECT_EQ(4, node.input_size());
+      EXPECT_EQ(5, node.input_size());
       EXPECT_EQ("d", node.input(2));
+      EXPECT_EQ("^swap_out_d_2", node.input(4));
     }
   }
 }
-- 
GitLab


From 982549ea3423df4270ff154e5c764beb43d472da Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Fri, 26 Jan 2018 13:32:16 -0800
Subject: [PATCH 264/966] Branch 183429339 (#16469)

* Change `reduce_logsumexp` to internally use `reshape` rather than `squeeze`
since the latter requires the `axis` arg to be a Python `list`.

PiperOrigin-RevId: 183396533

* Kernel utils to support broadcast add and mul.

PiperOrigin-RevId: 183397494

* Updating sparsify_gather.

PiperOrigin-RevId: 183402917

* [tf.data] Move slow-path-related code into the slow path in IteratorHandleOp::Compute().

This slightly reduces the amount of work performed when an iterator is accessed (after the first access), and potentially reduces contention if concurrent steps are accessing the same iterator.

PiperOrigin-RevId: 183406221

* Cleanup: Ran clang-format on all *.{cc,h} in under grappler.

PiperOrigin-RevId: 183406440

* Increase shard count of //third_party/tensorflow/python:nn_batchnorm_test to avoid timeouts

When run under asan, the test runs for about 5 minutes, and sometimes
longer, causing frequent timeouts.

This change increases the shard count of the test to 4, which brings the run time
of the longest running shard under asan to about 2 minutes.

PiperOrigin-RevId: 183414888

* Add available choices to toco flags and fix minor formatting issues.

PiperOrigin-RevId: 183415713

* Performance improvements to some GPU code to use shared locks instead of unique locks for some hotspot cases.

PiperOrigin-RevId: 183418559

* [XLA] Improve error message for bad slices.

PiperOrigin-RevId: 183420038

* Fix py3 build rules for all py tests under py2tf.

PiperOrigin-RevId: 183422144

* Fix bug with Operation._control_inputs setter.

PiperOrigin-RevId: 183422192

* Make softmax_op_test.py work with C API enabled.

PiperOrigin-RevId: 183422829

* Cleanup: Ran clang-format on all *.{cc,h} files in tensorflow/core/kernels.

PiperOrigin-RevId: 183423961

* Fix the documentation for the dense layer for how rank > 2 inputs are handled.

PiperOrigin-RevId: 183425868

* Cleanup: Ran clang-format on all *.{cc,h} in tensorflow/core/ops.

PiperOrigin-RevId: 183429339
---
 .../compiler/xla/service/shape_inference.cc   |  55 +-
 .../xla/service/shape_inference_test.cc       |  15 +
 tensorflow/contrib/lite/kernels/BUILD         |  29 +-
 .../contrib/lite/kernels/kernel_util.cc       |  26 +
 tensorflow/contrib/lite/kernels/kernel_util.h |  17 +
 .../contrib/lite/kernels/kernel_util_test.cc  | 150 ++++
 .../contrib/lite/toco/toco_cmdline_flags.cc   |  36 +-
 tensorflow/contrib/py2tf/BUILD                |   3 +
 tensorflow/contrib/py2tf/converters/BUILD     |   8 +
 tensorflow/contrib/py2tf/pyct/BUILD           |   5 +
 .../contrib/py2tf/pyct/static_analysis/BUILD  |   3 +
 .../core/common_runtime/gpu/process_state.cc  |  18 +-
 tensorflow/core/grappler/clusters/cluster.cc  |   3 +-
 .../core/grappler/costs/virtual_scheduler.h   |   2 +-
 .../core/grappler/optimizers/auto_parallel.h  |   2 +-
 tensorflow/core/kernels/adjust_contrast_op.cc |   4 +-
 .../core/kernels/adjust_contrast_op_test.cc   |   3 +-
 .../core/kernels/adjust_saturation_op.cc      |   5 +-
 tensorflow/core/kernels/aggregate_ops_cpu.h   |  12 +-
 tensorflow/core/kernels/attention_ops.cc      |   5 +-
 tensorflow/core/kernels/avgpooling_op.h       |   5 +-
 .../core/kernels/avgpooling_op_gpu.cu.cc      |  14 +-
 tensorflow/core/kernels/barrier_ops.cc        |  34 +-
 tensorflow/core/kernels/batch_kernels.cc      |   4 +-
 .../core/kernels/batch_matmul_op_impl.h       |  27 +-
 .../core/kernels/batch_matmul_op_real.cc      |   2 +-
 .../core/kernels/batch_matmul_op_test.cc      |   7 +-
 tensorflow/core/kernels/batch_norm_op.cc      |   2 +-
 tensorflow/core/kernels/batch_norm_op_test.cc |   2 +-
 tensorflow/core/kernels/batchtospace_op.cc    |   7 +-
 tensorflow/core/kernels/bcast_ops.cc          |   2 +-
 tensorflow/core/kernels/bias_op_gpu.cu.cc     |  43 +-
 tensorflow/core/kernels/bounds_check.h        |   2 +-
 .../core/kernels/candidate_sampler_ops.cc     |  17 +-
 tensorflow/core/kernels/cast_op.cc            |  17 +-
 tensorflow/core/kernels/cast_op.h             |   3 +-
 tensorflow/core/kernels/cast_op_impl.h        |  28 +-
 tensorflow/core/kernels/cast_op_test.cc       |  12 +-
 tensorflow/core/kernels/colorspace_op.cc      |  65 +-
 tensorflow/core/kernels/colorspace_op.h       |   7 +-
 .../core/kernels/colorspace_op_gpu.cu.cc      |   4 +-
 tensorflow/core/kernels/colorspace_op_test.cc |  56 +-
 tensorflow/core/kernels/concat_lib.h          |  20 +-
 tensorflow/core/kernels/concat_lib_cpu.cc     |  32 +-
 tensorflow/core/kernels/concat_lib_cpu.h      |   6 +-
 tensorflow/core/kernels/concat_op.cc          |  21 +-
 tensorflow/core/kernels/concat_op_test.cc     |   3 +-
 .../kernels/conditional_accumulator_base.h    |   2 +-
 .../kernels/conditional_accumulator_op.cc     |   8 +-
 tensorflow/core/kernels/constant_op.cc        |   1 -
 tensorflow/core/kernels/control_flow_ops.cc   |  96 +--
 .../core/kernels/control_flow_ops_test.cc     |   1 +
 tensorflow/core/kernels/conv_ops.cc           |   2 +-
 tensorflow/core/kernels/conv_ops_fused.cc     |  78 ++-
 tensorflow/core/kernels/conv_ops_gpu.h        |   1 -
 tensorflow/core/kernels/conv_ops_gpu_3.cu.cc  |  31 +-
 .../core/kernels/conv_ops_using_gemm.cc       |  29 +-
 tensorflow/core/kernels/cross_op_gpu.cu.cc    |   2 +-
 tensorflow/core/kernels/ctc_decoder_ops.cc    |  11 +-
 tensorflow/core/kernels/ctc_loss_op.cc        |   4 +-
 tensorflow/core/kernels/cwise_op_abs.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_acos.cc      |   2 +-
 tensorflow/core/kernels/cwise_op_acosh.cc     |   6 +-
 tensorflow/core/kernels/cwise_op_add_1.cc     |   3 +-
 tensorflow/core/kernels/cwise_op_add_2.cc     |   4 +-
 tensorflow/core/kernels/cwise_op_asin.cc      |   2 +-
 tensorflow/core/kernels/cwise_op_asinh.cc     |   8 +-
 tensorflow/core/kernels/cwise_op_atan.cc      |   2 +-
 tensorflow/core/kernels/cwise_op_atanh.cc     |   4 +-
 tensorflow/core/kernels/cwise_op_ceil.cc      |   2 +-
 tensorflow/core/kernels/cwise_op_cos.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_cosh.cc      |  16 +-
 tensorflow/core/kernels/cwise_op_div.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_exp.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_expm1.cc     |   2 +-
 tensorflow/core/kernels/cwise_op_floor.cc     |   2 +-
 tensorflow/core/kernels/cwise_op_floor_div.cc |   2 +-
 tensorflow/core/kernels/cwise_op_floor_mod.cc |   2 +-
 .../core/kernels/cwise_op_gpu_conj.cu.cc      |   4 +-
 .../core/kernels/cwise_op_gpu_equal_to.cu.cc  |   2 +-
 .../core/kernels/cwise_op_gpu_select.cu.cc    |  24 +-
 tensorflow/core/kernels/cwise_op_greater.cc   |   2 +-
 .../core/kernels/cwise_op_greater_equal.cc    |   5 +-
 tensorflow/core/kernels/cwise_op_invert.cc    |   2 +-
 tensorflow/core/kernels/cwise_op_isfinite.cc  |   2 +-
 tensorflow/core/kernels/cwise_op_isinf.cc     |   2 +-
 tensorflow/core/kernels/cwise_op_isnan.cc     |   2 +-
 tensorflow/core/kernels/cwise_op_less.cc      |   2 +-
 .../core/kernels/cwise_op_less_equal.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_log.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_log1p.cc     |   2 +-
 tensorflow/core/kernels/cwise_op_maximum.cc   |   2 +-
 tensorflow/core/kernels/cwise_op_minimum.cc   |   2 +-
 tensorflow/core/kernels/cwise_op_mul_1.cc     |   8 +-
 tensorflow/core/kernels/cwise_op_mul_2.cc     |   6 +-
 tensorflow/core/kernels/cwise_op_neg.cc       |   2 +-
 .../core/kernels/cwise_op_not_equal_to_1.cc   |   2 +-
 .../core/kernels/cwise_op_not_equal_to_2.cc   |   2 +-
 .../core/kernels/cwise_op_reciprocal.cc       |   4 +-
 tensorflow/core/kernels/cwise_op_select.cc    |  30 +-
 tensorflow/core/kernels/cwise_op_sigmoid.cc   |   4 +-
 tensorflow/core/kernels/cwise_op_sign.cc      |   2 +-
 tensorflow/core/kernels/cwise_op_sin.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_sinh.cc      |  16 +-
 tensorflow/core/kernels/cwise_op_sqrt.cc      |   4 +-
 tensorflow/core/kernels/cwise_op_square.cc    |   2 +-
 tensorflow/core/kernels/cwise_op_sub.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_tan.cc       |   2 +-
 tensorflow/core/kernels/cwise_op_tanh.cc      |   2 +-
 tensorflow/core/kernels/cwise_ops_common.cc   |   6 +-
 .../core/kernels/cwise_ops_gpu_gradients.cu.h |  14 +-
 tensorflow/core/kernels/cwise_ops_gradients.h |   3 +-
 .../core/kernels/cwise_ops_sycl_common.h      |  10 +-
 tensorflow/core/kernels/cwise_ops_test.cc     |  42 +-
 tensorflow/core/kernels/data/iterator_ops.cc  |  95 ++-
 tensorflow/core/kernels/debug_ops.cc          |   6 +-
 tensorflow/core/kernels/debug_ops.h           |   4 +-
 tensorflow/core/kernels/decode_csv_op.cc      |  40 +-
 tensorflow/core/kernels/decode_image_op.cc    |  32 +-
 tensorflow/core/kernels/deep_conv2d.cc        |   6 +-
 tensorflow/core/kernels/dense_update_ops.cc   |  16 +-
 .../core/kernels/depthwise_conv_grad_op.cc    |   4 +-
 tensorflow/core/kernels/depthwise_conv_op.cc  |  15 +-
 .../core/kernels/depthwise_conv_op_gpu.cu.cc  |   4 +-
 tensorflow/core/kernels/diag_op.cc            |  56 +-
 tensorflow/core/kernels/diag_op.h             |   8 +-
 tensorflow/core/kernels/diag_op_gpu.cu.cc     |  52 +-
 tensorflow/core/kernels/diag_op_test.cc       |   5 +-
 tensorflow/core/kernels/dilation_ops.cc       |  26 +-
 .../core/kernels/dilation_ops_gpu.cu.cc       |  15 +-
 .../core/kernels/draw_bounding_box_op.cc      |  28 +-
 .../core/kernels/dynamic_partition_op.cc      |   6 +-
 .../kernels/dynamic_partition_op_gpu.cu.cc    |  57 +-
 tensorflow/core/kernels/eigen_activations.h   |  38 +-
 .../core/kernels/eigen_activations_test.cc    |   2 +-
 tensorflow/core/kernels/eigen_attention.h     |  83 ++-
 .../core/kernels/eigen_attention_test.cc      |   2 +-
 .../eigen_backward_spatial_convolutions.h     |  60 +-
 ...igen_backward_spatial_convolutions_test.cc |   2 +-
 tensorflow/core/kernels/eigen_pooling.h       |   8 +-
 tensorflow/core/kernels/eigen_pooling_test.cc |   2 +-
 tensorflow/core/kernels/eigen_softmax.h       |  61 +-
 tensorflow/core/kernels/eigen_softmax_test.cc |   2 +-
 .../core/kernels/eigen_spatial_convolutions.h |  32 +-
 tensorflow/core/kernels/encode_jpeg_op.cc     |  16 +-
 .../core/kernels/example_parsing_ops.cc       |  29 +-
 tensorflow/core/kernels/fact_op.cc            |  12 +-
 .../core/kernels/fake_quant_ops_test.cc       |  44 +-
 tensorflow/core/kernels/fifo_queue.cc         | 169 +++--
 tensorflow/core/kernels/fill_functor.cc       |  10 +-
 .../core/kernels/fractional_avg_pool_op.cc    |   5 +-
 tensorflow/core/kernels/function_ops.cc       |  31 +-
 .../core/kernels/fused_batch_norm_op.cu.cc    |   3 +-
 tensorflow/core/kernels/gather_functor.cc     |  10 +-
 tensorflow/core/kernels/gather_functor.h      |  52 +-
 tensorflow/core/kernels/gather_op.cc          |   3 +-
 tensorflow/core/kernels/hinge-loss.h          |   5 +-
 .../core/kernels/histogram_op_gpu.cu.cc       |   8 +-
 tensorflow/core/kernels/image_resizer_state.h |  10 +-
 tensorflow/core/kernels/in_topk_op.cc         |  64 +-
 tensorflow/core/kernels/inplace_ops.cc        |  10 +-
 tensorflow/core/kernels/l2loss_op.cc          |   2 +-
 tensorflow/core/kernels/linalg_ops_common.cc  |   1 -
 tensorflow/core/kernels/lmdb_reader_op.cc     |  15 +-
 tensorflow/core/kernels/logistic-loss.h       |   7 +-
 tensorflow/core/kernels/loss_test.cc          | 201 +++---
 tensorflow/core/kernels/lrn_op.cc             |  25 +-
 tensorflow/core/kernels/matching_files_op.cc  |  11 +-
 tensorflow/core/kernels/matmul_op.cc          |   4 +-
 tensorflow/core/kernels/matmul_op.h           |   3 +-
 .../core/kernels/matrix_exponential_op.cc     |  12 +-
 .../core/kernels/matrix_logarithm_op.cc       |  12 +-
 tensorflow/core/kernels/matrix_set_diag_op.cc |   4 +-
 tensorflow/core/kernels/maxpooling_op.cc      |   6 +-
 .../core/kernels/maxpooling_op_gpu.cu.cc      |   4 +-
 tensorflow/core/kernels/meta_support.cc       |   6 +-
 tensorflow/core/kernels/mfcc.cc               |  26 +-
 tensorflow/core/kernels/mfcc.h                |   5 +-
 .../core/kernels/mfcc_mel_filterbank.cc       |  46 +-
 tensorflow/core/kernels/mfcc_mel_filterbank.h |   8 +-
 .../core/kernels/mfcc_mel_filterbank_test.cc  |  15 +-
 tensorflow/core/kernels/mfcc_test.cc          |   9 +-
 tensorflow/core/kernels/mirror_pad_op.cc      |   8 +-
 tensorflow/core/kernels/mkl_avgpooling_op.cc  | 279 ++++----
 .../core/kernels/mkl_batch_matmul_op.cc       |   1 -
 tensorflow/core/kernels/mkl_concat_op.cc      | 132 ++--
 .../core/kernels/mkl_conv_grad_bias_ops.cc    |   2 +-
 .../core/kernels/mkl_conv_grad_filter_ops.cc  | 182 ++---
 .../core/kernels/mkl_conv_grad_input_ops.cc   | 107 ++-
 tensorflow/core/kernels/mkl_conv_ops.cc       | 252 +++----
 tensorflow/core/kernels/mkl_conv_ops.h        | 230 +++----
 .../core/kernels/mkl_fused_batch_norm_op.cc   | 506 ++++++--------
 .../core/kernels/mkl_input_conversion_op.cc   |  10 +-
 tensorflow/core/kernels/mkl_lrn_op.cc         | 647 +++++++++---------
 tensorflow/core/kernels/mkl_maxpooling_op.cc  | 497 +++++++-------
 .../core/kernels/mkl_pooling_ops_common.cc    |  14 +-
 .../core/kernels/mkl_pooling_ops_common.h     | 328 +++++----
 tensorflow/core/kernels/mkl_relu_op.cc        | 212 +++---
 tensorflow/core/kernels/mkl_reshape_op.cc     | 105 ++-
 tensorflow/core/kernels/mkl_tfconv_op.cc      | 124 ++++
 .../core/kernels/non_max_suppression_op.cc    |  24 +-
 .../kernels/non_max_suppression_op_test.cc    |  30 +-
 tensorflow/core/kernels/nth_element_op.cc     |  39 +-
 tensorflow/core/kernels/nth_element_op.h      |   6 +-
 tensorflow/core/kernels/one_hot_op_gpu.cu.cc  |   6 +-
 tensorflow/core/kernels/ops_util_test.cc      |  13 +-
 tensorflow/core/kernels/pack_op.cc            |   4 +-
 .../parameterized_truncated_normal_op.cc      |  40 +-
 ...arameterized_truncated_normal_op_gpu.cu.cc |  13 +-
 tensorflow/core/kernels/parse_tensor_op.cc    |   1 -
 tensorflow/core/kernels/pooling_ops_3d.cc     |   6 +-
 tensorflow/core/kernels/pooling_ops_3d_sycl.h |  17 +-
 tensorflow/core/kernels/pooling_ops_common.h  |   2 -
 .../core/kernels/quantization_utils_test.cc   |   8 +-
 .../core/kernels/quantize_and_dequantize_op.h |   3 +-
 tensorflow/core/kernels/quantize_op_test.cc   |   3 +-
 .../core/kernels/quantized_batch_norm_op.cc   |   2 +-
 .../core/kernels/quantized_concat_op.cc       |   8 +-
 tensorflow/core/kernels/quantized_conv_ops.cc |   7 +-
 .../core/kernels/quantized_instance_norm.cc   |   8 +-
 .../core/kernels/quantized_matmul_op.cc       |   6 +-
 .../core/kernels/quantized_matmul_op_test.cc  |  39 +-
 tensorflow/core/kernels/quantized_mul_op.cc   |   5 +-
 .../core/kernels/quantized_mul_op_test.cc     |  11 +-
 tensorflow/core/kernels/queue_base.cc         |   4 +-
 tensorflow/core/kernels/queue_ops.cc          |  11 +-
 tensorflow/core/kernels/random_crop_op.cc     |   8 +-
 tensorflow/core/kernels/random_op.cc          | 253 ++++---
 tensorflow/core/kernels/random_op_gpu.cu.cc   |   5 +-
 tensorflow/core/kernels/random_poisson_op.cc  |   2 +-
 .../core/kernels/random_shuffle_queue_op.cc   | 169 +++--
 .../core/kernels/reduction_gpu_kernels.cu.h   |   6 +-
 tensorflow/core/kernels/relu_op.cc            |  13 +-
 tensorflow/core/kernels/relu_op_functor.h     |  11 +-
 tensorflow/core/kernels/resize_bicubic_op.cc  |   2 +-
 .../core/kernels/resize_bicubic_op_test.cc    |  15 +-
 .../core/kernels/resize_bilinear_op_gpu.cu.cc |  20 +-
 tensorflow/core/kernels/reverse_op.cc         |   8 +-
 tensorflow/core/kernels/reverse_op_gpu.cu.cc  |  16 +-
 .../core/kernels/reverse_sequence_op.cc       |  68 +-
 .../kernels/reverse_sequence_op_gpu.cu.cc     |  12 +-
 .../core/kernels/save_restore_tensor.cc       |  10 +-
 tensorflow/core/kernels/scatter_functor.h     |  16 +-
 .../core/kernels/scatter_functor_gpu.cu.h     |  11 +-
 .../core/kernels/scatter_nd_op_cpu_impl.h     |   4 +-
 tensorflow/core/kernels/scatter_op.cc         |  30 +-
 tensorflow/core/kernels/sdca_internal.cc      |  45 +-
 tensorflow/core/kernels/sdca_internal.h       |   3 +-
 tensorflow/core/kernels/sdca_ops.cc           |  14 +-
 .../core/kernels/segment_reduction_ops.cc     |  30 +-
 .../core/kernels/segment_reduction_ops.h      |  19 +-
 .../kernels/segment_reduction_ops_gpu.cu.cc   |  12 +-
 .../core/kernels/self_adjoint_eig_op.cc       |   1 -
 tensorflow/core/kernels/sendrecv_ops.cc       |   6 +-
 tensorflow/core/kernels/sequence_ops.cc       |   8 +-
 tensorflow/core/kernels/session_ops.cc        |   2 +-
 tensorflow/core/kernels/shape_ops.h           |   8 +-
 tensorflow/core/kernels/slice_op.cc           | 168 +++--
 tensorflow/core/kernels/slice_op.h            |   1 -
 tensorflow/core/kernels/slice_op_cpu_impl.h   |   2 +-
 tensorflow/core/kernels/softmax_op.cc         |   6 +-
 .../kernels/spacetobatch_benchmark_test.cc    |   2 +-
 .../core/kernels/spacetobatch_functor.cc      |   2 +-
 .../core/kernels/spacetobatch_functor.h       |   2 +-
 .../kernels/spacetobatch_functor_gpu.cu.cc    |  10 +-
 tensorflow/core/kernels/spacetobatch_op.cc    |   7 +-
 tensorflow/core/kernels/sparse_add_grad_op.cc |  12 +-
 tensorflow/core/kernels/sparse_add_op.cc      |  15 +-
 tensorflow/core/kernels/sparse_add_op_test.cc |   4 +-
 .../sparse_conditional_accumulator_op.cc      |   5 +-
 tensorflow/core/kernels/sparse_cross_op.cc    |  16 +-
 .../kernels/sparse_dense_binary_op_shared.cc  |  10 +-
 .../sparse_dense_binary_op_shared_test.cc     |  16 +-
 tensorflow/core/kernels/sparse_matmul_op.cc   |  24 +-
 tensorflow/core/kernels/sparse_matmul_op.h    |   8 +-
 .../core/kernels/sparse_matmul_op_test.cc     |   8 +-
 .../core/kernels/sparse_reduce_sum_op_test.cc |   8 +-
 tensorflow/core/kernels/sparse_softmax_op.cc  |   5 +-
 .../kernels/sparse_sparse_binary_op_shared.cc |  15 +-
 tensorflow/core/kernels/sparse_split_op.cc    |  26 +-
 tensorflow/core/kernels/sparse_to_dense_op.cc |   5 +-
 .../core/kernels/sparse_to_dense_op_test.cc   |   1 -
 tensorflow/core/kernels/sparse_xent_op.cc     |   8 +-
 .../core/kernels/sparse_xent_op_test.cc       |   6 +-
 tensorflow/core/kernels/split_lib.h           |   2 +-
 tensorflow/core/kernels/split_lib_cpu.cc      |   4 +-
 tensorflow/core/kernels/split_op.cc           |  37 +-
 tensorflow/core/kernels/split_v_op.cc         |  14 +-
 tensorflow/core/kernels/stack_ops.cc          |  18 +-
 tensorflow/core/kernels/stage_op.cc           |  74 +-
 tensorflow/core/kernels/strided_slice_op.cc   |   2 +-
 .../core/kernels/strided_slice_op_impl.h      |   2 +-
 tensorflow/core/kernels/string_join_op.cc     |   6 +-
 tensorflow/core/kernels/substr_op.cc          |   6 +-
 tensorflow/core/kernels/summary_image_op.cc   |  16 +-
 tensorflow/core/kernels/summary_op.cc         |  11 +-
 tensorflow/core/kernels/tile_functor_cpu.cc   |   2 +-
 tensorflow/core/kernels/tile_ops_cpu_impl.h   |   2 +-
 tensorflow/core/kernels/training_ops.cc       |  42 +-
 .../core/kernels/training_ops_gpu.cu.cc       |  24 +-
 tensorflow/core/kernels/training_ops_test.cc  |   5 +-
 tensorflow/core/kernels/transpose_op.cc       |   7 +-
 tensorflow/core/kernels/typed_queue.h         |   6 +-
 tensorflow/core/kernels/unpack_op.cc          |   7 +-
 tensorflow/core/kernels/word2vec_kernels.cc   |   6 +-
 tensorflow/core/kernels/xent_op.cc            |  14 +-
 tensorflow/core/kernels/xsmm_conv2d_test.cc   | 344 +++++-----
 tensorflow/core/ops/array_ops.cc              |  17 +-
 tensorflow/core/ops/array_ops_test.cc         |   9 +-
 .../core/ops/candidate_sampling_ops_test.cc   |   9 +-
 tensorflow/core/ops/functional_grad.cc        |   2 +-
 tensorflow/core/ops/math_ops.cc               |   8 +-
 tensorflow/core/ops/nn_ops.cc                 |  12 +-
 tensorflow/core/ops/sdca_ops.cc               |   2 +-
 tensorflow/core/ops/string_ops.cc             |   6 +-
 tensorflow/core/ops/training_ops_test.cc      |   2 +-
 tensorflow/python/BUILD                       |   1 +
 tensorflow/python/framework/ops.py            |   4 +
 .../python/kernel_tests/softmax_op_test.py    |   7 +-
 tensorflow/python/layers/core.py              |   9 +-
 tensorflow/python/ops/math_ops.py             |   9 +-
 tensorflow/stream_executor/executor_cache.cc  |  23 +-
 .../stream_executor/multi_platform_manager.cc |   4 +-
 .../tools/graph_transforms/sparsify_gather.cc | 109 ++-
 .../graph_transforms/sparsify_gather_test.cc  |  40 +-
 325 files changed, 4747 insertions(+), 4414 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/kernel_util_test.cc
 create mode 100644 tensorflow/core/kernels/mkl_tfconv_op.cc

diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index a6d6c8b27f..4ba6da6ccc 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -37,6 +37,9 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
+using tensorflow::str_util::Join;
+using tensorflow::strings::Printf;
+
 namespace xla {
 
 namespace {
@@ -934,7 +937,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}",
       BinaryOperation_Name(operation).c_str(),
       ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(),
-      tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str());
+      Join(broadcast_dimensions, ", ").c_str());
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
 
@@ -1097,7 +1100,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Map operation requires all operands to have the same shape; got: "
         "%s",
-        tensorflow::str_util::Join(pieces, ", ").c_str());
+        Join(pieces, ", ").c_str());
   }
 
   // Check that dimensions.size == arg_shape.dimensions_size() (we currently
@@ -1114,7 +1117,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     if (dimensions[i] != i) {
       return InvalidArgument(
           "Map requires monotonically increasing dimension numbers, found: %s ",
-          tensorflow::str_util::Join(dimensions, ", ").c_str());
+          Join(dimensions, ", ").c_str());
     }
   }
 
@@ -1914,21 +1917,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
     tensorflow::gtl::ArraySlice<int64> limits,
     tensorflow::gtl::ArraySlice<int64> strides) {
+  auto error = [&](const string& message) {
+    return InvalidArgument(
+        "%s in slice operation; argument shape: %s; starts: {%s}; limits: "
+        "{%s}; strides: {%s}",
+        message.c_str(), ShapeUtil::HumanString(arg).c_str(),
+        Join(starts, ",").c_str(), Join(limits, ",").c_str(),
+        Join(strides, ",").c_str());
+  };
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
   VLOG(2) << tensorflow::strings::Printf(
       "slicing shape %s starts={%s} limits={%s}",
-      ShapeUtil::HumanString(arg).c_str(),
-      tensorflow::str_util::Join(starts, ", ").c_str(),
-      tensorflow::str_util::Join(limits, ", ").c_str());
+      ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(),
+      Join(limits, ", ").c_str());
 
   if (starts.size() != limits.size()) {
-    return InvalidArgument("slice start and limit sizes differ: %zu vs %zu",
-                           starts.size(), limits.size());
+    return error(Printf("slice start and limit sizes differ: %zu vs %zu",
+                        starts.size(), limits.size()));
   }
 
   if (starts.size() != strides.size()) {
-    return InvalidArgument("slice start and strides sizes differ: %zu vs %zu",
-                           starts.size(), strides.size());
+    return error(Printf("slice start and strides sizes differ: %zu vs %zu",
+                        starts.size(), strides.size()));
   }
 
   if (starts.size() != ShapeUtil::Rank(arg)) {
@@ -1947,20 +1957,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                              start_index);
     }
     if (limit_index > arg.dimensions(dimension)) {
-      return InvalidArgument(
-          "limit index (%lld) must be less than or equal to dimension "
-          "size (%lld)",
-          limit_index, arg.dimensions(dimension));
+      return error(
+          Printf("limit index (%lld) must be less than or equal to dimension "
+                 "size (%lld)",
+                 limit_index, arg.dimensions(dimension)));
     }
     VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
                                            start_index);
     VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
                                            limit_index);
     if (start_index > limit_index) {
-      return InvalidArgument(
-          "limit index (%lld) must be greater or equal to "
-          "start index (%lld) in slice with positive stride",
-          limit_index, start_index);
+      return error(
+          Printf("limit index (%lld) must be greater or equal to "
+                 "start index (%lld) in slice with positive stride",
+                 limit_index, start_index));
     }
     if (stride <= 0) {
       return InvalidArgument("stride (%lld) must be positive", stride);
@@ -1983,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
       ShapeUtil::HumanString(operand_shape).c_str(),
       ShapeUtil::HumanString(start_indices_shape).c_str(),
-      tensorflow::str_util::Join(slice_sizes, ", ").c_str());
+      Join(slice_sizes, ", ").c_str());
 
   if (ShapeUtil::Rank(start_indices_shape) != 1) {
     return InvalidArgument(
@@ -2280,8 +2290,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Reshape dimensions [%s] are not a permutation of the operand "
         "dimensions (operand shape is %s).",
-        tensorflow::str_util::Join(dimensions, ",").c_str(),
-        ShapeUtil::HumanString(operand).c_str());
+        Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str());
   }
 
   return inferred_shape;
@@ -2373,8 +2382,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   // The applied function's arity equals the number of arguments.
   if (arg_shapes.size() != to_apply.parameters_size()) {
     string computation_signature = ShapeUtil::HumanString(to_apply);
-    string argument_shapes = tensorflow::str_util::Join(
-        arg_shapes, ", ", [](string* out, const Shape* shape) {
+    string argument_shapes =
+        Join(arg_shapes, ", ", [](string* out, const Shape* shape) {
           tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape));
         });
     return InvalidArgument(
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 99d87f3b55..026c021165 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -1512,5 +1512,20 @@ TEST_F(ShapeInferenceTest, Conditional) {
                         "must have the same shape"));
 }
 
+TEST_F(ShapeInferenceTest, BadSlice) {
+  auto arg = ShapeUtil::MakeShape(F32, {4});
+  StatusOr<Shape> statusor =
+      ShapeInference::InferSliceShape(arg, {0}, {5}, {1});
+  ASSERT_FALSE(statusor.ok());
+
+  LOG(INFO) << statusor.status();
+
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("less than or equal to dimension size"))
+      << statusor.status();
+  EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape"))
+      << statusor.status();
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 4195e7553c..b5428d3246 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -71,6 +71,32 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "kernel_util",
+    srcs = [
+        "kernel_util.cc",
+    ],
+    hdrs = [
+        "kernel_util.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:context",
+        "//tensorflow/contrib/lite/kernels/internal:round",
+    ],
+)
+
+tf_cc_test(
+    name = "kernel_util_test",
+    size = "small",
+    srcs = ["kernel_util_test.cc"],
+    deps = [
+        ":kernel_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "builtin_ops",
     srcs = [
@@ -87,7 +113,6 @@ cc_library(
         "fully_connected.cc",
         "gather.cc",
         "hashtable_lookup.cc",
-        "kernel_util.cc",
         "l2norm.cc",
         "local_response_norm.cc",
         "lsh_projection.cc",
@@ -111,7 +136,6 @@ cc_library(
         "unidirectional_sequence_rnn.cc",
     ],
     hdrs = [
-        "kernel_util.h",
         "padding.h",
         "register.h",
     ],
@@ -125,6 +149,7 @@ cc_library(
     }),
     deps = [
         ":activation_functor",
+        ":kernel_util",
         ":op_macros",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc
index b0546c00cf..955e8c5764 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.cc
+++ b/tensorflow/contrib/lite/kernels/kernel_util.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
 #include <algorithm>
 #include <cmath>
+#include <memory>
+
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 
 namespace tflite {
@@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
   }
 }
 
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) {
+  return TfLiteIntArrayEqual(input1->dims, input2->dims);
+}
+
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape) {
+  int64_t dims1 = NumDimensions(input1);
+  int64_t dims2 = NumDimensions(input2);
+  int64_t out_dims = std::max(dims1, dims2);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
+  for (int i = 0; i < out_dims; ++i) {
+    int64_t d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
+    int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
+    TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1);
+    shape->data[out_dims - i - 1] = std::max(d1, d2);
+  }
+  *output_shape = shape.release();
+  return kTfLiteOk;
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
index bfdfba00f5..3cfa72615a 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
 inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
 inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
 
+inline int64_t NumElements(const TfLiteTensor* t) {
+  int64_t count = 1;
+  for (int i = 0; i < NumDimensions(t); ++i) {
+    count *= SizeOfDimension(t, i);
+  }
+  return count;
+}
+
 inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
                                             const TfLiteNode* node, int index) {
   const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
@@ -76,6 +84,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
                                    float* activation_min,
                                    float* activation_max);
 
+// Return true if the given tensors have the same shape.
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2);
+
+// Calculate the output_shape that is necessary for element-wise operations
+// with broadcasting involving the two input tensors.
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape);
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/kernel_util_test.cc b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
new file mode 100644
index 0000000000..63a317f338
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
@@ -0,0 +1,150 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace {
+
+void ReportError(TfLiteContext* context, const char* format, ...) {}
+
+class KernelUtilTest : public ::testing::Test {
+ public:
+  KernelUtilTest() {
+    context_.ReportError = ReportError;
+
+    tensor1_.dims = nullptr;
+    tensor2_.dims = nullptr;
+  }
+  ~KernelUtilTest() {
+    TfLiteTensorFree(&tensor1_);
+    TfLiteTensorFree(&tensor2_);
+  }
+
+  void SetShape(TfLiteTensor* tensor, std::initializer_list<int> dims) {
+    TfLiteTensorFree(tensor);
+    tensor->dims = TfLiteIntArrayCreate(dims.size());
+    int i = 0;
+    for (int d : dims) {
+      tensor->dims->data[i] = d;
+      ++i;
+    }
+  }
+
+  std::vector<int> GetShape(TfLiteIntArray* dims) {
+    std::vector<int> result;
+    for (int i = 0; i < dims->size; ++i) {
+      result.push_back(dims->data[i]);
+    }
+    return result;
+  }
+
+ protected:
+  TfLiteContext context_;
+  TfLiteTensor tensor1_;
+  TfLiteTensor tensor2_;
+};
+
+TEST_F(KernelUtilTest, SameShapeEmpty) {
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor1_, {1, 2, 3});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2, 3, 4});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2, 3});
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor1_, {});
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {1, 3});
+  EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_EQ(output, nullptr);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeOnes) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 1});
+  SetShape(&tensor2_, {1, 3});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {1, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  TfLiteIntArrayFree(output);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeScalars) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {});
+  SetShape(&tensor2_, {2});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2));
+  TfLiteIntArrayFree(output);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {3, 1, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {1, 2, 3, 4});
+  SetShape(&tensor2_, {1, 3, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4));
+  TfLiteIntArrayFree(output);
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index f8281f3a57..c5a62fdb62 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
            "For Protobuf formats, the binary format will be used."),
       Flag("input_format", parsed_flags.input_format.bind(),
            parsed_flags.input_format.default_value(),
-           "Input file format. One of: tensorflow_graphdef, "),
+           "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."),
       Flag("output_format", parsed_flags.output_format.bind(),
-           parsed_flags.output_format.default_value(), "Output file format."),
+           parsed_flags.output_format.default_value(),
+           "Output file format. "
+           "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."),
       Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
            parsed_flags.default_ranges_min.default_value(),
            "If defined, will be used as the default value for the min bound "
@@ -58,11 +60,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
       Flag("inference_type", parsed_flags.inference_type.bind(),
            parsed_flags.inference_type.default_value(),
            "Target data type of arrays in the output file (for input_arrays, "
-           "this may be overridden by inference_input_type)."),
+           "this may be overridden by inference_input_type). "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("inference_input_type", parsed_flags.inference_input_type.bind(),
            parsed_flags.inference_input_type.default_value(),
-           "Target data type of input arrays. If not specified, inference_type "
-           "is used."),
+           "Target data type of input arrays. "
+           "If not specified, inference_type is used. "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("input_type", parsed_flags.input_type.bind(),
            parsed_flags.input_type.default_value(),
            "Deprecated ambiguous flag that set both --input_data_types and "
@@ -76,35 +80,31 @@ bool ParseTocoFlagsFromCommandLineFlags(
 
       Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(),
            parsed_flags.drop_fake_quant.default_value(),
-           "Ignore and discard FakeQuant nodes. For instance, that can be used "
-           "to "
+           "Ignore and discard FakeQuant nodes. For instance, to "
            "generate plain float code without fake-quantization from a "
-           "quantized "
-           "graph."),
+           "quantized graph."),
       Flag(
           "reorder_across_fake_quant",
           parsed_flags.reorder_across_fake_quant.bind(),
           parsed_flags.reorder_across_fake_quant.default_value(),
           "Normally, FakeQuant nodes must be strict boundaries for graph "
           "transformations, in order to ensure that quantized inference has "
-          "the "
-          "exact same arithmetic behavior as quantized training --- which is "
-          "the "
-          "whole point of quantized training and of FakeQuant nodes in the "
-          "first "
-          "place. However, that entails subtle requirements on where exactly "
+          "the exact same arithmetic behavior as quantized training --- which "
+          "is the whole point of quantized training and of FakeQuant nodes in "
+          "the first place. "
+          "However, that entails subtle requirements on where exactly "
           "FakeQuant nodes must be placed in the graph. Some quantized graphs "
           "have FakeQuant nodes at unexpected locations, that prevent graph "
           "transformations that are necessary in order to generate inference "
           "code for these graphs. Such graphs should be fixed, but as a "
           "temporary work-around, setting this reorder_across_fake_quant flag "
-          "allows toco to perform necessary graph transformaitons on them, "
+          "allows TOCO to perform necessary graph transformaitons on them, "
           "at the cost of no longer faithfully matching inference and training "
           "arithmetic."),
       Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
            parsed_flags.allow_custom_ops.default_value(),
-           "If true, allow TOCO to create TF Lite Custom operators for all the"
-           "unsupported Tensorflow ops."),
+           "If true, allow TOCO to create TF Lite Custom operators for all the "
+           "unsupported TensorFlow ops."),
       Flag(
           "drop_control_dependency",
           parsed_flags.drop_control_dependency.bind(),
diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/py2tf/BUILD
index d395de986d..3e846aefeb 100644
--- a/tensorflow/contrib/py2tf/BUILD
+++ b/tensorflow/contrib/py2tf/BUILD
@@ -57,6 +57,7 @@ py_library(
 py_test(
     name = "api_test",
     srcs = ["api_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",
@@ -66,6 +67,7 @@ py_test(
 py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",
@@ -76,6 +78,7 @@ py_test(
 py_test(
     name = "naming_test",
     srcs = ["naming_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":py2tf_internal",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
index 2b0a1234e6..4f90f94e09 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -52,6 +52,7 @@ py_library(
 py_test(
     name = "break_canonicalization_test",
     srcs = ["break_canonicalization_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -62,6 +63,7 @@ py_test(
 py_test(
     name = "call_trees_test",
     srcs = ["call_trees_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -72,6 +74,7 @@ py_test(
 py_test(
     name = "continue_canonicalization_test",
     srcs = ["continue_canonicalization_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -82,6 +85,7 @@ py_test(
 py_test(
     name = "control_flow_test",
     srcs = ["control_flow_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -92,6 +96,7 @@ py_test(
 py_test(
     name = "builtin_functions_test",
     srcs = ["builtin_functions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -112,6 +117,7 @@ py_test(
 py_test(
     name = "logical_expressions_test",
     srcs = ["logical_expressions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -122,6 +128,7 @@ py_test(
 py_test(
     name = "print_functions_test",
     srcs = ["print_functions_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
@@ -133,6 +140,7 @@ py_test(
 py_test(
     name = "side_effect_guards_test",
     srcs = ["side_effect_guards_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD
index e0331dbc97..88902dea84 100644
--- a/tensorflow/contrib/py2tf/pyct/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/BUILD
@@ -38,6 +38,7 @@ py_library(
 py_test(
     name = "anno_test",
     srcs = ["anno_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -47,6 +48,7 @@ py_test(
 py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -57,6 +59,7 @@ py_test(
 py_test(
     name = "parser_test",
     srcs = ["parser_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -66,6 +69,7 @@ py_test(
 py_test(
     name = "pretty_printer_test",
     srcs = ["pretty_printer_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -75,6 +79,7 @@ py_test(
 py_test(
     name = "templates_test",
     srcs = ["templates_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
index abaf953678..32e2954fff 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
@@ -32,6 +32,7 @@ py_library(
 py_test(
     name = "access_test",
     srcs = ["access_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
@@ -43,6 +44,7 @@ py_test(
 py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
@@ -53,6 +55,7 @@ py_test(
 py_test(
     name = "type_info_test",
     srcs = ["type_info_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/py2tf/pyct",
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 995fd1253f..2f13cf8bd7 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -230,8 +230,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   // TODO(tucker): actually maintain separate CPUAllocators for
   // different numa_nodes.  For now, just one.
   numa_node = 0;
-  mutex_lock lock(mu_);
 
+  {
+    // Here we optimize the most common use case where cuda_host_allocators_
+    // and cuda_al_ have already been populated and since we're only reading
+    // these vectors, we can get by with a shared lock. In the slower case,
+    // we take a unique lock and populate these vectors.
+    tf_shared_lock lock(mu_);
+
+    if (FLAGS_brain_gpu_record_mem_types &&
+        static_cast<int>(cuda_al_.size()) > 0) {
+      return cuda_al_[0];
+    }
+    if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
+      return cuda_host_allocators_[0];
+    }
+  }
+
+  mutex_lock lock(mu_);
   // Find the first valid StreamExecutor to request CUDA host memory
   // through, since any will work.
   //
diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index 01a618ed77..39bfca244e 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) {
   DisableDetailedStats(false);
 }
 
-Cluster::~Cluster() {
-}
+Cluster::~Cluster() {}
 
 void Cluster::AllowSoftPlacement(bool soft_placement_state) {
   options_.config.set_allow_soft_placement(soft_placement_state);
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 9db6d46266..5116c8183c 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -325,7 +325,7 @@ class VirtualScheduler {
   // Boolean field for whether the cost is accurate.
   std::map<string, std::pair<int, bool>> op_costs_;
 
-  Costs graph_costs_;                // Graph cost.
+  Costs graph_costs_;                   // Graph cost.
   std::map<string, Costs> op_to_cost_;  // Per-op cost.
 
   // Auxilliary data structures for constructing NodeState and DeviceState.
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h
index c5d2d47782..8d1098d877 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.h
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 
-#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/framework/variable.pb.h"
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index 37976f7183..72155fd037 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice;
 template <typename Device, typename T>
 class AdjustContrastOp : public OpKernel {
  public:
-  explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) {
-  }
+  explicit AdjustContrastOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
diff --git a/tensorflow/core/kernels/adjust_contrast_op_test.cc b/tensorflow/core/kernels/adjust_contrast_op_test.cc
index 0fc03b5a23..7522b32040 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_test.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc
@@ -29,8 +29,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-class AdjustContrastOpTest : public OpsTestBase {
-};
+class AdjustContrastOpTest : public OpsTestBase {};
 
 TEST_F(AdjustContrastOpTest, Simple_1113) {
   TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2")
diff --git a/tensorflow/core/kernels/adjust_saturation_op.cc b/tensorflow/core/kernels/adjust_saturation_op.cc
index 4643d4e6ef..f0c6ae499d 100644
--- a/tensorflow/core/kernels/adjust_saturation_op.cc
+++ b/tensorflow/core/kernels/adjust_saturation_op.cc
@@ -192,8 +192,9 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
-          kCostPerChannel, [channel_count, &input_data, &output_data, scale_h](
-                               int64 start_channel, int64 end_channel) {
+          kCostPerChannel,
+          [channel_count, &input_data, &output_data, scale_h](
+              int64 start_channel, int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {
diff --git a/tensorflow/core/kernels/aggregate_ops_cpu.h b/tensorflow/core/kernels/aggregate_ops_cpu.h
index dfa3fe585e..aa1cead928 100644
--- a/tensorflow/core/kernels/aggregate_ops_cpu.h
+++ b/tensorflow/core/kernels/aggregate_ops_cpu.h
@@ -25,7 +25,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace tensorflow {
 
@@ -201,7 +201,7 @@ struct Add7Functor<SYCLDevice, T> {
                   typename TTypes<T>::ConstFlat in6,
                   typename TTypes<T>::ConstFlat in7) {
     Add7EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7);
+                                          in7);
   }
 };
 
@@ -214,7 +214,7 @@ struct Add8Functor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
     Add8EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7, in8);
+                                          in7, in8);
   }
 };
 
@@ -227,7 +227,7 @@ struct Add8pFunctor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
     Add8pEigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                          in7, in8);
+                                           in7, in8);
   }
 };
 
@@ -241,10 +241,10 @@ struct Add9Functor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
       typename TTypes<T>::ConstFlat in9) {
     Add9EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7, in8, in9);
+                                          in7, in8, in9);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc
index cc8f122cab..ce2fce92e4 100644
--- a/tensorflow/core/kernels/attention_ops.cc
+++ b/tensorflow/core/kernels/attention_ops.cc
@@ -52,8 +52,9 @@ class ExtractGlimpseOp : public OpKernel {
     const int64 batch_size = input_shape.dim_size(0);
 
     const Tensor& window_size = context->input(1);
-    OP_REQUIRES(context, (window_size.shape().dims() == 1) &&
-                             window_size.shape().dim_size(0) == 2,
+    OP_REQUIRES(context,
+                (window_size.shape().dims() == 1) &&
+                    window_size.shape().dim_size(0) == 2,
                 errors::InvalidArgument(
                     "input must be a vector of size 2 (height, width)",
                     window_size.shape().DebugString()));
diff --git a/tensorflow/core/kernels/avgpooling_op.h b/tensorflow/core/kernels/avgpooling_op.h
index dea2683184..f5e81dbc09 100644
--- a/tensorflow/core/kernels/avgpooling_op.h
+++ b/tensorflow/core/kernels/avgpooling_op.h
@@ -48,9 +48,8 @@ struct SpatialAvgPooling {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation
-// that works NHWC data formats.
-// Arguments:
+// Launch a custom GPU kernels from Yanqing for the avgpooling backward
+// operation that works NHWC data formats. Arguments:
 //   top_diff: backprop to the output of the pooling layer
 //   num: number of input batches
 //   height: input height
diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
index 2be330d142..6537b42f1e 100644
--- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
@@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
         hstart = max(hstart, 0);
         wstart = max(wstart, 0);
         int pool_size = (hend - hstart) * (wend - wstart);
-        gradient +=
-            top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size);
+        gradient += top_diff_slice[(ph * pooled_width + pw) * channels] /
+                    dtype(pool_size);
       }
     }
     bottom_diff[index] = gradient;
@@ -90,11 +90,11 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
                             const GPUDevice& d) {
   int x_size = num * height * width * channels;
   CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-  AvePoolBackwardNHWC<
-      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      config.virtual_thread_count, top_diff, num, height, width, channels,
-      pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
-      pad_t, pad_t, bottom_diff);
+  AvePoolBackwardNHWC<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          config.virtual_thread_count, top_diff, num, height, width, channels,
+          pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
+          pad_t, pad_t, bottom_diff);
 
   return d.ok();
 }
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index d0bbea9fe2..944564dfba 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -111,13 +111,14 @@ class Barrier : public ResourceBase {
       mutex_lock lock(mu_);
       if (closed_) {
         OP_REQUIRES_ASYNC(
-            ctx, !cancel_pending_enqueues_ &&
-                     (num_inserted == 0 || !incomplete_.empty()),
+            ctx,
+            !cancel_pending_enqueues_ &&
+                (num_inserted == 0 || !incomplete_.empty()),
             errors::Cancelled(
                 "Barrier ", name_, " is closed.  Pending enqueues cancelled: ",
-                cancel_pending_enqueues_, ".  Number of new insertions: ",
-                num_inserted, ".  Number of incomplete keys: ",
-                incomplete_.size(), "."),
+                cancel_pending_enqueues_,
+                ".  Number of new insertions: ", num_inserted,
+                ".  Number of incomplete keys: ", incomplete_.size(), "."),
             callback);
       }
 
@@ -128,9 +129,10 @@ class Barrier : public ResourceBase {
 
       for (int i = 0; i < num_inserted; ++i) {
         OP_REQUIRES_OK_ASYNC(
-            ctx, InsertOneLocked<T>(ctx, keys, values, element_shape,
-                                    component_index, i, &ready_tuples,
-                                    &new_elements),
+            ctx,
+            InsertOneLocked<T>(ctx, keys, values, element_shape,
+                               component_index, i, &ready_tuples,
+                               &new_elements),
             callback);
       }
 
@@ -317,8 +319,9 @@ class Barrier : public ResourceBase {
         return errors::Cancelled(
             "Barrier ", name_,
             " is closed, but attempted to insert a brand new key: ",
-            keys_vec(i), ".  Pending enqueues cancelled: ",
-            cancel_pending_enqueues_, ".  Insertion index: ", i,
+            keys_vec(i),
+            ".  Pending enqueues cancelled: ", cancel_pending_enqueues_,
+            ".  Insertion index: ", i,
             ".  Number of incomplete keys: ", incomplete_.size(), ".");
       }
     } else {
@@ -532,13 +535,14 @@ class InsertManyOp : public BarrierOpKernel {
     OP_REQUIRES_ASYNC(
         ctx, component_index_ < barrier->num_components(),
         errors::InvalidArgument("The component ID is out of range ",
-                                component_index_, " > num_components", " (= ",
-                                barrier->num_components(), ")"),
+                                component_index_, " > num_components",
+                                " (= ", barrier->num_components(), ")"),
         callback);
     OP_REQUIRES_OK_ASYNC(
-        ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING,
-                                  barrier->component_type(component_index_)},
-                                 {}),
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_STRING,
+                             barrier->component_type(component_index_)},
+                            {}),
         callback);
 
     const Tensor* keys;
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index 5b4e1a809f..c447db842d 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -13,22 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/batching_util/periodic_function.h"
+#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/macros.h"
 
-
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 93c3918319..43e716c542 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -41,7 +41,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace {
 
@@ -429,14 +429,13 @@ template <typename Scalar>
 struct LaunchBatchMatMul<SYCLDevice, Scalar> {
   static void Launch(OpKernelContext* context, const Tensor& in_x,
                      const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-
-  // Number of matrix multiplies i.e. size of the batch.
-  const int64 batch_size = in_x.dim_size(0);
-  ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y, out,
-                                        0, batch_size);
+    // Number of matrix multiplies i.e. size of the batch.
+    const int64 batch_size = in_x.dim_size(0);
+    ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y,
+                                          out, 0, batch_size);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename Scalar>
 class BatchMatMul : public OpKernel {
@@ -462,10 +461,10 @@ class BatchMatMul : public OpKernel {
     TensorShape out_shape;
     for (int i = 0; i < ndims - 2; ++i) {
       OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
-                  errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(",
-                                          i, ") must be the same: ",
-                                          in0.shape().DebugString(), " vs ",
-                                          in1.shape().DebugString()));
+                  errors::InvalidArgument(
+                      "In[0].dim(", i, ") and In[1].dim(", i,
+                      ") must be the same: ", in0.shape().DebugString(), " vs ",
+                      in1.shape().DebugString()));
       out_shape.AddDim(in0.dim_size(i));
     }
     auto n = (ndims == 2) ? 1 : out_shape.num_elements();
@@ -507,12 +506,12 @@ class BatchMatMul : public OpKernel {
   bool adj_y_;
 };
 
-#define REGISTER_BATCH_MATMUL_CPU(TYPE)                                              \
+#define REGISTER_BATCH_MATMUL_CPU(TYPE)                                 \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<CPUDevice, TYPE>)
 
-#define REGISTER_BATCH_MATMUL_GPU(TYPE)                                              \
+#define REGISTER_BATCH_MATMUL_GPU(TYPE)                                 \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<GPUDevice, TYPE>)
@@ -522,5 +521,5 @@ class BatchMatMul : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                               \
       Name("BatchMatMul").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
       BatchMatMul<SYCLDevice, TYPE>)
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc
index 8d155ca62b..7e1e2aa4ec 100644
--- a/tensorflow/core/kernels/batch_matmul_op_real.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_real.cc
@@ -35,5 +35,5 @@ TF_CALL_half(REGISTER_BATCH_MATMUL_GPU);
 #ifdef TENSORFLOW_USE_SYCL
 TF_CALL_float(REGISTER_BATCH_MATMUL_SYCL);
 TF_CALL_double(REGISTER_BATCH_MATMUL_SYCL);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_matmul_op_test.cc b/tensorflow/core/kernels/batch_matmul_op_test.cc
index 7923f34155..c3932cd7b9 100644
--- a/tensorflow/core/kernels/batch_matmul_op_test.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_test.cc
@@ -53,9 +53,10 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a,
 /* Uncomment to enable benchmarks for double & complex types: */
 // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64,
 // gpu);
-// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu);                    \
-// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);  \
-// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu);                    \
+// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \
+// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);
+// \
+// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \
 // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu);
 
 // Typical fully connected layers
diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc
index d3ed617f71..c34ea14bf6 100644
--- a/tensorflow/core/kernels/batch_norm_op.cc
+++ b/tensorflow/core/kernels/batch_norm_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class BatchNormOp : public OpKernel {
diff --git a/tensorflow/core/kernels/batch_norm_op_test.cc b/tensorflow/core/kernels/batch_norm_op_test.cc
index 5e3fcd2114..45ddc85329 100644
--- a/tensorflow/core/kernels/batch_norm_op_test.cc
+++ b/tensorflow/core/kernels/batch_norm_op_test.cc
@@ -54,7 +54,7 @@ TEST_F(BatchNormOpTest, Simple) {
   Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
   test::FillValues<float>(
       &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f,
-                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f });
+                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
 }
 
diff --git a/tensorflow/core/kernels/batchtospace_op.cc b/tensorflow/core/kernels/batchtospace_op.cc
index c1c0d6d329..b07c5fd718 100644
--- a/tensorflow/core/kernels/batchtospace_op.cc
+++ b/tensorflow/core/kernels/batchtospace_op.cc
@@ -56,9 +56,10 @@ static void BatchToSpaceOpCompute(OpKernelContext* context,
       errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
                               " instead of ", orig_input_tensor.dims()));
 
-  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
-                           block_dims == orig_crops.dim_size(0) &&
-                           2 == orig_crops.dim_size(1),
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
+                  block_dims == orig_crops.dim_size(0) &&
+                  2 == orig_crops.dim_size(1),
               errors::InvalidArgument("crops should have shape [", block_dims,
                                       ", 2] instead of ",
                                       orig_crops.shape().DebugString()));
diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc
index 7fc4b1762d..8e4f08e473 100644
--- a/tensorflow/core/kernels/bcast_ops.cc
+++ b/tensorflow/core/kernels/bcast_ops.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/util/bcast.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index 2ca194a77f..754b93b073 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -77,14 +77,14 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
   }
   CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
   if (data_format == FORMAT_NHWC) {
-    BiasNHWCKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size);
+    BiasNHWCKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size);
   } else {
-    BiasNCHWKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size,
-        image_size);
+    BiasNCHWKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size,
+            image_size);
   }
 }
 
@@ -206,10 +206,10 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
   // Check if we have enough shared memory.
   if (shared_memory_size <= max_shared_memory_size) {
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, shared_memory_size,
-               d.stream()>>>(total_count, output_backprop, bias_backprop,
-                             bias_size);
+      BiasGradNHWC_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, shared_memory_size,
+             d.stream()>>>(total_count, output_backprop, bias_backprop,
+                           bias_size);
     } else {
       // Round up the block count to multiple of bias_size.
       int group_size = (config.block_count + bias_size - 1) / bias_size;
@@ -217,23 +217,24 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
       if (config.thread_per_block < kWarpSize) {
         config.thread_per_block = kWarpSize;
       }
-      BiasGradNCHW_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          output_backprop, bias_backprop, batch, bias_size, image_size,
-          group_size);
+      BiasGradNCHW_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              output_backprop, bias_backprop, batch, bias_size, image_size,
+              group_size);
     }
   } else {
     // Note that even if we don't have enough shared memory to fit the entire
     // output block, it is possible to process one group of elements at a time.
     // But for now, we simply fall back to the naive implementation.
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size);
+      BiasGradNHWC_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size);
     } else {
-      BiasGradNCHW_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size, image_size);
+      BiasGradNCHW_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size,
+              image_size);
     }
   }
 }
diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/kernels/bounds_check.h
index e35f42ad41..c8c60c5524 100644
--- a/tensorflow/core/kernels/bounds_check.h
+++ b/tensorflow/core/kernels/bounds_check.h
@@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
   auto *to_x = reinterpret_cast<const volatile T *>(&x);
   return *to_x;
 }
-}  // namespace tensorflow::internal
+}  // namespace internal
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_UTIL_BOUNDS_CHECK_H_
diff --git a/tensorflow/core/kernels/candidate_sampler_ops.cc b/tensorflow/core/kernels/candidate_sampler_ops.cc
index e937c4f11b..654d99301a 100644
--- a/tensorflow/core/kernels/candidate_sampler_ops.cc
+++ b/tensorflow/core/kernels/candidate_sampler_ops.cc
@@ -126,13 +126,13 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU),
                         SimpleCandidateSamplerOp<LogUniformSampler>);
 
-REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<UnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<UnigramSampler>);
 
-REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
 
 class AllCandidateSamplerOp : public BaseCandidateSamplerOp {
  public:
@@ -197,8 +197,9 @@ class ComputeAccidentalHitsOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& in_true_candidates = context->input(0);
     const TensorShape& in_true_candidates_shape = in_true_candidates.shape();
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
-                             in_true_candidates_shape.dim_size(1) == num_true_,
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
+                    in_true_candidates_shape.dim_size(1) == num_true_,
                 errors::InvalidArgument(
                     "true_candidates must be a batch_size * num_true matrix"));
 
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index f16abb2b79..626db9131a 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -36,7 +36,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define CURRY_TYPES2(FN, arg0)   \
   FN(arg0, bool);                \
@@ -223,11 +223,11 @@ class SyclCastOp : public CastOpBase {
   }
 };
 
-#define REGISTER_CAST_SYCL(srctype, dsttype)                    \
-  REGISTER_KERNEL_BUILDER(Name("Cast")                          \
-                              .TypeConstraint<srctype>("SrcT")  \
-                              .TypeConstraint<dsttype>("DstT")  \
-                              .Device(DEVICE_SYCL),             \
+#define REGISTER_CAST_SYCL(srctype, dsttype)                   \
+  REGISTER_KERNEL_BUILDER(Name("Cast")                         \
+                              .TypeConstraint<srctype>("SrcT") \
+                              .TypeConstraint<dsttype>("DstT") \
+                              .Device(DEVICE_SYCL),            \
                           SyclCastOp)
 CURRY_TYPES2(REGISTER_CAST_SYCL, bool);
 CURRY_TYPES2(REGISTER_CAST_SYCL, int32);
@@ -237,7 +237,7 @@ CURRY_TYPES2(REGISTER_CAST_SYCL, double);
 
 #undef REGISTER_CAST_SYCL
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #undef CURRY_TYPES2
 
@@ -250,6 +250,5 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"),
     CpuCastOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index 8fedf2c271..fd4e75d26f 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> {
     p[0] = a.value;
     p[1] = 0;
 #else
-    static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!");
+    static_assert(::tensorflow::port::kLittleEndian,
+                  "Not a little endian system!");
     p[0] = 0;
     p[1] = a.value;
 #endif
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index 470e9e0804..3ae9f2ab4d 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -41,25 +41,25 @@ struct CastFunctor<Eigen::SyclDevice, O, I> {
     o.device(d) = i.template cast<O>();
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
-#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1)   \
-  FN(arg0, arg1, bool);                        \
-  FN(arg0, arg1, uint8);                       \
-  FN(arg0, arg1, int8);                        \
-  FN(arg0, arg1, uint16);                      \
-  FN(arg0, arg1, int16);                       \
-  FN(arg0, arg1, int32);                       \
-  FN(arg0, arg1, int64);                       \
-  FN(arg0, arg1, float);                       \
-  FN(arg0, arg1, double);                      \
-  FN(arg0, arg1, std::complex<float>);         \
+#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
+  FN(arg0, arg1, bool);                      \
+  FN(arg0, arg1, uint8);                     \
+  FN(arg0, arg1, int8);                      \
+  FN(arg0, arg1, uint16);                    \
+  FN(arg0, arg1, int16);                     \
+  FN(arg0, arg1, int32);                     \
+  FN(arg0, arg1, int64);                     \
+  FN(arg0, arg1, float);                     \
+  FN(arg0, arg1, double);                    \
+  FN(arg0, arg1, std::complex<float>);       \
   FN(arg0, arg1, std::complex<double>)
 
-#define CURRY_TYPES3(FN, arg0, arg1)           \
-  CURRY_TYPES3_NO_HALF(FN, arg0, arg1)         \
+#define CURRY_TYPES3(FN, arg0, arg1)   \
+  CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
   FN(arg0, arg1, Eigen::half);
 
 #define CAST_CASE(DEVICE, IN, OUT)                                         \
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
index a106f287c1..057e209a71 100644
--- a/tensorflow/core/kernels/cast_op_test.cc
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -107,10 +107,10 @@ static void BM_gpu_float_int64(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
   test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }
 BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -130,10 +130,10 @@ static void BM_gpu_bool_float(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
   test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }
 BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -180,7 +180,7 @@ static void BM_gpu_float_half(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 }
 BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -191,7 +191,7 @@ static void BM_gpu_half_float(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 }
 BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
 
diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc
index ba100b32e7..9cc2e67bbe 100644
--- a/tensorflow/core/kernels/colorspace_op.cc
+++ b/tensorflow/core/kernels/colorspace_op.cc
@@ -107,14 +107,14 @@ class HSVToRGBOp : public OpKernel {
   }
 };
 
-#define REGISTER_CPU(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \
-                              .TypeConstraint<T>("T"),        \
-                          RGBToHSVOp<CPUDevice, T>);          \
-  template class RGBToHSVOp<CPUDevice, T>;                    \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \
-                              .TypeConstraint<T>("T"),        \
-                          HSVToRGBOp<CPUDevice, T>);          \
+#define REGISTER_CPU(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      RGBToHSVOp<CPUDevice, T>);                                  \
+  template class RGBToHSVOp<CPUDevice, T>;                        \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      HSVToRGBOp<CPUDevice, T>);                                  \
   template class HSVToRGBOp<CPUDevice, T>;
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
@@ -123,40 +123,39 @@ TF_CALL_double(REGISTER_CPU);
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
-#define DECLARE_GPU(T)                                        \
-  template <>                                                 \
-  void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d, \
-      TTypes<T, 2>::ConstTensor input_data,                   \
-      TTypes<T, 1>::Tensor range,                             \
-      TTypes<T, 2>::Tensor output_data);                      \
-  extern template struct RGBToHSV<GPUDevice, T>;              \
-  template <>                                                 \
-  void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d, \
-      TTypes<T, 2>::ConstTensor input_data,                   \
-      TTypes<T, 2>::Tensor output_data);                      \
+#define DECLARE_GPU(T)                                               \
+  template <>                                                        \
+  void RGBToHSV<GPUDevice, T>::operator()(                           \
+      const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data,      \
+      TTypes<T, 1>::Tensor range, TTypes<T, 2>::Tensor output_data); \
+  extern template struct RGBToHSV<GPUDevice, T>;                     \
+  template <>                                                        \
+  void HSVToRGB<GPUDevice, T>::operator()(                           \
+      const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data,      \
+      TTypes<T, 2>::Tensor output_data);                             \
   extern template struct HSVToRGB<GPUDevice, T>;
 TF_CALL_float(DECLARE_GPU);
 TF_CALL_double(DECLARE_GPU);
 }  // namespace functor
-#define REGISTER_GPU(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \
-                              .TypeConstraint<T>("T"),        \
-                          RGBToHSVOp<GPUDevice, T>);          \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \
-                              .TypeConstraint<T>("T"),        \
-                          HSVToRGBOp<GPUDevice, T>);
+#define REGISTER_GPU(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      RGBToHSVOp<GPUDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      HSVToRGBOp<GPUDevice, T>);
 TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \
-                              .TypeConstraint<T>("T"),         \
-                          RGBToHSVOp<SYCLDevice, T>);          \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \
-                              .TypeConstraint<T>("T"),         \
-                          HSVToRGBOp<SYCLDevice, T>);
+#define REGISTER_SYCL(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
+      RGBToHSVOp<SYCLDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
+      HSVToRGBOp<SYCLDevice, T>);
 TF_CALL_float(REGISTER_SYCL);
 TF_CALL_double(REGISTER_SYCL);
 #endif
diff --git a/tensorflow/core/kernels/colorspace_op.h b/tensorflow/core/kernels/colorspace_op.h
index c5721ef6dd..90bfce1419 100644
--- a/tensorflow/core/kernels/colorspace_op.h
+++ b/tensorflow/core/kernels/colorspace_op.h
@@ -54,10 +54,9 @@ struct RGBToHSV {
     // TODO(wicke): all these assignments are only necessary because a combined
     // expression is larger than kernel parameter space. A custom kernel is
     // probably in order.
-    H.device(d) = (R == V).select(norm * (G - B),
-                                  (G == V).select(
-                                      norm * (B - R) + T(2) / T(6),
-                                      norm * (R - G) + T(4) / T(6)));
+    H.device(d) = (R == V).select(
+        norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6),
+                                        norm * (R - G) + T(4) / T(6)));
     H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
     H.device(d) = (H < T(0)).select(H + T(1), H);
   }
diff --git a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
index e19d0b14d5..61f9ba44c4 100644
--- a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/colorspace_op.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/colorspace_op.h"
 
 namespace tensorflow {
 
@@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice;
   template class functor::HSVToRGB<GPUDevice, T>;
 TF_CALL_float(INSTANTIATE_GPU);
 TF_CALL_double(INSTANTIATE_GPU);
-}
+}  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/colorspace_op_test.cc b/tensorflow/core/kernels/colorspace_op_test.cc
index 8c6fb732ab..bd82826770 100644
--- a/tensorflow/core/kernels/colorspace_op_test.cc
+++ b/tensorflow/core/kernels/colorspace_op_test.cc
@@ -224,34 +224,34 @@ class HSVToRGBOpTest : public OpsTestBase {
   }
 };
 
-#define TEST_COLORSPACE(test, dt)                               \
-  TEST_F(test, CheckBlack) {                                    \
-    MakeOp(dt);                                                 \
-    CheckBlack(dt);                                             \
-  }                                                             \
-  TEST_F(test, CheckGray) {                                     \
-    MakeOp(dt);                                                 \
-    CheckGray(dt);                                              \
-  }                                                             \
-  TEST_F(test, CheckWhite) {                                    \
-    MakeOp(dt);                                                 \
-    CheckWhite(dt);                                             \
-  }                                                             \
-  TEST_F(test, CheckRedMax) {                                   \
-    MakeOp(dt);                                                 \
-    CheckRedMax(dt);                                            \
-  }                                                             \
-  TEST_F(test, CheckGreenMax) {                                 \
-    MakeOp(dt);                                                 \
-    CheckGreenMax(dt);                                          \
-  }                                                             \
-  TEST_F(test, CheckBlueMax) {                                  \
-    MakeOp(dt);                                                 \
-    CheckBlueMax(dt);                                           \
-  }                                                             \
-  TEST_F(test, CheckNegativeDifference) {                       \
-    MakeOp(dt);                                                 \
-    CheckNegativeDifference(dt);                                \
+#define TEST_COLORSPACE(test, dt)         \
+  TEST_F(test, CheckBlack) {              \
+    MakeOp(dt);                           \
+    CheckBlack(dt);                       \
+  }                                       \
+  TEST_F(test, CheckGray) {               \
+    MakeOp(dt);                           \
+    CheckGray(dt);                        \
+  }                                       \
+  TEST_F(test, CheckWhite) {              \
+    MakeOp(dt);                           \
+    CheckWhite(dt);                       \
+  }                                       \
+  TEST_F(test, CheckRedMax) {             \
+    MakeOp(dt);                           \
+    CheckRedMax(dt);                      \
+  }                                       \
+  TEST_F(test, CheckGreenMax) {           \
+    MakeOp(dt);                           \
+    CheckGreenMax(dt);                    \
+  }                                       \
+  TEST_F(test, CheckBlueMax) {            \
+    MakeOp(dt);                           \
+    CheckBlueMax(dt);                     \
+  }                                       \
+  TEST_F(test, CheckNegativeDifference) { \
+    MakeOp(dt);                           \
+    CheckNegativeDifference(dt);          \
   }
 
 typedef RGBToHSVOpTest<float> rgb_to_hsv_float;
diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h
index 526f9420d7..16784c4770 100644
--- a/tensorflow/core/kernels/concat_lib.h
+++ b/tensorflow/core/kernels/concat_lib.h
@@ -41,10 +41,11 @@ namespace tensorflow {
 
 // Assumes all inputs are nonempty
 template <typename T>
-void ConcatCPU(DeviceBase* d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output);
+void ConcatCPU(
+    DeviceBase* d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output);
 #if GOOGLE_CUDA
 template <typename T>
 void ConcatGPU(
@@ -57,11 +58,12 @@ void ConcatGPU(
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-void ConcatSYCL(const Eigen::SyclDevice& d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output);
-#endif // TENSORFLOW_USE_SYCL
+void ConcatSYCL(
+    const Eigen::SyclDevice& d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output);
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_CONCAT_LIB_H_
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index 43731114c0..fc5a3e6288 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -48,10 +48,11 @@ struct MemCpyCopier<ResourceHandle> {
 }  // namespace
 
 template <typename T>
-void ConcatCPU(DeviceBase* d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output) {
+void ConcatCPU(
+    DeviceBase* d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output) {
   if (std::is_same<T, string>::value) {
     // use a large cost here to force strings to be handled by separate threads
     ConcatCPUImpl<T>(d, inputs, 100000, MemCpyCopier<T>(), output);
@@ -86,21 +87,22 @@ TF_CALL_variant(REGISTER)
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-void ConcatSYCL(const Eigen::SyclDevice& d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output) {
+void ConcatSYCL(
+    const Eigen::SyclDevice& d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output) {
   ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
-                   output);
+                    output);
 }
-#define REGISTER_SYCL(T)                                                      \
- template void ConcatSYCL<T>(                                                 \
-     const Eigen::SyclDevice&,                                                \
-     const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
-     typename TTypes<T, 2>::Matrix* output);
+#define REGISTER_SYCL(T)                                                       \
+  template void ConcatSYCL<T>(                                                 \
+      const Eigen::SyclDevice&,                                                \
+      const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
+      typename TTypes<T, 2>::Matrix* output);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL)
 
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h
index 6a933efde4..720b506537 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.h
+++ b/tensorflow/core/kernels/concat_lib_cpu.h
@@ -15,9 +15,9 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/concat_lib.h"
 #include <vector>
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -73,7 +73,7 @@ void ConcatCPUImpl(
 
   // Sharded mode.
   auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs](
-      int64 start, int64 end) {
+                  int64 start, int64 end) {
     int64 skipped_rows = start / row_size;
     T* out = output->data() + skipped_rows * row_size;
     T* out_start = output->data() + start;
@@ -160,5 +160,5 @@ void ConcatSYCLImpl(
     }
   }
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index ae1b5da32e..7011550f7e 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -37,7 +37,7 @@ typedef Eigen::GpuDevice GPUDevice;
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
 
@@ -71,8 +71,9 @@ class ConcatBaseOp : public OpKernel {
     const TensorShape& input_shape = values[0].shape();
 
     int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
-    OP_REQUIRES(c, (0 <= axis && axis < input_dims) ||
-                       (allow_legacy_scalars() && concat_dim == 0),
+    OP_REQUIRES(c,
+                (0 <= axis && axis < input_dims) ||
+                    (allow_legacy_scalars() && concat_dim == 0),
                 errors::InvalidArgument(
                     "ConcatOp : Expected concatenating dimensions in the range "
                     "[",
@@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel {
           c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
-              input_shape.DebugString(), " vs. shape[", i, "] = ",
-              in.shape().DebugString()));
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", in.shape().DebugString()));
       for (int j = 0; j < input_dims; ++j) {
         if (j == axis) {
           continue;
@@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel {
             c, in.dim_size(j) == input_shape.dim_size(j),
             errors::InvalidArgument(
                 "ConcatOp : Dimensions of inputs should match: shape[0] = ",
-                input_shape.DebugString(), " vs. shape[", i, "] = ",
-                in.shape().DebugString()));
+                input_shape.DebugString(), " vs. shape[", i,
+                "] = ", in.shape().DebugString()));
       }
       if (in.NumElements() > 0) {
         int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
@@ -142,7 +143,7 @@ class ConcatBaseOp : public OpKernel {
         ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
         return;
       }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
@@ -252,7 +253,7 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
                         ConcatV2Op<CPUDevice, int32>);
 
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class ConcatOffsetOp : public OpKernel {
  public:
@@ -347,5 +348,5 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
                             .HostMemory("shape")
                             .HostMemory("offset"),
                         ConcatOffsetOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc
index c5bded9daf..e3ba8ae9f6 100644
--- a/tensorflow/core/kernels/concat_op_test.cc
+++ b/tensorflow/core/kernels/concat_op_test.cc
@@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
 
 typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
-                         Eigen::Unaligned> EigenMap;
+                         Eigen::Unaligned>
+    EigenMap;
 static void MemcpyManyAlternative1(int iters, int dim2) {
   testing::StopTiming();
 
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index 794ac6fa6d..c7c7c98369 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -160,7 +160,7 @@ class ConditionalAccumulatorBase : public ResourceBase {
  * Modifications to convenience macros defined in core/framework/op_kernel.h.
  * The below macros return a boolean if the test fails, so that the calling
  * function can get an indication that a failure has occurred.
-*/
+ */
 #define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS)          \
   do {                                                 \
     if (!TF_PREDICT_TRUE(EXP)) {                       \
diff --git a/tensorflow/core/kernels/conditional_accumulator_op.cc b/tensorflow/core/kernels/conditional_accumulator_op.cc
index fa37916eab..e13bf8a4c6 100644
--- a/tensorflow/core/kernels/conditional_accumulator_op.cc
+++ b/tensorflow/core/kernels/conditional_accumulator_op.cc
@@ -99,9 +99,10 @@ class AccumulatorTakeGradientOp
                       ConditionalAccumulatorBase* accumulator,
                       DoneCallback callback) override {
     // Check signature
-    OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32},
-                                                  {accumulator->dtype()}),
-                         callback);
+    OP_REQUIRES_OK_ASYNC(
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}),
+        callback);
   }
 
  private:
@@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp
 REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU),
                         AccumulatorTakeGradientOp);
 
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 59f9f69315..920cd87858 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -146,7 +146,6 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
 
-
 template <typename Device, typename T, typename Index>
 class FillOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 8fe82d118a..7d5d54e5be 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -113,47 +113,47 @@ REGISTER_GPU_HOST_REF_KERNEL(string);
 #undef REGISTER_GPU_HOST_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_SWITCH(type)                       \
-  REGISTER_KERNEL_BUILDER(Name("Switch")                 \
-                              .Device(DEVICE_SYCL)       \
-                              .HostMemory("pred")        \
-                              .TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_SWITCH(type)                        \
+  REGISTER_KERNEL_BUILDER(Name("Switch")                  \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("pred")         \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH);
 
-#define REGISTER_SYCL_REF_SWITCH(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
-                              .Device(DEVICE_SYCL)         \
-                              .HostMemory("pred")          \
-                              .TypeConstraint<type>("T"),  \
+#define REGISTER_SYCL_REF_SWITCH(type)                    \
+  REGISTER_KERNEL_BUILDER(Name("RefSwitch")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("pred")         \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH);
 
 #undef REGISTER_SYCL_SWITCH
 #undef REGISTER_SYCL_REF_SWITCH
 
-#define REGISTER_SYCL_HOST_KERNEL(type)                  \
-  REGISTER_KERNEL_BUILDER(Name("Switch")                 \
-                              .Device(DEVICE_SYCL)       \
-                              .HostMemory("data")        \
-                              .HostMemory("pred")        \
-                              .HostMemory("output_false")\
-                              .HostMemory("output_true") \
-                              .TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_HOST_KERNEL(type)                   \
+  REGISTER_KERNEL_BUILDER(Name("Switch")                  \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("pred")         \
+                              .HostMemory("output_false") \
+                              .HostMemory("output_true")  \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 
 REGISTER_SYCL_HOST_KERNEL(bool);
 REGISTER_SYCL_HOST_KERNEL(string);
 REGISTER_SYCL_HOST_KERNEL(int32);
 
-#define REGISTER_SYCL_HOST_REF_KERNEL(type)                \
-  REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
-                              .Device(DEVICE_SYCL)         \
-                              .HostMemory("data")          \
-                              .HostMemory("pred")          \
-                              .HostMemory("output_false")  \
-                              .HostMemory("output_true")   \
-                              .TypeConstraint<type>("T"),  \
+#define REGISTER_SYCL_HOST_REF_KERNEL(type)               \
+  REGISTER_KERNEL_BUILDER(Name("RefSwitch")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("pred")         \
+                              .HostMemory("output_false") \
+                              .HostMemory("output_true")  \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 
 REGISTER_SYCL_HOST_REF_KERNEL(int32);
@@ -162,7 +162,7 @@ REGISTER_SYCL_HOST_REF_KERNEL(string);
 
 #undef REGISTER_SYCL_HOST_KERNEL
 #undef REGISTER_SYCL_HOST_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class RefSelectOp : public OpKernel {
  public:
@@ -282,7 +282,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
 
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -331,7 +331,7 @@ REGISTER_SYCL_HOST_KERNEL(string);
 REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 void EnterOp::Compute(OpKernelContext* context) {
   if (IsRefType(context->input_dtype(0))) {
@@ -360,14 +360,14 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)  \
-  REGISTER_KERNEL_BUILDER(          \
+#define REGISTER_SYCL_KERNEL(type) \
+  REGISTER_KERNEL_BUILDER(         \
       Name("Enter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 
-#define REGISTER_SYCL_REF_KERNEL(type)  \
-  REGISTER_KERNEL_BUILDER(              \
+#define REGISTER_SYCL_REF_KERNEL(type) \
+  REGISTER_KERNEL_BUILDER(             \
       Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
 REGISTER_SYCL_REF_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
@@ -398,7 +398,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_SYCL_HOST_KERNEL
 #undef REGISTER_SYCL_HOST_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -455,10 +455,10 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);   \
-  REGISTER_KERNEL_BUILDER(                                                   \
+#define REGISTER_SYCL_KERNEL(type)                                         \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp); \
+  REGISTER_KERNEL_BUILDER(                                                 \
       Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@@ -483,7 +483,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(string);
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -556,12 +556,12 @@ REGISTER_GPU_HOST_KERNEL(string);
 #undef REGISTER_GPU_HOST_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),   \
-      NextIterationOp);                                                      \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_KERNEL(type)                                            \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),    \
+      NextIterationOp);                                                       \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
       NextIterationOp)
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@@ -585,7 +585,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(string);
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // A LoopCond op has one input and one output. The input is a boolean
 // scalar representing the taken branches of the "pivot" Switch that
@@ -619,7 +619,7 @@ REGISTER_KERNEL_BUILDER(Name("LoopCond")
                             .HostMemory("input")
                             .HostMemory("output"),
                         LoopCondOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // ControlTrigger kernels
 REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU),
@@ -631,7 +631,7 @@ REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU),
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_SYCL),
                         ControlTriggerOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // When called, abort op will abort the current process. This can be used to
 // abort remote PSs when needed.
diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc
index affa0e8ca6..a2f7bd4069 100644
--- a/tensorflow/core/kernels/control_flow_ops_test.cc
+++ b/tensorflow/core/kernels/control_flow_ops_test.cc
@@ -91,6 +91,7 @@ class KilledBySignal {
  public:
   explicit KilledBySignal(int signum) : signum_(signum) {}
   bool operator()(int exit_status) const { return exit_status == signum_; }
+
  private:
   const int signum_;
 };
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 985586d626..dbddaf3dc6 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -688,7 +688,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
-      );
+  );
 
   int device_id = stream->parent()->device_ordinal();
   DataType dtype = input.dtype();
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index 291ebf2298..1b40ad81f4 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -679,8 +679,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
 
     const int dims = resized_shape.dims();
     OP_REQUIRES(
-        context, TensorShapeUtils::IsMatrix(paddings.shape()) &&
-                     paddings.dim_size(1) == 2,
+        context,
+        TensorShapeUtils::IsMatrix(paddings.shape()) &&
+            paddings.dim_size(1) == 2,
         errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
                                 paddings.shape().DebugString()));
     const int fixed_dims =
@@ -715,20 +716,22 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
       const int32 after =
           paddings_matrix(d, 1);  // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("paddings must be non-negative: ",
-                                          before, " ", after));
+                  errors::InvalidArgument(
+                      "paddings must be non-negative: ", before, " ", after));
       if (offset_ == 0) {  // SYMMETRIC mode.
         OP_REQUIRES(
-            context, before <= resized_shape.dim_size(d) &&
-                         after <= resized_shape.dim_size(d),
+            context,
+            before <= resized_shape.dim_size(d) &&
+                after <= resized_shape.dim_size(d),
             errors::InvalidArgument("paddings must be no greater "
                                     "than the dimension size: ",
                                     before, ", ", after, " greater than ",
                                     resized_shape.dim_size(d)));
       } else if (offset_ == 1) {  // REFLECT mode.
         OP_REQUIRES(
-            context, before < resized_shape.dim_size(d) &&
-                         after < resized_shape.dim_size(d),
+            context,
+            before < resized_shape.dim_size(d) &&
+                after < resized_shape.dim_size(d),
             errors::InvalidArgument("paddings must be less than"
                                     " the dimension size: ",
                                     before, ", ", after, " not less than ",
@@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // We only check the first three dims, since the depth is accessed as an
     // int64 below.
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     // The last dimension for input is in_depth. It must be the same as the
     // filter's in_depth.
     const int64 in_depth = padded_shape.dim_size(3);
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -786,9 +790,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // The second dimension for input is rows/height.
     // The first dimension for filter is rows/height.
     const int64 padded_rows_raw = padded_shape.dim_size(1);
-    OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int padded_rows = static_cast<int>(padded_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
     const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
@@ -796,9 +801,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // The third dimension for input is columns/width.
     // The second dimension for filter is columns/width.
     const int64 padded_cols_raw = padded_shape.dim_size(2);
-    OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int padded_cols = static_cast<int>(padded_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
     const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
@@ -864,24 +870,26 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
 };
 
-#define REGISTER_FUSED(T)                                                    \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("FusedResizeAndPadConv2D")                                        \
-          .Device(DEVICE_CPU)                                                \
-          .TypeConstraint<T>("T"),                                           \
-      FusedResizeConv2DUsingGemmOp<                                          \
-          T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                          BILINEAR>,                         \
+#define REGISTER_FUSED(T)                                                 \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedResizeAndPadConv2D")                                     \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<T>("T"),                                        \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       BILINEAR>,                         \
           true>);
 
 TF_CALL_float(REGISTER_FUSED);
 
-#define REGISTER_PAD_ONLY_FUSED(T)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),      \
-      FusedResizeConv2DUsingGemmOp<                                          \
-          T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                          NEAREST>,                          \
+#define REGISTER_PAD_ONLY_FUSED(T)                                        \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       NEAREST>,                          \
           false>);
 
 TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 57e196c67c..f0085be3a5 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -27,7 +27,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-
 // Get the Cudnn workspace limit from the environment variable, which is in MB.
 // Return the workspace memory limit in bytes. If no value is set, return the
 // default value.
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index af6013c974..e58f5f61f3 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -25,9 +25,9 @@ limitations under the License.
 #include "cuda/include/cuda.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "tensorflow/core/lib/math/math_util.h"
 
 namespace tensorflow {
 
@@ -252,11 +252,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   int x = threadIdx.x;
 
   Dimension<3> output_dims = {
-      input_dims[0], input_dims[2], input_dims[1],
+      input_dims[0],
+      input_dims[2],
+      input_dims[1],
   };
 
   Dimension<3> input_dims_in_tiles = {
-      input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI,
+      input_dims[0],
+      (input_dims[1] + TileSizeI - 1) / TileSizeI,
       (input_dims[2] + TileSizeJ - 1) / TileSizeJ,
   };
 
@@ -264,7 +267,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
       FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);
 
   Index<3> input_tile_origin = {
-      input_tile_index[0], input_tile_index[1] * TileSizeI,
+      input_tile_index[0],
+      input_tile_index[1] * TileSizeI,
       input_tile_index[2] * TileSizeJ,
   };
 
@@ -322,11 +326,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   __syncthreads();
 
   Index<3> output_tile_index = {
-      input_tile_index[0], input_tile_index[2], input_tile_index[1],
+      input_tile_index[0],
+      input_tile_index[2],
+      input_tile_index[1],
   };
 
   Index<3> output_tile_origin = {
-      output_tile_index[0], output_tile_index[1] * TileSizeJ,
+      output_tile_index[0],
+      output_tile_index[1] * TileSizeJ,
       output_tile_index[2] * TileSizeI,
   };
 
@@ -799,7 +806,7 @@ struct TransposeElemType<16> {
 // A helper function to make RunSwapDimension1And2InTensor3 concise. This
 // helper function looks at the data type and input matrix sizes and decides
 // the thread numbers and tile sizes to use.
-template <typename T, bool conjugate = false >
+template <typename T, bool conjugate = false>
 void SwapDimension1And2InTensor3WithNarrowMatrices(
     const GPUDevice& d, const T* input, const Dimension<3>& input_dims,
     T* output, const int kMinDimensionToUseTiles) {
@@ -902,19 +909,21 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
     constexpr int kNumThreads = 256;
 
     Dimension<3> input_dims_in_tiles = {
-        input_dims[0], MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
+        input_dims[0],
+        MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
         MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
     };
 
     int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
                             input_dims_in_tiles[2];
-    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize, conjugate>
+    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
+                                          conjugate>
         <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
                                                             output);
 
   } else if (narrow_matrix) {
-    SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(d, input, input_dims, output,
-                                                  kMinDimensionToUseTiles);
+    SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
+        d, input, input_dims, output, kMinDimensionToUseTiles);
   } else {
     int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
     CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc
index 20da77c36f..af0a9fa82e 100644
--- a/tensorflow/core/kernels/conv_ops_using_gemm.cc
+++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc
@@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
                                         filter.shape().DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     // The last dimension for input is in_depth. It must be the same as the
     // filter's in_depth.
     const int64 in_depth = GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -487,18 +488,20 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
     // The second dimension for input is rows/height.
     // The first dimension for filter is rows/height.
     const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
-    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int input_rows = static_cast<int>(input_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
 
     // The third dimension for input is columns/width.
     // The second dimension for filter is columns/width.
     const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
-    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int input_cols = static_cast<int>(input_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
 
diff --git a/tensorflow/core/kernels/cross_op_gpu.cu.cc b/tensorflow/core/kernels/cross_op_gpu.cu.cc
index 7ea0b3be0c..4a37f6cfbb 100644
--- a/tensorflow/core/kernels/cross_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/cross_op_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/cross_op.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/cross_op.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc
index 73ee310604..96bdb6a241 100644
--- a/tensorflow/core/kernels/ctc_decoder_ops.cc
+++ b/tensorflow/core/kernels/ctc_decoder_ops.cc
@@ -19,13 +19,13 @@ limitations under the License.
 
 #include <limits>
 
-#include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
@@ -80,16 +80,17 @@ class CTCDecodeHelper {
 
     if (!(batch_size == (*seq_len)->dim_size(0))) {
       return errors::FailedPrecondition(
-          "len(sequence_length) != batch_size.  ", "len(sequence_length):  ",
-          (*seq_len)->dim_size(0), " batch_size: ", batch_size);
+          "len(sequence_length) != batch_size.  ",
+          "len(sequence_length):  ", (*seq_len)->dim_size(0),
+          " batch_size: ", batch_size);
     }
 
     auto seq_len_t = (*seq_len)->vec<int32>();
 
     for (int b = 0; b < batch_size; ++b) {
       if (!(seq_len_t(b) <= max_time)) {
-        return errors::FailedPrecondition("sequence_length(", b, ") <= ",
-                                          max_time);
+        return errors::FailedPrecondition("sequence_length(", b,
+                                          ") <= ", max_time);
       }
     }
 
diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc
index fb03adb7a5..b38d838bf1 100644
--- a/tensorflow/core/kernels/ctc_loss_op.cc
+++ b/tensorflow/core/kernels/ctc_loss_op.cc
@@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel {
       const int64 batch_indices = g.group()[0];
       OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size),
                   errors::InvalidArgument("labels batch index must be between ",
-                                          0, " and ", batch_size, " but saw: ",
-                                          batch_indices));
+                                          0, " and ", batch_size,
+                                          " but saw: ", batch_indices));
 
       auto values = g.values<int32>();
       std::vector<int>* b_values = &labels_t[batch_indices];
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
index 5fd38d9dc2..1466f24202 100644
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -45,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("Abs")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::abs<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc
index 12cc6c8bdd..4919122607 100644
--- a/tensorflow/core/kernels/cwise_op_acos.cc
+++ b/tensorflow/core/kernels/cwise_op_acos.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acosh.cc b/tensorflow/core/kernels/cwise_op_acosh.cc
index 39c8814073..c2b355ab7f 100644
--- a/tensorflow/core/kernels/cwise_op_acosh.cc
+++ b/tensorflow/core/kernels/cwise_op_acosh.cc
@@ -17,12 +17,12 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc
index 608a6dce3d..bf32c8a54b 100644
--- a/tensorflow/core/kernels/cwise_op_add_1.cc
+++ b/tensorflow/core/kernels/cwise_op_add_1.cc
@@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
                         BinaryOp<CPUDevice, functor::add<int32>>);
 #endif
 
-
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_KERNEL(type)                          \
   REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \
@@ -66,5 +65,5 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::add<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_add_2.cc b/tensorflow/core/kernels/cwise_op_add_2.cc
index ac21ca06c9..e8acbac285 100644
--- a/tensorflow/core/kernels/cwise_op_add_2.cc
+++ b/tensorflow/core/kernels/cwise_op_add_2.cc
@@ -22,8 +22,8 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64,
-          uint8, complex128, string);
+REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8,
+          complex128, string);
 // Notice: String is excluded to allow marking AddV2 is_commutative and
 // is_aggregate.
 REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8,
diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc
index c28e27d95a..fe8dfea117 100644
--- a/tensorflow/core/kernels/cwise_op_asin.cc
+++ b/tensorflow/core/kernels/cwise_op_asin.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_asinh.cc b/tensorflow/core/kernels/cwise_op_asinh.cc
index 0aec6aac34..7cf0405f52 100644
--- a/tensorflow/core/kernels/cwise_op_asinh.cc
+++ b/tensorflow/core/kernels/cwise_op_asinh.cc
@@ -1,10 +1,10 @@
-  /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-  http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc
index 7d73de4810..09f0448874 100644
--- a/tensorflow/core/kernels/cwise_op_atan.cc
+++ b/tensorflow/core/kernels/cwise_op_atan.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_atanh.cc b/tensorflow/core/kernels/cwise_op_atanh.cc
index 7b688db4c5..6170683fa6 100644
--- a/tensorflow/core/kernels/cwise_op_atanh.cc
+++ b/tensorflow/core/kernels/cwise_op_atanh.cc
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc
index 0111e9d5fd..816eadc80e 100644
--- a/tensorflow/core/kernels/cwise_op_ceil.cc
+++ b/tensorflow/core/kernels/cwise_op_ceil.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc
index d4b3b0e393..71ad0ff0dc 100644
--- a/tensorflow/core/kernels/cwise_op_cos.cc
+++ b/tensorflow/core/kernels/cwise_op_cos.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_cosh.cc b/tensorflow/core/kernels/cwise_op_cosh.cc
index bca99a4f89..31b4bb3cad 100644
--- a/tensorflow/core/kernels/cwise_op_cosh.cc
+++ b/tensorflow/core/kernels/cwise_op_cosh.cc
@@ -16,20 +16,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
+          complex128);
 
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Cosh")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
+#define REGISTER_SYCL_KERNEL(TYPE)                                \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("Cosh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Cosh", functor::cosh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index d44c1bf473..c71c756e44 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -54,5 +54,5 @@ REGISTER_KERNEL_BUILDER(Name("Div")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_div<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc
index 66d7b7d22e..8f4ac98016 100644
--- a/tensorflow/core/kernels/cwise_op_exp.cc
+++ b/tensorflow/core/kernels/cwise_op_exp.cc
@@ -26,5 +26,5 @@ REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double,
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc
index 4f72308006..ce03ad5de6 100644
--- a/tensorflow/core/kernels/cwise_op_expm1.cc
+++ b/tensorflow/core/kernels/cwise_op_expm1.cc
@@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc
index 5a142b9ce9..d554d41c41 100644
--- a/tensorflow/core/kernels/cwise_op_floor.cc
+++ b/tensorflow/core/kernels/cwise_op_floor.cc
@@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc
index fa81ef0872..fecbf85989 100644
--- a/tensorflow/core/kernels/cwise_op_floor_div.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_div.cc
@@ -49,5 +49,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc
index 55f8a30461..29340b8850 100644
--- a/tensorflow/core/kernels/cwise_op_floor_mod.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc
@@ -40,5 +40,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
index e7dff5d0ac..77723b3169 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-  DEFINE_UNARY1(conj, complex64);
-  DEFINE_UNARY1(conj, complex128);
+DEFINE_UNARY1(conj, complex64);
+DEFINE_UNARY1(conj, complex128);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
index 3675398126..26748ef0e7 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
@@ -20,7 +20,7 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 DEFINE_BINARY10(equal_to, float, Eigen::half, double, uint8, int8, int16, int64,
-               complex64, complex128, bool);
+                complex64, complex128, bool);
 DEFINE_APPROXIMATE_EQUAL2(float, double);
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
index a54dbdfc24..627ecc8c80 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
@@ -15,8 +15,10 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 
-#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#define EIGEN_USE_GPU
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 
 namespace tensorflow {
 namespace functor {
@@ -38,19 +40,17 @@ struct SelectScalarFunctor<GPUDevice, T> {
                   typename TTypes<bool>::ConstScalar cond,
                   typename TTypes<T>::ConstFlat then_flat,
                   typename TTypes<T>::ConstFlat else_flat) {
-
 #if !defined(EIGEN_HAS_INDEX_LIST)
-  Eigen::array<int, 1> rank1{1};
+    Eigen::array<int, 1> rank1{1};
 #else
-  Eigen::IndexList<Eigen::type2index<1>> rank1;
+    Eigen::IndexList<Eigen::type2index<1> > rank1;
 #endif
-  const int size  = then_flat.dimension(0);
-  Eigen::array<int, 1> broadcast_dims{size};
-
-  To32Bit(out).device(d) = cond.reshape(rank1)
-                               .broadcast(broadcast_dims)
-                               .select(then_flat, else_flat);
+    const int size = then_flat.dimension(0);
+    Eigen::array<int, 1> broadcast_dims{size};
 
+    To32Bit(out).device(d) = cond.reshape(rank1)
+                                 .broadcast(broadcast_dims)
+                                 .select(then_flat, else_flat);
   }
 };
 
@@ -89,8 +89,8 @@ struct BatchSelectFunctor<GPUDevice, T> {
   }
 };
 
-#define SELECT_FUNCTOR(T)                      \
-  template struct SelectFunctor<GPUDevice, T>; \
+#define SELECT_FUNCTOR(T)                            \
+  template struct SelectFunctor<GPUDevice, T>;       \
   template struct SelectScalarFunctor<GPUDevice, T>; \
   template struct BatchSelectFunctor<GPUDevice, T>;
 
diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc
index ba89899fb3..a4ea408836 100644
--- a/tensorflow/core/kernels/cwise_op_greater.cc
+++ b/tensorflow/core/kernels/cwise_op_greater.cc
@@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::greater<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc
index 8f0c483aec..3f34d6269e 100644
--- a/tensorflow/core/kernels/cwise_op_greater_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc
@@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double);
+REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float,
+          double);
 
 REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
                             .Device(DEVICE_SYCL)
@@ -44,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::greater_equal<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc
index df2c02e42e..f5cafcc780 100644
--- a/tensorflow/core/kernels/cwise_op_invert.cc
+++ b/tensorflow/core/kernels/cwise_op_invert.cc
@@ -21,7 +21,7 @@ REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
-         uint8, uint16);
+          uint8, uint16);
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc
index 53ec1c1c63..ae1e590d24 100644
--- a/tensorflow/core/kernels/cwise_op_isfinite.cc
+++ b/tensorflow/core/kernels/cwise_op_isfinite.cc
@@ -26,5 +26,5 @@ REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc
index 4b34744304..f22ca21e1c 100644
--- a/tensorflow/core/kernels/cwise_op_isinf.cc
+++ b/tensorflow/core/kernels/cwise_op_isinf.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc
index ad2dd3f722..aa180c247e 100644
--- a/tensorflow/core/kernels/cwise_op_isnan.cc
+++ b/tensorflow/core/kernels/cwise_op_isnan.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
index 136c3666df..00cdecdbd1 100644
--- a/tensorflow/core/kernels/cwise_op_less.cc
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Less")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::less<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
index 97a2508d12..11806c5fc7 100644
--- a/tensorflow/core/kernels/cwise_op_less_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -44,5 +44,5 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::less_equal<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
index 7fdfdff0e3..98936e0f96 100644
--- a/tensorflow/core/kernels/cwise_op_log.cc
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc
index 25ad7b24bb..162ca9e07c 100644
--- a/tensorflow/core/kernels/cwise_op_log1p.cc
+++ b/tensorflow/core/kernels/cwise_op_log1p.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
index 87d54e380b..8c54f22f10 100644
--- a/tensorflow/core/kernels/cwise_op_maximum.cc
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::maximum<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
index 442171193b..dff83df828 100644
--- a/tensorflow/core/kernels/cwise_op_minimum.cc
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -43,6 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Minimum")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::minimum<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index 023eb07ca3..0e8d2e3735 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
-          uint8, int32);
+REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
+          int32);
 #if defined(__ANDROID_TYPES_SLIM__)
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the
@@ -28,7 +28,7 @@ REGISTER(BinaryOp, CPU, "Mul", functor::mul, int32);
 
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double,
-           uint8);
+          uint8);
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
@@ -50,5 +50,5 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::mul<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul_2.cc b/tensorflow/core/kernels/cwise_op_mul_2.cc
index 7be5857cc0..6aa8f88364 100644
--- a/tensorflow/core/kernels/cwise_op_mul_2.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_2.cc
@@ -22,11 +22,11 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "Mul", functor::mul,
-          int8, uint16, int16, int64, complex64, complex128);
+REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64,
+          complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64,
-           complex64, complex128);
+          complex64, complex128);
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc
index 536891b548..a136769b91 100644
--- a/tensorflow/core/kernels/cwise_op_neg.cc
+++ b/tensorflow/core/kernels/cwise_op_neg.cc
@@ -27,7 +27,7 @@ REGISTER_KERNEL_BUILDER(Name("Neg")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::neg<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64,
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
index 7bd81ee127..02cd298745 100644
--- a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
-           double, uint8, int8, int16);
+          double, uint8, int8, int16);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
           double, uint8);
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
index 7d4ecec59f..05bdea6636 100644
--- a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
@@ -30,5 +30,5 @@ REGISTER6(BinaryOp, GPU, "NotEqual", functor::not_equal_to, int8, int16, int64,
 
 #endif  // GOOGLE_CUDA
 
-#endif   // !defined(__ANDROID_TYPES_SLIM__)
+#endif  // !defined(__ANDROID_TYPES_SLIM__)
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_reciprocal.cc b/tensorflow/core/kernels/cwise_op_reciprocal.cc
index 8c0e21f9cf..aee25747b8 100644
--- a/tensorflow/core/kernels/cwise_op_reciprocal.cc
+++ b/tensorflow/core/kernels/cwise_op_reciprocal.cc
@@ -38,7 +38,7 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -48,5 +48,5 @@ REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index 3dd9de8d89..e259daaba4 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -30,7 +30,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SelectOp : public OpKernel {
@@ -185,7 +185,7 @@ REGISTER_SELECT_SYCL(double);
 REGISTER_SELECT_SYCL(int32);
 REGISTER_SELECT_SYCL(int64);
 #undef REGISTER_SELECT_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 
@@ -201,13 +201,11 @@ struct SelectFunctorBase {
 };
 
 template <typename T>
-struct SelectFunctor<CPUDevice, T>
-        : SelectFunctorBase<CPUDevice, T> {};
+struct SelectFunctor<CPUDevice, T> : SelectFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-struct SelectFunctor<SYCLDevice, T>
-        : SelectFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+struct SelectFunctor<SYCLDevice, T> : SelectFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 struct SelectScalarFunctorBase {
@@ -222,12 +220,12 @@ struct SelectScalarFunctorBase {
 // CPU Specializations of Select functors with scalar
 template <typename T>
 struct SelectScalarFunctor<CPUDevice, T>
-        : SelectScalarFunctorBase<CPUDevice, T> {};
+    : SelectScalarFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SelectScalarFunctor<SYCLDevice, T>
-        : SelectScalarFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+    : SelectScalarFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 struct BatchSelectFunctorBase {
@@ -240,8 +238,8 @@ struct BatchSelectFunctorBase {
     const Eigen::DenseIndex all_but_batch = then_flat_outer_dims.dimension(1);
 
 #if !defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{ 1, all_but_batch }};
-    Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{ batch, 1 }};
+    Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{1, all_but_batch}};
+    Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{batch, 1}};
 #else
     Eigen::IndexList<Eigen::type2index<1>, Eigen::DenseIndex> broadcast_dims;
     broadcast_dims.set(1, all_but_batch);
@@ -257,13 +255,13 @@ struct BatchSelectFunctorBase {
 };
 
 template <typename T>
-struct BatchSelectFunctor<CPUDevice, T>
-        : BatchSelectFunctorBase<CPUDevice, T> {};
+struct BatchSelectFunctor<CPUDevice, T> : BatchSelectFunctorBase<CPUDevice, T> {
+};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct BatchSelectFunctor<SYCLDevice, T>
-        : BatchSelectFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+    : BatchSelectFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc
index a76a088ac8..c132fdb63f 100644
--- a/tensorflow/core/kernels/cwise_op_sigmoid.cc
+++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc
@@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, Eigen::half,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(UnaryOp, SYCL, "Sigmoid", functor::sigmoid, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -35,6 +35,6 @@ REGISTER3(SimpleBinaryOp, GPU, "SigmoidGrad", functor::sigmoid_grad, float,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(SimpleBinaryOp, SYCL, "SigmoidGrad", functor::sigmoid_grad, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc
index a4084d5ad1..02915ff4ce 100644
--- a/tensorflow/core/kernels/cwise_op_sign.cc
+++ b/tensorflow/core/kernels/cwise_op_sign.cc
@@ -41,6 +41,6 @@ REGISTER_KERNEL_BUILDER(Name("Sign")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::sign<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc
index b91ff1ac30..16c6057864 100644
--- a/tensorflow/core/kernels/cwise_op_sin.cc
+++ b/tensorflow/core/kernels/cwise_op_sin.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Sin", functor::sin, float, double);
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sinh.cc b/tensorflow/core/kernels/cwise_op_sinh.cc
index 055f0b12e1..26b7a940aa 100644
--- a/tensorflow/core/kernels/cwise_op_sinh.cc
+++ b/tensorflow/core/kernels/cwise_op_sinh.cc
@@ -16,20 +16,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64,
+          complex128);
 
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Sinh")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
+#define REGISTER_SYCL_KERNEL(TYPE)                                \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("Sinh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Sinh", functor::sinh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
index 00efbb00f1..497756133d 100644
--- a/tensorflow/core/kernels/cwise_op_sqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Sqrt", functor::sqrt, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -36,5 +36,5 @@ REGISTER3(SimpleBinaryOp, GPU, "SqrtGrad", functor::sqrt_grad, float,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(SimpleBinaryOp, SYCL, "SqrtGrad", functor::sqrt_grad, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
index 07a4b0b084..7fc2f6bf08 100644
--- a/tensorflow/core/kernels/cwise_op_square.cc
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Square")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::square<int32>>);
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc
index 6adaecba04..025041946a 100644
--- a/tensorflow/core/kernels/cwise_op_sub.cc
+++ b/tensorflow/core/kernels/cwise_op_sub.cc
@@ -53,5 +53,5 @@ REGISTER_KERNEL_BUILDER(Name("Sub")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::sub<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index 7891b1183d..c1a25767d3 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Tan", functor::tan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc
index 8b3900892c..c5005f5ea8 100644
--- a/tensorflow/core/kernels/cwise_op_tanh.cc
+++ b/tensorflow/core/kernels/cwise_op_tanh.cc
@@ -26,7 +26,7 @@ REGISTER3(UnaryOp, GPU, "Tanh", functor::tanh, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float,
           Eigen::half, double, complex64, complex128);
diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc
index e561e59cf5..980edffceb 100644
--- a/tensorflow/core/kernels/cwise_ops_common.cc
+++ b/tensorflow/core/kernels/cwise_ops_common.cc
@@ -57,9 +57,9 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
       in1(ctx->input(1)),
       bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) {
   if (!bcast.IsValid()) {
-    ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ",
-                                           in0.shape().DebugString(), " vs. ",
-                                           in1.shape().DebugString()));
+    ctx->SetStatus(errors::InvalidArgument(
+        "Incompatible shapes: ", in0.shape().DebugString(), " vs. ",
+        in1.shape().DebugString()));
     return;
   }
   const TensorShape output_shape = BCast::ToShape(bcast.output_shape());
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
index 4394770708..e81b840a50 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
@@ -50,16 +50,16 @@ struct SimpleBinaryFunctor<GPUDevice, Functor> {
 
 // Macros to explicitly instantiate kernels on GPU for multiple types
 // (T0, T1, etc.) for SimpleBiaryFunctor (e.g., functor::tanh_grad).
-#define DEFINE_SIMPLE_BINARY1(F, T)                  \
+#define DEFINE_SIMPLE_BINARY1(F, T) \
   template struct SimpleBinaryFunctor<GPUDevice, F<T> >
-#define DEFINE_SIMPLE_BINARY2(F, T0, T1)             \
-  DEFINE_SIMPLE_BINARY1(F, T0);                      \
+#define DEFINE_SIMPLE_BINARY2(F, T0, T1) \
+  DEFINE_SIMPLE_BINARY1(F, T0);          \
   DEFINE_SIMPLE_BINARY1(F, T1)
-#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2)         \
-  DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
+#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2) \
+  DEFINE_SIMPLE_BINARY2(F, T0, T1);          \
   DEFINE_SIMPLE_BINARY1(F, T2)
-#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3)     \
-  DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
+#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3) \
+  DEFINE_SIMPLE_BINARY2(F, T0, T1);              \
   DEFINE_SIMPLE_BINARY2(F, T2, T3)
 #define DEFINE_SIMPLE_BINARY5(F, T0, T1, T2, T3, T4) \
   DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h
index 77b330f589..82cdae9a34 100644
--- a/tensorflow/core/kernels/cwise_ops_gradients.h
+++ b/tensorflow/core/kernels/cwise_ops_gradients.h
@@ -171,7 +171,6 @@ struct SimpleBinaryFunctor<CPUDevice, Functor> {
   }
 };
 
-
 #ifdef TENSORFLOW_USE_SYCL
 // Partial specialization of BinaryFunctor for SYCL devices
 typedef Eigen::SyclDevice SYCLDevice;
@@ -184,7 +183,7 @@ struct SimpleBinaryFunctor<SYCLDevice, Functor> {
   }
 };
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename T>
 struct tanh_grad : base<T, Eigen::internal::scalar_tanh_gradient_op<T>> {};
diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h
index 3f6ff7303d..3e107cee04 100644
--- a/tensorflow/core/kernels/cwise_ops_sycl_common.h
+++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h
@@ -51,7 +51,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
   void operator()(const SYCLDevice& d, typename Functor::tout_type out,
                   typename Functor::tin_type in0,
                   typename Functor::tin_type in1, bool* error) {
-    To32Bit(out).device(d) = To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
+    To32Bit(out).device(d) =
+        To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
   }
 
   void Left(const SYCLDevice& d, typename Functor::tout_type out,
@@ -61,7 +62,9 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
     constexpr int NumDims = Functor::tin_type::NumDimensions;
     static_assert(NumDims == 1, "Unexpected size");
     Eigen::Sizes<1> scalar_dim;
-    out.device(d) = scalar.reshape(scalar_dim).broadcast(in.dimensions()).binaryExpr(in, Binary());
+    out.device(d) = scalar.reshape(scalar_dim)
+                        .broadcast(in.dimensions())
+                        .binaryExpr(in, Binary());
   }
 
   void Right(const SYCLDevice& d, typename Functor::tout_type out,
@@ -71,7 +74,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
     constexpr int NumDims = Functor::tin_type::NumDimensions;
     static_assert(NumDims == 1, "Unexpected size");
     Eigen::Sizes<1> scalar_dim;
-    out.device(d) = in.binaryExpr(scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
+    out.device(d) = in.binaryExpr(
+        scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
   }
 
   void BCast(const SYCLDevice& d,
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
index bca0f1004d..39f497e716 100644
--- a/tensorflow/core/kernels/cwise_ops_test.cc
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -54,36 +54,36 @@ int ColsFromArg(int arg) { return (arg % kRows); }
 BM_UNARY(cpu, Floor, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Floor, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_UNARY(sycl, Floor, float, DT_FLOAT);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_UNARY(cpu, Floor, double, DT_DOUBLE);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Floor, double, DT_DOUBLE);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_UNARY(sycl, Floor, double, DT_DOUBLE);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 BM_UNARY(cpu, Rint, double, DT_DOUBLE);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Rint, double, DT_DOUBLE);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_UNARY(cpu, Rint, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Rint, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 // data func scalar.
 Graph* BinaryScalar(int num, const string& func) {
@@ -113,18 +113,18 @@ Graph* BinaryScalar(int num, const string& func) {
 BM_BINARY_SCALAR(cpu, Less);
 #if GOOGLE_CUDA
 BM_BINARY_SCALAR(gpu, Less);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Less);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_BINARY_SCALAR(cpu, Add);
 #if GOOGLE_CUDA
 BM_BINARY_SCALAR(gpu, Add);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Add);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BINARY_SCALAR
 
 template <class T>
@@ -163,11 +163,11 @@ using Eigen::half;
 BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #undef BM_BIAS_ADD_ALL
 #undef BM_BIAS_ADD
 
@@ -217,15 +217,15 @@ using Eigen::half;
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #undef BM_BIAS_ADD_GRAD_ALL
 #undef BM_BIAS_ADD_GRAD
 
@@ -265,10 +265,10 @@ Graph* BcastAdd(int rows, int cols, int dim) {
 BM_BCAST_ADD_ROW_ALL(cpu);
 #if GOOGLE_CUDA
 BM_BCAST_ADD_ROW_ALL(gpu);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_ROW_ALL(sycl);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BCAST_ADD_ROW_ALL
 #undef BM_BCAST_ADD_ROW
 
@@ -291,10 +291,10 @@ BM_BCAST_ADD_ROW_ALL(sycl);
 BM_BCAST_ADD_COL_ALL(cpu);
 #if GOOGLE_CUDA
 BM_BCAST_ADD_COL_ALL(gpu);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_COL_ALL(sycl);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BCAST_ADD_COL_ALL
 #undef BM_BCAST_ADD_COL
 
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 56044a3d41..ca22f10a85 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -430,13 +430,10 @@ class IteratorStateVariant {
 REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant,
                                        kIteratorVariantTypeName);
 
-// TODO(mrry): Can we simply use the template kernel here?
 class IteratorHandleOp : public OpKernel {
  public:
   explicit IteratorHandleOp(OpKernelConstruction* ctx)
       : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_STRING, TensorShape({2}),
-                                                 &handle_, nullptr));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
@@ -460,56 +457,51 @@ class IteratorHandleOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
-    FunctionLibraryRuntime* lib = context->function_library();
-    std::unique_ptr<DeviceMgr> device_mgr(nullptr);
-    std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
-    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
-    // If the iterator is shared then we construct a new FLR, and pass that in.
-    // NOTE(mrry,rohanj): In this case it is not possible to call remote
-    // functions from the iterator. We may add this functionality if there
-    // is sufficient demand, but it will require a significant refactoring.
-    if (!name_.empty()) {
-      lib = CreateFLR(context, &device_mgr, &flib_def, &pflr);
-    }
+    {
+      mutex_lock l(mu_);
+      if (resource_ == nullptr) {
+        FunctionLibraryRuntime* lib = context->function_library();
+        std::unique_ptr<DeviceMgr> device_mgr(nullptr);
+        std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+        std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
+        // If the iterator is shared then we construct a new FLR, and pass that
+        // in. NOTE(mrry,rohanj): In this case it is not possible to call remote
+        // functions from the iterator. We may add this functionality if there
+        // is sufficient demand, but it will require a significant refactoring.
+        if (!name_.empty()) {
+          lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr);
+        }
 
-    if (resource_ == nullptr) {
-      ResourceMgr* mgr = context->resource_manager();
-      OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+        ResourceMgr* mgr = context->resource_manager();
+        OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+
+        IteratorResource* resource;
+        OP_REQUIRES_OK(
+            context,
+            mgr->LookupOrCreate<IteratorResource>(
+                cinfo_.container(), cinfo_.name(), &resource,
+                [lib, &device_mgr, &flib_def, &pflr,
+                 this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                  *ret = new IteratorResource(
+                      output_dtypes_, output_shapes_, graph_def_version_,
+                      std::move(device_mgr), std::move(flib_def),
+                      std::move(pflr), lib);
+                  return Status::OK();
+                }));
+
+        Status s = VerifyResource(resource);
+        if (TF_PREDICT_FALSE(!s.ok())) {
+          resource->Unref();
+          context->SetStatus(s);
+          return;
+        }
 
-      IteratorResource* resource;
-      OP_REQUIRES_OK(
-          context,
-          mgr->LookupOrCreate<IteratorResource>(
-              cinfo_.container(), cinfo_.name(), &resource,
-              [lib, &device_mgr, &flib_def, &pflr, this](IteratorResource** ret)
-                  EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                    *ret = new IteratorResource(
-                        output_dtypes_, output_shapes_, graph_def_version_,
-                        std::move(device_mgr), std::move(flib_def),
-                        std::move(pflr), lib);
-                    return Status::OK();
-                  }));
-
-      Status s = VerifyResource(resource);
-      if (TF_PREDICT_FALSE(!s.ok())) {
-        resource->Unref();
-        context->SetStatus(s);
-        return;
+        resource_ = resource;
       }
-
-      auto h = handle_.AccessTensor(context)->template flat<string>();
-      h(0) = cinfo_.container();
-      h(1) = cinfo_.name();
-      resource_ = resource;
-    }
-    if (context->expected_output_dtype(0) == DT_RESOURCE) {
-      OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
-                                  context, 0, cinfo_.container(), cinfo_.name(),
-                                  MakeTypeIndex<IteratorResource>()));
-    } else {
-      context->set_output_ref(0, &mu_, handle_.AccessTensor(context));
     }
+    OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
+                                context, 0, cinfo_.container(), cinfo_.name(),
+                                MakeTypeIndex<IteratorResource>()));
   }
 
  private:
@@ -526,7 +518,7 @@ class IteratorHandleOp : public OpKernel {
     return Status::OK();
   }
 
-  FunctionLibraryRuntime* CreateFLR(
+  FunctionLibraryRuntime* CreatePrivateFLR(
       OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
       std::unique_ptr<FunctionLibraryDefinition>* flib_def,
       std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) {
@@ -546,9 +538,8 @@ class IteratorHandleOp : public OpKernel {
   }
 
   mutex mu_;
-  ContainerInfo cinfo_ GUARDED_BY(mu_);
+  ContainerInfo cinfo_;  // Written once under mu_ then constant afterwards.
   IteratorResource* resource_ GUARDED_BY(mu_) = nullptr;
-  PersistentTensor handle_ GUARDED_BY(mu_);
   DataTypeVector output_dtypes_;
   std::vector<PartialTensorShape> output_shapes_;
   const int graph_def_version_;
diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc
index 965a60c7e0..1b94ea0544 100644
--- a/tensorflow/core/kernels/debug_ops.cc
+++ b/tensorflow/core/kernels/debug_ops.cc
@@ -46,7 +46,7 @@ REGISTER_KERNEL_BUILDER(Name("CopyHost")
                             .HostMemory("input")
                             .HostMemory("output"),
                         CopyOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug identity (non-ref and ref) ops.
 REGISTER_KERNEL_BUILDER(Name("DebugIdentity").Device(DEVICE_CPU),
@@ -66,7 +66,7 @@ REGISTER_KERNEL_BUILDER(Name("DebugIdentity")
                             .HostMemory("input")
                             .HostMemory("output"),
                         DebugIdentityOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug NaN-counter (non-ref and ref) ops.
 #define REGISTER_DEBUG_NAN_COUNT(type)                                    \
@@ -98,7 +98,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double);
                           DebugNanCountOp<type>);
 REGISTER_GPU_DEBUG_NAN_COUNT(float);
 REGISTER_GPU_DEBUG_NAN_COUNT(double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug numeric summary ops.
 #define REGISTER_DEBUG_NUMERIC_SUMMARY_COUNT(type)        \
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index 381add3fb3..53a23b1306 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -21,7 +21,7 @@ limitations under the License.
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #include "tensorflow/core/debug/debug_io_utils.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -91,7 +91,7 @@ class CopyOp : public OpKernel {
       Device* device = static_cast<Device*>(context->device());
       // Determine if the input tensor is not on CPU (e.g., on GPU).
       const bool off_host_input = device->device_type() == DEVICE_SYCL &&
-                            !context->input_alloc_attr(0).on_host();
+                                  !context->input_alloc_attr(0).on_host();
 
       if (off_host_input) {
         SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor);
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
index c4555db453..0c42f63252 100644
--- a/tensorflow/core/kernels/decode_csv_op.cc
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -91,9 +91,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               int32 value;
               OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid int32: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid int32: ", fields[f]));
               output[f]->flat<int32>()(i) = value;
             }
             break;
@@ -111,9 +111,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               int64 value;
               OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid int64: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid int64: ", fields[f]));
               output[f]->flat<int64>()(i) = value;
             }
             break;
@@ -130,9 +130,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               float value;
               OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid float: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid float: ", fields[f]));
               output[f]->flat<float>()(i) = value;
             }
             break;
@@ -150,9 +150,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               double value;
               OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid double: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid double: ", fields[f]));
               output[f]->flat<double>()(i) = value;
             }
             break;
@@ -208,9 +208,10 @@ class DecodeCSVOp : public OpKernel {
         if (!quoted) {
           while (static_cast<size_t>(current_idx) < input.size() &&
                  input[current_idx] != delim_) {
-            OP_REQUIRES(ctx, (!use_quote_delim_ || input[current_idx] != '"') &&
-                                 input[current_idx] != '\n' &&
-                                 input[current_idx] != '\r',
+            OP_REQUIRES(ctx,
+                        (!use_quote_delim_ || input[current_idx] != '"') &&
+                            input[current_idx] != '\n' &&
+                            input[current_idx] != '\r',
                         errors::InvalidArgument(
                             "Unquoted fields cannot have quotes/CRLFs inside"));
             field += input[current_idx];
@@ -238,10 +239,11 @@ class DecodeCSVOp : public OpKernel {
           }
 
           OP_REQUIRES(
-              ctx, (static_cast<size_t>(current_idx) < input.size() &&
-                    input[current_idx] == '"' &&
-                    (static_cast<size_t>(current_idx) == input.size() - 1 ||
-                     input[current_idx + 1] == delim_)),
+              ctx,
+              (static_cast<size_t>(current_idx) < input.size() &&
+               input[current_idx] == '"' &&
+               (static_cast<size_t>(current_idx) == input.size() - 1 ||
+                input[current_idx + 1] == delim_)),
               errors::InvalidArgument("Quoted field has to end with quote "
                                       "followed by delim or end"));
 
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
index 44dcbf834c..912d04c153 100644
--- a/tensorflow/core/kernels/decode_image_op.cc
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -87,10 +87,11 @@ class DecodeImageOp : public OpKernel {
       channels_ = 3;
     } else {
       OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
-      OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 ||
-                               channels_ == 4,
-                  errors::InvalidArgument(
-                      "channels must be 0, 1, 3, or 4, got ", channels_));
+      OP_REQUIRES(
+          context,
+          channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4,
+          errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
+                                  channels_));
     }
     flags_.components = channels_;
 
@@ -114,8 +115,9 @@ class DecodeImageOp : public OpKernel {
 
     if (format_ == kJpgFormat) {
       OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
-      OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 ||
-                               flags_.ratio == 4 || flags_.ratio == 8,
+      OP_REQUIRES(context,
+                  flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 ||
+                      flags_.ratio == 8,
                   errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
                                           flags_.ratio));
       OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling",
@@ -130,8 +132,9 @@ class DecodeImageOp : public OpKernel {
       string dct_method;
       OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method));
       OP_REQUIRES(
-          context, (dct_method.empty() || dct_method == "INTEGER_FAST" ||
-                    dct_method == "INTEGER_ACCURATE"),
+          context,
+          (dct_method.empty() || dct_method == "INTEGER_FAST" ||
+           dct_method == "INTEGER_ACCURATE"),
           errors::InvalidArgument("dct_method must be one of "
                                   "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}"));
       if (dct_method == "INTEGER_FAST") {
@@ -157,9 +160,9 @@ class DecodeImageOp : public OpKernel {
         errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ",
                                 FileFormatString(magic, input)));
     OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
-                errors::InvalidArgument(FileFormatString(magic, input),
-                                        " contents are too large for int: ",
-                                        input.size()));
+                errors::InvalidArgument(
+                    FileFormatString(magic, input),
+                    " contents are too large for int: ", input.size()));
     OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8,
                 errors::InvalidArgument(FileFormatString(magic, input),
                                         " does not support uint16 output"));
@@ -212,9 +215,10 @@ class DecodeImageOp : public OpKernel {
             input.data(), input.size(), flags, nullptr /* nwarn */,
             [=, &output](int width, int height, int channels) -> uint8* {
               Status status(context->allocate_output(
-                  0, format_ == kGifFormat
-                         ? TensorShape({1, height, width, channels})
-                         : TensorShape({height, width, channels}),
+                  0,
+                  format_ == kGifFormat
+                      ? TensorShape({1, height, width, channels})
+                      : TensorShape({height, width, channels}),
                   &output));
               if (!status.ok()) {
                 VLOG(1) << status;
diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc
index 8e9b8a7e2e..829155fb31 100644
--- a/tensorflow/core/kernels/deep_conv2d.cc
+++ b/tensorflow/core/kernels/deep_conv2d.cc
@@ -120,9 +120,9 @@ bool CanUseDeepConv2D(int stride_rows, int stride_cols, int filter_rows,
 
   VLOG(2) << "CanUseDeepConv2D"
           << " deep_conv_cost: " << deep_conv_cost
-          << " direct_conv_cost: " << direct_conv_cost
-          << " deep_direct_ratio: " << (static_cast<float>(deep_conv_cost) /
-                                        static_cast<float>(direct_conv_cost))
+          << " direct_conv_cost: " << direct_conv_cost << " deep_direct_ratio: "
+          << (static_cast<float>(deep_conv_cost) /
+              static_cast<float>(direct_conv_cost))
           << " use_deep_conv: " << (deep_conv_cost < direct_conv_cost);
   return deep_conv_cost < direct_conv_cost;
 }
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
index 6d44a92fa3..6497c8f371 100644
--- a/tensorflow/core/kernels/dense_update_ops.cc
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -89,7 +89,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_KERNELS(type)                                     \
   REGISTER_KERNEL_BUILDER(                                         \
@@ -113,14 +113,14 @@ TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(type)                                \
-REGISTER_KERNEL_BUILDER(                                           \
-    Name("Assign").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
-    AssignOpT<SYCLDevice, type>);
+#define REGISTER_SYCL_KERNELS(type)                                 \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("Assign").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      AssignOpT<SYCLDevice, type>);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_KERNELS(type)                                        \
   REGISTER_KERNEL_BUILDER(                                            \
@@ -146,7 +146,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #endif  // end GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(type)                                         \
+#define REGISTER_SYCL_KERNELS(type)                                    \
   REGISTER_KERNEL_BUILDER(                                             \
       Name("AssignAdd").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
       DenseUpdateOp<SYCLDevice, type, DenseUpdateType::ADD>);          \
@@ -156,5 +156,5 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index 9347978d51..91a9587174 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -400,7 +400,7 @@ struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
 
     // Computes one shard of depthwise conv2d backprop input.
     auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
 
       const int64 input_image_size =
@@ -750,7 +750,7 @@ struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
 
     // Computes one shard of depthwise conv2d backprop filter.
     auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
       const int64 padded_out_depth_size =
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index a5fd07fbe1..c060b2e14d 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -308,10 +308,10 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
 
     // in_depth for input and filter must match.
     const int64 in_depth = GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is depth multiplier.
     const int32 depth_multiplier = filter.dim_size(3);
@@ -430,9 +430,10 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
 #endif
 
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(
-    Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
-    DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
+REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<Eigen::half>("T"),
+                        DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
 
 REGISTER_KERNEL_BUILDER(
     Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<float>("T"),
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 5493e33532..126b64f73d 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -17,12 +17,12 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "external/cub_archive/cub/util_ptx.cuh"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/depthwise_conv_op.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "external/cub_archive/cub/util_ptx.cuh"
 
 #if !defined(_MSC_VER)
 #define UNROLL _Pragma("unroll")
@@ -1021,7 +1021,7 @@ __global__ void __launch_bounds__(640, 2)
 
 // Device function to compute sub-warp sum reduction for a power-of-two group of
 // neighboring threads.
-template<int kWidth, typename T>
+template <int kWidth, typename T>
 __device__ __forceinline__ T WarpSumReduce(T val) {
   // support only power-of-two widths.
   assert(__popc(kWidth) == 1);
diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc
index 86fa7dce36..d228153d4c 100644
--- a/tensorflow/core/kernels/diag_op.cc
+++ b/tensorflow/core/kernels/diag_op.cc
@@ -29,8 +29,8 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -47,8 +47,9 @@ class DiagOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& diagonal = context->input(0);
     const int num_dims = diagonal.dims();
-    OP_REQUIRES(context, 0 != num_dims, errors::InvalidArgument(
-        "Input must be at least rank 1, got 0"));
+    OP_REQUIRES(
+        context, 0 != num_dims,
+        errors::InvalidArgument("Input must be at least rank 1, got 0"));
     TensorShape out_shape;
     for (int i = 0; i < num_dims; ++i) {
       out_shape.AddDim(diagonal.dim_size(i));
@@ -60,10 +61,9 @@ class DiagOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, out_shape, &output_tensor));
     functor::DiagFunctor<Device, T> diagFunc;
-    Status s = diagFunc(context,
-                        diagonal.NumElements(),
-                        diagonal.flat<T>().data(),
-                        output_tensor->flat<T>().data());
+    Status s =
+        diagFunc(context, diagonal.NumElements(), diagonal.flat<T>().data(),
+                 output_tensor->flat<T>().data());
     OP_REQUIRES_OK(context, s);
   }
 };
@@ -82,12 +82,12 @@ class DiagPartOp : public OpKernel {
                 errors::InvalidArgument("The rank of the tensor should be \
                                          even and positive, got shape ",
                                         tensor.shape().DebugString()));
-    for (int i = 0; i < out_dims; i++){
-      OP_REQUIRES(context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
-                  errors::InvalidArgument(
-                    "Invalid shape ", tensor.shape().DebugString(),
-                    ": dimensions ", i, " and ", i + out_dims, " do not match.")
-                  );
+    for (int i = 0; i < out_dims; i++) {
+      OP_REQUIRES(
+          context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
+          errors::InvalidArgument("Invalid shape ",
+                                  tensor.shape().DebugString(), ": dimensions ",
+                                  i, " and ", i + out_dims, " do not match."));
     }
 
     TensorShape out_shape;
@@ -96,13 +96,10 @@ class DiagPartOp : public OpKernel {
     }
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, out_shape, &output));
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
     functor::DiagPartFunctor<Device, T> diagPartFunc;
-    Status s = diagPartFunc(context,
-                            out_shape.num_elements(),
-                            tensor.flat<T>().data(),
-                            output->flat<T>().data());
+    Status s = diagPartFunc(context, out_shape.num_elements(),
+                            tensor.flat<T>().data(), output->flat<T>().data());
     OP_REQUIRES_OK(context, s);
   }
 };
@@ -129,9 +126,8 @@ class DiagPartOp : public OpKernel {
 namespace functor {
 template <typename T>
 struct DiagFunctor<CPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // This subprocess is responsible for writing values in index range
     // [start*size, limit*size)
     auto subDiag = [in, out, size](int64 start, int64 limit) {
@@ -143,17 +139,16 @@ struct DiagFunctor<CPUDevice, T> {
 
     // Here, 5 is a empirical factor of cost_per_unit.
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    Shard(worker_threads.num_threads, worker_threads.workers, size,
-        5 * size, subDiag);
+    Shard(worker_threads.num_threads, worker_threads.workers, size, 5 * size,
+          subDiag);
     return Status::OK();
   }
 };
 
 template <typename T>
 struct DiagPartFunctor<CPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // This subprocess is responsible for extracting values in index range
     // [start, limit)
     auto subDiagPart = [in, out, size](int64 start, int64 limit) {
@@ -164,14 +159,13 @@ struct DiagPartFunctor<CPUDevice, T> {
 
     // Here, 5 is a empirical factor of cost_per_unit.
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    Shard(worker_threads.num_threads, worker_threads.workers, size,
-        5, subDiagPart);
+    Shard(worker_threads.num_threads, worker_threads.workers, size, 5,
+          subDiagPart);
     return Status::OK();
   }
 };
 }  // namespace functor
 
-
 // Register the CPU kernels.
 #define REGISTER_DIAGOP(T)                                    \
   REGISTER_KERNEL_BUILDER(                                    \
@@ -250,6 +244,4 @@ TF_CALL_complex128(REGISTER_DIAGPARTOP_GPU);
 
 #endif  // GOOGLE_CUDA
 
-
 }  // namespace tensorflow
-
diff --git a/tensorflow/core/kernels/diag_op.h b/tensorflow/core/kernels/diag_op.h
index c6ca6a2047..baf16ddb4b 100644
--- a/tensorflow/core/kernels/diag_op.h
+++ b/tensorflow/core/kernels/diag_op.h
@@ -26,14 +26,14 @@ namespace functor {
 
 template <typename Device, typename T>
 struct DiagFunctor {
-  Status operator() (OpKernelContext* context, const int64 size,
-                     const T* in, T* out);
+  Status operator()(OpKernelContext* context, const int64 size, const T* in,
+                    T* out);
 };
 
 template <typename Device, typename T>
 struct DiagPartFunctor {
-  Status operator() (OpKernelContext* context, const int64 size,
-                     const T* in, T* out);
+  Status operator()(OpKernelContext* context, const int64 size, const T* in,
+                    T* out);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/diag_op_gpu.cu.cc b/tensorflow/core/kernels/diag_op_gpu.cu.cc
index d3c529d784..910f3093b2 100644
--- a/tensorflow/core/kernels/diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/diag_op_gpu.cu.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include <complex>
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/kernels/diag_op.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
 namespace functor {
@@ -28,10 +28,8 @@ namespace functor {
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T>
-__global__ void DiagCudaKernel(const int num_threads,
-                               const int64 size,
-                               const T* in,
-                               T* out) {
+__global__ void DiagCudaKernel(const int num_threads, const int64 size,
+                               const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     // Fill the diagonal elements or set to zero in other place.
     if (index % (1 + size) == 0) {
@@ -44,9 +42,8 @@ __global__ void DiagCudaKernel(const int num_threads,
 
 template <typename T>
 struct DiagFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -56,25 +53,22 @@ struct DiagFunctor<GPUDevice, T> {
     // so this may overflow for `size*size` in extreme cases,
     // here is checking the multiplication overflow for integer.
     if (size && (int(size * size) / size) != size) {
-      return errors::Internal(
-          "DiagOp got input size too large.");
+      return errors::Internal("DiagOp got input size too large.");
     }
     int virtual_thread_count = int(size * size);
 
     // Launch the GPU kernel.
     const GPUDevice& device = context->eigen_device<GPUDevice>();
-    CudaLaunchConfig diag_config = GetCudaLaunchConfig(
-        virtual_thread_count, device);
-    DiagCudaKernel<<<diag_config.block_count,
-                     diag_config.thread_per_block,
-                     0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    CudaLaunchConfig diag_config =
+        GetCudaLaunchConfig(virtual_thread_count, device);
+    DiagCudaKernel<<<diag_config.block_count, diag_config.thread_per_block, 0,
+                     device.stream()>>>(diag_config.virtual_thread_count, size,
+                                        in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
       return errors::Internal(
-          "Could not launch DiagOp kernel: ",
-          cudaGetErrorString(err), ".");
+          "Could not launch DiagOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
@@ -87,12 +81,9 @@ template struct DiagFunctor<GPUDevice, int64>;
 template struct DiagFunctor<GPUDevice, complex64>;
 template struct DiagFunctor<GPUDevice, complex128>;
 
-
 template <typename T>
-__global__ void DiagPartCudaKernel(const int num_threads,
-                                   const int64 size,
-                                   const T* in,
-                                   T* out) {
+__global__ void DiagPartCudaKernel(const int num_threads, const int64 size,
+                                   const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     out[index] = in[(1 + size) * index];
   }
@@ -100,9 +91,8 @@ __global__ void DiagPartCudaKernel(const int num_threads,
 
 template <typename T>
 struct DiagPartFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -111,16 +101,14 @@ struct DiagPartFunctor<GPUDevice, T> {
 
     // Extract the diagonal elements.
     CudaLaunchConfig diag_config = GetCudaLaunchConfig(size, device);
-    DiagPartCudaKernel<<<diag_config.block_count,
-                     diag_config.thread_per_block,
-                     0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    DiagPartCudaKernel<<<diag_config.block_count, diag_config.thread_per_block,
+                         0, device.stream()>>>(diag_config.virtual_thread_count,
+                                               size, in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
       return errors::Internal(
-          "Could not launch DiagPartOp kernel: ",
-          cudaGetErrorString(err), ".");
+          "Could not launch DiagPartOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/diag_op_test.cc b/tensorflow/core/kernels/diag_op_test.cc
index 2d1417854c..a708e53dd0 100644
--- a/tensorflow/core/kernels/diag_op_test.cc
+++ b/tensorflow/core/kernels/diag_op_test.cc
@@ -30,8 +30,8 @@ static Graph* Diag(int n, DataType type) {
   return g;
 }
 
-#define BM_DiagDev(N, T, TFTYPE, DEVICE)                           \
-  static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) {   \
+#define BM_DiagDev(N, T, TFTYPE, DEVICE)                        \
+  static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) {  \
     testing::UseRealTime();                                     \
     testing::ItemsProcessed(static_cast<int64>(iters) * N * N); \
     test::Benchmark(#DEVICE, Diag<T>(N, TFTYPE)).Run(iters);    \
@@ -51,4 +51,3 @@ BM_Diag(128);
 BM_Diag(512);
 
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/dilation_ops.cc b/tensorflow/core/kernels/dilation_ops.cc
index 6f5c0e9156..441a63465c 100644
--- a/tensorflow/core/kernels/dilation_ops.cc
+++ b/tensorflow/core/kernels/dilation_ops.cc
@@ -91,10 +91,10 @@ void ParseSizes(OpKernelContext* context, const std::vector<int32>& strides,
                                       filter.shape().DebugString()));
   const int filter_rows = filter.dim_size(0);
   const int filter_cols = filter.dim_size(1);
-  OP_REQUIRES(
-      context, depth == filter.dim_size(2),
-      errors::InvalidArgument("input and filter must have the same depth: ",
-                              depth, " vs ", filter.dim_size(2)));
+  OP_REQUIRES(context, depth == filter.dim_size(2),
+              errors::InvalidArgument(
+                  "input and filter must have the same depth: ", depth, " vs ",
+                  filter.dim_size(2)));
 
   // Effective filter size, after introducing rate - 1 zeros between each
   // non-zero filter element.
@@ -234,10 +234,11 @@ class DilationBackpropInputOp : public OpKernel {
     // [ batch, out_rows, out_cols, depth ]
     const int batch = input.dim_size(0);
     const int depth = input.dim_size(3);
-    OP_REQUIRES(context, batch == out_backprop.dim_size(0) &&
-                             out_rows == out_backprop.dim_size(1) &&
-                             out_cols == out_backprop.dim_size(2) &&
-                             depth == out_backprop.dim_size(3),
+    OP_REQUIRES(context,
+                batch == out_backprop.dim_size(0) &&
+                    out_rows == out_backprop.dim_size(1) &&
+                    out_cols == out_backprop.dim_size(2) &&
+                    depth == out_backprop.dim_size(3),
                 errors::InvalidArgument("out_backprop has incompatible size."));
 
     // The computed in_backprop has the same dimensions as the input:
@@ -353,10 +354,11 @@ class DilationBackpropFilterOp : public OpKernel {
     // [ batch, out_rows, out_cols, depth ]
     const int batch = input.dim_size(0);
     const int depth = input.dim_size(3);
-    OP_REQUIRES(context, batch == out_backprop.dim_size(0) &&
-                             out_rows == out_backprop.dim_size(1) &&
-                             out_cols == out_backprop.dim_size(2) &&
-                             depth == out_backprop.dim_size(3),
+    OP_REQUIRES(context,
+                batch == out_backprop.dim_size(0) &&
+                    out_rows == out_backprop.dim_size(1) &&
+                    out_cols == out_backprop.dim_size(2) &&
+                    depth == out_backprop.dim_size(3),
                 errors::InvalidArgument("out_backprop has incompatible size."));
 
     // The computed filter_backprop has the same dimensions as the filter:
diff --git a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
index ac0775fbef..c63806a7f6 100644
--- a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
@@ -61,9 +61,8 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr,
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
@@ -106,9 +105,8 @@ __global__ void DilationBackpropInputKernel(
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
@@ -156,9 +154,8 @@ __global__ void DilationBackpropFilterKernel(
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
diff --git a/tensorflow/core/kernels/draw_bounding_box_op.cc b/tensorflow/core/kernels/draw_bounding_box_op.cc
index a8818b7385..b5d5b880bb 100644
--- a/tensorflow/core/kernels/draw_bounding_box_op.cc
+++ b/tensorflow/core/kernels/draw_bounding_box_op.cc
@@ -29,8 +29,7 @@ template <class T>
 class DrawBoundingBoxesOp : public OpKernel {
  public:
   explicit DrawBoundingBoxesOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& images = context->input(0);
@@ -94,35 +93,28 @@ class DrawBoundingBoxesOp : public OpKernel {
         int64 color_index = bb % color_table_length;
         const int64 min_box_row =
             static_cast<float>(tboxes(b, bb, 0)) * (height - 1);
-        const int64 min_box_row_clamp =
-            std::max<int64>(min_box_row, 0);
+        const int64 min_box_row_clamp = std::max<int64>(min_box_row, 0);
         const int64 max_box_row =
             static_cast<float>(tboxes(b, bb, 2)) * (height - 1);
         const int64 max_box_row_clamp =
             std::min<int64>(max_box_row, height - 1);
         const int64 min_box_col =
             static_cast<float>(tboxes(b, bb, 1)) * (width - 1);
-        const int64 min_box_col_clamp =
-            std::max<int64>(min_box_col, 0);
+        const int64 min_box_col_clamp = std::max<int64>(min_box_col, 0);
         const int64 max_box_col =
             static_cast<float>(tboxes(b, bb, 3)) * (width - 1);
-        const int64 max_box_col_clamp =
-            std::min<int64>(max_box_col, width - 1);
+        const int64 max_box_col_clamp = std::min<int64>(max_box_col, width - 1);
 
         if (min_box_row > max_box_row || min_box_col > max_box_col) {
-          LOG(WARNING) << "Bounding box (" << min_box_row
-                       << "," << min_box_col
-                       << "," << max_box_row
-                       << "," << max_box_col
+          LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                       << "," << max_box_row << "," << max_box_col
                        << ") is inverted and will not be drawn.";
           continue;
         }
-        if (min_box_row >= height || max_box_row < 0 ||
-            min_box_col >= width || max_box_col < 0) {
-          LOG(WARNING) << "Bounding box (" << min_box_row
-                       << "," << min_box_col
-                       << "," << max_box_row
-                       << "," << max_box_col
+        if (min_box_row >= height || max_box_row < 0 || min_box_col >= width ||
+            max_box_col < 0) {
+          LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                       << "," << max_box_row << "," << max_box_col
                        << ") is completely outside the image"
                        << " and will not be drawn.";
           continue;
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
index 861e16b2fd..3c988db5e6 100644
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -103,7 +103,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
       // Walk through data and copy the data to the appropriate output tensor
       const auto data_flat = data->flat<T>();
       std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
-                                   Eigen::Aligned> > out_vec;
+                                   Eigen::Aligned> >
+          out_vec;
       out_vec.reserve(num_partitions_);
       for (int p = 0; p < num_partitions_; p++) {
         out_vec.push_back(outputs[p]->vec<T>());
@@ -124,7 +125,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
     } else {
       // If data has extra dimensions, use Eigen slices
       std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
-                                   Eigen::Aligned> > out_flat;
+                                   Eigen::Aligned> >
+          out_flat;
       out_flat.reserve(num_partitions_);
       for (int p = 0; p < num_partitions_; p++) {
         out_flat.push_back(outputs[p]->flat_outer_dims<T>());
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index 9bb58b13f3..9dfeccff0e 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -79,9 +79,9 @@ template <typename T>
 void RangeInit(const GPUDevice& d, const T start, const T delta,
                const int32 size, typename TTypes<T>::Flat out) {
   CudaLaunchConfig config = GetCudaLaunchConfig(size, d);
-  RangeInitKernel<
-      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      start, delta, size, out.data());
+  RangeInitKernel<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          start, delta, size, out.data());
 }
 
 // Given *num_runs pairs (key, value), this function moves the value
@@ -103,11 +103,10 @@ void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
                       T* out, int64 gather_dim_size, int64 indices_size,
                       int64 slice_size, int64 out_size) {
   CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
-  GatherOpKernel<
-      T, int32,
-      true><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      params, indices, out, gather_dim_size, indices_size, slice_size,
-      out_size);
+  GatherOpKernel<T, int32, true>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          params, indices, out, gather_dim_size, indices_size, slice_size,
+          out_size);
 }
 
 struct IdentityOp {
@@ -231,10 +230,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
 
     OP_REQUIRES_ASYNC(
         c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()),
-        errors::InvalidArgument("data.shape must start with partitions.shape, ",
-                                "got data.shape = ", data.shape().DebugString(),
-                                ", partitions.shape = ",
-                                partitions.shape().DebugString()),
+        errors::InvalidArgument(
+            "data.shape must start with partitions.shape, ",
+            "got data.shape = ", data.shape().DebugString(),
+            ", partitions.shape = ", partitions.shape().DebugString()),
         done);
 
     Tensor partition_count;
@@ -245,8 +244,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
       AllocatorAttributes alloc_attr;
       alloc_attr.set_on_host(true);
       OP_REQUIRES_OK_ASYNC(
-          c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                              &partition_count, alloc_attr),
+          c,
+          c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                           &partition_count, alloc_attr),
           done);
       auto e_part_count = partition_count.flat<int32>();
       for (int i = 0; i < num_partitions_; i++) e_part_count(i) = 0;
@@ -259,8 +259,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
 
     // Prepare for counting.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                            &partition_count),
+        c,
+        c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                         &partition_count),
         done);
     Tensor indices_out;
     // Count how many times each partition index occurs.
@@ -280,8 +281,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     alloc_attr.set_on_host(true);
     alloc_attr.set_gpu_compatible(true);
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(partition_count.dtype(), partition_count.shape(),
-                            &cpu_tensor, alloc_attr),
+        c,
+        c->allocate_temp(partition_count.dtype(), partition_count.shape(),
+                         &cpu_tensor, alloc_attr),
         done);
     perftools::gputools::DeviceMemoryBase wrapped(
         partition_count.flat<int32>().data(), num_partitions_ * sizeof(int32));
@@ -340,9 +342,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
         indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream);
     // Allocate temporary storage.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(
-               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-               &cub_temp_storage),
+        c,
+        c->allocate_temp(DT_INT8,
+                         TensorShape({static_cast<int64>(temp_storage_bytes)}),
+                         &cub_temp_storage),
         done);
     // Radix-sort the partition information.
     cub::DeviceRadixSort::SortPairs(
@@ -376,8 +379,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     zero_functor(device, partition_count->flat<int32>());
     // Allocate memory for aggregates_out.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                            &aggregates_out),
+        c,
+        c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                         &aggregates_out),
         done);
     // Obtain the pointers to inner buffers.
     int32* keys_in_ptr = partitions_out.flat<int32>().data();
@@ -408,9 +412,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
                                    num_runs_ptr, reduction_op, N, cu_stream);
     // Allocate temporary storage.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(
-               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-               &cub_temp_storage),
+        c,
+        c->allocate_temp(DT_INT8,
+                         TensorShape({static_cast<int64>(temp_storage_bytes)}),
+                         &cub_temp_storage),
         done);
     // Run reduce-by-key. The effect is that we count how many times
     // each index appears in partitions. The distinct indices are stored
diff --git a/tensorflow/core/kernels/eigen_activations.h b/tensorflow/core/kernels/eigen_activations.h
index 99b4b2abe6..302033e47c 100644
--- a/tensorflow/core/kernels/eigen_activations.h
+++ b/tensorflow/core/kernels/eigen_activations.h
@@ -21,13 +21,13 @@ limitations under the License.
 namespace Eigen {
 
 /** scalar_sigmoid_fast_derivative_op
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to compute the fast derivative of a sigmoid
-  *
-  * Input should be the backpropagated gradient.
-  *
-  * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative()
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to compute the fast derivative of a sigmoid
+ *
+ * Input should be the backpropagated gradient.
+ *
+ * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative()
+ */
 template <typename T>
 struct scalar_sigmoid_fast_derivative_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op)
@@ -55,13 +55,13 @@ struct functor_traits<scalar_sigmoid_fast_derivative_op<T> > {
 }  // namespace internal
 
 /** scalar_tanh_fast_derivative_op
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to compute the fast derivative of a tanh
-  *
-  * Input should be the backpropagated gradient.
-  *
-  * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative()
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to compute the fast derivative of a tanh
+ *
+ * Input should be the backpropagated gradient.
+ *
+ * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative()
+ */
 template <typename T>
 struct scalar_tanh_fast_derivative_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op)
@@ -89,11 +89,11 @@ struct functor_traits<scalar_tanh_fast_derivative_op<T> > {
 }  // namespace internal
 
 /**
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to clip the magnitude of the first scalar.
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::Clip
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to clip the magnitude of the first scalar.
+ *
+ * \sa class CwiseBinaryOp, MatrixBase::Clip
+ */
 template <typename Scalar>
 struct scalar_clip_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op)
diff --git a/tensorflow/core/kernels/eigen_activations_test.cc b/tensorflow/core/kernels/eigen_activations_test.cc
index 907233103d..34952f5abb 100644
--- a/tensorflow/core/kernels/eigen_activations_test.cc
+++ b/tensorflow/core/kernels/eigen_activations_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest, SigmoidFastDerivative) {
   const ptrdiff_t depth = 3;
diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h
index 3a94b8c993..4d86f9deb9 100644
--- a/tensorflow/core/kernels/eigen_attention.h
+++ b/tensorflow/core/kernels/eigen_attention.h
@@ -21,35 +21,47 @@ limitations under the License.
 namespace Eigen {
 
 /** ExtractGlimpses
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Extract glimpses from an input tensor.
-  *
-  * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch).
-  * The width and height parameters specify the extension of the returned glimpses.
-  * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension.
-  * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension.
-  * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center.
-  *
-  * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch).
-  * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size.
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Extract glimpses from an input tensor.
+ *
+ * The input parameter is expected to be a col-major tensor with a rank of 4
+ * (depth, x, y, and batch). The width and height parameters specify the
+ * extension of the returned glimpses. The offsets parameter specifies the x, y
+ * locations of the center of the glimpses relative to the center of the input
+ * image. The vector is expected to contain one IndexPair for each image in the
+ * batch dimension. The normalized boolean indicates if incoming coordinates are
+ * normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each
+ * height and width dimension. The centered boolean indicates if incoming
+ * coordinates are centered relative to the image, in which case -1.0 and 1.0
+ * correspond to minimum and maximum of each dimension while 0.0 corresponds to
+ * the center.
+ *
+ * The result can be assigned to a tensor of rank equal to that of the input.
+ * The result will be laid out in col-major order (depth, x, y, batch). The
+ * dimensions of the result will be equal to the dimensions of the input except
+ * for width and height which will be equal to the requested glimpse size.
+ */
 namespace {
 template <typename Index>
 struct GlimpseExtractionOp {
   GlimpseExtractionOp(const Index width, const Index height,
                       const std::vector<IndexPair<float> >& offsets,
-                      const bool normalized,
-                      const bool centered,
-                      const bool uniform_noise) :
-      width_(width), height_(height), offsets_(offsets),
-      normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { }
+                      const bool normalized, const bool centered,
+                      const bool uniform_noise)
+      : width_(width),
+        height_(height),
+        offsets_(offsets),
+        normalized_(normalized),
+        centered_(centered),
+        uniform_noise_(uniform_noise) {}
 
   template <typename Input>
   DSizes<Index, 4> dimensions(const Input& input) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
 
     DSizes<Index, 4> dims = in.dimensions();
@@ -62,12 +74,12 @@ struct GlimpseExtractionOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  EIGEN_DEVICE_FUNC
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  EIGEN_DEVICE_FUNC void eval(const Input& input, Output& output,
+                              const Device& device) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
     const Index num_channels = in.dimension(0);
     const Index input_width = in.dimension(1);
@@ -97,8 +109,8 @@ struct GlimpseExtractionOp {
       x -= width_ / 2.0f;
       y -= height_ / 2.0f;
 
-      const Index offset_x = (Index) x;
-      const Index offset_y = (Index) y;
+      const Index offset_x = (Index)x;
+      const Index offset_y = (Index)y;
       Index glimpse_width = width_;
       Index glimpse_height = height_;
       bool partial_overlap = false;
@@ -135,7 +147,7 @@ struct GlimpseExtractionOp {
         if (uniform_noise_) {
           // Initialize the glimpse with uniform noise.
           typedef typename internal::remove_const<
-            typename internal::traits<Input>::Scalar>::type Scalar;
+              typename internal::traits<Input>::Scalar>::type Scalar;
           TensorFixedSize<Scalar, Sizes<> > mini;
           mini.device(device) = input.template chip<3>(i).minimum();
           TensorFixedSize<float, Sizes<> > range;
@@ -215,21 +227,22 @@ struct GlimpseExtractionOp {
   const bool centered_;
   const bool uniform_noise_;
 };
-}
-
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const GlimpseExtractionOp<typename internal::traits<Input>::Index>, const Input>
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<
+    const GlimpseExtractionOp<typename internal::traits<Input>::Index>,
+    const Input>
 ExtractGlimpses(const Input& input,
                 const typename internal::traits<Input>::Index width,
                 const typename internal::traits<Input>::Index height,
                 const std::vector<IndexPair<float> >& offsets,
                 const bool normalized = true, const bool centered = true,
-                const bool uniform_noise = true)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+                const bool uniform_noise = true) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   typedef typename internal::traits<Input>::Index Index;
   const GlimpseExtractionOp<Index> op(width, height, offsets, normalized,
@@ -237,6 +250,6 @@ ExtractGlimpses(const Input& input,
   return input.customOp(op);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif  // TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
diff --git a/tensorflow/core/kernels/eigen_attention_test.cc b/tensorflow/core/kernels/eigen_attention_test.cc
index 3a2eeb0595..08f6187718 100644
--- a/tensorflow/core/kernels/eigen_attention_test.cc
+++ b/tensorflow/core/kernels/eigen_attention_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenAttentionTest, Simple) {
   const ptrdiff_t depth = 3;
diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
index aec7697810..099696105b 100644
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
@@ -21,29 +21,29 @@ limitations under the License.
 namespace Eigen {
 
 /** SpatialConvolutionBackwardInput
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the input of a 2D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 3 or
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Computes the backprop for the input of a 2D convolution.
+ *
+ * The output_backward parameter is expected to be a tensor with a rank of 3 or
  * more (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The output_backward and the kernel must both be in col-major layout. The
+ * The output_backward and the kernel must both be in col-major layout. The
  * result will also be in col-major layout.
-  *
-  * If row_in_stride, col_in_stride > 1, then applies convolution with holes
+ *
+ * If row_in_stride, col_in_stride > 1, then applies convolution with holes
  * (aka atrous convolution), sampling every row_in_stride, col_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * output_backward. The dimensions of the result will be filters, height, width
  * (and others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 #ifdef EIGEN_HAS_INDEX_LIST
 typedef IndexList<type2index<0>, type2index<0>, type2index<1>, type2index<1> >
     ReverseColMajor;
@@ -293,29 +293,29 @@ SpatialConvolutionBackwardInput(
 }
 
 /** SpatialConvolutionBackwardKernel
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the filter of a 2D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 3 or
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Computes the backprop for the filter of a 2D convolution.
+ *
+ * The output_backward parameter is expected to be a tensor with a rank of 3 or
  * more (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The output_backward and the kernel must both be in col-major layout. The
+ * The output_backward and the kernel must both be in col-major layout. The
  * result will also be in col-major layout.
-  *
-  * If row_in_stride, col_stride > 1, then applies convolution with holes (aka
+ *
+ * If row_in_stride, col_stride > 1, then applies convolution with holes (aka
  * atrous convolution), sampling every row_in_stride, col_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * output_backward. The dimensions of the result will be filters, height, width
  * (and others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 
 template <typename OutputBackward, typename Input>
 EIGEN_ALWAYS_INLINE static const typename internal::conditional<
diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
index 1758067829..2229ec9659 100644
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
@@ -25,7 +25,7 @@ void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
 static int ceil_div(int a, int b) { return (a + b - 1) / b; }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest,
      test_simple_spatial_convolution_backward_input_valid) {
diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 972036833f..896c995761 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -309,10 +309,10 @@ struct AvgPoolMeanReducer {
   _mm512_castsi512_ps( \
       _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1))
 
-// The ternarylogic function immediate determines the values in the result
-// In the case below, 0xd8 implies (false_mask) ? (b) : (a)
-// For details, refer to the vpternlogd instruction table at
-// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf
+  // The ternarylogic function immediate determines the values in the result
+  // In the case below, 0xd8 implies (false_mask) ? (b) : (a)
+  // For details, refer to the vpternlogd instruction table at
+  // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf
 
 #define psel(a, b, false_mask)                        \
   _mm512_castsi512_ps(_mm512_ternarylogic_epi32(      \
diff --git a/tensorflow/core/kernels/eigen_pooling_test.cc b/tensorflow/core/kernels/eigen_pooling_test.cc
index 9383972b9f..47b6665e68 100644
--- a/tensorflow/core/kernels/eigen_pooling_test.cc
+++ b/tensorflow/core/kernels/eigen_pooling_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenPoolingTest, Simple) {
   const int depth = 10;
diff --git a/tensorflow/core/kernels/eigen_softmax.h b/tensorflow/core/kernels/eigen_softmax.h
index a2930a726f..12148c54b3 100644
--- a/tensorflow/core/kernels/eigen_softmax.h
+++ b/tensorflow/core/kernels/eigen_softmax.h
@@ -21,19 +21,21 @@ limitations under the License.
 namespace Eigen {
 
 /** SoftMax
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a softmax
-  *
-  * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other).
-  *
-  * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order.
-  *
-*/
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a softmax
+ *
+ * The input parameter is expected to be a col-major tensor with a rank of 2
+ * (depth and other).
+ *
+ * The result can be assigned to a tensor of rank and dimensions equal to that
+ * of the input. The result will be laid out in col-major order.
+ *
+ */
 
 namespace {
 struct SoftmaxOp {
-  SoftmaxOp(const float beta) : beta_(beta) { }
+  SoftmaxOp(const float beta) : beta_(beta) {}
 
   template <typename Input>
   typename Input::Dimensions dimensions(const Input& input) const {
@@ -41,8 +43,7 @@ struct SoftmaxOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  void eval(const Input& input, Output& output, const Device& device) const {
 #if !defined(EIGEN_HAS_INDEX_LIST)
     // nvcc doesn't support cxx11
     Eigen::array<typename internal::traits<Input>::Index, 1> depth_dim;
@@ -56,35 +57,43 @@ struct SoftmaxOp {
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
-    Eigen::IndexList<Eigen::type2index<0>> depth_dim;
-    Eigen::IndexList<int, Eigen::type2index<1>> bcast;
+    Eigen::IndexList<Eigen::type2index<0> > depth_dim;
+    Eigen::IndexList<int, Eigen::type2index<1> > bcast;
     bcast.set(0, dimensions(input)[0]);
-    Eigen::IndexList<Eigen::type2index<1>, typename internal::traits<Input>::Index> dims2d;
+    Eigen::IndexList<Eigen::type2index<1>,
+                     typename internal::traits<Input>::Index>
+        dims2d;
     dims2d.set(1, dimensions(input)[1]);
 #endif
 
-    output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp();
-    output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+    output.device(device) =
+        ((input -
+          input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) *
+         beta_)
+            .exp();
+    output.device(device) =
+        output /
+        (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
   }
 
  private:
   const float beta_;
 };
-}
-
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const SoftmaxOp, const Input>
-SoftMax(const Input& input, const float beta)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<const SoftmaxOp,
+                                                     const Input>
+SoftMax(const Input& input, const float beta) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   const SoftmaxOp op(beta);
   return input.customOp(op);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
diff --git a/tensorflow/core/kernels/eigen_softmax_test.cc b/tensorflow/core/kernels/eigen_softmax_test.cc
index ba681d68ab..7f985d7136 100644
--- a/tensorflow/core/kernels/eigen_softmax_test.cc
+++ b/tensorflow/core/kernels/eigen_softmax_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenSoftmaxTest, Simple) {
   const int depth = 1024;
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 2fe64cd72a..1acbe3a658 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -877,29 +877,29 @@ struct gemm_pack_rhs<
 }  // end namespace internal
 
 /** SpatialConvolution
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a 2D convolution over a multichannel input image.
-  *
-  * The input parameter is expected to be a tensor with a rank of 3 or more
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a 2D convolution over a multichannel input image.
+ *
+ * The input parameter is expected to be a tensor with a rank of 3 or more
  * (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The input and the kernel must both be in col-major layout. The result will
+ * The input and the kernel must both be in col-major layout. The result will
  * also be in col-major layout.
-  *
-  * If col_in_stride, row_in_stride > 1, then applies convolution with holes
+ *
+ * If col_in_stride, row_in_stride > 1, then applies convolution with holes
  * (aka atrous convolution), sampling every col_in_stride, row_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * input. The dimensions of the result will be filters, height, width (and
  * others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 template <typename Input, typename Kernel>
 EIGEN_DEVICE_FUNC
     EIGEN_ALWAYS_INLINE static const typename internal::conditional<
@@ -993,7 +993,7 @@ EIGEN_DEVICE_FUNC
     default:
       // Initialize unused variables to avoid a compiler warning
       out_height = 0;
-      out_width  = 0;
+      out_width = 0;
       eigen_assert(false && "unexpected padding");
   }
 
diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc
index 4fcae25aa6..1a5b0f2b67 100644
--- a/tensorflow/core/kernels/encode_jpeg_op.cc
+++ b/tensorflow/core/kernels/encode_jpeg_op.cc
@@ -80,10 +80,11 @@ class EncodeJpegOp : public OpKernel {
                 errors::InvalidArgument("image must be 3-dimensional",
                                         image.shape().DebugString()));
 
-    OP_REQUIRES(context, FastBoundsCheck(image.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument(
-                    "Cannot encode images with >= max int32 elements"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(image.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument(
+            "Cannot encode images with >= max int32 elements"));
 
     const int32 dim_size0 = static_cast<int32>(image.dim_size(0));
     const int32 dim_size1 = static_cast<int32>(image.dim_size(1));
@@ -100,9 +101,10 @@ class EncodeJpegOp : public OpKernel {
       } else if (channels == 3) {
         adjusted_flags.format = jpeg::FORMAT_RGB;
       } else {
-        OP_REQUIRES(context, false, errors::InvalidArgument(
-                                        "image must have 1 or 3 channels, got ",
-                                        image.shape().DebugString()));
+        OP_REQUIRES(
+            context, false,
+            errors::InvalidArgument("image must have 1 or 3 channels, got ",
+                                    image.shape().DebugString()));
       }
     } else {
       if (flags_.format == jpeg::FORMAT_GRAYSCALE) {
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc
index 268a059275..83cd0e9b47 100644
--- a/tensorflow/core/kernels/example_parsing_ops.cc
+++ b/tensorflow/core/kernels/example_parsing_ops.cc
@@ -346,8 +346,9 @@ class SingleSequenceExampleParserOp : public OpKernel {
           feature_list_sparse_keys[di].scalar<string>()();
     }
     OP_REQUIRES(
-        ctx, TensorShapeUtils::IsVector(
-                 feature_list_dense_missing_assumed_empty->shape()),
+        ctx,
+        TensorShapeUtils::IsVector(
+            feature_list_dense_missing_assumed_empty->shape()),
         errors::InvalidArgument(
             "Expected feature_list_dense_missing_assumed_empty ",
             "to be a vector, got shape: ",
@@ -386,12 +387,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
       required[d] = (def_value.NumElements() == 0);  // No default provided.
 
       if (def_value.NumElements() > 0) {
-        OP_REQUIRES(
-            ctx, def_value.shape() == attrs_.context_dense_shapes[d],
-            errors::InvalidArgument(
-                "def_value[", d, "].shape() == ",
-                def_value.shape().DebugString(), " != context_dense_shapes_[",
-                d, "] == ", attrs_.context_dense_shapes[d].DebugString()));
+        OP_REQUIRES(ctx, def_value.shape() == attrs_.context_dense_shapes[d],
+                    errors::InvalidArgument(
+                        "def_value[", d,
+                        "].shape() == ", def_value.shape().DebugString(),
+                        " != context_dense_shapes_[", d,
+                        "] == ", attrs_.context_dense_shapes[d].DebugString()));
         OP_REQUIRES(
             ctx, def_value.dtype() == attrs_.context_dense_types[d],
             errors::InvalidArgument(
@@ -576,12 +577,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
         const Feature& f = fl.feature(t);
         bool types_match;
         OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
-        OP_REQUIRES(
-            ctx, types_match,
-            errors::InvalidArgument(
-                "Name: ", name, ", Feature list: ", key, ", Index: ", t,
-                ".  Data types don't match. ", "Expected type: ",
-                DataTypeString(dtype), "  Feature is: ", ProtoDebugString(f)));
+        OP_REQUIRES(ctx, types_match,
+                    errors::InvalidArgument(
+                        "Name: ", name, ", Feature list: ", key, ", Index: ", t,
+                        ".  Data types don't match. ",
+                        "Expected type: ", DataTypeString(dtype),
+                        "  Feature is: ", ProtoDebugString(f)));
         OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f,
                                              feature_list_dense_values[d]));
       }
diff --git a/tensorflow/core/kernels/fact_op.cc b/tensorflow/core/kernels/fact_op.cc
index 4fbf76d2d0..4a1aa433bc 100644
--- a/tensorflow/core/kernels/fact_op.cc
+++ b/tensorflow/core/kernels/fact_op.cc
@@ -122,13 +122,9 @@ static string D(const char* s) {
   return ret;
 }
 
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("Yoxmos").c_str()),
-                        FactOpKernel2);
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("yoxmos").c_str()),
-                        FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("Yoxmos").c_str()), FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("yoxmos").c_str()), FactOpKernel2);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc
index 5953db1476..af3a42135d 100644
--- a/tensorflow/core/kernels/fake_quant_ops_test.cc
+++ b/tensorflow/core/kernels/fake_quant_ops_test.cc
@@ -378,9 +378,8 @@ TEST_F(QuantOpsTest, WithArgsGradient_RegularRange) {
   Tensor* output = GetOutput(0);
   auto input_flat = GetInput(0).flat<float>();
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {0.0f, input_flat(1), input_flat(2),
-                     input_flat(3), input_flat(4), 0.0f});
+  FillValues<float>(&expected, {0.0f, input_flat(1), input_flat(2),
+                                input_flat(3), input_flat(4), 0.0f});
   ExpectClose(expected, *output);
 }
 
@@ -2167,21 +2166,19 @@ TEST_F(QuantOpsTest,
   Tensor* output_bprop_wrt_input = GetOutput(0);
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(&expected_bprop_wrt_input,
-                    {0.0f, grad_flat(1), grad_flat(2),
-                     grad_flat(3), grad_flat(4), 0.0f});
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
   Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_min,
-                    {grad_flat(0), 0.0f, 0.0f});
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
   ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
 
   Tensor* output_bprop_wrt_max = GetOutput(2);
   Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_max,
-                    {0.0f, 0.0f, grad_flat(5)});
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
@@ -2215,21 +2212,19 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedUp_4Bits_NarrowRange) {
   Tensor* output_bprop_wrt_input = GetOutput(0);
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(&expected_bprop_wrt_input,
-                    {0.0f, grad_flat(1), grad_flat(2),
-                     grad_flat(3), grad_flat(4), 0.0f});
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
   Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_min,
-                    {grad_flat(0), 0.0f, 0.0f});
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
   ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
 
   Tensor* output_bprop_wrt_max = GetOutput(2);
   Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_max,
-                    {0.0f, 0.0f, grad_flat(5)});
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
@@ -2270,14 +2265,13 @@ TEST_F(QuantOpsTest,
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
                                   TensorShape({1, 2, 3, 4}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(
-      &expected_bprop_wrt_input,
-      {0.0f, grad_flat(1), grad_flat(2), 0.0f,
-       0.0f, grad_flat(5), grad_flat(6), 0.0f,
-       0.0f, grad_flat(9), grad_flat(10), 0.0f,
-       0.0f, grad_flat(13), grad_flat(14), 0.0f,
-       0.0f, grad_flat(17), grad_flat(18), 0.0f,
-       0.0f, grad_flat(21), grad_flat(22), 0.0f});
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1),  grad_flat(2),  0.0f,
+                     0.0f, grad_flat(5),  grad_flat(6),  0.0f,
+                     0.0f, grad_flat(9),  grad_flat(10), 0.0f,
+                     0.0f, grad_flat(13), grad_flat(14), 0.0f,
+                     0.0f, grad_flat(17), grad_flat(18), 0.0f,
+                     0.0f, grad_flat(21), grad_flat(22), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc
index 82ec879119..479f7be4b5 100644
--- a/tensorflow/core/kernels/fifo_queue.cc
+++ b/tensorflow/core/kernels/fifo_queue.cc
@@ -255,97 +255,96 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
       // TODO(josh11b): This makes two copies of callback, avoid this if possible.
       dequeue_attempts_.emplace_back(
           num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
-          [callback, allow_small_batch, this](Attempt* attempt)
-              EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                int64 queue_size = queues_[0].size();
+          [callback, allow_small_batch,
+           this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+            int64 queue_size = queues_[0].size();
 
-                if (closed_ && queue_size < attempt->elements_requested) {
-                  // If we don't have enough for a full dequeue, we have
-                  // to reset the attempt tuple.
-                  if (!attempt->tuple.empty()) {
-                    // Restore already-dequeued elements to the front of the
-                    // queue.
-                    for (int64 i = attempt->tuple[0].dim_size(0) -
-                                   attempt->elements_requested - 1;
-                         i >= 0; --i) {
-                      for (int j = 0; j < num_components(); ++j) {
-                        PersistentTensor element;
-                        Status s = GetElementComponentFromBatch(
-                            attempt->tuple, i, j, attempt->context, &element);
-                        if (!s.ok()) {
-                          attempt->context->SetStatus(
-                              errors::DataLoss("Failed to restore element from "
-                                               "partially-dequeued batch "
-                                               "to FIFOQueue: ",
-                                               s.error_message()));
-                        }
-                        queues_[j].push_front(element);
-                      }
-                    }
-                  }
-                  if (allow_small_batch && !queues_[0].empty()) {
-                    // Request all remaining elements in the queue.
-                    queue_size = queues_[0].size();
-                    attempt->tuple.clear();
-                    attempt->elements_requested = queue_size;
-                  } else {
-                    if (allow_small_batch) {
-                      // There may be some other attempts containing
-                      // values.  If so, we'll yield and wait for them
-                      // to add elements to the queue.
-                      if (!enqueue_attempts_.empty()) return kProgress;
-                    }
-                    if (attempt->context->status().ok()) {
-                      attempt->context->SetStatus(errors::OutOfRange(
-                          "FIFOQueue '", name_, "' is closed and has ",
-                          "insufficient elements (requested ",
-                          attempt->elements_requested, ", current size ",
-                          queue_size, ")"));
+            if (closed_ && queue_size < attempt->elements_requested) {
+              // If we don't have enough for a full dequeue, we have
+              // to reset the attempt tuple.
+              if (!attempt->tuple.empty()) {
+                // Restore already-dequeued elements to the front of the
+                // queue.
+                for (int64 i = attempt->tuple[0].dim_size(0) -
+                               attempt->elements_requested - 1;
+                     i >= 0; --i) {
+                  for (int j = 0; j < num_components(); ++j) {
+                    PersistentTensor element;
+                    Status s = GetElementComponentFromBatch(
+                        attempt->tuple, i, j, attempt->context, &element);
+                    if (!s.ok()) {
+                      attempt->context->SetStatus(
+                          errors::DataLoss("Failed to restore element from "
+                                           "partially-dequeued batch "
+                                           "to FIFOQueue: ",
+                                           s.error_message()));
                     }
-                    return kComplete;
+                    queues_[j].push_front(element);
                   }
                 }
+              }
+              if (allow_small_batch && !queues_[0].empty()) {
+                // Request all remaining elements in the queue.
+                queue_size = queues_[0].size();
+                attempt->tuple.clear();
+                attempt->elements_requested = queue_size;
+              } else {
+                if (allow_small_batch) {
+                  // There may be some other attempts containing
+                  // values.  If so, we'll yield and wait for them
+                  // to add elements to the queue.
+                  if (!enqueue_attempts_.empty()) return kProgress;
+                }
+                if (attempt->context->status().ok()) {
+                  attempt->context->SetStatus(errors::OutOfRange(
+                      "FIFOQueue '", name_, "' is closed and has ",
+                      "insufficient elements (requested ",
+                      attempt->elements_requested, ", current size ",
+                      queue_size, ")"));
+                }
+                return kComplete;
+              }
+            }
 
-                RunResult result = kNoProgress;
-                for (; queue_size > 0; --queue_size) {
-                  if (attempt->tuple.empty()) {
-                    // Only allocate tuple when we have something to dequeue
-                    // so we don't use excessive memory when there are many
-                    // blocked dequeue attempts waiting.
-                    attempt->tuple.reserve(num_components());
-                    for (int i = 0; i < num_components(); ++i) {
-                      const TensorShape shape =
-                          ManyOutShape(i, attempt->elements_requested);
-                      Tensor element;
-                      attempt->context->SetStatus(
-                          attempt->context->allocate_temp(component_dtypes_[i],
-                                                          shape, &element));
-                      if (!attempt->context->status().ok()) return kComplete;
-                      attempt->tuple.emplace_back(element);
-                    }
-                  }
-                  result = kProgress;
-                  Tuple tuple;
-                  DequeueLocked(attempt->context, &tuple);
-                  const int64 index = attempt->tuple[0].dim_size(0) -
-                                      attempt->elements_requested;
-                  for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
-                        std::move(tuple[i]), &attempt->tuple[i], index));
-                    if (!attempt->context->status().ok()) return kComplete;
-                  }
-                  tuple.clear();
-                  --attempt->elements_requested;
-                  if (attempt->elements_requested == 0) {
-                    tuple = attempt->tuple;
-                    attempt->done_callback = [callback, tuple]() {
-                      callback(tuple);
-                    };
-                    return kComplete;
-                  }
+            RunResult result = kNoProgress;
+            for (; queue_size > 0; --queue_size) {
+              if (attempt->tuple.empty()) {
+                // Only allocate tuple when we have something to dequeue
+                // so we don't use excessive memory when there are many
+                // blocked dequeue attempts waiting.
+                attempt->tuple.reserve(num_components());
+                for (int i = 0; i < num_components(); ++i) {
+                  const TensorShape shape =
+                      ManyOutShape(i, attempt->elements_requested);
+                  Tensor element;
+                  attempt->context->SetStatus(attempt->context->allocate_temp(
+                      component_dtypes_[i], shape, &element));
+                  if (!attempt->context->status().ok()) return kComplete;
+                  attempt->tuple.emplace_back(element);
                 }
-                return result;
-              });
+              }
+              result = kProgress;
+              Tuple tuple;
+              DequeueLocked(attempt->context, &tuple);
+              const int64 index =
+                  attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+              for (int i = 0; i < num_components(); ++i) {
+                attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                    std::move(tuple[i]), &attempt->tuple[i], index));
+                if (!attempt->context->status().ok()) return kComplete;
+              }
+              tuple.clear();
+              --attempt->elements_requested;
+              if (attempt->elements_requested == 0) {
+                tuple = attempt->tuple;
+                attempt->done_callback = [callback, tuple]() {
+                  callback(tuple);
+                };
+                return kComplete;
+              }
+            }
+            return result;
+          });
     }
   }
   if (!already_cancelled) {
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index bde39770de..7090417dfd 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
 
@@ -60,7 +60,7 @@ DEFINE_SETZERO_CPU(Variant);
 template <typename T>
 void SetZeroFunctor<Eigen::SyclDevice, T>::operator()(
     const Eigen::SyclDevice& d, typename TTypes<T>::Flat out) {
-      To32Bit(out).device(d) = To32Bit(out).constant(T(0));
+  To32Bit(out).device(d) = To32Bit(out).constant(T(0));
 }
 
 #define DEFINE_SETZERO_SYCL(T) \
@@ -118,7 +118,8 @@ DEFINE_SETONE_SYCL(double);
 
 template <typename T>
 struct FillFunctor<Eigen::ThreadPoolDevice, T> {
-  void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes<T>::Flat out,
+  void operator()(const Eigen::ThreadPoolDevice& d,
+                  typename TTypes<T>::Flat out,
                   typename TTypes<T>::ConstScalar in) {
     out.device(d) = out.constant(in());
   }
@@ -150,8 +151,7 @@ struct FillFunctor<Eigen::SyclDevice, T> {
   }
 };
 
-#define DEFINE_FILL_SYCL(T) \
-  template struct FillFunctor<Eigen::SyclDevice, T>;
+#define DEFINE_FILL_SYCL(T) template struct FillFunctor<Eigen::SyclDevice, T>;
 DEFINE_FILL_SYCL(float);
 DEFINE_FILL_SYCL(double);
 TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL)
diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc
index 47f4189c30..135d002345 100644
--- a/tensorflow/core/kernels/fractional_avg_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc
@@ -232,8 +232,9 @@ class FractionalAvgPoolGradOp : public OpKernel {
 
     // Grab the inputs.
     const Tensor& orig_input_tensor_shape = context->input(0);
-    OP_REQUIRES(context, orig_input_tensor_shape.dims() == 1 &&
-                             orig_input_tensor_shape.NumElements() == 4,
+    OP_REQUIRES(context,
+                orig_input_tensor_shape.dims() == 1 &&
+                    orig_input_tensor_shape.NumElements() == 4,
                 errors::InvalidArgument("original input tensor shape must be"
                                         "1-dimensional and 4 elements"));
     const Tensor& out_backprop = context->input(1);
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index ef9e848413..9d4bc35ba8 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -253,22 +253,21 @@ class SymbolicGradientOp : public AsyncOpKernel {
       args.push_back(ctx->input(i));
     }
     std::vector<Tensor>* rets = new std::vector<Tensor>;
-    lib->Run(
-        opts, handle, args, rets, [ctx, done, rets](const Status& status) {
-          if (!status.ok()) {
-            ctx->SetStatus(status);
-          } else if (rets->size() != ctx->num_outputs()) {
-            ctx->SetStatus(errors::InvalidArgument(
-                "SymGrad expects to return ", ctx->num_outputs(),
-                " tensor(s), but get ", rets->size(), " tensor(s) instead."));
-          } else {
-            for (size_t i = 0; i < rets->size(); ++i) {
-              ctx->set_output(i, (*rets)[i]);
-            }
-          }
-          delete rets;
-          done();
-        });
+    lib->Run(opts, handle, args, rets, [ctx, done, rets](const Status& status) {
+      if (!status.ok()) {
+        ctx->SetStatus(status);
+      } else if (rets->size() != ctx->num_outputs()) {
+        ctx->SetStatus(errors::InvalidArgument(
+            "SymGrad expects to return ", ctx->num_outputs(),
+            " tensor(s), but get ", rets->size(), " tensor(s) instead."));
+      } else {
+        for (size_t i = 0; i < rets->size(); ++i) {
+          ctx->set_output(i, (*rets)[i]);
+        }
+      }
+      delete rets;
+      done();
+    });
   }
 
  private:
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
index a8484390b9..4a67b2b3a3 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
@@ -68,7 +68,8 @@ void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
 template <class T>
 void SetNanFunctor<T>::operator()(const Eigen::GpuDevice& d,
                                   typename TTypes<T>::Flat out) {
-  To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
+  To32Bit(out).device(d) =
+      To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
 }
 
 template class VarianceToInvVariance<float>;
diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc
index dde08b37ea..e6fefe643b 100644
--- a/tensorflow/core/kernels/gather_functor.cc
+++ b/tensorflow/core/kernels/gather_functor.cc
@@ -25,12 +25,12 @@ typedef Eigen::GpuDevice GPUDevice;
 namespace functor {
 
 // Forward declarations of the functor specializations for GPU.
-#define DECLARE_GPU_SPECS_INDEX(T, Index)                             \
-  template <>                                                         \
-  int64 GatherFunctor<GPUDevice, T, Index>::operator()(               \
+#define DECLARE_GPU_SPECS_INDEX(T, Index)                               \
+  template <>                                                           \
+  int64 GatherFunctor<GPUDevice, T, Index>::operator()(                 \
       OpKernelContext* ctx, typename TTypes<T, 3>::ConstTensor Tparams, \
-      typename TTypes<Index>::ConstFlat Tindices,                     \
-      typename TTypes<T, 3>::Tensor Tout);                            \
+      typename TTypes<Index>::ConstFlat Tindices,                       \
+      typename TTypes<T, 3>::Tensor Tout);                              \
   extern template struct GatherFunctor<GPUDevice, T, Index>;
 
 #define DECLARE_GPU_SPECS(T)         \
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 1e429a037e..16ccb03b85 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -18,12 +18,12 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/prefetch.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -52,21 +52,23 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
   const size_t slice_bytes = slice_elems * sizeof(T);
   auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
   mutex mu;
-  // Store the value of invalidate index for printing error information, it's a shared variable.
+  // Store the value of invalidate index for printing error information, it's a
+  // shared variable.
   SliceIndex result = -1;
-  auto work = [&] (int64 start, int64 end) {
+  auto work = [&](int64 start, int64 end) {
     SliceIndex batch_idx = static_cast<SliceIndex>(start / indices_size);
     SliceIndex indices_idx = static_cast<SliceIndex>(start % indices_size);
     SliceIndex batch_idx_end = static_cast<SliceIndex>(end / indices_size);
     SliceIndex indices_idx_end = static_cast<SliceIndex>(end % indices_size);
 
     while ((batch_idx < batch_idx_end) ||
-            (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) {
+           (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) {
       SliceIndex i_next = indices_idx + 1;
       SliceIndex b_next = batch_idx + 1;
       if ((batch_idx == batch_idx_end && i_next < indices_idx_end) ||
-              (i_next < indices_size)) {
-        port::prefetch<port::PREFETCH_HINT_T0>(&params(batch_idx, indices(i_next), 0));
+          (i_next < indices_size)) {
+        port::prefetch<port::PREFETCH_HINT_T0>(
+            &params(batch_idx, indices(i_next), 0));
         port::prefetch<port::PREFETCH_HINT_T0>(&out(batch_idx, i_next, 0));
         b_next = batch_idx;
       } else if (b_next <= batch_idx_end) {
@@ -85,11 +87,12 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
       // ahead-of-time compilation binary size).
       if (is_simple_type<T>::value) {
         // Avoid auto-promotion to Index from SliceIndex by casting.
-        memcpy(out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
-               params_base + (batch_idx * static_cast<SliceIndex>(limit) +
-                              static_cast<SliceIndex>(index)) *
-                             slice_elems,
-               slice_bytes);
+        memcpy(
+            out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
+            params_base + (batch_idx * static_cast<SliceIndex>(limit) +
+                           static_cast<SliceIndex>(index)) *
+                              slice_elems,
+            slice_bytes);
       } else {
         // For non-"simple" types (e.g. strings).
         out.template chip<1>(indices_idx) = params.template chip<1>(index);
@@ -99,8 +102,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
     }
   };
 
-  Shard(worker_threads->num_threads, worker_threads->workers, batch_size*indices_size,
-        slice_elems * sizeof(T), work);
+  Shard(worker_threads->num_threads, worker_threads->workers,
+        batch_size * indices_size, slice_elems * sizeof(T), work);
   return result;
 }
 
@@ -117,16 +120,16 @@ struct GatherFunctorCPU {
     bool use_large = (slice_size > std::numeric_limits<int32>::max() ||
                       params.size() > std::numeric_limits<int32>::max() ||
                       N > std::numeric_limits<int32>::max());
-#define CALL(elems)                                                        \
-  do {                                                                     \
-    if (use_large) {                                                       \
-      bad_i = HandleCopies<T, Index, int64, elems>(ctx, params, indices,   \
-                                                   slice_size, out);       \
-    } else {                                                               \
-      const int32 small_slice = static_cast<int32>(slice_size);            \
-      bad_i = HandleCopies<T, Index, int32, elems>(ctx, params, indices,   \
-                                                   small_slice, out);      \
-    }                                                                      \
+#define CALL(elems)                                                      \
+  do {                                                                   \
+    if (use_large) {                                                     \
+      bad_i = HandleCopies<T, Index, int64, elems>(ctx, params, indices, \
+                                                   slice_size, out);     \
+    } else {                                                             \
+      const int32 small_slice = static_cast<int32>(slice_size);          \
+      bad_i = HandleCopies<T, Index, int32, elems>(ctx, params, indices, \
+                                                   small_slice, out);    \
+    }                                                                    \
   } while (0)
 
     if (slice_size == 10)
@@ -143,7 +146,8 @@ struct GatherFunctorCPU {
 
 template <typename Device, typename T, typename Index>
 struct GatherFunctor {
-  int64 operator()(OpKernelContext* ctx, typename TTypes<T, 3>::ConstTensor params,
+  int64 operator()(OpKernelContext* ctx,
+                   typename TTypes<T, 3>::ConstTensor params,
                    typename TTypes<Index>::ConstFlat indices,
                    typename TTypes<T, 3>::Tensor out);
 };
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 239d5d2e99..d6cbcf1d93 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -106,8 +106,7 @@ class GatherOp : public OpKernel {
       auto out_flat = out->shaped<T, 3>({outer_size, N, inner_size});
 
       functor::GatherFunctor<Device, T, Index> functor;
-      int64 bad_i = functor(c, params_flat,
-                            indices_flat, out_flat);
+      int64 bad_i = functor(c, params_flat, indices_flat, out_flat);
 
       OP_REQUIRES(
           c, bad_i < 0,
diff --git a/tensorflow/core/kernels/hinge-loss.h b/tensorflow/core/kernels/hinge-loss.h
index 789a7ce7a3..d303e9c877 100644
--- a/tensorflow/core/kernels/hinge-loss.h
+++ b/tensorflow/core/kernels/hinge-loss.h
@@ -50,9 +50,8 @@ class HingeLossUpdater : public DualLossUpdater {
     // valid value for new dual = 0
     // c. new optimal value > 1.0. Then new optimal value should be set to 1.0.
     const double candidate_optimal_dual =
-        current_dual +
-        (label - wx) /
-            (num_loss_partitions * example_weight * weighted_example_norm);
+        current_dual + (label - wx) / (num_loss_partitions * example_weight *
+                                       weighted_example_norm);
     if (label * candidate_optimal_dual < 0) {
       return 0.0;
     }
diff --git a/tensorflow/core/kernels/histogram_op_gpu.cu.cc b/tensorflow/core/kernels/histogram_op_gpu.cu.cc
index c2bb958be8..a88e9b0ddc 100644
--- a/tensorflow/core/kernels/histogram_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/histogram_op_gpu.cu.cc
@@ -17,16 +17,16 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/histogram_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "external/cub_archive/cub/device/device_histogram.cuh"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/histogram_op.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -104,8 +104,8 @@ struct HistogramFixedWidthFunctor<GPUDevice, T, Tout> {
         /* num_samples */ num_samples,
         /* stream */ stream);
     if (err != cudaSuccess) {
-      return errors::Internal("Could not launch HistogramRange: ",
-                              cudaGetErrorString(err), ".");
+      return errors::Internal(
+          "Could not launch HistogramRange: ", cudaGetErrorString(err), ".");
     }
 
     return Status::OK();
diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h
index f088315ff5..faf997be05 100644
--- a/tensorflow/core/kernels/image_resizer_state.h
+++ b/tensorflow/core/kernels/image_resizer_state.h
@@ -109,8 +109,9 @@ struct ImageResizerState {
     ValidateAndCalculateOutputSize(context, input);
     if (!context->status().ok()) return;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({input.dim_size(0), out_height,
-                                                out_width, input.dim_size(3)}),
+                                0,
+                                TensorShape({input.dim_size(0), out_height,
+                                             out_width, input.dim_size(3)}),
                                 &output));
   }
 
@@ -168,8 +169,9 @@ struct ImageResizerGradientState {
         CalculateResizeScale(original_width, resized_width, align_corners_);
     output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({batch_size, original_height,
-                                                original_width, channels}),
+                                0,
+                                TensorShape({batch_size, original_height,
+                                             original_width, channels}),
                                 &output));
   }
 
diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
index e2861ae090..c37055239c 100644
--- a/tensorflow/core/kernels/in_topk_op.cc
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -98,36 +98,36 @@ class InTopK : public OpKernel {
   int k_;
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("InTopK").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("precision")
-    .TypeConstraint<int32>("T"),
-    InTopK<float, int32>);
-REGISTER_KERNEL_BUILDER(
-    Name("InTopK").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("precision")
-    .TypeConstraint<int64>("T"),
-    InTopK<float, int64>);
-
-REGISTER_KERNEL_BUILDER(
-    Name("InTopKV2").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("k")
-    .HostMemory("precision")
-    .TypeConstraint<int32>("T"),
-    InTopK<float, int32>);
-REGISTER_KERNEL_BUILDER(
-    Name("InTopKV2").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("k")
-    .HostMemory("precision")
-    .TypeConstraint<int64>("T"),
-    InTopK<float, int64>);
+REGISTER_KERNEL_BUILDER(Name("InTopK")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("precision")
+                            .TypeConstraint<int32>("T"),
+                        InTopK<float, int32>);
+REGISTER_KERNEL_BUILDER(Name("InTopK")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("precision")
+                            .TypeConstraint<int64>("T"),
+                        InTopK<float, int64>);
+
+REGISTER_KERNEL_BUILDER(Name("InTopKV2")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("k")
+                            .HostMemory("precision")
+                            .TypeConstraint<int32>("T"),
+                        InTopK<float, int32>);
+REGISTER_KERNEL_BUILDER(Name("InTopKV2")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("k")
+                            .HostMemory("precision")
+                            .TypeConstraint<int64>("T"),
+                        InTopK<float, int64>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 7728ba850c..a71d047ed1 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -27,13 +27,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SyclDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 
 template <typename Device, typename T>
-Status DoParallelConcatUpdate(const Device& d, const Tensor& value,
-                              int32 loc, Tensor* output) {
+Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc,
+                              Tensor* output) {
   auto Tvalue = value.shaped<T, 2>({1, value.NumElements()});
   auto Toutput = output->flat_outer_dims<T>();
   auto nrows = Toutput.dimension(0);
@@ -74,7 +74,7 @@ Status DoParallelConcat(const SyclDevice& d, const Tensor& value, int32 loc,
       return errors::InvalidArgument("Unsupported data type: ", value.dtype());
   }
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
 
@@ -207,7 +207,7 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
                         ParallelConcatUpdate<CPUDevice>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc
index f8ed935157..f561287f7a 100644
--- a/tensorflow/core/kernels/l2loss_op.cc
+++ b/tensorflow/core/kernels/l2loss_op.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/l2loss_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc
index 36907fb571..b58bcf5834 100644
--- a/tensorflow/core/kernels/linalg_ops_common.cc
+++ b/tensorflow/core/kernels/linalg_ops_common.cc
@@ -108,7 +108,6 @@ void LinearAlgebraOp<Scalar>::Compute(OpKernelContext* context) {
   auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
   Shard(worker_threads.num_threads, worker_threads.workers,
         batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard);
-
 }
 
 template <typename Scalar>
diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc
index 31a427f2c9..1335a95dce 100755
--- a/tensorflow/core/kernels/lmdb_reader_op.cc
+++ b/tensorflow/core/kernels/lmdb_reader_op.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/framework/reader_base.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 #include <sys/stat.h>
@@ -77,15 +77,13 @@ class LMDBReader : public ReaderBase {
         *at_end = true;
         return Status::OK();
       }
-    }
-    else {
+    } else {
       if (Seek(MDB_NEXT) == false) {
         *at_end = true;
         return Status::OK();
       }
     }
-    *key = string(static_cast<const char*>(mdb_key_.mv_data),
-                  mdb_key_.mv_size);
+    *key = string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
     *value = string(static_cast<const char*>(mdb_value_.mv_data),
                     mdb_value_.mv_size);
     *produced = true;
@@ -123,13 +121,10 @@ class LMDBReaderOp : public ReaderOpKernel {
   explicit LMDBReaderOp(OpKernelConstruction* context)
       : ReaderOpKernel(context) {
     Env* env = context->env();
-    SetReaderFactory([this, env]() {
-      return new LMDBReader(name(), env);
-    });
+    SetReaderFactory([this, env]() { return new LMDBReader(name(), env); });
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU),
-                        LMDBReaderOp);
+REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), LMDBReaderOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/logistic-loss.h b/tensorflow/core/kernels/logistic-loss.h
index 2765f42bbd..6479e6f5dc 100644
--- a/tensorflow/core/kernels/logistic-loss.h
+++ b/tensorflow/core/kernels/logistic-loss.h
@@ -122,10 +122,9 @@ class LogisticLossUpdater : public DualLossUpdater {
                              num_loss_partitions * weighted_example_norm *
                                  example_weight *
                                  (0.5 * (1 + tanhx) / label - current_dual);
-    const double denominator = -2 * label -
-                               num_loss_partitions * weighted_example_norm *
-                                   example_weight * (1 - tanhx * tanhx) * 0.5 /
-                                   label;
+    const double denominator =
+        -2 * label - num_loss_partitions * weighted_example_norm *
+                         example_weight * (1 - tanhx * tanhx) * 0.5 / label;
     return x - numerator / denominator;
   }
 };
diff --git a/tensorflow/core/kernels/loss_test.cc b/tensorflow/core/kernels/loss_test.cc
index 89f0677e1f..460d65c5c2 100644
--- a/tensorflow/core/kernels/loss_test.cc
+++ b/tensorflow/core/kernels/loss_test.cc
@@ -32,14 +32,17 @@ namespace {
 
 TEST(LogisticLoss, ComputePrimalLoss) {
   LogisticLossUpdater loss_updater;
-  EXPECT_NEAR(0.693147, loss_updater.ComputePrimalLoss(
-                            0 /* wx */, 1 /* label */, 1 /* example weight */),
+  EXPECT_NEAR(0.693147,
+              loss_updater.ComputePrimalLoss(0 /* wx */, 1 /* label */,
+                                             1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */,
-                                                  1 /* example weight */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */,
+                                             1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */,
-                                                  1 /* example weight */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */,
+                                             1 /* example weight */),
               1e-3);
 }
 
@@ -53,31 +56,35 @@ TEST(LogisticLoss, ComputeDualLoss) {
               loss_updater.ComputeDualLoss(1 /* current dual */, 1 /* label */,
                                            1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(-0.693147, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                      1 /* label */,
-                                                      1 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -0.693147,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1 /* label */,
+                                   1 /* example weight */),
+      1e-3);
 }
 
 TEST(LogisticLoss, ComputeUpdatedDual) {
   LogisticLossUpdater loss_updater;
-  EXPECT_NEAR(0.479, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.5 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.479,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.5 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.031, loss_updater.ComputeUpdatedDual(
-                          2 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, 0.1 /* current_dual */,
-                          -0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.031,
+              loss_updater.ComputeUpdatedDual(
+                  2 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, 0.1 /* current_dual */,
+                  -0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(SquaredLoss, ComputePrimalLoss) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(0.5,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(40.5,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -95,43 +102,50 @@ TEST(SquaredLoss, ComputePrimalLoss) {
 
 TEST(SquaredLoss, ComputeDualLoss) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(0.66, loss_updater.ComputeDualLoss(0.2 /* current dual */,
-                                                 -1.0 /* label */,
-                                                 3.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(-0.375, loss_updater.ComputeDualLoss(1.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   1.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      0.66,
+      loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      -0.375,
+      loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      -1.125,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(SquaredLoss, ComputeUpdatedDual) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.3 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.336,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.3 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual(
-                          5 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.427,
+              loss_updater.ComputeUpdatedDual(
+                  5 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(HingeLoss, ComputePrimalLoss) {
   HingeLossUpdater loss_updater;
-  EXPECT_NEAR(1.0, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(1.0,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(0.0,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -149,10 +163,11 @@ TEST(HingeLoss, ComputePrimalLoss) {
 
 TEST(HingeLoss, ComputeDualLoss) {
   HingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
   EXPECT_NEAR(
       std::numeric_limits<double>::max(),
       loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
@@ -163,10 +178,11 @@ TEST(HingeLoss, ComputeDualLoss) {
       loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
                                    1.0 /* example weight */),
       1e-3);
-  EXPECT_NEAR(-1.5, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                 1.0 /* label */,
-                                                 3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -1.5,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(HingeLoss, ConvertLabel) {
@@ -195,28 +211,31 @@ TEST(HingeLoss, ComputeUpdatedDual) {
   // weighted_example_norm=100.0, it turns out that the optimal value to update
   // the dual to is 0.507 which is within the permitted range and thus should be
   // the value returned.
-  EXPECT_NEAR(0.507, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.5 /* current_dual */,
-                         0.3 /* wx */, 100.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.507,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.5 /* current_dual */,
+                  0.3 /* wx */, 100.0 /* weighted_example_norm */),
               1e-3);
   // When label=-1.0, example_weight=1.0, current_dual=0.4, wx=0.6,
   // weighted_example_norm=10.0 and num_loss_partitions=10, it turns out that
   // the optimal value to update the dual to is 0.384 which is within the
   // permitted range and thus should be the value returned.
-  EXPECT_NEAR(-0.416, loss_updater.ComputeUpdatedDual(
-                          10 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.6 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.416,
+              loss_updater.ComputeUpdatedDual(
+                  10 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.6 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
   // When label=1.0, example_weight=1.0, current_dual=-0.5, wx=0.3 and
   // weighted_example_norm=10.0, it turns out that the optimal value to update
   // the dual to is -0.43. However, this is outside the allowed [0.0, 1.0] range
   // and hence the closest permitted value (0.0) should be returned instead.
-  EXPECT_NEAR(0.0, loss_updater.ComputeUpdatedDual(
-                       1 /* num partitions */, 1.0 /* label */,
-                       1.0 /* example weight */, -0.5 /* current_dual */,
-                       0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, -0.5 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
   // When label=-1.0, example_weight=2.0, current_dual=-1.0, wx=0.3 and
@@ -224,17 +243,19 @@ TEST(HingeLoss, ComputeUpdatedDual) {
   // the dual to is -1.065. However, this is outside the allowed [-1.0, 0.0]
   // range and hence the closest permitted value (-1.0) should be returned
   // instead.
-  EXPECT_NEAR(-1.0, loss_updater.ComputeUpdatedDual(
-                        1 /* num partitions */, -1.0 /* label */,
-                        2.0 /* example weight */, -1.0 /* current_dual */,
-                        0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-1.0,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, -1.0 /* label */,
+                  2.0 /* example weight */, -1.0 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(SmoothHingeLoss, ComputePrimalLoss) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(0.5,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(0.0,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -252,10 +273,11 @@ TEST(SmoothHingeLoss, ComputePrimalLoss) {
 
 TEST(SmoothHingeLoss, ComputeDualLoss) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
   EXPECT_NEAR(
       std::numeric_limits<double>::max(),
       loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
@@ -266,24 +288,27 @@ TEST(SmoothHingeLoss, ComputeDualLoss) {
       loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
                                    1.0 /* example weight */),
       1e-3);
-  EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -1.125,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(SmoothHingeLoss, ComputeUpdatedDual) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.3 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.336,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.3 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual(
-                          5 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.427,
+              loss_updater.ComputeUpdatedDual(
+                  5 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index c905ebc84a..c3a59c9576 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -229,10 +229,11 @@ class LRNOp : public OpKernel {
   explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     float tmp;
     OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
@@ -247,9 +248,10 @@ class LRNOp : public OpKernel {
     const Tensor& in = context->input(0);
     OP_REQUIRES(context, in.dims() == 4,
                 errors::InvalidArgument("in must be 4-dimensional"));
-    OP_REQUIRES(context, FastBoundsCheck(in.NumElements(),
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("argument to LRN too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(in.NumElements(), std::numeric_limits<int>::max()),
+        errors::InvalidArgument("argument to LRN too large"));
     // Cast to platform-specific int to avoid conversion warnings.
     const int batch = static_cast<int>(in.dim_size(0));
     const int rows = static_cast<int>(in.dim_size(1));
@@ -448,10 +450,11 @@ class LRNGradOp : public OpKernel {
   explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     float tmp;
     OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc
index 5eb060f664..cdff7bad5f 100644
--- a/tensorflow/core/kernels/matching_files_op.cc
+++ b/tensorflow/core/kernels/matching_files_op.cc
@@ -45,15 +45,14 @@ class MatchingFilesOp : public OpKernel {
     int num_files = 0;
     std::vector<std::vector<string>> all_fnames(num_patterns);
     for (int i = 0; i < num_patterns; i++) {
-      OP_REQUIRES_OK(
-          context,
-          context->env()->GetMatchingPaths(patterns(i), &all_fnames[i]));
+      OP_REQUIRES_OK(context, context->env()->GetMatchingPaths(patterns(i),
+                                                               &all_fnames[i]));
       num_files += all_fnames[i].size();
     }
     Tensor* output_t = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(
-                       "filenames", TensorShape({num_files}), &output_t));
+    OP_REQUIRES_OK(
+        context, context->allocate_output("filenames", TensorShape({num_files}),
+                                          &output_t));
     auto output = output_t->vec<string>();
     int index = 0;
     for (int i = 0; i < num_patterns; ++i) {
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index cb68690f28..f499ce6519 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -261,12 +261,12 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
       std::vector<int64>* algorithms, bool use_autotune, Tensor* out) {
     using perftools::gputools::blas::AlgorithmConfig;
     using perftools::gputools::blas::ComputationType;
-    using perftools::gputools::blas::ProfileResult;
-    using perftools::gputools::blas::Transpose;
     using perftools::gputools::blas::kDefaultAlgorithm;
     using perftools::gputools::blas::kDefaultBlasGemm;
     using perftools::gputools::blas::kDefaultBlasGemv;
     using perftools::gputools::blas::kNoAlgorithm;
+    using perftools::gputools::blas::ProfileResult;
+    using perftools::gputools::blas::Transpose;
     Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose};
     const uint64 m = a.dim_size(1 - dim_pair[0].first);
     const uint64 k = a.dim_size(dim_pair[0].first);
diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h
index 6398da2fb9..628895ca86 100644
--- a/tensorflow/core/kernels/matmul_op.h
+++ b/tensorflow/core/kernels/matmul_op.h
@@ -30,7 +30,8 @@ struct MatMulTypes {
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned>
       out_type;
   typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
-                           Eigen::Aligned> in_type;
+                           Eigen::Aligned>
+      in_type;
 };
 
 template <typename Device, typename In0, typename In1, typename Out,
diff --git a/tensorflow/core/kernels/matrix_exponential_op.cc b/tensorflow/core/kernels/matrix_exponential_op.cc
index 4cc3f32f7e..99db898301 100644
--- a/tensorflow/core/kernels/matrix_exponential_op.cc
+++ b/tensorflow/core/kernels/matrix_exponential_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-
 namespace tensorflow {
 
 template <class Scalar>
@@ -40,7 +39,8 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> {
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& input = inputs[0];
     if (input.rows() == 0) return;
-    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using Matrix =
+        Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
     Matrix tmp = input;
     outputs->at(0) = tmp.exp();
   }
@@ -51,9 +51,9 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> {
 
 REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<float>), float);
 REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<double>), double);
-REGISTER_LINALG_OP("MatrixExponential",
-                   (MatrixExponentialOp<complex64>), complex64);
-REGISTER_LINALG_OP("MatrixExponential",
-                   (MatrixExponentialOp<complex128>), complex128);
+REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex128>),
+                   complex128);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_logarithm_op.cc b/tensorflow/core/kernels/matrix_logarithm_op.cc
index cf0007b5b6..22ca094e24 100644
--- a/tensorflow/core/kernels/matrix_logarithm_op.cc
+++ b/tensorflow/core/kernels/matrix_logarithm_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-
 namespace tensorflow {
 
 template <class Scalar>
@@ -40,7 +39,8 @@ class MatrixLogarithmOp : public LinearAlgebraOp<Scalar> {
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& input = inputs[0];
     if (input.rows() == 0) return;
-    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using Matrix =
+        Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
     Matrix tmp = input;
     outputs->at(0) = tmp.log();
   }
@@ -53,9 +53,9 @@ class MatrixLogarithmOp : public LinearAlgebraOp<Scalar> {
 // logarithm. If all eigenvalues are positive, then this returns the correct
 // logarithm, however checking for positive definiteness adds significant
 // overhead. Therefore at present we only register this Op for complex types.
-REGISTER_LINALG_OP("MatrixLogarithm",
-                   (MatrixLogarithmOp<complex64>), complex64);
-REGISTER_LINALG_OP("MatrixLogarithm",
-                   (MatrixLogarithmOp<complex128>), complex128);
+REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp<complex128>),
+                   complex128);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc
index 9dd665392b..502d593474 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/core/kernels/matrix_set_diag_op.cc
@@ -69,8 +69,8 @@ class MatrixSetDiagOp : public OpKernel {
                 errors::InvalidArgument(
                     "must have diagonal.shape == input.shape[:-2] + "
                     "min(input.shape[-2:]), but received input shape: ",
-                    input_shape.DebugString(), " and diagonal shape: ",
-                    diag_shape.DebugString()));
+                    input_shape.DebugString(),
+                    " and diagonal shape: ", diag_shape.DebugString()));
 
     if (input.NumElements() == 0) {
       // This is a no-op.
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 2eefadad49..9be7408012 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/maxpooling_op.h"
 
 #include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -37,7 +38,6 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
@@ -89,7 +89,6 @@ static void SpatialMaxPoolWithArgMaxHelper(
   //    max value.
   auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
                 &output_arg_max, &out_backprop](int64 start, int64 limit) {
-
     const int32 depth = params.depth;
     const int32 in_rows = params.tensor_in_rows;
     const int32 in_cols = params.tensor_in_cols;
@@ -180,7 +179,6 @@ static void SpatialMaxPoolWithArgMaxHelper(
         input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
       }
     }
-
   };
 
   const int64 shard_cost = params.tensor_in_rows * params.tensor_in_cols *
@@ -567,7 +565,7 @@ class MaxPoolingGradGradOp : public OpKernel {
     //    tensor_out_as_matrix with the corresponding values in
     //    top_diff_as_matrix.
     auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       const int32 depth = params.depth;
       const int32 in_rows = params.tensor_in_rows;
       const int32 in_cols = params.tensor_in_cols;
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index f8daaca4c9..0c7a236b2f 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -450,10 +450,10 @@ bool MaxPoolBackwardWithArgmax<T>::operator()(
     T* bottom_diff, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
   SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
-    kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
+            kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
   MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                     kThreadsPerBlock, 0, d.stream()>>>(
-                                        output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
+      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
   return d.ok();
 }
 
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc
index 9fed01189f..39e60c9fce 100644
--- a/tensorflow/core/kernels/meta_support.cc
+++ b/tensorflow/core/kernels/meta_support.cc
@@ -98,9 +98,9 @@ typedef gemmlowp::meta::SimpleContext<gemmlowp::WorkersPool> LocalContext;
 template <typename Context, typename Params>
 void MultiThreadGemm(Context* context, const Params& params) {
   if (params.m <= 4) {
-      gemmlowp::meta::MultiThreadGemm<
-          Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params,
-          1, 8, 8>(context, params);
+    gemmlowp::meta::MultiThreadGemm<
+        Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, 1,
+        8, 8>(context, params);
   } else {
     if (params.m >= params.n) {
       gemmlowp::meta::MultiThreadGemm<
diff --git a/tensorflow/core/kernels/mfcc.cc b/tensorflow/core/kernels/mfcc.cc
index 2793005aa2..8c755e0df8 100644
--- a/tensorflow/core/kernels/mfcc.cc
+++ b/tensorflow/core/kernels/mfcc.cc
@@ -27,21 +27,19 @@ const double kFilterbankFloor = 1e-12;
 const int kDefaultFilterbankChannelCount = 40;
 const int kDefaultDCTCoefficientCount = 13;
 
-Mfcc::Mfcc() : initialized_(false),
-               lower_frequency_limit_(kDefaultLowerFrequencyLimit),
-               upper_frequency_limit_(kDefaultUpperFrequencyLimit),
-               filterbank_channel_count_(kDefaultFilterbankChannelCount),
-               dct_coefficient_count_(kDefaultDCTCoefficientCount) { }
+Mfcc::Mfcc()
+    : initialized_(false),
+      lower_frequency_limit_(kDefaultLowerFrequencyLimit),
+      upper_frequency_limit_(kDefaultUpperFrequencyLimit),
+      filterbank_channel_count_(kDefaultFilterbankChannelCount),
+      dct_coefficient_count_(kDefaultDCTCoefficientCount) {}
 
-bool Mfcc::Initialize(int input_length,
-                      double input_sample_rate) {
-  bool initialized = mel_filterbank_.Initialize(input_length,
-                                                input_sample_rate,
-                                                filterbank_channel_count_,
-                                                lower_frequency_limit_,
-                                                upper_frequency_limit_);
-  initialized &= dct_.Initialize(filterbank_channel_count_,
-                                 dct_coefficient_count_);
+bool Mfcc::Initialize(int input_length, double input_sample_rate) {
+  bool initialized = mel_filterbank_.Initialize(
+      input_length, input_sample_rate, filterbank_channel_count_,
+      lower_frequency_limit_, upper_frequency_limit_);
+  initialized &=
+      dct_.Initialize(filterbank_channel_count_, dct_coefficient_count_);
   initialized_ = initialized;
   return initialized;
 }
diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h
index 8268f47203..8eee76f7f0 100644
--- a/tensorflow/core/kernels/mfcc.h
+++ b/tensorflow/core/kernels/mfcc.h
@@ -20,18 +20,17 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/mfcc_dct.h"
 #include "tensorflow/core/kernels/mfcc_mel_filterbank.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
 
 class Mfcc {
  public:
   Mfcc();
-  bool Initialize(int input_length,
-                  double input_sample_rate);
+  bool Initialize(int input_length, double input_sample_rate);
 
   // Input is a single squared-magnitude spectrogram frame. The input spectrum
   // is converted to linear magnitude and weighted into bands using a
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.cc b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
index 630de8a5a3..3db3b51e8b 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.cc
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
@@ -38,13 +38,12 @@ namespace tensorflow {
 
 MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {}
 
-bool MfccMelFilterbank::Initialize(int input_length,
-                               double input_sample_rate,
-                               int output_channel_count,
-                               double lower_frequency_limit,
-                               double upper_frequency_limit) {
+bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate,
+                                   int output_channel_count,
+                                   double lower_frequency_limit,
+                                   double upper_frequency_limit) {
   num_channels_ = output_channel_count;
-  sample_rate_  = input_sample_rate;
+  sample_rate_ = input_sample_rate;
   input_length_ = input_length;
 
   if (num_channels_ < 1) {
@@ -85,10 +84,9 @@ bool MfccMelFilterbank::Initialize(int input_length,
   }
 
   // Always exclude DC; emulate HTK.
-  const double hz_per_sbin = 0.5 * sample_rate_ /
-      static_cast<double>(input_length_ - 1);
-  start_index_ = static_cast<int>(1.5 + (lower_frequency_limit /
-                                           hz_per_sbin));
+  const double hz_per_sbin =
+      0.5 * sample_rate_ / static_cast<double>(input_length_ - 1);
+  start_index_ = static_cast<int>(1.5 + (lower_frequency_limit / hz_per_sbin));
   end_index_ = static_cast<int>(upper_frequency_limit / hz_per_sbin);
 
   // Maps the input spectrum bin indices to filter bank channels/indices. For
@@ -121,12 +119,12 @@ bool MfccMelFilterbank::Initialize(int input_length,
       weights_[i] = 0.0;
     } else {
       if (channel >= 0) {
-        weights_[i] = (center_frequencies_[channel + 1] -
-                       FreqToMel(i * hz_per_sbin)) /
+        weights_[i] =
+            (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) /
             (center_frequencies_[channel + 1] - center_frequencies_[channel]);
       } else {
         weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) /
-            (center_frequencies_[0] - mel_low);
+                      (center_frequencies_[0] - mel_low);
       }
     }
   }
@@ -152,16 +150,16 @@ bool MfccMelFilterbank::Initialize(int input_length,
     }
   }
   if (!bad_channels.empty()) {
-    LOG(ERROR) << "Missing " << bad_channels.size() << " bands " <<
-        " starting at " << bad_channels[0] <<
-        " in mel-frequency design. " <<
-        "Perhaps too many channels or " <<
-        "not enough frequency resolution in spectrum. (" <<
-        "input_length: " << input_length <<
-        " input_sample_rate: " << input_sample_rate <<
-        " output_channel_count: " << output_channel_count <<
-        " lower_frequency_limit: " << lower_frequency_limit <<
-        " upper_frequency_limit: " << upper_frequency_limit;
+    LOG(ERROR) << "Missing " << bad_channels.size() << " bands "
+               << " starting at " << bad_channels[0]
+               << " in mel-frequency design. "
+               << "Perhaps too many channels or "
+               << "not enough frequency resolution in spectrum. ("
+               << "input_length: " << input_length
+               << " input_sample_rate: " << input_sample_rate
+               << " output_channel_count: " << output_channel_count
+               << " lower_frequency_limit: " << lower_frequency_limit
+               << " upper_frequency_limit: " << upper_frequency_limit;
   }
   initialized_ = true;
   return true;
@@ -171,7 +169,7 @@ bool MfccMelFilterbank::Initialize(int input_length,
 // square root, then summing FFT magnitudes under triangular integration windows
 // whose widths increase with frequency.
 void MfccMelFilterbank::Compute(const std::vector<double> &input,
-                            std::vector<double> *output) const {
+                                std::vector<double> *output) const {
   if (!initialized_) {
     LOG(ERROR) << "Mel Filterbank not initialized.";
     return;
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h
index 1bdc2dc93b..37c3936e80 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.h
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h
@@ -27,10 +27,8 @@ class MfccMelFilterbank {
  public:
   MfccMelFilterbank();
   bool Initialize(int input_length,  // Number of unique FFT bins fftsize/2+1.
-                  double input_sample_rate,
-                  int output_channel_count,
-                  double lower_frequency_limit,
-                  double upper_frequency_limit);
+                  double input_sample_rate, int output_channel_count,
+                  double lower_frequency_limit, double upper_frequency_limit);
 
   // Takes a squared-magnitude spectrogram slice as input, computes a
   // triangular-mel-weighted linear-magnitude filterbank, and places the result
@@ -56,7 +54,7 @@ class MfccMelFilterbank {
   // FFT bin i contributes to the upper side of mel channel band_mapper_[i]
   std::vector<int> band_mapper_;
   int start_index_;  // Lowest FFT bin used to calculate mel spectrum.
-  int end_index_;  // Highest FFT bin used to calculate mel spectrum.
+  int end_index_;    // Highest FFT bin used to calculate mel spectrum.
 
   TF_DISALLOW_COPY_AND_ASSIGN(MfccMelFilterbank);
 };
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
index 602dfeb4e5..54f31e1699 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
@@ -34,11 +34,9 @@ TEST(MfccMelFilterbankTest, AgreesWithPythonGoldenValues) {
     input.push_back(i + 1);
   }
   const int kChannelCount = 20;
-  filterbank.Initialize(input.size(),
-                        22050 /* sample rate */,
-                        kChannelCount /* channels */,
-                        20.0 /*  lower frequency limit */,
-                        4000.0 /* upper frequency limit */);
+  filterbank.Initialize(
+      input.size(), 22050 /* sample rate */, kChannelCount /* channels */,
+      20.0 /*  lower frequency limit */, 4000.0 /* upper frequency limit */);
 
   std::vector<double> output;
   filterbank.Compute(input, &output);
@@ -65,13 +63,10 @@ TEST(MfccMelFilterbankTest, IgnoresExistingContentOfOutputVector) {
   std::vector<double> input;
   std::vector<double> output;
 
-  filterbank.Initialize(kSampleCount,
-                        22050 /* sample rate */,
-                        20 /* channels */,
-                        20.0 /*  lower frequency limit */,
+  filterbank.Initialize(kSampleCount, 22050 /* sample rate */,
+                        20 /* channels */, 20.0 /*  lower frequency limit */,
                         4000.0 /* upper frequency limit */);
 
-
   // First call with nonzero input value, and an empty output vector,
   // will resize the output and fill it with the correct, nonzero outputs.
   input.assign(kSampleCount, 1.0);
diff --git a/tensorflow/core/kernels/mfcc_test.cc b/tensorflow/core/kernels/mfcc_test.cc
index cb32df8811..72c1d331d6 100644
--- a/tensorflow/core/kernels/mfcc_test.cc
+++ b/tensorflow/core/kernels/mfcc_test.cc
@@ -36,11 +36,10 @@ TEST(MfccTest, AgreesWithPythonGoldenValues) {
   std::vector<double> output;
   mfcc.Compute(input, &output);
 
-  std::vector<double> expected = {29.13970072, -6.41568601, -0.61903012,
-                             -0.96778652, -0.26819878, -0.40907028,
-                             -0.15614748, -0.23203119, -0.10481487,
-                             -0.1543029,  -0.0769791,  -0.10806114,
-                             -0.06047613};
+  std::vector<double> expected = {
+      29.13970072, -6.41568601, -0.61903012, -0.96778652, -0.26819878,
+      -0.40907028, -0.15614748, -0.23203119, -0.10481487, -0.1543029,
+      -0.0769791,  -0.10806114, -0.06047613};
 
   ASSERT_EQ(expected.size(), output.size());
   for (int i = 0; i < output.size(); ++i) {
diff --git a/tensorflow/core/kernels/mirror_pad_op.cc b/tensorflow/core/kernels/mirror_pad_op.cc
index fbdeaf43eb..26e1082989 100644
--- a/tensorflow/core/kernels/mirror_pad_op.cc
+++ b/tensorflow/core/kernels/mirror_pad_op.cc
@@ -87,8 +87,8 @@ class MirrorPadOp : public OpKernel {
       const Tpaddings before = paddings(d, 0);  // Pad before existing elements.
       const Tpaddings after = paddings(d, 1);   // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("paddings must be non-negative: ",
-                                          before, " ", after));
+                  errors::InvalidArgument(
+                      "paddings must be non-negative: ", before, " ", after));
       if (offset_ == 0) {  // SYMMETRIC mode.
         OP_REQUIRES(context,
                     before <= in0.dim_size(d) && after <= in0.dim_size(d),
@@ -296,8 +296,8 @@ class MirrorPadGradOp : public OpKernel {
       const Tpaddings before = paddings(d, 0);  // Pad before existing elements.
       const Tpaddings after = paddings(d, 1);   // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("Paddings must be non-negative: ",
-                                          before, ", ", after));
+                  errors::InvalidArgument(
+                      "Paddings must be non-negative: ", before, ", ", after));
 
       const int64 out_size = in0.dim_size(d) - (before + after);
       if (offset_ == 0) {  // SYMMETRIC mode.
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index d751a70fc8..a7c569ee05 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -26,14 +26,14 @@
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
-using mkldnn::memory;
+using mkldnn::algorithm;
+using mkldnn::engine;
 using mkldnn::error;
-using mkldnn::pooling_forward;
-using mkldnn::pooling_backward;
+using mkldnn::memory;
 using mkldnn::padding_kind;
-using mkldnn::engine;
+using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
 using mkldnn::prop_kind;
-using mkldnn::algorithm;
 #endif
 
 namespace tensorflow {
@@ -358,10 +358,11 @@ class MklAvgPoolingGradOp : public OpKernel {
       if (!outbackprop_in_mkl_format) {
         // For avgpooling, tensor_in_shape should have 1 dimension, and 4
         // elements.
-        OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
-                                 tensor_in_shape.NumElements() == 4,
-                    errors::InvalidArgument("original input shape must be "
-                                            "1-dimensional and 4 elements"));
+        OP_REQUIRES(
+            context,
+            tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
+            errors::InvalidArgument("original input shape must be "
+                                    "1-dimensional and 4 elements"));
 
         // For avgpooling, out_backprop should have 4 dimensions.
         OP_REQUIRES(context, out_backprop.dims() == 4,
@@ -428,14 +429,13 @@ class MklAvgPoolingGradOp : public OpKernel {
   TensorFormat data_format_;
 };  // MklAvgPoolingGradOp
 
-
 #else  // INTEL_MKL_DNN is defined
 
 template <typename Device, typename T>
 class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
  public:
   explicit MklAvgPoolingOp(OpKernelConstruction* context)
-  : MklPoolingForwardOpBase<T>(context) {
+      : MklPoolingForwardOpBase<T>(context) {
     // Workspace is an MKLDNN construct that is only used in Max Pooling.
     // So set workspace_enabled_ to false.
     this->workspace_enabled_ = false;
@@ -444,8 +444,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
   void Compute(OpKernelContext* context) override {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
-      const Tensor& input_tensor = MklGetInput(context,
-              this->kInputTensorIndexInput);
+      const Tensor& input_tensor =
+          MklGetInput(context, this->kInputTensorIndexInput);
       MklDnnShape dnn_shape_input;
       GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
       this->SanityCheckInput(context, input_tensor, dnn_shape_input);
@@ -457,9 +457,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
       // initialize variables for the pooling op
       MklPoolParameters pool_params;
       // Get the input tensor and initialize the pooling parameters
-      this->ConfigureInput(context, dnn_shape_input,
-                          input_tensor, &pool_params,
-                          &dnn_data_input);
+      this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params,
+                           &dnn_data_input);
       OP_REQUIRES_OK(context, context->status());
 
       // Declare output tensor
@@ -470,56 +469,52 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
       // If input is in Mkl layout, then just get the memory format from it
       // directly, instead of using input data_format to AvgPool.
       if (dnn_shape_input.IsMklTensor()) {
-        dnn_data_output.SetUsrMem(output_dims_mkl_order,
-                static_cast<memory::format>(dnn_data_input.GetUsrMemDesc()
-                    .data.format));
+        dnn_data_output.SetUsrMem(
+            output_dims_mkl_order,
+            static_cast<memory::format>(
+                dnn_data_input.GetUsrMemDesc().data.format));
 
       } else {
-          dnn_data_output.SetUsrMem(output_dims_mkl_order,
-              this->data_format_mkldnn_);
+        dnn_data_output.SetUsrMem(output_dims_mkl_order,
+                                  this->data_format_mkldnn_);
       }
 
-        // describe the memory layout
+      // describe the memory layout
       dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
 
       // 3. create a pooling primitive descriptor
-      auto pool_desc = pooling_forward::desc(prop_kind::forward,
-              algorithm::pooling_avg_exclude_padding,
-              dnn_data_input.GetUsrMemDesc(),
-              dnn_data_output.GetUsrMemDesc(),
-              memory::dims({  pool_params.row_stride,
-                              pool_params.col_stride}),
-              memory::dims({  pool_params.window_rows,
-                              pool_params.window_cols}),
-              memory::dims({  static_cast<int>(pool_params.pad_top),
-                              static_cast<int>(pool_params.pad_left)}),
-              memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                              static_cast<int>(pool_params.pad_right)}),
-              TFPaddingToMklDnnPadding(this->padding_));
-      auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc,
-                                                 cpu_engine);
+      auto pool_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_avg_exclude_padding,
+          dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(),
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_prim_desc =
+          pooling_forward::primitive_desc(pool_desc, cpu_engine);
 
       this->AllocateOutputTensor(context, pool_prim_desc, output_dims_mkl_order,
-                            this->data_format_mkldnn_, &output_tensor);
+                                 this->data_format_mkldnn_, &output_tensor);
       CHECK_NOTNULL(output_tensor);
 
       OP_REQUIRES_OK(context, context->status());
       dnn_data_output.SetUsrMemDataHandle(output_tensor);
 
-      this->PrepareAndExecuteNet(pool_prim_desc,
-                                &dnn_data_input,
-                                &dnn_data_output);
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-                        ", message: " + string(e.message) +
-                        ", in file " + string(__FILE__) + ":" +
-                        std::to_string(__LINE__);
-        OP_REQUIRES_OK(context,
-                        errors::Aborted("Operation received an exception:",
-                                         error_msg));
+      this->PrepareAndExecuteNet(pool_prim_desc, &dnn_data_input,
+                                 &dnn_data_output);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }  // Compute
-};  // MklAvgPoolingOp
+};   // MklAvgPoolingOp
 
 //-----------------------------------------------------------------------------
 
@@ -527,27 +522,23 @@ template <class Device, class T>
 class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
  public:
   explicit MklAvgPoolingGradOp(OpKernelConstruction* context)
-      : MklPoolingBackwardOpBase<T>(context) {
-  }
+      : MklPoolingBackwardOpBase<T>(context) {}
 
   void Compute(OpKernelContext* context) override {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
       MklDnnShape original_input_mkl_shape, input_gradient_mkl_shape;
-      const Tensor& tensor_in_shape = MklGetInput(context,
-          kInputTensorIndexInputShape);
-      const Tensor& input_gradient_tensor = MklGetInput(context,
-          kInputTensorIndexInputGradient);
+      const Tensor& tensor_in_shape =
+          MklGetInput(context, kInputTensorIndexInputShape);
+      const Tensor& input_gradient_tensor =
+          MklGetInput(context, kInputTensorIndexInputGradient);
       GetMklShape(context, kInputTensorIndexInputShape,
-            &original_input_mkl_shape);
+                  &original_input_mkl_shape);
       GetMklShape(context, kInputTensorIndexInputGradient,
-            &input_gradient_mkl_shape);
-
+                  &input_gradient_mkl_shape);
 
-      SanityCheckInputs(context, tensor_in_shape,
-                        input_gradient_tensor,
-                        original_input_mkl_shape,
-                        input_gradient_mkl_shape);
+      SanityCheckInputs(context, tensor_in_shape, input_gradient_tensor,
+                        original_input_mkl_shape, input_gradient_mkl_shape);
       if (!context->status().ok()) return;
 
       // Used to allocate output_diff_src/diff_src
@@ -562,90 +553,70 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
       MklPoolParameters pool_params;
       memory::dims output_dims_mkl_order, original_input_dims_nchw;
       // Configure the original input memory descriptor
-      memory::desc original_input_md = ConfigureOriginalInput(context,
-                                      tensor_in_shape,
-                                      original_input_mkl_shape,
-                                      &original_input_dims_nchw,
-                                      &pool_params,
-                                      &original_input_shape);
+      memory::desc original_input_md = ConfigureOriginalInput(
+          context, tensor_in_shape, original_input_mkl_shape,
+          &original_input_dims_nchw, &pool_params, &original_input_shape);
 
       // configure the original output memory descriptor
       // by definition, the shape of the original output is the same
       // as the shape of the gradient diff_dst
       memory::desc original_output_md = this->ConfigureOriginalOutput(
-                pool_params, input_gradient_mkl_shape, output_dims_mkl_order);
+          pool_params, input_gradient_mkl_shape, output_dims_mkl_order);
 
       memory::desc target_diff_dst_md = this->ConfigureInputGradient(
-                                    input_gradient_mkl_shape,
-                                    input_gradient_tensor,
-                                    &input_gradient_diff_dst,
-                                    original_output_md);
+          input_gradient_mkl_shape, input_gradient_tensor,
+          &input_gradient_diff_dst, original_output_md);
       // The shape of the output diff src needs to be the same shape as the
       // original input. But we will set its format to be same as the format of
       // input gradient. We won't use format of original input since it will
       // always be in Tensorflow layout (given that AvgPoolGrad gets shape of
       // the input rather than actual input).
-      output_diff_src.SetUsrMem(original_input_dims_nchw,
-                                static_cast<memory::format>(
-                                  target_diff_dst_md.data.format));
+      output_diff_src.SetUsrMem(
+          original_input_dims_nchw,
+          static_cast<memory::format>(target_diff_dst_md.data.format));
 
       // Create the forward pooling primitive descriptor so we can reference it
       // in the backward pooling primitive descriptor
-      auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
-              algorithm::pooling_avg_exclude_padding,
-              original_input_md,
-              original_output_md,
-              memory::dims({  pool_params.row_stride,
-                              pool_params.col_stride}),
-              memory::dims({  pool_params.window_rows,
-                              pool_params.window_cols}),
-              memory::dims({  static_cast<int>(pool_params.pad_top),
-                              static_cast<int>(pool_params.pad_left)}),
-              memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                              static_cast<int>(pool_params.pad_right)}),
-              TFPaddingToMklDnnPadding(this->padding_));
-      auto pool_fwd_prim_desc
-              = pooling_forward::primitive_desc(pool_fwd_desc,
-                                                  cpu_engine);
+      auto pool_fwd_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_avg_exclude_padding,
+          original_input_md, original_output_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_prim_desc =
+          pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine);
 
       auto pool_bkwd_desc = pooling_backward::desc(
-              algorithm::pooling_avg_exclude_padding,
-              output_diff_src.GetUsrMemDesc(),
-              target_diff_dst_md,
-              memory::dims({  pool_params.row_stride,
-                              pool_params.col_stride}),
-              memory::dims({  pool_params.window_rows,
-                              pool_params.window_cols}),
-              memory::dims({  static_cast<int>(pool_params.pad_top),
-                              static_cast<int>(pool_params.pad_left)}),
-              memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                              static_cast<int>(pool_params.pad_right)}),
-              TFPaddingToMklDnnPadding(this->padding_));
-      auto pool_bkwd_prim_desc
-                = pooling_backward::primitive_desc(pool_bkwd_desc,
-                                              cpu_engine,
-                                              pool_fwd_prim_desc);
-      this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
-                      original_input_dims_nchw,
-                      this->data_format_mkldnn_,
-                      &output_tensor_diff_src);
+          algorithm::pooling_avg_exclude_padding,
+          output_diff_src.GetUsrMemDesc(), target_diff_dst_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_bkwd_prim_desc = pooling_backward::primitive_desc(
+          pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc);
+      this->AllocateOutputTensor(
+          context, pool_bkwd_prim_desc, original_input_dims_nchw,
+          this->data_format_mkldnn_, &output_tensor_diff_src);
 
       output_diff_src.SetUsrMemDataHandle(output_tensor_diff_src);
 
-      this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
-                          &input_gradient_diff_dst,
-                          &output_diff_src,
-                          memory::primitive_desc(
-                              target_diff_dst_md,
-                              cpu_engine));
-    } catch (mkldnn::error &e) {
+      this->PrepareAndExecuteNet(
+          pool_bkwd_prim_desc, &input_gradient_diff_dst, &output_diff_src,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine));
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                      ", message: " + string(e.message) +
-                      ", in file " + string(__FILE__) + ":" +
-                      std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                      errors::Aborted("Compute received an exception:",
-                                      error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
     }
   }  // Compute
 
@@ -655,12 +626,11 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
   const int kInputTensorIndexInputShape = 0;
   const int kInputTensorIndexInputGradient = 1;
 
-  memory::desc ConfigureOriginalInput(OpKernelContext* context,
-        const Tensor& tensor_original_input_shape,
-        const MklDnnShape& original_input_mkl_shape,
-        memory::dims* original_input_dims_mkl_order,
-        MklPoolParameters* pool_params,
-        TensorShape* input_tensor_shape) {
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input_shape,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_mkl_order,
+      MklPoolParameters* pool_params, TensorShape* input_tensor_shape) {
     CHECK_NOTNULL(original_input_dims_mkl_order);
     CHECK_NOTNULL(pool_params);
     CHECK_NOTNULL(input_tensor_shape);
@@ -672,46 +642,42 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
     }
 
     return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
-                                              context,
-                                              tensor_original_input_shape,
-                                              original_input_mkl_shape,
-                                              original_input_dims_mkl_order,
-                                              pool_params,
-                                              *input_tensor_shape);
-}
+        context, tensor_original_input_shape, original_input_mkl_shape,
+        original_input_dims_mkl_order, pool_params, *input_tensor_shape);
+  }
 
   void SanityCheckInputs(OpKernelContext* context,
-                        const Tensor& tensor_in_shape,
-                        const Tensor& input_gradient_tensor,
-                        const MklDnnShape& original_input_mkl_shape,
-                        const MklDnnShape& input_gradient_mkl_shape) {
+                         const Tensor& tensor_in_shape,
+                         const Tensor& input_gradient_tensor,
+                         const MklDnnShape& original_input_mkl_shape,
+                         const MklDnnShape& input_gradient_mkl_shape) {
     if (!original_input_mkl_shape.IsMklTensor()) {
-      OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
-          tensor_in_shape.NumElements() == 4,
+      OP_REQUIRES(
+          context,
+          tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
           errors::InvalidArgument("original input shape must be "
-                "1-dimensional and 4 elements"));
+                                  "1-dimensional and 4 elements"));
     } else {
-      OP_REQUIRES(context, original_input_mkl_shape.GetDimension() == 1 &&
-          original_input_mkl_shape.DimSize(0) == 4,
-          errors::InvalidArgument("original input shape must be "
-                "1-dimensional and 4 elements"));
+      OP_REQUIRES(context,
+                  original_input_mkl_shape.GetDimension() == 1 &&
+                      original_input_mkl_shape.DimSize(0) == 4,
+                  errors::InvalidArgument("original input shape must be "
+                                          "1-dimensional and 4 elements"));
     }
 
     if (!input_gradient_mkl_shape.IsMklTensor()) {
       // For avgpooling, input_gradient_diff_dst should have 4 dimensions.
       OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
-          errors::InvalidArgument("Gradient shape must be "
-                              "4-dimensional"));
+                  errors::InvalidArgument("Gradient shape must be "
+                                          "4-dimensional"));
     } else {
       OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4,
-          errors::InvalidArgument("Gradient shape must be "
-                              "4-dimensional"));
+                  errors::InvalidArgument("Gradient shape must be "
+                                          "4-dimensional"));
     }
   }
 };  // MklAvgPoolingGradOp
 
-
-
 #endif  // INTEL_MKL_DNN
 
 REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
@@ -728,4 +694,3 @@ REGISTER_KERNEL_BUILDER(Name("_MklAvgPoolGrad")
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
-
diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
index 9fee94f946..d9713075be 100644
--- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
@@ -40,7 +40,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #define MKL_Complex8 tensorflow::complex64
 #define MKL_Complex16 tensorflow::complex128
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index d109bb6bcf..7da63604d2 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -33,8 +33,8 @@ limitations under the License.
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
 using mkldnn::concat;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
@@ -45,7 +45,6 @@ typedef std::vector<TensorShape> TensorShapeList;
 
 enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
 
-
 // TODO(intelft) Check if we can reuse existing EigenConcatOp using Mutable
 // reference inputs.
 // --------------------------------------------------------------------------
@@ -152,8 +151,8 @@ class EigenConcatBaseOp : public OpKernel {
 
 #else  // MKL_DNN
 
-void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
-                        const TensorShapeList& input_shapes) {
+  void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
+               const TensorShapeList& input_shapes) {
     const Tensor* concat_dim_tensor;
     const char* axis_attribute_name =
         AxisArgName == NAME_IS_AXIS
@@ -197,7 +196,8 @@ void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
       const auto in = values[i];
       const bool in_is_scalar = IsLegacyScalar(input_shapes[i]);
       OP_REQUIRES(
-          c, (input_shapes[i].dims() == input_dims) ||
+          c,
+          (input_shapes[i].dims() == input_dims) ||
               (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
@@ -208,8 +208,8 @@ void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
         inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
             in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
       }
-      output_concat_dim += input_shapes[i].dims() > 0 ?
-                           input_shapes[i].dim_size(axis) : 1;
+      output_concat_dim +=
+          input_shapes[i].dims() > 0 ? input_shapes[i].dim_size(axis) : 1;
     }
 
     TensorShape output_shape(input_shape);
@@ -418,7 +418,6 @@ class MklConcatOp : public OpKernel {
     OP_REQUIRES_OK(context, context->status());
   }
 
-
  private:
   typedef struct {
     TensorFormat data_format;
@@ -590,39 +589,45 @@ class MklConcatOp : public OpKernel {
       GetMklShapeList(context, "values", &input_shapes);
 
       const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM)
-                    ? MklGetInput(context, 0) : MklGetInput(context, N);
+                                            ? MklGetInput(context, 0)
+                                            : MklGetInput(context, N);
       // Sanity checks
-      OP_REQUIRES(context, IsLegacyScalar(concat_dim_tensor.shape()),
-        errors::InvalidArgument(
-            "Concat dim tensor should be a scalar integer, but got shape ",
-            concat_dim_tensor.shape().DebugString()));
-      int32 concat_dim = internal::SubtleMustCopy(
-                           concat_dim_tensor.scalar<int32>()());
+      OP_REQUIRES(
+          context, IsLegacyScalar(concat_dim_tensor.shape()),
+          errors::InvalidArgument(
+              "Concat dim tensor should be a scalar integer, but got shape ",
+              concat_dim_tensor.shape().DebugString()));
+      int32 concat_dim =
+          internal::SubtleMustCopy(concat_dim_tensor.scalar<int32>()());
 
       // check that ranks of all tensors match
       // and that their shapes match except for concat_dim.
       int i = 0;
       bool invoke_eigen = false;
       bool are_all_mkl_inputs = true, are_all_tf_inputs = true;
-      const TensorShape expected_shape = input_shapes[0].IsMklTensor() ?
-                                         input_shapes[0].GetTfShape() :
-                                         input_tensors[0].shape();
+      const TensorShape expected_shape = input_shapes[0].IsMklTensor()
+                                             ? input_shapes[0].GetTfShape()
+                                             : input_tensors[0].shape();
       size_t expected_dims = expected_shape.dims();
 
       if (concat_dim < 0) concat_dim = expected_dims + concat_dim;
 
       for (auto& s : input_shapes) {
-        if (s == expected_shape) {++i; continue;}
+        if (s == expected_shape) {
+          ++i;
+          continue;
+        }
 
-        TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
-                      input_tensors[i].shape();
+        TensorShape s_shape =
+            s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
         size_t s_dims = s_shape.dims();
 
-        OP_REQUIRES(context, s_dims == expected_dims,
-                  errors::InvalidArgument(
-                      "_MklConcatOp : Ranks of all input tensors should match:"
-                      " input dimensions = ",
-                      s_dims, " vs. expected rank = ", expected_dims));
+        OP_REQUIRES(
+            context, s_dims == expected_dims,
+            errors::InvalidArgument(
+                "_MklConcatOp : Ranks of all input tensors should match:"
+                " input dimensions = ",
+                s_dims, " vs. expected rank = ", expected_dims));
 
         for (int d = 0; d < expected_dims; ++d) {
           if (d == concat_dim) continue;
@@ -630,10 +635,11 @@ class MklConcatOp : public OpKernel {
           size_t expected_size = expected_shape.dim_size(d);
           size_t s_size = s_shape.dim_size(d);
           OP_REQUIRES(
-            context, expected_size == s_size,
-            errors::InvalidArgument("_MklConcatOp : Dimensions of inputs "
-                    "should match: shape[0][", d, "]= ", expected_size,
-                    " vs. shape[", i, "][", d, "] = ", s_size));
+              context, expected_size == s_size,
+              errors::InvalidArgument("_MklConcatOp : Dimensions of inputs "
+                                      "should match: shape[0][",
+                                      d, "]= ", expected_size, " vs. shape[", i,
+                                      "][", d, "] = ", s_size));
         }
 
         if (s.IsMklTensor())
@@ -657,8 +663,8 @@ class MklConcatOp : public OpKernel {
         TensorShapeList tf_input_shapes;
         i = 0;
         for (auto& s : input_shapes) {
-          TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
-                                input_tensors[i].shape();
+          TensorShape s_shape =
+              s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
           tf_input_shapes.push_back(s_shape);
           ++i;
         }
@@ -678,21 +684,22 @@ class MklConcatOp : public OpKernel {
       std::vector<memory::primitive_desc> srcs_pd;
       std::vector<MklDnnData<T>> srcs(N, MklDnnData<T>(&cpu_engine));
       int64 dst_concat_dim_size = 0;
-      for (int k =0; k < N; k++) {
+      for (int k = 0; k < N; k++) {
         bool is_mkl_tensor = input_shapes[k].IsMklTensor();
         memory::dims src_dims;
 
         // Same comment as dst_dims for src_dims.
-        src_dims = (is_mkl_tensor) ?
-                   TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) :
-                   TFShapeToMklDnnDims(input_tensors[k].shape());
+        src_dims = (is_mkl_tensor)
+                       ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape())
+                       : TFShapeToMklDnnDims(input_tensors[k].shape());
 
         dst_concat_dim_size += src_dims[concat_dim];
-        auto src_md = is_mkl_tensor ? input_shapes[k].GetMklLayout() :
-          // It does not matter what data format we use here (NHWC or NCHW).
-          // We just need to ensure that output of Concat uses same data format
-          // as input.
-                  memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
+        auto src_md =
+            is_mkl_tensor ? input_shapes[k].GetMklLayout() :
+                          // It does not matter what data format we use here
+                          // (NHWC or NCHW). We just need to ensure that output
+                          // of Concat uses same data format as input.
+                memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
 
         srcs[k].SetUsrMem(src_md, &input_tensors[k]);
         auto src_mpd = srcs[k].GetUsrMemPrimDesc();
@@ -707,14 +714,15 @@ class MklConcatOp : public OpKernel {
         // Since we are passing a specific format for destination,
         // we need to have dst_dims in MklDnn order (NCHW).
         auto orig_tf_format = input_shapes[0].GetTfDataFormat();
-        dst_dims_in_nchw = MklDnnDimsInNCHW(dst_dims,
-                               MklDnnDataFormatToTFDataFormat(orig_tf_format));
+        dst_dims_in_nchw = MklDnnDimsInNCHW(
+            dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format));
         // We will set the output in the same format as input to avoid layout
         // conversions.
         // Currently we are setting dst format same as input format.
         // See if we can make this choice in a better way.
-        dst_md = memory::desc(dst_dims_in_nchw, MklDnnType<T>(),
-                 (memory::format) input_shapes[0].GetMklLayout().data.format);
+        dst_md = memory::desc(
+            dst_dims_in_nchw, MklDnnType<T>(),
+            (memory::format)input_shapes[0].GetMklLayout().data.format);
       } else {
         // Again, format does not matter here. We just need to make it same as
         // input format.
@@ -722,7 +730,7 @@ class MklConcatOp : public OpKernel {
       }
 
       std::vector<primitive::at> inputs;
-      for (int k=0; k < input_tensors.size(); k++)
+      for (int k = 0; k < input_tensors.size(); k++)
         inputs.push_back(srcs[k].GetOpMem());
 
       // If all inputs are in MKL format, then meaning of concat_dim needs to
@@ -732,8 +740,7 @@ class MklConcatOp : public OpKernel {
       // But ifinput tensors are in NHWC order, then semantics need to change.
       // E.g., if we are concatinating over Channel (dimension 3 for NHWC),
       // then since MklDnn order is NCHW, concat_dim needs to be 1.
-      if (are_all_mkl_inputs)
-        concat_dim = input_shapes[0].TfDimIdx(concat_dim);
+      if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim);
 
       auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd);
 
@@ -752,24 +759,25 @@ class MklConcatOp : public OpKernel {
         dnn_shape_dst.SetMklTensor(false);
         tf_shape_dst = MklDnnDimsToTFShape(dst_dims);
       }
-      AllocateOutputSetMklShape(context, 0, &dst_tensor,
-                                tf_shape_dst, dnn_shape_dst);
+      AllocateOutputSetMklShape(context, 0, &dst_tensor, tf_shape_dst,
+                                dnn_shape_dst);
       CHECK_NOTNULL(dst_tensor);
 
-      dst_md = dnn_shape_dst.IsMklTensor() ?
-               dnn_shape_dst.GetMklLayout() : dst_md;
+      dst_md =
+          dnn_shape_dst.IsMklTensor() ? dnn_shape_dst.GetMklLayout() : dst_md;
       dst.SetUsrMem(dst_md, dst_tensor);
 
       auto concat_op = concat(concat_pd, inputs, dst.GetOpMem());
       std::vector<primitive> net;
       net.push_back(concat_op);
       stream(stream::kind::eager).submit(net).wait();
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-               ", message: " + string(e.message) + ", in file " +
-               string(__FILE__) + ":" + std::to_string(__LINE__);
-        OP_REQUIRES_OK(context, errors::Aborted(
-                "Operation received an exception:", error_msg));
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -790,11 +798,9 @@ class MklConcatOp : public OpKernel {
     dnn_shape_output.SetDimensions(4);
     Tensor* output_tensor = nullptr;
     TensorShape tf_shape_output;
-    tf_shape_output.AddDim(
-        dnn_shape_output.GetSerializeBufferSize());
-    context->allocate_output(
-        GetTensorMetaDataIndex(0, context->num_outputs()),
-        tf_shape_output, &output_tensor);
+    tf_shape_output.AddDim(dnn_shape_output.GetSerializeBufferSize());
+    context->allocate_output(GetTensorMetaDataIndex(0, context->num_outputs()),
+                             tf_shape_output, &output_tensor);
     dnn_shape_output.SerializeMklDnnShape(
         output_tensor->flat<uint8>().data(),
         output_tensor->flat<uint8>().size() * sizeof(uint8));
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index 0f1a218fe6..25c2573741 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -38,9 +38,9 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 54d4916d49..ef3f8cfec1 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -38,17 +38,17 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
-using mkldnn::prop_kind;
 using mkldnn::convolution_backward_weights;
 using mkldnn::memory;
+using mkldnn::prop_kind;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
@@ -360,8 +360,8 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
           (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
 
       const Tensor& out_backprop = MklGetInput(context, 2);
-      void* mkl_buf_out_backprop = const_cast<void*>(static_cast<const void*>(
-                                      out_backprop.flat<T>().data()));
+      void* mkl_buf_out_backprop = const_cast<void*>(
+          static_cast<const void*>(out_backprop.flat<T>().data()));
 
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
                                                 prim_conv_bwdfilter,
@@ -371,10 +371,11 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
           !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop);
       if (mkl_convert_out_backprop) {
         CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
-                      lt_out_backprop, mkl_lt_internal_out_backprop),
+                                         lt_out_backprop,
+                                         mkl_lt_internal_out_backprop),
                  E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor,
-            lt_out_backprop, &mkl_buf_convert_out_backprop);
+                       lt_out_backprop, &mkl_buf_convert_out_backprop);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
                                           mkl_buf_out_backprop,
                                           mkl_buf_convert_out_backprop),
@@ -428,18 +429,18 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-              MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+                          MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
 
 #else
 
 template <typename Device, class T, bool biasEnabled>
-class MklConv2DCustomBackpropFilterOp :
-  public MklConv2DBackpropCommonOp<Device, T> {
+class MklConv2DCustomBackpropFilterOp
+    : public MklConv2DBackpropCommonOp<Device, T> {
  public:
   explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
-      : MklConv2DBackpropCommonOp<Device, T>(context) { }
+      : MklConv2DBackpropCommonOp<Device, T>(context) {}
   ~MklConv2DCustomBackpropFilterOp() {}
 
  private:
@@ -447,7 +448,7 @@ class MklConv2DCustomBackpropFilterOp :
                          const MklDnnShape& filter_mkl_shape,
                          const MklDnnShape& obp_mkl_shape) {
     CHECK(!filter_mkl_shape.IsMklTensor())
-      << "Conv2DBackpropFilter: filter should not be in MKL Layout";
+        << "Conv2DBackpropFilter: filter should not be in MKL Layout";
   }
 
   size_t GetInputTensorIndexWithSizes() { return 1; /* filter index */ }
@@ -462,8 +463,10 @@ class MklConv2DCustomBackpropFilterOp :
                                 const Tensor& filter_tensor) {
     TensorShape filter_tf_shape;
     CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true);
-    CHECK_EQ(TensorShapeUtils::MakeShape(
-             filter_tensor.vec<int32>(), &filter_tf_shape).ok(), true);
+    CHECK_EQ(TensorShapeUtils::MakeShape(filter_tensor.vec<int32>(),
+                                         &filter_tf_shape)
+                 .ok(),
+             true);
     return filter_tf_shape;
   }
 
@@ -485,16 +488,13 @@ class MklConv2DCustomBackpropFilterOp :
     return memory::format::hwio;
   }
 
-  void CreatePrimitive(OpKernelContext* context,
-                       const engine& cpu_engine,
+  void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine,
                        const convolution_forward::primitive_desc& conv_fwd_pd,
                        MklDnnData<T>* input, MklDnnData<T>* filter,
                        MklDnnData<T>* outbackprop, MklDnnData<T>* output,
-                       Tensor** output_tensor,
-                       const memory::dims& strides,
+                       Tensor** output_tensor, const memory::dims& strides,
                        const memory::dims& padding_l,
-                       const memory::dims& padding_r,
-                       padding_kind padding,
+                       const memory::dims& padding_r, padding_kind padding,
                        const memory::dims& bwd_output_dims,
                        memory::format bwd_output_format) {
     CHECK_NOTNULL(context);
@@ -508,34 +508,35 @@ class MklConv2DCustomBackpropFilterOp :
     int depth = 0;
     if (biasEnabled) {
       // Data structure for bias_grad
-      bias_grad = new MklDnnData<T> (&cpu_engine);
+      bias_grad = new MklDnnData<T>(&cpu_engine);
       TensorShape obp_tf_shape = GetTfShape(context, 2);
-      depth = (MklConv2DBackpropCommonOp<Device, T>::GetTFDataFormat()
-                == FORMAT_NCHW) ?
-          obp_tf_shape.dim_size(1) : obp_tf_shape.dim_size(3);
+      depth = (MklConv2DBackpropCommonOp<Device, T>::GetTFDataFormat() ==
+               FORMAT_NCHW)
+                  ? obp_tf_shape.dim_size(1)
+                  : obp_tf_shape.dim_size(3);
       memory::dims bias_grad_dims = {depth};
       bias_grad->SetOpMemDesc(bias_grad_dims, memory::format::x);
     }
 
     // Create convolution backward weights primitive.
-    auto bwd_desc = (biasEnabled && (bias_grad != nullptr))?
-        convolution_backward_weights::desc(convolution_direct,
-                                input->GetOpMemDesc(), output->GetOpMemDesc(),
-                                bias_grad->GetOpMemDesc(),
-                                outbackprop->GetOpMemDesc(), strides, padding_l,
-                                padding_r, padding) :
-        convolution_backward_weights::desc(convolution_direct,
-                          input->GetOpMemDesc(), output->GetOpMemDesc(),
-                          outbackprop->GetOpMemDesc(), strides, padding_l,
-                          padding_r, padding);
-
-    auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
-                                                            cpu_engine,
-                                                            conv_fwd_pd);
+    auto bwd_desc =
+        (biasEnabled && (bias_grad != nullptr))
+            ? convolution_backward_weights::desc(
+                  convolution_direct, input->GetOpMemDesc(),
+                  output->GetOpMemDesc(), bias_grad->GetOpMemDesc(),
+                  outbackprop->GetOpMemDesc(), strides, padding_l, padding_r,
+                  padding)
+            : convolution_backward_weights::desc(
+                  convolution_direct, input->GetOpMemDesc(),
+                  output->GetOpMemDesc(), outbackprop->GetOpMemDesc(), strides,
+                  padding_l, padding_r, padding);
+
+    auto bwd_pd = convolution_backward_weights::primitive_desc(
+        bwd_desc, cpu_engine, conv_fwd_pd);
 
     // Allocate output tensor.
-    AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
-                         bwd_output_format, output_tensor);
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format,
+                         output_tensor);
 
     CHECK_NOTNULL(*output_tensor);
     // Set buffer handle using allocated output tensor.
@@ -548,8 +549,8 @@ class MklConv2DCustomBackpropFilterOp :
       AllocateBiasGradTensor(context, bias_grad_shape, &bias_grad_tensor);
       memory::dims bias_grad_dims = {depth};
       // Since Bias is 1D, we use format::x from MKLDNN to represent it.
-      auto bias_grad_md = memory::desc({bias_grad_dims}, MklDnnType<T>(),
-                                       memory::format::x);
+      auto bias_grad_md =
+          memory::desc({bias_grad_dims}, MklDnnType<T>(), memory::format::x);
       bias_grad->SetUsrMem(bias_grad_md, bias_grad_tensor);
       bias_grad->SetUsrMemDataHandle(bias_grad_tensor);
     }
@@ -562,28 +563,29 @@ class MklConv2DCustomBackpropFilterOp :
   }
 
   // Allocate output tensor.
-  void AllocateOutputTensor(OpKernelContext* context,
-                  const convolution_backward_weights::primitive_desc& conv_pd,
-                  const memory::dims& output_dims_mkl_order,
-                  memory::format output_tf_format, Tensor** output_tensor) {
-      CHECK_NOTNULL(output_tensor);
-
-      // For BackpropFilter, we convert the output tensor back in Tensorflow
-      // layout. Because typically, BackpropFilter is the last operator in the
-      // graph that emit filter gradient that is provided to ApplyGradient
-      // method to update the filter. But it may be possible to eliminate this
-      // by forwarding filter in MKL layout if we support ApplyGradient method
-      // for MKL layout propagation.
-      MklDnnShape output_mkl_shape;
-      output_mkl_shape.SetMklTensor(false);
-      // output_dims_mkl_order is in OIHW format.
-      // Allocate shape of TF tensor in HWIO format.
-      TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H],
-                                   output_dims_mkl_order[MklDnnDims::Dim_W],
-                                   output_dims_mkl_order[MklDnnDims::Dim_I],
-                                   output_dims_mkl_order[MklDnnDims::Dim_O]});
-      AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
-                                output_mkl_shape);
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const convolution_backward_weights::primitive_desc& conv_pd,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+
+    // For BackpropFilter, we convert the output tensor back in Tensorflow
+    // layout. Because typically, BackpropFilter is the last operator in the
+    // graph that emit filter gradient that is provided to ApplyGradient
+    // method to update the filter. But it may be possible to eliminate this
+    // by forwarding filter in MKL layout if we support ApplyGradient method
+    // for MKL layout propagation.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(false);
+    // output_dims_mkl_order is in OIHW format.
+    // Allocate shape of TF tensor in HWIO format.
+    TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H],
+                                 output_dims_mkl_order[MklDnnDims::Dim_W],
+                                 output_dims_mkl_order[MklDnnDims::Dim_I],
+                                 output_dims_mkl_order[MklDnnDims::Dim_O]});
+    AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                              output_mkl_shape);
   }
 
   // Allocate tensor for bias grad
@@ -600,9 +602,9 @@ class MklConv2DCustomBackpropFilterOp :
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
-                  const convolution_backward_weights::primitive_desc& conv_pd,
-                  MklDnnData<T>* input, MklDnnData<T>* obp,
-                  MklDnnData<T>* output, MklDnnData<T>* bias_grad = nullptr) {
+      const convolution_backward_weights::primitive_desc& conv_pd,
+      MklDnnData<T>* input, MklDnnData<T>* obp, MklDnnData<T>* output,
+      MklDnnData<T>* bias_grad = nullptr) {
     // Create reorders between user layout and MKL layout if it is needed and
     // add it to the net before convolution.
     std::vector<primitive> net;
@@ -612,15 +614,15 @@ class MklConv2DCustomBackpropFilterOp :
     // For BackpropFilter, we convert the output tensor back in Tensorflow
     // layout.
     bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
-                                      conv_pd.diff_weights_primitive_desc());
+        conv_pd.diff_weights_primitive_desc());
 
     if (biasEnabled && (bias_grad != nullptr)) {
-      net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
-                                      obp->GetOpMem(), output->GetOpMem(),
-                                      bias_grad->GetOpMem()));
+      net.push_back(convolution_backward_weights(
+          conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem(),
+          bias_grad->GetOpMem()));
     } else {
-      net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
-                                      obp->GetOpMem(), output->GetOpMem()));
+      net.push_back(convolution_backward_weights(
+          conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem()));
     }
 
     if (output_reorder_required) {
@@ -631,22 +633,24 @@ class MklConv2DCustomBackpropFilterOp :
   }
 };
 
-#define REGISTER_MKL_FILTER_KERNELS(T)                              \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-              MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);\
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias")  \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-              MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>); \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias")  \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-              MklDummyOp<CPUDevice, T>);
+#define REGISTER_MKL_FILTER_KERNELS(T)                                   \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilter")                                   \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);             \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilterWithBias")                           \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>);              \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<T>("T")                    \
+                              .Label(mkl_op_registry::kMklOpLabel),      \
+                          MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index ef6db58d31..a6745489f4 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 #include <algorithm>
 #include <vector>
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -41,15 +43,13 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
-using mkldnn::prop_kind;
 using mkldnn::convolution_backward_data;
+using mkldnn::prop_kind;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
@@ -359,16 +359,15 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
 #else
 
 template <typename Device, class T>
-class MklConv2DCustomBackpropInputOp :
-  public MklConv2DBackpropCommonOp<Device, T> {
+class MklConv2DCustomBackpropInputOp
+    : public MklConv2DBackpropCommonOp<Device, T> {
  public:
   explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
-      : MklConv2DBackpropCommonOp<Device, T>(context) { }
+      : MklConv2DBackpropCommonOp<Device, T>(context) {}
   ~MklConv2DCustomBackpropInputOp() {}
 
  private:
-  const int kInputIndex_Filter = 1,
-            kInputIndex_InputSizes = 0,
+  const int kInputIndex_Filter = 1, kInputIndex_InputSizes = 0,
             kInputIndex_OutBackProp = 2;
   void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
                          const MklDnnShape& filter_mkl_shape,
@@ -377,7 +376,7 @@ class MklConv2DCustomBackpropInputOp :
     // of the Tensor and never an actual tensor. So it will never be in MKL
     // layout.
     CHECK(!input_mkl_shape.IsMklTensor())
-      << "Conv2DBackpropInput: input should not be in MKL Layout";
+        << "Conv2DBackpropInput: input should not be in MKL Layout";
   }
 
   size_t GetInputTensorIndexWithSizes() { return kInputIndex_InputSizes; }
@@ -386,8 +385,10 @@ class MklConv2DCustomBackpropInputOp :
                                const Tensor& input_tensor) {
     TensorShape input_tf_shape;
     CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true);
-    CHECK_EQ(TensorShapeUtils::MakeShape(input_tensor.vec<int32>(),
-                                         &input_tf_shape).ok(), true);
+    CHECK_EQ(
+        TensorShapeUtils::MakeShape(input_tensor.vec<int32>(), &input_tf_shape)
+            .ok(),
+        true);
     return input_tf_shape;
   }
 
@@ -414,16 +415,13 @@ class MklConv2DCustomBackpropInputOp :
     return data_format;
   }
 
-  void CreatePrimitive(OpKernelContext* context,
-                       const engine& cpu_engine,
+  void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine,
                        const convolution_forward::primitive_desc& conv_fwd_pd,
                        MklDnnData<T>* input, MklDnnData<T>* filter,
                        MklDnnData<T>* outbackprop, MklDnnData<T>* output,
-                       Tensor** output_tensor,
-                       const memory::dims& strides,
+                       Tensor** output_tensor, const memory::dims& strides,
                        const memory::dims& padding_l,
-                       const memory::dims& padding_r,
-                       padding_kind padding,
+                       const memory::dims& padding_r, padding_kind padding,
                        const memory::dims& bwd_output_dims,
                        memory::format bwd_output_format) {
     CHECK_NOTNULL(context);
@@ -434,19 +432,16 @@ class MklConv2DCustomBackpropInputOp :
     CHECK_NOTNULL(output_tensor);
 
     // Create convolution backward data primitive.
-    auto bwd_desc = convolution_backward_data::desc(convolution_direct,
-                      output->GetOpMemDesc(), filter->GetOpMemDesc(),
-                      outbackprop->GetOpMemDesc(), strides, padding_l,
-                      padding_r, padding);
-
-    auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc,
-                                                          cpu_engine,
-                                                          conv_fwd_pd);
+    auto bwd_desc = convolution_backward_data::desc(
+        convolution_direct, output->GetOpMemDesc(), filter->GetOpMemDesc(),
+        outbackprop->GetOpMemDesc(), strides, padding_l, padding_r, padding);
 
+    auto bwd_pd = convolution_backward_data::primitive_desc(
+        bwd_desc, cpu_engine, conv_fwd_pd);
 
     // Allocate output tensor in TensorFlow and MKL layout.
-    AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
-                         bwd_output_format, output_tensor);
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format,
+                         output_tensor);
     CHECK_NOTNULL(*output_tensor);
     // Set buffer handle using allocated output tensor.
     output->SetUsrMemDataHandle(*output_tensor);
@@ -455,44 +450,44 @@ class MklConv2DCustomBackpropInputOp :
   }
 
   // Allocate output tensor.
-  void AllocateOutputTensor(OpKernelContext* context,
-                  const convolution_backward_data::primitive_desc& conv_pd,
-                  const memory::dims& output_dims_mkl_order,
-                  memory::format output_tf_format, Tensor** output_tensor) {
-      CHECK_NOTNULL(output_tensor);
-
-      // Output primitive descriptor for backward data is diff_src.
-      auto dst_pd = conv_pd.diff_src_primitive_desc();
-
-      // Allocate shape of Mkl tensor.
-      MklDnnShape output_mkl_shape;
-      output_mkl_shape.SetMklTensor(true);
-      output_mkl_shape.SetMklLayout(&dst_pd);
-      output_mkl_shape.SetElemType(MklDnnType<T>());
-      output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                                   output_dims_mkl_order, output_tf_format);
-
-      // Allocate shape of TF tensor.
-      TensorShape output_tf_shape;
-      output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T));
-
-      AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
-                                output_mkl_shape);
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const convolution_backward_data::primitive_desc& conv_pd,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+
+    // Output primitive descriptor for backward data is diff_src.
+    auto dst_pd = conv_pd.diff_src_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+
+    // Allocate shape of TF tensor.
+    TensorShape output_tf_shape;
+    output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T));
+
+    AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                              output_mkl_shape);
   }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
-                  const convolution_backward_data::primitive_desc& conv_pd,
-                  MklDnnData<T>* filter, MklDnnData<T>* obp,
-                  MklDnnData<T>* output) {
+      const convolution_backward_data::primitive_desc& conv_pd,
+      MklDnnData<T>* filter, MklDnnData<T>* obp, MklDnnData<T>* output) {
     // Create reorders between user layout and MKL layout if it is needed and
     // add it to the net before convolution.
     std::vector<primitive> net;
     filter->CheckReorderToOpMem(conv_pd.weights_primitive_desc(), &net);
     obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net);
 
-    net.push_back(convolution_backward_data(conv_pd, obp->GetOpMem(),
-                                    filter->GetOpMem(), output->GetOpMem()));
+    net.push_back(convolution_backward_data(
+        conv_pd, obp->GetOpMem(), filter->GetOpMem(), output->GetOpMem()));
 
     stream(stream::kind::eager).submit(net).wait();
   }
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 0e77b45993..e44fba754b 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <string.h>
 #include <map>
-#include <vector>
 #include <string>
+#include <vector>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -41,15 +41,14 @@ limitations under the License.
 
 #include "tensorflow/core/util/mkl_util.h"
 
-
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
 using mkldnn::prop_kind;
+using mkldnn::stream;
 
-using mkldnn::convolution_forward;
 using mkldnn::convolution_direct;
+using mkldnn::convolution_forward;
 #else
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -116,18 +115,19 @@ class MklConv2DOp : public OpKernel {
                                         filter.shape().DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     const int64 input_depth =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C')
                             : GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, input_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                input_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, input_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", input_depth,
+                    " vs ", filter.dim_size(2)));
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
 
@@ -136,9 +136,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_rows_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H')
                             : GetTensorDim(input, data_format_, 'H');
-    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int input_rows = static_cast<int>(input_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
 
@@ -147,9 +148,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_cols_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'W')
                             : GetTensorDim(input, data_format_, 'W');
-    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int input_cols = static_cast<int>(input_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
 
@@ -157,9 +159,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_batch_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N')
                             : GetTensorDim(input, data_format_, 'N');
-    OP_REQUIRES(context, FastBoundsCheck(input_batch_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("batch is too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_batch_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("batch is too large"));
     const int batch = static_cast<int>(input_batch_raw);
 
     // For now we take the stride from the second and third dimensions only (we
@@ -313,8 +316,7 @@ class MklConv2DOp : public OpKernel {
     // Temp tensor used to allocate tmp buffers
     Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
         mkl_tmp_bias_buf_tensor;
-    mkl_context.MklPrepareConvolutionInputs(context,
-                                            &mkl_tmp_input_buf_tensor,
+    mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
                                             &mkl_tmp_filter_buf_tensor,
                                             &mkl_tmp_bias_buf_tensor);
 
@@ -398,8 +400,9 @@ class MklConv2DOp : public OpKernel {
       mkl_convert_input =
           !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
       if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
-                 lt_input, mkl_lt_internal_input), E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
+                                         mkl_lt_internal_input),
+                 E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
                        &mkl_buf_convert_input);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
@@ -517,8 +520,8 @@ class MklConv2DOp : public OpKernel {
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
       OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false,
-            errors::InvalidArgument("Filter should not be in "
-            "Mkl Layout"));
+                  errors::InvalidArgument("Filter should not be in "
+                                          "Mkl Layout"));
 
       MklDnnData<T> src(&cpu_engine);
       MklDnnData<T> filter(&cpu_engine);
@@ -531,11 +534,10 @@ class MklConv2DOp : public OpKernel {
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
       auto src_tf_shape = GetTfShape(context, kInputIndex_Src);
       auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter);
-      conv_utl.GetConvFwdSizesInMklOrder(src_tf_shape, filter_tf_shape,
-                                         &src_dims, &filter_dims, &strides,
-                                         &output_dims_tf_order,
-                                         &output_dims_mkl_order, &padding_l,
-                                         &padding_r);
+      conv_utl.GetConvFwdSizesInMklOrder(
+          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
+          &output_dims_tf_order, &output_dims_mkl_order, &padding_l,
+          &padding_r);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
@@ -543,21 +545,20 @@ class MklConv2DOp : public OpKernel {
 
       // Corner cases: output with 0 elements and 0 batch size.
       Tensor* output_tensor = nullptr;
-      if (output_tf_shape.num_elements() == 0 ||
-          output_dims_tf_order[0] == 0) {
+      if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) {
         // TODO(jbobba): Verify correctness here
         //               Need semantics for Null MKL tensor
         MklDnnShape output_mkl_shape;
         output_mkl_shape.SetMklTensor(false);
         AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor,
-                                    src_tf_shape, output_mkl_shape);
+                                  src_tf_shape, output_mkl_shape);
 
         // MklConv2D also outputs converted filter as 2nd output of Conv2D.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
         AllocateOutputSetMklShape(context, kOutputIndex_Filter,
-                                  &output_filter_tensor,
-                                  filter_tf_shape, filter_mkl_shape);
+                                  &output_filter_tensor, filter_tf_shape,
+                                  filter_mkl_shape);
         return;
       }
 
@@ -570,14 +571,15 @@ class MklConv2DOp : public OpKernel {
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
       // layout (NHWC or NCHW depending on data format).
       auto src_md = src_mkl_shape.IsMklTensor()
-                    ? src_mkl_shape.GetMklLayout()
-                    : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
+                        ? src_mkl_shape.GetMklLayout()
+                        : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
       src.SetUsrMem(src_md, &src_tensor);
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
       auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
-                    ? filter_mkl_shape.GetMklLayout()
-          : memory::desc(filter_dims, MklDnnType<T>(), memory::format::hwio);
+                           ? filter_mkl_shape.GetMklLayout()
+                           : memory::desc(filter_dims, MklDnnType<T>(),
+                                          memory::format::hwio);
       filter.SetUsrMem(filter_md, &filter_tensor);
 
       // Set output shape (output_dims) required in MKL-DNN order.
@@ -601,34 +603,34 @@ class MklConv2DOp : public OpKernel {
         bias.SetOpMemDesc(bias_size, memory::format::any);
 
         // Create convolution primitive with Bias.
-        auto conv_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
-            bias.GetOpMemDesc(), output.GetOpMemDesc(), strides,
-            padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
-
-        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                cpu_engine);
-        AllocateOutputTensor(context, conv_prim_desc,
-                             output_dims_mkl_order, tf_fmt, &output_tensor);
+        auto conv_desc = convolution_forward::desc(
+            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
+            filter.GetOpMemDesc(), bias.GetOpMemDesc(), output.GetOpMemDesc(),
+            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+
+        auto conv_prim_desc =
+            convolution_forward::primitive_desc(conv_desc, cpu_engine);
+        AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
+                             tf_fmt, &output_tensor);
         // Set data handle for output.
         output.SetUsrMemDataHandle(output_tensor);
 
         Tensor* filter_out_tensor = nullptr;
         AllocateFilterOutputTensor(context, conv_prim_desc,
-                        TFShapeToMklDnnDims(filter_tf_shape),
-                        &filter_out_tensor);
+                                   TFShapeToMklDnnDims(filter_tf_shape),
+                                   &filter_out_tensor);
 
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter,
-                            &bias, &output, filter_out_tensor);
+        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output,
+                             filter_out_tensor);
       } else {
         // Create convolution primitive without Bias.
-        auto conv_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
-            output.GetOpMemDesc(), strides, padding_l, padding_r,
-            TFPaddingToMklDnnPadding(padding_));
+        auto conv_desc = convolution_forward::desc(
+            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
+            filter.GetOpMemDesc(), output.GetOpMemDesc(), strides, padding_l,
+            padding_r, TFPaddingToMklDnnPadding(padding_));
 
-        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                cpu_engine);
+        auto conv_prim_desc =
+            convolution_forward::primitive_desc(conv_desc, cpu_engine);
         AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
                              tf_fmt, &output_tensor);
         // Set data handle for output.
@@ -636,18 +638,18 @@ class MklConv2DOp : public OpKernel {
 
         Tensor* filter_out_tensor = nullptr;
         AllocateFilterOutputTensor(context, conv_prim_desc,
-                TFShapeToMklDnnDims(filter_tf_shape),
-                &filter_out_tensor);
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter,
-                            nullptr, &output, filter_out_tensor);
+                                   TFShapeToMklDnnDims(filter_tf_shape),
+                                   &filter_out_tensor);
+        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output,
+                             filter_out_tensor);
       }
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + std::string(e.message) +
-                       ", in file " + std::string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-        errors::Aborted("Operation received an exception:", error_msg));
+                         ", message: " + std::string(e.message) + ", in file " +
+                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -655,71 +657,67 @@ class MklConv2DOp : public OpKernel {
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
-  const int kInputIndex_Src = 0,
-            kInputIndex_Filter = 1,
-            kInputIndex_Bias = 2;
+  const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
 
   // Allocate output tensor.
   void AllocateOutputTensor(
-                  OpKernelContext* context,
-                  const convolution_forward::primitive_desc& conv_prim_desc,
-                  const memory::dims& output_dims_mkl_order,
-                  memory::format output_tf_format, Tensor** output_tensor) {
-      CHECK_NOTNULL(output_tensor);
-      auto dst_pd = conv_prim_desc.dst_primitive_desc();
-
-      // Allocate shape of Mkl tensor.
-      MklDnnShape output_mkl_shape;
-      output_mkl_shape.SetMklTensor(true);
-      output_mkl_shape.SetMklLayout(&dst_pd);
-      output_mkl_shape.SetElemType(MklDnnType<T>());
-      output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-              output_dims_mkl_order, output_tf_format);
-
-      // Allocate shape of TF tensor.
-      TensorShape output_tf_shape;
-      output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
-
-      AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor,
-                                output_tf_shape, output_mkl_shape);
+      OpKernelContext* context,
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    auto dst_pd = conv_prim_desc.dst_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+
+    // Allocate shape of TF tensor.
+    TensorShape output_tf_shape;
+    output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
+
+    AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor,
+                              output_tf_shape, output_mkl_shape);
   }
 
   // Allocate output tensor.
   void AllocateFilterOutputTensor(
-                  OpKernelContext* context,
-                  const convolution_forward::primitive_desc& conv_prim_desc,
-                  const memory::dims& filter_dims_tf_order,
-                  Tensor** filter_tensor) {
-      CHECK_NOTNULL(filter_tensor);
-      auto filter_pd = conv_prim_desc.weights_primitive_desc();
-
-      // Allocate shape of Mkl tensor.
-      MklDnnShape filter_mkl_shape;
-      filter_mkl_shape.SetMklTensor(true);
-      filter_mkl_shape.SetMklLayout(&filter_pd);
-      filter_mkl_shape.SetElemType(MklDnnType<T>());
-
-      // The format of the filter is actually OIhw8i8o, but TF doesn't support
-      // this format. Just use format::blocked for now because the layout
-      // is stored in the MKL data.
-      filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(),
-                  filter_dims_tf_order, memory::format::blocked);
-
-      // Allocate the data space for the filter to propagate as TF tensor.
-      TensorShape filter_tf_shape;
-      filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T)));
-
-      AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor,
-              filter_tf_shape, filter_mkl_shape);
+      OpKernelContext* context,
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      const memory::dims& filter_dims_tf_order, Tensor** filter_tensor) {
+    CHECK_NOTNULL(filter_tensor);
+    auto filter_pd = conv_prim_desc.weights_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape filter_mkl_shape;
+    filter_mkl_shape.SetMklTensor(true);
+    filter_mkl_shape.SetMklLayout(&filter_pd);
+    filter_mkl_shape.SetElemType(MklDnnType<T>());
+
+    // The format of the filter is actually OIhw8i8o, but TF doesn't support
+    // this format. Just use format::blocked for now because the layout
+    // is stored in the MKL data.
+    filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(),
+                                 filter_dims_tf_order, memory::format::blocked);
+
+    // Allocate the data space for the filter to propagate as TF tensor.
+    TensorShape filter_tf_shape;
+    filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T)));
+
+    AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor,
+                              filter_tf_shape, filter_mkl_shape);
   }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecuteNet(
-                  const convolution_forward::primitive_desc& conv_prim_desc,
-                  MklDnnData<T>* src, MklDnnData<T>* filter,
-                  MklDnnData<T>* bias, MklDnnData<T>* output,
-                  Tensor* filter_out_tensor) {
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      MklDnnData<T>* src, MklDnnData<T>* filter, MklDnnData<T>* bias,
+      MklDnnData<T>* output, Tensor* filter_out_tensor) {
     CHECK_NOTNULL(filter_out_tensor);
 
     // Create reorders between user layout and MKL layout if it is needed and
@@ -731,18 +729,20 @@ class MklConv2DOp : public OpKernel {
     // rather than re-order to a temp buffer, reorder directly to the
     // filter output tensor
     filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(),
-                    filter->GetTensorBuffer(filter_out_tensor), &net);
+                                filter->GetTensorBuffer(filter_out_tensor),
+                                &net);
 
     // Create convolution primitive and add it to net.
     if (bias) {
       CHECK_EQ(biasEnabled, true);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
-                                    filter->GetOpMem(), bias->GetOpMem(),
-                                    output->GetOpMem()));
+                                        filter->GetOpMem(), bias->GetOpMem(),
+                                        output->GetOpMem()));
     } else {
       CHECK_EQ(biasEnabled, false);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
-                                    filter->GetOpMem(), output->GetOpMem()));
+                                        filter->GetOpMem(),
+                                        output->GetOpMem()));
     }
 
     stream(stream::kind::eager).submit(net).wait();
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index c6456bd5c3..8b65eaea0d 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -16,9 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 
-#include <vector>
 #include <limits>
 #include <string>
+#include <vector>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -27,8 +27,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
+#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -43,11 +43,11 @@ limitations under the License.
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
 using mkldnn::prop_kind;
+using mkldnn::stream;
 
-using mkldnn::convolution_forward;
 using mkldnn::convolution_direct;
+using mkldnn::convolution_forward;
 #endif
 
 namespace tensorflow {
@@ -63,13 +63,13 @@ class MklDnnConvUtil {
 
  public:
   MklDnnConvUtil(OpKernelContext* context, const std::vector<int32>& strides,
-                 Padding pad, TensorFormat fm) : context_(context),
-    strides_(strides), padding_(pad), data_format_(fm) {}
+                 Padding pad, TensorFormat fm)
+      : context_(context), strides_(strides), padding_(pad), data_format_(fm) {}
 
   virtual ~MklDnnConvUtil() { context_ = nullptr; }
 
   // Calculate Convolution strides
-  virtual inline void GetStridesInMklOrder(memory::dims *strides) {
+  virtual inline void GetStridesInMklOrder(memory::dims* strides) {
     // For now we take the stride from the second and third dimensions only
     // (we do not support striding on the batch or depth dimension).
     CHECK_NOTNULL(strides);
@@ -82,14 +82,14 @@ class MklDnnConvUtil {
   // requires input in NCHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
-  virtual inline void
-  GetInputSizeInMklOrder(const TensorShape& input_shape,
-                         memory::dims *input_dims) {
-  #define CHECK_BOUNDS(val, err_msg) do {                     \
-    OP_REQUIRES(context_, FastBoundsCheck(val,                \
-                            std::numeric_limits<int>::max()), \
-                errors::InvalidArgument(err_msg));            \
-  }while(0)
+  virtual inline void GetInputSizeInMklOrder(const TensorShape& input_shape,
+                                             memory::dims* input_dims) {
+#define CHECK_BOUNDS(val, err_msg)                                     \
+  do {                                                                 \
+    OP_REQUIRES(context_,                                              \
+                FastBoundsCheck(val, std::numeric_limits<int>::max()), \
+                errors::InvalidArgument(err_msg));                     \
+  } while (0)
 
     CHECK_NOTNULL(input_dims);
 
@@ -112,7 +112,7 @@ class MklDnnConvUtil {
     CHECK_BOUNDS(input_batch_raw, "Input batch too large");
     int input_batch = static_cast<int>(input_batch_raw);
 
-  #undef CHECK_BOUNDS
+#undef CHECK_BOUNDS
 
     // MKL-DNN always requires input in NCHW format.
     std::vector<int> mkldnn_sizes(4, -1);
@@ -138,10 +138,9 @@ class MklDnnConvUtil {
   // forward gets actual tensor as input).
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
-  virtual inline void
-  GetFilterSizeInMklOrder(const TensorShape& input_shape,
-                          const TensorShape& filter_shape,
-                          memory::dims *filter_dims) {
+  virtual inline void GetFilterSizeInMklOrder(const TensorShape& input_shape,
+                                              const TensorShape& filter_shape,
+                                              memory::dims* filter_dims) {
     CHECK_NOTNULL(filter_dims);
 
     OP_REQUIRES(context_, filter_shape.dims() == 4,
@@ -149,17 +148,18 @@ class MklDnnConvUtil {
                                         filter_shape.DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context_, FastBoundsCheck(filter_shape.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(context_,
+                  FastBoundsCheck(filter_shape.dim_size(i),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("filter too large"));
     }
 
     int input_depth = GetTensorDim(input_shape, data_format_, 'C');
 
-    OP_REQUIRES(
-        context_, input_depth == filter_shape.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                input_depth, " vs ", filter_shape.dim_size(2)));
+    OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", input_depth,
+                    " vs ", filter_shape.dim_size(2)));
 
     // TF filter is always in (rows, cols, in_depth, out_depth) order.
     int filter_rows = static_cast<int>(filter_shape.dim_size(0));
@@ -182,25 +182,24 @@ class MklDnnConvUtil {
   // requires filter in OIHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
-  virtual inline void
-  GetFilterSizeInMklOrder(size_t src_index, size_t filter_index,
-                          memory::dims *filter_dims) {
+  virtual inline void GetFilterSizeInMklOrder(size_t src_index,
+                                              size_t filter_index,
+                                              memory::dims* filter_dims) {
     CHECK_NOTNULL(filter_dims);
     GetFilterSizeInMklOrder(GetTfShape(context_, src_index),
-                            GetTfShape(context_, filter_index),
-                            filter_dims);
+                            GetTfShape(context_, filter_index), filter_dims);
   }
 
   // Calculate Bias size for 2D Convolution. Function does not return
   // anything, but sets error in context status.
-  virtual inline void
-  GetBiasSizeInMklOrder(size_t bias_index, memory::dims *bias_dims) {
+  virtual inline void GetBiasSizeInMklOrder(size_t bias_index,
+                                            memory::dims* bias_dims) {
     const Tensor& bias = MklGetInput(context_, bias_index);
     OP_REQUIRES(context_, bias.dims() == 1,
                 errors::InvalidArgument("bias must be 1-dimensional: ",
                                         bias.shape().DebugString()));
 
-    *bias_dims = { static_cast<int>(bias.dim_size(0)) };
+    *bias_dims = {static_cast<int>(bias.dim_size(0))};
   }
 
   // Function to calculate output and padding size for 2D convolution.
@@ -212,13 +211,11 @@ class MklDnnConvUtil {
   // status is returned via context status.
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
-  virtual inline void
-  GetOutputAndPadSizeInMklOrder(const TensorShape& input_shape,
-                                const TensorShape& filter_shape,
-                                const memory::dims& strides,
-                                memory::dims *output_dims_tf_order,
-                                memory::dims *output_dims_mkl_order,
-                                memory::dims *pad_l, memory::dims *pad_r) {
+  virtual inline void GetOutputAndPadSizeInMklOrder(
+      const TensorShape& input_shape, const TensorShape& filter_shape,
+      const memory::dims& strides, memory::dims* output_dims_tf_order,
+      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
+      memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -244,16 +241,16 @@ class MklDnnConvUtil {
     int64 out_rows = 0, out_cols = 0;
     int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
 
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerbose(input_rows, filter_rows, stride_rows,
-                                 padding_, &out_rows, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerbose(input_cols, filter_cols, stride_cols,
-                                 padding_, &out_cols, &pad_left, &pad_right));
+    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                 input_rows, filter_rows, stride_rows, padding_,
+                                 &out_rows, &pad_top, &pad_bottom));
+    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                 input_cols, filter_cols, stride_cols, padding_,
+                                 &out_cols, &pad_left, &pad_right));
 
     // Tensorflow output is in data_format order. (NHWC or NCHW)
-    TensorShape out_shape = ShapeFromFormat(data_format_, out_batch,
-                                            out_rows, out_cols, out_depth);
+    TensorShape out_shape =
+        ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth);
     *output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
 
     // MKL-DNN always needs output in NCHW format.
@@ -273,12 +270,10 @@ class MklDnnConvUtil {
   // See comment on GetConvOutputAndPadSizeInMklOrder for parameters.
   //
   // Function does not return anything, but sets error in context status.
-  inline void
-  GetOutputAndPadSizeInMklOrder(size_t src_index, size_t filter_index,
-                                const memory::dims& strides,
-                                memory::dims *output_dims_tf_order,
-                                memory::dims *output_dims_mkl_order,
-                                memory::dims *pad_l, memory::dims *pad_r) {
+  inline void GetOutputAndPadSizeInMklOrder(
+      size_t src_index, size_t filter_index, const memory::dims& strides,
+      memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
+      memory::dims* pad_l, memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -289,11 +284,11 @@ class MklDnnConvUtil {
 
     OP_REQUIRES(context_, input_tf_shape.dims() == 4,
                 errors::InvalidArgument("input must be 4-dimensional",
-                                          input_tf_shape.DebugString()));
+                                        input_tf_shape.DebugString()));
 
-    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
-                                  strides, output_dims_tf_order,
-                                  output_dims_mkl_order, pad_l, pad_r);
+    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides,
+                                  output_dims_tf_order, output_dims_mkl_order,
+                                  pad_l, pad_r);
   }
 
   // Wrapper function to calculate input, filter, and output sizes of
@@ -302,15 +297,12 @@ class MklDnnConvUtil {
   // also calculates strides and paddings for 2D Convolution.
   //
   // Function does not return anything, but sets error in context status.
-  inline void GetConvFwdSizesInMklOrder(const TensorShape& input_shape,
-                                        const TensorShape& filter_shape,
-                                        memory::dims *input_dims,
-                                        memory::dims *filter_dims,
-                                        memory::dims *strides,
-                                        memory::dims *output_dims_tf_order,
-                                        memory::dims *output_dims_mkl_order,
-                                        memory::dims *pad_l,
-                                        memory::dims *pad_r) {
+  inline void GetConvFwdSizesInMklOrder(
+      const TensorShape& input_shape, const TensorShape& filter_shape,
+      memory::dims* input_dims, memory::dims* filter_dims,
+      memory::dims* strides, memory::dims* output_dims_tf_order,
+      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
+      memory::dims* pad_r) {
     CHECK_NOTNULL(input_dims);
     CHECK_NOTNULL(filter_dims);
     CHECK_NOTNULL(strides);
@@ -325,8 +317,7 @@ class MklDnnConvUtil {
     if (!context_->status().ok()) return;
     GetStridesInMklOrder(strides);
     GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides,
-                                  output_dims_tf_order,
-                                  output_dims_mkl_order,
+                                  output_dims_tf_order, output_dims_mkl_order,
                                   pad_l, pad_r);
     if (!context_->status().ok()) return;
   }
@@ -337,7 +328,7 @@ class MklDnnConvUtil {
 /////////////////////////////////////////////////////////////////////
 
 template <typename Device, class T>
-class MklConv2DBackpropCommonOp :  public OpKernel {
+class MklConv2DBackpropCommonOp : public OpKernel {
  public:
   ~MklConv2DBackpropCommonOp() {}
   explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context)
@@ -397,12 +388,11 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
           outbprop_tf_shape.num_elements() == 0) {
         MklDnnShape output_mkl_shape;
         output_mkl_shape.SetMklTensor(false);
-        TensorShape output_tf_shape = GetOutputTfShape(input_tf_shape,
-                                                       filter_tf_shape,
-                                                       outbprop_tf_shape);
+        TensorShape output_tf_shape = GetOutputTfShape(
+            input_tf_shape, filter_tf_shape, outbprop_tf_shape);
         const int kOutputIdx = 0;
         AllocateOutputSetMklShape(context, kOutputIdx, &output_tensor,
-                                    output_tf_shape, output_mkl_shape);
+                                  output_tf_shape, output_mkl_shape);
         CHECK_NOTNULL(output_tensor);
 
         // if output tensor has more than 0 elements, we need to 0 them out.
@@ -421,12 +411,10 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
       // Get forward convolution parameters.
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(input_tf_shape, filter_tf_shape,
-                                         &fwd_input_dims, &fwd_filter_dims,
-                                         &strides,
-                                         &fwd_output_dims_tf_order,
-                                         &fwd_output_dims,
-                                         &padding_l, &padding_r);
+      conv_utl.GetConvFwdSizesInMklOrder(
+          input_tf_shape, filter_tf_shape, &fwd_input_dims, &fwd_filter_dims,
+          &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l,
+          &padding_r);
       if (!context->status().ok()) return;
 
       // Create Convolution forward descriptor since Convolution backward
@@ -437,20 +425,22 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
       // construct input TF layout. For TF layout, although input shape
       // required is in MKL-DNN order, the layout is Tensorflow's layout
       // (NHWC or NCHW depending on data format).
-      auto fwd_input_md = input_mkl_shape.IsMklTensor() ?
-                          input_mkl_shape.GetMklLayout() :
-                       memory::desc(fwd_input_dims, MklDnnType<T>(), tf_fmt);
+      auto fwd_input_md =
+          input_mkl_shape.IsMklTensor()
+              ? input_mkl_shape.GetMklLayout()
+              : memory::desc(fwd_input_dims, MklDnnType<T>(), tf_fmt);
       // If filter is in MKL layout, then simply grab filter layout; otherwise
       // construct filter in TF layout. For TF layout, filter is in HWIO format.
-      auto fwd_filter_md = filter_mkl_shape.IsMklTensor() ?
-                          filter_mkl_shape.GetMklLayout() :
-                          memory::desc(fwd_filter_dims, MklDnnType<T>(),
-                                       memory::format::hwio);
+      auto fwd_filter_md = filter_mkl_shape.IsMklTensor()
+                               ? filter_mkl_shape.GetMklLayout()
+                               : memory::desc(fwd_filter_dims, MklDnnType<T>(),
+                                              memory::format::hwio);
       // Tensorflow Output of Conv2D is in data_format order.
       auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(), tf_fmt);
-      auto fwd_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, fwd_input_md, fwd_filter_md, fwd_out_md,
-            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+      auto fwd_desc = convolution_forward::desc(
+          prop_kind::forward, convolution_direct, fwd_input_md, fwd_filter_md,
+          fwd_out_md, strides, padding_l, padding_r,
+          TFPaddingToMklDnnPadding(padding_));
       auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
 
       // Create memory for user data. Describe how the inputs and outputs of
@@ -495,17 +485,16 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
       // Operator-specific call to create and execute primitive.
       CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter,
-                      &outbackprop, &output, &output_tensor,
-                      strides, padding_l, padding_r,
-                      TFPaddingToMklDnnPadding(padding_),
+                      &outbackprop, &output, &output_tensor, strides, padding_l,
+                      padding_r, TFPaddingToMklDnnPadding(padding_),
                       bwd_output_dims, bwd_output_format);
-    } catch (mkldnn::error &e) {
-     string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-     OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
-                                            error_msg));
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -523,11 +512,11 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
   /// Get TensorFlow shape of input tensor.
   virtual TensorShape MakeInputTfShape(OpKernelContext* context,
-                                      const Tensor& input_tensor) = 0;
+                                       const Tensor& input_tensor) = 0;
 
   /// Get TensorFlow shape of filter tensor.
   virtual TensorShape MakeFilterTfShape(OpKernelContext* context,
-                                       const Tensor& filter_tensor) = 0;
+                                        const Tensor& filter_tensor) = 0;
 
   /// Get the TensorFlow shape of output tensor.
   virtual TensorShape GetOutputTfShape(const TensorShape& input_shape,
@@ -536,9 +525,9 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
 
   /// Get shape of output in MKL-DNN order. Computes shape of output from
   /// input shape (fwd_input_dims) and filter shape (fwd_filter_dims).
-  virtual
-  const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
-                                    const memory::dims& fwd_filter_dims) = 0;
+  virtual const memory::dims& GetOutputDims(
+      const memory::dims& fwd_input_dims,
+      const memory::dims& fwd_filter_dims) = 0;
 
   /// Get data_format of output in MKL-DNN order. If output data format is
   /// same as input data format, then it simply returns value of data_format
@@ -546,17 +535,18 @@ class MklConv2DBackpropCommonOp :  public OpKernel {
   virtual memory::format GetOutputFormat(const memory::format data_format) = 0;
 
   /// Create and execute the primitive storing output in the output_tensor.
-  virtual void CreatePrimitive(OpKernelContext* context,
-    const engine& cpu_engine,
-    const convolution_forward::primitive_desc& conv_fwd_pd,
-    MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
-    MklDnnData<T>* output, Tensor** output_tensor, const memory::dims& strides,
-    const memory::dims& padding_l, const memory::dims& padding_r,
-    padding_kind padding, const memory::dims& bwd_output_dims,
-    memory::format bwd_output_format) = 0;
+  virtual void CreatePrimitive(
+      OpKernelContext* context, const engine& cpu_engine,
+      const convolution_forward::primitive_desc& conv_fwd_pd,
+      MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
+      MklDnnData<T>* output, Tensor** output_tensor,
+      const memory::dims& strides, const memory::dims& padding_l,
+      const memory::dims& padding_r, padding_kind padding,
+      const memory::dims& bwd_output_dims,
+      memory::format bwd_output_format) = 0;
 
   // Get the data_format {NCHW, NHWC}
-  TensorFormat GetTFDataFormat () { return data_format_; }
+  TensorFormat GetTFDataFormat() { return data_format_; }
 
  private:
   std::vector<int32> strides_;
@@ -575,12 +565,12 @@ class MklDummyOp : public OpKernel {
  public:
   ~MklDummyOp() {}
 
-  explicit MklDummyOp(OpKernelConstruction* context) :
-    OpKernel(context) {}
+  explicit MklDummyOp(OpKernelConstruction* context) : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
-    TF_CHECK_OK(errors::Unimplemented("This is a dummy op."
-                                      "It should not have been invoked."));
+    TF_CHECK_OK(
+        errors::Unimplemented("This is a dummy op."
+                              "It should not have been invoked."));
   }
 };
 
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 8340a91d05..0b6d838e09 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -28,12 +28,12 @@ limitations under the License.
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
+using mkldnn::batch_normalization_backward;
+using mkldnn::batch_normalization_forward;
 using mkldnn::prop_kind;
-using mkldnn::use_scale_shift;
+using mkldnn::stream;
 using mkldnn::use_global_stats;
-using mkldnn::batch_normalization_forward;
-using mkldnn::batch_normalization_backward;
+using mkldnn::use_scale_shift;
 #endif
 
 // TODO(inteltf) Address comments from PR 8968.
@@ -601,7 +601,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
       mkl_res_batchnorm_bwd[dnnResourceSrc] =
           (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
 
-     bool mkl_convert_out_backprop;
+      bool mkl_convert_out_backprop;
       dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr;
       dnnLayout_t mkl_lt_internal_out_backprop = nullptr;
       void* mkl_buf_converted_out_backprop = nullptr;
@@ -709,12 +709,11 @@ class MklFusedBatchNormOp : public OpKernel {
       const size_t kMeanIndex = 3;      // index of est_mean tensor
       const size_t kVarianceIndex = 4;  // index of est_variance tensor
 
-      const Tensor& src_tensor          = MklGetInput(context, kSrcIndex);
-      const Tensor& scale_tensor        = MklGetInput(context, kScaleIndex);
-      const Tensor& shift_tensor        = MklGetInput(context, kShiftIndex);
-      const Tensor& est_mean_tensor     = MklGetInput(context, kMeanIndex);
-      const Tensor& est_variance_tensor = MklGetInput(context,
-                                                      kVarianceIndex);
+      const Tensor& src_tensor = MklGetInput(context, kSrcIndex);
+      const Tensor& scale_tensor = MklGetInput(context, kScaleIndex);
+      const Tensor& shift_tensor = MklGetInput(context, kShiftIndex);
+      const Tensor& est_mean_tensor = MklGetInput(context, kMeanIndex);
+      const Tensor& est_variance_tensor = MklGetInput(context, kVarianceIndex);
 
       TensorShape tf_shape_src;
       MklDnnShape dnn_shape_src;
@@ -723,37 +722,34 @@ class MklFusedBatchNormOp : public OpKernel {
       if (dnn_shape_src.IsMklTensor()) {
         tf_shape_src = dnn_shape_src.GetTfShape();
         OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       } else {
         tf_shape_src = src_tensor.shape();
         OP_REQUIRES(context, src_tensor.dims() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       }
       OP_REQUIRES(context, scale_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "scale must be 1-dimensional",
-                      scale_tensor.shape().DebugString()));
+                  errors::InvalidArgument("scale must be 1-dimensional",
+                                          scale_tensor.shape().DebugString()));
       OP_REQUIRES(context, shift_tensor.dims() == 1,
                   errors::InvalidArgument("offset must be 1-dimensional",
-                                        shift_tensor.shape().DebugString()));
-      OP_REQUIRES(context, est_mean_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "estimated_mean must be 1-dimensional",
-                      est_mean_tensor.shape().DebugString()));
-      OP_REQUIRES(context, est_variance_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "estimated_variance must be 1-dimensional",
-                      est_variance_tensor.shape().DebugString()));
+                                          shift_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, est_mean_tensor.dims() == 1,
+          errors::InvalidArgument("estimated_mean must be 1-dimensional",
+                                  est_mean_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, est_variance_tensor.dims() == 1,
+          errors::InvalidArgument("estimated_variance must be 1-dimensional",
+                                  est_variance_tensor.shape().DebugString()));
 
       if (is_training_) {
-        OP_REQUIRES(context, est_mean_tensor.dim_size(0) == 0,
-                    errors::InvalidArgument(
-                        "estimated_mean must be empty for training",
-                        est_mean_tensor.shape().DebugString()));
+        OP_REQUIRES(
+            context, est_mean_tensor.dim_size(0) == 0,
+            errors::InvalidArgument("estimated_mean must be empty for training",
+                                    est_mean_tensor.shape().DebugString()));
         OP_REQUIRES(context, est_variance_tensor.dim_size(0) == 0,
                     errors::InvalidArgument(
                         "estimated_variance must be empty for training",
@@ -763,11 +759,9 @@ class MklFusedBatchNormOp : public OpKernel {
       // special case: input with 0 element and 0 batch size
       Tensor* dst_tensor = nullptr;
       if (tf_shape_src.num_elements() == 0) {
-         HandleEmptyInput(context,
-                          tf_shape_src,
-                          scale_tensor.shape(),
-                          &dst_tensor);
-         return;
+        HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(),
+                         &dst_tensor);
+        return;
       }
 
       if (dnn_shape_src.IsMklTensor())
@@ -783,11 +777,8 @@ class MklFusedBatchNormOp : public OpKernel {
       Tensor* batch_variance_tensor = nullptr;
       Tensor* saved_mean_tensor = nullptr;
       Tensor* saved_variance_tensor = nullptr;
-      AllocateTFOutputs(context,
-                        scale_tensor.shape(),
-                        &batch_mean_tensor,
-                        &batch_variance_tensor,
-                        &saved_mean_tensor,
+      AllocateTFOutputs(context, scale_tensor.shape(), &batch_mean_tensor,
+                        &batch_variance_tensor, &saved_mean_tensor,
                         &saved_variance_tensor);
 
       if (is_training_)
@@ -815,69 +806,63 @@ class MklFusedBatchNormOp : public OpKernel {
         src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(),
                                              tensor_format_);
       } else {
-        src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
-                                             tensor_format_);
+        src_dims =
+            TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_);
       }
 
       auto src_md = dnn_shape_src.IsMklTensor()
-                    ? dnn_shape_src.GetMklLayout()
-                    : memory::desc(src_dims, MklDnnType<T>(), format_m);
+                        ? dnn_shape_src.GetMklLayout()
+                        : memory::desc(src_dims, MklDnnType<T>(), format_m);
       src.SetUsrMem(src_md, &src_tensor);
 
       // set weights primitive
       // MKL-DNN packs scale & shift as "weights":
       // <scale>...<scale><shift>...<shift>
-      auto weights_desc = memory::desc({2, depth_},
-                                       MklDnnType<T>(),
-                                       memory::format::nc);
+      auto weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
       auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
       auto weights_m = memory(weights_pd);
-      T* weights_data = reinterpret_cast<T*>(
-                        weights_m.get_data_handle());
-      T* scale_tf = reinterpret_cast<T*>(
-                    const_cast<T*>(scale_tensor.flat<T>().data()));
-      T* shift_tf = reinterpret_cast<T*>(
-                    const_cast<T*>(shift_tensor.flat<T>().data()));
-
-      for (int k=0; k < depth_; k++) {
+      T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
+      T* scale_tf =
+          reinterpret_cast<T*>(const_cast<T*>(scale_tensor.flat<T>().data()));
+      T* shift_tf =
+          reinterpret_cast<T*>(const_cast<T*>(shift_tensor.flat<T>().data()));
+
+      for (int k = 0; k < depth_; k++) {
         weights_data[k] = scale_tf[k];
         weights_data[k + depth_] = shift_tf[k];
       }
 
       // set mean primitive
-      auto mean_desc = memory::desc({1, depth_},
-                                    MklDnnType<T>(),
-                                    memory::format::nc);
+      auto mean_desc =
+          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
       auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine);
-      char* saved_mean_data_tf = reinterpret_cast<char*>
-                                 (saved_mean_tensor->flat<T>().data());
-      std::memcpy(saved_mean_data_tf,
-                  reinterpret_cast<char*>(mean_values_),
-                  depth_*sizeof(T));
-      auto mean_m = memory(mean_pd,
-                           reinterpret_cast<void*>(saved_mean_data_tf));
+      char* saved_mean_data_tf =
+          reinterpret_cast<char*>(saved_mean_tensor->flat<T>().data());
+      std::memcpy(saved_mean_data_tf, reinterpret_cast<char*>(mean_values_),
+                  depth_ * sizeof(T));
+      auto mean_m =
+          memory(mean_pd, reinterpret_cast<void*>(saved_mean_data_tf));
 
       // set variance primitive
-      auto variance_desc = memory::desc({1, depth_},
-                                    MklDnnType<T>(),
-                                    memory::format::nc);
+      auto variance_desc =
+          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
       auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine);
-      char* saved_variance_data_tf = reinterpret_cast<char*>
-                  (saved_variance_tensor->flat<T>().data());
+      char* saved_variance_data_tf =
+          reinterpret_cast<char*>(saved_variance_tensor->flat<T>().data());
       std::memcpy(saved_variance_data_tf,
                   reinterpret_cast<char*>(variance_values_),
-                  depth_*sizeof(T));
+                  depth_ * sizeof(T));
       auto variance_m = memory(variance_pd, saved_variance_data_tf);
 
-      prop_kind pk = (is_training_) ?
-                     prop_kind::forward_training :
-                     prop_kind::forward_scoring;
+      prop_kind pk = (is_training_) ? prop_kind::forward_training
+                                    : prop_kind::forward_scoring;
       auto bnrm_fwd_desc = batch_normalization_forward::desc(
-                               pk, src.GetUsrMemDesc(), epsilon_,
-                               is_training_ ? use_scale_shift :
-                               (use_scale_shift | use_global_stats));
+          pk, src.GetUsrMemDesc(), epsilon_,
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
       auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
-                             bnrm_fwd_desc, cpu_engine);
+          bnrm_fwd_desc, cpu_engine);
 
       // allocate dst tensor
       MklDnnShape dnn_shape_dst;
@@ -887,47 +872,39 @@ class MklFusedBatchNormOp : public OpKernel {
         auto dst_pd = bnrm_fwd_pd.dst_primitive_desc();
         dnn_shape_dst.SetMklLayout(&dst_pd);
         dnn_shape_dst.SetElemType(MklDnnType<T>());
-        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
-                                  src_dims, format_m);
-        tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), src_dims,
+                                  format_m);
+        tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_dst.SetMklTensor(false);
         tf_shape_dst = src_tensor.shape();
       }
-      AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor,
-                                tf_shape_dst, dnn_shape_dst);
+      AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor, tf_shape_dst,
+                                dnn_shape_dst);
 
       // Output of batchnorm has same shape as input.
       dst.SetUsrMem(src_md, dst_tensor);
 
       primitive bnrm_fwd_op;
       if (is_training_) {
-        bnrm_fwd_op = batch_normalization_forward(
-                          bnrm_fwd_pd,
-                          src.GetOpMem(),
-                          weights_m,
-                          dst.GetOpMem(),
-                          mean_m,
-                          variance_m);
+        bnrm_fwd_op =
+            batch_normalization_forward(bnrm_fwd_pd, src.GetOpMem(), weights_m,
+                                        dst.GetOpMem(), mean_m, variance_m);
       } else {
         bnrm_fwd_op = batch_normalization_forward(
-                          bnrm_fwd_pd,
-                          src.GetOpMem(),
-                          mean_m,
-                          variance_m,
-                          (const primitive::at) weights_m,
-                          dst.GetOpMem());
+            bnrm_fwd_pd, src.GetOpMem(), mean_m, variance_m,
+            (const primitive::at)weights_m, dst.GetOpMem());
       }
       std::vector<primitive> net;
       net.push_back(bnrm_fwd_op);
       stream(stream::kind::eager).submit(net).wait();
 
       // copy batch_mean data
-      T* batch_mean_data_tf = reinterpret_cast<T*>(
-                                batch_mean_tensor->flat<T>().data());
+      T* batch_mean_data_tf =
+          reinterpret_cast<T*>(batch_mean_tensor->flat<T>().data());
       std::memcpy(reinterpret_cast<char*>(batch_mean_data_tf),
                   reinterpret_cast<char*>(mean_m.get_data_handle()),
-                  depth_*sizeof(T));
+                  depth_ * sizeof(T));
 
       // copy batch_variance data with Bessel's correction
       // if training mode is on
@@ -937,18 +914,17 @@ class MklFusedBatchNormOp : public OpKernel {
         size_t adjust_size = orig_size - 1;
         adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
       }
-      for (int k=0; k < depth_; k++)
+      for (int k = 0; k < depth_; k++)
         batch_variance_tensor->flat<T>().data()[k] =
-            (reinterpret_cast<T*>(variance_m.get_data_handle()))[k]
-            * adjust_factor;
-    } catch (mkldnn::error &e) {
+            (reinterpret_cast<T*>(variance_m.get_data_handle()))[k] *
+            adjust_factor;
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + string(e.message) +
-                         ", in file " + string(__FILE__) + ":" +
-                         std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -958,7 +934,7 @@ class MklFusedBatchNormOp : public OpKernel {
   bool is_training_;
   T* mean_values_;
   T* variance_values_;
-  size_t depth_;          // batch normalization is done for per channel.
+  size_t depth_;  // batch normalization is done for per channel.
 
   void ExtractParams(OpKernelContext* context) {
     const Tensor& input = MklGetInput(context, 0);
@@ -966,23 +942,20 @@ class MklFusedBatchNormOp : public OpKernel {
   }
 
   void SetMeanVariance(const Tensor& mean, const Tensor& variance) {
-    mean_values_ = reinterpret_cast<T*>(
-                       const_cast<T*>(mean.flat<T>().data()));
-    variance_values_ = reinterpret_cast<T*>(
-                       const_cast<T*>(variance.flat<T>().data()));
+    mean_values_ = reinterpret_cast<T*>(const_cast<T*>(mean.flat<T>().data()));
+    variance_values_ =
+        reinterpret_cast<T*>(const_cast<T*>(variance.flat<T>().data()));
   }
 
-  void HandleEmptyInput(OpKernelContext* context,
-                        TensorShape tf_shape_src,
-                        TensorShape tf_shape_scale,
-                        Tensor** dst_tensor) {
+  void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src,
+                        TensorShape tf_shape_scale, Tensor** dst_tensor) {
     CHECK_NOTNULL(dst_tensor);
 
     const size_t kDstIndex = 0;
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, kDstIndex, dst_tensor,
-                              tf_shape_src, dnn_shape_dst);
+    AllocateOutputSetMklShape(context, kDstIndex, dst_tensor, tf_shape_src,
+                              dnn_shape_dst);
     CHECK_NOTNULL(*dst_tensor);
     memset(const_cast<char*>((*dst_tensor)->tensor_data().data()), 0,
            (*dst_tensor)->tensor_data().size());
@@ -991,15 +964,12 @@ class MklFusedBatchNormOp : public OpKernel {
     Tensor* batch_variance_tensor = nullptr;
     Tensor* saved_mean_tensor = nullptr;
     Tensor* saved_variance_tensor = nullptr;
-    AllocateTFOutputs(context, tf_shape_scale,
-                      &batch_mean_tensor,
-                      &batch_variance_tensor,
-                      &saved_mean_tensor,
+    AllocateTFOutputs(context, tf_shape_scale, &batch_mean_tensor,
+                      &batch_variance_tensor, &saved_mean_tensor,
                       &saved_variance_tensor);
   }
 
-  void AllocateTFOutputs(OpKernelContext* context,
-                         TensorShape tf_shape_scale,
+  void AllocateTFOutputs(OpKernelContext* context, TensorShape tf_shape_scale,
                          Tensor** batch_mean_tensor,
                          Tensor** batch_variance_tensor,
                          Tensor** saved_mean_tensor,
@@ -1017,51 +987,43 @@ class MklFusedBatchNormOp : public OpKernel {
     // allocate batch mean output tensor
     MklDnnShape mkl_shape_batch_mean;
     mkl_shape_batch_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context,
-                              kBatchMeanIndex,
-                              batch_mean_tensor,
-                              tf_shape_scale,
-                              mkl_shape_batch_mean);
+    AllocateOutputSetMklShape(context, kBatchMeanIndex, batch_mean_tensor,
+                              tf_shape_scale, mkl_shape_batch_mean);
     CHECK_NOTNULL(*batch_mean_tensor);
     // set NAN mean value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*batch_mean_tensor)->flat<T>().data()[k] = NAN;
 
     // allocate batch variance output tensor
     MklDnnShape mkl_shape_batch_variance;
     mkl_shape_batch_variance.SetMklTensor(false);
-    AllocateOutputSetMklShape(context,
-                              kBatchVarianceIndex,
-                              batch_variance_tensor,
-                              tf_shape_scale,
+    AllocateOutputSetMklShape(context, kBatchVarianceIndex,
+                              batch_variance_tensor, tf_shape_scale,
                               mkl_shape_batch_variance);
     CHECK_NOTNULL(*batch_variance_tensor);
     // set NAN variance value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*batch_variance_tensor)->flat<T>().data()[k] = NAN;
 
     // Mean and variance (without Bessel's correction) saved for backward
     // computation to serve as pre-computed mean and variance.
     MklDnnShape mkl_shape_saved_mean;
     mkl_shape_saved_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, kSavedMeanIndex,
-                              saved_mean_tensor,
-                              tf_shape_scale,
-                              mkl_shape_saved_mean);
+    AllocateOutputSetMklShape(context, kSavedMeanIndex, saved_mean_tensor,
+                              tf_shape_scale, mkl_shape_saved_mean);
     CHECK_NOTNULL(*saved_mean_tensor);
     // set NAN mean value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*saved_mean_tensor)->flat<T>().data()[k] = NAN;
 
     MklDnnShape mkl_shape_saved_variance;
     mkl_shape_saved_variance.SetMklTensor(false);
     AllocateOutputSetMklShape(context, kSavedVarianceIndex,
-                              saved_variance_tensor,
-                              tf_shape_scale,
+                              saved_variance_tensor, tf_shape_scale,
                               mkl_shape_saved_variance);
     CHECK_NOTNULL(*saved_variance_tensor);
     // set NAN variance value in case of empty input tensor
-    for (int k=0; k < tf_shape_scale.num_elements(); k++)
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
       (*saved_variance_tensor)->flat<T>().data()[k] = NAN;
   }
 };
@@ -1093,8 +1055,8 @@ class MklFusedBatchNormGradOp : public OpKernel {
       const Tensor& src_tensor = MklGetInput(context, kSrcIndex);
       const Tensor& scale_tensor = MklGetInput(context, kScaleIndex);
       const Tensor& saved_mean_tensor = MklGetInput(context, kMeanIndex);
-      const Tensor& saved_variance_tensor = MklGetInput(context,
-                                            kVarianceIndex);
+      const Tensor& saved_variance_tensor =
+          MklGetInput(context, kVarianceIndex);
 
       MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
       GetMklShape(context, kSrcIndex, &dnn_shape_src);
@@ -1103,53 +1065,49 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       if (dnn_shape_diff_dst.IsMklTensor()) {
         tf_shape_diff_dst = dnn_shape_diff_dst.GetTfShape();
-        OP_REQUIRES(context, dnn_shape_diff_dst.GetDimension() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        diff_dst_tensor.shape().DebugString()));
+        OP_REQUIRES(
+            context, dnn_shape_diff_dst.GetDimension() == 4,
+            errors::InvalidArgument("input must be 4-dimensional",
+                                    diff_dst_tensor.shape().DebugString()));
       } else {
         tf_shape_diff_dst = diff_dst_tensor.shape();
-        OP_REQUIRES(context, diff_dst_tensor.dims() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        diff_dst_tensor.shape().DebugString()));
+        OP_REQUIRES(
+            context, diff_dst_tensor.dims() == 4,
+            errors::InvalidArgument("input must be 4-dimensional",
+                                    diff_dst_tensor.shape().DebugString()));
       }
 
       if (dnn_shape_src.IsMklTensor()) {
         tf_shape_src = dnn_shape_src.GetTfShape();
         OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                         src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       } else {
         tf_shape_src = src_tensor.shape();
         OP_REQUIRES(context, src_tensor.dims() == 4,
-                    errors::InvalidArgument(
-                        "input must be 4-dimensional",
-                        src_tensor.shape().DebugString()));
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
       }
 
       OP_REQUIRES(context, scale_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "scale must be 1-dimensional",
-                      scale_tensor.shape().DebugString()));
-      OP_REQUIRES(context, saved_mean_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "saved mean must be 1-dimensional",
-                       saved_mean_tensor.shape().DebugString()));
-
-      OP_REQUIRES(context, saved_variance_tensor.dims() == 1,
-                  errors::InvalidArgument(
-                      "saved variance must be 1-dimensional",
-                      saved_variance_tensor.shape().DebugString()));
+                  errors::InvalidArgument("scale must be 1-dimensional",
+                                          scale_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, saved_mean_tensor.dims() == 1,
+          errors::InvalidArgument("saved mean must be 1-dimensional",
+                                  saved_mean_tensor.shape().DebugString()));
+
+      OP_REQUIRES(
+          context, saved_variance_tensor.dims() == 1,
+          errors::InvalidArgument("saved variance must be 1-dimensional",
+                                  saved_variance_tensor.shape().DebugString()));
 
       Tensor* diff_src_tensor = nullptr;
       if (tf_shape_src.num_elements() == 0 ||
           tf_shape_diff_dst.num_elements() == 0) {
-         HandleEmptyInput(context, tf_shape_src,
-                          scale_tensor.shape(),
-                          &diff_src_tensor);
-         return;
+        HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(),
+                         &diff_src_tensor);
+        return;
       }
 
       if (dnn_shape_src.IsMklTensor())
@@ -1175,20 +1133,18 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       memory::dims src_dims, diff_dst_dims;
       if (dnn_shape_src.IsMklTensor())
-        src_dims = TFShapeToMklDnnDimsInNCHW(
-                       dnn_shape_src.GetTfShape(), tensor_format_);
+        src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(),
+                                             tensor_format_);
       else
-        src_dims = TFShapeToMklDnnDimsInNCHW(
-                       src_tensor.shape(), tensor_format_);
+        src_dims =
+            TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_);
 
       if (dnn_shape_diff_dst.IsMklTensor())
         diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
-                            dnn_shape_diff_dst.GetTfShape(),
-                            tensor_format_);
+            dnn_shape_diff_dst.GetTfShape(), tensor_format_);
       else
-        diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
-                            diff_dst_tensor.shape(),
-                            tensor_format_);
+        diff_dst_dims =
+            TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), tensor_format_);
 
       // set src and diff_dst primitives
       memory::desc src_md({}, memory::data_undef, memory::format_undef);
@@ -1202,7 +1158,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
           src_md = diff_dst_md;
         }
       } else {
-        src_md =  memory::desc(src_dims, MklDnnType<T>(), format_m);
+        src_md = memory::desc(src_dims, MklDnnType<T>(), format_m);
         diff_dst_md = src_md;
       }
       src.SetUsrMem(src_md, &src_tensor);
@@ -1210,55 +1166,47 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       // weights -- DNN packs scales/shifts as weights in order of
       // scale, ..., scale, shift, ..., shift
-      auto weights_desc = memory::desc({2, depth_},
-                                       MklDnnType<T>(),
-                                       memory::format::nc);
+      auto weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
       auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
       auto weights_m = memory(weights_pd);
       T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
-      T* scale_tf = reinterpret_cast<T*>(const_cast<T*>
-                                        (scale_tensor.flat<T>().data()));
-      for (int k=0; k < depth_; k++) {
+      T* scale_tf =
+          reinterpret_cast<T*>(const_cast<T*>(scale_tensor.flat<T>().data()));
+      for (int k = 0; k < depth_; k++) {
         weights_data[k] = scale_tf[k];
         weights_data[k + depth_] = 0;
       }
 
       // set mean primitive
       memory::dims mv_dims = GetMeanVarianceDims();
-      mean.SetUsrMem(mv_dims,
-                     memory::format::nc,
-                     const_cast<void*>(static_cast<const void*>
-                     (saved_mean_tensor.flat<T>().data())));
+      mean.SetUsrMem(mv_dims, memory::format::nc,
+                     const_cast<void*>(static_cast<const void*>(
+                         saved_mean_tensor.flat<T>().data())));
       mean.SetOpMemDesc(mv_dims, memory::format::nc);
 
       // set variance primitive
-      variance.SetUsrMem(mv_dims,  memory::format::nc,
-                         const_cast<void*>(static_cast<const void*>
-                         (saved_variance_tensor.flat<T>().data())));
+      variance.SetUsrMem(mv_dims, memory::format::nc,
+                         const_cast<void*>(static_cast<const void*>(
+                             saved_variance_tensor.flat<T>().data())));
       variance.SetOpMemDesc(mv_dims, memory::format::nc);
 
       // set diff_weight primitive
-      auto diff_weights_desc = memory::desc(
-                                 {2, depth_},
-                                 MklDnnType<T>(),
-                                 memory::format::nc);
-      auto diff_weights_pd = memory::primitive_desc(
-                                diff_weights_desc,
-                                cpu_engine);
+      auto diff_weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto diff_weights_pd =
+          memory::primitive_desc(diff_weights_desc, cpu_engine);
       auto diff_weights_m = memory(diff_weights_pd);
 
       auto bnrm_fwd_desc = batch_normalization_forward::desc(
-                               prop_kind::forward_training,
-                               src.GetUsrMemDesc(),
-                               epsilon_,
-                               is_training_ ? use_scale_shift :
-                               (use_scale_shift | use_global_stats));
+          prop_kind::forward_training, src.GetUsrMemDesc(), epsilon_,
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
       auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
-                                bnrm_fwd_desc,
-                                cpu_engine);
+          bnrm_fwd_desc, cpu_engine);
 
       // Indices of output tensors
-      const size_t kDiffSrcIndex = 0;    // index of diff_src tensor
+      const size_t kDiffSrcIndex = 0;  // index of diff_src tensor
 
       // allocate diff_src tensor
       MklDnnShape dnn_shape_diff_src;
@@ -1268,14 +1216,11 @@ class MklFusedBatchNormGradOp : public OpKernel {
         auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc();
         dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
         dnn_shape_diff_src.SetElemType(MklDnnType<T>());
-        dnn_shape_diff_src.SetTfLayout(
-                              dnn_shape_src.GetDimension(),
-                              src_dims,
-                              format_m);
-        dnn_shape_diff_src.SetTfDimOrder(
-                              dnn_shape_src.GetDimension(),
-                              tensor_format_);
-        tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+        dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), src_dims,
+                                       format_m);
+        dnn_shape_diff_src.SetTfDimOrder(dnn_shape_src.GetDimension(),
+                                         tensor_format_);
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_diff_src.SetMklTensor(false);
         tf_shape_diff_src = src_tensor.shape();
@@ -1287,33 +1232,22 @@ class MklFusedBatchNormGradOp : public OpKernel {
 
       prop_kind pk = prop_kind::backward;
       auto bnrm_bwd_desc = batch_normalization_backward::desc(
-                               pk,
-                               diff_src.GetUsrMemDesc(),
-                               src.GetUsrMemDesc(),
-                               epsilon_,
-                               /* for inference, specify use_global_stats
-                                  1. on fwd prop, use mean and variance
-                                     provided as inputs
-                                  2. on bwd prop, mean and variance are
-                                     considered as constants. Thus, 
-                                     reduce the amout of MKL computations
-                               */
-                               is_training_ ? use_scale_shift :
-                               (use_scale_shift | use_global_stats));
+          pk, diff_src.GetUsrMemDesc(), src.GetUsrMemDesc(), epsilon_,
+          /* for inference, specify use_global_stats
+             1. on fwd prop, use mean and variance
+                provided as inputs
+             2. on bwd prop, mean and variance are
+                considered as constants. Thus,
+                reduce the amout of MKL computations
+          */
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
       auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc(
-                               bnrm_bwd_desc,
-                               cpu_engine,
-                               bnrm_fwd_pd);
+          bnrm_bwd_desc, cpu_engine, bnrm_fwd_pd);
 
       auto bnrm_bwd_op = batch_normalization_backward(
-                               bnrm_bwd_pd,
-                               src.GetOpMem(),
-                               mean.GetOpMem(),
-                               variance.GetOpMem(),
-                               diff_dst.GetOpMem(),
-                               weights_m,
-                               diff_src.GetOpMem(),
-                               diff_weights_m);
+          bnrm_bwd_pd, src.GetOpMem(), mean.GetOpMem(), variance.GetOpMem(),
+          diff_dst.GetOpMem(), weights_m, diff_src.GetOpMem(), diff_weights_m);
 
       std::vector<primitive> net;
       net.push_back(bnrm_bwd_op);
@@ -1322,43 +1256,39 @@ class MklFusedBatchNormGradOp : public OpKernel {
       // allocate 4 output TF tensors
       Tensor* diff_scale_tensor = nullptr;
       Tensor* diff_shift_tensor = nullptr;
-      AllocateTFOutputs(context, scale_tensor.shape(),
-                        &diff_scale_tensor,
+      AllocateTFOutputs(context, scale_tensor.shape(), &diff_scale_tensor,
                         &diff_shift_tensor);
 
       // copy data: diff_scale and diff_shift
-      T* diff_weights_data_dnn = reinterpret_cast<T*>
-                                 (diff_weights_m.get_data_handle());
+      T* diff_weights_data_dnn =
+          reinterpret_cast<T*>(diff_weights_m.get_data_handle());
       for (int i = 0; i < depth_; i++) {
-        diff_scale_tensor->flat<T>().data()[i] =
-                              diff_weights_data_dnn[i];
+        diff_scale_tensor->flat<T>().data()[i] = diff_weights_data_dnn[i];
         diff_shift_tensor->flat<T>().data()[i] =
-                              diff_weights_data_dnn[i + depth_];
+            diff_weights_data_dnn[i + depth_];
       }
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                          ", message: " + string(e.message) +
-                          ", in file " + string(__FILE__) + ":" +
-                          std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
  private:
   T epsilon_;
   TensorFormat tensor_format_;
-  int depth_;             // batch normalization is done for per channel.
+  int depth_;  // batch normalization is done for per channel.
   bool is_training_;
 
   void ExtractParams(OpKernelContext* context) {
-      const Tensor& input = MklGetInput(context, 0);
-      depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
+    const Tensor& input = MklGetInput(context, 0);
+    depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
   }
 
-  void HandleEmptyInput(OpKernelContext* context,
-                        TensorShape tf_shape_src,
+  void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src,
                         TensorShape tf_shape_scale_shift,
                         Tensor** diff_src_tensor) {
     const size_t kDiffSrcIndex = 0;
@@ -1366,22 +1296,20 @@ class MklFusedBatchNormGradOp : public OpKernel {
     MklDnnShape dnn_shape_diff_src;
     dnn_shape_diff_src.SetMklTensor(false);
     AllocateOutputSetMklShape(context, kDiffSrcIndex, diff_src_tensor,
-                                tf_shape_src, dnn_shape_diff_src);
-    for (size_t i=0; i < (*diff_src_tensor)->shape().num_elements(); i++)
-       (*diff_src_tensor)->flat<T>().data()[i] = 0;
+                              tf_shape_src, dnn_shape_diff_src);
+    for (size_t i = 0; i < (*diff_src_tensor)->shape().num_elements(); i++)
+      (*diff_src_tensor)->flat<T>().data()[i] = 0;
 
     Tensor* diff_scale_tensor = nullptr;
     Tensor* diff_shift_tensor = nullptr;
-    AllocateTFOutputs(context,
-                      tf_shape_scale_shift,
-                      &diff_scale_tensor,
+    AllocateTFOutputs(context, tf_shape_scale_shift, &diff_scale_tensor,
                       &diff_shift_tensor);
   }
 
   void AllocateTFOutputs(OpKernelContext* context,
-                        TensorShape tf_shape_scale_shift,
-                        Tensor** diff_scale_tensor,
-                        Tensor** diff_shift_tensor) {
+                         TensorShape tf_shape_scale_shift,
+                         Tensor** diff_scale_tensor,
+                         Tensor** diff_shift_tensor) {
     CHECK_NOTNULL(diff_scale_tensor);
     CHECK_NOTNULL(diff_shift_tensor);
 
@@ -1396,31 +1324,29 @@ class MklFusedBatchNormGradOp : public OpKernel {
     AllocateOutputSetMklShape(context, kDiffScaleIndex, diff_scale_tensor,
                               tf_shape_scale_shift, mkl_shape_diff_scale);
     CHECK_NOTNULL(*diff_scale_tensor);
-    for (size_t i=0; i < (*diff_scale_tensor)->shape().num_elements(); i++)
-       (*diff_scale_tensor)->flat<T>().data()[i] = 0;
+    for (size_t i = 0; i < (*diff_scale_tensor)->shape().num_elements(); i++)
+      (*diff_scale_tensor)->flat<T>().data()[i] = 0;
 
     MklDnnShape mkl_shape_diff_shift;
     mkl_shape_diff_shift.SetMklTensor(false);
     AllocateOutputSetMklShape(context, kDiffShiftIndex, diff_shift_tensor,
                               tf_shape_scale_shift, mkl_shape_diff_shift);
     CHECK_NOTNULL(*diff_shift_tensor);
-    for (size_t i=0; i < (*diff_shift_tensor)->shape().num_elements(); i++)
-       (*diff_shift_tensor)->flat<T>().data()[i] = 0;
+    for (size_t i = 0; i < (*diff_shift_tensor)->shape().num_elements(); i++)
+      (*diff_shift_tensor)->flat<T>().data()[i] = 0;
 
     // Placeholders for estimated_mean and estimated_variance, which are
     // used for inference and thus not needed here for gradient computation.
-    Tensor* p1_tensor = nullptr, *p2_tensor = nullptr;
+    Tensor *p1_tensor = nullptr, *p2_tensor = nullptr;
     MklDnnShape mkl_shape_p;
     mkl_shape_p.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, kP1Index, &p1_tensor,
-                              TensorShape({}), mkl_shape_p);
-    AllocateOutputSetMklShape(context, kP2Index, &p2_tensor,
-                              TensorShape({}), mkl_shape_p);
+    AllocateOutputSetMklShape(context, kP1Index, &p1_tensor, TensorShape({}),
+                              mkl_shape_p);
+    AllocateOutputSetMklShape(context, kP2Index, &p2_tensor, TensorShape({}),
+                              mkl_shape_p);
   }
 
-  memory::dims GetMeanVarianceDims() {
-    return memory::dims({1, depth_});
-  }
+  memory::dims GetMeanVarianceDims() { return memory::dims({1, depth_}); }
 };
 
 #endif
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 4b5f7b8310..73d41efce1 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -271,8 +271,8 @@ class MklInputConversionOp : public OpKernel {
     MklDnnShape input_shape_1;
     GetMklShape(context, 1, &input_shape_1);
 
-    bool tf_shapes_are_same = context->input(0).shape() ==
-                              context->input(1).shape();
+    bool tf_shapes_are_same =
+        context->input(0).shape() == context->input(1).shape();
 
     VLOG(1) << "MklInputConversionOp: Input shapes are "
             << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
@@ -400,9 +400,9 @@ class MklInputConversionOp : public OpKernel {
 
       // Create reorder between tensorflow layout and Mkl layout.
       std::vector<primitive> net;
-      CHECK_EQ(tf_input.CheckReorderToOpMem(memory::primitive_desc(
-                                            output_mkl_md, cpu_engine),
-                                            tensor_out, &net),
+      CHECK_EQ(tf_input.CheckReorderToOpMem(
+                   memory::primitive_desc(output_mkl_md, cpu_engine),
+                   tensor_out, &net),
                true);
       stream(stream::kind::eager).submit(net).wait();
 
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 95e0404ba8..a8b45004b7 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -22,6 +22,9 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 #include <vector>
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -30,9 +33,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
@@ -40,10 +40,10 @@ limitations under the License.
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
-using mkldnn::lrn_forward;
+using mkldnn::lrn_across_channels;
 using mkldnn::lrn_backward;
+using mkldnn::lrn_forward;
 using mkldnn::prop_kind;
-using mkldnn::lrn_across_channels;
 using mkldnn::stream;
 #endif
 
@@ -77,10 +77,11 @@ class MklLRNOp : public OpKernel {
   explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<size_t>(depth_radius64);
 
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
@@ -103,9 +104,10 @@ class MklLRNOp : public OpKernel {
                               : input.dims();
     OP_REQUIRES(context, mkl_context.in_dims == 4,
                 errors::InvalidArgument("input must be 4-dimensional"));
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("argument to LRN too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
+        errors::InvalidArgument("argument to LRN too large"));
 
     if (!input_in_mkl_format) {
       mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
@@ -339,17 +341,17 @@ class MklLRNOp : public OpKernel {
   float beta_;
 };
 
-
 template <typename T>
 class MklLRNGradOp : public OpKernel {
  public:
   explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
@@ -740,10 +742,11 @@ class MklLRNOp : public OpKernel {
   explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<size_t>(depth_radius64);
 
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
@@ -773,10 +776,10 @@ class MklLRNOp : public OpKernel {
       if (!src_dnn_shape.IsMklTensor()) {
         MklDefaultToEigen(context, src_tensor);
         return;
-      } else if (!src_dnn_shape.IsMklChannelDim(
-                  src_dnn_shape.GetDimension() - 1) ) {
+      } else if (!src_dnn_shape.IsMklChannelDim(src_dnn_shape.GetDimension() -
+                                                1)) {
         Tensor converted_tensor =
-          ConvertMklToTF<T>(context, src_tensor, src_dnn_shape);
+            ConvertMklToTF<T>(context, src_tensor, src_dnn_shape);
         MklDefaultToEigen(context, converted_tensor);
         return;
       }
@@ -807,18 +810,16 @@ class MklLRNOp : public OpKernel {
       // Create LRN primitive descriptor.
       // Tensorflow's normalization semantics is across channels.
       // MKL-DNN also supports normalization within channel.
-      auto lrn_desc = lrn_forward::desc(prop_kind::forward,
-                                        lrn_across_channels,
+      auto lrn_desc = lrn_forward::desc(prop_kind::forward, lrn_across_channels,
                                         src_dnn_data.GetUsrMemDesc(),
-                                        kernel_size,
-                                        new_alpha, beta_, bias_);
+                                        kernel_size, new_alpha, beta_, bias_);
       auto lrn_prim_desc = lrn_forward::primitive_desc(lrn_desc, cpu_engine);
 
       // Allocate output_dnn_data tensor.
       Tensor* output_tensor = nullptr;
       memory::format input_format = src_dnn_shape.GetTfDataFormat();
-      AllocateOutputTensor(context, lrn_prim_desc, input_dims,
-                    input_format, &output_tensor);
+      AllocateOutputTensor(context, lrn_prim_desc, input_dims, input_format,
+                           &output_tensor);
       OP_REQUIRES_OK(context, context->status());
       CHECK_NOTNULL(output_tensor);
       dst_dnn_data.SetUsrMemDataHandle(output_tensor);
@@ -827,25 +828,23 @@ class MklLRNOp : public OpKernel {
       AllocateWorkspaceTensor(context, lrn_prim_desc, &workspace_dnn_data);
       OP_REQUIRES_OK(context, context->status());
 
-      PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data,
-                          &dst_dnn_data, &workspace_dnn_data);
-    } catch (mkldnn::error &e) {
+      PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data, &dst_dnn_data,
+                           &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
  private:
-  void PrepareAndExecuteNet(
-                const lrn_forward::primitive_desc& lrn_fwd_desc,
-                MklDnnData<T>* src_dnn_data,
-                MklDnnData<T>* dst_dnn_data,
-                MklDnnData<uint8>* wksp_dnn_data = nullptr) {
+  void PrepareAndExecuteNet(const lrn_forward::primitive_desc& lrn_fwd_desc,
+                            MklDnnData<T>* src_dnn_data,
+                            MklDnnData<T>* dst_dnn_data,
+                            MklDnnData<uint8>* wksp_dnn_data = nullptr) {
     std::vector<primitive> net;
 
     // Check for input reorder
@@ -853,23 +852,21 @@ class MklLRNOp : public OpKernel {
 
     // Create pooling primitive and add it to net
     if (wksp_dnn_data != nullptr) {
-        net.push_back(lrn_forward(lrn_fwd_desc,
-                        src_dnn_data->GetOpMem(),
-                        wksp_dnn_data->GetOpMem(),
-                        dst_dnn_data->GetOpMem()));
+      net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(),
+                                wksp_dnn_data->GetOpMem(),
+                                dst_dnn_data->GetOpMem()));
     } else {
-        net.push_back(lrn_forward(lrn_fwd_desc,
-            src_dnn_data->GetOpMem(),
-            dst_dnn_data->GetOpMem()));
+      net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(),
+                                dst_dnn_data->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
     memory::primitive_desc dst_pd = lrn_fwd_prim_desc.dst_primitive_desc();
 
@@ -880,111 +877,106 @@ class MklLRNOp : public OpKernel {
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
     TensorShape output_tf_shape;
     // only allocate enough space for the elements we need.
     size_t num_bytes = dst_pd.get_size();
     CHECK_EQ(num_bytes % sizeof(T), 0);
     output_tf_shape.AddDim(num_bytes / sizeof(T));
-    AllocateOutputSetMklShape(context, kIdxOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
-    }
-
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context,
-                           const Tensor& input) {
-      const int batch = static_cast<int>(input.dim_size(0));
-      const int rows = static_cast<int>(input.dim_size(1));
-      const int cols = static_cast<int>(input.dim_size(2));
-      const int depth = static_cast<int>(input.dim_size(3));
-      const int nodes = cols * rows;
-
-      auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
-      // Multiplying the input with the band matrix has the effect of reducing
-      // the
-      // correct patch along the depth.
-      Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
-      GetBandMatrix<T>(depth, depth_radius_, &multiplier);
+    AllocateOutputSetMklShape(context, kIdxOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
+  }
 
-      Tensor *output_dnn_data = nullptr;
-      MklDnnShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
-                                input.shape(), mkl_output_mkl_shape);
-      CHECK_NOTNULL(output_dnn_data);
-
-      Tensor* workspace_tensor = nullptr;
-      MklDnnShape workspace_mkl_shape;
-      workspace_mkl_shape.SetMklTensor(false);
-      TensorShape workspace_tf_shape;
-      workspace_tf_shape.AddDim(0);
-      AllocateOutputSetMklShape(context, kIdxWorkspace,
-                              &workspace_tensor,
+  // Fallback implementation - Taken from lrn_op.cc
+  // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
+  // copy.
+  void MklDefaultToEigen(OpKernelContext* context, const Tensor& input) {
+    const int batch = static_cast<int>(input.dim_size(0));
+    const int rows = static_cast<int>(input.dim_size(1));
+    const int cols = static_cast<int>(input.dim_size(2));
+    const int depth = static_cast<int>(input.dim_size(3));
+    const int nodes = cols * rows;
+
+    auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
+    // Multiplying the input with the band matrix has the effect of reducing
+    // the
+    // correct patch along the depth.
+    Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
+    GetBandMatrix<T>(depth, depth_radius_, &multiplier);
+
+    Tensor* output_dnn_data = nullptr;
+    MklDnnShape mkl_output_mkl_shape;
+    mkl_output_mkl_shape.SetMklTensor(false);
+    mkl_output_mkl_shape.SetDimensions(4);
+    AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
+                              input.shape(), mkl_output_mkl_shape);
+    CHECK_NOTNULL(output_dnn_data);
+
+    Tensor* workspace_tensor = nullptr;
+    MklDnnShape workspace_mkl_shape;
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(0);
+    AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor,
                               workspace_tf_shape, workspace_mkl_shape);
-      CHECK_NOTNULL(workspace_tensor);
-
-      auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
-      Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
-      auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
-      if (beta_ == T(1)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.inverse();
-      } else if (beta_ == T(0.5)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.rsqrt();
-      } else {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * (tmp.log() * -beta_).exp();
-      }
+    CHECK_NOTNULL(workspace_tensor);
+
+    auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
+    Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+    auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
+    if (beta_ == T(1)) {
+      out_shaped.device(context->eigen_cpu_device()) =
+          in_shaped * tmp.inverse();
+    } else if (beta_ == T(0.5)) {
+      out_shaped.device(context->eigen_cpu_device()) = in_shaped * tmp.rsqrt();
+    } else {
+      out_shaped.device(context->eigen_cpu_device()) =
+          in_shaped * (tmp.log() * -beta_).exp();
     }
+  }
 
-    void AllocateWorkspaceTensor(OpKernelContext* context,
-                const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
-                MklDnnData<uint8>* dnn_data_wksp) {
-      CHECK_NOTNULL(dnn_data_wksp);
-      Tensor* workspace_tensor = nullptr;
-      memory::primitive_desc workspace_pd
-                  = lrn_fwd_prim_desc.workspace_primitive_desc();
-      size_t workspace_bytes = workspace_pd.get_size();
-      MklDnnShape workspace_mkl_shape;
-      // the workspace tensor is a uint8 tensor that has
-      // exactly the number of bytes necessary
-      workspace_mkl_shape.SetMklTensor(false);
-      TensorShape workspace_tf_shape;
-      workspace_tf_shape.AddDim(workspace_bytes);
-      AllocateOutputSetMklShape(context, kIdxWorkspace,
-                              &workspace_tensor,
+  void AllocateWorkspaceTensor(
+      OpKernelContext* context,
+      const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
+      MklDnnData<uint8>* dnn_data_wksp) {
+    CHECK_NOTNULL(dnn_data_wksp);
+    Tensor* workspace_tensor = nullptr;
+    memory::primitive_desc workspace_pd =
+        lrn_fwd_prim_desc.workspace_primitive_desc();
+    size_t workspace_bytes = workspace_pd.get_size();
+    MklDnnShape workspace_mkl_shape;
+    // the workspace tensor is a uint8 tensor that has
+    // exactly the number of bytes necessary
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(workspace_bytes);
+    AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor,
                               workspace_tf_shape, workspace_mkl_shape);
-      CHECK_NOTNULL(workspace_tensor);
-      dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
-    }
+    CHECK_NOTNULL(workspace_tensor);
+    dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
+  }
 
   void SanityCheckInputs(OpKernelContext* context) {
     const Tensor& src_tensor = MklGetInput(context, kIdxInput);
     MklDnnShape src_dnn_shape;
     GetMklShape(context, kIdxInput, &src_dnn_shape);
     if (src_dnn_shape.IsMklTensor()) {
-        OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4,
-                    errors::InvalidArgument("input must be 4-dimensional"));
-        OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(),
-                                            std::numeric_limits<int>::max()),
-                        errors::InvalidArgument("argument to LRN too large"));
+      OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional"));
+      OP_REQUIRES(context,
+                  FastBoundsCheck(src_tensor.NumElements(),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("argument to LRN too large"));
     } else {
-        OP_REQUIRES(context, src_tensor.dims() == 4,
-                    errors::InvalidArgument("input must be 4-dimensional"));
-        OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(),
-                                            std::numeric_limits<int>::max()),
-                        errors::InvalidArgument("argument to LRN too large"));
+      OP_REQUIRES(context, src_tensor.dims() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional"));
+      OP_REQUIRES(context,
+                  FastBoundsCheck(src_tensor.NumElements(),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("argument to LRN too large"));
     }
   }
-  const int kIdxInput = 0,
-            kIdxOutput = 0,
-            kIdxWorkspace = 1;
+  const int kIdxInput = 0, kIdxOutput = 0, kIdxWorkspace = 1;
 
   typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
   bool workspace_enabled_;
@@ -994,17 +986,17 @@ class MklLRNOp : public OpKernel {
   float beta_;
 };
 
-
 template <typename T>
 class MklLRNGradOp : public OpKernel {
  public:
   explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
@@ -1025,7 +1017,7 @@ class MklLRNGradOp : public OpKernel {
       MklDnnData<T> output_dnn_data(&cpu_engine);
 
       MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape,
-                  orig_output_dnn_shape;
+          orig_output_dnn_shape;
       GetMklShape(context, kIdxGradient, &input_grad_dnn_shape);
       GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape);
       GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape);
@@ -1037,16 +1029,16 @@ class MklLRNGradOp : public OpKernel {
                             orig_input_dnn_shape.IsMklTensor() &&
                             orig_output_dnn_shape.IsMklTensor() &&
                             input_grad_dnn_shape.IsMklChannelDim(
-                              input_grad_dnn_shape.GetDimension() - 1) &&
+                                input_grad_dnn_shape.GetDimension() - 1) &&
                             orig_input_dnn_shape.IsMklChannelDim(
-                              orig_input_dnn_shape.GetDimension() - 1) &&
+                                orig_input_dnn_shape.GetDimension() - 1) &&
                             orig_output_dnn_shape.IsMklChannelDim(
-                              orig_output_dnn_shape.GetDimension() - 1);
+                                orig_output_dnn_shape.GetDimension() - 1);
 
       if (!can_use_mkldnn) {
-          // Fallback to eigen
-          MklDefaultToEigen(context);
-          return;
+        // Fallback to eigen
+        MklDefaultToEigen(context);
+        return;
       }
       // At this point, we have the all clear to use MklDnn constructs
       // Naming: diff_dst is input_gradient_tensor; src is orig_input_tensor.
@@ -1059,13 +1051,11 @@ class MklLRNGradOp : public OpKernel {
       // NHWC format.
       memory::desc original_output_md = orig_output_dnn_shape.GetCurLayout();
       memory::desc target_diff_dst_md = ConfigureInputGradient(
-                                input_grad_tensor,
-                                input_grad_dnn_shape,
-                                &input_grad_dnn_data);
+          input_grad_tensor, input_grad_dnn_shape, &input_grad_dnn_data);
 
       memory::desc orig_input_md = orig_input_dnn_shape.GetCurLayout();
       memory::dims orig_input_dims =
-                orig_input_dnn_shape.GetSizesAsMklDnnDims();
+          orig_input_dnn_shape.GetSizesAsMklDnnDims();
       orig_input_dnn_data.SetUsrMem(orig_input_md, &orig_input_tensor);
       orig_input_dnn_data.SetOpMemDesc(orig_input_dims, memory::format::nhwc);
 
@@ -1079,27 +1069,21 @@ class MklLRNGradOp : public OpKernel {
 
       // Create LRN backward primitive descriptor. It requires LRN forward
       // primitive descriptor also.
-      auto lrn_fwd_desc = lrn_forward::desc(prop_kind::forward,
-                                        lrn_across_channels,
-                                        orig_input_md,
-                                        kernel_size,
-                                        new_alpha, beta_, bias_);
-      auto lrn_fwd_prim_desc = lrn_forward::primitive_desc(lrn_fwd_desc,
-                                                           cpu_engine);
-      auto lrn_bwd_desc = lrn_backward::desc(lrn_across_channels,
-                                        original_output_md,
-                                        target_diff_dst_md,
-                                        kernel_size,
-                                        new_alpha, beta_, bias_);
-      auto lrn_bwd_prim_desc = lrn_backward::primitive_desc(lrn_bwd_desc,
-                                                          cpu_engine,
-                                                          lrn_fwd_prim_desc);
+      auto lrn_fwd_desc = lrn_forward::desc(
+          prop_kind::forward, lrn_across_channels, orig_input_md, kernel_size,
+          new_alpha, beta_, bias_);
+      auto lrn_fwd_prim_desc =
+          lrn_forward::primitive_desc(lrn_fwd_desc, cpu_engine);
+      auto lrn_bwd_desc = lrn_backward::desc(
+          lrn_across_channels, original_output_md, target_diff_dst_md,
+          kernel_size, new_alpha, beta_, bias_);
+      auto lrn_bwd_prim_desc = lrn_backward::primitive_desc(
+          lrn_bwd_desc, cpu_engine, lrn_fwd_prim_desc);
 
       Tensor* output_tensor = nullptr;
-      memory::format orig_input_format
-                    = orig_input_dnn_shape.GetTfDataFormat();
-      AllocateOutputTensor(context, lrn_bwd_prim_desc,
-            orig_input_dims, orig_input_format, &output_tensor);
+      memory::format orig_input_format = orig_input_dnn_shape.GetTfDataFormat();
+      AllocateOutputTensor(context, lrn_bwd_prim_desc, orig_input_dims,
+                           orig_input_format, &output_tensor);
       OP_REQUIRES_OK(context, context->status());
       CHECK_NOTNULL(output_tensor);
       output_dnn_data.SetUsrMemDataHandle(output_tensor);
@@ -1110,35 +1094,32 @@ class MklLRNGradOp : public OpKernel {
       const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace);
       MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
       ConfigureWorkspace(workspace_tensor,
-                          lrn_fwd_prim_desc.workspace_primitive_desc(),
-                          &workspace_dnn_data);
-
-      PrepareAndExecuteNet(lrn_bwd_prim_desc,
-                lrn_fwd_prim_desc,
-                &orig_input_dnn_data,
-                &input_grad_dnn_data,
-                &output_dnn_data,
-                memory::primitive_desc(target_diff_dst_md, cpu_engine),
-                &workspace_dnn_data);
-    } catch (mkldnn::error &e) {
+                         lrn_fwd_prim_desc.workspace_primitive_desc(),
+                         &workspace_dnn_data);
+
+      PrepareAndExecuteNet(
+          lrn_bwd_prim_desc, lrn_fwd_prim_desc, &orig_input_dnn_data,
+          &input_grad_dnn_data, &output_dnn_data,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine),
+          &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                     error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const lrn_backward::primitive_desc& lrn_bkwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const lrn_backward::primitive_desc& lrn_bkwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
-    memory::primitive_desc dst_pd
-                = lrn_bkwd_prim_desc.diff_src_primitive_desc();
+    memory::primitive_desc dst_pd =
+        lrn_bkwd_prim_desc.diff_src_primitive_desc();
     MklDnnShape output_mkl_shape;
 
     // We assume that all outputs at this point are MKL Tensors
@@ -1146,170 +1127,153 @@ class MklLRNGradOp : public OpKernel {
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
 
     TensorShape output_tf_shape;
     size_t num_bytes = dst_pd.get_size();
     CHECK_EQ(num_bytes % sizeof(T), 0);
     output_tf_shape.AddDim(num_bytes / sizeof(T));
-    AllocateOutputSetMklShape(context, kIdxOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
+    AllocateOutputSetMklShape(context, kIdxOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
   }
 
   memory::desc ConfigureInputGradient(const Tensor& input_grad_tensor,
-                    const MklDnnShape& input_grad_dnn_shape,
-                    MklDnnData<T> *input_grad_dnn_data) {
+                                      const MklDnnShape& input_grad_dnn_shape,
+                                      MklDnnData<T>* input_grad_dnn_data) {
     CHECK_NOTNULL(input_grad_dnn_data);
     // This shouldn't be necessary at this point, but just in case
     CHECK_EQ(input_grad_dnn_shape.IsMklTensor(), true);
 
     memory::desc input_grad_md = input_grad_dnn_shape.GetCurLayout();
-    memory::dims orig_input_dims =
-              input_grad_dnn_shape.GetSizesAsMklDnnDims();
+    memory::dims orig_input_dims = input_grad_dnn_shape.GetSizesAsMklDnnDims();
     input_grad_dnn_data->SetUsrMem(input_grad_md, &input_grad_tensor);
     input_grad_dnn_data->SetOpMemDesc(orig_input_dims, memory::format::nhwc);
     return input_grad_md;
   }
 
   void PrepareAndExecuteNet(
-        const lrn_backward::primitive_desc& lrn_bkwd_desc,
-        const lrn_forward::primitive_desc& lrn_fwd_desc,
-        MklDnnData<T>* src_dnn_data,
-        MklDnnData<T>* input_gradient_diff_dst,
-        MklDnnData<T>* output_diff_src,
-        const memory::primitive_desc& target_diff_dst_pd,
-        const MklDnnData<uint8>* workspace_dnn_data = nullptr) {
+      const lrn_backward::primitive_desc& lrn_bkwd_desc,
+      const lrn_forward::primitive_desc& lrn_fwd_desc,
+      MklDnnData<T>* src_dnn_data, MklDnnData<T>* input_gradient_diff_dst,
+      MklDnnData<T>* output_diff_src,
+      const memory::primitive_desc& target_diff_dst_pd,
+      const MklDnnData<uint8>* workspace_dnn_data = nullptr) {
     std::vector<primitive> net;
 
     // Check for input reordering on the diff dst input
     input_gradient_diff_dst->CheckReorderToOpMem(
-                lrn_bkwd_desc.diff_dst_primitive_desc(), &net);
+        lrn_bkwd_desc.diff_dst_primitive_desc(), &net);
 
     // Check for input reordering on the original input
-    src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(),
-                                      &net);
+    src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(), &net);
     // Create pooling primitive and add it to net
     if (nullptr == workspace_dnn_data) {
-      net.push_back(lrn_backward(lrn_bkwd_desc,
-                              src_dnn_data->GetOpMem(),
-                              input_gradient_diff_dst->GetOpMem(),
-                              output_diff_src->GetOpMem()));
+      net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(),
+                                 input_gradient_diff_dst->GetOpMem(),
+                                 output_diff_src->GetOpMem()));
     } else {
-      net.push_back(lrn_backward(lrn_bkwd_desc,
-                                src_dnn_data->GetOpMem(),
-                                  input_gradient_diff_dst->GetOpMem(),
-                                  workspace_dnn_data->GetOpMem(),
-                                  output_diff_src->GetOpMem()));
+      net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(),
+                                 input_gradient_diff_dst->GetOpMem(),
+                                 workspace_dnn_data->GetOpMem(),
+                                 output_diff_src->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
 
   void ConfigureWorkspace(const Tensor& workspace_tensor,
-                    memory::primitive_desc workspace_pd,
-                    MklDnnData<uint8> *workspace_dnn_data) {
+                          memory::primitive_desc workspace_pd,
+                          MklDnnData<uint8>* workspace_dnn_data) {
     CHECK_NOTNULL(workspace_dnn_data);
 
     workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
   }
 
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context) {
-      Tensor input_gradient_tensor;
-      Tensor orig_input_tensor;
-      Tensor orig_output_tensor;
-
-      MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape,
-                  orig_output_dnn_shape;
-      GetMklShape(context, kIdxGradient, &input_grad_dnn_shape);
-      GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape);
-      GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape);
-
-      if (input_grad_dnn_shape.IsMklTensor()) {
-        input_gradient_tensor =
-            ConvertMklToTF<T>(context,
-                              MklGetInput(context, kIdxGradient),
-                              input_grad_dnn_shape);
-      } else {
-        input_gradient_tensor = MklGetInput(context, kIdxGradient);
-      }
-
-      if (orig_input_dnn_shape.IsMklTensor()) {
-        orig_input_tensor =
-            ConvertMklToTF<T>(context,
-                              MklGetInput(context, kIdxOrigInput),
-                              orig_input_dnn_shape);
-      } else {
-        orig_input_tensor = MklGetInput(context, kIdxOrigInput);
-      }
+  // Fallback implementation - Taken from lrn_op.cc
+  // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
+  // copy.
+  void MklDefaultToEigen(OpKernelContext* context) {
+    Tensor input_gradient_tensor;
+    Tensor orig_input_tensor;
+    Tensor orig_output_tensor;
+
+    MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape,
+        orig_output_dnn_shape;
+    GetMklShape(context, kIdxGradient, &input_grad_dnn_shape);
+    GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape);
+    GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape);
+
+    if (input_grad_dnn_shape.IsMklTensor()) {
+      input_gradient_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxGradient), input_grad_dnn_shape);
+    } else {
+      input_gradient_tensor = MklGetInput(context, kIdxGradient);
+    }
 
-      if (orig_output_dnn_shape.IsMklTensor()) {
-        orig_output_tensor =
-            ConvertMklToTF<T>(context,
-                              MklGetInput(context, kIdxOrigOutput),
-                              orig_output_dnn_shape);
-      } else {
-        orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
-      }
+    if (orig_input_dnn_shape.IsMklTensor()) {
+      orig_input_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxOrigInput), orig_input_dnn_shape);
+    } else {
+      orig_input_tensor = MklGetInput(context, kIdxOrigInput);
+    }
 
-      const int64 batch = static_cast<int64>(input_gradient_tensor.dim_size(0));
-      const int64 rows = static_cast<int64>(input_gradient_tensor.dim_size(1));
-      const int64 cols = static_cast<int64>(input_gradient_tensor.dim_size(2));
-      const int64 depth = static_cast<int64>(input_gradient_tensor.dim_size(3));
-      const auto nodes = cols * rows;
+    if (orig_output_dnn_shape.IsMklTensor()) {
+      orig_output_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxOrigOutput), orig_output_dnn_shape);
+    } else {
+      orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
+    }
 
-      auto grads_shaped =
-          input_gradient_tensor.shaped<T, 2>({nodes * batch, depth});
+    const int64 batch = static_cast<int64>(input_gradient_tensor.dim_size(0));
+    const int64 rows = static_cast<int64>(input_gradient_tensor.dim_size(1));
+    const int64 cols = static_cast<int64>(input_gradient_tensor.dim_size(2));
+    const int64 depth = static_cast<int64>(input_gradient_tensor.dim_size(3));
+    const auto nodes = cols * rows;
 
-      auto in_shaped = orig_input_tensor.shaped<T, 2>({nodes * batch, depth});
-      auto activations =
-              orig_output_tensor.shaped<T, 2>({nodes * batch, depth});
+    auto grads_shaped =
+        input_gradient_tensor.shaped<T, 2>({nodes * batch, depth});
 
-      Tensor* output_dnn_data;
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, kIdxOutput,
-                      &output_dnn_data,
-                      input_gradient_tensor.shape(),
-                      mkl_output_mkl_shape);
+    auto in_shaped = orig_input_tensor.shaped<T, 2>({nodes * batch, depth});
+    auto activations = orig_output_tensor.shaped<T, 2>({nodes * batch, depth});
 
-      auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
-      out_shaped.setZero();
-      auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
-                    depth](int64 begin, int64 end) {
-        for (int64 i = begin; i < end; ++i) {
-          for (int64 j = 0; j < depth; ++j) {
-            int64 depth_begin = std::max<int64>(0, j - depth_radius_);
-            int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+    Tensor* output_dnn_data;
+    MklShape mkl_output_mkl_shape;
+    mkl_output_mkl_shape.SetMklTensor(false);
+    mkl_output_mkl_shape.SetDimensions(4);
+    AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
+                              input_gradient_tensor.shape(),
+                              mkl_output_mkl_shape);
 
-            T norm(0);
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              norm += in_shaped(i, k) * in_shaped(i, k);
-            }
-            norm = alpha_ * norm + bias_;
-            DCHECK_GT(norm, T(1e-6));
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
-                      activations(i, j) / norm;
-              if (k == j) {
-                dyi += Eigen::numext::pow(norm, -beta_);
-              }
-              dyi *= grads_shaped(i, j);
-              const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) +=
-                  dyi;
+    auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
+    out_shaped.setZero();
+    auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
+                  depth](int64 begin, int64 end) {
+      for (int64 i = begin; i < end; ++i) {
+        for (int64 j = 0; j < depth; ++j) {
+          int64 depth_begin = std::max<int64>(0, j - depth_radius_);
+          int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+
+          T norm(0);
+          for (int64 k = depth_begin; k < depth_end; ++k) {
+            norm += in_shaped(i, k) * in_shaped(i, k);
+          }
+          norm = alpha_ * norm + bias_;
+          DCHECK_GT(norm, T(1e-6));
+          for (int64 k = depth_begin; k < depth_end; ++k) {
+            T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
+                    activations(i, j) / norm;
+            if (k == j) {
+              dyi += Eigen::numext::pow(norm, -beta_);
             }
+            dyi *= grads_shaped(i, j);
+            const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) += dyi;
           }
         }
-      };
-      auto worker_threads =
-          *(context->device()->tensorflow_cpu_worker_threads());
-      Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
-            depth * depth, shard);
-    }
+      }
+    };
+    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+    Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
+          depth * depth, shard);
+  }
 
   void SanityCheckInputs(OpKernelContext* context) {
     const Tensor& input_gradient_tensor = MklGetInput(context, kIdxGradient);
@@ -1317,59 +1281,59 @@ class MklLRNGradOp : public OpKernel {
     const Tensor& orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
     const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace);
     MklDnnShape in_grads_dnn_shape, in_image_dnn_shape, out_image_dnn_shape,
-                workspace_dnn_shape;
+        workspace_dnn_shape;
     GetMklShape(context, kIdxGradient, &in_grads_dnn_shape);
     GetMklShape(context, kIdxOrigInput, &in_image_dnn_shape);
     GetMklShape(context, kIdxOrigOutput, &out_image_dnn_shape);
     GetMklShape(context, kIdxWorkspace, &workspace_dnn_shape);
     if (in_grads_dnn_shape.IsMklTensor()) {
       OP_REQUIRES(context, in_grads_dnn_shape.GetDimension() == 4,
-                errors::InvalidArgument("Input gradient must be "
-                "4-dimensional"));
+                  errors::InvalidArgument("Input gradient must be "
+                                          "4-dimensional"));
     } else {
-      OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
-            errors::InvalidArgument("input gradient must be 4-dimensional"));
+      OP_REQUIRES(
+          context, input_gradient_tensor.dims() == 4,
+          errors::InvalidArgument("input gradient must be 4-dimensional"));
     }
 
     if (in_image_dnn_shape.IsMklTensor()) {
       OP_REQUIRES(context, in_image_dnn_shape.GetDimension() == 4,
-                errors::InvalidArgument("input images must be "
-                "4-dimensional"));
+                  errors::InvalidArgument("input images must be "
+                                          "4-dimensional"));
     } else {
       OP_REQUIRES(context, orig_input_tensor.dims() == 4,
                   errors::InvalidArgument("input images must be "
-                  "4-dimensional"));
+                                          "4-dimensional"));
     }
 
     if (out_image_dnn_shape.IsMklTensor()) {
       OP_REQUIRES(context, out_image_dnn_shape.GetDimension() == 4,
-                errors::InvalidArgument("Output image must be "
-                "4-dimensional"));
+                  errors::InvalidArgument("Output image must be "
+                                          "4-dimensional"));
     } else {
-      OP_REQUIRES(context, orig_output_tensor.dims() == 4,
-            errors::InvalidArgument("Output image must be 4-dimensional"));
+      OP_REQUIRES(
+          context, orig_output_tensor.dims() == 4,
+          errors::InvalidArgument("Output image must be 4-dimensional"));
     }
 
     if (workspace_enabled_) {
       if (workspace_dnn_shape.IsMklTensor()) {
-        OP_REQUIRES(context, workspace_dnn_shape.IsMklTensor() == false,
-              errors::InvalidArgument("Workspace should not be MKL Tensor."));
+        OP_REQUIRES(
+            context, workspace_dnn_shape.IsMklTensor() == false,
+            errors::InvalidArgument("Workspace should not be MKL Tensor."));
       } else {
         OP_REQUIRES(context, workspace_tensor.dims() == 1,
-              errors::InvalidArgument("Workspace must be 1-dimensional"));
+                    errors::InvalidArgument("Workspace must be 1-dimensional"));
       }
     }
   }
 
-// Input("input_grads: T")
-// Input("input_image: T")
-// Input("output_image: T")
-// Input("workspace: uint8")
-  const int kIdxGradient = 0,
-            kIdxOrigInput = 1,
-            kIdxOrigOutput = 2,
-            kIdxWorkspace = 3,
-            kIdxOutput = 0;
+  // Input("input_grads: T")
+  // Input("input_image: T")
+  // Input("output_image: T")
+  // Input("workspace: uint8")
+  const int kIdxGradient = 0, kIdxOrigInput = 1, kIdxOrigOutput = 2,
+            kIdxWorkspace = 3, kIdxOutput = 0;
 
   typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
   bool workspace_enabled_;
@@ -1393,7 +1357,6 @@ class MklLRNGradOp : public OpKernel {
                               .Label(mkl_op_registry::kMklOpLabel), \
                           MklLRNGradOp<T>);
 
-
 TF_CALL_float(REGISTER_MKL_LRN_CPU);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 82c5229bab..0de27ccd60 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -25,14 +25,14 @@ limitations under the License.
 #ifdef INTEL_MKL_DNN
 #include <algorithm>
 #include "mkldnn.hpp"
-using mkldnn::memory;
+using mkldnn::algorithm;
+using mkldnn::engine;
 using mkldnn::error;
-using mkldnn::pooling_forward;
-using mkldnn::pooling_backward;
+using mkldnn::memory;
 using mkldnn::padding_kind;
-using mkldnn::engine;
+using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
 using mkldnn::prop_kind;
-using mkldnn::algorithm;
 #endif
 
 namespace tensorflow {
@@ -397,18 +397,19 @@ class MklMaxPoolingGradOp : public OpKernel {
       if (workspace_enabled == false) {
         if (convert_input != nullptr) {
           if (input_in_mkl_format == false) {
-            CHECK_EQ(
-                dnnConversionExecute_F32(
-                    convert_input, const_cast<void*>(static_cast<const void*>(
-                                       tensor_in.flat<T>().data())),
-                    input_buf),
-                E_SUCCESS);
+            CHECK_EQ(dnnConversionExecute_F32(
+                         convert_input,
+                         const_cast<void*>(static_cast<const void*>(
+                             tensor_in.flat<T>().data())),
+                         input_buf),
+                     E_SUCCESS);
             CHECK_EQ(dnnDelete_F32(convert_input), E_SUCCESS);
             convert_input = nullptr;
           } else {
             input_shape.GetConvertedFlatData(
-                lt_input_prim, const_cast<void*>(static_cast<const void*>(
-                                   tensor_in.flat<T>().data())),
+                lt_input_prim,
+                const_cast<void*>(
+                    static_cast<const void*>(tensor_in.flat<T>().data())),
                 input_buf);
           }
           pooling_resfwd[dnnResourceSrc] = input_buf;
@@ -453,8 +454,9 @@ class MklMaxPoolingGradOp : public OpKernel {
           CHECK_EQ(dnnDelete_F32(convert_outbackprop), E_SUCCESS);
         } else {
           output_backprop_shape.GetConvertedFlatData(
-              lt_outbackprop_prim, const_cast<void*>(static_cast<const void*>(
-                                       out_backprop.flat<T>().data())),
+              lt_outbackprop_prim,
+              const_cast<void*>(
+                  static_cast<const void*>(out_backprop.flat<T>().data())),
               outbackprop_buf);
         }
         pooling_res[dnnResourceDiffDst] = outbackprop_buf;
@@ -499,7 +501,7 @@ template <typename Device, typename T>
 class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
  public:
   explicit MklMaxPoolingOp(OpKernelConstruction* context)
-            : MklPoolingForwardOpBase<T>(context) {
+      : MklPoolingForwardOpBase<T>(context) {
     // In Max Pooling, MKLDNN does not allow passing workspace as NULL.
     // So we set workspace_enabled_ to true.
     this->workspace_enabled_ = true;
@@ -508,8 +510,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
   void Compute(OpKernelContext* context) override {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
-      const Tensor& input_tensor = MklGetInput(context,
-                this->kInputTensorIndexInput);
+      const Tensor& input_tensor =
+          MklGetInput(context, this->kInputTensorIndexInput);
       MklDnnShape dnn_shape_input;
       GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
       this->SanityCheckInput(context, input_tensor, dnn_shape_input);
@@ -522,9 +524,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       // initialize variables for the pooling op
       MklPoolParameters pool_params;
       // Get the input tensor and initialize the pooling parameters
-      this->ConfigureInput(context, dnn_shape_input,
-                        input_tensor, &pool_params,
-                        &dnn_data_input);
+      this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params,
+                           &dnn_data_input);
       OP_REQUIRES_OK(context, context->status());
 
       // Declare output tensor
@@ -535,9 +536,10 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       // If input is in Mkl layout, then just get the memory format from it
       // directly, instead of using input data_format to MaxPool.
       if (dnn_shape_input.IsMklTensor()) {
-        dnn_data_output.SetUsrMem(output_dims_mkl_order,
-                                  static_cast<memory::format>(
-              dnn_data_input.GetUsrMemDesc().data.format));
+        dnn_data_output.SetUsrMem(
+            output_dims_mkl_order,
+            static_cast<memory::format>(
+                dnn_data_input.GetUsrMemDesc().data.format));
       } else {
         dnn_data_output.SetUsrMem(output_dims_mkl_order,
                                   this->data_format_mkldnn_);
@@ -546,24 +548,21 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       // describe the memory layout; let mkl-dnn choose the best for the op
       dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
 
-      auto pool_desc = pooling_forward::desc(prop_kind::forward,
-            algorithm::pooling_max,
-            dnn_data_input.GetUsrMemDesc(),
-            dnn_data_output.GetUsrMemDesc(),
-            memory::dims({  pool_params.row_stride,
-                            pool_params.col_stride}),
-            memory::dims({  pool_params.window_rows,
-                            pool_params.window_cols}),
-            memory::dims({  static_cast<int>(pool_params.pad_top),
-                            static_cast<int>(pool_params.pad_left)}),
-            memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                            static_cast<int>(pool_params.pad_right)}),
-            TFPaddingToMklDnnPadding(this->padding_));
-        auto pool_fwd_desc = pooling_forward::primitive_desc(pool_desc,
-            cpu_engine);
+      auto pool_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_max,
+          dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(),
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_desc =
+          pooling_forward::primitive_desc(pool_desc, cpu_engine);
 
       this->AllocateOutputTensor(context, pool_fwd_desc, output_dims_mkl_order,
-                            this->data_format_mkldnn_, &output_tensor);
+                                 this->data_format_mkldnn_, &output_tensor);
       OP_REQUIRES_OK(context, context->status());
       dnn_data_output.SetUsrMemDataHandle(output_tensor);
 
@@ -571,39 +570,38 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       OP_REQUIRES_OK(context, context->status());
 
       this->PrepareAndExecuteNet(pool_fwd_desc, &dnn_data_input,
-                        &dnn_data_output, &dnn_data_wksp);
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-                        ", message: " + string(e.message) +
-                        ", in file " + string(__FILE__) + ":" +
-                        std::to_string(__LINE__);
-        OP_REQUIRES_OK(context,
-                        errors::Aborted("Compute received an exception:",
-                                         error_msg));
+                                 &dnn_data_output, &dnn_data_wksp);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
     }
   }  // Compute
 
  private:
-    const int kOutputTensorIndexWorkspace = 1;
-
-    void AllocateWorkspaceTensor(OpKernelContext* context,
-                const pooling_forward::primitive_desc& pool_fwd_prim_desc,
-                MklDnnData<uint8>* dnn_data_wksp) {
-        CHECK_NOTNULL(dnn_data_wksp);
-        Tensor* workspace_tensor = nullptr;
-        memory::primitive_desc workspace_pd
-                    = pool_fwd_prim_desc.workspace_primitive_desc();
-        size_t workspace_bytes = workspace_pd.get_size();
-        MklDnnShape workspace_mkl_shape;
-        workspace_mkl_shape.SetMklTensor(false);
-        TensorShape workspace_tf_shape;
-        workspace_tf_shape.AddDim(workspace_bytes);
-        AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace,
-                                &workspace_tensor,
-                                workspace_tf_shape, workspace_mkl_shape);
-        CHECK_NOTNULL(workspace_tensor);
-        dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
-    }
+  const int kOutputTensorIndexWorkspace = 1;
+
+  void AllocateWorkspaceTensor(
+      OpKernelContext* context,
+      const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+      MklDnnData<uint8>* dnn_data_wksp) {
+    CHECK_NOTNULL(dnn_data_wksp);
+    Tensor* workspace_tensor = nullptr;
+    memory::primitive_desc workspace_pd =
+        pool_fwd_prim_desc.workspace_primitive_desc();
+    size_t workspace_bytes = workspace_pd.get_size();
+    MklDnnShape workspace_mkl_shape;
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(workspace_bytes);
+    AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace,
+                              &workspace_tensor, workspace_tf_shape,
+                              workspace_mkl_shape);
+    CHECK_NOTNULL(workspace_tensor);
+    dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
+  }
 };
 
 // The operation to compute MaxPool gradients.
@@ -616,218 +614,183 @@ template <class Device, class T>
 class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
  public:
   explicit MklMaxPoolingGradOp(OpKernelConstruction* context)
-      : MklPoolingBackwardOpBase<T>(context) {
-  }
+      : MklPoolingBackwardOpBase<T>(context) {}
 
   void Compute(OpKernelContext* context) override {
     try {
-        auto cpu_engine = engine(engine::cpu, 0);
-        const Tensor& orig_input_tensor = MklGetInput(context,
-            kInputTensorIndexOrigInput);
-        const Tensor& orig_output_tensor = MklGetInput(context,
-            kInputTensorIndexOrigOutput);
-        const Tensor& grad_tensor = MklGetInput(context,
-            kInputTensorIndexGradient);
-        const Tensor& workspace_tensor = MklGetInput(context,
-            kInputTensorIndexWorkspace);
-        MklDnnShape orig_input_mkl_shape,
-                    orig_output_mkl_shape,
-                    grad_mkl_shape,
-                    workspace_mkl_shape;
-        GetMklShape(context, kInputTensorIndexOrigInput,
-            &orig_input_mkl_shape);
-        GetMklShape(context, kInputTensorIndexOrigOutput,
-            &orig_output_mkl_shape);
-        GetMklShape(context, kInputTensorIndexGradient,
-            &grad_mkl_shape);
-        GetMklShape(context, kInputTensorIndexWorkspace,
-            &workspace_mkl_shape);
-
-        SanityCheckInputs(context,
-                            orig_input_tensor, orig_output_tensor,
-                            grad_tensor, workspace_tensor,
-                            orig_input_mkl_shape, orig_output_mkl_shape,
-                            grad_mkl_shape, workspace_mkl_shape);
-        if (!context->status().ok()) return;
-
-        MklDnnData<T> grad_dnn_data(&cpu_engine);
-        MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
-        MklDnnData<T> output_dnn_data(&cpu_engine);
-        Tensor* output_tensor = nullptr;
-        MklPoolParameters pool_params;
-        TensorShape orig_input_shape;
-        memory::dims output_dims_mkl_order, orig_input_dims_mkl_order;
-        memory::desc original_input_md = ConfigureOriginalInput(context,
-                                orig_input_tensor,
-                                orig_input_mkl_shape,
-                                &orig_input_dims_mkl_order,
-                                &pool_params,
-                                &orig_input_shape);
-
-        memory::desc original_output_md = this->ConfigureOriginalOutput(
-                                pool_params,
-                                orig_output_mkl_shape,
-                                output_dims_mkl_order);
-
-        memory::desc target_diff_dst_md =  this->ConfigureInputGradient(
-                                        grad_mkl_shape,
-                                        grad_tensor,
-                                        &grad_dnn_data,
-                                        original_output_md);
-
-        output_dnn_data.SetUsrMem(original_input_md);
-
-        // Create the forward pooling primitive descriptor so we can
-        // pass it as a hint to the backward pooling primitive descriptor
-        auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
-                algorithm::pooling_max,
-                original_input_md,
-                original_output_md,
-                memory::dims({  pool_params.row_stride,
-                                pool_params.col_stride}),
-                memory::dims({  pool_params.window_rows,
-                                pool_params.window_cols}),
-                memory::dims({  static_cast<int>(pool_params.pad_top),
-                                static_cast<int>(pool_params.pad_left)}),
-                memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                                static_cast<int>(pool_params.pad_right)}),
-                TFPaddingToMklDnnPadding(this->padding_));
-        auto pool_fwd_prim_desc
-                = pooling_forward::primitive_desc(pool_fwd_desc,
-                                                    cpu_engine);
-
-        auto pool_bkwd_desc = pooling_backward::desc(
-                algorithm::pooling_max,
-                output_dnn_data.GetUsrMemDesc(),
-                target_diff_dst_md,
-                memory::dims({  pool_params.row_stride,
-                                pool_params.col_stride}),
-                memory::dims({  pool_params.window_rows,
-                                pool_params.window_cols}),
-                memory::dims({  static_cast<int>(pool_params.pad_top),
-                                static_cast<int>(pool_params.pad_left)}),
-                memory::dims({  static_cast<int>(pool_params.pad_bottom),
-                                static_cast<int>(pool_params.pad_right)}),
-                TFPaddingToMklDnnPadding(this->padding_));
-        auto pool_bkwd_prim_desc
-            = pooling_backward::primitive_desc(pool_bkwd_desc,
-                                                cpu_engine,
-                                                pool_fwd_prim_desc);
-
-        this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
-            orig_input_dims_mkl_order,
-            this->data_format_mkldnn_,
-            &output_tensor);
-        output_dnn_data.SetUsrMemDataHandle(output_tensor);
-
-        ConfigureWorkspace(workspace_tensor,
-                pool_fwd_prim_desc.workspace_primitive_desc(),
-                &workspace_dnn_data);
-        this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
-                            &grad_dnn_data,
-                            &output_dnn_data,
-                            memory::primitive_desc(
-                                target_diff_dst_md,
-                                cpu_engine),
-                            &workspace_dnn_data);
-    } catch (mkldnn::error &e) {
-        string error_msg = "Status: " + std::to_string(e.status) +
-                        ", message: " + string(e.message) +
-                        ", in file " + string(__FILE__) + ":" +
-                        std::to_string(__LINE__);
-        OP_REQUIRES_OK(context,
-                        errors::Aborted("Compute received an exception:",
-                                         error_msg));
+      auto cpu_engine = engine(engine::cpu, 0);
+      const Tensor& orig_input_tensor =
+          MklGetInput(context, kInputTensorIndexOrigInput);
+      const Tensor& orig_output_tensor =
+          MklGetInput(context, kInputTensorIndexOrigOutput);
+      const Tensor& grad_tensor =
+          MklGetInput(context, kInputTensorIndexGradient);
+      const Tensor& workspace_tensor =
+          MklGetInput(context, kInputTensorIndexWorkspace);
+      MklDnnShape orig_input_mkl_shape, orig_output_mkl_shape, grad_mkl_shape,
+          workspace_mkl_shape;
+      GetMklShape(context, kInputTensorIndexOrigInput, &orig_input_mkl_shape);
+      GetMklShape(context, kInputTensorIndexOrigOutput, &orig_output_mkl_shape);
+      GetMklShape(context, kInputTensorIndexGradient, &grad_mkl_shape);
+      GetMklShape(context, kInputTensorIndexWorkspace, &workspace_mkl_shape);
+
+      SanityCheckInputs(context, orig_input_tensor, orig_output_tensor,
+                        grad_tensor, workspace_tensor, orig_input_mkl_shape,
+                        orig_output_mkl_shape, grad_mkl_shape,
+                        workspace_mkl_shape);
+      if (!context->status().ok()) return;
+
+      MklDnnData<T> grad_dnn_data(&cpu_engine);
+      MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
+      MklDnnData<T> output_dnn_data(&cpu_engine);
+      Tensor* output_tensor = nullptr;
+      MklPoolParameters pool_params;
+      TensorShape orig_input_shape;
+      memory::dims output_dims_mkl_order, orig_input_dims_mkl_order;
+      memory::desc original_input_md = ConfigureOriginalInput(
+          context, orig_input_tensor, orig_input_mkl_shape,
+          &orig_input_dims_mkl_order, &pool_params, &orig_input_shape);
+
+      memory::desc original_output_md = this->ConfigureOriginalOutput(
+          pool_params, orig_output_mkl_shape, output_dims_mkl_order);
+
+      memory::desc target_diff_dst_md = this->ConfigureInputGradient(
+          grad_mkl_shape, grad_tensor, &grad_dnn_data, original_output_md);
+
+      output_dnn_data.SetUsrMem(original_input_md);
+
+      // Create the forward pooling primitive descriptor so we can
+      // pass it as a hint to the backward pooling primitive descriptor
+      auto pool_fwd_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_max, original_input_md,
+          original_output_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_prim_desc =
+          pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine);
+
+      auto pool_bkwd_desc = pooling_backward::desc(
+          algorithm::pooling_max, output_dnn_data.GetUsrMemDesc(),
+          target_diff_dst_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_bkwd_prim_desc = pooling_backward::primitive_desc(
+          pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc);
+
+      this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
+                                 orig_input_dims_mkl_order,
+                                 this->data_format_mkldnn_, &output_tensor);
+      output_dnn_data.SetUsrMemDataHandle(output_tensor);
+
+      ConfigureWorkspace(workspace_tensor,
+                         pool_fwd_prim_desc.workspace_primitive_desc(),
+                         &workspace_dnn_data);
+      this->PrepareAndExecuteNet(
+          pool_bkwd_prim_desc, &grad_dnn_data, &output_dnn_data,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine),
+          &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
     }
   }  // Compute
 
  private:
-    // .Input("orig_input: T")
-    // .Input("orig_output: T")
-    // .Input("grad: T")
-    // .Input("workspace: T")
-    const int kInputTensorIndexOrigInput = 0;
-    const int kInputTensorIndexOrigOutput = 1;
-    const int kInputTensorIndexGradient = 2;
-    const int kInputTensorIndexWorkspace = 3;
-    //  Output("output: T") in Base Class
-
-    memory::desc ConfigureOriginalInput(OpKernelContext* context,
-                                const Tensor& tensor_original_input,
-                                const MklDnnShape& original_input_mkl_shape,
-                                memory::dims* original_input_dims_mkl_order,
-                                MklPoolParameters* pool_params,
-                                TensorShape* input_tensor_shape) {
-        *input_tensor_shape = tensor_original_input.shape();
-        return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
-                                        context,
-                                        tensor_original_input,
-                                        original_input_mkl_shape,
-                                        original_input_dims_mkl_order,
-                                        pool_params,
-                                        *input_tensor_shape);
-    }
+  // .Input("orig_input: T")
+  // .Input("orig_output: T")
+  // .Input("grad: T")
+  // .Input("workspace: T")
+  const int kInputTensorIndexOrigInput = 0;
+  const int kInputTensorIndexOrigOutput = 1;
+  const int kInputTensorIndexGradient = 2;
+  const int kInputTensorIndexWorkspace = 3;
+  //  Output("output: T") in Base Class
+
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_mkl_order,
+      MklPoolParameters* pool_params, TensorShape* input_tensor_shape) {
+    *input_tensor_shape = tensor_original_input.shape();
+    return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
+        context, tensor_original_input, original_input_mkl_shape,
+        original_input_dims_mkl_order, pool_params, *input_tensor_shape);
+  }
 
-    void ConfigureWorkspace(const Tensor& workspace_tensor,
-                        memory::primitive_desc workspace_pd,
-                        MklDnnData<uint8> *workspace_dnn_data) {
-        CHECK_NOTNULL(workspace_dnn_data);
+  void ConfigureWorkspace(const Tensor& workspace_tensor,
+                          memory::primitive_desc workspace_pd,
+                          MklDnnData<uint8>* workspace_dnn_data) {
+    CHECK_NOTNULL(workspace_dnn_data);
 
-        workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
-    }
+    workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
+  }
 
-    void SanityCheckInputs(OpKernelContext* context,
-                            const Tensor& orig_input_tensor,
-                            const Tensor& orig_output_tensor,
-                            const Tensor& grad_tensor,
-                            const Tensor& workspace_tensor,
-                            const MklDnnShape& orig_input_mkl_shape,
-                            const MklDnnShape& orig_output_mkl_shape,
-                            const MklDnnShape& grad_mkl_shape,
-                            const MklDnnShape& workspace_mkl_shape) {
-        if (!orig_input_mkl_shape.IsMklTensor()) {
-            OP_REQUIRES(context, orig_input_tensor.dims() == 4,
-                errors::InvalidArgument("Original input shape must be "
-                "4-dimensional"));
-        } else {
-            OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
-                    errors::InvalidArgument("Original input shape must be "
-                    "4-dimensional"));
-        }
-        if (!orig_output_mkl_shape.IsMklTensor()) {
-            OP_REQUIRES(context, orig_output_tensor.dims() == 4,
-                errors::InvalidArgument("Original output must be "
-                        "4-dimensional"));
-        } else {
-            OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
-                    errors::InvalidArgument("Original output must be "
-                    "4-dimensional"));
-        }
-        if (!grad_mkl_shape.IsMklTensor()) {
-            OP_REQUIRES(context, grad_tensor.dims() == 4,
-                errors::InvalidArgument("Gradient must be 4-dimensional"));
-        } else {
-            OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
-                    errors::InvalidArgument("Gradient must be "
-                    "4-dimensional"));
-        }
-        if (this->workspace_enabled_) {
-            // The workspace should not be an MKL tensor
-            OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
-                    errors::InvalidArgument("Workspace tensor should not"
-                                            " be an MKL Tensor."));
-            // It should only have one dimension
-            OP_REQUIRES(context, workspace_tensor.dims() == 1,
-                    errors::InvalidArgument("Workspace tensor must be "
-                                "1-dimensional"));
-        } else {
-            OP_REQUIRES(context, this->workspace_enabled_,
-                    errors::Unimplemented("MKL-DNN Max Pooling does not "
+  void SanityCheckInputs(OpKernelContext* context,
+                         const Tensor& orig_input_tensor,
+                         const Tensor& orig_output_tensor,
+                         const Tensor& grad_tensor,
+                         const Tensor& workspace_tensor,
+                         const MklDnnShape& orig_input_mkl_shape,
+                         const MklDnnShape& orig_output_mkl_shape,
+                         const MklDnnShape& grad_mkl_shape,
+                         const MklDnnShape& workspace_mkl_shape) {
+    if (!orig_input_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, orig_input_tensor.dims() == 4,
+                  errors::InvalidArgument("Original input shape must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Original input shape must be "
+                                          "4-dimensional"));
+    }
+    if (!orig_output_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, orig_output_tensor.dims() == 4,
+                  errors::InvalidArgument("Original output must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Original output must be "
+                                          "4-dimensional"));
+    }
+    if (!grad_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, grad_tensor.dims() == 4,
+                  errors::InvalidArgument("Gradient must be 4-dimensional"));
+    } else {
+      OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Gradient must be "
+                                          "4-dimensional"));
+    }
+    if (this->workspace_enabled_) {
+      // The workspace should not be an MKL tensor
+      OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
+                  errors::InvalidArgument("Workspace tensor should not"
+                                          " be an MKL Tensor."));
+      // It should only have one dimension
+      OP_REQUIRES(context, workspace_tensor.dims() == 1,
+                  errors::InvalidArgument("Workspace tensor must be "
+                                          "1-dimensional"));
+    } else {
+      OP_REQUIRES(
+          context, this->workspace_enabled_,
+          errors::Unimplemented("MKL-DNN Max Pooling does not "
                                 "yet support the use case "
                                 "where MaxPoolGrad is called without first"
                                 " calling MaxPool."));
-        }
     }
+  }
 };  // MklMaxPoolingGradOp
 
 #endif  // INTEL_MKL_DNN
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index f7cadffd39..ef8597b057 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #ifdef INTEL_MKL
 
-#include <vector>
-#include <limits>
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
+#include <limits>
+#include <vector>
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/kernels/bounds_check.h"
@@ -111,17 +111,17 @@ void MklPoolParameters::Init(OpKernelContext* context,
     // TF can work with int64, but mkldnn only supports int32
     // Fail if the height or width are greater than MAX_INT
 
-    OP_REQUIRES(context, FastBoundsCheck(out_height,
-                                         std::numeric_limits<int>::max()),
+    OP_REQUIRES(context,
+                FastBoundsCheck(out_height, std::numeric_limits<int>::max()),
                 errors::InvalidArgument("output height is too large"));
 
-    OP_REQUIRES(context, FastBoundsCheck(out_width,
-                                         std::numeric_limits<int>::max()),
+    OP_REQUIRES(context,
+                FastBoundsCheck(out_width, std::numeric_limits<int>::max()),
                 errors::InvalidArgument("output width is too large"));
 
 #endif
     out_depth = depth;  // output will have the same depth as the input
-  } else {  // we are pooling in the depth dimension
+  } else {              // we are pooling in the depth dimension
     // Our current version of depthwise max pooling does not support
     // any padding, and expects the depth_window to equal the depth
     // stride (no overlapping).
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index b974b2c59a..880e45ab1e 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -17,16 +17,16 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_
 
 #ifdef INTEL_MKL
-#include <vector>
 #include <string>
+#include <vector>
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 using mkldnn::memory;
-using mkldnn::pooling_forward;
 using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
 using mkldnn::stream;
 #endif
 
@@ -61,13 +61,25 @@ struct MklPoolParameters {
 
   TensorFormat data_format;
   MklPoolParameters()
-    : depth(0)
-    , tensor_in_cols(0), tensor_in_rows(0), tensor_in_batch(0)
-    , window_rows(0), window_cols(0), depth_window(0)
-    , row_stride(0), col_stride(0), depth_stride(0)
-    , out_height(0), out_width(0), out_depth(0)
-    , pad_left(0), pad_right(0), pad_top(0), pad_bottom(0), pad_depth(0)
-    , data_format(TensorFormat::FORMAT_NCHW) {}
+      : depth(0),
+        tensor_in_cols(0),
+        tensor_in_rows(0),
+        tensor_in_batch(0),
+        window_rows(0),
+        window_cols(0),
+        depth_window(0),
+        row_stride(0),
+        col_stride(0),
+        depth_stride(0),
+        out_height(0),
+        out_width(0),
+        out_depth(0),
+        pad_left(0),
+        pad_right(0),
+        pad_top(0),
+        pad_bottom(0),
+        pad_depth(0),
+        data_format(TensorFormat::FORMAT_NCHW) {}
 
   // Updates context->status if there is an invalid input.
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
@@ -96,33 +108,31 @@ template <class T>
 class MklPoolingOpBase : public OpKernel {
  public:
   explicit MklPoolingOpBase(OpKernelConstruction* context)
-            : OpKernel(context)
-            , workspace_enabled_(false) {
-      string data_format;
-      OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-      OP_REQUIRES(context,
-            FormatFromString(data_format, &this->data_format_tf_),
-            errors::InvalidArgument("Invalid data format"));
-      this->data_format_mkldnn_
-                = TFDataFormatToMklDnnDataFormat(this->data_format_tf_);
-      OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
-      OP_REQUIRES(context, this->ksize_.size() == 4,
-                  errors::InvalidArgument("Sliding window ksize field must "
-                                          "specify 4 dimensions"));
-      OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_));
-      OP_REQUIRES(context, this->stride_.size() == 4,
-                  errors::InvalidArgument("Sliding window strides field must "
-                                          "specify 4 dimensions"));
-      OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_));
-      OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1,
-                  errors::Unimplemented("Pooling is not yet supported on the "
-                                        "batch dimension."));
-
-      // We may not get this attribute for this node if it does not go through
-      // graph rewrite pass. So we do not check for error while retrieving this
-      // attribute value.
-      context->GetAttr("workspace_enabled", &this->workspace_enabled_);
-    }
+      : OpKernel(context), workspace_enabled_(false) {
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &this->data_format_tf_),
+                errors::InvalidArgument("Invalid data format"));
+    this->data_format_mkldnn_ =
+        TFDataFormatToMklDnnDataFormat(this->data_format_tf_);
+    OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
+    OP_REQUIRES(context, this->ksize_.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_));
+    OP_REQUIRES(context, this->stride_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_));
+    OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1,
+                errors::Unimplemented("Pooling is not yet supported on the "
+                                      "batch dimension."));
+
+    // We may not get this attribute for this node if it does not go through
+    // graph rewrite pass. So we do not check for error while retrieving this
+    // attribute value.
+    context->GetAttr("workspace_enabled", &this->workspace_enabled_);
+  }
   void Compute(OpKernelContext* context) override = 0;
 
  protected:
@@ -132,24 +142,24 @@ class MklPoolingOpBase : public OpKernel {
   // output height and output width to have already been int32
   // bounds-checked
   void GetOutputDims(const MklPoolParameters& mkl_pool_params,
-                    memory::dims* output_dims_mkl_order) {
+                     memory::dims* output_dims_mkl_order) {
     // MKL-DNN always needs output in NCHW format.
-    *output_dims_mkl_order = { mkl_pool_params.tensor_in_batch,
+    *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch,
                               mkl_pool_params.out_depth,
                               static_cast<int>(mkl_pool_params.out_height),
                               static_cast<int>(mkl_pool_params.out_width)};
   }
 
   void InitMklPoolParameters(OpKernelContext* context,
-                      MklPoolParameters* pool_params,
-                      const MklDnnShape& original_input_mkl_shape,
-                      const TensorShape& input_tensor_shape) {
+                             MklPoolParameters* pool_params,
+                             const MklDnnShape& original_input_mkl_shape,
+                             const TensorShape& input_tensor_shape) {
     if (!original_input_mkl_shape.IsMklTensor()) {
       pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
-          this->data_format_tf_, input_tensor_shape);
+                        this->data_format_tf_, input_tensor_shape);
     } else {
       pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
-          this->data_format_tf_, &original_input_mkl_shape);
+                        this->data_format_tf_, &original_input_mkl_shape);
     }
   }
 
@@ -159,13 +169,12 @@ class MklPoolingOpBase : public OpKernel {
   size_t GetNumTElements(const memory::primitive_desc& pd) {
     size_t num_bytes = pd.get_size();
     size_t ret_val = num_bytes / sizeof(T);
-    if ( num_bytes % sizeof(T) != 0 ) {
-        ret_val++;
+    if (num_bytes % sizeof(T) != 0) {
+      ret_val++;
     }
     return ret_val;
   }
 
-
   std::vector<int32> ksize_;
   std::vector<int32> stride_;
   Padding padding_;
@@ -183,30 +192,29 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
 
  protected:
   void ConfigureInput(OpKernelContext* context,
-                    const MklDnnShape& input_mkl_shape,
-                    const Tensor& input_tensor,
-                    MklPoolParameters* pool_params,
-                    MklDnnData<T>* dnn_data_input) {
+                      const MklDnnShape& input_mkl_shape,
+                      const Tensor& input_tensor,
+                      MklPoolParameters* pool_params,
+                      MklDnnData<T>* dnn_data_input) {
     CHECK_NOTNULL(pool_params);
     CHECK_NOTNULL(dnn_data_input);
     TensorShape input_tensor_shape = input_tensor.shape();
-    memory::desc input_md = input_mkl_shape.IsMklTensor()
-                        ? input_mkl_shape.GetMklLayout()
-                        : memory::desc(
-                              TFShapeToMklDnnDimsInNCHW(
-                                  input_tensor_shape, this->data_format_tf_),
-                              MklDnnType<T>(),
-                              this->data_format_mkldnn_);
+    memory::desc input_md =
+        input_mkl_shape.IsMklTensor()
+            ? input_mkl_shape.GetMklLayout()
+            : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
+                                                     this->data_format_tf_),
+                           MklDnnType<T>(), this->data_format_mkldnn_);
     dnn_data_input->SetUsrMem(input_md, &input_tensor);
-    this->InitMklPoolParameters(context, pool_params,
-                      input_mkl_shape, input_tensor_shape);
+    this->InitMklPoolParameters(context, pool_params, input_mkl_shape,
+                                input_tensor_shape);
   }
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const pooling_forward::primitive_desc& pool_fwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
     memory::primitive_desc dst_pd = pool_fwd_prim_desc.dst_primitive_desc();
 
@@ -215,50 +223,42 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
     TensorShape output_tf_shape;
 
     // only allocate enough space for the elements we need.
     output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
-    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
     CHECK_NOTNULL(*output_tensor);
   }
 
   void PrepareAndExecuteNet(
-                  const pooling_forward::primitive_desc& pool_fwd_desc,
-                  const MklDnnData<T>* src,
-                  MklDnnData<T>* dst,
-                  MklDnnData<uint8>* wksp = nullptr) {
+      const pooling_forward::primitive_desc& pool_fwd_desc,
+      const MklDnnData<T>* src, MklDnnData<T>* dst,
+      MklDnnData<uint8>* wksp = nullptr) {
     std::vector<primitive> net;
 
     // Create pooling primitive and add it to net
     if (wksp != nullptr) {
-        net.push_back(pooling_forward(pool_fwd_desc,
-                        src->GetOpMem(),
-                        dst->GetOpMem(),
-                        wksp->GetOpMem()));
+      net.push_back(pooling_forward(pool_fwd_desc, src->GetOpMem(),
+                                    dst->GetOpMem(), wksp->GetOpMem()));
     } else {
-        net.push_back(pooling_forward(pool_fwd_desc,
-            src->GetOpMem(),
-            dst->GetOpMem()));
+      net.push_back(
+          pooling_forward(pool_fwd_desc, src->GetOpMem(), dst->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
 
-
-  void SanityCheckInput(OpKernelContext* context,
-                  const Tensor& input_tensor,
-                  const MklDnnShape& input_mkl_shape) {
+  void SanityCheckInput(OpKernelContext* context, const Tensor& input_tensor,
+                        const MklDnnShape& input_mkl_shape) {
     if (!input_mkl_shape.IsMklTensor()) {
       OP_REQUIRES(context, input_tensor.dims() == 4,
-          errors::InvalidArgument("Input must be 4-dimensional"));
+                  errors::InvalidArgument("Input must be 4-dimensional"));
     } else {
-        OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4,
-                errors::InvalidArgument("Input shape must be "
-                "4-dimensional"));
+      OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Input shape must be "
+                                          "4-dimensional"));
     }
   }
   // .Input("value: T")
@@ -267,66 +267,58 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
   const int kOutputTensorIndexOutput = 0;
 };  // MklPoolingForwardBaseOp
 
-
 template <class T>
 class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
  public:
   explicit MklPoolingBackwardOpBase<T>(OpKernelConstruction* context)
-          : MklPoolingOpBase<T>(context) { }
+      : MklPoolingOpBase<T>(context) {}
   void Compute(OpKernelContext* context) override = 0;
 
  protected:
   const int kOutputTensorIndexOutput = 0;
 
-  void AllocateOutputTensor(OpKernelContext* context,
-            const pooling_backward::primitive_desc& pool_bkwd_prim_desc,
-            const memory::dims output_dims_mkl_order,
-            const memory::format& output_tf_format,
-            Tensor** output_tensor) {
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const pooling_backward::primitive_desc& pool_bkwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
-    memory::primitive_desc dst_pd
-                = pool_bkwd_prim_desc.diff_src_primitive_desc();
+    memory::primitive_desc dst_pd =
+        pool_bkwd_prim_desc.diff_src_primitive_desc();
     MklDnnShape output_mkl_shape;
     output_mkl_shape.SetMklTensor(true);
     output_mkl_shape.SetMklLayout(&dst_pd);
     output_mkl_shape.SetElemType(MklDnnType<T>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
-                              output_dims_mkl_order,
-                              output_tf_format);
+                                 output_dims_mkl_order, output_tf_format);
 
     TensorShape output_tf_shape;
     output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
-    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
-                            output_tensor,
-                            output_tf_shape, output_mkl_shape);
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
     CHECK_NOTNULL(*output_tensor);
   }
 
   void PrepareAndExecuteNet(
-    const pooling_backward::primitive_desc& pool_bkwd_desc,
-    MklDnnData<T>* input_gradient_diff_dst,
-    MklDnnData<T>* output_diff_src,
-    const memory::primitive_desc& target_diff_dst_pd,
-    const MklDnnData<uint8>* workspace = nullptr) {
-
+      const pooling_backward::primitive_desc& pool_bkwd_desc,
+      MklDnnData<T>* input_gradient_diff_dst, MklDnnData<T>* output_diff_src,
+      const memory::primitive_desc& target_diff_dst_pd,
+      const MklDnnData<uint8>* workspace = nullptr) {
     std::vector<primitive> net;
 
     // If the input gradient isn't in the same format as the output
     // reorder it to the same format as the output
-    input_gradient_diff_dst->CheckReorderToOpMem(
-            target_diff_dst_pd,
-            &net);
+    input_gradient_diff_dst->CheckReorderToOpMem(target_diff_dst_pd, &net);
 
     // Create pooling primitive and add it to net
     if (nullptr == workspace) {
       net.push_back(pooling_backward(pool_bkwd_desc,
-                              input_gradient_diff_dst->GetOpMem(),
-                              output_diff_src->GetOpMem()));
+                                     input_gradient_diff_dst->GetOpMem(),
+                                     output_diff_src->GetOpMem()));
     } else {
-      net.push_back(pooling_backward(pool_bkwd_desc,
-                                  input_gradient_diff_dst->GetOpMem(),
-                                  workspace->GetOpMem(),
-                                  output_diff_src->GetOpMem()));
+      net.push_back(
+          pooling_backward(pool_bkwd_desc, input_gradient_diff_dst->GetOpMem(),
+                           workspace->GetOpMem(), output_diff_src->GetOpMem()));
     }
     stream(stream::kind::eager).submit(net).wait();
   }
@@ -334,77 +326,73 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
   // Max Pooling and Avg Pooling have slightly different implementations
   // Takes the Tensor containing original input data and the original
   // mkl Dnn Shape and populates other data
-  memory::desc ConfigureOriginalInput(OpKernelContext* context,
-                              const Tensor& tensor_original_input_shape,
-                              const MklDnnShape& original_input_mkl_shape,
-                              memory::dims* original_input_dims_nchw,
-                              MklPoolParameters* pool_params,
-                              const TensorShape& input_tensor_shape) {
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input_shape,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_nchw, MklPoolParameters* pool_params,
+      const TensorShape& input_tensor_shape) {
     CHECK_NOTNULL(original_input_dims_nchw);
     CHECK_NOTNULL(pool_params);
-    this->InitMklPoolParameters(context, pool_params,
-                          original_input_mkl_shape,
-                          input_tensor_shape);
-
-    *original_input_dims_nchw
-          = original_input_mkl_shape.IsMklTensor()
-          ? original_input_mkl_shape.GetSizesAsMklDnnDims()
-          : TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
-        this->data_format_tf_);
-
-    return  original_input_mkl_shape.IsMklTensor()
-      ? original_input_mkl_shape.GetMklLayout()
-      : memory::desc(*original_input_dims_nchw,
-                      MklDnnType<T>(),
-                      this->data_format_mkldnn_);
+    this->InitMklPoolParameters(context, pool_params, original_input_mkl_shape,
+                                input_tensor_shape);
+
+    *original_input_dims_nchw =
+        original_input_mkl_shape.IsMklTensor()
+            ? original_input_mkl_shape.GetSizesAsMklDnnDims()
+            : TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
+                                        this->data_format_tf_);
+
+    return original_input_mkl_shape.IsMklTensor()
+               ? original_input_mkl_shape.GetMklLayout()
+               : memory::desc(*original_input_dims_nchw, MklDnnType<T>(),
+                              this->data_format_mkldnn_);
   }
 
-  memory::desc ConfigureOriginalOutput(const MklPoolParameters& pool_params,
-                                const MklDnnShape& original_output_mkl_shape,
-                                      memory::dims output_dims_mkl_order) {
+  memory::desc ConfigureOriginalOutput(
+      const MklPoolParameters& pool_params,
+      const MklDnnShape& original_output_mkl_shape,
+      memory::dims output_dims_mkl_order) {
     this->GetOutputDims(pool_params, &output_dims_mkl_order);
 
     return original_output_mkl_shape.IsMklTensor()
-            ? original_output_mkl_shape.GetMklLayout()
-            : memory::desc(output_dims_mkl_order,
-                         MklDnnType<T>(),
-                         this->data_format_mkldnn_);
+               ? original_output_mkl_shape.GetMklLayout()
+               : memory::desc(output_dims_mkl_order, MklDnnType<T>(),
+                              this->data_format_mkldnn_);
   }
 
   memory::desc ConfigureInputGradient(
-        const MklDnnShape& input_gradient_mkl_shape,
-        const Tensor& input_gradient_tensor,
-        MklDnnData<T>* input_gradient_dnn_data,
-        const memory::desc& original_output_md) {
+      const MklDnnShape& input_gradient_mkl_shape,
+      const Tensor& input_gradient_tensor,
+      MklDnnData<T>* input_gradient_dnn_data,
+      const memory::desc& original_output_md) {
     // Configure the gradient as is
-    memory::desc original_input_grad_md
-          = input_gradient_mkl_shape.IsMklTensor()
-          ? input_gradient_mkl_shape.GetMklLayout()
-          : memory::desc(TFShapeToMklDnnDimsInNCHW(
-                    input_gradient_tensor.shape(),
-                    this->data_format_tf_),
-                    MklDnnType<T>(), this->data_format_mkldnn_);
+    memory::desc original_input_grad_md =
+        input_gradient_mkl_shape.IsMklTensor()
+            ? input_gradient_mkl_shape.GetMklLayout()
+            : memory::desc(
+                  TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
+                                            this->data_format_tf_),
+                  MklDnnType<T>(), this->data_format_mkldnn_);
 
     input_gradient_dnn_data->SetUsrMem(original_input_grad_md,
-                &input_gradient_tensor);
+                                       &input_gradient_tensor);
 
     // Check to see if input grad diff dst is in the right format
     // Create a new memory descriptor with the same shape as the
     // original, but the format of the other tensors.
     memory::format original_output_format =
-            static_cast<memory::format>(original_output_md.data.format);
-    bool grad_reorder_needed = input_gradient_dnn_data->IsReorderNeeded(
-                                    original_output_format);
-    memory::dims diff_dst_dims = input_gradient_mkl_shape.IsMklTensor()
-        ? input_gradient_mkl_shape.GetSizesAsMklDnnDims()
-        : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
-                    this->data_format_tf_);
-    memory::desc target_diff_dst_md = memory::desc(diff_dst_dims,
-        MklDnnType<T>(), original_output_format);
-
-    return grad_reorder_needed
-            ? target_diff_dst_md
-            : original_input_grad_md;
+        static_cast<memory::format>(original_output_md.data.format);
+    bool grad_reorder_needed =
+        input_gradient_dnn_data->IsReorderNeeded(original_output_format);
+    memory::dims diff_dst_dims =
+        input_gradient_mkl_shape.IsMklTensor()
+            ? input_gradient_mkl_shape.GetSizesAsMklDnnDims()
+            : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
+                                        this->data_format_tf_);
+    memory::desc target_diff_dst_md =
+        memory::desc(diff_dst_dims, MklDnnType<T>(), original_output_format);
+
+    return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md;
   }
 };
 #endif  // INTEL_MKL_DNN
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index dc899d8c7e..873aca30ca 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -16,29 +16,29 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
-#include "tensorflow/core/platform/default/logging.h"
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/platform/default/logging.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
-using mkldnn::prop_kind;
 using mkldnn::algorithm;
-using mkldnn::relu_forward;
-using mkldnn::relu_backward;
-using mkldnn::eltwise_relu;
 using mkldnn::eltwise_elu;
+using mkldnn::eltwise_relu;
 using mkldnn::eltwise_tanh;
+using mkldnn::prop_kind;
+using mkldnn::relu_backward;
+using mkldnn::relu_forward;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
@@ -180,7 +180,6 @@ class MklReluOp : public OpKernel {
   } MklReluOpContext;
 };
 
-
 template <typename Device, typename T>
 class MklReluGradOp : public OpKernel {
  public:
@@ -214,10 +213,11 @@ class MklReluGradOp : public OpKernel {
       if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
                        &mkl_buffer_convert);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input,
-                   lt_grad), E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad),
+                 E_SUCCESS);
         CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
-                                          mkl_buffer_convert), E_SUCCESS);
+                                          mkl_buffer_convert),
+                 E_SUCCESS);
         relu_res[dnnResourceSrc] = mkl_buffer_convert;
         dnnDelete_F32(cv_input_to_grad);
       } else {
@@ -325,7 +325,8 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   float negative_slope = 0.0;
   CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
                                      mkl_context.lt_grad, mkl_context.lt_grad,
-                                     negative_slope), E_SUCCESS);
+                                     negative_slope),
+           E_SUCCESS);
   Tensor mkl_tmp_input_buf_tensor;
   mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
 
@@ -348,7 +349,8 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
     }
 
     tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                    mkl_context.output_shape.GetMklLayout())) / sizeof(T));
+                        mkl_context.output_shape.GetMklLayout())) /
+                    sizeof(T));
     AllocateOutputSetMklShape(context, 0, &output, tf_shape,
                               mkl_context.output_shape);
   } else {
@@ -361,13 +363,11 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   mkl_context.relu_res[dnnResourceDiffSrc] =
       static_cast<void*>(output->flat<T>().data());
 
-  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd,
-                          mkl_context.relu_res),
-                          E_SUCCESS);
+  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
+           E_SUCCESS);
   mkl_context.MklCleanup();
 }
 
-
 #else  // INTEL_MKL_DNN
 
 template <typename Device, typename T, algorithm alg_kind>
@@ -375,8 +375,7 @@ class MklReluOpBase : public OpKernel {
  public:
   ~MklReluOpBase() {}
 
-  explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {
-  }
+  explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) = 0;
 
@@ -413,12 +412,12 @@ class MklReluOpBase : public OpKernel {
 
       T alpha = 0, beta = 0;
       std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
-      auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
+      auto relu_fwd_desc = relu_forward::desc(
+          prop_kind::forward_training,
           // Operator memory descriptor is same as user memory descriptor.
-                                              alg_kind, src.GetUsrMemDesc(),
-                                              alpha, beta);
-      relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
-                                                         cpu_engine));
+          alg_kind, src.GetUsrMemDesc(), alpha, beta);
+      relu_fwd_pd.reset(
+          new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine));
 
       // allocate dst tensor
       MklDnnShape dnn_shape_dst;
@@ -431,7 +430,7 @@ class MklReluOpBase : public OpKernel {
         dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
                                   dnn_shape_src.GetSizesAsMklDnnDims(),
                                   dnn_shape_src.GetTfDataFormat());
-        tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+        tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_dst.SetMklTensor(false);
         tf_shape_dst = src_tensor.shape();
@@ -445,34 +444,32 @@ class MklReluOpBase : public OpKernel {
 
       // execute net
       std::vector<primitive> net;
-      auto relu_fwd = relu_forward(*relu_fwd_pd, src.GetOpMem(),
-                                   dst.GetOpMem());
+      auto relu_fwd =
+          relu_forward(*relu_fwd_pd, src.GetOpMem(), dst.GetOpMem());
       net.push_back(relu_fwd);
       stream(stream::kind::eager).submit(net).wait();
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + string(e.message) +
-                         ", in file " + string(__FILE__) + ":" +
-                         std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                        error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 };
 
-
 template <typename Device, typename T, algorithm alg_kind>
 class MklReluGradOpBase : public OpKernel {
  public:
   ~MklReluGradOpBase() {}
 
-  explicit MklReluGradOpBase(OpKernelConstruction* context) :
-    OpKernel(context) {}
+  explicit MklReluGradOpBase(OpKernelConstruction* context)
+      : OpKernel(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) = 0;
 
-  void Compute(OpKernelContext* context)  {
+  void Compute(OpKernelContext* context) {
     try {
       auto cpu_engine = engine(engine::cpu, 0);
       MklDnnData<T> src(&cpu_engine);
@@ -483,9 +480,9 @@ class MklReluGradOpBase : public OpKernel {
       const size_t src_index = 1;       // index of src input tensor
       const size_t diff_src_index = 0;  // index of diff_src output tensor
 
-      const Tensor& src_tensor      = MklGetInput(context, src_index);
+      const Tensor& src_tensor = MklGetInput(context, src_index);
       const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
-      Tensor* diff_src_tensor       = nullptr;
+      Tensor* diff_src_tensor = nullptr;
 
       MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
       GetMklShape(context, src_index, &dnn_shape_src);
@@ -526,25 +523,25 @@ class MklReluGradOpBase : public OpKernel {
         src_md = dnn_shape_src.GetMklLayout();
 
         memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat();
-        auto src_tf_data_format = MklDnnDataFormatToTFDataFormat(
-                                    src_mkl_data_format);
+        auto src_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(src_mkl_data_format);
         auto diff_dst_dims = TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(),
                                                        src_tf_data_format);
-        diff_dst_md = memory::desc(diff_dst_dims, MklDnnType<T>(),
-                                   src_mkl_data_format);
+        diff_dst_md =
+            memory::desc(diff_dst_dims, MklDnnType<T>(), src_mkl_data_format);
       } else if (!dnn_shape_src.IsMklTensor() &&
-                  dnn_shape_diff_dst.IsMklTensor()) {
+                 dnn_shape_diff_dst.IsMklTensor()) {
         // Same comment as above.
         diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
 
         memory::format diff_dst_mkl_data_format =
-          dnn_shape_diff_dst.GetTfDataFormat();
-        auto diff_dst_tf_data_format = MklDnnDataFormatToTFDataFormat(
-                                          diff_dst_mkl_data_format);
+            dnn_shape_diff_dst.GetTfDataFormat();
+        auto diff_dst_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(diff_dst_mkl_data_format);
         auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
                                                   diff_dst_tf_data_format);
-        src_md = memory::desc(src_dims, MklDnnType<T>(),
-                              diff_dst_mkl_data_format);
+        src_md =
+            memory::desc(src_dims, MklDnnType<T>(), diff_dst_mkl_data_format);
       } else {
         // If both the inputs are in MKL format, we use Mkl layout of the input
         // tensors.
@@ -572,12 +569,12 @@ class MklReluGradOpBase : public OpKernel {
       std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
       auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
                                               alg_kind, src_md, alpha, beta);
-      relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
-                                                         cpu_engine));
-      auto relu_bwd_desc = relu_backward::desc(alg_kind, common_md, common_md,
-                                                alpha, beta);
-      auto relu_bwd_pd  = relu_backward::primitive_desc(relu_bwd_desc,
-                                                cpu_engine, *relu_fwd_pd);
+      relu_fwd_pd.reset(
+          new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine));
+      auto relu_bwd_desc =
+          relu_backward::desc(alg_kind, common_md, common_md, alpha, beta);
+      auto relu_bwd_pd = relu_backward::primitive_desc(
+          relu_bwd_desc, cpu_engine, *relu_fwd_pd);
 
       // allocate diff_src tensor
       MklDnnShape dnn_shape_diff_src;
@@ -590,33 +587,32 @@ class MklReluGradOpBase : public OpKernel {
         dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(),
                                        dnn_shape_src.GetSizesAsMklDnnDims(),
                                        dnn_shape_src.GetTfDataFormat());
-        tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
       } else {
         dnn_shape_diff_src.SetMklTensor(false);
         tf_shape_diff_src = src_tensor.shape();
       }
       AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
-                                 tf_shape_diff_src, dnn_shape_diff_src);
+                                tf_shape_diff_src, dnn_shape_diff_src);
 
       // diff_src memory descriptor is same as memory descriptor for both
       // inputs.
       diff_src.SetUsrMem(common_md, diff_src_tensor);
 
       PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst);
-     } catch (mkldnn::error &e) {
-       string error_msg = "Status: " + std::to_string(e.status) +
-                          ", message: " + string(e.message) +
-                          ", in file " + string(__FILE__) + ":" +
-                          std::to_string(__LINE__);
-       OP_REQUIRES_OK(context,
-                      errors::Aborted("Operation received an exception:",
-                                      error_msg));
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
   void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc,
-                  MklDnnData<T>* src, MklDnnData<T>* diff_src, MklDnnData<T>*
-                  diff_dst) {
+                            MklDnnData<T>* src, MklDnnData<T>* diff_src,
+                            MklDnnData<T>* diff_dst) {
     std::vector<primitive> net;
 
     // Check if we need to reorder original input tensors into common_md layout
@@ -632,14 +628,13 @@ class MklReluGradOpBase : public OpKernel {
   }
 };
 
-
 template <typename Device, typename T>
 class MklReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
  public:
   ~MklReluOp() {}
 
-  explicit MklReluOp(OpKernelConstruction* context) :
-  MklReluOpBase<Device, T, eltwise_relu>(context) {}
+  explicit MklReluOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_relu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -649,15 +644,15 @@ class MklReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
     GetMklShape(context, src_index, &dnn_shape_src);
 
     Tensor* dst_tensor = nullptr;
-    void* user_i = static_cast<void*>(const_cast<T*>(
-                         src_tensor.flat<T>().data()));
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
     AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
                               src_tensor.shape(), dnn_shape_dst);
     void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
     (static_cast<T*>(out_o))[0] =
-              std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
+        std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
     return;
   }
 };
@@ -667,14 +662,14 @@ class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
  public:
   ~MklReluGradOp() {}
 
-  explicit MklReluGradOp(OpKernelConstruction* context) :
-  MklReluGradOpBase<Device, T, eltwise_relu>(context) {}
+  explicit MklReluGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_relu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
     const size_t src_index = 1;       // index of src input tensor
     const size_t diff_src_index = 0;  // index of diff_src output tensor
-    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& src_tensor = MklGetInput(context, src_index);
     const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
     Tensor* diff_src_tensor = nullptr;
 
@@ -687,11 +682,11 @@ class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
                               diff_dst_tensor.shape(), dnn_shape_diff_src);
     void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
     void* user_i =
-          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     void* user_g =
-          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
-    (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
-                                  ((static_cast<T*>(user_i))[0] > 0);
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] =
+        (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
     return;
   }
 };
@@ -701,8 +696,8 @@ class MklEluOp : public MklReluOpBase<Device, T, eltwise_elu> {
  public:
   ~MklEluOp() {}
 
-  explicit MklEluOp(OpKernelConstruction* context) :
-  MklReluOpBase<Device, T, eltwise_elu>(context) {}
+  explicit MklEluOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_elu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -712,8 +707,8 @@ class MklEluOp : public MklReluOpBase<Device, T, eltwise_elu> {
     GetMklShape(context, src_index, &dnn_shape_src);
 
     Tensor* dst_tensor = nullptr;
-    void* user_i = static_cast<void*>(const_cast<T*>(
-                         src_tensor.flat<T>().data()));
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
     AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
@@ -734,14 +729,14 @@ class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
  public:
   ~MklEluGradOp() {}
 
-  explicit MklEluGradOp(OpKernelConstruction* context) :
-  MklReluGradOpBase<Device, T, eltwise_elu>(context) {}
+  explicit MklEluGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_elu>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
     const size_t src_index = 1;       // index of src input tensor
     const size_t diff_src_index = 0;  // index of diff_src output tensor
-    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& src_tensor = MklGetInput(context, src_index);
     const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
     Tensor* diff_src_tensor = nullptr;
 
@@ -754,9 +749,9 @@ class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
                               diff_dst_tensor.shape(), dnn_shape_diff_src);
     void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
     void* user_i =
-          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     void* user_g =
-          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
     // gradient of elu(x) = 1 if x > 0; elu(x) + 1 otherwise
     T feature = (static_cast<T*>(user_i))[0];
     if (feature > 0) {
@@ -773,8 +768,8 @@ class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
  public:
   ~MklTanhOp() {}
 
-  explicit MklTanhOp(OpKernelConstruction* context) :
-  MklReluOpBase<Device, T, eltwise_tanh>(context) {}
+  explicit MklTanhOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_tanh>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -784,8 +779,8 @@ class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
     GetMklShape(context, src_index, &dnn_shape_src);
 
     Tensor* dst_tensor = nullptr;
-    void* user_i = static_cast<void*>(const_cast<T*>(
-                         src_tensor.flat<T>().data()));
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     MklDnnShape dnn_shape_dst;
     dnn_shape_dst.SetMklTensor(false);
     AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
@@ -795,7 +790,7 @@ class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
     T feature = (static_cast<T*>(user_i))[0];
     T e1 = std::exp(feature);
     T e2 = std::exp(-feature);
-    (static_cast<T*>(out_o))[0] = (e1 - e2)/(e1 + e2);
+    (static_cast<T*>(out_o))[0] = (e1 - e2) / (e1 + e2);
     return;
   }
 };
@@ -805,14 +800,14 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
  public:
   ~MklTanhGradOp() {}
 
-  explicit MklTanhGradOp(OpKernelConstruction* context) :
-  MklReluGradOpBase<Device, T, eltwise_tanh>(context) {}
+  explicit MklTanhGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_tanh>(context) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
     const size_t src_index = 1;       // index of src input tensor
     const size_t diff_src_index = 0;  // index of diff_src output tensor
-    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& src_tensor = MklGetInput(context, src_index);
     const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
     Tensor* diff_src_tensor = nullptr;
 
@@ -825,16 +820,16 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
                               diff_dst_tensor.shape(), dnn_shape_diff_src);
     void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
     void* user_i =
-          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
     // gradient of tanh(x) = 1 - tanh(x)^2
     T feature = (static_cast<T*>(user_i))[0];
     T e1 = std::exp(feature);
     T e2 = std::exp(-feature);
-    T tanh = (e1 - e2)/(e1 + e2);
+    T tanh = (e1 - e2) / (e1 + e2);
     void* user_g =
-          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
-    (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
-                                  (1 - tanh * tanh);
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] =
+        (static_cast<T*>(user_g))[0] * (1 - tanh * tanh);
   }
 };
 
@@ -857,13 +852,13 @@ TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 #ifdef INTEL_MKL_DNN
 
 // register dnn kernels for supported operations and supported types
-#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
-  REGISTER_KERNEL_BUILDER(Name("_MklElu")                          \
+#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)              \
+  REGISTER_KERNEL_BUILDER(Name("_MklElu")                           \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<type>("T")            \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklEluOp<CPUDevice, type>);              \
-  REGISTER_KERNEL_BUILDER(Name("_MklEluGrad")                      \
+                          MklEluOp<CPUDevice, type>);               \
+  REGISTER_KERNEL_BUILDER(Name("_MklEluGrad")                       \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<type>("T")            \
                               .Label(mkl_op_registry::kMklOpLabel), \
@@ -888,4 +883,3 @@ TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES);
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
-
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index b41e529357..7d471e1e4c 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -166,9 +166,9 @@ class MklReshapeOp : public OpKernel {
     MklDnnShape mkl_shape_input;
     GetMklShape(context, kInputSlotIdx, &mkl_shape_input);
     bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
-    const int64 nelems = input_in_mkl_format ?
-                         mkl_shape_input.GetTfShape().num_elements()
-                         : input_tensor.NumElements();
+    const int64 nelems = input_in_mkl_format
+                             ? mkl_shape_input.GetTfShape().num_elements()
+                             : input_tensor.NumElements();
 
     // Preliminary validation of sizes.
     OP_REQUIRES(context, IsLegacyVector(sizes.shape()),
@@ -210,11 +210,11 @@ class MklReshapeOp : public OpKernel {
               product));
       shape.set_dim(unknown_index, missing);
     }
-    OP_REQUIRES(context, shape.num_elements() == nelems,
-                errors::InvalidArgument("Input to reshape is a tensor with ",
-                                        nelems,
-                                        " values, but the requested shape has ",
-                                        shape.num_elements()));
+    OP_REQUIRES(
+        context, shape.num_elements() == nelems,
+        errors::InvalidArgument("Input to reshape is a tensor with ", nelems,
+                                " values, but the requested shape has ",
+                                shape.num_elements()));
 
     if (input_in_mkl_format) {
       TensorShape& shape_to = shape;
@@ -237,38 +237,38 @@ class MklReshapeOp : public OpKernel {
           // need to update MklDnnShape object associated with the input
           // tensor to reflect the shape change expected by reshape.
           if (!SkipReorder(mkl_shape_input, shape_to)) {
-              // If dimensions that are being expanded or collapsed are not
-              // maintained contiguously by MKLDNN, then we use reorder.
-
-              // Get Mkl layout of input tensor.
-              auto input_mkl_md = mkl_shape_input.GetMklLayout();
-              // Set input Mkl layout as the user layout.
-              dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor);
-              // Get expected Tensorflow layout of input tensor.
-              auto output_tf_md = mkl_shape_input.GetTfLayout();
-              auto output_tf_pd = memory::primitive_desc(output_tf_md,
-                                                         cpu_engine);
-
-              Tensor* output_tensor = nullptr;
-              MklShape mkl_shape_output;
-              mkl_shape_output.SetMklTensor(false);
-              // We allocate output tensor in the shape expected by Reshape.
-              AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
-                                        shape_to, mkl_shape_output);
-
-              // Insert reorder between Mkl layout and TensorFlow layout if
-              // needed. If reorder is not needed but reshape is needed (since
-              // shape_from != shape_to), then we just copy input tensor to
-              // output tensor with target shape (we cannot forward Mkl layout
-              // in such case because shape has changed.)
-              std::vector<primitive> net;
-              if (dnn_data_input.CheckReorderToOpMem(output_tf_pd,
-                       output_tensor, &net)) {
-                stream(stream::kind::eager).submit(net).wait();
-              } else {
-                output_tensor->CopyFrom(input_tensor, shape_to);
-              }
-              return;
+            // If dimensions that are being expanded or collapsed are not
+            // maintained contiguously by MKLDNN, then we use reorder.
+
+            // Get Mkl layout of input tensor.
+            auto input_mkl_md = mkl_shape_input.GetMklLayout();
+            // Set input Mkl layout as the user layout.
+            dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor);
+            // Get expected Tensorflow layout of input tensor.
+            auto output_tf_md = mkl_shape_input.GetTfLayout();
+            auto output_tf_pd =
+                memory::primitive_desc(output_tf_md, cpu_engine);
+
+            Tensor* output_tensor = nullptr;
+            MklShape mkl_shape_output;
+            mkl_shape_output.SetMklTensor(false);
+            // We allocate output tensor in the shape expected by Reshape.
+            AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
+                                      shape_to, mkl_shape_output);
+
+            // Insert reorder between Mkl layout and TensorFlow layout if
+            // needed. If reorder is not needed but reshape is needed (since
+            // shape_from != shape_to), then we just copy input tensor to
+            // output tensor with target shape (we cannot forward Mkl layout
+            // in such case because shape has changed.)
+            std::vector<primitive> net;
+            if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, output_tensor,
+                                                   &net)) {
+              stream(stream::kind::eager).submit(net).wait();
+            } else {
+              output_tensor->CopyFrom(input_tensor, shape_to);
+            }
+            return;
           } else {
             // If dimensions that are being expanded or collapsed are
             // maintained contiguously by MKLDNN, then we skip reorder, just
@@ -276,10 +276,10 @@ class MklReshapeOp : public OpKernel {
             // Tensorflow tensor as it is to the output.
             auto output_dims = TFShapeToMklDnnDims(shape_to);
             auto output_strides = CalculateTFStrides(output_dims);
-            auto output_tf_md = MklDnnData<T>::CreateBlockedMemDesc(output_dims,
-                                                               output_strides);
-            auto output_tf_pd = memory::primitive_desc(output_tf_md,
-                                                       cpu_engine);
+            auto output_tf_md = MklDnnData<T>::CreateBlockedMemDesc(
+                output_dims, output_strides);
+            auto output_tf_pd =
+                memory::primitive_desc(output_tf_md, cpu_engine);
 
             // Set MklDnnShape
             MklDnnShape mkl_shape_output;
@@ -291,18 +291,17 @@ class MklReshapeOp : public OpKernel {
 
             // We now simply forward input Mkl tensor to output and change its
             // output MklDnnShape object.
-            ForwardMklTensorInToOutWithMklShape(context, kInputSlotIdx,
-                                              kOutputSlotIdx, mkl_shape_output);
+            ForwardMklTensorInToOutWithMklShape(
+                context, kInputSlotIdx, kOutputSlotIdx, mkl_shape_output);
             return;
           }
-        } catch (mkldnn::error &e) {
+        } catch (mkldnn::error& e) {
           string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-          OP_REQUIRES_OK(context,
-                   errors::Aborted("Operation received an exception:",
-                      error_msg));
+                             ", message: " + string(e.message) + ", in file " +
+                             string(__FILE__) + ":" + std::to_string(__LINE__);
+          OP_REQUIRES_OK(
+              context,
+              errors::Aborted("Operation received an exception:", error_msg));
         }
       }
     } else {
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
new file mode 100644
index 0000000000..c35f857cfe
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_tfconv_op.cc
@@ -0,0 +1,124 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include <algorithm>
+#include <vector>
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+///////////////////////////////////////////////////////////
+//               Op kernel
+///////////////////////////////////////////////////////////
+
+template <typename Device, typename T>
+class MklToTfOp : public OpKernel {
+ public:
+  explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
+    has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // Check that input tensor is in MKL format.
+    const Tensor& input_tensor = MklGetInput(context, 0);
+    MklShape input_shape;
+    GetMklShape(context, 0, &input_shape);
+
+    // if input is already in Tf format, then just copy input tensor to output.
+    if (!input_shape.IsMklTensor()) {
+      context->set_output(0, input_tensor);
+      VLOG(1) << "MKLToTFConversion: No conversion needed, "
+              << "copying input to output";
+      return;
+    }
+
+    // Check that input data type is same as operator data type and that it is
+    // same as output data type.
+    DataType input_data_type = input_type(0);
+    DataType output_data_type = output_type(0);
+    CHECK_EQ(op_data_type, input_data_type);
+    CHECK_EQ(op_data_type, output_data_type);
+
+    TensorShape output_shape;
+    size_t ndims = input_shape.GetDimension();
+    size_t* in_sizes = new size_t[ndims];
+    for (size_t i = 0; i < ndims; i++) {
+      // Outermost to innermost dimension
+      output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]);
+      in_sizes[i] = input_shape.GetSizes()[i];
+    }
+
+    // Allocate output tensor.
+    Tensor* output_tensor = NULL;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, output_shape, &output_tensor));
+
+    dnnLayout_t output_layout =
+        static_cast<dnnLayout_t>(input_shape.GetTfLayout());
+    // Execute DNNConversion.
+    void* input_buffer =
+        static_cast<void*>(const_cast<T*>(input_tensor.flat<T>().data()));
+    delete[] in_sizes;
+    void* output_buffer =
+        static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
+    input_shape.GetConvertedFlatData(output_layout, input_buffer,
+                                     output_buffer);
+    VLOG(1) << "MKLToTFConversion complete successfully.";
+  }
+
+ private:
+  /// Data format of the operation
+  string data_format_str;
+
+  /// Data type of the operation
+  DataType op_data_type;
+
+  /// CPUIDInfo
+  bool has_avx512f_ = false;
+};
+
+///////////////////////////////////////////////////////////
+//               Register kernel
+///////////////////////////////////////////////////////////
+
+#define REGISTER_CPU(T)                                             \
+  REGISTER_KERNEL_BUILDER(Name("_MklToTf")                          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklToTfOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_CPU);
+#undef REGISTER_CPU
+}  // namespace tensorflow
+#endif /* INTEL_MKL */
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 64bdef0008..5d28b87e6b 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -92,13 +92,11 @@ static inline bool IOUGreaterThanThreshold(
   return iou > iou_threshold;
 }
 
-void DoNonMaxSuppressionOp(OpKernelContext* context,
-                           const Tensor& boxes,
-                           const Tensor& scores,
-                           const Tensor& max_output_size,
+void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
+                           const Tensor& scores, const Tensor& max_output_size,
                            const float iou_threshold) {
   OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1,
-      errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+              errors::InvalidArgument("iou_threshold must be in [0, 1]"));
 
   int num_boxes = 0;
   ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
@@ -106,10 +104,8 @@ void DoNonMaxSuppressionOp(OpKernelContext* context,
     return;
   }
 
-  const int output_size =
-      std::min(max_output_size.scalar<int>()(), num_boxes);
-  typename TTypes<float, 2>::ConstTensor boxes_data =
-      boxes.tensor<float, 2>();
+  const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
+  typename TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
 
   std::vector<float> scores_data(num_boxes);
   std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
@@ -181,8 +177,7 @@ template <typename Device>
 class NonMaxSuppressionV2Op : public OpKernel {
  public:
   explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     // boxes: [num_boxes, 4]
@@ -197,10 +192,9 @@ class NonMaxSuppressionV2Op : public OpKernel {
                                 max_output_size.shape().DebugString()));
     // iou_threshold: scalar
     const Tensor& iou_threshold = context->input(3);
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
-        errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
-                                iou_threshold.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                        iou_threshold.shape().DebugString()));
 
     const float iou_threshold_val = iou_threshold.scalar<float>()();
 
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index fdbcf05b89..67d9217b95 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -43,9 +43,10 @@ class NonMaxSuppressionOpTest : public OpsTestBase {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {3});
   TF_ASSERT_OK(RunOpKernel());
@@ -58,7 +59,7 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
 TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) {
   MakeOp(.5);
   AddInputFromArray<float>(TensorShape({6, 4}),
-                           {1, 1,  0, 0,  0, 0.1f,  1, 1.1f,  0, .9f,  1, -0.1f,
+                           {1, 1,  0, 0,  0, 0.1f,  1, 1.1f,  0, .9f, 1, -0.1f,
                             0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {3});
@@ -71,9 +72,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {2});
   TF_ASSERT_OK(RunOpKernel());
@@ -85,9 +87,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostThirtyBoxesFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {30});
   TF_ASSERT_OK(RunOpKernel());
@@ -134,9 +137,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromTenIdenticalBoxes) {
 
 TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f});
   AddInputFromArray<int>(TensorShape({}), {30});
   Status s = RunOpKernel();
diff --git a/tensorflow/core/kernels/nth_element_op.cc b/tensorflow/core/kernels/nth_element_op.cc
index da825e408c..7f12eb953a 100644
--- a/tensorflow/core/kernels/nth_element_op.cc
+++ b/tensorflow/core/kernels/nth_element_op.cc
@@ -16,15 +16,15 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #include "tensorflow/core/kernels/nth_element_op.h"
 
+#include <algorithm>
+#include <iostream>
+#include <vector>
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
-#include <vector>
-#include <algorithm>
-#include <iostream>
 
 namespace tensorflow {
 
@@ -54,8 +54,9 @@ class NthElementOp : public OpKernel {
                 errors::InvalidArgument("Input must be >= 1-D, got shape ",
                                         input_in.shape().DebugString()));
     // The last dimension of input tensor must be greater than N.
-    OP_REQUIRES(context, input_in.dim_size(num_dims-1) > n,
-                errors::InvalidArgument("Input must have at least n+1 columns"));
+    OP_REQUIRES(
+        context, input_in.dim_size(num_dims - 1) > n,
+        errors::InvalidArgument("Input must have at least n+1 columns"));
 
     // std::nth_element only support the nth-smallest selection.
     if (reverse_) {
@@ -64,7 +65,7 @@ class NthElementOp : public OpKernel {
 
     // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1].
     TensorShape out_shape;
-    for (int i = 0; i < num_dims-1; ++i) {
+    for (int i = 0; i < num_dims - 1; ++i) {
       out_shape.AddDim(input_in.dim_size(i));
     }
     Tensor* output_tensor = nullptr;
@@ -83,32 +84,28 @@ namespace functor {
 
 template <typename T>
 struct NthElementFunctor<CPUDevice, T> {
-  void operator() (OpKernelContext* context,
-                   const Tensor& input_tensor,
-                   Tensor& output_tensor,
-                   int n,
-                   bool reverse) {
+  void operator()(OpKernelContext* context, const Tensor& input_tensor,
+                  Tensor& output_tensor, int n, bool reverse) {
     const T* input = input_tensor.flat<T>().data();
     T* output = output_tensor.flat<T>().data();
 
     // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1],
     // then num_rows = d1*d2...dk-1, last_dim = dk.
     const int num_rows = output_tensor.NumElements();
-    const int last_dim = input_tensor.dim_size(input_tensor.dims()-1);
+    const int last_dim = input_tensor.dim_size(input_tensor.dims() - 1);
 
     // Allocate each row to different shard.
-    auto SubNthElement = [&, input, output, last_dim, n](int start,
-                                                         int limit) {
+    auto SubNthElement = [&, input, output, last_dim, n](int start, int limit) {
       // std::nth_element would rearrange the array, so we need a new buffer.
       std::vector<T> buf(last_dim);
 
       for (int b = start; b < limit; ++b) {
         // Copy from one row of elements to buffer
         const T* input_start = input + b * last_dim;
-        const T* input_end = input + (b+1) * last_dim;
+        const T* input_end = input + (b + 1) * last_dim;
         std::copy(input_start, input_end, buf.begin());
 
-        std::nth_element(buf.begin(), buf.begin()+n, buf.end());
+        std::nth_element(buf.begin(), buf.begin() + n, buf.end());
         // The element placed in the nth position is exactly the element that
         // would occur in this position if the range was fully sorted.
         output[b] = buf[n];
@@ -116,9 +113,9 @@ struct NthElementFunctor<CPUDevice, T> {
     };
 
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    // The average time complexity of partition-based nth_element (BFPRT) is O(n),
-    // althought the worst time complexity could be O(n^2).
-    // Here, 20 is a empirical factor of cost_per_unit.
+    // The average time complexity of partition-based nth_element (BFPRT) is
+    // O(n), althought the worst time complexity could be O(n^2). Here, 20 is a
+    // empirical factor of cost_per_unit.
     Shard(worker_threads.num_threads, worker_threads.workers, num_rows,
           20 * last_dim, SubNthElement);
   }
@@ -126,7 +123,6 @@ struct NthElementFunctor<CPUDevice, T> {
 
 }  // namespace functor
 
-
 #define REGISTER_NTHOP(T)                                           \
   REGISTER_KERNEL_BUILDER(                                          \
       Name("NthElement").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
@@ -136,4 +132,3 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_NTHOP);
 #undef REGISTER_NTHOP
 
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/nth_element_op.h b/tensorflow/core/kernels/nth_element_op.h
index 11a6c996b0..e7d25daecc 100644
--- a/tensorflow/core/kernels/nth_element_op.h
+++ b/tensorflow/core/kernels/nth_element_op.h
@@ -26,10 +26,8 @@ namespace functor {
 
 template <typename Device, typename T>
 struct NthElementFunctor {
-  void operator() (OpKernelContext* context,
-                   const Tensor& input_tensor,
-                   Tensor& output_tensor,
-                   int n);
+  void operator()(OpKernelContext* context, const Tensor& input_tensor,
+                  Tensor& output_tensor, int n);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
index 49fd4bdeba..647515ae38 100644
--- a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
@@ -19,16 +19,16 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/one_hot_op.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/one_hot_op.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define DEFINE_GPU_SPEC_INDEX(T, TI)                   \
-  template class generator::OneGenerator<T, TI>;       \
+#define DEFINE_GPU_SPEC_INDEX(T, TI)             \
+  template class generator::OneGenerator<T, TI>; \
   template struct functor::OneHot<GPUDevice, T, TI>;
 
 #define DEFINE_GPU_SPEC(T)         \
diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc
index 9d53882dee..13427d71ff 100644
--- a/tensorflow/core/kernels/ops_util_test.cc
+++ b/tensorflow/core/kernels/ops_util_test.cc
@@ -218,7 +218,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_2) {
 // in_size = 3, ksize = 3, stride = 2, pad_size = 0
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 2, 0}, {0, 3}}, {{1, 3, 3, 2, 0}, {2, 1}},
+      {{0, 3, 3, 2, 0}, {0, 3}},
+      {{1, 3, 3, 2, 0}, {2, 1}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -228,7 +229,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) {
 // in_size = 3, ksize = 3, stride = 2, pad_size = 1
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_1) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 2, 1}, {0, 2}}, {{1, 3, 3, 2, 1}, {1, 2}},
+      {{0, 3, 3, 2, 1}, {0, 2}},
+      {{1, 3, 3, 2, 1}, {1, 2}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -258,7 +260,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_0) {
 // in_size = 3, ksize = 3, stride = 3, pad_size = 1
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_1) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 3, 1}, {0, 2}}, {{1, 3, 3, 3, 1}, {2, 1}},
+      {{0, 3, 3, 3, 1}, {0, 2}},
+      {{1, 3, 3, 3, 1}, {2, 1}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -348,8 +351,8 @@ TEST_F(OpsUtilTest, Misaligned1DSlice) {
 
 TEST_F(OpsUtilTest, Aligned2DSliceOfDim0) {
 #if EIGEN_MAX_ALIGN_BYTES == 0
-  // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is nonzero,
-  // a multidimensional tensor is always aligned.
+  // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is
+  // nonzero, a multidimensional tensor is always aligned.
   Tensor t(DT_FLOAT, TensorShape({3, 4}));
   int64 start = 1;
   int64 end = 2;
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 2033fbf5dc..e0ae5de0f4 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -36,7 +36,7 @@ typedef Eigen::GpuDevice GPUDevice;
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // --------------------------------------------------------------------------
 template <typename Device, typename T>
@@ -123,7 +123,7 @@ class PackOp : public OpKernel {
         ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
         return;
       }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
index b232ba16a7..0ab9ff9f65 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
@@ -95,9 +95,10 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
         int64 sample = b * samples_per_batch;
 
         // On GPU, this check will just fill samples with NAN if it fails.
-        OP_REQUIRES(ctx, stddev > T(0) && minval < maxval &&
-                             (Eigen::numext::isfinite(minval) ||
-                              Eigen::numext::isfinite(maxval)),
+        OP_REQUIRES(ctx,
+                    stddev > T(0) && minval < maxval &&
+                        (Eigen::numext::isfinite(minval) ||
+                         Eigen::numext::isfinite(maxval)),
                     errors::InvalidArgument("Invalid parameters"));
 
         int numIterations = 0;
@@ -118,8 +119,9 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
         // Determine the method to use.
         const T sqrtFactor = Eigen::numext::sqrt((normMin * normMin) + T(4));
         const T cutoff =
-            T(2) * Eigen::numext::exp(
-                       T(0.5) + (normMin * (normMin - sqrtFactor)) / T(4)) /
+            T(2) *
+            Eigen::numext::exp(T(0.5) +
+                               (normMin * (normMin - sqrtFactor)) / T(4)) /
             (normMin + sqrtFactor);
         const T diff = normMax - normMin;
         if (diff < cutoff) {
@@ -309,30 +311,34 @@ class ParameterizedTruncatedNormalOp : public OpKernel {
     } else {
       // Parameters must be broadcastable to the shape [num_batches].
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(means_tensor.shape()) ||
-                   means_tensor.dim_size(0) == 1 ||
-                   means_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(means_tensor.shape()) ||
+              means_tensor.dim_size(0) == 1 ||
+              means_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input means should have length 1 or shape[0], got shape: ",
               means_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(stddevs_tensor.shape()) ||
-                   stddevs_tensor.dim_size(0) == 1 ||
-                   stddevs_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(stddevs_tensor.shape()) ||
+              stddevs_tensor.dim_size(0) == 1 ||
+              stddevs_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input stddevs should have length 1 or shape[0], got shape: ",
               stddevs_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(minvals_tensor.shape()) ||
-                   minvals_tensor.dim_size(0) == 1 ||
-                   minvals_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(minvals_tensor.shape()) ||
+              minvals_tensor.dim_size(0) == 1 ||
+              minvals_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input minvals should have length 1 or shape[0], got shape: ",
               minvals_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(maxvals_tensor.shape()) ||
-                   maxvals_tensor.dim_size(0) == 1 ||
-                   maxvals_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(maxvals_tensor.shape()) ||
+              maxvals_tensor.dim_size(0) == 1 ||
+              maxvals_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input maxvals should have length 1 or shape[0], got shape: ",
               maxvals_tensor.shape().DebugString()));
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
index 933de65c15..ddfeb1bb79 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
@@ -202,12 +202,13 @@ struct TruncatedNormalFunctor<GPUDevice, T> {
                   typename TTypes<T>::Flat output) {
     const auto config = GetCudaLaunchConfig(num_elements, d);
 
-    TruncatedNormalKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        gen, output.data(), num_batches, samples_per_batch, num_elements,
-        means.data(), means.dimension(0) == 1, stddevs.data(),
-        stddevs.dimension(0) == 1, minvals.data(), minvals.dimension(0) == 1,
-        maxvals.data(), maxvals.dimension(0) == 1, kMaxIterations);
+    TruncatedNormalKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            gen, output.data(), num_batches, samples_per_batch, num_elements,
+            means.data(), means.dimension(0) == 1, stddevs.data(),
+            stddevs.dimension(0) == 1, minvals.data(),
+            minvals.dimension(0) == 1, maxvals.data(),
+            maxvals.dimension(0) == 1, kMaxIterations);
   };
 };
 
diff --git a/tensorflow/core/kernels/parse_tensor_op.cc b/tensorflow/core/kernels/parse_tensor_op.cc
index 6b599612ad..dd41744f02 100644
--- a/tensorflow/core/kernels/parse_tensor_op.cc
+++ b/tensorflow/core/kernels/parse_tensor_op.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index a406317213..01bcfede1e 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -258,7 +258,7 @@ struct LaunchMaxPooling3dGradOp<CPUDevice, T> {
           Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
 #else
           Eigen::IndexList<Eigen::type2index<1>, int, int, int,
-                           Eigen::type2index<1> >
+                           Eigen::type2index<1>>
               bcast;
           bcast.set(1, csize);
           bcast.set(2, rsize);
@@ -431,7 +431,7 @@ struct LaunchAvgPooling3dGradOp<CPUDevice, T> {
           Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
 #else
           Eigen::IndexList<Eigen::type2index<1>, int, int, int,
-                           Eigen::type2index<1> >
+                           Eigen::type2index<1>>
               bcast;
           bcast.set(1, csize);
           bcast.set(2, rsize);
@@ -833,7 +833,7 @@ TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T)
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS)
+    TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS)
 #undef REGISTER_SYCL_KERNELS
 #endif  // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/pooling_ops_3d_sycl.h b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
index c1bc5af498..b4bead2456 100644
--- a/tensorflow/core/kernels/pooling_ops_3d_sycl.h
+++ b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
@@ -281,12 +281,11 @@ class MaxPool3DGradSYCL {
 
     const T* input_data_n =
         input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
-    const T* output_data_n =
-        output_data +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
-    const T* input_backprop_n =
-        input_backprop +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+    const T* output_data_n = output_data + n * p_.out_planes_ * p_.out_cols_ *
+                                               p_.out_rows_ * p_.depth_;
+    const T* input_backprop_n = input_backprop + n * p_.out_planes_ *
+                                                     p_.out_cols_ *
+                                                     p_.out_rows_ * p_.depth_;
     for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
       int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
       const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
@@ -678,9 +677,9 @@ class AvgPool3DGradSYCL {
     n /= p_.in_planes_;
 
     T gradient = T(0);
-    const T* input_backprop_n =
-        input_backprop +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+    const T* input_backprop_n = input_backprop + n * p_.out_planes_ *
+                                                     p_.out_cols_ *
+                                                     p_.out_rows_ * p_.depth_;
     for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
       int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
       const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
index e3131b804f..fc7cb437b8 100644
--- a/tensorflow/core/kernels/pooling_ops_common.h
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -195,7 +195,6 @@ class MaxPoolingOp : public OpKernel {
       //    and updates the corresponding column(s) in output_as_matrix with the
       //    max value.
       auto shard = [&params, &in_mat, &out_mat](int64 start, int64 limit) {
-
         const int32 in_rows = params.tensor_in_rows;
         const int32 in_cols = params.tensor_in_cols;
         const int32 pad_rows = params.pad_rows;
@@ -443,7 +442,6 @@ class MaxPoolingV2Op : public OpKernel {
       //    and updates the corresponding column(s) in output_as_matrix with the
       //    max value.
       auto shard = [&params, &in_mat, &out_mat](int64 start, int64 limit) {
-
         const int32 in_rows = params.tensor_in_rows;
         const int32 in_cols = params.tensor_in_cols;
         const int32 pad_rows = params.pad_rows;
diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc
index d148c9f78d..176720c22c 100644
--- a/tensorflow/core/kernels/quantization_utils_test.cc
+++ b/tensorflow/core/kernels/quantization_utils_test.cc
@@ -385,8 +385,12 @@ void TestQuantizedToFloatInPlaceUsingEigen(
   // These are the float values we're going to test the conversions on.
   typedef std::pair<float, float> FPair;
   for (FPair min_and_max : std::vector<FPair>{
-           FPair(-255.0f, 255.0f), FPair(-1.0f, 1.0f), FPair(-1.0f, 255.0f),
-           FPair(0.0f, 1e6), FPair(0.0f, 1.0f), FPair(-31.0f, 13.0f),
+           FPair(-255.0f, 255.0f),
+           FPair(-1.0f, 1.0f),
+           FPair(-1.0f, 255.0f),
+           FPair(0.0f, 1e6),
+           FPair(0.0f, 1.0f),
+           FPair(-31.0f, 13.0f),
            FPair(-5.89505e+08, 5.89505e+08),
        }) {
     const float f_min = min_and_max.first;
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h
index 1363c7e325..3b09ea2527 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.h
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h
@@ -71,7 +71,8 @@ struct QuantizeAndDequantizeOneScaleImpl {
         out.device(d) =
             ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) *
                  scale +
-             T(0.5)).floor() *
+             T(0.5))
+                    .floor() *
                 inverse_scale +
             min_range;
       } else {
diff --git a/tensorflow/core/kernels/quantize_op_test.cc b/tensorflow/core/kernels/quantize_op_test.cc
index d2cc55a94d..57982bdf76 100644
--- a/tensorflow/core/kernels/quantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_op_test.cc
@@ -250,7 +250,8 @@ TEST_F(QuantizedOpTest, QuantizeV2_32Bit) {
   Tensor expected(allocator(), DT_QINT32, TensorShape({element_count}));
   test::FillValues<qint32>(&expected,
                            {
-                               std::numeric_limits<int32>::min(), 0,
+                               std::numeric_limits<int32>::min(),
+                               0,
                                static_cast<int32>(1.0f * (1 << 23)),
                                static_cast<int32>(1.25f * (1 << 23)),
                                static_cast<int32>(1.75f * (1 << 23)),
diff --git a/tensorflow/core/kernels/quantized_batch_norm_op.cc b/tensorflow/core/kernels/quantized_batch_norm_op.cc
index 18d83b4149..b03da7ad17 100644
--- a/tensorflow/core/kernels/quantized_batch_norm_op.cc
+++ b/tensorflow/core/kernels/quantized_batch_norm_op.cc
@@ -16,11 +16,11 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc
index d67f1ab3ec..b03ac8e87d 100644
--- a/tensorflow/core/kernels/quantized_concat_op.cc
+++ b/tensorflow/core/kernels/quantized_concat_op.cc
@@ -135,8 +135,8 @@ class QuantizedConcatOp : public OpKernel {
           context, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
-              input_shape.DebugString(), " vs. shape[", i, "] = ",
-              in.shape().DebugString()));
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", in.shape().DebugString()));
       for (int j = 0; j < input_dims; ++j) {
         if (j == concat_dim) {
           continue;
@@ -145,8 +145,8 @@ class QuantizedConcatOp : public OpKernel {
             context, in.dim_size(j) == input_shape.dim_size(j),
             errors::InvalidArgument(
                 "ConcatOp : Dimensions of inputs should match: shape[0] = ",
-                input_shape.DebugString(), " vs. shape[", i, "] = ",
-                in.shape().DebugString()));
+                input_shape.DebugString(), " vs. shape[", i,
+                "] = ", in.shape().DebugString()));
       }
       if (in.NumElements() > 0) {
         int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc
index 1921b83d12..5b3570edff 100644
--- a/tensorflow/core/kernels/quantized_conv_ops.cc
+++ b/tensorflow/core/kernels/quantized_conv_ops.cc
@@ -278,10 +278,9 @@ class Im2ColConvFunctor {
           *resource = new Im2ColBufferResource<T1, chunk_value_count>();
           return Status::OK();
         };
-    OP_REQUIRES_OK(
-        context,
-        context->resource_manager()->LookupOrCreate(
-            "Conv2d", "im2col_buffer", &im2col_buffer_resource, creator));
+    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
+                                "Conv2d", "im2col_buffer",
+                                &im2col_buffer_resource, creator));
     // This means that multiple ops can't be run simultaneously on different
     // threads, because we have a single shared resource. The platforms this is
     // aimed at have intra-op parallelism as their focus though, so it shouldn't
diff --git a/tensorflow/core/kernels/quantized_instance_norm.cc b/tensorflow/core/kernels/quantized_instance_norm.cc
index c29f534f31..d62094cc9f 100644
--- a/tensorflow/core/kernels/quantized_instance_norm.cc
+++ b/tensorflow/core/kernels/quantized_instance_norm.cc
@@ -278,10 +278,10 @@ class QuantizedInstanceNorm : public OpKernel {
     float input_max = context->input(2).flat<float>()(0);
     float input_scale = (input_max - input_min) / 255.0f;
 
-    OP_REQUIRES(
-        context, input_min < input_max,
-        errors::InvalidArgument("input_min must be less than input_max : ",
-                                input_min, " >= ", input_max));
+    OP_REQUIRES(context, input_min < input_max,
+                errors::InvalidArgument(
+                    "input_min must be less than input_max : ", input_min,
+                    " >= ", input_max));
 
     auto input_tensor = input.tensor<quint8, 4>();
     auto N = input_tensor.dimension(0);
diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc
index afb30d5f62..da8c46dc51 100644
--- a/tensorflow/core/kernels/quantized_matmul_op.cc
+++ b/tensorflow/core/kernels/quantized_matmul_op.cc
@@ -104,9 +104,9 @@ class QuantizedMatMulOp : public OpKernel {
 
     OP_REQUIRES(context,
                 a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
-                errors::InvalidArgument("Matrix size-compatible: In[0]: ",
-                                        a.shape().DebugString(), ", In[1]: ",
-                                        b.shape().DebugString()));
+                errors::InvalidArgument(
+                    "Matrix size-compatible: In[0]: ", a.shape().DebugString(),
+                    ", In[1]: ", b.shape().DebugString()));
 
     OP_REQUIRES(context, ((shift_c >= 0) && (shift_c <= 31)),
                 errors::InvalidArgument("shift_c must be between 0 and 31, "
diff --git a/tensorflow/core/kernels/quantized_matmul_op_test.cc b/tensorflow/core/kernels/quantized_matmul_op_test.cc
index 535b5115c3..c9f05dbc10 100644
--- a/tensorflow/core/kernels/quantized_matmul_op_test.cc
+++ b/tensorflow/core/kernels/quantized_matmul_op_test.cc
@@ -206,17 +206,32 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) {
   // We have set the transpose_a flag to true, so the matrix is transposed, and
   // for filling the values the in-memory storage order is effectively
   // column major, rather than the default row-major.
-  AddInputFromArray<quint8>(TensorShape({a_rows, a_cols}),
-                            {
-                                11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
-                            });
+  AddInputFromArray<quint8>(TensorShape({a_rows, a_cols}), {
+                                                               11,
+                                                               10,
+                                                               9,
+                                                               8,
+                                                               7,
+                                                               6,
+                                                               5,
+                                                               4,
+                                                               3,
+                                                               2,
+                                                               1,
+                                                               0,
+                                                           });
 
   // The B matrix is:
   // |   1 |   4|
   // |   2 |   5|
   // |   3 |   6|
   AddInputFromArray<quint8>(TensorShape({b_rows, b_cols}), {
-                                                               1, 4, 2, 5, 3, 6,
+                                                               1,
+                                                               4,
+                                                               2,
+                                                               5,
+                                                               3,
+                                                               6,
                                                            });
   AddInputFromArray<float>(TensorShape({1}), {-12.0f});
   AddInputFromArray<float>(TensorShape({1}), {243.0f});
@@ -238,10 +253,16 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) {
   // |  -50 | -113 |
   // |  -56 | -128 |
   Tensor expected(allocator(), DT_QINT32, TensorShape({a_cols, b_cols}));
-  test::FillValues<qint32>(&expected,
-                           {
-                               -38, -83, -44, -98, -50, -113, -56, -128,
-                           });
+  test::FillValues<qint32>(&expected, {
+                                          -38,
+                                          -83,
+                                          -44,
+                                          -98,
+                                          -50,
+                                          -113,
+                                          -56,
+                                          -128,
+                                      });
   test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
 }
 
diff --git a/tensorflow/core/kernels/quantized_mul_op.cc b/tensorflow/core/kernels/quantized_mul_op.cc
index eaa5e667f7..3c7536e037 100644
--- a/tensorflow/core/kernels/quantized_mul_op.cc
+++ b/tensorflow/core/kernels/quantized_mul_op.cc
@@ -298,9 +298,8 @@ class QuantizedMulOp : public OpKernel {
       return;
     }
     Tensor* z;
-    OP_REQUIRES_OK(
-        context,
-        context->allocate_output(0, BCast::ToShape(bcast.output_shape()), &z));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, BCast::ToShape(bcast.output_shape()), &z));
 
     // Make sure that we have valid quantization ranges for the input buffers.
     // If the difference between the min and max is negative or zero, it makes
diff --git a/tensorflow/core/kernels/quantized_mul_op_test.cc b/tensorflow/core/kernels/quantized_mul_op_test.cc
index b0550c8260..a4e407c7a9 100644
--- a/tensorflow/core/kernels/quantized_mul_op_test.cc
+++ b/tensorflow/core/kernels/quantized_mul_op_test.cc
@@ -188,11 +188,12 @@ void TestManualScalar() {
       10.0f, {1}, {10.0f}, -100.0f, 100.0f, {10},
       {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f},
       3.0f);
-  TestMul({1}, {10.0f}, -100.0f, 100.0f, {10},
-          {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f,
-          10.0f, {10}, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f,
-                        90.0f, 100.0f},
-          3.0f);
+  TestMul(
+      {1}, {10.0f}, -100.0f, 100.0f, {10},
+      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f,
+      10.0f, {10},
+      {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f},
+      3.0f);
 }
 
 void TestScalar() {
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
index 330d161c32..de495c19cb 100644
--- a/tensorflow/core/kernels/queue_base.cc
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -39,8 +39,8 @@ Status HandleSliceToElement(const Tensor& parent, Tensor* element,
     return errors::Internal(
         "HandleSliceToElement Cannot copy slice: number of elements does not "
         "match.  Shapes are: [element]: ",
-        element->shape().DebugString(), ", [parent slice]: ",
-        chip_shape.DebugString());
+        element->shape().DebugString(),
+        ", [parent slice]: ", chip_shape.DebugString());
   }
   auto parent_as_matrix = parent.flat_outer_dims<T>();
   element->flat<T>() = parent_as_matrix.chip(index, 0);
diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc
index 17831b7437..46a02854d7 100644
--- a/tensorflow/core/kernels/queue_ops.cc
+++ b/tensorflow/core/kernels/queue_ops.cc
@@ -428,13 +428,14 @@ REGISTER_KERNEL_BUILDER(Name("QueueSizeV2").Device(DEVICE_CPU), QueueSizeOp);
 class QueueIsClosedOp : public QueueOpKernel {
  public:
   explicit QueueIsClosedOp(OpKernelConstruction* context)
-     : QueueOpKernel(context) {}
+      : QueueOpKernel(context) {}
 
  protected:
   void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
                     DoneCallback callback) override {
     Tensor* Tqueue_is_closed = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed));
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed));
     Tqueue_is_closed->flat<bool>().setConstant(queue->is_closed());
     callback();
   }
@@ -443,8 +444,10 @@ class QueueIsClosedOp : public QueueOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(QueueIsClosedOp);
 };
 
-REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU), QueueIsClosedOp);
-REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU), QueueIsClosedOp);
+REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU),
+                        QueueIsClosedOp);
+REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU),
+                        QueueIsClosedOp);
 
 class FakeQueueOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/random_crop_op.cc b/tensorflow/core/kernels/random_crop_op.cc
index ba94d6be5c..554909760a 100644
--- a/tensorflow/core/kernels/random_crop_op.cc
+++ b/tensorflow/core/kernels/random_crop_op.cc
@@ -68,10 +68,10 @@ class RandomCropOp : public OpKernel {
     // Edge case. The target dimensions are larger then the image, so
     // zero-pad the image. This guarantees that the image will *always*
     // be [target_height, target_width] in size.
-    OP_REQUIRES(
-        context, width >= target_width,
-        errors::FailedPrecondition("width must be >= target_width: width = ",
-                                   width, ", target_width = ", target_width));
+    OP_REQUIRES(context, width >= target_width,
+                errors::FailedPrecondition(
+                    "width must be >= target_width: width = ", width,
+                    ", target_width = ", target_width));
     OP_REQUIRES(context, height >= target_height,
                 errors::FailedPrecondition(
                     "height must be >= target_height: height = ", height,
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index 55a8b9c9b6..78ff7948fb 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -50,7 +50,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 using random::PhiloxRandom;
@@ -271,9 +271,10 @@ class RandomGammaOp : public OpKernel {
     const Tensor& shape_t = ctx->input(0);
     const Tensor& alpha_t = ctx->input(1);
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(shape_t.shape()) &&
-                         (shape_t.dtype() == DataType::DT_INT32 ||
-                          shape_t.dtype() == DataType::DT_INT64),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(shape_t.shape()) &&
+                    (shape_t.dtype() == DataType::DT_INT32 ||
+                     shape_t.dtype() == DataType::DT_INT64),
                 errors::InvalidArgument(
                     "shape must be a vector of {int32,int64}, got shape: ",
                     shape_t.DebugString()));
@@ -325,7 +326,7 @@ class RandomGammaOp : public OpKernel {
     // avoid a couple flops which can be done on a per-alpha basis.
 
     auto DoWork = [num_samples, num_alphas, &rng, samples_flat, alpha_flat](
-        int start_output, int limit_output) {
+                      int start_output, int limit_output) {
       using Eigen::numext::exp;
       using Eigen::numext::log;
       using Eigen::numext::pow;
@@ -448,40 +449,40 @@ class RandomGammaOp : public OpKernel {
 
 }  // namespace
 
-#define REGISTER(TYPE)                                                      \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice, random::UniformDistribution<random::PhiloxRandom, TYPE> >; \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice, random::NormalDistribution<random::PhiloxRandom, TYPE> >;  \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice,                                                            \
-      random::TruncatedNormalDistribution<                                  \
-          random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >;       \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomUniform")                                                 \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<CPUDevice, random::UniformDistribution<                \
-                                    random::PhiloxRandom, TYPE> >);         \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomStandardNormal")                                          \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<CPUDevice, random::NormalDistribution<                 \
-                                    random::PhiloxRandom, TYPE> >);         \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("TruncatedNormal")                                               \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<                                                       \
-          CPUDevice,                                                        \
-          random::TruncatedNormalDistribution<                              \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);  \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),     \
+#define REGISTER(TYPE)                                                         \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice, random::UniformDistribution<random::PhiloxRandom, TYPE>>;     \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice, random::NormalDistribution<random::PhiloxRandom, TYPE>>;      \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice,                                                               \
+      random::TruncatedNormalDistribution<                                     \
+          random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>;           \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<CPUDevice, random::UniformDistribution<                   \
+                                    random::PhiloxRandom, TYPE>>);             \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<CPUDevice,                                                \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          CPUDevice,                                                           \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);      \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),        \
       RandomGammaOp<TYPE>)
 
 #define REGISTER_INT(IntType)                                   \
@@ -504,33 +505,33 @@ TF_CALL_int64(REGISTER_INT);
 
 #if GOOGLE_CUDA
 
-#define REGISTER(TYPE)                                              \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("RandomUniform")                                         \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<GPUDevice, random::UniformDistribution<        \
-                                    random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("RandomStandardNormal")                                  \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<GPUDevice, random::NormalDistribution<         \
-                                    random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("TruncatedNormal")                                       \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<                                               \
-          GPUDevice,                                                \
-          random::TruncatedNormalDistribution<                      \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+#define REGISTER(TYPE)                                                         \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<GPUDevice, random::UniformDistribution<                   \
+                                    random::PhiloxRandom, TYPE>>);             \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<GPUDevice,                                                \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          GPUDevice,                                                           \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);
 
 #define REGISTER_INT(IntType)                                   \
   REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
@@ -565,13 +566,12 @@ struct FillPhiloxRandomKernel;
 template <class Distribution>
 struct FillPhiloxRandomKernel<Distribution, false> {
   typedef typename Distribution::ResultElementType T;
-  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write, sycl::access::target::global_buffer>;
+  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write,
+                                        sycl::access::target::global_buffer>;
 
-  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist)
-      : data_(data),
-        gen_(gen),
-        dist_(dist) {
-  }
+  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen,
+                         Distribution& dist)
+      : data_(data), gen_(gen), dist_(dist) {}
 
   void operator()(sycl::nd_item<1> item) {
     const size_t kGroupSize = Distribution::kResultElementCount;
@@ -597,7 +597,7 @@ struct FillPhiloxRandomKernel<Distribution, false> {
     const typename Distribution::ResultType samples = dist_(&gen_);
     for (size_t i = 0; i < kGroupSize; ++i) {
       if (offset >= size) {
-          return;
+        return;
       }
       data[offset] = samples[i];
       ++offset;
@@ -610,17 +610,15 @@ struct FillPhiloxRandomKernel<Distribution, false> {
   Distribution dist_;
 };
 
-
 template <class Distribution>
 struct FillPhiloxRandomKernel<Distribution, true> {
   typedef typename Distribution::ResultElementType T;
-  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write, sycl::access::target::global_buffer>;
+  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write,
+                                        sycl::access::target::global_buffer>;
 
-  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist)
-      : data_(data),
-        gen_(gen),
-        dist_(dist) {
-  }
+  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen,
+                         Distribution& dist)
+      : data_(data), gen_(gen), dist_(dist) {}
 
   void operator()(sycl::nd_item<1> item) {
     using random::PhiloxRandom;
@@ -628,9 +626,9 @@ struct FillPhiloxRandomKernel<Distribution, true> {
 
     const size_t kReservedSamplesPerOutput = 256;
     const size_t kGroupSize = Distribution::kResultElementCount;
-    const size_t kGeneratorSkipPerOutputGroup = kGroupSize *
-                                                kReservedSamplesPerOutput /
-                                                PhiloxRandom::kResultElementCount;
+    const size_t kGeneratorSkipPerOutputGroup =
+        kGroupSize * kReservedSamplesPerOutput /
+        PhiloxRandom::kResultElementCount;
 
     const size_t item_id = item.get_global(0);
     const size_t total_item_count = item.get_global_range();
@@ -674,10 +672,9 @@ class FillRandomKernel;
 // It splits the work into several tasks and run them in parallel
 template <class Distribution>
 void FillPhiloxRandom<SYCLDevice, Distribution>::operator()(
-    OpKernelContext* context, const SYCLDevice& device, random::PhiloxRandom gen,
-    typename Distribution::ResultElementType* data, int64 size,
-    Distribution dist) {
-
+    OpKernelContext* context, const SYCLDevice& device,
+    random::PhiloxRandom gen, typename Distribution::ResultElementType* data,
+    int64 size, Distribution dist) {
   const size_t group_size = device.maxSyclThreadsPerBlock();
   const size_t group_count = (size + group_size - 1) / group_size;
 
@@ -686,50 +683,52 @@ void FillPhiloxRandom<SYCLDevice, Distribution>::operator()(
   device.sycl_queue().submit([&](sycl::handler& cgh) {
     auto access = buffer.template get_access<sycl::access::mode::write>(cgh);
 
-    FillPhiloxRandomKernel<Distribution, Distribution::kVariableSamplesPerOutput> task(access, gen, dist);
+    FillPhiloxRandomKernel<Distribution,
+                           Distribution::kVariableSamplesPerOutput>
+        task(access, gen, dist);
     cgh.parallel_for<class FillRandomKernel<Distribution>>(
-      sycl::nd_range<1>(sycl::range<1>(group_count * group_size), sycl::range<1>(group_size)),
-      task
-    );
+        sycl::nd_range<1>(sycl::range<1>(group_count * group_size),
+                          sycl::range<1>(group_size)),
+        task);
   });
 }
 
-}
+}  // namespace functor
+
+#define REGISTER(TYPE)                                                         \
+  template struct functor::FillPhiloxRandom<                                   \
+      SYCLDevice, random::UniformDistribution<random::PhiloxRandom, TYPE>>;    \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<SYCLDevice, random::UniformDistribution<                  \
+                                     random::PhiloxRandom, TYPE>>);            \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<SYCLDevice,                                               \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          SYCLDevice,                                                          \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);
 
-#define REGISTER(TYPE)                                                       \
-  template struct functor::FillPhiloxRandom<                                 \
-      SYCLDevice, random::UniformDistribution<random::PhiloxRandom, TYPE> >; \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RandomUniform")                                                  \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<SYCLDevice, random::UniformDistribution<                \
-                                    random::PhiloxRandom, TYPE> >);          \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RandomStandardNormal")                                           \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<SYCLDevice, random::NormalDistribution<                 \
-                                    random::PhiloxRandom, TYPE> >);          \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("TruncatedNormal")                                                \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<                                                        \
-          SYCLDevice,                                                        \
-          random::TruncatedNormalDistribution<                               \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
-
-#define REGISTER_INT(IntType)                                    \
-  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")               \
-                              .Device(DEVICE_SYCL)               \
-                              .HostMemory("shape")               \
-                              .HostMemory("minval")              \
-                              .HostMemory("maxval")              \
-                              .TypeConstraint<IntType>("Tout"),  \
+#define REGISTER_INT(IntType)                                   \
+  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
+                              .Device(DEVICE_SYCL)              \
+                              .HostMemory("shape")              \
+                              .HostMemory("minval")             \
+                              .HostMemory("maxval")             \
+                              .TypeConstraint<IntType>("Tout"), \
                           RandomUniformIntOp<SYCLDevice, IntType>);
 
 TF_CALL_float(REGISTER);
@@ -740,6 +739,6 @@ TF_CALL_int64(REGISTER_INT);
 #undef REGISTER
 #undef REGISTER_INT
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
index 7afa6974c6..3393b39faf 100644
--- a/tensorflow/core/kernels/random_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -222,9 +222,8 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
       (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
       block_size;
 
-  FillPhiloxRandomKernelLaunch<
-      Distribution><<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size,
-                                                               dist);
+  FillPhiloxRandomKernelLaunch<Distribution>
+      <<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size, dist);
 };
 
 // Explicit instantiation of the GPU distributions functors
diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc
index bf1d83ec75..64fb4a5c22 100644
--- a/tensorflow/core/kernels/random_poisson_op.cc
+++ b/tensorflow/core/kernels/random_poisson_op.cc
@@ -103,7 +103,7 @@ struct PoissonFunctor<CPUDevice, T, U> {
     typedef random::UniformDistribution<random::PhiloxRandom, CT> Uniform;
 
     auto DoWork = [num_samples, num_rate, &rng, samples_flat, rate_flat](
-        int start_output, int limit_output) {
+                      int start_output, int limit_output) {
       // Capturing "rng" by value would only make a copy for the _shared_
       // lambda.  Since we want to let each worker have its own copy, we pass
       // "rng" by reference and explicitly do a copy assignment.
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index e9695cfde3..87fc943331 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -334,96 +334,95 @@ void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
       // TODO(josh11b): This makes two copies of callback, avoid this if possible.
       dequeue_attempts_.emplace_back(
           num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
-          [callback, allow_small_batch, this](Attempt* attempt)
-              EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                int32 queue_size = queues_[0].size();
-                if (closed_ && queue_size < attempt->elements_requested) {
-                  // If we don't have enough for a full dequeue, we have
-                  // to reset the attempt tuple.
-                  if (!attempt->tuple.empty()) {
-                    // Restore already-dequeued elements to the queue.
-                    for (int64 i = attempt->tuple[0].dim_size(0) -
-                                   attempt->elements_requested - 1;
-                         i >= 0; --i) {
-                      for (int j = 0; j < num_components(); ++j) {
-                        PersistentTensor element;
-                        Status s = GetElementComponentFromBatch(
-                            attempt->tuple, i, j, attempt->context, &element);
-                        if (!s.ok()) {
-                          attempt->context->SetStatus(
-                              errors::DataLoss("Failed to restore element from "
-                                               "partially-dequeued batch "
-                                               "to RandomShuffleQueue: ",
-                                               s.error_message()));
-                        }
-                        queues_[j].push_back(element);
-                      }
-                    }
-                  }
-                  if (allow_small_batch && !queues_[0].empty()) {
-                    // Request all remaining elements in the queue.
-                    queue_size = queues_[0].size();
-                    attempt->tuple.clear();
-                    attempt->elements_requested = queue_size;
-                  } else {
-                    if (allow_small_batch) {
-                      // There may be some other attempts containing
-                      // values.  If so, we'll yield and wait for them
-                      // to add elements to the queue.
-                      if (!enqueue_attempts_.empty()) return kProgress;
-                    }
-                    if (attempt->context->status().ok()) {
-                      attempt->context->SetStatus(errors::OutOfRange(
-                          "RandomShuffleQueue '", name_, "' is closed and has ",
-                          "insufficient elements (requested ",
-                          attempt->elements_requested, ", current size ",
-                          queue_size, ")"));
+          [callback, allow_small_batch,
+           this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+            int32 queue_size = queues_[0].size();
+            if (closed_ && queue_size < attempt->elements_requested) {
+              // If we don't have enough for a full dequeue, we have
+              // to reset the attempt tuple.
+              if (!attempt->tuple.empty()) {
+                // Restore already-dequeued elements to the queue.
+                for (int64 i = attempt->tuple[0].dim_size(0) -
+                               attempt->elements_requested - 1;
+                     i >= 0; --i) {
+                  for (int j = 0; j < num_components(); ++j) {
+                    PersistentTensor element;
+                    Status s = GetElementComponentFromBatch(
+                        attempt->tuple, i, j, attempt->context, &element);
+                    if (!s.ok()) {
+                      attempt->context->SetStatus(
+                          errors::DataLoss("Failed to restore element from "
+                                           "partially-dequeued batch "
+                                           "to RandomShuffleQueue: ",
+                                           s.error_message()));
                     }
-                    return kComplete;
+                    queues_[j].push_back(element);
                   }
                 }
+              }
+              if (allow_small_batch && !queues_[0].empty()) {
+                // Request all remaining elements in the queue.
+                queue_size = queues_[0].size();
+                attempt->tuple.clear();
+                attempt->elements_requested = queue_size;
+              } else {
+                if (allow_small_batch) {
+                  // There may be some other attempts containing
+                  // values.  If so, we'll yield and wait for them
+                  // to add elements to the queue.
+                  if (!enqueue_attempts_.empty()) return kProgress;
+                }
+                if (attempt->context->status().ok()) {
+                  attempt->context->SetStatus(errors::OutOfRange(
+                      "RandomShuffleQueue '", name_, "' is closed and has ",
+                      "insufficient elements (requested ",
+                      attempt->elements_requested, ", current size ",
+                      queue_size, ")"));
+                }
+                return kComplete;
+              }
+            }
 
-                RunResult result = kNoProgress;
-                if (!closed_) queue_size -= min_after_dequeue_;
-                for (; queue_size > 0; --queue_size) {
-                  if (attempt->tuple.empty()) {
-                    // Only allocate tuple when we have something to dequeue
-                    // so we don't use excessive memory when there are many
-                    // blocked dequeue attempts waiting.
-                    attempt->tuple.reserve(num_components());
-                    for (int i = 0; i < num_components(); ++i) {
-                      const TensorShape shape =
-                          ManyOutShape(i, attempt->elements_requested);
-                      Tensor element;
-                      attempt->context->SetStatus(
-                          attempt->context->allocate_temp(component_dtypes_[i],
-                                                          shape, &element));
-                      if (!attempt->context->status().ok()) return kComplete;
-                      attempt->tuple.emplace_back(element);
-                    }
-                  }
-                  result = kProgress;
-                  Tuple tuple;
-                  DequeueLocked(attempt->context, &tuple);
-                  const int index = attempt->tuple[0].dim_size(0) -
-                                    attempt->elements_requested;
-                  for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
-                        std::move(tuple[i]), &attempt->tuple[i], index));
-                    if (!attempt->context->status().ok()) return kComplete;
-                  }
-                  tuple.clear();
-                  --attempt->elements_requested;
-                  if (attempt->elements_requested == 0) {
-                    tuple = attempt->tuple;
-                    attempt->done_callback = [callback, tuple]() {
-                      callback(tuple);
-                    };
-                    return kComplete;
-                  }
+            RunResult result = kNoProgress;
+            if (!closed_) queue_size -= min_after_dequeue_;
+            for (; queue_size > 0; --queue_size) {
+              if (attempt->tuple.empty()) {
+                // Only allocate tuple when we have something to dequeue
+                // so we don't use excessive memory when there are many
+                // blocked dequeue attempts waiting.
+                attempt->tuple.reserve(num_components());
+                for (int i = 0; i < num_components(); ++i) {
+                  const TensorShape shape =
+                      ManyOutShape(i, attempt->elements_requested);
+                  Tensor element;
+                  attempt->context->SetStatus(attempt->context->allocate_temp(
+                      component_dtypes_[i], shape, &element));
+                  if (!attempt->context->status().ok()) return kComplete;
+                  attempt->tuple.emplace_back(element);
                 }
-                return result;
-              });
+              }
+              result = kProgress;
+              Tuple tuple;
+              DequeueLocked(attempt->context, &tuple);
+              const int index =
+                  attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+              for (int i = 0; i < num_components(); ++i) {
+                attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                    std::move(tuple[i]), &attempt->tuple[i], index));
+                if (!attempt->context->status().ok()) return kComplete;
+              }
+              tuple.clear();
+              --attempt->elements_requested;
+              if (attempt->elements_requested == 0) {
+                tuple = attempt->tuple;
+                attempt->done_callback = [callback, tuple]() {
+                  callback(tuple);
+                };
+                return kComplete;
+              }
+            }
+            return result;
+          });
     }
   }
   if (!already_cancelled) {
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 36ca7f834f..15ae4c1fc5 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -312,8 +312,7 @@ __global__ void ColumnReduceKernel(
   int col = blockIdx.x * 32 + threadIdx.x;
 
   value_type sum = initVal;
-  if (row < num_rows && col < num_cols)
-    sum = in[row * num_cols + col];
+  if (row < num_rows && col < num_cols) sum = in[row * num_cols + col];
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
@@ -366,8 +365,7 @@ __global__ void CleanupSegments(
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
 
   value_type val = initVal;
-  if (tid < segment_size * num_cols)
-    val = partial_sums[tid];
+  if (tid < segment_size * num_cols) val = partial_sums[tid];
 
   typedef cub::WarpReduce<value_type> WarpReduce;
 
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
index afad288cc0..d52358737f 100644
--- a/tensorflow/core/kernels/relu_op.cc
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -31,7 +31,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_RELU_KERNELS(type)                                   \
   REGISTER_KERNEL_BUILDER(                                            \
@@ -113,8 +113,7 @@ namespace functor {
                                                                                \
   template <>                                                                  \
   void Selu<GPUDevice, T>::operator()(                                         \
-      const GPUDevice& d,                                                      \
-      typename TTypes<T>::ConstTensor features,                                \
+      const GPUDevice& d, typename TTypes<T>::ConstTensor features,            \
       typename TTypes<T>::Tensor activations);                                 \
   extern template struct Selu<GPUDevice, T>;                                   \
                                                                                \
@@ -125,8 +124,6 @@ namespace functor {
       typename TTypes<T>::Tensor backprops);                                   \
   extern template struct SeluGrad<GPUDevice, T>;
 
-
-
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 }  // namespace functor
 
@@ -157,8 +154,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
       Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),  \
       SeluGradOp<GPUDevice, type>)
 
-
-
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 
@@ -192,10 +187,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
       Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
       SeluGradOp<SYCLDevice, type>)
 
-
-
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/relu_op_functor.h b/tensorflow/core/kernels/relu_op_functor.h
index 24b789c543..3bc5ba8a50 100644
--- a/tensorflow/core/kernels/relu_op_functor.h
+++ b/tensorflow/core/kernels/relu_op_functor.h
@@ -85,10 +85,9 @@ struct Relu6Grad {
     // make sure not to propagate the associated gradient
     // value. This allows "features" to be either the input or the output of
     // the relu6.
-    backprops.device(d) =
-        gradients *
-        ((features > static_cast<T>(0)) * (features < static_cast<T>(6)))
-            .template cast<T>();
+    backprops.device(d) = gradients * ((features > static_cast<T>(0)) *
+                                       (features < static_cast<T>(6)))
+                                          .template cast<T>();
   }
 };
 
@@ -161,8 +160,8 @@ struct SeluGrad {
     const auto scale = static_cast<T>(1.0507009873554804934193349852946);
     const auto scale_alpha = static_cast<T>(1.7580993408473768599402175208123);
     backprops.device(d) =
-        (activations < static_cast<T>(0)).select(
-            gradients * (activations + scale_alpha), gradients * scale);
+        (activations < static_cast<T>(0))
+            .select(gradients * (activations + scale_alpha), gradients * scale);
   }
 };
 
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
index 1a9cf4c640..86e61bbcef 100644
--- a/tensorflow/core/kernels/resize_bicubic_op.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <algorithm>
 #include <array>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/image_resizer_state.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc
index 9e10fec423..25a37d5e1a 100644
--- a/tensorflow/core/kernels/resize_bicubic_op_test.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc
@@ -286,13 +286,14 @@ BM_ResizeBicubicDev(32, 128, 3);
 BM_ResizeBicubicDev(32, 512, 3);
 BM_ResizeBicubicDev(32, 1024, 3);
 
-#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS)                          \
-  static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(int iters) { \
-    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * SIZE * SIZE *  \
-                            CHANNELS * 8 * 8);                                 \
-    test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8))         \
-        .Run(iters);                                                           \
-  }                                                                            \
+#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS)                         \
+  static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(         \
+      int iters) {                                                            \
+    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * SIZE * SIZE * \
+                            CHANNELS * 8 * 8);                                \
+    test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8))        \
+        .Run(iters);                                                          \
+  }                                                                           \
   BENCHMARK(BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS);
 
 BM_ResizeBicubicExpand(12, 48, 1);
diff --git a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
index a7da7a0777..f82c3fcd9f 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
@@ -164,11 +164,11 @@ struct ResizeBilinear<GPUDevice, T> {
     if (total_count == 0) return;
 
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
-    ResizeBilinearKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, images.data(), height_scale, width_scale,
-        batch, in_height, in_width, channels, out_height, out_width,
-        output.data());
+    ResizeBilinearKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, images.data(), height_scale,
+            width_scale, batch, in_height, in_width, channels, out_height,
+            out_width, output.data());
   }
 };
 
@@ -200,11 +200,11 @@ struct ResizeBilinearGrad<GPUDevice, T> {
     // Accumulate.
     total_count = batch * resized_height * resized_width * channels;
     config = GetCudaLaunchConfig(total_count, d);
-    ResizeBilinearGradKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input_grad.data(), height_scale,
-        width_scale, batch, original_height, original_width, channels,
-        resized_height, resized_width, output_grad.data());
+    ResizeBilinearGradKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input_grad.data(), height_scale,
+            width_scale, batch, original_height, original_width, channels,
+            resized_height, resized_width, output_grad.data());
   }
 };
 
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 8f82784d93..bb96c42f10 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -269,10 +269,10 @@ class ReverseV2Op : public OpKernel {
       OP_REQUIRES_OK(context,
                      context->allocate_output(0, input.shape(), &output));
 
-// TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse of
-// a single dimension to the dims=3 or dims=2 case, regardless of the number
-// of dimensions in the tensor. This would let some ops use faster
-// lower-dimension code (and use optimized versions).
+      // TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse
+      // of a single dimension to the dims=3 or dims=2 case, regardless of the
+      // number of dimensions in the tensor. This would let some ops use faster
+      // lower-dimension code (and use optimized versions).
 
 #define HANDLE_REVERSE(NDIMS)                                           \
   case NDIMS:                                                           \
diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
index b05a7c5550..3ee49db669 100644
--- a/tensorflow/core/kernels/reverse_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
@@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice;
 #define DEFINE_REVERSE(T, DIM) \
   template struct functor::Reverse<GPUDevice, T, DIM>;
 #define DEFINE_REVERSE_ALL_DIMS(T) \
-  DEFINE_REVERSE(T, 0) \
-  DEFINE_REVERSE(T, 1) \
-  DEFINE_REVERSE(T, 2) \
-  DEFINE_REVERSE(T, 3) \
-  DEFINE_REVERSE(T, 4) \
-  DEFINE_REVERSE(T, 5) \
-  DEFINE_REVERSE(T, 6) \
-  DEFINE_REVERSE(T, 7) \
+  DEFINE_REVERSE(T, 0)             \
+  DEFINE_REVERSE(T, 1)             \
+  DEFINE_REVERSE(T, 2)             \
+  DEFINE_REVERSE(T, 3)             \
+  DEFINE_REVERSE(T, 4)             \
+  DEFINE_REVERSE(T, 5)             \
+  DEFINE_REVERSE(T, 6)             \
+  DEFINE_REVERSE(T, 7)             \
   DEFINE_REVERSE(T, 8)
 
 TF_CALL_uint8(DEFINE_REVERSE_ALL_DIMS);
diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc
index d1980d4b65..15a707a9c6 100644
--- a/tensorflow/core/kernels/reverse_sequence_op.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op.cc
@@ -51,8 +51,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) {
 
   // Copy seq_len info down for validity checks
   context->eigen_device<Device>().memcpyDeviceToHost(
-      seq_lens_vec.data(), seq_lens_t.data(),
-      sizeof(Tlen) * seq_lens_t.size());
+      seq_lens_vec.data(), seq_lens_t.data(), sizeof(Tlen) * seq_lens_t.size());
 
   OP_REQUIRES(context, batch_dim != seq_dim,
               errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim));
@@ -76,8 +75,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) {
   }
 }
 
-void CheckErrorsGPU(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) {
   const Tensor& input = context->input(0);
   const Tensor& seq_lens = context->input(1);
 
@@ -98,13 +96,13 @@ void CheckErrorsGPU(OpKernelContext* context, int batch_dim,
 
 template <>
 void CheckErrors<GPUDevice, int32>(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+                                   int seq_dim) {
   CheckErrorsGPU(context, batch_dim, seq_dim);
 }
 
 template <>
 void CheckErrors<GPUDevice, int64>(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+                                   int seq_dim) {
   CheckErrorsGPU(context, batch_dim, seq_dim);
 }
 
@@ -164,14 +162,15 @@ class ReverseSequenceOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp);
 };
 
-#define REGISTER_REVERSE_SEQUENCE(type, len_type)                           \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint<type>("T"). \
-      TypeConstraint<len_type>("Tlen"),                                     \
-      ReverseSequenceOp<CPUDevice, type, len_type>);
+#define REGISTER_REVERSE_SEQUENCE(type, len_type)                \
+  REGISTER_KERNEL_BUILDER(Name("ReverseSequence")                \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<len_type>("Tlen"), \
+                          ReverseSequenceOp<CPUDevice, type, len_type>);
 
-#define REGISTER_REVERSE_SEQUENCE_LEN(type)    \
-  REGISTER_REVERSE_SEQUENCE(type, int32);      \
+#define REGISTER_REVERSE_SEQUENCE_LEN(type) \
+  REGISTER_REVERSE_SEQUENCE(type, int32);   \
   REGISTER_REVERSE_SEQUENCE(type, int64);
 
 TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_LEN);
@@ -181,23 +180,23 @@ TF_CALL_bool(REGISTER_REVERSE_SEQUENCE_LEN);
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T, Tlen, Dims)                                     \
-  template <>                                                               \
-  void ReverseSequence<GPUDevice, T, Tlen, Dims>::Compute(                  \
-      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,      \
-      int32 batch_dim, int32 seq_dim,                                       \
-      typename TTypes<Tlen>::ConstVec seq_lens,                             \
-      typename TTypes<T, Dims>::Tensor output);                             \
+#define DECLARE_GPU_SPEC(T, Tlen, Dims)                                \
+  template <>                                                          \
+  void ReverseSequence<GPUDevice, T, Tlen, Dims>::Compute(             \
+      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+      int32 batch_dim, int32 seq_dim,                                  \
+      typename TTypes<Tlen>::ConstVec seq_lens,                        \
+      typename TTypes<T, Dims>::Tensor output);                        \
   extern template struct ReverseSequence<GPUDevice, T, Tlen, Dims>;
 
-#define DECLARE_GPU_SPEC_LEN(T, Dims)    \
-  DECLARE_GPU_SPEC(T, int32, Dims);      \
+#define DECLARE_GPU_SPEC_LEN(T, Dims) \
+  DECLARE_GPU_SPEC(T, int32, Dims);   \
   DECLARE_GPU_SPEC(T, int64, Dims);
 
-#define DECLARE_GPU_SPECS(T)     \
-  DECLARE_GPU_SPEC_LEN(T, 2);    \
-  DECLARE_GPU_SPEC_LEN(T, 3);    \
-  DECLARE_GPU_SPEC_LEN(T, 4);    \
+#define DECLARE_GPU_SPECS(T)  \
+  DECLARE_GPU_SPEC_LEN(T, 2); \
+  DECLARE_GPU_SPEC_LEN(T, 3); \
+  DECLARE_GPU_SPEC_LEN(T, 4); \
   DECLARE_GPU_SPEC_LEN(T, 5);
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
@@ -206,14 +205,15 @@ TF_CALL_bool(DECLARE_GPU_SPECS);
 }  // namespace functor
 
 // Registration of the GPU implementations.
-#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type)                       \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint<type>("T"). \
-      TypeConstraint<len_type>("Tlen"),                                     \
-      ReverseSequenceOp<GPUDevice, type, len_type>);
-
-#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type)   \
-  REGISTER_REVERSE_SEQUENCE_GPU(type, int32);     \
+#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type)            \
+  REGISTER_KERNEL_BUILDER(Name("ReverseSequence")                \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<len_type>("Tlen"), \
+                          ReverseSequenceOp<GPUDevice, type, len_type>);
+
+#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type) \
+  REGISTER_REVERSE_SEQUENCE_GPU(type, int32);   \
   REGISTER_REVERSE_SEQUENCE_GPU(type, int64);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU_LEN);
diff --git a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
index cb49f14525..4a2136a2cd 100644
--- a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
@@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice;
   template class generator::ReverseGenerator<T, Tlen, dims>; \
   template struct functor::ReverseSequence<GPUDevice, T, Tlen, dims>;
 
-#define DEFINE_GPU_SPEC_LEN(T, dims)  \
-  DEFINE_GPU_SPEC(T, int32, dims);    \
+#define DEFINE_GPU_SPEC_LEN(T, dims) \
+  DEFINE_GPU_SPEC(T, int32, dims);   \
   DEFINE_GPU_SPEC(T, int64, dims);
 
-#define DEFINE_GPU_SPECS(T) \
-  DEFINE_GPU_SPEC_LEN(T, 2);    \
-  DEFINE_GPU_SPEC_LEN(T, 3);    \
-  DEFINE_GPU_SPEC_LEN(T, 4);    \
+#define DEFINE_GPU_SPECS(T)  \
+  DEFINE_GPU_SPEC_LEN(T, 2); \
+  DEFINE_GPU_SPEC_LEN(T, 3); \
+  DEFINE_GPU_SPEC_LEN(T, 4); \
   DEFINE_GPU_SPEC_LEN(T, 5);
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index df60eda759..990bd2bff9 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -106,11 +106,11 @@ void SaveTensors(
       OP_REQUIRES_OK(context, checkpoint::ParseShapeAndSlice(
                                   shape_spec, &shape, &slice, &slice_shape));
       OP_REQUIRES(context, slice_shape.IsSameSize(input.shape()),
-                  errors::InvalidArgument("Slice in shape_and_slice "
-                                          "specification does not match the "
-                                          "shape of the tensor to  save: ",
-                                          shape_spec, ", tensor: ",
-                                          input.shape().DebugString()));
+                  errors::InvalidArgument(
+                      "Slice in shape_and_slice "
+                      "specification does not match the "
+                      "shape of the tensor to  save: ",
+                      shape_spec, ", tensor: ", input.shape().DebugString()));
     }
 
 #define WRITER_ADD(T)                                           \
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index c6e35fe329..079f15e101 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -29,7 +29,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace scatter_op {
 
@@ -117,7 +117,7 @@ struct AssignSYCL<scatter_op::UpdateOp::DIV> {
     p.device(d) = p / u;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace internal
 }  // namespace scatter_op
@@ -156,7 +156,7 @@ struct ScatterFunctorBase {
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
-struct ScatterFunctorBase <SYCLDevice, T, Index, op> {
+struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
   Index operator()(OpKernelContext* c, const SYCLDevice& d,
                    typename TTypes<T>::Matrix params,
                    typename TTypes<T>::ConstMatrix updates,
@@ -171,13 +171,13 @@ struct ScatterFunctorBase <SYCLDevice, T, Index, op> {
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
-      scatter_op::internal::AssignSYCL<op>::Run(d, params.template chip<0>(index),
-                                            updates.template chip<0>(i));
+      scatter_op::internal::AssignSYCL<op>::Run(
+          d, params.template chip<0>(index), updates.template chip<0>(i));
     }
     return -1;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename T, typename Index>
 struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
@@ -217,7 +217,7 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
 
 template <typename T, typename Index, scatter_op::UpdateOp op>
 struct ScatterFunctor<CPUDevice, T, Index, op>
-        : ScatterFunctorBase<CPUDevice, T, Index, op>{};
+    : ScatterFunctorBase<CPUDevice, T, Index, op> {};
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
@@ -239,7 +239,7 @@ struct ScatterFunctorSYCL {
     return -1;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
index e116077d3c..be18658543 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
@@ -30,9 +30,10 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T, typename Index, scatter_op::UpdateOp op>
-__global__ void ScatterOpCustomKernel(
-    T* params, const T* updates, const Index* indices,
-    Index first_dim_size, Index updates_size, Index indices_size) {
+__global__ void ScatterOpCustomKernel(T* params, const T* updates,
+                                      const Index* indices,
+                                      Index first_dim_size, Index updates_size,
+                                      Index indices_size) {
   Index update_block = updates_size / indices_size;
   CUDA_1D_KERNEL_LOOP(i, updates_size) {
     int indices_i = i / update_block;
@@ -85,8 +86,8 @@ struct ScatterFunctor<GPUDevice, T, Index, op> {
     CudaLaunchConfig config = GetCudaLaunchConfig(updates_size, d);
     ScatterOpCustomKernel<T, Index, op>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            params.data(), updates.data(), indices.data(),
-            first_dim_size, updates_size, indices_size);
+            params.data(), updates.data(), indices.data(), first_dim_size,
+            updates_size, indices_size);
     return -1;
   }
 };
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index c6c9d4e658..e82660dcc1 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -40,7 +40,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class OpKernelContext;
 
@@ -251,7 +251,7 @@ REGISTER_SCATTER_ND_MATH_SYCL(int32);
 #undef REGISTER_SCATTER_ND_INDEX_SYCL
 #undef REGISTER_SCATTER_ND_FULL_SYCL
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
index 8607c7f95a..282165349f 100644
--- a/tensorflow/core/kernels/scatter_op.cc
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -25,7 +25,7 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_SYCL
 #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace tensorflow {
 
@@ -33,7 +33,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Check whether updates.shape = indices.shape + params.shape[1:]
 static bool ValidShapes(const Tensor& params, const Tensor& updates,
@@ -102,11 +102,12 @@ class ScatterUpdateOp : public OpKernel {
 
     // Check that we have enough index space
     const int64 N_big = indices.NumElements();
-    OP_REQUIRES(c, N_big <= std::numeric_limits<Index>::max(),
-                errors::InvalidArgument(
-                    "indices has too many elements for ",
-                    DataTypeString(DataTypeToEnum<Index>::v()), " indexing: ",
-                    N_big, " > ", std::numeric_limits<Index>::max()));
+    OP_REQUIRES(
+        c, N_big <= std::numeric_limits<Index>::max(),
+        errors::InvalidArgument("indices has too many elements for ",
+                                DataTypeString(DataTypeToEnum<Index>::v()),
+                                " indexing: ", N_big, " > ",
+                                std::numeric_limits<Index>::max()));
     const Index N = static_cast<Index>(indices.NumElements());
     OP_REQUIRES(
         c, params.dim_size(0) <= std::numeric_limits<Index>::max(),
@@ -137,7 +138,7 @@ class ScatterUpdateOp : public OpKernel {
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
-class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
+class ScatterUpdateOp<SYCLDevice, T, Index, op> : public OpKernel {
  public:
   explicit ScatterUpdateOp(OpKernelConstruction* c) : OpKernel(c) {
     OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_));
@@ -165,11 +166,12 @@ class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
 
     // Check that we have enough index space
     const int64 N_big = indices.NumElements();
-    OP_REQUIRES(c, N_big <= std::numeric_limits<Index>::max(),
-                errors::InvalidArgument(
-                    "indices has too many elements for ",
-                    DataTypeString(DataTypeToEnum<Index>::v()), " indexing: ",
-                    N_big, " > ", std::numeric_limits<Index>::max()));
+    OP_REQUIRES(
+        c, N_big <= std::numeric_limits<Index>::max(),
+        errors::InvalidArgument("indices has too many elements for ",
+                                DataTypeString(DataTypeToEnum<Index>::v()),
+                                " indexing: ", N_big, " > ",
+                                std::numeric_limits<Index>::max()));
     const Index N = static_cast<Index>(indices.NumElements());
     OP_REQUIRES(
         c, params.dim_size(0) <= std::numeric_limits<Index>::max(),
@@ -206,7 +208,7 @@ class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
     }
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_SCATTER_KERNEL_INDEX(type, index_type, dev, name, op) \
   REGISTER_KERNEL_BUILDER(Name(name)                                   \
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index 863c123b43..066a4b80a2 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -37,9 +37,8 @@ void FeatureWeightsDenseStorage::UpdateDenseDeltaWeights(
   const size_t num_weight_vectors = normalized_bounded_dual_delta.size();
   if (num_weight_vectors == 1) {
     deltas_.device(device) =
-        deltas_ +
-        dense_vector.RowAsMatrix() *
-            deltas_.constant(normalized_bounded_dual_delta[0]);
+        deltas_ + dense_vector.RowAsMatrix() *
+                      deltas_.constant(normalized_bounded_dual_delta[0]);
   } else {
     // Transform the dual vector into a column matrix.
     const Eigen::TensorMap<Eigen::Tensor<const double, 2, Eigen::RowMajor>>
@@ -61,9 +60,8 @@ void FeatureWeightsSparseStorage::UpdateSparseDeltaWeights(
     const Example::SparseFeatures& sparse_features,
     const std::vector<double>& normalized_bounded_dual_delta) {
   for (int64 k = 0; k < sparse_features.indices->size(); ++k) {
-    const double feature_value = sparse_features.values == nullptr
-                                     ? 1.0
-                                     : (*sparse_features.values)(k);
+    const double feature_value =
+        sparse_features.values == nullptr ? 1.0 : (*sparse_features.values)(k);
     auto it = indices_to_id_.find((*sparse_features.indices)(k));
     for (size_t l = 0; l < normalized_bounded_dual_delta.size(); ++l) {
       deltas_(l, it->second) +=
@@ -122,23 +120,24 @@ Status ModelWeights::Initialize(OpKernelContext* const context) {
   }
 
   // Reads in the weights, and allocates and initializes the delta weights.
-  const auto initialize_weights = [&](
-      const OpInputList& weight_inputs, OpOutputList* const weight_outputs,
-      std::vector<FeatureWeightsDenseStorage>* const feature_weights) {
-    for (int i = 0; i < weight_inputs.size(); ++i) {
-      Tensor* delta_t;
-      TF_RETURN_IF_ERROR(
-          weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t));
-      // Convert the input vector to a row matrix in internal representation.
-      auto deltas = delta_t->shaped<float, 2>({1, delta_t->NumElements()});
-      deltas.setZero();
-      feature_weights->emplace_back(
-          FeatureWeightsDenseStorage{weight_inputs[i].shaped<float, 2>(
-                                         {1, weight_inputs[i].NumElements()}),
-                                     deltas});
-    }
-    return Status::OK();
-  };
+  const auto initialize_weights =
+      [&](const OpInputList& weight_inputs, OpOutputList* const weight_outputs,
+          std::vector<FeatureWeightsDenseStorage>* const feature_weights) {
+        for (int i = 0; i < weight_inputs.size(); ++i) {
+          Tensor* delta_t;
+          TF_RETURN_IF_ERROR(
+              weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t));
+          // Convert the input vector to a row matrix in internal
+          // representation.
+          auto deltas = delta_t->shaped<float, 2>({1, delta_t->NumElements()});
+          deltas.setZero();
+          feature_weights->emplace_back(FeatureWeightsDenseStorage{
+              weight_inputs[i].shaped<float, 2>(
+                  {1, weight_inputs[i].NumElements()}),
+              deltas});
+        }
+        return Status::OK();
+      };
 
   return initialize_weights(dense_weights_inputs, &dense_weights_outputs,
                             &dense_weights_);
diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h
index 9f07270075..45915693ac 100644
--- a/tensorflow/core/kernels/sdca_internal.h
+++ b/tensorflow/core/kernels/sdca_internal.h
@@ -149,7 +149,8 @@ class Example {
   // 1.0f.
   struct SparseFeatures {
     std::unique_ptr<TTypes<const int64>::UnalignedConstVec> indices;
-    std::unique_ptr<TTypes<const float>::UnalignedConstVec> values;  // nullptr encodes optional.
+    std::unique_ptr<TTypes<const float>::UnalignedConstVec>
+        values;  // nullptr encodes optional.
   };
 
   // A dense vector which is a row-slice of the underlying matrix.
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc
index 0f5c2424b3..dbe0177dda 100644
--- a/tensorflow/core/kernels/sdca_ops.cc
+++ b/tensorflow/core/kernels/sdca_ops.cc
@@ -57,11 +57,11 @@ namespace tensorflow {
 
 namespace {
 
-using sdca::Regularizations;
 using sdca::Example;
 using sdca::Examples;
 using sdca::ExampleStatistics;
 using sdca::ModelWeights;
+using sdca::Regularizations;
 
 struct ComputeOptions {
   explicit ComputeOptions(OpKernelConstruction* const context) {
@@ -76,8 +76,9 @@ struct ComputeOptions {
     } else if (loss_type == "smooth_hinge_loss") {
       loss_updater.reset(new SmoothHingeLossUpdater);
     } else {
-      OP_REQUIRES(context, false, errors::InvalidArgument(
-                                      "Unsupported loss type: ", loss_type));
+      OP_REQUIRES(
+          context, false,
+          errors::InvalidArgument("Unsupported loss type: ", loss_type));
     }
     OP_REQUIRES_OK(context, context->GetAttr("adaptative", &adaptative));
     OP_REQUIRES_OK(
@@ -90,9 +91,10 @@ struct ComputeOptions {
         context, num_sparse_features + num_dense_features > 0,
         errors::InvalidArgument("Requires at least one feature to train."));
 
-    OP_REQUIRES(context, static_cast<int64>(num_sparse_features) +
-                                 static_cast<int64>(num_dense_features) <=
-                             std::numeric_limits<int>::max(),
+    OP_REQUIRES(context,
+                static_cast<int64>(num_sparse_features) +
+                        static_cast<int64>(num_dense_features) <=
+                    std::numeric_limits<int>::max(),
                 errors::InvalidArgument(
                     strings::Printf("Too many feature groups: %lld > %d",
                                     static_cast<int64>(num_sparse_features) +
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 3ef1cd1e06..27b8081eb8 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -115,7 +115,7 @@ class SegmentReductionOp : public OpKernel {
     Eigen::DSizes<Eigen::DenseIndex, 1> dims_to_reduce;
     dims_to_reduce[0] = 0;
 #else
-    Eigen::IndexList<Eigen::type2index<0>> dims_to_reduce;
+    Eigen::IndexList<Eigen::type2index<0> > dims_to_reduce;
 #endif
     Index start = 0, end = 1;
 
@@ -359,7 +359,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL);
 namespace functor {
 
 // UnsortedSegmentSumFunctor implementation for CPUDevice.
-// todo: Remove duplicate code in UnsortedSegmentSumFunctor and UnsortedSegmentMaxFunctor.
+// todo: Remove duplicate code in UnsortedSegmentSumFunctor and
+// UnsortedSegmentMaxFunctor.
 template <typename T, typename Index>
 struct UnsortedSegmentSumFunctor<CPUDevice, T, Index>
     : UnsortedSegmentBaseFunctor<CPUDevice, T, Index> {
@@ -461,9 +462,10 @@ class UnsortedSegmentBaseOp : public OpKernel {
 
     auto data_ptr = data.template flat<T>().data();
     reduction_functor_(context, context->template eigen_device<Device>(),
-                     output_rows, segment_ids.shape(), segment_flat,
-                     data.NumElements(), data_ptr, output_flat);
+                       output_rows, segment_ids.shape(), segment_flat,
+                       data.NumElements(), data_ptr, output_flat);
   }
+
  private:
   functor::UnsortedSegmentBaseFunctor<Device, T, Index>& reduction_functor_;
 };
@@ -472,22 +474,20 @@ template <typename Device, class T, class Index>
 class UnsortedSegmentSumOp : public UnsortedSegmentBaseOp<Device, T, Index> {
  public:
   explicit UnsortedSegmentSumOp(OpKernelConstruction* context)
-      : UnsortedSegmentBaseOp<Device, T, Index>(
-            context,
-            sum_functor_) {}
+      : UnsortedSegmentBaseOp<Device, T, Index>(context, sum_functor_) {}
+
  private:
-    functor::UnsortedSegmentSumFunctor<Device, T, Index> sum_functor_;
+  functor::UnsortedSegmentSumFunctor<Device, T, Index> sum_functor_;
 };
 
 template <typename Device, class T, class Index>
 class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp<Device, T, Index> {
  public:
   explicit UnsortedSegmentMaxOp(OpKernelConstruction* context)
-      : UnsortedSegmentBaseOp<Device, T, Index>(
-            context,
-            max_functor_) {}
+      : UnsortedSegmentBaseOp<Device, T, Index>(context, max_functor_) {}
+
  private:
-    functor::UnsortedSegmentMaxFunctor<Device, T, Index> max_functor_;
+  functor::UnsortedSegmentMaxFunctor<Device, T, Index> max_functor_;
 };
 
 #define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type)                  \
@@ -663,9 +663,9 @@ class SparseSegmentReductionOpBase : public OpKernel {
           Reduce(input_flat, indices_vec, start, end - start, out);
       OP_REQUIRES(context, bad_offset < 0,
                   errors::InvalidArgument(
-                      "Bad: indices[", start + bad_offset, "] == ",
-                      indices_vec(start + bad_offset), " out of range [0, ",
-                      input_flat.dimension(0), ")"));
+                      "Bad: indices[", start + bad_offset,
+                      "] == ", indices_vec(start + bad_offset),
+                      " out of range [0, ", input_flat.dimension(0), ")"));
 
       start = end;
       ++end;
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index bcdd42c80c..5c9cfe0906 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -51,13 +51,14 @@ struct SegmentSumFunctor {
 // BaseFunctor for definition of UnsorteSegmentReductionOp
 // for usage without templates.
 template <typename Device, typename T, typename Index>
-struct UnsortedSegmentBaseFunctor{
-  virtual ~UnsortedSegmentBaseFunctor(){}
+struct UnsortedSegmentBaseFunctor {
+  virtual ~UnsortedSegmentBaseFunctor() {}
   virtual void operator()(OpKernelContext* ctx, const Device& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
-                  typename TTypes<Index>::ConstFlat segment_ids,
-                  const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output){};
+                          const Index output_rows,
+                          const TensorShape& segment_ids_shape,
+                          typename TTypes<Index>::ConstFlat segment_ids,
+                          const Index data_size, const T* data,
+                          typename TTypes<T, 2>::Tensor output){};
 };
 
 // Functor for UnsortedSegmentSumOp.
@@ -70,7 +71,8 @@ struct UnsortedSegmentBaseFunctor{
 // data: input data tensor.
 // output: output reshaped to {output_rows, output.size/output_rows}
 template <typename Device, typename T, typename Index>
-struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
+struct UnsortedSegmentSumFunctor
+    : public UnsortedSegmentBaseFunctor<Device, T, Index> {
   void operator()(OpKernelContext* ctx, const Device& d,
                   const Index output_rows, const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
@@ -88,7 +90,8 @@ struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor<Device, T, I
 // data: input data tensor.
 // output: output reshaped to {output_rows, output.size/output_rows}
 template <typename Device, typename T, typename Index>
-struct UnsortedSegmentMaxFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
+struct UnsortedSegmentMaxFunctor
+    : public UnsortedSegmentBaseFunctor<Device, T, Index> {
   void operator()(OpKernelContext* ctx, const Device& d,
                   const Index output_rows, const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
index 159fada621..39d520698e 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
@@ -194,7 +194,8 @@ void SegmentSumFunctor<T, Index>::operator()(
 
 // UnsortedSegmentSumFunctor implementation for GPUDevice.
 template <typename T, typename Index>
-struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>: UnsortedSegmentBaseFunctor<GPUDevice, T, Index> {
+struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
+    : UnsortedSegmentBaseFunctor<GPUDevice, T, Index> {
   void operator()(OpKernelContext* ctx, const GPUDevice& d,
                   const Index output_rows, const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
@@ -221,11 +222,10 @@ struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>: UnsortedSegmentBaseFuncto
     const Index input_inner_dim_size = input_total_size / input_outer_dim_size;
 
     config = GetCudaLaunchConfig(input_total_size, d);
-    UnsortedSegmentSumCustomKernel<
-        T,
-        Index><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        input_outer_dim_size, input_inner_dim_size, output_rows,
-        segment_ids.data(), data, output.data());
+    UnsortedSegmentSumCustomKernel<T, Index>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            input_outer_dim_size, input_inner_dim_size, output_rows,
+            segment_ids.data(), data, output.data());
   }
 };
 
diff --git a/tensorflow/core/kernels/self_adjoint_eig_op.cc b/tensorflow/core/kernels/self_adjoint_eig_op.cc
index 9765780726..bcd8877390 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_op.cc
+++ b/tensorflow/core/kernels/self_adjoint_eig_op.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-
 namespace tensorflow {
 
 template <class Scalar>
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 206fd40fa6..688e61fcad 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -114,7 +114,7 @@ REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp);
 REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp);
 REGISTER_KERNEL_BUILDER(
     Name("_HostSend").Device(DEVICE_SYCL).HostMemory("tensor"), SendOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp);
 REGISTER_KERNEL_BUILDER(
@@ -198,7 +198,7 @@ REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp);
 REGISTER_KERNEL_BUILDER(
@@ -207,6 +207,6 @@ REGISTER_KERNEL_BUILDER(
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
     Name("_HostRecv").Device(DEVICE_SYCL).HostMemory("tensor"), RecvOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index e2e3758d87..9db0bd4d98 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -53,13 +53,13 @@ class RangeOp : public OpKernel {
     if (delta > 0) {
       OP_REQUIRES(
           context, start <= limit,
-          errors::InvalidArgument("Requires start <= limit when delta > 0: ",
-                                  start, "/", limit));
+          errors::InvalidArgument(
+              "Requires start <= limit when delta > 0: ", start, "/", limit));
     } else {
       OP_REQUIRES(
           context, start >= limit,
-          errors::InvalidArgument("Requires start >= limit when delta < 0: ",
-                                  start, "/", limit));
+          errors::InvalidArgument(
+              "Requires start >= limit when delta < 0: ", start, "/", limit));
     }
     int64 size = (std::is_integral<T>::value
                       ? ((std::abs(limit - start) + std::abs(delta) - 1) /
diff --git a/tensorflow/core/kernels/session_ops.cc b/tensorflow/core/kernels/session_ops.cc
index 185c5b248f..f2dd2812b5 100644
--- a/tensorflow/core/kernels/session_ops.cc
+++ b/tensorflow/core/kernels/session_ops.cc
@@ -144,7 +144,7 @@ REGISTER_GPU_KERNEL(bool);
 TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_KERNEL(bool);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class DeleteSessionTensorOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h
index 8d9d0ea846..55be308901 100644
--- a/tensorflow/core/kernels/shape_ops.h
+++ b/tensorflow/core/kernels/shape_ops.h
@@ -235,10 +235,10 @@ class SqueezeOp : public OpKernel {
       if (!wrapped_squeeze_dims.empty()) {
         if (wrapped_squeeze_dims.count(i) > 0) {
           OP_REQUIRES(ctx, existing_dim == 1,
-                      errors::InvalidArgument("Tried to explicitly squeeze "
-                                              "dimension ",
-                                              i, " but dimension was not 1: ",
-                                              existing_dim));
+                      errors::InvalidArgument(
+                          "Tried to explicitly squeeze "
+                          "dimension ",
+                          i, " but dimension was not 1: ", existing_dim));
         } else {
           // This dimension is not being squeezed.
           new_shape.push_back(existing_dim);
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index 82595de779..79369fd4a9 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -58,7 +58,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Shared code that is not dependent on the type of T.  We do this to reduce
 // code size by not duplicating all this for all T (float, double, int32, etc.)
@@ -72,10 +72,11 @@ static void SharedValidation(OpKernelContext* context,
   const Tensor& size_tensor = context->input(2);
 
   OP_REQUIRES(
-      context, context->op_kernel().IsLegacyVector(begin_tensor.shape()) &&
-                   context->op_kernel().IsLegacyVector(size_tensor.shape()) &&
-                   begin_tensor.NumElements() == input.dims() &&
-                   size_tensor.NumElements() == input.dims(),
+      context,
+      context->op_kernel().IsLegacyVector(begin_tensor.shape()) &&
+          context->op_kernel().IsLegacyVector(size_tensor.shape()) &&
+          begin_tensor.NumElements() == input.dims() &&
+          size_tensor.NumElements() == input.dims(),
       errors::InvalidArgument(
           "Expected begin and size arguments to be 1-D tensors of size ",
           input.dims(), ", but got shapes ", begin_tensor.shape().DebugString(),
@@ -125,8 +126,7 @@ static void SharedSliceCommonCases(OpKernelContext* context,
                                    TensorShape* output_shape,
                                    gtl::InlinedVector<int64, 4>* begin,
                                    gtl::InlinedVector<int64, 4>* size,
-                                   Tensor** result,
-                                   bool* done) {
+                                   Tensor** result, bool* done) {
   bool is_identity = true;
   bool slice_dim0 = true;
   *done = false;
@@ -142,8 +142,8 @@ static void SharedSliceCommonCases(OpKernelContext* context,
     return;
   }
 
-  if (slice_dim0 && IsDim0SliceAligned<T>(input.shape(), (*begin)[0],
-                                          (*size)[0])) {
+  if (slice_dim0 &&
+      IsDim0SliceAligned<T>(input.shape(), (*begin)[0], (*size)[0])) {
     VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
     CHECK_GE(input.dims(), 1);  // Otherwise, is_identity should be true.
     context->set_output(0, input.Slice((*begin)[0], (*begin)[0] + (*size)[0]));
@@ -154,7 +154,6 @@ static void SharedSliceCommonCases(OpKernelContext* context,
   OP_REQUIRES_OK(context, context->allocate_output(0, *output_shape, result));
 }
 
-
 template <typename Device, typename T>
 class SliceOp : public OpKernel {
  public:
@@ -206,8 +205,9 @@ class SliceOp : public OpKernel {
 
 #undef HANDLE_DIM
 
-      OP_REQUIRES(context, false, errors::Unimplemented(
-                                      "SliceOp : Unhandled input dimensions"));
+      OP_REQUIRES(
+          context, false,
+          errors::Unimplemented("SliceOp : Unhandled input dimensions"));
     }
   }
 
@@ -280,8 +280,9 @@ class MklSliceOp : public OpKernel {
 
 #undef HANDLE_DIM
 
-      OP_REQUIRES(context, false, errors::Unimplemented(
-                                      "SliceOp : Unhandled input dimensions"));
+      OP_REQUIRES(
+          context, false,
+          errors::Unimplemented("SliceOp : Unhandled input dimensions"));
     }
   }
 
@@ -292,9 +293,9 @@ class MklSliceOp : public OpKernel {
   // as the sizes of all the dimensions of the input except slice_dim, then
   // returns True. Otherwise, returns False.
   bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape,
-                          const gtl::ArraySlice<int64>& begin,
-                          const gtl::ArraySlice<int64>& size,
-                          int slice_dim) {
+                                          const gtl::ArraySlice<int64>& begin,
+                                          const gtl::ArraySlice<int64>& size,
+                                          int slice_dim) {
     for (int dim = 0; dim < 4; dim++) {
       if (dim != slice_dim &&
           (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) {
@@ -316,9 +317,9 @@ class MklSliceOp : public OpKernel {
   // Returns True if Slicing over a single dimension, and sets slice_dim
   // to the number of the dimension that satisfies criteria.
   bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape,
-                          const gtl::ArraySlice<int64>& begin,
-                          const gtl::ArraySlice<int64>& size,
-                          int* slice_dim) {
+                                    const gtl::ArraySlice<int64>& begin,
+                                    const gtl::ArraySlice<int64>& size,
+                                    int* slice_dim) {
     for (int dim = 0; dim < 4; dim++) {
       if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) {
         *slice_dim = dim;
@@ -329,8 +330,7 @@ class MklSliceOp : public OpKernel {
   }
 
   template <int NDIM>
-  void HandleCase(OpKernelContext* context,
-                  const gtl::ArraySlice<int64>& begin,
+  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
                   const gtl::ArraySlice<int64>& size, Tensor* result) {
     int slice_dim = -1;
     TensorShape in_shape = context->input(0).shape();
@@ -340,67 +340,63 @@ class MklSliceOp : public OpKernel {
     // format over channel dimension.
     if (NDIM == 4 &&
         DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
-        size_t in_strides[4] = { (size_t) in_shape.dim_size(1) *
-                                          in_shape.dim_size(2) *
-                                          in_shape.dim_size(3),
-                                 (size_t) in_shape.dim_size(2) *
-                                          in_shape.dim_size(3),
-                                 (size_t) in_shape.dim_size(3),
-                                 (size_t) 1
-                               };
-
-        size_t out_strides[4] = { (size_t) size[1] * size[2] * size[3],
-                                  (size_t) size[2] * size[3],
-                                  (size_t) size[3],
-                                  (size_t) 1 };
-
-        T *in_buf = const_cast<T*>(const_cast<const T*>(
-                    context->input(0).flat<T>().data()));
-        T *op_buf = result->flat<T>().data();
-
-        if (slice_dim == 1) {
-          /* data format = NCHW */
-
-          #pragma omp parallel for
-          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-              T *ip  = in_buf + (d0 * in_strides[0]);
-              T *op  = op_buf + ((d0 - begin[0]) * out_strides[0]);
-            #pragma omp parallel for
-            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-              T *ip1 = ip + (d1 * in_strides[1]);
-              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
-              // For NCHW, H and W will be contiguous. So we can copy
-              // both with one memcpy.
-              memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
-                     sizeof(T) * in_strides[1]);
-            }
+      size_t in_strides[4] = {
+          (size_t)in_shape.dim_size(1) * in_shape.dim_size(2) *
+              in_shape.dim_size(3),
+          (size_t)in_shape.dim_size(2) * in_shape.dim_size(3),
+          (size_t)in_shape.dim_size(3), (size_t)1};
+
+      size_t out_strides[4] = {(size_t)size[1] * size[2] * size[3],
+                               (size_t)size[2] * size[3], (size_t)size[3],
+                               (size_t)1};
+
+      T* in_buf = const_cast<T*>(
+          const_cast<const T*>(context->input(0).flat<T>().data()));
+      T* op_buf = result->flat<T>().data();
+
+      if (slice_dim == 1) {
+        /* data format = NCHW */
+
+#pragma omp parallel for
+        for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+          T* ip = in_buf + (d0 * in_strides[0]);
+          T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
+#pragma omp parallel for
+          for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+            T* ip1 = ip + (d1 * in_strides[1]);
+            T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
+            // For NCHW, H and W will be contiguous. So we can copy
+            // both with one memcpy.
+            memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
+                   sizeof(T) * in_strides[1]);
           }
-          return;
-        } else if (slice_dim == 3) {
-          /* data_format = NHWC */
-
-          #pragma omp parallel for
-          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-              T *ip = in_buf + (d0 * in_strides[0]);
-              T *op = op_buf + ((d0 - begin[0]) * out_strides[0]);
-            #pragma omp parallel for
-            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-              T *ip1 = ip + (d1 * in_strides[1]);
-              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
-              #pragma omp parallel for
-              for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
-                T *ip2 = ip1 + (d2 * in_strides[2]);
-                T *ip3 = ip2 + begin[3];
-                T *op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
-                T *op3 = op2;
-                memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
-                       sizeof(T) * size[3]);
-              }
+        }
+        return;
+      } else if (slice_dim == 3) {
+        /* data_format = NHWC */
+
+#pragma omp parallel for
+        for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+          T* ip = in_buf + (d0 * in_strides[0]);
+          T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
+#pragma omp parallel for
+          for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+            T* ip1 = ip + (d1 * in_strides[1]);
+            T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
+#pragma omp parallel for
+            for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
+              T* ip2 = ip1 + (d2 * in_strides[2]);
+              T* ip3 = ip2 + begin[3];
+              T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
+              T* op3 = op2;
+              memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
+                     sizeof(T) * size[3]);
             }
           }
-          return;
         }
-        // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
+        return;
+      }
+      // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
     }
 
     Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
@@ -535,13 +531,13 @@ REGISTER_KERNEL_BUILDER(Name("Slice")
 #ifdef TENSORFLOW_USE_SYCL
 // Forward declarations of the functor specializations for SYCL.
 namespace functor {
-#define DECLARE_SYCL_SPEC(T, NDIM)                                 \
-  template <>                                                      \
-  void Slice<SYCLDevice, T, NDIM>::operator()(                     \
-      const SYCLDevice& d, typename TTypes<T, NDIM>::Tensor output,\
-      typename TTypes<T, NDIM>::ConstTensor input,                 \
-      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,       \
-      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);        \
+#define DECLARE_SYCL_SPEC(T, NDIM)                                  \
+  template <>                                                       \
+  void Slice<SYCLDevice, T, NDIM>::operator()(                      \
+      const SYCLDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+      typename TTypes<T, NDIM>::ConstTensor input,                  \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,        \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);         \
   extern template struct Slice<SYCLDevice, T, NDIM>;
 
 #define DECLARE_FOR_N(T)   \
diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h
index 0362a02133..db7eded745 100644
--- a/tensorflow/core/kernels/slice_op.h
+++ b/tensorflow/core/kernels/slice_op.h
@@ -24,7 +24,6 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
-
 template <typename Device, typename T, int NDIMS>
 struct Slice {
   void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor output,
diff --git a/tensorflow/core/kernels/slice_op_cpu_impl.h b/tensorflow/core/kernels/slice_op_cpu_impl.h
index 47f1d5342a..64b6948190 100644
--- a/tensorflow/core/kernels/slice_op_cpu_impl.h
+++ b/tensorflow/core/kernels/slice_op_cpu_impl.h
@@ -43,7 +43,7 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_SYCL_KERNELS);
 DEFINE_SYCL_KERNELS(int32);
 
 #undef DEFINE_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
index 590f01c469..e1712ac239 100644
--- a/tensorflow/core/kernels/softmax_op.cc
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Partial specialization for a CPUDevice, that uses the Eigen implementation
 // from SoftmaxEigenImpl.
@@ -48,7 +48,7 @@ struct SoftmaxFunctor<CPUDevice, T> : SoftmaxFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SoftmaxFunctor<SYCLDevice, T> : SoftmaxFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace functor
 
 template <typename Device, typename T>
@@ -100,5 +100,5 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("Softmax").Device(DEVICE_SYCL).TypeConstraint<double>("T"),
     SoftmaxOp<SYCLDevice, double>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
index c25ce2d8bb..92ddf8edbf 100644
--- a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
+++ b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
@@ -70,7 +70,7 @@ static Graph* ConstructSpaceToBatchGraph(
   }                                                                                                     \
   BENCHMARK(                                                                                            \
       BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11);
-#define BM_SpaceToBatch(OP, ...)                      \
+#define BM_SpaceToBatch(OP, ...)                                 \
   BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \
   BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \
   BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_HALF, __VA_ARGS__));  \
diff --git a/tensorflow/core/kernels/spacetobatch_functor.cc b/tensorflow/core/kernels/spacetobatch_functor.cc
index 23d8a5f9ed..4c374b8d99 100644
--- a/tensorflow/core/kernels/spacetobatch_functor.cc
+++ b/tensorflow/core/kernels/spacetobatch_functor.cc
@@ -154,7 +154,7 @@ struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, B2S> {
 #define INSTANTIATE(NUM_BLOCK_DIMS, T)                                      \
   template struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, false>; \
   template struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, true>;  \
-/**/
+  /**/
 
 #define INSTANTIATE_FOR_T(T) \
   TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T)
diff --git a/tensorflow/core/kernels/spacetobatch_functor.h b/tensorflow/core/kernels/spacetobatch_functor.h
index 06813650c0..f46a84da1e 100644
--- a/tensorflow/core/kernels/spacetobatch_functor.h
+++ b/tensorflow/core/kernels/spacetobatch_functor.h
@@ -44,7 +44,7 @@ constexpr int kMaxSpaceToBatchBlockDims = 4;
   MACRO(2 /**/, ##__VA_ARGS__)                              \
   MACRO(3 /**/, ##__VA_ARGS__)                              \
   MACRO(4 /**/, ##__VA_ARGS__)                              \
-/**/
+  /**/
 
 namespace internal {
 namespace spacetobatch {
diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
index db8d419c38..5687141c9e 100644
--- a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
@@ -141,10 +141,10 @@ struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
     }
     CudaLaunchConfig config =
         GetCudaLaunchConfig(static_cast<int32>(total_count), d);
-    S2B<T, NUM_BLOCK_DIMS,
-        B2S><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, const_cast<T*>(space_tensor.data()), args,
-        const_cast<T*>(batch_tensor.data()));
+    S2B<T, NUM_BLOCK_DIMS, B2S>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, const_cast<T*>(space_tensor.data()),
+            args, const_cast<T*>(batch_tensor.data()));
     return Status::OK();
   }
 };
@@ -153,7 +153,7 @@ struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
 #define INSTANTIATE(NUM_BLOCK_DIMS, T)                                      \
   template struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, false>; \
   template struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, true>;  \
-/**/
+  /**/
 
 #define INSTANTIATE_FOR_T(T) \
   TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T)
diff --git a/tensorflow/core/kernels/spacetobatch_op.cc b/tensorflow/core/kernels/spacetobatch_op.cc
index 95c1f5e7e8..fdc08ec8e3 100644
--- a/tensorflow/core/kernels/spacetobatch_op.cc
+++ b/tensorflow/core/kernels/spacetobatch_op.cc
@@ -58,9 +58,10 @@ void SpaceToBatchOpCompute(OpKernelContext* context,
       errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
                               " instead of ", orig_input_tensor.dims()));
 
-  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_paddings.shape()) &&
-                           block_dims == orig_paddings.dim_size(0) &&
-                           2 == orig_paddings.dim_size(1),
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsMatrix(orig_paddings.shape()) &&
+                  block_dims == orig_paddings.dim_size(0) &&
+                  2 == orig_paddings.dim_size(1),
               errors::InvalidArgument("paddings should have shape [",
                                       block_dims, ", 2] instead of ",
                                       orig_paddings.shape().DebugString()));
diff --git a/tensorflow/core/kernels/sparse_add_grad_op.cc b/tensorflow/core/kernels/sparse_add_grad_op.cc
index d8ed0c6f0c..8597f3a8f7 100644
--- a/tensorflow/core/kernels/sparse_add_grad_op.cc
+++ b/tensorflow/core/kernels/sparse_add_grad_op.cc
@@ -35,9 +35,10 @@ class SparseAddGradOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices));
     OP_REQUIRES_OK(ctx, ctx->input("sum_indices", &sum_indices));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(b_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(sum_indices->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsMatrix(a_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(b_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(sum_indices->shape()),
                 errors::InvalidArgument(
                     "Input indices should be matrices but received shapes: ",
                     a_indices->shape().DebugString(), " and ",
@@ -49,8 +50,9 @@ class SparseAddGradOp : public OpKernel {
             "Input backprop_val_grad should be a vector but received shape: ",
             backprop_val_grad->shape().DebugString()));
     OP_REQUIRES(
-        ctx, a_indices->dim_size(1) == b_indices->dim_size(1) &&
-                 b_indices->dim_size(1) == sum_indices->dim_size(1),
+        ctx,
+        a_indices->dim_size(1) == b_indices->dim_size(1) &&
+            b_indices->dim_size(1) == sum_indices->dim_size(1),
         errors::InvalidArgument("The densified operands should have the same "
                                 "ndims; for A, B, sum got: ",
                                 a_indices->dim_size(1), b_indices->dim_size(1),
diff --git a/tensorflow/core/kernels/sparse_add_op.cc b/tensorflow/core/kernels/sparse_add_op.cc
index bd91dfdce6..d16317af67 100644
--- a/tensorflow/core/kernels/sparse_add_op.cc
+++ b/tensorflow/core/kernels/sparse_add_op.cc
@@ -34,8 +34,9 @@ class SparseAddOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, ctx->input("a_indices", &a_indices));
     OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(b_indices->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsMatrix(a_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(b_indices->shape()),
                 errors::InvalidArgument(
                     "Input indices should be matrices but received shapes: ",
                     a_indices->shape().DebugString(), " and ",
@@ -46,8 +47,9 @@ class SparseAddOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("a_values", &a_values_t));
     OP_REQUIRES_OK(ctx, ctx->input("b_values", &b_values_t));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_values_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_values_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_values_t->shape()),
                 errors::InvalidArgument(
                     "Input values should be vectors but received shapes: ",
                     a_values_t->shape().DebugString(), " and ",
@@ -62,8 +64,9 @@ class SparseAddOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, ctx->input("a_shape", &a_shape));
     OP_REQUIRES_OK(ctx, ctx->input("b_shape", &b_shape));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape->shape()) &&
-                         TensorShapeUtils::IsVector(b_shape->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_shape->shape()) &&
+                    TensorShapeUtils::IsVector(b_shape->shape()),
                 errors::InvalidArgument(
                     "Input shapes should be a vector but received shapes ",
                     a_shape->shape().DebugString(), " and ",
diff --git a/tensorflow/core/kernels/sparse_add_op_test.cc b/tensorflow/core/kernels/sparse_add_op_test.cc
index 4cad02bbee..1f08e6c5ce 100644
--- a/tensorflow/core/kernels/sparse_add_op_test.cc
+++ b/tensorflow/core/kernels/sparse_add_op_test.cc
@@ -61,9 +61,9 @@ TEST_F(SparseAddOpTest, TwoD_AddSparseTensorWithSelf) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
 #define ADD_TENSOR_INPUT()                                  \
diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
index c122616cf1..80bc1f1934 100644
--- a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
+++ b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
@@ -103,8 +103,9 @@ class SparseAccumulatorTakeGradientOp
                       DoneCallback callback) override {
     // Check signature
     OP_REQUIRES_OK_ASYNC(
-        ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32},
-                                 {DT_INT64, accumulator->dtype(), DT_INT64}),
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_INT32},
+                            {DT_INT64, accumulator->dtype(), DT_INT64}),
         callback);
   }
 
diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc
index 07d935d55f..7cd4532ad6 100644
--- a/tensorflow/core/kernels/sparse_cross_op.cc
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@@ -288,8 +288,7 @@ struct CrossTraits<true, int64> {
 template <bool HASHED_OUTPUT, typename InternalType>
 class SparseCrossOp : public OpKernel {
  public:
-  explicit SparseCrossOp(OpKernelConstruction* context)
-      : OpKernel(context) {
+  explicit SparseCrossOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
     // Read signed_hash_key_ as int64 since uint64 attributes are not
     // supported by REGISTER_OP.
@@ -316,8 +315,8 @@ class SparseCrossOp : public OpKernel {
         GenerateColumnsFromInput(indices_list_in, values_list_in,
                                  shapes_list_in, dense_list_in);
 
-    typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser
-        crosser(columns, num_buckets_, hash_key_);
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser crosser(
+        columns, num_buckets_, hash_key_);
     Tensor* indices_out;
     Tensor* values_out;
     Tensor* shape_out;
@@ -326,8 +325,8 @@ class SparseCrossOp : public OpKernel {
     CreateOutputTensors(columns, batch_size, context, &indices_out, &values_out,
                         &shape_out, &output_start_indices);
 
-    typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater
-        updater(output_start_indices, indices_out, values_out);
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater updater(
+        output_start_indices, indices_out, values_out);
     auto do_work = [this, &columns, crosser, updater](int64 begin, int64 end) {
       for (int b = begin; b < end; b++) {
         ProductIterator<InternalType> product_iterator(columns, b);
@@ -381,8 +380,9 @@ class SparseCrossOp : public OpKernel {
               "Input values should be a std::vector but received shape ",
               values_list_in[i].shape().DebugString(), " at position ", i));
       OP_REQUIRES(
-          context, indices_list_in[i].shape().dim_size(0) ==
-                       values_list_in[i].shape().dim_size(0),
+          context,
+          indices_list_in[i].shape().dim_size(0) ==
+              values_list_in[i].shape().dim_size(0),
           errors::InvalidArgument(
               "Expected size of values to be ",
               indices_list_in[i].shape().dim_size(0), " got ",
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
index cc0f86ce05..ac48202ada 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
@@ -70,8 +70,9 @@ class SparseDenseBinaryOpShared : public OpKernel {
                 errors::InvalidArgument(
                     "Input sp_indices should be a matrix but received shape: ",
                     indices_t->shape().DebugString()));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(values_t->shape()) &&
-                         TensorShapeUtils::IsVector(shape_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(values_t->shape()) &&
+                    TensorShapeUtils::IsVector(shape_t->shape()),
                 errors::InvalidArgument(
                     "Inputs sp_values and sp_shape should be vectors "
                     "but received shapes: ",
@@ -150,8 +151,9 @@ class SparseDenseBinaryOpShared : public OpKernel {
       CASE(4);
       CASE(5);
       default:
-        OP_REQUIRES(ctx, false, errors::InvalidArgument(
-                                    "Only tensors with ranks between 1 and 5 "
+        OP_REQUIRES(
+            ctx, false,
+            errors::InvalidArgument("Only tensors with ranks between 1 and 5 "
                                     "are currently supported.  Tensor rank: ",
                                     ndims));
 #undef CASE
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
index eaf1884243..fe198af7e6 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@@ -96,9 +96,9 @@ TEST_F(SparseDenseCDivTest, SameShape) {
   // [2    ]  cdiv [dense: same shape, all 1's]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   // Tensor dense(DT_FLOAT, TensorShape({3, 1}));
@@ -125,9 +125,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseSameDims) {
   // [2    ]  cdiv [dense: shape [3,1], all 1's]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({3, 1}));
@@ -152,9 +152,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseFewerDims) {
   // [2    ]  cdiv [dense: shape [2]]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({2}));
@@ -184,9 +184,9 @@ TEST_F(SparseDenseCMulTest, BroadcastDense) {
   // [1   ?]  where ? remains implicitly zero.
   // [1.5 0]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({2}));
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 8ab23b64d3..a1f9667b78 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -159,8 +159,8 @@ struct SparseSlice {
 
 template <typename T>
 template <bool Transpose>
-void SparseSlice<T>::Initialize(const typename SparseSlice<T>::ConstMatrixMap& mat,
-                                int col_offset) {
+void SparseSlice<T>::Initialize(
+    const typename SparseSlice<T>::ConstMatrixMap& mat, int col_offset) {
   const int mat_rows = Transpose ? mat.dimension(1) : mat.dimension(0);
   const int mat_cols = Transpose ? mat.dimension(0) : mat.dimension(1);
   DCHECK_LE(num_rows, mat_rows);
@@ -278,9 +278,9 @@ ALWAYS_INLINE float ConvertBfloat16ToFloat(const bfloat16* src) {
   float out = 0;
   auto tmp = reinterpret_cast<bfloat16*>(&out);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    tmp[0] = *src;
+  tmp[0] = *src;
 #else
-    tmp[1] = *src;
+  tmp[1] = *src;
 #endif
   return out;
 }
@@ -970,9 +970,9 @@ class SparseMatMulOp : public OpKernel {
     const int k2 = transpose_b_ ? b.dim_size(1) : b.dim_size(0);
 
     OP_REQUIRES(ctx, k == k2,
-                errors::InvalidArgument("Matrix size incompatible: a: ",
-                                        a.shape().DebugString(), ", b: ",
-                                        b.shape().DebugString()));
+                errors::InvalidArgument(
+                    "Matrix size incompatible: a: ", a.shape().DebugString(),
+                    ", b: ", b.shape().DebugString()));
     Tensor* output = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({m, n}), &output));
 
@@ -1224,8 +1224,9 @@ ALWAYS_INLINE void CopyAndMayBeInterleave(void* dst, const void* src,
 
 template <typename TL, typename TR>
 inline BlockingCounter* SparseMatMul<TL, TR>::ShuffleMatrix(
-    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int slice_row_start,
-    int slice_num_rows, int slice_col_start, int slice_num_cols, const int N,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat,
+    int slice_row_start, int slice_num_rows, int slice_col_start,
+    int slice_num_cols, const int N,
     const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer) {
   DCHECK_EQ(N % 2, 0);
   DCHECK_LE(kNumOperands * sizeof(float) / sizeof(TR), N);
@@ -1306,8 +1307,9 @@ inline std::unique_ptr<BlockingCounter> SparseMatMul<TL, TR>::CreateDenseSlices(
 template <typename TL, typename TR>
 inline void SparseMatMul<TL, TR>::ComputeBlockSizes(
     const typename SparseMatMul<TL, TR>::ConstMatrixMapL& left,
-    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left,
-    int num_threads, int* KR, int* NR, int* KL, int* JB, int* IB) {
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right,
+    bool transpose_left, int num_threads, int* KR, int* NR, int* KL, int* JB,
+    int* IB) {
   // Heuristics for calculating block sizes
   // Assume two hyperthreads per core.
   const int est_num_cores = std::max(1, (num_threads + 1) / 2);
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index cca52558ae..14ef2ed704 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -159,25 +159,25 @@ EIGEN_STRONG_INLINE Packet4f pload2bf16<Packet4f>(const float* from) {
 // Return a packet with the first value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_first<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 0);
+  return vec_splat(a, 0);
 }
 
 // Return a packet with the second value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_second<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 1);
+  return vec_splat(a, 1);
 }
 
 // Return a packet with the third value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_third<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 2);
+  return vec_splat(a, 2);
 }
 
 // Return a packet with the fourth value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 3);
+  return vec_splat(a, 3);
 }
 #endif
 
diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc
index f815ca9e34..ebc6d8fa4e 100644
--- a/tensorflow/core/kernels/sparse_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@@ -284,11 +284,11 @@ class SparseMatmulOpTest : public ::testing::Test {
       uint16_t* data3_bfloat16_p =
           reinterpret_cast<uint16_t*>(data3_bfloat16) + i;
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-            data3_p[1] = 0;
-            data3_bfloat16_p[0] = data3_p[0];
+      data3_p[1] = 0;
+      data3_bfloat16_p[0] = data3_p[0];
 #else
-            data3_p[0] = 0;
-            data3_bfloat16_p[0] = data3_p[1];
+      data3_p[0] = 0;
+      data3_bfloat16_p[0] = data3_p[1];
 #endif
     }
   }
diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
index 110376be42..96246c7a71 100644
--- a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
+++ b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
@@ -51,9 +51,9 @@ TEST_F(SparseReduceSumOpTest, SimpleReduce) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   AddInputFromArray<int64>(indices_shape, indices);
@@ -93,9 +93,9 @@ TEST_F(SparseReduceSumSparseOpTest, SimpleReduce) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   AddInputFromArray<int64>(indices_shape, indices);
diff --git a/tensorflow/core/kernels/sparse_softmax_op.cc b/tensorflow/core/kernels/sparse_softmax_op.cc
index 327a94b8a1..444a5f657a 100644
--- a/tensorflow/core/kernels/sparse_softmax_op.cc
+++ b/tensorflow/core/kernels/sparse_softmax_op.cc
@@ -50,8 +50,9 @@ class SparseSoftmaxOp : public OpKernel {
                 errors::InvalidArgument(
                     "Input sp_indices should be a matrix but received shape: ",
                     indices_t->shape().DebugString()));
-    OP_REQUIRES(context, TensorShapeUtils::IsVector(values_t->shape()) &&
-                             TensorShapeUtils::IsVector(shape_t->shape()),
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsVector(values_t->shape()) &&
+                    TensorShapeUtils::IsVector(shape_t->shape()),
                 errors::InvalidArgument(
                     "Inputs sp_values and sp_shape should be vectors "
                     "but received shapes: ",
diff --git a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
index b027adba6b..09cb2a6a71 100644
--- a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
@@ -132,14 +132,16 @@ class SparseSparseBinaryOpShared : public OpKernel {
 
     // Validations.
     OP_REQUIRES(
-        ctx, TensorShapeUtils::IsMatrix(a_indices_t->shape()) &&
-                 TensorShapeUtils::IsMatrix(b_indices_t->shape()),
+        ctx,
+        TensorShapeUtils::IsMatrix(a_indices_t->shape()) &&
+            TensorShapeUtils::IsMatrix(b_indices_t->shape()),
         errors::InvalidArgument("Inputs a_indices and b_indices should be "
                                 "matrices but received shapes: ",
                                 a_indices_t->shape().DebugString(), ", ",
                                 b_indices_t->shape().DebugString()));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_values_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_values_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_values_t->shape()),
                 errors::InvalidArgument(
                     "Inputs a_values and b_values should be vectors "
                     "but received shapes: ",
@@ -157,8 +159,9 @@ class SparseSparseBinaryOpShared : public OpKernel {
                                 " non-empty input values, got ",
                                 a_values.size(), " and ", b_values.size()));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_shape_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_shape_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_shape_t->shape()),
                 errors::InvalidArgument(
                     "Input shapes should be a vector but received shapes ",
                     a_shape_t->shape().DebugString(), " and ",
diff --git a/tensorflow/core/kernels/sparse_split_op.cc b/tensorflow/core/kernels/sparse_split_op.cc
index 6171b532aa..67dcf05a6c 100644
--- a/tensorflow/core/kernels/sparse_split_op.cc
+++ b/tensorflow/core/kernels/sparse_split_op.cc
@@ -48,18 +48,20 @@ class SparseSplitOp : public OpKernel {
                     "Input shape should be a vector but received shape ",
                     input_shape.shape().DebugString()));
 
-    OP_REQUIRES(context, input_shape.dim_size(0) &&
-                             split_dim < input_shape.vec<int64>().size(),
-                errors::InvalidArgument(
-                    "Input split_dim should be between 0 and rank (",
-                    input_shape.vec<int64>().size(), "), got ", split_dim));
-
-    OP_REQUIRES(context, num_split_ >= 1 &&
-                             num_split_ <= input_shape.vec<int64>()(split_dim),
-                errors::InvalidArgument("Input num_split should be between 1 "
-                                        "and the splitting dimension size (",
-                                        input_shape.vec<int64>()(split_dim),
-                                        "), got ", num_split_));
+    OP_REQUIRES(
+        context,
+        input_shape.dim_size(0) && split_dim < input_shape.vec<int64>().size(),
+        errors::InvalidArgument(
+            "Input split_dim should be between 0 and rank (",
+            input_shape.vec<int64>().size(), "), got ", split_dim));
+
+    OP_REQUIRES(
+        context,
+        num_split_ >= 1 && num_split_ <= input_shape.vec<int64>()(split_dim),
+        errors::InvalidArgument("Input num_split should be between 1 "
+                                "and the splitting dimension size (",
+                                input_shape.vec<int64>()(split_dim), "), got ",
+                                num_split_));
 
     sparse::SparseTensor sparse_tensor(input_indices, input_values,
                                        TensorShape(input_shape.vec<int64>()));
diff --git a/tensorflow/core/kernels/sparse_to_dense_op.cc b/tensorflow/core/kernels/sparse_to_dense_op.cc
index 6a6cc3d813..ba3da21a43 100644
--- a/tensorflow/core/kernels/sparse_to_dense_op.cc
+++ b/tensorflow/core/kernels/sparse_to_dense_op.cc
@@ -73,8 +73,9 @@ class SparseToDense : public OpKernel {
     // sparse_values
     const Tensor& sparse_values = c->input(2);
     const int64 num_values = sparse_values.NumElements();
-    OP_REQUIRES(c, sparse_values.dims() == 0 ||
-                       (sparse_values.dims() == 1 && num_values == num_elems),
+    OP_REQUIRES(c,
+                sparse_values.dims() == 0 ||
+                    (sparse_values.dims() == 1 && num_values == num_elems),
                 errors::InvalidArgument("sparse_values has incorrect shape ",
                                         sparse_values.shape().DebugString(),
                                         ", should be [] or [", num_elems, "]"));
diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
index f0d19da804..d8b0f93082 100644
--- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc
+++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
@@ -38,7 +38,6 @@ namespace {
 
 class SparseToDenseTest : public OpsTestBase {
  protected:
-
   void MakeOp(int dim, DataType index_type, DataType value_type) {
     TF_ASSERT_OK(NodeDefBuilder("sparsetodense", "SparseToDense")
                      .Input(FakeInput(index_type))
diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc
index c35ba42db2..f84ffd5323 100644
--- a/tensorflow/core/kernels/sparse_xent_op.cc
+++ b/tensorflow/core/kernels/sparse_xent_op.cc
@@ -39,10 +39,10 @@ Status CheckInvalidLabelIndex(const Tensor& labels, int64 max_index) {
   if (*min_max_dim_value.first < 0 || *min_max_dim_value.second >= max_index) {
     bad_index = (*min_max_dim_value.first < 0) ? *min_max_dim_value.first
                                                : *min_max_dim_value.second;
-    return errors::InvalidArgument("Received a label value of ", bad_index,
-                                   " which is outside the valid range of [0, ",
-                                   max_index, ").  Label values: ",
-                                   labels.SummarizeValue(labels.NumElements()));
+    return errors::InvalidArgument(
+        "Received a label value of ", bad_index,
+        " which is outside the valid range of [0, ", max_index,
+        ").  Label values: ", labels.SummarizeValue(labels.NumElements()));
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/kernels/sparse_xent_op_test.cc b/tensorflow/core/kernels/sparse_xent_op_test.cc
index b8ea0d2d7e..afb0bf7626 100644
--- a/tensorflow/core/kernels/sparse_xent_op_test.cc
+++ b/tensorflow/core/kernels/sparse_xent_op_test.cc
@@ -41,10 +41,10 @@ static Graph* SparseXent(int batch_size, int num_classes) {
   return g;
 }
 
-#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                                \
-  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) {       \
+#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                          \
+  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \
     testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \
-    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters);            \
+    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters);      \
   }                                                                     \
   BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE);
 
diff --git a/tensorflow/core/kernels/split_lib.h b/tensorflow/core/kernels/split_lib.h
index ff92ffeeb3..a08949e626 100644
--- a/tensorflow/core/kernels/split_lib.h
+++ b/tensorflow/core/kernels/split_lib.h
@@ -57,7 +57,7 @@ struct Split<Eigen::SyclDevice, T> {
                   const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
                   const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes);
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc
index 25026208d1..771c633b15 100644
--- a/tensorflow/core/kernels/split_lib_cpu.cc
+++ b/tensorflow/core/kernels/split_lib_cpu.cc
@@ -49,13 +49,13 @@ void Split<Eigen::SyclDevice, T>::operator()(
     typename TTypes<T, 3>::ConstTensor input,
     const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
     const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes) {
-    output.device(d) = input.slice(slice_indices, slice_sizes);
+  output.device(d) = input.slice(slice_indices, slice_sizes);
 }
 
 #define DEFINE_SYCL_KERNELS(T) template struct Split<Eigen::SyclDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DEFINE_SYCL_KERNELS);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index 78badde27e..85f529326d 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -39,7 +39,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SplitOpBase : public OpKernel {
@@ -142,8 +142,9 @@ class SplitOpCPU : public SplitOpBase<CPUDevice, T> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -245,10 +246,11 @@ class SplitOpGPU : public SplitOpBase<GPUDevice, T> {
     const int32 split_dim =
         split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
     const int32 num_split = Base::num_outputs();
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument("Split on GPU requires input size "
-                                        "< max int32"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument("Split on GPU requires input size "
+                                "< max int32"));
     int32 prefix_dim_size;
     int32 split_dim_size;
     int32 suffix_dim_size;
@@ -304,8 +306,9 @@ class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -342,14 +345,14 @@ class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
             {prefix_dim_size, split_dim_output_size, suffix_dim_size});
 
         functor::Split<SYCLDevice, T>()(context->eigen_device<SYCLDevice>(),
-                                       result_shaped, input_reshaped,
-                                       slice_indices, slice_sizes);
+                                        result_shaped, input_reshaped,
+                                        slice_indices, slice_sizes);
       }
       indices[1] += split_dim_output_size;
     }
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_SPLIT(type)                             \
   REGISTER_KERNEL_BUILDER(Name("Split")                  \
@@ -381,11 +384,11 @@ REGISTER_GPU(bfloat16);
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(type)                               \
-  REGISTER_KERNEL_BUILDER(Name("Split")                   \
-                              .Device(DEVICE_SYCL)        \
-                              .TypeConstraint<type>("T")  \
-                              .HostMemory("split_dim"),   \
+#define REGISTER_SYCL(type)                              \
+  REGISTER_KERNEL_BUILDER(Name("Split")                  \
+                              .Device(DEVICE_SYCL)       \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("split_dim"),  \
                           SplitOpSYCL<type>)
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL);
diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc
index f1078ac349..7ff5df47d7 100644
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@@ -197,8 +197,9 @@ class SplitVOpCPU : public SplitVOpBase<CPUDevice, T, Tlen> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -305,10 +306,11 @@ class SplitVOpGPU : public SplitVOpBase<GPUDevice, T, Tlen> {
     const int32 split_dim_orig = context->input(2).flat<int32>()(0);
     const int32 split_dim =
         split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument("Split on GPU requires input size "
-                                        "< max int32"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument("Split on GPU requires input size "
+                                "< max int32"));
 
     int32 prefix_dim_size;
     int32 split_dim_size;
diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc
index affe81a555..65296f61fd 100644
--- a/tensorflow/core/kernels/stack_ops.cc
+++ b/tensorflow/core/kernels/stack_ops.cc
@@ -42,7 +42,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class Stack : public ResourceBase {
  public:
@@ -242,7 +242,7 @@ REGISTER_KERNEL_BUILDER(Name("StackV2")
                             .HostMemory("max_size")
                             .HostMemory("handle"),
                         StackOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device>
 class StackPushOp : public AsyncOpKernel {
@@ -274,11 +274,11 @@ class StackPushOp : public AsyncOpKernel {
     static constexpr int kCopyThreshold = 2048;
     static constexpr double kOccupancy = 0.7;
     if (swap_memory_ && !alloc_attrs.on_host() &&
-        ( std::is_same<Device, GPUDevice>::value
+        (std::is_same<Device, GPUDevice>::value
 #ifdef TENSORFLOW_USE_SYCL
-          || std::is_same<Device, SYCLDevice>::value
-#endif // TENSORFLOW_USE_SYCL
-        ) &&
+         || std::is_same<Device, SYCLDevice>::value
+#endif  // TENSORFLOW_USE_SYCL
+         ) &&
         tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) {
       DeviceContext* device_ctxt = ctx->op_device_context();
       auto device = static_cast<tensorflow::Device*>(ctx->device());
@@ -391,7 +391,7 @@ REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(bool);
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StackPopOp : public AsyncOpKernel {
  public:
@@ -498,7 +498,7 @@ REGISTER_SYCL_HOST_KERNEL(bool);
 
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StackCloseOp : public OpKernel {
  public:
@@ -526,6 +526,6 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("StackCloseV2").Device(DEVICE_SYCL).HostMemory("handle"),
     StackCloseOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 0fae46dea6..03fc4467a1 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -70,12 +70,11 @@ class Buffer : public ResourceBase {
     return bytes + current_bytes_ > memory_limit_;
   }
 
-  std::size_t GetTupleBytes(const Tuple & tuple)
-  {
+  std::size_t GetTupleBytes(const Tuple& tuple) {
     return std::accumulate(tuple.begin(), tuple.end(), 0,
-      [](const std::size_t & lhs, const Tensor & rhs) {
-        return lhs + rhs.TotalBytes();
-    });
+                           [](const std::size_t& lhs, const Tensor& rhs) {
+                             return lhs + rhs.TotalBytes();
+                           });
   }
 
  public:
@@ -90,19 +89,22 @@ class Buffer : public ResourceBase {
     std::size_t tuple_bytes = GetTupleBytes(*tuple);
 
     // Sanity check so that we don't block for ever below
-    if(memory_limit_ > 0 && tuple_bytes > memory_limit_) {
-      return Status(errors::ResourceExhausted("Attempted to insert "
-        "tensors with combined size of '", tuple_bytes, "' bytes into "
-        "Staging Area with a memory limit of '", memory_limit_, "'."));
+    if (memory_limit_ > 0 && tuple_bytes > memory_limit_) {
+      return Status(
+          errors::ResourceExhausted("Attempted to insert "
+                                    "tensors with combined size of '",
+                                    tuple_bytes,
+                                    "' bytes into "
+                                    "Staging Area with a memory limit of '",
+                                    memory_limit_, "'."));
     }
 
-
     // If buffer capacity is bounded wait until elements have been removed
-    if(IsBounded()) {
+    if (IsBounded()) {
       full_cond_var_.wait(lock, [tuple_bytes, this]() {
         // If there's a memory limit, check if there's space for insertion
-        bool memory_limit_valid = memory_limit_ > 0 ?
-            !WouldExceedMemoryLimit(tuple_bytes) : true;
+        bool memory_limit_valid =
+            memory_limit_ > 0 ? !WouldExceedMemoryLimit(tuple_bytes) : true;
         // If we're configured for capacity check if there's space for insertion
         bool capacity_valid = capacity_ > 0 ? !IsCapacityFull() : true;
 
@@ -186,8 +188,7 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
   ContainerInfo cinfo;
 
   // Lambda for creating the Staging Area
-  auto create_fn = [&ndef](Buffer** ret) -> Status
-  {
+  auto create_fn = [&ndef](Buffer** ret) -> Status {
     int64 capacity;
     int64 memory_limit;
     TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "capacity", &capacity));
@@ -196,7 +197,6 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
     return Status::OK();
   };
 
-
   TF_RETURN_IF_ERROR(cinfo.Init(rm, ndef, true /* use name() */));
   TF_RETURN_IF_ERROR(rm->LookupOrCreate<Buffer>(cinfo.container(), cinfo.name(),
                                                 buf, create_fn));
@@ -228,7 +228,7 @@ REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_SYCL), StageOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class UnstageOp : public OpKernel {
  public:
@@ -244,7 +244,8 @@ class UnstageOp : public OpKernel {
 
     buf->Get(&tuple);
 
-    OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(),
+    OP_REQUIRES(
+        ctx, tuple.size() == (size_t)ctx->num_outputs(),
         errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
                                 " vs. ", ctx->num_outputs()));
 
@@ -260,7 +261,7 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StagePeekOp : public OpKernel {
  public:
@@ -278,7 +279,8 @@ class StagePeekOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, buf->Peek(index, &tuple));
 
-    OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(),
+    OP_REQUIRES(
+        ctx, tuple.size() == (size_t)ctx->num_outputs(),
         errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
                                 " vs. ", ctx->num_outputs()));
 
@@ -288,17 +290,15 @@ class StagePeekOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU),
-                                              StagePeekOp);
+REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), StagePeekOp);
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index").
-                            Device(DEVICE_GPU), StagePeekOp);
+REGISTER_KERNEL_BUILDER(
+    Name("StagePeek").HostMemory("index").Device(DEVICE_GPU), StagePeekOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index")
-                          .Device(DEVICE_SYCL), StagePeekOp);
-#endif // TENSORFLOW_USE_SYCL
-
+REGISTER_KERNEL_BUILDER(
+    Name("StagePeek").HostMemory("index").Device(DEVICE_SYCL), StagePeekOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 class StageSizeOp : public OpKernel {
  public:
@@ -312,9 +312,8 @@ class StageSizeOp : public OpKernel {
     core::ScopedUnref scope(buf);
 
     // Allocate size output tensor
-    Tensor * size = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}),
-                                                     &size));
+    Tensor* size = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &size));
 
     // Set it to the actual size
     size->scalar<int32>().setConstant(buf->Size());
@@ -323,13 +322,13 @@ class StageSizeOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("StageSize").Device(DEVICE_CPU), StageSizeOp);
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size")
-                        .Device(DEVICE_GPU), StageSizeOp);
+REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size").Device(DEVICE_GPU),
+                        StageSizeOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size")
-                        .Device(DEVICE_SYCL), StageSizeOp);
-#endif // TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("StageSize").HostMemory("size").Device(DEVICE_SYCL), StageSizeOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 class StageClearOp : public OpKernel {
  public:
@@ -352,7 +351,6 @@ REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_GPU), StageClearOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_SYCL), StageClearOp);
-#endif // TENSORFLOW_USE_SYCL
-
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 7c213e14d2..8f7f91c9df 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -541,5 +541,5 @@ REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")
                             .HostMemory("strides"),
                         StridedSliceAssignOp<CPUDevice, int32>)
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index a84ba38ef4..ac1259a9ac 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -302,7 +302,7 @@ DECLARE_FOR_N_SYCL(int32);
 DECLARE_FOR_N_SYCL(int64);
 
 #undef DECLARE_FOR_N_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #undef INSTANTIATE
 #undef DECLARE_FOR_N_CPU
diff --git a/tensorflow/core/kernels/string_join_op.cc b/tensorflow/core/kernels/string_join_op.cc
index 721702bec6..28cca9f448 100644
--- a/tensorflow/core/kernels/string_join_op.cc
+++ b/tensorflow/core/kernels/string_join_op.cc
@@ -50,9 +50,9 @@ class StringJoinOp : public OpKernel {
         } else {
           OP_REQUIRES(
               context, input_shape == input.shape(),
-              errors::InvalidArgument("Input shapes do not match: ",
-                                      input_shape.DebugString(), " vs. ",
-                                      input.shape().DebugString()));
+              errors::InvalidArgument(
+                  "Input shapes do not match: ", input_shape.DebugString(),
+                  " vs. ", input.shape().DebugString()));
         }
       }
     }
diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc
index 743f113150..e29f67297f 100644
--- a/tensorflow/core/kernels/substr_op.cc
+++ b/tensorflow/core/kernels/substr_op.cc
@@ -95,9 +95,9 @@ class SubstrOp : public OpKernel {
       // Create BCast helper with shape of input and pos/len
       BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(pos_shape));
       OP_REQUIRES(context, bcast.IsValid(),
-                  errors::InvalidArgument("Incompatible shapes: ",
-                                          input_shape.DebugString(), " vs. ",
-                                          pos_shape.DebugString()));
+                  errors::InvalidArgument(
+                      "Incompatible shapes: ", input_shape.DebugString(),
+                      " vs. ", pos_shape.DebugString()));
       TensorShape output_shape = BCast::ToShape(bcast.result_shape());
       int ndims = output_shape.dims();
       Tensor* output_tensor = nullptr;
diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc
index 233b824bcc..29b21ee735 100644
--- a/tensorflow/core/kernels/summary_image_op.cc
+++ b/tensorflow/core/kernels/summary_image_op.cc
@@ -54,18 +54,20 @@ class SummaryImageOp : public OpKernel {
     const Tensor& tensor = c->input(1);
     OP_REQUIRES(c, IsLegacyScalar(tags.shape()),
                 errors::InvalidArgument("Tags must be a scalar"));
-    OP_REQUIRES(c, tensor.dims() == 4 &&
-                       (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
-                        tensor.dim_size(3) == 4),
+    OP_REQUIRES(c,
+                tensor.dims() == 4 &&
+                    (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
+                     tensor.dim_size(3) == 4),
                 errors::InvalidArgument(
                     "Tensor must be 4-D with last dim 1, 3, or 4, not ",
                     tensor.shape().DebugString()));
     const string& base_tag = tags.scalar<string>()();
 
-    OP_REQUIRES(c, tensor.dim_size(0) < (1LL << 31) &&
-                       tensor.dim_size(1) < (1LL << 31) &&
-                       tensor.dim_size(2) < (1LL << 31) &&
-                       (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29),
+    OP_REQUIRES(c,
+                tensor.dim_size(0) < (1LL << 31) &&
+                    tensor.dim_size(1) < (1LL << 31) &&
+                    tensor.dim_size(2) < (1LL << 31) &&
+                    (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29),
                 errors::InvalidArgument("Tensor too large for summary ",
                                         tensor.shape().DebugString()));
 
diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc
index b818724ec2..1f4e3418f4 100644
--- a/tensorflow/core/kernels/summary_op.cc
+++ b/tensorflow/core/kernels/summary_op.cc
@@ -41,11 +41,12 @@ class SummaryScalarOp : public OpKernel {
     const Tensor& values = c->input(1);
 
     OP_REQUIRES(
-        c, tags.IsSameSize(values) ||
-               (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())),
-        errors::InvalidArgument("tags and values not the same shape: ",
-                                tags.shape().DebugString(), " != ",
-                                values.shape().DebugString(), SingleTag(tags)));
+        c,
+        tags.IsSameSize(values) ||
+            (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())),
+        errors::InvalidArgument(
+            "tags and values not the same shape: ", tags.shape().DebugString(),
+            " != ", values.shape().DebugString(), SingleTag(tags)));
     auto Ttags = tags.flat<string>();
     auto Tvalues = values.flat<T>();
     Summary s;
diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc
index b2fd669541..f814486701 100644
--- a/tensorflow/core/kernels/tile_functor_cpu.cc
+++ b/tensorflow/core/kernels/tile_functor_cpu.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/tile_functor.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/tile_functor.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h
index 054b31ef9e..df6a666cd4 100644
--- a/tensorflow/core/kernels/tile_ops_cpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h
@@ -63,7 +63,7 @@ TF_CALL_int64(DEFINE_TYPE);
 
 #undef DEFINE_DIM
 #undef DEFINE_TYPE
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 38e77ab60f..07befa27bc 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -3279,7 +3279,6 @@ REGISTER_KERNELS(double, int64);
 
 #undef REGISTER_KERNELS
 
-
 template <typename Device, typename T>
 class ApplyAddSignOp : public OpKernel {
  public:
@@ -3362,17 +3361,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                               \
-  template <>                                                             \
-  void ApplyAddSign<GPUDevice, T>::operator()(                            \
-      const GPUDevice& d,                                                 \
-      typename TTypes<T>::Flat var,                                       \
-      typename TTypes<T>::Flat m,                                         \
-      typename TTypes<T>::ConstScalar lr,                                 \
-      typename TTypes<T>::ConstScalar alpha,                              \
-      typename TTypes<T>::ConstScalar sign_decay,                         \
-      typename TTypes<T>::ConstScalar beta,                               \
-      typename TTypes<T>::ConstFlat grad);                                \
+#define DECLARE_GPU_SPEC(T)                                           \
+  template <>                                                         \
+  void ApplyAddSign<GPUDevice, T>::operator()(                        \
+      const GPUDevice& d, typename TTypes<T>::Flat var,               \
+      typename TTypes<T>::Flat m, typename TTypes<T>::ConstScalar lr, \
+      typename TTypes<T>::ConstScalar alpha,                          \
+      typename TTypes<T>::ConstScalar sign_decay,                     \
+      typename TTypes<T>::ConstScalar beta,                           \
+      typename TTypes<T>::ConstFlat grad);                            \
   extern template struct ApplyAddSign<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -3387,7 +3384,6 @@ REGISTER_KERNELS(GPU, double);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
-
 template <typename Device, typename T>
 class ApplyPowerSignOp : public OpKernel {
  public:
@@ -3470,17 +3466,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                               \
-  template <>                                                             \
-  void ApplyPowerSign<GPUDevice, T>::operator()(                          \
-      const GPUDevice& d,                                                 \
-      typename TTypes<T>::Flat var,                                       \
-      typename TTypes<T>::Flat m,                                         \
-      typename TTypes<T>::ConstScalar lr,                                 \
-      typename TTypes<T>::ConstScalar logbase,                            \
-      typename TTypes<T>::ConstScalar sign_decay,                         \
-      typename TTypes<T>::ConstScalar beta,                               \
-      typename TTypes<T>::ConstFlat grad);                                \
+#define DECLARE_GPU_SPEC(T)                                           \
+  template <>                                                         \
+  void ApplyPowerSign<GPUDevice, T>::operator()(                      \
+      const GPUDevice& d, typename TTypes<T>::Flat var,               \
+      typename TTypes<T>::Flat m, typename TTypes<T>::ConstScalar lr, \
+      typename TTypes<T>::ConstScalar logbase,                        \
+      typename TTypes<T>::ConstScalar sign_decay,                     \
+      typename TTypes<T>::ConstScalar beta,                           \
+      typename TTypes<T>::ConstFlat grad);                            \
   extern template struct ApplyPowerSign<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index d443a6b3c1..0376a3b2c6 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/training_ops.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/training_ops.h"
 
 namespace tensorflow {
 
@@ -115,13 +115,11 @@ struct ApplyAdam<GPUDevice, T> {
     Eigen::Sizes<1> single;
     const auto one = static_cast<T>(1.0);
     m.device(d) =
-        m +
-        (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
-            (grad - m);
+        m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+                (grad - m);
     v.device(d) =
-        v +
-        (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
-            (grad.square() - v);
+        v + (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
+                (grad.square() - v);
 
     if (use_nesterov) {
       var.device(d) -=
@@ -157,9 +155,9 @@ struct ApplyRMSProp<GPUDevice, T> {
     bcast[0] = grad.dimension(0);
     Eigen::Sizes<1> single;
     const auto one = static_cast<T>(1.0);
-    ms.device(d) = ms +
-                   (rho.constant(one) - rho).reshape(single).broadcast(bcast) *
-                       (grad.square() - ms);
+    ms.device(d) =
+        ms + (rho.constant(one) - rho).reshape(single).broadcast(bcast) *
+                 (grad.square() - ms);
     mom.device(d) =
         mom * momentum.reshape(single).broadcast(bcast) +
         lr.reshape(single).broadcast(bcast) * grad /
@@ -212,7 +210,7 @@ struct ApplyAddSign<GPUDevice, T> {
     auto beta_bcast = beta.reshape(single).broadcast(bcast);
     auto one_minus_beta =
         (beta.constant(one) - beta).reshape(single).broadcast(bcast);
-    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+    m.device(d) = m * beta_bcast + grad * one_minus_beta;
 
     // The following is the GPU equivalent of the CPU version:
     // var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad;
@@ -244,7 +242,7 @@ struct ApplyPowerSign<GPUDevice, T> {
     auto beta_bcast = beta.reshape(single).broadcast(bcast);
     auto one_minus_beta =
         (beta.constant(one) - beta).reshape(single).broadcast(bcast);
-    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+    m.device(d) = m * beta_bcast + grad * one_minus_beta;
 
     // The following is the GPU equivalent of the CPU version:
     // auto grad_scale = (logbase() * sign_decay() * sign_gm).exp();
@@ -253,7 +251,7 @@ struct ApplyPowerSign<GPUDevice, T> {
     auto lr_bcast = lr.reshape(single).broadcast(bcast);
     auto logbase_bcast = logbase.reshape(single).broadcast(bcast);
     auto sign_decay_bcast = sign_decay.reshape(single).broadcast(bcast);
-    auto grad_scale =  (logbase_bcast * sign_decay_bcast * sign_gm).exp();
+    auto grad_scale = (logbase_bcast * sign_decay_bcast * sign_gm).exp();
     var.device(d) -= lr_bcast * grad_scale * grad;
   }
 };
diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc
index ffa7f87c9e..2dcc4a500e 100644
--- a/tensorflow/core/kernels/training_ops_test.cc
+++ b/tensorflow/core/kernels/training_ops_test.cc
@@ -176,8 +176,9 @@ static void Adam(int32 n, Graph** init_g, Graph** train_g) {
     auto beta2 = Scalar(g, 0.99);
     auto epsilon = Scalar(g, 1e-8);
     auto grad = Random(g, n);
-    test::graph::Multi(g, "ApplyAdam", {var, m, v, beta1_power, beta2_power, lr,
-                                        beta1, beta2, epsilon, grad});
+    test::graph::Multi(
+        g, "ApplyAdam",
+        {var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad});
     *train_g = g;
   }
 }
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 2e0d18b634..7177ad7888 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -176,9 +176,10 @@ void TransposeOp::Compute(OpKernelContext* ctx) {
     }
   }
   for (int i = 0; i < dims; ++i) {
-    OP_REQUIRES(ctx, bits[i], errors::InvalidArgument(
-                                  i, " is missing from {",
-                                  str_util::Join(permutation, ","), "}."));
+    OP_REQUIRES(
+        ctx, bits[i],
+        errors::InvalidArgument(i, " is missing from {",
+                                str_util::Join(permutation, ","), "}."));
   }
 
   // 0-D, 1-D, and identity transposes do nothing.
diff --git a/tensorflow/core/kernels/typed_queue.h b/tensorflow/core/kernels/typed_queue.h
index 0d608d9b87..43dcb4cef7 100644
--- a/tensorflow/core/kernels/typed_queue.h
+++ b/tensorflow/core/kernels/typed_queue.h
@@ -58,9 +58,9 @@ Status TypedQueue<SubQueue>::Initialize() {
   if (!component_shapes_.empty() &&
       component_dtypes_.size() != component_shapes_.size()) {
     return errors::InvalidArgument(
-        "Different number of component types.  ", "Types: ",
-        DataTypeSliceString(component_dtypes_), ", Shapes: ",
-        ShapeListString(component_shapes_));
+        "Different number of component types.  ",
+        "Types: ", DataTypeSliceString(component_dtypes_),
+        ", Shapes: ", ShapeListString(component_shapes_));
   }
 
   mutex_lock lock(mu_);
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index 397bdd5670..764b6a252a 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -34,7 +34,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class UnpackOp : public OpKernel {
@@ -65,8 +65,9 @@ class UnpackOp : public OpKernel {
     output_shape.RemoveDim(axis);
     const int64 output_size = output_shape.num_elements();
     OP_REQUIRES(
-        context, FastBoundsCheck(output_size,
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(output_size,
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("output size must fit in Eigen DenseIndex"));
 
 // This optimization is currently not applicable for SYCL devices
diff --git a/tensorflow/core/kernels/word2vec_kernels.cc b/tensorflow/core/kernels/word2vec_kernels.cc
index 2d05d72bff..3477445197 100644
--- a/tensorflow/core/kernels/word2vec_kernels.cc
+++ b/tensorflow/core/kernels/word2vec_kernels.cc
@@ -188,9 +188,9 @@ class SkipgramOp : public OpKernel {
       ++corpus_size_;
     }
     if (corpus_size_ < window_size_ * 10) {
-      return errors::InvalidArgument("The text file ", filename,
-                                     " contains too little data: ",
-                                     corpus_size_, " words");
+      return errors::InvalidArgument(
+          "The text file ", filename,
+          " contains too little data: ", corpus_size_, " words");
     }
     typedef std::pair<string, int32> WordFreq;
     std::vector<WordFreq> ordered;
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index 0f8d027caa..a6a71fdfaf 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SoftmaxXentWithLogitsOp : public OpKernel {
@@ -44,8 +44,8 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
     OP_REQUIRES(context, logits_in.IsSameSize(labels_in),
                 errors::InvalidArgument(
                     "logits and labels must be same size: logits_size=",
-                    logits_in.shape().DebugString(), " labels_size=",
-                    labels_in.shape().DebugString()));
+                    logits_in.shape().DebugString(),
+                    " labels_size=", labels_in.shape().DebugString()));
     OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
                 errors::InvalidArgument("logits must be 2-dimensional"));
     // As we already tested that both inputs have the same shape no need to
@@ -72,7 +72,7 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
       functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
               labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
               back_out->matrix<T>());
-      }
+    }
   }
 };
 
@@ -87,7 +87,7 @@ struct XentFunctorBase {
                   typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
     XentEigenImpl<Device, T>::Compute(d, logits, labels, scratch, loss,
-                                         backprop);
+                                      backprop);
   }
 };
 
@@ -97,7 +97,7 @@ struct XentFunctor<CPUDevice, T> : XentFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct XentFunctor<SYCLDevice, T> : XentFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace functor
 
 #define REGISTER_CPU(T)                                         \
@@ -129,6 +129,6 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<float>("T"),
                         SoftmaxXentWithLogitsOp<SYCLDevice, float>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc
index e294701246..481f3b7ba4 100644
--- a/tensorflow/core/kernels/xsmm_conv2d_test.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc
@@ -13,18 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/conv_ops.h"
-#include "tensorflow/core/platform/test.h"
+#include "include/libxsmm.h"
+#include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
-#include "include/libxsmm.h"
-#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace {
 
-
 typedef struct {
   int nImg;
   int nIfm;
@@ -49,45 +48,41 @@ typedef struct {
   int stride_w;
 } naive_conv_t;
 
-
-LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor &nhwc, int N, int H, int W, int C)
-{
-  LIBXSMM_VLA_DECL(4, const float,  input, nchw, C, H, W);
+LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor& nhwc,
+                                            int N, int H, int W, int C) {
+  LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W);
   int n, h, w, c;
-  auto output =  nhwc.flat<float>();
-  for ( n = 0; n < N; n++ ) {
-    for ( h = 0; h < H; h++ ) {
-      for ( w = 0; w < W; w++ ) {
-        for ( c = 0; c < C; c++ ) {
-          output(n*H*W*C + h*W*C +w*C + c)  =
-          LIBXSMM_VLA_ACCESS(4,  input, n, c, h, w, C, H, W);
+  auto output = nhwc.flat<float>();
+  for (n = 0; n < N; n++) {
+    for (h = 0; h < H; h++) {
+      for (w = 0; w < W; w++) {
+        for (c = 0; c < C; c++) {
+          output(n * H * W * C + h * W * C + w * C + c) =
+              LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W);
         }
       }
     }
   }
 }
 
-
-LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor  &rsck, int R, int S, int C, int K)
-{
-  LIBXSMM_VLA_DECL(4, const float,  input, kcrs, C, R, S);
+LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor& rsck,
+                                            int R, int S, int C, int K) {
+  LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S);
   int r, s, c, k;
-  auto output =  rsck.flat<float>();
-
-  for ( r = 0; r < R; r++ ) {
-    for ( s = 0; s < S; s++ ) {
-      for ( c = 0; c < C; c++ ) {
-        for ( k = 0; k < K; k++ ) {
-          output(r*S*C*K + s*C*K + c*K + k) =
-          LIBXSMM_VLA_ACCESS(4,  input, k, c, r, s, C, R, S);
+  auto output = rsck.flat<float>();
+
+  for (r = 0; r < R; r++) {
+    for (s = 0; s < S; s++) {
+      for (c = 0; c < C; c++) {
+        for (k = 0; k < K; k++) {
+          output(r * S * C * K + s * C * K + c * K + k) =
+              LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S);
         }
       }
     }
   }
 }
 
-
-
 LIBXSMM_INLINE void zero_buf(float* buf, long size) {
   int i;
   for (i = 0; i < size; ++i) {
@@ -95,52 +90,53 @@ LIBXSMM_INLINE void zero_buf(float* buf, long size) {
   }
 }
 
-LIBXSMM_INLINE void copy_buf(Tensor &dst,float *src,long size) {
-  long  i;
-  auto output =  dst.flat<float>();
-  for (i = 0; i < size; ++i)
-          output(i) = src[i];
+LIBXSMM_INLINE void copy_buf(Tensor& dst, float* src, long size) {
+  long i;
+  auto output = dst.flat<float>();
+  for (i = 0; i < size; ++i) output(i) = src[i];
 }
 
-LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne)
-{
+LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne) {
   int i;
   zero_buf(buf, size);
   for (i = 0; i < size; ++i) {
-    buf[i] = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? drand48() : (0.05 - drand48()/10.0)));
+    buf[i] =
+        (float)((initOne != 0)
+                    ? 1.0
+                    : ((initPos != 0) ? drand48() : (0.05 - drand48() / 10.0)));
   }
 }
 
-
-
-LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float* output, const float* filter)
-{
-  int nImg      = param->nImg;
-  int nIfm      = param->nIfm;
-  int nOfm      = param->nOfm;
-  int ifhp      = param->ifhp;
-  int ifwp      = param->ifwp;
-  int ofhp      = param->ofhp;
-  int ofwp      = param->ofwp;
-  int ifh       = param->ifh;
-  int ifw       = param->ifw;
-  int ofh       = param->ofh;
-  int ofw       = param->ofw;
-  int pad_h     = param->pad_h;
-  int pad_w     = param->pad_w;
-  int pad_h_in  = param->pad_h_in;
-  int pad_w_in  = param->pad_w_in;
+LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input,
+                                  float* output, const float* filter) {
+  int nImg = param->nImg;
+  int nIfm = param->nIfm;
+  int nOfm = param->nOfm;
+  int ifhp = param->ifhp;
+  int ifwp = param->ifwp;
+  int ofhp = param->ofhp;
+  int ofwp = param->ofwp;
+  int ifh = param->ifh;
+  int ifw = param->ifw;
+  int ofh = param->ofh;
+  int ofw = param->ofw;
+  int pad_h = param->pad_h;
+  int pad_w = param->pad_w;
+  int pad_h_in = param->pad_h_in;
+  int pad_w_in = param->pad_w_in;
   int pad_h_out = param->pad_h_out;
   int pad_w_out = param->pad_w_out;
-  int kh        = param->kh;
-  int kw        = param->kw;
-  int stride_h  = param->stride_h;
-  int stride_w  = param->stride_w;
+  int kh = param->kh;
+  int kw = param->kw;
+  int stride_h = param->stride_h;
+  int stride_w = param->stride_w;
   /* loop counters */
   int img, ofm, ifm, oj, oi, ij, ii, kj, ki;
 
-  LIBXSMM_VLA_DECL(4,       float, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp);
-  LIBXSMM_VLA_DECL(4, const float,  input_t,  input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
+  LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out),
+                   nOfm, ofhp, ofwp);
+  LIBXSMM_VLA_DECL(4, const float, input_t,
+                   input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
   LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw);
 
   for (img = 0; img < nImg; ++img) {
@@ -151,12 +147,15 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float
           for (oi = 0; oi < ofw; ++oi) {
             ii = oi * stride_w - pad_w;
             for (kj = 0; kj < kh; ++kj) {
-              if(ij+kj < 0 || ij+kj >= ifh) continue;
+              if (ij + kj < 0 || ij + kj >= ifh) continue;
               for (ki = 0; ki < kw; ++ki) {
-                if(ii+ki < 0 || ii+ki >= ifw) continue;
-                LIBXSMM_VLA_ACCESS(  4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) +=
-                  LIBXSMM_VLA_ACCESS(4,  input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp)
-                * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw);
+                if (ii + ki < 0 || ii + ki >= ifw) continue;
+                LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp,
+                                   ofwp) +=
+                    LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki,
+                                       nIfm, ifhp, ifwp) *
+                    LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh,
+                                       kw);
               }
             }
           }
@@ -168,134 +167,118 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float
 
 void RunXsmmVsGeneric() {}
 
-
 class XsmmConv2DTest : public OpsTestBase {
  protected:
   void MakeOp(int stride) {
-
     TF_CHECK_OK(NodeDefBuilder("xsmm", "Conv2D")
-                      .Input(FakeInput(DT_FLOAT))
-                      .Input(FakeInput(DT_FLOAT))
-                      .Attr("strides", {1, stride,stride, 1})
-                      .Attr("padding", "VALID" )
-                      .Finalize(node_def()));
-
+                    .Input(FakeInput(DT_FLOAT))
+                    .Input(FakeInput(DT_FLOAT))
+                    .Attr("strides", {1, stride, stride, 1})
+                    .Attr("padding", "VALID")
+                    .Finalize(node_def()));
 
     TF_ASSERT_OK(InitOp());
   }
 };
 
 TEST_F(XsmmConv2DTest, Basic) {
-     MakeOp(1);
+  MakeOp(1);
 
-     // setup scoped allocator, which uses cpu_allocator() for this scope
-     const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
+  // setup scoped allocator, which uses cpu_allocator() for this scope
+  const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
 
-     int ifw = 14;           /* input width, "W" */
-     int ifh = 14;           /* input height, "H" */
-     int nImg = 32;          /* mini-batch size, "N" */
-     int nIfm = 64;         /* number of input feature maps, "C" */
-     int nOfm = 64;         /* number of output feature maps, "K" */
-     int kh = 3;             /* filter height, "R" */
-     int kw = 3;             /* filter width, "S" */
-     int pad = 0;            /* padding in output */
-     int stride = 1;         /* stride when accessing inputs */
+  int ifw = 14;   /* input width, "W" */
+  int ifh = 14;   /* input height, "H" */
+  int nImg = 32;  /* mini-batch size, "N" */
+  int nIfm = 64;  /* number of input feature maps, "C" */
+  int nOfm = 64;  /* number of output feature maps, "K" */
+  int kh = 3;     /* filter height, "R" */
+  int kw = 3;     /* filter width, "S" */
+  int pad = 0;    /* padding in output */
+  int stride = 1; /* stride when accessing inputs */
 
+  int stride_w = stride;
+  int stride_h = stride;
+  int pad_h = pad;
+  int pad_w = pad;
 
-     int stride_w = stride;
-     int stride_h = stride;
-     int pad_h = pad;
-     int pad_w = pad;
+  int pad_h_in = pad_h;
+  int pad_w_in = pad_w;
 
-     int pad_h_in = pad_h;
-     int pad_w_in = pad_w;
-
-     int pad_h_out = 0;
-     int pad_w_out = 0;
+  int pad_h_out = 0;
+  int pad_w_out = 0;
 
   /* deriving some values for naive code */
-     int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
-     int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
-     int ifhp = ifh + 2 * pad_h_in;
-     int ifwp = ifw + 2 * pad_w_in;
-     int ofhp = ofh + 2 * pad_h_out;
-     int ofwp = ofw + 2 * pad_w_out;
-
-
-    //Initialization of Filter and Image
-
-    /* allocate data */
-     float *naive_input           = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152);
-     float *naive_output          = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152);
-     float *naive_filter          = (float*)libxsmm_aligned_scratch( nOfm*nIfm*kh*kw*    sizeof(float), 2097152);
-     /* initialize data */
-     init_buf(naive_input,          nImg*nIfm*ifhp*ifwp, 0, 0);
-     zero_buf(naive_output,         nImg*nOfm*ofhp*ofwp);
-     init_buf(naive_filter,         nOfm*nIfm*kh*kw, 0, 0);
-
-
-     Tensor image(DT_FLOAT,
-                 {nImg, ifhp, ifwp, nIfm});
-
-
-     Tensor filter(DT_FLOAT, {kh,kw,nIfm,nOfm});
-
-
-     naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
-     naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm);
-
-
-    //Run naive convolution
-
-     naive_conv_t naive_param;
-
-     naive_param.nImg = nImg;
-     naive_param.nIfm = nIfm;
-     naive_param.nOfm = nOfm;
-     naive_param.ifhp = ifhp;
-     naive_param.ifwp = ifwp;
-     naive_param.ofhp = ofhp;
-     naive_param.ofwp = ofwp;
-     naive_param.ifh = ifh;
-     naive_param.ifw = ifw;
-     naive_param.ofh = ofh;
-     naive_param.ofw = ofw;
-     naive_param.pad_h = pad_h;
-     naive_param.pad_w = pad_w;
-     naive_param.pad_h_in = pad_h_in;
-     naive_param.pad_w_in = pad_w_in;
-     naive_param.pad_h_out = pad_h_out;
-     naive_param.pad_w_out = pad_w_out;
-     naive_param.kh = kh;
-     naive_param.kw = kw;
-     naive_param.stride_h = stride_h;
-     naive_param.stride_w = stride_w;
-
-
-     naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
-
-
-
-     AddInputFromArray<float>(image.shape(), image.flat<float>());
-     AddInputFromArray<float>(filter.shape(), filter.flat<float>());
-
-
-
-     //Run Op (TF)
-     TF_ASSERT_OK(RunOpKernel());
-
-     // Check the output.
-     Tensor expected(DT_FLOAT, {nImg,ofhp,ofwp, nOfm});
-     naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
-
-
-     test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
-     libxsmm_free(naive_input);
-     libxsmm_free(naive_output);
-     libxsmm_free(naive_filter);
-
-
-
+  int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
+  int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
+  int ifhp = ifh + 2 * pad_h_in;
+  int ifwp = ifw + 2 * pad_w_in;
+  int ofhp = ofh + 2 * pad_h_out;
+  int ofwp = ofw + 2 * pad_w_out;
+
+  // Initialization of Filter and Image
+
+  /* allocate data */
+  float* naive_input = (float*)libxsmm_aligned_scratch(
+      nImg * nIfm * ifhp * ifwp * sizeof(float), 2097152);
+  float* naive_output = (float*)libxsmm_aligned_scratch(
+      nImg * nOfm * ofhp * ofwp * sizeof(float), 2097152);
+  float* naive_filter = (float*)libxsmm_aligned_scratch(
+      nOfm * nIfm * kh * kw * sizeof(float), 2097152);
+  /* initialize data */
+  init_buf(naive_input, nImg * nIfm * ifhp * ifwp, 0, 0);
+  zero_buf(naive_output, nImg * nOfm * ofhp * ofwp);
+  init_buf(naive_filter, nOfm * nIfm * kh * kw, 0, 0);
+
+  Tensor image(DT_FLOAT, {nImg, ifhp, ifwp, nIfm});
+
+  Tensor filter(DT_FLOAT, {kh, kw, nIfm, nOfm});
+
+  naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
+  naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm);
+
+  // Run naive convolution
+
+  naive_conv_t naive_param;
+
+  naive_param.nImg = nImg;
+  naive_param.nIfm = nIfm;
+  naive_param.nOfm = nOfm;
+  naive_param.ifhp = ifhp;
+  naive_param.ifwp = ifwp;
+  naive_param.ofhp = ofhp;
+  naive_param.ofwp = ofwp;
+  naive_param.ifh = ifh;
+  naive_param.ifw = ifw;
+  naive_param.ofh = ofh;
+  naive_param.ofw = ofw;
+  naive_param.pad_h = pad_h;
+  naive_param.pad_w = pad_w;
+  naive_param.pad_h_in = pad_h_in;
+  naive_param.pad_w_in = pad_w_in;
+  naive_param.pad_h_out = pad_h_out;
+  naive_param.pad_w_out = pad_w_out;
+  naive_param.kh = kh;
+  naive_param.kw = kw;
+  naive_param.stride_h = stride_h;
+  naive_param.stride_w = stride_w;
+
+  naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
+
+  AddInputFromArray<float>(image.shape(), image.flat<float>());
+  AddInputFromArray<float>(filter.shape(), filter.flat<float>());
+
+  // Run Op (TF)
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(DT_FLOAT, {nImg, ofhp, ofwp, nOfm});
+  naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
+
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+  libxsmm_free(naive_input);
+  libxsmm_free(naive_output);
+  libxsmm_free(naive_filter);
 }
 
 /*
@@ -325,7 +308,8 @@ TEST(XsmmConv2DTest, Basic) {
     desc.threads = num_threads;
     desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
     desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
-    desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
+    desc.filter_format =
+LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
     desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
     desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 279a5876f9..fb9e8ad50c 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -977,8 +977,8 @@ REGISTER_OP("GatherNd")
       if (c->Value(r_dim) > c->Rank(params)) {
         return errors::InvalidArgument(
             "indices.shape[-1] must be <= params.rank, but saw indices shape: ",
-            c->DebugString(indices), " and params shape: ",
-            c->DebugString(params));
+            c->DebugString(indices),
+            " and params shape: ", c->DebugString(params));
       }
 
       // Remove r_dim from indices to get output.
@@ -1252,12 +1252,12 @@ REGISTER_OP("ReverseSequence")
       // Validate batch_dim and seq_dim against input.
       const int32 input_rank = c->Rank(input);
       if (batch_dim >= input_rank) {
-        return errors::InvalidArgument("batch_dim must be < input rank: ",
-                                       batch_dim, " vs. ", input_rank);
+        return errors::InvalidArgument(
+            "batch_dim must be < input rank: ", batch_dim, " vs. ", input_rank);
       }
       if (seq_dim >= input_rank) {
-        return errors::InvalidArgument("seq_dim must be < input rank: ",
-                                       seq_dim, " vs. ", input_rank);
+        return errors::InvalidArgument(
+            "seq_dim must be < input rank: ", seq_dim, " vs. ", input_rank);
       }
 
       DimensionHandle batch_dim_dim = c->Dim(input, batch_dim);
@@ -2638,8 +2638,9 @@ Status ScatterNdShape(InferenceContext* c) {
       Status s = c->Merge(prefix_indices, prefix_updates, &unused);
       if (!s.ok()) {
         return errors::InvalidArgument(
-            "The outer ", outer_dims, " dimensions of indices.shape=",
-            c->DebugString(indices_shape), " must match the outer ", outer_dims,
+            "The outer ", outer_dims,
+            " dimensions of indices.shape=", c->DebugString(indices_shape),
+            " must match the outer ", outer_dims,
             " dimensions of updates.shape=", c->DebugString(updates_shape),
             ": ", s.error_message());
       }
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index a182fd1c47..86d64635f4 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -142,8 +142,13 @@ TEST(ArrayOpsTest, Const_ShapeFn) {
 
 TEST(ArrayOpsTest, UnchangedShapes_ShapeFn) {
   for (const char* op_name : {
-           "CheckNumerics", "Identity", "RefIdentity", "QuantizeAndDequantize",
-           "StopGradient", "ZerosLike", "OnesLike",
+           "CheckNumerics",
+           "Identity",
+           "RefIdentity",
+           "QuantizeAndDequantize",
+           "StopGradient",
+           "ZerosLike",
+           "OnesLike",
        }) {
     ShapeInferenceTestOp op(op_name);
     INFER_OK(op, "?", "in0");
diff --git a/tensorflow/core/ops/candidate_sampling_ops_test.cc b/tensorflow/core/ops/candidate_sampling_ops_test.cc
index c79b443914..f367371604 100644
--- a/tensorflow/core/ops/candidate_sampling_ops_test.cc
+++ b/tensorflow/core/ops/candidate_sampling_ops_test.cc
@@ -23,9 +23,12 @@ namespace tensorflow {
 
 TEST(CandidateSamplerOpsTest, CandidateSampler_ShapeFn) {
   for (const char* op_name : {
-           "AllCandidateSampler", "FixedUnigramCandidateSampler",
-           "LearnedUnigramCandidateSampler", "LogUniformCandidateSampler",
-           "ThreadUnsafeUnigramCandidateSampler", "UniformCandidateSampler",
+           "AllCandidateSampler",
+           "FixedUnigramCandidateSampler",
+           "LearnedUnigramCandidateSampler",
+           "LogUniformCandidateSampler",
+           "ThreadUnsafeUnigramCandidateSampler",
+           "UniformCandidateSampler",
        }) {
     ShapeInferenceTestOp op(op_name);
     TF_ASSERT_OK(NodeDefBuilder("test", op.name)
diff --git a/tensorflow/core/ops/functional_grad.cc b/tensorflow/core/ops/functional_grad.cc
index 6df3536795..eeccb72da6 100644
--- a/tensorflow/core/ops/functional_grad.cc
+++ b/tensorflow/core/ops/functional_grad.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/function.h"
 #include <vector>
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index dd484c3ee7..872ebe98c1 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1172,12 +1172,12 @@ Status RangeSize(const Tensor* start_t, const Tensor* limit_t,
   T limit = limit_t->scalar<T>()();
   T delta = delta_t->scalar<T>()();
   if (start > limit && delta > 0) {
-    return errors::InvalidArgument("Requires start <= limit when delta > 0: ",
-                                   start, "/", limit);
+    return errors::InvalidArgument(
+        "Requires start <= limit when delta > 0: ", start, "/", limit);
   }
   if (start < limit && delta < 0) {
-    return errors::InvalidArgument("Requires start >= limit when delta < 0: ",
-                                   start, "/", limit);
+    return errors::InvalidArgument(
+        "Requires start >= limit when delta < 0: ", start, "/", limit);
   }
   if (delta == 0) {
     return errors::InvalidArgument("Requires delta != 0");
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 3f72b41569..62661fe4bd 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1155,9 +1155,9 @@ Status TopKShapeFn(InferenceContext* c) {
   DimensionHandle last_dim = c->Dim(input, -1);
   if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) &&
       c->Value(last_dim) < c->Value(k_dim)) {
-    return errors::InvalidArgument("input must have last dimension >= k = ",
-                                   c->Value(k_dim), " but is ",
-                                   c->Value(last_dim));
+    return errors::InvalidArgument(
+        "input must have last dimension >= k = ", c->Value(k_dim), " but is ",
+        c->Value(last_dim));
   }
 
   // Replace last_dim with k_dim.
@@ -1211,9 +1211,9 @@ REGISTER_OP("NthElement")
       DimensionHandle last_dim = c->Dim(input, -1);
       if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) &&
           c->Value(last_dim) <= c->Value(n_dim)) {
-        return errors::InvalidArgument("Input must have last dimension > n = ",
-                                       c->Value(n_dim), " but is ",
-                                       c->Value(last_dim));
+        return errors::InvalidArgument(
+            "Input must have last dimension > n = ", c->Value(n_dim),
+            " but is ", c->Value(last_dim));
       }
 
       // Reduce last_dim for output tensor
diff --git a/tensorflow/core/ops/sdca_ops.cc b/tensorflow/core/ops/sdca_ops.cc
index e67d95fa8c..4025070adb 100644
--- a/tensorflow/core/ops/sdca_ops.cc
+++ b/tensorflow/core/ops/sdca_ops.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-using shape_inference::ShapeHandle;
 using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
 
 // --------------------------------------------------------------------------
 static Status ApplySdcaOptimizerShapeFn(InferenceContext* c) {
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 8beb28de0a..e4c5bcfb54 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -137,9 +137,9 @@ REGISTER_OP("Substr")
         DimensionHandle pos_dim = c->Dim(pos_shape, i);
         DimensionHandle len_dim = c->Dim(len_shape, i);
         if (c->Value(pos_dim) != c->Value(len_dim)) {
-          return errors::InvalidArgument("pos and len shapes must match: ",
-                                         c->DebugString(pos_shape), " vs. ",
-                                         c->DebugString(len_shape));
+          return errors::InvalidArgument(
+              "pos and len shapes must match: ", c->DebugString(pos_shape),
+              " vs. ", c->DebugString(len_shape));
         }
       }
       // c->input(0) is the ShapeHandle to input strings
diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc
index de4e3cd9e7..0f309c1f4e 100644
--- a/tensorflow/core/ops/training_ops_test.cc
+++ b/tensorflow/core/ops/training_ops_test.cc
@@ -24,7 +24,7 @@ static void TestGradAndIndicesErrorHandling(const ShapeInferenceTestOp& op,
                                             string shape_spec_middle,
                                             const string& shape_spec_end = "") {
   auto shape_spec = [&shape_spec_middle, shape_spec_end](
-      const char* var_spec, const char* grad_indices_spec) {
+                        const char* var_spec, const char* grad_indices_spec) {
     return strings::StrCat(var_spec, ";", shape_spec_middle, ";",
                            grad_indices_spec, shape_spec_end);
   };
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 01b3e92d2d..a323d5bc39 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2668,6 +2668,7 @@ cuda_py_test(
         ":nn_ops_gen",
         "//third_party/py/numpy",
     ],
+    shard_count = 4,
     tags = ["no_windows"],
 )
 
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index b107670275..e3a52141a0 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -2103,6 +2103,10 @@ class Operation(object):
     logging.warning("Operation._control_inputs is private, use "
                     "Operation.control_inputs instead. "
                     "Operation._control_inputs will eventually be removed.")
+    # Copy value because it may be self._control_inputs_val (in particular if
+    # this is called from self._control_inputs += ...), and we don't want to
+    # clear value below.
+    value = copy.copy(value)
     self._remove_all_control_inputs()
     self._add_control_inputs(value)
 
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index be72c19407..bb3f6970e4 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -25,11 +25,13 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class SoftmaxTest(test.TestCase):
 
   def _npSoftmax(self, features, dim=-1, log=False):
@@ -174,8 +176,11 @@ class SoftmaxTest(test.TestCase):
 
   def testDimTooLarge(self):
     with self.test_session():
+      # Use placeholder to make sure we get runtime error instead of shape
+      # inference error.
+      dim = array_ops.placeholder_with_default(100, shape=[])
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        nn_ops.softmax([1., 2., 3., 4.], dim=100).eval()
+        nn_ops.softmax([1., 2., 3., 4.], dim=dim).eval()
 
   def testLargeDims(self):
     # Make sure that we properly handle large inputs. See
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index e5b93a54f7..7bf62d45b8 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -49,9 +49,6 @@ class Dense(base.Layer):
   and `bias` is a bias vector created by the layer
   (only if `use_bias` is `True`).
 
-  Note: if the input to the layer has a rank greater than 2, then it is
-  flattened prior to the initial matrix multiply by `kernel`.
-
   Arguments:
     units: Integer or Long, dimensionality of the output space.
     activation: Activation function (callable). Set it to None to maintain a
@@ -199,9 +196,6 @@ def dense(
   and `bias` is a bias vector created by the layer
   (only if `use_bias` is `True`).
 
-  Note: if the `inputs` tensor has a rank greater than 2, then it is
-  flattened prior to the initial matrix multiply by `kernel`.
-
   Arguments:
     inputs: Tensor input.
     units: Integer or Long, dimensionality of the output space.
@@ -230,7 +224,8 @@ def dense(
       by the same name.
 
   Returns:
-    Output tensor.
+    Output tensor the same shape as `inputs` except the last dimension is of
+    size `units`.
 
   Raises:
     ValueError: if eager execution is enabled.
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index b8e8207bb2..9a8ac93de9 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1841,12 +1841,11 @@ def reduce_logsumexp(input_tensor,
         reduce_sum(
             gen_math_ops.exp(input_tensor - my_max),
             axis,
-            keepdims=True,
-            reduction_indices=reduction_indices)) + my_max
+            keepdims=keepdims,
+            reduction_indices=reduction_indices))
     if not keepdims:
-      if isinstance(axis, int):
-        axis = [axis]
-      result = array_ops.squeeze(result, axis)
+      my_max = array_ops.reshape(my_max, array_ops.shape(result))
+    result += my_max
     return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result)
 
 
diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc
index a23d6a70ba..d1a8aae167 100644
--- a/tensorflow/stream_executor/executor_cache.cc
+++ b/tensorflow/stream_executor/executor_cache.cc
@@ -23,6 +23,14 @@ namespace gputools {
 port::StatusOr<StreamExecutor*> ExecutorCache::GetOrCreate(
     const StreamExecutorConfig& config,
     const std::function<ExecutorFactory>& factory) {
+  // In the fast path case, the cache already has an entry and we can just
+  // return after Get() which only takes a shared lock and not a unique lock.
+  // If we need to create, we take a unique lock on cache_.
+  auto fast_result = Get(config);
+  if (fast_result.ok()) {
+    return fast_result;
+  }
+
   Entry* entry = nullptr;
   {
     mutex_lock lock{mutex_};
@@ -59,12 +67,17 @@ port::StatusOr<StreamExecutor*> ExecutorCache::Get(
     const StreamExecutorConfig& config) {
   Entry* entry = nullptr;
   {
-    mutex_lock lock{mutex_};
-    entry = &cache_[config.ordinal];
-    // Release the map lock; the address of 'entry' is stable because
-    // std::map guarantees reference stability.
+    tf_shared_lock lock{mutex_};
+    auto it = cache_.find(config.ordinal);
+    if (it != cache_.end()) {
+      entry = &it->second;
+    } else {
+      return port::Status(port::error::NOT_FOUND,
+                          port::Printf("No executors registered for ordinal %d",
+                                       config.ordinal));
+    }
   }
-  mutex_lock lock{entry->configurations_mutex};
+  tf_shared_lock lock{entry->configurations_mutex};
   if (entry->configurations.empty()) {
     return port::Status(
         port::error::NOT_FOUND,
diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc
index cc32a6beaa..f23224ae77 100644
--- a/tensorflow/stream_executor/multi_platform_manager.cc
+++ b/tensorflow/stream_executor/multi_platform_manager.cc
@@ -45,7 +45,7 @@ namespace gputools {
 
 /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithName(
     const string& target) {
-  mutex_lock lock(GetPlatformsMutex());
+  tf_shared_lock lock(GetPlatformsMutex());
   auto it = GetPlatformMap()->find(port::Lowercase(target));
 
   if (it == GetPlatformMap()->end()) {
@@ -59,7 +59,7 @@ namespace gputools {
 
 /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithId(
     const Platform::Id& id) {
-  mutex_lock lock(GetPlatformsMutex());
+  tf_shared_lock lock(GetPlatformsMutex());
   auto it = GetPlatformByIdMap()->find(id);
   if (it == GetPlatformByIdMap()->end()) {
     return port::Status(
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 96324d0dea..593c654f9f 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <cmath>
 #include <memory>
+#include <unordered_map>
 
 #include "tensorflow/c/checkpoint_reader.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,9 +29,10 @@ limitations under the License.
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
-using strings::StrCat;
 using str_util::Join;
 using str_util::Split;
+using str_util::StringReplace;
+using strings::StrCat;
 
 namespace graph_transforms {
 
@@ -89,7 +91,7 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
                          string* shape_slice_string) {
   string restore_node_name;
   for (const auto& node : input_graph_def.node()) {
-    std::vector<string> node_name_parts = str_util::Split(node.name(), "/");
+    std::vector<string> node_name_parts = Split(node.name(), "/");
     if (node_name_parts.size() == 2 &&
         StringPiece(node_name_parts[0]).starts_with("save") &&
         StringPiece(node_name_parts[1]).starts_with("Assign") &&
@@ -119,13 +121,13 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
 }
 
 string GetMonolithicTensorKey(const string& tensor_slice_name) {
-  std::vector<string> names = str_util::Split(tensor_slice_name, "/");
+  std::vector<string> names = Split(tensor_slice_name, "/");
   CHECK_GE(names.size(), 2);
   CHECK(StringPiece(names[names.size() - 1]).starts_with("part_"));
 
   // Remove the "part_x" suffix
   names.pop_back();
-  return str_util::Join(names, "/");
+  return Join(names, "/");
 }
 
 Status ReadTensorFromCheckpoint(
@@ -193,6 +195,15 @@ Status SparsifyGatherInternal(
   GraphDef current_graph_def = input_graph_def;
   bool any_match_found = false;
 
+  // Populate references.
+  std::unordered_map<string, int> refs;
+  for (const auto& node : current_graph_def.node()) {
+    for (const auto& input : node.input()) {
+      auto parsed_input = StringReplace(input, "^", "", true);
+      refs[parsed_input] += 1;
+    }
+  }
+
   // The subgraphs may have overlapping components, therefore GraphMatcher
   // doesn't return all subgraphs in one round -- this has to be multi-round
   // update.
@@ -200,15 +211,15 @@ Status SparsifyGatherInternal(
     any_match_found = false;
     GraphDef replaced_graph_def = current_graph_def;
     std::vector<string> init_table_node_names;
-    std::vector<string> removed_variable_names;
+    std::vector<string> removed_node_names;
 
     TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
         current_graph_def, pattern,
         [&ckpt_reader, &any_match_found, &init_table_node_names,
-         &shapes_and_slices, &removed_variable_names](
-            const NodeMatch& match, const std::set<string>& input_nodes,
-            const std::set<string>& output_nodes,
-            std::vector<NodeDef>* new_nodes) {
+         &shapes_and_slices, &removed_node_names,
+         &refs](const NodeMatch& match, const std::set<string>& input_nodes,
+                const std::set<string>& output_nodes,
+                std::vector<NodeDef>* new_nodes) {
           any_match_found = true;
 
           // The captured subgraph should be of the following pattern:
@@ -291,8 +302,12 @@ Status SparsifyGatherInternal(
                 weights_node.name(), ckpt_reader,
                 (*shapes_and_slices)[weights_node.name()], &weight));
             // Add both both weight and identity node names.
-            removed_variable_names.push_back(weights_node.name());
-            removed_variable_names.push_back(match.inputs[0].node.name());
+            removed_node_names.push_back(weights_node.name());
+            removed_node_names.push_back(match.inputs[0].node.name());
+            for (auto input_node : match.inputs[0].node.input()) {
+              auto parsed_input = StringReplace(input_node, "^", "", true);
+              refs[parsed_input]--;
+            }
           }
           Tensor indices_tensor;
           Tensor values_tensor;
@@ -362,15 +377,23 @@ Status SparsifyGatherInternal(
 
           // Connect nodes
           AddNodeInput(hashtable_node.name(), &init_table_node);
+          refs[hashtable_node.name()]++;
           AddNodeInput(indices_node.name(), &init_table_node);
+          refs[indices_node.name()]++;
           AddNodeInput(values_node.name(), &init_table_node);
+          refs[values_node.name()]++;
 
           AddNodeInput(hashtable_node.name(), &lookup_node);
+          refs[hashtable_node.name()]++;
           AddNodeInput(gather_node.input(1), &lookup_node);
+          refs[gather_node.input(1)]++;
           AddNodeInput(default_value_node.name(), &lookup_node);
+          refs[default_value_node.name()]++;
 
           AddNodeInput(lookup_node.name(), &expand_dims_node);
+          refs[lookup_node.name()]++;
           AddNodeInput(dim_idx_node.name(), &expand_dims_node);
+          refs[dim_idx_node.name()]++;
 
           // Copy 'ids' input of original 'Gather'
           new_nodes->push_back(match.inputs[1].node);
@@ -404,22 +427,44 @@ Status SparsifyGatherInternal(
     for (const string& name : init_table_node_names) {
       // Add control dependence from init_table_node to group_deps_node
       AddNodeInput(StrCat("^", name), init_op);
+      refs[name]++;
+    }
+
+    // Erase inputs and outputs as they are not considered for deletion.
+    for (const auto& output : context.output_names) {
+      refs.erase(output);
+    }
+
+    for (const auto& input : context.input_names) {
+      refs.erase(input);
     }
 
-    // Remove all dependencies associated with removed variables.
-    while (!removed_variable_names.empty()) {
-      auto name = removed_variable_names.back();
-      removed_variable_names.pop_back();
+    // Add nodes with a reference count of 0 for deletion.
+    for (auto entry : refs) {
+      if (entry.second == 0) {
+        removed_node_names.push_back(entry.first);
+      }
+    }
+
+    while (!removed_node_names.empty()) {
+      auto name = removed_node_names.back();
+      removed_node_names.pop_back();
+
       int i = 0;
       while (i < replaced_graph_def.node_size()) {
-        if (!replaced_graph_def.node(i).input_size()) {
-          if (replaced_graph_def.node(i).name() == name) {
-            replaced_graph_def.mutable_node()->SwapElements(
-                i, replaced_graph_def.node_size() - 1);
-            replaced_graph_def.mutable_node()->RemoveLast();
-            continue;
+        // Revisit this to see if we can safely remove RestoreV2 nodes.
+        if ((replaced_graph_def.node(i).name() == name) &&
+            (replaced_graph_def.node(i).op() != "RestoreV2")) {
+          for (const auto& input : replaced_graph_def.node(i).input()) {
+            auto parsed_input = StringReplace(input, "^", "", true);
+            refs[parsed_input] -= 1;
+            if (refs[parsed_input] == 0) {
+              removed_node_names.push_back(parsed_input);
+            }
           }
-          i++;
+          replaced_graph_def.mutable_node()->SwapElements(
+              i, replaced_graph_def.node_size() - 1);
+          replaced_graph_def.mutable_node()->RemoveLast();
           continue;
         }
         int j = 0;
@@ -433,18 +478,16 @@ Status SparsifyGatherInternal(
           }
           j++;
         }
-        if ((replaced_graph_def.node(i).input_size() == 0) ||
-            (replaced_graph_def.node(i).op() == "Assign" &&
-             replaced_graph_def.node(i).input_size() == 1)) {
-          removed_variable_names.push_back(replaced_graph_def.node(i).name());
-          if (replaced_graph_def.node(i).input_size() == 1) {
-            removed_variable_names.push_back(
-                replaced_graph_def.node(i).input(0));
+        if (!replaced_graph_def.node(i).input_size()) {
+          if ((refs.find(replaced_graph_def.node(i).name()) != refs.end()) &&
+              (refs[replaced_graph_def.node(i).name()] == 0)) {
+            removed_node_names.push_back(replaced_graph_def.node(i).name());
           }
-          replaced_graph_def.mutable_node()->SwapElements(
-              i, replaced_graph_def.node_size() - 1);
-          replaced_graph_def.mutable_node()->RemoveLast();
-          continue;
+        }
+
+        if (replaced_graph_def.node(i).op() == "Assign" &&
+            replaced_graph_def.node(i).input_size() == 1) {
+          removed_node_names.push_back(replaced_graph_def.node(i).name());
         }
         i++;
       }
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
index 000568a0cc..6627df1331 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
@@ -80,6 +80,8 @@ class SparsifyGatherTest : public ::testing::Test {
     // Build the graph.
     NodeDef* input_node = CreateNode("ids", "Const", {}, &graph_def);
     NodeDef* w_node;
+    NodeDef* zeros_const;
+    NodeDef* zeros_shape;
     NodeDef* zeros_node;
     NodeDef* assign_node;
 
@@ -92,8 +94,12 @@ class SparsifyGatherTest : public ::testing::Test {
     } else {
       w_node = CreateNode("w/part_1", "VariableV2", {}, &graph_def);
 
-      zeros_node =
-          CreateNode("w/part_1/Initializer/zeros", "Const", {}, &graph_def);
+      zeros_shape = CreateNode("w/part_1/Initializer/zeros/shape_as_tensor",
+                               "Const", {}, &graph_def);
+      zeros_const = CreateNode("w/part_1/Initializer/zeros/Const", "Const", {},
+                               &graph_def);
+      zeros_node = CreateNode("w/part_1/Initializer/zeros", "Fill",
+                              {zeros_shape, zeros_const}, &graph_def);
       assign_node = CreateNode("w/part_1/Assign", "Assign",
                                {w_node, zeros_node}, &graph_def);
 
@@ -151,6 +157,9 @@ class SparsifyGatherTest : public ::testing::Test {
     MapNamesToNodes(result, &node_lookup);
 
     // Check nodes.
+    EXPECT_EQ(0,
+              node_lookup.count("w/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w/part_1/Assign"));
 
@@ -247,7 +256,11 @@ class SparsifyGatherTest : public ::testing::Test {
     // Two partitions
     NodeDef* w_node1;
     NodeDef* w_node2;
+    NodeDef* zeros_const1;
+    NodeDef* zeros_shape1;
     NodeDef* zeros_node1;
+    NodeDef* zeros_const2;
+    NodeDef* zeros_shape2;
     NodeDef* zeros_node2;
     NodeDef* assign_node1;
     NodeDef* assign_node2;
@@ -261,8 +274,13 @@ class SparsifyGatherTest : public ::testing::Test {
       SetNodeTensorAttr<float>("value", weights, w_node2);
     } else {
       w_node1 = CreateNode("w1/part_1", "VariableV2", {}, &graph_def);
-      zeros_node1 =
-          CreateNode("w1/part_1/Initializer/zeros", "Const", {}, &graph_def);
+
+      zeros_shape1 = CreateNode("w1/part_1/Initializer/zeros/shape_as_tensor",
+                                "Const", {}, &graph_def);
+      zeros_const1 = CreateNode("w1/part_1/Initializer/zeros/Const", "Const",
+                                {}, &graph_def);
+      zeros_node1 = CreateNode("w1/part_1/Initializer/zeros", "Fill",
+                               {zeros_shape1, zeros_const1}, &graph_def);
       assign_node1 = CreateNode("w1/part_1/Assign", "Assign",
                                 {w_node1, zeros_node1}, &graph_def);
 
@@ -285,8 +303,12 @@ class SparsifyGatherTest : public ::testing::Test {
       CreateNode("save/Assign", "Assign", {w_node1, restore_node1}, &graph_def);
 
       w_node2 = CreateNode("w2/part_1", "VariableV2", {}, &graph_def);
-      zeros_node2 =
-          CreateNode("w2/part_1/Initializer/zeros", "Const", {}, &graph_def);
+      zeros_shape2 = CreateNode("w2/part_1/Initializer/zeros/shape_as_tensor",
+                                "Const", {}, &graph_def);
+      zeros_const2 = CreateNode("w2/part_1/Initializer/zeros/Const", "Const",
+                                {}, &graph_def);
+      zeros_node2 = CreateNode("w2/part_1/Initializer/zeros", "Fill",
+                               {zeros_shape2, zeros_const2}, &graph_def);
       assign_node2 = CreateNode("w2/part_1/Assign", "Assign",
                                 {w_node2, zeros_node2}, &graph_def);
 
@@ -350,8 +372,14 @@ class SparsifyGatherTest : public ::testing::Test {
     MapNamesToNodes(result, &node_lookup);
 
     // Check nodes.
+    EXPECT_EQ(0,
+              node_lookup.count("w1/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w1/part_1/Assign"));
+    EXPECT_EQ(0,
+              node_lookup.count("w2/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w2/part_1/Assign"));
     EXPECT_EQ(1, node_lookup.count("ids"));
-- 
GitLab


From 1bd5b2e6fada5334b04e2db87cf246d1a83fa533 Mon Sep 17 00:00:00 2001
From: Adam Roberts <adarob@google.com>
Date: Fri, 26 Jan 2018 13:43:47 -0800
Subject: [PATCH 265/966] Automated g4 rollback of changelist 183321394

PiperOrigin-RevId: 183438398
---
 .../contrib/seq2seq/python/ops/helper.py      | 36 -------------------
 1 file changed, 36 deletions(-)

diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index 6d8f786223..ef3722ee41 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -72,14 +72,6 @@ class Helper(object):
     """
     raise NotImplementedError("batch_size has not been implemented")
 
-  @abc.abstractproperty
-  def input_shape(self):
-    """Shape of each input element in batch.
-
-    Returns a `TensorShape`.
-    """
-    raise NotImplementedError("input_shape has not been implemented")
-
   @abc.abstractproperty
   def sample_ids_shape(self):
     """Shape of tensor returned by `sample`, excluding the batch dimension.
@@ -135,7 +127,6 @@ class CustomHelper(Helper):
     self._sample_fn = sample_fn
     self._next_inputs_fn = next_inputs_fn
     self._batch_size = None
-    self._input_shape = None
     self._sample_ids_shape = tensor_shape.TensorShape(sample_ids_shape or [])
     self._sample_ids_dtype = sample_ids_dtype or dtypes.int32
 
@@ -158,8 +149,6 @@ class CustomHelper(Helper):
       (finished, next_inputs) = self._initialize_fn()
       if self._batch_size is None:
         self._batch_size = array_ops.size(finished)
-      if self._input_shape is None:
-        self._input_shape = next_inputs.shape[1:]
     return (finished, next_inputs)
 
   def sample(self, time, outputs, state, name=None):
@@ -195,7 +184,6 @@ class TrainingHelper(Helper):
     """
     with ops.name_scope(name, "TrainingHelper", [inputs, sequence_length]):
       inputs = ops.convert_to_tensor(inputs, name="inputs")
-      self._inputs = inputs
       if not time_major:
         inputs = nest.map_structure(_transpose_batch_time, inputs)
 
@@ -211,16 +199,11 @@ class TrainingHelper(Helper):
           lambda inp: array_ops.zeros_like(inp[0, :]), inputs)
 
       self._batch_size = array_ops.size(sequence_length)
-      self._input_shape = inputs.shape[2:]
 
   @property
   def batch_size(self):
     return self._batch_size
 
-  @property
-  def input_shape(self):
-    return self._input_shape
-
   @property
   def sample_ids_shape(self):
     return tensor_shape.TensorShape([])
@@ -229,14 +212,6 @@ class TrainingHelper(Helper):
   def sample_ids_dtype(self):
     return dtypes.int32
 
-  @property
-  def inputs(self):
-    return self._inputs
-
-  @property
-  def sequence_length(self):
-    return self._sequence_length
-
   def initialize(self, name=None):
     with ops.name_scope(name, "TrainingHelperInitialize"):
       finished = math_ops.equal(0, self._sequence_length)
@@ -541,16 +516,11 @@ class GreedyEmbeddingHelper(Helper):
     if self._end_token.get_shape().ndims != 0:
       raise ValueError("end_token must be a scalar")
     self._start_inputs = self._embedding_fn(self._start_tokens)
-    self._input_shape = self._start_inputs.shape[1:]
 
   @property
   def batch_size(self):
     return self._batch_size
 
-  @property
-  def input_shape(self):
-    return self._input_shape
-
   @property
   def sample_ids_shape(self):
     return tensor_shape.TensorShape([])
@@ -662,8 +632,6 @@ class InferenceHelper(Helper):
     self._sample_dtype = sample_dtype
     self._next_inputs_fn = next_inputs_fn
     self._batch_size = array_ops.shape(start_inputs)[0]
-    self._input_shape = start_inputs.shape[1:]
-
     self._start_inputs = ops.convert_to_tensor(
         start_inputs, name="start_inputs")
 
@@ -671,10 +639,6 @@ class InferenceHelper(Helper):
   def batch_size(self):
     return self._batch_size
 
-  @property
-  def input_shape(self):
-    return self._input_shape
-
   @property
   def sample_ids_shape(self):
     return self._sample_shape
-- 
GitLab


From ea25bf9558a79747d82220e493d4347901853976 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Fri, 26 Jan 2018 14:02:33 -0800
Subject: [PATCH 266/966] Add op level memory usage estimation to the
 op_level_cost_estimator

PiperOrigin-RevId: 183441321
---
 .../costs/analytical_cost_estimator_test.cc   |  2 +-
 .../core/grappler/costs/cost_estimator.h      |  6 ++
 .../grappler/costs/op_level_cost_estimator.cc | 58 ++++++++++++++-----
 .../grappler/costs/op_level_cost_estimator.h  |  2 +
 4 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index 1c2c171383..f241922471 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -102,7 +102,7 @@ TEST_F(AnalyticalCostEstimatorTest, SimpleTest) {
   Costs summary;
   TF_ASSERT_OK(estimator.PredictCosts(item.graph, &cost_graph, &summary));
 
-  EXPECT_EQ(Costs::NanoSeconds(9150), summary.execution_time);
+  EXPECT_EQ(Costs::NanoSeconds(9151), summary.execution_time);
 
   // Make this estimate accurate:
   // TODO(http://b/70031255): Accurate estimator for RandomUniform op needed
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index d442861339..9e01ec5ff5 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -100,6 +100,8 @@ struct Costs {
   // requirements of a graph. For example, it might assume that all activations
   // are live for all of a graph's execution.
   int64 max_memory;  // Maximum main memory requirement in bytes over all ops.
+  int64 persistent_memory;
+  int64 temporary_memory;
 
   // These fields are used for TPU-related estimations. They are per-op
   // maximums, so each op is evaluated independently, but we want the maximum of
@@ -132,6 +134,8 @@ Costs::Costs() {
   compute_time = Duration::zero();
   memory_time = Duration::zero();
   max_memory = kMemoryUnknown;
+  persistent_memory = kMemoryUnknown;
+  temporary_memory = kMemoryUnknown;
   max_per_op_buffers = kMemoryUnknown;
   max_per_op_streaming = kMemoryUnknown;
 }
@@ -142,6 +146,8 @@ Costs Costs::ZeroCosts() {
   costs.compute_time = Duration::zero();
   costs.memory_time = Duration::zero();
   costs.max_memory = kZeroMemory;
+  costs.persistent_memory = kZeroMemory;
+  costs.temporary_memory = kZeroMemory;
   costs.max_per_op_buffers = kZeroMemory;
   costs.max_per_op_streaming = kZeroMemory;
   return costs;
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 6bc136a3f8..cf317374cf 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -47,6 +47,8 @@ constexpr char kSize[] = "Size";
 constexpr char kStopGradient[] = "StopGradient";
 constexpr char kPreventGradient[] = "PreventGradient";
 
+static const Costs::Duration kMinComputeTime(1);
+
 namespace {
 
 string GetDataFormat(const OpInfo& op_features) {
@@ -163,18 +165,20 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
 
-      {kPlaceholder, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kRefIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kStopGradient, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kPreventGradient, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kReshape, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kRecv, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kSend, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kConst, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kVariable, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kVariableV2, wrap(&OpLevelCostEstimator::PredictNoOp)},
+
+      {kPlaceholder, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kRefIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kStopGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kPreventGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kReshape, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kRecv, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kSend, wrap(&OpLevelCostEstimator::PredictIdentity)},
+
+      {kConst, wrap(&OpLevelCostEstimator::PredictVariable)},
+      {kVariable, wrap(&OpLevelCostEstimator::PredictVariable)},
+      {kVariableV2, wrap(&OpLevelCostEstimator::PredictVariable)},
 
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
@@ -429,6 +433,7 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
     costs.execution_time = compute_cost + memory_cost;
   }
   costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
   return costs;
 }
 
@@ -885,6 +890,30 @@ Costs OpLevelCostEstimator::PredictNoOp(const OpContext& op_context) const {
   return Costs::ZeroCosts();
 }
 
+Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const {
+  const auto& op_features = op_context.op_info;
+  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  Costs result = Costs::ZeroCosts();
+  result.max_memory = CalculateOutputSize(op_features, &result.inaccurate);
+  // Assign the minimum amount of time we can represent to the identity op since
+  // it tends to be really cheap.
+  result.compute_time = kMinComputeTime;
+  result.execution_time = result.compute_time;
+  return result;
+}
+
+Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const {
+  const auto& op_features = op_context.op_info;
+  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  Costs result = Costs::ZeroCosts();
+  result.persistent_memory =
+      CalculateOutputSize(op_features, &result.inaccurate);
+
+  result.compute_time = kMinComputeTime;
+  result.execution_time = result.execution_time;
+  return result;
+}
+
 Costs OpLevelCostEstimator::PredictBatchMatMul(
     const OpContext& op_context) const {
   const auto& op_features = op_context.op_info;
@@ -898,13 +927,12 @@ Costs OpLevelCostEstimator::PredictBatchMatMul(
 
 Costs OpLevelCostEstimator::PredictMetadata(const OpContext& op_context) const {
   const auto& op_features = op_context.op_info;
-  Costs costs;
+  Costs costs = Costs::ZeroCosts();
   costs.max_memory = CalculateOutputSize(op_features, &costs.inaccurate);
   // Metadata operations are so cheap we assume they take the minimum amount of
   // time we can represent (1 ns).
-  costs.execution_time = 1;
-  costs.compute_time = 1;
-  costs.memory_time = 0;
+  costs.compute_time = kMinComputeTime;
+  costs.execution_time = costs.compute_time;
 
   return costs;
 }
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 5f541ccf04..a292e5e97f 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -132,6 +132,8 @@ class OpLevelCostEstimator {
   Costs PredictConv2DBackpropFilter(const OpContext& op_context) const;
   Costs PredictMatMul(const OpContext& op_context) const;
   Costs PredictNoOp(const OpContext& op_context) const;
+  Costs PredictIdentity(const OpContext& op_context) const;
+  Costs PredictVariable(const OpContext& op_context) const;
   Costs PredictBatchMatMul(const OpContext& op_context) const;
   Costs PredictMetadata(const OpContext& op_context) const;
 
-- 
GitLab


From 7c41a89e22b6bd895082a30e6f2847ae56c5db31 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 26 Jan 2018 14:14:34 -0800
Subject: [PATCH 267/966] Fix build error with GCC 7.2.1 on AWS Linux 2
 (#16470)

This fix fixes a build failure when compiling with
GCC 7.2.1 on AWS Linux 2:
```
gcc version 7.2.1 20170915 (Red Hat 7.2.1-2) (GCC)
```

The eror output was:
```
...
./tensorflow/contrib/lite/toco/model.h:1567:25: error: 'std::function' has not been declared
   void EraseArrays(std::function<bool(const string&)> discardable) {
  .....
```

This fix is related to 16046.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/lite/toco/model.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index d1af371fd4..6fba8f2629 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
 #define TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
 
+#include <functional>
 #include <initializer_list>
 #include <memory>
 #include <string>
-- 
GitLab


From 1cbd7bcd4caf0925efa0fad62c1167400ba4ab36 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 14:14:30 -0800
Subject: [PATCH 268/966] Delete mkl_tfconv_op.cc which seem to be a duplicate
 of mkl_tfconv_op.h, and did not exist in the external github TF repository.

PiperOrigin-RevId: 183443347
---
 tensorflow/core/kernels/mkl_tfconv_op.cc | 124 -----------------------
 1 file changed, 124 deletions(-)
 delete mode 100644 tensorflow/core/kernels/mkl_tfconv_op.cc

diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
deleted file mode 100644
index c35f857cfe..0000000000
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef INTEL_MKL
-
-#include <algorithm>
-#include <vector>
-#include "tensorflow/core/framework/numeric_op.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/util/tensor_format.h"
-
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
-
-namespace tensorflow {
-typedef Eigen::ThreadPoolDevice CPUDevice;
-
-///////////////////////////////////////////////////////////
-//               Op kernel
-///////////////////////////////////////////////////////////
-
-template <typename Device, typename T>
-class MklToTfOp : public OpKernel {
- public:
-  explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
-    has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
-  }
-
-  void Compute(OpKernelContext* context) override {
-    // Check that input tensor is in MKL format.
-    const Tensor& input_tensor = MklGetInput(context, 0);
-    MklShape input_shape;
-    GetMklShape(context, 0, &input_shape);
-
-    // if input is already in Tf format, then just copy input tensor to output.
-    if (!input_shape.IsMklTensor()) {
-      context->set_output(0, input_tensor);
-      VLOG(1) << "MKLToTFConversion: No conversion needed, "
-              << "copying input to output";
-      return;
-    }
-
-    // Check that input data type is same as operator data type and that it is
-    // same as output data type.
-    DataType input_data_type = input_type(0);
-    DataType output_data_type = output_type(0);
-    CHECK_EQ(op_data_type, input_data_type);
-    CHECK_EQ(op_data_type, output_data_type);
-
-    TensorShape output_shape;
-    size_t ndims = input_shape.GetDimension();
-    size_t* in_sizes = new size_t[ndims];
-    for (size_t i = 0; i < ndims; i++) {
-      // Outermost to innermost dimension
-      output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]);
-      in_sizes[i] = input_shape.GetSizes()[i];
-    }
-
-    // Allocate output tensor.
-    Tensor* output_tensor = NULL;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, output_shape, &output_tensor));
-
-    dnnLayout_t output_layout =
-        static_cast<dnnLayout_t>(input_shape.GetTfLayout());
-    // Execute DNNConversion.
-    void* input_buffer =
-        static_cast<void*>(const_cast<T*>(input_tensor.flat<T>().data()));
-    delete[] in_sizes;
-    void* output_buffer =
-        static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
-    input_shape.GetConvertedFlatData(output_layout, input_buffer,
-                                     output_buffer);
-    VLOG(1) << "MKLToTFConversion complete successfully.";
-  }
-
- private:
-  /// Data format of the operation
-  string data_format_str;
-
-  /// Data type of the operation
-  DataType op_data_type;
-
-  /// CPUIDInfo
-  bool has_avx512f_ = false;
-};
-
-///////////////////////////////////////////////////////////
-//               Register kernel
-///////////////////////////////////////////////////////////
-
-#define REGISTER_CPU(T)                                             \
-  REGISTER_KERNEL_BUILDER(Name("_MklToTf")                          \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklToTfOp<CPUDevice, T>);
-
-TF_CALL_float(REGISTER_CPU);
-#undef REGISTER_CPU
-}  // namespace tensorflow
-#endif /* INTEL_MKL */
-- 
GitLab


From 84d7f94efd1489b939a4672b0f47d6aa66d9eb91 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 14:16:19 -0800
Subject: [PATCH 269/966] To add __init__.py to some paths that are imported by
 other modules.

PiperOrigin-RevId: 183443656
---
 .../boosted_trees/python/training/__init__.py  | 18 ++++++++++++++++++
 .../python/training/functions/__init__.py      | 18 ++++++++++++++++++
 .../boosted_trees/python/utils/__init__.py     | 18 ++++++++++++++++++
 3 files changed, 54 insertions(+)
 create mode 100644 tensorflow/contrib/boosted_trees/python/training/__init__.py
 create mode 100644 tensorflow/contrib/boosted_trees/python/training/functions/__init__.py
 create mode 100644 tensorflow/contrib/boosted_trees/python/utils/__init__.py

diff --git a/tensorflow/contrib/boosted_trees/python/training/__init__.py b/tensorflow/contrib/boosted_trees/python/training/__init__.py
new file mode 100644
index 0000000000..b569ac5fdb
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/training/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""training module under boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py b/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py
new file mode 100644
index 0000000000..c1750117cd
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""functions module under boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/boosted_trees/python/utils/__init__.py b/tensorflow/contrib/boosted_trees/python/utils/__init__.py
new file mode 100644
index 0000000000..6ceb150c26
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/utils/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""utils module under boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
-- 
GitLab


From 887509ffe25387efd4c869ffbe46b47ba6049860 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Fri, 26 Jan 2018 14:32:45 -0800
Subject: [PATCH 270/966] tfe.metrics.{Mean,Accuracy} return their inputs.

This makes chaining them easier. Control dependencies to ensure updates
happen are implicitly added by the function code.

PiperOrigin-RevId: 183446211
---
 tensorflow/contrib/eager/python/metrics_impl.py | 12 ++++++++++++
 tensorflow/contrib/eager/python/metrics_test.py | 13 +++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index bf029ca5f9..ea8dbf2b46 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -291,6 +291,9 @@ class Mean(Metric):
     Args:
       values: Tensor with the per-example value.
       weights: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      The arguments, for easy chaining.
     """
     if weights is None:
       self.denom.assign_add(
@@ -302,6 +305,9 @@ class Mean(Metric):
       self.denom.assign_add(math_ops.reduce_sum(weights))
       values = math_ops.cast(values, self.dtype) * weights
       self.numer.assign_add(math_ops.reduce_sum(values))
+    if weights is None:
+      return values
+    return values, weights
 
   def result(self):
     t = self.numer / self.denom
@@ -329,7 +335,13 @@ class Accuracy(Mean):
         per element of the Tensor.
       predictions: Tensor with the predicted label for each example.
       weights: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      The arguments, for easy chaining.
     """
     matches = math_ops.equal(labels, predictions)
     matches = math_ops.cast(matches, dtypes.float64)
     super(Accuracy, self).call(matches, weights=weights)
+    if weights is None:
+      return labels, predictions
+    return labels, predictions, weights
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 9cf34fd9b2..a9ecaa3f8b 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -180,6 +180,19 @@ class MetricsTest(test.TestCase):
         m2 = metrics.Mean()
         m2(2)
 
+  def testMetricsChain(self):
+    with context.graph_mode(), self.test_session():
+      m1 = metrics.Mean()
+      m2 = metrics.Mean(name="m2")
+      update_m2 = m2(3.0)
+      update_m2_2 = m2(m1(1.0))
+      m1.init_variables().run()
+      m2.init_variables().run()
+      update_m2.eval()
+      update_m2_2.eval()
+      self.assertAllEqual(m2.result().eval(), 2.0)
+      self.assertAllEqual(m1.result().eval(), 1.0)
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From b45c09ce715fb1d4d0a0f7e09586bd532031049e Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Fri, 26 Jan 2018 14:35:04 -0800
Subject: [PATCH 271/966] Improvements to eager linear regression benchmark:  
 1. Using _shape_tuple   2. Bypassing * over math_ops.mul etc   3. Flatmaps in
 the tape code   4. Cache for ones similar to for zeros   5. Fast path for
 _SubGrad   6. Fast global_step += 1 for resource variables   7. Bypassing
 deprecated args decorator in eager mode

PiperOrigin-RevId: 183446593
---
 tensorflow/c/eager/tape.h               | 51 ++++++++++++-------------
 tensorflow/python/eager/backprop.py     |  8 ++--
 tensorflow/python/ops/math_grad.py      | 26 ++++++++-----
 tensorflow/python/training/optimizer.py | 10 ++++-
 tensorflow/python/util/deprecation.py   |  5 ++-
 5 files changed, 60 insertions(+), 40 deletions(-)

diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index 2b65e38f54..bdb0815d6b 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -18,12 +18,12 @@ limitations under the License.
 // Language-agnostic gradient tape. Does not perform backpropagation, just
 // maintains the data structures required to do so.
 
-#include <unordered_map>
-#include <unordered_set>
 #include <vector>
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -54,11 +54,11 @@ struct OpTapeEntry {
 // Map from tensor_id to internally-defined operation-id of the operation which
 // produced this tensor. A value of -1 means that the tensor was directly
 // watched and not the result of any operation in the tape.
-using TensorTape = std::unordered_map<int64, int64>;
+using TensorTape = gtl::FlatMap<int64, int64>;
 
 // Map from operation-id to tape entry.
 template <typename BackwardFunction>
-using OpTape = std::unordered_map<int64, OpTapeEntry<BackwardFunction>>;
+using OpTape = gtl::FlatMap<int64, OpTapeEntry<BackwardFunction>>;
 
 // Operations the tape needs to perform on tensors to do backpropagation. Named
 // "vspace" because a subset of these are related to a vector space, such as
@@ -159,7 +159,7 @@ class GradientTape {
 
   // Map from tensor id to number of remaining usages (i.e. how many entries in
   // the tape refer to it); to aid in tape garbage collection.
-  std::unordered_map<int64, int64> tensor_usage_;
+  gtl::FlatMap<int64, int64> tensor_usage_;
 
   // If false, all activations are deleted in the first call to ComputeGradient.
   // Else, only when this is destructed.
@@ -286,11 +286,11 @@ struct BackpropInitialState {
 
   // Map from tensor ID to how many references still exist for this tensor in
   // the tape.
-  std::unordered_map<int64, int64> tensor_usage_counts;
+  gtl::FlatMap<int64, int64> tensor_usage_counts;
 
   // Maps from op ID to how many output tensors of this op still need to have
   // their gradients computed.
-  std::unordered_map<int64, int64> op_missing_tensor;
+  gtl::FlatMap<int64, int64> op_missing_tensor;
 };
 
 // If `persistent_tape` is true, op_tape is not changed and none of the
@@ -301,8 +301,8 @@ struct BackpropInitialState {
 template <typename BackwardFunction>
 BackpropInitialState<BackwardFunction> PrepareBackprop(
     gtl::ArraySlice<int64> target, const TensorTape& tensor_tape,
-    OpTape<BackwardFunction>* op_tape,
-    const std::unordered_set<int64>& sources_set, bool persistent_tape) {
+    OpTape<BackwardFunction>* op_tape, const gtl::FlatSet<int64>& sources_set,
+    bool persistent_tape) {
   std::vector<int64> tensor_stack;
   tensor_stack.reserve(target.size());
   for (auto t : target) {
@@ -362,7 +362,7 @@ BackpropInitialState<BackwardFunction> PrepareBackprop(
 template <typename BackwardFunction>
 std::vector<int64> InitialStack(
     const OpTape<BackwardFunction>& op_tape,
-    const std::unordered_map<int64, int64>& op_missing_tensor) {
+    const gtl::FlatMap<int64, int64>& op_missing_tensor) {
   std::vector<int64> result;
   for (auto& op_entry : op_tape) {
     if (op_missing_tensor.find(op_entry.first) == op_missing_tensor.end()) {
@@ -373,13 +373,13 @@ std::vector<int64> InitialStack(
 }
 
 template <typename Gradient, typename BackwardFunction>
-Status InitialGradients(
-    const VSpace<Gradient, BackwardFunction>& vspace,
-    gtl::ArraySlice<int64> target_tensor_ids,
-    gtl::ArraySlice<Gradient*> output_gradients, const TensorTape& tensor_tape,
-    const OpTape<BackwardFunction>& op_tape,
-    const std::unordered_map<int64, int64>& tensor_usage_counts,
-    std::unordered_map<int64, std::vector<Gradient*>>* result) {
+Status InitialGradients(const VSpace<Gradient, BackwardFunction>& vspace,
+                        gtl::ArraySlice<int64> target_tensor_ids,
+                        gtl::ArraySlice<Gradient*> output_gradients,
+                        const TensorTape& tensor_tape,
+                        const OpTape<BackwardFunction>& op_tape,
+                        const gtl::FlatMap<int64, int64>& tensor_usage_counts,
+                        gtl::FlatMap<int64, std::vector<Gradient*>>* result) {
   for (int i = 0; i < target_tensor_ids.size(); ++i) {
     const int64 id = target_tensor_ids[i];
     if (tensor_usage_counts.find(id) != tensor_usage_counts.end()) {
@@ -441,13 +441,13 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
     gtl::ArraySlice<int64> source_tensor_ids,
     gtl::ArraySlice<Gradient*> output_gradients,
     std::vector<Gradient*>* result) {
-  std::unordered_set<int64> sources_set(source_tensor_ids.begin(),
-                                        source_tensor_ids.end());
+  gtl::FlatSet<int64> sources_set(source_tensor_ids.begin(),
+                                  source_tensor_ids.end());
   BackpropInitialState<BackwardFunction> state = PrepareBackprop(
       target_tensor_ids, tensor_tape_, &op_tape_, sources_set, persistent_);
   std::vector<int64> op_stack =
       InitialStack(state.op_tape, state.op_missing_tensor);
-  std::unordered_map<int64, std::vector<Gradient*>> gradients;
+  gtl::FlatMap<int64, std::vector<Gradient*>> gradients;
   Status s = InitialGradients(vspace, target_tensor_ids, output_gradients,
                               tensor_tape_, state.op_tape,
                               state.tensor_usage_counts, &gradients);
@@ -463,7 +463,7 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
     cleanup();
     return s;
   }
-  std::unordered_map<int64, int64> gradients_size;
+  gtl::FlatMap<int64, int64> gradients_size;
   // TODO(apassos) multiple threads could be dequeuing from op_stack at the same
   // time, for better CPU backprop performance.
   VLOG(1) << "Initial stack:";
@@ -472,11 +472,10 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
       VLOG(1) << "  " << t;
     }
   }
-  std::unordered_map<string, std::unordered_set<int>>
-      functions_accept_none_for_indices({
-          {"SoftmaxCrossEntropyWithLogits", {1}},
-          {"FusedBatchNorm", {1, 2, 3, 4}},
-      });
+  gtl::FlatMap<string, gtl::FlatSet<int>> functions_accept_none_for_indices({
+      {"SoftmaxCrossEntropyWithLogits", {1}},
+      {"FusedBatchNorm", {1, 2, 3, 4}},
+  });
   while (!op_stack.empty()) {
     const int64 op = op_stack.back();
     VLOG(1) << "Popped " << op;
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index a2a3e230bb..d79d1fc0a6 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -734,7 +734,7 @@ def _num_elements(grad):
   raise ValueError("`grad` not a Tensor or IndexedSlices.")
 
 
-_last_shape_dtype = [None, None]
+_last_zero_shape_dtype = [None, None]
 _last_zero = [None]
 
 
@@ -748,13 +748,15 @@ def _zeros(shape, dtype):
     # TODO(apassos): need to save enough information about variant tensors to do
     # a zeros
     return None
-  if [shape, dtype] != _last_shape_dtype:
-    _last_shape_dtype[:] = [shape, dtype]
+  if [shape, dtype] != _last_zero_shape_dtype:
+    _last_zero_shape_dtype[:] = [shape, dtype]
     _last_zero[0] = _fast_fill(0, shape, dtype)
   return _last_zero[0]
 
 
 def _ones(shape, dtype):
+  if shape == ():  # pylint: disable=g-explicit-bool-comparison
+    return constant_op.constant(1, dtype=dtype)
   return _fast_fill(1, shape, dtype)
 
 
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index bca4c665d2..3cb71eba8c 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -40,15 +40,16 @@ def _SumGrad(op, grad):
   """Gradient for Sum."""
   # Fast path for when reducing to a scalar and ndims is known: adds only
   # Reshape and Tile ops (and possibly a Shape).
-  if op.inputs[0].get_shape().ndims is not None:
+  input_0_shape = op.inputs[0]._shape_tuple()  # pylint: disable=protected-access
+  if input_0_shape is not None:
     axes = tensor_util.constant_value(op.inputs[1])
     if axes is not None:
-      rank = op.inputs[0].get_shape().ndims
+      rank = len(input_0_shape)
       if np.array_equal(axes, np.arange(rank)):  # Reduce all dims.
         grad = array_ops.reshape(grad, [1] * rank)
         # If shape is not fully defined (but rank is), we use Shape.
-        if op.inputs[0].get_shape().is_fully_defined():
-          input_shape = op.inputs[0].get_shape().as_list()
+        if None not in input_0_shape:
+          input_shape = input_0_shape
         else:
           input_shape = array_ops.shape(op.inputs[0])
         return [array_ops.tile(grad, input_shape), None]
@@ -96,9 +97,12 @@ def _MinGrad(op, grad):
 def _MeanGrad(op, grad):
   """Gradient for Mean."""
   sum_grad = _SumGrad(op, grad)[0]
-  input_size = op.inputs[0].get_shape().num_elements()
-  output_size = op.outputs[0].get_shape().num_elements()
-  if input_size is not None and output_size is not None:
+  input_shape = op.inputs[0]._shape_tuple()  # pylint: disable=protected-access
+  output_shape = op.outputs[0]._shape_tuple()  # pylint: disable=protected-access
+  if (input_shape is not None and output_shape is not None and
+      None not in input_shape and None not in output_shape):
+    input_size = np.prod(input_shape)
+    output_size = np.prod(output_shape)
     factor = input_size // max(output_size, 1)
     factor = constant_op.constant(factor, dtype=sum_grad.dtype)
   else:
@@ -106,7 +110,7 @@ def _MeanGrad(op, grad):
     output_shape = array_ops.shape(op.outputs[0])
     factor = _safe_shape_div(
         math_ops.reduce_prod(input_shape), math_ops.reduce_prod(output_shape))
-  return sum_grad / math_ops.cast(factor, sum_grad.dtype), None
+  return math_ops.truediv(sum_grad, math_ops.cast(factor, sum_grad.dtype)), None
 
 
 @ops.RegisterGradient("Prod")
@@ -330,7 +334,7 @@ def _SquareGrad(op, grad):
   # Added control dependencies to prevent 2*x from being computed too early.
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
-    return grad * (2.0 * x)
+    return math_ops.multiply(grad, math_ops.multiply(x, 2.0))
 
 
 @ops.RegisterGradient("Sqrt")
@@ -756,8 +760,12 @@ def _AddGrad(op, grad):
 
 @ops.RegisterGradient("Sub")
 def _SubGrad(op, grad):
+  """Gradient for Sub."""
   x = op.inputs[0]
   y = op.inputs[1]
+  if (isinstance(grad, ops.Tensor) and
+      _ShapesFullySpecifiedAndEqual(x, y, grad)):
+    return grad, -grad
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
   # pylint: disable=protected-access
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 719b83e5ca..a06b3eada6 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -533,7 +533,15 @@ class Optimizer(object):
       else:
         with ops.control_dependencies([self._finish(update_ops, "update")]):
           with ops.colocate_with(global_step):
-            apply_updates = state_ops.assign_add(global_step, 1, name=name)
+            if isinstance(global_step, resource_variable_ops.ResourceVariable):
+              # TODO(apassos): the implicit read in assign_add is slow; consider
+              # making it less so.
+              apply_updates = resource_variable_ops.assign_add_variable_op(
+                  global_step.handle,
+                  ops.convert_to_tensor(1, dtype=global_step.dtype),
+                  name=name)
+            else:
+              apply_updates = state_ops.assign_add(global_step, 1, name=name)
 
       if context.in_graph_mode():
         if isinstance(apply_updates, ops.Tensor):
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index 8a66f0435a..2110fc64cf 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -22,6 +22,7 @@ import collections
 import functools
 import re
 
+from tensorflow.python.eager import context
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import decorator_utils
 from tensorflow.python.util import tf_contextlib
@@ -284,7 +285,9 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples,
     @functools.wraps(func)
     def new_func(*args, **kwargs):
       """Deprecation wrapper."""
-      if _PRINT_DEPRECATION_WARNINGS:
+      # TODO(apassos) figure out a way to have reasonable performance with
+      # deprecation warnings and eager mode.
+      if context.in_graph_mode() and _PRINT_DEPRECATION_WARNINGS:
         invalid_args = []
         named_args = tf_inspect.getcallargs(func, *args, **kwargs)
         for arg_name, spec in iter(deprecated_positions.items()):
-- 
GitLab


From 201d957baf20c3adabcae3f6a616430ae81b94ae Mon Sep 17 00:00:00 2001
From: Martin Wicke <wicke@google.com>
Date: Fri, 26 Jan 2018 14:46:01 -0800
Subject: [PATCH 272/966] Add a security document discussing high level best
 practices and explain vulnerability reporting process.

PiperOrigin-RevId: 183448435
---
 tensorflow/SECURITY.md                   | 239 +++++++++++++++++++++++
 tensorflow/docs_src/community/welcome.md |   4 +-
 2 files changed, 241 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/SECURITY.md

diff --git a/tensorflow/SECURITY.md b/tensorflow/SECURITY.md
new file mode 100644
index 0000000000..074eed2951
--- /dev/null
+++ b/tensorflow/SECURITY.md
@@ -0,0 +1,239 @@
+# Using TensorFlow Securely
+
+This document discusses how to safely deal with untrusted programs (models or
+model parameters), and input data. Below, we also provide guidelines on how to
+report vulnerabilities in TensorFlow.
+
+## TensorFlow models are programs
+
+TensorFlow's runtime system interprets and executes programs. What machine 
+learning practitioners term
+[**models**](https://developers.google.com/machine-learning/glossary/#model) are
+expressed as programs that TensorFlow executes.  TensorFlow programs are encoded
+as computation
+[**graphs**](https://developers.google.com/machine-learning/glossary/#graph).
+The model's parameters are often stored separately in **checkpoints**.
+
+At runtime, TensorFlow executes the computation graph using the parameters
+provided. Note that the behavior of the computation graph may change
+depending on the parameters provided. TensorFlow itself is not a sandbox. When
+executing the computation graph, TensorFlow may read and write files, send and
+receive data over the network, and even spawn additional processes. All these
+tasks are performed with the permissions of the TensorFlow process. Allowing
+for this flexibility makes for a powerful machine learning platform,
+but it has implications for security.
+
+The computation graph may also accept **inputs**. Those inputs are the
+data you supply to TensorFlow to train a model, or to use a model to run
+inference on the data.
+
+**TensorFlow models are programs, and need to be treated as such from a security
+perspective.** 
+
+## Running untrusted models
+
+As a general rule: **Always** execute untrusted models inside a sandbox (e.g.,
+[nsjail](https://github.com/google/nsjail)). 
+
+There are several ways in which a model could become untrusted. Obviously, if an
+untrusted party supplies TensorFlow kernels, arbitrary code may be executed.
+The same is true if the untrusted party provides Python code, such as the
+Python code that generates TensorFlow graphs.
+
+Even if the untrusted party only supplies the serialized computation
+graph (in form of a `GraphDef`, `SavedModel`, or equivalent on-disk format), the
+set of computation primitives available to TensorFlow is powerful enough that
+you should assume that the TensorFlow process effectively executes arbitrary
+code. One common solution is to whitelist only a few safe Ops. While this is
+possible in theory, we still recommend you sandbox the execution.
+
+It depends on the computation graph whether a user provided checkpoint is safe.
+It is easily possible to create computation graphs in which malicious
+checkpoints can trigger unsafe behavior. For example, consider a graph that
+contains a `tf.cond` depending on the value of a `tf.Variable`. One branch of
+the `tf.cond` is harmless, but the other is unsafe. Since the `tf.Variable` is
+stored in the checkpoint, whoever provides the checkpoint now has the ability to
+trigger unsafe behavior, even though the graph is not under their control.
+
+In other words, graphs can contain vulnerabilities of their own. To allow users
+to provide checkpoints to a model you run on their behalf (e.g., in order to
+compare model quality for a fixed model architecture), you must carefully audit
+your model, and we recommend you run the TensorFlow process in a sandbox.
+
+## Accepting untrusted Inputs
+
+It is possible to write models that are secure in a sense that they can safely
+process untrusted inputs assuming there are no bugs. There are two main reasons
+to not rely on this: first, it is easy to write models which must not be exposed
+to untrusted inputs, and second, there are bugs in any software system of
+sufficient complexity. Letting users control inputs could allow them to trigger
+bugs either in TensorFlow or in dependent libraries.
+
+In general, it is good practice to isolate parts of any system which is exposed
+to untrusted (e.g., user-provided) inputs in a sandbox.
+
+A useful analogy to how any TensorFlow graph is executed is any interpreted
+programming language, such as Python. While it is possible to write secure
+Python code which can be exposed to user supplied inputs (by, e.g., carefully
+quoting and sanitizing input strings, size-checking input blobs, etc.), it is
+very easy to write Python programs which are insecure. Even secure Python code
+could be rendered insecure by a bug in the Python interpreter, or in a bug in a
+Python library used (e.g.,
+[this one](https://www.cvedetails.com/cve/CVE-2017-12852/)).
+
+## Running a TensorFlow server
+
+TensorFlow is a platform for distributed computing, and as such there is a
+TensorFlow server (`tf.train.Server`). **The TensorFlow server is meant for
+internal communication only. It is not built for use in an untrusted network.**
+
+For performance reasons, the default TensorFlow server does not include any
+authorization protocol and sends messages unencrypted. It accepts connections
+from anywhere, and executes the graphs it is sent without performing any checks.
+Therefore, if you run a `tf.train.Server` in your network, anybody with
+access to the network can execute what you should consider arbitrary code with
+the privileges of the process running the `tf.train.Server`.
+
+When running distributed TensorFlow, you must isolate the network in which the
+cluster lives. Cloud providers provide instructions for setting up isolated
+networks, which are sometimes branded as "virtual private cloud." Refer to the
+instructions for
+[GCP](https://cloud.google.com/compute/docs/networks-and-firewalls) and
+[AWS](https://aws.amazon.com/vpc/)) for details.
+
+Note that `tf.train.Server` is different from the server created by
+`tensorflow/serving` (the default binary for which is called `ModelServer`).
+By default, `ModelServer` also has no built-in mechanism for authentication.
+Connecting it to an untrusted network allows anyone on this network to run the
+graphs known to the `ModelServer`. This means that an attacker may run
+graphs using untrusted inputs as described above, but they would not be able to
+execute arbitrary graphs. It is possible to safely expose a `ModelServer`
+directly to an untrusted network, **but only if the graphs it is configured to
+use have been carefully audited to be safe**. 
+
+Similar to best practices for other servers, we recommend running any
+`ModelServer` with appropriate privileges (i.e., using a separate user with
+reduced permisisons). In the spirit of defense in depth, we recommend
+authenticating requests to any TensorFlow server connected to an untrusted
+network, as well as sandboxing the server to minimize the adverse effects of
+any breach.
+
+## Vulnerabilities in TensorFlow
+
+TensorFlow is a large and complex system. It also depends on a large set of
+third party libraries (e.g., `numpy`, `libjpeg-turbo`, PNG parsers, `protobuf`).
+It is possible that TensorFlow or its dependent libraries contain
+vulnerabilities that would allow triggering unexpected or dangerous behavior
+with specially crafted inputs.
+
+### What is a vulnerability?
+
+Given TensorFlow's flexibility, it is possible to specify computation graphs
+which exhibit unexpected or unwanted behaviors. The fact that TensorFlow models
+can perform arbitrary computations means that they may read and write files,
+communicate via the network, produce deadlocks and infinite loops, or run out
+of memory. It is only when these behaviors are outside the specifications of the
+operations involved that such behavior is a vulnerability. 
+
+A `FileWriter` writing a file is not unexpected behavior and therefore is not a
+vulnerability in TensorFlow. A `MatMul` allowing arbitrary binary code execution
+**is** a vulnerability.
+
+This is more subtle from a system perspective. For example, it is easy to cause
+a TensorFlow process to try to allocate more memory than available by specifying
+a computation graph containing an ill-considered `tf.tile` operation. TensorFlow
+should exit cleanly in this case (it would raise an exception in Python, or
+return an error `Status` in C++). However, if the surrounding system is not
+expecting the possibility, such behavior could be used in a denial of service
+attack (or worse). Because TensorFlow behaves correctly, this is not a
+vulnerability in TensorFlow (although it would be a vulnerability of this
+hypothetical system).
+
+As a general rule, it is incorrect behavior for Tensorflow to access memory it
+does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to
+such behaviors constitute a vulnerability.
+
+One of the most critical parts of any system is input handling. If malicious
+input can trigger side effects or incorrect behavior, this is a bug, and likely
+a vulnerability.
+
+### Reporting vulnerabilities
+
+Please email reports about any security related issues you find to
+`security@tensorflow.org`. This mail is delivered to a small security team. Your
+email will be acknowledged within one business day, and you'll receive a more
+detailed response to your email within 7 days indicating the next steps in
+handling your report. For critical problems, you may encrypt your report (see
+below).
+
+Please use a descriptive subject line for your report email. After the initial
+reply to your report, the security team will endeavor to keep you informed of
+the progress being made towards a fix and announcement. 
+
+If you believe that an existing (public) issue is security-related, please send
+an email to `security@tensorflow.org`. The email should include the issue ID and
+a short description of why it should be handled according to this security
+policy.
+
+Once an issue is reported, TensorFlow uses the following disclosure process:
+
+* When a report is received, we confirm the issue and determine its severity.
+* If we know of specific third-party services or software based on TensorFlow
+  that require mitigation before publication, those projects will be notified.
+* An advisory is prepared (but not published) which details the problem and
+  steps for mitigation.
+* Wherever possible, fixes are prepared for the last minor release of the two
+  latest major releases, as well as the master branch. We will attempt to
+  commit these fixes as soon as possible, and as close together as
+  possible.
+* Patch releases are published for all fixed released versions, a
+  notification is sent to discuss@tensorflow.org, and the advisory is published.
+
+Past security advisories are listed below. We credit reporters for identifying
+security issues, although we keep your name confidential if you request it.
+
+#### Encryption key for `security@tensorflow.org`
+
+If your disclosure is extremely sensitive, you may choose to encrypt your
+report using the key below. Please only use this for critical security
+reports.
+
+```
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQENBFpqdzwBCADTeAHLNEe9Vm77AxhmGP+CdjlY84O6DouOCDSq00zFYdIU/7aI
+LjYwhEmDEvLnRCYeFGdIHVtW9YrVktqYE9HXVQC7nULU6U6cvkQbwHCdrjaDaylP
+aJUXkNrrxibhx9YYdy465CfusAaZ0aM+T9DpcZg98SmsSml/HAiiY4mbg/yNVdPs
+SEp/Ui4zdIBNNs6at2gGZrd4qWhdM0MqGJlehqdeUKRICE/mdedXwsWLM8AfEA0e
+OeTVhZ+EtYCypiF4fVl/NsqJ/zhBJpCx/1FBI1Uf/lu2TE4eOS1FgmIqb2j4T+jY
+e+4C8kGB405PAC0n50YpOrOs6k7fiQDjYmbNABEBAAG0LVRlbnNvckZsb3cgU2Vj
+dXJpdHkgPHNlY3VyaXR5QHRlbnNvcmZsb3cub3JnPokBTgQTAQgAOBYhBEkvXzHm
+gOJBnwP4Wxnef3wVoM2yBQJaanc8AhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheA
+AAoJEBnef3wVoM2yNlkIAICqetv33MD9W6mPAXH3eon+KJoeHQHYOuwWfYkUF6CC
+o+X2dlPqBSqMG3bFuTrrcwjr9w1V8HkNuzzOJvCm1CJVKaxMzPuXhBq5+DeT67+a
+T/wK1L2R1bF0gs7Pp40W3np8iAFEh8sgqtxXvLGJLGDZ1Lnfdprg3HciqaVAiTum
+HBFwszszZZ1wAnKJs5KVteFN7GSSng3qBcj0E0ql2nPGEqCVh+6RG/TU5C8gEsEf
+3DX768M4okmFDKTzLNBm+l08kkBFt+P43rNK8dyC4PXk7yJa93SmS/dlK6DZ16Yw
+2FS1StiZSVqygTW59rM5XNwdhKVXy2mf/RtNSr84gSi5AQ0EWmp3PAEIALInfBLR
+N6fAUGPFj+K3za3PeD0fWDijlC9f4Ety/icwWPkOBdYVBn0atzI21thPRbfuUxfe
+zr76xNNrtRRlbDSAChA1J5T86EflowcQor8dNC6fS+oHFCGeUjfEAm16P6mGTo0p
+osdG2XnnTHOOEFbEUeWOwR/zT0QRaGGknoy2pc4doWcJptqJIdTl1K8xyBieik/b
+nSoClqQdZJa4XA3H9G+F4NmoZGEguC5GGb2P9NHYAJ3MLHBHywZip8g9oojIwda+
+OCLL4UPEZ89cl0EyhXM0nIAmGn3Chdjfu3ebF0SeuToGN8E1goUs3qSE77ZdzIsR
+BzZSDFrgmZH+uP0AEQEAAYkBNgQYAQgAIBYhBEkvXzHmgOJBnwP4Wxnef3wVoM2y
+BQJaanc8AhsMAAoJEBnef3wVoM2yX4wIALcYZbQhSEzCsTl56UHofze6C3QuFQIH
+J4MIKrkTfwiHlCujv7GASGU2Vtis5YEyOoMidUVLlwnebE388MmaJYRm0fhYq6lP
+A3vnOCcczy1tbo846bRdv012zdUA+wY+mOITdOoUjAhYulUR0kiA2UdLSfYzbWwy
+7Obq96Jb/cPRxk8jKUu2rqC/KDrkFDtAtjdIHh6nbbQhFuaRuWntISZgpIJxd8Bt
+Gwi0imUVd9m9wZGuTbDGi6YTNk0GPpX5OMF5hjtM/objzTihSw9UN+65Y/oSQM81
+v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc=
+=CDME
+-----END PGP PUBLIC KEY BLOCK-----
+```
+
+### Known vulnerabilities
+
+| Type | Versions affected | Reported by | Additional Information |
+|------|:-----------------:|---------------------------------------|
+| out of bounds read| <=1.4 | @zhangbo5891001 | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
+
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md
index a3abf25507..d2d3f9edae 100644
--- a/tensorflow/docs_src/community/welcome.md
+++ b/tensorflow/docs_src/community/welcome.md
@@ -12,7 +12,6 @@ The source code for TensorFlow is on
 Before contributing to TensorFlow source code, please review the
 [Contribution guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md).
 
-
 ### Projects developed by the TensorFlow community
 
 The TensorFlow community has created many great projects around TensorFlow, including:
@@ -65,5 +64,6 @@ please read the following list carefully:
     [TensorFlow issues tracker](https://github.com/tensorflow/tensorflow/issues)
     on GitHub.  For example, use the issue tracker to request a
     new operation in TensorFlow.
-
+  * To report vulnerabilities, please follow our
+    [vulnerability disclosure guidelines](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md).
 
-- 
GitLab


From 8ce4986d26f904ccc6e7d5291883931571dc2713 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 14:54:42 -0800
Subject: [PATCH 273/966] Raise shard count to 10 for
 tensorflow/python/kernel_tests:metrics_test

The test was sometimes taking over six minutes to run in asan mode,
causing it to hit the 5 minute timeout.   Setting the shard count
to 6 was inufficient, but setting it to 10 brought the runtime
down to about 3:30 in the worst case over 100 runs.

PiperOrigin-RevId: 183449941
---
 tensorflow/python/kernel_tests/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 8c1d16c2a8..a49d6fb44a 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2821,7 +2821,7 @@ tf_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:variables",
     ],
-    shard_count = 3,
+    shard_count = 10,
     tags = ["no_windows_gpu"],
 )
 
-- 
GitLab


From b9494ce8990cab65a20f3d5110f4c2c4402342be Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 14:57:03 -0800
Subject: [PATCH 274/966] Remove dead code

PiperOrigin-RevId: 183450369
---
 .../boosted_trees/kernels/quantile_ops.cc     |  3 --
 tensorflow/core/debug/debug_io_utils.cc       |  2 --
 .../grappler/optimizers/layout_optimizer.cc   | 11 -------
 .../data/group_by_window_dataset_op.cc        |  6 ----
 .../core/platform/cloud/gcs_file_system.cc    |  4 ---
 tensorflow/python/eager/pywrap_tfe_src.cc     | 29 ++-----------------
 .../tools/graph_transforms/quantize_nodes.cc  | 16 ----------
 7 files changed, 3 insertions(+), 68 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index 88f3006407..e91232bf10 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -42,7 +42,6 @@ using boosted_trees::QuantileStreamResource;
 namespace {
 const char* const kExampleWeightsName = "example_weights";
 const char* const kMaxElementsName = "max_elements";
-const char* const kHandleName = "handle";
 const char* const kNextStampTokenName = "next_stamp_token";
 const char* const kStampTokenName = "stamp_token";
 const char* const kAreBucketsReadyName = "are_buckets_ready";
@@ -52,7 +51,6 @@ const char* const kNumSparseFeaturesName = "num_sparse_features";
 const char* const kSparseBucketsName = "sparse_buckets";
 const char* const kSparseValuesName = "sparse_values";
 const char* const kSparseIndicesName = "sparse_indices";
-const char* const kSparseStreamsStateName = "sparse_streams_state";
 const char* const kSparseSummariesName = "sparse_summaries";
 const char* const kSparseConfigName = "sparse_config";
 const char* const kSparseOutputTensorName = "sparse_quantiles";
@@ -60,7 +58,6 @@ const char* const kSparseOutputTensorName = "sparse_quantiles";
 const char* const kDenseBucketsName = "dense_buckets";
 const char* const kDenseConfigName = "dense_config";
 const char* const kDenseOutputTensorName = "dense_quantiles";
-const char* const kDenseStreamsStateName = "dense_streams_state";
 const char* const kDenseSummariesName = "dense_summaries";
 const char* const kDenseValuesName = "dense_values";
 const char* const kNumDenseFeaturesName = "num_dense_features";
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index f81445c20b..baa8c08fdf 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -574,8 +574,6 @@ Status DebugIO::CloseDebugURL(const string& debug_url) {
   }
 }
 
-static Status CloseDebugURL(const string& debug_url) { return Status::OK(); }
-
 Status DebugFileIO::DumpTensorToDir(const DebugNodeKey& debug_node_key,
                                     const Tensor& tensor,
                                     const uint64 wall_time_us,
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 735d78e7ee..433b3564fe 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -2041,17 +2041,6 @@ class DataLayoutOptimizer : GraphProcessor {
   const LayoutOptimizer::TuningConfig& config_;
 };
 
-int GetNumTranspose(const GraphDef& graph) {
-  int number = 0;
-  for (const auto& node : graph.node()) {
-    if (IsTranspose(node)) {
-      number++;
-    }
-  }
-  VLOG(1) << "Number of Transpose nodes: " << number;
-  return number;
-}
-
 int GetNumGPUs(const Cluster& cluster) {
   auto devices = cluster.GetDevices();
   int num_gpus = 0;
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index eb047e10ec..834c06bb93 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
-
 namespace {
 
 // See documentation in ../ops/dataset_ops.cc for a high-level
@@ -510,10 +509,6 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
       return Status::OK();
     }
 
-    // A resource name for the temporary window dataset that is
-    // created as the input to the reduce function.
-    static constexpr const char* kWindowResourceName = "__window_dataset";
-
     const DatasetBase* const input_;
     const NameAttrList key_func_;
     const NameAttrList reduce_func_;
@@ -537,5 +532,4 @@ REGISTER_KERNEL_BUILDER(Name("GroupByWindowDataset").Device(DEVICE_CPU),
                         GroupByWindowDatasetOp);
 
 }  // namespace
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 520720372d..91d381bd6f 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -50,7 +50,6 @@ limitations under the License.
 #endif
 
 namespace tensorflow {
-
 namespace {
 
 constexpr char kGcsUriBase[] = "https://www.googleapis.com/storage/v1/";
@@ -59,9 +58,6 @@ constexpr char kGcsUploadUriBase[] =
 constexpr char kStorageHost[] = "storage.googleapis.com";
 constexpr size_t kReadAppendableFileBufferSize = 1024 * 1024;  // In bytes.
 constexpr int kGetChildrenDefaultPageSize = 1000;
-// Initial delay before retrying a GCS upload.
-// Subsequent delays can be larger due to exponential back-off.
-constexpr uint64 kUploadRetryDelayMicros = 1000000L;
 // The HTTP response code "308 Resume Incomplete".
 constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308;
 // The environment variable that overrides the size of the readahead buffer.
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 647f03351d..836998cfdc 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -86,30 +86,6 @@ bool ParseBoolValue(const string& key, PyObject* py_value, TF_Status* status,
   return true;
 }
 
-const char* ParseProtoValue(const string& key, const char* proto_name,
-                            PyObject* py_value, size_t* size,
-                            TF_Status* status) {
-  char* output = nullptr;
-  Py_ssize_t py_size;
-  if (PyBytes_Check(py_value) &&
-      PyBytes_AsStringAndSize(py_value, &output, &py_size) >= 0) {
-    *size = static_cast<size_t>(py_size);
-    return output;
-  }
-#if PY_MAJOR_VERSION >= 3
-  if (PyUnicode_Check(py_value) &&
-      (output = PyUnicode_AsUTF8AndSize(py_value, &py_size)) != nullptr) {
-    *size = static_cast<size_t>(py_size);
-    return output;
-  }
-#endif
-  TF_SetStatus(status, TF_INVALID_ARGUMENT,
-               tensorflow::strings::StrCat("Expecting a string (serialized ",
-                                           proto_name, ") value for attr ", key)
-                   .c_str());
-  return nullptr;
-}
-
 bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
                    TF_AttrType type, TF_Status* status) {
   if (!PySequence_Check(py_list)) {
@@ -329,8 +305,9 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index,
 tensorflow::mutex exception_class_mutex(tensorflow::LINKER_INITIALIZED);
 PyObject* exception_class GUARDED_BY(exception_class_mutex) = nullptr;
 
-static tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED);
-static tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0;
+tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED);
+tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0;
+
 }  // namespace
 
 void TFE_Py_Execute(TFE_Context* ctx, const char* device_name,
diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc
index 5ccd88cfa1..a022f57926 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc
@@ -183,22 +183,6 @@ Status ExtractRangeFromParams(const TransformFuncContext& context,
   return Status::OK();
 }
 
-bool AreAttrsEqual(const NodeDef* current_node, const NodeDef* other_node) {
-  if (current_node->attr_size() != other_node->attr_size()) {
-    return false;
-  }
-  string current_serialized;
-  string other_serialized;
-  for (const auto& attr : other_node->attr()) {
-    auto iter = current_node->attr().find(attr.first);
-    if (iter == current_node->attr().end()) return false;
-    iter->second.SerializeToString(&current_serialized);
-    attr.second.SerializeToString(&other_serialized);
-    if (current_serialized != other_serialized) return false;
-  }
-  return true;
-}
-
 }  // namespace
 
 // Analyzes all the nodes in the graph to figure out which ones are duplicates
-- 
GitLab


From 3e9bf0874ed19b1f96f835c444a4b80167de4663 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Fri, 26 Jan 2018 15:01:40 -0800
Subject: [PATCH 275/966] [XLA] optimize NearComparator#ExpectLiteralsNear()

While tracking down the issue of timeouts when running THE ISOLATOR, it was
observed that NearComparator#ExpectLiteralsNear() could be optimized in the
case of matching layouts to not compute multi indexes.

In the process of tracking down timeouts in THE ISOLATOR, I had assumed that
time spent was dominated by either generating input data, executing the input
data on various backends, or comparing the data. Never assume you know where
the time is spent in a program; the profiler may surprise you.

After making that optimization and then profiling the code before and after, I
was surprised by the profile. Image the shock, horror, and disgust I
experienced when discovering that runs of THE ISOLATOR were dominated (45%) by
calls to Literal#ToString() in NearComparator#ExpectLiteralsNear() for huge
(>120 million elements) literals that failed comparisons.  No wonder passing
shards of THE ISOLATOR were fast, and failing shards were slow.

Further, computing multi indexes many times is expensive enough (18%) to show
up in profiles, so avoid calculating it until it is necessary.

The optimizations in this patch:
* Don't call Literal#ToString() on huge literals that are going to get written
  to disk anyways. The utility of printing said literal to stdout is suspect.
* Initialize NearComparator#miscompares_ to false, only update miscompares_ and
  other stats when miscompare occurs.
* Split NearComparator#ExpectLiteralsNear() into two, since we only need to log
  and update stats if an actual miscompare occurs.
* Add fast path in NearComparator#ExpectLiteralsNear() for case of matching
  layouts, being careful not to compute multi index unless mismatch actually
  occurs.

This optimized NearComparator#ExpectLiteralsNear() for the case of many element
literals, with few miscompares. For many miscompares, we cannot avoid
calculating multi indexes, but can fast path for equal layouts. For zero
miscompares, we can at least fast path in the case of matching layouts.

Before this CL, a run of THE ISOLATOR for a single literal with >120 million
elements and a few miscompares took 379s (6.3m). With this CL, the same test
case now takes 44s.

Beautiful flame graphs omitted from public commit message, regrettably.

PiperOrigin-RevId: 183451138
---
 tensorflow/compiler/xla/literal_util.h        |   1 +
 .../compiler/xla/tests/literal_test_util.cc   | 186 +++++++++++++-----
 2 files changed, 138 insertions(+), 49 deletions(-)

diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index e0196509a7..2b68b8f177 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -486,6 +486,7 @@ class Literal {
       std::vector<std::unique_ptr<Literal>> elements);
 
   // Returns a string representation of the literal value.
+  // Warning: this function can take minutes for multi-million element Literals.
   string ToString(bool print_layout = false) const;
 
   // Invokes the "per cell" callback for each element in the provided
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index f8205de702..39c07297d6 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -355,9 +355,9 @@ class NearComparator {
   // temporary files on failure. Returns true if  literals match.
   bool ExpectNear(const Literal& expected, const Literal& actual) {
     VLOG(1) << "expected:";
-    XLA_VLOG_LINES(1, expected.ToString());
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(expected));
     VLOG(1) << "actual:";
-    XLA_VLOG_LINES(1, actual.ToString());
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(actual));
 
     // If the shapes mismatch, we simply fail the expectation instead of
     // printing out data, as it's a type error rather than a value error.
@@ -377,6 +377,7 @@ class NearComparator {
     max_rel_err_ = 0.0;
     max_abs_err_ = 0.0;
     miscompares_ = Literal(ShapeUtil::ChangeElementType(actual.shape(), PRED));
+    miscompares_.PopulateWithValue(false);
     multi_index_.resize(expected.shape().dimensions_size(), 0);
 
     switch (expected.shape().element_type()) {
@@ -404,21 +405,33 @@ class NearComparator {
     if (num_miscompares_ > 0) {
       if (!VLOG_IS_ON(1)) {
         LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape())
-                  << " " << expected.ToString();
+                  << " " << TruncateHugeLiteral(expected);
         LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual.shape())
-                  << " " << actual.ToString();
+                  << " " << TruncateHugeLiteral(actual);
+        LOG(INFO) << "Dumping literals to temp files...";
+        WriteLiteralToTempFile(expected, "expected");
+        WriteLiteralToTempFile(actual, "actual");
+        WriteLiteralToTempFile(miscompares_, "miscompares");
       }
       EXPECT_TRUE(num_miscompares_ == 0)
           << "\nmax relative mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(max_rel_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), max_rel_linear_index_))
           << "\nmaximum relative error " << max_rel_err_
           << "\nmax absolute mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(max_abs_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), max_abs_linear_index_))
           << "\nmaximum absolute error " << max_abs_err_
           << "\nfirst mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(first_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), first_linear_index_))
           << "\nlast mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(last_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), last_linear_index_))
           << "\ntotal absolute error " << abs_diff_sum_
           << "\ntotal absolute error of miscompares "
           << abs_diff_miscompare_sum_ << "\ntotal relative error "
@@ -426,10 +439,6 @@ class NearComparator {
           << "\ntotal relative error of miscompares "
           << (abs_diff_miscompare_sum_ / abs_expected_miscompare_sum_)
           << "\nfailure count " << num_miscompares_;
-
-      WriteLiteralToTempFile(expected, "expected");
-      WriteLiteralToTempFile(actual, "actual");
-      WriteLiteralToTempFile(miscompares_, "miscompares");
     }
     return num_miscompares_ == 0;
   }
@@ -457,57 +466,93 @@ class NearComparator {
       return true;
     }
 
-    float abs_diff = std::abs(actual - expected);
-    float rel_err = abs_diff / std::abs(expected);
+    const float abs_diff = std::abs(actual - expected);
+    const float rel_err = abs_diff / std::abs(expected);
+    const bool nan_mismatch = NanMismatch<NativeT>(expected, actual);
+    const bool mismatch =
+        (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel));
+    return !mismatch;
+  }
+
+  // Assumes that expected vs actual fail ExpectValuesNear.
+  template <typename NativeT>
+  void UpdateAndLogMiscompares(const NativeT expected, const NativeT actual,
+                               const Shape& shape, const int64 linear_index) {
+    const float abs_diff = std::abs(actual - expected);
+    const float rel_err = abs_diff / std::abs(expected);
     abs_diff_sum_ += abs_diff;
     abs_expected_sum_ += std::abs(expected);
     if (rel_err > max_rel_err_) {
       max_rel_err_ = rel_err;
-      max_rel_multi_index_ = multi_index_;
+      max_rel_linear_index_ = linear_index;
     }
     if (abs_diff > max_abs_err_) {
       max_abs_err_ = abs_diff;
-      max_abs_multi_index_ = multi_index_;
+      max_abs_linear_index_ = linear_index;
     }
-    VLOG(10) << tensorflow::strings::Printf(
-        "index %s abs_diff %f rel_err %f",
-        LiteralTestUtil::MultiIndexAsString(multi_index_).c_str(), abs_diff,
-        rel_err);
-    bool nan_mismatch = NanMismatch<NativeT>(expected, actual);
-    bool mismatch =
-        (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel));
-    if (mismatch) {
-      abs_diff_miscompare_sum_ += abs_diff;
-      abs_expected_miscompare_sum_ += std::abs(expected);
-      const int64 kMaxFailures = 2;
-      if (num_miscompares_ < kMaxFailures) {
-        ::testing::Message msg;
-        msg << "mismatch at index "
-            << LiteralTestUtil::MultiIndexAsString(multi_index_) << " abs diff "
-            << abs_diff << " rel err " << rel_err << " failure #"
-            << num_miscompares_;
-        ExpectNear<NativeT>(expected, actual, msg);
-      } else if (num_miscompares_ == kMaxFailures) {
-        LOG(ERROR)
-            << "reached max 'loud' failure count; silently proceeding...";
-      }
-      if (num_miscompares_ == 0) {
-        first_multi_index_ = multi_index_;
-      }
-      num_miscompares_++;
-      last_multi_index_ = multi_index_;
+    if (VLOG_IS_ON(10)) {
+      VLOG(10) << tensorflow::strings::Printf(
+          "index %s abs_diff %f rel_err %f",
+          LiteralTestUtil::MultiIndexAsString(
+              IndexUtil::LinearIndexToMultidimensionalIndex(shape,
+                                                            linear_index))
+              .c_str(),
+          abs_diff, rel_err);
     }
-    return !mismatch;
+    abs_diff_miscompare_sum_ += abs_diff;
+    abs_expected_miscompare_sum_ += std::abs(expected);
+    const int64 kMaxFailures = 2;
+    if (num_miscompares_ < kMaxFailures) {
+      const auto multi_index =
+          IndexUtil::LinearIndexToMultidimensionalIndex(shape, linear_index);
+      ::testing::Message msg;
+      msg << "mismatch at index "
+          << LiteralTestUtil::MultiIndexAsString(multi_index) << " abs diff "
+          << abs_diff << " rel err " << rel_err << " failure #"
+          << num_miscompares_;
+      ExpectNear<NativeT>(expected, actual, msg);
+    } else if (num_miscompares_ == kMaxFailures) {
+      LOG(ERROR) << "reached max 'loud' failure count; silently proceeding...";
+    }
+    if (num_miscompares_ == 0) {
+      first_linear_index_ = linear_index;
+    }
+    num_miscompares_++;
+    last_linear_index_ = linear_index;
+    miscompares_.data<bool>()[linear_index] = true;
   }
 
   // Recursive function which compares the two given literals elementwise.
   template <typename NativeT>
   void ExpectLiteralsNear(const Literal& expected, const Literal& actual,
                           int64 dimension) {
+    // Fast path optimization for the case were layouts match.
+    if (LayoutUtil::Equal(actual.shape().layout(), expected.shape().layout())) {
+      tensorflow::gtl::ArraySlice<const NativeT> expected_data =
+          expected.data<NativeT>();
+      tensorflow::gtl::ArraySlice<const NativeT> actual_data =
+          actual.data<NativeT>();
+      const int64 len = expected_data.size();
+      for (int64 i = 0; i < len; ++i) {
+        const bool near = ExpectValuesNear(expected_data[i], actual_data[i]);
+        if (!near) {
+          UpdateAndLogMiscompares<NativeT>(expected_data[i], actual_data[i],
+                                           actual.shape(), i);
+        }
+      }
+      return;
+    }
+
     if (dimension == expected.shape().dimensions_size()) {
       bool near = ExpectValuesNear(expected.Get<NativeT>(multi_index_),
                                    actual.Get<NativeT>(multi_index_));
-      miscompares_.Set<bool>(multi_index_, !near);
+      if (!near) {
+        UpdateAndLogMiscompares<NativeT>(
+            expected.Get<NativeT>(multi_index_),
+            actual.Get<NativeT>(multi_index_), actual.shape(),
+            IndexUtil::MultidimensionalIndexToLinearIndex(actual.shape(),
+                                                          multi_index_));
+      }
     } else {
       for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) {
         multi_index_[dimension] = i;
@@ -528,6 +573,32 @@ class NearComparator {
     LOG(ERROR) << "wrote to " << name << " file: " << filename;
   }
 
+  // Gets the total element count.  For tuples, this is not the count of tuple
+  // elements, but the sum of elements of each tuple element.
+  int64 RecursiveElementCount(const Shape& shape) {
+    if (ShapeUtil::IsTuple(shape)) {
+      const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
+      int64 total = 0;
+      for (int64 i = 0; i < tuple_elements; ++i) {
+        total +=
+            RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
+      }
+      return total;
+    } else {
+      return ShapeUtil::ElementsIn(shape);
+    }
+  }
+
+  // Calling ToString on a literal with over 100 million elements takes around
+  // 3 minutes.  The utility of printing a literal with >1000 elements is
+  // questionable, especially when writing the Literal proto to disk is orders
+  // of magnitude faster.
+  string TruncateHugeLiteral(const Literal& literal) {
+    return RecursiveElementCount(literal.shape()) < 1000
+               ? literal.ToString()
+               : "[TRUNCATED, Literal with more than 1000 values]";
+  }
+
   ErrorSpec error_;
 
   // Number of element miscomparisons encountered so far.
@@ -548,10 +619,10 @@ class NearComparator {
   double abs_expected_miscompare_sum_;
   float max_rel_err_;
   float max_abs_err_;
-  std::vector<int64> first_multi_index_;
-  std::vector<int64> last_multi_index_;
-  std::vector<int64> max_rel_multi_index_;
-  std::vector<int64> max_abs_multi_index_;
+  int64 first_linear_index_;
+  int64 last_linear_index_;
+  int64 max_rel_linear_index_;
+  int64 max_abs_linear_index_;
 };
 
 template <>
@@ -584,6 +655,23 @@ bool NearComparator::ExpectValuesNear<half>(half expected, half actual) {
                           static_cast<float>(std::move(actual)));
 }
 
+template <>
+void NearComparator::UpdateAndLogMiscompares<bfloat16>(
+    const bfloat16 expected, const bfloat16 actual, const Shape& shape,
+    const int64 linear_index) {
+  UpdateAndLogMiscompares(static_cast<float>(expected),
+                          static_cast<float>(actual), shape, linear_index);
+}
+
+template <>
+void NearComparator::UpdateAndLogMiscompares<half>(half expected, half actual,
+                                                   const Shape& shape,
+                                                   const int64 linear_index) {
+  UpdateAndLogMiscompares(static_cast<float>(std::move(expected)),
+                          static_cast<float>(std::move(actual)), shape,
+                          linear_index);
+}
+
 }  // namespace
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::Near(
-- 
GitLab


From e4a628adf84a2373d773103cdeabc96cbffd7b47 Mon Sep 17 00:00:00 2001
From: AG Ramesh <ag.ramesh@intel.com>
Date: Fri, 26 Jan 2018 16:43:30 -0700
Subject: [PATCH 276/966] Making MKL-DNN default build choice (#16474)

---
 tensorflow/core/graph/mkl_layout_pass.cc      |  6 ++--
 tensorflow/core/graph/mkl_layout_pass_test.cc |  6 ++--
 tensorflow/core/kernels/mkl_aggregate_ops.cc  |  6 ++--
 tensorflow/core/kernels/mkl_avgpooling_op.cc  | 16 +++++++---
 tensorflow/core/kernels/mkl_concat_op.cc      |  6 ++--
 .../core/kernels/mkl_conv_grad_filter_ops.cc  |  6 ++--
 .../core/kernels/mkl_conv_grad_input_ops.cc   |  6 ++--
 tensorflow/core/kernels/mkl_conv_ops.cc       |  9 ++++--
 tensorflow/core/kernels/mkl_conv_ops.h        |  6 ++--
 .../core/kernels/mkl_fused_batch_norm_op.cc   |  6 ++--
 tensorflow/core/kernels/mkl_identity_op.cc    |  4 +--
 .../core/kernels/mkl_input_conversion_op.cc   |  4 +--
 tensorflow/core/kernels/mkl_lrn_op.cc         |  6 ++--
 tensorflow/core/kernels/mkl_maxpooling_op.cc  | 10 +++---
 .../core/kernels/mkl_pooling_ops_common.cc    |  6 ++--
 .../core/kernels/mkl_pooling_ops_common.h     |  8 ++---
 tensorflow/core/kernels/mkl_relu_op.cc        | 11 ++++---
 tensorflow/core/kernels/mkl_reshape_op.cc     |  6 ++--
 tensorflow/core/kernels/mkl_softmax_op.cc     |  4 +--
 tensorflow/core/kernels/mkl_tfconv_op.h       |  4 +--
 tensorflow/core/ops/nn_ops.cc                 |  8 ++---
 tensorflow/core/util/mkl_util.h               | 32 +++++++++----------
 tensorflow/core/util/mkl_util_test.cc         |  4 +--
 23 files changed, 96 insertions(+), 84 deletions(-)

diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 55bc401b9d..911d931a52 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -42,7 +42,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
@@ -2215,7 +2215,7 @@ Status MklLayoutRewritePass::Run(
   return Status::OK();
 }
 
-#else  // INTEL_MKL_DNN
+#else  // INTEL_MKL_ML
 
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
@@ -4325,7 +4325,7 @@ Status MklLayoutRewritePass::Run(
 
   return Status::OK();
 }
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 75f7ca2d4d..7b8c3cccc5 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -38,7 +38,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 namespace {
 
@@ -1885,7 +1885,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
 
 }  // namespace
 
-#else  // INTEL_MKL_DNN
+#else  // INTEL_MKL_ML
 
 namespace {
 
@@ -3503,7 +3503,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
 
 }  // namespace
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 89d37d2f87..49c34fed02 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::stream;
 using mkldnn::sum;
@@ -37,7 +37,7 @@ using mkldnn::sum;
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
@@ -285,7 +285,7 @@ class MklAddNOp : public OpKernel {
   } MklAddNOpContext;
 };
 
-#else  // INTEL_MKL_DNN
+#else  // INTEL_MKL_ML
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index a7c569ee05..ebaa0f4e2a 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -24,7 +24,7 @@
 
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::algorithm;
 using mkldnn::engine;
@@ -40,8 +40,7 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklAvgPoolingOp : public OpKernel {
@@ -429,7 +428,10 @@ class MklAvgPoolingGradOp : public OpKernel {
   TensorFormat data_format_;
 };  // MklAvgPoolingGradOp
 
-#else  // INTEL_MKL_DNN is defined
+
+
+#else
+
 
 template <typename Device, typename T>
 class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
@@ -678,7 +680,11 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
   }
 };  // MklAvgPoolingGradOp
 
-#endif  // INTEL_MKL_DNN
+
+
+
+#endif  // INTEL_MKL_ML
+
 
 REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
                             .Device(DEVICE_CPU)
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 7da63604d2..f1f267e849 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -30,7 +30,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::concat;
@@ -62,7 +62,7 @@ class EigenConcatBaseOp : public OpKernel {
   // we need to have empty Compute because Compute is pure virtual function.
   void Compute(OpKernelContext* c) {}
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
   void Compute(OpKernelContext* c, const std::vector<Tensor>& values) {
     const Tensor* concat_dim_tensor;
@@ -230,7 +230,7 @@ class EigenConcatBaseOp : public OpKernel {
 #endif
 };
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 // --------------------------------------------------------------------------
 //                      Mkl Concat Op
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index ef3f8cfec1..1401bc65a4 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -42,7 +42,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::convolution_backward_weights;
@@ -55,7 +55,7 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, class T>
 class MklConv2DCustomBackpropFilterOp : public OpKernel {
@@ -655,7 +655,7 @@ class MklConv2DCustomBackpropFilterOp
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index a6745489f4..eeed009531 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -44,7 +44,7 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::convolution_backward_data;
@@ -56,7 +56,7 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, class T>
 class MklConv2DCustomBackpropInputOp : public OpKernel {
@@ -493,7 +493,7 @@ class MklConv2DCustomBackpropInputOp
   }
 };
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 #define REGISTER_MKL_CPU_KERNELS(T)                                 \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")           \
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index e44fba754b..cbda12689f 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -41,7 +41,10 @@ limitations under the License.
 
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+
+
+#ifndef INTEL_MKL_ML
+
 #include "mkldnn.hpp"
 
 using mkldnn::prop_kind;
@@ -58,8 +61,8 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+// MKL-DNN is now default. MKL-ML must be specified explicitly.
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T, bool biasEnabled>
 class MklConv2DOp : public OpKernel {
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index 8b65eaea0d..9dd88221a8 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -40,7 +40,7 @@ limitations under the License.
 
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::prop_kind;
@@ -52,7 +52,7 @@ using mkldnn::convolution_forward;
 
 namespace tensorflow {
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 class MklDnnConvUtil {
  protected:
@@ -553,7 +553,7 @@ class MklConv2DBackpropCommonOp : public OpKernel {
   Padding padding_;
   TensorFormat data_format_;
 };
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 /////////////////////////////////////////////////////////////////////
 ///  Dummy Mkl op that is just used for operators that are intermediate
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 0b6d838e09..8313224d7f 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::batch_normalization_backward;
@@ -41,7 +41,7 @@ using mkldnn::use_scale_shift;
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklFusedBatchNormOp : public OpKernel {
@@ -683,7 +683,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
 };
 #endif
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklFusedBatchNormOp : public OpKernel {
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index 9ee27ee21c..6c027f8e72 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -28,14 +28,14 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 #endif
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklIdentityOp : public OpKernel {
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 73d41efce1..4337e4b49e 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/mkl_tfconv_op.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::stream;
@@ -59,7 +59,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 //     convert the TF format input to MKL format
 ///////////////////////////////////////////////////////////
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 template <typename Device, typename T>
 class MklInputConversionOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index a8b45004b7..5f0a12a1fb 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -38,7 +38,7 @@ limitations under the License.
 #include "tensorflow/core/util/work_sharder.h"
 #endif
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::lrn_across_channels;
 using mkldnn::lrn_backward;
@@ -67,7 +67,7 @@ void GetBandMatrix(int depth, int depth_radius,
 
 }  // namespace
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename T>
 class MklLRNOp : public OpKernel {
@@ -1343,7 +1343,7 @@ class MklLRNGradOp : public OpKernel {
   float beta_;
 };
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 #define REGISTER_MKL_LRN_CPU(T)                                     \
   REGISTER_KERNEL_BUILDER(Name("_MklLRN")                           \
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 0de27ccd60..14607f26e0 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include <algorithm>
 #include "mkldnn.hpp"
 using mkldnn::algorithm;
@@ -39,8 +39,8 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+// MKL-DNN is now default. MKL-ML must be specified explicitly.
+#ifdef INTEL_MKL_ML
 
 // An implementation of MaxPooling (forward).
 template <typename Device, typename T>
@@ -494,7 +494,7 @@ class MklMaxPoolingGradOp : public OpKernel {
   bool workspace_enabled_;
 };  // MklMaxPoolingGradOp
 
-#else  // INTEL_MKL_DNN is defined
+#else
 
 // An implementation of MaxPooling (forward).
 template <typename Device, typename T>
@@ -793,7 +793,7 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
   }
 };  // MklMaxPoolingGradOp
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
                             .Device(DEVICE_CPU)
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index ef8597b057..5ef6ce2a57 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -42,7 +42,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
   Init(context, ksize, stride, padding, data_format);
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 // Initialization for MKL format
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -72,7 +72,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
 
   Init(context, ksize, stride, padding, data_format);
 }
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 // Common Initialization for TensorFlow and MKL formats
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -107,7 +107,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
     OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                 tensor_in_cols, window_cols, col_stride,
                                 padding, &out_width, &pad_left, &pad_right));
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
     // TF can work with int64, but mkldnn only supports int32
     // Fail if the height or width are greater than MAX_INT
 
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 880e45ab1e..279167aba2 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::memory;
 using mkldnn::pooling_backward;
@@ -85,7 +85,7 @@ struct MklPoolParameters {
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const TensorShape& tensor_in_shape);
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const MklShape* mkl_in_shape);
@@ -102,7 +102,7 @@ struct MklPoolParameters {
             TensorFormat data_format);
 };
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 template <class T>
 class MklPoolingOpBase : public OpKernel {
@@ -395,7 +395,7 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
     return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md;
   }
 };
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 //-------------------------------------------------------------------
 // Utility functions
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 873aca30ca..0be8355afa 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "tensorflow/core/platform/default/logging.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::algorithm;
@@ -58,7 +58,7 @@ struct MklReluHelpers {
   }
 };
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklReluOp : public OpKernel {
@@ -368,7 +368,10 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   mkl_context.MklCleanup();
 }
 
-#else  // INTEL_MKL_DNN
+
+
+#else  // INTEL_MKL_ML
+
 
 template <typename Device, typename T, algorithm alg_kind>
 class MklReluOpBase : public OpKernel {
@@ -849,7 +852,7 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
                           MklReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 // register dnn kernels for supported operations and supported types
 #define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)              \
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 7d471e1e4c..5dbc4a2709 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::stream;
 #endif
@@ -40,7 +40,7 @@ class MklReshapeOp : public OpKernel {
  public:
   explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
   void Compute(OpKernelContext* context) override {
     const Tensor& input = MklGetInput(context, 0);
     const Tensor& sizes = MklGetInput(context, 1);
@@ -312,7 +312,7 @@ class MklReshapeOp : public OpKernel {
     }
   }
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
  private:
   const int kInputSlotIdx = 0;
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index c46eabdde1..aceef1e234 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
@@ -156,5 +156,5 @@ TF_CALL_float(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES);
 
 }  // namespace tensorflow
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index c4d5a45d3c..5fafa14b5d 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -35,7 +35,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 using mkldnn::stream;
 #endif
 
@@ -61,7 +61,7 @@ class MklToTfOp : public OpKernel {
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
   static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
                              string data_format_str, DataType op_data_type,
                              bool has_avx512f, uint input_number) {
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 62661fe4bd..67481fd202 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1818,7 +1818,7 @@ REGISTER_OP("_MklMaxPool")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Output("workspace: T")
 #else
     .Output("workspace: uint8")
@@ -1844,7 +1844,7 @@ REGISTER_OP("_MklMaxPoolGrad")
     .Input("orig_input: T")
     .Input("orig_output: T")
     .Input("grad: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Input("workspace: T")
 #else
     .Input("workspace: uint8")
@@ -1916,7 +1916,7 @@ REGISTER_OP("_MklLRN")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Output("workspace: T")
 #else
     .Output("workspace: uint8")
@@ -1944,7 +1944,7 @@ REGISTER_OP("_MklLRNGrad")
     .Input("input_grads: T")
     .Input("input_image: T")
     .Input("output_image: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Input("workspace: T")
 #else
     .Input("workspace: uint8")
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 2caf5fc56d..34ef7ba21b 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -35,7 +35,7 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::engine;
@@ -324,7 +324,7 @@ class MklShape {
       nullptr;  // TF dimension corresponding to this MKL dimension
 };
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 // Forward decl
 TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
@@ -659,7 +659,7 @@ class MklDnnShape {
 
 typedef std::vector<MklShape> MklShapeList;
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 typedef std::vector<MklDnnShape> MklDnnShapeList;
 #endif
 
@@ -673,7 +673,7 @@ inline bool AreAllMklTensors(const MklShapeList& shapes) {
   return true;
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
                              const MklShape& mkl_shape) {
@@ -724,7 +724,7 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
           sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) {
   mklshape->DeSerializeMklDnnShape(
       ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
@@ -749,7 +749,7 @@ inline void GetMklInputList(OpKernelContext* ctext, StringPiece name,
 }
 
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
                             MklShapeList* mkl_shapes) {
@@ -779,7 +779,7 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
 
 #endif
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 /// Get shape of input tensor pointed by 'input_idx' in TensorShape format.
 /// If the input tensor is in MKL layout, then obtains TensorShape from
 /// MklShape.
@@ -814,7 +814,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 // Allocate the second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -851,7 +851,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 // Allocate the output tensor, create a second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -875,7 +875,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
 
 // Allocates a temp tensor and returns the data buffer for temporary storage.
 // Currently
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 template <typename T>
 inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
                            const memory::primitive_desc& pd, void** buf_out) {
@@ -994,7 +994,7 @@ inline void CopyMklTensorInToOut(OpKernelContext* context,
   context->set_output(idx_meta_out, meta_output);
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
                                          int idx_in, int idx_out,
                                          const TensorShape& shape) {
@@ -1032,7 +1032,7 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
 }
 #endif
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 inline void ForwardTfTensorInToOut(OpKernelContext* context,
                                   int idx_in, int idx_out) {
@@ -1090,7 +1090,7 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context,
   }
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
                                              int idx_in, int idx_out,
                                              const MklDnnShape& mkl_shape) {
@@ -1132,7 +1132,7 @@ inline void SetDummyMklShapeOutput(OpKernelContext* context,
   AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 // We don't need these functions in MKLDNN. We have defined equality operator
 // on MklDnnShape class directly.
 
@@ -1242,7 +1242,7 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
 
 // -------------------------------------------------------------------
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 /// Return MKL-DNN data type (memory::data_type) for input type T
 ///
@@ -1753,7 +1753,7 @@ class MklDnnData {
   }
 };
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc
index 8b73eadb40..cd1d0713ad 100644
--- a/tensorflow/core/util/mkl_util_test.cc
+++ b/tensorflow/core/util/mkl_util_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 TEST(MklUtilTest, MklDnnTfShape) {
   auto cpu_engine = engine(engine::cpu, 0);
@@ -84,7 +84,7 @@ TEST(MklUtilTest, MklDnnBlockedFormatTest) {
   EXPECT_EQ(b_md2.data.format, mkldnn_blocked);
 }
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 }  // namespace
 }  // namespace tensorflow
 
-- 
GitLab


From 90033a0c87bb3d289144cbd429a9067a4a4881d6 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Sat, 27 Jan 2018 00:43:48 +0100
Subject: [PATCH 277/966] Placate pylint on jupyter_notebook_config.py (#16449)

---
 tensorflow/tools/docker/jupyter_notebook_config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/docker/jupyter_notebook_config.py b/tensorflow/tools/docker/jupyter_notebook_config.py
index 0acbf6fcee..05dcefb099 100644
--- a/tensorflow/tools/docker/jupyter_notebook_config.py
+++ b/tensorflow/tools/docker/jupyter_notebook_config.py
@@ -15,6 +15,7 @@
 import os
 from IPython.lib import passwd
 
+c = c  # pylint:disable=undefined-variable
 c.NotebookApp.ip = '*'
 c.NotebookApp.port = int(os.getenv('PORT', 8888))
 c.NotebookApp.open_browser = False
-- 
GitLab


From 1d1a50e3a5f0e297e6d4d480cf28ca5be51d7c73 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Fri, 26 Jan 2018 15:54:38 -0800
Subject: [PATCH 278/966] [XLA] (Re-land) Add HLO matcher for CustomCall that
 accepts a call target.

Now with less build breakage!

PiperOrigin-RevId: 183458987
---
 .../compiler/xla/service/hlo_matchers.cc      | 30 +++++++++++
 .../compiler/xla/service/hlo_matchers.h       | 53 +++++++++++++++++--
 .../compiler/xla/service/hlo_matchers_test.cc | 33 ++++++++++++
 3 files changed, 113 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index 4255d60866..bc74c4bc10 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -102,6 +102,36 @@ bool HloGetTupleElementMatcher::MatchAndExplain(
   return true;
 }
 
+void HloCustomCallMatcher::DescribeTo(std::ostream* os) const {
+  HloMatcher::DescribeTo(os);
+  *os << " with call target that ";
+  call_target_matcher_.DescribeTo(os);
+}
+
+bool HloCustomCallMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (!HloMatcher::MatchAndExplain(instruction, listener)) {
+    return false;
+  }
+  ::testing::StringMatchResultListener sub_listener;
+  bool result = ExplainMatchResult(
+      call_target_matcher_, instruction->custom_call_target(), &sub_listener);
+  if (sub_listener.str().empty()) {
+    sub_listener << " that ";
+
+    std::stringstream desc_stream;
+    if (result) {
+      call_target_matcher_.DescribeTo(&desc_stream);
+    } else {
+      call_target_matcher_.DescribeNegationTo(&desc_stream);
+    }
+    sub_listener << desc_stream.str();
+  }
+  *listener << "custom-call with call target" << sub_listener.str();
+  return result;
+}
+
 }  // namespace testing
 
 void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 9206cdac05..103f04a2cb 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -56,8 +56,8 @@ class HloParameterMatcher : public HloMatcher {
 // index to match.
 class HloGetTupleElementMatcher : public HloMatcher {
  public:
-  explicit HloGetTupleElementMatcher(
-      ::testing::Matcher<const HloInstruction*> operand, int64 tuple_index)
+  HloGetTupleElementMatcher(::testing::Matcher<const HloInstruction*> operand,
+                            int64 tuple_index)
       : HloMatcher(HloOpcode::kGetTupleElement, /*operands=*/{operand}),
         tuple_index_(tuple_index) {}
 
@@ -68,6 +68,24 @@ class HloGetTupleElementMatcher : public HloMatcher {
   int64 tuple_index_;
 };
 
+// Custom matcher for custom-call instructions, which accepts a matcher for its
+// call target.
+class HloCustomCallMatcher : public HloMatcher {
+ public:
+  HloCustomCallMatcher(
+      ::testing::Matcher<string> call_target_matcher,
+      std::vector<::testing::Matcher<const HloInstruction*>> operands)
+      : HloMatcher(HloOpcode::kCustomCall, operands),
+        call_target_matcher_(call_target_matcher) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  ::testing::Matcher<string> call_target_matcher_;
+};
+
 // HloInstruction* matchers for opcode and operands. Example:
 //   namespace op = xla::opcode_matchers;
 //   EXPECT_THAT(instruction,
@@ -94,7 +112,6 @@ HLO_MATCHER(Convert);
 HLO_MATCHER(Convolution);
 HLO_MATCHER(Copy);
 HLO_MATCHER(CrossReplicaSum);
-HLO_MATCHER(CustomCall);
 HLO_MATCHER(Divide);
 HLO_MATCHER(Dot);
 HLO_MATCHER(DynamicSlice);
@@ -184,6 +201,36 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> GetTupleElement() {
       new ::xla::testing::HloMatcher(HloOpcode::kGetTupleElement, {}));
 }
 
+// - CustomCall(T, operand1, ..., operandN) matches a CustomCall with call
+//   target T and the given operands.
+//
+// - CustomCall(operand1, ..., operandN) matches any CustomCall HLO with the
+//   given operands.
+//
+// - CustomCall() matches any CustomCall HLO at all.
+template <typename... M>
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall(
+    ::testing::Matcher<string> call_target_matcher, M... operands) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloCustomCallMatcher(
+      call_target_matcher, {operands...}));
+}
+// This overload of CustomCall(A, B, C, ...) exists iff A is not convertible to
+// ::testing::Matcher<string>.  In that case, we want to prefer the overload
+// above.
+template <typename FirstM, typename... M,
+          typename Dummy = typename std::enable_if<
+              !std::is_convertible<FirstM, ::testing::Matcher<string>>::value,
+              void>::type*>
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall(
+    FirstM operands_first, M... operands_rest) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloMatcher(
+      HloOpcode::kCustomCall, {operands_first, operands_rest...}));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall() {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {}));
+}
+
 #undef HLO_MATCHER
 }  // namespace opcode_matchers
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index 1465d1cacd..1c21703a45 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -23,6 +23,12 @@ using ::testing::Eq;
 namespace xla {
 namespace {
 
+string DescribeHloMatcher(const ::testing::Matcher<const HloInstruction*>& m) {
+  std::stringstream ss;
+  m.DescribeTo(&ss);
+  return ss.str();
+}
+
 template <typename M, typename T>
 string Explain(const T& t, const M& m) {
   ::testing::StringMatchResultListener listener;
@@ -67,5 +73,32 @@ TEST(HloMatchersTest, Test) {
          "add"));
 }
 
+TEST(HloMatchersTest, CustomCallMatcher) {
+  auto c1 = HloInstruction::CreateConstant(Literal::CreateR1<float>({1, 2, 3}));
+  auto c2 = HloInstruction::CreateConstant(Literal::CreateR1<int32>({1, 2, 3}));
+  auto call = HloInstruction::CreateCustomCall(
+      ShapeUtil::MakeShape(F32, {1}), {c1.get(), c2.get()}, "foo_target");
+
+  EXPECT_THAT(call.get(), op::CustomCall());
+  EXPECT_THAT(call.get(), op::CustomCall(c1.get(), c2.get()));
+  EXPECT_THAT(call.get(), op::CustomCall("foo_target"));
+  EXPECT_THAT(call.get(), op::CustomCall("foo_target", c1.get(), c2.get()));
+  EXPECT_THAT(call.get(), op::CustomCall(::testing::StartsWith("foo")));
+  EXPECT_THAT(call.get(),
+              op::CustomCall(::testing::Not(::testing::StartsWith("bar"))));
+
+  // Wrong number of operands.
+  EXPECT_THAT(call.get(), ::testing::Not(op::CustomCall(c1.get())));
+
+  // Call target does not match.
+  EXPECT_THAT(call.get(),
+              ::testing::Not(op::CustomCall(::testing::StartsWith("bar"))));
+
+  EXPECT_THAT(Explain(call.get(), op::CustomCall("bar")),
+              R"(custom-call with call target that isn't equal to "bar")");
+  EXPECT_THAT(DescribeHloMatcher(op::CustomCall("foo_target")),
+              R"(custom-call with call target that is equal to "foo_target")");
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From e9dc418b4bafa359654fd66c72b5f4ba371b68db Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 16:23:40 -0800
Subject: [PATCH 279/966] [XLA] Don't print "{no layout}" if there is no
 layout.

PiperOrigin-RevId: 183463264
---
 tensorflow/compiler/xla/shape_util.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index cba73322fa..d63e16ce2b 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -475,8 +475,6 @@ StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
       if (LayoutUtil::HasLayout(shape)) {
         tensorflow::strings::StrAppend(&result,
                                        LayoutUtil::HumanString(shape.layout()));
-      } else {
-        tensorflow::strings::StrAppend(&result, "{no layout}");
       }
     }
     return result;
-- 
GitLab


From a977a77299f292e556ace48c75251a5a11d118ff Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 16:36:55 -0800
Subject: [PATCH 280/966] Add bidirectional sequence RNN to TFLite Ops.

PiperOrigin-RevId: 183465032
---
 tensorflow/contrib/lite/kernels/BUILD         |  13 +
 .../kernels/bidirectional_sequence_rnn.cc     | 249 +++++
 .../bidirectional_sequence_rnn_test.cc        | 931 ++++++++++++++++++
 tensorflow/contrib/lite/kernels/register.cc   |   3 +
 tensorflow/contrib/lite/model.cc              |   1 +
 tensorflow/contrib/lite/nnapi_delegate.cc     |   1 +
 tensorflow/contrib/lite/schema/schema.fbs     |   7 +
 7 files changed, 1205 insertions(+)
 create mode 100644 tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
 create mode 100644 tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index b5428d3246..d9051f3516 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -104,6 +104,7 @@ cc_library(
         "add.cc",
         "basic_rnn.cc",
         "batch_to_space_nd.cc",
+        "bidirectional_sequence_rnn.cc",
         "concatenation.cc",
         "conv.cc",
         "depthwise_conv.cc",
@@ -288,6 +289,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "bidirectional_sequence_rnn_test",
+    size = "small",
+    srcs = ["bidirectional_sequence_rnn_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "unidirectional_sequence_rnn_test",
     size = "small",
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
new file mode 100644
index 0000000000..f540816235
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
@@ -0,0 +1,249 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace bidirectional_sequence_rnn {
+
+constexpr int kInputTensor = 0;
+// Forward and backward cell tensors.
+constexpr int kFwWeightsTensor = 1;
+constexpr int kFwRecurrentWeightsTensor = 2;
+constexpr int kFwBiasTensor = 3;
+constexpr int kBwWeightsTensor = 4;
+constexpr int kBwRecurrentWeightsTensor = 5;
+constexpr int kBwBiasTensor = 6;
+// State and output tensors.
+constexpr int kFwHiddenStateTensor = 0;
+constexpr int kFwOutputTensor = 1;
+constexpr int kBwHiddenStateTensor = 2;
+constexpr int kBwOutputTensor = 3;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 7);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 4);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* fw_input_weights =
+      &context->tensors[node->inputs->data[kFwWeightsTensor]];
+  TfLiteTensor* fw_recurrent_weights =
+      &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]];
+  TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]];
+  TfLiteTensor* bw_input_weights =
+      &context->tensors[node->inputs->data[kBwWeightsTensor]];
+  TfLiteTensor* bw_recurrent_weights =
+      &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]];
+  TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]];
+
+  // Check all the parameters of tensor match within themselves and match the
+  // input configuration.
+  const int batch_size = input->dims->data[0];
+  const int max_time = input->dims->data[1];
+  const int fw_num_units = fw_input_weights->dims->data[0];
+  const int bw_num_units = bw_input_weights->dims->data[0];
+  TF_LITE_ASSERT_EQ(input->dims->data[2], fw_input_weights->dims->data[1]);
+  TF_LITE_ASSERT_EQ(input->dims->data[2], bw_input_weights->dims->data[1]);
+  TF_LITE_ASSERT_EQ(fw_input_weights->dims->data[0], fw_bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(bw_input_weights->dims->data[0], bw_bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(fw_recurrent_weights->dims->data[0],
+                    fw_bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(bw_recurrent_weights->dims->data[1],
+                    bw_bias->dims->data[0]);
+
+  TfLiteTensor* fw_output =
+      &context->tensors[node->outputs->data[kFwOutputTensor]];
+  TfLiteTensor* bw_output =
+      &context->tensors[node->outputs->data[kBwOutputTensor]];
+
+  // Resize hidden states.
+  TfLiteIntArray* fw_hidden_state_size_array = TfLiteIntArrayCreate(2);
+  fw_hidden_state_size_array->data[0] = batch_size;
+  fw_hidden_state_size_array->data[1] = fw_num_units;
+  TfLiteTensor* fw_hidden_state =
+      &context->tensors[node->outputs->data[kFwHiddenStateTensor]];
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_hidden_state,
+                                                   fw_hidden_state_size_array));
+
+  TfLiteIntArray* bw_hidden_state_size_array = TfLiteIntArrayCreate(2);
+  bw_hidden_state_size_array->data[0] = batch_size;
+  bw_hidden_state_size_array->data[1] = fw_num_units;
+  TfLiteTensor* bw_hidden_state =
+      &context->tensors[node->outputs->data[kBwHiddenStateTensor]];
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_hidden_state,
+                                                   bw_hidden_state_size_array));
+
+  // Mark hidden states as a persistent tensor.
+  fw_hidden_state->allocation_type = kTfLiteArenaRwPersistent;
+  bw_hidden_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  // Resize outputs.
+  TfLiteIntArray* fw_output_size_array = TfLiteIntArrayCreate(3);
+  fw_output_size_array->data[0] = batch_size;
+  fw_output_size_array->data[1] = max_time;
+  fw_output_size_array->data[2] = fw_num_units;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, fw_output, fw_output_size_array));
+  TfLiteIntArray* bw_output_size_array = TfLiteIntArrayCreate(3);
+  bw_output_size_array->data[0] = batch_size;
+  bw_output_size_array->data[1] = max_time;
+  bw_output_size_array->data[2] = bw_num_units;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, bw_output, bw_output_size_array));
+
+  return kTfLiteOk;
+}
+
+namespace {
+// Performs one RNN computation step for the input specified by input_ptr_batch.
+// The RNN cell is specified by the pointers to its weights and biases, along
+// with the input size, number of units, strides, activation.
+// The pointers to the hidden state and the output are updated as a result.
+// TODO(mirkov): factor out this function to a shared library.
+void RnnStep(const float* input_ptr_batch, const float* input_weights_ptr,
+             const float* recurrent_weights_ptr, const float* bias_ptr,
+             int input_size, int num_units, int input_weights_stride,
+             int recurrent_weights_stride, TfLiteFusedActivation activation,
+             float* hidden_state_ptr_batch, float* output_ptr_batch) {
+  // Output = bias
+  for (int o = 0; o < num_units; o++) {
+    output_ptr_batch[o] = bias_ptr[o];
+  }
+
+  // Output += input * input_weights
+  for (int o = 0; o < num_units; o++) {
+    for (int i = 0; i < input_size; i++) {
+      output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i];
+    }
+    input_weights_ptr += input_weights_stride;
+  }
+
+  // Output += recurrent_weights * hidden_state
+  for (int o = 0; o < num_units; o++) {
+    for (int h = 0; h < num_units; h++) {
+      output_ptr_batch[o] +=
+          hidden_state_ptr_batch[h] * recurrent_weights_ptr[h];
+    }
+    recurrent_weights_ptr += recurrent_weights_stride;
+  }
+
+  // Output = activation(Output) and update hidden_state
+  for (int o = 0; o < num_units; o++) {
+    output_ptr_batch[o] = (ActivationFunctor(activation))(output_ptr_batch[o]);
+    hidden_state_ptr_batch[o] = output_ptr_batch[o];
+  }
+}
+}  // namespace
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSequenceRNNParams*>(node->builtin_data);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* fw_input_weights =
+      &context->tensors[node->inputs->data[kFwWeightsTensor]];
+  TfLiteTensor* fw_recurrent_weights =
+      &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]];
+  TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]];
+  TfLiteTensor* fw_hidden_state =
+      &context->tensors[node->outputs->data[kFwHiddenStateTensor]];
+  TfLiteTensor* fw_output =
+      &context->tensors[node->outputs->data[kFwOutputTensor]];
+
+  TfLiteTensor* bw_input_weights =
+      &context->tensors[node->inputs->data[kBwWeightsTensor]];
+  TfLiteTensor* bw_recurrent_weights =
+      &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]];
+  TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]];
+  TfLiteTensor* bw_hidden_state =
+      &context->tensors[node->outputs->data[kBwHiddenStateTensor]];
+  TfLiteTensor* bw_output =
+      &context->tensors[node->outputs->data[kBwOutputTensor]];
+
+  const int batch_size = input->dims->data[0];
+  const int max_time = input->dims->data[1];
+  const int input_size = input->dims->data[2];
+
+  const int fw_num_units = fw_input_weights->dims->data[0];
+  const int fw_input_weights_stride = fw_input_weights->dims->data[1];
+  const int fw_recurrent_weights_stride = fw_recurrent_weights->dims->data[1];
+  const float* fw_bias_ptr = fw_bias->data.f;
+  const float* fw_input_weights_ptr = fw_input_weights->data.f;
+  const float* fw_recurrent_weights_ptr = fw_recurrent_weights->data.f;
+
+  const int bw_num_units = bw_input_weights->dims->data[0];
+  const int bw_input_weights_stride = bw_input_weights->dims->data[1];
+  const int bw_recurrent_weights_stride = bw_recurrent_weights->dims->data[1];
+  const float* bw_bias_ptr = bw_bias->data.f;
+  const float* bw_input_weights_ptr = bw_input_weights->data.f;
+  const float* bw_recurrent_weights_ptr = bw_recurrent_weights->data.f;
+
+  for (int b = 0; b < batch_size; b++) {
+    // Forward cell.
+    float* fw_hidden_state_ptr_batch =
+        fw_hidden_state->data.f + b * fw_num_units;
+    for (int s = 0; s < max_time; s++) {
+      const float* input_ptr_batch =
+          input->data.f + b * input_size * max_time + s * input_size;
+      float* output_ptr_batch =
+          fw_output->data.f + b * fw_num_units * max_time + s * fw_num_units;
+
+      RnnStep(input_ptr_batch, fw_input_weights_ptr, fw_recurrent_weights_ptr,
+              fw_bias_ptr, input_size, fw_num_units, fw_input_weights_stride,
+              fw_recurrent_weights_stride, params->activation,
+              fw_hidden_state_ptr_batch, output_ptr_batch);
+    }
+    // Backward cell.
+    float* bw_hidden_state_ptr_batch =
+        bw_hidden_state->data.f + b * bw_num_units;
+    for (int s = max_time - 1; s >= 0; s--) {
+      const float* input_ptr_batch =
+          input->data.f + b * input_size * max_time + s * input_size;
+      float* output_ptr_batch =
+          bw_output->data.f + b * bw_num_units * max_time + s * bw_num_units;
+
+      RnnStep(input_ptr_batch, bw_input_weights_ptr, bw_recurrent_weights_ptr,
+              bw_bias_ptr, input_size, bw_num_units, bw_input_weights_stride,
+              bw_recurrent_weights_stride, params->activation,
+              bw_hidden_state_ptr_batch, output_ptr_batch);
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace bidirectional_sequence_rnn
+
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 bidirectional_sequence_rnn::Prepare,
+                                 bidirectional_sequence_rnn::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
new file mode 100644
index 0000000000..12f4ff97cf
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
@@ -0,0 +1,931 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Bidirectional RNN op.
+
+#include <vector>
+#include <iomanip>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+static float rnn_input[] = {
+    0.23689353,   0.285385,     0.037029743, -0.19858193,  -0.27569133,
+    0.43773448,   0.60379338,   0.35562468,  -0.69424844,  -0.93421471,
+    -0.87287879,  0.37144363,   -0.62476718, 0.23791671,   0.40060222,
+    0.1356622,    -0.99774903,  -0.98858172, -0.38952237,  -0.47685933,
+    0.31073618,   0.71511042,   -0.63767755, -0.31729108,  0.33468103,
+    0.75801885,   0.30660987,   -0.37354088, 0.77002847,   -0.62747043,
+    -0.68572164,  0.0069220066, 0.65791464,  0.35130811,   0.80834007,
+    -0.61777675,  -0.21095741,  0.41213346,  0.73784804,   0.094794154,
+    0.47791874,   0.86496925,   -0.53376222, 0.85315156,   0.10288584,
+    0.86684,      -0.011186242, 0.10513687,  0.87825835,   0.59929144,
+    0.62827742,   0.18899453,   0.31440187,  0.99059987,   0.87170351,
+    -0.35091716,  0.74861872,   0.17831337,  0.2755419,    0.51864719,
+    0.55084288,   0.58982027,   -0.47443086, 0.20875752,   -0.058871567,
+    -0.66609079,  0.59098077,   0.73017097,  0.74604273,   0.32882881,
+    -0.17503482,  0.22396147,   0.19379807,  0.29120302,   0.077113032,
+    -0.70331609,  0.15804303,   -0.93407321, 0.40182066,   0.036301374,
+    0.66521823,   0.0300982,    -0.7747041,  -0.02038002,  0.020698071,
+    -0.90300065,  0.62870288,   -0.23068321, 0.27531278,   -0.095755219,
+    -0.712036,    -0.17384434,  -0.50593495, -0.18646687,  -0.96508682,
+    0.43519354,   0.14744234,   0.62589407,  0.1653645,    -0.10651493,
+    -0.045277178, 0.99032974,   -0.88255352, -0.85147917,  0.28153265,
+    0.19455957,   -0.55479527,  -0.56042433, 0.26048636,   0.84702539,
+    0.47587705,   -0.074295521, -0.12287641, 0.70117295,   0.90532446,
+    0.89782166,   0.79817224,   0.53402734,  -0.33286154,  0.073485017,
+    -0.56172788,  -0.044897556, 0.89964068,  -0.067662835, 0.76863563,
+    0.93455386,   -0.6324693,   -0.083922029};
+
+static float rnn_golden_fw_output[] = {
+    0.496726,   0,          0.965996,  0,         0.0584254, 0,
+    0,          0.12315,    0,         0,         0.612266,  0.456601,
+    0,          0.52286,    1.16099,   0.0291232,
+
+    0,          0,          0.524901,  0,         0,         0,
+    0,          1.02116,    0,         1.35762,   0,         0.356909,
+    0.436415,   0.0355727,  0,         0,
+
+    0,          0,          0,         0.262335,  0,         0,
+    0,          1.33992,    0,         2.9739,    0,         0,
+    1.31914,    2.66147,    0,         0,
+
+    0.942568,   0,          0,         0,         0.025507,  0,
+    0,          0,          0.321429,  0.569141,  1.25274,   1.57719,
+    0.8158,     1.21805,    0.586239,  0.25427,
+
+    1.04436,    0,          0.630725,  0,         0.133801,  0.210693,
+    0.363026,   0,          0.533426,  0,         1.25926,   0.722707,
+    0,          1.22031,    1.30117,   0.495867,
+
+    0.222187,   0,          0.72725,   0,         0.767003,  0,
+    0,          0.147835,   0,         0,         0,         0.608758,
+    0.469394,   0.00720298, 0.927537,  0,
+
+    0.856974,   0.424257,   0,         0,         0.937329,  0,
+    0,          0,          0.476425,  0,         0.566017,  0.418462,
+    0.141911,   0.996214,   1.13063,   0,
+
+    0.967899,   0,          0,         0,         0.0831304, 0,
+    0,          1.00378,    0,         0,         0,         1.44818,
+    1.01768,    0.943891,   0.502745,  0,
+
+    0.940135,   0,          0,         0,         0,         0,
+    0,          2.13243,    0,         0.71208,   0.123918,  1.53907,
+    1.30225,    1.59644,    0.70222,   0,
+
+    0.804329,   0,          0.430576,  0,         0.505872,  0.509603,
+    0.343448,   0,          0.107756,  0.614544,  1.44549,   1.52311,
+    0.0454298,  0.300267,   0.562784,  0.395095,
+
+    0.228154,   0,          0.675323,  0,         1.70536,   0.766217,
+    0,          0,          0,         0.735363,  0.0759267, 1.91017,
+    0.941888,   0,          0,         0,
+
+    0,          0,          1.5909,    0,         0,         0,
+    0,          0.5755,     0,         0.184687,  0,         1.56296,
+    0.625285,   0,          0,         0,
+
+    0,          0,          0.0857888, 0,         0,         0,
+    0,          0.488383,   0.252786,  0,         0,         0,
+    1.02817,    1.85665,    0,         0,
+
+    0.00981836, 0,          1.06371,   0,         0,         0,
+    0,          0,          0,         0.290445,  0.316406,  0,
+    0.304161,   1.25079,    0.0707152, 0,
+
+    0.986264,   0.309201,   0,         0,         0,         0,
+    0,          1.64896,    0.346248,  0,         0.918175,  0.78884,
+    0.524981,   1.92076,    2.07013,   0.333244,
+
+    0.415153,   0.210318,   0,         0,         0,         0,
+    0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
+    0.628881,   3.58099,    1.49974,   0};
+
+static float rnn_golden_bw_output[] = {
+    0.496726, 0,          1.00883,   0,         0.0584256, 0,         0,
+    0.236412, 0,          0,         0.612267,  0.487726,  0,         0.54883,
+    1.16099,  0.0291233,  0,         0,         0.428302,  0,         0,
+    0,        0,          1.13262,   0,         1.64415,   0,         0.311249,
+    0.570804, 0.259696,   0,         0,         0,         0,         0,
+    0.262334, 0,          0,         0,         1.23781,   0,         2.86532,
+    0,        0,          1.34389,   2.76409,   0,         0,         1.03969,
+    0,        0.00410865, 0,         0.0470295, 0,         0,         0,
+    0.371556, 0.27175,    1.36614,   1.63956,   0.683887,  1.06176,   0.719552,
+    0.301314, 0.971195,   0,         0.697143,  0,         0.215219,  0.210693,
+    0.363027, 0,          0.501283,  0,         1.13399,   0.623774,  0,
+    1.09851,  1.33313,    0.470441,  0.210965,  0,         0.664178,  0,
+    0.839686, 0,          0,         0.147834,  0,         0,         0,
+    0.58786,  0.490128,   0,         0.905806,  0,         0.932134,  0.424257,
+    0,        0,          0.860629,  0,         0,         0,         0.476425,
+    0,        0.566017,   0.513721,  0.207341,  1.09508,   1.08385,   0,
+    0.973787, 0,          0,         0,         0,         0,         0,
+    1.20698,  0,          0,         0,         1.56135,   1.12369,   0.99588,
+    0.459803, 0,          0.915854,  0,         0,         0,         0,
+    0,        0,          2.03206,   0,         0.773264,  0.267228,  1.55012,
+    1.202,    1.51611,    0.701202,  0,         0.725088,  0,         0.509069,
+    0,        0.671349,   0.581129,  0.343447,  0,         0.107755,  0.611838,
+    1.4331,   1.55871,    0.015242,  0.140624,  0.492562,  0.395095,  0.147722,
+    0,        0.784925,   0,         1.65477,   0.715257,  0,         0,
+    0,        0.685024,   0,         1.89505,   1.00037,   0,         0,
+    0,        0,          0,         1.52659,   0,         0,         0,
+    0,        0.618583,   0,         0.11115,   0,         1.37194,   0.630225,
+    0,        0,          0,         0,         0,         0.0322124, 0,
+    0,        0,          0,         0.430834,  0.252786,  0,         0,
+    0,        0.991297,   1.98451,   0,         0,         0.111511,  0,
+    1.05513,  0,          0,         0,         0,         0,         0,
+    0.290445, 0.412559,   0.0429958, 0.256564,  1.27858,   0.289948,  0,
+    1.01693,  0.327141,   0,         0,         0,         0,         0,
+    1.83508,  0.346248,   0,         0.961535,  0.790026,  0.552203,  2.13457,
+    2.19233,  0.333244,   0.316526,  0.179398,  0,         0,         0,
+    0,        0,          1.86126,   0,         0.728256,  0.750013,  0.011861,
+    0.576383, 3.38891,    1.29273,   0};
+
+constexpr std::initializer_list<float> weights = {
+    0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
+    0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
+    0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
+    -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
+    -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
+    -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
+    -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
+    0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
+    0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
+    0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
+    -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
+    0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
+    -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
+    -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
+    0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
+    0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
+    0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
+    -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
+    0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
+    0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
+    -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
+    0.277308,    0.415818};
+
+static float endtoend_input[] = {
+    0.996808, 0.060710, 0.981855, 0.570017, 0.525164, 0.796859, 0.696547,
+    0.505925, 0.991844, 0.461208, 0.949371, 0.027624, 0.539236, 0.841854,
+    0.915222, 0.538569, 0.069375, 0.237905, 0.903700, 0.441703, 0.536196,
+    0.402724, 0.761635, 0.025063, 0.082592, 0.688245, 0.239310, 0.256931,
+    0.658900, 0.105695, 0.301983, 0.655708, 0.166405, 0.283837, 0.225725,
+    0.691569, 0.080696, 0.922272, 0.197494, 0.072540, 0.383481, 0.146865,
+    0.100163, 0.922717, 0.988720, 0.015386, 0.461286, 0.058095, 0.253290,
+    0.364986, 0.499797, 0.789487, 0.767709, 0.261433, 0.814549, 0.850302,
+    0.949678, 0.053859, 0.107233, 0.608577, 0.159554, 0.409215, 0.264285,
+    0.325960, 0.693053, 0.490011, 0.017529, 0.773749, 0.412283, 0.215023,
+    0.846288, 0.795764, 0.361889, 0.946452, 0.718481, 0.350608, 0.961837,
+    0.179767, 0.408703, 0.215128, 0.544753, 0.908500, 0.004614, 0.312462,
+    0.169933, 0.819163, 0.162764, 0.119611, 0.873022, 0.269997, 0.728188,
+    0.032576, 0.679212, 0.992474, 0.358536, 0.372265, 0.482484, 0.376065,
+    0.146014, 0.894767, 0.591088, 0.992302, 0.690531, 0.952977, 0.938754,
+    0.409012, 0.303585, 0.900591, 0.588780, 0.712287, 0.115719, 0.133533,
+    0.620788, 0.120334, 0.445995, 0.790720, 0.939497, 0.608759, 0.910331,
+    0.812519, 0.878756, 0.638519, 0.845096, 0.557968, 0.630993, 0.203632,
+    0.930233, 0.113477, 0.579697, 0.076247, 0.008244, 0.170785, 0.068549,
+    0.698776, 0.123761, 0.007303, 0.107788, 0.427346, 0.907894, 0.696568,
+    0.139633, 0.023613, 0.830100, 0.760421, 0.143947, 0.276096, 0.551141,
+    0.083444, 0.884855, 0.461472, 0.895963, 0.763611, 0.099992, 0.741059,
+    0.321579, 0.730984, 0.944691, 0.251812, 0.844461, 0.524388, 0.328059,
+    0.852706, 0.695172, 0.396607, 0.551482, 0.818934, 0.403910, 0.659270,
+    0.246280, 0.311804, 0.355838, 0.385913, 0.335418, 0.185938, 0.146334,
+    0.479364, 0.462034, 0.697475, 0.562808, 0.346888, 0.158948, 0.458771,
+    0.110499, 0.258939, 0.199830, 0.432078, 0.989924, 0.144521, 0.683890,
+    0.834385, 0.668908, 0.011949, 0.687091, 0.364081, 0.408556, 0.238572,
+    0.183015, 0.812466, 0.897842, 0.429294, 0.124271, 0.253680, 0.815207,
+    0.459688, 0.439618, 0.961541, 0.939053, 0.901651, 0.659016, 0.501861,
+    0.248539, 0.817964, 0.960632, 0.359038, 0.076903, 0.160462, 0.791117,
+    0.066826, 0.304983, 0.475007, 0.901211, 0.973891, 0.486955, 0.588302,
+    0.337972, 0.895512, 0.826874, 0.520987, 0.707978, 0.724716, 0.950281,
+    0.832249, 0.978396, 0.765488, 0.291937, 0.418014, 0.727029, 0.230990,
+    0.319665, 0.386045, 0.732850, 0.568204, 0.204009, 0.693482, 0.927242,
+    0.280912, 0.853944, 0.718359, 0.347738, 0.158927, 0.193366, 0.248950,
+    0.132818, 0.680321, 0.837252, 0.470790, 0.575833, 0.664126, 0.991777,
+    0.283811, 0.388843, 0.942058, 0.116060, 0.367239, 0.707546, 0.407997,
+    0.785253, 0.434575, 0.638986, 0.104917, 0.820620, 0.371837, 0.673121,
+    0.024629, 0.065319, 0.600363, 0.305541, 0.919263, 0.318722, 0.653279,
+    0.078190, 0.512088, 0.902229, 0.211009, 0.192409, 0.739480, 0.681799,
+    0.768242, 0.403607, 0.673576, 0.052052, 0.792450, 0.615634, 0.168112,
+    0.159689, 0.323180, 0.576109, 0.944941, 0.757755, 0.215095, 0.049858,
+    0.578375, 0.586932, 0.722979, 0.603003, 0.652251, 0.323343, 0.908544,
+    0.571514, 0.642065, 0.561823, 0.649704, 0.154153, 0.464051, 0.860713,
+    0.346562, 0.203532, 0.542512, 0.114804, 0.607139, 0.216088, 0.166856,
+    0.399588, 0.831722, 0.334968, 0.559277, 0.154902, 0.911077, 0.504218,
+    0.912656, 0.126172, 0.554076, 0.491031, 0.713104, 0.277055, 0.094034,
+    0.365355, 0.600398, 0.002578, 0.936869, 0.242463, 0.564401, 0.586574,
+    0.396616, 0.028452, 0.447287, 0.743178, 0.231984, 0.989799, 0.857982,
+    0.839122, 0.205887, 0.024838, 0.238711, 0.037608, 0.359806, 0.797987,
+    0.192510, 0.270883, 0.302205, 0.105166, 0.397055, 0.856281, 0.596197,
+    0.110160, 0.133336, 0.690231, 0.475515, 0.733734, 0.692809, 0.412384,
+    0.976196, 0.257209, 0.998958, 0.372812, 0.285661, 0.446245, 0.115990,
+    0.517645, 0.436044, 0.973972, 0.356767, 0.641930, 0.998810, 0.595478,
+    0.679539, 0.358617, 0.393465, 0.872049, 0.629500, 0.695670, 0.977215,
+    0.026555, 0.551951, 0.573412, 0.136715, 0.685287, 0.263643, 0.612229,
+    0.419020, 0.956451, 0.024613, 0.395216, 0.213661, 0.023572, 0.768029,
+    0.499322, 0.469816, 0.884019, 0.016967, 0.905860, 0.857991, 0.373734,
+    0.547791, 0.856802, 0.969211, 0.227330, 0.215418, 0.362676, 0.099378,
+    0.844918, 0.058346, 0.076594, 0.871473, 0.610297, 0.650006, 0.008188,
+    0.295583, 0.913648, 0.620417, 0.714603, 0.870100, 0.645031, 0.109820,
+    0.083760, 0.668602, 0.877849, 0.583082, 0.138419, 0.761868, 0.600049,
+    0.044279, 0.619859, 0.973783, 0.592069, 0.476661, 0.942994, 0.819399,
+    0.692079, 0.305670, 0.918778, 0.536997, 0.364016, 0.995371, 0.408470,
+    0.974313, 0.645377, 0.416658, 0.269896, 0.559025, 0.037075, 0.984499,
+    0.429125, 0.682105, 0.094319, 0.512885, 0.350707, 0.972168, 0.095967,
+    0.489126, 0.734035, 0.696016, 0.533405, 0.353894, 0.669799, 0.125474,
+    0.830555, 0.612793, 0.944873, 0.522634, 0.918463, 0.863651, 0.059631,
+    0.282479, 0.859022, 0.468101, 0.256791, 0.504398, 0.884758, 0.526687,
+    0.063423, 0.921833, 0.511186, 0.492548, 0.603939, 0.605505, 0.005433,
+    0.954646, 0.577673, 0.101400, 0.443772, 0.311708, 0.797417, 0.977176,
+    0.665602, 0.467216, 0.102650, 0.496157, 0.080009, 0.047524, 0.018791,
+    0.998471, 0.911174, 0.078422, 0.280950, 0.770196, 0.546523, 0.537741,
+    0.274594, 0.431281, 0.064428, 0.338017, 0.353115, 0.575615, 0.830565,
+    0.957053, 0.181120, 0.835998, 0.911699, 0.758793, 0.937398, 0.355471,
+    0.070501, 0.734815, 0.332647, 0.736103, 0.202031, 0.435297, 0.232261,
+    0.282039, 0.482821, 0.251052, 0.280511, 0.393995, 0.329474, 0.561460,
+    0.164191, 0.875997, 0.099202, 0.438785, 0.307278, 0.163630, 0.776802,
+    0.660393, 0.739244, 0.607367, 0.617446, 0.920364, 0.443365, 0.529145,
+    0.679157, 0.380763, 0.884616, 0.749658, 0.115578, 0.217263, 0.485761,
+    0.317609, 0.652560, 0.718021, 0.599648, 0.135381, 0.969073, 0.880159,
+    0.529376, 0.298547, 0.441619, 0.693567, 0.174544, 0.540821, 0.132351,
+    0.481822, 0.704450, 0.909153, 0.142215, 0.443695, 0.516520, 0.759661,
+    0.364059, 0.959885, 0.288806, 0.043216, 0.340648, 0.173422, 0.792874,
+    0.456226, 0.390685, 0.278634, 0.773834, 0.043245, 0.996656, 0.373483,
+    0.178625, 0.965729, 0.253641, 0.708001, 0.264276, 0.695260, 0.401568,
+    0.438820, 0.236081, 0.533919, 0.920642, 0.940531, 0.443072, 0.062857,
+    0.384226, 0.959592, 0.822518, 0.748285, 0.919477, 0.111325, 0.791501,
+    0.260124, 0.284747, 0.584375, 0.716350, 0.675431, 0.863009, 0.490184,
+    0.718676, 0.859665, 0.863666, 0.897301, 0.825393, 0.117308, 0.605302,
+    0.089669, 0.812568, 0.006870, 0.528489, 0.048649, 0.540788, 0.449131,
+    0.989180, 0.983860, 0.511988, 0.373407, 0.943452, 0.334506, 0.121692,
+    0.862929, 0.445831, 0.913193, 0.123053, 0.730578, 0.497568, 0.839402,
+    0.406009, 0.360577, 0.329586, 0.124685, 0.220241, 0.193253, 0.021986,
+    0.045634, 0.310560, 0.627288, 0.135303, 0.123128, 0.634158, 0.663792,
+    0.171777, 0.174946, 0.112923, 0.160958, 0.158806, 0.624911, 0.534364,
+    0.102259, 0.959418, 0.656056, 0.965187, 0.405249, 0.569249, 0.088240,
+    0.135827, 0.066817, 0.927642, 0.541836, 0.427393, 0.257229, 0.666520,
+    0.647634, 0.450481, 0.688506, 0.693269, 0.761042, 0.315794, 0.828572,
+    0.884170, 0.949952, 0.492364, 0.055947, 0.124898, 0.605288, 0.216905,
+    0.283705, 0.230199, 0.751269, 0.385963, 0.189616, 0.407326, 0.351151,
+    0.594865, 0.976575, 0.439391, 0.730692, 0.043392, 0.367033, 0.272527,
+    0.470785, 0.624261, 0.939048, 0.118419, 0.074743, 0.627554, 0.811688,
+    0.835784, 0.943348, 0.640260, 0.719954, 0.893300, 0.132625, 0.775901,
+    0.018199, 0.737913, 0.992806, 0.301903, 0.968111, 0.744076, 0.687867,
+    0.157728, 0.151401, 0.039017, 0.752593, 0.127976, 0.478408, 0.483284,
+    0.171368, 0.845441, 0.755811, 0.642153, 0.469702, 0.694859, 0.760572,
+    0.544445, 0.322413, 0.572260, 0.380229, 0.265761, 0.212521, 0.100183,
+    0.159062, 0.345146, 0.876084, 0.177261, 0.083058, 0.868891, 0.479164,
+    0.051169, 0.612966, 0.167030, 0.208897, 0.764367, 0.206048, 0.961490,
+    0.892343, 0.684456, 0.444774, 0.063711, 0.529896, 0.200585, 0.705863,
+    0.999598, 0.895444, 0.466435, 0.544043, 0.217857, 0.038696, 0.924272,
+    0.483618, 0.251217, 0.024455, 0.642680, 0.596362, 0.900539, 0.819941,
+    0.679420, 0.769430, 0.299105, 0.730590, 0.382396, 0.466135, 0.939487,
+    0.146763, 0.672183, 0.900977, 0.039106, 0.356638, 0.345750, 0.102817,
+    0.886535, 0.546336, 0.808681, 0.886133, 0.441780, 0.275116, 0.430176,
+    0.659637, 0.313812, 0.354448, 0.143255, 0.565028, 0.378903, 0.785935,
+    0.161391, 0.279443, 0.605876, 0.840811, 0.048873, 0.904980, 0.571401,
+    0.431269, 0.371115, 0.510887, 0.578032, 0.043298, 0.411864, 0.617138,
+    0.399936, 0.757614, 0.719955, 0.286471, 0.303950, 0.528636, 0.172604,
+    0.745730, 0.803752, 0.602780, 0.405367, 0.117564, 0.957228, 0.548622,
+    0.682592, 0.336131, 0.334557, 0.843983, 0.615574, 0.940433, 0.684794,
+    0.664447, 0.845413, 0.256194, 0.095715, 0.216529, 0.767082, 0.673747,
+    0.259827, 0.178946, 0.290885, 0.659763, 0.936560, 0.010840, 0.946234,
+    0.240510, 0.539476, 0.118838, 0.986240, 0.343228, 0.721618, 0.391606,
+    0.460792, 0.678846, 0.940228, 0.143384, 0.014977, 0.274785, 0.987367,
+    0.630551, 0.215218, 0.672161, 0.294998, 0.060631, 0.928355, 0.390713,
+    0.277160, 0.695436, 0.064460, 0.536987, 0.874382, 0.355345, 0.196751,
+    0.810942, 0.366185, 0.142985, 0.051452, 0.905661, 0.261823, 0.037691,
+    0.248889, 0.983441, 0.429297, 0.709681, 0.662286, 0.369525, 0.853066,
+    0.677263, 0.644310, 0.840433, 0.307814, 0.859528, 0.512593, 0.602812,
+    0.920160, 0.440948, 0.993525, 0.197320, 0.136384, 0.057984, 0.734307,
+    0.010766, 0.413329, 0.931058, 0.821707, 0.779514, 0.074043, 0.873159,
+    0.685175, 0.335865, 0.910850, 0.934065, 0.319306, 0.340147, 0.643746,
+    0.981592, 0.709673, 0.496812, 0.658856, 0.353983, 0.337245, 0.966670,
+    0.213511, 0.849838, 0.569482, 0.133671, 0.290786, 0.563007, 0.330991,
+    0.427170, 0.620991, 0.065299, 0.437936, 0.034320, 0.996356, 0.259643,
+    0.813834, 0.070399, 0.132802, 0.499009, 0.406265, 0.043652, 0.433074,
+    0.725570, 0.383800, 0.076820, 0.707163, 0.093473, 0.573632, 0.366018,
+    0.447456, 0.910877, 0.332688, 0.660967, 0.760714, 0.902170, 0.794638,
+    0.051500, 0.465177, 0.125630, 0.478670, 0.086168, 0.190928, 0.916605,
+    0.120488, 0.187285, 0.176248, 0.934322, 0.257684, 0.309050, 0.433331,
+    0.663949, 0.352703, 0.866405, 0.389519, 0.736502, 0.943226, 0.096682,
+    0.829975, 0.516858, 0.462700, 0.277430, 0.427734, 0.795388, 0.938398,
+    0.188449, 0.697558, 0.733036, 0.239948, 0.162735, 0.858666, 0.718618,
+    0.248903, 0.049594, 0.635223, 0.369391, 0.236879, 0.811472, 0.303713,
+    0.494563, 0.120522, 0.737044, 0.158511, 0.473225, 0.603450, 0.548030,
+    0.209727, 0.546675, 0.644712, 0.039702, 0.063533, 0.107412, 0.317132,
+    0.491267, 0.902800, 0.255530, 0.679716, 0.600359, 0.988566, 0.919664,
+    0.763094, 0.847232, 0.638283, 0.011997, 0.896825, 0.273506, 0.381388,
+    0.133704, 0.084978, 0.685101, 0.628267, 0.205500, 0.422145, 0.786778,
+    0.678725, 0.025595, 0.334808, 0.888452, 0.572271, 0.979520, 0.928154,
+    0.635804, 0.086932, 0.245286, 0.127071, 0.989732, 0.500816, 0.806787,
+    0.590091, 0.489382, 0.726451, 0.353185, 0.336614, 0.364734, 0.365182,
+    0.233439, 0.638240, 0.746570, 0.367143, 0.723218, 0.431671, 0.995410,
+    0.928718, 0.853816, 0.782188, 0.607442, 0.879411, 0.116995, 0.495894,
+    0.451682, 0.096515, 0.424048, 0.087485, 0.183447, 0.669334, 0.214556,
+    0.173179, 0.170151, 0.021343, 0.763269, 0.659533, 0.747794, 0.116454,
+    0.996147, 0.112528, 0.481635, 0.229586, 0.750768, 0.228205, 0.596730,
+    0.473985, 0.659876, 0.592139, 0.402703, 0.513692, 0.374327, 0.010145,
+    0.393103, 0.491322, 0.506039, 0.844785, 0.587837, 0.930088, 0.932270,
+    0.771284, 0.599422, 0.146826, 0.944463, 0.769573, 0.168169, 0.707732,
+    0.429106, 0.915964, 0.824186, 0.425253, 0.028492, 0.305821, 0.654839,
+    0.779259, 0.534026, 0.251569, 0.253245, 0.193901, 0.843708, 0.655947,
+    0.707593, 0.218035, 0.666093, 0.100696, 0.709357, 0.172132, 0.945481,
+    0.297195, 0.102220, 0.877751, 0.068479, 0.701642, 0.024577, 0.012941,
+    0.471215, 0.192747, 0.720673, 0.900321, 0.108710, 0.544859, 0.325574,
+    0.137202, 0.850679, 0.980413, 0.916462, 0.384705, 0.231982, 0.169706,
+    0.578607, 0.075690, 0.825654, 0.286200, 0.293725, 0.491746, 0.386896,
+    0.003083, 0.663878, 0.332377, 0.300278, 0.766098, 0.210128, 0.368756,
+    0.467740, 0.234705, 0.381697, 0.938955, 0.427451, 0.102370, 0.839275,
+    0.536162, 0.647229, 0.164849, 0.673364, 0.497908, 0.145262, 0.589825,
+    0.882613, 0.377244, 0.759532, 0.461220, 0.452934, 0.585185, 0.747420,
+    0.746660, 0.076932, 0.134316, 0.749743, 0.740810, 0.466692, 0.050020,
+    0.506908, 0.676820, 0.418776, 0.974648, 0.911525, 0.800474, 0.913602,
+    0.338976, 0.902844, 0.752878, 0.875138, 0.550072, 0.917727, 0.548502,
+    0.047981, 0.062989, 0.138327, 0.930594, 0.440233, 0.897859, 0.391814,
+    0.893168, 0.483044, 0.139234, 0.639828, 0.559975, 0.273549, 0.389570,
+    0.300785, 0.740242, 0.439590, 0.807693, 0.417062, 0.858367, 0.782341,
+    0.328586, 0.658840, 0.695943, 0.667562, 0.561684, 0.448821, 0.542700,
+    0.111756, 0.366548, 0.091202, 0.159737, 0.429537, 0.229529, 0.090331,
+    0.869770, 0.127388, 0.482145, 0.762938, 0.610432, 0.621379, 0.402765,
+    0.170407, 0.894928, 0.792336, 0.471192, 0.635170, 0.231926, 0.278886,
+    0.052232, 0.090293, 0.061226, 0.380818, 0.749133, 0.757170, 0.048380,
+    0.310817, 0.205990, 0.591080, 0.422573, 0.572538, 0.682282, 0.582310,
+    0.002075, 0.911812, 0.672641, 0.871845, 0.039199, 0.154786, 0.634783,
+    0.649631, 0.776165, 0.037548, 0.820038, 0.671093, 0.829884, 0.291231,
+    0.306263, 0.061810, 0.570116, 0.358495, 0.152103, 0.631343, 0.739313,
+    0.901236, 0.388512, 0.787693, 0.212053, 0.594503, 0.378773, 0.634626,
+    0.167040, 0.061056, 0.216937, 0.169115, 0.972867, 0.889578, 0.040960,
+    0.012067, 0.044364, 0.675743, 0.661698, 0.820529, 0.713291, 0.481736,
+    0.491623, 0.543175, 0.772966, 0.797886, 0.604985, 0.343083, 0.156380,
+    0.757088, 0.974425, 0.895693, 0.658324, 0.362938, 0.683386, 0.870376,
+    0.957440, 0.062159, 0.505002, 0.124481, 0.123215, 0.721939, 0.293596,
+    0.096082, 0.611517, 0.334556, 0.108149, 0.655881, 0.010299, 0.769846,
+    0.476411, 0.723590, 0.251582, 0.968033, 0.266765, 0.024548, 0.765919,
+    0.871750, 0.367631, 0.922299, 0.628838, 0.342056, 0.817992, 0.287162,
+    0.704994, 0.501378, 0.157538, 0.662434, 0.563537, 0.662541, 0.786915,
+    0.686752, 0.384480, 0.080511, 0.782834, 0.995997, 0.415067, 0.890983,
+    0.651878, 0.425365, 0.660829, 0.128289, 0.148956, 0.912411, 0.096322,
+    0.415721, 0.936959, 0.862241, 0.287471, 0.304590, 0.784540, 0.916309,
+    0.646646, 0.602533, 0.203471, 0.351640, 0.103911, 0.361009, 0.014074,
+    0.667448, 0.023550, 0.800989, 0.354200, 0.408030, 0.881500, 0.137034,
+    0.404026, 0.296566, 0.028017, 0.055904, 0.721932, 0.688846, 0.184193,
+    0.870887, 0.601257, 0.280515, 0.286608, 0.538216, 0.142755, 0.574079,
+    0.842806, 0.927296, 0.490388, 0.489452, 0.529828, 0.693859, 0.841092,
+    0.633739, 0.054869, 0.855167, 0.301187, 0.078419, 0.656156, 0.655388,
+    0.486448, 0.537656, 0.792422, 0.890475, 0.834222, 0.820439, 0.946379,
+    0.556153, 0.509285, 0.130571, 0.427041, 0.110542, 0.411086, 0.713648,
+    0.648758, 0.553842, 0.287727, 0.491563, 0.481137, 0.778116, 0.981015,
+    0.010966, 0.471975, 0.822107, 0.644705, 0.526844, 0.677274, 0.945892,
+    0.605263, 0.333430, 0.601280, 0.091711, 0.871086, 0.393702, 0.982186,
+    0.705307, 0.214141, 0.928564, 0.261461, 0.723426, 0.059136, 0.688501,
+    0.833968, 0.470222, 0.402150, 0.482725, 0.024063, 0.689877, 0.974289,
+    0.505201, 0.467993, 0.955304, 0.516166, 0.939968, 0.777411, 0.160871,
+    0.466812, 0.454685, 0.106763, 0.072075, 0.788115, 0.708043, 0.163786,
+    0.659201, 0.101744, 0.145971, 0.364508, 0.315885, 0.074536, 0.625969,
+    0.039311, 0.133672, 0.314471, 0.873279, 0.603893, 0.716620, 0.356004,
+    0.627957, 0.406498, 0.330292, 0.133157, 0.874490, 0.285596, 0.649324,
+    0.814458, 0.063007, 0.810195, 0.281270, 0.517693, 0.916958, 0.353345,
+    0.305808, 0.625000, 0.517131, 0.965009, 0.726745, 0.663102, 0.329518,
+    0.042630, 0.737638, 0.955487, 0.081940, 0.871310, 0.269957, 0.955219,
+    0.475203, 0.986578, 0.311223, 0.103160, 0.393075, 0.641515, 0.236317,
+    0.267566, 0.927112, 0.885641, 0.082024, 0.990119, 0.695835, 0.363295,
+    0.507812, 0.612793, 0.716640, 0.813620, 0.237793, 0.233770, 0.778629,
+    0.964538, 0.896872, 0.108147, 0.007167, 0.634510, 0.063633, 0.089108,
+    0.505820, 0.333591, 0.044327, 0.981023, 0.320168, 0.355550, 0.084182,
+    0.713244, 0.997065, 0.320499, 0.980810, 0.924177, 0.206140, 0.062834,
+    0.914296, 0.901975, 0.426129, 0.422107, 0.514768, 0.142768, 0.235727,
+    0.752561, 0.376539, 0.014356, 0.717099, 0.273411, 0.122502, 0.724266,
+    0.907921, 0.186136, 0.813374, 0.413741, 0.519726, 0.857701, 0.394764,
+    0.839895, 0.213251, 0.478946, 0.553139, 0.210317, 0.799446, 0.533948,
+    0.134493, 0.005586, 0.596782, 0.048789, 0.907561, 0.022911, 0.470896,
+    0.422329, 0.165679, 0.706623, 0.174890, 0.542218, 0.720979, 0.891989,
+    0.815629, 0.843481, 0.616255, 0.723551, 0.029617, 0.429630, 0.137292,
+    0.549343, 0.287331, 0.532056, 0.389238, 0.500583, 0.011002, 0.942377,
+    0.710899, 0.810448, 0.476326, 0.845392, 0.816033, 0.073108, 0.894181,
+    0.723594, 0.096019, 0.365077, 0.145923, 0.261699, 0.071700, 0.320813,
+    0.803917, 0.792679, 0.212802, 0.619546, 0.636160, 0.829057, 0.343096,
+    0.665777, 0.258687, 0.480388, 0.215121, 0.546018, 0.012444, 0.604359,
+    0.046601, 0.023446, 0.546736, 0.757500, 0.833893, 0.023062, 0.602892,
+    0.649927, 0.096170, 0.497074, 0.373521, 0.192189, 0.862151, 0.519444,
+    0.453887, 0.933851, 0.840257, 0.257804, 0.726531, 0.053058, 0.877350,
+    0.362691, 0.882115, 0.220446, 0.028468, 0.140802, 0.700834, 0.243589,
+    0.686821, 0.713278, 0.847948, 0.733421, 0.736723, 0.394684, 0.490921,
+    0.570617, 0.417746, 0.093813, 0.220543, 0.513916, 0.590887, 0.594064,
+    0.706105, 0.453038, 0.113508, 0.159992, 0.386889, 0.953765, 0.417796,
+    0.113420, 0.006823, 0.295146, 0.476111, 0.888938, 0.515592, 0.504579,
+    0.029741, 0.216426, 0.748168, 0.716561, 0.929703, 0.596117, 0.449982,
+    0.666427, 0.990801, 0.940903, 0.237043, 0.408547, 0.034717, 0.457587,
+    0.922463, 0.625603, 0.051651, 0.628568, 0.078641, 0.165159, 0.788560,
+    0.465530, 0.118923, 0.206356, 0.578950, 0.125746, 0.501502, 0.055060,
+    0.014685, 0.017094, 0.559640, 0.044425, 0.233519, 0.307808, 0.760986,
+    0.163223, 0.903925, 0.210969, 0.829650, 0.894726, 0.151872, 0.066693,
+    0.303273, 0.186589, 0.524279, 0.225736, 0.812192, 0.575930, 0.854304,
+    0.890833, 0.741089, 0.642864, 0.356363, 0.860012, 0.849220, 0.935313,
+    0.985758, 0.350722, 0.990373, 0.000443, 0.367815, 0.550013, 0.044868,
+    0.601335, 0.857820, 0.805855, 0.764557, 0.761745, 0.016823, 0.594207,
+    0.656471, 0.168696, 0.660900, 0.959744, 0.355284, 0.185179, 0.185480,
+    0.167477, 0.761110, 0.039784, 0.058310, 0.502199, 0.682648, 0.414673,
+    0.362211, 0.531868, 0.349985, 0.347969, 0.882589, 0.340358, 0.348412,
+    0.250404, 0.890371, 0.393280, 0.851739, 0.748191, 0.199135, 0.616297,
+    0.509936, 0.215958, 0.210504, 0.166407, 0.384654, 0.871404, 0.126151,
+    0.739938, 0.056583, 0.311631, 0.907415, 0.817693, 0.351415, 0.965724,
+    0.319891, 0.034062, 0.380397, 0.682102, 0.565930, 0.730382, 0.030072,
+    0.448519, 0.070741, 0.378484, 0.698924, 0.961112, 0.771764, 0.550663,
+    0.709303, 0.970899, 0.166959, 0.219239, 0.186857, 0.377463, 0.385647,
+    0.571511, 0.248867, 0.511798, 0.311449, 0.305450, 0.823429, 0.218864,
+    0.123142, 0.174844, 0.184588, 0.443034, 0.208906, 0.564986, 0.125136,
+    0.774836, 0.295368, 0.155207, 0.223355, 0.366109, 0.533691, 0.922279,
+    0.327221, 0.305455, 0.472942, 0.036524, 0.276354, 0.639901, 0.255763,
+    0.463211, 0.017364, 0.641410, 0.034722, 0.266231, 0.153207, 0.346171,
+    0.571680, 0.976636, 0.565036, 0.694822, 0.151480, 0.749624, 0.137856,
+    0.360386, 0.314610, 0.262992, 0.135222, 0.609978, 0.418200, 0.358578,
+    0.976087, 0.951891, 0.280856, 0.303307, 0.257346, 0.753798, 0.339831,
+    0.533700, 0.393699, 0.595594, 0.996911, 0.411063, 0.237003, 0.031634,
+    0.677294, 0.390211, 0.377805, 0.248974, 0.366847, 0.942841, 0.943796,
+    0.518327, 0.692465, 0.081653, 0.878713, 0.007074, 0.344645, 0.013936,
+    0.617052, 0.762845, 0.372513, 0.593138, 0.714736, 0.653370, 0.896446,
+    0.972082, 0.407168, 0.236276, 0.505782, 0.800867, 0.831870, 0.502693,
+    0.211930, 0.068873, 0.534327, 0.889224, 0.459084, 0.912132, 0.138197,
+    0.825931, 0.854972, 0.081994, 0.344259, 0.547437, 0.163646, 0.222972,
+    0.554511, 0.508291, 0.236908, 0.171563, 0.271135, 0.609421, 0.764701,
+    0.985871, 0.262790, 0.661147, 0.957953, 0.669958, 0.897423, 0.463734,
+    0.470825, 0.729293, 0.966427, 0.682755, 0.798166, 0.500754, 0.571978,
+    0.257251, 0.412886, 0.710176, 0.083182, 0.267858, 0.792169, 0.427441,
+    0.815295, 0.955815, 0.650413, 0.369805, 0.464106, 0.887320, 0.541368,
+    0.735242, 0.496741, 0.306069, 0.721113, 0.759531, 0.967216, 0.679065,
+    0.429489, 0.864639, 0.142799, 0.900314, 0.593932, 0.109227, 0.583069,
+    0.392098, 0.609981, 0.155047, 0.649349, 0.022867, 0.865222, 0.732531,
+    0.290725, 0.657392, 0.159972, 0.106019, 0.613207, 0.810384, 0.475824,
+    0.077313, 0.697704, 0.017192, 0.812555};
+
+static float golden_endtoend_output[] = {
+    -1.881211, -0.028385, -3.585066, 1.939770,  -3.461155, 1.280415,  -4.408978,
+    0.608663,  -2.704937, 1.859742,  -5.777429, 2.691839,  -1.049012, 1.640870,
+    -4.856245, 1.604236,  0.992707,  0.422858,  -4.307465, 1.887332,  -0.884831,
+    -0.154277, -2.634801, 0.586827,  -1.849960, 1.399608,  -4.531559, 1.943591,
+    0.271676,  -2.893054, -2.066826, 0.235467,  -1.248263, -1.164534, -2.640174,
+    -0.112878, -4.386484, 1.253024,  -4.135623, 1.068984,  -0.043579, -0.832957,
+    -3.257258, -0.514396, -1.651174, 0.638630,  -4.364372, 1.548441,  -0.289455,
+    0.539845,  -4.097627, 0.635001,  -0.465071, -0.927701, -2.481498, 0.356616,
+    -2.355012, 0.728806,  -3.340283, 1.609038,  -4.786268, -0.532272, -1.886150,
+    0.254797,  0.746620,  -1.657134, -3.264265, 0.525551,  -1.756837, 0.845446,
+    -5.572190, 1.715797,  -2.856942, 3.394245,  -5.803662, 2.281806,  -3.014739,
+    2.616136,  -4.728482, 1.659984,  -2.106307, 2.711709,  -6.173832, 1.352869,
+    -0.038035, 0.107619,  -4.279774, 2.341930,  -0.980413, -0.119538, -4.049717,
+    1.172128,  -3.477744, 2.602274,  -6.231380, 2.537300,  -0.862214, 0.568722,
+    -3.858362, 0.197867,  -1.725885, 3.687312,  -7.067363, 2.403544,  -0.944963,
+    0.235639,  -3.250094, 0.659117,  -1.459576, 0.426128,  -3.637207, 1.030386,
+    -4.224351, 3.516220,  -6.053367, 0.993473,  -2.182416, -0.762625, -1.884405,
+    -0.113736, -2.572602, 0.329290,  -1.913233, 0.517418,  -0.019757, 0.203176,
+    -3.715881, 0.482136,  -1.912823, 1.357907,  -5.473043, 1.714658,  -3.177160,
+    0.089285,  -3.127669, 1.268076,  0.772498,  -1.622712, -3.850314, 0.436124,
+    -1.495983, 3.439982,  -7.623405, 1.726721,  -0.423979, 0.180201,  -2.902406,
+    0.986457,  -1.845638, 0.460903,  -5.359343, -1.133931, -1.074456, 0.717304,
+    -3.519856, 1.012126,  -0.562301, 1.881967,  -6.716627, 2.525036,  0.945480,
+    0.337081,  -5.210562, 2.572035,  -0.943370, 0.442026,  -2.666313, 0.411296,
+    0.002787,  -0.000735, -2.498933, 0.771719,  -3.568153, 3.833721,  -6.617026,
+    2.813922,  -0.573970, 1.025208,  -3.909923, 1.722648,  -1.406849, 0.719783,
+    -5.207438, 1.819442,  -0.530895, -0.010887, -2.939614, 0.971225,  -1.660297,
+    1.345243,  -4.454571, 2.244876,  -2.021213, 1.756090,  -4.880947, 0.364597,
+    -2.380270, 2.763117,  -5.613013, 2.137534,  0.289101,  -2.279400, -3.365582,
+    0.170028,  -1.142254, -0.709604, -3.656223, 1.804870,  -0.854690, 0.592102,
+    -5.010415, 2.462687,  -1.474710, 0.566002,  -3.621819, -0.391946, -0.423524,
+    -0.631428, -3.513310, 0.962825,  -1.480262, 0.319791,  -3.610137, 1.842339,
+    -0.250073, 1.182022,  -6.249267, 1.604172,  1.153759,  -0.734054, -4.620415,
+    -0.030858, 0.050911,  1.524406,  -4.724010, 1.451846,  -3.277104, 2.414182,
+    -4.605285, 1.846092,  -1.503047, -0.618200, -2.746546, -0.459332, -0.980326,
+    -1.199977, -2.043865, -0.165793, -2.214698, 3.108281,  -7.127830, -0.123065,
+    1.244948,  -3.039923, -4.660061, -0.225957, -0.307210, -1.513205, -2.456005,
+    0.840048,  -0.741445, 2.328635,  -6.015267, 2.723240,  -1.381171, -0.728878,
+    -5.114925, -0.362034, -0.574923, 0.518080,  -3.892457, 1.798948,  0.435119,
+    -0.371696, -2.807571, 1.302864,  -2.063052, 1.036388,  -4.232038, 1.397059,
+    -1.615668, -1.511019, -3.095508, 1.290955,  -3.428723, 2.000287,  -4.196487,
+    1.566983,  0.196957,  0.224343,  -4.926359, -0.691975, -0.214941, 1.546821,
+    -5.384868, 2.290820,  -1.878865, 0.493692,  -4.129823, 2.112036,  0.516558,
+    -2.553077, -2.717338, 0.017146,  -2.016057, 1.628995,  -4.240602, 1.189533,
+    -5.460220, 1.254738,  -4.214903, 0.755659,  -2.893235, 2.937762,  -6.169453,
+    2.035456,  -5.613212, -0.122254, -1.973646, -0.060619, -2.119598, 1.413512,
+    -4.938738, 1.890244,  0.544169,  -2.062413, -3.329637, -0.062515, -1.855805,
+    -0.791297, -2.570353, 0.607615,  0.305812,  0.338930,  -4.150270, 2.274937,
+    0.042653,  0.133825,  -3.538155, 1.523639,  -3.173690, -1.496599, -2.414655,
+    0.464687,  -1.448998, -0.368907, -3.520129, 0.203382,  -2.443626, 1.266233,
+    -3.393848, 0.605911,  -0.015353, 1.402006,  -4.441003, 1.419281,  0.603587,
+    0.434146,  -4.966566, 2.171872,  -0.688264, -0.009981, -4.461103, 1.538354,
+    -5.029816, -0.264424, -1.713510, -0.315258, -1.891606, 0.252074,  -2.419428,
+    0.043970,  -1.291143, 2.048704,  -4.590105, 0.524734,  -1.889576, 0.134836,
+    -3.462745, 1.390663,  -0.112773, 0.402735,  -4.203784, 1.381043,  -1.201634,
+    -1.968277, -1.425637, -0.181725, -1.250742, -2.102041, -3.925464, -1.256797,
+    -3.701354, -1.754610, -1.917231, -1.455910, -1.838006, 2.041781,  -5.666212,
+    2.752957,  -2.659553, 2.553637,  -4.872212, 1.443437,  -2.081846, 3.311263,
+    -5.912457, 1.871049,  0.196148,  -0.307044, -4.024967, 2.149149,  0.361809,
+    0.620415,  -5.939984, 0.180672,  -1.209180, -0.269122, -3.240285, 1.460315,
+    -1.040803, 1.125700,  -6.060366, 0.887767,  -3.214111, 1.314368,  -3.026808,
+    1.023640,  -3.815175, 1.795642,  -4.355603, 1.064454,  -0.046472, 0.618463,
+    -5.941646, 2.861891,  -2.852155, -0.990457, -2.624445, 1.794494,  -1.176747,
+    -0.358159, -3.206776, 1.138721,  -2.819523, -1.825522, -1.450902, -0.187312,
+    -0.808727, 0.636872,  -4.120567, 1.192623,  0.810731,  -1.768519, -3.699450,
+    1.527116,  -2.772720, 3.012835,  -5.912736, 1.599365,  -4.696381, 2.234591,
+    -4.139552, 1.061768,  -1.880089, 3.596274,  -7.006379, 2.382152,  -3.158115,
+    3.844430,  -7.044156, 2.307596,  -2.473970, 1.312644,  -5.467269, 0.197154,
+    -1.530040, 1.762275,  -5.550757, 0.630276,  -3.048947, 1.043777,  -3.096658,
+    1.345893,  -1.329494, 2.065748,  -4.711032, 2.227600,  -0.413321, -0.032428,
+    -4.599650, 1.668734,  -4.351490, -0.200022, -2.359903, 0.021997,  0.116028,
+    1.159718,  -5.093972, -0.142951, -2.409895, 0.906133,  -2.728812, 0.809932,
+    -2.597363, 0.494130,  -2.357861, 0.369825,  -2.165235, 1.148522,  -3.130562,
+    0.759034,  0.646335,  -1.463660, -3.508299, 1.059679,  -1.485465, 1.007319,
+    -4.340716, 1.789864,  -1.590654, 1.612324,  -4.452007, 2.389805,  -5.200148,
+    -1.068398, -1.306923, -0.472408, -0.392165, -0.524996, -2.933478, 1.518430,
+    -1.287781, 0.113422,  -3.020525, 1.338359,  -0.105982, 0.936014,  -4.132197,
+    1.836807,  -0.616589, -1.029716, -3.271347, 0.284889,  -2.653359, 2.135829,
+    -4.643613, 1.627981,  0.287733,  -2.017263, -2.776574, 1.184792,  1.004161,
+    -1.483019, -4.339290, -0.787322, 0.582420,  1.137839,  -5.673941, -0.001862,
+    -1.219142, 0.532561,  -4.457245, 1.826807,  -3.343291, 3.034610,  -6.179855,
+    2.235917,  -4.369989, 4.018128,  -6.632714, 0.926585,  -0.485469, 0.536073,
+    -4.179557, 1.489637,  -0.521762, 1.636089,  -6.137912, 1.500867,  -4.086009,
+    1.961372,  -3.688977, 1.358220,  -1.544034, 1.763837,  -4.357567, 1.852201,
+    -2.018725, 1.046264,  -6.211127, 1.609419,  -0.118441, 1.602284,  -6.242423,
+    1.518578,  -0.604078, 1.106613,  -5.393445, 2.595629,  0.142712,  -1.903953,
+    -2.821177, 0.032758,  -0.009152, 0.184628,  -4.227636, 2.046843,  -2.240138,
+    1.256176,  -5.108516, -0.308447, -2.998571, 4.657396,  -7.582112, 2.510951,
+    -3.535784, 1.704560,  -5.068484, 1.318466,  -3.058265, 3.073172,  -6.998089,
+    3.178849,  -2.420286, 2.277806,  -4.999528, 1.423890,  -1.672914, 0.447460,
+    -4.088940, 1.351087,  -1.051546, -0.417955, -4.042147, 1.604102,  -1.700931,
+    2.796663,  -6.497579, 2.857974,  -0.240828, 0.858001,  -5.778933, 2.778508,
+    -0.406211, 1.300766,  -5.073671, 2.089362,  -0.201673, 1.588396,  -6.000150,
+    2.185055,  -2.332125, 0.768216,  -2.609184, 0.327277,  -3.358943, -1.020736,
+    -2.389984, 0.315512,  -0.561905, 1.948740,  -6.408485, 2.231985,  -0.603652,
+    0.661829,  -5.070386, -1.063058, -0.624796, 1.375772,  -4.379606, 1.929358,
+    -1.047263, 0.739100,  -5.217857, 2.127625,  -5.025338, 0.650344,  -2.068460,
+    0.076936,  -0.457505, -1.050984, -1.917765, 1.150908,  0.782625,  0.855595,
+    -5.321719, 0.787209,  -0.460232, 1.106736,  -5.552326, 2.801043,  -0.360217,
+    -0.434432, -4.273378, 0.967556,  -0.972652, 0.874811,  -5.429918, -0.331039,
+    0.115477,  0.111883,  -5.418786, 1.240546,  -1.842794, 0.505880,  -3.676064,
+    -0.682369, 1.858984,  -0.742566, -5.784060, 0.673239,  -1.280398, 0.280842,
+    -4.848077, 2.214860,  -0.785100, -0.588488, -2.438206, 0.786651,  -1.568752,
+    1.935400,  -6.320256, 2.125338,  -1.476457, -1.651941, -2.695734, 0.007338,
+    -3.280860, 2.310385,  -5.319578, 1.890123,  -0.775723, 0.630606,  -4.321582,
+    1.085521,  -1.847371, 1.188521,  -4.596577, 2.056443,  -2.340172, -0.108501,
+    -3.156392, 0.933279,  -0.495331, 0.122405,  -5.171133, 1.763245,  -0.796913,
+    2.310487,  -7.247197, 2.401678,  -1.908860, 0.043798,  -2.393796, 0.573806,
+    -0.608531, 0.154710,  -4.669001, 0.750680,  0.468380,  0.392591,  -4.755001,
+    2.615217,  -1.957774, 1.153513,  -4.530099, 1.124362,  -3.569415, 1.697154,
+    -3.536335, 0.910758,  -2.976264, 1.833129,  -4.287203, -0.547050, -2.409768,
+    0.061585,  -1.324116, 0.268497,  -2.962222, -1.524245, -2.063413, 0.442058,
+    -4.292337, 3.538863,  -6.699603, 1.718664,  -2.290363, 1.994596,  -6.245037,
+    -0.433084, -0.367059, 1.020297,  -4.940721, 2.902264,  -0.577056, -0.709887,
+    -5.001413, -0.268316, -1.112048, -1.083307, -1.753492, 0.209973,  0.139540,
+    0.917602,  -5.232745, 2.538467,  -2.139234, -0.187388, -1.837249, -0.478582,
+    -0.731653, -0.481550, -2.531261, 1.044770,  0.707750,  0.279971,  -3.221119,
+    1.552074,  -2.373144, 0.859518,  -3.665156, 1.620278,  -1.440871, -0.525581,
+    -2.758271, 1.491873,  -2.302013, 1.119935,  -5.257080, 2.627170,  -3.174739,
+    1.363282,  -4.831639, 1.101076,  -4.337008, 2.689639,  -5.165915, 1.069201,
+    -1.882078, -0.120370, -2.287967, 1.147619,  -1.403616, 1.077150,  -5.084296,
+    1.658236,  -0.919642, 0.487423,  -3.001075, 0.741268,  0.107300,  0.943556,
+    -3.544311, 1.000239,  -1.627171, 2.871253,  -5.179172, 1.429893,  -0.826040,
+    0.188670,  -4.499894, 1.013447,  -2.101299, 0.317516,  -3.452141, -0.833776,
+    -1.362144, 1.272437,  -4.449355, 1.613591,  -2.039873, 2.613175,  -6.229640,
+    1.659790,  -1.595520, -0.237462, -2.744997, 0.337841,  0.148981,  -1.703771,
+    -2.388023, 1.276469,  1.058508,  -0.401642, -4.680769, 0.861881,  -1.336381,
+    1.153080,  -2.834378, 0.721075,  0.900115,  1.360511,  -5.573611, 0.949182,
+    -2.970844, 2.017563,  -5.186108, -0.201038, -1.192824, 0.610142,  -4.450919,
+    -0.897114, -1.812093, 0.422310,  -5.245487, 0.256549,  0.320275,  -2.324150,
+    -2.967040, -0.260536, -0.721467, 0.454148,  -5.058031, 0.526370,  -0.895656,
+    0.732240,  -3.327363, 1.353953,  -1.277912, -0.483171, -1.926713, 0.065044,
+    -2.167506, -0.196606, -1.923437, 0.604962,  -2.088319, 1.406834,  -5.227296,
+    2.247351,  -4.421744, 1.729791,  -5.007922, 1.264769,  -0.897019, 0.922902,
+    -3.887108, 2.087432,  -1.310226, -0.101938, -3.359082, -0.079662, -0.514988,
+    -0.963179, -4.038209, 2.223278,  -0.590083, -2.310458, -1.748338, 0.363406,
+    -0.540731, -0.885913, -4.179595, 2.216781,  -3.044339, -0.447100, -2.446098,
+    0.931101,  -1.676190, 2.096175,  -4.980755, 2.262151,  -1.095047, 1.897516,
+    -5.996138, 2.191038,  0.297128,  -0.780974, -2.884299, 1.195408,  -0.521065,
+    -1.955837, -3.091064, -0.404183, -1.961519, 4.076096,  -7.521851, 2.242064,
+    -1.988043, 0.303300,  -2.422585, 0.322230,  -3.377634, 3.499955,  -7.084434,
+    2.375587,  -0.718851, 2.150076,  -5.412241, 2.374280,  -2.006088, 2.229828,
+    -5.848188, 2.543077,  -2.171042, 2.096026,  -5.300007, 0.141405,  -1.187745,
+    0.105340,  -4.003816, 1.034281,  -3.980804, 1.856709,  -5.103042, 0.623737,
+    -2.080307, 0.896140,  -3.104050, 0.983158,  -0.424898, -1.154270, -3.805728,
+    1.978917,  -1.314387, 1.235096,  -3.148906, 1.113173,  0.111713,  2.055213,
+    -7.565283, 2.100342};
+constexpr std::initializer_list<float> biases = {
+    0.065691948, -0.69055247, 0.1107955,  -0.97084129, -0.23957068, -0.23566568,
+    -0.389184,   0.47481549,  -0.4791103, 0.29931796,  0.10463274,  0.83918178,
+    0.37197268,  0.61957061,  0.3956964,  -0.37609905};
+
+constexpr std::initializer_list<float> recurrent_weights = {
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1};
+
+class BidirectionalRNNOpModel : public SingleOpModel {
+ public:
+  BidirectionalRNNOpModel(int batches, int sequence_len, int fw_units,
+                          int bw_units, int input_size)
+      : batches_(batches),
+        sequence_len_(sequence_len),
+        fw_units_(fw_units),
+        bw_units_(bw_units),
+        input_size_(input_size) {
+    input_ = AddInput(TensorType_FLOAT32);
+    fw_weights_ = AddInput(TensorType_FLOAT32);
+    fw_recurrent_weights_ = AddInput(TensorType_FLOAT32);
+    fw_bias_ = AddInput(TensorType_FLOAT32);
+    fw_hidden_state_ = AddOutput(TensorType_FLOAT32);
+    fw_output_ = AddOutput(TensorType_FLOAT32);
+    bw_weights_ = AddInput(TensorType_FLOAT32);
+    bw_recurrent_weights_ = AddInput(TensorType_FLOAT32);
+    bw_bias_ = AddInput(TensorType_FLOAT32);
+    bw_hidden_state_ = AddOutput(TensorType_FLOAT32);
+    bw_output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+                 BuiltinOptions_SequenceRNNOptions,
+                 CreateSequenceRNNOptions(builder_, /*time_major=*/false,
+                                          ActivationFunctionType_RELU)
+                     .Union());
+    BuildInterpreter({
+        {batches_, sequence_len_, input_size_},  // input
+        {fw_units_, input_size_},                // fw_weights
+        {fw_units_, fw_units_},                  // fw_recurrent_weights
+        {fw_units_},                             // fw_bias
+        {bw_units_, input_size_},                // bw_weights
+        {bw_units_, bw_units_},                  // bw_recurrent_weights
+        {bw_units_}                              // bw_bias
+    });
+  }
+
+  void SetFwBias(std::initializer_list<float> f) {
+    PopulateTensor(fw_bias_, f);
+  }
+
+  void SetBwBias(std::initializer_list<float> f) {
+    PopulateTensor(bw_bias_, f);
+  }
+
+  void SetFwWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_weights_, f);
+  }
+
+  void SetBwWeights(std::initializer_list<float> f) {
+    PopulateTensor(bw_weights_, f);
+  }
+
+  void SetFwRecurrentWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_recurrent_weights_, f);
+  }
+
+  void SetBwRecurrentWeights(std::initializer_list<float> f) {
+    PopulateTensor(bw_recurrent_weights_, f);
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  void ResetHiddenStates() {
+    const int fw_zero_buffer_size = fw_units_ * batches_;
+    std::unique_ptr<float[]> fw_zero_buffer(new float[fw_zero_buffer_size]);
+    memset(fw_zero_buffer.get(), 0, fw_zero_buffer_size * sizeof(float));
+    PopulateTensor(fw_hidden_state_, 0, fw_zero_buffer.get(),
+                   fw_zero_buffer.get() + fw_zero_buffer_size);
+    const int bw_zero_buffer_size = bw_units_ * batches_;
+    std::unique_ptr<float[]> bw_zero_buffer(new float[bw_zero_buffer_size]);
+    memset(bw_zero_buffer.get(), 0, bw_zero_buffer_size * sizeof(float));
+    PopulateTensor(bw_hidden_state_, 0, bw_zero_buffer.get(),
+                   bw_zero_buffer.get() + bw_zero_buffer_size);
+  }
+
+  std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); }
+  std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); }
+
+  int input_size() { return input_size_; }
+  int num_fw_units() { return fw_units_; }
+  int num_bw_units() { return bw_units_; }
+  int num_batches() { return batches_; }
+  int sequence_len() { return sequence_len_; }
+
+ private:
+  int input_;
+  int fw_weights_;
+  int fw_recurrent_weights_;
+  int fw_bias_;
+  int fw_hidden_state_;
+  int fw_output_;
+  int bw_weights_;
+  int bw_recurrent_weights_;
+  int bw_bias_;
+  int bw_hidden_state_;
+  int bw_output_;
+
+  int batches_;
+  int sequence_len_;
+  int fw_units_;
+  int bw_units_;
+  int input_size_;
+};
+
+// TODO(mirkov): add another test which directly compares to TF once TOCO
+// supports the conversion from dynamic_rnn with BasicRNNCell.
+TEST(BidirectionalRNNOpTest, BlackBoxTest) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  rnn.ResetHiddenStates();
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  float* batch_start = rnn_input;
+  float* batch_end = batch_start + input_sequence_size;
+  rnn.SetInput(0, batch_start, batch_end);
+  rnn.SetInput(input_sequence_size, batch_start, batch_end);
+
+  rnn.Invoke();
+
+  float* golden_fw_start = rnn_golden_fw_output;
+  float* golden_fw_end =
+      golden_fw_start + rnn.num_fw_units() * rnn.sequence_len();
+  std::vector<float> fw_expected;
+  fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+
+  float* golden_bw_start = rnn_golden_bw_output;
+  float* golden_bw_end =
+      golden_bw_start + rnn.num_bw_units() * rnn.sequence_len();
+  std::vector<float> bw_expected;
+  bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+  bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+  EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected)));
+}
+
+// Check that if the input sequence is reversed the outputs are the same just
+// forward and backward are swapped (and reversed).
+TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  rnn.ResetHiddenStates();
+
+  // Reverse inputs in each batch: in_1, in_2,..., in_k is inserted in the
+  // following order: [in_k,..., in_2, in_1, in_k,...,in_2, in_1].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    const int reverse_idx = rnn.sequence_len() - i - 1;
+    rnn.SetInput(reverse_idx * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((rnn.sequence_len() + reverse_idx) * rnn.input_size(),
+                 batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  // The forward and backward outputs are swapped.
+  std::vector<float> fw_expected;  // consider using std::deque instead.
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_fw_start = rnn_golden_bw_output + i * rnn.num_fw_units();
+    float* golden_fw_end = golden_fw_start + rnn.num_fw_units();
+    fw_expected.insert(fw_expected.begin(), golden_fw_start, golden_fw_end);
+  }
+  fw_expected.insert(fw_expected.end(), fw_expected.begin(), fw_expected.end());
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+
+  std::vector<float> bw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_bw_start = rnn_golden_fw_output + i * rnn.num_bw_units();
+    float* golden_bw_end = golden_bw_start + rnn.num_bw_units();
+    bw_expected.insert(bw_expected.begin(), golden_bw_start, golden_bw_end);
+  }
+  bw_expected.insert(bw_expected.end(), bw_expected.begin(), bw_expected.end());
+  EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected)));
+}
+
+// Tests an end-to-end neural network with a Bidirectional RNN followed by a
+// DNN that aggregates the outputs from the two sequences.
+TEST(BidirectionalRNNOpTest, EndToEndTest) {
+  BidirectionalRNNOpModel rnn(/*batches=*/1, /*sequence_len=*/4,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8);
+  const int output_size = 4;
+  float dnn_weights[] = {
+      -0.5782342,  -0.052212059, 0.73036242,  -0.81216097, -0.80088139,
+      -0.23420811, -0.39647382,  0.31423986,  0.61819065,  -0.73659575,
+      -0.89698344, -0.8931554,   -0.0845688,  0.5617367,   0.38415289,
+      -0.11487955, -0.7617774,   0.17927337,  0.15726972,  0.059798479,
+      0.19009054,  -0.27616632,  -0.39142907, 0.77744663,  -0.046830714,
+      -0.6603595,  0.21945822,   0.051494241, 0.23785079,  0.19239247,
+      -0.53268754, 0.65961659,   -0.85981959, -0.80232513, 0.84745562,
+      -0.66070104, -0.036533296, -0.54901814, 0.65353882,  -0.41834265,
+      -0.28561389, 0.75655544,   -0.31149811, 0.62981737,  0.31829214,
+      -0.92734522, -0.48506218,  0.55651462,  0.25192821,  0.67220747,
+      -0.3836869,  -0.55798125,  -0.60395885, 0.22488403,  -0.78053463,
+      0.3492105,   0.56452453,   0.4389236,   -0.59929526, -0.19762468,
+      -0.36868393, -0.13198286,  -0.53800809, -0.22850353};
+
+  std::initializer_list<float> dnn_biases = {
+    0.29177809, -0.98799044, 0.065919638, 0.68781924};
+
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  rnn.ResetHiddenStates();
+
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  const int output_sequence_size = output_size * rnn.sequence_len();
+  const int num_examples = 64;
+  for (int k = 0; k < num_examples; k++) {
+    float* batch_start = endtoend_input + k * input_sequence_size;
+    float* batch_end = batch_start + input_sequence_size;
+    rnn.SetInput(0, batch_start, batch_end);
+
+    rnn.Invoke();
+
+    std::vector<float> fw_output = rnn.GetFwOutput();
+    std::vector<float> bw_output = rnn.GetBwOutput();
+    EXPECT_EQ(fw_output.size(), bw_output.size());
+
+    std::transform(fw_output.begin(), fw_output.end(), bw_output.begin(),
+                   fw_output.begin(), std::plus<float>());
+
+    std::vector<float> sequence_result;
+    for (int s = 0; s < rnn.sequence_len(); s++) {
+      const float* rnn_output = fw_output.data() + s * rnn.num_fw_units();
+      std::vector<float> results(dnn_biases);
+      for (int i = 0; i < output_size; i++) {
+        for (int j = 0; j < rnn.num_fw_units(); j++) {
+          results[i] += *(rnn_output + j) * dnn_weights[output_size * j + i];
+        }
+      }
+      sequence_result.insert(sequence_result.end(), results.begin(),
+                             results.end());
+    }
+
+    float* golden_start = golden_endtoend_output + k * output_sequence_size;
+    float* golden_end = golden_start + output_sequence_size;
+
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    EXPECT_THAT(sequence_result, ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index f605deaa5b..1fb779fd51 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -31,6 +31,7 @@ TfLiteRegistration* Register_CONV_2D();
 TfLiteRegistration* Register_DEPTHWISE_CONV_2D();
 TfLiteRegistration* Register_SVDF();
 TfLiteRegistration* Register_RNN();
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN();
 TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_RNN();
 TfLiteRegistration* Register_EMBEDDING_LOOKUP();
 TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE();
@@ -73,6 +74,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D());
   AddBuiltin(BuiltinOperator_SVDF, Register_SVDF());
   AddBuiltin(BuiltinOperator_RNN, Register_RNN());
+  AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+             Register_BIDIRECTIONAL_SEQUENCE_RNN());
   AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
              Register_UNIDIRECTIONAL_SEQUENCE_RNN());
   AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP());
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 415d984ad8..3f53a1abe7 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -328,6 +328,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
     case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: {
       TfLiteSequenceRNNParams* params = MallocPOD<TfLiteSequenceRNNParams>();
       if (auto* sequence_rnn_params =
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index d5b9319407..da9ceec2f1 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -319,6 +319,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_SVDF:
       case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
       case tflite::BuiltinOperator_RNN:
+      case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
       case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index ec202cd407..4c82cb9549 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -119,6 +119,7 @@ enum BuiltinOperator : byte {
   SQUEEZE = 43,
   UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
   STRIDED_SLICE = 45,
+  BIDIRECTIONAL_SEQUENCE_RNN = 46,
 }
 
 // Options for the builtin operators.
@@ -224,6 +225,12 @@ table SequenceRNNOptions {
   fused_activation_function:ActivationFunctionType;
 }
 
+// An implementation of TensorFlow bidrectional_dynamic_rnn with RNNCell.
+table BidirectionalSequenceRNNOptions {
+  time_major:bool;
+  fused_activation_function:ActivationFunctionType;
+}
+
 // An implementation of TensorFlow fully_connected (a.k.a Dense) layer.
 table FullyConnectedOptions {
   fused_activation_function:ActivationFunctionType;
-- 
GitLab


From e95537708f070a98607393a8f60bc61f1611a77b Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Fri, 26 Jan 2018 16:51:25 -0800
Subject: [PATCH 281/966] [tf.data] Support for initializing all the tables of
 the given graph.

PiperOrigin-RevId: 183466905
---
 .../contrib/data/python/kernel_tests/BUILD    |  1 +
 .../dataset_serialization_test_base.py        | 19 +++++++++++--------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 1cf0202fd8..04a21f2b0f 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -126,6 +126,7 @@ py_library(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lookup_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index 7cde6e05b2..701fc8247e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -27,6 +27,7 @@ from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -235,8 +236,7 @@ class DatasetSerializationTestBase(test.TestCase):
           ds_fn, sparse_tensors=sparse_tensors)
       with self.test_session(graph=g) as sess:
         self._restore(saver, sess)
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self._initialize(init_op, sess)
         for _ in range(num_outputs):
           actual.append(sess.run(get_next_op))
         if verify_exhausted:
@@ -390,8 +390,7 @@ class DatasetSerializationTestBase(test.TestCase):
       init_op, get_next_op, saver = self._build_graph(
           ds_fn, sparse_tensors=sparse_tensors)
       with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self._initialize(init_op, sess)
         for _ in range(break_point):
           sess.run(get_next_op)
         with self.assertRaises(error):
@@ -493,12 +492,10 @@ class DatasetSerializationTestBase(test.TestCase):
         with self.test_session(graph=g) as sess:
           if ckpt_saved:
             if init_before_restore:
-              sess.run(variables.global_variables_initializer())
-              sess.run(init_op)
+              self._initialize(init_op, sess)
             self._restore(saver, sess)
           else:
-            sess.run(variables.global_variables_initializer())
-            sess.run(init_op)
+            self._initialize(init_op, sess)
           start = break_points[i - 1] if i > 0 else 0
           end = break_points[i] if i < len(break_points) else num_outputs
           num_iters = end - start
@@ -621,8 +618,14 @@ class DatasetSerializationTestBase(test.TestCase):
     saver.save(sess, self._ckpt_path())
 
   def _restore(self, saver, sess):
+    sess.run(lookup_ops.tables_initializer())
     saver.restore(sess, self._latest_ckpt())
 
+  def _initialize(self, init_op, sess):
+    sess.run(variables.global_variables_initializer())
+    sess.run(lookup_ops.tables_initializer())
+    sess.run(init_op)
+
   def _import_meta_graph(self):
     meta_file_path = self._ckpt_path() + ".meta"
     return saver_lib.import_meta_graph(meta_file_path)
-- 
GitLab


From aee7f95a027accc94f1f9130f0cfaecd9399bc1d Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Fri, 26 Jan 2018 16:53:59 -0800
Subject: [PATCH 282/966] Add C0301 line-too-long error to pylint sanity check.

PiperOrigin-RevId: 183467186
---
 .../bayesflow/python/ops/custom_grad_impl.py  |   6 +-
 .../python/kernel_tests/bucketing_test.py     |  33 +-
 tensorflow/contrib/eager/python/datasets.py   |   7 +-
 .../contrib/framework/python/ops/arg_scope.py |  14 +-
 .../fused_conv2d_bias_activation_benchmark.py |  10 +-
 .../single_image_random_dot_stereograms.py    |  33 +-
 .../kernel_methods/python/losses_test.py      |  17 +-
 .../layers/python/layers/layers_test.py       | 221 ++++----
 .../learn/python/learn/datasets/mnist.py      |  15 +-
 .../python/learn/datasets/synthetic_test.py   |  56 +-
 .../python/learn/estimators/debug_test.py     | 112 ++--
 .../learn/estimators/estimator_input_test.py  |  56 +-
 .../estimators/logistic_regressor_test.py     |   5 +-
 .../contrib/learn/python/learn/evaluable.py   |  10 +-
 .../contrib/learn/python/learn/experiment.py  | 126 ++---
 .../python/learn/learn_io/data_feeder.py      |  55 +-
 .../contrib/learn/python/learn/trainable.py   |  32 +-
 .../contrib/losses/python/losses/loss_ops.py  | 130 ++---
 .../examples/cifar10/cifar10_pruning.py       |   2 +-
 .../mpi_collectives/python/ops/mpi_ops.py     |  14 +-
 .../rnn/python/kernel_tests/core_rnn_test.py  | 389 ++++++++------
 tensorflow/contrib/session_bundle/exporter.py |  21 +-
 .../contrib/slim/python/slim/learning_test.py |  28 +-
 .../examples/tutorials/mnist/mnist_softmax.py |  16 +-
 tensorflow/python/client/device_lib_test.py   |   3 +-
 tensorflow/python/debug/wrappers/hooks.py     |  21 +-
 tensorflow/python/framework/dtypes.py         | 209 +++++---
 tensorflow/python/framework/importer.py       |  49 +-
 tensorflow/python/framework/tensor_util.py    | 171 +++---
 tensorflow/python/framework/test_util.py      |  99 ++--
 .../kernel_tests/atrous_convolution_test.py   |  46 +-
 .../candidate_sampler_ops_test.py             |   2 +-
 .../kernel_tests/control_flow_ops_py_test.py  | 198 +++----
 .../python/kernel_tests/cwise_ops_test.py     | 111 ++--
 .../kernel_tests/dynamic_stitch_op_test.py    |  33 +-
 .../partitioned_variables_test.py             |  12 +-
 .../kernel_tests/random/random_ops_test.py    |  21 +-
 .../python/kernel_tests/xent_op_test.py       |  73 +--
 tensorflow/python/ops/image_ops_test.py       | 469 +++++++----------
 tensorflow/python/ops/math_grad.py            |  81 +--
 tensorflow/python/ops/matmul_benchmark.py     |  11 +-
 .../python/ops/matmul_benchmark_test.py       |  75 +--
 .../python/ops/nn_fused_batchnorm_test.py     |   3 +-
 tensorflow/python/ops/nn_ops.py               | 487 ++++++++++--------
 .../python/ops/quantized_conv_ops_test.py     |   3 +-
 tensorflow/python/ops/quantized_ops_test.py   |   5 +-
 tensorflow/python/ops/sparse_ops.py           | 239 +++++----
 tensorflow/python/ops/special_math_ops.py     |   4 +-
 tensorflow/python/saved_model/simple_save.py  |   9 +-
 .../python/training/learning_rate_decay.py    | 145 ++++--
 tensorflow/python/training/rmsprop.py         |  15 +-
 tensorflow/tools/ci_build/ci_sanity.sh        |   3 +-
 tensorflow/tools/ci_build/pylintrc            |  12 +-
 .../tools/dist_test/python/mnist_replica.py   |  32 +-
 54 files changed, 2184 insertions(+), 1865 deletions(-)

diff --git a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
index fdc12e3b21..d44fe6529a 100644
--- a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
@@ -31,8 +31,7 @@ __all__ = [
 ]
 
 
-def custom_gradient(fx, gx, x, axis=(),
-                    fx_gx_manually_stopped=False,
+def custom_gradient(fx, gx, x, axis=(), fx_gx_manually_stopped=False,
                     name=None):
   """Enables specifying a custom gradient.
 
@@ -43,7 +42,8 @@ def custom_gradient(fx, gx, x, axis=(),
   h(x) = x * stop_gradient(g(x)) + stop_gradient(f(x) - x * g(x))
   ```
 
-  is such that `h(x) = stop_gradient(f(x))` and `grad[h(x), x] = stop_gradient(g(x)).`
+  is such that `h(x) = stop_gradient(f(x))` and `grad[h(x), x] =
+  stop_gradient(g(x)).`
 
   In addition to scalar-domain/scalar-range functions, this function also
   supports tensor-domain/scalar-range functions. However, in the latter case it
diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
index 4d984bb4d7..6de93059d8 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -41,8 +41,7 @@ class GroupByWindowTest(test.TestCase):
         dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
         .apply(
             grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4))
-        .make_initializable_iterator())
+                                     4)).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -53,7 +52,8 @@ class GroupByWindowTest(test.TestCase):
         while True:
           result = sess.run(get_next)
           self.assertTrue(
-              all(x % 2 == 0 for x in result) or all(x % 2 == 1)
+              all(x % 2 == 0
+                  for x in result) or all(x % 2 == 1)
               for x in result)
           counts.append(result.shape[0])
 
@@ -116,8 +116,8 @@ class GroupByWindowTest(test.TestCase):
     iterator = (
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
-            grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32))
-        .make_initializable_iterator())
+            grouping.group_by_window(lambda x, _: x % 2, reduce_func,
+                                     32)).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -136,7 +136,8 @@ class GroupByWindowTest(test.TestCase):
           window.padded_batch(
               4, padded_shapes=tensor_shape.TensorShape([None])),
           window.padded_batch(
-              4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),))
+              4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),
+      ))
 
     iterator = (
         dataset_ops.Dataset.from_tensor_slices(components)
@@ -200,9 +201,10 @@ class BucketTest(test.TestCase):
     # dynamically and does not rely on static shape information about
     # the arguments.
     return dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch(
-            32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape([None]),
-                 tensor_shape.TensorShape([3])))))
+        (dataset_ops.Dataset.from_tensors(bucket),
+         window.padded_batch(
+             32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape(
+                 [None]), tensor_shape.TensorShape([3])))))
 
   def testSingleBucket(self):
 
@@ -307,12 +309,13 @@ class BucketTest(test.TestCase):
 
     def _dynamic_pad_fn(bucket, window, _):
       return dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch(
-              32, {
-                  "x": tensor_shape.TensorShape([]),
-                  "y": tensor_shape.TensorShape([None]),
-                  "z": tensor_shape.TensorShape([3])
-              })))
+          (dataset_ops.Dataset.from_tensors(bucket),
+           window.padded_batch(
+               32, {
+                   "x": tensor_shape.TensorShape([]),
+                   "y": tensor_shape.TensorShape([None]),
+                   "z": tensor_shape.TensorShape([3])
+               })))
 
     input_dataset = (
         dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn)
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 544a3eafc0..d177bfeab2 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -112,7 +112,7 @@ class Iterator(object):
         remote_fn.add_to_graph(None)
         target = constant_op.constant("/device:CPU:0")
       with ops.device(self._device):
-        self._buffer_resource_handle = prefetching_ops.function_buffering_resource(
+        self._buffer_resource_handle = prefetching_ops.function_buffering_resource(  # pylint: disable=line-too-long
             string_arg=iter_string_handle,
             f=remote_fn,
             target_device=target,
@@ -120,8 +120,9 @@ class Iterator(object):
             thread_pool_size=1,
             container="",
             shared_name=_generate_shared_name("function_buffer_resource"))
-        self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(
-            handle=self._buffer_resource_handle, handle_device=self._device)
+        self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(  # pylint: disable=line-too-long
+            handle=self._buffer_resource_handle,
+            handle_device=self._device)
 
   def __iter__(self):
     return self
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py
index 2bce00fde2..409657fe1d 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope.py
@@ -53,7 +53,8 @@
     net = layers.conv2d(net, 256, [5, 5], scope='conv2')
   ```
 
-  Example of how to use tf.contrib.framework.add_arg_scope to enable your function to be called within an arg_scope later:
+  Example of how to use tf.contrib.framework.add_arg_scope to enable your
+  function to be called within an arg_scope later:
 
   @tf.contrib.framework.add_arg_scope
   def conv2d(*args, **kwargs)
@@ -65,11 +66,10 @@ from __future__ import print_function
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 
-__all__ = ['arg_scope',
-           'add_arg_scope',
-           'current_arg_scope',
-           'has_arg_scope',
-           'arg_scoped_arguments']
+__all__ = [
+    'arg_scope', 'add_arg_scope', 'current_arg_scope', 'has_arg_scope',
+    'arg_scoped_arguments'
+]
 
 _ARGSTACK = [{}]
 
@@ -172,6 +172,7 @@ def add_arg_scope(func):
   Returns:
     A tuple with the decorated function func_with_args().
   """
+
   def func_with_args(*args, **kwargs):
     current_scope = current_arg_scope()
     current_args = kwargs
@@ -180,6 +181,7 @@ def add_arg_scope(func):
       current_args = current_scope[key_func].copy()
       current_args.update(kwargs)
     return func(*args, **current_args)
+
   _add_op(func)
   setattr(func_with_args, '_key_op', _key_op(func))
   return tf_decorator.make_decorator(func, func_with_args)
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py
index a65d4bc50f..96cdd8b1ca 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py
@@ -116,7 +116,7 @@ def build_fused_conv_bias_relu_graph(device, input_shape, filter_shape, strides,
     for _ in range(1, num_iters):
       with ops.control_dependencies([fused_out]):
         # pylint: disable=g-line-too-long
-        fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+        fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(  # pylint: disable=line-too-long
             inp,
             filt,
             bias,
@@ -166,10 +166,10 @@ class FusedConv2DBiasActivationBenchmark(test.Benchmark):
         duration = (time.time() - start_time) / num_iters
 
         print("%s inputshape:%s filtershape:%s strides:%s padding:%s "
-              "%d iters: %.8f sec" %
-              (device, str(input_shape).replace(" ", ""),
-               str(filter_shape).replace(" ", ""),
-               str(strides).replace(" ", ""), padding, num_iters, duration))
+              "%d iters: %.8f sec" % (device, str(input_shape).replace(" ", ""),
+                                      str(filter_shape).replace(" ", ""),
+                                      str(strides).replace(" ", ""), padding,
+                                      num_iters, duration))
     name_template = (
         "conv2d_{device}_input_shape_{inputshape}_filter_shape_{filtershape}_"
         "strides_{strides}_padding_{padding}")
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
index bb766e59d2..d4a6a5bcbb 100755
--- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -26,18 +26,20 @@ _sirds_ops = loader.load_op_library(
     resource_loader.get_path_to_datafile(
         "_single_image_random_dot_stereograms.so"))
 
-def single_image_random_dot_stereograms(
-    depth_values,
-    hidden_surface_removal=None,
-    convergence_dots_size=None,
-    dots_per_inch=None,
-    eye_separation=None, mu=None,
-    normalize=None, normalize_max=None,
-    normalize_min=None,
-    border_level=None,
-    number_colors=None,
-    output_image_shape=None,
-    output_data_window=None):
+
+def single_image_random_dot_stereograms(depth_values,
+                                        hidden_surface_removal=None,
+                                        convergence_dots_size=None,
+                                        dots_per_inch=None,
+                                        eye_separation=None,
+                                        mu=None,
+                                        normalize=None,
+                                        normalize_max=None,
+                                        normalize_min=None,
+                                        border_level=None,
+                                        number_colors=None,
+                                        output_image_shape=None,
+                                        output_data_window=None):
   """Output a RandomDotStereogram Tensor for export via encode_PNG/JPG OP.
 
   Given the 2-D tensor 'depth_values' with encoded Z values, this operation
@@ -45,7 +47,8 @@ def single_image_random_dot_stereograms(
   for the encode_PNG/JPG ops.  Be careful with image compression as this may
   corrupt the encode 3-D data witin the image.
 
-  Based upon [this paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
+  Based upon [this
+  paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
 
   This outputs a SIRDS image as picture_out.png:
 
@@ -113,7 +116,8 @@ def single_image_random_dot_stereograms(
       hidden_surface_removal=hidden_surface_removal,
       convergence_dots_size=convergence_dots_size,
       dots_per_inch=dots_per_inch,
-      eye_separation=eye_separation, mu=mu,
+      eye_separation=eye_separation,
+      mu=mu,
       normalize=normalize,
       normalize_max=normalize_max,
       normalize_min=normalize_min,
@@ -123,4 +127,5 @@ def single_image_random_dot_stereograms(
       output_data_window=output_data_window)
   return result
 
+
 ops.NotDifferentiable("SingleImageRandomDotStereograms")
diff --git a/tensorflow/contrib/kernel_methods/python/losses_test.py b/tensorflow/contrib/kernel_methods/python/losses_test.py
index d38d8041ce..72507539f8 100644
--- a/tensorflow/contrib/kernel_methods/python/losses_test.py
+++ b/tensorflow/contrib/kernel_methods/python/losses_test.py
@@ -119,19 +119,20 @@ class SparseMulticlassHingeLossTest(test.TestCase):
 
   def testUnknownShape(self):
     """Result keeps same with `testZeroLossInt32Labels`"""
-    logits_np = np.array([[1.2, -1.4, -1.0],
-                          [1.4, 1.8, 4.0],
-                          [0.5, 1.8, -1.0]])
+    logits_np = np.array([[1.2, -1.4, -1.0], [1.4, 1.8, 4.0], [0.5, 1.8, -1.0]])
     labels_np = np.array([0, 2, 1], dtype=np.int32)
 
-    logits_shapes = [[3, 3],  # batch_size, num_classes
-                     [None, 3],
-                     [3, None],
-                     [None, None]]
+    logits_shapes = [
+        [3, 3],  # batch_size, num_classes
+        [None, 3],
+        [3, None],
+        [None, None]
+    ]
 
     for batch_size, num_classes in logits_shapes:
       with self.test_session():
-        logits = array_ops.placeholder(dtypes.float32, shape=(batch_size, num_classes))
+        logits = array_ops.placeholder(
+            dtypes.float32, shape=(batch_size, num_classes))
         labels = array_ops.placeholder(dtypes.int32, shape=(batch_size,))
         loss = losses.sparse_multiclass_hinge_loss(labels, logits)
         result = loss.eval(feed_dict={logits: logits_np, labels: labels_np})
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index a9bdbe0138..49b23ce8fa 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -126,8 +126,8 @@ class AvgPool3DTest(test.TestCase):
   def testInvalidDataFormat(self):
     depth, height, width = 3, 6, 9
     images = np.random.uniform(size=(5, depth, height, width, 3))
-    with self.assertRaisesRegexp(ValueError,
-                                 'data_format has to be either NCDHW or NDHWC.'):
+    with self.assertRaisesRegexp(
+        ValueError, 'data_format has to be either NCDHW or NDHWC.'):
       _layers.avg_pool3d(images, [3, 3, 3], data_format='CDHWN')
 
   def testCreateAvgPool(self):
@@ -147,7 +147,8 @@ class AvgPool3DTest(test.TestCase):
   def testCollectOutputs(self):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
-    output = _layers.avg_pool3d(images, [3, 3, 3], outputs_collections='outputs')
+    output = _layers.avg_pool3d(
+        images, [3, 3, 3], outputs_collections='outputs')
     output_collected = ops.get_collection('outputs')[0]
     self.assertEqual(output_collected.aliases, ['AvgPool3D'])
     self.assertEqual(output_collected, output)
@@ -182,7 +183,8 @@ class AvgPool3DTest(test.TestCase):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
     output = _layers.avg_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
-    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth, height, width, 3])
 
   def testGlobalAvgPool(self):
     depth, height, width = 3, 6, 9
@@ -514,7 +516,9 @@ class ConvolutionTest(test.TestCase):
       with arg_scope(
           [layers_lib.convolution2d],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = layers_lib.convolution2d(images, 32, [3, 3])
         net = layers_lib.convolution2d(net, 32, [3, 3])
       self.assertEqual(len(variables.get_variables()), 8)
@@ -528,7 +532,9 @@ class ConvolutionTest(test.TestCase):
       with arg_scope(
           [layers_lib.convolution2d],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = layers_lib.convolution2d(images, 32, [3, 3], scope='Conv')
         net = layers_lib.convolution2d(
             net, 32, [3, 3], scope='Conv', reuse=True)
@@ -1030,7 +1036,8 @@ class Convolution2dTransposeTests(test.TestCase):
     for _ in range(10):
       num_filters = 1
       input_size = [
-          1, np.random.randint(1, max_image_size),
+          1,
+          np.random.randint(1, max_image_size),
           np.random.randint(1, max_image_size), 1
       ]
       filter_size = [
@@ -1184,8 +1191,10 @@ class ConvolutionInPlaneTest(test.TestCase):
 
     with self.test_session() as sess:
       sess.run(init_op)
-      result = sess.run(horz_gradients,
-                        feed_dict={image: np.ones((1, 10, 10, 1))})
+      result = sess.run(
+          horz_gradients, feed_dict={
+              image: np.ones((1, 10, 10, 1))
+          })
       expected = np.zeros((1, 10, 9, 1))
 
       self.assertAllEqual(result, expected)
@@ -1406,8 +1415,7 @@ class FlattenTest(test.TestCase):
     with ops.Graph().as_default() as g, self.test_session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
       inputs.set_shape(tensor_shape.TensorShape((5,)))
-      with self.assertRaisesRegexp(ValueError,
-                                   'incompatible with the layer'):
+      with self.assertRaisesRegexp(ValueError, 'incompatible with the layer'):
         _layers.flatten(inputs)
 
   def testUnknownLastDim(self):
@@ -1717,7 +1725,9 @@ class FCTest(test.TestCase):
       with arg_scope(
           [_layers.fully_connected],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = _layers.fully_connected(images, 27)
         net = _layers.fully_connected(net, 27)
       self.assertEqual(len(variables.get_variables()), 8)
@@ -1733,7 +1743,9 @@ class FCTest(test.TestCase):
       with arg_scope(
           [_layers.fully_connected],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = _layers.fully_connected(images, 27, scope='fc1')
         net = _layers.fully_connected(net, 27, scope='fc1', reuse=True)
       self.assertEqual(len(variables.get_variables()), 4)
@@ -1750,8 +1762,8 @@ class BatchNormTest(test.TestCase):
   def testBatchNormCenterFalse(self):
     a = array_ops.placeholder(dtype=dtypes.float32, shape=(10, 10, 10, 10))
     # Test that center=False builds a valid graph.
-    _layers.batch_norm(a, center=False, data_format='NCHW',
-                       zero_debias_moving_mean=True)
+    _layers.batch_norm(
+        a, center=False, data_format='NCHW', zero_debias_moving_mean=True)
 
   def testUnknownShape(self):
     with ops.Graph().as_default() as g, self.test_session(g):
@@ -1788,8 +1800,8 @@ class BatchNormTest(test.TestCase):
       images = np.random.uniform(size=(5, height, width, 3)).astype(
           dtype.as_numpy_dtype)
       output = _layers.batch_norm(images, fused=fused)
-      expected_name = ('BatchNorm/FusedBatchNorm' if fused else
-                       'BatchNorm/batchnorm')
+      expected_name = ('BatchNorm/FusedBatchNorm'
+                       if fused else 'BatchNorm/batchnorm')
       self.assertTrue(output.op.name.startswith(expected_name))
       self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3])
       self.assertEqual(
@@ -2008,8 +2020,8 @@ class BatchNormTest(test.TestCase):
       expected_var = np.var(image_values, axis=axis)
       if fused:
         # Add Bessel's correction
-        expected_var, _ = self._addBesselsCorrection(batch_size * height *
-                                                     width, expected_var)
+        expected_var, _ = self._addBesselsCorrection(
+            batch_size * height * width, expected_var)
       images = constant_op.constant(
           image_values, shape=image_shape, dtype=dtypes.float32)
       output = _layers.batch_norm(
@@ -2528,8 +2540,8 @@ class BatchNormTest(test.TestCase):
       expected_var = np.var(image_values, axis=axis)
       if fused:
         # Add Bessel's correction
-        expected_var, _ = self._addBesselsCorrection(batch_size * height *
-                                                     width, expected_var)
+        expected_var, _ = self._addBesselsCorrection(
+            batch_size * height * width, expected_var)
       images = constant_op.constant(
           image_values, shape=image_shape, dtype=dtypes.float32)
       output = _layers.batch_norm(
@@ -2559,8 +2571,9 @@ class BatchNormTest(test.TestCase):
         np_output, new_images_gradients = sess.run([output, images_gradients])
         # The outputs should be close to 0.0 mean and 1.0 variance
         self.assertAllClose(
-            np.mean(
-                np_output, axis=axis), [0] * channels, rtol=0.001, atol=0.001)
+            np.mean(np_output, axis=axis), [0] * channels,
+            rtol=0.001,
+            atol=0.001)
         self.assertAllClose(
             np.var(np_output, axis=axis), [1] * channels, rtol=0.01, atol=0.01)
         # The gradients should change slowly while updating moving_mean.
@@ -2588,14 +2601,14 @@ class BatchNormTest(test.TestCase):
     channels = 3
     with self.test_session() as sess:
       images = (np.ones((5, height, width, channels)) * 9.0).astype('f')
-      beta = init_ops.constant_initializer((np.ones(channels) * 5.0).astype(
-          'f'))
-      gamma = init_ops.constant_initializer((np.ones(channels) * 2.0).astype(
-          'f'))
-      mean = init_ops.constant_initializer((np.ones(channels) * 5.0).astype(
-          'f'))
-      variance = init_ops.constant_initializer((np.ones(channels) * 4.0).astype(
-          'f'))
+      beta = init_ops.constant_initializer(
+          (np.ones(channels) * 5.0).astype('f'))
+      gamma = init_ops.constant_initializer(
+          (np.ones(channels) * 2.0).astype('f'))
+      mean = init_ops.constant_initializer(
+          (np.ones(channels) * 5.0).astype('f'))
+      variance = init_ops.constant_initializer(
+          (np.ones(channels) * 4.0).astype('f'))
       output = _layers.batch_norm(
           images,
           is_training=False,
@@ -2616,21 +2629,18 @@ class BatchNormTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       images = np.arange(np.product(shape), dtype=np.float32).reshape(shape)
       beta = init_ops.constant_initializer(
-          np.arange(
-              2, channels + 2, dtype=np.float32))
+          np.arange(2, channels + 2, dtype=np.float32))
       gamma = init_ops.constant_initializer(
-          np.arange(
-              10, channels + 10, dtype=np.float32) * 2.0)
+          np.arange(10, channels + 10, dtype=np.float32) * 2.0)
       mean = init_ops.constant_initializer(
-          np.arange(
-              3, channels + 3, dtype=np.float32) * 5.0)
+          np.arange(3, channels + 3, dtype=np.float32) * 5.0)
       variance = init_ops.constant_initializer(
-          np.arange(
-              1, channels + 1, dtype=np.float32) * 4.0)
+          np.arange(1, channels + 1, dtype=np.float32) * 4.0)
       if data_format == 'NCHW':
         # Reshape inputs from NHWC to NCHW format.
         images = array_ops.transpose(
-            images, [0, len(shape) - 1] + list(range(1, len(shape) - 1)))
+            images, [0, len(shape) - 1] + list(range(1,
+                                                     len(shape) - 1)))
       output = _layers.batch_norm(
           images,
           is_training=is_training,
@@ -2733,16 +2743,16 @@ class BatchNormTest(test.TestCase):
     # Tests that the adjustment is appropriately passed to and used by the core
     # BN layer.
     all_adjustments = []
+
     def _create_adjustment(shape):
       adjustments = [array_ops.ones(shape[-1:]), array_ops.zeros(shape[-1:])]
       all_adjustments.extend(adjustments)
       return adjustments
+
     depth = 8
     images = array_ops.zeros([10, 5, 5, depth])
     output = _layers.batch_norm(
-        images,
-        is_training=True,
-        adjustment=_create_adjustment)
+        images, is_training=True, adjustment=_create_adjustment)
     self.assertListEqual(output.shape.as_list(), images.shape.as_list())
     self.assertEqual(len(all_adjustments), 2)
     self.assertListEqual(all_adjustments[0].shape.as_list(), [depth])
@@ -2807,7 +2817,10 @@ class LayerNormTest(test.TestCase):
       # output_train and output_eval should be the same.
       self.assertAllClose(sess.run([output_train]), sess.run([output_eval]))
 
-  def doOutputTest(self, input_shape, tol=1e-5, begin_norm_axis=1,
+  def doOutputTest(self,
+                   input_shape,
+                   tol=1e-5,
+                   begin_norm_axis=1,
                    dtype=dtypes.float64):
     expected_mean = np.zeros(input_shape[:begin_norm_axis])
     expected_var = np.ones(input_shape[:begin_norm_axis])
@@ -2838,13 +2851,10 @@ class LayerNormTest(test.TestCase):
             # Layer-norm implemented in numpy
             eps = 1e-12
             expected_out = (
-                (gamma * (
-                    input_values
-                    - np.mean(input_values, axis=moments_axis, keepdims=True))
-                 / np.sqrt(
-                     eps
-                     + np.var(input_values, axis=moments_axis, keepdims=True)))
-                + beta)
+                (gamma * (input_values - np.mean(
+                    input_values, axis=moments_axis, keepdims=True)) /
+                 np.sqrt(eps + np.var(
+                     input_values, axis=moments_axis, keepdims=True))) + beta)
             self.assertAllClose(expected_mean, mean, atol=tol, rtol=tol)
             self.assertAllClose(expected_var, var, atol=tol)
             # The full computation gets a bigger tolerance
@@ -2862,10 +2872,10 @@ class LayerNormTest(test.TestCase):
 
   def testOutput4DInputNormOnInnermostAxis(self):
     # Equivalent tests
-    self.doOutputTest((100, 10, 10, 3), begin_norm_axis=3, tol=1e-4,
-                      dtype=dtypes.float64)
-    self.doOutputTest((100, 10, 10, 3), begin_norm_axis=-1, tol=1e-4,
-                      dtype=dtypes.float64)
+    self.doOutputTest(
+        (100, 10, 10, 3), begin_norm_axis=3, tol=1e-4, dtype=dtypes.float64)
+    self.doOutputTest(
+        (100, 10, 10, 3), begin_norm_axis=-1, tol=1e-4, dtype=dtypes.float64)
 
   def testOutputSmallInput(self):
     self.doOutputTest((10, 10, 10, 30))
@@ -2902,7 +2912,7 @@ class GDNTest(test.TestCase):
       x = np.random.uniform(size=(1, 2, 3, 4)[:ndim])
       y = self._runGDN(x, x.shape, False, 'channels_last')
       self.assertEqual(x.shape, y.shape)
-      self.assertAllClose(y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+      self.assertAllClose(y, x / np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6)
 
   def testChannelsFirst(self):
     # `bias_add` doesn't support NCHW on CPU.
@@ -2911,8 +2921,7 @@ class GDNTest(test.TestCase):
         x = np.random.uniform(size=(4, 3, 2, 1)[:ndim])
         y = self._runGDN(x, x.shape, False, 'channels_first')
         self.assertEqual(x.shape, y.shape)
-        self.assertAllClose(
-            y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+        self.assertAllClose(y, x / np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6)
 
   def testWrongDims(self):
     for ndim in [1, 2, 6]:
@@ -2924,7 +2933,7 @@ class GDNTest(test.TestCase):
     x = np.random.uniform(size=(1, 2, 3, 4))
     y = self._runGDN(x, x.shape, True, 'channels_last')
     self.assertEqual(x.shape, y.shape)
-    self.assertAllClose(y, x * np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+    self.assertAllClose(y, x * np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6)
 
 
 class MaxPool2DTest(test.TestCase):
@@ -3001,20 +3010,22 @@ class MaxPool3DTest(test.TestCase):
   def testInvalidDataFormat(self):
     depth, height, width = 3, 6, 9
     images = np.random.uniform(size=(5, depth, height, width, 3))
-    with self.assertRaisesRegexp(ValueError,
-                                 'data_format has to be either NCDHW or NDHWC.'):
+    with self.assertRaisesRegexp(
+        ValueError, 'data_format has to be either NCDHW or NDHWC.'):
       _layers.max_pool3d(images, [3, 3, 3], data_format='CDHWN')
 
   def testCreateMaxPool(self):
     depth, height, width = 3, 6, 9
-    images = np.random.uniform(size=(5, depth, height, width, 3)).astype(np.float32)
+    images = np.random.uniform(size=(5, depth, height, width, 3)).astype(
+        np.float32)
     output = _layers.max_pool3d(images, [3, 3, 3])
     self.assertEqual(output.op.name, 'MaxPool3D/MaxPool3D')
     self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])
 
   def testCreateMaxPoolNCDHW(self):
     depth, height, width = 3, 6, 9
-    images = np.random.uniform(size=(5, 3, depth, height, width)).astype(np.float32)
+    images = np.random.uniform(size=(5, 3, depth, height, width)).astype(
+        np.float32)
     output = _layers.max_pool3d(images, [3, 3, 3], data_format='NCDHW')
     self.assertEquals(output.op.name, 'MaxPool3D/transpose_1')
     self.assertListEqual(output.get_shape().as_list(), [5, 3, 1, 2, 4])
@@ -3022,7 +3033,8 @@ class MaxPool3DTest(test.TestCase):
   def testCollectOutputs(self):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
-    output = _layers.max_pool3d(images, [3, 3, 3], outputs_collections='outputs')
+    output = _layers.max_pool3d(
+        images, [3, 3, 3], outputs_collections='outputs')
     output_collected = ops.get_collection('outputs')[0]
     self.assertEqual(output_collected.aliases, ['MaxPool3D'])
     self.assertEqual(output_collected, output)
@@ -3057,7 +3069,8 @@ class MaxPool3DTest(test.TestCase):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
     output = _layers.max_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
-    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth, height, width, 3])
 
   def testGlobalMaxPool(self):
     depth, height, width = 3, 6, 9
@@ -3469,8 +3482,7 @@ class SpatialSoftmaxTests(test.TestCase):
       sess.run(variables_lib.global_variables_initializer())
       feed_dict = {features: np_features}
       keypoints = sess.run(spatial_softmax, feed_dict)
-      self.assertAllEqual(keypoints.shape,
-                          (batch_shape[0], batch_shape[3] * 2))
+      self.assertAllEqual(keypoints.shape, (batch_shape[0], batch_shape[3] * 2))
 
   def testSpatialSoftmaxShapeNCHW(self):
     batch_shape = (2, 2, 35, 35)
@@ -3481,8 +3493,7 @@ class SpatialSoftmaxTests(test.TestCase):
       sess.run(variables_lib.global_variables_initializer())
       feed_dict = {features: np_features}
       keypoints = sess.run(spatial_softmax, feed_dict)
-      self.assertAllEqual(keypoints.shape,
-                          (batch_shape[0], batch_shape[1] * 2))
+      self.assertAllEqual(keypoints.shape, (batch_shape[0], batch_shape[1] * 2))
 
   def testTwoMaxActivationsSameChannel(self):
     batch_size, height, width, nchannels = (2, 35, 35, 1)
@@ -3501,8 +3512,8 @@ class SpatialSoftmaxTests(test.TestCase):
     x_loc = [avg_x]
     y_loc = [avg_y]
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3520,13 +3531,13 @@ class SpatialSoftmaxTests(test.TestCase):
     spatial_softmax = _layers.spatial_softmax(features)
     np_features = np.zeros(batch_shape, dtype=np.float32)
 
-    edges = [(0, 0), (0, width-1), (height-1, 0), (height-1, width-1)]
+    edges = [(0, 0), (0, width - 1), (height - 1, 0), (height - 1, width - 1)]
     x_loc, y_loc = zip(*edges)
     for c in range(nchannels):
       np_features[:, x_loc[c], y_loc[c], c] = 100.
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3555,10 +3566,10 @@ class SpatialSoftmaxTests(test.TestCase):
       np_features1[:, x_loc[c], y_loc[c], c] = 100.
       np_features2[:, x_loc[c], y_loc[c], c] = 100.
 
-    np_keypoints1 = self._SpatialSoftmax(
-        x_loc, y_loc, height1, width1, batch_size, nchannels)
-    np_keypoints2 = self._SpatialSoftmax(
-        x_loc, y_loc, height2, width2, batch_size, nchannels)
+    np_keypoints1 = self._SpatialSoftmax(x_loc, y_loc, height1, width1,
+                                         batch_size, nchannels)
+    np_keypoints2 = self._SpatialSoftmax(x_loc, y_loc, height2, width2,
+                                         batch_size, nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3584,8 +3595,8 @@ class SpatialSoftmaxTests(test.TestCase):
     for c in range(nchannels):
       np_features[:, x_loc[c], y_loc[c], c] = 100.
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3607,8 +3618,8 @@ class SpatialSoftmaxTests(test.TestCase):
     for c in range(nchannels):
       np_features[:, c, x_loc[c], y_loc[c]] = 100.
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3703,8 +3714,7 @@ class UnitNormTests(test.TestCase):
       image = random_ops.random_uniform((height, width, 3))
       output = _layers.unit_norm(image, dim=dim, epsilon=1e-6)
       norms = math_ops.sqrt(
-          math_ops.reduce_sum(
-              math_ops.square(output), reduction_indices=dim))
+          math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim))
 
       shape = [height, width, 3]
       del shape[dim]
@@ -3740,8 +3750,7 @@ class UnitNormTests(test.TestCase):
       image = array_ops.placeholder(dtypes.float32, (None, None, 3))
       output = _layers.unit_norm(image, dim=dim, epsilon=1e-6)
       norms = math_ops.sqrt(
-          math_ops.reduce_sum(
-              math_ops.square(output), reduction_indices=dim))
+          math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim))
 
       with self.test_session():
         actual = norms.eval({image: placeholder_value})
@@ -3805,8 +3814,8 @@ class PoincareNormalizeTest(test.TestCase):
       with self.test_session():
         x_tf = constant_op.constant(x_np, name='x')
         y_tf = _layers.poincare_normalize(x_tf, dim)
-        err = gradient_checker.compute_gradient_error(x_tf, x_shape,
-                                                      y_tf, x_shape)
+        err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
+                                                      x_shape)
       print('PoinCareNormalize gradient err = %g ' % err)
       self.assertLess(err, 1e-4)
 
@@ -3818,14 +3827,9 @@ class LegacyFullyConnectedTest(test.TestCase):
     test.TestCase.setUp(self)
     random_seed.set_random_seed(1234)
     self.input = constant_op.constant([[1., 2., 3.], [-4., 15., -6.]])
-    self.input_3_dim_arr = [[[1., 1.1, 1.2],
-                             [2., 2.1, 2.2],
-                             [3., 3.1, 3.2],
-                             [4., 4.1, 4.2]],
-                            [[5., 5.1, 5.2],
-                             [6., 6.1, 6.2],
-                             [7., 7.1, 7.2],
-                             [8., 8.1, 8.2]]]
+    self.input_3_dim_arr = [[[1., 1.1, 1.2], [2., 2.1, 2.2], [3., 3.1, 3.2],
+                             [4., 4.1, 4.2]], [[5., 5.1, 5.2], [6., 6.1, 6.2],
+                                               [7., 7.1, 7.2], [8., 8.1, 8.2]]]
     self.input_3_dim = constant_op.constant(self.input_3_dim_arr)
 
     assert not ops.get_collection(ops.GraphKeys.SUMMARIES)
@@ -3920,15 +3924,10 @@ class LegacyFullyConnectedTest(test.TestCase):
     self._custom_initializers(self.input, 2, [[13.0, 13.0], [11.0, 11.0]])
 
   def test_custom_initializers_multi_dim(self):
-    self._custom_initializers(self.input_3_dim, 2,
-                              [[[7.6, 7.6],
-                                [13.6, 13.6],
-                                [19.6, 19.6],
-                                [25.6, 25.6]],
-                               [[31.6, 31.6],
-                                [37.6, 37.6],
-                                [43.6, 43.6],
-                                [49.6, 49.6]]])
+    self._custom_initializers(
+        self.input_3_dim, 2,
+        [[[7.6, 7.6], [13.6, 13.6], [19.6, 19.6], [25.6, 25.6]],
+         [[31.6, 31.6], [37.6, 37.6], [43.6, 43.6], [49.6, 49.6]]])
 
   def test_custom_collections(self):
     layers_lib.legacy_relu(
@@ -4038,12 +4037,16 @@ class LegacyFullyConnectedTest(test.TestCase):
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
       # we can feed in input with first dimension 2
-      shape_value = sess.run(array_ops.shape(y),
-                             feed_dict={x: self.input_3_dim_arr})
+      shape_value = sess.run(
+          array_ops.shape(y), feed_dict={
+              x: self.input_3_dim_arr
+          })
       self.assertAllClose(shape_value, [2, 4, 1])
       # we can feed in input with first dimension 1
-      shape_value = sess.run(array_ops.shape(y),
-                             feed_dict={x: [self.input_3_dim_arr[0]]})
+      shape_value = sess.run(
+          array_ops.shape(y), feed_dict={
+              x: [self.input_3_dim_arr[0]]
+          })
       self.assertAllClose(shape_value, [1, 4, 1])
       # we cannot feed in input with inconsistent dimensions
       with self.assertRaises(ValueError):
diff --git a/tensorflow/contrib/learn/python/learn/datasets/mnist.py b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
index 1f3295747e..37f9175015 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/mnist.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Functions for downloading and reading MNIST data."""
 
 from __future__ import absolute_import
@@ -123,8 +122,8 @@ class DataSet(object):
     numpy.random.seed(seed1 if seed is None else seed2)
     dtype = dtypes.as_dtype(dtype).base_dtype
     if dtype not in (dtypes.uint8, dtypes.float32):
-      raise TypeError('Invalid image dtype %r, expected uint8 or float32' %
-                      dtype)
+      raise TypeError(
+          'Invalid image dtype %r, expected uint8 or float32' % dtype)
     if fake_data:
       self._num_examples = 10000
       self.one_hot = one_hot
@@ -202,7 +201,9 @@ class DataSet(object):
       end = self._index_in_epoch
       images_new_part = self._images[start:end]
       labels_new_part = self._labels[start:end]
-      return numpy.concatenate((images_rest_part, images_new_part), axis=0) , numpy.concatenate((labels_rest_part, labels_new_part), axis=0)
+      return numpy.concatenate(
+          (images_rest_part, images_new_part), axis=0), numpy.concatenate(
+              (labels_rest_part, labels_new_part), axis=0)
     else:
       self._index_in_epoch += batch_size
       end = self._index_in_epoch
@@ -257,16 +258,14 @@ def read_data_sets(train_dir,
     test_labels = extract_labels(f, one_hot=one_hot)
 
   if not 0 <= validation_size <= len(train_images):
-    raise ValueError(
-        'Validation size should be between 0 and {}. Received: {}.'
-        .format(len(train_images), validation_size))
+    raise ValueError('Validation size should be between 0 and {}. Received: {}.'
+                     .format(len(train_images), validation_size))
 
   validation_images = train_images[:validation_size]
   validation_labels = train_labels[:validation_size]
   train_images = train_images[validation_size:]
   train_labels = train_labels[validation_size:]
 
-
   options = dict(dtype=dtype, reshape=reshape, seed=seed)
 
   train = DataSet(train_images, train_labels, **options)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
index 5340afab46..613d8d39a3 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
@@ -24,12 +24,14 @@ from tensorflow.python.platform import test
 from tensorflow.contrib.learn.python.learn import datasets
 from tensorflow.contrib.learn.python.learn.datasets import synthetic
 
+
 class SyntheticTest(test.TestCase):
   """Test synthetic dataset generation"""
 
   def test_make_dataset(self):
     """Test if the synthetic routine wrapper complains about the name"""
-    self.assertRaises(ValueError, datasets.make_dataset, name='_non_existing_name')
+    self.assertRaises(
+        ValueError, datasets.make_dataset, name='_non_existing_name')
 
   def test_all_datasets_callable(self):
     """Test if all methods inside the `SYNTHETIC` are callable"""
@@ -52,9 +54,10 @@ class SyntheticTest(test.TestCase):
     """
     n_samples = 100
     n_classes = 2
-    circ = synthetic.circles(n_samples = n_samples, noise = None, n_classes = n_classes)
+    circ = synthetic.circles(
+        n_samples=n_samples, noise=None, n_classes=n_classes)
     self.assertIsInstance(circ, datasets.base.Dataset)
-    self.assertTupleEqual(circ.data.shape, (n_samples,2))
+    self.assertTupleEqual(circ.data.shape, (n_samples, 2))
     self.assertTupleEqual(circ.target.shape, (n_samples,))
     self.assertSetEqual(set(circ.target), set(range(n_classes)))
 
@@ -67,17 +70,24 @@ class SyntheticTest(test.TestCase):
     """
     seed = 42
     noise = 0.1
-    circ0 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed)
-    circ1 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed)
+    circ0 = synthetic.circles(
+        n_samples=100, noise=noise, n_classes=2, seed=seed)
+    circ1 = synthetic.circles(
+        n_samples=100, noise=noise, n_classes=2, seed=seed)
     np.testing.assert_array_equal(circ0.data, circ1.data)
     np.testing.assert_array_equal(circ0.target, circ1.target)
 
-    circ1 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed+1)
-    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, circ1.data)
-    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.target, circ1.target)
+    circ1 = synthetic.circles(
+        n_samples=100, noise=noise, n_classes=2, seed=seed + 1)
+    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data,
+                      circ1.data)
+    self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                      circ0.target, circ1.target)
 
-    circ1 = synthetic.circles(n_samples = 100, noise = noise/2., n_classes = 2, seed = seed)
-    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, circ1.data)
+    circ1 = synthetic.circles(
+        n_samples=100, noise=noise / 2., n_classes=2, seed=seed)
+    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data,
+                      circ1.data)
 
   def test_spirals(self):
     """Test if the circles are generated correctly
@@ -89,13 +99,14 @@ class SyntheticTest(test.TestCase):
       - returned `target` shape is (n_samples,)
       - set of unique classes range is [0, n_classes)
     """
-    self.assertRaises(ValueError, synthetic.spirals, mode='_unknown_mode_spiral_')
+    self.assertRaises(
+        ValueError, synthetic.spirals, mode='_unknown_mode_spiral_')
     n_samples = 100
     modes = ('archimedes', 'bernoulli', 'fermat')
     for mode in modes:
-      spir = synthetic.spirals(n_samples = n_samples, noise = None, mode = mode)
+      spir = synthetic.spirals(n_samples=n_samples, noise=None, mode=mode)
       self.assertIsInstance(spir, datasets.base.Dataset)
-      self.assertTupleEqual(spir.data.shape, (n_samples,2))
+      self.assertTupleEqual(spir.data.shape, (n_samples, 2))
       self.assertTupleEqual(spir.target.shape, (n_samples,))
       self.assertSetEqual(set(spir.target), set(range(2)))
 
@@ -110,18 +121,21 @@ class SyntheticTest(test.TestCase):
     noise = 0.1
     modes = ('archimedes', 'bernoulli', 'fermat')
     for mode in modes:
-      spir0 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed)
-      spir1 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed)
+      spir0 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed)
+      spir1 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed)
       np.testing.assert_array_equal(spir0.data, spir1.data)
       np.testing.assert_array_equal(spir0.target, spir1.target)
 
-      spir1 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed+1)
-      self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.data, spir1.data)
-      self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.target, spir1.target)
+      spir1 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed + 1)
+      self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                        spir0.data, spir1.data)
+      self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                        spir0.target, spir1.target)
 
-      spir1 = synthetic.spirals(n_samples = 1000, noise = noise/2., seed = seed)
-      self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.data, spir1.data)
+      spir1 = synthetic.spirals(n_samples=1000, noise=noise / 2., seed=seed)
+      self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                        spir0.data, spir1.data)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
index 6b125534a4..b968aeed1b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
@@ -44,7 +44,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import input as input_lib
 
-
 NUM_EXAMPLES = 100
 N_CLASSES = 5  #  Cardinality of multiclass labels.
 LABEL_DIMENSION = 3  #  Dimensionality of regression labels.
@@ -52,8 +51,10 @@ LABEL_DIMENSION = 3  #  Dimensionality of regression labels.
 
 def _train_test_split(features_and_labels):
   features, labels = features_and_labels
-  train_set = (features[:int(len(features) / 2)], labels[:int(len(features) / 2)])
-  test_set = (features[int(len(features) / 2):], labels[int(len(features) / 2):])
+  train_set = (features[:int(len(features) / 2)],
+               labels[:int(len(features) / 2)])
+  test_set = (features[int(len(features) / 2):],
+              labels[int(len(features) / 2):])
   return train_set, test_set
 
 
@@ -86,17 +87,17 @@ class DebugClassifierTest(test.TestCase):
     (train_features, train_labels), (test_features,
                                      test_labels) = _train_test_split(
                                          [self.features, self.labels])
-    majority_class, _ = max(collections.Counter(train_labels).items(),
-                            key=operator.itemgetter(1))
+    majority_class, _ = max(
+        collections.Counter(train_labels).items(), key=operator.itemgetter(1))
     expected_prediction = np.vstack(
         [[majority_class] for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=N_CLASSES)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
-                                                                 None))
+    pred = classifier.predict_classes(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllEqual(expected_prediction, np.vstack(pred))
 
   def testPredictBinary(self):
@@ -105,34 +106,34 @@ class DebugClassifierTest(test.TestCase):
                                      test_labels) = _train_test_split(
                                          [self.features, self.binary_labels])
 
-    majority_class, _ = max(collections.Counter(train_labels).items(),
-                            key=operator.itemgetter(1))
+    majority_class, _ = max(
+        collections.Counter(train_labels).items(), key=operator.itemgetter(1))
     expected_prediction = np.vstack(
         [[majority_class] for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
-                                                                 None))
+    pred = classifier.predict_classes(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllEqual(expected_prediction, np.vstack(pred))
 
-    (train_features, train_labels), (
-        test_features, test_labels) = _train_test_split(
-            [self.features, self.binary_float_labels])
+    (train_features,
+     train_labels), (test_features, test_labels) = _train_test_split(
+         [self.features, self.binary_float_labels])
 
-    majority_class, _ = max(collections.Counter(train_labels).items(),
-                            key=operator.itemgetter(1))
+    majority_class, _ = max(
+        collections.Counter(train_labels).items(), key=operator.itemgetter(1))
     expected_prediction = np.vstack(
         [[majority_class] for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
-                                                                 None))
+    pred = classifier.predict_classes(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllEqual(expected_prediction, np.vstack(pred))
 
   def testPredictProba(self):
@@ -150,8 +151,8 @@ class DebugClassifierTest(test.TestCase):
         [class_distribution for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=N_CLASSES)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
     pred = classifier.predict_proba(
         input_fn=_input_fn_builder(test_features, None))
@@ -173,17 +174,17 @@ class DebugClassifierTest(test.TestCase):
         [class_distribution for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
     pred = classifier.predict_proba(
         input_fn=_input_fn_builder(test_features, None))
 
     self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1)
 
-    (train_features, train_labels), (
-        test_features, test_labels) = _train_test_split(
-            [self.features, self.binary_float_labels])
+    (train_features,
+     train_labels), (test_features, test_labels) = _train_test_split(
+         [self.features, self.binary_float_labels])
 
     class_distribution = np.zeros((1, 2))
     for label in train_labels:
@@ -194,8 +195,8 @@ class DebugClassifierTest(test.TestCase):
         [class_distribution for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
     pred = classifier.predict_proba(
         input_fn=_input_fn_builder(test_features, None))
@@ -232,13 +233,12 @@ class DebugClassifierTest(test.TestCase):
     def _input_fn():
       iris = test_data.prepare_iris_data_for_logistic_regression()
       return {
-          'feature': constant_op.constant(
-              iris.data, dtype=dtypes.float32)
+          'feature': constant_op.constant(iris.data, dtype=dtypes.float32)
       }, constant_op.constant(
           iris.target, shape=[100], dtype=dtypes.int32)
 
-    classifier = debug.DebugClassifier(config=run_config.RunConfig(
-        tf_random_seed=1))
+    classifier = debug.DebugClassifier(
+        config=run_config.RunConfig(tf_random_seed=1))
     classifier.fit(input_fn=_input_fn, steps=5)
     scores = classifier.evaluate(input_fn=_input_fn, steps=1)
     self.assertIn('loss', scores)
@@ -342,8 +342,7 @@ class DebugClassifierTest(test.TestCase):
     def _input_fn():
       iris = base.load_iris()
       return {
-          'feature': constant_op.constant(
-              iris.data, dtype=dtypes.float32)
+          'feature': constant_op.constant(iris.data, dtype=dtypes.float32)
       }, constant_op.constant(
           iris.target, shape=[150], dtype=dtypes.int32)
 
@@ -387,7 +386,9 @@ class DebugClassifierTest(test.TestCase):
       # Create 4 rows, one of them (y = x), three of them (y=Not(x))
       # The logistic prediction should be (y = 0.25).
       labels = constant_op.constant([[1], [0], [0], [0]])
-      features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),}
+      features = {
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+      }
       return features, labels
 
     classifier = debug.DebugClassifier(n_classes=2)
@@ -404,8 +405,7 @@ class DebugClassifierTest(test.TestCase):
       # The logistic prediction should be (y = 0.25).
       labels = constant_op.constant([[1.], [0.], [0.], [0.]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[1.], [1.], [1.], [1.]])
       }
       return features, labels
@@ -414,8 +414,7 @@ class DebugClassifierTest(test.TestCase):
       # 4 rows, with different weights.
       labels = constant_op.constant([[1.], [0.], [0.], [0.]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[7.], [1.], [1.], [1.]])
       }
       return features, labels
@@ -438,8 +437,7 @@ class DebugClassifierTest(test.TestCase):
       # than (y=Not(x)) due to the relative higher weight of the first row.
       labels = constant_op.constant([[1], [0], [0], [0]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[100.], [3.], [2.], [2.]])
       }
       return features, labels
@@ -448,8 +446,7 @@ class DebugClassifierTest(test.TestCase):
       # Create 4 rows (y = x)
       labels = constant_op.constant([[1], [1], [1], [1]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[1.], [1.], [1.], [1.]])
       }
       return features, labels
@@ -469,8 +466,7 @@ class DebugClassifierTest(test.TestCase):
       features = {
           'x':
               input_lib.limit_epochs(
-                  array_ops.ones(
-                      shape=[4, 1], dtype=dtypes.float32),
+                  array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
                   num_epochs=num_epochs),
       }
       return features, labels
@@ -578,12 +574,11 @@ class DebugClassifierTest(test.TestCase):
     language = feature_column.sparse_column_with_hash_bucket('language', 100)
     feature_columns = [
         feature_column.real_valued_column('age'),
-        feature_column.embedding_column(
-            language, dimension=1)
+        feature_column.embedding_column(language, dimension=1)
     ]
 
-    classifier = debug.DebugClassifier(config=run_config.RunConfig(
-        tf_random_seed=1))
+    classifier = debug.DebugClassifier(
+        config=run_config.RunConfig(tf_random_seed=1))
     classifier.fit(input_fn=input_fn, steps=5)
 
     def default_input_fn(unused_estimator, examples):
@@ -614,8 +609,8 @@ class DebugRegressorTest(test.TestCase):
     classifier.fit(
         input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_scores(input_fn=_input_fn_builder(test_features,
-                                                                None))
+    pred = classifier.predict_scores(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1)
 
   def testExperimentIntegration(self):
@@ -698,7 +693,9 @@ class DebugRegressorTest(test.TestCase):
       # Create 4 rows, one of them (y = x), three of them (y=Not(x))
       # The algorithm should learn (y = 0.25).
       labels = constant_op.constant([[1.], [0.], [0.], [0.]])
-      features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),}
+      features = {
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+      }
       return features, labels
 
     regressor = debug.DebugRegressor(
@@ -853,5 +850,6 @@ class DebugRegressorTest(test.TestCase):
     predictions2 = list(regressor2.predict_scores(input_fn=predict_input_fn))
     self.assertAllClose(predictions, predictions2)
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
index 9d7c1a099a..d4a46b41d0 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
@@ -41,7 +41,6 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import queue_runner_impl
 
-
 _BOSTON_INPUT_DIM = 13
 _IRIS_INPUT_DIM = 4
 
@@ -93,8 +92,8 @@ def boston_eval_fn():
       constant_op.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM])
   labels = array_ops.reshape(
       constant_op.constant(boston.target), [n_examples, 1])
-  return array_ops.concat([features, features], 0), array_ops.concat(
-      [labels, labels], 0)
+  return array_ops.concat([features, features],
+                          0), array_ops.concat([labels, labels], 0)
 
 
 def extract(data, key):
@@ -129,7 +128,10 @@ def linear_model_fn(features, labels, mode):
     (_, features), = features.items()
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return prediction, loss, train_op
 
 
@@ -139,7 +141,10 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
                   model_fn.ModeKeys.INFER)
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return model_fn.ModelFnOps(
       mode=mode, predictions=prediction, loss=loss, train_op=train_op)
 
@@ -150,7 +155,10 @@ def logistic_model_no_mode_fn(features, labels):
   labels = array_ops.one_hot(labels, 3, 1, 0)
   prediction, loss = (models.logistic_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return {
       'class': math_ops.argmax(prediction, 1),
       'prob': prediction
@@ -173,7 +181,9 @@ class EstimatorInputTest(test.TestCase):
     scores = est.evaluate(
         x=boston_input,
         y=float64_target,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     del est
     # Create another estimator object with the same output dir.
     est2 = estimator.Estimator(model_fn=linear_model_fn, model_dir=output_dir)
@@ -182,7 +192,9 @@ class EstimatorInputTest(test.TestCase):
     scores2 = est2.evaluate(
         x=boston_input,
         y=float64_target,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     self.assertAllClose(scores2['MSE'], scores['MSE'])
     predictions = np.array(list(est2.predict(x=boston_input)))
     other_score = _sklearn.mean_squared_error(predictions,
@@ -197,7 +209,9 @@ class EstimatorInputTest(test.TestCase):
     scores = est.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     predictions = np.array(list(est.predict(x=boston.data)))
     other_score = _sklearn.mean_squared_error(predictions, boston.target)
     self.assertAllClose(scores['MSE'], other_score)
@@ -213,7 +227,9 @@ class EstimatorInputTest(test.TestCase):
     scores = est.evaluate(
         x=boston_input,
         y=float64_target,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     predictions = np.array(list(est.predict(x=boston_input)))
     other_score = _sklearn.mean_squared_error(predictions, boston.target)
     self.assertAllClose(other_score, scores['MSE'])
@@ -228,14 +244,15 @@ class EstimatorInputTest(test.TestCase):
     scores = est.score(
         x=iris.data,
         y=iris.target,
-        metrics={('accuracy', 'class'): metric_ops.streaming_accuracy})
+        metrics={
+            ('accuracy', 'class'): metric_ops.streaming_accuracy
+        })
     predictions = est.predict(x=iris.data)
     predictions_class = est.predict(x=iris.data, outputs=['class'])['class']
     self.assertEqual(predictions['prob'].shape[0], iris.target.shape[0])
     self.assertAllClose(predictions['class'], predictions_class)
-    self.assertAllClose(
-        predictions['class'], np.argmax(
-            predictions['prob'], axis=1))
+    self.assertAllClose(predictions['class'],
+                        np.argmax(predictions['prob'], axis=1))
     other_score = _sklearn.accuracy_score(iris.target, predictions['class'])
     self.assertAllClose(scores['accuracy'], other_score)
     self.assertTrue('global_step' in scores)
@@ -250,17 +267,18 @@ class EstimatorInputTest(test.TestCase):
     scores = est.evaluate(
         x=iris_data,
         y=iris_target,
-        metrics={('accuracy', 'class'): metric_ops.streaming_accuracy})
+        metrics={
+            ('accuracy', 'class'): metric_ops.streaming_accuracy
+        })
     predictions = list(est.predict(x=iris_data))
     predictions_class = list(est.predict(x=iris_data, outputs=['class']))
     self.assertEqual(len(predictions), iris.target.shape[0])
     classes_batch = np.array([p['class'] for p in predictions])
     self.assertAllClose(classes_batch,
                         np.array([p['class'] for p in predictions_class]))
-    self.assertAllClose(
-        classes_batch,
-        np.argmax(
-            np.array([p['prob'] for p in predictions]), axis=1))
+    self.assertAllClose(classes_batch,
+                        np.argmax(
+                            np.array([p['prob'] for p in predictions]), axis=1))
     other_score = _sklearn.accuracy_score(iris.target, classes_batch)
     self.assertAllClose(other_score, scores['accuracy'])
     self.assertTrue('global_step' in scores)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
index 656d68b768..ac2d10011e 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
@@ -57,7 +57,10 @@ def _logistic_regression_model_fn(features, labels, mode):
   predictions = math_ops.sigmoid(logits)
   loss = losses.sigmoid_cross_entropy(labels, logits)
   train_op = optimizers.optimize_loss(
-      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return predictions, loss, train_op
 
 
diff --git a/tensorflow/contrib/learn/python/learn/evaluable.py b/tensorflow/contrib/learn/python/learn/evaluable.py
index 66e1526517..8f6cd39864 100644
--- a/tensorflow/contrib/learn/python/learn/evaluable.py
+++ b/tensorflow/contrib/learn/python/learn/evaluable.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """`Evaluable` interface."""
 
 from __future__ import absolute_import
@@ -59,9 +58,12 @@ class Evaluable(object):
     for which this evaluation was performed.
 
     Args:
-      x: Matrix of shape [n_samples, n_features...] or dictionary of many matrices
-        containing the input samples for fitting the model. Can be iterator that returns
-        arrays of features or dictionary of array of features. If set, `input_fn` must
+      x: Matrix of shape [n_samples, n_features...] or dictionary of many
+        matrices
+        containing the input samples for fitting the model. Can be iterator that
+          returns
+        arrays of features or dictionary of array of features. If set,
+          `input_fn` must
         be `None`.
       y: Vector or matrix [n_samples] or [n_samples, n_outputs] containing the
         label values (class labels in classification, real numbers in
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 9576ff21c2..bec976afd2 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Experiment class collecting information needed for a single training run."""
 
 from __future__ import absolute_import
@@ -43,7 +42,6 @@ from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
-
 __all__ = ["Experiment"]
 
 
@@ -278,8 +276,7 @@ class Experiment(object):
     self._train_steps_per_iteration = train_steps_per_iteration
     if (self._train_steps_per_iteration is not None and
         not isinstance(self._train_steps_per_iteration, int)):
-      raise ValueError(
-          "`train_steps_per_iteration` must be an integer.")
+      raise ValueError("`train_steps_per_iteration` must be an integer.")
 
   @property
   def estimator(self):
@@ -359,9 +356,10 @@ class Experiment(object):
           config.cluster_spec and config.master):
         self._start_server()
     elif config.cluster_spec and config.master:
-      raise ValueError('For distributed runtime, Experiment class only works with'
-                       'tf.contrib.learn.RunConfig for now, but provided {}'
-                       .format(type(config)))
+      raise ValueError(
+          "For distributed runtime, Experiment class only works with"
+          "tf.contrib.learn.RunConfig for now, but provided {}".format(
+              type(config)))
 
     extra_hooks = []
     if delay_secs is None:
@@ -414,11 +412,12 @@ class Experiment(object):
       logging.info("Waiting %d secs before starting eval.", delay_secs)
       time.sleep(delay_secs)
 
-    return self._call_evaluate(input_fn=self._eval_input_fn,
-                               steps=self._eval_steps,
-                               metrics=self._eval_metrics,
-                               name=(name or "one_pass"),
-                               hooks=self._eval_hooks)
+    return self._call_evaluate(
+        input_fn=self._eval_input_fn,
+        steps=self._eval_steps,
+        metrics=self._eval_metrics,
+        name=(name or "one_pass"),
+        hooks=self._eval_hooks)
 
   @deprecated(
       "2016-10-23",
@@ -499,15 +498,12 @@ class Experiment(object):
     previous_path = None
     eval_result = None
     last_warning_time = 0
-    while (not predicate_fn or
-           predicate_fn(
-               eval_result,
-               checkpoint_path=previous_path if eval_result else None)):
+    while (not predicate_fn or predicate_fn(
+        eval_result, checkpoint_path=previous_path if eval_result else None)):
       # Exit if we have already reached number of steps to train.
       if self._has_training_stopped(eval_result):
         logging.info("Exiting continuous eval, global_step=%s >= "
-                     "train_step=%s",
-                     eval_result[ops.GraphKeys.GLOBAL_STEP],
+                     "train_step=%s", eval_result[ops.GraphKeys.GLOBAL_STEP],
                      self._train_steps)
         return
 
@@ -528,12 +524,13 @@ class Experiment(object):
           logging.warning(error_msg)
           last_warning_time = time.time()
       else:
-        eval_result = self._call_evaluate(input_fn=input_fn,
-                                          steps=self._eval_steps,
-                                          metrics=self._eval_metrics,
-                                          name=name,
-                                          checkpoint_path=latest_path,
-                                          hooks=self._eval_hooks)
+        eval_result = self._call_evaluate(
+            input_fn=input_fn,
+            steps=self._eval_steps,
+            metrics=self._eval_metrics,
+            name=name,
+            checkpoint_path=latest_path,
+            hooks=self._eval_hooks)
         # Ensure eval result is not None for next round of evaluation.
         if not eval_result:
           eval_result = {}
@@ -558,8 +555,8 @@ class Experiment(object):
       return False
 
     global_step = eval_result.get(ops.GraphKeys.GLOBAL_STEP)
-    return global_step and self._train_steps and (
-        global_step >= self._train_steps)
+    return global_step and self._train_steps and (global_step >=
+                                                  self._train_steps)
 
   def continuous_eval(self,
                       delay_secs=None,
@@ -678,8 +675,7 @@ class Experiment(object):
       return eval_result, export_results
 
   @experimental
-  def continuous_train_and_eval(self,
-                                continuous_eval_predicate_fn=None):
+  def continuous_train_and_eval(self, continuous_eval_predicate_fn=None):
     """Interleaves training and evaluation.
 
     The frequency of evaluation is controlled by the `train_steps_per_iteration`
@@ -752,10 +748,9 @@ class Experiment(object):
     elif self._train_steps is not None:
       train_steps_per_iteration = int(self._train_steps / 10)
 
-    while (not predicate_fn or
-           predicate_fn(
-               eval_result,
-               checkpoint_path=latest_checkpoint if eval_result else None)):
+    while (not predicate_fn or predicate_fn(
+        eval_result, checkpoint_path=latest_checkpoint
+        if eval_result else None)):
 
       if self._has_training_stopped(eval_result):
         # Exits once max steps of training is satisfied.
@@ -785,8 +780,7 @@ class Experiment(object):
   def _maybe_export(self, eval_result, checkpoint_path=None):
     """Export the Estimator using export_fn, if defined."""
     export_dir_base = os.path.join(
-        compat.as_bytes(self._estimator.model_dir),
-        compat.as_bytes("export"))
+        compat.as_bytes(self._estimator.model_dir), compat.as_bytes("export"))
 
     export_results = []
     for strategy in self._export_strategies:
@@ -824,10 +818,11 @@ class Experiment(object):
         hooks=self._train_monitors,
         saving_listeners=self._saving_listeners)
 
-    eval_result = self._call_evaluate(input_fn=self._eval_input_fn,
-                                      steps=1,
-                                      metrics=self._eval_metrics,
-                                      name="one_pass")
+    eval_result = self._call_evaluate(
+        input_fn=self._eval_input_fn,
+        steps=1,
+        metrics=self._eval_metrics,
+        name="one_pass")
     _ = self._maybe_export(eval_result)
 
     return eval_result
@@ -849,9 +844,14 @@ class Experiment(object):
     server.start()
     return server
 
-  def _call_train(self, _sentinel=None,  # pylint: disable=invalid-name,
-                  input_fn=None, steps=None, hooks=None, max_steps=None,
-                  saving_listeners=None):
+  def _call_train(
+      self,
+      _sentinel=None,  # pylint: disable=invalid-name,
+      input_fn=None,
+      steps=None,
+      hooks=None,
+      max_steps=None,
+      saving_listeners=None):
     if _sentinel is not None:
       raise ValueError("_call_train should be called with keyword args only")
 
@@ -867,14 +867,18 @@ class Experiment(object):
           hooks=hooks,
           saving_listeners=saving_listeners)
     else:
-      return self._estimator.fit(input_fn=input_fn,
-                                 steps=steps,
-                                 max_steps=max_steps,
-                                 monitors=hooks)
-
-  def _call_evaluate(self, _sentinel=None,  # pylint: disable=invalid-name,
-                     input_fn=None, steps=None, metrics=None, name=None,
-                     checkpoint_path=None, hooks=None):
+      return self._estimator.fit(
+          input_fn=input_fn, steps=steps, max_steps=max_steps, monitors=hooks)
+
+  def _call_evaluate(
+      self,
+      _sentinel=None,  # pylint: disable=invalid-name,
+      input_fn=None,
+      steps=None,
+      metrics=None,
+      name=None,
+      checkpoint_path=None,
+      hooks=None):
     if _sentinel is not None:
       raise ValueError("_call_evaluate should be called with keyword args only")
 
@@ -882,18 +886,20 @@ class Experiment(object):
       if metrics is not None:
         raise ValueError(
             "`eval_metrics` must be `None` with `tf.estimator.Estimator`")
-      return self._estimator.evaluate(input_fn=input_fn,
-                                      steps=steps,
-                                      name=name,
-                                      checkpoint_path=checkpoint_path,
-                                      hooks=hooks)
+      return self._estimator.evaluate(
+          input_fn=input_fn,
+          steps=steps,
+          name=name,
+          checkpoint_path=checkpoint_path,
+          hooks=hooks)
     else:
-      return self._estimator.evaluate(input_fn=input_fn,
-                                      steps=steps,
-                                      metrics=metrics,
-                                      name=name,
-                                      checkpoint_path=checkpoint_path,
-                                      hooks=hooks)
+      return self._estimator.evaluate(
+          input_fn=input_fn,
+          steps=steps,
+          metrics=metrics,
+          name=name,
+          checkpoint_path=checkpoint_path,
+          hooks=hooks)
 
 
 @contextlib.contextmanager
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
index f36a778b52..96be8b1bc4 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
@@ -35,6 +35,7 @@ from tensorflow.python.platform import tf_logging as logging
 # pylint: disable=g-multiple-import,g-bad-import-order
 from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels
 from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels
+
 # pylint: enable=g-multiple-import,g-bad-import-order
 
 
@@ -74,11 +75,11 @@ def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None):
   if not y_is_dict:
     output_shape = out_el_shape(y_shape, n_classes)
   else:
-    output_shape = dict([
-        (k, out_el_shape(v, n_classes[k]
-                         if n_classes is not None and k in n_classes else None))
-        for k, v in list(y_shape.items())
-    ])
+    output_shape = dict([(k,
+                          out_el_shape(v, n_classes[k]
+                                       if n_classes is not None and
+                                       k in n_classes else None))
+                         for k, v in list(y_shape.items())])
 
   return input_shape, output_shape, batch_size
 
@@ -314,23 +315,23 @@ class DataFeeder(object):
       input_dtype: DType of input (or dictionary of shapes).
       output_dtype: DType of output (or dictionary of shapes.
     """
-    x_is_dict, y_is_dict = isinstance(x, dict), y is not None and isinstance(
-        y, dict)
+    x_is_dict, y_is_dict = isinstance(
+        x, dict), y is not None and isinstance(y, dict)
     if isinstance(y, list):
       y = np.array(y)
 
     self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items())
                    ]) if x_is_dict else check_array(x, x.dtype)
-    self._y = None if y is None else (
-        dict([(k, check_array(v, v.dtype)) for k, v in list(y.items())])
-        if y_is_dict else check_array(y, y.dtype))
+    self._y = None if y is None else (dict(
+        [(k, check_array(v, v.dtype)) for k, v in list(y.items())])
+                                      if y_is_dict else check_array(y, y.dtype))
 
     # self.n_classes is not None means we're converting raw target indices
     # to one-hot.
     if n_classes is not None:
       if not y_is_dict:
-        y_dtype = (np.int64
-                   if n_classes is not None and n_classes > 1 else np.float32)
+        y_dtype = (
+            np.int64 if n_classes is not None and n_classes > 1 else np.float32)
         self._y = (None if y is None else check_array(y, dtype=y_dtype))
 
     self.n_classes = n_classes
@@ -352,8 +353,8 @@ class DataFeeder(object):
     # self._output_dtype == np.float32 when y is None
     self._output_dtype = (
         dict([(k, _check_dtype(v.dtype)) for k, v in list(self._y.items())])
-        if y_is_dict else (
-            _check_dtype(self._y.dtype) if y is not None else np.float32))
+        if y_is_dict else (_check_dtype(self._y.dtype)
+                           if y is not None else np.float32))
 
     # self.n_classes is None means we're passing in raw target indices
     if n_classes is not None and y_is_dict:
@@ -478,8 +479,8 @@ class DataFeeder(object):
 
     # Assign input features from random indices.
     def extract(data, indices):
-      return (np.array(_access(data, indices)).reshape((indices.shape[0], 1)) if
-              len(data.shape) == 1 else _access(data, indices))
+      return (np.array(_access(data, indices)).reshape((indices.shape[0], 1))
+              if len(data.shape) == 1 else _access(data, indices))
 
     # assign labels from random indices
     def assign_label(data, shape, dtype, n_classes, indices):
@@ -511,16 +512,18 @@ class DataFeeder(object):
         feed_dict[self._epoch_placeholder.name] = [self.epoch]
 
       # Take next batch of indices.
-      x_len = list(self._x.values())[0].shape[
-          0] if x_is_dict else self._x.shape[0]
+      x_len = list(
+          self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0]
       end = min(x_len, self.offset + self._batch_size)
       batch_indices = self.indices[self.offset:end]
 
       # adding input placeholder
       feed_dict.update(
           dict([(self._input_placeholder[k].name, extract(v, batch_indices))
-                for k, v in list(self._x.items())]) if x_is_dict else
-          {self._input_placeholder.name: extract(self._x, batch_indices)})
+                for k, v in list(self._x.items())]) if x_is_dict else {
+                    self._input_placeholder.name:
+                        extract(self._x, batch_indices)
+                })
 
       # move offset and reset it if necessary
       self.offset += self._batch_size
@@ -545,7 +548,8 @@ class DataFeeder(object):
                   assign_label(v, shape, dtype, n_classes, batch_indices)
           })
       else:
-        shape, dtype, n_classes = self.output_shape, self._output_dtype, self.n_classes
+        shape, dtype, n_classes = (self.output_shape, self._output_dtype,
+                                   self.n_classes)
         feed_dict.update({
             self._output_placeholder.name:
                 assign_label(self._y, shape, dtype, n_classes, batch_indices)
@@ -621,8 +625,9 @@ class StreamingDataFeeder(DataFeeder):
     elif y is None:
       y_first_el_shape = None
     else:
-      y_first_el_shape = ([1] + list(y_first_el[0].shape if isinstance(
-          y_first_el, list) else y_first_el.shape))
+      y_first_el_shape = (
+          [1] + list(y_first_el[0].shape
+                     if isinstance(y_first_el, list) else y_first_el.shape))
 
     self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
         x_first_el_shape, y_first_el_shape, n_classes, batch_size)
@@ -683,8 +688,8 @@ class StreamingDataFeeder(DataFeeder):
         if shape is None:
           return None
         elif isinstance(shape, dict):
-          return dict([(k, np.zeros(shape[k], dtype[k]))
-                       for k in list(shape.keys())])
+          return dict(
+              [(k, np.zeros(shape[k], dtype[k])) for k in list(shape.keys())])
         else:
           return np.zeros(shape, dtype=dtype)
 
diff --git a/tensorflow/contrib/learn/python/learn/trainable.py b/tensorflow/contrib/learn/python/learn/trainable.py
index 972fec026f..429b6040be 100644
--- a/tensorflow/contrib/learn/python/learn/trainable.py
+++ b/tensorflow/contrib/learn/python/learn/trainable.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """`Trainable` interface."""
 
 from __future__ import absolute_import
@@ -28,18 +27,31 @@ class Trainable(object):
   __metaclass__ = abc.ABCMeta
 
   @abc.abstractmethod
-  def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
-          monitors=None, max_steps=None):
+  def fit(self,
+          x=None,
+          y=None,
+          input_fn=None,
+          steps=None,
+          batch_size=None,
+          monitors=None,
+          max_steps=None):
     """Trains a model given training data `x` predictions and `y` labels.
 
     Args:
-      x: Matrix of shape [n_samples, n_features...] or the dictionary of Matrices.
-         Can be iterator that returns arrays of features or dictionary of arrays of features.
-         The training input samples for fitting the model. If set, `input_fn` must be `None`.
-      y: Vector or matrix [n_samples] or [n_samples, n_outputs] or the dictionary of same.
-         Can be iterator that returns array of labels or dictionary of array of labels.
-         The training label values (class labels in classification, real numbers in regression).
-         If set, `input_fn` must be `None`. Note: For classification, label values must
+      x: Matrix of shape [n_samples, n_features...] or the dictionary of
+        Matrices.
+         Can be iterator that returns arrays of features or dictionary of arrays
+           of features.
+         The training input samples for fitting the model. If set, `input_fn`
+           must be `None`.
+      y: Vector or matrix [n_samples] or [n_samples, n_outputs] or the
+        dictionary of same.
+         Can be iterator that returns array of labels or dictionary of array of
+           labels.
+         The training label values (class labels in classification, real numbers
+           in regression).
+         If set, `input_fn` must be `None`. Note: For classification, label
+           values must
          be integers representing the class index (i.e. values from 0 to
          n_classes-1).
       input_fn: Input function returning a tuple of:
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 7c523ad492..8c3a8afe7a 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -30,20 +30,13 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.deprecation import deprecated_args
 
-__all__ = ["absolute_difference",
-           "add_loss",
-           "cosine_distance",
-           "compute_weighted_loss",
-           "get_losses",
-           "get_regularization_losses",
-           "get_total_loss",
-           "hinge_loss",
-           "log_loss",
-           "mean_pairwise_squared_error",
-           "mean_squared_error",
-           "sigmoid_cross_entropy",
-           "softmax_cross_entropy",
-           "sparse_softmax_cross_entropy"]
+__all__ = [
+    "absolute_difference", "add_loss", "cosine_distance",
+    "compute_weighted_loss", "get_losses", "get_regularization_losses",
+    "get_total_loss", "hinge_loss", "log_loss", "mean_pairwise_squared_error",
+    "mean_squared_error", "sigmoid_cross_entropy", "softmax_cross_entropy",
+    "sparse_softmax_cross_entropy"
+]
 
 
 def _scale_losses(losses, weights):
@@ -66,8 +59,8 @@ def _scale_losses(losses, weights):
   # First, compute the sum of the losses over all elements:
   start_index = max(0, weights.get_shape().ndims)
   reduction_indices = list(range(start_index, losses.get_shape().ndims))
-  reduced_losses = math_ops.reduce_sum(losses,
-                                       reduction_indices=reduction_indices)
+  reduced_losses = math_ops.reduce_sum(
+      losses, reduction_indices=reduction_indices)
   reduced_losses = math_ops.multiply(reduced_losses, weights)
   return math_ops.reduce_sum(reduced_losses)
 
@@ -90,9 +83,10 @@ def _safe_div(numerator, denominator, name="value"):
   """
   return array_ops.where(
       math_ops.greater(denominator, 0),
-      math_ops.div(numerator, array_ops.where(
-          math_ops.equal(denominator, 0),
-          array_ops.ones_like(denominator), denominator)),
+      math_ops.div(numerator,
+                   array_ops.where(
+                       math_ops.equal(denominator, 0),
+                       array_ops.ones_like(denominator), denominator)),
       array_ops.zeros_like(numerator),
       name=name)
 
@@ -176,14 +170,15 @@ def _num_present(losses, weights, per_batch=False):
   """
   # If weights is a scalar, its easy to compute:
   if weights.get_shape().ndims == 0:
-    batch_size = array_ops.reshape(array_ops.slice(array_ops.shape(losses),
-                                                   [0], [1]), [])
-    num_per_batch = math_ops.div(math_ops.to_float(array_ops.size(losses)),
-                                 math_ops.to_float(batch_size))
-    num_per_batch = array_ops.where(math_ops.equal(weights, 0),
-                                    0.0, num_per_batch)
-    num_per_batch = math_ops.multiply(array_ops.ones(
-        array_ops.reshape(batch_size, [1])), num_per_batch)
+    batch_size = array_ops.reshape(
+        array_ops.slice(array_ops.shape(losses), [0], [1]), [])
+    num_per_batch = math_ops.div(
+        math_ops.to_float(array_ops.size(losses)),
+        math_ops.to_float(batch_size))
+    num_per_batch = array_ops.where(
+        math_ops.equal(weights, 0), 0.0, num_per_batch)
+    num_per_batch = math_ops.multiply(
+        array_ops.ones(array_ops.reshape(batch_size, [1])), num_per_batch)
     return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch)
 
   # First, count the number of nonzero weights:
@@ -194,8 +189,8 @@ def _num_present(losses, weights, per_batch=False):
         reduction_indices=reduction_indices)
 
   # Next, determine the number of elements that weights would broadcast to:
-  broadcast_dims = array_ops.slice(array_ops.shape(losses),
-                                   [weights.get_shape().ndims], [-1])
+  broadcast_dims = array_ops.slice(
+      array_ops.shape(losses), [weights.get_shape().ndims], [-1])
   num_to_broadcast = math_ops.to_float(math_ops.reduce_prod(broadcast_dims))
 
   num_per_batch = math_ops.multiply(num_nonzero_per_batch, num_to_broadcast)
@@ -303,8 +298,11 @@ def absolute_difference(predictions, labels=None, weights=1.0, scope=None):
 @deprecated("2016-12-30",
             "Use tf.losses.sigmoid_cross_entropy instead. Note that the order "
             "of the predictions and labels arguments has been changed.")
-def sigmoid_cross_entropy(
-    logits, multi_class_labels, weights=1.0, label_smoothing=0, scope=None):
+def sigmoid_cross_entropy(logits,
+                          multi_class_labels,
+                          weights=1.0,
+                          label_smoothing=0,
+                          scope=None):
   """Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
@@ -340,20 +338,22 @@ def sigmoid_cross_entropy(
     multi_class_labels = math_ops.cast(multi_class_labels, logits.dtype)
 
     if label_smoothing > 0:
-      multi_class_labels = (multi_class_labels * (1 - label_smoothing) +
-                            0.5 * label_smoothing)
+      multi_class_labels = (
+          multi_class_labels * (1 - label_smoothing) + 0.5 * label_smoothing)
 
-    losses = nn.sigmoid_cross_entropy_with_logits(labels=multi_class_labels,
-                                                  logits=logits,
-                                                  name="xentropy")
+    losses = nn.sigmoid_cross_entropy_with_logits(
+        labels=multi_class_labels, logits=logits, name="xentropy")
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
 @deprecated("2016-12-30",
             "Use tf.losses.softmax_cross_entropy instead. Note that the order "
             "of the logits and labels arguments has been changed.")
-def softmax_cross_entropy(
-    logits, onehot_labels, weights=1.0, label_smoothing=0, scope=None):
+def softmax_cross_entropy(logits,
+                          onehot_labels,
+                          weights=1.0,
+                          label_smoothing=0,
+                          scope=None):
   """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
@@ -393,9 +393,8 @@ def softmax_cross_entropy(
       smooth_negatives = label_smoothing / num_classes
       onehot_labels = onehot_labels * smooth_positives + smooth_negatives
 
-    losses = nn.softmax_cross_entropy_with_logits(labels=onehot_labels,
-                                                  logits=logits,
-                                                  name="xentropy")
+    losses = nn.softmax_cross_entropy_with_logits(
+        labels=onehot_labels, logits=logits, name="xentropy")
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
@@ -429,9 +428,8 @@ def sparse_softmax_cross_entropy(logits, labels, weights=1.0, scope=None):
                       [logits, labels, weights]) as scope:
     labels = array_ops.reshape(labels, shape=[array_ops.shape(labels)[0]])
 
-    losses = nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
-                                                         logits=logits,
-                                                         name="xentropy")
+    losses = nn.sparse_softmax_cross_entropy_with_logits(
+        labels=labels, logits=logits, name="xentropy")
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
@@ -470,8 +468,7 @@ def log_loss(predictions, labels=None, weights=1.0, epsilon=1e-7, scope=None):
     predictions = math_ops.to_float(predictions)
     labels = math_ops.to_float(labels)
     losses = -math_ops.multiply(
-        labels,
-        math_ops.log(predictions + epsilon)) - math_ops.multiply(
+        labels, math_ops.log(predictions + epsilon)) - math_ops.multiply(
             (1 - labels), math_ops.log(1 - predictions + epsilon))
     return compute_weighted_loss(losses, weights, scope=scope)
 
@@ -490,7 +487,8 @@ def hinge_loss(logits, labels=None, scope=None):
     scope: The scope for the operations performed in computing the loss.
 
   Returns:
-    An unweighted `Tensor` of same shape as `logits` and `labels` representing the
+    An unweighted `Tensor` of same shape as `logits` and `labels` representing
+    the
       loss values across the batch.
 
   Raises:
@@ -544,8 +542,10 @@ def mean_squared_error(predictions, labels=None, weights=1.0, scope=None):
 @deprecated("2016-12-30",
             "Use tf.losses.mean_pairwise_squared_error instead. Note that the "
             "order of the predictions and labels arguments has been changed.")
-def mean_pairwise_squared_error(
-    predictions, labels=None, weights=1.0, scope=None):
+def mean_pairwise_squared_error(predictions,
+                                labels=None,
+                                weights=1.0,
+                                scope=None):
   """Adds a pairwise-errors-squared loss to the training procedure.
 
   Unlike `mean_squared_error`, which is a measure of the differences between
@@ -602,31 +602,34 @@ def mean_pairwise_squared_error(
     reduction_indices = list(range(1, diffs.get_shape().ndims))
 
     sum_squares_diff_per_batch = math_ops.reduce_sum(
-        math_ops.square(diffs),
-        reduction_indices=reduction_indices)
+        math_ops.square(diffs), reduction_indices=reduction_indices)
     num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
-    term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
-                            num_present_per_batch)
+    term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, num_present_per_batch)
 
     sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices)
-    term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
-                            math_ops.square(num_present_per_batch))
+    term2 = 2.0 * _safe_div(
+        math_ops.square(sum_diff), math_ops.square(num_present_per_batch))
 
     loss = _scale_losses(term1 - term2, weights)
 
-    mean_loss = array_ops.where(math_ops.reduce_sum(num_present_per_batch) > 0,
-                                loss,
-                                array_ops.zeros_like(loss),
-                                name="value")
+    mean_loss = array_ops.where(
+        math_ops.reduce_sum(num_present_per_batch) > 0,
+        loss,
+        array_ops.zeros_like(loss),
+        name="value")
     add_loss(mean_loss)
     return mean_loss
 
 
 @deprecated("2016-12-30", "Use tf.losses.cosine_distance instead.")
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
-def cosine_distance(
-    predictions, labels=None, axis=None, weights=1.0, scope=None, dim=None):
+def cosine_distance(predictions,
+                    labels=None,
+                    axis=None,
+                    weights=1.0,
+                    scope=None,
+                    dim=None):
   """Adds a cosine-distance loss to the training procedure.
 
   Note that the function assumes that `predictions` and `labels` are already
@@ -662,5 +665,8 @@ def cosine_distance(
     labels = math_ops.to_float(labels)
 
     radial_diffs = math_ops.multiply(predictions, labels)
-    losses = 1 - math_ops.reduce_sum(radial_diffs, reduction_indices=[axis,])
+    losses = 1 - math_ops.reduce_sum(
+        radial_diffs, reduction_indices=[
+            axis,
+        ])
     return compute_weighted_loss(losses, weights, scope=scope)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
index 73dd56398c..660f0168b1 100644
--- a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
@@ -48,7 +48,7 @@ from tensorflow.contrib.model_pruning.python import pruning
 # Global constants describing the CIFAR-10 data set.
 IMAGE_SIZE = cifar10_input.IMAGE_SIZE
 NUM_CLASSES = cifar10_input.NUM_CLASSES
-NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN  # pylint: disable=line-too-long
 NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL
 BATCH_SIZE = 128
 DATA_DIR = '/tmp/cifar10_data'
diff --git a/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py b/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py
index f0a116239d..2fbefef0d3 100644
--- a/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py
+++ b/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py
@@ -26,7 +26,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.platform import resource_loader
 
 _mpi_ops_so = loader.load_op_library(
-    resource_loader.get_path_to_datafile("_mpi_ops.so"))
+    resource_loader.get_path_to_datafile('_mpi_ops.so'))
+
 
 def size(name=None):
   """An op which returns the number of MPI processes.
@@ -120,15 +121,14 @@ def allgather(tensor, name=None):
   """
   # Specify that first allgather is to collect the tensor gather sizes,
   # indicated by passing in a scalar (0-D tensor) of value 0
-  sizes_flag = tf.constant(0, dtype=tf.int64, name="size_flag_const")
-  my_size = tf.slice(tf.shape(tensor, out_type=tf.int64), [0], [1], name="size_slice")
+  sizes_flag = tf.constant(0, dtype=tf.int64, name='size_flag_const')
+  my_size = tf.slice(
+      tf.shape(tensor, out_type=tf.int64), [0], [1], name='size_slice')
   if name is None:
-    name = "allgather"
-  sizing_name = "{}_sizing".format(name)
+    name = 'allgather'
+  sizing_name = '{}_sizing'.format(name)
   sizes = gen_mpi_ops.mpi_allgather(my_size, sizes_flag, name=sizing_name)
   return gen_mpi_ops.mpi_allgather(tensor, sizes, name=name)
 
 
 ops.NotDifferentiable('MPIAllgather')
-
-
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index 0258d7202d..57521c6a9b 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -45,6 +45,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import nest
 
+
 class Plus1RNNCell(rnn_lib.RNNCell):
   """RNN Cell generating (output, new_state) = (input + 1, state + 1)."""
 
@@ -160,8 +161,7 @@ class RNNTest(test.TestCase):
     input_size = 5
     max_length = 8  # unrolled up to this length
     inputs = max_length * [
-        array_ops.placeholder(
-            dtypes.float32, shape=(batch_size, input_size))
+        array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
     ]
     outputs, state = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
     self.assertEqual(len(outputs), len(inputs))
@@ -178,10 +178,9 @@ class RNNTest(test.TestCase):
         self.assertAllClose(v, input_value + 1.0)
 
       # Final state
-      self.assertAllClose(
-          values[-1],
-          max_length * np.ones(
-              (batch_size, input_size), dtype=np.float32))
+      self.assertAllClose(values[-1],
+                          max_length * np.ones(
+                              (batch_size, input_size), dtype=np.float32))
 
   def testDropout(self):
     cell = Plus1RNNCell()
@@ -191,8 +190,7 @@ class RNNTest(test.TestCase):
     input_size = 5
     max_length = 8
     inputs = max_length * [
-        array_ops.placeholder(
-            dtypes.float32, shape=(batch_size, input_size))
+        array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
     ]
     with variable_scope.variable_scope("share_scope"):
       outputs, state = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
@@ -207,8 +205,10 @@ class RNNTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       input_value = np.random.randn(batch_size, input_size)
       values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value})
-      full_dropout_values = sess.run(dropped_outputs,
-                                     feed_dict={inputs[0]: input_value})
+      full_dropout_values = sess.run(
+          dropped_outputs, feed_dict={
+              inputs[0]: input_value
+          })
 
       for v in values[:-1]:
         self.assertAllClose(v, input_value + 1.0)
@@ -222,8 +222,7 @@ class RNNTest(test.TestCase):
     input_size = 5
     max_length = 8
     inputs = max_length * [
-        array_ops.placeholder(
-            dtypes.float32, shape=(batch_size, input_size))
+        array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
     ]
     with variable_scope.variable_scope("drop_scope"):
       dynamic_outputs, dynamic_state = rnn.static_rnn(
@@ -234,12 +233,16 @@ class RNNTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       dynamic_values = sess.run(
           dynamic_outputs,
-          feed_dict={inputs[0]: input_value,
-                     sequence_length: [2, 3]})
+          feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
       dynamic_state_value = sess.run(
           [dynamic_state],
-          feed_dict={inputs[0]: input_value,
-                     sequence_length: [2, 3]})
+          feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
 
       # outputs are fully calculated for t = 0, 1
       for v in dynamic_values[:2]:
@@ -289,8 +292,7 @@ class RNNTest(test.TestCase):
       input_size = 5
       max_length = 8  # unrolled up to this length
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       return rnn.static_rnn(cell, inputs, dtype=dtypes.float32, scope=scope)
 
@@ -316,8 +318,7 @@ class LSTMTest(test.TestCase):
       cell = rnn_cell.LSTMCell(
           num_units, initializer=initializer, state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       outputs, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
       self.assertEqual(len(outputs), len(inputs))
@@ -343,8 +344,7 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       outputs, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
       self.assertEqual(len(outputs), len(inputs))
@@ -374,8 +374,7 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       with variable_scope.variable_scope("share_scope"):
         outputs, state = rnn.static_state_saving_rnn(
@@ -388,7 +387,9 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       (last_state_value, saved_state_value) = sess.run(
           [state, state_saver.saved_state["save_lstm"]],
-          feed_dict={inputs[0]: input_value})
+          feed_dict={
+              inputs[0]: input_value
+          })
       self.assertAllEqual(last_state_value, saved_state_value)
 
   def testNoProjNoShardingTupleStateSaver(self):
@@ -406,8 +407,7 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=True)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       with variable_scope.variable_scope("share_scope"):
         outputs, state = rnn.static_state_saving_rnn(
@@ -420,7 +420,9 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       last_and_saved_states = sess.run(
           state + (state_saver.saved_state["c"], state_saver.saved_state["m"]),
-          feed_dict={inputs[0]: input_value})
+          feed_dict={
+              inputs[0]: input_value
+          })
       self.assertEqual(4, len(last_and_saved_states))
       self.assertAllEqual(last_and_saved_states[:2], last_and_saved_states[2:])
 
@@ -432,16 +434,17 @@ class LSTMTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
-      state_saver = TestStateSaver(batch_size, {
-          "c0": num_units,
-          "m0": num_units,
-          "c1": num_units + 1,
-          "m1": num_units + 1,
-          "c2": num_units + 2,
-          "m2": num_units + 2,
-          "c3": num_units + 3,
-          "m3": num_units + 3
-      })
+      state_saver = TestStateSaver(
+          batch_size, {
+              "c0": num_units,
+              "m0": num_units,
+              "c1": num_units + 1,
+              "m1": num_units + 1,
+              "c2": num_units + 2,
+              "m2": num_units + 2,
+              "c3": num_units + 3,
+              "m3": num_units + 3
+          })
 
       def _cell(i):
         return rnn_cell.LSTMCell(
@@ -459,8 +462,7 @@ class LSTMTest(test.TestCase):
         self.assertEqual(len(cell.state_size[i]), 2)
 
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
 
       state_names = (("c0", "m0"), ("c1", "m1"), ("c2", "m2"), ("c3", "m3"))
@@ -475,10 +477,15 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      last_states = sess.run(list(nest.flatten(state)),
-                             feed_dict={inputs[0]: input_value})
-      saved_states = sess.run(list(state_saver.saved_state.values()),
-                              feed_dict={inputs[0]: input_value})
+      last_states = sess.run(
+          list(nest.flatten(state)), feed_dict={
+              inputs[0]: input_value
+          })
+      saved_states = sess.run(
+          list(state_saver.saved_state.values()),
+          feed_dict={
+              inputs[0]: input_value
+          })
       self.assertEqual(8, len(last_states))
       self.assertEqual(8, len(saved_states))
       flat_state_names = nest.flatten(state_names)
@@ -499,8 +506,7 @@ class LSTMTest(test.TestCase):
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell = rnn_cell.LSTMCell(
           num_units,
@@ -526,8 +532,7 @@ class LSTMTest(test.TestCase):
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell_notuple = rnn_cell.LSTMCell(
           num_units,
@@ -569,14 +574,20 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      outputs_notuple_v = sess.run(outputs_notuple,
-                                   feed_dict={inputs[0]: input_value})
-      outputs_tuple_v = sess.run(outputs_tuple,
-                                 feed_dict={inputs[0]: input_value})
+      outputs_notuple_v = sess.run(
+          outputs_notuple, feed_dict={
+              inputs[0]: input_value
+          })
+      outputs_tuple_v = sess.run(
+          outputs_tuple, feed_dict={
+              inputs[0]: input_value
+          })
       self.assertAllEqual(outputs_notuple_v, outputs_tuple_v)
 
-      (state_notuple_v,) = sess.run((state_notuple,),
-                                    feed_dict={inputs[0]: input_value})
+      (state_notuple_v,) = sess.run(
+          (state_notuple,), feed_dict={
+              inputs[0]: input_value
+          })
       state_tuple_v = sess.run(state_tuple, feed_dict={inputs[0]: input_value})
       self.assertAllEqual(state_notuple_v, np.hstack(state_tuple_v))
 
@@ -593,8 +604,7 @@ class LSTMTest(test.TestCase):
           -0.01, 0.01, seed=self._seed)
 
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
 
       cell = rnn_cell.LSTMCell(
@@ -625,8 +635,7 @@ class LSTMTest(test.TestCase):
     with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float64, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float64, shape=(None, input_size))
       ]
 
       cell = rnn_cell.LSTMCell(
@@ -661,8 +670,7 @@ class LSTMTest(test.TestCase):
     max_length = 8
     with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       initializer = init_ops.constant_initializer(0.001)
 
@@ -721,8 +729,7 @@ class LSTMTest(test.TestCase):
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float64, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float64, shape=(None, input_size))
       ]
 
       cell = rnn_cell.LSTMCell(
@@ -743,16 +750,21 @@ class LSTMTest(test.TestCase):
 
       self.assertEqual(len(outputs), len(inputs))
 
-      variables_lib.global_variables_initializer().run(
-          feed_dict={sequence_length: [2, 3]})
+      variables_lib.global_variables_initializer().run(feed_dict={
+          sequence_length: [2, 3]
+      })
       input_value = np.asarray(
           np.random.randn(batch_size, input_size), dtype=np.float64)
       values = sess.run(
-          outputs, feed_dict={inputs[0]: input_value,
-                              sequence_length: [2, 3]})
+          outputs, feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
       state_value = sess.run(
-          [state], feed_dict={inputs[0]: input_value,
-                              sequence_length: [2, 3]})
+          [state], feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
       self.assertEqual(values[0].dtype, input_value.dtype)
       self.assertEqual(state_value[0].dtype, input_value.dtype)
 
@@ -767,8 +779,7 @@ class LSTMTest(test.TestCase):
       initializer_d = init_ops.random_uniform_initializer(
           -1, 1, seed=self._seed + 1)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell = rnn_cell.LSTMCell(
           num_units,
@@ -792,8 +803,10 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      output_values = sess.run(outputs0 + outputs1 + outputs2,
-                               feed_dict={inputs[0]: input_value})
+      output_values = sess.run(
+          outputs0 + outputs1 + outputs2, feed_dict={
+              inputs[0]: input_value
+          })
       outputs0_values = output_values[:max_length]
       outputs1_values = output_values[max_length:2 * max_length]
       outputs2_values = output_values[2 * max_length:]
@@ -814,8 +827,7 @@ class LSTMTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell = rnn_cell.LSTMCell(
           num_units,
@@ -833,8 +845,10 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      output_values = sess.run(outputs0 + outputs1,
-                               feed_dict={inputs[0]: input_value})
+      output_values = sess.run(
+          outputs0 + outputs1, feed_dict={
+              inputs[0]: input_value
+          })
       outputs0_values = output_values[:max_length]
       outputs1_values = output_values[max_length:]
       self.assertEqual(len(outputs0_values), len(outputs1_values))
@@ -861,8 +875,7 @@ class LSTMTest(test.TestCase):
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
         inputs = max_length * [
-            array_ops.placeholder(
-                dtypes.float32, shape=(None, input_size))
+            array_ops.placeholder(dtypes.float32, shape=(None, input_size))
         ]
       else:
         inputs = max_length * [
@@ -939,8 +952,7 @@ class LSTMTest(test.TestCase):
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
         inputs = max_length * [
-            array_ops.placeholder(
-                dtypes.float32, shape=(None, input_size))
+            array_ops.placeholder(dtypes.float32, shape=(None, input_size))
         ]
       else:
         inputs = max_length * [
@@ -1100,8 +1112,8 @@ class LSTMTest(test.TestCase):
         # Test gradients to inputs and variables w.r.t. outputs & final state
         static_grad_values = sess.run(static_gradients, feed_dict=feeds)
 
-        static_individual_grad_values = sess.run(static_individual_gradients,
-                                                 feed_dict=feeds)
+        static_individual_grad_values = sess.run(
+            static_individual_gradients, feed_dict=feeds)
 
         static_individual_var_grad_values = sess.run(
             static_individual_variable_gradients, feed_dict=feeds)
@@ -1148,8 +1160,10 @@ class LSTMTest(test.TestCase):
         # Generate gradients of several individual outputs w.r.t. inputs
         dynamic_individual_gradients = nest.flatten([
             gradients_impl.gradients(y, [concat_inputs])
-            for y in
-            [split_outputs_dynamic[0], split_outputs_dynamic[-1], state_dynamic]
+            for y in [
+                split_outputs_dynamic[0], split_outputs_dynamic[-1],
+                state_dynamic
+            ]
         ])
 
         # Generate gradients of individual variables w.r.t. inputs
@@ -1159,8 +1173,10 @@ class LSTMTest(test.TestCase):
             "Count of trainable variables: %d" % len(trainable_variables))
         dynamic_individual_variable_gradients = nest.flatten([
             gradients_impl.gradients(y, trainable_variables)
-            for y in
-            [split_outputs_dynamic[0], split_outputs_dynamic[-1], state_dynamic]
+            for y in [
+                split_outputs_dynamic[0], split_outputs_dynamic[-1],
+                state_dynamic
+            ]
         ])
 
         # Test forward pass
@@ -1170,8 +1186,8 @@ class LSTMTest(test.TestCase):
         # Test gradients to inputs and variables w.r.t. outputs & final state
         dynamic_grad_values = sess.run(dynamic_gradients, feed_dict=feeds)
 
-        dynamic_individual_grad_values = sess.run(dynamic_individual_gradients,
-                                                  feed_dict=feeds)
+        dynamic_individual_grad_values = sess.run(
+            dynamic_individual_gradients, feed_dict=feeds)
 
         dynamic_individual_var_grad_values = sess.run(
             dynamic_individual_variable_gradients, feed_dict=feeds)
@@ -1207,8 +1223,8 @@ class LSTMTest(test.TestCase):
       for i, (a, b) in enumerate(
           zip(static_individual_var_grad_values,
               dynamic_individual_var_grad_values)):
-        tf_logging.info("Comparing individual variable gradients iteration %d" %
-                        i)
+        tf_logging.info(
+            "Comparing individual variable gradients iteration %d" % i)
         self.assertAllEqual(a, b)
 
   @test_util.run_in_graph_and_eager_modes()
@@ -1223,10 +1239,7 @@ class BidirectionalRNNTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
-  def _createBidirectionalRNN(self,
-                              use_shape,
-                              use_sequence_length,
-                              scope=None):
+  def _createBidirectionalRNN(self, use_shape, use_sequence_length, scope=None):
     num_units = 3
     input_size = 5
     batch_size = 2
@@ -1270,8 +1283,10 @@ class BidirectionalRNNTest(test.TestCase):
       # Run with pre-specified sequence length of 2, 3
       out, s_fw, s_bw = sess.run(
           [outputs, state_fw, state_bw],
-          feed_dict={inputs[0]: input_value,
-                     sequence_length: [2, 3]})
+          feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
 
       # Since the forward and backward LSTM cells were initialized with the
       # same parameters, the forward and backward output has to be the same,
@@ -1312,8 +1327,10 @@ class BidirectionalRNNTest(test.TestCase):
       input_value, inputs, outputs, state_fw, state_bw, _ = (
           self._createBidirectionalRNN(use_shape, False))
       variables_lib.global_variables_initializer().run()
-      out, s_fw, s_bw = sess.run([outputs, state_fw, state_bw],
-                                 feed_dict={inputs[0]: input_value})
+      out, s_fw, s_bw = sess.run(
+          [outputs, state_fw, state_bw], feed_dict={
+              inputs[0]: input_value
+          })
 
       # Since the forward and backward LSTM cells were initialized with the
       # same parameters, the forward and backward output has to be the same,
@@ -1396,13 +1413,11 @@ class BidirectionalRNNTest(test.TestCase):
                                    use_time_major, use_sequence_length):
     with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
-          self._createBidirectionalDynamicRNN(use_shape,
-                                              use_state_tuple, use_time_major,
-                                              use_sequence_length))
+          self._createBidirectionalDynamicRNN(
+              use_shape, use_state_tuple, use_time_major, use_sequence_length))
       variables_lib.global_variables_initializer().run()
       # Run with pre-specified sequence length of 2, 3
-      feed_dict = (
-          {sequence_length: [2, 3]} if use_sequence_length else {})
+      feed_dict = ({sequence_length: [2, 3]} if use_sequence_length else {})
       feed_dict.update({inputs[0]: input_value})
       if use_state_tuple:
         out, c_fw, m_fw, c_bw, m_bw = sess.run(
@@ -1538,8 +1553,7 @@ class MultiDimensionalLSTMTest(test.TestCase):
     sequence_length = [4, 6]
     with self.test_session(graph=ops_lib.Graph()) as sess:
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None,) + input_size)
+          array_ops.placeholder(dtypes.float32, shape=(None,) + input_size)
       ]
       inputs_using_dim = max_length * [
           array_ops.placeholder(
@@ -1585,14 +1599,22 @@ class MultiDimensionalLSTMTest(test.TestCase):
 
       input_total_size = (batch_size,) + input_size
       input_value = np.random.randn(*input_total_size)
-      outputs_static_v = sess.run(outputs_static,
-                                  feed_dict={inputs[0]: input_value})
-      outputs_dynamic_v = sess.run(outputs_dynamic,
-                                   feed_dict={inputs[0]: input_value})
-      outputs_bid_v = sess.run(outputs_bid,
-                               feed_dict={inputs_using_dim[0]: input_value})
-      outputs_sav_v = sess.run(outputs_sav,
-                               feed_dict={inputs_using_dim[0]: input_value})
+      outputs_static_v = sess.run(
+          outputs_static, feed_dict={
+              inputs[0]: input_value
+          })
+      outputs_dynamic_v = sess.run(
+          outputs_dynamic, feed_dict={
+              inputs[0]: input_value
+          })
+      outputs_bid_v = sess.run(
+          outputs_bid, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
+      outputs_sav_v = sess.run(
+          outputs_sav, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
 
       self.assertAllEqual(outputs_static_v, outputs_dynamic_v)
       self.assertAllEqual(outputs_static_v, outputs_sav_v)
@@ -1602,16 +1624,26 @@ class MultiDimensionalLSTMTest(test.TestCase):
       outputs_bid_array = np.array(outputs_bid_v)
       self.assertAllEqual(outputs_static_array_double, outputs_bid_array)
 
-      state_static_v = sess.run(state_static,
-                                feed_dict={inputs[0]: input_value})
-      state_dynamic_v = sess.run(state_dynamic,
-                                 feed_dict={inputs[0]: input_value})
-      state_bid_fw_v = sess.run(state_fw,
-                                feed_dict={inputs_using_dim[0]: input_value})
-      state_bid_bw_v = sess.run(state_bw,
-                                feed_dict={inputs_using_dim[0]: input_value})
-      state_sav_v = sess.run(state_sav,
-                             feed_dict={inputs_using_dim[0]: input_value})
+      state_static_v = sess.run(
+          state_static, feed_dict={
+              inputs[0]: input_value
+          })
+      state_dynamic_v = sess.run(
+          state_dynamic, feed_dict={
+              inputs[0]: input_value
+          })
+      state_bid_fw_v = sess.run(
+          state_fw, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
+      state_bid_bw_v = sess.run(
+          state_bw, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
+      state_sav_v = sess.run(
+          state_sav, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_dynamic_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_sav_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_bid_fw_v))
@@ -1633,16 +1665,17 @@ class NestedLSTMTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       state_saver = TestStateSaver(batch_size, state_size)
       single_input = (array_ops.placeholder(
-          dtypes.float32, shape=(None, input_size)), array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size)))
+          dtypes.float32, shape=(None, input_size)),
+                      array_ops.placeholder(
+                          dtypes.float32, shape=(None, input_size)))
       inputs = max_length * [single_input]
       inputs_c = (array_ops.stack([input_[0] for input_ in inputs]),
                   array_ops.stack([input_[1] for input_ in inputs]))
-      single_input_using_dim = (
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size)),
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size)))
+      single_input_using_dim = (array_ops.placeholder(
+          dtypes.float32, shape=(batch_size, input_size)),
+                                array_ops.placeholder(
+                                    dtypes.float32,
+                                    shape=(batch_size, input_size)))
       inputs_using_dim = max_length * [single_input_using_dim]
 
       # Create a cell for the whole test. This is fine because the cell has no
@@ -1688,14 +1721,22 @@ class NestedLSTMTest(test.TestCase):
       input_total_size = (batch_size, input_size)
       input_value = (np.random.randn(*input_total_size),
                      np.random.randn(*input_total_size))
-      outputs_dynamic_v = sess.run(outputs_dynamic,
-                                   feed_dict={single_input: input_value})
-      outputs_static_v = sess.run(outputs_static,
-                                  feed_dict={single_input: input_value})
-      outputs_sav_v = sess.run(outputs_sav,
-                               feed_dict={single_input_using_dim: input_value})
-      outputs_bid_v = sess.run(outputs_bid,
-                               feed_dict={single_input_using_dim: input_value})
+      outputs_dynamic_v = sess.run(
+          outputs_dynamic, feed_dict={
+              single_input: input_value
+          })
+      outputs_static_v = sess.run(
+          outputs_static, feed_dict={
+              single_input: input_value
+          })
+      outputs_sav_v = sess.run(
+          outputs_sav, feed_dict={
+              single_input_using_dim: input_value
+          })
+      outputs_bid_v = sess.run(
+          outputs_bid, feed_dict={
+              single_input_using_dim: input_value
+          })
 
       self.assertAllEqual(outputs_static_v,
                           np.transpose(outputs_dynamic_v, (1, 0, 2, 3)))
@@ -1706,16 +1747,26 @@ class NestedLSTMTest(test.TestCase):
       outputs_bid_array = np.array(outputs_bid_v)
       self.assertAllEqual(outputs_static_array_double, outputs_bid_array)
 
-      state_dynamic_v = sess.run(state_dynamic,
-                                 feed_dict={single_input: input_value})
-      state_static_v = sess.run(state_static,
-                                feed_dict={single_input: input_value})
-      state_bid_fw_v = sess.run(state_fw,
-                                feed_dict={single_input_using_dim: input_value})
-      state_bid_bw_v = sess.run(state_bw,
-                                feed_dict={single_input_using_dim: input_value})
-      state_sav_v = sess.run(state_sav,
-                             feed_dict={single_input_using_dim: input_value})
+      state_dynamic_v = sess.run(
+          state_dynamic, feed_dict={
+              single_input: input_value
+          })
+      state_static_v = sess.run(
+          state_static, feed_dict={
+              single_input: input_value
+          })
+      state_bid_fw_v = sess.run(
+          state_fw, feed_dict={
+              single_input_using_dim: input_value
+          })
+      state_bid_bw_v = sess.run(
+          state_bw, feed_dict={
+              single_input_using_dim: input_value
+          })
+      state_sav_v = sess.run(
+          state_sav, feed_dict={
+              single_input_using_dim: input_value
+          })
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_dynamic_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_sav_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_bid_fw_v))
@@ -1764,8 +1815,7 @@ class StateSaverRNNTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       return rnn.static_state_saving_rnn(
           cell,
@@ -1931,8 +1981,10 @@ class RawRNNTest(test.TestCase):
       (outputs_val, outputs_dynamic_rnn_val, final_state_val,
        final_state_dynamic_rnn_val) = sess.run(
            [outputs, outputs_dynamic_rnn, final_state, final_state_dynamic_rnn],
-           feed_dict={inputs: rand_input,
-                      sequence_length: rand_seq_len})
+           feed_dict={
+               inputs: rand_input,
+               sequence_length: rand_seq_len
+           })
 
       self.assertAllClose(outputs_dynamic_rnn_val, outputs_val)
       self.assertAllClose(final_state_dynamic_rnn_val, final_state_val)
@@ -1945,12 +1997,16 @@ class RawRNNTest(test.TestCase):
         self.assertEqual(len(gradients), len(gradients_dynamic_rnn))
         gradients_val = sess.run(
             gradients,
-            feed_dict={inputs: rand_input,
-                       sequence_length: rand_seq_len})
+            feed_dict={
+                inputs: rand_input,
+                sequence_length: rand_seq_len
+            })
         gradients_dynamic_rnn_val = sess.run(
             gradients_dynamic_rnn,
-            feed_dict={inputs: rand_input,
-                       sequence_length: rand_seq_len})
+            feed_dict={
+                inputs: rand_input,
+                sequence_length: rand_seq_len
+            })
         self.assertEqual(len(gradients_val), len(gradients_dynamic_rnn_val))
         input_gradients_val = gradients_val[0]
         input_gradients_dynamic_rnn_val = gradients_dynamic_rnn_val[0]
@@ -2067,14 +2123,13 @@ class RawRNNTest(test.TestCase):
 
       def loop_fn(time_, cell_output, cell_state, _):
         if cell_output is None:
-          emit_output = (array_ops.zeros(
-              [2, 3], dtype=dtypes.int32), array_ops.zeros(
-                  [unknown_dim], dtype=dtypes.int64))
+          emit_output = (array_ops.zeros([2, 3], dtype=dtypes.int32),
+                         array_ops.zeros([unknown_dim], dtype=dtypes.int64))
           next_state = cell.zero_state(batch_size, dtypes.float32)
         else:
-          emit_output = (array_ops.ones(
-              [batch_size, 2, 3], dtype=dtypes.int32), array_ops.ones(
-                  [batch_size, unknown_dim], dtype=dtypes.int64))
+          emit_output = (array_ops.ones([batch_size, 2, 3], dtype=dtypes.int32),
+                         array_ops.ones(
+                             [batch_size, unknown_dim], dtype=dtypes.int64))
           next_state = cell_state
         elements_finished = array_ops.tile([time_ >= max_time], [batch_size])
         finished = math_ops.reduce_all(elements_finished)
@@ -2193,8 +2248,8 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
 
     cell = rnn_cell.LSTMCell(num_units, use_peepholes=True)
     gpu_cell = DeviceWrapperCell(cell, cell_device)
-    inputs = np.random.randn(batch_size, time_steps,
-                             input_size).astype(np.float32)
+    inputs = np.random.randn(batch_size, time_steps, input_size).astype(
+        np.float32)
     sequence_length = np.random.randint(0, time_steps, size=batch_size)
 
     if input_device is not None:
@@ -2262,8 +2317,7 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
 
     gpu_dev = test.gpu_device_name()
     run_metadata = self._execute_rnn_on(
-        rnn_device="/cpu:0", cell_device="/cpu:0",
-        input_device=gpu_dev)
+        rnn_device="/cpu:0", cell_device="/cpu:0", input_device=gpu_dev)
     cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
 
     def _assert_in(op_str, in_stats, out_stats):
@@ -2278,8 +2332,7 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
       return  # Test requires access to a GPU
 
     gpu_dev = test.gpu_device_name()
-    run_metadata = self._execute_rnn_on(
-        input_device=gpu_dev)
+    run_metadata = self._execute_rnn_on(input_device=gpu_dev)
     cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
 
     def _assert_in(op_str, in_stats, out_stats):
diff --git a/tensorflow/contrib/session_bundle/exporter.py b/tensorflow/contrib/session_bundle/exporter.py
index f6f663aae7..08983337fc 100644
--- a/tensorflow/contrib/session_bundle/exporter.py
+++ b/tensorflow/contrib/session_bundle/exporter.py
@@ -281,11 +281,12 @@ class Exporter(object):
     tmp_export_dir = compat.as_text(export_dir) + "-tmp"
     gfile.MakeDirs(tmp_export_dir)
 
-    self._saver.save(sess,
-                     os.path.join(
-                         compat.as_text(tmp_export_dir),
-                         compat.as_text(constants.EXPORT_BASE_NAME)),
-                     meta_graph_suffix=constants.EXPORT_SUFFIX_NAME)
+    self._saver.save(
+        sess,
+        os.path.join(
+            compat.as_text(tmp_export_dir),
+            compat.as_text(constants.EXPORT_BASE_NAME)),
+        meta_graph_suffix=constants.EXPORT_SUFFIX_NAME)
 
     # Run the asset callback.
     if self._assets_callback and self._assets_to_copy:
@@ -301,12 +302,12 @@ class Exporter(object):
     if exports_to_keep:
       # create a simple parser that pulls the export_version from the directory.
       def parser(path):
-        if os.name == 'nt':
-          match = re.match("^" + export_dir_base.replace('\\','/') + "/(\\d{8})$",
-                           path.path.replace('\\','/'))
+        if os.name == "nt":
+          match = re.match(
+              "^" + export_dir_base.replace("\\", "/") + "/(\\d{8})$",
+              path.path.replace("\\", "/"))
         else:
-          match = re.match("^" + export_dir_base + "/(\\d{8})$",
-                           path.path)
+          match = re.match("^" + export_dir_base + "/(\\d{8})$", path.path)
         if not match:
           return None
         return path._replace(export_version=int(match.group(1)))
diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py
index 4e816f9b11..831c6e427a 100644
--- a/tensorflow/contrib/slim/python/slim/learning_test.py
+++ b/tensorflow/contrib/slim/python/slim/learning_test.py
@@ -197,9 +197,7 @@ class MultiplyGradientsTest(test.TestCase):
     gradient = constant_op.constant(self._grad_vec, dtype=dtypes.float32)
     variable = variables_lib.Variable(array_ops.zeros_like(gradient))
     multiplier_flag = variables_lib.Variable(True)
-    tensor_multiplier = array_ops.where(multiplier_flag,
-                                        self._multiplier,
-                                        1.0)
+    tensor_multiplier = array_ops.where(multiplier_flag, self._multiplier, 1.0)
     grad_to_var = (gradient, variable)
     gradient_multipliers = {variable: tensor_multiplier}
 
@@ -212,11 +210,8 @@ class MultiplyGradientsTest(test.TestCase):
       sess.run(multiplier_flag.assign(False))
       gradient_false_flag = sess.run(grad_to_var[0])
     np_testing.assert_almost_equal(gradient_true_flag,
-                                   self._multiplied_grad_vec,
-                                   5)
-    np_testing.assert_almost_equal(gradient_false_flag,
-                                   self._grad_vec,
-                                   5)
+                                   self._multiplied_grad_vec, 5)
+    np_testing.assert_almost_equal(gradient_false_flag, self._grad_vec, 5)
 
 
 def LogisticClassifier(inputs):
@@ -502,6 +497,7 @@ class TrainTest(test.TestCase):
     purpose.
     """
     dump_root = tempfile.mkdtemp()
+
     def dumping_wrapper(sess):  # pylint: disable=invalid-name
       return dumping_wrapper_lib.DumpingDebugWrapperSession(sess, dump_root)
 
@@ -519,16 +515,13 @@ class TrainTest(test.TestCase):
       train_op = learning.create_train_op(total_loss, optimizer)
 
       loss = learning.train(
-          train_op,
-          None,
-          number_of_steps=1,
-          session_wrapper=dumping_wrapper)
+          train_op, None, number_of_steps=1, session_wrapper=dumping_wrapper)
     self.assertIsNotNone(loss)
 
     run_root = glob.glob(os.path.join(dump_root, 'run_*'))[-1]
     dump = debug_data.DebugDumpDir(run_root)
-    self.assertAllEqual(
-        0, dump.get_tensors('global_step', 0, 'DebugIdentity')[0])
+    self.assertAllEqual(0,
+                        dump.get_tensors('global_step', 0, 'DebugIdentity')[0])
 
   def testTrainWithTrace(self):
     logdir = os.path.join(
@@ -961,8 +954,8 @@ class TrainTest(test.TestCase):
     self.assertGreater(losses[0], losses[1])
 
   def testTrainWithEpochLimit(self):
-    logdir = os.path.join(tempfile.mkdtemp(prefix=self.get_temp_dir()),
-                          'tmp_logs')
+    logdir = os.path.join(
+        tempfile.mkdtemp(prefix=self.get_temp_dir()), 'tmp_logs')
     with ops.Graph().as_default():
       random_seed.set_random_seed(0)
       tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
@@ -982,7 +975,8 @@ class TrainTest(test.TestCase):
     self.assertIsNotNone(loss)
     self.assertLess(loss, .015)
     self.assertTrue(os.path.isfile('{}/model.ckpt-300.index'.format(logdir)))
-    self.assertTrue(os.path.isfile('{}/model.ckpt-300.data-00000-of-00001'.format(logdir)))
+    self.assertTrue(
+        os.path.isfile('{}/model.ckpt-300.data-00000-of-00001'.format(logdir)))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
index fb3ac94203..47dd6a1947 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A very simple MNIST classifier.
 
 See extensive documentation at
@@ -67,12 +66,19 @@ def main(_):
   # Test trained model
   correct_prediction = tf.equal(tf.argmax(y, 1), y_)
   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-  print(sess.run(accuracy, feed_dict={x: mnist.test.images,
-                                      y_: mnist.test.labels}))
+  print(sess.run(
+      accuracy, feed_dict={
+          x: mnist.test.images,
+          y_: mnist.test.labels
+      }))
+
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
-  parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
-                      help='Directory for storing input data')
+  parser.add_argument(
+      '--data_dir',
+      type=str,
+      default='/tmp/tensorflow/mnist/input_data',
+      help='Directory for storing input data')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/client/device_lib_test.py b/tensorflow/python/client/device_lib_test.py
index 7bba10efac..aaf41626ab 100644
--- a/tensorflow/python/client/device_lib_test.py
+++ b/tensorflow/python/client/device_lib_test.py
@@ -34,7 +34,8 @@ class DeviceLibTest(test_util.TensorFlowTestCase):
     # GPU test
     if test.is_gpu_available():
       self.assertGreater(len(devices), 1)
-      self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices])
+      self.assertTrue("GPU" in [d.device_type for d in devices] or
+                      "SYCL" in [d.device_type for d in devices])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index 989ad801e5..0204254cca 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -35,10 +35,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
   """
 
-  def __init__(self,
-               ui_type="curses",
-               dump_root=None,
-               thread_name_filter=None):
+  def __init__(self, ui_type="curses", dump_root=None, thread_name_filter=None):
     """Create a local debugger command-line interface (CLI) hook.
 
     Args:
@@ -62,7 +59,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
     """Add a tensor filter.
 
     See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()` for details.
-    Override default behavior to accommodate the possibility of this method being
+    Override default behavior to accommodate the possibility of this method
+    being
     called prior to the initialization of the underlying
     `LocalCLIDebugWrapperSession` object.
 
@@ -137,9 +135,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
       # pylint: enable=protected-access
 
       with stepper.NodeStepper(
-          run_context.session,
-          run_context.original_args.
-          fetches,
+          run_context.session, run_context.original_args.fetches,
           run_context.original_args.feed_dict) as node_stepper:
         self._session_wrapper.invoke_node_stepper(
             node_stepper, restore_variable_values_on_exit=True)
@@ -149,8 +145,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   def after_run(self, run_context, run_values):
     # Adapt run_context and run_values to OnRunEndRequest and invoke superclass
     # on_run_end()
-    on_run_end_request = framework.OnRunEndRequest(
-        self._performed_action, run_values.run_metadata)
+    on_run_end_request = framework.OnRunEndRequest(self._performed_action,
+                                                   run_values.run_metadata)
     self._session_wrapper.on_run_end(on_run_end_request)
 
 
@@ -260,8 +256,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
     self._thread_name_filter = thread_name_filter
     self._grpc_debug_server_addresses = (
         grpc_debug_server_addresses
-        if isinstance(grpc_debug_server_addresses, list)
-        else [grpc_debug_server_addresses])
+        if isinstance(grpc_debug_server_addresses, list) else
+        [grpc_debug_server_addresses])
 
     self._watch_fn = watch_fn
     self._log_usage = log_usage
@@ -334,6 +330,7 @@ class TensorBoardDebugHook(GrpcDebugHook):
       log_usage: Whether the usage of this class is to be logged (if
         applicable).
     """
+
     def _gated_grpc_watch_fn(fetches, feeds):
       del fetches, feeds  # Unused.
       return framework.WatchOptions(
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 67ccf990d6..c825114483 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Library of dtypes (Tensor element types)."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
 import numpy as np
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.util.tf_export import tf_export
 
-
 _np_bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
 
 
@@ -83,8 +80,8 @@ class DType(object):
     # TODO(mrry): Make the necessary changes (using __new__) to ensure
     # that calling this returns one of the interned values.
     type_enum = int(type_enum)
-    if (type_enum not in types_pb2.DataType.values()
-        or type_enum == types_pb2.DT_INVALID):
+    if (type_enum not in types_pb2.DataType.values() or
+        type_enum == types_pb2.DT_INVALID):
       raise TypeError(
           "type_enum is not a valid types_pb2.DataType: %s" % type_enum)
     self._type_enum = type_enum
@@ -123,10 +120,10 @@ class DType(object):
 
   @property
   def is_numpy_compatible(self):
-    numpy_incompatible = [types_pb2.DT_VARIANT,
-                          types_pb2.DT_VARIANT_REF,
-                          types_pb2.DT_RESOURCE,
-                          types_pb2.DT_RESOURCE_REF]
+    numpy_incompatible = [
+        types_pb2.DT_VARIANT, types_pb2.DT_VARIANT_REF, types_pb2.DT_RESOURCE,
+        types_pb2.DT_RESOURCE_REF
+    ]
     return self._type_enum not in numpy_incompatible
 
   @property
@@ -153,9 +150,9 @@ class DType(object):
   @property
   def is_floating(self):
     """Returns whether this is a (non-quantized, real) floating point type."""
-    return ((self.is_numpy_compatible and np.issubdtype(self.as_numpy_dtype,
-                                                        np.floating))
-            or self.base_dtype == bfloat16)
+    return ((self.is_numpy_compatible and
+             np.issubdtype(self.as_numpy_dtype, np.floating)) or
+            self.base_dtype == bfloat16)
 
   @property
   def is_complex(self):
@@ -190,8 +187,8 @@ class DType(object):
       TypeError: if this is a non-numeric, unordered, or quantized type.
 
     """
-    if (self.is_quantized or self.base_dtype in
-        (bool, string, complex64, complex128)):
+    if (self.is_quantized or
+        self.base_dtype in (bool, string, complex64, complex128)):
       raise TypeError("Cannot find minimum value of %s." % self)
 
     # there is no simple way to get the min value of a dtype, we have to check
@@ -214,8 +211,8 @@ class DType(object):
       TypeError: if this is a non-numeric, unordered, or quantized type.
 
     """
-    if (self.is_quantized or self.base_dtype in
-        (bool, string, complex64, complex128)):
+    if (self.is_quantized or
+        self.base_dtype in (bool, string, complex64, complex128)):
       raise TypeError("Cannot find maximum value of %s." % self)
 
     # there is no simple way to get the max value of a dtype, we have to check
@@ -266,8 +263,8 @@ class DType(object):
       this `DType`.
     """
     other = as_dtype(other)
-    return self._type_enum in (
-        other.as_datatype_enum, other.base_dtype.as_datatype_enum)
+    return self._type_enum in (other.as_datatype_enum,
+                               other.base_dtype.as_datatype_enum)
 
   def __eq__(self, other):
     """Returns True iff this DType refers to the same type as `other`."""
@@ -307,19 +304,22 @@ class DType(object):
       return 1
     return np.dtype(self.as_numpy_dtype).itemsize
 
+
 # Define data type range of numpy dtype
-dtype_range = {np.bool_: (False, True),
-               np.bool8: (False, True),
-               np.uint8: (0, 255),
-               np.uint16: (0, 65535),
-               np.int8: (-128, 127),
-               np.int16: (-32768, 32767),
-               np.int64: (-2**63, 2**63 - 1),
-               np.uint64: (0, 2**64 - 1),
-               np.int32: (-2**31, 2**31 - 1),
-               np.uint32: (0, 2**32 - 1),
-               np.float32: (-1, 1),
-               np.float64: (-1, 1)}
+dtype_range = {
+    np.bool_: (False, True),
+    np.bool8: (False, True),
+    np.uint8: (0, 255),
+    np.uint16: (0, 65535),
+    np.int8: (-128, 127),
+    np.int16: (-32768, 32767),
+    np.int64: (-2**63, 2**63 - 1),
+    np.uint64: (0, 2**64 - 1),
+    np.int32: (-2**31, 2**31 - 1),
+    np.uint32: (0, 2**32 - 1),
+    np.float32: (-1, 1),
+    np.float64: (-1, 1)
+}
 
 # Define standard wrappers for the types_pb2.DataType enum.
 resource = DType(types_pb2.DT_RESOURCE)
@@ -396,7 +396,6 @@ quint16_ref = DType(types_pb2.DT_QUINT16_REF)
 qint32_ref = DType(types_pb2.DT_QINT32_REF)
 bfloat16_ref = DType(types_pb2.DT_BFLOAT16_REF)
 
-
 # Maintain an intern table so that we don't have to create a large
 # number of small objects.
 _INTERN_TABLE = {
@@ -448,7 +447,6 @@ _INTERN_TABLE = {
     types_pb2.DT_VARIANT_REF: variant_ref,
 }
 
-
 # Standard mappings between types_pb2.DataType values and string names.
 _TYPE_TO_STRING = {
     types_pb2.DT_HALF: "float16",
@@ -498,8 +496,10 @@ _TYPE_TO_STRING = {
     types_pb2.DT_RESOURCE_REF: "resource_ref",
     types_pb2.DT_VARIANT_REF: "variant_ref",
 }
-_STRING_TO_TF = {value: _INTERN_TABLE[key]
-                 for key, value in _TYPE_TO_STRING.items()}
+_STRING_TO_TF = {
+    value: _INTERN_TABLE[key]
+    for key, value in _TYPE_TO_STRING.items()
+}
 # Add non-canonical aliases.
 _STRING_TO_TF["half"] = float16
 _STRING_TO_TF["half_ref"] = float16_ref
@@ -508,7 +508,6 @@ _STRING_TO_TF["float_ref"] = float32_ref
 _STRING_TO_TF["double"] = float64
 _STRING_TO_TF["double_ref"] = float64_ref
 
-
 # Numpy representation for quantized dtypes.
 #
 # These are magic strings that are used in the swig wrapper to identify
@@ -551,58 +550,100 @@ _NP_TO_TF = frozenset([
     (_np_bfloat16, bfloat16),
 ])
 _TF_TO_NP = {
-    types_pb2.DT_HALF: np.float16,
-    types_pb2.DT_FLOAT: np.float32,
-    types_pb2.DT_DOUBLE: np.float64,
-    types_pb2.DT_INT32: np.int32,
-    types_pb2.DT_UINT8: np.uint8,
-    types_pb2.DT_UINT16: np.uint16,
-    types_pb2.DT_UINT32: np.uint32,
-    types_pb2.DT_UINT64: np.uint64,
-    types_pb2.DT_INT16: np.int16,
-    types_pb2.DT_INT8: np.int8,
+    types_pb2.DT_HALF:
+        np.float16,
+    types_pb2.DT_FLOAT:
+        np.float32,
+    types_pb2.DT_DOUBLE:
+        np.float64,
+    types_pb2.DT_INT32:
+        np.int32,
+    types_pb2.DT_UINT8:
+        np.uint8,
+    types_pb2.DT_UINT16:
+        np.uint16,
+    types_pb2.DT_UINT32:
+        np.uint32,
+    types_pb2.DT_UINT64:
+        np.uint64,
+    types_pb2.DT_INT16:
+        np.int16,
+    types_pb2.DT_INT8:
+        np.int8,
     # NOTE(touts): For strings we use np.object as it supports variable length
     # strings.
-    types_pb2.DT_STRING: np.object,
-    types_pb2.DT_COMPLEX64: np.complex64,
-    types_pb2.DT_COMPLEX128: np.complex128,
-    types_pb2.DT_INT64: np.int64,
-    types_pb2.DT_BOOL: np.bool,
-    types_pb2.DT_QINT8: _np_qint8,
-    types_pb2.DT_QUINT8: _np_quint8,
-    types_pb2.DT_QINT16: _np_qint16,
-    types_pb2.DT_QUINT16: _np_quint16,
-    types_pb2.DT_QINT32: _np_qint32,
-    types_pb2.DT_BFLOAT16: _np_bfloat16,
+    types_pb2.DT_STRING:
+        np.object,
+    types_pb2.DT_COMPLEX64:
+        np.complex64,
+    types_pb2.DT_COMPLEX128:
+        np.complex128,
+    types_pb2.DT_INT64:
+        np.int64,
+    types_pb2.DT_BOOL:
+        np.bool,
+    types_pb2.DT_QINT8:
+        _np_qint8,
+    types_pb2.DT_QUINT8:
+        _np_quint8,
+    types_pb2.DT_QINT16:
+        _np_qint16,
+    types_pb2.DT_QUINT16:
+        _np_quint16,
+    types_pb2.DT_QINT32:
+        _np_qint32,
+    types_pb2.DT_BFLOAT16:
+        _np_bfloat16,
 
     # Ref types
-    types_pb2.DT_HALF_REF: np.float16,
-    types_pb2.DT_FLOAT_REF: np.float32,
-    types_pb2.DT_DOUBLE_REF: np.float64,
-    types_pb2.DT_INT32_REF: np.int32,
-    types_pb2.DT_UINT32_REF: np.uint32,
-    types_pb2.DT_UINT8_REF: np.uint8,
-    types_pb2.DT_UINT16_REF: np.uint16,
-    types_pb2.DT_INT16_REF: np.int16,
-    types_pb2.DT_INT8_REF: np.int8,
-    types_pb2.DT_STRING_REF: np.object,
-    types_pb2.DT_COMPLEX64_REF: np.complex64,
-    types_pb2.DT_COMPLEX128_REF: np.complex128,
-    types_pb2.DT_INT64_REF: np.int64,
-    types_pb2.DT_UINT64_REF: np.uint64,
-    types_pb2.DT_BOOL_REF: np.bool,
-    types_pb2.DT_QINT8_REF: _np_qint8,
-    types_pb2.DT_QUINT8_REF: _np_quint8,
-    types_pb2.DT_QINT16_REF: _np_qint16,
-    types_pb2.DT_QUINT16_REF: _np_quint16,
-    types_pb2.DT_QINT32_REF: _np_qint32,
-    types_pb2.DT_BFLOAT16_REF: _np_bfloat16,
+    types_pb2.DT_HALF_REF:
+        np.float16,
+    types_pb2.DT_FLOAT_REF:
+        np.float32,
+    types_pb2.DT_DOUBLE_REF:
+        np.float64,
+    types_pb2.DT_INT32_REF:
+        np.int32,
+    types_pb2.DT_UINT32_REF:
+        np.uint32,
+    types_pb2.DT_UINT8_REF:
+        np.uint8,
+    types_pb2.DT_UINT16_REF:
+        np.uint16,
+    types_pb2.DT_INT16_REF:
+        np.int16,
+    types_pb2.DT_INT8_REF:
+        np.int8,
+    types_pb2.DT_STRING_REF:
+        np.object,
+    types_pb2.DT_COMPLEX64_REF:
+        np.complex64,
+    types_pb2.DT_COMPLEX128_REF:
+        np.complex128,
+    types_pb2.DT_INT64_REF:
+        np.int64,
+    types_pb2.DT_UINT64_REF:
+        np.uint64,
+    types_pb2.DT_BOOL_REF:
+        np.bool,
+    types_pb2.DT_QINT8_REF:
+        _np_qint8,
+    types_pb2.DT_QUINT8_REF:
+        _np_quint8,
+    types_pb2.DT_QINT16_REF:
+        _np_qint16,
+    types_pb2.DT_QUINT16_REF:
+        _np_quint16,
+    types_pb2.DT_QINT32_REF:
+        _np_qint32,
+    types_pb2.DT_BFLOAT16_REF:
+        _np_bfloat16,
 }
 
-
-QUANTIZED_DTYPES = frozenset(
-    [qint8, quint8, qint16, quint16, qint32, qint8_ref, quint8_ref, qint16_ref,
-     quint16_ref, qint32_ref])
+QUANTIZED_DTYPES = frozenset([
+    qint8, quint8, qint16, quint16, qint32, qint8_ref, quint8_ref, qint16_ref,
+    quint16_ref, qint32_ref
+])
 tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES")
 
 
@@ -613,7 +654,8 @@ def as_dtype(type_value):
   Args:
     type_value: A value that can be converted to a `tf.DType`
       object. This may currently be a `tf.DType` object, a
-      [`DataType` enum](https://www.tensorflow.org/code/tensorflow/core/framework/types.proto),
+      [`DataType`
+        enum](https://www.tensorflow.org/code/tensorflow/core/framework/types.proto),
       a string type name, or a `numpy.dtype`.
 
   Returns:
@@ -650,5 +692,4 @@ def as_dtype(type_value):
     except TypeError as e:
       raise TypeError("Cannot convert {} to a dtype. {}".format(type_value, e))
 
-  raise TypeError(
-      "Cannot convert value %r to a TensorFlow DType." % type_value)
+  raise TypeError("Cannot convert value %r to a TensorFlow DType." % type_value)
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 00fff8d040..c26644362c 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A utility function for importing TensorFlow graphs."""
 from __future__ import absolute_import
 from __future__ import division
@@ -43,8 +42,8 @@ from tensorflow.python.util.tf_export import tf_export
 # the logic here.
 def _GetNodeAttr(node_def, attr_name):
   if attr_name not in node_def.attr:
-    raise ValueError('Expected one attr with name %r in %s.'
-                     % (attr_name, str(node_def)))
+    raise ValueError('Expected one attr with name %r in %s.' % (attr_name,
+                                                                str(node_def)))
   return node_def.attr[attr_name]
 
 
@@ -170,9 +169,8 @@ def _ProcessInputMapParam(input_map):
   if input_map is None:
     input_map = {}
   else:
-    if not (isinstance(input_map, dict)
-            and all(isinstance(k, compat.bytes_or_text_types)
-                    for k in input_map.keys())):
+    if not (isinstance(input_map, dict) and all(
+        isinstance(k, compat.bytes_or_text_types) for k in input_map.keys())):
       raise TypeError('input_map must be a dictionary mapping strings to '
                       'Tensor objects.')
   return input_map
@@ -180,9 +178,10 @@ def _ProcessInputMapParam(input_map):
 
 def _ProcessReturnElementsParam(return_elements):
   """Type-checks and possibly canonicalizes `return_elements`."""
-  if return_elements is None: return None
-  if not all(isinstance(x, compat.bytes_or_text_types)
-             for x in return_elements):
+  if return_elements is None:
+    return None
+  if not all(
+      isinstance(x, compat.bytes_or_text_types) for x in return_elements):
     raise TypeError('return_elements must be a list of strings.')
   return tuple(compat.as_str(x) for x in return_elements)
 
@@ -262,14 +261,14 @@ def _PopulateTFImportGraphDefOptions(options, prefix, input_map,
     if input_src.startswith('^'):
       src_name = compat.as_bytes(input_src[1:])
       dst_op = input_dst._as_tf_output().oper  # pylint: disable=protected-access
-      c_api.TF_ImportGraphDefOptionsRemapControlDependency(options, src_name,
-                                                           dst_op)
+      c_api.TF_ImportGraphDefOptionsRemapControlDependency(
+          options, src_name, dst_op)
     else:
       src_name, src_idx = _ParseTensorName(input_src)
       src_name = compat.as_str(src_name)
       dst_output = input_dst._as_tf_output()  # pylint: disable=protected-access
-      c_api.TF_ImportGraphDefOptionsAddInputMapping(options, src_name,
-                                                    src_idx, dst_output)
+      c_api.TF_ImportGraphDefOptionsAddInputMapping(options, src_name, src_idx,
+                                                    dst_output)
   for name in return_elements or []:
     if ':' in name:
       op_name, index = _ParseTensorName(name)
@@ -315,8 +314,8 @@ def _ProcessNewOps(graph):
         coloc_op = graph._get_operation_by_name_unsafe(coloc_op_name)  # pylint: disable=protected-access
       except KeyError:
         raise ValueError('Specified colocation to an op that '
-                         'does not exist during import: %s in %s' % (
-                             coloc_op_name, op.name))
+                         'does not exist during import: %s in %s' %
+                         (coloc_op_name, op.name))
       if coloc_op.device:
         coloc_device = pydev.DeviceSpec.from_string(coloc_op.device)
         break
@@ -373,10 +372,13 @@ def _GatherReturnElements(requested_return_elements, graph, results):
 @tf_export('import_graph_def')
 @deprecated_args(None, 'Please file an issue at '
                  'https://github.com/tensorflow/tensorflow/issues if you depend'
-                 ' on this feature.',
-                 'op_dict')
-def import_graph_def(graph_def, input_map=None, return_elements=None,
-                     name=None, op_dict=None, producer_op_list=None):
+                 ' on this feature.', 'op_dict')
+def import_graph_def(graph_def,
+                     input_map=None,
+                     return_elements=None,
+                     name=None,
+                     op_dict=None,
+                     producer_op_list=None):
   """Imports the graph from `graph_def` into the current default `Graph`.
 
   This function provides a way to import a serialized TensorFlow
@@ -480,11 +482,12 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
         c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
             results))
     if missing_unused_input_keys:
-      missing_unused_input_keys = [compat.as_str(s)
-                                   for s in missing_unused_input_keys]
+      missing_unused_input_keys = [
+          compat.as_str(s) for s in missing_unused_input_keys
+      ]
       raise ValueError(
-          'Attempted to map inputs that were not found in graph_def: [%s]'
-          % ', '.join(missing_unused_input_keys))
+          'Attempted to map inputs that were not found in graph_def: [%s]' %
+          ', '.join(missing_unused_input_keys))
 
     if return_elements is None:
       return None
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index d2b8e80305..0e5f696111 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Utilities to create TensorProtos."""
 from __future__ import absolute_import
 from __future__ import division
@@ -39,6 +38,7 @@ except ImportError:
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.util.tf_export import tf_export
+
 # pylint: enable=g-import-not-at-top
 
 
@@ -47,8 +47,8 @@ def ExtractBitsFromFloat16(x):
 
 
 def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
-  tensor_proto.half_val.extend([
-      ExtractBitsFromFloat16(x) for x in proto_values])
+  tensor_proto.half_val.extend(
+      [ExtractBitsFromFloat16(x) for x in proto_values])
 
 
 def ExtractBitsFromBFloat16(x):
@@ -57,31 +57,47 @@ def ExtractBitsFromBFloat16(x):
 
 
 def SlowAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values):
-  tensor_proto.half_val.extend([
-      ExtractBitsFromBFloat16(x) for x in proto_values])
+  tensor_proto.half_val.extend(
+      [ExtractBitsFromBFloat16(x) for x in proto_values])
 
 
 if _FAST_TENSOR_UTIL_AVAILABLE:
   _NP_TO_APPEND_FN = {
-      dtypes.bfloat16.as_numpy_dtype: SlowAppendBFloat16ArrayToTensorProto,
+      dtypes.bfloat16.as_numpy_dtype:
+          SlowAppendBFloat16ArrayToTensorProto,
       # TODO(sesse): We should have a
       # fast_tensor_util.AppendFloat16ArrayToTensorProto,
       # but it seems np.float16_t doesn't exist?
-      np.float16: SlowAppendFloat16ArrayToTensorProto,
-      np.float32: fast_tensor_util.AppendFloat32ArrayToTensorProto,
-      np.float64: fast_tensor_util.AppendFloat64ArrayToTensorProto,
-      np.int32: fast_tensor_util.AppendInt32ArrayToTensorProto,
-      np.int64: fast_tensor_util.AppendInt64ArrayToTensorProto,
-      np.uint8: fast_tensor_util.AppendUInt8ArrayToTensorProto,
-      np.uint16: fast_tensor_util.AppendUInt16ArrayToTensorProto,
-      np.uint32: fast_tensor_util.AppendUInt32ArrayToTensorProto,
-      np.uint64: fast_tensor_util.AppendUInt64ArrayToTensorProto,
-      np.int8: fast_tensor_util.AppendInt8ArrayToTensorProto,
-      np.int16: fast_tensor_util.AppendInt16ArrayToTensorProto,
-      np.complex64: fast_tensor_util.AppendComplex64ArrayToTensorProto,
-      np.complex128: fast_tensor_util.AppendComplex128ArrayToTensorProto,
-      np.object: fast_tensor_util.AppendObjectArrayToTensorProto,
-      np.bool: fast_tensor_util.AppendBoolArrayToTensorProto,
+      np.float16:
+          SlowAppendFloat16ArrayToTensorProto,
+      np.float32:
+          fast_tensor_util.AppendFloat32ArrayToTensorProto,
+      np.float64:
+          fast_tensor_util.AppendFloat64ArrayToTensorProto,
+      np.int32:
+          fast_tensor_util.AppendInt32ArrayToTensorProto,
+      np.int64:
+          fast_tensor_util.AppendInt64ArrayToTensorProto,
+      np.uint8:
+          fast_tensor_util.AppendUInt8ArrayToTensorProto,
+      np.uint16:
+          fast_tensor_util.AppendUInt16ArrayToTensorProto,
+      np.uint32:
+          fast_tensor_util.AppendUInt32ArrayToTensorProto,
+      np.uint64:
+          fast_tensor_util.AppendUInt64ArrayToTensorProto,
+      np.int8:
+          fast_tensor_util.AppendInt8ArrayToTensorProto,
+      np.int16:
+          fast_tensor_util.AppendInt16ArrayToTensorProto,
+      np.complex64:
+          fast_tensor_util.AppendComplex64ArrayToTensorProto,
+      np.complex128:
+          fast_tensor_util.AppendComplex128ArrayToTensorProto,
+      np.object:
+          fast_tensor_util.AppendObjectArrayToTensorProto,
+      np.bool:
+          fast_tensor_util.AppendBoolArrayToTensorProto,
       dtypes.qint8.as_numpy_dtype:
           fast_tensor_util.AppendInt8ArrayToTensorProto,
       dtypes.quint8.as_numpy_dtype:
@@ -118,14 +134,12 @@ else:
     tensor_proto.uint64_val.extend([np.asscalar(x) for x in proto_values])
 
   def SlowAppendComplex64ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.scomplex_val.extend([np.asscalar(v)
-                                      for x in proto_values
-                                      for v in [x.real, x.imag]])
+    tensor_proto.scomplex_val.extend(
+        [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]])
 
   def SlowAppendComplex128ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.dcomplex_val.extend([np.asscalar(v)
-                                      for x in proto_values
-                                      for v in [x.real, x.imag]])
+    tensor_proto.dcomplex_val.extend(
+        [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]])
 
   def SlowAppendObjectArrayToTensorProto(tensor_proto, proto_values):
     tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values])
@@ -252,15 +266,16 @@ def _FilterTuple(v):
       return None
   if isinstance(v, list):
     if not any(isinstance(x, (list, tuple)) for x in v):
-      return _FirstNotNone([None if isinstance(x, (list, tuple)) else x for x in v])
+      return _FirstNotNone(
+          [None if isinstance(x, (list, tuple)) else x for x in v])
   return _FirstNotNone([_FilterTuple(x) for x in v])
 
 
 def _FilterInt(v):
   if isinstance(v, (list, tuple)):
     return _FirstNotNone([_FilterInt(x) for x in v])
-  return None if isinstance(v, (compat.integral_types,
-                                tensor_shape.Dimension)) else _NotNone(v)
+  return None if isinstance(
+      v, (compat.integral_types, tensor_shape.Dimension)) else _NotNone(v)
 
 
 def _FilterFloat(v):
@@ -380,8 +395,11 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
   if dtype:
     dtype = dtypes.as_dtype(dtype)
 
-  is_quantized = (dtype in [dtypes.qint8, dtypes.quint8, dtypes.qint16,
-                            dtypes.quint16, dtypes.qint32])
+  is_quantized = (
+      dtype in [
+          dtypes.qint8, dtypes.quint8, dtypes.qint16, dtypes.quint16,
+          dtypes.qint32
+      ])
 
   # We first convert value to a numpy array or scalar.
   if isinstance(values, (np.ndarray, np.generic)):
@@ -419,9 +437,9 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
       if (list(nparray.shape) != _GetDenseDimensions(values) and
           not is_quantized):
         raise ValueError("""Argument must be a dense tensor: %s"""
-                         """ - got shape %s, but wanted %s.""" % (
-                             values, list(nparray.shape),
-                             _GetDenseDimensions(values)))
+                         """ - got shape %s, but wanted %s.""" %
+                         (values, list(nparray.shape),
+                          _GetDenseDimensions(values)))
 
     # python/numpy default float type is float64. We prefer float32 instead.
     if (nparray.dtype == np.float64) and dtype is None:
@@ -446,8 +464,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
 
   if dtype is not None and (not hasattr(dtype, "base_dtype") or
                             dtype.base_dtype != numpy_dtype.base_dtype):
-    raise TypeError("Incompatible types: %s vs. %s. Value is %s"
-                    % (dtype, nparray.dtype, values))
+    raise TypeError("Incompatible types: %s vs. %s. Value is %s" %
+                    (dtype, nparray.dtype, values))
 
   # If shape is not given, get the shape from the numpy array.
   if shape is None:
@@ -510,8 +528,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
 
   append_fn = GetNumpyAppendFn(proto_values.dtype)
   if append_fn is None:
-    raise TypeError("Element type not supported in TensorProto: %s" %
-                    numpy_dtype.name)
+    raise TypeError(
+        "Element type not supported in TensorProto: %s" % numpy_dtype.name)
   append_fn(tensor_proto, proto_values)
 
   return tensor_proto
@@ -553,19 +571,23 @@ def MakeNdarray(tensor):
       return tmp.reshape(shape)
   elif tensor_dtype == dtypes.float32:
     if len(tensor.float_val) == 1:
-      return np.repeat(np.array(tensor.float_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.float_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
       return np.fromiter(tensor.float_val, dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.float64:
     if len(tensor.double_val) == 1:
-      return np.repeat(np.array(tensor.double_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.double_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
       return np.fromiter(tensor.double_val, dtype=dtype).reshape(shape)
-  elif tensor_dtype in [dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16,
-                        dtypes.int8, dtypes.qint32, dtypes.quint8, dtypes.qint8,
-                        dtypes.qint16, dtypes.quint16, dtypes.bfloat16]:
+  elif tensor_dtype in [
+      dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16, dtypes.int8,
+      dtypes.qint32, dtypes.quint8, dtypes.qint8, dtypes.qint16, dtypes.quint16,
+      dtypes.bfloat16
+  ]:
     if len(tensor.int_val) == 1:
       return np.repeat(np.array(tensor.int_val[0], dtype=dtype),
                        num_elements).reshape(shape)
@@ -573,35 +595,41 @@ def MakeNdarray(tensor):
       return np.fromiter(tensor.int_val, dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.int64:
     if len(tensor.int64_val) == 1:
-      return np.repeat(np.array(tensor.int64_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.int64_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
       return np.fromiter(tensor.int64_val, dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.string:
     if len(tensor.string_val) == 1:
-      return np.repeat(np.array(tensor.string_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.string_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
-      return np.array([x for x in tensor.string_val],
-                      dtype=dtype).reshape(shape)
+      return np.array(
+          [x for x in tensor.string_val], dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.complex64:
     it = iter(tensor.scomplex_val)
     if len(tensor.scomplex_val) == 2:
-      return np.repeat(np.array(complex(tensor.scomplex_val[0],
-                                        tensor.scomplex_val[1]), dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(
+              complex(tensor.scomplex_val[0], tensor.scomplex_val[1]),
+              dtype=dtype), num_elements).reshape(shape)
     else:
-      return np.array([complex(x[0], x[1]) for x in zip(it, it)],
-                      dtype=dtype).reshape(shape)
+      return np.array(
+          [complex(x[0], x[1]) for x in zip(it, it)],
+          dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.complex128:
     it = iter(tensor.dcomplex_val)
     if len(tensor.dcomplex_val) == 2:
-      return np.repeat(np.array(complex(tensor.dcomplex_val[0],
-                                        tensor.dcomplex_val[1]), dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(
+              complex(tensor.dcomplex_val[0], tensor.dcomplex_val[1]),
+              dtype=dtype), num_elements).reshape(shape)
     else:
-      return np.array([complex(x[0], x[1]) for x in zip(it, it)],
-                      dtype=dtype).reshape(shape)
+      return np.array(
+          [complex(x[0], x[1]) for x in zip(it, it)],
+          dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.bool:
     if len(tensor.bool_val) == 1:
       return np.repeat(np.array(tensor.bool_val[0], dtype=dtype),
@@ -645,8 +673,9 @@ def _ConstantValue(tensor, partial):
   elif tensor.op.type == "Shape":
     input_shape = tensor.op.inputs[0].get_shape()
     if input_shape.is_fully_defined():
-      return np.array([dim.value for dim in input_shape.dims],
-                      dtype=tensor.dtype.as_numpy_dtype)
+      return np.array(
+          [dim.value for dim in input_shape.dims],
+          dtype=tensor.dtype.as_numpy_dtype)
     else:
       return None
   elif tensor.op.type == "Size":
@@ -658,8 +687,10 @@ def _ConstantValue(tensor, partial):
   elif tensor.op.type == "Rank":
     input_shape = tensor.op.inputs[0].get_shape()
     if input_shape.ndims is not None:
-      return np.ndarray(shape=(), buffer=np.array([input_shape.ndims], dtype=np.int32),
-                        dtype=np.int32)
+      return np.ndarray(
+          shape=(),
+          buffer=np.array([input_shape.ndims], dtype=np.int32),
+          dtype=np.int32)
     else:
       return None
   elif tensor.op.type == "Range":
@@ -861,8 +892,8 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
         new_axis_mask = tensor.op.get_attr("new_axis_mask")
         shrink_axis_mask = tensor.op.get_attr("shrink_axis_mask")
         valid_attributes = (not ellipsis_mask and not new_axis_mask and
-                            not shrink_axis_mask and
-                            (not begin_mask or (begin_mask == 1)) and
+                            not shrink_axis_mask and (not begin_mask or
+                                                      (begin_mask == 1)) and
                             (not end_mask or (end_mask == 1)))
         if valid_attributes:  # additional inputs not supported
           prev = constant_value_as_shape(tensor.op.inputs[0])
@@ -878,8 +909,8 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   ret = tensor_shape.unknown_shape(shape[0].value)
   value = constant_value(tensor)
   if value is not None:
-    ret = ret.merge_with(tensor_shape.TensorShape(
-        [d if d >= 0 else None for d in value]))
+    ret = ret.merge_with(
+        tensor_shape.TensorShape([d if d >= 0 else None for d in value]))
   return ret
 
 
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 6a7e1d0c89..4a8aa2e258 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -123,11 +123,11 @@ def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
     TypeError: If either argument is not a `GraphDef`.
   """
   if not isinstance(actual, graph_pb2.GraphDef):
-    raise TypeError("Expected tf.GraphDef for actual, got %s" %
-                    type(actual).__name__)
+    raise TypeError(
+        "Expected tf.GraphDef for actual, got %s" % type(actual).__name__)
   if not isinstance(expected, graph_pb2.GraphDef):
-    raise TypeError("Expected tf.GraphDef for expected, got %s" %
-                    type(expected).__name__)
+    raise TypeError(
+        "Expected tf.GraphDef for expected, got %s" % type(expected).__name__)
 
   if checkpoint_v2:
     _strip_checkpoint_v2_randomized(actual)
@@ -152,11 +152,10 @@ def assert_meta_graph_protos_equal(tester, a, b):
       a_proto = proto_type()
       b_proto = proto_type()
       # Number of entries in the collections is the same
-      tester.assertEqual(len(a_value.bytes_list.value),
-                         len(b_value.bytes_list.value))
-      for (a_value_item, b_value_item) in zip(
-          a_value.bytes_list.value,
-          b_value.bytes_list.value):
+      tester.assertEqual(
+          len(a_value.bytes_list.value), len(b_value.bytes_list.value))
+      for (a_value_item, b_value_item) in zip(a_value.bytes_list.value,
+                                              b_value.bytes_list.value):
         a_proto.ParseFromString(a_value_item)
         b_proto.ParseFromString(b_value_item)
         tester.assertProtoEquals(a_proto, b_proto)
@@ -220,10 +219,7 @@ def NHWCToNCHW(input_tensor):
     converted tensor or shape array
   """
   # tensor dim -> new axis order
-  new_axes = {
-      4: [0, 3, 1, 2],
-      5: [0, 4, 1, 2, 3]
-  }
+  new_axes = {4: [0, 3, 1, 2], 5: [0, 4, 1, 2, 3]}
   if isinstance(input_tensor, ops.Tensor):
     ndims = input_tensor.shape.ndims
     return array_ops.transpose(input_tensor, new_axes[ndims])
@@ -250,8 +246,9 @@ def NHWCToNCHW_VECT_C(input_shape_or_tensor):
   """
   permutations = {5: [0, 3, 1, 2, 4], 6: [0, 4, 1, 2, 3, 5]}
   is_tensor = isinstance(input_shape_or_tensor, ops.Tensor)
-  temp_shape = (input_shape_or_tensor.shape.as_list()
-                if is_tensor else input_shape_or_tensor)
+  temp_shape = (
+      input_shape_or_tensor.shape.as_list()
+      if is_tensor else input_shape_or_tensor)
   if temp_shape[-1] % 4 != 0:
     raise ValueError(
         "Last dimension of input must be evenly divisible by 4 to convert to "
@@ -283,8 +280,9 @@ def NCHW_VECT_CToNHWC(input_shape_or_tensor):
   """
   permutations = {5: [0, 2, 3, 1, 4], 6: [0, 2, 3, 4, 1, 5]}
   is_tensor = isinstance(input_shape_or_tensor, ops.Tensor)
-  input_shape = (input_shape_or_tensor.shape.as_list()
-                 if is_tensor else input_shape_or_tensor)
+  input_shape = (
+      input_shape_or_tensor.shape.as_list()
+      if is_tensor else input_shape_or_tensor)
   if input_shape[-1] != 4:
     raise ValueError("Last dimension of NCHW_VECT_C must be 4.")
   permutation = permutations[len(input_shape)]
@@ -307,10 +305,7 @@ def NCHWToNHWC(input_tensor):
     converted tensor or shape array
   """
   # tensor dim -> new axis order
-  new_axes = {
-      4: [0, 2, 3, 1],
-      5: [0, 2, 3, 4, 1]
-  }
+  new_axes = {4: [0, 2, 3, 1], 5: [0, 2, 3, 4, 1]}
   if isinstance(input_tensor, ops.Tensor):
     ndims = input_tensor.shape.ndims
     return array_ops.transpose(input_tensor, new_axes[ndims])
@@ -329,6 +324,8 @@ def _use_c_api_wrapper(fn, use_c_api, *args, **kwargs):
       fn(*args, **kwargs)
   finally:
     ops._USE_C_API = prev_value
+
+
 # pylint: disable=protected-access
 
 
@@ -345,7 +342,9 @@ def skip_if(condition):
   Returns:
     The wrapped function
   """
+
   def real_skip_if(fn):
+
     def wrapper(*args, **kwargs):
       if callable(condition):
         skip = condition()
@@ -353,7 +352,9 @@ def skip_if(condition):
         skip = condition
       if not skip:
         fn(*args, **kwargs)
+
     return wrapper
+
   return real_skip_if
 
 
@@ -370,8 +371,10 @@ def disable_c_api(fn):
   Returns:
     The wrapped function
   """
+
   def wrapper(*args, **kwargs):
     _use_c_api_wrapper(fn, False, *args, **kwargs)
+
   return wrapper
 
 
@@ -388,8 +391,10 @@ def enable_c_api(fn):
   Returns:
     The wrapped function
   """
+
   def wrapper(*args, **kwargs):
     _use_c_api_wrapper(fn, True, *args, **kwargs)
+
   return wrapper
 
 
@@ -561,13 +566,17 @@ def assert_no_garbage_created(f):
     # not hold on to every object in other tests.
     gc.set_debug(previous_debug_flags)
     gc.enable()
+
   return decorator
 
 
-def run_in_graph_and_eager_modes(
-    __unused__=None, graph=None, config=None,
-    use_gpu=False, force_gpu=False,
-    reset_test=True, assert_no_eager_garbage=False):
+def run_in_graph_and_eager_modes(__unused__=None,
+                                 graph=None,
+                                 config=None,
+                                 use_gpu=False,
+                                 force_gpu=False,
+                                 reset_test=True,
+                                 assert_no_eager_garbage=False):
   """Runs the test in both graph and eager modes.
 
   Args:
@@ -596,6 +605,7 @@ def run_in_graph_and_eager_modes(
 
   def decorator(f):
     """Test method decorator."""
+
     def decorated(self, **kwargs):
       """Decorated the test method."""
       with context.graph_mode():
@@ -631,6 +641,7 @@ def run_in_graph_and_eager_modes(
           run_eager_mode(self, **kwargs)
 
     return decorated
+
   return decorator
 
 
@@ -767,8 +778,10 @@ class TensorFlowTestCase(googletest.TestCase):
       self._AssertProtoEquals(expected_message, message)
     elif isinstance(expected_message_maybe_ascii, str):
       expected_message = type(message)()
-      text_format.Merge(expected_message_maybe_ascii, expected_message,
-                        descriptor_pool=descriptor_pool.Default())
+      text_format.Merge(
+          expected_message_maybe_ascii,
+          expected_message,
+          descriptor_pool=descriptor_pool.Default())
       self._AssertProtoEquals(expected_message, message)
     else:
       assert False, ("Can't compare protos of type %s and %s" %
@@ -852,7 +865,8 @@ class TensorFlowTestCase(googletest.TestCase):
     trigger the creation of a new session.
 
     Use the `use_gpu` and `force_gpu` options to control where ops are run. If
-    `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if `use_gpu`
+    `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if
+    `use_gpu`
     is True, TensorFlow tries to run as many ops on the GPU as possible. If both
     `force_gpu and `use_gpu` are False, all ops are pinned to the CPU.
 
@@ -1051,6 +1065,7 @@ class TensorFlowTestCase(googletest.TestCase):
     self._threads.append(ret)
     return ret
 
+
 # pylint: enable=invalid-name
 
   def assertNear(self, f1, f2, err, msg=None):
@@ -1118,7 +1133,8 @@ class TensorFlowTestCase(googletest.TestCase):
       # the absolute difference between a and b.  Here, we want to
       # print out which elements violate such conditions.
       cond = np.logical_or(
-          np.abs(a - b) > atol + rtol * np.abs(b), np.isnan(a) != np.isnan(b))
+          np.abs(a - b) > atol + rtol * np.abs(b),
+          np.isnan(a) != np.isnan(b))
       if a.ndim:
         x = a[np.where(cond)]
         y = b[np.where(cond)]
@@ -1380,8 +1396,11 @@ class TensorFlowTestCase(googletest.TestCase):
 
 
 @tf_export("test.create_local_cluster")
-def create_local_cluster(num_workers, num_ps, protocol="grpc",
-                         worker_config=None, ps_config=None):
+def create_local_cluster(num_workers,
+                         num_ps,
+                         protocol="grpc",
+                         worker_config=None,
+                         ps_config=None):
   """Create and start local servers and return the associated `Server` objects.
 
   Example:
@@ -1431,15 +1450,21 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc",
 
   workers = [
       server_lib.Server(
-          cs, job_name="worker", protocol=protocol, task_index=ix,
-          config=worker_config, start=True)
-      for ix in range(num_workers)
+          cs,
+          job_name="worker",
+          protocol=protocol,
+          task_index=ix,
+          config=worker_config,
+          start=True) for ix in range(num_workers)
   ]
   ps_servers = [
       server_lib.Server(
-          cs, job_name="ps", protocol=protocol, task_index=ix,
-          config=ps_config, start=True)
-      for ix in range(num_ps)
+          cs,
+          job_name="ps",
+          protocol=protocol,
+          task_index=ix,
+          config=ps_config,
+          start=True) for ix in range(num_ps)
   ]
 
   return workers, ps_servers
diff --git a/tensorflow/python/kernel_tests/atrous_convolution_test.py b/tensorflow/python/kernel_tests/atrous_convolution_test.py
index 04248fb2ba..2d1b3d9b7e 100644
--- a/tensorflow/python/kernel_tests/atrous_convolution_test.py
+++ b/tensorflow/python/kernel_tests/atrous_convolution_test.py
@@ -81,6 +81,7 @@ class AtrousConvolutionTest(test.TestCase):
       otherwise, it's delayed after the context.
     """
     checks = []
+
     def add_check(check, *args, **kwargs):
       if context.in_eager_mode():
         args_val, kwargs_val = self.evaluate([args, kwargs])
@@ -96,12 +97,12 @@ class AtrousConvolutionTest(test.TestCase):
 
   def _test_atrous_convolution(self, add_check, input_shape, filter_shape,
                                dilation_rate, **kwargs):
-    filters = np.arange(np.prod(filter_shape),
-                        dtype=np.float32).reshape(filter_shape)
+    filters = np.arange(
+        np.prod(filter_shape), dtype=np.float32).reshape(filter_shape)
     filters_upsampled = upsample_filters(filters, dilation_rate)
     x = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape)
-    y1 = nn_ops.convolution(input=x, filter=filters,
-                            dilation_rate=dilation_rate, **kwargs)
+    y1 = nn_ops.convolution(
+        input=x, filter=filters, dilation_rate=dilation_rate, **kwargs)
     y2 = nn_ops.convolution(input=x, filter=filters_upsampled, **kwargs)
 
     def check(y1_eval, y2_eval):
@@ -112,13 +113,15 @@ class AtrousConvolutionTest(test.TestCase):
   def test_unknown_spatial_dims_for_channel_last_format(self):
     x = array_ops.placeholder(dtypes.float32, [1, None, None, 10])
     w = array_ops.zeros([3, 3, 10, 20])
-    y = nn_ops.convolution(x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC")
+    y = nn_ops.convolution(
+        x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC")
     self.assertEqual(y.shape.as_list(), [1, None, None, 20])
 
   def test_unknown_spatial_dims_for_channel_first_format(self):
     x = array_ops.placeholder(dtypes.float32, [1, 10, None, None])
     w = array_ops.zeros([3, 3, 10, 20])
-    y = nn_ops.convolution(x, w, "VALID", dilation_rate=[2, 2], data_format="NCHW")
+    y = nn_ops.convolution(
+        x, w, "VALID", dilation_rate=[2, 2], data_format="NCHW")
     self.assertEqual(y.shape.as_list(), [1, 20, None, None])
 
   @test_util.run_in_graph_and_eager_modes()
@@ -215,28 +218,35 @@ class AtrousConvolutionTest(test.TestCase):
 
               def combined_op(converted_input, num_spatial_dims, padding_arg):  # pylint: disable=unused-argument
                 # pylint: disable=cell-var-from-loop
-                result = nn_ops.convolution(input=converted_input, filter=f1,
-                                            padding=padding)
-                result = nn_ops.convolution(input=result, filter=f2,
-                                            padding=padding)
+                result = nn_ops.convolution(
+                    input=converted_input, filter=f1, padding=padding)
+                result = nn_ops.convolution(
+                    input=result, filter=f2, padding=padding)
                 # pylint: enable=cell-var-from-loop
                 return result
 
               for rate_height in range(2, 4):
                 for rate_width in range(2, 4):
                   dilation_rate = [rate_height, rate_width]
-                  y1 = nn_ops.convolution(input=x, filter=f1, padding=padding,
-                                          dilation_rate=dilation_rate)
-                  y1 = nn_ops.convolution(input=y1, filter=f2,
-                                          padding=padding,
-                                          dilation_rate=dilation_rate)
+                  y1 = nn_ops.convolution(
+                      input=x,
+                      filter=f1,
+                      padding=padding,
+                      dilation_rate=dilation_rate)
+                  y1 = nn_ops.convolution(
+                      input=y1,
+                      filter=f2,
+                      padding=padding,
+                      dilation_rate=dilation_rate)
                   y2 = nn_ops.with_space_to_batch(
-                      input=x, dilation_rate=dilation_rate, op=combined_op,
+                      input=x,
+                      dilation_rate=dilation_rate,
+                      op=combined_op,
                       padding="VALID")
 
                   def check(y1_eval, y2_eval):
-                    self.assertAllClose(y1_eval, y2_eval, rtol=1e-2,
-                                        atol=1e-2)
+                    self.assertAllClose(y1_eval, y2_eval, rtol=1e-2, atol=1e-2)
+
                   add_check(check, y1, y2)
 
   def _test_gradient(self, x_shape, f_shape, dilation_rate, padding):
diff --git a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
index 88b3f20469..28b3dc45e9 100644
--- a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
+++ b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
@@ -80,7 +80,7 @@ class RangeSamplerOpsTest(test.TestCase):
     with self.test_session():
       true_classes = constant_op.constant(
           [[1, 2], [0, 4], [3, 3]], dtype=dtypes.int64)
-      _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler(
+      _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler(  # pylint: disable=line-too-long
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       sampled_log_expected_count = math_ops.log(sampled_expected_count)
       result = sampled_log_expected_count.eval()
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 6e18ed132c..5d648bb235 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -181,8 +181,8 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(enter_v_constant.shape, [2])
 
       # Otherwise, the shape should be unknown.
-      enter_v_non_constant = control_flow_ops.enter(v, "frame2",
-                                                    is_constant=False)
+      enter_v_non_constant = control_flow_ops.enter(
+          v, "frame2", is_constant=False)
       self.assertEqual(enter_v_non_constant.shape, None)
 
   def testSwitchMergeIndexedSlices(self):
@@ -736,24 +736,21 @@ class ControlFlowTest(test.TestCase):
     with self.test_session():
       s = constant_op.constant([1, 2, 3, 4, 5])
       r = isum(s, maximum_iterations=3)
-      self.assertAllEqual([1+3, 2+3, 3+3, 4+3, 5+3], r.eval())
+      self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], r.eval())
 
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with self.test_session():
       r = control_flow_ops.while_loop(
-          lambda i: i < 3,
-          lambda i: i + 1,
-          [0],
-          maximum_iterations=1)
+          lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1)
       self.assertEqual(1, r.eval())
 
   def testSingleNestedMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
+
     def training_loop_with_gradient(i):
       out = control_flow_ops.while_loop(
           lambda i_, _: i_ < 3,
-          lambda i_, j: [i_ + 1, j * v],
-          [0, 1.0],
+          lambda i_, j: [i_ + 1, j * v], [0, 1.0],
           maximum_iterations=i)
       g = gradients_impl.gradients(out, v)
       with ops.control_dependencies(g):
@@ -763,8 +760,8 @@ class ControlFlowTest(test.TestCase):
     xla_context.Enter()
     # Create training loop, ensure we can call gradient() of
     # while_loop inside the training loop.
-    loop = control_flow_ops.while_loop(
-        lambda i: i < 3, training_loop_with_gradient, [0])
+    loop = control_flow_ops.while_loop(lambda i: i < 3,
+                                       training_loop_with_gradient, [0])
     xla_context.Exit()
 
     loop_execute = array_ops.identity(loop)  # Because loop is not fetchable.
@@ -774,17 +771,18 @@ class ControlFlowTest(test.TestCase):
 
   def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
+
     def inner_body(i, x):
       out = control_flow_ops.while_loop(
           lambda i, _: i < 3,
-          lambda i, j: [i + 1, j * v],
-          [0, x],
+          lambda i, j: [i + 1, j * v], [0, x],
           maximum_iterations=i)
       return out
 
     def create_while_loop(maximum_iterations=None):
       return control_flow_ops.while_loop(
-          lambda i, _: i < 3, inner_body, [0, 1.0],
+          lambda i, _: i < 3,
+          inner_body, [0, 1.0],
           maximum_iterations=maximum_iterations)
 
     loop_no_xla = create_while_loop(maximum_iterations=5)
@@ -819,14 +817,17 @@ class ControlFlowTest(test.TestCase):
 
     def create_while_loop():
       max_iter_holder = []
+
       def create_mi():
         max_iter_holder.append(array_ops.placeholder(dtypes.int32, shape=()))
         return 1.0
-      _ = control_flow_ops.cond(constant_op.constant(True),
-                                create_mi, create_mi)
+
+      _ = control_flow_ops.cond(
+          constant_op.constant(True), create_mi, create_mi)
 
       return control_flow_ops.while_loop(
-          lambda i, _: i < 3, lambda i, x: (i + 1, v * x), (0, 1.0),
+          lambda i, _: i < 3,
+          lambda i, x: (i + 1, v * x), (0, 1.0),
           maximum_iterations=max_iter_holder[0])
 
     xla_context = control_flow_ops.XLAControlFlowContext()
@@ -849,28 +850,32 @@ class ControlFlowTest(test.TestCase):
     p = array_ops.placeholder(dtype=dtypes.int32)
 
     def mid_body_builder(iterations):
+
       def mid_body(i, x):
         r = control_flow_ops.while_loop(
             lambda *_: True,
-            lambda i, x: (i + 1, v * x),
-            (0, x),
-            maximum_iterations=iterations, name="inner")
+            lambda i, x: (i + 1, v * x), (0, x),
+            maximum_iterations=iterations,
+            name="inner")
         return (i + 1, gradients_impl.gradients(x + r[1], v)[0])
+
       return mid_body
 
     def outer_body(i, x):
       iterations = array_ops.size(p, name="iterations")
-      return (
-          i + 1,
-          x + control_flow_ops.while_loop(
-              lambda *_: True, mid_body_builder(iterations), (0, x),
-              maximum_iterations=iterations, name="mid")[1])
+      return (i + 1, x + control_flow_ops.while_loop(
+          lambda *_: True,
+          mid_body_builder(iterations), (0, x),
+          maximum_iterations=iterations,
+          name="mid")[1])
 
     def create_while_loop():
       with ops.device("/cpu:0"):
         r = control_flow_ops.while_loop(
-            lambda *_: True, outer_body, (0, 1.0),
-            maximum_iterations=5, name="outer")
+            lambda *_: True,
+            outer_body, (0, 1.0),
+            maximum_iterations=5,
+            name="outer")
         return array_ops.identity(r[1])
 
     xla_context = control_flow_ops.XLAControlFlowContext()
@@ -881,18 +886,19 @@ class ControlFlowTest(test.TestCase):
     final_without_xla_context = create_while_loop()
 
     with self.test_session(use_gpu=False) as sess:
-      opts = config_pb2.RunOptions(
-          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      opts = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
       run_metadata = config_pb2.RunMetadata()
 
       final_value_without_xla_context = sess.run(
-          final_without_xla_context,
-          feed_dict={p: [0, 0, 0]})
+          final_without_xla_context, feed_dict={
+              p: [0, 0, 0]
+          })
 
       final_value_with_xla_context = sess.run(
           final_with_xla_context,
           feed_dict={p: [0, 0, 0]},
-          options=opts, run_metadata=run_metadata)
+          options=opts,
+          run_metadata=run_metadata)
 
       node_stats = run_metadata.step_stats.dev_stats[0].node_stats
       stack_push_count = len(
@@ -901,8 +907,8 @@ class ControlFlowTest(test.TestCase):
       # the last two "3"s comes from size(p), when p == [0, 0, 0].
       self.assertEqual(stack_push_count, 5 * 3 * 3)
 
-      self.assertAllClose(
-          final_value_with_xla_context, final_value_without_xla_context)
+      self.assertAllClose(final_value_with_xla_context,
+                          final_value_without_xla_context)
 
   # Have more than 10 parallel iterations and hence exercise k-bound
   # most of the time.
@@ -951,8 +957,7 @@ class ControlFlowTest(test.TestCase):
     with self.test_session():
 
       def compute(i, c, o):
-        c = array_ops.strided_slice(x,
-                                    array_ops.expand_dims(i, 0),
+        c = array_ops.strided_slice(x, array_ops.expand_dims(i, 0),
                                     [1] + array_ops.expand_dims(i, 0))
         o = array_ops.concat([o, c], 0)
         i = math_ops.add(i, 1)
@@ -963,11 +968,12 @@ class ControlFlowTest(test.TestCase):
       o = ops.convert_to_tensor([0])
       x = ops.convert_to_tensor([1, 2, 3, 4, 5, 6])
       s = array_ops.size(x)
-      r = control_flow_ops.while_loop(
-          lambda i, c, o: math_ops.less(i, s), compute, [i, c, o], [
-              i.get_shape(), tensor_shape.unknown_shape(),
-              tensor_shape.unknown_shape()
-          ])
+      r = control_flow_ops.while_loop(lambda i, c, o: math_ops.less(i, s),
+                                      compute, [i, c, o], [
+                                          i.get_shape(),
+                                          tensor_shape.unknown_shape(),
+                                          tensor_shape.unknown_shape()
+                                      ])
       result = r[2].eval()
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
@@ -1033,7 +1039,8 @@ class ControlFlowTest(test.TestCase):
         return [new_i, new_j]
 
       r = control_flow_ops.while_loop(
-          c, _b, [i, m], [i.get_shape(), tensor_shape.unknown_shape()])
+          c, _b, [i, m],
+          [i.get_shape(), tensor_shape.unknown_shape()])
       r = r[1] * array_ops.ones([8, 8])
       self.assertAllEqual(np.ones((8, 8)), r.eval())
 
@@ -1065,7 +1072,8 @@ class ControlFlowTest(test.TestCase):
         return [new_i, new_j]
 
       r = control_flow_ops.while_loop(
-          c, b, [i, m], [i.get_shape(), tensor_shape.TensorShape([None, 2])])
+          c, b, [i, m],
+          [i.get_shape(), tensor_shape.TensorShape([None, 2])])
       self.assertTrue(r[1].get_shape()[0].value is None)
       self.assertEqual(r[1].get_shape()[1], tensor_shape.Dimension(2))
 
@@ -1092,20 +1100,22 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, sparse_tensor.SparseTensor(x.indices, x.values * 2.0,
-                                              x.dense_shape)
+            i + 1,
+            sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
       self.assertEqual(r.dense_shape.get_shape()[0].value, 1)
 
       _, r = control_flow_ops.while_loop(
-          c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None])])
+          c, b, [i, x],
+          [i.get_shape(), tensor_shape.TensorShape([None])])
       self.assertTrue(r.dense_shape.get_shape()[0].value is None)
 
       with self.assertRaisesRegexp(ValueError, "is not compatible with"):
         _, r = control_flow_ops.while_loop(
-            c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([5])])
+            c, b, [i, x],
+            [i.get_shape(), tensor_shape.TensorShape([5])])
 
   def testWhileShapeInferenceIndexedSlices(self):
     with self.test_session():
@@ -1120,7 +1130,8 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
+            i + 1,
+            ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
@@ -1128,14 +1139,16 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(r.values.get_shape(), tensor_shape.TensorShape([2, 2]))
 
       _, r = control_flow_ops.while_loop(
-          c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None, 2])])
+          c, b, [i, x],
+          [i.get_shape(), tensor_shape.TensorShape([None, 2])])
       self.assertEqual(r.dense_shape.get_shape()[0].value, 2)
       self.assertTrue(r.values.get_shape()[0].value is None)
       self.assertEqual(r.values.get_shape()[1].value, 2)
 
       with self.assertRaisesRegexp(ValueError, "is not compatible with"):
         _, r = control_flow_ops.while_loop(
-            c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None, 5])])
+            c, b, [i, x],
+            [i.get_shape(), tensor_shape.TensorShape([None, 5])])
 
   def _testNestedWhile_1(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
@@ -1276,16 +1289,17 @@ class ControlFlowTest(test.TestCase):
           "v", [], initializer=init_ops.constant_initializer(2))
       i0 = constant_op.constant(0)
       with ops.control_dependencies([i0]):
+
         def loop_condition(i):
           return i < 4
 
         def loop_body(i):
           some_cond = control_flow_ops.cond(
               constant_op.constant(True),
-              lambda: state_ops.assign(v, math_ops.square(v)),
-              lambda: v)
+              lambda: state_ops.assign(v, math_ops.square(v)), lambda: v)
           with ops.control_dependencies([some_cond]):
             return i + 1
+
       r = control_flow_ops.while_loop(loop_condition, loop_body, (i0,))
       variables.global_variables_initializer().run()
       self.assertEqual(4, r.eval())
@@ -1600,7 +1614,8 @@ class ControlFlowTest(test.TestCase):
 
       _, rx = control_flow_ops.while_loop(
           c1,
-          b1, [r, x], [r.get_shape(), tensor_shape.unknown_shape()],
+          b1, [r, x],
+          [r.get_shape(), tensor_shape.unknown_shape()],
           parallel_iterations=1)
       self.assertEqual(45, rx.eval())
 
@@ -1663,7 +1678,8 @@ class ControlFlowTest(test.TestCase):
       b = lambda i, v: [i + 1, math_ops.multiply(x, v)]
       r = control_flow_ops.while_loop(
           c,
-          b, [n, v], [n.get_shape(), tensor_shape.unknown_shape()],
+          b, [n, v],
+          [n.get_shape(), tensor_shape.unknown_shape()],
           parallel_iterations=1)
 
       r = gradients_impl.gradients(r[1], x)[0]
@@ -1797,8 +1813,8 @@ class ControlFlowTest(test.TestCase):
       named = collections.namedtuple("named", ("a", "b"))
       loop_vars = [
           named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)),
-          (constant_op.constant(2.0),
-           constant_op.constant(3.0)), constant_op.constant(4.0)
+          (constant_op.constant(2.0), constant_op.constant(3.0)),
+          constant_op.constant(4.0)
       ]
       c = lambda lv0, _1, _2: lv0.a < 100.0
 
@@ -1824,8 +1840,8 @@ class ControlFlowTest(test.TestCase):
       named = collections.namedtuple("named", ("a", "b"))
       loop_vars = [
           named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)),
-          (constant_op.constant(2.0),
-           constant_op.constant(3.0)), constant_op.constant(4.0)
+          (constant_op.constant(2.0), constant_op.constant(3.0)),
+          constant_op.constant(4.0)
       ]
       c = lambda lv0, _1, _2: lv0.a < 100.0
 
@@ -2176,7 +2192,8 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
+            i + 1,
+            ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
@@ -2197,8 +2214,8 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, sparse_tensor.SparseTensor(x.indices, x.values * 2.0,
-                                              x.dense_shape)
+            i + 1,
+            sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
@@ -2220,8 +2237,8 @@ class ControlFlowTest(test.TestCase):
         x1 = x + gradients_impl.gradients(data, params)[0]
         return i + 1, x1
 
-      output_grad = control_flow_ops.while_loop(c, b,
-                                                [i0, constant_op.constant(0.0)])
+      output_grad = control_flow_ops.while_loop(
+          c, b, [i0, constant_op.constant(0.0)])
       self.assertAllClose(600.0, sess.run(output_grad)[1])
 
   def testWhileAndTensorArray(self):
@@ -2359,9 +2376,12 @@ class ControlFlowTest(test.TestCase):
 
   def testStopGradMultiFlows(self):
     with self.test_session():
+
       def body(i, y, r):
         x = variable_scope.get_variable(
-            "x", shape=(), dtype=dtypes.float32,
+            "x",
+            shape=(),
+            dtype=dtypes.float32,
             initializer=init_ops.ones_initializer())
         y *= x
         return [i + 1, y, r + math_ops.reduce_sum(y)]
@@ -2773,7 +2793,8 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(
           lambda i, v: i < 2, lambda i, v: [i + 1, func(v)],
           [constant_op.constant(0), x],
-          [tensor_shape.unknown_shape(), tensor_shape.unknown_shape()])
+          [tensor_shape.unknown_shape(),
+           tensor_shape.unknown_shape()])
       self.assertEqual(r[1].eval(), 65536.0)
 
       r = gradients_impl.gradients(r, x)[0]
@@ -2800,12 +2821,14 @@ class ControlFlowContextCheckTest(test.TestCase):
 
   def _getCondTensor(self):
     cond_tensor = []
+
     def true_fn():
       if not cond_tensor:
         cond_tensor.append(constant_op.constant(1))
       return cond_tensor[0]
-    control_flow_ops.cond(math_ops.less(1, 2), true_fn,
-                          lambda: constant_op.constant(0))
+
+    control_flow_ops.cond(
+        math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
     return cond_tensor[0]
 
   def testInvalidContext(self):
@@ -2821,14 +2844,13 @@ class ControlFlowContextCheckTest(test.TestCase):
     # Accessing a while loop tensor in cond is illegal.
     while_tensor = self._getWhileTensor()
     with self.assertRaisesRegexp(
-        ValueError,
-        "Cannot use 'while/Const_1' as input to 'cond/Add' because "
+        ValueError, "Cannot use 'while/Const_1' as input to 'cond/Add' because "
         "'while/Const_1' is in a while loop. See info log for more details."):
       # TODO(skyewm): this passes if we return while_tensor directly instead
       # of using it as input to another op.
-      control_flow_ops.cond(math_ops.less(1, 2),
-                            lambda: math_ops.add(1, while_tensor),
-                            lambda: constant_op.constant(0))
+      control_flow_ops.cond(
+          math_ops.less(1, 2), lambda: math_ops.add(1, while_tensor),
+          lambda: constant_op.constant(0))
 
   def testInvalidContextInWhile(self):
     # Accessing a while loop tensor in a different while loop is illegal.
@@ -2856,6 +2878,7 @@ class ControlFlowContextCheckTest(test.TestCase):
     # Accessing a tensor from a cond context from the other branch's cond
     # context is OK (although dangerous).
     cond_tensor = []
+
     def branch_fn():
       if not cond_tensor:
         cond_tensor.append(constant_op.constant(1))
@@ -2892,12 +2915,13 @@ class ControlFlowContextCheckTest(test.TestCase):
       while_tensor = self._getWhileTensor()
       return control_flow_ops.while_loop(lambda i: i < 3,
                                          lambda i: i + while_tensor, [0])
+
     with self.assertRaisesRegexp(
         ValueError,
         "Cannot use 'cond/while_1/add' as input to 'cond/while/Const_1' because"
         " they are in different while loops. See info log for more details."):
-      control_flow_ops.cond(math_ops.less(1, 2), true_fn,
-                            lambda: constant_op.constant(0))
+      control_flow_ops.cond(
+          math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
 
 
 @test_util.with_c_api
@@ -3005,11 +3029,13 @@ class AssertTest(test.TestCase):
       sess.run(unguarded_assert, options=opts, run_metadata=unguarded_metadata)
       guarded_nodestat_names = [
           n.node_name
-          for d in guarded_metadata.step_stats.dev_stats for n in d.node_stats
+          for d in guarded_metadata.step_stats.dev_stats
+          for n in d.node_stats
       ]
       unguarded_nodestat_names = [
           n.node_name
-          for d in unguarded_metadata.step_stats.dev_stats for n in d.node_stats
+          for d in unguarded_metadata.step_stats.dev_stats
+          for n in d.node_stats
       ]
       guarded_memcpy_nodestat_names = [
           n for n in guarded_nodestat_names if "MEMCPYDtoH" in n
@@ -3066,6 +3092,7 @@ class WhileOpBenchmark(test.Benchmark):
     Returns:
       The duration of the run in seconds.
     """
+
     def loop_body(i, x):
       with ops.device("/gpu:0"):
         # Always put loop body on GPU.
@@ -3107,7 +3134,7 @@ class WhileOpBenchmark(test.Benchmark):
       start_time = time.time()
       for _ in xrange(num_iters):
         sess.run(r)
-      return (time.time() - start_time)/num_iters
+      return (time.time() - start_time) / num_iters
 
   def benchmarkWhileOpCrossDevicePlacement(self):
     iters = 10
@@ -3154,23 +3181,20 @@ class EagerTest(test.TestCase):
   def testWhileLoop(self):
     with context.eager_mode():
       tensor = constant_op.constant([1, 2, 3, 4, 5])
-      self.assertAllEqual(isum(tensor).numpy(),
-                          [46, 47, 48, 49, 50])
+      self.assertAllEqual(isum(tensor).numpy(), [46, 47, 48, 49, 50])
 
   def testWhileLoopWithMaxIterations(self):
     with context.eager_mode():
       tensor = constant_op.constant([1, 2, 3, 4, 5])
-      self.assertAllEqual(isum(tensor, maximum_iterations=3).numpy(),
-                          [1+3, 2+3, 3+3, 4+3, 5+3])
+      self.assertAllEqual(
+          isum(tensor, maximum_iterations=3).numpy(),
+          [1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3])
 
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with context.eager_mode():
       tensor = constant_op.constant(0)
       r = control_flow_ops.while_loop(
-          lambda i: i < 3,
-          lambda i: i + 1,
-          [tensor],
-          maximum_iterations=1)
+          lambda i: i < 3, lambda i: i + 1, [tensor], maximum_iterations=1)
       self.assertEqual(1, r.numpy())
 
   def testWithDependencies(self):
@@ -3197,8 +3221,8 @@ class EagerTest(test.TestCase):
       f2 = lambda: constant_op.constant(23)
       f3 = lambda: constant_op.constant(-1)
 
-      r1 = control_flow_ops.case([(x < y, f1), (x > z, f2)],
-                                 default=f3, exclusive=True)
+      r1 = control_flow_ops.case(
+          [(x < y, f1), (x > z, f2)], default=f3, exclusive=True)
       self.assertAllEqual(r1.numpy(), 17)
 
 
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index a91917b27f..0d9b46c30d 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -71,6 +71,7 @@ def _sparsify(x, thresh=0.5, index_dtype=np.int64):
   return sparse_tensor.SparseTensor(
       indices=x_indices, values=x_values, dense_shape=x_shape), x_values
 
+
 def _default_tolerance(dtype):
   """Returns a sensible default tolerance for comparing results of a given
   type"""
@@ -81,7 +82,7 @@ def _default_tolerance(dtype):
   elif dtype in (np.float64, np.complex128):
     return 1e-5
   else:
-    return None # Fail fast for unexpected types
+    return None  # Fail fast for unexpected types
 
 
 class UnaryOpTest(test.TestCase):
@@ -233,10 +234,10 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(k, np.arccos, math_ops.acos)
     self._compareBoth(x, np.arctan, math_ops.atan)
     self._compareBoth(x, np.tan, math_ops.tan)
-    self._compareBoth(
-        y,
-        np.vectorize(self._replace_domain_error_with_inf(math.lgamma)),
-        math_ops.lgamma)
+    self._compareBoth(y,
+                      np.vectorize(
+                          self._replace_domain_error_with_inf(math.lgamma)),
+                      math_ops.lgamma)
     self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
     self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
 
@@ -298,8 +299,8 @@ class UnaryOpTest(test.TestCase):
     w = x - x.min() + 1.02  # all greater than 1
     y = (x + .5).astype(np.float64)  # no zero
     z = (x + 15.5).astype(np.float64)  # all positive
-    k = np.arange(-0.90, 0.90, 0.35).reshape(1, 3, 2).astype(
-        np.float64)  # between -1 and 1
+    k = np.arange(-0.90, 0.90,
+                  0.35).reshape(1, 3, 2).astype(np.float64)  # between -1 and 1
     self._compareBoth(x, np.abs, math_ops.abs)
     self._compareBoth(x, np.abs, _ABS)
     self._compareBoth(x, np.negative, math_ops.negative)
@@ -322,10 +323,10 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(y, np.sign, math_ops.sign)
     self._compareBoth(x, np.sin, math_ops.sin)
     self._compareBoth(x, np.cos, math_ops.cos)
-    self._compareBoth(
-        y,
-        np.vectorize(self._replace_domain_error_with_inf(math.lgamma)),
-        math_ops.lgamma)
+    self._compareBoth(y,
+                      np.vectorize(
+                          self._replace_domain_error_with_inf(math.lgamma)),
+                      math_ops.lgamma)
     self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
     self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
     self._compareBoth(x, np.arctan, math_ops.atan)
@@ -362,10 +363,10 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(y, np.sign, math_ops.sign)
     self._compareBoth(x, np.sin, math_ops.sin)
     self._compareBoth(x, np.cos, math_ops.cos)
-    self._compareBoth(
-        y,
-        np.vectorize(self._replace_domain_error_with_inf(math.lgamma)),
-        math_ops.lgamma)
+    self._compareBoth(y,
+                      np.vectorize(
+                          self._replace_domain_error_with_inf(math.lgamma)),
+                      math_ops.lgamma)
     self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
     self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
 
@@ -406,8 +407,8 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(x, np.sign, math_ops.sign)
 
   def testComplex64Basic(self):
-    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3,
-                                                    2).astype(np.complex64)
+    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
+        np.complex64)
     y = x + np.complex(0.5, 0.5)  # no zeros
     self._compareBoth(x, np.abs, math_ops.abs)
     self._compareBoth(x, np.abs, _ABS)
@@ -450,8 +451,8 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(y, complex_sign, math_ops.sign)
 
   def testComplex128Basic(self):
-    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3,
-                                                    2).astype(np.complex128)
+    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
+        np.complex128)
     y = x + np.complex(0.5, 0.5)  # no zeros
     self._compareBoth(x, np.abs, math_ops.abs)
     self._compareBoth(x, np.abs, _ABS)
@@ -805,10 +806,10 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y, np.mod, _MOD)
 
   def testComplex64Basic(self):
-    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(
-        1, 3, 2).astype(np.complex64)
-    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(
-        1, 3, 2).astype(np.complex64)
+    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype(
+        np.complex64)
+    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(1, 3, 2).astype(
+        np.complex64)
     self._compareBoth(x, y, np.add, math_ops.add)
     self._compareBoth(x, y, np.subtract, math_ops.subtract)
     self._compareBoth(x, y, np.multiply, math_ops.multiply)
@@ -819,10 +820,10 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y + 0.1, np.true_divide, _TRUEDIV)
 
   def testComplex128Basic(self):
-    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(
-        1, 3, 2).astype(np.complex128)
-    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(
-        1, 3, 2).astype(np.complex128)
+    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype(
+        np.complex128)
+    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(1, 3, 2).astype(
+        np.complex128)
     self._compareBoth(x, y, np.add, math_ops.add)
     self._compareBoth(x, y, np.subtract, math_ops.subtract)
     self._compareBoth(x, y, np.multiply, math_ops.multiply)
@@ -1127,8 +1128,8 @@ class BinaryOpTest(test.TestCase):
 
   def testMismatchedDimensions(self):
     for func in [
-        math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div,
-        _ADD, _SUB, _MUL, _TRUEDIV, _FLOORDIV
+        math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div, _ADD,
+        _SUB, _MUL, _TRUEDIV, _FLOORDIV
     ]:
       with self.assertRaisesWithPredicateMatch(
           ValueError, lambda e: "Dimensions must" in str(e)):
@@ -1161,8 +1162,8 @@ class BinaryOpTest(test.TestCase):
                    (1.2345, float("inf")), (1.2345, -float("inf")),
                    (-4.321, float("inf")), (-4.125, -float("inf")),
                    (float("inf"), float("inf")), (float("inf"), -float("inf")),
-                   (-float("inf"), float("inf")), (-float("inf"),
-                                                   -float("inf")))
+                   (-float("inf"), float("inf")),
+                   (-float("inf"), -float("inf")))
     for dtype in np.float32, np.float64:
       x1 = np.array(x1l).astype(dtype)
       x2 = np.array(x2l).astype(dtype)
@@ -1213,22 +1214,22 @@ class ComparisonOpTest(test.TestCase):
       for x in data:
         for y in data:
           self.assertEqual(self._compareScalar(math_ops.less, x, y, t), x < y)
-          self.assertEqual(self._compareScalar(math_ops.less_equal, x, y, t),
-                           x <= y)
-          self.assertEqual(self._compareScalar(math_ops.greater, x, y, t),
-                           x > y)
+          self.assertEqual(
+              self._compareScalar(math_ops.less_equal, x, y, t), x <= y)
+          self.assertEqual(
+              self._compareScalar(math_ops.greater, x, y, t), x > y)
           self.assertEqual(
               self._compareScalar(math_ops.greater_equal, x, y, t), x >= y)
           self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y)
-          self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t),
-                           x != y)
+          self.assertEqual(
+              self._compareScalar(math_ops.not_equal, x, y, t), x != y)
     data = [-1, 0, 1, -1j, 1j, 1 + 1j, 1 - 1j]
     for t in [np.complex64, np.complex128]:
       for x in data:
         for y in data:
           self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y)
-          self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t),
-                           x != y)
+          self.assertEqual(
+              self._compareScalar(math_ops.not_equal, x, y, t), x != y)
 
   def _compare(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
@@ -1311,8 +1312,8 @@ class ComparisonOpTest(test.TestCase):
     self._testBCastByFunc(np.equal, math_ops.equal, include_complex=True)
 
   def testBCastNotEqual(self):
-    self._testBCastByFunc(np.not_equal, math_ops.not_equal,
-                          include_complex=True)
+    self._testBCastByFunc(
+        np.not_equal, math_ops.not_equal, include_complex=True)
 
   def testShapeMismatch(self):
     dtypes = [np.float16, np.float32, np.float64, np.int32, np.int64]
@@ -1771,9 +1772,8 @@ class MathOpsOverloadTest(test.TestCase):
   def _compareUnary(self, x, dtype, np_func, tf_func):
     np_ans = np_func(x).astype(dtype.as_numpy_dtype)
     with self.test_session(use_gpu=False):
-      self.assertAllClose(
-          np_ans, tf_func(ops.convert_to_tensor(
-              x, dtype=dtype)).eval())
+      self.assertAllClose(np_ans,
+                          tf_func(ops.convert_to_tensor(x, dtype=dtype)).eval())
 
   def testOverload(self):
     dtypes = [
@@ -1795,8 +1795,8 @@ class MathOpsOverloadTest(test.TestCase):
     ]
     for dtype in dtypes:
       for np_func, tf_func in funcs:
-        if dtype in (dtypes_lib.complex64, dtypes_lib.complex128
-                    ) and tf_func == _FLOORDIV:
+        if dtype in (dtypes_lib.complex64,
+                     dtypes_lib.complex128) and tf_func == _FLOORDIV:
           continue  # floordiv makes no sense for complex
         self._compareBinary(10, 5, dtype, np_func, tf_func)
     # Mod only works for int32 and int64.
@@ -2008,7 +2008,8 @@ class ComplexMakeRealImagTest(test.TestCase):
     # self._compareAngle(cplx, use_gpu=True)
 
   def testRealReal(self):
-    for dtype in dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32, dtypes_lib.float64:
+    for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32,
+                  dtypes_lib.float64):
       x = array_ops.placeholder(dtype)
       y = math_ops.real(x)
       self.assertEqual(x, y)
@@ -2037,15 +2038,16 @@ class ComplexMakeRealImagTest(test.TestCase):
     self._compareConj(cplx, use_gpu=True)
 
   def testConjReal(self):
-    for dtype in dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64:
+    for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16,
+                  dtypes_lib.float32, dtypes_lib.float64):
       x = array_ops.placeholder(dtype)
       y = math_ops.conj(x)
       self.assertEqual(x, y)
 
   def testConjString(self):
     x = array_ops.placeholder(dtypes_lib.string)
-    with self.assertRaisesRegexp(
-        TypeError, r"Expected numeric or variant tensor"):
+    with self.assertRaisesRegexp(TypeError,
+                                 r"Expected numeric or variant tensor"):
       math_ops.conj(x)
 
   def _compareGradient(self, x):
@@ -2060,8 +2062,9 @@ class ComplexMakeRealImagTest(test.TestCase):
       real, imag = array_ops.reshape(real, [-1]), array_ops.reshape(imag, [-1])
       cplx = math_ops.complex(real, imag)
       cplx = math_ops.conj(cplx)
-      loss = math_ops.reduce_sum(math_ops.square(math_ops.real(
-          cplx))) + math_ops.reduce_sum(math_ops.square(math_ops.imag(cplx)))
+      loss = math_ops.reduce_sum(math_ops.square(
+          math_ops.real(cplx))) + math_ops.reduce_sum(
+              math_ops.square(math_ops.imag(cplx)))
       epsilon = 1e-3
       jacob_t, jacob_n = gradient_checker.compute_gradient(
           inx, list(x.shape), loss, [1], x_init_value=x, delta=epsilon)
@@ -2125,8 +2128,8 @@ class AccumulateTest(test.TestCase):
           np.random.rand(16, 16, 16, 16).astype(np.float32) for _ in range(20)
       ]
       random_tensors = [
-          ops.convert_to_tensor(
-              x, dtype=dtypes_lib.float32) for x in random_arrays
+          ops.convert_to_tensor(x, dtype=dtypes_lib.float32)
+          for x in random_arrays
       ]
       tf_val = math_ops.accumulate_n(random_tensors)
       np_val = random_arrays[0]
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
index cf723f5eec..a4b30e4319 100644
--- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
@@ -48,8 +48,10 @@ class DynamicStitchTestBase(object):
 
   def testShapeInferenceForScalarWithNonConstantIndices(self):
     with self.test_session(use_gpu=True):
-      indices = [array_ops.placeholder(dtype=dtypes.int32),
-                 constant_op.constant(1)]
+      indices = [
+          array_ops.placeholder(dtype=dtypes.int32),
+          constant_op.constant(1)
+      ]
       data = [constant_op.constant(40), constant_op.constant(60)]
       for step in -1, 1:
         stitched_t = self.stitch_op(indices[::step], data)
@@ -61,7 +63,8 @@ class DynamicStitchTestBase(object):
   def testSimpleOneDimensional(self):
     with self.test_session(use_gpu=True):
       indices = [
-          constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5])
+          constant_op.constant([0, 4, 7]),
+          constant_op.constant([1, 6, 2, 3, 5])
       ]
       data = [
           constant_op.constant([0, 40, 70]),
@@ -86,7 +89,8 @@ class DynamicStitchTestBase(object):
   def testSimpleTwoDimensional(self):
     with self.test_session(use_gpu=True):
       indices = [
-          constant_op.constant([0, 4, 7]), constant_op.constant([1, 6]),
+          constant_op.constant([0, 4, 7]),
+          constant_op.constant([1, 6]),
           constant_op.constant([2, 3, 5])
       ]
       data = [
@@ -104,7 +108,8 @@ class DynamicStitchTestBase(object):
   def testHigherRank(self):
     with self.test_session(use_gpu=True) as sess:
       indices = [
-          constant_op.constant(6), constant_op.constant([4, 1]),
+          constant_op.constant(6),
+          constant_op.constant([4, 1]),
           constant_op.constant([[5, 2], [0, 3]])
       ]
       data = [
@@ -127,7 +132,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorIndicesMultiDimensional(self):
     indices = [
-        constant_op.constant([0, 4, 7]), constant_op.constant([[1, 6, 2, 3, 5]])
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([[1, 6, 2, 3, 5]])
     ]
     data = [
         constant_op.constant([[0, 40, 70]]),
@@ -138,7 +144,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorDataNumDimsMismatch(self):
     indices = [
-        constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5])
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([1, 6, 2, 3, 5])
     ]
     data = [
         constant_op.constant([0, 40, 70]),
@@ -149,7 +156,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorDataDimSizeMismatch(self):
     indices = [
-        constant_op.constant([0, 4, 5]), constant_op.constant([1, 6, 2, 3])
+        constant_op.constant([0, 4, 5]),
+        constant_op.constant([1, 6, 2, 3])
     ]
     data = [
         constant_op.constant([[0], [40], [70]]),
@@ -160,7 +168,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorDataAndIndicesSizeMismatch(self):
     indices = [
-        constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5])
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([1, 6, 2, 3, 5])
     ]
     data = [
         constant_op.constant([0, 40, 70]),
@@ -235,13 +244,15 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
   def testHigherRankGPU(self):
     with self.test_session() as sess:
       indices = [
-          constant_op.constant(6), constant_op.constant([4, 1]),
+          constant_op.constant(6),
+          constant_op.constant([4, 1]),
           constant_op.constant([[5, 2], [0, 3]])
       ]
       data = [
           constant_op.constant([61, 62], dtype=dtypes.float32),
           constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
-          constant_op.constant([[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
+          constant_op.constant(
+              [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
       ]
       stitched_t = data_flow_ops.dynamic_stitch(indices, data)
       stitched_val = stitched_t.eval()
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index 56a07cb012..f5c6255c34 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -50,8 +50,7 @@ class PartitionerCreatorsTest(test.TestCase):
     with self.test_session():
       partitioner = partitioned_variables.fixed_size_partitioner(4, axis=0)
       with variable_scope.variable_scope("root", partitioner=partitioner):
-        v0 = variable_scope.get_variable(
-            "v0", dtype=dtypes.int64, shape=[20])
+        v0 = variable_scope.get_variable("v0", dtype=dtypes.int64, shape=[20])
         v0_list = v0._get_variable_list()
         self.assertEqual(len(v0_list), 4)
 
@@ -169,8 +168,10 @@ class PartitionerCreatorsTest(test.TestCase):
           max_shards=2)
 
       # Use the partitioner with strings
-      partitioner_axis3_str = partitioned_variables.variable_axis_size_partitioner(
-          axis=3, max_shard_bytes=32768, bytes_per_string_element=8)
+      partitioner_axis3_str = partitioned_variables.variable_axis_size_partitioner(  # pylint: disable=line-too-long
+          axis=3,
+          max_shard_bytes=32768,
+          bytes_per_string_element=8)
 
       with variable_scope.variable_scope(
           "root", partitioner=partitioner_axis3_str):
@@ -423,8 +424,7 @@ class PartitionedVariablesTestCase(test.TestCase):
   def testRandomInitUnevenPartitions(self):
     with self.test_session():
       rnd = variables.Variable(
-          random_ops.random_uniform(
-              [20, 43], dtype=dtypes.float64))
+          random_ops.random_uniform([20, 43], dtype=dtypes.float64))
       var_lists = [
           partitioned_variables.create_partitioned_variables(
               rnd.get_shape(), [1, i], rnd.initialized_value())
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index 5a2903a423..df37dd98ec 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -203,7 +203,8 @@ class RandomUniformTest(test.TestCase):
     return func
 
   def testRange(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       sampler = self._Sampler(1000, minv=-2, maxv=8, dtype=dt, use_gpu=True)
       x = sampler()
       self.assertTrue(-2 <= np.min(x))
@@ -213,7 +214,8 @@ class RandomUniformTest(test.TestCase):
   # to see the same sequence of values. Will catch buggy
   # implementations which uses the same random number seed.
   def testDistinct(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       maxv = 1.0 if dt.is_floating else 1 << 30
       sampler = self._Sampler(1000, minv=0, maxv=maxv, dtype=dt, use_gpu=True)
       x = sampler()
@@ -251,7 +253,8 @@ class RandomUniformTest(test.TestCase):
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
   def testCPUGPUMatch(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       maxv = 1.0 if dt.is_floating else 17
       results = {}
       for use_gpu in False, True:
@@ -261,7 +264,8 @@ class RandomUniformTest(test.TestCase):
       self.assertAllEqual(results[False], results[True])
 
   def testSeed(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       for seed in [345, 2**100, -2**100]:
         sx = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed)
         sy = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed)
@@ -285,8 +289,7 @@ class RandomShapeTest(test.TestCase):
     self.assertEqual([1, 2, 3], rnd1.get_shape())
     # Partially known shape.
     rnd2 = random_ops.truncated_normal(
-        array_ops.placeholder(
-            dtypes.int32, shape=(3,)))
+        array_ops.placeholder(dtypes.int32, shape=(3,)))
     self.assertEqual([None, None, None], rnd2.get_shape().as_list())
     # Unknown shape.
     rnd3 = random_ops.truncated_normal(array_ops.placeholder(dtypes.int32))
@@ -298,8 +301,7 @@ class RandomShapeTest(test.TestCase):
     self.assertEqual([1, 2, 3], rnd1.get_shape())
     # Partially known shape.
     rnd2 = random_ops.random_normal(
-        array_ops.placeholder(
-            dtypes.int32, shape=(3,)))
+        array_ops.placeholder(dtypes.int32, shape=(3,)))
     self.assertEqual([None, None, None], rnd2.get_shape().as_list())
     # Unknown shape.
     rnd3 = random_ops.random_normal(array_ops.placeholder(dtypes.int32))
@@ -311,8 +313,7 @@ class RandomShapeTest(test.TestCase):
     self.assertEqual([1, 2, 3], rnd1.get_shape())
     # Partially known shape.
     rnd2 = random_ops.random_uniform(
-        array_ops.placeholder(
-            dtypes.int32, shape=(3,)))
+        array_ops.placeholder(dtypes.int32, shape=(3,)))
     self.assertEqual([None, None, None], rnd2.get_shape().as_list())
     # Unknown shape.
     rnd3 = random_ops.random_uniform(array_ops.placeholder(dtypes.int32))
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index c6c7c4e26c..e152f02d8e 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -38,9 +38,8 @@ class XentTest(test.TestCase):
       dim = len(features.shape) - 1
     one_only_on_dim = list(features.shape)
     one_only_on_dim[dim] = 1
-    e = np.exp(features - np.reshape(
-        np.amax(
-            features, axis=dim), one_only_on_dim))
+    e = np.exp(
+        features - np.reshape(np.amax(features, axis=dim), one_only_on_dim))
     probs = e / np.reshape(np.sum(e, axis=dim), one_only_on_dim)
     bp = (probs - labels)
     l = -np.sum(labels * np.log(probs + 1.0e-20), axis=dim)
@@ -85,10 +84,10 @@ class XentTest(test.TestCase):
 
   def testRankTooLarge(self):
     for dtype in np.float16, np.float32:
-      np_features = np.array(
-          [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]).astype(dtype)
-      np_labels = np.array(
-          [[[0., 0., 0., 1.]], [[0., .5, .5, 0.]]]).astype(dtype)
+      np_features = np.array([[[1., 1., 1., 1.]], [[1., 2., 3.,
+                                                    4.]]]).astype(dtype)
+      np_labels = np.array([[[0., 0., 0., 1.]], [[0., .5, .5,
+                                                  0.]]]).astype(dtype)
       self.assertRaisesRegexp(ValueError, "must be rank 2",
                               gen_nn_ops._softmax_cross_entropy_with_logits,
                               np_features, np_labels)
@@ -121,8 +120,8 @@ class XentTest(test.TestCase):
     # = [1.3862, 1.9401]
     np_loss, np_backprop = self._npXent(np.array(features), np.array(labels))
     self.assertAllClose(
-        np.array([[0.25, 0.25, 0.25, -0.75],
-                  [0.0321, -0.4129, -0.2632, 0.6439]]),
+        np.array([[0.25, 0.25, 0.25, -0.75], [0.0321, -0.4129, -0.2632,
+                                              0.6439]]),
         np_backprop,
         rtol=1.e-3,
         atol=1.e-3)
@@ -168,15 +167,17 @@ class XentTest(test.TestCase):
           shape=[3, 4],
           dtype=dtypes.float64,
           name="f")
-      x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f,
-                                                   name="xent")
+      x = nn_ops.softmax_cross_entropy_with_logits(
+          labels=l, logits=f, name="xent")
       err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3])
 
       # Check that no extra computation performed. When only first derivative is requested,
       # second derivative must not be computed. So when there is no second derivative,
       # there is no `BatchMatMul` op in the graph.
-      op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def]
-      self.assertNotIn('BatchMatMul', op_names)
+      op_names = [
+          op.op_def.name for op in sess.graph.get_operations() if op.op_def
+      ]
+      self.assertNotIn("BatchMatMul", op_names)
 
     print("cross entropy gradient err = ", err)
     self.assertLess(err, 5e-8)
@@ -193,24 +194,29 @@ class XentTest(test.TestCase):
           shape=[3, 4],
           dtype=dtypes.float64,
           name="f")
-      x = nn_ops.softmax_cross_entropy_with_logits_v2(labels=l, logits=f,
-                                                      name="xent")
+      x = nn_ops.softmax_cross_entropy_with_logits_v2(
+          labels=l, logits=f, name="xent")
       err = gradient_checker.compute_gradient_error(l, [3, 4], x, [3])
 
     self.assertLess(err, 5e-8)
 
   def testSecondGradient(self):
     with self.test_session() as sess:
-      l = constant_op.constant([0.0, 0.0, 1.0/3, 0.0,
-                                1.0/3, 0.0, 0.0, 0.0,
-                                0.0, 0.5/3, 0.0, 0.5/3], shape=[12],
-                               dtype=dtypes.float64, name="l")
-      f = constant_op.constant([0.1, 0.2, 0.3, 0.4,
-                                0.1, 0.4, 0.9, 1.6,
-                                0.1, 0.8, 2.7, 6.4], shape=[12],
-                               dtype=dtypes.float64, name="f")
-      x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f,
-                                                   name="xent")
+      l = constant_op.constant(
+          [
+              0.0, 0.0, 1.0 / 3, 0.0, 1.0 / 3, 0.0, 0.0, 0.0, 0.0, 0.5 / 3, 0.0,
+              0.5 / 3
+          ],
+          shape=[12],
+          dtype=dtypes.float64,
+          name="l")
+      f = constant_op.constant(
+          [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4],
+          shape=[12],
+          dtype=dtypes.float64,
+          name="f")
+      x = nn_ops.softmax_cross_entropy_with_logits(
+          labels=l, logits=f, name="xent")
       loss = math_ops.reduce_sum(x)
 
       gradients = gradients_impl.gradients(loss, [f])[0]
@@ -219,20 +225,23 @@ class XentTest(test.TestCase):
 
       # Check that second derivative is calculated.
       # (it is equivalent to being `BatchMatMul` op in the graph because of implementation of xentropy grad)
-      op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def]
-      self.assertIn('BatchMatMul', op_names)
+      op_names = [
+          op.op_def.name for op in sess.graph.get_operations() if op.op_def
+      ]
+      self.assertIn("BatchMatMul", op_names)
 
     print("cross entropy hessian err = ", err)
     self.assertLess(err, 5e-8)
 
   def testWrapper(self):
-    features = np.array(
-        [[[1., 1., 1., 1.], [1., 2., 3., 4.]],
-         [[2., 3., 4., 5.], [6., 7., 8., 9.]],
-         [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(np.float32)
+    features = np.array([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
+                         [[2., 3., 4., 5.], [6., 7., 8., 9.]],
+                         [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(
+                             np.float32)
     labels = np.array([[[0., 0., 0., 1.], [0., 1., 0., 0.]],
                        [[0., 0.5, 0.5, 0.], [0.5, 0.5, 0., 0.]],
-                       [[0., 1., 0., 0.], [0., 0., 1., 0.]]]).astype(np.float32)
+                       [[0., 1., 0., 0.], [0., 0., 1., 0.]]]).astype(
+                           np.float32)
     self._testXentWrapper(features, labels, dim=0, use_gpu=False)
     self._testXentWrapper(features, labels, dim=0, use_gpu=True)
     self._testXentWrapper(features, labels, dim=1, use_gpu=False)
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 80911ffe07..47dd8231c0 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -194,11 +194,11 @@ class AdjustGamma(test_util.TensorFlowTestCase):
     with self.test_session():
       x_data = np.random.uniform(0, 255, (8, 8))
       x_np = np.array(x_data, dtype=np.float32)
-    
+
       x = constant_op.constant(x_np, shape=x_np.shape)
 
-      err_msg = 'Gamma should be a non-negative real number.'
-      
+      err_msg = "Gamma should be a non-negative real number."
+
       try:
         image_ops.adjust_gamma(x, gamma=-1)
       except Exception as e:
@@ -212,13 +212,13 @@ class AdjustGamma(test_util.TensorFlowTestCase):
     with self.test_session():
       x_data = np.random.uniform(0, 255, (8, 8))
       x_np = np.array(x_data, dtype=np.float32)
-    
+
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = constant_op.constant(-1.0, dtype=dtypes.float32)
-      
+
       image = image_ops.adjust_gamma(x, gamma=y)
-      
-      err_msg = 'Gamma should be a non-negative real number.'
+
+      err_msg = "Gamma should be a non-negative real number."
       try:
         image.eval()
       except Exception as e:
@@ -226,7 +226,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
           raise
       else:
         raise AssertionError("Exception not raised: %s" % err_msg)
-      
+
   def test_adjust_gamma_zero(self):
     """White image should be returned for gamma equal to zero"""
     with self.test_session():
@@ -253,13 +253,13 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       y_tf = np.trunc(y.eval())
 
       y_np = np.array(
-          [[0, 31, 45, 55, 63, 71, 78, 84],
-           [90, 95, 100, 105, 110, 115, 119, 123],
-           [127, 131, 135, 139, 142, 146, 149, 153],
-           [156, 159, 162, 165, 168, 171, 174, 177],
-           [180, 183, 186, 188, 191, 194, 196, 199],
-           [201, 204, 206, 209, 211, 214, 216, 218],
-           [221, 223, 225, 228, 230, 232, 234, 236],
+          [[0, 31, 45, 55, 63, 71, 78, 84], [
+              90, 95, 100, 105, 110, 115, 119, 123
+          ], [127, 131, 135, 139, 142, 146, 149, 153], [
+              156, 159, 162, 165, 168, 171, 174, 177
+          ], [180, 183, 186, 188, 191, 194, 196, 199], [
+              201, 204, 206, 209, 211, 214, 216, 218
+          ], [221, 223, 225, 228, 230, 232, 234, 236],
            [238, 241, 243, 245, 247, 249, 251, 253]],
           dtype=np.float32)
 
@@ -274,14 +274,12 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       y_tf = np.trunc(y.eval())
 
       y_np = np.array(
-          [[0, 0, 0, 0, 1, 1, 2, 3],
-           [4, 5, 6, 7, 9, 10, 12, 14],
-           [16, 18, 20, 22, 25, 27, 30, 33],
-           [36, 39, 42, 45, 49, 52, 56, 60],
-           [64, 68, 72, 76, 81, 85, 90, 95],
-           [100, 105, 110, 116, 121, 127, 132, 138],
-           [144, 150, 156, 163, 169, 176, 182, 189],
-           [196, 203, 211, 218, 225, 233, 241, 249]],
+          [[0, 0, 0, 0, 1, 1, 2, 3], [4, 5, 6, 7, 9, 10, 12, 14], [
+              16, 18, 20, 22, 25, 27, 30, 33
+          ], [36, 39, 42, 45, 49, 52, 56, 60], [64, 68, 72, 76, 81, 85, 90, 95],
+           [100, 105, 110, 116, 121, 127, 132, 138], [
+               144, 150, 156, 163, 169, 176, 182, 189
+           ], [196, 203, 211, 218, 225, 233, 241, 249]],
           dtype=np.float32)
 
       self.assertAllClose(y_tf, y_np, 1e-6)
@@ -425,8 +423,7 @@ class FlipImageBenchmark(test.Benchmark):
     with session.Session("", graph=ops.Graph(), config=config) as sess:
       with ops.device(device):
         inputs = variables.Variable(
-            random_ops.random_uniform(
-                image_shape, dtype=dtypes.float32) * 255,
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.flip_left_right(inputs)
@@ -456,8 +453,7 @@ class FlipImageBenchmark(test.Benchmark):
     with session.Session("", graph=ops.Graph(), config=config) as sess:
       with ops.device(device):
         inputs = variables.Variable(
-            random_ops.random_uniform(
-                image_shape, dtype=dtypes.float32) * 255,
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.random_flip_left_right(inputs)
@@ -508,8 +504,7 @@ class AdjustHueBenchmark(test.Benchmark):
     with session.Session("", graph=ops.Graph(), config=config) as sess:
       with ops.device(device):
         inputs = variables.Variable(
-            random_ops.random_uniform(
-                image_shape, dtype=dtypes.float32) * 255,
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
             trainable=False,
             dtype=dtypes.float32)
         delta = constant_op.constant(0.1, dtype=dtypes.float32)
@@ -553,8 +548,7 @@ class AdjustSaturationBenchmark(test.Benchmark):
     with session.Session("", graph=ops.Graph(), config=config) as sess:
       with ops.device(device):
         inputs = variables.Variable(
-            random_ops.random_uniform(
-                image_shape, dtype=dtypes.float32) * 255,
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
             trainable=False,
             dtype=dtypes.float32)
         delta = constant_op.constant(0.1, dtype=dtypes.float32)
@@ -609,10 +603,11 @@ class ResizeBilinearBenchmark(test.Benchmark):
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
-          name=("resize_bilinear_%s_%s_%s" %
-                (image_size[0], image_size[1], num_channels)))
-      print("%s   : %.2f ms/img" % (results["name"], 1000 * results["wall_time"]
-                                    / (batch_size * num_ops)))
+          name=("resize_bilinear_%s_%s_%s" % (image_size[0], image_size[1],
+                                              num_channels)))
+      print("%s   : %.2f ms/img" %
+            (results["name"],
+             1000 * results["wall_time"] / (batch_size * num_ops)))
 
   def benchmarkSimilar3Channel(self):
     self._benchmarkResize((183, 229), 3)
@@ -659,8 +654,9 @@ class ResizeBicubicBenchmark(test.Benchmark):
           min_iters=20,
           name=("resize_bicubic_%s_%s_%s" % (image_size[0], image_size[1],
                                              num_channels)))
-      print("%s   : %.2f ms/img" % (results["name"], 1000 * results["wall_time"]
-                                    / (batch_size * num_ops)))
+      print("%s   : %.2f ms/img" %
+            (results["name"],
+             1000 * results["wall_time"] / (batch_size * num_ops)))
 
   def benchmarkSimilar3Channel(self):
     self._benchmarkResize((183, 229), 3)
@@ -696,8 +692,8 @@ class ResizeAreaBenchmark(test.Benchmark):
     batch_size = 1
     num_ops = 1000
     img = variables.Variable(
-        random_ops.random_normal([batch_size, image_size[0],
-                                  image_size[1], num_channels]),
+        random_ops.random_normal(
+            [batch_size, image_size[0], image_size[1], num_channels]),
         name="img")
 
     deps = []
@@ -710,12 +706,13 @@ class ResizeAreaBenchmark(test.Benchmark):
     with session.Session() as sess:
       sess.run(variables.global_variables_initializer())
       results = self.run_op_benchmark(
-          sess, benchmark_op,
-          name=("resize_area_%s_%s_%s" %
-                (image_size[0], image_size[1], num_channels)))
-      print("%s   : %.2f ms/img" % (
-          results["name"],
-          1000*results["wall_time"] / (batch_size * num_ops)))
+          sess,
+          benchmark_op,
+          name=("resize_area_%s_%s_%s" % (image_size[0], image_size[1],
+                                          num_channels)))
+      print("%s   : %.2f ms/img" %
+            (results["name"],
+             1000 * results["wall_time"] / (batch_size * num_ops)))
 
   def benchmarkSimilar3Channel(self):
     self._benchmarkResize((183, 229), 3)
@@ -789,8 +786,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     flt_image = image_ops.convert_image_dtype(image, dtypes.float32)
     saturation_adjusted_image = gen_image_ops.adjust_saturation(
         flt_image, saturation_factor)
-    return image_ops.convert_image_dtype(saturation_adjusted_image,
-                                         orig_dtype)
+    return image_ops.convert_image_dtype(saturation_adjusted_image, orig_dtype)
 
   def testHalfSaturationFused(self):
     x_shape = [2, 2, 3]
@@ -895,7 +891,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
-      self.assertTrue(y.op.name.startswith('flip_left_right'))
+      self.assertTrue(y.op.name.startswith("flip_left_right"))
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
@@ -906,7 +902,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_left_right(x_tf)
-      self.assertTrue(y.op.name.startswith('random_flip_left_right'))
+      self.assertTrue(y.op.name.startswith("random_flip_left_right"))
 
       count_flipped = 0
       count_unflipped = 0
@@ -937,7 +933,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
-      self.assertTrue(y.op.name.startswith('flip_up_down'))
+      self.assertTrue(y.op.name.startswith("flip_up_down"))
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
@@ -948,7 +944,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_up_down(x_tf)
-      self.assertTrue(y.op.name.startswith('random_flip_up_down'))
+      self.assertTrue(y.op.name.startswith("random_flip_up_down"))
       count_flipped = 0
       count_unflipped = 0
       for _ in range(50):
@@ -978,7 +974,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
-      self.assertTrue(y.op.name.startswith('transpose_image'))
+      self.assertTrue(y.op.name.startswith("transpose_image"))
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
@@ -1203,7 +1199,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.per_image_standardization(x)
-      self.assertTrue(y.op.name.startswith('per_image_standardization'))
+      self.assertTrue(y.op.name.startswith("per_image_standardization"))
       y_tf = y.eval()
       self.assertAllClose(y_tf, y_np, atol=1e-4)
 
@@ -1375,9 +1371,10 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
 
     # Each line is a test configuration:
     #   (offset_height, offset_width, target_height, target_width), err_msg
-    test_config = (([-1, 0, 3, 3], "offset_height must be >= 0"),
-                   ([0, -1, 3, 3], "offset_width must be >= 0"),
-                   ([0, 0, 0, 3], "target_height must be > 0"),
+    test_config = (([-1, 0, 3, 3], "offset_height must be >= 0"), ([
+        0, -1, 3, 3
+    ], "offset_width must be >= 0"), ([0, 0, 0, 3],
+                                      "target_height must be > 0"),
                    ([0, 0, 3, 0], "target_width must be > 0"),
                    ([2, 0, 3, 3], "height must be >= target + offset"),
                    ([0, 2, 3, 3], "width must be >= target + offset"))
@@ -1388,7 +1385,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
     y = image_ops.crop_to_bounding_box(image, 0, 0, 55, 66)
-    self.assertTrue(y.name.startswith('crop_to_bounding_box'))
+    self.assertTrue(y.name.startswith("crop_to_bounding_box"))
 
 
 class CentralCropTest(test_util.TensorFlowTestCase):
@@ -1413,9 +1410,10 @@ class CentralCropTest(test_util.TensorFlowTestCase):
 
   def testCropping(self):
     x_shape = [4, 8, 1]
-    x_np = np.array([[1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8],
-                     [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8]],
-                    dtype=np.int32).reshape(x_shape)
+    x_np = np.array(
+        [[1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8],
+         [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8]],
+        dtype=np.int32).reshape(x_shape)
     y_np = np.array([[3, 4, 5, 6], [3, 4, 5, 6]]).reshape([2, 4, 1])
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
@@ -1432,7 +1430,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = array_ops.placeholder(shape=x_shape, dtype=dtypes.int32)
       y = image_ops.central_crop(x, 0.33)
-      y_tf = y.eval(feed_dict={x:x_np})
+      y_tf = y.eval(feed_dict={x: x_np})
       self.assertAllEqual(y_tf, y_np)
       self.assertAllEqual(y_tf.shape, y_np.shape)
 
@@ -1471,7 +1469,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     x_np = np.ones(x_shape, dtype=np.float32)
     with self.test_session(use_gpu=True):
       y = image_ops.central_crop(x_np, 1.0)
-      self.assertTrue(y.op.name.startswith('central_crop'))
+      self.assertTrue(y.op.name.startswith("central_crop"))
 
 
 class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
@@ -1544,15 +1542,10 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     self.assertEqual(y.get_shape().as_list(), post_shape)
 
   def testInt64(self):
-    x = [1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     x_shape = [3, 3, 1]
 
-    y = [0, 0, 0,
-         1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    y = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
     y_shape = [4, 3, 1]
     x = np.array(x).reshape(x_shape)
     y = np.array(y).reshape(y_shape)
@@ -1569,38 +1562,26 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     self._assertReturns(x, x_shape, offset_height, offset_width, x, x_shape)
 
   def testPadding(self):
-    x = [1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     x_shape = [3, 3, 1]
 
     offset_height, offset_width = [1, 0]
-    y = [0, 0, 0,
-         1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    y = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
     y_shape = [4, 3, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
     offset_height, offset_width = [0, 1]
-    y = [0, 1, 2, 3,
-         0, 4, 5, 6,
-         0, 7, 8, 9]
+    y = [0, 1, 2, 3, 0, 4, 5, 6, 0, 7, 8, 9]
     y_shape = [3, 4, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
     offset_height, offset_width = [0, 0]
-    y = [1, 2, 3,
-         4, 5, 6,
-         7, 8, 9,
-         0, 0, 0]
+    y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0]
     y_shape = [4, 3, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
     offset_height, offset_width = [0, 0]
-    y = [1, 2, 3, 0,
-         4, 5, 6, 0,
-         7, 8, 9, 0]
+    y = [1, 2, 3, 0, 4, 5, 6, 0, 7, 8, 9, 0]
     y_shape = [3, 4, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
@@ -1632,9 +1613,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     # Input image has 0-length dimension(s).
     # Each line is a test configuration:
     #   x_shape, target_height, target_width
-    test_config = (([0, 2, 2], 2, 2),
-                   ([2, 0, 2], 2, 2),
-                   ([2, 2, 0], 2, 2))
+    test_config = (([0, 2, 2], 2, 2), ([2, 0, 2], 2, 2), ([2, 2, 0], 2, 2))
     offset_height, offset_width = [0, 0]
     x = []
 
@@ -1679,7 +1658,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
     y = image_ops.pad_to_bounding_box(image, 0, 0, 55, 66)
-    self.assertTrue(y.op.name.startswith('pad_to_bounding_box'))
+    self.assertTrue(y.op.name.startswith("pad_to_bounding_box"))
 
 
 class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
@@ -1692,8 +1671,8 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
                               (bounding_box[2] - bounding_box[0]))
 
     image_size_np = np.array(image.shape, dtype=np.int32)
-    bounding_box_np = (np.array(
-        bounding_box, dtype=np.float32).reshape([1, 1, 4]))
+    bounding_box_np = (
+        np.array(bounding_box, dtype=np.float32).reshape([1, 1, 4]))
 
     aspect_ratios = []
     area_ratios = []
@@ -1738,7 +1717,9 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       y = array_ops.strided_slice(image_tf, begin, begin + size)
 
       for _ in xrange(num_iter):
-        y_tf = y.eval(feed_dict={min_object_covered_placeholder: min_object_covered})
+        y_tf = y.eval(feed_dict={
+            min_object_covered_placeholder: min_object_covered
+        })
         crop_height = y_tf.shape[0]
         crop_width = y_tf.shape[1]
         aspect_ratio = float(crop_width) / float(crop_height)
@@ -1832,7 +1813,8 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       bounding_box = constant_op.constant(
           [0.0, 0.0, 1.0, 1.0],
           shape=[4],
-          dtype=dtypes.float32,)
+          dtype=dtypes.float32,
+      )
       begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
           image_size=image_size,
           bounding_boxes=bounding_box,
@@ -1860,13 +1842,15 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
 
 class ResizeImagesTest(test_util.TensorFlowTestCase):
 
-  OPTIONS = [image_ops.ResizeMethod.BILINEAR,
-             image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-             image_ops.ResizeMethod.BICUBIC,
-             image_ops.ResizeMethod.AREA]
+  OPTIONS = [
+      image_ops.ResizeMethod.BILINEAR, image_ops.ResizeMethod.NEAREST_NEIGHBOR,
+      image_ops.ResizeMethod.BICUBIC, image_ops.ResizeMethod.AREA
+  ]
 
-  TYPES = [np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64,
-           np.float16, np.float32, np.float64]
+  TYPES = [
+      np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.float16,
+      np.float32, np.float64
+  ]
 
   def _assertShapeInference(self, pre_shape, size, post_shape):
     # Try single image resize
@@ -1894,12 +1878,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     single_shape = [6, 4, 1]
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
     target_height = 6
     target_width = 4
 
@@ -1930,12 +1912,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     single_shape = [6, 4, 1]
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
     new_size = array_ops.placeholder(dtypes.int32, shape=(2))
 
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
@@ -1989,8 +1969,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                                   image_ops.ResizeMethod.BILINEAR)
 
   def testReturnDtype(self):
-    target_shapes = [[6, 4], [3, 2], [array_ops.placeholder(dtypes.int32),
-                                      array_ops.placeholder(dtypes.int32)]]
+    target_shapes = [[6, 4], [3, 2], [
+        array_ops.placeholder(dtypes.int32),
+        array_ops.placeholder(dtypes.int32)
+    ]]
     for nptype in self.TYPES:
       image = array_ops.placeholder(nptype, shape=[1, 6, 4, 1])
       for opt in self.OPTIONS:
@@ -2007,12 +1989,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     img_shape = [1, 6, 4, 1]
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
     # Test size where width is specified as a tensor which is a sum
     # of two tensors.
     width_1 = constant_op.constant(1)
@@ -2034,15 +2014,11 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
   def testResizeDown(self):
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
-    expected_data = [127, 64,
-                     64, 127,
-                     50, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
+    expected_data = [127, 64, 64, 127, 50, 100]
     target_height = 3
     target_width = 2
 
@@ -2068,39 +2044,31 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeUpAlignCornersFalse(self):
     img_shape = [1, 3, 2, 1]
-    data = [64, 32,
-            32, 64,
-            50, 100]
+    data = [64, 32, 32, 64, 50, 100]
     target_height = 6
     target_width = 4
     expected_data = {}
     expected_data[image_ops.ResizeMethod.BILINEAR] = [
-        64.0, 48.0, 32.0, 32.0,
-        48.0, 48.0, 48.0, 48.0,
-        32.0, 48.0, 64.0, 64.0,
-        41.0, 61.5, 82.0, 82.0,
-        50.0, 75.0, 100.0, 100.0,
-        50.0, 75.0, 100.0, 100.0]
+        64.0, 48.0, 32.0, 32.0, 48.0, 48.0, 48.0, 48.0, 32.0, 48.0, 64.0, 64.0,
+        41.0, 61.5, 82.0, 82.0, 50.0, 75.0, 100.0, 100.0, 50.0, 75.0, 100.0,
+        100.0
+    ]
     expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [
-        64.0, 64.0, 32.0, 32.0,
-        64.0, 64.0, 32.0, 32.0,
-        32.0, 32.0, 64.0, 64.0,
-        32.0, 32.0, 64.0, 64.0,
-        50.0, 50.0, 100.0, 100.0,
-        50.0, 50.0, 100.0, 100.0]
+        64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0,
+        32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0,
+        100.0
+    ]
     expected_data[image_ops.ResizeMethod.AREA] = [
-        64.0, 64.0, 32.0, 32.0,
-        64.0, 64.0, 32.0, 32.0,
-        32.0, 32.0, 64.0, 64.0,
-        32.0, 32.0, 64.0, 64.0,
-        50.0, 50.0, 100.0, 100.0,
-        50.0, 50.0, 100.0, 100.0]
+        64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0,
+        32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0,
+        100.0
+    ]
 
     for nptype in self.TYPES:
       for opt in [
           image_ops.ResizeMethod.BILINEAR,
-          image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-          image_ops.ResizeMethod.AREA]:
+          image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA
+      ]:
         with self.test_session(use_gpu=True):
           img_np = np.array(data, dtype=nptype).reshape(img_shape)
           image = constant_op.constant(img_np, shape=img_shape)
@@ -2113,41 +2081,29 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeUpAlignCornersTrue(self):
     img_shape = [1, 3, 2, 1]
-    data = [6, 3,
-            3, 6,
-            6, 9]
+    data = [6, 3, 3, 6, 6, 9]
     target_height = 5
     target_width = 4
     expected_data = {}
     expected_data[image_ops.ResizeMethod.BILINEAR] = [
-        6.0, 5.0, 4.0, 3.0,
-        4.5, 4.5, 4.5, 4.5,
-        3.0, 4.0, 5.0, 6.0,
-        4.5, 5.5, 6.5, 7.5,
-        6.0, 7.0, 8.0, 9.0
+        6.0, 5.0, 4.0, 3.0, 4.5, 4.5, 4.5, 4.5, 3.0, 4.0, 5.0, 6.0, 4.5, 5.5,
+        6.5, 7.5, 6.0, 7.0, 8.0, 9.0
     ]
     expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [
-        6.0, 6.0, 3.0, 3.0,
-        3.0, 3.0, 6.0, 6.0,
-        3.0, 3.0, 6.0, 6.0,
-        6.0, 6.0, 9.0, 9.0,
-        6.0, 6.0, 9.0, 9.0
+        6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 6.0, 3.0, 3.0, 6.0, 6.0, 6.0, 6.0,
+        9.0, 9.0, 6.0, 6.0, 9.0, 9.0
     ]
     # TODO(b/37749740): Improve alignment of ResizeMethod.AREA when
     # align_corners=True.
     expected_data[image_ops.ResizeMethod.AREA] = [
-        6.0, 6.0, 6.0, 3.0,
-        6.0, 6.0, 6.0, 3.0,
-        3.0, 3.0, 3.0, 6.0,
-        3.0, 3.0, 3.0, 6.0,
-        6.0, 6.0, 6.0, 9.0
+        6.0, 6.0, 6.0, 3.0, 6.0, 6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 3.0, 3.0,
+        3.0, 6.0, 6.0, 6.0, 6.0, 9.0
     ]
 
     for nptype in self.TYPES:
       for opt in [
           image_ops.ResizeMethod.BILINEAR,
-          image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-          image_ops.ResizeMethod.AREA
+          image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA
       ]:
         with self.test_session(use_gpu=True):
           img_np = np.array(data, dtype=nptype).reshape(img_shape)
@@ -2161,23 +2117,21 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeUpBicubic(self):
     img_shape = [1, 6, 6, 1]
-    data = [128, 128, 64, 64, 128, 128, 64, 64,
-            64, 64, 128, 128, 64, 64, 128, 128,
-            50, 50, 100, 100, 50, 50, 100, 100,
-            50, 50, 100, 100, 50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        128, 128, 64, 64, 128, 128, 64, 64, 64, 64, 128, 128, 64, 64, 128, 128,
+        50, 50, 100, 100, 50, 50, 100, 100, 50, 50, 100, 100, 50, 50, 100, 100,
+        50, 50, 100, 100
+    ]
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
 
     target_height = 8
     target_width = 8
-    expected_data = [128, 135, 96, 55, 64, 114, 134, 128,
-                     78, 81, 68, 52, 57, 118, 144, 136,
-                     55, 49, 79, 109, 103, 89, 83, 84,
-                     74, 70, 95, 122, 115, 69, 49, 55,
-                     100, 105, 75, 43, 50, 89, 105, 100,
-                     57, 54, 74, 96, 91, 65, 55, 58,
-                     70, 69, 75, 81, 80, 72, 69, 70,
-                     105, 112, 75, 36, 45, 92, 111, 105]
+    expected_data = [
+        128, 135, 96, 55, 64, 114, 134, 128, 78, 81, 68, 52, 57, 118, 144, 136,
+        55, 49, 79, 109, 103, 89, 83, 84, 74, 70, 95, 122, 115, 69, 49, 55, 100,
+        105, 75, 43, 50, 89, 105, 100, 57, 54, 74, 96, 91, 65, 55, 58, 70, 69,
+        75, 81, 80, 72, 69, 70, 105, 112, 75, 36, 45, 92, 111, 105
+    ]
 
     with self.test_session(use_gpu=True):
       image = constant_op.constant(img_np, shape=img_shape)
@@ -2190,20 +2144,17 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeDownArea(self):
     img_shape = [1, 6, 6, 1]
-    data = [128, 64, 32, 16, 8, 4,
-            4, 8, 16, 32, 64, 128,
-            128, 64, 32, 16, 8, 4,
-            5, 10, 15, 20, 25, 30,
-            30, 25, 20, 15, 10, 5,
-            5, 10, 15, 20, 25, 30]
+    data = [
+        128, 64, 32, 16, 8, 4, 4, 8, 16, 32, 64, 128, 128, 64, 32, 16, 8, 4, 5,
+        10, 15, 20, 25, 30, 30, 25, 20, 15, 10, 5, 5, 10, 15, 20, 25, 30
+    ]
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
 
     target_height = 4
     target_width = 4
-    expected_data = [73, 33, 23, 39,
-                     73, 33, 23, 39,
-                     14, 16, 19, 21,
-                     14, 16, 19, 21]
+    expected_data = [
+        73, 33, 23, 39, 73, 33, 23, 39, 14, 16, 19, 21, 14, 16, 19, 21
+    ]
 
     with self.test_session(use_gpu=True):
       image = constant_op.constant(img_np, shape=img_shape)
@@ -2290,7 +2241,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
       y = image_ops.resize_images(single_image, [55, 66])
-      self.assertTrue(y.op.name.startswith('resize_images'))
+      self.assertTrue(y.op.name.startswith("resize_images"))
 
 
 class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
@@ -2363,133 +2314,93 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
   def testPad(self):
     # Pad even along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 1, 2, 3, 4, 0,
-         0, 5, 6, 7, 8, 0]
+    y = [0, 1, 2, 3, 4, 0, 0, 5, 6, 7, 8, 0]
     y_shape = [2, 6, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Pad odd along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 1, 2, 3, 4, 0, 0,
-         0, 5, 6, 7, 8, 0, 0]
+    y = [0, 1, 2, 3, 4, 0, 0, 0, 5, 6, 7, 8, 0, 0]
     y_shape = [2, 7, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Pad even along row.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 0, 0, 0,
-         1, 2, 3, 4,
-         5, 6, 7, 8,
-         0, 0, 0, 0]
+    y = [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0]
     y_shape = [4, 4, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Pad odd along row.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 0, 0, 0,
-         1, 2, 3, 4,
-         5, 6, 7, 8,
-         0, 0, 0, 0,
-         0, 0, 0, 0]
+    y = [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0]
     y_shape = [5, 4, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
   def testCrop(self):
     # Crop even along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [2, 3,
-         6, 7]
+    y = [2, 3, 6, 7]
     y_shape = [2, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop odd along col.
-    x = [1, 2, 3, 4,  5,  6,
-         7, 8, 9, 10, 11, 12]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
     x_shape = [2, 6, 1]
 
-    y = [2, 3, 4,
-         8, 9, 10]
+    y = [2, 3, 4, 8, 9, 10]
     y_shape = [2, 3, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop even along row.
-    x = [1, 2,
-         3, 4,
-         5, 6,
-         7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [4, 2, 1]
 
-    y = [3, 4,
-         5, 6]
+    y = [3, 4, 5, 6]
     y_shape = [2, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop odd along row.
-    x = [1,  2,
-         3,  4,
-         5,  6,
-         7,  8,
-         9,  10,
-         11, 12,
-         13, 14,
-         15, 16]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
     x_shape = [8, 2, 1]
 
-    y = [3,  4,
-         5,  6,
-         7,  8,
-         9,  10,
-         11, 12]
+    y = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
     y_shape = [5, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
   def testCropAndPad(self):
     # Pad along row but crop along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 0,
-         2, 3,
-         6, 7,
-         0, 0]
+    y = [0, 0, 2, 3, 6, 7, 0, 0]
     y_shape = [4, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop along row but pad along col.
-    x = [1, 2,
-         3, 4,
-         5, 6,
-         7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [4, 2, 1]
 
-    y = [0, 3, 4, 0,
-         0, 5, 6, 0]
+    y = [0, 3, 4, 0, 0, 5, 6, 0]
     y_shape = [2, 4, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
@@ -2570,7 +2481,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
     y = image_ops.resize_image_with_crop_or_pad(image, 55, 66)
-    self.assertTrue(y.op.name.startswith('resize_image_with_crop_or_pad'))
+    self.assertTrue(y.op.name.startswith("resize_image_with_crop_or_pad"))
 
 
 def _SimpleColorRamp():
@@ -2839,8 +2750,8 @@ class GifTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True) as sess:
       gif = io_ops.read_file(filename)
       image = image_ops.decode_gif(gif)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError, "can't process optimized gif"):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "can't process optimized gif"):
         gif, image = sess.run([gif, image])
 
   def testValid(self):
@@ -2902,8 +2813,9 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
       y = image_ops.convert_image_dtype(image, output_dtype)
       self.assertTrue(y.dtype == output_dtype)
       self.assertAllClose(y.eval(), y_np, atol=1e-5)
-      if output_dtype in [dtypes.float32, dtypes.float64,
-                          dtypes.int32, dtypes.int64]:
+      if output_dtype in [
+          dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64
+      ]:
         y_saturate = image_ops.convert_image_dtype(
             image, output_dtype, saturate=True)
         self.assertTrue(y_saturate.dtype == output_dtype)
@@ -2923,8 +2835,8 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       self._convert([0, 255], dtypes.uint8, dtypes.int16, [0, 255 * 128])
       self._convert([0, 32767], dtypes.int16, dtypes.uint8, [0, 255])
-      self._convert([0, 2 ** 32], dtypes.int64, dtypes.int32, [0, 1])
-      self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2 ** 32])
+      self._convert([0, 2**32], dtypes.int64, dtypes.int32, [0, 1])
+      self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2**32])
 
   def testConvertBetweenFloat(self):
     # Make sure converting to between float types does nothing interesting
@@ -2945,20 +2857,14 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
   def testConvertBetweenInt16AndInt8(self):
     with self.test_session(use_gpu=True):
       # uint8, uint16
-      self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8,
-                    [0, 255])
-      self._convert([0, 255], dtypes.uint8, dtypes.uint16,
-                    [0, 255 * 256])
+      self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8, [0, 255])
+      self._convert([0, 255], dtypes.uint8, dtypes.uint16, [0, 255 * 256])
       # int8, uint16
-      self._convert([0, 127 * 2 * 256], dtypes.uint16, dtypes.int8,
-                    [0, 127])
-      self._convert([0, 127], dtypes.int8, dtypes.uint16,
-                    [0, 127 * 2 * 256])
+      self._convert([0, 127 * 2 * 256], dtypes.uint16, dtypes.int8, [0, 127])
+      self._convert([0, 127], dtypes.int8, dtypes.uint16, [0, 127 * 2 * 256])
       # int16, uint16
-      self._convert([0, 255 * 256], dtypes.uint16, dtypes.int16,
-                    [0, 255 * 128])
-      self._convert([0, 255 * 128], dtypes.int16, dtypes.uint16,
-                    [0, 255 * 256])
+      self._convert([0, 255 * 256], dtypes.uint16, dtypes.int16, [0, 255 * 128])
+      self._convert([0, 255 * 128], dtypes.int16, dtypes.uint16, [0, 255 * 256])
 
 
 class TotalVariationTest(test_util.TensorFlowTestCase):
@@ -3091,20 +2997,17 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
     # The following are the sum of absolute differences between the pixels.
     # sum row dif = (4-1) + (7-2) = 3 + 5 = 8
     # sum col dif = (2-1) + (7-4) = 1 + 3 = 4
-    r = [[1, 2],
-         [4, 7]]
+    r = [[1, 2], [4, 7]]
 
     # Blue color channel.
     # sum row dif = 18 + 29 = 47
     # sum col dif = 7 + 18 = 25
-    g = [[11, 18],
-         [29, 47]]
+    g = [[11, 18], [29, 47]]
 
     # Green color channel.
     # sum row dif = 120 + 193 = 313
     # sum col dif = 47 + 120 = 167
-    b = [[73, 120],
-         [193, 313]]
+    b = [[73, 120], [193, 313]]
 
     # Combine the 3 color channels into a single 3-dim array.
     # The shape is (2, 2, 3) corresponding to (height, width and color).
@@ -3133,9 +3036,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
 
     # Combine these 3 images into a single array of shape (3, 2, 2, 3)
     # where the first dimension is for the image-number.
-    multi = np.vstack((a[np.newaxis, :],
-                       b[np.newaxis, :],
-                       c[np.newaxis, :]))
+    multi = np.vstack((a[np.newaxis, :], b[np.newaxis, :], c[np.newaxis, :]))
 
     # Check that TensorFlow correctly calculates the total variation
     # for each image individually and returns the correct array.
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 3cb71eba8c..53308484c4 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -173,8 +173,7 @@ def _SegmentMeanGrad(op, grad):
       array_ops.shape(op.inputs[1]),
       array_ops.fill(array_ops.expand_dims(input_rank - 1, 0), 1)
   ], 0)
-  ones = array_ops.fill(ones_shape,
-                        constant_op.constant(1, dtype=grad.dtype))
+  ones = array_ops.fill(ones_shape, constant_op.constant(1, dtype=grad.dtype))
   scaled_grad = math_ops.div(grad, math_ops.segment_sum(ones, op.inputs[1]))
   return array_ops.gather(scaled_grad, op.inputs[1]), None
 
@@ -230,16 +229,19 @@ def _SparseSegmentSqrtNWithNumSegmentsGrad(op, grad):
 
 
 def _SegmentMinOrMaxGrad(op, grad, is_sorted):
-  """Gradient for SegmentMin and (unsorted) SegmentMax. They share similar code."""
-  zeros = array_ops.zeros(array_ops.shape(op.inputs[0]),
-                          dtype=op.inputs[0].dtype)
+  """Gradient for SegmentMin and (unsorted) SegmentMax.
+
+  They share similar code.
+  """
+  zeros = array_ops.zeros(
+      array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype)
 
   # Get the number of selected (minimum or maximum) elements in each segment.
   gathered_outputs = array_ops.gather(op.outputs[0], op.inputs[1])
   is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
   if is_sorted:
-    num_selected = math_ops.segment_sum(math_ops.cast(is_selected, grad.dtype),
-                                        op.inputs[1])
+    num_selected = math_ops.segment_sum(
+        math_ops.cast(is_selected, grad.dtype), op.inputs[1])
   else:
     num_selected = math_ops.unsorted_segment_sum(
         math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
@@ -536,8 +538,8 @@ def _IgammaGrad(op, grad):
   # and Gamma'(a) can grow large.
   partial_x = math_ops.exp(-x + (a - 1) * math_ops.log(x) - math_ops.lgamma(a))
   # TODO(b/36815900): Mark None return values as NotImplemented
-  return (None,
-          array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
+  return (None, array_ops.reshape(
+      math_ops.reduce_sum(partial_x * grad, rx), sx))
 
 
 @ops.RegisterGradient("Igammac")
@@ -563,15 +565,17 @@ def _BetaincGrad(op, grad):
 
   # Perform operations in log space before summing, because terms
   # can grow large.
-  log_beta = (gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b)
-              - gen_math_ops.lgamma(a + b))
-  partial_x = math_ops.exp(
-      (b - 1) * math_ops.log(1 - x) + (a - 1) * math_ops.log(x) - log_beta)
+  log_beta = (
+      gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b) -
+      gen_math_ops.lgamma(a + b))
+  partial_x = math_ops.exp((b - 1) * math_ops.log(1 - x) +
+                           (a - 1) * math_ops.log(x) - log_beta)
 
   # TODO(b/36815900): Mark None return values as NotImplemented
-  return (None,  # da
-          None,  # db
-          array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
+  return (
+      None,  # da
+      None,  # db
+      array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
 
 
 @ops.RegisterGradient("Zeta")
@@ -735,10 +739,8 @@ def _ShapesFullySpecifiedAndEqual(x, y, grad):
   y_shape = y._shape_tuple()
   grad_shape = grad._shape_tuple()
   # pylint: enable=protected-access
-  return (x_shape == y_shape and
-          x_shape == grad_shape and
-          x_shape is not None and
-          None not in x_shape)
+  return (x_shape == y_shape and x_shape == grad_shape and
+          x_shape is not None and None not in x_shape)
 
 
 @ops.RegisterGradient("Add")
@@ -856,10 +858,10 @@ def _RealDivGrad(op, grad):
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   return (array_ops.reshape(
-      math_ops.reduce_sum(math_ops.realdiv(grad, y), rx),
-      sx), array_ops.reshape(
-          math_ops.reduce_sum(grad * math_ops.realdiv(math_ops.realdiv(-x, y), y),
-                              ry), sy))
+      math_ops.reduce_sum(math_ops.realdiv(grad, y), rx), sx),
+          array_ops.reshape(
+              math_ops.reduce_sum(
+                  grad * math_ops.realdiv(math_ops.realdiv(-x, y), y), ry), sy))
 
 
 @ops.RegisterGradient("Pow")
@@ -954,8 +956,8 @@ def _SelectGrad(op, grad):
   c = op.inputs[0]
   x = op.inputs[1]
   zeros = array_ops.zeros_like(x)
-  return (None, array_ops.where(c, grad, zeros),
-          array_ops.where(c, zeros, grad))
+  return (None, array_ops.where(c, grad, zeros), array_ops.where(
+      c, zeros, grad))
 
 
 @ops.RegisterGradient("MatMul")
@@ -1017,21 +1019,20 @@ def _SparseMatMulGrad(op, grad):
   dtype_a = op.inputs[0].dtype
   dtype_b = op.inputs[1].dtype
   if not t_a and not t_b:
-    return (_SparseMatMul(
-        grad, op.inputs[1], dtype_a, transpose_b=True), _SparseMatMul(
-            op.inputs[0], grad, dtype_b, transpose_a=True))
+    return (_SparseMatMul(grad, op.inputs[1], dtype_a, transpose_b=True),
+            _SparseMatMul(op.inputs[0], grad, dtype_b, transpose_a=True))
   elif not t_a and t_b:
-    return (_SparseMatMul(grad, op.inputs[1], dtype_a), _SparseMatMul(
-        grad, op.inputs[0], dtype_b, transpose_a=True))
+    return (_SparseMatMul(grad, op.inputs[1], dtype_a),
+            _SparseMatMul(grad, op.inputs[0], dtype_b, transpose_a=True))
   elif t_a and not t_b:
-    return (_SparseMatMul(
-        op.inputs[1], grad, dtype_a, transpose_b=True),
+    return (_SparseMatMul(op.inputs[1], grad, dtype_a, transpose_b=True),
             _SparseMatMul(op.inputs[0], grad, dtype_b))
   elif t_a and t_b:
     return (_SparseMatMul(
-        op.inputs[1], grad, dtype_a, transpose_a=True,
-        transpose_b=True), _SparseMatMul(
-            grad, op.inputs[0], dtype_b, transpose_a=True, transpose_b=True))
+        op.inputs[1], grad, dtype_a, transpose_a=True, transpose_b=True),
+            _SparseMatMul(
+                grad, op.inputs[0], dtype_b, transpose_a=True,
+                transpose_b=True))
 
 
 @ops.RegisterGradient("Floor")
@@ -1135,8 +1136,8 @@ def _ComplexAbsGrad(op, grad):
   """Returns the gradient of ComplexAbs."""
   # TODO(b/27786104): The cast to complex could be removed once arithmetic
   # supports mixtures of complex64 and real values.
-  return (math_ops.complex(grad, array_ops.zeros_like(grad)) *
-          math_ops.sign(op.inputs[0]))
+  return (math_ops.complex(grad, array_ops.zeros_like(grad)) * math_ops.sign(
+      op.inputs[0]))
 
 
 @ops.RegisterGradient("Cast")
@@ -1166,8 +1167,8 @@ def _CumsumGrad(op, grad):
   exclusive = op.get_attr("exclusive")
   reverse = op.get_attr("reverse")
   return [
-      math_ops.cumsum(
-          grad, axis, exclusive=exclusive, reverse=not reverse), None
+      math_ops.cumsum(grad, axis, exclusive=exclusive, reverse=not reverse),
+      None
   ]
 
 
diff --git a/tensorflow/python/ops/matmul_benchmark.py b/tensorflow/python/ops/matmul_benchmark.py
index f95cf08de1..6e5fe74290 100644
--- a/tensorflow/python/ops/matmul_benchmark.py
+++ b/tensorflow/python/ops/matmul_benchmark.py
@@ -95,8 +95,8 @@ class MatmulBenchmark(test.Benchmark):
         num_items = n * m * k * 2
         throughput = num_items * num_iters / duration / 1e9
         print('%s %s input_info:%s %d %.4fsec, %.4fGitems/s.' %
-              (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) + ',ta:'
-               + str(transpose_a) + '.tb:' + str(transpose_b), num_iters,
+              (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) +
+               ',ta:' + str(transpose_a) + '.tb:' + str(transpose_b), num_iters,
                duration, throughput))
 
     name_template = ('matmul_{device}_{dtype}_input_info_{inputinfo}')
@@ -112,7 +112,8 @@ class MatmulBenchmark(test.Benchmark):
     return duration
 
   def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters):
-    self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b, num_iters, dtype)
+    self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b,
+                   num_iters, dtype)
 
   def test_round(self, num_iters):
     dtypes = [np.float32, np.float64]
@@ -124,8 +125,8 @@ class MatmulBenchmark(test.Benchmark):
         self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)
 
       for n, m, k, (transpose_a, transpose_b) in itertools.product(
-          [200], [1, 8, 20], [10000], [(False, False), (True, False), (False,
-                                                                       True)]):
+          [200], [1, 8, 20], [10000], [(False, False), (True, False),
+                                       (False, True)]):
         self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)
 
       for (n, m, k), (transpose_a, transpose_b) in itertools.product(
diff --git a/tensorflow/python/ops/matmul_benchmark_test.py b/tensorflow/python/ops/matmul_benchmark_test.py
index 5a9c0a7a49..3df0c66ef9 100644
--- a/tensorflow/python/ops/matmul_benchmark_test.py
+++ b/tensorflow/python/ops/matmul_benchmark_test.py
@@ -33,11 +33,11 @@ def BuildGraphTest(n, m, k, transpose_a, transpose_b, dtype):
 
   def Test(self):
     if not googletest.is_gpu_available():
-      tf_logging.info("Skipping BuildGraphTest %s", (n, m, k, transpose_a,
-                                                     transpose_b))
+      tf_logging.info("Skipping BuildGraphTest %s",
+                      (n, m, k, transpose_a, transpose_b))
       return
-    tf_logging.info("Testing BuildGraphTest %s", (n, m, k, transpose_a,
-                                                  transpose_b))
+    tf_logging.info("Testing BuildGraphTest %s",
+                    (n, m, k, transpose_a, transpose_b))
     self._VerifyBuildGraph(n, m, k, transpose_a, transpose_b, dtype)
 
   return Test
@@ -47,11 +47,11 @@ def RunGraphTest(n, m, k, transpose_a, transpose_b, dtype):
 
   def Test(self):
     if not googletest.is_gpu_available():
-      tf_logging.info("Skipping RunGraphTest %s", (n, m, k, transpose_a,
-                                                   transpose_b))
+      tf_logging.info("Skipping RunGraphTest %s",
+                      (n, m, k, transpose_a, transpose_b))
       return
-    tf_logging.info("Testing RunGraphTest %s", (n, m, k, transpose_a,
-                                                transpose_b))
+    tf_logging.info("Testing RunGraphTest %s",
+                    (n, m, k, transpose_a, transpose_b))
     self._VerifyRunGraph(n, m, k, transpose_a, transpose_b, dtype)
 
   return Test
@@ -71,40 +71,41 @@ class MatmulBenchmarkTest(googletest.TestCase):
   def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype):
     graph = ops.Graph()
     with graph.as_default():
-      matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b,
-                                   dtype)
+      matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k,
+                                   transpose_a, transpose_b, dtype)
       gd = graph.as_graph_def()
-      dev=googletest.gpu_device_name()
+      dev = googletest.gpu_device_name()
       proto_expected = """
-      node { name: "random_uniform/shape" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/min" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/max" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \""""+ dev +"""\" }
-      node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \""""+ dev +"""\" }
-      node { name: "Variable" op: "VariableV2" device: \""""+ dev +"""\" }
-      node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \""""+ dev +"""\" }
-      node { name: "Variable/read" op: "Identity" input: "Variable" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/shape" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/min" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/max" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \""""+ dev +"""\" }
-      node { name: "Variable_1" op: "VariableV2" device: \""""+ dev +"""\" }
-      node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \""""+ dev +"""\" }
-      node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \""""+ dev +"""\" }
-      node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \""""+ dev +"""\" }
-      node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \""""+ dev +"""\" }
+      node { name: "random_uniform/shape" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform/min" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform/max" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \"""" + dev + """\" }
+      node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \"""" + dev + """\" }
+      node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \"""" + dev + """\" }
+      node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \"""" + dev + """\" }
+      node { name: "Variable" op: "VariableV2" device: \"""" + dev + """\" }
+      node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \"""" + dev + """\" }
+      node { name: "Variable/read" op: "Identity" input: "Variable" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/shape" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/min" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/max" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \"""" + dev + """\" }
+      node { name: "Variable_1" op: "VariableV2" device: \"""" + dev + """\" }
+      node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \"""" + dev + """\" }
+      node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \"""" + dev + """\" }
+      node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \"""" + dev + """\" }
+      node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \"""" + dev + """\" }
                        """
       self.assertProtoEquals(str(proto_expected), self._StripGraph(gd))
 
   def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype):
     benchmark_instance = matmul_benchmark.MatmulBenchmark()
-    duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, k, transpose_a,
-                                            transpose_b, 1, dtype)
+    duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m,
+                                            k, transpose_a, transpose_b, 1,
+                                            dtype)
     self.assertTrue(duration > 1e-6)
 
 
@@ -113,8 +114,8 @@ if __name__ == "__main__":
   index = 0
   for _dtype in dtypes:
     for _n, _m, (_transpose_a, _transpose_b) in itertools.product(
-        [512, 1024], [1, 8, 16, 128], [(False, False), (True, False), (False,
-                                                                       True)]):
+        [512, 1024], [1, 8, 16, 128], [(False, False), (True, False),
+                                       (False, True)]):
       _k = _n
       setattr(MatmulBenchmarkTest, "testBuildGraph_" + str(index),
               BuildGraphTest(_n, _m, _k, _transpose_a, _transpose_b, _dtype))
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 0593ed2cfa..a08b836025 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -278,7 +278,8 @@ class BatchNormalizationTest(test.TestCase):
         epsilon = y.op.get_attr('epsilon')
         data_format = y.op.get_attr('data_format')
         grad_vals = sess.run([grad_x, grad_scale, grad_offset])
-        grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format)
+        grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean,
+                                               pop_var, epsilon, data_format)
         grad_internal_vals = sess.run(list(grad_internal))
         for grad_val, grad_internal_val in zip(grad_vals, grad_internal_vals):
           self.assertAllClose(grad_val, grad_internal_val, atol=err_tolerance)
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 32b14f86b5..9c875b4bcb 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -41,15 +41,19 @@ from tensorflow.python.ops.gen_nn_ops import *
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
-
 # Aliases for some automatically-generated names.
 local_response_normalization = gen_nn_ops.lrn
 
 # pylint: disable=protected-access
 
 
-def _non_atrous_convolution(input, filter, padding, data_format=None,  # pylint: disable=redefined-builtin
-                            strides=None, name=None):
+def _non_atrous_convolution(
+    input,
+    filter,
+    padding,
+    data_format=None,  # pylint: disable=redefined-builtin
+    strides=None,
+    name=None):
   """Computes sums of N-D convolutions (actually cross correlation).
 
   It is required that 1 <= N <= 3.
@@ -94,12 +98,13 @@ def _non_atrous_convolution(input, filter, padding, data_format=None,  # pylint:
     input_shape = input.get_shape()
     filter = ops.convert_to_tensor(filter, name="filter")
     filter_shape = filter.get_shape()
-    op = _NonAtrousConvolution(input_shape,
-                               filter_shape=filter_shape,
-                               padding=padding,
-                               data_format=data_format,
-                               strides=strides,
-                               name=scope)
+    op = _NonAtrousConvolution(
+        input_shape,
+        filter_shape=filter_shape,
+        padding=padding,
+        data_format=data_format,
+        strides=strides,
+        name=scope)
     return op(input, filter)
 
 
@@ -119,11 +124,14 @@ class _NonAtrousConvolution(object):
     name: see _non_atrous_convolution.
   """
 
-  def __init__(self,
-               input_shape,
-               filter_shape,  # pylint: disable=redefined-builtin
-               padding, data_format=None,
-               strides=None, name=None):
+  def __init__(
+      self,
+      input_shape,
+      filter_shape,  # pylint: disable=redefined-builtin
+      padding,
+      data_format=None,
+      strides=None,
+      name=None):
     filter_shape = filter_shape.with_rank(input_shape.ndims)
     self.padding = padding
     self.name = name
@@ -137,8 +145,8 @@ class _NonAtrousConvolution(object):
     if strides is None:
       strides = [1] * conv_dims
     elif len(strides) != conv_dims:
-      raise ValueError("len(strides)=%d, but should be %d" %
-                       (len(strides), conv_dims))
+      raise ValueError("len(strides)=%d, but should be %d" % (len(strides),
+                                                              conv_dims))
     if conv_dims == 1:
       # conv1d uses the 2-d data format names
       if data_format is None or data_format == "NWC":
@@ -177,8 +185,14 @@ class _NonAtrousConvolution(object):
   # those for gen_nn_ops.conv2d and gen_nn_ops.conv3d.
   # pylint: disable=redefined-builtin
   def _conv1d(self, input, filter, strides, padding, data_format, name):
-    return conv1d(value=input, filters=filter, stride=strides, padding=padding,
-                  data_format=data_format, name=name)
+    return conv1d(
+        value=input,
+        filters=filter,
+        stride=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
   # pylint: enable=redefined-builtin
 
   def __call__(self, inp, filter):  # pylint: disable=redefined-builtin
@@ -340,13 +354,14 @@ def with_space_to_batch(
   def build_op(num_spatial_dims, padding):
     return lambda inp, _: op(inp, num_spatial_dims, padding)
 
-  new_op = _WithSpaceToBatch(input_shape,
-                             dilation_rate,
-                             padding,
-                             build_op,
-                             filter_shape=filter_shape,
-                             spatial_dims=spatial_dims,
-                             data_format=data_format)
+  new_op = _WithSpaceToBatch(
+      input_shape,
+      dilation_rate,
+      padding,
+      build_op,
+      filter_shape=filter_shape,
+      spatial_dims=spatial_dims,
+      data_format=data_format)
   return new_op(input, None)
 
 
@@ -377,9 +392,8 @@ class _WithSpaceToBatch(object):
                spatial_dims=None,
                data_format=None):
     """Helper class for _with_space_to_batch."""
-    dilation_rate = ops.convert_to_tensor(dilation_rate,
-                                          dtypes.int32,
-                                          name="dilation_rate")
+    dilation_rate = ops.convert_to_tensor(
+        dilation_rate, dtypes.int32, name="dilation_rate")
     try:
       rate_shape = dilation_rate.get_shape().with_rank(1)
     except ValueError:
@@ -439,9 +453,7 @@ class _WithSpaceToBatch(object):
       if const_filter_shape is not None:
         filter_shape = const_filter_shape
         self.base_paddings = _with_space_to_batch_base_paddings(
-            const_filter_shape,
-            num_spatial_dims,
-            rate_or_const_rate)
+            const_filter_shape, num_spatial_dims, rate_or_const_rate)
       else:
         self.num_spatial_dims = num_spatial_dims
         self.rate_or_const_rate = rate_or_const_rate
@@ -478,9 +490,7 @@ class _WithSpaceToBatch(object):
       # shape was not fully defined.
       filter_shape = array_ops.shape(filter)
       base_paddings = _with_space_to_batch_base_paddings(
-          filter_shape,
-          self.num_spatial_dims,
-          self.rate_or_const_rate)
+          filter_shape, self.num_spatial_dims, self.rate_or_const_rate)
     paddings, crops = array_ops.required_space_to_batch_paddings(
         input_shape=input_spatial_shape,
         base_paddings=base_paddings,
@@ -491,9 +501,7 @@ class _WithSpaceToBatch(object):
     paddings = _with_space_to_batch_adjust(paddings, 0, spatial_dims)
     crops = _with_space_to_batch_adjust(crops, 0, spatial_dims)
     input_converted = array_ops.space_to_batch_nd(
-        input=inp,
-        block_shape=dilation_rate,
-        paddings=paddings)
+        input=inp, block_shape=dilation_rate, paddings=paddings)
 
     result = self.op(input_converted, filter)
 
@@ -519,17 +527,17 @@ def _with_space_to_batch_base_paddings(filter_shape, num_spatial_dims,
   # Spatial dimensions of the filters and the upsampled filters in which we
   # introduce (rate - 1) zeros between consecutive filter values.
   filter_spatial_shape = filter_shape[:num_spatial_dims]
-  dilated_filter_spatial_shape = (filter_spatial_shape +
-                                  (filter_spatial_shape - 1) *
-                                  (rate_or_const_rate - 1))
+  dilated_filter_spatial_shape = (
+      filter_spatial_shape + (filter_spatial_shape - 1) *
+      (rate_or_const_rate - 1))
   pad_extra_shape = dilated_filter_spatial_shape - 1
 
   # When full_padding_shape is odd, we pad more at end, following the same
   # convention as conv2d.
   pad_extra_start = pad_extra_shape // 2
   pad_extra_end = pad_extra_shape - pad_extra_start
-  base_paddings = array_ops.stack([[pad_extra_start[i], pad_extra_end[i]]
-                                   for i in range(num_spatial_dims)])
+  base_paddings = array_ops.stack(
+      [[pad_extra_start[i], pad_extra_end[i]] for i in range(num_spatial_dims)])
   return base_paddings
 
 
@@ -623,8 +631,8 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate):
   if strides is None:
     strides = [1] * num_spatial_dims
   elif len(strides) != num_spatial_dims:
-    raise ValueError("len(strides)=%d but should be %d" %
-                     (len(strides), num_spatial_dims))
+    raise ValueError("len(strides)=%d but should be %d" % (len(strides),
+                                                           num_spatial_dims))
   strides = np.array(strides, dtype=np.int32)
   if np.any(strides < 1):
     raise ValueError("all values of strides must be positive")
@@ -636,9 +644,14 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate):
 
 
 @tf_export("nn.convolution")
-def convolution(input, filter,  # pylint: disable=redefined-builtin
-                padding, strides=None, dilation_rate=None,
-                name=None, data_format=None):
+def convolution(
+    input,
+    filter,  # pylint: disable=redefined-builtin
+    padding,
+    strides=None,
+    dilation_rate=None,
+    name=None,
+    data_format=None):
   # pylint: disable=line-too-long
   """Computes sums of N-D convolutions (actually cross-correlation).
 
@@ -757,12 +770,14 @@ def convolution(input, filter,  # pylint: disable=redefined-builtin
     input_shape = input.get_shape()
     filter = ops.convert_to_tensor(filter, name="filter")
     filter_shape = filter.get_shape()
-    op = Convolution(input_shape,
-                     filter_shape,
-                     padding,
-                     strides=strides,
-                     dilation_rate=dilation_rate,
-                     name=name, data_format=data_format)
+    op = Convolution(
+        input_shape,
+        filter_shape,
+        padding,
+        strides=strides,
+        dilation_rate=dilation_rate,
+        name=name,
+        data_format=data_format)
     return op(input, filter)
 
 
@@ -786,8 +801,11 @@ class Convolution(object):
   def __init__(self,
                input_shape,
                filter_shape,
-               padding, strides=None, dilation_rate=None,
-               name=None, data_format=None):
+               padding,
+               strides=None,
+               dilation_rate=None,
+               name=None,
+               data_format=None):
     """Helper function for convolution."""
     num_total_dims = filter_shape.ndims
     if num_total_dims is None:
@@ -809,17 +827,17 @@ class Convolution(object):
 
     if data_format is None or not data_format.startswith("NC"):
       input_channels_dim = input_shape[num_spatial_dims + 1]
-      spatial_dims = range(1, num_spatial_dims+1)
+      spatial_dims = range(1, num_spatial_dims + 1)
     else:
       input_channels_dim = input_shape[1]
-      spatial_dims = range(2, num_spatial_dims+2)
+      spatial_dims = range(2, num_spatial_dims + 2)
 
-    if not input_channels_dim.is_compatible_with(filter_shape[
-        num_spatial_dims]):
+    if not input_channels_dim.is_compatible_with(
+        filter_shape[num_spatial_dims]):
       raise ValueError(
           "number of input channels does not match corresponding dimension of "
-          "filter, {} != {}".format(input_channels_dim, filter_shape[
-              num_spatial_dims]))
+          "filter, {} != {}".format(input_channels_dim,
+                                    filter_shape[num_spatial_dims]))
 
     strides, dilation_rate = _get_strides_and_dilation_rate(
         num_spatial_dims, strides, dilation_rate)
@@ -852,14 +870,15 @@ class Convolution(object):
 
 
 @tf_export("nn.pool")
-def pool(input,  # pylint: disable=redefined-builtin
-         window_shape,
-         pooling_type,
-         padding,
-         dilation_rate=None,
-         strides=None,
-         name=None,
-         data_format=None):
+def pool(
+    input,  # pylint: disable=redefined-builtin
+    window_shape,
+    pooling_type,
+    padding,
+    dilation_rate=None,
+    strides=None,
+    name=None,
+    data_format=None):
   # pylint: disable=line-too-long
   """Performs an N-D pooling operation.
 
@@ -941,8 +960,8 @@ def pool(input,  # pylint: disable=redefined-builtin
 
   """
   # pylint: enable=line-too-long
-  with ops.name_scope(name, "%s_pool" %
-                      (pooling_type.lower()), [input]) as scope:
+  with ops.name_scope(name, "%s_pool" % (pooling_type.lower()),
+                      [input]) as scope:
     input = ops.convert_to_tensor(input, name="input")
 
     num_spatial_dims = len(window_shape)
@@ -963,17 +982,18 @@ def pool(input,  # pylint: disable=redefined-builtin
           "strides > window_shape not supported due to inconsistency between "
           "CPU and GPU implementations")
 
-    pooling_ops = {("MAX", 1): max_pool,
-                   ("MAX", 2): max_pool,
-                   ("MAX", 3): max_pool3d,  # pylint: disable=undefined-variable
-                   ("AVG", 1): avg_pool,
-                   ("AVG", 2): avg_pool,
-                   ("AVG", 3): avg_pool3d,  # pylint: disable=undefined-variable
-                  }
+    pooling_ops = {
+        ("MAX", 1): max_pool,
+        ("MAX", 2): max_pool,
+        ("MAX", 3): max_pool3d,  # pylint: disable=undefined-variable
+        ("AVG", 1): avg_pool,
+        ("AVG", 2): avg_pool,
+        ("AVG", 3): avg_pool3d,  # pylint: disable=undefined-variable
+    }
     op_key = (pooling_type, num_spatial_dims)
     if op_key not in pooling_ops:
-      raise ValueError("%d-D %s pooling is not supported." %
-                       (op_key[1], op_key[0]))
+      raise ValueError("%d-D %s pooling is not supported." % (op_key[1],
+                                                              op_key[0]))
 
     if data_format is None or not data_format.startswith("NC"):
       adjusted_window_shape = [1] + list(window_shape) + [1]
@@ -1000,12 +1020,13 @@ def pool(input,  # pylint: disable=redefined-builtin
       if num_spatial_dims == 1:
         converted_input = array_ops.expand_dims(converted_input,
                                                 spatial_dims[0])
-      result = pooling_ops[op_key](converted_input,
-                                   adjusted_window_shape,
-                                   adjusted_strides,
-                                   converted_padding,
-                                   name=scope,
-                                   **data_format_kwargs)
+      result = pooling_ops[op_key](
+          converted_input,
+          adjusted_window_shape,
+          adjusted_strides,
+          converted_padding,
+          name=scope,
+          **data_format_kwargs)
       if num_spatial_dims == 1:
         result = array_ops.squeeze(result, [spatial_dims[0]])
       return result
@@ -1021,7 +1042,9 @@ def pool(input,  # pylint: disable=redefined-builtin
 
 @tf_export("nn.atrous_conv2d")
 def atrous_conv2d(value, filters, rate, padding, name=None):
-  """Atrous convolution (a.k.a. convolution with holes or dilated convolution).
+  """Atrous convolution (a.k.a.
+
+  convolution with holes or dilated convolution).
 
   This function is a simpler wrapper around the more general
   @{tf.nn.convolution}, and exists only for backwards compatibility. You can
@@ -1065,7 +1088,8 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
   that effectively use atrous convolution in different ways are, among others,
   [OverFeat: Integrated Recognition, Localization and Detection using
   Convolutional Networks](http://arxiv.org/abs/1312.6229) and [Fast Image
-  Scanning with Deep Max-Pooling Convolutional Neural Networks](http://arxiv.org/abs/1302.1700).
+  Scanning with Deep Max-Pooling Convolutional Neural
+  Networks](http://arxiv.org/abs/1302.1700).
   Atrous convolution is also closely related to the so-called noble identities
   in multi-rate signal processing.
 
@@ -1156,13 +1180,14 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
 
 
 @tf_export("nn.conv2d_transpose")
-def conv2d_transpose(value,
-                     filter,  # pylint: disable=redefined-builtin
-                     output_shape,
-                     strides,
-                     padding="SAME",
-                     data_format="NHWC",
-                     name=None):
+def conv2d_transpose(
+    value,
+    filter,  # pylint: disable=redefined-builtin
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NHWC",
+    name=None):
   """The transpose of `conv2d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
@@ -1207,15 +1232,16 @@ def conv2d_transpose(value,
 
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)):
-      raise ValueError("output_shape must have shape (4,), got {}"
-                       .format(output_shape_.get_shape()))
+      raise ValueError("output_shape must have shape (4,), got {}".format(
+          output_shape_.get_shape()))
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [4] if reached this point.
       if not filter.get_shape()[2].is_compatible_with(output_shape[axis]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis], filter.get_shape()[2]))
+            "{} != {}".format(output_shape[axis],
+                              filter.get_shape()[2]))
 
     if padding != "VALID" and padding != "SAME":
       raise ValueError("padding must be either VALID or SAME:"
@@ -1281,29 +1307,32 @@ def atrous_conv2d_transpose(value,
     if not value.get_shape()[3].is_compatible_with(filters.get_shape()[3]):
       raise ValueError(
           "value's input channels does not match filters' input channels, "
-          "{} != {}".format(value.get_shape()[3], filters.get_shape()[3]))
+          "{} != {}".format(value.get_shape()[3],
+                            filters.get_shape()[3]))
     if rate < 1:
       raise ValueError("rate {} cannot be less than one".format(rate))
 
     if rate == 1:
-      return conv2d_transpose(value,
-                              filters,
-                              output_shape,
-                              strides=[1, 1, 1, 1],
-                              padding=padding,
-                              data_format="NHWC")
+      return conv2d_transpose(
+          value,
+          filters,
+          output_shape,
+          strides=[1, 1, 1, 1],
+          padding=padding,
+          data_format="NHWC")
 
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)):
-      raise ValueError("output_shape must have shape (4,), got {}"
-                       .format(output_shape_.get_shape()))
+      raise ValueError("output_shape must have shape (4,), got {}".format(
+          output_shape_.get_shape()))
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [4] if reached this point.
       if not filters.get_shape()[2].is_compatible_with(output_shape[3]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[3], filters.get_shape()[2]))
+            "{} != {}".format(output_shape[3],
+                              filters.get_shape()[2]))
 
     # We have two padding contributions. The first is used for converting "SAME"
     # to "VALID". The second is required so that the height and width of the
@@ -1352,14 +1381,13 @@ def atrous_conv2d_transpose(value,
     # component.
     space_to_batch_pad = [[0, pad_bottom_extra], [0, pad_right_extra]]
 
-    value = array_ops.space_to_batch(input=value,
-                                     paddings=space_to_batch_pad,
-                                     block_size=rate)
+    value = array_ops.space_to_batch(
+        input=value, paddings=space_to_batch_pad, block_size=rate)
 
-    input_sizes = [rate * rate * output_shape[0],
-                   (in_height + pad_bottom_extra) // rate,
-                   (in_width + pad_right_extra) // rate,
-                   output_shape[3]]
+    input_sizes = [
+        rate * rate * output_shape[0], (in_height + pad_bottom_extra) // rate,
+        (in_width + pad_right_extra) // rate, output_shape[3]
+    ]
 
     value = gen_nn_ops.conv2d_backprop_input(
         input_sizes=input_sizes,
@@ -1373,19 +1401,19 @@ def atrous_conv2d_transpose(value,
     batch_to_space_crop = [[pad_top, pad_bottom + pad_bottom_extra],
                            [pad_left, pad_right + pad_right_extra]]
 
-    return array_ops.batch_to_space(input=value,
-                                    crops=batch_to_space_crop,
-                                    block_size=rate)
+    return array_ops.batch_to_space(
+        input=value, crops=batch_to_space_crop, block_size=rate)
 
 
 @tf_export("nn.conv3d_transpose")
-def conv3d_transpose(value,
-                     filter,  # pylint: disable=redefined-builtin
-                     output_shape,
-                     strides,
-                     padding="SAME",
-                     data_format="NDHWC",
-                     name=None):
+def conv3d_transpose(
+    value,
+    filter,  # pylint: disable=redefined-builtin
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NDHWC",
+    name=None):
   """The transpose of `conv3d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
@@ -1428,27 +1456,29 @@ def conv3d_transpose(value,
 
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(5)):
-      raise ValueError("output_shape must have shape (5,), got {}"
-                       .format(output_shape_.get_shape()))
+      raise ValueError("output_shape must have shape (5,), got {}".format(
+          output_shape_.get_shape()))
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [5] if reached this point.
       if not filter.get_shape()[3].is_compatible_with(output_shape[4]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[4], filter.get_shape()[3]))
+            "{} != {}".format(output_shape[4],
+                              filter.get_shape()[3]))
 
     if padding != "VALID" and padding != "SAME":
       raise ValueError("padding must be either VALID or SAME:"
                        " {}".format(padding))
 
-    return gen_nn_ops.conv3d_backprop_input_v2(input_sizes=output_shape_,
-                                               filter=filter,
-                                               out_backprop=value,
-                                               strides=strides,
-                                               padding=padding,
-                                               data_format=data_format,
-                                               name=name)
+    return gen_nn_ops.conv3d_backprop_input_v2(
+        input_sizes=output_shape_,
+        filter=filter,
+        out_backprop=value,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 
 
 # pylint: disable=protected-access
@@ -1514,7 +1544,9 @@ def crelu(features, name=None, axis=-1):
   Concatenates a ReLU which selects only the positive part of the activation
   with a ReLU which selects only the *negative* part of the activation.
   Note that as a result this non-linearity doubles the depth of the activations.
-  Source: [Understanding and Improving Convolutional Neural Networks via Concatenated Rectified Linear Units. W. Shang, et al.](https://arxiv.org/abs/1603.05201)
+  Source: [Understanding and Improving Convolutional Neural Networks via
+  Concatenated Rectified Linear Units. W. Shang, et
+  al.](https://arxiv.org/abs/1603.05201)
 
   Args:
     features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
@@ -1534,7 +1566,9 @@ def crelu(features, name=None, axis=-1):
 @tf_export("nn.relu6")
 def relu6(features, name=None):
   """Computes Rectified Linear 6: `min(max(features, 0), 6)`.
-  Source: [Convolutional Deep Belief Networks on CIFAR-10. A. Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf)
+
+  Source: [Convolutional Deep Belief Networks on CIFAR-10. A.
+  Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf)
 
   Args:
     features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
@@ -1622,14 +1656,16 @@ def _softmax(logits, compute_op, dim=-1, name=None):
     InvalidArgumentError: if `logits` is empty or `dim` is beyond the last
       dimension of `logits`.
   """
+
   def _swap_axis(logits, dim_index, last_index, name=None):
     """Swaps logits's dim_index and last_index."""
-    return array_ops.transpose(logits,
-                               array_ops.concat([
-                                   math_ops.range(dim_index), [last_index],
-                                   math_ops.range(dim_index + 1, last_index),
-                                   [dim_index]
-                               ], 0), name=name)
+    return array_ops.transpose(
+        logits,
+        array_ops.concat([
+            math_ops.range(dim_index), [last_index],
+            math_ops.range(dim_index + 1, last_index), [dim_index]
+        ], 0),
+        name=name)
 
   logits = ops.convert_to_tensor(logits)
 
@@ -1746,9 +1782,12 @@ def _ensure_xent_args(name, sentinel, labels, logits):
 
 
 @tf_export("nn.softmax_cross_entropy_with_logits_v2")
-def softmax_cross_entropy_with_logits_v2(_sentinel=None,  # pylint: disable=invalid-name
-                                         labels=None, logits=None,
-                                         dim=-1, name=None):
+def softmax_cross_entropy_with_logits_v2(
+    _sentinel=None,  # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    dim=-1,
+    name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1790,19 +1829,19 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None,  # pylint: disable=inva
     A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
     softmax cross entropy loss.
   """
-  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel,
-                    labels, logits)
+  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
+                    logits)
 
   # TODO(pcmurray) Raise an error when the labels do not sum to 1. Note: This
   # could break users who call this with bad labels, but disregard the bad
   # results.
 
-  with ops.name_scope(
-      name, "softmax_cross_entropy_with_logits", [logits, labels]) as name:
+  with ops.name_scope(name, "softmax_cross_entropy_with_logits",
+                      [logits, labels]) as name:
     logits = ops.convert_to_tensor(logits, name="logits")
     labels = ops.convert_to_tensor(labels, name="labels")
-    precise_logits = math_ops.cast(logits, dtypes.float32) if (
-        logits.dtype == dtypes.float16) else logits
+    precise_logits = math_ops.cast(
+        logits, dtypes.float32) if (logits.dtype == dtypes.float16) else logits
     # labels and logits must be of the same type
     labels = math_ops.cast(labels, precise_logits.dtype)
     input_rank = array_ops.rank(precise_logits)
@@ -1811,13 +1850,14 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None,  # pylint: disable=inva
 
     # Move the dim to the end if dim is not the last dimension.
     if dim is not -1:
+
       def _move_dim_to_end(tensor, dim_index, rank):
-        return array_ops.transpose(tensor,
-                                   array_ops.concat([
-                                       math_ops.range(dim_index),
-                                       math_ops.range(dim_index + 1, rank),
-                                       [dim_index]
-                                   ], 0))
+        return array_ops.transpose(
+            tensor,
+            array_ops.concat([
+                math_ops.range(dim_index),
+                math_ops.range(dim_index + 1, rank), [dim_index]
+            ], 0))
 
       precise_logits = _move_dim_to_end(precise_logits, dim, input_rank)
       labels = _move_dim_to_end(labels, dim, input_rank)
@@ -1862,9 +1902,12 @@ See tf.nn.softmax_cross_entropy_with_logits_v2.
 
 @tf_export("nn.softmax_cross_entropy_with_logits")
 @deprecation.deprecated(date=None, instructions=_XENT_DEPRECATION)
-def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
-                                      labels=None, logits=None,
-                                      dim=-1, name=None):
+def softmax_cross_entropy_with_logits(
+    _sentinel=None,  # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    dim=-1,
+    name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1906,11 +1949,11 @@ def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid
     A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
     softmax cross entropy loss.
   """
-  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel,
-                    labels, logits)
+  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
+                    logits)
 
-  with ops.name_scope(
-      name, "softmax_cross_entropy_with_logits_sg", [logits, labels]) as name:
+  with ops.name_scope(name, "softmax_cross_entropy_with_logits_sg",
+                      [logits, labels]) as name:
     labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
 
   return softmax_cross_entropy_with_logits_v2(
@@ -1918,9 +1961,11 @@ def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid
 
 
 @tf_export("nn.sparse_softmax_cross_entropy_with_logits")
-def sparse_softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
-                                             labels=None, logits=None,
-                                             name=None):
+def sparse_softmax_cross_entropy_with_logits(
+    _sentinel=None,  # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    name=None):
   """Computes sparse softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1976,15 +2021,15 @@ def sparse_softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=
                       [labels, logits]):
     labels = ops.convert_to_tensor(labels)
     logits = ops.convert_to_tensor(logits)
-    precise_logits = math_ops.cast(logits, dtypes.float32) if (
-        dtypes.as_dtype(logits.dtype) == dtypes.float16) else logits
+    precise_logits = math_ops.cast(logits, dtypes.float32) if (dtypes.as_dtype(
+        logits.dtype) == dtypes.float16) else logits
 
     # Store label shape for result later.
     labels_static_shape = labels.get_shape()
     labels_shape = array_ops.shape(labels)
     if logits.get_shape().ndims is not None and logits.get_shape().ndims == 0:
-      raise ValueError("Logits cannot be scalars - received shape %s." %
-                       logits.get_shape())
+      raise ValueError(
+          "Logits cannot be scalars - received shape %s." % logits.get_shape())
     if logits.get_shape().ndims is not None and (
         labels_static_shape.ndims is not None and
         labels_static_shape.ndims != logits.get_shape().ndims - 1):
@@ -2041,12 +2086,13 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   """
   with ops.name_scope(name, "AvgPool", [value]) as name:
     value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops._avg_pool(value,
-                                ksize=ksize,
-                                strides=strides,
-                                padding=padding,
-                                data_format=data_format,
-                                name=name)
+    return gen_nn_ops._avg_pool(
+        value,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 
 
 @tf_export("nn.max_pool")
@@ -2070,12 +2116,13 @@ def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   """
   with ops.name_scope(name, "MaxPool", [value]) as name:
     value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops._max_pool(value,
-                                ksize=ksize,
-                                strides=strides,
-                                padding=padding,
-                                data_format=data_format,
-                                name=name)
+    return gen_nn_ops._max_pool(
+        value,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 
 
 @ops.RegisterStatistics("Conv2D", "flops")
@@ -2083,8 +2130,8 @@ def _calc_conv_flops(graph, node):
   """Calculates the compute resources needed for Conv2D."""
   input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0])
   input_shape.assert_is_fully_defined()
-  filter_shape = graph_util.tensor_shape_from_node_def_name(graph,
-                                                            node.input[1])
+  filter_shape = graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[1])
   filter_shape.assert_is_fully_defined()
   output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name)
   output_shape.assert_is_fully_defined()
@@ -2092,8 +2139,9 @@ def _calc_conv_flops(graph, node):
   filter_width = int(filter_shape[1])
   filter_in_depth = int(filter_shape[2])
   output_count = np.prod(output_shape.as_list())
-  return ops.OpStats("flops", (output_count * filter_in_depth * filter_height *
-                               filter_width * 2))
+  return ops.OpStats(
+      "flops",
+      (output_count * filter_in_depth * filter_height * filter_width * 2))
 
 
 @ops.RegisterStatistics("DepthwiseConv2dNative", "flops")
@@ -2101,8 +2149,8 @@ def _calc_depthwise_conv_flops(graph, node):
   """Calculates the compute resources needed for DepthwiseConv2dNative."""
   input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0])
   input_shape.assert_is_fully_defined()
-  filter_shape = graph_util.tensor_shape_from_node_def_name(graph,
-                                                            node.input[1])
+  filter_shape = graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[1])
   filter_shape.assert_is_fully_defined()
   output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name)
   output_shape.assert_is_fully_defined()
@@ -2210,9 +2258,8 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
     if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1:
       raise ValueError("keep_prob must be a scalar tensor or a float in the "
                        "range (0, 1], got %g" % keep_prob)
-    keep_prob = ops.convert_to_tensor(keep_prob,
-                                      dtype=x.dtype,
-                                      name="keep_prob")
+    keep_prob = ops.convert_to_tensor(
+        keep_prob, dtype=x.dtype, name="keep_prob")
     keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())
 
     # Do nothing if we know keep_prob == 1
@@ -2222,9 +2269,8 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
     noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x)
     # uniform [keep_prob, 1.0 + keep_prob)
     random_tensor = keep_prob
-    random_tensor += random_ops.random_uniform(noise_shape,
-                                               seed=seed,
-                                               dtype=x.dtype)
+    random_tensor += random_ops.random_uniform(
+        noise_shape, seed=seed, dtype=x.dtype)
     # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
     binary_tensor = math_ops.floor(random_tensor)
     ret = math_ops.div(x, keep_prob) * binary_tensor
@@ -2293,13 +2339,21 @@ def nth_element(input, n, reverse=False, name=None):
 
 @tf_export("nn.conv1d")
 @deprecation.deprecated_arg_values(
-    None, "`NCHW` for data_format is deprecated, use `NCW` instead",
-    warn_once=True, data_format="NCHW")
+    None,
+    "`NCHW` for data_format is deprecated, use `NCW` instead",
+    warn_once=True,
+    data_format="NCHW")
 @deprecation.deprecated_arg_values(
-    None, "`NHWC` for data_format is deprecated, use `NWC` instead",
-    warn_once=True, data_format="NHWC")
-def conv1d(value, filters, stride, padding,
-           use_cudnn_on_gpu=None, data_format=None,
+    None,
+    "`NHWC` for data_format is deprecated, use `NWC` instead",
+    warn_once=True,
+    data_format="NHWC")
+def conv1d(value,
+           filters,
+           stride,
+           padding,
+           use_cudnn_on_gpu=None,
+           data_format=None,
            name=None):
   r"""Computes a 1-D convolution given 3-D input and filter tensors.
 
@@ -2358,9 +2412,13 @@ def conv1d(value, filters, stride, padding,
       raise ValueError("data_format must be \"NWC\" or \"NCW\".")
     value = array_ops.expand_dims(value, spatial_start_dim)
     filters = array_ops.expand_dims(filters, 0)
-    result = gen_nn_ops.conv2d(value, filters, strides, padding,
-                               use_cudnn_on_gpu=use_cudnn_on_gpu,
-                               data_format=data_format)
+    result = gen_nn_ops.conv2d(
+        value,
+        filters,
+        strides,
+        padding,
+        use_cudnn_on_gpu=use_cudnn_on_gpu,
+        data_format=data_format)
     return array_ops.squeeze(result, [spatial_start_dim])
 
 
@@ -2466,8 +2524,8 @@ def _calc_dilation2d_flops(graph, node):
   """Calculates the compute resources needed for Dilation2D."""
   input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0])
   input_shape.assert_is_fully_defined()
-  filter_shape = graph_util.tensor_shape_from_node_def_name(graph,
-                                                            node.input[1])
+  filter_shape = graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[1])
   filter_shape.assert_is_fully_defined()
   output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name)
   output_shape.assert_is_fully_defined()
@@ -2527,12 +2585,13 @@ def erosion2d(value, kernel, strides, rates, padding, name=None):
   with ops.name_scope(name, "erosion2d", [value, kernel]) as name:
     # Reduce erosion to dilation by duality.
     return math_ops.negative(
-        gen_nn_ops.dilation2d(input=math_ops.negative(value),
-                              filter=array_ops.reverse_v2(kernel, [0, 1]),
-                              strides=strides,
-                              rates=rates,
-                              padding=padding,
-                              name=name))
+        gen_nn_ops.dilation2d(
+            input=math_ops.negative(value),
+            filter=array_ops.reverse_v2(kernel, [0, 1]),
+            strides=strides,
+            rates=rates,
+            padding=padding,
+            name=name))
 
 
 @tf_export("nn.in_top_k")
@@ -2565,5 +2624,5 @@ def in_top_k(predictions, targets, k, name=None):
   Returns:
     A `Tensor` of type `bool`. Computed Precision at `k` as a `bool Tensor`.
   """
-  with ops.name_scope(name, 'in_top_k'):
+  with ops.name_scope(name, "in_top_k"):
     return gen_nn_ops._in_top_kv2(predictions, targets, k, name=name)
diff --git a/tensorflow/python/ops/quantized_conv_ops_test.py b/tensorflow/python/ops/quantized_conv_ops_test.py
index 5e9e710027..4ac2a8f634 100644
--- a/tensorflow/python/ops/quantized_conv_ops_test.py
+++ b/tensorflow/python/ops/quantized_conv_ops_test.py
@@ -93,7 +93,8 @@ class Conv2DTest(test.TestCase):
     quantized_range = ((quantized_max - quantized_min) * range_adjust)
     range_scale = (quantized_range / number_of_steps)
     lowest_quantized = -(1 << (number_of_bits - 1))
-    result = np.array([(quantized_min + ((float(x) - lowest_quantized) * range_scale))
+    result = np.array([(quantized_min +
+                        ((float(x) - lowest_quantized) * range_scale))
                        for x in quantized.flatten()])
     return result
 
diff --git a/tensorflow/python/ops/quantized_ops_test.py b/tensorflow/python/ops/quantized_ops_test.py
index 4bf3b35e13..d590bc4be6 100644
--- a/tensorflow/python/ops/quantized_ops_test.py
+++ b/tensorflow/python/ops/quantized_ops_test.py
@@ -34,7 +34,10 @@ class QuantizedOpsTest(test.TestCase):
   def testQuantizeOp(self):
     expected_output = [1, 1, 2, 127, 255, 255]
     with self.test_session(use_gpu=False) as sess:
-      x = constant_op.constant([1.0, 1.25, 1.75, 127.0, 255.0, 500.0], shape=[6], dtype=dtypes.float32)
+      x = constant_op.constant(
+          [1.0, 1.25, 1.75, 127.0, 255.0, 500.0],
+          shape=[6],
+          dtype=dtypes.float32)
       x_min = 0.0
       x_max = 255.0
       op = array_ops.quantize(x, x_min, x_max, dtypes.quint8, mode="MIN_FIRST")
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 3224856d7b..0fbbf5a805 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -227,13 +227,14 @@ def sparse_concat(axis,
             [array_ops.reshape(shape, [1, -1]) for shape in shapes], 0), 0)
     shapes = [
         array_ops.concat([
-            max_shape[:axis], shape[-1:] if axis == -1 else
-            shape[axis:axis + 1], [] if axis == -1 else max_shape[axis + 1:]
+            max_shape[:axis], shape[-1:]
+            if axis == -1 else shape[axis:axis + 1], []
+            if axis == -1 else max_shape[axis + 1:]
         ], 0) for shape in shapes
     ]
 
-  output_ind, output_val, output_shape = (gen_sparse_ops._sparse_concat(
-      inds, vals, shapes, axis, name=name))
+  output_ind, output_val, output_shape = (
+      gen_sparse_ops._sparse_concat(inds, vals, shapes, axis, name=name))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
@@ -300,15 +301,14 @@ def sparse_add(a, b, thresh=0):
     b = _convert_to_sparse_tensor(b)
     thresh = ops.convert_to_tensor(
         thresh, dtype=a.values.dtype.real_dtype.base_dtype, name="thresh")
-    output_ind, output_val, output_shape = (gen_sparse_ops._sparse_add(
-        a.indices, a.values, a.dense_shape,
-        b.indices, b.values, b.dense_shape,
-        thresh))
+    output_ind, output_val, output_shape = (
+        gen_sparse_ops._sparse_add(a.indices, a.values, a.dense_shape,
+                                   b.indices, b.values, b.dense_shape, thresh))
 
     # Attempt to get output_shape statically.
     a.get_shape().assert_is_compatible_with(b.get_shape())
-    static_shape = array_ops.broadcast_static_shape(
-        a.get_shape(), b.get_shape())
+    static_shape = array_ops.broadcast_static_shape(a.get_shape(),
+                                                    b.get_shape())
     if static_shape.is_fully_defined():
       output_shape = static_shape.as_list()
 
@@ -317,8 +317,8 @@ def sparse_add(a, b, thresh=0):
     # swap to make `a` the SparseTensor.
     if isinstance(b, sparse_classes):
       a, b = b, a
-    return gen_sparse_ops._sparse_tensor_dense_add(
-        a.indices, a.values, a.dense_shape, b)
+    return gen_sparse_ops._sparse_tensor_dense_add(a.indices, a.values,
+                                                   a.dense_shape, b)
 
 
 def _sparse_cross(inputs, name=None):
@@ -397,19 +397,25 @@ def _sparse_cross_hashed(inputs, num_buckets=0, hash_key=None, name=None):
 _DEFAULT_HASH_KEY = 0xDECAFCAFFE
 
 
-def _sparse_cross_internal(
-    inputs, hashed_output=False, num_buckets=0, hash_key=None, name=None):
+def _sparse_cross_internal(inputs,
+                           hashed_output=False,
+                           num_buckets=0,
+                           hash_key=None,
+                           name=None):
   """See gen_sparse_ops._sparse_cross."""
   if not isinstance(inputs, list):
     raise TypeError("Inputs must be a list")
-  if not all(isinstance(i, sparse_tensor.SparseTensor) or
-             isinstance(i, ops.Tensor) for i in inputs):
+  if not all(
+      isinstance(i, sparse_tensor.SparseTensor) or isinstance(i, ops.Tensor)
+      for i in inputs):
     raise TypeError("All inputs must be SparseTensors")
 
-  sparse_inputs = [i for i in inputs
-                   if isinstance(i, sparse_tensor.SparseTensor)]
-  dense_inputs = [i for i in inputs
-                  if not isinstance(i, sparse_tensor.SparseTensor)]
+  sparse_inputs = [
+      i for i in inputs if isinstance(i, sparse_tensor.SparseTensor)
+  ]
+  dense_inputs = [
+      i for i in inputs if not isinstance(i, sparse_tensor.SparseTensor)
+  ]
 
   indices = [sp_input.indices for sp_input in sparse_inputs]
   values = [sp_input.values for sp_input in sparse_inputs]
@@ -504,8 +510,9 @@ def sparse_reorder(sp_input, name=None):
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  reordered_ind, reordered_val = (gen_sparse_ops._sparse_reorder(
-      sp_input.indices, sp_input.values, sp_input.dense_shape, name=name))
+  reordered_ind, reordered_val = (
+      gen_sparse_ops._sparse_reorder(
+          sp_input.indices, sp_input.values, sp_input.dense_shape, name=name))
 
   if sp_input.get_shape().is_fully_defined():
     dense_shape = sp_input.get_shape().as_list()
@@ -572,8 +579,8 @@ def sparse_reshape(sp_input, shape, name=None):
         sp_input.indices, sp_input.dense_shape, shape, name=name)
 
     reshaped_shape_const = tensor_util.constant_value(shape)
-    if (reshaped_shape_const is not None
-        and sp_input.get_shape().is_fully_defined()):
+    if (reshaped_shape_const is not None and
+        sp_input.get_shape().is_fully_defined()):
       num_implied = sum((dim == -1) for dim in reshaped_shape_const)
       if num_implied > 1:
         raise ValueError("At most one dimension can be inferred (-1). Found: %s"
@@ -589,15 +596,15 @@ def sparse_reshape(sp_input, shape, name=None):
             in_shape_size // np.prod(non_implied_idx))
       reshaped_size = np.prod(reshaped_shape_const)
       if reshaped_size != in_shape_size:
-        raise ValueError(
-            "Cannot reshape a tensor with %d elements to shape %s "
-            "(%d elements)."
-            % (in_shape_size, original_reshaped_shape, reshaped_size))
+        raise ValueError("Cannot reshape a tensor with %d elements to shape %s "
+                         "(%d elements)." %
+                         (in_shape_size, original_reshaped_shape,
+                          reshaped_size))
       reshaped_shape = reshaped_shape_const
 
-    return sparse_tensor.SparseTensor(
-        reshaped_ind, array_ops.identity(sp_input.values),
-        reshaped_shape)
+    return sparse_tensor.SparseTensor(reshaped_ind,
+                                      array_ops.identity(sp_input.values),
+                                      reshaped_shape)
 
 
 # TODO(aselle): Remove keyword required once for 1.0 final
@@ -610,8 +617,11 @@ class KeywordRequired(object):
 
 @tf_export("sparse_split")
 def sparse_split(keyword_required=KeywordRequired(),
-                 sp_input=None, num_split=None, axis=None,
-                 name=None, split_dim=None):
+                 sp_input=None,
+                 num_split=None,
+                 axis=None,
+                 name=None,
+                 split_dim=None):
   """Split a `SparseTensor` into `num_split` tensors along `axis`.
 
   If the `sp_input.dense_shape[axis]` is not an integer multiple of `num_split`
@@ -660,18 +670,19 @@ def sparse_split(keyword_required=KeywordRequired(),
                                                 split_dim)
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  output_inds, output_vals, output_shapes = (gen_sparse_ops._sparse_split(
-      axis,
-      sp_input.indices,
-      sp_input.values,
-      sp_input.dense_shape,
-      num_split,
-      name=name))
+  output_inds, output_vals, output_shapes = (
+      gen_sparse_ops._sparse_split(
+          axis,
+          sp_input.indices,
+          sp_input.values,
+          sp_input.dense_shape,
+          num_split,
+          name=name))
   sparse_tensors = []
   for i in range(0, num_split):
     sparse_tensors.append(
-        sparse_tensor.SparseTensor(
-            output_inds[i], output_vals[i], output_shapes[i]))
+        sparse_tensor.SparseTensor(output_inds[i], output_vals[i],
+                                   output_shapes[i]))
   return sparse_tensors
 
 
@@ -713,12 +724,15 @@ def sparse_slice(sp_input, start, size, name=None):
 
   with ops.name_scope(name, "SparseSlice", [sp_input]) as name:
     output_indices, output_values, output_shape = gen_sparse_ops.sparse_slice(
-        sp_input.indices, sp_input.values, sp_input.dense_shape, start, size, name=name)
+        sp_input.indices,
+        sp_input.values,
+        sp_input.dense_shape,
+        start,
+        size,
+        name=name)
 
-    return sparse_tensor.SparseTensor(
-        output_indices,
-        output_values,
-        output_shape)
+    return sparse_tensor.SparseTensor(output_indices, output_values,
+                                      output_shape)
 
 
 @tf_export("sparse_to_dense")
@@ -819,14 +833,14 @@ def sparse_reduce_max(sp_input, axis=None, keep_dims=False,
     The reduced Tensor.
   """
   return gen_sparse_ops.sparse_reduce_max(
-      sp_input.indices, sp_input.values,
-      sp_input.dense_shape,
-      math_ops._ReductionDims(sp_input, axis, reduction_axes),
-      keep_dims)
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)
 
 
 @tf_export("sparse_reduce_max_sparse")
-def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False,
+def sparse_reduce_max_sparse(sp_input,
+                             axis=None,
+                             keep_dims=False,
                              reduction_axes=None):
   """Computes the max of elements across dimensions of a SparseTensor.
 
@@ -855,10 +869,8 @@ def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False,
   """
   output_ind, output_val, output_shape = (
       gen_sparse_ops.sparse_reduce_max_sparse(
-          sp_input.indices, sp_input.values,
-          sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis,
-                                                        reduction_axes),
-          keep_dims))
+          sp_input.indices, sp_input.values, sp_input.dense_shape,
+          math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
@@ -905,14 +917,14 @@ def sparse_reduce_sum(sp_input, axis=None, keep_dims=False,
     The reduced Tensor.
   """
   return gen_sparse_ops.sparse_reduce_sum(
-      sp_input.indices, sp_input.values,
-      sp_input.dense_shape,
-      math_ops._ReductionDims(sp_input, axis, reduction_axes),
-      keep_dims)
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)
 
 
 @tf_export("sparse_reduce_sum_sparse")
-def sparse_reduce_sum_sparse(sp_input, axis=None, keep_dims=False,
+def sparse_reduce_sum_sparse(sp_input,
+                             axis=None,
+                             keep_dims=False,
                              reduction_axes=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
 
@@ -941,10 +953,8 @@ def sparse_reduce_sum_sparse(sp_input, axis=None, keep_dims=False,
   """
   output_ind, output_val, output_shape = (
       gen_sparse_ops.sparse_reduce_sum_sparse(
-          sp_input.indices, sp_input.values,
-          sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis,
-                                                        reduction_axes),
-          keep_dims))
+          sp_input.indices, sp_input.values, sp_input.dense_shape,
+          math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
@@ -1053,8 +1063,8 @@ def sparse_to_indicator(sp_input, vocab_size, name=None):
   with ops.name_scope(name, "SparseToIndicator", [sp_input]) as name:
     num_entries = array_ops.shape(sp_input.indices)[0]
     new_values = array_ops.fill(array_ops.expand_dims(num_entries, 0), True)
-    sp_values = sparse_tensor.SparseTensor(
-        sp_input.indices, new_values, sp_input.dense_shape)
+    sp_values = sparse_tensor.SparseTensor(sp_input.indices, new_values,
+                                           sp_input.dense_shape)
 
     sp_new = sparse_merge(sp_input, sp_values, vocab_size, name)
 
@@ -1174,8 +1184,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
       raise TypeError("vocab_size has to be a list of Tensors or Python ints. "
                       "Found %s" % type(vocab_size))
     for dim in vocab_size:
-      if not (isinstance(dim, ops.Tensor) or
-              isinstance(dim, numbers.Integral)):
+      if not (isinstance(dim, ops.Tensor) or isinstance(dim, numbers.Integral)):
         raise TypeError(
             "vocab_size has to be a list of Tensors or Python ints. Found %s" %
             type(dim))
@@ -1326,24 +1335,23 @@ def sparse_reset_shape(sp_input, new_shape=None):
     # error before the sparse_tensor.SparseTensor catches it.
     output_shape_tensor.get_shape()[0].merge_with(in_shape.get_shape()[0])
 
-    output_shape_tensor_const = tensor_util.constant_value(
-        output_shape_tensor)
+    output_shape_tensor_const = tensor_util.constant_value(output_shape_tensor)
     # For cases where all shapes are known during graph construction
-    if (output_shape_tensor_const is not None
-        and sp_input.get_shape().is_fully_defined()):
+    if (output_shape_tensor_const is not None and
+        sp_input.get_shape().is_fully_defined()):
       in_shape_const = np.array(sp_input.get_shape().as_list())
       if not np.all(in_shape_const <= output_shape_tensor_const):
         raise ValueError(
             "Requested new_shape should have dimension sizes >= sp_input.shape."
-            "  Found new_shape (%s), sp_input.shape (%s)."
-            % (in_shape_const, output_shape_tensor_const))
+            "  Found new_shape (%s), sp_input.shape (%s)." %
+            (in_shape_const, output_shape_tensor_const))
       output_shape_tensor = output_shape_tensor_const
     else:
       # For cases where shape is not known during graph construction.
-      output_shape_tensor = control_flow_ops.with_dependencies(
-          [check_ops.assert_equal(
-              array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))],
-          output_shape_tensor)
+      output_shape_tensor = control_flow_ops.with_dependencies([
+          check_ops.assert_equal(
+              array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))
+      ], output_shape_tensor)
       output_shape_tensor = control_flow_ops.with_dependencies(
           [check_ops.assert_less_equal(in_shape, output_shape_tensor)],
           output_shape_tensor)
@@ -1409,10 +1417,10 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None):
          values=sp_input.values,
          dense_shape=sp_input.dense_shape,
          default_value=default_value)
-    return (sparse_tensor.SparseTensor(indices=output_indices,
-                                       values=output_values,
-                                       dense_shape=sp_input.dense_shape),
-            empty_row_indicator)
+    return (sparse_tensor.SparseTensor(
+        indices=output_indices,
+        values=output_values,
+        dense_shape=sp_input.dense_shape), empty_row_indicator)
 
 
 @tf_export("serialize_sparse")
@@ -1880,8 +1888,8 @@ def sparse_softmax(sp_input, name=None):
                       [sp_input.indices, sp_input.values]) as name:
     out_vals = gen_sparse_ops.sparse_softmax(sp_input.indices, sp_input.values,
                                              sp_input.dense_shape)
-    return sparse_tensor.SparseTensor(
-        sp_input.indices, out_vals, sp_input.dense_shape)
+    return sparse_tensor.SparseTensor(sp_input.indices, out_vals,
+                                      sp_input.dense_shape)
 
 
 @tf_export("sparse_maximum")
@@ -1907,9 +1915,9 @@ def sparse_maximum(sp_a, sp_b, name=None):
   Returns:
     output: the output SparseTensor.
   """
-  with ops.name_scope(name, "SparseSparseMaximum", [sp_a.indices, sp_a.values,
-                                                    sp_b.indices,
-                                                    sp_b.values]) as name:
+  with ops.name_scope(
+      name, "SparseSparseMaximum",
+      [sp_a.indices, sp_a.values, sp_b.indices, sp_b.values]) as name:
     out_indices, out_values = gen_sparse_ops.sparse_sparse_maximum(
         sp_a.indices,
         sp_a.values,
@@ -1944,9 +1952,9 @@ def sparse_minimum(sp_a, sp_b, name=None):
   Returns:
     output: the output SparseTensor.
   """
-  with ops.name_scope(name, "SparseSparseMinimum", [sp_a.indices, sp_a.values,
-                                                    sp_b.indices,
-                                                    sp_b.values]) as name:
+  with ops.name_scope(
+      name, "SparseSparseMinimum",
+      [sp_a.indices, sp_a.values, sp_b.indices, sp_b.values]) as name:
     out_indices, out_values = gen_sparse_ops.sparse_sparse_minimum(
         sp_a.indices,
         sp_a.values,
@@ -2010,14 +2018,15 @@ def sparse_transpose(sp_input, perm=None, name=None):
       dense_shape = sp_input.dense_shape
       transposed_dense_shape = array_ops.gather(dense_shape, perm)
     transposed_st = sparse_tensor.SparseTensor(
-        transposed_indices, sp_input.values,
-        transposed_dense_shape)
+        transposed_indices, sp_input.values, transposed_dense_shape)
     transposed_st = sparse_reorder(transposed_st)
     return transposed_st
 
 
-def _add_sparse_to_tensors_map(sp_input, container=None,
-                               shared_name=None, name=None):
+def _add_sparse_to_tensors_map(sp_input,
+                               container=None,
+                               shared_name=None,
+                               name=None):
   """Add a `SparseTensor` to a `SparseTensorsMap` and return its handle.
 
   Args:
@@ -2038,12 +2047,18 @@ def _add_sparse_to_tensors_map(sp_input, container=None,
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._add_sparse_to_tensors_map(
-      sp_input.indices, sp_input.values, sp_input.dense_shape,
-      container=container, shared_name=shared_name, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      container=container,
+      shared_name=shared_name,
+      name=name)
 
 
-def _add_many_sparse_to_tensors_map(sp_input, container=None,
-                                    shared_name=None, name=None):
+def _add_many_sparse_to_tensors_map(sp_input,
+                                    container=None,
+                                    shared_name=None,
+                                    name=None):
   """Add a minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 
   The `SparseTensor` must have rank `R` greater than 1, and the first dimension
@@ -2072,12 +2087,18 @@ def _add_many_sparse_to_tensors_map(sp_input, container=None,
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._add_many_sparse_to_tensors_map(
-      sp_input.indices, sp_input.values, sp_input.dense_shape,
-      container=container, shared_name=shared_name, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      container=container,
+      shared_name=shared_name,
+      name=name)
 
 
-def _take_many_sparse_from_tensors_map(
-    sparse_map_op, sparse_handles, rank=None, name=None):
+def _take_many_sparse_from_tensors_map(sparse_map_op,
+                                       sparse_handles,
+                                       rank=None,
+                                       name=None):
   """Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 
   The input `sparse_handles` must be a string matrix of shape `[N, 1]` where
@@ -2140,16 +2161,18 @@ def _take_many_sparse_from_tensors_map(
     raise TypeError("sparse_map_op be an Operation")
   if sparse_map_op.type not in ("AddSparseToTensorsMap",
                                 "AddManySparseToTensorsMap"):
-    raise TypeError("sparse_map_op must be one of AddSparseToTensorsMap or "
-                    "AddSparseToTensorsMap. Instead, found `%s`." %
-                    sparse_map_op.type)
+    raise TypeError(
+        "sparse_map_op must be one of AddSparseToTensorsMap or "
+        "AddSparseToTensorsMap. Instead, found `%s`." % sparse_map_op.type)
   with ops.colocate_with(sparse_map_op):
     shared_name = sparse_map_op.get_attr("shared_name") or sparse_map_op.name
     output_indices, output_values, output_shape = (
         gen_sparse_ops._take_many_sparse_from_tensors_map(
-            sparse_handles, dtype=sparse_map_op.get_attr("T"),
+            sparse_handles,
+            dtype=sparse_map_op.get_attr("T"),
             container=sparse_map_op.get_attr("container"),
-            shared_name=shared_name, name=name))
+            shared_name=shared_name,
+            name=name))
 
   # Feed rank data back in, if available
   output_indices.set_shape([None, rank])
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index 15127862a4..6d7eaababc 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -192,8 +192,8 @@ def einsum(equation, *inputs, **kwargs):
       input_count = sum(1 for s in input_axis_labels if a in s)
       if input_count > 2 and a not in output_axis_labels:
         logging.warn(
-            'Falling back to exponential-space implementation of einsum() because'
-            ' index "%s" is summed over more than two inputs.', a)
+            'Falling back to exponential-space implementation of einsum()'
+            ' because index "%s" is summed over more than two inputs.', a)
         return _exponential_space_einsum(equation, *inputs)
 
     temp = inputs[0]
diff --git a/tensorflow/python/saved_model/simple_save.py b/tensorflow/python/saved_model/simple_save.py
index 9a81e5cd80..1e4cc73370 100644
--- a/tensorflow/python/saved_model/simple_save.py
+++ b/tensorflow/python/saved_model/simple_save.py
@@ -40,17 +40,20 @@ def simple_save(session, export_dir, inputs, outputs, legacy_init_op=None):
     - It will be treated as a graph for inference / serving (i.e. uses the tag
       `tag_constants.SERVING`)
     - The SavedModel will load in TensorFlow Serving and supports the
-      [Predict API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto).
+      [Predict
+      API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto).
       To use the Classify, Regress, or MultiInference APIs, please
       use either
       [tf.Estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator)
       or the lower level
-      [SavedModel APIs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
+      [SavedModel
+      APIs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
     - Some TensorFlow ops depend on information on disk or other information
       called "assets". These are generally handled automatically by adding the
       assets to the `GraphKeys.ASSET_FILEPATHS` collection. Only assets in that
       collection are exported; if you need more custom behavior, you'll need to
-      use the [SavedModelBuilder](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py).
+      use the
+      [SavedModelBuilder](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py).
 
   More information about SavedModel and signatures can be found here:
   https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md.
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 3ee49650e0..343a49cded 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Various learning rate decay functions."""
 from __future__ import absolute_import
 from __future__ import division
@@ -28,8 +27,12 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 
 
-def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
-                      staircase=False, name=None):
+def exponential_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False,
+                      name=None):
   """Applies exponential decay to the learning rate.
 
   When training a model, it is often recommended to lower the learning rate as
@@ -85,9 +88,9 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
   """
   if global_step is None:
     raise ValueError("global_step is required for exponential_decay.")
-  with ops.name_scope(name, "ExponentialDecay",
-                      [learning_rate, global_step,
-                       decay_steps, decay_rate]) as name:
+  with ops.name_scope(
+      name, "ExponentialDecay",
+      [learning_rate, global_step, decay_steps, decay_rate]) as name:
     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
     dtype = learning_rate.dtype
     global_step = math_ops.cast(global_step, dtype)
@@ -96,8 +99,8 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
     p = global_step / decay_steps
     if staircase:
       p = math_ops.floor(p)
-    return math_ops.multiply(learning_rate, math_ops.pow(decay_rate, p),
-                             name=name)
+    return math_ops.multiply(
+        learning_rate, math_ops.pow(decay_rate, p), name=name)
 
 
 def piecewise_constant(x, boundaries, values, name=None):
@@ -156,15 +159,15 @@ def piecewise_constant(x, boundaries, values, name=None):
           boundaries[i] = b
         else:
           raise ValueError(
-              "Boundaries (%s) must have the same dtype as x (%s)." % (
-                  b.dtype.base_dtype, x.dtype.base_dtype))
+              "Boundaries (%s) must have the same dtype as x (%s)." %
+              (b.dtype.base_dtype, x.dtype.base_dtype))
     # TODO(rdipietro): Ensure that boundaries' elements are strictly increasing.
     values = ops.convert_n_to_tensor(values)
     for v in values[1:]:
       if v.dtype.base_dtype != values[0].dtype.base_dtype:
         raise ValueError(
-            "Values must have elements all with the same dtype (%s vs %s)." % (
-                values[0].dtype.base_dtype, v.dtype.base_dtype))
+            "Values must have elements all with the same dtype (%s vs %s)." %
+            (values[0].dtype.base_dtype, v.dtype.base_dtype))
     pred_fn_pairs = []
     pred_fn_pairs.append((x <= boundaries[0], lambda: values[0]))
     pred_fn_pairs.append((x > boundaries[-1], lambda: values[-1]))
@@ -179,9 +182,13 @@ def piecewise_constant(x, boundaries, values, name=None):
     return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
 
 
-def polynomial_decay(learning_rate, global_step, decay_steps,
-                     end_learning_rate=0.0001, power=1.0,
-                     cycle=False, name=None):
+def polynomial_decay(learning_rate,
+                     global_step,
+                     decay_steps,
+                     end_learning_rate=0.0001,
+                     power=1.0,
+                     cycle=False,
+                     name=None):
   """Applies a polynomial decay to the learning rate.
 
   It is commonly observed that a monotonically decreasing learning rate, whose
@@ -255,9 +262,10 @@ def polynomial_decay(learning_rate, global_step, decay_steps,
   """
   if global_step is None:
     raise ValueError("global_step is required for polynomial_decay.")
-  with ops.name_scope(name, "PolynomialDecay",
-                      [learning_rate, global_step,
-                       decay_steps, end_learning_rate, power]) as name:
+  with ops.name_scope(
+      name, "PolynomialDecay",
+      [learning_rate, global_step, decay_steps, end_learning_rate, power
+      ]) as name:
     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
     dtype = learning_rate.dtype
     global_step = math_ops.cast(global_step, dtype)
@@ -267,23 +275,28 @@ def polynomial_decay(learning_rate, global_step, decay_steps,
     if cycle:
       # Find the first multiple of decay_steps that is bigger than global_step.
       # If global_step is zero set the multiplier to 1
-      multiplier = control_flow_ops.cond(math_ops.equal(global_step, 0),
-                                         lambda: 1.0,
-                                         lambda: math_ops.ceil(
-                                             global_step / decay_steps))
+      multiplier = control_flow_ops.cond(
+          math_ops.equal(global_step, 0), lambda: 1.0,
+          lambda: math_ops.ceil(global_step / decay_steps))
       decay_steps = math_ops.multiply(decay_steps, multiplier)
     else:
       # Make sure that the global_step used is not bigger than decay_steps.
       global_step = math_ops.minimum(global_step, decay_steps)
 
     p = math_ops.div(global_step, decay_steps)
-    return math_ops.add(math_ops.multiply(learning_rate - end_learning_rate,
-                                          math_ops.pow(1 - p, power)),
-                        end_learning_rate, name=name)
-
-
-def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate,
-                      staircase=False, name=None):
+    return math_ops.add(
+        math_ops.multiply(learning_rate - end_learning_rate,
+                          math_ops.pow(1 - p, power)),
+        end_learning_rate,
+        name=name)
+
+
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False,
+                      name=None):
   """Applies natural exponential decay to the initial learning rate.
 
   When training a model, it is often recommended to lower the learning rate as
@@ -349,8 +362,12 @@ def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate,
     return math_ops.multiply(learning_rate, exponent, name=name)
 
 
-def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
-                       staircase=False, name=None):
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False,
+                       name=None):
   """Applies inverse time decay to the initial learning rate.
 
   When training a model, it is often recommended to lower the learning rate as
@@ -362,13 +379,15 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
   The function returns the decayed learning rate.  It is computed as:
 
   ```python
-  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
+  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /
+  decay_step)
   ```
 
   or, if `staircase` is `True`, as:
 
   ```python
-  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
+  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step /
+  decay_step))
   ```
 
   Example: decay 1/t with a rate of 0.5:
@@ -379,7 +398,8 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
   learning_rate = 0.1
   decay_steps = 1.0
   decay_rate = 0.5
-  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate)
+  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step,
+  decay_steps, decay_rate)
 
   # Passing global_step to minimize() will increment it at each step.
   learning_step = (
@@ -424,8 +444,7 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
     return math_ops.div(learning_rate, denom, name=name)
 
 
-def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0,
-                 name=None):
+def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
   """Applies cosine decay to the learning rate.
 
   See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
@@ -484,8 +503,13 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0,
     return math_ops.multiply(learning_rate, decayed)
 
 
-def cosine_decay_restarts(learning_rate, global_step, first_decay_steps,
-                          t_mul=2.0, m_mul=1.0, alpha=0.0, name=None):
+def cosine_decay_restarts(learning_rate,
+                          global_step,
+                          first_decay_steps,
+                          t_mul=2.0,
+                          m_mul=1.0,
+                          alpha=0.0,
+                          name=None):
   """Applies cosine decay with restarts to the learning rate.
 
   See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
@@ -532,10 +556,9 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps,
   """
   if global_step is None:
     raise ValueError("cosine decay restarts requires global_step")
-  with ops.name_scope(name, "SGDRDecay",
-                      [learning_rate, global_step]) as name:
-    learning_rate = ops.convert_to_tensor(learning_rate,
-                                          name="initial_learning_rate")
+  with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]) as name:
+    learning_rate = ops.convert_to_tensor(
+        learning_rate, name="initial_learning_rate")
     dtype = learning_rate.dtype
     global_step = math_ops.cast(global_step, dtype)
     first_decay_steps = math_ops.cast(first_decay_steps, dtype)
@@ -547,11 +570,12 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps,
 
     def compute_step(completed_fraction, geometric=False):
       if geometric:
-        i_restart = math_ops.floor(math_ops.log(1.0 - completed_fraction * (
-            1.0 - t_mul)) / math_ops.log(t_mul))
+        i_restart = math_ops.floor(
+            math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
+            math_ops.log(t_mul))
 
-        sum_r = (1.0 - t_mul ** i_restart) / (1.0 - t_mul)
-        completed_fraction = (completed_fraction - sum_r) / t_mul ** i_restart
+        sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
+        completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart
 
       else:
         i_restart = math_ops.floor(completed_fraction)
@@ -564,16 +588,20 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps,
         lambda: compute_step(completed_fraction, geometric=False),
         lambda: compute_step(completed_fraction, geometric=True))
 
-    m_fac = m_mul ** i_restart
-    cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
-        constant_op.constant(math.pi) * completed_fraction))
+    m_fac = m_mul**i_restart
+    cosine_decayed = 0.5 * m_fac * (
+        1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))
     decayed = (1 - alpha) * cosine_decayed + alpha
 
   return math_ops.multiply(learning_rate, decayed, name=name)
 
 
-def linear_cosine_decay(learning_rate, global_step, decay_steps,
-                        num_periods=0.5, alpha=0.0, beta=0.001,
+def linear_cosine_decay(learning_rate,
+                        global_step,
+                        decay_steps,
+                        num_periods=0.5,
+                        alpha=0.0,
+                        beta=0.001,
                         name=None):
   """Applies linear cosine decay to the learning rate.
 
@@ -651,9 +679,14 @@ def linear_cosine_decay(learning_rate, global_step, decay_steps,
     return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
 
 
-def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps,
-                              initial_variance=1.0, variance_decay=0.55,
-                              num_periods=0.5, alpha=0.0, beta=0.001,
+def noisy_linear_cosine_decay(learning_rate,
+                              global_step,
+                              decay_steps,
+                              initial_variance=1.0,
+                              variance_decay=0.55,
+                              num_periods=0.5,
+                              alpha=0.0,
+                              beta=0.001,
                               name=None):
   """Applies noisy linear cosine decay to the learning rate.
 
@@ -734,8 +767,8 @@ def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps,
         math_ops.pow(1.0 + global_step, variance_decay))
     std = math_ops.sqrt(variance)
     noisy_linear_decayed = (
-        linear_decayed + random_ops.random_normal(
-            linear_decayed.shape, stddev=std))
+        linear_decayed +
+        random_ops.random_normal(linear_decayed.shape, stddev=std))
 
     completed_fraction = global_step / decay_steps
     fraction = 2.0 * num_periods * completed_fraction
diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py
index ebec725b7b..745e612018 100644
--- a/tensorflow/python/training/rmsprop.py
+++ b/tensorflow/python/training/rmsprop.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """One-line documentation for rmsprop module.
 
 rmsprop algorithm [tieleman2012rmsprop]
@@ -52,7 +51,8 @@ from tensorflow.python.training import training_ops
 class RMSPropOptimizer(optimizer.Optimizer):
   """Optimizer that implements the RMSProp algorithm.
 
-  See the [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  See the
+  [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
   """
 
   def __init__(self,
@@ -113,13 +113,12 @@ class RMSPropOptimizer(optimizer.Optimizer):
       self._zeros_slot(v, "momentum", self._name)
 
   def _prepare(self):
-    self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
-                                                       name="learning_rate")
+    self._learning_rate_tensor = ops.convert_to_tensor(
+        self._learning_rate, name="learning_rate")
     self._decay_tensor = ops.convert_to_tensor(self._decay, name="decay")
-    self._momentum_tensor = ops.convert_to_tensor(self._momentum,
-                                                  name="momentum")
-    self._epsilon_tensor = ops.convert_to_tensor(self._epsilon,
-                                                 name="epsilon")
+    self._momentum_tensor = ops.convert_to_tensor(
+        self._momentum, name="momentum")
+    self._epsilon_tensor = ops.convert_to_tensor(self._epsilon, name="epsilon")
 
   def _apply_dense(self, grad, var):
     rms = self.get_slot(var, "rms")
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 27fa1b89ce..6f4f0f9859 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -183,7 +183,8 @@ do_pylint() {
   # W0311 bad-indentation
   # W0312 mixed-indentation
   # C0330 bad-continuation
-  grep -E '(\[E|\[W0311|\[W0312|\[C0330)' ${OUTPUT_FILE} > ${ERRORS_FILE}
+  # C0301 line-too-long
+  grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301)' ${OUTPUT_FILE} > ${ERRORS_FILE}
 
   N_ERRORS=0
   while read -r LINE; do
diff --git a/tensorflow/tools/ci_build/pylintrc b/tensorflow/tools/ci_build/pylintrc
index e71017e621..68fdb61716 100644
--- a/tensorflow/tools/ci_build/pylintrc
+++ b/tensorflow/tools/ci_build/pylintrc
@@ -180,7 +180,17 @@ docstring-min-length=10
 max-line-length=80
 
 # Regexp for a line that is allowed to be longer than the limit.
-ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+ignore-long-lines=(?x)
+  (^\s*(import|from)\s
+   |\$Id:\s\/\/depot\/.+#\d+\s\$
+   |^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+')
+   |^\s*\#\ LINT\.ThenChange
+   |^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$
+   |pylint
+   |"""
+   |\#
+   |lambda
+   |(https?|ftp):)
 
 # Allow the body of an if to be on the same line as the test if there is no
 # else.
diff --git a/tensorflow/tools/dist_test/python/mnist_replica.py b/tensorflow/tools/dist_test/python/mnist_replica.py
index e40ecb43f9..a2d12442c4 100644
--- a/tensorflow/tools/dist_test/python/mnist_replica.py
+++ b/tensorflow/tools/dist_test/python/mnist_replica.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Distributed MNIST training and validation, with model replicas.
 
 A simple softmax model with one hidden layer is defined. The parameters
@@ -32,7 +31,6 @@ perform forward computation and gradient calculation in parallel, which
 should lead to increased training speed for the simple model.
 """
 
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -45,7 +43,6 @@ import time
 import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 
-
 flags = tf.app.flags
 flags.DEFINE_string("data_dir", "/tmp/mnist-data",
                     "Directory for storing mnist data")
@@ -56,8 +53,7 @@ flags.DEFINE_integer("task_index", None,
                      "Worker task index, should be >= 0. task_index=0 is "
                      "the master worker task the performs the variable "
                      "initialization ")
-flags.DEFINE_integer("num_gpus", 1,
-                     "Total number of gpus for each machine."
+flags.DEFINE_integer("num_gpus", 1, "Total number of gpus for each machine."
                      "If you don't use GPU, please set it to '0'")
 flags.DEFINE_integer("replicas_to_aggregate", None,
                      "Number of replicas to aggregate before parameter update"
@@ -69,24 +65,24 @@ flags.DEFINE_integer("train_steps", 200,
                      "Number of (global) training steps to perform")
 flags.DEFINE_integer("batch_size", 100, "Training batch size")
 flags.DEFINE_float("learning_rate", 0.01, "Learning rate")
-flags.DEFINE_boolean("sync_replicas", False,
-                     "Use the sync_replicas (synchronized replicas) mode, "
-                     "wherein the parameter updates from workers are aggregated "
-                     "before applied to avoid stale gradients")
+flags.DEFINE_boolean(
+    "sync_replicas", False,
+    "Use the sync_replicas (synchronized replicas) mode, "
+    "wherein the parameter updates from workers are aggregated "
+    "before applied to avoid stale gradients")
 flags.DEFINE_boolean(
     "existing_servers", False, "Whether servers already exists. If True, "
     "will use the worker hosts via their GRPC URLs (one client process "
     "per worker host). Otherwise, will create an in-process TensorFlow "
     "server.")
-flags.DEFINE_string("ps_hosts","localhost:2222",
+flags.DEFINE_string("ps_hosts", "localhost:2222",
                     "Comma-separated list of hostname:port pairs")
 flags.DEFINE_string("worker_hosts", "localhost:2223,localhost:2224",
                     "Comma-separated list of hostname:port pairs")
-flags.DEFINE_string("job_name", None,"job name: worker or ps")
+flags.DEFINE_string("job_name", None, "job name: worker or ps")
 
 FLAGS = flags.FLAGS
 
-
 IMAGE_PIXELS = 28
 
 
@@ -97,7 +93,7 @@ def main(unused_argv):
 
   if FLAGS.job_name is None or FLAGS.job_name == "":
     raise ValueError("Must specify an explicit `job_name`")
-  if FLAGS.task_index is None or FLAGS.task_index =="":
+  if FLAGS.task_index is None or FLAGS.task_index == "":
     raise ValueError("Must specify an explicit `task_index`")
 
   print("job name = %s" % FLAGS.job_name)
@@ -110,9 +106,7 @@ def main(unused_argv):
   # Get the number of workers.
   num_workers = len(worker_spec)
 
-  cluster = tf.train.ClusterSpec({
-      "ps": ps_spec,
-      "worker": worker_spec})
+  cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})
 
   if not FLAGS.existing_servers:
     # Not using existing servers. Create an in-process server.
@@ -217,7 +211,8 @@ def main(unused_argv):
     sess_config = tf.ConfigProto(
         allow_soft_placement=True,
         log_device_placement=False,
-        device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])
+        device_filters=["/job:ps",
+                        "/job:worker/task:%d" % FLAGS.task_index])
 
     # The chief worker (task_index==0) session will prepare the session,
     # while the remaining workers will wait for the preparation to complete.
@@ -231,8 +226,7 @@ def main(unused_argv):
       server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
       print("Using existing server at: %s" % server_grpc_url)
 
-      sess = sv.prepare_or_wait_for_session(server_grpc_url,
-                                            config=sess_config)
+      sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
     else:
       sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
 
-- 
GitLab


From f8da6cc63ae1fd71de1ab5d9e91884872b249e55 Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Fri, 26 Jan 2018 16:58:00 -0800
Subject: [PATCH 283/966] Add reduce-precision to evaluator and add implicit
 broadcast remover pass. The reduce precision support is cribbed from the
 CPU/GPU LLVM-emitted implementation. The implicit broadcast pass removes any
 implicit broadcasts in the module replacing them with the equivalent explicit
 broadcast and reshape instructions.

PiperOrigin-RevId: 183467648
---
 tensorflow/compiler/xla/service/BUILD         |  28 +++
 .../compiler/xla/service/hlo_evaluator.cc     | 110 +++++++++++
 .../xla/service/implicit_broadcast_remover.cc | 124 ++++++++++++
 .../xla/service/implicit_broadcast_remover.h  |  42 +++++
 .../implicit_broadcast_remover_test.cc        | 176 ++++++++++++++++++
 tensorflow/compiler/xla/tests/BUILD           |   1 +
 .../xla/tests/reduce_precision_test.cc        |  20 +-
 7 files changed, 496 insertions(+), 5 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/implicit_broadcast_remover.cc
 create mode 100644 tensorflow/compiler/xla/service/implicit_broadcast_remover.h
 create mode 100644 tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 031d077c6a..987367fc68 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1158,6 +1158,34 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "implicit_broadcast_remover",
+    srcs = ["implicit_broadcast_remover.cc"],
+    hdrs = ["implicit_broadcast_remover.h"],
+    deps = [
+        ":hlo",
+        ":hlo_dce",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "implicit_broadcast_remover_test",
+    srcs = ["implicit_broadcast_remover_test.cc"],
+    deps = [
+        ":hlo_matchers",
+        ":implicit_broadcast_remover",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+    ],
+)
+
 cc_library(
     name = "dot_decomposer",
     srcs = ["dot_decomposer.cc"],
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index e3f5c17e35..ab604064d5 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bitmap.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -1706,6 +1707,115 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return HandleCos<ElementwiseT>(cos);
   }
 
+  template <typename NativeT, typename std::enable_if<std::is_same<
+                                  float, NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[reduce_precision],
+        ElementWiseUnaryOp(reduce_precision, [reduce_precision](
+                                                 ElementwiseT elem) {
+          uint32_t value_as_int = tensorflow::bit_cast<uint32_t>(elem);
+          const uint32_t mantissa_bits = reduce_precision->mantissa_bits();
+          const uint32_t exponent_bits = reduce_precision->exponent_bits();
+
+          // Code is based on the CPU/GPU implementation in LLVM-emitting code.
+          //
+          // Bits in float type:
+          //   mantissa : bits [0:22]
+          //   exponent : bits [23:30]
+          //   sign     : bits [31]
+          if (mantissa_bits < 23) {
+            const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits);
+
+            // Compute rounding bias for round-to-nearest with ties to even.
+            // This is equal to a base value of 0111... plus one bit if the last
+            // remaining mantissa bit is 1.
+            const uint32_t base_rounding_bias =
+                (last_mantissa_bit_mask >> 1) - 1;
+            const uint32_t x_last_mantissa_bit =
+                (value_as_int & last_mantissa_bit_mask) >> (23 - mantissa_bits);
+            const uint32_t x_rounding_bias =
+                x_last_mantissa_bit + base_rounding_bias;
+
+            // Add rounding bias, and mask out truncated bits.  Note that the
+            // case where adding the rounding bias overflows into the exponent
+            // bits is correct; the non-masked mantissa bits will all be zero,
+            // and the exponent will be incremented by one.
+            const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
+            value_as_int = value_as_int + x_rounding_bias;
+            value_as_int = value_as_int & truncation_mask;
+          }
+          if (exponent_bits < 8) {
+            // Masks for f32 values.
+            const uint32_t f32_sign_bit_mask = 1u << 31;
+            const uint32_t f32_exp_bits_mask = 0xffu << 23;
+
+            // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the
+            // most- significant bit -- is equal to 1.0f for all exponent sizes.
+            // Adding 2^(n-1)-1 to this gives us the highest non-infinite
+            // exponent for a bit- size of n, and subtracting 2^(n-1)-1 from
+            // this gives us the lowest' exponent (corresponding to 0.0f).
+            //
+            // Thus, the f32 exponent corresponding to the highest non-infinite
+            // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
+            // exponent corresponding to the lowest exponent for a bit size of n
+            // is (2^7-1) - 2^(n-1)-1.
+            //
+            // Note that we have already checked that exponents_bits >= 1.
+            const uint32_t f32_exponent_bias = (1 << 7) - 1;
+            const uint32_t reduced_exponent_bias =
+                (1 << (exponent_bits - 1)) - 1;
+            const uint32_t reduced_max_exponent =
+                f32_exponent_bias + reduced_exponent_bias;
+            const uint32_t reduced_min_exponent =
+                f32_exponent_bias - reduced_exponent_bias;
+
+            // Do we overflow or underflow?
+            const uint32_t x_exponent = value_as_int & f32_exp_bits_mask;
+            const bool x_overflows = x_exponent > (reduced_max_exponent << 23);
+            const bool x_underflows =
+                x_exponent <= (reduced_min_exponent << 23);
+
+            // Compute appropriately-signed values of zero and infinity.
+            const uint32_t x_signed_zero = value_as_int & f32_sign_bit_mask;
+            const uint32_t x_signed_inf = x_signed_zero | f32_exp_bits_mask;
+
+            // Force to zero or infinity if overflow or underflow.  (Note that
+            // this truncates all denormal values to zero, rather than rounding
+            // them.)
+            value_as_int = x_overflows ? x_signed_inf : value_as_int;
+            value_as_int = x_underflows ? x_signed_zero : value_as_int;
+          }
+
+          float reduced_result = tensorflow::bit_cast<float>(value_as_int);
+          if (std::isnan(elem)) {
+            reduced_result = mantissa_bits > 0
+                                 ? elem
+                                 : std::numeric_limits<float>::infinity();
+          }
+          return reduced_result;
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_same<
+                                  double, NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    return InvalidArgument("Double not supported for reduce precision");
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_integral<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    return InvalidArgument("Unsupported type for reduce precision");
+  }
+
+  Status HandleReducePrecision(HloInstruction* reduce_precision) override {
+    return HandleReducePrecision<ElementwiseT>(reduce_precision);
+  }
+
  private:
   template <typename IndexT>
   StatusOr<std::unique_ptr<Literal>> DynamicSlice(
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc
new file mode 100644
index 0000000000..ada2134501
--- /dev/null
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+namespace {
+
+// Visitor for removing implicit broadcasts.
+class ImplicitBroadcastVisitor : public DfsHloVisitorWithDefault {
+ public:
+  Status DefaultAction(HloInstruction* hlo_instruction) override {
+    return Status::OK();
+  }
+
+  Status HandleElementwiseBinary(HloInstruction* hlo) override {
+    return ReplaceImplicitBroadcastOperands(hlo);
+  }
+
+  Status HandleClamp(HloInstruction* hlo) override {
+    // Clamp is the only element-wise ternary operation.
+    return ReplaceImplicitBroadcastOperands(hlo);
+  }
+
+  // Returns whether any modification has been made to any visited instruction.
+  bool changed() const { return changed_; }
+
+ private:
+  // Iterates through the operands of 'hlo' and replace any operands which are
+  // implicitly broadcast with the equivalent sequence of broadcast and reshape
+  // instructions. An operand is considered to be implicitly broadcast if the
+  // operand shape does have the same dimensions as the shape of 'hlo'.
+  Status ReplaceImplicitBroadcastOperands(HloInstruction* hlo) {
+    auto fadd = [hlo](std::unique_ptr<HloInstruction> x) {
+      return hlo->parent()->AddInstruction(std::move(x));
+    };
+    std::vector<HloInstruction*> operands;
+    bool operands_changed = false;
+    for (int i = 0; i < hlo->operand_count(); ++i) {
+      HloInstruction* operand = hlo->mutable_operand(i);
+      if (!ShapeUtil::SameDimensions(hlo->shape(), operand->shape())) {
+        HloInstruction* new_operand = hlo->parent()->AddInstruction(
+            HloInstruction::CreateBroadcastSequence(hlo->shape(), operand,
+                                                    fadd));
+        operands.push_back(new_operand);
+        operands_changed = true;
+      } else {
+        operands.push_back(operand);
+      }
+    }
+    if (operands_changed) {
+      // Create a new HLO instruction because the HloInstruction::Replace*
+      // methods check that the shape does not change with the replacement.
+      HloInstruction* new_hlo = hlo->parent()->AddInstruction(
+          hlo->CloneWithNewOperands(hlo->shape(), operands));
+      TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo));
+      changed_ = true;
+    }
+    return Status::OK();
+  }
+
+  bool changed_ = false;
+};
+
+}  // namespace
+
+StatusOr<bool> ImplicitBroadcastRemover::Run(HloModule* module) {
+  VLOG(1) << "Removing implicit broadcast from module " << module->name();
+  XLA_VLOG_LINES(2,
+                 "Before removing implicit broadcasts:\n" + module->ToString());
+
+  ImplicitBroadcastVisitor visitor;
+  for (HloComputation* computation : module->computations()) {
+    TF_RETURN_IF_ERROR(computation->Accept(&visitor));
+  }
+
+  if (visitor.changed()) {
+    // HLO instructions with implicitly broadcast operands are cloned and left
+    // for dead. Remove them.
+    HloDCE dce;
+    TF_RETURN_IF_ERROR(dce.Run(module).status());
+  }
+
+  XLA_VLOG_LINES(2,
+                 "After removing implicit broadcasts:\n" + module->ToString());
+
+  return visitor.changed();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.h b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h
new file mode 100644
index 0000000000..aa325dc8a3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// Pass which replaces all implicit broadcasts with their equivalent sequence of
+// explicit broadcast and reshape instructions.
+class ImplicitBroadcastRemover : public HloPassInterface {
+ public:
+  ImplicitBroadcastRemover() {}
+  ~ImplicitBroadcastRemover() override {}
+
+  tensorflow::StringPiece name() const override {
+    return "implicit-broadcast-remover";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
new file mode 100644
index 0000000000..8c7b38dd1b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
@@ -0,0 +1,176 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+class ImplicitBroadcastRemoverTest : public HloVerifiedTestBase {
+ protected:
+  ImplicitBroadcastRemover remover_;
+};
+
+TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+  auto param0 =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0"));
+  auto param1 =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_FALSE(remover_.Run(&module()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Parameter(), op::Parameter()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "scalar_param"));
+  auto param1 =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kPower, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+
+  EXPECT_FALSE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+
+  EXPECT_THAT(root, op::Power(op::Broadcast(op::Parameter()), op::Parameter()));
+
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
+  auto param0 =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0"));
+  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, ShapeUtil::MakeShape(F32, {1, 4, 1}), "p1"));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kSubtract, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Subtract(op::Parameter(),
+                                 op::Broadcast(op::Reshape(op::Parameter()))));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {1, 4, 1});
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "scalar_param"));
+  auto param1 =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kSubtract, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root,
+              op::Subtract(op::Broadcast(op::Parameter()), op::Parameter()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6, 8});
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {1, 4, 1, 8}), "p0"));
+  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, ShapeUtil::MakeShape(F32, {1, 1, 6, 8}), "p1"));
+  auto param2 = builder.AddInstruction(HloInstruction::CreateParameter(
+      2, ShapeUtil::MakeShape(F32, {2, 1, 6, 8}), "p2"));
+  builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
+                                                       param0, param1, param2));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Reshape(op::Parameter())),
+                              op::Broadcast(op::Reshape(op::Parameter())),
+                              op::Broadcast(op::Reshape(op::Parameter()))));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest,
+       TernaryScalarAndDegenerateDimensionBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
+  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, ShapeUtil::MakeShape(F32, {1, 4, 6}), "p1"));
+  auto param2 =
+      builder.AddInstruction(HloInstruction::CreateParameter(2, shape, "p2"));
+  builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
+                                                       param0, param1, param2));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Parameter()),
+                              op::Broadcast(op::Reshape(op::Parameter())),
+                              op::Parameter()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape()));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 4410647f84..d4820d1b6d 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -578,6 +578,7 @@ xla_test(
 xla_test(
     name = "reduce_precision_test",
     srcs = ["reduce_precision_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index 4756ba0968..dc7ce3253c 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -249,7 +249,9 @@ INSTANTIATE_TEST_CASE_P(ReducePrecisionAccuracyTest,
 // ReducePrecisionInsertion passes.
 class ReducePrecisionInsertionTest : public ClientLibraryTestBase {};
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionBeforeFusion)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -276,7 +278,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) {
   ComputeAndCompareR1<float>(&builder, {0.0f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionSkippedAfterFusion)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -300,7 +304,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) {
   ComputeAndCompareR1<float>(&builder, {-1.00001f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionAddedAfterFusion)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -322,7 +328,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) {
   ComputeAndCompareR1<float>(&builder, {-1.0f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedFusionContains) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionSkippedFusionContains)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -345,7 +353,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedFusionContains) {
   ComputeAndCompareR1<float>(&builder, {-1.00001f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedFusionContains) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionAddedFusionContains)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
-- 
GitLab


From 86c10063c8521bd5482df13676bac3575540d9e2 Mon Sep 17 00:00:00 2001
From: Andrew Harp <andrewharp@users.noreply.github.com>
Date: Fri, 26 Jan 2018 20:13:45 -0500
Subject: [PATCH 284/966] Simplify Android/Tegra GPU makefile file lists
 (#16471)

* updating CUDA srcs for Makefile build to fix unsatisfied link error

* more makefile refactoring
---
 tensorflow/contrib/makefile/Makefile | 73 ++++++++--------------------
 1 file changed, 21 insertions(+), 52 deletions(-)

diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index c50f8ceec0..c573cf15da 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -606,7 +606,8 @@ $(wildcard tensorflow/core/util/*/*.cc) \
 tensorflow/core/util/version_info.cc
 # Remove duplicates (for version_info.cc)
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
-CORE_CC_EXCLUDE_SRCS := \
+
+CORE_CC_EXCLUDE_SRCS_NON_GPU := \
 $(wildcard tensorflow/core/*/*test.cc) \
 $(wildcard tensorflow/core/*/*testutil*) \
 $(wildcard tensorflow/core/*/*testlib*) \
@@ -626,49 +627,31 @@ $(wildcard tensorflow/core/lib/jpeg/*) \
 $(wildcard tensorflow/core/lib/png/*) \
 $(wildcard tensorflow/core/util/events_writer.*) \
 $(wildcard tensorflow/core/util/reporter.*) \
-$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \
-$(wildcard tensorflow/core/platform/default/stream_executor.*) \
 $(wildcard tensorflow/core/platform/default/test_benchmark.*) \
-$(wildcard tensorflow/core/platform/cuda.h) \
-$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \
 $(wildcard tensorflow/core/platform/cloud/*) \
 $(wildcard tensorflow/core/platform/google/*) \
 $(wildcard tensorflow/core/platform/google/*/*) \
 $(wildcard tensorflow/core/platform/jpeg.*) \
 $(wildcard tensorflow/core/platform/png.*) \
 $(wildcard tensorflow/core/platform/s3/*) \
-$(wildcard tensorflow/core/platform/stream_executor.*) \
 $(wildcard tensorflow/core/platform/windows/*) \
-$(wildcard tensorflow/core/user_ops/*.cu.cc) \
-$(wildcard tensorflow/core/common_runtime/gpu/*) \
-$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*) \
 $(wildcard tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.*) \
 $(wildcard tensorflow/core/grappler/inputs/file_input_yielder.*) \
-$(wildcard tensorflow/core/grappler/clusters/single_machine.*)
+$(wildcard tensorflow/core/grappler/clusters/single_machine.*) \
+tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+
+CORE_CC_EXCLUDE_SRCS := \
+$(CORE_CC_EXCLUDE_SRCS_NON_GPU) \
+$(wildcard tensorflow/core/platform/stream_executor.*) \
+$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \
+$(wildcard tensorflow/core/platform/cuda.h) \
+$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \
+$(wildcard tensorflow/core/user_ops/*.cu.cc) \
+$(wildcard tensorflow/core/common_runtime/gpu/*) \
+$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*)
 
 ifeq ($(BUILD_FOR_TEGRA),1)
-CORE_CC_ALL_SRCS := \
-$(wildcard tensorflow/core/*.cc) \
-$(wildcard tensorflow/core/common_runtime/*.cc) \
-$(wildcard tensorflow/core/common_runtime/gpu/*.cc) \
-$(wildcard tensorflow/core/framework/*.cc) \
-$(wildcard tensorflow/core/graph/*.cc) \
-$(wildcard tensorflow/core/platform/*.cc) \
-$(wildcard tensorflow/core/platform/*/*.cc) \
-$(wildcard tensorflow/core/platform/*/*/*.cc) \
-$(wildcard tensorflow/core/util/*.cc) \
-$(wildcard tensorflow/core/util/*/*.cc) \
-$(wildcard tensorflow/cc/training/*.cc) \
-$(wildcard tensorflow/stream_executor/*.cc) \
-$(wildcard tensorflow/stream_executor/*/*.cc) \
-$(wildcard tensorflow/core/grappler/optimizers/*.cc) \
-$(wildcard tensorflow/core/grappler/*.cc) \
-$(wildcard tensorflow/core/grappler/costs/*.cc) \
-$(wildcard tensorflow/core/grappler/clusters/*.cc) \
-$(wildcard tensorflow/core/grappler/utils/*.cc) \
-$(wildcard tensorflow/core/lib/core/*.cc) \
-$(wildcard tensorflow/core/lib/*/*.cc) \
-tensorflow/core/grappler/inputs/utils.cc \
+CORE_CC_ALL_SRCS := $(CORE_CC_ALL_SRCS) \
 tensorflow/core/kernels/concat_lib_gpu.cc \
 tensorflow/core/kernels/cuda_solvers.cc \
 tensorflow/core/kernels/cudnn_pooling_gpu.cc \
@@ -677,28 +660,14 @@ tensorflow/core/kernels/fractional_avg_pool_op.cc \
 tensorflow/core/kernels/fractional_max_pool_op.cc \
 tensorflow/core/kernels/fractional_pool_common.cc \
 tensorflow/core/kernels/pooling_ops_3d.cc \
-tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
+tensorflow/core/kernels/sparse_fill_empty_rows_op.cc \
+tensorflow/core/kernels/list_kernels.cc \
+$(wildcard tensorflow/core/common_runtime/gpu/*.cc) \
+$(wildcard tensorflow/stream_executor/*.cc) \
+$(wildcard tensorflow/stream_executor/*/*.cc)
 
 CORE_CC_EXCLUDE_SRCS := \
-$(wildcard tensorflow/core/*/*test.cc) \
-$(wildcard tensorflow/core/*/*testutil*) \
-$(wildcard tensorflow/core/*/*testlib*) \
-$(wildcard tensorflow/core/*/*/*test.cc) \
-$(wildcard tensorflow/core/*/*/*testutil*) \
-$(wildcard tensorflow/core/framework/op_gen_lib.cc) \
-$(wildcard tensorflow/core/lib/gif/*) \
-$(wildcard tensorflow/core/lib/jpeg/*) \
-$(wildcard tensorflow/core/lib/png/*) \
-$(wildcard tensorflow/core/lib/db/*) \
-$(wildcard tensorflow/core/platform/jpeg.*) \
-$(wildcard tensorflow/core/platform/png.*) \
-$(wildcard tensorflow/core/platform/cloud/*) \
-$(wildcard tensorflow/core/platform/s3/*) \
-$(wildcard tensorflow/core/platform/windows/*) \
-$(wildcard tensorflow/core/*/*/*testlib*) \
-$(wildcard tensorflow/cc/training/*test.cc) \
-tensorflow/core/lib/io/record_reader.cc \
-tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+$(CORE_CC_EXCLUDE_SRCS_NON_GPU)
 
 CUDA_CC_SRCS := $(wildcard tensorflow/core/kernels/*.cu.cc)
 CUDA_CC_OBJS := $(addprefix $(OBJDIR), $(CUDA_CC_SRCS:.cc=.o))
-- 
GitLab


From 0b164dd43bbf76547836a9ae6ae424b9cda65968 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Fri, 26 Jan 2018 17:12:23 -0800
Subject: [PATCH 285/966] [XLA] Add a DeviceAllocator* argument to compilation.

In a later change, the GPU backend will use this allocator to reserve
scratch memory when trying out different convolution algorithms during
compilation.

PiperOrigin-RevId: 183469579
---
 .../compiler/jit/kernels/xla_launch_op.cc     |  7 ++--
 .../compiler/jit/xla_compilation_cache.cc     |  1 +
 tensorflow/compiler/tf2xla/xla_compiler.h     | 13 ++++++++
 .../compiler/xla/client/local_client.cc       | 19 ++++++++---
 tensorflow/compiler/xla/client/local_client.h | 12 +++++++
 tensorflow/compiler/xla/service/compiler.h    | 29 ++++++++++++++--
 .../compiler/xla/service/cpu/cpu_compiler.cc  |  6 ++--
 .../compiler/xla/service/cpu/cpu_compiler.h   |  6 ++--
 .../compiler/xla/service/gpu/gpu_compiler.cc  | 13 +++++---
 .../compiler/xla/service/gpu/gpu_compiler.h   |  6 ++--
 tensorflow/compiler/xla/service/hlo_runner.cc |  6 ++--
 .../xla/service/interpreter/compiler.cc       | 10 +++---
 .../xla/service/interpreter/compiler.h        |  9 +++--
 .../compiler/xla/service/llvm_compiler.cc     | 12 ++++---
 .../compiler/xla/service/llvm_compiler.h      |  9 +++--
 .../compiler/xla/service/local_service.cc     |  5 +--
 .../compiler/xla/service/local_service.h      |  7 ++--
 tensorflow/compiler/xla/service/service.cc    | 33 +++++++++++--------
 tensorflow/compiler/xla/service/service.h     | 18 ++++++----
 .../compiler/xla/tests/codegen_test_base.cc   |  6 ++--
 .../compiler/xla/tests/llvm_compiler_test.cc  |  6 ++--
 .../dumped_computation_to_operation_list.cc   |  6 ++--
 .../xla/tools/dumped_computation_to_text.cc   |  6 ++--
 23 files changed, 175 insertions(+), 70 deletions(-)

diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 4842877d9a..1d7bd22e60 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -248,12 +248,16 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client());
 
+  // Builds an XLA allocator for the device.
+  XlaAllocator xla_allocator(client->platform(), ctx);
+
   XlaCompiler::Options options;
   options.client = client;
   options.device_type = &cache->device_type();
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
+  options.device_allocator = &xla_allocator;
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
@@ -264,9 +268,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   VLOG(1) << "Executing XLA Computation...";
 
-  // Builds an XLA allocator for the device.
-  XlaAllocator xla_allocator(client->platform(), ctx);
-
   std::unique_ptr<xla::ShapedBuffer> output;
   // Build xla::ShapedBuffers that point directly to the Tensor buffers.
   std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers;
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index bfff52c55a..21d3a54f1b 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -223,6 +223,7 @@ Status XlaCompilationCache::BuildExecutable(
   xla::ExecutableBuildOptions build_options;
   build_options.set_device_ordinal(client_->default_device_ordinal());
   build_options.set_result_layout(result.xla_output_shape);
+  build_options.set_device_allocator(options.device_allocator);
 
   auto compile_result =
       client_->Compile(*result.computation, argument_layouts, build_options);
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 6a46e54f61..30d3c05ee9 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -235,6 +235,19 @@ class XlaCompiler {
     // device is created, and can be used to create metadata objects
     // that can be accessed by XLA op kernels.
     std::function<Status(ResourceMgr*)>* populate_resource_manager = nullptr;
+
+    // If not nullptr, this memory allocator can be used by the compiler for
+    // temporary allocations it might want to make during compilation.
+    //
+    // For example, the compiler may want to try out different algorithms and
+    // choose the fastest one, and it might run those algorithms over buffers
+    // created using this allocator.
+    //
+    // The compiler can function correctly without an explicit allocator given
+    // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly
+    // allocate most or all available memory on the device, leaving none for the
+    // compiler to access, unless it can use TensorFlow's allocator.
+    xla::DeviceMemoryAllocator* device_allocator = nullptr;
   };
 
   explicit XlaCompiler(Options options);
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index fbeedfcecd..e45787fca6 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -49,6 +49,16 @@ const Shape* ExecutableBuildOptions::result_layout() const {
   return result_layout_set_ ? &result_layout_ : nullptr;
 }
 
+ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator(
+    DeviceMemoryAllocator* allocator) {
+  device_allocator_ = allocator;
+  return *this;
+}
+
+DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
+  return device_allocator_;
+}
+
 namespace {
 StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
                                                    Backend* backend) {
@@ -270,10 +280,11 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
   int device_ordinal = options.device_ordinal() == -1
                            ? default_device_ordinal()
                            : options.device_ordinal();
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                      local_service_->CompileExecutable(
-                          computation.handle(), argument_layouts,
-                          options.result_layout(), device_ordinal));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Executable> executable,
+      local_service_->CompileExecutable(computation.handle(), argument_layouts,
+                                        options.result_layout(), device_ordinal,
+                                        options.device_allocator()));
   return WrapUnique(new LocalExecutable(std::move(executable),
                                         local_service_->mutable_backend(),
                                         device_ordinal, options));
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 19fd14f76b..843ad7aa85 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -53,10 +53,22 @@ class ExecutableBuildOptions {
   ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
   const Shape* result_layout() const;
 
+  // If set, this specifies an allocator that can be used to allocate temporary
+  // space on the device during compilation.  For example, the compiler might
+  // want to run various algorithms on the device and pick the fastest one -- it
+  // might allocate buffers for use by these algorithms using this allocator.
+  //
+  // This does not need to be the same as the DeviceMemoryAllocator passed when
+  // running the executable.
+  ExecutableBuildOptions& set_device_allocator(
+      DeviceMemoryAllocator* allocator);
+  DeviceMemoryAllocator* device_allocator() const;
+
  private:
   int device_ordinal_ = -1;
   Shape result_layout_;
   bool result_layout_set_ = false;
+  DeviceMemoryAllocator* device_allocator_ = nullptr;
 };
 
 class LocalExecutable {
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index fc67330f5c..74fd24edf8 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -72,8 +72,18 @@ class AotCompilationOptions {
   // Returns the ID of the platform to which these options apply.
   virtual perftools::gputools::Platform::Id PlatformId() const = 0;
 
+  // Optional allocator that may be used for allocating temp space on the device
+  // during compilation.
+  DeviceMemoryAllocator* device_allocator() const { return device_allocator_; }
+  void set_device_allocator(DeviceMemoryAllocator* device_allocator) {
+    device_allocator_ = device_allocator;
+  }
+
  protected:
   AotCompilationOptions() = default;
+
+ private:
+  DeviceMemoryAllocator* device_allocator_ = nullptr;
 };
 
 // Abstract compiler interface that is subclassed for compilation on a
@@ -99,9 +109,16 @@ class Compiler {
 
   // Runs Hlo passes to optimize the given Hlo module, returns the optimized
   // module.
+  //
+  // If device_allocator is not null, the compiler may use it to allocate temp
+  // space on the device for use during compilation.  For example, the compiler
+  // may allocate buffers on the device and then run variants of a given
+  // algorithm over those buffers, to see which variant is fastest.  Any space
+  // allocated should be deallocated before this function returns.
   virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor) = 0;
+      perftools::gputools::StreamExecutor* executor,
+      DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for execution on a device given by the executor,
   // and returns an executable object or an error status. No HLO passes are
@@ -112,21 +129,27 @@ class Compiler {
   // The compiler may optionally specialize to the individual device
   // (not just type of device) indicated by the executor.
   //
+  // device_allocator is optional; see RunHloPasses.
+  //
   // Use the overload below to compile computations that run in parallel.
   virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor) = 0;
+      perftools::gputools::StreamExecutor* executor,
+      DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
   // sequence of executable objects.
   //
+  // device_allocator is optional; see RunHloPasses.
+  //
   // TODO(b/68666782): Remove this method after adding support for multiple
   // modules to RunHloPasses and RunBackends.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec) = 0;
+          stream_exec,
+      DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for ahead-of-time execution.  This is intended for
   // use in static compilation.
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 33af77e1a8..3fdb3d5ca6 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -437,7 +437,8 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) {
 
 StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* /*stream_exec*/) {
+    perftools::gputools::StreamExecutor* /*stream_exec*/,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(2) << "Before optimization:";
   XLA_VLOG_LINES(2, module->ToString());
 
@@ -450,7 +451,8 @@ StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* stream_exec) {
+    perftools::gputools::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   const string timer_message =
       "Compiling [" + module->name() + "] for CPU using JIT";
   XLA_SCOPED_LOGGING_TIMER(timer_message);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index ebed7058d8..3498139ab9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -118,11 +118,13 @@ class CpuCompiler : public LLVMCompiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 0cca3ca092..495ae1710f 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -212,7 +212,9 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) {
 
 // Modifies the given HLO module so that it will be accepted by IrEmitter.
 // Unlike optimization passes, the passes are necessary for correctness.
-tensorflow::Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
+tensorflow::Status PrepareHloModuleForIrEmitting(
+    HloModule* hlo_module, se::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   // In some cases, we have to place the result of an instruction in a temporary
   // buffer. For instance, the buffer that holds an external parameter is
   // assumed immutable at this point, and should not be reused for output
@@ -410,7 +412,8 @@ GpuCompiler::GpuCompiler()
                         .getPointerSize(0 /* default address space */)) {}
 
 StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
-    std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/) {
+    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* device_allocator) {
   XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses");
   Tracing::TraceMe annotation("HLO Transforms", module->name(),
                               /*is_expensive=*/true);
@@ -419,12 +422,14 @@ StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
 }
 
 StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
-    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec) {
+    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* device_allocator) {
   XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend");
 
   TF_RET_CHECK(stream_exec != nullptr);
 
-  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get()));
+  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get(), stream_exec,
+                                                   device_allocator));
 
   llvm::LLVMContext llvm_context;
   std::string buffer;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index 18e3434020..c352d4d846 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -51,11 +51,13 @@ class GpuCompiler : public LLVMCompiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 204a8bf748..e281538848 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -121,12 +121,14 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::ExecuteInternal(
   if (run_hlo_passes) {
     TF_ASSIGN_OR_RETURN(
         module, backend().compiler()->RunHloPasses(
-                    std::move(module), backend().default_stream_executor()));
+                    std::move(module), backend().default_stream_executor(),
+                    /*device_allocator=*/nullptr));
   }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
       backend().compiler()->RunBackend(std::move(module),
-                                       backend().default_stream_executor()));
+                                       backend().default_stream_executor(),
+                                       /*device_allocator=*/nullptr));
 
   se::Stream stream(backend().default_stream_executor());
   stream.Init();
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index dc63a2224d..c83880e030 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -70,15 +70,16 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
 }
 
 StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
-    std::unique_ptr<HloModule> hlo_module,
-    se::StreamExecutor* /*stream_exec*/) {
+    std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* /*stream_exec*/,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(1) << "Run hlo passes on graph " << hlo_module->name();
   TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
   return std::move(hlo_module);
 }
 
 StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
-    std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec) {
+    std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   TF_RET_CHECK(stream_exec != nullptr);
 
   VLOG(1) << "Run backend " << hlo_module->name();
@@ -96,7 +97,8 @@ StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> InterpreterCompiler::Compile(
     std::vector<std::unique_ptr<HloModule>> /*hlo_modules*/,
-    std::vector<std::vector<se::StreamExecutor*>> /*stream_execs*/) {
+    std::vector<std::vector<se::StreamExecutor*>> /*stream_execs*/,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   return tensorflow::errors::Unimplemented(
       "Compilation of multiple HLO modules is not supported on Interpreter.");
 }
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h
index 278cf51842..c8660c04d8 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.h
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.h
@@ -45,16 +45,19 @@ class InterpreterCompiler : public Compiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> hlo_modules,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec) override;
+          stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> hlo_modules,
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc
index 34f3419269..f98fc0400a 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.cc
+++ b/tensorflow/compiler/xla/service/llvm_compiler.cc
@@ -18,8 +18,8 @@ limitations under the License.
 namespace xla {
 StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
     std::vector<std::unique_ptr<HloModule>> modules,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-        stream_execs) {
+    std::vector<std::vector<perftools::gputools::StreamExecutor*>> stream_execs,
+    DeviceMemoryAllocator* device_allocator) {
   std::vector<std::unique_ptr<Executable>> result;
   for (size_t i = 0; i < modules.size(); i++) {
     if (stream_execs[i].size() != 1) {
@@ -27,10 +27,12 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
           "Model partitioning not implemented for the CPU/GPU compilers!");
     }
 
-    TF_ASSIGN_OR_RETURN(
-        modules[i], RunHloPasses(std::move(modules[i]), stream_execs[i][0]));
+    TF_ASSIGN_OR_RETURN(modules[i],
+                        RunHloPasses(std::move(modules[i]), stream_execs[i][0],
+                                     device_allocator));
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                        RunBackend(std::move(modules[i]), stream_execs[i][0]));
+                        RunBackend(std::move(modules[i]), stream_execs[i][0],
+                                   device_allocator));
     result.push_back(std::move(executable));
   }
 
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h
index c5393cef4f..d74e81bb7f 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.h
+++ b/tensorflow/compiler/xla/service/llvm_compiler.h
@@ -60,17 +60,20 @@ class LLVMCompiler : public Compiler {
   // Bring in
   //   StatusOr<std::unique_ptr<Executable>> RunBackend(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec)
+  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       DeviceMemoryAllocator* device_allocator)
   //   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec)
+  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       DeviceMemoryAllocator* device_allocator)
   using Compiler::RunBackend;
   using Compiler::RunHloPasses;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_execs) override;
+          stream_execs,
+      DeviceMemoryAllocator* device_allocator) override;
 
  protected:
   ModuleHook user_pre_optimization_hook_;
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index f30530db08..bb9fd447d9 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -71,7 +71,8 @@ LocalService::LocalService(const ServiceOptions& options,
 StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const ComputationHandle& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-    const Shape* result_layout, int device_ordinal) {
+    const Shape* result_layout, int device_ordinal,
+    DeviceMemoryAllocator* device_allocator) {
   TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
                       computation_tracker_.Resolve(computation));
   VersionedComputationHandle versioned_handle =
@@ -135,7 +136,7 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
                       execute_backend_->stream_executor(device_ordinal));
 
   return BuildExecutable(versioned_handle, std::move(module_config),
-                         execute_backend_.get(), executor);
+                         execute_backend_.get(), executor, device_allocator);
 }
 
 StatusOr<int> LocalService::ReplicaNumberToDeviceOrdinal(int replica_number) {
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index acbc726825..16c71b25c4 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -41,11 +41,14 @@ class LocalService : public Service {
 
   // Builds an Executable with the given argument layouts and options. If
   // result_layout is non-null, then the executable is compiled to produce a
-  // result of the given layout.
+  // result of the given layout.  If device_allocator is non-null, then the
+  // compiler may use it to allocate temp space on the device.  The compiler is
+  // responsible for freeing any memory it allocates this way.
   StatusOr<std::unique_ptr<Executable>> CompileExecutable(
       const ComputationHandle& computation,
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-      const Shape* result_layout, int device_ordinal);
+      const Shape* result_layout, int device_ordinal,
+      DeviceMemoryAllocator* device_allocator);
 
   // Returns the device ordinal that corresponds to the given replica number.
   //
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 849df1d8e6..fea6956345 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -337,7 +337,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     std::vector<VersionedComputationHandle> versioned_handles,
     std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
     Backend* backend,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors) {
+    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+    DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << Printf("BuildExecutable on service %p", this);
 
   // Dump computation proto state if flag is set.
@@ -383,7 +384,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
 
   TF_ASSIGN_OR_RETURN(
       std::vector<std::unique_ptr<Executable>> executables,
-      backend->compiler()->Compile(std::move(modules), std::move(executors)));
+      backend->compiler()->Compile(std::move(modules), std::move(executors),
+                                   device_allocator));
 
   for (size_t i = 0; i < versioned_handles.size(); ++i) {
     if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) {
@@ -396,8 +398,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
 
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const VersionedComputationHandle& versioned_handle,
-    std::unique_ptr<HloModuleConfig> module_config,
-    Backend* backend, se::StreamExecutor* executor) {
+    std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+    se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << Printf("BuildExecutable on service %p with handle %s", this,
                     versioned_handle.ToString().c_str());
 
@@ -430,11 +432,12 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
   TF_RETURN_IF_ERROR(MaybeDumpHloModule(*module));
 
   TF_ASSIGN_OR_RETURN(
-      module, backend->compiler()->RunHloPasses(std::move(module), executor));
+      module, backend->compiler()->RunHloPasses(std::move(module), executor,
+                                                device_allocator));
 
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Executable> executable,
-      backend->compiler()->RunBackend(std::move(module), executor));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      backend->compiler()->RunBackend(
+                          std::move(module), executor, device_allocator));
 
   if (!other_directory_path.empty()) {
     executable->set_session_module(std::move(session_module));
@@ -445,9 +448,9 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
 
 StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
     const VersionedComputationHandle& versioned_handle,
-    std::unique_ptr<HloModuleConfig> module_config,
-    Backend* backend, perftools::gputools::StreamExecutor* executor,
-    ExecutionProfile* profile) {
+    std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+    perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+    DeviceMemoryAllocator* device_allocator) {
   std::shared_ptr<Executable> executable =
       compilation_cache_.LookUp(versioned_handle, *module_config);
 
@@ -469,7 +472,7 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable_unique_ptr,
       BuildExecutable(versioned_handle, std::move(module_config), backend,
-                      executor));
+                      executor, device_allocator));
 
   if (profile != nullptr) {
     uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@@ -771,10 +774,14 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
 
   // Build the user computations into HloModules and compile to generate the
   // executables.
+  //
+  // TODO(jlebar): There's currently no way to pass a device allocator to
+  // ExecuteParallel, so we have to pass a null device_allocator below.
   TF_ASSIGN_OR_RETURN(
       std::vector<std::unique_ptr<Executable>> executables,
       BuildExecutables(versioned_handles, std::move(module_configs),
-                       execute_backend_.get(), all_executors));
+                       execute_backend_.get(), all_executors,
+                       /*device_allocator=*/nullptr));
   std::vector<Executable*> executable_ptrs;
   executable_ptrs.reserve(executables.size());
   for (const auto& executable : executables) {
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index ca77e8fe3a..6ce2419711 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -280,10 +280,15 @@ class Service : public ServiceInterface {
       const UserComputation& user_computation);
 
   // Builds an Executable for the given parameters.
+  //
+  // If device_allocator is not null, the compiler may use it to allocate temp
+  // buffers, which the compiler is responsible for freeing.  The allocator
+  // given here need not match the allocator used when running the executable.
   StatusOr<std::unique_ptr<Executable>> BuildExecutable(
       const VersionedComputationHandle& versioned_handle,
-      std::unique_ptr<HloModuleConfig> module_config,
-      Backend* backend, perftools::gputools::StreamExecutor* executor);
+      std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+      perftools::gputools::StreamExecutor* executor,
+      DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Same as BuildExecutable() above, but builds a list of Executables for the
   // given computations that may interact with each other.
@@ -291,16 +296,17 @@ class Service : public ServiceInterface {
       std::vector<VersionedComputationHandle> versioned_handles,
       std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
       Backend* backend,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors);
+      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+      DeviceMemoryAllocator* device_allocator);
 
   // Similar to BuildExecutable, but look in the compilation cache for the
   // executable first. If the executable is not in the cache, it is built and
   // inserted into the cache.
   StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
       const VersionedComputationHandle& versioned_handle,
-      std::unique_ptr<HloModuleConfig> module_config,
-      Backend* backend, perftools::gputools::StreamExecutor* executor,
-      ExecutionProfile* profile);
+      std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+      perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+      DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Runs the given executable with the given arguments and register the result
   // in the allocation tracker. The handle of the result from the tracker is
diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.cc b/tensorflow/compiler/xla/tests/codegen_test_base.cc
index e472408dcf..022641394f 100644
--- a/tensorflow/compiler/xla/tests/codegen_test_base.cc
+++ b/tensorflow/compiler/xla/tests/codegen_test_base.cc
@@ -21,9 +21,11 @@ StatusOr<std::unique_ptr<Executable>> CodegenTestBase::CompileToExecutable(
     std::unique_ptr<HloModule> hlo_module) {
   TF_ASSIGN_OR_RETURN(hlo_module, backend().compiler()->RunHloPasses(
                                       std::move(hlo_module),
-                                      backend().default_stream_executor()));
+                                      backend().default_stream_executor(),
+                                      /*device_allocator=*/nullptr));
   return backend().compiler()->RunBackend(std::move(hlo_module),
-                                          backend().default_stream_executor());
+                                          backend().default_stream_executor(),
+                                          /*device_allocator=*/nullptr);
 }
 
 StatusOr<std::unique_ptr<AotCompilationResult>>
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index b5b95967ff..7e92439c49 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -74,7 +74,8 @@ class LLVMCompilerTest : public ::testing::Test {
 
     ASSERT_TRUE(compiler
                     ->RunBackend(std::move(hlo_module),
-                                 backend_->default_stream_executor())
+                                 backend_->default_stream_executor(),
+                                 /*device_allocator=*/nullptr)
                     .ok());
 
     // Test that hooks were called.
@@ -98,7 +99,8 @@ class LLVMCompilerTest : public ::testing::Test {
     executors.push_back({backend_->default_stream_executor()});
     executors.push_back({backend_->default_stream_executor()});
 
-    EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors)));
+    EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors),
+                                   /*device_allocator=*/nullptr));
   }
 
  private:
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index 5ede37b873..4ad356d045 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -86,9 +86,9 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
       layouts.push_back(&program_shape->parameters(i));
     }
     StatusOr<std::unique_ptr<Executable>> executable =
-        local_service->CompileExecutable(computation.handle(), layouts,
-                                         &program_shape->result(),
-                                         /*device_ordinal=*/0);
+        local_service->CompileExecutable(
+            computation.handle(), layouts, &program_shape->result(),
+            /*device_ordinal=*/0, /*device_allocator=*/nullptr);
 
     const HloModule& module = executable.ValueOrDie()->module();
 
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 24417a0cb8..5ebb75a31c 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -61,9 +61,9 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
         layouts.push_back(&program_shape->parameters(i));
       }
       StatusOr<std::unique_ptr<Executable>> executable =
-          local_service->CompileExecutable(computation.handle(), layouts,
-                                           &program_shape->result(),
-                                           /*device_ordinal=*/0);
+          local_service->CompileExecutable(
+              computation.handle(), layouts, &program_shape->result(),
+              /*device_ordinal=*/0, /*device_allocator=*/nullptr);
 
       const HloModule& module = executable.ValueOrDie()->module();
 
-- 
GitLab


From 8fc47fa3af0e7bc1652e7180c699a270bcc71bbd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 17:18:16 -0800
Subject: [PATCH 286/966] Raise to 4 the shard counts of
 //third_party/tensorflow/contrib/{factorization:kmeans_test,linear_optimizer:sdca_estimator_test}

These tests were getting flaky timeouts when run under asan, sometimes taking
longer than the 5 minute timeout.  Increasing the shard count to 4 seems to be
sufficient to cause them not to time out.

PiperOrigin-RevId: 183470183
---
 tensorflow/contrib/factorization/BUILD    | 1 +
 tensorflow/contrib/linear_optimizer/BUILD | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index fe86a20ab1..180f1b68f3 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -221,6 +221,7 @@ py_test(
     name = "kmeans_test",
     size = "medium",
     srcs = ["python/ops/kmeans_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["notsan"],  # b/67512932
     deps = [
diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD
index fe2f183ac9..cea3627ed5 100644
--- a/tensorflow/contrib/linear_optimizer/BUILD
+++ b/tensorflow/contrib/linear_optimizer/BUILD
@@ -126,6 +126,7 @@ py_library(
 py_test(
     name = "sdca_estimator_test",
     srcs = ["python/sdca_estimator_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":sdca_estimator_py",
-- 
GitLab


From ca3ac2a464b92f4c0498dfde875f99102a0d410c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 17:38:06 -0800
Subject: [PATCH 287/966] Fixed bug: inconsistency with how damping
 normalization was applied to ConvDiagonalFB blocks.

PiperOrigin-RevId: 183472440
---
 tensorflow/contrib/kfac/python/ops/fisher_blocks.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
index 9436caf961..0d2fa706f5 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@@ -457,7 +457,9 @@ class ConvDiagonalFB(FisherBlock):
     self._num_locations = (
         inputs_shape[1] * inputs_shape[2] //
         (self._strides[1] * self._strides[2]))
-    self._damping = normalize_damping(damping, self._num_locations)
+
+    self._damping = (self._num_locations
+                     * normalize_damping(damping, self._num_locations))
 
     self._factor = self._layer_collection.make_or_get_factor(
         fisher_factors.ConvDiagonalFactor,
-- 
GitLab


From 704361ad3650ebc891167adc41c459ca93392060 Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Fri, 26 Jan 2018 17:50:03 -0800
Subject: [PATCH 288/966] Create different data for each Literal when creating
 fake data. Thread a generator through the functions for creating fake
 arguments so the same generator can be reused which avoids repeating the same
 data patterns for each argument generated.

Also tweak the position-dependent biasing heuristic to create both positive and
negative numbers for small literals.

PiperOrigin-RevId: 183473588
---
 tensorflow/compiler/xla/tests/test_utils.cc | 188 +++++++++++---------
 1 file changed, 105 insertions(+), 83 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 8b10aef5b8..b060fb13b1 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -24,51 +24,127 @@ namespace xla {
 namespace {
 
 template <typename FloatT>
-void PopulateWithRandomFloatingPointData(Literal* literal) {
+void PopulateWithRandomFloatingPointData(Literal* literal,
+                                         std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
-  std::minstd_rand0 engine;
-  // Create uniform numbers between 1 and 1.125 ot avoid creating denormal
+  // Create uniform numbers between 1 and 1.125 to avoid creating denormal
   // numbers.
   std::uniform_real_distribution<FloatT> generator(1.0f, 1.125f);
+  const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000;
   TF_CHECK_OK(literal->Populate<FloatT>(
       [&](tensorflow::gtl::ArraySlice<int64> indices) {
-        // Generate a random uniforma number from -0.0625 and 0.0625 and bias it
-        // with  a position dependent nubmer with mean 0.037109375. These number
+        // Generate a random uniform number from -0.0625 and 0.0625 and bias it
+        // with a position dependent number with mean 0.037109375. These number
         // should allow for long chains of accumulation without being too close
-        // to zero or to large to accumulate all numbers accurately.
-        return (generator(engine) - 1.0625) +
-               static_cast<FloatT>(Product(indices) % 113 - 47) /
-                   static_cast<FloatT>(256.0f);
+        // to zero or too large to accumulate all numbers accurately. Only do
+        // this for large literals where the number of elements is much greater
+        // than 47 otherwise only negative values are produced.
+        //
+        // The value is positionally biased using a product of the indices. Add
+        // one to each index value to avoid collapsing to zero if any of the
+        // indices are zero.
+        int64 index_product = 1;
+        for (int64 i : indices) {
+          index_product *= (1 + i);
+        }
+        const int64 negative_bias = should_index_bias ? 47 : 0;
+        FloatT index_bias =
+            static_cast<FloatT>(index_product % 113 - negative_bias) /
+            static_cast<FloatT>(256.0f);
+        return (generator(*engine) - 1.0625) + index_bias;
       }));
 }
 
 // The standard library does not have a case for bfloat16, unsurprisingly, so we
 // handle that one specially.
 template <>
-void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal) {
+void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal,
+                                                   std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(), BF16);
-  std::minstd_rand0 engine;
   std::uniform_real_distribution<float> generator(-0.9f, 1.0f);
   TF_CHECK_OK(literal->Populate<bfloat16>(
       [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-        return static_cast<bfloat16>(generator(engine));
+        return static_cast<bfloat16>(generator(*engine));
       }));
 }
 
 template <typename IntT>
-void PopulateWithRandomIntegralData(Literal* literal) {
+void PopulateWithRandomIntegralData(Literal* literal,
+                                    std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<IntT>());
-  std::minstd_rand0 engine;
   std::uniform_int_distribution<IntT> generator(
       std::numeric_limits<IntT>::lowest(), std::numeric_limits<IntT>::max());
   TF_CHECK_OK(literal->Populate<IntT>(
       [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-        return generator(engine);
+        return generator(*engine);
       }));
 }
 
+// Similar to MakeFakeLiteral but takes a random number generator engine to
+// enable reusing the engine across randomly generated literals.
+StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
+    const Shape& shape, std::minstd_rand0* engine) {
+  if (ShapeUtil::IsTuple(shape)) {
+    std::vector<std::unique_ptr<Literal>> elements;
+    for (const Shape& element_shape : shape.tuple_shapes()) {
+      TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> element,
+                          MakeFakeLiteralInternal(element_shape, engine));
+      elements.push_back(std::move(element));
+    }
+    return Literal::MakeTupleOwned(std::move(elements));
+  }
+  std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
+  switch (shape.element_type()) {
+    case BF16:
+      PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine);
+      break;
+    case F32:
+      PopulateWithRandomFloatingPointData<float>(literal.get(), engine);
+      break;
+    case F64:
+      PopulateWithRandomFloatingPointData<double>(literal.get(), engine);
+      break;
+    case S8:
+      PopulateWithRandomIntegralData<int8>(literal.get(), engine);
+      break;
+    case U8:
+      PopulateWithRandomIntegralData<uint8>(literal.get(), engine);
+      break;
+    case S16:
+      PopulateWithRandomIntegralData<int16>(literal.get(), engine);
+      break;
+    case U16:
+      PopulateWithRandomIntegralData<uint16>(literal.get(), engine);
+      break;
+    case S32:
+      PopulateWithRandomIntegralData<int32>(literal.get(), engine);
+      break;
+    case U32:
+      PopulateWithRandomIntegralData<uint32>(literal.get(), engine);
+      break;
+    case S64:
+      PopulateWithRandomIntegralData<int64>(literal.get(), engine);
+      break;
+    case U64:
+      PopulateWithRandomIntegralData<uint64>(literal.get(), engine);
+      break;
+    case PRED: {
+      std::uniform_int_distribution<int> generator(0, 1);
+      TF_CHECK_OK(literal->Populate<bool>(
+          [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
+            return generator(*engine);
+          }));
+      break;
+    }
+    default:
+      return Unimplemented("Unsupported type for fake literal generation: %s",
+                           ShapeUtil::HumanString(shape).c_str());
+  }
+  return std::move(literal);
+}
+
 // Matches binary addition computations.
 bool LooksLikeSum(const HloComputation& computation) {
   const HloInstruction* const root = computation.root_instruction();
@@ -95,15 +171,15 @@ bool NeedsZeroInitValue(const HloUse& use) {
 // Generate random values that are constrained to the input_shape minus the
 // output_shape so as not to produce wrapping slices, for instance.
 std::unique_ptr<Literal> MakeRandomNonwrappingSliceIndex(
-    const Shape& input_shape, const Shape& slice_shape) {
+    const Shape& input_shape, const Shape& slice_shape,
+    std::minstd_rand0* engine) {
   const int64 rank = ShapeUtil::Rank(input_shape);
   std::vector<int32> start_indices(rank);
-  std::minstd_rand0 engine;
   for (int i = 0; i < rank; ++i) {
     const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) -
                               ShapeUtil::GetDimension(slice_shape, i);
     std::uniform_int_distribution<int32> generator(0, upper_bound);
-    start_indices[i] = generator(engine);
+    start_indices[i] = generator(*engine);
   }
   return Literal::CreateR1<int32>(start_indices);
 }
@@ -150,7 +226,7 @@ std::vector<HloInstruction*> FindConstrainedUses(
 // zero in the case of init_values for reductions).
 StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
     const tensorflow::gtl::ArraySlice<HloInstruction*> constrained_uses,
-    const HloInstruction& param) {
+    const HloInstruction& param, std::minstd_rand0* engine) {
   HloInstruction* needs_index = nullptr;
   HloInstruction* needs_zero = nullptr;
   for (HloInstruction* use : constrained_uses) {
@@ -185,93 +261,39 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
   }
   if (needs_index != nullptr) {
     return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(),
-                                           needs_index->shape());
+                                           needs_index->shape(), engine);
   } else if (needs_zero != nullptr) {
     return Literal::CreateFromShape(param.shape());
   } else {
-    return MakeFakeLiteral(param.shape());
+    return MakeFakeLiteralInternal(param.shape(), engine);
   }
 }
 
 // Given a module entry parameter, use the dataflow analysis to see if a
 // special case literal must be created, or if we can generate fake data.
 StatusOr<std::unique_ptr<Literal>> MakeConstrainedArgument(
-    const HloDataflowAnalysis& dataflow, const HloInstruction& param) {
+    const HloDataflowAnalysis& dataflow, const HloInstruction& param,
+    std::minstd_rand0* engine) {
   const auto constrained_uses = FindConstrainedUses(dataflow, param);
-  return CreateLiteralForConstrainedUses(constrained_uses, param);
+  return CreateLiteralForConstrainedUses(constrained_uses, param, engine);
 }
 
 }  // namespace
 
 StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
-  if (ShapeUtil::IsTuple(shape)) {
-    std::vector<std::unique_ptr<Literal>> elements;
-    for (const Shape& element_shape : shape.tuple_shapes()) {
-      TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> element,
-                          MakeFakeLiteral(element_shape));
-      elements.push_back(std::move(element));
-    }
-    return Literal::MakeTupleOwned(std::move(elements));
-  }
-  std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
-  switch (shape.element_type()) {
-    case BF16:
-      PopulateWithRandomFloatingPointData<bfloat16>(literal.get());
-      break;
-    case F32:
-      PopulateWithRandomFloatingPointData<float>(literal.get());
-      break;
-    case F64:
-      PopulateWithRandomFloatingPointData<double>(literal.get());
-      break;
-    case S8:
-      PopulateWithRandomIntegralData<int8>(literal.get());
-      break;
-    case U8:
-      PopulateWithRandomIntegralData<uint8>(literal.get());
-      break;
-    case S16:
-      PopulateWithRandomIntegralData<int16>(literal.get());
-      break;
-    case U16:
-      PopulateWithRandomIntegralData<uint16>(literal.get());
-      break;
-    case S32:
-      PopulateWithRandomIntegralData<int32>(literal.get());
-      break;
-    case U32:
-      PopulateWithRandomIntegralData<uint32>(literal.get());
-      break;
-    case S64:
-      PopulateWithRandomIntegralData<int64>(literal.get());
-      break;
-    case U64:
-      PopulateWithRandomIntegralData<uint64>(literal.get());
-      break;
-    case PRED: {
-      std::uniform_int_distribution<int> generator(0, 1);
-      std::minstd_rand0 engine;
-      TF_CHECK_OK(literal->Populate<bool>(
-          [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-            return generator(engine);
-          }));
-      break;
-    }
-    default:
-      return Unimplemented("Unsupported type for fake literal generation: %s",
-                           ShapeUtil::HumanString(shape).c_str());
-  }
-  return std::move(literal);
+  std::minstd_rand0 engine;
+  return MakeFakeLiteralInternal(shape, &engine);
 }
 
 StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
     HloModule* const module) {
   TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(module));
   const auto params = module->entry_computation()->parameter_instructions();
+  std::minstd_rand0 engine;
   std::vector<std::unique_ptr<Literal>> arguments(params.size());
   for (int i = 0; i < params.size(); ++i) {
-    TF_ASSIGN_OR_RETURN(arguments[i],
-                        MakeConstrainedArgument(*dataflow, *params[i]));
+    TF_ASSIGN_OR_RETURN(
+        arguments[i], MakeConstrainedArgument(*dataflow, *params[i], &engine));
   }
   return std::move(arguments);
 }
-- 
GitLab


From 620c36be9a7b07501374eb1ee5f298ca9e0a560d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 17:57:49 -0800
Subject: [PATCH 289/966] Remove protobuf patch that was installed to resolve
 #8394. It appears to not be necessary any longer.

PiperOrigin-RevId: 183474194
---
 tensorflow/workspace.bzl                 |  5 ----
 third_party/protobuf/add_noinlines.patch | 30 ------------------------
 2 files changed, 35 deletions(-)
 delete mode 100644 third_party/protobuf/add_noinlines.patch

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index f7d9075032..26abebe2de 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -357,11 +357,6 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
       strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
-      # TODO: remove patching when tensorflow stops linking same protos into
-      #       multiple shared libraries loaded in runtime by python.
-      #       This patch fixes a runtime crash when tensorflow is compiled
-      #       with clang -O2 on Linux (see https://github.com/tensorflow/tensorflow/issues/8394)
-      patch_file = str(Label("//third_party/protobuf:add_noinlines.patch")),
   )
 
   # We need to import the protobuf library under the names com_google_protobuf
diff --git a/third_party/protobuf/add_noinlines.patch b/third_party/protobuf/add_noinlines.patch
deleted file mode 100644
index af74798f06..0000000000
--- a/third_party/protobuf/add_noinlines.patch
+++ /dev/null
@@ -1,30 +0,0 @@
-diff -u -r a/src/google/protobuf/compiler/cpp/cpp_file.cc b/src/google/protobuf/compiler/cpp/cpp_file.cc
---- a/src/google/protobuf/compiler/cpp/cpp_file.cc	2017-02-10 23:55:34.000000000 +0100
-+++ b/src/google/protobuf/compiler/cpp/cpp_file.cc	2017-03-21 13:41:46.931979154 +0100
-@@ -557,7 +557,7 @@
-         "      $metadata$, $enum_descriptors$, $service_descriptors$);\n"
-         "}\n"
-         "\n"
--        "void protobuf_AssignDescriptorsOnce() {\n"
-+        "GOOGLE_ATTRIBUTE_NOINLINE void protobuf_AssignDescriptorsOnce() {\n"
-         "  static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
-         "  ::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);\n"
-         "}\n"
-@@ -656,7 +656,7 @@
-   printer->Print(
-       "}\n"
-       "\n"
--      "void InitDefaults() {\n"
-+      "GOOGLE_ATTRIBUTE_NOINLINE void InitDefaults() {\n"
-       "  static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
-       "  ::google::protobuf::GoogleOnceInit(&once, &TableStruct::InitDefaultsImpl);\n"
-       "}\n");
-@@ -737,7 +737,7 @@
-   printer->Print(
-       "}\n"
-       "\n"
--      "void AddDescriptors() {\n"
-+      "GOOGLE_ATTRIBUTE_NOINLINE void AddDescriptors() {\n"
-       "  static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
-       "  ::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);\n"
-       "}\n");
-- 
GitLab


From fc21dc489ba8f66938a13615ee899da824eafeb1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 18:00:08 -0800
Subject: [PATCH 290/966] Add a feature to automatically recapture the traces
 when no trace event is collected.

PiperOrigin-RevId: 183474367
---
 .../tpu/profiler/capture_tpu_profile.cc       | 34 ++++++++++++++++---
 .../contrib/tpu/profiler/dump_tpu_profile.cc  |  4 +--
 .../contrib/tpu/profiler/dump_tpu_profile.h   |  3 +-
 tensorflow/contrib/tpu/profiler/version.h     |  2 +-
 4 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 7373d0e17c..cca0a37a89 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -78,14 +78,19 @@ int main(int argc, char** argv) {
   tensorflow::string FLAGS_service_addr;
   tensorflow::string FLAGS_logdir;
   int FLAGS_duration_ms = 2000;
+  int FLAGS_num_tracing_attempts = 3;
   bool FLAGS_include_dataset_ops = true;
   std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("service_addr", &FLAGS_service_addr,
                        "Address of TPU profiler service e.g. localhost:8466"),
       tensorflow::Flag("logdir", &FLAGS_logdir,
-                       "Path of TensorBoard log directory e.g. /tmp/tb_log"),
+                       "Path of TensorBoard log directory e.g. /tmp/tb_log, "
+                       "gs://tb_bucket"),
       tensorflow::Flag("duration_ms", &FLAGS_duration_ms,
                        "Duration of tracing in ms. Default is 2000ms."),
+      tensorflow::Flag("num_tracing_attempts", &FLAGS_num_tracing_attempts,
+                       "Automatically retry N times when no trace event "
+                       "is collected. Default is 3."),
       tensorflow::Flag("include_dataset_ops", &FLAGS_include_dataset_ops,
                        "Set to false to profile longer TPU device traces."),
   };
@@ -101,11 +106,32 @@ int main(int argc, char** argv) {
   }
   tensorflow::port::InitMain(argv[0], &argc, &argv);
 
-  int duration_ms = FLAGS_duration_ms;
+  // Sets the minimum duration_ms and tracing attempts to one.
+  int duration_ms = max(FLAGS_duration_ms, 1);
+  int remaining_attempts = max(FLAGS_num_tracing_attempts, 1);
   tensorflow::ProfileOptions opts;
   opts.set_include_dataset_ops(FLAGS_include_dataset_ops);
-  tensorflow::ProfileResponse response =
-      tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms, opts);
+  tensorflow::ProfileResponse response;
+
+  while (true) {
+    std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
+              << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
+    response = tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms, opts);
+    if (remaining_attempts <= 0 || !response.encoded_trace().empty()) break;
+    std::cout << "No trace event is collected. Automatically retrying."
+              << std::endl
+              << std::endl;
+  }
+
+  if (response.encoded_trace().empty()) {
+    std::cout << "No trace event is collected after "
+              << FLAGS_num_tracing_attempts << " attempt(s). "
+              << "Perhaps, you want to try again (with more attempts?)."
+              << std::endl
+              << "Tip: increase number of attempts with --num_tracing_attempts."
+              << std::endl;
+  }
+
   // Use the current timestamp as the run name.
   tensorflow::string run = tensorflow::tpu::GetCurrentTimeStampAsString();
   TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index b842951eb2..64e4e6275d 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -152,9 +152,7 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
 
   // Ignore computation_graph for now.
   const bool empty_trace = response.encoded_trace().empty();
-  if (empty_trace) {
-    *os << "No trace event is collected." << std::endl;
-  } else {
+  if (!empty_trace) {
     LOG(INFO) << "Converting trace events to TraceViewer JSON.";
     TF_RETURN_IF_ERROR(
         DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os));
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
index 25b958bcfe..2f8656a37b 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
@@ -27,7 +27,8 @@ namespace tpu {
 // The following tools are supported:
 //   - Trace viewer
 //   - Op profile
-//   - HLO computation graph
+//   - Input pipeline analyzer
+//   - Overview page
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
                                   const ProfileResponse& response,
                                   std::ostream* os);
diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h
index 0f645a5492..dc6a934891 100644
--- a/tensorflow/contrib/tpu/profiler/version.h
+++ b/tensorflow/contrib/tpu/profiler/version.h
@@ -16,6 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
 
-#define TPU_PROFILER_VERSION "1.4.3"
+#define TPU_PROFILER_VERSION "1.5.0"
 
 #endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
-- 
GitLab


From 1464b97c2fb1908eb3425af9f87febcbeb9bcc8f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 18:59:15 -0800
Subject: [PATCH 291/966] Internal change.

PiperOrigin-RevId: 183479688
---
 .../contrib/lite/schema/schema_generated.h    | 176 ++++++++++++++----
 1 file changed, 142 insertions(+), 34 deletions(-)

diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index c04a73a2bf..bafd28b626 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -51,6 +51,9 @@ struct RNNOptionsT;
 struct SequenceRNNOptions;
 struct SequenceRNNOptionsT;
 
+struct BidirectionalSequenceRNNOptions;
+struct BidirectionalSequenceRNNOptionsT;
+
 struct FullyConnectedOptions;
 struct FullyConnectedOptionsT;
 
@@ -211,11 +214,12 @@ enum BuiltinOperator {
   BuiltinOperator_SQUEEZE = 43,
   BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
   BuiltinOperator_STRIDED_SLICE = 45,
+  BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN = 46,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_STRIDED_SLICE
+  BuiltinOperator_MAX = BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[43] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[44] {
   static BuiltinOperator values[] = {
       BuiltinOperator_ADD,
       BuiltinOperator_AVERAGE_POOL_2D,
@@ -259,7 +263,8 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[43] {
       BuiltinOperator_DIV,
       BuiltinOperator_SQUEEZE,
       BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
-      BuiltinOperator_STRIDED_SLICE};
+      BuiltinOperator_STRIDED_SLICE,
+      BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN};
   return values;
 }
 
@@ -310,6 +315,7 @@ inline const char **EnumNamesBuiltinOperator() {
                                 "SQUEEZE",
                                 "UNIDIRECTIONAL_SEQUENCE_LSTM",
                                 "STRIDED_SLICE",
+                                "BIDIRECTIONAL_SEQUENCE_RNN",
                                 nullptr};
   return names;
 }
@@ -2005,6 +2011,85 @@ flatbuffers::Offset<SequenceRNNOptions> CreateSequenceRNNOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct BidirectionalSequenceRNNOptionsT : public flatbuffers::NativeTable {
+  typedef BidirectionalSequenceRNNOptions TableType;
+  bool time_major;
+  ActivationFunctionType fused_activation_function;
+  BidirectionalSequenceRNNOptionsT()
+      : time_major(false),
+        fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct BidirectionalSequenceRNNOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef BidirectionalSequenceRNNOptionsT NativeTableType;
+  enum { VT_TIME_MAJOR = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 };
+  bool time_major() const { return GetField<uint8_t>(VT_TIME_MAJOR, 0) != 0; }
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint8_t>(verifier, VT_TIME_MAJOR) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  BidirectionalSequenceRNNOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      BidirectionalSequenceRNNOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<BidirectionalSequenceRNNOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const BidirectionalSequenceRNNOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct BidirectionalSequenceRNNOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_time_major(bool time_major) {
+    fbb_.AddElement<uint8_t>(BidirectionalSequenceRNNOptions::VT_TIME_MAJOR,
+                             static_cast<uint8_t>(time_major), 0);
+  }
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(
+        BidirectionalSequenceRNNOptions::VT_FUSED_ACTIVATION_FUNCTION,
+        static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit BidirectionalSequenceRNNOptionsBuilder(
+      flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  BidirectionalSequenceRNNOptionsBuilder &operator=(
+      const BidirectionalSequenceRNNOptionsBuilder &);
+  flatbuffers::Offset<BidirectionalSequenceRNNOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<BidirectionalSequenceRNNOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+CreateBidirectionalSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  BidirectionalSequenceRNNOptionsBuilder builder_(_fbb);
+  builder_.add_fused_activation_function(fused_activation_function);
+  builder_.add_time_major(time_major);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+CreateBidirectionalSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct FullyConnectedOptionsT : public flatbuffers::NativeTable {
   typedef FullyConnectedOptions TableType;
   ActivationFunctionType fused_activation_function;
@@ -2541,21 +2626,14 @@ flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
 
 struct ResizeBilinearOptionsT : public flatbuffers::NativeTable {
   typedef ResizeBilinearOptions TableType;
-  int32_t new_height;
-  int32_t new_width;
-  ResizeBilinearOptionsT() : new_height(0), new_width(0) {}
+  ResizeBilinearOptionsT() {}
 };
 
 struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS
     : private flatbuffers::Table {
   typedef ResizeBilinearOptionsT NativeTableType;
-  enum { VT_NEW_HEIGHT = 4, VT_NEW_WIDTH = 6 };
-  int32_t new_height() const { return GetField<int32_t>(VT_NEW_HEIGHT, 0); }
-  int32_t new_width() const { return GetField<int32_t>(VT_NEW_WIDTH, 0); }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyField<int32_t>(verifier, VT_NEW_HEIGHT) &&
-           VerifyField<int32_t>(verifier, VT_NEW_WIDTH) && verifier.EndTable();
+    return VerifyTableStart(verifier) && verifier.EndTable();
   }
   ResizeBilinearOptionsT *UnPack(
       const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2570,13 +2648,6 @@ struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS
 struct ResizeBilinearOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_new_height(int32_t new_height) {
-    fbb_.AddElement<int32_t>(ResizeBilinearOptions::VT_NEW_HEIGHT, new_height,
-                             0);
-  }
-  void add_new_width(int32_t new_width) {
-    fbb_.AddElement<int32_t>(ResizeBilinearOptions::VT_NEW_WIDTH, new_width, 0);
-  }
   explicit ResizeBilinearOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
       : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2590,11 +2661,8 @@ struct ResizeBilinearOptionsBuilder {
 };
 
 inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
-    flatbuffers::FlatBufferBuilder &_fbb, int32_t new_height = 0,
-    int32_t new_width = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb) {
   ResizeBilinearOptionsBuilder builder_(_fbb);
-  builder_.add_new_width(new_width);
-  builder_.add_new_height(new_height);
   return builder_.Finish();
 }
 
@@ -5098,6 +5166,56 @@ inline flatbuffers::Offset<SequenceRNNOptions> CreateSequenceRNNOptions(
                                           _fused_activation_function);
 }
 
+inline BidirectionalSequenceRNNOptionsT *
+BidirectionalSequenceRNNOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new BidirectionalSequenceRNNOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void BidirectionalSequenceRNNOptions::UnPackTo(
+    BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = time_major();
+    _o->time_major = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+BidirectionalSequenceRNNOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateBidirectionalSequenceRNNOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+CreateBidirectionalSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const BidirectionalSequenceRNNOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _time_major = _o->time_major;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateBidirectionalSequenceRNNOptions(
+      _fbb, _time_major, _fused_activation_function);
+}
+
 inline FullyConnectedOptionsT *FullyConnectedOptions::UnPack(
     const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new FullyConnectedOptionsT();
@@ -5457,14 +5575,6 @@ inline void ResizeBilinearOptions::UnPackTo(
     const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  {
-    auto _e = new_height();
-    _o->new_height = _e;
-  };
-  {
-    auto _e = new_width();
-    _o->new_width = _e;
-  };
 }
 
 inline flatbuffers::Offset<ResizeBilinearOptions> ResizeBilinearOptions::Pack(
@@ -5484,9 +5594,7 @@ inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
     const flatbuffers::rehasher_function_t *__rehasher;
   } _va = {&_fbb, _o, _rehasher};
   (void)_va;
-  auto _new_height = _o->new_height;
-  auto _new_width = _o->new_width;
-  return tflite::CreateResizeBilinearOptions(_fbb, _new_height, _new_width);
+  return tflite::CreateResizeBilinearOptions(_fbb);
 }
 
 inline CallOptionsT *CallOptions::UnPack(
-- 
GitLab


From dfe703b3de18d512a08ef44cebc9bdc0c597866b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 20:51:06 -0800
Subject: [PATCH 292/966] Fix build: add std:: to max() in
 tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc.

PiperOrigin-RevId: 183486778
---
 tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index cca0a37a89..6a05a2abf6 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -107,8 +107,8 @@ int main(int argc, char** argv) {
   tensorflow::port::InitMain(argv[0], &argc, &argv);
 
   // Sets the minimum duration_ms and tracing attempts to one.
-  int duration_ms = max(FLAGS_duration_ms, 1);
-  int remaining_attempts = max(FLAGS_num_tracing_attempts, 1);
+  int duration_ms = std::max(FLAGS_duration_ms, 1);
+  int remaining_attempts = std::max(FLAGS_num_tracing_attempts, 1);
   tensorflow::ProfileOptions opts;
   opts.set_include_dataset_ops(FLAGS_include_dataset_ops);
   tensorflow::ProfileResponse response;
-- 
GitLab


From 0aed95f11379322b193945fa1e0832ee726c5278 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 26 Jan 2018 22:19:43 -0800
Subject: [PATCH 293/966] [TF:XLA] Update stale comments to match function
 names.

PiperOrigin-RevId: 183491729
---
 tensorflow/compiler/xla/util.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 4bc2d632cd..2da9bb21b7 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -342,7 +342,7 @@ T CeilOfRatio(T dividend, T divisor) {
 }
 
 // Rounds the value up to a multiple of the divisor by first calling CeilOfRatio
-// then multiplying by the divisor. For example: RoundUpToMultiple(13, 8) => 16
+// then multiplying by the divisor. For example: RoundUpToNearest(13, 8) => 16
 template <typename T>
 T RoundUpToNearest(T value, T divisor) {
   return CeilOfRatio(value, divisor) * divisor;
@@ -350,7 +350,7 @@ T RoundUpToNearest(T value, T divisor) {
 
 // Rounds the value down to a multiple of the divisor by first calling
 // FloorOfRatio then multiplying by the divisor. For example:
-// RoundUpToMultiple(13, 8) => 8
+// RoundDownToNearest(13, 8) => 8
 template <typename T>
 T RoundDownToNearest(T value, T divisor) {
   return FloorOfRatio(value, divisor) * divisor;
-- 
GitLab


From 5b028b2f5867ba230963a92bbc334fe538b30fdc Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Fri, 26 Jan 2018 22:57:08 -0800
Subject: [PATCH 294/966] [XLA] Make DeviceMemoryAllocator::platform() a const
 pointer.

PiperOrigin-RevId: 183493603
---
 tensorflow/compiler/jit/kernels/xla_launch_op.cc          | 5 +++--
 .../compiler/xla/service/device_memory_allocator.cc       | 2 +-
 tensorflow/compiler/xla/service/device_memory_allocator.h | 8 ++++----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 1d7bd22e60..17ae2bb25c 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -45,7 +45,7 @@ namespace tensorflow {
 // see comment on `AllowsAsynchronousDeallocation()`.
 class XlaAllocator : public xla::DeviceMemoryAllocator {
  public:
-  XlaAllocator(gpu::Platform* platform, OpKernelContext* op_context);
+  XlaAllocator(const gpu::Platform* platform, OpKernelContext* op_context);
   ~XlaAllocator() override;
   xla::StatusOr<gpu::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
                                                 bool retry_on_failure) override;
@@ -79,7 +79,8 @@ class XlaAllocator : public xla::DeviceMemoryAllocator {
   std::unordered_map<void*, Tensor> tensors_;
 };
 
-XlaAllocator::XlaAllocator(gpu::Platform* platform, OpKernelContext* op_context)
+XlaAllocator::XlaAllocator(const gpu::Platform* platform,
+                           OpKernelContext* op_context)
     : xla::DeviceMemoryAllocator(platform), op_context_(op_context) {}
 
 XlaAllocator::~XlaAllocator() = default;
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc
index 2e4b0a5230..78e7aa48ac 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.cc
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc
@@ -24,7 +24,7 @@ limitations under the License.
 namespace xla {
 
 StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
-    perftools::gputools::Platform* platform,
+    const perftools::gputools::Platform* platform,
     tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
         stream_executors)
     : DeviceMemoryAllocator(platform),
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h
index 00caefab66..39dfad84c1 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.h
@@ -33,7 +33,7 @@ class DeviceMemoryAllocator {
  public:
   // Parameter platform indicates which platform the allocator allocates memory
   // on. Must be non-null.
-  explicit DeviceMemoryAllocator(perftools::gputools::Platform* platform)
+  explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform)
       : platform_(platform) {}
   virtual ~DeviceMemoryAllocator() {}
 
@@ -49,14 +49,14 @@ class DeviceMemoryAllocator {
       int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0;
 
   // Return the platform that the allocator allocates memory on.
-  perftools::gputools::Platform* platform() const { return platform_; }
+  const perftools::gputools::Platform* platform() const { return platform_; }
 
   // Can we call Deallocate() as soon as a computation has been scheduled on
   // a stream, or do we have to wait for the computation to complete first?
   virtual bool AllowsAsynchronousDeallocation() const = 0;
 
  protected:
-  perftools::gputools::Platform* platform_;
+  const perftools::gputools::Platform* platform_;
 };
 
 // Default memory allocator for a platform which uses
@@ -64,7 +64,7 @@ class DeviceMemoryAllocator {
 class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
  public:
   StreamExecutorMemoryAllocator(
-      perftools::gputools::Platform* platform,
+      const perftools::gputools::Platform* platform,
       tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
           stream_executors);
 
-- 
GitLab


From ec9592211fcd0940336c1f31dcac405709b6a247 Mon Sep 17 00:00:00 2001
From: Martin Wicke <wicke@google.com>
Date: Fri, 26 Jan 2018 23:35:51 -0800
Subject: [PATCH 295/966] Adds a deprecated_alias utility function with which
 to deprecate unmodified aliases.

PiperOrigin-RevId: 183495796
---
 tensorflow/python/util/deprecation.py      | 131 +++++++++++++++++++--
 tensorflow/python/util/deprecation_test.py |  50 ++++++++
 2 files changed, 172 insertions(+), 9 deletions(-)

diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index 2110fc64cf..fbec8fd2d8 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -39,13 +39,14 @@ _PRINTED_WARNING = {}
 
 def _add_deprecated_function_notice_to_docstring(doc, date, instructions):
   """Adds a deprecation notice to a docstring for deprecated functions."""
+  main_text = ['THIS FUNCTION IS DEPRECATED. It will be removed %s.' %
+               ('in a future version' if date is None else ('after %s' % date))]
+  if instructions:
+    main_text.append('Instructions for updating:')
   return decorator_utils.add_notice_to_docstring(
       doc, instructions,
       'DEPRECATED FUNCTION',
-      '(deprecated)', [
-          'THIS FUNCTION IS DEPRECATED. It will be removed %s.' % (
-              'in a future version' if date is None else ('after %s' % date)),
-          'Instructions for updating:'])
+      '(deprecated)', main_text)
 
 
 def _add_deprecated_arg_notice_to_docstring(doc, date, instructions):
@@ -67,23 +68,135 @@ def _validate_deprecation_args(date, instructions):
     raise ValueError('Don\'t deprecate things without conversion instructions!')
 
 
-def _call_location():
+def _call_location(outer=False):
   """Returns call location given level up from current call."""
   frame = tf_inspect.currentframe()
   if frame:
     # CPython internals are available, use them for performance.
     # walk back two frames to get to deprecated function caller.
-    first_frame = frame.f_back
-    second_frame = first_frame.f_back
-    frame = second_frame if second_frame else first_frame
+    frame = frame.f_back
+    if frame.f_back:
+      frame = frame.f_back
+    if outer and frame.f_back:
+      frame = frame.f_back
     return '%s:%d' % (frame.f_code.co_filename, frame.f_lineno)
   else:
     # Slow fallback path
     stack = tf_inspect.stack(0)  # 0 avoids generating unused context
-    entry = stack[2]
+    entry = stack[3 if outer else 2]
     return '%s:%d' % (entry[1], entry[2])
 
 
+def deprecated_alias(deprecated_name, name, func_or_class, warn_once=True):
+  """Deprecate a symbol in favor of a new name with identical semantics.
+
+  This function is meant to be used when defining a backwards-compatibility
+  alias for a symbol which has been moved. For example:
+
+  module1.py:
+  ```python
+  class NewNameForClass: pass
+  ```
+
+  module2.py:
+  ```python
+  import module1
+
+  DeprecatedNameForClass = deprecated_alias(
+    deprecated_name='module2.DeprecatedNameForClass',
+    name='module1.NewNameForClass',
+    module1.NewNameForClass)
+  ```
+
+  This function works for classes and functions.
+
+  For classes, it creates a new class which is functionally identical (it
+  inherits from the original, and overrides its constructor), but which prints
+  a deprecation warning when an instance is created. It also adds a deprecation
+  notice to the class' docstring.
+
+  For functions, it returns a function wrapped by `tf_decorator.make_decorator`.
+  That function prints a warning when used, and has a deprecation notice in its
+  docstring. This is more or less equivalent (the deprecation warning has
+  slightly different text) to writing:
+
+  ```python
+  @deprecated
+  def deprecated_alias(original_args):
+    real_function(original_args)
+  ```
+
+  Args:
+    deprecated_name: The name of the symbol that is being deprecated, to be used
+      in the warning message. This should be its fully qualified name to avoid
+      confusion.
+    name: The name of the symbol that is to be used instead of the deprecated
+      name. This should be a fully qualified name to avoid confusion.
+    func_or_class: The (non-deprecated) class or function for which a deprecated
+      alias should be created.
+    warn_once: If True (the default), only print a deprecation warning the first
+      time this function is used, or the class is instantiated.
+
+  Returns:
+    A wrapped version of `func_or_class` which prints a deprecation warning on
+    use and has a modified docstring.
+  """
+  if tf_inspect.isclass(func_or_class):
+
+    # Make a new class with __init__ wrapped in a warning.
+    class NewClass(func_or_class):  # pylint: disable=missing-docstring
+      __doc__ = decorator_utils.add_notice_to_docstring(
+          func_or_class.__doc__, 'Please use %s instead.' % name,
+          'DEPRECATED CLASS',
+          '(deprecated)', ['THIS CLASS IS DEPRECATED. '
+                           'It will be removed in a future version. '])
+      __name__ = func_or_class.__name__
+      __module__ = _call_location(outer=True)
+
+      def __init__(self, *args, **kwargs):
+        if hasattr(NewClass.__init__, '__func__'):
+          # Python 2
+          NewClass.__init__.__func__.__doc__ = func_or_class.__init__.__doc__
+        else:
+          # Python 3
+          NewClass.__init__.__doc__ = func_or_class.__init__.__doc__
+
+        if _PRINT_DEPRECATION_WARNINGS:
+          # We're making the alias as we speak. The original may have other
+          # aliases, so we cannot use it to check for whether it's already been
+          # warned about.
+          if NewClass.__init__ not in _PRINTED_WARNING:
+            if warn_once:
+              _PRINTED_WARNING[NewClass.__init__] = True
+            logging.warning(
+                'From %s: The name %s is deprecated. Please use %s instead.\n',
+                _call_location(), deprecated_name, name)
+        super(NewClass, self).__init__(*args, **kwargs)
+
+    return NewClass
+  else:
+    decorator_utils.validate_callable(func_or_class, 'deprecated')
+
+    # Make a wrapper for the original
+    @functools.wraps(func_or_class)
+    def new_func(*args, **kwargs):  # pylint: disable=missing-docstring
+      if _PRINT_DEPRECATION_WARNINGS:
+        # We're making the alias as we speak. The original may have other
+        # aliases, so we cannot use it to check for whether it's already been
+        # warned about.
+        if new_func not in _PRINTED_WARNING:
+          if warn_once:
+            _PRINTED_WARNING[new_func] = True
+          logging.warning(
+              'From %s: The name %s is deprecated. Please use %s instead.\n',
+              _call_location(), deprecated_name, name)
+      return func_or_class(*args, **kwargs)
+    return tf_decorator.make_decorator(
+        func_or_class, new_func, 'deprecated',
+        _add_deprecated_function_notice_to_docstring(
+            func_or_class.__doc__, None, 'Please use %s instead.' % name))
+
+
 def deprecated(date, instructions, warn_once=True):
   """Decorator for marking functions or methods deprecated.
 
diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py
index e61edb5cfa..bdd0bc48d2 100644
--- a/tensorflow/python/util/deprecation_test.py
+++ b/tensorflow/python/util/deprecation_test.py
@@ -24,6 +24,56 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import deprecation
 
 
+class DeprecatedAliasTest(test.TestCase):
+
+  @test.mock.patch.object(logging, "warning", autospec=True)
+  def test_function_alias(self, mock_warning):
+    deprecated_func = deprecation.deprecated_alias("deprecated.func",
+                                                   "real.func",
+                                                   logging.error)
+
+    logging.error("fake error logged")
+    self.assertEqual(0, mock_warning.call_count)
+    deprecated_func("FAKE ERROR!")
+    self.assertEqual(1, mock_warning.call_count)
+    # Make sure the error points to the right file.
+    self.assertRegexpMatches(mock_warning.call_args[0][1],
+                             r"deprecation_test\.py:")
+    deprecated_func("ANOTHER FAKE ERROR!")
+    self.assertEqual(1, mock_warning.call_count)
+
+  @test.mock.patch.object(logging, "warning", autospec=True)
+  def test_class_alias(self, mock_warning):
+    class MyClass(object):
+      """My docstring."""
+
+      init_args = []
+
+      def __init__(self, arg):
+        MyClass.init_args.append(arg)
+
+    deprecated_cls = deprecation.deprecated_alias("deprecated.cls",
+                                                  "real.cls",
+                                                  MyClass)
+
+    print(deprecated_cls.__name__)
+    print(deprecated_cls.__module__)
+    print(deprecated_cls.__doc__)
+
+    MyClass("test")
+    self.assertEqual(0, mock_warning.call_count)
+    deprecated_cls("deprecated")
+    self.assertEqual(1, mock_warning.call_count)
+    # Make sure the error points to the right file.
+    self.assertRegexpMatches(mock_warning.call_args[0][1],
+                             r"deprecation_test\.py:")
+    deprecated_cls("deprecated again")
+    self.assertEqual(1, mock_warning.call_count)
+
+    self.assertEqual(["test", "deprecated", "deprecated again"],
+                     MyClass.init_args)
+
+
 class DeprecationTest(test.TestCase):
 
   @test.mock.patch.object(logging, "warning", autospec=True)
-- 
GitLab


From 88cdf2c2bf336e9d7c418d824097aa918ef0274a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 27 Jan 2018 05:12:06 -0800
Subject: [PATCH 296/966] Remove unused BUILD dependencies

PiperOrigin-RevId: 183514731
---
 tensorflow/cc/tools/BUILD                 | 1 -
 tensorflow/compiler/xla/service/BUILD     | 2 --
 tensorflow/compiler/xla/service/gpu/BUILD | 1 -
 tensorflow/contrib/tensorboard/db/BUILD   | 1 -
 tensorflow/core/BUILD                     | 2 --
 tensorflow/core/kernels/data/BUILD        | 2 --
 6 files changed, 9 deletions(-)

diff --git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD
index 0a7c37383f..97f66e79b8 100644
--- a/tensorflow/cc/tools/BUILD
+++ b/tensorflow/cc/tools/BUILD
@@ -23,7 +23,6 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:tensorflow",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 987367fc68..4b5590f5c4 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1112,8 +1112,6 @@ cc_library(
         ":hlo",
         ":hlo_evaluator",
         ":hlo_pass",
-        ":tuple_util",
-        ":while_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
     ],
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 3c3328b9cd..80c2eed109 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -514,7 +514,6 @@ cc_library(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_pass",
-        "@llvm//:core",
     ],
 )
 
diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/contrib/tensorboard/db/BUILD
index 6ff5a9e2b1..4175d8adb5 100644
--- a/tensorflow/contrib/tensorboard/db/BUILD
+++ b/tensorflow/contrib/tensorboard/db/BUILD
@@ -40,7 +40,6 @@ cc_library(
     hdrs = ["summary_db_writer.h"],
     copts = tf_copts(),
     deps = [
-        ":schema",
         ":summary_converter",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 29c515121e..455da05738 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -381,7 +381,6 @@ cc_library(
     srcs = ["platform/stacktrace_handler.cc"],
     hdrs = ["platform/stacktrace_handler.h"],
     deps = [
-        ":abi",
         ":lib",
         ":lib_platform",
     ],
@@ -2413,7 +2412,6 @@ cc_library(
     deps = [
         ":lib",
         ":lib_internal",
-        ":stacktrace_handler",
         ":test",  # buildcleaner: keep
         "//tensorflow/core/platform/default/build_config:test_main",
     ],
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 500ee7b43f..45505ef716 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -81,9 +81,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:session_options",
         "//tensorflow/core/kernels:variable_ops",
     ],
 )
-- 
GitLab


From f68404bdd10a4b6ef2a50439efac4614de024636 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 27 Jan 2018 15:01:31 -0800
Subject: [PATCH 297/966] Update docs for installing CUDA/CUDNN (#16495)

* Update docs for installing CUDA/CUDNN

This fix addresses the issue raised in 16479 where
CUDA/CUDNN versions from the docs do not match TensorFlow v1.5.0.

From the Dockerfile and docker images ENV, the version of CUDA/CUDNN
for TensorFlow v1.5.0:
```
CUDA_VERSION 9.0.176
CUDNN_VERSION 7.0.5.15
```

This fix updates the doc so that CUDA version is changed from `8.0` -> `9.0`,
CUDNN version is changed from `6.0` -> `7.0`.

This fix fixes 16479.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/docs_src/install/install_linux.md   |  6 +++---
 tensorflow/docs_src/install/install_sources.md | 10 +++++-----
 tensorflow/docs_src/install/install_windows.md |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 03f12dff08..7289224572 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -31,13 +31,13 @@ If you are installing TensorFlow with GPU support using one of the
 mechanisms described in this guide, then the following NVIDIA software
 must be installed on your system:
 
-  * CUDA® Toolkit 8.0. For details, see
+  * CUDA® Toolkit 9.0. For details, see
     [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
     Ensure that you append the relevant Cuda pathnames to the
     `LD_LIBRARY_PATH` environment variable as described in the
     NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v6.0. For details, see
+  * The NVIDIA drivers associated with CUDA Toolkit 9.0.
+  * cuDNN v7.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
     Ensure that you create the `CUDA_HOME` environment variable as
     described in the NVIDIA documentation.
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index f494cc7a7c..0d99f9a47d 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -133,7 +133,7 @@ The following NVIDIA <i>hardware</i> must be installed on your system:
 
 The following NVIDIA <i>software</i> must be installed on your system:
 
-  * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 8.0.
+  * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 9.0.
     For details, see
     [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
     Ensure that you append the relevant Cuda pathnames to the
@@ -291,11 +291,11 @@ Do you wish to build TensorFlow with CUDA support? [y/N] <b>Y</b>
 CUDA support will be enabled for TensorFlow
 Do you want to use clang as CUDA compiler? [y/N]
 nvcc will be used as CUDA compiler
-Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: <b>8.0</b>
-Please specify the location where CUDA 8.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
+Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
+Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
 Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
-Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: <b>6</b>
-Please specify the location where cuDNN 6 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
+Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7</b>
+Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
 Please specify a list of comma-separated Cuda compute capabilities you want to build with.
 You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
 Please note that each additional compute capability significantly increases your build time and binary size.
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 8d0eb7966f..86a111c2ec 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -30,13 +30,13 @@ If you are installing TensorFlow with GPU support using one of the mechanisms
 described in this guide, then the following NVIDIA software must be
 installed on your system:
 
-  * CUDA® Toolkit 8.0. For details, see
+  * CUDA® Toolkit 9.0. For details, see
     [NVIDIA's
     documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/)
     Ensure that you append the relevant Cuda pathnames to the `%PATH%`
     environment variable as described in the NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v6.0. For details, see
+  * The NVIDIA drivers associated with CUDA Toolkit 9.0.
+  * cuDNN v7.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
     Note that cuDNN is typically installed in a different location from the
     other CUDA DLLs. Ensure that you add the directory where you installed
-- 
GitLab


From f1f4169e42320419b0a5df39c454ca39a57f5a42 Mon Sep 17 00:00:00 2001
From: Zhixian Yan <zhixianyan@google.com>
Date: Sat, 27 Jan 2018 17:38:59 -0800
Subject: [PATCH 298/966] Internal Change

PiperOrigin-RevId: 183551521
---
 .../graph_transformations/identify_lstm.cc    | 18 ---------------
 tensorflow/contrib/lite/toco/tflite/BUILD     |  1 +
 tensorflow/contrib/lite/toco/tflite/import.cc | 13 +++++++++--
 tensorflow/contrib/lite/toco/tooling_util.cc  | 23 +++++++++++++++++--
 tensorflow/contrib/lite/toco/tooling_util.h   |  3 +++
 5 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
index 082820fddc..c363b93394 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "absl/strings/string_view.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -202,23 +201,6 @@ bool MatchOperatorInputs(const Operator& op, const Model& model,
   return true;
 }
 
-absl::string_view FindLongestCommonPrefix(absl::string_view a,
-                                          absl::string_view b) {
-  if (a.empty() || b.empty()) return absl::string_view();
-
-  const char* pa = a.data();
-  const char* pb = b.data();
-  size_t count = 0;
-  const ssize_t limit = std::min(a.size(), b.size());
-  while (count < limit && *pa == *pb) {
-    ++pa;
-    ++pb;
-    ++count;
-  }
-
-  return absl::string_view(a.data(), count);
-}
-
 }  // namespace
 
 bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD
index 72c9266564..a2b8145a67 100644
--- a/tensorflow/contrib/lite/toco/tflite/BUILD
+++ b/tensorflow/contrib/lite/toco/tflite/BUILD
@@ -117,6 +117,7 @@ cc_library(
         ":types",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/toco:model",
+        "//tensorflow/contrib/lite/toco:tooling_util",
         "@flatbuffers",
     ],
 )
diff --git a/tensorflow/contrib/lite/toco/tflite/import.cc b/tensorflow/contrib/lite/toco/tflite/import.cc
index bbf201fd28..5b1ab514b2 100644
--- a/tensorflow/contrib/lite/toco/tflite/import.cc
+++ b/tensorflow/contrib/lite/toco/tflite/import.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/toco/tflite/operator.h"
 #include "tensorflow/contrib/lite/toco/tflite/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
 
 namespace toco {
 
@@ -119,8 +120,16 @@ void ImportOperators(
     auto inputs = input_op->inputs();
     for (int i = 0; i < inputs->Length(); i++) {
       auto input_index = inputs->Get(i);
-      const string& input_name = tensors_table.at(input_index);
-      op->inputs.push_back(input_name);
+      // input_index == -1 indicates optional tensor.
+      if (input_index != -1) {
+        const string& input_name = tensors_table.at(input_index);
+        op->inputs.push_back(input_name);
+      } else {
+        const string& tensor_name =
+            toco::AvailableArrayName(*model, "OptionalTensor");
+        model->CreateOptionalArray(tensor_name);
+        op->inputs.push_back(tensor_name);
+      }
     }
     auto outputs = input_op->outputs();
     for (int i = 0; i < outputs->Length(); i++) {
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 3728d48659..08d9ac3aff 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -33,6 +33,24 @@ limitations under the License.
 
 namespace toco {
 
+// Find the longest common prefix of two strings.
+absl::string_view FindLongestCommonPrefix(absl::string_view a,
+                                          absl::string_view b) {
+  if (a.empty() || b.empty()) return absl::string_view();
+
+  const char* pa = a.data();
+  const char* pb = b.data();
+  size_t count = 0;
+  const size_t limit = std::min(a.size(), b.size());
+  while (count < limit && *pa == *pb) {
+    ++pa;
+    ++pb;
+    ++count;
+  }
+
+  return absl::string_view(a.data(), count);
+}
+
 string LogName(const Operator& op) {
   const string& opname = HelpfulOperatorTypeName(op);
   if (op.outputs.empty()) {
@@ -1314,13 +1332,14 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
 }
 
 string AvailableArrayName(const Model& model, const string& name) {
-  if (!model.HasArray(name) && !model.optional_arrays.count(name)) {
+  if (!model.HasArray(name) && !model.IsOptionalArray(name)) {
     return name;
   }
   const int kNumSuffixesToTry = 1000;
   for (int i = 0; i < kNumSuffixesToTry; i++) {
     const string& name_with_suffix = toco::port::StringF("%s_%d", name, i);
-    if (!model.HasArray(name_with_suffix)) {
+    if (!model.HasArray(name_with_suffix) &&
+        !model.IsOptionalArray(name_with_suffix)) {
       return name_with_suffix;
     }
   }
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 2ac51c7e5b..4051ba3576 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/platform/logging.h"
 #if TOCO_SUPPORT_PORTABLE_PROTOS
 #include "third_party/protobuf/src/google/protobuf/text_format.h"
@@ -49,6 +50,8 @@ namespace toco {
 constexpr int kLogLevelModelChanged = 1;
 constexpr int kLogLevelModelUnchanged = 2;
 
+absl::string_view FindLongestCommonPrefix(absl::string_view a,
+                                          absl::string_view b);
 string LogName(const Operator& op);
 
 bool IsInputArray(const Model& model, const string& name);
-- 
GitLab


From 77e2f31b40228f11353654a5fde4d93f7614a11d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 27 Jan 2018 19:59:13 -0800
Subject: [PATCH 299/966] Fix use of uninitialied value.

PiperOrigin-RevId: 183558128
---
 tensorflow/contrib/lite/kernels/kernel_util_test.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/contrib/lite/kernels/kernel_util_test.cc b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
index 63a317f338..c65b68970f 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util_test.cc
+++ b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
@@ -30,6 +30,8 @@ class KernelUtilTest : public ::testing::Test {
 
     tensor1_.dims = nullptr;
     tensor2_.dims = nullptr;
+    tensor1_.allocation_type = kTfLiteMmapRo;
+    tensor2_.allocation_type = kTfLiteMmapRo;
   }
   ~KernelUtilTest() {
     TfLiteTensorFree(&tensor1_);
-- 
GitLab


From 81acc2e6c56af450cd3cd6a5eb602f862e8338bb Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Sun, 28 Jan 2018 10:12:55 -0800
Subject: [PATCH 300/966] [XLA] Reset ShapeVisitor's state between runs of the
 verifier.

We create the ShapeVisitor once per pass pipeline.  Without this change,
after our ShapeVisitor has checked an instruction, it will never again
check that instruction *or any of its transitive inputs*.  Yikes.

PiperOrigin-RevId: 183593437
---
 .../compiler/xla/service/hlo_verifier.cc      |  3 ++-
 .../compiler/xla/service/hlo_verifier.h       | 17 ++++++++----
 .../compiler/xla/service/hlo_verifier_test.cc | 26 +++++++++++++++++++
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 6e46f945e0..04d4656546 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -687,7 +687,8 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
       instructions[instruction->name()] = instruction;
     }
 
-    TF_RETURN_IF_ERROR(computation->Accept(shape_verifier_.get()));
+    std::unique_ptr<ShapeVerifier> shape_verifier = shape_verifier_factory_();
+    TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get()));
   }
 
   return false;
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 5a1d864e03..26d53dec1e 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -106,10 +106,14 @@ class ShapeVerifier : public DfsHloVisitor {
 class HloVerifier : public HloPassInterface {
  public:
   // Uses standard shape inference.
-  explicit HloVerifier() : shape_verifier_(MakeUnique<ShapeVerifier>()) {}
+  explicit HloVerifier()
+      : shape_verifier_factory_([] { return MakeUnique<ShapeVerifier>(); }) {}
+
   // Uses custom shape verification.
-  explicit HloVerifier(std::unique_ptr<ShapeVerifier> shape_verifier)
-      : shape_verifier_(std::move(shape_verifier)) {}
+  explicit HloVerifier(
+      std::function<std::unique_ptr<ShapeVerifier>()> shape_verifier_factory)
+      : shape_verifier_factory_(std::move(shape_verifier_factory)) {}
+
   ~HloVerifier() override = default;
   tensorflow::StringPiece name() const override { return "verifier"; }
 
@@ -121,8 +125,11 @@ class HloVerifier : public HloPassInterface {
   // CHECKs various invariants of a fusion instruction.
   Status CheckFusionInstruction(HloInstruction* fusion) const;
 
-  // Verifies shapes match inferred expectations.
-  std::unique_ptr<ShapeVerifier> shape_verifier_;
+  // Creates a ShapeVerifier that checks that shapes match inferred
+  // expectations.  This is a factory function because ShapeVerifier,  Note that
+  // ShapeVerifier, being a DfsHloVisitor, is stateful.  We want a clean object
+  // for each run of the verifier.
+  std::function<std::unique_ptr<ShapeVerifier>()> shape_verifier_factory_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index 2a3b55decc..c92db0be14 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -97,5 +97,31 @@ TEST_F(HloVerifierTest, DifferentOperandParents) {
               HasSubstr("is in a different computation"));
 }
 
+TEST_F(HloVerifierTest, ResetsShapeVerifierState) {
+  HloComputation::Builder builder(TestName());
+  Shape s1 = ShapeUtil::MakeShape(F32, {1});
+  Shape s2 = ShapeUtil::MakeShape(F32, {2});
+
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "param"));
+
+  // Create an add instruction with the incorrect shape.
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(s2, HloOpcode::kAdd, param, param));
+
+  // In order to trigger the bug we're checking for, the instruction with the
+  // bad shape can't be the root of the computation.
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(s2, HloOpcode::kMultiply, add, add));
+
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  // Run the verifier twice.  It should fail both times, because it shouldn't
+  // carry state in its DFS visitor between runs.
+  EXPECT_FALSE(verifier().Run(module.get()).status().ok());
+  EXPECT_FALSE(verifier().Run(module.get()).status().ok());
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 74b5a4cf30fc1f0fa24a41d212f4aa03dcefa990 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Sun, 28 Jan 2018 10:49:33 -0800
Subject: [PATCH 301/966] [XLA] Show layouts of tuple-shaped instructions
 (other than kTuple) in graphs.

For example the batch-norm ops return a tuple, and those values' layouts
are significant.  We still hide the layout on tuples, since this can be
noisy.

PiperOrigin-RevId: 183594622
---
 .../compiler/xla/service/hlo_graph_dumper.cc  | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index f7c6435002..c744c8ed81 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -1063,14 +1063,19 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
   // node -- there the shape and layout is present in the output node.
   if (instr->opcode() != HloOpcode::kFusion ||
       !ShouldShowFusionSubcomputation(instr)) {
-    string instr_shape = ShapeUtil::HumanString(instr->shape());
-
-    // Show layout of non-tuple shapes with more than one dimension.
-    if (LayoutUtil::HasLayout(instr->shape()) &&
-        instr->shape().dimensions_size() > 1 &&
-        !ShapeUtil::IsTuple(instr->shape())) {
-      StrAppend(&instr_shape, "{",
-                Join(LayoutUtil::MinorToMajor(instr->shape()), ","), "}");
+    // Show layout of instructions with more than one dimension.  Don't show
+    // layout on tuples or tensors with just one dimension (which only have one
+    // possible layout) to avoid visual noise.
+    bool shape_is_multidim = false;
+    ShapeUtil::ForEachSubshape(instr->shape(),
+                               [&](const Shape& s, const ShapeIndex&) {
+                                 shape_is_multidim |= s.dimensions_size() > 1;
+                               });
+    string instr_shape;
+    if (instr->opcode() != HloOpcode::kTuple && shape_is_multidim) {
+      instr_shape = ShapeUtil::HumanStringWithLayout(instr->shape());
+    } else {
+      instr_shape = ShapeUtil::HumanString(instr->shape());
     }
 
     // Some instructions have giant tuples as their shapes, so truncate the
-- 
GitLab


From a6ed38feb42021c7fdf4a76587c1bbf75f3248d1 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Sun, 28 Jan 2018 11:21:58 -0800
Subject: [PATCH 302/966] [XLA] Set layout of GTE instructions inside fusion
 nodes.

Other than the root and parameters of a fusion computation, most other
instructions in a fusion computation don't have a layout.  GTEs are an
exception; they should inherit their layout from their operand, which
must be another GTE or a parameter.

Previously LayoutAssignment left GTEs alone, assuming they came in with
the right layout.  But this isn't correct, and in fact LayoutAssignment
cleared the layouts of every non-fused instruction before assigning them
for exactly this reason.  If we'd done the same to fused instructions,
it would have caught this bug, so we make that change here as well.  (We
simplify this loop by removing the check for kOutfeed -- outfeeds do not
produce a result, so there's no shape to keep.)

PiperOrigin-RevId: 183595627
---
 tensorflow/compiler/xla/service/BUILD         |  2 +
 .../compiler/xla/service/layout_assignment.cc | 50 +++++++------
 .../xla/service/layout_assignment_test.cc     | 71 +++++++++++++++++++
 3 files changed, 103 insertions(+), 20 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 4b5590f5c4..bdccfad0d0 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1853,7 +1853,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
+        "//tensorflow/core:test",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index bbea6bee56..5413b95cfb 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -1236,7 +1236,8 @@ Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
 // instruction itself.
 Status SetFusionLayouts(HloInstruction* fusion) {
   TF_RET_CHECK(fusion->opcode() == HloOpcode::kFusion);
-  for (auto* fused_instruction : fusion->fused_instructions()) {
+  for (auto* fused_instruction :
+       fusion->fused_instructions_computation()->MakeInstructionPostOrder()) {
     if (fused_instruction->opcode() == HloOpcode::kParameter) {
       const HloInstruction* fusion_operand =
           fusion->operand(fused_instruction->parameter_number());
@@ -1251,11 +1252,22 @@ Status SetFusionLayouts(HloInstruction* fusion) {
           ShapeUtil::Compatible(fusion->shape(), fused_instruction->shape()));
       TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
           fusion->shape(), fused_instruction->mutable_shape()));
-    } else if (fused_instruction->opcode() != HloOpcode::kConstant &&
-               fused_instruction->opcode() != HloOpcode::kGetTupleElement &&
-               fused_instruction->opcode() != HloOpcode::kInfeed) {
-      // Internal fused instructions with the exception of constants
-      // and infeed need no layout.
+    } else if (fused_instruction->opcode() == HloOpcode::kGetTupleElement) {
+      // A GTE inherits its layout from its operand (which should ultimately be
+      // a parameter).
+      TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+          fused_instruction->operand(0)->shape().tuple_shapes(
+              fused_instruction->tuple_index()),
+          fused_instruction->mutable_shape()));
+    } else if (fused_instruction->opcode() == HloOpcode::kConstant) {
+      // Give constants the layout of their literal.
+      TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+          fused_instruction->literal().shape(),
+          fused_instruction->mutable_shape()));
+    } else if (fused_instruction->opcode() == HloOpcode::kInfeed) {
+      // Nop; leave the infeed layout alone.
+    } else {
+      // Other instructions don't have layouts inside of fusion nodes.
       LayoutUtil::ClearLayout(fused_instruction->mutable_shape());
     }
   }
@@ -1367,20 +1379,6 @@ Status LayoutAssignment::RunOnComputation(
           << ")";
   VLOG(2) << "  ComputationLayout = " << computation_layout.ToString();
 
-  // Clear existing layouts of the instructions. All layouts must be assigned by
-  // the LayoutAssignment pass, except for Infeed, Outfeed, Parameters and the
-  // computation result. The latter two are specified in computation_layout, so
-  // we only need to keep the existing layouts for Infeed and Outfeed. Clearing
-  // the layouts here avoids hiding potential bugs in the layout assignment pass
-  // that may accidently use the existing layout.
-  for (HloInstruction* instruction : computation->instructions()) {
-    if (instruction->opcode() == HloOpcode::kInfeed ||
-        instruction->opcode() == HloOpcode::kOutfeed) {
-      continue;
-    }
-    LayoutUtil::ClearLayout(instruction->mutable_shape());
-  }
-
   // Construct LayoutConstraints with all layout constraints of the computation.
   LayoutConstraints constraints(points_to_analysis, computation);
 
@@ -1458,6 +1456,18 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   // is handled before its caller computation. This ensures that the layout of
   // all callers of a computation will agree.
   for (auto* computation : module->MakeComputationPostOrder()) {
+    // Clear existing layouts of the instructions.  All layouts must be assigned
+    // by the LayoutAssignment pass, except for those on infeeds, parameters,
+    // and the computation result. The latter two are specified in
+    // computation_layout, so we only need to keep the existing layouts for
+    // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
+    // layout assignment pass that may accidently use the existing layout.
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() != HloOpcode::kInfeed) {
+        LayoutUtil::ClearLayout(instruction->mutable_shape());
+      }
+    }
+
     if (computation == module->entry_computation()) {
       TF_RETURN_IF_ERROR(RunOnComputation(
           *entry_computation_layout_, *points_to_analysis,
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index d51c0d1dfb..e269a13459 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -35,9 +35,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace op = xla::testing::opcode_matchers;
@@ -587,5 +589,74 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastToUser) {
   EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(),
                                             transpose->shape(), {2, 3, 0, 1}));
 }
+
+// A GTE inside of a fusion node inherits the layout of its operand (which
+// should, if we keep following operands, eventually be a parameter).
+TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
+  const char* module_str = R"(
+    HloModule test_module
+
+    fused_computation {
+      fparam = (f32[2,2,2], (f32[2,2,2], f32[2,2,2])) parameter(0)
+      gte0 = f32[2,2,2] get-tuple-element(fparam), index=0
+      gte1 = (f32[2,2,2], f32[2,2,2]) get-tuple-element(fparam), index=1
+      gte1a = f32[2,2,2] get-tuple-element(gte1), index=0
+      gte1b = f32[2,2,2] get-tuple-element(gte1), index=1
+      add = f32[2,2,2] add(gte1a, gte1b)
+      ROOT fresult = f32[2,2,2] add(gte0, add)
+    }
+
+    ENTRY entry_computation {
+      param = (f32[2,2,2], (f32[2,2,2], f32[2,2,2])) parameter(0)
+      ROOT fusion =
+        f32[2,2,2] fusion(param), kind=kLoop, calls=fused_computation
+    }
+  )";
+
+  auto module = tools::Parse(module_str).ValueOrDie();
+  ComputationLayout computation_layout(
+      module->entry_computation()->ComputeProgramShape());
+  Shape param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 1, 2}),
+       ShapeUtil::MakeTupleShape({
+           ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {1, 2, 0}),
+           ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {2, 0, 1}),
+       })});
+  TF_ASSERT_OK(
+      computation_layout.mutable_parameter_layout(0)->CopyLayoutFromShape(
+          param_shape));
+  computation_layout.mutable_result_layout()->ResetLayout(
+      LayoutUtil::MakeLayout({2, 1, 0}));
+  AssignLayouts(module.get(), &computation_layout);
+
+  HloComputation* fused_computation = *std::find_if(
+      module->computations().begin(), module->computations().end(),
+      [](const HloComputation* c) { return c->name() == "fused_computation"; });
+
+  auto fused_instr = [&](const string& name) {
+    auto it = std::find_if(
+        fused_computation->instructions().begin(),
+        fused_computation->instructions().end(),
+        [&](const HloInstruction* i) { return i->name() == name; });
+    CHECK(it != fused_computation->instructions().end());
+    return *it;
+  };
+
+  EXPECT_THAT(fused_instr("gte0")->shape().layout().minor_to_major(),
+              ElementsAre(0, 1, 2));
+  EXPECT_THAT(
+      fused_instr("gte1")->shape().tuple_shapes(0).layout().minor_to_major(),
+      ElementsAre(1, 2, 0));
+  EXPECT_THAT(
+      fused_instr("gte1")->shape().tuple_shapes(1).layout().minor_to_major(),
+      ElementsAre(2, 0, 1));
+  EXPECT_THAT(fused_instr("gte1a")->shape().layout().minor_to_major(),
+              ElementsAre(1, 2, 0));
+  EXPECT_THAT(fused_instr("gte1b")->shape().layout().minor_to_major(),
+              ElementsAre(2, 0, 1));
+  EXPECT_THAT(fused_instr("fresult")->shape().layout().minor_to_major(),
+              ElementsAre(2, 1, 0));
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 7d7dce16b8e7aef53467d8eb08d4249ef6cd71fb Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Mon, 29 Jan 2018 07:50:48 +0900
Subject: [PATCH 303/966] Fix typo (#16509)

* fix typos
---
 tensorflow/compiler/xla/tools/parser/hlo_parser.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index 42e7f91f26..d9c4d094b8 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -2173,7 +2173,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
 //
 //  {[2:3:4], [5:6:7], [8:9]}
 //
-// The the parsed result will be:
+// The parsed result will be:
 //
 //  {/*starts=*/{2, 5, 8}, /*limits=*/{3, 6, 9}, /*strides=*/{4, 7, 1}}
 //
-- 
GitLab


From ad37b47be884b1b897cc4505b856c0f9c51591a1 Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Mon, 29 Jan 2018 07:11:29 -0800
Subject: [PATCH 304/966] tfdbg: add tensorboard debugger plugin option to
 three existing examples

PiperOrigin-RevId: 183661140
---
 .../python/debug/examples/debug_fibonacci.py   | 17 ++++++++++++++++-
 .../python/debug/examples/debug_mnist.py       | 17 ++++++++++++++++-
 .../debug/examples/debug_tflearn_iris.py       | 18 ++++++++++++++++--
 3 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/debug/examples/debug_fibonacci.py b/tensorflow/python/debug/examples/debug_fibonacci.py
index 704dbda357..3821b393ec 100644
--- a/tensorflow/python/debug/examples/debug_fibonacci.py
+++ b/tensorflow/python/debug/examples/debug_fibonacci.py
@@ -44,6 +44,10 @@ def main(_):
   sess.run(tf.global_variables_initializer())
 
   # Wrap the TensorFlow Session object for debugging.
+  if FLAGS.debug and FLAGS.tensorboard_debug_address:
+    raise ValueError(
+        "The --debug and --tensorboard_debug_address flags are mutually "
+        "exclusive.")
   if FLAGS.debug:
     sess = tf_debug.LocalCLIDebugWrapperSession(sess)
 
@@ -52,6 +56,9 @@ def main(_):
 
     sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
     sess.add_tensor_filter("has_negative", has_negative)
+  elif FLAGS.tensorboard_debug_address:
+    sess = tf_debug.TensorBoardDebugWrapperSession(
+        sess, FLAGS.tensorboard_debug_address)
 
   print("Fibonacci number at position %d:\n%s" %
         (FLAGS.length, sess.run(n1)))
@@ -82,7 +89,15 @@ if __name__ == "__main__":
       "--debug",
       dest="debug",
       action="store_true",
-      help="Use TensorFlow Debugger (tfdbg).")
+      help="Use TensorFlow Debugger (tfdbg). Mutually exclusive with the "
+      "--tensorboard_debug_address flag.")
+  parser.add_argument(
+      "--tensorboard_debug_address",
+      type=str,
+      default=None,
+      help="Connect to the TensorBoard Debugger Plugin backend specified by "
+      "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
+      "--debug flag.")
 
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_mnist.py b/tensorflow/python/debug/examples/debug_mnist.py
index 0a6dbf311d..ab1c90371c 100644
--- a/tensorflow/python/debug/examples/debug_mnist.py
+++ b/tensorflow/python/debug/examples/debug_mnist.py
@@ -120,8 +120,15 @@ def main(_):
 
   sess.run(tf.global_variables_initializer())
 
+  if FLAGS.debug and FLAGS.tensorboard_debug_address:
+    raise ValueError(
+        "The --debug and --tensorboard_debug_address flags are mutually "
+        "exclusive.")
   if FLAGS.debug:
     sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type=FLAGS.ui_type)
+  elif FLAGS.tensorboard_debug_address:
+    sess = tf_debug.TensorBoardDebugWrapperSession(
+        sess, FLAGS.tensorboard_debug_address)
 
   # Add this point, sess is a debug wrapper around the actual Session if
   # FLAGS.debug is true. In that case, calling run() will launch the CLI.
@@ -173,6 +180,14 @@ if __name__ == "__main__":
       nargs="?",
       const=True,
       default=False,
-      help="Use debugger to track down bad values during training")
+      help="Use debugger to track down bad values during training. "
+      "Mutually exclusive with the --tensorboard_debug_address flag.")
+  parser.add_argument(
+      "--tensorboard_debug_address",
+      type=str,
+      default=None,
+      help="Connect to the TensorBoard Debugger Plugin backend specified by "
+      "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
+      "--debug flag.")
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_tflearn_iris.py b/tensorflow/python/debug/examples/debug_tflearn_iris.py
index 92314d8dd9..4f4666ee4f 100644
--- a/tensorflow/python/debug/examples/debug_tflearn_iris.py
+++ b/tensorflow/python/debug/examples/debug_tflearn_iris.py
@@ -110,10 +110,16 @@ def main(_):
       model_dir=model_dir)
 
   hooks = None
+  if FLAGS.debug and FLAGS.tensorboard_debug_address:
+    raise ValueError(
+        "The --debug and --tensorboard_debug_address flags are mutually "
+        "exclusive.")
   if FLAGS.debug:
     debug_hook = tf_debug.LocalCLIDebugHook(ui_type=FLAGS.ui_type,
                                             dump_root=FLAGS.dump_root)
-    hooks = [debug_hook]
+  elif FLAGS.tensorboard_debug_address:
+    debug_hook = tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address)
+  hooks = [debug_hook]
 
   if not FLAGS.use_experiment:
     # Fit model.
@@ -185,11 +191,19 @@ if __name__ == "__main__":
       nargs="?",
       const=True,
       default=False,
-      help="Use debugger to track down bad values during training")
+      help="Use debugger to track down bad values during training. "
+      "Mutually exclusive with the --tensorboard_debug_address flag.")
   parser.add_argument(
       "--dump_root",
       type=str,
       default="",
       help="Optional custom root directory for temporary debug dump data")
+  parser.add_argument(
+      "--tensorboard_debug_address",
+      type=str,
+      default=None,
+      help="Connect to the TensorBoard Debugger Plugin backend specified by "
+      "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
+      "--debug flag.")
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
-- 
GitLab


From 0a53ad466e2fe001b80b8addde9a3465f7d5357f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 07:27:48 -0800
Subject: [PATCH 305/966] Remove unused class members

PiperOrigin-RevId: 183662473
---
 tensorflow/core/grappler/clusters/single_machine.cc        | 5 +----
 tensorflow/core/grappler/clusters/single_machine.h         | 1 -
 tensorflow/core/grappler/optimizers/dependency_optimizer.h | 6 ++----
 tensorflow/core/kernels/data/iterator_ops.cc               | 6 ++----
 tensorflow/core/kernels/lmdb_reader_op.cc                  | 4 +---
 5 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 2712c5b679..862ce4ae88 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -36,10 +36,7 @@ namespace grappler {
 static std::atomic<bool> already_provisioned(false);
 
 SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus)
-    : Cluster(timeout_s),
-      num_gpus_(num_gpus),
-      expected_init_time_s_(0),
-      closing_(false) {
+    : Cluster(timeout_s), expected_init_time_s_(0), closing_(false) {
   VLOG(1) << "Number of CPU cores: " << num_cpu_cores
           << " Number of GPUs: " << num_gpus;
   thread_pool_.reset(new thread::ThreadPool(
diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h
index a254f72f0c..90d6a04cab 100644
--- a/tensorflow/core/grappler/clusters/single_machine.h
+++ b/tensorflow/core/grappler/clusters/single_machine.h
@@ -64,7 +64,6 @@ class SingleMachine : public Cluster {
 
   Status ClearAllocatorStats() const;
 
-  const int num_gpus_;
   std::unique_ptr<Session> session_;
   std::vector<QueueRunnerDef> queue_runner_defs_;
   string last_graph_id_;
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index 02d8a0f32a..cfc5324439 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -29,9 +29,8 @@ namespace grappler {
 // optimizations, such as removing nodes that are effectively noops.
 class DependencyOptimizer : public GraphOptimizer {
  public:
-  DependencyOptimizer() : opt_level_(RewriterConfig::ON) {}
-  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+  DependencyOptimizer() {}
+  explicit DependencyOptimizer(RewriterConfig::Toggle /*unused*/) {}
   ~DependencyOptimizer() override {}
 
   string name() const override { return "dependency_optimizer"; };
@@ -62,7 +61,6 @@ class DependencyOptimizer : public GraphOptimizer {
   // Main driver of dependency optimizations.
   Status OptimizeDependencies();
 
-  RewriterConfig::Toggle opt_level_;
   bool fetch_nodes_known_;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index ca22f10a85..b37bd672ad 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -82,7 +82,7 @@ class IteratorResource : public ResourceBase {
  public:
   IteratorResource(const DataTypeVector& output_dtypes,
                    const std::vector<PartialTensorShape>& output_shapes,
-                   const int graph_def_version,
+                   const int /*unused: graph_def_version*/,
                    std::unique_ptr<DeviceMgr> device_mgr,
                    std::unique_ptr<FunctionLibraryDefinition> flib_def,
                    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
@@ -93,8 +93,7 @@ class IteratorResource : public ResourceBase {
         lib_(lib),
         iterator_(nullptr),
         output_dtypes_(output_dtypes),
-        output_shapes_(output_shapes),
-        graph_def_version_(graph_def_version) {}
+        output_shapes_(output_shapes) {}
 
   Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) {
@@ -223,7 +222,6 @@ class IteratorResource : public ResourceBase {
   std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
   const DataTypeVector output_dtypes_;
   const std::vector<PartialTensorShape> output_shapes_;
-  const int graph_def_version_;
 };
 
 // Helper class for reading data from a VariantTensorData object.
diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc
index 1335a95dce..2474fe4d56 100755
--- a/tensorflow/core/kernels/lmdb_reader_op.cc
+++ b/tensorflow/core/kernels/lmdb_reader_op.cc
@@ -26,9 +26,8 @@ namespace tensorflow {
 
 class LMDBReader : public ReaderBase {
  public:
-  LMDBReader(const string& node_name, Env* env)
+  LMDBReader(const string& node_name, Env* /*unused*/)
       : ReaderBase(strings::StrCat("LMDBReader '", node_name, "'")),
-        env_(env),
         mdb_env_(nullptr),
         mdb_dbi_(0),
         mdb_txn_(nullptr),
@@ -107,7 +106,6 @@ class LMDBReader : public ReaderBase {
     }
   }
 
-  Env* const env_;
   MDB_env* mdb_env_;
   MDB_dbi mdb_dbi_;
 
-- 
GitLab


From 8fb12848d3a81a010714a4612ffd735106ea83d8 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 29 Jan 2018 08:24:13 -0800
Subject: [PATCH 306/966] Disable AWS S3 virtual addressing (#16443)

* Disable AWS S3 virtual addressing

This fix is related to 16397 and 15159. The fix disables
the virtual addressing of AWS S3, as was suggested in the comment.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix format issue.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add comment for the passed parameter of virutal addressing.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/platform/s3/s3_file_system.cc | 11 +++++++++--
 tensorflow/core/platform/s3/s3_file_system.h  | 12 ++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 1e89fa77c1..4862fd85be 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -306,8 +306,15 @@ std::shared_ptr<Aws::S3::S3Client> S3FileSystem::GetS3Client() {
     };
     Aws::InitAPI(options);
 
-    this->s3_client_ = std::shared_ptr<Aws::S3::S3Client>(
-        new Aws::S3::S3Client(GetDefaultClientConfig()));
+    // The creation of S3Client disables virtual addressing:
+    //   S3Client(clientConfiguration, signPayloads, useVirtualAdressing = true)
+    // The purpose is to address the issue encountered when there is an `.`
+    // in the bucket name. Due to TLS hostname validation or DNS rules,
+    // the bucket may not be resolved. Disabling of virtual addressing
+    // should address the issue. See GitHub issue 16397 for details.
+    this->s3_client_ = std::shared_ptr<Aws::S3::S3Client>(new Aws::S3::S3Client(
+        GetDefaultClientConfig(),
+        Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false));
   }
 
   return this->s3_client_;
diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h
index 168b8007f3..d0d6bb5949 100644
--- a/tensorflow/core/platform/s3/s3_file_system.h
+++ b/tensorflow/core/platform/s3/s3_file_system.h
@@ -57,6 +57,18 @@ class S3FileSystem : public FileSystem {
   Status RenameFile(const string& src, const string& target) override;
  private:
   // Returns the member S3 client, initializing as-needed.
+  // When the client tries to access the object in S3, e.g.,
+  //   s3://bucket-name/path/to/object
+  // the behavior could be controlled by various environmental
+  // variables.
+  // By default S3 access regional endpoint, with region
+  // controlled by `AWS_REGION`. The endpoint could be overridden
+  // with explicity `S3_ENDPOINT`. S3 use HTTPS by default.
+  // If S3_USE_HTTPS=0 is specified, HTTP is used. Also,
+  // S3_VERIFY_SSL=0 could disable SSL verification in case
+  // HTTPS is used.
+  // This S3 Client does not support Virtual Hosted–Style Method
+  // for a bucket.
   std::shared_ptr<Aws::S3::S3Client> GetS3Client();
 
   std::shared_ptr<Aws::S3::S3Client> s3_client_;
-- 
GitLab


From 914b9919f98b6486127c75110034e814d53efcb2 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 19 Nov 2017 13:26:39 -0800
Subject: [PATCH 307/966] Add `tf.unravel_index` as an equivalent of
 `np.unravel_index`

This fix tries to address the issue raised in 2075 where
there was no implementation of  `tf.unravel_index`.
The `tf.unravel_index` could be quite useful in many places.
This fix adds the `tf.unravel_index` in CPU kernel. Note `order`
in `np.unravel_index` has not been added yet.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/BUILD               |  7 ++
 tensorflow/core/kernels/unravel_index_op.cc | 90 +++++++++++++++++++++
 tensorflow/core/ops/array_ops.cc            | 21 +++++
 3 files changed, 118 insertions(+)
 create mode 100644 tensorflow/core/kernels/unravel_index_op.cc

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index fd99409c9b..db309fc9da 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -629,6 +629,7 @@ cc_library(
         ":transpose_op",
         ":unique_op",
         ":unpack_op",
+        ":unravel_index_op",
         ":where_op",
     ],
 )
@@ -883,6 +884,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS + [":split_lib"],
 )
 
+tf_kernel_library(
+    name = "unravel_index_op",
+    prefix = "unravel_index_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "where_op",
     srcs = ["where_op.cc"],
diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc
new file mode 100644
index 0000000000..9b9f0c6f88
--- /dev/null
+++ b/tensorflow/core/kernels/unravel_index_op.cc
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace {
+template <typename T>
+struct mod_op {
+  const T operator()(const T& a, const T& b) const {
+    return a % b;
+  }
+};
+}
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+class UnravelIndexOp : public OpKernel {
+ public:
+  explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) { }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& indices_tensor = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices_tensor.shape()) || TensorShapeUtils::IsScalar(indices_tensor.shape()), errors::InvalidArgument("The indices can only be scalar or vector, got \"", indices_tensor.shape().DebugString(), "\""));
+
+    const Tensor& dims_tensor = ctx->input(1);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(dims_tensor.shape()), errors::InvalidArgument("The indices can only be 1-D, got \"", dims_tensor.shape().DebugString(), "\""));
+
+    auto dims = dims_tensor.vec<int32>();
+
+    Eigen::array<bool, 1> reverse({true});
+
+    Tensor strides_tensor;
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_INT32, TensorShape({dims_tensor.NumElements()}), &strides_tensor));
+
+    auto strides = strides_tensor.vec<int32>();
+    strides = dims.reverse(reverse).scan(0, Eigen::internal::ProdReducer<int32>(), false).reverse(reverse);
+
+    Tensor strides_shifted_tensor;
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_INT32, TensorShape({dims_tensor.NumElements()}), &strides_shifted_tensor));
+
+    auto strides_shifted = strides_shifted_tensor.vec<int32>();
+    strides_shifted = dims.reverse(reverse).scan(0, Eigen::internal::ProdReducer<int32>(), true).reverse(reverse);
+
+    Tensor* output_tensor = nullptr;
+    if (TensorShapeUtils::IsScalar(indices_tensor.shape())) {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements()}), &output_tensor));
+
+      auto output = output_tensor->vec<int32>();
+
+      output = output.constant(indices_tensor.scalar<int32>()());
+      output = output.binaryExpr(strides, mod_op<int32>()) / strides_shifted;
+    } else {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements(), indices_tensor.NumElements()}), &output_tensor));
+
+      auto output = output_tensor->matrix<int32>();
+
+      Eigen::array<int64, 2> reshape{{dims_tensor.NumElements(), 1}};
+      Eigen::array<int64, 2> bcast({1, indices_tensor.NumElements()});
+      Eigen::array<int64, 2> indices_reshape{{1, indices_tensor.NumElements()}};
+      Eigen::array<int64, 2> indices_bcast({dims_tensor.NumElements(), 1});
+
+      output = indices_tensor.vec<int32>().reshape(indices_reshape).broadcast(indices_bcast);
+      output = output.binaryExpr(strides.reshape(reshape).broadcast(bcast), mod_op<int32>()) / strides_shifted.reshape(reshape).broadcast(bcast);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnravelIndex").Device(DEVICE_CPU), UnravelIndexOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index fb9e8ad50c..526b0caef3 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -335,6 +335,27 @@ REGISTER_OP("Unpack")
       return Status::OK();
     });
 
+REGISTER_OP("UnravelIndex")
+    .Input("indices: int32")
+    .Input("dims: int32")
+    .Output("output: int32")
+    .SetShapeFn([](InferenceContext* c) {
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Converts a flat index or array of flat indices into a tuple of coordinate
+arrays.
+
+This is equivalent to numpy.unravel_index
+
+indices: An 0-D or 1-D `int` Tensor whose elements are indices into the
+  flattened version of an array of dimensions dims.
+dims: An 1-D `int` Tensor. The shape of the array to use for unraveling
+  indices.
+output: An 2-D (or 1-D if indices is 0-D) tensor where each row has the
+  same shape as the indices array.
+)doc");
+
 // --------------------------------------------------------------------------
 // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph
 // in the N == 1 case to remove the node.
-- 
GitLab


From 88b8c4bca1bb6bdcb5d68e61f6fc32489d9d8918 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 19 Nov 2017 13:31:50 -0800
Subject: [PATCH 308/966] Sanitize unravel_index_op.cc with clang-format

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/unravel_index_op.cc | 57 +++++++++++++++------
 1 file changed, 41 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc
index 9b9f0c6f88..e4ce333eeb 100644
--- a/tensorflow/core/kernels/unravel_index_op.cc
+++ b/tensorflow/core/kernels/unravel_index_op.cc
@@ -26,51 +26,71 @@ namespace tensorflow {
 namespace {
 template <typename T>
 struct mod_op {
-  const T operator()(const T& a, const T& b) const {
-    return a % b;
-  }
+  const T operator()(const T& a, const T& b) const { return a % b; }
 };
-}
+}  // namespace
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 class UnravelIndexOp : public OpKernel {
  public:
-  explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) { }
+  explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
 
   void Compute(OpKernelContext* ctx) override {
     const Tensor& indices_tensor = ctx->input(0);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices_tensor.shape()) || TensorShapeUtils::IsScalar(indices_tensor.shape()), errors::InvalidArgument("The indices can only be scalar or vector, got \"", indices_tensor.shape().DebugString(), "\""));
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(indices_tensor.shape()) ||
+                    TensorShapeUtils::IsScalar(indices_tensor.shape()),
+                errors::InvalidArgument(
+                    "The indices can only be scalar or vector, got \"",
+                    indices_tensor.shape().DebugString(), "\""));
 
     const Tensor& dims_tensor = ctx->input(1);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(dims_tensor.shape()), errors::InvalidArgument("The indices can only be 1-D, got \"", dims_tensor.shape().DebugString(), "\""));
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVector(dims_tensor.shape()),
+        errors::InvalidArgument("The indices can only be 1-D, got \"",
+                                dims_tensor.shape().DebugString(), "\""));
 
     auto dims = dims_tensor.vec<int32>();
 
     Eigen::array<bool, 1> reverse({true});
 
     Tensor strides_tensor;
-    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_INT32, TensorShape({dims_tensor.NumElements()}), &strides_tensor));
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                            DT_INT32, TensorShape({dims_tensor.NumElements()}),
+                            &strides_tensor));
 
     auto strides = strides_tensor.vec<int32>();
-    strides = dims.reverse(reverse).scan(0, Eigen::internal::ProdReducer<int32>(), false).reverse(reverse);
+    strides = dims.reverse(reverse)
+                  .scan(0, Eigen::internal::ProdReducer<int32>(), false)
+                  .reverse(reverse);
 
     Tensor strides_shifted_tensor;
-    OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_INT32, TensorShape({dims_tensor.NumElements()}), &strides_shifted_tensor));
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                            DT_INT32, TensorShape({dims_tensor.NumElements()}),
+                            &strides_shifted_tensor));
 
     auto strides_shifted = strides_shifted_tensor.vec<int32>();
-    strides_shifted = dims.reverse(reverse).scan(0, Eigen::internal::ProdReducer<int32>(), true).reverse(reverse);
+    strides_shifted = dims.reverse(reverse)
+                          .scan(0, Eigen::internal::ProdReducer<int32>(), true)
+                          .reverse(reverse);
 
     Tensor* output_tensor = nullptr;
     if (TensorShapeUtils::IsScalar(indices_tensor.shape())) {
-      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements()}), &output_tensor));
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements()}),
+                                    &output_tensor));
 
       auto output = output_tensor->vec<int32>();
 
       output = output.constant(indices_tensor.scalar<int32>()());
       output = output.binaryExpr(strides, mod_op<int32>()) / strides_shifted;
     } else {
-      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements(), indices_tensor.NumElements()}), &output_tensor));
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_output(0,
+                                    TensorShape({dims_tensor.NumElements(),
+                                                 indices_tensor.NumElements()}),
+                                    &output_tensor));
 
       auto output = output_tensor->matrix<int32>();
 
@@ -79,12 +99,17 @@ class UnravelIndexOp : public OpKernel {
       Eigen::array<int64, 2> indices_reshape{{1, indices_tensor.NumElements()}};
       Eigen::array<int64, 2> indices_bcast({dims_tensor.NumElements(), 1});
 
-      output = indices_tensor.vec<int32>().reshape(indices_reshape).broadcast(indices_bcast);
-      output = output.binaryExpr(strides.reshape(reshape).broadcast(bcast), mod_op<int32>()) / strides_shifted.reshape(reshape).broadcast(bcast);
+      output = indices_tensor.vec<int32>()
+                   .reshape(indices_reshape)
+                   .broadcast(indices_bcast);
+      output = output.binaryExpr(strides.reshape(reshape).broadcast(bcast),
+                                 mod_op<int32>()) /
+               strides_shifted.reshape(reshape).broadcast(bcast);
     }
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("UnravelIndex").Device(DEVICE_CPU), UnravelIndexOp);
+REGISTER_KERNEL_BUILDER(Name("UnravelIndex").Device(DEVICE_CPU),
+                        UnravelIndexOp);
 
 }  // namespace tensorflow
-- 
GitLab


From f7443e866e994128baaed4019cb3ddbd0c0212ca Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 19 Nov 2017 13:32:14 -0800
Subject: [PATCH 309/966] Expose tf.unravel_index in python

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/array_ops.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 24a0c18619..9541b097a9 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -34,6 +34,7 @@ See the @{$python/array_ops} guide.
 @@reshape
 @@squeeze
 @@expand_dims
+@@unravel_index
 @@meshgrid
 @@slice
 @@strided_slice
-- 
GitLab


From 5b6d2b0652d5b4da03c0e52e648f4b396cbb762f Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 19 Nov 2017 13:32:40 -0800
Subject: [PATCH 310/966] Add test cases for tf.unravel_index.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/array_ops_test.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index a96b88d96f..2716c4a51f 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -1114,6 +1114,21 @@ class InvertPermutationTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(y.get_shape(), [5])
         self.assertAllEqual(y.eval(), [2, 4, 3, 0, 1])
 
+class UnravelIndexTest(test_util.TensorFlowTestCase):
+
+  def testUnravelIndex(self):
+    with self.test_session():
+      out_1 = array_ops.unravel_index(1621, [6, 7, 8, 9])
+      self.assertAllEqual(out_1.eval(), [3, 1, 4, 1])
+      out_2 = array_ops.unravel_index([1621], [6, 7, 8, 9])
+      self.assertAllEqual(out_2.eval(), [[3],
+                                        [1],
+                                        [4],
+                                        [1]])
+      out_3 = array_ops.unravel_index([22, 41, 37], [7, 6])
+      self.assertAllEqual(out_3.eval(), [[3, 6, 6],
+                                        [4, 5, 1]])
+
 
 class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
 
-- 
GitLab


From babc9bae71d31e35b8d66a715fcd527ee1ee645a Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 1 Dec 2017 11:09:11 +0000
Subject: [PATCH 311/966] Add int64 support for unravel_index

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/unravel_index_op.cc | 59 +++++++++++----------
 tensorflow/core/ops/array_ops.cc            | 17 +++---
 2 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc
index e4ce333eeb..da9ab01e8d 100644
--- a/tensorflow/core/kernels/unravel_index_op.cc
+++ b/tensorflow/core/kernels/unravel_index_op.cc
@@ -32,15 +32,15 @@ struct mod_op {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+template <typename Tidx>
 class UnravelIndexOp : public OpKernel {
  public:
   explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
 
   void Compute(OpKernelContext* ctx) override {
     const Tensor& indices_tensor = ctx->input(0);
-    OP_REQUIRES(ctx,
-                TensorShapeUtils::IsVector(indices_tensor.shape()) ||
-                    TensorShapeUtils::IsScalar(indices_tensor.shape()),
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices_tensor.shape()) ||
+                         TensorShapeUtils::IsScalar(indices_tensor.shape()),
                 errors::InvalidArgument(
                     "The indices can only be scalar or vector, got \"",
                     indices_tensor.shape().DebugString(), "\""));
@@ -51,28 +51,30 @@ class UnravelIndexOp : public OpKernel {
         errors::InvalidArgument("The indices can only be 1-D, got \"",
                                 dims_tensor.shape().DebugString(), "\""));
 
-    auto dims = dims_tensor.vec<int32>();
+    auto dims = dims_tensor.vec<Tidx>();
 
     Eigen::array<bool, 1> reverse({true});
 
     Tensor strides_tensor;
-    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
-                            DT_INT32, TensorShape({dims_tensor.NumElements()}),
-                            &strides_tensor));
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_temp(DataTypeToEnum<Tidx>::value,
+                                      TensorShape({dims_tensor.NumElements()}),
+                                      &strides_tensor));
 
-    auto strides = strides_tensor.vec<int32>();
+    auto strides = strides_tensor.vec<Tidx>();
     strides = dims.reverse(reverse)
-                  .scan(0, Eigen::internal::ProdReducer<int32>(), false)
+                  .scan(0, Eigen::internal::ProdReducer<Tidx>(), false)
                   .reverse(reverse);
 
     Tensor strides_shifted_tensor;
-    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
-                            DT_INT32, TensorShape({dims_tensor.NumElements()}),
-                            &strides_shifted_tensor));
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_temp(DataTypeToEnum<Tidx>::value,
+                                      TensorShape({dims_tensor.NumElements()}),
+                                      &strides_shifted_tensor));
 
-    auto strides_shifted = strides_shifted_tensor.vec<int32>();
+    auto strides_shifted = strides_shifted_tensor.vec<Tidx>();
     strides_shifted = dims.reverse(reverse)
-                          .scan(0, Eigen::internal::ProdReducer<int32>(), true)
+                          .scan(0, Eigen::internal::ProdReducer<Tidx>(), true)
                           .reverse(reverse);
 
     Tensor* output_tensor = nullptr;
@@ -81,35 +83,38 @@ class UnravelIndexOp : public OpKernel {
           ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements()}),
                                     &output_tensor));
 
-      auto output = output_tensor->vec<int32>();
+      auto output = output_tensor->vec<Tidx>();
 
-      output = output.constant(indices_tensor.scalar<int32>()());
-      output = output.binaryExpr(strides, mod_op<int32>()) / strides_shifted;
+      output = output.constant(indices_tensor.scalar<Tidx>()());
+      output = output.binaryExpr(strides, mod_op<Tidx>()) / strides_shifted;
     } else {
-      OP_REQUIRES_OK(
-          ctx, ctx->allocate_output(0,
-                                    TensorShape({dims_tensor.NumElements(),
-                                                 indices_tensor.NumElements()}),
-                                    &output_tensor));
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(
+                              0, TensorShape({dims_tensor.NumElements(),
+                                              indices_tensor.NumElements()}),
+                              &output_tensor));
 
-      auto output = output_tensor->matrix<int32>();
+      auto output = output_tensor->matrix<Tidx>();
 
       Eigen::array<int64, 2> reshape{{dims_tensor.NumElements(), 1}};
       Eigen::array<int64, 2> bcast({1, indices_tensor.NumElements()});
       Eigen::array<int64, 2> indices_reshape{{1, indices_tensor.NumElements()}};
       Eigen::array<int64, 2> indices_bcast({dims_tensor.NumElements(), 1});
 
-      output = indices_tensor.vec<int32>()
+      output = indices_tensor.vec<Tidx>()
                    .reshape(indices_reshape)
                    .broadcast(indices_bcast);
       output = output.binaryExpr(strides.reshape(reshape).broadcast(bcast),
-                                 mod_op<int32>()) /
+                                 mod_op<Tidx>()) /
                strides_shifted.reshape(reshape).broadcast(bcast);
     }
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("UnravelIndex").Device(DEVICE_CPU),
-                        UnravelIndexOp);
+#define REGISTER_KERNEL(type)                                               \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("UnravelIndex").Device(DEVICE_CPU).TypeConstraint<type>("Tidx"), \
+      UnravelIndexOp<type>);
+TF_CALL_int32(REGISTER_KERNEL) TF_CALL_int64(REGISTER_KERNEL)
+#undef REGISTER_KERNEL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 526b0caef3..f9fdd50e6b 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -336,24 +336,25 @@ REGISTER_OP("Unpack")
     });
 
 REGISTER_OP("UnravelIndex")
-    .Input("indices: int32")
-    .Input("dims: int32")
-    .Output("output: int32")
-    .SetShapeFn([](InferenceContext* c) {
-      return Status::OK();
-    })
+    .Input("indices: Tidx")
+    .Input("dims: Tidx")
+    .Output("output: Tidx")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) { return Status::OK(); })
     .Doc(R"doc(
 Converts a flat index or array of flat indices into a tuple of coordinate
 arrays.
 
-This is equivalent to numpy.unravel_index
-
 indices: An 0-D or 1-D `int` Tensor whose elements are indices into the
   flattened version of an array of dimensions dims.
 dims: An 1-D `int` Tensor. The shape of the array to use for unraveling
   indices.
 output: An 2-D (or 1-D if indices is 0-D) tensor where each row has the
   same shape as the indices array.
+
+@compatibility(numpy)
+Equivalent to np.unravel_index
+@end_compatibility
 )doc");
 
 // --------------------------------------------------------------------------
-- 
GitLab


From b1b5c2e74d9a3e54d2e84279db94027060e20609 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 1 Dec 2017 11:11:11 +0000
Subject: [PATCH 312/966] Add test cases for int64 support of unravel_index

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/kernel_tests/array_ops_test.py     | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 2716c4a51f..68b7c3a98a 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -1118,16 +1118,21 @@ class UnravelIndexTest(test_util.TensorFlowTestCase):
 
   def testUnravelIndex(self):
     with self.test_session():
-      out_1 = array_ops.unravel_index(1621, [6, 7, 8, 9])
-      self.assertAllEqual(out_1.eval(), [3, 1, 4, 1])
-      out_2 = array_ops.unravel_index([1621], [6, 7, 8, 9])
-      self.assertAllEqual(out_2.eval(), [[3],
-                                        [1],
-                                        [4],
-                                        [1]])
-      out_3 = array_ops.unravel_index([22, 41, 37], [7, 6])
-      self.assertAllEqual(out_3.eval(), [[3, 6, 6],
-                                        [4, 5, 1]])
+      for dtype in [dtypes.int32, dtypes.int64]:
+        indices_1 = constant_op.constant(1621, dtype=dtype)
+        dims_1 = constant_op.constant([6, 7, 8, 9], dtype=dtype)
+        out_1 = array_ops.unravel_index(indices_1, dims_1)
+        self.assertAllEqual(out_1.eval(), [3, 1, 4, 1])
+
+        indices_2 = constant_op.constant([1621], dtype=dtype)
+        dims_2 = constant_op.constant([6, 7, 8, 9], dtype=dtype)
+        out_2 = array_ops.unravel_index(indices_2, dims_2)
+        self.assertAllEqual(out_2.eval(), [[3], [1], [4], [1]])
+
+        indices_3 = constant_op.constant([22, 41, 37], dtype=dtype)
+        dims_3 = constant_op.constant([7, 6], dtype=dtype)
+        out_3 = array_ops.unravel_index(indices_3, dims_3)
+        self.assertAllEqual(out_3.eval(), [[3, 6, 6], [4, 5, 1]])
 
 
 class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
-- 
GitLab


From 25f8e54ea5021c04c11184fe134fc31b0e7ef88d Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 5 Dec 2017 22:07:01 +0000
Subject: [PATCH 313/966] Address review feedback

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/array_ops.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index f9fdd50e6b..45f880265b 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -342,8 +342,12 @@ REGISTER_OP("UnravelIndex")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) { return Status::OK(); })
     .Doc(R"doc(
-Converts a flat index or array of flat indices into a tuple of coordinate
-arrays.
+Converts a flat index or array of flat indices into a tuple of
+coordinate arrays.
+
+@compatibility(numpy)
+Equivalent to np.unravel_index
+@end_compatibility
 
 indices: An 0-D or 1-D `int` Tensor whose elements are indices into the
   flattened version of an array of dimensions dims.
@@ -351,10 +355,6 @@ dims: An 1-D `int` Tensor. The shape of the array to use for unraveling
   indices.
 output: An 2-D (or 1-D if indices is 0-D) tensor where each row has the
   same shape as the indices array.
-
-@compatibility(numpy)
-Equivalent to np.unravel_index
-@end_compatibility
 )doc");
 
 // --------------------------------------------------------------------------
-- 
GitLab


From af499b49e84991152d24ee97c4ef893d6986a081 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 29 Jan 2018 16:57:37 +0000
Subject: [PATCH 314/966] Update API defs and golden

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../base_api/api_def_UnravelIndex.pbtxt       | 32 +++++++++++++++++++
 tensorflow/core/ops/array_ops.cc              | 17 +---------
 tensorflow/tools/api/golden/tensorflow.pbtxt  |  4 +++
 3 files changed, 37 insertions(+), 16 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt
new file mode 100644
index 0000000000..97c380700a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "UnravelIndex"
+  in_arg {
+    name: "indices"
+    description: <<END
+An 0-D or 1-D `int` Tensor whose elements are indices into the
+flattened version of an array of dimensions dims.
+END
+  }
+  in_arg {
+    name: "dims"
+    description: <<END
+An 1-D `int` Tensor. The shape of the array to use for unraveling
+indices.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+An 2-D (or 1-D if indices is 0-D) tensor where each row has the
+same shape as the indices array.
+END
+  }
+  summary: "Converts a flat index or array of flat indices into a tuple of"
+  description: <<END
+coordinate arrays.
+
+@compatibility(numpy)
+Equivalent to np.unravel_index
+@end_compatibility
+END
+}
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 45f880265b..5ec2a4e9b4 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -340,22 +340,7 @@ REGISTER_OP("UnravelIndex")
     .Input("dims: Tidx")
     .Output("output: Tidx")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn([](InferenceContext* c) { return Status::OK(); })
-    .Doc(R"doc(
-Converts a flat index or array of flat indices into a tuple of
-coordinate arrays.
-
-@compatibility(numpy)
-Equivalent to np.unravel_index
-@end_compatibility
-
-indices: An 0-D or 1-D `int` Tensor whose elements are indices into the
-  flattened version of an array of dimensions dims.
-dims: An 1-D `int` Tensor. The shape of the array to use for unraveling
-  indices.
-output: An 2-D (or 1-D if indices is 0-D) tensor where each row has the
-  same shape as the indices array.
-)doc");
+    .SetShapeFn([](InferenceContext* c) { return Status::OK(); });
 
 // --------------------------------------------------------------------------
 // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index db1ed42185..dc7c3a2f45 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -2044,6 +2044,10 @@ tf_module {
     name: "unique_with_counts"
     argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
   }
+  member_method {
+    name: "unravel_index"
+    argspec: "args=[\'indices\', \'dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_max"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-- 
GitLab


From b9dae47061d4d4c9b8f8a79e73519525413ab84c Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Mon, 29 Jan 2018 09:25:42 -0800
Subject: [PATCH 315/966] Make TFLite Transpose op have parity with TF
 Transpose op.

PiperOrigin-RevId: 183676663
---
 tensorflow/contrib/lite/builtin_op_data.h     |   4 -
 tensorflow/contrib/lite/kernels/transpose.cc  |  66 +++++---
 .../contrib/lite/kernels/transpose_test.cc    | 158 ++++++++++++++----
 tensorflow/contrib/lite/model.cc              |   8 -
 tensorflow/contrib/lite/schema/schema.fbs     |   1 -
 .../contrib/lite/schema/schema_generated.h    |  34 +---
 .../contrib/lite/testing/generate_examples.py |  27 ++-
 .../testing/generated_examples_zip_test.cc    |   2 +-
 .../contrib/lite/toco/tflite/operator.cc      |   5 +-
 .../contrib/lite/toco/tflite/operator_test.cc |   9 -
 10 files changed, 197 insertions(+), 117 deletions(-)

diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 8338fde8ac..8966e5c2b6 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -204,10 +204,6 @@ typedef struct {
 } TfLiteGatherParams;
 
 typedef struct {
-  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
-  // For now we will fix the maximum possible number of dimensions.
-  int perm[8];
-  int num_dimensions;
 } TfLiteTransposeParams;
 
 typedef struct {
diff --git a/tensorflow/contrib/lite/kernels/transpose.cc b/tensorflow/contrib/lite/kernels/transpose.cc
index 75d8136b6a..093814bc44 100644
--- a/tensorflow/contrib/lite/kernels/transpose.cc
+++ b/tensorflow/contrib/lite/kernels/transpose.cc
@@ -31,60 +31,78 @@ enum KernelType {
   kReference,
 };
 
-// TODO(nupurgarg): Permutation arrays represented as a tensor are ignored. Only
-// use the `perm` specified in `params`.
 struct TransposeContext {
   TransposeContext(TfLiteContext* context, TfLiteNode* node) {
-    params = reinterpret_cast<TfLiteTransposeParams*>(node->builtin_data);
     input = GetInput(context, node, 0);
+    perm = GetInput(context, node, 1);
     output = GetOutput(context, node, 0);
   }
-  TfLiteTransposeParams* params;
   TfLiteTensor* input;
+  TfLiteTensor* perm;
   TfLiteTensor* output;
 };
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                TransposeContext* op_context) {
+  int dims = NumDimensions(op_context->input);
+  const int* perm_data = GetTensorData<int32_t>(op_context->perm);
 
-  TransposeContext op_context(context, node);
-  int dims = NumDimensions(op_context.input);
-
-  // Ensure validity of input tensor and permutation array.
-  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
-  TF_LITE_ENSURE_EQ(context, dims, op_context.params->num_dimensions);
-  TF_LITE_ENSURE_MSG(context, dims <= 4,
-                     "Transpose op only supports 1D-4D input arrays.");
+  // Ensure validity of the permutations tensor as a 1D tensor.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->perm), 1);
+  TF_LITE_ENSURE_EQ(context, op_context->perm->dims->data[0], dims);
   for (int idx = 0; idx < dims; ++idx) {
-    TF_LITE_ENSURE_MSG(context,
-                       op_context.params->perm[idx] >= 0 &&
-                           op_context.params->perm[idx] < dims,
+    TF_LITE_ENSURE_MSG(context, (perm_data[idx] >= 0 && perm_data[idx] < dims),
                        "Transpose op permutations array is out of bounds.");
   }
 
   // Determine size of output tensor.
-  const TfLiteIntArray* input_size = op_context.input->dims;
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(dims);
+  TfLiteIntArray* input_size = op_context->input->dims;
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
   for (int idx = 0; idx < dims; ++idx) {
-    output_size->data[idx] = input_size->data[op_context.params->perm[idx]];
+    output_size->data[idx] = input_size->data[perm_data[idx]];
   }
 
-  return context->ResizeTensor(context, op_context.output, output_size);
+  return context->ResizeTensor(context, op_context->output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TransposeContext op_context(context, node);
+
+  // Ensure validity of input tensor.
+  TF_LITE_ENSURE_MSG(context, NumDimensions(op_context.input) <= 4,
+                     "Transpose op only supports 1D-4D input arrays.");
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+
+  if (!IsConstantTensor(op_context.perm)) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   TransposeContext op_context(context, node);
 
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
+  }
+
   // Reverse the permuted axes and convert to 4D due to the way Dims are
   // constructed in GetTensorDims.
+  const int* perm_data = GetTensorData<int32_t>(op_context.perm);
+  const int size = op_context.perm->dims->data[0];
   const int kOutputDimensionNum = 4;
   int reversed_perm[kOutputDimensionNum];
-  int size = op_context.params->num_dimensions;
+
   for (int output_k = 0, input_k = size - 1; output_k < size;
        ++output_k, --input_k) {
-    reversed_perm[output_k] = size - op_context.params->perm[input_k] - 1;
+    reversed_perm[output_k] = size - perm_data[input_k] - 1;
   }
   for (int k = size; k < kOutputDimensionNum; ++k) {
     reversed_perm[k] = k;
diff --git a/tensorflow/contrib/lite/kernels/transpose_test.cc b/tensorflow/contrib/lite/kernels/transpose_test.cc
index 7f5832cd5f..337bc144b9 100644
--- a/tensorflow/contrib/lite/kernels/transpose_test.cc
+++ b/tensorflow/contrib/lite/kernels/transpose_test.cc
@@ -127,61 +127,124 @@ TEST(TransposeTest, TestRefOps4D) {
 
 class TransposeOpModel : public SingleOpModel {
  public:
-  TransposeOpModel(std::initializer_list<int> input_shape,
-                   std::initializer_list<int> perm) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(
-        BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions,
-        CreateTransposeOptions(builder_, builder_.CreateVector<int>(perm))
-            .Union());
-    BuildInterpreter({input_shape});
-  }
-
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetPerm(std::initializer_list<int> data) {
+    PopulateTensor<int>(perm_, data);
+  }
+
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
- private:
+ protected:
   int input_;
+  int perm_;
   int output_;
 };
 
+// Tests case where perm is a const tensor.
+//
+// Example usage is as follows:
+//    SpaceToBatchNDOpConstModel m(input_shape, perm_shape, perm_data);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class TransposeOpConstModel : public TransposeOpModel {
+ public:
+  TransposeOpConstModel(std::initializer_list<int> input_shape,
+                        std::initializer_list<int> perm_shape,
+                        std::initializer_list<int> perm) {
+    input_ = AddInput(TensorType_FLOAT32);
+    perm_ = AddConstInput(TensorType_INT32, perm, perm_shape);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions,
+                 CreateTransposeOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Tests case where perm is a non-const tensor.
+//
+// Example usage is as follows:
+//    TransposeOpDynamicModel m(input_shape, perm_shape);
+//    m.SetInput(input_data);
+//    m.SetPerm(perm_data);
+//    m.Invoke();
+class TransposeOpDynamicModel : public TransposeOpModel {
+ public:
+  TransposeOpDynamicModel(std::initializer_list<int> input_shape,
+                          std::initializer_list<int> perm_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    perm_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions,
+                 CreateTransposeOptions(builder_).Union());
+    BuildInterpreter({input_shape, perm_shape});
+  }
+};
+
 TEST(TransposeTest, TestUnequalPermSize) {
-  EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {2, 2}),
-               "dims != op_context.params->num_dimensions");
+  EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {2}, {2, 2}), "2 != 4");
 }
 
 TEST(TransposeTest, TestPermOutOfBounds) {
-  EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {0, -1, -2, -3}),
+  EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, -1, -2, -3}),
                "Transpose op permutations array is out of bounds.");
-  EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {0, 1, 2, 4}),
+  EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, 1, 2, 4}),
                "Transpose op permutations array is out of bounds.");
 }
 
-TEST(TransposeTest, Test1DInputTensor) {
-  TransposeOpModel m({3}, {0});
+TEST(TransposeTest, Test1DInputConstTensor) {
+  TransposeOpConstModel m({3}, {1}, {0});
   m.SetInput({1, 2, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
 }
 
-TEST(TransposeTest, Test2DInputTensor) {
-  TransposeOpModel m({3, 2}, {1, 0});
+TEST(TransposeTest, Test1DInputDynamicTensor) {
+  TransposeOpDynamicModel m({3}, {1});
+  m.SetInput({1, 2, 3});
+  m.SetPerm({0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(TransposeTest, Test2DInputConstTensor) {
+  TransposeOpConstModel m({3, 2}, {2}, {1, 0});
+  m.SetInput({0, 1, 2, 3, 4, 5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5}));
+}
+
+TEST(TransposeTest, Test2DInputDynamicTensor) {
+  TransposeOpDynamicModel m({3, 2}, {2});
   m.SetInput({0, 1, 2, 3, 4, 5});
+  m.SetPerm({1, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5}));
 }
 
-TEST(TransposeTest, Test3DInputTensor) {
-  TransposeOpModel m({2, 3, 4}, {2, 0, 1});
+TEST(TransposeTest, Test3DInputConstTensor) {
+  TransposeOpConstModel m({2, 3, 4}, {3}, {2, 0, 1});
+  m.SetInput({0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+              12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 4, 8,  12, 16, 20, 1, 5, 9,  13, 17, 21,
+                                2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
+}
+
+TEST(TransposeTest, Test3DInputDynamicTensor) {
+  TransposeOpDynamicModel m({2, 3, 4}, {3});
   m.SetInput({0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
               12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+  m.SetPerm({2, 0, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3}));
   EXPECT_THAT(m.GetOutput(),
@@ -190,28 +253,64 @@ TEST(TransposeTest, Test3DInputTensor) {
 }
 
 TEST(TransposeTest, Test5DInputTensor) {
-  EXPECT_DEATH(TransposeOpModel({1, 2, 3, 4, 5}, {0, 1, 2, 3, 4}),
+  EXPECT_DEATH(TransposeOpConstModel({1, 2, 3, 4, 5}, {5}, {0, 1, 2, 3, 4}),
                "Transpose op only supports 1D-4D input arrays.");
 }
 
-TEST(TransposeTest, SimpleTestNoReorder) {
-  TransposeOpModel m({1, 2, 3, 1}, {0, 1, 2, 3});
+TEST(TransposeTest, SimpleTestNoReorderConstTensor) {
+  TransposeOpConstModel m({1, 2, 3, 1}, {4}, {0, 1, 2, 3});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(TransposeTest, SimpleTestNoReorderDynamicTensor) {
+  TransposeOpDynamicModel m({1, 2, 3, 1}, {4});
   m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetPerm({0, 1, 2, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3, 1}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
 }
 
-TEST(TransposeTest, SimpleTestWithReorder) {
-  TransposeOpModel m({1, 2, 3, 1}, {2, 1, 3, 0});
+TEST(TransposeTest, SimpleTestWithReorderConstTensor) {
+  TransposeOpConstModel m({1, 2, 3, 1}, {4}, {2, 1, 3, 0});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2, 1, 1}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
 }
 
-TEST(TransposeTest, ComplexTestWithReorder) {
-  TransposeOpModel m({2, 3, 4, 5}, {2, 0, 1, 3});
+TEST(TransposeTest, ComplexTestWithReorderConstTensor) {
+  TransposeOpConstModel m({2, 3, 4, 5}, {4}, {2, 0, 1, 3});
+  m.SetInput({0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,
+              12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,
+              24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
+              36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+              48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+              60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,
+              72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
+              84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+              96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107,
+              108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3, 5}));
+  auto result = ElementsAreArray(
+      {0,  1,  2,  3,  4,  20, 21, 22, 23, 24, 40,  41,  42,  43,  44,
+       60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
+       5,  6,  7,  8,  9,  25, 26, 27, 28, 29, 45,  46,  47,  48,  49,
+       65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
+       10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50,  51,  52,  53,  54,
+       70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
+       15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55,  56,  57,  58,  59,
+       75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119});
+  EXPECT_THAT(m.GetOutput(), result);
+}
+
+TEST(TransposeTest, ComplexTestWithReorderDynamicTensor) {
+  TransposeOpDynamicModel m({2, 3, 4, 5}, {4});
   m.SetInput({0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,
               12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,
               24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
@@ -222,6 +321,7 @@ TEST(TransposeTest, ComplexTestWithReorder) {
               84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
               96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107,
               108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119});
+  m.SetPerm({2, 0, 1, 3});
   m.Invoke();
 
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3, 5}));
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 3f53a1abe7..ec4d6e3487 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -554,14 +554,6 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       break;
     }
     case BuiltinOperator_TRANSPOSE: {
-      auto* params = MallocPOD<TfLiteTransposeParams>();
-      if (auto* schema_params = op->builtin_options_as_TransposeOptions()) {
-        const auto& perm = schema_params->perm();
-        FlatBufferIntVectorToArray(sizeof(params->perm), perm, params->perm,
-                                   error_reporter);
-        params->num_dimensions = perm->Length();
-      }
-      builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_MEAN: {
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 4c82cb9549..50709344ea 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -333,7 +333,6 @@ table GatherOptions {
 }
 
 table TransposeOptions {
-  perm:[int];
 }
 
 table MeanOptions {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index bafd28b626..f1ee925df2 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -3392,19 +3392,13 @@ flatbuffers::Offset<GatherOptions> CreateGatherOptions(
 
 struct TransposeOptionsT : public flatbuffers::NativeTable {
   typedef TransposeOptions TableType;
-  std::vector<int32_t> perm;
   TransposeOptionsT() {}
 };
 
 struct TransposeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef TransposeOptionsT NativeTableType;
-  enum { VT_PERM = 4 };
-  const flatbuffers::Vector<int32_t> *perm() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_PERM);
-  }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_PERM) &&
-           verifier.Verify(perm()) && verifier.EndTable();
+    return VerifyTableStart(verifier) && verifier.EndTable();
   }
   TransposeOptionsT *UnPack(
       const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -3419,9 +3413,6 @@ struct TransposeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
 struct TransposeOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_perm(flatbuffers::Offset<flatbuffers::Vector<int32_t>> perm) {
-    fbb_.AddOffset(TransposeOptions::VT_PERM, perm);
-  }
   explicit TransposeOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
       : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -3435,20 +3426,11 @@ struct TransposeOptionsBuilder {
 };
 
 inline flatbuffers::Offset<TransposeOptions> CreateTransposeOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> perm = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb) {
   TransposeOptionsBuilder builder_(_fbb);
-  builder_.add_perm(perm);
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<TransposeOptions> CreateTransposeOptionsDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int32_t> *perm = nullptr) {
-  return tflite::CreateTransposeOptions(
-      _fbb, perm ? _fbb.CreateVector<int32_t>(*perm) : 0);
-}
-
 flatbuffers::Offset<TransposeOptions> CreateTransposeOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -6107,15 +6089,6 @@ inline void TransposeOptions::UnPackTo(
     const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  {
-    auto _e = perm();
-    if (_e) {
-      _o->perm.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->perm[_i] = _e->Get(_i);
-      }
-    }
-  };
 }
 
 inline flatbuffers::Offset<TransposeOptions> TransposeOptions::Pack(
@@ -6135,8 +6108,7 @@ inline flatbuffers::Offset<TransposeOptions> CreateTransposeOptions(
     const flatbuffers::rehasher_function_t *__rehasher;
   } _va = {&_fbb, _o, _rehasher};
   (void)_va;
-  auto _perm = _o->perm.size() ? _fbb.CreateVector(_o->perm) : 0;
-  return tflite::CreateTransposeOptions(_fbb, _perm);
+  return tflite::CreateTransposeOptions(_fbb);
 }
 
 inline MeanOptionsT *MeanOptions::UnPack(
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index a639351657..bc9b23aeb4 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1397,29 +1397,44 @@ def make_transpose_tests(zip_path):
       "dtype": [tf.int32, tf.int64, tf.float32],
       "input_shape": [[2, 2, 3]],
       "perm": [[0, 1, 2], [0, 2, 1]],
+      "constant_perm": [True, False],
   }, {
       "dtype": [tf.float32],
       "input_shape": [[1, 2, 3, 4]],
       "perm": [[0, 1, 2, 3], [3, 0, 1, 2]],
+      "constant_perm": [True, False],
   }, {
       "dtype": [tf.float32],
       "input_shape": [[1, 2, 3, 4, 5]],
       "perm": [[0, 1, 2, 3, 4]],
+      "constant_perm": [True, False],
   }]
 
   def build_graph(parameters):
+    """Build a transpose graph given `parameters`."""
     input_tensor = tf.placeholder(
         dtype=parameters["dtype"],
         name="input",
         shape=parameters["input_shape"])
-    out = tf.transpose(input_tensor, perm=parameters["perm"])
-    return [input_tensor], [out]
+
+    if parameters["constant_perm"]:
+      perm = parameters["perm"]
+      input_tensors = [input_tensor]
+    else:
+      shape = [len(parameters["perm"]), 2]
+      perm = tf.placeholder(dtype=tf.int32, name="perm", shape=shape)
+      input_tensors = [input_tensor, perm]
+
+    out = tf.transpose(input_tensor, perm=perm)
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(parameters["dtype"],
-                                      parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_perm"]:
+      values.append(np.array(parameters["perm"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 41652a07d2..2bbfe77a12 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -92,7 +92,7 @@ std::map<string, string> kBrokenTests = {
     {R"(^\/resize_bilinearalign_corners=True,.*,size=\[5,6\])", "72401483"},
 
     // Transpose only supports 1D-4D input tensors.
-    {R"(^\/transposedtype=.*,input_shape=\[.,.,.,.,.\],perm=.*)", "71545879"},
+    {R"(^\/transpose.*input_shape=\[.,.,.,.,.\])", "71545879"},
 };
 
 // Allows test data to be unzipped into a temporary directory and makes
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 298f49025f..2d6bccce2b 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -546,14 +546,11 @@ class Transpose
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    return ::tflite::CreateTransposeOptions(*builder,
-                                            builder->CreateVector(op.perm));
+    return ::tflite::CreateTransposeOptions(*builder);
   }
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {
-    op->perm.insert(op->perm.end(), options.perm()->begin(),
-                    options.perm()->end());
   }
 };
 
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 9036a16d1c..78af3a767d 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -370,15 +370,6 @@ TEST_F(OperatorTest, Svdf) {
   EXPECT_EQ(op.rank, output_toco_op->rank);
 }
 
-TEST_F(OperatorTest, Transpose) {
-  TransposeOperator op;
-  op.perm = {0, 1, 2, 3};
-
-  auto output_toco_op = SerializeAndDeserialize(
-      GetOperator("TRANSPOSE", OperatorType::kTranspose), op);
-  EXPECT_EQ(op.perm, output_toco_op->perm);
-}
-
 TEST_F(OperatorTest, Squeeze) {
   SqueezeOperator op;
   op.squeeze_dims = {-2, -3, 4, 1, 4};
-- 
GitLab


From 9de5db8676a037a42da1e99de77abfcf75d10809 Mon Sep 17 00:00:00 2001
From: AG Ramesh <ag.ramesh@intel.com>
Date: Mon, 29 Jan 2018 10:46:05 -0700
Subject: [PATCH 316/966] Reverting the switch to max_pool_v2 in python
 (#16524)

---
 tensorflow/contrib/specs/python/specs_test.py | 14 +++++++-------
 tensorflow/python/ops/nn_ops.py               |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/specs/python/specs_test.py b/tensorflow/contrib/specs/python/specs_test.py
index d5f61d1b69..41782a9fc9 100644
--- a/tensorflow/contrib/specs/python/specs_test.py
+++ b/tensorflow/contrib/specs/python/specs_test.py
@@ -87,7 +87,7 @@ class SpecsTest(test.TestCase):
       self.assertEqual(tuple(result.shape), (1, 8, 8, 5))
       self.assertEqual(
           summaries.tf_spec_structure(spec, inputs),
-          "_ _ _ maxpoolv2 _ _ maxpoolv2 _ _ maxpoolv2")
+          "_ maxpool maxpool maxpool")
 
   def testAbbrevPower(self):
     with self.test_session():
@@ -100,10 +100,10 @@ class SpecsTest(test.TestCase):
       self.assertEqual(tuple(result.shape), (1, 8, 8, 5))
       self.assertEqual(
           summaries.tf_spec_structure(spec, inputs),
-          "_ variablev2 conv variablev2 biasadd relu _ _ maxpoolv2"
+          "_ variablev2 conv variablev2 biasadd relu maxpool"
           " variablev2 conv variablev2"
-          " biasadd relu _ _ maxpoolv2 variablev2 conv variablev2"
-          " biasadd relu _ _ maxpoolv2")
+          " biasadd relu maxpool variablev2 conv variablev2"
+          " biasadd relu maxpool")
 
   def testAbbrevPower2(self):
     with self.test_session():
@@ -117,10 +117,10 @@ class SpecsTest(test.TestCase):
       self.assertEqual(tuple(result.shape), (1, 8, 8, 5))
       self.assertEqual(
           summaries.tf_spec_structure(spec, inputs),
-          "_ variablev2 conv variablev2 biasadd relu _ _ maxpoolv2"
+          "_ variablev2 conv variablev2 biasadd relu maxpool"
           " variablev2 conv variablev2 biasadd relu"
-          " _ _ maxpoolv2 variablev2 conv variablev2 biasadd relu"
-          " _ _ maxpoolv2")
+          " maxpool variablev2 conv variablev2 biasadd relu"
+          " maxpool")
 
   def testConc(self):
     with self.test_session():
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 644bb3af8a..9f0cc4a029 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -2070,7 +2070,7 @@ def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   """
   with ops.name_scope(name, "MaxPool", [value]) as name:
     value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops._max_pool_v2(value,
+    return gen_nn_ops._max_pool(value,
                                   ksize=ksize,
                                   strides=strides,
                                   padding=padding,
-- 
GitLab


From 219e22879ba981aa33fbe8f54a550cce56bc5d90 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Mon, 29 Jan 2018 09:44:35 -0800
Subject: [PATCH 317/966] TFTS: Remove a race condition in lstm_test (switch to
 resource variables)

PiperOrigin-RevId: 183679060
---
 .../contrib/timeseries/examples/lstm.py       | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py
index c7193cef69..c834430b95 100644
--- a/tensorflow/contrib/timeseries/examples/lstm.py
+++ b/tensorflow/contrib/timeseries/examples/lstm.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 from os import path
 
 import numpy
@@ -80,18 +81,19 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
       input_statistics: A math_utils.InputStatistics object.
     """
     super(_LSTMModel, self).initialize_graph(input_statistics=input_statistics)
-    self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units)
-    # Create templates so we don't have to worry about variable reuse.
-    self._lstm_cell_run = tf.make_template(
-        name_="lstm_cell",
-        func_=self._lstm_cell,
-        create_scope_now_=True)
-    # Transforms LSTM output into mean predictions.
-    self._predict_from_lstm_output = tf.make_template(
-        name_="predict_from_lstm_output",
-        func_=
-        lambda inputs: tf.layers.dense(inputs=inputs, units=self.num_features),
-        create_scope_now_=True)
+    with tf.variable_scope("", use_resource=True):
+      # Use ResourceVariables to avoid race conditions.
+      self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units)
+      # Create templates so we don't have to worry about variable reuse.
+      self._lstm_cell_run = tf.make_template(
+          name_="lstm_cell",
+          func_=self._lstm_cell,
+          create_scope_now_=True)
+      # Transforms LSTM output into mean predictions.
+      self._predict_from_lstm_output = tf.make_template(
+          name_="predict_from_lstm_output",
+          func_=functools.partial(tf.layers.dense, units=self.num_features),
+          create_scope_now_=True)
 
   def get_start_state(self):
     """Return initial state for the time series model."""
-- 
GitLab


From 3c7a7313738a1b5a1efa51bab79e377349bb8d7b Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Mon, 29 Jan 2018 09:48:47 -0800
Subject: [PATCH 318/966] [tf.data] Robust `output_types` and `output_shapes`
 if `output_classes` contains SparseTensor.

PiperOrigin-RevId: 183679584
---
 tensorflow/python/data/ops/iterator_ops.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 0cbdb3ab19..53a3244ce1 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -165,8 +165,10 @@ class Iterator(object):
     iterator_resource = gen_dataset_ops.iterator(
         container="",
         shared_name=shared_name,
-        output_types=nest.flatten(output_types),
-        output_shapes=nest.flatten(output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(output_types, output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(output_shapes, output_classes)))
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
@@ -232,8 +234,10 @@ class Iterator(object):
     string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string)
     iterator_resource = gen_dataset_ops.iterator_from_string_handle(
         string_handle,
-        output_types=nest.flatten(output_types),
-        output_shapes=nest.flatten(output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(output_types, output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(output_shapes, output_classes)))
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
-- 
GitLab


From 75adab6104362d71ce28b0269bf31fd30471b1b6 Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Mon, 29 Jan 2018 09:56:59 -0800
Subject: [PATCH 319/966] [UPDATE]   addressing code review comments.

  TODO:
    MACRO GOOGLE_TENSORRT to guard c++ files.
    build/test -> current code works on local repo before the master merge.
---
 .../contrib/tensorrt/convert/convert_graph.cc | 37 ++++++-------
 .../contrib/tensorrt/convert/convert_graph.h  |  9 +++-
 .../contrib/tensorrt/convert/convert_nodes.cc | 32 ++---------
 .../contrib/tensorrt/kernels/trt_engine_op.cc | 38 ++-----------
 .../contrib/tensorrt/segment/segment.cc       |  3 +-
 tensorflow/contrib/tensorrt/segment/segment.h |  1 +
 .../contrib/tensorrt/segment/segment_test.cc  |  4 +-
 .../contrib/tensorrt/shape_fn/trt_shfn.cc     | 54 +++----------------
 8 files changed, 40 insertions(+), 138 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 9a1815d6b9..fa5ed28060 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -55,31 +55,28 @@ namespace tensorrt {
 namespace convert {
 namespace {
 
-static std::unordered_set<std::string> output_nodes;
-bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
+static bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
 // LINT.IfChange
+  // TODO(jie): Segmentation shouldn't associated with op name.
+  //            Split it into a registration for each kernel.
   static const std::set<std::string> candidate_ops = {
       "Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu",
       "Add",      "Mul",   "Sub",    "Rsqrt",   "Pad"  // "Placeholder" ,"Mean"
-                                                       // TODO(ben,jie): ...
   };
 // LINT.ThenChange(
 //    https://www.tensorflow.org/code/tensorflow/contrib/tensorrt/convert/convert_nodes.h)
-
-  if (output_nodes.count(node_def.name())) return false;
   return candidate_ops.count(node_def.op());
 }
 
+
 void GetSubGraphIncomingEdges(const tensorflow::Graph & graph,
                               const std::set<int> &subgraph_node_ids,
                               tensorflow::EdgeSet* incoming_edges) {
   for (int node_id : subgraph_node_ids) {
-    tensorflow::Node const* node = graph.FindNodeId(node_id);
-    LOG(DEBUG) << node->name() << " has incoming edges: ";
-    for (tensorflow::Edge const* edge : node->in_edges()) {
+    const tensorflow::Node* node = graph.FindNodeId(node_id);
+    for (const tensorflow::Edge* edge : node->in_edges()) {
       if (!subgraph_node_ids.count(edge->src()->id()) &&
           !edge->src()->IsSource()) {
-        LOG(DEBUG) << edge->src()->name() << ", ";
         incoming_edges->insert(edge);
       }
     }
@@ -90,9 +87,8 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph &graph,
                               const std::set<int> &subgraph_node_ids,
                               tensorflow::EdgeSet* outgoing_edges) {
   for (int node_id : subgraph_node_ids) {
-    tensorflow::Node const* node = graph.FindNodeId(node_id);
-    LOG(DEBUG) << node->name() << " has outgoing edges: ";
-    for (tensorflow::Edge const* edge : node->out_edges()) {
+    const tensorflow::Node* node = graph.FindNodeId(node_id);
+    for (const tensorflow::Edge* edge : node->out_edges()) {
       if (!subgraph_node_ids.count(edge->dst()->id()) &&
           !edge->dst()->IsSink()) {
         outgoing_edges->insert(edge);
@@ -138,7 +134,7 @@ tensorflow::Status ConvertSubGraphToTensorRT(
 
 
   // Collect inputs by looking for incoming edges
-  for (tensorflow::Edge const* edge : subgraph_incoming_edges) {
+  for (const tensorflow::Edge* edge : subgraph_incoming_edges) {
     subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
   }
   std::set<std::pair<int, int>> subgraph_outputs_set;
@@ -155,7 +151,7 @@ tensorflow::Status ConvertSubGraphToTensorRT(
   // Collect outputs referenced from outgoing edges
   tensorflow::EdgeSet subgraph_outgoing_edges;
   GetSubGraphOutgoingEdges(graph, subgraph_node_ids, &subgraph_outgoing_edges);
-  for (tensorflow::Edge const* edge : subgraph_outgoing_edges) {
+  for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
     subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
   }
   // Impose an ordering on the outputs
@@ -177,7 +173,7 @@ tensorflow::Status ConvertSubGraphToTensorRT(
     subgraph_edge_to_output_map.insert({subgraph_outputs.at(i), i});
   }
   TF_RETURN_IF_ERROR(status);
-  for (tensorflow::Edge const* edge : subgraph_outgoing_edges) {
+  for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
     int new_src_output = subgraph_edge_to_output_map.at(old_src);
     graph.UpdateEdge(trt_node, new_src_output, edge->dst(), edge->dst_input());
@@ -230,12 +226,6 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   gCluster =
     new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
 
-  // single machine
-  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
-  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
-  LOG(DEBUG) << "cpu_cores: " << num_cpu_cores;
-  LOG(DEBUG) << "gpus: " << num_gpus;
-
   tensorflow::Status status = optimizer.Optimize(gCluster, item, &gdef);
 
   if (status !=tensorflow::Status::OK())
@@ -261,8 +251,11 @@ tensorflow::Status ConvertGraphDefToTensorRT(
 
   // Segment the graph into subgraphs that can be converted to TensorRT
   tensorflow::tensorrt::segment::SegmentOptions segment_options;
+
   // TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
-  for (auto node : output_names) output_nodes.insert(node);
+  for (auto node : output_names) {
+    segment_options.exclude_node_list.insert(node);
+  }
 
   // TODO(sami): this should be passed as a knob!!!!
   segment_options.minimum_segment_size = 2;
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index 7695f66c3d..4b1863a89e 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -25,10 +25,15 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
+// max_batch_size: maximum batch size which can be used for inferencefor;
+//                 optimization targets inference run with max batch size.
+// max_workspace_size: The upper bound of memory allowence for engine building.
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
-    const std::vector<std::string>& output_names, size_t max_batch_size,
-    size_t max_workspace_size, tensorflow::GraphDef* new_graph_def);
+    const std::vector<std::string>& output_names,
+    size_t max_batch_size,
+    size_t max_workspace_size,
+    tensorflow::GraphDef* new_graph_def);
 }
 }  // namespace tensorrt
 }  // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index c22d8bcd04..2fd7f659d5 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -51,34 +51,6 @@ namespace convert {
 
 namespace {
 
-inline int get_dtype_size(nvinfer1::DataType trt_dtype) {
-  switch (trt_dtype) {
-    case nvinfer1::DataType::kFLOAT:
-      return 4;
-    case nvinfer1::DataType::kINT8:
-      return 1;
-    case nvinfer1::DataType::kHALF:
-      return 2;
-    default:
-      return -1;
-  }
-}
-
-inline int get_dtype_size(tensorflow::DataType trt_dtype) {
-  switch (trt_dtype) {
-    case tensorflow::DataType::DT_FLOAT:
-      return 4;
-    case tensorflow::DataType::DT_INT8:
-      return 1;
-    case tensorflow::DataType::DT_HALF:
-      return 2;
-    case tensorflow::DataType::DT_INT32:
-      return 4;
-    default:
-      return -1;
-  }
-}
-
 inline tensorflow::Status convert_dtype(tensorflow::DataType tf_dtype,
                                         nvinfer1::DataType* trt_dtype) {
   switch (tf_dtype) {
@@ -166,7 +138,8 @@ class TRT_ShapedWeights {
     return nvinfer1::Weights{trt_type, values_, get_shape_size(shape_)};
   }
   size_t size_bytes() const {
-    return this->count() * get_dtype_size(this->type_);
+    int type_size = tensorflow::DataTypeSize(this->type_);
+    return this->count() * type_size;
   }
   // default converter
   operator nvinfer1::Weights() const { return getWeightsForTRT(); }
@@ -395,6 +368,7 @@ class Converter {
   TRT_ShapedWeights get_temp_weights(tensorflow::DataType type,
                                      nvinfer1::Dims shape) {
     TRT_ShapedWeights weights(type, nullptr, shape);
+    // TODO(jie): check weights size_bytes. 0 means type error
     _temp_bufs.push_back(std::vector<uint8_t>(weights.size_bytes()));
     weights.values_ = _temp_bufs.back().data();
     return weights;
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index bef508d996..34116f6552 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -48,36 +48,6 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   // runtime is safe to delete after engine creation
   infer->destroy();
   std::stringstream oss;
-  // debug iterate through all binding instances
-  for (int i = 0; i < trt_engine_ptr_->getNbBindings(); i++) {
-    LOG(INFO) << "index: " << i
-              << ", binding name: " << trt_engine_ptr_->getBindingName(i);
-
-    if (trt_engine_ptr_->bindingIsInput(i)) {
-      LOG(INFO) << "INPUT";
-    } else {
-      LOG(INFO) << "OUTPUT";
-    }
-    oss << "Dimension: ";
-    auto dims = trt_engine_ptr_->getBindingDimensions(i);
-    oss << " nbDims: " << dims.nbDims << " -> ";
-    for (int j = 0; j < Dims::MAX_DIMS; j++) {
-      oss << dims.d[j] << ", ";
-    }
-    LOG(INFO) << oss.str();
-    oss.str("");
-    switch (trt_engine_ptr_->getBindingDataType(i)) {
-      case nvinfer1::DataType::kFLOAT:
-        LOG(INFO) << "data type float" << std::endl;
-        break;
-      case nvinfer1::DataType::kHALF:
-        LOG(INFO) << "data type half" << std::endl;
-        break;
-      case nvinfer1::DataType::kINT8:
-        LOG(INFO) << "data type int8" << std::endl;
-        break;
-    }
-  }
 
 }
 
@@ -103,7 +73,6 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
     }
     switch (trt_engine_ptr_->getBindingDataType(bindingIndex)) {
       case nvinfer1::DataType::kFLOAT:
-        LOG(INFO) << "float";
         buffers[bindingIndex] = (void*)(input_tensor.flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
@@ -115,6 +84,7 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
     }
   }
 
+  // Might want a different way to inform the user of batch size inconsistency
   if (!valid) LOG(WARNING) << "input data inconsistent batch size";
 
   for (int i = 0; i < static_cast<int>(output_nodes_.size()); i++) {
@@ -125,7 +95,6 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
 
     TensorShape output_shape;
     if (bindingIndex != -1) {
-      LOG(INFO) << "got binding " << bindingIndex;
       auto dims = trt_engine_ptr_->getBindingDimensions(bindingIndex);
       std::vector<int> trt_shape(dims.nbDims + 1);
       trt_shape[0] = nbBatch;
@@ -133,7 +102,7 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
       TensorShapeUtils::MakeShape(trt_shape.data(), trt_shape.size(),
                                   &output_shape);
     } else {
-      LOG(INFO) << "no binding ";
+      LOG(FATAL) << "output node not found, at " << output_nodes_[i];
       break;
     }
 
@@ -141,7 +110,6 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
                    context->allocate_output(i, output_shape, &output_tensor));
     switch (trt_engine_ptr_->getBindingDataType(bindingIndex)) {
       case nvinfer1::DataType::kFLOAT:
-        LOG(INFO) << "float";
         buffers[bindingIndex] =
             reinterpret_cast<void*>(output_tensor->flat<float>().data());
         break;
@@ -160,8 +128,8 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
                                                 ->implementation()
                                                 ->CudaStreamMemberHack()));
 
+  // execution handled by TF since we are getting stream from TF.
   trt_execution_context_ptr_->enqueue(nbBatch, &buffers[0], *stream, nullptr);
-  cudaStreamSynchronize(*stream);
 }
 
 REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 45bd5cdf14..3ae6ca776b 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -140,7 +140,8 @@ tensorflow::Status SegmentGraph(
   std::vector<UnionFind<tensorflow::Node*>> node_segments;
   for (int i = 0; i < graph.num_node_ids(); ++i) {
     tensorflow::Node* node = graph.FindNodeId(i);
-    if (!candidate_fn(node->def())) {
+    if (options.exclude_node_list.count(node->name())!=0
+        || !candidate_fn(node->def())) {
       node = nullptr;
     }
     node_segments.emplace_back(node);
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index d64c325b9e..a374606026 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -32,6 +32,7 @@ using SegmentNodesVector = std::vector<std::set<std::string>>;
 struct SegmentOptions {
   // Segment must contain at least this many nodes.
   int minimum_segment_size = 2;
+  std::set<std::string> exclude_node_list;
 };
 
 // Get the subgraphs of a graph that can be handled by TensorRT.
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index cded3b6b61..c00aa1dbc9 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 //------------------------------------------------------------------------------
-using namespace tensorflow;
+namespace tensorflow {
 
 namespace tensorrt {
 namespace segment {
@@ -361,3 +361,5 @@ TEST_F(SegmentTest, BigIfElse) {
 }  // namespace test
 }  // namespace segment
 }  // namespace tensorrt
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index 48b12fde76..16a4dc2134 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -29,92 +29,50 @@ tensorflow::Status TRTEngineOpShapeInference(InferenceContext* c) {
   nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
       serialized_engine.c_str(), serialized_engine.size(), nullptr);
 
-  // debug print out engine binding;
-  std::stringstream oss;
-  for (int i = 0; i < trt_engine->getNbBindings(); i++) {
-    LOG(INFO) << "index: " << i
-              << ", binding name: " << trt_engine->getBindingName(i);
-
-    bool input_flag = trt_engine->bindingIsInput(i);
-    oss << "input?: " << (input_flag ? "Y" : "N");
-
-    oss << "Dimension: ";
-    auto dims = trt_engine->getBindingDimensions(i);
-    oss << " nbDims: " << dims.nbDims << " -> ";
-    for (int j = 0; j < dims.nbDims; j++) oss << dims.d[j] << ", ";
-    LOG(INFO) << oss.str();
-    oss.str("");
-    switch (trt_engine->getBindingDataType(i)) {
-      case nvinfer1::DataType::kFLOAT:
-        LOG(INFO) << "data type: float" << std::endl;
-        break;
-      case nvinfer1::DataType::kHALF:
-        LOG(INFO) << "data type: half" << std::endl;
-        break;
-      case nvinfer1::DataType::kINT8:
-        LOG(INFO) << "data type: int8" << std::endl;
-        break;
-    }
-  }
 
   int nbBatch = -1;
   // debug print out input arrays
   std::vector<::tensorflow::DataType> input_type;
   c->GetAttr("InT", &input_type);
-  oss.str("");
   for (size_t i = 0; i < c->num_inputs(); i++) {
     // check if input shape is legit
     auto input_shape = c->input(i);
-    int index = i;
-    oss << "input:" << i << " type: " << input_type[index] << " shape: ";
     for (int j = 0; j < c->Rank(input_shape); j++) {
       auto dimHandler = c->Dim(input_shape, j);
-      if (c->ValueKnown(dimHandler))
-        oss << c->Value(dimHandler) << ", ";
-      else
-        oss << "?" << c->Value(dimHandler) << ", ";
       if (j == 0) {
         if (i == 0)
           nbBatch = c->Value(dimHandler);
         else if (nbBatch != c->Value(dimHandler))
-          LOG(WARNING) << "!!!!!!nbBatch does not match!!!!!!";
-        // assert(nbBatch == c->Value(dimHandler);
+          // TODO(jie): TensorRT engine requires consistent batch between inputs
+          //            tensors. Segmenter should be aware of this.
+          LOG(FATAL) << "TensorRT engine requires consistent batch size";
       }
     }
-    LOG(INFO) << oss.str();
   }
 
   // arrange input here
   std::vector<string> input_nodes;
   c->GetAttr("input_nodes", &input_nodes);
-  for (size_t i = 0; i < input_nodes.size(); i++) {
-    int index = i;
-    LOG(INFO) << "input:" << i << " name: " << input_nodes[index];
-  }
 
   // arrange output here
   std::vector<string> output_nodes;
   c->GetAttr("output_nodes", &output_nodes);
-  oss.str("");
   for (size_t i = 0; i < output_nodes.size(); i++) {
-    int index = i;
     int binding_index =
-        trt_engine->getBindingIndex(output_nodes[index].c_str());
-    oss << "string name " << output_nodes[index];
+        trt_engine->getBindingIndex(output_nodes[i].c_str());
     ShapeHandle output_shape;
     std::vector<DimensionHandle> vecDim;
     vecDim.emplace_back(c->MakeDim(nbBatch));
     if (binding_index != -1) {
-      oss << "got binding " << binding_index;
       auto dims = trt_engine->getBindingDimensions(binding_index);
       for (int j = 0; j < dims.nbDims; j++)
         vecDim.emplace_back(c->MakeDim(dims.d[j]));
     } else {
-      oss << "no binding ";
+      LOG(FATAL) << "TensorRT engine cannot find binding: "
+                 << output_nodes[i];
     }
     output_shape = c->MakeShape(vecDim);
     c->set_output(i, output_shape);
-    LOG(INFO) << oss.str();
   }
 
   return Status::OK();
-- 
GitLab


From 3dd09f3707f5c0a73127165461b430c8729d65e0 Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Mon, 29 Jan 2018 09:57:20 -0800
Subject: [PATCH 320/966] Make `init_scope` preserve the active name scope.

This ensures that operations nested under an `init_scope` do not ignore name scopes that were opened in a function-building graph. In light of cl/183320576, which causes `tfe.defun` to automatically hoist variables out of its function-building graph, this change also eliminates logic in make_template that special-cased the first call when `create_graph_function` was True.

PiperOrigin-RevId: 183680794
---
 tensorflow/python/eager/function_test.py      | 18 +++++
 tensorflow/python/framework/ops.py            | 20 +++++-
 tensorflow/python/framework/ops_test.py       | 26 +++++++-
 tensorflow/python/kernel_tests/BUILD          |  1 +
 .../python/kernel_tests/variables_test.py     | 10 +++
 tensorflow/python/ops/template.py             | 66 +++++++------------
 6 files changed, 96 insertions(+), 45 deletions(-)

diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 0babc29f17..2cb2cfb76c 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -504,6 +504,24 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(ret[0][2], 10)
     self.assertAllEqual(ret[1], 15)
 
+  def testVariableNamesRespectNameScopesWithDefun(self):
+    @function.defun
+    def create_variable():
+      with ops.name_scope('foo'):
+        v = resource_variable_ops.ResourceVariable(0.0, name='bar')
+      self.assertEqual(v.name, 'foo/bar:0')
+    create_variable()
+
+  def testVariableNamesRespectNameScopesWithDefunInGraph(self):
+    with context.graph_mode():
+      @function.defun
+      def create_variable():
+        with ops.name_scope('foo'):
+          v = resource_variable_ops.ResourceVariable([1.0, 2.0], name='bar')
+        self.assertEqual(v.name, 'foo/bar:0')
+      with ops.get_default_graph().as_default():
+        create_variable()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index e3a52141a0..d5786cac68 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -5008,9 +5008,22 @@ def init_scope():
   """
   # pylint: enable=g-doc-return-or-yield,line-too-long
 
+  in_graph_mode = context.in_graph_mode()
+  # Retrieve the active name scope: entering an `init_scope` preserves
+  # the name scope of the current context.
+  if in_graph_mode:
+    default_graph = get_default_graph()
+    scope = default_graph.get_name_scope()
+  else:
+    scope = context.context().scope_name
+  if scope and scope[-1] != '/':
+    # Names that end with trailing slashes are treated by `name_scope` as
+    # absolute.
+    scope = scope + '/'
+
   outer_context = None
-  if context.in_graph_mode() and not _default_graph_stack.stack:
-    outer_context = get_default_graph().as_default
+  if in_graph_mode and not _default_graph_stack.stack:
+    outer_context = default_graph.as_default
   else:
     for stack_entry in reversed(context.context_stack.stack):
       if not stack_entry.is_building_function:
@@ -5022,7 +5035,8 @@ def init_scope():
                          "eager context was previously active.")
 
   try:
-    with outer_context(), control_dependencies(None), tape.stop_recording():
+    with outer_context(), name_scope(scope), control_dependencies(
+        None), tape.stop_recording():
       yield
   finally:
     pass
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 78519f108b..c5e177d521 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -2072,10 +2072,34 @@ class InitScopeTest(test_util.TensorFlowTestCase):
       # pylint: disable=protected-access
       self.assertEqual(len(ops._default_graph_stack.stack), 0)
       with ops.init_scope():
-        self.assertEqual(len(ops._default_graph_stack.stack), 1)
+        self.assertGreater(len(ops._default_graph_stack.stack), 0)
       self.assertEqual(len(ops._default_graph_stack.stack), 0)
       # pylint: enable=protected-access
 
+  def testPreservesNameScopeInGraphConstruction(self):
+    with ops.Graph().as_default():
+      function_graph = ops.Graph()
+      with function_graph.as_default():
+        with ops.name_scope("inner"), ops.init_scope():
+          self.assertEqual(ops.get_name_scope(), "inner")
+      self.assertEqual(ops.get_name_scope(), "")
+
+  def testPreservesNameScopeInEagerExecution(self):
+    with context.eager_mode():
+      def foo():
+        with ops.name_scope("inner"), ops.init_scope():
+          if context.in_graph_mode():
+            self.assertEqual(ops.get_name_scope(), "inner")
+          else:
+            # A trailing slash is always appended when eager execution is
+            # enabled.
+            self.assertEqual(context.context().scope_name, "inner/")
+      foo()
+      self.assertEqual(ops.get_name_scope(), "")
+      foo_compiled = eager_function.defun(foo)
+      foo_compiled()
+      self.assertEqual(ops.get_name_scope(), "")
+
 
 @test_util.with_c_api
 class GraphTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index a49d6fb44a..c87b7652ad 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1043,6 +1043,7 @@ tf_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:function",
     ],
 )
 
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index f60ebf58f6..b16c8c002c 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -22,6 +22,7 @@ import operator
 
 import numpy as np
 
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -509,6 +510,15 @@ class VariablesTestCase(test.TestCase):
         "<tf.Variable 'noop:0' shape=(5, 5) dtype=float32_ref>",
         repr(var))
 
+  def testVariableNamesPreserveNameScopesWithDefun(self):
+    @function.defun
+    def create_variable():
+      with ops.name_scope("foo"):
+        v = variables.Variable(0.0, name="bar")
+      self.assertEqual(v.name, "foo/bar:0")
+    with ops.get_default_graph().as_default():
+      create_variable()
+
 
 class IsInitializedTest(test.TestCase):
 
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 84449e00be..806fdd3da7 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -140,7 +140,7 @@ def make_template(name_, func_, create_scope_now_=False, unique_name_=None,
     re-enter the scope and reuse those variables.
 
   Raises:
-    ValueError: if the name is None.
+    ValueError: if `name_` is None.
   """
   return make_template_internal(
       name_,
@@ -176,16 +176,14 @@ def make_template_internal(name_,
     custom_getter_: Optional custom getter for variables used in `func_`. See
       the @{tf.get_variable} `custom_getter` documentation for
       more information.
-    create_graph_function_: When True, the first invocation of the template will
-      execute `func_` as is, to allow for variable creation; however, the second
-      invocation and every invocation thereafter will execute func as a graph
-      function.  In particular, this implies that `func_` must satisfy the
-      properties that `function.defun` requires of functions: See the
-      documentation of `function.defun` for details. When executing eagerly,
-      setting this flag to True can improve performance. Regardless of whether
-      eager execution is enabled, enabling this flag gives the caller access to
-      graph-function semantics, i.e., accesses to variables are totally ordered
-      and side-effecting ops are not pruned.
+    create_graph_function_: When True, `func_` will be executed as a graph
+      function. This implies that `func_` must satisfy the properties that
+      `function.defun` requires of functions: See the documentation of
+      `function.defun` for details. When executing eagerly, setting this flag to
+      True can improve performance. Regardless of whether eager execution is
+      enabled, enabling this flag gives the caller access to graph-function
+      semantics, i.e., accesses to variables are totally ordered and
+      side-effecting ops are not pruned.
     **kwargs: Keyword arguments to apply to `func_`.
 
   Returns:
@@ -198,8 +196,8 @@ def make_template_internal(name_,
     re-enter the scope and reuse those variables.
 
   Raises:
-    ValueError: if the name is None.
-    ValueError: if unique_name_ is not None and eager execution is enabled.
+    ValueError: if `name_` is None.
+    ValueError: if `unique_name_` is not None and eager execution is enabled.
   """
 
   if kwargs:
@@ -266,18 +264,18 @@ class Template(object):
         template of the same scope/unique_name already exists and reuse is
         false, an error is raised. Defaults to None.
       custom_getter: optional custom getter to pass to `variable_scope()`
-      create_graph_function: When True, the first invocation of the template
-        will execute `func` as is, to allow for variable creation; however, the
-        second invocation and every invocation thereafter will execute `func` as
-        a graph function. Enabling this flag gives the caller access to
-        graph-function semantics, i.e., accesses to variables are totally
-        ordered and side-effecting ops are not pruned.
-
+      create_graph_function: When True, `func` will be executed as a graph
+        function. Enabling this flag gives the caller access to graph-function
+        semantics, i.e., accesses to variables are totally ordered and
+        side-effecting ops are not pruned.
 
     Raises:
-      ValueError: if the name is None.
+      ValueError: if `name` is None.
     """
-    self._func = func
+    if create_graph_function:
+      self._func = function.defun(func)
+    else:
+      self._func = func
     self._stacktrace = traceback.format_stack()[:-2]
     self._name = name
     self._unique_name = unique_name
@@ -295,19 +293,13 @@ class Template(object):
     # This variable keeps track of whether the template has been called yet,
     # which is not the same as whether the scope has been created.
     self._variables_created = False
-    self._create_graph_function = create_graph_function
 
   def _call_func(self, args, kwargs):
     try:
       vars_at_start = len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
       trainable_at_start = len(
           ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
-
       result = self._func(*args, **kwargs)
-      if self._create_graph_function and not self._variables_created:
-        # Only execute self._func as a graph function once variables are
-        # created.
-        self._func = function.defun(self._func)
 
       if self._variables_created:
         # Variables were previously created, implying this is not the first
@@ -542,14 +534,11 @@ class EagerTemplate(Template):
         names of all created Tensors. If set to False, the scope will be created
         at the first call location.
       custom_getter: optional custom getter to pass to `variable_scope()`
-      create_graph_function: When True, the first invocation of the template
-        will execute `func` as is, to allow for variable creation; however, the
-        second invocation and every invocation thereafter will execute `func` as
-        a graph function. Enabling this flag allows the caller to reap the
-        performance benefits associated with executing graphs, at the cost of
-        sacrificing debuggability; however, not all functions can be compiled
-        into graph functions. See the documentation for `function.defun` for
-        details.
+      create_graph_function: When True, `func` will be executed as a graph
+        function. Enabling this flag allows the caller to reap the performance
+        benefits associated with executing graphs, at the cost of sacrificing
+        debuggability; however, not all Python functions can be compiled into
+        graph functions. See the documentation for `function.defun` for details.
 
     Raises:
       RuntimeError: if eager execution is not enabled.
@@ -573,12 +562,7 @@ class EagerTemplate(Template):
     try:
       vars_at_start = self._template_store.variables()
       trainable_at_start = self._template_store.trainable_variables()
-
       result = self._func(*args, **kwargs)
-      if self._create_graph_function and not self._variables_created:
-        # Only execute self._func as a graph function once variables are
-        # created.
-        self._func = function.defun(self._func)
 
       if self._variables_created:
         # Variables were previously created, implying this is not the first
-- 
GitLab


From c1e58361a090678534830f2cbfd868aa63fc2c98 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Mon, 29 Jan 2018 10:01:50 -0800
Subject: [PATCH 321/966] Internal change

PiperOrigin-RevId: 183681594
---
 tensorflow/core/grappler/inputs/file_input_yielder.h | 6 +++---
 tensorflow/core/kernels/fuzzing/fuzz_session.h       | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/inputs/file_input_yielder.h b/tensorflow/core/grappler/inputs/file_input_yielder.h
index a17e1c9ff2..b597319261 100644
--- a/tensorflow/core/grappler/inputs/file_input_yielder.h
+++ b/tensorflow/core/grappler/inputs/file_input_yielder.h
@@ -18,8 +18,8 @@ limitations under the License.
 // that may be stored in the checkpoint are not restored in order to speedup the
 // initialization.
 
-#ifndef LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
-#define LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#ifndef TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#define TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
 
 #include <stddef.h>
 #include <limits>
@@ -53,4 +53,4 @@ class FileInputYielder : public InputYielder {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#endif  // TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h
index 0c0e548a90..f1f3f199df 100644
--- a/tensorflow/core/kernels/fuzzing/fuzz_session.h
+++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
-#define LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_
+#define TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/core/graph/graph.h"
@@ -153,4 +153,4 @@ class FuzzStringInputOp : public FuzzSession {
 }  // end namespace fuzzing
 }  // end namespace tensorflow
 
-#endif  // LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_
-- 
GitLab


From 197f573f3bd830bbc4dc277e7c8b8cfaca65e14a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 10:12:42 -0800
Subject: [PATCH 322/966] Reduce memory wasted by GCS cache by shrinking buffer
 capacity, after a cache fill completes.

PiperOrigin-RevId: 183683856
---
 tensorflow/core/platform/cloud/file_block_cache.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/platform/cloud/file_block_cache.cc b/tensorflow/core/platform/cloud/file_block_cache.cc
index 0375af516b..6add1142a1 100644
--- a/tensorflow/core/platform/cloud/file_block_cache.cc
+++ b/tensorflow/core/platform/cloud/file_block_cache.cc
@@ -131,6 +131,7 @@ Status FileBlockCache::MaybeFetch(const Key& key,
         block->mu.lock();  // Reacquire the lock immediately afterwards
         if (status.ok()) {
           block->data.resize(bytes_transferred, 0);
+          block->data.shrink_to_fit();
           downloaded_block = true;
           block->state = FetchState::FINISHED;
         } else {
-- 
GitLab


From 0905a7ed035e66f3abdb123ab53cb0c640e60f0b Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Mon, 29 Jan 2018 10:19:58 -0800
Subject: [PATCH 323/966] Fix a comment

PiperOrigin-RevId: 183685209
---
 tensorflow/contrib/lite/interpreter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 4f732769f9..9dc864ead8 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -108,7 +108,7 @@ class Interpreter {
 
   // Adds a node with the given parameters and returns the index of the new
   // node in `node_index` (optionally). Interpreter will take ownership of
-  // `builtin_data` and destroy it with `delete`. Ownership of 'init_data'
+  // `builtin_data` and destroy it with `free`. Ownership of 'init_data'
   // remains with the caller.
   TfLiteStatus AddNodeWithParameters(const std::vector<int>& inputs,
                                      const std::vector<int>& outputs,
-- 
GitLab


From 1f26c65254268730b7409f517d1ed1b554d01e50 Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Mon, 29 Jan 2018 10:22:10 -0800
Subject: [PATCH 324/966] tfdbg: let session wrappers handle empty fetches
 correctly

Fixes: #15882
PiperOrigin-RevId: 183685645
---
 .../python/debug/wrappers/dumping_wrapper_test.py  |  5 +++++
 tensorflow/python/debug/wrappers/framework.py      |  9 ++++++++-
 .../debug/wrappers/local_cli_wrapper_test.py       | 14 ++++++++++++++
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index acea9433e2..254201c393 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -389,6 +389,11 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
         r"mode\."):
       sess.invoke_node_stepper(node_stepper)
 
+  def testDumpingWrapperWithEmptyFetchWorks(self):
+    sess = dumping_wrapper.DumpingDebugWrapperSession(
+        self.sess, session_root=self.session_root, log_usage=False)
+    sess.run([])
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py
index 909150eb6a..c530204bbf 100644
--- a/tensorflow/python/debug/wrappers/framework.py
+++ b/tensorflow/python/debug/wrappers/framework.py
@@ -121,7 +121,9 @@ from tensorflow.python.debug.lib import debug_utils
 from tensorflow.python.debug.lib import stepper
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import monitored_session
+from tensorflow.python.util import nest
 
 
 # Helper function.
@@ -439,7 +441,12 @@ class BaseDebugWrapperSession(session.SessionInterface):
             "callable_runner and fetches/feed_dict are mutually exclusive, but "
             "are used simultaneously.")
 
-    if self._is_disabled_thread():
+    empty_fetches = not nest.flatten(fetches)
+    if empty_fetches:
+      tf_logging.info(
+          "Due to empty fetches, tfdbg Session wrapper is letting a "
+          "Session.run pass through without any debugging actions.")
+    if self._is_disabled_thread() or empty_fetches:
       if callable_runner:
         return callable_runner(*callable_runner_args)
       else:
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index 770a496aa9..490812c96d 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -664,6 +664,20 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
         [["run"], ["run"]], monitored_sess)
     self.assertFalse(wrapped_monitored_sess.should_stop())
 
+  def testRunsWithEmptyFetchWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run"]], self.sess, dump_root="")
+
+    run_output = wrapped_sess.run([])
+    self.assertEqual([], run_output)
+
+  def testRunsWithEmptyNestedFetchWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run"]], self.sess, dump_root="")
+
+    run_output = wrapped_sess.run({"foo": {"baz": []}, "bar": ()})
+    self.assertEqual({"foo": {"baz": []}, "bar": ()}, run_output)
+
 
 if __name__ == "__main__":
   googletest.main()
-- 
GitLab


From 730071d0dca35a9e08f3bdc49661ae34d109da74 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Mon, 29 Jan 2018 10:31:23 -0800
Subject: [PATCH 325/966] Make TFLite BatchToSpaceND op have parity with TF
 BatchToSpaceND op.

PiperOrigin-RevId: 183687487
---
 tensorflow/contrib/lite/builtin_op_data.h     |  8 --
 .../contrib/lite/kernels/batch_to_space_nd.cc | 69 +++++++++-----
 .../lite/kernels/batch_to_space_nd_test.cc    | 91 +++++++++++++++----
 tensorflow/contrib/lite/model.cc              | 15 ---
 tensorflow/contrib/lite/schema/schema.fbs     |  3 -
 .../contrib/lite/schema/schema_generated.h    | 89 +-----------------
 .../contrib/lite/testing/generate_examples.py | 43 +++++++--
 .../testing/generated_examples_zip_test.cc    |  6 +-
 .../contrib/lite/toco/tflite/operator.cc      | 15 +--
 .../contrib/lite/toco/tflite/operator_test.cc | 13 ---
 10 files changed, 160 insertions(+), 192 deletions(-)

diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 8966e5c2b6..6dd9cb39d2 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -127,14 +127,6 @@ typedef struct {
 } TfLiteSpaceToBatchNDParams;
 
 typedef struct {
-  // Number of spatial dimensions.
-  // For now only NHWC is supported, and the value should always be 2.
-  int num_spatial_dimensions;
-  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
-  // For now we will fix the maximum possible number of dimensions.
-  int block_shape[2];
-  int before_crops[2];
-  int after_crops[2];
 } TfLiteBatchToSpaceNDParams;
 
 typedef struct {
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
index 0eed680fdc..d84a77039b 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
@@ -35,12 +35,14 @@ enum KernelType {
 
 struct BatchToSpaceNDContext {
   BatchToSpaceNDContext(TfLiteContext* context, TfLiteNode* node) {
-    params = reinterpret_cast<TfLiteBatchToSpaceNDParams*>(node->builtin_data);
     input = GetInput(context, node, 0);
+    block_shape = GetInput(context, node, 1);
+    crops = GetInput(context, node, 2);
     output = GetOutput(context, node, 0);
   }
-  TfLiteBatchToSpaceNDParams* params;
   TfLiteTensor* input;
+  TfLiteTensor* block_shape;
+  TfLiteTensor* crops;
   TfLiteTensor* output;
 };
 
@@ -48,24 +50,22 @@ struct BatchToSpaceNDContext {
 // The 4D array need to have exactly 2 spatial dimensions.
 // TODO(ycling): Support arbitrary dimension in BatchToSpaceND.
 const int kInputDimensionNum = 4;
-const int kOutputDimensionNum = 4;
+const int kBlockSizeDimensionNum = 1;
 const int kSpatialDimensionNum = 2;
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  // The 2nd tensor (block_shape) and the 3rd tensor (crops) are ignored now.
-  TF_LITE_ENSURE(context, NumInputs(node) >= 1 && NumInputs(node) <= 3);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                BatchToSpaceNDContext* op_context) {
+  TfLiteIntArray* input_size = op_context->input->dims;
+  const int* block_shape = GetTensorData<int32>(op_context->block_shape);
 
-  BatchToSpaceNDContext op_context(context, node);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input),
-                    kInputDimensionNum);
-  TF_LITE_ENSURE_EQ(context, op_context.params->num_spatial_dimensions,
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->block_shape),
+                    kBlockSizeDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context->block_shape->dims->data[0],
+                    kSpatialDimensionNum);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->crops),
                     kSpatialDimensionNum);
-  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
-
-  const TfLiteIntArray* input_size = op_context.input->dims;
-  const int* block_shape = op_context.params->block_shape;
 
+  // TODO(ycling): Add crops as part of calculation.
   // Number of batch must be multiple of (block_shape[0] * block_shape[1]).
   TF_LITE_ENSURE_EQ(context,
                     input_size->data[0] % (block_shape[0] * block_shape[1]), 0);
@@ -76,27 +76,48 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const int output_width = input_size->data[2] * block_shape[1];
   const int output_channel_size = input_size->data[3];
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(kOutputDimensionNum);
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
   output_size->data[0] = output_batch_size;
   output_size->data[1] = output_height;
   output_size->data[2] = output_width;
   output_size->data[3] = output_channel_size;
 
-  return context->ResizeTensor(context, op_context.output, output_size);
+  return context->ResizeTensor(context, op_context->output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  BatchToSpaceNDContext op_context(context, node);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input),
+                    kInputDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+
+  if (!IsConstantTensor(op_context.block_shape) ||
+      !IsConstantTensor(op_context.crops)) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   BatchToSpaceNDContext op_context(context, node);
 
-  int block_shape_dims_array[1] = {kSpatialDimensionNum};
-  Dims<4> block_shape_dims = GetTensorDims(block_shape_dims_array, 1);
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
+  }
 
-#define TF_LITE_BATCH_TO_SPACE_ND(type, scalar)                          \
-  type::BatchToSpaceND(GetTensorData<scalar>(op_context.input),          \
-                       GetTensorDims(op_context.input),                  \
-                       op_context.params->block_shape, block_shape_dims, \
-                       GetTensorData<scalar>(op_context.output),         \
+#define TF_LITE_BATCH_TO_SPACE_ND(type, scalar)                        \
+  type::BatchToSpaceND(GetTensorData<scalar>(op_context.input),        \
+                       GetTensorDims(op_context.input),                \
+                       GetTensorData<int32_t>(op_context.block_shape), \
+                       GetTensorDims(op_context.block_shape),          \
+                       GetTensorData<scalar>(op_context.output),       \
                        GetTensorDims(op_context.output))
   switch (op_context.input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
index 3ec4efbebc..c9152bf967 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
@@ -26,37 +26,88 @@ using ::testing::ElementsAreArray;
 
 class BatchToSpaceNDOpModel : public SingleOpModel {
  public:
-  BatchToSpaceNDOpModel(std::initializer_list<int> input_shape,
-                        std::initializer_list<int> block_shape,
-                        std::initializer_list<int> before_crops,
-                        std::initializer_list<int> after_crops) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
-                 BuiltinOptions_BatchToSpaceNDOptions,
-                 CreateBatchToSpaceNDOptions(
-                     builder_, builder_.CreateVector<int>(block_shape),
-                     builder_.CreateVector<int>(before_crops),
-                     builder_.CreateVector<int>(after_crops))
-                     .Union());
-    BuildInterpreter({input_shape});
-  }
-
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetBlockShape(std::initializer_list<int> data) {
+    PopulateTensor<int>(block_shape_, data);
+  }
+
+  void SetCrops(std::initializer_list<int> data) {
+    PopulateTensor<int>(crops_, data);
+  }
+
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
- private:
+ protected:
   int input_;
+  int block_shape_;
+  int crops_;
   int output_;
 };
 
-TEST(BatchToSpaceNDOpTest, SimpleTest) {
-  BatchToSpaceNDOpModel m({4, 2, 2, 1}, {2, 2}, {0, 0}, {0, 0});
+// Tests case where block_shape and crops are const tensors.
+//
+// Example usage is as follows:
+//    BatchToSpaceNDOpConstModel m(input_shape, block_shape, crops);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel {
+ public:
+  BatchToSpaceNDOpConstModel(std::initializer_list<int> input_shape,
+                             std::initializer_list<int> block_shape,
+                             std::initializer_list<int> crops) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
+    crops_ = AddConstInput(TensorType_INT32, crops, {2, 2});
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
+                 BuiltinOptions_BatchToSpaceNDOptions,
+                 CreateBatchToSpaceNDOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Tests case where block_shape and crops are non-const tensors.
+//
+// Example usage is as follows:
+//    BatchToSpaceNDOpDynamicModel m(input_shape);
+//    m.SetInput(input_data);
+//    m.SetBlockShape(block_shape);
+//    m.SetPaddings(crops);
+//    m.Invoke();
+class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel {
+ public:
+  BatchToSpaceNDOpDynamicModel(std::initializer_list<int> input_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddInput(TensorType_INT32);
+    crops_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
+                 BuiltinOptions_BatchToSpaceNDOptions,
+                 CreateBatchToSpaceNDOptions(builder_).Union());
+    BuildInterpreter({input_shape, {2}, {2, 2}});
+  }
+};
+
+TEST(BatchToSpaceNDOpTest, SimpleConstTest) {
+  BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
+                                               4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) {
+  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetCrops({0, 0, 0, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
@@ -64,7 +115,7 @@ TEST(BatchToSpaceNDOpTest, SimpleTest) {
 }
 
 TEST(BatchToSpaceNDOpTest, InvalidShapeTest) {
-  EXPECT_DEATH(BatchToSpaceNDOpModel({3, 2, 2, 1}, {2, 2}, {0, 0}, {0, 0}),
+  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}),
                "Cannot allocate tensors");
 }
 
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index ec4d6e3487..64b8f55097 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -536,21 +536,6 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       break;
     }
     case BuiltinOperator_BATCH_TO_SPACE_ND: {
-      auto* params = MallocPOD<TfLiteBatchToSpaceNDParams>();
-      if (auto* schema_params =
-              op->builtin_options_as_BatchToSpaceNDOptions()) {
-        const auto& block_shape = schema_params->block_shape();
-        FlatBufferIntVectorToArray(sizeof(params->block_shape), block_shape,
-                                   params->block_shape, error_reporter);
-        const auto& before_crops = schema_params->before_crops();
-        FlatBufferIntVectorToArray(sizeof(params->before_crops), before_crops,
-                                   params->before_crops, error_reporter);
-        const auto& after_crops = schema_params->after_crops();
-        FlatBufferIntVectorToArray(sizeof(params->after_crops), after_crops,
-                                   params->after_crops, error_reporter);
-        params->num_spatial_dimensions = block_shape->Length();
-      }
-      builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_TRANSPOSE: {
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 50709344ea..f6217567f8 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -295,9 +295,6 @@ table SpaceToBatchNDOptions {
 }
 
 table BatchToSpaceNDOptions {
-  block_shape:[int];
-  before_crops:[int];
-  after_crops:[int];
 }
 
 table SkipGramOptions {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index f1ee925df2..ad758529a3 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -2929,33 +2929,14 @@ flatbuffers::Offset<SpaceToBatchNDOptions> CreateSpaceToBatchNDOptions(
 
 struct BatchToSpaceNDOptionsT : public flatbuffers::NativeTable {
   typedef BatchToSpaceNDOptions TableType;
-  std::vector<int32_t> block_shape;
-  std::vector<int32_t> before_crops;
-  std::vector<int32_t> after_crops;
   BatchToSpaceNDOptionsT() {}
 };
 
 struct BatchToSpaceNDOptions FLATBUFFERS_FINAL_CLASS
     : private flatbuffers::Table {
   typedef BatchToSpaceNDOptionsT NativeTableType;
-  enum { VT_BLOCK_SHAPE = 4, VT_BEFORE_CROPS = 6, VT_AFTER_CROPS = 8 };
-  const flatbuffers::Vector<int32_t> *block_shape() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BLOCK_SHAPE);
-  }
-  const flatbuffers::Vector<int32_t> *before_crops() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BEFORE_CROPS);
-  }
-  const flatbuffers::Vector<int32_t> *after_crops() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_AFTER_CROPS);
-  }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_BLOCK_SHAPE) &&
-           verifier.Verify(block_shape()) &&
-           VerifyOffset(verifier, VT_BEFORE_CROPS) &&
-           verifier.Verify(before_crops()) &&
-           VerifyOffset(verifier, VT_AFTER_CROPS) &&
-           verifier.Verify(after_crops()) && verifier.EndTable();
+    return VerifyTableStart(verifier) && verifier.EndTable();
   }
   BatchToSpaceNDOptionsT *UnPack(
       const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2970,18 +2951,6 @@ struct BatchToSpaceNDOptions FLATBUFFERS_FINAL_CLASS
 struct BatchToSpaceNDOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_block_shape(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_shape) {
-    fbb_.AddOffset(BatchToSpaceNDOptions::VT_BLOCK_SHAPE, block_shape);
-  }
-  void add_before_crops(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_crops) {
-    fbb_.AddOffset(BatchToSpaceNDOptions::VT_BEFORE_CROPS, before_crops);
-  }
-  void add_after_crops(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_crops) {
-    fbb_.AddOffset(BatchToSpaceNDOptions::VT_AFTER_CROPS, after_crops);
-  }
   explicit BatchToSpaceNDOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
       : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2995,29 +2964,11 @@ struct BatchToSpaceNDOptionsBuilder {
 };
 
 inline flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_shape = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_crops = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_crops = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb) {
   BatchToSpaceNDOptionsBuilder builder_(_fbb);
-  builder_.add_after_crops(after_crops);
-  builder_.add_before_crops(before_crops);
-  builder_.add_block_shape(block_shape);
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<BatchToSpaceNDOptions>
-CreateBatchToSpaceNDOptionsDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int32_t> *block_shape = nullptr,
-    const std::vector<int32_t> *before_crops = nullptr,
-    const std::vector<int32_t> *after_crops = nullptr) {
-  return tflite::CreateBatchToSpaceNDOptions(
-      _fbb, block_shape ? _fbb.CreateVector<int32_t>(*block_shape) : 0,
-      before_crops ? _fbb.CreateVector<int32_t>(*before_crops) : 0,
-      after_crops ? _fbb.CreateVector<int32_t>(*after_crops) : 0);
-}
-
 flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -5774,33 +5725,6 @@ inline void BatchToSpaceNDOptions::UnPackTo(
     const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  {
-    auto _e = block_shape();
-    if (_e) {
-      _o->block_shape.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->block_shape[_i] = _e->Get(_i);
-      }
-    }
-  };
-  {
-    auto _e = before_crops();
-    if (_e) {
-      _o->before_crops.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->before_crops[_i] = _e->Get(_i);
-      }
-    }
-  };
-  {
-    auto _e = after_crops();
-    if (_e) {
-      _o->after_crops.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->after_crops[_i] = _e->Get(_i);
-      }
-    }
-  };
 }
 
 inline flatbuffers::Offset<BatchToSpaceNDOptions> BatchToSpaceNDOptions::Pack(
@@ -5820,14 +5744,7 @@ inline flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
     const flatbuffers::rehasher_function_t *__rehasher;
   } _va = {&_fbb, _o, _rehasher};
   (void)_va;
-  auto _block_shape =
-      _o->block_shape.size() ? _fbb.CreateVector(_o->block_shape) : 0;
-  auto _before_crops =
-      _o->before_crops.size() ? _fbb.CreateVector(_o->before_crops) : 0;
-  auto _after_crops =
-      _o->after_crops.size() ? _fbb.CreateVector(_o->after_crops) : 0;
-  return tflite::CreateBatchToSpaceNDOptions(_fbb, _block_shape, _before_crops,
-                                             _after_crops);
+  return tflite::CreateBatchToSpaceNDOptions(_fbb);
 }
 
 inline SkipGramOptionsT *SkipGramOptions::UnPack(
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index bc9b23aeb4..4ae6ccb765 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -94,7 +94,8 @@ KNOWN_BUGS = {
     r"softmax.*input_shape=\[1,3,4,3\]": "67749831",
     # SpaceToDepth only supports float32.
     r"space_to_depth.*(float16|int32|uint8|int64)": "68018134",
-    # BatchToSpaceND doesn't support cropping.
+    # BatchToSpaceND doesn't support cropping. This catches test cases with
+    # const tensors as crops.
     r"batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\]": "70594634",
     # BatchToSpaceND only supports 4D tensors.
     r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
@@ -1361,6 +1362,8 @@ def make_batch_to_space_nd_tests(zip_path):
           "input_shape": [[12, 2, 2, 1]],
           "block_shape": [[1, 4], [2, 2], [3, 4]],
           "crops": [[[0, 0], [0, 0]], [[1, 1], [1, 1]]],
+          "constant_block_shape": [True, False],
+          "constant_crops": [True, False],
       },
       # Non-4D use case: 1 bath dimension, 3 spatial dimensions, 2 others.
       {
@@ -1368,23 +1371,47 @@ def make_batch_to_space_nd_tests(zip_path):
           "input_shape": [[8, 2, 2, 2, 1, 1]],
           "block_shape": [[2, 2, 2]],
           "crops": [[[0, 0], [0, 0], [0, 0]]],
+          "constant_block_shape": [True, False],
+          "constant_crops": [True, False],
       },
   ]
 
   def build_graph(parameters):
+    """Build a batch_to_space graph given `parameters`."""
     input_tensor = tf.placeholder(
         dtype=parameters["dtype"],
         name="input",
         shape=parameters["input_shape"])
-    out = tf.batch_to_space_nd(input_tensor, parameters["block_shape"],
-                               parameters["crops"])
-    return [input_tensor], [out]
+    input_tensors = [input_tensor]
+
+    # Get block_shape either as a const or as a placeholder (tensor).
+    if parameters["constant_block_shape"]:
+      block_shape = parameters["block_shape"]
+    else:
+      shape = [len(parameters["block_shape"])]
+      block_shape = tf.placeholder(dtype=tf.int32, name="shape", shape=shape)
+      input_tensors.append(block_shape)
+
+    # Get crops either as a const or as a placeholder (tensor).
+    if parameters["constant_crops"]:
+      crops = parameters["crops"]
+    else:
+      shape = [len(parameters["crops"]), 2]
+      crops = tf.placeholder(dtype=tf.int32, name="crops", shape=shape)
+      input_tensors.append(crops)
+
+    out = tf.batch_to_space_nd(input_tensor, block_shape, crops)
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(parameters["dtype"],
-                                      parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_block_shape"]:
+      values.append(np.array(parameters["block_shape"]))
+    if not parameters["constant_crops"]:
+      values.append(np.array(parameters["crops"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 2bbfe77a12..e6b782472a 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -67,7 +67,11 @@ std::map<string, string> kBrokenTests = {
     // L2Norm only supports tensors with 4D or fewer.
     {R"(^\/l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"},
 
-    // SpaceToBatch only supports 4D tensors.
+    // BatchToSpaceND doesn't support cropping. This catches test cases with
+    // non-const tensors as crops.
+    {R"(^\/batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\])", "70594634"},
+
+    // SpaceToBatchND only supports 4D tensors.
     {R"(^\/space_to_batch_nd.*input_shape=\[1,4,4,4,1,1\])", "70848787"},
 
     // L2Norm only works for dim=-1.
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 2d6bccce2b..853a9f46c2 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -211,24 +211,11 @@ class BatchToSpaceND
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    auto block_shape = builder->CreateVector(op.block_shape);
-    auto before_crops = builder->CreateVector(op.before_crops);
-    auto after_crops = builder->CreateVector(op.after_crops);
-    return ::tflite::CreateBatchToSpaceNDOptions(*builder, block_shape,
-                                                 before_crops, after_crops);
+    return ::tflite::CreateBatchToSpaceNDOptions(*builder);
   }
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {
-    op->block_shape.insert(op->block_shape.end(),
-                           options.block_shape()->begin(),
-                           options.block_shape()->end());
-    op->before_crops.insert(op->before_crops.end(),
-                            options.before_crops()->begin(),
-                            options.before_crops()->end());
-    op->after_crops.insert(op->after_crops.end(),
-                           options.after_crops()->begin(),
-                           options.after_crops()->end());
   }
 };
 
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 78af3a767d..4df9071095 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -132,19 +132,6 @@ TEST_F(OperatorTest, BuiltinSpaceToBatchND) {
   EXPECT_EQ(op.after_paddings, output_toco_op->after_paddings);
 }
 
-TEST_F(OperatorTest, BuiltinBatchToSpaceND) {
-  BatchToSpaceNDOperator op;
-  op.block_shape = {2, 2};
-  op.before_crops = {1, 2};
-  op.after_crops = {3, 4};
-
-  auto output_toco_op = SerializeAndDeserialize(
-      GetOperator("BATCH_TO_SPACE_ND", OperatorType::kBatchToSpaceND), op);
-  EXPECT_EQ(op.block_shape, output_toco_op->block_shape);
-  EXPECT_EQ(op.before_crops, output_toco_op->before_crops);
-  EXPECT_EQ(op.after_crops, output_toco_op->after_crops);
-}
-
 TEST_F(OperatorTest, BuiltinMean) {
   MeanOperator op;
   op.axis = {1, 2};
-- 
GitLab


From fd63d4e30a01cf860baf60b990b223cd54bc895c Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Mon, 29 Jan 2018 10:42:32 -0800
Subject: [PATCH 326/966] Add C0326 bad-whitespace error to pylint sanity
 check.

PiperOrigin-RevId: 183689499
---
 .../python/ops/accumulate_n_v2_test.py        |  20 +-
 .../learn/python/learn/datasets/base.py       |  19 +-
 tensorflow/contrib/lite/tools/visualize.py    | 104 +++----
 .../examples/cifar10/cifar10_input.py         |  52 ++--
 .../python/kernel_tests/core_rnn_cell_test.py | 258 ++++++++++--------
 .../examples/learn/text_classification.py     |  19 +-
 tensorflow/python/client/notebook.py          |  11 +-
 tensorflow/python/estimator/training.py       |  56 ++--
 .../python/kernel_tests/tensordot_op_test.py  |  29 +-
 tensorflow/tools/ci_build/ci_sanity.sh        |   3 +-
 tensorflow/tools/compatibility/tf_upgrade.py  |  93 ++++---
 11 files changed, 354 insertions(+), 310 deletions(-)

diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
index b5e9f8df79..6f65fe771e 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
-
 class AccumulateNV2Test(test_util.TensorFlowTestCase):
   """Tests of the new, differentiable version of accumulate_n"""
 
@@ -62,8 +61,9 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
         accum_n = av2.accumulate_n_v2(input_vars)
         sess.run(variables.global_variables_initializer())
         accum_n_grad = gradients.gradients(accum_n, input_vars)
-        self.assertAllEqual(np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1
-                            [g.eval() for g in accum_n_grad])
+        self.assertAllEqual(
+            np.repeat(1.0, num_inputs),  # d/dx (x + y + ...) = 1
+            [g.eval() for g in accum_n_grad])
 
   # The tests below used to be in a separate class under cwise_ops_test.py,
   # which did not run in the default test target.
@@ -75,8 +75,8 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
           np.random.rand(16, 16, 16, 16).astype(np.float32) for _ in range(20)
       ]
       random_tensors = [
-          ops.convert_to_tensor(
-              x, dtype=dtypes_lib.float32) for x in random_arrays
+          ops.convert_to_tensor(x, dtype=dtypes_lib.float32)
+          for x in random_arrays
       ]
       tf_val = av2.accumulate_n_v2(random_tensors)
       np_val = random_arrays[0]
@@ -95,21 +95,21 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         a = variables.Variable(0.2)
         b = variables.Variable(0.1)
-        tf_val = av2.accumulate_n_v2([a,b], shape=[2,2]) # Should be shape=[]
+        tf_val = av2.accumulate_n_v2([a, b], shape=[2, 2])  # Should be shape=[]
 
   def testIncompatibleShapes(self):
     with self.test_session():
       with self.assertRaises(ValueError):
-        a = variables.Variable(np.array([0.1,0.2]))
-        b = variables.Variable(np.array([[0.3],[0.4]]))
-        tf_val = av2.accumulate_n_v2([a,b])
+        a = variables.Variable(np.array([0.1, 0.2]))
+        b = variables.Variable(np.array([[0.3], [0.4]]))
+        tf_val = av2.accumulate_n_v2([a, b])
 
   def testWrongType(self):
     with self.test_session():
       with self.assertRaises(TypeError):
         a = variables.Variable(0.2, dtype=np.float32)
         b = variables.Variable(0.1, dtype=np.float32)
-        tf_val = av2.accumulate_n_v2([a,b], tensor_dtype=np.int32)
+        tf_val = av2.accumulate_n_v2([a, b], tensor_dtype=np.int32)
 
   def testWrongTypeOneInput(self):
     # Scenario that used to trigger a bug, even when testWrongType() worked
diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py
index 71978d4394..18bf16e246 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/base.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/base.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Base utilities for loading datasets."""
 
 from __future__ import absolute_import
@@ -100,9 +99,7 @@ def load_iris(data_path=None):
     module_path = path.dirname(__file__)
     data_path = path.join(module_path, 'data', 'iris.csv')
   return load_csv_with_header(
-      data_path,
-      target_dtype=np.int,
-      features_dtype=np.float)
+      data_path, target_dtype=np.int, features_dtype=np.float)
 
 
 def load_boston(data_path=None):
@@ -118,16 +115,10 @@ def load_boston(data_path=None):
     module_path = path.dirname(__file__)
     data_path = path.join(module_path, 'data', 'boston_house_prices.csv')
   return load_csv_with_header(
-      data_path,
-      target_dtype=np.float,
-      features_dtype=np.float)
+      data_path, target_dtype=np.float, features_dtype=np.float)
 
 
-def retry(initial_delay,
-          max_delay,
-          factor=2.0,
-          jitter=0.25,
-          is_retriable=None):
+def retry(initial_delay, max_delay, factor=2.0, jitter=0.25, is_retriable=None):
   """Simple decorator for wrapping retriable functions.
 
   Args:
@@ -152,7 +143,7 @@ def retry(initial_delay,
   def delays():
     delay = initial_delay
     while delay <= max_delay:
-      yield delay * random.uniform(1 - jitter,  1 + jitter)
+      yield delay * random.uniform(1 - jitter, 1 + jitter)
       delay *= factor
 
   def wrap(fn):
@@ -172,7 +163,9 @@ def retry(initial_delay,
           else:
             raise
       return fn(*args, **kwargs)
+
     return wrapped_fn
+
   return wrap
 
 
diff --git a/tensorflow/contrib/lite/tools/visualize.py b/tensorflow/contrib/lite/tools/visualize.py
index d0d78e3afa..f571dd59da 100644
--- a/tensorflow/contrib/lite/tools/visualize.py
+++ b/tensorflow/contrib/lite/tools/visualize.py
@@ -198,10 +198,13 @@ class TensorMapper(object):
 
 def GenerateGraph(subgraph_idx, g, opcode_mapper):
   """Produces the HTML required to have a d3 visualization of the dag."""
+
   def TensorName(idx):
-    return "t%d"%idx
+    return "t%d" % idx
+
   def OpName(idx):
-    return "o%d"%idx
+    return "o%d" % idx
+
   edges = []
   nodes = []
   first = {}
@@ -210,27 +213,35 @@ def GenerateGraph(subgraph_idx, g, opcode_mapper):
     for tensor_input_position, tensor_index in enumerate(op["inputs"]):
       if tensor_index not in first:
         first[tensor_index] = (
-            op_index*pixel_mult,
-            tensor_input_position*pixel_mult - pixel_mult/2)
-      edges.append(
-          {"source": TensorName(tensor_index), "target": OpName(op_index)})
+            op_index * pixel_mult,
+            tensor_input_position * pixel_mult - pixel_mult / 2)
+      edges.append({
+          "source": TensorName(tensor_index),
+          "target": OpName(op_index)
+      })
     for tensor_index in op["outputs"]:
-      edges.append(
-          {"target": TensorName(tensor_index), "source": OpName(op_index)})
-    nodes.append({"id": OpName(op_index),
-                  "name": opcode_mapper(op["opcode_index"]),
-                  "group": 2,
-                  "x": pixel_mult,
-                  "y": op_index * pixel_mult})
+      edges.append({
+          "target": TensorName(tensor_index),
+          "source": OpName(op_index)
+      })
+    nodes.append({
+        "id": OpName(op_index),
+        "name": opcode_mapper(op["opcode_index"]),
+        "group": 2,
+        "x": pixel_mult,
+        "y": op_index * pixel_mult
+    })
   for tensor_index, tensor in enumerate(g["tensors"]):
-    initial_y = (first[tensor_index] if tensor_index in first
-                 else len(g["operators"]))
-
-    nodes.append({"id": TensorName(tensor_index),
-                  "name": "%s (%d)" % (tensor["name"], tensor_index),
-                  "group": 1,
-                  "x": 2,
-                  "y": initial_y})
+    initial_y = (
+        first[tensor_index] if tensor_index in first else len(g["operators"]))
+
+    nodes.append({
+        "id": TensorName(tensor_index),
+        "name": "%s (%d)" % (tensor["name"], tensor_index),
+        "group": 1,
+        "x": 2,
+        "y": initial_y
+    })
   graph_str = json.dumps({"nodes": nodes, "edges": edges})
 
   html = _D3_HTML_TEMPLATE % (graph_str, subgraph_idx)
@@ -267,7 +278,7 @@ def GenerateTableHtml(items, keys_to_print, display_index=True):
     for h, mapper in keys_to_print:
       val = tensor[h] if h in tensor else None
       val = val if mapper is None else mapper(val)
-      html += "<td>%s</td>\n"%val
+      html += "<td>%s</td>\n" % val
 
     html += "</tr>\n"
   html += "</table>\n"
@@ -279,18 +290,19 @@ def CreateHtmlFile(tflite_input, html_output):
 
   # Convert the model into a JSON flatbuffer using flatc (build if doesn't
   # exist.
-  if  not os.path.exists(tflite_input):
+  if not os.path.exists(tflite_input):
     raise RuntimeError("Invalid filename %r" % tflite_input)
   if tflite_input.endswith(".tflite") or tflite_input.endswith(".bin"):
 
     # Run convert
-    cmd = (_BINARY + " -t "
-           "--strict-json --defaults-json -o /tmp {schema} -- {input}".format(
-               input=tflite_input, schema=_SCHEMA))
+    cmd = (
+        _BINARY + " -t "
+        "--strict-json --defaults-json -o /tmp {schema} -- {input}".format(
+            input=tflite_input, schema=_SCHEMA))
     print(cmd)
     os.system(cmd)
-    real_output = ("/tmp/"+ os.path.splitext(os.path.split(tflite_input)[-1])[0]
-                   + ".json")
+    real_output = ("/tmp/" + os.path.splitext(
+        os.path.split(tflite_input)[-1])[0] + ".json")
 
     data = json.load(open(real_output))
   elif tflite_input.endswith(".json"):
@@ -302,12 +314,13 @@ def CreateHtmlFile(tflite_input, html_output):
   html += "<h1>TensorFlow Lite Model</h2>"
 
   data["filename"] = tflite_input  # Avoid special case
-  toplevel_stuff = [("filename", None), ("version", None),
-                    ("description", None)]
+  toplevel_stuff = [("filename", None), ("version", None), ("description",
+                                                            None)]
 
   html += "<table>\n"
   for key, mapping in toplevel_stuff:
-    if not mapping: mapping = lambda x: x
+    if not mapping:
+      mapping = lambda x: x
     html += "<tr><th>%s</th><td>%s</td></tr>\n" % (key, mapping(data[key]))
   html += "</table>\n"
 
@@ -320,22 +333,22 @@ def CreateHtmlFile(tflite_input, html_output):
     html += "<div class='subgraph'>"
     tensor_mapper = TensorMapper(g)
     opcode_mapper = OpCodeMapper(data)
-    op_keys_to_display = [
-        ("inputs", tensor_mapper), ("outputs", tensor_mapper),
-        ("builtin_options", None), ("opcode_index", opcode_mapper)]
-    tensor_keys_to_display = [
-        ("name", None), ("type", None), ("shape", None), ("buffer", None),
-        ("quantization", None)]
+    op_keys_to_display = [("inputs", tensor_mapper), ("outputs", tensor_mapper),
+                          ("builtin_options", None), ("opcode_index",
+                                                      opcode_mapper)]
+    tensor_keys_to_display = [("name", None), ("type", None), ("shape", None),
+                              ("buffer", None), ("quantization", None)]
 
     html += "<h2>Subgraph %d</h2>\n" % subgraph_idx
 
     # Inputs and outputs.
     html += "<h3>Inputs/Outputs</h3>\n"
-    html += GenerateTableHtml([{"inputs": g["inputs"],
-                                "outputs": g["outputs"]}],
-                              [("inputs", tensor_mapper),
-                               ("outputs", tensor_mapper)],
-                              display_index=False)
+    html += GenerateTableHtml(
+        [{
+            "inputs": g["inputs"],
+            "outputs": g["outputs"]
+        }], [("inputs", tensor_mapper), ("outputs", tensor_mapper)],
+        display_index=False)
 
     # Print the tensors.
     html += "<h3>Tensors</h3>\n"
@@ -357,8 +370,7 @@ def CreateHtmlFile(tflite_input, html_output):
 
   # Operator codes
   html += "<h2>Operator Codes</h2>\n"
-  html += GenerateTableHtml(data["operator_codes"],
-                            operator_keys_to_display)
+  html += GenerateTableHtml(data["operator_codes"], operator_keys_to_display)
 
   html += "</body></html>\n"
 
@@ -370,10 +382,10 @@ def main(argv):
     tflite_input = argv[1]
     html_output = argv[2]
   except IndexError:
-    print ("Usage: %s <input tflite> <output html>" % (argv[0]))
+    print("Usage: %s <input tflite> <output html>" % (argv[0]))
   else:
     CreateHtmlFile(tflite_input, html_output)
 
+
 if __name__ == "__main__":
   main(sys.argv)
-
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py
index d07fece4bc..6a3b535eb4 100644
--- a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py
@@ -58,6 +58,7 @@ def read_cifar10(filename_queue):
 
   class CIFAR10Record(object):
     pass
+
   result = CIFAR10Record()
 
   # Dimensions of the images in the CIFAR-10 dataset.
@@ -147,8 +148,9 @@ def distorted_inputs(data_dir, batch_size):
     images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
     labels: Labels. 1D tensor of [batch_size] size.
   """
-  filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
-               for i in xrange(1, 6)]
+  filenames = [
+      os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in xrange(1, 6)
+  ]
   for f in filenames:
     if not tf.gfile.Exists(f):
       raise ValueError('Failed to find file: ' + f)
@@ -174,10 +176,9 @@ def distorted_inputs(data_dir, batch_size):
 
   # Because these operations are not commutative, consider randomizing
   # the order their operation.
-  distorted_image = tf.image.random_brightness(distorted_image,
-                                               max_delta=63)
-  distorted_image = tf.image.random_contrast(distorted_image,
-                                             lower=0.2, upper=1.8)
+  distorted_image = tf.image.random_brightness(distorted_image, max_delta=63)
+  distorted_image = tf.image.random_contrast(
+      distorted_image, lower=0.2, upper=1.8)
 
   # Subtract off the mean and divide by the variance of the pixels.
   float_image = tf.image.per_image_standardization(distorted_image)
@@ -188,15 +189,18 @@ def distorted_inputs(data_dir, batch_size):
 
   # Ensure that the random shuffling has good mixing properties.
   min_fraction_of_examples_in_queue = 0.4
-  min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
-                           min_fraction_of_examples_in_queue)
-  print ('Filling queue with %d CIFAR images before starting to train. '
-         'This will take a few minutes.' % min_queue_examples)
+  min_queue_examples = int(
+      NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * min_fraction_of_examples_in_queue)
+  print('Filling queue with %d CIFAR images before starting to train. '
+        'This will take a few minutes.' % min_queue_examples)
 
   # Generate a batch of images and labels by building up a queue of examples.
-  return _generate_image_and_label_batch(float_image, read_input.label,
-                                         min_queue_examples, batch_size,
-                                         shuffle=True)
+  return _generate_image_and_label_batch(
+      float_image,
+      read_input.label,
+      min_queue_examples,
+      batch_size,
+      shuffle=True)
 
 
 def inputs(eval_data, data_dir, batch_size):
@@ -212,8 +216,9 @@ def inputs(eval_data, data_dir, batch_size):
     labels: Labels. 1D tensor of [batch_size] size.
   """
   if not eval_data:
-    filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
-                 for i in xrange(1, 6)]
+    filenames = [
+        os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in xrange(1, 6)
+    ]
     num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
   else:
     filenames = [os.path.join(data_dir, 'test_batch.bin')]
@@ -235,8 +240,8 @@ def inputs(eval_data, data_dir, batch_size):
 
   # Image processing for evaluation.
   # Crop the central [height, width] of the image.
-  resized_image = tf.image.resize_image_with_crop_or_pad(reshaped_image,
-                                                         width, height)
+  resized_image = tf.image.resize_image_with_crop_or_pad(
+      reshaped_image, width, height)
 
   # Subtract off the mean and divide by the variance of the pixels.
   float_image = tf.image.per_image_standardization(resized_image)
@@ -247,10 +252,13 @@ def inputs(eval_data, data_dir, batch_size):
 
   # Ensure that the random shuffling has good mixing properties.
   min_fraction_of_examples_in_queue = 0.4
-  min_queue_examples = int(num_examples_per_epoch *
-                           min_fraction_of_examples_in_queue)
+  min_queue_examples = int(
+      num_examples_per_epoch * min_fraction_of_examples_in_queue)
 
   # Generate a batch of images and labels by building up a queue of examples.
-  return _generate_image_and_label_batch(float_image, read_input.label,
-                                         min_queue_examples, batch_size,
-                                         shuffle=False)
+  return _generate_image_and_label_batch(
+      float_image,
+      read_input.label,
+      min_queue_examples,
+      batch_size,
+      shuffle=False)
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index cafeb56ad8..e1838c1739 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -42,7 +42,6 @@ from tensorflow.python.platform import test
 from tensorflow.python.framework import test_util
 from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
 
-
 # pylint: enable=protected-access
 Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
 
@@ -84,19 +83,22 @@ class RNNCellTest(test.TestCase):
         ], [v.name for v in cell.trainable_variables])
         self.assertFalse(cell.non_trainable_variables)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[0].shape, (1, 2))
 
   def testBasicRNNCellNotTrainable(self):
     with self.test_session() as sess:
+
       def not_trainable_getter(getter, *args, **kwargs):
         kwargs["trainable"] = False
         return getter(*args, **kwargs)
 
       with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5),
+          "root",
+          initializer=init_ops.constant_initializer(0.5),
           custom_getter=not_trainable_getter):
         x = array_ops.zeros([1, 2])
         m = array_ops.zeros([1, 2])
@@ -108,9 +110,10 @@ class RNNCellTest(test.TestCase):
             "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
         ], [v.name for v in cell.non_trainable_variables])
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[0].shape, (1, 2))
 
   def testGRUCell(self):
@@ -121,9 +124,10 @@ class RNNCellTest(test.TestCase):
         m = array_ops.zeros([1, 2])
         g, _ = rnn_cell_impl.GRUCell(2)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         # Smoke test
         self.assertAllClose(res[0], [[0.175991, 0.175991]])
       with variable_scope.variable_scope(
@@ -133,10 +137,10 @@ class RNNCellTest(test.TestCase):
         m = array_ops.zeros([1, 2])
         g, _ = rnn_cell_impl.GRUCell(2)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g],
-            {x.name: np.array([[1., 1., 1.]]),
-             m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         # Smoke test
         self.assertAllClose(res[0], [[0.156736, 0.156736]])
 
@@ -148,11 +152,12 @@ class RNNCellTest(test.TestCase):
         m = array_ops.zeros([1, 2])
         g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         # Smoke test
-        self.assertAllClose(res[0], [[0.509682,  0.509682]])
+        self.assertAllClose(res[0], [[0.509682, 0.509682]])
 
   def testBasicLSTMCell(self):
     for dtype in [dtypes.float16, dtypes.float32]:
@@ -164,8 +169,7 @@ class RNNCellTest(test.TestCase):
           m = array_ops.zeros([1, 8], dtype=dtype)
           cell = rnn_cell_impl.MultiRNNCell(
               [
-                  rnn_cell_impl.BasicLSTMCell(
-                      2, state_is_tuple=False)
+                  rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
                   for _ in range(2)
               ],
               state_is_tuple=False)
@@ -183,22 +187,21 @@ class RNNCellTest(test.TestCase):
               "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
               rnn_cell_impl._BIAS_VARIABLE_NAME
           ]
-          self.assertEqual(
-              expected_variable_names,
-              [v.name for v in cell.trainable_variables])
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in cell.trainable_variables])
           self.assertFalse(cell.non_trainable_variables)
           sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_m],
-              {x.name: np.array([[1., 1.]]),
-               m.name: 0.1 * np.ones([1, 8])})
+          res = sess.run([g, out_m], {
+              x.name: np.array([[1., 1.]]),
+              m.name: 0.1 * np.ones([1, 8])
+          })
           self.assertEqual(len(res), 2)
           variables = variables_lib.global_variables()
           self.assertEqual(expected_variable_names, [v.name for v in variables])
           # The numbers in results were not calculated, this is just a
           # smoke test.
-          self.assertAllClose(
-              res[0], np.array([[0.240, 0.240]], dtype=np_dtype), 1e-2)
+          self.assertAllClose(res[0], np.array(
+              [[0.240, 0.240]], dtype=np_dtype), 1e-2)
           expected_mem = np.array(
               [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
               dtype=np_dtype)
@@ -208,13 +211,13 @@ class RNNCellTest(test.TestCase):
           # Test BasicLSTMCell with input_size != num_units.
           x = array_ops.zeros([1, 3], dtype=dtype)
           m = array_ops.zeros([1, 4], dtype=dtype)
-          g, out_m = rnn_cell_impl.BasicLSTMCell(
-              2, state_is_tuple=False)(x, m)
+          g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
           sess.run([variables_lib.global_variables_initializer()])
           res = sess.run(
-              [g, out_m],
-              {x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
-               m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)})
+              [g, out_m], {
+                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
+                  m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)
+              })
           self.assertEqual(len(res), 2)
 
   def testBasicLSTMCellDimension0Error(self):
@@ -232,9 +235,11 @@ class RNNCellTest(test.TestCase):
           g, out_m = rnn_cell_impl.BasicLSTMCell(
               num_units, state_is_tuple=False)(x, m)
           sess.run([variables_lib.global_variables_initializer()])
-          sess.run([g, out_m],
-                   {x.name: 1 * np.ones([batch_size, input_size]),
-                    m.name: 0.1 * np.ones([batch_size - 1, state_size])})
+          sess.run(
+              [g, out_m], {
+                  x.name: 1 * np.ones([batch_size, input_size]),
+                  m.name: 0.1 * np.ones([batch_size - 1, state_size])
+              })
 
   def testBasicLSTMCellStateSizeError(self):
     """Tests that state_size must be num_units * 2."""
@@ -251,9 +256,11 @@ class RNNCellTest(test.TestCase):
           g, out_m = rnn_cell_impl.BasicLSTMCell(
               num_units, state_is_tuple=False)(x, m)
           sess.run([variables_lib.global_variables_initializer()])
-          sess.run([g, out_m],
-                   {x.name: 1 * np.ones([batch_size, input_size]),
-                    m.name: 0.1 * np.ones([batch_size, state_size])})
+          sess.run(
+              [g, out_m], {
+                  x.name: 1 * np.ones([batch_size, input_size]),
+                  m.name: 0.1 * np.ones([batch_size, state_size])
+              })
 
   def testBasicLSTMCellStateTupleType(self):
     with self.test_session():
@@ -301,11 +308,12 @@ class RNNCellTest(test.TestCase):
             state_is_tuple=True)
         g, (out_m0, out_m1) = cell(x, (m0, m1))
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, out_m0, out_m1], {
-            x.name: np.array([[1., 1.]]),
-            m0.name: 0.1 * np.ones([1, 4]),
-            m1.name: 0.1 * np.ones([1, 4])
-        })
+        res = sess.run(
+            [g, out_m0, out_m1], {
+                x.name: np.array([[1., 1.]]),
+                m0.name: 0.1 * np.ones([1, 4]),
+                m1.name: 0.1 * np.ones([1, 4])
+            })
         self.assertEqual(len(res), 3)
         # The numbers in results were not calculated, this is just a smoke test.
         # Note, however, these values should match the original
@@ -336,10 +344,11 @@ class RNNCellTest(test.TestCase):
             state_is_tuple=False)
         output, state = cell(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([output, state], {
-            x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
-            m.name: 0.1 * np.ones((batch_size, state_size))
-        })
+        res = sess.run(
+            [output, state], {
+                x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
+                m.name: 0.1 * np.ones((batch_size, state_size))
+            })
         self.assertEqual(len(res), 2)
         # The numbers in results were not calculated, this is mostly just a
         # smoke test.
@@ -442,10 +451,10 @@ class RNNCellTest(test.TestCase):
             rnn_cell_impl.GRUCell(3), num_proj=3)
         g, new_m = cell(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, new_m],
-            {x.name: np.array([[1., 1.]]),
-             m.name: np.array([[0.1, 0.1, 0.1]])})
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1, 0.1]])
+        })
         self.assertEqual(res[1].shape, (1, 3))
         # The numbers in results were not calculated, this is just a smoke test.
         self.assertAllClose(res[0], [[0.154605, 0.154605, 0.154605]])
@@ -479,9 +488,11 @@ class RNNCellTest(test.TestCase):
         base_cell = rnn_cell_impl.GRUCell(3)
         g, m_new = base_cell(x, m)
         variable_scope.get_variable_scope().reuse_variables()
+
         def residual_with_slice_fn(inp, out):
           inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
           return inp_sliced + out
+
         g_res, m_new_res = rnn_cell_impl.ResidualWrapper(
             base_cell, residual_with_slice_fn)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
@@ -551,10 +562,10 @@ class RNNCellTest(test.TestCase):
         self.assertEqual(embedding_cell.output_size, 2)
         g, new_m = embedding_cell(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, new_m],
-            {x.name: np.array([[1]]),
-             m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[1].shape, (1, 2))
         # The numbers in results were not calculated, this is just a smoke test.
         self.assertAllClose(res[0], [[0.17139, 0.17139]])
@@ -584,8 +595,8 @@ class RNNCellTest(test.TestCase):
         x = array_ops.zeros([1, 2])
         m = array_ops.zeros([1, 4])
         _, ml = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2)
-             for _ in range(2)], state_is_tuple=False)(x, m)
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=False)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run(ml, {
             x.name: np.array([[1., 1.]]),
@@ -605,19 +616,20 @@ class RNNCellTest(test.TestCase):
         # Test incorrectness of state
         with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
           rnn_cell_impl.MultiRNNCell(
-              [rnn_cell_impl.GRUCell(2)
-               for _ in range(2)], state_is_tuple=True)(x, m_bad)
+              [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+              state_is_tuple=True)(x, m_bad)
 
         _, ml = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2)
-             for _ in range(2)], state_is_tuple=True)(x, m_good)
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=True)(x, m_good)
 
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(ml, {
-            x.name: np.array([[1., 1.]]),
-            m_good[0].name: np.array([[0.1, 0.1]]),
-            m_good[1].name: np.array([[0.1, 0.1]])
-        })
+        res = sess.run(
+            ml, {
+                x.name: np.array([[1., 1.]]),
+                m_good[0].name: np.array([[0.1, 0.1]]),
+                m_good[1].name: np.array([[0.1, 0.1]])
+            })
 
         # The numbers in results were not calculated, this is just a
         # smoke test.  However, these numbers should match those of
@@ -628,8 +640,11 @@ class RNNCellTest(test.TestCase):
 
 class DropoutWrapperTest(test.TestCase):
 
-  def _testDropoutWrapper(self, batch_size=None, time_steps=None,
-                          parallel_iterations=None, **kwargs):
+  def _testDropoutWrapper(self,
+                          batch_size=None,
+                          time_steps=None,
+                          parallel_iterations=None,
+                          **kwargs):
     with self.test_session() as sess:
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -640,14 +655,14 @@ class DropoutWrapperTest(test.TestCase):
           x = constant_op.constant(
               [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32)
           m = rnn_cell_impl.LSTMStateTuple(
-              *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32)
-               ] * 2)
+              *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32
+                                    )] * 2)
         else:
           x = constant_op.constant(
               np.random.randn(time_steps, batch_size, 3).astype(np.float32))
           m = rnn_cell_impl.LSTMStateTuple(*[
-              constant_op.constant(
-                  [[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)
+              constant_op.
+              constant([[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)
           ] * 2)
         outputs, final_state = rnn.dynamic_rnn(
             cell=rnn_cell_impl.DropoutWrapper(
@@ -674,8 +689,8 @@ class DropoutWrapperTest(test.TestCase):
     res = self._testDropoutWrapper(
         input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     self.assertAllClose(true_full_output, res[0])
@@ -687,8 +702,8 @@ class DropoutWrapperTest(test.TestCase):
     res = self._testDropoutWrapper(
         input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     self.assertAllClose(true_full_output, res[0])
@@ -703,16 +718,20 @@ class DropoutWrapperTest(test.TestCase):
     ## consistent across both calls.  Otherwise the seed may not end
     ## up being munged consistently across both graphs.
     res_standard_1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, seed=10,
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
         parallel_iterations=1)
     # Clear away the graph and the test session (which keeps variables around)
     ops.reset_default_graph()
     self._ClearCachedSession()
     random_seed.set_random_seed(2)
     res_standard_2 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, seed=10,
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
         parallel_iterations=1)
     self.assertAllClose(res_standard_1[0], res_standard_2[0])
     self.assertAllClose(res_standard_1[1].c, res_standard_2[1].c)
@@ -722,11 +741,12 @@ class DropoutWrapperTest(test.TestCase):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
     keep_none = variable_scope.get_variable("none", initializer=1e-10)
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_all, output_keep_prob=keep_none,
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_none,
         state_keep_prob=keep_all)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     self.assertAllClose(np.zeros(res[0].shape), res[0])
@@ -739,13 +759,13 @@ class DropoutWrapperTest(test.TestCase):
     # Even though we dropout state, by default DropoutWrapper never
     # drops out the memory ("c") term of an LSTMStateTuple.
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_all, output_keep_prob=keep_all,
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_all,
         state_keep_prob=keep_none)
-    true_c_state = np.array(
-        [[1.713925, 1.713925, 1.713925]], dtype=np.float32)
+    true_c_state = np.array([[1.713925, 1.713925, 1.713925]], dtype=np.float32)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     self.assertAllClose(true_full_output[0], res[0][0])
     # Second output is modified by zero input state
     self.assertGreater(np.linalg.norm(true_full_output[1] - res[0][1]), 1e-4)
@@ -758,13 +778,14 @@ class DropoutWrapperTest(test.TestCase):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
     keep_none = variable_scope.get_variable("none", initializer=1e-10)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     # All outputs are different because inputs are zeroed out
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_none, output_keep_prob=keep_all,
+        input_keep_prob=keep_none,
+        output_keep_prob=keep_all,
         state_keep_prob=keep_all)
     self.assertGreater(np.linalg.norm(res[0] - true_full_output), 1e-4)
     self.assertGreater(np.linalg.norm(res[1].h - true_full_output[1]), 1e-4)
@@ -774,9 +795,13 @@ class DropoutWrapperTest(test.TestCase):
     keep_some = 0.8
     keep_all = variable_scope.get_variable("all", initializer=1.0)
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_all, output_keep_prob=keep_some,
-        state_keep_prob=keep_all, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7)
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_all,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
     # Ensure the same dropout pattern for all time steps
     output_mask = np.abs(res[0]) > 1e-6
     for m in output_mask[1:]:
@@ -785,9 +810,13 @@ class DropoutWrapperTest(test.TestCase):
   def testDropoutWrapperRecurrentStateInputAndOutput(self):
     keep_some = 0.9
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7)
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
 
     # Smoke test for the state/input masks.
     output_mask = np.abs(res[0]) > 1e-6
@@ -811,17 +840,27 @@ class DropoutWrapperTest(test.TestCase):
     random_seed.set_random_seed(2347)
     np.random.seed(23487)
     res0 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7, seed=-234987)
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987)
     ops.reset_default_graph()
     self._ClearCachedSession()
     random_seed.set_random_seed(2347)
     np.random.seed(23487)
     res1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7, seed=-234987)
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987)
 
     output_mask = np.abs(res0[0]) > 1e-6
     for time_step in output_mask:
@@ -858,9 +897,10 @@ class SlimRNNCellTest(test.TestCase):
         g, _ = rnn_cell_impl._SlimRNNCell(my_cell)(x, m)
         # pylint: enable=protected-access
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[0].shape, (1, 2))
 
   def testBasicRNNCellMatch(self):
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
index eb117c39a1..e4e61862b0 100644
--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@@ -34,8 +34,7 @@ MAX_LABEL = 15
 WORDS_FEATURE = 'words'  # Name of the input words feature.
 
 
-def estimator_spec_for_softmax_classification(
-    logits, labels, mode):
+def estimator_spec_for_softmax_classification(logits, labels, mode):
   """Returns EstimatorSpec instance for softmax classification."""
   predicted_classes = tf.argmax(logits, 1)
   if mode == tf.estimator.ModeKeys.PREDICT:
@@ -53,8 +52,8 @@ def estimator_spec_for_softmax_classification(
     return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
 
   eval_metric_ops = {
-      'accuracy': tf.metrics.accuracy(
-          labels=labels, predictions=predicted_classes)
+      'accuracy':
+          tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
   }
   return tf.estimator.EstimatorSpec(
       mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
@@ -67,8 +66,7 @@ def bag_of_words_model(features, labels, mode):
   bow_embedding_column = tf.feature_column.embedding_column(
       bow_column, dimension=EMBEDDING_SIZE)
   bow = tf.feature_column.input_layer(
-      features,
-      feature_columns=[bow_embedding_column])
+      features, feature_columns=[bow_embedding_column])
   logits = tf.layers.dense(bow, MAX_LABEL, activation=None)
 
   return estimator_spec_for_softmax_classification(
@@ -110,9 +108,9 @@ def main(unused_argv):
   # Prepare training and testing data
   dbpedia = tf.contrib.learn.datasets.load_dataset(
       'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
-  x_train = pandas.Series(dbpedia.train.data[:,1])
+  x_train = pandas.Series(dbpedia.train.data[:, 1])
   y_train = pandas.Series(dbpedia.train.target)
-  x_test = pandas.Series(dbpedia.test.data[:,1])
+  x_test = pandas.Series(dbpedia.test.data[:, 1])
   y_test = pandas.Series(dbpedia.test.target)
 
   # Process vocabulary
@@ -152,10 +150,7 @@ def main(unused_argv):
 
   # Predict.
   test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={WORDS_FEATURE: x_test},
-      y=y_test,
-      num_epochs=1,
-      shuffle=False)
+      x={WORDS_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
   predictions = classifier.predict(input_fn=test_input_fn)
   y_predicted = np.array(list(p['class'] for p in predictions))
   y_predicted = y_predicted.reshape(np.array(y_test).shape)
diff --git a/tensorflow/python/client/notebook.py b/tensorflow/python/client/notebook.py
index 8babe35b32..4b6a0f71ae 100644
--- a/tensorflow/python/client/notebook.py
+++ b/tensorflow/python/client/notebook.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Notebook front-end to TensorFlow.
 
 When you run this binary, you'll see something like below, which indicates
@@ -43,10 +42,8 @@ from tensorflow.python.platform import app
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp"
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION"] = "2"
 
-
 FLAGS = None
 
-
 ORIG_ARGV = sys.argv
 # Main notebook process calls itself with argv[1]="kernel" to start kernel
 # subprocesses.
@@ -73,8 +70,8 @@ def main(unused_argv):
       notebookapp.ip = "0.0.0.0"
       notebookapp.password = passwd(FLAGS.password)
     else:
-      print ("\nNo password specified; Notebook server will only be available"
-             " on the local machine.\n")
+      print("\nNo password specified; Notebook server will only be available"
+            " on the local machine.\n")
     notebookapp.initialize(argv=["--notebook-dir", FLAGS.notebook_dir])
 
     if notebookapp.ip == "0.0.0.0":
@@ -125,8 +122,8 @@ if __name__ == "__main__":
   # kernel app.
   if IS_KERNEL:
     # Drop everything except --flagfile.
-    sys.argv = ([sys.argv[0]] +
-                [x for x in sys.argv[1:] if x.startswith("--flagfile")])
+    sys.argv = (
+        [sys.argv[0]] + [x for x in sys.argv[1:] if x.startswith("--flagfile")])
 
   FLAGS, unparsed = parser.parse_known_args()
   app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index 52fb1d39ae..2e84c5014f 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Classes and functions related to train_and_evaluate."""
 
 from __future__ import absolute_import
@@ -37,7 +36,6 @@ from tensorflow.python.training import server_lib
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import compat
 
-
 _MAX_DELAY_SECS = 60
 _DELAY_SECS_PER_WORKER = 5
 _TF_CONFIG_ENV = 'TF_CONFIG'
@@ -50,8 +48,7 @@ _TRAINER_JOBS = (run_config_lib.TaskType.CHIEF, run_config_lib.TaskType.MASTER,
 def _validate_input_fn(input_fn):
   """Validates the `input_fn`."""
   if not callable(input_fn):
-    raise TypeError(
-        '`input_fn` must be callable, given: {}'.format(input_fn))
+    raise TypeError('`input_fn` must be callable, given: {}'.format(input_fn))
 
 
 def _validate_hooks(hooks):
@@ -125,10 +122,7 @@ class TrainSpec(
   duration. Optional hooks run at various stages of training.
   """
 
-  def __new__(cls,
-              input_fn,
-              max_steps=None,
-              hooks=None):
+  def __new__(cls, input_fn, max_steps=None, hooks=None):
     """Creates a validated `TrainSpec` instance.
 
     Args:
@@ -161,16 +155,13 @@ class TrainSpec(
     hooks = _validate_hooks(hooks)
 
     return super(TrainSpec, cls).__new__(
-        cls,
-        input_fn=input_fn,
-        max_steps=max_steps,
-        hooks=hooks)
+        cls, input_fn=input_fn, max_steps=max_steps, hooks=hooks)
 
 
 class EvalSpec(
     collections.namedtuple('EvalSpec', [
-        'input_fn', 'steps', 'name', 'hooks', 'exporters',
-        'start_delay_secs', 'throttle_secs'
+        'input_fn', 'steps', 'name', 'hooks', 'exporters', 'start_delay_secs',
+        'throttle_secs'
     ])):
   """Configuration for the "eval" part for the `train_and_evaluate` call.
 
@@ -417,8 +408,8 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   Raises:
     ValueError: if environment variable `TF_CONFIG` is incorrectly set.
   """
-  executor = _TrainingExecutor(estimator=estimator, train_spec=train_spec,
-                               eval_spec=eval_spec)
+  executor = _TrainingExecutor(
+      estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
 
   config = estimator.config
   if (config.task_type == run_config_lib.TaskType.EVALUATOR and
@@ -561,9 +552,8 @@ class _TrainingExecutor(object):
           self._timer.update_last_triggered_step(global_step_value)
           self._evaluator.evaluate_and_export()
         else:
-          logging.info(
-              'Skip the current checkpoint eval due to throttle secs '
-              '({} secs).'.format(self._eval_throttle_secs))
+          logging.info('Skip the current checkpoint eval due to throttle secs '
+                       '({} secs).'.format(self._eval_throttle_secs))
 
     # Final export signal: For any eval result with global_step >= train
     # max_steps, the evaluator will send the final export signal. There is a
@@ -576,8 +566,8 @@ class _TrainingExecutor(object):
     #
     # But here, throttle_secs will skip the next intermediate checkpoint and,
     # so, the double final export chance is very small.
-    evaluator = _TrainingExecutor._Evaluator(
-        self._estimator, self._eval_spec, self._train_spec.max_steps)
+    evaluator = _TrainingExecutor._Evaluator(self._estimator, self._eval_spec,
+                                             self._train_spec.max_steps)
 
     # When the underlying `Estimator` object saves a new checkpoint, we would
     # like this callback to be called so that evaluation and export can trigger.
@@ -617,8 +607,7 @@ class _TrainingExecutor(object):
       raise ValueError('eval_spec.throttle_secs should be positive, given: {}.'
                        'It is used do determine how long each training '
                        'iteration should go when train and evaluate '
-                       'locally.'.format(
-                           self._eval_spec.throttle_secs))
+                       'locally.'.format(self._eval_spec.throttle_secs))
 
     stop_hook = _StopAtSecsHook(self._eval_spec.throttle_secs)
     train_hooks = (
@@ -663,8 +652,9 @@ class _TrainingExecutor(object):
 
     if not config.master:
       jobs = config.cluster_spec.jobs
-      if (len(jobs) == 1 and len(config.cluster_spec.job_tasks(jobs[0])) == 1
-          and config.task_type in _TRAINER_JOBS):
+      if (len(jobs) == 1 and
+          len(config.cluster_spec.job_tasks(jobs[0])) == 1 and
+          config.task_type in _TRAINER_JOBS):
         # For distributed training, config.master is empty if and only if it has
         # a single node in the cluster spec. In this case, we should not start
         # the server.
@@ -679,9 +669,9 @@ class _TrainingExecutor(object):
     logging.info('Start Tensorflow server.')
 
     if config.session_config is None:
-      session_config=config_pb2.ConfigProto(log_device_placement=False)
+      session_config = config_pb2.ConfigProto(log_device_placement=False)
     else:
-      session_config=config_pb2.ConfigProto(
+      session_config = config_pb2.ConfigProto(
           log_device_placement=False,
           gpu_options=config.session_config.gpu_options)
 
@@ -744,8 +734,7 @@ class _TrainingExecutor(object):
             global_step >= self._train_spec.max_steps):
           logging.info(
               'Exiting evaluation, global_step=%s >= train max_steps=%s',
-              global_step,
-              self._train_spec.max_steps)
+              global_step, self._train_spec.max_steps)
           return
 
       latest_eval_result, should_early_stop = self._execute_evaluator_once(
@@ -781,10 +770,9 @@ class _TrainingExecutor(object):
 
     # Throttle if necessary.
     elapsed_time = time.time() - start
-    difference = throttle_secs  - elapsed_time
+    difference = throttle_secs - elapsed_time
     if difference > 0:
-      logging.info('Waiting %f secs before starting next eval run.',
-                   difference)
+      logging.info('Waiting %f secs before starting next eval run.', difference)
       time.sleep(difference)
 
     return (eval_result, should_early_stop)
@@ -929,8 +917,8 @@ class _EvalResult(
       if checkpoint_path:
         raise ValueError(
             'checkpoint must be `None` if status is not {}; got status {}, '
-            'checkpoint_path {}'.format(
-                _EvalStatus.EVALUATED, status, checkpoint_path))
+            'checkpoint_path {}'.format(_EvalStatus.EVALUATED, status,
+                                        checkpoint_path))
       return super(_EvalResult, cls).__new__(cls, status, metrics,
                                              checkpoint_path)
 
diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py
index f375157287..f1670a47f5 100644
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@@ -56,9 +56,11 @@ class TensordotTest(test_lib.TestCase):
         axes_ph = array_ops.placeholder(dtypes.int32)
         output = math_ops.tensordot(a_ph, b_ph, axes_ph)
         _ = sess.run(
-            [output], feed_dict={a_ph: a,
-                                 b_ph: b,
-                                 axes_ph: (a_axes, b_axes)})
+            [output], feed_dict={
+                a_ph: a,
+                b_ph: b,
+                axes_ph: (a_axes, b_axes)
+            })
 
   def test_invalid_axes(self):
     a = [[1, 2], [3, 4]]
@@ -81,26 +83,27 @@ class TensordotTest(test_lib.TestCase):
       with self.test_session() as sess:
         with self.assertRaises(errors_impl.InvalidArgumentError):
           _ = sess.run(
-              [output], feed_dict={a_ph: a,
-                                   b_ph: b,
-                                   axes_ph: axes_value})
+              [output], feed_dict={
+                  a_ph: a,
+                  b_ph: b,
+                  axes_ph: axes_value
+              })
 
   # Test case for 11950
   def test_valid_axis(self):
     for axes_value in [1, 2], [[1], [2]]:
       with self.test_session() as sess:
-        np_a = np.ones((3,3))
+        np_a = np.ones((3, 3))
         np_b = np.array([2, 3, 1])[None, None]
         np_ans = np.tensordot(np_a, np_b, axes_value)
 
-        tf_a = array_ops.ones((3,3), dtype=dtypes.float32)
+        tf_a = array_ops.ones((3, 3), dtype=dtypes.float32)
         tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None]
         tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value).eval()
 
         self.assertAllEqual(tf_ans.shape, np_ans.shape)
         self.assertAllEqual(tf_ans, np_ans)
 
-
   def test_partial_shape_inference(self):
     a = array_ops.placeholder(dtypes.float32)
     b = array_ops.placeholder(dtypes.float32)
@@ -169,9 +172,11 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
           axes = array_ops.placeholder(dtypes.int32)
           c = math_ops.tensordot(a, b, axes)
           tf_ans = sess.run(
-              c, feed_dict={a: a_np,
-                            b: b_np,
-                            axes: (a_dims_np, b_dims_np)})
+              c, feed_dict={
+                  a: a_np,
+                  b: b_np,
+                  axes: (a_dims_np, b_dims_np)
+              })
         else:
           tf_ans = math_ops.tensordot(a_np, b_np, (a_dims_np, b_dims_np)).eval()
       self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol)
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 6f4f0f9859..106ea19d46 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -184,7 +184,8 @@ do_pylint() {
   # W0312 mixed-indentation
   # C0330 bad-continuation
   # C0301 line-too-long
-  grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301)' ${OUTPUT_FILE} > ${ERRORS_FILE}
+  # C0326 bad-whitespace
+  grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326)' ${OUTPUT_FILE} > ${ERRORS_FILE}
 
   N_ERRORS=0
   while read -r LINE; do
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index f678681dac..6e90b286c9 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -46,8 +46,9 @@ class APIChangeSpec(object):
   """
 
 
-class _FileEditTuple(collections.namedtuple(
-    "_FileEditTuple", ["comment", "line", "start", "old", "new"])):
+class _FileEditTuple(
+    collections.namedtuple("_FileEditTuple",
+                           ["comment", "line", "start", "old", "new"])):
   """Each edit that is recorded by a _FileEditRecorder.
 
   Fields:
@@ -179,8 +180,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
     function_renames = self._api_change_spec.function_renames
     try:
       new_name = function_renames[full_name]
-      self._file_edit.add("Renamed function %r to %r" % (full_name,
-                                                         new_name),
+      self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
                           node.lineno, node.col_offset, full_name, new_name)
     except KeyError:
       pass
@@ -227,7 +227,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
       # loop over lines
       while 1:
         # Reverse the text to and regular expression search for whitespace
-        text = self._lines[line-1]
+        text = self._lines[line - 1]
         reversed_preceding_text = text[:col][::-1]
         # First find if a [ can be found with only whitespace between it and
         # col.
@@ -248,8 +248,8 @@ class _ASTCallVisitor(ast.NodeVisitor):
             # node ranges to filter out spurious #'s that appear in string
             # literals.
             comment_start = prev_line.find("#")
-            if comment_start ==  -1:
-              col = len(prev_line) -1
+            if comment_start == -1:
+              col = len(prev_line) - 1
             elif find_string_chars.search(prev_line[comment_start:]) is None:
               col = comment_start
             else:
@@ -260,7 +260,6 @@ class _ASTCallVisitor(ast.NodeVisitor):
     # it is not possible to use that in an argument.
     return node.lineno, node.col_offset
 
-
   def visit_Call(self, node):  # pylint: disable=invalid-name
     """Handle visiting a call node in the AST.
 
@@ -268,7 +267,6 @@ class _ASTCallVisitor(ast.NodeVisitor):
       node: Current Node
     """
 
-
     # Find a simple attribute name path e.g. "tf.foo.bar"
     full_name = self._get_attribute_full_path(node.func)
 
@@ -293,18 +291,21 @@ class _ASTCallVisitor(ast.NodeVisitor):
           lineno, col_offset = self._find_true_position(arg)
           if lineno is None or col_offset is None:
             self._file_edit.add(
-                "Failed to add keyword %r to reordered function %r"
-                % (reordered[idx], full_name), arg.lineno, arg.col_offset,
-                "", "",
+                "Failed to add keyword %r to reordered function %r" %
+                (reordered[idx], full_name),
+                arg.lineno,
+                arg.col_offset,
+                "",
+                "",
                 error="A necessary keyword argument failed to be inserted.")
           else:
             keyword_arg = reordered[idx]
             if (full_name in function_keyword_renames and
                 keyword_arg in function_keyword_renames[full_name]):
               keyword_arg = function_keyword_renames[full_name][keyword_arg]
-            self._file_edit.add("Added keyword %r to reordered function %r"
-                                % (reordered[idx], full_name), lineno,
-                                col_offset, "", keyword_arg + "=")
+            self._file_edit.add("Added keyword %r to reordered function %r" %
+                                (reordered[idx], full_name), lineno, col_offset,
+                                "", keyword_arg + "=")
 
       # Examine each keyword argument and convert it to the final renamed form
       renamed_keywords = ({} if full_name not in function_keyword_renames else
@@ -322,11 +323,11 @@ class _ASTCallVisitor(ast.NodeVisitor):
             # value.
             key_start = argval_col_offset - len(argkey) - 1
             key_end = key_start + len(argkey) + 1
-            if (self._lines[argval_lineno - 1][key_start:key_end] ==
-                argkey + "="):
+            if (self._lines[argval_lineno - 1][key_start:key_end] == argkey +
+                "="):
               self._file_edit.add("Renamed keyword argument from %r to %r" %
-                                  (argkey, renamed_keywords[argkey]),
-                                  argval_lineno,
+                                  (argkey,
+                                   renamed_keywords[argkey]), argval_lineno,
                                   argval_col_offset - len(argkey) - 1,
                                   argkey + "=", renamed_keywords[argkey] + "=")
               continue
@@ -335,7 +336,8 @@ class _ASTCallVisitor(ast.NodeVisitor):
               (argkey, renamed_keywords[argkey]),
               argval.lineno,
               argval.col_offset - len(argkey) - 1,
-              "", "",
+              "",
+              "",
               error="Failed to find keyword lexographically. Fix manually.")
 
     ast.NodeVisitor.generic_visit(self, node)
@@ -352,7 +354,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
     if full_name in self._api_change_spec.change_to_function:
       if not hasattr(node, "is_function_for_call"):
         new_text = full_name + "()"
-        self._file_edit.add("Changed %r to %r"%(full_name, new_text),
+        self._file_edit.add("Changed %r to %r" % (full_name, new_text),
                             node.lineno, node.col_offset, full_name, new_text)
 
     ast.NodeVisitor.generic_visit(self, node)
@@ -380,8 +382,8 @@ class ASTCodeUpgrader(object):
     # Write to a temporary file, just in case we are doing an implace modify.
     with open(in_filename, "r") as in_file, \
         tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
-      ret = self.process_opened_file(
-          in_filename, in_file, out_filename, temp_file)
+      ret = self.process_opened_file(in_filename, in_file, out_filename,
+                                     temp_file)
 
     shutil.move(temp_file.name, out_filename)
     return ret
@@ -424,6 +426,7 @@ class ASTCodeUpgrader(object):
         out_file.write(out_text)
     text += "\n"
     return 1, text, process_errors
+
   # pylint: enable=broad-except
 
   def process_tree(self, root_directory, output_root_directory,
@@ -444,16 +447,16 @@ class ASTCodeUpgrader(object):
 
     # make sure output directory doesn't exist
     if output_root_directory and os.path.exists(output_root_directory):
-      print("Output directory %r must not already exist." % (
-          output_root_directory))
+      print("Output directory %r must not already exist." %
+            (output_root_directory))
       sys.exit(1)
 
     # make sure output directory does not overlap with root_directory
     norm_root = os.path.split(os.path.normpath(root_directory))
     norm_output = os.path.split(os.path.normpath(output_root_directory))
     if norm_root == norm_output:
-      print("Output directory %r same as input directory %r" % (
-          root_directory, output_root_directory))
+      print("Output directory %r same as input directory %r" %
+            (root_directory, output_root_directory))
       sys.exit(1)
 
     # Collect list of files to process (we do this to correctly handle if the
@@ -465,14 +468,16 @@ class ASTCodeUpgrader(object):
       copy_files = [f for f in file_list if not f.endswith(".py")]
       for filename in py_files:
         fullpath = os.path.join(dir_name, filename)
-        fullpath_output = os.path.join(
-            output_root_directory, os.path.relpath(fullpath, root_directory))
+        fullpath_output = os.path.join(output_root_directory,
+                                       os.path.relpath(fullpath,
+                                                       root_directory))
         files_to_process.append((fullpath, fullpath_output))
       if copy_other_files:
         for filename in copy_files:
           fullpath = os.path.join(dir_name, filename)
-          fullpath_output = os.path.join(
-              output_root_directory, os.path.relpath(fullpath, root_directory))
+          fullpath_output = os.path.join(output_root_directory,
+                                         os.path.relpath(
+                                             fullpath, root_directory))
           files_to_copy.append((fullpath, fullpath_output))
 
     file_count = 0
@@ -641,18 +646,17 @@ class TFAPIChangeSpec(APIChangeSpec):
         "tf.concat": ["concat_dim", "values", "name"],
         "tf.svd": ["tensor", "compute_uv", "full_matrices", "name"],
         "tf.nn.softmax_cross_entropy_with_logits": [
-            "logits", "labels", "dim", "name"],
+            "logits", "labels", "dim", "name"
+        ],
         "tf.nn.sparse_softmax_cross_entropy_with_logits": [
-            "logits", "labels", "name"],
-        "tf.nn.sigmoid_cross_entropy_with_logits": [
-            "logits", "labels", "name"],
+            "logits", "labels", "name"
+        ],
+        "tf.nn.sigmoid_cross_entropy_with_logits": ["logits", "labels", "name"],
         "tf.op_scope": ["values", "name", "default_name"],
     }
 
     # Specially handled functions.
-    self.function_handle = {
-        "tf.reverse": self._reverse_handler
-    }
+    self.function_handle = {"tf.reverse": self._reverse_handler}
 
   @staticmethod
   def _reverse_handler(file_edit_recorder, node):
@@ -661,12 +665,13 @@ class TFAPIChangeSpec(APIChangeSpec):
     comment = ("ERROR: tf.reverse has had its argument semantics changed\n"
                "significantly the converter cannot detect this reliably, so you"
                "need to inspect this usage manually.\n")
-    file_edit_recorder.add(comment,
-                           node.lineno,
-                           node.col_offset,
-                           "tf.reverse",
-                           "tf.reverse",
-                           error="tf.reverse requires manual check.")
+    file_edit_recorder.add(
+        comment,
+        node.lineno,
+        node.col_offset,
+        "tf.reverse",
+        "tf.reverse",
+        error="tf.reverse requires manual check.")
 
 
 if __name__ == "__main__":
-- 
GitLab


From 98ffa7b2bb5b273b4ae2b052e82fb0d8c054d96b Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Mon, 29 Jan 2018 11:15:32 -0800
Subject: [PATCH 327/966] Fix compilation, there are still linking issues which
 will be fixed in a followup commit.

---
 configure.py                                  | 124 ++----------------
 tensorflow/contrib/BUILD                      |   4 +-
 tensorflow/contrib/tensorrt/BUILD             | 111 ++++++++--------
 .../contrib/tensorrt/convert/convert_graph.cc |   4 +-
 .../contrib/tensorrt/convert/convert_nodes.cc |   2 +-
 .../contrib/tensorrt/kernels/trt_engine_op.h  |   3 +-
 tensorflow/contrib/tensorrt/log/trt_logger.h  |   3 +-
 .../contrib/tensorrt/shape_fn/trt_shfn.cc     |   5 +-
 tensorflow/tools/pip_package/BUILD            |   4 +-
 third_party/tensorrt/BUILD.tpl                |   3 +-
 third_party/tensorrt/build_defs.bzl           |  85 ------------
 third_party/tensorrt/build_defs.bzl.tpl       |   3 +-
 12 files changed, 84 insertions(+), 267 deletions(-)
 delete mode 100644 third_party/tensorrt/build_defs.bzl

diff --git a/configure.py b/configure.py
index b621b1bc1b..1567ed697f 100644
--- a/configure.py
+++ b/configure.py
@@ -37,7 +37,6 @@ _TF_BAZELRC = os.path.join(os.path.dirname(os.path.abspath(__file__)),
 _TF_WORKSPACE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              'WORKSPACE')
 _DEFAULT_CUDA_VERSION = '9.0'
-_DEFAULT_TENSORRT_VERSION = '4'
 _DEFAULT_CUDNN_VERSION = '7'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
@@ -384,12 +383,13 @@ def set_build_var(environ_cp, var_name, query_item, option_name,
 
   var = str(int(get_var(environ_cp, var_name, query_item, enabled_by_default)))
   environ_cp[var_name] = var
-  # TODO(mikecase): Migrate all users of configure.py to use --config Bazel
-  # options and not to set build configs through environment variables.
-  if var=='1':
-    setting='true'
-    confname=":%s"%(bazel_config_name) if bazel_config_name is not None else ""
-    write_to_bazelrc('build%s --define %s=%s' % (confname,option_name,setting))
+  if var == '1':
+    write_to_bazelrc('build --define %s=true' % option_name)
+  elif bazel_config_name is not None:
+    # TODO(mikecase): Migrate all users of configure.py to use --config Bazel
+    # options and not to set build configs through environment variables.
+    write_to_bazelrc('build:%s --define %s=true'
+                     % (bazel_config_name, option_name))
 
 
 def set_action_env_var(environ_cp,
@@ -439,6 +439,7 @@ def convert_version_to_int(version):
   for seg in version_segments:
     if not seg.isdigit():
       return None
+
   version_str = ''.join(['%03d' % int(seg) for seg in version_segments])
   return int(version_str)
 
@@ -1169,108 +1170,6 @@ def set_other_cuda_vars(environ_cp):
       write_to_bazelrc('test --config=cuda')
 
 
-def set_tf_trt_version(environ_cp):
-  """Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION."""
-  ask_trt_version = (
-      'Please specify the TensorRT (libnvinfer) version you want to use. '
-      '[Leave empty to default to libnvinfer %s]: ') % _DEFAULT_TENSORRT_VERSION
-
-  while True:
-    tf_trt_version = get_from_env_or_user_or_default(
-        environ_cp, 'TF_TENSORRT_VERSION', ask_trt_version,
-        _DEFAULT_TENSORRT_VERSION)
-    # if library version is passed and known
-    default_trt_path = environ_cp.get('TENSORRT_INSTALL_PATH',_DEFAULT_TENSORRT_PATH_LINUX)
-    ask_trt_path = (r'Please specify the location where libnvinfer %s library is '
-                      'installed. Refer to README.md for more details. [Default'
-                      ' is %s]:') % (tf_trt_version, default_trt_path)
-    trt_install_path = get_from_env_or_user_or_default(
-        environ_cp, 'TENSORRT_INSTALL_PATH', ask_trt_path, default_trt_path)
-
-    # Result returned from "read" will be used unexpanded. That make "~"
-    # unusable. Going through one more level of expansion to handle that.
-    trt_install_path = os.path.realpath(
-        os.path.expanduser(trt_install_path))
-    # Simple function to search for libnvinfer in install path
-    # it will find all libnvinfer.so* in user defined install path
-    # and lib64 subdirectory and return absolute paths
-    def find_libs(search_path):
-      fl=set()
-      if os.path.exists(search_path) and os.path.isdir(search_path):
-        fl.update([os.path.realpath(os.path.join(search_path,x)) \
-                   for x in os.listdir(search_path) if 'libnvinfer.so' in x])
-      return fl
-    possible_files=find_libs(trt_install_path)
-    possible_files.update(find_libs(os.path.join(trt_install_path,'lib64')))
-    if is_linux():
-      cudnnpatt=re.compile(".*libcudnn.so\.?(.*) =>.*$")
-      cudapatt =re.compile(".*libcudart.so\.?(.*) =>.*$")
-      def is_compatible(lib,cudaver,cudnnver):
-        ldd_bin=which('ldd') or '/usr/bin/ldd'
-        ldd_out=run_shell([ldd_bin,lib]).split(os.linesep)
-        for l in ldd_out:
-          if 'libcudnn.so' in l:
-            cudnn=cudnnpatt.search(l)
-          elif 'libcudart.so' in l:
-            cudart=cudapatt.search(l)
-        if cudnn:
-          cudnn=convert_version_to_int(cudnn.group(1)) if len(cudnn.group(1)) else 0
-        if cudart:
-          cudart=convert_version_to_int(cudart.group(1)) if len(cudart.group(1)) else 0
-        return (cudnn==cudnnver) and (cudart==cudaver)
-      cudaver=convert_version_to_int(environ_cp['TF_CUDA_VERSION'])
-      cudnnver=convert_version_to_int(environ_cp['TF_CUDNN_VERSION'])
-      valid_libs=[]
-      vfinder=re.compile('.*libnvinfer.so.?(.*)$')
-      highest_ver=[0,None,None]
-
-      for l in possible_files:
-        if is_compatible(l,cudaver,cudnnver):
-          valid_libs.append(l)
-          vstr=vfinder.search(l).group(1)
-          currver=convert_version_to_int(vstr) if len(vstr) else 0
-          if currver > highest_ver[0]:
-            highest_ver= [currver,vstr,l]
-      if highest_ver[1] is not None:
-        trt_install_path=os.path.dirname(highest_ver[2])
-        tf_trt_version=highest_ver[1]
-        break
-      ldconfig_bin = which('ldconfig') or '/sbin/ldconfig'
-      libnvinfer_path_from_ldconfig = run_shell([ldconfig_bin, '-p'])
-      libnvinfer_path_from_ldconfig = re.search('.*libnvinfer.so.* => (.*)',
-                                           libnvinfer_path_from_ldconfig)
-      if libnvinfer_path_from_ldconfig:
-        libnvinfer_path_from_ldconfig = libnvinfer_path_from_ldconfig.group(1)
-        if os.path.exists('%s.%s' % (libnvinfer_path_from_ldconfig,
-                                     tf_trt_version)):
-          trt_install_path = os.path.dirname(libnvinfer_path_from_ldconfig)
-          break
-
-    # Reset and Retry
-    if len(possible_files):
-      print(
-          'Invalid path to TensorRT %s. libnvinfer.so* files found are for incompatible cuda versions '
-           % tf_trt_version)
-      print(trt_install_path)
-      print(os.path.join(trt_install_path,'lib64'))
-    else:
-      print(
-          'Invalid path to TensorRT %s. No libnvinfer.so* files found in '
-          'found:' % tf_trt_version)
-      print(trt_install_path)
-      print(os.path.join(trt_install_path,'lib64'))
-      if is_linux():
-        print('%s.%s' % (libnvinfer_path_from_ldconfig, tf_trt_version))
-
-    environ_cp['TF_TENSORRT_VERSION'] = ''
-
-  # Set TENSORRT_INSTALL_PATH and TENSORRT_CUDNN_VERSION
-  environ_cp['TENSORRT_INSTALL_PATH'] = trt_install_path
-  write_action_env_to_bazelrc('TENSORRT_INSTALL_PATH', trt_install_path)
-  environ_cp['TF_TENSORRT_VERSION'] = tf_trt_version
-  write_action_env_to_bazelrc('TF_TENSORRT_VERSION', tf_trt_version)
-  write_to_bazelrc('build:tensorrt --define using_tensorrt=true')
-
 def set_host_cxx_compiler(environ_cp):
   """Set HOST_CXX_COMPILER."""
   default_cxx_host_compiler = which('g++') or ''
@@ -1455,6 +1354,7 @@ def main():
     environ_cp['TF_NEED_GCP'] = '0'
     environ_cp['TF_NEED_HDFS'] = '0'
     environ_cp['TF_NEED_JEMALLOC'] = '0'
+    environ_cp['TF_NEED_KAFKA'] = '0'
     environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
     environ_cp['TF_NEED_COMPUTECPP'] = '0'
     environ_cp['TF_NEED_OPENCL'] = '0'
@@ -1473,6 +1373,8 @@ def main():
                 'with_hdfs_support', True, 'hdfs')
   set_build_var(environ_cp, 'TF_NEED_S3', 'Amazon S3 File System',
                 'with_s3_support', True, 's3')
+  set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform',
+                'with_kafka_support', False, 'kafka')
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 False, 'xla')
   set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support',
@@ -1520,10 +1422,6 @@ def main():
       if not is_windows():
         set_gcc_host_compiler_path(environ_cp)
     set_other_cuda_vars(environ_cp)
-    # enable tensorrt if desired. Disabled on non-linux
-    set_action_env_var(environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False)
-    if environ_cp.get('TF_NEED_TENSORRT') == '1':
-      set_tf_trt_version(environ_cp)
 
   set_build_var(environ_cp, 'TF_NEED_MPI', 'MPI', 'with_mpi_support', False)
   if environ_cp.get('TF_NEED_MPI') == '1':
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index d4c0660285..f745c175b1 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -7,7 +7,7 @@ package(default_visibility = ["//tensorflow:__subpackages__"])
 
 load("//third_party/mpi:mpi.bzl", "if_mpi")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("@local_config_tensorrt//:build_defs.bzl", "if_trt")
+load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
 
 py_library(
     name = "contrib_py",
@@ -106,7 +106,7 @@ py_library(
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:util",
     ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_ops_py"])
-    + if_trt(["//tensorflow/contrib/tensorrt:init_py"]),
+    + if_tensorrt(["//tensorflow/contrib/tensorrt:init_py"]),
 
 )
 
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index eeb308fee8..b179e815c8 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -49,34 +49,34 @@ tf_custom_op_library(
     name = "python/ops/_trt_engine_op.so",
     srcs = [
         "kernels/trt_engine_op.cc",
-        "ops/trt_engine_op.cc",
         "kernels/trt_engine_op.h",
+        "ops/trt_engine_op.cc",
     ],
     gpu_srcs = [],
     deps = [
-        "@local_config_tensorrt//:tensorrt",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core/kernels:bounds_check_lib",
         "//tensorflow/core/kernels:ops_util_hdrs",
+        "@local_config_tensorrt//:nv_infer",
     ],
 )
 
 cc_library(
     name = "trt_shape_function",
-    srcs=[
+    srcs = [
         "shape_fn/trt_shfn.cc",
     ],
-    hdrs=["shape_fn/trt_shfn.h"],
-    copts=tf_copts(),
-    deps=[
+    hdrs = ["shape_fn/trt_shfn.h"],
+    copts = tf_copts(),
+    deps = [
         ":trt_logging",
+        "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
-        "@local_config_tensorrt//:tensorrt",
-        "@protobuf_archive//:protobuf",
+        "@local_config_tensorrt//:nv_infer",
         "@nsync//:nsync_headers",
-        "//tensorflow/core:framework_headers_lib",
-    ]
+        "@protobuf_archive//:protobuf",
+    ],
 )
 
 tf_kernel_library(
@@ -84,7 +84,7 @@ tf_kernel_library(
     srcs = [
         "kernels/trt_engine_op.cc",
     ],
-    hdrs=[
+    hdrs = [
         "kernels/trt_engine_op.h",
     ],
     gpu_srcs = [
@@ -93,37 +93,37 @@ tf_kernel_library(
         ":trt_logging",
         ":trt_shape_function",
         "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_headers_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_proto_parsing",
         "//third_party/eigen3",
-        "//tensorflow/core:gpu_headers_lib",
-        "@local_config_tensorrt//:tensorrt",
-        "//tensorflow/core:lib_proto_parsing",        
+        "@local_config_tensorrt//:nv_infer",
     ],
-    alwayslink=1,
+    alwayslink = 1,
 )
 
 tf_gen_op_libs(
-   op_lib_names = [
-	"trt_engine_op",
-   ],
-    deps=[
-        "@local_config_tensorrt//:tensorrt",
-    ]
+    op_lib_names = [
+        "trt_engine_op",
+    ],
+    deps = [
+        "@local_config_tensorrt//:nv_infer",
+    ],
 )
 
 cc_library(
-    name="trt_logging",
+    name = "trt_logging",
     srcs = [
-         "log/trt_logger.cc",
+        "log/trt_logger.cc",
     ],
-    hdrs=[
-         "log/trt_logger.h",
+    hdrs = [
+        "log/trt_logger.h",
     ],
-    deps=[
-        "@local_config_tensorrt//:tensorrt",
+    visibility = ["//visibility:public"],
+    deps = [
         "//tensorflow/core:lib_proto_parsing",
+        "@local_config_tensorrt//:nv_infer",
     ],
-    visibility = ["//visibility:public"],
 )
 
 tf_gen_op_wrapper_py(
@@ -137,8 +137,9 @@ tf_gen_op_wrapper_py(
 tf_custom_op_py_library(
     name = "trt_engine_op_loader",
     srcs = ["python/ops/trt_engine_op.py"],
-    dso = [":python/ops/_trt_engine_op.so",
-           "@local_config_tensorrt//:tensorrt",
+    dso = [
+        ":python/ops/_trt_engine_op.so",
+        "@local_config_tensorrt//:nv_infer",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -155,33 +156,33 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":trt_convert_py",
         ":trt_ops_py",
-        ":trt_convert_py",        
     ],
 )
 
 py_library(
-    name="trt_ops_py",
+    name = "trt_ops_py",
     srcs_version = "PY2AND3",
-    deps=[":trt_engine_op",
-          ":trt_engine_op_loader",
+    deps = [
+        ":trt_engine_op",
+        ":trt_engine_op_loader",
     ],
-    
 )
 
 py_library(
-    name="trt_convert_py",
-    srcs=["python/trt_convert.py"],
+    name = "trt_convert_py",
+    srcs = ["python/trt_convert.py"],
     srcs_version = "PY2AND3",
-    deps=[
-        ":wrap_conversion"
+    deps = [
+        ":wrap_conversion",
     ],
 )
 
 tf_py_wrap_cc(
-    name="wrap_conversion",
-    srcs=["trt_conversion.i"],
-    deps=[
+    name = "wrap_conversion",
+    srcs = ["trt_conversion.i"],
+    deps = [
         ":trt_conversion",
         "//tensorflow/core:framework_lite",
         "//util/python:python_headers",
@@ -189,20 +190,20 @@ tf_py_wrap_cc(
 )
 
 cc_library(
-    name= "trt_conversion",
-    srcs=[
-        "convert/convert_nodes.cc",
+    name = "trt_conversion",
+    srcs = [
         "convert/convert_graph.cc",
+        "convert/convert_nodes.cc",
         "segment/segment.cc",
     ],
-    hdrs=[
-        "convert/convert_nodes.h",
+    hdrs = [
         "convert/convert_graph.h",
+        "convert/convert_nodes.h",
         "segment/segment.h",
         "segment/union_find.h",
     ],
-    deps=[
-        "@local_config_tensorrt//:tensorrt",
+    deps = [
+        "@local_config_tensorrt//:nv_infer",
         "@protobuf_archive//:protobuf_headers",
         "@nsync//:nsync_headers",
         ":trt_logging",
@@ -225,7 +226,7 @@ tf_custom_op_library(
         "ops/tensorrt_ops.cc",
     ],
     deps = [
-        "@local_config_tensorrt//:tensorrt",
+        "@local_config_tensorrt//:nv_infer",
     ],
 )
 
@@ -236,16 +237,16 @@ cc_library(
         "segment/segment.cc",
     ],
     hdrs = [
-        "segment/union_find.h",
         "segment/segment.h",
+        "segment/union_find.h",
     ],
+    linkstatic = 1,
     deps = [
-        "@protobuf_archive//:protobuf_headers",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib_proto_parsing",
         "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
     ],
-    linkstatic = 1,
 )
 
 tf_cc_test(
@@ -265,13 +266,13 @@ tf_cc_test(
 filegroup(
     name = "cppfiles",
     srcs = glob(["**/*.cc"]),
-    visibility=["//visibility:private"],
+    visibility = ["//visibility:private"],
 )
 
 filegroup(
     name = "headers",
     srcs = glob(["**/*.h"]),
-    visibility=["//visibility:private"],
+    visibility = ["//visibility:private"],
 )
 
 filegroup(
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index fa5ed28060..3ade4ec356 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -25,8 +25,6 @@ limitations under the License.
 #include <map>
 #include <utility>
 
-#include "NvInfer.h"
-
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -46,8 +44,8 @@ limitations under the License.
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
-
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorrt/include/NvInfer.h"
 
 //------------------------------------------------------------------------------
 namespace tensorflow {
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 2fd7f659d5..19242cd944 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include <unordered_map>
 #include <utility>
 #include <vector>
-#include "NvInfer.h"
 
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -39,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorrt/include/NvInfer.h"
 
 #define _TF_LOG_DEBUG ::tensorflow::internal::LogMessage(__FILE__, __LINE__, -1)
 //  Check if the types are equal. Cast to int first so that failure log message
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index 2f2d453dda..ac188addd7 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -16,11 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
 #define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
 
-#include <NvInfer.h>
 #include <cuda_runtime_api.h>
 #include <memory>
 #include <string>
 #include <vector>
+
+#include "tensorrt/include/NvInfer.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index 3a3a29516a..8737813bed 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -18,9 +18,10 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
 
 // Use TF logging f
-#include <NvInfer.h>
 #include <string>
 
+#include "tensorrt/include/NvInfer.h"
+
 //------------------------------------------------------------------------------
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index 16a4dc2134..7951397b7e 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
 #include <string>
 #include <vector>
-#include "NvInfer.h"
+
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace shape_inference {
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index d864d09d8f..63bceaa8a6 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -11,7 +11,7 @@ load(
 )
 load("//third_party/mkl:build_defs.bzl", "if_mkl")
 load("//tensorflow:tensorflow.bzl", "if_cuda")
-load("@local_config_tensorrt//:build_defs.bzl", "if_trt")
+load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_license_deps")
 
 # This returns a list of headers of all public header libraries (e.g.,
@@ -183,7 +183,7 @@ sh_binary(
             "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
         ],
     }) + if_mkl(["//third_party/mkl:intel_binary_blob"])
-    + if_trt(["//tensorflow/contrib/tensorrt:init_py"]),
+    + if_tensorrt(["//tensorflow/contrib/tensorrt:init_py"]),
 )
 
 # A genrule for generating a marker file for the pip package on Windows
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
index 6cb7db7e90..99c0e89498 100644
--- a/third_party/tensorrt/BUILD.tpl
+++ b/third_party/tensorrt/BUILD.tpl
@@ -66,4 +66,5 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-%{tensorrt_genrules}
\ No newline at end of file
+%{tensorrt_genrules}
+
diff --git a/third_party/tensorrt/build_defs.bzl b/third_party/tensorrt/build_defs.bzl
deleted file mode 100644
index 392c5e0621..0000000000
--- a/third_party/tensorrt/build_defs.bzl
+++ /dev/null
@@ -1,85 +0,0 @@
-# -*- python -*-
-"""
- add a repo_generator rule for tensorrt
-
-"""
-
-_TENSORRT_INSTALLATION_PATH="TENSORRT_INSTALL_PATH"
-_TF_TENSORRT_VERSION="TF_TENSORRT_VERSION"
-
-def _is_trt_enabled(repo_ctx):
-    if "TF_NEED_TENSORRT" in repo_ctx.os.environ:
-        enable_trt = repo_ctx.os.environ["TF_NEED_TENSORRT"].strip()
-        return enable_trt == "1"
-    return False
-
-def _dummy_repo(repo_ctx):
-
-    repo_ctx.template("BUILD",Label("//third_party/tensorrt:BUILD.tpl"),
-                      {"%{tensorrt_lib}":"","%{tensorrt_genrules}":""},
-                      False)
-    repo_ctx.template("build_defs.bzl",Label("//third_party/tensorrt:build_defs.bzl.tpl"),
-                      {"%{trt_configured}":"False"},False)
-    repo_ctx.file("include/NvUtils.h","",False)
-    repo_ctx.file("include/NvInfer.h","",False)
-
-def _trt_repo_impl(repo_ctx):
-    """
-    Implements local_config_tensorrt
-    """
-
-    if not _is_trt_enabled(repo_ctx):
-        _dummy_repo(repo_ctx)
-        return
-    trt_libdir=repo_ctx.os.environ[_TENSORRT_INSTALLATION_PATH]
-    trt_ver=repo_ctx.os.environ[_TF_TENSORRT_VERSION]
-# if deb installation
-# once a standardized installation between tar and deb
-# is done, we don't need this
-    if trt_libdir == '/usr/lib/x86_64-linux-gnu':
-        incPath='/usr/include/x86_64-linux-gnu'
-        incname='/usr/include/x86_64-linux-gnu/NvInfer.h'
-    else:
-        incPath=str(repo_ctx.path("%s/../include"%trt_libdir).realpath)
-        incname=incPath+'/NvInfer.h'
-    if len(trt_ver)>0:
-        origLib="%s/libnvinfer.so.%s"%(trt_libdir,trt_ver)
-    else:
-        origLib="%s/libnvinfer.so"%trt_libdir        
-    objdump=repo_ctx.which("objdump")
-    if objdump == None:
-        if len(trt_ver)>0:
-            targetlib="lib/libnvinfer.so.%s"%(trt_ver[0])
-        else:
-            targetlib="lib/libnvinfer.so"
-    else:
-        soname=repo_ctx.execute([objdump,"-p",origLib])
-        for l in soname.stdout.splitlines():
-            if "SONAME" in l:
-                lib=l.strip().split(" ")[-1]
-                targetlib="lib/%s"%(lib)
-    
-    if len(trt_ver)>0:
-        repo_ctx.symlink(origLib,targetlib)
-    else:
-        repo_ctx.symlink(origLib,targetlib)
-    grule=('genrule(\n    name = "trtlinks",\n'+
-           '    outs = [\n    "%s",\n    "include/NvInfer.h",\n    "include/NvUtils.h",\n     ],\n'%targetlib +
-           '    cmd="""ln -sf %s $(@D)/%s '%(origLib,targetlib) +
-           '&&\n    ln -sf %s $(@D)/include/NvInfer.h '%(incname) +
-           '&&\n    ln -sf %s/NvUtils.h $(@D)/include/NvUtils.h""",\n)\n'%(incPath))
-    repo_ctx.template("BUILD",Label("//third_party/tensorrt:BUILD.tpl"),
-                      {"%{tensorrt_lib}":'"%s"'%targetlib,"%{tensorrt_genrules}":grule},
-                      False)
-    repo_ctx.template("build_defs.bzl",Label("//third_party/tensorrt:build_defs.bzl.tpl"),
-                      {"%{trt_configured}":"True"},False)
-
-trt_repository=repository_rule(
-    implementation= _trt_repo_impl,
-    local=True,
-    environ=[
-        "TF_NEED_TENSORRT",
-        _TF_TENSORRT_VERSION,
-        _TENSORRT_INSTALLATION_PATH,
-        ],
-    )
diff --git a/third_party/tensorrt/build_defs.bzl.tpl b/third_party/tensorrt/build_defs.bzl.tpl
index 8a89b59bc8..f5348a7c06 100644
--- a/third_party/tensorrt/build_defs.bzl.tpl
+++ b/third_party/tensorrt/build_defs.bzl.tpl
@@ -4,4 +4,5 @@ def if_tensorrt(if_true, if_false=[]):
   """Tests whether TensorRT was enabled during the configure process."""
   if %{tensorrt_is_configured}:
     return if_true
-  return if_false
\ No newline at end of file
+  return if_false
+
-- 
GitLab


From 19c579d97f689b6fc0581f4a1973c3305b5693c5 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 29 Jan 2018 11:20:39 -0800
Subject: [PATCH 328/966] Make optimize_for_inference_test.py work the C API
 enabled.

PiperOrigin-RevId: 183696425
---
 tensorflow/python/tools/optimize_for_inference_test.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py
index 6dd24c0dca..7686bb0f14 100644
--- a/tensorflow/python/tools/optimize_for_inference_test.py
+++ b/tensorflow/python/tools/optimize_for_inference_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import image_ops
@@ -38,6 +39,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.tools import optimize_for_inference_lib
 
 
+@test_util.with_c_api
 class OptimizeForInferenceTest(test.TestCase):
 
   def create_node_def(self, op, name, inputs):
@@ -145,7 +147,7 @@ class OptimizeForInferenceTest(test.TestCase):
           np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
       gamma_op = constant_op.constant(
           np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
-      ops.get_default_graph().graph_def_versions.producer = 8
+      test_util.set_producer_version(ops.get_default_graph(), 8)
       gen_nn_ops._batch_norm_with_global_normalization(
           conv_op,
           mean_op,
-- 
GitLab


From b981f2e93c7edb7f07b82677e5ac85d966dd0ab5 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Mon, 29 Jan 2018 11:31:07 -0800
Subject: [PATCH 329/966] [TF:XLA] Implement ReverseSequence operator.

PiperOrigin-RevId: 183698184
---
 tensorflow/compiler/tests/BUILD               |  12 ++
 .../tests/reverse_sequence_op_test.py         |  93 +++++++++
 tensorflow/compiler/tf2xla/kernels/BUILD      |   1 +
 .../tf2xla/kernels/reverse_sequence_op.cc     | 182 ++++++++++++++++++
 4 files changed, 288 insertions(+)
 create mode 100644 tensorflow/compiler/tests/reverse_sequence_op_test.py
 create mode 100644 tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 314f5506b1..a3a82df9ad 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -437,6 +437,18 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "reverse_sequence_op_test",
+    size = "small",
+    srcs = ["reverse_sequence_op_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "rmsprop_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/reverse_sequence_op_test.py b/tensorflow/compiler/tests/reverse_sequence_op_test.py
new file mode 100644
index 0000000000..1a5d05094e
--- /dev/null
+++ b/tensorflow/compiler/tests/reverse_sequence_op_test.py
@@ -0,0 +1,93 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.reverse_sequence_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ReverseSequenceTest(XLATestCase):
+
+  def _testReverseSequence(self,
+                           x,
+                           batch_axis,
+                           seq_axis,
+                           seq_lengths,
+                           truth,
+                           expected_err_re=None):
+    with self.test_session():
+      p = array_ops.placeholder(dtypes.as_dtype(x.dtype))
+      lengths = array_ops.placeholder(dtypes.as_dtype(seq_lengths.dtype))
+      with self.test_scope():
+        ans = array_ops.reverse_sequence(
+            p, batch_axis=batch_axis, seq_axis=seq_axis, seq_lengths=lengths)
+      if expected_err_re is None:
+        tf_ans = ans.eval(feed_dict={p: x, lengths: seq_lengths})
+        self.assertAllClose(tf_ans, truth, atol=1e-10)
+      else:
+        with self.assertRaisesOpError(expected_err_re):
+          ans.eval(feed_dict={p: x, lengths: seq_lengths})
+
+  def testSimple(self):
+    x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int32)
+    expected = np.array([[1, 2, 3], [6, 5, 4], [8, 7, 9]], dtype=np.int32)
+    self._testReverseSequence(
+        x,
+        batch_axis=0,
+        seq_axis=1,
+        seq_lengths=np.array([1, 3, 2], np.int32),
+        truth=expected)
+
+  def _testBasic(self, dtype, len_dtype):
+    x = np.asarray(
+        [[[1, 2, 3, 4], [5, 6, 7, 8]], [[9, 10, 11, 12], [13, 14, 15, 16]],
+         [[17, 18, 19, 20], [21, 22, 23, 24]]],
+        dtype=dtype)
+    x = x.reshape(3, 2, 4, 1, 1)
+    x = x.transpose([2, 1, 0, 3, 4])  # permute axes 0 <=> 2
+
+    # reverse dim 2 up to (0:3, none, 0:4) along dim=0
+    seq_lengths = np.asarray([3, 0, 4], dtype=len_dtype)
+
+    truth_orig = np.asarray(
+        [
+            [[3, 2, 1, 4], [7, 6, 5, 8]],  # reverse 0:3
+            [[9, 10, 11, 12], [13, 14, 15, 16]],  # reverse none
+            [[20, 19, 18, 17], [24, 23, 22, 21]]
+        ],  # reverse 0:4 (all)
+        dtype=dtype)
+    truth_orig = truth_orig.reshape(3, 2, 4, 1, 1)
+    truth = truth_orig.transpose([2, 1, 0, 3, 4])  # permute axes 0 <=> 2
+
+    seq_axis = 0  # permute seq_axis and batch_axis (originally 2 and 0, resp.)
+    batch_axis = 2
+    self._testReverseSequence(x, batch_axis, seq_axis, seq_lengths, truth)
+
+  def testSeqLength(self):
+    for dtype in self.all_types:
+      for seq_dtype in self.int_types:
+        self._testBasic(dtype, seq_dtype)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 5e1b01878b..a7e00cb12f 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -58,6 +58,7 @@ tf_kernel_library(
         "reshape_op.cc",
         "retval_op.cc",
         "reverse_op.cc",
+        "reverse_sequence_op.cc",
         "scan_ops.cc",
         "segment_reduction_ops.cc",
         "select_op.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
new file mode 100644
index 0000000000..6bc5d3adb0
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
@@ -0,0 +1,182 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class ReverseSequenceOp : public XlaOpKernel {
+ public:
+  explicit ReverseSequenceOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("batch_dim", &batch_dim_));
+    OP_REQUIRES_OK(context, context->GetAttr("seq_dim", &seq_dim_));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    const TensorShape seq_lens_shape = context->InputShape(1);
+
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lens_shape),
+                errors::InvalidArgument("seq_lens input must be 1-dim, not ",
+                                        seq_lens_shape.dims()));
+    OP_REQUIRES(context, batch_dim_ != seq_dim_,
+                errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim_));
+    OP_REQUIRES(
+        context, seq_dim_ < input_shape.dims(),
+        errors::InvalidArgument("seq_dim must be < input.dims()", "( ",
+                                seq_dim_, " vs. ", input_shape.dims(), ")"));
+    OP_REQUIRES(
+        context, batch_dim_ < input_shape.dims(),
+        errors::InvalidArgument("batch_dim must be < input.dims()", "( ",
+                                batch_dim_, " vs. ", input_shape.dims(), ")"));
+    OP_REQUIRES(
+        context,
+        seq_lens_shape.num_elements() == input_shape.dim_size(batch_dim_),
+        errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim_,
+                                "), ", "(", seq_lens_shape.num_elements(),
+                                " vs. ", input_shape.dim_size(batch_dim_)));
+
+    xla::ComputationBuilder* builder = context->builder();
+    const auto input = context->Input(0);
+    const auto seq_lens = context->Input(1);
+
+    const int64 batch_size = input_shape.dim_size(batch_dim_);
+
+    const DataType input_type = context->input_type(0);
+    const DataType seq_lens_type = context->input_type(1);
+    const int64 max_seq_len = input_shape.dim_size(seq_dim_);
+
+    xla::Shape input_xla_shape;
+    OP_REQUIRES_OK(context, TensorShapeToXLAShape(input_type, input_shape,
+                                                  &input_xla_shape));
+    xla::Shape seq_lens_xla_shape;
+    OP_REQUIRES_OK(context, TensorShapeToXLAShape(seq_lens_type, seq_lens_shape,
+                                                  &seq_lens_xla_shape));
+
+    const auto tuple_shape = xla::ShapeUtil::MakeTupleShape({
+        xla::ShapeUtil::MakeShape(seq_lens_xla_shape.element_type(), {}),
+        seq_lens_xla_shape,
+        input_xla_shape,
+    });
+
+    // For each entry in the batch, reverse the sequence.
+    // TODO(b/65689298): generalize the Map() operator to non-scalar cases and
+    // use it here, instead of a While loop.
+
+    // Condition: lambda (i, _, _): i < batch_size
+    auto condition_builder =
+        builder->CreateSubBuilder("reverse_sequence_condition");
+    {
+      auto param = condition_builder->Parameter(0, tuple_shape, "param");
+      auto i = condition_builder->GetTupleElement(param, 0);
+      condition_builder->Lt(
+          i, XlaHelpers::IntegerLiteral(condition_builder.get(), seq_lens_type,
+                                        batch_size));
+    }
+    auto condition = condition_builder->Build();
+    OP_REQUIRES_OK(context, condition.status());
+
+    auto body_builder = builder->CreateSubBuilder("reverse_sequence_body");
+    {
+      auto param = body_builder->Parameter(0, tuple_shape, "param");
+      auto i = body_builder->GetTupleElement(param, 0);
+      auto seq_lens = body_builder->GetTupleElement(param, 1);
+      auto output = body_builder->GetTupleElement(param, 2);
+
+      // seq_len is the sequence length of the current batch element (rank 1)
+      auto seq_len = body_builder->DynamicSlice(
+          seq_lens, body_builder->Reshape(i, {1}), {1});
+
+      // Indices is the offset of the batch element in the input.
+      auto indices = body_builder->Broadcast(
+          XlaHelpers::Zero(body_builder.get(), seq_lens_type),
+          {input_shape.dims()});
+      indices = body_builder->DynamicUpdateSlice(
+          indices, body_builder->Reshape(i, {1}),
+          body_builder->Reshape(
+              XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type,
+                                         batch_dim_),
+              {1}));
+
+      // slice_indices is the offset of the start of the reversed sequence in
+      // the input.
+      auto slice_indices = body_builder->DynamicUpdateSlice(
+          indices,
+          body_builder->Sub(XlaHelpers::IntegerLiteral(
+                                body_builder.get(), seq_lens_type, max_seq_len),
+                            seq_len),
+          body_builder->Reshape(
+              XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type,
+                                         seq_dim_),
+              {1}));
+
+      // Slice out the reversed sequence. The slice will overflow the end of the
+      // sequence, and the contents of the overflow are implementation-defined.
+      // However, we will mask off these elements and replace them with elements
+      // from the original input so their values do not matter.
+      TensorShape slice_shape = input_shape;
+      slice_shape.set_dim(batch_dim_, 1);
+      auto slice = body_builder->DynamicSlice(output, slice_indices,
+                                              slice_shape.dim_sizes());
+
+      // Shift the reversed sequence to the left.
+      output = body_builder->DynamicUpdateSlice(output, slice, indices);
+
+      body_builder->Tuple(
+          {body_builder->Add(
+               i, XlaHelpers::One(body_builder.get(), seq_lens_type)),
+           seq_lens, output});
+    }
+    auto body = body_builder->Build();
+    OP_REQUIRES_OK(context, body.status());
+
+    auto loop_output = builder->While(
+        condition.ValueOrDie(), body.ValueOrDie(),
+        builder->Tuple({XlaHelpers::Zero(builder, seq_lens_type), seq_lens,
+                        builder->Rev(input, {seq_dim_})}));
+    auto output = builder->GetTupleElement(loop_output, 2);
+
+    // Mask out elements after the sequence length.
+    xla::ComputationDataHandle iota;
+    OP_REQUIRES_OK(
+        context, XlaHelpers::Iota(builder, seq_lens_type, max_seq_len, &iota));
+    std::vector<int64> dims(input_shape.dims(), 1);
+    dims[batch_dim_] = batch_size;
+    auto mask = builder->Lt(iota, builder->Reshape(seq_lens, dims), {seq_dim_});
+
+    // Broadcast the mask up to the input shape.
+    mask =
+        builder->Or(mask, builder->Broadcast(builder->ConstantR0<bool>(false),
+                                             input_shape.dim_sizes()));
+
+    output = builder->Select(mask, output, input);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  int32 batch_dim_;
+  int32 seq_dim_;
+};
+
+REGISTER_XLA_OP(Name("ReverseSequence"), ReverseSequenceOp);
+
+}  // namespace
+}  // namespace tensorflow
-- 
GitLab


From dd808e926da2ccfb1dbe0086bcef5dee55a454a9 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Mon, 29 Jan 2018 11:41:31 -0800
Subject: [PATCH 330/966] [TF:XLA] Add xla_dump_per_pass_hlo_proto_to flag and
 rename xla_dump_hlo_proto_to and xla_dump_prepass_hlo_proto_to.

Renamed:
xla_dump_hlo_proto_to -> xla_dump_optimized_hlo_proto_to
xla_dump_prepass_hlo_proto_to -> xla_dump_unoptimized_hlo_proto_to

xla_dump_per_pass_hlo_proto_to takes a directory from which to dump the serialized HLO
module protos after each HLO pass. This will help us compare HLO modules as
they change during HLO passes, such as what they evaluate to, or how fast they
perform.

The directory passed will contain multiple protos with the filename format:
module_<module id>.<pass number>.<pipeline_name>.after_<pass name>.pb
example:
module_0000.0009.simplification.after_dce.pb
such that pass number 0000_* is the first pass run, followed immediately by
pass number 0001_*.
PiperOrigin-RevId: 183700119
---
 .../xla/legacy_flags/debug_options_flags.cc   | 18 ++++++----
 tensorflow/compiler/xla/service/BUILD         |  1 +
 .../compiler/xla/service/cpu/cpu_compiler.cc  | 20 +++++------
 .../compiler/xla/service/gpu/gpu_compiler.cc  |  8 ++---
 tensorflow/compiler/xla/service/hlo_module.cc | 12 +++++--
 tensorflow/compiler/xla/service/hlo_module.h  | 10 ++++++
 .../compiler/xla/service/hlo_module_test.cc   |  6 ++++
 .../compiler/xla/service/hlo_pass_pipeline.cc | 36 ++++++++++++++++---
 tensorflow/compiler/xla/service/service.cc    |  8 ++---
 .../compiler/xla/tools/hlo_proto_to_json.cc   |  2 +-
 tensorflow/compiler/xla/util.cc               |  2 +-
 tensorflow/compiler/xla/xla.proto             | 15 +++++---
 12 files changed, 100 insertions(+), 38 deletions(-)

diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index fe3a4d2f6d..c8ed3e3a2b 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -221,13 +221,19 @@ void AllocateFlags() {
           flag_values->xla_gpu_disable_multi_streaming(),
           "If true, multi-streaming in the GPU backend is disabled."),
       tensorflow::Flag(
-          "xla_dump_hlo_proto_to", flag_values->mutable_xla_dump_hlo_proto_to(),
-          "Dump compilation artifacts as proto binary into this directory."),
+          "xla_dump_optimized_hlo_proto_to",
+          flag_values->mutable_xla_dump_optimized_hlo_proto_to(),
+          "Dump Hlo after all hlo passes are executed as proto binary into "
+          "this directory."),
       tensorflow::Flag(
-          "xla_dump_prepass_hlo_proto_to",
-          flag_values->mutable_xla_dump_prepass_hlo_proto_to(),
-          "Dump compilation artifacts, before hlo passes are executed, as "
-          "proto binary into this directory."),
+          "xla_dump_unoptimized_hlo_proto_to",
+          flag_values->mutable_xla_dump_unoptimized_hlo_proto_to(),
+          "Dump HLO before any hlo passes are executed as proto binary into "
+          "this directory."),
+      tensorflow::Flag("xla_dump_per_pass_hlo_proto_to",
+                       flag_values->mutable_xla_dump_per_pass_hlo_proto_to(),
+                       "Dump HLO after each pass as an HloProto in binary file "
+                       "format into this directory."),
       tensorflow::Flag(
           "xla_test_all_output_layouts",
           bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts),
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index bdccfad0d0..a04ba7ae3e 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1886,6 +1886,7 @@ cc_library(
         ":hlo",
         ":hlo_graph_dumper",
         ":hlo_pass",
+        ":hlo_proto_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 3fdb3d5ca6..d13a97bcc9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -519,8 +519,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   // ownership is std::moved.
   const bool embed_ir_in_executable =
       module->config().debug_options().xla_embed_ir_in_executable();
-  const string xla_dump_hlo_proto_to =
-      module->config().debug_options().xla_dump_hlo_proto_to();
+  const string xla_dump_optimized_hlo_proto_to =
+      module->config().debug_options().xla_dump_optimized_hlo_proto_to();
 
   if (options::CpuParallelBackendRequested(module->config())) {
     VLOG(1) << "Using parallel cpu backend";
@@ -540,10 +540,10 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
 
-    if (!xla_dump_hlo_proto_to.empty()) {
+    if (!xla_dump_optimized_hlo_proto_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_hlo_proto_to, module->name()));
+          proto, xla_dump_optimized_hlo_proto_to, module->name()));
     }
 
     // If we are using the parallel CPU backend, we need to create map from
@@ -649,10 +649,10 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
 
-    if (!xla_dump_hlo_proto_to.empty()) {
+    if (!xla_dump_optimized_hlo_proto_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_hlo_proto_to, module->name()));
+          proto, xla_dump_optimized_hlo_proto_to, module->name()));
     }
 
     // Each computation is a single function.  Emit all embedded computations
@@ -828,12 +828,12 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
 
-    const string xla_dump_hlo_proto_to =
-        module->config().debug_options().xla_dump_hlo_proto_to();
-    if (!xla_dump_hlo_proto_to.empty()) {
+    const string xla_dump_optimized_hlo_proto_to =
+        module->config().debug_options().xla_dump_optimized_hlo_proto_to();
+    if (!xla_dump_optimized_hlo_proto_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_hlo_proto_to, module->name()));
+          proto, xla_dump_optimized_hlo_proto_to, module->name()));
     }
 
     IrEmitter ir_emitter(*module, *assignment, &llvm_module,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 495ae1710f..07543d42e3 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -468,12 +468,12 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
   // print one ourselves.
   XLA_VLOG_LINES(2, buffer_assignment->ToString());
   XLA_VLOG_LINES(2, module->ToString());
-  const string xla_dump_hlo_proto_to =
-      module->config().debug_options().xla_dump_hlo_proto_to();
-  if (!xla_dump_hlo_proto_to.empty()) {
+  const string xla_dump_optimized_hlo_proto_to =
+      module->config().debug_options().xla_dump_optimized_hlo_proto_to();
+  if (!xla_dump_optimized_hlo_proto_to.empty()) {
     HloProto proto = MakeHloProto(*module, *buffer_assignment);
     TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-        proto, xla_dump_hlo_proto_to, module->name()));
+        proto, xla_dump_optimized_hlo_proto_to, module->name()));
   }
 
   IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(),
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 99d8dd04e5..60270b0595 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -38,12 +38,16 @@ HloModule::HloModule(const string& name,
     : name_(NameUniquer::GetSanitizedName(name)),
       config_(config),
       has_entry_computation_handle_(true),
-      entry_computation_handle_(entry_computation_handle) {}
+      entry_computation_handle_(entry_computation_handle),
+      unique_id_(next_unique_module_id_++) {}
 
 HloModule::HloModule(const string& name)
-    : name_(NameUniquer::GetSanitizedName(name)) {}
+    : name_(NameUniquer::GetSanitizedName(name)),
+      unique_id_(next_unique_module_id_++) {}
 HloModule::HloModule(const string& name, const HloModuleConfig& config)
-    : name_(NameUniquer::GetSanitizedName(name)), config_(config) {}
+    : name_(NameUniquer::GetSanitizedName(name)),
+      config_(config),
+      unique_id_(next_unique_module_id_++) {}
 
 HloComputation* HloModule::AddComputationInternal(
     std::unique_ptr<HloComputation> computation, bool is_entry,
@@ -564,4 +568,6 @@ uint64 HloModule::RandomNew64() const {
   return rng_();
 }
 
+/* static */ std::atomic<int> HloModule::next_unique_module_id_(0);
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index e377654d02..4bfe8d89ce 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_H_
 
+#include <atomic>
 #include <list>
 #include <memory>
 #include <random>
@@ -201,6 +202,10 @@ class HloModule {
   // this point are guaranteed to be in the range [0..NumUniqueInstructionIds())
   int NumUniqueInstructionIds() const { return next_unique_id_; }
 
+  // Returns an id that is unique to this module across all modules created over
+  // the lifetime of this process.
+  int unique_id() const { return unique_id_; }
+
  private:
   HloComputation* AddComputationInternal(
       std::unique_ptr<HloComputation> computation, bool is_entry,
@@ -227,6 +232,11 @@ class HloModule {
   NameUniquer computation_name_uniquer_{/*separator=*/"."};
   NameUniquer instruction_name_uniquer_{/*separator=*/"."};
   int next_unique_id_ = 0;
+
+  // Used to keep track of the next unique module id that should be assigned.
+  static std::atomic<int> next_unique_module_id_;
+  // A unique id to label modules with.
+  int unique_id_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index cd51fa4e85..7f28a804bf 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -188,6 +188,12 @@ TEST_F(HloModuleTest, LargeConstantToString) {
       module->ToString(HloPrintOptions().set_print_large_constants(true)));
 }
 
+TEST_F(HloModuleTest, UniqueModuleId) {
+  auto module_a = CreateNewModule();
+  auto module_b = CreateNewModule();
+  EXPECT_NE(module_a->unique_id(), module_b->unique_id());
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 53bd46a641..5120775737 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <functional>
 
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
+#include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -32,12 +33,28 @@ using ::tensorflow::strings::StrCat;
 namespace xla {
 
 namespace {
-void DumpModule(const HloModule& module,
-                const string& message) {
+void DumpModuleGraph(const HloModule& module, const string& message) {
   hlo_graph_dumper::MaybeDumpHloModule(module, message);
   VLOG(3) << "HLO " << message << ":";
   XLA_VLOG_LINES(3, module.ToString());
 }
+
+void DumpModuleProto(const HloModule& module, const string& dump_to,
+                     const string& pipeline_name, const string& pass_name) {
+  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+  static auto* const module_id_to_pass_number =
+      new tensorflow::gtl::FlatMap<int64, int64>();
+
+  tensorflow::mutex_lock lock(mu);
+  const int64 pass_number = (*module_id_to_pass_number)[module.unique_id()]++;
+
+  const string mod_name = SanitizeFileName(tensorflow::strings::Printf(
+      "module_%04d.%04lld.%s.after_%s", module.unique_id(), pass_number,
+      pipeline_name.c_str(), pass_name.c_str()));
+
+  TF_QCHECK_OK(protobuf_util::DumpProtoToDirectory(MakeHloProto(module),
+                                                   dump_to, mod_name));
+}
 }  // namespace
 
 StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
@@ -78,6 +95,13 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
   string message;
   TF_RETURN_IF_ERROR(
       run_invariant_checkers(StrCat("before running pipeline: ", name())));
+  const string xla_dump_per_pass_hlo_proto_to =
+      module->config().debug_options().xla_dump_per_pass_hlo_proto_to();
+  if (!xla_dump_per_pass_hlo_proto_to.empty()) {
+    DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, name().ToString(),
+                    "pipeline_start");
+  }
+
   for (auto& pass : passes_) {
     if (disabled_passes.count(pass->name().ToString()) > 0) {
       VLOG(1) << "  Skipping HLO pass " << pass->name()
@@ -90,17 +114,21 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
     // Emit label containing: "after foo-pass, before bar-pass".
     message.clear();
     StrAppend(&message, prefix, ", before ", pass->name());
-    DumpModule(*module, message);
+    DumpModuleGraph(*module, message);
 
     TF_ASSIGN_OR_RETURN(bool changed_this_pass, pass->Run(module));
     TF_RETURN_IF_ERROR(
         run_invariant_checkers(StrCat("after running pass: ", pass->name())));
+    if (!xla_dump_per_pass_hlo_proto_to.empty()) {
+      DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to,
+                      name().ToString(), pass->name().ToString());
+    }
 
     changed |= changed_this_pass;
     prefix.clear();
     StrAppend(&prefix, name(), ": after ", pass->name());
   }
-  DumpModule(*module, prefix + ", pipeline end");
+  DumpModuleGraph(*module, prefix + ", pipeline end");
   return changed;
 }
 
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index fea6956345..a57b7e5717 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -1619,14 +1619,14 @@ StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Service::Replicas(
 }
 
 Status Service::MaybeDumpHloModule(const HloModule& module) const {
-  const string xla_dump_prepass_hlo_proto_to =
-      module.config().debug_options().xla_dump_prepass_hlo_proto_to();
-  if (xla_dump_prepass_hlo_proto_to.empty()) {
+  const string xla_dump_unoptimized_hlo_proto_to =
+      module.config().debug_options().xla_dump_unoptimized_hlo_proto_to();
+  if (xla_dump_unoptimized_hlo_proto_to.empty()) {
     return Status::OK();
   }
   HloProto proto = MakeHloProto(module);
   return protobuf_util::DumpProtoToDirectory(
-      proto, xla_dump_prepass_hlo_proto_to, module.name());
+      proto, xla_dump_unoptimized_hlo_proto_to, module.name());
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
index 4e02e17db6..8460ae3e49 100644
--- a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
+++ b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
@@ -19,7 +19,7 @@ limitations under the License.
 //
 // Reads one serilized Hlo module, convert it into JSON format and dump into
 // some output directory. some_binaray_proto is obtained by serializing Hlo
-// module to disk using --xla_dump_hlo_proto_to debug optoin.
+// module to disk using --xla_dump_optimized_hlo_proto_to debug option.
 
 #include <stdio.h>
 #include <string>
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index b020905035..1f0c626bbb 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -339,7 +339,7 @@ std::vector<std::pair<int64, int64>> CommonFactors(
 
 string SanitizeFileName(string file_name) {
   for (char& c : file_name) {
-    if (c == '/' || c == '\\' || c == '[' || c == ']') {
+    if (c == '/' || c == '\\' || c == '[' || c == ']' || c == ' ') {
       c = '_';
     }
   }
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index e1ed08c848..56162ab44e 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -82,8 +82,9 @@ message DebugOptions {
   // Dump all HLO modules as text into the provided directory path.
   string xla_generate_hlo_text_to = 7;
 
-  // Dump compilation artifacts in binary proto into this directory.
-  string xla_dump_hlo_proto_to = 8;
+  // Dump Hlo after all hlo passes are executed as proto binary into this
+  // directory.
+  string xla_dump_optimized_hlo_proto_to = 8;
 
   // Instrument the computation to collect per-HLO cycle counts.
   bool xla_hlo_profile = 9;
@@ -179,9 +180,13 @@ message DebugOptions {
   // ops.
   bool xla_gpu_use_cudnn_batchnorm = 94;
 
-  // Dump compilation artifacts, before hlo passes are executed, in binary proto
-  // into this directory.
-  string xla_dump_prepass_hlo_proto_to = 95;
+  // Dump HLO before any hlo passes are executed as proto binary into this
+  // directory.
+  string xla_dump_unoptimized_hlo_proto_to = 95;
+
+  // Dump HLO after each pass as an HloProto in binary file format into this
+  // directory.
+  string xla_dump_per_pass_hlo_proto_to = 96;
 
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
-- 
GitLab


From 0050ce16d425b3367010d58a5a3dca30aab894a4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 11:47:02 -0800
Subject: [PATCH 331/966] Make TFLite Mean op have parity with TF Reduce Mean
 op by changing the representation of axis from an attribute to a tensor.

PiperOrigin-RevId: 183701017
---
 tensorflow/contrib/lite/builtin_op_data.h     |   4 -
 tensorflow/contrib/lite/kernels/mean.cc       | 129 +++++++++++-------
 tensorflow/contrib/lite/kernels/mean_test.cc  | 100 ++++++++++----
 tensorflow/contrib/lite/model.cc              |   4 -
 tensorflow/contrib/lite/schema/schema.fbs     |   1 -
 .../contrib/lite/schema/schema_generated.h    |  36 +----
 .../contrib/lite/testing/generate_examples.py |  32 +++--
 .../contrib/lite/toco/tflite/operator.cc      |   5 +-
 .../contrib/lite/toco/tflite/operator_test.cc |   2 -
 9 files changed, 186 insertions(+), 127 deletions(-)

diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 6dd9cb39d2..7a7e20a41e 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -199,10 +199,6 @@ typedef struct {
 } TfLiteTransposeParams;
 
 typedef struct {
-  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
-  // For now we will fix the maximum possible number of dimensions.
-  int axis[8];
-  int num_axis_dimensions;
   bool keep_dims;
 } TfLiteMeanParams;
 
diff --git a/tensorflow/contrib/lite/kernels/mean.cc b/tensorflow/contrib/lite/kernels/mean.cc
index 540e5a364d..ec1c402027 100644
--- a/tensorflow/contrib/lite/kernels/mean.cc
+++ b/tensorflow/contrib/lite/kernels/mean.cc
@@ -35,10 +35,12 @@ struct MeanContext {
   MeanContext(TfLiteContext* context, TfLiteNode* node) {
     params = reinterpret_cast<TfLiteMeanParams*>(node->builtin_data);
     input = GetInput(context, node, 0);
+    axis = GetInput(context, node, 1);
     output = GetOutput(context, node, 0);
   }
   TfLiteMeanParams* params;
   TfLiteTensor* input;
+  TfLiteTensor* axis;
   TfLiteTensor* output;
 };
 
@@ -54,45 +56,26 @@ void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<int*>(buffer);
 }
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
-  MeanContext op_context(context, node);
-  int input_num_dims = NumDimensions(op_context.input);
-  int axis_num_dims = op_context.params->num_axis_dimensions;
-
-  // Creates a temp index to iterate through input data.
-  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
-  TfLiteIntArrayFree(node->temporaries);
-  node->temporaries = TfLiteIntArrayCreate(2);
-  node->temporaries->data[0] = *scratch_tensor_index;
-  TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]];
-  scratch_tensor->type = kTfLiteInt32;
-  scratch_tensor->allocation_type = kTfLiteArenaRw;
-  TfLiteIntArray* index_size = TfLiteIntArrayCreate(1);
-  index_size->data[0] = input_num_dims;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, scratch_tensor, index_size));
-
-  // Creates a temp tensor to store resolved axis given input data.
-  node->temporaries->data[1] = *scratch_tensor_index + 1;
-  TfLiteTensor* axis_tensor = &context->tensors[node->temporaries->data[1]];
-  axis_tensor->type = kTfLiteInt32;
-  axis_tensor->allocation_type = kTfLiteArenaRw;
+// Resizes the temp tensor that stores resolved axis.
+TfLiteStatus ResizeTempAxis(TfLiteContext* context, MeanContext* op_context,
+                            TfLiteTensor* resolved_axis) {
   TfLiteIntArray* axis_size = TfLiteIntArrayCreate(1);
-  axis_size->data[0] = op_context.params->num_axis_dimensions;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, axis_tensor, axis_size));
+  axis_size->data[0] = static_cast<int>(NumElements(op_context->axis));
+  return context->ResizeTensor(context, resolved_axis, axis_size);
+}
 
-  // Determines size of output tensor.
-  const TfLiteIntArray* input_dims = op_context.input->dims;
-  const int* axis = op_context.params->axis;
-  if (op_context.params->keep_dims) {
+// Resizes output array based on the input size and resolved axis.
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                MeanContext* op_context) {
+  size_t num_axis = NumElements(op_context->axis);
+  const TfLiteIntArray* input_dims = op_context->input->dims;
+  int input_num_dims = NumDimensions(op_context->input);
+  const int* axis = GetTensorData<int>(op_context->axis);
+  if (op_context->params->keep_dims) {
     TfLiteIntArray* output_dims = TfLiteIntArrayCreate(input_num_dims);
     for (int idx = 0; idx < input_num_dims; ++idx) {
       bool is_axis = false;
-      for (int axis_idx = 0; axis_idx < axis_num_dims; ++axis_idx) {
+      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
         if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) {
           is_axis = true;
           break;
@@ -104,11 +87,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         output_dims->data[idx] = input_dims->data[idx];
       }
     }
-    return context->ResizeTensor(context, op_context.output, output_dims);
+    return context->ResizeTensor(context, op_context->output, output_dims);
   } else {
     // Calculates size of reducing axis.
-    int num_reduce_axis = axis_num_dims;
-    for (int i = 0; i < axis_num_dims; ++i) {
+    int num_reduce_axis = num_axis;
+    for (int i = 0; i < num_axis; ++i) {
       int current = axis[i];
       if (current < 0) {
         current += input_num_dims;
@@ -131,7 +114,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     int num_skip_axis = 0;
     for (int idx = 0; idx < input_num_dims; ++idx) {
       bool is_axis = false;
-      for (int axis_idx = 0; axis_idx < axis_num_dims; ++axis_idx) {
+      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
         if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) {
           ++num_skip_axis;
           is_axis = true;
@@ -142,24 +125,76 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         output_dims->data[idx - num_skip_axis] = input_dims->data[idx];
       }
     }
-    return context->ResizeTensor(context, op_context.output, output_dims);
+    return context->ResizeTensor(context, op_context->output, output_dims);
+  }
+}
+
+// Initializes temp tensors to store index and resolved axis.
+TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
+                                   MeanContext* op_context) {
+  // Creates a temp index to iterate through input data.
+  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+  TfLiteIntArrayFree(node->temporaries);
+  node->temporaries = TfLiteIntArrayCreate(2);
+  node->temporaries->data[0] = *scratch_tensor_index;
+  TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]];
+  scratch_tensor->type = kTfLiteInt32;
+  scratch_tensor->allocation_type = kTfLiteArenaRw;
+  TfLiteIntArray* index_size = TfLiteIntArrayCreate(1);
+  index_size->data[0] = NumDimensions(op_context->input);
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, scratch_tensor, index_size));
+
+  // Creates a temp tensor to store resolved axis given input data.
+  node->temporaries->data[1] = *scratch_tensor_index + 1;
+  TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  resolved_axis->type = kTfLiteInt32;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  MeanContext op_context(context, node);
+  TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context));
+
+  TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  // Leaves work to Eval if axis is not constant; else resizes output.
+  if (!IsConstantTensor(op_context.axis)) {
+    SetTensorToDynamic(op_context.output);
+    SetTensorToDynamic(resolved_axis);
+    return kTfLiteOk;
   }
+  resolved_axis->allocation_type = kTfLiteArenaRw;
+  TF_LITE_ENSURE_OK(context,
+                    ResizeTempAxis(context, &op_context, resolved_axis));
+  return ResizeOutputTensor(context, &op_context);
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   MeanContext op_context(context, node);
+  int num_axis = static_cast<int>(NumElements(op_context.axis));
   TfLiteTensor* temp_index = &context->tensors[node->temporaries->data[0]];
   TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeTempAxis(context, &op_context, resolved_axis));
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TfLiteTensorRealloc(resolved_axis->bytes, resolved_axis);
+    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
+  }
 
-#define TF_LITE_MEAN(kernel_type, data_type)                           \
-  kernel_type::Mean<>(                                                 \
-      GetTensorData<data_type>(op_context.input),                      \
-      op_context.input->dims->data, op_context.input->dims->size,      \
-      GetTensorData<data_type>(op_context.output),                     \
-      op_context.output->dims->data, op_context.output->dims->size,    \
-      op_context.params->axis, op_context.params->num_axis_dimensions, \
-      op_context.params->keep_dims, GetTensorData<int>(temp_index),    \
+#define TF_LITE_MEAN(kernel_type, data_type)                        \
+  kernel_type::Mean<>(                                              \
+      GetTensorData<data_type>(op_context.input),                   \
+      op_context.input->dims->data, op_context.input->dims->size,   \
+      GetTensorData<data_type>(op_context.output),                  \
+      op_context.output->dims->data, op_context.output->dims->size, \
+      GetTensorData<int>(op_context.axis), num_axis,                \
+      op_context.params->keep_dims, GetTensorData<int>(temp_index), \
       GetTensorData<int>(resolved_axis))
 
   if (kernel_type == kReference) {
diff --git a/tensorflow/contrib/lite/kernels/mean_test.cc b/tensorflow/contrib/lite/kernels/mean_test.cc
index 4305c0632f..c4c53c2ded 100644
--- a/tensorflow/contrib/lite/kernels/mean_test.cc
+++ b/tensorflow/contrib/lite/kernels/mean_test.cc
@@ -25,58 +25,108 @@ using ::testing::ElementsAreArray;
 
 class BaseMeanOpModel : public SingleOpModel {
  public:
-  BaseMeanOpModel(const TensorData& input, const TensorData& output,
-                  std::initializer_list<int> axis, bool keep_dims) {
-    input_ = AddInput(input);
-    output_ = AddOutput(output);
-    SetBuiltinOp(
-        BuiltinOperator_MEAN, BuiltinOptions_MeanOptions,
-        CreateMeanOptions(builder_, builder_.CreateVector<int>(axis), keep_dims)
-            .Union());
-    BuildInterpreter({GetShape(input_)});
+  void SetAxis(std::initializer_list<int> data) { PopulateTensor(axis_, data); }
+
+  template <class T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor(input_, data);
   }
 
-  int input() { return input_; }
+  template <class T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
  protected:
   int input_;
+  int axis_;
   int output_;
 };
 
-class FloatMeanOpModel : public BaseMeanOpModel {
+// Model for the tests case where axis is a const tensor.
+class MeanOpConstModel : public BaseMeanOpModel {
  public:
-  using BaseMeanOpModel::BaseMeanOpModel;
-
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
+  MeanOpConstModel(const TensorData& input, const TensorData& output,
+                   std::initializer_list<int> axis_shape,
+                   std::initializer_list<int> axis, bool keep_dims) {
+    input_ = AddInput(input);
+    axis_ = AddConstInput(TensorType_INT32, axis, axis_shape);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions,
+                 CreateMeanOptions(builder_, keep_dims).Union());
+    BuildInterpreter({GetShape(input_)});
   }
+};
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+// Model for the tests case where axis is a dynamic tensor.
+class MeanOpDynamicModel : public BaseMeanOpModel {
+ public:
+  MeanOpDynamicModel(const TensorData& input, const TensorData& output,
+                     const TensorData& axis, bool keep_dims) {
+    input_ = AddInput(input);
+    axis_ = AddInput(axis);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions,
+                 CreateMeanOptions(builder_, keep_dims).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
 };
 
-TEST(FloatMeanOpTest, NotKeepDims) {
+TEST(ConstMeanOpTest, NotKeepDims) {
+  std::initializer_list<float> data = {
+      1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
+      13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}},
+                     {4}, {1, 0, -3, -3}, false);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13})));
+}
+
+TEST(ConstMeanOpTest, KeepDims) {
+  std::initializer_list<float> data = {
+      1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
+      13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}},
+                     {2}, {0, 2}, true);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
+}
+
+TEST(DynamicMeanOpTest, NotKeepDims) {
   std::initializer_list<float> data = {
       1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
       13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
-  FloatMeanOpModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}},
-                     {1, 0, -3, -3}, false);
+  MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}},
+                       {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}},
+                       false);
+  std::initializer_list<int> axis = {1, 0, -3, -3};
+  m.SetAxis(axis);
   m.SetInput(data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({12, 13})));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13})));
 }
 
-TEST(FloatMeanOpTest, KeepDims) {
+TEST(DynamicMeanOpTest, KeepDims) {
   std::initializer_list<float> data = {
       1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
       13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
-  FloatMeanOpModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}},
-                     {0, 2}, true);
+  MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}},
+                       {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}},
+                       true);
+  std::initializer_list<int> axis = {0, 2};
+  m.SetAxis(axis);
   m.SetInput(data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
 }
 
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 64b8f55097..c82ae27953 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -544,11 +544,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_MEAN: {
       auto* params = MallocPOD<TfLiteMeanParams>();
       if (auto* schema_params = op->builtin_options_as_MeanOptions()) {
-        const auto& axis = schema_params->axis();
-        FlatBufferIntVectorToArray(sizeof(params->axis), axis, params->axis,
-                                   error_reporter);
         params->keep_dims = schema_params->keep_dims();
-        params->num_axis_dimensions = axis->Length();
       }
       builtin_data = reinterpret_cast<void*>(params);
       break;
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index f6217567f8..91eac2ab48 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -333,7 +333,6 @@ table TransposeOptions {
 }
 
 table MeanOptions {
-  axis:[int];
   keep_dims: bool;
 }
 
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index ad758529a3..a8370b34c6 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -3388,21 +3388,16 @@ flatbuffers::Offset<TransposeOptions> CreateTransposeOptions(
 
 struct MeanOptionsT : public flatbuffers::NativeTable {
   typedef MeanOptions TableType;
-  std::vector<int32_t> axis;
   bool keep_dims;
   MeanOptionsT() : keep_dims(false) {}
 };
 
 struct MeanOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef MeanOptionsT NativeTableType;
-  enum { VT_AXIS = 4, VT_KEEP_DIMS = 6 };
-  const flatbuffers::Vector<int32_t> *axis() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_AXIS);
-  }
+  enum { VT_KEEP_DIMS = 4 };
   bool keep_dims() const { return GetField<uint8_t>(VT_KEEP_DIMS, 0) != 0; }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_AXIS) &&
-           verifier.Verify(axis()) &&
+    return VerifyTableStart(verifier) &&
            VerifyField<uint8_t>(verifier, VT_KEEP_DIMS) && verifier.EndTable();
   }
   MeanOptionsT *UnPack(
@@ -3418,9 +3413,6 @@ struct MeanOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
 struct MeanOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_axis(flatbuffers::Offset<flatbuffers::Vector<int32_t>> axis) {
-    fbb_.AddOffset(MeanOptions::VT_AXIS, axis);
-  }
   void add_keep_dims(bool keep_dims) {
     fbb_.AddElement<uint8_t>(MeanOptions::VT_KEEP_DIMS,
                              static_cast<uint8_t>(keep_dims), 0);
@@ -3438,22 +3430,12 @@ struct MeanOptionsBuilder {
 };
 
 inline flatbuffers::Offset<MeanOptions> CreateMeanOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> axis = 0,
-    bool keep_dims = false) {
+    flatbuffers::FlatBufferBuilder &_fbb, bool keep_dims = false) {
   MeanOptionsBuilder builder_(_fbb);
-  builder_.add_axis(axis);
   builder_.add_keep_dims(keep_dims);
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<MeanOptions> CreateMeanOptionsDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int32_t> *axis = nullptr, bool keep_dims = false) {
-  return tflite::CreateMeanOptions(
-      _fbb, axis ? _fbb.CreateVector<int32_t>(*axis) : 0, keep_dims);
-}
-
 flatbuffers::Offset<MeanOptions> CreateMeanOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const MeanOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -6039,15 +6021,6 @@ inline void MeanOptions::UnPackTo(
     MeanOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  {
-    auto _e = axis();
-    if (_e) {
-      _o->axis.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->axis[_i] = _e->Get(_i);
-      }
-    }
-  };
   {
     auto _e = keep_dims();
     _o->keep_dims = _e;
@@ -6071,9 +6044,8 @@ inline flatbuffers::Offset<MeanOptions> CreateMeanOptions(
     const flatbuffers::rehasher_function_t *__rehasher;
   } _va = {&_fbb, _o, _rehasher};
   (void)_va;
-  auto _axis = _o->axis.size() ? _fbb.CreateVector(_o->axis) : 0;
   auto _keep_dims = _o->keep_dims;
-  return tflite::CreateMeanOptions(_fbb, _axis, _keep_dims);
+  return tflite::CreateMeanOptions(_fbb, _keep_dims);
 }
 
 inline SqueezeOptionsT *SqueezeOptions::UnPack(
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 4ae6ccb765..fc8149bef9 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -695,6 +695,7 @@ def make_mean_tests(zip_path):
           [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1], [-1, 0],
           [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
       ],
+      "const_axis": [True, False],
       "keep_dims": [True, False],
   }, {
       "input_dtype": [tf.float32, tf.int32, tf.int64],
@@ -705,6 +706,7 @@ def make_mean_tests(zip_path):
           -3, -4, [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2],
           [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
       ],
+      "const_axis": [True, False],
       "keep_dims": [True, False],
   }]
 
@@ -714,17 +716,31 @@ def make_mean_tests(zip_path):
         dtype=parameters["input_dtype"],
         name="input",
         shape=parameters["input_shape"])
+
+    # Get axis as either a placeholder or constants.
+    if parameters["const_axis"]:
+      axis = parameters["axis"]
+      input_tensors = [input_tensor]
+    else:
+      if isinstance(parameters["axis"], list):
+        shape = [len(parameters["axis"])]
+      else:
+        shape = [0]  # shape for None or integers.
+      axis = tf.placeholder(dtype=tf.int32, name="axis", shape=shape)
+      input_tensors = [input_tensor, axis]
+
     out = tf.reduce_mean(
-        input_tensor,
-        axis=parameters["axis"],
-        keep_dims=parameters["keep_dims"])
-    return [input_tensor], [out]
+        input_tensor, axis=axis, keep_dims=parameters["keep_dims"])
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(parameters["input_dtype"],
-                                      parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    values = [
+        create_tensor_data(parameters["input_dtype"], parameters["input_shape"])
+    ]
+    if not parameters["const_axis"]:
+      if parameters["axis"]:
+        values.append(np.array(parameters["axis"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 853a9f46c2..e33a5788d8 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -548,14 +548,11 @@ class Mean : public BuiltinOperator<MeanOperator, ::tflite::MeanOptions,
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    auto axis = builder->CreateVector(op.axis);
-    return ::tflite::CreateMeanOptions(*builder, axis, op.keep_dims);
+    return ::tflite::CreateMeanOptions(*builder, op.keep_dims);
   }
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {
-    op->axis.insert(op->axis.end(), options.axis()->begin(),
-                    options.axis()->end());
     op->keep_dims = options.keep_dims();
   }
 };
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 4df9071095..b4ec7bbd50 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -134,12 +134,10 @@ TEST_F(OperatorTest, BuiltinSpaceToBatchND) {
 
 TEST_F(OperatorTest, BuiltinMean) {
   MeanOperator op;
-  op.axis = {1, 2};
   op.keep_dims = false;
 
   auto output_toco_op =
       SerializeAndDeserialize(GetOperator("MEAN", OperatorType::kMean), op);
-  EXPECT_EQ(op.axis, output_toco_op->axis);
   EXPECT_EQ(op.keep_dims, output_toco_op->keep_dims);
 }
 
-- 
GitLab


From 5fb13ffc145af5a9c707a1388c38dd45f793b0a0 Mon Sep 17 00:00:00 2001
From: Adam Roberts <adarob@google.com>
Date: Mon, 29 Jan 2018 11:51:08 -0800
Subject: [PATCH 332/966] Add input and sequence_length accessor to
 TrainingHelper.

PiperOrigin-RevId: 183701716
---
 tensorflow/contrib/seq2seq/python/ops/helper.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index ef3722ee41..3245cc5e72 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -184,6 +184,7 @@ class TrainingHelper(Helper):
     """
     with ops.name_scope(name, "TrainingHelper", [inputs, sequence_length]):
       inputs = ops.convert_to_tensor(inputs, name="inputs")
+      self._inputs = inputs
       if not time_major:
         inputs = nest.map_structure(_transpose_batch_time, inputs)
 
@@ -200,6 +201,14 @@ class TrainingHelper(Helper):
 
       self._batch_size = array_ops.size(sequence_length)
 
+  @property
+  def inputs(self):
+    return self._inputs
+
+  @property
+  def sequence_length(self):
+    return self._sequence_length
+
   @property
   def batch_size(self):
     return self._batch_size
-- 
GitLab


From 8bfaa9213b640201b6886f3f245a1ad1a7461030 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 29 Jan 2018 12:09:05 -0800
Subject: [PATCH 333/966] [tf.data] Handle `tf.SparseTensor` elements in the
 stats ops.

PiperOrigin-RevId: 183705008
---
 tensorflow/contrib/data/python/ops/stats_ops.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py
index 1dd0729513..9cd1701c39 100644
--- a/tensorflow/contrib/data/python/ops/stats_ops.py
+++ b/tensorflow/contrib/data/python/ops/stats_ops.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
@@ -161,8 +162,10 @@ class _StatsDataset(dataset_ops.Dataset):
     return self._op_function(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._tag,
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
 
   @property
   def output_shapes(self):
-- 
GitLab


From 95a8af24058c168ce8a5327451e1cfcbc56461eb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 12:12:41 -0800
Subject: [PATCH 334/966] Ensure that non-recursive conversion is identity
 transformation wrt all types of function calls by only failing on unresolved
 symbols if they're needed. Simplify code structure all around. Remove the
 awkward activity analysis that deemed a function parameter as "modified".
 Consolidate activity analysis by tracking function parameters and returned
 symbols separately. Strengthen the type inference a little by using more
 interpret-like constructs.

PiperOrigin-RevId: 183705547
---
 tensorflow/contrib/py2tf/conversion.py        |  33 ++--
 .../contrib/py2tf/converters/call_trees.py    | 159 ++++++++++--------
 .../py2tf/converters/call_trees_test.py       |  46 ++++-
 .../py2tf/converters/converter_test_base.py   |  19 ++-
 .../py2tf/converters/side_effect_guards.py    |   5 +-
 tensorflow/contrib/py2tf/naming.py            |  68 ++++----
 tensorflow/contrib/py2tf/naming_test.py       |  14 +-
 tensorflow/contrib/py2tf/pyct/context.py      |   3 +-
 tensorflow/contrib/py2tf/pyct/parser.py       |  10 +-
 tensorflow/contrib/py2tf/pyct/parser_test.py  |  11 +-
 .../py2tf/pyct/static_analysis/access.py      |  67 ++++++--
 .../py2tf/pyct/static_analysis/access_test.py |  42 ++---
 .../py2tf/pyct/static_analysis/live_values.py |  52 +++---
 .../pyct/static_analysis/live_values_test.py  |  56 ++++--
 .../py2tf/pyct/static_analysis/type_info.py   |  66 +++-----
 .../pyct/static_analysis/type_info_test.py    |  60 ++++---
 tensorflow/contrib/py2tf/pyct/templates.py    |   3 +-
 tensorflow/contrib/py2tf/pyct/transformer.py  |  17 +-
 18 files changed, 442 insertions(+), 289 deletions(-)

diff --git a/tensorflow/contrib/py2tf/conversion.py b/tensorflow/contrib/py2tf/conversion.py
index b484eebbd5..e277eadec4 100644
--- a/tensorflow/contrib/py2tf/conversion.py
+++ b/tensorflow/contrib/py2tf/conversion.py
@@ -171,7 +171,8 @@ def class_to_graph(c, conversion_map):
 def function_to_graph(f, conversion_map, arg_values, arg_types,
                       owner_type=None):
   """Specialization of `entity_to_graph` for callable functions."""
-  node = parser.parse_object(f).body[0]
+  node, source = parser.parse_entity(f)
+  node = node.body[0]
   namespace = six.get_function_globals(f)
 
   # This is needed for non-global functions.
@@ -185,28 +186,29 @@ def function_to_graph(f, conversion_map, arg_values, arg_types,
   namer = conversion_map.new_namer(namespace)
   ctx = context.EntityContext(
       namer=namer,
-      source_code=tf_inspect.getsource(f),
-      source_file=tf_inspect.getfile(f),
+      source_code=source,
+      source_file='<fragment>',
       namespace=namespace,
       arg_values=arg_values,
-      arg_types=arg_types)
+      arg_types=arg_types,
+      recursive=conversion_map.recursive)
   node = node_to_graph(node, ctx, conversion_map.nocompile_decorators)
 
-  # Simulate a rename to ensure the top level is in the name map. This is needed
-  # for top level functions, and it also helps the consistency verification made
-  # by update_name_map.
-  if owner_type is not None:
-    new_name = namer.compiled_function_name(f.__name__, f, owner_type)
-  else:
-    new_name = namer.compiled_function_name(f.__name__, f)
+  # TODO(mdan): This somewhat duplicates the call rename logic in call_treest.py
+  new_name, did_rename = namer.compiled_function_name(f.__name__, f, owner_type)
+  if not did_rename:
+    new_name = f.__name__
+    if node.name != f.__name__:
+      raise NotImplementedError('Strange corner case. Send us offending code!')
+
   node.name = new_name
   conversion_map.update_name_map(namer)
-  return node, conversion_map.name_map[f]
+  return node, new_name
 
 
 def _static_analysis_pass(node, ctx):
-  node = access.resolve(node)
-  node = live_values.resolve(node, ctx.namespace, config.PYTHON_LITERALS)
+  node = access.resolve(node, ctx)
+  node = live_values.resolve(node, ctx, config.PYTHON_LITERALS)
   node = type_info.resolve(node, ctx)
   return node
 
@@ -259,8 +261,7 @@ def node_to_graph(node, ctx, nocompile_decorators):
 
   node = _static_analysis_pass(node, ctx)
   node = print_functions.transform(node)
-  node = call_trees.transform(node, ctx.namer, ctx.namespace,
-                              config.DEFAULT_UNCOMPILED_MODULES,
+  node = call_trees.transform(node, ctx, config.DEFAULT_UNCOMPILED_MODULES,
                               nocompile_decorators)
   node = control_flow.transform(node, ctx.namer)
   node = logical_expressions.transform(node)
diff --git a/tensorflow/contrib/py2tf/converters/call_trees.py b/tensorflow/contrib/py2tf/converters/call_trees.py
index 0aae030450..4c238b7fb9 100644
--- a/tensorflow/contrib/py2tf/converters/call_trees.py
+++ b/tensorflow/contrib/py2tf/converters/call_trees.py
@@ -29,46 +29,46 @@ import gast
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.python.util import tf_inspect
 
 
 class FunctionNamer(object):
   """Describes the interface for CallTreeTransformer's namer."""
 
   def compiled_function_name(self,
-                             original_name,
-                             live_object=None,
+                             original_fqn,
+                             live_entity=None,
                              owner_type=None):
     """Generate the name corresponding to the compiled version of a function.
 
     Args:
-      original_name: String
-      live_object: Callable, the actual target function, if known.
+      original_fqn: string or tuple(string)
+      live_entity: Callable, the actual target function, if known.
       owner_type: Optional object. If present, it indicates that the function is
           a member of the given type.
     Returns:
-      String.
+      string, bool
     """
     raise NotImplementedError()
 
-  def compiled_class_name(self, original_name, live_object=None):
+  def compiled_class_name(self, original_fqn, live_entity=None):
     """Generate the name corresponding to the compiled version of a class.
 
     Args:
-      original_name: String
-      live_object: The actual target class, if known.
+      original_fqn: string or tuple(string)
+      live_entity: The actual target class, if known.
     Returns:
-      String.
+      string
     """
     raise NotImplementedError()
 
 
-class CallTreeTransformer(gast.NodeTransformer):
+class CallTreeTransformer(transformer.Base):
   """Transforms the call tree by renaming transformed symbols."""
 
-  def __init__(self, namer, namespace, uncompiled_modules,
-               nocompile_decorators):
-    self.namer = namer
-    self.namespace = namespace
+  def __init__(self, context, uncompiled_modules, nocompile_decorators):
+    super(CallTreeTransformer, self).__init__(context)
     self.uncompiled_modules = uncompiled_modules
     self.nocompile_decorators = nocompile_decorators
 
@@ -78,7 +78,7 @@ class CallTreeTransformer(gast.NodeTransformer):
     if isinstance(node, gast.Call):
       return self._resolve_name(node.func)
     if isinstance(node, gast.Name):
-      return self.namespace.get(node.id)
+      return self.context.namespace.get(node.id)
     if isinstance(node, gast.Attribute):
       parent = self._resolve_name(node.value)
       if parent is not None:
@@ -91,8 +91,12 @@ class CallTreeTransformer(gast.NodeTransformer):
     if anno.hasanno(node, 'live_val'):
       return anno.getanno(node, 'live_val')
     if isinstance(node, gast.Attribute) and anno.hasanno(node, 'type'):
-      member = getattr(anno.getanno(node, 'type'), node.attr)
-      return member
+      owner_type = anno.getanno(node, 'type')
+      if hasattr(owner_type, node.attr):
+        return getattr(owner_type, node.attr)
+      else:
+        raise ValueError('Type "%s" has not attribute "%s". Is it dynamic?' %
+                         (owner_type, node.attr))
     return None
 
   def _should_compile(self, node, fqn):
@@ -106,14 +110,14 @@ class CallTreeTransformer(gast.NodeTransformer):
 
     # The decorators themselves are not to be converted.
     # If present, the decorators should appear as static functions.
-    target_obj = self._try_resolve_target(node.func)
-    if target_obj is not None:
+    target_entity = self._try_resolve_target(node.func)
+    if target_entity is not None:
       # This attribute is set by the decorator itself.
       # TODO(mdan): This may not play nicely with other wrapping decorators.
-      if hasattr(target_obj, '__pyct_is_compile_decorator'):
+      if hasattr(target_entity, '__pyct_is_compile_decorator'):
         return False
 
-      if target_obj in self.nocompile_decorators:
+      if target_entity in self.nocompile_decorators:
         return False
 
       # Inspect the target function decorators. If any include a @convert
@@ -122,7 +126,8 @@ class CallTreeTransformer(gast.NodeTransformer):
       # To parse and re-analize each function for every call site could be quite
       # wasteful. Maybe we could cache the parsed AST?
       try:
-        target_node = parser.parse_object(target_obj).body[0]
+        target_node, _ = parser.parse_entity(target_entity)
+        target_node = target_node.body[0]
       except TypeError:
         # Functions whose source we cannot access are compilable (e.g. wrapped
         # to py_func).
@@ -136,48 +141,57 @@ class CallTreeTransformer(gast.NodeTransformer):
 
     return True
 
+  def _determine_function_owner(self, m):
+    # TODO(mdan): The parent type should be known at analysis. Use that instead.
+    if hasattr(m, 'im_class'):  # Python 2
+      return m.im_class
+    if hasattr(m, '__qualname__'):  # Python 3
+      # Object attributes: should be bound to "self".
+      if hasattr(m, '__self__'):
+        return type(m.__self__)
+
+      # Class attributes: should have the owner name in their namespace.
+      qn = m.__qualname__.split('.')
+      if len(qn) < 2:
+        return None
+      owner_name, func_name = qn[-2:]
+      if func_name != m.__name__:
+        raise ValueError('Inconsistent names detected '
+                         '(__qualname__[1] = "%s", __name__ = "%s") for %s.' %
+                         (func_name, m.__name__, m))
+      if owner_name == '<locals>':
+        return None
+      if owner_name not in self.context.namespace:
+        raise ValueError(
+            'Could not resolve name "%s" while analyzing %s. Namespace:\n%s' %
+            (owner_name, m, self.context.namespace))
+      return self.context.namespace[owner_name]
+    return None
+
   def _rename_compilable_function(self, node):
     assert anno.hasanno(node.func, 'live_val')
     assert anno.hasanno(node.func, 'fqn')
-    target_obj = anno.getanno(node.func, 'live_val')
+    target_entity = anno.getanno(node.func, 'live_val')
     target_fqn = anno.getanno(node.func, 'fqn')
 
     if not self._should_compile(node, target_fqn):
       return node
 
     if anno.hasanno(node, 'is_constructor'):
-      new_name = self.namer.compiled_class_name(
-          '__'.join(target_fqn), live_object=target_obj)
+      new_name = self.context.namer.compiled_class_name(
+          target_fqn, live_entity=target_entity)
+      do_rename = True
     else:
-      new_name = self.namer.compiled_function_name(
-          '__'.join(target_fqn), live_object=target_obj)
-    node.func = gast.Name(new_name, gast.Load(), None)
-    return node
-
-  def _rename_member_function_of_known_type(self, node):
-    assert isinstance(node.func, gast.Attribute)
-
-    type_fqn = anno.getanno(node.func, 'type_fqn')
-    assert anno.hasanno(node.func, 'type')
-    target_type = anno.getanno(node.func, 'type')
-
-    if not self._should_compile(node, type_fqn):
-      return node
-
-    # TODO(mdan): We should not assume that the namer only needs the
-    # member function name.
-    method_name = node.func.attr
-    method_object = getattr(target_type, method_name)
-    new_name = self.namer.compiled_function_name(
-        method_name, live_object=method_object, owner_type=target_type)
-    if new_name != node.func.attr:
-      # If a member function call is renamed, then the new function is no
-      # longer bound to the target object. We then refactor the call from:
-      #   foo.bar(...)
-      # to:
-      #   renamed_foo(bar, ...)
-      # TODO(mdan): This risks causing duplication, if target_type is renamed.
-      node.args = [node.func.value] + node.args
+      owner_type = self._determine_function_owner(target_entity)
+      new_name, do_rename = self.context.namer.compiled_function_name(
+          target_fqn, live_entity=target_entity, owner_type=owner_type)
+
+    if do_rename:
+      if target_entity is not None:
+        if tf_inspect.ismethod(target_entity):
+          # The renaming process will transform it into a regular function.
+          # TODO(mdan): Is this complete? How does it work with nested members?
+          node.args = [node.func.value] + node.args
       node.func = gast.Name(new_name, gast.Load(), None)
     return node
 
@@ -193,7 +207,7 @@ class CallTreeTransformer(gast.NodeTransformer):
     wrapper_def, call_expr = templates.replace(
         template,
         call=node.func,
-        wrapper=self.namer.compiled_function_name(node.func.id),
+        wrapper=self.context.namer.compiled_function_name(node.func.id)[0],
         args=tuple(gast.Name(n, gast.Load(), None) for n in args_scope.used))
     anno.setanno(call_expr.value, 'args_scope', args_scope)
     # TODO(mdan): Rename this annotation to 'graph_ready'
@@ -201,15 +215,15 @@ class CallTreeTransformer(gast.NodeTransformer):
 
     return (wrapper_def, call_expr)
 
-  def _function_is_compilable(self, target_obj):
+  def _function_is_compilable(self, target_entity):
     # TODO(mdan): This is just a placeholder. Implement.
-    return not isinstance(target_obj, types.BuiltinFunctionType)
+    return not isinstance(target_entity, types.BuiltinFunctionType)
 
   def visit_Expr(self, node):
     if isinstance(node.value, gast.Call):
       if anno.hasanno(node.value.func, 'live_val'):
-        target_obj = anno.getanno(node.value.func, 'live_val')
-        if not self._function_is_compilable(target_obj):
+        target_entity = anno.getanno(node.value.func, 'live_val')
+        if not self._function_is_compilable(target_entity):
           if anno.hasanno(node.value.func, 'fqn'):
             target_fqn = anno.getanno(node.value.func, 'fqn')
             if not self._should_compile(node.value, target_fqn):
@@ -227,8 +241,8 @@ class CallTreeTransformer(gast.NodeTransformer):
     # If the function is wrapped by one of the marker decorators,
     # consider it graph ready.
     if anno.hasanno(node.func, 'live_val'):
-      target_obj = anno.getanno(node.func, 'live_val')
-      if target_obj in self.nocompile_decorators:
+      target_entity = anno.getanno(node.func, 'live_val')
+      if target_entity in self.nocompile_decorators:
         if len(node.args) < 1:
           raise ValueError(
               'Found call to decorator function "%s", but it had no arguments. '
@@ -237,28 +251,28 @@ class CallTreeTransformer(gast.NodeTransformer):
 
     self.generic_visit(node)
     if anno.hasanno(node.func, 'live_val'):
-      target_obj = anno.getanno(node.func, 'live_val')
-      if self._function_is_compilable(target_obj):
+      target_entity = anno.getanno(node.func, 'live_val')
+      if self._function_is_compilable(target_entity):
         node = self._rename_compilable_function(node)
       else:
         raise NotImplementedError('py_func with return values')
-    elif anno.hasanno(node.func, 'type_fqn'):
-      node = self._rename_member_function_of_known_type(node)
     else:
-      raise NotImplementedError(
-          'Member function call (of unknown type): %s.' % node.func.id)
+      if self.context.recursive:
+        raise NotImplementedError('Could not resolve target function.')
+      else:
+        # TODO(mdan): Double check. Is this reachable code?
+        pass
     return node
 
   # pylint:enable=invalid-name
 
 
-def transform(node, namer, namespace, uncompiled_modules, nocompile_decorators):
+def transform(node, context, uncompiled_modules, nocompile_decorators):
   """Transform function call to the compiled counterparts.
 
   Args:
     node: AST to transform.
-    namer: FunctionNamer-like.
-    namespace: Dict mapping symbol names to their corresponding live objects.
+    context: An EntityContext object.
     uncompiled_modules: set of string tuples, each tuple represents the fully
         qualified name of a package containing functions that will not be
         compiled.
@@ -269,7 +283,6 @@ def transform(node, namer, namespace, uncompiled_modules, nocompile_decorators):
         node: The transformed AST
         new_names: set(string), containing any newly-generated names
   """
-  transformer = CallTreeTransformer(namer, namespace, uncompiled_modules,
-                                    nocompile_decorators)
-  node = transformer.visit(node)
+  t = CallTreeTransformer(context, uncompiled_modules, nocompile_decorators)
+  node = t.visit(node)
   return node
diff --git a/tensorflow/contrib/py2tf/converters/call_trees_test.py b/tensorflow/contrib/py2tf/converters/call_trees_test.py
index 8cb8d7be0f..e63c10de0f 100644
--- a/tensorflow/contrib/py2tf/converters/call_trees_test.py
+++ b/tensorflow/contrib/py2tf/converters/call_trees_test.py
@@ -28,8 +28,13 @@ from tensorflow.python.platform import test
 
 class TestNamer(call_trees.FunctionNamer):
 
-  def compiled_function_name(self, original_name, live_object=None):
-    return 'renamed_%s' % original_name
+  def compiled_function_name(self,
+                             original_fqn,
+                             live_entity=None,
+                             owner_type=None):
+    if owner_type is not None:
+      return None, False
+    return ('renamed_%s' % '_'.join(original_fqn)), True
 
 
 class CallTreesTest(converter_test_base.TestCase):
@@ -45,14 +50,35 @@ class CallTreesTest(converter_test_base.TestCase):
     def test_fn_2(a):
       return test_fn_1(a) + 1
 
-    node = self.parse_and_analyze(test_fn_2, {'test_fn_1': test_fn_1})
-    node = call_trees.transform(node, TestNamer(), {}, (), ())
+    node = self.parse_and_analyze(
+        test_fn_2, {'test_fn_1': test_fn_1}, namer=TestNamer())
+    node = call_trees.transform(node, self.ctx, (), ())
     result = compiler.ast_to_object(node)
     # Only test_fn_2 is transformed, so we'll insert renamed_test_fn_1 manually.
     setattr(result, 'renamed_test_fn_1', renamed_test_fn_1)
 
     self.assertEquals(3, result.test_fn_2(1))
 
+  def test_simple_methods(self):
+
+    class TestClass(object):
+
+      def test_fn_1(self, a):
+        return a + 1
+
+      def test_fn_2(self, a):
+        return self.test_fn_1(a) + 1
+
+    node = self.parse_and_analyze(
+        TestClass.test_fn_2, {'TestClass': TestClass},
+        namer=TestNamer(),
+        arg_types={'self': (TestClass.__name__, TestClass)})
+    node = call_trees.transform(node, self.ctx, (), ())
+    result = compiler.ast_to_object(node)
+
+    tc = TestClass()
+    self.assertEquals(3, result.test_fn_2(tc, 1))
+
   def test_uncompiled_modules(self):
 
     def test_fn(a):
@@ -60,11 +86,13 @@ class CallTreesTest(converter_test_base.TestCase):
       a = math_ops.add(a, constant_op.constant(1))
       return a
 
-    node = self.parse_and_analyze(test_fn, {
-        'math_ops': math_ops,
-        'constant_op': constant_op
-    })
-    node = call_trees.transform(node, TestNamer(), {},
+    node = self.parse_and_analyze(
+        test_fn, {
+            'math_ops': math_ops,
+            'constant_op': constant_op
+        },
+        namer=TestNamer())
+    node = call_trees.transform(node, self.ctx,
                                 set(((math_ops.__name__,),
                                      (constant_op.__name__,))), ())
     result = compiler.ast_to_object(node)
diff --git a/tensorflow/contrib/py2tf/converters/converter_test_base.py b/tensorflow/contrib/py2tf/converters/converter_test_base.py
index ed006bad6d..6bfa55443c 100644
--- a/tensorflow/contrib/py2tf/converters/converter_test_base.py
+++ b/tensorflow/contrib/py2tf/converters/converter_test_base.py
@@ -31,18 +31,23 @@ class TestCase(test.TestCase):
   def parse_and_analyze(self,
                         test_fn,
                         namespace,
+                        namer=None,
                         arg_types=None,
-                        include_type_analysis=True):
+                        include_type_analysis=True,
+                        recursive=True):
+    node, source = parser.parse_entity(test_fn)
     ctx = context.EntityContext(
-        namer=None,
-        source_code=None,
+        namer=namer,
+        source_code=source,
         source_file=None,
         namespace=namespace,
         arg_values=None,
-        arg_types=arg_types)
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
+        arg_types=arg_types,
+        recursive=recursive)
+    node = access.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
     if include_type_analysis:
       node = type_info.resolve(node, ctx)
+      node = live_values.resolve(node, ctx, {})
+    self.ctx = ctx
     return node
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
index a88828ff80..46a2269c20 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
@@ -94,6 +94,7 @@ class SideEffectGuardTransformer(gast.NodeTransformer):
     return node
 
   def _gate_symbols(self, guard_statement, guarded_args):
+    # TODO(mdan): This won't work for variables.
     template = """
       (args,) = (tf.identity(a) for a in (args,))
     """
@@ -133,8 +134,8 @@ class SideEffectGuardTransformer(gast.NodeTransformer):
       # First, attempt to gate future evaluation of args. If that's not
       # possible, gate all remaining statements (and that may fail too, see
       # _visit_and_reindent.
-      guarded_args = tuple(
-          n for n in args_scope.used if n in args_scope.parent.modified)
+      guarded_args = tuple(args_scope.used & (args_scope.parent.modified
+                                              | args_scope.parent.returned))
       if guarded_args:
         node = tuple(statements[:-1]) + (
             self._gate_symbols(control_deps_guard, guarded_args),)
diff --git a/tensorflow/contrib/py2tf/naming.py b/tensorflow/contrib/py2tf/naming.py
index a90758962b..5c7e4c5f95 100644
--- a/tensorflow/contrib/py2tf/naming.py
+++ b/tensorflow/contrib/py2tf/naming.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.util import tf_inspect
-
 
 class Namer(object):
   """Implementation of the namer interfaces required by various converters.
@@ -45,10 +43,15 @@ class Namer(object):
 
     self.generated_names = set()
 
-  def compiled_class_name(self, original_name, live_object=None):
+  def compiled_class_name(self, original_fqn, live_entity=None):
     """See call_trees.FunctionNamer.compiled_class_name."""
-    if live_object is not None and live_object in self.renamed_calls:
-      return self.renamed_calls[live_object]
+    if live_entity is not None and live_entity in self.renamed_calls:
+      return self.renamed_calls[live_entity]
+
+    if isinstance(original_fqn, tuple):
+      original_name = '__'.join(original_fqn)
+    else:
+      original_name = original_fqn
 
     new_name_root = 'Tf%s' % original_name
     new_name = new_name_root
@@ -57,41 +60,46 @@ class Namer(object):
       n += 1
       new_name = '%s_%d' % (new_name_root, n)
 
-    if live_object is not None:
-      self.renamed_calls[live_object] = new_name
+    if live_entity is not None:
+      self.renamed_calls[live_entity] = new_name
     self.generated_names.add(new_name)
+    if live_entity is not None:
+      self.renamed_calls[live_entity] = new_name
     return new_name
 
   def compiled_function_name(self,
-                             original_name,
-                             live_object=None,
+                             original_fqn,
+                             live_entity=None,
                              owner_type=None):
     """See call_trees.FunctionNamer.compiled_function_name."""
-    if live_object is not None and live_object in self.renamed_calls:
-      return self.renamed_calls[live_object]
 
     if not self.recursive:
-      new_name = original_name
-    elif owner_type is None or owner_type in self.partial_types:
-      # Top level functions: rename
-      new_name_root = 'tf__%s' % original_name
-      new_name = new_name_root
-      n = 0
-      while new_name in self.global_namespace:
-        n += 1
-        new_name = '%s_%d' % (new_name_root, n)
+      return None, False
+
+    if owner_type is not None and owner_type not in self.partial_types:
+      # Members are not renamed when part of an entire converted class.
+      return None, False
+
+    if isinstance(original_fqn, tuple):
+      original_name = '__'.join(original_fqn)
     else:
-      if tf_inspect.isclass(owner_type):
-        # Class members: do not rename (the entire class will be renamed)
-        new_name = original_name
-      else:
-        raise NotImplementedError('Member function "%s" of non-class type: %s' %
-                                  (original_name, owner_type))
-
-    if live_object is not None:
-      self.renamed_calls[live_object] = new_name
+      original_name = original_fqn
+
+    if live_entity is not None and live_entity in self.renamed_calls:
+      return self.renamed_calls[live_entity], True
+
+    new_name_root = 'tf__%s' % original_name
+    new_name = new_name_root
+    n = 0
+    while new_name in self.global_namespace:
+      n += 1
+      new_name = '%s_%d' % (new_name_root, n)
+
+    if live_entity is not None:
+      self.renamed_calls[live_entity] = new_name
     self.generated_names.add(new_name)
-    return new_name
+
+    return new_name, True
 
   def new_symbol(self, name_root, reserved_locals):
     """See control_flow.SymbolNamer.new_symbol."""
diff --git a/tensorflow/contrib/py2tf/naming_test.py b/tensorflow/contrib/py2tf/naming_test.py
index 7bfc9b8733..5cf0a3da2c 100644
--- a/tensorflow/contrib/py2tf/naming_test.py
+++ b/tensorflow/contrib/py2tf/naming_test.py
@@ -29,8 +29,9 @@ class NamerTest(test.TestCase):
       pass
 
     namer = naming.Namer({}, True, None, ())
-    self.assertEqual('tf__foo', namer.compiled_function_name('foo'))
-    self.assertEqual('tf__bar', namer.compiled_function_name('bar', bar))
+    self.assertEqual(('tf__foo', True), namer.compiled_function_name('foo'))
+    self.assertEqual(('tf__bar', True), namer.compiled_function_name(
+        'bar', bar))
     self.assertEqual({bar: 'tf__bar'}, namer.renamed_calls)
     self.assertItemsEqual(('tf__bar', 'tf__foo'), namer.generated_names)
 
@@ -39,15 +40,18 @@ class NamerTest(test.TestCase):
       pass
 
     namer = naming.Namer({}, True, None, ())
-    self.assertEqual('tf__foo', namer.compiled_function_name('foo', foo))
-    self.assertEqual('tf__foo', namer.compiled_function_name('foo', foo))
+    self.assertEqual(('tf__foo', True), namer.compiled_function_name(
+        'foo', foo))
+    self.assertEqual(('tf__foo', True), namer.compiled_function_name(
+        'foo', foo))
 
   def test_compiled_function_name_avoids_global_conflicts(self):
     def foo():
       pass
 
     namer = naming.Namer({'tf__foo': 1}, True, None, ())
-    self.assertEqual('tf__foo_1', namer.compiled_function_name('foo', foo))
+    self.assertEqual(('tf__foo_1', True),
+                     namer.compiled_function_name('foo', foo))
 
   def test_new_symbol_tracks_names(self):
     namer = naming.Namer({}, True, None, ())
diff --git a/tensorflow/contrib/py2tf/pyct/context.py b/tensorflow/contrib/py2tf/pyct/context.py
index 73f3613d09..fef74ebefa 100644
--- a/tensorflow/contrib/py2tf/pyct/context.py
+++ b/tensorflow/contrib/py2tf/pyct/context.py
@@ -33,10 +33,11 @@ class EntityContext(object):
   """
 
   def __init__(self, namer, source_code, source_file, namespace, arg_values,
-               arg_types):
+               arg_types, recursive):
     self.namer = namer
     self.source_code = source_code
     self.source_file = source_file
     self.namespace = namespace
     self.arg_values = {} if arg_values is None else arg_values
     self.arg_types = {} if arg_types is None else arg_types
+    self.recursive = recursive
diff --git a/tensorflow/contrib/py2tf/pyct/parser.py b/tensorflow/contrib/py2tf/pyct/parser.py
index 3daa69b9ce..dc7df883b3 100644
--- a/tensorflow/contrib/py2tf/pyct/parser.py
+++ b/tensorflow/contrib/py2tf/pyct/parser.py
@@ -28,11 +28,13 @@ import gast
 from tensorflow.python.util import tf_inspect
 
 
-def parse_object(obj):
-  """Return the AST of given object."""
-  return parse_str(tf_inspect.getsource(obj))
+def parse_entity(entity):
+  """Return the AST of given entity."""
+  source = tf_inspect.getsource(entity)
+  source = textwrap.dedent(source)
+  return parse_str(source), source
 
 
 def parse_str(src):
   """Return the AST of given piece of code."""
-  return gast.parse(textwrap.dedent(src))
+  return gast.parse(src)
diff --git a/tensorflow/contrib/py2tf/pyct/parser_test.py b/tensorflow/contrib/py2tf/pyct/parser_test.py
index 46f9aa8207..f35dfa04c7 100644
--- a/tensorflow/contrib/py2tf/pyct/parser_test.py
+++ b/tensorflow/contrib/py2tf/pyct/parser_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import textwrap
+
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.python.platform import test
 
@@ -28,15 +30,16 @@ def f(x):
 
 class ParserTest(test.TestCase):
 
-  def test_parse_object(self):
-    mod = parser.parse_object(f)
+  def test_parse_entity(self):
+    mod, _ = parser.parse_entity(f)
     self.assertEqual('f', mod.body[0].name)
 
   def test_parse_str(self):
-    mod = parser.parse_str("""
+    mod = parser.parse_str(
+        textwrap.dedent("""
         def f(x):
           return x + 1
-    """)
+    """))
     self.assertEqual('f', mod.body[0].name)
 
 
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/access.py b/tensorflow/contrib/py2tf/pyct/static_analysis/access.py
index 8f3ac48b68..33629f87d1 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/access.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/access.py
@@ -23,6 +23,7 @@ import copy
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import transformer
 
 # TODO(mdan): Add support for PY3 (e.g. Param vs arg).
 
@@ -53,6 +54,8 @@ class Scope(object):
     self.modified = set()
     self.created = set()
     self.used = set()
+    self.params = set()
+    self.returned = set()
 
   # TODO(mdan): Rename to `locals`
   @property
@@ -69,42 +72,73 @@ class Scope(object):
     self.modified = copy.copy(other.modified)
     self.created = copy.copy(other.created)
     self.used = copy.copy(other.used)
+    self.params = copy.copy(other.params)
+    self.returned = copy.copy(other.returned)
 
   def merge_from(self, other):
     self.modified |= other.modified
     self.created |= other.created
     self.used |= other.used
+    self.params |= other.params
+    self.returned |= other.returned
 
   def has(self, name):
-    if name in self.modified:
+    if name in self.modified or name in self.params:
       return True
     elif self.parent is not None:
       return self.parent.has(name)
     return False
 
+  def is_modified_since_entry(self, name):
+    if name in self.modified:
+      return True
+    elif self.parent is not None and not self.isolated:
+      return self.parent.is_modified_since_entry(name)
+    return False
+
+  def is_param(self, name):
+    if name in self.params:
+      return True
+    elif self.parent is not None and not self.isolated:
+      return self.parent.is_param(name)
+    return False
+
   def mark_read(self, name):
     self.used.add(name)
     if self.parent is not None and name not in self.created:
       self.parent.mark_read(name)
 
+  def mark_param(self, name):
+    self.params.add(name)
+
+  def mark_creation(self, name):
+    self.created.add(name)
+
   def mark_write(self, name):
     self.modified.add(name)
     if self.isolated:
-      self.created.add(name)
+      self.mark_creation(name)
     else:
       if self.parent is None:
-        self.created.add(name)
+        self.mark_creation(name)
       else:
         if not self.parent.has(name):
-          self.created.add(name)
+          self.mark_creation(name)
         self.parent.mark_write(name)
 
+  def mark_returned(self, name):
+    self.returned.add(name)
+    if not self.isolated and self.parent is not None:
+      self.parent.mark_returned(name)
+
 
-class AccessResolver(gast.NodeTransformer):
+class AccessResolver(transformer.Base):
   """Annotates nodes with local scope information. See Scope."""
 
-  def __init__(self):
+  def __init__(self, context):
+    super(AccessResolver, self).__init__(context)
     self.scope = Scope(None)
+    self._in_return_statement = False
 
   def visit_Name(self, node):
     # TODO(mdan): This is insufficient for object fields, e.g. hp.learning_rate.
@@ -120,10 +154,17 @@ class AccessResolver(gast.NodeTransformer):
       # TODO(mdan): This bay be incorrect with nested functions.
       # For nested functions, we'll have to add the notion of hiding args from
       # the parent scope, not writing to them.
-      self.scope.mark_write(node.id)
+      self.scope.mark_creation(node.id)
+      self.scope.mark_param(node.id)
     else:
       raise ValueError('Unknown context %s for node %s.' % (type(node.ctx),
                                                             node.id))
+    anno.setanno(node, 'is_modified_since_entry',
+                 self.scope.is_modified_since_entry(node.id))
+    anno.setanno(node, 'is_param', self.scope.is_param(node.id))
+
+    if self._in_return_statement:
+      self.scope.mark_returned(node.id)
     return node
 
   def visit_Print(self, node):
@@ -138,7 +179,7 @@ class AccessResolver(gast.NodeTransformer):
 
   def visit_Call(self, node):
     current_scope = self.scope
-    args_scope = Scope(current_scope)
+    args_scope = Scope(current_scope, isolated=False)
     self.scope = args_scope
     for n in node.args:
       self.visit(n)
@@ -200,6 +241,12 @@ class AccessResolver(gast.NodeTransformer):
         node, ((node.body, 'body'), (node.orelse, 'orelse')))
     return node
 
+  def visit_Return(self, node):
+    self._in_return_statement = True
+    node = self.generic_visit(node)
+    self._in_return_statement = False
+    return node
+
 
-def resolve(node):
-  return AccessResolver().visit(node)
+def resolve(node, context):
+  return AccessResolver(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py
index 0912ebb4c3..df0283b54d 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.python.platform import test
@@ -95,6 +96,19 @@ class ScopeTest(test.TestCase):
 
 class AccessResolverTest(test.TestCase):
 
+  def _parse_and_analyze(self, test_fn):
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=source,
+        source_file=None,
+        namespace={},
+        arg_values=None,
+        arg_types=None,
+        recursive=True)
+    node = access.resolve(node, ctx)
+    return node
+
   def test_local_markers(self):
 
     def test_fn(a):  # pylint:disable=unused-argument
@@ -103,9 +117,7 @@ class AccessResolverTest(test.TestCase):
         b -= 1
       return b
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     self.assertFalse(anno.getanno(node.body[0].body[0].value,
                                   'is_local'))  # c in b = c
     self.assertTrue(anno.getanno(node.body[0].body[1].test.left,
@@ -126,9 +138,7 @@ class AccessResolverTest(test.TestCase):
       print(a, b)
       return c
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     print_node = node.body[0].body[2]
     if isinstance(print_node, gast.Print):
       # Python 2
@@ -151,9 +161,7 @@ class AccessResolverTest(test.TestCase):
       foo(a, b)  # pylint:disable=undefined-variable
       return c
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     call_node = node.body[0].body[2].value
     # We basically need to detect which variables are captured by the call
     # arguments.
@@ -169,15 +177,13 @@ class AccessResolverTest(test.TestCase):
         b -= 1
       return b, c
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     while_node = node.body[0].body[1]
     self.assertScopeIs(
         anno.getanno(while_node, 'body_scope'), ('b',), ('b', 'c'), ('c',))
     self.assertScopeIs(
         anno.getanno(while_node, 'body_parent_scope'), ('a', 'b', 'c'),
-        ('a', 'b', 'c'), ('a', 'b', 'c'))
+        ('b', 'c'), ('a', 'b', 'c'))
 
   def test_for(self):
 
@@ -188,15 +194,13 @@ class AccessResolverTest(test.TestCase):
         b -= 1
       return b, c
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     for_node = node.body[0].body[1]
     self.assertScopeIs(
         anno.getanno(for_node, 'body_scope'), ('b',), ('b', 'c'), ('c',))
     self.assertScopeIs(
         anno.getanno(for_node, 'body_parent_scope'), ('a', 'b', 'c'),
-        ('a', 'b', 'c', '_'), ('a', 'b', 'c', '_'))
+        ('b', 'c', '_'), ('a', 'b', 'c', '_'))
 
   def test_if(self):
 
@@ -211,9 +215,7 @@ class AccessResolverTest(test.TestCase):
         u = -y
       return z, u
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-
+    node = self._parse_and_analyze(test_fn)
     if_node = node.body[0].body[0]
     self.assertScopeIs(
         anno.getanno(if_node, 'body_scope'), ('x', 'y'), ('x', 'y', 'z'),
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
index 242e544b52..5a2903e6b5 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
@@ -26,26 +26,19 @@ from __future__ import print_function
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import transformer
 
 
-class LiveValueResolver(gast.NodeTransformer):
+class LiveValueResolver(transformer.Base):
   """Annotates nodes with live values."""
 
-  def __init__(self, namespace, literals):
-    """Create a new resolver.
-
-    Args:
-      namespace: A dict representing the namespace visible to the AST in the
-          intended execution context.
-      literals: A dict mapping literal lymbol names to their value. An example
-          literal is "None".
-    """
-    self.namespace = namespace
+  def __init__(self, context, literals):
+    super(LiveValueResolver, self).__init__(context)
     self.literals = literals
 
   def visit_ClassDef(self, node):
     self.generic_visit(node)
-    anno.setanno(node, 'live_val', self.namespace[node.name])
+    anno.setanno(node, 'live_val', self.context.namespace[node.name])
     return node
 
   def visit_Name(self, node):
@@ -53,20 +46,31 @@ class LiveValueResolver(gast.NodeTransformer):
     if isinstance(node.ctx, gast.Load):
       assert anno.hasanno(node, 'is_local'), node
       symbol_is_local = anno.getanno(node, 'is_local')
-      if not symbol_is_local:
+      assert anno.hasanno(node, 'is_modified_since_entry'), node
+      symbol_is_modified = anno.getanno(node, 'is_modified_since_entry')
+      assert anno.hasanno(node, 'is_param'), node
+      symbol_is_param = anno.getanno(node, 'is_param')
+
+      if not symbol_is_local and not symbol_is_param:
         if node.id in self.literals:
           anno.setanno(node, 'live_val', self.literals[node.id])
           # TODO(mdan): Could live values have FQNs? i.e. 'a'.join()
-        elif node.id in self.namespace:
-          obj = self.namespace[node.id]
+        elif node.id in self.context.namespace:
+          obj = self.context.namespace[node.id]
           anno.setanno(node, 'live_val', obj)
           anno.setanno(node, 'fqn', (obj.__name__,))
         else:
-          raise ValueError('Could not find global symbol %s.' % node.id)
+          raise ValueError('Could not resolve symbol "%s".' % node.id)
       else:
         pass
         # TODO(mdan): Attempt to trace its value through the local chain.
         # TODO(mdan): Use type annotations as fallback.
+
+      if not symbol_is_modified:
+        if node.id in self.context.arg_values:
+          obj = self.context.arg_values[node.id]
+          anno.setanno(node, 'live_val', obj)
+          anno.setanno(node, 'fqn', (obj.__class__.__name__,))
     return node
 
   def visit_Attribute(self, node):
@@ -79,15 +83,25 @@ class LiveValueResolver(gast.NodeTransformer):
                                                          node.attr))
       anno.setanno(node, 'live_val', getattr(parent_object, node.attr))
       anno.setanno(node, 'fqn', anno.getanno(node.value, 'fqn') + (node.attr,))
+    # TODO(mdan): Investigate the role built-in annotations can play here.
+    elif anno.hasanno(node.value, 'type'):
+      parent_type = anno.getanno(node.value, 'type')
+      if hasattr(parent_type, node.attr):
+        # This should hold for static members like methods.
+        # This would not hold for dynamic members like function attributes.
+        # For the dynamic case, we simply leave the node without an annotation,
+        # and let downstream consumers figure out what to do.
+        anno.setanno(node, 'live_val', getattr(parent_type, node.attr))
+        anno.setanno(node, 'fqn',
+                     anno.getanno(node.value, 'type_fqn') + (node.attr,))
     elif isinstance(node.value, gast.Name):
       stem_name = node.value
       # All nonlocal symbols should be fully resolved.
       assert anno.hasanno(stem_name, 'is_local'), stem_name
-      assert anno.getanno(stem_name, 'is_local'), stem_name
       # TODO(mdan): Figure out what to do when calling attribute on local object
       # Maybe just leave as-is?
     return node
 
 
-def resolve(node, namespace, literals):
-  return LiveValueResolver(namespace, literals).visit(node)
+def resolve(node, context, literals):
+  return LiveValueResolver(context, literals).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
index e77497654a..f3057b3466 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
@@ -19,24 +19,45 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
+from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
 
 
 class LiveValuesResolverTest(test.TestCase):
 
+  def _parse_and_analyze(self,
+                         test_fn,
+                         namespace,
+                         literals=None,
+                         arg_types=None):
+    literals = literals or {}
+    arg_types = arg_types or {}
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=source,
+        source_file=None,
+        namespace=namespace,
+        arg_values=None,
+        arg_types=arg_types,
+        recursive=True)
+    node = access.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, literals)
+    node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, literals)
+    return node
+
   def test_literals(self):
 
     def test_fn():
       return Foo  # pylint: disable=undefined-variable
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {}, {'Foo': 'bar'})
-
+    node = self._parse_and_analyze(test_fn, {}, {'Foo': 'bar'})
     retval_node = node.body[0].body[0].value
     self.assertEquals('bar', anno.getanno(retval_node, 'live_val'))
 
@@ -48,10 +69,7 @@ class LiveValuesResolverTest(test.TestCase):
     def test_fn():
       return foo()
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'foo': foo}, {})
-
+    node = self._parse_and_analyze(test_fn, {'foo': foo})
     func_node = node.body[0].body[0].value.func
     self.assertEquals(foo, anno.getanno(func_node, 'live_val'))
     self.assertEquals(('foo',), anno.getanno(func_node, 'fqn'))
@@ -61,15 +79,29 @@ class LiveValuesResolverTest(test.TestCase):
     def test_fn():
       return constant_op.constant(0)
 
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, {'constant_op': constant_op}, {})
-
+    node = self._parse_and_analyze(test_fn, {'constant_op': constant_op})
     func_node = node.body[0].body[0].value.func
     self.assertEquals(constant_op.constant, anno.getanno(func_node, 'live_val'))
     self.assertEquals((constant_op.__name__, 'constant'),
                       anno.getanno(func_node, 'fqn'))
 
+  def test_attributes_with_type_hints(self):
+
+    class TestClass(object):
+
+      def member(self):
+        pass
+
+      def test_fn(self):
+        return self.member()
+
+    node = self._parse_and_analyze(
+        TestClass.test_fn, {'constant_op': constant_op},
+        arg_types={'self': (TestClass.__name__, TestClass)})
+    func_node = node.body[0].body[0].value.func
+    self.assertEquals(TestClass.member, anno.getanno(func_node, 'live_val'))
+    self.assertEquals(('TestClass', 'member'), anno.getanno(func_node, 'fqn'))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
index 0042aa90ed..cf74142cbe 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
@@ -36,8 +36,6 @@ class Scope(object):
         most recently assigned to the symbol.
   """
 
-  # TODO(mdan): Should rather use a CFG here?
-
   def __init__(self, parent):
     """Create a new scope.
 
@@ -117,18 +115,32 @@ class TypeInfoResolver(transformer.Base):
     node.orelse = self._visit_block(node.orelse)
     return node
 
+  def _process_function_arg(self, arg_name):
+    if self.function_level == 1 and arg_name in self.context.arg_types:
+      # Forge a node to hold the type information, so that method calls on
+      # it can resolve the type.
+      type_holder = gast.Name(arg_name, gast.Load(), None)
+      type_string, type_obj = self.context.arg_types[arg_name]
+      anno.setanno(type_holder, 'type', type_obj)
+      anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.')))
+      self.scope.setval(arg_name, type_holder)
+
+  def visit_arg(self, node):
+    self._process_function_arg(node.arg)
+    return node
+
   def visit_Name(self, node):
     self.generic_visit(node)
     if isinstance(node.ctx, gast.Param):
-      self.scope.setval(node.id, gast.Name(node.id, gast.Load(), None))
-      if self.function_level == 1 and node.id in self.context.arg_types:
-        # Forge a node to hold the type information, so that method calls on
-        # it can resolve the type.
-        type_holder = gast.Name(node.id, gast.Load(), None)
-        type_string, type_obj = self.context.arg_types[node.id]
-        anno.setanno(type_holder, 'type', type_obj)
-        anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.')))
-        self.scope.setval(node.id, type_holder)
+      self._process_function_arg(node.id)
+    elif isinstance(node.ctx, gast.Load) and self.scope.hasval(node.id):
+      # E.g. if we had
+      # a = b
+      # then for future references to `a` we should have traced_source = `b`
+      traced_source = self.scope.getval(node.id)
+      if anno.hasanno(traced_source, 'type'):
+        anno.setanno(node, 'type', anno.getanno(traced_source, 'type'))
+        anno.setanno(node, 'type_fqn', anno.getanno(traced_source, 'type_fqn'))
     return node
 
   def _process_variable_assignment(self, source, targets):
@@ -172,38 +184,6 @@ class TypeInfoResolver(transformer.Base):
     self._process_variable_assignment(node.value, node.targets)
     return node
 
-  def visit_Call(self, node):
-    target = node.func
-    if not anno.hasanno(target, 'live_val'):
-      if not isinstance(target, gast.Attribute):
-        # Suspecting this pattern would reach here:
-        #   foo = bar
-        #   foo()
-        raise ValueError('Dont know how to handle dynamic functions.')
-      if not isinstance(target.value, gast.Name):
-        # Possible example of this kind:
-        #   foo = module.Foo()
-        #   foo.bar.baz()
-        # TODO(mdan): This should be doable by using the FQN.
-        raise ValueError('Dont know how to handle object properties yet.')
-      # In the example below, object_source is 'tr.train.Optimizer()':
-      #   opt = tf.train.Optimizer()
-      #   opt.foo()
-      if self.scope.hasval(target.value.id):
-        object_source = self.scope.getval(target.value.id)
-        if not anno.hasanno(object_source, 'type'):
-          raise ValueError('Could not determine type of "%s". Is it dynamic?' %
-                           (target.value.id))
-        anno.setanno(target, 'type', anno.getanno(object_source, 'type'))
-        anno.setanno(target, 'type_fqn', anno.getanno(object_source,
-                                                      'type_fqn'))
-      else:
-        # TODO(mdan): Figure out what could the user do to get past this.
-        raise ValueError('No info on "%s". Is it dynamically built?' %
-                         (target.value.id))
-    self.generic_visit(node)
-    return node
-
 
 def resolve(node, context):
   return TypeInfoResolver(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
index a491f49ca3..68fa1ee92a 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct import transformer
 from tensorflow.contrib.py2tf.pyct.static_analysis import access
 from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
 from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
@@ -57,17 +56,19 @@ class ScopeTest(test.TestCase):
 class TypeInfoResolverTest(test.TestCase):
 
   def _parse_and_analyze(self, test_fn, namespace, arg_types=None):
+    node, source = parser.parse_entity(test_fn)
     ctx = context.EntityContext(
         namer=None,
-        source_code=None,
+        source_code=source,
         source_file=None,
         namespace=namespace,
         arg_values=None,
-        arg_types=arg_types)
-    node = parser.parse_object(test_fn)
-    node = access.resolve(node)
-    node = live_values.resolve(node, namespace, {})
+        arg_types=arg_types,
+        recursive=True)
+    node = access.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
     node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
     return node
 
   def test_constructor_detection(self):
@@ -83,16 +84,16 @@ class TypeInfoResolverTest(test.TestCase):
     self.assertEquals((training.__name__, 'GradientDescentOptimizer'),
                       anno.getanno(call_node, 'type_fqn'))
 
-  def test_class_members(self):
+  def test_class_members_of_detected_constructor(self):
 
     def test_fn():
       opt = training.GradientDescentOptimizer(0.1)
       opt.minimize(0)
 
     node = self._parse_and_analyze(test_fn, {'training': training})
-    attr_call_node = node.body[0].body[1].value.func
-    self.assertEquals((training.__name__, 'GradientDescentOptimizer'),
-                      anno.getanno(attr_call_node, 'type_fqn'))
+    method_call = node.body[0].body[1].value.func
+    self.assertEquals(training.GradientDescentOptimizer.minimize,
+                      anno.getanno(method_call, 'live_val'))
 
   def test_class_members_in_with_stmt(self):
 
@@ -106,11 +107,11 @@ class TypeInfoResolverTest(test.TestCase):
     self.assertEquals((session.__name__, 'Session'),
                       anno.getanno(constructor_call, 'type_fqn'))
 
-    member_call = node.body[0].body[0].body[0].value.func
-    self.assertEquals((session.__name__, 'Session'),
-                      anno.getanno(member_call, 'type_fqn'))
+    method_call = node.body[0].body[0].body[0].value.func
+    self.assertEquals(session.Session.run, anno.getanno(method_call,
+                                                        'live_val'))
 
-  def test_constructor_deta_dependent(self):
+  def test_constructor_data_dependent(self):
 
     def test_fn(x):
       if x > 0:
@@ -119,16 +120,18 @@ class TypeInfoResolverTest(test.TestCase):
         opt = training.GradientDescentOptimizer(0.01)
       opt.minimize(0)
 
-    with self.assertRaises(transformer.PyFlowParseError):
-      self._parse_and_analyze(test_fn, {'training': training})
+    node = self._parse_and_analyze(test_fn, {'training': training})
+    method_call = node.body[0].body[1].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
   def test_parameter_class_members(self):
 
     def test_fn(opt):
       opt.minimize(0)
 
-    with self.assertRaises(transformer.PyFlowParseError):
-      self._parse_and_analyze(test_fn, {'training': training})
+    node = self._parse_and_analyze(test_fn, {})
+    method_call = node.body[0].body[0].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
   def test_parameter_class_members_with_value_hints(self):
 
@@ -138,14 +141,13 @@ class TypeInfoResolverTest(test.TestCase):
     node = self._parse_and_analyze(
         test_fn, {'training': training},
         arg_types={
-            'opt': (('%s.GradientDescentOptimizer' % training.__name__),
-                    training.GradientDescentOptimizer(0.1))
+            'opt': (training.GradientDescentOptimizer.__name__,
+                    training.GradientDescentOptimizer)
         })
 
-    attr_call_node = node.body[0].body[0].value.func
-    self.assertEquals(
-        tuple(training.__name__.split('.')) + ('GradientDescentOptimizer',),
-        anno.getanno(attr_call_node, 'type_fqn'))
+    method_call = node.body[0].body[0].value.func
+    self.assertEquals(training.GradientDescentOptimizer.minimize,
+                      anno.getanno(method_call, 'live_val'))
 
   def test_function_variables(self):
 
@@ -156,8 +158,9 @@ class TypeInfoResolverTest(test.TestCase):
       foo = bar
       foo()
 
-    with self.assertRaises(transformer.PyFlowParseError):
-      self._parse_and_analyze(test_fn, {'bar': bar})
+    node = self._parse_and_analyze(test_fn, {'bar': bar})
+    method_call = node.body[0].body[1].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
   def test_nested_members(self):
 
@@ -165,8 +168,9 @@ class TypeInfoResolverTest(test.TestCase):
       foo = training.GradientDescentOptimizer(0.1)
       foo.bar.baz()
 
-    with self.assertRaises(transformer.PyFlowParseError):
-      self._parse_and_analyze(test_fn, {'training': training})
+    node = self._parse_and_analyze(test_fn, {'training': training})
+    method_call = node.body[0].body[1].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/pyct/templates.py b/tensorflow/contrib/py2tf/pyct/templates.py
index 77c5fbe02a..6be526f20d 100644
--- a/tensorflow/contrib/py2tf/pyct/templates.py
+++ b/tensorflow/contrib/py2tf/pyct/templates.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 
 import ast
 import copy
+import textwrap
 
 import gast
 
@@ -119,7 +120,7 @@ def replace(template, **replacements):
   """
   if not isinstance(template, str):
     raise ValueError('Expected string template, got %s' % type(template))
-  tree = parser.parse_str(template)
+  tree = parser.parse_str(textwrap.dedent(template))
   for k in replacements:
     replacements[k] = _strings_to_names(replacements[k])
   return ReplaceTransformer(replacements).visit(tree).body
diff --git a/tensorflow/contrib/py2tf/pyct/transformer.py b/tensorflow/contrib/py2tf/pyct/transformer.py
index d5aa23eaeb..8a836b7c1b 100644
--- a/tensorflow/contrib/py2tf/pyct/transformer.py
+++ b/tensorflow/contrib/py2tf/pyct/transformer.py
@@ -18,7 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
+
 import gast
+import six
 
 from tensorflow.contrib.py2tf.pyct import pretty_printer
 
@@ -48,11 +51,15 @@ class Base(gast.NodeTransformer):
         self._lineno = node.lineno
         self._col_offset = node.col_offset
       return super(Base, self).visit(node)
-    except ValueError as e:
-      msg = '%s\nOccurred at node:\n%s' % (str(e), pretty_printer.fmt(node))
+    except (ValueError, AttributeError, NotImplementedError) as e:
+      msg = '%s: %s\nOccurred at node:\n%s' % (e.__class__.__name__, str(e),
+                                               pretty_printer.fmt(node))
       if source_code:
-        line = self._source.splitlines()[self._lineno - 1]
+        line = source_code.splitlines()[self._lineno - 1]
       else:
         line = '<no source available>'
-      raise PyFlowParseError(
-          msg, (source_file, self._lineno, self._col_offset + 1, line))
+      six.reraise(PyFlowParseError,
+                  PyFlowParseError(
+                      msg,
+                      (source_file, self._lineno, self._col_offset + 1, line)),
+                  sys.exc_info()[2])
-- 
GitLab


From 76ba86d308f1bb255fdca930deed19ae9233ee86 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 12:14:15 -0800
Subject: [PATCH 335/966] Pass maximum_iterations hint to tf.while_loop if
 possible.

PiperOrigin-RevId: 183705773
---
 .../python/ops/bijectors/masked_autoregressive.py         | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
index dc8ae1eed1..5251dbcb57 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
@@ -237,6 +237,11 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
       return y
 
     event_size = array_ops.shape(x)[-1]
+    # If the event size is available at graph construction time, we can inform
+    # the graph compiler of the maximum number of steps. If not,
+    # static_event_size will be None, and the maximum_iterations argument will
+    # have no effect.
+    static_event_size = x.shape.with_rank_at_least(1)[-1].value
     y0 = array_ops.zeros_like(x, name="y0")
     # call the template once to ensure creation
     _ = self._shift_and_log_scale_fn(y0)
@@ -258,7 +263,8 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector):
     _, y = control_flow_ops.while_loop(
         cond=lambda index, _: index < event_size,
         body=_loop_body,
-        loop_vars=[0, y0])
+        loop_vars=(0, y0),
+        maximum_iterations=static_event_size)
     return y
 
   def _inverse(self, y):
-- 
GitLab


From b78134d0d5ea7f17468bea9276c35fab4a9cb388 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 12:15:12 -0800
Subject: [PATCH 336/966] A few tweaks to add const to functions and member
 fields.

PiperOrigin-RevId: 183705898
---
 tensorflow/c/eager/BUILD                      |  1 +
 tensorflow/c/eager/c_api.cc                   | 10 +---------
 tensorflow/c/eager/c_api_internal.h           | 19 +++++++++++++------
 .../process_function_library_runtime.cc       |  2 +-
 .../process_function_library_runtime.h        |  2 +-
 5 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 74190cb135..e62310d811 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -46,6 +46,7 @@ tf_cuda_library(
         "//tensorflow/c:c_api",
         "//tensorflow/c:c_api_internal",
         "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index a76c8f5ec0..fd6cecd77b 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -85,15 +85,7 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
     return nullptr;
   }
 
-  TFE_Context* ret = new TFE_Context(session);
-  ret->policy = opts->policy;
-  ret->pflr.reset(new tensorflow::ProcessFunctionLibraryRuntime(
-      ret->session->device_mgr, opts->session_options.options.env,
-      TF_GRAPH_DEF_VERSION, &ret->func_lib_def, {}));
-  ret->rendezvous =
-      new tensorflow::IntraProcessRendezvous(ret->session->device_mgr);
-
-  return ret;
+  return new TFE_Context(*opts, session);
 }
 
 void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) {
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index a6f76c732f..dda68471a8 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/version.h"
 
 struct TFE_ContextOptions {
   TF_SessionOptions session_options;
@@ -43,9 +44,15 @@ struct TFE_ContextOptions {
 };
 
 struct TFE_Context {
-  explicit TFE_Context(TF_Session* s) : session(s) {}
+  explicit TFE_Context(const TFE_ContextOptions& opts, TF_Session* s)
+      : policy(opts.policy),
+        session(s),
+        rendezvous(new tensorflow::IntraProcessRendezvous(s->device_mgr)),
+        pflr(new tensorflow::ProcessFunctionLibraryRuntime(
+            session->device_mgr, opts.session_options.options.env,
+            TF_GRAPH_DEF_VERSION, &func_lib_def, {})) {}
 
-  TFE_ContextDevicePlacementPolicy policy;
+  const TFE_ContextDevicePlacementPolicy policy;
 
   // Note: we cannot use C++11 thread_local here as there is no concept of a
   // thread-local-object-local variable in C++11.
@@ -54,8 +61,8 @@ struct TFE_Context {
       thread_local_policies GUARDED_BY(policy_map_mu);
 
   // TFE_Context is an extension of TF_Session. And TF_Session needs a TF_Graph.
-  TF_Session* session;
-  tensorflow::Rendezvous* rendezvous;
+  TF_Session* const session;
+  tensorflow::Rendezvous* const rendezvous;
 
   tensorflow::mutex functions_mu;
   tensorflow::FunctionLibraryDefinition func_lib_def GUARDED_BY(functions_mu){
@@ -64,14 +71,14 @@ struct TFE_Context {
   // One FunctionLibraryRuntime per device.
   // func_libs[i] is the FunctionLibraryRuntime corresponding to
   // session->devices[i].
-  std::unique_ptr<tensorflow::ProcessFunctionLibraryRuntime> pflr;
+  const std::unique_ptr<tensorflow::ProcessFunctionLibraryRuntime> pflr;
 
   tensorflow::mutex cache_mu;
   std::unordered_map<tensorflow::Fprint128, tensorflow::KernelAndDevice*,
                      tensorflow::Fprint128Hasher>
       kernel_cache GUARDED_BY(cache_mu);
 
-  tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) {
+  tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) const {
     return pflr->GetFLR(d->name());
   }
 
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 12947e284a..dd4bf6a345 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -158,7 +158,7 @@ Status ProcessFunctionLibraryRuntime::GetDeviceContext(
 }
 
 FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR(
-    const string& device_name) {
+    const string& device_name) const {
   Device* device = nullptr;
   if (device_name != kDefaultFLRDevice) {
     if (!device_mgr_->LookupDevice(device_name, &device).ok()) {
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index a1adc4b6b3..9c9c92f1ea 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -85,7 +85,7 @@ class ProcessFunctionLibraryRuntime {
 
   static const char kDefaultFLRDevice[];
   // Returns the FunctionLibraryRuntime for the corresponding device_name.
-  FunctionLibraryRuntime* GetFLR(const string& device_name);
+  FunctionLibraryRuntime* GetFLR(const string& device_name) const;
 
   // Returns the device incarnation for the given device_name.
   Status GetDeviceIncarnation(const string& device_name, int64* incarnation);
-- 
GitLab


From 0a34211774c8c45c8f290e6c51335b99873dcbb9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 12:42:09 -0800
Subject: [PATCH 337/966] Remove the trailing '/' in the tensor name when
 loading checkpoints

PiperOrigin-RevId: 183709590
---
 tensorflow/python/training/checkpoint_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index b5d3e78797..63235a1454 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -242,6 +242,9 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
           full_tensor_name = full_tensor_name[1:]
         if tensor_name_in_ckpt != "/":
           full_tensor_name = tensor_name_in_ckpt + full_tensor_name
+        # Remove trailing '/', if any, in the full_tensor_name
+        if full_tensor_name.endswith("/"):
+          full_tensor_name = full_tensor_name[:-1]
         if full_tensor_name not in variable_map:
           raise ValueError(
               "Tensor %s (%s in %s) is not found in %s checkpoint" % (
-- 
GitLab


From 1fcb66651d386e1430b3e83a3ce3d971256ffe70 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 12:43:46 -0800
Subject: [PATCH 338/966] Support shrink_axis_mask argument of StridedSlice Op
 for TfLite.

PiperOrigin-RevId: 183709796
---
 .../internal/reference/reference_ops.h        |  32 +++-
 .../contrib/lite/kernels/strided_slice.cc     |  37 ++--
 .../lite/kernels/strided_slice_test.cc        | 160 +++++++++++++++++-
 tensorflow/contrib/lite/testing/BUILD         |   2 +-
 .../contrib/lite/testing/generate_examples.py |  15 +-
 5 files changed, 216 insertions(+), 30 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 31bade26f9..4bcf4993e9 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -2370,13 +2370,15 @@ inline int StartIndex(int start, int stride, int dim, bool masked) {
   return masked ? (stride > 0 ? 0 : dim - 1) : start;
 }
 
-inline int StopIndex(int stop, int stride, int dim, bool masked) {
-  return masked ? (stride > 0 ? dim : -1) : stop;
+inline int StopIndex(int start, int stop, int stride, int dim, bool masked,
+                     bool shrink_axis_masked) {
+  return shrink_axis_masked ? stride > 0 ? start + 1 : start - 1
+                            : masked ? (stride > 0 ? dim : -1) : stop;
 }
 
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
-                         int begin_mask, int end_mask,
+                         int begin_mask, int end_mask, int shrink_axis_mask,
                          const std::vector<int>& starts,
                          const std::vector<int>& stops,
                          const std::vector<int>& strides, T* output_data,
@@ -2387,19 +2389,23 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
   const int start_b =
       StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8);
   const int stop_b =
-      StopIndex(stops[3], strides[3], input_dims.sizes[3], end_mask & 8);
+      StopIndex(start_b, stops[3], strides[3], input_dims.sizes[3],
+                end_mask & 8, shrink_axis_mask & 8);
   const int start_h =
       StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4);
   const int stop_h =
-      StopIndex(stops[2], strides[2], input_dims.sizes[2], end_mask & 4);
+      StopIndex(start_h, stops[2], strides[2], input_dims.sizes[2],
+                end_mask & 4, shrink_axis_mask & 4);
   const int start_w =
       StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2);
   const int stop_w =
-      StopIndex(stops[1], strides[1], input_dims.sizes[1], end_mask & 2);
+      StopIndex(start_w, stops[1], strides[1], input_dims.sizes[1],
+                end_mask & 2, shrink_axis_mask & 2);
   const int start_d =
       StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1);
   const int stop_d =
-      StopIndex(stops[0], strides[0], input_dims.sizes[0], end_mask & 1);
+      StopIndex(start_d, stops[0], strides[0], input_dims.sizes[0],
+                end_mask & 1, shrink_axis_mask & 1);
 
   T* out_ptr = output_data;
   for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]);
@@ -2417,6 +2423,18 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
+                         int begin_mask, int end_mask,
+                         const std::vector<int>& starts,
+                         const std::vector<int>& stops,
+                         const std::vector<int>& strides, T* output_data,
+                         const Dims<4>& output_dims) {
+  StridedSlice(input_data, input_dims, begin_mask, end_mask,
+               /*shrink_axis_mask=*/0, starts, stops, strides, output_data,
+               output_dims);
+}
+
 template <typename T>
 inline void Slice(const T* input_data, const Dims<4>& input_dims,
                   const std::vector<int>& begin, const std::vector<int>& size,
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
index 91ba4a9b78..c510ee3b9f 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -81,8 +81,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                      "ellipsis_mask is not implemented yet.");
   TF_LITE_ENSURE_MSG(context, op_context.params->new_axis_mask == 0,
                      "new_axis_mask is not implemented yet.");
-  TF_LITE_ENSURE_MSG(context, op_context.params->shrink_axis_mask == 0,
-                     "shrink_axis_mask is not implemented yet.");
 
   // TODO(soroosh): optimize for constant tensors to do allocation in Prepare
   op_context.output->allocation_type = kTfLiteDynamic;
@@ -153,9 +151,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   std::vector<int> starts;
   std::vector<int> stops;
   std::vector<int> strides;
+  std::vector<int> output_shape_vector;
 
-  // Determine size of output tensor and map indices
-  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(op_context.dims);
   for (int idx = op_context.dims - 1; idx >= 0; --idx) {
     int dim = op_context.input->dims->data[idx];
     int32_t stride = GetTensorData<int32_t>(op_context.strides)[idx];
@@ -174,14 +171,24 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                            pos_stride);
 
     // This is valid for both positive and negative strides
-    output_shape->data[idx] = ceil((end - begin) / static_cast<float>(stride));
-    output_shape->data[idx] =
-        output_shape->data[idx] < 0 ? 0 : output_shape->data[idx];
+    int32_t dim_shape = ceil((end - begin) / static_cast<float>(stride));
+    dim_shape = dim_shape < 0 ? 0 : dim_shape;
+
+    if (!(op_context.params->shrink_axis_mask & (1 << idx))) {
+      output_shape_vector.push_back(dim_shape);
+    }
+
     starts.emplace_back(begin);
     stops.emplace_back(end);
     strides.emplace_back(stride);
   }
 
+  TfLiteIntArray* output_shape =
+      TfLiteIntArrayCreate(output_shape_vector.size());
+
+  std::reverse_copy(output_shape_vector.begin(), output_shape_vector.end(),
+                    output_shape->data);
+
   for (int i = op_context.dims; i < kMaxDim; i++) {
     starts.emplace_back(0);
     stops.emplace_back(1);
@@ -202,13 +209,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       ReverseMaskBits(op_context.params->begin_mask, op_context.dims);
   op_context.params->end_mask =
       ReverseMaskBits(op_context.params->end_mask, op_context.dims);
-
-#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                 \
-  kernel_type::StridedSlice(                                          \
-      GetTensorData<data_type>(op_context.input),                     \
-      GetTensorDims(op_context.input), op_context.params->begin_mask, \
-      op_context.params->end_mask, starts, stops, strides,            \
-      GetTensorData<data_type>(op_context.output),                    \
+  op_context.params->shrink_axis_mask =
+      ReverseMaskBits(op_context.params->shrink_axis_mask, op_context.dims);
+
+#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                      \
+  kernel_type::StridedSlice(                                               \
+      GetTensorData<data_type>(op_context.input),                          \
+      GetTensorDims(op_context.input), op_context.params->begin_mask,      \
+      op_context.params->end_mask, op_context.params->shrink_axis_mask,    \
+      starts, stops, strides, GetTensorData<data_type>(op_context.output), \
       GetTensorDims(op_context.output))
 
   switch (op_context.input->type) {
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
index cd4a364682..5bc7dc353b 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -79,8 +79,6 @@ TEST(StridedSliceOpTest, UnssupportedArgs) {
                "ellipsis_mask is not implemented yet.");
   EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
                "new_axis_mask is not implemented yet.");
-  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 0, 1),
-               "shrink_axis_mask is not implemented yet.");
 }
 
 TEST(StridedSliceOpTest, In1D) {
@@ -213,6 +211,7 @@ TEST(StridedSliceOpTest, In1D_EndMask) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 4}));
 }
+
 TEST(StridedSliceOpTest, In1D_NegStride) {
   StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
@@ -234,6 +233,7 @@ TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
 }
+
 TEST(StridedSliceOpTest, In1D_OddLenStride2) {
   StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
@@ -255,6 +255,7 @@ TEST(StridedSliceOpTest, In2D_Identity) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
 }
+
 TEST(StridedSliceOpTest, In2D) {
   StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
@@ -320,6 +321,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 5, 4}));
 }
+
 TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
   StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
@@ -354,6 +356,7 @@ TEST(StridedSliceOpTest, In3D_NegStride) {
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray({12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}));
 }
+
 TEST(StridedSliceOpTest, In3D_Strided2) {
   StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
@@ -365,6 +368,159 @@ TEST(StridedSliceOpTest, In3D_Strided2) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5}));
 }
 
+TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
+}
+
+TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({2});
+  m.SetEnd({1});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+}
+
+TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-2});
+  m.SetEnd({-3});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+}
+
+TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 4}));
+}
+
+TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 7, 8}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5, 7, 9, 11}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 7}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 50e8ca75f8..7f84a0ab9b 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -197,7 +197,7 @@ cc_binary(
 
 tf_cc_test(
     name = "generated_examples_zip_test",
-    size = "medium",
+    size = "large",
     srcs = ["generated_examples_zip_test.cc"],
     args = [
         "--zip_files_dir=tensorflow/contrib/lite/testing/optest",
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index fc8149bef9..f75d7c4bb9 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1533,8 +1533,9 @@ def make_strided_slice_tests(zip_path):
           "begin": [[0, 0, 0, 0], [1, 0, 1, 0]],
           "end": [[8, 2, 2, 3], [12, 2, 2, 5]],
           "strides": [None, [1, 1, 1, 1], [2, 1, 3, 1]],
-          "begin_mask": [None, 0, 1, 2, 8],
-          "end_mask": [None, 0, 1, 2, 8],
+          "begin_mask": [None, 1, 2, 8],
+          "end_mask": [None, 1, 2, 8],
+          "shrink_axis_mask": [None, 1, 2, 4, 8, 11, 15, -1],
       },
       # 2-D
       {
@@ -1544,8 +1545,9 @@ def make_strided_slice_tests(zip_path):
           "begin": [[0, 0], [1, 0]],
           "end": [[2, 3], [2, 2]],
           "strides": [None, [1, 1], [2, 2]],
-          "begin_mask": [None, 0, 1, 2],
-          "end_mask": [None, 0, 1, 2],
+          "begin_mask": [None, 1, 2],
+          "end_mask": [None, 1, 2],
+          "shrink_axis_mask": [None, 1, 2, 3, -1],
       },
       # Negative strides
       {
@@ -1555,8 +1557,9 @@ def make_strided_slice_tests(zip_path):
           "begin": [[0, -1]],
           "end": [[2, -3]],
           "strides": [[1, -1]],
-          "begin_mask": [None, 0, 1, 2],
-          "end_mask": [None, 0, 1, 2],
+          "begin_mask": [None, 1, 2],
+          "end_mask": [None, 1, 2],
+          "shrink_axis_mask": [None, 1, 2, 3, -1],
       },
   ]
 
-- 
GitLab


From ae740a67bdc01b991ead6ac047c774bff4d7bc8f Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Mon, 29 Jan 2018 13:26:27 -0800
Subject: [PATCH 339/966] [update]   addressing:   i. some naming conventions  
 ii. add macro to guard cpp files

  TODO:
    cpplint
---
 .../contrib/tensorrt/convert/convert_graph.cc | 14 ++++--
 .../contrib/tensorrt/convert/convert_nodes.cc | 11 ++++-
 .../contrib/tensorrt/kernels/trt_engine_op.cc | 45 ++++++++++---------
 .../contrib/tensorrt/kernels/trt_engine_op.h  | 10 ++++-
 tensorflow/contrib/tensorrt/log/trt_logger.cc |  6 +++
 .../contrib/tensorrt/segment/segment.cc       |  5 +++
 .../contrib/tensorrt/segment/segment_test.cc  |  6 +++
 .../contrib/tensorrt/shape_fn/trt_shfn.cc     |  5 +++
 8 files changed, 75 insertions(+), 27 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 3ade4ec356..aad5b66b6b 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
 
 #include <list>
 #include <set>
@@ -25,8 +24,6 @@ limitations under the License.
 #include <map>
 #include <utility>
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -47,6 +44,14 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorrt/include/NvInfer.h"
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "NvInfer.h"
+#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/contrib/tensorrt/segment/segment.h"
+
+
 //------------------------------------------------------------------------------
 namespace tensorflow {
 namespace tensorrt {
@@ -281,3 +286,6 @@ tensorflow::Status ConvertGraphDefToTensorRT(
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 19242cd944..e46cacf338 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 
 #include <algorithm>
 #include <fstream>
@@ -27,7 +26,6 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -43,6 +41,12 @@ limitations under the License.
 #define _TF_LOG_DEBUG ::tensorflow::internal::LogMessage(__FILE__, __LINE__, -1)
 //  Check if the types are equal. Cast to int first so that failure log message
 //  would work!
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "NvInfer.h"
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #define CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)
 //------------------------------------------------------------------------------
 namespace tensorflow {
@@ -1614,3 +1618,6 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif GOOGLE_TENSORRT
+#endif GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 34116f6552..ab13560263 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
-#include <cuda_runtime_api.h>
 #include <sstream>
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
+#include <cuda_runtime_api.h>
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 
 namespace tensorflow {
 static ::tensorflow::tensorrt::Logger gLogger;
@@ -52,28 +55,27 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
 }
 
 void TRTEngineOp::Compute(OpKernelContext* context) {
-  int nbBindings = context->num_inputs() + context->num_outputs();
-  // TODO(jjsjann123) multiple input/output
-  std::vector<void*> buffers(nbBindings);
+  int num_binding = context->num_inputs() + context->num_outputs();
+  std::vector<void*> buffers(num_binding);
 
-  size_t bindingIndex;
-  int nbBatch = 0;
+  size_t binding_index;
+  int num_batch = 0;
   bool valid = true;
   for (int i = 0; i < context->num_inputs(); i++) {
     // Grab the input tensor
-    bindingIndex = trt_engine_ptr_->getBindingIndex(input_nodes_[i].c_str());
+    binding_index = trt_engine_ptr_->getBindingIndex(input_nodes_[i].c_str());
 
     const Tensor& input_tensor = context->input(i);
     const TensorShape& input_shape = input_tensor.shape();
     if (i == 0) {
-      nbBatch = input_shape.dim_size(0);
-    } else if (nbBatch != input_shape.dim_size(0)) {
+      num_batch = input_shape.dim_size(0);
+    } else if (num_batch != input_shape.dim_size(0)) {
       valid = false;
       break;
     }
-    switch (trt_engine_ptr_->getBindingDataType(bindingIndex)) {
+    switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
       case nvinfer1::DataType::kFLOAT:
-        buffers[bindingIndex] = (void*)(input_tensor.flat<float>().data());
+        buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
         LOG(FATAL) << "half size is not supported yet!";
@@ -90,14 +92,14 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
   for (int i = 0; i < static_cast<int>(output_nodes_.size()); i++) {
     // This is bad that we have to reallocate output buffer every run.
     // Create an output tensor
-    bindingIndex = trt_engine_ptr_->getBindingIndex(output_nodes_[i].c_str());
+    binding_index = trt_engine_ptr_->getBindingIndex(output_nodes_[i].c_str());
     Tensor* output_tensor = NULL;
 
     TensorShape output_shape;
-    if (bindingIndex != -1) {
-      auto dims = trt_engine_ptr_->getBindingDimensions(bindingIndex);
+    if (binding_index != -1) {
+      auto dims = trt_engine_ptr_->getBindingDimensions(binding_index);
       std::vector<int> trt_shape(dims.nbDims + 1);
-      trt_shape[0] = nbBatch;
+      trt_shape[0] = num_batch;
       for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j];
       TensorShapeUtils::MakeShape(trt_shape.data(), trt_shape.size(),
                                   &output_shape);
@@ -108,9 +110,9 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
 
     OP_REQUIRES_OK(context,
                    context->allocate_output(i, output_shape, &output_tensor));
-    switch (trt_engine_ptr_->getBindingDataType(bindingIndex)) {
+    switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
       case nvinfer1::DataType::kFLOAT:
-        buffers[bindingIndex] =
+        buffers[binding_index] =
             reinterpret_cast<void*>(output_tensor->flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
@@ -129,9 +131,12 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
                                                 ->CudaStreamMemberHack()));
 
   // execution handled by TF since we are getting stream from TF.
-  trt_execution_context_ptr_->enqueue(nbBatch, &buffers[0], *stream, nullptr);
+  trt_execution_context_ptr_->enqueue(num_batch, &buffers[0], *stream, nullptr);
 }
 
 REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index ac188addd7..378df1b7a6 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -16,15 +16,18 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
 #define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
 
-#include <cuda_runtime_api.h>
 #include <memory>
 #include <string>
 #include <vector>
 
-#include "tensorrt/include/NvInfer.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+#include <cuda_runtime_api.h>
+
 namespace tensorflow {
 
 namespace tensorrt {
@@ -53,4 +56,7 @@ class TRTEngineOp : public OpKernel {
 
 }  // namespace tensorflow
 
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
+
 #endif // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index fc3d778002..3910ed1201 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 // Use TF logging for TensorRT informations
 #include "tensorflow/core/platform/logging.h"
@@ -54,3 +57,6 @@ void Logger::log(Severity severity, const char* msg) {
 }  // namespace tensorrt
 
 }  // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 3ae6ca776b..19833d5e9f 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 
 #include <set>
@@ -265,3 +267,6 @@ tensorflow::Status SegmentGraph(
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index c00aa1dbc9..02087227ad 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -363,3 +366,6 @@ TEST_F(SegmentTest, BigIfElse) {
 }  // namespace tensorrt
 
 }  // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index 7951397b7e..1f6b6d904d 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
 #include "tensorrt/include/NvInfer.h"
@@ -80,3 +82,6 @@ tensorflow::Status TRTEngineOpShapeInference(InferenceContext* c) {
 }
 }  // namespace shape_inference
 }  // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
-- 
GitLab


From ea8bd26b997ec75ca2b8eb48b2ffe4e3d0e7c855 Mon Sep 17 00:00:00 2001
From: Po-Hsien Chu <stegben.benjamin@gmail.com>
Date: Tue, 30 Jan 2018 05:33:48 +0800
Subject: [PATCH 340/966] remove SRU num_units == x.shape[-1] restriction
 (#16404)

---
 .../python/kernel_tests/core_rnn_cell_test.py | 14 +++++++++++
 tensorflow/contrib/rnn/python/ops/rnn_cell.py | 24 ++++---------------
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index cafeb56ad8..5711f41cc3 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -153,6 +153,20 @@ class RNNCellTest(test.TestCase):
                   m.name: np.array([[0.1, 0.1]])})
         # Smoke test
         self.assertAllClose(res[0], [[0.509682,  0.509682]])
+    
+  def testSRUCellWithDiffSize(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            [g], {x.name: np.array([[1., 1., 1.]]),
+                  m.name: np.array([[0.1, 0.1]])})
+        # Smoke test
+        self.assertAllClose(res[0], [[0.55255556, 0.55255556]])
 
   def testBasicLSTMCell(self):
     for dtype in [dtypes.float16, dtypes.float32]:
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 8adf5dce6e..5fee2e93e4 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -2729,25 +2729,9 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
 
     input_depth = inputs_shape[1].value
 
-    # Here the contributor believes that the following constraints
-    # are implied. The reasoning is explained here with reference to
-    # the paper https://arxiv.org/pdf/1709.02755.pdf upon which this
-    # implementation is based.
-    # In section 2.1 Equation 5, specifically:
-    # h_t = r_t \odot g(c_t) + (1 - r_t) \odot x_t
-    # the pointwise operation between r_t and x_t means they have
-    # the same shape (since we are implementing an RNN cell, braodcasting
-    # does not happen to input of a single timestep); by the same
-    # reasons, x_t has the same shape as h_t, essentially mandating that
-    # input_depth = unit_num.
-    if input_depth != self._num_units:
-      raise ValueError("SRU requires input_depth == num_units, got "
-                       "input_depth = %s, num_units = %s" % (input_depth,
-                                                             self._num_units))
-
     self._kernel = self.add_variable(
         rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-        shape=[input_depth, 3 * self._num_units])
+        shape=[input_depth, 4 * self._num_units])
 
     self._bias = self.add_variable(
         rnn_cell_impl._BIAS_VARIABLE_NAME,
@@ -2760,8 +2744,8 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
     """Simple recurrent unit (SRU) with num_units cells."""
 
     U = math_ops.matmul(inputs, self._kernel)
-    x_bar, f_intermediate, r_intermediate = array_ops.split(
-        value=U, num_or_size_splits=3, axis=1)
+    x_bar, f_intermediate, r_intermediate, x_tx = array_ops.split(
+        value=U, num_or_size_splits=4, axis=1)
 
     f_r = math_ops.sigmoid(
         nn_ops.bias_add(
@@ -2769,7 +2753,7 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
     f, r = array_ops.split(value=f_r, num_or_size_splits=2, axis=1)
 
     c = f * state + (1.0 - f) * x_bar
-    h = r * self._activation(c) + (1.0 - r) * inputs
+    h = r * self._activation(c) + (1.0 - r) * x_tx
 
     return h, c
 
-- 
GitLab


From 5a756fe53b421bf7276551aa64eda0d4b6d50d95 Mon Sep 17 00:00:00 2001
From: Loo Rong Jie <loorongjie@gmail.com>
Date: Tue, 30 Jan 2018 05:35:45 +0800
Subject: [PATCH 341/966] Check more cpu features for Clang on Windows (#16508)

---
 tensorflow/core/platform/cpu_feature_guard.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index b0d7b3a67a..7caf9d4db6 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -97,14 +97,17 @@ std::once_flag g_cpu_feature_guard_warn_once_flag;
 void InfoAboutUnusedCPUFeatures() {
   std::call_once(g_cpu_feature_guard_warn_once_flag, [] {
     string missing_instructions;
-#ifdef PLATFORM_WINDOWS
+#if defined(_MSC_VER) && !defined(__clang__)
+
 #ifndef __AVX__
     CheckIfFeatureUnused(CPUFeature::AVX, "AVX", missing_instructions);
 #endif  // __AVX__
 #ifndef __AVX2__
     CheckIfFeatureUnused(CPUFeature::AVX2, "AVX2", missing_instructions);
 #endif  // __AVX2__
-#else   // ifdef platform windows
+
+#else   // if defined(_MSC_VER) && !defined(__clang__)
+
 #ifndef __SSE__
     CheckIfFeatureUnused(CPUFeature::SSE, "SSE", missing_instructions);
 #endif  // __SSE__
@@ -132,7 +135,7 @@ void InfoAboutUnusedCPUFeatures() {
 #ifndef __FMA__
     CheckIfFeatureUnused(CPUFeature::FMA, "FMA", missing_instructions);
 #endif  // __FMA__
-#endif  // else of ifdef platform windows
+#endif  // else of if defined(_MSC_VER) && !defined(__clang__)
     if (!missing_instructions.empty()) {
       LOG(INFO) << "Your CPU supports instructions that this TensorFlow "
                 << "binary was not compiled to use:" << missing_instructions;
-- 
GitLab


From aeee7753013155cb0a73de24b62a03def11b3655 Mon Sep 17 00:00:00 2001
From: mjwen <wenxx151@gmail.com>
Date: Mon, 29 Jan 2018 15:36:17 -0600
Subject: [PATCH 342/966] Allow step callback for scipy SLSQP (#16312)

* Allow step callback for scipy SLSQP

* Add test_callbacks() for scipy optimizer
---
 .../opt/python/training/external_optimizer.py |  4 --
 .../training/external_optimizer_test.py       | 39 +++++++++++++++++++
 2 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py
index f243317f1d..82ebca7f20 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer.py
@@ -397,10 +397,6 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface):
             'automatically and cannot be injected manually'.format(kwarg))
 
     minimize_kwargs.update(optimizer_kwargs)
-    if method == 'SLSQP':
-      # SLSQP doesn't support step callbacks. Obviate associated warning
-      # message.
-      del minimize_kwargs['callback']
 
     import scipy.optimize  # pylint: disable=g-import-not-at-top
     result = scipy.optimize.minimize(*minimize_args, **minimize_kwargs)
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
index 0f597d0a24..953586ee70 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
@@ -299,6 +299,45 @@ class ScipyOptimizerInterfaceTest(TestCase):
       method = optimizer.optimizer_kwargs.get('method')
       self.assertEqual('SLSQP', method)
 
+  def test_callbacks(self):
+    vector_val = np.array([7., -2.], dtype=np.float32)
+    vector = variables.Variable(vector_val, 'vector')
+
+    minimum_location_val = np.arange(2)
+    minimum_location = constant_op.constant(
+        minimum_location_val, dtype=dtypes.float32)
+
+    loss = math_ops.reduce_sum(math_ops.square(vector - minimum_location)) / 2.
+    loss_val_first = ((vector_val - minimum_location_val)**2).sum() / 2.
+
+    optimizer = external_optimizer.ScipyOptimizerInterface(loss, method='SLSQP')
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+
+      initial_vector_val = sess.run(vector)
+
+      extra_fetches = [loss]
+
+      step_callback = test.mock.Mock()
+      loss_callback = test.mock.Mock()
+
+      optimizer.minimize(
+          sess,
+          fetches=extra_fetches,
+          loss_callback=loss_callback,
+          step_callback=step_callback)
+
+      loss_val_last = sess.run(loss)
+
+      call_first = test.mock.call(loss_val_first)
+      call_last = test.mock.call(loss_val_last)
+      loss_calls = [call_first, call_last]
+      loss_callback.assert_has_calls(loss_calls, any_order=True)
+
+      args, _ = step_callback.call_args
+      self.assertAllClose(minimum_location_val, args[0])
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 4f15f9197c4de1a6c7357a3340c6de12a901ed3a Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Mon, 29 Jan 2018 22:50:39 +0100
Subject: [PATCH 343/966] contrib/learn: Typo in variable name x_exrta -->
 x_extra (#16500)

* contrib/learn: Typo in variable name x_exrta --> x_extra

flake8 testing of https://github.com/tensorflow/tensorflow
---
 tensorflow/contrib/learn/python/learn/datasets/synthetic.py    | 2 +-
 .../contrib/learn/python/learn/datasets/synthetic_test.py      | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
index 649996c49c..9a843168c2 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
@@ -151,7 +151,7 @@ def spirals(n_samples=100,
   # Add more points if n_samples is not divisible by n_classes (unbalanced!)
   extras = n_samples % n_classes
   if extras > 0:
-    x_exrta, y_extra = _modes[mode](np.random.rand(extras) * 2 * np.pi, *args,
+    x_extra, y_extra = _modes[mode](np.random.rand(extras) * 2 * np.pi, *args,
                                     **kwargs)
     spir_x = np.append(spir_x, x_extra)
     spir_y = np.append(spir_y, y_extra)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
index 613d8d39a3..19791d7759 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
@@ -136,6 +136,9 @@ class SyntheticTest(test.TestCase):
       self.assertRaises(AssertionError, np.testing.assert_array_equal,
                         spir0.data, spir1.data)
 
+  def test_spirals(self):
+    synthetic.spirals(3)
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 32c7607ff8117f1454c62e1762cadaf577f242fd Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Mon, 29 Jan 2018 13:36:09 -0800
Subject: [PATCH 344/966] Java: Update to 1.5.0

PiperOrigin-RevId: 183718481
---
 tensorflow/java/maven/libtensorflow/pom.xml         | 2 +-
 tensorflow/java/maven/libtensorflow_jni/pom.xml     | 2 +-
 tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml | 2 +-
 tensorflow/java/maven/pom.xml                       | 2 +-
 tensorflow/java/maven/proto/pom.xml                 | 2 +-
 tensorflow/java/maven/tensorflow/pom.xml            | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 6285ee0483..a9ce5372ae 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0-rc1</version>
+    <version>1.5.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index b0e5c44fec..fe34ca83ff 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0-rc1</version>
+    <version>1.5.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index 02c5dca13f..390152808e 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0-rc1</version>
+    <version>1.5.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 949597ca7f..524ec45f48 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.5.0-rc1</version>
+  <version>1.5.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 9f0ebcf84c..9cf3217f51 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0-rc1</version>
+    <version>1.5.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 88d897362a..d619f986a9 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0-rc1</version>
+    <version>1.5.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
-- 
GitLab


From 82336ab8a1c04961817db19fc42c84872b17d954 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 29 Jan 2018 13:59:08 -0800
Subject: [PATCH 345/966] Fix incorrect docs for DecodeVideoOp (#16525)

This fix fixes incorrect docs for DecodeVideoOp

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/ffmpeg/decode_video_op.cc | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/ffmpeg/decode_video_op.cc b/tensorflow/contrib/ffmpeg/decode_video_op.cc
index d44032968d..6f8ad486d1 100644
--- a/tensorflow/contrib/ffmpeg/decode_video_op.cc
+++ b/tensorflow/contrib/ffmpeg/decode_video_op.cc
@@ -102,16 +102,12 @@ REGISTER_OP("DecodeVideo")
       return Status::OK();
     })
     .Doc(R"doc(
-Processes the contents of an audio file into a tensor using FFmpeg to decode
+Processes the contents of an video file into a tensor using FFmpeg to decode
 the file.
 
-One row of the tensor is created for each channel in the audio file. Each
-channel contains audio samples starting at the beginning of the audio and
-having `1/samples_per_second` time between them. If the `channel_count` is
-different from the contents of the file, channels will be merged or created.
-
-contents: The binary audio file contents, as a string or rank-0 string
-    tensor.
+contents: The binary contents of the video file to decode. This is a
+    scalar.
+output: A rank-4 `Tensor` that has `[frames, height, width, 3]` RGB as output.
 )doc");
 
 }  // namespace ffmpeg
-- 
GitLab


From 412a905ebc68984eee862a29ecdb2a08dae359c0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 14:04:16 -0800
Subject: [PATCH 346/966] Revise the decorator transformer and add tests that
 clarify when can decorator information appear in the AST.

PiperOrigin-RevId: 183723446
---
 .../contrib/py2tf/converters/decorators.py    | 24 +++--
 .../py2tf/converters/decorators_test.py       | 96 +++++++++++++++++++
 tensorflow/contrib/py2tf/pyct/BUILD           |  1 +
 tensorflow/contrib/py2tf/pyct/compiler.py     |  6 +-
 4 files changed, 117 insertions(+), 10 deletions(-)
 create mode 100644 tensorflow/contrib/py2tf/converters/decorators_test.py

diff --git a/tensorflow/contrib/py2tf/converters/decorators.py b/tensorflow/contrib/py2tf/converters/decorators.py
index a4313bfa51..3f620c1cd2 100644
--- a/tensorflow/contrib/py2tf/converters/decorators.py
+++ b/tensorflow/contrib/py2tf/converters/decorators.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Handles decorators."""
+"""Handles decorators.
+
+Note: this module only deals with functions whose decorators are still recorded
+in the AST. This does not always happen. See the unit test for an example.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -34,17 +38,19 @@ class DecoratorsTransformer(gast.NodeTransformer):
 
   def visit_FunctionDef(self, node):
     self.generic_visit(node)
+    kept_decorators = []
     for dec in node.decorator_list:
       if isinstance(dec, gast.Call):
-        dec = dec.func
-      if not anno.hasanno(dec, 'live_val'):
+        dec_func = dec.func
+      else:
+        dec_func = dec
+      if not anno.hasanno(dec_func, 'live_val'):
         raise ValueError(
-            'Could not resolve decorator: %s' % pretty_printer.fmt(dec))
-      dec_value = anno.getanno(dec, 'live_val')
-      if dec_value in self.remove_decorators:
-        continue
-      raise ValueError('Dont know how to convert decorators for now.')
-    node.decorator_list = []
+            'Could not resolve decorator: %s' % pretty_printer.fmt(dec_func))
+      dec_value = anno.getanno(dec_func, 'live_val')
+      if dec_value not in self.remove_decorators:
+        kept_decorators.append(dec)
+    node.decorator_list = kept_decorators
     return node
 
   # pylint:enable=invalid-name
diff --git a/tensorflow/contrib/py2tf/converters/decorators_test.py b/tensorflow/contrib/py2tf/converters/decorators_test.py
new file mode 100644
index 0000000000..f50d593043
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/decorators_test.py
@@ -0,0 +1,96 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for decorators module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import textwrap
+
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import decorators
+from tensorflow.contrib.py2tf.pyct import compiler
+from tensorflow.python.platform import test
+from tensorflow.python.util import tf_inspect
+
+
+class DecoratorsTest(converter_test_base.TestCase):
+
+  def test_function_decorator(self):
+
+    def function_decorator():
+
+      def decorator(f):
+        return lambda a: f(a) + 1
+
+      return decorator
+
+    # The Python parser does capture decorators into the AST.
+    # However, the interpreter desugars them on load, and refering to the
+    # decorated function at runtime usually loses any trace of the decorator.
+    # Below is an example when that doesn't happen.
+    def static_wrapper():
+
+      @function_decorator()
+      def test_fn(a):  # pylint:disable=unused-variable
+        return a
+
+    node = self.parse_and_analyze(static_wrapper,
+                                  {'function_decorator': function_decorator})
+    node = node.body[0].body[0]
+
+    node = decorators.transform(node, remove_decorators=())
+    result = compiler.ast_to_object(
+        node,
+        source_prefix=textwrap.dedent(tf_inspect.getsource(function_decorator)))
+    self.assertEqual(2, result.test_fn(1))
+
+    node = decorators.transform(node, remove_decorators=(function_decorator,))
+    result = compiler.ast_to_object(node)
+    self.assertEqual(1, result.test_fn(1))
+
+  def test_simple_decorator(self):
+
+    def simple_decorator(f):
+      return lambda a: f(a) + 1
+
+    # The Python parser does capture decorators into the AST.
+    # However, the interpreter desugars them upon load, and refering to the
+    # decorated function at runtime usually loses any trace of the decorator.
+    # Below is an example when that doesn't happen.
+    def static_wrapper():
+
+      @simple_decorator
+      def test_fn(a):  # pylint:disable=unused-variable
+        return a
+
+    node = self.parse_and_analyze(static_wrapper,
+                                  {'simple_decorator': simple_decorator})
+    node = node.body[0].body[0]
+
+    node = decorators.transform(node, remove_decorators=())
+    result = compiler.ast_to_object(
+        node,
+        source_prefix=textwrap.dedent(tf_inspect.getsource(simple_decorator)))
+    self.assertEqual(2, result.test_fn(1))
+
+    node = decorators.transform(node, remove_decorators=(simple_decorator,))
+    result = compiler.ast_to_object(node)
+    self.assertEqual(1, result.test_fn(1))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD
index 88902dea84..1b2408ba0e 100644
--- a/tensorflow/contrib/py2tf/pyct/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/BUILD
@@ -31,6 +31,7 @@ py_library(
     deps = [
         "@astor_archive//:astor",
         "@gast_archive//:gast",
+        "@six_archive//:six",
         "@termcolor_archive//:termcolor",
     ],
 )
diff --git a/tensorflow/contrib/py2tf/pyct/compiler.py b/tensorflow/contrib/py2tf/pyct/compiler.py
index b09353cc72..fc71469d1e 100644
--- a/tensorflow/contrib/py2tf/pyct/compiler.py
+++ b/tensorflow/contrib/py2tf/pyct/compiler.py
@@ -41,7 +41,7 @@ def ast_to_source(node, indentation):
   return astor.source_repr.pretty_source(generator.result).lstrip()
 
 
-def ast_to_object(node, indentation='  '):
+def ast_to_object(node, indentation='  ', source_prefix=None):
   """Return the Python objects represented by given AST.
 
   Compiling the AST code this way ensures that the source code is readable by
@@ -50,6 +50,7 @@ def ast_to_object(node, indentation='  '):
   Args:
     node: The code to compile, as an AST object.
     indentation: The string to use for indentation.
+    source_prefix: Optional string to print as-is into the source file.
 
   Returns:
     A module object containing the compiled source code.
@@ -58,5 +59,8 @@ def ast_to_object(node, indentation='  '):
 
   with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
     module_name = os.path.basename(f.name[:-3])
+    if source_prefix:
+      f.write(source_prefix)
+      f.write('\n')
     f.write(source)
   return imp.load_source(module_name, f.name)
-- 
GitLab


From e01844e65e0dbd2682a894946bec7f072d36fa27 Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Mon, 29 Jan 2018 14:23:54 -0800
Subject: [PATCH 347/966] [update]   clang-format & cpplint done.

  TODO: compile and test after build fix.
---
 .../contrib/tensorrt/convert/convert_graph.cc | 55 ++++++--------
 .../contrib/tensorrt/convert/convert_graph.h  | 10 +--
 .../contrib/tensorrt/convert/convert_nodes.cc | 73 +++++++++----------
 .../contrib/tensorrt/convert/convert_nodes.h  |  7 +-
 .../contrib/tensorrt/kernels/trt_engine_op.cc | 15 ++--
 .../contrib/tensorrt/kernels/trt_engine_op.h  |  8 +-
 .../contrib/tensorrt/segment/segment.cc       |  8 +-
 tensorflow/contrib/tensorrt/segment/segment.h |  2 +-
 .../contrib/tensorrt/segment/segment_test.cc  | 33 ++++-----
 .../contrib/tensorrt/shape_fn/trt_shfn.cc     | 11 +--
 10 files changed, 100 insertions(+), 122 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index aad5b66b6b..72b9e9d2cb 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -13,16 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-
 #include <list>
+#include <map>
 #include <set>
 #include <sstream>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
-#include <vector>
-#include <map>
 #include <utility>
+#include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -34,14 +33,14 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 #define _TF_LOG_DEBUG ::tensorflow::internal::LogMessage(__FILE__, __LINE__, -1)
-#include "tensorflow/core/grappler/optimizers/constant_folding.h"
-#include "tensorflow/core/grappler/optimizers/layout_optimizer.h" 
-#include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
-#include "tensorflow/core/protobuf/device_properties.pb.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
 #include "tensorrt/include/NvInfer.h"
 
 #if GOOGLE_CUDA
@@ -51,7 +50,6 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 
-
 //------------------------------------------------------------------------------
 namespace tensorflow {
 namespace tensorrt {
@@ -59,21 +57,20 @@ namespace convert {
 namespace {
 
 static bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
-// LINT.IfChange
+  // LINT.IfChange
   // TODO(jie): Segmentation shouldn't associated with op name.
   //            Split it into a registration for each kernel.
   static const std::set<std::string> candidate_ops = {
       "Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu",
       "Add",      "Mul",   "Sub",    "Rsqrt",   "Pad"  // "Placeholder" ,"Mean"
   };
-// LINT.ThenChange(
-//    https://www.tensorflow.org/code/tensorflow/contrib/tensorrt/convert/convert_nodes.h)
+  // LINT.ThenChange(
+  //    https://www.tensorflow.org/code/tensorflow/contrib/tensorrt/convert/convert_nodes.h)
   return candidate_ops.count(node_def.op());
 }
 
-
-void GetSubGraphIncomingEdges(const tensorflow::Graph & graph,
-                              const std::set<int> &subgraph_node_ids,
+void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
+                              const std::set<int>& subgraph_node_ids,
                               tensorflow::EdgeSet* incoming_edges) {
   for (int node_id : subgraph_node_ids) {
     const tensorflow::Node* node = graph.FindNodeId(node_id);
@@ -86,8 +83,8 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph & graph,
   }
 }
 
-void GetSubGraphOutgoingEdges(const tensorflow::Graph &graph,
-                              const std::set<int> &subgraph_node_ids,
+void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
+                              const std::set<int>& subgraph_node_ids,
                               tensorflow::EdgeSet* outgoing_edges) {
   for (int node_id : subgraph_node_ids) {
     const tensorflow::Node* node = graph.FindNodeId(node_id);
@@ -126,7 +123,7 @@ std::unordered_map<std::string, std::vector<int>> BuildTensorNameMap(
 tensorflow::Status ConvertSubGraphToTensorRT(
     tensorflow::Graph& graph, const std::vector<std::string>& output_names,
     const std::set<int>& subgraph_node_ids,
-    size_t max_batch_size, // max batch size that engine will be created for
+    size_t max_batch_size,  // max batch size that engine will be created for
     // max amount of memory that engine will be allowed to consume, in bytes
     size_t max_workspace_size,
     const tensorflow::grappler::GraphProperties& graph_properties) {
@@ -135,7 +132,6 @@ tensorflow::Status ConvertSubGraphToTensorRT(
 
   std::vector<std::pair<int, int>> subgraph_inputs;
 
-
   // Collect inputs by looking for incoming edges
   for (const tensorflow::Edge* edge : subgraph_incoming_edges) {
     subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
@@ -211,7 +207,6 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<std::string>& output_names, size_t max_batch_size,
     size_t max_workspace_size, tensorflow::GraphDef* new_graph_def) {
-
   // optimization pass
   tensorflow::grappler::GrapplerItem item;
   item.fetch = output_names;
@@ -227,20 +222,18 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   device_properties.set_type("GPU");
   device_properties.mutable_environment()->insert({"architecture", "6"});
   gCluster =
-    new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
+      new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
 
   tensorflow::Status status = optimizer.Optimize(gCluster, item, &gdef);
 
-  if (status !=tensorflow::Status::OK())
-    return status;
- 
+  if (status != tensorflow::Status::OK()) return status;
+
   // constant folding
   item.graph = gdef;
   tensorflow::grappler::ConstantFolding fold(nullptr);
   status = fold.Optimize(nullptr, item, &gdef);
-  if (status !=tensorflow::Status::OK())
-    return status;
-  
+  if (status != tensorflow::Status::OK()) return status;
+
   // AJ refactoring shape inference through grappler/GraphProperties.
   tensorflow::grappler::GraphProperties static_graph_properties(item);
   static_graph_properties.InferStatically(false);
@@ -270,9 +263,9 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   }
   std::unordered_map<std::string, tensorflow::Node*> node_map;
   TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
-  for (const std::set<std::string> &subgraph_node_names : segments) {
+  for (const std::set<std::string>& subgraph_node_names : segments) {
     std::set<int> subgraph_node_ids;
-    for (const std::string &node_name : subgraph_node_names) {
+    for (const std::string& node_name : subgraph_node_names) {
       subgraph_node_ids.insert(node_map.at(node_name)->id());
     }
     TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
@@ -287,5 +280,5 @@ tensorflow::Status ConvertGraphDefToTensorRT(
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif // GOOGLE_TENSORRT
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index 4b1863a89e..621d428ace 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -30,11 +30,9 @@ namespace convert {
 // max_workspace_size: The upper bound of memory allowence for engine building.
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
-    const std::vector<std::string>& output_names,
-    size_t max_batch_size,
-    size_t max_workspace_size,
-    tensorflow::GraphDef* new_graph_def);
-}
-}  // namespace tensorrt
+    const std::vector<std::string>& output_names, size_t max_batch_size,
+    size_t max_workspace_size, tensorflow::GraphDef* new_graph_def);
+}  // namespace convert
 }  // namespace tensorrt
+}  // namespace tensorflow
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index e46cacf338..89996c0c0c 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-
 #include <algorithm>
 #include <fstream>
 #include <list>
@@ -44,8 +43,8 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "NvInfer.h"
+#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #define CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)
 //------------------------------------------------------------------------------
@@ -322,7 +321,6 @@ void reorder_rsck_to_kcrs(TRT_ShapedWeights const& iweights,
   }
 }
 
-
 struct InferDeleter {
   template <typename T>
   void operator()(T* obj) const {
@@ -449,9 +447,9 @@ class Converter {
 };
 
 // ****************************************************************************
-//  Constant folding functions
-//  TODO(jie): once optimizer kicks in, we should have done constant folding
-//    there.
+// Constant folding functions
+// TODO(jie): once optimizer kicks in, we should have done constant folding
+// there.
 //*****************************************************************************/
 struct LambdaFactory {
   enum class OP_CATEGORY : int { RSQRT = 0, NEG, ADD, MUL, SUB };
@@ -499,7 +497,7 @@ struct LambdaFactory {
           LOG(DEBUG) << "LAMBDA VAL : " << val;
           return l + val;
         };
-        // return [val](T l)-> T {return l+val;};
+      // return [val](T l)-> T {return l+val;};
       case OP_CATEGORY::SUB:
         return [val](T l) -> T {
           LOG(DEBUG) << "LAMBDA VAL : " << val;
@@ -739,36 +737,34 @@ tensorflow::Status BinaryTensorOpWeight(
   // default to channel-wise
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
-
   if (weights.count() == 1) {
     LOG(DEBUG) << "UNIFORM";
     scale_mode = nvinfer1::ScaleMode::kUNIFORM;
   } else {
     // no broadcasting on Batch dimension;
-    assert(dims_w.d[0]==1);
+    assert(dims_w.d[0] == 1);
 
     // broadcasting on Channel dimension only allowed in kUNIFORM
-    assert(dims_w.d[1]==dims_t.d[0]);
-    assert(dims_w.nbDims==dims_t.nbDims);
+    assert(dims_w.d[1] == dims_t.d[0]);
+    assert(dims_w.nbDims == dims_t.nbDims);
 
     // default is element;
-    for (int i=2; i<dims_w.nbDims; i++) {
-      if (dims_w.d[i]!=dims_t.d[i-1]) {
+    for (int i = 2; i < dims_w.nbDims; i++) {
+      if (dims_w.d[i] != dims_t.d[i - 1]) {
         scale_mode = nvinfer1::ScaleMode::kCHANNEL;
         break;
       }
     }
     if (scale_mode == nvinfer1::ScaleMode::kELEMENTWISE) {
       scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-      for (int i=2; i<dims_w.nbDims; i++) {
-        if (dims_w.d[i]!=1)
+      for (int i = 2; i < dims_w.nbDims; i++) {
+        if (dims_w.d[i] != 1)
           return tensorflow::errors::InvalidArgument(
-                   "Weight shape not compatible at, " + node_def.name());
+              "Weight shape not compatible at, " + node_def.name());
       }
     }
   }
 
-
   // prepare weights
   TRT_ShapedWeights shiftWeights(weights.type_);
   TRT_ShapedWeights scaleWeights(weights.type_);
@@ -826,9 +822,9 @@ tensorflow::Status BinaryTensorOpTensor(
   CHECK_EQ_TYPE(tensor_r->getType(), dtype);
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented(
-        "binary op: " + node_def.op() +
-        " not supported at: " + node_def.name());
+    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+                                             " not supported at: " +
+                                             node_def.name());
 
   nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
       *const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -878,7 +874,6 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
   nvinfer1::DimsHW kernel_size;
   kernel_size.h() = weights.shape_.d[2];
   kernel_size.w() = weights.shape_.d[3];
-  LOG(DEBUG) << "kernel size: " << kernel_size.h() << ", " << kernel_size.w();
   TFAttrs attrs(node_def);
 
   int h_index = 2;
@@ -890,13 +885,10 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
     h_index = 1;
     w_index = 2;
     // TODO(jie): transpose it
-  } else {
-    LOG(DEBUG) << "NCHW !!!!";
   }
+
   // TODO(jie): stride. (NHWC/NCHW)
   auto tf_stride = attrs.get<std::vector<int>>("strides");
-  LOG(DEBUG) << "h_INDEX" << h_index << ", w_index " << w_index;
-  LOG(DEBUG) << "stride!!!: " << tf_stride[0] << tf_stride[1] << tf_stride[2] << tf_stride[3];
   nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
   auto tensor_dim = tensor->getDimensions();
@@ -906,9 +898,9 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
     // This is NCHW tensor with no batch dimension.
     //  1 -> h
     //  2 -> w
-    padding = createSamePadding(stride, kernel_size,
-                                {static_cast<int>(tensor_dim.d[1]),
-                                 static_cast<int>(tensor_dim.d[2])});
+    padding = createSamePadding(
+        stride, kernel_size,
+        {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
   } else {
     padding = {{0, 0}, {0, 0}};
   }
@@ -917,11 +909,11 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
     LOG(DEBUG) << "padding!!!: " << padding[0].first << padding[0].second
-                                 << padding[1].first << padding[1].second;
+               << padding[1].first << padding[1].second;
 
     auto dim_before = tensor->getDimensions();
-    LOG(DEBUG) << "TENSOR before: " << dim_before.d[0] << ", " << dim_before.d[1]
-                                    << dim_before.d[2] << ", " << dim_before.d[3];
+    LOG(DEBUG) << "TENSOR before: " << dim_before.d[0] << ", "
+               << dim_before.d[1] << dim_before.d[2] << ", " << dim_before.d[3];
     auto padLayer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
@@ -930,7 +922,7 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
     tensor = padLayer->getOutput(0);
     auto dim_after = tensor->getDimensions();
     LOG(DEBUG) << "TENSOR after: " << dim_after.d[0] << ", " << dim_after.d[1]
-                                   << dim_after.d[2] << ", " << dim_after.d[3];
+               << dim_after.d[2] << ", " << dim_after.d[3];
   }
 
   nvinfer1::IConvolutionLayer* layer =
@@ -944,7 +936,7 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
 
   auto dim_after = output_tensor->getDimensions();
   LOG(DEBUG) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1]
-                               << dim_after.d[2] << ", " << dim_after.d[3];
+             << dim_after.d[2] << ", " << dim_after.d[3];
 
   if (data_format == "NHWC") {
     // TODO(jie): transpose it back!
@@ -1011,7 +1003,8 @@ tensorflow::Status ConvertPool(Converter& ctx,
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
-    LOG(DEBUG) << "padding!!!: " << padding[0].first << padding[0].second << padding[1].first << padding[1].second;
+    LOG(DEBUG) << "padding!!!: " << padding[0].first << padding[0].second
+               << padding[1].first << padding[1].second;
     auto padLayer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
@@ -1593,12 +1586,12 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     int output_idx = input_inds.at(i).second;
     // we wired up the input here already, it is redundant to do it again in
     //  ConvertSubGraphToTensorRT(convert_graph.cc)
-    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(input_names.at(i),
-                           output_idx, input_dtypes.at(i));
+    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
+        input_names.at(i), output_idx, input_dtypes.at(i));
     income_edges.push_back(incoming_edge);
   }
-  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut>
-    input_list(income_edges);
+  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
+      income_edges);
   op_builder.Input(input_list);
 
   LOG(INFO) << "finished op preparation";
@@ -1619,5 +1612,5 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif GOOGLE_TENSORRT
-#endif GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 9446dd5dcb..56331d79f9 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -17,13 +17,13 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
 
 #include <set>
-#include <vector>
 #include <utility>
+#include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace tensorrt {
@@ -35,7 +35,8 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         input_inds,  // {node_id, output_idx}
     const std::vector<std::pair<int, int>>&
         output_inds,  // {node_id, output_idx}
-    size_t max_batch_size, size_t max_workspace_size,
+    size_t max_batch_size,
+    size_t max_workspace_size,
     const tensorflow::grappler::GraphProperties& graph_prop,
     tensorflow::NodeDef* trt_node);
 }  // namespace convert
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index ab13560263..a03e864c2d 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -19,15 +19,13 @@ limitations under the License.
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 
-#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
 #include <cuda_runtime_api.h>
+#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 
 namespace tensorflow {
 static ::tensorflow::tensorrt::Logger gLogger;
 
-using namespace nvinfer1;
-
 namespace tensorrt {
 
 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
@@ -43,15 +41,13 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   // TODO(samikama) runtime should be taken from a resourcemanager as well.
   //  Only engine should be in the op and context and runtime should be taken
   //  from resourcemanager
-  IRuntime* infer = createInferRuntime(gLogger);
+  nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(gLogger);
   trt_engine_ptr_.reset(infer->deserializeCudaEngine(
       serialized_engine.c_str(), serialized_engine.size(), nullptr));
 
   trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
   // runtime is safe to delete after engine creation
   infer->destroy();
-  std::stringstream oss;
-
 }
 
 void TRTEngineOp::Compute(OpKernelContext* context) {
@@ -75,7 +71,8 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
     }
     switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
       case nvinfer1::DataType::kFLOAT:
-        buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
+        buffers[binding_index] =
+            reinterpret_cast<void*>(input_tensor.flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
         LOG(FATAL) << "half size is not supported yet!";
@@ -138,5 +135,5 @@ REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif // GOOGLE_TENSORRT
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index 378df1b7a6..0e3ff45ede 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -25,8 +25,8 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorrt/include/NvInfer.h"
 #include <cuda_runtime_api.h>
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 
@@ -56,7 +56,7 @@ class TRTEngineOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif // GOOGLE_TENSORRT
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
-#endif // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 19833d5e9f..c184253d3a 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -142,8 +142,8 @@ tensorflow::Status SegmentGraph(
   std::vector<UnionFind<tensorflow::Node*>> node_segments;
   for (int i = 0; i < graph.num_node_ids(); ++i) {
     tensorflow::Node* node = graph.FindNodeId(i);
-    if (options.exclude_node_list.count(node->name())!=0
-        || !candidate_fn(node->def())) {
+    if (options.exclude_node_list.count(node->name()) != 0 ||
+        !candidate_fn(node->def())) {
       node = nullptr;
     }
     node_segments.emplace_back(node);
@@ -268,5 +268,5 @@ tensorflow::Status SegmentGraph(
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif // GOOGLE_TENSORRT
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index a374606026..1bb70b82f9 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
 
 #include <set>
-#include <vector>
 #include <string>
+#include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index 02087227ad..ae491c38d1 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -160,11 +160,10 @@ TEST_F(SegmentTest, Simple) {
   ASSERT_TRUE(GetGraphDef(graph, &graph_def));
 
   SegmentNodesVector segments;
-  ASSERT_EQ(
-      SegmentGraph(graph_def,
-                   MakeCandidateFn({"add0", "add1", "add2", "add3", "add4"}),
-                   default_options_, &segments),
-      tensorflow::Status::OK());
+  ASSERT_EQ(SegmentGraph(graph_def, MakeCandidateFn({"add0", "add1", "add2",
+                                                     "add3", "add4"}),
+                         default_options_, &segments),
+            tensorflow::Status::OK());
 
   // Expect all Add operations to be collapsed into a single segment
   ASSERT_EQ(segments.size(), 1);
@@ -270,11 +269,11 @@ TEST_F(SegmentTest, Multiple) {
   ASSERT_TRUE(GetGraphDef(graph, &graph_def));
 
   SegmentNodesVector segments;
-  ASSERT_EQ(SegmentGraph(graph_def,
-                         MakeCandidateFn({"add0", "add1", "add2", "add3",
-                                          "add4", "add6", "add7", "add8"}),
-                         default_options_, &segments),
-            tensorflow::Status::OK());
+  ASSERT_EQ(
+      SegmentGraph(graph_def, MakeCandidateFn({"add0", "add1", "add2", "add3",
+                                               "add4", "add6", "add7", "add8"}),
+                   default_options_, &segments),
+      tensorflow::Status::OK());
 
   // Expect two subgraphs
   EXPECT_EQ(segments.size(), 2);
@@ -339,11 +338,11 @@ TEST_F(SegmentTest, BigIfElse) {
   ASSERT_TRUE(GetGraphDef(graph, &graph_def));
 
   SegmentNodesVector segments;
-  ASSERT_EQ(SegmentGraph(graph_def,
-                         MakeCandidateFn({"add0", "add1", "add3", "add4",
-                                          "add5", "add6", "add7"}),
-                         default_options_, &segments),
-            tensorflow::Status::OK());
+  ASSERT_EQ(
+      SegmentGraph(graph_def, MakeCandidateFn({"add0", "add1", "add3", "add4",
+                                               "add5", "add6", "add7"}),
+                   default_options_, &segments),
+      tensorflow::Status::OK());
 
   // Expect 2 subgraphs
   EXPECT_EQ(segments.size(), 2);
@@ -367,5 +366,5 @@ TEST_F(SegmentTest, BigIfElse) {
 
 }  // namespace tensorflow
 
-#endif // GOOGLE_TENSORRT
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index 1f6b6d904d..7624237efe 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -32,7 +32,6 @@ tensorflow::Status TRTEngineOpShapeInference(InferenceContext* c) {
   nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
       serialized_engine.c_str(), serialized_engine.size(), nullptr);
 
-
   int nbBatch = -1;
   // debug print out input arrays
   std::vector<::tensorflow::DataType> input_type;
@@ -61,8 +60,7 @@ tensorflow::Status TRTEngineOpShapeInference(InferenceContext* c) {
   std::vector<string> output_nodes;
   c->GetAttr("output_nodes", &output_nodes);
   for (size_t i = 0; i < output_nodes.size(); i++) {
-    int binding_index =
-        trt_engine->getBindingIndex(output_nodes[i].c_str());
+    int binding_index = trt_engine->getBindingIndex(output_nodes[i].c_str());
     ShapeHandle output_shape;
     std::vector<DimensionHandle> vecDim;
     vecDim.emplace_back(c->MakeDim(nbBatch));
@@ -71,8 +69,7 @@ tensorflow::Status TRTEngineOpShapeInference(InferenceContext* c) {
       for (int j = 0; j < dims.nbDims; j++)
         vecDim.emplace_back(c->MakeDim(dims.d[j]));
     } else {
-      LOG(FATAL) << "TensorRT engine cannot find binding: "
-                 << output_nodes[i];
+      LOG(FATAL) << "TensorRT engine cannot find binding: " << output_nodes[i];
     }
     output_shape = c->MakeShape(vecDim);
     c->set_output(i, output_shape);
@@ -83,5 +80,5 @@ tensorflow::Status TRTEngineOpShapeInference(InferenceContext* c) {
 }  // namespace shape_inference
 }  // namespace tensorflow
 
-#endif // GOOGLE_TENSORRT
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
-- 
GitLab


From 8fdef24247814d322991127091cc6d0c1eb60380 Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Mon, 29 Jan 2018 14:26:43 -0800
Subject: [PATCH 348/966] Switch over to max_pool_v2 in Python

This fix is a follow up to 11875 so that MaxPool in Python
use v2 version. As 11875 has been merged some time ago,
this fix conforms to the deprecation policy.

This fix is realted to 11875 and 4746.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
PiperOrigin-RevId: 183727668
---
 .../compiler/tf2xla/kernels/pooling_ops.cc    | 84 ++++++++++++++-----
 .../python/util/parse_layer_parameters.py     | 63 ++++++++++----
 .../python/kernel_tests/pooling_ops_test.py   | 21 ++---
 tensorflow/python/ops/nn_ops.py               | 13 ++-
 4 files changed, 125 insertions(+), 56 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 0b5a38967a..2ba572fd0e 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -37,21 +37,9 @@ class PoolingOp : public XlaOpKernel {
  public:
   PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
-    std::vector<int32> ksize_int;
-    std::vector<int32> stride_int;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int));
-    OP_REQUIRES(ctx, ksize_int.size() == num_dims(),
-                errors::InvalidArgument("Sliding window ksize field must "
-                                        "specify ",
-                                        num_dims(), " dimensions"));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int));
-    OP_REQUIRES(ctx, stride_int.size() == num_dims(),
-                errors::InvalidArgument("Sliding window stride field must "
-                                        "specify ",
-                                        num_dims(), " dimensions"));
-    for (int i = 0; i < num_dims(); ++i) {
-      ksize_.push_back(ksize_int[i]);
-      stride_.push_back(stride_int[i]);
+    if (ctx->num_inputs() == 1) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_));
     }
     Padding padding;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding));
@@ -77,6 +65,28 @@ class PoolingOp : public XlaOpKernel {
     xla::ComputationDataHandle input = ctx->Input(0);
     const TensorShape input_shape = ctx->InputShape(0);
 
+    if (ctx->num_inputs() != 1) {
+      const TensorShape ksize_shape = ctx->InputShape(1);
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape),
+                  errors::InvalidArgument("ksize must be a vector, not shape ",
+                                          ksize_shape.DebugString()));
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &ksize_));
+
+      const TensorShape stride_shape = ctx->InputShape(2);
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape),
+                  errors::InvalidArgument("stride must be a vector, not shape ",
+                                          stride_shape.DebugString()));
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride_));
+    }
+
+    OP_REQUIRES(ctx, ksize_.size() == num_dims(),
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+    OP_REQUIRES(ctx, stride_.size() == num_dims(),
+                errors::InvalidArgument("Sliding window stride field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
     OP_REQUIRES(ctx, input_shape.dims() == num_dims(),
                 errors::InvalidArgument("Input to ", type_string(),
                                         " operator must have ", num_dims(),
@@ -130,6 +140,10 @@ class MaxPool2DOp : public MaxPoolOp {
   }
 };
 REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp);
+REGISTER_XLA_OP(Name("MaxPoolV2")
+                    .CompileTimeConstInput("ksize")
+                    .CompileTimeConstInput("strides"),
+                MaxPool2DOp);
 
 class MaxPool3DOp : public MaxPoolOp {
  public:
@@ -243,22 +257,44 @@ class MaxPoolGradOp : public XlaOpKernel {
  public:
   MaxPoolGradOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_));
+    if (ctx->num_inputs() == 3) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_));
+    }
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
+  }
+
+  int num_dims() const { return num_spatial_dims_ + 2; }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    if (ctx->num_inputs() != 3) {
+      OP_REQUIRES(
+          ctx, ctx->num_inputs() == 5,
+          errors::InvalidArgument("Must supply ksize and stride arguments."));
+      const TensorShape ksize_shape = ctx->InputShape(3);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape),
+                  errors::InvalidArgument("ksize must be a vector, not shape ",
+                                          ksize_shape.DebugString()));
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(3, &ksize_));
+
+      const TensorShape stride_shape = ctx->InputShape(4);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape),
+                  errors::InvalidArgument("stride must be a vector, not shape ",
+                                          stride_shape.DebugString()));
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(4, &stride_));
+    }
+
     OP_REQUIRES(ctx, ksize_.size() == num_dims(),
                 errors::InvalidArgument("Sliding window ksize field must "
                                         "specify ",
                                         num_dims(), " dimensions"));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_));
     OP_REQUIRES(ctx, stride_.size() == num_dims(),
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify ",
                                         num_dims(), " dimensions"));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
-  }
-
-  int num_dims() const { return num_spatial_dims_ + 2; }
 
-  void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape tensor_in_shape = ctx->InputShape(0);
     const TensorShape tensor_out_shape = ctx->InputShape(1);
     const TensorShape out_backprop_shape = ctx->InputShape(2);
@@ -315,6 +351,10 @@ class MaxPool2DGradOp : public MaxPoolGradOp {
   }
 };
 REGISTER_XLA_OP(Name("MaxPoolGrad"), MaxPool2DGradOp);
+REGISTER_XLA_OP(Name("MaxPoolGradV2")
+                    .CompileTimeConstInput("ksize")
+                    .CompileTimeConstInput("strides"),
+                MaxPool2DGradOp);
 
 class MaxPool3DGradOp : public MaxPoolGradOp {
  public:
diff --git a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
index 44998b3b65..bc383a8034 100644
--- a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
+++ b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
@@ -35,20 +35,34 @@ _VALID_PADDING = ["VALID", b"VALID"]
 _SAME_PADDING = ["SAME", b"SAME"]
 
 
-def _stride_size(node):
+def _stride_size(node, name_to_node):
   """Computes stride size given a TF node.
 
   Args:
     node: Tensorflow node (NodeDef proto).
+    name_to_node: For MaxPoolV2, mapping from variable name Tensorflow node.
 
   Returns:
     stride_x: Stride size for horizontal direction (integer).
     stride_y: Stride size for vertical direction (integer).
+
+  Raises:
+    ValueError: If stride input cannot be found in `name_to_node`.
   """
-  strides_attr = node.attr["strides"]
-  logging.vlog(4, "strides_attr = %s", strides_attr)
-  stride_y = strides_attr.list.i[1]
-  stride_x = strides_attr.list.i[2]
+  if node.op == "MaxPoolV2":
+    strides_input_name = node.input[2]
+    if not strides_input_name.endswith("/strides"):
+      raise ValueError("Strides name does not end with '/strides'")
+    strides_node = name_to_node[strides_input_name]
+    value = strides_node.attr["value"]
+    t = make_ndarray(value.tensor)
+    stride_y = t[1]
+    stride_x = t[2]
+  else:
+    strides_attr = node.attr["strides"]
+    logging.vlog(4, "strides_attr = %s", strides_attr)
+    stride_y = strides_attr.list.i[1]
+    stride_x = strides_attr.list.i[2]
   return stride_x, stride_y
 
 
@@ -144,11 +158,12 @@ def _padding_size_conv_pool(node, kernel_size, stride, input_resolution=None):
   return total_padding, padding
 
 
-def _pool_kernel_size(node):
+def _pool_kernel_size(node, name_to_node):
   """Computes kernel size given a TF pooling node.
 
   Args:
     node: Tensorflow node (NodeDef proto).
+    name_to_node: For MaxPoolV2, mapping from node name to NodeDef.
 
   Returns:
     kernel_size_x: Kernel size for horizontal direction (integer).
@@ -157,13 +172,27 @@ def _pool_kernel_size(node):
   Raises:
     ValueError: If pooling is invalid.
   """
-  ksize = node.attr["ksize"]
-  kernel_size_y = ksize.list.i[1]
-  kernel_size_x = ksize.list.i[2]
-  if ksize.list.i[0] != 1:
-    raise ValueError("pool ksize for first dim is not 1")
-  if ksize.list.i[3] != 1:
-    raise ValueError("pool ksize for last dim is not 1")
+  if node.op == "MaxPoolV2":
+    ksize_input_name = node.input[1]
+    if not ksize_input_name.endswith("/ksize"):
+      raise ValueError("Kernel size name does not end with '/ksize'")
+    ksize_node = name_to_node[ksize_input_name]
+    value = ksize_node.attr["value"]
+    t = make_ndarray(value.tensor)
+    kernel_size_y = t[1]
+    kernel_size_x = t[2]
+    if t[0] != 1:
+      raise ValueError("pool ksize for first dim is not 1")
+    if t[3] != 1:
+      raise ValueError("pool ksize for last dim is not 1")
+  else:
+    ksize = node.attr["ksize"]
+    kernel_size_y = ksize.list.i[1]
+    kernel_size_x = ksize.list.i[2]
+    if ksize.list.i[0] != 1:
+      raise ValueError("pool ksize for first dim is not 1")
+    if ksize.list.i[3] != 1:
+      raise ValueError("pool ksize for last dim is not 1")
   return kernel_size_x, kernel_size_y
 
 
@@ -243,7 +272,7 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
   logging.vlog(3, "node.op = %s", node.op)
   logging.vlog(4, "node = %s", node)
   if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative":
-    stride_x, stride_y = _stride_size(node)
+    stride_x, stride_y = _stride_size(node, name_to_node)
     kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_node)
     # Compute the padding for this node separately for each direction.
     total_padding_x, padding_x = _padding_size_conv_pool(
@@ -260,9 +289,9 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
     stride_y = 1
     total_padding_x, padding_x, total_padding_y, padding_y = (
         _padding_size_pad_layer(node, name_to_node))
-  elif node.op == "MaxPool" or node.op == "AvgPool":
-    stride_x, stride_y = _stride_size(node)
-    kernel_size_x, kernel_size_y = _pool_kernel_size(node)
+  elif node.op == "MaxPool" or node.op == "MaxPoolV2" or node.op == "AvgPool":
+    stride_x, stride_y = _stride_size(node, name_to_node)
+    kernel_size_x, kernel_size_y = _pool_kernel_size(node, name_to_node)
     # Compute the padding for this node separately for each direction.
     total_padding_x, padding_x = _padding_size_conv_pool(
         node, kernel_size_x, stride_x, input_resolution[1]
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 3263ed1a60..4466beeec9 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -1811,16 +1811,17 @@ class PoolingTest(test.TestCase):
       if test.is_gpu_available():
         pool_funcs.append(nn_ops.max_pool_with_argmax)
       for pool_func in pool_funcs:
-        # Illegal strides.
-        with self.assertRaisesRegexp(
-            errors_impl.UnimplementedError,
-            "Pooling is not yet supported on the batch"):
-          sess.run(
-              pool_func(
-                  array_ops.placeholder(dtypes.float32),
-                  ksize=[1, 1, 1, 1],
-                  strides=[2, 1, 1, 1],
-                  padding="SAME"))
+        if pool_func != nn_ops.max_pool:
+          # Illegal strides.
+          with self.assertRaisesRegexp(
+              errors_impl.UnimplementedError,
+              "Pooling is not yet supported on the batch"):
+            sess.run(
+                pool_func(
+                    array_ops.placeholder(dtypes.float32),
+                    ksize=[1, 1, 1, 1],
+                    strides=[2, 1, 1, 1],
+                    padding="SAME"))
 
         # Filter too large.
         with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 9c875b4bcb..a691e281ee 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -2116,13 +2116,12 @@ def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   """
   with ops.name_scope(name, "MaxPool", [value]) as name:
     value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops._max_pool(
-        value,
-        ksize=ksize,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        name=name)
+    return gen_nn_ops._max_pool(value,
+                                ksize=ksize,
+                                strides=strides,
+                                padding=padding,
+                                data_format=data_format,
+                                name=name)
 
 
 @ops.RegisterStatistics("Conv2D", "flops")
-- 
GitLab


From f751a8a016a4d486073cec0c061c877cb94ca9d4 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Mon, 29 Jan 2018 14:31:51 -0800
Subject: [PATCH 349/966] [TF:XLA] Bump open source llvm revision to r323630

PiperOrigin-RevId: 183728562
---
 tensorflow/workspace.bzl    |  8 ++++----
 third_party/llvm/llvm.BUILD | 24 ++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 26abebe2de..357b04d75f 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -472,11 +472,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/11a2ca6eea8a7fe240a14c0c35fd2017341279be.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/11a2ca6eea8a7fe240a14c0c35fd2017341279be.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/674f4d74284e9c431c7b69e30f6e10fc4fcbc9ef.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/674f4d74284e9c431c7b69e30f6e10fc4fcbc9ef.tar.gz",
       ],
-      sha256 = "b5429ccf8d57273cb8489714f728c997cd720ec66fc2c0292422ab8f0e729ce0",
-      strip_prefix = "llvm-11a2ca6eea8a7fe240a14c0c35fd2017341279be",
+      sha256 = "3330c50efc9fc5d742e227dc810c2f586c7e36a60ecacd8251fafd2ea591e404",
+      strip_prefix = "llvm-674f4d74284e9c431c7b69e30f6e10fc4fcbc9ef",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
   )
 
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 5344525ba8..a9e1341a03 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -670,6 +670,28 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "aggressive_inst_combine",
+    srcs = glob([
+        "lib/Transforms/AggressiveInstCombine/*.c",
+        "lib/Transforms/AggressiveInstCombine/*.cpp",
+        "lib/Transforms/AggressiveInstCombine/*.inc",
+        "lib/Transforms/AggressiveInstCombine/*.h",
+    ]),
+    hdrs = glob([
+        "include/llvm/Transforms/AggressiveInstCombine/*.h",
+        "include/llvm/Transforms/AggressiveInstCombine/*.def",
+        "include/llvm/Transforms/AggressiveInstCombine/*.inc",
+    ]),
+    deps = [
+        ":analysis",
+        ":config",
+        ":core",
+        ":support",
+        ":transform_utils",
+    ],
+)
+
 cc_library(
     name = "analysis",
     srcs = glob([
@@ -1405,6 +1427,7 @@ cc_library(
         "include/llvm/Transforms/IPO/*.inc",
     ]),
     deps = [
+        ":aggressive_inst_combine",
         ":analysis",
         ":bit_reader",
         ":bit_writer",
@@ -1931,6 +1954,7 @@ cc_library(
         "include/llvm/Transforms/IPO/SCCP.h",
     ]),
     deps = [
+        ":aggressive_inst_combine",
         ":analysis",
         ":config",
         ":core",
-- 
GitLab


From 1aa6e1a67a8c952501058c24e8af687fe9340560 Mon Sep 17 00:00:00 2001
From: resec <resec0109@gmail.com>
Date: Tue, 30 Jan 2018 06:36:21 +0800
Subject: [PATCH 350/966] Add nsync lib dep. to cc_library rule
 android_tensorflow_lib_selective_registration (#16117)

---
 tensorflow/core/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 455da05738..3b4a10eedb 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1151,6 +1151,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
     alwayslink = 1,
-- 
GitLab


From c7351055a89b90c3114fb4c24f880a947a15352e Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Mon, 29 Jan 2018 14:37:17 -0800
Subject: [PATCH 351/966] Set default signal handler for SIGINT (keyboard
 interrupt).

The default signal handling for Python delivers the signal to the thread for
processing; if broad exceptions are being caught, this can often result in the
interrupt being swallowed.

PiperOrigin-RevId: 183729637
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 29 +++++++++++++++----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 2ae3a26a85..7907d0baa5 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import collections
 from contextlib import contextmanager
 import copy
+import signal
 import threading
 import time
 import traceback
@@ -467,12 +468,12 @@ class _OpQueueContext(object):
 
   def read_iteration_counts(self):
     while True:
-      signal = self._queue.get(block=True)
-      logging.debug('%s read signal %s', self._name, signal)
-      if signal == _SIGNAL.STOP:
-        logging.info('%s received signal, stopping.', self._name)
+      iterations = self._queue.get(block=True)
+      logging.debug('%s read iterations %s', self._name, iterations)
+      if iterations == _SIGNAL.STOP:
+        logging.info('%s received shutdown signal, stopping.', self._name)
         return
-      yield signal
+      yield iterations
 
   def join(self):
     logging.info('Shutting down %s thread.' % self._name)
@@ -1387,6 +1388,23 @@ class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
     logging.info('examples/sec: %g', examples_per_sec)
 
 
+class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
+  """Change SIGINT (CTRL^C) handler to force quit the process.
+
+  The default behavior often results in hanging processes.
+  The original handler is restored after training/evaluation.
+  """
+
+  def __init__(self):
+    self._signal_fn = signal.getsignal(signal.SIGINT)
+
+  def before_run(self, run_context):
+    signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+  def end(self, session):
+    signal.signal(signal.SIGINT, self._signal_fn)
+
+
 class TPUEstimator(estimator_lib.Estimator):
   """Estimator with TPU support.
 
@@ -1704,6 +1722,7 @@ class TPUEstimator(estimator_lib.Estimator):
           hooks = [
               TPUInfeedOutfeedSessionHook(ctx, enqueue_ops),
               ExamplesPerSecondHook(ctx.global_batch_size),
+              InstallSignalHandlerHook(),
               training.LoggingTensorHook(
                   {
                       'loss': array_ops.identity(loss),
-- 
GitLab


From a807755db184c2d2c3b9bcca65457e9915508650 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 29 Jan 2018 14:40:07 -0800
Subject: [PATCH 352/966] Use Popen.communicate() instead of read() in
 stacktrace_handler_test.py.

This avoids potential deadlock, see the warnings in
https://docs.python.org/2/library/subprocess.html#popen-objects. I
found that enabling the C API caused us to deadlock without this
change.

PiperOrigin-RevId: 183730170
---
 tensorflow/python/platform/stacktrace_handler_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/platform/stacktrace_handler_test.py b/tensorflow/python/platform/stacktrace_handler_test.py
index 3f0e534f4c..f2071f9d54 100644
--- a/tensorflow/python/platform/stacktrace_handler_test.py
+++ b/tensorflow/python/platform/stacktrace_handler_test.py
@@ -57,7 +57,8 @@ class StacktraceHandlerTest(test.TestCase):
 
     # Capture its output. capture both stdout and stderr and append them.
     # We are not worried about timing or order of messages in this test.
-    child_output = child_process.stdout.read() + child_process.stderr.read()
+    child_stdout, child_stderr = child_process.communicate()
+    child_output = child_stdout + child_stderr
 
     # Make sure the child process is dead before we proceed.
     child_process.wait()
-- 
GitLab


From 8e03944589542bd64559d68989bca4a4705eed93 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Mon, 29 Jan 2018 14:49:13 -0800
Subject: [PATCH 353/966] Fix build (part1): 1. Changed includes of "NvInfer.h"
 to "tensorrt/include/NvInfer.h" 2. Remove build target "tensorrt_ops.so" (src
 file doesn't exist and the    target is not used anywhere) 3. Add missing
 '#if GOOGLE_TENSORRT's 4. Use tf_cuda_library instead of cc_library for some
 targets to get the    tf_copts naturally. 5. Revert the changes that was
 accidentally made (by merging with    upstream head) from configure.py 6.
 Replace exception with LOG(FATAL) in   
 tensorflow/contrib/tensorrt/convert/convert_nodes.cc as exception is    not
 supported. 7. Revert the reinterprete_cast change in   
 tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc 8. Fix minor formatting
 and naming issues according to the style guide.

---
 configure.py                                  |  3 ---
 tensorflow/contrib/tensorrt/BUILD             | 24 +++++--------------
 .../contrib/tensorrt/convert/convert_graph.cc |  3 +--
 .../contrib/tensorrt/convert/convert_nodes.cc |  9 +++----
 .../contrib/tensorrt/convert/convert_nodes.h  |  1 +
 .../contrib/tensorrt/kernels/trt_engine_op.cc |  3 +--
 tensorflow/contrib/tensorrt/log/trt_logger.cc | 16 ++++++-------
 tensorflow/contrib/tensorrt/log/trt_logger.h  | 10 ++++----
 .../contrib/tensorrt/segment/segment.cc       |  9 -------
 tensorflow/contrib/tensorrt/segment/segment.h |  1 +
 .../contrib/tensorrt/segment/union_find.h     |  1 +
 .../contrib/tensorrt/shape_fn/trt_shfn.cc     |  5 ++--
 12 files changed, 32 insertions(+), 53 deletions(-)

diff --git a/configure.py b/configure.py
index 1567ed697f..34de1ad2d5 100644
--- a/configure.py
+++ b/configure.py
@@ -1354,7 +1354,6 @@ def main():
     environ_cp['TF_NEED_GCP'] = '0'
     environ_cp['TF_NEED_HDFS'] = '0'
     environ_cp['TF_NEED_JEMALLOC'] = '0'
-    environ_cp['TF_NEED_KAFKA'] = '0'
     environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
     environ_cp['TF_NEED_COMPUTECPP'] = '0'
     environ_cp['TF_NEED_OPENCL'] = '0'
@@ -1373,8 +1372,6 @@ def main():
                 'with_hdfs_support', True, 'hdfs')
   set_build_var(environ_cp, 'TF_NEED_S3', 'Amazon S3 File System',
                 'with_s3_support', True, 's3')
-  set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform',
-                'with_kafka_support', False, 'kafka')
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 False, 'xla')
   set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support',
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index b179e815c8..c09f842cbe 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -19,8 +19,8 @@ load(
     "tf_cc_test",
     "tf_kernel_library",
     "tf_cuda_cc_test",
+    "tf_cuda_library",
     "tf_custom_op_py_library",
-    "tf_copts",
 )
 load(
     "@local_config_tensorrt//:build_defs.bzl",
@@ -62,13 +62,12 @@ tf_custom_op_library(
     ],
 )
 
-cc_library(
+tf_cuda_library(
     name = "trt_shape_function",
     srcs = [
         "shape_fn/trt_shfn.cc",
     ],
     hdrs = ["shape_fn/trt_shfn.h"],
-    copts = tf_copts(),
     deps = [
         ":trt_logging",
         "//tensorflow/core:framework_headers_lib",
@@ -111,7 +110,7 @@ tf_gen_op_libs(
     ],
 )
 
-cc_library(
+tf_cuda_library(
     name = "trt_logging",
     srcs = [
         "log/trt_logger.cc",
@@ -130,6 +129,7 @@ tf_gen_op_wrapper_py(
     name = "trt_engine_op",
     deps = [
         ":trt_engine_op_op_lib",
+        ":trt_logging",
         ":trt_shape_function",
     ],
 )
@@ -189,23 +189,21 @@ tf_py_wrap_cc(
     ],
 )
 
-cc_library(
+tf_cuda_library(
     name = "trt_conversion",
     srcs = [
         "convert/convert_graph.cc",
         "convert/convert_nodes.cc",
-        "segment/segment.cc",
     ],
     hdrs = [
         "convert/convert_graph.h",
         "convert/convert_nodes.h",
-        "segment/segment.h",
-        "segment/union_find.h",
     ],
     deps = [
         "@local_config_tensorrt//:nv_infer",
         "@protobuf_archive//:protobuf_headers",
         "@nsync//:nsync_headers",
+        ":segment",
         ":trt_logging",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:protos_all_cc",
@@ -220,16 +218,6 @@ cc_library(
     ],
 )
 
-tf_custom_op_library(
-    name = "tensorrt_ops.so",
-    srcs = [
-        "ops/tensorrt_ops.cc",
-    ],
-    deps = [
-        "@local_config_tensorrt//:nv_infer",
-    ],
-)
-
 # Library for the segmenting portion of TensorRT operation creation
 cc_library(
     name = "segment",
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 72b9e9d2cb..eea51d458b 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -41,14 +41,13 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
-#include "tensorrt/include/NvInfer.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "NvInfer.h"
 #include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorrt/include/NvInfer.h"
 
 //------------------------------------------------------------------------------
 namespace tensorflow {
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 89996c0c0c..5879cd3189 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorrt/include/NvInfer.h"
 
 #define _TF_LOG_DEBUG ::tensorflow::internal::LogMessage(__FILE__, __LINE__, -1)
 //  Check if the types are equal. Cast to int first so that failure log message
@@ -43,11 +42,13 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "NvInfer.h"
+
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorrt/include/NvInfer.h"
+
 #define CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)
-//------------------------------------------------------------------------------
+
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
@@ -234,7 +235,7 @@ class TFAttrs {
   bool count(std::string key) const { return _attrs.count(key); }
   tensorflow::AttrValue const* at(std::string key) const {
     if (!_attrs.count(key)) {
-      throw std::out_of_range("Attribute not found: " + key);
+      LOG(FATAL) << "Attribute not found: " << key;
     }
     return _attrs.at(key);
   }
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 56331d79f9..a1f9c3f4a1 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -39,6 +39,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     size_t max_workspace_size,
     const tensorflow::grappler::GraphProperties& graph_prop,
     tensorflow::NodeDef* trt_node);
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index a03e864c2d..dc8b625731 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -71,8 +71,7 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
     }
     switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
       case nvinfer1::DataType::kFLOAT:
-        buffers[binding_index] =
-            reinterpret_cast<void*>(input_tensor.flat<float>().data());
+        buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
         LOG(FATAL) << "half size is not supported yet!";
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index 3910ed1201..fc62c64b6e 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -12,24 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+
+#include "tensorflow/core/platform/logging.h"
+
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 // Use TF logging for TensorRT informations
-#include "tensorflow/core/platform/logging.h"
 
 #define _TF_LOG_DEBUG ::tensorflow::internal::LogMessage(__FILE__, __LINE__, -1)
-//------------------------------------------------------------------------------
-namespace tensorflow {
 
-//------------------------------------------------------------------------------
+namespace tensorflow {
 namespace tensorrt {
 
 void Logger::log(Severity severity, const char* msg) {
-  // suppress info-level messages
+  // Suppress info-level messages
   switch (severity) {
-    case Severity::kINFO: {  // mark TRT info messages as debug!
+    case Severity::kINFO: {  // Mark TRT info messages as debug!
       LOG(DEBUG) << msg;
       break;
     }
@@ -55,7 +56,6 @@ void Logger::log(Severity severity, const char* msg) {
 }
 
 }  // namespace tensorrt
-
 }  // namespace tensorflow
 
 #endif // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index 8737813bed..f82f3188e8 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -17,15 +17,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
 #define TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
 
-// Use TF logging f
 #include <string>
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "tensorrt/include/NvInfer.h"
 
-//------------------------------------------------------------------------------
 namespace tensorflow {
-
-//------------------------------------------------------------------------------
 namespace tensorrt {
 
 // Logger for GIE info/warning/errors
@@ -37,6 +35,8 @@ class Logger : public nvinfer1::ILogger {
 };
 
 }  // namespace tensorrt
-
 }  // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index c184253d3a..89457b71e8 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 
 #include <set>
@@ -29,15 +27,12 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 
-//------------------------------------------------------------------------------
 namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 
-//------------------------------------------------------------------------------
 namespace {
 
-//------------------------------------------------------------------------------
 bool CanContractEdge(const tensorflow::Edge* edge,
                      const tensorflow::Graph& graph) {
   const tensorflow::Node* src = edge->src();
@@ -122,7 +117,6 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
 
 }  // namespace
 
-//------------------------------------------------------------------------------
 tensorflow::Status SegmentGraph(
     const tensorflow::GraphDef& gdef,
     const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
@@ -267,6 +261,3 @@ tensorflow::Status SegmentGraph(
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
-
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index 1bb70b82f9..1c7ccde7d3 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -52,4 +52,5 @@ tensorflow::Status SegmentGraph(
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
+
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
diff --git a/tensorflow/contrib/tensorrt/segment/union_find.h b/tensorflow/contrib/tensorrt/segment/union_find.h
index 0d3cf62cdf..1c64ebbb0a 100644
--- a/tensorflow/contrib/tensorrt/segment/union_find.h
+++ b/tensorflow/contrib/tensorrt/segment/union_find.h
@@ -75,4 +75,5 @@ UnionFind<T>* UnionFind<T>::FindRoot() {
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
+
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index 7624237efe..fef63c64d8 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -24,11 +24,12 @@ limitations under the License.
 
 namespace tensorflow {
 namespace shape_inference {
+
 tensorflow::Status TRTEngineOpShapeInference(InferenceContext* c) {
-  tensorflow::tensorrt::Logger gLogger;
+  tensorflow::tensorrt::Logger logger;
   string serialized_engine;
   c->GetAttr("serialized_engine", &serialized_engine);
-  nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(gLogger);
+  nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
   nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
       serialized_engine.c_str(), serialized_engine.size(), nullptr);
 
-- 
GitLab


From 70d608a5bd58f8da2dfb8b76a8ce94ac8d03aa4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carl=20Thom=C3=A9?= <carlthome@gmail.com>
Date: Tue, 30 Jan 2018 00:06:28 +0100
Subject: [PATCH 354/966] Spelling (#16547)

---
 tensorflow/python/training/basic_session_run_hooks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 752d585cd1..864c2e4406 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -331,7 +331,7 @@ class CheckpointSaverListener(object):
   `CheckpointSaverHook`, as in this example:
 
   ```python
-  class ExampleCheckpointSaverListerner(CheckpointSaverListener):
+  class ExampleCheckpointSaverListener(CheckpointSaverListener):
     def begin(self):
       # You can add ops to the graph here.
       print('Starting the session.')
@@ -347,7 +347,7 @@ class CheckpointSaverListener(object):
       print('Done with the session.')
 
   ...
-  listener = ExampleCheckpointSaverListerner()
+  listener = ExampleCheckpointSaverListener()
   saver_hook = tf.train.CheckpointSaverHook(
       checkpoint_dir, listeners=[listener])
   with tf.train.MonitoredTrainingSession(chief_only_hooks=[saver_hook]):
-- 
GitLab


From be50e66950a4534322f55fbcbc300ea710f2b6c6 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Mon, 29 Jan 2018 15:08:26 -0800
Subject: [PATCH 355/966] Fix compilation issues with new build files

---
 configure.py                                  |  5 +++
 tensorflow/contrib/tensorrt/BUILD             |  1 +
 tensorflow/contrib/tensorrt/log/trt_logger.cc |  7 +----
 third_party/gpus/cuda_configure.bzl           | 23 +++++++++-----
 third_party/tensorrt/BUILD.tpl                | 31 -------------------
 third_party/tensorrt/build_defs.bzl.tpl       |  1 -
 third_party/tensorrt/tensorrt_configure.bzl   |  4 +--
 7 files changed, 25 insertions(+), 47 deletions(-)

diff --git a/configure.py b/configure.py
index 1567ed697f..f136577005 100644
--- a/configure.py
+++ b/configure.py
@@ -1422,6 +1422,11 @@ def main():
       if not is_windows():
         set_gcc_host_compiler_path(environ_cp)
     set_other_cuda_vars(environ_cp)
+    # superceeded by call above
+    # enable tensorrt if desired. Disabled on non-linux
+    # set_action_env_var(environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False)
+    # if environ_cp.get('TF_NEED_TENSORRT') == '1':
+    #   set_tf_trt_version(environ_cp)
 
   set_build_var(environ_cp, 'TF_NEED_MPI', 'MPI', 'with_mpi_support', False)
   if environ_cp.get('TF_NEED_MPI') == '1':
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index b179e815c8..57c106c812 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -77,6 +77,7 @@ cc_library(
         "@nsync//:nsync_headers",
         "@protobuf_archive//:protobuf",
     ],
+    visibility = ["//visibility:public"], #for c/c++ linking    
 )
 
 tf_kernel_library(
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index 3910ed1201..d8a0310828 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
 
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 // Use TF logging for TensorRT informations
@@ -30,7 +28,7 @@ void Logger::log(Severity severity, const char* msg) {
   // suppress info-level messages
   switch (severity) {
     case Severity::kINFO: {  // mark TRT info messages as debug!
-      LOG(DEBUG) << msg;
+      VLOG(-1) << msg;
       break;
     }
     case Severity::kWARNING: {
@@ -57,6 +55,3 @@ void Logger::log(Severity severity, const char* msg) {
 }  // namespace tensorrt
 
 }  // namespace tensorflow
-
-#endif // GOOGLE_TENSORRT
-#endif // GOOGLE_CUDA
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 8e1dd8a54f..7504154735 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -358,8 +358,8 @@ def find_cuda_define(repository_ctx, header_dir, header_file, define):
   if not h_path.exists:
     auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
   result = repository_ctx.execute(
-      # Grep one more lines as some #defines are splitted into two lines.
-      ["grep", "--color=never", "-A1", "-E", define, str(h_path)])
+    # Grep one more lines as some #defines are splitted into two lines.
+    ["grep", "--color=never", "-A1", "-E", define, str(h_path)])
   if result.stderr:
     auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
 
@@ -367,11 +367,20 @@ def find_cuda_define(repository_ctx, header_dir, header_file, define):
   if result.stdout.find(define) == -1:
     auto_configure_fail("Cannot find line containing '%s' in %s" %
                         (define, h_path))
-  version = result.stdout
-  # Remove the new line and '\' character if any.
-  version = version.replace("\\", " ")
-  version = version.replace("\n", " ")
-  version = version.replace(define, "").lstrip()
+  #split results to lines
+  lines=result.stdout.split('\n')
+  lenLines=len(lines)
+  for l in range(lenLines):
+    line=lines[l]
+    if define in line: # find the line with define
+      version=line
+      if l != lenLines-1 and line[-1] == '\\': # add next line, if multiline
+        version=version[:-1]+lines[l+1]
+      break
+  #remove any comments
+  version = version.split("//")[0]
+  # remove define name
+  version = version.replace(define, "").strip()
   # Remove the code after the version number.
   version_end = version.find(" ")
   if version_end != -1:
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
index 99c0e89498..dc7fe0c8c8 100644
--- a/third_party/tensorrt/BUILD.tpl
+++ b/third_party/tensorrt/BUILD.tpl
@@ -34,37 +34,6 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-cc_library(
-    name = "nv_infer_plugin",
-    srcs = [%{nv_infer_plugin}],
-    data = [%{nv_infer_plugin}],
-    includes = [
-        "include",
-    ],
-    copts= cuda_default_copts(),
-    deps = [
-        "@local_config_cuda//cuda:cuda",
-        ":nv_infer",
-        ":tensorrt_headers",
-    ],
-    linkstatic = 1,
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "nv_parsers",
-    srcs = [%{nv_parsers}],
-    data = [%{nv_parsers}],
-    includes = [
-        "include",
-    ],
-    copts= cuda_default_copts(),
-    deps = [
-        ":tensorrt_headers",
-    ],
-    linkstatic = 1,
-    visibility = ["//visibility:public"],
-)
 
 %{tensorrt_genrules}
 
diff --git a/third_party/tensorrt/build_defs.bzl.tpl b/third_party/tensorrt/build_defs.bzl.tpl
index f5348a7c06..0dc3a7ba2d 100644
--- a/third_party/tensorrt/build_defs.bzl.tpl
+++ b/third_party/tensorrt/build_defs.bzl.tpl
@@ -5,4 +5,3 @@ def if_tensorrt(if_true, if_false=[]):
   if %{tensorrt_is_configured}:
     return if_true
   return if_false
-
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
index 8aa0f28f39..4a1441500a 100644
--- a/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -19,9 +19,9 @@ load(
 _TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
 _TF_TENSORRT_VERSION = "TF_TENSORRT_VERSION"
 
-_TF_TENSORRT_LIBS = ["nvinfer", "nvinfer_plugin", "nvparsers"]
+_TF_TENSORRT_LIBS = ["nvinfer"]
 _TF_TENSORRT_HEADERS = [
-    "NvInfer.h", "NvInferPlugin.h", "NvCaffeParser.h", "NvUffParser.h",
+    "NvInfer.h",
     "NvUtils.h"
 ]
 
-- 
GitLab


From aaaeef5bbc4698ea48c2476ad5c84a94712c8d2f Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Mon, 29 Jan 2018 15:06:30 -0800
Subject: [PATCH 356/966] Make TFLite SpaceToBatchND op have parity with TF
 SpaceToBatchND op.

PiperOrigin-RevId: 183734695
---
 tensorflow/contrib/lite/builtin_op_data.h     |   8 -
 .../contrib/lite/kernels/space_to_batch_nd.cc |  95 ++++++------
 .../lite/kernels/space_to_batch_nd_test.cc    | 141 ++++++++++++++----
 tensorflow/contrib/lite/model.cc              |  17 ---
 tensorflow/contrib/lite/schema/schema.fbs     |   3 -
 .../contrib/lite/schema/schema_generated.h    |  89 +----------
 .../contrib/lite/testing/generate_examples.py |  42 +++++-
 .../contrib/lite/toco/tflite/operator.cc      |  15 +-
 .../contrib/lite/toco/tflite/operator_test.cc |  13 --
 9 files changed, 204 insertions(+), 219 deletions(-)

diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 7a7e20a41e..a1037a525c 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -116,14 +116,6 @@ typedef struct {
 } TfLiteAddParams;
 
 typedef struct {
-  // Number of spatial dimensions.
-  // For now only NHWC is supported, and the value should always be 2.
-  int num_spatial_dimensions;
-  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
-  // For now we will fix the maximum possible number of dimensions.
-  int block_shape[2];
-  int before_paddings[2];
-  int after_paddings[2];
 } TfLiteSpaceToBatchNDParams;
 
 typedef struct {
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
index 2e22d0db56..e2e1873f77 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
@@ -33,17 +33,16 @@ enum KernelType {
   kGenericOptimized,
 };
 
-// Inputs specified in the 2nd tensor (block_shape) and 3rd tensor (paddings)
-// are ignored. Only use the `block_shape` and `paddings` specified in params.
-// TODO(nupurgarg): Support inputs as tensors in SpaceToBatchND.
 struct SpaceToBatchNDContext {
   SpaceToBatchNDContext(TfLiteContext* context, TfLiteNode* node) {
-    params = reinterpret_cast<TfLiteSpaceToBatchNDParams*>(node->builtin_data);
     input = GetInput(context, node, 0);
+    block_shape = GetInput(context, node, 1);
+    paddings = GetInput(context, node, 2);
     output = GetOutput(context, node, 0);
   }
-  TfLiteSpaceToBatchNDParams* params;
   TfLiteTensor* input;
+  TfLiteTensor* block_shape;
+  TfLiteTensor* paddings;
   TfLiteTensor* output;
 };
 
@@ -51,32 +50,29 @@ struct SpaceToBatchNDContext {
 // The 4D array need to have exactly 2 spatial dimensions.
 // TODO(nupurgarg): Support arbitrary dimension in SpaceToBatchND.
 const int kInputDimensionNum = 4;
-const int kOutputDimensionNum = 4;
+const int kBlockSizeDimensionNum = 1;
 const int kSpatialDimensionNum = 2;
-const int kPaddingDimensionNum = 4;
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE(context, NumInputs(node) >= 1 && NumInputs(node) <= 3);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                SpaceToBatchNDContext* op_context) {
+  TfLiteIntArray* input_size = op_context->input->dims;
+  const int32* block_shape = GetTensorData<int32>(op_context->block_shape);
+  const int32* paddings_data = GetTensorData<int32>(op_context->paddings);
 
-  SpaceToBatchNDContext op_context(context, node);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input),
-                    kInputDimensionNum);
-  TF_LITE_ENSURE_EQ(context, op_context.params->num_spatial_dimensions,
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->block_shape),
+                    kBlockSizeDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context->block_shape->dims->data[0],
+                    kSpatialDimensionNum);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->paddings),
                     kSpatialDimensionNum);
-  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
-
-  const TfLiteIntArray* input_size = op_context.input->dims;
-  const int* block_shape = op_context.params->block_shape;
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(kOutputDimensionNum);
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
 
   // Ensures the input height and width (with padding) is a multiple of block
   // shape height and width.
   for (int dim = 0; dim < kSpatialDimensionNum; ++dim) {
-    int final_dim_size =
-        (input_size->data[dim + 1] + op_context.params->before_paddings[dim] +
-         op_context.params->after_paddings[dim]);
+    int final_dim_size = (input_size->data[dim + 1] + paddings_data[dim * 2] +
+                          paddings_data[dim * 2 + 1]);
     TF_LITE_ENSURE_EQ(context, final_dim_size % block_shape[dim], 0);
     output_size->data[dim + 1] = final_dim_size / block_shape[dim];
   }
@@ -88,33 +84,44 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   output_size->data[0] = output_batch_size;
   output_size->data[3] = output_channel_size;
 
-  return context->ResizeTensor(context, op_context.output, output_size);
+  return context->ResizeTensor(context, op_context->output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  SpaceToBatchNDContext op_context(context, node);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input),
+                    kInputDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+
+  if (!IsConstantTensor(op_context.block_shape) ||
+      !IsConstantTensor(op_context.paddings)) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
 }
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   SpaceToBatchNDContext op_context(context, node);
 
-  int block_shape_dims_array[1] = {kSpatialDimensionNum};
-  Dims<4> block_shape_dims = GetTensorDims(block_shape_dims_array, 1);
-
-  // Initialize padding array in the format accepted by the kernel code.
-  // TODO(nupurgarg): Make kernel code accept padding array format that is
-  // consistent with Pad operation (i.e. before_paddings and after_paddings).
-  TfLiteIntArray* padding_data = TfLiteIntArrayCreate(kPaddingDimensionNum);
-  padding_data->data[0] = op_context.params->before_paddings[0];
-  padding_data->data[1] = op_context.params->after_paddings[0];
-  padding_data->data[2] = op_context.params->before_paddings[1];
-  padding_data->data[3] = op_context.params->after_paddings[1];
-  int padding_dims_array[1] = {kPaddingDimensionNum};
-  Dims<4> padding_dims = GetTensorDims(padding_dims_array, 1);
-
-#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar)                          \
-  type::SpaceToBatchND(GetTensorData<scalar>(op_context.input),          \
-                       GetTensorDims(op_context.input),                  \
-                       op_context.params->block_shape, block_shape_dims, \
-                       padding_data->data, padding_dims,                 \
-                       GetTensorData<scalar>(op_context.output),         \
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
+  }
+
+#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar)                        \
+  type::SpaceToBatchND(GetTensorData<scalar>(op_context.input),        \
+                       GetTensorDims(op_context.input),                \
+                       GetTensorData<int32_t>(op_context.block_shape), \
+                       GetTensorDims(op_context.block_shape),          \
+                       GetTensorData<int32_t>(op_context.paddings),    \
+                       GetTensorDims(op_context.paddings),             \
+                       GetTensorData<scalar>(op_context.output),       \
                        GetTensorDims(op_context.output))
   switch (op_context.input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
@@ -151,8 +158,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteError;
   }
 #undef TF_LITE_SPACE_TO_BATCH_ND
-
-  TfLiteIntArrayFree(padding_data);
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
index 45a6aef73d..92a4a037d5 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
@@ -26,41 +26,81 @@ using ::testing::ElementsAreArray;
 
 class SpaceToBatchNDOpModel : public SingleOpModel {
  public:
-  SpaceToBatchNDOpModel(std::initializer_list<int> input_shape,
-                        std::initializer_list<int> block_shape,
-                        std::initializer_list<int> before_paddings,
-                        std::initializer_list<int> after_paddings) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
-                 BuiltinOptions_SpaceToBatchNDOptions,
-                 CreateSpaceToBatchNDOptions(
-                     builder_, builder_.CreateVector<int>(block_shape),
-                     builder_.CreateVector<int>(before_paddings),
-                     builder_.CreateVector<int>(after_paddings))
-                     .Union());
-    BuildInterpreter({input_shape});
-  }
-
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetBlockShape(std::initializer_list<int> data) {
+    PopulateTensor<int>(block_shape_, data);
+  }
+
+  void SetPaddings(std::initializer_list<int> data) {
+    PopulateTensor<int>(paddings_, data);
+  }
+
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
- private:
+ protected:
   int input_;
+  int block_shape_;
+  int paddings_;
   int output_;
 };
 
+// Tests case where block_shape and paddings are const tensors.
+//
+// Example usage is as follows:
+//    SpaceToBatchNDOpConstModel m(input_shape, block_shape, paddings);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel {
+ public:
+  SpaceToBatchNDOpConstModel(std::initializer_list<int> input_shape,
+                             std::initializer_list<int> block_shape,
+                             std::initializer_list<int> paddings) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
+    paddings_ = AddConstInput(TensorType_INT32, paddings, {2, 2});
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
+                 BuiltinOptions_SpaceToBatchNDOptions,
+                 CreateSpaceToBatchNDOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Tests case where block_shape and paddings are non-const tensors.
+//
+// Example usage is as follows:
+//    SpaceToBatchNDOpDynamicModel m(input_shape);
+//    m.SetInput(input_data);
+//    m.SetBlockShape(block_shape);
+//    m.SetPaddings(paddings);
+//    m.Invoke();
+class SpaceToBatchNDOpDynamicModel : public SpaceToBatchNDOpModel {
+ public:
+  SpaceToBatchNDOpDynamicModel(std::initializer_list<int> input_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddInput(TensorType_INT32);
+    paddings_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
+                 BuiltinOptions_SpaceToBatchNDOptions,
+                 CreateSpaceToBatchNDOptions(builder_).Union());
+    BuildInterpreter({input_shape, {2}, {2, 2}});
+  }
+};
+
 TEST(SpaceToBatchNDOpTest, InvalidShapeTest) {
-  EXPECT_DEATH(SpaceToBatchNDOpModel({1, 3, 3, 1}, {2, 2}, {0, 0}, {0, 0}),
+  EXPECT_DEATH(SpaceToBatchNDOpConstModel({1, 3, 3, 1}, {2, 2}, {0, 0, 0, 0}),
                "Cannot allocate tensors");
 }
 
-TEST(SpaceToBatchNDOpTest, SimpleTest) {
-  SpaceToBatchNDOpModel m({1, 4, 4, 1}, {2, 2}, {0, 0}, {0, 0});
+TEST(SpaceToBatchNDOpTest, SimpleConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 4, 4, 1}, {2, 2}, {0, 0, 0, 0});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1}));
@@ -68,17 +108,39 @@ TEST(SpaceToBatchNDOpTest, SimpleTest) {
                                                13, 15, 6, 8, 14, 16}));
 }
 
-TEST(SpaceToBatchNDOpTest, MultipleInputBatches) {
-  SpaceToBatchNDOpModel m({2, 2, 4, 1}, {2, 2}, {0, 0}, {0, 0});
+TEST(SpaceToBatchNDOpTest, SimpleDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({1, 4, 4, 1});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetPaddings({0, 0, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+                                               13, 15, 6, 8, 14, 16}));
+}
+
+TEST(SpaceToBatchNDOpTest, MultipleInputBatchesConstTest) {
+  SpaceToBatchNDOpConstModel m({2, 2, 4, 1}, {2, 2}, {0, 0, 0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+                                               13, 15, 6, 8, 14, 16}));
+}
+
+TEST(SpaceToBatchNDOpTest, MultipleInputBatchesDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({2, 2, 4, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetPaddings({0, 0, 0, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
                                                13, 15, 6, 8, 14, 16}));
 }
 
-TEST(SpaceToBatchNDOpTest, SimplePadding) {
-  SpaceToBatchNDOpModel m({1, 5, 2, 1}, {3, 2}, {1, 2}, {0, 0});
+TEST(SpaceToBatchNDOpTest, SimplePaddingConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
@@ -88,9 +150,36 @@ TEST(SpaceToBatchNDOpTest, SimplePadding) {
                              }));
 }
 
-TEST(SpaceToBatchNDOpTest, ComplexPadding) {
-  SpaceToBatchNDOpModel m({1, 4, 2, 1}, {3, 2}, {1, 2}, {1, 4});
+TEST(SpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({1, 5, 2, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  m.SetBlockShape({3, 2});
+  m.SetPaddings({1, 0, 2, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 0, 5, 0, 0, 0, 6, 0, 1, 0, 7,
+                                 0, 2, 0, 8, 0, 3, 0, 9, 0, 4, 0, 10,
+                             }));
+}
+
+TEST(SpaceToBatchNDOpTest, ComplexPaddingConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 4, 2, 1}, {3, 2}, {1, 1, 2, 4});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0,
+                                 0, 1, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 8, 0, 0,
+                                 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0,
+                             }));
+}
+
+TEST(SpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({1, 4, 2, 1});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  m.SetBlockShape({3, 2});
+  m.SetPaddings({1, 1, 2, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index c82ae27953..b36bfcef84 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -516,23 +516,6 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       break;
     }
     case BuiltinOperator_SPACE_TO_BATCH_ND: {
-      auto* params = MallocPOD<TfLiteSpaceToBatchNDParams>();
-      if (auto* schema_params =
-              op->builtin_options_as_SpaceToBatchNDOptions()) {
-        const auto& block_shape = schema_params->block_shape();
-        FlatBufferIntVectorToArray(sizeof(params->block_shape), block_shape,
-                                   params->block_shape, error_reporter);
-        const auto& before_paddings = schema_params->before_paddings();
-        FlatBufferIntVectorToArray(sizeof(params->before_paddings),
-                                   before_paddings, params->before_paddings,
-                                   error_reporter);
-        const auto& after_paddings = schema_params->after_paddings();
-        FlatBufferIntVectorToArray(sizeof(params->after_paddings),
-                                   after_paddings, params->after_paddings,
-                                   error_reporter);
-        params->num_spatial_dimensions = block_shape->Length();
-      }
-      builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_BATCH_TO_SPACE_ND: {
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 91eac2ab48..c0b220e872 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -289,9 +289,6 @@ table ReshapeOptions {
 }
 
 table SpaceToBatchNDOptions {
-  block_shape:[int];
-  before_paddings:[int];
-  after_paddings:[int];
 }
 
 table BatchToSpaceNDOptions {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index a8370b34c6..29f3a17be7 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -2834,33 +2834,14 @@ flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(
 
 struct SpaceToBatchNDOptionsT : public flatbuffers::NativeTable {
   typedef SpaceToBatchNDOptions TableType;
-  std::vector<int32_t> block_shape;
-  std::vector<int32_t> before_paddings;
-  std::vector<int32_t> after_paddings;
   SpaceToBatchNDOptionsT() {}
 };
 
 struct SpaceToBatchNDOptions FLATBUFFERS_FINAL_CLASS
     : private flatbuffers::Table {
   typedef SpaceToBatchNDOptionsT NativeTableType;
-  enum { VT_BLOCK_SHAPE = 4, VT_BEFORE_PADDINGS = 6, VT_AFTER_PADDINGS = 8 };
-  const flatbuffers::Vector<int32_t> *block_shape() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BLOCK_SHAPE);
-  }
-  const flatbuffers::Vector<int32_t> *before_paddings() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_BEFORE_PADDINGS);
-  }
-  const flatbuffers::Vector<int32_t> *after_paddings() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_AFTER_PADDINGS);
-  }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_BLOCK_SHAPE) &&
-           verifier.Verify(block_shape()) &&
-           VerifyOffset(verifier, VT_BEFORE_PADDINGS) &&
-           verifier.Verify(before_paddings()) &&
-           VerifyOffset(verifier, VT_AFTER_PADDINGS) &&
-           verifier.Verify(after_paddings()) && verifier.EndTable();
+    return VerifyTableStart(verifier) && verifier.EndTable();
   }
   SpaceToBatchNDOptionsT *UnPack(
       const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2875,18 +2856,6 @@ struct SpaceToBatchNDOptions FLATBUFFERS_FINAL_CLASS
 struct SpaceToBatchNDOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_block_shape(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_shape) {
-    fbb_.AddOffset(SpaceToBatchNDOptions::VT_BLOCK_SHAPE, block_shape);
-  }
-  void add_before_paddings(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_paddings) {
-    fbb_.AddOffset(SpaceToBatchNDOptions::VT_BEFORE_PADDINGS, before_paddings);
-  }
-  void add_after_paddings(
-      flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_paddings) {
-    fbb_.AddOffset(SpaceToBatchNDOptions::VT_AFTER_PADDINGS, after_paddings);
-  }
   explicit SpaceToBatchNDOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
       : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2900,29 +2869,11 @@ struct SpaceToBatchNDOptionsBuilder {
 };
 
 inline flatbuffers::Offset<SpaceToBatchNDOptions> CreateSpaceToBatchNDOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_shape = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> before_paddings = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> after_paddings = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb) {
   SpaceToBatchNDOptionsBuilder builder_(_fbb);
-  builder_.add_after_paddings(after_paddings);
-  builder_.add_before_paddings(before_paddings);
-  builder_.add_block_shape(block_shape);
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<SpaceToBatchNDOptions>
-CreateSpaceToBatchNDOptionsDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int32_t> *block_shape = nullptr,
-    const std::vector<int32_t> *before_paddings = nullptr,
-    const std::vector<int32_t> *after_paddings = nullptr) {
-  return tflite::CreateSpaceToBatchNDOptions(
-      _fbb, block_shape ? _fbb.CreateVector<int32_t>(*block_shape) : 0,
-      before_paddings ? _fbb.CreateVector<int32_t>(*before_paddings) : 0,
-      after_paddings ? _fbb.CreateVector<int32_t>(*after_paddings) : 0);
-}
-
 flatbuffers::Offset<SpaceToBatchNDOptions> CreateSpaceToBatchNDOptions(
     flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT *_o,
     const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -5639,33 +5590,6 @@ inline void SpaceToBatchNDOptions::UnPackTo(
     const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  {
-    auto _e = block_shape();
-    if (_e) {
-      _o->block_shape.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->block_shape[_i] = _e->Get(_i);
-      }
-    }
-  };
-  {
-    auto _e = before_paddings();
-    if (_e) {
-      _o->before_paddings.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->before_paddings[_i] = _e->Get(_i);
-      }
-    }
-  };
-  {
-    auto _e = after_paddings();
-    if (_e) {
-      _o->after_paddings.resize(_e->size());
-      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
-        _o->after_paddings[_i] = _e->Get(_i);
-      }
-    }
-  };
 }
 
 inline flatbuffers::Offset<SpaceToBatchNDOptions> SpaceToBatchNDOptions::Pack(
@@ -5685,14 +5609,7 @@ inline flatbuffers::Offset<SpaceToBatchNDOptions> CreateSpaceToBatchNDOptions(
     const flatbuffers::rehasher_function_t *__rehasher;
   } _va = {&_fbb, _o, _rehasher};
   (void)_va;
-  auto _block_shape =
-      _o->block_shape.size() ? _fbb.CreateVector(_o->block_shape) : 0;
-  auto _before_paddings =
-      _o->before_paddings.size() ? _fbb.CreateVector(_o->before_paddings) : 0;
-  auto _after_paddings =
-      _o->after_paddings.size() ? _fbb.CreateVector(_o->after_paddings) : 0;
-  return tflite::CreateSpaceToBatchNDOptions(_fbb, _block_shape,
-                                             _before_paddings, _after_paddings);
+  return tflite::CreateSpaceToBatchNDOptions(_fbb);
 }
 
 inline BatchToSpaceNDOptionsT *BatchToSpaceNDOptions::UnPack(
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index f75d7c4bb9..e7606eecc4 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1335,12 +1335,16 @@ def make_space_to_batch_nd_tests(zip_path):
           "input_shape": [[1, 2, 2, 3], [2, 2, 4, 1]],
           "block_shape": [[1, 3], [2, 2]],
           "paddings": [[[0, 0], [0, 0]], [[0, 0], [2, 0]], [[1, 1], [1, 1]]],
+          "constant_block_shape": [True, False],
+          "constant_paddings": [True, False],
       },
       {
           "dtype": [tf.float32],
           "input_shape": [[2, 3, 7, 3]],
           "block_shape": [[1, 3], [2, 2]],
           "paddings": [[[0, 0], [2, 0]], [[1, 0], [1, 0]]],
+          "constant_block_shape": [True, False],
+          "constant_paddings": [True, False],
       },
       # Non-4D use case: 1 bath dimension, 3 spatial dimensions, 2 others.
       {
@@ -1348,23 +1352,47 @@ def make_space_to_batch_nd_tests(zip_path):
           "input_shape": [[1, 4, 4, 4, 1, 1]],
           "block_shape": [[2, 2, 2]],
           "paddings": [[[0, 0], [0, 0], [0, 0]]],
+          "constant_block_shape": [True, False],
+          "constant_paddings": [True, False],
       },
   ]
 
   def build_graph(parameters):
+    """Build a space_to_batch graph given `parameters`."""
     input_tensor = tf.placeholder(
         dtype=parameters["dtype"],
         name="input",
         shape=parameters["input_shape"])
-    out = tf.space_to_batch_nd(input_tensor, parameters["block_shape"],
-                               parameters["paddings"])
-    return [input_tensor], [out]
+    input_tensors = [input_tensor]
+
+    # Get block_shape either as a const or as a placeholder (tensor).
+    if parameters["constant_block_shape"]:
+      block_shape = parameters["block_shape"]
+    else:
+      shape = [len(parameters["block_shape"])]
+      block_shape = tf.placeholder(dtype=tf.int32, name="shape", shape=shape)
+      input_tensors.append(block_shape)
+
+    # Get paddings either as a const or as a placeholder (tensor).
+    if parameters["constant_paddings"]:
+      paddings = parameters["paddings"]
+    else:
+      shape = [len(parameters["paddings"]), 2]
+      paddings = tf.placeholder(dtype=tf.int32, name="paddings", shape=shape)
+      input_tensors.append(paddings)
+
+    out = tf.space_to_batch_nd(input_tensor, block_shape, paddings)
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(parameters["dtype"],
-                                      parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_block_shape"]:
+      values.append(np.array(parameters["block_shape"]))
+    if not parameters["constant_paddings"]:
+      values.append(np.array(parameters["paddings"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index e33a5788d8..e2162e1493 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -140,24 +140,11 @@ class SpaceToBatchND
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    auto block_shape = builder->CreateVector(op.block_shape);
-    auto before_paddings = builder->CreateVector(op.before_paddings);
-    auto after_paddings = builder->CreateVector(op.after_paddings);
-    return ::tflite::CreateSpaceToBatchNDOptions(
-        *builder, block_shape, before_paddings, after_paddings);
+    return ::tflite::CreateSpaceToBatchNDOptions(*builder);
   }
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {
-    op->block_shape.insert(op->block_shape.end(),
-                           options.block_shape()->begin(),
-                           options.block_shape()->end());
-    op->before_paddings.insert(op->before_paddings.end(),
-                               options.before_paddings()->begin(),
-                               options.before_paddings()->end());
-    op->after_paddings.insert(op->after_paddings.end(),
-                              options.after_paddings()->begin(),
-                              options.after_paddings()->end());
   }
 };
 
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index b4ec7bbd50..6daa296282 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -119,19 +119,6 @@ TEST_F(OperatorTest, BuiltinAdd) {
             output_toco_op->fused_activation_function);
 }
 
-TEST_F(OperatorTest, BuiltinSpaceToBatchND) {
-  SpaceToBatchNDOperator op;
-  op.block_shape = {2, 2};
-  op.before_paddings = {1, 2};
-  op.after_paddings = {3, 4};
-
-  auto output_toco_op = SerializeAndDeserialize(
-      GetOperator("SPACE_TO_BATCH_ND", OperatorType::kSpaceToBatchND), op);
-  EXPECT_EQ(op.block_shape, output_toco_op->block_shape);
-  EXPECT_EQ(op.before_paddings, output_toco_op->before_paddings);
-  EXPECT_EQ(op.after_paddings, output_toco_op->after_paddings);
-}
-
 TEST_F(OperatorTest, BuiltinMean) {
   MeanOperator op;
   op.keep_dims = false;
-- 
GitLab


From f6c1dd3264d518d74928676d49171af77a823692 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 15:11:32 -0800
Subject: [PATCH 357/966] Add transformation that exchanges a Reshape followed
 by an activation function.

PiperOrigin-RevId: 183735457
---
 tensorflow/contrib/lite/toco/BUILD            |  1 +
 .../fuse_activation_functions.cc              |  7 +-
 .../graph_transformations.h                   |  1 +
 .../reorder_activation_functions.cc           | 85 +++++++++++++++++++
 tensorflow/contrib/lite/toco/toco_tooling.cc  |  1 +
 tensorflow/contrib/lite/toco/tooling_util.cc  | 13 +++
 tensorflow/contrib/lite/toco/tooling_util.h   |  2 +
 7 files changed, 104 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 6fc7e5e3fd..20c156a932 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -205,6 +205,7 @@ cc_library(
         "graph_transformations/remove_trivial_quantized_activation_func.cc",
         "graph_transformations/remove_trivial_reshape.cc",
         "graph_transformations/remove_unused_op.cc",
+        "graph_transformations/reorder_activation_functions.cc",
         "graph_transformations/resolve_batch_normalization.cc",
         "graph_transformations/resolve_batch_to_space_nd_attributes.cc",
         "graph_transformations/resolve_constant_binary.cc",
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
index 88e59664ec..ab943f72d1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
@@ -68,12 +68,7 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
-  // TODO(b/72172404): Great many ops don't support activation function
-  // fusing. Switch to a categorizing function instead.
-  if (op->type == OperatorType::kConcatenation ||
-      op->type == OperatorType::kSlice ||
-      op->type == OperatorType::kTensorFlowReshape ||
-      op->type == OperatorType::kTensorFlowSplit) {
+  if (!OperatorSupportsFusedActivation(op->type)) {
     AddMessageF(
         "Not fusing activation function because the %s op doesn't support it",
         LogName(*op));
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index e11bebcd4e..cf90ebe996 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -144,6 +144,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantUnaryOperator)
 DECLARE_GRAPH_TRANSFORMATION(CreateIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(DropIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(ReadFakeQuantMinMax)
+DECLARE_GRAPH_TRANSFORMATION(ReorderActivationFunctions)
 DECLARE_GRAPH_TRANSFORMATION(ResolveReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowConcat)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMatMul)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
new file mode 100644
index 0000000000..cabbc4d313
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
@@ -0,0 +1,85 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ReorderActivationFunctions::Run(Model* model, std::size_t op_index) {
+  const auto ac_it = model->operators.begin() + op_index;
+  std::unique_ptr<Operator>& ac_op = *ac_it;
+  DCHECK(ac_op);
+
+  if (ac_op->type != OperatorType::kRelu6 &&
+      ac_op->type != OperatorType::kRelu1 &&
+      ac_op->type != OperatorType::kRelu) {
+    return false;
+  }
+
+  auto exchange_it = FindOpWithOutput(*model, ac_op->inputs[0]);
+  if (exchange_it == model->operators.end()) return false;
+  // Find the op producing the array passed to this activation function
+  std::unique_ptr<Operator>& exchange_op = *exchange_it;
+  DCHECK(exchange_op);
+
+  if (exchange_op->type != OperatorType::kTensorFlowReshape) {
+    return false;
+  }
+
+  DCHECK_EQ(exchange_op->outputs[0], ac_op->inputs[0]);
+  const auto& exchange_op_input = exchange_op->inputs[0];
+  const auto& intermediate_array = exchange_op->outputs[0];
+  const auto& ac_op_output = ac_op->outputs[0];
+
+  int count_ops_consuming_output =
+      CountOpsWithInput(*model, intermediate_array);
+  DCHECK_GE(count_ops_consuming_output, 1);
+  if (count_ops_consuming_output > 1) {
+    AddMessageF(
+        "Not exchanging activation function with %s because it is consumed by "
+        "more than 1 other operator",
+        LogName(*exchange_op));
+    return false;
+  }
+
+  // Rewire by changing inputs, including all consumers.
+  Operator* consumer = GetFirstOpWithInput(*model, ac_op_output);
+  while (consumer) {
+    for (int i = 0; i < consumer->inputs.size(); ++i) {
+      if (consumer->inputs[i] == ac_op_output) {
+        consumer->inputs[i] = intermediate_array;
+      }
+    }
+    consumer = GetFirstOpWithInput(*model, ac_op_output);
+  }
+  ac_op->inputs[0] = exchange_op_input;
+  exchange_op->inputs[0] = ac_op_output;
+
+  // Finally, reorder operators.  Note that this only works when there are no
+  // other direct descendents of the exchange_op.
+  ac_op.swap(exchange_op);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 727df1cc76..b715881774 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -68,6 +68,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveTensorFlowMatMul);
   transformations->Add(new FuseBinaryIntoPrecedingAffine);
   transformations->Add(new FuseBinaryIntoFollowingAffine);
+  transformations->Add(new ReorderActivationFunctions);
   transformations->Add(new ResolveBatchNormalization);
   transformations->Add(new ResolveConstantBinaryOperator);
   transformations->Add(new ResolveConstantFill);
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 08d9ac3aff..d2741a5e9b 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -304,6 +304,19 @@ string HelpfulOperatorTypeName(const Operator& op) {
   return OperatorTypeName(op.type);
 }
 
+bool OperatorSupportsFusedActivation(OperatorType type) {
+  switch (type) {
+    case OperatorType::kConcatenation:
+    case OperatorType::kSlice:
+    case OperatorType::kSqueeze:
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kTensorFlowSplit:
+      return false;
+    default:
+      return true;
+  }
+}
+
 void LogSummary(int log_level, const Model& model) {
   VLOG(log_level) << "Operators summary (" << model.operators.size()
                   << " operators):";
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 4051ba3576..a7e77a02eb 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -82,6 +82,8 @@ std::vector<std::unique_ptr<Operator>>::iterator FindOp(Model& model,
 const char* OperatorTypeName(OperatorType type);
 string HelpfulOperatorTypeName(const Operator& op);
 
+bool OperatorSupportsFusedActivation(OperatorType type);
+
 void DumpGraphvizVideoFrame(const Model& model);
 void LogDump(int log_level, const string& message, const Model& model);
 void LogSummary(int log_level, const string& message, const Model& model);
-- 
GitLab


From e4021e7060166ead2fc14a94c048b5fc5336e495 Mon Sep 17 00:00:00 2001
From: Bjarke Hammersholt Roune <broune@google.com>
Date: Mon, 29 Jan 2018 15:21:16 -0800
Subject: [PATCH 358/966] Add new StrCat utility functions for creating failed
 Status'es.

PiperOrigin-RevId: 183737063
---
 tensorflow/compiler/xla/util.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 2da9bb21b7..08df5b12b3 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -217,6 +217,24 @@ Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 // Passed-varargs variant of the InvalidArgument factory above.
 Status InvalidArgumentV(const char* format, va_list args);
 
+template <typename... Args>
+Status UnimplementedStrCat(Args&&... concat) {
+  return Unimplemented(
+      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+}
+
+template <typename... Args>
+Status InternalErrorStrCat(Args&&... concat) {
+  return InternalError(
+      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+}
+
+template <typename... Args>
+Status ResourceExhaustedStrCat(Args&&... concat) {
+  return ResourceExhausted(
+      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+}
+
 // Splits the lines of the original, replaces leading whitespace with the prefix
 // given by "indentation", and returns the string joined by newlines again. As a
 // side effect, any additional trailing whitespace is removed.
-- 
GitLab


From 6b23b788980787684fd9fcc716660d06c5e02ad6 Mon Sep 17 00:00:00 2001
From: elilienstein <elilienstein@users.noreply.github.com>
Date: Mon, 29 Jan 2018 15:27:57 -0800
Subject: [PATCH 359/966] Fix typos 'followings' 'optionanl' (#16549)

---
 configure.py                                     | 4 ++--
 tensorflow/contrib/coder/README.md               | 2 +-
 tensorflow/core/kernels/fractional_pool_common.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/configure.py b/configure.py
index 16763b8c0d..27519b4aba 100644
--- a/configure.py
+++ b/configure.py
@@ -298,7 +298,7 @@ def get_var(environ_cp,
       System".
     enabled_by_default: boolean for default behavior.
     question: optional string for how to ask for user input.
-    yes_reply: optionanl string for reply when feature is enabled.
+    yes_reply: optional string for reply when feature is enabled.
     no_reply: optional string for reply when feature is disabled.
 
   Returns:
@@ -411,7 +411,7 @@ def set_action_env_var(environ_cp,
       System".
     enabled_by_default: boolean for default behavior.
     question: optional string for how to ask for user input.
-    yes_reply: optionanl string for reply when feature is enabled.
+    yes_reply: optional string for reply when feature is enabled.
     no_reply: optional string for reply when feature is disabled.
   """
   var = int(
diff --git a/tensorflow/contrib/coder/README.md b/tensorflow/contrib/coder/README.md
index e1e867db5a..c6c379c458 100644
--- a/tensorflow/contrib/coder/README.md
+++ b/tensorflow/contrib/coder/README.md
@@ -30,7 +30,7 @@ following sense:
   around,
 - The number of CDF axes does not extend, i.e., `CDF.ndim == data.ndim + 1`.
 
-In the previous example where data has shape (10, 10), the followings are
+In the previous example where data has shape (10, 10), the following are
 acceptable CDF shapes:
 
 - (10, 10, 65)
diff --git a/tensorflow/core/kernels/fractional_pool_common.h b/tensorflow/core/kernels/fractional_pool_common.h
index df0bbbfa06..2d7a230fc0 100644
--- a/tensorflow/core/kernels/fractional_pool_common.h
+++ b/tensorflow/core/kernels/fractional_pool_common.h
@@ -57,7 +57,7 @@ static inline void RandomShuffle(Iter first, Iter last, const Random& uniform) {
 //     * sum(generated_diff_pooling_sequence) = input_length
 //     * Let's define floor(input_length / output_length) = K, then
 //       K <= generated_diff_pooling_sequence[i] <= K+1
-// For example, when input_length = 10, output_length = 6, the followings are
+// For example, when input_length = 10, output_length = 6, the following are
 // valid pooling sequence:
 //     * [1, 2, 2, 1, 2, 2]
 //     * [1, 1, 2, 2, 2, 2]
-- 
GitLab


From 8c1f3b3fcdc5f897dbce3b7315c1b7000069f9c4 Mon Sep 17 00:00:00 2001
From: ImSheridan <xiaoyudong-512@163.com>
Date: Tue, 30 Jan 2018 07:28:27 +0800
Subject: [PATCH 360/966] Fix some typos in the documation
 (tensorflow/docs_src) (#16519)

* Fix some typos in the documation (tensorflow/docs_src)

* update another minor typo fix

* revert typo correction

It seems "infed" was being used as a past-tense verb here and so was correct.
---
 tensorflow/docs_src/api_guides/python/regression_examples.md | 2 +-
 tensorflow/docs_src/get_started/custom_estimators.md         | 2 +-
 tensorflow/docs_src/get_started/datasets_quickstart.md       | 4 ++--
 tensorflow/docs_src/get_started/feature_columns.md           | 4 ++--
 tensorflow/docs_src/get_started/premade_estimators.md        | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/docs_src/api_guides/python/regression_examples.md b/tensorflow/docs_src/api_guides/python/regression_examples.md
index 45cb9d829c..dae50a8f03 100644
--- a/tensorflow/docs_src/api_guides/python/regression_examples.md
+++ b/tensorflow/docs_src/api_guides/python/regression_examples.md
@@ -229,4 +229,4 @@ passed through to the `model_fn` when the `model_fn` is called.
 The `model_fn` returns an
 @{tf.estimator.EstimatorSpec$`EstimatorSpec`} which is a simple structure
 indicating to the `Estimator` which operations should be run to accomplish
-varions tasks.
+various tasks.
diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
index 6343cc4ee4..79c4ee75d0 100644
--- a/tensorflow/docs_src/get_started/custom_estimators.md
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -15,7 +15,7 @@ git clone https://github.com/tensorflow/models/
 cd models/samples/core/get_started
 ```
 
-In this document we wil be looking at
+In this document we will be looking at
 [`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py).
 You can run it with the following command:
 
diff --git a/tensorflow/docs_src/get_started/datasets_quickstart.md b/tensorflow/docs_src/get_started/datasets_quickstart.md
index ecfbf160f0..a8a2ab6e56 100644
--- a/tensorflow/docs_src/get_started/datasets_quickstart.md
+++ b/tensorflow/docs_src/get_started/datasets_quickstart.md
@@ -169,7 +169,7 @@ the number of examples in the `Dataset` ensures that the data is completely
 shuffled. The Iris data set only contains 150 examples.
 
 The @{tf.data.Dataset.repeat$`repeat`} method has the `Dataset` restart when
-it reaches the end. To limit the number of epochss, set the `count` argument.
+it reaches the end. To limit the number of epochs, set the `count` argument.
 
 The @{tf.data.Dataset.repeat$`batch`} method collects a number of examples and
 stacks them, to create batches. This adds a dimension to their shape. The new
@@ -282,7 +282,7 @@ produce the necessary `(features, label)` pairs.
 
 We will start by building a function to parse a single line.
 
-The following `iris_data.parse_line` function acomplishes this taks using the
+The following `iris_data.parse_line` function accomplishes this task using the
 @{tf.decode_csv} function, and some simple python code:
 
 We must parse each of the lines in the dataset in order to generate the
diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md
index e3308ed716..ad3e1fe3e3 100644
--- a/tensorflow/docs_src/get_started/feature_columns.md
+++ b/tensorflow/docs_src/get_started/feature_columns.md
@@ -461,8 +461,8 @@ permitting a richer palette of numbers for every cell, an embedding column
 contains far fewer cells than an indicator column.
 
 Let's look at an example comparing indicator and embedding columns. Suppose our
-input examples consists of different words from a limited palette of only 81
-words. Further suppose that the data set provides provides the following input
+input examples consist of different words from a limited palette of only 81
+words. Further suppose that the data set provides the following input
 words in 4 separate examples:
 
 * `"dog"`
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
index dbc35065ab..4ef212a5b5 100644
--- a/tensorflow/docs_src/get_started/premade_estimators.md
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -363,7 +363,7 @@ Test set accuracy: 0.967
 
 We now have a trained model that produces good evaluation results.
 We can now use the trained model to predict the species of an Iris flower
-based on some unlabeled measurments. As with training and evaluation, we make
+based on some unlabeled measurements. As with training and evaluation, we make
 predictions using a single function call:
 
 ```python
-- 
GitLab


From e08f0080f822543d0a306075878c2e35dabf8cc0 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 29 Jan 2018 15:34:09 -0800
Subject: [PATCH 361/966] Make with_c_api more robust and enable C API in most
 of saved_model_test.py.

This change makes the test_util.with_c_api decorator call
reset_default_graph() after enabling or disabling the C API instead of
creating a new Graph. This makes it more robust to tests that call
reset_default_graph(), which requires that the current default graph
isn't nested (which the C API-enabled Graph previously was).

In addition, enables the C API with saved_model_test.py (which
required the above change). A few tests still need further changes,
which I'll post in subsequent patches.

PiperOrigin-RevId: 183739148
---
 tensorflow/python/framework/test_util.py      | 13 ++--
 .../python/saved_model/saved_model_test.py    | 61 +++++++++++--------
 2 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 4a8aa2e258..70f6a2acba 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -320,12 +320,17 @@ def _use_c_api_wrapper(fn, use_c_api, *args, **kwargs):
   prev_value = ops._USE_C_API
   ops._USE_C_API = use_c_api
   try:
-    with ops.Graph().as_default():
-      fn(*args, **kwargs)
+    # Reset the default graph so it has the C API enabled. We call
+    # reset_default_graph() instead of creating a new default Graph context to
+    # make this robust to tests that call reset_default_graph(), which requires
+    # that the current default graph isn't nested.
+    ops.reset_default_graph()
+    fn(*args, **kwargs)
   finally:
     ops._USE_C_API = prev_value
-
-
+    # Make sure default graph reflects prev_value in case next test doesn't call
+    # reset_default_graph().
+    ops.reset_default_graph()
 # pylint: disable=protected-access
 
 
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 1ea619ff55..f92247d52e 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -54,8 +54,14 @@ def tearDownModule():
   file_io.delete_recursively(test.get_temp_dir())
 
 
+@test_util.with_c_api
 class SavedModelTest(test.TestCase):
 
+  def _get_export_dir(self, label):
+    if ops._USE_C_API:
+      label += "_c_api"
+    return os.path.join(test.get_temp_dir(), label)
+
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
     v = variables.Variable(variable_value, name=variable_name)
     sess.run(variables.global_variables_initializer())
@@ -123,8 +129,7 @@ class SavedModelTest(test.TestCase):
     self.assertFalse(loader.maybe_saved_model_directory(base_path))
 
   def testBadSavedModelFileFormat(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_bad_saved_model_file_format")
+    export_dir = self._get_export_dir("test_bad_saved_model_file_format")
     # Attempt to load a SavedModel from an export directory that does not exist.
     with self.test_session(graph=ops.Graph()) as sess:
       with self.assertRaisesRegexp(IOError,
@@ -157,8 +162,7 @@ class SavedModelTest(test.TestCase):
         loader.load(sess, ["foo"], export_dir)
 
   def testVerifySessionGraphUsage(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_verify_session_graph_usage")
+    export_dir = self._get_export_dir("test_verify_session_graph_usage")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -178,7 +182,7 @@ class SavedModelTest(test.TestCase):
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
   def testSequence(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_sequence")
+    export_dir = self._get_export_dir("test_sequence")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Expect an assertion error since add_meta_graph_and_variables() should be
@@ -195,7 +199,7 @@ class SavedModelTest(test.TestCase):
                         sess, ["baz"])
 
   def testTags(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_tags")
+    export_dir = self._get_export_dir("test_tags")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
@@ -284,7 +288,7 @@ class SavedModelTest(test.TestCase):
                         export_dir)
 
   def testVariables(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_variables")
+    export_dir = self._get_export_dir("test_variables")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with two variables. SavedModel invoked to:
@@ -336,7 +340,7 @@ class SavedModelTest(test.TestCase):
                         export_dir)
 
   def testGraphWithoutVariables(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_graph_has_variables")
+    export_dir = self._get_export_dir("test_graph_has_variables")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with no variables.
@@ -371,7 +375,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(30.0, sess.run(c))
 
   def testNoOverwrite(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_no_overwrite")
+    export_dir = self._get_export_dir("test_no_overwrite")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
@@ -395,7 +399,7 @@ class SavedModelTest(test.TestCase):
                       export_dir)
 
   def testSaveAsText(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_astext")
+    export_dir = self._get_export_dir("test_astext")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
@@ -426,7 +430,7 @@ class SavedModelTest(test.TestCase):
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
   def testCollections(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_collections")
+    export_dir = self._get_export_dir("test_collections")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable added to a collection. SavedModel invoked to:
@@ -476,7 +480,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(len(ops.get_collection("foo_vars")), 0)
 
   def testSignatureDefs(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_signature_defs")
+    export_dir = self._get_export_dir("test_signature_defs")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable and a single entry in the signature def map.
@@ -536,8 +540,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual("foo_new", bar_signature["foo_key"].method_name)
 
   def testSignatureDefValidation(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_signature_def_validation")
+    export_dir = self._get_export_dir("test_signature_def_validation")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     tensor_without_name = meta_graph_pb2.TensorInfo()
@@ -555,7 +558,7 @@ class SavedModelTest(test.TestCase):
     self._validate_outputs_tensor_info(builder, tensor_empty)
 
   def testAssets(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_assets")
+    export_dir = self._get_export_dir("test_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -588,7 +591,7 @@ class SavedModelTest(test.TestCase):
       self.assertFalse(file_io.file_exists(ignored_asset_path))
 
   def testCustomMainOp(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_main_op")
+    export_dir = self._get_export_dir("test_main_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -623,7 +626,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
   def testLegacyInitOp(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_legacy_init_op")
+    export_dir = self._get_export_dir("test_legacy_init_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -657,8 +660,8 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
   def testLegacyInitOpWithNonEmptyCollection(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_legacy_init_op_with_non_empty_collection")
+    export_dir = self._get_export_dir(
+        "test_legacy_init_op_with_non_empty_collection")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -685,7 +688,7 @@ class SavedModelTest(test.TestCase):
             sess, ["foo"], legacy_init_op=legacy_init_op)
 
   def testMultipleAssets(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_multiple_assets")
+    export_dir = self._get_export_dir("test_multiple_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -727,7 +730,7 @@ class SavedModelTest(test.TestCase):
                                       "asset_file_tensor:0")
 
   def testDuplicateAssets(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_duplicate_assets")
+    export_dir = self._get_export_dir("test_duplicate_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -775,7 +778,7 @@ class SavedModelTest(test.TestCase):
                                       "asset_file_tensor:0")
 
   def testOp(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_op")
+    export_dir = self._get_export_dir("test_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with session.Session(
@@ -818,7 +821,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
   def testCustomSaveable(self):
-    export_dir = os.path.join(test.get_temp_dir(), "custom_saveable")
+    export_dir = self._get_export_dir("custom_saveable")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with session.Session(
@@ -847,7 +850,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3.0, v1.values().eval())
 
   def testClearDevices(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_clear_devices")
+    export_dir = self._get_export_dir("test_clear_devices")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Specify a device and save a variable.
@@ -871,7 +874,9 @@ class SavedModelTest(test.TestCase):
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
   def testStripDefaultAttrs(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_strip_default_attrs")
+    if ops._USE_C_API: return  # TODO(skyewm): get this working
+
+    export_dir = self._get_export_dir("test_strip_default_attrs")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Add a graph with two float32 variables and a Complex Op composing them
@@ -941,8 +946,10 @@ class SavedModelTest(test.TestCase):
     self.assertIn("Tout", node_def.attr)
 
   def testStripDefaultAttrsInconsistentConsumerDefaults(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_strip_default_attrs_no_consumer_defaults")
+    if ops._USE_C_API: return  # TODO(skyewm): get this working
+
+    export_dir = self._get_export_dir(
+        "test_strip_default_attrs_no_consumer_defaults")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Add a graph with two float32 variables and a Complex Op composing them
-- 
GitLab


From fd48cf4fed0be0abdb77b56a377836b1be0f7257 Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Mon, 29 Jan 2018 15:41:11 -0800
Subject: [PATCH 362/966] GCS: Add client-side throttle.

This throttle is loosely based on a leaky bucket configuration that captures
both the cost of the requests as well as the bandwidth consumed into a single
token value.

PiperOrigin-RevId: 183740223
---
 tensorflow/core/platform/cloud/BUILD          |  25 +++
 .../core/platform/cloud/gcs_file_system.cc    |  99 +++++++----
 .../core/platform/cloud/gcs_file_system.h     |   2 +
 .../core/platform/cloud/gcs_throttle.cc       |  62 +++++++
 tensorflow/core/platform/cloud/gcs_throttle.h | 156 ++++++++++++++++++
 .../core/platform/cloud/gcs_throttle_test.cc  | 101 ++++++++++++
 6 files changed, 415 insertions(+), 30 deletions(-)
 create mode 100644 tensorflow/core/platform/cloud/gcs_throttle.cc
 create mode 100644 tensorflow/core/platform/cloud/gcs_throttle.h
 create mode 100644 tensorflow/core/platform/cloud/gcs_throttle_test.cc

diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 07aecf8483..9ba25dea4f 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -57,6 +57,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gcs_throttle",
+    srcs = ["gcs_throttle.cc"],
+    hdrs = ["gcs_throttle.h"],
+    copts = tf_copts(),
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "gcs_file_system",
     srcs = ["gcs_file_system.cc"],
@@ -69,6 +80,7 @@ cc_library(
         ":expiring_lru_cache",
         ":file_block_cache",
         ":gcs_dns_cache",
+        ":gcs_throttle",
         ":google_auth_provider",
         ":http_request",
         ":retrying_file_system",
@@ -271,6 +283,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "gcs_throttle_test",
+    size = "small",
+    srcs = ["gcs_throttle_test.cc"],
+    linkopts = if_windows(["-DEFAULTLIB:ws2_32.lib"]),
+    deps = [
+        ":gcs_throttle",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cc_test(
     name = "curl_http_request_test",
     size = "small",
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 91d381bd6f..01ca0d76ba 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -116,6 +116,15 @@ constexpr char kWriteRequestTimeout[] = "GCS_WRITE_REQUEST_TIMEOUT_SECS";
 // The environment variable to configure an additional header to send with
 // all requests to GCS (format HEADERNAME:HEADERCONTENT)
 constexpr char kAdditionalRequestHeader[] = "GCS_ADDITIONAL_REQUEST_HEADER";
+// The environment variable to configure the throttle (format: <int64>)
+constexpr char kThrottleRate[] = "GCS_THROTTLE_TOKEN_RATE";
+// The environment variable to configure the token bucket size (format: <int64>)
+constexpr char kThrottleBucket[] = "GCS_THROTTLE_BUCKET_SIZE";
+// The environment variable that controls the number of tokens per request.
+// (format: <int64>)
+constexpr char kTokensPerRequest[] = "GCS_TOKENS_PER_REQUEST";
+// The environment variable to configure the initial tokens (format: <int64>)
+constexpr char kInitialTokens[] = "GCS_INITIAL_TOKENS";
 
 // TODO: DO NOT use a hardcoded path
 Status GetTmpFilename(string* filename) {
@@ -721,6 +730,26 @@ GcsFileSystem::GcsFileSystem()
   if (GetEnvVar(kWriteRequestTimeout, strings::safe_strtou32, &timeout_value)) {
     timeouts_.write = timeout_value;
   }
+
+  int64 token_value;
+  if (GetEnvVar(kThrottleRate, strings::safe_strto64, &token_value)) {
+    GcsThrottleConfig config;
+    config.enabled = true;
+    config.token_rate = token_value;
+
+    if (GetEnvVar(kThrottleBucket, strings::safe_strto64, &token_value)) {
+      config.bucket_size = token_value;
+    }
+
+    if (GetEnvVar(kTokensPerRequest, strings::safe_strto64, &token_value)) {
+      config.tokens_per_request = token_value;
+    }
+
+    if (GetEnvVar(kInitialTokens, strings::safe_strto64, &token_value)) {
+      config.initial_tokens = token_value;
+    }
+    throttle_.SetConfig(config);
+  }
 }
 
 GcsFileSystem::GcsFileSystem(
@@ -774,7 +803,9 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
   TF_RETURN_IF_ERROR(ParseGcsPath(filename, false, &bucket, &object));
 
   std::unique_ptr<HttpRequest> request;
-  TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request),
+                                  "when reading gs://", bucket, "/", object);
+
   request->SetUri(strings::StrCat("https://", kStorageHost, "/", bucket, "/",
                                   request->EscapeString(object)));
   request->SetRange(offset, offset + n - 1);
@@ -789,6 +820,8 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
   VLOG(1) << "Successful read of gs://" << bucket << "/" << object << " @ "
           << offset << " of size: " << bytes_read;
 
+  throttle_.RecordResponse(bytes_read);
+
   if (bytes_read < block_size()) {
     // Check stat cache to see if we encountered an interrupted read.
     FileStatistics stat;
@@ -926,41 +959,43 @@ Status GcsFileSystem::StatForObject(const string& fname, const string& bucket,
         "'object' must be a non-empty string. (File: %s)", fname.c_str()));
   }
 
-  StatCache::ComputeFunc compute_func =
-      [this, &bucket, &object](const string& fname, FileStatistics* stat) {
-        std::vector<char> output_buffer;
-        std::unique_ptr<HttpRequest> request;
-        TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
-        request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/",
-                                        request->EscapeString(object),
-                                        "?fields=size%2Cupdated"));
-        request->SetResultBuffer(&output_buffer);
-        request->SetTimeouts(timeouts_.connect, timeouts_.idle,
-                             timeouts_.metadata);
+  StatCache::ComputeFunc compute_func = [this, &bucket, &object](
+                                            const string& fname,
+                                            FileStatistics* stat) {
+    std::vector<char> output_buffer;
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request),
+                                    " when reading metadata of gs://", bucket,
+                                    "/", object);
+
+    request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/",
+                                    request->EscapeString(object),
+                                    "?fields=size%2Cupdated"));
+    request->SetResultBuffer(&output_buffer);
+    request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.metadata);
 
-        TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(),
-                                        " when reading metadata of gs://",
-                                        bucket, "/", object);
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(),
+                                    " when reading metadata of gs://", bucket,
+                                    "/", object);
 
-        Json::Value root;
-        TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root));
+    Json::Value root;
+    TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root));
 
-        // Parse file size.
-        TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &stat->length));
+    // Parse file size.
+    TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &stat->length));
 
-        // Parse file modification time.
-        string updated;
-        TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
-        TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
+    // Parse file modification time.
+    string updated;
+    TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
+    TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
 
-        VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- "
-                << " length: " << stat->length
-                << "; mtime_nsec: " << stat->mtime_nsec
-                << "; updated: " << updated;
+    VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- "
+            << " length: " << stat->length
+            << "; mtime_nsec: " << stat->mtime_nsec << "; updated: " << updated;
 
-        stat->is_directory = false;
-        return Status::OK();
-      };
+    stat->is_directory = false;
+    return Status::OK();
+  };
 
   TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(fname, stat, compute_func));
   if (stat->is_directory) {
@@ -1438,6 +1473,10 @@ Status GcsFileSystem::CreateHttpRequest(std::unique_ptr<HttpRequest>* request) {
                            additional_header_->second);
   }
 
+  if (!throttle_.AdmitRequest()) {
+    return errors::Unavailable("Request throttled");
+  }
+
   *request = std::move(new_request);
   return Status::OK();
 }
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index 2eae39608e..e8edde8a44 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/expiring_lru_cache.h"
 #include "tensorflow/core/platform/cloud/file_block_cache.h"
 #include "tensorflow/core/platform/cloud/gcs_dns_cache.h"
+#include "tensorflow/core/platform/cloud/gcs_throttle.h"
 #include "tensorflow/core/platform/cloud/http_request.h"
 #include "tensorflow/core/platform/cloud/retrying_file_system.h"
 #include "tensorflow/core/platform/file_system.h"
@@ -194,6 +195,7 @@ class GcsFileSystem : public FileSystem {
   std::unique_ptr<HttpRequest::Factory> http_request_factory_;
   std::unique_ptr<FileBlockCache> file_block_cache_;
   std::unique_ptr<GcsDnsCache> dns_cache_;
+  GcsThrottle throttle_;
 
   using StatCache = ExpiringLRUCache<FileStatistics>;
   std::unique_ptr<StatCache> stat_cache_;
diff --git a/tensorflow/core/platform/cloud/gcs_throttle.cc b/tensorflow/core/platform/cloud/gcs_throttle.cc
new file mode 100644
index 0000000000..eb5f8958a3
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_throttle.cc
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/cloud/gcs_throttle.h"
+
+#include <algorithm>
+
+namespace tensorflow {
+
+GcsThrottle::GcsThrottle(EnvTime* env_time)
+    : last_updated_secs_(env_time->NowSeconds()),
+      available_tokens_(0),
+      env_time_(env_time) {}
+
+bool GcsThrottle::AdmitRequest() {
+  mutex_lock l(mu_);
+  if (!config_.enabled) return true;
+  UpdateState();
+  if (available_tokens_ < config_.tokens_per_request) {
+    return false;
+  }
+  available_tokens_ -= config_.tokens_per_request;
+  return true;
+}
+
+void GcsThrottle::RecordResponse(size_t num_bytes) {
+  mutex_lock l(mu_);
+  if (!config_.enabled) return;
+  UpdateState();
+  available_tokens_ -= request_bytes_to_tokens(num_bytes);
+}
+
+void GcsThrottle::SetConfig(GcsThrottleConfig config) {
+  mutex_lock l(mu_);
+  config_ = config;
+  available_tokens_ = config.initial_tokens;
+  last_updated_secs_ = env_time_->NowSeconds();
+}
+
+void GcsThrottle::UpdateState() {
+  // TODO(b/72643279): Switch to a monotonic clock.
+  int64 now = env_time_->NowSeconds();
+  uint64 delta_secs =
+      std::max(0LL, now - static_cast<int64>(last_updated_secs_));
+  available_tokens_ += delta_secs * config_.token_rate;
+  available_tokens_ = std::min(available_tokens_, config_.bucket_size);
+  last_updated_secs_ = now;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_throttle.h b/tensorflow/core/platform/cloud/gcs_throttle.h
new file mode 100644
index 0000000000..8e46fca6ca
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_throttle.h
@@ -0,0 +1,156 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_
+
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+/**
+ * GcsThrottleConfig is used to configure the GcsThrottle.
+ */
+struct GcsThrottleConfig {
+  /**
+   * enabled is true if GcsThrottle should throttle requests, false otherwise.
+   */
+  bool enabled = false;
+
+  /**
+   * token_rate is the number of tokens accrued every second that can be used
+   * for making requests to the GCS service.
+   */
+  int64 token_rate = 100000;  // Approximately 800 MBits/second bandwidth-only.
+
+  /**
+   * bucket_size is the maximum number of available tokens the GcsThrottle can
+   * accrue.
+   */
+  int64 bucket_size = 10000000;  // 10 million tokens total
+
+  /**
+   * tokens_per_request determines the number of tokens consumed for every
+   * request.
+   *
+   * Note: tokens are also consumed in proportion to the response size.
+   */
+  int64 tokens_per_request = 100;
+
+  /**
+   * initial_tokens determines how many tokens should be available immediately
+   * after the GcsThrottle is constructed.
+   */
+  int64 initial_tokens = 0;
+};
+
+/**
+ * GcsThrottle is used to ensure fair use of the available GCS capacity.
+ *
+ * GcsThrottle operates around a concept of tokens. Tokens are consumed when
+ * making requests to the GCS service. Tokens are consumed both based on the
+ * number of requests made, as well as the bandwidth consumed (response sizes).
+ *
+ * GcsThrottle is thread safe and can be used from multiple threads.
+ */
+class GcsThrottle {
+ public:
+  /**
+   * Constructs a GcsThrottle.
+   */
+  explicit GcsThrottle(EnvTime* env_time = EnvTime::Default());
+
+  /**
+   * AdmitRequest updates the GcsThrottle to record a request will be made.
+   *
+   * AdmitRequest should be called before any request is made. AdmitRequest
+   * returns false if the request should be denied. If AdmitRequest
+   * returns false, no tokens are consumed. If true is returned, the configured
+   * number of tokens are consumed.
+   */
+  bool AdmitRequest();
+
+  /**
+   * RecordResponse updates the GcsThrottle to record a request has been made.
+   *
+   * RecordResponse should be called after the response has been received.
+   * RecordResponse will update the internal state based on the number of bytes
+   * in the response.
+   *
+   * Note: we split up the request and the response in this fashion in order to
+   * avoid penalizing consumers who are using large readahead buffers at higher
+   * layers of the I/O stack.
+   */
+  void RecordResponse(size_t num_bytes);
+
+  /**
+   * SetConfig sets the configuration for GcsThrottle and re-initializes state.
+   *
+   * After calling this, the token pool will be config.initial_tokens.
+   */
+  void SetConfig(GcsThrottleConfig config);
+
+  /**
+   * available_tokens gives a snapshot of how many tokens are available.
+   *
+   * The returned value should not be used to make admission decisions. The
+   * purpose of this function is to make available to monitoring or other
+   * instrumentation the number of available tokens in the pool.
+   */
+  inline int64 available_tokens() {
+    mutex_lock l(mu_);
+    if (!config_.enabled) return 0;
+    UpdateState();
+    return available_tokens_;
+  }
+
+ private:
+  /**
+   * UpdateState updates the available_tokens_ and last_updated_secs_ variables.
+   *
+   * UpdateState should be called in order to mark the passage of time, and
+   * therefore add tokens to the availble_tokens_ pool.
+   */
+  void UpdateState() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  inline uint64 request_bytes_to_tokens(size_t num_bytes) {
+    return num_bytes >> 8;
+  }
+
+  mutex mu_;
+
+  /**
+   * last_updated_secs_ records the number of seconds since the Unix epoch that
+   * the internal state of the GcsThrottle was updated. This is important when
+   * determining the number of tokens to add to the available_tokens_ pool.
+   */
+  uint64 last_updated_secs_ GUARDED_BY(mu_) = 0;
+
+  /**
+   * available_tokens_ records how many tokens are available to be consumed.
+   *
+   * Note: it is possible for available_tokens_ to become negative. If a
+   * response comes back that consumes more than the available tokens, the count
+   * will go negative, and block future requests until we have available tokens.
+   */
+  int64 available_tokens_ GUARDED_BY(mu_) = 0;
+
+  EnvTime* const env_time_;
+  GcsThrottleConfig config_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_
diff --git a/tensorflow/core/platform/cloud/gcs_throttle_test.cc b/tensorflow/core/platform/cloud/gcs_throttle_test.cc
new file mode 100644
index 0000000000..a1e8167c27
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_throttle_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/cloud/gcs_throttle.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+namespace {
+
+class TestTime : public EnvTime {
+ public:
+  uint64 NowMicros() override { return now_; }
+
+  void SetTime(uint64 now_micros) { now_ = now_micros; }
+
+  void AdvanceSeconds(int64 secs) { now_ += secs * 1000000L; }
+
+ private:
+  uint64 now_ = 1234567890000000ULL;
+};
+
+class GcsThrottleTest : public ::testing::Test {
+ protected:
+  GcsThrottleTest() : throttle_(&time_) {
+    config_.enabled = true;
+    throttle_.SetConfig(config_);
+  }
+
+  GcsThrottleConfig config_;
+  TestTime time_;
+  GcsThrottle throttle_;
+};
+
+TEST_F(GcsThrottleTest, ReplenishTokens) {
+  EXPECT_EQ(0, throttle_.available_tokens());
+  time_.AdvanceSeconds(1);
+  EXPECT_EQ(100000, throttle_.available_tokens());
+  time_.AdvanceSeconds(2);
+  EXPECT_EQ(300000, throttle_.available_tokens());
+}
+
+TEST_F(GcsThrottleTest, RejectRequest) {
+  EXPECT_EQ(0, throttle_.available_tokens());
+  time_.AdvanceSeconds(1);
+  EXPECT_TRUE(throttle_.AdmitRequest());
+  EXPECT_EQ(99900, throttle_.available_tokens());
+  for (int i = 1; i < 1000; i++) {
+    EXPECT_TRUE(throttle_.AdmitRequest());
+  }
+  EXPECT_FALSE(throttle_.AdmitRequest());
+}
+
+TEST_F(GcsThrottleTest, MarkResponses) {
+  time_.AdvanceSeconds(1);
+  EXPECT_TRUE(throttle_.AdmitRequest());
+  throttle_.RecordResponse(32000000);  // 32 MB response
+  EXPECT_EQ(-25100, throttle_.available_tokens());
+  EXPECT_FALSE(throttle_.AdmitRequest());
+  time_.AdvanceSeconds(1);
+  EXPECT_TRUE(throttle_.AdmitRequest())
+      << "Available tokens: " << throttle_.available_tokens();
+}
+
+TEST_F(GcsThrottleTest, Skippingtime_) {
+  EXPECT_EQ(0, throttle_.available_tokens());
+  time_.AdvanceSeconds(90);
+  EXPECT_EQ(9000000, throttle_.available_tokens());
+}
+
+TEST_F(GcsThrottleTest, BucketLimit) {
+  time_.AdvanceSeconds(120);
+  EXPECT_EQ(10000000, throttle_.available_tokens());
+}
+
+TEST_F(GcsThrottleTest, ReverseTime) {
+  time_.AdvanceSeconds(1);
+  EXPECT_EQ(100000, throttle_.available_tokens());
+  time_.AdvanceSeconds(-3600);
+  EXPECT_EQ(100000, throttle_.available_tokens());
+  time_.AdvanceSeconds(1);
+  EXPECT_EQ(200000, throttle_.available_tokens());
+}
+
+}  // namespace
+
+}  // namespace tensorflow
-- 
GitLab


From cc6d79f045a3c00bd1579523bda23aecead6cf71 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Mon, 29 Jan 2018 15:53:07 -0800
Subject: [PATCH 363/966] Resolve merge conflict

---
 tensorflow/contrib/tensorrt/log/trt_logger.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index 883e0b8dcd..e4ff526172 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -27,10 +27,8 @@ namespace tensorrt {
 void Logger::log(Severity severity, const char* msg) {
   // Suppress info-level messages
   switch (severity) {
-    case Severity::kINFO: {  // mark TRT info messages as debug!
-      VLOG(-1) << msg;
     case Severity::kINFO: {  // Mark TRT info messages as debug!
-      LOG(DEBUG) << msg;
+      VLOG(-1) << msg;
       break;
     }
     case Severity::kWARNING: {
-- 
GitLab


From 4ab2d8531c461169cd6a33bc0fef1129b419e9df Mon Sep 17 00:00:00 2001
From: Jean Flaherty <kobejean@me.com>
Date: Mon, 29 Jan 2018 17:03:31 -0700
Subject: [PATCH 364/966] Tensor roll op implementation (#14953)

* Half migrated to manip

Half migrated to tf.manip

* Roll op: polymorphism & GPU attempt

* Roll op: Added support for gradients

Added compile script

* Rebase for roll op code

* Roll op: Migrated to manip namespace

* Roll op: Supports CPU thread pooling

fix namespace error

* Remove roll from user_ops

* Roll op: Optimization

* Roll op: Pylint fix

* Roll op: Updated documentation

* Roll op: Two versions for CPU thread pooling

* Roll op: Huge CPU speed up

Fixed thread pooling issue that was due to a bad cost_per_unit parameter

Also improved readability

* Roll op: Rough draft of DoRollV2

DoRollV2 copies memory in groups instead of element by element.
Not thoroughly tested yet.

Polished DoRollV2 algorithm

* Roll op: Restrict tensor size for GPU implementation

* Roll op: Fixed clang-format and missing include

* Roll op: Minor change

* Roll op GPU bug fix

Roll op GPU bug fix

GPU bug fix

Roll op GPU bug fix

Roll op GPU fix

Roll GPU test BUILD update

* Roll op: Remove GPU code

Fully remove roll op GPU code

Remove compile_cpu.sh

* Roll op: Fixes problems with array_ops_test.py and a size 1 dimension bug

* Roll op: Migrated to manip

Migrated to tf.manip

Roll op registered

Roll op uses InlinedVector

Small improvements

* Roll op: Revert array op changes

* Roll op: Api def fix

* Roll op: review changes

* Roll op: API review changes

Roll op: Docstring fix

* Roll op: Review changes round 1

* Roll op: resolve conflicts

* Roll op: Resolve conflicts

* Roll op: clang-tidy

* Roll op: Review round 2 changes

Roll op: fixed BUILD file

Roll op: api docs update

* Roll op: failure fixes 1

- updates goldens and fixes api compatibility issue
- fixes python op test issue for windows
- fixes makefile issues

* Roll op: Windows CMake failure fix

Windows CMake checks were failing because numpy was on an older version that did not support np.roll on multiple shifts that was use to check the correctness of tf.manip.roll.  manip_ops_test.py now checks for numpy version 1.12.0 before testing multiple shifts, otherwise it'll just test single shift roll.

* Roll op: pylint changes
---
 tensorflow/cc/BUILD                           |   1 +
 tensorflow/contrib/cmake/tf_core_ops.cmake    |   1 +
 tensorflow/contrib/cmake/tf_python.cmake      |   1 +
 tensorflow/contrib/makefile/tf_op_files.txt   |   2 +
 tensorflow/core/BUILD                         |   3 +
 .../core/api_def/base_api/api_def_Roll.pbtxt  |  52 ++
 tensorflow/core/graph/testlib.cc              |  12 +-
 tensorflow/core/graph/testlib.h               |   4 +
 tensorflow/core/kernels/BUILD                 |  39 ++
 tensorflow/core/kernels/roll_op.cc            | 334 ++++++++++++
 tensorflow/core/kernels/roll_op_test.cc       | 484 ++++++++++++++++++
 tensorflow/core/ops/manip_ops.cc              |  33 ++
 tensorflow/python/BUILD                       |  36 ++
 tensorflow/python/__init__.py                 |   2 +
 tensorflow/python/kernel_tests/BUILD          |  13 +
 .../python/kernel_tests/manip_ops_test.py     | 137 +++++
 tensorflow/python/ops/gradients_impl.py       |   1 +
 tensorflow/python/ops/manip_grad.py           |  32 ++
 tensorflow/python/ops/manip_ops.py            |  36 ++
 tensorflow/python/ops/standard_ops.py         |   4 +
 .../tools/api/golden/tensorflow.manip.pbtxt   |   7 +
 tensorflow/tools/api/golden/tensorflow.pbtxt  |   4 +
 22 files changed, 1237 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
 create mode 100644 tensorflow/core/kernels/roll_op.cc
 create mode 100644 tensorflow/core/kernels/roll_op_test.cc
 create mode 100644 tensorflow/core/ops/manip_ops.cc
 create mode 100644 tensorflow/python/kernel_tests/manip_ops_test.py
 create mode 100644 tensorflow/python/ops/manip_grad.py
 create mode 100644 tensorflow/python/ops/manip_ops.py
 create mode 100644 tensorflow/tools/api/golden/tensorflow.manip.pbtxt

diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index c9ade5fb83..9060c19e9d 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -433,6 +433,7 @@ tf_gen_op_wrappers_cc(
         "linalg_ops",
         "logging_ops",
         "lookup_ops",
+        "manip_ops",
         "math_ops",
         "nn_ops",
         "no_op",
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 138993db35..c42bc35ce7 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -30,6 +30,7 @@ set(tf_op_lib_names
 		"list_ops"
     "lookup_ops"
     "logging_ops"
+    "manip_ops"
     "math_ops"
     "nn_ops"
     "no_op"
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 8862390d2b..b7c816c24f 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -335,6 +335,7 @@ GENERATE_PYTHON_OP_LIB("list_ops")
 GENERATE_PYTHON_OP_LIB("logging_ops")
 GENERATE_PYTHON_OP_LIB("lookup_ops")
 GENERATE_PYTHON_OP_LIB("nn_ops")
+GENERATE_PYTHON_OP_LIB("manip_ops")
 GENERATE_PYTHON_OP_LIB("parsing_ops")
 GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 5f27566398..9a1ab50317 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -91,6 +91,7 @@ tensorflow/core/kernels/reduction_ops_max.cc
 tensorflow/core/kernels/reduction_ops_common.cc
 tensorflow/core/kernels/reduction_ops_any.cc
 tensorflow/core/kernels/reduction_ops_all.cc
+tensorflow/core/kernels/roll_op.cc
 tensorflow/core/kernels/queue_ops.cc
 tensorflow/core/kernels/queue_base.cc
 tensorflow/core/kernels/pooling_ops_common.cc
@@ -270,6 +271,7 @@ tensorflow/core/ops/parsing_ops.cc
 tensorflow/core/ops/no_op.cc
 tensorflow/core/ops/nn_ops.cc
 tensorflow/core/ops/nn_grad.cc
+tensorflow/core/ops/manip_ops.cc
 tensorflow/core/ops/math_ops.cc
 tensorflow/core/ops/math_grad.cc
 tensorflow/core/ops/logging_ops.cc
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 3b4a10eedb..90c2823ea4 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -610,6 +610,7 @@ tf_gen_op_libs(
         "list_ops",
         "lookup_ops",
         "logging_ops",
+        "manip_ops",
         "math_ops",
         "nn_ops",
         "no_op",
@@ -692,6 +693,7 @@ cc_library(
         ":list_ops_op_lib",
         ":logging_ops_op_lib",
         ":lookup_ops_op_lib",
+        ":manip_ops_op_lib",
         ":math_ops_op_lib",
         ":nn_ops_op_lib",
         ":no_op_op_lib",
@@ -829,6 +831,7 @@ cc_library(
         "//tensorflow/core/kernels:list_kernels",
         "//tensorflow/core/kernels:lookup",
         "//tensorflow/core/kernels:logging",
+        "//tensorflow/core/kernels:manip",
         "//tensorflow/core/kernels:math",
         "//tensorflow/core/kernels:multinomial_op",
         "//tensorflow/core/kernels:nn",
diff --git a/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
new file mode 100644
index 0000000000..b308ad1f9d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "Roll"
+  in_arg {
+    name: "shift"
+    description: <<END
+Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+elements are shifted positively (towards larger indices) along the dimension
+specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+direction.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+`shift[i]` should occur. If the same axis is referenced more than once, the
+total shift for that axis will be the sum of all the shifts that belong to that
+axis.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has the same shape and size as the input. The elements are shifted
+positively (towards larger indices) by the offsets of `shift` along the
+dimensions of `axis`.
+END
+  }
+  summary: "Rolls the elements of a tensor along an axis."
+  description: <<END
+The elements are shifted positively (towards larger indices) by the offset of
+`shift` along the dimension of `axis`. Negative `shift` values will shift
+elements in the opposite direction. Elements that roll passed the last position
+will wrap around to the first and vice versa. Multiple shifts along multiple
+axes may be specified.
+
+For example:
+
+```
+# 't' is [0, 1, 2, 3, 4]
+roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+
+# shifting along multiple dimensions
+# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+
+# shifting along the same axis multiple times
+# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+```
+END
+}
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 172471e34b..0d88d1ff72 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -40,7 +40,7 @@ REGISTER_KERNEL_BUILDER(
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
     Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"), HostConstantOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register the HostConst Op
 // Returns a constant tensor on the host.  Useful for writing C++ tests
@@ -273,6 +273,16 @@ Node* Reverse(Graph* g, Node* tensor, Node* axis) {
   return Binary(g, "ReverseV2", tensor, axis);
 }
 
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Roll", g->op_registry())
+                  .Input(input)
+                  .Input(shift)
+                  .Input(axis)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
 Node* Error(Graph* g, Node* input, const string& errmsg) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error")
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index 06597778bb..eb9038d619 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -117,6 +117,10 @@ Node* RandomGamma(Graph* g, Node* shape, Node* alpha);
 // Output dtype determined by lam.
 Node* RandomPoisson(Graph* g, Node* shape, Node* lam);
 
+// Rolls tensor by an offset of <shift> along the corresponding
+// <axis> dimensions.
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis);
+
 // Generates random parameters from the truncated standard normal distribution
 // of the nput shape
 Node* TruncatedNormal(Graph* g, Node* input, DataType dtype);
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index db309fc9da..e7192ec42f 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2589,6 +2589,45 @@ tf_cc_tests(
     ],
 )
 
+cc_library(
+    name = "manip",
+    deps = [
+        ":roll_op",
+    ],
+)
+
+MANIP_DEPS = [
+    "//tensorflow/core:framework",
+    "//tensorflow/core:lib",
+    "//tensorflow/core:manip_ops_op_lib",
+    "//third_party/eigen3",
+]
+
+tf_kernel_library(
+    name = "roll_op",
+    prefix = "roll_op",
+    deps = MANIP_DEPS,
+)
+
+tf_cc_test(
+    name = "roll_op_test",
+    size = "small",
+    srcs = ["roll_op_test.cc"],
+    deps = [
+        ":ops_testutil",
+        ":ops_util",
+        ":roll_op",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 MATH_DEPS = [
     ":bounds_check",
     ":fill_functor",
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
new file mode 100644
index 0000000000..bcbdbee058
--- /dev/null
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -0,0 +1,334 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/register_types_traits.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+#define EIGEN_USE_THREADS
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+// dim_size - the size of each dimension
+// dim_range - the number of indices over in the flattened tensor
+//    you need to skip in order to make it over from one side of a dimension
+//    to the other. Used to make the shifts wrap around after a threshold.
+// threshold - the index for each dimension that the roll starts to wrap
+//    back to the front
+template <typename T>
+void DoRoll(OpKernelContext* context, const int64 num_elements,
+            const int num_dims, const gtl::ArraySlice<int>& dim_size,
+            const T* input, T* output, const gtl::ArraySlice<int>& threshold,
+            const gtl::ArraySlice<int64>& dim_range) {
+  auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range](
+                  int64 start, int64 end) {
+    // array of indices for each dimension
+    gtl::InlinedVector<int, 4> indices(num_dims);
+    int offset = 0;  // the shift along the flattened tensor for current element
+    // initialize indices and offset
+    for (int i = 0; i < num_dims; i++) {
+      // stride is the number of indices over in the flattened tensor
+      // you need to skip in order to make it over to an adjacent element
+      // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1)
+      const int64 stride = dim_range[i] / dim_size[i];
+      const int shift = dim_size[i] - threshold[i];
+      const int indx = (start / stride) % dim_size[i];
+      indices[i] = indx;
+      // calculate dimension index after the shift
+      const int shifted_indx = (indx + shift) % dim_size[i];
+      offset += (shifted_indx - indx) * stride;
+    }
+
+    for (int64 i = start; i < end; i++) {
+      output[i + offset] = input[i];
+      // create next combination of indices
+      // while at it adjust offset if needed
+      for (int j = num_dims - 1; j >= 0; j--) {
+        const int indx = (indices[j] + 1) % dim_size[j];
+        indices[j] = indx;
+        if (indx != 0) {
+          if (indx == threshold[j]) {  // we've reached the threshold
+            // dim_range[j] = threshold[j] + shift[j]
+            // offset = shift[j] + ... other offsets
+            // offset - dim_range[j] = -threshold[j] + ... other offsets
+            // thus we undo our previous offset as well as add a new offset of
+            // -threshold[j] in one operation
+            offset -= dim_range[j];  // now wraps around
+          }
+          break;                         // indx != 0 don't need to carry
+        } else if (threshold[j] != 0) {  // if threshold is 0 shift is 0
+          offset += dim_range[j];        // indx became 0 so reverse wrap around
+        }
+      }
+    }
+  };
+  // Shard
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  // 15 - expiramentally determined with float and bool types
+  const int cost_per_element = 15 * sizeof(T);  // rough esitmate
+  Shard(worker_threads->num_threads, worker_threads->workers, num_elements,
+        cost_per_element, std::move(work));
+}
+
+// dim_size - the size of each dimension
+// dim_range - the number of indices over in the flattened tensor
+//    you need to skip in order to make it over from one side of a dimension
+//    to the other. Used to make the shifts wrap around after a threshold.
+// threshold - the index for each dimension that the roll starts to wrap
+//    back to the front
+// isd - inner shift dimension
+template <typename T>
+// Use memcpy to copy memory in groups when the data type supports memcpy
+void DoRollWithMemcpy(OpKernelContext* context, const int64 num_elements,
+                      const int num_dims, const gtl::ArraySlice<int>& dim_size,
+                      const T* input, T* output,
+                      const gtl::ArraySlice<int>& threshold,
+                      const gtl::ArraySlice<int64>& dim_range,
+                      const int64 isd) {
+  auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range, isd](
+                  int64 start, int64 end) {
+    // the number of indices over in the flattened tensor you need to skip in
+    // order to make it over from one side of the isd to the other
+    const int64 isd_range = std::max<int>(dim_range[isd], 1);
+    // the distance along the flattend tensor to the next element in the isd
+    const int64 isd_stride = isd_range / std::max<int>(dim_size[isd], 1);
+
+    // start and end represent the i-th group currently so we will convert
+    // them into numbers representing the i-th elements.
+    // there are 2 groups per isd one for all elements before threshold[isd]
+    // and another for all elements after threshold[isd].
+    const int64 start_remainder = (start % 2) * threshold[isd] * isd_stride;
+    const int64 end_remainder = (end % 2) * threshold[isd] * isd_stride;
+    start = (start / 2) * isd_range + start_remainder;
+    end = (end / 2) * isd_range + end_remainder;
+
+    const T* in_ptr = &input[0];
+    T* out_ptr = &output[0];
+    in_ptr += start;
+    out_ptr += start;
+
+    // array of indices for each dimension
+    // indicies = [i, j, k, l, m, n]
+    gtl::InlinedVector<int, 4> indicies(num_dims);
+    // the offset needed to make all inner non-shifting dimensions become 0
+    int64 remainder_offset = 0;
+    // initialize indicies
+    for (int i = 0; i < num_dims; i++) {
+      // stride is the number of indices over in the flattened tensor
+      // you need to skip in order to make it over to an adjacent element
+      // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1)
+      const int64 stride = dim_range[i] / dim_size[i];
+      const int shift = dim_size[i] - threshold[i];
+      const int indx = (start / stride) % dim_size[i];
+      indicies[i] = indx;
+      // calculate dimension index after the shift
+      int out_indx = (indx + shift) % dim_size[i];
+      if (i > isd) {
+        // trailing zeroes for indices after the inner shifted dimension
+        out_indx = 0;
+        remainder_offset += (out_indx - indx) * stride;
+      }
+      out_ptr += (out_indx - indx) * stride;
+    }
+    // set trailing zeroes for indices after the inner shifted dimension
+    for (int i = num_dims - 1; i > isd; i--) indicies[i] = 0;
+
+    // the number of indices in the isd dimension the next group will skip
+    // to make it to the next threshold or end point
+    int isd_indx_skip = 0;
+    // the size of the next group
+    int64 group_size = 0;
+    // initialize isd_indx_skip and group_size
+    if (indicies[isd] < threshold[isd]) {
+      isd_indx_skip = threshold[isd] - indicies[isd];
+      group_size = isd_indx_skip * isd_stride + remainder_offset;
+    } else {
+      isd_indx_skip = dim_size[isd] - indicies[isd];
+      group_size = isd_indx_skip * isd_stride + remainder_offset;
+    }
+
+    int64 i = start;
+    while (i < end) {
+      // copy group of elements
+      memcpy(out_ptr, in_ptr, group_size * sizeof(T));
+
+      // shift i and the pointers over to the next group position
+      i += group_size;
+      out_ptr += group_size;
+      in_ptr += group_size;
+
+      // produce next combination of indices and adjust the out_ptr position
+      // to fix the offset if necessary
+      // the isd (inner shift dim) should skip to next threshold or endpoint
+      // all dimensions to the left increment by 1 when a digit is carried
+      // all dimensions to the right remain set to 0
+      //            +1 +1 +1 +isd_indx_skip
+      // indicies = [i, j, k, l, 0, 0]
+      //                      ^isd
+      for (int j = isd; j >= 0; j--) {
+        int inc = 1;
+        if (j == isd) inc = isd_indx_skip;
+        const int indx = (indicies[j] + inc) % dim_size[j];
+        indicies[j] = indx;
+        if (indx != 0) {
+          if (indx == threshold[j]) {
+            out_ptr -= dim_range[j];  // now wraps around
+          }
+          break;                         // indx != 0 don't need to carry
+        } else if (threshold[j] != 0) {  // if threshold is 0 shift is 0
+          out_ptr += dim_range[j];       // indx became 0 so reverse wrap around
+        }
+      }
+
+      // set isd_indx_skip and group_size for next iteration
+      if (indicies[isd] < threshold[isd]) {
+        isd_indx_skip = threshold[isd] - indicies[isd];
+        group_size = isd_indx_skip * isd_stride;
+      } else {
+        isd_indx_skip = dim_size[isd] - indicies[isd];
+        group_size = isd_indx_skip * isd_stride;
+      }
+    }
+  };
+  // Shard
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  const int64 ave_group_size = dim_range[isd] / 2;
+  const int total_work = 2 * num_elements / std::max<int>(dim_range[isd], 1);
+  // 25000 - expiramentally determined with float and bool types
+  const int cost_per_group = 25000 * sizeof(T) * ave_group_size;
+  Shard(worker_threads->num_threads, worker_threads->workers, total_work,
+        cost_per_group, std::move(work));
+}
+
+template <typename Device, typename T, typename Tshift, typename Taxis>
+class RollOp : public OpKernel {
+ public:
+  explicit RollOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Grab the input tensor
+    const Tensor& input = context->input(0);
+    const Tensor& shift = context->input(1);
+    const Tensor& axis = context->input(2);
+
+    auto shift_flat = shift.flat<Tshift>();
+    auto axis_flat = axis.flat<Taxis>();
+
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input.shape()),
+                errors::InvalidArgument("input must be 1-D or higher"));
+    OP_REQUIRES(context, shift.shape().dims() <= 1,
+                errors::InvalidArgument(
+                    "shift must be a scalar or a 1-D vector. Found: ",
+                    shift.shape().DebugString()));
+    OP_REQUIRES(context, axis.shape().dims() <= 1,
+                errors::InvalidArgument(
+                    "axis must be a scalar or a 1-D vector. Found: ",
+                    axis.shape().DebugString()));
+    OP_REQUIRES(
+        context, shift.shape() == axis.shape(),
+        errors::InvalidArgument("shift and axis must have the same size"));
+    const int64 num_elements = input.NumElements();
+    const int num_shifts = static_cast<int>(shift_flat.size());
+    const int num_dims = input.dims();
+
+    // if there are any duplicate axes, shift_mod_sum will have the
+    // total modulo sum of shifts for each dimension
+    gtl::InlinedVector<int, 4> shift_mod_sum(num_dims, 0);
+    for (int i = 0; i < num_shifts; i++) {
+      const int axis = axis_flat(i);
+      OP_REQUIRES(context, axis < num_dims,
+                  errors::InvalidArgument("axis ", axis, " is out of range"));
+      const int ds = std::max<int>(static_cast<int>(input.dim_size(axis)), 1);
+      const int sum = shift_mod_sum[axis] + static_cast<int>(shift_flat(i));
+      // modulo that works with negatives: ((x % y) + y) % y
+      shift_mod_sum[axis] = (sum % ds + ds) % ds;
+    }
+    // the size of each dimension
+    gtl::InlinedVector<int, 4> dim_size(num_dims);
+    // threshold[i] is the index that the roll starts to wrap back to the front
+    gtl::InlinedVector<int, 4> threshold(num_dims);
+    // dim_range is the number of indices over in the flattened tensor
+    // you need to skip in order to make it over from one side of a dimension
+    // to the other. Used to make the shifts wrap around after a threshold.
+    gtl::InlinedVector<int64, 4> dim_range(num_dims);
+    int64 dim_size_prod = 1;  // dimension size product
+    // inner shift dimension (inner most shifted dimension)
+    int64 isd = 0;
+    for (int i = num_dims - 1; i >= 0; i--) {
+      if (isd == 0 && shift_mod_sum[i] != 0) isd = i;
+      const int ds = std::max<int>(static_cast<int>(input.dim_size(i)), 1);
+      dim_size[i] = ds;
+      threshold[i] = (ds - shift_mod_sum[i]) % ds;
+      dim_size_prod *= static_cast<int64>(input.dim_size(i));
+      dim_range[i] = dim_size_prod;
+    }
+
+    Tensor* output = NULL;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    auto input_flat = input.flat<T>().data();
+    auto output_flat = output->flat<T>().data();
+
+    if (std::is_same<Device, CPUDevice>::value) {
+      if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+        // V2 copies memory in groups instead of element by element
+        DoRollWithMemcpy<T>(context, num_elements, num_dims, dim_size,
+                            input_flat, output_flat, threshold, dim_range, isd);
+      } else {
+        // incase memcpy does not work for current data type
+        DoRoll<T>(context, num_elements, num_dims, dim_size, input_flat,
+                  output_flat, threshold, dim_range);
+      }
+    }
+  }
+};
+
+// Register the CPU kernels.
+#define REGISTER_CPU(type)                                       \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int32>("Tshift")   \
+                              .TypeConstraint<int32>("Taxis"),   \
+                          RollOp<CPUDevice, type, int32, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("Tshift")   \
+                              .TypeConstraint<int32>("Taxis"),   \
+                          RollOp<CPUDevice, type, int64, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int32>("Tshift")   \
+                              .TypeConstraint<int64>("Taxis"),   \
+                          RollOp<CPUDevice, type, int32, int64>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("Tshift")   \
+                              .TypeConstraint<int64>("Taxis"),   \
+                          RollOp<CPUDevice, type, int64, int64>)
+
+TF_CALL_ALL_TYPES(REGISTER_CPU);
+#undef REGISTER_CPU
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc
new file mode 100644
index 0000000000..90b6f8d0f3
--- /dev/null
+++ b/tensorflow/core/kernels/roll_op_test.cc
@@ -0,0 +1,484 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+class RollOpTest : public OpsTestBase {
+ protected:
+  void MakeOp(DataType data_type, DataType index_type) {
+    TF_ASSERT_OK(NodeDefBuilder("myop", "Roll")
+                     .Input(FakeInput(data_type))
+                     .Input(FakeInput(index_type))
+                     .Input(FakeInput(index_type))
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+  }
+};
+
+TEST_F(RollOpTest, ScalarIndices) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5}));
+  test::FillValues<float>(&expected, {2, 3, 4, 0, 1});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ScalarIndices_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5}), {"a", "b", "c", "d", "e"});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5}));
+  test::FillValues<string>(&expected, {"c", "d", "e", "a", "b"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ScalarIndices_Complex) {
+  MakeOp(DT_COMPLEX64, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<std::complex<float>>(
+      TensorShape({5}), {std::complex<float>(0, 10), std::complex<float>(1, 11),
+                         std::complex<float>(2, 12), std::complex<float>(3, 13),
+                         std::complex<float>(4, 14)});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_COMPLEX64, TensorShape({5}));
+  test::FillValues<std::complex<float>>(
+      &expected, {std::complex<float>(2, 12), std::complex<float>(3, 13),
+                  std::complex<float>(4, 14), std::complex<float>(0, 10),
+                  std::complex<float>(1, 11)});
+  test::ExpectTensorEqual<std::complex<float>>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({3, 5}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int32>(TensorShape({2}), {2, -1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5}));
+  test::FillValues<float>(&expected,
+                          {6, 7, 8, 9, 5, 11, 12, 13, 14, 10, 1, 2, 3, 4, 0});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({3, 5}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int32>(TensorShape({2}), {2, -1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({3, 5}));
+  test::FillValues<string>(&expected, {"g", "h", "i", "j", "f", "l", "m", "n",
+                                       "o", "k", "b", "c", "d", "e", "a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3}));
+  test::FillValues<float>(&expected, {10, 11, 9, 7, 8, 6, 4, 5, 3, 1, 2, 0});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({2, 2, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3}));
+  test::FillValues<string>(
+      &expected, {"k", "l", "j", "h", "i", "g", "e", "f", "d", "b", "c", "a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD64) {
+  MakeOp(DT_FLOAT, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int64>(TensorShape({2}), {-1, 4});
+  AddInputFromArray<int64>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3}));
+  test::FillValues<float>(&expected,
+                          {5, 3, 4, 8, 6, 7, 11, 9, 10, 14, 12, 13, 2, 0, 1});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD64_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5, 3}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int64>(TensorShape({2}), {-1, 4});
+  AddInputFromArray<int64>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5, 3}));
+  test::FillValues<string>(&expected, {"f", "d", "e", "i", "g", "h", "l", "j",
+                                       "k", "o", "m", "n", "c", "a", "b"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD64) {
+  MakeOp(DT_FLOAT, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({4, 1, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2});
+  AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 1, 3}));
+  test::FillValues<float>(&expected, {1, 2, 0, 4, 5, 3, 7, 8, 6, 10, 11, 9});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD64_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({4, 1, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2});
+  AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({4, 1, 3}));
+  test::FillValues<string>(
+      &expected, {"b", "c", "a", "e", "f", "d", "h", "i", "g", "k", "l", "j"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroShift_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3}));
+  test::FillValues<float>(&expected, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroShift_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({2, 2, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3}));
+  test::FillValues<string>(
+      &expected, {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroSize_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5, 0, 0}), {});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 0, 0}));
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroSize_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5, 0, 0}), {});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5, 0, 0}));
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, OneSize_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({1, 1, 1}), {5});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1}));
+  test::FillValues<float>(&expected, {5});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, OneSize_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({1, 1, 1}), {"a"});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({1, 1, 1}));
+  test::FillValues<string>(&expected, {"a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, MultiShifts_TwoD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({3, 5}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1});
+  AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5}));
+  test::FillValues<float>(&expected,
+                          {11, 12, 13, 14, 10, 1, 2, 3, 4, 0, 6, 7, 8, 9, 5});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, MultiShifts_TwoD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({3, 5}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1});
+  AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({3, 5}));
+  test::FillValues<string>(&expected, {"l", "m", "n", "o", "k", "b", "c", "d",
+                                       "e", "a", "g", "h", "i", "j", "f"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({}), {7});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("axis must be a scalar or a 1-D vector"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("shift must be a scalar or a 1-D vector"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({1}), {1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("shift and axis must have the same size"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_AxisOutOfRange) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({4}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s;
+}
+
+// isd - (inner shift dimension) The inner most dimension to be shifted.
+//    All outer dimensions will also be shifted for testing.
+static Graph* RollGraph(const TensorShape& shape, int isd) {
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor input(DT_FLOAT, shape);
+  input.flat<float>().setRandom();
+  const int dims = static_cast<int>(input.dims());
+  Tensor shift(DT_INT32, TensorShape({dims}));
+  for (int i = 0; i < dims; i++) {
+    // shift the inner shift dimension and all outer dimensions
+    shift.flat<int32>()(i) = (i <= isd) ? 2 : 0;
+  }
+  Tensor axis(DT_INT32, TensorShape({dims}));
+  for (int i = 0; i < dims; i++) {
+    axis.flat<int32>()(i) = i;
+  }
+  test::graph::Roll(g, test::graph::Constant(g, input),
+                    test::graph::Constant(g, shift),
+                    test::graph::Constant(g, axis));
+  return g;
+}
+
+#define BM_ROLL_OUTER(DEVICE)                                                 \
+  static void BM_##DEVICE##_roll_outer(int iters, int rows, int columns) {    \
+    TensorShape shape{rows, columns};                                         \
+    const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
+    testing::ItemsProcessed(num_items);                                       \
+    testing::BytesProcessed(num_items * sizeof(float));                       \
+    testing::UseRealTime();                                                   \
+    test::Benchmark(#DEVICE, RollGraph(shape, 0)).Run(iters);                 \
+  }                                                                           \
+  BENCHMARK(BM_##DEVICE##_roll_outer)                                         \
+      ->ArgPair(256, 256)                                                     \
+      ->ArgPair(512, 512)                                                     \
+      ->ArgPair(1024, 1024)                                                   \
+      ->ArgPair(2048, 2048)
+
+#define BM_ROLL_ALL(DEVICE)                                                   \
+  static void BM_##DEVICE##_roll_all(int iters, int rows, int columns) {      \
+    TensorShape shape{rows, columns};                                         \
+    const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
+    testing::ItemsProcessed(num_items);                                       \
+    testing::BytesProcessed(num_items * sizeof(float));                       \
+    testing::UseRealTime();                                                   \
+    test::Benchmark(#DEVICE, RollGraph(shape, 1)).Run(iters);                 \
+  }                                                                           \
+  BENCHMARK(BM_##DEVICE##_roll_all)                                           \
+      ->ArgPair(256, 256)                                                     \
+      ->ArgPair(512, 512)                                                     \
+      ->ArgPair(1024, 1024)                                                   \
+      ->ArgPair(2048, 2048)
+
+BM_ROLL_OUTER(cpu);
+BM_ROLL_ALL(cpu);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
new file mode 100644
index 0000000000..95b4774fe6
--- /dev/null
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+// --------------------------------------------------------------------------
+REGISTER_OP("Roll")
+    .Input("input: T")
+    .Input("shift: Tshift")
+    .Input("axis: Taxis")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tshift: {int32,int64}")
+    .Attr("Taxis: {int32,int64}")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a323d5bc39..c73d6c37ee 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -76,6 +76,7 @@ py_library(
         ":layers",
         ":lib",
         ":list_ops",
+        ":manip_ops",
         ":math_ops",
         ":metrics",
         ":nn",
@@ -1394,6 +1395,14 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "manip_ops_gen",
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "math_ops_gen",
     visibility = [
@@ -1726,6 +1735,8 @@ py_library(
         ":linalg_grad",
         ":linalg_ops",
         ":logging_ops",
+        ":manip_grad",
+        ":manip_ops",
         ":math_grad",
         ":math_ops",
         ":platform",
@@ -1848,6 +1859,29 @@ py_library(
     ],
 )
 
+py_library(
+    name = "manip_grad",
+    srcs = ["ops/manip_grad.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":control_flow_ops",
+        ":framework_for_generated_wrappers",
+        ":manip_ops",
+    ],
+)
+
+py_library(
+    name = "manip_ops",
+    srcs = ["ops/manip_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework_ops",
+        ":manip_ops_gen",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "logging_ops",
     srcs = ["ops/logging_ops.py"],
@@ -2310,6 +2344,8 @@ py_library(
         ":linalg_ops",
         ":logging_ops",
         ":lookup_ops",
+        ":manip_grad",
+        ":manip_ops",
         ":math_grad",
         ":math_ops",
         ":numerics",
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index bc9ddec2a5..ea7604d30f 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -84,6 +84,7 @@ from tensorflow.python.feature_column import feature_column_lib as feature_colum
 from tensorflow.python.layers import layers
 from tensorflow.python.ops import bitwise_ops as bitwise
 from tensorflow.python.ops import image_ops as image
+from tensorflow.python.ops import manip_ops as manip
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import sets
@@ -241,6 +242,7 @@ _allowed_symbols.extend([
     'linalg',
     'logging',
     'losses',
+    'manip',
     'metrics',
     'newaxis',
     'nn',
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index c87b7652ad..3a6058054b 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1601,6 +1601,19 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "manip_ops_test",
+    size = "small",
+    srcs = ["manip_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:manip_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+    tags = ["no_windows_gpu"],
+)
+
 cuda_py_test(
     name = "matmul_op_test",
     size = "small",
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
new file mode 100644
index 0000000000..3044b21aa4
--- /dev/null
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -0,0 +1,137 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for manip_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import manip_ops
+from tensorflow.python.ops import gradient_checker
+from tensorflow.python.platform import test as test_lib
+
+import numpy as np
+
+# pylint: disable=g-import-not-at-top
+try:
+  from distutils.version import StrictVersion as Version
+  # numpy.roll for multiple shifts was introduced in numpy version 1.12.0
+  NP_ROLL_CAN_MULTISHIFT = Version(np.version.version) >= Version('1.12.0')
+except ImportError:
+  NP_ROLL_CAN_MULTISHIFT = False
+# pylint: enable=g-import-not-at-top
+
+class RollTest(test_util.TensorFlowTestCase):
+  def _testRoll(self, np_input, shift, axis):
+    expected_roll = np.roll(np_input, shift, axis)
+    with self.test_session():
+      roll = manip_ops.roll(np_input, shift, axis)
+      self.assertAllEqual(roll.eval(), expected_roll)
+
+  def _testGradient(self, np_input, shift, axis):
+    with self.test_session():
+      inx = constant_op.constant(np_input.tolist())
+      xs = list(np_input.shape)
+      y = manip_ops.roll(inx, shift, axis)
+      # Expected y's shape to be the same
+      ys = xs
+      jacob_t, jacob_n = gradient_checker.compute_gradient(
+          inx, xs, y, ys, x_init_value=np_input)
+      self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5)
+
+  def _testAll(self, np_input, shift, axis):
+    self._testRoll(np_input, shift, axis)
+    if np_input.dtype == np.float32:
+      self._testGradient(np_input, shift, axis)
+
+  def testIntTypes(self):
+    for t in [np.int32, np.int64]:
+      self._testAll(np.random.randint(-100, 100, (5)).astype(t), 3, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        self._testAll(np.random.randint(-100, 100, (4, 4, 3)).astype(t),
+                      [1, -2, 3], [0, 1, 2])
+        self._testAll(np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t),
+                      [0, 1, -2], [1, 2, 3])
+
+  def testFloatTypes(self):
+    for t in [np.float32, np.float64]:
+      self._testAll(np.random.rand(5).astype(t), 2, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        self._testAll(np.random.rand(3, 4).astype(t), [1, 2], [1, 0])
+        self._testAll(np.random.rand(1, 3, 4).astype(t), [1, 0, -3], [0, 1, 2])
+
+  def testComplexTypes(self):
+    for t in [np.complex64, np.complex128]:
+      x = np.random.rand(4, 4).astype(t)
+      self._testAll(x + 1j * x, 2, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        x = np.random.rand(2, 5).astype(t)
+        self._testAll(x + 1j * x, [1, 2], [1, 0])
+        x = np.random.rand(3, 2, 1, 1).astype(t)
+        self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
+
+
+  def testRollInputMustVectorHigherRaises(self):
+    tensor = 7
+    shift = 1
+    axis = 0
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "input must be 1-D or higher"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollAxisMustBeScalarOrVectorRaises(self):
+    tensor = [[1, 2],
+              [3, 4]]
+    shift = 1
+    axis = [[0, 1]]
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "axis must be a scalar or a 1-D vector"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollShiftMustBeScalarOrVectorRaises(self):
+    tensor = [[1, 2],
+              [3, 4]]
+    shift = [[0, 1]]
+    axis = 1
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "shift must be a scalar or a 1-D vector"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollShiftAndAxisMustBeSameSizeRaises(self):
+    tensor = [[1, 2],
+              [3, 4]]
+    shift = [1]
+    axis = [0, 1]
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "shift and axis must have the same size"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollAxisOutOfRangeRaises(self):
+    tensor = [1, 2]
+    shift = 1
+    axis = 1
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "is out of range"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 314726ede6..230b6c5946 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import image_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import logging_ops  # pylint: disable=unused-import
+from tensorflow.python.ops import manip_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
diff --git a/tensorflow/python/ops/manip_grad.py b/tensorflow/python/ops/manip_grad.py
new file mode 100644
index 0000000000..573e8c0a0d
--- /dev/null
+++ b/tensorflow/python/ops/manip_grad.py
@@ -0,0 +1,32 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Gradients for operators defined in manip_ops.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import manip_ops
+
+
+@ops.RegisterGradient("Roll")
+def _RollGrad(op, grad):
+  # The gradient is just the roll reversed
+  shift = op.inputs[1]
+  axis = op.inputs[2]
+  roll_grad = manip_ops.roll(grad, -shift, axis)
+  return roll_grad, None, None
diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py
new file mode 100644
index 0000000000..c5f39784f4
--- /dev/null
+++ b/tensorflow/python/ops/manip_ops.py
@@ -0,0 +1,36 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operators for manipulating tensors.
+
+@@roll
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import gen_manip_ops as _gen_manip_ops
+from tensorflow.python.util.all_util import remove_undocumented
+
+# pylint: disable=protected-access
+def roll(input, shift, axis):
+  return _gen_manip_ops.roll(input, shift, axis)
+
+roll.__doc__ = _gen_manip_ops.roll.__doc__
+# pylint: enable=protected-access
+
+_allowed_symbols = ['roll']
+
+remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index 30bf4e4ef1..737b923415 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -26,6 +26,7 @@ import sys as _sys
 from tensorflow.python.ops import array_grad
 from tensorflow.python.ops import data_flow_grad
 from tensorflow.python.ops import math_grad
+from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import sparse_grad
 from tensorflow.python.ops import spectral_grad
 from tensorflow.python.ops import state_grad
@@ -59,6 +60,7 @@ from tensorflow.python.ops.logging_ops import Print
 from tensorflow.python.ops.logging_ops import get_summary_op
 from tensorflow.python.ops.lookup_ops import initialize_all_tables
 from tensorflow.python.ops.lookup_ops import tables_initializer
+from tensorflow.python.ops.manip_ops import *
 from tensorflow.python.ops.math_ops import *
 from tensorflow.python.ops.numerics import *
 from tensorflow.python.ops.parsing_ops import *
@@ -105,6 +107,7 @@ from tensorflow.python.ops import init_ops as _init_ops
 from tensorflow.python.ops import io_ops as _io_ops
 from tensorflow.python.ops import linalg_ops as _linalg_ops
 from tensorflow.python.ops import logging_ops as _logging_ops
+from tensorflow.python.ops import manip_ops as _manip_ops
 from tensorflow.python.ops import math_ops as _math_ops
 from tensorflow.python.ops import numerics as _numerics
 from tensorflow.python.ops import parsing_ops as _parsing_ops
@@ -280,6 +283,7 @@ remove_undocumented(__name__, _allowed_symbols,
                      _io_ops,
                      _linalg_ops,
                      _logging_ops,
+                     _manip_ops,
                      _math_ops,
                      _numerics,
                      _parsing_ops,
diff --git a/tensorflow/tools/api/golden/tensorflow.manip.pbtxt b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt
new file mode 100644
index 0000000000..0b84165285
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.manip"
+tf_module {
+  member_method {
+    name: "roll"
+    argspec: "args=[\'input\', \'shift\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index dc7c3a2f45..e8890e9cc0 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -396,6 +396,10 @@ tf_module {
     name: "losses"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "manip"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "metrics"
     mtype: "<type \'module\'>"
-- 
GitLab


From 90d628b0ba66a6144c4a83292e95f4dde287f5a9 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Mon, 29 Jan 2018 16:10:13 -0800
Subject: [PATCH 365/966] Remove debug log and use vlog instead

---
 .../contrib/tensorrt/convert/convert_graph.cc |  1 -
 .../contrib/tensorrt/convert/convert_nodes.cc | 99 +++++++++----------
 tensorflow/contrib/tensorrt/log/trt_logger.cc |  1 -
 3 files changed, 49 insertions(+), 52 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index eea51d458b..b8dbc7b7c8 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -32,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#define _TF_LOG_DEBUG ::tensorflow::internal::LogMessage(__FILE__, __LINE__, -1)
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/devices.h"
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 5879cd3189..60e6a1ab96 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -36,7 +36,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#define _TF_LOG_DEBUG ::tensorflow::internal::LogMessage(__FILE__, __LINE__, -1)
 //  Check if the types are equal. Cast to int first so that failure log message
 //  would work!
 
@@ -107,7 +106,7 @@ static std::vector<std::pair<int, int>> createSamePadding(
     int left = p / 2;
     int right = p - left;
 
-    LOG(DEBUG) << "PADDING_" << i << " pre: " << left << ", post: " << right
+    VLOG(-1) << "PADDING_" << i << " pre: " << left << ", post: " << right
                << "paras: " << inputDims[i] << ", " << stride.d[i] << ", "
                << "kernel: " << kernel.d[i];
     padding[i] = {left, right};
@@ -356,7 +355,7 @@ class Converter {
       tensorflow::NodeDef const& node_def) {
     std::vector<TRT_TensorOrWeights> inputs;
     for (auto const& input_name : node_def.input()) {
-      LOG(DEBUG) << "retrieve input: " << input_name;
+      VLOG(-1) << "retrieve input: " << input_name;
       inputs.push_back(_trt_tensors.at(input_name));
     }
     return inputs;
@@ -399,7 +398,7 @@ class Converter {
       if (output.is_tensor()) {
         output.tensor()->setName(output_name.c_str());
       }
-      LOG(DEBUG) << "write out tensor: " << output_name;
+      VLOG(-1) << "write out tensor: " << output_name;
       if (!_trt_tensors.insert({output_name, output}).second) {
         return tensorflow::errors::AlreadyExists(
             "output tensor already exists for op: " + op);
@@ -460,13 +459,13 @@ struct LambdaFactory {
   std::function<T(T)> unary() {
     switch (op) {
       case OP_CATEGORY::RSQRT: {
-        LOG(DEBUG) << "RSQRT GETS DONE";
+        VLOG(-1) << "RSQRT GETS DONE";
         return [](T t) -> T { return 1.0 / std::sqrt(t); };
       }
       case OP_CATEGORY::NEG:
         return [](T t) -> T { return -t; };
       default:
-        LOG(DEBUG) << "not supported op for unary: " << static_cast<int>(op);
+        VLOG(-1) << "not supported op for unary: " << static_cast<int>(op);
         return nullptr;
     }
   }
@@ -491,22 +490,22 @@ struct LambdaFactory {
 
   template <typename T>
   std::function<T(T)> broadcast_r(T val) {
-    LOG(DEBUG) << "LAMBDA VAL : " << val;
+    VLOG(-1) << "LAMBDA VAL : " << val;
     switch (op) {
       case OP_CATEGORY::ADD:
         return [val](T l) -> T {
-          LOG(DEBUG) << "LAMBDA VAL : " << val;
+          VLOG(-1) << "LAMBDA VAL : " << val;
           return l + val;
         };
       // return [val](T l)-> T {return l+val;};
       case OP_CATEGORY::SUB:
         return [val](T l) -> T {
-          LOG(DEBUG) << "LAMBDA VAL : " << val;
+          VLOG(-1) << "LAMBDA VAL : " << val;
           return l - val;
         };
       case OP_CATEGORY::MUL:
         return [val](T l) -> T {
-          LOG(DEBUG) << "LAMBDA VAL : " << val;
+          VLOG(-1) << "LAMBDA VAL : " << val;
           return l * val;
         };
       default:
@@ -520,21 +519,21 @@ struct LambdaFactory {
 
   template <typename T>
   std::function<T(T)> broadcast_l(T val) {
-    LOG(DEBUG) << "LAMBDA VAL : " << val;
+    VLOG(-1) << "LAMBDA VAL : " << val;
     switch (op) {
       case OP_CATEGORY::ADD:
         return [val](T l) -> T {
-          LOG(DEBUG) << "LAMBDA VAL : " << val;
+          VLOG(-1) << "LAMBDA VAL : " << val;
           return val + l;
         };
       case OP_CATEGORY::SUB:
         return [val](T l) -> T {
-          LOG(DEBUG) << "LAMBDA VAL : " << val;
+          VLOG(-1) << "LAMBDA VAL : " << val;
           return val - l;
         };
       case OP_CATEGORY::MUL:
         return [val](T l) -> T {
-          LOG(DEBUG) << "LAMBDA VAL : " << val;
+          VLOG(-1) << "LAMBDA VAL : " << val;
           return val * l;
         };
       default:
@@ -574,7 +573,7 @@ tensorflow::Status BinaryCompute(TRT_ShapedWeights const& iweights_l,
   // assume iweights_l.type == iweight_r.type
   CHECK_EQ(iweights_l.type_, oweights->type_);
   CHECK_EQ(iweights_r.type_, oweights->type_);
-  LOG(DEBUG) << "SANITY CHECK!";
+  VLOG(-1) << "SANITY CHECK!";
 
   switch (iweights_l.type_) {
     case tensorflow::DataType::DT_FLOAT: {
@@ -585,11 +584,11 @@ tensorflow::Status BinaryCompute(TRT_ShapedWeights const& iweights_l,
       if (iweights_l.count() != iweights_r.count()) {
         // we only supports broadcast of RankZero
         if (iweights_l.count() == 1) {
-          LOG(DEBUG) << "I bet it is not working!" << (*inp_l);
+          VLOG(-1) << "I bet it is not working!" << (*inp_l);
           std::transform(inp_r, inp_r + iweights_r.count(), oup,
                          binary_op.broadcast_l<float>(*inp_l));
         } else if (iweights_r.count() == 1) {
-          LOG(DEBUG) << "I bet it is not working!" << (*inp_r);
+          VLOG(-1) << "I bet it is not working!" << (*inp_r);
           std::transform(inp_l, inp_l + iweights_l.count(), oup,
                          binary_op.broadcast_r<float>(*inp_r));
         } else {
@@ -664,7 +663,7 @@ tensorflow::Status ConstantFoldBinary(
   int nbDims = weights_input_l.shape_.nbDims;
   nvinfer1::Dims output_shape;
   output_shape.nbDims = nbDims;
-  LOG(DEBUG) << "nbDims: " << nbDims
+  VLOG(-1) << "nbDims: " << nbDims
              << "the other: " << weights_input_r.shape_.nbDims;
   for (int i = 0; i < nbDims; i++) {
     if (weights_input_l.shape_.d[i] == weights_input_r.shape_.d[i]) {
@@ -677,7 +676,7 @@ tensorflow::Status ConstantFoldBinary(
       return tensorflow::errors::Unimplemented(
           "Binary op with incompatible shape at, " + node_def.op());
     }
-    LOG(DEBUG) << "left: " << weights_input_l.shape_.d[i]
+    VLOG(-1) << "left: " << weights_input_l.shape_.d[i]
                << "right: " << weights_input_r.shape_.d[i]
                << "output: " << output_shape.d[i];
   }
@@ -739,7 +738,7 @@ tensorflow::Status BinaryTensorOpWeight(
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
   if (weights.count() == 1) {
-    LOG(DEBUG) << "UNIFORM";
+    VLOG(-1) << "UNIFORM";
     scale_mode = nvinfer1::ScaleMode::kUNIFORM;
   } else {
     // no broadcasting on Batch dimension;
@@ -842,7 +841,7 @@ tensorflow::Status ConvertPlaceholder(
     Converter& ctx, tensorflow::NodeDef const& node_def,
     std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
-  LOG(DEBUG) << "Placeholder should have been replace already";
+  VLOG(-1) << "Placeholder should have been replace already";
   return tensorflow::errors::Unimplemented("cannot convert Placeholder op");
   // OK this make sense since we are supposed to replace it with input
   TFAttrs attrs(node_def);
@@ -909,11 +908,11 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
-    LOG(DEBUG) << "padding!!!: " << padding[0].first << padding[0].second
+    VLOG(-1) << "padding!!!: " << padding[0].first << padding[0].second
                << padding[1].first << padding[1].second;
 
     auto dim_before = tensor->getDimensions();
-    LOG(DEBUG) << "TENSOR before: " << dim_before.d[0] << ", "
+    VLOG(-1) << "TENSOR before: " << dim_before.d[0] << ", "
                << dim_before.d[1] << dim_before.d[2] << ", " << dim_before.d[3];
     auto padLayer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
@@ -922,7 +921,7 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
     padding = {{0, 0}, {0, 0}};
     tensor = padLayer->getOutput(0);
     auto dim_after = tensor->getDimensions();
-    LOG(DEBUG) << "TENSOR after: " << dim_after.d[0] << ", " << dim_after.d[1]
+    VLOG(-1) << "TENSOR after: " << dim_after.d[0] << ", " << dim_after.d[1]
                << dim_after.d[2] << ", " << dim_after.d[3];
   }
 
@@ -936,14 +935,14 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
   auto dim_after = output_tensor->getDimensions();
-  LOG(DEBUG) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1]
+  VLOG(-1) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1]
              << dim_after.d[2] << ", " << dim_after.d[3];
 
   if (data_format == "NHWC") {
     // TODO(jie): transpose it back!
     output_tensor = ctx.transposeTensor(output_tensor, {0, 2, 3, 1});
   } else {
-    LOG(DEBUG) << "NCHW !!!!";
+    VLOG(-1) << "NCHW !!!!";
   }
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
@@ -965,7 +964,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
     tensor = ctx.transposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
                                  {0, 3, 1, 2});
   } else {
-    LOG(DEBUG) << "NCHW !!!!";
+    VLOG(-1) << "NCHW !!!!";
   }
   nvinfer1::PoolingType type;
   // TODO(jie): support other pooling type
@@ -993,7 +992,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
         {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
   } else if (attrs.get<std::string>("padding") == "VALID") {
     // No padding for valid padding here
-    LOG(DEBUG) << "no padding added for VALID padding in pool"
+    VLOG(-1) << "no padding added for VALID padding in pool"
                << node_def.name();
     padding = {{0, 0}, {0, 0}};
   } else {
@@ -1004,7 +1003,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
-    LOG(DEBUG) << "padding!!!: " << padding[0].first << padding[0].second
+    VLOG(-1) << "padding!!!: " << padding[0].first << padding[0].second
                << padding[1].first << padding[1].second;
     auto padLayer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
@@ -1026,7 +1025,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
     // TODO(jie): transpose it back!
     output_tensor = ctx.transposeTensor(output_tensor, {0, 2, 3, 1});
   } else {
-    LOG(DEBUG) << "NCHW !!!!";
+    VLOG(-1) << "NCHW !!!!";
   }
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
@@ -1068,7 +1067,7 @@ tensorflow::Status ConvertScale(Converter& ctx,
                                  {0, 3, 1, 2});
     // TODO(jie): transpose it
   } else {
-    LOG(DEBUG) << "NCHW !!!!";
+    VLOG(-1) << "NCHW !!!!";
   }
   nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
       *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ScaleMode::kCHANNEL,
@@ -1079,7 +1078,7 @@ tensorflow::Status ConvertScale(Converter& ctx,
     // TODO(jie): transpose it back!
     output_tensor = ctx.transposeTensor(output_tensor, {0, 2, 3, 1});
   } else {
-    LOG(DEBUG) << "NCHW !!!!";
+    VLOG(-1) << "NCHW !!!!";
   }
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
@@ -1104,14 +1103,14 @@ tensorflow::Status ConvertConst(Converter& ctx,
 
   TRT_ShapedWeights weights(dtype);
   if (!weights_tensor.float_val().empty()) {
-    LOG(DEBUG) << "SCALAR!!!" << node_def.name();
+    VLOG(-1) << "SCALAR!!!" << node_def.name();
     nvinfer1::Dims scalar_shape;
     if (tensor.dims() > 0) {
-      LOG(DEBUG) << "dimensions: " << tensor.dims();
+      VLOG(-1) << "dimensions: " << tensor.dims();
       weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
                                   get_tensor_shape(tensor));
     } else {
-      LOG(DEBUG) << "dimensions: " << tensor.dims();
+      VLOG(-1) << "dimensions: " << tensor.dims();
       scalar_shape.nbDims = 1;
       scalar_shape.d[0] = 1;
       scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
@@ -1123,7 +1122,7 @@ tensorflow::Status ConvertConst(Converter& ctx,
                                   scalar_shape);
     }
   } else if (!weights_tensor.tensor_content().empty()) {
-    LOG(DEBUG) << "TENSOR!!!" << node_def.name();
+    VLOG(-1) << "TENSOR!!!" << node_def.name();
     weights = TRT_ShapedWeights(dtype, weights_tensor.tensor_content().data(),
                                 get_tensor_shape(tensor));
   } else {
@@ -1425,11 +1424,11 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     }
   }
   // topological order is needed to build TRT network
-  LOG(DEBUG) << "BUILDING 1";
+  VLOG(-1) << "BUILDING 1";
 
   tensorflow::tensorrt::Logger trt_logger;
 
-  LOG(DEBUG) << "BUILDING 2";
+  VLOG(-1) << "BUILDING 2";
 
   auto trt_builder = infer_object(nvinfer1::createInferBuilder(trt_logger));
   if (!trt_builder) {
@@ -1437,7 +1436,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         "failed to create TensorRT builder object");
   }
 
-  LOG(DEBUG) << "BUILDING 3";
+  VLOG(-1) << "BUILDING 3";
 
   auto trt_network = infer_object(trt_builder->createNetwork());
   if (!trt_network) {
@@ -1445,16 +1444,16 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         "failed to create TensorRT network object");
   }
 
-  LOG(DEBUG) << "BUILDING 4";
+  VLOG(-1) << "BUILDING 4";
 
   // Build the network
   Converter converter(trt_network.get());
 
-  LOG(DEBUG) << "BUILDING 5";
+  VLOG(-1) << "BUILDING 5";
   std::vector<std::string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
   for (std::pair<int, int> const& input : input_inds) {
-    LOG(DEBUG) << "parsing input!!!!!";
+    VLOG(-1) << "parsing input!!!!!";
     int node_id = input.first;
     int output_idx = input.second;
     tensorflow::Node* node = graph.FindNodeId(node_id);
@@ -1480,7 +1479,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
     TF_CHECK_OK(convert_dtype(tf_dtype, &dtype));
 
-    LOG(DEBUG) << "accessing output index of: " << std::to_string(output_idx)
+    VLOG(-1) << "accessing output index of: " << std::to_string(output_idx)
                << ", at node: " << node_name
                << "with output entry from shape_map: "
                << std::to_string(op_info_vec.size());
@@ -1490,7 +1489,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
 
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
-      LOG(DEBUG) << "dimension: " << i
+      VLOG(-1) << "dimension: " << i
                  << " , size: " << op_info.shape().dim(i).size();
       input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
     }
@@ -1506,23 +1505,23 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     if (!input_tensor)
       return tensorflow::errors::InvalidArgument(
           "Failed to create Input layer");
-    LOG(DEBUG) << "input tensor name :" << input_tensor_name;
+    VLOG(-1) << "input tensor name :" << input_tensor_name;
 
     if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
       return tensorflow::errors::AlreadyExists(
           "output tensor already exists for op: " + input_tensor_name);
   }
 
-  LOG(DEBUG) << "finished sorting";
+  VLOG(-1) << "finished sorting";
 
   for (const tensorflow::Node* node : order) {
     tensorflow::NodeDef const& node_def = node->def();
-    LOG(DEBUG) << "converting node: " << node_def.name() << " , "
+    VLOG(-1) << "converting node: " << node_def.name() << " , "
                << node_def.op();
     TF_RETURN_IF_ERROR(converter.convert_node(node_def));
   }
 
-  LOG(DEBUG) << "finished conversion";
+  VLOG(-1) << "finished conversion";
 
   // Gather output metadata
   std::vector<std::string> output_names;
@@ -1535,7 +1534,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     std::string tensor_name = op_name;
     if (output_idx != 0)
       tensor_name = tensor_name + ":" + std::to_string(output_idx);
-    LOG(DEBUG) << "output tensor name: " << tensor_name;
+    VLOG(-1) << "output tensor name: " << tensor_name;
     output_names.push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
@@ -1555,7 +1554,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     tensor->setType(trt_dtype);
   }
 
-  LOG(DEBUG) << "finished output";
+  VLOG(-1) << "finished output";
 
   // Build the engine
   trt_builder->setMaxBatchSize(max_batch_size);
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index e4ff526172..35f96511f1 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 // Use TF logging for TensorRT informations
 
-#define _TF_LOG_DEBUG ::tensorflow::internal::LogMessage(__FILE__, __LINE__, -1)
 
 namespace tensorflow {
 namespace tensorrt {
-- 
GitLab


From ee69436c2dd034ade90e5e278ef233917ad8afcc Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Mon, 29 Jan 2018 16:22:43 -0800
Subject: [PATCH 366/966] Adds prediction_hooks into EstimatorSpec.

PiperOrigin-RevId: 183747105
---
 tensorflow/python/estimator/estimator.py      |  5 ++++-
 tensorflow/python/estimator/estimator_test.py | 19 +++++++++++++++++++
 tensorflow/python/estimator/model_fn.py       | 15 +++++++++++----
 tensorflow/python/estimator/model_fn_test.py  | 15 +++++++++++++--
 ...tensorflow.estimator.-estimator-spec.pbtxt |  4 ++++
 5 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index face20d530..8d1f1afcff 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -477,13 +477,16 @@ class Estimator(object):
       estimator_spec = self._call_model_fn(
           features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
       predictions = self._extract_keys(estimator_spec.predictions, predict_keys)
+      all_hooks = list(input_hooks)
+      all_hooks.extend(hooks)
+      all_hooks.extend(list(estimator_spec.prediction_hooks or []))
       with training.MonitoredSession(
           session_creator=training.ChiefSessionCreator(
               checkpoint_filename_with_path=checkpoint_path,
               master=self._config.master,
               scaffold=estimator_spec.scaffold,
               config=self._session_config),
-          hooks=input_hooks + hooks) as mon_sess:
+          hooks=all_hooks) as mon_sess:
         while not mon_sess.should_stop():
           preds_evaluated = mon_sess.run(predictions)
           if not isinstance(predictions, dict):
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index 833f3dcac3..39a5b998eb 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -1355,6 +1355,25 @@ class EstimatorPredictTest(test.TestCase):
     est.train(dummy_input_fn, steps=1)
     self.assertEqual(10., next(est.predict(dummy_input_fn)))
 
+  def test_predictionhooks_are_used(self):
+    hook = test.mock.MagicMock(
+        wraps=training.SessionRunHook(), spec=training.SessionRunHook)
+
+    def _model_fn_hooks(features, labels, mode):
+      _, _ = features, labels
+      return model_fn_lib.EstimatorSpec(
+          mode=mode,
+          loss=constant_op.constant(0.),
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          predictions=constant_op.constant([[10.]]),
+          prediction_hooks=[hook])
+
+    est = estimator.Estimator(model_fn=_model_fn_hooks)
+    est.train(dummy_input_fn, steps=1)
+    self.assertFalse(hook.begin.called)
+    next(est.predict(dummy_input_fn))
+    self.assertTrue(hook.begin.called)
+
   def test_warn_if_no_queue_runner(self):
 
     def _model_fn(features, labels, mode):
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index da202408c3..b08f83fc56 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -56,7 +56,7 @@ class EstimatorSpec(
     collections.namedtuple('EstimatorSpec', [
         'mode', 'predictions', 'loss', 'train_op', 'eval_metric_ops',
         'export_outputs', 'training_chief_hooks', 'training_hooks', 'scaffold',
-        'evaluation_hooks'
+        'evaluation_hooks', 'prediction_hooks'
     ])):
   """Ops and objects returned from a `model_fn` and passed to an `Estimator`.
 
@@ -73,7 +73,8 @@ class EstimatorSpec(
               training_chief_hooks=None,
               training_hooks=None,
               scaffold=None,
-              evaluation_hooks=None):
+              evaluation_hooks=None,
+              prediction_hooks=None):
     """Creates a validated `EstimatorSpec` instance.
 
     Depending on the value of `mode`, different arguments are required. Namely
@@ -154,6 +155,8 @@ class EstimatorSpec(
         initialization, saver, and more to be used in training.
       evaluation_hooks: Iterable of `tf.train.SessionRunHook` objects to
         run during evaluation.
+      prediction_hooks: Iterable of `tf.train.SessionRunHook` objects to
+        run during predictions.
 
     Returns:
       A validated `EstimatorSpec` object.
@@ -282,7 +285,10 @@ class EstimatorSpec(
     training_chief_hooks = tuple(training_chief_hooks or [])
     training_hooks = tuple(training_hooks or [])
     evaluation_hooks = tuple(evaluation_hooks or [])
-    for hook in training_hooks + training_chief_hooks + evaluation_hooks:
+    prediction_hooks = tuple(prediction_hooks or [])
+
+    for hook in (training_hooks + training_chief_hooks + evaluation_hooks +
+                 prediction_hooks):
       if not isinstance(hook, session_run_hook.SessionRunHook):
         raise TypeError(
             'All hooks must be SessionRunHook instances, given: {}'.format(
@@ -305,7 +311,8 @@ class EstimatorSpec(
         training_chief_hooks=training_chief_hooks,
         training_hooks=training_hooks,
         scaffold=scaffold,
-        evaluation_hooks=evaluation_hooks)
+        evaluation_hooks=evaluation_hooks,
+        prediction_hooks=prediction_hooks)
 
   def _replace(self, **kwds):
     """Return a new EstimatorSpec replacing specified fields with new values."""
diff --git a/tensorflow/python/estimator/model_fn_test.py b/tensorflow/python/estimator/model_fn_test.py
index d67c4b7161..b7eeeb437c 100644
--- a/tensorflow/python/estimator/model_fn_test.py
+++ b/tensorflow/python/estimator/model_fn_test.py
@@ -72,7 +72,8 @@ class EstimatorSpecTrainTest(test.TestCase):
           training_chief_hooks=[_FakeHook()],
           training_hooks=[_FakeHook()],
           scaffold=monitored_session.Scaffold(),
-          evaluation_hooks=[_FakeHook()])
+          evaluation_hooks=[_FakeHook()],
+          prediction_hooks=[_FakeHook()])
 
   def testLossNumber(self):
     """Tests that error is raised when loss is a number (not Tensor)."""
@@ -465,7 +466,17 @@ class EstimatorSpecInferTest(test.TestCase):
           training_chief_hooks=[_FakeHook()],
           training_hooks=[_FakeHook()],
           scaffold=monitored_session.Scaffold(),
-          evaluation_hooks=[_FakeHook()])
+          evaluation_hooks=[_FakeHook()],
+          prediction_hooks=[_FakeHook()])
+
+  def testPredictionHookInvalid(self):
+    with ops.Graph().as_default(), self.test_session():
+      with self.assertRaisesRegexp(
+          TypeError, 'All hooks must be SessionRunHook instances'):
+        model_fn.EstimatorSpec(
+            mode=model_fn.ModeKeys.PREDICT,
+            predictions=constant_op.constant(1.),
+            prediction_hooks=[_InvalidHook()])
 
   def testPredictionsMissing(self):
     with ops.Graph().as_default(), self.test_session():
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
index dbcc187f94..aa6ac46613 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "mode"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "prediction_hooks"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "predictions"
     mtype: "<type \'property\'>"
-- 
GitLab


From a9db14e45d799d62914b5cde31d4d85f007b85eb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 16:50:27 -0800
Subject: [PATCH 367/966] Making a bunch of edits to the Getting Started with
 Premade Estimators doc for consistency and correctness.

PiperOrigin-RevId: 183751330
---
 .../get_started/premade_estimators.md         | 115 ++++++++++--------
 1 file changed, 62 insertions(+), 53 deletions(-)

diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
index dbc35065ab..45850a8996 100644
--- a/tensorflow/docs_src/get_started/premade_estimators.md
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -2,37 +2,39 @@
 # Getting Started with TensorFlow
 
 This document introduces the TensorFlow programming environment and shows you
-how to write the Iris classification problem in TensorFlow.
+how to solve the Iris classification problem in TensorFlow.
 
-Prior to reading this document, do the following:
+## Prerequisites
+
+Prior to using the sample code in this document, you'll need to do the
+following:
 
 * @{$install$Install TensorFlow}.
 * If you installed TensorFlow with virtualenv or Anaconda, activate your
   TensorFlow environment.
-* To keep the data import simple, our Iris example uses Pandas. You can
-  install Pandas with:
+* Install or upgrade pandas by issuing the following command:
 
-      `pip install pandas`
+        pip install pandas
 
 ## Getting the sample code
 
-Take the following steps to get the sample code for this program:
+Take the following steps to get the sample code we'll be going through:
 
-1. Clone the TensorFlow Models repository from github by entering the following
+1. Clone the TensorFlow Models repository from GitHub by entering the following
    command:
 
-       `git clone https://github.com/tensorflow/models`
+        git clone https://github.com/tensorflow/models
 
 1. Change directory within that branch to the location containing the examples
    used in this document:
 
-       `cd models/samples/core/get_started/`
+        cd models/samples/core/get_started/
 
 The program described in this document is
 [`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py).
 This program uses
 [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py)
-To fetch its training data.
+to fetch its training data.
 
 ### Running the program
 
@@ -45,7 +47,7 @@ python premade_estimator.py
 The program should output training logs followed by some predictions against
 the test set. For example, the first line in the following output shows that
 the model thinks there is a 99.6% chance that the first example in the test
-set is a Setosa. Since the test set `expected "Setosa"`, this appears to be
+set is a Setosa. Since the test set expected Setosa, this appears to be
 a good prediction.
 
 ``` None
@@ -61,9 +63,9 @@ If the program generates errors instead of answers, ask yourself the following
 questions:
 
 * Did you install TensorFlow properly?
-* Are you using the correct version of tensorflow?
+* Are you using the correct version of TensorFlow?
 * Did you activate the environment you installed TensorFlow in? (This is
-  only relevant in certain installation environments.)
+  only relevant in certain installation mechanisms.)
 
 ## The programming stack
 
@@ -74,18 +76,15 @@ provides a programming stack consisting of multiple API layers:
 <div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="../images/tensorflow_programming_environment.png">
 </div>
-<div style="text-align: center">
-The TensorFlow Programming Environment
-</div>
 
 We strongly recommend writing TensorFlow programs with the following APIs:
 
-* @{tf.estimator$Estimators}, which represent a complete model.
+* @{$programmers_guide/estimators$Estimators}, which represent a complete model.
   The Estimator API provides methods to train the model, to judge the model's
   accuracy, and to generate predictions.
 * @{$get_started/datasets_quickstart$Datasets}, which build a data input
   pipeline. The Dataset API has methods to load and manipulate data, and feed
-  it into your model. The Datasets API meshes well with the Estimators API.
+  it into your model. The Dataset API meshes well with the Estimators API.
 
 ## Classifying irises: an overview
 
@@ -120,7 +119,7 @@ individual Iris flowers:
 * petal length
 * petal width
 
-Our model will represent these features as float32 numerical data.
+Our model will represent these features as `float32` numerical data.
 
 The label identifies the Iris species, which must be one of the following:
 
@@ -154,9 +153,6 @@ The following figure illustrates the features, hidden layers, and predictions
   alt="A diagram of the network architecture: Inputs, 2 hidden layers, and outputs"
   src="../images/custom_estimators/full_network.png">
 </div>
-<div style="text-align: center">
-The Model.
-</div>
 
 ### Inference
 
@@ -174,12 +170,12 @@ example is an Iris Versicolor.
 
 ## Overview of programming with Estimators
 
-An Estimator is TensorFlow's high level representation of a complete model. It
+An Estimator is TensorFlow's high-level representation of a complete model. It
 handles the details of initialization, logging, saving and restoring, and many
 other features so you can concentrate on your model. For more details see
 @{$programmers_guide/estimators}.
 
-An "Estimator" is any class derived from @{tf.estimator.Estimator}. TensorFlow
+An Estimator is any class derived from @{tf.estimator.Estimator}. TensorFlow
 provides a collection of
 [pre-made Estimators](https://developers.google.com/machine-learning/glossary/#pre-made_Estimator)
 (for example, `LinearRegressor`) to implement common ML algorithms. Beyond
@@ -199,7 +195,7 @@ following tasks:
 * Call one or more methods on the Estimator object, passing the appropriate
   input function as the source of the data.
 
-Let's see how those tasks are implemented in Iris.
+Let's see how those tasks are implemented for Iris classification.
 
 ## Create input functions
 
@@ -209,17 +205,30 @@ evaluating, and prediction.
 An **input function** is a function that returns a @{tf.data.Dataset} object
 which outputs the following two-element tuple:
 
-* "features" - A Python dictionary in which:
+* [`features`](https://developers.google.com/machine-learning/glossary/#feature) - A Python dictionary in which:
     * Each key is the name of a feature.
     * Each value is an array containing all of that feature's values.
-* "label" - An array containing the values of the
+* `label` - An array containing the values of the
   [label](https://developers.google.com/machine-learning/glossary/#label) for
   every example.
 
-Your input function may generate the "features" dictionary and "label" list any
-way you like. However, we recommend using TensorFlow's @{tf.data.Dataset} API,
-which can deftly parse all sorts of data. At a high-level,
-the @{tf.data.Dataset} API consists of the following classes:
+Just to demonstrate the format of the input function, here's a simple
+implementation:
+
+```python
+def input_evaluation_set():
+    features = {'SepalLength': np.array([6.4, 5.0]),
+                'SepalWidth':  np.array([2.8, 2.3]),
+                'PetalLength': np.array([5.6, 3.3]),
+                'PetalWidth':  np.array([2.2, 1.0])}
+    labels = np.array([2, 1])
+    return features, labels
+```
+
+Your input function may generate the `features` dictionary and `label` list any
+way you like. However, we recommend using TensorFlow's Dataset API, which can
+parse all sorts of data. At a high level, the Dataset API consists of the
+following classes:
 
 <div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%"
@@ -227,23 +236,23 @@ the @{tf.data.Dataset} API consists of the following classes:
   src="../images/dataset_classes.png">
 </div>
 
+Where the individual members are:
 
-Where:
-
-* Dataset: Base class containing methods to create and transform datasets. Also
-  allows you to initialize a dataset from data in memory, or from a Python
-  generator.
-* TextLineDataset: Reads lines from text files.
-* TFRecordDataset: Reads records from TFRecord files.
-* FixedLengthRecordDataset: Reads fixed size records from binary files.
-* Iterator: Provides a way to access one data set element at a time.
+* `Dataset` - Base class containing methods to create and transform
+  datasets. Also allows you to initialize a dataset from data in memory, or from
+  a Python generator.
+* `TextLineDataset` - Reads lines from text files.
+* `TFRecordDataset` - Reads records from TFRecord files.
+* `FixedLengthRecordDataset` - Reads fixed size records from binary files.
+* `Iterator` - Provides a way to access one data set element at a time.
 
 The Dataset API can handle a lot of common cases for you. For example,
 using the Dataset API, you can easily read in records from a large collection
 of files in parallel and join them into a single stream.
 
-To keep things simple in this example we are going to load the data with pandas,
-and build our input pipeline from this in-memory data.
+To keep things simple in this example we are going to load the data with
+[pandas](https://pandas.pydata.org/), and build our input pipeline from this
+in-memory data.
 
 Here is the input function used for training in this program, which is available
 in [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py):
@@ -258,9 +267,9 @@ def train_input_fn(features, labels, batch_size):
     return dataset.shuffle(1000).repeat().batch(batch_size)
 ```
 
-## Define the Feature Columns
+## Define the feature columns
 
-A [**Feature Column**](https://developers.google.com/machine-learning/glossary/#feature_columns)
+A [**feature column**](https://developers.google.com/machine-learning/glossary/#feature_columns)
 is an object describing how the model should use raw input data from the
 features dictionary. When you build an Estimator model, you pass it a list of
 feature columns that describes each of the features you want the model to use.
@@ -270,7 +279,7 @@ to the model.
 For Iris, the 4 raw features are numeric values, so we'll build a list of
 feature columns to tell the Estimator model to represent each of the four
 features as 32-bit floating-point values. Therefore, the code to create the
-Feature Column is simply:
+feature column is:
 
 ```python
 # Feature columns describe how to use the input.
@@ -279,29 +288,29 @@ for key in train_x.keys():
     my_feature_columns.append(tf.feature_column.numeric_column(key=key))
 ```
 
-Feature Columns can be far more sophisticated than those we're showing here.
-We detail feature columns @{$get_started/feature_columns$later on} in
-getting started.
+Feature columns can be far more sophisticated than those we're showing here.  We
+detail feature columns @{$get_started/feature_columns$later on} in our Getting
+Started guide.
 
 Now that we have the description of how we want the model to represent the raw
 features, we can build the estimator.
 
 
-## Instantiate an Estimator
+## Instantiate an estimator
 
 The Iris problem is a classic classification problem. Fortunately, TensorFlow
 provides several pre-made classifier Estimators, including:
 
-* @{tf.estimator.DNNClassifier}—for deep models that perform multi-class
+* @{tf.estimator.DNNClassifier} for deep models that perform multi-class
   classification.
-* @{tf.estimator.DNNLinearCombinedClassifier}—for wide-n-deep models.
-* @{tf.estimator.LinearClassifier}— for classifiers based on linear models.
+* @{tf.estimator.DNNLinearCombinedClassifier} for wide & deep models.
+* @{tf.estimator.LinearClassifier} for classifiers based on linear models.
 
 For the Iris problem, `tf.estimator.DNNClassifier` seems like the best choice.
 Here's how we instantiated this Estimator:
 
 ```python
-# Build 2 hidden layer DNN with 10, 10 units respectively.
+# Build a DNN with 2 hidden layers and 10 nodes in each hidden layer.
 classifier = tf.estimator.DNNClassifier(
     feature_columns=my_feature_columns,
     # Two hidden layers of 10 nodes each.
-- 
GitLab


From 89a6ff9bff0fac47788f8cfe6693a72316e0eb5d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 17:05:21 -0800
Subject: [PATCH 368/966] Add support for the assert statement.

PiperOrigin-RevId: 183753774
---
 tensorflow/contrib/py2tf/conversion.py        |  2 +
 tensorflow/contrib/py2tf/converters/BUILD     | 23 ++++++++
 .../contrib/py2tf/converters/asserts.py       | 53 +++++++++++++++++++
 .../contrib/py2tf/converters/asserts_test.py  | 42 +++++++++++++++
 4 files changed, 120 insertions(+)
 create mode 100644 tensorflow/contrib/py2tf/converters/asserts.py
 create mode 100644 tensorflow/contrib/py2tf/converters/asserts_test.py

diff --git a/tensorflow/contrib/py2tf/conversion.py b/tensorflow/contrib/py2tf/conversion.py
index e277eadec4..67ca52d194 100644
--- a/tensorflow/contrib/py2tf/conversion.py
+++ b/tensorflow/contrib/py2tf/conversion.py
@@ -23,6 +23,7 @@ import six
 
 from tensorflow.contrib.py2tf import config
 from tensorflow.contrib.py2tf import naming
+from tensorflow.contrib.py2tf.converters import asserts
 from tensorflow.contrib.py2tf.converters import break_canonicalization
 from tensorflow.contrib.py2tf.converters import builtin_functions
 from tensorflow.contrib.py2tf.converters import call_trees
@@ -245,6 +246,7 @@ def node_to_graph(node, ctx, nocompile_decorators):
   node = _static_analysis_pass(node, ctx)
   node = decorators.transform(node, nocompile_decorators)
   node = break_canonicalization.transform(node, ctx.namer)
+  node = asserts.transform(node, ctx)
 
   # Note: sequencing continue canonicalization before for loop one avoids
   # dealing with the extra loop increment operation that the for
diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
index 4f90f94e09..b61fda3e91 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -17,6 +17,7 @@ filegroup(
 py_library(
     name = "converters",
     srcs = [
+        "asserts.py",
         "break_canonicalization.py",
         "builtin_functions.py",
         "call_trees.py",
@@ -49,6 +50,17 @@ py_library(
     ],
 )
 
+py_test(
+    name = "asserts_test",
+    srcs = ["asserts_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "break_canonicalization_test",
     srcs = ["break_canonicalization_test.py"],
@@ -71,6 +83,17 @@ py_test(
     ],
 )
 
+py_test(
+    name = "decorators_test",
+    srcs = ["decorators_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "continue_canonicalization_test",
     srcs = ["continue_canonicalization_test.py"],
diff --git a/tensorflow/contrib/py2tf/converters/asserts.py b/tensorflow/contrib/py2tf/converters/asserts.py
new file mode 100644
index 0000000000..2d6ee1d098
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/asserts.py
@@ -0,0 +1,53 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts Assert statements to their corresponding TF calls."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+
+
+class AssertsTransformer(transformer.Base):
+  """Transforms Print nodes to Call so they can be handled as functions."""
+
+  # pylint:disable=invalid-name
+
+  def visit_Assert(self, node):
+    self.generic_visit(node)
+
+    # Note: The lone tf.Assert call will be wrapped with control_dependencies
+    # by side_effect_guards.
+    template = """
+      tf.Assert(test, [tf.constant(msg)])
+    """
+
+    if node.msg is None:
+      return templates.replace(
+          template, test=node.test, msg=gast.Str('Assertion error'))
+    elif isinstance(node.msg, gast.Str):
+      return templates.replace(template, test=node.test, msg=node.msg)
+    else:
+      raise NotImplementedError('Can only convert string messages for now.')
+
+  # pylint:enable=invalid-name
+
+
+def transform(node, context):
+  return AssertsTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/asserts_test.py b/tensorflow/contrib/py2tf/converters/asserts_test.py
new file mode 100644
index 0000000000..6611f2777a
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/asserts_test.py
@@ -0,0 +1,42 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for asserts module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.converters import asserts
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.python.platform import test
+
+
+class AssertsTest(converter_test_base.TestCase):
+
+  def test_transform(self):
+
+    def test_fn(a):
+      assert a > 0
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = asserts.transform(node, self.ctx)
+
+    self.assertTrue(isinstance(node.body[0].body[0].value, gast.Call))
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From 18447358543598abc3296f5df9e7774528dc53f0 Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Mon, 29 Jan 2018 17:09:21 -0800
Subject: [PATCH 369/966] Remove HloRunner::ReadModule. Replace with methods
 which explicitly specify the HLO file format. ReadModule would automatically
 determine the format of the file (HLO text, text proto, or binary proto).
 However, automatic determination did not work well because the underlying TF
 code which read protos from files unconditionally emitted error messages to
 stderr in case of parsing error resulting in confusing and irrelevant error
 messages to the user.

PiperOrigin-RevId: 183754369
---
 tensorflow/compiler/xla/service/hlo_runner.cc | 54 +++++++++----------
 tensorflow/compiler/xla/service/hlo_runner.h  | 16 ++----
 .../compiler/xla/tests/hlo_test_base.cc       |  4 +-
 3 files changed, 32 insertions(+), 42 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index e281538848..41b079eb79 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -47,22 +47,11 @@ HloRunner::CreateModuleFromString(const tensorflow::StringPiece hlo_string,
   return tools::Parse(hlo_string, config);
 }
 
-/*static*/ StatusOr<std::unique_ptr<HloModule>>
-HloRunner::ReadModuleFromHloProtoFile(const std::string& filename,
-                                      const DebugOptions& debug_options) {
-  HloProto proto;
-
-  const Status s =
-      tensorflow::ReadBinaryProto(tensorflow::Env::Default(), filename, &proto);
-
-  if (!s.ok()) {
-    const Status s2 =
-        tensorflow::ReadTextProto(tensorflow::Env::Default(), filename, &proto);
-    if (!s2.ok()) {
-      return Status(s2.code(), s.error_message() + "\n" + s2.error_message());
-    }
-  }
+namespace {
 
+// Creates an HloModule from the given proto.
+StatusOr<std::unique_ptr<HloModule>> HloProtoToModule(
+    const HloProto& proto, const DebugOptions& debug_options) {
   TF_ASSIGN_OR_RETURN(
       HloModuleConfig config,
       HloModule::CreateModuleConfigFromProto(proto.hlo_module()));
@@ -72,9 +61,29 @@ HloRunner::ReadModuleFromHloProtoFile(const std::string& filename,
   return std::move(module);
 }
 
+}  // namespace
+
 /*static*/ StatusOr<std::unique_ptr<HloModule>>
-HloRunner::ReadModuleFromHloTextDumpFile(const std::string& filename,
+HloRunner::ReadModuleFromBinaryProtoFile(const std::string& filename,
                                          const DebugOptions& debug_options) {
+  HloProto proto;
+  TF_RETURN_IF_ERROR(tensorflow::ReadBinaryProto(tensorflow::Env::Default(),
+                                                 filename, &proto));
+  return HloProtoToModule(proto, debug_options);
+}
+
+/*static*/ StatusOr<std::unique_ptr<HloModule>>
+HloRunner::ReadModuleFromTextProtoFile(const std::string& filename,
+                                       const DebugOptions& debug_options) {
+  HloProto proto;
+  TF_RETURN_IF_ERROR(
+      tensorflow::ReadTextProto(tensorflow::Env::Default(), filename, &proto));
+  return HloProtoToModule(proto, debug_options);
+}
+
+/*static*/ StatusOr<std::unique_ptr<HloModule>>
+HloRunner::ReadModuleFromHloTextFile(const std::string& filename,
+                                     const DebugOptions& debug_options) {
   string hlo_string;
   TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                   filename, &hlo_string));
@@ -83,19 +92,6 @@ HloRunner::ReadModuleFromHloTextDumpFile(const std::string& filename,
   return tools::Parse(hlo_string, config);
 }
 
-/*static*/ StatusOr<std::unique_ptr<HloModule>> HloRunner::ReadModule(
-    const std::string& filename, const DebugOptions& debug_options) {
-  auto module = HloRunner::ReadModuleFromHloProtoFile(filename, debug_options);
-  if (module.ok()) {
-    return module;
-  }
-  const std::string e = module.status().error_message();
-  module = HloRunner::ReadModuleFromHloTextDumpFile(filename, debug_options);
-  return module.ok() ? std::move(module)
-                     : Status(module.status().code(),
-                              e + "\n" + module.status().error_message());
-}
-
 // Define this in .cc file to avoid having to include eigen or forward declare
 // these types in the header.
 struct HloRunner::EigenThreadPoolWrapper {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index d4b221fb52..cbaebc68be 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -52,21 +52,15 @@ class HloRunner {
       const DebugOptions& debug_options);
 
   // Reads the proto file in xla.HloProto format, creates and returns the
-  // HloModule. Will try to parse the filename as binary proto, then try as
-  // text proto if that fails.
-  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloProtoFile(
+  // HloModule.
+  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromBinaryProtoFile(
+      const std::string& filename, const DebugOptions& debug_options);
+  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromTextProtoFile(
       const std::string& filename, const DebugOptions& debug_options);
 
   // Reads the hlo text dump file in HloModule::ToString format, creates and
   // returns the HloModule.
-  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloTextDumpFile(
-      const std::string& filename, const DebugOptions& debug_options);
-
-  // Tries to parse the filename specified first as binary proto format, then
-  // as a textual proto format, then textual IR, then gives up if both fail.
-  // ReadModuleFromHloProtoFile or ReadModuleFromHloTextDumpFile should be used
-  // explicitly when you know the format, this if you don't.
-  static StatusOr<std::unique_ptr<HloModule>> ReadModule(
+  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloTextFile(
       const std::string& filename, const DebugOptions& debug_options);
 
   // Executes the given module with given literals as input and returns the
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 7c1a993b47..9f5806c5e1 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -230,7 +230,7 @@ template <typename LiteralPtr>
     const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
     const std::function<void(HloModule*)>& reference_preprocessor) {
   auto module_or_status =
-      HloRunner::ReadModule(filename, GetDebugOptionsForTest());
+      HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest());
   if (!module_or_status.ok()) {
     return ::testing::AssertionFailure()
            << "failed reading hlo module from file";
@@ -258,7 +258,7 @@ template <typename LiteralPtr>
     const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
     const std::function<void(HloModule*)>& reference_preprocessor) {
   auto module_or_status =
-      HloRunner::ReadModule(filename, GetDebugOptionsForTest());
+      HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest());
   if (!module_or_status.ok()) {
     return ::testing::AssertionFailure()
            << "failed reading hlo module from file";
-- 
GitLab


From 0c5e9eb9ea35f560227ee10c5fdb7cd6745f7674 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Mon, 29 Jan 2018 17:18:37 -0800
Subject: [PATCH 370/966] Update Readme and add GOOGLE_CUDA conditionals back

---
 tensorflow/contrib/tensorrt/BUILD             | 1 -
 tensorflow/contrib/tensorrt/README.md         | 4 +---
 tensorflow/contrib/tensorrt/log/trt_logger.cc | 9 +++++----
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 38d30f9570..2d4bccd267 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -215,7 +215,6 @@ tf_cuda_library(
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler/costs:graph_properties",
-        #"//tensorflow/core/grappler/clusters:single_machine",
     ],
 )
 
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index 61b348fc60..b362050983 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -15,11 +15,9 @@ configure script should find the necessary components from the system
 automatically. If installed from tar packages, user has to set path to
 location where the library is installed during configuration.
 
-In order to enable TensorRT support, user has to add `--config=tensorrt` to
-the build flags during the compilation such as
 
 ```
-bazel build --config=cuda --config=opt --config=tensorrt //tensorflow/tools/pip_package:build_pip_package
+bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
 bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
 ```
 
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index 35f96511f1..deeba43474 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-
 #include "tensorflow/core/platform/logging.h"
 
 // Use TF logging for TensorRT informations
-
-
 namespace tensorflow {
 namespace tensorrt {
 
@@ -50,6 +49,8 @@ void Logger::log(Severity severity, const char* msg) {
     }
   }
 }
-
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif
+#endif
-- 
GitLab


From aca9f4257f064d381be171a7ff2ee114002a8fab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 17:25:45 -0800
Subject: [PATCH 371/966] [TF:XLA] Complete the TriangularSolve implementation.

The previous version only handled the case of left_side=false, lower=true, transpose_a=true, conjugate_a=false. This updated implementation handles all 16 combinations of those boolean options, and also instantiates the corresponding MatrixTriangularSolve TF op.

To improve compile times and potentially FLOP performance, when lower=true the within-block subroutine used on the diagonal blocks is now a left-looking variant implemented with an XLA HLO While loop.

This update also slightly generalizes BatchDot in tf2xla/lib to accept separate conjugation arguments (in addition to transpose arguments).

PiperOrigin-RevId: 183756639
---
 tensorflow/compiler/tests/BUILD               |  15 +
 .../tests/matrix_triangular_solve_op_test.py  | 130 +++++
 .../tf2xla/g3doc/cpu_supported_ops.md         |  24 +-
 .../tf2xla/g3doc/gpu_supported_ops.md         |  24 +-
 tensorflow/compiler/tf2xla/kernels/BUILD      |   2 +
 .../tf2xla/kernels/batch_matmul_op.cc         |   5 +-
 .../compiler/tf2xla/kernels/cholesky_op.cc    |   2 +-
 .../kernels/matrix_triangular_solve_op.cc     |  50 ++
 tensorflow/compiler/tf2xla/lib/BUILD          |   2 +
 tensorflow/compiler/tf2xla/lib/batch_dot.cc   |   9 +-
 tensorflow/compiler/tf2xla/lib/batch_dot.h    |  10 +-
 tensorflow/compiler/tf2xla/lib/cholesky.cc    |  14 +-
 tensorflow/compiler/tf2xla/lib/cholesky.h     |   1 +
 .../compiler/tf2xla/lib/triangular_solve.cc   | 469 ++++++++++++++++--
 .../compiler/tf2xla/lib/triangular_solve.h    |  51 +-
 .../tf2xla/lib/triangular_solve_test.cc       | 324 +++++++++++-
 tensorflow/compiler/tf2xla/lib/util.cc        |  11 +
 tensorflow/compiler/tf2xla/lib/util.h         |   4 +
 18 files changed, 1055 insertions(+), 92 deletions(-)
 create mode 100644 tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
 create mode 100644 tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index a3a82df9ad..9e64f3e9a3 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -144,6 +144,21 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "matrix_triangular_solve_op_test",
+    size = "small",
+    srcs = ["matrix_triangular_solve_op_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_xla_py_test(
     name = "clustering_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
new file mode 100644
index 0000000000..cccb7f5789
--- /dev/null
+++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
@@ -0,0 +1,130 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.MatrixTriangularSolve."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def MakePlaceholder(x):
+  return array_ops.placeholder(dtypes.as_dtype(x.dtype), shape=x.shape)
+
+
+class MatrixTriangularSolveOpTest(XLATestCase):
+
+  def _VerifyTriangularSolveBase(self, sess, placeholder_a, placeholder_ca,
+                                 placeholder_b, a, clean_a, b, verification,
+                                 atol):
+    feed_dict = {placeholder_a: a, placeholder_ca: clean_a, placeholder_b: b}
+    verification_np = sess.run(verification, feed_dict)
+    self.assertAllClose(b, verification_np, atol=atol)
+
+  def _VerifyTriangularSolve(self, a, b, lower, adjoint, atol):
+    clean_a = np.tril(a) if lower else np.triu(a)
+    with self.test_session() as sess:
+      placeholder_a = MakePlaceholder(a)
+      placeholder_ca = MakePlaceholder(clean_a)
+      placeholder_b = MakePlaceholder(b)
+      with self.test_scope():
+        x = linalg_ops.matrix_triangular_solve(
+            placeholder_a, placeholder_b, lower=lower, adjoint=adjoint)
+      verification = math_ops.matmul(placeholder_ca, x, adjoint_a=adjoint)
+      self._VerifyTriangularSolveBase(sess, placeholder_a, placeholder_ca,
+                                      placeholder_b, a, clean_a, b,
+                                      verification, atol)
+
+  def _VerifyTriangularSolveCombo(self, a, b, atol=1e-4):
+    transp = lambda x: np.swapaxes(x, -1, -2)
+    for lower, adjoint in itertools.product([True, False], repeat=2):
+      self._VerifyTriangularSolve(
+          a if lower else transp(a), b, lower, adjoint, atol)
+
+  def testBasic(self):
+    rng = np.random.RandomState(0)
+    a = np.tril(rng.randn(5, 5))
+    b = rng.randn(5, 7)
+    for dtype in self.float_types:
+      self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype))
+
+  def testBasicNotActuallyTriangular(self):
+    rng = np.random.RandomState(0)
+    a = rng.randn(5, 5)  # the `a` matrix is not lower-triangular
+    b = rng.randn(5, 7)
+    for dtype in self.float_types:
+      self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype))
+
+  def testBasicComplexDtypes(self):
+    rng = np.random.RandomState(0)
+    a = np.tril(rng.randn(5, 5) + rng.randn(5, 5) * 1j)
+    b = rng.randn(5, 7) + rng.randn(5, 7) * 1j
+    for dtype in self.complex_types:
+      self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype))
+
+  def testBatch(self):
+    rng = np.random.RandomState(0)
+    shapes = [((4, 3, 3), (4, 3, 5)), ((1, 2, 2), (1, 2, 1)),
+              ((1, 1, 1), (1, 1, 2)), ((2, 3, 4, 4), (2, 3, 4, 1))]
+    tuples = itertools.product(self.float_types, shapes)
+    for dtype, (a_shape, b_shape) in tuples:
+      n = a_shape[-1]
+      a = np.tril(rng.rand(*a_shape) - 0.5) / (2.0 * n) + np.eye(n)
+      b = rng.randn(*b_shape)
+      self._VerifyTriangularSolveCombo(
+          a.astype(dtype), b.astype(dtype), atol=1e-3)
+
+  def testLarge(self):
+    n = 1024
+    rng = np.random.RandomState(0)
+    a = np.tril(rng.rand(n, n) - 0.5) / (2.0 * n) + np.eye(n)
+    b = rng.randn(n, n)
+    self._VerifyTriangularSolve(
+        a.astype(np.float32), b.astype(np.float32), True, False, 1e-4)
+
+  def testNonSquareCoefficientMatrix(self):
+    rng = np.random.RandomState(0)
+    for dtype in self.float_types:
+      a = rng.randn(3, 4).astype(dtype)
+      b = rng.randn(4, 4).astype(dtype)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(a, b)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(a, b)
+
+  def testWrongDimensions(self):
+    randn = np.random.RandomState(0).randn
+    for dtype in self.float_types:
+      lhs = constant_op.constant(randn(3, 3), dtype=dtype)
+      rhs = constant_op.constant(randn(4, 3), dtype=dtype)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(lhs, rhs)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(lhs, rhs)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
index 82b3b46a2f..44f7db5ffd 100644
--- a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
+++ b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
@@ -6,6 +6,9 @@ Operator                              | Type Constraint
 `Acosh`                               | `T={complex64,double,float}`
 `Add`                                 | `T={complex64,double,float,int32,int64}`
 `AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`AdjustContrastv2`                    |
+`AdjustHue`                           |
+`AdjustSaturation`                    |
 `All`                                 | `Tidx={int32,int64}`
 `Angle`                               | `Tout={double,float}`<br>`T={complex64}`
 `Any`                                 | `Tidx={int32,int64}`
@@ -34,7 +37,7 @@ Operator                              | Type Constraint
 `BroadcastGradientArgs`               | `T={int32,int64}`
 `Cast`                                | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Ceil`                                | `T={double,float}`
-`Cholesky`                            | `T={complex64,double,float}`
+`Cholesky`                            | `T={double,float}`
 `Complex`                             | `Tout={complex64}`<br>`T={double,float}`
 `ComplexAbs`                          | `Tout={double,float}`<br>`T={complex64}`
 `Concat`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
@@ -68,7 +71,10 @@ Operator                              | Type Constraint
 `Exp`                                 | `T={complex64,double,float}`
 `ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Expm1`                               | `T={complex64,double,float}`
-`Fill`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`FFT`                                 |
+`FFT2D`                               |
+`FFT3D`                               |
+`Fill`                                | `index_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Floor`                               | `T={double,float}`
 `FloorDiv`                            | `T={complex64,double,float,int32,int64}`
 `FloorMod`                            | `T={double,float,int32,int64}`
@@ -80,6 +86,13 @@ Operator                              | Type Constraint
 `GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
 `GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
+`HSVToRGB`                            | `T={double,float}`
+`IFFT`                                |
+`IFFT2D`                              |
+`IFFT3D`                              |
+`IRFFT`                               |
+`IRFFT2D`                             |
+`IRFFT3D`                             |
 `Identity`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `IdentityN`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Imag`                                | `Tout={double,float}`<br>`T={complex64}`
@@ -105,6 +118,7 @@ Operator                              | Type Constraint
 `MatMul`                              | `T={complex64,double,float}`
 `MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixTriangularSolve`               | `T={complex64,double,float}`
 `Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `MaxPool`                             | `T={double,float,int32,int64}`
 `MaxPool3D`                           | `T={float}`
@@ -131,6 +145,10 @@ Operator                              | Type Constraint
 `PreventGradient`                     | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Prod`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `QuantizeAndDequantizeV2`             | `T={double,float}`
+`RFFT`                                |
+`RFFT2D`                              |
+`RFFT3D`                              |
+`RGBToHSV`                            | `T={double,float}`
 `RandomStandardNormal`                | `dtype={float}`
 `RandomUniform`                       | `T={int32,int64}`<br>`dtype={double,float}`
 `RandomUniformInt`                    | `T={int32,int64}`<br>`Tout={int32,int64}`
@@ -146,6 +164,8 @@ Operator                              | Type Constraint
 `Relu6Grad`                           | `T={double,float,int32,int64,uint32,uint64}`
 `ReluGrad`                            | `T={double,float,int32,int64,uint32,uint64}`
 `Reshape`                             | `Tshape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ResizeBilinear`                      | `T={double,float,int32,int64}`
+`ResizeBilinearGrad`                  | `T={double,float}`
 `ResourceApplyAdagrad`                | `T={double,float}`
 `ResourceApplyAdam`                   | `T={double,float}`
 `ResourceApplyFtrl`                   | `T={double,float}`
diff --git a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
index d4b7621ad2..eb1f891125 100644
--- a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
+++ b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
@@ -6,6 +6,9 @@ Operator                              | Type Constraint
 `Acosh`                               | `T={complex64,double,float}`
 `Add`                                 | `T={complex64,double,float,int32,int64}`
 `AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`AdjustContrastv2`                    |
+`AdjustHue`                           |
+`AdjustSaturation`                    |
 `All`                                 | `Tidx={int32,int64}`
 `Angle`                               | `Tout={double,float}`<br>`T={complex64}`
 `Any`                                 | `Tidx={int32,int64}`
@@ -34,7 +37,7 @@ Operator                              | Type Constraint
 `BroadcastGradientArgs`               | `T={int32,int64}`
 `Cast`                                | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Ceil`                                | `T={double,float}`
-`Cholesky`                            | `T={complex64,double,float}`
+`Cholesky`                            | `T={double,float}`
 `Complex`                             | `Tout={complex64}`<br>`T={double,float}`
 `ComplexAbs`                          | `Tout={double,float}`<br>`T={complex64}`
 `Concat`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
@@ -68,7 +71,10 @@ Operator                              | Type Constraint
 `Exp`                                 | `T={complex64,double,float}`
 `ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Expm1`                               | `T={complex64,double,float}`
-`Fill`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`FFT`                                 |
+`FFT2D`                               |
+`FFT3D`                               |
+`Fill`                                | `index_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Floor`                               | `T={double,float}`
 `FloorDiv`                            | `T={complex64,double,float,int32,int64}`
 `FloorMod`                            | `T={double,float,int32,int64}`
@@ -80,6 +86,13 @@ Operator                              | Type Constraint
 `GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
 `GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
+`HSVToRGB`                            | `T={double,float}`
+`IFFT`                                |
+`IFFT2D`                              |
+`IFFT3D`                              |
+`IRFFT`                               |
+`IRFFT2D`                             |
+`IRFFT3D`                             |
 `Identity`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `IdentityN`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Imag`                                | `Tout={double,float}`<br>`T={complex64}`
@@ -105,6 +118,7 @@ Operator                              | Type Constraint
 `MatMul`                              | `T={complex64,double,float}`
 `MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixTriangularSolve`               | `T={complex64,double,float}`
 `Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `MaxPool`                             | `T={double,float,int32,int64}`
 `MaxPool3D`                           | `T={float}`
@@ -131,6 +145,10 @@ Operator                              | Type Constraint
 `PreventGradient`                     | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Prod`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `QuantizeAndDequantizeV2`             | `T={double,float}`
+`RFFT`                                |
+`RFFT2D`                              |
+`RFFT3D`                              |
+`RGBToHSV`                            | `T={double,float}`
 `Range`                               | `Tidx={double,float,int32,int64}`
 `Rank`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `ReadVariableOp`                      | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
@@ -143,6 +161,8 @@ Operator                              | Type Constraint
 `Relu6Grad`                           | `T={double,float,int32,int64,uint32,uint64}`
 `ReluGrad`                            | `T={double,float,int32,int64,uint32,uint64}`
 `Reshape`                             | `Tshape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ResizeBilinear`                      | `T={double,float,int32,int64}`
+`ResizeBilinearGrad`                  | `T={double,float}`
 `ResourceApplyAdagrad`                | `T={double,float}`
 `ResourceApplyAdam`                   | `T={double,float}`
 `ResourceApplyFtrl`                   | `T={double,float}`
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index a7e00cb12f..84fa43f4fb 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -43,6 +43,7 @@ tf_kernel_library(
         "l2loss_op.cc",
         "lrn_ops.cc",
         "matmul_op.cc",
+        "matrix_triangular_solve_op.cc",
         "mirror_pad_op.cc",
         "no_op.cc",
         "one_hot_op.cc",
@@ -93,6 +94,7 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/lib:batch_dot",
         "//tensorflow/compiler/tf2xla/lib:cholesky",
+        "//tensorflow/compiler/tf2xla/lib:triangular_solve",
         "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
         "//tensorflow/compiler/xla:array4d",
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
index a015b8e0e8..b0ba25b998 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
@@ -28,8 +28,9 @@ class BatchMatMulOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result =
-        BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1), adj_x_, adj_y_);
+    auto result = BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1),
+                           /*transpose_x=*/adj_x_, /*transpose_y=*/adj_y_,
+                           /*conjugate_x=*/adj_x_, /*conjugate_y=*/adj_y_);
     OP_REQUIRES_OK(ctx, result.status());
     ctx->SetOutput(0, result.ValueOrDie());
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
index 87d858f763..fe6651793d 100644
--- a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
@@ -33,7 +33,7 @@ class CholeskyOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Cholesky"), CholeskyOp);
+REGISTER_XLA_OP(Name("Cholesky").TypeConstraint("T", kFloatTypes), CholeskyOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
new file mode 100644
index 0000000000..eaed931464
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+namespace {
+
+class MatrixTriangularSolveOp : public XlaOpKernel {
+ public:
+  explicit MatrixTriangularSolveOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("lower", &lower_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint", &adjoint_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto result = TriangularSolve(
+        ctx->builder(), ctx->Input(0), ctx->Input(1), /*left_side=*/true,
+        /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_);
+    if (!result.ok()) {
+      ctx->SetStatus(result.status());
+      return;
+    }
+    ctx->SetOutput(0, result.ValueOrDie());
+  }
+
+ private:
+  bool lower_;
+  bool adjoint_;
+};
+
+REGISTER_XLA_OP(Name("MatrixTriangularSolve"), MatrixTriangularSolveOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 21ad21f737..d184f59e01 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -60,6 +60,8 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
index 9b0e617447..798f0fa780 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
@@ -25,11 +25,10 @@ limitations under the License.
 
 namespace tensorflow {
 
-// The current implementation simply unrolls the computation along the batch
-// dimension.
 xla::StatusOr<xla::ComputationDataHandle> BatchDot(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle x,
-    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y) {
+    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y,
+    bool conjugate_x, bool conjugate_y) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> x_shape,
                       builder->GetShape(x));
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> y_shape,
@@ -89,10 +88,10 @@ xla::StatusOr<xla::ComputationDataHandle> BatchDot(
         dimensions);
   }
 
-  if (x_shape->element_type() == xla::C64 && transpose_x) {
+  if (x_shape->element_type() == xla::C64 && conjugate_x) {
     x = builder->Conj(x);
   }
-  if (y_shape->element_type() == xla::C64 && transpose_y) {
+  if (y_shape->element_type() == xla::C64 && conjugate_y) {
     y = builder->Conj(y);
   }
 
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h
index b46bc7417d..b230e885f1 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.h
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h
@@ -27,7 +27,10 @@ namespace tensorflow {
 // viewed as an element of a batch), and arranges the individual results
 // in a single output tensor of the same batch size. Each of the
 // individual slices can optionally be transposed before multiplication by
-// setting the `transpose_x` or `transpose_y` flag to `true`.
+// setting the `transpose_x` or `transpose_y` flag to `true`. Similarly, each
+// can be elementwise-complex-conjugated by setting the `conjugate_x` or
+// `conjugate_y` flag to `true`. To apply a Hermitian adjoint to `x`, set both
+// `transpose_x` and `conjugate_x` to `true`, and analogously for `y`.
 //
 // The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
 // and `[..., r_y, c_y]`.
@@ -40,11 +43,10 @@ namespace tensorflow {
 // It is computed as:
 //
 //     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-// TODO(phawkins): add an option to take the complex conjugate of the LHS or
-// RHS.
 xla::StatusOr<xla::ComputationDataHandle> BatchDot(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle x,
-    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y);
+    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y,
+    bool conjugate_x = false, bool conjugate_y = false);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index b3cc489adf..e795701181 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -71,11 +71,14 @@ xla::StatusOr<xla::ComputationDataHandle> CholeskyUnblocked(
                           SliceInMinorDims(builder, l, {j + 1, 0}, {n, j}));
       TF_ASSIGN_OR_RETURN(auto r_squared,
                           BatchDot(builder, r, r, /*transpose_x=*/false,
-                                   /*transpose_y=*/true));
+                                   /*transpose_y=*/true, /*conjugate_x=*/false,
+                                   /*conjugate_y=*/false));
       new_d_squared = builder->Sub(new_d_squared, r_squared);
 
       TF_ASSIGN_OR_RETURN(br, BatchDot(builder, b, r, /*transpose_x=*/false,
-                                       /*transpose_y=*/true));
+                                       /*transpose_y=*/true,
+                                       /*conjugate_x=*/false,
+                                       /*conjugate_y=*/false));
     }
     auto new_d_inv = builder->Pow(
         new_d_squared, FloatLiteral(builder, shape->element_type(), -0.5));
@@ -134,7 +137,8 @@ xla::StatusOr<xla::ComputationDataHandle> Cholesky(
                           SliceInMinorDims(builder, l, {i, 0}, {i + k, i}));
       TF_ASSIGN_OR_RETURN(auto delta,
                           BatchDot(builder, lhs, rhs, /*transpose_x=*/false,
-                                   /*transpose_y=*/true));
+                                   /*transpose_y=*/true, /*conjugate_x=*/false,
+                                   /*conjugate_y=*/false));
       TF_ASSIGN_OR_RETURN(auto before,
                           SliceInMinorDims(builder, a, {i, i}, {n, i + k}));
       TF_ASSIGN_OR_RETURN(
@@ -155,6 +159,10 @@ xla::StatusOr<xla::ComputationDataHandle> Cholesky(
                           SliceInMinorDims(builder, a, {i + k, i}, {n, i + k}));
       TF_ASSIGN_OR_RETURN(auto update,
                           TriangularSolve(builder, factorized, panel,
+                                          /*left_side=*/false,
+                                          /*lower=*/true,
+                                          /*transpose_a=*/true,
+                                          /*conjugate_a=*/false,
                                           /*block_size=*/8));
       TF_ASSIGN_OR_RETURN(
           l, UpdateSliceInMinorDims(builder, l, update, {i + k, i}));
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h
index 2bead7359b..e083a383be 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.h
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.h
@@ -29,6 +29,7 @@ namespace tensorflow {
 // the block size to use.
 // TODO(phawkins): check for negative values on the diagonal and return an
 // error, instead of silently yielding NaNs.
+// TODO(mattjj): handle the complex Hermitian case
 xla::StatusOr<xla::ComputationDataHandle> Cholesky(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle a,
     int64 block_size = 256);
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
index 579944c3a3..5f0445dd44 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -24,13 +24,15 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 
 xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
-    xla::ComputationDataHandle b, int64 block_size) {
+    xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a,
+    bool conjugate_a, int64 block_size) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> a_shape,
                       builder->GetShape(a));
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> b_shape,
@@ -60,14 +62,15 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     batch_dimensions.push_back(a_size);
   }
 
-  const int64 n = xla::ShapeUtil::GetDimension(*a_shape, -1);
-  const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2);
-  if (n != xla::ShapeUtil::GetDimension(*a_shape, -2)) {
+  if (xla::ShapeUtil::GetDimension(*a_shape, -1) !=
+      xla::ShapeUtil::GetDimension(*a_shape, -2)) {
     return errors::InvalidArgument(
         "The 'a' arguments to TriangularSolve must be square matrices: ",
         xla::ShapeUtil::HumanString(*a_shape));
   }
-  if (n != xla::ShapeUtil::GetDimension(*b_shape, -1)) {
+  const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2);
+  const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1);
+  if ((left_side ? m : n) != xla::ShapeUtil::GetDimension(*a_shape, -1)) {
     return errors::InvalidArgument(
         "Arguments to TriangularSolve have incompatible matrix shapes: ",
         xla::ShapeUtil::HumanString(*a_shape), " vs ",
@@ -89,6 +92,14 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     return output;
   };
 
+  // Applies a complex conjugation operation if `a` is complex and `conjugate_a`
+  // is true, otherwise returns its argument.
+  auto maybe_conj = [&](xla::ComputationBuilder* builder,
+                        xla::ComputationDataHandle x) {
+    auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a;
+    return perform_conj ? builder->Conj(x) : x;
+  };
+
   std::map<int, xla::Computation> base_computations;
   auto get_base_triangular_solve =
       [&](int k) -> xla::StatusOr<xla::Computation*> {
@@ -103,19 +114,35 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
                                                    prepend_batch_dims({k, k})),
                          "a");
 
+      std::array<int64, 2> b_lastd;
+      if (left_side) {
+        b_lastd = {k, n};
+      } else {
+        b_lastd = {m, k};
+      }
       auto b_param =
           sub->Parameter(1,
                          xla::ShapeUtil::MakeShape(b_shape->element_type(),
-                                                   prepend_batch_dims({m, k})),
+                                                   prepend_batch_dims(b_lastd)),
                          "b");
 
-      // TODO(phawkins): it might make sense to use a while loop here, rather
-      // than unrolling.
-      // TODO(phawkins): the left-looking variant of the algorithm might be more
-      // efficient at block size 1.
-      TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param,
-                                         /*block_size=*/1)
-                             .status());
+      // We use a left-looking subroutine on the block diagonal in some common
+      // cases, while falling back to a recursive call in unsupported cases. The
+      // left-looking subroutine is written with a While loop and so yields much
+      // faster compile times. Moreover, the left-looking variant can give
+      // higher performance on smaller (sub)problems.
+      if (left_side && lower) {
+        TF_RETURN_IF_ERROR(TriangularSolveLeftLooking(sub.get(), a_param,
+                                                      b_param, transpose_a,
+                                                      conjugate_a)
+                               .status());
+      } else {
+        TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param,
+                                           left_side, lower, transpose_a,
+                                           conjugate_a,
+                                           /*block_size=*/1)
+                               .status());
+      }
 
       TF_ASSIGN_OR_RETURN(computation, sub->Build());
     }
@@ -129,47 +156,397 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
   // Goto, Kazushige, and Robert Van De Geijn. "High-performance implementation
   // of the level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1
   // (2008): 4.
-  for (int64 i = 0; i < n; i += block_size) {
-    int64 k = std::min(block_size, n - i);
 
-    // if k > 1:
-    //   output[..., :, i:i+k] = triangular_solve(
-    //       a[..., i:i+k, ..., i:i+k], b[..., :, i:i+k], side='Right',
-    //       kind='Lower', transpose=True, block_size=1)
-    // else:
-    //   output[..., :, i] = b[..., :, i] / a[..., i, i]
+  // In the code comments below, T = lambda x: np.swapaxes(x, -1, -2) if
+  // conjugate_a is False, or T = lambda x: np.conj(np.swapaxes(x, -1, -2)) if
+  // conjugate_a is True.
+
+  if (!left_side && lower == transpose_a) {
+    // for i in range(0, a.shape[-1], block_size):
+    for (int64 i = 0; i < n; i += block_size) {
+      int64 k = std::min(block_size, n - i);
+
+      // output[..., :, i:i+k] = triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {0, i}, {m, i + k}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {0, i}));
+
+      // if i + k < a.shape[-1]:
+      //   a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., :, i+k:] -= np.matmul(output[..., :, i:i+k], a_slice_2)
+      if (i + k < n) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {n, i + k}));
+        } else {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, n}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update,
+                            BatchDot(builder, update, a_slice_2,
+                                     /*transpose_x=*/false,
+                                     /*transpose_y=*/transpose_a,
+                                     /*conjugate_x=*/false,
+                                     /*conjugate_y=*/conjugate_a));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {0, i + k}, {m, n}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k}));
+      }
+    }
+
+  } else if (left_side && lower != transpose_a) {
+    // for i in range(0, a.shape[-1], block_size):
+    for (int64 i = 0; i < m; i += block_size) {
+      int64 k = std::min(block_size, m - i);
+
+      // output[..., i:i+k, :] = triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {i, 0}, {i + k, n}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {i, 0}));
+
+      // if i + k < a.shape[-1]:
+      //   a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., i+k:, :] -= np.matmul(a_slice_2, output[..., i:i+k, :])
+      if (i + k < m) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {m, i + k}));
+        } else {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, m}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update,
+                                                    /*transpose_x=*/transpose_a,
+                                                    /*transpose_y=*/false,
+                                                    /*conjugate_x=*/conjugate_a,
+                                                    /*conjugate_y=*/false));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {i + k, 0}, {m, n}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {i + k, 0}));
+      }
+    }
+  } else if (!left_side && lower != transpose_a) {
+    // for i in reversed(range(0, a.shape[-1], block_size)):
+    const int64 last_blk_ix = xla::RoundUpToNearest(n, block_size) - block_size;
+    for (int64 i = last_blk_ix; i >= 0; i -= block_size) {
+      int64 k = std::min(block_size, n - i);
+
+      // output[..., :, i:i+k] triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {0, i}, {m, i + k}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {0, i}));
+
+      // if i - k >= 0:
+      //   a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., :, :i] -= np.matmul(out[..., :, i:i+k], a_slice_2)
+      if (i - k >= 0) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {i, 0}, {i + k, i}));
+        } else {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {0, i}, {i, i + k}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update,
+                            BatchDot(builder, update, a_slice_2,
+                                     /*transpose_x=*/false,
+                                     /*transpose_y=*/transpose_a,
+                                     /*conjugate_x=*/false,
+                                     /*conjugate_y=*/conjugate_a));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {0, 0}, {m, i}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0}));
+      }
+    }
+  } else {  // left_side && lower == transpose_a
+    // for i in reversed(range(0, a.shape[-1], block_size)):
+    const int64 last_blk_ix = xla::RoundUpToNearest(m, block_size) - block_size;
+    for (int64 i = last_blk_ix; i >= 0; i -= block_size) {
+      int64 k = std::min(block_size, m - i);
+
+      // output[..., i:i+k, :] triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {i, 0}, {i + k, n}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {i, 0}));
+
+      // if i - k >= 0:
+      //   a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., :i, :] -= np.matmul(a_slice_2, out[..., i:i+k, :])
+      if (i - k >= 0) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {i, 0}, {i + k, i}));
+        } else {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {0, i}, {i, i + k}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update,
+                                                    /*transpose_x=*/transpose_a,
+                                                    /*transpose_y=*/false,
+                                                    /*conjugate_x=*/conjugate_a,
+                                                    /*conjugate_y=*/false));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {0, 0}, {i, n}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0}));
+      }
+    }
+  }
+
+  return output;
+}
+
+xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
+    const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> a_shape,
+                      builder->GetShape(a));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> b_shape,
+                      builder->GetShape(b));
+  const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2);
+  const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1);
+  const int64 ndims = xla::ShapeUtil::Rank(*a_shape);
+
+  std::vector<int64> batch_dimensions;
+  for (int i = 0; i < ndims - 2; ++i) {
+    int64 a_size = a_shape->dimensions(i);
+    batch_dimensions.push_back(a_size);
+  }
+
+  auto prepend_batch_dims = [&](std::array<int64, 2> indices) {
+    std::vector<int64> output(ndims);
+    std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin());
+    std::copy(indices.begin(), indices.end(),
+              output.begin() + batch_dimensions.size());
+    return output;
+  };
+
+  auto maybe_conj = [&](xla::ComputationBuilder* builder,
+                        xla::ComputationDataHandle x) {
+    auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a;
+    return perform_conj ? builder->Conj(x) : x;
+  };
+
+  // The main computation is performed in a While loop.
+
+  // Allocate the output and set its first or last row,
+  // output = np.zeros_like(b)
+  // if transpose_a:
+  //   output[..., m-1:, :] = b[..., m-1:, :] / a[..., m-1:, m-1:]
+  // else:
+  //   output[..., :1, :] = b[..., :1, :] / a[..., :1, :1]
+  xla::ComputationDataHandle output = Zeros(builder, *b_shape);
+  {
+    auto i = transpose_a ? m - 1 : 0;
     TF_ASSIGN_OR_RETURN(auto a_slice,
-                        SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+                        SliceInMinorDims(builder, a, {i, i}, {i + 1, i + 1}));
     TF_ASSIGN_OR_RETURN(auto b_slice,
-                        SliceInMinorDims(builder, b, {0, i}, {m, i + k}));
-    xla::ComputationDataHandle update;
-    if (k > 1) {
-      TF_ASSIGN_OR_RETURN(xla::Computation * solve,
-                          get_base_triangular_solve(k));
-      update = builder->Call(*solve, {a_slice, b_slice});
+                        SliceInMinorDims(builder, b, {i, 0}, {i + 1, n}));
+    auto update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+    TF_ASSIGN_OR_RETURN(
+        output, UpdateSliceInMinorDims(builder, output, update, {i, 0}));
+  }
+
+  // Construct the initial loop carry tuple,
+  // if transpose_a:
+  //   init = (m-2, output, a, b)
+  // else:
+  //   init = (1, output, a, b)
+  std::vector<xla::Shape> tuple_shapes({
+      // The loop iteration counter is a scalar, incremented each iteration.
+      xla::ShapeUtil::MakeShape(xla::S32, {}),
+      // The output has the shape of b, with one row updated each iteration.
+      xla::ShapeUtil::MakeShape(b_shape->element_type(), b_shape->dimensions()),
+      // The coefficient matrix a is a loop invariant.
+      xla::ShapeUtil::MakeShape(a_shape->element_type(), a_shape->dimensions()),
+      // The right-hand-side matrix b is a loop invariant.
+      xla::ShapeUtil::MakeShape(b_shape->element_type(), b_shape->dimensions()),
+  });
+  xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
+  auto init_i = builder->ConstantR0<int32>(transpose_a ? m - 2 : 1);
+  auto init = builder->Tuple({init_i, output, a, b});
+
+  // Construct the loop condition function,
+  // def cond_fun(loop_carry):
+  //   i, output, a, b = loop_carry
+  //   return i >= 0 if transpose_a else i < m
+  std::unique_ptr<xla::ComputationBuilder> condb =
+      builder->CreateSubBuilder("TriangularSolveLeftLookingWhileCond");
+  {
+    auto i = condb->GetTupleElement(
+        condb->Parameter(0, tuple_shape,
+                         "TriangularSolveLeftLookingWhileTuple"),
+        0);
+    if (transpose_a) {
+      condb->Ge(i, condb->ConstantR0<int32>(0));
     } else {
-      update = builder->Div(b_slice, a_slice);
+      condb->Lt(i, condb->ConstantR0<int32>(m));
     }
+  }
+  TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
 
-    TF_ASSIGN_OR_RETURN(
-        output, UpdateSliceInMinorDims(builder, output, update, {0, i}));
-    // b[..., :, i+k:] -= np.dot(output[..., :, i:i+k],
-    //                           np.transpose(..., a[i+k:, i:i+k]))
-    if (i + k < n) {
-      TF_ASSIGN_OR_RETURN(auto a_slice_2,
-                          SliceInMinorDims(builder, a, {i + k, i}, {n, i + k}));
-      TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, update, a_slice_2,
-                                                  /*transpose_x=*/false,
-                                                  /*transpose_y=*/true));
-
-      TF_ASSIGN_OR_RETURN(auto b_slice_2,
-                          SliceInMinorDims(builder, b, {0, i + k}, {m, n}));
-      b_update = builder->Sub(b_slice_2, b_update);
-      TF_ASSIGN_OR_RETURN(
-          b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k}));
+  // Construct the loop body function,
+  // def body_fun(loop_carry):
+  //   i, output, a, b = loop_carry
+  //   if transpose_a:
+  //     a_row = np.swapaxes(a[..., i+1:, i:i+1], -1 -2)
+  //   else:
+  //     a_row = a[..., i:i+1, :i]
+  //   result_row = b[..., i:i+1, :] - np.matmul(a_row, output[..., :, :])
+  //   output[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1]
+  //   if transpose_a:
+  //     return (i - 1, output, a, b)
+  //   else:
+  //     return (i + 1, output, a, b)
+  // We have to do some extra FLOPs propagating zeros in the matrix multiply
+  // because we can't have the size of its arguments depend on the loop counter.
+  std::unique_ptr<xla::ComputationBuilder> bodyb =
+      builder->CreateSubBuilder("TriangularSolveLeftLookingWhileBody");
+  {
+    auto input_tuple = bodyb->Parameter(0, tuple_shape,
+                                        "TriangularSolveLeftLookingWhileTuple");
+
+    // i, output, a, b = loop_carry
+    auto i = bodyb->GetTupleElement(input_tuple, 0);
+    auto body_out = bodyb->GetTupleElement(input_tuple, 1);
+    auto body_a = bodyb->GetTupleElement(input_tuple, 2);
+    auto body_b = bodyb->GetTupleElement(input_tuple, 3);
+    auto zero = bodyb->ConstantR0<int32>(0);
+
+    // Set up some helper functions.
+    auto prepend_zeros = [&](std::array<xla::ComputationDataHandle, 2> starts) {
+      auto zero = bodyb->Reshape(bodyb->ConstantR0<int32>(0), {1});
+      std::vector<xla::ComputationDataHandle> padded_starts(ndims, zero);
+      padded_starts[ndims - 2] = bodyb->Reshape(starts[0], {1});
+      padded_starts[ndims - 1] = bodyb->Reshape(starts[1], {1});
+      return bodyb->ConcatInDim(padded_starts, 0);
+    };
+
+    auto dynamic_slice = [&](xla::ComputationDataHandle x,
+                             std::array<xla::ComputationDataHandle, 2> starts,
+                             std::array<int64, 2> sizes) {
+      auto padded_starts = prepend_zeros(starts);
+      auto padded_sizes = prepend_batch_dims(sizes);
+      return bodyb->DynamicSlice(x, padded_starts, padded_sizes);
+    };
+
+    auto update = [&](xla::ComputationDataHandle x,
+                      xla::ComputationDataHandle update,
+                      std::array<xla::ComputationDataHandle, 2> starts) {
+      auto padded_starts = prepend_zeros(starts);
+      return bodyb->DynamicUpdateSlice(x, update, padded_starts);
+    };
+
+    // We'd like to implement this:
+    //   if transpose_a:
+    //     a_row = T(a[..., i+1:, i:i+1])
+    //     result_row = (b[..., i:i+1, :]
+    //                   - np.matmul(a_row, body_out[..., i+1:, :]))
+    //   else:
+    //     result_row = (b[..., i:i+1, :]
+    //                   - np.matmul(a[..., i:i+1, :i], body_out[..., :i, :]))
+    // But since we can't have intermediate array sizes depend on the loop
+    // counter, we instead exploit the fact that we initialized the output to
+    // all zeros and use that as zero-padding (doing unnecessary FLOPs).
+    xla::ComputationDataHandle a_row;
+    if (transpose_a) {
+      a_row = dynamic_slice(body_a, {zero, i}, {m, 1});
+    } else {
+      a_row = dynamic_slice(body_a, {i, zero}, {1, m});
     }
+    TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), a_row, body_out,
+                                                /*transpose_x=*/transpose_a,
+                                                /*transpose_y=*/false,
+                                                /*conjugate_x=*/conjugate_a,
+                                                /*conjugate_y=*/false));
+    auto result_row =
+        bodyb->Sub(dynamic_slice(body_b, {i, zero}, {1, n}), b_update);
+
+    // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1]
+    auto a_elt = dynamic_slice(body_a, {i, i}, {1, 1});
+    auto div_result = bodyb->Div(result_row, maybe_conj(bodyb.get(), a_elt));
+    body_out = update(body_out, div_result, {i, zero});
+
+    // if transpose_a:
+    //   return (i - 1, body_out, a, b)
+    // else:
+    //   return (i + 1, body_out, a, b)
+    auto next_i = bodyb->Add(i, bodyb->ConstantR0<int32>(transpose_a ? -1 : 1));
+    bodyb->Tuple({next_i, body_out, body_a, body_b});
   }
-  return output;
+  TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
+
+  // Construct the While loop and return the result,
+  // return while_loop(cond_fun, body_fun, init)[1]
+  auto triangular_solve_left_looking_while = builder->While(cond, body, init);
+  return builder->GetTupleElement(triangular_solve_left_looking_while, 1);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
index 501d026411..e32223bfdd 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
@@ -21,25 +21,50 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Solves systems of linear equations with upper or lower triangular matrices by
-// backsubstitution.
+// Solves systems of linear equations with lower or upper triangular coefficient
+// matrices by forward- or back-substitution. Broadcasting along leading
+// dimensions, this routine solves one of the matrix systems
+//   `op(a) * x = b`,  or `x * op(a) = b`,
+// for the variable `x` given `a` and `b`, where `op(a)` is either
+//   `op(a) = a`,  or `op(a) = transpose(a)`,  or `op(a) = conj(transpose(a))`.
+// That is, the innermost matrices in the output satisfy a scalar system
+// depending on the value of the value of (left_side, transpose_a, conjugate_a)
+// according to:
+//   (F, F, F) => `output[..., i, k]  a[..., k, j] = b[..., i, j]`,
+//   (F, F, T) => `output[..., i, k] a*[..., k, j] = b[..., i, j]`,
+//   (F, T, F) => `output[..., i, k]  a[..., j, k] = b[..., i, j]`,
+//   (F, T, T) => `output[..., i, k] a*[..., j, k] = b[..., i, j]`,
+//   (T, F, F) => ` a[..., i, k] output[..., k, j] = b[..., i, j]`,
+//   (T, F, T) => `a*[..., i, k] output[..., k, j] = b[..., i, j]`,
+//   (T, T, F) => ` a[..., i, k] output[..., j, k] = b[..., i, j]`,
+//   (T, T, T) => `a*[..., i, k] output[..., j, k] = b[..., i, j]`,
+// where * denotes complex conjugation and where the index `k` is summed over.
 //
-// `a` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. The strictly upper triangular part of each inner-most matrix
-// is assumed to be zero and not accessed.
-// `b` is a tensor of shape `[..., M, K]`.
-//
-// The innermost matrices in the output satisfy matrix equations
-// `output[..., i, j] * adjoint(a[..., k, j]) = b[..., i, k]`.
+// `a` is a tensor of shape `[..., M, M]` whose innermost 2 dimensions form
+// square matrices. If lower is true (false), then the strictly upper (lower)
+// triangular part of each innermost matrix in `a` is assumed to be zero and is
+// not accessed.
+// `b` is a tensor of shape `[..., M, K]` if left_side is true, otherwise a
+// tensor of shape `[..., K, M]`.
+// `left_side` is a boolean, indicating whether to solve a system of the form
+// op(a) * x = b (true) or x * op(a) = b (false).
+// `lower` is a boolean, indicating whether the argument `a` is lower-triangular
+// (true) or upper-triangular (false).
+// `transpose_a` is a boolean indicating whether the matrix `a` is transposed.
+// `conjugate_a` is a boolean indicating whether the entries of `a` are complex
+// conjugated (independently of whether they are transposed), so that when both
+// transpose_a and conjugate_a are true the effect is a Hermitian adjoint.
 //
 // Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no
 // blocking is used.
-// TODO(phawkins): equivalent to the BLAS TRSM routine with side=right,
-// kind=lower, and transposed_a=true. Implement the other possible combinations
-// of side, kind and transposed_a.
 xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
-    xla::ComputationDataHandle b, int64 block_size = 256);
+    xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a,
+    bool conjugate_a, int64 block_size = 256);
+
+xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
+    const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
index 671d9aa4fe..6617070629 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
@@ -27,32 +27,134 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace tensorflow {
 namespace {
 
 using TriangularSolveTest = xla::ClientLibraryTestBase;
+using TriangularSolveLeftLookingTest = xla::ClientLibraryTestBase;
+using complex64 = xla::complex64;
 
-XLA_TEST_F(TriangularSolveTest, Simple) {
+xla::Array2D<float> AValsLower() {
+  return {{2, 0, 0, 0}, {3, 6, 0, 0}, {4, 7, 9, 0}, {5, 8, 10, 11}};
+}
+
+xla::Array2D<float> AValsUpper() {
+  return {{2, 3, 4, 5}, {0, 6, 7, 8}, {0, 0, 9, 10}, {0, 0, 0, 11}};
+}
+
+xla::Array2D<float> BValsRight() {
+  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
+}
+
+xla::Array2D<float> BValsLeft() {
+  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
+}
+
+xla::Array2D<complex64> AValsLowerComplex() {
+  return {{2, 0, 0, 0},
+          {complex64(3, 1), 6, 0, 0},
+          {4, complex64(7, 2), 9, 0},
+          {5, 8, complex64(10, 3), 11}};
+}
+
+xla::Array2D<complex64> AValsUpperComplex() {
+  return {{2, 3, complex64(4, 3), 5},
+          {0, 6, complex64(7, 2), 8},
+          {0, 0, complex64(9, 1), 10},
+          {0, 0, 0, 11}};
+}
+
+xla::Array2D<complex64> BValsRightComplex() {
+  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
+}
+
+xla::Array2D<complex64> BValsLeftComplex() {
+  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
+}
+
+xla::Array2D<float> AValsFull() {
+  return {{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 7, 9, 0}, {5, 8, 10, 11}};
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) {
   xla::ComputationBuilder builder(client_, TestName());
 
-  xla::Array2D<float> a_vals({
-      {2, 0, 0, 0},
-      {3, 6, 0, 0},
-      {4, 7, 9, 0},
-      {5, 8, 10, 11},
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/true,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 0.08333334, 0.04629629, 0.03367003},
+      {2.5, -0.25, -0.1388889, -0.1010101},
+      {4.5, -0.58333331, -0.32407406, -0.23569024},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/true,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
+      {0.64393939, 0.06565657, -0.03030303, 0.72727273},
+      {1.4520202, 0.2003367, 0.01010101, 1.09090909},
   });
-  xla::Array2D<float> b_vals({
-      {1, 2, 3, 4},
-      {5, 6, 7, 8},
-      {9, 10, 11, 12},
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/false,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
+      {0.64393939, 0.06565657, -0.03030303, 0.72727273},
+      {1.4520202, 0.2003367, 0.01010101, 1.09090909},
   });
 
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
   xla::ComputationDataHandle a, b;
-  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(b_vals, 1, "b", &builder, &b);
-  auto result = TriangularSolve(&builder, a, b, /*block_size=*/2);
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/false,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
   TF_ASSERT_OK(result.status());
 
   xla::Array2D<float> expected({
@@ -62,7 +164,201 @@ XLA_TEST_F(TriangularSolveTest, Simple) {
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(2e-3, 2e-3));
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/true,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.89646465, -0.69444444, -0.49242424},
+      {-0.27441077, -0.24074074, -0.20707071},
+      {-0.23232323, -0.22222222, -0.21212121},
+      {0.90909091, 1., 1.09090909},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/true,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/false,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/false,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.89646465, -0.69444444, -0.49242424},
+      {-0.27441077, -0.24074074, -0.20707071},
+      {-0.23232323, -0.22222222, -0.21212121},
+      {0.90909091, 1., 1.09090909},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data =
+      CreateR2Parameter<complex64>(AValsLowerComplex(), 0, "a", &builder, &a);
+  auto b_data =
+      CreateR2Parameter<complex64>(BValsRightComplex(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/true,
+                                /*transpose_a=*/true, /*conjugate_a=*/true,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<complex64> expected({
+      {0.5, complex64(0.08333333, 0.08333333),
+       complex64(0.02777778, -0.0462963), complex64(0.06313131, -0.01094276)},
+      {2.5, complex64(-0.25, 0.41666667), complex64(-0.23148148, -0.37962963),
+       complex64(0.08670034, -0.02104377)},
+      {4.5, complex64(-0.58333333, 0.75), complex64(-0.49074074, -0.71296296),
+       complex64(0.11026936, -0.03114478)},
+  });
+
+  ComputeAndCompareR2<complex64>(&builder, expected,
+                                 {a_data.get(), b_data.get()},
+                                 xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data =
+      CreateR2Parameter<complex64>(AValsUpperComplex(), 0, "a", &builder, &a);
+  auto b_data =
+      CreateR2Parameter<complex64>(BValsLeftComplex(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/false,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<complex64> expected({
+      {0.5, 1., 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {complex64(0.20020325, -2.81504065e-01),
+       complex64(0.13821138, -4.22764228e-01),
+       complex64(0.07621951, -5.64024390e-01)},
+      {complex64(0.19678492, 2.55912786e-01),
+       complex64(0.17738359, 3.84331116e-01),
+       complex64(0.15798226, 5.12749446e-01)},
+  });
+
+  ComputeAndCompareR2<complex64>(&builder, expected,
+                                 {a_data.get(), b_data.get()},
+                                 xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveLeftLookingTest, Simple) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolveLeftLooking(&builder, a, b,
+                                           /*transpose_a=*/false,
+                                           /*conjugate_a=*/false);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveLeftLookingTest, NonzeroUpperTriangle) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsFull(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolveLeftLooking(&builder, a, b,
+                                           /*transpose_a=*/false,
+                                           /*conjugate_a=*/false);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index ce24b61b5d..9b7492f8cf 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -107,4 +107,15 @@ xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
   return UpdateSlice(builder, x, update, padded_start);
 }
 
+xla::StatusOr<xla::ComputationDataHandle> TransposeInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
+  const int64 n_dims = xla::ShapeUtil::Rank(*shape);
+  TF_RET_CHECK(n_dims >= 2);
+  std::vector<int64> permutation(n_dims);
+  std::iota(permutation.begin(), permutation.end(), 0);
+  std::swap(permutation[n_dims - 1], permutation[n_dims - 2]);
+  return builder->Transpose(x, permutation);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index fb138b4f73..7f93102ee7 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -49,6 +49,10 @@ xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
     const xla::ComputationDataHandle& update, gtl::ArraySlice<int64> start);
 
+// Transposes a stack of matrices `x` by swapping the last two dimensions.
+xla::StatusOr<xla::ComputationDataHandle> TransposeInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
-- 
GitLab


From 84cab7b04b178ed63c19c9d618926f09da327fd7 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Mon, 29 Jan 2018 17:34:07 -0800
Subject: [PATCH 372/966] Add TODO message

---
 tensorflow/contrib/tensorrt/python/trt_convert.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 4f4dcf65aa..f6d2dbede6 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -29,7 +29,8 @@ from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 
-
+# TODO(skama): get outputs from session when implemented as c++
+# optimization pass
 def CreateInferenceGraph(input_graph_def, outputs,max_batch_size=1,max_workspace_size=2<<20):
   """Python wrapper for the TRT transormation.
 
-- 
GitLab


From a4b88a5b795d5496bffe4ff80875a5bf0954a4d6 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 29 Jan 2018 17:48:17 -0800
Subject: [PATCH 373/966] Use new Operation._set_attr method instead of
 modifying node_def directly.

Once calling the C API from TF's Python code is enabled, the NodeDef
returned by Operation.node_def will no longer be the NodeDef sent to
TF's runtime, meaning any changes to it will have no effect. Use
_set_attr instead, which works with and without the C API enabled.

PiperOrigin-RevId: 183759464
---
 tensorflow/contrib/tpu/python/tpu/tpu.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 8fec379aad..d5f54ff4fd 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -153,10 +153,11 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       raise NotImplementedError(
           "Non-resource Variables are not supported inside TPU computations "
           "(operator name: %s)" % op.name)
-    # pylint: enable=protected-access
     if _TPU_REPLICATE_ATTR in op.node_def.attr:
       raise ValueError("TPU computations cannot be nested")
-    op.node_def.attr[_TPU_REPLICATE_ATTR].s = compat.as_bytes(self._name)
+    op._set_attr(_TPU_REPLICATE_ATTR,
+                 attr_value_pb2.AttrValue(s=compat.as_bytes(self._name)))
+    # pylint: enable=protected-access
     op.graph.prevent_feeding(op)
     op.graph.prevent_fetching(op)
 
-- 
GitLab


From e3d99c92975efc2010d0e1e2dd4c3eb787a8d67c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 29 Jan 2018 17:50:56 -0800
Subject: [PATCH 374/966] Remove Identity nodes if num_inputs * num_outputs <=
 num_inputs + num_outputs. Exceptions are Identity nodes after Variable nodes,
 and Identity nodes after Switch nodes when removing the node would require
 anchoring a control dependency on the Switch. Another exception is Identity
 nodes where inputs or outputs cross a device boundary, since we are not
 allowed to remove Identity nodes after _Recv that might be inserted in the
 graph later.

PiperOrigin-RevId: 183759826
---
 .../optimizers/dependency_optimizer.cc        | 209 +++++++++++++-----
 .../optimizers/dependency_optimizer.h         |   9 +-
 .../optimizers/dependency_optimizer_test.cc   | 150 ++++++++++++-
 .../python/debug/lib/debug_gradients_test.py  |   7 +-
 .../debug/lib/session_debug_grpc_test.py      |   3 +-
 5 files changed, 305 insertions(+), 73 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index d2da125236..0842fc92a8 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -36,20 +36,20 @@ namespace grappler {
 
 namespace {
 
-int RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
-  int num_removed = 0;
+bool RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
+  bool removed_input = false;
   int pos = 0;
   while (pos < node->input_size()) {
     if (node->input(pos) == input) {
       node->mutable_input()->SwapElements(pos, node->input_size() - 1);
       node->mutable_input()->RemoveLast();
       node_map->RemoveOutput(NodeName(input), node->name());
+      removed_input = true;
     } else {
       ++pos;
     }
-    ++num_removed;
   }
-  return num_removed;
+  return removed_input;
 }
 
 // Remove duplicate control inputs.
@@ -71,6 +71,43 @@ void PruneControlInputs(NodeDef* node) {
 
 }  // namespace
 
+bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) {
+  if (!IsIdentity(node)) {
+    return true;
+  }
+  if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
+    return false;
+  }
+  if (!fetch_nodes_known_) {
+    // The output values of this node may be needed.
+    return false;
+  }
+  const NodeDef* input = node_map_->GetNode(NodeName(node.input(0)));
+  CHECK(input != nullptr) << "node = " << node.name()
+                          << " input = " << node.input(0);
+  // Don't remove Identity nodes corresponding to Variable reads or following
+  // Recv.
+  if (IsVariable(*input) || IsRecv(*input)) {
+    return false;
+  } else if (IsSwitch(*input)) {
+    // Don't turn Identity nodes following Switch into NoOp or remove them
+    // if it requires anchoring a control dependencies the Switch node, which
+    // is not valid.
+    if (StringPiece(node.name()).starts_with(kConstantFoldingCtrl)) {
+      // TODO(rmlarsen): Try to remove this artificial contraint.
+      return false;
+    }
+    for (auto consumer : node_map_->GetOutputs(node.name())) {
+      for (const string& consumer_input : consumer->input()) {
+        if (consumer_input == AsControlDependency(node.name())) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
 bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
@@ -100,18 +137,8 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
     return false;
   }
 
-  // Don't turn Identity nodes inserted by Grappler after Switch into NoOp,
-  // since we cannot anchor control dependencies on Switch nodes.
-  // Don't remove Identity nodes corresponding to Variable reads.
-  if (IsIdentity(node)) {
-    const NodeDef* input = node_map_->GetNode(NodeName(node.input(0)));
-    if (input != nullptr) {
-      if (IsVariable(*input) ||
-          (StringPiece(node.name()).starts_with(kConstantFoldingCtrl) &&
-           IsSwitch(*input))) {
-        return false;
-      }
-    }
+  if (!SafeToRemoveIdentity(node)) {
+    return false;
   }
 
   const std::unordered_set<string> do_not_rewrite_ops{
@@ -124,19 +151,22 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
 void DependencyOptimizer::OptimizeNode(int node_idx,
                                        SetVector<int>* nodes_to_simplify,
                                        std::set<int>* nodes_to_delete) {
+  const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
   NodeDef* node = optimized_graph_->mutable_node(node_idx);
-
+  const bool is_noop = IsNoOp(*node);
+  const bool is_identity = IsIdentity(*node);
+  const string node_name = node->name();
   // Constant nodes with no input control dependency are always executed early,
   // so we can prune all their output control dependencies.
   if (IsConstant(*node) && node->input_size() == 0) {
-    const std::set<NodeDef*> output_nodes = node_map_->GetOutputs(node->name());
+    const std::set<NodeDef*> output_nodes = node_map_->GetOutputs(node_name);
     for (NodeDef* fanout : output_nodes) {
       bool optimize_fanout = false;
       bool data_connection = false;
       for (int i = fanout->input_size() - 1; i >= 0; --i) {
         int pos;
         string input_name = ParseNodeName(fanout->input(i), &pos);
-        if (input_name == node->name()) {
+        if (input_name == node_name) {
           if (pos < 0) {
             fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1);
             fanout->mutable_input()->RemoveLast();
@@ -149,22 +179,21 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       if (optimize_fanout) {
         nodes_to_simplify->PushBack(node_to_idx_[fanout]);
         if (!data_connection) {
-          node_map_->RemoveOutput(node->name(), fanout->name());
+          node_map_->RemoveOutput(node_name, fanout->name());
         }
       }
     }
-    if (node_map_->GetOutputs(node->name()).empty() && fetch_nodes_known_ &&
-        nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) {
+    if (node_map_->GetOutputs(node_name).empty() && fetch_nodes_known_ &&
+        nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) {
       // Mark the node for deletion.
       nodes_to_delete->insert(node_to_idx_[node]);
     }
-
     return;
   }
 
   // Change ops that only have control dependencies as outputs to NoOps.
-  if (node->op() != "NoOp" && SafeToConvertToNoOp(*node)) {
-    VLOG(1) << "***** Replacing  " << node->name() << " (" << node->op()
+  if (!is_noop && SafeToConvertToNoOp(*node)) {
+    VLOG(1) << "***** Replacing  " << node_name << " (" << node->op()
             << ") with NoOp.";
     // The outputs of this node are not consumed. Replace its inputs with
     // control dependencies and replace the op itself with the NoOp op.
@@ -186,7 +215,7 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
           old_input, optimized_graph_, node_map_.get());
       if (ctrl_inputs.insert(ctrl_input).second) {
         node->set_input(pos, ctrl_input);
-        node_map_->UpdateInput(node->name(), old_input, ctrl_input);
+        node_map_->UpdateInput(node_name, old_input, ctrl_input);
         const NodeDef* old_input_node = node_map_->GetNode(old_input);
         nodes_to_simplify->PushBack(node_to_idx_[old_input_node]);
       }
@@ -194,6 +223,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
     }
     node->set_op("NoOp");
     node->clear_attr();
+    nodes_to_simplify->PushBack(node_to_idx_[node]);
+    return;
   }
 
   // Remove NoOp nodes if the product of their fan-in and fan-out is less than
@@ -222,9 +253,30 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
   // a and x, respectively, are on the same device. Control edges across device
   // boundaries require inter-device communication (Send/Recv pairs to be
   // inserted in the graph), which is very costly.
+  //
+  // We also remove identity nodes, subject to the same constraints on number of
+  // resulting control edges and device boundary crossings:
+  //
+  // Case a)
+  //          +----------+ ---> a       +---+ ---> a
+  //    x --> | Identity | --^> b  ==>  | x | --^> b
+  //          |          | ...          |   | ...
+  //          +----------+ --^> c       +---+ --^> c
+  //
+  // Case b)
+  //    x ---> +----------+ ---> a      x ---> +---+
+  //    y --^> | Identity |        ==>  y --^> | a |
+  //    ...    |          |               ...  |   |
+  //    z --^> +----------+             z --^> +---+
+  //
+  // Case c)
+  //           +----------+             x ---> +---+
+  //    x ---> | Identity | ---> a ==>   \--^> | a |
+  //    y --^> |          | --^> b       /\    +---+
+  //           +----------+             y --^> b
 
-  if (node->op() == "NoOp") {
-    const auto& output_node_set = node_map_->GetOutputs(node->name());
+  if (is_noop || (is_identity && is_aggressive)) {
+    const auto& output_node_set = node_map_->GetOutputs(node_name);
     const std::vector<NodeDef*> output_nodes(output_node_set.begin(),
                                              output_node_set.end());
     const int num_outputs = output_nodes.size();
@@ -233,15 +285,14 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
     if (num_inputs * num_outputs > num_inputs + num_outputs) {
       return;
     }
-    VLOG(1) << "***** Rerouting input around " << node->name();
     std::vector<NodeDef*> input_nodes;
     for (int i = 0; i < num_inputs; ++i) {
-      NodeDef* tmp = node_map_->GetNode(node->input(i));
-      CHECK_NE(tmp, nullptr);
-      input_nodes.push_back(tmp);
+      NodeDef* input_node = node_map_->GetNode(node->input(i));
+      CHECK_NE(input_node, nullptr);
+      input_nodes.push_back(input_node);
     }
 
-    // Make sure that we don't increase the number of control edges that cross
+    // Make sure that we don't increase the number of edges that cross
     // device boundaries.
     if ((num_inputs == 1 && num_outputs > 1 &&
          input_nodes[0]->device() != node->device()) ||
@@ -266,40 +317,75 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       if (num_cross_after > num_cross_before) {
         return;
       }
+      // To avoid potentially removing Identity nodes following _Recv nodes,
+      // we require that no device crossings occur in that case.
+      // TODO(rmlarsen): See if we can relax this condition.
+      if (is_identity && (num_cross_after > 0 || num_cross_before > 0)) {
+        return;
+      }
+    }
+    if (is_identity && !SafeToRemoveIdentity(*node)) {
+      return;
     }
+
+    VLOG(1) << "***** Rerouting input around\n" << node->DebugString();
+    // Now remove the node and re-wire its inputs to its outputs.
     for (auto consumer : output_nodes) {
       bool updated_consumer = false;
-      VLOG(1) << "***** Considering consumer  " << consumer->name() << "\n"
-              << consumer->DebugString();
+      VLOG(1) << "consumer before:\n" << consumer->DebugString();
       for (int i = 0; i < num_inputs; ++i) {
         const NodeDef* input = input_nodes[i];
         // Forward dependency from input to consumer if it doesn't already
         // depend on it.
-        if (node_map_->GetOutputs(input->name()).count(consumer) == 0) {
-          consumer->add_input(AsControlDependency(input->name()));
+        if (is_identity && i == 0) {
+          // Replace regular input from Identity node.
+          bool found_input = false;
+          string new_input;
+          const string& input_to_forward = node->input(0);
+          CHECK(!IsControlInput(input_to_forward));
+          for (int j = 0; j < consumer->input_size(); ++j) {
+            const string& old_input = consumer->input(j);
+            if (old_input == node_name) {
+              new_input = input_to_forward;
+              node_map_->UpdateInput(consumer->name(), old_input, new_input);
+              consumer->set_input(j, new_input);
+              found_input = true;
+            } else if (old_input == AsControlDependency(NodeName(node_name))) {
+              new_input = AsControlDependency(NodeName(input_to_forward));
+              node_map_->UpdateInput(consumer->name(), old_input, new_input);
+              consumer->set_input(j, new_input);
+              found_input = true;
+            }
+          }
+          CHECK(found_input);
           updated_consumer = true;
-          node_map_->AddOutput(input->name(), consumer->name());
-          nodes_to_simplify->PushBack(node_to_idx_[input]);
+        } else {
+          // Forward dependency from input to consumer if it doesn't already
+          // depend on it.
+          if (node_map_->GetOutputs(input->name()).count(consumer) == 0) {
+            consumer->add_input(AsControlDependency(input->name()));
+            node_map_->AddOutput(input->name(), consumer->name());
+            nodes_to_simplify->PushBack(node_to_idx_[input]);
+            updated_consumer = true;
+          }
         }
       }
       // Remove dependency on node from consumer.
-      updated_consumer |= RemoveInput(
-          consumer, AsControlDependency(node->name()), node_map_.get());
+      updated_consumer |= RemoveInput(consumer, AsControlDependency(node_name),
+                                      node_map_.get());
       if (updated_consumer) {
-        VLOG(1) << "***** Updated consumer  " << consumer->name() << " ("
-                << consumer->op() << ")";
         nodes_to_simplify->PushBack(node_to_idx_[consumer]);
       }
+      VLOG(1) << "consumer after:\n" << consumer->DebugString();
     }
-
-    node_map_->RemoveOutputs(node->name());
+    node_map_->RemoveOutputs(node_name);
     if (fetch_nodes_known_ &&
-        nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) {
+        nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) {
       // Mark the node for deletion.
       nodes_to_delete->insert(node_idx);
 
-      // Unconnect the node from its inputs to enable further optimizations.
-      node_map_->RemoveInputs(node->name());
+      // Disconnect the node from its inputs to enable further optimizations.
+      node_map_->RemoveInputs(node_name);
       node->clear_input();
     }
   }
@@ -330,13 +416,18 @@ Status DependencyOptimizer::OptimizeDependencies() {
   std::set<int> nodes_to_delete;
   for (int i = 0; i < optimized_graph_->node_size(); ++i) {
     const NodeDef& node = optimized_graph_->node(i);
-    if (node.op() == "NoOp" || IsConstant(node) || SafeToConvertToNoOp(node)) {
+    if (IsNoOp(node) || IsIdentity(node) || IsConstant(node) ||
+        SafeToConvertToNoOp(node)) {
       nodes_to_simplify.PushBack(i);
     }
   }
   while (!nodes_to_simplify.Empty()) {
-    OptimizeNode(nodes_to_simplify.PopBack(), &nodes_to_simplify,
-                 &nodes_to_delete);
+    int node_to_simplify = nodes_to_simplify.PopBack();
+    // Discard nodes that were marked for deletion already.
+    while (nodes_to_delete.find(node_to_simplify) != nodes_to_delete.end()) {
+      node_to_simplify = nodes_to_simplify.PopBack();
+    }
+    OptimizeNode(node_to_simplify, &nodes_to_simplify, &nodes_to_delete);
   }
 
   if (fetch_nodes_known_) {
@@ -431,9 +522,10 @@ Status DependencyOptimizer::TransitiveReduction() {
       if (longest_distance[target] > 1) {
         const int input_slot = control_output.second;
         control_edges_to_remove[target].emplace(input_slot, source);
-        VLOG(1) << "Removing edge from:\n"
-                << optimized_graph_->node(source).DebugString() << "\n\nto:\n\n"
-                << optimized_graph_->node(target).DebugString();
+        //        VLOG(1) << "Removing edge from:\n"
+        //                << optimized_graph_->node(source).DebugString() <<
+        //                "\n\nto:\n\n"
+        //                << optimized_graph_->node(target).DebugString();
       }
     }
   }
@@ -473,8 +565,8 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   *optimized_graph_ = item.graph;
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
-
   CleanControlInputs();
+
   const int num_iterations = 2;
   for (int iteration = 0; iteration < num_iterations; ++iteration) {
     Status topo_sort_status;
@@ -491,9 +583,12 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     } else {
       LOG(ERROR) << topo_sort_status.error_message();
     }
-
-    // Turn nodes with only control outputs into NoOps, prune NoOps.
+    // Turn nodes with only control outputs into NoOps, prune NoOp and Identity
+    // nodes.
     TF_RETURN_IF_ERROR(OptimizeDependencies());
+
+    // Dedup control inputs.
+    CleanControlInputs();
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index cfc5324439..0f47528a04 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -29,8 +29,9 @@ namespace grappler {
 // optimizations, such as removing nodes that are effectively noops.
 class DependencyOptimizer : public GraphOptimizer {
  public:
-  DependencyOptimizer() {}
-  explicit DependencyOptimizer(RewriterConfig::Toggle /*unused*/) {}
+  DependencyOptimizer() : opt_level_(RewriterConfig::ON) {}
+  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level)
+      : opt_level_(opt_level) {}
   ~DependencyOptimizer() override {}
 
   string name() const override { return "dependency_optimizer"; };
@@ -42,6 +43,9 @@ class DependencyOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
+  // Returns true if node is not an Identity node or if it is an Identity
+  // that is safe to remove.
+  bool SafeToRemoveIdentity(const NodeDef& node);
   // Returns true if it is safe to convert node to NoOp.
   bool SafeToConvertToNoOp(const NodeDef& node);
   // Removes all duplicate control dependencies.
@@ -61,6 +65,7 @@ class DependencyOptimizer : public GraphOptimizer {
   // Main driver of dependency optimizations.
   Status OptimizeDependencies();
 
+  RewriterConfig::Toggle opt_level_;
   bool fetch_nodes_known_;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index f5027a4a99..b8facb9dea 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -167,14 +167,16 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_SwitchIdentity) {
       ops::Const(scope.WithOpName("c2").WithControlDependencies(ctrl_dep_id),
                  {1.0f, 2.0f}, {1, 2});
   Output neg1 = ops::Neg(scope.WithOpName("neg1"), s.output_false);
+  Output neg2 = ops::Neg(scope.WithOpName("neg2"), ctrl_dep_id);
 
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   item.fetch.push_back("c1");
   item.fetch.push_back("c2");
   item.fetch.push_back("neg1");
+  item.fetch.push_back("neg2");
 
-  DependencyOptimizer optimizer;
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -323,25 +325,148 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   }
 }
 
+TEST_F(DependencyOptimizerTest, RemoveIdentity) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
+  Output z = ops::RandomUniform(s.WithOpName("z"), {1, 2}, DT_FLOAT);
+
+  // Identity nodes to be removed.
+  // Case a) with a single input- and multiple outputs.
+  auto id_a = ops::Identity(s.WithOpName("id_a"), x);
+  // Case b) with multiple inputs and a single output.
+  auto id_b = ops::Identity(
+      s.WithOpName("id_b").WithControlDependencies(y).WithControlDependencies(
+          z),
+      x);
+  // Case c) with two inputs and two outputs.
+  auto id_c = ops::Identity(s.WithOpName("id_c").WithControlDependencies(y), x);
+
+  // Output for Case a.
+  Output a_a = ops::Identity(s.WithOpName("a_a"), id_a);
+  Output a_b = ops::Identity(s.WithOpName("a_b"), id_a);
+  Output a_c =
+      ops::Identity(s.WithOpName("a_c").WithControlDependencies(id_a), z);
+  Output a_d =
+      ops::Identity(s.WithOpName("a_d").WithControlDependencies(id_a), z);
+  // Output for Case b.
+  Output b_a = ops::Identity(s.WithOpName("b_a"), id_b);
+  // Output for Case c.
+  Output c_a = ops::Identity(s.WithOpName("c_a"), id_c);
+  Output c_b =
+      ops::Identity(s.WithOpName("c_b").WithControlDependencies(id_c), z);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"a_a", "a_b", "a_c", "a_d", "b_a", "c_a", "c_b"};
+
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size() - 3, output.node_size());
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE("id_a", node.name());
+    EXPECT_NE("id_b", node.name());
+    EXPECT_NE("id_c", node.name());
+    if (node.name() == "a_a" || node.name() == "a_b") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+    }
+    if (node.name() == "a_c" || node.name() == "a_d") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("z", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+    }
+    if (node.name() == "b_a") {
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+      EXPECT_EQ("^z", node.input(2));
+    }
+    if (node.name() == "c_a") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+    }
+    if (node.name() == "c_b") {
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("z", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ("^y", node.input(2));
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, RemoveIdentity_RepeatedInputs) {
+  // Corner cases with repeated inputs.
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  ops::Variable x(scope.WithOpName("x"), {}, DT_BOOL);
+  ops::Variable y(scope.WithOpName("y"), {}, DT_BOOL);
+  ops::Switch sw(scope.WithOpName("switch"), x, x);
+  // id0 should be removed.
+  Output id0 = ops::Identity(scope.WithOpName("id0"), sw.output_true);
+  // id1 should not be removed, since it would anchor a control dependency
+  // on the switch.
+  Output id1 = ops::Identity(scope.WithOpName("id1"), sw.output_false);
+  Output or0 = ops::LogicalOr(scope.WithOpName("or0"), id0, id0);
+  Output or1 = ops::LogicalOr(scope.WithOpName("or1"), id0, y);
+  Output or2 = ops::LogicalOr(
+      scope.WithOpName("or2").WithControlDependencies(id1), y, y);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch.push_back("or0");
+  item.fetch.push_back("or1");
+  item.fetch.push_back("or2");
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size() - 1, output.node_size());
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE("id0", node.name());
+    if (node.name() == "or0") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("switch:1", node.input(0));
+      EXPECT_EQ("switch:1", node.input(1));
+    }
+    if (node.name() == "or1") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("switch:1", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    }
+    if (node.name() == "or2") {
+      // or1 should be unchanged.
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^id1", node.input(2));
+    }
+  }
+}
+
 TEST_F(DependencyOptimizerTest, Transitive_Reduction_Simple) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
   Output x = ops::Square(s.WithOpName("x"), c);
-  Output id1 = ops::Identity(s.WithOpName("id1"), x);
-  Output id2 =
-      ops::Identity(s.WithOpName("id2").WithControlDependencies({x}), id1);
+  Output neg1 = ops::Neg(s.WithOpName("neg1"), x);
+  Output neg2 =
+      ops::Neg(s.WithOpName("neg2").WithControlDependencies({x}), neg1);
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  item.fetch.push_back("id2");
-  DependencyOptimizer optimizer;
+  item.fetch.push_back("neg2");
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
   EXPECT_EQ(4, output.node_size());
-  EXPECT_EQ("id2", output.node(3).name());
+  EXPECT_EQ("neg2", output.node(3).name());
   EXPECT_EQ(1, output.node(3).input_size());
-  EXPECT_EQ("id1", output.node(3).input(0));
+  EXPECT_EQ("neg1", output.node(3).input(0));
 }
 
 TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) {
@@ -356,20 +481,21 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) {
   Output grappler_added_id = ops::Identity(
       scope.WithOpName("ConstantFoldingCtrl/switch_1"), s.output_true);
   Output c1 = ops::Const(scope.WithOpName("c1")
-                             .WithControlDependencies(id0)
                              .WithControlDependencies(id_after_var)
                              .WithControlDependencies(grappler_added_id),
                          {1.0f, 2.0f}, {1, 2});
   Output id1 = ops::Identity(scope.WithOpName("id1"), c1);
+  Output id2 = ops::Identity(scope.WithOpName("id2"), id0);
   Output fetch =
       ops::Identity(scope.WithOpName("fetch").WithControlDependencies(id1), c1);
 
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   item.fetch.push_back("c1");
+  item.fetch.push_back("id2");
   item.fetch.push_back("fetch");
 
-  DependencyOptimizer optimizer;
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -377,8 +503,8 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) {
   EXPECT_EQ(item.graph.node_size() - 2, output.node_size());
   for (int i = 0; i < output.node_size(); ++i) {
     const NodeDef& node = output.node(i);
-    // "id0" and "id1" but neither "ConstantFoldingCtrl/switch_1" nor
-    // "id_after_var" should be eliminated.
+    // "id0" and "id1" but neither "ConstantFoldingCtrl/switch_1",
+    // "id_after_var, nor "id2"" should be eliminated.
     EXPECT_NE("id0", node.name());
     EXPECT_NE("id1", node.name());
     if (node.name() == "c1") {
diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py
index b6c7280a41..c1e9869d97 100644
--- a/tensorflow/python/debug/lib/debug_gradients_test.py
+++ b/tensorflow/python/debug/lib/debug_gradients_test.py
@@ -22,6 +22,7 @@ import shutil
 import tempfile
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.lib import debug_gradients
@@ -38,7 +39,11 @@ from tensorflow.python.training import gradient_descent
 class IdentifyGradientTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
-    self.sess = session.Session()
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    config = config_pb2.ConfigProto(graph_options=graph_options)
+    self.sess = session.Session(config=config)
     with self.sess.as_default():
       self.u = variables.Variable(2.0, name="u")
       self.v = variables.Variable(3.0, name="v")
diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py
index 367b353545..b623ee31c5 100644
--- a/tensorflow/python/debug/lib/session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py
@@ -54,7 +54,8 @@ from tensorflow.python.training import monitored_session
 def no_rewrite_session_config():
   rewriter_config = rewriter_config_pb2.RewriterConfig(
       disable_model_pruning=True,
-      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
   graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
   return config_pb2.ConfigProto(graph_options=graph_options)
 
-- 
GitLab


From 87a3c967973641d3b0d2a16d17add184ed967392 Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Mon, 29 Jan 2018 18:30:59 -0800
Subject: [PATCH 375/966] TFE: Register a GPU kernel for tfe.py_func.

PiperOrigin-RevId: 183765122
---
 tensorflow/python/BUILD                       |  1 +
 .../python/kernel_tests/py_func_test.py       | 86 +++++++++----------
 tensorflow/python/lib/core/py_func.cc         | 72 +++++++++++-----
 tensorflow/python/ops/script_ops.py           | 38 +++++---
 4 files changed, 120 insertions(+), 77 deletions(-)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a323d5bc39..363ff6fae9 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -298,6 +298,7 @@ cc_library(
         ":safe_ptr",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 92fb68820e..c7181497d8 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -396,66 +396,66 @@ class PyFuncTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def testEagerSingleOutputFloat32(self):
-    a = array_ops.ones((3, 3), dtype=dtypes.float32)
-    x = array_ops.ones((3, 1), dtype=dtypes.float32)
-    output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32)
-    with self.test_session():
+    with test_util.device(use_gpu=True):
+      a = array_ops.ones((3, 3), dtype=dtypes.float32)
+      x = array_ops.ones((3, 1), dtype=dtypes.float32)
+      output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32)
       ret = self.evaluate(output)
       self.assertAllClose(ret, [[3.0], [3.0], [3.0]])
 
   @test_util.run_in_graph_and_eager_modes()
   def testEagerArrayOutput(self):
-    a = array_ops.ones((3, 3), dtype=dtypes.int32)
-    x = array_ops.ones((3, 1), dtype=dtypes.int32)
-    output = script_ops.eager_py_func(
-        lambda a, x: [matmul(a, x)], inp=[a, x], Tout=[dtypes.int32])
-
-    with self.test_session():
+    with test_util.device(use_gpu=True):
+      a = array_ops.ones((3, 3), dtype=dtypes.float32)
+      x = array_ops.ones((3, 1), dtype=dtypes.float32)
+      output = script_ops.eager_py_func(
+          lambda a, x: [matmul(a, x)], inp=[a, x], Tout=[dtypes.float32])
       ret = self.evaluate(output)
-      self.assertAllEqual(ret, [[[3], [3], [3]]])
+      self.assertAllEqual(ret, [[[3.0], [3.0], [3.0]]])
 
   @test_util.run_in_graph_and_eager_modes()
   def testEagerReturnNone(self):
+    with test_util.device(use_gpu=True):
+      def no_return_value():
+        return
 
-    def no_return_value():
-      return
-
-    output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[])
-    ret = self.evaluate(output)
-    if context.in_eager_mode():
-      self.assertEquals(len(ret), 0)
-    else:
-      self.assertIsNone(ret)
+      output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[])
+      ret = self.evaluate(output)
+      if context.in_eager_mode():
+        self.assertEquals(len(ret), 0)
+      else:
+        self.assertIsNone(ret)
 
   @test_util.run_in_graph_and_eager_modes()
   def testEagerPyFuncInDefun(self):
+    with test_util.device(use_gpu=True):
+      def wrapper():
+        a = array_ops.ones((3, 3), dtype=dtypes.float32)
+        x = array_ops.ones((3, 1), dtype=dtypes.float32)
+        return script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32)
 
-    def wrapper():
-      a = array_ops.ones((3, 3), dtype=dtypes.int32)
-      x = array_ops.ones((3, 1), dtype=dtypes.int32)
-      return script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.int32)
-
-    wrapped = function.defun(wrapper)
-    ret = self.evaluate(wrapped())
-    self.assertAllEqual(ret, [[3], [3], [3]])
+      wrapped = function.defun(wrapper)
+      ret = self.evaluate(wrapped())
+      self.assertAllEqual(ret, [[3.0], [3.0], [3.0]])
 
   @test_util.run_in_graph_and_eager_modes()
   def testEagerExceptionHandling(self):
-    self._testExceptionHandling(
-        ValueError, errors.InvalidArgumentError, eager=True)
-    self._testExceptionHandling(
-        TypeError, errors.InvalidArgumentError, eager=True)
-    self._testExceptionHandling(
-        StopIteration, errors.OutOfRangeError, eager=True)
-    self._testExceptionHandling(
-        MemoryError, errors.ResourceExhaustedError, eager=True)
-    self._testExceptionHandling(
-        NotImplementedError, errors.UnimplementedError, eager=True)
-
-    class WeirdError(Exception):
-      pass
-
-    self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
+    with test_util.device(use_gpu=True):
+      self._testExceptionHandling(
+          ValueError, errors.InvalidArgumentError, eager=True)
+      self._testExceptionHandling(
+          TypeError, errors.InvalidArgumentError, eager=True)
+      self._testExceptionHandling(
+          StopIteration, errors.OutOfRangeError, eager=True)
+      self._testExceptionHandling(
+          MemoryError, errors.ResourceExhaustedError, eager=True)
+      self._testExceptionHandling(
+          NotImplementedError, errors.UnimplementedError, eager=True)
+
+      class WeirdError(Exception):
+        pass
+
+      self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index d3bfa0ee33..e0422ef80a 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "numpy/arrayobject.h"
 #include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/tf_status_helper.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -53,6 +54,12 @@ struct PyCall {
   // with this "token".
   string token;
 
+  // The device on which Tensors are stored; only used for EagerPyFunc.
+  Device* device;
+
+  // True if and only if the op has been placed on a GPU.
+  bool gpu;
+
   // True if the call is associated with an EagerPyFunc.
   bool eager;
 
@@ -71,7 +78,12 @@ Status MakeArgTuple(const PyCall* call, PyObject** tuple) {
     PyObject* arg = nullptr;
     const Tensor& t = call->ins[i];
     if (call->eager) {
-      arg = EagerTensorFromHandle(TFE_NewTensorHandle(t));
+      if (call->gpu) {
+        arg = EagerTensorFromHandle(new TFE_TensorHandle(t, call->device));
+      } else {
+        // TFE_TensorHandle assumes that CPU is identified by `nullptr`.
+        arg = EagerTensorFromHandle(new TFE_TensorHandle(t, nullptr));
+      }
       if (arg == nullptr) {
         return errors::Internal("Unable to procure EagerTensor from Tensor.");
       }
@@ -84,7 +96,8 @@ Status MakeArgTuple(const PyCall* call, PyObject** tuple) {
     }
     PyList_SetItem(lst, i, arg);
   }
-  *tuple = Py_BuildValue("(sN)", call->token.c_str(), lst);
+  *tuple = Py_BuildValue("(sON)", call->token.c_str(),
+                         call->gpu ? Py_True : Py_False, lst);
   CHECK(*tuple);
   return Status::OK();
 }
@@ -150,15 +163,9 @@ bool IsSingleNone(PyObject* obj) {
 }
 
 // Retrieves a Tensor from `eager_tensor` and stores it in `output_tensor`.
-Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
-                                    Tensor* output_tensor,
-                                    TF_Status* tf_status) {
-  // TODO(akshayka): Lift the restriction requiring output tensors to
-  // lie in host memory; EagerPyFunc should be able to dispatch ops on GPU
-  // tensors, so we should eventually implement a GPU kernel for EagerPyFunc.
-  *output_tensor = *TFE_TensorHandleUnderlyingTensorInHostMemory(
-      EagerTensor_Handle(eager_tensor), tf_status);
-  return StatusFromTF_Status(tf_status);
+void ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
+                                  Tensor* output_tensor) {
+  *output_tensor = EagerTensor_Handle(eager_tensor)->t;
 }
 
 // Calls the registered py function through the trampoline.
@@ -201,15 +208,23 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
   }
 
   // Process the return values and convert them to TF Tensors.
-  Status s;
+  Status s = Status::OK();
   if (PyList_Check(result)) {
+    // `result` is a Python list; if this operation is an `EagerPyFunc`, then
+    // every item in the list must be an `EagerTensor`; otherwise, every element
+    // must be a NumPy array.
     call->out.clear();
     for (int i = 0; i < PyList_Size(result); ++i) {
       Tensor t;
       if (call->eager) {
-        auto tf_status = tensorflow::make_safe(TF_NewStatus());
-        s = ExtractTensorFromEagerTensor(PyList_GetItem(result, i), &t,
-                                         tf_status.get());
+        const PyObject* item = PyList_GetItem(result, i);
+        if (EagerTensor_CheckExact(item)) {
+          ExtractTensorFromEagerTensor(item, &t);
+        } else {
+          s = errors::FailedPrecondition(
+              "Expected EagerTensor, found PyObject of type: ",
+              Py_TYPE(item)->tp_name);
+        }
       } else {
         s = ConvertNdarrayToTensor(PyList_GetItem(result, i), &t);
       }
@@ -220,16 +235,15 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
       call->out.push_back(t);
     }
   } else if (EagerTensor_CheckExact(result) || result == Py_None) {
+    // result is an `EagerTensor` or `None`.
     DCHECK(call->eager);
     Tensor t;
     if (result != Py_None) {
-      auto tf_status = tensorflow::make_safe(TF_NewStatus());
-      s = ExtractTensorFromEagerTensor(result, &t, tf_status.get());
-      if (s.ok()) {
-        call->out.push_back(t);
-      }
+      ExtractTensorFromEagerTensor(result, &t);
+      call->out.push_back(t);
     }
   } else if (PyArray_Check(result)) {
+    // `result` is a NumPy array.
     DCHECK(!call->eager);
     if (!IsSingleNone(result)) {
       Tensor t;
@@ -239,7 +253,7 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
       }
     }
   } else {
-    s = errors::Internal("Unexpected pyobject is returned: ",
+    s = errors::Internal("Unexpected PyObject was returned: ",
                          Py_TYPE(result)->tp_name);
   }
   Py_DECREF(result);
@@ -429,12 +443,24 @@ class PyFuncOp : public OpKernel {
   explicit PyFuncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("token", &token_));
     eager_ = type_string() == "EagerPyFunc";
+    gpu_ = ctx->device_type().type_string() == DEVICE_GPU;
   }
 
   void Compute(OpKernelContext* ctx) override {
     PyCall call;
     call.token = token_;
+    call.gpu = gpu_;
     call.eager = eager_;
+    if (call.eager) {
+      // Eager's C API uses `Device`, whereas `OpKernelContext` stores a
+      // `DeviceBase`; attempt to downcast.
+      call.device = dynamic_cast<Device*>(ctx->device());
+      if (call.device == nullptr) {
+        ctx->CtxFailureWithWarning(
+            errors::Internal("Unrecognized device class"));
+      }
+    }
+
     for (int i = 0; i < ctx->num_inputs(); ++i) {
       call.ins.push_back(ctx->input(i));
     }
@@ -476,6 +502,9 @@ class PyFuncOp : public OpKernel {
  private:
   string token_;
 
+  // True if and only if this op has been placed on a GPU.
+  bool gpu_;
+
   // True if and only if this op should execute the python function eagerly,
   // i.e., if and only if the eager attribute is set.
   bool eager_;
@@ -486,5 +515,6 @@ class PyFuncOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("PyFunc").Device(DEVICE_CPU), PyFuncOp);
 REGISTER_KERNEL_BUILDER(Name("PyFuncStateless").Device(DEVICE_CPU), PyFuncOp);
 REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_CPU), PyFuncOp);
+REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_GPU), PyFuncOp);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index 4b5072fd67..1b9071ee93 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -50,19 +50,21 @@ class EagerFunc(object):
     self._func = func
     self._out_dtypes = Tout
 
-  def __call__(self, *args, **kwargs):
-    """Passes args, kwargs to `self._func`, which is executed eagerly."""
+  def __call__(self, on_gpu, args):
+    """Passes `args` to `self._func`, which is executed eagerly."""
     with context.eager_mode():
-      ret = self._func(*args, **kwargs)
+      ret = self._func(*args)
+      maybe_copy_to_gpu = lambda x: x if not on_gpu else x.gpu()
       if isinstance(ret, (tuple, list)):
         return [
-            ops.convert_to_tensor(x, dtype=dtype)
+            maybe_copy_to_gpu(ops.convert_to_tensor(x, dtype=dtype))
             for (x, dtype) in zip(ret, self._out_dtypes)
         ]
       elif ret is None:
         return ret
       else:
-        return ops.convert_to_tensor(ret, dtype=self._out_dtypes[0])
+        return maybe_copy_to_gpu(
+            ops.convert_to_tensor(ret, dtype=self._out_dtypes[0]))
 
 
 class FuncRegistry(object):
@@ -116,16 +118,29 @@ class FuncRegistry(object):
     else:
       return result
 
-  def __call__(self, token, args):
-    """Calls the registered function for `token` with args."""
+  def __call__(self, token, on_gpu, args):
+    """Calls the registered function for `token` with args.
+
+    Args:
+      token: A key into this `FuncRegistry` identifying which function to call.
+      on_gpu: A boolean indicating whether or not `token`'s corresponding
+        operation was placed on GPU; only used if the function registered for
+        `token` is an `EagerPyFunc`.
+      args: The arguments to pass to the function registered for `token`.
+
+    Returns:
+      The output of the function registered for `token`.
+
+    Raises:
+      ValueError: if no function is registered for `token`.
+    """
     func = self._funcs[token]
     if func is None:
       raise ValueError("callback %s is not found" % token)
-    ret = func(*args)
-
     if isinstance(func, EagerFunc):
-      return ret
+      return func(on_gpu, args)
     else:
+      ret = func(*args)
       # Strings seem to lead to a memory leak here if they're not wrapped in a
       # list.
       if isinstance(ret, six.binary_type):
@@ -302,8 +317,5 @@ def py_func(func, inp, Tout, stateful=True, name=None):
       func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
 
 
-# TODO(akshayka): PyFuncs where the 'eager' attribute is set to True should be
-# differentiable, i.e., the gradient of PyFunc should propagate Nones if the
-# eager attribute is not set, and otherwise, it should return the gradient.
 ops.NotDifferentiable("PyFunc")
 ops.NotDifferentiable("PyFuncStateless")
-- 
GitLab


From 5ded6fc13bb451d41a0d3a8d874f1c23b2e7c424 Mon Sep 17 00:00:00 2001
From: Makoto Uchida <muchida@google.com>
Date: Mon, 29 Jan 2018 22:27:11 -0800
Subject: [PATCH 376/966] Adds loss_reduction argument to baseline estimators.

PiperOrigin-RevId: 183783628
---
 .../python/estimator/canned/baseline.py       | 20 ++++++++++++++-----
 ...rflow.estimator.-baseline-classifier.pbtxt |  2 +-
 ...orflow.estimator.-baseline-regressor.pbtxt |  2 +-
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py
index 96e4ecd29f..138152ac1c 100644
--- a/tensorflow/python/estimator/canned/baseline.py
+++ b/tensorflow/python/estimator/canned/baseline.py
@@ -57,6 +57,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.training import training_util
 
 # The default learning rate of 0.3 is a historical artifact of the initial
@@ -220,7 +221,8 @@ class BaselineClassifier(estimator.Estimator):
                weight_column=None,
                label_vocabulary=None,
                optimizer='Ftrl',
-               config=None):
+               config=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a BaselineClassifier instance.
 
     Args:
@@ -240,6 +242,8 @@ class BaselineClassifier(estimator.Estimator):
         optimizer to use for training. If not specified, will use
         `FtrlOptimizer` with a default learning rate of 0.3.
       config: `RunConfig` object to configure the runtime settings.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     Returns:
       A `BaselineClassifier` estimator.
 
@@ -249,11 +253,13 @@ class BaselineClassifier(estimator.Estimator):
     if n_classes == 2:
       head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
           weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     else:
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes, weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     def _model_fn(features, labels, mode, config):
       return _baseline_model_fn(
           features=features,
@@ -311,7 +317,8 @@ class BaselineRegressor(estimator.Estimator):
                label_dimension=1,
                weight_column=None,
                optimizer='Ftrl',
-               config=None):
+               config=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a BaselineRegressor instance.
 
     Args:
@@ -328,13 +335,16 @@ class BaselineRegressor(estimator.Estimator):
         optimizer to use for training. If not specified, will use
         `FtrlOptimizer` with a default learning rate of 0.3.
       config: `RunConfig` object to configure the runtime settings.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     Returns:
       A `BaselineRegressor` estimator.
     """
 
     head = head_lib._regression_head_with_mean_squared_error_loss(  # pylint: disable=protected-access
         label_dimension=label_dimension,
-        weight_column=weight_column)
+        weight_column=weight_column,
+        loss_reduction=loss_reduction)
     def _model_fn(features, labels, mode, config):
       return _baseline_model_fn(
           features=features,
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
index ab697b1b95..874a73f661 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
index b73f6433e2..8da2a2b686 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
-- 
GitLab


From 7052c6a77985dccd8754f552f051be6b802e90fe Mon Sep 17 00:00:00 2001
From: Qiumin Xu <qiuminxu@google.com>
Date: Mon, 29 Jan 2018 22:58:19 -0800
Subject: [PATCH 377/966] Add options to enable new features for
 cloud-tpu-profiler. (#16550)

---
 .../pip_package/cloud_tpu_profiler/main.py    | 21 +++++++++++++------
 .../contrib/tpu/profiler/pip_package/setup.py |  2 +-
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 846db13329..885466e5d1 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -17,6 +17,7 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from absl import flags
 
 import os
 import subprocess
@@ -24,13 +25,19 @@ import sys
 
 import tensorflow as tf
 
-tf.flags.DEFINE_string('service_addr', '',
+flags.DEFINE_string('service_addr', None,
                        'Address of TPU profiler service e.g. localhost:8466')
-tf.flags.DEFINE_string('logdir', '',
-                       'Path of TensorBoard log directory e.g. /tmp/tb_log')
-tf.flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.')
-
-FLAGS = tf.flags.FLAGS
+flags.DEFINE_string('logdir', None,
+                       "Path of TensorBoard log directory e.g. /tmp/tb_log, "
+                       "gs://tb_bucket")
+flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.')
+flags.DEFINE_integer('num_tracing_attempts', 3,
+                        "Automatically retry N times when no trace event is "
+                        "collected.")
+flags.DEFINE_boolean('include_dataset_ops', True,
+                     "Set to false to profile longer TPU device traces.")
+
+FLAGS = flags.FLAGS
 EXECUTABLE = 'data/capture_tpu_profile'
 
 
@@ -47,6 +54,8 @@ def main(unused_argv=None):
   cmd.append('--logdir='+logdir)
   cmd.append('--service_addr='+FLAGS.service_addr)
   cmd.append('--duration_ms='+str(FLAGS.duration_ms))
+  cmd.append('--num_tracing_attempts='+str(FLAGS.num_tracing_attempts))
+  cmd.append('--include_dataset_ops='+str(FLAGS.include_dataset_ops).lower())
   subprocess.call(cmd)
 
 
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index 9219663831..3dffebe668 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from setuptools import setup
 
-_VERSION = '1.4.3-a2'
+_VERSION = '1.5.0-rc1'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
-- 
GitLab


From ba64f5334d4bba31d22c30e09a96f806ea0e2f7e Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Tue, 30 Jan 2018 15:59:05 +0900
Subject: [PATCH 378/966] Fix typo (#16562)

* fix typos

* fix typos

* fix typo

* fix typo
---
 .../contrib/lite/nnapi/NeuralNetworksShim.h    |  2 +-
 .../reduce_slice_ops/ops/reduce_slice_ops.cc   | 18 +++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 7019c29959..76032771af 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -1571,7 +1571,7 @@ inline int ANeuralNetworksModel_addOperation(ANeuralNetworksModel* model,
 }
 
 /**
- * Specfifies which operands will be the model's inputs and outputs.
+ * Specifies which operands will be the model's inputs and outputs.
  *
  * An operand cannot be used for both input and output. Doing so will
  * return an error.
diff --git a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
index b8b56c0e22..31e565027f 100644
--- a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
+++ b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
@@ -87,9 +87,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[ 1, 2, 3]
-                        [ 0, 0, 0]
-                        [41,52,63]].
+the output will be [[ 1, 2, 3]
+                    [ 0, 0, 0]
+                    [41,52,63]].
 ```
 
 The data must be at least rank 1. The indices must be of shape (?,2) where the
@@ -132,9 +132,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[ 1,  2,  3]
-                        [ 1,  1,  1]
-                        [40,100,180]].
+the output will be [[ 1,  2,  3]
+                    [ 1,  1,  1]
+                    [40,100,180]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
@@ -189,9 +189,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[          1,         20,          3]
-                        [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE]
-                        [        400,         20,         60]].
+the output will be [[          1,         20,          3]
+                    [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE]
+                    [        400,         20,         60]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
-- 
GitLab


From 9560054b18ad3fc9faa296d344e062876858817a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 08:08:27 -0800
Subject: [PATCH 379/966] Support `mode` option to discriminator function in
 GANEstimator.

This supports operations like batch normalization, which have different train and eval behavior.

PiperOrigin-RevId: 183833519
---
 .../gan/python/estimator/python/gan_estimator_impl.py  | 10 ++++++++--
 .../gan/python/estimator/python/gan_estimator_test.py  |  4 ++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 0d51c282a8..082c42eba1 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -59,7 +59,11 @@ _summary_type_map = {
 class GANEstimator(estimator.Estimator):
   """An estimator for Generative Adversarial Networks (GANs).
 
-  This Estimator is backed by TFGAN.
+  This Estimator is backed by TFGAN. The network functions follow the TFGAN API
+  except for one exception: if either `generator_fn` or `discriminator_fn` have
+  an argument called `mode`, then the tf.Estimator mode is passed in for that
+  argument. This helps with operations like batch normalization, which have
+  different train and evaluation behavior.
 
   Example:
 
@@ -233,9 +237,11 @@ def _gan_model_fn(
 def _make_gan_model(generator_fn, discriminator_fn, real_data,
                     generator_inputs, generator_scope, add_summaries, mode):
   """Make a `GANModel`, and optionally pass in `mode`."""
-  # If `generator_fn` has an argument `mode`, pass mode to it.
+  # If network functions have an argument `mode`, pass mode to it.
   if 'mode' in inspect.getargspec(generator_fn).args:
     generator_fn = functools.partial(generator_fn, mode=mode)
+  if 'mode' in inspect.getargspec(discriminator_fn).args:
+    discriminator_fn = functools.partial(discriminator_fn, mode=mode)
   gan_model = tfgan_train.gan_model(
       generator_fn,
       discriminator_fn,
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index e752f0bccc..387a62bd74 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -54,7 +54,8 @@ def generator_fn(noise_dict, mode):
   return layers.fully_connected(noise, noise.shape[1].value)
 
 
-def discriminator_fn(data, _):
+def discriminator_fn(data, unused_conditioning, mode):
+  del unused_conditioning, mode
   return layers.fully_connected(data, 1)
 
 
@@ -99,7 +100,6 @@ def mock_head(testcase, expected_generator_inputs, expected_real_data,
     else:
       testcase.assertEqual(discriminator_scope_name,
                            gan_model.discriminator_scope.name)
-    testcase.assertEqual(_or_none(discriminator_fn), gan_model.discriminator_fn)
 
     with ops.control_dependencies(assertions):
       if mode == model_fn_lib.ModeKeys.TRAIN:
-- 
GitLab


From f47e05b65bb15d595d24b4b7f0da0f2648c5b30e Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Tue, 30 Jan 2018 08:36:09 -0800
Subject: [PATCH 380/966] Fix GOOGLE_TENSORRT

---
 tensorflow/contrib/tensorrt/log/trt_logger.cc       | 5 +++--
 tensorflow/contrib/tensorrt/log/trt_logger.h        | 1 +
 tensorflow/contrib/tensorrt/segment/segment_test.cc | 9 ---------
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index deeba43474..2473b8effc 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 
@@ -52,5 +53,5 @@ void Logger::log(Severity severity, const char* msg) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif
-#endif
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index f82f3188e8..c07a3e6b2d 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -39,4 +39,5 @@ class Logger : public nvinfer1::ILogger {
 
 #endif // GOOGLE_TENSORRT
 #endif // GOOGLE_CUDA
+
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index ae491c38d1..5f99ba570b 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -13,9 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
-
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -24,9 +21,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/test.h"
 
-//------------------------------------------------------------------------------
 namespace tensorflow {
-
 namespace tensorrt {
 namespace segment {
 namespace test {
@@ -363,8 +358,4 @@ TEST_F(SegmentTest, BigIfElse) {
 }  // namespace test
 }  // namespace segment
 }  // namespace tensorrt
-
 }  // namespace tensorflow
-
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
-- 
GitLab


From 4d3c6ca4fc99aeefa1370b51c9a3e1bcd26dd04a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 08:35:46 -0800
Subject: [PATCH 381/966] Changed the heap simulator to allow it to be
 configured about whether to issue Alloc/Free for constants, and enable buffer
 sharing.

PiperOrigin-RevId: 183836922
---
 .../compiler/xla/service/buffer_assignment.cc | 10 ++--
 .../compiler/xla/service/heap_simulator.cc    | 53 ++++++++++---------
 .../compiler/xla/service/heap_simulator.h     | 30 +++++++----
 3 files changed, 54 insertions(+), 39 deletions(-)

diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index d5594dc07c..774b11478c 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -997,14 +997,15 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       auto color = single_colored_set.first;
       VLOG(2) << "Simulating heap for color " << color;
       int64 alignment = assignment->color_alignment_(color);
+      HeapSimulator::Options options;
+      options.buffers_to_assign = &single_colored_set.second;
       TF_ASSIGN_OR_RETURN(
           const HeapSimulator::Result result,
           HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
                                  MakeUnique<LazyBestFitHeap>(alignment)),
                              assignment->module(), module_sequence,
                              assignment->points_to_analysis(),
-                             assignment->buffer_size_,
-                             &single_colored_set.second));
+                             assignment->buffer_size_, options));
       AssignBuffersFromHeapSimulator(result, assignment,
                                      single_colored_set.first);
     }
@@ -1024,14 +1025,15 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
         auto color = single_colored_set.first;
         VLOG(2) << "Simulating heap for color " << color;
         int64 alignment = assignment->color_alignment_(color);
+        HeapSimulator::Options options;
+        options.buffers_to_assign = &single_colored_set.second;
         TF_ASSIGN_OR_RETURN(
             const HeapSimulator::Result result,
             HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
                                    MakeUnique<LazyBestFitHeap>(alignment)),
                                *computation, *instruction_sequence,
                                assignment->points_to_analysis(),
-                               assignment->buffer_size_,
-                               &single_colored_set.second));
+                               assignment->buffer_size_, options));
         AssignBuffersFromHeapSimulator(result, assignment,
                                        single_colored_set.first);
       }
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 34e2f7ee20..cde5877e29 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -64,10 +64,8 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
     const SequentialHloOrdering::HloModuleSequence& module_sequence,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_fn,
-    const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
-  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign,
-                     &module_sequence);
+    const LogicalBuffer::SizeFunction& size_fn, const Options& options) {
+  HeapSimulator heap(std::move(algorithm), size_fn, options, &module_sequence);
   const HloComputation* entry_computation = module.entry_computation();
   const std::vector<const HloInstruction*>& instruction_sequence =
       FindOrDie(module_sequence, entry_computation);
@@ -81,9 +79,8 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     std::unique_ptr<HeapAlgorithm> algorithm, const HloComputation& computation,
     const std::vector<const HloInstruction*>& instruction_sequence,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_fn,
-    const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
-  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign,
+    const LogicalBuffer::SizeFunction& size_fn, const Options& options) {
+  HeapSimulator heap(std::move(algorithm), size_fn, options,
                      /*module_sequence=*/nullptr);
   TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence,
                                          points_to_analysis));
@@ -199,15 +196,17 @@ Status HeapSimulator::RunComputation(
       // We can only share with the operand buffer if it is about to be freed;
       // we must be the last user of the buffer.
       bool shared = false;
-      for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) {
-        if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) &&
-            buffer->instruction()->opcode() != HloOpcode::kCopy &&
-            CanShareOperandBufferWithUser(
-                operand_buffer->instruction(), operand_buffer->index(),
-                buffer->instruction(), buffer->index(), points_to_analysis)) {
-          ShareBuffer(buffer, operand_buffer, instruction);
-          shared = true;
-          break;
+      if (options_.may_reuse_operand_buffers) {
+        for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) {
+          if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) &&
+              buffer->instruction()->opcode() != HloOpcode::kCopy &&
+              CanShareOperandBufferWithUser(
+                  operand_buffer->instruction(), operand_buffer->index(),
+                  buffer->instruction(), buffer->index(), points_to_analysis)) {
+            ShareBuffer(buffer, operand_buffer, instruction);
+            shared = true;
+            break;
+          }
         }
       }
 
@@ -266,13 +265,12 @@ Status HeapSimulator::RunComputation(
 
 HeapSimulator::HeapSimulator(
     std::unique_ptr<HeapAlgorithm> algorithm,
-    const LogicalBuffer::SizeFunction& size_fn,
-    const FlatSet<const LogicalBuffer*>* buffers_to_assign,
+    const LogicalBuffer::SizeFunction& size_fn, const Options& options,
     const SequentialHloOrdering::HloModuleSequence* module_sequence)
     : no_fragmentation_stats_(MakeUnique<NoFragmentationStatsHeap>()),
       algorithm_(std::move(algorithm)),
       size_fn_(size_fn),
-      buffers_to_assign_(buffers_to_assign),
+      options_(options),
       module_sequence_(module_sequence) {
   debug_trace_.set_whole_module_simulation(module_sequence_ != nullptr);
 }
@@ -280,13 +278,16 @@ HeapSimulator::HeapSimulator(
 HeapSimulator::~HeapSimulator() {}
 
 bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const {
-  // Buffers for constants are ignored, as with BufferAssigner.  Also ignore
-  // buffers that we're not meant to assign.
+  // Buffers for constants are ignored unless the alloc_constants option is
+  // set. Also ignore buffers that we're not meant to assign.
   //
   // TODO(b/32248867): For consistency, constants should get allocations.
-  return buffer->instruction()->opcode() == HloOpcode::kConstant ||
-         (buffers_to_assign_ != nullptr &&
-          buffers_to_assign_->count(buffer) == 0);
+  if (!options_.alloc_constants &&
+      buffer->instruction()->opcode() == HloOpcode::kConstant) {
+    return true;
+  }
+  return options_.buffers_to_assign != nullptr &&
+         options_.buffers_to_assign->count(buffer) == 0;
 }
 
 // Alloc always calls the underlying heap algorithm.
@@ -400,8 +401,8 @@ HeapSimulator::Result HeapSimulator::Finish() {
     }
     // If we were told to assign specific buffers, make sure we've assigned
     // exactly that many buffers.
-    if (buffers_to_assign_ != nullptr) {
-      CHECK_EQ(buffers_to_assign_->size(), result.chunk_map.size());
+    if (options_.buffers_to_assign != nullptr) {
+      CHECK_EQ(options_.buffers_to_assign->size(), result.chunk_map.size());
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index 88a8698d16..636f19dd39 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -67,6 +67,23 @@ class HeapSimulator {
     HeapSimulatorTrace debug_trace;
   };
 
+  // The different options to be passed to the Run() APIs.
+  struct Options {
+    Options()
+        : may_reuse_operand_buffers(true),
+          alloc_constants(false),
+          buffers_to_assign(nullptr) {}
+
+    // Whether a buffer about to be Free()-ed, can be recycled for a new born
+    // one, hence collapsing Free()+Alloc() calls (default true).
+    bool may_reuse_operand_buffers;
+    // Whether to issue Alloc() and Free() calls for constants (default false).
+    bool alloc_constants;
+    // If 'buffers_to_assign' is provided, only those buffers are assigned
+    // offsets, otherwise all buffers defined by the instructions are assigned.
+    const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign;
+  };
+
   // Run the heap simulation with the given algorithm, assuming the given
   // module_sequence, which must contain a topologically-consistent total
   // ordering of all instructions within each computation. The result is invalid
@@ -76,15 +93,12 @@ class HeapSimulator {
   // to running on a per-computation basis, since we can re-use buffer space for
   // called sub-computations.
   //
-  // If 'buffers_to_assign' is provided, only those buffers are assigned
-  // offsets, otherwise all buffers defined by the instructions are assigned.
   static StatusOr<Result> Run(
       std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
       const SequentialHloOrdering::HloModuleSequence& module_sequence,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_fn,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign =
-          nullptr);
+      const Options& options = Options());
 
   // Same as above, but runs on a single computation. The 'instruction_sequence'
   // must contain a topologically-consistent total ordering of all instructions
@@ -96,8 +110,7 @@ class HeapSimulator {
       const std::vector<const HloInstruction*>& instruction_sequence,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_fn,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign =
-          nullptr);
+      const Options& options = Options());
 
  private:
   // If 'module_sequence' is non-null, it is used to find kCall and kWhile
@@ -105,8 +118,7 @@ class HeapSimulator {
   // be run recursively. I.e. the simulation is run over the whole module.
   HeapSimulator(
       std::unique_ptr<HeapAlgorithm> algorithm,
-      const LogicalBuffer::SizeFunction& size_fn,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign,
+      const LogicalBuffer::SizeFunction& size_fn, const Options& options,
       const SequentialHloOrdering::HloModuleSequence* module_sequence);
   ~HeapSimulator();
 
@@ -130,7 +142,7 @@ class HeapSimulator {
   const std::unique_ptr<HeapAlgorithm> no_fragmentation_stats_;
   const std::unique_ptr<HeapAlgorithm> algorithm_;
   const LogicalBuffer::SizeFunction size_fn_;
-  const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign_;
+  const Options options_;
   const SequentialHloOrdering::HloModuleSequence* module_sequence_;
 
   // In addition to Alloc and Free, the heap simulator exposes a concept of
-- 
GitLab


From c8b884c683e260b42c15883f1c14caac4ea8d000 Mon Sep 17 00:00:00 2001
From: Chris Leary <leary@google.com>
Date: Tue, 30 Jan 2018 08:43:37 -0800
Subject: [PATCH 382/966] [XLA] Plumb build options via local API.

* Break build options into their own translation unit for use from local client
  and to mirror ExecutableRunOptions.
* Add some ToString()s to aid debugging.
* Add HLO graph generation regex to build options.
* Add SWIG type map for ExecutableBuildOptions.

Also fix a build issue occurring on some platforms with triangular_solve.

PiperOrigin-RevId: 183837856
---
 .../compiler/tf2xla/lib/triangular_solve.cc   |   9 +-
 tensorflow/compiler/xla/BUILD                 |   4 +
 tensorflow/compiler/xla/client/BUILD          |  13 +++
 .../xla/client/executable_build_options.cc    |  79 +++++++++++++
 .../xla/client/executable_build_options.h     |  74 ++++++++++++
 .../compiler/xla/client/local_client.cc       | 107 +++++++-----------
 tensorflow/compiler/xla/client/local_client.h |  58 ++--------
 .../compiler/xla/executable_run_options.cc    |  17 +++
 .../compiler/xla/executable_run_options.h     |   7 ++
 tensorflow/compiler/xla/python/BUILD          |   1 +
 .../xla/python/local_computation_builder.cc   |   6 +-
 .../xla/python/local_computation_builder.h    |   4 +-
 .../xla/python/local_computation_builder.i    |  34 ++++++
 tensorflow/compiler/xla/python/xla_client.py  |  21 +++-
 tensorflow/compiler/xla/service/BUILD         |   1 +
 .../compiler/xla/service/hlo_graph_dumper.cc  |   6 +-
 .../compiler/xla/service/local_service.cc     |  27 +++--
 .../compiler/xla/service/local_service.h      |   4 +-
 18 files changed, 332 insertions(+), 140 deletions(-)
 create mode 100644 tensorflow/compiler/xla/client/executable_build_options.cc
 create mode 100644 tensorflow/compiler/xla/client/executable_build_options.h

diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
index 5f0445dd44..7f72a6073d 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -419,16 +419,15 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
   //   init = (m-2, output, a, b)
   // else:
   //   init = (1, output, a, b)
-  std::vector<xla::Shape> tuple_shapes({
+  std::vector<xla::Shape> tuple_shapes = {
       // The loop iteration counter is a scalar, incremented each iteration.
       xla::ShapeUtil::MakeShape(xla::S32, {}),
       // The output has the shape of b, with one row updated each iteration.
-      xla::ShapeUtil::MakeShape(b_shape->element_type(), b_shape->dimensions()),
+      *b_shape,
       // The coefficient matrix a is a loop invariant.
-      xla::ShapeUtil::MakeShape(a_shape->element_type(), a_shape->dimensions()),
+      *a_shape,
       // The right-hand-side matrix b is a loop invariant.
-      xla::ShapeUtil::MakeShape(b_shape->element_type(), b_shape->dimensions()),
-  });
+      *b_shape};
   xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
   auto init_i = builder->ConstantR0<int32>(transpose_a ? m - 2 : 1);
   auto init = builder->Tuple({init_i, output, a, b});
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index c22fd37129..38e39afdc0 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -444,6 +444,10 @@ cc_library(
     srcs = ["executable_run_options.cc"],
     hdrs = ["executable_run_options.h"],
     visibility = ["//visibility:public"],
+    deps = [
+        ":types",
+        "//tensorflow/core:lib",
+    ],
 )
 
 cc_library(
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 952109dde2..02356699a2 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -80,6 +80,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "executable_build_options",
+    srcs = ["executable_build_options.cc"],
+    hdrs = ["executable_build_options.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "local_client",
     srcs = ["local_client.cc"],
@@ -87,6 +99,7 @@ cc_library(
     deps = [
         ":client",
         ":computation",
+        ":executable_build_options",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
new file mode 100644
index 0000000000..804e34f5e7
--- /dev/null
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
+
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace xla {
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator(
+    DeviceMemoryAllocator* allocator) {
+  device_allocator_ = allocator;
+  return *this;
+}
+
+DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
+  return device_allocator_;
+}
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
+    int device_ordinal) {
+  CHECK_GE(device_ordinal, 0);
+  device_ordinal_ = device_ordinal;
+  return *this;
+}
+
+int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; }
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout(
+    const Shape& shape_with_layout) {
+  result_layout_set_ = true;
+  result_layout_ = shape_with_layout;
+  return *this;
+}
+
+const Shape* ExecutableBuildOptions::result_layout() const {
+  return result_layout_set_ ? &result_layout_ : nullptr;
+}
+
+string ExecutableBuildOptions::ToString() const {
+  string result_layout = "nullopt";
+  if (result_layout_set_) {
+    result_layout = ShapeUtil::HumanStringWithLayout(result_layout_);
+  }
+  string generate_hlo_graph = "nullopt";
+  if (generate_hlo_graph_.has_value()) {
+    generate_hlo_graph = generate_hlo_graph_.value();
+  }
+  return tensorflow::strings::Printf(
+      "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, "
+      "generate_hlo_graph=%s}",
+      device_ordinal_, result_layout.c_str(), generate_hlo_graph.c_str());
+}
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph(
+    string regex) {
+  generate_hlo_graph_ = std::move(regex);
+  return *this;
+}
+
+const tensorflow::gtl::optional<string>&
+ExecutableBuildOptions::generate_hlo_graph() const {
+  return generate_hlo_graph_;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
new file mode 100644
index 0000000000..3a52dbac9a
--- /dev/null
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
+
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+
+namespace xla {
+
+// Class containing options for building an LocalExecutable with
+// LocalClient::Compile.
+class ExecutableBuildOptions {
+ public:
+  // If set, this is the device to build the computation for. Valid
+  // device_ordinal values are: 0 to # of devices - 1. These values are
+  // identical to the device ordinal values used by StreamExecutor. The built
+  // executable will be executable on any device equivalent to the specified
+  // device as determined by Backend::devices_equivalent(). A value of -1
+  // indicates this option has not been set.
+  ExecutableBuildOptions& set_device_ordinal(int device_ordinal);
+  int device_ordinal() const;
+
+  // If set, this specifies the layout of the result of the computation. If not
+  // set, the service will chose the layout of the result. A Shape is used to
+  // store the layout to accommodate tuple result shapes. A value of nullptr
+  // indicates the option has not been set.
+  ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
+  const Shape* result_layout() const;
+
+  // If set, this specifies an allocator that can be used to allocate temporary
+  // space on the device during compilation.  For example, the compiler might
+  // want to run various algorithms on the device and pick the fastest one -- it
+  // might allocate buffers for use by these algorithms using this allocator.
+  //
+  // This does not need to be the same as the DeviceMemoryAllocator passed when
+  // running the executable.
+  ExecutableBuildOptions& set_device_allocator(
+      DeviceMemoryAllocator* allocator);
+  DeviceMemoryAllocator* device_allocator() const;
+
+  // If set, specifies a regexp of HLO graphs to dump (as in DebugOptions).
+  ExecutableBuildOptions& set_generate_hlo_graph(string regex);
+  const tensorflow::gtl::optional<string>& generate_hlo_graph() const;
+
+  // Returns a string representation of the build options, suitable for
+  // debugging.
+  string ToString() const;
+
+ private:
+  int device_ordinal_ = -1;
+  Shape result_layout_;
+  bool result_layout_set_ = false;
+  tensorflow::gtl::optional<string> generate_hlo_graph_;
+  DeviceMemoryAllocator* device_allocator_ = nullptr;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index e45787fca6..ef98dbb640 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -30,35 +30,6 @@ using xla::source_map_util::InvalidParameterArgument;
 
 namespace xla {
 
-ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
-    int device_ordinal) {
-  device_ordinal_ = device_ordinal;
-  return *this;
-}
-
-int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; }
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout(
-    const Shape& shape_with_layout) {
-  result_layout_set_ = true;
-  result_layout_ = shape_with_layout;
-  return *this;
-}
-
-const Shape* ExecutableBuildOptions::result_layout() const {
-  return result_layout_set_ ? &result_layout_ : nullptr;
-}
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator(
-    DeviceMemoryAllocator* allocator) {
-  device_allocator_ = allocator;
-  return *this;
-}
-
-DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
-  return device_allocator_;
-}
-
 namespace {
 StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
                                                    Backend* backend) {
@@ -70,16 +41,18 @@ StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
 }  // namespace
 
 LocalExecutable::LocalExecutable(std::unique_ptr<Executable> executable,
-                                 Backend* backend, int device_ordinal,
-                                 const ExecutableBuildOptions& build_options)
+                                 Backend* backend,
+                                 ExecutableBuildOptions build_options)
     : executable_(std::move(executable)),
       backend_(backend),
-      build_device_ordinal_(device_ordinal),
-      build_options_(build_options) {}
+      build_options_(std::move(build_options)) {
+  CHECK_GE(build_options_.device_ordinal(), 0)
+      << "Must have a valid device ordinal that the executable was built for.";
+}
 
 tensorflow::Status LocalExecutable::ValidateExecutionOptions(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const ExecutableRunOptions& options, const Backend& backend) {
+    const ExecutableRunOptions& run_options, const Backend& backend) {
   const ComputationLayout& computation_layout =
       executable_->module_config().entry_computation_layout();
 
@@ -103,14 +76,14 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
     }
   }
 
-  if (options.stream() != nullptr) {
-    if (!options.stream()->ok()) {
+  if (run_options.stream() != nullptr) {
+    if (!run_options.stream()->ok()) {
       return InvalidArgument("stream is uninitialized or in an error state");
     }
 
     // Check stream matches service platform.
     const se::Platform* stream_platform =
-        options.stream()->parent()->platform();
+        run_options.stream()->parent()->platform();
     if (stream_platform != backend_->platform()) {
       return InvalidArgument(
           "stream is for platform %s, but service targets platform %s",
@@ -120,7 +93,7 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
 
     // Cannot specify device_ordinal with a stream. The stream determines these
     // values.
-    if (options.device_ordinal() != -1) {
+    if (run_options.device_ordinal() != -1) {
       return InvalidArgument(
           "cannot set both device ordinal and stream options in "
           "ExecutableRunOptions; the stream determines the device ordinal");
@@ -129,34 +102,34 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
 
   // Verify that the device the executable was built for is equivalent to the
   // device it will run on.
-  int run_device_ordinal = options.device_ordinal() == -1
+  int run_device_ordinal = run_options.device_ordinal() == -1
                                ? backend_->default_device_ordinal()
-                               : options.device_ordinal();
-  TF_ASSIGN_OR_RETURN(
-      bool devices_equivalent,
-      backend_->devices_equivalent(run_device_ordinal, build_device_ordinal_));
+                               : run_options.device_ordinal();
+  TF_ASSIGN_OR_RETURN(bool devices_equivalent,
+                      backend_->devices_equivalent(
+                          run_device_ordinal, build_options_.device_ordinal()));
   if (!devices_equivalent) {
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * run_executor,
                         backend_->stream_executor(run_device_ordinal));
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * build_executor,
-                        backend_->stream_executor(build_device_ordinal_));
+                        backend_->stream_executor(build_device_ordinal()));
     return InvalidArgument(
         "executable is built for device %s of type \"%s\"; cannot run it on "
         "device %s of type \"%s\"",
-        backend_->device_name(build_device_ordinal_).c_str(),
+        backend_->device_name(build_device_ordinal()).c_str(),
         build_executor->GetDeviceDescription().name().c_str(),
         backend_->device_name(run_device_ordinal).c_str(),
         run_executor->GetDeviceDescription().name().c_str());
   }
 
-  if (!options.allocator()) {
+  if (!run_options.allocator()) {
     return InvalidArgument("an allocator must be provided to ExecuteLocally");
   }
 
-  if (options.allocator()->platform() != backend.platform()) {
+  if (run_options.allocator()->platform() != backend.platform()) {
     return InvalidArgument(
         "allocator platform (%s) does not match service platform (%s)",
-        options.allocator()->platform()->Name().c_str(),
+        run_options.allocator()->platform()->Name().c_str(),
         backend.platform()->Name().c_str());
   }
 
@@ -165,23 +138,22 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
 
 StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const ExecutableRunOptions& options) {
-  TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_));
-
-  ExecutableRunOptions actual_options = options;
+    ExecutableRunOptions run_options) {
+  TF_RETURN_IF_ERROR(
+      ValidateExecutionOptions(arguments, run_options, *backend_));
 
   Backend::StreamPtr stream;
-  if (options.stream() == nullptr) {
+  if (run_options.stream() == nullptr) {
     // NB!  The lifetime of `stream` needs to match the lifetime of
     // `actual_options` (otherwise we will end up using a returned stream in
     // ExecuteOnStreamWrapper), which is why it isn't declared in the inner "if"
     // scope.
     TF_ASSIGN_OR_RETURN(
-        stream, BorrowStreamForDevice(options.device_ordinal(), backend_));
-    actual_options.set_stream(stream.get());
+        stream, BorrowStreamForDevice(run_options.device_ordinal(), backend_));
+    run_options.set_stream(stream.get());
   }
-  if (options.allocator() == nullptr) {
-    actual_options.set_allocator(backend_->memory_allocator());
+  if (run_options.allocator() == nullptr) {
+    run_options.set_allocator(backend_->memory_allocator());
   }
 
   // For local client execution on CPU backends:
@@ -190,7 +162,7 @@ StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
   // *) The thread pool used for XLA CPU ops is from
   //    backend_->eigen_intra_op_thread_pool().
   ServiceExecutableRunOptions service_options(
-      actual_options, backend_->StreamBorrower(),
+      run_options, backend_->StreamBorrower(),
       backend_->eigen_intra_op_thread_pool());
 
   if (executable_->dumping()) {
@@ -199,9 +171,8 @@ StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<ShapedBuffer> result,
       executable_->ExecuteOnStreamWrapper(
-          &service_options, options.execution_profile(), arguments));
-  return ScopedShapedBuffer::MakeScoped(result.get(),
-                                        actual_options.allocator());
+          &service_options, run_options.execution_profile(), arguments));
+  return ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator());
 }
 
 StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::ExecuteAndDump(
@@ -277,17 +248,19 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
     const Computation& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
     const ExecutableBuildOptions& options) {
-  int device_ordinal = options.device_ordinal() == -1
-                           ? default_device_ordinal()
-                           : options.device_ordinal();
+  ExecutableBuildOptions updated_options = options;
+  if (options.device_ordinal() == -1) {
+    updated_options.set_device_ordinal(default_device_ordinal());
+    VLOG(3) << "Set device ordinal to default value of: "
+            << updated_options.device_ordinal();
+  }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
       local_service_->CompileExecutable(computation.handle(), argument_layouts,
-                                        options.result_layout(), device_ordinal,
-                                        options.device_allocator()));
+                                        updated_options));
   return WrapUnique(new LocalExecutable(std::move(executable),
                                         local_service_->mutable_backend(),
-                                        device_ordinal, options));
+                                        updated_options));
 }
 
 StatusOr<std::unique_ptr<ScopedShapedBuffer>>
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 843ad7aa85..b52a30f5a0 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -33,51 +34,13 @@ limitations under the License.
 
 namespace xla {
 
-// Class containing options for building an LocalExecutable with
-// LocalClient::Compile.
-class ExecutableBuildOptions {
- public:
-  // If set, this is the device to build the computation for. Valid
-  // device_ordinal values are: 0 to # of devices - 1. These values are
-  // identical to the device ordinal values used by StreamExecutor. The built
-  // executable will be executable on any device equivalent to the specified
-  // device as determined by Backend::devices_equivalent(). A value of -1
-  // indicates this option has not been set.
-  ExecutableBuildOptions& set_device_ordinal(int device_ordinal);
-  int device_ordinal() const;
-
-  // If set, this specifies the layout of the result of the computation. If not
-  // set, the service will chose the layout of the result. A Shape is used to
-  // store the layout to accommodate tuple result shapes. A value of nullptr
-  // indicates the option has not been set.
-  ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
-  const Shape* result_layout() const;
-
-  // If set, this specifies an allocator that can be used to allocate temporary
-  // space on the device during compilation.  For example, the compiler might
-  // want to run various algorithms on the device and pick the fastest one -- it
-  // might allocate buffers for use by these algorithms using this allocator.
-  //
-  // This does not need to be the same as the DeviceMemoryAllocator passed when
-  // running the executable.
-  ExecutableBuildOptions& set_device_allocator(
-      DeviceMemoryAllocator* allocator);
-  DeviceMemoryAllocator* device_allocator() const;
-
- private:
-  int device_ordinal_ = -1;
-  Shape result_layout_;
-  bool result_layout_set_ = false;
-  DeviceMemoryAllocator* device_allocator_ = nullptr;
-};
-
 class LocalExecutable {
  public:
   // Run the compiled computation with the given arguments and options and
   // return the result.
   StatusOr<std::unique_ptr<ScopedShapedBuffer>> Run(
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      const ExecutableRunOptions& options);
+      ExecutableRunOptions run_options);
 
   // Return the layout (contained in a shape) of the result produced by the
   // computation.
@@ -100,8 +63,7 @@ class LocalExecutable {
 
   // Constructor invoked by LocalClient.
   LocalExecutable(std::unique_ptr<Executable> executable, Backend* backend,
-                  int device_ordinal,
-                  const ExecutableBuildOptions& build_options);
+                  ExecutableBuildOptions build_options);
 
   // Validates that the given arguments and options satisfy various constraints
   // of the computation.
@@ -129,19 +91,19 @@ class LocalExecutable {
   StatusOr<std::unique_ptr<Literal>> LiteralFromShapedBuffer(
       const ShapedBuffer& shaped_buffer);
 
+  // The ordinal of the device which this executable was compiled for. The
+  // executable can run on all equivalent devices (as determined by
+  // Backend::devices_equivalent).
+  int build_device_ordinal() const { return build_options_.device_ordinal(); }
+
   // Compiled computation.
   std::unique_ptr<Executable> executable_;
 
   // Execution backend.
-  Backend* backend_;
-
-  // The ordinal of the device which this executable was compiled for. The
-  // executable can run on all equivalent devices (as determined by
-  // Backend::devices_equivalent).
-  int build_device_ordinal_;
+  Backend* backend_ = nullptr;
 
   // Options used to build the executable.
-  const ExecutableBuildOptions& build_options_;
+  const ExecutableBuildOptions build_options_;
 };
 
 // An XLA Client specialization for use when the client and service run in
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 392ad9010a..f8bb8e52c7 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/executable_run_options.h"
 
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
 namespace xla {
 
 ExecutableRunOptions& ExecutableRunOptions::set_device_ordinal(
@@ -87,4 +89,19 @@ const DeviceAssignment* ExecutableRunOptions::device_assignment() const {
   return device_assignment_;
 }
 
+string ExecutableRunOptions::ToString() const {
+  return tensorflow::strings::Printf(
+      "ExecutableRunOptions{allocator=%p, device_ordinal=%d, "
+      "device_assignment=%p, stream=%p, inter_op_thread_pool=%p, "
+      "intra_op_thread_pool=%p, execution_profile=%p}",
+      allocator_, device_ordinal_, device_assignment_, stream_,
+      inter_op_thread_pool_, intra_op_thread_pool_, execution_profile_);
+}
+
+std::ostream& operator<<(std::ostream& out,
+                         const ExecutableRunOptions& options) {
+  out << options.ToString();
+  return out;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index d4fcbf0493..c7a20bb33c 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
+#include "tensorflow/compiler/xla/types.h"
+
 // Intentionally forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@@ -84,6 +86,8 @@ class ExecutableRunOptions {
       DeviceAssignment* device_assignment);
   const DeviceAssignment* device_assignment() const;
 
+  string ToString() const;
+
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
@@ -94,6 +98,9 @@ class ExecutableRunOptions {
   ExecutionProfile* execution_profile_ = nullptr;
 };
 
+std::ostream& operator<<(std::ostream& out,
+                         const ExecutableRunOptions& options);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index a8ca0e3ea0..e2972f0601 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -49,6 +49,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:executable_build_options",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:framework_lite",
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 37f1eada2b..5772532b84 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -233,7 +233,8 @@ LocalComputation::LocalComputation(Computation computation)
     : computation_(std::move(computation)) {}
 
 StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
-    const std::vector<Shape>& argument_shapes) {
+    const std::vector<Shape>& argument_shapes,
+    const ExecutableBuildOptions* build_options) {
   std::vector<const Shape*> argument_shape_pointers;
   argument_shape_pointers.reserve(argument_shapes.size());
   for (auto& argument_shape : argument_shapes) {
@@ -242,6 +243,9 @@ StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
 
   LocalClient* client = GetOrCreateLocalClient();
   ExecutableBuildOptions options;
+  if (build_options != nullptr) {
+    options = *build_options;
+  }
   TF_ASSIGN_OR_RETURN(
       auto local_executable,
       client->Compile(computation_, argument_shape_pointers, options));
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index e5503cd52f..6851c2644d 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -93,7 +94,8 @@ class LocalComputation {
  public:
   LocalComputation(Computation computation);
   StatusOr<CompiledLocalComputation*> Compile(
-      const std::vector<Shape>& argument_shapes);
+      const std::vector<Shape>& argument_shapes,
+      const ExecutableBuildOptions* build_options);
   const Computation& computation() const;
 
  private:
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 3178925960..6a52a088dd 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -176,6 +176,16 @@ tensorflow::ImportNumpy();
   }
 }
 
+%typemap(out) StatusOr< std::unique_ptr<Literal> > {
+  if ($1.ok()) {
+    std::unique_ptr<Literal> value = $1.ConsumeValueOrDie();
+    $result = numpy::PyObjectFromXlaLiteral(*value);
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    return NULL;
+  }
+}
+
 %typemap(out) StatusOr<xla::swig::LocalComputation*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
@@ -623,6 +633,30 @@ tensorflow::ImportNumpy();
   $1 = &dimension_numbers;
 }
 
+// ExecutableBuildOptions
+
+%typemap(in) const ExecutableBuildOptions*
+    (ExecutableBuildOptions build_options) {
+  if ($input == Py_None) {
+    $1 = NULL;
+  } else {
+    PyObject* o = PyObject_GetAttrString($input, "generate_hlo_graph");
+    if (!o) {
+      return NULL;
+    }
+    if (o != Py_None) {
+      if (!PyString_Check(o)) {
+        PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.generate_hlo_graph must be a string or None.");
+        return NULL;
+      }
+      build_options.set_generate_hlo_graph(PyString_AsString(o));
+    }
+    Py_DECREF(o);
+
+    $1 = &build_options;
+  }
+}
+
 %ignoreall
 %unignore xla;
 %unignore xla::swig;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 66ace613a0..a89e2643c8 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -260,6 +260,17 @@ def require_numpy_array_layout(value):
     return np.require(value, requirements=['C', 'A'])
 
 
+class CompileOptions(object):
+  """Python object for XLA compile options.
+
+  These options can be passed to the 'compile' step when using a local XLA
+  client.
+  """
+
+  def __init__(self):
+    self.generate_hlo_graph = None
+
+
 def transfer_to_infeed(value, replica_number=None):
   """Transfers the given value into the XLA infeed queue.
 
@@ -313,16 +324,18 @@ class LocalComputation(object):
     else:
       self._delete = c_api.DeleteLocalComputation
 
-  def Compile(self, argument_shapes=()):
+  def Compile(self, argument_shapes=(), compile_options=None):
     if self.is_compiled:
       raise ValueError('Attempt to compile a compiled local XLA computation.')
     return LocalComputation(
-        self.c_local_computation.Compile(_unwrap_shapes(argument_shapes)),
+        self.c_local_computation.Compile(
+            _unwrap_shapes(argument_shapes), compile_options),
         is_compiled=True)
 
-  def CompileWithExampleArguments(self, arguments=()):
+  def CompileWithExampleArguments(self, arguments=(), compile_options=None):
     return self.Compile(
-        argument_shapes=[Shape.from_numpy(arg) for arg in arguments])
+        argument_shapes=[Shape.from_numpy(arg) for arg in arguments],
+        compile_options=compile_options)
 
   def Execute(self, arguments=()):
     if not self.is_compiled:
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index a04ba7ae3e..93cc5ab1a9 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -511,6 +511,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:executable_build_options",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
     ],
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index c744c8ed81..44fcd36370 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -1426,9 +1426,11 @@ void DumpText(const HloModule& module, const string& label,
 
 string MaybeDumpHloModule(const HloModule& module, const string& label,
                           const HloExecutionProfile* profile) {
-  VLOG(2) << "MaybeDumpHloModule called on module " << module.name();
-  string graph_url;
   const DebugOptions& debug_options = module.config().debug_options();
+  VLOG(2) << "MaybeDumpHloModule called on module " << module.name()
+          << " with generate_hlo_graph regex \""
+          << debug_options.xla_generate_hlo_graph() << "\"";
+  string graph_url;
   if (!debug_options.xla_generate_hlo_graph().empty() &&
       RE2::PartialMatch(module.name(),
                         debug_options.xla_generate_hlo_graph())) {
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index bb9fd447d9..07f989d4fa 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
@@ -71,8 +72,7 @@ LocalService::LocalService(const ServiceOptions& options,
 StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const ComputationHandle& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-    const Shape* result_layout, int device_ordinal,
-    DeviceMemoryAllocator* device_allocator) {
+    const ExecutableBuildOptions& build_options) {
   TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
                       computation_tracker_.Resolve(computation));
   VersionedComputationHandle versioned_handle =
@@ -113,14 +113,19 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
           ShapeUtil::HumanString(argument_shape).c_str());
     }
   }
-  if (result_layout != nullptr) {
-    TF_RETURN_IF_ERROR(
-        ValidateResultShapeWithLayout(*result_layout, program_shape->result()));
+  if (build_options.result_layout() != nullptr) {
+    TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout(
+        *build_options.result_layout(), program_shape->result()));
   }
 
   ExecutionOptions execution_options = CreateDefaultExecutionOptions();
-  if (result_layout != nullptr) {
-    *execution_options.mutable_shape_with_output_layout() = *result_layout;
+  if (build_options.generate_hlo_graph().has_value()) {
+    execution_options.mutable_debug_options()->set_xla_generate_hlo_graph(
+        build_options.generate_hlo_graph().value());
+  }
+  if (build_options.result_layout() != nullptr) {
+    *execution_options.mutable_shape_with_output_layout() =
+        *build_options.result_layout();
   } else {
     *execution_options.mutable_shape_with_output_layout() =
         program_shape->result();
@@ -132,11 +137,13 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
       CreateModuleConfig(*program_shape, argument_layouts, &execution_options,
                          *user_computation));
 
-  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
-                      execute_backend_->stream_executor(device_ordinal));
+  TF_ASSIGN_OR_RETURN(
+      se::StreamExecutor * executor,
+      execute_backend_->stream_executor(build_options.device_ordinal()));
 
   return BuildExecutable(versioned_handle, std::move(module_config),
-                         execute_backend_.get(), executor, device_allocator);
+                         execute_backend_.get(), executor,
+                         build_options.device_allocator());
 }
 
 StatusOr<int> LocalService::ReplicaNumberToDeviceOrdinal(int replica_number) {
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index 16c71b25c4..15e120685e 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -47,8 +48,7 @@ class LocalService : public Service {
   StatusOr<std::unique_ptr<Executable>> CompileExecutable(
       const ComputationHandle& computation,
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-      const Shape* result_layout, int device_ordinal,
-      DeviceMemoryAllocator* device_allocator);
+      const ExecutableBuildOptions& options);
 
   // Returns the device ordinal that corresponds to the given replica number.
   //
-- 
GitLab


From 864d477a9923b1514f3cedb9bcebe45e65227663 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Tue, 30 Jan 2018 09:05:40 -0800
Subject: [PATCH 383/966] Remove commented lines in configure.py

---
 configure.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/configure.py b/configure.py
index 453dbb2b9c..34de1ad2d5 100644
--- a/configure.py
+++ b/configure.py
@@ -1419,11 +1419,6 @@ def main():
       if not is_windows():
         set_gcc_host_compiler_path(environ_cp)
     set_other_cuda_vars(environ_cp)
-    # superceeded by call above
-    # enable tensorrt if desired. Disabled on non-linux
-    # set_action_env_var(environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False)
-    # if environ_cp.get('TF_NEED_TENSORRT') == '1':
-    #   set_tf_trt_version(environ_cp)
 
   set_build_var(environ_cp, 'TF_NEED_MPI', 'MPI', 'with_mpi_support', False)
   if environ_cp.get('TF_NEED_MPI') == '1':
-- 
GitLab


From b5ee0be2df79efd6df150d1dcc8fadeaa99cb558 Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Tue, 30 Jan 2018 09:21:57 -0800
Subject: [PATCH 384/966] Fixes broken link in documentation.

PiperOrigin-RevId: 183842485
---
 tensorflow/docs_src/get_started/checkpoints.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/get_started/checkpoints.md b/tensorflow/docs_src/get_started/checkpoints.md
index 680e1c0d3f..dfa2110e69 100644
--- a/tensorflow/docs_src/get_started/checkpoints.md
+++ b/tensorflow/docs_src/get_started/checkpoints.md
@@ -16,7 +16,7 @@ This document focuses on checkpoints. For details on SavedModel, see the
 ## Sample code
 
 This document relies on the same
-[https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py](Iris classification example) detailed in @{$premade_estimators$Getting Started with TensorFlow}.
+[Iris classification example](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py) detailed in @{$premade_estimators$Getting Started with TensorFlow}.
 To download and access the example, invoke the following two commands:
 
 ```shell
-- 
GitLab


From ca20a8382b9484a3ad406ae9a78ab6a164ec72f9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 09:42:48 -0800
Subject: [PATCH 385/966] Reenable 'constant' test.

PiperOrigin-RevId: 183845007
---
 tensorflow/contrib/lite/testing/generated_examples_zip_test.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index e6b782472a..d73c9937ce 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -243,8 +243,7 @@ INSTANTIATE_TESTS(avg_pool)
 INSTANTIATE_TESTS(space_to_batch_nd)
 INSTANTIATE_TESTS(batch_to_space_nd)
 INSTANTIATE_TESTS(concat)
-// TODO(b/71642435) re-enable this test
-// INSTANTIATE_TESTS(constant)
+INSTANTIATE_TESTS(constant)
 INSTANTIATE_TESTS(control_dep)
 INSTANTIATE_TESTS(conv)
 INSTANTIATE_TESTS(depthwiseconv)
-- 
GitLab


From 5ab07fcfc51fd524622e2c583f81f0cd8eca97d5 Mon Sep 17 00:00:00 2001
From: Yuanzhong Xu <yuanzx@google.com>
Date: Tue, 30 Jan 2018 09:54:46 -0800
Subject: [PATCH 386/966] Add BF16 test cases for pad.

PiperOrigin-RevId: 183846616
---
 tensorflow/compiler/xla/tests/pad_test.cc | 193 +++++++++++-----------
 1 file changed, 100 insertions(+), 93 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc
index 3fd83a4c3b..8cef8dd34d 100644
--- a/tensorflow/compiler/xla/tests/pad_test.cc
+++ b/tensorflow/compiler/xla/tests/pad_test.cc
@@ -33,6 +33,14 @@ limitations under the License.
 namespace xla {
 namespace {
 
+#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16
+// Tests both F32 and BF16.
+static std::array<bool, 2> use_bfloat16_params{false, true};
+#else
+// Only tests F32.
+static std::array<bool, 1> use_bfloat16_params{false};
+#endif
+
 class PadTest : public ClientLibraryTestBase {
  protected:
   PadTest() {
@@ -61,8 +69,22 @@ class PadTest : public ClientLibraryTestBase {
   PaddingConfig r4_padding_on_dim0_dim1_;
 };
 
+class PadTestFloat : public PadTest,
+                     public ::testing::WithParamInterface<bool> {
+ protected:
+  PadTestFloat() { set_use_bfloat16(GetParam()); }
+
+  ErrorSpec DefaultErrorSpec() const {
+    if (use_bfloat16()) {
+      return ErrorSpec(1e-3, 1e-3);
+    } else {
+      return ErrorSpec(1e-5, 1e-5);
+    }
+  }
+};
+
 // Tests a Pad() with a zero-element input and output.
-XLA_TEST_F(PadTest, Pad1DS0ToS0Array) {
+XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) {
   ComputationBuilder b(client_, TestName());
   // Set up the padding configuration {low: 0, high: 0, interior: 0}.
   PaddingConfig padding_config;
@@ -71,12 +93,13 @@ XLA_TEST_F(PadTest, Pad1DS0ToS0Array) {
   dimension->set_edge_padding_high(0);
   dimension->set_interior_padding(0);
 
-  b.Pad(b.ConstantR1<float>({}), b.ConstantR0<float>(0.1), padding_config);
-  ComputeAndCompareR1<float>(&b, {}, {}, ErrorSpec(0.0001));
+  b.Pad(AddParam(*Literal::CreateR1<float>({}), &b),
+        AddParam(*Literal::CreateR0<float>(0.1), &b), padding_config);
+  ComputeAndCompareR1<float>(&b, {}, {}, DefaultErrorSpec());
 }
 
 // Tests a Pad() with a zero-element input but a non-zero-element output.
-XLA_TEST_F(PadTest, Pad1DS0ToS5Array) {
+XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) {
   ComputationBuilder b(client_, TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
@@ -85,12 +108,13 @@ XLA_TEST_F(PadTest, Pad1DS0ToS5Array) {
   dimension->set_edge_padding_high(4);
   dimension->set_interior_padding(7);
 
-  b.Pad(b.ConstantR1<float>({}), b.ConstantR0<float>(0.1), padding_config);
+  b.Pad(AddParam(*Literal::CreateR1<float>({}), &b),
+        AddParam(*Literal::CreateR0<float>(0.1), &b), padding_config);
   ComputeAndCompareR1<float>(&b, std::vector<float>(5, 0.1), {},
-                             ErrorSpec(0.0001));
+                             DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, Pad1DS3Array) {
+XLA_TEST_P(PadTestFloat, Pad1DS3Array) {
   ComputationBuilder b(client_, TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
@@ -99,21 +123,21 @@ XLA_TEST_F(PadTest, Pad1DS3Array) {
   dimension->set_edge_padding_high(0);
   dimension->set_interior_padding(1);
 
-  b.Pad(b.ConstantR1<float>({1, 2, 3}), b.ConstantR0<float>(0.1),
-        padding_config);
+  b.Pad(AddParam(*Literal::CreateR1<float>({1, 2, 3}), &b),
+        AddParam(*Literal::CreateR0<float>(0.1), &b), padding_config);
   std::vector<float> expected({0.1, 0.1, 0.1, 1, 0.1, 2, 0.1, 3});
-  ComputeAndCompareR1<float>(&b, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareR1<float>(&b, expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, Pad4D_2x0x3x2_FloatArray) {
+XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
   ComputationBuilder b(client_, TestName());
-  b.Pad(b.ConstantR4FromArray4D<float>(Array4D<float>(2, 0, 3, 2)),
-        b.ConstantR0<float>(1.5), r4_padding_on_dim0_dim1_);
+  b.Pad(AddParam(Array4D<float>(2, 0, 3, 2), &b),
+        AddParam(*Literal::CreateR0<float>(1.5), &b), r4_padding_on_dim0_dim1_);
   ComputeAndCompareR4<float>(&b, Array4D<float>(5, 2, 3, 2, 1.5f), {},
-                             ErrorSpec(0.0001));
+                             DefaultErrorSpec());
 }
 
-TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) {
+TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
   ComputationBuilder b(client_, TestName());
   auto input = MakeUnique<Array4D<float>>(1, 1, 3, 2);
   Array2D<float> input_xy({
@@ -123,7 +147,7 @@ TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) {
   });
   input->FillWithYX(input_xy);
 
-  b.Pad(b.ConstantR4FromArray4D<float>(*input), b.ConstantR0<float>(1.5),
+  b.Pad(AddParam(*input, &b), AddParam(*Literal::CreateR0<float>(1.5), &b),
         r4_padding_on_dim0_dim1_);
 
   auto expected = MakeUnique<Array4D<float>>(2, 3, 3, 2);
@@ -134,15 +158,15 @@ TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) {
   (*expected)(1, 0, 1, 1) = 4.0f;
   (*expected)(1, 0, 2, 0) = 5.0f;
   (*expected)(1, 0, 2, 1) = 6.0f;
-  ComputeAndCompareR4<float>(&b, *expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareR4<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-TEST_F(PadTest, Pad4DFloatArrayWithInteriorPadding) {
+TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
   ComputationBuilder b(client_, TestName());
 
   const float pad_value = 1.5f;
   Array4D<float> input(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
-  b.Pad(b.ConstantR4FromArray4D<float>(input), b.ConstantR0<float>(pad_value),
+  b.Pad(AddParam(input, &b), AddParam(*Literal::CreateR0<float>(pad_value), &b),
         r4_padding_on_dim0_dim1_);
 
   auto expected = MakeUnique<Array4D<float>>(8, 5, 1, 1);
@@ -156,7 +180,7 @@ TEST_F(PadTest, Pad4DFloatArrayWithInteriorPadding) {
   ComputeAndCompareR4<float>(&b, *expected, {}, ErrorSpec(0.0001));
 }
 
-TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) {
+TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) {
   ComputationBuilder b(client_, TestName());
 
   PaddingConfig padding_config;
@@ -184,7 +208,8 @@ TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) {
   auto input = Literal::CreateR4FromArray4D<float>(input_array);
   input = input->Relayout(layout);
 
-  b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config);
+  b.Pad(AddParam(*input, &b),
+        AddParam(*Literal::CreateR0<float>(pad_value), &b), padding_config);
 
   Array4D<float> expected_array(1, 1, 5, 8);
   expected_array.Fill(pad_value);
@@ -197,7 +222,7 @@ TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) {
   ComputeAndCompareR4<float>(&b, expected_array, {}, ErrorSpec(0.0001));
 }
 
-XLA_TEST_F(PadTest, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
+XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
   ComputationBuilder b(client_, TestName());
 
   PaddingConfig padding_config;
@@ -229,7 +254,8 @@ XLA_TEST_F(PadTest, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
   auto input = Literal::CreateR4FromArray4D<float>(input_array);
   input = input->Relayout(layout);
 
-  b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config);
+  b.Pad(AddParam(*input, &b),
+        AddParam(*Literal::CreateR0<float>(pad_value), &b), padding_config);
 
   Array4D<float> expected_array(1, 25, 17, 11);
   expected_array.Fill(pad_value);
@@ -249,7 +275,7 @@ XLA_TEST_F(PadTest, Pad4DU8Array) {
   });
   input->FillWithYX(input_xy);
 
-  b.Pad(b.ConstantR4FromArray4D<uint8>(*input), b.ConstantR0<uint8>(35),
+  b.Pad(AddParam(*input, &b), b.ConstantR0<uint8>(35),
         r4_padding_on_dim0_dim1_);
 
   auto expected = MakeUnique<Array4D<uint8>>(2, 3, 3, 2);
@@ -277,8 +303,7 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
   auto ones = MakeUnique<Array4D<int32>>(2, 3, 3, 2);
   zeros->Fill(0);
   ones->Fill(1);
-  b.Select(padded, b.ConstantR4FromArray4D<int32>(*ones),
-           b.ConstantR4FromArray4D<int32>(*zeros));
+  b.Select(padded, AddParam(*ones, &b), AddParam(*zeros, &b));
 
   auto expected = MakeUnique<Array4D<int32>>(2, 3, 3, 2);
   expected->Fill(0);
@@ -291,10 +316,12 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
   ComputeAndCompareR4<int32>(&b, *expected, {});
 }
 
-XLA_TEST_F(PadTest, Large2DPad) {
+XLA_TEST_P(PadTestFloat, Large2DPad) {
   ComputationBuilder b(client_, TestName());
 
-  auto input = b.Parameter(0, ShapeUtil::MakeShape(F32, {4, 4}), "input");
+  auto ones = MakeUnique<Array2D<float>>(4, 4);
+  ones->Fill(1.0f);
+  auto input = AddParam(*ones, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(
@@ -302,25 +329,22 @@ XLA_TEST_F(PadTest, Large2DPad) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_high(58 +
                                                                   100 * dim);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(0.0f), padding_config);
-
-  auto ones = MakeUnique<Array2D<float>>(4, 4);
-  ones->Fill(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*ones);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(0.0f), &b),
+                      padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*ones, padding_config, 0.0f);
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()});
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, AllTypes2DPad) {
+XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 35;
   constexpr int64 in_cols = 35;
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(0.0f);
+  auto input = AddParam(*operand, &b);
+
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   padding_config.mutable_dimensions(0)->set_edge_padding_low(7);
   padding_config.mutable_dimensions(0)->set_edge_padding_high(5);
@@ -328,20 +352,14 @@ XLA_TEST_F(PadTest, AllTypes2DPad) {
   padding_config.mutable_dimensions(1)->set_edge_padding_low(6);
   padding_config.mutable_dimensions(1)->set_edge_padding_high(4);
   padding_config.mutable_dimensions(1)->set_interior_padding(2);
-  auto padded = b.Pad(input, b.ConstantR0<float>(3.14f), padding_config);
-
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(0.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(3.14f), &b),
+                      padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 3.14f);
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec{0.0001});
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, High2DPad) {
+XLA_TEST_P(PadTestFloat, High2DPad) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 129;
@@ -349,8 +367,9 @@ XLA_TEST_F(PadTest, High2DPad) {
   constexpr int64 low_padding = 0;
   int64 high_padding[2] = {5, 7};
   constexpr int64 interior_padding = 0;
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(1.0f);
+  auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(low_padding);
@@ -359,20 +378,15 @@ XLA_TEST_F(PadTest, High2DPad) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(2.718f), padding_config);
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
+                      padding_config);
 
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec(0.0001));
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, NegativePadding2D) {
+XLA_TEST_P(PadTestFloat, NegativePadding2D) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 129;
@@ -380,8 +394,9 @@ XLA_TEST_F(PadTest, NegativePadding2D) {
   int64 low_padding[2] = {-1, -2};
   int64 high_padding[2] = {-3, 4};
   constexpr int64 interior_padding = 0;
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(1.0f);
+  auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(
@@ -391,20 +406,15 @@ XLA_TEST_F(PadTest, NegativePadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(2.718f), padding_config);
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
+                      padding_config);
 
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec(0.0001));
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) {
+XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 8;
@@ -412,8 +422,9 @@ XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) {
   int64 low_padding[2] = {4, -1};
   int64 high_padding[2] = {-2, -4};
   int64 interior_padding[2] = {1, 2};
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(1.0f);
+  auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(
@@ -423,44 +434,40 @@ XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding[dim]);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(2.718f), padding_config);
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
+                      padding_config);
 
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec(0.0001));
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
 // Regression test for b/31827337.
-XLA_TEST_F(PadTest, ReducePad) {
+XLA_TEST_P(PadTestFloat, ReducePad) {
   ComputationBuilder b(client_, TestName());
-  auto input = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2, 2, 2}), "input");
+  auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
+  ones->Fill(1.0);
+  auto input = AddParam(*ones, &b);
 
-  Computation add_f32 = CreateScalarAddComputation(F32, &b);
-  auto reduce = b.Reduce(input, b.ConstantR0<float>(0.0), add_f32, {0});
+  Computation add = CreateScalarAddComputation(FloatType(), &b);
+  auto reduce =
+      b.Reduce(input, AddParam(*Literal::CreateR0<float>(0.0), &b), add, {0});
 
   PaddingConfig padding_config = MakeNoPaddingConfig(3);
   padding_config.mutable_dimensions(0)->set_edge_padding_low(1);
   padding_config.mutable_dimensions(0)->set_edge_padding_high(1);
-  auto pad = b.Pad(reduce, b.ConstantR0<float>(0.0), padding_config);
-
-  auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
-  ones->Fill(1.0);
-  auto input_literal = Literal::CreateR4FromArray4D<float>(*ones);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  auto padded = b.Pad(reduce, AddParam(*Literal::CreateR0<float>(0.0f), &b),
+                      padding_config);
 
   Array3D<float> expected({{{0.0, 0.0}, {0.0, 0.0}},
                            {{2.0, 2.0}, {2.0, 2.0}},
                            {{2.0, 2.0}, {2.0, 2.0}},
                            {{0.0, 0.0}, {0.0, 0.0}}});
-  ComputeAndCompareR3<float>(&b, expected, {input_data.get()});
+  ComputeAndCompareR3<float>(&b, expected, {}, DefaultErrorSpec());
 }
 
+INSTANTIATE_TEST_CASE_P(PadTestFloatInstantiation, PadTestFloat,
+                        ::testing::ValuesIn(use_bfloat16_params));
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 39bc42ebcf0df005b378fa88a4650a5bebb1eb0c Mon Sep 17 00:00:00 2001
From: Andrew Selle <aselle@google.com>
Date: Tue, 30 Jan 2018 09:55:38 -0800
Subject: [PATCH 387/966] Create an interface to create hints for future toco
 conversions.

Specifically, tf.contrib.lite.OpHint can create "breadcrumb"
hints that describe encapsulation of multiple TensorFlow ops
that make up a TensorFlow lite builtin or custom op. These
can later be replaced with stub versions in a GraphDef or
SavedModel.

PiperOrigin-RevId: 183846742
---
 tensorflow/contrib/framework/__init__.py    |   1 +
 tensorflow/contrib/lite/python/BUILD        |  13 +
 tensorflow/contrib/lite/python/lite.py      |   7 +-
 tensorflow/contrib/lite/python/lite_test.py | 118 +++++++-
 tensorflow/contrib/lite/python/op_hint.py   | 291 ++++++++++++++++++++
 5 files changed, 428 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/contrib/lite/python/op_hint.py

diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 673c517842..503b868aaa 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -53,6 +53,7 @@ See the @{$python/contrib.framework} guide.
 @@assign_from_values_fn
 @@create_global_step
 @@filter_variables
+@@fuse_op
 @@get_global_step
 @@get_or_create_global_step
 @@get_local_variables
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 3d6a3ec0fd..2d8c49b7d7 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -13,6 +13,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":op_hint",
         "//tensorflow/contrib/lite/toco:model_flags_proto_py",
         "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
         "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco",
@@ -20,6 +21,17 @@ py_library(
     ],
 )
 
+py_library(
+    name = "op_hint",
+    srcs = ["op_hint.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
 py_test(
     name = "lite_test",
     srcs = ["lite_test.py"],
@@ -27,6 +39,7 @@ py_test(
     tags = ["no_oss"],
     deps = [
         ":lite",
+        ":op_hint",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 3c369774be..5d2f216537 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -18,16 +18,21 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 
 @@toco_convert
 @@toco_convert_protos
+@@OpHint
+@@convert_op_hints_to_stubs
 
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 import os
 import subprocess
 import tempfile
 
+# pylint: disable=unused-import
+from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.contrib.lite.python.op_hint import OpHint
+# pylint: enable=unused-import
 from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
 from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
 from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index 7d55f3fe6f..b8b4510188 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -18,10 +18,14 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.lite.python.op_hint import _tensor_name_base as _tensor_name_base
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes
+from tensorflow.python.framework.graph_util_impl import _extract_graph_summary
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -35,7 +39,8 @@ class LiteTest(test_util.TensorFlowTestCase):
     # Try running on valid graph
     result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor])
     self.assertTrue(result)
-    # TODO(aselle): remove tests that fail.
+    # TODO(aselle): remove tests that fail (we must get TOCO to not fatal
+    # all the time).
     # Try running on identity graph (known fail)
     # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"):
     #   result = lite.toco_convert(sess.graph_def, [in_tensor], [in_tensor])
@@ -51,5 +56,116 @@ class LiteTest(test_util.TensorFlowTestCase):
                                quantized_input_stats=[(0., 1.)])
     self.assertTrue(result)
 
+
+class LiteTestOpHint(test_util.TensorFlowTestCase):
+  """Test the hint to stub functionality."""
+
+  def _getGraphOpTypes(self, graphdef, output_nodes):
+    """Returns used op types in `graphdef` reachable from `output_nodes`.
+
+    This is used to check that after the stub transformation the expected
+    nodes are there. Typically use this with self.assertCountEqual(...).
+
+    NOTE: this is not a exact test that the graph is the correct output, but
+      it balances compact expressibility of test with sanity checking.
+
+    Args:
+      graphdef: TensorFlow proto graphdef.
+      output_nodes: A list of output node names that we need to reach.
+
+    Returns:
+      A set of node types reachable from `output_nodes`.
+    """
+    name_to_input_name, name_to_node, _ = (
+        _extract_graph_summary(graphdef))
+    # Find all nodes that are needed by the outputs
+    used_node_names = _bfs_for_reachable_nodes(output_nodes, name_to_input_name)
+    return set([name_to_node[node_name].op for node_name in used_node_names])
+
+  def _countIdentities(self, nodes):
+    """Count the number of "Identity" op types in the list of proto nodes.
+
+    Args:
+      nodes: NodeDefs of the graph.
+
+    Returns:
+      The number of nodes with op type "Identity" found.
+    """
+    return len([x for x in nodes if x.op == "Identity"])
+
+  def testSwishLiteHint(self):
+    """Makes a custom op swish and makes sure it gets converted as a unit."""
+    image = array_ops.constant([1., 2., 3., 4.])
+    swish_scale = array_ops.constant(1.0)
+
+    def _swish(input_tensor, scale):
+      custom = lite.OpHint("cool_activation")
+      input_tensor, scale = custom.add_inputs(input_tensor, scale)
+      output = math_ops.sigmoid(input_tensor) * input_tensor * scale
+      output, = custom.add_outputs(output)
+      return output
+    output = array_ops.identity(_swish(image, swish_scale), name="ModelOutput")
+
+    with self.test_session() as sess:
+      # check if identities have been put into the graph (2 input, 1 output,
+      # and 1 final output).
+      self.assertEqual(self._countIdentities(sess.graph_def.node), 4)
+
+      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+
+      self.assertCountEqual(
+          self._getGraphOpTypes(
+              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+          ["cool_activation", "Const", "Identity"])
+
+  def testScaleAndBiasAndIdentity(self):
+    """This tests a scaled add which has 3 inputs and 2 outputs."""
+    a = array_ops.constant(1.)
+    x = array_ops.constant([2., 3.])
+    b = array_ops.constant([4., 5.])
+
+    def _scaled_and_bias_and_identity(a, x, b):
+      custom = lite.OpHint("scale_and_bias_and_identity")
+      a, x, b = custom.add_inputs(a, x, b)
+      return custom.add_outputs(a * x + b, x)
+    output = array_ops.identity(_scaled_and_bias_and_identity(a, x, b),
+                                name="ModelOutput")
+
+    with self.test_session() as sess:
+      # make sure one identity for each input (3) and output (2) => 3 + 2 = 5
+      # +1 for the final output
+      self.assertEqual(self._countIdentities(sess.graph_def.node), 6)
+
+      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+
+      self.assertCountEqual(
+          self._getGraphOpTypes(
+              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+          ["scale_and_bias_and_identity", "Const", "Identity", "Pack"])
+
+  def testTwoFunctions(self):
+    """Tests if two functions are converted correctly."""
+    a = array_ops.constant([1.])
+    b = array_ops.constant([1.])
+    def _double_values(x):
+      custom = lite.OpHint("add_test")
+      x = custom.add_inputs(x)
+      output = math_ops.multiply(x, x)
+      output, = custom.add_outputs(output)
+      return output
+    output = array_ops.identity(
+        math_ops.add(_double_values(a), _double_values(b)), name="ModelOutput")
+
+    with self.test_session() as sess:
+      # make sure one identity for each input (2) and output (2) => 2 + 2
+      # +1 for the final output
+      self.assertEqual(self._countIdentities(sess.graph_def.node), 5)
+      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      self.assertCountEqual(
+          self._getGraphOpTypes(
+              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+          ["add_test", "Const", "Identity", "Add"])
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/lite/python/op_hint.py b/tensorflow/contrib/lite/python/op_hint.py
new file mode 100644
index 0000000000..7c587e38b1
--- /dev/null
+++ b/tensorflow/contrib/lite/python/op_hint.py
@@ -0,0 +1,291 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Define tflite op hints (intrinsic operations).
+
+This essentially allows defining a TensorFlow API for tflite operations in
+Python with hints on how they are represented in TensorFlow Lite. This basically
+is a form of tflite intrinsic. It wraps a subpart of a TensorFlow execution
+graph and is useful for LSTMs and other complicated TensorFlow constructions
+that are difficult to pattern match in TOCO, but are represented by a single
+accelerated tflite op.
+
+Example:
+  def tflite_cool_activation(input):
+    # A cool activation function.
+    custom = tf.contrib.lite.OpHint("cool_activation")
+    input = custom.add_inputs(input)
+    output = tf.sigmoid(input) * input
+    custom.add_outputs(output)
+    return output
+
+  image = tf.placeholder(tf.float32, (1, 16, 16, 1))
+  output = tf.identity(tflite_cool_activation(image))
+
+  session = tf.Session()
+
+  graphdef_to_convert = tf.contrib.lite.convert_op_hints_to_stubs(session)
+  tflite_graph = tf.contrib.lite.toco_convert(graphdef_to_convert,
+                                              [image], [output])
+                                              [image], [output])
+  with open("/tmp/graph.fb", "wb") as fp:
+    fp.write(tflite_graph)
+
+How does it work?:
+
+OpHint is a helper that you use when defining a vanilla python function.
+It allows you to wrap arguments with tf.identities with some custom attributes.
+These attributes allow you to find the original block of ops that was created.
+For example, if you use cool_activation above you essentially get:
+
+a_input = tf.identity()
+result = tf.multiply(tf.sigmoid(a_input), a_input)
+output = tf.identity()
+
+a_input, output are identities that have parameters representing
+what argument they are, what the name of the function they should turn into
+in tf lite as well as a guid that uniquely identifies a particular invocation.
+
+Once you have built your whole tensorflow graph, you can run it and train it
+as usual, but after you have done that, you need to convert the graph into
+a form that replaces these subgraphs wrapped in identities to stub ops. These
+ops don't actually exist in the normal TensorFlow runtime, but will be
+understood by toco later.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections as _collections
+import itertools as _itertools
+import uuid as _uuid
+
+from tensorflow.contrib import framework as _framework
+from tensorflow.python.framework import ops as _ops
+from tensorflow.python.ops import array_ops as _array_ops
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+class OpHint(object):
+  """A class that helps build tflite function invocations.
+
+  It allows you to take a bunch of TensorFlow ops and annotate the construction
+  such that toco knows how to convert it to tflite. This embeds a pseudo
+  function in a TensorFlow graph. This allows embedding high-level API usage
+  information in a lower level TensorFlow implementation so that an alternative
+  implementation can be substituted later.
+
+  Essentially, any "input" into this pseudo op is fed into an identity, and
+  attributes are added to that input before being used by the constituent ops
+  that make up the pseudo op. A similar process is done to any output that
+  is to be exported from the current op.
+
+  TODO(aselle): When TensorFlow functions functionality works for arbitrary
+  constructs, this mechanism can be retired and changed to use python defun's.
+  """
+
+  # Attr constants that are used for representation in the GraphDef
+  FUNCTION_NAME_ATTR = "_tflite_function_name"
+  FUNCTION_UUID_ATTR = "_tflite_function_uuid"
+  FUNCTION_INPUT_INDEX_ATTR = "_tflite_function_input_index"
+  FUNCTION_OUTPUT_INDEX_ATTR = "_tflite_function_output_index"
+
+  def __init__(self, function_name, **kwargs):
+    """Create a OpHint.
+
+    Args:
+      function_name: Name of the function (the custom op name in tflite)
+      **kwargs: Keyword arguments of any constant attributes for the function.
+    """
+    self._function_name = function_name
+    self._unique_function_id = _uuid.uuid1().hex  # TODO(aselle): Unique enough?
+    self._curr_input_index = 0
+    self._curr_output_index = 0
+    self._attrs_to_store_later = kwargs
+    self._stored_attrs = False
+
+  def _setattr(self, dest_op, name, value):
+    tensor_value = _ops.convert_to_tensor(value)
+    dest_op.op.node_def.attr[name].tensor.CopyFrom(
+        tensor_value.op.node_def.attr["value"].tensor)
+
+  def add_inputs(self, *args):
+    """Add a sequence of inputs to the function invocation.
+
+    Args:
+      *args: List of inputs to be converted (should be Tf.Tensor).
+    Returns:
+      Wrapped inputs (identity standins that have additional metadata). These
+      are also are also tf.Tensor's.
+    """
+
+    def augmented_identity(arg):
+      identity_op = _array_ops.identity(arg)
+      attr = identity_op.op.node_def.attr
+      attr[OpHint.FUNCTION_NAME_ATTR].s = self._function_name
+      attr[OpHint.FUNCTION_UUID_ATTR].s = self._unique_function_id
+      attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i = self._curr_input_index
+      self._curr_input_index += 1
+      return identity_op
+
+    return [augmented_identity(arg) for arg in args]
+
+  def add_outputs(self, *args):
+    """Add a sequence of outputs to the function invocation.
+
+    Args:
+      *args: List of outputs to be converted (should be tf.Tensor).
+    Returns:
+      Wrapped outputs (identity standins that have additional metadata). These
+      are also tf.Tensor's.
+    """
+
+    def augmented_identity(arg):
+      identity_op = _array_ops.identity(arg)
+      attr = identity_op.op.node_def.attr
+      attr[OpHint.FUNCTION_NAME_ATTR].s = self._function_name
+      attr[OpHint.FUNCTION_UUID_ATTR].s = self._unique_function_id
+      attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i = self._curr_output_index
+      self._curr_output_index += 1
+      return identity_op
+
+    wrapped_outputs = [augmented_identity(arg) for arg in args]
+
+    if not self._stored_attrs:
+      for key, value in self._attrs_to_store_later.iteritems():
+        self._setattr(wrapped_outputs[0], "_tflite_attr_" + key, value)
+      self._stored_attrs = True
+
+    return wrapped_outputs
+
+
+class _LiteFuncCall(object):
+  """Represent a TensorFlow Lite custom function.
+
+  This is uses to accumulate found hints in the graphdef into a single
+  conceptual unit.
+
+  Properties:
+    self.inputs: inputs to the op (hash from index # to argument)
+    self.outputs: outputs to the op (hash from index # to argument)
+    self.function_name: the tflite custom op name to use
+    self.uuid: a unique call id for this particular call  (i.e.
+      multiple function calls would have the same function_name but different
+      uuids.
+    self.params: A param name to key value for op constant data. I.e. for
+      axis on a reduction, strides on a convolution, etc.
+  """
+
+  def __init__(self):
+    self.inputs = {}
+    self.outputs = {}
+    self.function_name = None
+    self.uuid = None
+    self.params = {}
+
+  def __str__(self):
+    return "tflite function %s call %s\n\tinputs: %r\n\toutputs: %r" % (
+        self.function_name, self.uuid, self.inputs, self.outputs)
+
+
+def _find_all_hints_in_graph_def(session):
+  """Look at the current default graph and return a list of LiteFuncCall objs.
+
+  Args:
+    session: A TensorFlow session that contains the graph to convert.
+  Returns:
+    a list of `LifeFuncCall` objects in the form
+
+  """
+  func_calls = _collections.defaultdict(_LiteFuncCall)
+  seen_ops = set()
+
+  for op in session.graph.get_operations():
+    for operand in _itertools.chain(op.inputs, op.outputs):
+      if operand in seen_ops:
+        continue
+      seen_ops.add(operand)
+      attr = operand.op.node_def.attr
+      uuid = attr[OpHint.FUNCTION_UUID_ATTR].s
+      if OpHint.FUNCTION_UUID_ATTR not in attr:
+        continue
+      call_def = func_calls[uuid]
+      call_def.uuid = uuid
+      if OpHint.FUNCTION_UUID_ATTR in attr:
+        call_def.function_name = attr[OpHint.FUNCTION_NAME_ATTR].s
+        if OpHint.FUNCTION_INPUT_INDEX_ATTR in attr:
+          call_def.inputs[attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i] = operand
+        if OpHint.FUNCTION_OUTPUT_INDEX_ATTR in attr:
+          call_def.outputs[attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i] = operand
+
+      for a in attr:
+        if a.startswith("_tflite_attr_"):
+          # TODO(aselle): Remember the attribute tensors so we can put them
+          # in collapse.
+          call_def.params[a.replace("_tflite_attr_,", "")] = attr[a].tensor
+
+  return func_calls
+
+
+def _tensor_name_base(full_tensor_name):
+  """Removes the device assignment code from a tensor.
+
+  e.g. _tensor_name_base("foo:3") => "foo"
+
+  Args:
+    full_tensor_name: A tensor name that is annotated with a device placement
+      (this is what tensor flow introspection gives).
+  Returns:
+    A name without any device assignment.
+  """
+  return full_tensor_name.name.split(":")[0]
+
+
+def convert_op_hints_to_stubs(session):
+  """Converts a graphdef with LiteOp hints into stub operations.
+
+  This is used to prepare for toco conversion of complex intrinsic usages.
+
+  Args:
+    session: A TensorFlow session that contains the graph to convert.
+  Returns:
+    A new graphdef with all ops contained in OpHints being replaced by
+    a single op call with the right parameters.
+  """
+  hints = _find_all_hints_in_graph_def(session)
+  current_graph_def = session.graph_def
+  for call in hints.values():
+    input_names = [None] * len(call.inputs)
+    output_names = [None] * len(call.outputs)
+    output_dtypes = [None] * len(call.outputs)
+    output_quantized = False
+    for input_index, tensor in call.inputs.items():
+      input_names[input_index] = _tensor_name_base(tensor)
+    for output_index, tensor in call.outputs.items():
+      output_names[output_index] = _tensor_name_base(tensor)
+      output_dtypes[output_index] = tensor.dtype.as_datatype_enum
+    # TODO(aselle): Support quantized flag properly
+    current_graph_def = _framework.fuse_op(
+        current_graph_def, input_names, output_names, output_dtypes,
+        output_quantized, call.uuid, call.function_name)
+    for node in current_graph_def.node:
+      if node.name == call.uuid:
+        for param, tensor in call.params.items():
+          node.attr[param].tensor.CopyFrom(tensor)
+  return current_graph_def
+
+
+_allowed_symbols = ["OpHint", "convert_op_hints_to_stubs"]
+remove_undocumented(__name__, _allowed_symbols)
-- 
GitLab


From 88eb6c61ef7659c2b5bb1ec6586c7d3cca5e4e9c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 09:57:16 -0800
Subject: [PATCH 388/966] TensorFlow SavedModel loader: avoid segmentation
 fault when NewSession returns null

PiperOrigin-RevId: 183846994
---
 tensorflow/cc/saved_model/loader.cc      |  4 +++-
 tensorflow/cc/saved_model/loader_test.cc | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index acef098c7d..faa1e378d0 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -96,7 +96,9 @@ Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto,
 Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def,
                                 const SessionOptions& session_options,
                                 std::unique_ptr<Session>* session) {
-  session->reset(NewSession(session_options));
+  Session* session_p = nullptr;
+  TF_RETURN_IF_ERROR(NewSession(session_options, &session_p));
+  session->reset(session_p);
   return (*session)->Create(meta_graph_def.graph_def());
 }
 
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
index 0ad6b33bba..4c64d2cfe3 100644
--- a/tensorflow/cc/saved_model/loader_test.cc
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -155,6 +155,24 @@ TEST_F(LoaderTest, NoTagMatchMultiple) {
       << st.error_message();
 }
 
+TEST_F(LoaderTest, SessionCreationFailure) {
+  SavedModelBundle bundle;
+  // Use invalid SessionOptions to cause session creation to fail.  Default
+  // options work, so provide an invalid value for the target field.
+  SessionOptions session_options;
+  constexpr char kInvalidTarget[] = "invalid target";
+  session_options.target = kInvalidTarget;
+  RunOptions run_options;
+
+  const string export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded);
+  Status st = LoadSavedModel(session_options, run_options, export_dir,
+                             {kSavedModelTagServe}, &bundle);
+  EXPECT_FALSE(st.ok());
+  EXPECT_TRUE(StringPiece(st.error_message()).contains(kInvalidTarget))
+      << st.error_message();
+}
+
 TEST_F(LoaderTest, PbtxtFormat) {
   SavedModelBundle bundle;
   SessionOptions session_options;
-- 
GitLab


From c7b3d4be6113b9207ca25a3687e918cde61e210b Mon Sep 17 00:00:00 2001
From: Omar Aflak <aflakomar@hotmail.fr>
Date: Tue, 30 Jan 2018 11:10:56 -0800
Subject: [PATCH 389/966] Update README.md (#16558)

added a friendly badge to get the latest android tensorflow version
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0c93813e58..c754c3f0db 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 
 | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
 |-----------------|---------------------|------------------|-------------------|---------------|
-| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) |
+| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) |
 
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
-- 
GitLab


From d7b4fe4d4322a3fdab8a1dedb93d37a1f800a559 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Tue, 30 Jan 2018 12:00:23 -0800
Subject: [PATCH 390/966] Fix the build dependencies and formatting of the
 code, and make sure they follow the style conventions.

---
 tensorflow/contrib/BUILD                      |   2 +-
 tensorflow/contrib/tensorrt/BUILD             |  67 ++++------
 tensorflow/contrib/tensorrt/README.md         |  12 +-
 .../contrib/tensorrt/convert/convert_graph.cc |  41 +++---
 .../contrib/tensorrt/convert/convert_graph.h  |   8 ++
 .../contrib/tensorrt/convert/convert_nodes.cc |  52 ++++----
 .../contrib/tensorrt/convert/convert_nodes.h  |  10 +-
 .../contrib/tensorrt/kernels/trt_engine_op.cc |  23 ++--
 .../contrib/tensorrt/kernels/trt_engine_op.h  |  14 +-
 tensorflow/contrib/tensorrt/log/trt_logger.cc |   6 +-
 tensorflow/contrib/tensorrt/log/trt_logger.h  |   3 +-
 .../contrib/tensorrt/ops/trt_engine_op.cc     |   6 +
 .../tensorrt/python/ops/trt_engine_op.py      |   2 -
 .../contrib/tensorrt/python/trt_convert.py    |  31 +++--
 .../contrib/tensorrt/segment/segment.cc       |   1 -
 .../contrib/tensorrt/shape_fn/trt_shfn.cc     |  51 ++++----
 .../contrib/tensorrt/shape_fn/trt_shfn.h      |   7 +-
 tensorflow/contrib/tensorrt/trt_conversion.i  | 122 ++++++++++--------
 18 files changed, 236 insertions(+), 222 deletions(-)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index f745c175b1..738b74c929 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -105,7 +105,7 @@ py_library(
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:util",
-    ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_ops_py"])
+    ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"])
     + if_tensorrt(["//tensorflow/contrib/tensorrt:init_py"]),
 
 )
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 2d4bccd267..3a214d2e86 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -17,7 +17,6 @@ load(
     "tf_gen_op_wrapper_py",
     "tf_py_wrap_cc",
     "tf_cc_test",
-    "tf_kernel_library",
     "tf_cuda_cc_test",
     "tf_cuda_library",
     "tf_custom_op_py_library",
@@ -47,13 +46,9 @@ tf_cuda_cc_test(
 
 tf_custom_op_library(
     name = "python/ops/_trt_engine_op.so",
-    srcs = [
-        "kernels/trt_engine_op.cc",
-        "kernels/trt_engine_op.h",
-        "ops/trt_engine_op.cc",
-    ],
-    gpu_srcs = [],
+    srcs = ["ops/trt_engine_op.cc"],
     deps = [
+        ":trt_engine_op_kernel",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core/kernels:bounds_check_lib",
@@ -64,10 +59,9 @@ tf_custom_op_library(
 
 tf_cuda_library(
     name = "trt_shape_function",
-    srcs = [
-        "shape_fn/trt_shfn.cc",
-    ],
+    srcs = ["shape_fn/trt_shfn.cc"],
     hdrs = ["shape_fn/trt_shfn.h"],
+    visibility = ["//visibility:public"],
     deps = [
         ":trt_logging",
         "//tensorflow/core:framework_headers_lib",
@@ -76,36 +70,26 @@ tf_cuda_library(
         "@nsync//:nsync_headers",
         "@protobuf_archive//:protobuf",
     ],
-    visibility = ["//visibility:public"], #for c/c++ linking    
 )
 
-tf_kernel_library(
+cc_library(
     name = "trt_engine_op_kernel",
-    srcs = [
-        "kernels/trt_engine_op.cc",
-    ],
-    hdrs = [
-        "kernels/trt_engine_op.h",
-    ],
-    gpu_srcs = [
-    ],
+    srcs = ["kernels/trt_engine_op.cc"],
+    hdrs = ["kernels/trt_engine_op.h"],
     deps = [
         ":trt_logging",
-        ":trt_shape_function",
-        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:gpu_headers_lib",
-        "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
         "//third_party/eigen3",
         "@local_config_tensorrt//:nv_infer",
+        "@nsync//:nsync_headers",
     ],
     alwayslink = 1,
 )
 
 tf_gen_op_libs(
-    op_lib_names = [
-        "trt_engine_op",
-    ],
+    op_lib_names = ["trt_engine_op"],
     deps = [
         "@local_config_tensorrt//:nv_infer",
     ],
@@ -113,12 +97,8 @@ tf_gen_op_libs(
 
 tf_cuda_library(
     name = "trt_logging",
-    srcs = [
-        "log/trt_logger.cc",
-    ],
-    hdrs = [
-        "log/trt_logger.h",
-    ],
+    srcs = ["log/trt_logger.cc"],
+    hdrs = ["log/trt_logger.h"],
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:lib_proto_parsing",
@@ -190,6 +170,7 @@ tf_py_wrap_cc(
     ],
 )
 
+# Library for the node-level conversion portion of TensorRT operation creation
 tf_cuda_library(
     name = "trt_conversion",
     srcs = [
@@ -201,29 +182,27 @@ tf_cuda_library(
         "convert/convert_nodes.h",
     ],
     deps = [
-        "@local_config_tensorrt//:nv_infer",
-        "@protobuf_archive//:protobuf_headers",
-        "@nsync//:nsync_headers",
         ":segment",
         ":trt_logging",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:core_cpu_base",
-        "//tensorflow/core/grappler/optimizers:constant_folding",
-        "//tensorflow/core/grappler/optimizers:layout_optimizer",
-        "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler:devices",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/optimizers:constant_folding",
+        "//tensorflow/core/grappler/optimizers:layout_optimizer",
+        "@local_config_tensorrt//:nv_infer",
+        "@nsync//:nsync_headers",
+        "@protobuf_archive//:protobuf_headers",
     ],
 )
 
 # Library for the segmenting portion of TensorRT operation creation
 cc_library(
     name = "segment",
-    srcs = [
-        "segment/segment.cc",
-    ],
+    srcs = ["segment/segment.cc"],
     hdrs = [
         "segment/segment.h",
         "segment/union_find.h",
@@ -249,8 +228,6 @@ tf_cc_test(
     ],
 )
 
-# Library for the node-level conversion portion of TensorRT operation creation
-
 filegroup(
     name = "cppfiles",
     srcs = glob(["**/*.cc"]),
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index b362050983..b3c604f5f8 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -28,12 +28,12 @@ will be available. An example use is shown below.
 import tensorflow as tf
 import tensorflow.contrib.tensorrt as trt
 #... create and train or load model
-gdef=sess.graph.as_graph_def()
-trt_gdef=trt.CreateInferenceGraph(gdef, #original graph_def
-				  ["output"], #name of output node(s)
-				  max_batch_size, #maximum batch size to run the inference
-				  max_workspace_size # max memory for TensorRT to use 
-				  )
+gdef = sess.graph.as_graph_def()
+trt_gdef = trt.CreateInferenceGraph(
+    gdef, #original graph_def
+    ["output"], #name of output node(s)
+    max_batch_size, #maximum batch size to run the inference
+    max_workspace_size) # max memory for TensorRT to use
 tf.reset_default_graph()
 tf.import_graph_def(graph_def=trt_gdef)
 #...... run inference
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index b8dbc7b7c8..1507981ca8 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+
 #include <list>
 #include <map>
 #include <set>
@@ -28,10 +30,6 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/logging.h"
-
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/devices.h"
@@ -39,11 +37,13 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorrt/include/NvInfer.h"
@@ -119,14 +119,15 @@ std::unordered_map<std::string, std::vector<int>> BuildTensorNameMap(
 }
 
 tensorflow::Status ConvertSubGraphToTensorRT(
-    tensorflow::Graph& graph, const std::vector<std::string>& output_names,
+    const std::vector<std::string>& output_names,
     const std::set<int>& subgraph_node_ids,
     size_t max_batch_size,  // max batch size that engine will be created for
     // max amount of memory that engine will be allowed to consume, in bytes
     size_t max_workspace_size,
-    const tensorflow::grappler::GraphProperties& graph_properties) {
+    const tensorflow::grappler::GraphProperties& graph_properties,
+    tensorflow::Graph* graph) {
   tensorflow::EdgeSet subgraph_incoming_edges;
-  GetSubGraphIncomingEdges(graph, subgraph_node_ids, &subgraph_incoming_edges);
+  GetSubGraphIncomingEdges(*graph, subgraph_node_ids, &subgraph_incoming_edges);
 
   std::vector<std::pair<int, int>> subgraph_inputs;
 
@@ -138,7 +139,7 @@ tensorflow::Status ConvertSubGraphToTensorRT(
   // Collect outputs referenced from output_names
   auto output_name_to_index_map = BuildTensorNameMap(output_names);
   for (int node_id : subgraph_node_ids) {
-    tensorflow::Node* node = graph.FindNodeId(node_id);
+    tensorflow::Node* node = graph->FindNodeId(node_id);
     if (output_name_to_index_map.count(node->name())) {
       for (int index : output_name_to_index_map.at(node->name())) {
         subgraph_outputs_set.insert({node_id, index});
@@ -147,7 +148,7 @@ tensorflow::Status ConvertSubGraphToTensorRT(
   }
   // Collect outputs referenced from outgoing edges
   tensorflow::EdgeSet subgraph_outgoing_edges;
-  GetSubGraphOutgoingEdges(graph, subgraph_node_ids, &subgraph_outgoing_edges);
+  GetSubGraphOutgoingEdges(*graph, subgraph_node_ids, &subgraph_outgoing_edges);
   for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
     subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
   }
@@ -157,10 +158,10 @@ tensorflow::Status ConvertSubGraphToTensorRT(
   // Build TensorRT node and add it to the graph
   tensorflow::NodeDef trt_node_def;
   TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(
-      graph, subgraph_node_ids, subgraph_inputs, subgraph_outputs,
+      *graph, subgraph_node_ids, subgraph_inputs, subgraph_outputs,
       max_batch_size, max_workspace_size, graph_properties, &trt_node_def));
   tensorflow::Status status;
-  tensorflow::Node* trt_node = graph.AddNode(trt_node_def, &status);
+  tensorflow::Node* trt_node = graph->AddNode(trt_node_def, &status);
 
   TF_RETURN_IF_ERROR(status);
 
@@ -173,16 +174,16 @@ tensorflow::Status ConvertSubGraphToTensorRT(
   for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
     int new_src_output = subgraph_edge_to_output_map.at(old_src);
-    graph.UpdateEdge(trt_node, new_src_output, edge->dst(), edge->dst_input());
+    graph->UpdateEdge(trt_node, new_src_output, edge->dst(), edge->dst_input());
   }
   // Remove the original subgraph
   for (int node_id : subgraph_node_ids) {
-    tensorflow::Node* node = graph.FindNodeId(node_id);
+    tensorflow::Node* node = graph->FindNodeId(node_id);
     // Don't remove the input placeholders
     if (node->type_string() == "Placeholder") {
       continue;
     }
-    graph.RemoveNode(node);
+    graph->RemoveNode(node);
   }
   return tensorflow::Status::OK();
 }
@@ -213,16 +214,16 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   // layout optimization
   item.graph = graph_def;
   tensorflow::grappler::LayoutOptimizer optimizer;
-  tensorflow::grappler::Cluster* gCluster;
+  tensorflow::grappler::Cluster* cluster;
 
   // virtual cluster
   tensorflow::DeviceProperties device_properties;
   device_properties.set_type("GPU");
   device_properties.mutable_environment()->insert({"architecture", "6"});
-  gCluster =
+  cluster =
       new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
 
-  tensorflow::Status status = optimizer.Optimize(gCluster, item, &gdef);
+  tensorflow::Status status = optimizer.Optimize(cluster, item, &gdef);
 
   if (status != tensorflow::Status::OK()) return status;
 
@@ -267,8 +268,8 @@ tensorflow::Status ConvertGraphDefToTensorRT(
       subgraph_node_ids.insert(node_map.at(node_name)->id());
     }
     TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
-        graph, output_names, subgraph_node_ids, max_batch_size,
-        max_workspace_size, static_graph_properties));
+        output_names, subgraph_node_ids, max_batch_size, max_workspace_size,
+        static_graph_properties, &graph));
   }
   graph.ToGraphDef(new_graph_def);
   return tensorflow::Status::OK();
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index 621d428ace..e0fa02ecd4 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -21,6 +21,9 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
@@ -32,7 +35,12 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<std::string>& output_names, size_t max_batch_size,
     size_t max_workspace_size, tensorflow::GraphDef* new_graph_def);
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 60e6a1ab96..a42f559651 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+
 #include <algorithm>
-#include <fstream>
 #include <list>
 #include <map>
 #include <memory>
 #include <set>
-#include <sstream>
 #include <string>
 #include <unordered_map>
 #include <utility>
@@ -36,16 +36,13 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-//  Check if the types are equal. Cast to int first so that failure log message
-//  would work!
-
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorrt/include/NvInfer.h"
 
+//  Check if the types are equal. Cast to int first so that failure log message
+//  would work!
 #define CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)
 
 namespace tensorflow {
@@ -107,8 +104,8 @@ static std::vector<std::pair<int, int>> createSamePadding(
     int right = p - left;
 
     VLOG(-1) << "PADDING_" << i << " pre: " << left << ", post: " << right
-               << "paras: " << inputDims[i] << ", " << stride.d[i] << ", "
-               << "kernel: " << kernel.d[i];
+             << "paras: " << inputDims[i] << ", " << stride.d[i] << ", "
+             << "kernel: " << kernel.d[i];
     padding[i] = {left, right};
   }
   return padding;
@@ -664,7 +661,7 @@ tensorflow::Status ConstantFoldBinary(
   nvinfer1::Dims output_shape;
   output_shape.nbDims = nbDims;
   VLOG(-1) << "nbDims: " << nbDims
-             << "the other: " << weights_input_r.shape_.nbDims;
+           << "the other: " << weights_input_r.shape_.nbDims;
   for (int i = 0; i < nbDims; i++) {
     if (weights_input_l.shape_.d[i] == weights_input_r.shape_.d[i]) {
       output_shape.d[i] = weights_input_l.shape_.d[i];
@@ -677,8 +674,8 @@ tensorflow::Status ConstantFoldBinary(
           "Binary op with incompatible shape at, " + node_def.op());
     }
     VLOG(-1) << "left: " << weights_input_l.shape_.d[i]
-               << "right: " << weights_input_r.shape_.d[i]
-               << "output: " << output_shape.d[i];
+             << "right: " << weights_input_r.shape_.d[i]
+             << "output: " << output_shape.d[i];
   }
 
   // FIXME assume type matches input weights
@@ -822,9 +819,9 @@ tensorflow::Status BinaryTensorOpTensor(
   CHECK_EQ_TYPE(tensor_r->getType(), dtype);
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
-                                             " not supported at: " +
-                                             node_def.name());
+    return tensorflow::errors::Unimplemented(
+        "binary op: " + node_def.op() +
+        " not supported at: " + node_def.name());
 
   nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
       *const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -909,11 +906,11 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
     VLOG(-1) << "padding!!!: " << padding[0].first << padding[0].second
-               << padding[1].first << padding[1].second;
+             << padding[1].first << padding[1].second;
 
     auto dim_before = tensor->getDimensions();
-    VLOG(-1) << "TENSOR before: " << dim_before.d[0] << ", "
-               << dim_before.d[1] << dim_before.d[2] << ", " << dim_before.d[3];
+    VLOG(-1) << "TENSOR before: " << dim_before.d[0] << ", " << dim_before.d[1]
+             << dim_before.d[2] << ", " << dim_before.d[3];
     auto padLayer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
@@ -922,7 +919,7 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
     tensor = padLayer->getOutput(0);
     auto dim_after = tensor->getDimensions();
     VLOG(-1) << "TENSOR after: " << dim_after.d[0] << ", " << dim_after.d[1]
-               << dim_after.d[2] << ", " << dim_after.d[3];
+             << dim_after.d[2] << ", " << dim_after.d[3];
   }
 
   nvinfer1::IConvolutionLayer* layer =
@@ -936,7 +933,7 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
 
   auto dim_after = output_tensor->getDimensions();
   VLOG(-1) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1]
-             << dim_after.d[2] << ", " << dim_after.d[3];
+           << dim_after.d[2] << ", " << dim_after.d[3];
 
   if (data_format == "NHWC") {
     // TODO(jie): transpose it back!
@@ -992,8 +989,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
         {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
   } else if (attrs.get<std::string>("padding") == "VALID") {
     // No padding for valid padding here
-    VLOG(-1) << "no padding added for VALID padding in pool"
-               << node_def.name();
+    VLOG(-1) << "no padding added for VALID padding in pool" << node_def.name();
     padding = {{0, 0}, {0, 0}};
   } else {
     return tensorflow::errors::Unimplemented(
@@ -1004,7 +1000,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
     VLOG(-1) << "padding!!!: " << padding[0].first << padding[0].second
-               << padding[1].first << padding[1].second;
+             << padding[1].first << padding[1].second;
     auto padLayer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
@@ -1480,9 +1476,9 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     TF_CHECK_OK(convert_dtype(tf_dtype, &dtype));
 
     VLOG(-1) << "accessing output index of: " << std::to_string(output_idx)
-               << ", at node: " << node_name
-               << "with output entry from shape_map: "
-               << std::to_string(op_info_vec.size());
+             << ", at node: " << node_name
+             << "with output entry from shape_map: "
+             << std::to_string(op_info_vec.size());
 
     // TODO(ben,jie): update TRT input format/dimension
     nvinfer1::DimsCHW input_dim_psuedo_chw;
@@ -1490,7 +1486,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
       VLOG(-1) << "dimension: " << i
-                 << " , size: " << op_info.shape().dim(i).size();
+               << " , size: " << op_info.shape().dim(i).size();
       input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
     }
 
@@ -1517,7 +1513,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   for (const tensorflow::Node* node : order) {
     tensorflow::NodeDef const& node_def = node->def();
     VLOG(-1) << "converting node: " << node_def.name() << " , "
-               << node_def.op();
+             << node_def.op();
     TF_RETURN_IF_ERROR(converter.convert_node(node_def));
   }
 
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index a1f9c3f4a1..69657e0cb9 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -25,6 +25,9 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/lib/core/status.h"
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
@@ -35,12 +38,15 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         input_inds,  // {node_id, output_idx}
     const std::vector<std::pair<int, int>>&
         output_inds,  // {node_id, output_idx}
-    size_t max_batch_size,
-    size_t max_workspace_size,
+    size_t max_batch_size, size_t max_workspace_size,
     const tensorflow::grappler::GraphProperties& graph_prop,
     tensorflow::NodeDef* trt_node);
 
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index dc8b625731..6e4fbe20e7 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -12,21 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <sstream>
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-
-#include <cuda_runtime_api.h>
-#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
+#include "cuda/include/cuda_runtime_api.h"
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
-static ::tensorflow::tensorrt::Logger gLogger;
-
 namespace tensorrt {
+static ::tensorflow::tensorrt::Logger logger;
 
 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   // read serialized_engine
@@ -39,14 +35,14 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   OP_REQUIRES_OK(context, context->GetAttr("output_nodes", &output_nodes_));
 
   // TODO(samikama) runtime should be taken from a resourcemanager as well.
-  //  Only engine should be in the op and context and runtime should be taken
-  //  from resourcemanager
-  nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(gLogger);
+  // Only engine should be in the op and context and runtime should be taken
+  // from resourcemanager
+  nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
   trt_engine_ptr_.reset(infer->deserializeCudaEngine(
       serialized_engine.c_str(), serialized_engine.size(), nullptr));
 
   trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
-  // runtime is safe to delete after engine creation
+  // Runtime is safe to delete after engine creation
   infer->destroy();
 }
 
@@ -89,7 +85,7 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
     // This is bad that we have to reallocate output buffer every run.
     // Create an output tensor
     binding_index = trt_engine_ptr_->getBindingIndex(output_nodes_[i].c_str());
-    Tensor* output_tensor = NULL;
+    Tensor* output_tensor = nullptr;
 
     TensorShape output_shape;
     if (binding_index != -1) {
@@ -131,6 +127,7 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
 }
 
 REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
+
 }  // namespace tensorrt
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index 0e3ff45ede..0964b4b18a 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -20,18 +20,17 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include <cuda_runtime_api.h>
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
-
 namespace tensorrt {
 class Logger;
+
 class TRTEngineOp : public OpKernel {
  public:
   explicit TRTEngineOp(OpKernelConstruction* context);
@@ -43,17 +42,18 @@ class TRTEngineOp : public OpKernel {
   struct Destroyer {
     void operator()(T* d) { d->destroy(); }
   };
+
   template <typename T>
   using destroyed_ptr = std::unique_ptr<T, Destroyer<T>>;
   destroyed_ptr<nvinfer1::ICudaEngine> trt_engine_ptr_;
-  // TODO(samikama) context should go to a resource manager!
+  // TODO(samikama): context should go to a resource manager!
   destroyed_ptr<nvinfer1::IExecutionContext> trt_execution_context_ptr_;
+
   std::vector<string> input_nodes_;
   std::vector<string> output_nodes_;
 };
 
 }  // namespace tensorrt
-
 }  // namespace tensorflow
 
 #endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index 2473b8effc..5131c80794 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/core/platform/logging.h"
 
-// Use TF logging for TensorRT informations
 namespace tensorflow {
 namespace tensorrt {
 
+// Use TF logging for TensorRT informations
 void Logger::log(Severity severity, const char* msg) {
   // Suppress info-level messages
   switch (severity) {
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index c07a3e6b2d..0dc2b1708b 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -1,4 +1,3 @@
-// -*- c++ -*-
 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -28,9 +27,9 @@ namespace tensorrt {
 
 // Logger for GIE info/warning/errors
 class Logger : public nvinfer1::ILogger {
+ private:
   void log(nvinfer1::ILogger::Severity severity, const char* msg) override;
 
- private:
   std::string name_;
 };
 
diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
index 7139ff9618..fa72bce039 100644
--- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -34,4 +37,7 @@ REGISTER_OP("TRTEngineOp")
     .Output("out_tensor: OutT")
     .SetShapeFn(shape_inference::TRTEngineOpShapeInference);
 
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py b/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
index c4ab9b89ea..97db23797f 100644
--- a/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
+++ b/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
@@ -31,5 +31,3 @@ if platform.system() != "Windows":
       resource_loader.get_path_to_datafile("_trt_engine_op.so"))
 else:
   raise RuntimeError("Windows platforms are not supported")
-
-
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index f6d2dbede6..6bdc20ed04 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -29,9 +29,13 @@ from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 
+
 # TODO(skama): get outputs from session when implemented as c++
 # optimization pass
-def CreateInferenceGraph(input_graph_def, outputs,max_batch_size=1,max_workspace_size=2<<20):
+def CreateInferenceGraph(input_graph_def,
+                         outputs,
+                         max_batch_size=1,
+                         max_workspace_size=2 << 20):
   """Python wrapper for the TRT transormation.
 
 
@@ -45,35 +49,34 @@ def CreateInferenceGraph(input_graph_def, outputs,max_batch_size=1,max_workspace
     New GraphDef with TRTEngineOps placed in graph replacing subgraphs.
   """
 
-  out_names=[]
+  out_names = []
   for i in outputs:
-    if isinstance(i,ops.Tensor):
+    if isinstance(i, ops.Tensor):
       out_names.append(i.name)
     else:
       out_names.append(i)
-      
-  input_graph_def_str= \
-    input_graph_def.SerializeToString()
+
+  input_graph_def_str = input_graph_def.SerializeToString()
 
   # TODO(sami): Fix this when we can return status from C++ library
   # There is a problem with the TF internal library setup that doesn't
   # allow us to return a status object from C++.  Thus we return a
   # pair or strings where first one is encoded status and the second
   # one is the transformed graphs protobuf string.
-  out = trt_convert(
-      input_graph_def_str ,outputs,
-      max_batch_size,max_workspace_size)
+  out = trt_convert(input_graph_def_str, outputs, max_batch_size,
+                    max_workspace_size)
   status = out[0]
   output_graph_def_string = out[1]
-  del input_graph_def_str #save some memory
+  del input_graph_def_str  #save some memory
   if len(status) < 2:
-    raise _impl.UnknownError(None,None,status)
+    raise _impl.UnknownError(None, None, status)
   if status[:2] != "OK":
-    msg=status.split(";")
+    msg = status.split(";")
     if len(msg) == 1:
       raise RuntimeError("Status message is malformed {}".format(status))
-    raise _impl._make_specific_exception(None,None,";".join(msg[1:]), int(msg[0]))
+    raise _impl._make_specific_exception(None, None, ";".join(msg[1:]),
+                                         int(msg[0]))
   output_graph_def = graph_pb2.GraphDef()
   output_graph_def.ParseFromString(output_graph_def_string)
-  del output_graph_def_string #save some memory
+  del output_graph_def_string  #save some memory
   return output_graph_def
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 89457b71e8..c9d3840606 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -68,7 +68,6 @@ bool CanContractEdge(const tensorflow::Edge* edge,
   return !is_cycle;
 }
 
-//------------------------------------------------------------------------------
 void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
                   std::vector<const tensorflow::Edge*>* remove_edges) {
   // Transfer all inputs and outputs of 'dst' to 'src' except edges
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index fef63c64d8..ebaf996a29 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -13,71 +13,74 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
+
 #include <string>
 #include <vector>
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
 #include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace shape_inference {
 
-tensorflow::Status TRTEngineOpShapeInference(InferenceContext* c) {
+tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) {
   tensorflow::tensorrt::Logger logger;
   string serialized_engine;
-  c->GetAttr("serialized_engine", &serialized_engine);
+  context->GetAttr("serialized_engine", &serialized_engine);
   nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
   nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
       serialized_engine.c_str(), serialized_engine.size(), nullptr);
 
-  int nbBatch = -1;
-  // debug print out input arrays
+  int num_batch = -1;
   std::vector<::tensorflow::DataType> input_type;
-  c->GetAttr("InT", &input_type);
-  for (size_t i = 0; i < c->num_inputs(); i++) {
-    // check if input shape is legit
-    auto input_shape = c->input(i);
-    for (int j = 0; j < c->Rank(input_shape); j++) {
-      auto dimHandler = c->Dim(input_shape, j);
+  context->GetAttr("InT", &input_type);
+  for (size_t i = 0; i < context->num_inputs(); i++) {
+    // Check if input shape is legit
+    auto input_shape = context->input(i);
+    for (int j = 0; j < context->Rank(input_shape); j++) {
+      auto dim_handler = context->Dim(input_shape, j);
       if (j == 0) {
-        if (i == 0)
-          nbBatch = c->Value(dimHandler);
-        else if (nbBatch != c->Value(dimHandler))
+        if (i == 0) {
+          num_batch = context->Value(dim_handler);
+        } else if (num_batch != context->Value(dim_handler)) {
           // TODO(jie): TensorRT engine requires consistent batch between inputs
           //            tensors. Segmenter should be aware of this.
           LOG(FATAL) << "TensorRT engine requires consistent batch size";
+        }
       }
     }
   }
 
-  // arrange input here
+  // Arrange input here
   std::vector<string> input_nodes;
-  c->GetAttr("input_nodes", &input_nodes);
+  context->GetAttr("input_nodes", &input_nodes);
 
-  // arrange output here
+  // Arrange output here
   std::vector<string> output_nodes;
-  c->GetAttr("output_nodes", &output_nodes);
+  context->GetAttr("output_nodes", &output_nodes);
   for (size_t i = 0; i < output_nodes.size(); i++) {
     int binding_index = trt_engine->getBindingIndex(output_nodes[i].c_str());
     ShapeHandle output_shape;
-    std::vector<DimensionHandle> vecDim;
-    vecDim.emplace_back(c->MakeDim(nbBatch));
+    std::vector<DimensionHandle> dim_vec;
+    dim_vec.emplace_back(context->MakeDim(num_batch));
     if (binding_index != -1) {
       auto dims = trt_engine->getBindingDimensions(binding_index);
-      for (int j = 0; j < dims.nbDims; j++)
-        vecDim.emplace_back(c->MakeDim(dims.d[j]));
+      for (int j = 0; j < dims.nbDims; j++) {
+        dim_vec.emplace_back(context->MakeDim(dims.d[j]));
+      }
     } else {
       LOG(FATAL) << "TensorRT engine cannot find binding: " << output_nodes[i];
     }
-    output_shape = c->MakeShape(vecDim);
-    c->set_output(i, output_shape);
+    output_shape = context->MakeShape(dim_vec);
+    context->set_output(i, output_shape);
   }
 
   return Status::OK();
 }
+
 }  // namespace shape_inference
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h
index f09b261139..9ca4ad0d55 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h
@@ -16,8 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_
 #define TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_
 
-#include "tensorflow/core/framework/shape_inference.h"
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/framework/shape_inference.h"
 
 namespace tensorflow {
 namespace shape_inference {
@@ -25,4 +27,7 @@ Status TRTEngineOpShapeInference(InferenceContext* c);
 }  // namespace shape_inference
 }  // namespace tensorflow
 
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index 38cdabdff0..a7c7e5bc9f 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -1,8 +1,19 @@
-/*
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
-  wrap trt_conversion
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
- */
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* wrap trt_conversion */
 %{
 #define SWIG_FILE_WITH_INIT
 %}
@@ -25,60 +36,65 @@
 %unignore trt_convert;
 
 %{
-  std::pair<string,string> trt_convert(string graph_def_string,//const tensorflow::GraphDef&
-		   std::vector<string> output_names,
-		   size_t max_batch_size,
-		   size_t max_workspace_size
-		   // unfortunately we can't use TF_Status here since it
-		   // is in c/c_api and brings in a lot of other libraries
-		   // which in turn declare ops. These ops are included
-		   // statically in our library and cause an abort when
-		   // module is loaded due to double registration
-		   // until Tensorflow properly exposes these headers
-		   // we have to work around this by returning a string
-		   // and converting it to exception on python side.
-		   //,TF_Status* out_status) {
-		   ) {
-    string out_status;
+std::pair<string, string> trt_convert(
+    string graph_def_string,  // The serialized GraphDef string.
+    std::vector<string> output_names,
+    size_t max_batch_size,
+    size_t max_workspace_size
+    // Unfortunately we can't use TF_Status here since it
+    // is in c/c_api and brings in a lot of other libraries
+    // which in turn declare ops. These ops are included
+    // statically in our library and cause an abort when
+    // module is loaded due to double registration
+    // until Tensorflow properly exposes these headers
+    // we have to work around this by returning a string
+    // and converting it to exception on python side.
+    //,TF_Status* out_status) {
+) {
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+  string out_status;
 
-    tensorflow::GraphDef graph_def;
-    if (!graph_def.ParseFromString(graph_def_string)) {
-      out_status="InvalidArgument;Couldn't interpret input as a GraphDef";
-      return std::pair<string,string>{out_status,""};
-    }
+  tensorflow::GraphDef graph_def;
+  if (!graph_def.ParseFromString(graph_def_string)) {
+    out_status = "InvalidArgument;Couldn't interpret input as a GraphDef";
+    return std::pair<string, string>{out_status, ""};
+  }
 
-    if (!output_names.size()) {
-      out_status="InvalidArgument;Size of the output_names vector is 0";
-      return std::pair<string,string>{out_status,""};
-      //return "";
-    }
-    tensorflow::GraphDef outGraph;
-    tensorflow::Status conversion_status =
-        tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(graph_def,
-						   output_names,
-						   max_batch_size,
-						   max_workspace_size,
-						   &outGraph);
-    if (!conversion_status.ok()) {
-      auto retCode=(int)conversion_status.code();
-      char buff[2000];
-      snprintf(buff,2000,"%d;%s",retCode,conversion_status.error_message().c_str());
-      out_status=buff;
-      return std::pair<string,string>{out_status,""};
-    }
-    string result;
-    if (!outGraph.SerializeToString(&result)) {
-      out_status="InvalidArgument;Couldn't serialize output as a GraphDef";
-      return std::pair<string,string>{out_status,""};
-    }
-    out_status="OK;All good!";
-    return std::pair<string,string>{out_status,result};
+  if (!output_names.size()) {
+    out_status = "InvalidArgument;Size of the output_names vector is 0";
+    return std::pair<string, string>{out_status, ""};
+    // return "";
+  }
+  tensorflow::GraphDef outGraph;
+  tensorflow::Status conversion_status =
+      tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(
+          graph_def, output_names, max_batch_size, max_workspace_size,
+          &outGraph);
+  if (!conversion_status.ok()) {
+    auto retCode = (int)conversion_status.code();
+    char buff[2000];
+    snprintf(buff, 2000, "%d;%s", retCode,
+             conversion_status.error_message().c_str());
+    out_status = buff;
+    return std::pair<string, string>{out_status, ""};
+  }
+  string result;
+  if (!outGraph.SerializeToString(&result)) {
+    out_status = "InvalidArgument;Couldn't serialize output as a GraphDef";
+    return std::pair<string, string>{out_status, ""};
   }
+  out_status = "OK;All good!";
+  return std::pair<string, string>{out_status, result};
+#else
+  // Returns FAILED_PRECONDITION.
+  return std::pair<string, string>{"9;TensorRT is not enabled!", ""};
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+}
 %}
 
-std::pair<string,string> trt_convert(string graph_def_string,
-				     std::vector<string> output_names,
-				     size_t max_batch_size,
-				     size_t max_workspace_size);
+std::pair<string, string> trt_convert(string graph_def_string,
+                                      std::vector<string> output_names,
+                                      size_t max_batch_size,
+                                      size_t max_workspace_size);
 
 %unignoreall
-- 
GitLab


From a58f26d7265428ef026877cb24cc6bbd7693687b Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Wed, 31 Jan 2018 05:20:47 +0900
Subject: [PATCH 391/966] Fix typo (#16583)

* fix typos

* fix typos

* fix typo

* fix typo

* fix typo

* fix typo

* fix typo

* tweak grammar
---
 tensorflow/contrib/coder/kernels/range_coder.cc             | 2 +-
 tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc | 6 +++---
 tensorflow/core/platform/s3/s3_file_system.h                | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/coder/kernels/range_coder.cc b/tensorflow/contrib/coder/kernels/range_coder.cc
index f4f076b6c4..21b35155ff 100644
--- a/tensorflow/contrib/coder/kernels/range_coder.cc
+++ b/tensorflow/contrib/coder/kernels/range_coder.cc
@@ -276,7 +276,7 @@ void RangeEncoder::Finalize(string* sink) {
     }
   } else if (base_ != 0) {
     // If base == 0, then pick 0 from [base, base + size) and no zeros are
-    // explcitly written.
+    // explicitly written.
     //
     // Otherwise, pick (base + (2^16 - base[16:0])), i.e., round up base to the
     // next multiple of 2^16. As 2^16 < size, this value should be in the
diff --git a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
index 31e565027f..92879ab535 100644
--- a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
+++ b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
@@ -246,9 +246,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[          1,         20,          3]
-                        [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE]
-                        [          1,          5,          3]].
+the output will be [[          1,         20,          3]
+                    [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE]
+                    [          1,          5,          3]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h
index d0d6bb5949..8177e48dba 100644
--- a/tensorflow/core/platform/s3/s3_file_system.h
+++ b/tensorflow/core/platform/s3/s3_file_system.h
@@ -63,7 +63,7 @@ class S3FileSystem : public FileSystem {
   // variables.
   // By default S3 access regional endpoint, with region
   // controlled by `AWS_REGION`. The endpoint could be overridden
-  // with explicity `S3_ENDPOINT`. S3 use HTTPS by default.
+  // explicitly with `S3_ENDPOINT`. S3 uses HTTPS by default.
   // If S3_USE_HTTPS=0 is specified, HTTP is used. Also,
   // S3_VERIFY_SSL=0 could disable SSL verification in case
   // HTTPS is used.
-- 
GitLab


From 7149a2e2e2f549035f23e21224ee41afe8df3876 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 10:05:04 -0800
Subject: [PATCH 392/966] Cleanup: Ran clang-format on files in
 tensorflow/core/.../*.{cc,h}.

PiperOrigin-RevId: 183848459
---
 .../common_runtime/accumulate_n_optimizer.cc  |   3 -
 .../core/common_runtime/bfc_allocator.h       |   8 +-
 .../core/common_runtime/device_set_test.cc    |   4 +-
 .../core/common_runtime/direct_session.cc     |   7 +-
 .../common_runtime/direct_session_test.cc     |  68 ++--
 ...direct_session_with_tracking_alloc_test.cc |   8 +-
 tensorflow/core/common_runtime/executor.cc    |   2 +-
 tensorflow/core/common_runtime/function.cc    |   2 +-
 .../gpu/gpu_bfc_allocator_test.cc             |   5 +-
 .../core/common_runtime/gpu/gpu_device.cc     |   5 +-
 .../gpu/gpu_stream_util_test.cc               |   3 +-
 .../core/common_runtime/gpu/process_state.cc  |   4 +-
 .../common_runtime/graph_execution_state.h    |   4 +-
 .../core/common_runtime/memory_types.cc       |   2 +-
 .../core/common_runtime/memory_types_test.cc  |   6 +-
 tensorflow/core/common_runtime/placer.h       |   6 +-
 tensorflow/core/common_runtime/placer_test.cc |   6 +-
 .../core/common_runtime/session_factory.cc    |   4 +-
 .../common_runtime/step_stats_collector.cc    |  19 +-
 .../common_runtime/sycl/sycl_allocator.cc     |   2 +-
 .../core/common_runtime/sycl/sycl_allocator.h |   5 +-
 .../core/common_runtime/sycl/sycl_device.h    |   6 +-
 .../sycl/sycl_device_factory.cc               |  15 +-
 .../core/common_runtime/sycl/sycl_util.h      |   2 +-
 tensorflow/core/debug/debug_gateway.cc        |  53 +--
 tensorflow/core/debug/debug_gateway_test.cc   |  32 +-
 tensorflow/core/debug/debug_grpc_testlib.cc   |   2 +-
 tensorflow/core/debug/debug_io_utils_test.cc  |   3 +-
 .../core/distributed_runtime/graph_mgr.h      |   2 +-
 tensorflow/core/distributed_runtime/master.cc |   4 +-
 .../core/distributed_runtime/master_test.cc   |   4 +-
 .../rpc/grpc_master_service.cc                |  33 +-
 .../rpc/grpc_master_service_impl.h            |   6 +-
 .../rpc/grpc_remote_master.cc                 |   8 +-
 .../rpc/grpc_testlib_ops.cc                   |  14 +-
 .../core/distributed_runtime/rpcbench_test.cc |   4 +-
 .../core/distributed_runtime/scheduler.cc     |   2 +-
 .../core/distributed_runtime/scheduler.h      |   4 +-
 .../worker_cache_logger.cc                    |   5 +-
 tensorflow/core/graph/costmodel.cc            |   4 +-
 tensorflow/core/graph/costmodel.h             |   4 +-
 tensorflow/core/graph/graph.h                 |   4 +-
 .../core/graph/graph_def_builder_test.cc      |   1 -
 tensorflow/core/graph/mkl_graph_util.h        | 171 +++++----
 tensorflow/core/graph/mkl_layout_pass.cc      | 348 +++++++++---------
 tensorflow/core/graph/mkl_layout_pass_test.cc | 107 ++++--
 .../core/graph/mkl_tfconversion_pass.cc       |  22 +-
 tensorflow/core/graph/testlib.cc              |   2 +-
 .../kernels/batching_util/periodic_function.h |   2 +-
 .../shared_batch_scheduler_test.cc            |  25 +-
 .../core/kernels/data/batch_dataset_op.cc     |   1 -
 .../core/kernels/data/captured_function.cc    |   4 +-
 .../core/kernels/data/skip_dataset_op.cc      |  11 +-
 .../kernels/fuzzing/decode_base64_fuzz.cc     |   2 +-
 .../core/kernels/fuzzing/decode_jpeg_fuzz.cc  |   2 +-
 .../fuzzing/decode_json_example_fuzz.cc       |   2 +-
 .../core/kernels/fuzzing/decode_png_fuzz.cc   |   2 +-
 .../kernels/fuzzing/encode_base64_fuzz.cc     |   2 +-
 .../core/kernels/fuzzing/encode_jpeg_fuzz.cc  |   2 +-
 .../example_proto_fast_parsing_fuzz.cc        |   2 +-
 .../core/kernels/fuzzing/identity_fuzz.cc     |   2 +-
 .../kernels/fuzzing/parse_tensor_op_fuzz.cc   |   2 +-
 .../core/kernels/fuzzing/string_split_fuzz.cc |   2 +-
 .../kernels/fuzzing/string_to_number_fuzz.cc  |   2 +-
 .../kernels/hexagon/graph_transfer_utils.cc   |   2 +-
 .../core/kernels/hexagon/graph_transferer.h   |   4 +-
 .../kernels/hexagon/graph_transferer_test.cc  |   5 +-
 .../hexagon/hexagon_graph_execution_test.cc   |   8 +-
 .../kernels/neon/neon_depthwise_conv_op.cc    |   8 +-
 tensorflow/core/lib/core/status.h             |   2 +-
 tensorflow/core/lib/core/threadpool.cc        |   4 +-
 tensorflow/core/lib/core/threadpool_test.cc   |   4 +-
 tensorflow/core/lib/db/sqlite.h               |  23 +-
 tensorflow/core/lib/db/sqlite_test.cc         |   8 +-
 tensorflow/core/lib/gtl/cleanup.h             |  13 +-
 tensorflow/core/lib/gtl/cleanup_test.cc       |  14 +-
 tensorflow/core/lib/gtl/inlined_vector.h      |   4 +-
 tensorflow/core/lib/gtl/int_type.h            |  34 +-
 tensorflow/core/lib/gtl/int_type_test.cc      |   6 +-
 tensorflow/core/lib/gtl/optional.h            |  14 +-
 tensorflow/core/lib/gtl/optional_test.cc      |  26 +-
 tensorflow/core/lib/gtl/top_n_test.cc         |   2 +-
 tensorflow/core/lib/io/compression.cc         |   6 +-
 tensorflow/core/lib/io/compression.h          |   6 +-
 tensorflow/core/lib/io/record_reader.cc       |   2 +-
 tensorflow/core/lib/io/recordio_test.cc       |   4 +-
 tensorflow/core/lib/png/png_io.cc             |   4 +-
 .../lib/random/philox_random_test_utils.h     |   4 +-
 .../core/lib/random/random_distributions.h    |   3 +-
 .../lib/random/random_distributions_test.cc   |   4 +-
 tensorflow/core/lib/strings/ordered_code.cc   |   3 +-
 tensorflow/core/lib/strings/strcat.h          |   2 +-
 .../compat/backwards_compatibility_test.cc    |   5 +-
 .../core/platform/cloud/gcs_dns_cache.cc      |   2 +-
 .../core/platform/cloud/http_request_fake.h   |   3 +-
 .../core/platform/cloud/oauth_client_test.cc  |  12 +-
 .../platform/cloud/retrying_file_system.cc    |   1 -
 .../core/platform/cuda_libdevice_path_test.cc |   3 +-
 .../core/platform/default/device_tracer.cc    |   6 +-
 tensorflow/core/platform/default/logging.cc   |   5 +-
 tensorflow/core/platform/default/logging.h    |  26 +-
 tensorflow/core/platform/denormal.cc          |   4 +-
 .../core/platform/device_tracer_test.cc       |   3 +-
 tensorflow/core/platform/env.h                |   3 +-
 tensorflow/core/platform/file_system.cc       |  25 +-
 tensorflow/core/platform/gif.h                |   3 +-
 .../platform/hadoop/hadoop_file_system.cc     |   5 +-
 tensorflow/core/platform/jpeg.h               |   3 +-
 tensorflow/core/platform/png.h                |   3 +-
 tensorflow/core/platform/posix/error.cc       |  32 +-
 .../android_armv7a_cpu_utils_helper.h         |   4 +-
 .../core/platform/profile_utils/cpu_utils.cc  |  16 +-
 .../core/platform/profile_utils/cpu_utils.h   |  18 +-
 .../platform/profile_utils/cpu_utils_test.cc  |  20 +-
 .../profile_utils/i_cpu_utils_helper.h        |   4 +-
 tensorflow/core/platform/protobuf_internal.h  |   4 +-
 tensorflow/core/platform/s3/aws_logging.cc    |   2 +-
 tensorflow/core/platform/setround.cc          |   1 -
 tensorflow/core/platform/test_benchmark.h     |   2 +-
 tensorflow/core/platform/windows/env.cc       |  26 +-
 tensorflow/core/platform/windows/error.cc     |   2 +-
 tensorflow/core/platform/windows/error.h      |   4 +-
 .../core/platform/windows/integral_types.h    |  30 +-
 tensorflow/core/platform/windows/net.cc       |  18 +-
 tensorflow/core/platform/windows/subprocess.h |   3 +-
 tensorflow/core/platform/windows/test.cc      |   2 +-
 .../platform/windows/windows_file_system.cc   |  96 ++---
 .../platform/windows/windows_file_system.h    |  40 +-
 .../internal/advisor/tfprof_advisor_test.cc   |   9 +-
 .../core/profiler/internal/tfprof_op.cc       |  16 +-
 tensorflow/core/profiler/internal/tfprof_op.h |   5 +-
 .../core/profiler/internal/tfprof_show.h      |  71 ++--
 .../profiler/internal/tfprof_show_multi.h     |   2 +-
 .../core/profiler/internal/tfprof_timeline.h  |   2 +-
 tensorflow/core/util/bcast.cc                 |   6 +-
 .../core/util/ctc/ctc_loss_calculator.h       |  21 +-
 .../core/util/cuda_kernel_helper_test.cu.cc   | 112 +++---
 tensorflow/core/util/example_proto_helper.cc  |  10 +-
 .../core/util/memmapped_file_system_test.cc   |   4 +-
 tensorflow/core/util/mkl_util.h               | 104 +++---
 tensorflow/core/util/presized_cuckoo_map.h    |   2 +-
 tensorflow/core/util/reporter_test.cc         |   4 +-
 tensorflow/core/util/sparse/sparse_tensor.h   |   6 +-
 .../core/util/sparse/sparse_tensor_test.cc    |   2 +-
 tensorflow/core/util/stream_executor_util.h   |   7 +-
 .../core/util/tensor_bundle/tensor_bundle.cc  |   2 +-
 .../core/util/tensor_slice_reader_cache.cc    |   2 +-
 tensorflow/core/util/tensor_slice_set.cc      |   6 +-
 tensorflow/core/util/tensor_slice_util.h      |   6 +-
 tensorflow/core/util/tensor_slice_writer.h    |  10 +-
 150 files changed, 1101 insertions(+), 1063 deletions(-)

diff --git a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
index a1e3b21e4f..832a55f255 100644
--- a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
+++ b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
@@ -13,11 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/graph/node_builder.h"
 
-
 namespace tensorflow {
 namespace {
 
@@ -44,7 +42,6 @@ Tensor make_zeros(const DataType& dtype, const TensorShapeProto& shape) {
 // third-party libraries aren't currently supported.
 class AccumulateNV2RemovePass : public GraphOptimizationPass {
  public:
-
   Status Run(const GraphOptimizationPassOptions& options) override {
     // TODO(freiss.oss@gmail.com): Substantial shared code with
     // ParallelConcatRemovePass::Run(). Consider refactoring if someone makes
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 3dd011a58e..9353997753 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -127,10 +127,10 @@ class BFCAllocator : public VisitableAllocator {
     string DebugString(BFCAllocator* a,
                        bool recurse) NO_THREAD_SAFETY_ANALYSIS {
       string dbg;
-      strings::StrAppend(&dbg, "  Size: ", strings::HumanReadableNumBytes(size),
-                         " | Requested Size: ",
-                         strings::HumanReadableNumBytes(requested_size),
-                         " | in_use: ", in_use());
+      strings::StrAppend(
+          &dbg, "  Size: ", strings::HumanReadableNumBytes(size),
+          " | Requested Size: ", strings::HumanReadableNumBytes(requested_size),
+          " | in_use: ", in_use());
       if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
         Chunk* p = a->ChunkFromHandle(prev);
         strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc
index 0507076c8c..fd9c4222a7 100644
--- a/tensorflow/core/common_runtime/device_set_test.cc
+++ b/tensorflow/core/common_runtime/device_set_test.cc
@@ -88,7 +88,9 @@ TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) {
   // D3 is prioritized below D1.
   AddDevice("d3", "/job:a/replica:0/task:0/device:d3:0");
   EXPECT_EQ((std::vector<DeviceType>{
-                DeviceType("d2"), DeviceType("d1"), DeviceType("d3"),
+                DeviceType("d2"),
+                DeviceType("d1"),
+                DeviceType("d3"),
             }),
             types());
 }
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 20c59ad42b..df6f4b8877 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -61,7 +61,6 @@ limitations under the License.
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/env_var.h"
 
-
 namespace tensorflow {
 
 namespace {
@@ -472,9 +471,9 @@ Status DirectSession::Run(const RunOptions& run_options,
   Executor::Args args;
   args.step_id = step_id_counter_.fetch_add(1);
 
-  TF_RETURN_IF_ERROR(
-      GetOrCreateExecutors(input_tensor_names, output_names, target_nodes,
-                           &executors_and_keys, &run_state_args));
+  TF_RETURN_IF_ERROR(GetOrCreateExecutors(input_tensor_names, output_names,
+                                          target_nodes, &executors_and_keys,
+                                          &run_state_args));
   const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
 
   std::unique_ptr<DebuggerStateInterface> debugger_state;
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 99b33e2ef0..b75a4f76d9 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -436,10 +436,7 @@ TEST(DirectSessionTest, FetchMultipleTimes) {
   }
 }
 
-REGISTER_OP("Darth")
-    .Input("x: float")
-    .Output("y: float")
-    .Doc(R"doc(
+REGISTER_OP("Darth").Input("x: float").Output("y: float").Doc(R"doc(
 Darth promises one return value.
 
 x: float
@@ -972,39 +969,38 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib,
 
   std::atomic<int32> num_done(0);
   // Runs session to compute <node>:0 using inter_op thread pool <pool>.
-  auto add_session_run_call = [use_global_pools, &def, &options, &sessions,
-                               &sessions_mu,
-                               &num_done](thread::ThreadPool* tp, Node* node,
-                                          int inter_op_pool) {
-    auto fn = [use_global_pools, &def, &options, &sessions, &sessions_mu,
-               inter_op_pool, node, &num_done]() {
-      RunOptions run_options;
-      run_options.set_inter_op_thread_pool(inter_op_pool);
-      std::vector<Tensor> outputs;
-
-      Session* session;
-      if (use_global_pools) {
-        std::unique_ptr<Session> s(NewSession(options));
-        TF_ASSERT_OK(s->Create(def));
-        session = s.get();
-
-        mutex_lock l(sessions_mu);
-        sessions.emplace_back(std::move(s));
-      } else {
-        session = sessions[0].get();
-      }
+  auto add_session_run_call =
+      [use_global_pools, &def, &options, &sessions, &sessions_mu, &num_done](
+          thread::ThreadPool* tp, Node* node, int inter_op_pool) {
+        auto fn = [use_global_pools, &def, &options, &sessions, &sessions_mu,
+                   inter_op_pool, node, &num_done]() {
+          RunOptions run_options;
+          run_options.set_inter_op_thread_pool(inter_op_pool);
+          std::vector<Tensor> outputs;
+
+          Session* session;
+          if (use_global_pools) {
+            std::unique_ptr<Session> s(NewSession(options));
+            TF_ASSERT_OK(s->Create(def));
+            session = s.get();
+
+            mutex_lock l(sessions_mu);
+            sessions.emplace_back(std::move(s));
+          } else {
+            session = sessions[0].get();
+          }
 
-      Status s = session->Run(run_options, {} /* inputs */,
-                              {node->name() + ":0"} /* output_names */, {},
-                              &outputs, nullptr /* run_metadata */);
-      TF_CHECK_OK(s);
-      ASSERT_EQ(1, outputs.size());
-      auto flat = outputs[0].flat<float>();
-      EXPECT_FLOAT_EQ(1.2, flat(0));
-      num_done.fetch_add(1);
-    };
-    tp->Schedule(fn);
-  };
+          Status s = session->Run(run_options, {} /* inputs */,
+                                  {node->name() + ":0"} /* output_names */, {},
+                                  &outputs, nullptr /* run_metadata */);
+          TF_CHECK_OK(s);
+          ASSERT_EQ(1, outputs.size());
+          auto flat = outputs[0].flat<float>();
+          EXPECT_FLOAT_EQ(1.2, flat(0));
+          num_done.fetch_add(1);
+        };
+        tp->Schedule(fn);
+      };
 
   // For blocking states:
   // - Starts at 0, BlockingOp::Compute will move to 1.
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index df9cf0c91f..31fb128f93 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -161,14 +161,14 @@ static void TestHWAccelerator(bool enableHWTrace) {
   x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
 #ifdef TENSORFLOW_USE_SYCL
   x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
   // y = A * x
   Node* y = test::graph::Matmul(&graph, a, x, false, false);
   y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
 #ifdef TENSORFLOW_USE_SYCL
-y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
-#endif // TENSORFLOW_USE_SYCL
+  y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
+#endif  // TENSORFLOW_USE_SYCL
 
   Node* y_neg = test::graph::Unary(&graph, "Neg", y);
   y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
@@ -181,7 +181,7 @@ y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
   (*options.config.mutable_device_count())["GPU"] = 1;
 #ifdef TENSORFLOW_USE_SYCL
   (*options.config.mutable_device_count())["SYCL"] = 1;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   options.config.set_allow_soft_placement(true);
   options.config.mutable_graph_options()->set_build_cost_model(1);
   std::unique_ptr<Session> session(NewSession(options));
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 9d03caff1e..f515590b28 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1609,7 +1609,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
         auto done = [this, state]() {
           Device* device = impl_->params_.device;
           NodeExecStatsWrapper* stats = state->stats;  // Shorthand
-          Entry* first_input = state->first_input;  // Shorthand
+          Entry* first_input = state->first_input;     // Shorthand
 
           nodestats::SetOpEnd(stats);
           EntryVector outputs;
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index e9c4328f29..150fb85c70 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -205,7 +205,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   // The instantiated and transformed function is encoded as a Graph
   // object, and an executor is created for the graph.
   struct Item : public core::RefCounted {
-    const Graph* graph = nullptr;  // Owned by exec.
+    const Graph* graph = nullptr;                            // Owned by exec.
     const FunctionLibraryDefinition* overlay_lib = nullptr;  // Not owned.
     FunctionBody* func_graph = nullptr;
     Executor* exec = nullptr;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 9e4b617d2b..67caeb3495 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -154,8 +154,9 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
     a.DeallocateRaw(t3);
     a.DeallocateRaw(t4);
   }
-  CheckStats(&a, 4097, 0, 1024 * sizeof(float) + 1048576 * sizeof(int64) +
-                              2048 * sizeof(double) + 10485760 * sizeof(float),
+  CheckStats(&a, 4097, 0,
+             1024 * sizeof(float) + 1048576 * sizeof(int64) +
+                 2048 * sizeof(double) + 10485760 * sizeof(float),
              10485760 * sizeof(float));
 
   // At the end, we should have coalesced all memory into one region
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 0e5b6b7ef8..e26cb0fc3e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -763,8 +763,9 @@ int64 MinSystemMemory(int64 available_memory) {
   min_system_memory *= 2;
 #endif
 #if defined(NVIDIA_TEGRA)
-  // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM and Video RAM
-  min_system_memory = 1<<30;
+  // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM
+  // and Video RAM
+  min_system_memory = 1 << 30;
 #endif
   return min_system_memory;
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
index 7763a4f2e6..2500425359 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
@@ -108,7 +108,8 @@ TEST_F(GpuStreamUtilTest, StreamOverrides) {
   ops::_Recv(root.WithOpName("input"), DT_FLOAT, "input", "/cpu:0", 0,
              "/device:GPU:0");
   Output n = ops::MatMul(root, {}, {});
-  ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0, "/cpu:0");
+  ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0,
+             "/cpu:0");
   Graph g(OpRegistry::Global());
   TF_ASSERT_OK(root.ToGraph(&g));
 
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 2f13cf8bd7..b195de7cba 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -88,8 +88,8 @@ ProcessState::~ProcessState() {
 }
 
 string ProcessState::MemDesc::DebugString() {
-  return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, ", dma: ",
-                         gpu_registered, ", nic: ", nic_registered);
+  return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index,
+                         ", dma: ", gpu_registered, ", nic: ", nic_registered);
 }
 
 ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h
index db2686ce2c..2312e1a89f 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.h
+++ b/tensorflow/core/common_runtime/graph_execution_state.h
@@ -139,9 +139,7 @@ class GraphExecutionState {
 
   // The graph returned by BuildGraph may contain only the pruned
   // graph, whereas some clients may want access to the full graph.
-  const Graph* full_graph() {
-    return graph_;
-  }
+  const Graph* full_graph() { return graph_; }
 
   // Returns the node with the given name, or null if it does not exist.
   const Node* get_node_by_name(const string& name) const {
diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc
index 76b926ba40..090a16ebeb 100644
--- a/tensorflow/core/common_runtime/memory_types.cc
+++ b/tensorflow/core/common_runtime/memory_types.cc
@@ -47,7 +47,7 @@ struct EndpointEq {
 static Status ProcessMemoryTypes(
     const DeviceType& device_type, const Graph* g,
     const std::function<Status(const Edge*, MemoryType, MemoryType)>& fn) {
-  if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL ) {
+  if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL) {
     // On non-GPU and non-SYCL devices, HOST_MEMORY and DEVICE_MEMORY are always
     // compatible.
     return Status::OK();
diff --git a/tensorflow/core/common_runtime/memory_types_test.cc b/tensorflow/core/common_runtime/memory_types_test.cc
index 2a834ddca4..a093585571 100644
--- a/tensorflow/core/common_runtime/memory_types_test.cc
+++ b/tensorflow/core/common_runtime/memory_types_test.cc
@@ -36,7 +36,7 @@ TEST(MemoryTypeChecker, Int32OK) {
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g));
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   delete g;
 }
 
@@ -64,7 +64,7 @@ TEST(MemoryTypeChecker, Int32NotOk) {
   // But we can insert _HostSend/_HostRecv to ensure the invariant.
   TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_SYCL, "/device:SYCL:0", g));
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g));
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   delete g;
 }
 
@@ -91,7 +91,7 @@ TEST(MemoryTypeChecker, MemoryTypeForOutput) {
   TF_EXPECT_OK(MemoryTypeForOutput(DEVICE_SYCL, g, si, 0, &memory_type));
   // int Switch's output on GPU has HOST_MEMORY constraint.
   EXPECT_EQ(memory_type, HOST_MEMORY);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   delete g;
 }
 
diff --git a/tensorflow/core/common_runtime/placer.h b/tensorflow/core/common_runtime/placer.h
index c5b76592e1..75dce7c7fe 100644
--- a/tensorflow/core/common_runtime/placer.h
+++ b/tensorflow/core/common_runtime/placer.h
@@ -88,9 +88,9 @@ class Placer {
   void AssignAndLog(int assigned_device, Node* node) const;
   void LogDeviceAssignment(const Node* node) const;
 
-  Graph* const graph_;                           // Not owned.
-  const DeviceSet* const devices_;               // Not owned.
-  const SessionOptions* options_;                // Not owned.
+  Graph* const graph_;              // Not owned.
+  const DeviceSet* const devices_;  // Not owned.
+  const SessionOptions* options_;   // Not owned.
   const bool log_device_placement_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Placer);
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 5d87b1e279..02c9cd5313 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -619,9 +619,9 @@ TEST_F(PlacerTest, TestReferenceConnectionIgnoreInfeasible) {
     Node* input = ops::SourceOp(
         "TestDevice",
         b.opts().WithName("in").WithDevice("/job:a/task:0/device:fakegpu:0"));
-    Node* var = ops::SourceOp("TestVariable",
-                              b.opts().WithName("var_0").WithDevice(
-                                  "/job:a/task:0/device:fakegpu:0"));
+    Node* var =
+        ops::SourceOp("TestVariable", b.opts().WithName("var_0").WithDevice(
+                                          "/job:a/task:0/device:fakegpu:0"));
 
     // This op is specified on CPU, but in practice will be ignored,
     // because the reference edges forces it on GPU.
diff --git a/tensorflow/core/common_runtime/session_factory.cc b/tensorflow/core/common_runtime/session_factory.cc
index 0234d4c372..4dbe113e44 100644
--- a/tensorflow/core/common_runtime/session_factory.cc
+++ b/tensorflow/core/common_runtime/session_factory.cc
@@ -60,8 +60,8 @@ const string RegisteredFactoriesErrorMessageLocked() {
                          str_util::Join(factory_types, ", "), "}.");
 }
 string SessionOptionsToString(const SessionOptions& options) {
-  return strings::StrCat("target: \"", options.target, "\" config: ",
-                         ProtoShortDebugString(options.config));
+  return strings::StrCat("target: \"", options.target,
+                         "\" config: ", ProtoShortDebugString(options.config));
 }
 }  // namespace
 
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index d7e01144c9..cb900db10a 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -226,22 +226,23 @@ void StepStatsCollector::BuildCostModel(
       if (node) {
         for (int i = 0; i < stats.output_size(); ++i) {
           const auto& output = stats.output(i);
-          cm->RecordMaxMemorySize(node, i, Bytes(output.tensor_description()
-                                                     .allocation_description()
-                                                     .allocated_bytes()),
+          cm->RecordMaxMemorySize(node, i,
+                                  Bytes(output.tensor_description()
+                                            .allocation_description()
+                                            .allocated_bytes()),
                                   stats.output(i).tensor_description().shape(),
                                   node->output_types()[i]);
-          cm->RecordAllocationId(node, i, output.tensor_description()
-                                              .allocation_description()
-                                              .allocation_id());
+          cm->RecordAllocationId(node, i,
+                                 output.tensor_description()
+                                     .allocation_description()
+                                     .allocation_id());
         }
         cm->RecordMemoryStats(node, stats.memory_stats());
         // Use hardware stats to record the execution time if they're available,
         // otherwise use the regular (less accurate) stats
         string node_name = dev_stats.regular_stats->node_stats(i).node_name();
-        if (dev_stats.hardware_stats &&
-            name_to_hw_node_stats.find(node_name) !=
-                name_to_hw_node_stats.end()) {
+        if (dev_stats.hardware_stats && name_to_hw_node_stats.find(node_name) !=
+                                            name_to_hw_node_stats.end()) {
           const NodeExecStats& hw_stats = name_to_hw_node_stats[node_name];
           cm->RecordMaxExecutionTime(
               node, Microseconds(hw_stats.op_end_rel_micros()));
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
index 9094824ee7..02bd8b8f3b 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
@@ -80,7 +80,7 @@ void SYCLAllocator::ClearStats() override {
 
 size_t SYCLAllocator::RequestedSize(void* ptr) {
   mutex_lock lock(mu_);
-  if(!sycl_device_) {
+  if (!sycl_device_) {
     return 0;
   }
   const auto& buffer = sycl_device_->get_sycl_buffer(ptr);
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
index cca9f92c62..550f193332 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
@@ -20,10 +20,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
 #define TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -56,14 +56,13 @@ class SYCLAllocator : public Allocator {
   // Clear the SYCL device used by the Allocator
   void ClearSYCLDevice() {
     mutex_lock lock(mu_);
-    if(sycl_device_) {
+    if (sycl_device_) {
       delete sycl_device_;
       sycl_device_ = nullptr;
     }
   }
 
  private:
-
   mutable mutex mu_;
   Eigen::SyclDevice* sycl_device_ GUARDED_BY(mu_);  // owned
   AllocatorStats stats_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index cc272d156e..7c09e0b8f1 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -187,9 +187,9 @@ class GSYCLInterface {
       type = "Unknown";
     }
 
-    return strings::StrCat("id: ", device_id, ", type: ", type, ", name: ",
-                           name.c_str(), ", vendor: ", vendor.c_str(),
-                           ", profile: ", profile.c_str());
+    return strings::StrCat(
+        "id: ", device_id, ", type: ", type, ", name: ", name.c_str(),
+        ", vendor: ", vendor.c_str(), ", profile: ", profile.c_str());
   }
 };
 
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
index 19c14770dc..14f7727659 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
@@ -26,7 +26,6 @@ class SYCLDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions &options, const string &name_prefix,
                        std::vector<Device *> *devices) override {
-
     auto syclInterface = GSYCLInterface::instance();
 
     size_t n = 1;
@@ -37,13 +36,11 @@ class SYCLDeviceFactory : public DeviceFactory {
 
     for (int i = 0; i < n; i++) {
       string name = strings::StrCat(name_prefix, "/device:SYCL:", i);
-      devices->push_back(
-          new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality()
-                         , syclInterface->GetShortDeviceDescription(i)
-                         , syclInterface->GetSYCLAllocator(i)
-                         , syclInterface->GetCPUAllocator(i)
-                         , syclInterface->GetSYCLContext(i))
-                       );
+      devices->push_back(new SYCLDevice(
+          options, name, Bytes(256 << 20), DeviceLocality(),
+          syclInterface->GetShortDeviceDescription(i),
+          syclInterface->GetSYCLAllocator(i), syclInterface->GetCPUAllocator(i),
+          syclInterface->GetSYCLContext(i)));
     }
 
     return Status::OK();
@@ -51,6 +48,6 @@ class SYCLDeviceFactory : public DeviceFactory {
 };
 
 REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory, 200);
-}
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/sycl/sycl_util.h b/tensorflow/core/common_runtime/sycl/sycl_util.h
index 83016b706a..3124ed23c9 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_util.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_util.h
@@ -20,8 +20,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
 
-#include "tensorflow/core/common_runtime/device.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/device.h"
 // For DMA helper
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/core/debug/debug_gateway.cc b/tensorflow/core/debug/debug_gateway.cc
index 616ced3d0f..2e1aabd1cc 100644
--- a/tensorflow/core/debug/debug_gateway.cc
+++ b/tensorflow/core/debug/debug_gateway.cc
@@ -24,31 +24,31 @@ limitations under the License.
 namespace tensorflow {
 
 DebugGateway::DebugGateway(DirectSession* session) : session_(session) {
-  session_->node_outputs_callback_ = [this](
-      const string& node_name, const int output_slot, const Tensor* tensor,
-      const bool is_ref, OpKernelContext* ctx) {
-    if (comp_cb_ != nullptr && output_slot <= 0) {
-      // The node completion callback is invoked once for a node regardless
-      // of whether the node has zero, one or more outputs.
-      // The output_slot can be negative (-1, or kControlSlot) if
-      // node_outputs_callback_ is invoked for a node with no output. If that
-      // is the case, notify the callback that the node in question has no
-      // output.
-      comp_cb_(node_name, output_slot == 0);
-    }
-
-    // Copy tensor values (e.g., from GPU to host) only if the
-    // value callback is not nullptr.
-    if (val_cb_ != nullptr && output_slot >= 0) {
-      CopyTensor(
-          node_name, output_slot, tensor, ctx,
-          [this, node_name, output_slot, is_ref](const Tensor* copied_tensor) {
-            val_cb_(node_name, output_slot, *copied_tensor, is_ref);
-          });
-    }
-
-    return Status::OK();
-  };
+  session_->node_outputs_callback_ =
+      [this](const string& node_name, const int output_slot,
+             const Tensor* tensor, const bool is_ref, OpKernelContext* ctx) {
+        if (comp_cb_ != nullptr && output_slot <= 0) {
+          // The node completion callback is invoked once for a node regardless
+          // of whether the node has zero, one or more outputs.
+          // The output_slot can be negative (-1, or kControlSlot) if
+          // node_outputs_callback_ is invoked for a node with no output. If
+          // that is the case, notify the callback that the node in question has
+          // no output.
+          comp_cb_(node_name, output_slot == 0);
+        }
+
+        // Copy tensor values (e.g., from GPU to host) only if the
+        // value callback is not nullptr.
+        if (val_cb_ != nullptr && output_slot >= 0) {
+          CopyTensor(node_name, output_slot, tensor, ctx,
+                     [this, node_name, output_slot,
+                      is_ref](const Tensor* copied_tensor) {
+                       val_cb_(node_name, output_slot, *copied_tensor, is_ref);
+                     });
+        }
+
+        return Status::OK();
+      };
 }
 
 DebugGateway::~DebugGateway() {
@@ -86,7 +86,8 @@ void DebugGateway::CopyTensor(const string& node_name, const int output_slot,
     // Determine if the tensor is on device (GPU) or host (CPU).
     // The second part of the check is necessary because even an OpKernel on
     // may have output tensors allocated on CPU.
-    if ((device->name().find("GPU:") != string::npos || device->name().find("SYCL:") != string::npos) &&
+    if ((device->name().find("GPU:") != string::npos ||
+         device->name().find("SYCL:") != string::npos) &&
         !ctx->output_alloc_attr(output_slot).on_host()) {
       // GPU tensors: Copy it to host (CPU).
       DeviceContext* device_ctxt = ctx->op_device_context();
diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc
index 5758334906..b1bbd3f698 100644
--- a/tensorflow/core/debug/debug_gateway_test.cc
+++ b/tensorflow/core/debug/debug_gateway_test.cc
@@ -390,9 +390,9 @@ TEST_F(SessionDebugMinusAXTest,
   debug_gateway.SetNodeValueCallback(
       [this, &mu, &val_callback_count, &a_debug_identity_node_name,
        &x_debug_identity_node_name, &y_debug_identity_node_name,
-       &debug_identity_tensor_vals, &callbacks_done, &kConcurrentRuns](
-           const string& node_name, const int output_slot,
-           const Tensor& tensor_value, const bool is_ref) {
+       &debug_identity_tensor_vals, &callbacks_done,
+       &kConcurrentRuns](const string& node_name, const int output_slot,
+                         const Tensor& tensor_value, const bool is_ref) {
         mutex_lock l(mu);
 
         if (node_name == a_debug_identity_node_name && output_slot == 0) {
@@ -560,21 +560,21 @@ TEST_F(SessionDebugOutputSlotWithoutOutgoingEdgeTest,
   Notification callbacks_done;
 
   std::vector<Tensor> debug_identity_tensor_vals;
-  debug_gateway.SetNodeValueCallback([this, &mu, &callbacks_done,
-                                      &debug_identity_node_name,
-                                      &debug_identity_tensor_vals](
-      const string& node_name, const int output_slot,
-      const Tensor& tensor_value, const bool is_ref) {
-    mutex_lock l(mu);
+  debug_gateway.SetNodeValueCallback(
+      [this, &mu, &callbacks_done, &debug_identity_node_name,
+       &debug_identity_tensor_vals](
+          const string& node_name, const int output_slot,
+          const Tensor& tensor_value, const bool is_ref) {
+        mutex_lock l(mu);
 
-    if (node_name == debug_identity_node_name && output_slot == 0) {
-      debug_identity_tensor_vals.push_back(tensor_value);
+        if (node_name == debug_identity_node_name && output_slot == 0) {
+          debug_identity_tensor_vals.push_back(tensor_value);
 
-      if (!callbacks_done.HasBeenNotified()) {
-        callbacks_done.Notify();
-      }
-    }
-  });
+          if (!callbacks_done.HasBeenNotified()) {
+            callbacks_done.Notify();
+          }
+        }
+      });
 
   // Add DebugIdentity watch on c:0, which does not have an outgoing edge.
   RunOptions run_opts;
diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc
index a312f789d8..f70931e926 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.cc
+++ b/tensorflow/core/debug/debug_grpc_testlib.cc
@@ -30,7 +30,7 @@ namespace test {
 
 ::grpc::Status TestEventListenerImpl::SendEvents(
     ::grpc::ServerContext* context,
-    ::grpc::ServerReaderWriter< ::tensorflow::EventReply, ::tensorflow::Event>*
+    ::grpc::ServerReaderWriter<::tensorflow::EventReply, ::tensorflow::Event>*
         stream) {
   Event event;
 
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 2f83c2415b..0807a85b8b 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -57,7 +57,8 @@ class DebugIOUtilsTest : public ::testing::Test {
 TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) {
   DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/device:GPU:2",
                               "hidden_1/MatMul", 0, "DebugIdentity");
-  EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2", debug_node_key.device_name);
+  EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2",
+            debug_node_key.device_name);
   EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name);
   EXPECT_EQ(0, debug_node_key.output_slot);
   EXPECT_EQ("DebugIdentity", debug_node_key.debug_op);
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h
index d0ca2a6257..cc35264b8f 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.h
+++ b/tensorflow/core/distributed_runtime/graph_mgr.h
@@ -140,7 +140,7 @@ class GraphMgr {
     GraphMgr* graph_mgr;
   };
 
-  const WorkerEnv* worker_env_;             // Not owned.
+  const WorkerEnv* worker_env_;  // Not owned.
   DeviceMgr* device_mgr_;
 
   CostModelManager cost_model_manager_;
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index d1dc622ce7..1a488303ac 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -528,8 +528,8 @@ void Master::ListDevices(const ListDevicesRequest* req,
       auto session = FindMasterSession(req->session_handle());
       if (session == nullptr) {
         done(errors::InvalidArgument(
-             "Session ", req->session_handle(),
-             " is not found. Possibly, this master has restarted."));
+            "Session ", req->session_handle(),
+            " is not found. Possibly, this master has restarted."));
         return;
       }
       core::ScopedUnref ref(session);
diff --git a/tensorflow/core/distributed_runtime/master_test.cc b/tensorflow/core/distributed_runtime/master_test.cc
index 121c58762f..f2c1f3489c 100644
--- a/tensorflow/core/distributed_runtime/master_test.cc
+++ b/tensorflow/core/distributed_runtime/master_test.cc
@@ -61,7 +61,7 @@ class MasterTest : public ::testing::Test {
   // rpc calls.
 
   Status CreateSession(const GraphDef& def, string* handle,
-                            int64* initial_version) {
+                       int64* initial_version) {
     ::grpc::ClientContext ctx;
     CreateSessionRequest req;
     *(req.mutable_graph_def()) = def;
@@ -77,7 +77,7 @@ class MasterTest : public ::testing::Test {
   }
 
   Status ExtendSession(const string& handle, const GraphDef& def,
-                            int64 current_version, int64* new_version) {
+                       int64 current_version, int64* new_version) {
     ::grpc::ClientContext ctx;
     ExtendSessionRequest req;
     req.set_session_handle(handle);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index ac27993773..b4d18d8607 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -185,23 +185,22 @@ class GrpcMasterService : public AsyncServiceInterface {
     MutableRunStepResponseWrapper* wrapped_response =
         new NonOwnedProtoRunStepResponse(&call->response);
     call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-    master_impl_->RunStep(call_opts, wrapped_request, wrapped_response,
-                          [call, call_opts, wrapped_request, wrapped_response,
-                           trace](const Status& status) {
-                            call->ClearCancelCallback();
-                            delete call_opts;
-                            delete wrapped_request;
-                            delete trace;
-                            if (call->request.store_errors_in_response_body() &&
-                                !status.ok()) {
-                              call->response.set_status_code(status.code());
-                              call->response.set_status_error_message(
-                                  status.error_message());
-                              call->SendResponse(ToGrpcStatus(Status::OK()));
-                            } else {
-                              call->SendResponse(ToGrpcStatus(status));
-                            }
-                          });
+    master_impl_->RunStep(
+        call_opts, wrapped_request, wrapped_response,
+        [call, call_opts, wrapped_request, wrapped_response,
+         trace](const Status& status) {
+          call->ClearCancelCallback();
+          delete call_opts;
+          delete wrapped_request;
+          delete trace;
+          if (call->request.store_errors_in_response_body() && !status.ok()) {
+            call->response.set_status_code(status.code());
+            call->response.set_status_error_message(status.error_message());
+            call->SendResponse(ToGrpcStatus(Status::OK()));
+          } else {
+            call->SendResponse(ToGrpcStatus(status));
+          }
+        });
     ENQUEUE_REQUEST(RunStep, true);
   }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 4e203e260a..6ae94b7441 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -89,9 +89,9 @@ class MasterService final {
     ::grpc::Status ExtendSession(::grpc::ClientContext* context,
                                  const ExtendSessionRequest& request,
                                  ExtendSessionResponse* response) override;
-    ::grpc::Status PartialRunSetup(
-        ::grpc::ClientContext* context, const PartialRunSetupRequest& request,
-        PartialRunSetupResponse* response) override;
+    ::grpc::Status PartialRunSetup(::grpc::ClientContext* context,
+                                   const PartialRunSetupRequest& request,
+                                   PartialRunSetupResponse* response) override;
     ::grpc::Status RunStep(::grpc::ClientContext* context,
                            const RunStepRequest& request,
                            RunStepResponse* response) override;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index 70418f6368..1088e9be66 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -69,8 +69,7 @@ class GrpcRemoteMaster : public MasterInterface {
     ::grpc::ClientContext ctx;
     auto trace = TraceRpc("RunStep/Client", &ctx);
     return Call(&ctx, call_options, &request->ToProto(),
-                get_proto_from_wrapper(response),
-                &MasterServiceStub::RunStep);
+                get_proto_from_wrapper(response), &MasterServiceStub::RunStep);
   }
 
   Status CloseSession(CallOptions* call_options,
@@ -114,8 +113,9 @@ class GrpcRemoteMaster : public MasterInterface {
   template <typename Request, typename Response>
   Status Call(::grpc::ClientContext* ctx, CallOptions* call_options,
               const Request* request, Response* response,
-              ::grpc::Status (MasterServiceStub::*pfunc)(
-                  ::grpc::ClientContext*, const Request&, Response*)) {
+              ::grpc::Status (MasterServiceStub::*pfunc)(::grpc::ClientContext*,
+                                                         const Request&,
+                                                         Response*)) {
     ctx->set_fail_fast(false);
     SetDeadline(ctx, call_options->GetTimeout());
     return FromGrpcStatus((stub_.get()->*pfunc)(ctx, *request, response));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc
index 373eecffca..5597ee7a76 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc
@@ -21,11 +21,8 @@ namespace tensorflow {
 namespace test {
 
 // ErrorOp::Compute returns an error.
-REGISTER_OP("Error")
-    .Input("in: T")
-    .Output("out: T")
-    .Attr("T: type")
-    .Attr("message: string");
+REGISTER_OP("Error").Input("in: T").Output("out: T").Attr("T: type").Attr(
+    "message: string");
 class ErrorOp : public OpKernel {
  public:
   explicit ErrorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -66,11 +63,8 @@ REGISTER_KERNEL_BUILDER(Name("InvalidRefType").Device(DEVICE_CPU),
 
 // DelayOp::AsyncCompute sleeps for "micros"-econd and then returns
 // its input.
-REGISTER_OP("Delay")
-    .Input("in: T")
-    .Output("out: T")
-    .Attr("T: type")
-    .Attr("micros: int");
+REGISTER_OP("Delay").Input("in: T").Output("out: T").Attr("T: type").Attr(
+    "micros: int");
 class DelayOp : public AsyncOpKernel {
  public:
   explicit DelayOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
diff --git a/tensorflow/core/distributed_runtime/rpcbench_test.cc b/tensorflow/core/distributed_runtime/rpcbench_test.cc
index b2668fae25..d3af7417e6 100644
--- a/tensorflow/core/distributed_runtime/rpcbench_test.cc
+++ b/tensorflow/core/distributed_runtime/rpcbench_test.cc
@@ -184,8 +184,8 @@ static void BM_Helper(int iters, int width, int num_stages, int tensor_size,
 
   testing::SetLabel(
       strings::StrCat(def.node_size(), " nodes; ",
-             use_multiple_devices ? "Multi device" : "Single device",
-             "; tensor bytes/send: ", tensor_size * sizeof(float)));
+                      use_multiple_devices ? "Multi device" : "Single device",
+                      "; tensor bytes/send: ", tensor_size * sizeof(float)));
 
   std::vector<Tensor> outputs;
 
diff --git a/tensorflow/core/distributed_runtime/scheduler.cc b/tensorflow/core/distributed_runtime/scheduler.cc
index 4766f4c33b..9dae5b3b92 100644
--- a/tensorflow/core/distributed_runtime/scheduler.cc
+++ b/tensorflow/core/distributed_runtime/scheduler.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include <queue>
 
-#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/util/util.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/distributed_runtime/scheduler.h b/tensorflow/core/distributed_runtime/scheduler.h
index eabcaccdd1..ef87b9834d 100644
--- a/tensorflow/core/distributed_runtime/scheduler.h
+++ b/tensorflow/core/distributed_runtime/scheduler.h
@@ -16,15 +16,15 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_
 
-#include <functional>
 #include <deque>
+#include <functional>
 #include <map>
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/core/graph/costmodel.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/graph/costmodel.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
index 702af78c88..95ca3c3b4d 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
@@ -97,9 +97,8 @@ void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs,
                                            const string& tensor_name,
                                            const string& src_device,
                                            const string& dst_device,
-                                           int64 bytes,
-                                           const string& details,
-                                           const string& transfer_method_name){
+                                           int64 bytes, const string& details,
+                                           const string& transfer_method_name) {
   NodeExecStats* ns = new NodeExecStats;
   ns->set_node_name(transfer_method_name);
   if (details.empty()) {
diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc
index 4118f14f8b..f47c983086 100644
--- a/tensorflow/core/graph/costmodel.cc
+++ b/tensorflow/core/graph/costmodel.cc
@@ -158,8 +158,8 @@ void CostModel::SetNumOutputs(const Node* node, int num_outputs) {
   Ensure(id, 0);
   auto perslot = &slot_bytes_[id];
   if (!perslot->empty()) {
-    CHECK_EQ(num_outputs, perslot->size()) << "Cannot resize slot_bytes, node="
-                                           << node->name();
+    CHECK_EQ(num_outputs, perslot->size())
+        << "Cannot resize slot_bytes, node=" << node->name();
   }
   Ensure(id, num_outputs);
 }
diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h
index c60a946c2c..9b703e4693 100644
--- a/tensorflow/core/graph/costmodel.h
+++ b/tensorflow/core/graph/costmodel.h
@@ -198,7 +198,7 @@ class CostModel {
   // Cumulative execution time.
   std::vector<Microseconds> time_;
   // Cumulative Bytes output on each channel.
-  std::vector<gtl::InlinedVector<Bytes, 2> > slot_bytes_;
+  std::vector<gtl::InlinedVector<Bytes, 2>> slot_bytes_;
 
   // Maximum execution time
   std::vector<Microseconds> max_exec_time_;
@@ -217,7 +217,7 @@ class CostModel {
   };
   std::vector<MemUsage> max_mem_usage_;
 
-  std::vector<gtl::InlinedVector<int64, 2> > output_port_alloc_ids_;
+  std::vector<gtl::InlinedVector<int64, 2>> output_port_alloc_ids_;
 
   std::set<int64> persistent_alloc_ids_;
   std::map<string, std::set<int64>> persistent_alloc_ids_by_devices_;
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index b620127d90..93d8dd6f11 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -62,8 +62,8 @@ class Node;
 class VersionDef;
 class WhileContext;
 
-class NeighborIter;  // Declared below
-class NodeIter;      // Declared below
+class NeighborIter;    // Declared below
+class NodeIter;        // Declared below
 class NodeProperties;  // Defined in .cc
 
 class Node {
diff --git a/tensorflow/core/graph/graph_def_builder_test.cc b/tensorflow/core/graph/graph_def_builder_test.cc
index e85de71ef7..e928c81b45 100644
--- a/tensorflow/core/graph/graph_def_builder_test.cc
+++ b/tensorflow/core/graph/graph_def_builder_test.cc
@@ -26,7 +26,6 @@ namespace tensorflow {
 namespace {
 
 TEST(GraphDefBuilderTest, Version) {
-
   // Verify that our assertions will be nontrivial
   ASSERT_LT(0, TF_GRAPH_DEF_VERSION);
 
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 3df981437a..1b99d54e8e 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -21,102 +21,101 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
-  // Since our ops are going to produce and also consume N addition tensors
-  // (Mkl) for N Tensorflow tensors, we can have following different
-  // orderings among these 2N tensors.
-  //
-  // E.g., for Tensorflow tensors A, B, and C, our ops will produce and
-  // consume A_m, B_m, and C_m additionally.
-  //
-  // INTERLEAVED: in this case 2N tensors are interleaved. So for above
-  //              example, the ordering looks like: A, A_m, B, B_m, C, C_m.
-  //
-  // CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed
-  //             by N Mkl tensors. So for above example, the ordering looks
-  //             like: A, B, C, A_m, B_m, C_m
-  //
-  // Following APIs map index of original Tensorflow tensors to their
-  // appropriate position based on selected ordering. For contiguous ordering,
-  // we need to know the total number of tensors (parameter total).
-  //
-  typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering;
-  // NOTE: Currently, we use contiguous ordering. If you change this, then you
-  // would need to change Mkl op definitions in nn_ops.cc.
-  static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS;
+// Since our ops are going to produce and also consume N addition tensors
+// (Mkl) for N Tensorflow tensors, we can have following different
+// orderings among these 2N tensors.
+//
+// E.g., for Tensorflow tensors A, B, and C, our ops will produce and
+// consume A_m, B_m, and C_m additionally.
+//
+// INTERLEAVED: in this case 2N tensors are interleaved. So for above
+//              example, the ordering looks like: A, A_m, B, B_m, C, C_m.
+//
+// CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed
+//             by N Mkl tensors. So for above example, the ordering looks
+//             like: A, B, C, A_m, B_m, C_m
+//
+// Following APIs map index of original Tensorflow tensors to their
+// appropriate position based on selected ordering. For contiguous ordering,
+// we need to know the total number of tensors (parameter total).
+//
+typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering;
+// NOTE: Currently, we use contiguous ordering. If you change this, then you
+// would need to change Mkl op definitions in nn_ops.cc.
+static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS;
 
-  // Get index of MetaData tensor from index 'n' of Data tensor.
-  inline int DataIndexToMetaDataIndex(int n, int total_tensors) {
-    if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-      // For interleaved ordering, Mkl tensor follows immediately after
-      // Tensorflow tensor.
-      return n + 1;
-    } else {
-      CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-      // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away.
-      return n + total_tensors / 2;
-    }
+// Get index of MetaData tensor from index 'n' of Data tensor.
+inline int DataIndexToMetaDataIndex(int n, int total_tensors) {
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    // For interleaved ordering, Mkl tensor follows immediately after
+    // Tensorflow tensor.
+    return n + 1;
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away.
+    return n + total_tensors / 2;
   }
+}
 
-  int inline GetTensorDataIndex(int n, int total_tensors) {
-      if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-        return 2 * n;  // index corresponding to nth input/output tensor
-      } else {
-        CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-        return n;
-      }
-    }
+int inline GetTensorDataIndex(int n, int total_tensors) {
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    return 2 * n;  // index corresponding to nth input/output tensor
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    return n;
+  }
+}
 
-  int inline GetTensorMetaDataIndex(int n, int total_tensors) {
-      // Get index for TensorData first and then use mapping function
-      // to get TensorMetaData index from TensorData index.
-      int tidx = GetTensorDataIndex(n, total_tensors);
-      return DataIndexToMetaDataIndex(tidx, total_tensors);
-    }
+int inline GetTensorMetaDataIndex(int n, int total_tensors) {
+  // Get index for TensorData first and then use mapping function
+  // to get TensorMetaData index from TensorData index.
+  int tidx = GetTensorDataIndex(n, total_tensors);
+  return DataIndexToMetaDataIndex(tidx, total_tensors);
+}
 
 namespace mkl_op_registry {
-  static const char* kMklOpLabel = "MklOp";
-  static const char* kMklOpLabelPattern = "label='MklOp'";
-  // Prefix that we add to Tensorflow op name to construct Mkl op name.
-  static const char* const kMklOpPrefix = "_Mkl";
+static const char* kMklOpLabel = "MklOp";
+static const char* kMklOpLabelPattern = "label='MklOp'";
+// Prefix that we add to Tensorflow op name to construct Mkl op name.
+static const char* const kMklOpPrefix = "_Mkl";
 
-  // Get the name of Mkl op from original TensorFlow op
-  // We prefix 'Mkl' to the original op to get Mkl op.
-  inline string GetMklOpName(const string& name) {
-    return string(kMklOpPrefix) + name;
-  }
+// Get the name of Mkl op from original TensorFlow op
+// We prefix 'Mkl' to the original op to get Mkl op.
+inline string GetMklOpName(const string& name) {
+  return string(kMklOpPrefix) + name;
+}
 
-  // Check whether opname with type T is registered as MKL-compliant.
-  //
-  // @input: name of the op
-  // @input: T datatype to be used for checking op
-  // @return: true if opname is registered as Mkl op; false otherwise
-  static inline bool IsMklOp(const std::string& op_name, DataType T) {
-    string kernel = KernelsRegisteredForOp(op_name);
-    bool result =
-        kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
-    return result;
-  }
+// Check whether opname with type T is registered as MKL-compliant.
+//
+// @input: name of the op
+// @input: T datatype to be used for checking op
+// @return: true if opname is registered as Mkl op; false otherwise
+static inline bool IsMklOp(const std::string& op_name, DataType T) {
+  string kernel = KernelsRegisteredForOp(op_name);
+  bool result =
+      kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
+  return result;
+}
 
-  // Check whether opname with type T is registered as MKL-compliant and
-  // is element-wise.
-  //
-  // @input: name of the op
-  // @input: T datatype to be used for checking op
-  // @return: true if opname is registered as element-wise Mkl op;
-  // false otherwise
-  static inline bool IsMklElementWiseOp(const std::string& op_name,
-    DataType T) {
-    if (!IsMklOp(op_name, T)) {
-      return false;
-    }
-    bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
-                    0 == op_name.compare(GetMklOpName("Sub")) ||
-                    0 == op_name.compare(GetMklOpName("Mul")) ||
-                    0 == op_name.compare(GetMklOpName("Maximum")) ||
-                    0 == op_name.compare(GetMklOpName("SquaredDifference")));
-
-    return result;
+// Check whether opname with type T is registered as MKL-compliant and
+// is element-wise.
+//
+// @input: name of the op
+// @input: T datatype to be used for checking op
+// @return: true if opname is registered as element-wise Mkl op;
+// false otherwise
+static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) {
+  if (!IsMklOp(op_name, T)) {
+    return false;
   }
+  bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
+                 0 == op_name.compare(GetMklOpName("Sub")) ||
+                 0 == op_name.compare(GetMklOpName("Mul")) ||
+                 0 == op_name.compare(GetMklOpName("Maximum")) ||
+                 0 == op_name.compare(GetMklOpName("SquaredDifference")));
+
+  return result;
+}
 }  // namespace mkl_op_registry
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 55bc401b9d..68c3136019 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -37,8 +37,8 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#include "tensorflow/core/graph/mkl_layout_pass.h"
 #include "tensorflow/core/graph/mkl_graph_util.h"
+#include "tensorflow/core/graph/mkl_layout_pass.h"
 
 namespace tensorflow {
 
@@ -281,7 +281,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_with_bias_backprop_bias =
-                                   "_MklConv2DWithBiasBackpropBias";
+        "_MklConv2DWithBiasBackpropBias";
     csinfo_.relu = "Relu";
     csinfo_.relu_grad = "ReluGrad";
     csinfo_.reshape = "Reshape";
@@ -297,10 +297,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     // End - element-wise ops. See note above.
 
     // NOTE: names are alphabetically sorted.
-    rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), CopyAttrsAddN,
-                      AddNRewrite, nullptr});
-    rinfo_.push_back({csinfo_.add,
-                      mkl_op_registry::GetMklOpName(csinfo_.add),
+    rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
+                      CopyAttrsAddN, AddNRewrite, nullptr});
+    rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
@@ -337,14 +336,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.fused_batch_norm,
                       mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
                       CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.fused_batch_norm_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
-                      CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
+    rinfo_.push_back(
+        {csinfo_.fused_batch_norm_grad,
+         mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
+         CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.identity,
                       mkl_op_registry::GetMklOpName(csinfo_.identity),
                       CopyAttrsIdentity, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.lrn,
-                      mkl_op_registry::GetMklOpName(csinfo_.lrn),
+    rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn),
                       CopyAttrsLRN, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.lrn_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
@@ -358,11 +357,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.maximum,
                       mkl_op_registry::GetMklOpName(csinfo_.maximum),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.mul,
-                      mkl_op_registry::GetMklOpName(csinfo_.mul),
+    rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.relu,
-                      mkl_op_registry::GetMklOpName(csinfo_.relu),
+    rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.relu_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
@@ -373,8 +370,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.squared_difference,
                       mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.sub,
-                      mkl_op_registry::GetMklOpName(csinfo_.sub),
+    rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
 
     // Add info about which ops to add workspace edge to and the slots.
@@ -388,9 +384,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul,
                                    IsBiasAddGradInMatMulContext};
 
-    biasaddgrad_conv2dwithbias_context_ = {csinfo_.bias_add_grad,
-                                   csinfo_.mkl_conv2d_with_bias,
-                                   IsBiasAddGradInConv2DWithBiasContext};
+    biasaddgrad_conv2dwithbias_context_ = {
+        csinfo_.bias_add_grad, csinfo_.mkl_conv2d_with_bias,
+        IsBiasAddGradInConv2DWithBiasContext};
 
     cinfo_.push_back(&biasaddgrad_matmul_context_);
     cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_);
@@ -410,9 +406,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   /// Structure to specify the context information used in a node rewrite rule
   typedef struct {
-    string node;     // Name of the node to be rewritten
-    string fwd;      // Name of the node in the forward pass that this node
-                     // corresponds to
+    string node;  // Name of the node to be rewritten
+    string fwd;   // Name of the node in the forward pass that this node
+                  // corresponds to
     std::function<bool(const Node*, const Node**, void* c)> context_match_fn;
   } ContextInfo;
 
@@ -615,14 +611,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     std::vector<int32> ksize, strides;
     CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
     CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
-    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(),
-             true);
+    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true);
     CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
 
     // Condition that specifies non-batch-wise and non-depth-wise pooling.
-    if (GetTensorDim(ksize,   data_format, 'N') == 1 &&
+    if (GetTensorDim(ksize, data_format, 'N') == 1 &&
         GetTensorDim(strides, data_format, 'N') == 1 &&
-        GetTensorDim(ksize,   data_format, 'C') == 1 &&
+        GetTensorDim(ksize, data_format, 'C') == 1 &&
         GetTensorDim(strides, data_format, 'C') == 1) {
       return true;
     }
@@ -785,8 +780,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
       for (const Edge* fe : first_inp_of_filter->out_edges()) {
         if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
             fe->dst_input() == 0) {
-          VLOG(1) << "MklLayoutRewritePass: found "
-                  << fe->dst()->DebugString()
+          VLOG(1) << "MklLayoutRewritePass: found " << fe->dst()->DebugString()
                   << " as the forward node for matching context, backward"
                   << " node is: " << n->DebugString();
           *fwd_node = fe->dst();
@@ -803,13 +797,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   //
   // @return - true (if BiasAddGrad is associated with MatMul);
   //           false otherwise.
-  static bool IsBiasAddGradInMatMulContext(const Node* n,
-                                           const Node** fwd_node,
+  static bool IsBiasAddGradInMatMulContext(const Node* n, const Node** fwd_node,
                                            void* ci) {
     return (!IsBiasAddGradInConv2DWithBiasContext(n, fwd_node, ci));
   }
 
-
   // Rewrite rule that uses context-information for matching,
   // used in scenario 2.
   //
@@ -880,10 +872,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // @output output_nodes - the list of new nodes creating Mkl tensors
   //
   // @return None
-  void GetNodesProducingMklTensorList(std::unique_ptr<Graph>* g,
-    Node* orig_node, const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-    int* input_idx, int list_length,
-    std::vector<NodeBuilder::NodeOut>* output_nodes);
+  void GetNodesProducingMklTensorList(
+      std::unique_ptr<Graph>* g, Node* orig_node,
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+      int* input_idx, int list_length,
+      std::vector<NodeBuilder::NodeOut>* output_nodes);
 
   // Get a node that will feed an Mkl tensor to the new
   // node that we are constructing. The output node could be (1) 'n'
@@ -900,7 +893,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   //                                will feed the tensor
   // @return None
   void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
-    Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot);
+                                 Node* n, int n_output_slot, Node** mkl_node,
+                                 int* mkl_node_output_slot);
 
   // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
   // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
@@ -970,9 +964,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
 MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
 MklLayoutRewritePass::ContextInfo
-  MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
+    MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
 MklLayoutRewritePass::ContextInfo
-  MklLayoutRewritePass::biasaddgrad_matmul_context_;
+    MklLayoutRewritePass::biasaddgrad_matmul_context_;
 std::vector<MklLayoutRewritePass::ContextInfo*> MklLayoutRewritePass::cinfo_;
 
 // We register Mkl rewrite pass for phase 1 in post partitioning group.
@@ -1041,13 +1035,13 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   TensorShape dummy_shape({8});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-               .Attr("value", proto)
-               .Attr("dtype", dt)
-               .Device(orig_node->def().device())  // We place this node on
-                                                   // the same device as the
-                                                   // device of the original
-                                                   // node.
-               .Finalize(&**g, out));
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // the same device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
 
   // If number of inputs to the original node is > 0, then we add
   // control dependency between 1st input (index 0) of the original node and
@@ -1060,8 +1054,8 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   // the same frame.
   if (orig_node->num_inputs() > 0) {
     Node* orig_input0 = nullptr;
-    TF_CHECK_OK(orig_node->input_node(0,
-                                      const_cast<const Node**>(&orig_input0)));
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
     CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
   }
 
@@ -1069,11 +1063,9 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
 }
 
 void MklLayoutRewritePass::GetNodesProducingMklTensorList(
-    std::unique_ptr<Graph>* g,
-    Node* orig_node,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-    int* input_idx, int list_length,
-    std::vector<NodeBuilder::NodeOut>* output_nodes) {
+    std::unique_ptr<Graph>* g, Node* orig_node,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
+    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
   CHECK_LT(*input_idx, inputs.size());
   CHECK_GT(list_length, 0);
   CHECK_NOTNULL(output_nodes);
@@ -1090,8 +1082,8 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
     int mkl_node_output_slot = 0;
     GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
                               &mkl_node_output_slot);
-    output_nodes->push_back(NodeBuilder::NodeOut(mkl_node,
-                                                mkl_node_output_slot));
+    output_nodes->push_back(
+        NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot));
     (*input_idx)++;
     list_length--;
   }
@@ -1101,9 +1093,9 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
 // node that we are constructing. An input node could be (1) 'n'
 // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
 // if 'n' is not an Mkl layer.
-void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr<Graph>* g,
-    Node* orig_node, Node* n,
-    int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) {
+void MklLayoutRewritePass::GetNodeProducingMklTensor(
+    std::unique_ptr<Graph>* g, Node* orig_node, Node* n, int n_output_slot,
+    Node** mkl_node, int* mkl_node_output_slot) {
   CHECK_NOTNULL(n);
   CHECK_NOTNULL(mkl_node);
   CHECK_NOTNULL(mkl_node_output_slot);
@@ -1234,8 +1226,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     if (ArgIsList(arg)) {
       std::vector<NodeBuilder::NodeOut> new_node_inputs;
       int N = GetTensorListLength(arg, old_node);
-      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx,
-                                     N, &new_node_inputs);
+      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N,
+                                     &new_node_inputs);
       nb->Input(new_node_inputs);
       nn_slot_idx++;
     } else {
@@ -1336,13 +1328,13 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   TensorShape dummy_shape({1});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                .Attr("value", proto)
-                .Attr("dtype", dt)
-                .Device(orig_node->def().device())  // We place this node on
-                                                    // same the device as the
-                                                    // device of the original
-                                                    // node.
-                .Finalize(&**g, out));
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // same the device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
 
   // If number of inputs to the original node is > 0, then we add
   // control dependency between 1st input (index 0) of the original node and
@@ -1355,8 +1347,8 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   // the same frame.
   if (orig_node->num_inputs() > 0) {
     Node* orig_input0 = nullptr;
-    TF_CHECK_OK(orig_node->input_node(0,
-                                      const_cast<const Node**>(&orig_input0)));
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
     CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
   }
 
@@ -1374,7 +1366,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   for (auto ws : wsinfo_) {
     if (orig_node->type_string() == ws.fwd_op &&
-        mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
+        mkl_op_registry::IsMklOp(
+            mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
       // If this op is a fwd op, then we need to check if there is an
       // edge from this node's fwd_slot to bwdop's bwd_slot. If there is
       // an edge, then we just add an attribute on this node for setting
@@ -1400,8 +1393,9 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
         nb->Attr("workspace_enabled", false);
       }
     } else if (orig_node->type_string() == ws.bwd_op &&
-               mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()),
-                                        T)) {
+               mkl_op_registry::IsMklOp(
+                   mkl_op_registry::GetMklOpName(orig_node->type_string()),
+                   T)) {
       // If this op is a bwd op, then we need to add workspace edge and
       // it's Mkl tensor edge between its corresponding fwd op and this
       // op. Corresponding fwd op is specified in 'fwd_op' field of
@@ -1416,7 +1410,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
         if (e->src_output() == ws.fwd_slot &&
             // We would have rewritten the forward op, so we need to use
             // GetMklOpName call to get its Mkl name.
-            e->src()->type_string() == mkl_op_registry::GetMklOpName(ws.fwd_op) &&
+            e->src()->type_string() ==
+                mkl_op_registry::GetMklOpName(ws.fwd_op) &&
             e->dst_input() == ws.bwd_slot) {
           nb->Attr("workspace_enabled", true);
           CHECK_NOTNULL(ws_tensors);
@@ -1593,7 +1588,7 @@ void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
-                                           NodeBuilder* nb) {
+                                            NodeBuilder* nb) {
   DataType T;
   DataType Tshape;
 
@@ -1869,8 +1864,8 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
       if (e->IsControlEdge()) {
         CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
       } else {
-        CHECK_NOTNULL((*g)->AddEdge(new_node, e->src_output(), e->dst(),
-                                  e->dst_input()));
+        CHECK_NOTNULL(
+            (*g)->AddEdge(new_node, e->src_output(), e->dst(), e->dst_input()));
       }
     }
 
@@ -1941,9 +1936,9 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
       // and leave BiasAddGrad as it is. But we check for this condition
       // when we check for node rewrite rule. So we should not even come
       // here for MatMul. So we will fail now.
-        return Status(
-            error::Code::INVALID_ARGUMENT,
-            "No rewrite is required for BiasAddGrad for MatMul context.");
+      return Status(
+          error::Code::INVALID_ARGUMENT,
+          "No rewrite is required for BiasAddGrad for MatMul context.");
     }
   }
 
@@ -2012,9 +2007,10 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
     if (e->IsControlEdge()) {
       CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
     } else {
-      CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(),
-                            e->src()->num_outputs()),
-                    e->dst(), e->dst_input()));
+      CHECK_NOTNULL((*g)->AddEdge(
+          new_node,
+          GetTensorDataIndex(e->src_output(), e->src()->num_outputs()),
+          e->dst(), e->dst_input()));
     }
   }
 
@@ -2070,7 +2066,8 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
 
   // BiasAddGrad is not an Mkl layer, so we make an exception for it.
   if (n->type_string() != csinfo_.bias_add_grad) {
-    if (!mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), T)) {
+    if (!mkl_op_registry::IsMklOp(
+            mkl_op_registry::GetMklOpName(n->type_string()), T)) {
       return nullptr;
     }
   }
@@ -2186,8 +2183,7 @@ bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
   return MklLayoutRewritePass().RunPass(g);
 }
 
-Status MklLayoutRewritePass::Run(
-  const GraphOptimizationPassOptions& options) {
+Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
   if (options.graph == nullptr && options.partition_graphs == nullptr) {
     return Status::OK();
   }
@@ -2215,7 +2211,7 @@ Status MklLayoutRewritePass::Run(
   return Status::OK();
 }
 
-#else  // INTEL_MKL_DNN
+#else   // INTEL_MKL_DNN
 
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
@@ -2421,7 +2417,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.conv2d_grad_input = "Conv2DBackpropInput";
     csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
     csinfo_.conv2d_grad_filter_with_bias =
-                              "__MklDummyConv2DBackpropFilterWithBias";
+        "__MklDummyConv2DBackpropFilterWithBias";
     csinfo_.fused_batch_norm = "FusedBatchNorm";
     csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
     csinfo_.identity = "Identity";
@@ -2435,11 +2431,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_grad_filter_with_bias =
-                                   "_MklConv2DBackpropFilterWithBias";
+        "_MklConv2DBackpropFilterWithBias";
     csinfo_.relu = "Relu";
     csinfo_.relu_grad = "ReluGrad";
-    csinfo_.tanh       = "Tanh";
-    csinfo_.tanh_grad  = "TanhGrad";
+    csinfo_.tanh = "Tanh";
+    csinfo_.tanh_grad = "TanhGrad";
     csinfo_.reshape = "Reshape";
     csinfo_.softmax = "Softmax";
     csinfo_.split = "Split";
@@ -2474,29 +2470,28 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.conv2d,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d),
                       CopyAttrsConv2D, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.conv2d_with_bias,
-                      csinfo_.mkl_conv2d_with_bias,
+    rinfo_.push_back({csinfo_.conv2d_with_bias, csinfo_.mkl_conv2d_with_bias,
                       CopyAttrsConv2D, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_filter,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
                       CopyAttrsConv2D, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_filter_with_bias,
-                      csinfo_.mkl_conv2d_grad_filter_with_bias,
-                      CopyAttrsConv2D, AlwaysRewrite});
+                      csinfo_.mkl_conv2d_grad_filter_with_bias, CopyAttrsConv2D,
+                      AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_input,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input),
                       CopyAttrsConv2D, AlwaysRewrite});
     rinfo_.push_back({csinfo_.fused_batch_norm,
                       mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
                       CopyAttrsFusedBatchNorm, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.fused_batch_norm_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
-                      CopyAttrsFusedBatchNorm, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.fused_batch_norm_grad,
+         mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
+         CopyAttrsFusedBatchNorm, AlwaysRewrite});
     rinfo_.push_back({csinfo_.identity,
                       mkl_op_registry::GetMklOpName(csinfo_.identity),
                       CopyAttrsDataType, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.lrn,
-                      mkl_op_registry::GetMklOpName(csinfo_.lrn),
+    rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn),
                       CopyAttrsLRN, AlwaysRewrite});
     rinfo_.push_back({csinfo_.lrn_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
@@ -2515,8 +2510,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       mkl_op_registry::GetMklOpName(csinfo_.mul),
                       CopyAttrsDataType, AlwaysRewrite});
     */
-    rinfo_.push_back({csinfo_.relu,
-                      mkl_op_registry::GetMklOpName(csinfo_.relu),
+    rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.relu_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
@@ -2550,8 +2544,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     // Add a rule for merging nodes
     minfo_.push_back({csinfo_.conv2d, csinfo_.bias_add,
-                      csinfo_.conv2d_with_bias,
-                      GetConv2DOrBiasAdd});
+                      csinfo_.conv2d_with_bias, GetConv2DOrBiasAdd});
 
     minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
                       csinfo_.conv2d_grad_filter_with_bias,
@@ -2846,9 +2839,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   // Default rewrite rule to be used in scenario 1 for rewrite.
   // @return - true (since we want to always rewrite)
-  static bool AlwaysRewrite(const Node* n) {
-    return true;
-  }
+  static bool AlwaysRewrite(const Node* n) { return true; }
 
   // Check if we are performing pooling on depth or batch. If it is, then we
   // do not rewrite MaxPool node to Mkl version.
@@ -2862,14 +2853,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     std::vector<int32> ksize, strides;
     CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
     CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
-    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(),
-             true);
+    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true);
     CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
 
     // Condition that specifies non-batch-wise and non-depth-wise pooling.
-    if (GetTensorDim(ksize,   data_format, 'N') == 1 &&
+    if (GetTensorDim(ksize, data_format, 'N') == 1 &&
         GetTensorDim(strides, data_format, 'N') == 1 &&
-        GetTensorDim(ksize,   data_format, 'C') == 1 &&
+        GetTensorDim(ksize, data_format, 'C') == 1 &&
         GetTensorDim(strides, data_format, 'C') == 1) {
       return true;
     }
@@ -2941,10 +2931,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // @output output_nodes - the list of new nodes creating Mkl tensors
   //
   // @return None
-  void GetNodesProducingMklTensorList(std::unique_ptr<Graph>* g,
-    Node* orig_node, const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-    int* input_idx, int list_length,
-    std::vector<NodeBuilder::NodeOut>* output_nodes);
+  void GetNodesProducingMklTensorList(
+      std::unique_ptr<Graph>* g, Node* orig_node,
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+      int* input_idx, int list_length,
+      std::vector<NodeBuilder::NodeOut>* output_nodes);
 
   // Get a node that will feed an Mkl tensor to the new
   // node that we are constructing. The output node could be (1) 'n'
@@ -2961,7 +2952,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   //                                will feed the tensor
   // @return None
   void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
-    Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot);
+                                 Node* n, int n_output_slot, Node** mkl_node,
+                                 int* mkl_node_output_slot);
 
   // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
   // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
@@ -3096,13 +3088,13 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   TensorShape dummy_shape({8});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-               .Attr("value", proto)
-               .Attr("dtype", dt)
-               .Device(orig_node->def().device())  // We place this node on
-                                                   // the same device as the
-                                                   // device of the original
-                                                   // node.
-               .Finalize(&**g, out));
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // the same device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
 
   // If number of inputs to the original node is > 0, then we add
   // control dependency between 1st input (index 0) of the original node and
@@ -3115,8 +3107,8 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   // the same frame.
   if (orig_node->num_inputs() > 0) {
     Node* orig_input0 = nullptr;
-    TF_CHECK_OK(orig_node->input_node(0,
-                                      const_cast<const Node**>(&orig_input0)));
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
     // Allow duplicate while adding control edge as it would fail (return
     // NULL) if we try to add duplicate edge.
     CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true));
@@ -3126,11 +3118,9 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
 }
 
 void MklLayoutRewritePass::GetNodesProducingMklTensorList(
-    std::unique_ptr<Graph>* g,
-    Node* orig_node,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-    int* input_idx, int list_length,
-    std::vector<NodeBuilder::NodeOut>* output_nodes) {
+    std::unique_ptr<Graph>* g, Node* orig_node,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
+    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
   CHECK_LT(*input_idx, inputs.size());
   CHECK_GT(list_length, 0);
   CHECK_NOTNULL(output_nodes);
@@ -3147,8 +3137,8 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
     int mkl_node_output_slot = 0;
     GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
                               &mkl_node_output_slot);
-    output_nodes->push_back(NodeBuilder::NodeOut(mkl_node,
-                                                mkl_node_output_slot));
+    output_nodes->push_back(
+        NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot));
     (*input_idx)++;
     list_length--;
   }
@@ -3158,9 +3148,9 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
 // node that we are constructing. An input node could be (1) 'n'
 // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
 // if 'n' is not an Mkl layer.
-void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr<Graph>* g,
-    Node* orig_node, Node* n,
-    int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) {
+void MklLayoutRewritePass::GetNodeProducingMklTensor(
+    std::unique_ptr<Graph>* g, Node* orig_node, Node* n, int n_output_slot,
+    Node** mkl_node, int* mkl_node_output_slot) {
   CHECK_NOTNULL(n);
   CHECK_NOTNULL(mkl_node);
   CHECK_NOTNULL(mkl_node_output_slot);
@@ -3292,8 +3282,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     if (ArgIsList(arg)) {
       std::vector<NodeBuilder::NodeOut> new_node_inputs;
       int N = GetTensorListLength(arg, old_node);
-      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx,
-                                     N, &new_node_inputs);
+      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N,
+                                     &new_node_inputs);
       nb->Input(new_node_inputs);
       nn_slot_idx++;
     } else {
@@ -3394,13 +3384,13 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   TensorShape dummy_shape({1});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                .Attr("value", proto)
-                .Attr("dtype", dt)
-                .Device(orig_node->def().device())  // We place this node on
-                                                    // same the device as the
-                                                    // device of the original
-                                                    // node.
-                .Finalize(&**g, out));
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // same the device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
 
   // If number of inputs to the original node is > 0, then we add
   // control dependency between 1st input (index 0) of the original node and
@@ -3413,8 +3403,8 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   // the same frame.
   if (orig_node->num_inputs() > 0) {
     Node* orig_input0 = nullptr;
-    TF_CHECK_OK(orig_node->input_node(0,
-                                      const_cast<const Node**>(&orig_input0)));
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
     // Allow duplicate while adding control edge as it would fail (return
     // NULL) if we try to add duplicate edge.
     CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true));
@@ -3434,8 +3424,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   for (auto ws : wsinfo_) {
     if (orig_node->type_string() == ws.fwd_op &&
-        mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
-          orig_node->type_string()), T)) {
+        mkl_op_registry::IsMklOp(
+            mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
       // If this op is a fwd op, then we need to check if there is an
       // edge from this node's fwd_slot to bwdop's bwd_slot. If there is
       // an edge, then we just add an attribute on this node for setting
@@ -3461,8 +3451,9 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
         nb->Attr("workspace_enabled", false);
       }
     } else if (orig_node->type_string() == ws.bwd_op &&
-               mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
-                                          orig_node->type_string()), T)) {
+               mkl_op_registry::IsMklOp(
+                   mkl_op_registry::GetMklOpName(orig_node->type_string()),
+                   T)) {
       // If this op is a bwd op, then we need to add workspace edge and
       // it's Mkl tensor edge between its corresponding fwd op and this
       // op. Corresponding fwd op is specified in 'fwd_op' field of
@@ -3477,8 +3468,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
         if (e->src_output() == ws.fwd_slot &&
             // We would have rewritten the forward op, so we need to use
             // GetMklOpName call to get its Mkl name.
-            e->src()->type_string() == mkl_op_registry::GetMklOpName(
-                                                          ws.fwd_op) &&
+            e->src()->type_string() ==
+                mkl_op_registry::GetMklOpName(ws.fwd_op) &&
             e->dst_input() == ws.bwd_slot) {
           nb->Attr("workspace_enabled", true);
           CHECK_NOTNULL(ws_tensors);
@@ -3645,7 +3636,7 @@ void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
-                                           NodeBuilder* nb) {
+                                            NodeBuilder* nb) {
   DataType T;
   DataType Tshape;
 
@@ -3776,8 +3767,9 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
                                                     Node* m, Node* n) {
   CHECK_EQ(((m->type_string() == csinfo_.bias_add &&
              n->type_string() == csinfo_.conv2d)) ||
-           ((n->type_string() == csinfo_.bias_add &&
-             m->type_string() == csinfo_.conv2d)), true);
+               ((n->type_string() == csinfo_.bias_add &&
+                 m->type_string() == csinfo_.conv2d)),
+           true);
 
   // If 'm' is BiasAdd, then 'n' is Conv2D. Since Conv2D feeds BiasAdd,
   // BiasAdd is successor node, and Conv2D predecessor node.
@@ -3796,8 +3788,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-  TF_CHECK_OK(
-      GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
   // We check to ensure that data formats of both succ and pred are same.
   // We expect them to be same, so we can enforce this as assert.
   // But assert can be too strict, so we enforce this as a check.
@@ -3900,8 +3891,8 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
       // BiasAdd has only 1 output (at slot 0) and merged node also has only 1
       // output (at slot 0).
       const int kConv2DWithBiasOutputSlot = 0;
-      CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot,
-                                    e->dst(), e->dst_input()));
+      CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot, e->dst(),
+                                  e->dst_input()));
     }
   }
 
@@ -3924,8 +3915,9 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
     std::unique_ptr<Graph>* g, Node* m, Node* n) {
   CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad &&
              n->type_string() == csinfo_.conv2d_grad_filter)) ||
-           ((n->type_string() == csinfo_.bias_add_grad &&
-             m->type_string() == csinfo_.conv2d_grad_filter)), true);
+               ((n->type_string() == csinfo_.bias_add_grad &&
+                 m->type_string() == csinfo_.conv2d_grad_filter)),
+           true);
 
   // If 'm' is BiasAddGrad, then 'n' is BackpropFilter.
   Node* badd = m->type_string() == csinfo_.bias_add_grad ? m : n;
@@ -4132,9 +4124,10 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
       // NULL) if we try to add duplicate edge.
       CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
     } else {
-      CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(),
-                            e->src()->num_outputs()),
-                    e->dst(), e->dst_input()));
+      CHECK_NOTNULL((*g)->AddEdge(
+          new_node,
+          GetTensorDataIndex(e->src_output(), e->src()->num_outputs()),
+          e->dst(), e->dst_input()));
     }
   }
 
@@ -4166,9 +4159,9 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   // names.
   if (n->type_string() != csinfo_.conv2d_with_bias &&
       n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
-      !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
-                                        n->type_string()), T)) {
-      return nullptr;
+      !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()),
+                                T)) {
+    return nullptr;
   }
 
   // For elementwise node, we reuse the Eigen implementation and pass the MKL
@@ -4184,29 +4177,30 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   // eigen code to reduce cross-library dependency.
   VLOG(1) << "ELEMENTWISE: checking op: " << n->type_string();
   if (mkl_op_registry::IsMklElementWiseOp(
-        mkl_op_registry::GetMklOpName(n->type_string()), T) ||
+          mkl_op_registry::GetMklOpName(n->type_string()), T) ||
       n->type_string().find("Identity") != string::npos) {
     VLOG(1) << "ELEMENTWISE: op is elementwise: " << n->type_string();
     bool incoming_mkl_edge = false;
     int num_parent = 0;
     for (auto parent : n->in_edges()) {
       if (mkl_op_registry::IsMklOp(parent->src()->type_string(), T)) {
-        VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is MKL op: "
-                << parent->src()->type_string();
+        VLOG(1) << "ELEMENTWISE: parent " << num_parent++
+                << " is MKL op: " << parent->src()->type_string();
         incoming_mkl_edge = true;
         break;
       } else {
-        VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is NON-MKL op: "
-                << parent->src()->type_string();
+        VLOG(1) << "ELEMENTWISE: parent " << num_parent++
+                << " is NON-MKL op: " << parent->src()->type_string();
       }
     }
     if (incoming_mkl_edge == false) {
-      VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which has no MKL "
+      VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which "
+                 "has no MKL "
                  "parents.";
       return nullptr;
     } else {
-      VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string() <<
-        " which has MKL parents";
+      VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string()
+              << " which has MKL parents";
     }
   }
 
@@ -4214,8 +4208,7 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   // for this op, then we rewrite it to Mkl op.
   // Find matching RewriteInfo and then check that rewrite rule applies.
   for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
-    if (n->type_string().compare(ri->name) == 0 &&
-        ri->rewrite_rule(n)) {
+    if (n->type_string().compare(ri->name) == 0 && ri->rewrite_rule(n)) {
       return &*ri;
     }
   }
@@ -4297,8 +4290,7 @@ bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
   return MklLayoutRewritePass().RunPass(g);
 }
 
-Status MklLayoutRewritePass::Run(
-  const GraphOptimizationPassOptions& options) {
+Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
   if (options.graph == nullptr && options.partition_graphs == nullptr) {
     return Status::OK();
   }
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 75f7ca2d4d..320d5a48c7 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -125,8 +125,10 @@ REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
 REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
 REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
 REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
-REGISTER_OP("_MklInput2").Output("o: uint8")
-                        .Output("o1: uint8").SetIsStateful();
+REGISTER_OP("_MklInput2")
+    .Output("o: uint8")
+    .Output("o1: uint8")
+    .SetIsStateful();
 
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to node merge optiimization
@@ -498,7 +500,6 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) {
             "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
 }
 
-
 // BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) {
   InitGraph(
@@ -874,11 +875,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
       " input: ['A', 'B:0', 'B:1']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
-            "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
+      "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
+      "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+      "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
 }
 
 // Concat with 2 Mkl layers feeding it
@@ -1273,7 +1275,8 @@ TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
       "node { name: 'H' op: 'Input'}"
       "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['H', 'G'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
       "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
       "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
       "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
@@ -1640,7 +1643,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['B', 'C'] }", kGPUDevice);
+      " input: ['B', 'C'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
 }
@@ -1666,7 +1670,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
       "node { name: 'F' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }", kGPUDevice);
+      " input: ['E'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
             "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
@@ -1687,7 +1692,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " input: ['A', 'B', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }", kGPUDevice);
+      " input: ['A', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
             "A->D;A->E;B->D:1;C->D:2;D->E:1");
@@ -1700,7 +1706,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -1713,7 +1720,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A', 'B'] }"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }", kGPUDevice);
+      " input: ['A', 'C'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
 }
@@ -1729,7 +1737,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -1745,7 +1754,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -1766,7 +1776,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
       " attr { key: 'N'                value { i: 2 } }"
       " input: ['A', 'B:0', 'B:1']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }", kGPUDevice);
+      " input: ['C', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
             "B->D:1;B:1->D:2;C->E;D->E:1");
@@ -1788,7 +1799,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
       " attr { key: 'N'                value { i: 2 } }"
       " input: ['B:0', 'B:1', 'A']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }", kGPUDevice);
+      " input: ['C', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
             "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
@@ -1808,7 +1820,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
       " attr { key: 'is_training'  value { b: true } }"
       " input: ['A', 'B', 'C', 'D', 'E'] }"
       "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'F'] }", kGPUDevice);
+      " input: ['A', 'F'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(Input);E(Input);"
             "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
@@ -1837,7 +1850,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
       "node { name: 'Y' op: 'Input'}"
       "node { name: 'Z' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}", kGPUDevice);
+      " input: ['E', 'Y']}",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
             "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
@@ -1972,8 +1986,10 @@ REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
 REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
 REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
 REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
-REGISTER_OP("_MklInput2").Output("o: uint8")
-                        .Output("o1: uint8").SetIsStateful();
+REGISTER_OP("_MklInput2")
+    .Output("o: uint8")
+    .Output("o1: uint8")
+    .SetIsStateful();
 
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to node merge optiimization
@@ -2492,11 +2508,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
       " input: ['A', 'B:0', 'B:1']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
-            "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
+      "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
+      "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+      "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
 }
 
 // Concat with 2 Mkl layers feeding it
@@ -2891,7 +2908,8 @@ TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
       "node { name: 'H' op: 'Input'}"
       "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['H', 'G'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
       "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
       "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
       "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
@@ -3258,7 +3276,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['B', 'C'] }", kGPUDevice);
+      " input: ['B', 'C'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
 }
@@ -3284,7 +3303,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
       "node { name: 'F' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }", kGPUDevice);
+      " input: ['E'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
             "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
@@ -3305,7 +3325,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " input: ['A', 'B', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }", kGPUDevice);
+      " input: ['A', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
             "A->D;A->E;B->D:1;C->D:2;D->E:1");
@@ -3318,7 +3339,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -3331,7 +3353,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A', 'B'] }"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }", kGPUDevice);
+      " input: ['A', 'C'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
 }
@@ -3347,7 +3370,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -3363,7 +3387,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -3384,7 +3409,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
       " attr { key: 'N'                value { i: 2 } }"
       " input: ['A', 'B:0', 'B:1']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }", kGPUDevice);
+      " input: ['C', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
             "B->D:1;B:1->D:2;C->E;D->E:1");
@@ -3406,7 +3432,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
       " attr { key: 'N'                value { i: 2 } }"
       " input: ['B:0', 'B:1', 'A']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }", kGPUDevice);
+      " input: ['C', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
             "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
@@ -3426,7 +3453,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
       " attr { key: 'is_training'  value { b: true } }"
       " input: ['A', 'B', 'C', 'D', 'E'] }"
       "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'F'] }", kGPUDevice);
+      " input: ['A', 'F'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(Input);E(Input);"
             "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
@@ -3455,7 +3483,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
       "node { name: 'Y' op: 'Input'}"
       "node { name: 'Z' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}", kGPUDevice);
+      " input: ['E', 'Y']}",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
             "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index 599bb88f01..5343e6802d 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -33,8 +33,8 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
 
-#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
 #include "tensorflow/core/graph/mkl_graph_util.h"
+#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
 
 namespace tensorflow {
 
@@ -152,12 +152,12 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge(
   string data_format;
 
   TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype));
-  bool dst_dtype_found = GetNodeAttr(dst->def(), "T", &dst_datatype) ==
-                          Status::OK();
+  bool dst_dtype_found =
+      GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK();
   // We compare source and destination datatypes only when both are found.
   if (dst_dtype_found && (src_datatype != dst_datatype)) {
-    string err_msg = "T attribute of " + src->name() + " and " +
-                      dst->name() + " do not match. Will not insert" +
+    string err_msg = "T attribute of " + src->name() + " and " + dst->name() +
+                     " do not match. Will not insert" +
                      " MklToTf node in such case.";
     return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str());
   }
@@ -325,12 +325,12 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
     // may not be Mkl node.
     DataType src_datatype;
     DataType dst_datatype;
-    bool src_is_mkl_op = (GetNodeAttr(src->def(), "T", &src_datatype) ==
-                            Status::OK() &&
-                          IsMklSupportedOp(src->type_string(), src_datatype));
-    bool dst_is_mkl_op = (GetNodeAttr(dst->def(), "T", &dst_datatype) ==
-                            Status::OK() &&
-                          IsMklSupportedOp(dst->type_string(), dst_datatype));
+    bool src_is_mkl_op =
+        (GetNodeAttr(src->def(), "T", &src_datatype) == Status::OK() &&
+         IsMklSupportedOp(src->type_string(), src_datatype));
+    bool dst_is_mkl_op =
+        (GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK() &&
+         IsMklSupportedOp(dst->type_string(), dst_datatype));
 
     // Check if src with is Mkl-compliant, while dst is not Mkl-compliant.
     if (src_is_mkl_op && !dst_is_mkl_op) {
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 172471e34b..d5b026eae3 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -40,7 +40,7 @@ REGISTER_KERNEL_BUILDER(
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
     Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"), HostConstantOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register the HostConst Op
 // Returns a constant tensor on the host.  Useful for writing C++ tests
diff --git a/tensorflow/core/kernels/batching_util/periodic_function.h b/tensorflow/core/kernels/batching_util/periodic_function.h
index dbf1733dcc..36a4019002 100644
--- a/tensorflow/core/kernels/batching_util/periodic_function.h
+++ b/tensorflow/core/kernels/batching_util/periodic_function.h
@@ -114,7 +114,7 @@ class PeriodicFunction {
   void RunLoop(int64 start) LOCKS_EXCLUDED(mutex_);
 
   const std::function<void()> function_;  // Actual client function
-  const int64 interval_micros_;    // Interval between calls.
+  const int64 interval_micros_;           // Interval between calls.
   const Options options_;
 
   // Protects state below.
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
index d73dcf0fa0..d5ea2b648f 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
@@ -55,15 +55,14 @@ Status ScheduleTask(size_t task_size, BatchScheduler<FakeTask>* scheduler) {
 // use the clock to be destroyed.
 std::unique_ptr<Thread> CreateFakeClockAdvancerThread(
     test_util::FakeClockEnv* env, Notification* start, Notification* stop) {
-  return std::unique_ptr<Thread>(
-      Env::Default()->StartThread({}, "FakeClockAdvancerThread",
-                                  [env, start, stop] {
-                                    start->WaitForNotification();
-                                    while (!stop->HasBeenNotified()) {
-                                      env->AdvanceByMicroseconds(10);
-                                      Env::Default()->SleepForMicroseconds(10);
-                                    }
-                                  }));
+  return std::unique_ptr<Thread>(Env::Default()->StartThread(
+      {}, "FakeClockAdvancerThread", [env, start, stop] {
+        start->WaitForNotification();
+        while (!stop->HasBeenNotified()) {
+          env->AdvanceByMicroseconds(10);
+          Env::Default()->SleepForMicroseconds(10);
+        }
+      }));
 }
 
 TEST(SharedBatchSchedulerTest, Basic) {
@@ -258,7 +257,7 @@ TEST(SharedBatchSchedulerTest, ObeysTimeout) {
 TEST(SharedBatchSchedulerTest, ObeysTimeoutWithRealClock) {
   Notification first_batch_processed, second_batch_processed;
   auto callback = [&first_batch_processed, &second_batch_processed](
-      std::unique_ptr<Batch<FakeTask>> batch) {
+                      std::unique_ptr<Batch<FakeTask>> batch) {
     ASSERT_TRUE(batch->IsClosed());
     if (batch->size() == 1) {
       first_batch_processed.Notify();
@@ -301,7 +300,7 @@ TEST(SharedBatchSchedulerTest,
   {
     Notification first_batch_processed, second_batch_processed;
     auto callback = [&first_batch_processed, &second_batch_processed](
-        std::unique_ptr<Batch<FakeTask>> batch) {
+                        std::unique_ptr<Batch<FakeTask>> batch) {
       ASSERT_TRUE(batch->IsClosed());
       if (batch->size() == 1) {
         first_batch_processed.Notify();
@@ -349,7 +348,7 @@ TEST(SharedBatchSchedulerTest, Fairness) {
     auto queue_0_callback = [&queue_0_first_batch_scheduled,
                              &queue_0_first_batch_proceed,
                              &queue_0_second_batch_scheduled](
-        std::unique_ptr<Batch<FakeTask>> batch) {
+                                std::unique_ptr<Batch<FakeTask>> batch) {
       if (!queue_0_first_batch_scheduled.HasBeenNotified()) {
         queue_0_first_batch_scheduled.Notify();
         queue_0_first_batch_proceed.WaitForNotification();
@@ -467,7 +466,7 @@ TEST(SharedBatchSchedulerTest, ConstMethods) {
 TEST(SharedBatchSchedulerTest, OneFullQueueDoesntBlockOtherQueues) {
   Notification queue_0_processing, queue_0_proceed;
   auto queue_0_callback = [&queue_0_processing, &queue_0_proceed](
-      std::unique_ptr<Batch<FakeTask>> batch) {
+                              std::unique_ptr<Batch<FakeTask>> batch) {
     if (!queue_0_processing.HasBeenNotified()) {
       queue_0_processing.Notify();
       queue_0_proceed.WaitForNotification();
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 2d6e06398f..0853362b26 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -92,7 +92,6 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
     }
 
    private:
-
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index 1f6d32f8df..f3e4f1cd3f 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/notification.h"
 
-
 namespace tensorflow {
 
 /* static */
@@ -185,8 +184,7 @@ Status CapturedFunction::MaybeInstantiate(
   return Status::OK();
 }
 
-Status CapturedFunction::Run(IteratorContext* ctx,
-                             std::vector<Tensor>&& args,
+Status CapturedFunction::Run(IteratorContext* ctx, std::vector<Tensor>&& args,
                              std::vector<Tensor>* rets) {
   FunctionLibraryRuntime::Handle handle;
   TF_RETURN_IF_ERROR(MaybeInstantiate(ctx, &handle));
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index 13c2501bbb..d636c37afe 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -128,8 +128,8 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
         while (i_ < dataset()->count_) {
           // Fetch and throw away Tensors.
           std::vector<Tensor> dummy_out_tensors;
-          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &dummy_out_tensors,
-                                                  end_of_sequence));
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, &dummy_out_tensors, end_of_sequence));
           if (*end_of_sequence) {
             // We reached the end before the count was reached.
             input_impl_.reset();
@@ -140,8 +140,8 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
         }
 
         // Return GetNext() on the underlying iterator.
-        TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, out_tensors,
-                                                end_of_sequence));
+        TF_RETURN_IF_ERROR(
+            input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
         if (*end_of_sequence) {
           input_impl_.reset();
         }
@@ -184,8 +184,7 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("SkipDataset").Device(DEVICE_CPU),
-                        SkipDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("SkipDataset").Device(DEVICE_CPU), SkipDatasetOp);
 
 }  // namespace
 
diff --git a/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc
index 6d4a9dfdef..37edd1ce0f 100644
--- a/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
index b084a97204..f3b24b2341 100644
--- a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc
index 9dd795b94e..e9ffad1786 100644
--- a/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc
index 4a68a5b580..020f18b189 100644
--- a/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
index 2d6c82826c..a8f07f4bad 100644
--- a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
index 81b6e49124..f5dd47a052 100644
--- a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
index d91a351c59..4d736a2160 100644
--- a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
index ac3a12aa39..5c3fc4a279 100644
--- a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
index 978fcd1028..c90ad2cfeb 100644
--- a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
index 7d1aa1fbf3..738d78e99a 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc
index 94255d215e..e98363ffbf 100644
--- a/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
index f0d7c670a6..4040bf52bf 100644
--- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
@@ -46,7 +46,7 @@ GraphTransferUtils::GetTopNFloatResults(const float* const data,
       GetTopNFloatResults(data, labels, element_count);
   LOG(INFO) << "=== Dump ranking ===";
   for (int i = 0; i < top_n; ++i) {
-    const std::tuple<float, int, string> &entry = queue.top();
+    const std::tuple<float, int, string>& entry = queue.top();
     LOG(INFO) << i << ": " << std::get<1>(entry) << ", " << std::get<2>(entry)
               << ", " << std::get<0>(entry);
     queue.pop();
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h
index a360d188cc..0d43d028cd 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.h
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.h
@@ -181,8 +181,8 @@ class GraphTransferer {
   void AppendNodeInputParams(const int id, const Node& node,
                              const std::vector<int>& extra_inputs);
 
-  void AppendNodeOutputParams(const ShapeRefiner& shape_refiner,
-                              const int id, const Node& node);
+  void AppendNodeOutputParams(const ShapeRefiner& shape_refiner, const int id,
+                              const Node& node);
 
   static std::array<int64, SHAPE_ARRAY_SIZE> BuildShapeArray(
       const shape_inference::ShapeHandle& shape_handle,
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
index 536d295506..20b09f144b 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
@@ -42,8 +42,7 @@ constexpr float VALUE_TOLERANCE_FLOAT = 1e-8f;
 
 class GraphTransfererTest : public ::testing::Test {
  protected:
-  void SetUp() final {
-  }
+  void SetUp() final {}
 
   GraphTransferer gt_;
 };
@@ -61,7 +60,7 @@ class TestGraphTransferOpsDefinitions : public IRemoteFusedGraphOpsDefinitions {
       }
     }
     return -1;
-}
+  }
 
  private:
   const std::vector<string> op_types_{"INPUT",   "OUTPUT",  "Conv2D",
diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
index 71bc4187b7..3f794dfb1a 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
@@ -420,7 +420,7 @@ TEST(GraphTransferer,
       false,  // is_text_proto
       false,  // shape_inference_for_unknown_shape
       true    // dry_run_for_unknown_shape
-      );
+  );
   ASSERT_TRUE(status.ok()) << status;
   prof.Stop();
   prof.DumpStatistics("LoadGraphFromProtoFile");
@@ -487,7 +487,7 @@ TEST(GraphTransferer,
       false,  // is_text_proto
       true,   // shape_inference_for_unknown_shape
       false   // dry_run_for_unknown_shape
-      );
+  );
   ASSERT_TRUE(status.ok()) << status;
   prof.Stop();
   prof.DumpStatistics("LoadGraphFromProtoFile");
@@ -556,7 +556,7 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) {
       false,  // is_text_proto
       false,  // shape_inference_for_unknown_shape
       true    // dry_run_for_unknown_shape
-      );
+  );
   const GraphTransferInfo& gfi0 = gt0.GetGraphTransferInfo();
 
   ASSERT_TRUE(status.ok());
@@ -576,7 +576,7 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) {
       false,  // is_text_proto
       true,   // shape_inference_for_unknown_shape
       false   // dry_run_for_unknown_shape
-      );
+  );
   const GraphTransferInfo& gfi1 = gt1.GetGraphTransferInfo();
 
   ASSERT_TRUE(status.ok());
diff --git a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
index 17f2af550f..0e820bbb62 100644
--- a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
@@ -71,10 +71,10 @@ class NeonDepthwiseConv2dNativeOp : public BinaryOp<float> {
                                         filter.shape().DebugString()));
 
     const int32 in_depth = input.dim_size(3);
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
     const int32 batch = input.dim_size(0);
     const int32 input_rows = input.dim_size(1);
     const int32 input_cols = input.dim_size(2);
diff --git a/tensorflow/core/lib/core/status.h b/tensorflow/core/lib/core/status.h
index 58a50a70c2..49f74ff47f 100644
--- a/tensorflow/core/lib/core/status.h
+++ b/tensorflow/core/lib/core/status.h
@@ -131,7 +131,7 @@ inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v,
   while (auto _result = ::tensorflow::TfCheckOpHelper(val, #val)) \
   LOG(level) << *(_result)
 
-#define TF_CHECK_OK(val)  TF_DO_CHECK_OK(val, FATAL)
+#define TF_CHECK_OK(val) TF_DO_CHECK_OK(val, FATAL)
 #define TF_QCHECK_OK(val) TF_DO_CHECK_OK(val, QFATAL)
 
 // DEBUG only version of TF_CHECK_OK.  Compiler still parses 'val' even in opt
diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc
index 2b10ebeaf7..e55ed79d36 100644
--- a/tensorflow/core/lib/core/threadpool.cc
+++ b/tensorflow/core/lib/core/threadpool.cc
@@ -66,7 +66,9 @@ struct EigenEnvironment {
     }
     return Task{
         std::unique_ptr<TaskImpl>(new TaskImpl{
-            std::move(f), Context(ContextKind::kThread), id,
+            std::move(f),
+            Context(ContextKind::kThread),
+            id,
         }),
     };
   }
diff --git a/tensorflow/core/lib/core/threadpool_test.cc b/tensorflow/core/lib/core/threadpool_test.cc
index 49ddb16645..627ef5a892 100644
--- a/tensorflow/core/lib/core/threadpool_test.cc
+++ b/tensorflow/core/lib/core/threadpool_test.cc
@@ -97,8 +97,8 @@ TEST(ThreadPool, ParallelForWithWorkerId) {
     }
     pool.ParallelForWithWorkerId(
         kWorkItems, kHugeCost,
-        [&threads_running, &work, num_threads](
-            int64 begin, int64 end, int64 id) {
+        [&threads_running, &work, num_threads](int64 begin, int64 end,
+                                               int64 id) {
           // Store true for the current thread, and assert that another thread
           // is not running with the same id.
           ASSERT_LE(0, id);
diff --git a/tensorflow/core/lib/db/sqlite.h b/tensorflow/core/lib/db/sqlite.h
index 0faa458f1d..efe97f78d2 100644
--- a/tensorflow/core/lib/db/sqlite.h
+++ b/tensorflow/core/lib/db/sqlite.h
@@ -18,12 +18,12 @@ limitations under the License.
 #include <mutex>
 
 #include "sqlite3.h"
+#include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/lib/core/refcount.h"
 
 /// TensorFlow SQLite Veneer
 ///
@@ -121,10 +121,7 @@ class LOCKABLE Sqlite : public core::RefCounted {
 
   Sqlite(sqlite3* db, sqlite3_stmt* begin, sqlite3_stmt* commit,
          sqlite3_stmt* rollback) noexcept
-      : db_(db),
-        begin_(begin),
-        commit_(commit),
-        rollback_(rollback) {}
+      : db_(db), begin_(begin), commit_(commit), rollback_(rollback) {}
 
   sqlite3* const db_;
   sqlite3_stmt* const begin_;
@@ -233,7 +230,8 @@ class SqliteStatement {
   /// freed until this statement is Reset() or finalized.
   void BindText(int parameter, const StringPiece& text) {
     Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(),
-                               SQLITE_TRANSIENT, SQLITE_UTF8), parameter);
+                               SQLITE_TRANSIENT, SQLITE_UTF8),
+           parameter);
     size_ += text.size();
   }
   void BindText(const char* parameter, const StringPiece& text) {
@@ -241,7 +239,8 @@ class SqliteStatement {
   }
   void BindTextUnsafe(int parameter, const StringPiece& text) {
     Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(),
-                               SQLITE_STATIC, SQLITE_UTF8), parameter);
+                               SQLITE_STATIC, SQLITE_UTF8),
+           parameter);
     size_ += text.size();
   }
   void BindTextUnsafe(const char* parameter, const StringPiece& text) {
@@ -254,7 +253,8 @@ class SqliteStatement {
   /// freed until this statement is Reset() or finalized.
   void BindBlob(int parameter, const StringPiece& blob) {
     Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(),
-                               SQLITE_TRANSIENT), parameter);
+                               SQLITE_TRANSIENT),
+           parameter);
     size_ += blob.size();
   }
   void BindBlob(const char* parameter, const StringPiece& blob) {
@@ -262,7 +262,8 @@ class SqliteStatement {
   }
   void BindBlobUnsafe(int parameter, const StringPiece& blob) {
     Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(),
-                               SQLITE_STATIC), parameter);
+                               SQLITE_STATIC),
+           parameter);
     size_ += blob.size();
   }
   void BindBlobUnsafe(const char* parameter, const StringPiece& text) {
@@ -320,9 +321,7 @@ class SqliteStatement {
 
   /// \brief Move constructor, after which <other> is reset to empty.
   SqliteStatement(SqliteStatement&& other) noexcept
-      : db_(other.db_),
-        stmt_(other.stmt_),
-        bind_error_(other.bind_error_) {
+      : db_(other.db_), stmt_(other.stmt_), bind_error_(other.bind_error_) {
     other.db_ = nullptr;
     other.stmt_ = nullptr;
     other.bind_error_ = SQLITE_OK;
diff --git a/tensorflow/core/lib/db/sqlite_test.cc b/tensorflow/core/lib/db/sqlite_test.cc
index c9c76ea5f2..1e88323d01 100644
--- a/tensorflow/core/lib/db/sqlite_test.cc
+++ b/tensorflow/core/lib/db/sqlite_test.cc
@@ -33,9 +33,7 @@ class SqliteTest : public ::testing::Test {
     db_->PrepareOrDie("CREATE TABLE T (a BLOB, b BLOB)").StepAndResetOrDie();
   }
 
-  void TearDown() override {
-    db_->Unref();
-  }
+  void TearDown() override { db_->Unref(); }
 
   Sqlite* db_;
   bool is_done_;
@@ -213,7 +211,7 @@ TEST_F(SqliteTest, BindFailed) {
   Status s = stmt.StepOnce();
   EXPECT_NE(string::npos,
             s.error_message().find("INSERT INTO T (a) VALUES (123)"))
-            << s.error_message();
+      << s.error_message();
 }
 
 TEST_F(SqliteTest, SnappyExtension) {
@@ -226,7 +224,7 @@ TEST_F(SqliteTest, SnappyBinaryCompatibility) {
   EXPECT_EQ(
       "today is the end of the republic",
       db_->PrepareOrDie("SELECT UNSNAP(X'03207C746F6461792069732074686520656E64"
-                            "206F66207468652072657075626C6963')")
+                        "206F66207468652072657075626C6963')")
           .StepOnceOrDie()
           .ColumnString(0));
 }
diff --git a/tensorflow/core/lib/gtl/cleanup.h b/tensorflow/core/lib/gtl/cleanup.h
index 6053e98640..6bd60ca482 100644
--- a/tensorflow/core/lib/gtl/cleanup.h
+++ b/tensorflow/core/lib/gtl/cleanup.h
@@ -55,22 +55,21 @@ namespace gtl {
 template <typename F>
 class Cleanup {
  public:
-  Cleanup()
-      : released_(true), f_() {}
+  Cleanup() : released_(true), f_() {}
 
   template <typename G>
-  explicit Cleanup(G&& f)  // NOLINT
+  explicit Cleanup(G&& f)          // NOLINT
       : f_(std::forward<G>(f)) {}  // NOLINT(build/c++11)
 
   Cleanup(Cleanup&& src)  // NOLINT
-      : released_(src.is_released()), f_(src.release()) { }
+      : released_(src.is_released()), f_(src.release()) {}
 
   // Implicitly move-constructible from any compatible Cleanup<G>.
   // The source will be released as if src.release() were called.
   // A moved-from Cleanup can be safely destroyed or reassigned.
   template <typename G>
   Cleanup(Cleanup<G>&& src)  // NOLINT
-      : released_(src.is_released()), f_(src.release()) { }
+      : released_(src.is_released()), f_(src.release()) {}
 
   // Assignment to a Cleanup object behaves like destroying it
   // and making a new one in its place, analogous to unique_ptr
@@ -102,8 +101,8 @@ class Cleanup {
   F f_;
 };
 
-template <int&... ExplicitParameterBarrier,
-          typename F, typename DecayF = typename std::decay<F>::type>
+template <int&... ExplicitParameterBarrier, typename F,
+          typename DecayF = typename std::decay<F>::type>
 TF_MUST_USE_RESULT Cleanup<DecayF> MakeCleanup(F&& f) {
   return Cleanup<DecayF>(std::forward<F>(f));
 }
diff --git a/tensorflow/core/lib/gtl/cleanup_test.cc b/tensorflow/core/lib/gtl/cleanup_test.cc
index bd151cb2ab..a86ffd5fe2 100644
--- a/tensorflow/core/lib/gtl/cleanup_test.cc
+++ b/tensorflow/core/lib/gtl/cleanup_test.cc
@@ -65,15 +65,14 @@ TEST(CleanupTest, Release) {
 TEST(FinallyTest, TypeErasedWithoutFactory) {
   string s = "active";
   {
-    AnyCleanup s_cleaner([&s]{ s.append(" clean"); });
+    AnyCleanup s_cleaner([&s] { s.append(" clean"); });
     EXPECT_EQ("active", s);
   }
   EXPECT_EQ("active clean", s);
 }
 
 struct Appender {
-  Appender(string* s, const string& msg)
-      : s_(s), msg_(msg) {}
+  Appender(string* s, const string& msg) : s_(s), msg_(msg) {}
   void operator()() const { s_->append(msg_); }
   string* s_;
   string msg_;
@@ -163,7 +162,12 @@ class CleanupReferenceTest : public ::testing::Test {
     int* i;
     F(int* cp, int* i) : cp(cp), i(i) {}
     F(const F& o) : cp(o.cp), i(o.i) { ++*cp; }
-    F& operator=(const F& o) { cp = o.cp; i = o.i; ++*cp; return *this; }
+    F& operator=(const F& o) {
+      cp = o.cp;
+      i = o.i;
+      ++*cp;
+      return *this;
+    }
     F(F&&) = default;
     F& operator=(F&&) = default;
     void operator()() const { ++*i; }
@@ -279,7 +283,7 @@ BENCHMARK(BM_AnyCleanup);
 
 void BM_AnyCleanupNoFactory(int iters) {
   while (iters--) {
-    AnyCleanup fin([]{Incr();});
+    AnyCleanup fin([] { Incr(); });
   }
 }
 BENCHMARK(BM_AnyCleanupNoFactory);
diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h
index d6e5d9effa..6e3cb2206d 100644
--- a/tensorflow/core/lib/gtl/inlined_vector.h
+++ b/tensorflow/core/lib/gtl/inlined_vector.h
@@ -31,12 +31,12 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_
 #define TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_
 
-#include <cstddef>
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <algorithm>
+#include <cstddef>
 #include <iterator>
 #include <memory>
 #include <type_traits>
@@ -407,7 +407,7 @@ class InlinedVector {
   };
   // 2) Construct a T with args at not-yet-initialized memory pointed by dst.
   struct Construct {
-    template<class... Args>
+    template <class... Args>
     void operator()(T* dst, Args&&... args) const {
       new (dst) T(std::forward<Args>(args)...);
     }
diff --git a/tensorflow/core/lib/gtl/int_type.h b/tensorflow/core/lib/gtl/int_type.h
index 647fc81aa7..af3e50ad78 100644
--- a/tensorflow/core/lib/gtl/int_type.h
+++ b/tensorflow/core/lib/gtl/int_type.h
@@ -255,13 +255,13 @@ class IntType {
     value_ op arg_value;                             \
     return *this;                                    \
   }
-  INT_TYPE_ASSIGNMENT_OP(+= );
-  INT_TYPE_ASSIGNMENT_OP(-= );
-  INT_TYPE_ASSIGNMENT_OP(*= );
-  INT_TYPE_ASSIGNMENT_OP(/= );
-  INT_TYPE_ASSIGNMENT_OP(<<= );  // NOLINT
-  INT_TYPE_ASSIGNMENT_OP(>>= );  // NOLINT
-  INT_TYPE_ASSIGNMENT_OP(%= );
+  INT_TYPE_ASSIGNMENT_OP(+=);
+  INT_TYPE_ASSIGNMENT_OP(-=);
+  INT_TYPE_ASSIGNMENT_OP(*=);
+  INT_TYPE_ASSIGNMENT_OP(/=);
+  INT_TYPE_ASSIGNMENT_OP(<<=);  // NOLINT
+  INT_TYPE_ASSIGNMENT_OP(>>=);  // NOLINT
+  INT_TYPE_ASSIGNMENT_OP(%=);
 #undef INT_TYPE_ASSIGNMENT_OP
 
   ThisType& operator=(ValueType arg_value) {
@@ -314,10 +314,10 @@ std::ostream& operator<<(std::ostream& os,  // NOLINT
 INT_TYPE_ARITHMETIC_OP(+);
 INT_TYPE_ARITHMETIC_OP(-);
 INT_TYPE_ARITHMETIC_OP(*);
-INT_TYPE_ARITHMETIC_OP(/ );
-INT_TYPE_ARITHMETIC_OP(<< );  // NOLINT
-INT_TYPE_ARITHMETIC_OP(>> );  // NOLINT
-INT_TYPE_ARITHMETIC_OP(% );
+INT_TYPE_ARITHMETIC_OP(/);
+INT_TYPE_ARITHMETIC_OP(<<);  // NOLINT
+INT_TYPE_ARITHMETIC_OP(>>);  // NOLINT
+INT_TYPE_ARITHMETIC_OP(%);
 #undef INT_TYPE_ARITHMETIC_OP
 
 // -- NON-MEMBER COMPARISON OPERATORS ------------------------------------------
@@ -345,12 +345,12 @@ INT_TYPE_ARITHMETIC_OP(% );
       IntType<IntTypeName, ValueType> id) {                      \
     return val op id.value();                                    \
   }
-INT_TYPE_COMPARISON_OP(== );  // NOLINT
-INT_TYPE_COMPARISON_OP(!= );  // NOLINT
-INT_TYPE_COMPARISON_OP(< );   // NOLINT
-INT_TYPE_COMPARISON_OP(<= );  // NOLINT
-INT_TYPE_COMPARISON_OP(> );   // NOLINT
-INT_TYPE_COMPARISON_OP(>= );  // NOLINT
+INT_TYPE_COMPARISON_OP(==);  // NOLINT
+INT_TYPE_COMPARISON_OP(!=);  // NOLINT
+INT_TYPE_COMPARISON_OP(<);   // NOLINT
+INT_TYPE_COMPARISON_OP(<=);  // NOLINT
+INT_TYPE_COMPARISON_OP(>);   // NOLINT
+INT_TYPE_COMPARISON_OP(>=);  // NOLINT
 #undef INT_TYPE_COMPARISON_OP
 
 }  // namespace gtl
diff --git a/tensorflow/core/lib/gtl/int_type_test.cc b/tensorflow/core/lib/gtl/int_type_test.cc
index d3c405d9ac..61d364017c 100644
--- a/tensorflow/core/lib/gtl/int_type_test.cc
+++ b/tensorflow/core/lib/gtl/int_type_test.cc
@@ -42,7 +42,8 @@ class IntTypeTest : public ::testing::Test {
 
 // All tests below will be executed on all supported IntTypes.
 typedef ::testing::Types<Int8_IT, UInt8_IT, Int16_IT, UInt16_IT, Int32_IT,
-                         Int64_IT, UInt64_IT, Long_IT> SupportedIntTypes;
+                         Int64_IT, UInt64_IT, Long_IT>
+    SupportedIntTypes;
 
 TYPED_TEST_CASE(IntTypeTest, SupportedIntTypes);
 
@@ -232,7 +233,8 @@ TYPED_TEST(IntTypeTest, TestOperators) {
 
 TYPED_TEST(IntTypeTest, TestHashFunctor) {
   std::unordered_map<typename TestFixture::T, char,
-                     typename TestFixture::T::Hasher> map;
+                     typename TestFixture::T::Hasher>
+      map;
   typename TestFixture::T a(0);
   map[a] = 'c';
   EXPECT_EQ('c', map[a]);
diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h
index 2ff8b9c7d1..fa33c24c0c 100644
--- a/tensorflow/core/lib/gtl/optional.h
+++ b/tensorflow/core/lib/gtl/optional.h
@@ -593,12 +593,12 @@ class optional : private internal_optional::optional_data<T>,
     assert(this->engaged_);
     return this->pointer();
   }
-  constexpr const T& operator*() const & { return reference(); }
+  constexpr const T& operator*() const& { return reference(); }
   T& operator*() & {
     assert(this->engaged_);
     return reference();
   }
-  constexpr const T&& operator*() const && { return std::move(reference()); }
+  constexpr const T&& operator*() const&& { return std::move(reference()); }
   T&& operator*() && {
     assert(this->engaged_);
     return std::move(reference());
@@ -621,7 +621,7 @@ class optional : private internal_optional::optional_data<T>,
   // Use `opt.value()` to get a reference to underlying value.  The constness
   // and lvalue/rvalue-ness of `opt` is preserved to the view of the T
   // subobject.
-  const T& value() const & {
+  const T& value() const& {
     CHECK(*this) << "Bad optional access";
     return reference();
   }
@@ -633,7 +633,7 @@ class optional : private internal_optional::optional_data<T>,
     CHECK(*this) << "Bad optional access";
     return std::move(reference());
   }
-  const T&& value() const && {  // NOLINT(build/c++11)
+  const T&& value() const&& {  // NOLINT(build/c++11)
     CHECK(*this) << "Bad optional access";
     return std::move(reference());
   }
@@ -641,7 +641,7 @@ class optional : private internal_optional::optional_data<T>,
   // Use `opt.value_or(val)` to get either the value of T or the given default
   // `val` in the empty case.
   template <class U>
-  constexpr T value_or(U&& v) const & {
+  constexpr T value_or(U&& v) const& {
     return static_cast<bool>(*this) ? **this
                                     : static_cast<T>(std::forward<U>(v));
   }
@@ -656,8 +656,8 @@ class optional : private internal_optional::optional_data<T>,
   constexpr const T& reference() const { return *this->pointer(); }
   T& reference() { return *(this->pointer()); }
 
-  // T constraint checks.  You can't have an optional of nullopt_t, in_place_t or
-  // a reference.
+  // T constraint checks.  You can't have an optional of nullopt_t, in_place_t
+  // or a reference.
   static_assert(
       !std::is_same<nullopt_t, typename std::remove_cv<T>::type>::value,
       "optional<nullopt_t> is not allowed.");
diff --git a/tensorflow/core/lib/gtl/optional_test.cc b/tensorflow/core/lib/gtl/optional_test.cc
index 547bee7b75..12b5bbc60b 100644
--- a/tensorflow/core/lib/gtl/optional_test.cc
+++ b/tensorflow/core/lib/gtl/optional_test.cc
@@ -24,17 +24,29 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-using tensorflow::gtl::optional;
-using tensorflow::gtl::nullopt;
-using tensorflow::gtl::nullopt_t;
 using tensorflow::gtl::in_place;
 using tensorflow::gtl::in_place_t;
 using tensorflow::gtl::make_optional;
+using tensorflow::gtl::nullopt;
+using tensorflow::gtl::nullopt_t;
+using tensorflow::gtl::optional;
 
-template <typename T> string TypeQuals(T&) { return "&"; }
-template <typename T> string TypeQuals(T&&) { return "&&"; }
-template <typename T> string TypeQuals(const T&) { return "c&"; }
-template <typename T> string TypeQuals(const T&&) { return "c&&"; }
+template <typename T>
+string TypeQuals(T&) {
+  return "&";
+}
+template <typename T>
+string TypeQuals(T&&) {
+  return "&&";
+}
+template <typename T>
+string TypeQuals(const T&) {
+  return "c&";
+}
+template <typename T>
+string TypeQuals(const T&&) {
+  return "c&&";
+}
 
 struct StructorListener {
   int construct0 = 0;
diff --git a/tensorflow/core/lib/gtl/top_n_test.cc b/tensorflow/core/lib/gtl/top_n_test.cc
index fae85570dc..ba30c072a9 100644
--- a/tensorflow/core/lib/gtl/top_n_test.cc
+++ b/tensorflow/core/lib/gtl/top_n_test.cc
@@ -28,10 +28,10 @@ limitations under the License.
 
 namespace {
 
+using tensorflow::string;
 using tensorflow::gtl::TopN;
 using tensorflow::random::PhiloxRandom;
 using tensorflow::random::SimplePhilox;
-using tensorflow::string;
 
 // Move the contents from an owned raw pointer, returning by value.
 // Objects are easier to manage by value.
diff --git a/tensorflow/core/lib/io/compression.cc b/tensorflow/core/lib/io/compression.cc
index c12de98e40..0d25bca9ec 100644
--- a/tensorflow/core/lib/io/compression.cc
+++ b/tensorflow/core/lib/io/compression.cc
@@ -22,6 +22,6 @@ namespace compression {
 const char kNone[] = "";
 const char kGzip[] = "GZIP";
 
-}
-}
-}
+}  // namespace compression
+}  // namespace io
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/compression.h b/tensorflow/core/lib/io/compression.h
index ef90c60a3a..4d8e7788ca 100644
--- a/tensorflow/core/lib/io/compression.h
+++ b/tensorflow/core/lib/io/compression.h
@@ -23,8 +23,8 @@ namespace compression {
 extern const char kNone[];
 extern const char kGzip[];
 
-}
-}
-}
+}  // namespace compression
+}  // namespace io
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index 403c82818e..9cc6c4034f 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -207,7 +207,7 @@ Status RecordReader::SkipNBytes(uint64 offset) {
     }
   }
   return Status::OK();
-}
+}  // namespace io
 
 SequentialRecordReader::SequentialRecordReader(
     RandomAccessFile* file, const RecordReaderOptions& options)
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index 507c26a63f..b7e51256a2 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -218,8 +218,8 @@ TEST_F(RecordioTest, RandomRead) {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void AssertHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected)) << s << " does not contain "
-                                                 << expected;
+  EXPECT_TRUE(StringPiece(s).contains(expected))
+      << s << " does not contain " << expected;
 }
 
 TEST_F(RecordioTest, ReadError) {
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index 354c819b09..77a3414442 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -197,8 +197,8 @@ bool CommonInitDecode(StringPiece png_string, int desired_channels,
                       int desired_channel_bits, DecodeContext* context) {
   CHECK(desired_channel_bits == 8 || desired_channel_bits == 16)
       << "desired_channel_bits = " << desired_channel_bits;
-  CHECK(0 <= desired_channels && desired_channels <= 4) << "desired_channels = "
-                                                        << desired_channels;
+  CHECK(0 <= desired_channels && desired_channels <= 4)
+      << "desired_channels = " << desired_channels;
   context->error_condition = false;
   context->channels = desired_channels;
   context->png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, context,
diff --git a/tensorflow/core/lib/random/philox_random_test_utils.h b/tensorflow/core/lib/random/philox_random_test_utils.h
index f4bb087e10..6c29ae6b6a 100644
--- a/tensorflow/core/lib/random/philox_random_test_utils.h
+++ b/tensorflow/core/lib/random/philox_random_test_utils.h
@@ -35,8 +35,8 @@ void FillRandoms(PhiloxRandom gen, typename Distribution::ResultElementType* p,
                  int64 size) {
   const int granularity = Distribution::kResultElementCount;
 
-  CHECK(size % granularity == 0) << " size: " << size
-                                 << " granularity: " << granularity;
+  CHECK(size % granularity == 0)
+      << " size: " << size << " granularity: " << granularity;
 
   Distribution dist;
   for (int i = 0; i < size; i += granularity) {
diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index 0e281403f8..3fe1f9bc6c 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
 
 #define _USE_MATH_DEFINES
-#include <cmath>
 #include <math.h>
+#include <cmath>
 #undef _USE_MATH_DEFINES
 
 #include <string.h>
@@ -27,7 +27,6 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/lib/random/philox_random.h"
 
-
 namespace tensorflow {
 namespace random {
 
diff --git a/tensorflow/core/lib/random/random_distributions_test.cc b/tensorflow/core/lib/random/random_distributions_test.cc
index 90d0dba4a7..85d68f456e 100644
--- a/tensorflow/core/lib/random/random_distributions_test.cc
+++ b/tensorflow/core/lib/random/random_distributions_test.cc
@@ -45,8 +45,8 @@ void FillRandomsWithSingles(PhiloxRandom gen,
                             int64 size) {
   int granularity = Distribution::kResultElementCount;
 
-  CHECK(size % granularity == 0) << " size: " << size
-                                 << " granularity: " << granularity;
+  CHECK(size % granularity == 0)
+      << " size: " << size << " granularity: " << granularity;
 
   SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
 
diff --git a/tensorflow/core/lib/strings/ordered_code.cc b/tensorflow/core/lib/strings/ordered_code.cc
index af9a151259..ef90050b4f 100644
--- a/tensorflow/core/lib/strings/ordered_code.cc
+++ b/tensorflow/core/lib/strings/ordered_code.cc
@@ -472,7 +472,8 @@ void OrderedCode::WriteSignedNumIncreasing(string* dest, int64 val) {
   // buf = val in network byte order, sign extended to 10 bytes
   const char sign_byte = val < 0 ? '\xff' : '\0';
   char buf[10] = {
-      sign_byte, sign_byte,
+      sign_byte,
+      sign_byte,
   };
   StoreBigEndian64(buf + 2, val);
   static_assert(sizeof(buf) == kMaxSigned64Length, "max length size mismatch");
diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h
index 5835b0101d..2bc14945cd 100644
--- a/tensorflow/core/lib/strings/strcat.h
+++ b/tensorflow/core/lib/strings/strcat.h
@@ -126,7 +126,7 @@ class AlphaNum {
       : piece_(digits_, strlen(DoubleToBuffer(f, digits_))) {}
 
   AlphaNum(const Eigen::half &f);  // NOLINT(runtime/explicit)
-  AlphaNum(Hex hex);  // NOLINT(runtime/explicit)
+  AlphaNum(Hex hex);               // NOLINT(runtime/explicit)
 
   AlphaNum(const char *c_str) : piece_(c_str) {}   // NOLINT(runtime/explicit)
   AlphaNum(const StringPiece &pc) : piece_(pc) {}  // NOLINT(runtime/explicit)
diff --git a/tensorflow/core/ops/compat/backwards_compatibility_test.cc b/tensorflow/core/ops/compat/backwards_compatibility_test.cc
index add05d6610..6e05ae4be4 100644
--- a/tensorflow/core/ops/compat/backwards_compatibility_test.cc
+++ b/tensorflow/core/ops/compat/backwards_compatibility_test.cc
@@ -25,8 +25,9 @@ namespace tensorflow {
 namespace {
 
 TEST(BackwardsCompatibilityTest, IsCompatible) {
-  OpCompatibilityLib compatibility(
-      "tensorflow/core/ops", strings::StrCat("v", TF_MAJOR_VERSION), nullptr);
+  OpCompatibilityLib compatibility("tensorflow/core/ops",
+                                   strings::StrCat("v", TF_MAJOR_VERSION),
+                                   nullptr);
 
   Env* env = Env::Default();
   int changed_ops = 0;
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.cc b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
index 2b0e55bf37..4d9aff4d24 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache.cc
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <arpa/inet.h>
 #include <netdb.h>
 #else
+#include <Windows.h>
 #include <winsock2.h>
 #include <ws2tcpip.h>
-#include <Windows.h>
 #endif
 #include <sys/types.h>
 
diff --git a/tensorflow/core/platform/cloud/http_request_fake.h b/tensorflow/core/platform/cloud/http_request_fake.h
index 682b97f6ec..7711eaceb2 100644
--- a/tensorflow/core/platform/cloud/http_request_fake.h
+++ b/tensorflow/core/platform/cloud/http_request_fake.h
@@ -38,8 +38,7 @@ class FakeHttpRequest : public CurlHttpRequest {
  public:
   /// Return the response for the given request.
   FakeHttpRequest(const string& request, const string& response)
-      : FakeHttpRequest(request, response, Status::OK(), nullptr, {}, 200) {
-  }
+      : FakeHttpRequest(request, response, Status::OK(), nullptr, {}, 200) {}
 
   /// Return the response with headers for the given request.
   FakeHttpRequest(const string& request, const string& response,
diff --git a/tensorflow/core/platform/cloud/oauth_client_test.cc b/tensorflow/core/platform/cloud/oauth_client_test.cc
index 236259dbc1..ad569758cc 100644
--- a/tensorflow/core/platform/cloud/oauth_client_test.cc
+++ b/tensorflow/core/platform/cloud/oauth_client_test.cc
@@ -160,12 +160,12 @@ TEST(OAuthClientTest, GetTokenFromServiceAccountJson) {
   ASSERT_EQ(1, EVP_DigestVerifyInit(md_ctx, nullptr, md, nullptr, key));
   ASSERT_EQ(1, EVP_DigestVerifyUpdate(md_ctx, header_dot_claim.c_str(),
                                       header_dot_claim.size()));
-  ASSERT_EQ(
-      1,
-      EVP_DigestVerifyFinal(
-          md_ctx, const_cast<unsigned char*>(
-                      reinterpret_cast<const unsigned char*>(signature.data())),
-          signature.size()));
+  ASSERT_EQ(1,
+            EVP_DigestVerifyFinal(
+                md_ctx,
+                const_cast<unsigned char*>(
+                    reinterpret_cast<const unsigned char*>(signature.data())),
+                signature.size()));
   EVP_MD_CTX_cleanup(md_ctx);
 
   // Free all the crypto-related resources.
diff --git a/tensorflow/core/platform/cloud/retrying_file_system.cc b/tensorflow/core/platform/cloud/retrying_file_system.cc
index c3b6831361..870d935e11 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system.cc
@@ -25,7 +25,6 @@ namespace tensorflow {
 
 namespace {
 
-
 class RetryingRandomAccessFile : public RandomAccessFile {
  public:
   RetryingRandomAccessFile(std::unique_ptr<RandomAccessFile> base_file,
diff --git a/tensorflow/core/platform/cuda_libdevice_path_test.cc b/tensorflow/core/platform/cuda_libdevice_path_test.cc
index 639f6804ea..2d34239a99 100644
--- a/tensorflow/core/platform/cuda_libdevice_path_test.cc
+++ b/tensorflow/core/platform/cuda_libdevice_path_test.cc
@@ -27,8 +27,7 @@ TEST(CudaLibdevicePathTest, LibdevicePath) {
   VLOG(2) << "Libdevice root = " << LibdeviceRoot();
   std::vector<string> libdevice_files;
   TF_EXPECT_OK(Env::Default()->GetMatchingPaths(
-      io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"),
-      &libdevice_files));
+      io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"), &libdevice_files));
   EXPECT_LT(0, libdevice_files.size());
 }
 #endif
diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
index f4b0f16393..8e60a7f091 100644
--- a/tensorflow/core/platform/default/device_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -579,8 +579,10 @@ Status DeviceTracerImpl::Collect(StepStatsCollector *collector) {
   // TODO(pbar) Handle device IDs and prefix properly.
   const string prefix = "";
   const int id = 0;
-  const string stream_device = strings::StrCat(prefix, "/device:GPU:", id, "/stream:");
-  const string memcpy_device = strings::StrCat(prefix, "/device:GPU:", id, "/memcpy");
+  const string stream_device =
+      strings::StrCat(prefix, "/device:GPU:", id, "/stream:");
+  const string memcpy_device =
+      strings::StrCat(prefix, "/device:GPU:", id, "/memcpy");
 
   mutex_lock l2(trace_mu_);
   for (const auto &rec : kernel_records_) {
diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc
index 82bd69f9ca..2b874da198 100644
--- a/tensorflow/core/platform/default/logging.cc
+++ b/tensorflow/core/platform/default/logging.cc
@@ -83,15 +83,14 @@ void LogMessage::GenerateLogMessage() {
   const size_t time_buffer_size = 30;
   char time_buffer[time_buffer_size];
   strftime(time_buffer, time_buffer_size, "%Y-%m-%d %H:%M:%S",
-	   localtime(&now_seconds));
+           localtime(&now_seconds));
 
   // TODO(jeff,sanjay): Replace this with something that logs through the env.
   fprintf(stderr, "%s.%06d: %c %s:%d] %s\n", time_buffer, micros_remainder,
-	  "IWEF"[severity_], fname_, line_, str().c_str());
+          "IWEF"[severity_], fname_, line_, str().c_str());
 }
 #endif
 
-
 namespace {
 
 // Parse log level (int64) from environment variable (char*)
diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h
index 40c260f236..f0efa31d55 100644
--- a/tensorflow/core/platform/default/logging.h
+++ b/tensorflow/core/platform/default/logging.h
@@ -19,8 +19,8 @@ limitations under the License.
 // IWYU pragma: private, include "third_party/tensorflow/core/platform/logging.h"
 // IWYU pragma: friend third_party/tensorflow/core/platform/logging.h
 
-#include <sstream>
 #include <limits>
+#include <sstream>
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -205,16 +205,18 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) {
   inline string* name##Impl(int v1, int v2, const char* exprtext) {       \
     return name##Impl<int, int>(v1, v2, exprtext);                        \
   }                                                                       \
-  inline string* name##Impl(const size_t v1, const int v2, const char* exprtext) {       \
+  inline string* name##Impl(const size_t v1, const int v2,                \
+                            const char* exprtext) {                       \
     if (TF_PREDICT_FALSE(v2 < 0)) {                                       \
-       return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext);\
+      return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \
     }                                                                     \
     const size_t uval = (size_t)((unsigned)v1);                           \
     return name##Impl<size_t, size_t>(uval, v2, exprtext);                \
   }                                                                       \
-  inline string* name##Impl(const int v1, const size_t v2, const char* exprtext) {       \
-    if (TF_PREDICT_FALSE(v2 >= std::numeric_limits<int>::max())) {      \
-       return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext);\
+  inline string* name##Impl(const int v1, const size_t v2,                \
+                            const char* exprtext) {                       \
+    if (TF_PREDICT_FALSE(v2 >= std::numeric_limits<int>::max())) {        \
+      return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \
     }                                                                     \
     const size_t uval = (size_t)((unsigned)v2);                           \
     return name##Impl<size_t, size_t>(v1, uval, exprtext);                \
@@ -225,12 +227,12 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) {
 // This happens if, for example, those are used as token names in a
 // yacc grammar.
 TF_DEFINE_CHECK_OP_IMPL(Check_EQ,
-                        == )  // Compilation error with CHECK_EQ(NULL, x)?
-TF_DEFINE_CHECK_OP_IMPL(Check_NE, != )  // Use CHECK(x == NULL) instead.
-TF_DEFINE_CHECK_OP_IMPL(Check_LE, <= )
-TF_DEFINE_CHECK_OP_IMPL(Check_LT, < )
-TF_DEFINE_CHECK_OP_IMPL(Check_GE, >= )
-TF_DEFINE_CHECK_OP_IMPL(Check_GT, > )
+                        ==)  // Compilation error with CHECK_EQ(NULL, x)?
+TF_DEFINE_CHECK_OP_IMPL(Check_NE, !=)  // Use CHECK(x == NULL) instead.
+TF_DEFINE_CHECK_OP_IMPL(Check_LE, <=)
+TF_DEFINE_CHECK_OP_IMPL(Check_LT, <)
+TF_DEFINE_CHECK_OP_IMPL(Check_GE, >=)
+TF_DEFINE_CHECK_OP_IMPL(Check_GT, >)
 #undef TF_DEFINE_CHECK_OP_IMPL
 
 // In optimized mode, use CheckOpString to hint to compiler that
diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc
index f13b0af2a7..e00dbdb4ae 100644
--- a/tensorflow/core/platform/denormal.cc
+++ b/tensorflow/core/platform/denormal.cc
@@ -41,8 +41,8 @@ namespace tensorflow {
 namespace port {
 
 ScopedFlushDenormal::ScopedFlushDenormal() {
-// For now, we flush denormals only on SSE 3.  Other architectures such as ARM
-// can be added as needed.
+  // For now, we flush denormals only on SSE 3.  Other architectures such as ARM
+  // can be added as needed.
 
 #ifdef DENORM_USE_INTRINSICS
   if (TestCPUFeature(SSE3)) {
diff --git a/tensorflow/core/platform/device_tracer_test.cc b/tensorflow/core/platform/device_tracer_test.cc
index c0c08dabac..89f14e905a 100644
--- a/tensorflow/core/platform/device_tracer_test.cc
+++ b/tensorflow/core/platform/device_tracer_test.cc
@@ -77,7 +77,8 @@ class DeviceTracerTest : public ::testing::Test {
 
     Node* y_neg = test::graph::Unary(&graph, "Neg", i);
     y_neg_ = y_neg->name();
-    y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
+    y_neg->set_assigned_device_name(
+        "/job:localhost/replica:0/task:0/device:GPU:0");
 
     test::graph::ToGraphDef(&graph, &def_);
   }
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 557bfa87e5..34aaf3f78b 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -286,7 +286,7 @@ class Env {
   // "version" should be the version of the library or NULL
   // returns the name that LoadLibrary() can use
   virtual string FormatLibraryFileName(const string& name,
-      const string& version) = 0;
+                                       const string& version) = 0;
 
  private:
   // Returns a possible list of local temporary directories.
@@ -353,6 +353,7 @@ class EnvWrapper : public Env {
                                const string& version) override {
     return target_->FormatLibraryFileName(name, version);
   }
+
  private:
   Env* target_;
 };
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index 14755891fa..b9866cf641 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -131,18 +131,19 @@ Status FileSystem::GetMatchingPaths(const string& pattern,
     if (children.empty()) continue;
     // This IsDirectory call can be expensive for some FS. Parallelizing it.
     children_dir_status.resize(children.size());
-    ForEach(0, children.size(), [this, &current_dir, &children, &fixed_prefix,
-                                 &children_dir_status](int i) {
-      const string child_path = io::JoinPath(current_dir, children[i]);
-      // In case the child_path doesn't start with the fixed_prefix then
-      // we don't need to explore this path.
-      if (!StringPiece(child_path).starts_with(fixed_prefix)) {
-        children_dir_status[i] =
-            Status(tensorflow::error::CANCELLED, "Operation not needed");
-      } else {
-        children_dir_status[i] = IsDirectory(child_path);
-      }
-    });
+    ForEach(0, children.size(),
+            [this, &current_dir, &children, &fixed_prefix,
+             &children_dir_status](int i) {
+              const string child_path = io::JoinPath(current_dir, children[i]);
+              // In case the child_path doesn't start with the fixed_prefix then
+              // we don't need to explore this path.
+              if (!StringPiece(child_path).starts_with(fixed_prefix)) {
+                children_dir_status[i] = Status(tensorflow::error::CANCELLED,
+                                                "Operation not needed");
+              } else {
+                children_dir_status[i] = IsDirectory(child_path);
+              }
+            });
     for (int i = 0; i < children.size(); ++i) {
       const string child_path = io::JoinPath(current_dir, children[i]);
       // If the IsDirectory call was cancelled we bail.
diff --git a/tensorflow/core/platform/gif.h b/tensorflow/core/platform/gif.h
index 9c72d34ff5..ab095a35c9 100644
--- a/tensorflow/core/platform/gif.h
+++ b/tensorflow/core/platform/gif.h
@@ -20,7 +20,8 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/gif.h"
-#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \
+    defined(PLATFORM_POSIX_ANDROID)
 #include <gif_lib.h>
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 0baeac0984..74863293a3 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -164,8 +164,9 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
   } else {
     hdfs_->hdfsBuilderSetNameNode(builder, nn.c_str());
   }
-  // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is the build in
-  // environment variable of Kerberos, so KERB_TICKET_CACHE_PATH and related code are unnecessary.
+  // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is
+  // the build in environment variable of Kerberos, so KERB_TICKET_CACHE_PATH
+  // and related code are unnecessary.
   char* ticket_cache_path = getenv("KERB_TICKET_CACHE_PATH");
   if (ticket_cache_path != nullptr) {
     hdfs_->hdfsBuilderSetKerbTicketCachePath(builder, ticket_cache_path);
diff --git a/tensorflow/core/platform/jpeg.h b/tensorflow/core/platform/jpeg.h
index edbcbd960a..1b5e633f0a 100644
--- a/tensorflow/core/platform/jpeg.h
+++ b/tensorflow/core/platform/jpeg.h
@@ -20,7 +20,8 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/jpeg.h"
-#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \
+    defined(PLATFORM_POSIX_ANDROID)
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/tensorflow/core/platform/png.h b/tensorflow/core/platform/png.h
index 5b0203c343..dad18d7219 100644
--- a/tensorflow/core/platform/png.h
+++ b/tensorflow/core/platform/png.h
@@ -20,7 +20,8 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/png.h"
-#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \
+    defined(PLATFORM_POSIX_ANDROID)
 #include <png.h>
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
diff --git a/tensorflow/core/platform/posix/error.cc b/tensorflow/core/platform/posix/error.cc
index cda6d7d8f9..2bb9443fb3 100644
--- a/tensorflow/core/platform/posix/error.cc
+++ b/tensorflow/core/platform/posix/error.cc
@@ -73,19 +73,19 @@ error::Code ErrnoToCode(int err_number) {
     case ECHILD:      // No child processes
     case EISCONN:     // Socket is connected
 #if !defined(_WIN32) && !defined(__HAIKU__)
-    case ENOTBLK:     // Block device required
+    case ENOTBLK:  // Block device required
 #endif
-    case ENOTCONN:    // The socket is not connected
-    case EPIPE:       // Broken pipe
+    case ENOTCONN:  // The socket is not connected
+    case EPIPE:     // Broken pipe
 #if !defined(_WIN32)
-    case ESHUTDOWN:   // Cannot send after transport endpoint shutdown
+    case ESHUTDOWN:  // Cannot send after transport endpoint shutdown
 #endif
-    case ETXTBSY:     // Text file busy
+    case ETXTBSY:  // Text file busy
       code = error::FAILED_PRECONDITION;
       break;
-    case ENOSPC:   // No space left on device
+    case ENOSPC:  // No space left on device
 #if !defined(_WIN32)
-    case EDQUOT:   // Disk quota exceeded
+    case EDQUOT:  // Disk quota exceeded
 #endif
     case EMFILE:   // Too many open files
     case EMLINK:   // Too many links
@@ -95,7 +95,7 @@ error::Code ErrnoToCode(int err_number) {
     case ENOMEM:   // Not enough space
     case ENOSR:    // No STREAM resources
 #if !defined(_WIN32) && !defined(__HAIKU__)
-    case EUSERS:   // Too many users
+    case EUSERS:  // Too many users
 #endif
       code = error::RESOURCE_EXHAUSTED;
       break;
@@ -104,17 +104,17 @@ error::Code ErrnoToCode(int err_number) {
     case ERANGE:     // Result too large
       code = error::OUT_OF_RANGE;
       break;
-    case ENOSYS:           // Function not implemented
-    case ENOTSUP:          // Operation not supported
-    case EAFNOSUPPORT:     // Address family not supported
+    case ENOSYS:        // Function not implemented
+    case ENOTSUP:       // Operation not supported
+    case EAFNOSUPPORT:  // Address family not supported
 #if !defined(_WIN32)
-    case EPFNOSUPPORT:     // Protocol family not supported
+    case EPFNOSUPPORT:  // Protocol family not supported
 #endif
     case EPROTONOSUPPORT:  // Protocol not supported
 #if !defined(_WIN32) && !defined(__HAIKU__)
     case ESOCKTNOSUPPORT:  // Socket type not supported
 #endif
-    case EXDEV:            // Improper link
+    case EXDEV:  // Improper link
       code = error::UNIMPLEMENTED;
       break;
     case EAGAIN:        // Resource temporarily unavailable
@@ -123,7 +123,7 @@ error::Code ErrnoToCode(int err_number) {
     case ECONNRESET:    // Connection reset
     case EINTR:         // Interrupted function call
 #if !defined(_WIN32)
-    case EHOSTDOWN:     // Host is down
+    case EHOSTDOWN:  // Host is down
 #endif
     case EHOSTUNREACH:  // Host is unreachable
     case ENETDOWN:      // Network is down
@@ -139,7 +139,7 @@ error::Code ErrnoToCode(int err_number) {
       break;
     case EDEADLK:  // Resource deadlock avoided
 #if !defined(_WIN32)
-    case ESTALE:   // Stale file handle
+    case ESTALE:  // Stale file handle
 #endif
       code = error::ABORTED;
       break;
@@ -158,7 +158,7 @@ error::Code ErrnoToCode(int err_number) {
     case ENOMSG:       // No message of the desired type
     case EPROTO:       // Protocol error
 #if !defined(_WIN32) && !defined(__HAIKU__)
-    case EREMOTE:      // Object is remote
+    case EREMOTE:  // Object is remote
 #endif
       code = error::UNKNOWN;
       break;
diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
index 8604b01c53..ce2069b004 100644
--- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
+++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
@@ -58,8 +58,8 @@ class AndroidArmV7ACpuUtilsHelper : public ICpuUtilsHelper {
   TF_DISALLOW_COPY_AND_ASSIGN(AndroidArmV7ACpuUtilsHelper);
 };
 
-}  // profile_utils
-}  // tensorflow
+}  // namespace profile_utils
+}  // namespace tensorflow
 
 #endif  // defined(__ANDROID__) && (__ANDROID_API__ >= 21) &&
         // (defined(__ARM_ARCH_7A__) || defined(__aarch64__))
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.cc b/tensorflow/core/platform/profile_utils/cpu_utils.cc
index d3362690d7..02de7d1362 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.cc
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.cc
@@ -28,15 +28,17 @@ namespace profile_utils {
 
 static ICpuUtilsHelper* cpu_utils_helper_instance_ = nullptr;
 
-#if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
-   /* static */ uint64 CpuUtils::GetCycleCounterFrequency() {
-     static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl();
-     return cpu_frequency;
+#if (defined(__powerpc__) ||                                             \
+     defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
+    (defined(__s390x__))
+/* static */ uint64 CpuUtils::GetCycleCounterFrequency() {
+  static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl();
+  return cpu_frequency;
 }
 #else
-   /* static */ int64 CpuUtils::GetCycleCounterFrequency() {
-     static const int64 cpu_frequency = GetCycleCounterFrequencyImpl();
-     return cpu_frequency;
+/* static */ int64 CpuUtils::GetCycleCounterFrequency() {
+  static const int64 cpu_frequency = GetCycleCounterFrequencyImpl();
+  return cpu_frequency;
 }
 #endif
 
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h
index 5d215b4804..2da20bb1b8 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.h
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.h
@@ -94,14 +94,16 @@ class CpuUtils {
 #endif
   }
 
-  // Return cycle counter frequency.
-  // As this method caches the cpu frequency internally,
-  // the first call will incur overhead, but not subsequent calls.
-  #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
-     static uint64 GetCycleCounterFrequency();
-  #else
-     static int64 GetCycleCounterFrequency();
-  #endif
+// Return cycle counter frequency.
+// As this method caches the cpu frequency internally,
+// the first call will incur overhead, but not subsequent calls.
+#if (defined(__powerpc__) ||                                             \
+     defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
+    (defined(__s390x__))
+  static uint64 GetCycleCounterFrequency();
+#else
+  static int64 GetCycleCounterFrequency();
+#endif
 
   // Return micro secound per each clock
   // As this method caches the cpu frequency internally,
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
index 5b11b684dd..eb8161fbfd 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
+++ b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
@@ -53,15 +53,17 @@ TEST_F(CpuUtilsTest, CheckGetCurrentClockCycle) {
 }
 
 TEST_F(CpuUtilsTest, CheckCycleCounterFrequency) {
-  #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
-     const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
-     CHECK_GT(cpu_frequency, 0);
-     CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY));
-  #else
-     const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
-     CHECK_GT(cpu_frequency, 0);
-     CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY);
-  #endif
+#if (defined(__powerpc__) ||                                             \
+     defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
+    (defined(__s390x__))
+  const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
+  CHECK_GT(cpu_frequency, 0);
+  CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY));
+#else
+  const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
+  CHECK_GT(cpu_frequency, 0);
+  CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY);
+#endif
   if (DBG) {
     LOG(INFO) << "Cpu frequency = " << cpu_frequency;
   }
diff --git a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
index 51c54d50d1..11b739c009 100644
--- a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
+++ b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
@@ -47,7 +47,7 @@ class ICpuUtilsHelper {
   TF_DISALLOW_COPY_AND_ASSIGN(ICpuUtilsHelper);
 };
 
-}  // profile_utils
-}  // tensorflow
+}  // namespace profile_utils
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_PLATFORM_PROFILEUTILS_I_CPU_UTILS_HELPER_H__
diff --git a/tensorflow/core/platform/protobuf_internal.h b/tensorflow/core/platform/protobuf_internal.h
index 7d6e8f57a6..2f151a5aee 100644
--- a/tensorflow/core/platform/protobuf_internal.h
+++ b/tensorflow/core/platform/protobuf_internal.h
@@ -45,8 +45,8 @@ Status ParseAny(const google::protobuf::Any& any, T* message,
 #ifdef TENSORFLOW_LITE_PROTOS
   if (any.type_url() != strings::StrCat("type.googleapis.com/", type_name)) {
     return errors::FailedPrecondition(
-        "Expected Any type_url for: ", type_name, ". Got: ",
-        string(any.type_url().data(), any.type_url().size()), ".");
+        "Expected Any type_url for: ", type_name,
+        ". Got: ", string(any.type_url().data(), any.type_url().size()), ".");
   }
   if (!message->ParseFromString(any.value())) {
     return errors::FailedPrecondition("Failed to unpack: ",
diff --git a/tensorflow/core/platform/s3/aws_logging.cc b/tensorflow/core/platform/s3/aws_logging.cc
index fbca0acc36..44317f1a3e 100644
--- a/tensorflow/core/platform/s3/aws_logging.cc
+++ b/tensorflow/core/platform/s3/aws_logging.cc
@@ -96,7 +96,7 @@ Aws::Utils::Logging::LogLevel ParseLogLevelFromEnv() {
 
   return log_level;
 }
-}
+}  // namespace
 
 static bool initialized = false;
 static mutex s3_logging_mutex(LINKER_INITIALIZED);
diff --git a/tensorflow/core/platform/setround.cc b/tensorflow/core/platform/setround.cc
index 0c66da09bb..592626bfa1 100644
--- a/tensorflow/core/platform/setround.cc
+++ b/tensorflow/core/platform/setround.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/platform/setround.h"
 
-
 namespace tensorflow {
 namespace port {
 
diff --git a/tensorflow/core/platform/test_benchmark.h b/tensorflow/core/platform/test_benchmark.h
index a6636225cc..327237dba9 100644
--- a/tensorflow/core/platform/test_benchmark.h
+++ b/tensorflow/core/platform/test_benchmark.h
@@ -60,7 +60,7 @@ class Benchmark {
  private:
   string name_;
   int num_args_;
-  std::vector<std::pair<int, int>> args_;
+  std::vector<std::pair<int, int> > args_;
   void (*fn0_)(int) = nullptr;
   void (*fn1_)(int, int) = nullptr;
   void (*fn2_)(int, int, int) = nullptr;
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index 788a4bf4b1..41b2644170 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -24,9 +24,9 @@ limitations under the License.
 #undef LoadLibrary
 #undef ERROR
 
+#include <string>
 #include <thread>
 #include <vector>
-#include <string>
 
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/platform/load_library.h"
@@ -53,8 +53,7 @@ class StdThread : public Thread {
 
 class WindowsEnv : public Env {
  public:
-  WindowsEnv()
-      : GetSystemTimePreciseAsFileTime_(NULL) {
+  WindowsEnv() : GetSystemTimePreciseAsFileTime_(NULL) {
     // GetSystemTimePreciseAsFileTime function is only available in the latest
     // versions of Windows. For that reason, we try to look it up in
     // kernel32.dll at runtime and use an alternative option if the function
@@ -72,8 +71,8 @@ class WindowsEnv : public Env {
   }
 
   bool MatchPath(const string& path, const string& pattern) override {
-      std::wstring ws_path(WindowsFileSystem::Utf8ToWideChar(path));
-      std::wstring ws_pattern(WindowsFileSystem::Utf8ToWideChar(pattern));
+    std::wstring ws_path(WindowsFileSystem::Utf8ToWideChar(path));
+    std::wstring ws_pattern(WindowsFileSystem::Utf8ToWideChar(pattern));
     return PathMatchSpecW(ws_path.c_str(), ws_pattern.c_str()) == TRUE;
   }
 
@@ -122,14 +121,14 @@ class WindowsEnv : public Env {
     SetThreadpoolTimer(timer, &FileDueTime, 0, 0);
   }
 
-  Status LoadLibrary(const char *library_filename, void** handle) override {
+  Status LoadLibrary(const char* library_filename, void** handle) override {
     std::string file_name = library_filename;
     std::replace(file_name.begin(), file_name.end(), '/', '\\');
 
     std::wstring ws_file_name(WindowsFileSystem::Utf8ToWideChar(file_name));
 
     HMODULE hModule = LoadLibraryExW(ws_file_name.c_str(), NULL,
-      LOAD_WITH_ALTERED_SEARCH_PATH);
+                                     LOAD_WITH_ALTERED_SEARCH_PATH);
     if (!hModule) {
       return errors::NotFound(file_name + " not found");
     }
@@ -138,31 +137,30 @@ class WindowsEnv : public Env {
   }
 
   Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
-    void** symbol) override {
+                              void** symbol) override {
     FARPROC found_symbol;
 
     found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
     if (found_symbol == NULL) {
       return errors::NotFound(std::string(symbol_name) + " not found");
     }
-    *symbol = (void **)found_symbol;
+    *symbol = (void**)found_symbol;
     return Status::OK();
   }
 
-  string FormatLibraryFileName(const string& name, const string& version)
-    override {
+  string FormatLibraryFileName(const string& name,
+                               const string& version) override {
     string filename;
     if (version.size() == 0) {
       filename = name + ".dll";
-    }
-    else {
+    } else {
       filename = name + version + ".dll";
     }
     return filename;
   }
 
  private:
-  typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
+  typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
   FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
 };
 
diff --git a/tensorflow/core/platform/windows/error.cc b/tensorflow/core/platform/windows/error.cc
index 39e941a383..291fc5003f 100644
--- a/tensorflow/core/platform/windows/error.cc
+++ b/tensorflow/core/platform/windows/error.cc
@@ -21,7 +21,7 @@ namespace internal {
 std::string GetWindowsErrorMessage(DWORD err) {
   LPSTR buffer = NULL;
   DWORD flags = FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
-      FORMAT_MESSAGE_IGNORE_INSERTS;
+                FORMAT_MESSAGE_IGNORE_INSERTS;
   FormatMessageA(flags, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                  reinterpret_cast<LPSTR>(&buffer), 0, NULL);
   std::string message = buffer;
diff --git a/tensorflow/core/platform/windows/error.h b/tensorflow/core/platform/windows/error.h
index 026e0d5aa9..ba643a0fa8 100644
--- a/tensorflow/core/platform/windows/error.h
+++ b/tensorflow/core/platform/windows/error.h
@@ -24,9 +24,7 @@ namespace tensorflow {
 namespace internal {
 
 std::string GetWindowsErrorMessage(DWORD err);
-
-}
 }
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_
-
diff --git a/tensorflow/core/platform/windows/integral_types.h b/tensorflow/core/platform/windows/integral_types.h
index 4970b8ca6a..46338a536d 100644
--- a/tensorflow/core/platform/windows/integral_types.h
+++ b/tensorflow/core/platform/windows/integral_types.h
@@ -1,18 +1,18 @@
- /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==============================================================================*/
- 
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
 #ifndef TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
 #define TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
 
diff --git a/tensorflow/core/platform/windows/net.cc b/tensorflow/core/platform/windows/net.cc
index 46eb072d42..2ab558ab95 100644
--- a/tensorflow/core/platform/windows/net.cc
+++ b/tensorflow/core/platform/windows/net.cc
@@ -26,7 +26,7 @@ limitations under the License.
 
 #undef ERROR
 
-#pragma comment(lib,"Ws2_32.lib")
+#pragma comment(lib, "Ws2_32.lib")
 
 namespace tensorflow {
 namespace internal {
@@ -44,8 +44,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   CHECK_GE(*port, 0);
   CHECK_LE(*port, 65535);
   if (sock == INVALID_SOCKET) {
-    LOG(ERROR) << "socket() failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(ERROR) << "socket() failed: "
+               << GetWindowsErrorMessage(WSAGetLastError());
     return false;
   }
 
@@ -54,8 +54,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   int result = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
                           reinterpret_cast<const char*>(&one), sizeof(one));
   if (result == SOCKET_ERROR) {
-    LOG(ERROR) << "setsockopt() failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(ERROR) << "setsockopt() failed: "
+               << GetWindowsErrorMessage(WSAGetLastError());
     closesocket(sock);
     return false;
   }
@@ -66,8 +66,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   addr.sin_port = htons((uint16_t)*port);
   result = bind(sock, (struct sockaddr*)&addr, sizeof(addr));
   if (result == SOCKET_ERROR) {
-    LOG(WARNING) << "bind(port=" << *port << ") failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(WARNING) << "bind(port=" << *port
+                 << ") failed: " << GetWindowsErrorMessage(WSAGetLastError());
     closesocket(sock);
     return false;
   }
@@ -75,8 +75,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   // Get the bound port number.
   result = getsockname(sock, (struct sockaddr*)&addr, &addr_len);
   if (result == SOCKET_ERROR) {
-    LOG(WARNING) << "getsockname() failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(WARNING) << "getsockname() failed: "
+                 << GetWindowsErrorMessage(WSAGetLastError());
     closesocket(sock);
     return false;
   }
diff --git a/tensorflow/core/platform/windows/subprocess.h b/tensorflow/core/platform/windows/subprocess.h
index b65313363e..66ec44885d 100644
--- a/tensorflow/core/platform/windows/subprocess.h
+++ b/tensorflow/core/platform/windows/subprocess.h
@@ -19,8 +19,7 @@ limitations under the License.
 namespace tensorflow {
 
 // SubProcess is not yet implemented for Windows.
-class SubProcess {
-};
+class SubProcess {};
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/windows/test.cc b/tensorflow/core/platform/windows/test.cc
index 0ffd02ff14..584acad91b 100644
--- a/tensorflow/core/platform/windows/test.cc
+++ b/tensorflow/core/platform/windows/test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/net.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/net.h"
 
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 604348fe03..b6b3722caa 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <Shlwapi.h>
 #include <Windows.h>
 #include <direct.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <io.h>
-#include <Shlwapi.h>
 #undef StrCat
 #include <stdio.h>
 #include <sys/stat.h>
@@ -75,16 +75,16 @@ SSIZE_T pread(HANDLE hfile, char* src, size_t num_bytes, uint64_t offset) {
   if (TRUE == read_result) {
     result = bytes_read;
   } else if ((FALSE == read_result) &&
-      ((last_error = GetLastError()) != ERROR_IO_PENDING)) {
+             ((last_error = GetLastError()) != ERROR_IO_PENDING)) {
     result = (last_error == ERROR_HANDLE_EOF) ? 0 : -1;
   } else {
-    if (ERROR_IO_PENDING == last_error) { // Otherwise bytes_read already has the result.
-      BOOL overlapped_result = ::GetOverlappedResult(hfile, &overlapped,
-                                                     &bytes_read, TRUE);
+    if (ERROR_IO_PENDING ==
+        last_error) {  // Otherwise bytes_read already has the result.
+      BOOL overlapped_result =
+          ::GetOverlappedResult(hfile, &overlapped, &bytes_read, TRUE);
       if (FALSE == overlapped_result) {
         result = (::GetLastError() == ERROR_HANDLE_EOF) ? 0 : -1;
-      }
-      else {
+      } else {
         result = bytes_read;
       }
     }
@@ -151,11 +151,11 @@ class WindowsWritableFile : public WritableFile {
   Status Append(const StringPiece& data) override {
     DWORD bytes_written = 0;
     DWORD data_size = static_cast<DWORD>(data.size());
-    BOOL write_result = ::WriteFile(hfile_, data.data(), data_size,
-                                    &bytes_written, NULL);
+    BOOL write_result =
+        ::WriteFile(hfile_, data.data(), data_size, &bytes_written, NULL);
     if (FALSE == write_result) {
-      return IOErrorFromWindowsError(
-          "Failed to WriteFile: " + filename_, ::GetLastError());
+      return IOErrorFromWindowsError("Failed to WriteFile: " + filename_,
+                                     ::GetLastError());
     }
 
     assert(size_t(bytes_written) == data.size());
@@ -171,8 +171,8 @@ class WindowsWritableFile : public WritableFile {
     }
 
     if (FALSE == ::CloseHandle(hfile_)) {
-      return IOErrorFromWindowsError(
-          "CloseHandle failed for: " + filename_, ::GetLastError());
+      return IOErrorFromWindowsError("CloseHandle failed for: " + filename_,
+                                     ::GetLastError());
     }
 
     hfile_ = INVALID_HANDLE_VALUE;
@@ -187,9 +187,7 @@ class WindowsWritableFile : public WritableFile {
     return Status::OK();
   }
 
-  Status Sync() override {
-    return Flush();
-  }
+  Status Sync() override { return Flush(); }
 };
 
 class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
@@ -204,7 +202,10 @@ class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
  public:
   WinReadOnlyMemoryRegion(const std::string& filename, HANDLE hfile,
                           HANDLE hmap, const void* address, uint64 length)
-      : filename_(filename), hfile_(hfile), hmap_(hmap), address_(address),
+      : filename_(filename),
+        hfile_(hfile),
+        hmap_(hmap),
+        address_(address),
         length_(length) {}
 
   ~WinReadOnlyMemoryRegion() {
@@ -238,9 +239,9 @@ Status WindowsFileSystem::NewRandomAccessFile(
   // almost all tests would work with a possible exception of fault_injection.
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
 
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ,
-                               share_mode, NULL, OPEN_EXISTING, file_flags,
-                               NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, share_mode, NULL,
+                    OPEN_EXISTING, file_flags, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     string context = "NewRandomAccessFile failed to Create/Open: " + fname;
@@ -258,9 +259,9 @@ Status WindowsFileSystem::NewWritableFile(
   result->reset();
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE,
-                               share_mode, NULL, CREATE_ALWAYS,
-                               FILE_ATTRIBUTE_NORMAL, NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, share_mode,
+                    NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     string context = "Failed to create a NewWriteableFile: " + fname;
@@ -278,9 +279,9 @@ Status WindowsFileSystem::NewAppendableFile(
   result->reset();
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE,
-                               share_mode, NULL, OPEN_ALWAYS,
-                               FILE_ATTRIBUTE_NORMAL, NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, share_mode,
+                    NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     string context = "Failed to create a NewAppendableFile: " + fname;
@@ -316,9 +317,9 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
   file_flags |= FILE_FLAG_OVERLAPPED;
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ,
-                               share_mode, NULL, OPEN_EXISTING, file_flags,
-                               NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, share_mode, NULL,
+                    OPEN_EXISTING, file_flags, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     return IOErrorFromWindowsError(
@@ -345,28 +346,32 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
                                        NULL);  // Mapping name
 
     if (!hmap) {
-      string context = "Failed to create file mapping for "
-                       "NewReadOnlyMemoryRegionFromFile: " + fname;
+      string context =
+          "Failed to create file mapping for "
+          "NewReadOnlyMemoryRegionFromFile: " +
+          fname;
       return IOErrorFromWindowsError(context, ::GetLastError());
     }
 
     UniqueCloseHandlePtr map_guard(hmap, CloseHandleFunc);
 
-    const void* mapped_region = ::MapViewOfFileEx(
-        hmap, FILE_MAP_READ,
-        0,  // High DWORD of access start
-        0,  // Low DWORD
-        file_size,
-        NULL);  // Let the OS choose the mapping
+    const void* mapped_region =
+        ::MapViewOfFileEx(hmap, FILE_MAP_READ,
+                          0,  // High DWORD of access start
+                          0,  // Low DWORD
+                          file_size,
+                          NULL);  // Let the OS choose the mapping
 
     if (!mapped_region) {
-      string context = "Failed to MapViewOfFile for "
-                       "NewReadOnlyMemoryRegionFromFile: " + fname;
+      string context =
+          "Failed to MapViewOfFile for "
+          "NewReadOnlyMemoryRegionFromFile: " +
+          fname;
       return IOErrorFromWindowsError(context, ::GetLastError());
     }
 
-    result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap,
-                                              mapped_region, file_size));
+    result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap, mapped_region,
+                                              file_size));
 
     map_guard.release();
     file_guard.release();
@@ -404,8 +409,8 @@ Status WindowsFileSystem::GetChildren(const string& dir,
   }
 
   do {
-	string file_name = WideCharToUtf8(find_data.cFileName);
-	const StringPiece basename = file_name;
+    string file_name = WideCharToUtf8(find_data.cFileName);
+    const StringPiece basename = file_name;
     if (basename != "." && basename != "..") {
       result->push_back(file_name);
     }
@@ -457,8 +462,7 @@ Status WindowsFileSystem::GetFileSize(const string& fname, uint64* size) {
     file_size.HighPart = attrs.nFileSizeHigh;
     file_size.LowPart = attrs.nFileSizeLow;
     *size = file_size.QuadPart;
-  }
-  else {
+  } else {
     string context = "Can not get size for: " + fname;
     result = IOErrorFromWindowsError(context, ::GetLastError());
   }
@@ -472,7 +476,7 @@ Status WindowsFileSystem::RenameFile(const string& src, const string& target) {
   std::wstring ws_translated_src = Utf8ToWideChar(TranslateName(src));
   std::wstring ws_translated_target = Utf8ToWideChar(TranslateName(target));
   if (!::MoveFileExW(ws_translated_src.c_str(), ws_translated_target.c_str(),
-      MOVEFILE_REPLACE_EXISTING)) {
+                     MOVEFILE_REPLACE_EXISTING)) {
     string context(strings::StrCat("Failed to rename: ", src, " to: ", target));
     result = IOErrorFromWindowsError(context, ::GetLastError());
   }
diff --git a/tensorflow/core/platform/windows/windows_file_system.h b/tensorflow/core/platform/windows/windows_file_system.h
index 8dcc153037..ba0302f0fd 100644
--- a/tensorflow/core/platform/windows/windows_file_system.h
+++ b/tensorflow/core/platform/windows/windows_file_system.h
@@ -63,33 +63,35 @@ class WindowsFileSystem : public FileSystem {
 
   Status RenameFile(const string& src, const string& target) override;
 
-  string TranslateName(const string& name) const override {
-    return name;
-  }
+  string TranslateName(const string& name) const override { return name; }
 
   static std::wstring Utf8ToWideChar(const string& utf8str) {
-      int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), NULL, 0);
-      std::wstring ws_translated_str(size_required, 0);
-      MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), &ws_translated_str[0], size_required);
-      return ws_translated_str;
+    int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(),
+                                            (int)utf8str.size(), NULL, 0);
+    std::wstring ws_translated_str(size_required, 0);
+    MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(),
+                        &ws_translated_str[0], size_required);
+    return ws_translated_str;
   }
 
-  static string WideCharToUtf8(const std::wstring &wstr) {
-      if (wstr.empty()) return std::string();
-      int size_required = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL);
-      string utf8_translated_str(size_required, 0);
-      WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), &utf8_translated_str[0], size_required, NULL, NULL);
-      return utf8_translated_str;
+  static string WideCharToUtf8(const std::wstring& wstr) {
+    if (wstr.empty()) return std::string();
+    int size_required = WideCharToMultiByte(
+        CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL);
+    string utf8_translated_str(size_required, 0);
+    WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(),
+                        &utf8_translated_str[0], size_required, NULL, NULL);
+    return utf8_translated_str;
   }
 };
 
 class LocalWinFileSystem : public WindowsFileSystem {
-public:
-    string TranslateName(const string& name) const override {
-      StringPiece scheme, host, path;
-      io::ParseURI(name, &scheme, &host, &path);
-      return path.ToString();
-    }
+ public:
+  string TranslateName(const string& name) const override {
+    StringPiece scheme, host, path;
+    io::ParseURI(name, &scheme, &host, &path);
+    return path.ToString();
+  }
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index d05143aff9..e968b9c97e 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -53,10 +53,13 @@ class TFProfAdvisorTest : public ::testing::Test {
     NodeExecStats node_stat;
     node_stat.set_all_start_micros(start_miros);
     node_stat.set_op_end_rel_micros(end_rel_micros);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0", node_stat);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:all",
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0",
                       node_stat);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:0",
+    node->AddStepStat(step,
+                      "/job:localhost/replica:0/task:0/device:GPU:0:stream:all",
+                      node_stat);
+    node->AddStepStat(step,
+                      "/job:localhost/replica:0/task:0/device:GPU:0:stream:0",
                       node_stat);
     return node;
   }
diff --git a/tensorflow/core/profiler/internal/tfprof_op.cc b/tensorflow/core/profiler/internal/tfprof_op.cc
index 5a8429d489..3dce1d85db 100644
--- a/tensorflow/core/profiler/internal/tfprof_op.cc
+++ b/tensorflow/core/profiler/internal/tfprof_op.cc
@@ -113,8 +113,9 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
     root_->formatted_str = FormatNode(root_.get(), root_.get(), opts);
   }
   if (timeline) {
-    fprintf(stderr, "op view doesn't support timeline yet. "
-                    "Consider graph/scope/code view.\n");
+    fprintf(stderr,
+            "op view doesn't support timeline yet. "
+            "Consider graph/scope/code view.\n");
     return root_.get();
   }
   if (cnodes_map_.empty()) {
@@ -265,9 +266,9 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const {
     double pct = 0.0;
     if (node->proto().total_parameters() > 0) {
       accu_pct = 100.0 * node->proto().total_parameters() /
-          root->proto().total_parameters();
-      pct = 100.0 * node->proto().parameters() /
-          root->proto().total_parameters();
+                 root->proto().total_parameters();
+      pct =
+          100.0 * node->proto().parameters() / root->proto().total_parameters();
     }
     attrs.push_back(strings::Printf(
         "%30s",
@@ -282,9 +283,8 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const {
     double pct = 0.0;
     if (node->proto().total_float_ops() > 0) {
       accu_pct = 100.0 * node->proto().total_float_ops() /
-          root->proto().total_float_ops();
-      pct = 100.0 * node->proto().float_ops() /
-          root->proto().total_float_ops();
+                 root->proto().total_float_ops();
+      pct = 100.0 * node->proto().float_ops() / root->proto().total_float_ops();
     }
 
     attrs.push_back(strings::Printf(
diff --git a/tensorflow/core/profiler/internal/tfprof_op.h b/tensorflow/core/profiler/internal/tfprof_op.h
index fe1c3b2ae8..aa22182d36 100644
--- a/tensorflow/core/profiler/internal/tfprof_op.h
+++ b/tensorflow/core/profiler/internal/tfprof_op.h
@@ -41,8 +41,7 @@ namespace tfprof {
 // to input ops.
 class TFOp : public TFMultiShow {
  public:
-  explicit TFOp()
-      : TFMultiShow() {}
+  explicit TFOp() : TFMultiShow() {}
   ~TFOp() override {}
 
   void AddNode(TFGraphNode* node) override;
@@ -51,7 +50,7 @@ class TFOp : public TFMultiShow {
 
  private:
   const ShowMultiNode* ShowInternal(const Options& opts,
-                                   Timeline* timeline) override;
+                                    Timeline* timeline) override;
 
   int64 SearchRoot(const std::vector<OpNode*> nodes,
                    const std::vector<string>& regexes);
diff --git a/tensorflow/core/profiler/internal/tfprof_show.h b/tensorflow/core/profiler/internal/tfprof_show.h
index 4d6de06070..81b021549a 100644
--- a/tensorflow/core/profiler/internal/tfprof_show.h
+++ b/tensorflow/core/profiler/internal/tfprof_show.h
@@ -78,40 +78,43 @@ class TFShow {
       return nodes;
     }
     std::vector<T*> sorted_nodes = nodes;
-    std::sort(sorted_nodes.begin(), sorted_nodes.end(), [&opts](const T* n1,
-                                                                const T* n2) {
-      if (n1->name() == kTFProfRoot) return true;
-      if (n2->name() == kTFProfRoot) return false;
-      bool name_cmp = n1->name() < n2->name();
-      if (opts.order_by == kOrderBy[0]) {
-        return name_cmp;
-      } else if (opts.order_by == kOrderBy[1]) {
-        return n1->proto().total_requested_bytes() >
-               n2->proto().total_requested_bytes();
-      } else if (opts.order_by == kOrderBy[2]) {
-        return n1->proto().total_peak_bytes() > n2->proto().total_peak_bytes();
-      } else if (opts.order_by == kOrderBy[3]) {
-        return n1->proto().total_residual_bytes() >
-               n2->proto().total_residual_bytes();
-      } else if (opts.order_by == kOrderBy[4]) {
-        return n1->proto().total_output_bytes() >
-               n2->proto().total_output_bytes();
-      } else if (opts.order_by == kOrderBy[5]) {
-        return n1->proto().total_exec_micros() >
-               n2->proto().total_exec_micros();
-      } else if (opts.order_by == kOrderBy[6]) {
-        return n1->proto().total_accelerator_exec_micros() >
-               n2->proto().total_accelerator_exec_micros();
-      } else if (opts.order_by == kOrderBy[7]) {
-        return n1->proto().total_cpu_exec_micros() >
-               n2->proto().total_cpu_exec_micros();
-      } else if (opts.order_by == kOrderBy[8]) {
-        return n1->proto().total_parameters() > n2->proto().total_parameters();
-      } else if (opts.order_by == kOrderBy[9]) {
-        return n1->proto().total_float_ops() > n2->proto().total_float_ops();
-      }
-      return name_cmp;
-    });
+    std::sort(sorted_nodes.begin(), sorted_nodes.end(),
+              [&opts](const T* n1, const T* n2) {
+                if (n1->name() == kTFProfRoot) return true;
+                if (n2->name() == kTFProfRoot) return false;
+                bool name_cmp = n1->name() < n2->name();
+                if (opts.order_by == kOrderBy[0]) {
+                  return name_cmp;
+                } else if (opts.order_by == kOrderBy[1]) {
+                  return n1->proto().total_requested_bytes() >
+                         n2->proto().total_requested_bytes();
+                } else if (opts.order_by == kOrderBy[2]) {
+                  return n1->proto().total_peak_bytes() >
+                         n2->proto().total_peak_bytes();
+                } else if (opts.order_by == kOrderBy[3]) {
+                  return n1->proto().total_residual_bytes() >
+                         n2->proto().total_residual_bytes();
+                } else if (opts.order_by == kOrderBy[4]) {
+                  return n1->proto().total_output_bytes() >
+                         n2->proto().total_output_bytes();
+                } else if (opts.order_by == kOrderBy[5]) {
+                  return n1->proto().total_exec_micros() >
+                         n2->proto().total_exec_micros();
+                } else if (opts.order_by == kOrderBy[6]) {
+                  return n1->proto().total_accelerator_exec_micros() >
+                         n2->proto().total_accelerator_exec_micros();
+                } else if (opts.order_by == kOrderBy[7]) {
+                  return n1->proto().total_cpu_exec_micros() >
+                         n2->proto().total_cpu_exec_micros();
+                } else if (opts.order_by == kOrderBy[8]) {
+                  return n1->proto().total_parameters() >
+                         n2->proto().total_parameters();
+                } else if (opts.order_by == kOrderBy[9]) {
+                  return n1->proto().total_float_ops() >
+                         n2->proto().total_float_ops();
+                }
+                return name_cmp;
+              });
     return sorted_nodes;
   }
 
diff --git a/tensorflow/core/profiler/internal/tfprof_show_multi.h b/tensorflow/core/profiler/internal/tfprof_show_multi.h
index 2a2208d8e7..711d35f975 100644
--- a/tensorflow/core/profiler/internal/tfprof_show_multi.h
+++ b/tensorflow/core/profiler/internal/tfprof_show_multi.h
@@ -50,7 +50,7 @@ class TFMultiShow {
 
  protected:
   virtual const ShowMultiNode* ShowInternal(const Options& opts,
-                                           Timeline* timeline) = 0;
+                                            Timeline* timeline) = 0;
 
   bool LookUpCheckPoint(const string& name,
                         std::unique_ptr<TFProfTensor>* tensor);
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.h b/tensorflow/core/profiler/internal/tfprof_timeline.h
index 651ad3f0c1..baf3fb2bed 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.h
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.h
@@ -20,8 +20,8 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/profiler/internal/tfprof_node_show.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
diff --git a/tensorflow/core/util/bcast.cc b/tensorflow/core/util/bcast.cc
index 1eab7e3d02..3a5f1f83af 100644
--- a/tensorflow/core/util/bcast.cc
+++ b/tensorflow/core/util/bcast.cc
@@ -69,9 +69,9 @@ BCast::BCast(const Vec& sx, const Vec& sy, const bool fewer_dims_optimization) {
       State curr = UNKNOWN;
       const int64 x_i = x[i];  // i-th dimension of x.
       const int64 y_i = y[i];  // i-th dimension of y.
-      int64 o_i;   // i-th dimension of the output.
-      int64 bx_i;  // i-th broadcast for x.
-      int64 by_i;  // i-th broadcast for y.
+      int64 o_i;               // i-th dimension of the output.
+      int64 bx_i;              // i-th broadcast for x.
+      int64 by_i;              // i-th broadcast for y.
       // Invariant:
       //   o_i = x_i * bx_i = y_i * by_i
       if (x_i == y_i) {
diff --git a/tensorflow/core/util/ctc/ctc_loss_calculator.h b/tensorflow/core/util/ctc/ctc_loss_calculator.h
index be00895b0d..dd1163310b 100644
--- a/tensorflow/core/util/ctc/ctc_loss_calculator.h
+++ b/tensorflow/core/util/ctc/ctc_loss_calculator.h
@@ -130,13 +130,13 @@ Status CTCLossCalculator::CalculateLoss(
   for (int t = 1; t < num_time_steps; ++t) {
     if (inputs[t].rows() != batch_size) {
       return errors::InvalidArgument("Expected batch size at t: ", t,
-                                     " to be: ", batch_size, " but got: ",
-                                     inputs[t].rows());
+                                     " to be: ", batch_size,
+                                     " but got: ", inputs[t].rows());
     }
     if (inputs[t].cols() != num_classes) {
       return errors::InvalidArgument("Expected class count at t: ", t,
-                                     " to be: ", num_classes, " but got: ",
-                                     inputs[t].cols());
+                                     " to be: ", num_classes,
+                                     " but got: ", inputs[t].cols());
     }
   }
 
@@ -282,8 +282,8 @@ Status CTCLossCalculator::PopulateLPrimes(
     LabelSequences* l_primes) const {
   // labels is a Label array of size batch_size
   if (labels.size() != batch_size) {
-    return errors::InvalidArgument("labels.size() != batch_size: ",
-                                   labels.size(), " vs. ", batch_size);
+    return errors::InvalidArgument(
+        "labels.size() != batch_size: ", labels.size(), " vs. ", batch_size);
   }
 
   *max_u_prime = 0;  // keep track of longest l' modified label sequence.
@@ -325,12 +325,13 @@ Status CTCLossCalculator::PopulateLPrimes(
     for (int l_i : l) {
       if (l_i < 0) {
         return errors::InvalidArgument(
-            "All labels must be nonnegative integers, batch: ", b, " labels: ",
-            str_util::Join(l, ","));
+            "All labels must be nonnegative integers, batch: ", b,
+            " labels: ", str_util::Join(l, ","));
       } else if (l_i >= num_classes) {
         return errors::InvalidArgument(
-            "No label may be greater than num_classes. ", "num_classes: ",
-            num_classes, ", batch: ", b, " labels: ", str_util::Join(l, ","));
+            "No label may be greater than num_classes. ",
+            "num_classes: ", num_classes, ", batch: ", b,
+            " labels: ", str_util::Join(l, ","));
       }
     }
     if (!ignore_longer_outputs_than_inputs) {
diff --git a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
index bd4c356ea0..732ed33ede 100644
--- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
@@ -149,27 +149,27 @@ class CudaLaunchConfigTest : public ::testing::Test {
 TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
   CudaLaunchConfig cfg;
 
-  // test valid inputs
-  #define TEST_LAUNCH_PARAMETER(work_element_count)                             \
-    cfg = GetCudaLaunchConfig(bufsize, d);                                      \
-    SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>     \
-                                                                (cfg, outbuf);  \
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCudaLaunchConfig(work_element_count, d);                           \
-    Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0));\
-                                                                                \
-    cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                 \
-    SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>     \
-                                                                (cfg, outbuf);  \
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);            \
-    Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
+// test valid inputs
+#define TEST_LAUNCH_PARAMETER(work_element_count)                              \
+  cfg = GetCudaLaunchConfig(bufsize, d);                                       \
+  SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(     \
+      cfg, outbuf);                                                            \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCudaLaunchConfig(work_element_count, d);                            \
+  Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
+                                                                               \
+  cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                  \
+  SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(     \
+      cfg, outbuf);                                                            \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);             \
+  Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128);
   TEST_LAUNCH_PARAMETER(129);
@@ -181,7 +181,7 @@ TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
   TEST_LAUNCH_PARAMETER(8192);
   TEST_LAUNCH_PARAMETER(123456);
   TEST_LAUNCH_PARAMETER(1 << 30);
-  #undef TEST_LAUNCH_PARAMETER
+#undef TEST_LAUNCH_PARAMETER
 }
 
 bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
@@ -200,27 +200,27 @@ TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
   Cuda2DLaunchConfig cfg;
   CudaLaunchConfig cfg1d;
 
-  // test valid inputs
-  #define TEST_LAUNCH_PARAMETER(dimx, dimy)                                     \
-    cfg1d = GetCudaLaunchConfig(bufsize, d);                                    \
-    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
-                                                                (cfg1d, outbuf);\
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                 \
-    Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0));       \
-                                                                                \
-    cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
-    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
-                                                                (cfg1d, outbuf);\
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                  \
-    Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
+// test valid inputs
+#define TEST_LAUNCH_PARAMETER(dimx, dimy)                                      \
+  cfg1d = GetCudaLaunchConfig(bufsize, d);                                     \
+  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
+      cfg1d, outbuf);                                                          \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                  \
+  Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0));         \
+                                                                               \
+  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                \
+  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
+      cfg1d, outbuf);                                                          \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                   \
+  Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128, 128);
   TEST_LAUNCH_PARAMETER(129, 64);
@@ -233,24 +233,24 @@ TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
   TEST_LAUNCH_PARAMETER(123456, 12);
   TEST_LAUNCH_PARAMETER(1, 1 << 30);
   TEST_LAUNCH_PARAMETER(1 << 30, 1);
-  #undef TEST_LAUNCH_PARAMETER
+#undef TEST_LAUNCH_PARAMETER
 }
 
 TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
   Cuda3DLaunchConfig cfg;
   CudaLaunchConfig cfg1d;
 
-  // test valid inputs
-  #define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                               \
-    cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
-    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
-                                                                (cfg1d, outbuf);\
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);            \
-    Count3D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(dimx * dimy * dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
+// test valid inputs
+#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                                \
+  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                \
+  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
+      cfg1d, outbuf);                                                          \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);             \
+  Count3D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128, 128, 128);
   TEST_LAUNCH_PARAMETER(129, 64, 1024);
@@ -264,7 +264,7 @@ TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
   TEST_LAUNCH_PARAMETER(1, 1, 1 << 30);
   TEST_LAUNCH_PARAMETER(1, 1 << 30, 1);
   TEST_LAUNCH_PARAMETER(1 << 30, 1, 1);
-  #undef TEST_LAUNCH_PARAMETER
+#undef TEST_LAUNCH_PARAMETER
 }
 
 TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
diff --git a/tensorflow/core/util/example_proto_helper.cc b/tensorflow/core/util/example_proto_helper.cc
index 41f56d2daa..e156a3bc8f 100644
--- a/tensorflow/core/util/example_proto_helper.cc
+++ b/tensorflow/core/util/example_proto_helper.cc
@@ -247,8 +247,9 @@ Status SingleExampleProtoToTensors(
       bool types_match;
       TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match));
       if (!types_match) {
-        return errors::InvalidArgument("Name: ", example_name, ", Feature: ",
-                                       key, ".  Data types don't match. ",
+        return errors::InvalidArgument("Name: ", example_name,
+                                       ", Feature: ", key,
+                                       ".  Data types don't match. ",
                                        "Expected type: ", DataTypeString(dtype),
                                        "  Feature is: ", ProtoDebugString(f));
       }
@@ -278,8 +279,9 @@ Status SingleExampleProtoToTensors(
       bool types_match;
       TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match));
       if (!types_match) {
-        return errors::InvalidArgument("Name: ", example_name, ", Feature: ",
-                                       key, ".  Data types don't match. ",
+        return errors::InvalidArgument("Name: ", example_name,
+                                       ", Feature: ", key,
+                                       ".  Data types don't match. ",
                                        "Expected type: ", DataTypeString(dtype),
                                        "  Feature is: ", ProtoDebugString(f));
       }
diff --git a/tensorflow/core/util/memmapped_file_system_test.cc b/tensorflow/core/util/memmapped_file_system_test.cc
index 616eb5dac3..504d2d353f 100644
--- a/tensorflow/core/util/memmapped_file_system_test.cc
+++ b/tensorflow/core/util/memmapped_file_system_test.cc
@@ -144,8 +144,8 @@ TEST(MemmappedFileSystemTest, ProxyToDefault) {
   TF_ASSERT_OK(memmapped_env.NewAppendableFile(filename, &writable_file_temp));
   // Making sure to clean up after the test finishes.
   const auto adh = [&memmapped_env, &filename](WritableFile* f) {
-      delete f;
-      TF_CHECK_OK(memmapped_env.DeleteFile(filename));
+    delete f;
+    TF_CHECK_OK(memmapped_env.DeleteFile(filename));
   };
   std::unique_ptr<WritableFile, decltype(adh)> writable_file(
       writable_file_temp.release(), adh);
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 2caf5fc56d..864e7e39c2 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -210,31 +210,32 @@ class MklShape {
     CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS);
   }
 
-// The following methods are used for serializing and de-serializing the
-// contents of the mklshape object.
-// The data is serialized in this order
-// isMklTensor_
-// dimension_
-// sizes_
-// strides_
-// mklLayout_
-// tfLayout_
-// tf_to_mkl_dim_map_
+  // The following methods are used for serializing and de-serializing the
+  // contents of the mklshape object.
+  // The data is serialized in this order
+  // isMklTensor_
+  // dimension_
+  // sizes_
+  // strides_
+  // mklLayout_
+  // tfLayout_
+  // tf_to_mkl_dim_map_
 
 #define SIZE_OF_MKL_DNN_BUF \
   (dnnLayoutSerializationBufferSize_F32())  // Size of buffer needed to
                                             // serialize dnn_layout pointer
 
-// Size of buffer to hold the serialized object, the size is computed as follows
-// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + sizeof(strides_)
-// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
-// + sizeof(tf_to_mkl_dim_map_)
+  // Size of buffer to hold the serialized object, the size is computed as
+  // follows sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) +
+  // sizeof(strides_)
+  // + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
+  // + sizeof(tf_to_mkl_dim_map_)
 
 #define SIZE_OF_MKL_SERIAL_DATA(dims) \
   (2 * sizeof(size_t) + 3 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF)
 
-// First we need to define some macro for offsets into the serial buffer where
-// different elements of Mklshape is written/read from
+  // First we need to define some macro for offsets into the serial buffer where
+  // different elements of Mklshape is written/read from
 
 #define IS_MKL_TENSOR_OFFSET 0
 // Location from start of buffer where isMklTensor_ is serialized
@@ -388,7 +389,7 @@ class MklDnnShape {
 
   /// Equality function for MklDnnShape objects
   /// @return true if both are equal; false otherwise.
-  inline bool operator == (const MklDnnShape& input_shape) const {
+  inline bool operator==(const MklDnnShape& input_shape) const {
     if (this->IsMklTensor() != input_shape.IsMklTensor()) {
       return false;
     }
@@ -406,7 +407,7 @@ class MklDnnShape {
 
   /// Equality operator for MklDnnShape and TFShape.
   /// Returns: true if TF shapes for both are the same, false otherwise
-  inline bool operator == (const TensorShape& input_shape) const {
+  inline bool operator==(const TensorShape& input_shape) const {
     if (!this->IsMklTensor()) {
       return false;
     }
@@ -425,7 +426,7 @@ class MklDnnShape {
   inline size_t GetDimension(char dimension) const {
     int index = GetMklDnnTensorDimIndex(dimension);
     CHECK(index >= 0 && index < this->GetDimension())
-      << "Invalid index from the dimension: " << index << ", " << dimension;
+        << "Invalid index from the dimension: " << index << ", " << dimension;
     return this->DimSize(index);
   }
 
@@ -705,8 +706,8 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
   Tensor output_tensor;
   TensorShape output_shape;
 
-  TF_CHECK_OK(Status(error::Code::UNIMPLEMENTED,
-                     "Unimplemented conversion function"));
+  TF_CHECK_OK(
+      Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function"));
 
   return output_tensor;
 }
@@ -748,7 +749,6 @@ inline void GetMklInputList(OpKernelContext* ctext, StringPiece name,
   ctext->input_list(name, input_tensors);
 }
 
-
 #ifndef INTEL_MKL_DNN
 
 inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
@@ -973,8 +973,8 @@ inline int64 GetMklTensorDim(const MklShape& mkl_shape, char dimension) {
   return mkl_shape.dim_size(index);
 }
 
-inline void CopyMklTensorInToOut(OpKernelContext* context,
-                                 int idx_in, int idx_out) {
+inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in,
+                                 int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -995,8 +995,8 @@ inline void CopyMklTensorInToOut(OpKernelContext* context,
 }
 
 #ifndef INTEL_MKL_DNN
-inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
-                                         int idx_in, int idx_out,
+inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in,
+                                         int idx_out,
                                          const TensorShape& shape) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
@@ -1013,8 +1013,8 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
   context->set_output(idx_data_out, output);
 }
 #else
-inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
-                                         int idx_in, int idx_out,
+inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in,
+                                         int idx_out,
                                          const TensorShape& shape) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
@@ -1034,8 +1034,8 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
 
 #ifndef INTEL_MKL_DNN
 
-inline void ForwardTfTensorInToOut(OpKernelContext* context,
-                                  int idx_in, int idx_out) {
+inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in,
+                                   int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -1053,8 +1053,8 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context,
 
 #else
 
-inline void ForwardTfTensorInToOut(OpKernelContext* context,
-                                  int idx_in, int idx_out) {
+inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in,
+                                   int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -1072,8 +1072,8 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context,
 
 #endif
 
-inline void ForwardMklTensorInToOut(OpKernelContext* context,
-                                   int idx_in, int idx_out) {
+inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in,
+                                    int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -1092,8 +1092,8 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context,
 
 #ifdef INTEL_MKL_DNN
 inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
-                                             int idx_in, int idx_out,
-                                             const MklDnnShape& mkl_shape) {
+                                                int idx_in, int idx_out,
+                                                const MklDnnShape& mkl_shape) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -1216,11 +1216,11 @@ inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) {
   int64 H = input.dim_size(1);
   int64 W = input.dim_size(2);
   int64 C = input.dim_size(3);
-  int64 stride_n = H*W*C;
-# pragma omp parallel for num_threads(16)
+  int64 stride_n = H * W * C;
+#pragma omp parallel for num_threads(16)
   for (int64 n = 0; n < N; ++n) {
-    mkl_somatcopy('R', 'T', H*W, C, 1, buf_in + n*stride_n, C,
-        buf_out + n*stride_n, H*W);
+    mkl_somatcopy('R', 'T', H * W, C, 1, buf_in + n * stride_n, C,
+                  buf_out + n * stride_n, H * W);
   }
 }
 
@@ -1232,11 +1232,11 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
   int64 H = (*output)->dim_size(1);
   int64 W = (*output)->dim_size(2);
   int64 C = (*output)->dim_size(3);
-  int64 stride_n = H*W*C;
-# pragma omp parallel for num_threads(16)
+  int64 stride_n = H * W * C;
+#pragma omp parallel for num_threads(16)
   for (int64 n = 0; n < N; ++n) {
-    mkl_somatcopy('R', 'T', C, H*W, 1, buf_in + n*stride_n, H*W,
-        buf_out + n*stride_n, C);
+    mkl_somatcopy('R', 'T', C, H * W, 1, buf_in + n * stride_n, H * W,
+                  buf_out + n * stride_n, C);
   }
 }
 
@@ -1279,10 +1279,11 @@ inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) {
 /// @return: Tensorflow data format corresponding to memory::format
 ///          Fails with an error if invalid data format.
 inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) {
-  if (format == memory::format::nhwc) return FORMAT_NHWC;
-  else if (format == memory::format::nchw) return FORMAT_NCHW;
-  TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT,
-                     "Unsupported data format"));
+  if (format == memory::format::nhwc)
+    return FORMAT_NHWC;
+  else if (format == memory::format::nchw)
+    return FORMAT_NCHW;
+  TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
 
   // Return to prevent compiler warnings, otherwise TF_CHECK_OK will ensure
   // that we don't come here.
@@ -1425,7 +1426,6 @@ inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
   return memory::desc(md);
 }
 
-
 /*
  * Class to represent all the resources corresponding to a tensor in TensorFlow
  * that are required to execute an operation (such as Convolution).
@@ -1494,7 +1494,7 @@ class MklDnnData {
   /// @return: memory::desc object corresponding to blocked memory format
   ///          for given dimensions and strides.
   static inline memory::desc CreateBlockedMemDesc(const memory::dims& dim,
-                                                 const memory::dims& strides) {
+                                                  const memory::dims& strides) {
     return CreateBlockedMemDescHelper(dim, strides, MklDnnType<T>());
   }
 
@@ -1563,7 +1563,6 @@ class MklDnnData {
     return user_memory_->get_primitive_desc();
   }
 
-
   /// Get function for descriptor of user memory.
   inline memory::desc GetUsrMemDesc() {
     // This is ugly. Why MKL-DNN does not provide desc() method of const type??
@@ -1634,7 +1633,8 @@ class MklDnnData {
   /// @return: true in case reorder of input is needed; false, otherwise.
   inline bool IsReorderNeeded(const memory::format& target_format) const {
     CHECK_NOTNULL(user_memory_);
-    return target_format != user_memory_->get_primitive_desc().desc().data.format;
+    return target_format !=
+           user_memory_->get_primitive_desc().desc().data.format;
   }
 
   /// Function to create a reorder from memory pointed by from to memory pointed
diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h
index e7dab830f0..f88ad2faaf 100644
--- a/tensorflow/core/util/presized_cuckoo_map.h
+++ b/tensorflow/core/util/presized_cuckoo_map.h
@@ -67,7 +67,7 @@ inline uint64 multiply_high_u64(uint64 x, uint64 y) {
   return prod_hi + (prod_mid1 >> 32) + (prod_mid2 >> 32) + carry;
 #endif
 }
-}
+}  // namespace presized_cuckoo_map
 
 template <class value>
 class PresizedCuckooMap {
diff --git a/tensorflow/core/util/reporter_test.cc b/tensorflow/core/util/reporter_test.cc
index 1cb07718fe..575c27d4ef 100644
--- a/tensorflow/core/util/reporter_test.cc
+++ b/tensorflow/core/util/reporter_test.cc
@@ -29,8 +29,8 @@ namespace {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected)) << s << " does not contain "
-                                                 << expected;
+  EXPECT_TRUE(StringPiece(s).contains(expected))
+      << s << " does not contain " << expected;
 }
 
 TEST(TestReporter, NoLogging) {
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index f2401a0af4..258ee418c1 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/sparse/dim_comparator.h"
 #include "tensorflow/core/util/sparse/group_iterator.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace sparse {
@@ -59,8 +59,8 @@ class SparseTensor {
         shape_(shape.begin(), shape.end()),
         order_(order.begin(), order.end()),
         dims_(GetDimsFromIx(ix)) {
-    CHECK_EQ(ix.dtype(), DT_INT64) << "indices must be type int64 but got: "
-                                   << ix.dtype();
+    CHECK_EQ(ix.dtype(), DT_INT64)
+        << "indices must be type int64 but got: " << ix.dtype();
     CHECK(TensorShapeUtils::IsVector(vals.shape()))
         << "vals must be a vec, but got: " << vals.shape().DebugString();
     CHECK_EQ(ix.shape().dim_size(0), vals.shape().dim_size(0))
diff --git a/tensorflow/core/util/sparse/sparse_tensor_test.cc b/tensorflow/core/util/sparse/sparse_tensor_test.cc
index efdd97fd3d..85de032085 100644
--- a/tensorflow/core/util/sparse/sparse_tensor_test.cc
+++ b/tensorflow/core/util/sparse/sparse_tensor_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace sparse {
diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h
index 6a5ddec04c..f7767ace71 100644
--- a/tensorflow/core/util/stream_executor_util.h
+++ b/tensorflow/core/util/stream_executor_util.h
@@ -41,9 +41,10 @@ class StreamExecutorUtil {
   // This assumes that the error codes between the two implementations
   // match.
   static Status ConvertStatus(const perftools::gputools::port::Status& s) {
-    return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
-                                              static_cast<int>(s.code())),
-                                          s.error_message());
+    return s.ok() ? Status::OK()
+                  : Status(static_cast<tensorflow::error::Code>(
+                               static_cast<int>(s.code())),
+                           s.error_message());
   }
 };
 
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index 579b70ab51..462b420976 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -913,8 +913,8 @@ Status BundleReader::LookupSlice(StringPiece full_tensor_key,
 Status BundleReader::GetSliceValue(StringPiece full_tensor_key,
                                    const BundleEntryProto& full_tensor_entry,
                                    const TensorSlice& slice_spec, Tensor* val) {
-  using checkpoint::TensorSliceSet;
   using checkpoint::RegisterTensorSlice;
+  using checkpoint::TensorSliceSet;
   DCHECK_GE(full_tensor_entry.slices_size(), 0);
 
   const TensorShape full_shape(TensorShape(full_tensor_entry.shape()));
diff --git a/tensorflow/core/util/tensor_slice_reader_cache.cc b/tensorflow/core/util/tensor_slice_reader_cache.cc
index 0f009d7de5..424f8098a9 100644
--- a/tensorflow/core/util/tensor_slice_reader_cache.cc
+++ b/tensorflow/core/util/tensor_slice_reader_cache.cc
@@ -55,7 +55,7 @@ const TensorSliceReader* TensorSliceReaderCache::GetReader(
     TensorSliceReader::OpenTableFunction open_function, int preferred_shard) {
   mutex_lock l(mu_);
 
-#if defined(__GXX_RTTI) ||  defined(_CPPRTTI)
+#if defined(__GXX_RTTI) || defined(_CPPRTTI)
   // Get the function pointer from the open_function value.
   TensorSliceReaderCache::OpenFuncType* func_ptr =
       open_function.target<TensorSliceReaderCache::OpenFuncType>();
diff --git a/tensorflow/core/util/tensor_slice_set.cc b/tensorflow/core/util/tensor_slice_set.cc
index 4217df90ca..7c1d325c0a 100644
--- a/tensorflow/core/util/tensor_slice_set.cc
+++ b/tensorflow/core/util/tensor_slice_set.cc
@@ -188,9 +188,9 @@ Status RegisterTensorSlice(
     }
     if (type != tss->type()) {
       return errors::Internal("Incompatible tensor types detected for tensor ",
-                              name, ": existing = ",
-                              DataTypeString(tss->type()), ", new = ",
-                              DataTypeString(type));
+                              name,
+                              ": existing = ", DataTypeString(tss->type()),
+                              ", new = ", DataTypeString(type));
     }
   }
   // Register the tensor slices without the actual data.
diff --git a/tensorflow/core/util/tensor_slice_util.h b/tensorflow/core/util/tensor_slice_util.h
index c7edae66b2..8f5a6f1d93 100644
--- a/tensorflow/core/util/tensor_slice_util.h
+++ b/tensorflow/core/util/tensor_slice_util.h
@@ -139,9 +139,9 @@ static bool CopyDataFromTensorSliceToTensorSlice(const TensorShape& shape,
                                                  const TensorSlice& slice_d,
                                                  const SrcT* ptr_s,
                                                  DstT* ptr_d) {
-  CHECK_LE(shape.dims(), kTensorSliceMaxRank) << "Only tensors of size up to "
-                                              << kTensorSliceMaxRank
-                                              << " are supported";
+  CHECK_LE(shape.dims(), kTensorSliceMaxRank)
+      << "Only tensors of size up to " << kTensorSliceMaxRank
+      << " are supported";
   // We need to compute the intersection of the two slices.
   TensorSlice inter;
   if (!slice_s.Intersect(slice_d, &inter)) {
diff --git a/tensorflow/core/util/tensor_slice_writer.h b/tensorflow/core/util/tensor_slice_writer.h
index bdb4921e1b..2888c66d10 100644
--- a/tensorflow/core/util/tensor_slice_writer.h
+++ b/tensorflow/core/util/tensor_slice_writer.h
@@ -101,8 +101,8 @@ Status TensorSliceWriter::Add(const string& name, const TensorShape& shape,
   // The tensor and the slice have to be compatible
   if (shape.dims() != slice.dims()) {
     return errors::Internal("Incompatible tensor shape and slice: ", "shape = ",
-                            shape.DebugString(), ", slice = ",
-                            slice.DebugString());
+                            shape.DebugString(),
+                            ", slice = ", slice.DebugString());
   }
   DataType dt = DataTypeToEnum<T>::value;
   // We need to add an entry for "name" if there isn't an entry already.
@@ -114,9 +114,9 @@ Status TensorSliceWriter::Add(const string& name, const TensorShape& shape,
     CHECK_EQ(name, ssm.name()) << ProtoShortDebugString(ssm);
     TensorShape ssm_shape(ssm.shape());
     if (!shape.IsSameSize(ssm_shape)) {
-      return errors::Internal("Mismatching shapes: existing tensor = ",
-                              ssm_shape.DebugString(), ", trying to add name ",
-                              name, ", shape = ", shape.DebugString());
+      return errors::Internal(
+          "Mismatching shapes: existing tensor = ", ssm_shape.DebugString(),
+          ", trying to add name ", name, ", shape = ", shape.DebugString());
     }
     if (dt != ssm.type()) {
       return errors::Internal(
-- 
GitLab


From 8f0e7207774279f4fe50f4d6c4fbd576e2941463 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 10:18:36 -0800
Subject: [PATCH 393/966] Prepare variance to be exported for serving with the
 servo library.

PiperOrigin-RevId: 183851026
---
 .../tensor_forest/client/random_forest.py     | 41 +++++++++++++++----
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index a998ac1e11..4abcc20ed3 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -18,7 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import layers
-
+from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
@@ -43,8 +43,8 @@ from tensorflow.python.training import training_util
 KEYS_NAME = 'keys'
 LOSS_NAME = 'rf_training_loss'
 TREE_PATHS_PREDICTION_KEY = 'tree_paths'
-VARIANCE_PREDICTION_KEY = 'regression_variance'
-
+VARIANCE_PREDICTION_KEY = 'prediction_variance'
+ALL_SERVING_KEY = 'tensorforest_all'
 EPSILON = 0.000001
 
 
@@ -134,7 +134,8 @@ def get_model_fn(params,
                  trainer_id=0,
                  report_feature_importances=False,
                  local_eval=False,
-                 head_scope=None):
+                 head_scope=None,
+                 include_all_in_serving=False):
   """Return a model function given a way to construct a graph builder."""
   if model_head is None:
     model_head = get_default_head(params, weights_name)
@@ -238,7 +239,13 @@ def get_model_fn(params,
       model_ops.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths
 
     model_ops.predictions[VARIANCE_PREDICTION_KEY] = regression_variance
-
+    if include_all_in_serving:
+      # In order to serve the variance we need to add the prediction dict
+      # to output_alternatives dict.
+      if not model_ops.output_alternatives:
+        model_ops.output_alternatives = {}
+      model_ops.output_alternatives[ALL_SERVING_KEY] = (
+          constants.ProblemType.UNSPECIFIED, model_ops.predictions)
     return model_ops
 
   return _model_fn
@@ -293,7 +300,8 @@ class TensorForestEstimator(estimator.Estimator):
                report_feature_importances=False,
                local_eval=False,
                version=None,
-               head=None):
+               head=None,
+               include_all_in_serving=False):
     """Initializes a TensorForestEstimator instance.
 
     Args:
@@ -339,6 +347,23 @@ class TensorForestEstimator(estimator.Estimator):
       version: Unused.
       head: A heads_lib.Head object that calculates losses and such. If None,
         one will be automatically created based on params.
+      include_all_in_serving: if True, allow preparation of the complete
+        prediction dict including the variance to be exported for serving with
+        the Servo lib; and it also requires calling export_savedmodel with
+        default_output_alternative_key=ALL_SERVING_KEY, i.e.
+        estimator.export_savedmodel(export_dir_base=your_export_dir,
+          serving_input_fn=your_export_input_fn,
+          default_output_alternative_key=ALL_SERVING_KEY)
+        if False, resort to default behavior, i.e. export scores and
+          probabilities but no variances. In this case
+          default_output_alternative_key should be None while calling
+          export_savedmodel().
+        Note, that due to backward compatibility we cannot always set
+        include_all_in_serving to True because in this case calling
+        export_saved_model() without
+        default_output_alternative_key=ALL_SERVING_KEY (legacy behavior) the
+        saved_model_export_utils.get_output_alternatives() would raise
+        ValueError.
 
     Returns:
       A `TensorForestEstimator` instance.
@@ -357,7 +382,9 @@ class TensorForestEstimator(estimator.Estimator):
             num_trainers=num_trainers,
             trainer_id=trainer_id,
             report_feature_importances=report_feature_importances,
-            local_eval=local_eval),
+            local_eval=local_eval,
+            include_all_in_serving=include_all_in_serving,
+        ),
         model_dir=model_dir,
         config=config,
         feature_engineering_fn=feature_engineering_fn)
-- 
GitLab


From 4463d105a8a4a83642b9709ba79310e8f4ddf577 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 10:43:03 -0800
Subject: [PATCH 394/966] Cleanup: Ran clang-format on all *.{cc,h} files in
 tensorflow/contrib/.../*.{hh,c}.

PiperOrigin-RevId: 183855242
---
 .../contrib/android/jni/run_stats_jni.cc      |   2 +-
 .../boosted_trees/kernels/model_ops.cc        |   5 +-
 .../boosted_trees/kernels/prediction_ops.cc   |   2 +-
 .../boosted_trees/kernels/quantile_ops.cc     |   4 +-
 .../kernels/split_handler_ops.cc              |   7 +-
 .../kernels/stats_accumulator_ops.cc          |  30 +--
 .../learner/common/stats/node-stats_test.cc   |   2 +-
 .../lib/quantiles/weighted_quantiles_stream.h |   2 +-
 .../lib/testutil/random_tree_gen.cc           |   2 +-
 .../lib/utils/batch_features_test.cc          |   2 +-
 .../boosted_trees/lib/utils/dropout_utils.cc  |   2 +-
 .../lib/utils/dropout_utils_test.cc           |   2 +-
 .../contrib/boosted_trees/ops/quantile_ops.cc |   2 +-
 .../boosted_trees/ops/split_handler_ops.cc    |   2 +-
 .../ops/stats_accumulator_ops.cc              |   2 +-
 .../bigquery_table_accessor_test_data.h       |   2 +-
 .../contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc    |   8 +-
 .../kernels/masked_matmul_ops.cc              |  25 +-
 .../contrib/ffmpeg/default/ffmpeg_lib.cc      |  41 ++--
 .../contrib/ffmpeg/default/ffmpeg_lib_test.cc |   8 +-
 .../ffmpeg/default/ffmpeg_lib_utility_test.cc |   2 -
 .../framework/kernels/zero_initializer_op.cc  |   6 +-
 .../framework/kernels/zero_initializer_op.h   |   4 +-
 .../contrib/framework/ops/variable_ops.cc     |   4 +-
 tensorflow/contrib/gdr/gdr_memory_manager.cc  |  13 +-
 tensorflow/contrib/image/kernels/image_ops.cc |  13 +-
 ...single_image_random_dot_stereograms_ops.cc |   4 +-
 ...single_image_random_dot_stereograms_ops.cc |   4 +-
 .../kernels/input_pipeline_kernels.cc         |   5 +-
 .../kernels/sparse_feature_cross_kernel.cc    |   5 +-
 tensorflow/contrib/lite/interpreter.h         |   2 +-
 .../contrib/lite/kernels/activations.cc       |  11 +-
 tensorflow/contrib/lite/kernels/add.cc        |   8 +-
 tensorflow/contrib/lite/kernels/basic_rnn.cc  |   6 +-
 .../contrib/lite/kernels/basic_rnn_test.cc    |   5 +-
 .../kernels/embedding_lookup_sparse_test.cc   |  22 +-
 .../contrib/lite/kernels/gather_test.cc       |   9 +-
 .../lite/kernels/hashtable_lookup_test.cc     |   5 +-
 .../kernels/internal/optimized/cpu_check.h    |   8 +-
 .../internal/optimized/depthwiseconv_float.h  |  10 +-
 .../optimized/eigen_spatial_convolutions.h    |  14 +-
 .../lite/kernels/optional_tensor_test.cc      |   2 -
 tensorflow/contrib/lite/kernels/pad.cc        |   4 +-
 tensorflow/contrib/lite/kernels/svdf.cc       |   2 +-
 tensorflow/contrib/lite/kernels/svdf_test.cc  |   2 +-
 .../kernels/unidirectional_sequence_rnn.cc    |   6 +-
 .../unidirectional_sequence_rnn_test.cc       |   5 +-
 .../contrib/lite/toco/tensorflow_util.cc      |   6 +-
 tensorflow/contrib/lite/toco/tflite/export.cc |   6 +-
 .../contrib/lite/toco/tflite/operator.cc      |  12 +-
 .../memory_stats/kernels/memory_stats_ops.cc  |   8 +-
 tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc  | 218 ++++++++---------
 tensorflow/contrib/mpi/mpi_rendezvous_mgr.h   |  11 +-
 tensorflow/contrib/mpi/mpi_server_lib.cc      |   2 +-
 tensorflow/contrib/mpi/mpi_utils.h            |   2 +-
 .../mpi_collectives/kernels/mpi_ops.cc        |   2 +-
 .../kernels/hyperplane_lsh_probes.cc          |  24 +-
 .../kernels/periodic_resample_op.cc           |   5 +-
 .../kernels/periodic_resample_op.h            |   6 +-
 .../contrib/pi_examples/camera/camera.cc      |  10 +-
 .../pi_examples/label_image/label_image.cc    |  89 +++----
 .../kernels/reduce_slice_ops.cc               |   2 +-
 .../kernels/reduce_slice_ops.h                |  12 +-
 .../kernels/reduce_slice_ops_gpu.cu.cc        |   2 +-
 .../resampler/kernels/resampler_ops.cc        | 223 +++++++-----------
 .../contrib/resampler/kernels/resampler_ops.h |  39 +--
 .../resampler/kernels/resampler_ops_gpu.cu.cc | 188 +++++++--------
 tensorflow/contrib/rnn/kernels/blas_gemm.cc   |   9 +-
 tensorflow/contrib/rnn/kernels/gru_ops.cc     | 110 ++++-----
 tensorflow/contrib/rnn/kernels/lstm_ops.cc    | 193 ++++++++-------
 tensorflow/contrib/rnn/kernels/lstm_ops.h     |   1 -
 tensorflow/contrib/rnn/ops/lstm_ops_test.cc   |   5 +-
 .../session_bundle/bundle_shim_test.cc        |  14 +-
 .../contrib/session_bundle/signature.cc       |  14 +-
 .../core/ops/hard_routing_function_op.cc      |  27 +--
 .../hybrid/core/ops/k_feature_gradient_op.cc  |  56 ++---
 .../core/ops/k_feature_routing_function_op.cc |  49 ++--
 .../hybrid/core/ops/routing_function_op.cc    |  29 +--
 .../stochastic_hard_routing_function_op.cc    |  38 ++-
 .../stochastic_hard_routing_gradient_op.cc    |  18 +-
 .../hybrid/core/ops/unpack_path_op.cc         |  10 +-
 .../tensor_forest/hybrid/core/ops/utils.cc    |  11 +-
 .../tensor_forest/hybrid/core/ops/utils.h     |  13 +-
 .../kernels/reinterpret_string_to_float_op.cc |  15 +-
 .../kernels/scatter_add_ndim_op.cc            |  19 +-
 .../tensor_forest/kernels/tree_utils.cc       |  78 +++---
 .../tensor_forest/kernels/tree_utils.h        |  25 +-
 .../tensor_forest/kernels/tree_utils_test.cc  | 128 +++++-----
 .../kernels/v4/candidate_graph_runner.cc      |  13 +-
 .../kernels/v4/decision-tree-resource.h       |  13 +-
 .../kernels/v4/decision_node_evaluator.h      |   1 -
 .../v4/decision_node_evaluator_test.cc        |   5 +-
 .../kernels/v4/fertile-stats-resource.h       |   9 +-
 .../tensor_forest/kernels/v4/grow_stats.cc    |  36 ++-
 .../tensor_forest/kernels/v4/grow_stats.h     |  25 +-
 .../kernels/v4/grow_stats_test.cc             |  47 ++--
 .../tensor_forest/kernels/v4/input_data.cc    |   8 +-
 .../tensor_forest/kernels/v4/input_data.h     |   4 +-
 .../tensor_forest/kernels/v4/input_target.h   |   4 +-
 .../kernels/v4/leaf_model_operators.cc        |   1 -
 .../kernels/v4/leaf_model_operators_test.cc   |  24 +-
 .../contrib/tensor_forest/kernels/v4/params.h |   1 -
 .../tensor_forest/kernels/v4/params_test.cc   |   2 -
 .../kernels/v4/split_collection_operators.cc  |   7 +-
 .../kernels/v4/split_collection_operators.h   |   4 +-
 .../tensor_forest/kernels/v4/stat_utils.cc    |  22 +-
 .../tensor_forest/kernels/v4/test_utils.h     |   5 +-
 .../convert_graphdef_memmapped_format_lib.cc  |   6 +-
 tensorflow/contrib/util/inspect_checkpoint.cc |   2 +-
 tensorflow/contrib/verbs/verbs_server_lib.cc  |   4 +-
 110 files changed, 1029 insertions(+), 1276 deletions(-)

diff --git a/tensorflow/contrib/android/jni/run_stats_jni.cc b/tensorflow/contrib/android/jni/run_stats_jni.cc
index 119fa9cd2c..707853b59b 100644
--- a/tensorflow/contrib/android/jni/run_stats_jni.cc
+++ b/tensorflow/contrib/android/jni/run_stats_jni.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/util/stat_summarizer.h"
 
-using tensorflow::StatSummarizer;
 using tensorflow::RunMetadata;
+using tensorflow::StatSummarizer;
 
 namespace {
 StatSummarizer* requireHandle(JNIEnv* env, jlong handle) {
diff --git a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
index 4b5d5ba0de..754b7bc327 100644
--- a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
@@ -48,8 +48,9 @@ class CreateTreeEnsembleVariableOp : public OpKernel {
     if (!result->InitFromSerialized(tree_ensemble_config_t->scalar<string>()(),
                                     stamp_token)) {
       result->Unref();
-      OP_REQUIRES(context, false, errors::InvalidArgument(
-                                      "Unable to parse tree ensemble config."));
+      OP_REQUIRES(
+          context, false,
+          errors::InvalidArgument("Unable to parse tree ensemble config."));
     }
 
     // Only create one, if one does not exist already. Report status for all
diff --git a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
index f8086b0c2b..b3fe38614e 100644
--- a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
@@ -47,8 +47,8 @@ namespace boosted_trees {
 using boosted_trees::learner::LearnerConfig;
 using boosted_trees::learner::LearningRateConfig;
 using boosted_trees::learner::LearningRateDropoutDrivenConfig;
-using boosted_trees::models::MultipleAdditiveTrees;
 using boosted_trees::models::DecisionTreeEnsembleResource;
+using boosted_trees::models::MultipleAdditiveTrees;
 using boosted_trees::utils::DropoutUtils;
 using boosted_trees::utils::TensorUtils;
 
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index e91232bf10..0f4c2298f5 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -36,8 +36,8 @@
 namespace tensorflow {
 
 using ::boosted_trees::QuantileConfig;
-using boosted_trees::utils::TensorUtils;
 using boosted_trees::QuantileStreamResource;
+using boosted_trees::utils::TensorUtils;
 
 namespace {
 const char* const kExampleWeightsName = "example_weights";
@@ -384,7 +384,7 @@ class MakeQuantileSummariesOp : public OpKernel {
         protobuf::Arena arena;
         ::boosted_trees::QuantileSummaryState* summary_proto =
             protobuf::Arena::CreateMessage<
-            ::boosted_trees::QuantileSummaryState>(&arena);
+                ::boosted_trees::QuantileSummaryState>(&arena);
         const auto& summary = stream.GetFinalSummary();
         CopySummaryToProto(summary, summary_proto);
         // Output to tensor.
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 18b4abd654..44a8ffaf4b 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -34,10 +34,10 @@
 
 namespace tensorflow {
 
+using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
 using boosted_trees::learner::SplitInfo;
 using boosted_trees::learner::stochastic::GradientStats;
 using boosted_trees::learner::stochastic::NodeStats;
-using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
 
 namespace {
 const int32 DUMMY_FEATURE_DIMENSION = -1;
@@ -47,9 +47,8 @@ class BaseBuildSplitOp : public OpKernel {
  public:
   explicit BaseBuildSplitOp(OpKernelConstruction* const context)
       : OpKernel(context) {
-    OP_REQUIRES_OK(
-        context,
-        context->GetAttr("feature_column_group_id", &feature_column_group_id_));
+    OP_REQUIRES_OK(context, context->GetAttr("feature_column_group_id",
+                                             &feature_column_group_id_));
     OP_REQUIRES_OK(context,
                    context->GetAttr("l1_regularization", &l1_regularization_));
     OP_REQUIRES_OK(context,
diff --git a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
index a9a229c8ae..90a0655201 100644
--- a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
@@ -134,10 +134,9 @@ void SerializeScalarAccumulatorToOutput(
     OpKernelContext* context) {
   int64 num_slots = accumulator_resource.values().size();
   Tensor* partition_ids_t = nullptr;
-  OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_partition_ids", TensorShape({num_slots}),
-                               &partition_ids_t));
+  OP_REQUIRES_OK(context, context->allocate_output("output_partition_ids",
+                                                   TensorShape({num_slots}),
+                                                   &partition_ids_t));
   auto partition_ids = partition_ids_t->vec<int32>();
 
   // Feature ids tensor has ids of feature columns and their dimensions.
@@ -149,15 +148,14 @@ void SerializeScalarAccumulatorToOutput(
 
   Tensor* gradients_t = nullptr;
   OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_gradients", TensorShape({num_slots}),
-                               &gradients_t));
+      context, context->allocate_output(
+                   "output_gradients", TensorShape({num_slots}), &gradients_t));
   auto gradients = gradients_t->vec<float>();
 
   Tensor* hessians_t = nullptr;
-  OP_REQUIRES_OK(context,
-                 context->allocate_output(
-                     "output_hessians", TensorShape({num_slots}), &hessians_t));
+  OP_REQUIRES_OK(
+      context, context->allocate_output("output_hessians",
+                                        TensorShape({num_slots}), &hessians_t));
   auto hessians = hessians_t->vec<float>();
 
   int i = 0;
@@ -177,10 +175,9 @@ void SerializeTensorAccumulatorToOutput(
     OpKernelContext* context) {
   int64 num_slots = accumulator_resource.values().size();
   Tensor* partition_ids_t = nullptr;
-  OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_partition_ids", TensorShape({num_slots}),
-                               &partition_ids_t));
+  OP_REQUIRES_OK(context, context->allocate_output("output_partition_ids",
+                                                   TensorShape({num_slots}),
+                                                   &partition_ids_t));
   auto partition_ids = partition_ids_t->vec<int32>();
 
   Tensor* feature_ids_t = nullptr;
@@ -202,9 +199,8 @@ void SerializeTensorAccumulatorToOutput(
   int64 num_hessian_elements = hessian_shape.num_elements();
   hessian_shape.InsertDim(0, num_slots);
   Tensor* hessians_t = nullptr;
-  OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_hessians", hessian_shape, &hessians_t));
+  OP_REQUIRES_OK(context, context->allocate_output("output_hessians",
+                                                   hessian_shape, &hessians_t));
   auto hessians = hessians_t->flat_outer_dims<float>();
 
   int i = 0;
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
index f867e77d3e..8bca132acf 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
@@ -17,8 +17,8 @@
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/platform/test.h"
 
-using tensorflow::test::AsTensor;
 using std::vector;
+using tensorflow::test::AsTensor;
 
 namespace tensorflow {
 namespace boosted_trees {
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
index 1c4181f1b1..8ad97fedc9 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
@@ -15,9 +15,9 @@
 #ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
 #define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
 
+#include <cmath>
 #include <memory>
 #include <vector>
-#include <cmath>
 
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h"
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h"
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc
index cbe26ba918..705b65e9db 100644
--- a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc
@@ -22,9 +22,9 @@ namespace tensorflow {
 namespace boosted_trees {
 namespace testutil {
 
+using boosted_trees::trees::DenseFloatBinarySplit;
 using tensorflow::boosted_trees::trees::DecisionTreeConfig;
 using tensorflow::boosted_trees::trees::TreeNode;
-using boosted_trees::trees::DenseFloatBinarySplit;
 
 namespace {
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
index 9de3e32b09..609519e8b1 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
@@ -25,8 +25,8 @@ namespace boosted_trees {
 namespace utils {
 namespace {
 
-using test::AsTensor;
 using errors::InvalidArgument;
+using test::AsTensor;
 
 class BatchFeaturesTest : public ::testing::Test {};
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
index 38f0151255..db34db998a 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
@@ -23,10 +23,10 @@
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/logging.h"
 
+using tensorflow::Status;
 using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig;
 using tensorflow::random::PhiloxRandom;
 using tensorflow::random::SimplePhilox;
-using tensorflow::Status;
 
 namespace tensorflow {
 namespace boosted_trees {
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
index ce7632e589..02f972c8e0 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
@@ -26,9 +26,9 @@
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/env.h"
 
+using std::unordered_set;
 using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig;
 using tensorflow::boosted_trees::trees::DecisionTreeEnsembleConfig;
-using std::unordered_set;
 
 namespace tensorflow {
 namespace boosted_trees {
diff --git a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
index bb57dcf8ae..ae99d53a2c 100644
--- a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
@@ -19,8 +19,8 @@
 
 namespace tensorflow {
 namespace boosted_trees {
-using shape_inference::InferenceContext;
 using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
 REGISTER_RESOURCE_HANDLE_OP(QuantileStreamResource);
diff --git a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
index 0d27ddaf3a..5d0ebbf73c 100644
--- a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
@@ -18,9 +18,9 @@
 
 namespace tensorflow {
 
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
-using shape_inference::DimensionHandle;
 
 REGISTER_OP("BuildDenseInequalitySplits")
     .Attr("feature_column_group_id: int")
diff --git a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
index 0354f7853c..179505eef0 100644
--- a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
@@ -19,9 +19,9 @@
 
 namespace tensorflow {
 namespace boosted_trees {
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
-using shape_inference::DimensionHandle;
 
 REGISTER_RESOURCE_HANDLE_OP(StatsAccumulatorScalarResource);
 
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
index 59f2333298..fea6b15640 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
@@ -399,6 +399,6 @@ const string kTestEmptyRow = R"({
     }]}]})";
 
 }  // namespace
-}  // namepsace tensorflow
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
index 9e41e67857..1a79bf066c 100644
--- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
@@ -251,9 +251,8 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
       TF_RETURN_IF_ERROR(c->GetAttr("num_params", &num_params));
       // Set shape for weight matrices
       for (int i = 0; i < num_params; i++) {
-        c->set_output(i,
-                      c->Matrix(InferenceContext::kUnknownDim,
-                                InferenceContext::kUnknownDim));
+        c->set_output(i, c->Matrix(InferenceContext::kUnknownDim,
+                                   InferenceContext::kUnknownDim));
       }
       // Set shape for bias vectors
       for (int i = 0; i < num_params; i++) {
@@ -300,6 +299,7 @@ upcoming training or inferences.
 num_params: number of parameter sets for all layers.
     Each layer may contain multiple parameter sets, with each set consisting of
     a weight matrix and a bias vector.
-)doc", kCudnnRNNCommonAttrs));
+)doc",
+                         kCudnnRNNCommonAttrs));
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
index 31d08bfb65..a8c5d0763c 100644
--- a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
@@ -57,11 +57,11 @@ typedef Eigen::Map<
 
 class MaskedMatmulOp : public OpKernel {
  public:
-  explicit MaskedMatmulOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->MatchSignature(
-        {DT_FLOAT, DT_FLOAT, DT_INT64, DT_BOOL, DT_BOOL},
-        {DT_FLOAT}));
+  explicit MaskedMatmulOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(
+        context,
+        context->MatchSignature(
+            {DT_FLOAT, DT_FLOAT, DT_INT64, DT_BOOL, DT_BOOL}, {DT_FLOAT}));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -110,12 +110,11 @@ class MaskedMatmulOp : public OpKernel {
                                       num_nonzero_elements, 2);
 
     Tensor* prod_values_tensor;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(
-                       0, TensorShape({num_nonzero_elements}),
-                       &prod_values_tensor));
-    EigenMatFloatMap prod_values(prod_values_tensor->vec<float>().data(),
-                                 1, num_nonzero_elements);
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, TensorShape({num_nonzero_elements}),
+                                &prod_values_tensor));
+    EigenMatFloatMap prod_values(prod_values_tensor->vec<float>().data(), 1,
+                                 num_nonzero_elements);
 
     auto get_a_index = [&indices_mat, &a_dim_0](int64 i) {
       int64 a_index = internal::SubtleMustCopy(indices_mat(i, 0));
@@ -182,8 +181,8 @@ class MaskedMatmulOp : public OpKernel {
       }
     };
     // Shard the work.
-    worker_threads.workers->ParallelFor(
-        num_nonzero_elements, cost_per_unit, work);
+    worker_threads.workers->ParallelFor(num_nonzero_elements, cost_per_unit,
+                                        work);
   }
 };
 REGISTER_KERNEL_BUILDER(Name("MaskedMatmul").Device(DEVICE_CPU),
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index c85b1837ab..e61221a6b0 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -47,20 +47,19 @@ std::vector<string> FfmpegAudioCommandLine(const string& input_filename,
                                            int32 channel_count,
                                            const string& stream) {
   std::vector<string> command({
-          "-nostats",             // No additional progress display.
-          "-nostdin",             // No interactive commands accepted.
-          "-f", input_format_id,  // eg: "mp3"
-          "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename,
-          "-loglevel", "error",   // Print errors only.
-          "-hide_banner",         // Skip printing build options, version, etc.
-          "-map_metadata", "-1",  // Copy global metadata from input to output.
-          "-vn",                  // No video recording.
-          "-ac:a:0", StrCat(channel_count), "-ar:a:0",
-          StrCat(samples_per_second),
-          // Output set (in several ways) to signed 16-bit little-endian ints.
-          "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le",
-          "-sn",  // No subtitle recording.
-          "-y"   // Overwrite output file.
+      "-nostats",             // No additional progress display.
+      "-nostdin",             // No interactive commands accepted.
+      "-f", input_format_id,  // eg: "mp3"
+      "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename,
+      "-loglevel", "error",   // Print errors only.
+      "-hide_banner",         // Skip printing build options, version, etc.
+      "-map_metadata", "-1",  // Copy global metadata from input to output.
+      "-vn",                  // No video recording.
+      "-ac:a:0", StrCat(channel_count), "-ar:a:0", StrCat(samples_per_second),
+      // Output set (in several ways) to signed 16-bit little-endian ints.
+      "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le",
+      "-sn",  // No subtitle recording.
+      "-y"    // Overwrite output file.
   });
   if (!stream.empty()) {
     command.emplace_back("-map");
@@ -75,21 +74,13 @@ std::vector<string> FfmpegVideoCommandLine(const string& input_filename,
                                            const string& output_filename) {
   return {"-nostats",  // No additional progress display.
           "-nostdin",  // No interactive commands accepted.
-          "-i",
-          input_filename,
-          "-f",
-          "image2pipe",
-          "-probesize",
-          StrCat(kDefaultProbeSize),
-          "-loglevel",
+          "-i", input_filename, "-f", "image2pipe", "-probesize",
+          StrCat(kDefaultProbeSize), "-loglevel",
           // Info is needed to get the information about stream, etc.
           // It is generated to a separate file, not stdout/stderr.
           "info",
           "-hide_banner",  // Skip printing build options, version, etc.
-          "-vcodec",
-          "rawvideo",
-          "-pix_fmt",
-          "rgb24",
+          "-vcodec", "rawvideo", "-pix_fmt", "rgb24",
           "-y",  // Overwrite output file.
           StrCat(output_filename)};
 }
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
index 85b61b2616..05728b3d37 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
@@ -32,10 +32,8 @@ namespace tensorflow {
 namespace ffmpeg {
 namespace {
 
-const char kTestWavFilename[] =
-    "contrib/ffmpeg/testdata/mono_10khz.wav";
-const char kTestMp3Filename[] =
-    "contrib/ffmpeg/testdata/test_sound1.mp3";
+const char kTestWavFilename[] = "contrib/ffmpeg/testdata/mono_10khz.wav";
+const char kTestMp3Filename[] = "contrib/ffmpeg/testdata/test_sound1.mp3";
 
 // Set to true via a command line flag iff the test is expected to have FFmpeg
 // installed.
@@ -139,7 +137,7 @@ TEST(FfmpegLibTest, TestRoundTripWav) {
 }  // namespace ffmpeg
 }  // namespace tensorflow
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   tensorflow::string usage = tensorflow::ffmpeg::ParseTestFlags(&argc, argv);
   testing::InitGoogleTest(&argc, argv);
   if (argc != 1) {
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
index 36fc71794b..d6c885a324 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
@@ -20,8 +20,6 @@
 #include <string>
 #include <vector>
 
-
-#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
index 6677dca752..5bf6b67529 100644
--- a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
+++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
@@ -21,8 +21,8 @@ limitations under the License.
 
 #include "tensorflow/contrib/framework/kernels/zero_initializer_op.h"
 
-#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
@@ -81,8 +81,8 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 #define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 #undef REGISTER_KERNELS
 
-} // namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.h b/tensorflow/contrib/framework/kernels/zero_initializer_op.h
index 14c9268efa..99389a5ab6 100644
--- a/tensorflow/contrib/framework/kernels/zero_initializer_op.h
+++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.h
@@ -29,5 +29,5 @@ struct TensorSetZero {
 };
 }  // namespace functor
 
-} // end namespace tensorflow
-#endif // TENSORFLOW_CONTRIB_FRAMEWORK_KERNELS_ZERO_INITIALIZER_OP_H_
+}  // end namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_FRAMEWORK_KERNELS_ZERO_INITIALIZER_OP_H_
diff --git a/tensorflow/contrib/framework/ops/variable_ops.cc b/tensorflow/contrib/framework/ops/variable_ops.cc
index 1ee8e1498c..706134ba9a 100644
--- a/tensorflow/contrib/framework/ops/variable_ops.cc
+++ b/tensorflow/contrib/framework/ops/variable_ops.cc
@@ -26,8 +26,8 @@ REGISTER_OP("ZeroInitializer")
     .Attr("T: realnumbertype")
     .SetAllowsUninitializedInput()
     .SetShapeFn([](InferenceContext* c) {
-        c->set_output(0, c->input(0));
-        return Status::OK();
+      c->set_output(0, c->input(0));
+      return Status::OK();
     })
     .Doc(R"doc(
 Initialize 'ref' with all zeros. This op requires that the tensor is not
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index 5c7ac74428..81e70ae30a 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -86,8 +86,9 @@ int TryToReadNumaNode(ibv_device* device) {
   if (strings::safe_strto32(content, &value)) {
     if (value < 0) {
       LOG(INFO) << "Successful NUMA node read from SysFS had negative value ("
-                << value << "), but there must be at least one NUMA node"
-                            ", so returning NUMA node zero";
+                << value
+                << "), but there must be at least one NUMA node"
+                   ", so returning NUMA node zero";
       return 0;
     }
     LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
@@ -290,8 +291,8 @@ Status GdrMemoryManager::Init() {
   // Host memory allocators
   for (Allocator* allocator : allocators) {
     auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
-    CHECK(visitable_allocator) << "is not visitable for instrumentation"
-                               << allocator->Name();
+    CHECK(visitable_allocator)
+        << "is not visitable for instrumentation" << allocator->Name();
     // Make sure we don't instrument the same allocator twice
     if (instrumented_.find(allocator) == std::end(instrumented_)) {
       visitable_allocator->AddAllocVisitor(alloc_visitor);
@@ -635,8 +636,8 @@ void GdrMemoryManager::TensorFromTransportOptions(
     } else {
       checksum = GPUUtil::Checksum(*tensor);
     }
-    CHECK(checksum == remote_mr.checksum()) << "Checksum mismatch: " << checksum
-                                            << "!=" << remote_mr.checksum();
+    CHECK(checksum == remote_mr.checksum())
+        << "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum();
 #endif
   }
   done(Status::OK());
diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index 6adf837ca0..c2e32da133 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -43,9 +43,9 @@ template struct FillProjectiveTransform<CPUDevice, double>;
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 using functor::FillProjectiveTransform;
+using generator::Interpolation;
 using generator::INTERPOLATION_BILINEAR;
 using generator::INTERPOLATION_NEAREST;
-using generator::Interpolation;
 using generator::ProjectiveGenerator;
 
 template <typename Device, typename T>
@@ -72,11 +72,12 @@ class ImageProjectiveTransform : public OpKernel {
     const Tensor& transform_t = ctx->input(1);
     OP_REQUIRES(ctx, images_t.shape().dims() == 4,
                 errors::InvalidArgument("Input images must have rank 4"));
-    OP_REQUIRES(ctx, (TensorShapeUtils::IsMatrix(transform_t.shape()) &&
-                      (transform_t.dim_size(0) == images_t.dim_size(0) ||
-                       transform_t.dim_size(0) == 1) &&
-                      transform_t.dim_size(1) ==
-                          ProjectiveGenerator<Device, T>::kNumParameters),
+    OP_REQUIRES(ctx,
+                (TensorShapeUtils::IsMatrix(transform_t.shape()) &&
+                 (transform_t.dim_size(0) == images_t.dim_size(0) ||
+                  transform_t.dim_size(0) == 1) &&
+                 transform_t.dim_size(1) ==
+                     ProjectiveGenerator<Device, T>::kNumParameters),
                 errors::InvalidArgument(
                     "Input transform should be num_images x 8 or 1 x 8"));
     auto images = images_t.tensor<T, 4>();
diff --git a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
index 9f0bf37aed..8f9a5c2803 100755
--- a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
@@ -143,8 +143,8 @@ class SingleImageRandomDotStereogramsOp : public OpKernel {
     }
 
     data_box_left = deltaX_border_image / 2;  // Center DATA in X dimension
-    data_box_width = data_Xwindow;             // width of scan line
-    data_box_height = data_Ywindow;            // hight of image
+    data_box_width = data_Xwindow;            // width of scan line
+    data_box_height = data_Ywindow;           // hight of image
 
     const T* inputZ = input_tensor.flat<T>().data();  // Flatten input Z buffer
 
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
index 1f41f243f2..8139d4272d 100755
--- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -58,7 +58,9 @@ REGISTER_OP("SingleImageRandomDotStereograms")
       int colors;
       TF_RETURN_IF_ERROR(c->GetAttr("number_colors", &colors));
 
-      c->set_output(0, c->MakeShape({y_dim, x_dim, colors > 256? c->MakeDim(3) : c->MakeDim(1)}));
+      c->set_output(
+          0, c->MakeShape(
+                 {y_dim, x_dim, colors > 256 ? c->MakeDim(3) : c->MakeDim(1)}));
       return Status::OK();
     })
     .Doc(R"doc(
diff --git a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc
index ca288c1f73..886f679815 100644
--- a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc
+++ b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc
@@ -34,9 +34,8 @@ class ObtainNextOp : public OpKernel {
 
     // Allocate output.
     Tensor* output_tensor = nullptr;
-    OP_REQUIRES_OK(
-        ctx,
-        ctx->allocate_output("out_element", TensorShape({}), &output_tensor));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output("out_element", TensorShape({}),
+                                             &output_tensor));
 
     // Obtain mutex for the "counter" tensor.
     mutex* mu;
diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
index 932c5ab992..01893d6061 100644
--- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
+++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
@@ -423,8 +423,9 @@ class SparseFeatureCrossOp : public OpKernel {
               "Input values should be a std::vector but received shape ",
               values_list_in[i].shape().DebugString(), " at position ", i));
       OP_REQUIRES(
-          context, indices_list_in[i].shape().dim_size(0) ==
-                       values_list_in[i].shape().dim_size(0),
+          context,
+          indices_list_in[i].shape().dim_size(0) ==
+              values_list_in[i].shape().dim_size(0),
           errors::InvalidArgument(
               "Expected size of values to be ",
               indices_list_in[i].shape().dim_size(0), " got ",
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 9dc864ead8..52e52df1b6 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -171,7 +171,7 @@ class Interpreter {
   // read/write access to structure
   TfLiteTensor* tensor(int tensor_index) {
     if (tensor_index >= context_.tensors_size || tensor_index < 0)
-        return nullptr;
+      return nullptr;
     return &context_.tensors[tensor_index];
   }
 
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
index 8ac93bc8c8..3c5c77815d 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <unistd.h>
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 
@@ -134,8 +134,7 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
       float* out = output->data.f;
       for (; in < in_end; in++, out++) *out = std::max(0.f, *in);
       return kTfLiteOk;
-    }
-    break;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently.");
       return kTfLiteError;
@@ -173,8 +172,7 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       float* out = output->data.f;
       for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f);
       return kTfLiteOk;
-    }
-    break;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently.");
       return kTfLiteError;
@@ -192,8 +190,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       float* out = output->data.f;
       for (; in < in_end; in++, out++) *out = std::tanh(*in);
       return kTfLiteOk;
-    }
-    break;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently.");
       return kTfLiteError;
diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
index 0e10a249ab..fb5764f280 100644
--- a/tensorflow/contrib/lite/kernels/add.cc
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -70,10 +70,10 @@ void EvalAddFloat(TfLiteContext* context, TfLiteNode* node,
             GetTensorData<float>(input2), GetTensorDims(input2), \
             output_activation_min, output_activation_max,        \
             GetTensorData<float>(output), GetTensorDims(output))
-    if (kernel_type == kReference) {
-      TF_LITE_ADD(reference_ops);
-    } else {
-      TF_LITE_ADD(optimized_ops);
+  if (kernel_type == kReference) {
+    TF_LITE_ADD(reference_ops);
+  } else {
+    TF_LITE_ADD(optimized_ops);
   }
 #undef TF_LITE_ADD
 }
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc
index 3cee43c68b..a0391e030f 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <unistd.h>
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 
@@ -76,8 +76,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
   output_size_array->data[0] = batch_size;
   output_size_array->data[1] = num_units;
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output,
-                                                   output_size_array));
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size_array));
 
   return kTfLiteOk;
 }
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
index 5ecccb985e..fa7ef525db 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
+++ b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite RNN op.
 
-#include <vector>
 #include <iomanip>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -120,8 +120,7 @@ static float rnn_golden_output[] = {
 
     0.415153,   0.210318,   0,         0,         0,         0,
     0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
-    0.628881,   3.58099,    1.49974,   0
-};
+    0.628881,   3.58099,    1.49974,   0};
 
 class RNNOpModel : public SingleOpModel {
  public:
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
index dcdc5fffad..ef2b542225 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
@@ -123,18 +123,16 @@ TEST(EmbeddingLookupOpTest, SimpleTestSqrtn) {
       [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
   m.Invoke();
 
-  EXPECT_THAT(
-      m.GetOutput(),
-      ElementsAreArray(ArrayFloatNear({
-          1.00, 1.01, 1.10, 1.11, 1.20, 1.21,  // Row 1
-          0.00, 0.00, 0.00, 0.00, 0.00, 0.00,  // -
-          6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f),
-          6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f),
-          7.20f / std::sqrt(20.0f),
-          7.26f /
-              std::sqrt(
-                  20.0f),  // 2 * Row 3 + 4 * Row 0,  // 2 * Row 3 + 4 * Row 0
-      })));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.00, 1.01, 1.10, 1.11, 1.20, 1.21,  // Row 1
+                  0.00, 0.00, 0.00, 0.00, 0.00, 0.00,  // -
+                  6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f),
+                  6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f),
+                  7.20f / std::sqrt(20.0f),
+                  7.26f / std::sqrt(20.0f),  // 2 * Row 3 + 4 * Row 0,  // 2 *
+                                             // Row 3 + 4 * Row 0
+              })));
 }
 
 TEST(EmbeddingLookupOpTest, Indices3DTest) {
diff --git a/tensorflow/contrib/lite/kernels/gather_test.cc b/tensorflow/contrib/lite/kernels/gather_test.cc
index 658d977b8d..cdadbeda18 100644
--- a/tensorflow/contrib/lite/kernels/gather_test.cc
+++ b/tensorflow/contrib/lite/kernels/gather_test.cc
@@ -81,10 +81,8 @@ TEST(GatherOpTest, Test0DIndex) {
   m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
   m.SetPositions({1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(),
-              ElementsAreArray(ArrayFloatNear({0.7, 0.8})));
-  EXPECT_THAT(m.GetOutputShape(),
-              ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({0.7, 0.8})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
 }
 
 TEST(GatherOpTest, Test0DIndexWith0DResult) {
@@ -94,8 +92,7 @@ TEST(GatherOpTest, Test0DIndexWith0DResult) {
   m.SetInputFloat({1.0, 2.0, 3.0});
   m.SetPositions({1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(),
-              ElementsAreArray(ArrayFloatNear({2.0})));
+  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({2.0})));
   EXPECT_TRUE(m.GetOutputShape().empty());
 }
 
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
index cb6038f900..ba0ed5ce06 100644
--- a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
+++ b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
@@ -116,7 +116,10 @@ TEST(HashtableLookupOpTest, Test2DInput) {
                                  1.0, 1.1,  // 1-st item
                              })));
   EXPECT_THAT(m.GetHit(), ElementsAreArray({
-                              1, 0, 1, 1,
+                              1,
+                              0,
+                              1,
+                              1,
                           }));
 }
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
index dea46cc120..6cb556bf45 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
@@ -36,15 +36,11 @@ inline bool TestCPUFeatureNeon() {
 
 #elif __ARM_NEON
 
-inline bool TestCPUFeatureNeon() {
-  return true;
-}
+inline bool TestCPUFeatureNeon() { return true; }
 
 #else
 
-inline bool TestCPUFeatureNeon() {
-  return false;
-}
+inline bool TestCPUFeatureNeon() { return false; }
 
 #endif
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
index 81796e295d..e2c87df80b 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -992,11 +992,11 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
           for (int k = 0; k < 4; k++) {
             acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
           }
-            for (int k = 0; k < 4; k++) {
-              acc[k] = vmaxq_f32(
-                  vdupq_n_f32(output_activation_min),
-                  vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
-            }
+          for (int k = 0; k < 4; k++) {
+            acc[k] = vmaxq_f32(
+                vdupq_n_f32(output_activation_min),
+                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
+          }
           for (int k = 0; k < 4; k++) {
             vst1q_f32(output_ptr + 4 * k, acc[k]);
           }
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
index f21fbf532a..ce3cde7699 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
@@ -39,7 +39,6 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #endif
 
-
 namespace Eigen {
 
 /** SpatialConvolution
@@ -215,13 +214,12 @@ EIGEN_DEVICE_FUNC
   }
   // TODO(yangke): choose() is defined in TensorContraction.h -- consider
   // moving it to somewhere more "common".
-  return
-      input
-          .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
-                                 row_in_stride, col_in_stride, padding_type)
-          .reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims)
-          .reshape(post_contract_dims);
+  return input
+      .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+                             row_in_stride, col_in_stride, padding_type)
+      .reshape(pre_contract_dims)
+      .contract(kernel.reshape(kernel_dims), contract_dims)
+      .reshape(post_contract_dims);
 }
 
 }  // end namespace Eigen
diff --git a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
index 17166715ca..cee3ec6197 100644
--- a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
+++ b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
@@ -243,7 +243,6 @@ class LSTMOpModel : public SingleOpModel {
   int n_output_;
 };
 
-
 TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
   const int n_batch = 1;
   const int n_input = 2;
@@ -282,7 +281,6 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
                        {0},     // projection_bias tensor
                    });
 
-
   lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
                               0.04717243, 0.48944736, -0.38535351,
                               -0.17212132});
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index 4003ed10df..48114e5a40 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -177,9 +177,7 @@ TfLiteRegistration* Register_PAD_GENERIC_OPT() {
   return &r;
 }
 
-TfLiteRegistration* Register_PAD() {
-  return Register_PAD_GENERIC_OPT();
-}
+TfLiteRegistration* Register_PAD() { return Register_PAD_GENERIC_OPT(); }
 
 }  // namespace builtin
 }  // namespace ops
diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc
index 72f705fe42..c69755447d 100644
--- a/tensorflow/contrib/lite/kernels/svdf.cc
+++ b/tensorflow/contrib/lite/kernels/svdf.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <unistd.h>
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 
diff --git a/tensorflow/contrib/lite/kernels/svdf_test.cc b/tensorflow/contrib/lite/kernels/svdf_test.cc
index 4de2ceaf05..0f166dc69b 100644
--- a/tensorflow/contrib/lite/kernels/svdf_test.cc
+++ b/tensorflow/contrib/lite/kernels/svdf_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite SVDF op.
 
-#include <vector>
 #include <iomanip>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
index f5f1ec2cf3..7ce87e4deb 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <unistd.h>
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 
@@ -82,8 +82,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   output_size_array->data[0] = (time_major) ? max_time : batch_size;
   output_size_array->data[1] = (time_major) ? batch_size : max_time;
   output_size_array->data[2] = num_units;
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output,
-                                                   output_size_array));
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size_array));
 
   return kTfLiteOk;
 }
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
index 82c680ec3d..7e32969763 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite Sequential RNN op.
 
-#include <vector>
 #include <iomanip>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -120,8 +120,7 @@ static float rnn_golden_output[] = {
 
     0.415153,   0.210318,   0,         0,         0,         0,
     0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
-    0.628881,   3.58099,    1.49974,   0
-};
+    0.628881,   3.58099,    1.49974,   0};
 
 class UnidirectionalRNNOpModel : public SingleOpModel {
  public:
diff --git a/tensorflow/contrib/lite/toco/tensorflow_util.cc b/tensorflow/contrib/lite/toco/tensorflow_util.cc
index 82e2800ca2..0e7e9c41a0 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_util.cc
+++ b/tensorflow/contrib/lite/toco/tensorflow_util.cc
@@ -51,7 +51,8 @@ void LogDumpGraphDef(int log_level, const string& message,
 BEGIN DUMP OF TENSORFLOW GRAPHDEF (%s)
 There are %d nodes.
 There are %zu different op types:
-)MSG", message, tf_graph.node_size(), ops.size());
+)MSG",
+                      message, tf_graph.node_size(), ops.size());
   for (const auto& op : ops) {
     toco::port::AppendF(&dump, "  %s\n", op);
   }
@@ -63,7 +64,8 @@ PROTO DUMP
 BEGIN NODE: name = %s
   op = %s
   inputs = [
-)MSG", node.name(), node.op());
+)MSG",
+                        node.name(), node.op());
     for (const auto& input : node.input()) {
       toco::port::AppendF(&dump, "    %s\n", input);
     }
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index 391ef87029..2771959970 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -26,6 +26,9 @@ namespace toco {
 
 namespace tflite {
 
+using flatbuffers::FlatBufferBuilder;
+using flatbuffers::Offset;
+using flatbuffers::Vector;
 using ::tflite::Buffer;
 using ::tflite::BuiltinOperator;
 using ::tflite::BuiltinOperator_CUSTOM;
@@ -39,9 +42,6 @@ using ::tflite::Operator;
 using ::tflite::OperatorCode;
 using ::tflite::SubGraph;
 using ::tflite::Tensor;
-using flatbuffers::FlatBufferBuilder;
-using flatbuffers::Offset;
-using flatbuffers::Vector;
 
 namespace {
 
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index e2162e1493..461494fd99 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -144,8 +144,7 @@ class SpaceToBatchND
   }
 
   void ReadOptions(const TfLiteOptions& options,
-                   TocoOperator* op) const override {
-  }
+                   TocoOperator* op) const override {}
 };
 
 class Sub : public BuiltinOperator<SubOperator, ::tflite::SubOptions,
@@ -202,8 +201,7 @@ class BatchToSpaceND
   }
 
   void ReadOptions(const TfLiteOptions& options,
-                   TocoOperator* op) const override {
-  }
+                   TocoOperator* op) const override {}
 };
 
 class Cast : public CustomOperator<CastOperator> {
@@ -452,8 +450,7 @@ class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
   }
 
   void ReadOptions(const TfLiteOptions& options,
-                   TocoOperator* op) const override {
-  }
+                   TocoOperator* op) const override {}
 };
 
 class Reshape
@@ -524,8 +521,7 @@ class Transpose
   }
 
   void ReadOptions(const TfLiteOptions& options,
-                   TocoOperator* op) const override {
-  }
+                   TocoOperator* op) const override {}
 };
 
 class Mean : public BuiltinOperator<MeanOperator, ::tflite::MeanOptions,
diff --git a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
index 39c0d5af45..974fb53749 100644
--- a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
+++ b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
@@ -80,9 +80,9 @@ REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_GPU).HostMemory("out"),
                         BytesLimitOp);
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_SYCL).HostMemory("out"),
-                        BytesLimitOp);
-#endif // TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("BytesLimit").Device(DEVICE_SYCL).HostMemory("out"), BytesLimitOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 // Op that measures the peak memory in bytes.
 class MaxBytesInUseOp : public MemoryStatsOp {
@@ -107,6 +107,6 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("MaxBytesInUse").Device(DEVICE_SYCL).HostMemory("out"),
     MaxBytesInUseOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
index 8d14a3ef04..c2c42b8ed7 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
@@ -24,11 +24,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/distributed_runtime/tensor_coding.h"
 
 namespace tensorflow {
 
@@ -62,7 +62,6 @@ BaseRemoteRendezvous* MPIRendezvousMgr::Create(int64 step_id,
 void MPIRemoteRendezvous::RecvFromRemoteAsync(
     const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args,
     DoneCallback done) {
-
   Status s = Status::OK();
   MPIRequestTensorCall* rendezvous_call = new MPIRequestTensorCall();
 
@@ -103,37 +102,37 @@ void MPIRemoteRendezvous::RecvFromRemoteAsync(
   // Create the function which is called when the Tensor is send by remote
   const int64 temp1 = step_id_;
   rendezvous_call->recv_call_ =
-      [this, parsed, recv_args, done, dst, temp1, rendezvous_call](
-          MPIRecvTensorResponse mpi_response) {
-    Status s;
-    Device* dst_device;
-    if (s.ok()) {
-      s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
-      CHECK(s.ok()) << "Device lookup failed";
-    }
-
-    VLOG(3) << "MPI Received tensor " << parsed.FullKey()
-            << " @ step: " << temp1
-            << " single-send: " << mpi_response.singlesend();
-
-    Tensor val;
-    if (mpi_response.singlesend()) {
-      dst_device->MakeTensorFromProto(mpi_response.response().tensor(),
-                                      recv_args.alloc_attrs, &val);
-    } else {
-      TensorResponse tr;
-      tr.InitAlloc(dst_device, recv_args.alloc_attrs);
-      tr.InitPartial(mpi_response.response());
-      const size_t nBytes = tr.tensor().TotalBytes();
-      void* data = const_cast<void*>(DMAHelper::base(&tr.tensor()));
-      MPI_Status status;
-      MPI_CHECK(MPI_Recv(data, static_cast<int>(nBytes), MPI_BYTE, dst,
-                         TAG_SENDTENSOR2, MPI_COMM_WORLD, &status));
-      val = std::move(tr.tensor());
-    }
-
-    done(s, Args(), recv_args, val, mpi_response.response().is_dead());
-  };
+      [this, parsed, recv_args, done, dst, temp1,
+       rendezvous_call](MPIRecvTensorResponse mpi_response) {
+        Status s;
+        Device* dst_device;
+        if (s.ok()) {
+          s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+          CHECK(s.ok()) << "Device lookup failed";
+        }
+
+        VLOG(3) << "MPI Received tensor " << parsed.FullKey()
+                << " @ step: " << temp1
+                << " single-send: " << mpi_response.singlesend();
+
+        Tensor val;
+        if (mpi_response.singlesend()) {
+          dst_device->MakeTensorFromProto(mpi_response.response().tensor(),
+                                          recv_args.alloc_attrs, &val);
+        } else {
+          TensorResponse tr;
+          tr.InitAlloc(dst_device, recv_args.alloc_attrs);
+          tr.InitPartial(mpi_response.response());
+          const size_t nBytes = tr.tensor().TotalBytes();
+          void* data = const_cast<void*>(DMAHelper::base(&tr.tensor()));
+          MPI_Status status;
+          MPI_CHECK(MPI_Recv(data, static_cast<int>(nBytes), MPI_BYTE, dst,
+                             TAG_SENDTENSOR2, MPI_COMM_WORLD, &status));
+          val = std::move(tr.tensor());
+        }
+
+        done(s, Args(), recv_args, val, mpi_response.response().is_dead());
+      };
 
   MPIRendezvousMgr* mgr =
       reinterpret_cast<MPIRendezvousMgr*>(this->rendezvous_mgr_);
@@ -159,9 +158,11 @@ void MPIRendezvousMgr::AddRequest(RecvTensorRequest request,
   TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed));
 
   MPIRecvTensorCallBack send_cb = [this, mpi_dst, parsed](
-      const Status& status, const Rendezvous::Args& send_args,
-      const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead,
-      MPISendTensorCall* mpi_send_call) {
+                                      const Status& status,
+                                      const Rendezvous::Args& send_args,
+                                      const Rendezvous::Args& recv_args,
+                                      const Tensor& val, bool is_dead,
+                                      MPISendTensorCall* mpi_send_call) {
     // TODO(jbedorf) this should be a loop over max size
     CHECK(mpi_send_call->mRes_.ByteSize() < INT_MAX)
         << "Buffer too large for single transfer";
@@ -194,74 +195,78 @@ void MPIRendezvousMgr::AddRequest(RecvTensorRequest request,
   };
 
   // Wrapper around the read callback to place the callback on our queue
-  Rendezvous::DoneCallback done_cb = [this, parsed, step_id, send_cb](
-      const Status& status, const Rendezvous::Args& send_args,
-      const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) {
-    if (!status.ok()) {
-      CHECK(status.ok()) << "RecvLocalAsync was not ok, key: "
-                         << parsed.FullKey() << " step: " << step_id
-                         << " error message: " << status.error_message();
-      return;
-    }
-
-    VLOG(3) << "MPI Sending tensor " << parsed.FullKey()
-            << " @ step: " << step_id << std::endl;
-
-    auto mpi_send_call = new MPISendTensorCall();
-    mpi_send_call->Init(parsed, step_id, is_dead);
-
-    Device* src_dev = nullptr;
-    Status s = this->worker_env_2->device_mgr->LookupDevice(parsed.src_device,
-                                                            &src_dev);
-    CHECK(s.ok()) << "src device not found";
-
-    // Control if shape and data should be send together or if we can optimize
-    // it in two different transfers, thereby reducing memory copies
-    bool doOptimalTransfer = true;
-    if (!DataTypeCanUseMemcpy(val.dtype())) doOptimalTransfer = false;
-    if (val.TotalBytes() < 1024) doOptimalTransfer = false;
-
-    doOptimalTransfer = doOptimalTransfer && use_optimal_transfer_;
-
-    if (doOptimalTransfer) {
-      // First send the Tensor description and in a follow up transfer the data
-      mpi_send_call->mRes_.mutable_response()->mutable_tensor()->set_dtype(
-          val.dtype());
-      val.shape().AsProto(mpi_send_call->mRes_.mutable_response()
-                              ->mutable_tensor()
-                              ->mutable_tensor_shape());
-      mpi_send_call->mRes_.set_singlesend(false);
-    } else {
-      // Send the Tensor description and data in a single transfer
-      if (src_dev->tensorflow_gpu_device_info() &&
-          (!send_args.alloc_attrs.on_host())) {
-        Notification n;
-        GPUUtil::SetProtoFromGPU(
-            val, src_dev, send_args.device_context,
-            mpi_send_call->mRes_.mutable_response()->mutable_tensor(), is_dead,
-            [&n, &s](const Status& s_) {
-              s = s_;
-              n.Notify();
-            });
-        n.WaitForNotification();
-      } else {
-        val.AsProtoTensorContent(
-            mpi_send_call->mRes_.mutable_response()->mutable_tensor());
-      }
-    }
-
-    std::function<MPISendTensorCall*()> res = std::bind(
-        send_cb, status, send_args, recv_args, val, is_dead, mpi_send_call);
-
-    SendQueueEntry req(parsed.FullKey().ToString().c_str(), std::move(res));
-
-    this->QueueSendRequest(req);
-
-    // Wait for the notification that indicates the tensor has been
-    // successfully transmitted to the remote process. Only needed if we
-    // have not parsed the tensor to proto
-    if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification();
-  };  // done_cb
+  Rendezvous::DoneCallback done_cb =
+      [this, parsed, step_id, send_cb](
+          const Status& status, const Rendezvous::Args& send_args,
+          const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) {
+        if (!status.ok()) {
+          CHECK(status.ok())
+              << "RecvLocalAsync was not ok, key: " << parsed.FullKey()
+              << " step: " << step_id
+              << " error message: " << status.error_message();
+          return;
+        }
+
+        VLOG(3) << "MPI Sending tensor " << parsed.FullKey()
+                << " @ step: " << step_id << std::endl;
+
+        auto mpi_send_call = new MPISendTensorCall();
+        mpi_send_call->Init(parsed, step_id, is_dead);
+
+        Device* src_dev = nullptr;
+        Status s = this->worker_env_2->device_mgr->LookupDevice(
+            parsed.src_device, &src_dev);
+        CHECK(s.ok()) << "src device not found";
+
+        // Control if shape and data should be send together or if we can
+        // optimize it in two different transfers, thereby reducing memory
+        // copies
+        bool doOptimalTransfer = true;
+        if (!DataTypeCanUseMemcpy(val.dtype())) doOptimalTransfer = false;
+        if (val.TotalBytes() < 1024) doOptimalTransfer = false;
+
+        doOptimalTransfer = doOptimalTransfer && use_optimal_transfer_;
+
+        if (doOptimalTransfer) {
+          // First send the Tensor description and in a follow up transfer the
+          // data
+          mpi_send_call->mRes_.mutable_response()->mutable_tensor()->set_dtype(
+              val.dtype());
+          val.shape().AsProto(mpi_send_call->mRes_.mutable_response()
+                                  ->mutable_tensor()
+                                  ->mutable_tensor_shape());
+          mpi_send_call->mRes_.set_singlesend(false);
+        } else {
+          // Send the Tensor description and data in a single transfer
+          if (src_dev->tensorflow_gpu_device_info() &&
+              (!send_args.alloc_attrs.on_host())) {
+            Notification n;
+            GPUUtil::SetProtoFromGPU(
+                val, src_dev, send_args.device_context,
+                mpi_send_call->mRes_.mutable_response()->mutable_tensor(),
+                is_dead, [&n, &s](const Status& s_) {
+                  s = s_;
+                  n.Notify();
+                });
+            n.WaitForNotification();
+          } else {
+            val.AsProtoTensorContent(
+                mpi_send_call->mRes_.mutable_response()->mutable_tensor());
+          }
+        }
+
+        std::function<MPISendTensorCall*()> res = std::bind(
+            send_cb, status, send_args, recv_args, val, is_dead, mpi_send_call);
+
+        SendQueueEntry req(parsed.FullKey().ToString().c_str(), std::move(res));
+
+        this->QueueSendRequest(req);
+
+        // Wait for the notification that indicates the tensor has been
+        // successfully transmitted to the remote process. Only needed if we
+        // have not parsed the tensor to proto
+        if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification();
+      };  // done_cb
 
   worker_env_2->compute_pool->Schedule([this, step_id, parsed, done_cb]() {
     this->RecvLocalAsync(step_id, parsed, done_cb);
@@ -293,9 +298,8 @@ void MPIRendezvousMgr::MPIBackgroundThread() {
     }
 
     // Remove sends that have been completed
-    active_sends.remove_if([](std::unique_ptr<MPISendTensorCall>& i) {
-      return i->IsFinished();
-    });
+    active_sends.remove_if(
+        [](std::unique_ptr<MPISendTensorCall>& i) { return i->IsFinished(); });
 
     // send a Tensor request
     RequestQueueEntry req;
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
index ca42ee2f6d..e665922135 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
@@ -18,12 +18,12 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_MPI
 
-#include <queue>
-#include <thread>
 #include <list>
-#include <string>
-#include <memory>
 #include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <thread>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -160,7 +160,8 @@ class MPIRendezvousMgr : public BaseRendezvousMgr {
  private:
   typedef std::function<MPISendTensorCall*(
       const Status&, const Rendezvous::Args&, const Rendezvous::Args&,
-      const Tensor&, const bool, MPISendTensorCall*)> MPIRecvTensorCallBack;
+      const Tensor&, const bool, MPISendTensorCall*)>
+      MPIRecvTensorCallBack;
 
   typedef std::pair<std::string, std::function<void()>> RequestQueueEntry;
   typedef std::pair<std::string, std::function<MPISendTensorCall*()>>
diff --git a/tensorflow/contrib/mpi/mpi_server_lib.cc b/tensorflow/contrib/mpi/mpi_server_lib.cc
index d585c0565e..a31fa9ce0b 100644
--- a/tensorflow/contrib/mpi/mpi_server_lib.cc
+++ b/tensorflow/contrib/mpi/mpi_server_lib.cc
@@ -22,8 +22,8 @@ limitations under the License.
 
 #include "grpc/support/alloc.h"
 
-#include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/env.h"
 
diff --git a/tensorflow/contrib/mpi/mpi_utils.h b/tensorflow/contrib/mpi/mpi_utils.h
index 45e21f2b25..fa297c28cb 100644
--- a/tensorflow/contrib/mpi/mpi_utils.h
+++ b/tensorflow/contrib/mpi/mpi_utils.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_MPI
 
-#include <string>
 #include <map>
+#include <string>
 #include <vector>
 
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
index 2d5b98022c..8dca90a1e3 100644
--- a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
+++ b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
@@ -35,8 +35,8 @@ limitations under the License.
 
 #define OMPI_SKIP_MPICXX
 #include "third_party/mpi/mpi.h"
-#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h"
 #include "tensorflow/contrib/mpi_collectives/kernels/ring.h"
+#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h"
 
 /*
  * MPI Allreduce and Allgather Ops for TensorFlow.
diff --git a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
index 2b412fac9a..13db6f62f5 100644
--- a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
+++ b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
@@ -75,7 +75,8 @@ class HyperplaneLSHProbesOp : public OpKernel {
                                 num_hyperplanes_per_table, "."));
     OP_REQUIRES(context, num_hyperplanes_per_table <= 30,
                 InvalidArgument("Need num_hyperplanes_per_table <= 30, got ",
-                                num_hyperplanes_per_table, ". "
+                                num_hyperplanes_per_table,
+                                ". "
                                 "If you need more hyperplanes, change this Op"
                                 " to work for larger integer types (int64)."));
 
@@ -88,12 +89,13 @@ class HyperplaneLSHProbesOp : public OpKernel {
                 InvalidArgument("num_probes must be at least 1."));
 
     int expected_num_hyperplanes = num_tables * num_hyperplanes_per_table;
-    OP_REQUIRES(
-        context, products_tensor.dim_size(1) == expected_num_hyperplanes,
-        InvalidArgument("Expected number of hyperplanes is ",
-                        expected_num_hyperplanes, " but received ",
-                        products_tensor.dim_size(1), " inner products per "
-                        "point."));
+    OP_REQUIRES(context,
+                products_tensor.dim_size(1) == expected_num_hyperplanes,
+                InvalidArgument("Expected number of hyperplanes is ",
+                                expected_num_hyperplanes, " but received ",
+                                products_tensor.dim_size(1),
+                                " inner products per "
+                                "point."));
 
     auto products_eigen_tensor = products_tensor.matrix<CoordinateType>();
     ConstMatrixMap products_matrix(products_eigen_tensor.data(),
@@ -116,13 +118,11 @@ class HyperplaneLSHProbesOp : public OpKernel {
     // lschmidt's workstation.
     int64 cost_per_unit = 21 * num_hyperplanes_per_table * num_tables;
     if (num_probes > num_tables) {
-      cost_per_unit += 110 * num_hyperplanes_per_table
-          * (num_probes - num_tables);
+      cost_per_unit +=
+          110 * num_hyperplanes_per_table * (num_probes - num_tables);
     }
     context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
-        batch_size,
-        cost_per_unit,
-        [&](int64 start, int64 end) {
+        batch_size, cost_per_unit, [&](int64 start, int64 end) {
           HyperplaneMultiprobe<CoordinateType, int32> multiprobe(
               num_hyperplanes_per_table, num_tables);
 
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
index 9cee405cef..e18923c8aa 100644
--- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
@@ -14,13 +14,12 @@
 // limitations under the License.
 // =============================================================================
 
-#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h"
+#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
-REGISTER_KERNEL_BUILDER(Name("PeriodicResample")
-                            .Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU),
                         PeriodicResampleOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
index ba410f025d..3ab588c458 100644
--- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
@@ -118,9 +118,9 @@ template <class InputDataT,
           tensorflow::errors::InvalidArgument(
               "periodic_resample expects the size of non-adjustable "
               "dimensions be at least as large as size of input tensor."
-              " Dimension ", i, " input tensor has size ",
-              input_tensor_shape.dim_size(i), ", desired shape has size ",
-              desired_shape[i], "."));
+              " Dimension ",
+              i, " input tensor has size ", input_tensor_shape.dim_size(i),
+              ", desired shape has size ", desired_shape[i], "."));
 
       // target_dimensions[i] = desired_shape(i);
       target_dimensions[i] = desired_shape[i];
diff --git a/tensorflow/contrib/pi_examples/camera/camera.cc b/tensorflow/contrib/pi_examples/camera/camera.cc
index cb20661662..8110185ea8 100644
--- a/tensorflow/contrib/pi_examples/camera/camera.cc
+++ b/tensorflow/contrib/pi_examples/camera/camera.cc
@@ -17,16 +17,16 @@ limitations under the License.
 
 #include <errno.h>
 #include <fcntl.h>
-#include <fstream>
 #include <libv4l2.h>
 #include <linux/videodev2.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/ioctl.h>
-#include <sys/types.h>
-#include <sys/time.h>
 #include <sys/mman.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <fstream>
 #include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
@@ -46,10 +46,10 @@ limitations under the License.
 
 // These are all common classes it's handy to reference with no namespace.
 using tensorflow::Flag;
-using tensorflow::Tensor;
+using tensorflow::int32;
 using tensorflow::Status;
 using tensorflow::string;
-using tensorflow::int32;
+using tensorflow::Tensor;
 
 // Used to store the memory-mapped buffers we use for capture.
 struct CameraBuffer {
diff --git a/tensorflow/contrib/pi_examples/label_image/label_image.cc b/tensorflow/contrib/pi_examples/label_image/label_image.cc
index 0b18045789..c6935a093f 100644
--- a/tensorflow/contrib/pi_examples/label_image/label_image.cc
+++ b/tensorflow/contrib/pi_examples/label_image/label_image.cc
@@ -23,9 +23,9 @@ limitations under the License.
 //
 // Full build instructions are at tensorflow/contrib/pi_examples/README.md.
 
-#include <stdio.h>
 #include <jpeglib.h>
 #include <setjmp.h>
+#include <stdio.h>
 #include <fstream>
 #include <vector>
 
@@ -46,10 +46,10 @@ limitations under the License.
 
 // These are all common classes it's handy to reference with no namespace.
 using tensorflow::Flag;
-using tensorflow::Tensor;
+using tensorflow::int32;
 using tensorflow::Status;
 using tensorflow::string;
-using tensorflow::int32;
+using tensorflow::Tensor;
 
 // Takes a file name, and loads a list of labels from it, one per line, and
 // returns a vector of the strings. It pads with empty strings so the length
@@ -77,23 +77,22 @@ Status ReadLabelsFile(string file_name, std::vector<string>* result,
 // Error handling for JPEG decoding.
 void CatchError(j_common_ptr cinfo) {
   (*cinfo->err->output_message)(cinfo);
-  jmp_buf *jpeg_jmpbuf = reinterpret_cast<jmp_buf *>(cinfo->client_data);
+  jmp_buf* jpeg_jmpbuf = reinterpret_cast<jmp_buf*>(cinfo->client_data);
   jpeg_destroy(cinfo);
   longjmp(*jpeg_jmpbuf, 1);
 }
 
 // Decompresses a JPEG file from disk.
 Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
-		    int* width, int* height, int* channels) {
+                    int* width, int* height, int* channels) {
   struct jpeg_decompress_struct cinfo;
-  FILE * infile;
+  FILE* infile;
   JSAMPARRAY buffer;
   int row_stride;
 
   if ((infile = fopen(file_name.c_str(), "rb")) == NULL) {
     LOG(ERROR) << "Can't open " << file_name;
-    return tensorflow::errors::NotFound("JPEG file ", file_name,
-					" not found");
+    return tensorflow::errors::NotFound("JPEG file ", file_name, " not found");
   }
 
   struct jpeg_error_mgr jerr;
@@ -116,10 +115,11 @@ Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
   data->resize((*height) * (*width) * (*channels));
 
   row_stride = cinfo.output_width * cinfo.output_components;
-  buffer = (*cinfo.mem->alloc_sarray)
-    ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1);
+  buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE,
+                                      row_stride, 1);
   while (cinfo.output_scanline < cinfo.output_height) {
-    tensorflow::uint8* row_address = &((*data)[cinfo.output_scanline * row_stride]);
+    tensorflow::uint8* row_address =
+        &((*data)[cinfo.output_scanline * row_stride]);
     jpeg_read_scanlines(&cinfo, buffer, 1);
     memcpy(row_address, buffer[0], row_stride);
   }
@@ -141,24 +141,25 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
   int image_height;
   int image_channels;
   TF_RETURN_IF_ERROR(LoadJpegFile(file_name, &image_data, &image_width,
-				  &image_height, &image_channels));
-  LOG(INFO) << "Loaded JPEG: " << image_width << "x" << image_height
-	    << "x" << image_channels;
+                                  &image_height, &image_channels));
+  LOG(INFO) << "Loaded JPEG: " << image_width << "x" << image_height << "x"
+            << image_channels;
   const int wanted_channels = 3;
   if (image_channels < wanted_channels) {
-    return tensorflow::errors::FailedPrecondition("Image needs to have at least ",
-						  wanted_channels, " but only has ",
-						  image_channels);
+    return tensorflow::errors::FailedPrecondition(
+        "Image needs to have at least ", wanted_channels, " but only has ",
+        image_channels);
   }
-  // In these loops, we convert the eight-bit data in the image into float, resize
-  // it using bilinear filtering, and scale it numerically to the float range that
-  // the model expects (given by input_mean and input_std).
+  // In these loops, we convert the eight-bit data in the image into float,
+  // resize it using bilinear filtering, and scale it numerically to the float
+  // range that the model expects (given by input_mean and input_std).
   tensorflow::Tensor image_tensor(
-      tensorflow::DT_FLOAT, tensorflow::TensorShape(
-      {1, wanted_height, wanted_width, wanted_channels}));
+      tensorflow::DT_FLOAT,
+      tensorflow::TensorShape(
+          {1, wanted_height, wanted_width, wanted_channels}));
   auto image_tensor_mapped = image_tensor.tensor<float, 4>();
   tensorflow::uint8* in = image_data.data();
-  float *out = image_tensor_mapped.data();
+  float* out = image_tensor_mapped.data();
   const size_t image_rowlen = image_width * image_channels;
   const float width_scale = static_cast<float>(image_width) / wanted_width;
   const float height_scale = static_cast<float>(image_height) / wanted_height;
@@ -166,35 +167,37 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
     const float in_y = y * height_scale;
     const int top_y_index = static_cast<int>(floorf(in_y));
     const int bottom_y_index =
-      std::min(static_cast<int>(ceilf(in_y)), (image_height - 1));
+        std::min(static_cast<int>(ceilf(in_y)), (image_height - 1));
     const float y_lerp = in_y - top_y_index;
     tensorflow::uint8* in_top_row = in + (top_y_index * image_rowlen);
     tensorflow::uint8* in_bottom_row = in + (bottom_y_index * image_rowlen);
-    float *out_row = out + (y * wanted_width * wanted_channels);
+    float* out_row = out + (y * wanted_width * wanted_channels);
     for (int x = 0; x < wanted_width; ++x) {
       const float in_x = x * width_scale;
       const int left_x_index = static_cast<int>(floorf(in_x));
       const int right_x_index =
-	std::min(static_cast<int>(ceilf(in_x)), (image_width - 1));
+          std::min(static_cast<int>(ceilf(in_x)), (image_width - 1));
       tensorflow::uint8* in_top_left_pixel =
-	in_top_row + (left_x_index * wanted_channels);
+          in_top_row + (left_x_index * wanted_channels);
       tensorflow::uint8* in_top_right_pixel =
-	in_top_row + (right_x_index * wanted_channels);
+          in_top_row + (right_x_index * wanted_channels);
       tensorflow::uint8* in_bottom_left_pixel =
-	in_bottom_row + (left_x_index * wanted_channels);
+          in_bottom_row + (left_x_index * wanted_channels);
       tensorflow::uint8* in_bottom_right_pixel =
-	in_bottom_row + (right_x_index * wanted_channels);
+          in_bottom_row + (right_x_index * wanted_channels);
       const float x_lerp = in_x - left_x_index;
-      float *out_pixel = out_row + (x * wanted_channels);
+      float* out_pixel = out_row + (x * wanted_channels);
       for (int c = 0; c < wanted_channels; ++c) {
-	const float top_left((in_top_left_pixel[c] - input_mean) / input_std);
-	const float top_right((in_top_right_pixel[c] - input_mean) / input_std);
-	const float bottom_left((in_bottom_left_pixel[c] - input_mean) / input_std);
-	const float bottom_right((in_bottom_right_pixel[c] - input_mean) / input_std);
-	const float top = top_left + (top_right - top_left) * x_lerp;
-	const float bottom =
-	  bottom_left + (bottom_right - bottom_left) * x_lerp;
-	out_pixel[c] = top + (bottom - top) * y_lerp;
+        const float top_left((in_top_left_pixel[c] - input_mean) / input_std);
+        const float top_right((in_top_right_pixel[c] - input_mean) / input_std);
+        const float bottom_left((in_bottom_left_pixel[c] - input_mean) /
+                                input_std);
+        const float bottom_right((in_bottom_right_pixel[c] - input_mean) /
+                                 input_std);
+        const float top = top_left + (top_right - top_left) * x_lerp;
+        const float bottom =
+            bottom_left + (bottom_right - bottom_left) * x_lerp;
+        out_pixel[c] = top + (bottom - top) * y_lerp;
       }
     }
   }
@@ -233,10 +236,10 @@ Status GetTopLabels(const std::vector<Tensor>& outputs, int how_many_labels,
     scores.push_back(std::pair<int, float>({i, unsorted_scores_flat(i)}));
   }
   std::sort(scores.begin(), scores.end(),
-	    [](const std::pair<int, float> &left,
-	       const std::pair<int, float> &right) {
-	      return left.second > right.second;
-	    });
+            [](const std::pair<int, float>& left,
+               const std::pair<int, float>& right) {
+              return left.second > right.second;
+            });
   scores.resize(how_many_labels);
   Tensor sorted_indices(tensorflow::DT_INT32, {scores.size()});
   Tensor sorted_scores(tensorflow::DT_FLOAT, {scores.size()});
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
index c33804906f..2def4f3f17 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include <algorithm>
 #include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h"
+#include <algorithm>
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
index 9bb1724a2c..d8c0a0631d 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
@@ -16,10 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #define Sum(a, b) ((a) + (b))
 #define Prod(a, b) ((a) * (b))
@@ -58,11 +58,11 @@ inline T negative_infinity() {
 
 }  // namespace reduce_functions
 
-#define CALL_ALL_REDUCEOPS(func, ...)                                          \
-  func(Sum, functor::reduce_functions::zero, ##__VA_ARGS__)                    \
-  func(Prod, functor::reduce_functions::one, ##__VA_ARGS__)                    \
-  func(Max, functor::reduce_functions::negative_infinity, ##__VA_ARGS__)       \
-  func(Min, functor::reduce_functions::infinity, ##__VA_ARGS__)
+#define CALL_ALL_REDUCEOPS(func, ...)                                       \
+  func(Sum, functor::reduce_functions::zero, ##__VA_ARGS__)                 \
+      func(Prod, functor::reduce_functions::one, ##__VA_ARGS__) func(       \
+          Max, functor::reduce_functions::negative_infinity, ##__VA_ARGS__) \
+          func(Min, functor::reduce_functions::infinity, ##__VA_ARGS__)
 
 #define ReduceSliceFunctorReduceop(reduceop, dummy)                         \
   template <typename Device, typename T, typename Index>                    \
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
index 501cddb8c8..9f2be03d71 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
@@ -17,10 +17,10 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
+#include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.cc b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
index e02c1b6a2b..63c72836d7 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
@@ -36,17 +36,12 @@ using GPUDevice = Eigen::GpuDevice;
 namespace functor {
 
 template <typename T>
-struct Resampler2DFunctor<CPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const CPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points){
+struct Resampler2DFunctor<CPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
     const int warp_batch_stride = num_sampling_points * 2;
     const int data_batch_stride = data_height * data_width * data_channels;
     const int output_batch_stride = num_sampling_points * data_channels;
@@ -59,24 +54,19 @@ struct Resampler2DFunctor<CPUDevice, T>{
         // The functions take care of performing the relevant pointer
         // arithmetics abstracting away the low level details in the
         // main loop over samples. Note that data is stored in NHWC format.
-        auto set_output = [&](const int sample_id,
-                              const int channel,
+        auto set_output = [&](const int sample_id, const int channel,
                               const T value) {
-          output[batch_id * output_batch_stride +
-                 sample_id * data_channels +
+          output[batch_id * output_batch_stride + sample_id * data_channels +
                  channel] = value;
         };
 
-        auto get_data_point = [&](const int x,
-                                  const int y,
-                                  const int chan) {
+        auto get_data_point = [&](const int x, const int y, const int chan) {
           const bool point_is_in_range =
               (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
           return point_is_in_range
-                 ? data[batch_id * data_batch_stride +
-                        data_channels * (y * data_width + x) +
-                        chan]
-                 : zero;
+                     ? data[batch_id * data_batch_stride +
+                            data_channels * (y * data_width + x) + chan]
+                     : zero;
         };
 
         for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
@@ -89,8 +79,7 @@ struct Resampler2DFunctor<CPUDevice, T>{
           // The effect is that the sampled signal smoothly goes to 0 outside
           // the original input domain, rather than presenting a jump
           // discontinuity at the image boundaries.
-          if (x > static_cast<T>(-1.0) &&
-              y > static_cast<T>(-1.0) &&
+          if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
               x < static_cast<T>(data_width) &&
               y < static_cast<T>(data_height)) {
             // Precompute floor (f) and ceil (c) values for x and y.
@@ -103,12 +92,10 @@ struct Resampler2DFunctor<CPUDevice, T>{
 
             for (int chan = 0; chan < data_channels; ++chan) {
               const T img_fxfy = dx * dy * get_data_point(fx, fy, chan);
-              const T img_cxcy = (one - dx) * (one - dy) *
-                                   get_data_point(cx, cy, chan);
-              const T img_fxcy = dx * (one - dy) *
-                                   get_data_point(fx, cy, chan);
-              const T img_cxfy = (one - dx) * dy *
-                                   get_data_point(cx, fy, chan);
+              const T img_cxcy =
+                  (one - dx) * (one - dy) * get_data_point(cx, cy, chan);
+              const T img_fxcy = dx * (one - dy) * get_data_point(fx, cy, chan);
+              const T img_cxfy = (one - dx) * dy * get_data_point(cx, fy, chan);
               set_output(sample_id, chan,
                          img_fxfy + img_cxcy + img_fxcy + img_cxfy);
             }
@@ -125,8 +112,8 @@ struct Resampler2DFunctor<CPUDevice, T>{
     // estimate of the cost of each work unit is needed to correctly shard the
     // workload. Shard assumes each cost unit is 1ns, minimum cost per shard
     // being 10us.
-    const int64 cost =  static_cast<int64>(num_sampling_points) *
-        data_channels * 1000;
+    const int64 cost =
+        static_cast<int64>(num_sampling_points) * data_channels * 1000;
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
     ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
                         batch_size, cost, resample_batches);
@@ -138,8 +125,8 @@ struct Resampler2DFunctor<CPUDevice, T>{
 template <typename Device, typename T>
 class ResamplerOp : public ::tensorflow::OpKernel {
  public:
-  explicit ResamplerOp(::tensorflow::OpKernelConstruction* context) :
-      ::tensorflow::OpKernel(context) {}
+  explicit ResamplerOp(::tensorflow::OpKernelConstruction* context)
+      : ::tensorflow::OpKernel(context) {}
 
   void Compute(::tensorflow::OpKernelContext* ctx) override {
     const ::tensorflow::Tensor& data = ctx->input(0);
@@ -158,16 +145,17 @@ class ResamplerOp : public ::tensorflow::OpKernel {
                 ::tensorflow::errors::InvalidArgument(
                     "warp should be at least a matrix, got shape ",
                     warp_shape.DebugString()));
-    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
                 ::tensorflow::errors::Unimplemented(
                     "Only bilinear interpolation is supported, warping "
                     "coordinates must be 2D; warp shape last entry should be "
-                    "2, but shape vector is: ", warp_shape.DebugString()));
+                    "2, but shape vector is: ",
+                    warp_shape.DebugString()));
     OP_REQUIRES(ctx, data_shape.dim_size(0) == warp_shape.dim_size(0),
                 ::tensorflow::errors::InvalidArgument(
                     "Batch size of data and warp tensor must be the same, but "
-                    "input shapes are: ", data_shape.DebugString(), ", ",
-                    warp_shape.DebugString()));
+                    "input shapes are: ",
+                    data_shape.DebugString(), ", ", warp_shape.DebugString()));
     const int batch_size = data_shape.dim_size(0);
     const int data_height = data_shape.dim_size(1);
     const int data_width = data_shape.dim_size(2);
@@ -180,16 +168,10 @@ class ResamplerOp : public ::tensorflow::OpKernel {
 
     // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
     if (num_sampling_points > 0) {
-      functor::Resampler2DFunctor<Device, T>()(ctx,
-                                               ctx->eigen_device<Device>(),
-                                               data.flat<T>().data(),
-                                               warp.flat<T>().data(),
-                                               output->flat<T>().data(),
-                                               batch_size,
-                                               data_height,
-                                               data_width,
-                                               data_channels,
-                                               num_sampling_points);
+      functor::Resampler2DFunctor<Device, T>()(
+          ctx, ctx->eigen_device<Device>(), data.flat<T>().data(),
+          warp.flat<T>().data(), output->flat<T>().data(), batch_size,
+          data_height, data_width, data_channels, num_sampling_points);
     }
   }
 
@@ -197,12 +179,9 @@ class ResamplerOp : public ::tensorflow::OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ResamplerOp);
 };
 
-
-#define REGISTER(TYPE)                       \
-  REGISTER_KERNEL_BUILDER(                   \
-      Name("Resampler")                      \
-          .Device(DEVICE_CPU)  \
-          .TypeConstraint<TYPE>("T"),        \
+#define REGISTER(TYPE)                                                \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Resampler").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       ResamplerOp<CPUDevice, TYPE>);
 
 TF_CALL_half(REGISTER);
@@ -211,40 +190,32 @@ TF_CALL_double(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(TYPE)                                           \
-  REGISTER_KERNEL_BUILDER(Name("Resampler")                      \
-                              .Device(DEVICE_GPU)  \
-                              .TypeConstraint<TYPE>("T"),        \
-                          ResamplerOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE)                                                \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Resampler").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      ResamplerOp<GPUDevice, TYPE>)
 TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
 #undef REGISTER
 #endif  // GOOGLE_CUDA
 
-
 namespace functor {
 
 template <typename T>
-struct ResamplerGrad2DFunctor<CPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const CPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points){
+struct ResamplerGrad2DFunctor<CPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
     // Set gradients to 0, because the kernel incrementally updates the
     // tensor entries by adding partial contributions.
-    const int resampler_output_size = batch_size * num_sampling_points *
-        data_channels;
+    const int resampler_output_size =
+        batch_size * num_sampling_points * data_channels;
     const int grad_warp_size = resampler_output_size / data_channels * 2;
-    const int grad_data_size = data_height * data_width * data_channels *
-        batch_size;
+    const int grad_data_size =
+        data_height * data_width * data_channels * batch_size;
     memset(grad_data, 0, sizeof(T) * grad_data_size);
     memset(grad_warp, 0, sizeof(T) * grad_warp_size);
 
@@ -260,35 +231,29 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
         // The functions take care of performing the relevant pointer
         // arithmetics abstracting away the low level details in the
         // main loop over samples. Note that data is stored in NHWC format.
-        auto get_data_point = [&](const int x,
-                                  const int y,
-                                  const int chan) {
+        auto get_data_point = [&](const int x, const int y, const int chan) {
           const bool point_is_in_range =
-            (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
+              (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
           return point_is_in_range
-                 ? data[batch_id * data_batch_stride +
-                        data_channels * (y * data_width + x) +
-                        chan]
-                 : zero;
+                     ? data[batch_id * data_batch_stride +
+                            data_channels * (y * data_width + x) + chan]
+                     : zero;
         };
 
         auto update_grad_data = [&](const int x, const int y, const int chan,
                                     const T value) {
           const bool point_is_in_range =
               (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
-          if (point_is_in_range){
+          if (point_is_in_range) {
             grad_data[batch_id * data_batch_stride +
-                      data_channels * (y * data_width + x) +
-                      chan] += value;
+                      data_channels * (y * data_width + x) + chan] += value;
           }
         };
 
-        auto update_grad_warp = [&](const int sample_id,
-                                    const int channel,
+        auto update_grad_warp = [&](const int sample_id, const int channel,
                                     const T value) {
-          grad_warp[batch_id * warp_batch_stride +
-                    sample_id * 2 +
-                    channel] += value;
+          grad_warp[batch_id * warp_batch_stride + sample_id * 2 + channel] +=
+              value;
         };
 
         for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
@@ -301,8 +266,7 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
           // The effect is that the sampled signal smoothly goes to 0 outside
           // the original input domain, rather than presenting a jump
           // discontinuity at the image boundaries.
-          if (x > static_cast<T>(-1.0) &&
-              y > static_cast<T>(-1.0) &&
+          if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
               x < static_cast<T>(data_width) &&
               y < static_cast<T>(data_height)) {
             // Precompute floor (f) and ceil (c) values for x and y.
@@ -316,27 +280,25 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
             for (int chan = 0; chan < data_channels; ++chan) {
               const T grad_output_value =
                   grad_output[batch_id * output_batch_stride +
-                              sample_id * data_channels +
-                              chan];
+                              sample_id * data_channels + chan];
               const T img_fxfy = get_data_point(fx, fy, chan);
               const T img_cxcy = get_data_point(cx, cy, chan);
               const T img_fxcy = get_data_point(fx, cy, chan);
               const T img_cxfy = get_data_point(cx, fy, chan);
 
               // Update partial gradients wrt relevant warp field entries
-              update_grad_warp(sample_id, 0,
-                               grad_output_value *
-                                   ((one - dy) * (img_cxcy - img_fxcy) +
-                                    dy * (img_cxfy - img_fxfy)));
+              update_grad_warp(
+                  sample_id, 0,
+                  grad_output_value * ((one - dy) * (img_cxcy - img_fxcy) +
+                                       dy * (img_cxfy - img_fxfy)));
 
-              update_grad_warp(sample_id, 1,
-                               grad_output_value *
-                                   ((one - dx) * (img_cxcy - img_cxfy) +
-                                    dx * (img_fxcy - img_fxfy)));
+              update_grad_warp(
+                  sample_id, 1,
+                  grad_output_value * ((one - dx) * (img_cxcy - img_cxfy) +
+                                       dx * (img_fxcy - img_fxfy)));
 
               // Update partial gradients wrt sampled data
-              update_grad_data(fx, fy, chan,
-                               grad_output_value * dx * dy);
+              update_grad_data(fx, fy, chan, grad_output_value * dx * dy);
               update_grad_data(cx, cy, chan,
                                grad_output_value * (one - dx) * (one - dy));
               update_grad_data(fx, cy, chan,
@@ -355,8 +317,8 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
     // being 10us.
     // TODO(fviola): Check out if there is a better way of doing this.
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
-    const int64 cost =  static_cast<int64>(num_sampling_points) *
-        data_channels * 1000;
+    const int64 cost =
+        static_cast<int64>(num_sampling_points) * data_channels * 1000;
     ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
                         batch_size, cost, update_grads_for_batches);
   }
@@ -364,12 +326,11 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
 
 }  // namespace functor
 
-
 template <typename Device, typename T>
 class ResamplerGradOp : public ::tensorflow::OpKernel {
  public:
-  explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context) :
-      ::tensorflow::OpKernel(context) {}
+  explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context)
+      : ::tensorflow::OpKernel(context) {}
 
   void Compute(::tensorflow::OpKernelContext* ctx) override {
     const ::tensorflow::Tensor& data = ctx->input(0);
@@ -383,7 +344,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
                     "tensor must be a batch of 2d data; data shape should have "
                     "4 entries corresponding to [batch_size, data_height, "
                     "data_width, data_channels], but is: ",
-                data_shape.DebugString()));
+                    data_shape.DebugString()));
     const int batch_size = data_shape.dim_size(0);
     const int data_height = data_shape.dim_size(1);
     const int data_width = data_shape.dim_size(2);
@@ -394,7 +355,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
                 ::tensorflow::errors::InvalidArgument(
                     "warp should be at least a matrix, got shape ",
                     warp_shape.DebugString()));
-    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
                 ::tensorflow::errors::Unimplemented(
                     "Only bilinear interpolation is supported, warping "
                     "coordinates must be 2D; warp shape last entry should be "
@@ -417,18 +378,11 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_output(1, warp.shape(), &grad_warp));
     // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
     if (num_sampling_points > 0) {
-      functor::ResamplerGrad2DFunctor<Device, T>()(ctx,
-                                                   ctx->eigen_device<Device>(),
-                                                   data.flat<T>().data(),
-                                                   warp.flat<T>().data(),
-                                                   grad_output.flat<T>().data(),
-                                                   grad_data->flat<T>().data(),
-                                                   grad_warp->flat<T>().data(),
-                                                   batch_size,
-                                                   data_height,
-                                                   data_width,
-                                                   data_channels,
-                                                   num_sampling_points);
+      functor::ResamplerGrad2DFunctor<Device, T>()(
+          ctx, ctx->eigen_device<Device>(), data.flat<T>().data(),
+          warp.flat<T>().data(), grad_output.flat<T>().data(),
+          grad_data->flat<T>().data(), grad_warp->flat<T>().data(), batch_size,
+          data_height, data_width, data_channels, num_sampling_points);
     }
   }
 
@@ -436,11 +390,9 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ResamplerGradOp);
 };
 
-#define REGISTER(TYPE)                       \
-  REGISTER_KERNEL_BUILDER(                   \
-      Name("ResamplerGrad")                  \
-          .Device(DEVICE_CPU)  \
-          .TypeConstraint<TYPE>("T"),        \
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ResamplerGrad").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       ResamplerGradOp<CPUDevice, TYPE>);
 
 TF_CALL_half(REGISTER);
@@ -449,11 +401,10 @@ TF_CALL_double(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(TYPE)                                           \
-  REGISTER_KERNEL_BUILDER(Name("ResamplerGrad")                  \
-                              .Device(DEVICE_GPU)  \
-                              .TypeConstraint<TYPE>("T"),        \
-                          ResamplerGradOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ResamplerGrad").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      ResamplerGradOp<GPUDevice, TYPE>)
 // Disable half and double precision since atomicAdds are not supported
 // TF_CALL_half(REGISTER);
 // TF_CALL_double(REGISTER);
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.h b/tensorflow/contrib/resampler/kernels/resampler_ops.h
index 85d3676efa..7fe3b9c0df 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.h
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.h
@@ -29,38 +29,25 @@ namespace functor {
 
 // Helper functor for the Resampler Op in 2D
 template <typename Device, typename T>
-struct Resampler2DFunctor{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const Device& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points);
+struct Resampler2DFunctor {
+  void operator()(::tensorflow::OpKernelContext* ctx, const Device& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points);
 };
 
-
 // Helper functor for the Resampler Gradient Op in 2D
 template <typename Device, typename T>
-struct ResamplerGrad2DFunctor{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const Device& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points);
+struct ResamplerGrad2DFunctor {
+  void operator()(::tensorflow::OpKernelContext* ctx, const Device& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points);
 };
 
-
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
index 636847a212..3c07051f68 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
@@ -31,18 +31,15 @@ using GPUDevice = Eigen::GpuDevice;
 
 namespace {
 
-#define GET_DATA_POINT(x, y)                   \
-  data[batch_id * data_batch_stride +          \
-       data_channels * (y * data_width + x) +  \
+#define GET_DATA_POINT(x, y)                                                 \
+  data[batch_id * data_batch_stride + data_channels * (y * data_width + x) + \
        chan]
 
 template <typename T>
 __global__ void Resampler2DKernel(const T* __restrict__ data,
                                   const T* __restrict__ warp,
-                                  T* __restrict__ output,
-                                  const int batch_size,
-                                  const int data_height,
-                                  const int data_width,
+                                  T* __restrict__ output, const int batch_size,
+                                  const int data_height, const int data_width,
                                   const int data_channels,
                                   const int num_sampling_points) {
   const int output_data_size = batch_size * num_sampling_points * data_channels;
@@ -75,10 +72,8 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
     // The effect is that the sampled signal smoothly goes to 0 outside
     // the original input domain, rather than presenting a jump
     // discontinuity at the image boundaries.
-    if (x > static_cast<T>(-1.0) &&
-        y > static_cast<T>(-1.0) &&
-        x < static_cast<T>(data_width) &&
-        y < static_cast<T>(data_height)) {
+    if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
+        x < static_cast<T>(data_width) && y < static_cast<T>(data_height)) {
       // Precompute floor (f) and ceil (c) values for x and y.
       const int fx = std::floor(static_cast<float>(x));
       const int fy = std::floor(static_cast<float>(y));
@@ -87,21 +82,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
       const T dx = static_cast<T>(cx) - x;
       const T dy = static_cast<T>(cy) - y;
 
-      const T img_fxfy = (fx >= 0 && fy >= 0)
-                         ? dx * dy * GET_DATA_POINT(fx, fy)
-                         : zero;
+      const T img_fxfy =
+          (fx >= 0 && fy >= 0) ? dx * dy * GET_DATA_POINT(fx, fy) : zero;
 
       const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
-                         ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy)
-                         : zero;
+                             ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy)
+                             : zero;
 
       const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
-                         ? dx * (one - dy) * GET_DATA_POINT(fx, cy)
-                         : zero;
+                             ? dx * (one - dy) * GET_DATA_POINT(fx, cy)
+                             : zero;
 
       const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
-                         ? (one - dx) * dy * GET_DATA_POINT(cx, fy)
-                         : zero;
+                             ? (one - dx) * dy * GET_DATA_POINT(cx, fy)
+                             : zero;
 
       output[out_index] = img_fxfy + img_cxcy + img_fxcy + img_cxfy;
     } else {
@@ -115,24 +109,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
 namespace functor {
 
 template <typename T>
-struct Resampler2DFunctor<GPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const GPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points) {
-  const int output_data_size = batch_size * num_sampling_points * data_channels;
-  ::tensorflow::CudaLaunchConfig config =
-      ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
-  Resampler2DKernel<T>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          data, warp, output, batch_size, data_height, data_width,
-          data_channels, num_sampling_points);
+struct Resampler2DFunctor<GPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
+    const int output_data_size =
+        batch_size * num_sampling_points * data_channels;
+    ::tensorflow::CudaLaunchConfig config =
+        ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
+    Resampler2DKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            data, warp, output, batch_size, data_height, data_width,
+            data_channels, num_sampling_points);
   }
 };
 
@@ -145,26 +135,20 @@ template struct Resampler2DFunctor<GPUDevice, double>;
 
 namespace {
 
-#define UPDATE_GRAD_DATA_POINT(x, y, v)                  \
-  atomicAdd(grad_data + (batch_id * data_batch_stride +  \
-            data_channels * (y * data_width + x) +       \
-            chan),                                       \
+#define UPDATE_GRAD_DATA_POINT(x, y, v)                                \
+  atomicAdd(grad_data + (batch_id * data_batch_stride +                \
+                         data_channels * (y * data_width + x) + chan), \
             v)
 
-
 template <typename T>
-__global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
-                                      const T* __restrict__ warp,
-                                      const T* __restrict__ grad_output,
-                                      T* __restrict__ grad_data,
-                                      T* __restrict__ grad_warp,
-                                      const int batch_size,
-                                      const int data_height,
-                                      const int data_width,
-                                      const int data_channels,
-                                      const int num_sampling_points) {
-  const int resampler_output_size = batch_size * num_sampling_points *
-      data_channels;
+__global__ void ResamplerGrad2DKernel(
+    const T* __restrict__ data, const T* __restrict__ warp,
+    const T* __restrict__ grad_output, T* __restrict__ grad_data,
+    T* __restrict__ grad_warp, const int batch_size, const int data_height,
+    const int data_width, const int data_channels,
+    const int num_sampling_points) {
+  const int resampler_output_size =
+      batch_size * num_sampling_points * data_channels;
   CUDA_1D_KERNEL_LOOP(index, resampler_output_size) {
     const int out_index = index;
 
@@ -199,10 +183,8 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
     // The effect is that the sampled signal smoothly goes to 0 outside
     // the original input domain, rather than presenting a jump
     // discontinuity at the image boundaries.
-    if (x > static_cast<T>(-1.0) &&
-        y > static_cast<T>(-1.0) &&
-        x < static_cast<T>(data_width) &&
-        y < static_cast<T>(data_height)) {
+    if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
+        x < static_cast<T>(data_width) && y < static_cast<T>(data_height)) {
       // Precompute floor (f) and ceil (c) values for x and y.
       const int fx = std::floor(static_cast<float>(x));
       const int fy = std::floor(static_cast<float>(y));
@@ -211,21 +193,17 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
       const T dx = static_cast<T>(cx) - x;
       const T dy = static_cast<T>(cy) - y;
 
-      const T img_fxfy = (fx >= 0 && fy >= 0)
-                         ? GET_DATA_POINT(fx, fy)
-                         : zero;
+      const T img_fxfy = (fx >= 0 && fy >= 0) ? GET_DATA_POINT(fx, fy) : zero;
 
       const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
-                         ? GET_DATA_POINT(cx, cy)
-                         : zero;
+                             ? GET_DATA_POINT(cx, cy)
+                             : zero;
 
-      const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
-                         ? GET_DATA_POINT(fx, cy)
-                         : zero;
+      const T img_fxcy =
+          (fx >= 0 && cy <= data_height - 1) ? GET_DATA_POINT(fx, cy) : zero;
 
-      const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
-                         ? GET_DATA_POINT(cx, fy)
-                         : zero;
+      const T img_cxfy =
+          (cx <= data_width - 1 && fy >= 0) ? GET_DATA_POINT(cx, fy) : zero;
 
       // Update partial gradients wrt relevant warp field entries
       atomicAdd(grad_warp + warp_id_x,
@@ -241,7 +219,7 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
       }
       if (cx <= data_width - 1 && cy <= data_height - 1) {
         UPDATE_GRAD_DATA_POINT(cx, cy,
-                               grad_output_value  * (one - dx) * (one - dy));
+                               grad_output_value * (one - dx) * (one - dy));
       }
       if (fx >= 0 && cy <= data_height - 1) {
         UPDATE_GRAD_DATA_POINT(fx, cy, grad_output_value * dx * (one - dy));
@@ -261,43 +239,37 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
 namespace functor {
 
 template <typename T>
-struct ResamplerGrad2DFunctor<GPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const GPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points) {
-  // Set gradients to 0, because the kernel incrementally updates the
-  // tensor entries by adding partial contributions.
-  const int grad_warp_size = batch_size * num_sampling_points * 2;
-  const int grad_data_size = batch_size * data_height * data_width *
-      data_channels;
-
-  ::tensorflow::CudaLaunchConfig config =
-     ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
-  ::tensorflow::SetZero
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          grad_warp_size, grad_warp);
-
-  config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
-  ::tensorflow::SetZero
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          grad_data_size, grad_data);
-
-  const int resampler_output_size = batch_size * num_sampling_points *
-      data_channels;
-  config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
-  ResamplerGrad2DKernel<T>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          data, warp, grad_output, grad_data, grad_warp, batch_size,
-          data_height, data_width, data_channels, num_sampling_points);
+struct ResamplerGrad2DFunctor<GPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
+    // Set gradients to 0, because the kernel incrementally updates the
+    // tensor entries by adding partial contributions.
+    const int grad_warp_size = batch_size * num_sampling_points * 2;
+    const int grad_data_size =
+        batch_size * data_height * data_width * data_channels;
+
+    ::tensorflow::CudaLaunchConfig config =
+        ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
+    ::tensorflow::
+        SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            grad_warp_size, grad_warp);
+
+    config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
+    ::tensorflow::
+        SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            grad_data_size, grad_data);
+
+    const int resampler_output_size =
+        batch_size * num_sampling_points * data_channels;
+    config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
+    ResamplerGrad2DKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            data, warp, grad_output, grad_data, grad_warp, batch_size,
+            data_height, data_width, data_channels, num_sampling_points);
   }
 };
 
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
index e62501e9b1..03006dab32 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
@@ -36,11 +36,10 @@ perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
 
 namespace functor {
 template <typename T>
-void TensorCuBlasGemm<T>::operator()(OpKernelContext* ctx,
-                                     bool transa, bool transb, uint64 m,
-                                     uint64 n, uint64 k, T alpha, const T* a,
-                                     int lda, const T* b, int ldb, T beta, T* c,
-                                     int ldc) {
+void TensorCuBlasGemm<T>::operator()(OpKernelContext* ctx, bool transa,
+                                     bool transb, uint64 m, uint64 n, uint64 k,
+                                     T alpha, const T* a, int lda, const T* b,
+                                     int ldb, T beta, T* c, int ldc) {
 #if GOOGLE_CUDA
   perftools::gputools::blas::Transpose trans[] = {
       perftools::gputools::blas::Transpose::kNoTranspose,
diff --git a/tensorflow/contrib/rnn/kernels/gru_ops.cc b/tensorflow/contrib/rnn/kernels/gru_ops.cc
index 0796f82b21..bd3d898fb0 100644
--- a/tensorflow/contrib/rnn/kernels/gru_ops.cc
+++ b/tensorflow/contrib/rnn/kernels/gru_ops.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/contrib/rnn/kernels/gru_ops.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
@@ -61,9 +61,9 @@ class GRUCellBlockOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'w_ru' must be [input_size+cell_size, 2*cell_size]
     OP_REQUIRES(ctx, w_ru_tensor->dim_size(0) == input_size + cell_size,
@@ -82,10 +82,10 @@ class GRUCellBlockOp : public OpKernel {
                     "w_c.dim_size(0) != input_size + cell_size: ",
                     w_c_tensor->dim_size(0), " vs. ", input_size + cell_size));
 
-    OP_REQUIRES(
-        ctx, w_c_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("w_c.dim_size(1) != cell_size: ",
-                                w_c_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, w_c_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "w_c.dim_size(1) != cell_size: ", w_c_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'b_ru' must be [2*cell_size]
     OP_REQUIRES(ctx, b_ru_tensor->dim_size(0) == cell_size * 2,
@@ -97,10 +97,10 @@ class GRUCellBlockOp : public OpKernel {
                 errors::InvalidArgument("Rank of b_ru must be 1",
                                         b_ru_tensor->dims(), " vs. 1", 1));
     // Shape of 'b_c' must be [cell_size]
-    OP_REQUIRES(
-        ctx, b_c_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("b_c.dim_size(0) != cell_size: ",
-                                b_c_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, b_c_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "b_c.dim_size(0) != cell_size: ", b_c_tensor->dim_size(0),
+                    " vs. ", cell_size));
     OP_REQUIRES(ctx, b_c_tensor->dims() == 1,
                 errors::InvalidArgument("Rank of b_c must be 1",
                                         b_c_tensor->dims(), " vs. 1"));
@@ -216,9 +216,9 @@ class GRUBlockCellGradOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'w_ru' must be [input_size+cell_size, 2*cell_size]
     OP_REQUIRES(ctx, w_ru_tensor->dim_size(0) == input_size + cell_size,
@@ -237,10 +237,10 @@ class GRUBlockCellGradOp : public OpKernel {
                     "w_c.dim_size(0) != input_size + cell_size: ",
                     w_c_tensor->dim_size(0), " vs. ", input_size + cell_size));
 
-    OP_REQUIRES(
-        ctx, w_c_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("w_c.dim_size(1) != cell_size: ",
-                                w_c_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, w_c_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "w_c.dim_size(1) != cell_size: ", w_c_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'b_ru' must be [2*cell_size]
     OP_REQUIRES(ctx, b_ru_tensor->dim_size(0) == cell_size * 2,
@@ -253,54 +253,54 @@ class GRUBlockCellGradOp : public OpKernel {
                                         b_ru_tensor->dims(), " vs. 1"));
 
     // Shape of 'b_c' must be [cell_size]
-    OP_REQUIRES(
-        ctx, b_c_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("b_c.dim_size(0) != cell_size: ",
-                                b_c_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, b_c_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "b_c.dim_size(0) != cell_size: ", b_c_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, b_c_tensor->dims() == 1,
                 errors::InvalidArgument("Rank of b_c must be 1 ",
                                         b_c_tensor->dims(), " vs. 1"));
 
     // Shape of 'r' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, r_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("r.dims(0) != batch_size: ",
-                                r_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, r_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("r.dims(1) != cell_size: ",
-                                r_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, r_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "r.dims(0) != batch_size: ", r_tensor->dim_size(0), " vs. ",
+                    batch_size));
+    OP_REQUIRES(ctx, r_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "r.dims(1) != cell_size: ", r_tensor->dim_size(1), " vs. ",
+                    cell_size));
 
     // Shape of 'u' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, u_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("u.dims(0) != batch_size: ",
-                                u_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, u_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("u.dims(1) != cell_size: ",
-                                u_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, u_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "u.dims(0) != batch_size: ", u_tensor->dim_size(0), " vs. ",
+                    batch_size));
+    OP_REQUIRES(ctx, u_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "u.dims(1) != cell_size: ", u_tensor->dim_size(1), " vs. ",
+                    cell_size));
 
     // Shape of 'c' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, c_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("c.dims(0) != batch_size: ",
-                                c_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, c_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("c.dims(1) != cell_size: ",
-                                c_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, c_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "c.dims(0) != batch_size: ", c_tensor->dim_size(0), " vs. ",
+                    batch_size));
+    OP_REQUIRES(ctx, c_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "c.dims(1) != cell_size: ", c_tensor->dim_size(1), " vs. ",
+                    cell_size));
 
     // Shape of 'd_h' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, d_h_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("d_h.dims(0) != batch_size: ",
-                                d_h_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, d_h_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("d_h.dims(1) != cell_size: ",
-                                d_h_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, d_h_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "d_h.dims(0) != batch_size: ", d_h_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, d_h_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "d_h.dims(1) != cell_size: ", d_h_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Create output tensors.
     Tensor* d_x_tensor = nullptr;
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.cc b/tensorflow/contrib/rnn/kernels/lstm_ops.cc
index 941a457fd3..5e7cf0ce84 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops.cc
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops.cc
@@ -281,23 +281,23 @@ class LSTMBlockCellOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size,
                 errors::InvalidArgument(
                     "w.dim_size(0) != input_size + cell_size: ",
                     w_tensor->dim_size(0), " vs. ", input_size + cell_size));
-    OP_REQUIRES(
-        ctx, w_tensor->dim_size(1) == cell_size * 4,
-        errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ",
-                                w_tensor->dim_size(1), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4,
+                errors::InvalidArgument(
+                    "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1),
+                    " vs. ", cell_size * 4));
 
-    OP_REQUIRES(
-        ctx, b_tensor->dim_size(0) == cell_size * 4,
-        errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ",
-                                b_tensor->dim_size(0), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4,
+                errors::InvalidArgument(
+                    "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0),
+                    " vs. ", cell_size * 4));
 
     // Allocate our output tensors.
     Tensor* i_tensor = nullptr;
@@ -484,77 +484,77 @@ class LSTMBlockCellGradOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size,
                 errors::InvalidArgument(
                     "w.dim_size(0) != input_size + cell_size: ",
                     w_tensor->dim_size(0), " vs. ", input_size + cell_size));
-    OP_REQUIRES(
-        ctx, w_tensor->dim_size(1) == cell_size * 4,
-        errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ",
-                                w_tensor->dim_size(1), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4,
+                errors::InvalidArgument(
+                    "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1),
+                    " vs. ", cell_size * 4));
 
-    OP_REQUIRES(
-        ctx, b_tensor->dim_size(0) == cell_size * 4,
-        errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ",
-                                b_tensor->dim_size(0), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4,
+                errors::InvalidArgument(
+                    "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0),
+                    " vs. ", cell_size * 4));
 
-    OP_REQUIRES(
-        ctx, i_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("i.dim_size(0) != batch_size: ",
-                                i_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, i_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("i.dim_size(1) != cell_size: ",
-                                i_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, i_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "i.dim_size(0) != batch_size: ", i_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, i_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "i.dim_size(1) != cell_size: ", i_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, cs_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("cs.dim_size(0) != batch_size: ",
-                                cs_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, cs_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("cs.dim_size(1) != cell_size: ",
-                                cs_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, cs_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "cs.dim_size(0) != batch_size: ", cs_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, cs_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "cs.dim_size(1) != cell_size: ", cs_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, f_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("f.dim_size(0) != batch_size: ",
-                                f_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, f_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("i.dim_size(1) != cell_size: ",
-                                f_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, f_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "f.dim_size(0) != batch_size: ", f_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, f_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "i.dim_size(1) != cell_size: ", f_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, o_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("o.dim_size(0) != batch_size: ",
-                                o_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, o_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("o.dim_size(1) != cell_size: ",
-                                o_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, o_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "o.dim_size(0) != batch_size: ", o_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, o_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "o.dim_size(1) != cell_size: ", o_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, ci_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("ci.dim_size(0) != batch_size: ",
-                                ci_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, ci_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("ci.dim_size(1) != cell_size: ",
-                                ci_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, ci_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "ci.dim_size(0) != batch_size: ", ci_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, ci_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "ci.dim_size(1) != cell_size: ", ci_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, co_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("co.dim_size(0) != batch_size: ",
-                                co_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, co_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("co.dim_size(1) != cell_size: ",
-                                co_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, co_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "co.dim_size(0) != batch_size: ", co_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, co_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "co.dim_size(1) != cell_size: ", co_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, cs_grad_tensor->dim_size(0) == batch_size,
                 errors::InvalidArgument(
@@ -860,9 +860,9 @@ class BlockLSTMOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     const Tensor* w_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor));
@@ -872,46 +872,46 @@ class BlockLSTMOp : public OpKernel {
                 errors::InvalidArgument(
                     "w.dim_size(0) != input_size + cell_size: ",
                     w_tensor->dim_size(0), " vs. ", input_size + cell_size));
-    OP_REQUIRES(
-        ctx, w_tensor->dim_size(1) == cell_size * 4,
-        errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ",
-                                w_tensor->dim_size(1), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4,
+                errors::InvalidArgument(
+                    "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1),
+                    " vs. ", cell_size * 4));
 
     const Tensor* wci_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor));
     OP_REQUIRES(ctx, wci_tensor->dims() == 1,
                 errors::InvalidArgument("wci must be 1D"));
-    OP_REQUIRES(
-        ctx, wci_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("wci.dim_size(0) != cell_size: ",
-                                wci_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, wci_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "wci.dim_size(0) != cell_size: ", wci_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     const Tensor* wcf_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor));
     OP_REQUIRES(ctx, wcf_tensor->dims() == 1,
                 errors::InvalidArgument("wcf must be 1D"));
-    OP_REQUIRES(
-        ctx, wcf_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("wcf.dim_size(0) != cell_size: ",
-                                wcf_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, wcf_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "wcf.dim_size(0) != cell_size: ", wcf_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     const Tensor* wco_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor));
     OP_REQUIRES(ctx, wco_tensor->dims() == 1,
                 errors::InvalidArgument("wco must be 1D"));
-    OP_REQUIRES(
-        ctx, wco_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("wco.dim_size(0) != cell_size: ",
-                                wco_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, wco_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "wco.dim_size(0) != cell_size: ", wco_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     const Tensor* b_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor));
     OP_REQUIRES(ctx, b_tensor->dims() == 1,
                 errors::InvalidArgument("b must be 1D"));
-    OP_REQUIRES(
-        ctx, b_tensor->dim_size(0) == cell_size * 4,
-        errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ",
-                                b_tensor->dim_size(0), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4,
+                errors::InvalidArgument(
+                    "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0),
+                    " vs. ", cell_size * 4));
 
     TensorShape batch_cell_shape({timelen, batch_size, cell_size});
     Tensor* i_out;
@@ -1065,9 +1065,9 @@ class BlockLSTMGradOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor));
     const int64 cell_size = w_tensor->dim_size(1) / 4;
     OP_REQUIRES(ctx, input_size + cell_size == w_tensor->dim_size(0),
-                errors::InvalidArgument("w matrix rows don't match: ",
-                                        input_size + cell_size, " vs. ",
-                                        w_tensor->dim_size(0)));
+                errors::InvalidArgument(
+                    "w matrix rows don't match: ", input_size + cell_size,
+                    " vs. ", w_tensor->dim_size(0)));
 
     const Tensor* wci_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor));
@@ -1193,7 +1193,6 @@ class BlockLSTMGradOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::v(),
                                            batch_cell_shape, &h_grad_tensor));
 
-
     const Device& device = ctx->eigen_device<Device>();
 
     functor::TensorZero<Device, T>()(device, cs_grad_tensor.flat<float>());
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.h b/tensorflow/contrib/rnn/kernels/lstm_ops.h
index bc6b85f3f1..d23cedc234 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops.h
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops.h
@@ -92,7 +92,6 @@ struct TensorZeroPadding {
   }
 };
 
-
 struct LSTMBlockCell {
   LSTMBlockCell(const int batch_size, const int input_size, const int cell_size)
       : batch_size_(batch_size),
diff --git a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
index 544cd163c5..68184b643e 100644
--- a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
+++ b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
@@ -149,8 +149,9 @@ TEST_F(LSTMOpsTest, BlockLSTMGrad_ShapeFn) {
   INFER_ERROR("must be rank 1", op, "?;?;?;?;?;?;?;?;[1,?]" + suffix);
 
   // Output with all input knowns makes known rank outputs.
-  INFER_OK(op, JoinedCopies("?", 18), "[?,?,?];" + JoinedCopies("[?,?]", 3) +
-                                          ";" + JoinedCopies("[?]", 4));
+  INFER_OK(
+      op, JoinedCopies("?", 18),
+      "[?,?,?];" + JoinedCopies("[?,?]", 3) + ";" + JoinedCopies("[?]", 4));
 
   // Output with copies input shapes to output.
   string input = strings::StrCat("?;[?,?,?];", JoinedCopies("[?,?]", 3), ";",
diff --git a/tensorflow/contrib/session_bundle/bundle_shim_test.cc b/tensorflow/contrib/session_bundle/bundle_shim_test.cc
index 72f32a0f55..9a1dd9303f 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim_test.cc
+++ b/tensorflow/contrib/session_bundle/bundle_shim_test.cc
@@ -493,17 +493,15 @@ TEST(BundleShimTest, DefaultAndNamedSignatureWithPredict) {
   ASSERT_FALSE(
       actual_signature_def_predict->second.inputs().find("foo-input") ==
       actual_signature_def_predict->second.inputs().end());
-  EXPECT_EQ("foo-input",
-            actual_signature_def_predict->second.inputs()
-                .find("foo-input")
-                ->second.name());
+  EXPECT_EQ("foo-input", actual_signature_def_predict->second.inputs()
+                             .find("foo-input")
+                             ->second.name());
   ASSERT_FALSE(
       actual_signature_def_predict->second.outputs().find("foo-output") ==
       actual_signature_def_predict->second.outputs().end());
-  EXPECT_EQ("foo-output",
-            actual_signature_def_predict->second.outputs()
-                .find("foo-output")
-                ->second.name());
+  EXPECT_EQ("foo-output", actual_signature_def_predict->second.outputs()
+                              .find("foo-output")
+                              ->second.name());
   EXPECT_EQ(kPredictMethodName,
             actual_signature_def_predict->second.method_name());
 }
diff --git a/tensorflow/contrib/session_bundle/signature.cc b/tensorflow/contrib/session_bundle/signature.cc
index 7133875ad5..ed70a5b91b 100644
--- a/tensorflow/contrib/session_bundle/signature.cc
+++ b/tensorflow/contrib/session_bundle/signature.cc
@@ -38,9 +38,9 @@ namespace {
 Status BatchSizesMatch(const Tensor& input, const Tensor& output) {
   // Ensure the number of outputs match the number of inputs.
   if (input.dim_size(0) != output.dim_size(0)) {
-    return errors::Internal(
-        strings::StrCat("Input batch size did not match output batch size: ",
-                        input.dim_size(0), " vs. ", output.dim_size(0)));
+    return errors::Internal(strings::StrCat(
+        "Input batch size did not match output batch size: ", input.dim_size(0),
+        " vs. ", output.dim_size(0)));
   }
   return Status::OK();
 }
@@ -100,8 +100,8 @@ Status GetNamedClassificationSignature(
   const auto& it = signatures.named_signatures().find(name);
   if (it == signatures.named_signatures().end()) {
     return errors::NotFound(
-        strings::StrCat("Missing signature named \"", name, "\" in: ",
-                        DebugStringIfAvailable(signatures)));
+        strings::StrCat("Missing signature named \"", name,
+                        "\" in: ", DebugStringIfAvailable(signatures)));
   }
   if (!it->second.has_classification_signature()) {
     return errors::FailedPrecondition(
@@ -232,8 +232,8 @@ Status GetNamedSignature(const string& name,
   const auto& it = signatures.named_signatures().find(name);
   if (it == signatures.named_signatures().end()) {
     return errors::NotFound(
-        strings::StrCat("Missing signature named \"", name, "\" in: ",
-                        DebugStringIfAvailable(signatures)));
+        strings::StrCat("Missing signature named \"", name,
+                        "\" in: ", DebugStringIfAvailable(signatures)));
   }
   *signature = it->second;
   return Status::OK();
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
index 76cfb4c9ca..cf0db788a4 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
@@ -99,18 +99,17 @@ class HardRoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
 
     Tensor* output_probability = nullptr;
     TensorShape output_probability_shape;
@@ -125,9 +124,8 @@ class HardRoutingFunction : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, output_probability_shape,
                                             &output_probability));
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, output_path_shape,
-                                            &output_path));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(1, output_path_shape, &output_path));
 
     auto out_probability = output_probability->tensor<float, 2>();
     auto out_path = output_path->tensor<int32, 2>();
@@ -144,12 +142,11 @@ class HardRoutingFunction : public OpKernel {
       out_probability(i, 0) = 1.0;
       out_path(i, 0) = 0;
       for (int j = 0; j < tree_depth_ - 1; j++) {
-        float left_prob = LeftProbability(point,
-                                          tree_parameters_tensor.Slice(j, j+1),
-                                          tree_biases(j),
-                                          num_features);
+        float left_prob =
+            LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1),
+                            tree_biases(j), num_features);
 
-        int32 left_child = 2*node + 1;
+        int32 left_child = 2 * node + 1;
         int32 right_child = left_child + 1;
 
         float dot_product = 0.0;
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc
index 28f50f1a32..f64155fa55 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc
@@ -85,12 +85,9 @@ REGISTER_OP("KFeatureGradient")
 
 class KFeatureGradient : public OpKernel {
  public:
-  explicit KFeatureGradient(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("layer_num",
-                                             &layer_num_));
-    OP_REQUIRES_OK(context, context->GetAttr("random_seed",
-                                             &random_seed_));
+  explicit KFeatureGradient(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("layer_num", &layer_num_));
+    OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -101,14 +98,14 @@ class KFeatureGradient : public OpKernel {
     const Tensor& routing_tensor = context->input(3);
 
     // Extract dimensions from input tensors.
-    const int32 num_data = static_cast<int32>(
-        input_data_tensor.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data_tensor.shape().dim_size(1));
-    const int32 num_nodes = static_cast<int32>(
-        tree_parameters_tensor.shape().dim_size(0));
-    const int32 num_features_per_node = static_cast<int32>(
-        tree_parameters_tensor.shape().dim_size(1));
+    const int32 num_data =
+        static_cast<int32>(input_data_tensor.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data_tensor.shape().dim_size(1));
+    const int32 num_nodes =
+        static_cast<int32>(tree_parameters_tensor.shape().dim_size(0));
+    const int32 num_features_per_node =
+        static_cast<int32>(tree_parameters_tensor.shape().dim_size(1));
 
     // Construct output tensors.
     Tensor* out_routes = nullptr;
@@ -127,12 +124,12 @@ class KFeatureGradient : public OpKernel {
     out_weights_shape.AddDim(num_nodes);
     out_weights_shape.AddDim(num_features_per_node);
 
-    OP_REQUIRES_OK(context, context->allocate_output(
-        0, out_routes_shape, &out_routes));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        1, out_data_shape, &out_data));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        2, out_weights_shape, &out_weights));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, out_routes_shape, &out_routes));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(1, out_data_shape, &out_data));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(2, out_weights_shape, &out_weights));
 
     tensorforest::Initialize(*out_data, 0.0f);
 
@@ -148,18 +145,13 @@ class KFeatureGradient : public OpKernel {
 
     std::vector<int32> feature_set;
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data_tensor.Slice(i, i+1);
+      const Tensor point = input_data_tensor.Slice(i, i + 1);
       feature_set.clear();
 
       // Traverse the tree from the bottom up.
       for (int j = num_nodes - 1; j >= 0; j--) {
-        tensorforest::GetFeatureSet(
-            layer_num_,
-            j,
-            random_seed_,
-            num_features,
-            num_features_per_node,
-            &feature_set);
+        tensorforest::GetFeatureSet(layer_num_, j, random_seed_, num_features,
+                                    num_features_per_node, &feature_set);
 
         // Compute routing gradient.
         // j is a leaf node.
@@ -170,12 +162,8 @@ class KFeatureGradient : public OpKernel {
           int32 right_child = left_child + 1;
 
           float left_prob = LeftProbabilityK(
-              point,
-              feature_set,
-              tree_parameters_tensor.Slice(j, j+1),
-              tree_biases(j),
-              num_features,
-              num_features_per_node);
+              point, feature_set, tree_parameters_tensor.Slice(j, j + 1),
+              tree_biases(j), num_features, num_features_per_node);
 
           float right_prob = 1.0f - left_prob;
 
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc
index 9bc42eb61f..e7cafb144d 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc
@@ -43,7 +43,6 @@ using shape_inference::ShapeHandle;
 using tensorforest::CheckTensorBounds;
 using tensorforest::LeftProbabilityK;
 
-
 // The term 'routing function' is synonymous with 'the probability
 // that an instance is routed to each leaf node.'  It is defined in
 // 'Deep Neural Decision Forests' by Kontschieder et al.
@@ -96,10 +95,8 @@ class KFeatureRoutingFunction : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("max_nodes", &max_nodes_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features_per_node",
                                              &num_features_per_node_));
-    OP_REQUIRES_OK(context, context->GetAttr("layer_num",
-                                             &layer_num_));
-    OP_REQUIRES_OK(context, context->GetAttr("random_seed",
-                                             &random_seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("layer_num", &layer_num_));
+    OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -108,27 +105,25 @@ class KFeatureRoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
 
     Tensor* output_probabilities = nullptr;
     TensorShape output_shape;
     output_shape.AddDim(num_data);
     output_shape.AddDim(max_nodes_);
 
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, output_shape,
-                                            &output_probabilities));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
+                                                     &output_probabilities));
 
     auto out_probs = output_probabilities->tensor<float, 2>();
     const auto tree_biases = tree_biases_tensor.tensor<float, 1>();
@@ -136,30 +131,22 @@ class KFeatureRoutingFunction : public OpKernel {
     // Iteratively compute the probability of reaching each leaf.
     std::vector<int32> feature_set;
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data.Slice(i, i+1);
+      const Tensor point = input_data.Slice(i, i + 1);
 
       out_probs(i, 0) = 1.0f;
 
       for (int j = 0; j < max_nodes_ / 2; j++) {
         feature_set.clear();
-        tensorforest::GetFeatureSet(
-            layer_num_,
-            i,
-            random_seed_,
-            num_features,
-            num_features_per_node_,
-            &feature_set);
-
-        int32 left_child = 2*j + 1;
+        tensorforest::GetFeatureSet(layer_num_, i, random_seed_, num_features,
+                                    num_features_per_node_, &feature_set);
+
+        int32 left_child = 2 * j + 1;
         int32 right_child = left_child + 1;
 
         float prob = out_probs(i, j);
-        float left_prob = LeftProbabilityK(point,
-                                           feature_set,
-                                           tree_parameters_tensor.Slice(j, j+1),
-                                           tree_biases(j),
-                                           num_features,
-                                           num_features_per_node_);
+        float left_prob = LeftProbabilityK(
+            point, feature_set, tree_parameters_tensor.Slice(j, j + 1),
+            tree_biases(j), num_features, num_features_per_node_);
 
         out_probs(i, left_child) = prob * left_prob;
         out_probs(i, right_child) = prob * (1.0f - left_prob);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc
index 4027e732b3..0c2eaabe8f 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc
@@ -90,46 +90,43 @@ class RoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
 
     Tensor* output_probabilities = nullptr;
     TensorShape output_shape;
     output_shape.AddDim(num_data);
     output_shape.AddDim(max_nodes_);
 
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, output_shape,
-                                            &output_probabilities));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
+                                                     &output_probabilities));
 
     auto out_probs = output_probabilities->tensor<float, 2>();
     const auto tree_biases = tree_biases_tensor.tensor<float, 1>();
 
     // Iteratively compute the probability of reaching each leaf.
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data.Slice(i, i+1);
+      const Tensor point = input_data.Slice(i, i + 1);
 
       out_probs(i, 0) = 1.0;
 
       for (int j = 0; j < max_nodes_ / 2; j++) {
-        int32 left_child = 2*j + 1;
+        int32 left_child = 2 * j + 1;
         int32 right_child = left_child + 1;
 
         float prob = out_probs(i, j);
-        float left_prob = LeftProbability(point,
-                                          tree_parameters_tensor.Slice(j, j+1),
-                                          tree_biases(j),
-                                          num_features);
+        float left_prob =
+            LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1),
+                            tree_biases(j), num_features);
 
         out_probs(i, left_child) = prob * left_prob;
         out_probs(i, right_child) = prob * (1.0 - left_prob);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
index 66aa293dc1..c9df09bfda 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
@@ -96,10 +96,9 @@ class StochasticHardRoutingFunction : public OpKernel {
   explicit StochasticHardRoutingFunction(OpKernelConstruction* context)
       : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("tree_depth", &tree_depth_));
-    OP_REQUIRES_OK(context, context->GetAttr("random_seed",
-                                             &random_seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_));
     single_rand_ = std::unique_ptr<random::PhiloxRandom>(
-          new random::PhiloxRandom(random_seed_));
+        new random::PhiloxRandom(random_seed_));
     rng_ = std::unique_ptr<random::SimplePhilox>(
         new random::SimplePhilox(single_rand_.get()));
   }
@@ -111,20 +110,19 @@ class StochasticHardRoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
-    const int32 num_nodes = static_cast<int32>(
-        tree_parameters_tensor.shape().dim_size(0));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
+    const int32 num_nodes =
+        static_cast<int32>(tree_parameters_tensor.shape().dim_size(0));
 
     Tensor* output_probability = nullptr;
     TensorShape output_probability_shape;
@@ -139,9 +137,8 @@ class StochasticHardRoutingFunction : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, output_probability_shape,
                                             &output_probability));
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, output_path_shape,
-                                            &output_path));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(1, output_path_shape, &output_path));
 
     auto out_probability = output_probability->tensor<float, 2>();
     auto out_path = output_path->tensor<int32, 2>();
@@ -150,19 +147,18 @@ class StochasticHardRoutingFunction : public OpKernel {
     // Stochastically traverse the tree to a leaf.
 
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data.Slice(i, i+1);
+      const Tensor point = input_data.Slice(i, i + 1);
 
       int32 node = 0;
       out_probability(i, 0) = 1.0;
       out_path(i, 0) = 0;
       for (int j = 0; j < tree_depth_ - 1; j++) {
-        int32 left_child = 2*node + 1;
+        int32 left_child = 2 * node + 1;
         int32 right_child = left_child + 1;
 
-        float left_prob = LeftProbability(point,
-                                          tree_parameters_tensor.Slice(j, j+1),
-                                          tree_biases(j),
-                                          num_features);
+        float left_prob =
+            LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1),
+                            tree_biases(j), num_features);
 
         if (left_prob < rng_->RandFloat()) {
           CHECK_LT(i, num_data);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
index 0b5afe464f..b0d8b832b5 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
@@ -149,14 +149,14 @@ class StochasticHardRoutingGradient : public OpKernel {
     TensorShape output_bias_shape;
     output_bias_shape.AddDim(num_data);
 
-    OP_REQUIRES_OK(context, context->allocate_output(
-        0, output_routing_shape, &output_routing));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        1, output_data_shape, &output_data));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        2, output_parameters_shape, &output_parameters));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        3, output_bias_shape, &output_bias));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_routing_shape,
+                                                     &output_routing));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(1, output_data_shape, &output_data));
+    OP_REQUIRES_OK(context, context->allocate_output(2, output_parameters_shape,
+                                                     &output_parameters));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(3, output_bias_shape, &output_bias));
 
     tensorforest::Initialize(*output_routing, 0.0);
     tensorforest::Initialize(*output_data, 0.0);
@@ -178,7 +178,7 @@ class StochasticHardRoutingGradient : public OpKernel {
       const Tensor point = input_data.Slice(i, i + 1);
 
       // Traverses the tree from the bottom up.
-      for (int j = tree_depth_-1; j > -1; j--) {
+      for (int j = tree_depth_ - 1; j > -1; j--) {
         int32 node = path(i, j);
 
         CHECK_LT(node, num_nodes);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
index cacad03e27..25825a78a1 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
@@ -64,8 +64,7 @@ REGISTER_OP("UnpackPath")
 
 class UnpackPath : public OpKernel {
  public:
-  explicit UnpackPath(OpKernelConstruction* context)
-      : OpKernel(context) {}
+  explicit UnpackPath(OpKernelConstruction* context) : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     VLOG(1) << "unpack start";
@@ -73,8 +72,8 @@ class UnpackPath : public OpKernel {
     const Tensor& path_values_tensor = context->input(1);
 
     const int32 num_data = static_cast<int32>(path_tensor.shape().dim_size(0));
-    const int32 tree_depth = static_cast<int32>(
-        path_tensor.shape().dim_size(1));
+    const int32 tree_depth =
+        static_cast<int32>(path_tensor.shape().dim_size(1));
 
     const int32 num_nodes = MathUtil::IPow(2, tree_depth) - 1;
 
@@ -107,7 +106,6 @@ class UnpackPath : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("UnpackPath").Device(DEVICE_CPU),
-                        UnpackPath);
+REGISTER_KERNEL_BUILDER(Name("UnpackPath").Device(DEVICE_CPU), UnpackPath);
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc
index c091a73c4e..34388fe1aa 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc
@@ -25,9 +25,7 @@ namespace tensorforest {
 
 using tensorflow::Tensor;
 
-float LeftProbability(const Tensor& point,
-                      const Tensor& weight,
-                      float bias,
+float LeftProbability(const Tensor& point, const Tensor& weight, float bias,
                       int num_features) {
   const auto p = point.unaligned_flat<float>();
   const auto w = weight.unaligned_flat<float>();
@@ -41,11 +39,8 @@ float LeftProbability(const Tensor& point,
   return 1.0 / (1.0 + exp(-dot_product + bias));
 }
 
-float LeftProbabilityK(const Tensor& point,
-                       std::vector<int32> feature_set,
-                       const Tensor& weight,
-                       float bias,
-                       int num_features,
+float LeftProbabilityK(const Tensor& point, std::vector<int32> feature_set,
+                       const Tensor& weight, float bias, int num_features,
                        int k) {
   const auto p = point.unaligned_flat<float>();
   const auto w = weight.unaligned_flat<float>();
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
index c5902184f9..69a0143a4e 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
@@ -24,16 +24,11 @@ namespace tensorflow {
 namespace tensorforest {
 
 // Returns the probability that the point falls to the left.
-float LeftProbability(const Tensor& point,
-                      const Tensor& weight,
-                      float bias,
+float LeftProbability(const Tensor& point, const Tensor& weight, float bias,
                       int num_features);
 
-float LeftProbabilityK(const Tensor& point,
-                       std::vector<int32> feature_set,
-                       const Tensor& weight,
-                       float bias,
-                       int num_features,
+float LeftProbabilityK(const Tensor& point, std::vector<int32> feature_set,
+                       const Tensor& weight, float bias, int num_features,
                        int k);
 
 // Returns a random set of num_features_to_pick features in the
@@ -49,5 +44,3 @@ void GetFeatureSet(int32 tree_num, int32 node_num, int32 random_seed,
 }  // namespace tensorflow
 
 #endif  // LEARNING_LIB_TENSOR_FOREST_HYBRID_CORE_OPS_UTILS_H_
-
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc
index 47b49a379c..b21a917977 100644
--- a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc
@@ -30,15 +30,13 @@ namespace tensorflow {
 
 using tensorforest::CheckTensorBounds;
 
-
 float Convert(const string& in) {
   const std::size_t intval = std::hash<string>()(in);
   return static_cast<float>(intval);
 }
 
-
-void Evaluate(const Tensor& input_data, Tensor output_data,
-              int32 start, int32 end) {
+void Evaluate(const Tensor& input_data, Tensor output_data, int32 start,
+              int32 end) {
   auto out_data = output_data.unaligned_flat<float>();
   const auto in_data = input_data.unaligned_flat<string>();
 
@@ -59,9 +57,8 @@ class ReinterpretStringToFloat : public OpKernel {
     if (!CheckTensorBounds(context, input_data)) return;
 
     Tensor* output_data = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, input_data.shape(),
-                                            &output_data));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(0, input_data.shape(), &output_data));
 
     // Evaluate input data in parallel.
     const int32 num_data = static_cast<int32>(input_data.NumElements());
@@ -73,8 +70,8 @@ class ReinterpretStringToFloat : public OpKernel {
       auto work = [&input_data, output_data, num_data](int64 start, int64 end) {
         CHECK(start <= end);
         CHECK(end <= num_data);
-        Evaluate(input_data, *output_data,
-                 static_cast<int32>(start), static_cast<int32>(end));
+        Evaluate(input_data, *output_data, static_cast<int32>(start),
+                 static_cast<int32>(end));
       };
       Shard(num_threads, worker_threads->workers, num_data, 100, work);
     }
diff --git a/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc b/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc
index dd2a98b08c..60740c2be3 100644
--- a/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc
@@ -22,7 +22,6 @@
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/platform/logging.h"
 
-
 namespace tensorflow {
 
 using tensorforest::CheckTensorBounds;
@@ -38,20 +37,19 @@ class ScatterAddNdim : public OpKernel {
 
     if (indices_tensor.shape().dim_size(0) > 0) {
       OP_REQUIRES(context, indices_tensor.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "indices should be two-dimensional"));
+                  errors::InvalidArgument("indices should be two-dimensional"));
       const int32 delta_dims = deltas_tensor.shape().dims();
       OP_REQUIRES(
           context,
           indices_tensor.shape().dim_size(1) + delta_dims ==
-          input_tensor.shape().dims() + 1,
+              input_tensor.shape().dims() + 1,
           errors::InvalidArgument(
               "Number of indices dimensions should be the same as input "
               "rank."));
       OP_REQUIRES(
           context,
           indices_tensor.shape().dim_size(0) ==
-          deltas_tensor.shape().dim_size(0),
+              deltas_tensor.shape().dim_size(0),
           errors::InvalidArgument(
               "Number of updates should be same as number of indices."));
     } else {
@@ -68,8 +66,8 @@ class ScatterAddNdim : public OpKernel {
     const auto indices = indices_tensor.tensor<int32, 2>();
     const auto deltas = deltas_tensor.unaligned_flat<float>();
 
-    const int32 num_dims = static_cast<int32>(
-        indices_tensor.shape().dim_size(1));
+    const int32 num_dims =
+        static_cast<int32>(indices_tensor.shape().dim_size(1));
 
     // Figure out if indices don't specify a complete position in the
     // input tensor.
@@ -80,10 +78,9 @@ class ScatterAddNdim : public OpKernel {
 
     // Calculate index multipliers.
     std::vector<int32> multipliers;
-    OP_REQUIRES(
-        context, input.size() < std::numeric_limits<int32>::max(),
-        errors::InvalidArgument(
-            "Input must contain less than 2^31 total elements"));
+    OP_REQUIRES(context, input.size() < std::numeric_limits<int32>::max(),
+                errors::InvalidArgument(
+                    "Input must contain less than 2^31 total elements"));
     int32 last_size = static_cast<int32>(input.size());
 
     for (int32 j = 0; j < num_dims; j++) {
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
index 94e12cea5a..44997ec5d6 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
@@ -65,8 +65,8 @@ void GetTwoBest(int max, const std::function<float(int)>& score_fn,
 
 float ClassificationSplitScore(
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits,
-    const Eigen::Tensor<float, 1, Eigen::RowMajor>& rights,
-    int32 num_classes, int i) {
+    const Eigen::Tensor<float, 1, Eigen::RowMajor>& rights, int32 num_classes,
+    int i) {
   Eigen::array<int, 1> offsets;
   // Class counts are stored with the total in [0], so the length of each
   // count vector is num_classes + 1.
@@ -74,7 +74,7 @@ float ClassificationSplitScore(
   Eigen::array<int, 1> extents;
   extents[0] = num_classes;
   return WeightedGiniImpurity(splits.slice(offsets, extents)) +
-      WeightedGiniImpurity(rights.slice(offsets, extents));
+         WeightedGiniImpurity(rights.slice(offsets, extents));
 }
 
 void GetTwoBestClassification(const Tensor& total_counts,
@@ -90,29 +90,28 @@ void GetTwoBestClassification(const Tensor& total_counts,
   // in seg faults, so we have to go with flat views of these tensors.  However,
   // it is still pretty efficient because we put off evaluation until the
   // score is actually returned.
-  const auto tc = total_counts.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto tc =
+      total_counts.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
 
   // TODO(gilberth): See if we can delay evaluation here by templating the
   // arguments to ClassificationSplitScore.
-  const Eigen::Tensor<float, 1, Eigen::RowMajor> splits = split_counts.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
+  const Eigen::Tensor<float, 1, Eigen::RowMajor> splits =
+      split_counts.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
   Eigen::array<int, 1> bcast;
   bcast[0] = num_splits;
   const Eigen::Tensor<float, 1, Eigen::RowMajor> rights =
       tc.broadcast(bcast) - splits;
 
-  std::function<float(int)> score_fn = std::bind(
-      ClassificationSplitScore, splits, rights, num_classes,
-      std::placeholders::_1);
+  std::function<float(int)> score_fn =
+      std::bind(ClassificationSplitScore, splits, rights, num_classes,
+                std::placeholders::_1);
 
   GetTwoBest(num_splits, score_fn, best_score, best_index, second_best_score,
              second_best_index);
 }
 
-int32 BestFeatureClassification(
-    const Tensor& total_counts, const Tensor& split_counts,
-    int32 accumulator) {
+int32 BestFeatureClassification(const Tensor& total_counts,
+                                const Tensor& split_counts, int32 accumulator) {
   float best_score;
   float second_best_score;
   int best_feature_index;
@@ -130,8 +129,7 @@ float RegressionSplitScore(
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits_square,
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& right_sums,
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& right_squares,
-    int32 accumulator,
-    int32 num_regression_dims, int i) {
+    int32 accumulator, int32 num_regression_dims, int i) {
   Eigen::array<int, 1> offsets = {i * num_regression_dims + 1};
   Eigen::array<int, 1> extents = {num_regression_dims - 1};
   float left_count = splits_count_accessor(accumulator, i, 0);
@@ -141,15 +139,15 @@ float RegressionSplitScore(
 
   // Guard against divide-by-zero.
   if (left_count > 0) {
-    score += WeightedVariance(
-        splits_sum.slice(offsets, extents),
-        splits_square.slice(offsets, extents), left_count);
+    score +=
+        WeightedVariance(splits_sum.slice(offsets, extents),
+                         splits_square.slice(offsets, extents), left_count);
   }
 
   if (right_count > 0) {
-    score += WeightedVariance(right_sums.slice(offsets, extents),
-                              right_squares.slice(offsets, extents),
-                              right_count);
+    score +=
+        WeightedVariance(right_sums.slice(offsets, extents),
+                         right_squares.slice(offsets, extents), right_count);
   }
   return score;
 }
@@ -159,20 +157,20 @@ void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares,
                           int32 accumulator, float* best_score, int* best_index,
                           float* second_best_score, int* second_best_index) {
   const int32 num_splits = static_cast<int32>(split_sums.shape().dim_size(1));
-  const int32 num_regression_dims = static_cast<int32>(
-      split_sums.shape().dim_size(2));
+  const int32 num_regression_dims =
+      static_cast<int32>(split_sums.shape().dim_size(2));
   // Ideally, Eigen::Tensor::chip would be best to use here but it results
   // in seg faults, so we have to go with flat views of these tensors.  However,
   // it is still pretty efficient because we put off evaluation until the
   // score is actually returned.
-  const auto tc_sum = total_sums.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
-  const auto tc_square = total_squares.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
-  const auto splits_sum = split_sums.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
-  const auto splits_square = split_squares.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto tc_sum =
+      total_sums.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto tc_square =
+      total_squares.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto splits_sum =
+      split_sums.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto splits_square =
+      split_squares.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
   // Eigen is infuriating to work with, usually resulting in all kinds of
   // unhelpful compiler errors when trying something that seems sane.  This
   // helps us do a simple thing like access the first element (the counts)
@@ -193,10 +191,10 @@ void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares,
              best_score, best_index, second_best_score, second_best_index);
 }
 
-int32 BestFeatureRegression(
-    const Tensor& total_sums, const Tensor& total_squares,
-    const Tensor& split_sums, const Tensor& split_squares,
-    int32 accumulator) {
+int32 BestFeatureRegression(const Tensor& total_sums,
+                            const Tensor& total_squares,
+                            const Tensor& split_sums,
+                            const Tensor& split_squares, int32 accumulator) {
   float best_score;
   float second_best_score;
   int best_feature_index;
@@ -207,10 +205,11 @@ int32 BestFeatureRegression(
   return best_feature_index;
 }
 
-bool BestSplitDominatesRegression(
-    const Tensor& total_sums, const Tensor& total_squares,
-    const Tensor& split_sums, const Tensor& split_squares,
-    int32 accumulator) {
+bool BestSplitDominatesRegression(const Tensor& total_sums,
+                                  const Tensor& total_squares,
+                                  const Tensor& split_sums,
+                                  const Tensor& split_squares,
+                                  int32 accumulator) {
   // TODO(thomaswc): Implement this, probably as part of v3.
   return false;
 }
@@ -599,7 +598,6 @@ bool Decide(float value, float bias, DataColumnTypes type) {
   }
 }
 
-
 void GetParentWeightedMean(float leaf_sum, const float* leaf_data,
                            float parent_sum, const float* parent_data,
                            float valid_leaf_threshold, int num_outputs,
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index dad9df4898..edbac67006 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -45,13 +45,10 @@ const int32 LEAF_NODE = -1;
 const int32 FREE_NODE = -2;
 
 // Used to indicate column types, e.g. categorical vs. float
-enum DataColumnTypes {
-  kDataFloat = 0,
-  kDataCategorical = 1
-};
+enum DataColumnTypes { kDataFloat = 0, kDataCategorical = 1 };
 
 // Calculates the sum of a tensor.
-template<typename T>
+template <typename T>
 T Sum(Tensor counts) {
   Eigen::Tensor<T, 0, Eigen::RowMajor> count_sum =
       counts.unaligned_flat<T>().sum();
@@ -97,7 +94,7 @@ float WeightedGiniImpurity(const T& counts) {
   return RawWeightedGiniImpurity(smoothed);
 }
 
-template<typename T1, typename T2>
+template <typename T1, typename T2>
 float WeightedVariance(const T1& sums, const T2& squares, float count) {
   const auto e_x = sums / count;
   const auto e_x2 = squares / count;
@@ -120,10 +117,11 @@ int32 BestFeatureRegression(const Tensor& total_sums,
 
 // Returns true if the best split's variance is sufficiently smaller than
 // that of the next best split.
-bool BestSplitDominatesRegression(
-    const Tensor& total_sums, const Tensor& total_squares,
-    const Tensor& split_sums, const Tensor& split_squares,
-    int32 accumulator);
+bool BestSplitDominatesRegression(const Tensor& total_sums,
+                                  const Tensor& total_squares,
+                                  const Tensor& split_sums,
+                                  const Tensor& split_squares,
+                                  int32 accumulator);
 
 // Performs booststrap_samples bootstrap samples of the best split's class
 // counts and the second best splits's class counts, and returns true if at
@@ -178,10 +176,8 @@ bool DecideNode(const GetFeatureFnType& get_dense,
 // isn't present in sparse_input_indices.  sparse_input_indices is assumed
 // to be sorted.
 template <typename T1, typename T2>
-float FindSparseValue(
-    const T1& sparse_input_indices,
-    const T2& sparse_input_values,
-    int32 i, int32 j) {
+float FindSparseValue(const T1& sparse_input_indices,
+                      const T2& sparse_input_values, int32 i, int32 j) {
   int32 low = 0;
   int32 high = sparse_input_values.dimension(0);
   while (low < high) {
@@ -273,7 +269,6 @@ int32 GetNumSparseFeatures(const T1& indices, int32 input_index,
 // categorical data, it is value != bias.
 bool Decide(float value, float bias, DataColumnTypes type = kDataFloat);
 
-
 // Returns true if all the splits are initialized. Since they get initialized
 // in order, we can simply infer this from the last split.
 // This should only be called for a single allocator's candidate features
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc
index 7485a695df..0855354550 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc
@@ -44,11 +44,13 @@ TEST(TestWeightedVariance, Basic) {
   Tensor squares = test::AsTensor<float>({29, 12}, {2});
 
   EXPECT_FLOAT_EQ(WeightedVariance(sums.unaligned_flat<float>(),
-                                   squares.unaligned_flat<float>(), 3), 2.0);
+                                   squares.unaligned_flat<float>(), 3),
+                  2.0);
 
   Tensor zero = test::AsTensor<float>({0}, {1});
   EXPECT_FLOAT_EQ(WeightedVariance(zero.unaligned_flat<float>(),
-                                   zero.unaligned_flat<float>(), 1), 0);
+                                   zero.unaligned_flat<float>(), 1),
+                  0);
 }
 
 TEST(TestInitialize, Basic) {
@@ -94,17 +96,16 @@ TEST(BestFeatureClassification, Basic) {
   const int32 num_accumulators = 4;
   const int32 num_splits = 3;
   const int32 num_classes = 4;
-  Tensor totals = test::AsTensor<float>({1, 5, 6, 7,
-                                         0, 0, 0, 0,
-                                         30, 10, 10, 10,      // this one
-                                         -1, -1, -1, -1},
-                                        {num_accumulators, num_classes});
-  Tensor splits = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       30, 10, 10, 10, 10, 0, 0, 10, 19, 5, 6, 8,  // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor totals = test::AsTensor<float>(
+      {1, 5, 6, 7, 0, 0, 0, 0, 30, 10, 10, 10,  // this one
+       -1, -1, -1, -1},
+      {num_accumulators, num_classes});
+  Tensor splits =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  30, 10,
+                             10, 10, 10, 0,  0,  10, 19, 5,  6,  8,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureClassification(totals, splits, 2), 1);
 }
@@ -114,17 +115,16 @@ TEST(BestFeatureClassification, NoWinner) {
   const int32 num_splits = 3;
   const int32 num_classes = 4;
   // When counts are all the same, the most reasonable thing to do is pick 0.
-  Tensor totals = test::AsTensor<float>({1, 5, 6, 7,
-                                         0, 0, 0, 0,
-                                         18, 6, 6, 6,      // this one
-                                         -1, -1, -1, -1},
-                                        {num_accumulators, num_classes});
-  Tensor splits = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       9, 3, 3, 3, 9, 3, 3, 3, 9, 3, 3, 3,     // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor totals =
+      test::AsTensor<float>({1, 5, 6, 7, 0, 0, 0, 0, 18, 6, 6, 6,  // this one
+                             -1, -1, -1, -1},
+                            {num_accumulators, num_classes});
+  Tensor splits =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4, 0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  9, 3,
+                             3,  3,  9,  3,  3,  3,  9,  3,  3,  3,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureClassification(totals, splits, 2), 0);
 }
@@ -133,36 +133,34 @@ TEST(BestFeatureRegression, Basic) {
   const int32 num_accumulators = 4;
   const int32 num_splits = 3;
   const int32 num_classes = 4;
-  Tensor total_sums = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       10, 8, 6, 9,      // this one
-       -1, -1, -1, -1},
-      {num_accumulators, num_classes});
+  Tensor total_sums =
+      test::AsTensor<float>({1, 5, 6, 7, 0, 0, 0, 0, 10, 8, 6, 9,  // this one
+                             -1, -1, -1, -1},
+                            {num_accumulators, num_classes});
   Tensor total_squares = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       100, 50, 40, 45,      // this one
+      {1, 5, 6, 7, 0, 0, 0, 0, 100, 50, 40, 45,  // this one
        -1, -1, -1, -1},
       {num_accumulators, num_classes});
 
-  Tensor split_sums = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       10, 8, 6, 9, 9, 8, 5, 9, 0, 0, 0, 0,      // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor split_sums =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  10, 8,
+                             6,  9,  9,  8,  5,  9,  0,  0,  0,  0,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   // lower the variance by lowering one of the squares just a little.
-  Tensor split_squares = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       100, 50, 40, 45, 100, 50, 40, 43, 0, 0, 0, 0,    // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor split_squares =
+      test::AsTensor<float>(
+          {1,   2,  3,  4,  1,   2,  3,  4,  1,  2,  3,  4,
+           0,   0,  0,  0,  0,   0,  0,  0,  0,  0,  0,  0,
+           100, 50, 40, 45, 100, 50, 40, 43, 0,  0,  0,  0,  // this one
+           -1,  -1, -1, -1, -1,  -1, -1, -1, -1, -1, -1, -1},
+          {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureRegression(total_sums, total_squares, split_sums,
-                                  split_squares, 2), 1);
+                                  split_squares, 2),
+            1);
 }
 
 TEST(BestFeatureRegression, NoWinner) {
@@ -170,37 +168,33 @@ TEST(BestFeatureRegression, NoWinner) {
   const int32 num_splits = 3;
   const int32 num_classes = 4;
   // when counts are all the same, the most reasonable thing to do is pick 0.
-  Tensor total_sums = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       10, 8, 6, 9,      // this one
-       -1, -1, -1, -1},
-      {num_accumulators, num_classes});
+  Tensor total_sums =
+      test::AsTensor<float>({1, 5, 6, 7, 0, 0, 0, 0, 10, 8, 6, 9,  // this one
+                             -1, -1, -1, -1},
+                            {num_accumulators, num_classes});
   Tensor total_squares = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       100, 50, 40, 45,      // this one
+      {1, 5, 6, 7, 0, 0, 0, 0, 100, 50, 40, 45,  // this one
        -1, -1, -1, -1},
       {num_accumulators, num_classes});
 
-  Tensor split_sums = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       10, 8, 6, 9, 10, 8, 6, 9, 10, 8, 6, 9,      // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor split_sums =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  10, 8,
+                             6,  9,  10, 8,  6,  9,  10, 8,  6,  9,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   Tensor split_squares = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       100, 50, 40, 45, 100, 50, 40, 45, 100, 50, 40, 45,    // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+      {1,   2,  3,  4,  1,   2,  3,  4,  1,   2,  3,  4,
+       0,   0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,
+       100, 50, 40, 45, 100, 50, 40, 45, 100, 50, 40, 45,  // this one
+       -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1},
       {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureRegression(total_sums, total_squares, split_sums,
-                                  split_squares, 2), 0);
+                                  split_squares, 2),
+            0);
 }
 
 }  // namespace tensorforest
 }  // namespace tensorflow
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc
index 81e2a1b2a1..f4a7058ddb 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc
@@ -14,8 +14,8 @@
 // =============================================================================
 #include "tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h"
 
-#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
@@ -58,8 +58,7 @@ CandidateGraphRunner::CandidateGraphRunner(
   // Features don't change, store them in a tensor.
   const auto& oblique = split.inequality_left_child_test().oblique();
   const int32 feat_size = oblique.features_size();
-  features_.reset(
-      new Tensor(tensorflow::DT_INT32, TensorShape({feat_size})));
+  features_.reset(new Tensor(tensorflow::DT_INT32, TensorShape({feat_size})));
   auto feat = features_->flat<int32>();
   int i = 0;
   for (const auto& id : oblique.features()) {
@@ -67,10 +66,10 @@ CandidateGraphRunner::CandidateGraphRunner(
   }
 }
 
-void CandidateGraphRunner::RunOp(
-    const string& name, const TensorNameValueList& inputs,
-    const std::vector<string>& output_tensor_names,
-    std::vector<Tensor>* outputs) {
+void CandidateGraphRunner::RunOp(const string& name,
+                                 const TensorNameValueList& inputs,
+                                 const std::vector<string>& output_tensor_names,
+                                 std::vector<Tensor>* outputs) {
   std::vector<string> op_name;
   if (name != kNoOp) {
     op_name.push_back(name);
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
index cced26b903..328af28725 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -26,7 +26,6 @@
 namespace tensorflow {
 namespace tensorforest {
 
-
 // Keep a tree ensemble in memory for efficient evaluation and mutation.
 class DecisionTreeResource : public ResourceBase {
  public:
@@ -35,15 +34,12 @@ class DecisionTreeResource : public ResourceBase {
 
   string DebugString() override {
     return strings::StrCat("DecisionTree[size=",
-                           decision_tree_->decision_tree().nodes_size(),
-                           "]");
+                           decision_tree_->decision_tree().nodes_size(), "]");
   }
 
   void MaybeInitialize();
 
-  const decision_trees::Model& decision_tree() const {
-    return *decision_tree_;
-  }
+  const decision_trees::Model& decision_tree() const { return *decision_tree_; }
 
   decision_trees::Model* mutable_decision_tree() {
     return decision_tree_.get();
@@ -59,9 +55,7 @@ class DecisionTreeResource : public ResourceBase {
 
   // Resets the resource and frees the proto.
   // Caller needs to hold the mutex lock while calling this.
-  void Reset() {
-    decision_tree_.reset(new decision_trees::Model());
-  }
+  void Reset() { decision_tree_.reset(new decision_trees::Model()); }
 
   mutex* get_mutex() { return &mu_; }
 
@@ -84,7 +78,6 @@ class DecisionTreeResource : public ResourceBase {
   std::vector<std::unique_ptr<DecisionNodeEvaluator>> node_evaluators_;
 };
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
index 85ce7b825b..bf2b2aaa3c 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
@@ -22,7 +22,6 @@
 namespace tensorflow {
 namespace tensorforest {
 
-
 // Base class for evaluators of decision nodes that effectively copy proto
 // contents into C++ structures for faster execution.
 class DecisionNodeEvaluator {
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
index 5c49b87443..af5cf72a3c 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
@@ -20,11 +20,11 @@
 namespace tensorflow {
 namespace {
 
+using tensorflow::decision_trees::InequalityTest;
+using tensorflow::decision_trees::MatchingValuesTest;
 using tensorflow::tensorforest::InequalityDecisionNodeEvaluator;
 using tensorflow::tensorforest::MatchingValuesDecisionNodeEvaluator;
 using tensorflow::tensorforest::ObliqueInequalityDecisionNodeEvaluator;
-using tensorflow::decision_trees::InequalityTest;
-using tensorflow::decision_trees::MatchingValuesTest;
 
 TEST(InequalityDecisionNodeEvaluatorTest, TestLessOrEqual) {
   InequalityTest test;
@@ -124,4 +124,3 @@ TEST(ObliqueDecisionNodeEvaluatorTest, Basic) {
 
 }  // namespace
 }  // namespace tensorflow
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
index 0d6712e9e5..eea0be27ca 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
@@ -40,9 +40,7 @@ class FertileStatsResource : public ResourceBase {
     model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(params_);
   }
 
-  string DebugString() override {
-    return "FertileStats";
-  }
+  string DebugString() override { return "FertileStats"; }
 
   void ExtractFromProto(const FertileStats& stats);
 
@@ -50,8 +48,7 @@ class FertileStatsResource : public ResourceBase {
 
   // Resets the resource and frees the proto.
   // Caller needs to hold the mutex lock while calling this.
-  void Reset() {
-  }
+  void Reset() {}
 
   // Reset the stats for a node, but leave the leaf_stats intact.
   void ResetSplitStats(int32 node_id, int32 depth) {
@@ -84,7 +81,6 @@ class FertileStatsResource : public ResourceBase {
   // was found.
   bool BestSplit(int32 node_id, SplitCandidate* best, int32* depth);
 
-
  private:
   mutex mu_;
   std::shared_ptr<LeafModelOperator> model_op_;
@@ -94,7 +90,6 @@ class FertileStatsResource : public ResourceBase {
   void AllocateNode(int32 node_id, int32 depth);
 };
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
index 3ce630e3a9..da600d34ea 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
@@ -20,7 +20,6 @@
 #include "tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h"
 #include "tensorflow/core/lib/random/distribution_sampler.h"
 
-
 namespace tensorflow {
 namespace tensorforest {
 
@@ -454,14 +453,14 @@ void DenseClassificationGrowStats::PackToProto(FertileSlot* slot) const {
     class_stats->add_value()->set_float_value(total_counts_[i]);
   }
 
-  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+  for (int split_num = 0; split_num < num_splits(); ++split_num) {
     auto* cand = slot->add_candidates();
     *cand->mutable_split() = splits_[split_num];
     auto* left_stats = cand->mutable_left_stats()
                            ->mutable_classification()
                            ->mutable_dense_counts();
     for (int i = 0; i < num_outputs_; ++i) {
-       left_stats->add_value()->set_float_value(left_count(split_num, i));
+      left_stats->add_value()->set_float_value(left_count(split_num, i));
     }
   }
 }
@@ -546,7 +545,7 @@ void SparseClassificationGrowStats::PackToProto(FertileSlot* slot) const {
     (*class_stats)[entry.first] = val;
   }
 
-  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+  for (int split_num = 0; split_num < num_splits(); ++split_num) {
     auto* cand = slot->add_candidates();
     *cand->mutable_split() = splits_[split_num];
     auto* left_stats = cand->mutable_left_stats()
@@ -561,8 +560,8 @@ void SparseClassificationGrowStats::PackToProto(FertileSlot* slot) const {
   }
 }
 
-float SparseClassificationGrowStats::GiniScore(
-    int split, float* left_sum, float* right_sum) const {
+float SparseClassificationGrowStats::GiniScore(int split, float* left_sum,
+                                               float* right_sum) const {
   float left_square = 0, right_square = 0;
   *left_sum = 0;
   *right_sum = 0;
@@ -844,12 +843,11 @@ void LeastSquaresRegressionGrowStats::PackToProto(FertileSlot* slot) const {
     total_squares->add_value()->set_float_value(total_sum_squares_[i]);
   }
 
-  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+  for (int split_num = 0; split_num < num_splits(); ++split_num) {
     auto* cand = slot->add_candidates();
     *cand->mutable_split() = splits_[split_num];
-    auto* sums = cand->mutable_left_stats()
-                           ->mutable_regression()
-                           ->mutable_mean_output();
+    auto* sums =
+        cand->mutable_left_stats()->mutable_regression()->mutable_mean_output();
     auto* squares = cand->mutable_left_stats()
                         ->mutable_regression()
                         ->mutable_mean_output_squares();
@@ -891,20 +889,17 @@ float LeastSquaresRegressionGrowStats::SplitVariance(int split) const {
   float total_variance = 0;
   for (int i = 0; i < params_.num_outputs(); ++i) {
     // Left side
-    const float le_x =
-        left_sum(split, i) / left_counts_[split];
+    const float le_x = left_sum(split, i) / left_counts_[split];
 
-    const float le_x2 =
-        left_square(split, i) / left_counts_[split];
+    const float le_x2 = left_square(split, i) / left_counts_[split];
     total_variance += le_x2 - le_x * le_x;
 
     // Right side
     const float re_x = (total_sum_[i] - left_sum(split, i)) /
                        (weight_sum_ - left_counts_[split]);
 
-    const float re_x2 =
-        (total_sum_squares_[i] - left_square(split, i)) /
-        (weight_sum_ - left_counts_[split]);
+    const float re_x2 = (total_sum_squares_[i] - left_square(split, i)) /
+                        (weight_sum_ - left_counts_[split]);
     total_variance += re_x2 - re_x * re_x;
   }
   return total_variance;
@@ -937,8 +932,7 @@ bool LeastSquaresRegressionGrowStats::BestSplit(SplitCandidate* best) const {
   left->set_weight_sum(left_counts_[best_index]);
   auto* left_output_sum = left_reg_stats->mutable_mean_output();
   for (int i = 0; i < num_outputs; ++i) {
-    left_output_sum->add_value()->set_float_value(
-        left_sum(best_index, i));
+    left_output_sum->add_value()->set_float_value(left_sum(best_index, i));
   }
 
   // Right
@@ -947,8 +941,8 @@ bool LeastSquaresRegressionGrowStats::BestSplit(SplitCandidate* best) const {
   right->set_weight_sum(weight_sum_ - left_counts_[best_index]);
   auto* right_output_sum = right_reg_stats->mutable_mean_output();
   for (int i = 0; i < num_outputs; ++i) {
-    right_output_sum->add_value()->set_float_value(
-        total_sum_[i] - left_sum(best_index, i));
+    right_output_sum->add_value()->set_float_value(total_sum_[i] -
+                                                   left_sum(best_index, i));
   }
   return true;
 }
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
index 02c0fc687f..04e6b0a735 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
@@ -73,21 +73,15 @@ class GrowStats {
       const InputTarget* target, int example) {}
   void RemoveSplit(int split_num);
 
-  int num_splits() const {
-    return splits_.size();
-  }
+  int num_splits() const { return splits_.size(); }
 
-  float weight_sum() const {
-    return weight_sum_;
-  }
+  float weight_sum() const { return weight_sum_; }
 
   virtual bool IsInitialized() const {
     return weight_sum_ > 0 || splits_.size() == num_splits_to_consider_;
   }
 
-  int32 depth() const {
-    return depth_;
-  }
+  int32 depth() const { return depth_; }
 
  protected:
   GrowStats(const TensorForestParams& params, int32 depth);
@@ -206,8 +200,8 @@ class ClassificationStats : public GrowStats {
   virtual float left_count(int split, int class_num) const = 0;
   virtual float right_count(int split, int class_num) const = 0;
 
-  virtual void ClassificationAddLeftExample(
-      int split, int64 int_label, float weight) = 0;
+  virtual void ClassificationAddLeftExample(int split, int64 int_label,
+                                            float weight) = 0;
   virtual void ClassificationAddRightExample(int split, int64 int_label,
                                              float weight) {
     // Does nothing by default, but sub-classes can override.
@@ -375,9 +369,7 @@ class SparseClassificationGrowStats : public ClassificationStats {
   SparseClassificationGrowStats(const TensorForestParams& params, int32 depth)
       : ClassificationStats(params, depth) {}
 
-  void Initialize() override {
-    Clear();
-  }
+  void Initialize() override { Clear(); }
 
   void ExtractFromProto(const FertileSlot& slot) override;
   void PackToProto(FertileSlot* slot) const override;
@@ -562,9 +554,9 @@ class LeastSquaresRegressionGrowStats : public GrowStats {
   }
   void RemoveSplitStats(int split_num) override {
     left_sums_.erase(left_sums_.begin() + num_outputs_ * split_num,
-                       left_sums_.begin() + num_outputs_ * (split_num + 1));
+                     left_sums_.begin() + num_outputs_ * (split_num + 1));
     left_squares_.erase(left_squares_.begin() + num_outputs_ * split_num,
-                       left_squares_.begin() + num_outputs_ * (split_num + 1));
+                        left_squares_.begin() + num_outputs_ * (split_num + 1));
     left_counts_.erase(left_counts_.begin() + split_num,
                        left_counts_.begin() + (split_num + 1));
   }
@@ -605,7 +597,6 @@ class LeastSquaresRegressionGrowStats : public GrowStats {
   std::vector<int64> left_counts_;
 };
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc
index ceb58d2ead..26e989928e 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc
@@ -24,21 +24,21 @@
 namespace tensorflow {
 namespace {
 
-using tensorflow::tensorforest::GrowStats;
-using tensorflow::tensorforest::TestableInputTarget;
-using tensorflow::tensorforest::FertileSlot;
+using tensorflow::decision_trees::BinaryNode;
+using tensorflow::decision_trees::FeatureId;
+using tensorflow::decision_trees::InequalityTest;
 using tensorflow::tensorforest::DenseClassificationGrowStats;
-using tensorflow::tensorforest::SparseClassificationGrowStats;
+using tensorflow::tensorforest::FertileSlot;
 using tensorflow::tensorforest::FixedSizeClassStats;
 using tensorflow::tensorforest::FixedSizeSparseClassificationGrowStats;
+using tensorflow::tensorforest::GrowStats;
 using tensorflow::tensorforest::LeastSquaresRegressionGrowStats;
-using tensorflow::tensorforest::TensorForestParams;
+using tensorflow::tensorforest::SparseClassificationGrowStats;
 using tensorflow::tensorforest::SPLIT_FINISH_BASIC;
 using tensorflow::tensorforest::SPLIT_FINISH_DOMINATE_HOEFFDING;
 using tensorflow::tensorforest::SPLIT_PRUNE_HOEFFDING;
-using tensorflow::decision_trees::BinaryNode;
-using tensorflow::decision_trees::InequalityTest;
-using tensorflow::decision_trees::FeatureId;
+using tensorflow::tensorforest::TensorForestParams;
+using tensorflow::tensorforest::TestableInputTarget;
 
 BinaryNode MakeSplit(const string& feat, float val) {
   BinaryNode split;
@@ -52,8 +52,7 @@ BinaryNode MakeSplit(const string& feat, float val) {
   return split;
 }
 
-void RunBatch(GrowStats* stats,
-              const TestableInputTarget* target) {
+void RunBatch(GrowStats* stats, const TestableInputTarget* target) {
   std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
       new tensorflow::tensorforest::TestableDataSet(
           {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, 2));
@@ -102,18 +101,10 @@ class TestableRunningStats : public DenseClassificationGrowStats {
   TestableRunningStats(const TensorForestParams& params, int32 depth)
       : DenseClassificationGrowStats(params, depth) {}
 
-  float test_left_sum(int split) {
-    return get_left_gini()->sum(split);
-  }
-  float test_left_square(int split) {
-    return get_left_gini()->square(split);
-  }
-  float test_right_sum(int split) {
-    return get_right_gini()->sum(split);
-  }
-  float test_right_square(int split) {
-    return get_right_gini()->square(split);
-  }
+  float test_left_sum(int split) { return get_left_gini()->sum(split); }
+  float test_left_square(int split) { return get_left_gini()->square(split); }
+  float test_right_sum(int split) { return get_right_gini()->sum(split); }
+  float test_right_square(int split) { return get_right_gini()->square(split); }
 };
 
 TEST(GrowStatsDenseClassificationTest, BasicRunningStats) {
@@ -166,9 +157,7 @@ class TestableFinishEarly : public DenseClassificationGrowStats {
   int num_times_called_;
 
  protected:
-  void CheckFinishEarlyHoeffding() override {
-    ++num_times_called_;
-  }
+  void CheckFinishEarlyHoeffding() override { ++num_times_called_; }
 };
 
 TEST(GrowStatsDenseClassificationTest, TestFinishEarly) {
@@ -212,7 +201,6 @@ TEST(GrowStatsDenseClassificationTest, TestFinishEarly) {
   ASSERT_EQ(stat->num_times_called_, 9);
 }
 
-
 TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) {
   TensorForestParams params;
   params.set_num_outputs(2);
@@ -224,7 +212,8 @@ TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) {
   finish->set_type(SPLIT_FINISH_BASIC);
   finish->mutable_check_every_steps()->set_constant_value(100);
   params.mutable_pruning_type()->set_type(SPLIT_PRUNE_HOEFFDING);
-  params.mutable_pruning_type()->mutable_prune_every_samples()
+  params.mutable_pruning_type()
+      ->mutable_prune_every_samples()
       ->set_constant_value(1);
 
   // On each iteration, we add two examples, one of class 0 and one
@@ -234,8 +223,8 @@ TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) {
   std::vector<float> weights = {1, 1};
   TestableInputTarget target(labels, weights, 1);
   std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
-      new tensorflow::tensorforest::TestableDataSet(
-          {-1.0, -1.0, 1.0, -1.0}, 2));
+      new tensorflow::tensorforest::TestableDataSet({-1.0, -1.0, 1.0, -1.0},
+                                                    2));
 
   DenseClassificationGrowStats stats(params, 1);
   stats.Initialize();
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
index bf0fb92450..d43884481a 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
@@ -109,10 +109,10 @@ void TensorDataSet::set_input_tensors(const Tensor& dense,
     dense_data_.reset(new DenseStorageType(dense.tensor<float, 2>()));
   }
   if (sparse_indices.shape().dims() == 2) {
-    sparse_indices_.reset(new SparseIndicesStorageType(
-        sparse_indices.tensor<int64, 2>()));
-    sparse_values_.reset(new SparseValuesStorageType(
-        sparse_values.tensor<float, 1>()));
+    sparse_indices_.reset(
+        new SparseIndicesStorageType(sparse_indices.tensor<int64, 2>()));
+    sparse_values_.reset(
+        new SparseValuesStorageType(sparse_values.tensor<float, 1>()));
     sparse_batch_size_ = sparse_shape.tensor<int64, 1>()(0);
   }
   original_dense_tensor_ = dense;
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
index eafad6b591..c544a8c75e 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
@@ -93,9 +93,7 @@ class TensorDataSet {
   // an int32 you can avoid the atoi32.
   virtual float GetExampleValue(int example, int32 feature_id) const;
 
-  int num_features() {
-    return available_features_.size();
-  }
+  int num_features() { return available_features_.size(); }
 
   const Tensor& original_tensor() const { return original_dense_tensor_; }
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
index 44ec09c50e..d4402b6055 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
@@ -79,9 +79,7 @@ class TensorInputTarget : public StoredInputTarget<SingleDimStorageType> {
     return (*target_)(example_index * num_targets_ + target_index);
   }
 
-  const Tensor& original_tensor() const {
-    return original_tensor_;
-  }
+  const Tensor& original_tensor() const { return original_tensor_; }
 
  protected:
   Tensor original_tensor_;
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc
index d43c068e46..83614a2531 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc
@@ -160,6 +160,5 @@ void RegressionLeafModelOperator::ExportModel(
   }
 }
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc
index ffd92c01f9..ab4191809b 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc
@@ -26,19 +26,19 @@ namespace {
 using tensorflow::decision_trees::Leaf;
 using tensorflow::tensorforest::DenseClassificationLeafModelOperator;
 using tensorflow::tensorforest::LeafModelOperator;
-using tensorflow::tensorforest::SparseClassificationLeafModelOperator;
-using tensorflow::tensorforest::SparseOrDenseClassificationLeafModelOperator;
 using tensorflow::tensorforest::LeafStat;
 using tensorflow::tensorforest::RegressionLeafModelOperator;
-using tensorflow::tensorforest::TestableInputTarget;
+using tensorflow::tensorforest::SparseClassificationLeafModelOperator;
+using tensorflow::tensorforest::SparseOrDenseClassificationLeafModelOperator;
 using tensorflow::tensorforest::TensorForestParams;
+using tensorflow::tensorforest::TestableInputTarget;
 
 const int32 kNumClasses = 3;
 
 constexpr char kRegressionStatProto[] =
-  "weight_sum: 3 "
-  "regression { "
-  "mean_output { "
+    "weight_sum: 3 "
+    "regression { "
+    "mean_output { "
     "value { "
     "  float_value: 27 "
     "} "
@@ -48,8 +48,8 @@ constexpr char kRegressionStatProto[] =
     "value { "
     "  float_value: 10 "
     "} "
-  "} "
-  "mean_output_squares { "
+    "} "
+    "mean_output_squares { "
     "value {"
     "  float_value: 245"
     "}"
@@ -59,8 +59,8 @@ constexpr char kRegressionStatProto[] =
     "value {"
     "  float_value: 46"
     "}"
-  "}"
-"}";
+    "}"
+    "}";
 
 void TestClassificationNormalUse(const std::unique_ptr<LeafModelOperator>& op) {
   Leaf l;
@@ -83,7 +83,6 @@ void TestClassificationNormalUse(const std::unique_ptr<LeafModelOperator>& op) {
   EXPECT_FLOAT_EQ(op->GetOutputValue(l, 1), 3.4);
 }
 
-
 TEST(DenseLeafModelOperatorsTest, NormalUse) {
   TensorForestParams params;
   params.set_num_outputs(kNumClasses);
@@ -182,7 +181,7 @@ TEST(SparseLeafModelOperatorsTest, InitWithExisting) {
 
   std::unique_ptr<Leaf> leaf(new Leaf);
 
-  op->ExportModel( *stat, leaf.get());
+  op->ExportModel(*stat, leaf.get());
 
   // Make sure it was initialized correctly.
   EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 0), 1.1);
@@ -194,7 +193,6 @@ TEST(SparseLeafModelOperatorsTest, InitWithExisting) {
   EXPECT_EQ(leaf->sparse_vector().sparse_value().size(), kNumClasses);
 }
 
-
 TEST(RegressionLeafModelOperatorsTest, NormalUse) {
   TensorForestParams params;
   params.set_num_outputs(kNumClasses);
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params.h b/tensorflow/contrib/tensor_forest/kernels/v4/params.h
index b0ed949424..7583e3d040 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/params.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/params.h
@@ -24,7 +24,6 @@ namespace tensorforest {
 // Return the value of the given depth-dependent parameter given a leaf's depth.
 float ResolveParam(const DepthDependentParam& param, int32 depth);
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc
index 801881af13..4010a71006 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc
@@ -71,5 +71,3 @@ TEST(ParamsTest, TestThreshold) {
 }
 
 }  // namespace
-
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc
index cdb1d80a4b..b7b60d0ab8 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc
@@ -52,8 +52,8 @@ std::unique_ptr<GrowStats> SplitCollectionOperator::CreateGrowStats(
           new SparseClassificationGrowStats(params_, depth));
 
     case STATS_LEAST_SQUARES_REGRESSION:
-      return std::unique_ptr<GrowStats>(new LeastSquaresRegressionGrowStats(
-          params_, depth));
+      return std::unique_ptr<GrowStats>(
+          new LeastSquaresRegressionGrowStats(params_, depth));
 
     case STATS_FIXED_SIZE_SPARSE_GINI:
       return std::unique_ptr<GrowStats>(
@@ -136,8 +136,7 @@ void SplitCollectionOperator::CreateAndInitializeCandidateWithExample(
   stats_.at(node_id)->AddSplit(split, input_data, target, example);
 }
 
-bool SplitCollectionOperator::BestSplit(int32 node_id,
-                                        SplitCandidate* best,
+bool SplitCollectionOperator::BestSplit(int32 node_id, SplitCandidate* best,
                                         int32* depth) const {
   auto* slot = stats_.at(node_id).get();
   *depth = slot->depth();
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
index ad52f89fad..c606ff98c6 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
@@ -71,9 +71,7 @@ class SplitCollectionOperator {
   }
 
   // Perform any necessary cleanup for any tracked state for the slot.
-  virtual void ClearSlot(int32 node_id) {
-    stats_.erase(node_id);
-  }
+  virtual void ClearSlot(int32 node_id) { stats_.erase(node_id); }
 
   // Return true if slot is fully initialized.
   virtual bool IsInitialized(int32 node_id) const;
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc
index 0bec198e97..c749fbe69e 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc
@@ -32,9 +32,9 @@ namespace tensorforest {
 //   smoothed_sum = stats.sum() + #_classes
 float GiniImpurity(const LeafStat& stats, int32 num_classes) {
   const float smoothed_sum = num_classes + stats.weight_sum();
-  return 1.0 - (
-      (stats.classification().gini().square()
-       + 2 * stats.weight_sum() + num_classes) / (smoothed_sum * smoothed_sum));
+  return 1.0 - ((stats.classification().gini().square() +
+                 2 * stats.weight_sum() + num_classes) /
+                (smoothed_sum * smoothed_sum));
 }
 
 float WeightedGiniImpurity(const LeafStat& stats, int32 num_classes) {
@@ -46,21 +46,20 @@ void UpdateGini(LeafStat* stats, float old_val, float weight) {
   // Equivalent to stats->square() - old_val * old_val + new_val * new_val,
   // (for new_val = old_val + weight), but more numerically stable.
   stats->mutable_classification()->mutable_gini()->set_square(
-      stats->classification().gini().square()
-      + weight * weight + 2 * old_val * weight);
+      stats->classification().gini().square() + weight * weight +
+      2 * old_val * weight);
 }
 
-
 float Variance(const LeafStat& stats, int output) {
   if (stats.weight_sum() == 0) {
     return 0;
   }
   const float e_x =
-      stats.regression().mean_output().value(output).float_value()
-      / stats.weight_sum();
+      stats.regression().mean_output().value(output).float_value() /
+      stats.weight_sum();
   const auto e_x2 =
-      stats.regression().mean_output_squares().value(output).float_value()
-      / stats.weight_sum();
+      stats.regression().mean_output_squares().value(output).float_value() /
+      stats.weight_sum();
   return e_x2 - e_x * e_x;
 }
 
@@ -75,8 +74,7 @@ float TotalVariance(const LeafStat& stats) {
 float SmoothedGini(float sum, float square, int num_classes) {
   // See comments for GiniImpurity above.
   const float smoothed_sum = num_classes + sum;
-  return 1.0 -
-         (square + 2 * sum + num_classes) / (smoothed_sum * smoothed_sum);
+  return 1.0 - (square + 2 * sum + num_classes) / (smoothed_sum * smoothed_sum);
 }
 
 float WeightedSmoothedGini(float sum, float square, int num_classes) {
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
index 289c81e9d5..38deb3e3cd 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
@@ -27,9 +27,7 @@ class TestableInputTarget : public StoredInputTarget<std::vector<float>> {
       : StoredInputTarget(new std::vector<float>(t), new std::vector<float>(w),
                           num_t) {}
 
-  int NumItems() const {
-    return target_->size();
-  }
+  int NumItems() const { return target_->size(); }
 
   int32 GetTargetAsClassIndex(int example_index,
                               int target_index) const override {
@@ -51,7 +49,6 @@ class TestableInputTarget : public StoredInputTarget<std::vector<float>> {
   }
 };
 
-
 class TestableDataSet : public TensorDataSet {
  public:
   TestableDataSet(const std::vector<float>& data, int num_features)
diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
index 2992a61ea8..9675428e56 100644
--- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
+++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
@@ -142,9 +142,9 @@ Status ConvertConstantsToImmutable(const string& in_graph_filename,
   const auto load_graph_status =
       ReadBinaryProto(default_env, in_graph_filename, &graph_def);
   if (!load_graph_status.ok()) {
-    return tensorflow::errors::NotFound("Failed to load graph at '",
-                                        in_graph_filename, "' : ",
-                                        load_graph_status.error_message());
+    return tensorflow::errors::NotFound(
+        "Failed to load graph at '", in_graph_filename,
+        "' : ", load_graph_status.error_message());
   }
 
   NodeConverter node_converter;
diff --git a/tensorflow/contrib/util/inspect_checkpoint.cc b/tensorflow/contrib/util/inspect_checkpoint.cc
index 39088aeaad..9b578ceb07 100644
--- a/tensorflow/contrib/util/inspect_checkpoint.cc
+++ b/tensorflow/contrib/util/inspect_checkpoint.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/tensor_slice_reader.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index 47ed83f521..1a0b5028fe 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -49,8 +49,8 @@ VerbsServer::~VerbsServer() {
 Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
                                         GrpcChannelCache** channel_cache) {
   string name_prefix =
-      strings::StrCat("/job:", server_def.job_name(), "/replica:0", "/task:",
-                      server_def.task_index());
+      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
+                      "/task:", server_def.task_index());
 
   GrpcChannelSpec channel_spec;
   TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
-- 
GitLab


From b970652bf714cbf676fdd84256cb128afc2b1306 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 11:09:06 -0800
Subject: [PATCH 395/966] Add py2tf to contrib_py.

PiperOrigin-RevId: 183860192
---
 tensorflow/contrib/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index f1e54432fa..efb6449bb0 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -76,6 +76,7 @@ py_library(
         "//tensorflow/contrib/predictor",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
+        "//tensorflow/contrib/py2tf",
         "//tensorflow/contrib/receptive_field:receptive_field_py",
         "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py",
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
-- 
GitLab


From 29d24237b0e29d83478182ad219da478ee2135c1 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 30 Jan 2018 11:18:03 -0800
Subject: [PATCH 396/966] Make loss_ops_test.py work with C API enabled.

PiperOrigin-RevId: 183861779
---
 tensorflow/contrib/losses/python/losses/loss_ops_test.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/losses/python/losses/loss_ops_test.py b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
index 9d0f95e6f3..1417772e04 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops_test.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -274,6 +275,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertAlmostEqual(np.average(weights) * 10.0, loss, 3)
 
 
+@test_util.with_c_api
 class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
 
   def testNoneWeightRaisesValueError(self):
@@ -471,7 +473,11 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       labels = constant_op.constant([[0, 1], [2, 3]])
       weights = constant_op.constant([1.2, 3.4, 5.6, 7.8])
 
-      with self.assertRaises(errors_impl.InvalidArgumentError):
+      if ops._USE_C_API:
+        error_type = ValueError
+      else:
+        error_type = errors_impl.InvalidArgumentError
+      with self.assertRaises(error_type):
         loss_ops.sparse_softmax_cross_entropy(
             logits, labels, weights=weights).eval()
 
-- 
GitLab


From a694f0ca2682f53f89a75707ad1f6c2ddffeacde Mon Sep 17 00:00:00 2001
From: Chris Leary <leary@google.com>
Date: Tue, 30 Jan 2018 11:22:12 -0800
Subject: [PATCH 397/966] [XLA] Fix tools broken by cl/183837856

PiperOrigin-RevId: 183862522
---
 tensorflow/compiler/xla/BUILD                   |  4 ----
 .../compiler/xla/executable_run_options.cc      | 17 -----------------
 .../compiler/xla/executable_run_options.h       |  7 -------
 .../dumped_computation_to_operation_list.cc     |  8 +++++---
 .../xla/tools/dumped_computation_to_text.cc     |  9 ++++++---
 5 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 38e39afdc0..c22fd37129 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -444,10 +444,6 @@ cc_library(
     srcs = ["executable_run_options.cc"],
     hdrs = ["executable_run_options.h"],
     visibility = ["//visibility:public"],
-    deps = [
-        ":types",
-        "//tensorflow/core:lib",
-    ],
 )
 
 cc_library(
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index f8bb8e52c7..392ad9010a 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -15,8 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/executable_run_options.h"
 
-#include "tensorflow/core/lib/strings/stringprintf.h"
-
 namespace xla {
 
 ExecutableRunOptions& ExecutableRunOptions::set_device_ordinal(
@@ -89,19 +87,4 @@ const DeviceAssignment* ExecutableRunOptions::device_assignment() const {
   return device_assignment_;
 }
 
-string ExecutableRunOptions::ToString() const {
-  return tensorflow::strings::Printf(
-      "ExecutableRunOptions{allocator=%p, device_ordinal=%d, "
-      "device_assignment=%p, stream=%p, inter_op_thread_pool=%p, "
-      "intra_op_thread_pool=%p, execution_profile=%p}",
-      allocator_, device_ordinal_, device_assignment_, stream_,
-      inter_op_thread_pool_, intra_op_thread_pool_, execution_profile_);
-}
-
-std::ostream& operator<<(std::ostream& out,
-                         const ExecutableRunOptions& options) {
-  out << options.ToString();
-  return out;
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index c7a20bb33c..d4fcbf0493 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,8 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
-#include "tensorflow/compiler/xla/types.h"
-
 // Intentionally forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@@ -86,8 +84,6 @@ class ExecutableRunOptions {
       DeviceAssignment* device_assignment);
   const DeviceAssignment* device_assignment() const;
 
-  string ToString() const;
-
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
@@ -98,9 +94,6 @@ class ExecutableRunOptions {
   ExecutionProfile* execution_profile_ = nullptr;
 };
 
-std::ostream& operator<<(std::ostream& out,
-                         const ExecutableRunOptions& options);
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index 4ad356d045..b82f1c81c8 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -85,10 +85,12 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
     for (int i = 0; i < program_shape->parameters_size(); ++i) {
       layouts.push_back(&program_shape->parameters(i));
     }
+    ExecutableBuildOptions build_options;
+    build_options.set_device_ordinal(0);
+    build_options.set_result_layout(program_shape->result());
     StatusOr<std::unique_ptr<Executable>> executable =
-        local_service->CompileExecutable(
-            computation.handle(), layouts, &program_shape->result(),
-            /*device_ordinal=*/0, /*device_allocator=*/nullptr);
+        local_service->CompileExecutable(computation.handle(), layouts,
+                                         build_options);
 
     const HloModule& module = executable.ValueOrDie()->module();
 
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 5ebb75a31c..05c0fdf97d 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -60,10 +60,13 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
       for (int i = 0; i < program_shape->parameters_size(); ++i) {
         layouts.push_back(&program_shape->parameters(i));
       }
+
+      ExecutableBuildOptions build_options;
+      build_options.set_device_ordinal(0);
+      build_options.set_result_layout(program_shape->result());
       StatusOr<std::unique_ptr<Executable>> executable =
-          local_service->CompileExecutable(
-              computation.handle(), layouts, &program_shape->result(),
-              /*device_ordinal=*/0, /*device_allocator=*/nullptr);
+          local_service->CompileExecutable(computation.handle(), layouts,
+                                           build_options);
 
       const HloModule& module = executable.ValueOrDie()->module();
 
-- 
GitLab


From 979f139dd94e2bf5fba4794536715973b55373c1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 11:09:06 -0800
Subject: [PATCH 398/966] Add py2tf to contrib_py.

PiperOrigin-RevId: 183860192
---
 tensorflow/compiler/xla/BUILD                   |  4 ++++
 .../compiler/xla/executable_run_options.cc      | 17 +++++++++++++++++
 .../compiler/xla/executable_run_options.h       |  7 +++++++
 .../dumped_computation_to_operation_list.cc     |  8 +++-----
 .../xla/tools/dumped_computation_to_text.cc     |  9 +++------
 .../losses/python/losses/loss_ops_test.py       |  8 +-------
 6 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index c22fd37129..38e39afdc0 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -444,6 +444,10 @@ cc_library(
     srcs = ["executable_run_options.cc"],
     hdrs = ["executable_run_options.h"],
     visibility = ["//visibility:public"],
+    deps = [
+        ":types",
+        "//tensorflow/core:lib",
+    ],
 )
 
 cc_library(
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 392ad9010a..f8bb8e52c7 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/executable_run_options.h"
 
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
 namespace xla {
 
 ExecutableRunOptions& ExecutableRunOptions::set_device_ordinal(
@@ -87,4 +89,19 @@ const DeviceAssignment* ExecutableRunOptions::device_assignment() const {
   return device_assignment_;
 }
 
+string ExecutableRunOptions::ToString() const {
+  return tensorflow::strings::Printf(
+      "ExecutableRunOptions{allocator=%p, device_ordinal=%d, "
+      "device_assignment=%p, stream=%p, inter_op_thread_pool=%p, "
+      "intra_op_thread_pool=%p, execution_profile=%p}",
+      allocator_, device_ordinal_, device_assignment_, stream_,
+      inter_op_thread_pool_, intra_op_thread_pool_, execution_profile_);
+}
+
+std::ostream& operator<<(std::ostream& out,
+                         const ExecutableRunOptions& options) {
+  out << options.ToString();
+  return out;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index d4fcbf0493..c7a20bb33c 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
+#include "tensorflow/compiler/xla/types.h"
+
 // Intentionally forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@@ -84,6 +86,8 @@ class ExecutableRunOptions {
       DeviceAssignment* device_assignment);
   const DeviceAssignment* device_assignment() const;
 
+  string ToString() const;
+
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
@@ -94,6 +98,9 @@ class ExecutableRunOptions {
   ExecutionProfile* execution_profile_ = nullptr;
 };
 
+std::ostream& operator<<(std::ostream& out,
+                         const ExecutableRunOptions& options);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index b82f1c81c8..4ad356d045 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -85,12 +85,10 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
     for (int i = 0; i < program_shape->parameters_size(); ++i) {
       layouts.push_back(&program_shape->parameters(i));
     }
-    ExecutableBuildOptions build_options;
-    build_options.set_device_ordinal(0);
-    build_options.set_result_layout(program_shape->result());
     StatusOr<std::unique_ptr<Executable>> executable =
-        local_service->CompileExecutable(computation.handle(), layouts,
-                                         build_options);
+        local_service->CompileExecutable(
+            computation.handle(), layouts, &program_shape->result(),
+            /*device_ordinal=*/0, /*device_allocator=*/nullptr);
 
     const HloModule& module = executable.ValueOrDie()->module();
 
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 05c0fdf97d..5ebb75a31c 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -60,13 +60,10 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
       for (int i = 0; i < program_shape->parameters_size(); ++i) {
         layouts.push_back(&program_shape->parameters(i));
       }
-
-      ExecutableBuildOptions build_options;
-      build_options.set_device_ordinal(0);
-      build_options.set_result_layout(program_shape->result());
       StatusOr<std::unique_ptr<Executable>> executable =
-          local_service->CompileExecutable(computation.handle(), layouts,
-                                           build_options);
+          local_service->CompileExecutable(
+              computation.handle(), layouts, &program_shape->result(),
+              /*device_ordinal=*/0, /*device_allocator=*/nullptr);
 
       const HloModule& module = executable.ValueOrDie()->module();
 
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops_test.py b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
index 1417772e04..9d0f95e6f3 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops_test.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
@@ -27,7 +27,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -275,7 +274,6 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertAlmostEqual(np.average(weights) * 10.0, loss, 3)
 
 
-@test_util.with_c_api
 class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
 
   def testNoneWeightRaisesValueError(self):
@@ -473,11 +471,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       labels = constant_op.constant([[0, 1], [2, 3]])
       weights = constant_op.constant([1.2, 3.4, 5.6, 7.8])
 
-      if ops._USE_C_API:
-        error_type = ValueError
-      else:
-        error_type = errors_impl.InvalidArgumentError
-      with self.assertRaises(error_type):
+      with self.assertRaises(errors_impl.InvalidArgumentError):
         loss_ops.sparse_softmax_cross_entropy(
             logits, labels, weights=weights).eval()
 
-- 
GitLab


From 4da9f21f03f1031e02fcda27734ace1c552d2d2d Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 30 Jan 2018 11:18:03 -0800
Subject: [PATCH 399/966] Make loss_ops_test.py work with C API enabled.

PiperOrigin-RevId: 183861779
---
 tensorflow/contrib/losses/python/losses/loss_ops_test.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/losses/python/losses/loss_ops_test.py b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
index 9d0f95e6f3..1417772e04 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops_test.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -274,6 +275,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertAlmostEqual(np.average(weights) * 10.0, loss, 3)
 
 
+@test_util.with_c_api
 class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
 
   def testNoneWeightRaisesValueError(self):
@@ -471,7 +473,11 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       labels = constant_op.constant([[0, 1], [2, 3]])
       weights = constant_op.constant([1.2, 3.4, 5.6, 7.8])
 
-      with self.assertRaises(errors_impl.InvalidArgumentError):
+      if ops._USE_C_API:
+        error_type = ValueError
+      else:
+        error_type = errors_impl.InvalidArgumentError
+      with self.assertRaises(error_type):
         loss_ops.sparse_softmax_cross_entropy(
             logits, labels, weights=weights).eval()
 
-- 
GitLab


From ca15793a33a9d0e6baaf94eecb8f2edd117488a1 Mon Sep 17 00:00:00 2001
From: Chris Leary <leary@google.com>
Date: Tue, 30 Jan 2018 11:22:12 -0800
Subject: [PATCH 400/966] [XLA] Fix tools broken by cl/183837856

PiperOrigin-RevId: 183862522
---
 tensorflow/compiler/xla/BUILD                   |  4 ----
 .../compiler/xla/executable_run_options.cc      | 17 -----------------
 .../compiler/xla/executable_run_options.h       |  7 -------
 .../dumped_computation_to_operation_list.cc     |  8 +++++---
 .../xla/tools/dumped_computation_to_text.cc     |  9 ++++++---
 5 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 38e39afdc0..c22fd37129 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -444,10 +444,6 @@ cc_library(
     srcs = ["executable_run_options.cc"],
     hdrs = ["executable_run_options.h"],
     visibility = ["//visibility:public"],
-    deps = [
-        ":types",
-        "//tensorflow/core:lib",
-    ],
 )
 
 cc_library(
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index f8bb8e52c7..392ad9010a 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -15,8 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/executable_run_options.h"
 
-#include "tensorflow/core/lib/strings/stringprintf.h"
-
 namespace xla {
 
 ExecutableRunOptions& ExecutableRunOptions::set_device_ordinal(
@@ -89,19 +87,4 @@ const DeviceAssignment* ExecutableRunOptions::device_assignment() const {
   return device_assignment_;
 }
 
-string ExecutableRunOptions::ToString() const {
-  return tensorflow::strings::Printf(
-      "ExecutableRunOptions{allocator=%p, device_ordinal=%d, "
-      "device_assignment=%p, stream=%p, inter_op_thread_pool=%p, "
-      "intra_op_thread_pool=%p, execution_profile=%p}",
-      allocator_, device_ordinal_, device_assignment_, stream_,
-      inter_op_thread_pool_, intra_op_thread_pool_, execution_profile_);
-}
-
-std::ostream& operator<<(std::ostream& out,
-                         const ExecutableRunOptions& options) {
-  out << options.ToString();
-  return out;
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index c7a20bb33c..d4fcbf0493 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,8 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
-#include "tensorflow/compiler/xla/types.h"
-
 // Intentionally forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@@ -86,8 +84,6 @@ class ExecutableRunOptions {
       DeviceAssignment* device_assignment);
   const DeviceAssignment* device_assignment() const;
 
-  string ToString() const;
-
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
@@ -98,9 +94,6 @@ class ExecutableRunOptions {
   ExecutionProfile* execution_profile_ = nullptr;
 };
 
-std::ostream& operator<<(std::ostream& out,
-                         const ExecutableRunOptions& options);
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index 4ad356d045..b82f1c81c8 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -85,10 +85,12 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
     for (int i = 0; i < program_shape->parameters_size(); ++i) {
       layouts.push_back(&program_shape->parameters(i));
     }
+    ExecutableBuildOptions build_options;
+    build_options.set_device_ordinal(0);
+    build_options.set_result_layout(program_shape->result());
     StatusOr<std::unique_ptr<Executable>> executable =
-        local_service->CompileExecutable(
-            computation.handle(), layouts, &program_shape->result(),
-            /*device_ordinal=*/0, /*device_allocator=*/nullptr);
+        local_service->CompileExecutable(computation.handle(), layouts,
+                                         build_options);
 
     const HloModule& module = executable.ValueOrDie()->module();
 
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 5ebb75a31c..05c0fdf97d 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -60,10 +60,13 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
       for (int i = 0; i < program_shape->parameters_size(); ++i) {
         layouts.push_back(&program_shape->parameters(i));
       }
+
+      ExecutableBuildOptions build_options;
+      build_options.set_device_ordinal(0);
+      build_options.set_result_layout(program_shape->result());
       StatusOr<std::unique_ptr<Executable>> executable =
-          local_service->CompileExecutable(
-              computation.handle(), layouts, &program_shape->result(),
-              /*device_ordinal=*/0, /*device_allocator=*/nullptr);
+          local_service->CompileExecutable(computation.handle(), layouts,
+                                           build_options);
 
       const HloModule& module = executable.ValueOrDie()->module();
 
-- 
GitLab


From 4c30f9676915704f1e0c31fa7a0729e375cb8412 Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Tue, 30 Jan 2018 11:42:31 -0800
Subject: [PATCH 401/966] Delete dead code in Layer.

PiperOrigin-RevId: 183866106
---
 tensorflow/python/layers/base.py | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 5d9feb07b4..04b6056ace 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -139,9 +139,6 @@ class Layer(object):
 
     self._init_set_name(name)
 
-    # Holds functions for creating regularizer ops.
-    self._regularizer_factories = []
-
     # Determine variable scope.
     scope = kwargs.get('_scope')
     if scope:
@@ -306,22 +303,6 @@ class Layer(object):
       inputs_hash = None
     return self._per_input_updates.get(inputs_hash, [])
 
-  def _get_regularizer_factories(self):
-    try:
-      # Some subclasses of Layer do not use its constructor.
-      return self._regularizer_factories
-    except AttributeError:
-      self._regularizer_factories = []
-      return self._regularizer_factories
-
-  def _maybe_create_variable_regularizers(self):
-    """Creates added but uninstantiated regularizers."""
-    factories = self._get_regularizer_factories()
-    if factories:
-      for factory in factories:
-        factory()
-      factories[:] = []
-
   @property
   def losses(self):
     """Losses which are associated with this `Layer`.
@@ -333,7 +314,6 @@ class Layer(object):
     Returns:
       A list of tensors.
     """
-    self._maybe_create_variable_regularizers()
     if context.in_eager_mode():
       # _losses may only contain variable regularization losses when executing
       # eagerly, and they have been saved as lambdas to be executed when
@@ -417,7 +397,6 @@ class Layer(object):
       inputs_hash = layers_util.object_list_uid(inputs)
     else:
       inputs_hash = None
-    self._maybe_create_variable_regularizers()
     return self._per_input_losses.get(inputs_hash, [])
 
   def build(self, _):
-- 
GitLab


From 6558e37454de83652b5a9c5beb8f9230faecc7be Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 11:47:43 -0800
Subject: [PATCH 402/966] K-FAC: expose set_global_constants() for
 tf.contrib.kfac.utils PiperOrigin-RevId: 183867014

---
 tensorflow/contrib/kfac/examples/mlp.py         | 5 ++++-
 tensorflow/contrib/kfac/python/ops/utils_lib.py | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/kfac/examples/mlp.py b/tensorflow/contrib/kfac/examples/mlp.py
index 0f0dbb53f4..87eed03888 100644
--- a/tensorflow/contrib/kfac/examples/mlp.py
+++ b/tensorflow/contrib/kfac/examples/mlp.py
@@ -317,7 +317,10 @@ def train_mnist_estimator(data_dir, num_epochs, use_fake_data=False):
     return tf.estimator.EstimatorSpec(
         mode=mode, loss=loss, train_op=train_op, training_hooks=hooks)
 
+  run_config = tf.estimator.RunConfig(
+      model_dir="/tmp/mnist", save_checkpoints_steps=1, keep_checkpoint_max=100)
+
   # Train until input_fn() is empty with Estimator. This is a prerequisite for
   # TPU compatibility.
-  estimator = tf.estimator.Estimator(model_fn=model_fn)
+  estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
   estimator.train(input_fn=input_fn)
diff --git a/tensorflow/contrib/kfac/python/ops/utils_lib.py b/tensorflow/contrib/kfac/python/ops/utils_lib.py
index cc48e3c69f..fe8e39c212 100644
--- a/tensorflow/contrib/kfac/python/ops/utils_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/utils_lib.py
@@ -24,6 +24,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
 
 _allowed_symbols = [
+    "set_global_constants",
     "SequenceDict",
     "tensors_to_column",
     "column_to_tensors",
-- 
GitLab


From 8a8fdb667bc77120e6ae47a88ab14e52cdd2cf07 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Tue, 30 Jan 2018 11:53:26 -0800
Subject: [PATCH 403/966] [TF:XLA] Bump open source llvm revision to r323761

PiperOrigin-RevId: 183868087
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 357b04d75f..0a669eeccd 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -472,11 +472,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/674f4d74284e9c431c7b69e30f6e10fc4fcbc9ef.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/674f4d74284e9c431c7b69e30f6e10fc4fcbc9ef.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/36a30fc7c9ee6fdfe5157190ad15c1801b1ab2de.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/36a30fc7c9ee6fdfe5157190ad15c1801b1ab2de.tar.gz",
       ],
-      sha256 = "3330c50efc9fc5d742e227dc810c2f586c7e36a60ecacd8251fafd2ea591e404",
-      strip_prefix = "llvm-674f4d74284e9c431c7b69e30f6e10fc4fcbc9ef",
+      sha256 = "3b74ecd8f59c712b4daf715a4da15c43ebdd40edcd4c30737bffef62f6a2bc9d",
+      strip_prefix = "llvm-36a30fc7c9ee6fdfe5157190ad15c1801b1ab2de",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
   )
 
-- 
GitLab


From fcbfead019e064ac796a19b9d8b05325d26b3115 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 12:00:04 -0800
Subject: [PATCH 404/966] Add static_sample flag to Mixture, permitting calls
 to `sample` to not rely on dynamic tensor indexing. This allows for some
 static graph compilation optimizations, but at the expense of sampling all
 underlying distributions in the mixture.

PiperOrigin-RevId: 183869189
---
 .../kernel_tests/distribution_util_test.py    |  40 +++++
 .../python/kernel_tests/mixture_test.py       | 142 ++++++++++++------
 .../python/ops/distribution_util.py           |  39 +++++
 .../distributions/python/ops/mixture.py       |  35 +++++
 .../python/ops/mixture_same_family.py         |  45 ++----
 5 files changed, 229 insertions(+), 72 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
index a255d4fc89..31d24aa9ea 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
@@ -23,10 +23,15 @@ import itertools
 import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops import mixture
+from tensorflow.contrib.distributions.python.ops import mixture_same_family
+from tensorflow.contrib.distributions.python.ops import mvn_diag
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import categorical
+from tensorflow.python.ops.distributions import normal
 from tensorflow.python.ops.linalg import linear_operator_diag
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -395,6 +400,41 @@ class MixtureStddevTest(test.TestCase):
     self.assertAllClose(actual_devs, expected_devs)
 
 
+class PadMixtureDimensionsTest(test.TestCase):
+
+  def test_pad_mixture_dimensions_mixture(self):
+    with self.test_session() as sess:
+      gm = mixture.Mixture(
+          cat=categorical.Categorical(probs=[[0.3, 0.7]]),
+          components=[
+              normal.Normal(loc=[-1.0], scale=[1.0]),
+              normal.Normal(loc=[1.0], scale=[0.5])
+          ])
+
+      x = array_ops.constant([[1.0, 2.0], [3.0, 4.0]])
+      x_pad = distribution_util.pad_mixture_dimensions(
+          x, gm, gm.cat, gm.event_shape.ndims)
+      x_out, x_pad_out = sess.run([x, x_pad])
+
+    self.assertAllEqual(x_pad_out.shape, [2, 2])
+    self.assertAllEqual(x_out.reshape([-1]), x_pad_out.reshape([-1]))
+
+  def test_pad_mixture_dimensions_mixture_same_family(self):
+    with self.test_session() as sess:
+      gm = mixture_same_family.MixtureSameFamily(
+          mixture_distribution=categorical.Categorical(probs=[0.3, 0.7]),
+          components_distribution=mvn_diag.MultivariateNormalDiag(
+              loc=[[-1., 1], [1, -1]], scale_identity_multiplier=[1.0, 0.5]))
+
+      x = array_ops.constant([[1.0, 2.0], [3.0, 4.0]])
+      x_pad = distribution_util.pad_mixture_dimensions(
+          x, gm, gm.mixture_distribution, gm.event_shape.ndims)
+      x_out, x_pad_out = sess.run([x, x_pad])
+
+    self.assertAllEqual(x_pad_out.shape, [2, 2, 1])
+    self.assertAllEqual(x_out.reshape([-1]), x_pad_out.reshape([-1]))
+
+
 class _PadTest(object):
 
   def testNegAxisCorrectness(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
index 1e514fe0ff..0206489175 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
@@ -107,7 +107,7 @@ def _test_capture_normal_sample_outputs():
   ds.Normal._call_sample_n = true_normal_call_sample_n
 
 
-def make_univariate_mixture(batch_shape, num_components):
+def make_univariate_mixture(batch_shape, num_components, use_static_graph):
   batch_shape = ops.convert_to_tensor(batch_shape, dtypes.int32)
   logits = random_ops.random_uniform(
       array_ops.concat((batch_shape, [num_components]), axis=0),
@@ -119,11 +119,11 @@ def make_univariate_mixture(batch_shape, num_components):
       for _ in range(num_components)
   ]
   cat = ds.Categorical(logits, dtype=dtypes.int32)
-  return ds.Mixture(cat, components)
+  return ds.Mixture(cat, components, use_static_graph=use_static_graph)
 
 
 def make_multivariate_mixture(batch_shape, num_components, event_shape,
-                              batch_shape_tensor=None):
+                              use_static_graph, batch_shape_tensor=None):
   if batch_shape_tensor is None:
     batch_shape_tensor = batch_shape
   batch_shape_tensor = ops.convert_to_tensor(batch_shape_tensor, dtypes.int32)
@@ -145,15 +145,17 @@ def make_multivariate_mixture(batch_shape, num_components, event_shape,
         loc=loc, scale_diag=scale_diag)
   components = [create_component() for _ in range(num_components)]
   cat = ds.Categorical(logits, dtype=dtypes.int32)
-  return ds.Mixture(cat, components)
+  return ds.Mixture(cat, components, use_static_graph=use_static_graph)
 
 
 class MixtureTest(test.TestCase):
+  use_static_graph = False
 
   def testShapes(self):
     with self.test_session():
       for batch_shape in ([], [1], [2, 3, 4]):
-        dist = make_univariate_mixture(batch_shape, num_components=10)
+        dist = make_univariate_mixture(batch_shape, num_components=10,
+                                       use_static_graph=self.use_static_graph)
         self.assertAllEqual(batch_shape, dist.batch_shape)
         self.assertAllEqual(batch_shape, dist.batch_shape_tensor().eval())
         self.assertAllEqual([], dist.event_shape)
@@ -161,7 +163,8 @@ class MixtureTest(test.TestCase):
 
         for event_shape in ([1], [2]):
           dist = make_multivariate_mixture(
-              batch_shape, num_components=10, event_shape=event_shape)
+              batch_shape, num_components=10, event_shape=event_shape,
+              use_static_graph=self.use_static_graph)
           self.assertAllEqual(batch_shape, dist.batch_shape)
           self.assertAllEqual(batch_shape, dist.batch_shape_tensor().eval())
           self.assertAllEqual(event_shape, dist.event_shape)
@@ -172,7 +175,8 @@ class MixtureTest(test.TestCase):
                                              r"cat.num_classes != len"):
       ds.Mixture(
           ds.Categorical([0.1, 0.5]),  # 2 classes
-          [ds.Normal(loc=1.0, scale=2.0)])
+          [ds.Normal(loc=1.0, scale=2.0)],
+          use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(
         ValueError, r"\(\) and \(2,\) are not compatible"):
       # The value error is raised because the batch shapes of the
@@ -185,13 +189,15 @@ class MixtureTest(test.TestCase):
                   loc=1.0, scale=2.0),  # scalar dist
               ds.Normal(
                   loc=[1.0, 1.0], scale=[2.0, 2.0])
-          ])
+          ],
+          use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(ValueError, r"Could not infer"):
       cat_logits = array_ops.placeholder(shape=[1, None], dtype=dtypes.float32)
       ds.Mixture(
           ds.Categorical(cat_logits),
           [ds.Normal(
-              loc=[1.0], scale=[2.0])])
+              loc=[1.0], scale=[2.0])],
+          use_static_graph=self.use_static_graph)
 
   def testBrokenShapesDynamic(self):
     with self.test_session():
@@ -203,29 +209,37 @@ class MixtureTest(test.TestCase):
                   loc=d0_param, scale=d0_param), ds.Normal(
                       loc=d1_param, scale=d1_param)
           ],
-          validate_args=True)
-      with self.assertRaisesOpError(r"batch shape must match"):
+          validate_args=True,
+          use_static_graph=self.use_static_graph)
+
+      if self.use_static_graph:
+        error_string = r"Shapes of all inputs must match"
+      else:
+        error_string = r"batch shape must match"
+
+      with self.assertRaisesOpError(error_string):
         d.sample().eval(feed_dict={d0_param: [2.0, 3.0], d1_param: [1.0]})
-      with self.assertRaisesOpError(r"batch shape must match"):
+      with self.assertRaisesOpError(error_string):
         d.sample().eval(feed_dict={d0_param: [2.0, 3.0], d1_param: 1.0})
 
   def testBrokenTypes(self):
     with self.assertRaisesWithPredicateMatch(TypeError, "Categorical"):
-      ds.Mixture(None, [])
+      ds.Mixture(None, [], use_static_graph=self.use_static_graph)
     cat = ds.Categorical([0.3, 0.2])
     # components must be a list of distributions
     with self.assertRaisesWithPredicateMatch(
         TypeError, "all .* must be Distribution instances"):
-      ds.Mixture(cat, [None])
+      ds.Mixture(cat, [None], use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(TypeError, "same dtype"):
       ds.Mixture(
           cat, [
               ds.Normal(loc=[1.0], scale=[2.0]),
               ds.Normal(loc=[np.float16(1.0)],
                         scale=[np.float16(2.0)]),
-          ])
+          ], use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(ValueError, "non-empty list"):
-      ds.Mixture(ds.Categorical([0.3, 0.2]), None)
+      ds.Mixture(ds.Categorical([0.3, 0.2]), None,
+                 use_static_graph=self.use_static_graph)
 
     # TODO(ebrevdo): once distribution Domains have been added, add a
     # test to ensure that the domains of the distributions in a
@@ -235,7 +249,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_univariate_mixture(
-            batch_shape=batch_shape, num_components=2)
+            batch_shape=batch_shape, num_components=2,
+            use_static_graph=self.use_static_graph)
         mean = dist.mean()
         self.assertEqual(batch_shape, mean.get_shape())
 
@@ -256,7 +271,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_multivariate_mixture(
-            batch_shape=batch_shape, num_components=2, event_shape=(4,))
+            batch_shape=batch_shape, num_components=2, event_shape=(4,),
+            use_static_graph=self.use_static_graph)
         mean = dist.mean()
         self.assertEqual(batch_shape + (4,), mean.get_shape())
 
@@ -283,7 +299,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_univariate_mixture(
-            batch_shape=batch_shape, num_components=num_components)
+            batch_shape=batch_shape, num_components=num_components,
+            use_static_graph=self.use_static_graph)
         dev = dist.stddev()
         self.assertEqual(batch_shape, dev.get_shape())
 
@@ -325,7 +342,8 @@ class MixtureTest(test.TestCase):
         dist = make_multivariate_mixture(
             batch_shape=batch_shape,
             num_components=num_components,
-            event_shape=(4,))
+            event_shape=(4,),
+            use_static_graph=self.use_static_graph)
         dev = dist.stddev()
         self.assertEqual(batch_shape + (4,), dev.get_shape())
 
@@ -371,7 +389,8 @@ class MixtureTest(test.TestCase):
                       scale=component_devs[0]),
             ds.Normal(loc=component_means[1],
                       scale=component_devs[1]),
-        ])
+        ],
+        use_static_graph=self.use_static_graph)
     mix_dev = mixture_dist.stddev()
     with self.test_session() as sess:
       actual_stddev = sess.run(mix_dev)
@@ -379,7 +398,8 @@ class MixtureTest(test.TestCase):
 
   def testProbScalarUnivariate(self):
     with self.test_session() as sess:
-      dist = make_univariate_mixture(batch_shape=[], num_components=2)
+      dist = make_univariate_mixture(batch_shape=[], num_components=2,
+                                     use_static_graph=self.use_static_graph)
       for x in [
           np.array(
               [1.0, 2.0], dtype=np.float32), np.array(
@@ -405,7 +425,8 @@ class MixtureTest(test.TestCase):
   def testProbScalarMultivariate(self):
     with self.test_session() as sess:
       dist = make_multivariate_mixture(
-          batch_shape=[], num_components=2, event_shape=[3])
+          batch_shape=[], num_components=2, event_shape=[3],
+          use_static_graph=self.use_static_graph)
       for x in [
           np.array(
               [[-1.0, 0.0, 1.0], [0.5, 1.0, -0.3]], dtype=np.float32), np.array(
@@ -432,7 +453,8 @@ class MixtureTest(test.TestCase):
 
   def testProbBatchUnivariate(self):
     with self.test_session() as sess:
-      dist = make_univariate_mixture(batch_shape=[2, 3], num_components=2)
+      dist = make_univariate_mixture(batch_shape=[2, 3], num_components=2,
+                                     use_static_graph=self.use_static_graph)
 
       for x in [
           np.random.randn(2, 3).astype(np.float32),
@@ -459,7 +481,8 @@ class MixtureTest(test.TestCase):
   def testProbBatchMultivariate(self):
     with self.test_session() as sess:
       dist = make_multivariate_mixture(
-          batch_shape=[2, 3], num_components=2, event_shape=[4])
+          batch_shape=[2, 3], num_components=2, event_shape=[4],
+          use_static_graph=self.use_static_graph)
 
       for x in [
           np.random.randn(2, 3, 4).astype(np.float32),
@@ -487,7 +510,8 @@ class MixtureTest(test.TestCase):
       num_components = 3
       batch_shape = []
       dist = make_univariate_mixture(
-          batch_shape=batch_shape, num_components=num_components)
+          batch_shape=batch_shape, num_components=num_components,
+          use_static_graph=self.use_static_graph)
       n = 4
       with _test_capture_normal_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -502,7 +526,10 @@ class MixtureTest(test.TestCase):
         which_c = np.where(cat_sample_values == c)[0]
         size_c = which_c.size
         # Scalar Batch univariate case: batch_size == 1, rank 1
-        which_dist_samples = dist_sample_values[c][:size_c]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c]
+        else:
+          which_dist_samples = dist_sample_values[c][:size_c]
         self.assertAllClose(which_dist_samples, sample_values[which_c])
 
   # Test that sampling with the same seed twice gives the same results.
@@ -522,7 +549,8 @@ class MixtureTest(test.TestCase):
       ]
       cat = ds.Categorical(
           logits, dtype=dtypes.int32, name="cat1")
-      dist1 = ds.Mixture(cat, components, name="mixture1")
+      dist1 = ds.Mixture(cat, components, name="mixture1",
+                         use_static_graph=self.use_static_graph)
       samples1 = dist1.sample(n, seed=123456).eval()
 
       random_seed.set_random_seed(654321)
@@ -532,7 +560,8 @@ class MixtureTest(test.TestCase):
       ]
       cat2 = ds.Categorical(
           logits, dtype=dtypes.int32, name="cat2")
-      dist2 = ds.Mixture(cat2, components2, name="mixture2")
+      dist2 = ds.Mixture(cat2, components2, name="mixture2",
+                         use_static_graph=self.use_static_graph)
       samples2 = dist2.sample(n, seed=123456).eval()
 
       self.assertAllClose(samples1, samples2)
@@ -541,7 +570,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       num_components = 3
       dist = make_multivariate_mixture(
-          batch_shape=[], num_components=num_components, event_shape=[2])
+          batch_shape=[], num_components=num_components, event_shape=[2],
+          use_static_graph=self.use_static_graph)
       n = 4
       with _test_capture_mvndiag_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -555,14 +585,18 @@ class MixtureTest(test.TestCase):
         which_c = np.where(cat_sample_values == c)[0]
         size_c = which_c.size
         # Scalar Batch multivariate case: batch_size == 1, rank 2
-        which_dist_samples = dist_sample_values[c][:size_c, :]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c, :]
+        else:
+          which_dist_samples = dist_sample_values[c][:size_c, :]
         self.assertAllClose(which_dist_samples, sample_values[which_c, :])
 
   def testSampleBatchUnivariate(self):
     with self.test_session() as sess:
       num_components = 3
       dist = make_univariate_mixture(
-          batch_shape=[2, 3], num_components=num_components)
+          batch_shape=[2, 3], num_components=num_components,
+          use_static_graph=self.use_static_graph)
       n = 4
       with _test_capture_normal_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -576,8 +610,12 @@ class MixtureTest(test.TestCase):
         which_c_s, which_c_b0, which_c_b1 = np.where(cat_sample_values == c)
         size_c = which_c_s.size
         # Batch univariate case: batch_size == [2, 3], rank 3
-        which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
-                                                   which_c_b1]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c_s, which_c_b0,
+                                                     which_c_b1]
+        else:
+          which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
+                                                     which_c_b1]
         self.assertAllClose(which_dist_samples,
                             sample_values[which_c_s, which_c_b0, which_c_b1])
 
@@ -594,7 +632,8 @@ class MixtureTest(test.TestCase):
       dist = make_multivariate_mixture(
           batch_shape=batch_shape,
           num_components=num_components, event_shape=[4],
-          batch_shape_tensor=batch_shape_tensor)
+          batch_shape_tensor=batch_shape_tensor,
+          use_static_graph=self.use_static_graph)
       n = 5
       with _test_capture_mvndiag_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -617,8 +656,12 @@ class MixtureTest(test.TestCase):
         which_c_s, which_c_b0, which_c_b1 = np.where(cat_sample_values == c)
         size_c = which_c_s.size
         # Batch univariate case: batch_size == [2, 3], rank 4 (multivariate)
-        which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
-                                                   which_c_b1, :]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c_s, which_c_b0,
+                                                     which_c_b1, :]
+        else:
+          which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
+                                                     which_c_b1, :]
         self.assertAllClose(which_dist_samples,
                             sample_values[which_c_s, which_c_b0, which_c_b1, :])
 
@@ -632,7 +675,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_multivariate_mixture(
-            batch_shape=batch_shape, num_components=2, event_shape=(4,))
+            batch_shape=batch_shape, num_components=2, event_shape=(4,),
+            use_static_graph=self.use_static_graph)
         entropy_lower_bound = dist.entropy_lower_bound()
         self.assertEqual(batch_shape, entropy_lower_bound.get_shape())
 
@@ -673,7 +717,8 @@ class MixtureTest(test.TestCase):
     cat_tf = ds.Categorical(probs=mixture_weights)
     components_tf = [ds.Normal(loc=mu, scale=sigma)
                      for (mu, sigma) in zip(means, sigmas)]
-    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf)
+    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf,
+                            use_static_graph=self.use_static_graph)
 
     x_tensor = array_ops.placeholder(shape=(), dtype=dtypes.float32)
 
@@ -721,7 +766,8 @@ class MixtureTest(test.TestCase):
     cat_tf = ds.Categorical(probs=mixture_weights)
     components_tf = [ds.Normal(loc=mu, scale=sigma)
                      for (mu, sigma) in zip(means, sigmas)]
-    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf)
+    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf,
+                            use_static_graph=self.use_static_graph)
 
     x_tensor = array_ops.placeholder(shape=psize, dtype=dtypes.float32)
     xs_to_check = [
@@ -760,12 +806,18 @@ class MixtureTest(test.TestCase):
       gm = ds.Mixture(
           cat=ds.Categorical(probs=[.3, .7]),
           components=[ds.Gamma(1., 2.),
-                      ds.Gamma(2., 1.)])
+                      ds.Gamma(2., 1.)],
+          use_static_graph=self.use_static_graph)
       x_ = gm.sample().eval()
       self.assertAllEqual([], x_.shape)
 
 
+class MixtureStaticSampleTest(MixtureTest):
+  use_static_graph = True
+
+
 class MixtureBenchmark(test.Benchmark):
+  use_static_graph = False
 
   def _runSamplingBenchmark(self, name, create_distribution, use_gpu,
                             num_components, batch_size, num_features,
@@ -811,7 +863,7 @@ class MixtureBenchmark(test.Benchmark):
       components = list(
           ds.MultivariateNormalDiag(
               loc=mu, scale_diag=sigma) for (mu, sigma) in zip(mus, sigmas))
-      return ds.Mixture(cat, components)
+      return ds.Mixture(cat, components, use_static_graph=self.use_static_graph)
 
     for use_gpu in False, True:
       if use_gpu and not test.is_gpu_available():
@@ -853,7 +905,7 @@ class MixtureBenchmark(test.Benchmark):
           ds.MultivariateNormalTriL(
               loc=mu, scale_tril=linalg_ops.cholesky(sigma))
           for (mu, sigma) in zip(mus, sigmas))
-      return ds.Mixture(cat, components)
+      return ds.Mixture(cat, components, use_static_graph=self.use_static_graph)
 
     for use_gpu in False, True:
       if use_gpu and not test.is_gpu_available():
@@ -872,5 +924,9 @@ class MixtureBenchmark(test.Benchmark):
                   sample_size=sample_size)
 
 
+class MixtureStaticSampleBenchmark(MixtureBenchmark):
+  use_static_graph = True
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py
index a4d249d41e..289e1d50e1 100644
--- a/tensorflow/contrib/distributions/python/ops/distribution_util.py
+++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import linalg
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -442,6 +443,44 @@ def maybe_check_scalar_distribution(
     return assertions
 
 
+def pad_mixture_dimensions(x, mixture_distribution, categorical_distribution,
+                           event_ndims):
+  """Pad dimensions of event tensors for mixture distributions.
+
+  See `Mixture._sample_n` and `MixtureSameFamily._sample_n` for usage examples.
+
+  Args:
+    x: event tensor to pad.
+    mixture_distribution: Base distribution of the mixture.
+    categorical_distribution: `Categorical` distribution that mixes the base
+      distribution.
+    event_ndims: Integer specifying the number of event dimensions in the event
+      tensor.
+
+  Returns:
+    A padded version of `x` that can broadcast with `categorical_distribution`.
+  """
+  with ops.name_scope("pad_mix_dims", values=[x]):
+    def _get_ndims(d):
+      if d.batch_shape.ndims is not None:
+        return d.batch_shape.ndims
+      return array_ops.shape(d.batch_shape_tensor())[0]
+    dist_batch_ndims = _get_ndims(mixture_distribution)
+    cat_batch_ndims = _get_ndims(categorical_distribution)
+    pad_ndims = array_ops.where(
+        categorical_distribution.is_scalar_batch(),
+        dist_batch_ndims,
+        dist_batch_ndims - cat_batch_ndims)
+    s = array_ops.shape(x)
+    x = array_ops.reshape(x, shape=array_ops.concat([
+        s[:-1],
+        array_ops.ones([pad_ndims], dtype=dtypes.int32),
+        s[-1:],
+        array_ops.ones([event_ndims], dtype=dtypes.int32),
+    ], axis=0))
+    return x
+
+
 def static_value(x):
   """Returns the static value of a `Tensor` or `None`."""
   return tensor_util.constant_value(ops.convert_to_tensor(x))
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index f2d492f548..cef6a143fc 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -71,6 +71,7 @@ class Mixture(distribution.Distribution):
                components,
                validate_args=False,
                allow_nan_stats=True,
+               use_static_graph=False,
                name="Mixture"):
     """Initialize a Mixture distribution.
 
@@ -96,6 +97,11 @@ class Mixture(distribution.Distribution):
        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
         batch member. If `True`, batch members with valid parameters leading to
         undefined statistics will return NaN for this statistic.
+      use_static_graph: Calls to `sample` will not rely on dynamic tensor
+        indexing, allowing for some static graph compilation optimizations, but
+        at the expense of sampling all underlying distributions in the mixture.
+        (Possibly useful when running on TPUs).
+        Default value: `False` (i.e., use dynamic indexing).
       name: A name for this distribution (optional).
 
     Raises:
@@ -178,6 +184,10 @@ class Mixture(distribution.Distribution):
       self._static_event_shape = static_event_shape
       self._static_batch_shape = static_batch_shape
 
+      self._use_static_graph = use_static_graph
+      if use_static_graph and static_num_components is None:
+        raise ValueError("Number of categories must be known statically when "
+                         "`static_sample=True`.")
     # We let the Mixture distribution access _graph_parents since its arguably
     # more like a baseclass.
     graph_parents = self._cat._graph_parents  # pylint: disable=protected-access
@@ -292,6 +302,31 @@ class Mixture(distribution.Distribution):
       return mixture_log_cdf
 
   def _sample_n(self, n, seed=None):
+    if self._use_static_graph:
+      # This sampling approach is almost the same as the approach used by
+      # `MixtureSameFamily`. The differences are due to having a list of
+      # `Distribution` objects rather than a single object, and maintaining
+      # random seed management that is consistent with the non-static code path.
+      samples = []
+      cat_samples = self.cat.sample(n, seed=seed)
+      for c in range(self.num_components):
+        seed = distribution_util.gen_new_seed(seed, "mixture")
+        samples.append(self.components[c].sample(n, seed=seed))
+      x = array_ops.stack(
+          samples, -self._static_event_shape.ndims - 1)     # [n, B, k, E]
+      npdt = x.dtype.as_numpy_dtype
+      mask = array_ops.one_hot(
+          indices=cat_samples,                              # [n, B]
+          depth=self._num_components,                       # == k
+          on_value=np.ones([], dtype=npdt),
+          off_value=np.zeros([], dtype=npdt))               # [n, B, k]
+      mask = distribution_utils.pad_mixture_dimensions(
+          mask, self, self._cat,
+          self._static_event_shape.ndims)                   # [n, B, k, [1]*e]
+      return math_ops.reduce_sum(
+          x * mask,
+          axis=-1 - self._static_event_shape.ndims)         # [n, B, E]
+
     with ops.control_dependencies(self._assertions):
       n = ops.convert_to_tensor(n, name="n")
       static_n = tensor_util.constant_value(n)
diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
index 49afbea7f0..b93bdc5ab4 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import dtypes
+from tensorflow.contrib.distributions.python.ops import distribution_util as distribution_utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -239,7 +239,9 @@ class MixtureSameFamily(distribution.Distribution):
           depth=self._num_components,                        # == k
           on_value=np.ones([], dtype=npdt),
           off_value=np.zeros([], dtype=npdt))                # [n, B, k]
-      mask = self._pad_mix_dims(mask)                        # [n, B, k, [1]*e]
+      mask = distribution_utils.pad_mixture_dimensions(
+          mask, self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [n, B, k, [1]*e]
       return math_ops.reduce_sum(
           x * mask, axis=-1 - self._event_ndims)             # [n, B, E]
 
@@ -254,8 +256,9 @@ class MixtureSameFamily(distribution.Distribution):
 
   def _mean(self):
     with ops.control_dependencies(self._runtime_assertions):
-      probs = self._pad_mix_dims(
-          self.mixture_distribution.probs)                   # [B, k, [1]*e]
+      probs = distribution_utils.pad_mixture_dimensions(
+          self.mixture_distribution.probs, self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [B, k, [1]*e]
       return math_ops.reduce_sum(
           probs * self.components_distribution.mean(),
           axis=-1 - self._event_ndims)                       # [B, E]
@@ -271,8 +274,9 @@ class MixtureSameFamily(distribution.Distribution):
   def _variance(self):
     with ops.control_dependencies(self._runtime_assertions):
       # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X])
-      probs = self._pad_mix_dims(
-          self.mixture_distribution.probs)                   # [B, k, [1]*e]
+      probs = distribution_utils.pad_mixture_dimensions(
+          self.mixture_distribution.probs, self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [B, k, [1]*e]
       mean_cond_var = math_ops.reduce_sum(
           probs * self.components_distribution.variance(),
           axis=-1 - self._event_ndims)                       # [B, E]
@@ -291,8 +295,12 @@ class MixtureSameFamily(distribution.Distribution):
 
     with ops.control_dependencies(self._runtime_assertions):
       # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X])
-      probs = self._pad_mix_dims(self._pad_mix_dims(
-          self.mixture_distribution.probs))                  # [B, k, 1, 1]
+      probs = distribution_utils.pad_mixture_dimensions(
+          distribution_utils.pad_mixture_dimensions(
+              self.mixture_distribution.probs, self, self.mixture_distribution,
+              self._event_shape().ndims),
+          self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [B, k, 1, 1]
       mean_cond_var = math_ops.reduce_sum(
           probs * self.components_distribution.covariance(),
           axis=-3)                                           # [B, e, e]
@@ -312,27 +320,6 @@ class MixtureSameFamily(distribution.Distribution):
           shape[:d], [1], shape[d:]], axis=0))
       return x
 
-  def _pad_mix_dims(self, x):
-    with ops.name_scope("pad_mix_dims", values=[x]):
-      def _get_ndims(d):
-        if d.batch_shape.ndims is not None:
-          return d.batch_shape.ndims
-        return array_ops.shape(d.batch_shape_tensor())[0]
-      dist_batch_ndims = _get_ndims(self)
-      cat_batch_ndims = _get_ndims(self.mixture_distribution)
-      pad_ndims = array_ops.where(
-          self.mixture_distribution.is_scalar_batch(),
-          dist_batch_ndims,
-          dist_batch_ndims - cat_batch_ndims)
-      s = array_ops.shape(x)
-      x = array_ops.reshape(x, shape=array_ops.concat([
-          s[:-1],
-          array_ops.ones([pad_ndims], dtype=dtypes.int32),
-          s[-1:],
-          array_ops.ones([self._event_ndims], dtype=dtypes.int32),
-      ], axis=0))
-      return x
-
 
 def _outer_squared_difference(x, y):
   """Convenience function analogous to tf.squared_difference."""
-- 
GitLab


From 9eea6fa72a90b0b16b554ff23185e0365c2a6f48 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 12:01:02 -0800
Subject: [PATCH 405/966] RetryingFileSystem::FlushCaches() calls the base
 FileSystem's FlushCaches().

PiperOrigin-RevId: 183869325
---
 .../platform/cloud/retrying_file_system.cc    |  2 ++
 .../platform/cloud/retrying_file_system.h     |  2 ++
 .../cloud/retrying_file_system_test.cc        | 19 ++++++++++++++++++-
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/platform/cloud/retrying_file_system.cc b/tensorflow/core/platform/cloud/retrying_file_system.cc
index 870d935e11..be9ebe67b1 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system.cc
@@ -202,4 +202,6 @@ Status RetryingFileSystem::DeleteRecursively(const string& dirname,
       initial_delay_microseconds_);
 }
 
+void RetryingFileSystem::FlushCaches() { base_file_system_->FlushCaches(); }
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/retrying_file_system.h b/tensorflow/core/platform/cloud/retrying_file_system.h
index d9d8ea6b00..a262a5fd94 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system.h
+++ b/tensorflow/core/platform/cloud/retrying_file_system.h
@@ -69,6 +69,8 @@ class RetryingFileSystem : public FileSystem {
   Status DeleteRecursively(const string& dirname, int64* undeleted_files,
                            int64* undeleted_dirs) override;
 
+  void FlushCaches() override;
+
  private:
   std::unique_ptr<FileSystem> base_file_system_;
   const int64 initial_delay_microseconds_;
diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
index 232dcb3e71..d3f763bb3c 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
@@ -84,7 +84,8 @@ class MockWritableFile : public WritableFile {
 
 class MockFileSystem : public FileSystem {
  public:
-  explicit MockFileSystem(const ExpectedCalls& calls) : calls_(calls) {}
+  explicit MockFileSystem(const ExpectedCalls& calls, bool* flushed = nullptr)
+      : calls_(calls), flushed_(flushed) {}
 
   Status NewRandomAccessFile(
       const string& fname, std::unique_ptr<RandomAccessFile>* result) override {
@@ -156,11 +157,18 @@ class MockFileSystem : public FileSystem {
     return calls_.ConsumeNextCall("DeleteRecursively");
   }
 
+  void FlushCaches() override {
+    if (flushed_) {
+      *flushed_ = true;
+    }
+  }
+
   std::unique_ptr<WritableFile> writable_file_to_return;
   std::unique_ptr<RandomAccessFile> random_access_file_to_return;
 
  private:
   MockCallSequence calls_;
+  bool* flushed_ = nullptr;
 };
 
 TEST(RetryingFileSystemTest, NewRandomAccessFile_ImmediateSuccess) {
@@ -702,5 +710,14 @@ TEST(RetryingFileSystemTest, DeleteRecursively_AllRetriesFailed) {
       << status;
 }
 
+TEST(RetryingFileSystemTest, FlushCaches) {
+  ExpectedCalls none;
+  bool flushed = false;
+  std::unique_ptr<MockFileSystem> base_fs(new MockFileSystem(none, &flushed));
+  RetryingFileSystem fs(std::move(base_fs), 0);
+  fs.FlushCaches();
+  EXPECT_TRUE(flushed);
+}
+
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From c48431588e7cf8aff61d4c299231e3e925144df8 Mon Sep 17 00:00:00 2001
From: "David G. Andersen" <dga@google.com>
Date: Tue, 30 Jan 2018 12:06:40 -0800
Subject: [PATCH 406/966] Eliminate crash on a 'no error' return from DecodeGif
 when parsing an invalid gif.  (Previous code tried to strcat a null).

PiperOrigin-RevId: 183870288
---
 tensorflow/core/lib/gif/gif_io.cc | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
index 0f6999c88f..e5deb2b873 100644
--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -44,6 +44,14 @@ int input_callback(GifFileType* gif_file, GifByteType* buf, int size) {
   return 0;
 }
 
+static const char* GifErrorStringNonNull(int error_code) {
+  const char* error_string = GifErrorString(error_code);
+  if (error_string == nullptr) {
+    return "Unknown error";
+  }
+  return error_string;
+}
+
 uint8* Decode(const void* srcdata, int datasize,
               const std::function<uint8*(int, int, int, int)>& allocate_output,
               string* error_string) {
@@ -55,17 +63,17 @@ uint8* Decode(const void* srcdata, int datasize,
     int error_code = D_GIF_SUCCEEDED;
     if (gif_file && DGifCloseFile(gif_file, &error_code) != GIF_OK) {
       LOG(WARNING) << "Fail to close gif file, reason: "
-                   << GifErrorString(error_code);
+                   << GifErrorStringNonNull(error_code);
     }
   });
   if (error_code != D_GIF_SUCCEEDED) {
     *error_string = strings::StrCat("failed to open gif file: ",
-                                    GifErrorString(error_code));
+                                    GifErrorStringNonNull(error_code));
     return nullptr;
   }
   if (DGifSlurp(gif_file) != GIF_OK) {
     *error_string = strings::StrCat("failed to slurp gif file: ",
-                                    GifErrorString(gif_file->Error));
+                                    GifErrorStringNonNull(gif_file->Error));
     return nullptr;
   }
   if (gif_file->ImageCount <= 0) {
-- 
GitLab


From 77b22b38f03e5d9b52909d444f08592ffbe0334d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 12:13:58 -0800
Subject: [PATCH 407/966] Add check macro for not-equal.

PiperOrigin-RevId: 183871336
---
 tensorflow/contrib/lite/kernels/internal/compatibility.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
index 1d963afb7e..51426bb1c5 100644
--- a/tensorflow/contrib/lite/kernels/internal/compatibility.h
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -27,6 +27,10 @@ limitations under the License.
 #define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false)
 #endif
 
+#ifndef TFLITE_DCHECK_NE
+#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : assert(false)
+#endif
+
 #ifndef TFLITE_DCHECK_GE
 #define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false)
 #endif
@@ -52,6 +56,10 @@ limitations under the License.
 #define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : abort()
 #endif
 
+#ifndef TFLITE_CHECK_NE
+#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : abort()
+#endif
+
 #ifndef TFLITE_CHECK_GE
 #define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : abort()
 #endif
-- 
GitLab


From 7115e52f599724dfe7ab12d15e0dc193600c880f Mon Sep 17 00:00:00 2001
From: Martin Wicke <wicke@google.com>
Date: Tue, 30 Jan 2018 12:32:30 -0800
Subject: [PATCH 408/966] Fix bad logging call in warmstarting_utils.py.

PiperOrigin-RevId: 183873925
---
 tensorflow/python/estimator/warm_starting_util.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/estimator/warm_starting_util.py b/tensorflow/python/estimator/warm_starting_util.py
index ad95c71234..48110ef57f 100644
--- a/tensorflow/python/estimator/warm_starting_util.py
+++ b/tensorflow/python/estimator/warm_starting_util.py
@@ -415,8 +415,8 @@ def _warm_start(warm_start_settings):
       a stronger check for variable configuration than relying on users to
       examine the logs.
   """
-  logging.info("Warm-starting from: ",
-               warm_start_settings.ckpt_to_initialize_from)
+  logging.info("Warm-starting from: %s",
+               (warm_start_settings.ckpt_to_initialize_from,))
   # We have to deal with partitioned variables, since get_collection flattens
   # out the list.
   grouped_variables = {}
-- 
GitLab


From 5e9a946b0fbab8e36e4e2dd20480f0982f540890 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 12:36:51 -0800
Subject: [PATCH 409/966] Automated g4 rollback of changelist 183846994

PiperOrigin-RevId: 183874527
---
 tensorflow/cc/saved_model/loader.cc      |  4 +---
 tensorflow/cc/saved_model/loader_test.cc | 18 ------------------
 2 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index faa1e378d0..acef098c7d 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -96,9 +96,7 @@ Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto,
 Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def,
                                 const SessionOptions& session_options,
                                 std::unique_ptr<Session>* session) {
-  Session* session_p = nullptr;
-  TF_RETURN_IF_ERROR(NewSession(session_options, &session_p));
-  session->reset(session_p);
+  session->reset(NewSession(session_options));
   return (*session)->Create(meta_graph_def.graph_def());
 }
 
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
index 4c64d2cfe3..0ad6b33bba 100644
--- a/tensorflow/cc/saved_model/loader_test.cc
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -155,24 +155,6 @@ TEST_F(LoaderTest, NoTagMatchMultiple) {
       << st.error_message();
 }
 
-TEST_F(LoaderTest, SessionCreationFailure) {
-  SavedModelBundle bundle;
-  // Use invalid SessionOptions to cause session creation to fail.  Default
-  // options work, so provide an invalid value for the target field.
-  SessionOptions session_options;
-  constexpr char kInvalidTarget[] = "invalid target";
-  session_options.target = kInvalidTarget;
-  RunOptions run_options;
-
-  const string export_dir =
-      io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded);
-  Status st = LoadSavedModel(session_options, run_options, export_dir,
-                             {kSavedModelTagServe}, &bundle);
-  EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(StringPiece(st.error_message()).contains(kInvalidTarget))
-      << st.error_message();
-}
-
 TEST_F(LoaderTest, PbtxtFormat) {
   SavedModelBundle bundle;
   SessionOptions session_options;
-- 
GitLab


From 92b3200b220e236cdaec6bf7c9a1a99192794347 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Tue, 30 Jan 2018 12:49:18 -0800
Subject: [PATCH 410/966] Support outfeed host calls in TPUEstimator. That
 implicitly allows us to support tf.contrib.summary (see the unit test).

We may change the recommended way to use summaries later.

PiperOrigin-RevId: 183876356
---
 tensorflow/contrib/tpu/BUILD                  |   1 +
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 397 ++++++++++--------
 2 files changed, 232 insertions(+), 166 deletions(-)

diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 0199313bc8..a7d54d8a0c 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -43,6 +43,7 @@ py_library(
     deps = [
         ":tpu_lib",
         ":tpu_py",
+        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 7907d0baa5..9d715bb236 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -30,6 +30,7 @@ import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.contrib.summary import summary_ops as contrib_summary
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
@@ -385,7 +386,8 @@ class TPUEstimatorSpec(
         'train_op',
         'eval_metrics',
         'export_outputs',
-        'scaffold_fn'
+        'scaffold_fn',
+        'host_call'
     ])):
   """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
 
@@ -411,6 +413,16 @@ class TPUEstimatorSpec(
 
   `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
   function should not capture any Tensors in `model_fn`.
+
+  `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
+  to pass to that function. `host_call` currently works for train() and
+  evaluate(). The function's graph is executed on the CPU on every step, so
+  there is communication overhead when sending tensors from TPU to CPU. To
+  reduce the overhead, try reducing the size of the tensors. The `tensors` are
+  concatenated along their major (batch) dimension, and so must be >= rank 1.
+  The `host_call` is useful for writing summaries with
+  @{tf.contrib.summary.create_file_writer}. Note that `host_call` does not
+  currently work if `use_tpu` is set to False.
   """
 
   def __new__(cls,
@@ -420,10 +432,15 @@ class TPUEstimatorSpec(
               train_op=None,
               eval_metrics=None,
               export_outputs=None,
-              scaffold_fn=None):
+              scaffold_fn=None,
+              host_call=None):
     """Creates a validated `TPUEstimatorSpec` instance."""
+    host_calls = {}
     if eval_metrics is not None:
-      _EvalMetrics.validate(eval_metrics)
+      host_calls['eval_metrics'] = eval_metrics
+    if host_call is not None:
+      host_calls['host_call'] = host_call
+    _OutfeedHostCall.validate(host_calls)
     return super(TPUEstimatorSpec, cls).__new__(
         cls,
         mode=mode,
@@ -432,12 +449,15 @@ class TPUEstimatorSpec(
         train_op=train_op,
         eval_metrics=eval_metrics,
         export_outputs=export_outputs,
-        scaffold_fn=scaffold_fn)
+        scaffold_fn=scaffold_fn,
+        host_call=host_call)
 
   def as_estimator_spec(self):
     """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
-    eval_metric_ops = _EvalMetrics.to_metric_metric_ops_for_cpu(
-        self.eval_metrics)
+    eval_metric_ops = None
+    if self.eval_metrics is not None:
+      eval_metric_ops = _OutfeedHostCall.create_cpu_hostcall(
+          {'eval_metrics': self.eval_metrics})['eval_metrics']
     scaffold = self.scaffold_fn() if self.scaffold_fn else None
     return model_fn_lib.EstimatorSpec(
         mode=self.mode,
@@ -490,7 +510,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
      dequeue.
   """
 
-  def __init__(self, ctx, enqueue_ops, dequeue_ops=None):
+  def __init__(self, ctx, enqueue_ops, dequeue_ops):
     self._master_job = ctx.master_job
     self._enqueue_ops = enqueue_ops
     self._dequeue_ops = dequeue_ops
@@ -504,8 +524,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
   def begin(self):
     logging.info('TPU job name %s', self._master_job)
     self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-    self._init_op = [tpu.initialize_system(job=self._master_job)]
-    self._finalize_op = [tpu.shutdown_system(job=self._master_job)]
+    self._init_ops = [tpu.initialize_system(job=self._master_job)]
+    self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
+
+    summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
+    self._init_ops.extend(summary_writer_init_ops)
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    for op in summary_writer_init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
 
   def _log_error(self, session, error):
     """Log an infeed or outfeed error.
@@ -517,8 +544,9 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     emitting a stack trace for the infeed.
 
     Args:
-      session: `tf.Session`, session to be terminated
-      error: exception that triggered logging.
+      session: `tf.Session`, session to be terminated error: exception that
+        triggered logging.
+      error: the Exception to log.
     """
     logging.warning(
         '\n\n'
@@ -594,18 +622,16 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
 
   def after_create_session(self, session, coord):
     logging.info('Init TPU system')
-    session.run(
-        self._init_op,
-        options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
+    session.run(self._init_ops,
+                options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
 
     logging.info('Start infeed thread controller')
     self._infeed_controller = _OpQueueContext(
         name='InfeedController', target=self._run_infeed, args=(session,))
 
-    if self._dequeue_ops is not None:
-      logging.info('Start outfeed thread controller')
-      self._outfeed_controller = _OpQueueContext(
-          name='OutfeedController', target=self._run_outfeed, args=(session,))
+    logging.info('Start outfeed thread controller')
+    self._outfeed_controller = _OpQueueContext(
+        name='OutfeedController', target=self._run_outfeed, args=(session,))
 
   def before_run(self, run_context):
     if self._feed_error:
@@ -618,11 +644,10 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
     self._infeed_controller.send_next_batch_signal(iterations)
 
-    if self._dequeue_ops is not None:
-      # TODO(xiejw): Refactor the outfeed dequeue into tf.while_loop.
-      logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
-                   iterations)
-      self._outfeed_controller.send_next_batch_signal(iterations)
+    # TODO(xiejw): Refactor the outfeed dequeue into tf.while_loop.
+    logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
+                 iterations)
+    self._outfeed_controller.send_next_batch_signal(iterations)
 
   def end(self, session):
     if self._session_cancel_timer:
@@ -633,12 +658,11 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     logging.info('Stop infeed thread controller')
     self._infeed_controller.join()
 
-    if self._dequeue_ops is not None:
-      logging.info('Stop output thread controller')
-      self._outfeed_controller.join()
+    logging.info('Stop output thread controller')
+    self._outfeed_controller.join()
 
     logging.info('Shutdown TPU system.')
-    session.run(self._finalize_op)
+    session.run(self._finalize_ops)
 
 
 class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
@@ -1080,6 +1104,7 @@ class _ModelFnWrapper(object):
       A Fn representing the train step for TPU.
     """
 
+    host_call = _OutfeedHostCall(self._ctx)
     captured_scaffold_fn = _CapturedObject()
 
     def train_step(loss):
@@ -1091,15 +1116,19 @@ class _ModelFnWrapper(object):
           self._call_model_fn(features, labels))
       loss, train_op = estimator_spec.loss, estimator_spec.train_op
 
+      host_call_outfeed_ops = []
       if isinstance(estimator_spec, TPUEstimatorSpec):
         captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
+        if estimator_spec.host_call is not None:
+          host_call.record({'host_call': estimator_spec.host_call})
+          host_call_outfeed_ops = host_call.create_enqueue_op()
       else:
         captured_scaffold_fn.capture(None)
 
-      with ops.control_dependencies([train_op]):
+      with ops.control_dependencies([train_op] + host_call_outfeed_ops):
         return array_ops.identity(loss)
 
-    return train_step, captured_scaffold_fn
+    return train_step, host_call, captured_scaffold_fn
 
   def convert_to_single_tpu_eval_step(self, dequeue_fn):
     """Converts user provided model_fn` as a single eval step on TPU.
@@ -1125,9 +1154,9 @@ class _ModelFnWrapper(object):
 
     Returns:
       A tuple of eval_fn and eval_metrics. The eval_fn representing the eval
-      step for TPU. and eval_metrics is an `_EvalMetrics` instance.
+      step for TPU. and eval_metrics is an `_OutfeedHostCall` instance.
     """
-    eval_metrics = _EvalMetrics(self._ctx)
+    host_calls = _OutfeedHostCall(self._ctx)
     captured_scaffold_fn = _CapturedObject()
 
     def eval_step(total_loss):
@@ -1142,13 +1171,16 @@ class _ModelFnWrapper(object):
 
       loss = tpu_estimator_spec.loss
       captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
-      eval_metrics.record(tpu_estimator_spec)
-      outfeed_ops = tpu_ops.outfeed_enqueue_tuple(eval_metrics.outfeed_tensors)
+      to_record = {}
+      to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
+      if tpu_estimator_spec.host_call is not None:
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
 
-      with ops.control_dependencies([outfeed_ops]):
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
         return math_ops.add(total_loss, loss)
 
-    return eval_step, eval_metrics, captured_scaffold_fn
+    return eval_step, host_calls, captured_scaffold_fn
 
   def _call_model_fn(self, features, labels):
     """Calls the model_fn with required parameters."""
@@ -1208,158 +1240,178 @@ class _ModelFnWrapper(object):
     return estimator_spec
 
 
-class _EvalMetrics(object):
-  """Class wraps TPUEstimator.eval_metrics."""
+class _OutfeedHostCall(object):
+  """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec."""
 
   def __init__(self, ctx):
     self._ctx = ctx
-    self._metric_fn = None
-    self._is_dict = False
-    self._tensor_keys = []
-    self._tensors = []
-    self._tensor_dtypes = []
-    self._tensor_shapes = []
-    self._recorded = False
+    self._names = []
+    # All of these are dictionaries of lists keyed on the name.
+    self._host_fns = {}
+    self._tensor_keys = collections.defaultdict(list)
+    self._tensors = collections.defaultdict(list)
+    self._tensor_dtypes = collections.defaultdict(list)
+    self._tensor_shapes = collections.defaultdict(list)
 
   @staticmethod
-  def validate(eval_metrics):
-    """Validates the `eval_metrics` in `TPUEstimatorSpec`."""
-
-    if not isinstance(eval_metrics, (tuple, list)):
-      raise ValueError('eval_metrics should be tuple or list')
-    if len(eval_metrics) != 2:
-      raise ValueError('eval_metrics should have two elements.')
-    if not callable(eval_metrics[0]):
-      raise TypeError('eval_metrics[0] should be callable.')
-    if not isinstance(eval_metrics[1], (tuple, list, dict)):
-      raise ValueError('eval_metrics[1] should be tuple or list, or dict.')
-
-    if isinstance(eval_metrics[1], (tuple, list)):
-      fn_args = util.fn_args(eval_metrics[0])
-      if len(eval_metrics[1]) != len(fn_args):
-        raise RuntimeError(
-            'In TPUEstimatorSpec.eval_metrics, length of tensors does not '
-            'match method args of metric_fn.')
+  def validate(host_calls):
+    """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`."""
+
+    for name, host_call in host_calls.items():
+      if not isinstance(host_call, (tuple, list)):
+        raise ValueError('{} should be tuple or list'.format(name))
+      if len(host_call) != 2:
+        raise ValueError('{} should have two elements.'.format(name))
+      if not callable(host_call[0]):
+        raise TypeError('{}[0] should be callable.'.format(name))
+      if not isinstance(host_call[1], (tuple, list, dict)):
+        raise ValueError('{}[1] should be tuple or list, or dict.'.format(name))
+
+      if isinstance(host_call[1], (tuple, list)):
+        fn_args = util.fn_args(host_call[0])
+        if len(host_call[1]) != len(fn_args):
+          raise RuntimeError(
+              'In TPUEstimatorSpec.{}, length of tensors does not '
+              'match method args of metric_fn.'.format(name))
 
   @staticmethod
-  def to_metric_metric_ops_for_cpu(eval_metrics):
-    """Converts `TPUEstimatorSpec.eval_metrics` to `eval_metric_ops` for CPU."""
-    if not eval_metrics:
-      return None
-
-    _EvalMetrics.validate(eval_metrics)
-
-    metric_fn, tensors = eval_metrics
+  def create_cpu_hostcall(host_calls):
+    """Runs on the host_call on CPU instead of TPU when use_tpu=False."""
+
+    _OutfeedHostCall.validate(host_calls)
+    ret = {}
+    for name, host_call in host_calls.items():
+      host_fn, tensors = host_call
+      if isinstance(tensors, (tuple, list)):
+        ret[name] = host_fn(*tensors)
+      else:
+        # Must be dict.
+        try:
+          ret[name] = host_fn(**tensors)
+        except TypeError as e:
+          logging.warning(
+              'Exception while calling %s: %s. It is likely the tensors '
+              '(%s[1]) do not match the '
+              'function\'s arguments', name, e, name)
+          raise e
+    return ret
+
+  def record(self, host_calls):
+    """Records the host_call structure."""
+
+    for name, host_call in host_calls.items():
+      host_fn, tensor_list_or_dict = host_call
+      self._names.append(name)
+      self._host_fns[name] = host_fn
+
+      if isinstance(tensor_list_or_dict, dict):
+        for (key, tensor) in six.iteritems(tensor_list_or_dict):
+          self._tensor_keys[name].append(key)
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
+      else:
+        # List or tuple.
+        self._tensor_keys[name] = None
+        for tensor in tensor_list_or_dict:
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
 
-    if isinstance(tensors, (tuple, list)):
-      return metric_fn(*tensors)
-    else:
-      # Must be dict.
-      try:
-        return metric_fn(**tensors)
-      except TypeError as e:
-        logging.warning(
-            'Exception while calling metric_fn for evalution: %s. '
-            'It is likely the tensors (eval_metrics[1]) do not match the '
-            'metric_fn arguments', e)
-        raise e
-
-  def record(self, spec):
-    """Records the eval_metrics structure in `spec`."""
-    if self._recorded:
-      raise RuntimeError('Eval metrics have been recorded already.')
-
-    self._metric_fn, tensor_list_or_dict = spec.eval_metrics
-
-    if isinstance(tensor_list_or_dict, dict):
-      self._is_dict = True
-      for (key, tensor) in six.iteritems(tensor_list_or_dict):
-        self._tensor_keys.append(key)
-        self._tensors.append(tensor)
-        self._tensor_dtypes.append(tensor.dtype)
-        self._tensor_shapes.append(tensor.shape)
-    else:
-      # List or tuple.
-      self._is_dict = False
-      self._tensors = tensor_list_or_dict
-      for tensor in tensor_list_or_dict:
-        self._tensor_dtypes.append(tensor.dtype)
-        self._tensor_shapes.append(tensor.shape)
-    self._recorded = True
+  def create_enqueue_op(self):
+    """Create the op to enqueue the recorded host_calls.
 
-  @property
-  def outfeed_tensors(self):
-    if not self._recorded:
-      raise RuntimeError('Eval metrics have not been recorded yet')
-    return self._tensors
+    Returns:
+      A list of enqueue ops, which is empty if there are no host calls.
+    """
+    if not self._names:
+      return []
 
-  def to_metric_metric_ops_for_tpu(self, dummy_update_op):
-    """Creates the eval_metric_ops now based on the TPU outfeed.
+    tensors = []
+    # TODO(jhseu): Consider deduping tensors.
+    for name in self._names:
+      tensors.extend(self._tensors[name])
+    return [tpu_ops.outfeed_enqueue_tuple(tensors)]
 
-    `eval_metric_ops` is defined in `EstimatorSpec`. From all shards, tensors
-    are dequeued from outfeed and then concatenated (along batch size dimension)
-    to form  global-like tensors. All global-like tensors are passed to the
-    metric fn.
+  def create_tpu_hostcall(self):
+    """Sends the tensors through outfeed and runs the host_fn on CPU.
 
-    Args:
-      dummy_update_op: A dummy update op.
+    The tensors are concatenated along dimension 0 to form a global tensor
+    across all shards. The concatenated function is passed to the host_fn and
+    executed on the first host.
 
     Returns:
-      A tuple of (`eval_metric_ops` and `update_ops`), where `update_ops` should
-      be invoked in Outfeed dequeue thread, which drive the outfeed dequeue and
-      update the state of metrics.
+      A dictionary mapping name to the return type of the host_call by that
+      name.
 
     Raises:
       RuntimeError: If outfeed tensor is scalar.
     """
+    if not self._names:
+      return []
 
-    num_cores = self._ctx.num_cores
-
+    ret = {}
     # For each i, dequeue_ops[i] is a list containing the tensors from all
     # shards. This list is concatenated later.
     dequeue_ops = []
-    for i in xrange(len(self._tensors)):
-      dequeue_ops.append([])
-
-    # Outfeed ops execute on each JF node.
+    tensor_dtypes = []
+    tensor_shapes = []
+    for name in self._names:
+      for _ in self._tensors[name]:
+        dequeue_ops.append([])
+      for dtype in self._tensor_dtypes[name]:
+        tensor_dtypes.append(dtype)
+      for shape in self._tensor_shapes[name]:
+        tensor_shapes.append(shape)
+
+    # Outfeed ops execute on each JF node. Note: we must constraint it such that
+    # we have at most one outfeed dequeue and enqueue.
     tpu_device_placement_fn = self._ctx.tpu_device_placement_function
-    for i in xrange(num_cores):
+    for i in xrange(self._ctx.num_cores):
       with ops.device(tpu_device_placement_fn(i)):
         outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
-            dtypes=self._tensor_dtypes, shapes=self._tensor_shapes)
+            dtypes=tensor_dtypes, shapes=tensor_shapes)
         for j, item in enumerate(outfeed_tensors):
           dequeue_ops[j].append(item)
 
-    # It is assumed evaluation always happends on single host TPU system. So,
+    # Deconstruct dequeue ops.
+    dequeue_ops_by_name = {}
+    pos = 0
+    for name in self._names:
+      dequeue_ops_by_name[name] = dequeue_ops[pos:pos+len(self._tensors[name])]
+      pos += len(self._tensors[name])
+
+    # It is assumed evaluation always happens on single host TPU system. So,
     # place all ops on tpu host if possible.
+    #
+    # TODO(jhseu): Evaluate whether this is right for summaries.
     with ops.device(self._ctx.tpu_host_placement_function(core_id=0)):
-      for i, item in enumerate(dequeue_ops):
-        if dequeue_ops[i][0].shape.ndims == 0:
-          raise RuntimeError(
-              'All tensors outfed from TPU should preseve batch size '
-              'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
-        # TODO(xiejw): Allow users to specify the axis for batch size dimension.
-        dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
-
-      if self._is_dict:
-        dequeue_ops = dict(zip(self._tensor_keys, dequeue_ops))
-        try:
-          eval_metric_ops = self._metric_fn(**dequeue_ops)
-        except TypeError as e:
-          logging.warning(
-              'Exception while calling metric_fn for evalution: %s. '
-              'It is likely the tensors (eval_metrics[1]) do not match the '
-              'metric_fn arguments', e)
-          raise e
-      else:
-        eval_metric_ops = self._metric_fn(*dequeue_ops)
-
-    eval_update_ops = []
-    for k, v in eval_metric_ops.items():
-      eval_metric_ops[k] = (v[0], dummy_update_op)
-      eval_update_ops.append(v[1])
+      for name in self._names:
+        dequeue_ops = dequeue_ops_by_name[name]
+        for i, item in enumerate(dequeue_ops):
+          if dequeue_ops[i][0].shape.ndims == 0:
+            raise RuntimeError(
+                'All tensors outfed from TPU should preserve batch size '
+                'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
+          # TODO(xiejw): Allow users to specify the axis for batch size
+          # dimension.
+          dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
+
+        if self._tensor_keys[name] is not None:
+          # The user-provided eval_metrics[1] is a dict.
+          dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops))
+          try:
+            ret[name] = self._host_fns[name](**dequeue_ops)
+          except TypeError as e:
+            logging.warning(
+                'Exception while calling %s: %s. It is likely the tensors '
+                '(%s[1]) do not match the '
+                'function\'s arguments', name, e, name)
+            raise e
+        else:
+          ret[name] = self._host_fns[name](*dequeue_ops)
 
-    return eval_metric_ops, eval_update_ops
+    return ret
 
 
 class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
@@ -1717,10 +1769,13 @@ class TPUEstimator(estimator_lib.Estimator):
             input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
 
         if mode == model_fn_lib.ModeKeys.TRAIN:
-          loss, scaffold = (
+          loss, host_call, scaffold = (
               _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
+          host_ops = host_call.create_tpu_hostcall()
+          if host_ops is None:
+            host_ops = []
           hooks = [
-              TPUInfeedOutfeedSessionHook(ctx, enqueue_ops),
+              TPUInfeedOutfeedSessionHook(ctx, enqueue_ops, host_ops),
               ExamplesPerSecondHook(ctx.global_batch_size),
               InstallSignalHandlerHook(),
               training.LoggingTensorHook(
@@ -1745,7 +1800,7 @@ class TPUEstimator(estimator_lib.Estimator):
               scaffold=scaffold)
 
         # Now eval.
-        total_loss, eval_metric_ops, scaffold = _eval_on_tpu_system(
+        total_loss, host_calls, scaffold = _eval_on_tpu_system(
             ctx, model_fn_wrapper, dequeue_fn)
         iterations_per_loop_var = _create_or_get_iterations_per_loop()
         mean_loss = math_ops.div(total_loss,
@@ -1767,10 +1822,20 @@ class TPUEstimator(estimator_lib.Estimator):
           with ops.control_dependencies(internal_ops_to_run):
             dummy_update_op = control_flow_ops.no_op()
 
-        eval_metric_ops, eval_update_ops = (
-            eval_metric_ops.to_metric_metric_ops_for_tpu(dummy_update_op))
+        host_call_ret = host_calls.create_tpu_hostcall()
+        eval_metric_ops = {}
+        eval_update_ops = []
+        for k, v in host_call_ret['eval_metrics'].items():
+          eval_metric_ops[k] = (v[0], dummy_update_op)
+          eval_update_ops.append(v[1])
+
+        if 'host_call' not in host_call_ret:
+          host_ops = []
+        else:
+          host_ops = host_call_ret['host_call']
         hooks = [
-            TPUInfeedOutfeedSessionHook(ctx, enqueue_ops, eval_update_ops),
+            TPUInfeedOutfeedSessionHook(ctx, enqueue_ops,
+                                        eval_update_ops + host_ops),
         ]
 
         return model_fn_lib.EstimatorSpec(
@@ -1788,7 +1853,7 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   num_cores = ctx.num_cores
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
-  single_tpu_eval_step, eval_metric_ops, captured_scaffold_fn = (
+  single_tpu_eval_step, host_calls, captured_scaffold_fn = (
       model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn))
 
   def multi_tpu_eval_steps_on_single_shard():
@@ -1804,7 +1869,7 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
       outputs_from_all_shards=False)
 
   scaffold = _get_scaffold(captured_scaffold_fn)
-  return loss, eval_metric_ops, scaffold
+  return loss, host_calls, scaffold
 
 
 def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
@@ -1812,7 +1877,7 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   num_cores = ctx.num_cores
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
-  single_tpu_train_step, captured_scaffold_fn = (
+  single_tpu_train_step, host_call, captured_scaffold_fn = (
       model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
 
   def multi_tpu_train_steps_on_single_shard():
@@ -1828,7 +1893,7 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
       outputs_from_all_shards=False)
 
   scaffold = _get_scaffold(captured_scaffold_fn)
-  return loss, scaffold
+  return loss, host_call, scaffold
 
 
 def _wrap_computation_in_while_loop(device, op_fn):
-- 
GitLab


From d56883617c07125eb9d488bc73baccaacd55f48b Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Tue, 30 Jan 2018 13:02:25 -0800
Subject: [PATCH 411/966] Enable bulk restoration by default.

This enables loading multiple tensors in single call, allowing for better buffering and reduced load on distributed filesystems.

PiperOrigin-RevId: 183878169
---
 tensorflow/python/training/saver.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 2c59b82ebe..abc700b810 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -1229,7 +1229,7 @@ class Saver(object):
         The `saver_def` proto should be the one returned by the
         `as_saver_def()` call of the `Saver` that was created for that `Graph`.
       builder: Optional `SaverBuilder` to use if a `saver_def` was not provided.
-        Defaults to `BaseSaverBuilder()`.
+        Defaults to `BulkSaverBuilder()`.
       defer_build: If `True`, defer adding the save and restore ops to the
         `build()` call. In that case `build()` should be called before
         finalizing the graph or using the saver.
@@ -1309,7 +1309,7 @@ class Saver(object):
 
     if not self.saver_def or context.in_eager_mode():
       if self._builder is None:
-        self._builder = BaseSaverBuilder(self._write_version)
+        self._builder = BulkSaverBuilder(self._write_version)
 
       if self._var_list is None:
         # pylint: disable=protected-access
-- 
GitLab


From c68446dcfa4b096c4f248c1c02d54a8077e514c5 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Tue, 30 Jan 2018 13:36:21 -0800
Subject: [PATCH 412/966] Remove unused targets

---
 tensorflow/contrib/tensorrt/BUILD | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 3a214d2e86..d694df85a5 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -228,18 +228,6 @@ tf_cc_test(
     ],
 )
 
-filegroup(
-    name = "cppfiles",
-    srcs = glob(["**/*.cc"]),
-    visibility = ["//visibility:private"],
-)
-
-filegroup(
-    name = "headers",
-    srcs = glob(["**/*.h"]),
-    visibility = ["//visibility:private"],
-)
-
 filegroup(
     name = "all_files",
     srcs = glob(
-- 
GitLab


From 527428241ca35b0743ed9cc8c955c8b8c3a51c86 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Tue, 30 Jan 2018 13:37:12 -0800
Subject: [PATCH 413/966] Run clang-format on c++ files

---
 tensorflow/contrib/tensorrt/log/trt_logger.h  |  4 +--
 .../contrib/tensorrt/segment/segment_test.cc  | 29 ++++++++++---------
 .../contrib/tensorrt/shape_fn/trt_shfn.h      |  2 +-
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index 0dc2b1708b..32bfcf62af 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -36,7 +36,7 @@ class Logger : public nvinfer1::ILogger {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif // GOOGLE_TENSORRT
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
 
 #endif  // TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index 5f99ba570b..bac115facd 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -155,10 +155,11 @@ TEST_F(SegmentTest, Simple) {
   ASSERT_TRUE(GetGraphDef(graph, &graph_def));
 
   SegmentNodesVector segments;
-  ASSERT_EQ(SegmentGraph(graph_def, MakeCandidateFn({"add0", "add1", "add2",
-                                                     "add3", "add4"}),
-                         default_options_, &segments),
-            tensorflow::Status::OK());
+  ASSERT_EQ(
+      SegmentGraph(graph_def,
+                   MakeCandidateFn({"add0", "add1", "add2", "add3", "add4"}),
+                   default_options_, &segments),
+      tensorflow::Status::OK());
 
   // Expect all Add operations to be collapsed into a single segment
   ASSERT_EQ(segments.size(), 1);
@@ -264,11 +265,11 @@ TEST_F(SegmentTest, Multiple) {
   ASSERT_TRUE(GetGraphDef(graph, &graph_def));
 
   SegmentNodesVector segments;
-  ASSERT_EQ(
-      SegmentGraph(graph_def, MakeCandidateFn({"add0", "add1", "add2", "add3",
-                                               "add4", "add6", "add7", "add8"}),
-                   default_options_, &segments),
-      tensorflow::Status::OK());
+  ASSERT_EQ(SegmentGraph(graph_def,
+                         MakeCandidateFn({"add0", "add1", "add2", "add3",
+                                          "add4", "add6", "add7", "add8"}),
+                         default_options_, &segments),
+            tensorflow::Status::OK());
 
   // Expect two subgraphs
   EXPECT_EQ(segments.size(), 2);
@@ -333,11 +334,11 @@ TEST_F(SegmentTest, BigIfElse) {
   ASSERT_TRUE(GetGraphDef(graph, &graph_def));
 
   SegmentNodesVector segments;
-  ASSERT_EQ(
-      SegmentGraph(graph_def, MakeCandidateFn({"add0", "add1", "add3", "add4",
-                                               "add5", "add6", "add7"}),
-                   default_options_, &segments),
-      tensorflow::Status::OK());
+  ASSERT_EQ(SegmentGraph(graph_def,
+                         MakeCandidateFn({"add0", "add1", "add3", "add4",
+                                          "add5", "add6", "add7"}),
+                         default_options_, &segments),
+            tensorflow::Status::OK());
 
   // Expect 2 subgraphs
   EXPECT_EQ(segments.size(), 2);
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h
index 9ca4ad0d55..4b50f66699 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace shape_inference {
-- 
GitLab


From 21cd5e381769b388b35437e0143af23235c4b5bf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 13:10:54 -0800
Subject: [PATCH 414/966] Add missing dependency.

PiperOrigin-RevId: 183879566
---
 tensorflow/core/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 455da05738..7a0c0ea3fe 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1898,6 +1898,7 @@ cc_library(
 tf_cuda_library(
     name = "cuda_device_functions",
     hdrs = ["util/cuda_device_functions.h"],
+    cuda_deps = ["//third_party_gpus/cuda:cuda_headers"],
     visibility = ["//visibility:public"],
     deps = [":framework_lite"],
 )
-- 
GitLab


From d52e0e0269b72624c1443e8eb78467dd99844558 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Tue, 30 Jan 2018 13:20:13 -0800
Subject: [PATCH 415/966] Eager: Update documentation to reflect that you can
 use TensorFlow 1.5

PiperOrigin-RevId: 183880991
---
 tensorflow/contrib/eager/README.md            | 24 ++------------
 .../contrib/eager/python/g3doc/guide.md       | 33 +++++++++++--------
 2 files changed, 21 insertions(+), 36 deletions(-)

diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md
index 09242ee47d..9d2ca07c3a 100644
--- a/tensorflow/contrib/eager/README.md
+++ b/tensorflow/contrib/eager/README.md
@@ -41,28 +41,8 @@ support for distributed and multi-GPU training and CPU performance.
 
 ## Installation
 
-Since eager execution is not yet part of a TensorFlow release, using it requires
-either [building from source](https://www.tensorflow.org/install/install_sources)
-or the latest nightly builds. The nightly builds are available as:
-
-- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
-
-- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
-
-For example, to run the latest nightly docker image:
-
-```sh
-# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
-nvidia-docker pull tensorflow/tensorflow:nightly-gpu
-nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
-
-# If you do not have a GPU, use the CPU-only image
-docker pull tensorflow/tensorflow:nightly
-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-```
-
-And then visit http://localhost:8888 in your browser for a Jupyter notebook
-environment. Try out the notebooks below.
+Eager execution is included in TensorFlow versions 1.5 and above.
+Installation instructions at https://www.tensorflow.org/install/
 
 ## Documentation
 
diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md
index 7eea93ce1f..ffc1d0332e 100644
--- a/tensorflow/contrib/eager/python/g3doc/guide.md
+++ b/tensorflow/contrib/eager/python/g3doc/guide.md
@@ -19,29 +19,34 @@ to models defined without using eager execution.
 
 ## Installation
 
-Eager execution is **not** included in the latest release (version 1.4) of
-TensorFlow. To use it, you will need to [build TensorFlow from
-source](https://www.tensorflow.org/install/install_sources) or install the
-nightly builds.
+Eager execution is included in TensorFlow versions 1.5 and above.
+Installation instructions at https://www.tensorflow.org/install/
 
-For example, the nightly builds can be installed using `pip`:
+The contents of this guide are compatible with TensorFlow 1.5.
+However, if you run into bugs that are fixed in source but not the
+release, you may want to either either [building from
+source](https://www.tensorflow.org/install/install_sources)
+or the try latest nightly builds. The nightly builds are available as:
 
--   `pip install tf-nightly` (for CPU-only TensorFlow)
--   `pip install tf-nightly-gpu` (for GPU-enabled TensorFlow)
+- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
 
-Or using `docker`, with [Jupyter Notebook](http://jupyter.org/) support:
+- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
+
+For example, to run the latest nightly docker image:
 
 ```sh
-# For CPU-only TensorFlow
+# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
+docker pull tensorflow/tensorflow:nightly-gpu
+docker run --runtime=nvidia -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
+
+# If you do not have a GPU, use the CPU-only image
 docker pull tensorflow/tensorflow:nightly
 docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-
-# For GPU-enabled TensorFlow:
-# (Requires https://github.com/NVIDIA/nvidia-docker)
-nvidia-docker pull tensorflow/tensorflow:nightly-gpu
-nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
 ```
 
+And then visit http://localhost:8888 in your browser for a Jupyter notebook
+environment.
+
 ## Getting Started
 
 With TensorFlow installed, eager execution is enabled via a single call:
-- 
GitLab


From 548df15375488fc06ff663670f88734f3ece4814 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Tue, 30 Jan 2018 13:26:51 -0800
Subject: [PATCH 416/966] [tf.data] Add `IteratorContext::allocator()`.

This enables the various iterator implementations to use the actual allocator for the device on which they are running, rather than defaulting to `cpu_allocator()` (which is typically a plain malloc). In future, this will enable allocating iterator outputs in CUDA-pinned memory (and GPU memory).

PERFORMANCE NOTE: In sessions where `ConfigProto.force_gpu_compatible == True`, this change has the effect of allocating all input pipeline tensors in CUDA-pinned memory. Previous if this flag was set, only the tensors allocated during function execution would be allocated in this space, and other tensors (e.g. the result of a `Dataset.batch()` would be allocated using `cpu_allocator()` (i.e. `malloc()`). This change should lead to more efficient communication between a host-side input pipeline and GPUs, but it may also create more pressure on the CUDA host allocator (whose default maximum size is 64GB). The "TF_CUDA_HOST_MEM_LIMIT_IN_MB" environment variable can be used to override this value.

This change is a starting point for working on issue #13610.

PiperOrigin-RevId: 183881907
---
 tensorflow/core/kernels/data/BUILD                    |  1 +
 tensorflow/core/kernels/data/batch_dataset_op.cc      |  2 +-
 tensorflow/core/kernels/data/dataset.cc               |  5 +++++
 tensorflow/core/kernels/data/dataset.h                |  5 +++++
 .../kernels/data/dense_to_sparse_batch_dataset_op.cc  |  6 +++---
 .../core/kernels/data/map_and_batch_dataset_op.cc     | 11 ++++++-----
 .../core/kernels/data/padded_batch_dataset_op.cc      |  2 +-
 tensorflow/core/kernels/data/random_dataset_op.cc     |  2 +-
 tensorflow/core/kernels/data/range_dataset_op.cc      |  2 +-
 tensorflow/core/kernels/data/reader_dataset_ops.cc    |  6 +++---
 tensorflow/core/kernels/data/sql/BUILD                |  1 +
 tensorflow/core/kernels/data/sql/query_connection.h   |  4 +++-
 .../core/kernels/data/sql/sqlite_query_connection.cc  |  7 +++++--
 .../core/kernels/data/sql/sqlite_query_connection.h   |  2 +-
 tensorflow/core/kernels/data/sql_dataset_ops.cc       |  4 ++--
 .../core/kernels/data/tensor_slice_dataset_op.cc      |  2 +-
 16 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 45505ef716..cdb4023861 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -49,6 +49,7 @@ cc_library(
     srcs = ["dataset.cc"],
     hdrs = ["dataset.h"],
     deps = [
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 0853362b26..7fa67efb9e 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -144,7 +144,7 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
           const Tensor& first_element = batch_elements[0][component_index];
           TensorShape batch_component_shape({num_batch_elements});
           batch_component_shape.AppendShape(first_element.shape());
-          Tensor batch_component(cpu_allocator(), first_element.dtype(),
+          Tensor batch_component(ctx->allocator({}), first_element.dtype(),
                                  batch_component_shape);
           // Build the output tuple component by copying one slice
           // from each input element in the batch.
diff --git a/tensorflow/core/kernels/data/dataset.cc b/tensorflow/core/kernels/data/dataset.cc
index 2ea6875567..d18cb16018 100644
--- a/tensorflow/core/kernels/data/dataset.cc
+++ b/tensorflow/core/kernels/data/dataset.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/node_builder.h"
 
@@ -264,6 +265,10 @@ void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
   MakeDataset(ctx, input, another_input, output);
 }
 
+Allocator* IteratorContext::allocator(AllocatorAttributes attrs) {
+  return params_.lib->device()->GetAllocator(attrs);
+}
+
 const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
 const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] =
     "_DATASET_GRAPH_OUTPUT_NODE";
diff --git a/tensorflow/core/kernels/data/dataset.h b/tensorflow/core/kernels/data/dataset.h
index 2ef31ddfaa..08c3ca82ea 100644
--- a/tensorflow/core/kernels/data/dataset.h
+++ b/tensorflow/core/kernels/data/dataset.h
@@ -272,6 +272,9 @@ class IteratorContext {
     // The FunctionLibraryRuntime object to be used to make function calls.
     FunctionLibraryRuntime* lib = nullptr;
     std::shared_ptr<const FunctionLibraryDefinition> function_library = nullptr;
+
+    // The Allocator to be used to allocate the output of an iterator.
+    Allocator* allocator = nullptr;
   };
 
   explicit IteratorContext(Params params) : params_(std::move(params)) {}
@@ -298,6 +301,8 @@ class IteratorContext {
 
   void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; }
 
+  Allocator* allocator(AllocatorAttributes attrs);
+
  private:
   Params params_;
 };
diff --git a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
index e7224bb547..132808a5f1 100644
--- a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
@@ -155,7 +155,7 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
 
         // Determine the size of the output tensors:
         // * dense_shape will be [`row_shape + 1`].
-        Tensor dense_shape(cpu_allocator(), DT_INT64, {row_ndims + 1});
+        Tensor dense_shape(ctx->allocator({}), DT_INT64, {row_ndims + 1});
         auto dense_shape_vec = dense_shape.vec<int64>();
         for (size_t i = 0; i < row_ndims; ++i) {
           if (row_shape.dim_size(i) == -1) {
@@ -215,10 +215,10 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
 
         // * indices will be [`total_elements`, `row_shape + 1`].
         // * values will be [`total_elements`].
-        Tensor indices(cpu_allocator(), DT_INT64,
+        Tensor indices(ctx->allocator({}), DT_INT64,
                        {total_elements, row_ndims + 1});
         Tensor values(
-            cpu_allocator(),
+            ctx->allocator({}),
             DatasetIterator<Dataset<T>>::dataset()->input_->output_dtypes()[0],
             {total_elements});
         auto indices_matrix = indices.matrix<int64>();
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index c529f671f2..9ce263732f 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -183,7 +183,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
               TensorShape component_shape(
                   batch_results_[current_batch_index_].output[i].shape());
               component_shape.set_dim(0, num_elements);
-              Tensor component(cpu_allocator(), output[i].dtype(),
+              Tensor component(ctx->allocator({}), output[i].dtype(),
                                component_shape);
               TF_RETURN_IF_ERROR(
                   CopyPartialBatch(&component, output[i], num_elements));
@@ -244,7 +244,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      void EnsureOutputAllocated(BatchResult* batch_result,
+      void EnsureOutputAllocated(IteratorContext* ctx,
+                                 BatchResult* batch_result,
                                  const std::vector<Tensor>& return_values) {
         mutex_lock l(batch_result->mu);
         if (batch_result->output_allocated) {
@@ -254,7 +255,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         for (size_t i = 0; i < num_components; ++i) {
           TensorShape component_shape({dataset()->batch_size_});
           component_shape.AppendShape(return_values[i].shape());
-          Tensor component(cpu_allocator(), return_values[i].dtype(),
+          Tensor component(ctx->allocator({}), return_values[i].dtype(),
                            component_shape);
           batch_result->output.emplace_back(std::move(component));
         }
@@ -285,10 +286,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
               dataset()->captured_func_->RunAsync(
                   ctx, std::move(input_element), &result->return_values,
                   [this, ctx, result, batch_result, offset](Status ret_status) {
-                    delete ctx;
                     result->status.Update(ret_status);
                     if (ret_status.ok()) {
-                      EnsureOutputAllocated(batch_result,
+                      EnsureOutputAllocated(ctx, batch_result,
                                             result->return_values);
                       const size_t num_components =
                           result->return_values.size();
@@ -318,6 +318,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                         }
                       }
                     }
+                    delete ctx;
                     // NOTE(mrry): We clear the return values here to release
                     // any memory associated with them and to paralellize the
                     // destruction of the tensors (which can be surprisingly
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index 346eca0bb2..4fe4e8e294 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -376,7 +376,7 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
 
           // 2. Copy each batch element to the appropriate location in
           // the output component tensor.
-          Tensor batch_component(cpu_allocator(),
+          Tensor batch_component(ctx->allocator({}),
                                  output_dtypes()[component_index],
                                  batch_component_shape);
           TF_RETURN_IF_ERROR(SetElementZero(
diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/random_dataset_op.cc
index bc638864b0..210b9ad1b8 100644
--- a/tensorflow/core/kernels/data/random_dataset_op.cc
+++ b/tensorflow/core/kernels/data/random_dataset_op.cc
@@ -99,7 +99,7 @@ class RandomDatasetOp : public DatasetOpKernel {
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
-        Tensor value_tensor(cpu_allocator(), DT_INT64, {});
+        Tensor value_tensor(ctx->allocator({}), DT_INT64, {});
         value_tensor.scalar<int64>()() = Random();
         out_tensors->emplace_back(std::move(value_tensor));
         *end_of_sequence = false;
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index d0bc61acd9..b57518e678 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -100,7 +100,7 @@ class RangeDatasetOp : public DatasetOpKernel {
           *end_of_sequence = true;
           return Status::OK();
         }
-        Tensor value_tensor(cpu_allocator(), DT_INT64, {});
+        Tensor value_tensor(ctx->allocator({}), DT_INT64, {});
         value_tensor.scalar<int64>()() = next_;
         out_tensors->emplace_back(std::move(value_tensor));
         *end_of_sequence = false;
diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
index aa39fffc2e..34d7d9f914 100644
--- a/tensorflow/core/kernels/data/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -141,7 +141,7 @@ class TextLineDatasetOp : public DatasetOpKernel {
 
             if (s.ok()) {
               // Produce the line as output.
-              Tensor line_tensor(cpu_allocator(), DT_STRING, {});
+              Tensor line_tensor(ctx->allocator({}), DT_STRING, {});
               line_tensor.scalar<string>()() = line_contents;
               out_tensors->emplace_back(std::move(line_tensor));
               *end_of_sequence = false;
@@ -384,7 +384,7 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               TF_RETURN_IF_ERROR(
                   input_buffer_->ReadNBytes(dataset()->record_bytes_, &record));
               // Produce the record as output.
-              Tensor record_tensor(cpu_allocator(), DT_STRING, {});
+              Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
               record_tensor.scalar<string>()() = record;
               out_tensors->emplace_back(std::move(record_tensor));
               *end_of_sequence = false;
@@ -589,7 +589,7 @@ class TFRecordDatasetOp : public DatasetOpKernel {
         do {
           // We are currently processing a file, so try to read the next record.
           if (reader_) {
-            Tensor result_tensor(cpu_allocator(), DT_STRING, {});
+            Tensor result_tensor(ctx->allocator({}), DT_STRING, {});
             Status s = reader_->ReadRecord(&result_tensor.scalar<string>()());
             if (s.ok()) {
               out_tensors->emplace_back(std::move(result_tensor));
diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/sql/BUILD
index 0286825af3..f4698bdaf7 100644
--- a/tensorflow/core/kernels/data/sql/BUILD
+++ b/tensorflow/core/kernels/data/sql/BUILD
@@ -33,6 +33,7 @@ cc_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/data:dataset",
         "//tensorflow/core/lib/db:sqlite",
     ],
 )
diff --git a/tensorflow/core/kernels/data/sql/query_connection.h b/tensorflow/core/kernels/data/sql/query_connection.h
index f31017bd19..e9ffca202f 100644
--- a/tensorflow/core/kernels/data/sql/query_connection.h
+++ b/tensorflow/core/kernels/data/sql/query_connection.h
@@ -19,6 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+class IteratorContext;
+
 namespace sql {
 // This interface allows a user to connect to a database, execute a query, and
 // iterate over the result set, putting the results into an output tensor.
@@ -56,7 +58,7 @@ class QueryConnection {
   // If there are no more rows in the result set, then instead `true` will be
   // stored in `*end_of_sequence`, and the content of `*out_tensors` will be
   // undefined.
-  virtual Status GetNext(std::vector<Tensor>* out_tensors,
+  virtual Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                          bool* end_of_sequence) = 0;
 };
 
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
index 029a0aab97..7cd07bd8ec 100644
--- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
+++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
 
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace tensorflow {
@@ -48,14 +49,16 @@ Status SqliteQueryConnection::Close() {
   return Status::OK();
 }
 
-Status SqliteQueryConnection::GetNext(std::vector<Tensor>* out_tensors,
+Status SqliteQueryConnection::GetNext(IteratorContext* ctx,
+                                      std::vector<Tensor>* out_tensors,
                                       bool* end_of_sequence) {
   if (!stmt_) TF_RETURN_IF_ERROR(PrepareQuery());
   TF_RETURN_IF_ERROR(stmt_.Step(end_of_sequence));
   if (!*end_of_sequence) {
     for (int i = 0; i < column_count_; i++) {
       DataType dt = output_types_[i];
-      Tensor tensor(cpu_allocator(), dt, {});
+      // TODO(mrry): Pass in the `IteratorContext::allocator()`.
+      Tensor tensor(ctx->allocator({}), dt, {});
       FillTensorWithResultSetEntry(dt, i, &tensor);
       out_tensors->emplace_back(std::move(tensor));
     }
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
index 787c17d6c0..81b19530b7 100644
--- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
+++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
@@ -32,7 +32,7 @@ class SqliteQueryConnection : public QueryConnection {
   Status Open(const string& data_source_name, const string& query,
               const DataTypeVector& output_types) override;
   Status Close() override;
-  Status GetNext(std::vector<Tensor>* out_tensors,
+  Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) override;
 
  private:
diff --git a/tensorflow/core/kernels/data/sql_dataset_ops.cc b/tensorflow/core/kernels/data/sql_dataset_ops.cc
index 7230219080..d50e9c9cf9 100644
--- a/tensorflow/core/kernels/data/sql_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/sql_dataset_ops.cc
@@ -116,7 +116,7 @@ class SqlDatasetOp : public DatasetOpKernel {
         }
       }
 
-      Status GetNextInternal(IteratorContext* /*ctx*/,
+      Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
@@ -132,7 +132,7 @@ class SqlDatasetOp : public DatasetOpKernel {
             return s;
           }
         }
-        return query_connection_->GetNext(out_tensors, end_of_sequence);
+        return query_connection_->GetNext(ctx, out_tensors, end_of_sequence);
       }
 
      private:
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 18adae1ea3..d5be4c7780 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -117,7 +117,7 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
           out_tensors->reserve(dataset()->tensors_.size());
           for (int i = 0; i < dataset()->tensors_.size(); ++i) {
             const Tensor& t = dataset()->tensors_[i];
-            Tensor t_slice(cpu_allocator(), t.dtype(),
+            Tensor t_slice(ctx->allocator({}), t.dtype(),
                            TensorShape(dataset()->shapes_[i].dim_sizes()));
             TF_RETURN_IF_ERROR(batch_util::CopySliceToElement(t, &t_slice, i_));
             out_tensors->emplace_back(std::move(t_slice));
-- 
GitLab


From 36f3a3b31ea9c32c64f1f4af543c75692d338876 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 13:58:05 -0800
Subject: [PATCH 417/966] Add and Mul support broadcasting.

PiperOrigin-RevId: 183886920
---
 tensorflow/contrib/lite/kernels/add.cc        | 102 ++++++++++++------
 tensorflow/contrib/lite/kernels/add_test.cc   |  54 ++++++++--
 tensorflow/contrib/lite/kernels/mul.cc        |  99 +++++++++++------
 tensorflow/contrib/lite/kernels/mul_test.cc   |  59 ++++++++--
 .../testing/generated_examples_zip_test.cc    |   4 +-
 5 files changed, 235 insertions(+), 83 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
index fb5764f280..63ea89df56 100644
--- a/tensorflow/contrib/lite/kernels/add.cc
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,43 +61,56 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
 void EvalAddFloat(TfLiteContext* context, TfLiteNode* node,
-                  TfLiteAddParams* params, TfLiteTensor* input1,
-                  TfLiteTensor* input2, TfLiteTensor* output) {
+                  TfLiteAddParams* params, const OpData* data,
+                  TfLiteTensor* input1, TfLiteTensor* input2,
+                  TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_ADD(type)                                        \
-  type::Add(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_ADD(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
   if (kernel_type == kReference) {
-    TF_LITE_ADD(reference_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_ADD(reference_ops, BroadcastAdd);
+    } else {
+      TF_LITE_ADD(reference_ops, Add);
+    }
   } else {
-    TF_LITE_ADD(optimized_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_ADD(optimized_ops, BroadcastAdd);
+    } else {
+      TF_LITE_ADD(optimized_ops, Add);
+    }
   }
 #undef TF_LITE_ADD
 }
 
 template <KernelType kernel_type>
 void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
-                      TfLiteAddParams* params, TfLiteTensor* input1,
-                      TfLiteTensor* input2, TfLiteTensor* output) {
+                      TfLiteAddParams* params, const OpData* data,
+                      TfLiteTensor* input1, TfLiteTensor* input2,
+                      TfLiteTensor* output) {
   auto input1_offset = -input1->params.zero_point;
   auto input2_offset = -input2->params.zero_point;
   auto output_offset = output->params.zero_point;
@@ -112,19 +141,20 @@ void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRangeUint8(params->activation, output,
                                 &output_activation_min, &output_activation_max);
 
-#define TF_LITE_ADD(type)                                                   \
-  type::BroadcastAdd(                                                       \
-      left_shift, GetTensorData<uint8_t>(input1), GetTensorDims(input1),    \
-      input1_offset, input1_multiplier, input1_shift,                       \
-      GetTensorData<uint8_t>(input2), GetTensorDims(input2), input2_offset, \
-      input2_multiplier, input2_shift, output_offset, output_multiplier,    \
-      output_shift, output_activation_min, output_activation_max,           \
-      GetTensorData<uint8_t>(output), GetTensorDims(output));
-
+#define TF_LITE_ADD(type, opname)                                            \
+  type::opname(left_shift, GetTensorData<uint8_t>(input1),                   \
+               GetTensorDims(input1), input1_offset, input1_multiplier,      \
+               input1_shift, GetTensorData<uint8_t>(input2),                 \
+               GetTensorDims(input2), input2_offset, input2_multiplier,      \
+               input2_shift, output_offset, output_multiplier, output_shift, \
+               output_activation_min, output_activation_max,                 \
+               GetTensorData<uint8_t>(output), GetTensorDims(output));
+  // The quantized version of Add doesn't support activations, so we
+  // always use BroadcastAdd.
   if (kernel_type == kReference) {
-    TF_LITE_ADD(reference_ops);
+    TF_LITE_ADD(reference_ops, BroadcastAdd);
   } else {
-    TF_LITE_ADD(optimized_ops);
+    TF_LITE_ADD(optimized_ops, BroadcastAdd);
   }
 #undef TF_LITE_ADD
 }
@@ -132,15 +162,17 @@ void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalAddFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalAddFloat<kernel_type>(context, node, params, data, input1, input2,
+                              output);
   } else if (output->type == kTfLiteUInt8) {
-    EvalAddQuantized<kernel_type>(context, node, params, input1, input2,
+    EvalAddQuantized<kernel_type>(context, node, params, data, input1, input2,
                                   output);
   } else {
     context->ReportError(context,
@@ -154,19 +186,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace add
 
 TfLiteRegistration* Register_ADD_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                  add::Eval<add::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_ADD_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                  add::Eval<add::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_ADD_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                  add::Eval<add::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/add_test.cc b/tensorflow/contrib/lite/kernels/add_test.cc
index 306dfc3e80..956d05bed5 100644
--- a/tensorflow/contrib/lite/kernels/add_test.cc
+++ b/tensorflow/contrib/lite/kernels/add_test.cc
@@ -25,10 +25,11 @@ using ::testing::ElementsAreArray;
 
 class BaseAddOpModel : public SingleOpModel {
  public:
-  BaseAddOpModel(const TensorData& input, const TensorData& output,
+  BaseAddOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
                  ActivationFunctionType activation_type) {
-    input1_ = AddInput(input);
-    input2_ = AddInput(input);
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
     output_ = AddOutput(output);
     SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
                  CreateAddOptions(builder_, activation_type).Union());
@@ -70,6 +71,7 @@ float GetTolerance(int min, int max) {
 
 TEST(FloatAddOpModel, NoActivation) {
   FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
                     {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
@@ -78,9 +80,9 @@ TEST(FloatAddOpModel, NoActivation) {
 }
 
 TEST(FloatAddOpModel, ActivationRELU_N1_TO_1) {
-  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
-                    {TensorType_FLOAT32, {}},
-                    ActivationFunctionType_RELU_N1_TO_1);
+  FloatAddOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
   m.Invoke();
@@ -92,6 +94,7 @@ TEST(FloatAddOpModel, VariousInputShapes) {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
                       {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
     m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
     m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1});
@@ -102,6 +105,23 @@ TEST(FloatAddOpModel, VariousInputShapes) {
   }
 }
 
+TEST(FloatAddOpModel, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1})))
+        << "With shape number " << i;
+  }
+}
+
 TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::initializer_list<float>> inputs1 = {
@@ -112,6 +132,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
       {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {}, -1.0, 1.0},
                           ActivationFunctionType_NONE);
     m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
@@ -133,6 +154,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
                                                        {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {}, -1.0, 1.0},
                           ActivationFunctionType_RELU_N1_TO_1);
     m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
@@ -150,6 +172,7 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
                           {TensorType_UINT8, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
     m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
@@ -162,6 +185,25 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
   }
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1},
+                                                kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 int main(int argc, char** argv) {
diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/contrib/lite/kernels/mul.cc
index 81c73f2523..54575019de 100644
--- a/tensorflow/contrib/lite/kernels/mul.cc
+++ b/tensorflow/contrib/lite/kernels/mul.cc
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,43 +61,56 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
-               TfLiteMulParams* params, TfLiteTensor* input1,
-               TfLiteTensor* input2, TfLiteTensor* output) {
+               TfLiteMulParams* params, const OpData* data,
+               TfLiteTensor* input1, TfLiteTensor* input2,
+               TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_MUL(type)                                        \
-  type::Mul(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_MUL(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
   if (kernel_type == kReference) {
-    TF_LITE_MUL(reference_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_MUL(reference_ops, BroadcastMul);
+    } else {
+      TF_LITE_MUL(reference_ops, Mul);
+    }
   } else {
-    TF_LITE_MUL(optimized_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_MUL(optimized_ops, BroadcastMul);
+    } else {
+      TF_LITE_MUL(optimized_ops, Mul);
+    }
   }
 #undef TF_LITE_MUL
 }
 
 template <KernelType kernel_type>
 void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                   TfLiteMulParams* params, TfLiteTensor* input1,
-                   TfLiteTensor* input2, TfLiteTensor* output) {
+                   TfLiteMulParams* params, const OpData* data,
+                   TfLiteTensor* input1, TfLiteTensor* input2,
+                   TfLiteTensor* output) {
   auto input1_offset = -input1->params.zero_point;
   auto input2_offset = -input2->params.zero_point;
   auto output_offset = output->params.zero_point;
@@ -98,17 +127,19 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRangeUint8(params->activation, output,
                                 &output_activation_min, &output_activation_max);
 
-#define TF_LITE_MUL(type)                                                    \
-  type::BroadcastMul(GetTensorData<uint8_t>(input1), GetTensorDims(input1),  \
-                     input1_offset, GetTensorData<uint8_t>(input2),          \
-                     GetTensorDims(input2), input2_offset, output_offset,    \
-                     output_multiplier, output_shift, output_activation_min, \
-                     output_activation_max, GetTensorData<uint8_t>(output),  \
-                     GetTensorDims(output));
+#define TF_LITE_MUL(type, opname)                                      \
+  type::opname(GetTensorData<uint8_t>(input1), GetTensorDims(input1),  \
+               input1_offset, GetTensorData<uint8_t>(input2),          \
+               GetTensorDims(input2), input2_offset, output_offset,    \
+               output_multiplier, output_shift, output_activation_min, \
+               output_activation_max, GetTensorData<uint8_t>(output),  \
+               GetTensorDims(output));
+  // The quantized version of Mul doesn't support activations, so we
+  // always use BroadcastMul.
   if (kernel_type == kReference) {
-    TF_LITE_MUL(reference_ops);
+    TF_LITE_MUL(reference_ops, BroadcastMul);
   } else {
-    TF_LITE_MUL(optimized_ops);
+    TF_LITE_MUL(optimized_ops, BroadcastMul);
   }
 #undef TF_LITE_MUL
 }
@@ -116,15 +147,17 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalFloat<kernel_type>(context, node, params, data, input1, input2, output);
   } else if (output->type == kTfLiteUInt8) {
-    EvalQuantized<kernel_type>(context, node, params, input1, input2, output);
+    EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
+                               output);
   } else {
     context->ReportError(context,
                          "Mul only supports FLOAT32 and quantized UINT8 now.");
@@ -137,19 +170,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace mul
 
 TfLiteRegistration* Register_MUL_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+  static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare,
                                  mul::Eval<mul::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_MUL_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+  static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare,
                                  mul::Eval<mul::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_MUL_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+  static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare,
                                  mul::Eval<mul::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/mul_test.cc b/tensorflow/contrib/lite/kernels/mul_test.cc
index 8838b300c0..f1a30f8263 100644
--- a/tensorflow/contrib/lite/kernels/mul_test.cc
+++ b/tensorflow/contrib/lite/kernels/mul_test.cc
@@ -25,10 +25,11 @@ using ::testing::ElementsAreArray;
 
 class BaseMulOpModel : public SingleOpModel {
  public:
-  BaseMulOpModel(TensorData input, TensorData output,
+  BaseMulOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
                  ActivationFunctionType activation_type) {
-    input1_ = AddInput(input);
-    input2_ = AddInput(input);
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
     output_ = AddOutput(output);
     SetBuiltinOp(BuiltinOperator_MUL, BuiltinOptions_MulOptions,
                  CreateMulOptions(builder_, activation_type).Union());
@@ -70,6 +71,7 @@ class QuantizedMulOpModel : public BaseMulOpModel {
 
 TEST(FloatMulOpTest, NoActivation) {
   FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
                     {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
@@ -79,9 +81,9 @@ TEST(FloatMulOpTest, NoActivation) {
 }
 
 TEST(FloatMulOpTest, ActivationRELU_N1_TO_1) {
-  FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
-                    {TensorType_FLOAT32, {}},
-                    ActivationFunctionType_RELU_N1_TO_1);
+  FloatMulOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 5});
   m.Invoke();
@@ -94,6 +96,7 @@ TEST(FloatMulOpTest, VariousInputShapes) {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
                       {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
     m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
     m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1});
@@ -105,8 +108,26 @@ TEST(FloatMulOpTest, VariousInputShapes) {
   }
 }
 
+TEST(FloatMulOpTest, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-0.2, 0.02, 0.07, 0.08, 0.11, 0.2})))
+        << "With shape number " << i;
+  }
+}
+
 TEST(QuantizedMulOpTest, NoActivation) {
   QuantizedMulOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                        {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                         {TensorType_UINT8, {}, -1.0, 1.0},
                         ActivationFunctionType_NONE);
   m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
@@ -117,6 +138,32 @@ TEST(QuantizedMulOpTest, NoActivation) {
                                               kQuantizedTolerance)));
 }
 
+// for quantized Mul, the error shouldn't exceed 2*step
+float GetTolerance(int min, int max) {
+  float kQuantizedStep = (max - min) / 255.0;
+  float kQuantizedTolerance = 2.0 * kQuantizedStep;
+  return kQuantizedTolerance;
+}
+
+TEST(QuantizedMulOpTest, WithBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedMulOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},  // always a scalar
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index d73c9937ce..e8b425a592 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -47,9 +47,7 @@ tensorflow::Env* env = tensorflow::Env::Default();
 // Key is a substring of the test name and value is a bug number.
 // TODO(ahentz): make sure we clean this list up frequently.
 std::map<string, string> kBrokenTests = {
-    // Add doesn't support broadcasting.
-    {R"(^\/adda.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
-    {R"(^\/mula.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
+    // Sub and Div don't support broadcasting.
     {R"(^\/diva.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
     {R"(^\/suba.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
 
-- 
GitLab


From 5b39f0c75e15f33834a9e68c0b7654944a743476 Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Tue, 30 Jan 2018 14:06:07 -0800
Subject: [PATCH 418/966] Remove unnecessary dependency from xla:types

PiperOrigin-RevId: 183888495
---
 tensorflow/compiler/xla/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index c22fd37129..34e733bc8d 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -88,7 +88,6 @@ cc_library(
     visibility = [":friends"],
     deps = [
         "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib",
         "//third_party/eigen3",
     ],
 )
-- 
GitLab


From 47e2392424cf0a571bc4b2b73167797877a9be3c Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Tue, 30 Jan 2018 14:12:54 -0800
Subject: [PATCH 419/966] Fix dependency problem

---
 tensorflow/contrib/tensorrt/BUILD | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 3a214d2e86..35b4a052d6 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -209,9 +209,10 @@ cc_library(
     ],
     linkstatic = 1,
     deps = [
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:lib_proto_parsing",
-        "//third_party/eigen3",
+        "//tensorflow/core:graph",
+        # "//tensorflow/core:core_cpu",
+        # "//tensorflow/core:lib_proto_parsing",
+        # "//third_party/eigen3",
         "@protobuf_archive//:protobuf_headers",
     ],
 )
-- 
GitLab


From 8301d4b3c6c05f837122736b06d1ae1f20849b0a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 14:07:26 -0800
Subject: [PATCH 420/966] Add `tf.contrib.distributions.Kumaraswamy`.

PiperOrigin-RevId: 183888726
---
 .../python/kernel_tests/kumaraswamy_test.py   | 388 ++++++++++++++++++
 .../distributions/python/ops/kumaraswamy.py   | 258 ++++++++++++
 2 files changed, 646 insertions(+)
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
 create mode 100644 tensorflow/contrib/distributions/python/ops/kumaraswamy.py

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
new file mode 100644
index 0000000000..ea3c86b5c0
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
@@ -0,0 +1,388 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import kumaraswamy as kumaraswamy_lib
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
+stats = try_import("scipy.stats")
+
+
+def _kumaraswamy_mode(a, b):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  return ((a - 1) / (a * b - 1))**(1 / a)
+
+
+def _kumaraswamy_moment(a, b, n):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  return b * special.beta(1.0 + n / a, b)
+
+
+def _harmonic_number(b):
+  b = np.asarray(b)
+  return special.psi(b + 1) - special.psi(1)
+
+
+def _kumaraswamy_cdf(a, b, x):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  x = np.asarray(x)
+  return 1 - (1 - x**a)**b
+
+
+def _kumaraswamy_pdf(a, b, x):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  x = np.asarray(x)
+  return a * b * x ** (a - 1) * (1 - x ** a) ** (b - 1)
+
+
+class KumaraswamyTest(test.TestCase):
+
+  def testSimpleShapes(self):
+    with self.test_session():
+      a = np.random.rand(3)
+      b = np.random.rand(3)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertAllEqual([], dist.event_shape_tensor().eval())
+      self.assertAllEqual([3], dist.batch_shape_tensor().eval())
+      self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
+      self.assertEqual(tensor_shape.TensorShape([3]), dist.batch_shape)
+
+  def testComplexShapes(self):
+    with self.test_session():
+      a = np.random.rand(3, 2, 2)
+      b = np.random.rand(3, 2, 2)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertAllEqual([], dist.event_shape_tensor().eval())
+      self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval())
+      self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
+      self.assertEqual(tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape)
+
+  def testComplexShapesBroadcast(self):
+    with self.test_session():
+      a = np.random.rand(3, 2, 2)
+      b = np.random.rand(2, 2)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertAllEqual([], dist.event_shape_tensor().eval())
+      self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval())
+      self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
+      self.assertEqual(tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape)
+
+  def testAProperty(self):
+    a = [[1., 2, 3]]
+    b = [[2., 4, 3]]
+    with self.test_session():
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual([1, 3], dist.concentration1.get_shape())
+      self.assertAllClose(a, dist.concentration1.eval())
+
+  def testBProperty(self):
+    a = [[1., 2, 3]]
+    b = [[2., 4, 3]]
+    with self.test_session():
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual([1, 3], dist.concentration0.get_shape())
+      self.assertAllClose(b, dist.concentration0.eval())
+
+  def testPdfXProper(self):
+    a = [[1., 2, 3]]
+    b = [[2., 4, 3]]
+    with self.test_session():
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, validate_args=True)
+      dist.prob([.1, .3, .6]).eval()
+      dist.prob([.2, .3, .5]).eval()
+      # Either condition can trigger.
+      with self.assertRaisesOpError("sample must be positive"):
+        dist.prob([-1., 0.1, 0.5]).eval()
+      with self.assertRaisesOpError("sample must be positive"):
+        dist.prob([0., 0.1, 0.5]).eval()
+      with self.assertRaisesOpError("sample must be no larger than `1`"):
+        dist.prob([.1, .2, 1.2]).eval()
+
+  def testPdfTwoBatches(self):
+    with self.test_session():
+      a = [1., 2]
+      b = [1., 2]
+      x = [.5, .5]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2,), pdf.get_shape())
+
+  def testPdfTwoBatchesNontrivialX(self):
+    with self.test_session():
+      a = [1., 2]
+      b = [1., 2]
+      x = [.3, .7]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2,), pdf.get_shape())
+
+  def testPdfUniformZeroBatch(self):
+    with self.test_session():
+      # This is equivalent to a uniform distribution
+      a = 1.
+      b = 1.
+      x = np.array([.1, .2, .3, .5, .8], dtype=np.float32)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((5,), pdf.get_shape())
+
+  def testPdfAStretchedInBroadcastWhenSameRank(self):
+    with self.test_session():
+      a = [[1., 2]]
+      b = [[1., 2]]
+      x = [[.5, .5], [.3, .7]]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testPdfAStretchedInBroadcastWhenLowerRank(self):
+    with self.test_session():
+      a = [1., 2]
+      b = [1., 2]
+      x = [[.5, .5], [.2, .8]]
+      pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testPdfXStretchedInBroadcastWhenSameRank(self):
+    with self.test_session():
+      a = [[1., 2], [2., 3]]
+      b = [[1., 2], [2., 3]]
+      x = [[.5, .5]]
+      pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testPdfXStretchedInBroadcastWhenLowerRank(self):
+    with self.test_session():
+      a = [[1., 2], [2., 3]]
+      b = [[1., 2], [2., 3]]
+      x = [.5, .5]
+      pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testKumaraswamyMean(self):
+    with session.Session():
+      a = [1., 2, 3]
+      b = [2., 4, 1.2]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.mean().get_shape(), (3,))
+      if not stats:
+        return
+      expected_mean = _kumaraswamy_moment(a, b, 1)
+      self.assertAllClose(expected_mean, dist.mean().eval())
+
+  def testKumaraswamyVariance(self):
+    with session.Session():
+      a = [1., 2, 3]
+      b = [2., 4, 1.2]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.variance().get_shape(), (3,))
+      if not stats:
+        return
+      expected_variance = _kumaraswamy_moment(a, b, 2) - _kumaraswamy_moment(
+          a, b, 1)**2
+      self.assertAllClose(expected_variance, dist.variance().eval())
+
+  def testKumaraswamyMode(self):
+    with session.Session():
+      a = np.array([1.1, 2, 3])
+      b = np.array([2., 4, 1.2])
+      expected_mode = _kumaraswamy_mode(a, b)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.mode().get_shape(), (3,))
+      self.assertAllClose(expected_mode, dist.mode().eval())
+
+  def testKumaraswamyModeInvalid(self):
+    with session.Session():
+      a = np.array([1., 2, 3])
+      b = np.array([2., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False)
+      with self.assertRaisesOpError("Condition x < y.*"):
+        dist.mode().eval()
+
+      a = np.array([2., 2, 3])
+      b = np.array([1., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False)
+      with self.assertRaisesOpError("Condition x < y.*"):
+        dist.mode().eval()
+
+  def testKumaraswamyModeEnableAllowNanStats(self):
+    with session.Session():
+      a = np.array([1., 2, 3])
+      b = np.array([2., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=True)
+
+      expected_mode = _kumaraswamy_mode(a, b)
+      expected_mode[0] = np.nan
+      self.assertEqual((3,), dist.mode().get_shape())
+      self.assertAllClose(expected_mode, dist.mode().eval())
+
+      a = np.array([2., 2, 3])
+      b = np.array([1., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=True)
+
+      expected_mode = _kumaraswamy_mode(a, b)
+      expected_mode[0] = np.nan
+      self.assertEqual((3,), dist.mode().get_shape())
+      self.assertAllClose(expected_mode, dist.mode().eval())
+
+  def testKumaraswamyEntropy(self):
+    with session.Session():
+      a = np.array([1., 2, 3])
+      b = np.array([2., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.entropy().get_shape(), (3,))
+      if not stats:
+        return
+      expected_entropy = (1 - 1. / a) + (
+          1 - 1. / b) * _harmonic_number(b) + np.log(a * b)
+      self.assertAllClose(expected_entropy, dist.entropy().eval())
+
+  def testKumaraswamySample(self):
+    with self.test_session():
+      a = 1.
+      b = 2.
+      kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b)
+      n = constant_op.constant(100000)
+      samples = kumaraswamy.sample(n)
+      sample_values = samples.eval()
+      self.assertEqual(sample_values.shape, (100000,))
+      self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
+      self.assertLess(
+          stats.kstest(
+              # Kumaraswamy is a univariate distribution.
+              sample_values,
+              lambda x: _kumaraswamy_cdf(1., 2., x))[0],
+          0.01)
+      # The standard error of the sample mean is 1 / (sqrt(18 * n))
+      expected_mean = _kumaraswamy_moment(a, b, 1)
+      self.assertAllClose(sample_values.mean(axis=0), expected_mean, atol=1e-2)
+      expected_variance = _kumaraswamy_moment(a, b, 2) - _kumaraswamy_moment(
+          a, b, 1)**2
+      self.assertAllClose(
+          np.cov(sample_values, rowvar=0), expected_variance, atol=1e-1)
+
+  # Test that sampling with the same seed twice gives the same results.
+  def testKumaraswamySampleMultipleTimes(self):
+    with self.test_session():
+      a_val = 1.
+      b_val = 2.
+      n_val = 100
+
+      random_seed.set_random_seed(654321)
+      kumaraswamy1 = kumaraswamy_lib.Kumaraswamy(
+          concentration1=a_val, concentration0=b_val, name="kumaraswamy1")
+      samples1 = kumaraswamy1.sample(n_val, seed=123456).eval()
+
+      random_seed.set_random_seed(654321)
+      kumaraswamy2 = kumaraswamy_lib.Kumaraswamy(
+          concentration1=a_val, concentration0=b_val, name="kumaraswamy2")
+      samples2 = kumaraswamy2.sample(n_val, seed=123456).eval()
+
+      self.assertAllClose(samples1, samples2)
+
+  def testKumaraswamySampleMultidimensional(self):
+    with self.test_session():
+      a = np.random.rand(3, 2, 2).astype(np.float32)
+      b = np.random.rand(3, 2, 2).astype(np.float32)
+      kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b)
+      n = constant_op.constant(100000)
+      samples = kumaraswamy.sample(n)
+      sample_values = samples.eval()
+      self.assertEqual(sample_values.shape, (100000, 3, 2, 2))
+      self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
+      self.assertAllClose(
+          sample_values[:, 1, :].mean(axis=0),
+          _kumaraswamy_moment(a, b, 1)[1, :],
+          atol=1e-1)
+
+  def testKumaraswamyCdf(self):
+    with self.test_session():
+      shape = (30, 40, 50)
+      for dt in (np.float32, np.float64):
+        a = 10. * np.random.random(shape).astype(dt)
+        b = 10. * np.random.random(shape).astype(dt)
+        x = np.random.random(shape).astype(dt)
+        actual = kumaraswamy_lib.Kumaraswamy(a, b).cdf(x).eval()
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
+        if not stats:
+          return
+        self.assertAllClose(
+            _kumaraswamy_cdf(a, b, x), actual, rtol=1e-4, atol=0)
+
+  def testKumaraswamyLogCdf(self):
+    with self.test_session():
+      shape = (30, 40, 50)
+      for dt in (np.float32, np.float64):
+        a = 10. * np.random.random(shape).astype(dt)
+        b = 10. * np.random.random(shape).astype(dt)
+        x = np.random.random(shape).astype(dt)
+        actual = math_ops.exp(kumaraswamy_lib.Kumaraswamy(a,
+                                                          b).log_cdf(x)).eval()
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
+        if not stats:
+          return
+        self.assertAllClose(
+            _kumaraswamy_cdf(a, b, x), actual, rtol=1e-4, atol=0)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
new file mode 100644
index 0000000000..74d5d8773c
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
@@ -0,0 +1,258 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Kumaraswamy distribution class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import special_math_ops
+from tensorflow.python.ops.distributions import beta
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = [
+    "Kumaraswamy",
+]
+
+_kumaraswamy_sample_note = """Note: `x` must have dtype `self.dtype` and be in
+`[0, 1].` It must have a shape compatible with `self.batch_shape()`."""
+
+
+def _harmonic_number(x):
+  """Compute the harmonic number from its analytic continuation.
+
+  Derivation from [1] and Euler's constant [2].
+  [1] -
+  https://en.wikipedia.org/wiki/Digamma_function#Relation_to_harmonic_numbers
+  [2] - https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant
+
+
+  Args:
+    x: input float.
+
+  Returns:
+    z: The analytic continuation of the harmonic number for the input.
+
+  """
+  one = array_ops.ones([], dtype=x.dtype)
+  return math_ops.digamma(x + one) - math_ops.digamma(one)
+
+
+@tf_export("distributions.Kumaraswamy")
+class Kumaraswamy(beta.Beta):
+  """Kumaraswamy distribution.
+
+  The Kumaraswamy distribution is defined over the `(0, 1)` interval using
+  parameters
+  `concentration1` (aka "alpha") and `concentration0` (aka "beta").  It has a
+  shape similar to the Beta distribution, but is reparameterizeable.
+
+  #### Mathematical Details
+
+  The probability density function (pdf) is,
+
+  ```none
+  pdf(x; alpha, beta) = alpha * beta * x**(alpha - 1) * (1 - x**alpha)**(beta -
+  1)
+  ```
+
+  where:
+
+  * `concentration1 = alpha`,
+  * `concentration0 = beta`,
+
+  Distribution parameters are automatically broadcast in all functions; see
+  examples for details.
+
+  #### Examples
+
+  ```python
+  # Create a batch of three Kumaraswamy distributions.
+  alpha = [1, 2, 3]
+  beta = [1, 2, 3]
+  dist = Kumaraswamy(alpha, beta)
+
+  dist.sample([4, 5])  # Shape [4, 5, 3]
+
+  # `x` has three batch entries, each with two samples.
+  x = [[.1, .4, .5],
+       [.2, .3, .5]]
+  # Calculate the probability of each pair of samples under the corresponding
+  # distribution in `dist`.
+  dist.prob(x)         # Shape [2, 3]
+  ```
+
+  ```python
+  # Create batch_shape=[2, 3] via parameter broadcast:
+  alpha = [[1.], [2]]      # Shape [2, 1]
+  beta = [3., 4, 5]        # Shape [3]
+  dist = Kumaraswamy(alpha, beta)
+
+  # alpha broadcast as: [[1., 1, 1,],
+  #                      [2, 2, 2]]
+  # beta broadcast as:  [[3., 4, 5],
+  #                      [3, 4, 5]]
+  # batch_Shape [2, 3]
+  dist.sample([4, 5])  # Shape [4, 5, 2, 3]
+
+  x = [.2, .3, .5]
+  # x will be broadcast as [[.2, .3, .5],
+  #                         [.2, .3, .5]],
+  # thus matching batch_shape [2, 3].
+  dist.prob(x)         # Shape [2, 3]
+  ```
+
+  """
+
+  def __init__(self,
+               concentration1=None,
+               concentration0=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="Kumaraswamy"):
+    """Initialize a batch of Kumaraswamy distributions.
+
+    Args:
+      concentration1: Positive floating-point `Tensor` indicating mean
+        number of successes; aka "alpha". Implies `self.dtype` and
+        `self.batch_shape`, i.e.,
+        `concentration1.shape = [N1, N2, ..., Nm] = self.batch_shape`.
+      concentration0: Positive floating-point `Tensor` indicating mean
+        number of failures; aka "beta". Otherwise has same semantics as
+        `concentration1`.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
+        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
+        result is undefined. When `False`, an exception is raised if one or
+        more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+    """
+    super(Kumaraswamy, self).__init__(
+        concentration1=concentration1,
+        concentration0=concentration0,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        name=name)
+    self._reparameterization_type = distribution.FULLY_REPARAMETERIZED
+
+  def _sample_n(self, n, seed=None):
+    expanded_concentration1 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration1
+    expanded_concentration0 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration0
+    shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
+    uniform_sample = random_ops.random_uniform(
+        shape=shape, minval=0.0, maxval=1.0, dtype=self.dtype, seed=seed)
+
+    kumaraswamy_sample = (1 - uniform_sample**(1. / expanded_concentration0))**(
+        1. / expanded_concentration1)
+    return kumaraswamy_sample
+
+  @distribution_util.AppendDocstring(_kumaraswamy_sample_note)
+  def _log_cdf(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return math_ops.log1p(-(1 - x**a)**b)
+
+  @distribution_util.AppendDocstring(_kumaraswamy_sample_note)
+  def _cdf(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return 1 - (1 - x**a)**b
+
+  def _survival_function(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return (1 - x**a)**b
+
+  def _log_survival_function(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return b * math_ops.log1p(-x**a)
+
+  def _log_unnormalized_prob(self, x):
+    x = self._maybe_assert_valid_sample(x)
+    a = self.concentration1
+    b = self.concentration0
+    return (a - 1) * math_ops.log(x) + (b - 1) * math_ops.log1p(-x**a)
+
+  def _log_normalization(self):
+    a = self.concentration1
+    b = self.concentration0
+    return -(math_ops.log(a) + math_ops.log(b))
+
+  def _entropy(self):
+    a = self.concentration1
+    b = self.concentration0
+    return (1 - 1. / a) + (
+        1 - 1. / b) * _harmonic_number(b) + math_ops.log(a) + math_ops.log(b)
+
+  def _moment(self, n):
+    """Compute the n'th (uncentered) moment."""
+    expanded_concentration1 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration1
+    expanded_concentration0 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration0
+    beta_arg0 = 1 + n / expanded_concentration1
+    beta_arg = array_ops.stack([beta_arg0, expanded_concentration0], -1)
+    log_moment = math_ops.log(expanded_concentration0) + special_math_ops.lbeta(
+        beta_arg)
+    return math_ops.exp(log_moment)
+
+  def _mean(self):
+    return self._moment(1)
+
+  def _variance(self):
+    # TODO(b/72696533): Investigate a more numerically stable version.
+    return self._moment(2) - math_ops.square(self._moment(1))
+
+  @distribution_util.AppendDocstring(
+      """Note: The mode is undefined when `concentration1 <= 1` or
+      `concentration0 <= 1`. If `self.allow_nan_stats` is `True`, `NaN`
+      is used for undefined modes. If `self.allow_nan_stats` is `False` an
+      exception is raised when one or more modes are undefined.""")
+  def _mode(self):
+    a = self.concentration1
+    b = self.concentration0
+    mode = ((a - 1) / (a * b - 1))**(1. / a)
+    if self.allow_nan_stats:
+      nan = array_ops.fill(
+          self.batch_shape_tensor(),
+          np.array(np.nan, dtype=self.dtype.as_numpy_dtype),
+          name="nan")
+      is_defined = (self.concentration1 > 1.) & (self.concentration0 > 1.)
+      return array_ops.where(is_defined, mode, nan)
+    return control_flow_ops.with_dependencies([
+        check_ops.assert_less(
+            array_ops.ones([], dtype=self.dtype),
+            self.concentration1,
+            message="Mode undefined for concentration1 <= 1."),
+        check_ops.assert_less(
+            array_ops.ones([], dtype=self.dtype),
+            self.concentration0,
+            message="Mode undefined for concentration0 <= 1.")
+    ], mode)
-- 
GitLab


From 23b9ffc522353baa1c06245b96e4b5ecc955d359 Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Tue, 30 Jan 2018 14:28:06 -0800
Subject: [PATCH 421/966] [XLA] Add a test for scalar kRoundNearestAfz

PiperOrigin-RevId: 183892483
---
 tensorflow/compiler/xla/tests/scalar_computations_test.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index debf2d2d31..43e4d891a1 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -852,5 +852,12 @@ XLA_TEST_F(ScalarComputationsTest, SqrtF320) {
   ComputeAndCompareR0<float>(&builder, 0.0f, {zero_data.get()}, error_spec_);
 }
 
+XLA_TEST_F(ScalarComputationsTest, RoundScalar) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Round(builder.ConstantR0<float>(1.4f));
+
+  ComputeAndCompareR0<float>(&builder, 1.0f, {}, error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 599eadc299ae680bfb569ace4278b2eb262ecc44 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Tue, 30 Jan 2018 14:42:53 -0800
Subject: [PATCH 422/966] Changes for more PR comments

---
 .../contrib/tensorrt/convert/convert_graph.cc |  2 +-
 .../contrib/tensorrt/convert/convert_graph.h  |  7 +++---
 .../contrib/tensorrt/convert/convert_nodes.cc | 23 ++++++-------------
 .../contrib/tensorrt/convert/convert_nodes.h  |  2 +-
 4 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 1507981ca8..e0f38c60ee 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -258,7 +258,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
       gdef, IsTensorRTCandidate, segment_options, &segments));
   if (segments.size() > 1) {
-    LOG(INFO) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
+    VLOG(INFO) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
   }
   std::unordered_map<std::string, tensorflow::Node*> node_map;
   TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index e0fa02ecd4..fc918a9ec2 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -28,13 +28,14 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
-// max_batch_size: maximum batch size which can be used for inferencefor;
+// max_batch_size: maximum batch size which can be used for inference for
 //                 optimization targets inference run with max batch size.
-// max_workspace_size: The upper bound of memory allowence for engine building.
+// max_workspace_size_bytes: The upper bound of memory allowence for
+//                 engine building.
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<std::string>& output_names, size_t max_batch_size,
-    size_t max_workspace_size, tensorflow::GraphDef* new_graph_def);
+    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def);
 
 }  // namespace convert
 }  // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index a42f559651..2653c1c636 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -1420,36 +1420,27 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     }
   }
   // topological order is needed to build TRT network
-  VLOG(-1) << "BUILDING 1";
 
   tensorflow::tensorrt::Logger trt_logger;
 
-  VLOG(-1) << "BUILDING 2";
-
   auto trt_builder = infer_object(nvinfer1::createInferBuilder(trt_logger));
   if (!trt_builder) {
     return tensorflow::errors::Internal(
         "failed to create TensorRT builder object");
   }
 
-  VLOG(-1) << "BUILDING 3";
-
   auto trt_network = infer_object(trt_builder->createNetwork());
   if (!trt_network) {
     return tensorflow::errors::Internal(
         "failed to create TensorRT network object");
   }
 
-  VLOG(-1) << "BUILDING 4";
-
   // Build the network
   Converter converter(trt_network.get());
 
-  VLOG(-1) << "BUILDING 5";
   std::vector<std::string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
   for (std::pair<int, int> const& input : input_inds) {
-    VLOG(-1) << "parsing input!!!!!";
     int node_id = input.first;
     int output_idx = input.second;
     tensorflow::Node* node = graph.FindNodeId(node_id);
@@ -1551,11 +1542,12 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   }
 
   VLOG(-1) << "finished output";
+  static int static_id = 0;
 
   // Build the engine
   trt_builder->setMaxBatchSize(max_batch_size);
   trt_builder->setMaxWorkspaceSize(max_workspace_size);
-  LOG(INFO) << "starting build engine";
+  LOG(INFO) << "starting build engine "<<static_id;
   // TODO(ben,jie): half2 and int8 mode support
   std::string engine_plan_string;
   {
@@ -1563,18 +1555,17 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         infer_object(trt_builder->buildCudaEngine(*converter.network()));
     LOG(INFO) << "built network";
     auto engine_plan = infer_object(trt_engine->serialize());
-    LOG(INFO) << "serialized engine";
+    VLOG(INFO) << "serialized engine";
     const char* engine_plan_data =
         static_cast<const char*>(engine_plan->data());
     engine_plan_string = std::move(
         std::string(engine_plan_data, engine_plan_data + engine_plan->size()));
   }
 
-  LOG(INFO) << "finished engine";
+  VLOG(INFO) << "finished engine";
 
   // Build the TRT op
   // TODO(sami,ben,jie): proper naming!
-  static int static_id = 0;
   tensorflow::NodeDefBuilder op_builder(
       "my_trt_op" + std::to_string(static_id++), "TRTEngineOp");
   std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
@@ -1590,7 +1581,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
       income_edges);
   op_builder.Input(input_list);
 
-  LOG(INFO) << "finished op preparation";
+  VLOG(INFO) << "finished op preparation";
 
   auto status = op_builder.Attr("serialized_engine", engine_plan_string)
                     .Attr("input_nodes", input_names)
@@ -1598,8 +1589,8 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
                     .Attr("OutT", output_dtypes)
                     .Finalize(trt_node);
 
-  LOG(INFO) << status.ToString();
-  LOG(INFO) << "finished op building";
+  VLOG(INFO) << status.ToString();
+  VLOG(INFO) << "finished op building";
 
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 69657e0cb9..2e7fd19566 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -38,7 +38,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         input_inds,  // {node_id, output_idx}
     const std::vector<std::pair<int, int>>&
         output_inds,  // {node_id, output_idx}
-    size_t max_batch_size, size_t max_workspace_size,
+    size_t max_batch_size, size_t max_workspace_size_bytes,
     const tensorflow::grappler::GraphProperties& graph_prop,
     tensorflow::NodeDef* trt_node);
 
-- 
GitLab


From 58f0be0179a3adf7a47db95a1be57af53c3ee601 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 14:47:11 -0800
Subject: [PATCH 423/966] Expose the decorator in the main API. Move the
 top-level implementation files into a submodule, to avoid problems around the
 remove_undocumented call.

PiperOrigin-RevId: 183896273
---
 tensorflow/BUILD                              |  1 +
 tensorflow/contrib/py2tf/BUILD                | 59 +----------------
 tensorflow/contrib/py2tf/__init__.py          |  9 +--
 tensorflow/contrib/py2tf/impl/BUILD           | 65 +++++++++++++++++++
 tensorflow/contrib/py2tf/{ => impl}/api.py    |  4 +-
 .../contrib/py2tf/{ => impl}/api_test.py      |  4 +-
 tensorflow/contrib/py2tf/{ => impl}/config.py |  3 +-
 .../contrib/py2tf/{ => impl}/conversion.py    |  4 +-
 .../py2tf/{ => impl}/conversion_test.py       |  2 +-
 tensorflow/contrib/py2tf/{ => impl}/naming.py |  0
 .../contrib/py2tf/{ => impl}/naming_test.py   |  2 +-
 tensorflow/tools/pip_package/BUILD            |  3 +-
 .../tools/pip_package/pip_smoke_test.py       |  1 -
 13 files changed, 84 insertions(+), 73 deletions(-)
 create mode 100644 tensorflow/contrib/py2tf/impl/BUILD
 rename tensorflow/contrib/py2tf/{ => impl}/api.py (98%)
 rename tensorflow/contrib/py2tf/{ => impl}/api_test.py (98%)
 rename tensorflow/contrib/py2tf/{ => impl}/config.py (89%)
 rename tensorflow/contrib/py2tf/{ => impl}/conversion.py (99%)
 rename tensorflow/contrib/py2tf/{ => impl}/conversion_test.py (97%)
 rename tensorflow/contrib/py2tf/{ => impl}/naming.py (100%)
 rename tensorflow/contrib/py2tf/{ => impl}/naming_test.py (98%)

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 3d2411a266..66a2ecd8b5 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -536,6 +536,7 @@ filegroup(
         "//tensorflow/contrib/predictor:all_files",
         "//tensorflow/contrib/py2tf:all_files",
         "//tensorflow/contrib/py2tf/converters:all_files",
+        "//tensorflow/contrib/py2tf/impl:all_files",
         "//tensorflow/contrib/py2tf/pyct:all_files",
         "//tensorflow/contrib/py2tf/pyct/static_analysis:all_files",
         "//tensorflow/contrib/quantize:all_files",
diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/py2tf/BUILD
index 3e846aefeb..cea3738499 100644
--- a/tensorflow/contrib/py2tf/BUILD
+++ b/tensorflow/contrib/py2tf/BUILD
@@ -18,69 +18,12 @@ py_library(
     name = "py2tf",
     srcs = [
         "__init__.py",
-        "api.py",
-        "config.py",
-        "conversion.py",
-        "naming.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/py2tf/converters",
-        "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
+        "//tensorflow/contrib/py2tf/impl",
         "@gast_archive//:gast",
         "@six_archive//:six",
     ],
 )
-
-# Separate target that allows access to internal symbols for testing.
-py_library(
-    name = "py2tf_internal",
-    srcs = [
-        "api.py",
-        "config.py",
-        "conversion.py",
-        "naming.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        "//tensorflow/contrib/py2tf/converters",
-        "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/contrib/py2tf/pyct/static_analysis",
-        "@gast_archive//:gast",
-        "@six_archive//:six",
-    ],
-)
-
-py_test(
-    name = "api_test",
-    srcs = ["api_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":py2tf_internal",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
-py_test(
-    name = "conversion_test",
-    srcs = ["conversion_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":py2tf_internal",
-        "//tensorflow/python:client_testlib",
-        "@gast_archive//:gast",
-    ],
-)
-
-py_test(
-    name = "naming_test",
-    srcs = ["naming_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":py2tf_internal",
-        "//tensorflow/python:client_testlib",
-    ],
-)
diff --git a/tensorflow/contrib/py2tf/__init__.py b/tensorflow/contrib/py2tf/__init__.py
index d187da99e0..878941b3a3 100644
--- a/tensorflow/contrib/py2tf/__init__.py
+++ b/tensorflow/contrib/py2tf/__init__.py
@@ -21,11 +21,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.api import to_code
-from tensorflow.contrib.py2tf.api import to_graph
+from tensorflow.contrib.py2tf.impl.api import convert
+from tensorflow.contrib.py2tf.impl.api import graph_ready
+from tensorflow.contrib.py2tf.impl.api import to_code
+from tensorflow.contrib.py2tf.impl.api import to_graph
 from tensorflow.python.util.all_util import remove_undocumented
 
-
-_allowed_symbols = ['to_graph', 'to_code']
+_allowed_symbols = ['to_graph', 'to_code', 'convert', 'graph_ready']
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/py2tf/impl/BUILD b/tensorflow/contrib/py2tf/impl/BUILD
new file mode 100644
index 0000000000..22f0c25cab
--- /dev/null
+++ b/tensorflow/contrib/py2tf/impl/BUILD
@@ -0,0 +1,65 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "impl",
+    srcs = [
+        "api.py",
+        "config.py",
+        "conversion.py",
+        "naming.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow/contrib/py2tf/converters",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/py2tf/pyct/static_analysis",
+        "@gast_archive//:gast",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "api_test",
+    srcs = ["api_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":impl",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "conversion_test",
+    srcs = ["conversion_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":impl",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
+
+py_test(
+    name = "naming_test",
+    srcs = ["naming_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":impl",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/api.py b/tensorflow/contrib/py2tf/impl/api.py
similarity index 98%
rename from tensorflow/contrib/py2tf/api.py
rename to tensorflow/contrib/py2tf/impl/api.py
index 1f250d5f57..8ff6618912 100644
--- a/tensorflow/contrib/py2tf/api.py
+++ b/tensorflow/contrib/py2tf/impl/api.py
@@ -23,8 +23,8 @@ from functools import wraps
 import gast
 import six
 
-from tensorflow.contrib.py2tf import config
-from tensorflow.contrib.py2tf import conversion
+from tensorflow.contrib.py2tf.impl import config
+from tensorflow.contrib.py2tf.impl import conversion
 from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.python.util import tf_inspect
diff --git a/tensorflow/contrib/py2tf/api_test.py b/tensorflow/contrib/py2tf/impl/api_test.py
similarity index 98%
rename from tensorflow/contrib/py2tf/api_test.py
rename to tensorflow/contrib/py2tf/impl/api_test.py
index 2384447708..dbd079a3ca 100644
--- a/tensorflow/contrib/py2tf/api_test.py
+++ b/tensorflow/contrib/py2tf/impl/api_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import api
-from tensorflow.contrib.py2tf import config
+from tensorflow.contrib.py2tf.impl import api
+from tensorflow.contrib.py2tf.impl import config
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import math_ops
diff --git a/tensorflow/contrib/py2tf/config.py b/tensorflow/contrib/py2tf/impl/config.py
similarity index 89%
rename from tensorflow/contrib/py2tf/config.py
rename to tensorflow/contrib/py2tf/impl/config.py
index 8c502a7a9e..0892241983 100644
--- a/tensorflow/contrib/py2tf/config.py
+++ b/tensorflow/contrib/py2tf/impl/config.py
@@ -32,7 +32,8 @@ DEFAULT_UNCOMPILED_MODULES = set((
 NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 
 # TODO(mdan): Also allow controlling the generated names (for testability).
+# TODO(mdan): Verify that these names are not hidden by generated code.
+# TODO(mdan): Make sure copybara renames the reference below.
 COMPILED_IMPORT_STATEMENTS = (
-    'from contextlib import contextmanager',
     'import tensorflow as tf',
 )
diff --git a/tensorflow/contrib/py2tf/conversion.py b/tensorflow/contrib/py2tf/impl/conversion.py
similarity index 99%
rename from tensorflow/contrib/py2tf/conversion.py
rename to tensorflow/contrib/py2tf/impl/conversion.py
index 67ca52d194..ed71ff5c06 100644
--- a/tensorflow/contrib/py2tf/conversion.py
+++ b/tensorflow/contrib/py2tf/impl/conversion.py
@@ -21,8 +21,6 @@ from __future__ import print_function
 import gast
 import six
 
-from tensorflow.contrib.py2tf import config
-from tensorflow.contrib.py2tf import naming
 from tensorflow.contrib.py2tf.converters import asserts
 from tensorflow.contrib.py2tf.converters import break_canonicalization
 from tensorflow.contrib.py2tf.converters import builtin_functions
@@ -34,6 +32,8 @@ from tensorflow.contrib.py2tf.converters import for_canonicalization
 from tensorflow.contrib.py2tf.converters import logical_expressions
 from tensorflow.contrib.py2tf.converters import print_functions
 from tensorflow.contrib.py2tf.converters import side_effect_guards
+from tensorflow.contrib.py2tf.impl import config
+from tensorflow.contrib.py2tf.impl import naming
 from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct.static_analysis import access
diff --git a/tensorflow/contrib/py2tf/conversion_test.py b/tensorflow/contrib/py2tf/impl/conversion_test.py
similarity index 97%
rename from tensorflow/contrib/py2tf/conversion_test.py
rename to tensorflow/contrib/py2tf/impl/conversion_test.py
index 26f915f4f4..3888958f19 100644
--- a/tensorflow/contrib/py2tf/conversion_test.py
+++ b/tensorflow/contrib/py2tf/impl/conversion_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import gast
 
-from tensorflow.contrib.py2tf import conversion
+from tensorflow.contrib.py2tf.impl import conversion
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/py2tf/naming.py b/tensorflow/contrib/py2tf/impl/naming.py
similarity index 100%
rename from tensorflow/contrib/py2tf/naming.py
rename to tensorflow/contrib/py2tf/impl/naming.py
diff --git a/tensorflow/contrib/py2tf/naming_test.py b/tensorflow/contrib/py2tf/impl/naming_test.py
similarity index 98%
rename from tensorflow/contrib/py2tf/naming_test.py
rename to tensorflow/contrib/py2tf/impl/naming_test.py
index 5cf0a3da2c..beb4e54937 100644
--- a/tensorflow/contrib/py2tf/naming_test.py
+++ b/tensorflow/contrib/py2tf/impl/naming_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import naming
+from tensorflow.contrib.py2tf.impl import naming
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 598080ed27..e4fa6694d8 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -151,9 +151,10 @@ sh_binary(
             "//tensorflow/contrib/ndlstm:ndlstm",
             "//tensorflow/contrib/nn:nn_py",
             "//tensorflow/contrib/predictor:predictor_pip",
-            "//tensorflow/contrib/py2tf:py2tf_internal",
+            "//tensorflow/contrib/py2tf:py2tf",
             "//tensorflow/contrib/py2tf/converters:converters",
             "//tensorflow/contrib/py2tf/converters:test_lib",
+            "//tensorflow/contrib/py2tf/impl:impl",
             "//tensorflow/contrib/py2tf/pyct:pyct",
             "//tensorflow/contrib/py2tf/pyct/static_analysis:static_analysis",
             "//tensorflow/contrib/receptive_field:receptive_field_pip",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 38a9007387..73d759eb13 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -65,7 +65,6 @@ BLACKLIST = [
     "//tensorflow/contrib/framework:checkpoint_ops_testdata",
     "//tensorflow/contrib/bayesflow:reinforce_simple_example",
     "//tensorflow/contrib/bayesflow:examples/reinforce_simple/reinforce_simple_example.py",  # pylint:disable=line-too-long
-    "//tensorflow/contrib/py2tf:py2tf_internal",
     "//tensorflow/contrib/timeseries/examples:predict",
     "//tensorflow/contrib/timeseries/examples:multivariate",
     "//tensorflow/contrib/timeseries/examples:known_anomaly",
-- 
GitLab


From beacffedd9474e0e05b73af60d75af61a64c0aa7 Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Tue, 30 Jan 2018 15:00:38 -0800
Subject: [PATCH 424/966] [COMMENT]   trt_engine_op compute     comment on the
 scope of pointer arrays for i/o binding    
 tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc

---
 tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 6e4fbe20e7..983c67677f 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -123,6 +123,7 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
                                                 ->CudaStreamMemberHack()));
 
   // execution handled by TF since we are getting stream from TF.
+  // it is safe for CPU pointer array (buffers) to go out of scope after enqueue
   trt_execution_context_ptr_->enqueue(num_batch, &buffers[0], *stream, nullptr);
 }
 
-- 
GitLab


From 0181aa321c8656cb574506476ca5f749061c0e87 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Tue, 30 Jan 2018 15:00:10 -0800
Subject: [PATCH 425/966] Assign*VariableOp: Documentation fix.

These ops have no outputs.
The way to ensure ordering is by adding dependencies via control inputs.

Helps with #16464

PiperOrigin-RevId: 183898409
---
 .../api_def/base_api/api_def_AssignAddVariableOp.pbtxt     | 7 ++-----
 .../api_def/base_api/api_def_AssignSubVariableOp.pbtxt     | 7 ++-----
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt
index 5d21d7bab6..ac05b54eea 100644
--- a/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt
@@ -20,10 +20,7 @@ END
   }
   summary: "Adds a value to the current value of a variable."
   description: <<END
-Any ReadVariableOp which depends directly or indirectly on this assign is
-guaranteed to see the incremented value or a subsequent newer one.
-
-Outputs the incremented value, which can be used to totally order the
-increments to this variable.
+Any ReadVariableOp with a control dependency on this op is guaranteed to
+see the incremented value or a subsequent newer one.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt
index 102201c4cb..9dd28f8711 100644
--- a/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt
@@ -20,10 +20,7 @@ END
   }
   summary: "Subtracts a value from the current value of a variable."
   description: <<END
-Any ReadVariableOp which depends directly or indirectly on this assign is
-guaranteed to see the incremented value or a subsequent newer one.
-
-Outputs the incremented value, which can be used to totally order the
-increments to this variable.
+Any ReadVariableOp with a control dependency on this op is guaranteed to
+see the decremented value or a subsequent newer one.
 END
 }
-- 
GitLab


From 1209fc3aaa629e135823f11da0a3316e8a7e8e78 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 15:04:14 -0800
Subject: [PATCH 426/966] Optimize memory allocation of TFLite StridedSlice Op
 for constant tensors.

PiperOrigin-RevId: 183899156
---
 .../contrib/lite/kernels/strided_slice.cc     | 183 +++++++++---------
 .../contrib/lite/testing/generate_examples.py |  72 ++++---
 .../propagate_fixed_sizes.cc                  |   3 +-
 3 files changed, 133 insertions(+), 125 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
index c510ee3b9f..c4ffdf79d3 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -57,63 +57,6 @@ struct StridedSliceContext {
   int dims;
 };
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 4);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
-  StridedSliceContext op_context(context, node);
-
-  // Ensure validity of input tensor and its dimension
-  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.begin), 1);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.end), 1);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.strides), 1);
-  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
-  // Only INT32 begin/end/strides are supported
-  // TODO(soroosh) add support for INT64
-  TF_LITE_ENSURE_EQ(context, op_context.begin->type, kTfLiteInt32);
-  TF_LITE_ENSURE_EQ(context, op_context.end->type, kTfLiteInt32);
-  TF_LITE_ENSURE_EQ(context, op_context.strides->type, kTfLiteInt32);
-  TF_LITE_ENSURE_MSG(context, op_context.dims <= 4,
-                     "StridedSlice op only supports 1D-4D input arrays.");
-
-  // TODO(soroosh): add the following missing functionalities
-  TF_LITE_ENSURE_MSG(context, op_context.params->ellipsis_mask == 0,
-                     "ellipsis_mask is not implemented yet.");
-  TF_LITE_ENSURE_MSG(context, op_context.params->new_axis_mask == 0,
-                     "new_axis_mask is not implemented yet.");
-
-  // TODO(soroosh): optimize for constant tensors to do allocation in Prepare
-  op_context.output->allocation_type = kTfLiteDynamic;
-  return kTfLiteOk;
-}  // namespace strided_slice
-
-// TODO(soroosh): consolidate with BytesRequired in interpreter.h
-TfLiteStatus BytesRequired(TfLiteContext* context, TfLiteType type,
-                           const int* dims, int dims_size, size_t* bytes) {
-  // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
-  // MultiplyWithoutOverflow.
-  TF_LITE_ENSURE(context, bytes != nullptr);
-  size_t count = 1;
-  for (int k = 0; k < dims_size; k++) count *= dims[k];
-  switch (type) {
-    case kTfLiteFloat32:
-      *bytes = sizeof(float) * count;
-      break;
-    case kTfLiteInt32:
-      *bytes = sizeof(int32_t) * count;
-      break;
-    case kTfLiteUInt8:
-      *bytes = sizeof(uint8_t) * count;
-      break;
-    case kTfLiteInt64:
-      *bytes = sizeof(int64_t) * count;
-      break;
-    default:
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
 // Reverse order of bits in the mask to match the expected order in kernel
 inline int ReverseMaskBits(int mask, int num_dimensions) {
   int out = 0;
@@ -144,43 +87,44 @@ inline int32_t ClampedIndex(int32_t index, int dim, bool pos_stride) {
                           std::min(std::max(index, -dim), dim - 1), dim));
 }
 
-template <KernelType kernel_type>
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  StridedSliceContext op_context(context, node);
+inline int32_t GetBeginValueAtIndex(StridedSliceContext* op_context, int idx) {
+  const int dim = op_context->input->dims->data[idx];
+  const bool pos_stride = GetTensorData<int32_t>(op_context->strides)[idx] > 0;
+  return op_context->params->begin_mask & (1 << idx)
+             ? pos_stride ? 0 : dim - 1
+             : ClampedIndex(GetTensorData<int32_t>(op_context->begin)[idx], dim,
+                            pos_stride);
+}
 
-  std::vector<int> starts;
-  std::vector<int> stops;
-  std::vector<int> strides;
+inline int32_t GetEndValueAtIndex(StridedSliceContext* op_context, int idx) {
+  const int dim = op_context->input->dims->data[idx];
+  const bool pos_stride = GetTensorData<int32_t>(op_context->strides)[idx] > 0;
+  return op_context->params->end_mask & (1 << idx)
+             ? pos_stride ? dim : -1
+             : ClampedIndex(GetTensorData<int32_t>(op_context->end)[idx], dim,
+                            pos_stride);
+}
+
+// Processes the indexing tensors (begin, end and strides) to resize the
+// output tensor. This function is callable from both Prepare() and Eval() as
+// long as the caller ensures the indexing tensors are present.
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                StridedSliceContext* op_context) {
   std::vector<int> output_shape_vector;
 
-  for (int idx = op_context.dims - 1; idx >= 0; --idx) {
-    int dim = op_context.input->dims->data[idx];
-    int32_t stride = GetTensorData<int32_t>(op_context.strides)[idx];
+  for (int idx = op_context->dims - 1; idx >= 0; --idx) {
+    int32_t stride = GetTensorData<int32_t>(op_context->strides)[idx];
     TF_LITE_ENSURE_MSG(context, stride != 0, "stride value has to be non-zero");
-    bool pos_stride = stride > 0;
-
-    int32_t begin =
-        op_context.params->begin_mask & (1 << idx)
-            ? pos_stride ? 0 : dim - 1
-            : ClampedIndex(GetTensorData<int32_t>(op_context.begin)[idx], dim,
-                           pos_stride);
-    int32_t end =
-        op_context.params->end_mask & (1 << idx)
-            ? pos_stride ? dim : -1
-            : ClampedIndex(GetTensorData<int32_t>(op_context.end)[idx], dim,
-                           pos_stride);
+
+    int32_t begin = GetBeginValueAtIndex(op_context, idx);
+    int32_t end = GetEndValueAtIndex(op_context, idx);
 
     // This is valid for both positive and negative strides
     int32_t dim_shape = ceil((end - begin) / static_cast<float>(stride));
     dim_shape = dim_shape < 0 ? 0 : dim_shape;
-
-    if (!(op_context.params->shrink_axis_mask & (1 << idx))) {
+    if (!(op_context->params->shrink_axis_mask & (1 << idx))) {
       output_shape_vector.push_back(dim_shape);
     }
-
-    starts.emplace_back(begin);
-    stops.emplace_back(end);
-    strides.emplace_back(stride);
   }
 
   TfLiteIntArray* output_shape =
@@ -189,22 +133,73 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   std::reverse_copy(output_shape_vector.begin(), output_shape_vector.end(),
                     output_shape->data);
 
+  TF_LITE_ENSURE_STATUS(
+      context->ResizeTensor(context, op_context->output, output_shape));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 4);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  StridedSliceContext op_context(context, node);
+
+  // Ensure validity of input tensor and its dimension
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.begin), 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.end), 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.strides), 1);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+  // Only INT32 begin/end/strides are supported
+  // TODO(soroosh) add support for INT64
+  TF_LITE_ENSURE_EQ(context, op_context.begin->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, op_context.end->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, op_context.strides->type, kTfLiteInt32);
+  TF_LITE_ENSURE_MSG(context, op_context.dims <= 4,
+                     "StridedSlice op only supports 1D-4D input arrays.");
+
+  // TODO(soroosh): add the following missing functionalities
+  TF_LITE_ENSURE_MSG(context, op_context.params->ellipsis_mask == 0,
+                     "ellipsis_mask is not implemented yet.");
+  TF_LITE_ENSURE_MSG(context, op_context.params->new_axis_mask == 0,
+                     "new_axis_mask is not implemented yet.");
+
+  // Postpone allocation of output if any of the indexing tensors is not
+  // constant
+  if (!(IsConstantTensor(op_context.begin) &&
+        IsConstantTensor(op_context.end) &&
+        IsConstantTensor(op_context.strides))) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  StridedSliceContext op_context(context, node);
+
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
+  }
+
+  std::vector<int32_t> starts;
+  std::vector<int32_t> stops;
+  std::vector<int32_t> strides;
+
+  for (int idx = op_context.dims - 1; idx >= 0; --idx) {
+    starts.emplace_back(GetBeginValueAtIndex(&op_context, idx));
+    stops.emplace_back(GetEndValueAtIndex(&op_context, idx));
+    strides.emplace_back(GetTensorData<int32_t>(op_context.strides)[idx]);
+  }
+
   for (int i = op_context.dims; i < kMaxDim; i++) {
     starts.emplace_back(0);
     stops.emplace_back(1);
     strides.emplace_back(1);
   }
 
-  TF_LITE_ENSURE_STATUS(
-      context->ResizeTensor(context, op_context.output, output_shape));
-
-  size_t required_bytes;
-  TF_LITE_ENSURE_OK(
-      context,
-      BytesRequired(context, op_context.output->type, output_shape->data,
-                    output_shape->size, &required_bytes));
-  TfLiteTensorRealloc(required_bytes, op_context.output);
-
   op_context.params->begin_mask =
       ReverseMaskBits(op_context.params->begin_mask, op_context.dims);
   op_context.params->end_mask =
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index e7606eecc4..b2227a7c98 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1560,10 +1560,11 @@ def make_strided_slice_tests(zip_path):
           "input_shape": [[12, 2, 2, 5]],
           "begin": [[0, 0, 0, 0], [1, 0, 1, 0]],
           "end": [[8, 2, 2, 3], [12, 2, 2, 5]],
-          "strides": [None, [1, 1, 1, 1], [2, 1, 3, 1]],
-          "begin_mask": [None, 1, 2, 8],
-          "end_mask": [None, 1, 2, 8],
-          "shrink_axis_mask": [None, 1, 2, 4, 8, 11, 15, -1],
+          "strides": [None, [2, 1, 3, 1]],
+          "begin_mask": [None, 1, 8],
+          "end_mask": [None, 1, 8],
+          "shrink_axis_mask": [None, 1, 8, 11, 15, -1],
+          "constant_indices": [False, True],
       },
       # 2-D
       {
@@ -1572,10 +1573,11 @@ def make_strided_slice_tests(zip_path):
           "input_shape": [[2, 3]],
           "begin": [[0, 0], [1, 0]],
           "end": [[2, 3], [2, 2]],
-          "strides": [None, [1, 1], [2, 2]],
+          "strides": [None, [2, 2]],
           "begin_mask": [None, 1, 2],
           "end_mask": [None, 1, 2],
           "shrink_axis_mask": [None, 1, 2, 3, -1],
+          "constant_indices": [False, True],
       },
       # Negative strides
       {
@@ -1588,6 +1590,7 @@ def make_strided_slice_tests(zip_path):
           "begin_mask": [None, 1, 2],
           "end_mask": [None, 1, 2],
           "shrink_axis_mask": [None, 1, 2, 3, -1],
+          "constant_indices": [False],
       },
   ]
 
@@ -1597,23 +1600,29 @@ def make_strided_slice_tests(zip_path):
         dtype=parameters["dtype"],
         name="input",
         shape=parameters["input_shape"])
-    begin = tf.placeholder(
-        dtype=parameters["index_type"],
-        name="begin",
-        shape=[len(parameters["input_shape"])])
-    end = tf.placeholder(
-        dtype=parameters["index_type"],
-        name="end",
-        shape=[len(parameters["input_shape"])])
-    strides = (
-        tf.placeholder(
-            dtype=parameters["index_type"],
-            name="strides",
-            shape=[len(parameters["input_shape"])])
-        if parameters["strides"] is not None else None)
-    tensors = [input_tensor, begin, end]
-    if strides is not None:
-      tensors.append(strides)
+    if parameters["constant_indices"]:
+      begin = parameters["begin"]
+      end = parameters["end"]
+      strides = parameters["strides"]
+      tensors = [input_tensor]
+    else:
+      begin = tf.placeholder(
+          dtype=parameters["index_type"],
+          name="begin",
+          shape=[len(parameters["input_shape"])])
+      end = tf.placeholder(
+          dtype=parameters["index_type"],
+          name="end",
+          shape=[len(parameters["input_shape"])])
+      strides = (
+          tf.placeholder(
+              dtype=parameters["index_type"],
+              name="strides",
+              shape=[len(parameters["input_shape"])])
+          if parameters["strides"] is not None else None)
+      tensors = [input_tensor, begin, end]
+      if strides is not None:
+        tensors.append(strides)
     out = tf.strided_slice(
         input_tensor,
         begin,
@@ -1628,14 +1637,17 @@ def make_strided_slice_tests(zip_path):
     input_values = create_tensor_data(parameters["dtype"],
                                       parameters["input_shape"])
     index_type = _TF_TYPE_INFO[parameters["index_type"]][0]
-    begin_values = np.array(parameters["begin"]).astype(index_type)
-    end_values = np.array(parameters["end"]).astype(index_type)
-    stride_values = (
-        np.array(parameters["strides"]).astype(index_type)
-        if parameters["strides"] is not None else None)
-    values = [input_values, begin_values, end_values]
-    if stride_values is not None:
-      values.append(stride_values)
+    values = [input_values]
+    if not parameters["constant_indices"]:
+      begin_values = np.array(parameters["begin"]).astype(index_type)
+      end_values = np.array(parameters["end"]).astype(index_type)
+      stride_values = (
+          np.array(parameters["strides"]).astype(index_type)
+          if parameters["strides"] is not None else None)
+      values.append(begin_values)
+      values.append(end_values)
+      if stride_values is not None:
+        values.append(stride_values)
 
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 4fb3b6ae7a..7f26884bc1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1120,7 +1120,8 @@ void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
       stop += input_array.shape().dims(i);
     }
 
-    int dim_size = (stop - start) / op->strides[i];
+    int dim_size = ceil((stop - start) / static_cast<float>(op->strides[i]));
+    dim_size = dim_size < 0 ? 0 : dim_size;
     if (op->shrink_axis_mask & mask) {
       CHECK_EQ(dim_size, 1) << "Output size for an axis must compute to 1 when "
                                "shrinking that axis";
-- 
GitLab


From b0ac7c553db40d42e699baa86442289b1314a2e1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 15:05:49 -0800
Subject: [PATCH 427/966] Always evaluate() on TPU in TPUEstimator unless
 use_tpu is set to False.

PiperOrigin-RevId: 183899481
---
 .../contrib/tpu/python/tpu/tpu_config.py      |  3 +-
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 70 ++++++++-----------
 2 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 0c2580211a..188db6e2f0 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -53,7 +53,8 @@ class TPUConfig(
     num_shards: The number of TPU shards in the system.
     per_host_input_for_training: If `True`, `input_fn` is invoked Per-Host
       rather than Per-Core. With Per-Host input pipeline deployment, `input_fn`
-      is invoked once on each host. To be precise, with a global batch size
+      is invoked once on each host. With Per-Core input pipeline deployment, it
+      is invoked once for each core. To be precise, with a global batch size
       `train_batch_size` in `TPUEstimator` constructor, the batch size for each
       shard is `train_batch_size` // #hosts. With Per-Core input pipeline
       deployment, the shard batch size is `train_batch_size` // #cores.
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 9d715bb236..bc55dbcb50 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -215,16 +215,11 @@ class _TPUContext(object):
   def is_running_on_cpu(self):
     """Determines whether the input_fn and model_fn should be invoked on CPU."""
     mode = self._assert_mode()
-    return ((not self._use_tpu) or mode == model_fn_lib.ModeKeys.PREDICT or
-            (mode == model_fn_lib.ModeKeys.EVAL and
-             self._eval_batch_size is None))
+    return (not self._use_tpu) or mode == model_fn_lib.ModeKeys.PREDICT
 
   @property
   def global_batch_size(self):
     mode = self._assert_mode()
-    if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None:
-      raise RuntimeError('Internal error, EVAL on TPU is not enabled, but '
-                         '`global_batch_size` is called.')
     return (self._train_batch_size
             if mode == model_fn_lib.ModeKeys.TRAIN else self._eval_batch_size)
 
@@ -232,9 +227,6 @@ class _TPUContext(object):
   def batch_size_for_input_fn(self):
     """Returns the shard batch size for `input_fn`."""
     mode = self._assert_mode()
-    # Special case for eval.
-    if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None:
-      return None
     if self.is_running_on_cpu():
       if mode == model_fn_lib.ModeKeys.TRAIN:
         return self._train_batch_size
@@ -255,9 +247,6 @@ class _TPUContext(object):
   def batch_size_for_model_fn(self):
     """Returns the shard batch size for `model_fn`."""
     mode = self._assert_mode()
-    # Special case for eval.
-    if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None:
-      return None
     if self.is_running_on_cpu():
       if mode == model_fn_lib.ModeKeys.TRAIN:
         return self._train_batch_size
@@ -1464,30 +1453,28 @@ class TPUEstimator(estimator_lib.Estimator):
   replicating inputs and models for each core, and returning to host
   periodically to run hooks.
 
-  If `use_tpu` is false, all training, evaluation, and predict are executed on
-  CPU.
-
-  For training, TPUEstimator transforms a global batch size in params to a
-  per-shard batch size when calling the `input_fn` and `model_fn`. Users should
-  specify `train_batch_size` in constructor, and then get the batch size for
-  each shard in `input_fn` and `model_fn` by `params['batch_size']`. If
-  `TPUConfig.per_host_input_for_training` is `True`, `input_fn` is invoked per
-  host rather than per core. In this case, a global batch size is transformed a
-  per-host batch size in params for `input_fn`, but `model_fn` still gets
-  per-core batch size.
-
-  For evaluation, if `eval_batch_size` is None, it is executed on CPU, even if
-  `use_tpu` is `True`. If `eval_batch_size` is not `None`, it is executed on
-  TPU, which is an experimental feature. In this case, `model_fn` should return
-  `TPUEstimatorSpec` instead of `EstimatorSpec`, which expects the
-  `eval_metrics` for TPU evaluation.
-
+  TPUEstimator transforms a global batch size in params to a per-shard batch
+  size when calling the `input_fn` and `model_fn`. Users should specify
+  global batch size in constructor, and then get the batch size for each shard
+  in `input_fn` and `model_fn` by `params['batch_size']`.
+  For training, `model_fn` gets per-core batch size; `input_fn` may get
+  per-core or per-host batch size depending on
+  `per_host_input_for_training` in `TPUConfig`.
+  For evaluation, `model_fn` gets per-core batch size and `input_fn` get
+  per-host batch size.
+
+  `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
+  for TPU evaluation.
   `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
   `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. (See
   `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
   a dict from metric string name to the result of calling a metric function,
   namely a `(metric_tensor, update_op)` tuple.
 
+  One can set `use_tpu` to `False` for testing. All training, evaluation, and
+  predict will be executed on CPU. `input_fn` and `model_fn` will receive
+  `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
+
   Current limitations:
 
   1. TPU evaluation only works on single host.
@@ -1560,8 +1547,7 @@ class TPUEstimator(estimator_lib.Estimator):
         basic python types. There are reserved keys for `TPUEstimator`,
         including 'batch_size'.
       use_tpu: A bool indicating whether TPU support is enabled. Currently,
-        - TPU training respects this bit.
-        - If true, see `eval_batch_size` for evaluate support.
+        - TPU training and evaluation respect this bit.
         - Predict still happens on CPU.
       train_batch_size: An int representing the global training batch size.
         TPUEstimator transforms this global batch size to a per-shard batch
@@ -1569,9 +1555,7 @@ class TPUEstimator(estimator_lib.Estimator):
         Cannot be `None` if `use_tpu` is `True`. Must be divisible by
         `config.tpu_config.num_shards`.
       eval_batch_size: An int representing the global training batch size.
-        Currently, if `None`, evaluation is still executed on CPU (even when
-        `use_tpu` is True). In near future, `use_tpu` will be the only option to
-        switch between TPU/CPU evaluation.
+        Must be divisible by `config.tpu_config.num_shards`.
       batch_axis: A python tuple of int values describing how each tensor
         produced by the Estimator `input_fn` should be split across the TPU
         compute shards. For example, if your input_fn produced (images, labels)
@@ -1611,10 +1595,10 @@ class TPUEstimator(estimator_lib.Estimator):
             .format(train_batch_size, config.tpu_config.num_shards))
 
       if eval_batch_size is not None:
-        if config.tpu_config.num_shards > 8:
-          raise NotImplementedError(
-              'TPU evaluation is only supported with one host.')
-
+        if not isinstance(eval_batch_size, int):
+          raise ValueError('`eval_batch_size` must be an int')
+        if eval_batch_size < 1:
+          raise ValueError('`eval_batch_size` must be positive')
         if eval_batch_size % config.tpu_config.num_shards != 0:
           raise ValueError(
               'eval batch size {} must be divisible by number of shards {}'
@@ -1687,6 +1671,14 @@ class TPUEstimator(estimator_lib.Estimator):
 
     util_lib.check_positive_integer(steps, 'Eval steps')
 
+    if self._config.tpu_config.num_shards > 8:
+      raise NotImplementedError(
+          'TPU evaluation is only supported with one host.')
+
+    if self._ctx._eval_batch_size is None:  # pylint: disable=protected-access
+      raise ValueError('`eval_batch_size` cannot be `None`'
+                       'if evaluate() is called on TPU.')
+
     return [
         evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
             num_evals=steps),
-- 
GitLab


From ac7167794f7c22d42c14a8d2f47b9533e118d73f Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 30 Jan 2018 15:07:03 -0800
Subject: [PATCH 428/966] Supporting paths for build_server script.

PiperOrigin-RevId: 183899675
---
 tensorflow/tools/dist_test/build_server.sh | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/tensorflow/tools/dist_test/build_server.sh b/tensorflow/tools/dist_test/build_server.sh
index 878fabd248..225c034741 100755
--- a/tensorflow/tools/dist_test/build_server.sh
+++ b/tensorflow/tools/dist_test/build_server.sh
@@ -16,14 +16,15 @@
 #
 # Builds the test server for distributed (GRPC) TensorFlow
 #
-# Usage: build_server.sh <docker_image_name> <whl_url> [--test]
+# Usage: build_server.sh <docker_image_name> <whl_file_location> [--test]
 #
 # Arguments:
 #   docker_image_name: Name of the docker image to build.
 #     E.g.: tensorflow/tf_grpc_test_server:0.11.0rc1
 #
-#   whl_url: URL from which the TensorFlow whl file will be downloaded.
+#   whl_file_location: URL from which the TensorFlow whl file will be downloaded.
 #     E.g.: https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
+#     E.g.: /path/to/folder/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 #
 # The optional flag --test lets the script to use the Dockerfile for the
 # testing GRPC server. Without the flag, the script will build the non-test
@@ -41,11 +42,11 @@ die() {
 
 # Check arguments
 if [[ $# -lt 2 ]]; then
-  die "Usage: $0 <docker_image_name> <whl_url> [--test]"
+  die "Usage: $0 <docker_image_name> <whl_location> [--test]"
 fi
 
 DOCKER_IMG_NAME=$1
-WHL_URL=$2
+WHL_FILE_LOCATION=$2
 shift 2
 
 # Current script directory
@@ -53,7 +54,7 @@ DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 BUILD_DIR=$(mktemp -d)
 echo ""
-echo "Using whl file URL: ${WHL_URL}"
+echo "Using whl file URL: ${WHL_FILE_LOCATION}"
 echo "Building in temporary directory: ${BUILD_DIR}"
 
 cp -r ${DIR}/* "${BUILD_DIR}"/ || \
@@ -65,9 +66,15 @@ if [[ $1 == "--test" ]]; then
 fi
 echo "Using Docker file: ${DOCKER_FILE}"
 
+if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then
+    # Download whl file into the build context directory.
+    wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \
+        die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}"
+else
+    cp "${WHL_FILE_LOCATION}" "${BUILD_DIR}"
+fi
+
 # Download whl file into the build context directory.
-wget -P "${BUILD_DIR}" ${WHL_URL} || \
-    die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
 
 if [[ ! -f "${DOCKER_FILE}" ]]; then
   die "ERROR: Unable to find dockerfile: ${DOCKER_FILE}"
-- 
GitLab


From e7df91c5f65e8b10535d1a09c817aa4e900e4001 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Tue, 30 Jan 2018 15:11:33 -0800
Subject: [PATCH 429/966] Internal change.

PiperOrigin-RevId: 183900332
---
 .../contrib/lite/kernels/batch_to_space_nd.cc       |  9 ++++++++-
 .../contrib/lite/kernels/batch_to_space_nd_test.cc  | 13 +++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
index d84a77039b..889239f932 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
@@ -57,6 +57,7 @@ TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
                                 BatchToSpaceNDContext* op_context) {
   TfLiteIntArray* input_size = op_context->input->dims;
   const int* block_shape = GetTensorData<int32>(op_context->block_shape);
+  const int* crops = GetTensorData<int32>(op_context->crops);
 
   TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->block_shape),
                     kBlockSizeDimensionNum);
@@ -65,7 +66,13 @@ TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
   TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->crops),
                     kSpatialDimensionNum);
 
-  // TODO(ycling): Add crops as part of calculation.
+  // TODO(ycling): Add crops as part of calculation. Remove check for a crops
+  // containing all zeroes.
+  TF_LITE_ENSURE_EQ(context, crops[0], 0);
+  TF_LITE_ENSURE_EQ(context, crops[1], 0);
+  TF_LITE_ENSURE_EQ(context, crops[2], 0);
+  TF_LITE_ENSURE_EQ(context, crops[3], 0);
+
   // Number of batch must be multiple of (block_shape[0] * block_shape[1]).
   TF_LITE_ENSURE_EQ(context,
                     input_size->data[0] % (block_shape[0] * block_shape[1]), 0);
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
index c9152bf967..8485cde1b4 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
@@ -119,6 +119,19 @@ TEST(BatchToSpaceNDOpTest, InvalidShapeTest) {
                "Cannot allocate tensors");
 }
 
+TEST(BatchToSpaceNDOpTest, InvalidCropsConstTest) {
+  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 1}),
+               "1 != 0");
+}
+
+TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) {
+  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetCrops({0, 0, 1, 0});
+  EXPECT_DEATH(m.Invoke(), "1 != 0");
+}
+
 }  // namespace
 }  // namespace tflite
 
-- 
GitLab


From 3e0396be541f185acbfeaf170ff46bccda0c21eb Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Tue, 30 Jan 2018 15:24:11 -0800
Subject: [PATCH 430/966] Add missing tf_copts() to Build file so #if
 GOOGLE_CUDA directives work correctly

---
 tensorflow/contrib/tensorrt/BUILD | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index a44e5e871b..c0bba9f950 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -20,6 +20,7 @@ load(
     "tf_cuda_cc_test",
     "tf_cuda_library",
     "tf_custom_op_py_library",
+    "tf_copts",
 )
 load(
     "@local_config_tensorrt//:build_defs.bzl",
@@ -163,6 +164,7 @@ py_library(
 tf_py_wrap_cc(
     name = "wrap_conversion",
     srcs = ["trt_conversion.i"],
+    copts = tf_copts(),
     deps = [
         ":trt_conversion",
         "//tensorflow/core:framework_lite",
-- 
GitLab


From b76f013900afcf679d27cf03fbd21c4e5035e9a2 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 30 Jan 2018 15:36:08 -0800
Subject: [PATCH 431/966] Internal change.

PiperOrigin-RevId: 183904042
---
 tensorflow/python/eager/BUILD                 | 23 ------
 tensorflow/python/eager/gen_op.bzl            | 65 -----------------
 .../python/eager/python_eager_op_gen_main.cc  | 72 -------------------
 3 files changed, 160 deletions(-)
 delete mode 100644 tensorflow/python/eager/gen_op.bzl
 delete mode 100644 tensorflow/python/eager/python_eager_op_gen_main.cc

diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 9e3382d4f3..ab81d40148 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -206,29 +206,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "python_eager_op_gen_main",
-    srcs = [
-        "python_eager_op_gen_main.cc",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":python_eager_op_gen",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:op_gen_lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_cc_binary(
-    name = "python_eager_op_gen_demo",
-    deps = [
-        ":python_eager_op_gen_main",
-        "//tensorflow/core:ops",
-    ],
-)
-
 py_library(
     name = "custom_gradient",
     srcs = ["custom_gradient.py"],
diff --git a/tensorflow/python/eager/gen_op.bzl b/tensorflow/python/eager/gen_op.bzl
deleted file mode 100644
index 8bc1d6c10a..0000000000
--- a/tensorflow/python/eager/gen_op.bzl
+++ /dev/null
@@ -1,65 +0,0 @@
-"""For eager-mode Python."""
-
-load("//tensorflow:tensorflow.bzl",
-     "clean_dep",
-     "tf_binary_additional_srcs",
-     "tf_copts",
-     "tf_cc_binary")
-
-def tfe_gen_op_wrapper_py(name,
-                          out=None,
-                          visibility=None,
-                          deps=[],
-                          generated_target_name=None,
-                          # ApiDefs will be loaded in the order specified in this list.
-                          api_def_srcs=[]):
-  """Generate an eager-mode Python op wrapper for an op library."""
-  # Construct a cc_binary containing the specified ops.
-  tool_name = "gen_" + name + "_py_wrappers_cc"
-  if not deps:
-    deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
-  tf_cc_binary(
-      name=tool_name,
-      linkopts=["-lm"],
-      copts=tf_copts(),
-      linkstatic=1,
-      deps=([
-          clean_dep("//tensorflow/python/eager:python_eager_op_gen_main")
-      ] + deps),
-      visibility=[clean_dep("//visibility:public")],)
-
-  # Invoke the previous cc_binary to generate a python file.
-  if not out:
-    out = "gen_" + name + ".py"
-
-  if not api_def_srcs:
-    api_def_args_str = ","
-  else:
-    api_def_args = []
-    for api_def_src in api_def_srcs:
-      # Add directory of the first ApiDef source to args.
-      # We are assuming all ApiDefs in a single api_def_src are in the
-      # same directory.
-      api_def_args.append(
-          "$$(dirname $$(echo $(locations " + api_def_src +
-          ") | cut -d\" \" -f1))")
-    api_def_args_str = ",".join(api_def_args)
-
-  native.genrule(
-      name=name + "_pygenrule",
-      outs=[out],
-      srcs=api_def_srcs,
-      tools=[tool_name] + tf_binary_additional_srcs(),
-      cmd=("$(location " + tool_name + ") " + api_def_args_str + " > $@"))
-
-  # Make a py_library out of the generated python file.
-  if not generated_target_name:
-    generated_target_name = name
-  native.py_library(
-      name=generated_target_name,
-      srcs=[out],
-      srcs_version="PY2AND3",
-      visibility=visibility,
-      deps=[
-          clean_dep("//tensorflow/python/eager:framework_for_generated_wrappers"),
-      ],)
diff --git a/tensorflow/python/eager/python_eager_op_gen_main.cc b/tensorflow/python/eager/python_eager_op_gen_main.cc
deleted file mode 100644
index 05351bd8b1..0000000000
--- a/tensorflow/python/eager/python_eager_op_gen_main.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/python/eager/python_eager_op_gen.h"
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_def.pb.h"
-#include "tensorflow/core/framework/op_gen_lib.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
-
-namespace tensorflow {
-namespace {
-
-void PrintAllPythonOps(const std::vector<string>& hidden_ops,
-                       const std::vector<string>& api_def_dirs) {
-  OpList ops;
-  OpRegistry::Global()->Export(false, &ops);
-
-  ApiDefMap api_def_map(ops);
-  if (!api_def_dirs.empty()) {
-    Env* env = Env::Default();
-
-    for (const auto& api_def_dir : api_def_dirs) {
-      std::vector<string> api_files;
-      TF_CHECK_OK(env->GetMatchingPaths(io::JoinPath(api_def_dir, "*.pbtxt"),
-                                        &api_files));
-      TF_CHECK_OK(api_def_map.LoadFileList(env, api_files));
-    }
-    api_def_map.UpdateDocs();
-  }
-
-  PrintEagerPythonOps(ops, api_def_map, hidden_ops, true /* require_shapes */);
-}
-
-}  // namespace
-}  // namespace tensorflow
-
-int main(int argc, char* argv[]) {
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  // Usage:
-  //   python_eager_op_gen_main api_def_dir1,api_def_dir2,...
-  if (argc == 1) {
-    tensorflow::PrintAllPythonOps({}, {});
-  } else if (argc == 2) {
-    const std::vector<tensorflow::string> api_def_dirs =
-        tensorflow::str_util::Split(argv[1], ",",
-                                    tensorflow::str_util::SkipEmpty());
-    tensorflow::PrintAllPythonOps({}, api_def_dirs);
-  } else {
-    return -1;
-  }
-  return 0;
-}
-- 
GitLab


From 1f89285655e201f89c7fdb027cf16cb3eeeb8ade Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 15:53:55 -0800
Subject: [PATCH 432/966] [XLA] Add Clamp and Round to the local Python XLA
 client.

PiperOrigin-RevId: 183906722
---
 .../xla/python/local_computation_builder.cc   | 17 +++++++++-----
 .../xla/python/local_computation_builder.h    | 14 +++++++----
 .../xla/python/local_computation_builder.i    |  2 ++
 tensorflow/compiler/xla/python/xla_client.py  |  8 +++++++
 .../compiler/xla/python/xla_client_test.py    | 23 +++++++++++++++++++
 5 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 5772532b84..67a73bc33d 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -367,12 +367,6 @@ LocalComputationBuilder::SelectAndScatterWithGeneralPadding(
       source, init_value, scatter.computation());
 }
 
-ComputationDataHandle LocalComputationBuilder::Select(
-    const ComputationDataHandle& pred, const ComputationDataHandle& on_true,
-    const ComputationDataHandle& on_false) {
-  return builder_.Select(pred, on_true, on_false);
-}
-
 ComputationDataHandle LocalComputationBuilder::Tuple(
     tensorflow::gtl::ArraySlice<ComputationDataHandle> elements) {
   return builder_.Tuple(elements);
@@ -487,6 +481,15 @@ ComputationDataHandle LocalComputationBuilder::While(
        tensorflow::gtl::ArraySlice<int64> broadcast_dimensions),           \
       (lhs, rhs, broadcast_dimensions))
 
+#define _FORWARD_TRIOP(method_name)                                        \
+  _FORWARD(                                                                \
+      method_name, ComputationDataHandle,                                  \
+      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
+       const ComputationDataHandle& ehs),                                  \
+      (lhs, rhs, ehs))
+
+_FORWARD_TRIOP(Select)
+_FORWARD_TRIOP(Clamp)
 _FORWARD_BINOP(Eq)
 _FORWARD_BINOP(Ne)
 _FORWARD_BINOP(Ge)
@@ -507,6 +510,7 @@ _FORWARD_UNOP(Abs)
 _FORWARD_UNOP(Exp)
 _FORWARD_UNOP(Floor)
 _FORWARD_UNOP(Ceil)
+_FORWARD_UNOP(Round)
 _FORWARD_UNOP(Log)
 _FORWARD_UNOP(Sign)
 _FORWARD_UNOP(Cos)
@@ -523,6 +527,7 @@ _FORWARD_UNOP(Sort)
 #undef _FORWARD
 #undef _FORWARD_UNOP
 #undef _FORWARD_BINOP
+#undef _FORWARD_TRIOP
 
 void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer) {
   delete local_shaped_buffer;
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 6851c2644d..d5c4c58040 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -174,10 +174,6 @@ class LocalComputationBuilder {
       const ComputationDataHandle& source,
       const ComputationDataHandle& init_value, const LocalComputation& scatter);
 
-  ComputationDataHandle Select(const ComputationDataHandle& pred,
-                               const ComputationDataHandle& on_true,
-                               const ComputationDataHandle& on_false);
-
   ComputationDataHandle Tuple(
       tensorflow::gtl::ArraySlice<ComputationDataHandle> elements);
 
@@ -254,6 +250,14 @@ class LocalComputationBuilder {
       (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
        tensorflow::gtl::ArraySlice<int64> broadcast_dimensions))
 
+#define _FORWARD_TRIOP(method_name)                                        \
+  _FORWARD(                                                                \
+      method_name, ComputationDataHandle,                                  \
+      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
+       const ComputationDataHandle& ehs))
+
+  _FORWARD_TRIOP(Select)
+  _FORWARD_TRIOP(Clamp)
   _FORWARD_BINOP(Eq)
   _FORWARD_BINOP(Ne)
   _FORWARD_BINOP(Ge)
@@ -274,6 +278,7 @@ class LocalComputationBuilder {
   _FORWARD_UNOP(Exp)
   _FORWARD_UNOP(Floor)
   _FORWARD_UNOP(Ceil)
+  _FORWARD_UNOP(Round)
   _FORWARD_UNOP(Log)
   _FORWARD_UNOP(Sign)
   _FORWARD_UNOP(Cos)
@@ -290,6 +295,7 @@ class LocalComputationBuilder {
 #undef _FORWARD
 #undef _FORWARD_UNOP
 #undef _FORWARD_BINOP
+#undef _FORWARD_TRIOP
 
  private:
   ComputationBuilder builder_;
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 6a52a088dd..89f8385501 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -701,6 +701,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Call;
 %unignore xla::swig::LocalComputationBuilder::Transpose;
 %unignore xla::swig::LocalComputationBuilder::Rev;
+%unignore xla::swig::LocalComputationBuilder::Clamp;
 %unignore xla::swig::LocalComputationBuilder::Map;
 %unignore xla::swig::LocalComputationBuilder::Reduce;
 %unignore xla::swig::LocalComputationBuilder::ReduceWindowWithGeneralPadding;
@@ -730,6 +731,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Exp;
 %unignore xla::swig::LocalComputationBuilder::Floor;
 %unignore xla::swig::LocalComputationBuilder::Ceil;
+%unignore xla::swig::LocalComputationBuilder::Round;
 %unignore xla::swig::LocalComputationBuilder::Log;
 %unignore xla::swig::LocalComputationBuilder::Sign;
 %unignore xla::swig::LocalComputationBuilder::Cos;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index a89e2643c8..7ee5febc09 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -89,6 +89,7 @@ _UNARY_OPS = [
     'Abs',
     'Exp',
     'Floor',
+    'Round',
     'Ceil',
     'Log',
     'Sign',
@@ -619,6 +620,13 @@ class ComputationBuilder(object):
     return _wrap_data_handle(
         self._client.Rev(_unwrap_data_handle(operand), dimensions))
 
+  def Clamp(self, min, operand, max):  # pylint: disable=redefined-builtin
+    """Clamp op."""
+    return _wrap_data_handle(
+        self._client.Clamp(_unwrap_data_handle(min),
+                           _unwrap_data_handle(operand),
+                           _unwrap_data_handle(max)))
+
   def SelectAndScatter(self, operand, select, window_dimensions, window_strides,
                        padding, source, init_value, scatter):
     """Select and scatter op, used by the gradient of ReduceWindow.
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index c0413b9bbc..3b5bbfd786 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -496,6 +496,12 @@ class SingleOpTest(LocalComputationTest):
     c.Exp(c.Constant(arr))
     self._ExecuteAndCompareClose(c, expected=np.exp(arr))
 
+  def testRound(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Round(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.round(arr))
+
   def testLog(self):
     c = self._NewComputation()
     arr = NumpyArrayF32([3.3, 12.1])
@@ -699,6 +705,23 @@ class SingleOpTest(LocalComputationTest):
     self._ExecuteAndCompareExact(
         c, expected=[[[6, 5], [8, 7]], [[2, 1], [4, 3]]])
 
+  def testClampF32(self):
+    c = self._NewComputation()
+    c.Clamp(
+        c.Constant(NumpyArrayF32(-1)),
+        c.Constant(NumpyArrayF32([-2, -1, 0, 1, 2, 3])),
+        c.Constant(NumpyArrayF32(2)))
+    self._ExecuteAndCompareExact(c, expected=[-1, -1, 0, 1, 2, 2])
+
+  # TODO(b/72689392): re-enable when bug S32 resolved
+  def DISABLED_testClampS32(self):
+    c = self._NewComputation()
+    c.Clamp(
+        c.Constant(NumpyArrayS32(-1)),
+        c.Constant(NumpyArrayS32([-2, -1, 0, 1, 2, 3])),
+        c.Constant(NumpyArrayS32(2)))
+    self._ExecuteAndCompareExact(c, expected=[-1, 0, 1, 2, 2])
+
   def testSelect(self):
     c = self._NewComputation()
     c.Select(
-- 
GitLab


From 6ed47cc06b0d7f445b322fb15bd08d28aa7791b4 Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Tue, 30 Jan 2018 16:12:32 -0800
Subject: [PATCH 433/966] Fix _standardize_input_data function to allow users
 to use list of ints/floats and list of lists as training and validation data
 input.

PiperOrigin-RevId: 183909545
---
 .../keras/_impl/keras/engine/training.py      | 21 +++++++++++--------
 .../keras/_impl/keras/engine/training_test.py | 18 ++++++++++++++++
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 699ae2edf0..6885ef5724 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -82,21 +82,24 @@ def _standardize_input_data(data,
           if data[x].__class__.__name__ == 'DataFrame' else data[x]
           for x in names
       ]
-      data = [np.expand_dims(x, 1) if x.ndim == 1 else x for x in data]
     except KeyError as e:
       raise ValueError('No data provided for "' + e.args[0] + '". Need data '
                        'for each key in: ' + str(names))
   elif isinstance(data, list):
-    data = [
-        x.values if x.__class__.__name__ == 'DataFrame' else x for x in data
-    ]
-    data = [
-        np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x
-        for x in data
-    ]
+    if isinstance(data[0], list):
+      data = [np.asarray(d) for d in data]
+    elif len(names) == 1 and isinstance(data[0], (float, int)):
+      data = [np.asarray(data)]
+    else:
+      data = [
+          x.values if x.__class__.__name__ == 'DataFrame' else x for x in data
+      ]
   else:
     data = data.values if data.__class__.__name__ == 'DataFrame' else data
-    data = [np.expand_dims(data, 1)] if data.ndim == 1 else [data]
+    data = [data]
+  data = [
+      np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data
+  ]
 
   if len(data) != len(names):
     if data and hasattr(data[0], 'shape'):
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index 5a033a04ad..b380238e4e 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -78,6 +78,14 @@ class TrainingTest(test.TestCase):
           verbose=2)
       model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
 
+      # Test model with input data as a list of lists
+      model.fit(
+          [np.ndarray.tolist(input_a_np), np.ndarray.tolist(input_b_np)],
+          [output_d_np, output_e_np],
+          epochs=2,
+          batch_size=5,
+          verbose=2)
+
       # Test with validation data
       model.fit(
           [input_a_np, input_b_np], [output_d_np, output_e_np],
@@ -205,6 +213,16 @@ class TrainingTest(test.TestCase):
       with self.assertRaises(ValueError):
         model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
 
+      # Test model on a list of floats
+      input_a_np = np.random.random((10, 3))
+      input_b_np = np.random.random((10, 4))
+
+      model.fit([np.ndarray.tolist(input_a_np)],
+                [np.ndarray.tolist(input_b_np)],
+                epochs=2,
+                batch_size=5,
+                verbose=2)
+
   def test_evaluate_predict_on_arrays(self):
     with self.test_session():
       a = keras.layers.Input(shape=(3,), name='input_a')
-- 
GitLab


From 600c7a5fb8ed91060d3dae07a2fdd655af703244 Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Tue, 30 Jan 2018 16:14:17 -0800
Subject: [PATCH 434/966] [tf.data] Native support for `tf.SparseTensor` and
 serialization tests for the `tf.data` methods that can produce
 `tf.SparseTensor` elements.

PiperOrigin-RevId: 183909820
---
 .../kernel_tests/batch_dataset_op_test.py     | 17 +++++++++++
 .../dataset_serialization_test_base.py        | 27 +++++++++++++----
 .../kernel_tests/filter_dataset_op_test.py    |  4 +++
 .../kernel_tests/flat_map_dataset_op_test.py  | 15 ++++++++++
 .../interleave_dataset_op_test.py             | 16 ++++++++++
 .../kernel_tests/map_dataset_op_test.py       | 29 +++++++++++++++----
 6 files changed, 97 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 015f69c567..0c2827b1e4 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -744,6 +744,23 @@ class BatchDatasetSerializationTest(
                         lambda: self._build_dataset_dense_to_sparse(diff_comp),
                         num_outputs)
 
+  def _sparse(self, i):
+    return sparse_tensor.SparseTensorValue(
+        indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+  def _build_dataset_sparse(self, batch_size=5):
+    return dataset_ops.Dataset.range(10).map(self._sparse).batch(batch_size)
+
+  def testSparseCore(self):
+    self.run_core_tests(self._build_dataset_sparse,
+                        lambda: self._build_dataset_sparse(2), 2)
+
+  def _build_dataset_nested_sparse(self):
+    return dataset_ops.Dataset.range(10).map(self._sparse).batch(5).batch(2)
+
+  def testNestedSparseCore(self):
+    self.run_core_tests(self._build_dataset_nested_sparse, None, 1)
+
 
 class PaddedBatchDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index 701fc8247e..4574a625a3 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -41,8 +41,8 @@ class DatasetSerializationTestBase(test.TestCase):
   def tearDown(self):
     self._delete_ckpt()
 
-  # TODO(b/70988345): Support native `tf.SparseTensor` objects and get rid of
-  # `sparse_tensors` argument.
+  # TODO(b/72657739): Remove sparse_tensor argument to test the (deprecated)
+  # `from_sparse_tensor_slices()` API once the API and tests are deleted.
   def run_core_tests(self, ds_fn1, ds_fn2, num_outputs, sparse_tensors=False):
     """Runs the core tests.
 
@@ -559,13 +559,16 @@ class DatasetSerializationTestBase(test.TestCase):
       get_next = sparse_tensor.SparseTensor(*iterator.get_next())
     else:
       get_next = iterator.get_next()
-    self._add_iterator_ops_to_collection(init_op, get_next, sparse_tensors)
+    self._add_iterator_ops_to_collection(init_op, get_next, ds_fn,
+                                         sparse_tensors)
     saver = saver_lib.Saver(allow_empty=True)
     return init_op, get_next, saver
 
   def _build_empty_graph(self, ds_fn, sparse_tensors=False):
     iterator = iterator_ops.Iterator.from_structure(
-        self._get_output_types(ds_fn), self._get_output_shapes(ds_fn))
+        self._get_output_types(ds_fn),
+        output_shapes=self._get_output_shapes(ds_fn),
+        output_classes=self._get_output_classes(ds_fn))
     saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
     if sparse_tensors:
@@ -578,12 +581,19 @@ class DatasetSerializationTestBase(test.TestCase):
   def _add_iterator_ops_to_collection(self,
                                       init_op,
                                       get_next,
+                                      ds_fn,
                                       sparse_tensors=False):
     ops.add_to_collection("iterator_ops", init_op)
     # `get_next` may be a tuple e.g. in TensorSliceDataset. Since Collections
     # do not support tuples we flatten the tensors and restore the shape in
     # `_get_iterator_ops_from_collection`.
-    if sparse_tensors:
+
+    # TODO(shivaniagrwal): `output_classes` is a nested structure of classes,
+    # this base class is specific to current test cases. Update when test are
+    # added with `output_classes` as a nested structure with at least one of the
+    # component being `tf.SparseTensor`.
+    if (sparse_tensors or
+        self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor):
       ops.add_to_collection("iterator_ops", get_next.indices)
       ops.add_to_collection("iterator_ops", get_next.values)
       ops.add_to_collection("iterator_ops", get_next.dense_shape)
@@ -593,7 +603,8 @@ class DatasetSerializationTestBase(test.TestCase):
 
   def _get_iterator_ops_from_collection(self, ds_fn, sparse_tensors=False):
     all_ops = ops.get_collection("iterator_ops")
-    if sparse_tensors:
+    if (sparse_tensors or
+        self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor):
       init_op, indices, values, dense_shape = all_ops
       return init_op, sparse_tensor.SparseTensor(indices, values, dense_shape)
     else:
@@ -608,6 +619,10 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default():
       return ds_fn().output_shapes
 
+  def _get_output_classes(self, ds_fn):
+    with ops.Graph().as_default():
+      return ds_fn().output_classes
+
   def _ckpt_path(self):
     return os.path.join(self.get_temp_dir(), "iterator")
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
index 5921be2ae8..06883934d0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
@@ -194,6 +194,10 @@ class FilterDatasetSerializationTest(
     return dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
         lambda x, i: x)
 
+  def testSparseCore(self):
+    num_outputs = 5
+    self.run_core_tests(self._build_sparse_filter, None, num_outputs)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
index d4fbaa5cdc..86d69495ef 100644
--- a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
@@ -225,6 +225,21 @@ class FlatMapDatasetSerializationTest(
 
     self.verify_error_on_save(build_ds, 500, errors.InvalidArgumentError)
 
+  def testSparseCore(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _flat_map_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    def _build_ds():
+      return dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
+
+    self.run_core_tests(_build_ds, None, 20)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index b1937c08f3..db8429512b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -252,6 +252,22 @@ class InterleaveDatasetSeriazationTest(
         None, num_outputs)
     # pylint: enable=g-long-lambda
 
+  def testSparseCore(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _interleave_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    def _build_dataset():
+      return dataset_ops.Dataset.range(10).map(_map_fn).interleave(
+          _interleave_fn, cycle_length=1)
+
+    self.run_core_tests(_build_dataset, None, 20)
+
 
 class ParallelInterleaveDatasetTest(test.TestCase):
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index dd8247bfd4..d3ce89298b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -805,6 +805,21 @@ class MapDatasetSerializationTest(
 
     self.run_core_tests(_build_ds, None, num_outputs)
 
+  def testSparseCore(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    def _build_ds(num_outputs):
+      return contrib_dataset_ops.Dataset.range(num_outputs).map(_sparse)
+
+    num_outputs = 10
+    self.run_core_tests(lambda: _build_ds(num_outputs),
+                        lambda: _build_ds(int(num_outputs / 2)), num_outputs)
+
 
 class ParallelMapDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
@@ -851,7 +866,8 @@ class ParallelMapDatasetSerializationTest(
         return random_ops.random_uniform(
             (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
 
-      return contrib_dataset_ops.Dataset.range(100).map(_map_fn)
+      return contrib_dataset_ops.Dataset.range(100).map(
+          _map_fn, num_parallel_calls=2).prefetch(2)
 
     self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
 
@@ -861,7 +877,8 @@ class ParallelMapDatasetSerializationTest(
       counter_var = variable_scope.get_variable(
           "counter", (), dtypes.int32, use_resource=True)
       return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map(
-          lambda _: counter_var.assign_add(1)))
+          lambda _: counter_var.assign_add(1),
+          num_parallel_calls=2).prefetch(2))
 
     self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
 
@@ -870,7 +887,7 @@ class ParallelMapDatasetSerializationTest(
     def _build_ds():
       constant_var = constant_op.constant(5)
       return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map(
-          lambda x: x + constant_var))
+          lambda x: x + constant_var, num_parallel_calls=2).prefetch(2))
 
     self.run_core_tests(_build_ds, None, 10)
 
@@ -883,7 +900,8 @@ class ParallelMapDatasetSerializationTest(
       def defun_fn(x):
         return constant_op.constant(1000) + math_ops.to_int32(x)
 
-      return contrib_dataset_ops.Dataset.range(num_outputs).map(defun_fn)
+      return contrib_dataset_ops.Dataset.range(num_outputs).map(
+          defun_fn, num_parallel_calls=2).prefetch(2)
 
     self.run_core_tests(_build_ds, None, num_outputs)
 
@@ -901,7 +919,8 @@ class ParallelMapDatasetSerializationTest(
 
         return constant_op.constant(11000) + defun_fn_deep(math_ops.to_int32(x))
 
-      return contrib_dataset_ops.Dataset.range(num_outputs).map(defun_fn)
+      return contrib_dataset_ops.Dataset.range(num_outputs).map(
+          defun_fn, num_parallel_calls=2).prefetch(2)
 
     self.run_core_tests(_build_ds, None, num_outputs)
 
-- 
GitLab


From f8f9163449c3b1ec60db7d05d68f652a0ca9257a Mon Sep 17 00:00:00 2001
From: joel-shor <joelshor@google.com>
Date: Tue, 30 Jan 2018 19:06:48 -0800
Subject: [PATCH 435/966] Simplify loader_impl.py logic around main Op Tensor.

---
 tensorflow/python/saved_model/loader_impl.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index 5ff954fd9f..6e85df0cbf 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -232,13 +232,9 @@ def load(sess, tags, export_dir, **saver_kwargs):
     asset_tensors_dictionary = _get_asset_tensors(export_dir,
                                                   meta_graph_def_to_load)
 
-    main_op_tensor = _get_main_op_tensor(meta_graph_def_to_load)
+    main_op_tensor = (_get_main_op_tensor(meta_graph_def_to_load) or
+                      (_get_legacy_init_op_tensor(meta_graph_def_to_load)))
     if main_op_tensor is not None:
       sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
-    else:
-      legacy_init_op_tensor = _get_legacy_init_op_tensor(meta_graph_def_to_load)
-      if legacy_init_op_tensor is not None:
-        sess.run(
-            fetches=[legacy_init_op_tensor], feed_dict=asset_tensors_dictionary)
 
     return meta_graph_def_to_load
-- 
GitLab


From 00c4febdac23cade71d6fdcf10c9ccb982e2a582 Mon Sep 17 00:00:00 2001
From: Koan-Sin Tan <koansin.tan@gmail.com>
Date: Wed, 31 Jan 2018 13:22:54 +0800
Subject: [PATCH 436/966] add brackets and new line

Add curly brackets and new line to address the format problem
mentioned in review.
---
 .../contrib/lite/examples/label_image/bitmap_helpers_impl.h   | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
index 942906e269..33ea695dda 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
@@ -67,7 +67,9 @@ void resize(T* out, uint8_t* in, int image_height, int image_width,
   // fill input image
   // in[] are integers, cannot do memcpy() directly
   auto input = interpreter->typed_tensor<float>(0);
-  for (int i = 0; i < number_of_pixels; i++) input[i] = in[i];
+  for (int i = 0; i < number_of_pixels; i++) {
+    input[i] = in[i];
+  }
 
   // fill new_sizes
   interpreter->typed_tensor<int>(1)[0] = wanted_height;
-- 
GitLab


From f8b1986d67b1bcc352acb7644b642faf46ca79cb Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Wed, 31 Jan 2018 11:07:25 -0800
Subject: [PATCH 437/966] Change VLOGs to increasing verbosity mode

---
 tensorflow/contrib/tensorrt/README.md         |   2 +-
 .../contrib/tensorrt/convert/convert_graph.cc |  13 +-
 .../contrib/tensorrt/convert/convert_nodes.cc | 132 +++++++++---------
 .../contrib/tensorrt/convert/convert_nodes.h  |   2 +-
 tensorflow/contrib/tensorrt/log/trt_logger.cc |   2 +-
 .../contrib/tensorrt/python/trt_convert.py    |   6 +-
 tensorflow/contrib/tensorrt/trt_conversion.i  |   6 +-
 7 files changed, 81 insertions(+), 82 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index b3c604f5f8..1e9524c26b 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -33,7 +33,7 @@ trt_gdef = trt.CreateInferenceGraph(
     gdef, #original graph_def
     ["output"], #name of output node(s)
     max_batch_size, #maximum batch size to run the inference
-    max_workspace_size) # max memory for TensorRT to use
+    max_workspace_size_bytes) # max memory for TensorRT to use
 tf.reset_default_graph()
 tf.import_graph_def(graph_def=trt_gdef)
 #...... run inference
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index e0f38c60ee..81fdf01286 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -123,7 +123,7 @@ tensorflow::Status ConvertSubGraphToTensorRT(
     const std::set<int>& subgraph_node_ids,
     size_t max_batch_size,  // max batch size that engine will be created for
     // max amount of memory that engine will be allowed to consume, in bytes
-    size_t max_workspace_size,
+    size_t max_workspace_size_bytes,
     const tensorflow::grappler::GraphProperties& graph_properties,
     tensorflow::Graph* graph) {
   tensorflow::EdgeSet subgraph_incoming_edges;
@@ -159,7 +159,8 @@ tensorflow::Status ConvertSubGraphToTensorRT(
   tensorflow::NodeDef trt_node_def;
   TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(
       *graph, subgraph_node_ids, subgraph_inputs, subgraph_outputs,
-      max_batch_size, max_workspace_size, graph_properties, &trt_node_def));
+      max_batch_size, max_workspace_size_bytes, graph_properties,
+      &trt_node_def));
   tensorflow::Status status;
   tensorflow::Node* trt_node = graph->AddNode(trt_node_def, &status);
 
@@ -205,7 +206,7 @@ tensorflow::Status BuildNodeMap(
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<std::string>& output_names, size_t max_batch_size,
-    size_t max_workspace_size, tensorflow::GraphDef* new_graph_def) {
+    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def) {
   // optimization pass
   tensorflow::grappler::GrapplerItem item;
   item.fetch = output_names;
@@ -258,7 +259,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
       gdef, IsTensorRTCandidate, segment_options, &segments));
   if (segments.size() > 1) {
-    VLOG(INFO) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
+    VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
   }
   std::unordered_map<std::string, tensorflow::Node*> node_map;
   TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
@@ -268,8 +269,8 @@ tensorflow::Status ConvertGraphDefToTensorRT(
       subgraph_node_ids.insert(node_map.at(node_name)->id());
     }
     TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
-        output_names, subgraph_node_ids, max_batch_size, max_workspace_size,
-        static_graph_properties, &graph));
+        output_names, subgraph_node_ids, max_batch_size,
+        max_workspace_size_bytes, static_graph_properties, &graph));
   }
   graph.ToGraphDef(new_graph_def);
   return tensorflow::Status::OK();
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 2653c1c636..c84684d485 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -103,9 +103,9 @@ static std::vector<std::pair<int, int>> createSamePadding(
     int left = p / 2;
     int right = p - left;
 
-    VLOG(-1) << "PADDING_" << i << " pre: " << left << ", post: " << right
-             << "paras: " << inputDims[i] << ", " << stride.d[i] << ", "
-             << "kernel: " << kernel.d[i];
+    VLOG(2) << "PADDING_" << i << " pre: " << left << ", post: " << right
+            << "paras: " << inputDims[i] << ", " << stride.d[i] << ", "
+            << "kernel: " << kernel.d[i];
     padding[i] = {left, right};
   }
   return padding;
@@ -352,7 +352,7 @@ class Converter {
       tensorflow::NodeDef const& node_def) {
     std::vector<TRT_TensorOrWeights> inputs;
     for (auto const& input_name : node_def.input()) {
-      VLOG(-1) << "retrieve input: " << input_name;
+      VLOG(2) << "retrieve input: " << input_name;
       inputs.push_back(_trt_tensors.at(input_name));
     }
     return inputs;
@@ -395,7 +395,7 @@ class Converter {
       if (output.is_tensor()) {
         output.tensor()->setName(output_name.c_str());
       }
-      VLOG(-1) << "write out tensor: " << output_name;
+      VLOG(2) << "write out tensor: " << output_name;
       if (!_trt_tensors.insert({output_name, output}).second) {
         return tensorflow::errors::AlreadyExists(
             "output tensor already exists for op: " + op);
@@ -456,13 +456,13 @@ struct LambdaFactory {
   std::function<T(T)> unary() {
     switch (op) {
       case OP_CATEGORY::RSQRT: {
-        VLOG(-1) << "RSQRT GETS DONE";
+        VLOG(2) << "RSQRT GETS DONE";
         return [](T t) -> T { return 1.0 / std::sqrt(t); };
       }
       case OP_CATEGORY::NEG:
         return [](T t) -> T { return -t; };
       default:
-        VLOG(-1) << "not supported op for unary: " << static_cast<int>(op);
+        VLOG(2) << "not supported op for unary: " << static_cast<int>(op);
         return nullptr;
     }
   }
@@ -487,22 +487,22 @@ struct LambdaFactory {
 
   template <typename T>
   std::function<T(T)> broadcast_r(T val) {
-    VLOG(-1) << "LAMBDA VAL : " << val;
+    VLOG(2) << "LAMBDA VAL : " << val;
     switch (op) {
       case OP_CATEGORY::ADD:
         return [val](T l) -> T {
-          VLOG(-1) << "LAMBDA VAL : " << val;
+          VLOG(2) << "LAMBDA VAL : " << val;
           return l + val;
         };
       // return [val](T l)-> T {return l+val;};
       case OP_CATEGORY::SUB:
         return [val](T l) -> T {
-          VLOG(-1) << "LAMBDA VAL : " << val;
+          VLOG(2) << "LAMBDA VAL : " << val;
           return l - val;
         };
       case OP_CATEGORY::MUL:
         return [val](T l) -> T {
-          VLOG(-1) << "LAMBDA VAL : " << val;
+          VLOG(2) << "LAMBDA VAL : " << val;
           return l * val;
         };
       default:
@@ -516,21 +516,21 @@ struct LambdaFactory {
 
   template <typename T>
   std::function<T(T)> broadcast_l(T val) {
-    VLOG(-1) << "LAMBDA VAL : " << val;
+    VLOG(2) << "LAMBDA VAL : " << val;
     switch (op) {
       case OP_CATEGORY::ADD:
         return [val](T l) -> T {
-          VLOG(-1) << "LAMBDA VAL : " << val;
+          VLOG(2) << "LAMBDA VAL : " << val;
           return val + l;
         };
       case OP_CATEGORY::SUB:
         return [val](T l) -> T {
-          VLOG(-1) << "LAMBDA VAL : " << val;
+          VLOG(2) << "LAMBDA VAL : " << val;
           return val - l;
         };
       case OP_CATEGORY::MUL:
         return [val](T l) -> T {
-          VLOG(-1) << "LAMBDA VAL : " << val;
+          VLOG(2) << "LAMBDA VAL : " << val;
           return val * l;
         };
       default:
@@ -570,7 +570,7 @@ tensorflow::Status BinaryCompute(TRT_ShapedWeights const& iweights_l,
   // assume iweights_l.type == iweight_r.type
   CHECK_EQ(iweights_l.type_, oweights->type_);
   CHECK_EQ(iweights_r.type_, oweights->type_);
-  VLOG(-1) << "SANITY CHECK!";
+  VLOG(2) << "SANITY CHECK!";
 
   switch (iweights_l.type_) {
     case tensorflow::DataType::DT_FLOAT: {
@@ -581,11 +581,11 @@ tensorflow::Status BinaryCompute(TRT_ShapedWeights const& iweights_l,
       if (iweights_l.count() != iweights_r.count()) {
         // we only supports broadcast of RankZero
         if (iweights_l.count() == 1) {
-          VLOG(-1) << "I bet it is not working!" << (*inp_l);
+          VLOG(2) << "I bet it is not working!" << (*inp_l);
           std::transform(inp_r, inp_r + iweights_r.count(), oup,
                          binary_op.broadcast_l<float>(*inp_l));
         } else if (iweights_r.count() == 1) {
-          VLOG(-1) << "I bet it is not working!" << (*inp_r);
+          VLOG(2) << "I bet it is not working!" << (*inp_r);
           std::transform(inp_l, inp_l + iweights_l.count(), oup,
                          binary_op.broadcast_r<float>(*inp_r));
         } else {
@@ -660,8 +660,8 @@ tensorflow::Status ConstantFoldBinary(
   int nbDims = weights_input_l.shape_.nbDims;
   nvinfer1::Dims output_shape;
   output_shape.nbDims = nbDims;
-  VLOG(-1) << "nbDims: " << nbDims
-           << "the other: " << weights_input_r.shape_.nbDims;
+  VLOG(2) << "nbDims: " << nbDims
+          << "the other: " << weights_input_r.shape_.nbDims;
   for (int i = 0; i < nbDims; i++) {
     if (weights_input_l.shape_.d[i] == weights_input_r.shape_.d[i]) {
       output_shape.d[i] = weights_input_l.shape_.d[i];
@@ -673,9 +673,9 @@ tensorflow::Status ConstantFoldBinary(
       return tensorflow::errors::Unimplemented(
           "Binary op with incompatible shape at, " + node_def.op());
     }
-    VLOG(-1) << "left: " << weights_input_l.shape_.d[i]
-             << "right: " << weights_input_r.shape_.d[i]
-             << "output: " << output_shape.d[i];
+    VLOG(2) << "left: " << weights_input_l.shape_.d[i]
+            << "right: " << weights_input_r.shape_.d[i]
+            << "output: " << output_shape.d[i];
   }
 
   // FIXME assume type matches input weights
@@ -735,7 +735,7 @@ tensorflow::Status BinaryTensorOpWeight(
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
   if (weights.count() == 1) {
-    VLOG(-1) << "UNIFORM";
+    VLOG(2) << "UNIFORM";
     scale_mode = nvinfer1::ScaleMode::kUNIFORM;
   } else {
     // no broadcasting on Batch dimension;
@@ -838,7 +838,7 @@ tensorflow::Status ConvertPlaceholder(
     Converter& ctx, tensorflow::NodeDef const& node_def,
     std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
-  VLOG(-1) << "Placeholder should have been replace already";
+  VLOG(2) << "Placeholder should have been replace already";
   return tensorflow::errors::Unimplemented("cannot convert Placeholder op");
   // OK this make sense since we are supposed to replace it with input
   TFAttrs attrs(node_def);
@@ -905,12 +905,12 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
-    VLOG(-1) << "padding!!!: " << padding[0].first << padding[0].second
-             << padding[1].first << padding[1].second;
+    VLOG(2) << "padding!!!: " << padding[0].first << padding[0].second
+            << padding[1].first << padding[1].second;
 
     auto dim_before = tensor->getDimensions();
-    VLOG(-1) << "TENSOR before: " << dim_before.d[0] << ", " << dim_before.d[1]
-             << dim_before.d[2] << ", " << dim_before.d[3];
+    VLOG(2) << "TENSOR before: " << dim_before.d[0] << ", " << dim_before.d[1]
+            << dim_before.d[2] << ", " << dim_before.d[3];
     auto padLayer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
@@ -918,8 +918,8 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
     padding = {{0, 0}, {0, 0}};
     tensor = padLayer->getOutput(0);
     auto dim_after = tensor->getDimensions();
-    VLOG(-1) << "TENSOR after: " << dim_after.d[0] << ", " << dim_after.d[1]
-             << dim_after.d[2] << ", " << dim_after.d[3];
+    VLOG(2) << "TENSOR after: " << dim_after.d[0] << ", " << dim_after.d[1]
+            << dim_after.d[2] << ", " << dim_after.d[3];
   }
 
   nvinfer1::IConvolutionLayer* layer =
@@ -932,14 +932,14 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
   auto dim_after = output_tensor->getDimensions();
-  VLOG(-1) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1]
-           << dim_after.d[2] << ", " << dim_after.d[3];
+  VLOG(2) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1]
+          << dim_after.d[2] << ", " << dim_after.d[3];
 
   if (data_format == "NHWC") {
     // TODO(jie): transpose it back!
     output_tensor = ctx.transposeTensor(output_tensor, {0, 2, 3, 1});
   } else {
-    VLOG(-1) << "NCHW !!!!";
+    VLOG(2) << "NCHW !!!!";
   }
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
@@ -961,7 +961,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
     tensor = ctx.transposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
                                  {0, 3, 1, 2});
   } else {
-    VLOG(-1) << "NCHW !!!!";
+    VLOG(2) << "NCHW !!!!";
   }
   nvinfer1::PoolingType type;
   // TODO(jie): support other pooling type
@@ -989,7 +989,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
         {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
   } else if (attrs.get<std::string>("padding") == "VALID") {
     // No padding for valid padding here
-    VLOG(-1) << "no padding added for VALID padding in pool" << node_def.name();
+    VLOG(2) << "no padding added for VALID padding in pool" << node_def.name();
     padding = {{0, 0}, {0, 0}};
   } else {
     return tensorflow::errors::Unimplemented(
@@ -999,8 +999,8 @@ tensorflow::Status ConvertPool(Converter& ctx,
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
-    VLOG(-1) << "padding!!!: " << padding[0].first << padding[0].second
-             << padding[1].first << padding[1].second;
+    VLOG(2) << "padding!!!: " << padding[0].first << padding[0].second
+            << padding[1].first << padding[1].second;
     auto padLayer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
@@ -1021,7 +1021,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
     // TODO(jie): transpose it back!
     output_tensor = ctx.transposeTensor(output_tensor, {0, 2, 3, 1});
   } else {
-    VLOG(-1) << "NCHW !!!!";
+    VLOG(2) << "NCHW !!!!";
   }
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
@@ -1063,7 +1063,7 @@ tensorflow::Status ConvertScale(Converter& ctx,
                                  {0, 3, 1, 2});
     // TODO(jie): transpose it
   } else {
-    VLOG(-1) << "NCHW !!!!";
+    VLOG(2) << "NCHW !!!!";
   }
   nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
       *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ScaleMode::kCHANNEL,
@@ -1074,7 +1074,7 @@ tensorflow::Status ConvertScale(Converter& ctx,
     // TODO(jie): transpose it back!
     output_tensor = ctx.transposeTensor(output_tensor, {0, 2, 3, 1});
   } else {
-    VLOG(-1) << "NCHW !!!!";
+    VLOG(2) << "NCHW !!!!";
   }
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
@@ -1099,14 +1099,14 @@ tensorflow::Status ConvertConst(Converter& ctx,
 
   TRT_ShapedWeights weights(dtype);
   if (!weights_tensor.float_val().empty()) {
-    VLOG(-1) << "SCALAR!!!" << node_def.name();
+    VLOG(2) << "SCALAR!!!" << node_def.name();
     nvinfer1::Dims scalar_shape;
     if (tensor.dims() > 0) {
-      VLOG(-1) << "dimensions: " << tensor.dims();
+      VLOG(2) << "dimensions: " << tensor.dims();
       weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
                                   get_tensor_shape(tensor));
     } else {
-      VLOG(-1) << "dimensions: " << tensor.dims();
+      VLOG(2) << "dimensions: " << tensor.dims();
       scalar_shape.nbDims = 1;
       scalar_shape.d[0] = 1;
       scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
@@ -1118,7 +1118,7 @@ tensorflow::Status ConvertConst(Converter& ctx,
                                   scalar_shape);
     }
   } else if (!weights_tensor.tensor_content().empty()) {
-    VLOG(-1) << "TENSOR!!!" << node_def.name();
+    VLOG(2) << "TENSOR!!!" << node_def.name();
     weights = TRT_ShapedWeights(dtype, weights_tensor.tensor_content().data(),
                                 get_tensor_shape(tensor));
   } else {
@@ -1403,7 +1403,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids,
     const std::vector<std::pair<int, int>>& input_inds,
     const std::vector<std::pair<int, int>>& output_inds, size_t max_batch_size,
-    size_t max_workspace_size,
+    size_t max_workspace_size_bytes,
     const tensorflow::grappler::GraphProperties& graph_properties,
     tensorflow::NodeDef* trt_node) {
   // Visit nodes in reverse topological order and construct the TRT network.
@@ -1466,18 +1466,18 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
     TF_CHECK_OK(convert_dtype(tf_dtype, &dtype));
 
-    VLOG(-1) << "accessing output index of: " << std::to_string(output_idx)
-             << ", at node: " << node_name
-             << "with output entry from shape_map: "
-             << std::to_string(op_info_vec.size());
+    VLOG(2) << "accessing output index of: " << std::to_string(output_idx)
+            << ", at node: " << node_name
+            << "with output entry from shape_map: "
+            << std::to_string(op_info_vec.size());
 
     // TODO(ben,jie): update TRT input format/dimension
     nvinfer1::DimsCHW input_dim_psuedo_chw;
     for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
 
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
-      VLOG(-1) << "dimension: " << i
-               << " , size: " << op_info.shape().dim(i).size();
+      VLOG(2) << "dimension: " << i
+              << " , size: " << op_info.shape().dim(i).size();
       input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
     }
 
@@ -1492,23 +1492,22 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     if (!input_tensor)
       return tensorflow::errors::InvalidArgument(
           "Failed to create Input layer");
-    VLOG(-1) << "input tensor name :" << input_tensor_name;
+    VLOG(2) << "input tensor name :" << input_tensor_name;
 
     if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
       return tensorflow::errors::AlreadyExists(
           "output tensor already exists for op: " + input_tensor_name);
   }
 
-  VLOG(-1) << "finished sorting";
+  VLOG(2) << "finished sorting";
 
   for (const tensorflow::Node* node : order) {
     tensorflow::NodeDef const& node_def = node->def();
-    VLOG(-1) << "converting node: " << node_def.name() << " , "
-             << node_def.op();
+    VLOG(2) << "converting node: " << node_def.name() << " , " << node_def.op();
     TF_RETURN_IF_ERROR(converter.convert_node(node_def));
   }
 
-  VLOG(-1) << "finished conversion";
+  VLOG(2) << "finished conversion";
 
   // Gather output metadata
   std::vector<std::string> output_names;
@@ -1521,7 +1520,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     std::string tensor_name = op_name;
     if (output_idx != 0)
       tensor_name = tensor_name + ":" + std::to_string(output_idx);
-    VLOG(-1) << "output tensor name: " << tensor_name;
+    VLOG(2) << "output tensor name: " << tensor_name;
     output_names.push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
@@ -1541,28 +1540,28 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     tensor->setType(trt_dtype);
   }
 
-  VLOG(-1) << "finished output";
+  VLOG(2) << "finished output";
   static int static_id = 0;
 
   // Build the engine
   trt_builder->setMaxBatchSize(max_batch_size);
-  trt_builder->setMaxWorkspaceSize(max_workspace_size);
-  LOG(INFO) << "starting build engine "<<static_id;
+  trt_builder->setMaxWorkspaceSize(max_workspace_size_bytes);
+  VLOG(0) << "starting build engine " << static_id;
   // TODO(ben,jie): half2 and int8 mode support
   std::string engine_plan_string;
   {
     auto trt_engine =
         infer_object(trt_builder->buildCudaEngine(*converter.network()));
-    LOG(INFO) << "built network";
+    VLOG(0) << "built network";
     auto engine_plan = infer_object(trt_engine->serialize());
-    VLOG(INFO) << "serialized engine";
+    VLOG(0) << "serialized engine";
     const char* engine_plan_data =
         static_cast<const char*>(engine_plan->data());
     engine_plan_string = std::move(
         std::string(engine_plan_data, engine_plan_data + engine_plan->size()));
   }
 
-  VLOG(INFO) << "finished engine";
+  VLOG(0) << "finished engine";
 
   // Build the TRT op
   // TODO(sami,ben,jie): proper naming!
@@ -1581,7 +1580,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
       income_edges);
   op_builder.Input(input_list);
 
-  VLOG(INFO) << "finished op preparation";
+  VLOG(0) << "finished op preparation";
 
   auto status = op_builder.Attr("serialized_engine", engine_plan_string)
                     .Attr("input_nodes", input_names)
@@ -1589,8 +1588,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
                     .Attr("OutT", output_dtypes)
                     .Finalize(trt_node);
 
-  VLOG(INFO) << status.ToString();
-  VLOG(INFO) << "finished op building";
+  VLOG(0) << status.ToString() << " finished op building";
 
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 2e7fd19566..82b12b74ea 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -38,7 +38,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         input_inds,  // {node_id, output_idx}
     const std::vector<std::pair<int, int>>&
         output_inds,  // {node_id, output_idx}
-    size_t max_batch_size, size_t max_workspace_size_bytes,
+    size_t max_batch_size, size_t max_workspace_size_bytes_bytes,
     const tensorflow::grappler::GraphProperties& graph_prop,
     tensorflow::NodeDef* trt_node);
 
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index 5131c80794..7add8cb8b3 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -27,7 +27,7 @@ void Logger::log(Severity severity, const char* msg) {
   // Suppress info-level messages
   switch (severity) {
     case Severity::kINFO: {  // Mark TRT info messages as debug!
-      VLOG(-1) << msg;
+      VLOG(2) << msg;
       break;
     }
     case Severity::kWARNING: {
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 6bdc20ed04..b17b07e296 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -35,7 +35,7 @@ from tensorflow.python.framework import ops
 def CreateInferenceGraph(input_graph_def,
                          outputs,
                          max_batch_size=1,
-                         max_workspace_size=2 << 20):
+                         max_workspace_size_bytes=2 << 20):
   """Python wrapper for the TRT transormation.
 
 
@@ -43,7 +43,7 @@ def CreateInferenceGraph(input_graph_def,
     input_graph_def: GraphDef object containing a model to be transformed.
     outputs: List of tensors or node names for the model outputs.
     max_batch_size: max size for the input batch
-    max_workspace_size: parameter to control memory allocation (in Bytes)
+    max_workspace_size_bytes: parameter to control memory allocation (in Bytes)
 
   Returns:
     New GraphDef with TRTEngineOps placed in graph replacing subgraphs.
@@ -64,7 +64,7 @@ def CreateInferenceGraph(input_graph_def,
   # pair or strings where first one is encoded status and the second
   # one is the transformed graphs protobuf string.
   out = trt_convert(input_graph_def_str, outputs, max_batch_size,
-                    max_workspace_size)
+                    max_workspace_size_bytes)
   status = out[0]
   output_graph_def_string = out[1]
   del input_graph_def_str  #save some memory
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index a7c7e5bc9f..828b4b35c2 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -40,7 +40,7 @@ std::pair<string, string> trt_convert(
     string graph_def_string,  // The serialized GraphDef string.
     std::vector<string> output_names,
     size_t max_batch_size,
-    size_t max_workspace_size
+    size_t max_workspace_size_bytes
     // Unfortunately we can't use TF_Status here since it
     // is in c/c_api and brings in a lot of other libraries
     // which in turn declare ops. These ops are included
@@ -68,7 +68,7 @@ std::pair<string, string> trt_convert(
   tensorflow::GraphDef outGraph;
   tensorflow::Status conversion_status =
       tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(
-          graph_def, output_names, max_batch_size, max_workspace_size,
+          graph_def, output_names, max_batch_size, max_workspace_size_bytes,
           &outGraph);
   if (!conversion_status.ok()) {
     auto retCode = (int)conversion_status.code();
@@ -95,6 +95,6 @@ std::pair<string, string> trt_convert(
 std::pair<string, string> trt_convert(string graph_def_string,
                                       std::vector<string> output_names,
                                       size_t max_batch_size,
-                                      size_t max_workspace_size);
+                                      size_t max_workspace_size_bytes);
 
 %unignoreall
-- 
GitLab


From dfb983c07237542264002ef90084d06a1ef9134c Mon Sep 17 00:00:00 2001
From: Pete Warden <pete@petewarden.com>
Date: Wed, 31 Jan 2018 11:11:58 -0800
Subject: [PATCH 438/966] Fixed iOS build script for all architectures, fixes
 #12904 (#16559)

---
 tensorflow/contrib/makefile/build_all_ios.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh
index a18df256f9..2d99791839 100755
--- a/tensorflow/contrib/makefile/build_all_ios.sh
+++ b/tensorflow/contrib/makefile/build_all_ios.sh
@@ -96,7 +96,7 @@ if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
 
     if [[ -z "${BUILD_ARCH}" ]]; then
         # Compile protobuf for the target iOS device architectures.
-        tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${DEFAULT_ARCH}
+        tensorflow/contrib/makefile/compile_ios_protobuf.sh
     else
         # Compile protobuf for the target iOS device architectures.
         tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${BUILD_ARCH}
-- 
GitLab


From 6afe900f543e0005ce69b3152330f1b7b16cb286 Mon Sep 17 00:00:00 2001
From: yegord <yegor.derevenets@gmail.com>
Date: Thu, 1 Feb 2018 00:02:25 +0100
Subject: [PATCH 439/966] optimize_for_inference_lib.fold_batch_norms()
 preserves data_format (#16075)

Fixes https://github.com/tensorflow/tensorflow/issues/15034
---
 .../tools/optimize_for_inference_lib.py       |  1 +
 .../tools/optimize_for_inference_test.py      | 89 ++++++++++---------
 2 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py
index c2687bf557..9c19271222 100644
--- a/tensorflow/python/tools/optimize_for_inference_lib.py
+++ b/tensorflow/python/tools/optimize_for_inference_lib.py
@@ -349,6 +349,7 @@ def fold_batch_norms(input_graph_def):
     bias_add_op.op = "BiasAdd"
     bias_add_op.name = node.name
     bias_add_op.attr["T"].CopyFrom(conv_op.attr["T"])
+    bias_add_op.attr["data_format"].CopyFrom(conv_op.attr["data_format"])
     bias_add_op.input.extend([new_conv_op.name, offset_op.name])
     new_ops.extend([scaled_weights_op, new_conv_op, offset_op, bias_add_op])
 
diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py
index 7686bb0f14..2ef612473b 100644
--- a/tensorflow/python/tools/optimize_for_inference_test.py
+++ b/tensorflow/python/tools/optimize_for_inference_test.py
@@ -173,48 +173,53 @@ class OptimizeForInferenceTest(test.TestCase):
       self.assertNotEqual("BatchNormWithGlobalNormalization", node.op)
 
   def testFoldFusedBatchNorms(self):
-    with self.test_session() as sess:
-      inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
-      input_op = constant_op.constant(
-          np.array(inputs), shape=[1, 1, 6, 2], dtype=dtypes.float32)
-      weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
-      weights_op = constant_op.constant(
-          np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
-      conv_op = nn_ops.conv2d(
-          input_op, weights_op, [1, 1, 1, 1], padding="SAME", name="conv_op")
-      mean_op = constant_op.constant(
-          np.array([10, 20]), shape=[2], dtype=dtypes.float32)
-      variance_op = constant_op.constant(
-          np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32)
-      beta_op = constant_op.constant(
-          np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
-      gamma_op = constant_op.constant(
-          np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
-      ops.get_default_graph().graph_def_versions.producer = 9
-      gen_nn_ops._fused_batch_norm(
-          conv_op,
-          gamma_op,
-          beta_op,
-          mean_op,
-          variance_op,
-          0.00001,
-          is_training=False,
-          name="output")
-      original_graph_def = sess.graph_def
-      original_result = sess.run(["output:0"])
-    optimized_graph_def = optimize_for_inference_lib.fold_batch_norms(
-        original_graph_def)
-
-    with self.test_session() as sess:
-      _ = importer.import_graph_def(
-          optimized_graph_def, input_map={}, name="optimized")
-      optimized_result = sess.run(["optimized/output:0"])
-
-    self.assertAllClose(
-        original_result, optimized_result, rtol=1e-04, atol=1e-06)
-
-    for node in optimized_graph_def.node:
-      self.assertNotEqual("FusedBatchNorm", node.op)
+    for data_format, use_gpu in [("NHWC", False), ("NCHW", True)]:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
+        input_op = constant_op.constant(
+            np.array(inputs),
+            shape=[1, 1, 6, 2] if data_format == "NHWC" else [1, 2, 1, 6],
+            dtype=dtypes.float32)
+        weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
+        weights_op = constant_op.constant(
+            np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
+        conv_op = nn_ops.conv2d(
+            input_op, weights_op, [1, 1, 1, 1], padding="SAME",
+            data_format=data_format, name="conv_op")
+        mean_op = constant_op.constant(
+            np.array([10, 20]), shape=[2], dtype=dtypes.float32)
+        variance_op = constant_op.constant(
+            np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32)
+        beta_op = constant_op.constant(
+            np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
+        gamma_op = constant_op.constant(
+            np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
+        ops.get_default_graph().graph_def_versions.producer = 9
+        gen_nn_ops._fused_batch_norm(
+            conv_op,
+            gamma_op,
+            beta_op,
+            mean_op,
+            variance_op,
+            0.00001,
+            is_training=False,
+            data_format=data_format,
+            name="output")
+        original_graph_def = sess.graph_def
+        original_result = sess.run(["output:0"])
+      optimized_graph_def = optimize_for_inference_lib.fold_batch_norms(
+          original_graph_def)
+
+      with self.test_session(use_gpu=use_gpu) as sess:
+        _ = importer.import_graph_def(
+            optimized_graph_def, input_map={}, name="optimized")
+        optimized_result = sess.run(["optimized/output:0"])
+
+      self.assertAllClose(
+          original_result, optimized_result, rtol=1e-04, atol=1e-06)
+
+      for node in optimized_graph_def.node:
+        self.assertNotEqual("FusedBatchNorm", node.op)
 
   def testFuseResizePadAndConv(self):
     with self.test_session() as sess:
-- 
GitLab


From 1ec4b3568ec7d387ba0d672562cb6d6898b17311 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Tue, 30 Jan 2018 17:03:49 -0800
Subject: [PATCH 440/966] Update graph_matcher to include OneofPattern.

PiperOrigin-RevId: 183916547
---
 .../contrib/quantize/python/graph_matcher.py  | 111 +++++++++++++-----
 .../quantize/python/graph_matcher_test.py     |  40 ++++++-
 2 files changed, 120 insertions(+), 31 deletions(-)

diff --git a/tensorflow/contrib/quantize/python/graph_matcher.py b/tensorflow/contrib/quantize/python/graph_matcher.py
index e3581cc559..b458f039df 100644
--- a/tensorflow/contrib/quantize/python/graph_matcher.py
+++ b/tensorflow/contrib/quantize/python/graph_matcher.py
@@ -18,8 +18,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 
-class OpTypePattern(object):
+
+class Pattern(object):
+  """The parent class of all patterns (e.g. OpTypePattern and OneofPattern)."""
+
+  @abc.abstractmethod
+  def match(self, op, tensor):
+    """Returns the result of matching op/tensor against this pattern."""
+    raise NotImplementedError('Method "match" not implemented.')
+
+
+class OpTypePattern(Pattern):
   """A tree pattern that matches TF expressions with certain op types."""
 
   def __init__(self, op_type, name=None, inputs=None):
@@ -34,7 +45,7 @@ class OpTypePattern(object):
         similar TF op types.
       name: Optional string. The name of the pattern that can be looked up in
         MatchResult.
-      inputs: Optional list of `OpTypePattern`s or strings that specify the
+      inputs: Optional list of `Pattern`s or strings that specify the
         patterns for the inputs of a matching op. If None, this pattern accepts
         any inputs of a matching op.
     """
@@ -43,22 +54,51 @@ class OpTypePattern(object):
     if inputs is None:
       inputs = []
     self._inputs = [
-        input_pattern if isinstance(input_pattern, OpTypePattern) else
-        OpTypePattern(input_pattern) for input_pattern in inputs
+        input_pattern
+        if isinstance(input_pattern, Pattern) else OpTypePattern(input_pattern)
+        for input_pattern in inputs
     ]
 
-  @property
-  def op_type(self):
-    return self._op_type
-
-  @property
-  def inputs(self):
-    return self._inputs
-
   @property
   def name(self):
     return self._name
 
+  def match(self, op, tensor):
+    if self._op_type != '*':
+      if op.type not in self._op_type.split('|'):
+        return None
+
+    match_result = MatchResult()
+    match_result.add(self, op, tensor)
+
+    if not self._inputs:
+      # If pattern.inputs is empty, skips the rest and accepts all the inputs.
+      return match_result
+
+    if len(op.inputs) != len(self._inputs):
+      return None
+
+    for input_tensor, input_pattern in zip(op.inputs, self._inputs):
+      input_match_result = input_pattern.match(input_tensor.op, input_tensor)
+      if input_match_result is None:
+        return None
+      match_result.merge_from(input_match_result)
+    return match_result
+
+
+class OneofPattern(Pattern):
+  """Matches one of the given sub-patterns."""
+
+  def __init__(self, sub_patterns):
+    self._sub_patterns = sub_patterns
+
+  def match(self, op, tensor):
+    for sub_pattern in self._sub_patterns:
+      match_result = sub_pattern.match(op, tensor)
+      if match_result is not None:
+        return match_result
+    return None
+
 
 class MatchResult(object):
   r"""Encapsulates the result of a match done by GraphMatcher.
@@ -102,16 +142,36 @@ class MatchResult(object):
       return pattern_or_name
 
     if isinstance(pattern_or_name, str):
+      if pattern_or_name not in self._name_to_pattern:
+        return None
       return self._name_to_pattern[pattern_or_name]
 
     raise ValueError('pattern_or_name has type %s. Expect OpTypePattern or str.'
                      % type(pattern_or_name))
 
+  def _get_op_tensor(self, pattern_or_name):
+    pattern = self._to_pattern(pattern_or_name)
+    if pattern is None:
+      return None
+
+    if pattern not in self._pattern_to_op_tensor:
+      return None
+
+    return self._pattern_to_op_tensor[pattern]
+
   def get_op(self, pattern_or_name):
-    return self._pattern_to_op_tensor[self._to_pattern(pattern_or_name)][0]
+    op_tensor = self._get_op_tensor(pattern_or_name)
+    return op_tensor[0] if op_tensor else None
 
   def get_tensor(self, pattern_or_name):
-    return self._pattern_to_op_tensor[self._to_pattern(pattern_or_name)][1]
+    op_tensor = self._get_op_tensor(pattern_or_name)
+    return op_tensor[1] if op_tensor else None
+
+  def merge_from(self, other_match_result):
+    # pylint: disable=protected-access
+    self._pattern_to_op_tensor.update(other_match_result._pattern_to_op_tensor)
+    self._name_to_pattern.update(other_match_result._name_to_pattern)
+    # pylint: enable=protected-access
 
 
 class GraphMatcher(object):
@@ -121,7 +181,7 @@ class GraphMatcher(object):
     """Initializes a GraphMatcher.
 
     Args:
-      pattern: The `OpTypePattern` against which `GraphMatcher` matches
+      pattern: The `Pattern` against which `GraphMatcher` matches
         subgraphs.
     """
     self._pattern = pattern
@@ -133,7 +193,7 @@ class GraphMatcher(object):
     with key `pattern`.
 
     Args:
-      pattern: An `OpTypePattern`.
+      pattern: An `Pattern`.
       op: A `tf.Operation` to match against the pattern.
       tensor: the output `tf.Tensor` of `op` that is used by the matching op of
         `pattern`'s parent. Can be None if `pattern` is already the root of the
@@ -142,20 +202,11 @@ class GraphMatcher(object):
     Returns:
       True if an TF expression rooted at `op` matches `pattern`.
     """
-    if pattern.op_type != '*':
-      if op.type not in pattern.op_type.split('|'):
-        return False
-
-    self._match_result.add(pattern, op, tensor)
-
-    if not pattern.inputs:
-      # If pattern.inputs is empty, skips the rest and accepts all the inputs.
-      return True
-
-    return len(op.inputs) == len(pattern.inputs) and all([
-        self._match_pattern(input_pattern, input_tensor.op, input_tensor)
-        for input_tensor, input_pattern in zip(op.inputs, pattern.inputs)
-    ])
+    match_result = pattern.match(op, tensor)
+    if match_result is None:
+      return False
+    self._match_result.merge_from(match_result)
+    return True
 
   def match_op(self, op):
     """Matches `op` against `self._pattern`.
diff --git a/tensorflow/contrib/quantize/python/graph_matcher_test.py b/tensorflow/contrib/quantize/python/graph_matcher_test.py
index e1572865e4..6d58757218 100644
--- a/tensorflow/contrib/quantize/python/graph_matcher_test.py
+++ b/tensorflow/contrib/quantize/python/graph_matcher_test.py
@@ -105,7 +105,7 @@ class GraphMatcherTest(test_util.TensorFlowTestCase):
     self.assertEqual(match_result.get_op(y1_pattern), y1.op)
     self.assertEqual(match_result.get_tensor(y1_pattern), y1)
 
-  def test_oneof_pattern(self):
+  def test_oneof_type_pattern(self):
     #   -   +
     #  / \ / \
     # x   y   z
@@ -125,6 +125,44 @@ class GraphMatcherTest(test_util.TensorFlowTestCase):
         for match_result in matcher.match_graph(g)
     ], [plus.op, minus.op])
 
+  def test_oneof_pattern(self):
+    reshape_pattern = graph_matcher.OpTypePattern('Reshape')
+    transpose_pattern = graph_matcher.OneofPattern([
+        graph_matcher.OpTypePattern(
+            'Transpose',
+            name='transpose',
+            inputs=[
+                graph_matcher.OpTypePattern(
+                    'Slice', name='slice', inputs=[reshape_pattern, '*', '*']),
+                '*'
+            ]),
+        graph_matcher.OpTypePattern(
+            'Transpose', name='transpose', inputs=[reshape_pattern, '*'])
+    ])
+
+    matcher = graph_matcher.GraphMatcher(transpose_pattern)
+
+    g = ops.Graph()
+    with g.as_default():
+      inputs = array_ops.placeholder(dtypes.float32, shape=[6])
+      reshape = array_ops.reshape(inputs, [2, 3])
+      transpose = array_ops.transpose(reshape)
+      [match_result] = list(matcher.match_graph(g))
+      self.assertEqual(match_result.get_tensor(reshape_pattern), reshape)
+      self.assertEqual(match_result.get_tensor('slice'), None)
+      self.assertEqual(match_result.get_op('transpose'), transpose.op)
+
+    g = ops.Graph()
+    with g.as_default():
+      inputs = array_ops.placeholder(dtypes.float32, shape=[6])
+      reshape = array_ops.reshape(inputs, [2, 3])
+      slicing = array_ops.slice(reshape, [0, 0], [-1, -1])
+      transpose = array_ops.transpose(slicing)
+      [match_result] = list(matcher.match_graph(g))
+      self.assertEqual(match_result.get_tensor(reshape_pattern), reshape)
+      self.assertEqual(match_result.get_tensor('slice'), slicing)
+      self.assertEqual(match_result.get_op('transpose'), transpose.op)
+
 
 if __name__ == '__main__':
   googletest.main()
-- 
GitLab


From 9d91cc94bbcc4f4e27167edc419579af4b9be0c8 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Tue, 30 Jan 2018 17:28:42 -0800
Subject: [PATCH 441/966] Give tf.contrib.summary a default suffix of .v2. This
 is so that it doesn't conflict with v1 summaries that are being created at
 the same time.

By default, MonitoredTrainingSession and Estimator will create v1 summaries, so using both currently creates a race condition, and the last one to create the event file writer wins.

PiperOrigin-RevId: 183919988
---
 tensorflow/contrib/summary/summary_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py
index ee661dfdc1..a6968d8b2a 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/contrib/summary/summary_ops.py
@@ -202,7 +202,7 @@ def create_file_writer(logdir,
     if flush_millis is None:
       flush_millis = constant_op.constant(2 * 60 * 1000)
     if filename_suffix is None:
-      filename_suffix = constant_op.constant("")
+      filename_suffix = constant_op.constant(".v2")
     return _make_summary_writer(
         name,
         gen_summary_ops.create_summary_file_writer,
-- 
GitLab


From f6d3f0a5506ff221ba52c4f800d57c8d6b47643d Mon Sep 17 00:00:00 2001
From: Max Galkin <maxgalkin@google.com>
Date: Tue, 30 Jan 2018 17:41:00 -0800
Subject: [PATCH 442/966] Add a CLIF wrapper for DeviceProperties.

PiperOrigin-RevId: 183921389
---
 tensorflow/core/BUILD | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 7a0c0ea3fe..1fcdfa73ad 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1351,6 +1351,13 @@ tf_pyclif_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_pyclif_proto_library(
+    name = "protobuf/device_properties_pyclif",
+    proto_lib = ":protos_all_cc",
+    proto_srcfile = "protobuf/device_properties.proto",
+    visibility = ["//visibility:public"],
+)
+
 # -----------------------------------------------------------------------------
 # Internal targets
 
-- 
GitLab


From 4c2ee0464a7c50afa1db3973215b2c7df8ade0e4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 17:58:28 -0800
Subject: [PATCH 443/966] K-FAC: Wrap extract_image_patches() for compatibility
 with XLA.

PiperOrigin-RevId: 183923073
---
 .../contrib/kfac/python/ops/fisher_factors.py | 63 ++++++++++++++++---
 1 file changed, 55 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
index f59168cbc0..bcba18ae14 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -111,6 +112,54 @@ def diagonal_covariance_initializer(shape, dtype, partition_info):  # pylint: di
   return array_ops.ones(shape, dtype)
 
 
+def extract_image_patches(image, ksizes, strides, padding, name=None):
+  """Extracts image patches for an N-dimensional convolution.
+
+  This function is a compatibility wrapper over tf.extract_image_patches(), as
+  ExtractImagePatches isn't yet implemented in XLA.
+
+  Args:
+    image: Tensor of shape [batch, in_x, in_y, ..., in_channels]. Input images.
+      All dimensions except 'batch' must be defined.
+    ksizes: [filter_x, filter_y, ...]. Spatial shape of filter in each
+      dimension.
+    strides: [stride_x, stride_y, ...]. Spatial stride for filter in each
+      dimension.
+    padding: str. "VALID" or "SAME".
+    name: str or None. name of Op.
+
+  Returns:
+    result: [batch, out_x, out_y, ..., filter_x, filter_y, ..., in_channels].
+      Contains image patches to which conv kernel would be applied for each
+      output location. [out_x, out_y, ...] depends on padding.
+  """
+  if not utils.on_tpu():
+    return array_ops.extract_image_patches(
+        image,
+        ksizes=([1] + list(ksizes) + [1]),
+        strides=([1] + list(strides) + [1]),
+        rates=[1, 1, 1, 1],
+        padding=padding,
+        name=name)
+
+  with tf_ops.name_scope(name, "extract_image_patches",
+                         [image, ksizes, strides, padding]):
+    batch = image.shape.as_list()[0]
+    in_channels = image.shape.as_list()[-1]
+
+    # Map each input feature to a location in the output.
+    out_channels = np.prod(ksizes) * in_channels
+    filters = linalg_ops.eye(out_channels),
+    filters = array_ops.reshape(filters, ksizes + [in_channels, out_channels])
+
+    result = nn.convolution(image, filters, padding, strides=strides)
+    out_spatial = result.shape.as_list()[1:-1]
+    result = array_ops.reshape(
+        result, [batch or -1] + out_spatial + ksizes + [in_channels])
+
+    return result
+
+
 def compute_cov(tensor, tensor_right=None, normalizer=None):
   """Compute the empirical second moment of the rows of a 2D Tensor.
 
@@ -668,11 +717,10 @@ class ConvDiagonalFactor(DiagonalFactor):
 
       # TODO(b/64144716): there is potential here for a big savings in terms
       # of memory use.
-      patches = array_ops.extract_image_patches(
+      patches = extract_image_patches(
           self._inputs,
-          ksizes=[1, filter_height, filter_width, 1],
-          strides=self._strides,
-          rates=[1, 1, 1, 1],
+          ksizes=[filter_height, filter_width],
+          strides=self._strides[1:-1],
           padding=self._padding)
 
       if self._has_bias:
@@ -816,11 +864,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
 
       # TODO(b/64144716): there is potential here for a big savings in terms of
       # memory use.
-      patches = array_ops.extract_image_patches(
+      patches = extract_image_patches(
           self._inputs,
-          ksizes=[1, filter_height, filter_width, 1],
-          strides=self._strides,
-          rates=[1, 1, 1, 1],
+          ksizes=[filter_height, filter_width],
+          strides=self._strides[1:-1],
           padding=self._padding)
 
       flatten_size = (filter_height * filter_width * in_channels)
-- 
GitLab


From a667b5da18f28c50857adb211de15a883f85fd65 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 18:03:31 -0800
Subject: [PATCH 444/966] Removed unused variable doReverseChannels.

PiperOrigin-RevId: 183923876
---
 .../examples/ios/camera/CameraExampleViewController.mm | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
index 10f31bb6f1..d74e275f04 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -225,14 +225,8 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const
   assert(pixelBuffer != NULL);
 
   OSType sourcePixelFormat = CVPixelBufferGetPixelFormatType(pixelBuffer);
-  int doReverseChannels;
-  if (kCVPixelFormatType_32ARGB == sourcePixelFormat) {
-    doReverseChannels = 1;
-  } else if (kCVPixelFormatType_32BGRA == sourcePixelFormat) {
-    doReverseChannels = 0;
-  } else {
-    assert(false);  // Unknown source format
-  }
+  assert(sourcePixelFormat == kCVPixelFormatType_32ARGB ||
+         sourcePixelFormat == kCVPixelFormatType_32BGRA);
 
   const int sourceRowBytes = (int)CVPixelBufferGetBytesPerRow(pixelBuffer);
   const int image_width = (int)CVPixelBufferGetWidth(pixelBuffer);
-- 
GitLab


From 7788bcaf4b48b22eaadee9a9cc1b0aff04cc5907 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 18:08:52 -0800
Subject: [PATCH 445/966] internal change

PiperOrigin-RevId: 183924520
---
 .../tpu/profiler/capture_tpu_profile.cc       | 28 +++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 6a05a2abf6..9ac9e3cb28 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/contrib/tpu/profiler/version.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
@@ -47,6 +48,19 @@ string GetCurrentTimeStampAsString() {
   return s;
 }
 
+Status ValidateHostPortPair(const string& host_port) {
+  uint32 port;
+  std::vector<string> parts = str_util::Split(host_port, ':');
+  // Must be host:port, port must be a number, host must not contain a '/',
+  // host also must not be empty.
+  if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) ||
+      parts[0].find("/") != string::npos || parts[0].empty()) {
+    return errors::InvalidArgument("Could not interpret \"", host_port,
+                                   "\" as a host-port pair.");
+  }
+  return Status::OK();
+}
+
 ProfileResponse Profile(const string& service_addr, int duration_ms,
                         const ProfileOptions& opts) {
   ProfileRequest request;
@@ -60,11 +74,14 @@ ProfileResponse Profile(const string& service_addr, int duration_ms,
   ::grpc::ClientContext context;
   ::grpc::ChannelArguments channel_args;
   // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available.
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
   channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
                       std::numeric_limits<int32>::max());
   std::unique_ptr<TPUProfiler::Stub> stub =
       TPUProfiler::NewStub(::grpc::CreateCustomChannel(
-          service_addr, ::grpc::InsecureChannelCredentials(), channel_args));
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
   ProfileResponse response;
   TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response)));
   return response;
@@ -101,7 +118,14 @@ int main(int argc, char** argv) {
   tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_ok || FLAGS_service_addr.empty() || FLAGS_logdir.empty()) {
-    std::printf("%s", usage.c_str());
+    std::cout << usage.c_str() << std::endl;
+    return 2;
+  }
+  tensorflow::Status status =
+      tensorflow::tpu::ValidateHostPortPair(FLAGS_service_addr);
+  if (!status.ok()) {
+    std::cout << status.error_message() << std::endl;
+    std::cout << usage.c_str() << std::endl;
     return 2;
   }
   tensorflow::port::InitMain(argv[0], &argc, &argv);
-- 
GitLab


From 811785eec2d24743804d1128d1cce900aedad3d0 Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Tue, 30 Jan 2018 18:50:56 -0800
Subject: [PATCH 446/966] Eager support for tf.Keras.

PiperOrigin-RevId: 183928850
---
 tensorflow/python/keras/BUILD                 |  14 +
 .../python/keras/_impl/keras/backend.py       |  14 +-
 .../keras/_impl/keras/engine/topology.py      |   4 +-
 .../keras/_impl/keras/engine/training.py      | 225 ++++--
 .../_impl/keras/engine/training_eager.py      | 666 +++++++++++++++
 .../_impl/keras/engine/training_eager_test.py | 755 ++++++++++++++++++
 .../python/keras/_impl/keras/layers/core.py   |   4 +-
 .../keras/_impl/keras/layers/normalization.py |   3 +-
 .../python/keras/_impl/keras/optimizers.py    |  10 +-
 tensorflow/python/layers/base.py              |  10 +-
 tensorflow/python/layers/network.py           |  19 +-
 11 files changed, 1651 insertions(+), 73 deletions(-)
 create mode 100644 tensorflow/python/keras/_impl/keras/engine/training_eager.py
 create mode 100644 tensorflow/python/keras/_impl/keras/engine/training_eager_test.py

diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 6125755775..a9dd8d8e9d 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -39,6 +39,7 @@ py_library(
         "_impl/keras/engine/__init__.py",
         "_impl/keras/engine/topology.py",
         "_impl/keras/engine/training.py",
+        "_impl/keras/engine/training_eager.py",
         "_impl/keras/estimator.py",
         "_impl/keras/initializers.py",
         "_impl/keras/layers/__init__.py",
@@ -719,6 +720,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "training_eager_test",
+    size = "medium",
+    srcs = ["_impl/keras/engine/training_eager_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "topology_test",
     size = "small",
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 460c0dc5f3..098ea063f9 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -29,6 +29,7 @@ import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as session_module
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
@@ -326,7 +327,15 @@ def learning_phase():
 
   Returns:
       Learning phase (scalar integer tensor or Python integer).
+
+  Raises:
+      ValueError: If called when Eager execution is enabled.
   """
+  if context.in_eager_mode():
+    if 'eager' not in _GRAPH_LEARNING_PHASES:
+      raise ValueError('No learning phase set in Eager mode.')
+    return _GRAPH_LEARNING_PHASES['eager']
+
   graph = ops.get_default_graph()
   if graph not in _GRAPH_LEARNING_PHASES:
     phase = array_ops.placeholder_with_default(
@@ -347,7 +356,10 @@ def set_learning_phase(value):
   global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
   if value not in {0, 1}:
     raise ValueError('Expected learning phase to be ' '0 or 1.')
-  _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value
+  if context.in_eager_mode():
+    _GRAPH_LEARNING_PHASES['eager'] = value
+  else:
+    _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value
 
 
 def get_session():
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py
index 64aa868f38..8354a2b8fd 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology.py
@@ -708,8 +708,10 @@ class Network(tf_network.GraphNetwork, Layer):
       self.input_names.append(layer.name)
       if layer.is_placeholder:
         self._feed_input_names.append(layer.name)
-        self._feed_inputs.append(layer.input)
         self._feed_input_shapes.append(K.int_shape(self.inputs[i]))
+        # layer.input gives an error in eager mode
+        if context.in_graph_mode():
+          self._feed_inputs.append(layer.input)
     for layer in self._output_layers:
       self.output_names.append(layer.name)
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 6885ef5724..43d95b1f19 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -22,17 +22,21 @@ import copy
 
 import numpy as np
 
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import callbacks as cbks
 from tensorflow.python.keras._impl.keras import losses
 from tensorflow.python.keras._impl.keras import metrics as metrics_module
 from tensorflow.python.keras._impl.keras import optimizers
+from tensorflow.python.keras._impl.keras.engine import training_eager
 from tensorflow.python.keras._impl.keras.engine.topology import Network
 from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer
 from tensorflow.python.keras._impl.keras.utils.data_utils import OrderedEnqueuer
 from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
 from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import optimizer as tf_optimizer_module
 
 try:
   from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
@@ -621,9 +625,15 @@ class Model(Network):
             `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
     """
     loss = loss or {}
+    if context.in_eager_mode() and  not isinstance(
+        optimizer, tf_optimizer_module.Optimizer):
+      raise ValueError('Only TF native optimizers are supported in Eager mode.')
+
     self.optimizer = optimizers.get(optimizer)
     self.loss = loss
     self.loss_weights = loss_weights
+    if context.in_eager_mode() and sample_weight_mode is not None:
+      raise ValueError('sample_weight_mode is not supported in Eager mode.')
     self.sample_weight_mode = sample_weight_mode
 
     # Prepare loss functions.
@@ -654,6 +664,7 @@ class Model(Network):
       loss_function = losses.get(loss)
       loss_functions = [loss_function for _ in range(len(self.outputs))]
     self.loss_functions = loss_functions
+
     weighted_losses = [_weighted_masked_objective(fn) for fn in loss_functions]
     skip_target_indices = []
     skip_target_weighing_indices = []
@@ -667,11 +678,12 @@ class Model(Network):
         skip_target_weighing_indices.append(i)
 
     # Prepare output masks.
-    masks = self.compute_mask(self.inputs, mask=None)
-    if masks is None:
-      masks = [None for _ in self.outputs]
-    if not isinstance(masks, list):
-      masks = [masks]
+    if context.in_graph_mode():
+      masks = self.compute_mask(self.inputs, mask=None)
+      if masks is None:
+        masks = [None for _ in self.outputs]
+      if not isinstance(masks, list):
+        masks = [masks]
 
     # Prepare loss weights.
     if loss_weights is None:
@@ -697,6 +709,32 @@ class Model(Network):
     else:
       raise TypeError('Could not interpret loss_weights argument: ' +
                       str(loss_weights) + ' - expected a list of dicts.')
+    self.loss_weights_list = loss_weights_list
+
+    # initialization for Eager mode execution
+    if context.in_eager_mode():
+      if target_tensors is not None:
+        raise ValueError('target_tensors are not currently supported in Eager'
+                         'mode.')
+      self.total_loss = None
+      self.metrics = metrics
+      self.weighted_metrics = weighted_metrics
+      self.metrics_tensors = []
+      self.metrics_names = ['loss']
+      for i in range(len(self.outputs)):
+        if len(self.outputs) > 1:
+          self.metrics_names.append(self.output_names[i] + '_loss')
+      self.nested_metrics = _collect_metrics(metrics, self.output_names)
+      self._feed_sample_weight_modes = []
+      for i in range(len(self.outputs)):
+        self._feed_sample_weight_modes.append(None)
+      self.sample_weights = []
+      self.targets = []
+      self._collected_trainable_weights = self.trainable_weights
+      for i in range(len(self.outputs)):
+        self._feed_output_names.append(self.output_names[i])
+
+      return
 
     # Prepare targets of model.
     self.targets = []
@@ -723,6 +761,7 @@ class Model(Network):
       else:
         raise TypeError('Expected `target_tensors` to be '
                         'a list or dict, but got:', target_tensors)
+
     for i in range(len(self.outputs)):
       if i in skip_target_indices:
         self.targets.append(None)
@@ -772,7 +811,7 @@ class Model(Network):
             weight = K.placeholder(ndim=2, name=name + '_sample_weights')
             sample_weight_modes.append('temporal')
           else:
-            weight = K.placeholder(ndim=1, name=name + '_sample_weights')
+            weight = K.placeholder(ndim=1, name=name + 'sample_weights')
             sample_weight_modes.append(None)
         sample_weights.append(weight)
     elif isinstance(sample_weight_mode, list):
@@ -932,7 +971,7 @@ class Model(Network):
     self._feed_sample_weights = []
     for i in range(len(self.sample_weights)):
       if i not in skip_target_weighing_indices:
-        self._feed_sample_weights.append(sample_weights[i])
+        self._feed_sample_weights.append(self.sample_weights[i])
 
     # Functions for train, test and predict will
     # be compiled lazily when required.
@@ -981,6 +1020,7 @@ class Model(Network):
         with K.name_scope(self.optimizer.__class__.__name__):
           training_updates = self.optimizer.get_updates(
               params=self._collected_trainable_weights, loss=self.total_loss)
+
         updates = self.updates + training_updates
         # Gets loss and metrics. Updates weights at each call.
         self.train_function = K.function(
@@ -1159,6 +1199,7 @@ class Model(Network):
       callback_model = self
 
     callbacks.set_model(callback_model)
+
     callbacks.set_params({
         'batch_size': batch_size,
         'epochs': epochs,
@@ -1219,6 +1260,7 @@ class Model(Network):
           np.random.shuffle(index_array)
 
         batches = _make_batches(num_train_samples, batch_size)
+
         for batch_index, (batch_start, batch_end) in enumerate(batches):
           batch_ids = index_array[batch_start:batch_end]
           try:
@@ -1413,6 +1455,7 @@ class Model(Network):
           ins_batch[i] = ins_batch[i].toarray()
 
         batch_outs = f(ins_batch)
+
         if isinstance(batch_outs, list):
           if batch_index == 0:
             for batch_out in enumerate(batch_outs):
@@ -1423,7 +1466,6 @@ class Model(Network):
           if batch_index == 0:
             outs.append(0.)
           outs[0] += batch_outs * len(batch_ids)
-
         if verbose == 1:
           progbar.update(batch_end)
       for i in range(len(outs)):
@@ -1639,6 +1681,7 @@ class Model(Network):
         batch_size=batch_size)
     # Prepare validation data.
     do_validation = False
+    val_ins = []
     if validation_data:
       do_validation = True
       if len(validation_data) == 2:
@@ -1689,39 +1732,65 @@ class Model(Network):
       ins = x + y + sample_weights + [1.]
     else:
       ins = x + y + sample_weights
-    self._make_train_function()
-    f = self.train_function
 
     # Prepare display labels.
     out_labels = self._get_deduped_metrics_names()
 
-    if do_validation:
-      self._make_test_function()
-      val_f = self.test_function
-      callback_metrics = copy.copy(out_labels) + [
-          'val_' + n for n in out_labels
-      ]
+    if context.in_eager_mode():
+      if do_validation:
+        callback_metrics = copy.copy(out_labels) + [
+            'val_' + n for n in out_labels
+        ]
+      else:
+        callback_metrics = copy.copy(out_labels)
+
+      return training_eager.fit_loop(
+          self,
+          ins,
+          out_labels=out_labels,
+          batch_size=batch_size,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          val_ins=val_ins,
+          shuffle=shuffle,
+          callback_metrics=callback_metrics,
+          initial_epoch=initial_epoch,
+          steps_per_epoch=steps_per_epoch,
+          validation_steps=validation_steps)
     else:
-      callback_metrics = copy.copy(out_labels)
-      val_f = None
-      val_ins = []
-
-    # Delegate logic to `_fit_loop`.
-    return self._fit_loop(
-        f,
-        ins,
-        out_labels=out_labels,
-        batch_size=batch_size,
-        epochs=epochs,
-        verbose=verbose,
-        callbacks=callbacks,
-        val_f=val_f,
-        val_ins=val_ins,
-        shuffle=shuffle,
-        callback_metrics=callback_metrics,
-        initial_epoch=initial_epoch,
-        steps_per_epoch=steps_per_epoch,
-        validation_steps=validation_steps)
+      self._make_train_function()
+      f = self.train_function
+
+      if do_validation:
+        if context.in_graph_mode():
+          self._make_test_function()
+          val_f = self.test_function
+        else:
+          val_f = None
+        callback_metrics = copy.copy(out_labels) + [
+            'val_' + n for n in out_labels
+        ]
+      else:
+        val_f = None
+        callback_metrics = copy.copy(out_labels)
+
+      # Delegate logic to `_fit_loop`.
+      return self._fit_loop(
+          f,
+          ins,
+          out_labels=out_labels,
+          batch_size=batch_size,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          val_f=val_f,
+          val_ins=val_ins,
+          shuffle=shuffle,
+          callback_metrics=callback_metrics,
+          initial_epoch=initial_epoch,
+          steps_per_epoch=steps_per_epoch,
+          validation_steps=validation_steps)
 
   def evaluate(self,
                x=None,
@@ -1797,10 +1866,15 @@ class Model(Network):
       ins = x + y + sample_weights + [0.]
     else:
       ins = x + y + sample_weights
-    self._make_test_function()
-    f = self.test_function
-    return self._test_loop(
-        f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+
+    if context.in_eager_mode():
+      return training_eager.test_loop(
+          self, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+    else:
+      self._make_test_function()
+      f = self.test_function
+      return self._test_loop(
+          f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
 
   def predict(self, x, batch_size=None, verbose=0, steps=None):
     """Generates output predictions for the input samples.
@@ -1852,10 +1926,16 @@ class Model(Network):
       ins = x + [0.]
     else:
       ins = x
-    self._make_predict_function()
-    f = self.predict_function
-    return self._predict_loop(
-        f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+
+    if context.in_eager_mode():
+      return training_eager.predict_loop(
+          self, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+    else:
+      self._make_predict_function()
+      f = self.predict_function
+
+      return self._predict_loop(
+          f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
 
   def train_on_batch(self, x, y, sample_weight=None, class_weight=None):
     """Runs a single gradient update on a single batch of data.
@@ -1891,6 +1971,7 @@ class Model(Network):
         or list of scalars (if the model has multiple outputs
         and/or metrics). The attribute `model.metrics_names` will give you
         the display labels for the scalar outputs.
+
     """
     x, y, sample_weights = self._standardize_user_data(
         x,
@@ -1902,11 +1983,16 @@ class Model(Network):
       ins = x + y + sample_weights + [1.]
     else:
       ins = x + y + sample_weights
-    self._make_train_function()
-    outputs = self.train_function(ins)
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+
+    if context.in_eager_mode():
+      return training_eager.train_on_batch(self, ins)
+
+    if context.in_graph_mode():
+      self._make_train_function()
+      outputs = self.train_function(ins)
+      if len(outputs) == 1:
+        return outputs[0]
+      return outputs
 
   def test_on_batch(self, x, y, sample_weight=None):
     """Test the model on a single batch of samples.
@@ -1945,11 +2031,16 @@ class Model(Network):
       ins = x + y + sample_weights + [0.]
     else:
       ins = x + y + sample_weights
-    self._make_test_function()
-    outputs = self.test_function(ins)
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+
+    if context.in_eager_mode():
+      return training_eager.test_on_batch(self, ins)
+
+    if context.in_graph_mode():
+      self._make_test_function()
+      outputs = self.test_function(ins)
+      if len(outputs) == 1:
+        return outputs[0]
+      return outputs
 
   def predict_on_batch(self, x):
     """Returns predictions for a single batch of samples.
@@ -1959,6 +2050,7 @@ class Model(Network):
 
     Returns:
         Numpy array(s) of predictions.
+
     """
     x = _standardize_input_data(x, self._feed_input_names,
                                 self._feed_input_shapes)
@@ -1966,11 +2058,25 @@ class Model(Network):
       ins = x + [0.]
     else:
       ins = x
-    self._make_predict_function()
-    outputs = self.predict_function(ins)
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+
+    if context.in_eager_mode():
+      ins_batch_converted = []
+      for ib in ins:
+        ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+
+      eager_model_inputs = []
+      for i in range(len(self.inputs)):
+        eager_model_inputs.append(ins_batch_converted[i])
+
+      outs = self(eager_model_inputs)  # pylint: disable=not-callable
+      return outs
+
+    if context.in_graph_mode():
+      self._make_predict_function()
+      outputs = self.predict_function(ins)
+      if len(outputs) == 1:
+        return outputs[0]
+      return outputs
 
   def fit_generator(self,
                     generator,
@@ -2075,7 +2181,6 @@ class Model(Network):
         model.fit_generator(generate_arrays_from_file('/my_file.txt'),
                             steps_per_epoch=10000, epochs=10)
     ```
-
     Raises:
         ValueError: In case the generator yields
             data in an invalid format.
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
new file mode 100644
index 0000000000..0a115969ca
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -0,0 +1,666 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras training and evaluation routines.
+"""
+# pylint: disable=protected-access
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+from tensorflow.python.eager.backprop import GradientTape
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import callbacks as cbks
+from tensorflow.python.keras._impl.keras import losses
+from tensorflow.python.keras._impl.keras import metrics as metrics_module
+from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+
+
+def _make_batches(size, batch_size):
+  """Returns a list of batch indices (tuples of indices).
+
+  Arguments:
+      size: Integer, total size of the data to slice into batches.
+      batch_size: Integer, batch size.
+
+  Returns:
+      A list of tuples of array indices.
+  """
+  num_batches = int(np.ceil(size / float(batch_size)))
+  return [(i * batch_size, min(size, (i + 1) * batch_size))
+          for i in range(0, num_batches)]
+
+
+def _slice_arrays(arrays, start=None, stop=None):
+  """Slice an array or list of arrays.
+
+  This takes an array-like, or a list of
+  array-likes, and outputs:
+      - arrays[start:stop] if `arrays` is an array-like
+      - [x[start:stop] for x in arrays] if `arrays` is a list
+
+  Can also work on list/array of indices: `_slice_arrays(x, indices)`
+
+  Arguments:
+      arrays: Single array or list of arrays.
+      start: can be an integer index (start index)
+          or a list/array of indices
+      stop: integer (stop index); should be None if
+          `start` was a list.
+
+  Returns:
+      A slice of the array(s).
+
+  Raises:
+      ValueError: If the value of start is a list and stop is not None.
+  """
+  if arrays is None:
+    return [None]
+  if isinstance(start, list) and stop is not None:
+    raise ValueError('The stop argument has to be None if the value of start is'
+                     'a list.')
+  elif isinstance(arrays, list):
+    if hasattr(start, '__len__'):
+      # hdf5 datasets only support list objects as indices
+      if hasattr(start, 'shape'):
+        start = start.tolist()
+      return [None if x is None else x[start] for x in arrays]
+    else:
+      return [None if x is None else x[start:stop] for x in arrays]
+  else:
+    if hasattr(start, '__len__'):
+      if hasattr(start, 'shape'):
+        start = start.tolist()
+      return arrays[start]
+    elif hasattr(start, '__getitem__'):
+      return arrays[start:stop]
+    else:
+      return [None]
+
+
+def _get_metrics_info(metric, internal_output_shapes=None, loss_func=None):
+  if metric == 'accuracy' or metric == 'acc':
+    # custom handling of accuracy
+    # (because of class mode duality)
+    output_shape = internal_output_shapes
+    if output_shape[-1] == 1 or loss_func == losses.binary_crossentropy:
+      # case: binary accuracy
+      acc_fn = metrics_module.binary_accuracy
+    elif loss_func == losses.sparse_categorical_crossentropy:
+      # case: categorical accuracy with sparse targets
+      acc_fn = metrics_module.sparse_categorical_accuracy
+    else:
+      acc_fn = metrics_module.categorical_accuracy
+
+    metric_name = 'acc'
+    return metric_name, acc_fn
+  else:
+    metric_fn = metrics_module.get(metric)
+    metric_name = metric_fn.__name__
+    return metric_name, metric_fn
+
+
+def _eager_loss_fn(outputs, targets, loss_fn, output_name):
+  with K.name_scope(output_name + '_loss'):
+    loss = loss_fn(targets, outputs)
+  return loss
+
+
+def _eager_metrics_fn(model, outputs, targets):
+  """Calculates the metrics for each output of the given model.
+
+  Arguments:
+      model: The model on which metrics are being calculated.
+      outputs: The outputs of the given model.
+      targets: The predictions or targets of the given model.
+
+  Returns:
+      Returns the metric names and metric results for each output of the model.
+  """
+  metric_names = []
+  metric_results = []
+  if not isinstance(outputs, list):
+    outputs = [outputs]
+
+  if not isinstance(targets, list):
+    targets = [targets]
+
+  for i in range(len(model.outputs)):
+    output_metrics = model.nested_metrics[i]
+    for nested_output_metric in output_metrics:
+      metric_name, metric_fn = _get_metrics_info(
+          nested_output_metric, model._internal_output_shapes[i],
+          model.loss_functions[i])
+
+      if len(model.output_names) > 1:
+        metric_name = model.output_names[i] + '_' + metric_name
+        if metric_name not in model.metrics_names:
+          model.metrics_names.append(metric_name)
+
+      with K.name_scope(metric_name):
+        metric_result = metric_fn(outputs[i], targets[i])
+        metric_names.append(metric_name)
+        metric_results.append(K.mean(metric_result))
+
+  return metric_names, metric_results
+
+
+def _model_loss(model, inputs, targets):
+  """Calculates the loss for a given model.
+
+  Arguments:
+     model: The model on which metrics are being calculated.
+     inputs: The inputs of the given model. This is typically the mini batch of
+              data that is fed to the model.
+     targets: The predictions or targets of the given model.
+
+  Returns:
+     Returns the model output, total loss and loss value calculated using the
+     specified loss function. The total loss includes regularization losses and
+     applies masking and sample weighting to the loss value.
+  """
+  total_loss = 0
+  outs = model(inputs)
+  if not isinstance(outs, list):
+    outs = [outs]
+
+  if not isinstance(targets, list):
+    targets = [targets]
+
+  loss_metrics = []
+  with K.name_scope('loss'):
+    for i, loss_fn in enumerate(model.loss_functions):
+      # compute the loss
+      output_loss = _eager_loss_fn(outs[i], targets[i], loss_fn,
+                                   model.output_names[i])
+      loss_metrics.append(K.mean(output_loss))
+
+      mask = outs[i]._keras_mask
+      # adapted from weighted_loss_fn
+      if mask is not None:
+        # mask should have the same shape as output_loss
+        output_loss *= mask
+        #  the loss per batch should be proportional
+        #  to the number of unmasked samples.
+        output_loss /= K.mean(mask)
+
+      # adapted from weighted_loss_fn
+      # apply sample weighting
+      if model.sample_weights:
+        # reduce score_array to same ndim as weight array
+        ndim = K.ndim(output_loss)
+        weight_ndim = K.ndim(model.sample_weights)
+        output_loss = K.mean(output_loss, axis=list(range(weight_ndim, ndim)))
+        output_loss *= model.sample_weights
+        output_loss /= K.mean(K.cast(K.not_equal(model.sample_weights, 0),
+                                     K.floatx()))
+        output_loss = K.mean(output_loss)
+
+      loss_weight = model.loss_weights_list[i]
+      if total_loss is None:
+        total_loss = loss_weight * output_loss
+      else:
+        total_loss += loss_weight * output_loss
+
+    total_loss = K.mean(total_loss)
+    # Add regularization losses
+    custom_losses = []
+    for layer in model.layers:
+      if layer.losses:
+        custom_losses += layer.losses
+
+    if custom_losses:
+      total_loss += sum(custom_losses)
+
+  return outs, total_loss, loss_metrics
+
+
+def _process_single_batch(eager_model_inputs, eager_model_outputs, model,
+                          training=True):
+  """Calculate the loss and gradient for one input batch.
+
+     The model weights are updated if training is set to True.
+
+  Arguments:
+      eager_model_inputs: Input batch data.
+      eager_model_outputs: Output batch data.
+      model: Model whose loss has to be calculated.
+      training: The boolean represents if the weights of the model are updated.
+              'fit' methods will set this to True while 'evaluate' methods will
+              set this to False.
+
+  Returns:
+      output of the model, total loss and the loss associated with each output.
+
+  Raises:
+      ValueError: If the model loss is 0 or if the trainable weights list is
+                  empty when the trainable parameter is set to True.
+  """
+  K.set_learning_phase(training)
+  with GradientTape() as tape:
+    outs, loss, loss_metrics = _model_loss(model, eager_model_inputs,
+                                           eager_model_outputs)
+    if loss is None:
+      raise ValueError('The model cannot be run '
+                       'because it has no loss to optimize.')
+  if training:
+    if not model._collected_trainable_weights:
+      raise ValueError('The list of trainable weights is empty. Make sure that '
+                       'you are not setting model.trainable to False before '
+                       'compiling the model.')
+    grads = tape.gradient(loss, model._collected_trainable_weights)
+    model.optimizer.apply_gradients(zip(grads,
+                                        model._collected_trainable_weights))
+  return outs, loss, loss_metrics
+
+
+def train_on_batch(model, ins):
+  """Calculates the loss and gradient updates for one input batch.
+
+  Arguments:
+      model: Given model on which loss and gradients are calculated.
+      ins: Input and output batch numpy arrays.
+
+  Returns:
+      total loss and the loss associated with each output.
+  """
+  ins_batch_converted = []
+  for ib in ins:
+    ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+  eager_model_inputs = []
+  eager_model_outputs = []
+  for i in range(len(model.inputs)):
+    eager_model_inputs.append(ins_batch_converted[i])
+  for i in range(len(model.inputs), len(ins_batch_converted)):
+    eager_model_outputs.append(ins_batch_converted[i])
+  outs, loss, _ = _process_single_batch(
+      eager_model_inputs, eager_model_outputs, model)
+  if not isinstance(outs, list):
+    outs = [outs]
+  _, metrics_results = _eager_metrics_fn(
+      model, outs, eager_model_outputs)
+  if not isinstance(loss, list):
+    loss = [loss]
+  return loss + metrics_results
+
+
+def test_on_batch(model, ins):
+  """Calculates the loss for one input batch.
+
+  Arguments:
+      model: Given model on which loss is calculated.
+      ins: Input and output batch numpy arrays.
+
+  Returns:
+      total loss, loss and metrics associated with each output.
+  """
+  ins_batch_converted = []
+  for ib in ins:
+    ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+  eager_model_inputs = []
+  eager_model_outputs = []
+  for i in range(len(model.inputs)):
+    eager_model_inputs.append(ins_batch_converted[i])
+  for i in range(len(model.inputs), len(ins_batch_converted)):
+    eager_model_outputs.append(ins_batch_converted[i])
+  outs, loss, loss_metrics = _process_single_batch(
+      eager_model_inputs, eager_model_outputs, model, training=False)
+  if not isinstance(outs, list):
+    outs = [outs]
+  metric_names, metrics_results = _eager_metrics_fn(
+      model, outs, eager_model_outputs)
+  model.metrics_names.append(metric_names)
+  if not isinstance(loss, list):
+    loss = [loss]
+  return loss + loss_metrics + metrics_results
+
+
+def fit_loop(
+    model,
+    ins,
+    out_labels=None,
+    batch_size=None,
+    epochs=100,
+    verbose=1,
+    callbacks=None,
+    val_ins=None,
+    shuffle=True,
+    callback_metrics=None,
+    initial_epoch=0,
+    steps_per_epoch=None,
+    validation_steps=None):
+  """Abstract fit function for `f(ins)`.
+
+  Assume that f returns a list, labeled by out_labels.
+
+  Arguments:
+      model: Instance of the model that is being executed in Eager mode.
+      ins: List of tensors to be fed to `f`
+      out_labels: List of strings, display names of
+          the outputs of `f`
+      batch_size: Integer batch size or None if unknown.
+      epochs: Number of times to iterate over the data
+      verbose: Verbosity mode, 0, 1 or 2
+      callbacks: List of callbacks to be called during training
+      val_ins: List of tensors to be fed to `val_f`
+      shuffle: Whether to shuffle the data at the beginning of each epoch
+      callback_metrics: List of strings, the display names of the metrics
+          passed to the callbacks. They should be the
+          concatenation of list the display names of the outputs of
+           `f` and the list of display names of the outputs of `f_val`.
+      initial_epoch: Epoch at which to start training
+          (useful for resuming a previous training run)
+      steps_per_epoch: Total number of steps (batches of samples)
+          before declaring one epoch finished and starting the
+          next epoch. Ignored with the default value of `None`.
+      validation_steps: Number of steps to run validation for (only if doing
+        validation from data tensors). Ignored with default value of `None`.
+
+  Returns:
+      `History` object.
+
+  Raises:
+    ValueError: In case of invalid argument values.
+  """
+  # Required for Eager mode
+  K.set_learning_phase(True)
+
+  do_validation = False
+  if val_ins:
+    do_validation = True
+    if (verbose and ins and hasattr(ins[0], 'shape') and
+        hasattr(val_ins[0], 'shape')):
+      print('Train on %d samples, validate on %d samples' %
+            (ins[0].shape[0], val_ins[0].shape[0]))
+  if validation_steps:
+    if steps_per_epoch is None:
+      raise ValueError('Can only use `validation_steps` when doing step-wise '
+                       'training, i.e. `steps_per_epoch` must be set.')
+    do_validation = True
+
+  num_train_samples = model._check_num_samples(
+      ins, batch_size, steps_per_epoch, 'steps_per_epoch')
+
+  if num_train_samples is not None:
+    index_array = np.arange(num_train_samples)
+
+  model.history = cbks.History()
+  callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history]
+  if verbose:
+    if steps_per_epoch is not None:
+      count_mode = 'steps'
+    else:
+      count_mode = 'samples'
+    callbacks += [cbks.ProgbarLogger(count_mode)]
+  callbacks = cbks.CallbackList(callbacks)
+  out_labels = out_labels or []
+
+  # it's possible to callback a different model than self
+  # (used by Sequential models)
+  if hasattr(model, 'callback_model') and model.callback_model:
+    callback_model = model.callback_model
+  else:
+    callback_model = model
+
+  callbacks.set_model(callback_model)
+
+  callbacks.set_params({
+      'batch_size': batch_size,
+      'epochs': epochs,
+      'steps': steps_per_epoch,
+      'samples': num_train_samples,
+      'verbose': verbose,
+      'do_validation': do_validation,
+      'metrics': callback_metrics or [],
+  })
+  callbacks.on_train_begin()
+  callback_model.stop_training = False
+  for cbk in callbacks:
+    cbk.validation_data = val_ins
+
+  for epoch in range(initial_epoch, epochs):
+    callbacks.on_epoch_begin(epoch)
+    epoch_logs = {}
+    if shuffle == 'batch':
+      index_array = model._batch_shuffle(index_array, batch_size)
+    elif shuffle:
+      np.random.shuffle(index_array)
+
+    batches = _make_batches(num_train_samples, batch_size)
+
+    for batch_index, (batch_start, batch_end) in enumerate(batches):
+      batch_ids = index_array[batch_start:batch_end]
+      try:
+        if isinstance(ins[-1], float):
+          # Do not slice the training phase flag.
+          ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+        else:
+          ins_batch = _slice_arrays(ins, batch_ids)
+      except TypeError:
+        raise TypeError('TypeError while preparing batch. '
+                        'If using HDF5 input data, '
+                        'pass shuffle="batch".')
+      batch_logs = {}
+      batch_logs['batch'] = batch_index
+      batch_logs['size'] = len(batch_ids)
+
+      callbacks.on_batch_begin(batch_index, batch_logs)
+
+      ins_batch_converted = []
+      for ib in ins_batch:
+        ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+      eager_model_inputs = []
+      eager_model_outputs = []
+      for i in range(len(model.inputs)):
+        eager_model_inputs.append(ins_batch_converted[i])
+
+      for i in range(len(model.inputs), len(ins_batch_converted)):
+        eager_model_outputs.append(ins_batch_converted[i])
+
+      outs, loss, loss_metrics = _process_single_batch(eager_model_inputs,
+                                                       eager_model_outputs,
+                                                       model)
+
+      if not isinstance(outs, list):
+        outs = [outs]
+
+      for l, o in zip(out_labels, outs):
+        batch_logs[l] = o
+      # Required for Eager mode
+      metrics_names, metrics_results = _eager_metrics_fn(model, outs,
+                                                         eager_model_outputs)
+      batch_logs['loss'] = tensor_util.constant_value(K.mean(loss))
+
+      # TODO(anjalisridhar): Move this to compile to avoid duplicate code.
+      # In graph mode we set the metric names in compile. However in
+      # Eager mode we calculate the metrics for each batch in fit_loop.
+      # We could calculate the metric names and functions in compile.
+      # This would avoid setting the callback parameters separately.
+      # We need to do this for the first iteration alone
+      for m in metrics_names:
+        if m not in callback_metrics:
+          callback_metrics.append(m)
+
+      callbacks.set_params({
+          'batch_size': batch_size,
+          'epochs': epochs,
+          'steps': steps_per_epoch,
+          'samples': num_train_samples,
+          'verbose': verbose,
+          'do_validation': do_validation,
+          'metrics': callback_metrics or [],
+      })
+
+      for k, v in zip(model.metrics_names,
+                      [K.mean(loss)] + loss_metrics + metrics_results):
+        batch_logs[k] = tensor_util.constant_value(v)
+
+      callbacks.on_batch_end(batch_index, batch_logs)
+      if callback_model.stop_training:
+        break
+
+      if batch_index == len(batches) - 1:  # Last batch.
+        if do_validation:
+          val_outs = test_loop(
+              model, val_ins, batch_size=batch_size, verbose=0)
+          if not isinstance(val_outs, list):
+            val_outs = [val_outs]
+          # Same labels assumed.
+          for l, o in zip(out_labels, val_outs):
+            epoch_logs['val_' + l] = o
+    callbacks.on_epoch_end(epoch, epoch_logs)
+    if callback_model.stop_training:
+      break
+  callbacks.on_train_end()
+  return model.history
+
+
+def test_loop(model, ins, batch_size=None, verbose=0, steps=None):
+  """Abstract method to loop over some data in batches.
+
+  Arguments:
+      model: Model instance that is being evaluated in Eager mode.
+      ins: list of tensors to be fed to `f`.
+      batch_size: integer batch size or `None`.
+      verbose: verbosity mode.
+      steps: Total number of steps (batches of samples)
+          before declaring predictions finished.
+          Ignored with the default value of `None`.
+
+  Returns:
+      Scalar loss (if the model has a single output and no metrics)
+      or list of scalars (if the model has multiple outputs
+      and/or metrics). The attribute `model.metrics_names` will give you
+      the display labels for the scalar outputs.
+  """
+  K.set_learning_phase(False)
+  num_samples = model._check_num_samples(ins, batch_size, steps, 'steps')
+  outs = []
+  if verbose == 1:
+    progbar = Progbar(target=num_samples)
+  batches = _make_batches(num_samples, batch_size)
+  index_array = np.arange(num_samples)
+  for batch_index, (batch_start, batch_end) in enumerate(batches):
+    batch_ids = index_array[batch_start:batch_end]
+    if isinstance(ins[-1], float):
+      # Do not slice the training phase flag.
+      ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+    else:
+      ins_batch = _slice_arrays(ins, batch_ids)
+
+    ins_batch_converted = []
+    for ib in ins_batch:
+      ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+
+    eager_model_inputs = []
+    eager_model_outputs = []
+    for i in range(len(model.inputs)):
+      eager_model_inputs.append(ins_batch_converted[i])
+
+    for i in range(len(model.inputs), len(ins_batch_converted)):
+      eager_model_outputs.append(ins_batch_converted[i])
+
+    loss_outs, loss, loss_metrics = _model_loss(model, eager_model_inputs,
+                                                eager_model_outputs)
+    _, metrics_results = _eager_metrics_fn(model, loss_outs,
+                                           eager_model_outputs)
+    batch_outs = []
+    for _, v in zip(model.metrics_names,
+                    [K.mean(loss)] + loss_metrics + metrics_results):
+      batch_outs.append(tensor_util.constant_value(v))
+
+    if isinstance(batch_outs, list):
+      if batch_index == 0:
+        for batch_out in enumerate(batch_outs):
+          outs.append(0.)
+      for i, batch_out in enumerate(batch_outs):
+        outs[i] += batch_out * len(batch_ids)
+    else:
+      if batch_index == 0:
+        outs.append(0.)
+      outs[0] += batch_outs * len(batch_ids)
+
+    if verbose == 1:
+      progbar.update(batch_end)
+  for i in range(len(outs)):
+    outs[i] /= num_samples
+  if len(outs) == 1:
+    return outs[0]
+  return outs
+
+
+def predict_loop(model, ins, batch_size=32, verbose=0, steps=None):
+  """Abstract method to loop over some data in batches.
+
+  Arguments:
+      model:
+      ins: list of tensors to be fed to `f`.
+      batch_size: integer batch size.
+      verbose: verbosity mode.
+      steps: Total number of steps (batches of samples)
+          before declaring `_predict_loop` finished.
+          Ignored with the default value of `None`.
+
+  Returns:
+      Array of predictions (if the model has a single output)
+      or list of arrays of predictions
+      (if the model has multiple outputs).
+  """
+  K.set_learning_phase(False)
+  num_samples = model._check_num_samples(ins, batch_size, steps, 'steps')
+  if verbose == 1:
+    if steps is not None:
+      progbar = Progbar(target=steps)
+    else:
+      progbar = Progbar(target=num_samples)
+
+  outs = []
+  batches = _make_batches(num_samples, batch_size)
+  index_array = np.arange(num_samples)
+  for batch_index, (batch_start, batch_end) in enumerate(batches):
+    batch_ids = index_array[batch_start:batch_end]
+    if ins and isinstance(ins[-1], float):
+      # Do not slice the training phase flag.
+      ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+    else:
+      ins_batch = _slice_arrays(ins, batch_ids)
+
+    ins_batch_converted = []
+    for ib in ins_batch:
+      ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+
+    eager_model_inputs = []
+    for i in range(len(model.inputs)):
+      eager_model_inputs.append(ins_batch_converted[i])
+
+    batch_outs = model(eager_model_inputs)
+
+    if not isinstance(batch_outs, list):
+      batch_outs = [batch_outs]
+    if batch_index == 0:
+      # Pre-allocate the results arrays.
+      for batch_out in batch_outs:
+        dims = batch_out.shape[1:].dims
+        dims_list = [d.value for d in dims]
+        shape = (num_samples,) + tuple(dims_list)
+        outs.append(np.zeros(shape, dtype=batch_out.dtype.as_numpy_dtype))
+    for i, batch_out in enumerate(batch_outs):
+      outs[i][batch_start:batch_end] = batch_out
+    if verbose == 1:
+      progbar.update(batch_end)
+  if len(outs) == 1:
+    return outs[0]
+  return outs
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
new file mode 100644
index 0000000000..81e2f7a514
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -0,0 +1,755 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training routines."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
+
+class TrainingTest(test.TestCase):
+
+  def test_fit_on_arrays(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    # Test fit at different verbosity
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=2,
+        batch_size=5,
+        verbose=2)
+
+    # Test with validation data
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=2,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=2,
+        batch_size=5,
+        verbose=2)
+    model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
+
+  # Test with validation split
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=2,
+        batch_size=5,
+        verbose=0,
+        validation_split=0.2)
+
+    # Test with dictionary inputs
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        epochs=1,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        validation_data=({'input_a': input_a_np,
+                          'input_b': input_b_np
+                         },
+                         {
+                             'dense': output_d_np,
+                             'dropout': output_e_np
+                         }),
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.train_on_batch({
+        'input_a': input_a_np,
+        'input_b': input_b_np
+    }, {'dense': output_d_np,
+        'dropout': output_e_np})
+    # Test with lists for loss, metrics
+    loss = ['mae', 'mse']
+    metrics = ['acc', 'mae']
+    model.compile(optimizer, loss, metrics=metrics)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+
+    # Test with dictionaries for loss, metrics, loss weights
+    loss = {'dense': 'mse', 'dropout': 'mae'}
+    loss_weights = {'dense': 1., 'dropout': 0.5}
+    metrics = {'dense': 'mse', 'dropout': 'mae'}
+    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+
+    # Invalid use cases
+    with self.assertRaises(AttributeError):
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          epochs=1,
+          validation_data=([input_a_np, input_b_np], 0, 0),
+          verbose=0)
+    with self.assertRaises(ValueError):
+      model.train_on_batch({'input_a': input_a_np},
+                           [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      model.train_on_batch([input_a_np], [output_d_np, output_e_np])
+    with self.assertRaises(AttributeError):
+      model.train_on_batch(1, [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      model.train_on_batch(input_a_np, [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      bad_input = np.random.random((11, 3))
+      model.train_on_batch([bad_input, input_b_np],
+                           [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      bad_target = np.random.random((11, 4))
+      model.train_on_batch([input_a_np, input_b_np],
+                           [bad_target, output_e_np])
+
+    # Build single-input model
+    x = keras.layers.Input(shape=(3,), name='input_a')
+    y = keras.layers.Dense(4)(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+    # This will work
+    model.fit([input_a_np], output_d_np, epochs=1)
+    with self.assertRaises(ValueError):
+      model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
+
+  def test_evaluate_predict_on_arrays(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae']
+    model.compile(
+        optimizer,
+        loss,
+        metrics=metrics,
+        loss_weights=loss_weights,
+        sample_weight_mode=None)
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    # Test evaluate at different verbosity
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=0)
+    self.assertEqual(len(out), 5)
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=1)
+    self.assertEqual(len(out), 5)
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=2)
+    self.assertEqual(len(out), 5)
+    out = model.test_on_batch([input_a_np, input_b_np],
+                              [output_d_np, output_e_np])
+    self.assertEqual(len(out), 5)
+
+    # Test evaluate with dictionary inputs
+    model.evaluate(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        batch_size=5,
+        verbose=0)
+    model.evaluate(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        batch_size=5,
+        verbose=1)
+
+    # Test predict
+    out = model.predict([input_a_np, input_b_np], batch_size=5)
+    self.assertEqual(len(out), 2)
+    out = model.predict({'input_a': input_a_np, 'input_b': input_b_np})
+    self.assertEqual(len(out), 2)
+    out = model.predict_on_batch({
+        'input_a': input_a_np,
+        'input_b': input_b_np
+    })
+    self.assertEqual(len(out), 2)
+
+  def test_invalid_loss_or_metrics(self):
+    num_classes = 5
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
+    model.add(keras.layers.Activation('relu'))
+    model.add(keras.layers.Dense(num_classes))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+    np.random.seed(1337)
+
+    (x_train, y_train), (_, _) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+
+    with self.assertRaises(ValueError):
+      model.fit(x_train, np.concatenate([y_train, y_train], axis=-1))
+
+    with self.assertRaises(TypeError):
+      model.compile(loss='categorical_crossentropy',
+                    optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    metrics=set(0))
+
+    with self.assertRaises(ValueError):
+      model.compile(loss=None,
+                    optimizer='rms')
+
+
+class LossWeightingTest(test.TestCase):
+
+  def test_class_weights(self):
+    num_classes = 5
+    batch_size = 5
+    epochs = 5
+    weighted_class = 3
+    train_samples = 3000
+    test_samples = 3000
+    input_dim = 5
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
+    model.add(keras.layers.Activation('relu'))
+    model.add(keras.layers.Dense(num_classes))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    np.random.seed(1337)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    int_y_test = y_test.copy()
+    int_y_train = y_train.copy()
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    y_test = keras.utils.to_categorical(y_test, num_classes)
+    test_ids = np.where(int_y_test == np.array(weighted_class))[0]
+
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+    class_weight[weighted_class] = 2.
+
+    sample_weight = np.ones((y_train.shape[0]))
+    sample_weight[int_y_train == weighted_class] = 2.
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        class_weight=class_weight,
+        validation_data=(x_train, y_train, sample_weight))
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 2,
+        verbose=0,
+        class_weight=class_weight)
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 2,
+        verbose=0,
+        class_weight=class_weight,
+        validation_split=0.1)
+
+    model.train_on_batch(
+        x_train[:batch_size], y_train[:batch_size], class_weight=class_weight)
+    ref_score = model.evaluate(x_test, y_test, verbose=0)
+    score = model.evaluate(
+        x_test[test_ids, :], y_test[test_ids, :], verbose=0)
+    self.assertLess(score, ref_score)
+
+  def test_sample_weights(self):
+    num_classes = 5
+    batch_size = 5
+    epochs = 5
+    weighted_class = 3
+    train_samples = 3000
+    test_samples = 3000
+    input_dim = 5
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
+    model.add(keras.layers.Activation('relu'))
+    model.add(keras.layers.Dense(num_classes))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    np.random.seed(43)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    int_y_test = y_test.copy()
+    int_y_train = y_train.copy()
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    y_test = keras.utils.to_categorical(y_test, num_classes)
+    test_ids = np.where(int_y_test == np.array(weighted_class))[0]
+
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+    class_weight[weighted_class] = 2.
+
+    sample_weight = np.ones((y_train.shape[0]))
+    sample_weight[int_y_train == weighted_class] = 2.
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        sample_weight=sample_weight)
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        sample_weight=sample_weight,
+        validation_split=0.1)
+    model.train_on_batch(
+        x_train[:batch_size],
+        y_train[:batch_size],
+        sample_weight=sample_weight[:batch_size])
+    model.test_on_batch(
+        x_train[:batch_size],
+        y_train[:batch_size],
+        sample_weight=sample_weight[:batch_size])
+
+  def test_temporal_sample_weights(self):
+    num_classes = 5
+    weighted_class = 3
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+    timesteps = 3
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.TimeDistributed(
+            keras.layers.Dense(num_classes),
+            input_shape=(timesteps, input_dim)))
+    model.add(keras.layers.Activation('softmax'))
+
+    np.random.seed(1337)
+    (_, y_train), _ = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    int_y_train = y_train.copy()
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+    class_weight[weighted_class] = 2.
+
+    sample_weight = np.ones((y_train.shape[0]))
+    sample_weight[int_y_train == weighted_class] = 2.
+    with self.assertRaises(ValueError):
+      model.compile(
+          loss='binary_crossentropy',
+          optimizer=RMSPropOptimizer(learning_rate=0.001),
+          sample_weight_mode='temporal')
+
+  def test_class_weight_invalid_use_case(self):
+    num_classes = 5
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+    timesteps = 3
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.TimeDistributed(
+            keras.layers.Dense(num_classes),
+            input_shape=(timesteps, input_dim)))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+
+    del class_weight[1]
+    with self.assertRaises(ValueError):
+      model.fit(x_train, y_train,
+                epochs=0, verbose=0, class_weight=class_weight)
+
+    with self.assertRaises(ValueError):
+      model.compile(
+          loss='binary_crossentropy',
+          optimizer=RMSPropOptimizer(learning_rate=0.001),
+          sample_weight_mode=[])
+
+    # Build multi-output model
+    x = keras.Input((3,))
+    y1 = keras.layers.Dense(4, name='1')(x)
+    y2 = keras.layers.Dense(4, name='2')(x)
+    model = keras.models.Model(x, [y1, y2])
+    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+    x_np = np.random.random((10, 3))
+    y_np = np.random.random((10, 4))
+    w_np = np.random.random((10,))
+    # This will work
+    model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': w_np})
+    # These will not
+    with self.assertRaises(ValueError):
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight=[w_np])
+    with self.assertRaises(TypeError):
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight=w_np)
+    with self.assertRaises(ValueError):
+      bad_w_np = np.random.random((11,))
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
+    with self.assertRaises(ValueError):
+      bad_w_np = np.random.random((10, 2))
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
+    with self.assertRaises(ValueError):
+      bad_w_np = np.random.random((10, 2, 2))
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
+
+
+class TestDynamicTrainability(test.TestCase):
+
+  def test_trainable_warning(self):
+    x = np.random.random((5, 3))
+    y = np.random.random((5, 2))
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_dim=3))
+    model.trainable = False
+    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
+    model.trainable = True
+    with self.assertRaises(ValueError):
+      model.train_on_batch(x, y)
+
+  def test_trainable_argument(self):
+    x = np.random.random((5, 3))
+    y = np.random.random((5, 2))
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_dim=3, trainable=False))
+    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
+    out = model.predict(x)
+    with self.assertRaises(ValueError):
+      model.train_on_batch(x, y)
+    out_2 = model.predict(x)
+    self.assertAllClose(out, out_2)
+
+    # test with nesting
+    inputs = keras.layers.Input(shape=(3,))
+    output = model(inputs)
+    model = keras.models.Model(inputs, output)
+    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
+    out = model.predict(x)
+    with self.assertRaises(ValueError):
+      model.train_on_batch(x, y)
+    out_2 = model.predict(x)
+    self.assertAllClose(out, out_2)
+
+  def test_layer_trainability_switch(self):
+    # with constructor argument, in Sequential
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, trainable=False, input_dim=1))
+    self.assertListEqual(model.trainable_weights, [])
+
+    # by setting the `trainable` argument, in Sequential
+    model = keras.models.Sequential()
+    layer = keras.layers.Dense(2, input_dim=1)
+    model.add(layer)
+    self.assertListEqual(model.trainable_weights, layer.trainable_weights)
+    layer.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+    # with constructor argument, in Model
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2, trainable=False)(x)
+    model = keras.models.Model(x, y)
+    self.assertListEqual(model.trainable_weights, [])
+
+    # by setting the `trainable` argument, in Model
+    x = keras.layers.Input(shape=(1,))
+    layer = keras.layers.Dense(2)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    self.assertListEqual(model.trainable_weights, layer.trainable_weights)
+    layer.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+  def test_model_trainability_switch(self):
+    # a non-trainable model has no trainable weights
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2)(x)
+    model = keras.models.Model(x, y)
+    model.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+    # same for Sequential
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_dim=1))
+    model.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+  def test_nested_model_trainability(self):
+
+    # a Sequential inside a Model
+    inner_model = keras.models.Sequential()
+    inner_model.add(keras.layers.Dense(2, input_dim=1))
+
+    x = keras.layers.Input(shape=(1,))
+    y = inner_model(x)
+    outer_model = keras.models.Model(x, y)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+    # a Sequential inside a Sequential
+    inner_model = keras.models.Sequential()
+    inner_model.add(keras.layers.Dense(2, input_dim=1))
+    outer_model = keras.models.Sequential()
+    outer_model.add(inner_model)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+    # a Model inside a Model
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2)(x)
+    inner_model = keras.models.Model(x, y)
+    x = keras.layers.Input(shape=(1,))
+    y = inner_model(x)
+    outer_model = keras.models.Model(x, y)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+    # a Model inside a Sequential
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2)(x)
+    inner_model = keras.models.Model(x, y)
+    outer_model = keras.models.Sequential()
+    outer_model.add(inner_model)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+
+class TestTrainingUtils(test.TestCase):
+
+  def test_check_array_lengths(self):
+    keras.engine.training._check_array_lengths(None, None, None)
+    a_np = np.random.random((4, 3, 3))
+    keras.engine.training._check_array_lengths(a_np, a_np, a_np)
+    keras.engine.training._check_array_lengths(
+        [a_np, a_np], [a_np, a_np], [a_np, a_np])
+    keras.engine.training._check_array_lengths([None], [None], [None])
+
+    b_np = np.random.random((3, 4))
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths(a_np, None, None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths(a_np, a_np, None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths([a_np], [None], None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths([a_np], [b_np], None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths([a_np], None, [b_np])
+
+  def test_slice_arrays(self):
+    input_a = np.random.random((10, 3))
+    keras.engine.training._slice_arrays(None)
+    keras.engine.training._slice_arrays(input_a, 0)
+    keras.engine.training._slice_arrays(input_a, 0, 1)
+    keras.engine.training._slice_arrays(input_a, stop=2)
+    input_a = [None, [1, 1], None, [1, 1]]
+    keras.engine.training._slice_arrays(input_a, 0)
+    keras.engine.training._slice_arrays(input_a, 0, 1)
+    keras.engine.training._slice_arrays(input_a, stop=2)
+    input_a = [None]
+    keras.engine.training._slice_arrays(input_a, 0)
+    keras.engine.training._slice_arrays(input_a, 0, 1)
+    keras.engine.training._slice_arrays(input_a, stop=2)
+    input_a = None
+    keras.engine.training._slice_arrays(input_a, 0)
+    keras.engine.training._slice_arrays(input_a, 0, 1)
+    keras.engine.training._slice_arrays(input_a, stop=2)
+
+  def test_fit_with_BatchNorm(self):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_dim=4))
+    model.add(keras.layers.BatchNormalization())
+    model.add(keras.layers.Activation('tanh'))
+    model.add(keras.layers.Dropout(0.2))
+
+    input_a_np = np.random.random((10, 4))
+    output_b_np = np.random.random((10, 10))
+
+    model.compile(loss='binary_crossentropy', optimizer=RMSPropOptimizer(0.001))
+    model.fit(input_a_np, output_b_np, epochs=1, batch_size=5, verbose=0)
+
+  def test_fit_with_regularization(self):
+    model = keras.models.Sequential()
+    with self.assertRaises(ValueError):
+      model.add(
+          keras.layers.Dense(4, input_dim=3,
+                             kernel_regularizer=keras.regularizers.l2(0.01),
+                             activity_regularizer=keras.regularizers.l1(0.01)))
+
+
+if __name__ == '__main__':
+  # Bazel sets these environment variables to very long paths.
+  # Tempfile uses them to create long paths, and in turn multiprocessing
+  # library tries to create sockets named after paths. Delete whatever bazel
+  # writes to these to avoid tests failing due to socket addresses being too
+  # long.
+  for var in ('TMPDIR', 'TMP', 'TEMP'):
+    if var in os.environ:
+      del os.environ[var]
+
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index 6ee3fb48b2..ea2d3f2f04 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -23,6 +23,7 @@ import types as python_types
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
@@ -119,7 +120,8 @@ class Dropout(tf_core_layers.Dropout, Layer):
     if training is None:
       training = K.learning_phase()
     output = super(Dropout, self).call(inputs, training=training)
-    if training is K.learning_phase():
+    # EagerTensor object has no attribute _uses_learning_phase
+    if not context.in_eager_mode() and training is K.learning_phase():
       output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index 965ef70e6e..eecb14ceaa 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
@@ -108,7 +109,7 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
     if training is None:
       training = K.learning_phase()
     output = super(BatchNormalization, self).call(inputs, training=training)
-    if training is K.learning_phase():
+    if context.in_graph_mode() and training is K.learning_phase():
       output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py
index e47987aadc..a55a5e39a6 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers.py
@@ -24,6 +24,7 @@ import copy
 import six
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
 from tensorflow.python.keras._impl.keras import backend as K
@@ -680,7 +681,14 @@ class TFOptimizer(Optimizer):
   def __init__(self, optimizer):  # pylint: disable=super-init-not-called
     self.optimizer = optimizer
     with K.name_scope(self.__class__.__name__):
-      self.iterations = K.variable(0, dtype='int64', name='iterations')
+      if context.in_graph_mode():
+        self.iterations = K.variable(0, dtype='int64', name='iterations')
+
+  def apply_gradients(self, grads):
+    self.optimizer.apply_gradients(grads)
+
+  def get_grads(self, loss, params):
+    return self.optimizer.compute_gradients(loss, params)
 
   def get_updates(self, loss, params):
     grads = self.optimizer.compute_gradients(loss, params)
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 04b6056ace..5dea732cba 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import utils as layers_util
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
@@ -649,6 +650,7 @@ class Layer(object):
     else:
       scope_context_manager = vs.variable_scope(
           self._scope, reuse=self._reuse, auxiliary_name_scope=False)
+    input_shapes = None
     with scope_context_manager as scope:
       with ops.name_scope(self._name_scope_name(scope)):
         if not self.built:
@@ -698,6 +700,9 @@ class Layer(object):
         else:
           # Deferred mode behavior: use `compute_output_shape` to
           # infer the number of outputs of the layer and their shapes.
+          if input_shapes is None:
+            input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+
           output_shapes = self.compute_output_shape(input_shapes)
           output_shapes = nest.flatten(output_shapes)
           outputs = [
@@ -1393,7 +1398,10 @@ class _DeferredTensor(object):
 
   def __init__(self, shape, dtype, name=None):
     self.shape = tensor_shape.TensorShape(shape)
-    self.dtype = dtypes.as_dtype(dtype)
+    if dtype is None:
+      self.dtype = dtypes.as_dtype(np.float32)
+    else:
+      self.dtype = dtypes.as_dtype(dtype)
     self.name = name
 
   def get_shape(self):
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py
index 0a5dd57621..745843975c 100644
--- a/tensorflow/python/layers/network.py
+++ b/tensorflow/python/layers/network.py
@@ -621,6 +621,11 @@ class GraphNetwork(base.Layer):
         A list of loss tensors.
     """
     losses = []
+    if context.in_eager_mode():
+      for layer in self.layers:
+        losses += layer.losses
+      return losses
+
     # Retrieve losses for all internal layers.
     for layer in self.layers:
       if hasattr(layer, 'losses'):
@@ -853,7 +858,6 @@ class GraphNetwork(base.Layer):
       for node in nodes:
         # This is always a single layer, never a list.
         layer = node.outbound_layer
-
         reference_input_tensors = node.input_tensors
         reference_output_tensors = node.output_tensors
 
@@ -901,12 +905,13 @@ class GraphNetwork(base.Layer):
               else:
                 output_masks = [None for _ in range(len(output_tensors))]
 
-            # Apply activity regularizer if any:
-            if layer.activity_regularizer is not None:
-              regularization_losses = [
-                  layer.activity_regularizer(x) for x in computed_tensors
-              ]
-              layer.add_loss(regularization_losses, computed_tensors)
+            if context.in_graph_mode():
+              if layer.activity_regularizer is not None:
+                regularization_losses = [
+                    layer.activity_regularizer(x) for x in computed_tensors
+                ]
+                # Apply activity regularizer if any:
+                layer.add_loss(regularization_losses, computed_tensors)
 
           if context.in_graph_mode():
             # Update model updates and losses:
-- 
GitLab


From 33f537e5275d477771d68eb9b6654ddeeffd2f15 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 30 Jan 2018 19:30:18 -0800
Subject: [PATCH 447/966] internal change

PiperOrigin-RevId: 183932015
---
 .../contrib/tpu/profiler/capture_tpu_profile.cc       |  2 ++
 tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc   | 11 ++++-------
 tensorflow/contrib/tpu/profiler/dump_tpu_profile.h    |  2 ++
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 9ac9e3cb28..b1ef9fde37 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -154,6 +154,8 @@ int main(int argc, char** argv) {
               << std::endl
               << "Tip: increase number of attempts with --num_tracing_attempts."
               << std::endl;
+    // Don't dump profile data if no trace is collected.
+    return 0;
   }
 
   // Use the current timestamp as the run name.
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index 64e4e6275d..ebd6185faa 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -151,8 +151,7 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
   TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir));
 
   // Ignore computation_graph for now.
-  const bool empty_trace = response.encoded_trace().empty();
-  if (!empty_trace) {
+  if (!response.encoded_trace().empty()) {
     LOG(INFO) << "Converting trace events to TraceViewer JSON.";
     TF_RETURN_IF_ERROR(
         DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os));
@@ -163,11 +162,9 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
     TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir,
                                                    response.op_profile(), os));
   }
-  if (!empty_trace && !response.tool_data().empty()) {
-    for (const auto& tool_data : response.tool_data()) {
-      TF_RETURN_IF_ERROR(
-          DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
-    }
+  for (const auto& tool_data : response.tool_data()) {
+    TF_RETURN_IF_ERROR(
+        DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
   }
 
   return Status::OK();
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
index 2f8656a37b..29ef977bac 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
@@ -29,6 +29,8 @@ namespace tpu {
 //   - Op profile
 //   - Input pipeline analyzer
 //   - Overview page
+// Note: this function creates a directory even when all fields in
+// ProfileResponse are unset/empty.
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
                                   const ProfileResponse& response,
                                   std::ostream* os);
-- 
GitLab


From 2a01e3f2ee1ec5b1cf212dd949c1072129e4770a Mon Sep 17 00:00:00 2001
From: Bjarke Hammersholt Roune <broune@google.com>
Date: Tue, 30 Jan 2018 20:12:22 -0800
Subject: [PATCH 448/966] Remove Google-internal bug numbers from XLA error
 messages.

PiperOrigin-RevId: 183934860
---
 .../compiler/xla/service/cpu/ir_emitter.cc    | 26 +++++++++----------
 .../xla/service/elemental_ir_emitter.cc       |  7 +++--
 .../compiler/xla/service/gpu/ir_emitter.cc    |  9 ++++---
 .../xla/service/gpu/ir_emitter_unnested.cc    |  3 +--
 4 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 71e8133189..0b2d3d4746 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -479,7 +479,7 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) {
 
 Status IrEmitter::HandleSort(HloInstruction* sort) {
   // TODO(b/26783907): Implement sort on CPU.
-  return Unimplemented("Sort is not supported on CPU (b/26783907).");
+  return Unimplemented("Sort is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleTuple(HloInstruction* tuple) {
@@ -522,7 +522,7 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
   // TODO(b/31410564): Implement dilation for reduce-window.
   if (window_util::HasDilation(window)) {
     return Unimplemented(
-        "Dilation for reduce-window not implemented on CPU. See b/31410564.");
+        "Dilation for ReduceWindow is not implemented on CPU.");
   }
 
   // The called computation should have been emitted previously.
@@ -625,8 +625,7 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
   // TODO(b/31410564): Implement dilation for select-and-scatter.
   if (window_util::HasDilation(window)) {
     return Unimplemented(
-        "Dilation for select-and-scatter not implemented on CPU. "
-        "See b/31410564.");
+        "Dilation for SelectAndScatter is not implemented on CPU. ");
   }
 
   // The select and scatter computations should have been emitted previously.
@@ -1196,8 +1195,7 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
   }
 
   // TODO(b/33011107): Support cross replica sum on CPU.
-  return Unimplemented(
-      "Cross replica sum is not implemented on CPU. See b/33011107.");
+  return Unimplemented("CrossReplicaSum is not implemented on CPU.");
 }
 
 // Fills up the free variables in 'index_with_free_var' with values from
@@ -1811,12 +1809,12 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
 
 Status IrEmitter::HandleSend(HloInstruction* send) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Send is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Send is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleSendDone(HloInstruction* send_done) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Send-done is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Send-done is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleSlice(HloInstruction* slice) {
@@ -1981,12 +1979,12 @@ Status IrEmitter::HandleDynamicUpdateSlice(
 
 Status IrEmitter::HandleRecv(HloInstruction* recv) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Recv is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Recv is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleRecvDone(HloInstruction* recv_done) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Recv-done is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Recv-done is not implemented on CPU.");
 }
 
 Status IrEmitter::HandlePad(HloInstruction* pad) {
@@ -1995,10 +1993,10 @@ Status IrEmitter::HandlePad(HloInstruction* pad) {
   for (auto& padding_dimension : pad->padding_config().dimensions()) {
     if (padding_dimension.edge_padding_low() < 0 ||
         padding_dimension.edge_padding_high() < 0) {
-      return Unimplemented(
-          "Negative padding not supported in the CPU backend (b/34628603); "
-          "this should have been eliminated at the HLO level: %s",
-          pad->padding_config().ShortDebugString().c_str());
+      return InternalErrorStrCat(
+          "Encountered negative padding in IrEmitter on CPU. "
+          "This should have been eliminated at the HLO level. ",
+          pad->ToString());
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 9780bac16e..28cd425309 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -428,7 +428,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
           llvm::Intrinsic::round, {operand_value}, {operand_value->getType()},
           ir_builder_);
     case HloOpcode::kSign: {
-      // TODO(b/32151903): Ensure consistent sign behavior for -0.0
+      // TODO(b/32151903): Ensure consistent sign behavior for -0.0.
       auto type = operand_value->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
       auto oeq = ir_builder_->CreateFCmpOEQ(operand_value, zero);
@@ -870,7 +870,10 @@ llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value,
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
                                                       llvm::Value* x) const {
   if (prim_type != F32) {
-    return Unimplemented("inverse erf only implemented for F32 (b/34339814)");
+    // TODO(b/34339814): Implement inverse erf for F64.
+    return Unimplemented(
+        "Inverse erf is only implemented for element "
+        "type F32.");
   }
   auto getFloat = [&](const float f) {
     return llvm::ConstantFP::get(ir_builder_->getFloatTy(), f);
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 23b72c3f71..affd2ffa8e 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -615,8 +615,7 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
 
 Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
   // TODO(b/33011107): Support cross replica sum on GPU.
-  return Unimplemented(
-      "Cross replica sum not implemented on GPU. See b/33011107.");
+  return Unimplemented("CrossReplicaSum is not implemented on GPU.");
 }
 
 Status IrEmitter::HandleParameter(HloInstruction* parameter) {
@@ -710,11 +709,13 @@ Status IrEmitter::HandleCustomCall(HloInstruction*) {
 }
 
 Status IrEmitter::HandleInfeed(HloInstruction*) {
-  return Unimplemented("Infeed is not supported on GPU (b/30467474).");
+  // TODO(b/30467474): Implement infeed on GPU.
+  return Unimplemented("Infeed is not supported on GPU.");
 }
 
 Status IrEmitter::HandleOutfeed(HloInstruction*) {
-  return Unimplemented("Outfeed is not supported on GPU (b/34359662).");
+  // TODO(b/34359662): Implement outfeed on GPU.
+  return Unimplemented("Outfeed is not supported on GPU.");
 }
 
 Status IrEmitter::HandleRng(HloInstruction* random) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index fc8783e753..bd428f8028 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -1658,8 +1658,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
   // TODO(b/31410564): Implement dilation rate for select-and-scatter.
   if (window_util::HasDilation(window)) {
     return Unimplemented(
-        "Dilation for select-and-scatter not implemented on GPU. "
-        "See b/31410564.");
+        "Dilation for SelectAndScatter not implemented on GPU.");
   }
 
   // kSelectAndScatter is implemented as two kernel launches: the first launch
-- 
GitLab


From e9d4d3d06c0fb211f7488f868fefb477f07df4f8 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 30 Jan 2018 20:28:38 -0800
Subject: [PATCH 449/966] Adding tf_export decorators/calls to TensorFlow
 functions and constants.

PiperOrigin-RevId: 183936100
---
 tensorflow/python/training/adadelta.py            |  2 ++
 tensorflow/python/training/adagrad.py             |  2 ++
 tensorflow/python/training/adagrad_da.py          |  2 ++
 tensorflow/python/training/adam.py                |  2 ++
 tensorflow/python/training/basic_loops.py         |  2 ++
 .../python/training/basic_session_run_hooks.py    | 14 ++++++++++++++
 tensorflow/python/training/checkpoint_utils.py    |  5 +++++
 tensorflow/python/training/coordinator.py         |  3 +++
 tensorflow/python/training/device_setter.py       |  2 ++
 tensorflow/python/training/ftrl.py                |  3 ++-
 tensorflow/python/training/gradient_descent.py    |  2 ++
 tensorflow/python/training/input.py               | 15 +++++++++++++++
 tensorflow/python/training/learning_rate_decay.py | 10 ++++++++++
 tensorflow/python/training/momentum.py            |  2 ++
 tensorflow/python/training/monitored_session.py   |  8 ++++++++
 tensorflow/python/training/moving_averages.py     |  2 ++
 tensorflow/python/training/optimizer.py           |  2 ++
 tensorflow/python/training/proximal_adagrad.py    |  2 ++
 .../python/training/proximal_gradient_descent.py  |  2 ++
 tensorflow/python/training/queue_runner_impl.py   |  5 +++++
 tensorflow/python/training/rmsprop.py             |  2 ++
 tensorflow/python/training/saver.py               | 10 ++++++++++
 tensorflow/python/training/server_lib.py          |  3 +++
 tensorflow/python/training/session_manager.py     |  2 ++
 tensorflow/python/training/session_run_hook.py    |  5 +++++
 tensorflow/python/training/supervisor.py          |  2 ++
 .../python/training/sync_replicas_optimizer.py    |  2 ++
 tensorflow/python/training/training_util.py       |  6 ++++++
 tensorflow/tools/api/generator/BUILD              |  1 +
 29 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/training/adadelta.py b/tensorflow/python/training/adadelta.py
index 13c07cfd7b..c08e3cca00 100644
--- a/tensorflow/python/training/adadelta.py
+++ b/tensorflow/python/training/adadelta.py
@@ -22,8 +22,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdadeltaOptimizer")
 class AdadeltaOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adadelta algorithm.
 
diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py
index afa192f7cc..deb4e6f546 100644
--- a/tensorflow/python/training/adagrad.py
+++ b/tensorflow/python/training/adagrad.py
@@ -25,8 +25,10 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdagradOptimizer")
 class AdagradOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adagrad algorithm.
 
diff --git a/tensorflow/python/training/adagrad_da.py b/tensorflow/python/training/adagrad_da.py
index b3f9ea323c..5ba403554f 100644
--- a/tensorflow/python/training/adagrad_da.py
+++ b/tensorflow/python/training/adagrad_da.py
@@ -23,8 +23,10 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdagradDAOptimizer")
 class AdagradDAOptimizer(optimizer.Optimizer):
   """Adagrad Dual Averaging algorithm for sparse linear models.
 
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 0c69f8bf39..c92f6fc301 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -26,8 +26,10 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdamOptimizer")
 class AdamOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adam algorithm.
 
diff --git a/tensorflow/python/training/basic_loops.py b/tensorflow/python/training/basic_loops.py
index 52b0f42106..7af821c819 100644
--- a/tensorflow/python/training/basic_loops.py
+++ b/tensorflow/python/training/basic_loops.py
@@ -18,8 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import errors
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.basic_train_loop")
 def basic_train_loop(supervisor, train_step_fn, args=None,
                      kwargs=None, master=""):
   """Basic loop to train a model.
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 752d585cd1..17e07e171a 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -47,6 +47,7 @@ from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 from tensorflow.python.training.session_run_hook import SessionRunArgs
 from tensorflow.python.training.summary_io import SummaryWriterCache
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _HookTimer(object):
@@ -85,6 +86,7 @@ class _HookTimer(object):
     raise NotImplementedError
 
 
+@tf_export("train.SecondOrStepTimer")
 class SecondOrStepTimer(_HookTimer):
   """Timer that triggers at most once every N seconds or once every N steps.
   """
@@ -164,6 +166,7 @@ class NeverTriggerTimer(_HookTimer):
     return None
 
 
+@tf_export("train.LoggingTensorHook")
 class LoggingTensorHook(session_run_hook.SessionRunHook):
   """Prints the given tensors every N local steps, every N seconds, or at end.
 
@@ -262,6 +265,7 @@ class LoggingTensorHook(session_run_hook.SessionRunHook):
       self._log_tensors(values)
 
 
+@tf_export("train.StopAtStepHook")
 class StopAtStepHook(session_run_hook.SessionRunHook):
   """Hook that requests stop at a specified step."""
 
@@ -317,6 +321,7 @@ class StopAtStepHook(session_run_hook.SessionRunHook):
         run_context.request_stop()
 
 
+@tf_export("train.CheckpointSaverListener")
 class CheckpointSaverListener(object):
   """Interface for listeners that take action before or after checkpoint save.
 
@@ -375,6 +380,7 @@ class CheckpointSaverListener(object):
     pass
 
 
+@tf_export("train.CheckpointSaverHook")
 class CheckpointSaverHook(session_run_hook.SessionRunHook):
   """Saves checkpoints every N steps or seconds."""
 
@@ -497,6 +503,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     return savers[0]
 
 
+@tf_export("train.StepCounterHook")
 class StepCounterHook(session_run_hook.SessionRunHook):
   """Hook that counts steps per second."""
 
@@ -575,12 +582,14 @@ class StepCounterHook(session_run_hook.SessionRunHook):
     self._last_global_step = stale_global_step
 
 
+@tf_export("train.NanLossDuringTrainingError")
 class NanLossDuringTrainingError(RuntimeError):
 
   def __str__(self):
     return "NaN loss during training."
 
 
+@tf_export("train.NanTensorHook")
 class NanTensorHook(session_run_hook.SessionRunHook):
   """Monitors the loss tensor and stops training if loss is NaN.
 
@@ -612,6 +621,7 @@ class NanTensorHook(session_run_hook.SessionRunHook):
         run_context.request_stop()
 
 
+@tf_export("train.SummarySaverHook")
 class SummarySaverHook(session_run_hook.SessionRunHook):
   """Saves summaries every N steps."""
 
@@ -720,6 +730,7 @@ class SummarySaverHook(session_run_hook.SessionRunHook):
     return summary_op
 
 
+@tf_export("train.GlobalStepWaiterHook")
 class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
   """Delays execution until global step reaches `wait_until_step`.
 
@@ -767,6 +778,7 @@ class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
       time.sleep(0.5)
 
 
+@tf_export("train.FinalOpsHook")
 class FinalOpsHook(session_run_hook.SessionRunHook):
   """A hook which evaluates `Tensors` at the end of a session."""
 
@@ -793,6 +805,7 @@ class FinalOpsHook(session_run_hook.SessionRunHook):
                                            feed_dict=self._final_ops_feed_dict)
 
 
+@tf_export("train.FeedFnHook")
 class FeedFnHook(session_run_hook.SessionRunHook):
   """Runs `feed_fn` and sets the `feed_dict` accordingly."""
 
@@ -810,6 +823,7 @@ class FeedFnHook(session_run_hook.SessionRunHook):
         fetches=None, feed_dict=self.feed_fn())
 
 
+@tf_export("train.ProfilerHook")
 class ProfilerHook(session_run_hook.SessionRunHook):
   """Captures CPU/GPU profiling information every N steps or seconds.
 
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 63235a1454..fa3de6fad2 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -36,6 +37,7 @@ __all__ = [
 ]
 
 
+@tf_export("train.load_checkpoint")
 def load_checkpoint(ckpt_dir_or_file):
   """Returns `CheckpointReader` for checkpoint found in `ckpt_dir_or_file`.
 
@@ -60,6 +62,7 @@ def load_checkpoint(ckpt_dir_or_file):
   return pywrap_tensorflow.NewCheckpointReader(filename)
 
 
+@tf_export("train.load_variable")
 def load_variable(ckpt_dir_or_file, name):
   """Returns the tensor value of the given variable in the checkpoint.
 
@@ -77,6 +80,7 @@ def load_variable(ckpt_dir_or_file, name):
   return reader.get_tensor(name)
 
 
+@tf_export("train.list_variables")
 def list_variables(ckpt_dir_or_file):
   """Returns list of all variables in the checkpoint.
 
@@ -95,6 +99,7 @@ def list_variables(ckpt_dir_or_file):
   return result
 
 
+@tf_export("train.init_from_checkpoint")
 def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
   """Initializes current variables with tensors loaded from given checkpoint.
 
diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py
index 0e31255b74..0ff97d85e3 100644
--- a/tensorflow/python/training/coordinator.py
+++ b/tensorflow/python/training/coordinator.py
@@ -27,8 +27,10 @@ import six
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.Coordinator")
 class Coordinator(object):
   """A coordinator for threads.
 
@@ -406,6 +408,7 @@ class Coordinator(object):
 
 
 # Threads for the standard services.
+@tf_export("train.LooperThread")
 class LooperThread(threading.Thread):
   """A thread that runs code repeatedly, optionally on a timer.
 
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index 37ab625779..689088bb41 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -23,6 +23,7 @@ from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _RoundRobinStrategy(object):
@@ -121,6 +122,7 @@ class _ReplicaDeviceChooser(object):
     return worker_device.to_string()
 
 
+@tf_export("train.replica_device_setter")
 def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
                           worker_device="/job:worker", merge_devices=True,
                           cluster=None, ps_ops=None, ps_strategy=None):
diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py
index c64a1b3f79..9d02e694db 100644
--- a/tensorflow/python/training/ftrl.py
+++ b/tensorflow/python/training/ftrl.py
@@ -22,8 +22,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.FtrlOptimizer")
 class FtrlOptimizer(optimizer.Optimizer):
   """Optimizer that implements the FTRL algorithm.
 
@@ -265,4 +267,3 @@ class FtrlOptimizer(optimizer.Optimizer):
                         grad.dtype),
           math_ops.cast(self._learning_rate_power_tensor, grad.dtype),
           use_locking=self._use_locking)
-
diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py
index 5a536e2729..380e14e024 100644
--- a/tensorflow/python/training/gradient_descent.py
+++ b/tensorflow/python/training/gradient_descent.py
@@ -23,8 +23,10 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.GradientDescentOptimizer")
 class GradientDescentOptimizer(optimizer.Optimizer):
   """Optimizer that implements the gradient descent algorithm.
   """
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 331a51e8bc..992184ec9e 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.summary import summary
 from tensorflow.python.training import queue_runner
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access
@@ -53,6 +54,7 @@ _restore_sparse = sparse_ops._take_many_sparse_from_tensors_map
 # pylint: enable=protected-access
 
 
+@tf_export("train.match_filenames_once")
 def match_filenames_once(pattern, name=None):
   """Save the list of files matching pattern, so it is only computed once.
 
@@ -70,6 +72,7 @@ def match_filenames_once(pattern, name=None):
         collections=[ops.GraphKeys.LOCAL_VARIABLES])
 
 
+@tf_export("train.limit_epochs")
 def limit_epochs(tensor, num_epochs=None, name=None):
   """Returns tensor `num_epochs` times and then raises an `OutOfRange` error.
 
@@ -102,6 +105,7 @@ def limit_epochs(tensor, num_epochs=None, name=None):
       return array_ops.identity(tensor, name=name)
 
 
+@tf_export("train.input_producer")
 def input_producer(input_tensor,
                    element_shape=None,
                    num_epochs=None,
@@ -184,6 +188,7 @@ def input_producer(input_tensor,
     return q
 
 
+@tf_export("train.string_input_producer")
 def string_input_producer(string_tensor,
                           num_epochs=None,
                           shuffle=True,
@@ -253,6 +258,7 @@ def string_input_producer(string_tensor,
         cancel_op=cancel_op)
 
 
+@tf_export("train.range_input_producer")
 def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None,
                          capacity=32, shared_name=None, name=None):
   """Produces the integers from 0 to limit-1 in a queue.
@@ -290,6 +296,7 @@ def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None,
         shared_name, "fraction_of_%d_full" % capacity, name)
 
 
+@tf_export("train.slice_input_producer")
 def slice_input_producer(tensor_list, num_epochs=None, shuffle=True, seed=None,
                          capacity=32, shared_name=None, name=None):
   """Produces a slice of each `Tensor` in `tensor_list`.
@@ -885,6 +892,7 @@ def _shuffle_batch_join(tensors_list, batch_size, capacity,
 # Batching functions ----------------------------------------------------------
 
 
+@tf_export("train.batch")
 def batch(tensors, batch_size, num_threads=1, capacity=32,
           enqueue_many=False, shapes=None, dynamic_pad=False,
           allow_smaller_final_batch=False, shared_name=None, name=None):
@@ -979,6 +987,7 @@ def batch(tensors, batch_size, num_threads=1, capacity=32,
       name=name)
 
 
+@tf_export("train.maybe_batch")
 def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32,
                 enqueue_many=False, shapes=None, dynamic_pad=False,
                 allow_smaller_final_batch=False, shared_name=None, name=None):
@@ -1031,6 +1040,7 @@ def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32,
       name=name)
 
 
+@tf_export("train.batch_join")
 def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False,
                shapes=None, dynamic_pad=False, allow_smaller_final_batch=False,
                shared_name=None, name=None):
@@ -1136,6 +1146,7 @@ def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False,
       name=name)
 
 
+@tf_export("train.maybe_batch_join")
 def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32,
                      enqueue_many=False, shapes=None, dynamic_pad=False,
                      allow_smaller_final_batch=False, shared_name=None,
@@ -1188,6 +1199,7 @@ def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32,
       name=name)
 
 
+@tf_export("train.shuffle_batch")
 def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
                   num_threads=1, seed=None, enqueue_many=False, shapes=None,
                   allow_smaller_final_batch=False, shared_name=None, name=None):
@@ -1287,6 +1299,7 @@ def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
       name=name)
 
 
+@tf_export("train.maybe_shuffle_batch")
 def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
                         keep_input, num_threads=1, seed=None,
                         enqueue_many=False, shapes=None,
@@ -1346,6 +1359,7 @@ def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
       name=name)
 
 
+@tf_export("train.shuffle_batch_join")
 def shuffle_batch_join(tensors_list, batch_size, capacity,
                        min_after_dequeue, seed=None, enqueue_many=False,
                        shapes=None, allow_smaller_final_batch=False,
@@ -1439,6 +1453,7 @@ def shuffle_batch_join(tensors_list, batch_size, capacity,
       name=name)
 
 
+@tf_export("train.maybe_shuffle_batch_join")
 def maybe_shuffle_batch_join(tensors_list, batch_size, capacity,
                              min_after_dequeue, keep_input, seed=None,
                              enqueue_many=False, shapes=None,
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 343a49cded..10ab4c1137 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -25,8 +25,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.exponential_decay")
 def exponential_decay(learning_rate,
                       global_step,
                       decay_steps,
@@ -103,6 +105,7 @@ def exponential_decay(learning_rate,
         learning_rate, math_ops.pow(decay_rate, p), name=name)
 
 
+@tf_export("train.piecewise_constant")
 def piecewise_constant(x, boundaries, values, name=None):
   """Piecewise constant from boundaries and interval values.
 
@@ -182,6 +185,7 @@ def piecewise_constant(x, boundaries, values, name=None):
     return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
 
 
+@tf_export("train.polynomial_decay")
 def polynomial_decay(learning_rate,
                      global_step,
                      decay_steps,
@@ -291,6 +295,7 @@ def polynomial_decay(learning_rate,
         name=name)
 
 
+@tf_export("train.natural_exp_decay")
 def natural_exp_decay(learning_rate,
                       global_step,
                       decay_steps,
@@ -362,6 +367,7 @@ def natural_exp_decay(learning_rate,
     return math_ops.multiply(learning_rate, exponent, name=name)
 
 
+@tf_export("train.inverse_time_decay")
 def inverse_time_decay(learning_rate,
                        global_step,
                        decay_steps,
@@ -444,6 +450,7 @@ def inverse_time_decay(learning_rate,
     return math_ops.div(learning_rate, denom, name=name)
 
 
+@tf_export("train.cosine_decay")
 def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
   """Applies cosine decay to the learning rate.
 
@@ -503,6 +510,7 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
     return math_ops.multiply(learning_rate, decayed)
 
 
+@tf_export("train.cosine_decay_restarts")
 def cosine_decay_restarts(learning_rate,
                           global_step,
                           first_decay_steps,
@@ -596,6 +604,7 @@ def cosine_decay_restarts(learning_rate,
   return math_ops.multiply(learning_rate, decayed, name=name)
 
 
+@tf_export("train.linear_cosine_decay")
 def linear_cosine_decay(learning_rate,
                         global_step,
                         decay_steps,
@@ -679,6 +688,7 @@ def linear_cosine_decay(learning_rate,
     return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
 
 
+@tf_export("train.noisy_linear_cosine_decay")
 def noisy_linear_cosine_decay(learning_rate,
                               global_step,
                               decay_steps,
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index cf9530d87c..bd9fa79d8f 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -22,8 +22,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.MomentumOptimizer")
 class MomentumOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Momentum algorithm.
 
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index fa3517db27..6c5c9e01a7 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -41,6 +41,7 @@ from tensorflow.python.training import queue_runner
 from tensorflow.python.training import saver as training_saver
 from tensorflow.python.training import session_manager as sm
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.util.tf_export import tf_export
 
 
 # The list of exceptions that we should recover from. Exceptions not in this
@@ -52,6 +53,7 @@ _PREEMPTION_ERRORS = (errors.AbortedError, errors.UnavailableError)
 USE_DEFAULT = object()
 
 
+@tf_export('train.Scaffold')
 class Scaffold(object):
   """Structure to create or gather pieces commonly needed to train a model.
 
@@ -272,6 +274,7 @@ class Scaffold(object):
         resources.initialize_resources(resources.local_resources()))
 
 
+@tf_export('train.MonitoredTrainingSession')
 def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              is_chief=True,
                              checkpoint_dir=None,
@@ -381,6 +384,7 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                           stop_grace_period_secs=stop_grace_period_secs)
 
 
+@tf_export('train.SessionCreator')
 class SessionCreator(object):
   """A factory for tf.Session."""
 
@@ -390,6 +394,7 @@ class SessionCreator(object):
         'create_session is not implemented for {}.'.format(self))
 
 
+@tf_export('train.ChiefSessionCreator')
 class ChiefSessionCreator(SessionCreator):
   """Creates a tf.Session for a chief."""
 
@@ -441,6 +446,7 @@ class ChiefSessionCreator(SessionCreator):
         init_fn=self._scaffold.init_fn)
 
 
+@tf_export('train.WorkerSessionCreator')
 class WorkerSessionCreator(SessionCreator):
   """Creates a tf.Session for a worker."""
 
@@ -706,6 +712,7 @@ class _MonitoredSession(object):
     return self._coordinated_creator.tf_sess
 
 
+@tf_export('train.MonitoredSession')
 class MonitoredSession(_MonitoredSession):
   """Session-like object that handles initialization, recovery and hooks.
 
@@ -788,6 +795,7 @@ class MonitoredSession(_MonitoredSession):
         stop_grace_period_secs=stop_grace_period_secs)
 
 
+@tf_export('train.SingularMonitoredSession')
 class SingularMonitoredSession(_MonitoredSession):
   """Session-like object that handles initialization, restoring, and hooks.
 
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 43ed1ac170..2d89082ad7 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -26,6 +26,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import slot_creator
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(touts): switch to variables.Variable.
@@ -230,6 +231,7 @@ def _zero_debias(unbiased_var, value, decay):
       return unbiased_ema_delta
 
 
+@tf_export("train.ExponentialMovingAverage")
 class ExponentialMovingAverage(object):
   """Maintains moving averages of variables by employing an exponential decay.
 
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index a06b3eada6..425dbd8313 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import slot_creator
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _get_variable_for(v):
@@ -187,6 +188,7 @@ def _get_processor(v):
   raise NotImplementedError("Trying to optimize unsupported type ", v)
 
 
+@tf_export("train.Optimizer")
 class Optimizer(object):
   """Base class for optimizers.
 
diff --git a/tensorflow/python/training/proximal_adagrad.py b/tensorflow/python/training/proximal_adagrad.py
index da31ab325d..9bd677b8ef 100644
--- a/tensorflow/python/training/proximal_adagrad.py
+++ b/tensorflow/python/training/proximal_adagrad.py
@@ -23,8 +23,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.ProximalAdagradOptimizer")
 class ProximalAdagradOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the Proximal Adagrad algorithm.
diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py
index 53e9dc2ef2..369b6cbb50 100644
--- a/tensorflow/python/training/proximal_gradient_descent.py
+++ b/tensorflow/python/training/proximal_gradient_descent.py
@@ -24,8 +24,10 @@ from tensorflow.python.ops import math_ops
 # pylint: enable=unused-import
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.ProximalGradientDescentOptimizer")
 class ProximalGradientDescentOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the proximal gradient descent algorithm.
diff --git a/tensorflow/python/training/queue_runner_impl.py b/tensorflow/python/training/queue_runner_impl.py
index 4e7c81d7b2..07afba79ab 100644
--- a/tensorflow/python/training/queue_runner_impl.py
+++ b/tensorflow/python/training/queue_runner_impl.py
@@ -27,8 +27,10 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.queue_runner.QueueRunner", "train.QueueRunner")
 class QueueRunner(object):
   """Holds a list of enqueue operations for a queue, each to be run in a thread.
 
@@ -384,6 +386,7 @@ class QueueRunner(object):
                        import_scope=import_scope)
 
 
+@tf_export("train.queue_runner.add_queue_runner", "train.add_queue_runner")
 def add_queue_runner(qr, collection=ops.GraphKeys.QUEUE_RUNNERS):
   """Adds a `QueueRunner` to a collection in the graph.
 
@@ -402,6 +405,8 @@ def add_queue_runner(qr, collection=ops.GraphKeys.QUEUE_RUNNERS):
   ops.add_to_collection(collection, qr)
 
 
+@tf_export("train.queue_runner.start_queue_runners",
+           "train.start_queue_runners")
 def start_queue_runners(sess=None, coord=None, daemon=True, start=True,
                         collection=ops.GraphKeys.QUEUE_RUNNERS):
   """Starts all queue runners collected in the graph.
diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py
index 745e612018..89d1099a49 100644
--- a/tensorflow/python/training/rmsprop.py
+++ b/tensorflow/python/training/rmsprop.py
@@ -46,8 +46,10 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.RMSPropOptimizer")
 class RMSPropOptimizer(optimizer.Optimizer):
   """Optimizer that implements the RMSProp algorithm.
 
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index abc700b810..3888e9bba4 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -53,6 +53,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Op names which identify variable reads which should be saved.
@@ -889,6 +890,7 @@ def _GetCheckpointFilename(save_dir, latest_filename):
   return os.path.join(save_dir, latest_filename)
 
 
+@tf_export("train.generate_checkpoint_state_proto")
 def generate_checkpoint_state_proto(save_dir,
                                     model_checkpoint_path,
                                     all_model_checkpoint_paths=None):
@@ -933,6 +935,7 @@ def generate_checkpoint_state_proto(save_dir,
   return coord_checkpoint_proto
 
 
+@tf_export("train.update_checkpoint_state")
 def update_checkpoint_state(save_dir,
                             model_checkpoint_path,
                             all_model_checkpoint_paths=None,
@@ -1025,6 +1028,7 @@ def _update_checkpoint_state(save_dir,
                                       text_format.MessageToString(ckpt))
 
 
+@tf_export("train.get_checkpoint_state")
 def get_checkpoint_state(checkpoint_dir, latest_filename=None):
   """Returns CheckpointState proto from the "checkpoint" file.
 
@@ -1082,6 +1086,7 @@ def get_checkpoint_state(checkpoint_dir, latest_filename=None):
   return ckpt
 
 
+@tf_export("train.Saver")
 class Saver(object):
   """Saves and restores variables.
 
@@ -1783,6 +1788,7 @@ def _prefix_to_checkpoint_path(prefix, format_version):
   return prefix  # Just the data file.
 
 
+@tf_export("train.latest_checkpoint")
 def latest_checkpoint(checkpoint_dir, latest_filename=None):
   """Finds the filename of latest saved checkpoint file.
 
@@ -1812,6 +1818,7 @@ def latest_checkpoint(checkpoint_dir, latest_filename=None):
   return None
 
 
+@tf_export("train.import_meta_graph")
 def import_meta_graph(meta_graph_or_file, clear_devices=False,
                       import_scope=None, **kwargs):
   """Recreates a Graph saved in a `MetaGraphDef` proto.
@@ -1913,6 +1920,7 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
       return None
 
 
+@tf_export("train.export_meta_graph")
 def export_meta_graph(filename=None,
                       meta_info_def=None,
                       graph_def=None,
@@ -1989,6 +1997,7 @@ def export_meta_graph(filename=None,
   return meta_graph_def
 
 
+@tf_export("train.checkpoint_exists")
 def checkpoint_exists(checkpoint_prefix):
   """Checks whether a V1 or V2 checkpoint exists with the specified prefix.
 
@@ -2013,6 +2022,7 @@ def checkpoint_exists(checkpoint_prefix):
     return False
 
 
+@tf_export("train.get_checkpoint_mtimes")
 def get_checkpoint_mtimes(checkpoint_prefixes):
   """Returns the mtimes (modification timestamps) of the checkpoints.
 
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index 29da67a30a..2f421d1cc0 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -23,6 +23,7 @@ from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _make_server_def(server_or_cluster_def, job_name, task_index, protocol,
@@ -92,6 +93,7 @@ def _make_server_def(server_or_cluster_def, job_name, task_index, protocol,
   return server_def
 
 
+@tf_export("train.Server")
 class Server(object):
   """An in-process TensorFlow server, for use in distributed training.
 
@@ -221,6 +223,7 @@ class Server(object):
                   start=start)
 
 
+@tf_export("train.ClusterSpec")
 class ClusterSpec(object):
   """Represents a cluster as a set of "tasks", organized into "jobs".
 
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index b396a1e7d0..360e02fb44 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as saver_mod
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _maybe_name(obj):
@@ -44,6 +45,7 @@ def _maybe_name(obj):
     return "<no name for %s>" % type(obj)
 
 
+@tf_export("train.SessionManager")
 class SessionManager(object):
   """Training helper that restores from checkpoint and creates session.
 
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py
index 5b023d8a26..89f4030065 100644
--- a/tensorflow/python/training/session_run_hook.py
+++ b/tensorflow/python/training/session_run_hook.py
@@ -96,8 +96,10 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.SessionRunHook")
 class SessionRunHook(object):
   """Hook to extend calls to MonitoredSession.run()."""
 
@@ -189,6 +191,7 @@ class SessionRunHook(object):
     pass
 
 
+@tf_export("train.SessionRunArgs")
 class SessionRunArgs(
     collections.namedtuple("SessionRunArgs",
                            ["fetches", "feed_dict", "options"])):
@@ -213,6 +216,7 @@ class SessionRunArgs(
     return super(SessionRunArgs, cls).__new__(cls, fetches, feed_dict, options)
 
 
+@tf_export("train.SessionRunContext")
 class SessionRunContext(object):
   """Provides information about the `session.run()` call being made.
 
@@ -264,6 +268,7 @@ class SessionRunContext(object):
     self._stop_requested = True
 
 
+@tf_export("train.SessionRunValues")
 class SessionRunValues(
     collections.namedtuple("SessionRunValues",
                            ["results", "options", "run_metadata"])):
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index e4514aaea2..d2ad34773e 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -37,8 +37,10 @@ from tensorflow.python.training import saver as saver_mod
 from tensorflow.python.training import session_manager as session_manager_mod
 from tensorflow.python.training import training_util
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.Supervisor")
 class Supervisor(object):
   """A training helper that checkpoints models and computes summaries.
 
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index 47702fdad0..0c6cf910d1 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -31,6 +31,7 @@ from tensorflow.python.training import optimizer
 from tensorflow.python.training import queue_runner
 from tensorflow.python.training import session_manager
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Please note that the gradients from replicas are averaged instead of summed
@@ -38,6 +39,7 @@ from tensorflow.python.training import session_run_hook
 # rate according to the number of replicas. This change is introduced to be
 # consistent with how gradients are aggregated (averaged) within a batch in a
 # replica.
+@tf_export("train.SyncReplicasOptimizer")
 class SyncReplicasOptimizer(optimizer.Optimizer):
   """Class to synchronize, aggregate gradients and pass them to the optimizer.
 
diff --git a/tensorflow/python/training/training_util.py b/tensorflow/python/training/training_util.py
index 89a9e12932..499f1feb2d 100644
--- a/tensorflow/python/training/training_util.py
+++ b/tensorflow/python/training/training_util.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Picked a long key value to minimize the chance of collision with user defined
@@ -40,6 +41,7 @@ GLOBAL_STEP_READ_KEY = 'global_step_read_op_cache'
 write_graph = graph_io.write_graph
 
 
+@tf_export('train.global_step')
 def global_step(sess, global_step_tensor):
   """Small helper to get the global step.
 
@@ -67,6 +69,7 @@ def global_step(sess, global_step_tensor):
   return int(sess.run(global_step_tensor))
 
 
+@tf_export('train.get_global_step')
 def get_global_step(graph=None):
   """Get the global step tensor.
 
@@ -101,6 +104,7 @@ def get_global_step(graph=None):
   return global_step_tensor
 
 
+@tf_export('train.create_global_step')
 def create_global_step(graph=None):
   """Create global step tensor in graph.
 
@@ -139,6 +143,7 @@ def create_global_step(graph=None):
                      ops.GraphKeys.GLOBAL_STEP])
 
 
+@tf_export('train.get_or_create_global_step')
 def get_or_create_global_step(graph=None):
   """Returns and create (if necessary) the global step tensor.
 
@@ -156,6 +161,7 @@ def get_or_create_global_step(graph=None):
   return global_step_tensor
 
 
+@tf_export('train.assert_global_step')
 def assert_global_step(global_step_tensor):
   """Asserts `global_step_tensor` is a scalar int `Variable` or `Tensor`.
 
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index d110316395..036bdd6d29 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -77,6 +77,7 @@ genrule(
         "api/nn/rnn_cell/__init__.py",
         "api/sets/__init__.py",
         "api/summary/__init__.py",
+        "api/train/queue_runner/__init__.py",
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
-- 
GitLab


From ad2a242edb6f6563e6993d8de2b05154a3a1ac26 Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Tue, 30 Jan 2018 21:34:02 -0800
Subject: [PATCH 450/966] Adds input_fn-return-dataset for Per-Host input
 pipeline deployment.

PiperOrigin-RevId: 183940666
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 180 ++++++++++++++----
 1 file changed, 146 insertions(+), 34 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index bc55dbcb50..23960bb030 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -41,6 +41,7 @@ from tensorflow.contrib.tpu.python.tpu import util as util_lib
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import util
@@ -70,7 +71,12 @@ _BATCH_SIZE_KEY = 'batch_size'
 _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
 _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 
-# TODO(b/65703635): Flip the value and remove all dead code.
+
+# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
+# only used for per-core based deployments. For per-host based pipelines, if a
+# user returns a Dataset instance it will be automatically wrapped in a
+# tf.while_loop (This can be disabled by returning features and labels
+# explicitly).
 _WRAP_INPUT_FN_INTO_WHILE_LOOP = False
 
 
@@ -499,12 +505,19 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
      dequeue.
   """
 
-  def __init__(self, ctx, enqueue_ops, dequeue_ops):
+  def __init__(self,
+               ctx,
+               enqueue_ops,
+               dequeue_ops,
+               run_infeed_loop_on_coordinator=True):
     self._master_job = ctx.master_job
     self._enqueue_ops = enqueue_ops
     self._dequeue_ops = dequeue_ops
+
+    self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
     self._initial_infeed_sleep_secs = (
         ctx.config.tpu_config.initial_infeed_sleep_secs)
+
     self._session_cancel_timer = None
 
     self._feed_error = None
@@ -587,15 +600,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
       logging.info('%s thread starting after sleep', self._name)
 
     try:
-      if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-        for _ in queue_ctx.read_iteration_counts():
-          session.run(self._enqueue_ops)
-      else:
+      if self._run_infeed_loop_on_coordinator:
         for count, steps in enumerate(queue_ctx.read_iteration_counts()):
           for i in xrange(steps):
             logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
             session.run(self._enqueue_ops)
-      logging.debug('Infeed thread finished, shutting down.')
+      else:
+        for _ in queue_ctx.read_iteration_counts():
+          session.run(self._enqueue_ops)
+      logging.info('Infeed thread finished, shutting down.')
     except Exception as e:  # pylint: disable=broad-except
       self._log_error(session, e)
 
@@ -606,6 +619,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
         for i in xrange(steps):
           logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
           session.run(self._dequeue_ops)
+      logging.info('Outfeed thread finished, shutting down.')
     except Exception as e:  # pylint: disable=broad-except
       self._log_error(session, e)
 
@@ -633,7 +647,6 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
     self._infeed_controller.send_next_batch_signal(iterations)
 
-    # TODO(xiejw): Refactor the outfeed dequeue into tf.while_loop.
     logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
                  iterations)
     self._outfeed_controller.send_next_batch_signal(iterations)
@@ -752,11 +765,14 @@ def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn,
     per_host_sharded_inputs = []
     for core_ordinal in range(num_cores_per_host):
       with ops.name_scope('ordinal_%d' % (core_ordinal)):
-        inputs = input_fn()
-        if isinstance(inputs, tuple):
-          features, labels = inputs
-        else:
-          features, labels = inputs, None
+        inputs = _Inputs.from_input_fn(input_fn())
+        if inputs.is_dataset:
+          raise TypeError(
+              '`input_fn` returning `Dataset`  is not yet supported in '
+              'per-Core input pipeline deployment yet. Please set '
+              'TPUConfig.per_host_input_for_training to True or return '
+              '`features` and `labels` from `input_fn`')
+        features, labels = inputs.features_and_labels()
 
         inputs_structure_recorder.validate_and_record_structure(
             features, labels)
@@ -783,14 +799,23 @@ def generate_per_host_enqueue_ops_fn_for_host(
   """Generates infeed enqueue ops for per-host input_fn on a single host."""
   captured_infeed_queue = _CapturedObject()
 
+  hooks = []
+
+  with ops.device(device):
+    inputs = _Inputs.from_input_fn(input_fn())
+
+    is_dataset = inputs.is_dataset
+    if is_dataset:
+      hooks.append(inputs.dataset_initializer_hook())
+
   def enqueue_ops_fn():
     with ops.device(device):
       num_cores_per_host = ctx.num_of_cores_per_host
-      inputs = input_fn()
-      if isinstance(inputs, tuple):
-        features, labels = inputs
-      else:
-        features, labels = inputs, None
+      # Convert user input to features and labels.  If the user returns a
+      # dataset, it is initialized and the features and labels extracted via
+      # `dataset.iterator.get_next()`
+      features, labels = inputs.features_and_labels()
+
       inputs_structure_recorder.validate_and_record_structure(features, labels)
       unsharded_tensor_list = (
           inputs_structure_recorder.flatten_features_and_labels(
@@ -808,7 +833,7 @@ def generate_per_host_enqueue_ops_fn_for_host(
               unsharded_tensor_list, placement_function=lambda x: device))
       return per_host_enqueue_ops
 
-  return enqueue_ops_fn, captured_infeed_queue
+  return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
 
 
 class _InputPipeline(object):
@@ -944,7 +969,7 @@ class _InputPipeline(object):
         # Single tensor case.
         unflattened_label = flattened_inputs[expected_num_features]
 
-      return unflattened_features, unflattened_label
+      return _Inputs(unflattened_features, unflattened_label)
 
   def __init__(self, input_fn, batch_axis, ctx):
     """Constructor.
@@ -972,7 +997,8 @@ class _InputPipeline(object):
     # While tf.while_loop is called, the body function, which invokes
     # `enqueue_fn` passed in, is called to construct the graph. So, input_fn
     # structure is recorded.
-    enqueue_ops = self._invoke_input_fn_and_record_structure()
+    enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = (
+        self._invoke_input_fn_and_record_structure())
 
     self._validate_input_pipeline()
 
@@ -983,14 +1009,18 @@ class _InputPipeline(object):
       return self._inputs_structure_recorder.unflatten_features_and_labels(
           values)
 
-    return (enqueue_ops, dequeue_fn)
+    return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator)
 
   def _invoke_input_fn_and_record_structure(self):
     """Deploys the input pipeline and record input structure."""
     enqueue_ops = []
     infeed_queues = []
+    all_hooks = []
     num_hosts = self._ctx.num_hosts
     tpu_host_placement_fn = self._ctx.tpu_host_placement_function
+
+    run_infeed_loop_on_coordinator = True
+
     if self._sharded_per_core:
       # Per-Core input pipeline deployment.
       # Invoke input pipeline for each core and placed on the corresponding
@@ -1004,6 +1034,7 @@ class _InputPipeline(object):
                     self._ctx, self._input_fn, self._inputs_structure_recorder))
 
             if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+              run_infeed_loop_on_coordinator = False
               enqueue_ops.append(
                   _wrap_computation_in_while_loop(
                       device=host_device, op_fn=enqueue_ops_fn))
@@ -1017,12 +1048,26 @@ class _InputPipeline(object):
         host_device = tpu_host_placement_fn(host_id=host_id)
         with ops.device(host_device):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, captured_infeed_queue = (
+            enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
                 generate_per_host_enqueue_ops_fn_for_host(
                     self._ctx, self._input_fn, self._inputs_structure_recorder,
                     self._batch_axis, host_device))
-
-            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+            all_hooks.extend(hooks)
+
+            # NOTE(xiejw): We dispatch here based on the return type of the
+            # users `input_fn`.
+            #
+            # 1. If input_fn returns a Dataset instance, we initialize the
+            # iterator outside of tf.while_loop, and call the iterator.get_next
+            # inside tf.while_loop.  This should be always safe.
+            #
+            # 2. If input_fn returns (features, labels), it is too late to wrap
+            # them inside tf.while_loop, as resource initialization cannot be
+            # handled in TF control flow properly. In this case, we will use
+            # python loop to enqueue the data into TPU system.  This may be
+            # slow compared to the previous case.
+            if is_dataset:
+              run_infeed_loop_on_coordinator = False
               enqueue_ops.append(
                   _wrap_computation_in_while_loop(
                       device=host_device, op_fn=enqueue_ops_fn))
@@ -1033,7 +1078,7 @@ class _InputPipeline(object):
     # dequeue is dtypes and types. So, any one can be used. Here, grab the
     # first one.
     self._infeed_queue = infeed_queues[0]
-    return enqueue_ops
+    return enqueue_ops, all_hooks, run_infeed_loop_on_coordinator
 
   def _validate_input_pipeline(self):
     # Perform some sanity checks to log user friendly information. We should
@@ -1099,7 +1144,8 @@ class _ModelFnWrapper(object):
     def train_step(loss):
       """Training step function for use inside a while loop."""
       del loss  # unused; required in function signature.
-      features, labels = dequeue_fn()
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
 
       estimator_spec = self._verify_estimator_spec(
           self._call_model_fn(features, labels))
@@ -1150,7 +1196,8 @@ class _ModelFnWrapper(object):
 
     def eval_step(total_loss):
       """Evaluation step function for use inside a while loop."""
-      features, labels = dequeue_fn()
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
 
       tpu_estimator_spec = self._call_model_fn(features, labels)
       if not isinstance(tpu_estimator_spec, TPUEstimatorSpec):
@@ -1757,7 +1804,7 @@ class TPUEstimator(estimator_lib.Estimator):
         input_fn = features
 
         input_holders = _InputPipeline(input_fn, batch_axis, ctx)
-        enqueue_ops, dequeue_fn = (
+        enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
             input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
 
         if mode == model_fn_lib.ModeKeys.TRAIN:
@@ -1767,7 +1814,12 @@ class TPUEstimator(estimator_lib.Estimator):
           if host_ops is None:
             host_ops = []
           hooks = [
-              TPUInfeedOutfeedSessionHook(ctx, enqueue_ops, host_ops),
+              TPUInfeedOutfeedSessionHook(
+                  ctx,
+                  enqueue_ops,
+                  host_ops,
+                  run_infeed_loop_on_coordinator=(
+                      run_infeed_loop_on_coordinator)),
               ExamplesPerSecondHook(ctx.global_batch_size),
               InstallSignalHandlerHook(),
               training.LoggingTensorHook(
@@ -1776,7 +1828,7 @@ class TPUEstimator(estimator_lib.Estimator):
                       'step': training.get_global_step()
                   },
                   every_n_secs=30)
-          ]
+          ] + input_hooks
           summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
           with ops.control_dependencies([loss]):
             update_ops = _sync_variables_ops()
@@ -1826,9 +1878,12 @@ class TPUEstimator(estimator_lib.Estimator):
         else:
           host_ops = host_call_ret['host_call']
         hooks = [
-            TPUInfeedOutfeedSessionHook(ctx, enqueue_ops,
-                                        eval_update_ops + host_ops),
-        ]
+            TPUInfeedOutfeedSessionHook(
+                ctx,
+                enqueue_ops,
+                eval_update_ops + host_ops,
+                run_infeed_loop_on_coordinator=run_infeed_loop_on_coordinator),
+        ] + input_hooks
 
         return model_fn_lib.EstimatorSpec(
             mode,
@@ -1996,3 +2051,60 @@ class _CapturingContext(control_flow_ops.ControlFlowContext):
 
   def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
     self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
+
+
+# TODO(xiejw): Extend this to support internal signal.
+class _Inputs(object):
+  """A data structure representing the input_fn returned values.
+
+  This also supports the returned value from input_fn as `Dataset`.
+  """
+
+  def __init__(self, features=None, labels=None, dataset=None):
+    if dataset is not None and (features is not None or labels is not None):
+      raise RuntimeError('Internal Error: Either (features and labels) or '
+                         'dataset should be provided, not both. Please file '
+                         'bug')
+
+    self._features = features
+    self._labels = labels
+
+    self._dataset = dataset
+    self._iterator = None
+
+  @staticmethod
+  def from_input_fn(return_values):
+    """Returns an `_Inputs` instance according to `input_fn` return value."""
+    if isinstance(return_values, dataset_ops.Dataset):
+      dataset = return_values
+      return _Inputs(dataset=dataset)
+
+    if isinstance(return_values, tuple):
+      features, labels = return_values
+    else:
+      features, labels = return_values, None
+    return _Inputs(features, labels)
+
+  @property
+  def is_dataset(self):
+    """Returns True if the return value from input_fn is Dataset."""
+    return self._dataset is not None
+
+  def dataset_initializer_hook(self):
+    """Returns a `SessionRunHook` to initialize this dataset.
+
+    This must be called before `features_and_labels`.
+    """
+    iterator = self._dataset.make_initializable_iterator()
+    # pylint: disable=protected-access
+    hook = estimator_lib._DatasetInitializerHook(iterator)
+    self._iterator = iterator
+    return hook
+
+  def features_and_labels(self):
+    """Gets `features` and `labels`."""
+    if self.is_dataset:
+      return (_Inputs.from_input_fn(
+          self._iterator.get_next()).features_and_labels())
+
+    return (self._features, self._labels)
-- 
GitLab


From 66cffe9e4520208c0097283752bd2bfdbea94ae2 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Tue, 30 Jan 2018 23:15:28 -0800
Subject: [PATCH 451/966] Go: Support control dependencies.

Fixes #16464

PiperOrigin-RevId: 183946928
---
 tensorflow/go/graph.go         |  9 ++++++++-
 tensorflow/go/op/scope.go      | 27 +++++++++++++++++++++++----
 tensorflow/go/op/scope_test.go | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index fc087d9d99..08943a527c 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -173,7 +173,11 @@ type OpSpec struct {
 	// operation.
 	Attrs map[string]interface{}
 
-	// Other possible fields: Device, ColocateWith, ControlInputs.
+	// Operations that must be executed before executing the operation
+	// being added.
+	ControlDependencies []*Operation
+
+	// Other possible fields: Device, ColocateWith.
 }
 
 // AddOperation adds an operation to g.
@@ -204,6 +208,9 @@ func (g *Graph) AddOperation(args OpSpec) (*Operation, error) {
 			}
 		}
 	}
+	for _, in := range args.ControlDependencies {
+		C.TF_AddControlInput(cdesc, in.c)
+	}
 	status := newStatus()
 	for name, value := range args.Attrs {
 		if err := setAttr(cdesc, status, name, value); err != nil {
diff --git a/tensorflow/go/op/scope.go b/tensorflow/go/op/scope.go
index a9ec79463a..2cf1f30187 100644
--- a/tensorflow/go/op/scope.go
+++ b/tensorflow/go/op/scope.go
@@ -33,10 +33,11 @@ import (
 // A Scope object and all its derivates (e.g., obtained from Scope.SubScope)
 // are not safe for concurrent use by multiple goroutines.
 type Scope struct {
-	graph     *tf.Graph
-	namemap   map[string]int
-	namespace string
-	err       *scopeErr
+	graph               *tf.Graph
+	namemap             map[string]int
+	namespace           string
+	controlDependencies []*tf.Operation
+	err                 *scopeErr
 }
 
 // scopeErr is used to share errors between all derivatives of a root scope.
@@ -80,6 +81,7 @@ func (s *Scope) AddOperation(args tf.OpSpec) *tf.Operation {
 	if s.namespace != "" {
 		args.Name = s.namespace + "/" + args.Name
 	}
+	args.ControlDependencies = append(args.ControlDependencies, s.controlDependencies...)
 	op, err := s.graph.AddOperation(args)
 	if err != nil {
 		s.UpdateErr(args.Type, err)
@@ -103,6 +105,23 @@ func (s *Scope) SubScope(namespace string) *Scope {
 	}
 }
 
+// WithControlDependencies returns a new Scope which will cause all operations
+// added to the graph to execute only after all the provided operations have
+// executed first (in addition to any other control dependencies in s).
+func (s *Scope) WithControlDependencies(ops ...*tf.Operation) *Scope {
+	return &Scope{
+		graph:     s.graph,
+		namemap:   s.namemap,
+		namespace: s.namespace,
+		// append(ops, s.controlDependencies) and not the other way
+		// around so that we end up with a copy of the underlying array
+		// (and other calls to s.WithControlDependencies() do not stomp
+		// on each other).
+		controlDependencies: append(ops, s.controlDependencies...),
+		err:                 s.err,
+	}
+}
+
 // Err returns the error, if any, encountered during the construction
 // of the Graph managed by s.
 //
diff --git a/tensorflow/go/op/scope_test.go b/tensorflow/go/op/scope_test.go
index 6fb5d32e50..4f533881d0 100644
--- a/tensorflow/go/op/scope_test.go
+++ b/tensorflow/go/op/scope_test.go
@@ -69,6 +69,40 @@ func TestScopeSubScopeErrors(t *testing.T) {
 	}
 }
 
+func TestControlDependencies(t *testing.T) {
+	var (
+		s        = NewScope()
+		zero     = Const(s.SubScope("zero"), int32(0))
+		one      = Const(s.SubScope("one"), int32(1))
+		variable = VarHandleOp(s, tf.Int32, tf.ScalarShape())
+		init     = AssignVariableOp(s, variable, zero)
+		update   = AssignAddVariableOp(s, variable, one)
+		read     = ReadVariableOp(s.WithControlDependencies(update), variable, tf.Int32)
+	)
+	graph, err := s.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err = sess.Run(nil, nil, []*tf.Operation{init}); err != nil {
+		t.Fatal(err)
+	}
+	// Without the control dependency, the read operation may not see the
+	// update.
+	for i := int32(0); i < 10; i++ {
+		out, err := sess.Run(nil, []tf.Output{read}, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if got, want := out[0].Value().(int32), i+1; got != want {
+			t.Errorf("Got %d, want %d", got, want)
+		}
+	}
+}
+
 func TestScopeFinalize(t *testing.T) {
 	var (
 		root = NewScope()
-- 
GitLab


From 76b390539b296af43c9d3f4b5cfe0dfbf5b3cfd6 Mon Sep 17 00:00:00 2001
From: Eric Liu <ioeric@google.com>
Date: Wed, 31 Jan 2018 01:26:29 -0800
Subject: [PATCH 452/966] Remove a failing test for gpu numbers when XLA is
 enabled.

The test assumes that all devices are either CPU or GPU, which is not true when XLA is enabled.

PiperOrigin-RevId: 183956872
---
 tensorflow/contrib/eager/python/tfe_test.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py
index 0dedb2fd7c..b6659c2a17 100644
--- a/tensorflow/contrib/eager/python/tfe_test.py
+++ b/tensorflow/contrib/eager/python/tfe_test.py
@@ -102,10 +102,6 @@ class TFETest(test_util.TensorFlowTestCase):
     # Expect at least one device.
     self.assertTrue(tfe.list_devices())
 
-  def testNumGPUs(self):
-    devices = tfe.list_devices()
-    self.assertEqual(len(devices) - 1, tfe.num_gpus())
-
   def testAddCheckNumericsOpsRaisesError(self):
     with self.assertRaisesRegexp(
         RuntimeError,
-- 
GitLab


From a55c018ded951ff4530557026fc97e9e6736a336 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Wed, 31 Jan 2018 08:16:59 -0800
Subject: [PATCH 453/966] Fix docs generation for cluster_resolvers

Adds "cluster_resolver_pip" as a dependancy to opensource contrib, and applies a standard `remove_undocumented` to clear extra symbols.

Docs are build from a bazel bulid, and without this change the cluster resolvers are not directly accessible in "tf.contirb.cluster_resolver" during the docs build, so they do not get documented.

PiperOrigin-RevId: 183993115
---
 tensorflow/contrib/cluster_resolver/BUILD       |  1 +
 tensorflow/contrib/cluster_resolver/__init__.py | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD
index 15abd2be03..80e18a43a7 100644
--- a/tensorflow/contrib/cluster_resolver/BUILD
+++ b/tensorflow/contrib/cluster_resolver/BUILD
@@ -34,6 +34,7 @@ py_library(
         ":cluster_resolver_py",
         ":gce_cluster_resolver_py",
         ":tpu_cluster_resolver_py",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/contrib/cluster_resolver/__init__.py b/tensorflow/contrib/cluster_resolver/__init__.py
index d17501e87e..b4d8cd4a7c 100644
--- a/tensorflow/contrib/cluster_resolver/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/__init__.py
@@ -26,3 +26,15 @@ from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import
 from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver
 from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver
 # pylint: enable=wildcard-import,unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'ClusterResolver',
+    'SimpleClusterResolver',
+    'UnionClusterResolver',
+    'GceClusterResolver',
+    'TPUClusterResolver',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
-- 
GitLab


From 76989a191815bdd96390626db154676ac42b890d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 31 Jan 2018 09:40:48 -0800
Subject: [PATCH 454/966] Verify contents of tensors

PiperOrigin-RevId: 184003263
---
 tensorflow/contrib/BUILD                      |   1 +
 tensorflow/contrib/lite/tools/BUILD           |   4 +
 tensorflow/contrib/lite/tools/verifier.cc     | 170 ++++++++++++++-
 tensorflow/contrib/lite/tools/verifier.h      |   4 +-
 .../contrib/lite/tools/verifier_test.cc       | 200 +++++++++++++-----
 5 files changed, 325 insertions(+), 54 deletions(-)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index efb6449bb0..ac6f01365b 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -24,6 +24,7 @@ py_library(
         "//tensorflow/contrib/bayesflow:bayesflow_py",
         "//tensorflow/contrib/boosted_trees:init_py",
         "//tensorflow/contrib/cloud:cloud_py",
+        "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/coder:coder_ops_py",
         "//tensorflow/contrib/compiler:compiler_py",
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 1bffcfb987..4d3b553b22 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -99,8 +99,11 @@ cc_library(
     srcs = ["verifier.cc"],
     hdrs = ["verifier.h"],
     deps = [
+        "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
+        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
@@ -112,6 +115,7 @@ cc_test(
         ":verifier",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/contrib/lite/tools/verifier.cc
index 95a0895379..726e2aaa31 100644
--- a/tensorflow/contrib/lite/tools/verifier.cc
+++ b/tensorflow/contrib/lite/tools/verifier.cc
@@ -14,13 +14,32 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/lite/tools/verifier.h"
+#include <climits>
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/string_util.h"
 #include "tensorflow/contrib/lite/version.h"
 
 namespace tflite {
 
 namespace {
 
+// Reports error message when the reporter is set.
+void ReportError(ErrorReporter* error_reporter, const char* format, ...) {
+  if (error_reporter) {
+    va_list args;
+    va_start(args, format);
+    error_reporter->Report(format, args);
+    va_end(args);
+  }
+}
+
+// Returns the int32_t value pointed by ptr.
+const uint32_t* GetIntPtr(const char* ptr) {
+  return reinterpret_cast<const uint32_t*>(ptr);
+}
+
+// Verifies flatbuffer format of the model contents and returns the in-memory
+// model.
 const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) {
   ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
   if (VerifyModelBuffer(verifier)) {
@@ -30,14 +49,159 @@ const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) {
   }
 }
 
+const uint32_t kMaxNumString = UINT_MAX / sizeof(int32_t) - 2;
+
+// Verifies string tensor has legit buffer contents that follow the schema
+// defined in lite/string_util.h
+bool VerifyStringTensorBuffer(const Buffer& buffer,
+                              ErrorReporter* error_reporter) {
+  uint32_t buffer_size = buffer.data()->size();
+  const char* buffer_ptr = reinterpret_cast<const char*>(buffer.data()->data());
+
+  uint32_t num_strings = *GetIntPtr(buffer_ptr);
+  if (num_strings > kMaxNumString) {
+    ReportError(error_reporter,
+                "String tensor has invalid num of string set: %d", num_strings);
+    return false;
+  }
+  uint32_t header_offsets =
+      static_cast<uint32_t>(num_strings + 2) * sizeof(int32_t);
+
+  if (buffer_size < header_offsets) {
+    ReportError(error_reporter,
+                "String tensor buffer requires at least %d bytes, but is "
+                "allocated with %d bytes",
+                header_offsets, buffer_size);
+    return false;
+  }
+
+  uint32_t prev_ptr = header_offsets;
+  uint32_t offset = sizeof(int32_t);
+
+  if (*GetIntPtr(buffer_ptr + offset) != header_offsets) {
+    ReportError(error_reporter,
+                "String tensor buffer initial offset must be: %d",
+                header_offsets);
+    return false;
+  }
+  offset += sizeof(int32_t);
+  for (int i = 1; i <= num_strings; i++, offset += sizeof(int32_t)) {
+    int string_offset = *GetIntPtr(buffer_ptr + offset);
+    if (string_offset < prev_ptr || string_offset > buffer_size) {
+      ReportError(error_reporter, "String tensor buffer is invalid: index %d",
+                  i);
+      return false;
+    }
+  }
+  if (*GetIntPtr(buffer_ptr + offset - sizeof(int32_t)) != buffer_size) {
+    ReportError(error_reporter, "String tensor buffer last offset must be %d",
+                buffer_size);
+    return false;
+  }
+  return true;
+}
+
+// Verifies numeric tensor has legit buffer.
+bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
+                               ErrorReporter* error_reporter) {
+  uint64_t bytes_required = 1;
+  for (int dim : *tensor.shape()) {
+    bytes_required *= dim;
+    if (bytes_required > UINT_MAX) {
+      ReportError(error_reporter, "Tensor dimension overflow");
+      return false;
+    }
+  }
+  switch (tensor.type()) {
+    case TensorType_FLOAT32:
+      bytes_required *= sizeof(float);
+      break;
+    case TensorType_INT32:
+      bytes_required *= sizeof(int32_t);
+      break;
+    case TensorType_UINT8:
+      bytes_required *= sizeof(uint8_t);
+      break;
+    case TensorType_INT64:
+      bytes_required *= sizeof(int64_t);
+      break;
+    case TensorType_FLOAT16:
+      // FALLTHROUGH_INTENDED;
+    default:
+      ReportError(error_reporter, "Invalid tensor type: %d", tensor.type());
+      return false;
+  }
+  if (bytes_required > UINT_MAX) {
+    ReportError(error_reporter, "Tensor dimension overflow");
+    return false;
+  }
+
+  if (bytes_required != buffer.data()->size()) {
+    ReportError(
+        error_reporter,
+        "Tensor requires %d bytes, but is allocated with %d bytes buffer",
+        bytes_required, buffer.data()->size());
+    return false;
+  }
+  return true;
+
+  // TODO(yichengfan): verify quantized tensors.
+}
+
+// Verifies tensors have valid properties and legit buffer if set.
+bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
+  if (!model.subgraphs()) {
+    return true;
+  }
+  for (const auto& subgraph : *model.subgraphs()) {
+    if (!subgraph->tensors()) {
+      return true;
+    }
+    for (const auto& tensor : *subgraph->tensors()) {
+      if (!tensor->buffer()) {
+        return true;
+      }
+      if (tensor->buffer() >= model.buffers()->size()) {
+        ReportError(error_reporter, "Invalid tensor buffer index: %d",
+                    tensor->buffer());
+        return false;
+      }
+      auto* buffer = model.buffers()->Get(tensor->buffer());
+      if (!buffer || !buffer->data()) {
+        ReportError(error_reporter, "Tensor buffer %d not set",
+                    tensor->buffer());
+        return false;
+      }
+
+      if (tensor->type() == TensorType_STRING) {
+        if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
+          return false;
+        }
+      } else {
+        if (!VerifyNumericTensorBuffer(*tensor, *buffer, error_reporter)) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
 }  // namespace
 
-bool Verify(const void* buf, size_t len) {
+bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter) {
   const Model* model = VerifyFlatbufferAndGetModel(buf, len);
   if (model == nullptr) {
+    ReportError(error_reporter, "Invalid flatbuffer format");
     return false;
   }
-
-  return model->version() == TFLITE_SCHEMA_VERSION;
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    ReportError(error_reporter, "Invalid model version %d", model->version());
+    return false;
+  }
+  if (!VerifyTensors(*model, error_reporter)) {
+    return false;
+  }
+  return true;
 }
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/contrib/lite/tools/verifier.h
index 03e1f22b7e..d2bf3c91d5 100644
--- a/tensorflow/contrib/lite/tools/verifier.h
+++ b/tensorflow/contrib/lite/tools/verifier.h
@@ -18,13 +18,15 @@ limitations under the License.
 
 #include <stdio.h>
 
+#include "tensorflow/contrib/lite/error_reporter.h"
+
 namespace tflite {
 
 // Verifies the integrity of a Tensorflow Lite flatbuffer model file.
 // Currently, it verifies:
 // * The file is following a legit flatbuffer schema.
 // * The model is in supported version.
-bool Verify(const void* buf, size_t len);
+bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter);
 
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
index 0481a55a78..244d4f0396 100644
--- a/tensorflow/contrib/lite/tools/verifier_test.cc
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -28,31 +28,62 @@ using flatbuffers::FlatBufferBuilder;
 using flatbuffers::Offset;
 using flatbuffers::Vector;
 
-// Class that abstracts the list of buffers at the end of the TF Lite structure
-class DeferredBufferWriter {
+// Build single subgraph model.
+class TfLiteFlatbufferModelBuilder {
  public:
-  DeferredBufferWriter() {
-    data_.push_back({});  // sentinel empty buffer.
+  TfLiteFlatbufferModelBuilder() {
+    buffers_.push_back(
+        CreateBuffer(builder_, builder_.CreateVector(std::vector<uint8_t>{})));
   }
 
-  Offset<Vector<Offset<Buffer>>> BuildBuffers(FlatBufferBuilder *builder) {
-    std::vector<Offset<Buffer>> buffer_vector;
-    for (const auto &vec : data_) {
-      auto data_buffer = builder->CreateVector(vec.data(), vec.size());
-      buffer_vector.push_back(tflite::CreateBuffer(*builder, data_buffer));
+  void AddTensor(const std::vector<int>& shape, tflite::TensorType type,
+                 const std::vector<uint8_t>& buffer, const char* name) {
+    int buffer_index = 0;
+    if (!buffer.empty()) {
+      buffer_index = buffers_.size();
+      buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector(buffer)));
     }
-    return builder->CreateVector(buffer_vector);
+    tensors_.push_back(CreateTensorDirect(builder_, &shape, type, buffer_index,
+                                          name, /*quantization=*/0));
   }
 
-  // Registers a buffer index and takes ownership of the data to write to it.
-  int Record(std::vector<uint8_t> data) {
-    int buffer_index = data_.size();
-    data_.emplace_back(std::move(data));
-    return buffer_index;
+  void AddOperator(const std::vector<int32_t>& inputs,
+                   const std::vector<int32_t>& outputs,
+                   tflite::BuiltinOperator builtin_op, const char* custom_op) {
+    operator_codes_.push_back(
+        CreateOperatorCodeDirect(builder_, builtin_op, custom_op));
+    operators_.push_back(CreateOperator(
+        builder_, operator_codes_.size() - 1, builder_.CreateVector(inputs),
+        builder_.CreateVector(outputs), BuiltinOptions_NONE,
+        /*builtin_options=*/0,
+        /*custom_options=*/0, tflite::CustomOptionsFormat_FLEXBUFFERS));
+  }
+
+  void FinishModel(const std::vector<int32_t>& inputs,
+                   const std::vector<int32_t>& outputs) {
+    auto subgraph = std::vector<Offset<SubGraph>>({CreateSubGraph(
+        builder_, builder_.CreateVector(tensors_),
+        builder_.CreateVector(inputs), builder_.CreateVector(outputs),
+        builder_.CreateVector(operators_),
+        builder_.CreateString("test_subgraph"))});
+    auto result = CreateModel(
+        builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(operator_codes_),
+        builder_.CreateVector(subgraph), builder_.CreateString("test_model"),
+        builder_.CreateVector(buffers_));
+    tflite::FinishModelBuffer(builder_, result);
+  }
+
+  bool Verify() {
+    return tflite::Verify(builder_.GetBufferPointer(), builder_.GetSize(),
+                          DefaultErrorReporter());
   }
 
  private:
-  std::vector<std::vector<unsigned char>> data_;
+  FlatBufferBuilder builder_;
+  std::vector<Offset<Operator>> operators_;
+  std::vector<Offset<OperatorCode>> operator_codes_;
+  std::vector<Offset<Tensor>> tensors_;
+  std::vector<Offset<Buffer>> buffers_;
 };
 
 TEST(VerifyModel, TestEmptyModel) {
@@ -62,43 +93,26 @@ TEST(VerifyModel, TestEmptyModel) {
                            /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
-  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize()));
+  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                     DefaultErrorReporter()));
 }
 
 TEST(VerifyModel, TestSimpleModel) {
-  FlatBufferBuilder builder;
-  auto inputs = builder.CreateVector<int32_t>({0});
-  auto outputs = builder.CreateVector<int32_t>({1});
-  auto operator_codes = builder.CreateVector(std::vector<Offset<OperatorCode>>{
-      CreateOperatorCodeDirect(builder, BuiltinOperator_CUSTOM, "test")});
-  auto operators =
-      builder.CreateVector(std::vector<Offset<Operator>>{CreateOperator(
-          builder, /*opcode_index=*/0,
-          /*inputs=*/builder.CreateVector<int32_t>({0}),
-          /*outputs=*/builder.CreateVector<int32_t>({1}), BuiltinOptions_NONE,
-          /*builtin_options=*/0,
-          /*custom_options=*/0, ::tflite::CustomOptionsFormat_FLEXBUFFERS)});
-  std::vector<int> shape;
-  auto tensors = builder.CreateVector(std::vector<Offset<Tensor>>{
-      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/0,
-                         "input", /*quantization=*/0),
-      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/0,
-                         "output", /*quantization=*/0)});
-  auto subgraph = std::vector<Offset<SubGraph>>(
-      {CreateSubGraph(builder, tensors, inputs, outputs, operators,
-                      builder.CreateString("Main"))});
-
-  auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, operator_codes,
-                           builder.CreateVector(subgraph),
-                           builder.CreateString("SmartReply"), /*buffers=*/0);
-
-  ::tflite::FinishModelBuffer(builder, model);
-  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize()));
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_TRUE(builder.Verify());
 }
 
 TEST(VerifyModel, TestCorruptedData) {
   string model = "123";
-  ASSERT_FALSE(Verify(model.data(), model.size()));
+  ASSERT_FALSE(Verify(model.data(), model.size(), /*error_reporter=*/nullptr));
 }
 
 TEST(VerifyModel, TestUnsupportedVersion) {
@@ -106,7 +120,8 @@ TEST(VerifyModel, TestUnsupportedVersion) {
   auto model = CreateModel(builder, /*version=*/1, /*operator_codes=*/0,
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
-  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize()));
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                      DefaultErrorReporter()));
 }
 
 TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
@@ -116,20 +131,105 @@ TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
-  string model_content(reinterpret_cast<char *>(builder.GetBufferPointer()),
+  string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
                        builder.GetSize());
   for (int i = 0; i < model_content.size(); i++) {
     model_content[i] = (model_content[i] + 137) % 255;
-    EXPECT_FALSE(Verify(model_content.data(), model_content.size()))
+    EXPECT_FALSE(Verify(model_content.data(), model_content.size(),
+                        DefaultErrorReporter()))
         << "Fail at position: " << i;
   }
 }
 
+TEST(VerifyModel, TestIntTensorShapeIsGreaterThanBuffer) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({2, 1}, TensorType_UINT8, {1, 2, 3, 4}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TestIntTensorShapeOverflow) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({1024, 2048, 4096}, TensorType_UINT8, {1, 2, 3, 4},
+                    "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TensorBufferIsNotValid) {
+  FlatBufferBuilder builder;
+  std::vector<int> shape = {2, 3};
+  auto tensors = builder.CreateVector(std::vector<Offset<Tensor>>{
+      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/2,
+                         "input", /*quantization=*/0)});
+  auto subgraph = std::vector<Offset<SubGraph>>(
+      {CreateSubGraph(builder, tensors, /*inputs=*/0, /*outputs=*/0,
+                      /*operators=*/0, builder.CreateString("Main"))});
+
+  auto buffers = builder.CreateVector(std::vector<Offset<Buffer>>{
+      CreateBuffer(builder,
+                   builder.CreateVector(std::vector<uint8>{1, 2, 3, 4, 5, 6})),
+  });
+
+  auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, /*operator_codes=*/0,
+                           builder.CreateVector(subgraph),
+                           builder.CreateString("SmartReply"), buffers);
+
+  ::tflite::FinishModelBuffer(builder, model);
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                      DefaultErrorReporter()));
+}
+
+TEST(VerifyModel, StringTensorHasInvalidNumString) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {0x00, 0x00, 0x00, 0x20, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'},
+      "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorOffsetTooSmall) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 12, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorOffsetOutOfRange) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 22, 0, 0, 0, 'A', 'B'}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorIsLargerThanRequired) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B', 'C'},
+      "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
 // TODO(yichengfan): make up malicious files to test with.
 
 }  // namespace tflite
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
-- 
GitLab


From 16b9fc676be6f8aacf06977a7f9439a56ffccefa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 31 Jan 2018 09:53:17 -0800
Subject: [PATCH 455/966] Extending sparsify_gather to remove variables from
 the tensorflow summaries.

PiperOrigin-RevId: 184004859
---
 .../meta_graph_transform.py                   |  7 ++
 .../tools/graph_transforms/sparsify_gather.cc | 80 ++++++++++++-----
 .../graph_transforms/sparsify_gather_test.cc  | 86 ++++++++++++++++---
 3 files changed, 138 insertions(+), 35 deletions(-)

diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
index 2932ae1c8d..ff88b4fa84 100644
--- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
+++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
@@ -171,7 +171,14 @@ def _clean_save_and_restore(graph_def, op, removed_op_names):
   shape_op_value_tensor.tensor_shape.dim[0].size = len(shapes)
   op.attr['dtypes'].list.type[:] = dtypes
 
+  if not name_op.attr['_output_shapes'].list.shape:
+    name_op.attr['_output_shapes'].list.shape.add()
+    name_op.attr['_output_shapes'].list.shape[0].dim.add()
   name_op.attr['_output_shapes'].list.shape[0].dim[0].size = len(names)
+
+  if not shape_op.attr['_output_shapes'].list.shape:
+    shape_op.attr['_output_shapes'].list.shape.add()
+    shape_op.attr['_output_shapes'].list.shape[0].dim.add()
   shape_op.attr['_output_shapes'].list.shape[0].dim[0].size = len(shapes)
 
 
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 593c654f9f..9c583d83ca 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -181,6 +181,14 @@ Status ObtainVariableInfo(
   return Status::OK();
 }
 
+Status RemoveInputAtIndex(NodeDef* n, int index) {
+  for (int i = index; i < n->input_size() - 1; i++) {
+    n->mutable_input()->SwapElements(i, i + 1);
+  }
+  n->mutable_input()->RemoveLast();
+  return Status::OK();
+}
+
 Status SparsifyGatherInternal(
     const GraphDef& input_graph_def,
     const std::unique_ptr<std::unordered_map<string, string> >&
@@ -301,13 +309,13 @@ Status SparsifyGatherInternal(
             TF_RETURN_IF_ERROR(ReadTensorFromCheckpoint(
                 weights_node.name(), ckpt_reader,
                 (*shapes_and_slices)[weights_node.name()], &weight));
-            // Add both both weight and identity node names.
-            removed_node_names.push_back(weights_node.name());
-            removed_node_names.push_back(match.inputs[0].node.name());
-            for (auto input_node : match.inputs[0].node.input()) {
-              auto parsed_input = StringReplace(input_node, "^", "", true);
-              refs[parsed_input]--;
-            }
+          }
+          // Add both both weight and identity node names.
+          removed_node_names.push_back(weights_node.name());
+          removed_node_names.push_back(match.inputs[0].node.name());
+          for (auto input_node : match.inputs[0].node.input()) {
+            auto parsed_input = StringReplace(input_node, "^", "", true);
+            refs[parsed_input]--;
           }
           Tensor indices_tensor;
           Tensor values_tensor;
@@ -468,26 +476,49 @@ Status SparsifyGatherInternal(
           continue;
         }
         int j = 0;
+        bool deleted_inputs = false;
         while (j < replaced_graph_def.node(i).input_size()) {
           if (replaced_graph_def.node(i).input(j) == name ||
               replaced_graph_def.node(i).input(j) == ("^" + name)) {
-            replaced_graph_def.mutable_node(i)->mutable_input()->SwapElements(
-                j, replaced_graph_def.node(i).input_size() - 1);
-            replaced_graph_def.mutable_node(i)->mutable_input()->RemoveLast();
+            TF_RETURN_IF_ERROR(
+                RemoveInputAtIndex(replaced_graph_def.mutable_node(i), j));
+            deleted_inputs = true;
             continue;
           }
           j++;
         }
-        if (!replaced_graph_def.node(i).input_size()) {
-          if ((refs.find(replaced_graph_def.node(i).name()) != refs.end()) &&
-              (refs[replaced_graph_def.node(i).name()] == 0)) {
+        if (deleted_inputs) {
+          if (replaced_graph_def.node(i).op() == "ConcatV2") {
+            if (replaced_graph_def.node(i).input_size() > 2) {
+              SetNodeAttr("N", replaced_graph_def.node(i).input_size() - 1,
+                          replaced_graph_def.mutable_node(i));
+            } else if (replaced_graph_def.node(i).input_size() == 2) {
+              if (refs[replaced_graph_def.node(i).input(1)] != 1) {
+                return errors::Internal(
+                    "Expect axis tensor of ConcatV2 node to only be referenced "
+                    "once.");
+              }
+              refs[replaced_graph_def.node(i).input(1)] -= 1;
+              removed_node_names.push_back(replaced_graph_def.node(i).input(1));
+              replaced_graph_def.mutable_node(i)->mutable_input()->RemoveLast();
+              replaced_graph_def.mutable_node(i)->mutable_attr()->erase("N");
+              replaced_graph_def.mutable_node(i)->set_op("Identity");
+            } else {
+              return errors::Internal(
+                  "ConcatV2 should have at least two elements");
+            }
+          }
+          if ((replaced_graph_def.node(i).op() == "Assign" ||
+               replaced_graph_def.node(i).op() == "Reshape" ||
+               replaced_graph_def.node(i).op() == "Equal" ||
+               replaced_graph_def.node(i).op() == "Mean" ||
+               replaced_graph_def.node(i).op() == "ScalarSummary") &&
+              replaced_graph_def.node(i).input_size() == 1) {
+            removed_node_names.push_back(replaced_graph_def.node(i).name());
+          }
+          if (!replaced_graph_def.node(i).input_size()) {
             removed_node_names.push_back(replaced_graph_def.node(i).name());
           }
-        }
-
-        if (replaced_graph_def.node(i).op() == "Assign" &&
-            replaced_graph_def.node(i).input_size() == 1) {
-          removed_node_names.push_back(replaced_graph_def.node(i).name());
         }
         i++;
       }
@@ -528,17 +559,22 @@ Status SparsifyGather(const GraphDef& input_graph_def,
     };
   // clang-format on
 
+  GraphDef cleaned_input_graph_def;
+  RemoveAttributes(input_graph_def, {"_output_shapes"},
+                   &cleaned_input_graph_def);
+
   GraphDef temp_output;
 
   std::unique_ptr<BundleReader> ckpt_reader;
   TF_RETURN_IF_ERROR(InitializeCheckpointReader(context, &ckpt_reader));
 
   std::unique_ptr<std::unordered_map<string, string> > shapes_and_slices;
-  TF_RETURN_IF_ERROR(ObtainVariableInfo(input_graph_def, &shapes_and_slices));
+  TF_RETURN_IF_ERROR(
+      ObtainVariableInfo(cleaned_input_graph_def, &shapes_and_slices));
 
-  TF_RETURN_IF_ERROR(SparsifyGatherInternal(input_graph_def, shapes_and_slices,
-                                            context, gather_pattern,
-                                            ckpt_reader, &temp_output));
+  TF_RETURN_IF_ERROR(SparsifyGatherInternal(
+      cleaned_input_graph_def, shapes_and_slices, context, gather_pattern,
+      ckpt_reader, &temp_output));
 
   TF_RETURN_IF_ERROR(SparsifyGatherInternal(temp_output, shapes_and_slices,
                                             context, gather_v2_pattern,
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
index 6627df1331..203ed3e0f9 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
@@ -71,7 +71,7 @@ class SparsifyGatherTest : public ::testing::Test {
   }
 
   void TestSinglePartition(bool gather_v2, bool include_shared_init,
-                           bool test_variable,
+                           bool test_variable, bool test_kept_concat,
                            const string& shared_init_name = "group_deps") {
     GraphDef graph_def;
 
@@ -139,6 +139,26 @@ class SparsifyGatherTest : public ::testing::Test {
       }
     }
 
+    NodeDef* concat_axis_node =
+        CreateNode("linear/concat/axis", "Const", {}, &graph_def);
+    NodeDef* concat_input_node =
+        CreateNode("concat/input/node", "Const", {}, &graph_def);
+    NodeDef* concat_node = nullptr;
+    if (!test_kept_concat) {
+      concat_node = CreateNode(
+          "concat/node", "ConcatV2",
+          {identity_node, concat_input_node, concat_axis_node}, &graph_def);
+      SetNodeAttr("N", 2, concat_node);
+    } else {
+      NodeDef* concat_input_node_2 =
+          CreateNode("concat/input/node_2", "Const", {}, &graph_def);
+      concat_node = CreateNode("concat/node", "ConcatV2",
+                               {identity_node, concat_input_node,
+                                concat_input_node_2, concat_axis_node},
+                               &graph_def);
+      SetNodeAttr("N", 3, concat_node);
+    }
+
     // Run the op.
     GraphDef result;
     TransformFuncContext context;
@@ -166,6 +186,23 @@ class SparsifyGatherTest : public ::testing::Test {
     EXPECT_EQ(1, node_lookup.count("ids"));
     EXPECT_EQ("Const", node_lookup.at("ids")->op());
 
+    EXPECT_EQ(1, node_lookup.count("concat/node"));
+
+    if (!test_kept_concat) {
+      EXPECT_EQ(0, node_lookup.count("linear/concat/axis"));
+      EXPECT_EQ("Identity", node_lookup.at("concat/node")->op());
+      EXPECT_EQ(1, node_lookup.at("concat/node")->input_size());
+      EXPECT_EQ("concat/input/node", node_lookup.at("concat/node")->input(0));
+    } else {
+      EXPECT_EQ(1, node_lookup.count("linear/concat/axis"));
+      EXPECT_EQ("ConcatV2", node_lookup.at("concat/node")->op());
+      EXPECT_EQ(3, node_lookup.at("concat/node")->input_size());
+      EXPECT_EQ("concat/input/node", node_lookup.at("concat/node")->input(0));
+      EXPECT_EQ("concat/input/node_2", node_lookup.at("concat/node")->input(1));
+      EXPECT_EQ("linear/concat/axis", node_lookup.at("concat/node")->input(2));
+      EXPECT_EQ(2, node_lookup.at("concat/node")->attr().at("N").i());
+    }
+
     EXPECT_EQ(1, node_lookup.count("w/part_1/indices"));
     EXPECT_EQ("Const", node_lookup.at("w/part_1/indices")->op());
     Tensor expected_indices_tensor(DT_INT64, TensorShape({3}));
@@ -344,6 +381,13 @@ class SparsifyGatherTest : public ::testing::Test {
     MakeGather("gather1", gather_v2, identity_node1, input_node, &graph_def);
     MakeGather("gather2", gather_v2, identity_node2, input_node, &graph_def);
 
+    NodeDef* concat_axis_node =
+        CreateNode("linear/concat/axis", "Const", {}, &graph_def);
+    NodeDef* concat_node = CreateNode(
+        "concat/node", "ConcatV2",
+        {identity_node1, identity_node2, concat_axis_node}, &graph_def);
+    SetNodeAttr("N", 2, concat_node);
+
     // Shared init node
     if (include_shared_init) {
       if (!test_variable) {
@@ -515,6 +559,9 @@ class SparsifyGatherTest : public ::testing::Test {
               node_lookup.at("gather2/LookupTableFind")->input(2));
     EXPECT_EQ("gather2/LookupTableFind", node_lookup.at("gather2")->input(0));
 
+    EXPECT_EQ(0, node_lookup.count("linear/concat/axis"));
+    EXPECT_EQ(0, node_lookup.count("concat/node"));
+
     // Check control deps.
     EXPECT_EQ(2, node_lookup.at(shared_init_name)->input_size());
     EXPECT_NE(std::find(node_lookup.at(shared_init_name)->input().begin(),
@@ -550,18 +597,31 @@ class SparsifyGatherTest : public ::testing::Test {
 };
 
 TEST_F(SparsifyGatherTest, TestSinglePartition) {
-  TestSinglePartition(false, false, false);
-  TestSinglePartition(false, true, false);
-  TestSinglePartition(true, false, false);
-  TestSinglePartition(true, true, false);
-  TestSinglePartition(false, false, true);
-  TestSinglePartition(false, true, true);
-  TestSinglePartition(true, false, true);
-  TestSinglePartition(true, true, true);
-  TestSinglePartition(false, true, false, "shared_inits");
-  TestSinglePartition(true, true, false, "shared_inits");
-  TestSinglePartition(false, true, true, "shared_inits");
-  TestSinglePartition(true, true, true, "shared_inits");
+  TestSinglePartition(false, false, false, false);
+  TestSinglePartition(false, true, false, false);
+  TestSinglePartition(true, false, false, false);
+  TestSinglePartition(true, true, false, false);
+  TestSinglePartition(false, false, true, false);
+  TestSinglePartition(false, true, true, false);
+  TestSinglePartition(true, false, true, false);
+  TestSinglePartition(true, true, true, false);
+  TestSinglePartition(false, true, false, false, "shared_inits");
+  TestSinglePartition(true, true, false, false, "shared_inits");
+  TestSinglePartition(false, true, true, false, "shared_inits");
+  TestSinglePartition(true, true, true, false, "shared_inits");
+
+  TestSinglePartition(false, false, false, true);
+  TestSinglePartition(false, true, false, true);
+  TestSinglePartition(true, false, false, true);
+  TestSinglePartition(true, true, false, true);
+  TestSinglePartition(false, false, true, true);
+  TestSinglePartition(false, true, true, true);
+  TestSinglePartition(true, false, true, true);
+  TestSinglePartition(true, true, true, true);
+  TestSinglePartition(false, true, false, true, "shared_inits");
+  TestSinglePartition(true, true, false, true, "shared_inits");
+  TestSinglePartition(false, true, true, true, "shared_inits");
+  TestSinglePartition(true, true, true, true, "shared_inits");
 }
 
 TEST_F(SparsifyGatherTest, TestMultiPartition) {
-- 
GitLab


From e818d10f84bad3faad398f4b55831064666af5df Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 31 Jan 2018 10:06:24 -0800
Subject: [PATCH 456/966] Update external protobuf codebase version

PiperOrigin-RevId: 184006959
---
 .../util/example_proto_fast_parsing_test.cc   |  1 +
 tensorflow/workspace.bzl                      | 24 +++++++++----------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/tensorflow/core/util/example_proto_fast_parsing_test.cc b/tensorflow/core/util/example_proto_fast_parsing_test.cc
index 9b6a8e1251..13e41c17f7 100644
--- a/tensorflow/core/util/example_proto_fast_parsing_test.cc
+++ b/tensorflow/core/util/example_proto_fast_parsing_test.cc
@@ -57,6 +57,7 @@ void TestCorrectness(const string& serialized) {
   Example example;
   Example fast_example;
   EXPECT_TRUE(example.ParseFromString(serialized));
+  example.DiscardUnknownFields();
   EXPECT_TRUE(TestFastParse(serialized, &fast_example));
   EXPECT_EQ(example.DebugString(), fast_example.DebugString());
   if (example.DebugString() != fast_example.DebugString()) {
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 0a669eeccd..d082b6747a 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -352,11 +352,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "protobuf_archive",
       urls = [
-          "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
-          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
+          "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
       ],
-      sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
-      strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
+      sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3",
+      strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a",
   )
 
   # We need to import the protobuf library under the names com_google_protobuf
@@ -365,21 +365,21 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "com_google_protobuf",
       urls = [
-          "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
-          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
+          "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
       ],
-      sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
-      strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
+      sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3",
+      strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a",
   )
 
   tf_http_archive(
       name = "com_google_protobuf_cc",
       urls = [
-          "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
-          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
+          "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
       ],
-      sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
-      strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
+      sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3",
+      strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a",
   )
 
   tf_http_archive(
-- 
GitLab


From 950a4c41f73221e72229172cda4795ea3194c0b2 Mon Sep 17 00:00:00 2001
From: Blake Hechtman <blakehechtman@google.com>
Date: Wed, 31 Jan 2018 10:19:17 -0800
Subject: [PATCH 457/966] [XLA] Disable transpose folding into reduce for
 reduces of rank 2 or higher.

PiperOrigin-RevId: 184009219
---
 .../xla/service/algebraic_simplifier.cc       |  7 +++++--
 tensorflow/compiler/xla/tests/reduce_test.cc  | 20 +++++++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index ba82e822b2..fb857559f9 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1618,9 +1618,12 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
         reduce,
         HloInstruction::CreateBroadcast(reduce->shape(), init_value, {}));
   }
+
   // A Transpose feeding a reduce can simply permute the reduction dimensions
-  // field.
-  if (arg->opcode() == HloOpcode::kTranspose) {
+  // field if the output of the reduce is a vector or scalar. Higher ranked
+  // result may require a transpose of the output.
+  if (ShapeUtil::Rank(reduce->shape()) <= 1 &&
+      arg->opcode() == HloOpcode::kTranspose) {
     auto transpose_dimensions = arg->dimensions();
     std::vector<int64> new_reduce_dimensions;
     for (auto dim : dimensions) {
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index a766fa2db0..50d7b5074d 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -494,6 +494,26 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
                              ErrorSpec(0.01, 1e-4));
 }
 
+// Test that algebraic simplifier does not incorrectly fold a transpose into a
+// reduction operation.
+XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) {
+  ComputationBuilder builder(client_, TestName());
+  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  const Shape input_shape = ShapeUtil::MakeShape(F32, {12, 111, 50});
+  ComputationDataHandle input = builder.Parameter(0, input_shape, "input");
+  ComputationDataHandle zero = builder.ConstantR0<float>(0.0);
+  ComputationDataHandle transpose =
+      builder.Transpose(input, /*permutation=*/{1, 0, 2});
+  ComputationDataHandle reduce =
+      builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0});
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> input_data,
+                          MakeFakeLiteral(input_shape));
+
+  ComputeAndCompare(&builder, reduce, {std::move(*input_data)},
+                    ErrorSpec(0.01, 1e-4));
+}
+
 XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) {
   const int64 rows = 111, cols = 50;
 
-- 
GitLab


From 46093eabe85895ed62a447d08b4988edf8fccb1a Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Wed, 31 Jan 2018 10:29:29 -0800
Subject: [PATCH 458/966] Whitelisting stateful op for dataset checkpointing
 temporarily.

PiperOrigin-RevId: 184010966
---
 tensorflow/core/framework/dataset.h | 2 ++
 tensorflow/core/ops/lookup_ops.cc   | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 2c2c7e7c58..f866183f61 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_FRAMEWORK_DATASET_H_
 #define TENSORFLOW_FRAMEWORK_DATASET_H_
 
+#include "tensorflow/core/lib/core/status.h"
+
 namespace tensorflow {
 namespace dataset {
 // Registry for stateful ops that need to be used in dataset functions.
diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc
index a67267418d..50ea8ad01a 100644
--- a/tensorflow/core/ops/lookup_ops.cc
+++ b/tensorflow/core/ops/lookup_ops.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -102,6 +103,8 @@ REGISTER_OP("LookupTableFindV2")
       c->set_output(0, c->UnknownShape());
       return Status::OK();
     });
+WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LookupTableFindV2");
+// TODO(b/72710477): Update this.
 
 REGISTER_OP("LookupTableInsert")
     .Input("table_handle: Ref(string)")
-- 
GitLab


From 35cc45f0a8832a00f69f33858ce08c4242433ce7 Mon Sep 17 00:00:00 2001
From: Eric Liu <ioeric@google.com>
Date: Wed, 31 Jan 2018 10:38:57 -0800
Subject: [PATCH 459/966] Add std:: to min/max in cuda code to unbreak
 cuda-clang build.

PiperOrigin-RevId: 184012479
---
 tensorflow/core/kernels/conv_ops_gpu_3.cu.cc | 28 +++++++++++---------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index e58f5f61f3..a376534bad 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -648,8 +648,9 @@ struct BatchNarrowMatrixTransposeDispatcher {
     static_assert(
         (TileLongSide & (TileLongSide - 1)) == 0,
         "The length of the longer side of the tile is always a power of 2.");
-    bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide &&
-                             min(tile_size_i, tile_size_j) <= TileShortSide;
+    bool request_satisfied =
+        std::max(tile_size_i, tile_size_j) <= TileLongSide &&
+        std::min(tile_size_i, tile_size_j) <= TileShortSide;
 
     if (request_satisfied) {
       LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
@@ -662,7 +663,7 @@ struct BatchNarrowMatrixTransposeDispatcher {
     // determine whether it is the long side or the short side that falls short
     // of the request and increase that parameter accordingly.
     const bool long_side_request_not_satisfied =
-        max(tile_size_i, tile_size_j) > TileLongSide;
+        std::max(tile_size_i, tile_size_j) > TileLongSide;
 
     if (long_side_request_not_satisfied) {
       BatchNarrowMatrixTransposeDispatcher<
@@ -690,8 +691,9 @@ struct BatchNarrowMatrixTransposeDispatcher<
     static_assert(
         (TileLongSide & (TileLongSide - 1)) == 0,
         "The length of the longer side of the tile is always a power of 2.");
-    bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide &&
-                             min(tile_size_i, tile_size_j) <= TileShortSide;
+    bool request_satisfied =
+        std::max(tile_size_i, tile_size_j) <= TileLongSide &&
+        std::min(tile_size_i, tile_size_j) <= TileShortSide;
 
     if (request_satisfied) {
       LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
@@ -816,7 +818,7 @@ void SwapDimension1And2InTensor3WithNarrowMatrices(
   int tile_long_side_len = 0;
   int tile_short_side_len = 0;
   float lowest_cost = std::numeric_limits<float>::max();
-  int data_long_side = max(input_dims[1], input_dims[2]);
+  int data_long_side = std::max(input_dims[1], input_dims[2]);
 
   for (auto tile_size_pair : tile_spec) {
     int proposed_tile_long_side_len = tile_size_pair.first;
@@ -861,12 +863,14 @@ void SwapDimension1And2InTensor3WithNarrowMatrices(
   // Truncate the shorter size requested according to the manual limit set in
   // tile_spec to make sure that we do not launch configurations violating
   // hardware limits.
-  requested_tile_size_i = requested_tile_size_i == tile_long_side_len
-                              ? tile_long_side_len
-                              : min(requested_tile_size_i, tile_short_side_len);
-  requested_tile_size_j = requested_tile_size_j == tile_long_side_len
-                              ? tile_long_side_len
-                              : min(requested_tile_size_j, tile_short_side_len);
+  requested_tile_size_i =
+      requested_tile_size_i == tile_long_side_len
+          ? tile_long_side_len
+          : std::min(requested_tile_size_i, tile_short_side_len);
+  requested_tile_size_j =
+      requested_tile_size_j == tile_long_side_len
+          ? tile_long_side_len
+          : std::min(requested_tile_size_j, tile_short_side_len);
 
   Dimension<3> input_dims_in_tiles = {
       input_dims[0],
-- 
GitLab


From acb2d6147415f8556ac74cbf45118baf00ca64df Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 31 Jan 2018 10:59:49 -0800
Subject: [PATCH 460/966] Typo Correction.

PiperOrigin-RevId: 184016082
---
 tensorflow/contrib/factorization/python/ops/kmeans.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index 4d0f9b2424..c861cfff54 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -143,7 +143,7 @@ class _ModelFn(object):
   def model_fn(self, features, mode, config):
     """Model function for the estimator.
 
-    Note that this does not take a `1abels` arg. This works, but `input_fn` must
+    Note that this does not take a `labels` arg. This works, but `input_fn` must
     return either `features` or, equivalently, `(features, None)`.
 
     Args:
-- 
GitLab


From adc9ee7150c45830eb0857f6b9e935b9672b7bb1 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 31 Jan 2018 11:25:38 -0800
Subject: [PATCH 461/966] Adding tf_export decorators/calls to TensorFlow
 functions and constants.

PiperOrigin-RevId: 184020524
---
 tensorflow/python/client/session.py           |  3 +++
 tensorflow/python/data/ops/dataset_ops.py     |  2 ++
 tensorflow/python/data/ops/iterator_ops.py    |  2 ++
 tensorflow/python/data/ops/readers.py         |  4 ++++
 tensorflow/python/estimator/export/export.py  |  4 ++++
 .../python/estimator/export/export_output.py  |  5 +++++
 .../python/estimator/inputs/numpy_io.py       |  2 ++
 .../python/estimator/inputs/pandas_io.py      |  2 ++
 .../python/feature_column/feature_column.py   | 14 ++++++++++++
 tensorflow/python/lib/io/file_io.py           | 13 +++++++++++
 tensorflow/python/lib/io/tf_record.py         |  5 +++++
 tensorflow/python/ops/losses/losses_impl.py   | 13 +++++++++++
 tensorflow/python/ops/losses/util.py          |  6 +++++
 tensorflow/python/platform/app.py             |  2 ++
 tensorflow/python/platform/resource_loader.py |  6 +++++
 tensorflow/python/platform/tf_logging.py      | 22 +++++++++++++++++++
 tensorflow/python/profiler/model_analyzer.py  |  4 ++++
 tensorflow/python/profiler/option_builder.py  |  2 ++
 tensorflow/python/profiler/tfprof_logger.py   |  2 ++
 tensorflow/python/summary/writer/writer.py    |  2 ++
 .../python/summary/writer/writer_cache.py     |  2 ++
 tensorflow/python/util/compat.py              |  9 ++++++++
 tensorflow/tools/api/generator/BUILD          |  9 ++++++++
 23 files changed, 135 insertions(+)

diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index e6f94396b8..6befeb846d 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -35,6 +35,7 @@ from tensorflow.python.ops import session_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 class SessionInterface(object):
@@ -1441,6 +1442,7 @@ class BaseSession(SessionInterface):
       return handles
 
 
+@tf_export('Session')
 class Session(BaseSession):
   """A class for running TensorFlow operations.
 
@@ -1581,6 +1583,7 @@ class Session(BaseSession):
     tf_session.TF_Reset(target, containers, config)
 
 
+@tf_export('InteractiveSession')
 class InteractiveSession(BaseSession):
   """A TensorFlow `Session` for use in interactive contexts, such as a shell.
 
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 0594c6d6a7..f8798c1d6f 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -41,8 +41,10 @@ from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("data.Dataset")
 class Dataset(object):
   """Represents a potentially large set of elements.
 
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 53a3244ce1..e573fe0192 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 # NOTE(mrry): It is legitimate to call `Iterator.get_next()` multiple
@@ -47,6 +48,7 @@ GET_NEXT_CALL_WARNING_MESSAGE = (
     "`next_element` inside the loop.")
 
 
+@tf_export("data.Iterator")
 class Iterator(object):
   """Represents the state of iterating through a `Dataset`."""
 
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index 830dc5cec4..fa7601741b 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -23,12 +23,14 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(b/64974358): Increase default buffer size to 256 MB.
 _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024  # 256 KB
 
 
+@tf_export("data.TextLineDataset")
 class TextLineDataset(Dataset):
   """A `Dataset` comprising lines from one or more text files."""
 
@@ -71,6 +73,7 @@ class TextLineDataset(Dataset):
     return dtypes.string
 
 
+@tf_export("data.TFRecordDataset")
 class TFRecordDataset(Dataset):
   """A `Dataset` comprising records from one or more TFRecord files."""
 
@@ -115,6 +118,7 @@ class TFRecordDataset(Dataset):
     return dtypes.string
 
 
+@tf_export("data.FixedLengthRecordDataset")
 class FixedLengthRecordDataset(Dataset):
   """A `Dataset` of fixed-length records from one or more binary files."""
 
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 51075731dd..83251c79fc 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -36,12 +36,14 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 _SINGLE_FEATURE_DEFAULT_NAME = 'feature'
 _SINGLE_RECEIVER_DEFAULT_NAME = 'input'
 
 
+@tf_export('estimator.export.ServingInputReceiver')
 class ServingInputReceiver(collections.namedtuple(
     'ServingInputReceiver',
     ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])):
@@ -118,6 +120,7 @@ class ServingInputReceiver(collections.namedtuple(
         receiver_tensors_alternatives=receiver_tensors_alternatives)
 
 
+@tf_export('estimator.export.build_parsing_serving_input_receiver_fn')
 def build_parsing_serving_input_receiver_fn(feature_spec,
                                             default_batch_size=None):
   """Build a serving_input_receiver_fn expecting fed tf.Examples.
@@ -146,6 +149,7 @@ def build_parsing_serving_input_receiver_fn(feature_spec,
   return serving_input_receiver_fn
 
 
+@tf_export('estimator.export.build_raw_serving_input_receiver_fn')
 def build_raw_serving_input_receiver_fn(features, default_batch_size=None):
   """Build a serving_input_receiver_fn expecting feature Tensors.
 
diff --git a/tensorflow/python/estimator/export/export_output.py b/tensorflow/python/estimator/export/export_output.py
index 863af6d41d..87b964be37 100644
--- a/tensorflow/python/estimator/export/export_output.py
+++ b/tensorflow/python/estimator/export/export_output.py
@@ -26,8 +26,10 @@ import six
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('estimator.export.ExportOutput')
 class ExportOutput(object):
   """Represents an output of a model that can be served.
 
@@ -50,6 +52,7 @@ class ExportOutput(object):
     pass
 
 
+@tf_export('estimator.export.ClassificationOutput')
 class ClassificationOutput(ExportOutput):
   """Represents the output of a classification head.
 
@@ -118,6 +121,7 @@ class ClassificationOutput(ExportOutput):
         examples, self.classes, self.scores)
 
 
+@tf_export('estimator.export.RegressionOutput')
 class RegressionOutput(ExportOutput):
   """Represents the output of a regression head."""
 
@@ -153,6 +157,7 @@ class RegressionOutput(ExportOutput):
 _SINGLE_OUTPUT_DEFAULT_NAME = 'output'
 
 
+@tf_export('estimator.export.PredictOutput')
 class PredictOutput(ExportOutput):
   """Represents the output of a generic prediction head.
 
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index c4c2e30e87..a6f4712910 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -24,6 +24,7 @@ import numpy as np
 from six import string_types
 
 from tensorflow.python.estimator.inputs.queues import feeding_functions
+from tensorflow.python.util.tf_export import tf_export
 
 # Key name to pack the target into dict of `features`. See
 # `_get_unique_target_key` for details.
@@ -86,6 +87,7 @@ def _validate_and_convert_features(x):
   return ordered_dict_data
 
 
+@tf_export('estimator.inputs.numpy_input_fn')
 def numpy_input_fn(x,
                    y=None,
                    batch_size=128,
diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py
index 90d6145377..bd06843021 100644
--- a/tensorflow/python/estimator/inputs/pandas_io.py
+++ b/tensorflow/python/estimator/inputs/pandas_io.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import numpy as np
 from tensorflow.python.estimator.inputs.queues import feeding_functions
+from tensorflow.python.util.tf_export import tf_export
 
 try:
   # pylint: disable=g-import-not-at-top
@@ -34,6 +35,7 @@ except ImportError:
   HAS_PANDAS = False
 
 
+@tf_export('estimator.inputs.pandas_input_fn')
 def pandas_input_fn(x,
                     y=None,
                     batch_size=128,
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 7feb209cc4..5947d8f6e2 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -157,6 +157,7 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _internal_input_layer(features,
@@ -209,6 +210,7 @@ def _internal_input_layer(features,
     return array_ops.concat(output_tensors, 1)
 
 
+@tf_export('feature_column.input_layer')
 def input_layer(features,
                 feature_columns,
                 weight_collections=None,
@@ -329,6 +331,7 @@ class InputLayer(object):
     return self._input_layer_template.weights
 
 
+@tf_export('feature_column.linear_model')
 def linear_model(features,
                  feature_columns,
                  units=1,
@@ -498,6 +501,7 @@ def _transform_features(features, feature_columns):
   return outputs
 
 
+@tf_export('feature_column.make_parse_example_spec')
 def make_parse_example_spec(feature_columns):
   """Creates parsing spec dictionary from input feature_columns.
 
@@ -557,6 +561,7 @@ def make_parse_example_spec(feature_columns):
   return result
 
 
+@tf_export('feature_column.embedding_column')
 def embedding_column(
     categorical_column, dimension, combiner='mean', initializer=None,
     ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
@@ -807,6 +812,7 @@ def shared_embedding_columns(
   return result
 
 
+@tf_export('feature_column.numeric_column')
 def numeric_column(key,
                    shape=(1,),
                    default_value=None,
@@ -881,6 +887,7 @@ def numeric_column(key,
       normalizer_fn=normalizer_fn)
 
 
+@tf_export('feature_column.bucketized_column')
 def bucketized_column(source_column, boundaries):
   """Represents discretized dense input.
 
@@ -970,6 +977,7 @@ def _assert_string_or_int(dtype, prefix):
         '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype))
 
 
+@tf_export('feature_column.categorical_column_with_hash_bucket')
 def categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
@@ -1026,6 +1034,7 @@ def categorical_column_with_hash_bucket(key,
   return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
 
+@tf_export('feature_column.categorical_column_with_vocabulary_file')
 def categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
@@ -1145,6 +1154,7 @@ def categorical_column_with_vocabulary_file(key,
       dtype=dtype)
 
 
+@tf_export('feature_column.categorical_column_with_vocabulary_list')
 def categorical_column_with_vocabulary_list(
     key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
   """A `_CategoricalColumn` with in-memory vocabulary.
@@ -1255,6 +1265,7 @@ def categorical_column_with_vocabulary_list(
       default_value=default_value, num_oov_buckets=num_oov_buckets)
 
 
+@tf_export('feature_column.categorical_column_with_identity')
 def categorical_column_with_identity(key, num_buckets, default_value=None):
   """A `_CategoricalColumn` that returns identity values.
 
@@ -1322,6 +1333,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
       key=key, num_buckets=num_buckets, default_value=default_value)
 
 
+@tf_export('feature_column.indicator_column')
 def indicator_column(categorical_column):
   """Represents multi-hot representation of given categorical column.
 
@@ -1350,6 +1362,7 @@ def indicator_column(categorical_column):
   return _IndicatorColumn(categorical_column)
 
 
+@tf_export('feature_column.weighted_categorical_column')
 def weighted_categorical_column(
     categorical_column, weight_feature_key, dtype=dtypes.float32):
   """Applies weight values to a `_CategoricalColumn`.
@@ -1424,6 +1437,7 @@ def weighted_categorical_column(
       dtype=dtype)
 
 
+@tf_export('feature_column.crossed_column')
 def crossed_column(keys, hash_bucket_size, hash_key=None):
   """Returns a column for performing crosses of categorical features.
 
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index 4e3071d851..59f5075f17 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
 class FileIO(object):
@@ -235,6 +236,7 @@ class FileIO(object):
     self._writable_file = None
 
 
+@tf_export("gfile.Exists")
 def file_exists(filename):
   """Determines whether a path exists or not.
 
@@ -256,6 +258,7 @@ def file_exists(filename):
   return True
 
 
+@tf_export("gfile.Remove")
 def delete_file(filename):
   """Deletes the file located at 'filename'.
 
@@ -306,6 +309,7 @@ def write_string_to_file(filename, file_content):
     f.write(file_content)
 
 
+@tf_export("gfile.Glob")
 def get_matching_files(filename):
   """Returns a list of files that match the given pattern(s).
 
@@ -336,6 +340,7 @@ def get_matching_files(filename):
       ]
 
 
+@tf_export("gfile.MkDir")
 def create_dir(dirname):
   """Creates a directory with the name 'dirname'.
 
@@ -353,6 +358,7 @@ def create_dir(dirname):
     pywrap_tensorflow.CreateDir(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.MakeDirs")
 def recursive_create_dir(dirname):
   """Creates a directory and all parent/intermediate directories.
 
@@ -368,6 +374,7 @@ def recursive_create_dir(dirname):
     pywrap_tensorflow.RecursivelyCreateDir(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.Copy")
 def copy(oldpath, newpath, overwrite=False):
   """Copies data from oldpath to newpath.
 
@@ -385,6 +392,7 @@ def copy(oldpath, newpath, overwrite=False):
         compat.as_bytes(oldpath), compat.as_bytes(newpath), overwrite, status)
 
 
+@tf_export("gfile.Rename")
 def rename(oldname, newname, overwrite=False):
   """Rename or move a file / directory.
 
@@ -426,6 +434,7 @@ def atomic_write_string_to_file(filename, contents, overwrite=True):
     raise
 
 
+@tf_export("gfile.DeleteRecursively")
 def delete_recursively(dirname):
   """Deletes everything under dirname recursively.
 
@@ -439,6 +448,7 @@ def delete_recursively(dirname):
     pywrap_tensorflow.DeleteRecursively(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.IsDirectory")
 def is_directory(dirname):
   """Returns whether the path is a directory or not.
 
@@ -452,6 +462,7 @@ def is_directory(dirname):
   return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.ListDirectory")
 def list_directory(dirname):
   """Returns a list of entries contained within a directory.
 
@@ -479,6 +490,7 @@ def list_directory(dirname):
     ]
 
 
+@tf_export("gfile.Walk")
 def walk(top, in_order=True):
   """Recursive directory tree generator for directories.
 
@@ -522,6 +534,7 @@ def walk(top, in_order=True):
     yield here
 
 
+@tf_export("gfile.Stat")
 def stat(filename):
   """Returns file statistics for a given path.
 
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index df19010068..48ea107a14 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -22,8 +22,10 @@ from __future__ import print_function
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("python_io.TFRecordCompressionType")
 class TFRecordCompressionType(object):
   """The type of compression for the record."""
   NONE = 0
@@ -33,6 +35,7 @@ class TFRecordCompressionType(object):
 
 # NOTE(vrv): This will eventually be converted into a proto.  to match
 # the interface used by the C++ RecordWriter.
+@tf_export("python_io.TFRecordOptions")
 class TFRecordOptions(object):
   """Options used for manipulating TFRecord files."""
   compression_type_map = {
@@ -51,6 +54,7 @@ class TFRecordOptions(object):
     return cls.compression_type_map[options.compression_type]
 
 
+@tf_export("python_io.tf_record_iterator")
 def tf_record_iterator(path, options=None):
   """An iterator that read the records from a TFRecords file.
 
@@ -81,6 +85,7 @@ def tf_record_iterator(path, options=None):
   reader.Close()
 
 
+@tf_export("python_io.TFRecordWriter")
 class TFRecordWriter(object):
   """A class to write records to a TFRecords file.
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 72508eb435..73563486e1 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -28,8 +28,10 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import util
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("losses.Reduction")
 class Reduction(object):
   """Types of loss reduction.
 
@@ -152,6 +154,7 @@ def _num_elements(losses):
     return array_ops.size(losses, name=scope, out_type=losses.dtype)
 
 
+@tf_export("losses.compute_weighted_loss")
 def compute_weighted_loss(
     losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES,
     reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -211,6 +214,7 @@ def compute_weighted_loss(
       return loss
 
 
+@tf_export("losses.absolute_difference")
 def absolute_difference(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -258,6 +262,7 @@ def absolute_difference(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.cosine_distance")
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def cosine_distance(
     labels, predictions, axis=None, weights=1.0, scope=None,
@@ -311,6 +316,7 @@ def cosine_distance(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.hinge_loss")
 def hinge_loss(labels, logits, weights=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES,
                reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -352,6 +358,7 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.huber_loss")
 def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES,
                reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -420,6 +427,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.log_loss")
 def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
              loss_collection=ops.GraphKeys.LOSSES,
              reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -471,6 +479,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
 
 
 # TODO(b/37208492): Add reduction arg.
+@tf_export("losses.mean_pairwise_squared_error")
 def mean_pairwise_squared_error(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES):
@@ -557,6 +566,7 @@ def mean_pairwise_squared_error(
       return mean_loss
 
 
+@tf_export("losses.mean_squared_error")
 def mean_squared_error(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -604,6 +614,7 @@ def mean_squared_error(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.sigmoid_cross_entropy")
 def sigmoid_cross_entropy(
     multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -662,6 +673,7 @@ def sigmoid_cross_entropy(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.softmax_cross_entropy")
 def softmax_cross_entropy(
     onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -771,6 +783,7 @@ def _remove_squeezable_dimensions(
   return labels, predictions, weights
 
 
+@tf_export("losses.sparse_softmax_cross_entropy")
 def sparse_softmax_cross_entropy(
     labels, logits, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py
index 3718c481c2..b835d96386 100644
--- a/tensorflow/python/ops/losses/util.py
+++ b/tensorflow/python/ops/losses/util.py
@@ -30,8 +30,10 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("losses.add_loss")
 def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES):
   """Adds a externally defined loss to the collection of losses.
 
@@ -43,6 +45,7 @@ def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES):
     ops.add_to_collection(loss_collection, loss)
 
 
+@tf_export("losses.get_losses")
 def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
   """Gets the list of losses from the loss_collection.
 
@@ -56,6 +59,7 @@ def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
   return ops.get_collection(loss_collection, scope)
 
 
+@tf_export("losses.get_regularization_losses")
 def get_regularization_losses(scope=None):
   """Gets the list of regularization losses.
 
@@ -68,6 +72,7 @@ def get_regularization_losses(scope=None):
   return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope)
 
 
+@tf_export("losses.get_regularization_loss")
 def get_regularization_loss(scope=None, name="total_regularization_loss"):
   """Gets the total regularization loss.
 
@@ -85,6 +90,7 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"):
     return constant_op.constant(0.0)
 
 
+@tf_export("losses.get_total_loss")
 def get_total_loss(add_regularization_losses=True, name="total_loss"):
   """Returns a tensor whose value represents the total loss.
 
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index 9b92d9a180..cce64c0cca 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -23,6 +23,7 @@ import sys as _sys
 
 from tensorflow.python.platform import flags
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _usage(shorthelp):
@@ -108,6 +109,7 @@ def _define_help_flags():
     _define_help_flags_called = True
 
 
+@tf_export('app.run')
 def run(main=None, argv=None):
   """Runs the program with an optional 'main' function and 'argv' list."""
 
diff --git a/tensorflow/python/platform/resource_loader.py b/tensorflow/python/platform/resource_loader.py
index 2455acb4c0..8f7b12e2b2 100644
--- a/tensorflow/python/platform/resource_loader.py
+++ b/tensorflow/python/platform/resource_loader.py
@@ -29,8 +29,10 @@ import sys as _sys
 
 from tensorflow.python.util import tf_inspect as _inspect
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('resource_loader.load_resource')
 def load_resource(path):
   """Load the resource at given path, where path is relative to tensorflow/.
 
@@ -52,6 +54,7 @@ def load_resource(path):
 
 
 # pylint: disable=protected-access
+@tf_export('resource_loader.get_data_files_path')
 def get_data_files_path():
   """Get a direct path to the data files colocated with the script.
 
@@ -62,6 +65,7 @@ def get_data_files_path():
   return _os.path.dirname(_inspect.getfile(_sys._getframe(1)))
 
 
+@tf_export('resource_loader.get_root_dir_with_all_resources')
 def get_root_dir_with_all_resources():
   """Get a root directory containing all the data attributes in the build rule.
 
@@ -101,6 +105,7 @@ def get_root_dir_with_all_resources():
   return data_files_dir or script_dir
 
 
+@tf_export('resource_loader.get_path_to_datafile')
 def get_path_to_datafile(path):
   """Get the path to the specified file in the data dependencies.
 
@@ -120,6 +125,7 @@ def get_path_to_datafile(path):
   return _os.path.join(data_files_path, path)
 
 
+@tf_export('resource_loader.readahead_file_path')
 def readahead_file_path(path, readahead='128M'):  # pylint: disable=unused-argument
   """Readahead files not implemented; simply returns given path."""
   return path
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 85ed4f071c..22aabfd712 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -35,6 +35,7 @@ import threading
 import six
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Don't use this directly. Use _get_logger() instead.
@@ -90,30 +91,37 @@ def _get_logger():
     _logger_lock.release()
 
 
+@tf_export('logging.log')
 def log(level, msg, *args, **kwargs):
   _get_logger().log(level, msg, *args, **kwargs)
 
 
+@tf_export('logging.debug')
 def debug(msg, *args, **kwargs):
   _get_logger().debug(msg, *args, **kwargs)
 
 
+@tf_export('logging.error')
 def error(msg, *args, **kwargs):
   _get_logger().error(msg, *args, **kwargs)
 
 
+@tf_export('logging.fatal')
 def fatal(msg, *args, **kwargs):
   _get_logger().fatal(msg, *args, **kwargs)
 
 
+@tf_export('logging.info')
 def info(msg, *args, **kwargs):
   _get_logger().info(msg, *args, **kwargs)
 
 
+@tf_export('logging.warn')
 def warn(msg, *args, **kwargs):
   _get_logger().warn(msg, *args, **kwargs)
 
 
+@tf_export('logging.warning')
 def warning(msg, *args, **kwargs):
   _get_logger().warning(msg, *args, **kwargs)
 
@@ -136,15 +144,18 @@ _log_prefix = None  # later set to google2_log_prefix
 _log_counter_per_token = {}
 
 
+@tf_export('logging.TaskLevelStatusMessage')
 def TaskLevelStatusMessage(msg):
   error(msg)
 
 
+@tf_export('logging.flush')
 def flush():
   raise NotImplementedError()
 
 
 # Code below is taken from pyglib/logging
+@tf_export('logging.vlog')
 def vlog(level, msg, *args, **kwargs):
   _get_logger().log(level, msg, *args, **kwargs)
 
@@ -164,6 +175,7 @@ def _GetNextLogCountPerToken(token):
   return _log_counter_per_token[token]
 
 
+@tf_export('logging.log_every_n')
 def log_every_n(level, msg, n, *args):
   """Log 'msg % args' at level 'level' once per 'n' times.
 
@@ -180,6 +192,7 @@ def log_every_n(level, msg, n, *args):
   log_if(level, msg, not (count % n), *args)
 
 
+@tf_export('logging.log_first_n')
 def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   """Log 'msg % args' at level 'level' only first 'n' times.
 
@@ -195,6 +208,7 @@ def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   log_if(level, msg, count < n, *args)
 
 
+@tf_export('logging.log_if')
 def log_if(level, msg, condition, *args):
   """Log 'msg % args' at level 'level' only if condition is fulfilled."""
   if condition:
@@ -251,11 +265,13 @@ def google2_log_prefix(level, timestamp=None, file_and_line=None):
   return s
 
 
+@tf_export('logging.get_verbosity')
 def get_verbosity():
   """Return how much logging output will be produced."""
   return _get_logger().getEffectiveLevel()
 
 
+@tf_export('logging.set_verbosity')
 def set_verbosity(v):
   """Sets the threshold for what messages will be logged."""
   _get_logger().setLevel(v)
@@ -296,4 +312,10 @@ _allowed_symbols = [
     'warning',
 ]
 
+tf_export('logging.DEBUG').export_constant(__name__, 'DEBUG')
+tf_export('logging.ERROR').export_constant(__name__, 'ERROR')
+tf_export('logging.FATAL').export_constant(__name__, 'FATAL')
+tf_export('logging.INFO').export_constant(__name__, 'INFO')
+tf_export('logging.WARN').export_constant(__name__, 'WARN')
+
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py
index 8f78054560..0e20ca35bb 100644
--- a/tensorflow/python/profiler/model_analyzer.py
+++ b/tensorflow/python/profiler/model_analyzer.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.profiler import option_builder
 from tensorflow.python.profiler import tfprof_logger
+from tensorflow.python.util.tf_export import tf_export
 
 _DEFAULT_PROFILE_OPTIONS = 0
 _DEFAULT_ADVISE_OPTIONS = 0
@@ -121,6 +122,7 @@ def _build_advisor_options(options):
   return opts
 
 
+@tf_export('profiler.Profiler')
 class Profiler(object):
   """TensorFlow multi-step profiler.
 
@@ -304,6 +306,7 @@ class Profiler(object):
     print_mdl.WriteProfile(filename)
 
 
+@tf_export('profiler.profile')
 def profile(graph=None,
             run_meta=None,
             op_log=None,
@@ -378,6 +381,7 @@ def profile(graph=None,
   return tfprof_node
 
 
+@tf_export('profiler.advise')
 def advise(graph=None, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
   """Auto profile and advise.
 
diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py
index 13942ad6a2..957ebe6ddd 100644
--- a/tensorflow/python/profiler/option_builder.py
+++ b/tensorflow/python/profiler/option_builder.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 import copy
 
 from tensorflow.python.profiler import tfprof_logger
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('profiler.ProfileOptionBuilder')
 class ProfileOptionBuilder(object):
   # pylint: disable=line-too-long
   """Option Builder for Profiling API.
diff --git a/tensorflow/python/profiler/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py
index ffda7ddad7..8d12106496 100644
--- a/tensorflow/python/profiler/tfprof_logger.py
+++ b/tensorflow/python/profiler/tfprof_logger.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import gfile
 from tensorflow.python.profiler.internal import flops_registry  # pylint: disable=unused-import
+from tensorflow.python.util.tf_export import tf_export
 
 TRAINABLE_VARIABLES = '_trainable_variables'
 REGISTERED_FLOP_STATS = 'flops'
@@ -187,6 +188,7 @@ def merge_default_with_oplog(graph, op_log=None, run_meta=None,
   return tmp_op_log
 
 
+@tf_export('profiler.write_op_log')
 def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
   """Log provided 'op_log', and add additional model information below.
 
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 12f120116f..1f3f228704 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import plugin_asset
 from tensorflow.python.summary.writer.event_file_writer import EventFileWriter
+from tensorflow.python.util.tf_export import tf_export
 
 _PLUGINS_DIR = "plugins"
 
@@ -276,6 +277,7 @@ class SummaryToEventTransformer(object):
     self.event_writer.add_event(event)
 
 
+@tf_export("summary.FileWriter")
 class FileWriter(SummaryToEventTransformer):
   """Writes `Summary` protocol buffers to event files.
 
diff --git a/tensorflow/python/summary/writer/writer_cache.py b/tensorflow/python/summary/writer/writer_cache.py
index bad289303c..645fa28a37 100644
--- a/tensorflow/python/summary/writer/writer_cache.py
+++ b/tensorflow/python/summary/writer/writer_cache.py
@@ -22,8 +22,10 @@ import threading
 
 from tensorflow.python.framework import ops
 from tensorflow.python.summary.writer.writer import FileWriter
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('summary.FileWriterCache')
 class FileWriterCache(object):
   """Cache for file writers.
 
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 270d96a3c7..7e5f192b8f 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -41,8 +41,10 @@ import numpy as _np
 import six as _six
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('compat.as_bytes', 'compat.as_str')
 def as_bytes(bytes_or_text, encoding='utf-8'):
   """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text.
 
@@ -65,6 +67,7 @@ def as_bytes(bytes_or_text, encoding='utf-8'):
                     (bytes_or_text,))
 
 
+@tf_export('compat.as_text')
 def as_text(bytes_or_text, encoding='utf-8'):
   """Returns the given argument as a unicode string.
 
@@ -93,6 +96,7 @@ else:
   as_str = as_text
 
 
+@tf_export('compat.as_str_any')
 def as_str_any(value):
   """Converts to `str` as `str(value)`, but use `as_str` for `bytes`.
 
@@ -125,11 +129,16 @@ def path_to_str(path):
 # Numpy 1.8 scalars don't inherit from numbers.Integral in Python 3, so we
 # need to check them specifically.  The same goes from Real and Complex.
 integral_types = (_numbers.Integral, _np.integer)
+tf_export('compat.integral_types').export_constant(__name__, 'integral_types')
 real_types = (_numbers.Real, _np.integer, _np.floating)
+tf_export('compat.real_types').export_constant(__name__, 'real_types')
 complex_types = (_numbers.Complex, _np.number)
+tf_export('compat.complex_types').export_constant(__name__, 'complex_types')
 
 # Either bytes or text.
 bytes_or_text_types = (bytes, _six.text_type)
+tf_export('compat.bytes_or_text_types').export_constant(__name__,
+                                                        'bytes_or_text_types')
 
 _allowed_symbols = [
     'as_str',
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 036bdd6d29..66bbd572a6 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -78,6 +78,15 @@ genrule(
         "api/sets/__init__.py",
         "api/summary/__init__.py",
         "api/train/queue_runner/__init__.py",
+        "api/compat/__init__.py",
+        "api/data/__init__.py",
+        "api/estimator/__init__.py",
+        "api/estimator/export/__init__.py",
+        "api/estimator/inputs/__init__.py",
+        "api/feature_column/__init__.py",
+        "api/losses/__init__.py",
+        "api/profiler/__init__.py",
+        "api/python_io/__init__.py",
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
-- 
GitLab


From d418a14176f2e5cf6c8d16a155c005947c41974b Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Wed, 31 Jan 2018 11:30:36 -0800
Subject: [PATCH 462/966] TPUEstimator: support host_call when use_tpu=False.

PiperOrigin-RevId: 184021299
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 60 +++++++++++++++----
 1 file changed, 49 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 23960bb030..c7008533f3 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -410,14 +410,13 @@ class TPUEstimatorSpec(
   function should not capture any Tensors in `model_fn`.
 
   `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
-  to pass to that function. `host_call` currently works for train() and
-  evaluate(). The function's graph is executed on the CPU on every step, so
-  there is communication overhead when sending tensors from TPU to CPU. To
-  reduce the overhead, try reducing the size of the tensors. The `tensors` are
-  concatenated along their major (batch) dimension, and so must be >= rank 1.
-  The `host_call` is useful for writing summaries with
-  @{tf.contrib.summary.create_file_writer}. Note that `host_call` does not
-  currently work if `use_tpu` is set to False.
+  to pass to that function and returns a list of Tensors. `host_call` currently
+  works for train() and evaluate(). The Tensors returned by the function is
+  executed on the CPU on every step, so there is communication overhead when
+  sending tensors from TPU to CPU. To reduce the overhead, try reducing the
+  size of the tensors. The `tensors` are concatenated along their major (batch)
+  dimension, and so must be >= rank 1. The `host_call` is useful for writing
+  summaries with @{tf.contrib.summary.create_file_writer}.
   """
 
   def __new__(cls,
@@ -449,10 +448,18 @@ class TPUEstimatorSpec(
 
   def as_estimator_spec(self):
     """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
+    host_calls = {}
+    if self.eval_metrics is not None:
+      host_calls['eval_metrics'] = self.eval_metrics
+    if self.host_call is not None:
+      host_calls['host_call'] = self.host_call
+    host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
     eval_metric_ops = None
     if self.eval_metrics is not None:
-      eval_metric_ops = _OutfeedHostCall.create_cpu_hostcall(
-          {'eval_metrics': self.eval_metrics})['eval_metrics']
+      eval_metric_ops = host_call_ret['eval_metrics']
+    hooks = None
+    if self.host_call is not None:
+      hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
     scaffold = self.scaffold_fn() if self.scaffold_fn else None
     return model_fn_lib.EstimatorSpec(
         mode=self.mode,
@@ -461,7 +468,10 @@ class TPUEstimatorSpec(
         train_op=self.train_op,
         eval_metric_ops=eval_metric_ops,
         export_outputs=self.export_outputs,
-        scaffold=scaffold)
+        scaffold=scaffold,
+        training_hooks=hooks,
+        evaluation_hooks=hooks,
+        prediction_hooks=hooks)
 
 
 class _OpQueueContext(object):
@@ -1450,6 +1460,34 @@ class _OutfeedHostCall(object):
     return ret
 
 
+class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
+  """Hook to run host calls when use_tpu=False."""
+
+  def __init__(self, tensors):
+    self._tensors = tensors
+
+  def begin(self):
+    # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than
+    # create a separate hook to guarantee execution order, because summaries
+    # need to be initialized before the outfeed thread starts.
+    # TODO(jhseu): Make a wrapper hook instead?
+    self._init_ops = contrib_summary.summary_writer_initializer_op()
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    self._finalize_ops = []
+    for op in self._init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
+
+  def after_create_session(self, session, coord):
+    session.run(self._init_ops)
+
+  def before_run(self, run_context):
+    return basic_session_run_hooks.SessionRunArgs(self._tensors)
+
+  def end(self, session):
+    session.run(self._finalize_ops)
+
+
 class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
   """Count examples during runtime."""
 
-- 
GitLab


From b79c3b2d1e6825b59b72818ce467dc18f19b57ad Mon Sep 17 00:00:00 2001
From: Todd Wang <toddw@google.com>
Date: Wed, 31 Jan 2018 11:32:35 -0800
Subject: [PATCH 463/966] Go: Fix Scope.WithControlDependencies array-copying
 behavior.

The test fails with the old code, and passes with the new code.

PiperOrigin-RevId: 184021596
---
 tensorflow/go/op/scope.go      | 21 +++++++++++++--------
 tensorflow/go/op/scope_test.go | 11 ++++++++++-
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/tensorflow/go/op/scope.go b/tensorflow/go/op/scope.go
index 2cf1f30187..13de4294dc 100644
--- a/tensorflow/go/op/scope.go
+++ b/tensorflow/go/op/scope.go
@@ -109,15 +109,20 @@ func (s *Scope) SubScope(namespace string) *Scope {
 // added to the graph to execute only after all the provided operations have
 // executed first (in addition to any other control dependencies in s).
 func (s *Scope) WithControlDependencies(ops ...*tf.Operation) *Scope {
+	// Force a copy of the control dependencies into a new underlying array on
+	// every call.  We cannot alias the same underlying array as `ops`, otherwise
+	// the user could modify that array after calling s.WithControlDependencies,
+	// which would be confusing.  We cannot alias the same underlying array as the
+	// original `s.controlDependencies`, since Scopes form a logical tree, and
+	// other calls to s.WithControlDependencies could stomp on each other.
+	deps := make([]*tf.Operation, 0, len(s.controlDependencies)+len(ops))
+	deps = append(deps, s.controlDependencies...)
+	deps = append(deps, ops...)
 	return &Scope{
-		graph:     s.graph,
-		namemap:   s.namemap,
-		namespace: s.namespace,
-		// append(ops, s.controlDependencies) and not the other way
-		// around so that we end up with a copy of the underlying array
-		// (and other calls to s.WithControlDependencies() do not stomp
-		// on each other).
-		controlDependencies: append(ops, s.controlDependencies...),
+		graph:               s.graph,
+		namemap:             s.namemap,
+		namespace:           s.namespace,
+		controlDependencies: deps,
 		err:                 s.err,
 	}
 }
diff --git a/tensorflow/go/op/scope_test.go b/tensorflow/go/op/scope_test.go
index 4f533881d0..b58a61de98 100644
--- a/tensorflow/go/op/scope_test.go
+++ b/tensorflow/go/op/scope_test.go
@@ -77,8 +77,17 @@ func TestControlDependencies(t *testing.T) {
 		variable = VarHandleOp(s, tf.Int32, tf.ScalarShape())
 		init     = AssignVariableOp(s, variable, zero)
 		update   = AssignAddVariableOp(s, variable, one)
-		read     = ReadVariableOp(s.WithControlDependencies(update), variable, tf.Int32)
+		readDeps = []*tf.Operation{update}
 	)
+	// We intend for `read` to have a control dependency on `update`.
+	s = s.WithControlDependencies(readDeps...)
+	// Ensure that Scope.WithControlDependencies makes a copy of the underlying
+	// array, rather than just holding a slice reference to the same user-supplied
+	// underlying array.  If the copy is correctly performed, overwriting
+	// readDeps[0] should have no effect on control dependencies for `read`.
+	readDeps[0] = init
+	read := ReadVariableOp(s, variable, tf.Int32)
+
 	graph, err := s.Finalize()
 	if err != nil {
 		t.Fatal(err)
-- 
GitLab


From 099c91b506214cb64840149df50edff19235b2bb Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Wed, 31 Jan 2018 11:34:56 -0800
Subject: [PATCH 464/966] De-bazel filename_test. Part of the effort to remove
 all_opensource_files.

PiperOrigin-RevId: 184021942
---
 tensorflow/tools/ci_build/ci_sanity.sh  |  9 +++--
 tensorflow/tools/test/file_name_test.py | 48 +++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/tools/test/file_name_test.py

diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 106ea19d46..a58db51cb8 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -517,9 +517,14 @@ do_check_futures_test() {
   python check_futures_test.py
 }
 
+do_check_file_name_test() {
+  cd "$ROOT_DIR/tensorflow/tools/test"
+  python file_name_test.py
+}
+
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity" "do_check_file_name_test")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency" "Check file names for cases")
 
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS="--config=hdfs --config=gcp"
diff --git a/tensorflow/tools/test/file_name_test.py b/tensorflow/tools/test/file_name_test.py
new file mode 100644
index 0000000000..16fb8a822d
--- /dev/null
+++ b/tensorflow/tools/test/file_name_test.py
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Test that checks if we have any issues with case insensitive filesystems.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
+ERROR_MESSAGE = """
+Files with same name but different case detected in directory: {}
+"""
+
+
+def main():
+  # Make sure BASE_DIR ends with tensorflow.  If it doesn't, we probably
+  # computed the wrong directory.
+  if os.path.split(BASE_DIR)[-1] != 'tensorflow':
+    raise AssertionError(
+        "BASE_DIR = '%s' doesn't end with tensorflow" % BASE_DIR)
+
+  for dirpath, dirnames, filenames in os.walk(BASE_DIR, followlinks=True):
+    lowercase_directories = [x.lower() for x in dirnames]
+    lowercase_files = [x.lower() for x in filenames]
+
+    lowercase_dir_contents = lowercase_directories + lowercase_files
+    if len(lowercase_dir_contents) != len(set(lowercase_dir_contents)):
+      raise AssertionError(ERROR_MESSAGE.format(dirpath))
+
+
+if __name__ == '__main__':
+  main()
-- 
GitLab


From f5b3824be1082013f86156c32e55f8e63376bec9 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 31 Jan 2018 12:22:10 -0800
Subject: [PATCH 465/966] Increfing tapes when keeping them around calls to
 python API functions.

These calls can trigger GIL releases or python GC which can trigger
a change to the set of live tapes and segmentation faults.

PiperOrigin-RevId: 184029146
---
 tensorflow/python/eager/pywrap_tfe_src.cc | 41 +++++++++++++++++------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 836998cfdc..d927f3abed 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -528,6 +528,34 @@ tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>* GetTapeSet() {
   return tape_set;
 }
 
+// A safe copy of the current tapeset. Does not get affected by other python
+// threads changing the set of active tapes.
+class SafeTapeSet {
+ public:
+  SafeTapeSet() : tape_set_(*GetTapeSet()) {
+    for (auto* tape : tape_set_) {
+      Py_INCREF(tape);
+    }
+  }
+
+  ~SafeTapeSet() {
+    for (auto* tape : tape_set_) {
+      Py_DECREF(tape);
+    }
+  }
+
+  tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>::const_iterator begin() {
+    return tape_set_.begin();
+  }
+
+  tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>::const_iterator end() {
+    return tape_set_.end();
+  }
+
+ private:
+  tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*> tape_set_;
+};
+
 // xcode 7 doesn't define thread_local, so for compatibility we implement our
 // own. TODO(apassos) remove once we can deprecate xcode 7.
 #ifndef __APPLE__
@@ -718,10 +746,7 @@ void TFE_Py_TapeSetWatchVariable(PyObject* variable) {
   if (*ThreadTapeIsStopped()) {
     return;
   }
-  // Note: making a copy because watching a variable can trigger a change to the
-  // set of tapes by allowing python's garbage collector to run.
-  auto tape_set = *GetTapeSet();
-  for (TFE_Py_Tape* tape : tape_set) {
+  for (TFE_Py_Tape* tape : SafeTapeSet()) {
     tape->tape->WatchVariable(variable);
   }
 }
@@ -777,8 +802,7 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
     return;
   }
 
-  auto set = *GetTapeSet();
-  for (TFE_Py_Tape* tape : set) {
+  for (TFE_Py_Tape* tape : SafeTapeSet()) {
     Py_INCREF(backward_function);
     tape->tape->RecordOperation(
         op_type_str, output_info, input_ids, backward_function,
@@ -787,10 +811,7 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
 }
 
 void TFE_Py_TapeSetDeleteTrace(tensorflow::int64 tensor_id) {
-  // Note: making a copy because deleting the trace can trigger a change to the
-  // set of tapes by allowing python's garbage collector to run.
-  auto tape_set = *GetTapeSet();
-  for (TFE_Py_Tape* tape : tape_set) {
+  for (TFE_Py_Tape* tape : SafeTapeSet()) {
     tape->tape->DeleteTrace(tensor_id);
   }
 }
-- 
GitLab


From e9650510fe2a9dfde21158b052a751006a74e339 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 31 Jan 2018 12:27:38 -0800
Subject: [PATCH 466/966] [TF:XLA] Bump open source llvm revision to r323874

PiperOrigin-RevId: 184029790
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index d082b6747a..34ec291256 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -472,11 +472,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/36a30fc7c9ee6fdfe5157190ad15c1801b1ab2de.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/36a30fc7c9ee6fdfe5157190ad15c1801b1ab2de.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/f135378ec6365e852f7d5a3cfcdce342f08cb5f3.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/f135378ec6365e852f7d5a3cfcdce342f08cb5f3.tar.gz",
       ],
-      sha256 = "3b74ecd8f59c712b4daf715a4da15c43ebdd40edcd4c30737bffef62f6a2bc9d",
-      strip_prefix = "llvm-36a30fc7c9ee6fdfe5157190ad15c1801b1ab2de",
+      sha256 = "296ab832167e6c46eb65ef1f9a2b5fc31c77fcd2248799b306aa2d5d2e4edbfe",
+      strip_prefix = "llvm-f135378ec6365e852f7d5a3cfcdce342f08cb5f3",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
   )
 
-- 
GitLab


From 1699bc041e624cdaf249a80136b467743357fbfc Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Wed, 31 Jan 2018 12:28:13 -0800
Subject: [PATCH 467/966] [TF:XLA] Implement Acos, Asin, Atan in terms of Atan2
 using half-angle formulae. This may not be the most efficient implementation
 but it is better than no implementation.

PiperOrigin-RevId: 184029858
---
 tensorflow/compiler/tests/unary_ops_test.py   | 15 ++++++++++++
 .../compiler/tf2xla/kernels/unary_ops.cc      | 23 +++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 8e4b8a3833..3d3e112f48 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -154,6 +154,21 @@ class UnaryOpsTest(XLATestCase):
 
   def testFloatOps(self):
     for dtype in self.float_types:
+      x = np.arange(-0.90, 0.90, 0.25)
+      self._assertOpOutputMatchesExpected(
+          math_ops.acos,
+          x.astype(dtype),
+          expected=np.arccos(x).astype(dtype))
+      self._assertOpOutputMatchesExpected(
+          math_ops.asin,
+          x.astype(dtype),
+          expected=np.arcsin(x).astype(dtype))
+      x = np.arange(-3, 3).reshape(1, 3, 2)
+      self._assertOpOutputMatchesExpected(
+          math_ops.atan,
+          x.astype(dtype),
+          expected=np.arctan(x).astype(dtype))
+
       self._assertOpOutputMatchesExpected(
           math_ops.acosh,
           np.array([1, 2, 3, 4], dtype=dtype),
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index a266e9013c..0c5ad9e525 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -50,18 +50,41 @@ XLAJIT_MAKE_UNARY(Conj, b->Conj(x));
 // Return x if x>0, otherwise -x.
 XLAJIT_MAKE_UNARY(Abs, b->Abs(x));
 
+// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x))
+XLAJIT_MAKE_UNARY(
+    Acos,
+    b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0),
+           b->Atan2(b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)),
+                                  b->Mul(x, x)),
+                           XlaHelpers::FloatLiteral(b, input_type(0), 0.5)),
+                    b->Add(XlaHelpers::One(b, input_type(0)), x))));
+
 // acosh(x) = log(x + sqrt(x^2 - 1))
 XLAJIT_MAKE_UNARY(
     Acosh,
     b->Log(b->Add(x, b->Pow(b->Sub(b->Mul(x, x),
                                    XlaHelpers::One(b, input_type(0))),
                             XlaHelpers::FloatLiteral(b, input_type(0), 0.5)))));
+
+// asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2)))
+XLAJIT_MAKE_UNARY(
+    Asin,
+    b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0),
+           b->Atan2(x, b->Add(XlaHelpers::One(b, input_type(0)),
+                              b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)),
+                                            b->Mul(x, x)),
+                                     XlaHelpers::FloatLiteral(b, input_type(0),
+                                                              0.5))))));
+
 // asinh(x) = log(x + sqrt(x^2 + 1))
 XLAJIT_MAKE_UNARY(
     Asinh,
     b->Log(b->Add(x, b->Pow(b->Add(b->Mul(x, x),
                                    XlaHelpers::One(b, input_type(0))),
                             XlaHelpers::FloatLiteral(b, input_type(0), 0.5)))));
+
+XLAJIT_MAKE_UNARY(Atan, b->Atan2(x, XlaHelpers::One(b, input_type(0))));
+
 // atanh(x) = 0.5 * log((1 + x) / (1 - x))
 XLAJIT_MAKE_UNARY(
     Atanh, b->Mul(b->Log(b->Div(b->Add(XlaHelpers::One(b, input_type(0)), x),
-- 
GitLab


From 2639cda72ba92a4a76c3cd2081268448461f8227 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 31 Jan 2018 12:32:38 -0800
Subject: [PATCH 468/966] Remove contacts of ex-Googler

PiperOrigin-RevId: 184030353
---
 tensorflow/core/profiler/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 9e628b1065..460f935e4a 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -256,7 +256,7 @@ bug fix. `OpLogProto` is a good plus if it is used.
 
 #### Teams
 
-* Xin Pan (xpan@google.com, github: panyx0718)
+* Xin Pan
 * Chris Antaki
 * Yao Zhang
 * Jon Shlens
-- 
GitLab


From 9f75f8e6d55fb9ad605bce80b656e3e19781ee43 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Wed, 31 Jan 2018 12:58:05 -0800
Subject: [PATCH 469/966] [TF:XLA] Implement ExtractImagePatches.

PiperOrigin-RevId: 184033616
---
 tensorflow/compiler/tests/BUILD               |  12 ++
 .../tests/extract_image_patches_op_test.py    | 134 ++++++++++++++
 .../tf2xla/g3doc/cpu_supported_ops.md         |   4 +
 .../tf2xla/g3doc/gpu_supported_ops.md         |   4 +
 tensorflow/compiler/tf2xla/kernels/BUILD      |   1 +
 .../kernels/extract_image_patches_op.cc       | 169 ++++++++++++++++++
 .../extract_image_patches_op_test.py          |  18 +-
 7 files changed, 340 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/compiler/tests/extract_image_patches_op_test.py
 create mode 100644 tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 9e64f3e9a3..7277ba42ce 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -255,6 +255,18 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "extract_image_patches_op_test",
+    size = "small",
+    srcs = ["extract_image_patches_op_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "fft_test",
     size = "medium",
diff --git a/tensorflow/compiler/tests/extract_image_patches_op_test.py b/tensorflow/compiler/tests/extract_image_patches_op_test.py
new file mode 100644
index 0000000000..0361702e7a
--- /dev/null
+++ b/tensorflow/compiler/tests/extract_image_patches_op_test.py
@@ -0,0 +1,134 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for ExtractImagePatches op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ExtractImagePatches(XLATestCase):
+  """Functional tests for ExtractImagePatches op."""
+
+  def _VerifyValues(self, image, ksizes, strides, rates, padding, patches):
+    """Tests input-output pairs for the ExtractImagePatches op.
+
+    Args:
+      image: Input tensor with shape: [batch, in_rows, in_cols, depth].
+      ksizes: Patch size specified as: [ksize_rows, ksize_cols].
+      strides: Output strides, specified as [stride_rows, stride_cols].
+      rates: Atrous rates, specified as [rate_rows, rate_cols].
+      padding: Padding type.
+      patches: Expected output.
+    """
+    ksizes = [1] + ksizes + [1]
+    strides = [1] + strides + [1]
+    rates = [1] + rates + [1]
+
+    with self.test_session():
+      image_placeholder = array_ops.placeholder(dtypes.float32)
+      with self.test_scope():
+        out_tensor = array_ops.extract_image_patches(
+            image_placeholder,
+            ksizes=ksizes,
+            strides=strides,
+            rates=rates,
+            padding=padding,
+            name="im2col")
+      feed_dict = {image_placeholder: image}
+      self.assertAllClose(patches, out_tensor.eval(feed_dict=feed_dict))
+
+  def testKsize1x1Stride1x1Rate1x1(self):
+    """Verifies that for 1x1 kernel the output equals the input."""
+    # [2, 3, 4, 5]
+    image = np.reshape(range(120), [2, 3, 4, 5])
+    # [2, 3, 4, 5]
+    patches = np.reshape(range(120), [2, 3, 4, 5])
+    for padding in ["VALID", "SAME"]:
+      self._VerifyValues(
+          image,
+          ksizes=[1, 1],
+          strides=[1, 1],
+          rates=[1, 1],
+          padding=padding,
+          patches=patches)
+
+  def testKsize1x1Stride2x3Rate1x1(self):
+    """Test for 1x1 kernel and strides."""
+    # [2, 4, 5, 3]
+    image = np.reshape(range(120), [2, 4, 5, 3])
+    # [2, 2, 2, 3]
+    patches = image[:, ::2, ::3, :]
+    for padding in ["VALID", "SAME"]:
+      self._VerifyValues(
+          image,
+          ksizes=[1, 1],
+          strides=[2, 3],
+          rates=[1, 1],
+          padding=padding,
+          patches=patches)
+
+  def testKsize2x2Stride1x1Rate1x1Valid(self):
+    """Test for 2x2 kernel with VALID padding."""
+    # [1, 2, 2, 1]
+    image = [[[[1], [2]], [[3], [4]]]]
+    # [1, 1, 1, 4]
+    patches = [[[[1, 2, 3, 4]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[1, 1],
+        padding="VALID",
+        patches=patches)
+
+  def testKsize2x2Stride1x1Rate1x1Same(self):
+    """Test for 2x2 kernel with SAME padding."""
+    # [1, 2, 2, 1]
+    image = [[[[1], [2]], [[3], [4]]]]
+    # [1, 2, 2, 4]
+    patches = [[[[1, 2, 3, 4], [2, 0, 4, 0]], [[3, 4, 0, 0], [4, 0, 0, 0]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[1, 1],
+        padding="SAME",
+        patches=patches)
+
+  def testKsize2x2Stride1x1Rate2x2Valid(self):
+    """Test for 2x2 kernel with 2x2 dilation."""
+    # [1, 2, 2, 1]
+    image = np.arange(16).reshape(1, 4, 4, 1).astype(np.float32)
+    # [1, 2, 2, 4]
+    patches = [[[[0, 2, 8, 10], [1, 3, 9, 11]],
+                [[4, 6, 12, 14], [5, 7, 13, 15]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[2, 2],
+        padding="VALID",
+        patches=patches)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
index 44f7db5ffd..91351421bc 100644
--- a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
+++ b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
@@ -71,6 +71,7 @@ Operator                              | Type Constraint
 `Exp`                                 | `T={complex64,double,float}`
 `ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Expm1`                               | `T={complex64,double,float}`
+`ExtractImagePatches`                 | `T={double,float,int32,int64,uint32,uint64}`
 `FFT`                                 |
 `FFT2D`                               |
 `FFT3D`                               |
@@ -124,6 +125,8 @@ Operator                              | Type Constraint
 `MaxPool3D`                           | `T={float}`
 `MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
 `MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolGradV2`                       | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolV2`                           | `T={double,float,int32,int64}`
 `Maximum`                             | `T={double,float,int32,int64}`
 `Mean`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `Min`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
@@ -176,6 +179,7 @@ Operator                              | Type Constraint
 `ResourceGather`                      | `Tindices={int32,int64}`<br>`dtype={complex64,double,float,int32,int64,uint32,uint64}`
 `ResourceStridedSliceAssign`          | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Reverse`                             | `T={bool,complex64,double,float,int32,int64}`
+`ReverseSequence`                     | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `ReverseV2`                           | `T={bool,complex64,double,float,int32,int64}`<br>`Tidx={int32,int64}`
 `RightShift`                          | `T={int32,int64,uint32,uint64}`
 `Rint`                                | `T={double,float}`
diff --git a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
index eb1f891125..b9bdb829d7 100644
--- a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
+++ b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
@@ -71,6 +71,7 @@ Operator                              | Type Constraint
 `Exp`                                 | `T={complex64,double,float}`
 `ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Expm1`                               | `T={complex64,double,float}`
+`ExtractImagePatches`                 | `T={double,float,int32,int64,uint32,uint64}`
 `FFT`                                 |
 `FFT2D`                               |
 `FFT3D`                               |
@@ -124,6 +125,8 @@ Operator                              | Type Constraint
 `MaxPool3D`                           | `T={float}`
 `MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
 `MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolGradV2`                       | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolV2`                           | `T={double,float,int32,int64}`
 `Maximum`                             | `T={double,float,int32,int64}`
 `Mean`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
 `Min`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
@@ -173,6 +176,7 @@ Operator                              | Type Constraint
 `ResourceGather`                      | `Tindices={int32,int64}`<br>`dtype={complex64,double,float,int32,int64,uint32,uint64}`
 `ResourceStridedSliceAssign`          | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `Reverse`                             | `T={bool,complex64,double,float,int32,int64}`
+`ReverseSequence`                     | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
 `ReverseV2`                           | `T={bool,complex64,double,float,int32,int64}`<br>`Tidx={int32,int64}`
 `RightShift`                          | `T={int32,int64,uint32,uint64}`
 `Rint`                                | `T={double,float}`
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 84fa43f4fb..67be1a4ba6 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -31,6 +31,7 @@ tf_kernel_library(
         "diag_op.cc",
         "dynamic_stitch_op.cc",
         "elu_op.cc",
+        "extract_image_patches_op.cc",
         "fft_ops.cc",
         "fill_op.cc",
         "function_ops.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
new file mode 100644
index 0000000000..b2970eae20
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
@@ -0,0 +1,169 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+namespace {
+
+class ExtractImagePatchesOp : public XlaOpKernel {
+ public:
+  explicit ExtractImagePatchesOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("ksizes", &ksizes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("rates", &dilations_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorFormat data_format = FORMAT_NHWC;
+    const int num_dims = ksizes_.size();
+
+    OP_REQUIRES(
+        ctx, num_dims >= 3,
+        errors::InvalidArgument("Kernel size must have at least 3 dimensions"));
+    const int num_spatial_dims = num_dims - 2;
+
+    OP_REQUIRES(ctx, strides_.size() == num_dims,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify ",
+                                        num_dims, " dimensions"));
+    OP_REQUIRES(ctx, dilations_.size() == num_dims,
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims, " dimensions"));
+
+    int batch_dim = GetTensorBatchDimIndex(num_dims, data_format);
+    int feature_dim = GetTensorFeatureDimIndex(num_dims, data_format);
+    OP_REQUIRES(
+        ctx, ksizes_[batch_dim] == 1 && ksizes_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "kernel sizes > 1 in the batch and depth "
+                              "dimensions."));
+    OP_REQUIRES(
+        ctx, strides_[batch_dim] == 1 && strides_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        ctx, dilations_[batch_dim] == 1 && dilations_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not support "
+                              "dilations in the batch and depth dimensions."));
+
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
+      OP_REQUIRES(
+          ctx, ksizes_[input_dim] >= 0,
+          errors::Unimplemented("Kernel size values must be non-negative; ", i,
+                                "th spatial dimension had dilation ",
+                                dilations_[input_dim]));
+      OP_REQUIRES(ctx, strides_[input_dim] >= 1,
+                  errors::Unimplemented("Stride values must be positive; ", i,
+                                        "th spatial dimension had dilation ",
+                                        dilations_[input_dim]));
+      OP_REQUIRES(ctx, dilations_[input_dim] >= 1,
+                  errors::Unimplemented("Dilation values must be positive; ", i,
+                                        "th spatial dimension had dilation ",
+                                        dilations_[input_dim]));
+    }
+
+    xla::PrimitiveType type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(ctx->input_type(0), &type));
+
+    const TensorShape input_shape = ctx->InputShape(0);
+    OP_REQUIRES(
+        ctx, input_shape.dims() == num_dims,
+        errors::InvalidArgument("input must be ", num_dims, "-dimensional",
+                                input_shape.DebugString()));
+    const int64 depth = input_shape.dim_size(feature_dim);
+
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    // The following code is equivalent to:
+    // eye = np.eye(kH * kW * D).reshape([kH, kW, D, kH * kW * kD])
+    int64 kernel_size = 1;
+    std::vector<int64> lhs_shape(num_dims, 1);
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
+      lhs_shape[i] = ksizes_[input_dim];
+      kernel_size *= ksizes_[input_dim];
+    }
+    lhs_shape[num_spatial_dims] = depth;
+    lhs_shape[num_spatial_dims + 1] = 1;
+
+    // Builds an identity matrix as a broadcast equality of iotas.
+    // iota = np.arange(np.prod(ksize), depth)
+    // filter = np.equal(np.reshape(iota, [-1, 1]), iota).astype(np.float32)
+    xla::ComputationDataHandle iota;
+    TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32,
+                                 kernel_size * depth, &iota));
+
+    auto lhs = builder->Reshape(iota, lhs_shape);
+    auto filter = builder->ConvertElementType(
+        builder->Eq(lhs, iota, {num_spatial_dims + 1}), type);
+
+    xla::ConvolutionDimensionNumbers dims;
+    std::vector<int64> window_strides(num_spatial_dims);
+    std::vector<int64> lhs_dilation(num_spatial_dims, 1);
+    std::vector<int64> rhs_dilation(num_spatial_dims);
+    std::vector<std::pair<int64, int64>> padding(num_spatial_dims);
+
+    dims.set_input_batch_dimension(batch_dim);
+    dims.set_output_batch_dimension(batch_dim);
+    dims.set_input_feature_dimension(feature_dim);
+    dims.set_output_feature_dimension(feature_dim);
+    dims.set_kernel_input_feature_dimension(num_spatial_dims);
+    dims.set_kernel_output_feature_dimension(num_spatial_dims + 1);
+
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      const int64 dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
+      dims.add_input_spatial_dimensions(dim);
+      dims.add_kernel_spatial_dimensions(i);
+      dims.add_output_spatial_dimensions(dim);
+      window_strides[i] = strides_.at(dim);
+      rhs_dilation[i] = dilations_.at(dim);
+
+      int64 unused_output_size;
+      OP_REQUIRES_OK(
+          ctx, GetWindowedOutputSizeVerboseV2(
+                   input_shape.dim_size(dim), ksizes_[dim], rhs_dilation[i],
+                   window_strides[i], padding_, &unused_output_size,
+                   &padding[i].first, &padding[i].second));
+    }
+
+    xla::ComputationDataHandle conv =
+        builder->ConvGeneralDilated(ctx->Input(0), filter, window_strides,
+                                    padding, lhs_dilation, rhs_dilation, dims);
+    ctx->SetOutput(0, conv);
+  }
+
+ protected:
+  std::vector<int32> ksizes_;
+  std::vector<int32> dilations_;
+  std::vector<int32> strides_;
+  Padding padding_;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(ExtractImagePatchesOp);
+};
+
+REGISTER_XLA_OP(Name("ExtractImagePatches"), ExtractImagePatchesOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
index 5c7624f1f6..6ea9f1badc 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
@@ -84,7 +84,7 @@ class ExtractImagePatches(test.TestCase):
           patches=patches)
 
   def testKsize2x2Stride1x1Rate1x1Valid(self):
-    """Test for 1x1 kernel ."""
+    """Test for 2x2 kernel with VALID padding."""
     # [1, 2, 2, 1]
     image = [[[[1], [2]], [[3], [4]]]]
     # [1, 1, 1, 4]
@@ -98,7 +98,7 @@ class ExtractImagePatches(test.TestCase):
         patches=patches)
 
   def testKsize2x2Stride1x1Rate1x1Same(self):
-    """Test for 1x1 kernel ."""
+    """Test for 2x2 kernel with SAME padding."""
     # [1, 2, 2, 1]
     image = [[[[1], [2]], [[3], [4]]]]
     # [1, 2, 2, 4]
@@ -111,6 +111,20 @@ class ExtractImagePatches(test.TestCase):
         padding="SAME",
         patches=patches)
 
+  def testKsize2x2Stride1x1Rate2x2Valid(self):
+    """Test for 2x2 kernel with 2x2 dilation."""
+    # [1, 2, 2, 1]
+    image = np.arange(16).reshape(1, 4, 4, 1).astype(np.float32)
+    # [1, 2, 2, 4]
+    patches = [[[[0, 2, 8, 10], [1, 3, 9, 11]],
+                [[4, 6, 12, 14], [5, 7, 13, 15]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[2, 2],
+        padding="VALID",
+        patches=patches)
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 0bd78003c36dd194083ec22501c2b0b6db208f4c Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 31 Jan 2018 13:22:49 -0800
Subject: [PATCH 470/966] [XLA:CPU] Generate correct IR for integer clamp

PiperOrigin-RevId: 184037078
---
 .../xla/service/elemental_ir_emitter.cc       | 45 ++++++++++++++-----
 .../xla/service/elemental_ir_emitter.h        |  6 +++
 .../xla/tests/array_elementwise_ops_test.cc   | 20 +++++++++
 3 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 28cd425309..4468adbadb 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -1043,17 +1043,9 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
           is_signed ? llvm::CmpInst::ICMP_SGE : llvm::CmpInst::ICMP_UGE,
           lhs_value, rhs_value, ir_builder_);
     case HloOpcode::kMinimum:
-      return ir_builder_->CreateSelect(
-          ir_builder_->CreateICmp(
-              is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE,
-              lhs_value, rhs_value),
-          lhs_value, rhs_value);
+      return EmitIntegralMin(lhs_value, rhs_value, is_signed);
     case HloOpcode::kMaximum:
-      return ir_builder_->CreateSelect(
-          ir_builder_->CreateICmp(
-              is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE,
-              lhs_value, rhs_value),
-          lhs_value, rhs_value);
+      return EmitIntegralMax(lhs_value, rhs_value, is_signed);
     case HloOpcode::kAnd:
       return ir_builder_->CreateAnd(lhs_value, rhs_value);
     case HloOpcode::kOr:
@@ -1070,6 +1062,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
   }
 }
 
+llvm::Value* ElementalIrEmitter::EmitIntegralMax(llvm::Value* lhs_value,
+                                                 llvm::Value* rhs_value,
+                                                 bool is_signed) const {
+  return ir_builder_->CreateSelect(
+      ir_builder_->CreateICmp(
+          is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE,
+          lhs_value, rhs_value),
+      lhs_value, rhs_value);
+}
+
+llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value,
+                                                 llvm::Value* rhs_value,
+                                                 bool is_signed) const {
+  return ir_builder_->CreateSelect(
+      ir_builder_->CreateICmp(
+          is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE,
+          lhs_value, rhs_value),
+      lhs_value, rhs_value);
+}
+
 llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
     const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo,
     int64 operand_no) const {
@@ -1366,7 +1378,18 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         TF_ASSIGN_OR_RETURN(llvm::Value * max_value,
                             operand_to_generator.at(hlo->operand(2))(
                                 ElementwiseSourceIndex(index, *hlo, 2)));
-        return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value));
+        PrimitiveType prim_type = hlo->shape().element_type();
+        if (primitive_util::IsFloatingPointType(prim_type)) {
+          return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value));
+        } else if (primitive_util::IsIntegralType(prim_type)) {
+          bool is_signed = primitive_util::IsSignedIntegralType(prim_type);
+          return EmitIntegralMin(
+              max_value, EmitIntegralMax(min_value, arg_value, is_signed),
+              is_signed);
+        } else {
+          return Unimplemented("Clamp unimplemented for %s",
+                               PrimitiveType_Name(prim_type).c_str());
+        }
       };
     case HloOpcode::kReducePrecision:
       return [this, hlo, &operand_to_generator](
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index 1a48eb5fcb..c516a826d9 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -86,6 +86,12 @@ class ElementalIrEmitter {
   virtual llvm::Value* EmitFloatMin(llvm::Value* lhs_value,
                                     llvm::Value* rhs_value) const;
 
+  llvm::Value* EmitIntegralMax(llvm::Value* lhs_value, llvm::Value* rhs_value,
+                               bool is_signed) const;
+
+  llvm::Value* EmitIntegralMin(llvm::Value* lhs_value, llvm::Value* rhs_value,
+                               bool is_signed) const;
+
   virtual StatusOr<llvm::Value*> EmitErfInv(PrimitiveType prim_type,
                                             llvm::Value* value) const;
 
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 56fc21d019..52e14a1f7b 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -1893,6 +1893,26 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) {
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClampS32Vector) {
+  ComputationBuilder builder(client_, TestName());
+  auto min_vector = builder.ConstantR1<int32>({1, -6, 1, 2, 0, -5});
+  auto arg_vector = builder.ConstantR1<int32>({2, 10, -5, 1, 4, 10});
+  auto max_vector = builder.ConstantR1<int32>({3, 0, 25, 5, 123, -1});
+  auto clamp = builder.Clamp(min_vector, arg_vector, max_vector);
+
+  ComputeAndCompareR1<int32>(&builder, {2, 0, 1, 2, 4, -1}, {});
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, ClampU32Vector) {
+  ComputationBuilder builder(client_, TestName());
+  auto min_vector = builder.ConstantR1<uint32>({1, 2, 1, 2, 0, ~0u - 4});
+  auto arg_vector = builder.ConstantR1<uint32>({2, 10, 5, 1, 4, 10});
+  auto max_vector = builder.ConstantR1<uint32>({3, 5, 25, 5, 123, ~0u});
+  auto clamp = builder.Clamp(min_vector, arg_vector, max_vector);
+
+  ComputeAndCompareR1<uint32>(&builder, {2, 5, 5, 2, 4, ~0u - 4}, {});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
   ComputationBuilder builder(client_, TestName());
 
-- 
GitLab


From 6ac8f0947980fc9f88a19f4f1e211db8cd3e2d79 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 31 Jan 2018 13:35:23 -0800
Subject: [PATCH 471/966] Automated g4 rollback of changelist 184003263

PiperOrigin-RevId: 184038801
---
 tensorflow/contrib/lite/tools/BUILD           |   4 -
 tensorflow/contrib/lite/tools/verifier.cc     | 170 +--------------
 tensorflow/contrib/lite/tools/verifier.h      |   4 +-
 .../contrib/lite/tools/verifier_test.cc       | 200 +++++-------------
 4 files changed, 54 insertions(+), 324 deletions(-)

diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 4d3b553b22..1bffcfb987 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -99,11 +99,8 @@ cc_library(
     srcs = ["verifier.cc"],
     hdrs = ["verifier.h"],
     deps = [
-        "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
-        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
@@ -115,7 +112,6 @@ cc_test(
         ":verifier",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/contrib/lite/tools/verifier.cc
index 726e2aaa31..95a0895379 100644
--- a/tensorflow/contrib/lite/tools/verifier.cc
+++ b/tensorflow/contrib/lite/tools/verifier.cc
@@ -14,32 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/lite/tools/verifier.h"
-#include <climits>
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/string_util.h"
 #include "tensorflow/contrib/lite/version.h"
 
 namespace tflite {
 
 namespace {
 
-// Reports error message when the reporter is set.
-void ReportError(ErrorReporter* error_reporter, const char* format, ...) {
-  if (error_reporter) {
-    va_list args;
-    va_start(args, format);
-    error_reporter->Report(format, args);
-    va_end(args);
-  }
-}
-
-// Returns the int32_t value pointed by ptr.
-const uint32_t* GetIntPtr(const char* ptr) {
-  return reinterpret_cast<const uint32_t*>(ptr);
-}
-
-// Verifies flatbuffer format of the model contents and returns the in-memory
-// model.
 const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) {
   ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
   if (VerifyModelBuffer(verifier)) {
@@ -49,159 +30,14 @@ const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) {
   }
 }
 
-const uint32_t kMaxNumString = UINT_MAX / sizeof(int32_t) - 2;
-
-// Verifies string tensor has legit buffer contents that follow the schema
-// defined in lite/string_util.h
-bool VerifyStringTensorBuffer(const Buffer& buffer,
-                              ErrorReporter* error_reporter) {
-  uint32_t buffer_size = buffer.data()->size();
-  const char* buffer_ptr = reinterpret_cast<const char*>(buffer.data()->data());
-
-  uint32_t num_strings = *GetIntPtr(buffer_ptr);
-  if (num_strings > kMaxNumString) {
-    ReportError(error_reporter,
-                "String tensor has invalid num of string set: %d", num_strings);
-    return false;
-  }
-  uint32_t header_offsets =
-      static_cast<uint32_t>(num_strings + 2) * sizeof(int32_t);
-
-  if (buffer_size < header_offsets) {
-    ReportError(error_reporter,
-                "String tensor buffer requires at least %d bytes, but is "
-                "allocated with %d bytes",
-                header_offsets, buffer_size);
-    return false;
-  }
-
-  uint32_t prev_ptr = header_offsets;
-  uint32_t offset = sizeof(int32_t);
-
-  if (*GetIntPtr(buffer_ptr + offset) != header_offsets) {
-    ReportError(error_reporter,
-                "String tensor buffer initial offset must be: %d",
-                header_offsets);
-    return false;
-  }
-  offset += sizeof(int32_t);
-  for (int i = 1; i <= num_strings; i++, offset += sizeof(int32_t)) {
-    int string_offset = *GetIntPtr(buffer_ptr + offset);
-    if (string_offset < prev_ptr || string_offset > buffer_size) {
-      ReportError(error_reporter, "String tensor buffer is invalid: index %d",
-                  i);
-      return false;
-    }
-  }
-  if (*GetIntPtr(buffer_ptr + offset - sizeof(int32_t)) != buffer_size) {
-    ReportError(error_reporter, "String tensor buffer last offset must be %d",
-                buffer_size);
-    return false;
-  }
-  return true;
-}
-
-// Verifies numeric tensor has legit buffer.
-bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
-                               ErrorReporter* error_reporter) {
-  uint64_t bytes_required = 1;
-  for (int dim : *tensor.shape()) {
-    bytes_required *= dim;
-    if (bytes_required > UINT_MAX) {
-      ReportError(error_reporter, "Tensor dimension overflow");
-      return false;
-    }
-  }
-  switch (tensor.type()) {
-    case TensorType_FLOAT32:
-      bytes_required *= sizeof(float);
-      break;
-    case TensorType_INT32:
-      bytes_required *= sizeof(int32_t);
-      break;
-    case TensorType_UINT8:
-      bytes_required *= sizeof(uint8_t);
-      break;
-    case TensorType_INT64:
-      bytes_required *= sizeof(int64_t);
-      break;
-    case TensorType_FLOAT16:
-      // FALLTHROUGH_INTENDED;
-    default:
-      ReportError(error_reporter, "Invalid tensor type: %d", tensor.type());
-      return false;
-  }
-  if (bytes_required > UINT_MAX) {
-    ReportError(error_reporter, "Tensor dimension overflow");
-    return false;
-  }
-
-  if (bytes_required != buffer.data()->size()) {
-    ReportError(
-        error_reporter,
-        "Tensor requires %d bytes, but is allocated with %d bytes buffer",
-        bytes_required, buffer.data()->size());
-    return false;
-  }
-  return true;
-
-  // TODO(yichengfan): verify quantized tensors.
-}
-
-// Verifies tensors have valid properties and legit buffer if set.
-bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
-  if (!model.subgraphs()) {
-    return true;
-  }
-  for (const auto& subgraph : *model.subgraphs()) {
-    if (!subgraph->tensors()) {
-      return true;
-    }
-    for (const auto& tensor : *subgraph->tensors()) {
-      if (!tensor->buffer()) {
-        return true;
-      }
-      if (tensor->buffer() >= model.buffers()->size()) {
-        ReportError(error_reporter, "Invalid tensor buffer index: %d",
-                    tensor->buffer());
-        return false;
-      }
-      auto* buffer = model.buffers()->Get(tensor->buffer());
-      if (!buffer || !buffer->data()) {
-        ReportError(error_reporter, "Tensor buffer %d not set",
-                    tensor->buffer());
-        return false;
-      }
-
-      if (tensor->type() == TensorType_STRING) {
-        if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
-          return false;
-        }
-      } else {
-        if (!VerifyNumericTensorBuffer(*tensor, *buffer, error_reporter)) {
-          return false;
-        }
-      }
-    }
-  }
-  return true;
-}
-
 }  // namespace
 
-bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter) {
+bool Verify(const void* buf, size_t len) {
   const Model* model = VerifyFlatbufferAndGetModel(buf, len);
   if (model == nullptr) {
-    ReportError(error_reporter, "Invalid flatbuffer format");
     return false;
   }
-  if (model->version() != TFLITE_SCHEMA_VERSION) {
-    ReportError(error_reporter, "Invalid model version %d", model->version());
-    return false;
-  }
-  if (!VerifyTensors(*model, error_reporter)) {
-    return false;
-  }
-  return true;
+
+  return model->version() == TFLITE_SCHEMA_VERSION;
 }
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/contrib/lite/tools/verifier.h
index d2bf3c91d5..03e1f22b7e 100644
--- a/tensorflow/contrib/lite/tools/verifier.h
+++ b/tensorflow/contrib/lite/tools/verifier.h
@@ -18,15 +18,13 @@ limitations under the License.
 
 #include <stdio.h>
 
-#include "tensorflow/contrib/lite/error_reporter.h"
-
 namespace tflite {
 
 // Verifies the integrity of a Tensorflow Lite flatbuffer model file.
 // Currently, it verifies:
 // * The file is following a legit flatbuffer schema.
 // * The model is in supported version.
-bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter);
+bool Verify(const void* buf, size_t len);
 
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
index 244d4f0396..0481a55a78 100644
--- a/tensorflow/contrib/lite/tools/verifier_test.cc
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -28,62 +28,31 @@ using flatbuffers::FlatBufferBuilder;
 using flatbuffers::Offset;
 using flatbuffers::Vector;
 
-// Build single subgraph model.
-class TfLiteFlatbufferModelBuilder {
+// Class that abstracts the list of buffers at the end of the TF Lite structure
+class DeferredBufferWriter {
  public:
-  TfLiteFlatbufferModelBuilder() {
-    buffers_.push_back(
-        CreateBuffer(builder_, builder_.CreateVector(std::vector<uint8_t>{})));
+  DeferredBufferWriter() {
+    data_.push_back({});  // sentinel empty buffer.
   }
 
-  void AddTensor(const std::vector<int>& shape, tflite::TensorType type,
-                 const std::vector<uint8_t>& buffer, const char* name) {
-    int buffer_index = 0;
-    if (!buffer.empty()) {
-      buffer_index = buffers_.size();
-      buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector(buffer)));
+  Offset<Vector<Offset<Buffer>>> BuildBuffers(FlatBufferBuilder *builder) {
+    std::vector<Offset<Buffer>> buffer_vector;
+    for (const auto &vec : data_) {
+      auto data_buffer = builder->CreateVector(vec.data(), vec.size());
+      buffer_vector.push_back(tflite::CreateBuffer(*builder, data_buffer));
     }
-    tensors_.push_back(CreateTensorDirect(builder_, &shape, type, buffer_index,
-                                          name, /*quantization=*/0));
+    return builder->CreateVector(buffer_vector);
   }
 
-  void AddOperator(const std::vector<int32_t>& inputs,
-                   const std::vector<int32_t>& outputs,
-                   tflite::BuiltinOperator builtin_op, const char* custom_op) {
-    operator_codes_.push_back(
-        CreateOperatorCodeDirect(builder_, builtin_op, custom_op));
-    operators_.push_back(CreateOperator(
-        builder_, operator_codes_.size() - 1, builder_.CreateVector(inputs),
-        builder_.CreateVector(outputs), BuiltinOptions_NONE,
-        /*builtin_options=*/0,
-        /*custom_options=*/0, tflite::CustomOptionsFormat_FLEXBUFFERS));
-  }
-
-  void FinishModel(const std::vector<int32_t>& inputs,
-                   const std::vector<int32_t>& outputs) {
-    auto subgraph = std::vector<Offset<SubGraph>>({CreateSubGraph(
-        builder_, builder_.CreateVector(tensors_),
-        builder_.CreateVector(inputs), builder_.CreateVector(outputs),
-        builder_.CreateVector(operators_),
-        builder_.CreateString("test_subgraph"))});
-    auto result = CreateModel(
-        builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(operator_codes_),
-        builder_.CreateVector(subgraph), builder_.CreateString("test_model"),
-        builder_.CreateVector(buffers_));
-    tflite::FinishModelBuffer(builder_, result);
-  }
-
-  bool Verify() {
-    return tflite::Verify(builder_.GetBufferPointer(), builder_.GetSize(),
-                          DefaultErrorReporter());
+  // Registers a buffer index and takes ownership of the data to write to it.
+  int Record(std::vector<uint8_t> data) {
+    int buffer_index = data_.size();
+    data_.emplace_back(std::move(data));
+    return buffer_index;
   }
 
  private:
-  FlatBufferBuilder builder_;
-  std::vector<Offset<Operator>> operators_;
-  std::vector<Offset<OperatorCode>> operator_codes_;
-  std::vector<Offset<Tensor>> tensors_;
-  std::vector<Offset<Buffer>> buffers_;
+  std::vector<std::vector<unsigned char>> data_;
 };
 
 TEST(VerifyModel, TestEmptyModel) {
@@ -93,26 +62,43 @@ TEST(VerifyModel, TestEmptyModel) {
                            /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
-  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                     DefaultErrorReporter()));
+  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize()));
 }
 
 TEST(VerifyModel, TestSimpleModel) {
-  TfLiteFlatbufferModelBuilder builder;
-  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
-  builder.AddTensor(
-      {2}, TensorType_STRING,
-      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
-      "data");
-  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
-  builder.FinishModel({0, 1}, {2});
-  ASSERT_TRUE(builder.Verify());
+  FlatBufferBuilder builder;
+  auto inputs = builder.CreateVector<int32_t>({0});
+  auto outputs = builder.CreateVector<int32_t>({1});
+  auto operator_codes = builder.CreateVector(std::vector<Offset<OperatorCode>>{
+      CreateOperatorCodeDirect(builder, BuiltinOperator_CUSTOM, "test")});
+  auto operators =
+      builder.CreateVector(std::vector<Offset<Operator>>{CreateOperator(
+          builder, /*opcode_index=*/0,
+          /*inputs=*/builder.CreateVector<int32_t>({0}),
+          /*outputs=*/builder.CreateVector<int32_t>({1}), BuiltinOptions_NONE,
+          /*builtin_options=*/0,
+          /*custom_options=*/0, ::tflite::CustomOptionsFormat_FLEXBUFFERS)});
+  std::vector<int> shape;
+  auto tensors = builder.CreateVector(std::vector<Offset<Tensor>>{
+      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/0,
+                         "input", /*quantization=*/0),
+      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/0,
+                         "output", /*quantization=*/0)});
+  auto subgraph = std::vector<Offset<SubGraph>>(
+      {CreateSubGraph(builder, tensors, inputs, outputs, operators,
+                      builder.CreateString("Main"))});
+
+  auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, operator_codes,
+                           builder.CreateVector(subgraph),
+                           builder.CreateString("SmartReply"), /*buffers=*/0);
+
+  ::tflite::FinishModelBuffer(builder, model);
+  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize()));
 }
 
 TEST(VerifyModel, TestCorruptedData) {
   string model = "123";
-  ASSERT_FALSE(Verify(model.data(), model.size(), /*error_reporter=*/nullptr));
+  ASSERT_FALSE(Verify(model.data(), model.size()));
 }
 
 TEST(VerifyModel, TestUnsupportedVersion) {
@@ -120,8 +106,7 @@ TEST(VerifyModel, TestUnsupportedVersion) {
   auto model = CreateModel(builder, /*version=*/1, /*operator_codes=*/0,
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
-  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                      DefaultErrorReporter()));
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize()));
 }
 
 TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
@@ -131,105 +116,20 @@ TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
-  string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
+  string model_content(reinterpret_cast<char *>(builder.GetBufferPointer()),
                        builder.GetSize());
   for (int i = 0; i < model_content.size(); i++) {
     model_content[i] = (model_content[i] + 137) % 255;
-    EXPECT_FALSE(Verify(model_content.data(), model_content.size(),
-                        DefaultErrorReporter()))
+    EXPECT_FALSE(Verify(model_content.data(), model_content.size()))
         << "Fail at position: " << i;
   }
 }
 
-TEST(VerifyModel, TestIntTensorShapeIsGreaterThanBuffer) {
-  TfLiteFlatbufferModelBuilder builder;
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input");
-  builder.FinishModel({}, {});
-  ASSERT_FALSE(builder.Verify());
-}
-
-TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
-  TfLiteFlatbufferModelBuilder builder;
-  builder.AddTensor({2, 1}, TensorType_UINT8, {1, 2, 3, 4}, "input");
-  builder.FinishModel({}, {});
-  ASSERT_FALSE(builder.Verify());
-}
-
-TEST(VerifyModel, TestIntTensorShapeOverflow) {
-  TfLiteFlatbufferModelBuilder builder;
-  builder.AddTensor({1024, 2048, 4096}, TensorType_UINT8, {1, 2, 3, 4},
-                    "input");
-  builder.FinishModel({}, {});
-  ASSERT_FALSE(builder.Verify());
-}
-
-TEST(VerifyModel, TensorBufferIsNotValid) {
-  FlatBufferBuilder builder;
-  std::vector<int> shape = {2, 3};
-  auto tensors = builder.CreateVector(std::vector<Offset<Tensor>>{
-      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/2,
-                         "input", /*quantization=*/0)});
-  auto subgraph = std::vector<Offset<SubGraph>>(
-      {CreateSubGraph(builder, tensors, /*inputs=*/0, /*outputs=*/0,
-                      /*operators=*/0, builder.CreateString("Main"))});
-
-  auto buffers = builder.CreateVector(std::vector<Offset<Buffer>>{
-      CreateBuffer(builder,
-                   builder.CreateVector(std::vector<uint8>{1, 2, 3, 4, 5, 6})),
-  });
-
-  auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, /*operator_codes=*/0,
-                           builder.CreateVector(subgraph),
-                           builder.CreateString("SmartReply"), buffers);
-
-  ::tflite::FinishModelBuffer(builder, model);
-  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                      DefaultErrorReporter()));
-}
-
-TEST(VerifyModel, StringTensorHasInvalidNumString) {
-  TfLiteFlatbufferModelBuilder builder;
-  builder.AddTensor(
-      {2}, TensorType_STRING,
-      {0x00, 0x00, 0x00, 0x20, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'},
-      "input");
-  builder.FinishModel({}, {});
-  ASSERT_FALSE(builder.Verify());
-}
-
-TEST(VerifyModel, StringTensorOffsetTooSmall) {
-  TfLiteFlatbufferModelBuilder builder;
-  builder.AddTensor(
-      {2}, TensorType_STRING,
-      {2, 0, 0, 0, 12, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, "input");
-  builder.FinishModel({}, {});
-  ASSERT_FALSE(builder.Verify());
-}
-
-TEST(VerifyModel, StringTensorOffsetOutOfRange) {
-  TfLiteFlatbufferModelBuilder builder;
-  builder.AddTensor(
-      {2}, TensorType_STRING,
-      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 22, 0, 0, 0, 'A', 'B'}, "input");
-  builder.FinishModel({}, {});
-  ASSERT_FALSE(builder.Verify());
-}
-
-TEST(VerifyModel, StringTensorIsLargerThanRequired) {
-  TfLiteFlatbufferModelBuilder builder;
-  builder.AddTensor(
-      {2}, TensorType_STRING,
-      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B', 'C'},
-      "input");
-  builder.FinishModel({}, {});
-  ASSERT_FALSE(builder.Verify());
-}
-
 // TODO(yichengfan): make up malicious files to test with.
 
 }  // namespace tflite
 
-int main(int argc, char** argv) {
+int main(int argc, char **argv) {
   ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
-- 
GitLab


From faaa74db7134a2195a6334ab86947005e173db1b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 31 Jan 2018 13:42:18 -0800
Subject: [PATCH 472/966] Tolerate Const nodes with no data or with smaller
 data than is required by their shape, by zero-extending the Const data to the
 required size.

We wanted to generate an error on that, but too many
existing graphs already rely on current lax behavior.

PiperOrigin-RevId: 184039876
---
 tensorflow/contrib/lite/toco/import_tensorflow.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index ca378af4c5..9862dbe99d 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -173,7 +173,8 @@ void ImportFloatArray(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_float_data =
       output_array->GetMutableBuffer<ArrayDataType::kFloat>().data;
-  output_float_data.resize(input_flat_size);
+  output_float_data.resize(RequiredBufferSizeForShape(output_array->shape()),
+                           0.f);
   if (input_tensor.float_val_size() == 1) {
     for (int i = 0; i < input_flat_size; i++) {
       output_float_data[i] = input_tensor.float_val(0);
@@ -203,7 +204,7 @@ void ImportQuint8Array(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kUint8>().data;
-  output_int_data.resize(input_flat_size);
+  output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
   if (input_tensor.int_val_size()) {
     for (int i = 0; i < input_tensor.int_val_size(); i++) {
       output_int_data[i] = input_tensor.int_val(i);
@@ -229,7 +230,7 @@ void ImportInt32Array(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt32>().data;
-  output_int_data.resize(input_flat_size);
+  output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
   if (input_tensor.int_val_size()) {
     for (int i = 0; i < input_tensor.int_val_size(); i++) {
       output_int_data[i] = input_tensor.int_val(i);
@@ -255,7 +256,7 @@ void ImportInt64Array(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt64>().data;
-  output_int_data.resize(input_flat_size);
+  output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
   if (input_tensor.int64_val_size()) {
     for (int i = 0; i < input_tensor.int64_val_size(); i++) {
       output_int_data[i] = input_tensor.int64_val(i);
@@ -281,7 +282,7 @@ void ImportStringArray(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_string_data =
       output_array->GetMutableBuffer<ArrayDataType::kString>().data;
-  output_string_data.resize(input_flat_size);
+  output_string_data.resize(RequiredBufferSizeForShape(output_array->shape()));
   if (input_flat_size != input_tensor.string_val_size()) {
     LOG(FATAL) << "Input_content string_val doesn't have the right "
                   "dimensions for this string tensor.";
-- 
GitLab


From b47b2a7f2e500e261b7e77e30174a3ea121261c8 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Wed, 31 Jan 2018 14:30:22 -0800
Subject: [PATCH 473/966] Internal change.

PiperOrigin-RevId: 184047860
---
 tensorflow/tools/test/run_and_gather_logs_lib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/test/run_and_gather_logs_lib.py b/tensorflow/tools/test/run_and_gather_logs_lib.py
index a953ed1b53..3b4921bb98 100644
--- a/tensorflow/tools/test/run_and_gather_logs_lib.py
+++ b/tensorflow/tools/test/run_and_gather_logs_lib.py
@@ -136,7 +136,7 @@ def run_and_gather_logs(name, test_name, test_args,
   gpu_config = gpu_info_lib.gather_gpu_devices()
   if gpu_config:
     gpu_name = gpu_config[0].model
-    gpu_short_name_match = re.search(r"Tesla (K40|K80|P100)", gpu_name)
+    gpu_short_name_match = re.search(r"Tesla (K40|K80|P100|V100)", gpu_name)
     if gpu_short_name_match:
       gpu_short_name = gpu_short_name_match.group(0)
       test_adjusted_name = name + "|" + gpu_short_name.replace(" ", "_")
-- 
GitLab


From 5260cecbb46360b522d284573caf1ae17dcb677a Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Wed, 31 Jan 2018 14:32:21 -0800
Subject: [PATCH 474/966] [XLA] Initialize linear indices used by
 NearComparator.

In unoptimized builds, we'd see miscompares with insanely large indices when
comparing literals containing NaN.

The linear indices may never be updated/initialized if the value NaN was
compared, since:

(NaN > x) == false
(NaN < x) == false

Adds a few tests that were used when debugging, but not a perfect one for this
case. It's currently not possible to test that LiteralTestUtil::Near() fails
when given bad input since it uses the EXPECT_* family of macros, which would
cause the intentional miscompares to fail the test no matter what.

PiperOrigin-RevId: 184048275
---
 .../compiler/xla/tests/literal_test_util.cc   |  8 +++++--
 .../xla/tests/literal_test_util_test.cc       | 24 +++++++++++++++++++
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 39c07297d6..474d2547ae 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -376,6 +376,10 @@ class NearComparator {
     abs_expected_miscompare_sum_ = 0.0;
     max_rel_err_ = 0.0;
     max_abs_err_ = 0.0;
+    first_linear_index_ = -1;
+    last_linear_index_ = -1;
+    max_rel_linear_index_ = -1;
+    max_abs_linear_index_ = -1;
     miscompares_ = Literal(ShapeUtil::ChangeElementType(actual.shape(), PRED));
     miscompares_.PopulateWithValue(false);
     multi_index_.resize(expected.shape().dimensions_size(), 0);
@@ -482,11 +486,11 @@ class NearComparator {
     const float rel_err = abs_diff / std::abs(expected);
     abs_diff_sum_ += abs_diff;
     abs_expected_sum_ += std::abs(expected);
-    if (rel_err > max_rel_err_) {
+    if (rel_err > max_rel_err_ || std::isnan(rel_err)) {
       max_rel_err_ = rel_err;
       max_rel_linear_index_ = linear_index;
     }
-    if (abs_diff > max_abs_err_) {
+    if (abs_diff > max_abs_err_ || std::isnan(abs_diff)) {
       max_abs_err_ = abs_diff;
       max_abs_linear_index_ = linear_index;
     }
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index e477784557..3a421f8458 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -97,5 +97,29 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
   }
 }
 
+TEST(LiteralTestUtilTest, NearComparatorR1) {
+  auto a =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
+  auto b =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
+  EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001}));
+}
+
+TEST(LiteralTestUtilTest, NearComparatorR1Nan) {
+  auto a =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8});
+  auto b =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8});
+  EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001}));
+}
+
+TEST(LiteralTestUtil, NearComparatorDifferentLengths) {
+  auto a =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
+  auto b = Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7});
+  EXPECT_FALSE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001}));
+  EXPECT_FALSE(LiteralTestUtil::Near(*b, *a, ErrorSpec{0.0001}));
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 65002aa25f8f77987e7183e972966b62208e384a Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Wed, 31 Jan 2018 14:35:37 -0800
Subject: [PATCH 475/966] Fixes minor typos.

PiperOrigin-RevId: 184048812
---
 .../python/kernel_tests/dataset_serialization_test_base.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index 4574a625a3..3f64475e47 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -41,8 +41,9 @@ class DatasetSerializationTestBase(test.TestCase):
   def tearDown(self):
     self._delete_ckpt()
 
-  # TODO(b/72657739): Remove sparse_tensor argument to test the (deprecated)
-  # `from_sparse_tensor_slices()` API once the API and tests are deleted.
+  # TODO(b/72657739): Remove sparse_tensor argument, which is to test the
+  # (deprecated) saveable `SparseTensorSliceDataset`, once the API
+  # `from_sparse_tensor_slices()`and related tests are deleted.
   def run_core_tests(self, ds_fn1, ds_fn2, num_outputs, sparse_tensors=False):
     """Runs the core tests.
 
@@ -589,7 +590,7 @@ class DatasetSerializationTestBase(test.TestCase):
     # `_get_iterator_ops_from_collection`.
 
     # TODO(shivaniagrwal): `output_classes` is a nested structure of classes,
-    # this base class is specific to current test cases. Update when test are
+    # this base class is specific to current test cases. Update when tests are
     # added with `output_classes` as a nested structure with at least one of the
     # component being `tf.SparseTensor`.
     if (sparse_tensors or
-- 
GitLab


From 549581f635a439ddcc6fdabf05f972c8ced1738d Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Wed, 31 Jan 2018 14:55:13 -0800
Subject: [PATCH 476/966] Add grpcio as a pip dependency of tensorflow

PiperOrigin-RevId: 184052073
---
 tensorflow/tools/pip_package/setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 62df6453fb..1b1d60c4f3 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -35,6 +35,7 @@ REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
+    'grpcio >= 1.8.6',
     'numpy >= 1.12.1',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-- 
GitLab


From b976a142ac5ef8b3088f0adf8c1a5aa1503a01d7 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Wed, 31 Jan 2018 14:57:24 -0800
Subject: [PATCH 477/966] Enable AVX in all TF windows builds.

PiperOrigin-RevId: 184052414
---
 tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat | 2 +-
 tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
index 957729bb37..c1bc718507 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
@@ -36,7 +36,7 @@ SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
 SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 
 :: Run cmake to create Visual Studio Project files.
-%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE%
+%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
 
 :: Run msbuild in the resulting VS project files to build a pip package.
 %MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
index 5a362de399..b87e4a9bec 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
@@ -37,7 +37,7 @@ SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
 SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 
 :: Run cmake to create Visual Studio Project files.
-%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE%
+%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
 
 :: Run msbuild in the resulting VS project files to build a pip package.
 %MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
-- 
GitLab


From 07bec47ba5db4c2f2e33ecb49f23253a371bfbbe Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 31 Jan 2018 15:32:39 -0800
Subject: [PATCH 478/966] Update external protobuf codebase version for Windows
 cmake build

PiperOrigin-RevId: 184057827
---
 tensorflow/contrib/cmake/external/protobuf.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index aedb793d2a..fd05fa6d47 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(PROTOBUF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src)
 set(PROTOBUF_URL https://github.com/google/protobuf.git)
-set(PROTOBUF_TAG b04e5cba356212e4e8c66c61bbe0c3a20537c5b9)
+set(PROTOBUF_TAG 396336eb961b75f03b25824fe86cf6490fb75e3a)
 
 if(WIN32)
   set(protobuf_STATIC_LIBRARIES 
-- 
GitLab


From d074556997f3e8aad3a1ca2bcd723dc590777aeb Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Wed, 31 Jan 2018 18:49:26 -0800
Subject: [PATCH 479/966] Fix missing defines in compilation flags that caused
 missing ops and add a simple test to validate functionality

---
 tensorflow/contrib/tensorrt/BUILD             |  7 +--
 .../contrib/tensorrt/kernels/trt_engine_op.cc |  2 +
 .../contrib/tensorrt/test/test_tftrt.py       | 53 +++++++++++++++++++
 3 files changed, 57 insertions(+), 5 deletions(-)
 create mode 100644 tensorflow/contrib/tensorrt/test/test_tftrt.py

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index c0bba9f950..6e46f95fcb 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -52,8 +52,6 @@ tf_custom_op_library(
         ":trt_engine_op_kernel",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core/kernels:bounds_check_lib",
-        "//tensorflow/core/kernels:ops_util_hdrs",
         "@local_config_tensorrt//:nv_infer",
     ],
 )
@@ -77,8 +75,10 @@ cc_library(
     name = "trt_engine_op_kernel",
     srcs = ["kernels/trt_engine_op.cc"],
     hdrs = ["kernels/trt_engine_op.h"],
+    copts = tf_copts(),
     deps = [
         ":trt_logging",
+        "//tensorflow/core:stream_executor_headers_lib",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:gpu_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
@@ -212,9 +212,6 @@ cc_library(
     linkstatic = 1,
     deps = [
         "//tensorflow/core:graph",
-        # "//tensorflow/core:core_cpu",
-        # "//tensorflow/core:lib_proto_parsing",
-        # "//third_party/eigen3",
         "@protobuf_archive//:protobuf_headers",
     ],
 )
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 983c67677f..080e246458 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include "cuda/include/cuda_runtime_api.h"
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/core/platform/logging.h"
+//#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/platform/stream_executor.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
new file mode 100644
index 0000000000..06b6f64c4a
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -0,0 +1,53 @@
+#  Script to test TF-TensorRT integration
+#
+#
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+import tensorflow.contrib.tensorrt as trt
+import numpy as np
+
+def getSimpleGraphDef():
+    ''' 
+    Create a simple graph and return its graph_def
+    '''
+    g=tf.Graph()
+    with g.as_default():
+        A=tf.placeholder(dtype=tf.float32,shape=(None,24,24,2),name="input")
+        e=tf.constant([ [[[ 1., 0.5, 4., 6., 0.5, 1. ],
+                        [ 1., 0.5, 1., 1., 0.5, 1. ]]] ],
+                      name="weights",dtype=tf.float32)
+        conv=tf.nn.conv2d(input=A,filter=e,strides=[1,2,2,1],padding="SAME",name="conv")
+        b=tf.constant([  4., 1.5, 2., 3., 5., 7. ],
+                      name="bias",dtype=tf.float32)
+        t=tf.nn.bias_add(conv,b,name="biasAdd")
+        relu=tf.nn.relu(t,"relu")
+        idty=tf.identity(relu,"ID")
+        v=tf.nn.max_pool(idty,[1,2,2,1],[1,2,2,1],"VALID",name="max_pool")
+        out = tf.squeeze(v,name="output")
+    return g.as_graph_def()
+
+def runGraph(gdef,dumm_inp):
+    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.50)
+    tf.reset_default_graph()
+    g=tf.Graph()
+    with g.as_default():
+        inp,out=tf.import_graph_def(graph_def=gdef,
+                            return_elements=["input","output"])
+        inp=inp.outputs[0]
+        out=out.outputs[0]
+    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options),
+                    graph=g) as sess:
+        val=sess.run(out,{inp:dumm_inp})
+    return val
+if "__main__" in  __name__:
+    inpDims=(100,24,24,2)
+    dummy_input=np.random.random_sample(inpDims)
+    gdef=getSimpleGraphDef() #get graphdef
+    trt_graph=trt.CreateInferenceGraph(gdef,["output"],inpDims[0]) # get optimized graph
+    o1=runGraph(gdef,dummy_input)
+    o2=runGraph(trt_graph,dummy_input)
+    assert(np.array_equal(o1,o2))
+    
-- 
GitLab


From 970efe54114f5064b35ad94e938c83977e771bc2 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Wed, 31 Jan 2018 18:58:24 -0800
Subject: [PATCH 480/966] Update ISSUE_TEMPLATE.md (#16635)

Fixes 16350
---
 ISSUE_TEMPLATE.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index 1a401997c6..2f3df7cda9 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -4,7 +4,7 @@ https://stackoverflow.com/questions/tagged/tensorflow
 
 If you open a GitHub issue, here is our policy:
 
-1. It must be a bug or a feature request.
+1. It must be a bug, a feature request, or a significant problem with documentation (for small docs fixes please send a PR instead).
 2. The form below must be filled out.
 3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorboard/issues).
 
-- 
GitLab


From d04292cbe15c3b8e61edc4587ca1663572ca05c2 Mon Sep 17 00:00:00 2001
From: Cole Gerdemann <corvettecole@gmail.com>
Date: Wed, 31 Jan 2018 20:58:56 -0600
Subject: [PATCH 481/966] Fixed typo (#16610)

---
 .../src/org/tensorflow/demo/tracking/MultiBoxTracker.java     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
index 2fe2ba539e..af6af2bc8f 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
@@ -199,7 +199,7 @@ public class MultiBoxTracker {
       final int w,
       final int h,
       final int rowStride,
-      final int sensorOrienation,
+      final int sensorOrientation,
       final byte[] frame,
       final long timestamp) {
     if (objectTracker == null && !initialized) {
@@ -209,7 +209,7 @@ public class MultiBoxTracker {
       objectTracker = ObjectTracker.getInstance(w, h, rowStride, true);
       frameWidth = w;
       frameHeight = h;
-      this.sensorOrientation = sensorOrienation;
+      this.sensorOrientation = sensorOrientation;
       initialized = true;
 
       if (objectTracker == null) {
-- 
GitLab


From 72074b7d74df8283a8f86ad8e102d8844eae13ad Mon Sep 17 00:00:00 2001
From: simsicon <simsicon@gmail.com>
Date: Thu, 1 Feb 2018 10:59:10 +0800
Subject: [PATCH 482/966] Remove duplicated identical lines (#16573)

---
 tensorflow/examples/tutorials/word2vec/word2vec_basic.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index d055d15745..f6906b0f79 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -270,12 +270,6 @@ with tf.Session(graph=graph) as session:
         run_metadata=run_metadata)
     average_loss += loss_val
 
-    # Add returned summaries to writer in each step.
-    writer.add_summary(summary, step)
-    # Add metadata to visualize the graph for the last run.
-    if step == (num_steps - 1):
-      writer.add_run_metadata(run_metadata, 'step%d' % step)
-
     # Add returned summaries to writer in each step.
     writer.add_summary(summary, step)
     # Add metadata to visualize the graph for the last run.
-- 
GitLab


From 3d958a5a4bb0009854e8dfa48987c04c0897cd96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dandelion=20Man=C3=A9?= <danmane@gmail.com>
Date: Wed, 31 Jan 2018 18:59:31 -0800
Subject: [PATCH 483/966] Replace 'Dan' with 'Dandelion' in the citations
 (#16630)

---
 tensorflow/docs_src/about/bib.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/about/bib.md b/tensorflow/docs_src/about/bib.md
index c9f0c532c6..5593a3d95c 100644
--- a/tensorflow/docs_src/about/bib.md
+++ b/tensorflow/docs_src/about/bib.md
@@ -60,7 +60,7 @@ author={
     Lukasz~Kaiser and
     Manjunath~Kudlur and
     Josh~Levenberg and
-    Dan~Man\'{e} and
+    Dandelion~Man\'{e} and
     Rajat~Monga and
     Sherry~Moore and
     Derek~Murray and
-- 
GitLab


From ec92f62e039055fa1fbc844ecde57017e8dd3311 Mon Sep 17 00:00:00 2001
From: Colin Raffel <craffel@gmail.com>
Date: Wed, 31 Jan 2018 19:00:33 -0800
Subject: [PATCH 484/966] Remove query_layer in LuongMonotonicAttention
 (#16602)

In the constructor for LuongMonotonicAttention, a query layer was being created but it was ultimately never used.

Fixes #16287.
---
 tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 95dea312f3..d6b5eceb47 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -924,8 +924,7 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
         _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
         seed=sigmoid_noise_seed)
     super(LuongMonotonicAttention, self).__init__(
-        query_layer=layers_core.Dense(
-            num_units, name="query_layer", use_bias=False, dtype=dtype),
+        query_layer=None,
         memory_layer=layers_core.Dense(
             num_units, name="memory_layer", use_bias=False, dtype=dtype),
         memory=memory,
-- 
GitLab


From 80bfeb893245ed69079b04fdb7fcb57edd8cb766 Mon Sep 17 00:00:00 2001
From: hsm207 <hsm207@users.noreply.github.com>
Date: Wed, 31 Jan 2018 22:00:51 -0500
Subject: [PATCH 485/966] Fix typos. (#16570)

---
 tensorflow/docs_src/programmers_guide/graphs.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index 2b4896c381..9049a5a9f3 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -125,14 +125,14 @@ an operation:
   @{tf.Tensor} accepts an optional `name` argument. For example,
   `tf.constant(42.0, name="answer")` creates a new @{tf.Operation} named
   `"answer"` and returns a @{tf.Tensor} named `"answer:0"`. If the default graph
-  already contained an operation named `"answer"`, the TensorFlow would append
+  already contains an operation named `"answer"`, then TensorFlow would append
   `"_1"`, `"_2"`, and so on to the name, in order to make it unique.
 
 * The @{tf.name_scope} function makes it possible to add a **name scope** prefix
   to all operations created in a particular context. The current name scope
   prefix is a `"/"`-delimited list of the names of all active @{tf.name_scope}
   context managers. If a name scope has already been used in the current
-  context, TensorFlow appens `"_1"`, `"_2"`, and so on. For example:
+  context, TensorFlow appends `"_1"`, `"_2"`, and so on. For example:
 
   ```python
   c_0 = tf.constant(0, name="c")  # => operation named "c"
-- 
GitLab


From 94c6c14ea0088ea070cf304d0b1ab638f20bf785 Mon Sep 17 00:00:00 2001
From: ImSheridan <xiaoyudong-512@163.com>
Date: Thu, 1 Feb 2018 11:03:18 +0800
Subject: [PATCH 486/966] Fix the FutureWarning: Conversion of the second
 argument of issubdtype from `float` to `np.floating` is deprecated (#16591)

---
 tensorflow/contrib/learn/python/learn/estimators/dnn_test.py | 2 +-
 tensorflow/python/debug/cli/tensor_format.py                 | 2 +-
 tensorflow/python/debug/lib/debug_data.py                    | 2 +-
 tensorflow/python/eager/execution_callbacks.py               | 2 +-
 tensorflow/python/kernel_tests/topk_op_test.py               | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
index 12f9bba531..2bd57597c2 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
@@ -1224,7 +1224,7 @@ class DNNRegressorTest(test.TestCase):
       self, predictions, expected_shape):
     predictions_nparray = np.array(predictions)
     self.assertAllEqual(expected_shape, predictions_nparray.shape)
-    self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.float))
+    self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.floating))
 
   def testPredict_AsIterableFalse(self):
     """Tests predict method with as_iterable=False."""
diff --git a/tensorflow/python/debug/cli/tensor_format.py b/tensorflow/python/debug/cli/tensor_format.py
index d4aea76d65..e0759a8bc1 100644
--- a/tensorflow/python/debug/cli/tensor_format.py
+++ b/tensorflow/python/debug/cli/tensor_format.py
@@ -535,7 +535,7 @@ def numeric_summary(tensor):
   if not isinstance(tensor, np.ndarray) or not np.size(tensor):
     return debugger_cli_common.RichTextLines([
         "No numeric summary available due to empty tensor."])
-  elif (np.issubdtype(tensor.dtype, np.float) or
+  elif (np.issubdtype(tensor.dtype, np.floating) or
         np.issubdtype(tensor.dtype, np.complex) or
         np.issubdtype(tensor.dtype, np.integer)):
     counts = [
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index c4b13a1045..8d355aa27f 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -222,7 +222,7 @@ def has_inf_or_nan(datum, tensor):
     # Also return False for data types that cannot be represented as numpy
     # arrays.
     return False
-  elif (np.issubdtype(tensor.dtype, np.float) or
+  elif (np.issubdtype(tensor.dtype, np.floating) or
         np.issubdtype(tensor.dtype, np.complex) or
         np.issubdtype(tensor.dtype, np.integer)):
     return np.any(np.isnan(tensor)) or np.any(np.isinf(tensor))
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 2f1654dda4..988442c971 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -153,7 +153,7 @@ def inf_nan_callback(op_type,
       continue
 
     numpy_dtype = output.dtype.as_numpy_dtype
-    if (np.issubdtype(numpy_dtype, np.float) or
+    if (np.issubdtype(numpy_dtype, np.floating) or
         np.issubdtype(numpy_dtype, np.complex) or
         np.issubdtype(numpy_dtype, np.integer)):
       try:
diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py
index efb5b9f364..6ab931fdb9 100644
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@@ -58,7 +58,7 @@ class TopKTest(test.TestCase):
         # Do some special casing of equality of indices: if indices
         # are not the same, but values are floating type, ensure that
         # the values are within epsilon of each other.
-        if not np.issubdtype(np_expected_values.dtype, np.float):
+        if not np.issubdtype(np_expected_values.dtype, np.floating):
           # Values are not floating point type; check indices exactly
           self.assertAllEqual(np_expected_indices, indices)
         else:
-- 
GitLab


From 1fc1fb4a0c30d498cd2bb1d8deccce27a63f74b2 Mon Sep 17 00:00:00 2001
From: Tatiana Shpeisman <shpeisman@google.com>
Date: Wed, 31 Jan 2018 15:41:03 -0800
Subject: [PATCH 487/966] Change recommended option for building TensorFlow
 with MKL from -c opt to --config=opt. -c opt triggers optimized C++
 compilation. --config=opt also uses additional optimization flags as set by
 running ./configure.

PiperOrigin-RevId: 184059060
---
 tensorflow/docs_src/performance/performance_guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 10e7ad7ada..cd47fc2803 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -498,7 +498,7 @@ For TensorFlow source versions after 1.3.0:
 ```bash
 ./configure
 # Pick the desired options
-bazel build --config=mkl -c opt //tensorflow/tools/pip_package:build_pip_package
+bazel build --config=mkl --config=opt //tensorflow/tools/pip_package:build_pip_package
 
 ```
 
-- 
GitLab


From 371923a0ac7b76dc5cbdd8d100679a20e594b827 Mon Sep 17 00:00:00 2001
From: Tatiana Shpeisman <shpeisman@google.com>
Date: Wed, 31 Jan 2018 15:47:42 -0800
Subject: [PATCH 488/966] MKL is no longer enabled via ./configure. Fixed
 documentation to reflect this.

PiperOrigin-RevId: 184060046
---
 tensorflow/docs_src/install/install_sources.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index f494cc7a7c..485863bf2e 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -272,8 +272,6 @@ Found possible Python library paths:
 Please input the desired Python library path to use.  Default is [/usr/lib/python2.7/dist-packages]
 
 Using python library path: /usr/local/lib/python2.7/dist-packages
-Do you wish to build TensorFlow with MKL support? [y/N]
-No MKL support will be enabled for TensorFlow
 Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]:
 Do you wish to use jemalloc as the malloc implementation? [Y/n]
 jemalloc enabled
-- 
GitLab


From 90ac0229a95f3f0e4f29b359e8e12cf8054064ef Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Wed, 31 Jan 2018 16:17:42 -0800
Subject: [PATCH 489/966] eager: Fix dropout in MNIST example.

The dropout layer takes a training argument in __call__, which defaults
to false. So without this change, dropout was not being applied.

PiperOrigin-RevId: 184064465
---
 tensorflow/contrib/eager/python/examples/mnist/mnist.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
index 2a7be95811..ed7dbc8904 100644
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
@@ -95,8 +95,7 @@ class MNISTModel(tfe.Network):
     x = self.max_pool2d(x)
     x = tf.layers.flatten(x)
     x = self.fc1(x)
-    if training:
-      x = self.dropout(x)
+    x = self.dropout(x, training=training)
     x = self.fc2(x)
     return x
 
-- 
GitLab


From ddb814cfdf1047187b7e09f53618a6d932b4b64f Mon Sep 17 00:00:00 2001
From: Justin Lebar <justin.lebar@gmail.com>
Date: Wed, 31 Jan 2018 19:15:44 -0800
Subject: [PATCH 490/966] Add note about ptxas bug affecting XLA:GPU to
 relnotes. (#16636)

---
 RELEASE.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/RELEASE.md b/RELEASE.md
index e63971ef93..d3b4037061 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -5,6 +5,27 @@
 * Starting from 1.6 release, our prebuilt binaries will use AVX instructions.
   This may break TF on older CPUs.
 
+## Known Bugs
+* Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or
+  `CUDA_ILLEGAL_ADDRESS` failures.
+
+  Google discovered in mid-December 2017 that the PTX-to-SASS compiler in CUDA 9
+  and CUDA 9.1 sometimes does not properly compute the carry bit when
+  decomposing 64-bit address calculations with large offsets (e.g. `load [x +
+  large_constant]`) into 32-bit arithmetic in SASS.
+
+  As a result, these versions of `ptxas` miscompile most XLA programs which use
+  more than 4GB of temp memory.  This results in garbage results and/or
+  `CUDA_ERROR_ILLEGAL_ADDRESS` failures.
+
+  A fix in CUDA 9.1.121 is expected in late February 2018.  We do not expect a
+  fix for CUDA 9.0.x.  Until the fix is available, the only workaround is to
+  [downgrade](https://developer.nvidia.com/cuda-toolkit-archive) to CUDA 8.0.x
+  or disable XLA:GPU.
+
+  TensorFlow will print a warning if you use XLA:GPU with a known-bad version of
+  CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122.
+
 ## Major Features And Improvements
 * [Eager execution](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/eager)
   preview version is now available.
-- 
GitLab


From 70e1d48ef5d9d9025c1fdb5777acfc1d864a9f30 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 31 Jan 2018 16:40:10 -0800
Subject: [PATCH 491/966] Add type check when assigning to a resource variable
 slice.

PiperOrigin-RevId: 184067663
---
 tensorflow/core/kernels/strided_slice_op.cc   |  5 ++++
 .../python/kernel_tests/array_ops_test.py     | 26 +++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 8f7f91c9df..7745effe2a 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -294,6 +294,11 @@ class StridedSliceAssignOp : public OpKernel {
       OP_REQUIRES_OK(context,
                      LookupResource(context, HandleFromInput(context, 0), &v));
       old_lhs = *v->tensor();
+      OP_REQUIRES(context, old_lhs.dtype() == DataTypeToEnum<T>::value,
+                  errors::InvalidArgument(
+                      "l-value dtype ", DataTypeString(old_lhs.dtype()),
+                      " does not match r-value dtype ",
+                      DataTypeString(DataTypeToEnum<T>::value)));
     } else {
       context->forward_ref_input_to_ref_output(0, 0);
       old_lhs = context->mutable_input(0, true);
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index a96b88d96f..82dbe90002 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -952,6 +952,32 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
         v = variables.Variable([1, 2])
         sess.run(v[:].assign([1, 2]))
 
+  def testTypeError(self):
+    init_val = constant_op.constant([1, 2], dtype=dtypes.int32)
+    too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8)
+    too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64)
+    v = variables.Variable(init_val)
+    with self.assertRaises(TypeError):
+      v[:].assign(too_small_val)
+    with self.assertRaises(TypeError):
+      v[:].assign(too_large_val)
+
+  def testTypeErrorResource(self):
+    init_val = constant_op.constant([1, 2], dtype=dtypes.int32)
+    too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8)
+    too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64)
+    v = resource_variable_ops.ResourceVariable(init_val)
+    with self.test_session() as sess:
+      sess.run(v.initializer)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "l-value dtype int32 does not match r-value dtype int64"):
+        sess.run(v[:].assign(too_large_val))
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "l-value dtype int32 does not match r-value dtype int8"):
+        sess.run(v[:].assign(too_small_val))
+
 
 class ShapeSizeRankTest(test_util.TensorFlowTestCase):
 
-- 
GitLab


From 516444248d7b565e7a17504955f0f37dc447b36c Mon Sep 17 00:00:00 2001
From: Roy Frostig <frostig@google.com>
Date: Wed, 31 Jan 2018 16:54:20 -0800
Subject: [PATCH 492/966] [XLA] Support array layout specification in local
 Python XLA client.

PiperOrigin-RevId: 184069454
---
 .../xla/python/local_computation_builder.cc   |  50 +++++--
 .../xla/python/local_computation_builder.h    |  13 +-
 .../xla/python/local_computation_builder.i    |  66 +++++++--
 .../compiler/xla/python/numpy_bridge.cc       | 138 ++++++++++--------
 tensorflow/compiler/xla/python/numpy_bridge.h |  10 +-
 tensorflow/compiler/xla/python/xla_client.py  | 117 ++++++++++-----
 6 files changed, 264 insertions(+), 130 deletions(-)

diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 67a73bc33d..9e3cf79383 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -98,15 +98,25 @@ const std::unique_ptr<ScopedShapedBuffer>& LocalShapedBuffer::shaped_buffer()
   return shaped_buffer_;
 }
 
+static StatusOr<std::unique_ptr<ScopedShapedBuffer>> ToBuffer(
+    LocalClient* client, int device_ordinal, const Literal& arg) {
+  return client->LiteralToShapedBuffer(arg, device_ordinal,
+                                       client->backend().memory_allocator());
+}
+
 /* static */
-LocalShapedBuffer* LocalShapedBuffer::FromLiteral(const Literal& argument) {
+LocalShapedBuffer* LocalShapedBuffer::FromLiteral(
+    const Literal& argument,
+    const tensorflow::gtl::optional<Shape>& shape_with_layout) {
   LocalClient* client = GetOrCreateLocalClient();
-  std::unique_ptr<ScopedShapedBuffer> buf =
-      client
-          ->LiteralToShapedBuffer(argument,
-                                  /*device_ordinal=*/0,
-                                  client->backend().memory_allocator())
-          .ConsumeValueOrDie();
+  std::unique_ptr<ScopedShapedBuffer> buf;
+  if (shape_with_layout) {
+    std::unique_ptr<Literal> relaid =
+        argument.Relayout(shape_with_layout.value());
+    buf = ToBuffer(client, /*device_ordinal=*/0, *relaid).ConsumeValueOrDie();
+  } else {
+    buf = ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie();
+  }
   return new LocalShapedBuffer(std::move(buf));
 }
 
@@ -120,7 +130,8 @@ CompiledLocalComputation::CompiledLocalComputation(
     : executable_(std::move(executable)) {}
 
 StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
-    const std::vector<Literal>& arguments) {
+    const std::vector<Literal>& arguments,
+    const std::vector<tensorflow::gtl::optional<Shape>>& shapes_with_layout) {
   LocalClient* client = GetOrCreateLocalClient();
 
   VLOG(1) << "Execution requested with " << GetReplicaCount() << " replicas.";
@@ -133,7 +144,8 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
                                         GetReplicaCount());
 
     for (int replica = 0; replica < GetReplicaCount(); ++replica) {
-      pool.Schedule([this, client, replica, &arguments, &results] {
+      pool.Schedule([this, client, replica, &arguments, &shapes_with_layout,
+                     &results] {
         StatusOr<int> device_ordinal_status =
             client->ReplicaNumberToDeviceOrdinal(replica);
         if (!device_ordinal_status.ok()) {
@@ -144,18 +156,28 @@ StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
         VLOG(3) << "Replica " << replica
                 << " mapped to device ordinal for execution: "
                 << device_ordinal;
+
         // Transfer arguments in
         std::vector<std::unique_ptr<ScopedShapedBuffer>> scoped_buffers;
         scoped_buffers.reserve(arguments.size());
-        for (const Literal& argument : arguments) {
-          StatusOr<std::unique_ptr<ScopedShapedBuffer>> pushed =
-              client->LiteralToShapedBuffer(
-                  argument, device_ordinal,
-                  client->backend().memory_allocator());
+        for (int i = 0; i < arguments.size(); ++i) {
+          const Literal& argument = arguments[i];
+          const tensorflow::gtl::optional<Shape>& shape_with_layout =
+              shapes_with_layout[i];
+
+          StatusOr<std::unique_ptr<ScopedShapedBuffer>> pushed;
+          if (shape_with_layout) {
+            std::unique_ptr<Literal> relaid =
+                argument.Relayout(shape_with_layout.value());
+            pushed = ToBuffer(client, device_ordinal, *relaid);
+          } else {
+            pushed = ToBuffer(client, device_ordinal, argument);
+          }
           if (!pushed.ok()) {
             results[replica] = pushed.status();
             return;
           }
+
           scoped_buffers.push_back(std::move(pushed).ValueOrDie());
         }
 
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index d5c4c58040..ae5bbc03cb 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -59,7 +59,9 @@ StatusOr<std::unique_ptr<Literal> > TransferFromOutfeedLocalReplica(
 // client.
 class LocalShapedBuffer {
  public:
-  static LocalShapedBuffer* FromLiteral(const Literal& argument);
+  static LocalShapedBuffer* FromLiteral(
+      const Literal& argument,
+      const tensorflow::gtl::optional<Shape>& shape_with_layout);
   LocalShapedBuffer(std::unique_ptr<ScopedShapedBuffer> shaped_buffer);
   const std::unique_ptr<ScopedShapedBuffer>& shaped_buffer() const;
   std::unique_ptr<Literal> ToLiteral() const;
@@ -77,8 +79,15 @@ class LocalShapedBuffer {
 class CompiledLocalComputation {
  public:
   CompiledLocalComputation(std::unique_ptr<LocalExecutable> executable);
+
+  // Execute the computation with the given argument literals, and
+  // with optionally-specified argument layouts. The literals will be
+  // re-laid out according to the corresponding elements of
+  // shapes_with_layout.
   StatusOr<std::unique_ptr<Literal> > Execute(
-      const std::vector<Literal>& arguments);
+      const std::vector<Literal>& arguments,
+      const std::vector<tensorflow::gtl::optional<Shape> >& shapes_with_layout);
+
   LocalShapedBuffer* ExecuteWithShapedBuffers(
       tensorflow::gtl::ArraySlice<LocalShapedBuffer*> argument_handles);
 
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 89f8385501..fdd05692fc 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -27,8 +27,9 @@ limitations under the License.
 //  ArraySlice<ComputationDataHandle>  <-  sequence of int
 //  Literal                            <-> (nested tuple of) numpy ndarray
 //  std::vector<Literal>               <-  sequence of (nested tuple of) ndarray
-//  Shape                              <-> pair holding (dtype, dimensions)
-//  std::vector<Shape>                 <-  sequence of shape information pairs
+//  Shape                               -> pair holding (dtype, dimensions)
+//                                     <-  object duck-typed as xla_client.Shape
+//  std::vector<Shape>                 <-  sequence of xla_client.Shape objects
 //  PrimitiveType                      <-  int
 //  ArraySlice<pair<int64, in64>>      <-  sequence of int pairs
 //  PaddingConfig proto                <-  corresponding Python proto
@@ -55,7 +56,7 @@ limitations under the License.
 // translates to a tuple-shaped XLA Literal, whose component subshapes
 // are a 2x3 F32-shaped literal followed by two tuple-shaped literals.
 //
-// The Python objects corresponding to C++ Shapes have the type:
+// Shapes output by C++ become Python objects with the type:
 //
 //   T            = (dtype, S)
 //   S            = DIMENSIONS | TUPLE_SHAPES
@@ -353,15 +354,31 @@ tensorflow::ImportNumpy();
 // Shape
 
 %typemap(in) const Shape& (Shape temp) {
-  Status shape_status = numpy::CheckPyShapeInfo($input);
-  if (!shape_status.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, shape_status.ToString().c_str());
+  StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+  if (!statusor.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
     return NULL;
   }
-  temp = numpy::XlaShapeFromPyShapeInfo($input);
+  temp = std::move(statusor).ValueOrDie();
   $1 = &temp;
 }
 
+%typemap(in) const tensorflow::gtl::optional<Shape>& (
+    tensorflow::gtl::optional<Shape> temp) {
+  if ($input == Py_None) {
+    temp = tensorflow::gtl::nullopt;
+    $1 = &temp;
+  } else {
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+      return NULL;
+    }
+    temp = std::move(statusor).ValueOrDie();
+    $1 = &temp;
+  }
+}
+
 %typemap(out) std::unique_ptr<Shape> {
   $result = numpy::PyShapeInfoFromXlaShape(*$1);
 }
@@ -374,14 +391,37 @@ tensorflow::ImportNumpy();
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
     PyObject* o = PySequence_GetItem($input, i);
-    Status shape_status = numpy::CheckPyShapeInfo(o);
-    if (!shape_status.ok()) {
-      PyErr_SetString(PyExc_RuntimeError, shape_status.ToString().c_str());
-      Py_DECREF(o);
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+    Py_DECREF(o);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
       return NULL;
     }
-    temps.push_back(numpy::XlaShapeFromPyShapeInfo(o));
-    Py_DECREF(o);
+    temps.push_back(statusor.ConsumeValueOrDie());
+  }
+  $1 = &temps;
+}
+
+%typemap(in) const std::vector<tensorflow::gtl::optional<Shape> >& (
+    std::vector<tensorflow::gtl::optional<Shape> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    if (o == Py_None) {
+      temps.push_back(tensorflow::gtl::nullopt);
+    } else {
+      StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+      Py_DECREF(o);
+      if (!statusor.ok()) {
+        PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+        return NULL;
+      }
+      temps.push_back(statusor.ConsumeValueOrDie());
+    }
   }
   $1 = &temps;
 }
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index 5c722623e3..3d87480728 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -176,85 +176,107 @@ static string PyObjectCppRepr(PyObject* o) {
   return ExtractStringAndDecref(r);
 }
 
-Status CheckPyShapeInfo(PyObject* o) {
+StatusOr<Shape> XlaShapeFromPyShape(PyObject* o) {
   auto error = [o](const string& prefix) {
     return InvalidArgument("%s; got %s", prefix.c_str(),
                            PyObjectCppRepr(o).c_str());
   };
-  // The object is a tuple (a pair)
-  if (!PyTuple_Check(o)) {
-    return error("Shape record must be a tuple");
-  }
-  if (PyTuple_Size(o) != 2) {
-    return error("Shape record tuple must be of length 2");
-  }
 
-  // It has a first element, which is a numpy dtype object
-  PyObject* first = PyTuple_GetItem(o, 0);
-  if (first == nullptr) {
-    return error("Tuple has no item 0 (shape dtype)");
-  }
-  if (first->ob_type != &PyArrayDescr_Type) {
-    return error(
-        "Shape record does not have a numpy dtype as its first element");
-  }
-  const int np_type = NumpyTypenum(first);
-  if (!NumpyTypeIsValid(np_type)) {
-    return error("Shape record has an invalid integer dtype");
-  }
+  auto get_attr = [o, &error](const string& field) -> StatusOr<PyObject*> {
+    PyObject* result =
+        PyObject_GetAttrString(o, const_cast<char*>(field.c_str()));
+    if (result == nullptr) {
+      return error(tensorflow::strings::StrCat(
+          "Failed to get attribute of Shape object:", field));
+    }
+    return result;
+  };
 
-  // It has a second element, which is a tuple, either of shape
-  // records or of Python ints
-  PyObject* second = PyTuple_GetItem(o, 1);
-  if (!second) {
-    return error("Tuple has no item 0 (shape dimensions)");
-  }
-  if (!PyTuple_Check(second)) {
-    return error("Shape record does not have a tuple as its second element");
-  }
-  const int length = PyTuple_Size(second);
-  const PrimitiveType element_type = NumpyTypeToPrimitiveType(np_type);
-  for (int i = 0; i < length; i++) {
-    PyObject* dimension = PyTuple_GetItem(second, i);
-    if (element_type == TUPLE) {
-      VLOG(3) << "element_type is tuple, checking member: " << i;
-      Status result = CheckPyShapeInfo(dimension);
-      if (!result.ok()) {
-        return AddStatus(
-            result, tensorflow::strings::StrCat("Validating tuple member ", i,
-                                                " of ", PyObjectCppRepr(o)));
-      }
-    } else if (!CheckPyIntOrLong(dimension)) {
-      return error("Non-tuple shape record has a non-integer dimension");
+  auto call_method = [o, &error](const string& method) -> StatusOr<PyObject*> {
+    PyObject* result =
+        PyObject_CallMethod(o, const_cast<char*>(method.c_str()), nullptr);
+    if (result == nullptr) {
+      return error(tensorflow::strings::StrCat(
+          "Failed to call method of shape object:", method));
     }
-  }
+    return result;
+  };
 
-  return Status::OK();
-}
+  PyObject* np_type;
+  TF_ASSIGN_OR_RETURN(np_type, get_attr("np_dtype"));
+  if (np_type->ob_type != &PyArrayDescr_Type) {
+    return error("Shape attribute np_dtype is not an integer numpy dtype");
+  }
+  if (!NumpyTypeIsValid(NumpyTypenum(np_type))) {
+    return error("Shape attribute np_dtype is not a valid integer numpy dtype");
+  }
+  const PrimitiveType element_type =
+      NumpyTypeToPrimitiveType(NumpyTypenum(np_type));
+  Py_DECREF(np_type);
 
-// Precondition: CheckPyShapeInfo(o)
-Shape XlaShapeFromPyShapeInfo(PyObject* o) {
-  const int np_type = NumpyTypenum(PyTuple_GetItem(o, 0));
-  const PrimitiveType element_type = NumpyTypeToPrimitiveType(np_type);
-  PyObject* py_dimensions = PyTuple_GetItem(o, 1);
-  const int length = PyTuple_Size(py_dimensions);
   if (element_type == TUPLE) {
+    PyObject* py_subshapes;
+    TF_ASSIGN_OR_RETURN(py_subshapes, call_method("tuple_shapes"));
+    if (!PyTuple_Check(py_subshapes)) {
+      return error(
+          "Return value of Shape method tuple_shapes() is not a tuple");
+    }
+    const int length = PyTuple_Size(py_subshapes);
     std::vector<Shape> subshapes;
     subshapes.reserve(length);
     for (int i = 0; i < length; i++) {
-      subshapes.push_back(
-          XlaShapeFromPyShapeInfo(PyTuple_GetItem(py_dimensions, i)));
+      TF_ASSIGN_OR_RETURN(
+          const Shape& subshape,
+          XlaShapeFromPyShape(PyTuple_GetItem(py_subshapes, i)));
+      subshapes.push_back(subshape);
     }
+    Py_DECREF(py_subshapes);
     return ShapeUtil::MakeTupleShape(subshapes);
   } else {
+    PyObject* py_dimensions;
+    PyObject* py_minor_to_major;
+    TF_ASSIGN_OR_RETURN(py_dimensions, call_method("dimensions"));
+    TF_ASSIGN_OR_RETURN(py_minor_to_major, call_method("minor_to_major"));
+    if (!PyTuple_Check(py_dimensions)) {
+      return error("Return value of Shape method dimensions() is not a tuple");
+    }
+    if (py_minor_to_major != Py_None && !PyTuple_Check(py_minor_to_major)) {
+      return error(
+          "Return value of Shape method minor_to_major() is neither a tuple "
+          "nor None");
+    }
+    const int length = PyTuple_Size(py_dimensions);
+    if (py_minor_to_major != Py_None &&
+        length != PyTuple_Size(py_minor_to_major)) {
+      return error(
+          "Shape methods dimensions() and minor_to_major() return "
+          "different-length tuples");
+    }
     std::vector<int64> dimensions(length);
+    std::vector<int64> minor_to_major(length);
     for (int i = 0; i < length; i++) {
       dimensions[i] = PyIntOrPyLongToLong(PyTuple_GetItem(py_dimensions, i));
-      if (dimensions[i] == -1) {
-        CHECK(!PyErr_Occurred());
+      if (dimensions[i] == -1 && PyErr_Occurred()) {
+        return error("Dimension is not an int");
       }
+
+      if (py_minor_to_major != Py_None) {
+        minor_to_major[i] =
+            PyIntOrPyLongToLong(PyTuple_GetItem(py_minor_to_major, i));
+        if (minor_to_major[i] == -1 && PyErr_Occurred()) {
+          return error("Minor-to-major value is not an int");
+        }
+      }
+    }
+    bool with_layout = py_minor_to_major != Py_None;
+    Py_DECREF(py_dimensions);
+    Py_DECREF(py_minor_to_major);
+    if (with_layout) {
+      return ShapeUtil::MakeShapeWithLayout(element_type, dimensions,
+                                            minor_to_major);
+    } else {
+      return ShapeUtil::MakeShape(element_type, dimensions);
     }
-    return ShapeUtil::MakeShape(element_type, dimensions);
   }
 }
 
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h
index 6ff1c34cfc..adfcc3b858 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.h
+++ b/tensorflow/compiler/xla/python/numpy_bridge.h
@@ -56,15 +56,11 @@ bool NumpyTypeIsValid(int np_type);
 // The return value is a new reference.
 PyObject* PyShapeInfoFromXlaShape(const Shape& shape);
 
-// Returns the outcome of a best-effort check that the Python object
-// is a pair of the form (numpy dtype, dimensions), as produced by
-// PyShapeInfoFromXlaShape.
-Status CheckPyShapeInfo(PyObject* o);
-
-// Performs the inverse conversion to that of PyShapeInfoFromXlaShape.
+// Converts a Python object with a method interface mathing that of
+// xla_client.Shape into an XLA Shape object.
 //
 // The return value is a new reference.
-Shape XlaShapeFromPyShapeInfo(PyObject* o);
+StatusOr<Shape> XlaShapeFromPyShape(PyObject* o);
 
 // Converts a PyObject that represents operation metadata into protocol buffer
 // form.
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 7ee5febc09..bb42e8d703 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -156,9 +156,14 @@ class LocalBuffer(object):
     self._delete = c_api.DeleteLocalShapedBuffer
 
   @staticmethod
-  def from_py(npval):
+  def from_py(npval, layout_fn=None):
     npval = require_numpy_array_layout(npval)
-    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval))
+    if layout_fn:
+      shape = Shape.from_numpy(npval)
+      shape = shape.map_leaves(layout_fn)
+    else:
+      shape = None
+    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval, shape))
 
   def to_py(self):
     return self.c_local_shaped_buffer.ToLiteral()
@@ -183,13 +188,17 @@ class Shape(object):
   represents an XLA tuple.
   """
 
-  def __init__(self, np_dtype, dimensions):
+  def __init__(self, np_dtype, dimensions, minor_to_major=None):
+    assert isinstance(dimensions, tuple)
     self.np_dtype = np_dtype
     self._dimensions = dimensions
+    self._minor_to_major = minor_to_major
+    self._check_minor_to_major()
 
   def __repr__(self):
-    return 'xla_client.Shape(np_dtype={!r}, dimensions={!r})'.format(
-        self.np_dtype, self._dimensions)
+    return ('xla_client.Shape(np_dtype={!r}, dimensions={!r}, '
+            'minor_to_major={!r})').format(self.np_dtype, self._dimensions,
+                                           self._minor_to_major)
 
   def element_type(self):
     return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)]
@@ -202,11 +211,49 @@ class Shape(object):
       raise ValueError('Tuple shape has no dimensions')
     return self._dimensions
 
+  def minor_to_major(self):
+    return self._minor_to_major
+
   def tuple_shapes(self):
     if not self.is_tuple():
       raise ValueError('Shape is not a tuple shape')
     return self._dimensions
 
+  def rank(self):
+    return len(self.dimensions())
+
+  def map_leaves(self, f):
+    """Map f over each leaf-level array subshape.
+
+    Args:
+      f: The function to apply. Whenever f returns None, the identity is
+        applied instead.
+
+    Returns:
+      A new Shape with the mapped leaves.
+    """
+    if self.is_tuple():
+      children = tuple(child.map_leaves(f) for child in self.tuple_shapes())
+      return Shape(np.dtype('O'), children)
+    else:
+      mapped = f(self)
+      return self if mapped is None else mapped
+
+  def _check_minor_to_major(self):
+    mtm = self._minor_to_major
+    if self.is_tuple():
+      assert mtm is None, self
+    if mtm is not None:
+      assert self.rank() == len(mtm), self
+      assert sorted(mtm) == range(len(mtm)), self
+
+  def update_minor_to_major(self, minor_to_major):
+    if not isinstance(minor_to_major, tuple):
+      raise TypeError('minor_to_major must be a tuple')
+    updated = Shape(self.np_dtype, tuple(self.dimensions()), minor_to_major)
+    updated._check_minor_to_major()  # pylint: disable=protected-access
+    return updated
+
   @staticmethod
   def from_numpy(npval):
 
@@ -223,23 +270,10 @@ def _wrap_shape(shape_info):
   dtype, dims = shape_info
   element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)]
   if element_type == xla_data_pb2.TUPLE:
-    dims = [_wrap_shape(subshape_info) for subshape_info in dims]
+    dims = tuple(_wrap_shape(subshape_info) for subshape_info in dims)
   return Shape(dtype, dims)
 
 
-def _unwrap_shape(shape):
-  if shape.is_tuple():
-    components = tuple(
-        _unwrap_shape(subshape) for subshape in shape.tuple_shapes())
-  else:
-    components = shape.dimensions()
-  return (shape.np_dtype, components)
-
-
-def _unwrap_shapes(shapes):
-  return [_unwrap_shape(shape) for shape in shapes]
-
-
 def _wrap_data_handle(handle):
   cdh = xla_data_pb2.ComputationDataHandle()
   cdh.handle = handle
@@ -303,8 +337,7 @@ def transfer_from_outfeed(shape, replica_number=None):
   Returns:
     The literal value that is produced from the outfeed queue.
   """
-  return c_api.TransferFromOutfeedLocalReplica(
-      _unwrap_shape(shape), replica_number or 0)
+  return c_api.TransferFromOutfeedLocalReplica(shape, replica_number or 0)
 
 
 class LocalComputation(object):
@@ -325,24 +358,39 @@ class LocalComputation(object):
     else:
       self._delete = c_api.DeleteLocalComputation
 
-  def Compile(self, argument_shapes=(), compile_options=None):
+  def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None):
     if self.is_compiled:
       raise ValueError('Attempt to compile a compiled local XLA computation.')
+    if layout_fn:
+      argument_shapes = [
+          shape.map_leaves(layout_fn) for shape in argument_shapes
+      ]
     return LocalComputation(
-        self.c_local_computation.Compile(
-            _unwrap_shapes(argument_shapes), compile_options),
+        self.c_local_computation.Compile(argument_shapes, compile_options),
         is_compiled=True)
 
-  def CompileWithExampleArguments(self, arguments=(), compile_options=None):
+  def CompileWithExampleArguments(self,
+                                  arguments=(),
+                                  compile_options=None,
+                                  layout_fn=None):
     return self.Compile(
         argument_shapes=[Shape.from_numpy(arg) for arg in arguments],
-        compile_options=compile_options)
+        compile_options=compile_options,
+        layout_fn=layout_fn)
 
-  def Execute(self, arguments=()):
+  def Execute(self, arguments=(), layout_fn=None):
+    """Execute with Python values as arguments and return value."""
     if not self.is_compiled:
       raise ValueError('Cannot execute an uncompiled local XLA computation.')
+    argument_shapes = [Shape.from_numpy(arg) for arg in arguments]
+    if layout_fn:
+      argument_shapes = [
+          shape.map_leaves(layout_fn) for shape in argument_shapes
+      ]
+    else:
+      argument_shapes = [None for shape in argument_shapes]
     arguments = tuple(map(require_numpy_array_layout, arguments))
-    return self.c_local_computation.Execute(arguments)
+    return self.c_local_computation.Execute(arguments, argument_shapes)
 
   def ExecuteWithLocalBuffers(self, arguments=()):
     """Execute with LocalBuffer arguments and return value."""
@@ -398,7 +446,7 @@ class ComputationBuilder(object):
     Returns:
       A  ComputationDataHandle message.
     """
-    return _wrap_data_handle(self._client.Infeed(_unwrap_shape(shape)))
+    return _wrap_data_handle(self._client.Infeed(shape))
 
   def Outfeed(self, operand):
     """Enqueues an outfeed op onto the computation.
@@ -407,7 +455,7 @@ class ComputationBuilder(object):
     outfeed queue for subsequent dequeue via the client API.
     """
     self._client.Outfeed(
-        _unwrap_data_handle(operand), _unwrap_shape(self.GetShape(operand)),
+        _unwrap_data_handle(operand), self.GetShape(operand),
         ''.encode('utf-8'))
 
   def Constant(self, value):
@@ -498,8 +546,7 @@ class ComputationBuilder(object):
       parameter_num = next(self._parameter_numbering)
 
     return _wrap_data_handle(
-        self._client.Parameter(
-            parameter_num, _unwrap_shape(shape), name.encode('utf8')))
+        self._client.Parameter(parameter_num, shape, name.encode('utf8')))
 
   def ParameterFromNumpy(self, value, name=None, parameter_num=None):
     """Enqueues a Parameter op onto the computation.
@@ -846,8 +893,7 @@ class ComputationBuilder(object):
     shape = Shape(self.GetShape(mu).np_dtype, dims)
     return _wrap_data_handle(
         self._client.RngNormal(
-            _unwrap_data_handle(mu), _unwrap_data_handle(sigma),
-            _unwrap_shape(shape)))
+            _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape))
 
   def RngUniform(self, a, b, dims):
     """Enqueues an RngUniform operation onto the computation.
@@ -867,8 +913,7 @@ class ComputationBuilder(object):
     shape = Shape(self.GetShape(a).np_dtype, dims)
     return _wrap_data_handle(
         self._client.RngUniform(
-            _unwrap_data_handle(a), _unwrap_data_handle(b),
-            _unwrap_shape(shape)))
+            _unwrap_data_handle(a), _unwrap_data_handle(b), shape))
 
   def While(self, cond, body, init):
     """Enqueues a While operation onto the computation.
-- 
GitLab


From 5477012060749cd1164dc85ff75fc31ae1351482 Mon Sep 17 00:00:00 2001
From: Anjum Sayed <anjum.sayed48@gmail.com>
Date: Thu, 1 Feb 2018 06:22:54 +0300
Subject: [PATCH 493/966] Change RELEASE.md to specify CUDA 9.0 (#16486)

PR for https://github.com/tensorflow/tensorflow/issues/16348 (tinyest PR ever?)
---
 RELEASE.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index d3b4037061..af6440acef 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,7 +1,9 @@
 # Release 1.5.0
 
 ## Breaking Changes
-* Prebuilt binaries are now built against CUDA 9 and cuDNN 7.
+* Prebuilt binaries are now built against CUDA 9.0 and cuDNN 7.
+* Our Linux binaries are built using ubuntu 16 containers, potentially
+  introducing glibc incompatibility issues with ubuntu 14.
 * Starting from 1.6 release, our prebuilt binaries will use AVX instructions.
   This may break TF on older CPUs.
 
@@ -31,7 +33,7 @@
   preview version is now available.
 * [TensorFlow Lite](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/lite)
   dev preview is now available.
-* CUDA 9 and cuDNN 7 support.
+* CUDA 9.0 and cuDNN 7 support.
 * Accelerated Linear Algebra (XLA):
   * Add `complex64` support to XLA compiler.
   * `bfloat` support is now added to XLA infrastructure.
-- 
GitLab


From 17c0e1118b4f063d9bbfb3faf24befc69bc5f6c4 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 31 Jan 2018 19:28:01 -0800
Subject: [PATCH 494/966] Update docs for tf.matching_files to mention
 non-deterministic (#16633)

This fix tries to close the issue of 15374 by updating the
docs of `tf.matching_files` (as well as `Dataset.list_files`
and `train.match_filenames_once`), to mention that the file names
returned could be non-deterministic.

This fix fixes 15374.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt | 1 +
 tensorflow/python/data/ops/dataset_ops.py                    | 2 ++
 tensorflow/python/training/input.py                          | 2 ++
 3 files changed, 5 insertions(+)

diff --git a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
index 8da76684e5..97fd39f647 100644
--- a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
@@ -16,5 +16,6 @@ END
   description: <<END
 Note that this routine only supports wildcard characters in the
 basename portion of the pattern, not in the directory portion.
+Note also that the order of filenames returned can be non-deterministic.
 END
 }
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index c1ba67e474..f41e807b4c 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -556,6 +556,8 @@ class Dataset(object):
         - /path/to/dir/b.py
         - /path/to/dir/c.py
 
+    NOTE: The order of the file names returned can be non-deterministic.
+
     Args:
       file_pattern: A string or scalar string `tf.Tensor`, representing
         the filename pattern that will be matched.
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 331a51e8bc..7160420a33 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -56,6 +56,8 @@ _restore_sparse = sparse_ops._take_many_sparse_from_tensors_map
 def match_filenames_once(pattern, name=None):
   """Save the list of files matching pattern, so it is only computed once.
 
+  NOTE: The order of the files returned can be non-deterministic.
+
   Args:
     pattern: A file pattern (glob), or 1D tensor of file patterns.
     name: A name for the operations (optional).
-- 
GitLab


From fdf44dbc2a8ae97624134484485558e246ab62e8 Mon Sep 17 00:00:00 2001
From: Ben <bstriner@users.noreply.github.com>
Date: Wed, 31 Jan 2018 22:28:25 -0500
Subject: [PATCH 495/966] By default, only download inception if it doesn't
 exist already (#16577)

* By default, only download inception if it doesn't exist already

* fix filename
---
 .../eval/python/classifier_metrics_impl.py    | 28 +++++++++++++------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index 986a5ff6dc..7bede4e240 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -28,6 +28,7 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
+import os
 import sys
 import tarfile
 
@@ -189,20 +190,31 @@ def get_graph_def_from_resource(filename):
   return graph_pb2.GraphDef.FromString(resource_loader.load_resource(filename))
 
 
-def get_graph_def_from_url_tarball(url, filename):
-  """Get a GraphDef proto from a tarball on the web."""
-  def _progress(count, block_size, total_size):
-    sys.stdout.write('\r>> Downloading %s %.1f%%' % (
-        url, float(count * block_size) / float(total_size) * 100.0))
-    sys.stdout.flush()
-  tar_filename, _ = urllib.request.urlretrieve(url, reporthook=_progress)
+def get_graph_def_from_url_tarball(url, filename, tar_filename=None):
+  """Get a GraphDef proto from a tarball on the web.
+
+  Args:
+    url: Web address of tarball
+    filename: Filename of graph definition within tarball
+    tar_filename: Temporary download filename (None = always download)
+
+  Returns:
+    A GraphDef loaded from a file in the downloaded tarball.
+  """
+  if not (tar_filename and os.path.exists(tar_filename)):
+    def _progress(count, block_size, total_size):
+      sys.stdout.write('\r>> Downloading %s %.1f%%' % (
+          url, float(count * block_size) / float(total_size) * 100.0))
+      sys.stdout.flush()
+    tar_filename, _ = urllib.request.urlretrieve(url, filename=tar_filename, reporthook=_progress)
   with tarfile.open(tar_filename, 'r:gz') as tar:
     proto_str = tar.extractfile(filename).read()
   return graph_pb2.GraphDef.FromString(proto_str)
 
 
 def _default_graph_def_fn():
-  return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH)
+  return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH,
+                                        os.path.basename(INCEPTION_URL))
 
 
 def run_inception(images,
-- 
GitLab


From df62a679e960df935928c0a602851dfb1e867480 Mon Sep 17 00:00:00 2001
From: Andrew Harp <andrewharp@users.noreply.github.com>
Date: Wed, 31 Jan 2018 23:42:56 -0500
Subject: [PATCH 496/966] Re-add missing argument specifier (#16632)

The ":" was erroneously removed in 76f70f5d62f35b5cc95121e6dfffa63a8214b626
---
 tensorflow/contrib/makefile/build_all_android.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index 281c4653c6..f67c516186 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -37,7 +37,7 @@ fi
 
 ARCH=armeabi-v7a
 
-while getopts "Es:t:Tx:a" opt_name; do
+while getopts "Es:t:Tx:a:" opt_name; do
   case "$opt_name" in
     E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";;
     s) SUB_MAKEFILES="${OPTARG}";;
-- 
GitLab


From 9e7ce91845500e5111e0400766983e69701a1733 Mon Sep 17 00:00:00 2001
From: Andrew Harp <andrewharp@users.noreply.github.com>
Date: Wed, 31 Jan 2018 23:43:18 -0500
Subject: [PATCH 497/966] Support multiple build types in Android build.gradle
 with the makefile build (#16640)

* updating CUDA srcs for Makefile build to fix unsatisfied link error

* more makefile refactoring

* Fixing Tegra build logic for Android

* reverting ndk dir

* set ccache back to default
---
 tensorflow/contrib/makefile/Makefile                 | 12 ++++++------
 .../makefile/sub_makefiles/android/Makefile.in       |  2 +-
 tensorflow/examples/android/build.gradle             |  9 ++++++---
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index c573cf15da..81327407d4 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -407,7 +407,7 @@ $(MARCH_OPTION) \
 -I$(JETPACK)/cuda/extras/CUPTI/include
 
 
-		LIBS += \
+		CUDA_LIBS := \
 -ltfcuda \
 -lcudart_static \
 -lcudnn \
@@ -420,10 +420,10 @@ $(MARCH_OPTION) \
 -lculibos \
 -lcurand_static
 
-		OBJDIR := $(OBJDIR)Tegra/
-		LIBDIR := $(LIBDIR)Tegra/
-		BINDIR := $(BINDIR)Tegra/
-		DEPDIR := $(DEPDIR)Tegra/
+		OBJDIR := $(OBJDIR)android_arm64-v8a/
+		LIBDIR := $(LIBDIR)android_arm64-v8a/
+		BINDIR := $(BINDIR)android_arm64-v8a/
+		DEPDIR := $(DEPDIR)android_arm64-v8a/
 
 		TEGRA_LIBS := \
 -L$(JETPACK)/cuda/targets/aarch64-linux-androideabi/lib \
@@ -729,7 +729,7 @@ $(BENCHMARK_NAME): $(BENCHMARK_OBJS) $(LIB_PATH) $(CUDA_LIB_DEPS)
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) \
 	-o $(BENCHMARK_NAME) $(BENCHMARK_OBJS) \
-	$(LIBFLAGS) $(TEGRA_LIBS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
+	$(LIBFLAGS) $(TEGRA_LIBS) $(LIB_PATH) $(LDFLAGS) $(LIBS) $(CUDA_LIBS)
 
 # NVCC compilation rules for Tegra
 ifeq ($(BUILD_FOR_TEGRA),1)
diff --git a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
index d9277ed60c..3081084ee7 100644
--- a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
+++ b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
@@ -54,7 +54,7 @@ $(INFERENCE_SO_PATH): $(LIB_OBJS) $(INFERENCE_OBJS) $(CUDA_LIB_DEPS)
 	-o $@ $(INFERENCE_OBJS) $(LIB_OBJS) $(TEGRA_LIBS) \
 	$(LIBFLAGS) $(LDFLAGS) \
 	-shared -Wl,-soname,$(INFERENCE_SO_NAME) \
-	$(LIBS)
+	$(LIBS) $(CUDA_LIBS)
 
 $(INFERENCE_SO_NAME): $(INFERENCE_SO_PATH)
 
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index f7bdf8b816..0767726aa9 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -56,10 +56,12 @@ def nativeOutDir = 'libs/' + cpuType
 def nativeBuildRule = 'buildNativeBazel'
 def demoLibPath = '../../../bazel-bin/tensorflow/examples/android/libtensorflow_demo.so'
 def inferenceLibPath = '../../../bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so'
+
+// Override for Makefile builds.
 if (nativeBuildSystem == 'makefile') {
     nativeBuildRule = 'buildNativeMake'
-    demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_demo.so'
-    inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_inference.so'
+    demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_demo.so'
+    inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_inference.so'
 }
 
 // If building with Bazel, this is the location of the bazel binary.
@@ -154,7 +156,8 @@ task buildNativeMake(type: Exec) {
          '-s',  \
          'tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in',  \
          '-t',  \
-         'libtensorflow_inference.so libtensorflow_demo.so'  \
+         'libtensorflow_inference.so libtensorflow_demo.so all'  \
+         , '-a', cpuType  \
          //, '-T'  // Uncomment to skip protobuf and speed up subsequent builds.
 }
 
-- 
GitLab


From 9884a21214a88a2da847b9ea66533108e860067d Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Wed, 31 Jan 2018 22:35:45 -0800
Subject: [PATCH 498/966] Add/fix copyright and fix format for the test script
 and other files.

---
 .../contrib/tensorrt/test/test_tftrt.py       | 101 ++++++++++--------
 tensorflow/contrib/tensorrt/trt_conversion.i  |   2 +-
 third_party/tensorrt/LICENSE                  |   2 +-
 3 files changed, 61 insertions(+), 44 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 06b6f64c4a..32f839e624 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -1,53 +1,70 @@
-#  Script to test TF-TensorRT integration
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TensorRT integration."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
 import tensorflow as tf
 import tensorflow.contrib.tensorrt as trt
 import numpy as np
 
+
 def getSimpleGraphDef():
-    ''' 
-    Create a simple graph and return its graph_def
-    '''
-    g=tf.Graph()
-    with g.as_default():
-        A=tf.placeholder(dtype=tf.float32,shape=(None,24,24,2),name="input")
-        e=tf.constant([ [[[ 1., 0.5, 4., 6., 0.5, 1. ],
-                        [ 1., 0.5, 1., 1., 0.5, 1. ]]] ],
-                      name="weights",dtype=tf.float32)
-        conv=tf.nn.conv2d(input=A,filter=e,strides=[1,2,2,1],padding="SAME",name="conv")
-        b=tf.constant([  4., 1.5, 2., 3., 5., 7. ],
-                      name="bias",dtype=tf.float32)
-        t=tf.nn.bias_add(conv,b,name="biasAdd")
-        relu=tf.nn.relu(t,"relu")
-        idty=tf.identity(relu,"ID")
-        v=tf.nn.max_pool(idty,[1,2,2,1],[1,2,2,1],"VALID",name="max_pool")
-        out = tf.squeeze(v,name="output")
-    return g.as_graph_def()
-
-def runGraph(gdef,dumm_inp):
-    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.50)
-    tf.reset_default_graph()
-    g=tf.Graph()
-    with g.as_default():
-        inp,out=tf.import_graph_def(graph_def=gdef,
-                            return_elements=["input","output"])
-        inp=inp.outputs[0]
-        out=out.outputs[0]
-    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options),
-                    graph=g) as sess:
-        val=sess.run(out,{inp:dumm_inp})
-    return val
-if "__main__" in  __name__:
-    inpDims=(100,24,24,2)
-    dummy_input=np.random.random_sample(inpDims)
-    gdef=getSimpleGraphDef() #get graphdef
-    trt_graph=trt.CreateInferenceGraph(gdef,["output"],inpDims[0]) # get optimized graph
-    o1=runGraph(gdef,dummy_input)
-    o2=runGraph(trt_graph,dummy_input)
-    assert(np.array_equal(o1,o2))
-    
+  """Create a simple graph and return its graph_def"""
+  g = tf.Graph()
+  with g.as_default():
+    A = tf.placeholder(dtype=tf.float32, shape=(None, 24, 24, 2), name="input")
+    e = tf.constant(
+        [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
+        name="weights",
+        dtype=tf.float32)
+    conv = tf.nn.conv2d(
+        input=A, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
+    b = tf.constant([4., 1.5, 2., 3., 5., 7.], name="bias", dtype=tf.float32)
+    t = tf.nn.bias_add(conv, b, name="biasAdd")
+    relu = tf.nn.relu(t, "relu")
+    idty = tf.identity(relu, "ID")
+    v = tf.nn.max_pool(
+        idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+    out = tf.squeeze(v, name="output")
+  return g.as_graph_def()
+
+
+def runGraph(gdef, dumm_inp):
+  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.50)
+  tf.reset_default_graph()
+  g = tf.Graph()
+  with g.as_default():
+    inp, out = tf.import_graph_def(
+        graph_def=gdef, return_elements=["input", "output"])
+    inp = inp.outputs[0]
+    out = out.outputs[0]
+  with tf.Session(
+      config=tf.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
+    val = sess.run(out, {inp: dumm_inp})
+  return val
+
+
+if "__main__" in __name__:
+  inpDims = (100, 24, 24, 2)
+  dummy_input = np.random.random_sample(inpDims)
+  gdef = getSimpleGraphDef()
+  trt_graph = trt.CreateInferenceGraph(gdef, ["output"],
+                                       inpDims[0])  # Get optimized graph
+  o1 = runGraph(gdef, dummy_input)
+  o2 = runGraph(trt_graph, dummy_input)
+  assert (np.array_equal(o1, o2))
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index 828b4b35c2..f085380054 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/third_party/tensorrt/LICENSE b/third_party/tensorrt/LICENSE
index 96c0e3bcaf..146d9b765c 100644
--- a/third_party/tensorrt/LICENSE
+++ b/third_party/tensorrt/LICENSE
@@ -188,7 +188,7 @@ Copyright 2018 The TensorFlow Authors.  All rights reserved.
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright 2015, The TensorFlow Authors.
+   Copyright 2018, The TensorFlow Authors.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
-- 
GitLab


From b06326cadc7b45078291e8fb3da0b7e941978540 Mon Sep 17 00:00:00 2001
From: Mahmoud Abuzaina <mahmoud.abuzaina@intel.com>
Date: Thu, 1 Feb 2018 00:37:47 -0800
Subject: [PATCH 499/966] Pooling and AddN fixes (#16607)

---
 tensorflow/core/kernels/mkl_aggregate_ops.cc |  7 +++++--
 tensorflow/core/kernels/mkl_avgpooling_op.cc | 22 ++++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 49c34fed02..ef724f0a29 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -317,8 +317,11 @@ class MklAddNOp : public OpKernel {
                                                 : src2_tensor.dims();
       // if the shapes of two tensors are not same raise op error
       TensorShape src1_shape, src2_shape;
-      src1_shape = src1_tensor.shape();
-      src2_shape = src2_tensor.shape();
+      src1_shape = input1_in_mkl_format ? src1_mkl_shape.GetTfShape()
+                  : src1_tensor.shape();
+      src2_shape = input2_in_mkl_format ? src2_mkl_shape.GetTfShape()
+                  : src2_tensor.shape();
+
       if (!src1_shape.IsSameSize(src2_shape)) {
         ctx->SetStatus(errors::InvalidArgument(
             "Inputs to operation ", this->name(), " of type ",
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index ebaa0f4e2a..cff1bd18a7 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -468,6 +468,28 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
       memory::dims output_dims_mkl_order;
       this->GetOutputDims(pool_params, &output_dims_mkl_order);
 
+      // If input is an empty tensor, allocate an empty output tensor and return
+      if (input_tensor.NumElements() == 0) {
+        MklDnnShape output_mkl_shape;
+        output_mkl_shape.SetMklTensor(false);
+        TensorShape output_tf_shape;
+        if (pool_params.data_format == TensorFormat::FORMAT_NCHW) {
+          output_tf_shape = MklDnnDimsToTFShape(output_dims_mkl_order);
+        } else {
+          memory::dims output_dims_NHWC_order;
+          output_dims_NHWC_order = {pool_params.tensor_in_batch,
+                                    static_cast<int>(pool_params.out_height),
+                                    static_cast<int>(pool_params.out_width),
+                                    pool_params.out_depth};
+          output_tf_shape = MklDnnDimsToTFShape(output_dims_NHWC_order);
+        }
+        const int kOutputIndex = 0;
+        AllocateOutputSetMklShape(context, kOutputIndex, &output_tensor,
+                                    output_tf_shape, output_mkl_shape);
+        CHECK_NOTNULL(output_tensor);
+        return;
+      }
+
       // If input is in Mkl layout, then just get the memory format from it
       // directly, instead of using input data_format to AvgPool.
       if (dnn_shape_input.IsMklTensor()) {
-- 
GitLab


From 3c2e6f883e87df550521b11edbd837a2479780fc Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Wed, 31 Jan 2018 17:46:43 -0800
Subject: [PATCH 500/966] Add a doc explaining how to convert an `Estimator` to
 run on a Cloud TPU.

PiperOrigin-RevId: 184075365
---
 .../api_guides/python/TPUEstimator.md         | 396 ++++++++++++++++++
 1 file changed, 396 insertions(+)
 create mode 100644 tensorflow/docs_src/api_guides/python/TPUEstimator.md

diff --git a/tensorflow/docs_src/api_guides/python/TPUEstimator.md b/tensorflow/docs_src/api_guides/python/TPUEstimator.md
new file mode 100644
index 0000000000..d74d7f3181
--- /dev/null
+++ b/tensorflow/docs_src/api_guides/python/TPUEstimator.md
@@ -0,0 +1,396 @@
+# Using TPUs
+
+This document walks through the principal TensorFlow APIs necessary to make
+effective use of a [Cloud TPU](https://cloud.google.com/tpu/), and highlights
+the differences between regular TensorFlow usage, and usage on a TPU.
+
+This doc is aimed at users who:
+
+* Are familiar with TensorFlow's `Estimator` and `Dataset` APIs
+* Have maybe [tried out a Cloud TPU](https://cloud.google.com/tpu/docs/quickstart)
+  using an existing model.
+* Have, perhaps, skimmed the code of an example TPU model
+  [[1]](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_tpu.py)
+  [[2]](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models).
+* Are interested in porting an existing `Estimator` model to
+  run on Cloud TPUs
+
+## TPUEstimator
+
+@{tf.estimator.Estimator$Estimators} are TensorFlow's model-level abstraction.
+Standard `Estimators` can drive models on CPU and GPUs. You must use
+@{tf.contrib.tpu.TPUEstimator} to drive a model on TPUs.
+
+Refer to TensorFlow's Getting Started section for an introduction to the basics
+of using a @{$get_started/premade_estimators$pre-made `Estimator`}, and
+@{$get_started/custom_estimators$custom `Estimator`s}.
+
+The `TPUEstimator` class differs somewhat from the `Estimator` class.
+
+The simplest way to maintain a model that can be run both on CPU/GPU or on a
+Cloud TPU is to define the model's inference phase (from inputs to predictions)
+outside of the `model_fn`. Then maintain separate implementations of the
+`Estimator` setup and `model_fn`, both wrapping this inference step. For an
+example of this pattern compare the `mnist.py` and `mnist_tpu.py` implementation in
+[tensorflow/models](https://github.com/tensorflow/models/tree/master/official/mnist).
+
+### Running a `TPUEstimator` locally
+
+To create a standard `Estimator` you call the constructor, and pass it a
+`model_fn`, for example:
+
+```
+my_estimator = tf.estimator.Estimator(
+  model_fn=my_model_fn)
+```
+
+The changes required to use a @{tf.contrib.tpu.TPUEstimator} on your local
+machine are relatively minor. The constructor requires two additional arguments.
+You should set the `use_tpu` argument to `False`, and pass a
+@{tf.contrib.tpu.RunConfig} as the `config` argument, as shown below:
+
+``` python
+my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
+    model_fn=my_model_fn,
+    config=tf.contrib.tpu.RunConfig()
+    use_tpu=False)
+```
+
+Just this simple change will allow you to run a `TPUEstimator` locally.
+The majority of example TPU models can be run in this local mode,
+by setting the command line flags as follows:
+
+
+```
+$> python mnist_tpu.py --use_tpu=false --master=''
+```
+
+Note: This `use_tpu=False` argument is useful for trying out the `TPUEstimator`
+API. It is not meant to be a complete TPU compatibility test. Successfully
+running a model locally in a `TPUEstimator` does not guarantee that it will
+work on a TPU.
+
+
+### Building a `tpu.RunConfig`
+
+While the default `RunConfig` is sufficient  for local training, these settings
+cannot be ignored in real usage.
+
+A more typical setup for a `RunConfig`, that can be switched to use a Cloud
+TPU, might be as follows:
+
+``` python
+import tempfile
+import subprocess
+
+class FLAGS(object):
+  use_tpu=False
+  tpu_name=None
+  # Use a local temporary path for the `model_dir`
+  model_dir = tempfile.mkdtemp()
+  # Number of training steps to run on the Cloud TPU before returning control.
+  iterations = 50
+  # A single Cloud TPU has 8 shards.
+  num_shards = 8
+
+if FLAGS.use_tpu:
+    my_project_name = subprocess.check_output([
+        'gcloud','config','get-value','project'])
+    my_zone = subprocess.check_output([
+        'gcloud','config','get-value','compute/zone'])
+    cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+            tpu_names=[FLAGS.tpu_name],
+            zone=my_zone,
+            project=my_project)
+    master = tpu_cluster_resolver.get_master()
+else:
+    master = ''
+
+my_tpu_run_config = tf.contrib.tpu.RunConfig(
+    master=master,
+    evaluation_master=master,
+    model_dir=FLAGS.model_dir,
+    session_config=tf.ConfigProto(
+        allow_soft_placement=True, log_device_placement=True),
+    tpu_config=tf.contrib.tpu.TPUConfig(FLAGS.iterations,
+                                        FLAGS.num_shards),
+)
+```
+
+Then you must pass the @{tf.contrib.tpu.RunConfig} to the constructor:
+
+``` python
+my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
+    model_fn=my_model_fn,
+    config = my_tpu_run_config,
+    use_tpu=FLAGS.use_tpu)
+```
+
+Typically the `FLAGS` would be set by command line arguments. To switch from
+training locally to training on a cloud TPU you would need to:
+
+  1) Set `FLAGS.use_tpu` to `True`
+  1) Set `FLAGS.tpu_name` so the
+     `tf.contrib.cluster_resolver.TPUClusterResolver` can find it
+  1) Set `FLAGS.model_dir` to a Google Cloud Storage bucket url (`gs://`).
+
+
+## Optimizer
+
+When training on a cloud TPU you **must** wrap the optimizer in a
+@{tf.contrib.tpu.CrossShardOptimizer}, which uses an `allreduce` to aggregate
+gradients and broadcast the result to each shard (each TPU core).
+
+The `CrossShardOptimizer` is not compatible with local training. So, to have
+the same code run both locally and on a Cloud TPU, add lines like the following:
+
+``` python
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
+if FLAGS.use_tpu:
+  optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+```
+
+If you prefer to avoid a global `FLAGS` variable in your model code, one
+approach is to set the optimizer as one of the `Estimator`'s params,
+as follows:
+
+``` python
+my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
+    model_fn=my_model_fn,
+    config = my_tpu_run_config,
+    use_tpu=FLAGS.use_tpu,
+    params={'optimizer':optimizer})
+```
+
+## Model Function
+
+This section details the changes you must make to the model function
+(`model_fn()`) to make it `TPUEstimator` compatible.
+
+### Static shapes
+
+During regular usage TensorFlow attempts to determine the shapes of each
+`tf.Tensor` during graph construction. During execution any unknown shape
+dimensions are determined dynamically,
+see @{$programmers_guide/tensors#shape$Tensor Shapes} for more details.
+
+To run on Cloud TPUs TensorFlow models are compiled using @{$xla$XLA}.
+XLA uses a similar system for determining shapes at compile time. XLA requires
+that all tensor dimensions be statically defined at compile time. All shapes
+must evaluate to a constant, and not depend on external data, or stateful
+operations like variables or a random number generator.
+
+
+### Summaries
+
+Remove any use of `tf.summary` from your model.
+
+@{$summaries_and_tensorboard$TensorBoard summaries} are a great way see inside
+your model. A minimal set of basic summaries are automatically recorded by the
+`TPUEstimator`, to `event` files in the `model_dir`. Custom summaries, however,
+are currently unsupported when training on a Cloud TPU. So while the
+`TPUEstimator` will still run locally with summaries, it will fail if used on a
+TPU.
+
+### Metrics
+
+Build your evaluation metrics dictionary in a stand-alone `metric_fn`.
+
+<!-- TODO(markdaoust) link to programmers_guide/metrics when it exists -->
+
+Evaluation metrics are an essential part of training a model. These are fully
+supported on Cloud TPUs, but with a slightly different syntax.
+
+A standard @{tf.metrics} returns two tensors. The first returns the running
+average of the metric value, while the second updates the running average and
+returns the value for this batch:
+
+```
+running_average, current_batch = tf.metrics.accuracy(labels, predictions)
+```
+
+In a standard `Estimator` you create a dictionary of these pairs, and return it
+as part of the `EstimatorSpec`.
+
+```python
+my_metrics = {'accuracy': tf.metrics.accuracy(labels, predictions)}
+
+return tf.estimator.EstimatorSpec(
+  ...
+  eval_metric_ops=my_metrics
+)
+```
+
+In a `TPUEstimator` you instead pass a function (which returns a metrics
+dictionary) and a list of argument tensors, as shown below:
+
+```python
+def my_metric_fn(labels, predictions):
+   return {'accuracy': tf.metrics.accuracy(labels, predictions)}
+
+return tf.contrib.tpu.TPUEstimatorSpec(
+  ...
+  eval_metrics=(my_metric_fn, [labels, predictions])
+)
+```
+
+### Use `TPUEstimatorSpec`
+
+`TPUEstimatorSpec` do not support hooks, and require function wrappers for
+some fields.
+
+An `Estimator`'s `model_fn` must return an `EstimatorSpec`. An `EstimatorSpec`
+is a simple structure of named fields containing all the `tf.Tensors` of the
+model that the `Estimator` may need to interact with.
+
+`TPUEstimators` use a @{tf.contrib.tpu.TPUEstimatorSpec}. There are a few
+differences between it and a standard @{tf.estimator.EstimatorSpec}:
+
+
+*  The `eval_metric_ops` must be wrapped into a `metrics_fn`, this field is
+   renamed `eval_metrics` ([see above](#metrics)).
+*  The @{tf.train.SessionRunHook$hooks} are unsupported, so these fields are
+   omitted.
+*  The @{tf.train.Scaffold$`scaffold`}, if used, must also be wrapped in a
+   function. This field is renamed to `scaffold_fn`.
+
+`Scaffold` and `Hooks` are for advanced usage, and can typically be omitted.
+
+## Input functions
+
+Input functions work mainly unchanged as they run on the host computer, not the
+Cloud TPU itself. This section explains the two necessary adjustments.
+
+### Params argument
+
+<!-- TODO(markdaoust) link to input_fn doc when it exists -->
+
+The `input_fn` for a standard `Estimator` _can_ include a
+`params` argument; the `input_fn` for a `TPUEstimator` *must* include a
+`params` argument. This is necessary to allow the estimator to set the batch
+size for each replica of the input stream. So the minimum signature for an
+`input_fn` for a `TPUEstimator` is:
+
+```
+def my_input_fn(params):
+  pass
+```
+
+Where `params['batch-size']` will contain the batch size.
+
+### Static shapes and batch size
+
+The input pipeline generated by your `input_fn` is run on CPU. So it is mostly
+free strict static shape requirements imposed by the XLA/TPU environment. The
+one requirement is that the batches of data fed from your input pipeline to
+the TPU have a static shape, as determined by the standard TensorFlow shape
+inference algorithm. Intermediate tensors are free to have a dynamic shapes.
+If shape inference has failed, but the shape is known it is possible to
+impose the correct shape using `tf.set_shape()`. 
+
+In the example below the shape
+inference algorithm fails, but it is corrected using `set_shape`:
+
+```
+>>> x = tf.zeros(tf.constant([1,2,3])+1)
+>>> x.shape
+
+TensorShape([Dimension(None), Dimension(None), Dimension(None)])
+
+>>> x.set_shape([2,3,4])
+```
+
+In many cases the batch size is the only unknown dimension.
+
+A typical input pipeline, using `tf.data`, will usually produce batches of a
+fixed size. The last batch of a finite `Dataset`, however, is typically smaller,
+containing just the remaining elements. Since a `Dataset` does not know its own
+length or finiteness, the standard @{tf.data.Dataset.batch$`batch`} method
+cannot determine if all batches will have a fixed size batch on its own:
+
+```
+>>> params = {'batch_size':32}
+>>> ds = tf.data.Dataset.from_tensors([0, 1, 2])
+>>> ds = ds.repeat().batch(params['batch-size'])
+>>> ds
+
+<BatchDataset shapes: (?, 3), types: tf.int32>
+```
+
+The most straightforward fix is to
+@{tf.data.Dataset.apply$apply} @{tf.contrib.data.batch_and_drop_remainder}
+as follows:
+
+```
+>>> params = {'batch_size':32}
+>>> ds = tf.data.Dataset.from_tensors([0, 1, 2])
+>>> ds = ds.repeat().apply(
+...     tf.contrib.data.batch_and_drop_remainder(params['batch-size']))
+>>> ds
+
+ <_RestructuredDataset shapes: (32, 3), types: tf.int32>
+```
+
+The one downside to this approach is that, as the name implies, this batching
+method throws out any fractional batch at the end of the dataset. This is fine
+for an infinitely repeating dataset being used for training, but could be a
+problem if you want to train for an exact number of epochs.
+
+To do an exact 1-epoch of _evaluation_ you can work around this by manually
+padding the length of the batches, and setting the padding entries to have zero
+weight when creating your `tf.metrics`.
+
+## Datasets
+
+Efficient use of the `tf.data.Dataset` API is critical when using a Cloud
+TPU, as it is impossible to use the Cloud TPU's unless you can feed it data
+quickly enough. See @{$datasets_performance} for details on dataset performance.
+
+For all but the simplest experimentation (using
+@{tf.data.Dataset.from_tensor_slices} or other in-graph data) you will need to
+store all data files read by the `TPUEstimator`'s `Dataset` in Google Cloud
+Storage Buckets.
+
+<!--TODO(markdaoust): link to the `TFRecord` doc when it exists.-->
+
+For most use-cases, we recommend converting your data into `TFRecord`
+format and using a @{tf.data.TFRecordDataset} to read it. This, however, is not
+a hard requirement and you can use other dataset readers
+(`FixedLengthRecordDataset` or `TextLineDataset`) if you prefer.
+
+Small datasets can be loaded entirely into memory using
+@{tf.data.Dataset.cache}.
+
+Regardless of the data format used, it is strongly recommended that you
+@{$performance_guide#use_large_files$use large files}, on the order of
+100MB. This is especially important in this networked setting as the overhead
+of opening a file is significantly higher.
+
+It is also important, regardless of the type of reader used, to enable buffering
+using the `buffer_size` argument to the constructor. This argument is specified
+in bytes. A minimum of a few MB (`buffer_size=8*1024*1024`) is recommended so
+that data is available when needed.
+
+The TPU-demos repo includes
+[a script](https://github.com/tensorflow/tpu-demos/blob/master/cloud_tpu/datasets/imagenet_to_gcs.py)
+for downloading the imagenet dataset and converting it to an appropriate format.
+This together with the imagenet
+[models](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models)
+included in the repo demonstrate all of these best-practices.
+
+
+## What Next
+
+For details on how to actually set up and run a Cloud TPU see:
+
+ * [Google Cloud TPU Documentation](https://cloud.google.com/tpu/docs/)
+
+This document is by no means exhaustive. The best source of more detail on how
+to make a Cloud TPU compatible model are the example models published in:
+
+ * The [TPU Demos Repository.](https://github.com/tensorflow/tpu-demos/)
+
+For more information about tuning TensorFlow code for performance see:
+
+ * The @{$performance$Performance Section.}
+
-- 
GitLab


From 5409f55e104f2c9be60a850f128b256460daccc0 Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Wed, 31 Jan 2018 18:03:25 -0800
Subject: [PATCH 501/966] TFE: Support `IndexedSlices` as inputs and outputs
 for `tfe.defun`.

In particular, this change fixes a bug that arose when attempting to
differentiate a defun-d function whose gradient yielded `IndexedSlices`.

PiperOrigin-RevId: 184077261
---
 tensorflow/python/eager/function.py      | 95 +++++++++++++++++++-----
 tensorflow/python/eager/function_test.py | 72 ++++++++++++++++++
 2 files changed, 150 insertions(+), 17 deletions(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 81b1f6f12a..f5d0759bdc 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -292,6 +292,22 @@ def _map_sequence_obj_to_idx(sequence):
   return {id(x): i for i, x in enumerate(sequence)}
 
 
+def _flatten(sequence):
+  """A wrapper around `nest.flatten` that also unpacks `IndexedSlices`."""
+  # TODO(akshayka): Support `SparseTensor` in a similar fashion.
+  flat_sequence = nest.flatten(sequence)
+  outputs = []
+  for item in flat_sequence:
+    if isinstance(item, ops.IndexedSlices):
+      if item.dense_shape is not None:
+        outputs.extend([item.values, item.indices, item.dense_shape])
+      else:
+        outputs.extend([item.values, item.indices])
+    else:
+      outputs.append(item)
+  return outputs
+
+
 class GraphModeFunction(object):
   """Callable object representing a graph-mode function.
 
@@ -333,14 +349,14 @@ class GraphModeFunction(object):
     self._input_placeholders = input_placeholders
     self._extra_inputs = list(extra_inputs)
     self._graph = graph
-    self._has_backprop = False
+    self._backward_function = None
     self._func_name = name
     self._function_def = defined_function
     self._num_outputs = len(defined_function.signature.output_arg)
     self._ops = operations
     self._func_outputs = func_outputs
     self._returns = [func_outputs] if isinstance(
-        func_outputs, (ops.Tensor, type(None))) else list(func_outputs)
+        func_outputs, (ops.Tensor, type(None))) else _flatten(func_outputs)
     self._output_shapes = output_shapes
     self._variables = variables if variables is not None else []
 
@@ -348,9 +364,8 @@ class GraphModeFunction(object):
   def variables(self):
     return self._variables
 
-  def _compute_backprop(self):
-    """Computes the backprop function object for this function."""
-    self._has_backprop = True
+  def _construct_backprop_function(self):
+    """Constructs the backprop function object for this function."""
     with self._graph.as_default(), context.graph_mode():
       c = _CapturingContext()
       with c:
@@ -361,13 +376,16 @@ class GraphModeFunction(object):
             filtered_outputs,
             self._input_placeholders,
             grad_ys=self._out_grad_placeholders)
-        shapes = tuple(x.shape for x in in_gradients if x is not None)
+
+    backward_outputs = tuple(
+        grad for grad in _flatten(in_gradients) if grad is not None)
+    output_shapes = tuple(grad.shape for grad in backward_outputs)
+
     captures = list(sorted(c.captured_tensors, key=lambda x: x.name))
     forward_name = _forward_name(self._func_name)
     self._forward_fdef = _EagerDefinedFunction(
         forward_name, self._graph, self._ops, self._input_placeholders,
         filtered_outputs + captures)
-    backward_outputs = tuple(x for x in in_gradients if x is not None)
     all_inputs = self._out_grad_placeholders + captures
     # Excluding input ops from the body as we do not intend to execute these
     # operations when the function is executed.
@@ -381,7 +399,7 @@ class GraphModeFunction(object):
     bname = _backward_name(self._func_name)
     self._backward_function = GraphModeFunction(
         bname, all_inputs, [], self._graph, function_def_ops,
-        backward_outputs, in_gradients, shapes)
+        backward_outputs, in_gradients, output_shapes)
 
   def _backprop_call(self, args):
     """Calls the wrapped function and records the result on a tape."""
@@ -426,9 +444,24 @@ class GraphModeFunction(object):
 
   @property
   def output_shapes(self):
+    """The function's output shapes."""
     # TODO(ebrevdo): Should we only keep the output shapes associated
     # with len(self._returns) outputs?
-    return nest.pack_sequence_as(self._func_outputs, self._output_shapes)
+    outputs_list = nest.flatten(self._func_outputs)
+    j = 0
+    for i, o in enumerate(outputs_list):
+      if o is not None:
+        if isinstance(o, ops.IndexedSlices):
+          # Extract the shape of the `IndexedSlices` object's `values` field.
+          outputs_list[i] = self._output_shapes[j]  # the `values` shape
+          if o.dense_shape is not None:
+            j += 3  # skip over shapes for `values`, `indices`, `dense_shape`
+          else:
+            j += 2  # skip over shapes for `values`, `indices`
+        else:
+          outputs_list[i] = self._output_shapes[j]
+          j += 1
+    return nest.pack_sequence_as(self._func_outputs, outputs_list)
 
   @property
   def output_dtypes(self):
@@ -457,12 +490,11 @@ class GraphModeFunction(object):
       if v._trainable:  # pylint: disable=protected-access
         tape.watch_variable(v)
 
-    tensor_inputs = [x for x in nest.flatten(args)
-                     if isinstance(x, ops.Tensor)]
+    tensor_inputs = [x for x in nest.flatten(args) if isinstance(x, ops.Tensor)]
     if tape.should_record(tensor_inputs) or tape.should_record(
         self._extra_inputs):
-      if not self._has_backprop:
-        self._compute_backprop()
+      if self._backward_function is None:
+        self._construct_backprop_function()
       return self._backprop_call(tensor_inputs)
 
     ctx = context.context()
@@ -503,13 +535,30 @@ class GraphModeFunction(object):
     """
     if self._func_outputs is None:
       return None
+    # Use `nest.flatten` instead of `_flatten` in order to preserve any
+    # IndexedSlices in `self._func_outputs`.
     outputs_list = nest.flatten(self._func_outputs)
     j = 0
     for i, o in enumerate(outputs_list):
       if o is not None:
-        outputs_list[i] = result[j]
-        j += 1
-    return nest.pack_sequence_as(self._func_outputs, outputs_list)
+        if isinstance(o, ops.IndexedSlices):
+          # Repack Tensors for IndexedSlices.
+          if o.dense_shape is not None:
+            outputs_list[i] = ops.IndexedSlices(
+                values=result[j],
+                indices=result[j + 1],
+                dense_shape=result[j + 2])
+            j += 3
+          else:
+            outputs_list[i] = ops.IndexedSlices(
+                values=result[j],
+                indices=result[j + 1])
+            j += 2
+        else:
+          outputs_list[i] = result[j]
+          j += 1
+    ret = nest.pack_sequence_as(self._func_outputs, outputs_list)
+    return ret
 
 
 def _get_defun_inputs(args):
@@ -555,7 +604,7 @@ def _defun_internal(name, func, args, kwds):
 
         # Returning a closed-over tensor as an output does not trigger a
         # call to convert_to_tensor, so we manually capture all such tensors.
-        outputs_list = nest.flatten(func_outputs)
+        outputs_list = _flatten(func_outputs)
         func_def_outputs = [
             _convert_to_graph_tensor(x) for x in outputs_list if x is not None
         ]
@@ -600,6 +649,18 @@ def _cache_key(x):
   """Cache key for tfe functions."""
   if isinstance(x, ops.Tensor):
     return _TensorDtype(x.dtype, x._shape_tuple())  # pylint: disable=protected-access
+  if isinstance(x, ops.IndexedSlices):
+    if x.dense_shape is not None:
+      return tuple([
+          _TensorDtype(x.values.dtype, x.values._shape_tuple()),  # pylint: disable=protected-access
+          _TensorDtype(x.indices.dtype, x.indices._shape_tuple()),  # pylint: disable=protected-access
+          _TensorDtype(x.dense_shape.dtype, x.dense_shape._shape_tuple())  # pylint: disable=protected-access
+      ])
+    else:
+      return tuple([
+          _TensorDtype(x.values.dtype, x.values._shape_tuple()),  # pylint: disable=protected-access
+          _TensorDtype(x.indices.dtype, x.indices._shape_tuple())  # pylint: disable=protected-access
+      ])
   if isinstance(x, np.ndarray):
     return ("array", x.shape, tuple(x.reshape(-1)))
   if isinstance(x, (list, tuple)):
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 2cb2cfb76c..3e8e67ac7e 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -374,6 +374,78 @@ class FunctionTest(test.TestCase):
 
     self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
 
+  def testGradientOfGatherWithDefun(self):
+
+    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
+
+    def sum_gather():
+      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
+
+    grad_fn = backprop.implicit_grad(sum_gather)
+    gradient = grad_fn()
+    defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather))
+    defun_gradient = defun_grad_fn()
+    self.assertEqual(len(gradient), len(defun_gradient))
+
+    gradient = gradient[0][0]
+    defun_gradient = defun_gradient[0][0]
+    self.assertAllEqual(gradient.values, defun_gradient.values)
+    self.assertAllEqual(gradient.indices, defun_gradient.indices)
+    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
+
+  def testReturningIndexedSlicesWithDefun(self):
+
+    def validate(indexed_slice):
+      def f():
+        return indexed_slice
+
+      output = function.defun(f)()
+      self.assertTrue(isinstance(output, ops.IndexedSlices))
+      self.assertAllEqual(indexed_slice.values, output.values)
+      self.assertAllEqual(indexed_slice.indices, output.indices)
+      self.assertAllEqual(indexed_slice.dense_shape, output.dense_shape)
+
+      self.assertEqual(
+          function.make_defun_op(f).output_shapes, indexed_slice.values.shape)
+
+    arg = ops.IndexedSlices(
+        values=constant_op.constant([1, 2]),
+        indices=constant_op.constant([0, 1]),
+        dense_shape=constant_op.constant([2]))
+    validate(arg)
+
+    arg = ops.IndexedSlices(
+        values=constant_op.constant([1, 2]),
+        indices=constant_op.constant([0, 1]),
+        dense_shape=None)
+    validate(arg)
+
+  def testIndexedSliceAsArgumentWithDefun(self):
+
+    @function.defun
+    def f(indexed_slice):
+      return indexed_slice
+
+    def validate(arg):
+      output = f(arg)
+      self.assertTrue(isinstance(output, ops.IndexedSlices))
+      self.assertAllEqual(arg.values, output.values)
+      self.assertAllEqual(arg.indices, output.indices)
+      self.assertAllEqual(arg.dense_shape, output.dense_shape)
+
+    indexed_slice = ops.IndexedSlices(
+        values=constant_op.constant([1]),
+        indices=constant_op.constant([0]),
+        dense_shape=constant_op.constant([1]))
+    validate(indexed_slice)
+
+    # Test that `f` works even when `dense_shape` is None.
+    indexed_slice = ops.IndexedSlices(
+        values=constant_op.constant([1]),
+        indices=constant_op.constant([0]),
+        dense_shape=None)
+    validate(indexed_slice)
+
   def testFunctionOnDevice(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
-- 
GitLab


From e8e33b0050e7e1ff686312bcbdafa270c2e29462 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Wed, 31 Jan 2018 18:08:33 -0800
Subject: [PATCH 502/966] Test all TFLite kernel implementations
 (reference/optimized/...)

PiperOrigin-RevId: 184077940
---
 tensorflow/contrib/lite/kernels/conv.cc      | 122 +++++++++++--------
 tensorflow/contrib/lite/kernels/conv_test.cc |  69 ++++++++---
 tensorflow/contrib/lite/kernels/test_util.cc |  11 +-
 tensorflow/contrib/lite/kernels/test_util.h  |  52 ++++++++
 4 files changed, 182 insertions(+), 72 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 37f499a4d0..a5095e1e64 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -42,7 +42,7 @@ namespace conv {
 enum KernelType {
   kReference,
   kGenericOptimized,  // Neon-free
-  kNeonOptimized,
+  kMultithreadOptimized,
 };
 
 struct OpData {
@@ -290,26 +290,33 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   auto filter_offset = -filter->params.zero_point;
   auto output_offset = output->params.zero_point;
 
-  if (kernel_type == kReference) {
-    reference_ops::Conv(
-        GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
-        GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
-        GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
-        params->stride_height, data->padding.width, data->padding.height,
-        output_offset, data->output_multiplier, data->output_shift,
-        data->output_activation_min, data->output_activation_max,
-        GetTensorData<uint8_t>(output), GetTensorDims(output),
-        GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
-  } else {
-    optimized_ops::Conv(
-        GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
-        GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
-        GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
-        params->stride_height, data->padding.width, data->padding.height,
-        output_offset, data->output_multiplier, data->output_shift,
-        data->output_activation_min, data->output_activation_max,
-        GetTensorData<uint8_t>(output), GetTensorDims(output),
-        GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
+  switch (kernel_type) {
+    case kReference:
+      reference_ops::Conv(
+          GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+          GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+          GetTensorData<int32_t>(bias), GetTensorDims(bias),
+          params->stride_width, params->stride_height, data->padding.width,
+          data->padding.height, output_offset, data->output_multiplier,
+          data->output_shift, data->output_activation_min,
+          data->output_activation_max, GetTensorData<uint8_t>(output),
+          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
+          GetTensorDims(im2col), gemm_context);
+      break;
+    case kGenericOptimized:
+    case kMultithreadOptimized:
+      // There is only one optimized implementation for Quantized Conv.
+      optimized_ops::Conv(
+          GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+          GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+          GetTensorData<int32_t>(bias), GetTensorDims(bias),
+          params->stride_width, params->stride_height, data->padding.width,
+          data->padding.height, output_offset, data->output_multiplier,
+          data->output_shift, data->output_activation_min,
+          data->output_activation_max, GetTensorData<uint8_t>(output),
+          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
+          GetTensorDims(im2col), gemm_context);
+      break;
   }
 }
 
@@ -322,31 +329,46 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
 
-  if (kernel_type == kReference) {
-    reference_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
-                        GetTensorData<float>(filter), GetTensorDims(filter),
-                        GetTensorData<float>(bias), GetTensorDims(bias),
-                        params->stride_width, params->stride_height,
-                        data->padding.width, data->padding.height,
-                        output_activation_min, output_activation_max,
-                        GetTensorData<float>(output), GetTensorDims(output),
-                        GetTensorData<float>(im2col), GetTensorDims(im2col));
-  } else {
-    const float* filter_data;
-    if (data->need_hwcn_weights) {
-      filter_data = GetTensorData<float>(hwcn_weights);
-    } else {
-      filter_data = GetTensorData<float>(filter);
+  switch (kernel_type) {
+    case kReference: {
+      reference_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
+                          GetTensorData<float>(filter), GetTensorDims(filter),
+                          GetTensorData<float>(bias), GetTensorDims(bias),
+                          params->stride_width, params->stride_height,
+                          data->padding.width, data->padding.height,
+                          output_activation_min, output_activation_max,
+                          GetTensorData<float>(output), GetTensorDims(output),
+                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      break;
+    }
+    case kGenericOptimized: {
+      optimized_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
+                          GetTensorData<float>(filter), GetTensorDims(filter),
+                          GetTensorData<float>(bias), GetTensorDims(bias),
+                          params->stride_width, params->stride_height,
+                          data->padding.width, data->padding.height,
+                          output_activation_min, output_activation_max,
+                          GetTensorData<float>(output), GetTensorDims(output),
+                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      break;
+    }
+    case kMultithreadOptimized: {
+      const float* filter_data;
+      if (data->need_hwcn_weights) {
+        filter_data = GetTensorData<float>(hwcn_weights);
+      } else {
+        filter_data = GetTensorData<float>(filter);
+      }
+      multithreaded_ops::Conv(
+          GetTensorData<float>(input), GetTensorDims(input), filter_data,
+          GetTensorDims(filter), GetTensorData<float>(bias),
+          GetTensorDims(bias), params->stride_width, params->stride_height,
+          data->padding.width, data->padding.height, params->padding,
+          output_activation_min, output_activation_max,
+          GetTensorData<float>(output), GetTensorDims(output),
+          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      break;
     }
-
-    multithreaded_ops::Conv(
-        GetTensorData<float>(input), GetTensorDims(input), filter_data,
-        GetTensorDims(filter), GetTensorData<float>(bias), GetTensorDims(bias),
-        params->stride_width, params->stride_height, data->padding.width,
-        data->padding.height, params->padding, output_activation_min,
-        output_activation_max, GetTensorData<float>(output),
-        GetTensorDims(output), GetTensorData<float>(im2col),
-        GetTensorDims(im2col));
   }
 }
 
@@ -407,18 +429,14 @@ TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
   return &r;
 }
 
-TfLiteRegistration* Register_CONVOLUTION_NEON_OPT() {
+TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
   static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
-                                 conv::Eval<conv::kNeonOptimized>};
+                                 conv::Eval<conv::kMultithreadOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_CONV_2D() {
-#ifdef USE_NEON
-  return Register_CONVOLUTION_NEON_OPT();
-#else
-  return Register_CONVOLUTION_GENERIC_OPT();
-#endif
+  return Register_CONVOLUTION_MULTITHREADED_OPT();
 }
 
 }  // namespace builtin
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index 1d0a81c313..461efffe39 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -21,6 +21,17 @@ limitations under the License.
 #include "tensorflow/contrib/lite/model.h"
 
 namespace tflite {
+
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_CONVOLUTION_REF();
+TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT();
+TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT();
+
+}  // namespace builtin
+}  // namespace ops
+
 namespace {
 
 using ::testing::ElementsAreArray;
@@ -30,9 +41,9 @@ class BaseConvolutionOpModel : public SingleOpModel {
   // TODO(ahentz): Also test different activation types, bias, padding types,
   // stride values.
   BaseConvolutionOpModel(
-      const TensorData& input, const TensorData& filter,
-      const TensorData& output, int stride_width = 2, int stride_height = 2,
-      enum Padding padding = Padding_VALID,
+      TfLiteRegistration* registration, const TensorData& input,
+      const TensorData& filter, const TensorData& output, int stride_width = 2,
+      int stride_height = 2, enum Padding padding = Padding_VALID,
       enum ActivationFunctionType activation = ActivationFunctionType_NONE) {
     input_ = AddInput(input);
     filter_ = AddInput(filter);
@@ -62,6 +73,8 @@ class BaseConvolutionOpModel : public SingleOpModel {
                                      stride_height, activation)
                      .Union());
 
+    resolver_ = absl::make_unique<SingleOpResolver>(BuiltinOperator_CONV_2D,
+                                                    registration);
     BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
   }
 
@@ -83,12 +96,25 @@ class ConvolutionOpModel : public BaseConvolutionOpModel {
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor(input_, data);
   }
-
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
 };
 
-TEST(ConvolutionOpTest, SimpleTestFloat32) {
-  ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
+const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
+    {"Reference", ops::builtin::Register_CONVOLUTION_REF()},
+    {"GenericOptimized", ops::builtin::Register_CONVOLUTION_GENERIC_OPT()},
+    {"MultithreadedOptimized",
+     ops::builtin::Register_CONVOLUTION_MULTITHREADED_OPT()},
+});
+
+class ConvolutionOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(ConvolutionOpTest, SimpleTestFloat32) {
+  ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}},
                        {TensorType_FLOAT32, {3, 2, 2, 1}},
                        {TensorType_FLOAT32, {}});
 
@@ -117,8 +143,8 @@ TEST(ConvolutionOpTest, SimpleTestFloat32) {
                              }));
 }
 
-TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) {
-  ConvolutionOpModel m({TensorType_FLOAT32, {1, 3, 6, 1}},
+TEST_P(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) {
+  ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 6, 1}},
                        {TensorType_FLOAT32, {1, 2, 2, 1}},
                        {TensorType_FLOAT32, {}},
                        /*stride_width=*/3, /*stride_height=*/1);
@@ -139,7 +165,7 @@ TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) {
                              }));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -150,6 +176,7 @@ TEST(ConvolutionOpTest, HandCalculatedFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_SAME;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -192,7 +219,7 @@ TEST(ConvolutionOpTest, HandCalculatedFloat32) {
                                                178, 187, 234, 261, 121}));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -203,6 +230,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_SAME;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -245,7 +273,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
                                                367, 188, 197, 244, 271, 131}));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -256,6 +284,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_SAME;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -300,7 +329,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
               ElementsAreArray({0, 0, 0, 0, 35, 112, 157, 0, 0, 34, 61, 0}));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedValidFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedValidFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -311,6 +340,7 @@ TEST(ConvolutionOpTest, HandCalculatedValidFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_VALID;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -366,8 +396,9 @@ class QuantizedConvolutionOpModel : public BaseConvolutionOpModel {
 
 // In this tests we set the input and output scales so that the results
 // match exactly the 'non-quantized' version.
-TEST(ConvolutionOpTest, SimpleTestQuantized) {
-  QuantizedConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
+TEST_P(ConvolutionOpTest, SimpleTestQuantized) {
+  QuantizedConvolutionOpModel m(GetRegistration(),
+                                {TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
                                 {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64},
                                 {TensorType_UINT8, {}, -127, 128});
   m.SetInput({
@@ -405,8 +436,9 @@ TEST(ConvolutionOpTest, SimpleTestQuantized) {
                              }));
 }
 
-TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
-  QuantizedConvolutionOpModel m({TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64},
+TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
+  QuantizedConvolutionOpModel m(GetRegistration(),
+                                {TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64},
                                 {TensorType_UINT8, {1, 2, 2, 1}, -63.5, 64},
                                 {TensorType_UINT8, {}, -127, 128},
                                 /*stride_width=*/3, /*stride_height=*/1);
@@ -430,6 +462,11 @@ TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
                                  167, 93,   //
                              }));
 }
+
+INSTANTIATE_TEST_CASE_P(
+    ConvolutionOpTest, ConvolutionOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
index 3a58e7ec32..6f56aa6bf3 100644
--- a/tensorflow/contrib/lite/kernels/test_util.cc
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -172,11 +172,14 @@ void SingleOpModel::BuildInterpreter(
 
   auto* model = GetModel(builder_.GetBufferPointer());
 
-  ops::builtin::BuiltinOpResolver builtins;
-  for (const auto& reg : custom_registrations_) {
-    builtins.AddCustom(reg.first.data(), reg.second());
+  if (!resolver_) {
+    auto resolver = new ops::builtin::BuiltinOpResolver();
+    for (const auto& reg : custom_registrations_) {
+      resolver->AddCustom(reg.first.data(), reg.second());
+    }
+    resolver_ = std::unique_ptr<OpResolver>(resolver);
   }
-  InterpreterBuilder(model, builtins)(&interpreter_);
+  InterpreterBuilder(model, *resolver_)(&interpreter_);
 
   CHECK(interpreter_ != nullptr);
 
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
index cc445299ff..7d476ba1ea 100644
--- a/tensorflow/contrib/lite/kernels/test_util.h
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -85,6 +85,23 @@ struct TensorData {
   int32_t zero_point;
 };
 
+class SingleOpResolver : public OpResolver {
+ public:
+  SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration)
+      : op_(op), registration_(registration) {}
+  TfLiteRegistration* FindOp(BuiltinOperator op) const override {
+    if (op == op_) {
+      return registration_;
+    }
+    return nullptr;
+  }
+  TfLiteRegistration* FindOp(const char* op) const override { return nullptr; }
+
+ private:
+  const BuiltinOperator op_;
+  TfLiteRegistration* registration_;
+};
+
 class SingleOpModel {
  public:
   SingleOpModel() {}
@@ -178,11 +195,16 @@ class SingleOpModel {
     return result;
   }
 
+  void SetResolver(std::unique_ptr<OpResolver> resolver) {
+    resolver_ = std::move(resolver);
+  }
+
  protected:
   int32_t GetTensorSize(int index) const;
 
   flatbuffers::FlatBufferBuilder builder_;
   std::unique_ptr<tflite::Interpreter> interpreter_;
+  std::unique_ptr<OpResolver> resolver_;
 
  private:
   int AddTensor(TensorData t, std::initializer_list<int> data);
@@ -197,6 +219,36 @@ class SingleOpModel {
   std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_;
 };
 
+// Base class for single op unit tests.
+// The tests are parameterized to test multiple kernels for a single op.
+// The parameters are strings like "optimized" and "reference" to have better
+// readability in test reports.
+//
+// To use this class:
+// * Define a constant map from strings to TfLiteRegistration.
+// * Implement a test class that inherits SingleOpTest.
+// * Instantiate the test cases with SingleOpTest::GetKernelTags helper
+//   function.
+// * Call GetRegistration to get the TfLiteRegistration to be used before
+//   building the interpreter.
+class SingleOpTest : public ::testing::TestWithParam<string> {
+ public:
+  static std::vector<string> GetKernelTags(
+      const std::map<string, TfLiteRegistration*>& kernel_map) {
+    std::vector<string> tags;
+    for (auto it : kernel_map) {
+      tags.push_back(it.first);
+    }
+    return tags;
+  }
+
+ protected:
+  virtual const std::map<string, TfLiteRegistration*>& GetKernelMap() = 0;
+  TfLiteRegistration* GetRegistration() {
+    return GetKernelMap().at(GetParam());
+  }
+};
+
 // Strings have a special implementation that is in test_util.cc
 template <>
 std::vector<string> SingleOpModel::ExtractVector(int index);
-- 
GitLab


From 02937cb516facfc63935f82acdac26cb76a8bbf0 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Wed, 31 Jan 2018 18:18:04 -0800
Subject: [PATCH 503/966] Add a new Dataset:
 PrependFromQueueAndPaddedBatchDataset.

PiperOrigin-RevId: 184078894
---
 .../dataset_serialization_test_base.py        |  21 +
 tensorflow/contrib/training/BUILD             |  23 +
 .../python/training/tensor_queue_dataset.py   | 200 ++++++
 .../training/tensor_queue_dataset_test.py     | 355 ++++++++++
 .../api_def_EnqueueInQueueDataset.pbtxt       |   3 +
 ...rependFromQueueAndPaddedBatchDataset.pbtxt |   3 +
 tensorflow/core/kernels/batch_util.cc         | 113 +++
 tensorflow/core/kernels/batch_util.h          |  10 +
 tensorflow/core/kernels/data/BUILD            |  15 +
 .../kernels/data/padded_batch_dataset_op.cc   | 116 +---
 .../kernels/data/tensor_queue_dataset_op.cc   | 646 ++++++++++++++++++
 tensorflow/core/kernels/gather_op.cc          |   3 +
 tensorflow/core/kernels/strided_slice_op.cc   |   1 +
 .../core/kernels/strided_slice_op_impl.h      |   3 +
 tensorflow/core/ops/dataset_ops.cc            |  25 +
 tensorflow/python/data/ops/dataset_ops.py     |  25 +-
 16 files changed, 1449 insertions(+), 113 deletions(-)
 create mode 100644 tensorflow/contrib/training/python/training/tensor_queue_dataset.py
 create mode 100644 tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
 create mode 100644 tensorflow/core/kernels/data/tensor_queue_dataset_op.cc

diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index 3f64475e47..dbc35097dd 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -35,6 +36,20 @@ from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util import nest
 
 
+def remove_variants(get_next_op):
+  # TODO(b/72408568): Remove this once session.run can get
+  # variant tensors.
+  """Remove variants from a nest structure, so sess.run will execute."""
+
+  def _remove_variant(x):
+    if isinstance(x, ops.Tensor) and x.dtype == dtypes.variant:
+      return ()
+    else:
+      return x
+
+  return nest.map_structure(_remove_variant, get_next_op)
+
+
 class DatasetSerializationTestBase(test.TestCase):
   """Base class for testing serializable datasets."""
 
@@ -235,6 +250,7 @@ class DatasetSerializationTestBase(test.TestCase):
       saver = self._import_meta_graph()
       init_op, get_next_op = self._get_iterator_ops_from_collection(
           ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
         self._restore(saver, sess)
         self._initialize(init_op, sess)
@@ -297,6 +313,7 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default() as g:
       _, get_next_op, saver = self._build_graph(
           ds_fn2, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
         self._restore(saver, sess)
         for _ in range(num_outputs - break_point):
@@ -357,6 +374,7 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default() as g:
       get_next_op, saver = self._build_empty_graph(
           ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
         self._restore(saver, sess)
         for _ in range(num_outputs - break_point):
@@ -390,6 +408,7 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, saver = self._build_graph(
           ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
         self._initialize(init_op, sess)
         for _ in range(break_point):
@@ -485,11 +504,13 @@ class DatasetSerializationTestBase(test.TestCase):
       else:
         init_op, get_next_op, saver = self._build_graph(
             ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       return init_op, get_next_op, saver
 
     for i in range(len(break_points) + 1):
       with ops.Graph().as_default() as g:
         init_op, get_next_op, saver = get_ops()
+        get_next_op = remove_variants(get_next_op)
         with self.test_session(graph=g) as sess:
           if ckpt_saved:
             if init_before_restore:
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index cccaa2b833..6db373d2d5 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -26,6 +26,7 @@ py_library(
         "python/training/resample.py",
         "python/training/sampling_ops.py",
         "python/training/sequence_queueing_state_saver.py",
+        "python/training/tensor_queue_dataset.py",
         "python/training/training.py",
         "python/training/tuner.py",
     ],
@@ -285,6 +286,28 @@ py_test(
     ],
 )
 
+py_test(
+    name = "tensor_queue_dataset_test",
+    size = "large",
+    srcs = ["python/training/tensor_queue_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":training_py",
+        "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data",
+        "//third_party/py/numpy",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
new file mode 100644
index 0000000000..409aba817c
--- /dev/null
+++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
@@ -0,0 +1,200 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for Datasets and Iterators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util import nest as tf_nest
+
+
+class _PrependFromQueueAndPaddedBatchDataset(dataset_ops.Dataset):
+  """A `Dataset` that prepends a queue to another `Dataset`.
+
+  A vector of handles to the queue is returned as the first component of
+  the associated iterator.  This vector can be passed to
+  `enqueue_in_queue_dataset` to add new elements to the queue.
+  """
+
+  def __init__(self, input_dataset, batch_size, padded_shapes, padding_values):
+    """Initialize `PrependFromQueueAndPaddedBatchDataset`."""
+    super(_PrependFromQueueAndPaddedBatchDataset, self).__init__()
+    if sparse.any_sparse(input_dataset.output_classes):
+      raise TypeError(
+          "Batching of padded sparse tensors is not currently supported")
+    self._input_dataset = input_dataset
+    self._batch_size = ops.convert_to_tensor(
+        batch_size, dtype=dtypes.int64, name="batch_size")
+    # pylint: disable=protected-access
+    if padded_shapes is None:
+      self._padded_shapes = nest.map_structure(
+          dataset_ops._partial_shape_to_tensor, input_dataset.output_shapes)
+    else:
+      self._padded_shapes = nest.map_structure_up_to(
+          input_dataset.output_shapes, dataset_ops._partial_shape_to_tensor,
+          padded_shapes)
+    padding_values = (
+        padding_values if padding_values is not None else
+        dataset_ops._default_padding(input_dataset))
+    self._padding_values = nest.map_structure_up_to(
+        input_dataset.output_shapes, dataset_ops._padding_value_to_tensor,
+        padding_values, input_dataset.output_types)
+    # pylint: enable=protected-access
+
+  def _as_variant_tensor(self):
+    # pylint: disable=protected-access
+    return gen_dataset_ops.prepend_from_queue_and_padded_batch_dataset(
+        self._input_dataset._as_variant_tensor(),
+        batch_size=self._batch_size,
+        padded_shapes=[
+            ops.convert_to_tensor(s, dtype=dtypes.int64)
+            for s in nest.flatten(self._padded_shapes)
+        ],
+        padding_values=nest.flatten(self._padding_values),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+    # pylint: enable=protected-access
+
+  @property
+  def output_classes(self):
+    return (ops.Tensor, self._input_dataset.output_classes)
+
+  def _as_batch_shape(self, shape_like):
+    return tensor_shape.vector(None).concatenate(
+        tensor_util.constant_value_as_shape(shape_like))
+
+  @property
+  def output_shapes(self):
+    # First output is a variant representing the Queue
+    return (tensor_shape.vector(None),
+            nest.map_structure(self._as_batch_shape, self._padded_shapes))
+
+  @property
+  def output_types(self):
+    # First output is a variant representing the Queue
+    return (dtypes.variant, self._input_dataset.output_types)
+
+
+def prepend_from_queue_and_padded_batch_dataset(batch_size,
+                                                padding_values=None,
+                                                padded_shapes=None):
+  """A transformation that prepends a queue to a `Dataset` and batches results.
+
+  A vector of handles to the queue is returned as the first component of the
+  associated iterator.  This vector can be passed to `enqueue_in_queue_dataset`
+  to add new elements to the queue.
+
+  Below is an example of how this dataset might be used to split incoming
+  variable-length sequences into "head" and "rest" parts, where "rest" parts
+  are re-enqueued back into the dataset.  A more realistic example would
+  perform some calculation on the "head" and modify some components of "rest"
+  with the result (before re-enqueueing).
+
+  ```python
+  dataset = tf.data.Dataset.from_tensor_slices([2*x for x in range(10)])
+  # Make a dataset of variable-length vectors and their lengths.
+  dataset = dataset.map(lambda count: (count, tf.ones((count,))))
+  # Emit a queue we can prepend to, and counts/values as padded batch.
+  dataset = dataset.apply(
+      tf.contrib.training.prepend_from_queue_and_padded_batch_dataset(
+        batch_size=10))
+  dataset = dataset.prefetch(1)
+
+  iterator = dataset.make_one_shot_iterator()
+  queue, (count, padded_value) = iterator.get_next()
+
+  # Split the padded_value into two pieces: head and rest
+  rest_indices = tf.squeeze(tf.where(count > 3), axis=1)
+  bound = tf.minimum(3, tf.reduce_max(count))
+  value_head = padded_value[:, :bound]
+  count_rest = tf.gather(count - 3, rest_indices)
+  value_rest = tf.gather(padded_value[:, bound:], rest_indices)
+  queue_rest = tf.gather(queue, rest_indices)
+  enqueue_rest_op = tf.contrib.training.enqueue_in_queue_dataset(
+    queue_rest, (count_rest, value_rest))
+  with tf.control_dependencies([enqueue_rest_op]):
+    calculation = fn(value_head)
+
+  while True:  # Will raise OutOfRange when finished with all pieces.
+    session.run(calculation)
+  ```
+
+  Args:
+    batch_size: `int64` scalar tensor.  The batch size to use when performing
+      padded batching.
+    padding_values: (optional) Nested tuple of scalar tensors.  If provided,
+      the structure and dtypes of padding_values should match that of
+      incoming dataset's `output_types`.
+    padded_shapes: (optional) Nested tuple of `int64` vector tensors.
+      If provided, the structure must match that of the incoming dataset's
+      `output_types`.  If not provided, the incoming dataset's `output_shapes`
+      is used.  Any unknown (`None` or `-1`) dimensions in the shapes are
+      treated as being unique per-batch: for each batch time, an unknown
+      dimension is replaced with the maximum given value of this dimension
+      across all tensors for the given component in the batch.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return _PrependFromQueueAndPaddedBatchDataset(
+        dataset,
+        batch_size=batch_size,
+        padding_values=padding_values,
+        padded_shapes=padded_shapes)
+
+  return _apply_fn
+
+
+def enqueue_in_queue_dataset(queue, components):
+  """Enqueue components into queue from `PrependFromQueueAndPaddedBatchDataset`.
+
+  The components' dtypes and shapes must be compatible with the `output_shapes`
+  attribute of the `dataset` created by
+  `prepend_from_queue_and_padded_batch_dataset`.  This operation supports both
+  non-batched and batched modes.
+
+  For more details, see the example in the docstring for
+  `prepend_from_queue_and_padded_batch_dataset`.
+
+  Args:
+    queue: `variant` scalar or vector tensor.
+      The tensor emitted by the first component of the iterator associated with
+      `prepend_from_queue_and_padded_batch_dataset`.  If this is a scalar,
+      then the `components` input tensors should not have a prepended batch
+      dimension.
+    components: Nested tuple of tensors, each with a leading batch dimension
+      if `queue` is a vector.  The structure, dtypes, and shapes
+      (excluding batch dimension) must match the nested tuples
+      `dataset.output_types[1]` and `dataset.output_shapes[1]` (the non-queue
+      output types and shapes) of the `dataset` emitted by
+      the original `prepend_from_queue_and_padded_batch_dataset` call.
+
+  Returns:
+    An `Operation` that enqueues `components` into the dataset(s) associated
+    with entries of `queue`.
+  """
+  return gen_dataset_ops.enqueue_in_queue_dataset(
+      queue=queue, components=tf_nest.flatten(components))
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
new file mode 100644
index 0000000000..0338f409a2
--- /dev/null
+++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
@@ -0,0 +1,355 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for TensorQueueDataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
+
+  def testNoEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    self.assertEqual((dtypes.variant, dtypes.int32), dataset.output_types)
+    self.assertAllEqual(([None],) * 2,
+                        [x.as_list() for x in dataset.output_shapes])
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    self.assertEqual([0], self.evaluate(value))
+    self.assertEqual([1], self.evaluate(value))
+    self.assertEqual([2], self.evaluate(value))
+    with self.assertRaisesOpError("End of sequence"):
+      self.evaluate(value)
+
+  def testBatchedNoEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    self.assertAllEqual([0, 1], self.evaluate(value))
+    self.assertAllEqual([2], self.evaluate(value))
+    with self.assertRaisesOpError("End of sequence"):
+      self.evaluate(value)
+
+  def testBatchedWithBiggerPaddingNoEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=2, padded_shapes=[3]))
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    self.assertAllEqual([[0, 0, 0], [1, 0, 0]], self.evaluate(value))
+    self.assertAllEqual([[2, 0, 0]], self.evaluate(value))
+    with self.assertRaisesOpError("End of sequence"):
+      self.evaluate(value)
+
+  def testBatchedWithBiggerPaddingOneEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=1, padded_shapes=[3]))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
+    with self.test_session() as sess:
+      self.assertAllEqual([[0, 0, 0]], sess.run(value))
+      value_1, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([[1, 0, 0]], value_1)
+      value_2, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([[-1, 0, 0]], value_2)
+      value_3 = sess.run(value)
+      self.assertAllEqual([[1, 0, 0]], value_3)
+      value_4, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([[2, 0, 0]], value_4)
+      value_5 = sess.run(value)
+      self.assertAllEqual([[-2, 0, 0]], value_5)
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testOneEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
+    with self.test_session() as sess:
+      self.assertEqual([0], sess.run(value))
+      value_1, _ = sess.run([value, enqueue_negative])
+      self.assertEqual([1], value_1)
+      value_2, _ = sess.run([value, enqueue_negative])
+      self.assertEqual([-1], value_2)
+      value_3 = sess.run(value)
+      self.assertEqual([1], value_3)
+      value_4, _ = sess.run([value, enqueue_negative])
+      self.assertEqual([2], value_4)
+      value_5 = sess.run(value)
+      self.assertEqual([-2], value_5)
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testBatchedOneEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
+    enqueue_zeroth = tqd.enqueue_in_queue_dataset([queue_handle[0]],
+                                                  array_ops.expand_dims(
+                                                      value[0], axis=0))
+    with self.test_session() as sess:
+      value_0, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([0, 1], value_0)
+      value_1, _ = sess.run([value, enqueue_zeroth])
+      self.assertAllEqual([0, -1], value_1)
+      value_2, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([0, 2], value_2)
+      self.assertAllEqual([0, -2], sess.run(value))
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testManyEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_many_more = [
+        tqd.enqueue_in_queue_dataset(queue_handle, value + 100 + i)
+        for i in range(1000)
+    ]
+    with self.test_session() as sess:
+      value_0, _ = sess.run((value, enqueue_many_more))
+      self.assertEqual([0], value_0)
+      rest = []
+      for _ in range(1000):
+        rest.append(sess.run(value))
+      self.assertEquals([[100 + i] for i in range(1000)], sorted(rest))
+      # Going back to the original input.
+      value_1, _ = sess.run((value, enqueue_many_more))
+      self.assertEqual(1, value_1)
+      rest = []
+      for _ in range(1000):
+        rest.append(sess.run(value))
+      self.assertEquals([[100 + i + 1] for i in range(1000)], sorted(rest))
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testEnqueueWithPrefetch(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    # Prefetching will request additional values before they are
+    # available to the queue.
+    dataset = dataset.prefetch(buffer_size=3)
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue = tqd.enqueue_in_queue_dataset(queue_handle, value + 1)
+    with self.test_session() as sess:
+      i = 0
+      while i < 4:
+        received, _ = sess.run((value, enqueue))
+        if received.size > 0:
+          self.assertAllEqual([i], received)
+          i += 1
+      received_last = False
+      while True:
+        try:
+          received = sess.run(value)
+          if received.size > 0:
+            self.assertAllEqual([4], received)
+            received_last = True
+        except errors.OutOfRangeError:
+          break
+      self.assertTrue(received_last)
+
+  def testDatasetWithPaddedShapeSmallerThanInputFails(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([[0, 0, 0]]).repeat(None)
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=1, padded_shapes=[2]))
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    with self.test_session() as sess:
+      with self.assertRaisesOpError(
+          r"Incompatible input shapes at component 0 between "
+          r"input dataset this dataset: \[3\] vs. \[2\]"):
+        sess.run(value)
+
+  def testEnqueueWithIncompatibleInputsFailsWithInformativeError(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0]).repeat(None)
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+
+    enqueue_bad_structure = tqd.enqueue_in_queue_dataset(
+        queue_handle, (value, value))
+    enqueue_bad_dtype = tqd.enqueue_in_queue_dataset(queue_handle,
+                                                     np.array(
+                                                         [1.0],
+                                                         dtype=np.float32))
+    enqueue_bad_shape_no_batch_dim = tqd.enqueue_in_queue_dataset(
+        queue_handle, ([1],))
+    enqueue_bad_shape = tqd.enqueue_in_queue_dataset(queue_handle,
+                                                     np.array(
+                                                         [[1]], dtype=np.int32))
+
+    with self.test_session() as sess:
+      with self.assertRaisesOpError(
+          "mismatched number of tensors.  Queue expects 1 tensors but "
+          "tried to insert 2"):
+        sess.run(enqueue_bad_structure)
+      with self.assertRaisesOpError(r"Expected component 0 to have batched "
+                                    r"shape \[1,...\], but saw shape: \[\]"):
+        sess.run(enqueue_bad_shape_no_batch_dim)
+      with self.assertRaisesOpError(
+          r"mismatched shapes at component 0.  Attempted to insert tensor "
+          r"with shape \[1\] but queue expected shape: \[\]"):
+        sess.run(enqueue_bad_shape)
+      with self.assertRaisesOpError(
+          r"mismatched dtypes at component 0.  Attempted to insert tensor "
+          r"of type float but queue expected type: int32"):
+        sess.run(enqueue_bad_dtype)
+
+  def testEnqueueWithPaddedBatchFailsWithInformativeError(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    with self.assertRaisesRegexp(
+        TypeError, r"Unable to create padding for field of type 'variant'"):
+      dataset.padded_batch(batch_size=10, padded_shapes=[1])
+
+  def testOneEnqueueWithPadding(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
+    # Make a dataset of variable-length vectors and their lengths.
+    dataset = dataset.map(
+        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
+    # Emit a queue we can prepend to, and counts/values as padded
+    # batch.
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=3))
+
+    iterator = dataset.make_one_shot_iterator()
+    queue, (count, padded_value) = iterator.get_next()
+
+    # Split the padded_value into two pieces: head and rest
+    rest_indices = array_ops.squeeze(array_ops.where(count > 2), axis=1)
+    bound = math_ops.minimum(2, math_ops.reduce_max(count))
+    value_head = padded_value[:, :bound]
+    count_rest = array_ops.gather(count - 2, rest_indices)
+    value_rest = array_ops.gather(padded_value, rest_indices)[:, bound:]
+    queue_rest = array_ops.gather(queue, rest_indices)
+    enqueue_rest_op = tqd.enqueue_in_queue_dataset(queue_rest,
+                                                   (count_rest, value_rest))
+    with ops.control_dependencies([enqueue_rest_op]):
+      calc = array_ops.identity(value_head)
+
+    with self.test_session() as sess:
+      self.assertAllEqual([[0, 0], [2, 2], [4, 4]], sess.run(calc))
+      self.assertAllEqual([[4, 4], [6, 6]], sess.run(calc))
+      self.assertAllEqual([[6, 6]], sess.run(calc))
+      self.assertAllEqual([[6, 6]], sess.run(calc))
+      # Get some final batches due to prefetching.
+      for _ in range(3):
+        try:
+          self.assertAllEqual(
+              np.empty(shape=(0, 0), dtype=np.int32), sess.run(calc))
+        except errors.OutOfRangeError as e:
+          self.assertTrue(str(e).startswith("End of sequence"))
+
+  def testNonstandardPadding(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
+    # Make a dataset of variable-length vectors and their lengths.
+    dataset = dataset.map(
+        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
+    # Emit a queue we can prepend to, and counts/values as padded
+    # batch.
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=3, padding_values=(
+                0,
+                -1,
+            )))
+
+    iterator = dataset.make_one_shot_iterator()
+    _, (unused_count, padded_value) = iterator.get_next()
+
+    with self.test_session() as sess:
+      self.assertAllEqual([[-1, -1, -1, -1], [2, 2, -1, -1], [4, 4, 4, 4]],
+                          sess.run(padded_value))
+      self.assertAllEqual([[6] * 6], sess.run(padded_value))
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(padded_value)
+
+
+# TODO(ebrevdo): Figure out how to use run_core_tests to test state
+# saving of an iterator that's had some tensors enqueued into its queue.
+class PrependFromQueueAndPaddedBatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testPrependFromQueueAndPaddedBatch(self):
+
+    def build_dataset(seq_lens):
+      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+          lambda x: array_ops.fill([x], x)).apply(
+              tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=4))
+
+    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
+    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
+    self.run_core_tests(lambda: build_dataset(seq_lens1),
+                        lambda: build_dataset(seq_lens2), 8)
+
+  def testPrependFromQueueAndPaddedBatchNonDefaultPadding(self):
+
+    def build_dataset(seq_lens):
+
+      def fill_tuple(x):
+        filled = array_ops.fill([x], x)
+        return (filled, string_ops.as_string(filled))
+
+      padded_shape = [-1]
+      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+          fill_tuple).apply(
+              tqd.prepend_from_queue_and_padded_batch_dataset(
+                  batch_size=4,
+                  padded_shapes=(padded_shape, padded_shape),
+                  padding_values=(-1, "<end>")))
+
+    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
+    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
+    self.run_core_tests(lambda: build_dataset(seq_lens1),
+                        lambda: build_dataset(seq_lens2), 8)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
new file mode 100644
index 0000000000..9722f5ede3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "EnqueueInQueueDataset"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
new file mode 100644
index 0000000000..d4549340fa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
+}
diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/kernels/batch_util.cc
index 7f2df95e2d..87d455faa7 100644
--- a/tensorflow/core/kernels/batch_util.cc
+++ b/tensorflow/core/kernels/batch_util.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 
+#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m)
+
 namespace tensorflow {
 namespace batch_util {
 
@@ -61,6 +63,21 @@ Status HandleElementToSlice<string>(Tensor element, Tensor* parent, int64 index,
   return Status::OK();
 }
 
+template <>
+Status HandleElementToSlice<Variant>(Tensor element, Tensor* parent,
+                                     int64 index, bool can_move) {
+  auto parent_as_matrix = parent->flat_outer_dims<Variant>();
+  auto element_flat = element.flat<Variant>();
+  if (can_move) {
+    for (int64 i = 0; i < element.NumElements(); ++i) {
+      parent_as_matrix(index, i) = std::move(element_flat(i));
+    }
+  } else {
+    parent_as_matrix.chip(index, 0) = element_flat;
+  }
+  return Status::OK();
+}
+
 // TODO(jsimsa): Add HandleElementToSlice<variant> specialization that moves
 // the data when possible.
 
@@ -115,5 +132,101 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   }
 }
 
+// The following five functions are copied from padding_fifo_queue.cc.
+// TODO(mrry): Reconcile these functions with the similar methods in the
+// queue implementation.
+Status ValidateElementToLargerSlice(const Tensor& element, Tensor* parent) {
+  DCHECK_NE(parent->dim_size(0), 0);
+  if (element.NumElements() > (parent->NumElements() / parent->dim_size(0))) {
+    TensorShape chip_shape = parent->shape();
+    chip_shape.RemoveDim(0);
+    return errors::Internal(
+        "HandleElementToLargerSlice Cannot copy slice: number of entries in "
+        "element is greater than number of elements in parent slice.  ",
+        "Shapes are: [element]: ", element.shape().DebugString(),
+        ", [parent slice]: ", chip_shape.DebugString());
+  }
+  return Status::OK();
+}
+
+template <typename T, int NDIMS>
+Status HandleElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                  int index) {
+  TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(element, parent));
+  if (element.NumElements() == 0) {
+    return Status::OK();
+  }
+  auto element_t = element.tensor<T, NDIMS>();
+  auto parent_t = parent->tensor<T, NDIMS + 1>();
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_indices;
+  slice_indices[0] = index;
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_size;
+  slice_size[0] = 1;
+  for (size_t i = 1; i < slice_size.size(); ++i) {
+    slice_size[i] = element_t.dimension(i - 1);
+  }
+  parent_t.slice(slice_indices, slice_size) = element_t.reshape(slice_size);
+  return Status::OK();
+}
+
+template <int NDIMS>
+Status HandleElementToLargerSliceWithRank(const Tensor& element, Tensor* parent,
+                                          int index) {
+#define HANDLE_TYPE(T)                                                   \
+  case DataTypeToEnum<T>::value: {                                       \
+    return HandleElementToLargerSlice<T, NDIMS>(element, parent, index); \
+  }
+
+  switch (element.dtype()) {
+    TF_CALL_DATASET_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented(
+          "HandleElementToLargerSliceWithRank Unhandled data type: ",
+          element.dtype());
+  }
+}
+
+Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                int index) {
+  if (parent->dims() != element.dims() + 1) {
+    return errors::Internal(
+        "Mismatched ranks.  Element's rank is: ", element.dims(),
+        " but element is meant to be a slice in output Tensor having rank: ",
+        parent->dims(), " (should be: ", element.dims() + 1, ")");
+  }
+
+#define HANDLE_DIMS(NDIMS)                                                  \
+  case NDIMS: {                                                             \
+    TF_RETURN_IF_ERROR(                                                     \
+        HandleElementToLargerSliceWithRank<NDIMS>(element, parent, index)); \
+    return Status::OK();                                                    \
+  }
+
+  switch (element.dims()) {
+    HANDLE_DIMS(0);
+    HANDLE_DIMS(1);
+    HANDLE_DIMS(2);
+    HANDLE_DIMS(3);
+    HANDLE_DIMS(4);
+#undef HANDLE_DIMS
+    default:
+      return errors::Unimplemented("CopyElementToLargerSlice Unhandled rank: ",
+                                   element.dims());
+  }
+}
+
+Status SetElementZero(Tensor* element, const Tensor& padding) {
+#define HANDLE_TYPE(T)                                     \
+  if (element->dtype() == DataTypeToEnum<T>::value) {      \
+    element->flat<T>().setConstant(padding.scalar<T>()()); \
+    return Status::OK();                                   \
+  }
+  TF_CALL_DATASET_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+  return errors::Unimplemented("SetElementZero Unhandled data type: ",
+                               element->dtype());
+}
+
 }  // namespace batch_util
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h
index 0d634ae7b0..a47bf1935d 100644
--- a/tensorflow/core/kernels/batch_util.h
+++ b/tensorflow/core/kernels/batch_util.h
@@ -32,6 +32,16 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
 // Copies the index^th slice of parent (in the 0th dimension) into element.
 Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index);
 
+// Zero-initializes the tensor `element` using the scalar stored in `padding`.
+// Both `element` and `padding` must have matching `dtype`.
+Status SetElementZero(Tensor* element, const Tensor& padding);
+
+// Copies `element` into a (0th dimension) slice of `parent`, assuming
+// the shape of `element` is strictly not larger along any axis than a
+// slice.
+Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                int index);
+
 }  // namespace batch_util
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index cdb4023861..c4e21257ff 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -121,6 +121,7 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
     ],
 )
 
@@ -401,6 +402,19 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "tensor_queue_dataset_op",
+    srcs = ["tensor_queue_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
 tf_kernel_library(
     name = "tensor_slice_dataset_op",
     srcs = ["tensor_slice_dataset_op.cc"],
@@ -539,6 +553,7 @@ tf_kernel_library(
         ":stats_dataset_ops",
         ":take_dataset_op",
         ":tensor_dataset_op",
+        ":tensor_queue_dataset_op",
         ":tensor_slice_dataset_op",
         ":unique_dataset_op",
         ":zip_dataset_op",
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index 4fe4e8e294..cfb4efda9a 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
@@ -24,102 +25,6 @@ namespace {
 // See documentation in ../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-// The following five functions are copied from padding_fifo_queue.cc.
-// TODO(mrry): Reconcile these functions with the similar methods in the
-// queue implementation.
-Status ValidateElementToLargerSlice(const Tensor& element, Tensor* parent) {
-  DCHECK_NE(parent->dim_size(0), 0);
-  if (element.NumElements() > (parent->NumElements() / parent->dim_size(0))) {
-    TensorShape chip_shape = parent->shape();
-    chip_shape.RemoveDim(0);
-    return errors::Internal(
-        "HandleElementToLargerSlice Cannot copy slice: number of entries in "
-        "element is greater than number of elements in parent slice.  ",
-        "Shapes are: [element]: ", element.shape().DebugString(),
-        ", [parent slice]: ", chip_shape.DebugString());
-  }
-  return Status::OK();
-}
-
-template <typename T, int NDIMS>
-Status HandleElementToLargerSlice(const Tensor& element, Tensor* parent,
-                                  int index) {
-  TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(element, parent));
-  if (element.NumElements() == 0) {
-    return Status::OK();
-  }
-  auto element_t = element.tensor<T, NDIMS>();
-  auto parent_t = parent->tensor<T, NDIMS + 1>();
-  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_indices;
-  slice_indices[0] = index;
-  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_size;
-  slice_size[0] = 1;
-  for (size_t i = 1; i < slice_size.size(); ++i) {
-    slice_size[i] = element_t.dimension(i - 1);
-  }
-  parent_t.slice(slice_indices, slice_size) = element_t.reshape(slice_size);
-  return Status::OK();
-}
-
-template <int NDIMS>
-Status HandleElementToLargerSliceWithRank(const Tensor& element, Tensor* parent,
-                                          int index) {
-#define HANDLE_TYPE(T)                                                   \
-  case DataTypeToEnum<T>::value: {                                       \
-    return HandleElementToLargerSlice<T, NDIMS>(element, parent, index); \
-  }
-
-  switch (element.dtype()) {
-    TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-#undef HANDLE_TYPE
-    default:
-      return errors::Unimplemented(
-          "HandleElementToLargerSliceWithRank Unhandled data type: ",
-          element.dtype());
-  }
-}
-
-Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
-                                int index) {
-  if (parent->dims() != element.dims() + 1) {
-    return errors::Internal(
-        "Mismatched ranks.  Element's rank is: ", element.dims(),
-        " but element is meant to be a slice in output Tensor having rank: ",
-        parent->dims(), " (should be: ", element.dims() + 1, ")");
-  }
-
-#define HANDLE_DIMS(NDIMS)                                                  \
-  case NDIMS: {                                                             \
-    TF_RETURN_IF_ERROR(                                                     \
-        HandleElementToLargerSliceWithRank<NDIMS>(element, parent, index)); \
-    return Status::OK();                                                    \
-  }
-
-  switch (element.dims()) {
-    HANDLE_DIMS(0);
-    HANDLE_DIMS(1);
-    HANDLE_DIMS(2);
-    HANDLE_DIMS(3);
-    HANDLE_DIMS(4);
-#undef HANDLE_DIMS
-    default:
-      return errors::Unimplemented("CopyElementToLargerSlice Unhandled rank: ",
-                                   element.dims());
-  }
-}
-
-Status SetElementZero(Tensor* element, const Tensor& padding) {
-#define HANDLE_TYPE(T)                                     \
-  if (element->dtype() == DataTypeToEnum<T>::value) {      \
-    element->flat<T>().setConstant(padding.scalar<T>()()); \
-    return Status::OK();                                   \
-  }
-  TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-#undef HANDLE_TYPE
-  return errors::Unimplemented("SetElementZero Unhandled data type: ",
-                               element->dtype());
-}
-
 class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit PaddedBatchDatasetOp(OpKernelConstruction* ctx)
@@ -379,17 +284,24 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
           Tensor batch_component(ctx->allocator({}),
                                  output_dtypes()[component_index],
                                  batch_component_shape);
-          TF_RETURN_IF_ERROR(SetElementZero(
+          TF_RETURN_IF_ERROR(batch_util::SetElementZero(
               &batch_component, dataset()->padding_values_[component_index]));
 
           // Build the output tuple component by copying one slice
           // from each input element in the batch.
+          TensorShape component_shape({});
+          for (int i = 1; i < batch_component_shape.dims(); ++i) {
+            component_shape.AddDim(batch_component_shape.dim_size(i));
+          }
           for (int64 i = 0; i < num_batch_elements; ++i) {
-            TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(
-                batch_elements[i][component_index], &batch_component));
-
-            TF_RETURN_IF_ERROR(CopyElementToLargerSlice(
-                batch_elements[i][component_index], &batch_component, i));
+            // Take the fast path if possible.
+            if (batch_elements[i][component_index].shape() == component_shape) {
+              TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice(
+                  batch_elements[i][component_index], &batch_component, i));
+            } else {
+              TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice(
+                  batch_elements[i][component_index], &batch_component, i));
+            }
           }
           out_tensors->push_back(std::move(batch_component));
         }
diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
new file mode 100644
index 0000000000..ff412a4671
--- /dev/null
+++ b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
@@ -0,0 +1,646 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <deque>
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+
+namespace tensorflow {
+
+namespace {
+
+bool IsGreaterEqualToOrCompatibleWith(const PartialTensorShape& a,
+                                      const PartialTensorShape& b) {
+  // Returns true if dims[a] >= dims[b], or are compatible.
+  if (a.unknown_rank()) return true;
+  if (a.dims() != b.dims()) return false;
+  for (int d = 0; d < a.dims(); ++d) {
+    if (a.dim_size(d) == -1 || b.dim_size(d) == -1) continue;
+    if (a.dim_size(d) < b.dim_size(d)) return false;
+  }
+  return true;
+}
+
+DataTypeVector PrependQueueType(const DataTypeVector& dtypes) {
+  DataTypeVector out;
+  out.reserve(dtypes.size() + 1);
+  out.push_back(DT_VARIANT);  // The queue component.
+  for (const DataType& d : dtypes) out.push_back(d);
+  return out;
+}
+
+std::vector<PartialTensorShape> PrependQueueShapeWithBatch(
+    const std::vector<PartialTensorShape>& shapes) {
+  std::vector<PartialTensorShape> out;
+  out.reserve(shapes.size() + 1);
+  out.emplace_back(PartialTensorShape({-1}));  // The queue component.
+  for (PartialTensorShape s : shapes) {
+    s.InsertDim(0, -1);  // Unknown batch size.
+    out.push_back(std::move(s));
+  }
+  return out;
+}
+
+class EnqueueInQueueDatasetOp;
+
+class PrependFromQueueAndPaddedBatchDataset : public GraphDatasetBase {
+ public:
+  PrependFromQueueAndPaddedBatchDataset(
+      OpKernelContext* ctx, const int64 batch_size, const DatasetBase* input,
+      const DataTypeVector& dtypes,
+      const std::vector<PartialTensorShape>& shapes,
+      std::vector<Tensor> padding_values)
+      : GraphDatasetBase(ctx),
+        batch_size_(batch_size),
+        input_(input),
+        dtypes_(dtypes),
+        shapes_(shapes),
+        padding_values_(std::move(padding_values)),
+        dtypes_with_queue_(PrependQueueType(dtypes)),
+        batched_shapes_with_queue_(PrependQueueShapeWithBatch(shapes)) {
+    input_->Ref();
+  }
+
+  ~PrependFromQueueAndPaddedBatchDataset() override { input_->Unref(); }
+
+  std::unique_ptr<IteratorBase> MakeIterator(
+      const string& prefix) const override {
+    return std::unique_ptr<IteratorBase>(new Iterator(
+        {this, strings::StrCat(prefix, "::PrependFromQueueAndPaddedBatch")}));
+  }
+
+  const DataTypeVector& output_dtypes() const override {
+    return dtypes_with_queue_;
+  }
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return batched_shapes_with_queue_;
+  }
+
+  string DebugString() override {
+    return "PrependFromQueueAndPaddedBatchDatasetOp::Dataset";
+  }
+
+ protected:
+  Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                            Node** output) const override {
+    Node* input_graph = nullptr;
+    TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph));
+    Node* batch_size = nullptr;
+    TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
+
+    std::vector<Node*> padded_shapes;
+    padded_shapes.reserve(shapes_.size());
+    for (int i = 0; i < shapes_.size(); i++) {
+      Node* node;
+      Tensor t(DT_INT64, TensorShape({shapes_[i].dims()}));
+      for (int j = 0; j < shapes_[i].dims(); j++) {
+        t.vec<int64>()(j) = shapes_[i].dim_size(j);
+      }
+      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+      padded_shapes.emplace_back(node);
+    }
+
+    std::vector<Node*> padding_values;
+    padding_values.reserve(padding_values_.size());
+    for (const Tensor& t : padding_values_) {
+      Node* node;
+      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+      padding_values.emplace_back(node);
+    }
+
+    AttrValue output_types;
+    b->BuildAttrValue(dtypes_, &output_types);
+
+    AttrValue output_shapes;
+    b->BuildAttrValue(batched_shapes_with_queue_, &output_shapes);
+
+    AttrValue N;
+    b->BuildAttrValue<int64>(shapes_.size(), &N);
+
+    TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, input_graph}, {1, batch_size}},
+                                     {{2, padded_shapes}, {3, padding_values}},
+                                     {{"Toutput_types", output_types},
+                                      {"output_shapes", output_shapes},
+                                      {"N", N}},
+                                     output));
+
+    return Status::OK();
+  }
+
+ private:
+  friend class EnqueueInQueueDatasetOp;
+
+  class Iterator
+      : public DatasetIterator<PrependFromQueueAndPaddedBatchDataset> {
+   public:
+    explicit Iterator(const Params& params)
+        : DatasetIterator<PrependFromQueueAndPaddedBatchDataset>(params),
+          queue_(new TensorQueue(/*input_impl*/
+                                 params.dataset->input_->MakeIterator(
+                                     params.prefix),
+                                 params.dataset->dtypes_,
+                                 params.dataset->shapes_)) {}
+
+    ~Iterator() override { queue_->Unref(); }
+
+    Status GetNextInternal(IteratorContext* ctx,
+                           std::vector<Tensor>* out_tensors,
+                           bool* end_of_sequence) override {
+      std::vector<std::vector<Tensor>> batch;
+      TF_RETURN_IF_ERROR(queue_->GetNext(ctx, dataset()->batch_size_, &batch,
+                                         end_of_sequence));
+      const auto& dtypes = dataset()->dtypes_;
+      const auto& shapes = dataset()->shapes_;
+      const auto& input_shapes = dataset()->input_->output_shapes();
+      const auto& padding_values = dataset()->padding_values_;
+      const int64 batch_size = batch.size();
+      out_tensors->reserve(dtypes.size());
+
+      std::vector<TensorShape> max_shapes;  // Of non-queue components.
+      for (int i = 0; i < dtypes.size(); ++i) {
+        const PartialTensorShape& shape = shapes[i];
+        TensorShape out_shape({batch_size});
+        for (int r = 0; r < shape.dims(); ++r) {
+          if (shape.dim_size(r) >= 0) {
+            // padded_shape[r] is known.
+            out_shape.AddDim(shape.dim_size(r));
+          } else {
+            // padded_shape[r] is unknown, find the maximum across
+            // the batch.
+            int64 dim = 0;
+            for (int b = 0; b < batch.size(); ++b) {
+              dim = std::max(dim, batch[b][i].dim_size(r));
+            }
+            out_shape.AddDim(dim);
+          }
+        }
+        max_shapes.push_back(std::move(out_shape));
+      }
+
+      Tensor queues_t(cpu_allocator(), DT_VARIANT, TensorShape({batch_size}));
+      if (!batch.empty()) {
+        auto queues = queues_t.flat<Variant>();
+        Variant& queue_inserter = queues(0);
+        queue_inserter = TensorQueueInserter();
+        queue_inserter.get<TensorQueueInserter>()->set_queue(queue_);
+        for (int b = 1; b < batch.size(); ++b) {
+          // Copy the TensorQueueInserter.  Each copy increments the
+          // Ref on the queue_.
+          queues(b) = queues(0);
+        }
+      }
+      out_tensors->push_back(std::move(queues_t));
+
+      for (int i = 0; i < max_shapes.size(); ++i) {
+        Tensor component(cpu_allocator(), dtypes[i], max_shapes[i]);
+        // Try hard to take the fast path.
+        if (shapes[i].IsFullyDefined() &&
+            shapes[i].IsIdenticalTo(input_shapes[i])) {
+          // Take the fast path if we know all the shapes statically.
+          for (int64 b = 0; b < batch.size(); ++b) {
+            TF_RETURN_IF_ERROR(
+                batch_util::CopyElementToSlice(batch[b][i], &component, b));
+          }
+        } else {
+          TF_RETURN_IF_ERROR(
+              batch_util::SetElementZero(&component, padding_values[i]));
+          for (int64 b = 0; b < batch.size(); ++b) {
+            if (batch[b][i].shape() == max_shapes[i]) {
+              TF_RETURN_IF_ERROR(
+                  batch_util::CopyElementToSlice(batch[b][i], &component, b));
+            } else {
+              TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice(
+                  batch[b][i], &component, b));
+            }
+          }
+        }
+        out_tensors->push_back(std::move(component));
+      }
+
+      // end_of_sequence was set before we populated out_tensors, so
+      // it's ok to return now.
+      return Status::OK();
+    }
+
+   protected:
+    // Work around bug in MSVC that disallows access to protected
+    // members of Iterator from within TensorQueue.
+    class TensorQueue;
+    friend class TensorQueue;
+
+    class TensorQueue : public core::RefCounted {
+     public:
+      TensorQueue(std::unique_ptr<IteratorBase> input_impl,
+                  const DataTypeVector& dtypes,
+                  const std::vector<PartialTensorShape>& shapes)
+          : dtypes_(dtypes),
+            shapes_(shapes),
+            input_impl_(std::move(input_impl)) {}
+
+      void MaybeWaitForNotificationLocked(mutex_lock* lock)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        // This essentially just releases the lock and immediately relocks.
+        cv_.wait_for(*lock, std::chrono::milliseconds(0));
+      }
+
+      void NotifyLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) { cv_.notify_all(); }
+
+      Status GetNext(IteratorContext* ctx, const int64 batch_size,
+                     std::vector<std::vector<Tensor>>* batch,
+                     bool* end_of_sequence) {
+        mutex_lock lock(mu_);
+
+        *end_of_sequence = false;
+
+        for (int64 b = 0; b < batch_size;) {
+          if (!entries_.empty()) {
+            batch->push_back(std::move(entries_.front()));
+            entries_.pop_front();
+            ++b;
+            continue;
+          } else {
+            if (input_impl_) {
+              // There's still input coming in.
+              std::vector<Tensor> tensors;
+              bool input_end;
+              TF_RETURN_IF_ERROR(
+                  input_impl_->GetNext(ctx, &tensors, &input_end));
+              if (!input_end) {
+                batch->push_back(std::move(tensors));
+                ++b;
+                continue;
+              } else {
+                input_impl_.reset();
+              }
+            }
+            if (!input_impl_) {
+              // There's no more input coming in.
+              if (RefCountIsOne()) {
+                // No TensorQueueInserters in the wild.
+                if (batch->empty()) {
+                  *end_of_sequence = true;
+                }
+                break;
+              } else {
+                MaybeWaitForNotificationLocked(&lock);
+                // If there's data available, try to add entries again.
+                // Otherwise return a smaller batch and hope the next
+                // iterator request has a non-empty or unused queue_.
+                if (entries_.empty()) {
+                  break;
+                }
+              }
+            }
+          }
+        }  // for (int64 b = ... batch_size)
+        return Status::OK();
+      }
+
+      Status Insert(const std::vector<Tensor>& tensors) {
+        if (tensors.size() != dtypes_.size()) {
+          return errors::InvalidArgument(
+              "TensorQueue::Insert: mismatched number of tensors.  Queue "
+              "expects ",
+              dtypes_.size(), " tensors but tried to insert ", tensors.size());
+        }
+        for (int i = 0; i < tensors.size(); ++i) {
+          if (tensors[i].dtype() != dtypes_[i]) {
+            return errors::InvalidArgument(
+                "TensorQueue::Insert: mismatched dtypes at component ", i,
+                ".  Attempted "
+                "to insert tensor of type ",
+                DataTypeString(tensors[i].dtype()),
+                " but queue expected type: ", DataTypeString(dtypes_[i]));
+          }
+          if (!shapes_[i].IsCompatibleWith(tensors[i].shape())) {
+            return errors::InvalidArgument(
+                "TensorQueue::Insert: mismatched shapes at component ", i,
+                ".  Attempted "
+                "to insert tensor with shape ",
+                tensors[i].shape().DebugString(),
+                " but queue expected shape: ", shapes_[i].DebugString());
+          }
+        }
+        mutex_lock lock(mu_);
+        entries_.push_back(tensors);
+        NotifyLocked();
+        return Status::OK();
+      }
+
+      Status Save(Iterator* iter, IteratorStateWriter* writer) {
+        mutex_lock lock(mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(iter->SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(iter->full_name("input_exhausted"), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(iter->full_name("entries_size"),
+                                               entries_.size()));
+        for (int64 b = 0; b < entries_.size(); ++b) {
+          for (int i = 0; i < dtypes_.size(); ++i) {
+            TF_RETURN_IF_ERROR(
+                writer->WriteTensor(strings::StrCat(iter->full_name("entries"),
+                                                    "[", b, "][", i, "]"),
+                                    entries_[b][i]));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status Restore(Iterator* iter, IteratorContext* ctx,
+                     IteratorStateReader* reader) {
+        mutex_lock l(mu_);
+        if (reader->Contains(iter->full_name("input_exhausted"))) {
+          input_impl_.reset();
+        } else {
+          input_impl_ = iter->dataset_input()->MakeIterator(iter->prefix());
+          TF_RETURN_IF_ERROR(iter->RestoreParent(ctx, reader, input_impl_));
+        }
+        entries_.clear();
+        int64 entries_size = -1;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(iter->full_name("entries_size"), &entries_size));
+        if (entries_size < 0) {
+          return errors::DataLoss(
+              "Expected entries_size key '", iter->full_name("entries_size"),
+              "' to have nonnegative value, but saw: ", entries_size);
+        }
+        for (int64 b = 0; b < entries_size; ++b) {
+          std::vector<Tensor> entry;
+          for (int i = 0; i < dtypes_.size(); ++i) {
+            Tensor value;
+            TF_RETURN_IF_ERROR(
+                reader->ReadTensor(strings::StrCat(iter->full_name("entries"),
+                                                   "[", b, "][", i, "]"),
+                                   &value));
+            entry.push_back(std::move(value));
+          }
+          entries_.push_back(std::move(entry));
+        }
+        return Status::OK();
+      }
+
+      mutex* mu() { return &mu_; }
+
+     private:
+      DataTypeVector dtypes_;
+      std::vector<PartialTensorShape> shapes_;
+
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::deque<std::vector<Tensor>> entries_ GUARDED_BY(mu_);
+      condition_variable cv_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* dataset_input() const { return dataset()->input_; }
+
+    Status SaveInternal(IteratorStateWriter* writer) override {
+      return queue_->Save(this, writer);
+    }
+
+    Status RestoreInternal(IteratorContext* ctx,
+                           IteratorStateReader* reader) override {
+      return queue_->Restore(this, ctx, reader);
+    }
+
+   public:
+    class TensorQueueInserter {
+     public:
+      TensorQueueInserter() : queue_(nullptr) {}
+
+      void set_queue(TensorQueue* queue) {
+        queue_ = queue;
+        queue_->Ref();
+      }
+
+      TensorQueueInserter(const TensorQueueInserter& rhs) {
+        queue_ = rhs.queue_;
+        queue_->Ref();
+      };
+
+      TensorQueueInserter(TensorQueueInserter&& rhs) {
+        queue_ = rhs.queue_;
+        rhs.queue_ = nullptr;
+      }
+
+      TensorQueueInserter& operator=(const TensorQueueInserter& rhs) = delete;
+
+      string TypeName() const { return "tensorflow::TensorQueueInserter"; }
+      string DebugString() const { return TypeName(); }
+
+      void Encode(VariantTensorData*) const {}
+      bool Decode(const VariantTensorData&) { return false; }
+
+      ~TensorQueueInserter() {
+        if (queue_) {
+          mutex_lock lock(*queue_->mu());
+          queue_->Unref();
+          queue_->NotifyLocked();
+          queue_ = nullptr;
+        }
+      }
+
+      Status Insert(const std::vector<Tensor>& tensors) const {
+        CHECK(queue_);
+        return queue_->Insert(tensors);
+      }
+
+     private:
+      mutable TensorQueue* queue_;
+    };
+
+   private:
+    TensorQueue* const queue_;
+  };
+
+ private:
+  const int64 batch_size_;
+  const DatasetBase* input_;
+  const DataTypeVector dtypes_;
+  const std::vector<PartialTensorShape> shapes_;
+  const std::vector<Tensor> padding_values_;
+  const DataTypeVector dtypes_with_queue_;
+  const std::vector<PartialTensorShape> batched_shapes_with_queue_;
+};
+
+class PrependFromQueueAndPaddedBatchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit PrependFromQueueAndPaddedBatchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 batch_size = 0;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<int64>(ctx, "batch_size", &batch_size));
+    OP_REQUIRES(
+        ctx, batch_size > 0,
+        errors::InvalidArgument("Batch size must be greater than zero."));
+
+    OpInputList padded_shape_tensors;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input_list("padded_shapes", &padded_shape_tensors));
+    std::vector<PartialTensorShape> padded_shapes;
+    padded_shapes.reserve(padded_shape_tensors.size());
+    OP_REQUIRES(ctx,
+                padded_shape_tensors.size() == input->output_shapes().size(),
+                errors::InvalidArgument("Number of padded shapes (",
+                                        padded_shape_tensors.size(),
+                                        ") must match the number of components "
+                                        "in the input dataset's elements (",
+                                        input->output_shapes().size(), ")"));
+    for (const Tensor& padded_shape_t : padded_shape_tensors) {
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(padded_shape_t.shape()),
+                  errors::InvalidArgument("All padded shapes must be vectors"));
+      PartialTensorShape padded_shape;
+      OP_REQUIRES_OK(ctx, PartialTensorShape::MakePartialShape(
+                              padded_shape_t.vec<int64>().data(),
+                              padded_shape_t.NumElements(), &padded_shape));
+      padded_shapes.push_back(std::move(padded_shape));
+    }
+
+    OP_REQUIRES(
+        ctx, input->output_dtypes() == output_types_,
+        errors::InvalidArgument("Input dataset and this dataset "
+                                "have different output_types: ",
+                                DataTypeVectorString(input->output_dtypes()),
+                                " and ", DataTypeVectorString(output_types_)));
+
+    for (int i = 0; i < input->output_shapes().size(); ++i) {
+      // Exclude the queue from the tensor_shapes calculation.
+      const PartialTensorShape& tensor_shape = padded_shapes[i];
+      OP_REQUIRES(
+          ctx,
+          IsGreaterEqualToOrCompatibleWith(tensor_shape,
+                                           input->output_shapes()[i]),
+          errors::InvalidArgument("Incompatible input shapes at component ", i,
+                                  " between input dataset this dataset: ",
+                                  input->output_shapes()[i].DebugString(),
+                                  " vs. ", tensor_shape.DebugString()));
+    }
+
+    OpInputList padding_values_list;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input_list("padding_values", &padding_values_list));
+    std::vector<Tensor> padding_values;
+    OP_REQUIRES(ctx,
+                padding_values_list.size() == input->output_shapes().size(),
+                errors::InvalidArgument(
+                    "Number of padding values (", padding_values_list.size(),
+                    ") must match the number of components in the input "
+                    "dataset's elements (",
+                    input->output_shapes().size(), ")"));
+    for (int i = 0; i < padding_values_list.size(); ++i) {
+      const Tensor& padding_value_t = padding_values_list[i];
+      OP_REQUIRES(
+          ctx, TensorShapeUtils::IsScalar(padding_value_t.shape()),
+          errors::InvalidArgument(
+              "All padding values must be scalars; but at component ", i,
+              " saw shape: ", padding_value_t.shape().DebugString()));
+      OP_REQUIRES(ctx, padding_value_t.dtype() == input->output_dtypes()[i],
+                  errors::InvalidArgument(
+                      "Mismatched type between padding value ", i,
+                      " and input dataset's component ", i, ": ",
+                      DataTypeString(padding_value_t.dtype()), " vs. ",
+                      DataTypeString(input->output_dtypes()[i])));
+      padding_values.push_back(padding_value_t);
+    }
+
+    *output = new PrependFromQueueAndPaddedBatchDataset(
+        ctx, batch_size, input, output_types_, padded_shapes,
+        std::move(padding_values));
+  }
+
+ private:
+  DataTypeVector output_types_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("PrependFromQueueAndPaddedBatchDataset").Device(DEVICE_CPU),
+    PrependFromQueueAndPaddedBatchDatasetOp);
+
+class EnqueueInQueueDatasetOp : public OpKernel {
+ public:
+  explicit EnqueueInQueueDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override {
+    using TensorQueueInserter =
+        PrependFromQueueAndPaddedBatchDataset::Iterator::TensorQueueInserter;
+
+    // TODO(ebrevdo): accept list of sequence lengths to do proper
+    // sub-slicing of tensors for placement into the queue?
+    const Tensor& tensor_queue_t = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_queue_t.shape()),
+                errors::InvalidArgument("queue must be a vector, saw shape: ",
+                                        tensor_queue_t.shape().DebugString()));
+    std::vector<const TensorQueueInserter*> inserters;
+    const int64 batch_size = tensor_queue_t.NumElements();
+    inserters.reserve(batch_size);
+    const Variant* variants = tensor_queue_t.flat<Variant>().data();
+    for (int i = 0; i < batch_size; ++i) {
+      const auto* inserter = variants[i].get<TensorQueueInserter>();
+      OP_REQUIRES(ctx, inserter != nullptr,
+                  errors::InvalidArgument(
+                      "Could not access TensorQueueInserter from queue[", i,
+                      "].  Received variant: ", variants[i].DebugString()));
+      inserters.push_back(inserter);
+    }
+
+    OpInputList components;
+    OP_REQUIRES_OK(ctx, ctx->input_list("components", &components));
+    for (int i = 0; i < components.size(); ++i) {
+      OP_REQUIRES(
+          ctx,
+          components[i].dims() > 0 && components[i].dim_size(0) == batch_size,
+          errors::InvalidArgument(
+              "Expected component ", i, " to have batched shape [", batch_size,
+              ",...], but saw shape: ", components[i].shape().DebugString()));
+    }
+    std::vector<TensorShape> element_shapes;
+    for (int i = 0; i < components.size(); ++i) {
+      TensorShape element_shape = components[i].shape();
+      element_shape.RemoveDim(0);
+      element_shapes.push_back(std::move(element_shape));
+    }
+    for (int64 b = 0; b < batch_size; ++b) {
+      std::vector<Tensor> tensors;
+      tensors.reserve(components.size());
+      for (int i = 0; i < components.size(); ++i) {
+        Tensor t(components[i].dtype(), element_shapes[i]);
+        OP_REQUIRES_OK(ctx,
+                       batch_util::CopySliceToElement(components[i], &t, b));
+        tensors.push_back(std::move(t));
+      }
+      // TODO(ebrevdo): Acquire the lock once for all inserters with
+      // the same underlying queue?  Add InsertLocked?
+      OP_REQUIRES_OK(ctx, inserters[b]->Insert(tensors));
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("EnqueueInQueueDataset").Device(DEVICE_CPU),
+                        EnqueueInQueueDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index d6cbcf1d93..0a38d3d4af 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/platform/mem.h"
@@ -141,6 +143,7 @@ TF_CALL_ALL_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_quint16(REGISTER_GATHER_CPU);
 TF_CALL_qint16(REGISTER_GATHER_CPU);
+TF_CALL_variant(REGISTER_GATHER_CPU);
 
 #undef REGISTER_GATHER_CPU
 
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 7745effe2a..e0b85c6d06 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -391,6 +391,7 @@ class StridedSliceAssignOp : public OpKernel {
                           StridedSliceAssignOp<CPUDevice, type>)
 
 TF_CALL_ALL_TYPES(REGISTER_STRIDED_SLICE);
+TF_CALL_variant(REGISTER_STRIDED_SLICE);
 
 #undef REGISTER_STRIDED_SLICE
 
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index ac1259a9ac..c3187e49ce 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -26,6 +26,8 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/register_types_traits.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -288,6 +290,7 @@ DECLARE_FOR_N_GPU(int64);
 #endif  // END GOOGLE_CUDA
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
+TF_CALL_variant(DECLARE_FOR_N_CPU);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define PREVENT_FOR_N_SYCL(T) \
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 2cae814eab..3c8e9a8a5f 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -491,4 +491,29 @@ REGISTER_OP("StatsAggregatorSummary")
     .Output("summary: string")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("PrependFromQueueAndPaddedBatchDataset")
+    .Input("input_dataset: variant")
+    .Input("batch_size: int64")
+    .Input("padded_shapes: N * int64")
+    .Input("padding_values: Toutput_types")
+    .Output("handle: variant")
+    .Attr("Toutput_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("N: int >= 1")
+    // TODO(ebrevdo): Validate that `padded_shapes` are all vectors, the lengths
+    // of `Toutput_types` and `output_shapes` are `N`, that the
+    // length of `output_types` is `N`, the `output_shapes` are
+    // (as far as possible to tell statically) compatible with `padded_shapes`,
+    // and that `padding_values` are all scalars.
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("EnqueueInQueueDataset")
+    .Input("queue: variant")
+    .Input("components: Tcomponents")
+    .Attr("Tcomponents: list(type) >= 1")
+    .SetIsStateful()  // To avoid CSE on multiple calls to Enqueue.
+    // TODO(ebrevdo): SetShapeFn to test input dtypes and shapes by
+    // reading from queue handle (is that even possible?).
+    .SetShapeFn(shape_inference::NoOutputs);
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index f8798c1d6f..5d318531d5 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -1456,6 +1456,19 @@ def _padding_value_to_tensor(value, output_type):
   return value
 
 
+def _default_padding(input_dataset):
+
+  def make_zero(t):
+    if t.base_dtype == dtypes.string:
+      return ""
+    elif t.base_dtype == dtypes.variant:
+      raise TypeError("Unable to create padding for field of type 'variant'")
+    else:
+      return np.zeros_like(t.as_numpy_dtype())
+
+  return nest.map_structure(make_zero, input_dataset.output_types)
+
+
 class PaddedBatchDataset(Dataset):
   """A `Dataset` that batches and pads contiguous elements from its input."""
 
@@ -1471,23 +1484,13 @@ class PaddedBatchDataset(Dataset):
         batch_size, dtype=dtypes.int64, name="batch_size")
     padding_values = (
         padding_values
-        if padding_values is not None else self._default_padding(input_dataset))
+        if padding_values is not None else _default_padding(input_dataset))
     self._padded_shapes = nest.map_structure_up_to(
         input_dataset.output_shapes, _partial_shape_to_tensor, padded_shapes)
     self._padding_values = nest.map_structure_up_to(
         input_dataset.output_shapes, _padding_value_to_tensor, padding_values,
         input_dataset.output_types)
 
-  def _default_padding(self, input_dataset):
-
-    def make_zero(t):
-      if t.base_dtype == dtypes.string:
-        return ""
-      else:
-        return np.zeros_like(t.as_numpy_dtype())
-
-    return nest.map_structure(make_zero, input_dataset.output_types)
-
   def _as_variant_tensor(self):
     return gen_dataset_ops.padded_batch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-- 
GitLab


From f5d40430cb48aa73632f5c45dc4229f530b7404b Mon Sep 17 00:00:00 2001
From: AG Ramesh <ag.ramesh@intel.com>
Date: Thu, 1 Feb 2018 11:10:54 -0700
Subject: [PATCH 504/966] Fix for mkl_input conversion for MKL DNN. Fix also
 enables elemenwise operations. (#16557)

---
 tensorflow/core/graph/mkl_layout_pass.cc      | 14 ++---
 .../core/kernels/mkl_input_conversion_op.cc   | 56 ++++++++++++++++---
 2 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 18399bb0ac..0e8a1cb26c 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -2452,9 +2452,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     // NOTE: names are alphabetically sorted.
     rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
                       CopyAttrsAddN, AddNRewrite});
-    /* rinfo_.push_back({csinfo_.add,
+    rinfo_.push_back({csinfo_.add,
                       mkl_op_registry::GetMklOpName(csinfo_.add),
-                      CopyAttrsDataType, AlwaysRewrite}); */
+                      CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
                       CopyAttrsPooling, AlwaysRewrite});
@@ -2502,15 +2502,15 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.max_pool_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
                       CopyAttrsPooling, AlwaysRewrite});
-    /*
+
     rinfo_.push_back({csinfo_.maximum,
                       mkl_op_registry::GetMklOpName(csinfo_.maximum),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.mul,
                       mkl_op_registry::GetMklOpName(csinfo_.mul),
                       CopyAttrsDataType, AlwaysRewrite});
-    */
-    rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
+    rinfo_.push_back({csinfo_.relu,
+                      mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.relu_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
@@ -2529,14 +2529,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.softmax,
                       mkl_op_registry::GetMklOpName(csinfo_.softmax),
                       CopyAttrsDataType, AlwaysRewrite});
-    /*
+
     rinfo_.push_back({csinfo_.squared_difference,
                       mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.sub,
                       mkl_op_registry::GetMklOpName(csinfo_.sub),
                       CopyAttrsDataType, AlwaysRewrite});
-    */
+
 
     // Add info about which ops to add workspace edge to and the slots.
     wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 4337e4b49e..acb0db57b3 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -293,14 +293,56 @@ class MklInputConversionOp : public OpKernel {
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // If both inputs are in MKL format
     if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
-      // If both have the same shape, pass them through
       if (tf_shapes_are_same) {
-        VLOG(1) << "MklInputConversionOp: No conversion needed, "
-                << "copying MKL inputs with identical shapes to output";
-
-        ForwardMklTensorInToOut(context, 0, 0);
-        ForwardMklTensorInToOut(context, 1, 1);
-        return;
+        auto input0_md = input_shape_0.GetMklLayout();
+        auto input1_md = input_shape_1.GetMklLayout();
+        
+        // If both have the same shape and same format, pass them through
+        if ( input0_md.data.format == input1_md.data.format) {
+          VLOG(1) << "MklInputConversionOp: No conversion needed, "
+                  << "copying MKL inputs with identical shapes to output";
+
+          ForwardMklTensorInToOut(context, 0, 0);
+          ForwardMklTensorInToOut(context, 1, 1);
+          return;
+        } else {
+          VLOG(1) << "MklInputConversionOp: Shape is same, but format is different, "
+                  << "need to convert to same format";
+          
+          // Convert input0, and keep input1 unchanged
+          // Create MklDnnShape for output mkl tensor based on input0
+          Tensor* tensor_out;
+          MklDnnShape mkl_output_mkl_shape;
+          mkl_output_mkl_shape.SetMklTensor(true);
+          mkl_output_mkl_shape.SetElemType(MklDnnType<T>());
+          mkl_output_mkl_shape.SetTfLayout(input_shape_0.GetDimension(),
+                                           input_shape_0.GetSizesAsMklDnnDims(),
+                                           input_shape_0.GetTfDataFormat());
+
+          // Get MKL layout from input1 as destination layout
+          mkl_output_mkl_shape.SetMklLayout(&input1_md);
+
+          // Create output Mkl tensor for index 0
+          AllocateOutputSetMklShape(context, 0, &tensor_out,
+                                    input_tensor_0.shape(), mkl_output_mkl_shape);
+
+          // Create MklDnnData object for input0 tesnsor
+          auto cpu_engine = engine(engine::cpu, 0);
+          MklDnnData<T> input(&cpu_engine);
+          input.SetUsrMem(input0_md, &input_tensor_0);
+
+          // Create reorder from input0's layout to input1's layout
+          std::vector<primitive> net;
+          CHECK_EQ(input.CheckReorderToOpMem(memory::primitive_desc(
+                                             input1_md, cpu_engine),
+                                             tensor_out, &net),
+                                             true);
+          stream(stream::kind::eager).submit(net).wait();
+
+          // Input1 will be passed through
+          ForwardMklTensorInToOut(context, 1, 1);
+          return;         
+        }
       }
 
       // Sanity check
-- 
GitLab


From 0036ba86db170fefe3ad953d19b2bc7291da0bad Mon Sep 17 00:00:00 2001
From: Loo Rong Jie <loorongjie@gmail.com>
Date: Fri, 2 Feb 2018 02:11:25 +0800
Subject: [PATCH 505/966] Clang on Windows will define __BYTE_ORDER__ etc. for
 us (#16492)

---
 tensorflow/core/platform/windows/cpu_info.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h
index d6e78dbc8f..f20939d3c0 100644
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@@ -22,8 +22,10 @@ limitations under the License.
 // Byte order defines provided by gcc. MSVC doesn't define those so
 // we define them here.
 // We assume that all windows platform out there are little endian.
+#if defined(_MSC_VER) && !defined(__clang__)
 #define __ORDER_LITTLE_ENDIAN__ 0x4d2
 #define __ORDER_BIG_ENDIAN__ 0x10e1
 #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#endif
 
 #endif  // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
-- 
GitLab


From 2ce77a160fe8fb99a951fbe51570507dc6ad05a6 Mon Sep 17 00:00:00 2001
From: Yifei Feng <fengyifei2026@gmail.com>
Date: Thu, 1 Feb 2018 10:12:34 -0800
Subject: [PATCH 506/966] Fix do_cmake_python_sanity error. (#16650)

* Fix do_cmake_python_sanity error.

* Update python_modules.txt
---
 tensorflow/contrib/cmake/python_modules.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 9ce8b3cc9c..914c4124c3 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -216,6 +216,8 @@ tensorflow/contrib/input_pipeline/python/ops
 tensorflow/contrib/integrate
 tensorflow/contrib/integrate/python
 tensorflow/contrib/integrate/python/ops
+tensorflow/contrib/kafka/python
+tensorflow/contrib/kafka/python/ops
 tensorflow/contrib/keras
 tensorflow/contrib/keras/api
 tensorflow/contrib/keras/api/keras
-- 
GitLab


From 5be3f234fc83a4e5533b1f2eeb4cf5fd325b3329 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Thu, 1 Feb 2018 10:20:31 -0800
Subject: [PATCH 507/966] Address sanity build issues. (#16647)

* Address sanity build issues.

* More fixes.

* Final pylint fixes.
---
 .../eval/python/classifier_metrics_impl.py    |  2 +-
 .../kafka/python/ops/kafka_dataset_ops.py     |  3 +-
 .../contrib/layers/python/layers/layers.py    | 16 ++--
 .../layers/python/layers/layers_test.py       |  3 +-
 .../python/learn/datasets/synthetic_test.py   |  2 +-
 tensorflow/contrib/ndlstm/python/lstm1d.py    |  2 +-
 tensorflow/contrib/py2tf/impl/api.py          |  4 +-
 .../python/kernel_tests/core_rnn_cell_test.py |  2 +-
 .../rnn/python/kernel_tests/rnn_cell_test.py  | 91 -------------------
 .../contrib/session_bundle/bundle_shim.py     | 12 ++-
 .../pip_package/cloud_tpu_profiler/main.py    | 22 +++--
 .../core/platform/default/build_config.bzl    |  6 --
 tensorflow/python/data/ops/dataset_ops.py     |  5 +-
 tensorflow/python/data/util/nest.py           |  4 +-
 .../python/kernel_tests/tensordot_op_test.py  |  2 +-
 tensorflow/python/ops/image_ops_impl.py       | 12 ++-
 tensorflow/tools/ci_build/ci_sanity.sh        |  4 +-
 17 files changed, 55 insertions(+), 137 deletions(-)

diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index 7bede4e240..d9b07e62f8 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -206,7 +206,7 @@ def get_graph_def_from_url_tarball(url, filename, tar_filename=None):
       sys.stdout.write('\r>> Downloading %s %.1f%%' % (
           url, float(count * block_size) / float(total_size) * 100.0))
       sys.stdout.flush()
-    tar_filename, _ = urllib.request.urlretrieve(url, filename=tar_filename, reporthook=_progress)
+    tar_filename, _ = urllib.request.urlretrieve(url, tar_filename, _progress)
   with tarfile.open(tar_filename, 'r:gz') as tar:
     proto_str = tar.extractfile(filename).read()
   return graph_pb2.GraphDef.FromString(proto_str)
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
index 6590d86ebb..e561f595a4 100644
--- a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -30,7 +30,8 @@ class KafkaDataset(Dataset):
   """A Kafka Dataset that consumes the message.
   """
 
-  def __init__(self, topics, servers="localhost", group="", eof=False, timeout=1000):
+  def __init__(
+      self, topics, servers="localhost", group="", eof=False, timeout=1000):
     """Create a KafkaReader.
 
     Args:
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index c8e3307ee8..fb7b2e315e 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -60,12 +60,12 @@ __all__ = [
     'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution',
     'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose',
     'convolution3d', 'convolution3d_transpose', 'dense_to_sparse',
-           'dropout', 'elu', 'flatten',
-    'fully_connected', 'GDN', 'gdn', 'layer_norm', 'linear', 'pool',
-    'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat',
-    'scale_gradient', 'separable_conv2d', 'separable_convolution2d', 'softmax',
-    'spatial_softmax', 'stack', 'unit_norm', 'legacy_fully_connected',
-    'legacy_linear', 'legacy_relu', 'maxout'
+    'dropout', 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn', 'layer_norm',
+    'linear', 'pool', 'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu',
+    'relu6', 'repeat', 'scale_gradient', 'separable_conv2d',
+    'separable_convolution2d', 'softmax', 'spatial_softmax', 'stack',
+    'unit_norm', 'legacy_fully_connected', 'legacy_linear', 'legacy_relu',
+    'maxout'
 ]
 
 DATA_FORMAT_NCHW = 'NCHW'
@@ -1418,7 +1418,9 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
   with variable_scope.variable_scope(
       scope, 'dense_to_sparse', [tensor]) as sc:
     tensor = ops.convert_to_tensor(tensor)
-    indices = array_ops.where(math_ops.not_equal(tensor, constant_op.constant(eos_token, tensor.dtype)))
+    indices = array_ops.where(
+        math_ops.not_equal(
+            tensor, constant_op.constant(eos_token, tensor.dtype)))
     values = array_ops.gather_nd(tensor, indices)
     shape = array_ops.shape(tensor, out_type=dtypes.int64)
     outputs = sparse_tensor.SparseTensor(indices, values, shape)
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index f05f632246..8945690db8 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1308,7 +1308,8 @@ class DenseToSparseTest(test.TestCase):
     expected_constant = np.reshape(np.arange(24, dtype=np.int64), (3, 4, 2))
     tensor = constant_op.constant(expected_constant)
     sparse = _layers.dense_to_sparse(tensor)
-    dense = sparse_ops.sparse_to_dense(sparse.indices, sparse.dense_shape, sparse.values)
+    dense = sparse_ops.sparse_to_dense(
+        sparse.indices, sparse.dense_shape, sparse.values)
     with self.test_session() as sess:
       constant = sess.run(dense)
       self.assertAllEqual(expected_constant, constant)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
index 19791d7759..5809995c8c 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
@@ -136,7 +136,7 @@ class SyntheticTest(test.TestCase):
       self.assertRaises(AssertionError, np.testing.assert_array_equal,
                         spir0.data, spir1.data)
 
-  def test_spirals(self):
+  def test_spirals_synthetic(self):
     synthetic.spirals(3)
 
 
diff --git a/tensorflow/contrib/ndlstm/python/lstm1d.py b/tensorflow/contrib/ndlstm/python/lstm1d.py
index b24e332e4a..2e2e9086c0 100644
--- a/tensorflow/contrib/ndlstm/python/lstm1d.py
+++ b/tensorflow/contrib/ndlstm/python/lstm1d.py
@@ -88,7 +88,7 @@ def ndlstm_base_dynamic(inputs, noutput, scope=None, reverse=False):
     if reverse:
       inputs = array_ops.reverse_v2(inputs, [0])
     outputs, _ = rnn.dynamic_rnn(
-      lstm_cell, inputs, time_major=True, dtype=inputs.dtype)
+        lstm_cell, inputs, time_major=True, dtype=inputs.dtype)
     if reverse:
       outputs = array_ops.reverse_v2(outputs, [0])
     return outputs
diff --git a/tensorflow/contrib/py2tf/impl/api.py b/tensorflow/contrib/py2tf/impl/api.py
index 4b8cf0527a..85d40f3158 100644
--- a/tensorflow/contrib/py2tf/impl/api.py
+++ b/tensorflow/contrib/py2tf/impl/api.py
@@ -86,8 +86,8 @@ def convert_inline(f, *args, **kwargs):
 def convert(recursive=False, arg_types=None):
   """Decorator that compiles a function to graph mode.
 
-  The decorator is dynamic - invoking compilation whenever the decorated function
-  is called. This means the parameter values are known at compilation.
+  The decorator is dynamic - invoking compilation whenever the decorated
+  function is called. This means the parameter values are known at compilation.
 
   Args:
     recursive: Whether to recusrively convert any functions that the decorator
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index 59d7b2466c..9b84635e85 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -157,7 +157,7 @@ class RNNCellTest(test.TestCase):
             m.name: np.array([[0.1, 0.1]])
         })
         # Smoke test
-        self.assertAllClose(res[0], [[0.509682,  0.509682]])
+        self.assertAllClose(res[0], [[0.509682, 0.509682]])
 
   def testSRUCellWithDiffSize(self):
     with self.test_session() as sess:
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 8a3894ef9d..7b883ebc5d 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -1545,97 +1545,6 @@ class BenchmarkLSTMCellXLA(test.Benchmark):
         ]))
 
 
-class WeightNormLSTMCellTest(test.TestCase):
-  """Compared cell output with pre-calculated values."""
-
-  def _cell_output(self, cell):
-    """Calculate cell output"""
-
-    with self.test_session() as sess:
-      init = init_ops.constant_initializer(0.5)
-      with variable_scope.variable_scope("root", initializer=init):
-        x = array_ops.zeros([1, 2])
-        c0 = array_ops.zeros([1, 2])
-        h0 = array_ops.zeros([1, 2])
-
-        state0 = rnn_cell.LSTMStateTuple(c0, h0)
-
-        xout, sout = cell()(x, state0)
-
-      sess.run([variables.global_variables_initializer()])
-      res = sess.run(
-          [xout, sout], {
-              x.name: np.array([[1., 1.]]),
-              c0.name: 0.1 * np.asarray([[0, 1]]),
-              h0.name: 0.1 * np.asarray([[2, 3]]),
-          })
-
-    actual_state_c = res[1].c
-    actual_state_h = res[1].h
-
-    return actual_state_c, actual_state_h
-
-  def testBasicCell(self):
-    """Tests cell w/o peepholes and w/o normalisation"""
-
-    def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(
-          2, norm=False, use_peepholes=False)
-
-    actual_c, actual_h = self._cell_output(cell)
-
-    expected_c = np.array([[0.65937078, 0.74983585]])
-    expected_h = np.array([[0.44923624, 0.49362513]])
-
-    self.assertAllClose(expected_c, actual_c, 1e-5)
-    self.assertAllClose(expected_h, actual_h, 1e-5)
-
-  def testNonbasicCell(self):
-    """Tests cell with peepholes and w/o normalisation"""
-
-    def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(
-          2, norm=False, use_peepholes=True)
-
-    actual_c, actual_h = self._cell_output(cell)
-
-    expected_c = np.array([[0.65937084, 0.7574988]])
-    expected_h = np.array([[0.4792085, 0.53470564]])
-
-    self.assertAllClose(expected_c, actual_c, 1e-5)
-    self.assertAllClose(expected_h, actual_h, 1e-5)
-
-  def testBasicCellWithNorm(self):
-    """Tests cell w/o peepholes and with normalisation"""
-
-    def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(
-          2, norm=True, use_peepholes=False)
-
-    actual_c, actual_h = self._cell_output(cell)
-
-    expected_c = np.array([[0.50125383, 0.58805949]])
-    expected_h = np.array([[0.32770363, 0.37397948]])
-
-    self.assertAllClose(expected_c, actual_c, 1e-5)
-    self.assertAllClose(expected_h, actual_h, 1e-5)
-
-  def testNonBasicCellWithNorm(self):
-    """Tests cell with peepholes and with normalisation"""
-
-    def cell():
-      return contrib_rnn_cell.WeightNormLSTMCell(
-          2, norm=True, use_peepholes=True)
-
-    actual_c, actual_h = self._cell_output(cell)
-
-    expected_c = np.array([[0.50125383, 0.59587258]])
-    expected_h = np.array([[0.35041603, 0.40873795]])
-
-    self.assertAllClose(expected_c, actual_c, 1e-5)
-    self.assertAllClose(expected_h, actual_h, 1e-5)
-
-
 class WeightNormLSTMCellTest(test.TestCase):
   """Compared cell output with pre-calculated values."""
 
diff --git a/tensorflow/contrib/session_bundle/bundle_shim.py b/tensorflow/contrib/session_bundle/bundle_shim.py
index 3149875e41..69db594f8a 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim.py
+++ b/tensorflow/contrib/session_bundle/bundle_shim.py
@@ -82,7 +82,8 @@ def _convert_default_signature_to_signature_def(signatures):
   """
   default_signature = signatures.default_signature
   signature_def = meta_graph_pb2.SignatureDef()
-  if default_signature.WhichOneof("type") == legacy_constants.REGRESSION_SIGNATURE:
+  if (default_signature.WhichOneof("type") ==
+      legacy_constants.REGRESSION_SIGNATURE):
     regression_signature = default_signature.regression_signature
     signature_def.method_name = signature_constants.REGRESS_METHOD_NAME
     _add_input_to_signature_def(regression_signature.input.tensor_name,
@@ -91,7 +92,8 @@ def _convert_default_signature_to_signature_def(signatures):
     _add_output_to_signature_def(regression_signature.output.tensor_name,
                                  signature_constants.REGRESS_OUTPUTS,
                                  signature_def)
-  elif default_signature.WhichOneof("type") == legacy_constants.CLASSIFICATION_SIGNATURE:
+  elif (default_signature.WhichOneof("type") ==
+        legacy_constants.CLASSIFICATION_SIGNATURE):
     classification_signature = default_signature.classification_signature
     signature_def.method_name = signature_constants.CLASSIFY_METHOD_NAME
     _add_input_to_signature_def(classification_signature.input.tensor_name,
@@ -132,8 +134,10 @@ def _convert_named_signatures_to_signature_def(signatures):
       signature_constants.PREDICT_OUTPUTS]
   # TODO(pdudnik): what if there are other signatures? Mimic cr/140900781 once
   # it is submitted.
-  if (input_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE or
-      output_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE):
+  if (input_signature.WhichOneof("type") !=
+      legacy_constants.GENERIC_SIGNATURE or
+      output_signature.WhichOneof("type") !=
+      legacy_constants.GENERIC_SIGNATURE):
     raise RuntimeError("Named input and output signatures can only be "
                        "up-converted if they are generic signature. "
                        "Input signature type is %s, output signature type is "
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 885466e5d1..78d237e6a2 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -25,17 +25,19 @@ import sys
 
 import tensorflow as tf
 
-flags.DEFINE_string('service_addr', None,
-                       'Address of TPU profiler service e.g. localhost:8466')
-flags.DEFINE_string('logdir', None,
-                       "Path of TensorBoard log directory e.g. /tmp/tb_log, "
-                       "gs://tb_bucket")
+flags.DEFINE_string(
+    'service_addr', None, 'Address of TPU profiler service e.g. '
+    'localhost:8466')
+flags.DEFINE_string(
+    'logdir', None, 'Path of TensorBoard log directory e.g. /tmp/tb_log, '
+    'gs://tb_bucket')
 flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.')
-flags.DEFINE_integer('num_tracing_attempts', 3,
-                        "Automatically retry N times when no trace event is "
-                        "collected.")
-flags.DEFINE_boolean('include_dataset_ops', True,
-                     "Set to false to profile longer TPU device traces.")
+flags.DEFINE_integer(
+    'num_tracing_attempts', 3, 'Automatically retry N times when no trace '
+    'event is collected.')
+flags.DEFINE_boolean(
+    'include_dataset_ops', True, 'Set to false to profile longer TPU '
+    'device traces.')
 
 FLAGS = flags.FLAGS
 EXECUTABLE = 'data/capture_tpu_profile'
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 119ffa3d9e..2102c5cca3 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -489,12 +489,6 @@ def tf_additional_core_deps():
           "//tensorflow/core/platform/s3:s3_file_system",
       ],
       "//conditions:default": [],
-  }) + select({
-      "//tensorflow:with_kafka_support": [
-          "//tensorflow/contrib/kafka:kafka_kernels",
-          "//tensorflow/contrib/kafka:kafka_ops_op_lib",
-      ],
-      "//conditions:default": [],
   })
 
 # TODO(jart, jhseu): Delete when GCP is default on.
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index b7afb8af46..7e0feb0669 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -903,10 +903,11 @@ class Dataset(object):
 
     Args:
       transformation_func: A function that takes one `Dataset` argument and
-        returns a `Dataset`.
+          returns a `Dataset`.
 
     Returns:
-      Dataset: The `Dataset` returned by applying `transformation_func` to this dataset.
+      Dataset: The `Dataset` returned by applying `transformation_func` to this
+          dataset.
     """
     dataset = transformation_func(self)
     if not isinstance(dataset, Dataset):
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index df5498be5f..6d2f730016 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -479,8 +479,8 @@ def map_structure_up_to(shallow_tree, func, *inputs):
   The `inputs`, can be thought of as having the same structure as
   `shallow_tree`, but with leaf nodes that are themselves tree structures.
 
-  This function, therefore, will return something with the same base structure as
-  `shallow_tree`.
+  This function, therefore, will return something with the same base structure
+  as `shallow_tree`.
 
   Examples:
 
diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py
index 084f98581e..8ad29afd0a 100644
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@@ -105,7 +105,7 @@ class TensordotTest(test_lib.TestCase):
         self.assertAllEqual(tf_ans, np_ans)
 
   def test_partial_shape_inference(self):
-    for axes in ([1],[0]), 1:
+    for axes in ([1], [0]), 1:
       a = array_ops.placeholder(dtypes.float32)
       b = array_ops.placeholder(dtypes.float32)
       output = math_ops.tensordot(a, b, axes)
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index cab1025df1..22636fdbb3 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1691,7 +1691,8 @@ def rgb_to_yiq(images):
     images: tensor with the same shape as `images`.
   """
   images = ops.convert_to_tensor(images, name='images')
-  kernel = ops.convert_to_tensor(_rgb_to_yiq_kernel, dtype=images.dtype, name='kernel')
+  kernel = ops.convert_to_tensor(
+      _rgb_to_yiq_kernel, dtype=images.dtype, name='kernel')
   ndims = images.get_shape().ndims
   return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
 
@@ -1717,7 +1718,8 @@ def yiq_to_rgb(images):
     images: tensor with the same shape as `images`.
   """
   images = ops.convert_to_tensor(images, name='images')
-  kernel = ops.convert_to_tensor(_yiq_to_rgb_kernel, dtype=images.dtype, name='kernel')
+  kernel = ops.convert_to_tensor(
+      _yiq_to_rgb_kernel, dtype=images.dtype, name='kernel')
   ndims = images.get_shape().ndims
   return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
 
@@ -1742,7 +1744,8 @@ def rgb_to_yuv(images):
     images: tensor with the same shape as `images`.
   """
   images = ops.convert_to_tensor(images, name='images')
-  kernel = ops.convert_to_tensor(_rgb_to_yuv_kernel, dtype=images.dtype, name='kernel')
+  kernel = ops.convert_to_tensor(
+      _rgb_to_yuv_kernel, dtype=images.dtype, name='kernel')
   ndims = images.get_shape().ndims
   return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
 
@@ -1768,7 +1771,8 @@ def yuv_to_rgb(images):
     images: tensor with the same shape as `images`.
   """
   images = ops.convert_to_tensor(images, name='images')
-  kernel = ops.convert_to_tensor(_yuv_to_rgb_kernel, dtype=images.dtype, name='kernel')
+  kernel = ops.convert_to_tensor(
+      _yuv_to_rgb_kernel, dtype=images.dtype, name='kernel')
   ndims = images.get_shape().ndims
   return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]])
 
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index a58db51cb8..6e4b821463 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -321,7 +321,7 @@ do_external_licenses_check(){
   EXTRA_LICENSES_FILE="$(mktemp)_extra_licenses.log"
 
   echo "Getting external dependencies for ${BUILD_TARGET}"
- bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --no_implicit_deps --no_host_deps --keep_going \
+ bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --keep_going \
   | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
@@ -330,7 +330,7 @@ do_external_licenses_check(){
 
   echo
   echo "Getting list of external licenses mentioned in ${LICENSES_TARGET}."
-  bazel query "deps(${LICENSES_TARGET})" --no_implicit_deps --no_host_deps --keep_going \
+  bazel query "deps(${LICENSES_TARGET})" --keep_going \
   | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
-- 
GitLab


From f8e328da8861801e1c6c3279042fee8a7c1e933a Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Thu, 1 Feb 2018 10:28:34 -0800
Subject: [PATCH 508/966] Updating the version to 1.6.0-rc0.

---
 .../eager/python/examples/mnist/mnist.py      |  2 +-
 tensorflow/core/public/version.h              |  4 ++--
 tensorflow/docs_src/install/install_c.md      |  2 +-
 tensorflow/docs_src/install/install_go.md     |  2 +-
 tensorflow/docs_src/install/install_java.md   | 22 +++++++++----------
 tensorflow/docs_src/install/install_linux.md  | 22 +++++++++----------
 tensorflow/docs_src/install/install_mac.md    | 10 ++++-----
 .../docs_src/install/install_sources.md       | 10 ++++++---
 tensorflow/tools/docker/Dockerfile.devel      |  2 +-
 .../tools/docker/Dockerfile.devel-cpu-mkl     |  2 +-
 tensorflow/tools/docker/Dockerfile.devel-gpu  |  2 +-
 tensorflow/tools/pip_package/setup.py         |  2 +-
 12 files changed, 43 insertions(+), 39 deletions(-)

diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
index 2a7be95811..cfd5b1bca7 100644
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
@@ -39,7 +39,7 @@ class MNISTModel(tfe.Network):
   """MNIST Network.
 
   Network structure is equivalent to:
-  https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
+  https://github.com/tensorflow/tensorflow/blob/r1.6/tensorflow/examples/tutorials/mnist/mnist_deep.py
   and
   https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
 
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index b02f899b87..50bfa91267 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 5
+#define TF_MINOR_VERSION 6
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX ""
+#define TF_VERSION_SUFFIX "-rc0"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 14add7c77e..a783205b4a 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index d2af9d9843..5249e04615 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.6.0-rc0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index e5388c4b1e..0c6c773e62 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.5.0</version>
+  <version>1.6.0-rc0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.5.0</version>
+                 <version>1.6.0-rc0</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.5.0</version>
+  <version>1.6.0-rc0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.5.0</version>
+  <version>1.6.0-rc0</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0-rc0.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.5.0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.6.0-rc0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.5.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.6.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.5.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.6.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index cd8c14599f..105b225177 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -293,7 +293,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -705,14 +705,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index f49d3a2f08..a6ea548cfb 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -115,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -238,7 +238,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -347,7 +347,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -520,7 +520,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl
 </pre>
 
 
@@ -528,5 +528,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index c1e4545d20..d68c7b2f03 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -361,10 +361,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.5.0 on Linux:
+for TensorFlow 1.6.0rc0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.6.0rc0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -462,7 +462,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
@@ -480,6 +481,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@@ -493,6 +495,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 5dc4a053fd..26b136b091 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -70,7 +70,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.5 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1\.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 96b260ad3a..3f37390801 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.5
+ARG TF_BRANCH=r1\.6
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 07ffd3839a..4426a6e2c9 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -79,7 +79,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.5 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1\.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 6c9b5e46ee..7e793335f2 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.5.0'
+_VERSION = '1.6.0-rc0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
-- 
GitLab


From 291497cd60de3b9bd629545898f29afa62d00874 Mon Sep 17 00:00:00 2001
From: Jin-Hwan CHO <jinhwan.cho@gmail.com>
Date: Fri, 2 Feb 2018 03:32:36 +0900
Subject: [PATCH 509/966] Fix an imperfect implementation of
 tf.losses.mean_pairwise_squared_error (#16433)

* Imperfect implementation of tf.losses.mean_pairwise_squared_error (#15968)

https://github.com/tensorflow/tensorflow/issues/15968

* To pass the test, tensorflow/python/kernel_tests/losses_test.py needs to be fixed.
---
 tensorflow/python/kernel_tests/losses_test.py | 12 ++++++------
 tensorflow/python/ops/losses/losses_impl.py   |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index 81af3a0887..59fe3df3e5 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -953,14 +953,14 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     # Compute the expected loss 'manually'.
     total = np.zeros((batch_size,))
     for b in range(batch_size):
-      for i in range(dims):
-        for j in range(dims):
+      for i in range(dims-1):
+        for j in range(i+1, dims):
           x = self._predictions[b, i].item() - self._predictions[b, j].item()
           y = self._labels[b, i].item() - self._labels[b, j].item()
           diff = (x - y)
           total[b] += (diff * diff)
 
-    self._expected_losses = np.divide(total, 9.0)
+    self._expected_losses = np.divide(total, 3.0)
 
   def testValueErrorThrownWhenWeightIsNone(self):
     with self.test_session():
@@ -1060,7 +1060,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         [[8, 1, 3], [7, 8, 9], [10, 11, 12]],
     ])
     self._test_valid_weights(
-        labels, predictions, expected_loss=122.22222)
+        labels, predictions, expected_loss=137.5)
 
   def test3dWeightedScalar(self):
     labels = np.array([
@@ -1073,7 +1073,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     ])
     weight = 3.0
     self._test_valid_weights(
-        labels, predictions, expected_loss=weight * 122.22222,
+        labels, predictions, expected_loss=weight * 137.5,
         weights=weight)
 
   def _test_invalid_weights(
@@ -1124,7 +1124,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     ])
     self._test_valid_weights(
         # TODO(ptucker): This doesn't look right.
-        labels, predictions, expected_loss=9 * 122.22222,
+        labels, predictions, expected_loss=9 * 137.5,
         weights=np.ones((2, 3, 3)))
 
   def testLossWithAllZeroBatchSpecificWeights(self):
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 73563486e1..f84de3880a 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -547,12 +547,12 @@ def mean_pairwise_squared_error(
       num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
       term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
-                              num_present_per_batch)
+                              num_present_per_batch-1)
 
       sum_diff = math_ops.reduce_sum(
           diffs, reduction_indices=reduction_indices, keep_dims=True)
       term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
-                              math_ops.square(num_present_per_batch))
+                              math_ops.multiply(num_present_per_batch, num_present_per_batch-1))
 
       weighted_losses = math_ops.multiply(term1 - term2, weights)
       loss = math_ops.reduce_sum(weighted_losses)
-- 
GitLab


From bd92b8dcf7cf6f10661ccfd52029ee8df103159b Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Thu, 1 Feb 2018 10:33:21 -0800
Subject: [PATCH 510/966] Removing the escape character.

---
 tensorflow/tools/docker/Dockerfile.devel         | 2 +-
 tensorflow/tools/docker/Dockerfile.devel-cpu-mkl | 2 +-
 tensorflow/tools/docker/Dockerfile.devel-gpu     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 26b136b091..d16761c367 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -70,7 +70,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1\.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 3f37390801..3690e7dfe5 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1\.6
+ARG TF_BRANCH=r1.6
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 4426a6e2c9..4ef37881bc 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -79,7 +79,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1\.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
-- 
GitLab


From c9dbbb898b3b8c3fcb113f5e43bc5a764ee81a8f Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Thu, 1 Feb 2018 10:48:44 -0800
Subject: [PATCH 511/966] compile libtensorflow on windows with AVX. (#16673)

---
 tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh | 2 +-
 tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
index fa28e3d79c..583d1d5f09 100755
--- a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
@@ -41,7 +41,7 @@ run_configure_for_cpu_build
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt \
+bazel build -c opt --copt=/arch:AVX \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
index 573c926203..94276c6c5c 100644
--- a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
@@ -41,7 +41,7 @@ run_configure_for_gpu_build
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt \
+bazel build -c opt --copt=/arch:AVX \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
-- 
GitLab


From 1bc391cb716e163712202c874f3431a5aa1a8f44 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Thu, 1 Feb 2018 10:54:55 -0800
Subject: [PATCH 512/966] Updating the cloud tpu version.

---
 tensorflow/contrib/tpu/profiler/pip_package/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index 3dffebe668..cb61984799 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from setuptools import setup
 
-_VERSION = '1.5.0-rc1'
+_VERSION = '1.6.0-rc0'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
-- 
GitLab


From f13f8d3127e243223330b64b63b367e5190f2c94 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Thu, 1 Feb 2018 12:21:05 -0800
Subject: [PATCH 513/966] Fix ldscript for exporting PyInit_* functions for
 python3

---
 tensorflow/tensorflow.bzl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index b33f32fdfb..dc940f8d74 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1310,7 +1310,7 @@ def _append_init_to_versionscript_impl(ctx):
             template=ctx.file.template_file,
             output=ctx.outputs.versionscript,
             substitutions={
-                "global:":"global:\n   init_%s;"%modName,
+                "global:":"global:\n     init_%s;\n     PyInit_*;"%(modName),
             },
             is_executable=False,
         )
@@ -1319,7 +1319,7 @@ def _append_init_to_versionscript_impl(ctx):
             template=ctx.file.template_file,
             output=ctx.outputs.versionscript,
             substitutions={
-                "*tensorflow*":"*tensorflow*\ninit_%s"%modName,
+                "*tensorflow*":"*tensorflow*\ninit_%s\nPyInit_*\n"%(modName),
             },
             is_executable=False,
         )
-- 
GitLab


From b4cf62578ecbff2185f81987d1c41f3cd5cb9e19 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Thu, 1 Feb 2018 14:23:35 -0800
Subject: [PATCH 514/966] Fix linter error in losses_impl.py (#16675)

---
 tensorflow/python/ops/losses/losses_impl.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index f84de3880a..285e544047 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -551,8 +551,9 @@ def mean_pairwise_squared_error(
 
       sum_diff = math_ops.reduce_sum(
           diffs, reduction_indices=reduction_indices, keep_dims=True)
-      term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
-                              math_ops.multiply(num_present_per_batch, num_present_per_batch-1))
+      term2 = 2.0 * _safe_div(
+          math_ops.square(sum_diff),
+          math_ops.multiply(num_present_per_batch, num_present_per_batch-1))
 
       weighted_losses = math_ops.multiply(term1 - term2, weights)
       loss = math_ops.reduce_sum(weighted_losses)
-- 
GitLab


From 1c1aea213873a4f6865b619ee46c0d38e2458630 Mon Sep 17 00:00:00 2001
From: Jeff Tang <jeff.x.tang@gmail.com>
Date: Thu, 1 Feb 2018 14:31:04 -0800
Subject: [PATCH 515/966] added audio_ops.cc to fix the Op type not registered
 DecodeWav error - https://github.com/tensorflow/tensorflow/issues/15921
 (#16537)

---
 tensorflow/contrib/makefile/tf_op_files.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 9a1ab50317..5a812af4e9 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -293,3 +293,4 @@ tensorflow/core/kernels/batchtospace_op.cc
 tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
 tensorflow/core/kernels/batch_util.cc
+tensorflow/core/ops/audio_ops.cc
-- 
GitLab


From 9f292d8eac67ed53248e019a969d39c63bca1868 Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Thu, 1 Feb 2018 14:48:47 -0800
Subject: [PATCH 516/966] [update]   addressing comment issues

---
 tensorflow/contrib/tensorrt/convert/convert_graph.cc | 7 ++-----
 tensorflow/contrib/tensorrt/convert/convert_nodes.h  | 2 +-
 tensorflow/tensorflow.bzl                            | 2 +-
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 81fdf01286..d94f8ac4ba 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -224,15 +224,12 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   cluster =
       new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
 
-  tensorflow::Status status = optimizer.Optimize(cluster, item, &gdef);
-
-  if (status != tensorflow::Status::OK()) return status;
+  TF_RETURN_IF_ERROR(optimizer.Optimize(cluster, item, &gdef));
 
   // constant folding
   item.graph = gdef;
   tensorflow::grappler::ConstantFolding fold(nullptr);
-  status = fold.Optimize(nullptr, item, &gdef);
-  if (status != tensorflow::Status::OK()) return status;
+  TF_RETURN_IF_ERROR(fold.Optimize(nullptr, item, &gdef));
 
   // AJ refactoring shape inference through grappler/GraphProperties.
   tensorflow::grappler::GraphProperties static_graph_properties(item);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 82b12b74ea..210cc36dcd 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -38,7 +38,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         input_inds,  // {node_id, output_idx}
     const std::vector<std::pair<int, int>>&
         output_inds,  // {node_id, output_idx}
-    size_t max_batch_size, size_t max_workspace_size_bytes_bytes,
+    size_t max_batch_size_bytes, size_t max_workspace_size_bytes,
     const tensorflow::grappler::GraphProperties& graph_prop,
     tensorflow::NodeDef* trt_node);
 
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index dc940f8d74..2534c88c18 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -286,7 +286,7 @@ def tf_cc_shared_object(
     linkopts=[],
     framework_so=tf_binary_additional_srcs(),
     **kwargs):
-    native.cc_binary(
+  native.cc_binary(
       name=name,
       srcs=srcs + framework_so,
       deps=deps,
-- 
GitLab


From 474bf4e0458bb16f5d6882d3a179f072260168ce Mon Sep 17 00:00:00 2001
From: fo40225 <fo40225@users.noreply.github.com>
Date: Fri, 2 Feb 2018 07:09:05 +0800
Subject: [PATCH 517/966] fix python 2.7 build break & import error on windows
 (#16649)

---
 tensorflow/contrib/cmake/python_modules.txt       | 1 +
 tensorflow/contrib/cmake/tools/create_def_file.py | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 914c4124c3..a7938f1f07 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -6,6 +6,7 @@ tensorflow/core/example
 tensorflow/core/framework
 tensorflow/core/lib
 tensorflow/core/lib/core
+tensorflow/core/profiler
 tensorflow/core/protobuf
 tensorflow/core/util
 tensorflow/examples
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index f67698eb99..77ea914380 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -31,6 +31,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import codecs
 import io
 import os
 import re
@@ -103,7 +104,7 @@ def main():
   for lib_path in args.input:
     proc = subprocess.Popen([DUMPBIN, "/nologo", "/linkermember:1", lib_path],
                             stdout=subprocess.PIPE)
-    for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
+    for line in codecs.getreader("utf-8")(proc.stdout):
       cols = line.split()
       if len(cols) < 2:
         continue
@@ -131,7 +132,7 @@ def main():
     # We compare on undname but use the decorated name from candidates.
     dupes = 0
     proc = subprocess.Popen([UNDNAME, tmpfile.name], stdout=subprocess.PIPE)
-    for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")):
+    for idx, line in enumerate(codecs.getreader("utf-8")(proc.stdout)):
       decorated = candidates[idx]
       if decorated in taken:
         # Symbol is already in output, done.
-- 
GitLab


From 087401a6a92c3d449c9564fff252d85ce736e5ff Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 1 Feb 2018 15:10:06 -0800
Subject: [PATCH 518/966] Improve shape function of NonMaxSuppression (#16664)

* Improve shape function of NonMaxSuppression

This fix tries to improve shape function of NonMaxSuppression.
As was specified in the docs, the shapes of parameters of
`tf.image.non_max_suppression` are clearly defined with:
boxes: 2-D with shape [num_boxes, 4]
scores: 1-D with shape [num_boxes]
max_output_size: 0-D scalar
iou_threshold: 0-D scalar

However, there is no shape check in the shape function of
NonMaxSuppression.

This fix adds the shape check for NonMaxSuppression,
and adds additinal test cases for it.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add additional test cases for shape check of NonMaxSuppression.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/image_ops.cc        | 24 +++++++++++++++
 tensorflow/python/ops/image_ops_test.py | 40 +++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index ef2ac267cc..a62e2d782b 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -586,6 +586,17 @@ REGISTER_OP("NonMaxSuppression")
     .Output("selected_indices: int32")
     .Attr("iou_threshold: float = 0.5")
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+      ShapeHandle scores;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+      ShapeHandle max_output_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+      // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
       c->set_output(0, c->Vector(c->UnknownDim()));
       return Status::OK();
     });
@@ -597,6 +608,19 @@ REGISTER_OP("NonMaxSuppressionV2")
     .Input("iou_threshold: float")
     .Output("selected_indices: int32")
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+      ShapeHandle scores;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+      ShapeHandle max_output_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+      ShapeHandle iou_threshold;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
+      // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
       c->set_output(0, c->Vector(c->UnknownDim()));
       return Status::OK();
     });
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 6a516a9911..82b77ee8e3 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -3169,6 +3169,46 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
           boxes, scores, max_output_size, iou_threshold).eval()
       self.assertAllClose(selected_indices, [3, 0, 5])
 
+  def testInvalidShape(self):
+    # The boxes should be 2D of shape [num_boxes, 4].
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 2 but is rank 1'):
+      boxes = constant_op.constant([0.0, 0.0, 1.0, 1.0])
+      scores = constant_op.constant([0.9])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, 3, 0.5)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'Dimension must be 4 but is 3'):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, 3, 0.5)
+
+    # The scores should be 1D of shape [num_boxes].
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 1 but is rank 2'):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([[0.9]])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, 3, 0.5)
+
+    # The max_output_size should be a scaler (0-D).
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, [3], 0.5)
+
+    # The iou_threshold should be a scaler (0-D).
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 2'):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, 3, [[0.5]])
+
 
 if __name__ == "__main__":
   googletest.main()
-- 
GitLab


From f902b411106c4080e9c630b1c5118caaa702af7c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 31 Jan 2018 20:20:18 -0800
Subject: [PATCH 519/966] Update ops-related pbtxt files.

PiperOrigin-RevId: 184085402
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 60 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 60 +++++++++++++++++++
 2 files changed, 120 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 65ab81931a..0930eaa01f 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -17136,6 +17136,24 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "EnqueueInQueueDataset"
+  input_arg {
+    name: "queue"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Enter"
   input_arg {
@@ -32096,6 +32114,48 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "PrependFromQueueAndPaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "PreventGradient"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index b57206c9c4..36e0bab1b1 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -7644,6 +7644,24 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "EnqueueInQueueDataset"
+  input_arg {
+    name: "queue"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Enter"
   input_arg {
@@ -15926,6 +15944,48 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "PrependFromQueueAndPaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "PreventGradient"
   input_arg {
-- 
GitLab


From d999c5a32767e1f48582c96b879fe19afb4cce5d Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Thu, 1 Feb 2018 15:15:57 -0800
Subject: [PATCH 520/966] [update]   addressing comment on variable names

---
 tensorflow/contrib/tensorrt/convert/convert_nodes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 210cc36dcd..2e7fd19566 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -38,7 +38,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         input_inds,  // {node_id, output_idx}
     const std::vector<std::pair<int, int>>&
         output_inds,  // {node_id, output_idx}
-    size_t max_batch_size_bytes, size_t max_workspace_size_bytes,
+    size_t max_batch_size, size_t max_workspace_size_bytes,
     const tensorflow::grappler::GraphProperties& graph_prop,
     tensorflow::NodeDef* trt_node);
 
-- 
GitLab


From 6aae3000eac9ff06831284fdd9475463508d8408 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 31 Jan 2018 20:46:33 -0800
Subject: [PATCH 521/966] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 184086955

---
 tensorflow/go/op/wrappers.go | 50 ++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 5b19c90238..cb47651d7b 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -8729,31 +8729,6 @@ func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Out
 	return op.Output(0)
 }
 
-// Compute the pairwise cross product.
-//
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
-//
-// Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
-//
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cross",
-		Input: []tf.Input{
-			a, b,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
 // Arguments:
@@ -21290,6 +21265,31 @@ func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output
 	return op.Output(0)
 }
 
+// Compute the pairwise cross product.
+//
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
+//
+// Arguments:
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
+//
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cross",
+		Input: []tf.Input{
+			a, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Performs a padding as a preprocess during a convolution.
 //
 // Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-- 
GitLab


From 56663eedd7296d85c9adb3e9537ddac521719445 Mon Sep 17 00:00:00 2001
From: Robin Richtsfeld <robin.richtsfeld@gmail.com>
Date: Fri, 2 Feb 2018 01:00:46 +0100
Subject: [PATCH 522/966] Remove BOM (#16505)

---
 tensorflow/core/kernels/mkl_cwise_ops_common.cc | 2 +-
 tensorflow/python/kernel_tests/io_ops_test.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/mkl_cwise_ops_common.cc b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
index c065724e0d..58f0c30f32 100644
--- a/tensorflow/core/kernels/mkl_cwise_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
@@ -1,4 +1,4 @@
-﻿/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0(the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/python/kernel_tests/io_ops_test.py b/tensorflow/python/kernel_tests/io_ops_test.py
index f91875c6f0..61944f7e31 100644
--- a/tensorflow/python/kernel_tests/io_ops_test.py
+++ b/tensorflow/python/kernel_tests/io_ops_test.py
@@ -1,4 +1,4 @@
-﻿# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
-- 
GitLab


From 7d63461f14aa1b9c7204cb260df14d1f0422a5b4 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Wed, 31 Jan 2018 21:17:30 -0800
Subject: [PATCH 523/966] Internal change

PiperOrigin-RevId: 184088913
---
 .../lite/lib_package/create_ios_frameworks.sh | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100755 tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh

diff --git a/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh
new file mode 100755
index 0000000000..b58ae26601
--- /dev/null
+++ b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh
@@ -0,0 +1,81 @@
+#!/bin/bash -x
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+echo "Starting"
+TFLITE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.."
+
+TMP_DIR=$(mktemp -d)
+echo "Package dir: " $TMP_DIR
+FW_DIR=$TMP_DIR/tensorflow_lite_ios_frameworks
+FW_DIR_TFLITE=$FW_DIR/tensorflow_lite.framework
+FW_DIR_TFLITE_HDRS=$FW_DIR_TFLITE/Headers
+
+echo "Creating target Headers directories"
+mkdir -p $FW_DIR_TFLITE_HDRS
+
+echo "Headers, populating: TensorFlow Lite"
+cd $TFLITE_DIR/../../..
+
+find tensorflow/contrib/lite -name '*.h' \
+    -not -path 'tensorflow/contrib/lite/downloads/*' \
+    -not -path 'tensorflow/contrib/lite/examples/*' \
+    -not -path 'tensorflow/contrib/lite/gen/*' \
+    -not -path 'tensorflow/contrib/lite/toco/*' \
+    -not -path 'tensorflow/contrib/lite/nnapi/*' \
+    -not -path 'tensorflow/contrib/lite/java/*' \
+    | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T -
+cd $FW_DIR_TFLITE_HDRS
+tar xf tmp.tar
+rm -f tmp.tar
+
+echo "Headers, populating: Flatbuffer"
+cd $TFLITE_DIR/downloads/flatbuffers/include/
+find . -name '*.h' | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T -
+cd $FW_DIR_TFLITE_HDRS
+tar xf tmp.tar
+rm -f tmp.tar
+
+cd $TFLITE_DIR/../../..
+echo "Generate master LICENSE file and copy to target"
+bazel build //tensorflow/tools/lib_package:clicenses_generate
+cp $TFLITE_DIR/../../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE \
+   $FW_DIR_TFLITE
+
+echo "Copying static libraries"
+cp $TFLITE_DIR/gen/lib/libtensorflow-lite.a \
+   $FW_DIR_TFLITE/tensorflow_lite
+
+# This is required, otherwise they interfere with the documentation of the
+# pod at cocoapods.org.
+echo "Remove all README files"
+cd $FW_DIR_TFLITE_HDRS
+find . -type f -name README\* -exec rm -f {} \;
+find . -type f -name readme\* -exec rm -f {} \;
+
+TARGET_GEN_LOCATION="$TFLITE_DIR/gen/ios_frameworks"
+echo "Moving results to target: " $TARGET_GEN_LOCATION
+cd $FW_DIR
+zip -q -r tensorflow_lite.framework.zip tensorflow_lite.framework -x .DS_Store
+rm -rf $TARGET_GEN_LOCATION
+mkdir -p $TARGET_GEN_LOCATION
+cp -r tensorflow_lite.framework.zip $TARGET_GEN_LOCATION
+
+echo "Cleaning up"
+rm -rf $TMP_DIR
+
+echo "Finished"
-- 
GitLab


From baf490ba79acaacb458078370e4bad1c3fd17563 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 31 Jan 2018 23:05:26 -0800
Subject: [PATCH 524/966] [TF:XLA] Fix tfcompile OSS build

 - The @org_tensorflow package designation is unnecessary, and breaks the build
   when building without a sandbox.

 - The generated tests must use tf_cc_test, not cc_test.  See the note in
   tensorflow/core/BUILD.

Partially addresses #15338

PiperOrigin-RevId: 184095571
---
 tensorflow/compiler/aot/tfcompile.bzl | 83 +++++++++++++++------------
 1 file changed, 45 insertions(+), 38 deletions(-)

diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 2b9c83ba14..58572fea3d 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -4,7 +4,7 @@
 
 To use from your BUILD file, add the following line to load the macro:
 
-load("@org_tensorflow//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
 
 Then call the macro like this:
 
@@ -16,14 +16,15 @@ tf_library(
 )
 """
 
-load("@org_tensorflow//tensorflow:tensorflow.bzl", "if_android", "tf_copts")
+load("//tensorflow:tensorflow.bzl",
+     "if_android", "tf_cc_test", "tf_copts")
 
 def tf_library(name, graph, config,
                freeze_checkpoint=None, freeze_saver=None,
                cpp_class=None, gen_test=True, gen_benchmark=True,
                visibility=None, testonly=None,
                tfcompile_flags=None,
-               tfcompile_tool="@org_tensorflow//tensorflow/compiler/aot:tfcompile",
+               tfcompile_tool="//tensorflow/compiler/aot:tfcompile",
                include_standard_runtime_deps=True, deps=None, tags=None):
   """Runs tfcompile to compile a TensorFlow graph into executable code.
 
@@ -119,9 +120,9 @@ def tf_library(name, graph, config,
             out_nodes_file,
         ] + freeze_saver_srcs,
         outs=[freeze_file],
-        cmd=("$(location @org_tensorflow//tensorflow/python/tools:freeze_graph)" +
+        cmd=("$(location //tensorflow/python/tools:freeze_graph)" +
              freeze_args),
-        tools=["@org_tensorflow//tensorflow/python/tools:freeze_graph"],
+        tools=["//tensorflow/python/tools:freeze_graph"],
         tags=tags,
     )
     tfcompile_graph = freeze_file
@@ -213,22 +214,22 @@ def tf_library(name, graph, config,
           # These deps are required by all tf_library targets even if
           # include_standard_runtime_deps is False.  Without them, the
           # generated code will fail to compile.
-          "@org_tensorflow//tensorflow/compiler/tf2xla:xla_compiled_cpu_function",
-          "@org_tensorflow//tensorflow/core:framework_lite",
+          "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function",
+          "//tensorflow/core:framework_lite",
       ] + (need_xla_data_proto and [
           # If we're generating the program shape, we must depend on the proto.
-          "@org_tensorflow//tensorflow/compiler/xla:xla_data_proto",
+          "//tensorflow/compiler/xla:xla_data_proto",
       ] or []) + (include_standard_runtime_deps and [
           # TODO(cwhipkey): only depend on kernel code that the model actually needed.
-          "@org_tensorflow//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d",
-          "@org_tensorflow//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_avx",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_neon",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_sse4_1",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_conv2d",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_matmul",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
+          "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d",
+          "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d",
+          "//tensorflow/compiler/xla/service/cpu:cpu_runtime_avx",
+          "//tensorflow/compiler/xla/service/cpu:cpu_runtime_neon",
+          "//tensorflow/compiler/xla/service/cpu:cpu_runtime_sse4_1",
+          "//tensorflow/compiler/xla/service/cpu:runtime_conv2d",
+          "//tensorflow/compiler/xla/service/cpu:runtime_matmul",
+          "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d",
+          "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
           "//third_party/eigen3",
       ] or []) + (deps or []),
       tags=tags,
@@ -254,28 +255,32 @@ def tf_library(name, graph, config,
         name=("gen_" + test_name),
         testonly=1,
         srcs=[
-            "@org_tensorflow//tensorflow/compiler/aot:test.cc",
+            "//tensorflow/compiler/aot:test.cc",
             header_file,
         ],
         outs=[test_file],
         cmd=("sed " + sed_replace +
-             " $(location @org_tensorflow//tensorflow/compiler/aot:test.cc) " +
+             " $(location //tensorflow/compiler/aot:test.cc) " +
              "> $(OUTS)"),
         tags=tags,
     )
 
-    # The cc_test rule for the generated code.
-    native.cc_test(
+    # The cc_test rule for the generated code.  To ensure that this works
+    # reliably across build configurations, we must use tf_cc_test instead of
+    # native.cc_test.  This is related to how we build
+    # //tensorflow/core:lib -- see the note in tensorflow/core/BUILD
+    # for more details.
+    tf_cc_test(
         name=test_name,
         srcs=[test_file],
         deps=[
             ":" + name,
-            "@org_tensorflow//tensorflow/compiler/aot:runtime",
-            "@org_tensorflow//tensorflow/compiler/aot:tf_library_test_main",
-            "@org_tensorflow//tensorflow/compiler/xla:executable_run_options",
+            "//tensorflow/compiler/aot:runtime",
+            "//tensorflow/compiler/aot:tf_library_test_main",
+            "//tensorflow/compiler/xla:executable_run_options",
             "//third_party/eigen3",
-            "@org_tensorflow//tensorflow/core:lib",
-            "@org_tensorflow//tensorflow/core:test",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:test",
             ],
         tags=tags,
     )
@@ -283,7 +288,7 @@ def tf_library(name, graph, config,
   if gen_benchmark:
     benchmark_name = name + "_benchmark"
     benchmark_file = benchmark_name + ".cc"
-    benchmark_main = ("@org_tensorflow//tensorflow/compiler/aot:" +
+    benchmark_main = ("//tensorflow/compiler/aot:" +
                       "benchmark_main.template")
 
     # Rule to rewrite benchmark.cc to produce the benchmark_file.
@@ -301,7 +306,9 @@ def tf_library(name, graph, config,
         tags=tags,
     )
 
-    # The cc_benchmark rule for the generated code.
+    # The cc_benchmark rule for the generated code.  This does not need the
+    # tf_cc_binary since we (by deliberate design) do not depend on
+    # //tensorflow/core:lib.
     #
     # Note: to get smaller size on android for comparison, compile with:
     #    --copt=-fvisibility=hidden
@@ -315,12 +322,12 @@ def tf_library(name, graph, config,
         linkopts = if_android(["-pie", "-s"]),
         deps=[
             ":" + name,
-            "@org_tensorflow//tensorflow/compiler/aot:benchmark",
-            "@org_tensorflow//tensorflow/compiler/aot:runtime",
-            "@org_tensorflow//tensorflow/compiler/xla:executable_run_options",
+            "//tensorflow/compiler/aot:benchmark",
+            "//tensorflow/compiler/aot:runtime",
+            "//tensorflow/compiler/xla:executable_run_options",
             "//third_party/eigen3",
         ] + if_android([
-            "@org_tensorflow//tensorflow/compiler/aot:benchmark_extra_android",
+            "//tensorflow/compiler/aot:benchmark_extra_android",
         ]),
         tags=tags,
     )
@@ -330,11 +337,11 @@ def target_llvm_triple():
   # TODO(toddw): Add target_triple for other targets.  For details see:
   # http://llvm.org/docs/doxygen/html/Triple_8h_source.html
   return select({
-      "@org_tensorflow//tensorflow:android_armeabi": "armv5-none-android",
-      "@org_tensorflow//tensorflow:android_arm": "armv7-none-android",
-      "@org_tensorflow//tensorflow:android_arm64": "aarch64-none-android",
-      "@org_tensorflow//tensorflow:android_x86": "i686-none-android",
-      "@org_tensorflow//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
-      "@org_tensorflow//tensorflow:darwin": "x86_64-none-darwin",
+      "//tensorflow:android_armeabi": "armv5-none-android",
+      "//tensorflow:android_arm": "armv7-none-android",
+      "//tensorflow:android_arm64": "aarch64-none-android",
+      "//tensorflow:android_x86": "i686-none-android",
+      "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
+      "//tensorflow:darwin": "x86_64-none-darwin",
       "//conditions:default": "x86_64-pc-linux",
   })
-- 
GitLab


From 47fcca75bc8ec9e3c9d484e055c94facef280e21 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 1 Feb 2018 06:47:06 -0800
Subject: [PATCH 525/966] [TF:XLA] Implement MatrixSetDiag and MatrixBandPart.
 Add support for int32 indices to the MatrixBandPart operator.

PiperOrigin-RevId: 184133343
---
 tensorflow/compiler/tests/BUILD               | 13 +++
 tensorflow/compiler/tests/binary_ops_test.py  | 44 +++++++++
 .../compiler/tests/matrix_band_part_test.py   | 64 ++++++++++++
 tensorflow/compiler/tf2xla/kernels/BUILD      |  2 +
 .../tf2xla/kernels/matrix_band_part_op.cc     | 98 +++++++++++++++++++
 .../tf2xla/kernels/matrix_set_diag_op.cc      | 93 ++++++++++++++++++
 .../core/kernels/matrix_band_part_op.cc       | 12 ++-
 tensorflow/core/ops/array_ops.cc              |  5 +-
 .../kernel_tests/matrix_band_part_op_test.py  | 11 ++-
 9 files changed, 335 insertions(+), 7 deletions(-)
 create mode 100644 tensorflow/compiler/tests/matrix_band_part_test.py
 create mode 100644 tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
 create mode 100644 tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 7277ba42ce..b0b038775f 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -353,6 +353,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "matrix_band_part_test",
+    size = "medium",
+    srcs = ["matrix_band_part_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "momentum_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 16856bd736..9d34cdfe10 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -1181,6 +1181,50 @@ class BinaryOpsTest(XLATestCase):
                        np.array([4, 5, 6], dtype=np.int32),
                        expected=None)
 
+  def testMatrixSetDiag(self):
+    for dtype in self.numeric_types:
+      # Square
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]],
+                   dtype=dtype),
+          np.array([1.0, 2.0, 3.0], dtype=dtype),
+          expected=np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0], [1.0, 1.0, 3.0]],
+                            dtype=dtype))
+
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]],
+                    [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0], [2.0, 0.0, 6.0]]],
+                   dtype=dtype),
+          np.array([[-1.0, 0.0, -3.0], [-4.0, -5.0, -6.0]], dtype=dtype),
+          expected=np.array(
+              [[[-1.0, 0.0, 3.0], [0.0, 0.0, 0.0], [1.0, 0.0, -3.0]],
+               [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0], [2.0, 0.0, -6.0]]],
+              dtype=dtype))
+
+      # Rectangular
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0]], dtype=dtype),
+          np.array([3.0, 4.0], dtype=dtype),
+          expected=np.array([[3.0, 1.0, 0.0], [1.0, 4.0, 1.0]], dtype=dtype))
+
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]], dtype=dtype),
+          np.array([3.0, 4.0], dtype=dtype),
+          expected=np.array([[3.0, 1.0], [1.0, 4.0], [1.0, 1.0]], dtype=dtype))
+
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]],
+                    [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]], dtype=dtype),
+          np.array([[-1.0, -2.0], [-4.0, -5.0]],
+                   dtype=dtype),
+          expected=np.array([[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0]],
+                             [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]],
+                            dtype=dtype))
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tests/matrix_band_part_test.py b/tensorflow/compiler/tests/matrix_band_part_test.py
new file mode 100644
index 0000000000..29394f9ea5
--- /dev/null
+++ b/tensorflow/compiler/tests/matrix_band_part_test.py
@@ -0,0 +1,64 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class MatrixBandPartTest(XLATestCase):
+
+  def _testMatrixBandPart(self, dtype, shape):
+    with self.test_session():
+      batch_shape = shape[:-2]
+      mat = np.ones(shape).astype(dtype)
+      batch_mat = np.tile(mat, batch_shape + [1, 1])
+      for lower in -1, 0, 1, shape[-2] - 1:
+        for upper in -1, 0, 1, shape[-1] - 1:
+          band_np = mat
+          if lower >= 0:
+            band_np = np.triu(band_np, -lower)
+          if upper >= 0:
+            band_np = np.tril(band_np, upper)
+          if batch_shape:
+            band_np = np.tile(band_np, batch_shape + [1, 1])
+
+          placeholder = array_ops.placeholder(dtype)
+          with self.test_scope():
+            band = array_ops.matrix_band_part(
+                placeholder,
+                constant_op.constant(lower, dtype=dtypes.int32),
+                constant_op.constant(upper, dtype=dtypes.int32))
+            feed_dict = {placeholder: batch_mat}
+            self.assertAllEqual(band_np, band.eval(feed_dict=feed_dict))
+
+  def testMatrixBandPart(self):
+    for dtype in self.float_types:
+      for batch_shape in [[], [2,], [1, 3, 2]]:
+        for rows in 1, 2, 7:
+          for cols in 1, 2, 7:
+            self._testMatrixBandPart(dtype, batch_shape + [rows, cols])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 67be1a4ba6..e9be6f8476 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -44,6 +44,8 @@ tf_kernel_library(
         "l2loss_op.cc",
         "lrn_ops.cc",
         "matmul_op.cc",
+        "matrix_band_part_op.cc",
+        "matrix_set_diag_op.cc",
         "matrix_triangular_solve_op.cc",
         "mirror_pad_op.cc",
         "no_op.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
new file mode 100644
index 0000000000..faa415a97b
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class MatrixBandPartOp : public XlaOpKernel {
+ public:
+  explicit MatrixBandPartOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape),
+                errors::InvalidArgument(
+                    "input must be at least 2-dim, received shape: ",
+                    input_shape.DebugString()));
+
+    const TensorShape num_lower_in_shape = context->InputShape(1);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_lower_in_shape),
+                errors::InvalidArgument("num_lower must be scalar, got shape ",
+                                        num_lower_in_shape.DebugString()));
+
+    const TensorShape num_upper_in_shape = context->InputShape(2);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_upper_in_shape),
+                errors::InvalidArgument("num_upper must be scalar, got shape ",
+                                        num_upper_in_shape.DebugString()));
+
+    xla::ComputationBuilder* builder = context->builder();
+    xla::ComputationDataHandle input = context->Input(0);
+    xla::ComputationDataHandle num_lower = context->Input(1);
+    xla::ComputationDataHandle num_upper = context->Input(2);
+    DataType input_type = context->input_type(0);
+    DataType index_type = context->input_type(1);
+
+    TensorShape batch_shape = input_shape;
+    batch_shape.RemoveLastDims(2);
+    const int64 m = input_shape.dim_size(input_shape.dims() - 2);
+    const int64 n = input_shape.dim_size(input_shape.dims() - 1);
+
+    // Compute 'offset', which is how many diagonals we are above/below the
+    // diagonal.
+    xla::ComputationDataHandle iota_m;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, m, &iota_m));
+
+    xla::ComputationDataHandle iota_n;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, n, &iota_n));
+
+    auto offset = builder->Sub(builder->Broadcast(iota_n, {m}), iota_m,
+                               /*broadcast_dimensions=*/{0});
+
+    // If num_lower or num_upper are negative, include all lower/upper
+    // diagonals.
+    auto zero_index = XlaHelpers::Zero(builder, index_type);
+    num_lower = builder->Select(
+        builder->Lt(num_lower, zero_index),
+        XlaHelpers::IntegerLiteral(builder, index_type, m), num_lower);
+    num_upper = builder->Select(
+        builder->Lt(num_upper, zero_index),
+        XlaHelpers::IntegerLiteral(builder, index_type, n), num_upper);
+
+    auto indicator = builder->And(builder->Le(builder->Neg(num_lower), offset),
+                                  builder->Le(offset, num_upper));
+    indicator = builder->Broadcast(indicator, batch_shape.dim_sizes());
+
+    auto zero_input = XlaHelpers::Zero(builder, input_type);
+    auto output = builder->Select(
+        indicator, input,
+        builder->Broadcast(zero_input, input_shape.dim_sizes()));
+
+    context->SetOutput(0, output);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixBandPartOp);
+};
+REGISTER_XLA_OP(Name("MatrixBandPart"), MatrixBandPartOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
new file mode 100644
index 0000000000..b2940bdcff
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
@@ -0,0 +1,93 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+
+class MatrixSetDiagOp : public XlaOpKernel {
+ public:
+  explicit MatrixSetDiagOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    const TensorShape diag_shape = context->InputShape(1);
+
+    const int rank = input_shape.dims();
+
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape),
+                errors::InvalidArgument(
+                    "input must be at least 2-dim, received shape: ",
+                    input_shape.DebugString()));
+
+    // Check to make sure the last dimension of diag is equal to the smaller of
+    // the last two dimensions of input.
+    const int64 m = input_shape.dim_size(rank - 2);
+    const int64 n = input_shape.dim_size(rank - 1);
+    const int64 min_dim = std::min(m, n);
+
+    TensorShape batch_shape = input_shape;
+    batch_shape.RemoveLastDims(2);
+
+    TensorShape expected_diag_shape = batch_shape;
+    expected_diag_shape.AddDim(min_dim);
+    OP_REQUIRES(context, expected_diag_shape == diag_shape,
+                errors::InvalidArgument(
+                    "must have diagonal.shape == input.shape[:-2] + "
+                    "min(input.shape[-2:]), but received input shape: ",
+                    input_shape.DebugString(),
+                    " and diagonal shape: ", diag_shape.DebugString()));
+
+    xla::ComputationBuilder* builder = context->builder();
+    xla::ComputationDataHandle input = context->Input(0);
+    xla::ComputationDataHandle diag = context->Input(1);
+
+    auto zero = XlaHelpers::Zero(builder, context->input_type(0));
+
+    // Create an indicator tensor that is true only on the diagonal.
+    xla::ComputationDataHandle iota_m;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, m, &iota_m));
+    xla::ComputationDataHandle iota_n;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, n, &iota_n));
+    auto indicator = builder->Eq(iota_m,
+                                 builder->Broadcast(iota_n, {m}),
+                                 /*broadcast_dimensions=*/{0});
+    indicator = builder->Broadcast(indicator, batch_shape.dim_sizes());
+
+    // Broadcast diag up to the input shape. Use an implicit broadcast (Add)
+    // because we need to broadcast on the right.
+    std::vector<int64> diag_broadcast_dims(rank - 1);
+    std::iota(diag_broadcast_dims.begin(), diag_broadcast_dims.end(), 0);
+    if (min_dim != m) {
+      diag_broadcast_dims.back() = rank - 1;
+    }
+    diag = builder->Add(diag, builder->Broadcast(zero, input_shape.dim_sizes()),
+                        /*broadcast_dimensions=*/diag_broadcast_dims);
+
+    auto output = builder->Select(indicator, diag, input);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixSetDiagOp);
+};
+
+REGISTER_XLA_OP(Name("MatrixSetDiag"), MatrixSetDiagOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_band_part_op.cc b/tensorflow/core/kernels/matrix_band_part_op.cc
index d7fff4bb0c..1439141f64 100644
--- a/tensorflow/core/kernels/matrix_band_part_op.cc
+++ b/tensorflow/core/kernels/matrix_band_part_op.cc
@@ -62,7 +62,15 @@ class MatrixBandPartOp : public OpKernel {
     OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_lower_in.shape()),
                 errors::InvalidArgument("num_lower must be scalar, got shape ",
                                         num_lower_in.shape().DebugString()));
-    const int64 num_lower = num_lower_in.scalar<int64>()();
+
+    auto as_int64_scalar = [](const Tensor& tensor) -> int64 {
+      if (tensor.dtype() == DT_INT32) {
+        return tensor.scalar<int32>()();
+      } else {
+        return tensor.scalar<int64>()();
+      }
+    };
+    const int64 num_lower = as_int64_scalar(num_lower_in);
     OP_REQUIRES(
         context, num_lower <= input_reshaped.dimension(1),
         errors::InvalidArgument(
@@ -73,7 +81,7 @@ class MatrixBandPartOp : public OpKernel {
     OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_upper_in.shape()),
                 errors::InvalidArgument("num_upper must be scalar, got shape ",
                                         num_upper_in.shape().DebugString()));
-    const int64 num_upper = num_upper_in.scalar<int64>()();
+    const int64 num_upper = as_int64_scalar(num_upper_in);
     OP_REQUIRES(context, num_upper <= input_reshaped.dimension(2),
                 errors::InvalidArgument("num_upper must be negative or less or "
                                         "equal to number of columns (",
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index fb9e8ad50c..87dfa77689 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -701,10 +701,11 @@ REGISTER_OP("MatrixDiagPart")
 // --------------------------------------------------------------------------
 REGISTER_OP("MatrixBandPart")
     .Input("input: T")
-    .Input("num_lower: int64")
-    .Input("num_upper: int64")
+    .Input("num_lower: Tindex")
+    .Input("num_upper: Tindex")
     .Output("band: T")
     .Attr("T: type")
+    .Attr("Tindex: {int32, int64} = DT_INT64")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
diff --git a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
index 317b8dc05b..68d626de2c 100644
--- a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
@@ -21,6 +21,7 @@ import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -54,9 +55,13 @@ def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_):
           band_np = np.tril(band_np, upper)
         if batch_shape_ is not ():
           band_np = np.tile(band_np, batch_shape_ + (1, 1))
-        with self.test_session(use_gpu=False):
-          band = array_ops.matrix_band_part(batch_mat, lower, upper)
-          self.assertAllEqual(band_np, band.eval())
+        for index_dtype in [dtypes_lib.int32, dtypes_lib.int64]:
+          with self.test_session(use_gpu=False):
+            band = array_ops.matrix_band_part(
+                batch_mat,
+                constant_op.constant(lower, index_dtype),
+                constant_op.constant(upper, index_dtype))
+            self.assertAllEqual(band_np, band.eval())
 
   return Test
 
-- 
GitLab


From fce33290a7b75003398761ea60b0bb6f1fdd3880 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 08:18:13 -0800
Subject: [PATCH 526/966] Update ops-related pbtxt files.

PiperOrigin-RevId: 184141875
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 36 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 17 +++++++--
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 0930eaa01f..177561161e 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -24858,6 +24858,42 @@ op {
     type: "type"
   }
 }
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type_attr: "Tindex"
+  }
+  input_arg {
+    name: "num_upper"
+    type_attr: "Tindex"
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindex"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "MatrixDeterminant"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 36e0bab1b1..2cd8d8a03b 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -12348,11 +12348,11 @@ op {
   }
   input_arg {
     name: "num_lower"
-    type: DT_INT64
+    type_attr: "Tindex"
   }
   input_arg {
     name: "num_upper"
-    type: DT_INT64
+    type_attr: "Tindex"
   }
   output_arg {
     name: "band"
@@ -12362,6 +12362,19 @@ op {
     name: "T"
     type: "type"
   }
+  attr {
+    name: "Tindex"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "MatrixDeterminant"
-- 
GitLab


From df344949494003ffa6eb8d9cb777558128436dc6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 09:55:53 -0800
Subject: [PATCH 527/966] Add shape inference for outside_compilation graph
 rewrite. Pull out enough of the graph to enable inference of the shape of a
 SendFromHost Op once the shape of corresponding RecvAtHost Ops are known.

PiperOrigin-RevId: 184153187
---
 .../jit/encapsulate_subgraphs_pass.cc         | 647 ++++++++++++++---
 .../jit/encapsulate_subgraphs_pass_test.cc    | 682 ++++++++++++++----
 tensorflow/core/framework/function.cc         |  22 +-
 tensorflow/core/framework/function.h          |  15 +-
 4 files changed, 1125 insertions(+), 241 deletions(-)

diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 0de163d3a8..8edae9fc9c 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -30,12 +30,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -141,8 +143,7 @@ struct NodeSlot {
 // everything to use it.
 static const char* const kArgOp = "_Arg";
 static const char* const kRetValOp = "_Retval";
-static const char* const kSendToHostOp = "_XlaSendToHost";
-static const char* const kRecvFromHostOp = "_XlaRecvFromHost";
+static const char* const kHostComputeOp = "_XlaHostCompute";
 static const char* const kSendFromHostOp = "_XlaSendFromHost";
 static const char* const kRecvAtHostOp = "_XlaRecvAtHost";
 
@@ -171,7 +172,8 @@ class Encapsulator {
 
   // Write a copy of the input graph to 'graph_out', where the subgraphs are
   // replaced with calls to the new functions.
-  Status BuildOutputGraph(bool parallel_checking, Graph* graph_out);
+  Status BuildOutputGraph(bool parallel_checking, Graph* graph_out,
+                          FunctionLibraryDefinition* library);
 
  private:
   // A subgraph of the input, all marked with a common 'group_attribute'
@@ -201,21 +203,29 @@ class Encapsulator {
   //     ..             .
   //  RAH -->  C  --> SFH
   //
-  // The compiled cluster is as follows. STH is a SendToHost node which is the
-  // source of a channel to the RAH node above. RFH is a RecvFromHost node which
-  // is the destination of a channel from the SFH node above. There is a control
-  // edge that ensures RFH follows STH, which is used in shape inference to
-  // ensure that the shapes on the STH host channel are known before the RFH
-  // channel is compiled.
+  // The compiled cluster is as follows. HC is a HostCompute node which is the
+  // source of a channel to the RAH node above and the destination of a channel
+  // from the SFH node above.
   //
-  //  Arg  --> B  --> STH  ..>  RFH  --> D --> Retval
+  //  Arg  --> B  --> HC  --> D --> Retval
   //
-  // The channels STH/RAH and SFH/RFH each transmit a tuple, so there is at most
-  // one RAH and SFH in each compiled cluster. This design is preferred over
-  // adding separate Arg/Retval nodes for each transmitted value because it
-  // simplifies the host code that would like to limit communication between
-  // host and device and, e.g., raise only one interrupt per channel rather than
-  // one per transmitted value.
+  // The channels HC/RAH and SFH/HC each transmit multiple tensors, so there is
+  // at most one RAH and SFH in each outside_compilation cluster. This design is
+  // preferred over adding separate Arg/Retval nodes for each transmitted value
+  // because it allows optimizations to the host code that would like to limit
+  // communication between host and device and, e.g., raise only one interrupt
+  // per channel rather than one per transmitted value.
+  //
+  // The shapes of the outputs from the HC node in general cannot be determined
+  // until the shapes of its inputs are known at compile time, since e.g.,
+  // above, the shape of C's outputs aren't known until the shape of its inputs
+  // are known. If the shapes of the HC's outputs can be determined during the
+  // rewrite, they are stored in the node's 'shapes' attr. Otherwise a minimal
+  // graph is stored in the shape_inference_graph attr. This graph can be used
+  // when compiling the HC Op to determined the shape of the SFH inputs given
+  // the shapes of any ancestor RAH outputs. If it can be determined that the
+  // shape of the SFH inputs will not be inferrable even once the shapes of the
+  // RAH outputs are known, an error is returned by the rewriter.
   class Subgraph {
    public:
     // Creates a graph to build the subgraph in, if it doesn't already exist,
@@ -246,6 +256,10 @@ class Encapsulator {
         const std::unordered_map<const Node*, Node*>& node_images,
         Graph* graph_out);
 
+    // Returns the names of all the outside_compilation subgraphs in this
+    // Subgraph.
+    void GetOutsideCompilationSubgraphNames(std::vector<string>* names) const;
+
     // Returns the Node that inputs to the function should be wired up to.
     Node* GetCallNodeForInputs() const;
 
@@ -305,15 +319,9 @@ class Encapsulator {
     void RecordOutsideCompilationOutputOrControl(
         const string& outside_compilation_id, const Edge* edge);
 
-    // Adds the SendToHost nodes for each outside_compilation subgraph once the
-    // edges have all been recorded via RecordOutsideCompilationInputOrControl.
-    Status AddSendsToOutsideCompilation(
-        const std::unordered_map<const Node*, Node*>& node_images);
-
-    // Adds the RecvFromHost nodes for each outside_compilation subgraph once
-    // the edges have all been recorded via
-    // RecordOutsideCompilationOutputOrControl.
-    Status AddRecvsFromOutsideCompilation(
+    // Adds the HostCompute nodes for each outside_compilation subgraph.
+    Status AddHostComputes(
+        const string& subgraph_name,
         const std::unordered_map<const Node*, Node*>& node_images);
 
     // Creates the sequencer node if it doesn't exist, adding it to graph_out.
@@ -323,10 +331,16 @@ class Encapsulator {
     // all the downstream nodes of call_node_outputs.
     void ConnectSequencerToOutputs(Graph* graph_out);
 
+    Status AddShapeInferenceInfo(
+        const string& outside_compilation_subgraph_name,
+        const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph);
+
+    Status ReplaceFunctionDef(FunctionLibraryDefinition* library);
+
    private:
     struct OutsideCompilationSubgraph {
       // Map from source (producer node/slot) tensors in the original graph to
-      // input index (slot number in the SendToHost/RecvAtHost nodes that will
+      // input index (slot number in the HostCompute/RecvAtHost nodes that will
       // be created) for the outside_compilation subgraph.
       std::unordered_map<NodeSlot, int, NodeSlot::Hasher> inputs;
 
@@ -335,14 +349,14 @@ class Encapsulator {
       // outside_compilation subgraph. These are recorded by
       // RecordOutsideCompilationInputOrControl while walking all the subgraph
       // edges, and lifted control edges within the subgraph are added by
-      // AddSendsToOutsideCompilation once the _SendToHost node has been
+      // AddSendsToOutsideCompilation once the _HostCompute node has been
       // created. The matching control edge from _RecvAtHost to the
       // destination is added by CopyEdgeToOutputGraph.
       std::unordered_set<const Node*> control_inputs;
 
       // Maps from source (producer node/slot) and destination (consumer
       // node/slot) tensors in the original graph to output index (slot number
-      // in the SendFromHost/RecvFromHost nodes that will be created) for the
+      // in the SendFromHost/HostCompute nodes that will be created) for the
       // outside_compilation subgraph.
       std::unordered_map<NodeSlot, int, NodeSlot::Hasher> outputs_by_src;
       std::unordered_map<NodeSlot, int, NodeSlot::Hasher> outputs_by_dst;
@@ -352,13 +366,13 @@ class Encapsulator {
       // containing compiled subgraph. These are recorded by
       // RecordOutsideCompilationOutputOrControl while walking all the subgraph
       // edges, and lifted control edges within the subgraph are added by
-      // AddRecvsFromToOutsideCompilation once the _RecvFromHost node has been
+      // AddRecvsFromToOutsideCompilation once the _HostCompute node has been
       // created. The matching control edge from the source to _SendFromHost to
       // the destination is added by CopyEdgeToOutputGraph.
       std::unordered_set<const Node*> control_outputs;
 
-      // _SendToHost node in the subgraph. Not owned.
-      Node* send_to_host = nullptr;
+      // Name of the _HostCompute node in the subgraph.
+      string host_compute_name;
 
       // _RecvAtHost node in the output graph. Not owned.
       Node* recv_at_host = nullptr;
@@ -516,6 +530,59 @@ class Encapsulator {
       const std::unordered_map<const Node*, Node*>& node_images,
       bool parallel_checking, Graph* graph_out);
 
+  // Constructs a minimal shape inference graph that can be used to determine
+  // the shape of send_node at the time that the subgraph is compiled.
+  // recv_at_host_nodes contains the names of all the recv_at_host nodes that
+  // send_node might depend on. These recv_at_host nodes have shapes that are
+  // not known during the rewrite pass, but will be known at compile time.
+  //
+  // If the shapes of all the inputs to send_node can be determined during the
+  // rewrite pass, on exit graphdef_out is empty and the shapes are returned in
+  // static_shape_out. Otherwise graphdef_out contains a graph that can be used
+  // for shape inference at compile time, where all the source nodes of the
+  // graph are either constants with known shapes, or nodes named in
+  // recv_at_host_nodes.
+  //
+  // A non-OK status is returned if neither of the above conditions can be
+  // satisfied, e.g., because send_node depends on a node that doesn't have a
+  // registered shape inference function.
+  Status DoStaticShapeInferenceForOutsideCompilationSend(
+      const Graph& graph_in, const ShapeRefiner& shape_refiner,
+      const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
+      FunctionLibraryDefinition* library,
+      std::vector<TensorShapeProto>* static_shape_out,
+      std::unique_ptr<GraphDef>* graphdef_out);
+
+  // Makes a copy of graph containing only nodes that are ancestors of at least
+  // one node in send_from_host_nodes and store it in pruned_graph. On exit
+  // nodes_images contains a mapping from nodes in graph to nodes in
+  // pruned_graph. All functions in the copied graph are inlined.
+  Status MakePrunedGraphCopyAndInline(
+      const Graph& graph, const std::vector<Node*>& sink_nodes,
+      std::unique_ptr<Graph>* pruned_graph,
+      std::unordered_map<const Node*, Node*>* node_images,
+      FunctionLibraryDefinition* library);
+
+  // Makes a copy of graph containing only nodes that are ancestors of a
+  // send_from_host node in an outside_compilation subgraph, and store it in
+  // pruned_graph. Also perform shape inference on the pruned graph, using
+  // shape_refiner. On exit node_images contains a mapping from nodes in graph
+  // to nodes in pruned_graph.
+  Status MakeGraphForOutsideCompilationSends(
+      const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
+      ShapeRefiner* shape_refiner,
+      std::unordered_map<const Node*, Node*>* node_images,
+      FunctionLibraryDefinition* library);
+
+  // Performs static shape inference, as far as possible, for the send_from_host
+  // nodes in each outside_compilation subgraph. Where it is not possible to
+  // determine the shape statically, stores a serialized GraphDef in the
+  // HostCompute 'shape_inference_graph' attr, to be used at compile time for
+  // final inference. If the shapes are known statically they are stored in the
+  // HostCompute 'shapes' attr.
+  Status GetShapeInfoForOutsideCompilationSends(
+      Graph* graph_out, FunctionLibraryDefinition* library);
+
   const string group_attribute_;
   const string outside_compilation_attribute_;
   const Graph* graph_in_;
@@ -682,16 +749,20 @@ void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl(
   }
 }
 
-Status Encapsulator::Subgraph::AddSendsToOutsideCompilation(
+Status Encapsulator::Subgraph::AddHostComputes(
+    const string& subgraph_name,
     const std::unordered_map<const Node*, Node*>& node_images) {
   for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) {
     const string& oc_subgraph_name = oc_subgraph_iter.first;
     OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second;
-    if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty()) {
-      // Build a _SendToHost node sending all the args of the appropriate
-      // types.
-      std::vector<DataType> dtypes(oc_subgraph.inputs.size(), DT_INVALID);
+    if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() ||
+        !oc_subgraph.outputs_by_src.empty() ||
+        !oc_subgraph.control_outputs.empty()) {
+      // Build a _HostCompute node.
       std::vector<NodeDefBuilder::NodeOut> inputs(oc_subgraph.inputs.size());
+      std::vector<DataType> input_dtypes(oc_subgraph.inputs.size(), DT_INVALID);
+      std::vector<DataType> output_dtypes(oc_subgraph.outputs_by_src.size(),
+                                          DT_INVALID);
 
       for (const auto& input_src : oc_subgraph.inputs) {
         const Node* src_node = input_src.first.node;
@@ -700,94 +771,64 @@ Status Encapsulator::Subgraph::AddSendsToOutsideCompilation(
         int input_index = input_src.second;
 
         DataType dtype = src_node->output_type(src_slot);
-        dtypes[input_index] = dtype;
         inputs[input_index].Reset(src_image->name(), src_slot, dtype);
+        input_dtypes[input_index] = dtype;
       }
 
-      NodeDef send_def;
-      NodeDefBuilder builder(
-          strings::StrCat("outside_compilation_", oc_subgraph_name, "_send"),
-          kSendToHostOp);
-      builder.Attr("dtypes", dtypes);
+      for (const auto& output : oc_subgraph.outputs_by_src) {
+        DataType dtype = output.first.dtype;
+        int output_index = output.second;
+        output_dtypes[output_index] = dtype;
+      }
+
+      NodeDef host_compute_def;
+      NodeDefBuilder builder(strings::StrCat("outside_compilation_",
+                                             oc_subgraph_name, "_host_compute"),
+                             kHostComputeOp);
       builder.Input(inputs);
-      Status s = builder.Finalize(&send_def);
+      builder.Attr("Tinputs", input_dtypes);
+      builder.Attr("Toutputs", output_dtypes);
+      builder.Attr("key",
+                   strings::StrCat("host_compute_channel_", subgraph_name, "_",
+                                   oc_subgraph_name));
+      Status s = builder.Finalize(&host_compute_def);
       if (!s.ok()) return s;
 
-      oc_subgraph.send_to_host = graph_->AddNode(send_def, &s);
+      Node* host_compute = graph_->AddNode(host_compute_def, &s);
       if (!s.ok()) return s;
+      oc_subgraph.host_compute_name = host_compute->name();
 
-      // Connect the _SendToHost node to its producers in the subgraph.
+      // Connect the _HostCompute node to its producers in the subgraph.
       for (auto& input_src : oc_subgraph.inputs) {
         const Node* src_node = input_src.first.node;
         Node* src_image = node_images.at(src_node);
         int src_slot = input_src.first.slot;
         int input_index = input_src.second;
-        graph_->AddEdge(src_image, src_slot, oc_subgraph.send_to_host,
-                        input_index);
+        graph_->AddEdge(src_image, src_slot, host_compute, input_index);
       }
 
-      // Connect the _SendToHost node to its control edge producers in the
+      // Connect the _HostCompute node to its control edge producers in the
       // subgraph.
       for (const auto& src_node : oc_subgraph.control_inputs) {
         Node* src_image = node_images.at(src_node);
-        graph_->AddControlEdge(src_image, oc_subgraph.send_to_host);
+        graph_->AddControlEdge(src_image, host_compute);
       }
-    }
-  }
-
-  return Status::OK();
-}
-
-Status Encapsulator::Subgraph::AddRecvsFromOutsideCompilation(
-    const std::unordered_map<const Node*, Node*>& node_images) {
-  for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) {
-    const string& oc_subgraph_name = oc_subgraph_iter.first;
-    OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second;
-    if (!oc_subgraph.outputs_by_src.empty() ||
-        !oc_subgraph.control_outputs.empty()) {
-      // Build a _RecvFromHost node producing all the outputs of the appropriate
-      // types.
-      std::vector<DataType> dtypes(oc_subgraph.outputs_by_src.size(),
-                                   DT_INVALID);
-
-      for (const auto& output : oc_subgraph.outputs_by_src) {
-        DataType dtype = output.first.dtype;
-        int output_index = output.second;
-        dtypes[output_index] = dtype;
-      }
-
-      NodeDef recv_def;
-      NodeDefBuilder builder(
-          strings::StrCat("outside_compilation_", oc_subgraph_name, "_recv"),
-          kRecvFromHostOp);
-      builder.Attr("dtypes", dtypes);
-      Status s = builder.Finalize(&recv_def);
-      if (!s.ok()) return s;
 
-      Node* recv = graph_->AddNode(recv_def, &s);
-      if (!s.ok()) return s;
-
-      // Connect the consumers in the subgraph to the _RecvFromHost node.
+      // Connect the consumers in the subgraph to the _HostCompute node.
       for (const auto& output : oc_subgraph.outputs_by_dst) {
         const Node* dst_node = output.first.node;
         Node* dst_image = node_images.at(dst_node);
         int dst_slot = output.first.slot;
         int output_index = output.second;
 
-        graph_->AddEdge(recv, output_index, dst_image, dst_slot);
+        graph_->AddEdge(host_compute, output_index, dst_image, dst_slot);
       }
 
-      // Connect the control edge consumers in the subgraph to the _RecvFromHost
+      // Connect the control edge consumers in the subgraph to the _HostCompute
       // node.
       for (const auto& dst_node : oc_subgraph.control_outputs) {
         Node* dst_image = node_images.at(dst_node);
-        graph_->AddControlEdge(recv, dst_image);
-      }
-
-      // Add a control edge in the subgraph so that the _SendToHost node, if
-      // any, is compiled before the _RecvFromHost node.
-      if (oc_subgraph.send_to_host != nullptr) {
-        graph_->AddControlEdge(oc_subgraph.send_to_host, recv);
+        graph_->AddControlEdge(host_compute, dst_image);
       }
     }
   }
@@ -882,6 +923,63 @@ Status Encapsulator::Subgraph::BuildFunctionDef(
   return Status::OK();
 }
 
+Status Encapsulator::Subgraph::AddShapeInferenceInfo(
+    const string& outside_compilation_subgraph_name,
+    const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph) {
+  OutsideCompilationSubgraph& oc_subgraph =
+      outside_compilation_subgraphs_.at(outside_compilation_subgraph_name);
+
+  Node* host_compute = nullptr;
+  for (Node* n : graph_->nodes()) {
+    if (n->name() == oc_subgraph.host_compute_name) {
+      host_compute = n;
+      break;
+    }
+  }
+  if (host_compute == nullptr) {
+    return errors::InvalidArgument(
+        "After rewriting subgraph ", outside_compilation_subgraph_name,
+        " there is no HostCompute Op for outside compilation subgraph ",
+        oc_subgraph.host_compute_name);
+  }
+
+  if (inference_graph == nullptr) {
+    host_compute->AddAttr("shape_inference_graph", "");
+    host_compute->AddAttr("shapes", shapes);
+  } else {
+    string serialized_graph;
+    if (!inference_graph->SerializeToString(&serialized_graph)) {
+      return errors::Internal(
+          "Failed to serialize graph for outside compilation subgraph ",
+          oc_subgraph.host_compute_name);
+    }
+    host_compute->AddAttr("shape_inference_graph", serialized_graph);
+    host_compute->AddAttr("shapes", std::vector<TensorShapeProto>());
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::Subgraph::ReplaceFunctionDef(
+    FunctionLibraryDefinition* library) {
+  const string& name = call_node_def_.name();
+
+  FunctionDef fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef));
+
+  if (VLOG_IS_ON(1)) {
+    VLOG(2) << "Replace function def " << name;
+    dump_graph::DumpGraphToFile(
+        strings::StrCat("replace_encapsulate_fdef_graph_", name), *graph_,
+        library);
+    dump_graph::DumpFunctionDefToFile(
+        strings::StrCat("replace_encapsulate_fdef_", name), fdef);
+  }
+
+  TF_RETURN_IF_ERROR(library->RemoveFunction(name));
+  TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+  return Status::OK();
+}
+
 Status Encapsulator::Subgraph::BuildParallelCheckOp(
     const std::unordered_map<const Node*, Node*>& node_images,
     Graph* graph_out) {
@@ -980,7 +1078,9 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
   NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
                                          "_", oc_subgraph_name, "_recv"),
                          kRecvAtHostOp);
-  builder.Attr("dtypes", dtypes);
+  builder.Attr("Toutputs", dtypes);
+  builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
+                                      "_", oc_subgraph_name));
   Status s = builder.Finalize(&recv_def);
   if (!s.ok()) return s;
 
@@ -1020,7 +1120,9 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
   NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
                                          "_", oc_subgraph_name, "_send"),
                          kSendFromHostOp);
-  builder.Attr("dtypes", dtypes);
+  builder.Attr("Tinputs", dtypes);
+  builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
+                                      "_", oc_subgraph_name));
   builder.Input(inputs);
   Status s = builder.Finalize(&send_def);
   if (!s.ok()) return s;
@@ -1062,6 +1164,13 @@ Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes(
   return Status::OK();
 }
 
+void Encapsulator::Subgraph::GetOutsideCompilationSubgraphNames(
+    std::vector<string>* names) const {
+  for (auto& entry : outside_compilation_subgraphs_) {
+    names->push_back(entry.first);
+  }
+}
+
 Status Encapsulator::GetFunctionNameAttr(
     Node const* node, string* attr, string* outside_compilation_attr) const {
   Status s = GetNodeAttr(node->attrs(), group_attribute_, attr);
@@ -1220,8 +1329,7 @@ Status Encapsulator::SplitIntoSubgraphs() {
   // single input and output node for it.
   for (auto& entry : subgraphs_) {
     Subgraph& subgraph = entry.second;
-    TF_RETURN_IF_ERROR(subgraph.AddSendsToOutsideCompilation(node_images));
-    TF_RETURN_IF_ERROR(subgraph.AddRecvsFromOutsideCompilation(node_images));
+    TF_RETURN_IF_ERROR(subgraph.AddHostComputes(entry.first, node_images));
   }
 
   MarkGuaranteedConstants(*graph_in_, src_arg_pairs);
@@ -1509,8 +1617,344 @@ Status Encapsulator::AddEdgesToOutputGraph(
   return Status::OK();
 }
 
-Status Encapsulator::BuildOutputGraph(bool parallel_checking,
-                                      Graph* graph_out) {
+namespace {
+
+// Adds a dummy Const node to graph_out. The "constant" has the type of
+// data_type and the shape indicated in 'shape'. The dummy node is not a valid
+// Const node because it does not have any value defined, but this doesn't
+// matter because it will only be used subsequently for shape inference. (It
+// would be possible to add a switch statement over data_type to create a value
+// for the constant, but that would entail maintaining the logic as new types
+// are added, and is not necessary.)
+Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape,
+                         Graph* graph_out) {
+  TensorProto dummy_proto;
+  dummy_proto.set_dtype(data_type);
+  *dummy_proto.mutable_tensor_shape() = shape;
+  // Don't set any value field in the proto, since it is only going to be used
+  // for shape inference.
+
+  GraphDefBuilder::Options options(graph_out, /*status=*/nullptr);
+  NodeBuilder node_builder(options.GetNameForOp("KnownShape"), "Const",
+                           options.op_registry());
+  node_builder.Attr("dtype", data_type).Attr("value", dummy_proto);
+  return options.FinalizeBuilder(&node_builder);
+}
+
+// Adds a copy of node_in to graph_out and adds the mapping to
+// copied_node_images.
+Status CopyShapeInferenceNodeToGraph(
+    Node* node_in, const Node* send_node,
+    const std::unordered_map<Node*, Node*>& dummy_node_images,
+    FunctionLibraryDefinition* library,
+    std::unordered_map<Node*, Node*>* copied_node_images, Graph* graph_out) {
+  // Once all the ancestor nodes have been added to graph_out, add this node
+  // and connect it to its ancestors.
+  Node* node_out = graph_out->CopyNode(node_in);
+  (*copied_node_images)[node_in] = node_out;
+  // Don't bother to build the shape inference graph if there's a node with no
+  // shape inference function, since it would just result in an error later at
+  // compile time.
+  const OpRegistrationData* op_reg_data;
+  TF_RETURN_IF_ERROR(library->LookUp(node_in->type_string(), &op_reg_data));
+  if (op_reg_data->shape_inference_fn == nullptr) {
+    return errors::InvalidArgument(
+        "Shape inference is not possible for outside_compilation "
+        "SendFromHost node ",
+        send_node->name(), " because it depends on node ", node_in->name(),
+        " which does not have a shape inference function registered.");
+  }
+  // Add all the edges to the newly copied node.
+  for (const Edge* in_edge : node_in->in_edges()) {
+    if (!in_edge->IsControlEdge()) {
+      Node* src = in_edge->src();
+      const auto iter = dummy_node_images.find(src);
+      if (iter == dummy_node_images.end()) {
+        // The src is a copied node so use the original output port.
+        graph_out->AddEdge((*copied_node_images)[in_edge->src()],
+                           in_edge->src_output(), node_out,
+                           in_edge->dst_input());
+      } else {
+        // The src is a dummy node so use output port 0.
+        graph_out->AddEdge(iter->second, 0, node_out, in_edge->dst_input());
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
+    const Graph& graph_in, const ShapeRefiner& shape_refiner,
+    const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
+    FunctionLibraryDefinition* library,
+    std::vector<TensorShapeProto>* static_shape_out,
+    std::unique_ptr<GraphDef>* graphdef_out) {
+  // Maps from nodes in graph_in to nodes in graph_out.
+  //
+  // When an edge has fully defined shape the source node in graph_in is
+  // replaced in graph_out by a dummy constant node. The mapping from nodes
+  // in graph_in to dummy nodes is stored in dummy_node_images.
+  //
+  // When a node in graph_in has at least one ancestor that doesn't have fully
+  // defined shape, it is copied into graph_out. The mapping from nodes in
+  // graph_in to copied nodes is stored in copied_node_images.
+  //
+  // The two types of node are treated differently because, when adding edges to
+  // graph_out, an output from a dummy node always uses port 0, whereas an
+  // output from a copied node uses the same port that was used in graph_in.
+  std::unordered_map<Node*, Node*> dummy_node_images;
+  std::unordered_map<Node*, Node*> copied_node_images;
+
+  std::unique_ptr<Graph> graph_out(new Graph(graph_in.op_registry()));
+  graph_out->set_versions(graph_in.versions());
+  static_shape_out->resize(send_node->num_inputs());
+
+  // We don't use the standard ReverseDFS because we want to cut off traversal
+  // whenever we find an output with fully defined shape.
+  // TODO(misard) make this work properly in the presence of control flow.
+  struct Work {
+    Node* node;
+    bool leave;  // Are we entering or leaving node?
+  };
+  std::vector<Work> stack({{send_node, false}});
+  std::vector<bool> visited(graph_in.num_node_ids(), false);
+  while (!stack.empty()) {
+    Work w = stack.back();
+    stack.pop_back();
+    Node* n = w.node;
+
+    if (w.leave) {
+      TF_RETURN_IF_ERROR(CopyShapeInferenceNodeToGraph(
+          n, send_node, dummy_node_images, library, &copied_node_images,
+          graph_out.get()));
+    } else {
+      if (visited[n->id()]) continue;
+      visited[n->id()] = true;
+
+      // Arrange to revisit when all done with all inputs.
+      stack.push_back(Work{n, true});
+
+      bool has_parent_with_unknown_shape = false;
+      for (const Edge* in_edge : n->in_edges()) {
+        if (!in_edge->IsControlEdge()) {
+          Node* src_node = in_edge->src();
+          int src_port = in_edge->src_output();
+          shape_inference::InferenceContext* context =
+              shape_refiner.GetContext(src_node);
+          shape_inference::ShapeHandle shape = context->output(src_port);
+          if (context->FullyDefined(shape)) {
+            // This ancestor has known shape, so instead of adding it to the
+            // stack, add a dummy node with that shape to graph_out and
+            // continue.
+            TensorShapeProto proto;
+            context->ShapeHandleToProto(shape, &proto);
+            dummy_node_images[src_node] = AddDummyShapedNode(
+                src_node->output_type(src_port), proto, graph_out.get());
+            if (n == send_node) {
+              (*static_shape_out)[in_edge->dst_input()] = proto;
+            }
+          } else {
+            if (!visited[src_node->id()]) {
+              has_parent_with_unknown_shape = true;
+              stack.push_back({src_node, false});
+            }
+          }
+        }
+      }
+      if (!has_parent_with_unknown_shape) {
+        if (n == send_node) {
+          // The shapes of all the inputs to send_node are statically known. We
+          // won't have to do any inference at compile time so return now: the
+          // shapes were stored in static_shape_out above.
+          graphdef_out->reset();
+          return Status::OK();
+        } else {
+          // Any shape that is being processed is either the original send node
+          // or has at least one output with statically-unknown shape. If the
+          // latter and it doesn't have any inputs with statically-unknown
+          // shape, then check that it is of the recv nodes that we can fill in
+          // the shape of at run-time later. If it isn't one of those, then we
+          // won't have any additional knowledge at compile time, so we already
+          // know we won't be able to do shape inference and we can return an
+          // error now.
+          if (recv_at_host_nodes.find(n->name()) == recv_at_host_nodes.end()) {
+            return errors::InvalidArgument(
+                "Shape inference is not possible for outside_compilation "
+                "SendFromHost node ",
+                send_node->name(), " because shape of node ", n->name(),
+                " will not be known at compilation time.");
+          }
+        }
+      }
+    }
+  }
+
+  graphdef_out->reset(new GraphDef());
+  graph_out->ToGraphDef(graphdef_out->get());
+
+  return Status::OK();
+}
+
+Status Encapsulator::MakePrunedGraphCopyAndInline(
+    const Graph& graph, const std::vector<Node*>& sink_nodes,
+    std::unique_ptr<Graph>* pruned_graph,
+    std::unordered_map<const Node*, Node*>* node_images,
+    FunctionLibraryDefinition* library) {
+  // First copy all ancestor nodes of sink_nodes into a new graph.
+  pruned_graph->reset(new Graph(library));
+  (*pruned_graph)->set_versions(graph.versions());
+  ReverseDFSFrom(graph, sink_nodes,
+                 /*enter=*/nullptr,
+                 /*leave=*/[&](Node* n) {
+                   if (!n->IsSource()) {
+                     Node* copied = (*pruned_graph)->CopyNode(n);
+                     node_images->emplace(n, copied);
+                   }
+                 });
+
+  // Add all the edges between copied nodes.
+  for (auto entry : *node_images) {
+    const Node* orig = entry.first;
+    Node* image = entry.second;
+    for (const Edge* out_edge : orig->out_edges()) {
+      auto iter = node_images->find(out_edge->dst());
+      if (iter != node_images->end()) {
+        // The source and destination are both in the copied graph.
+        (*pruned_graph)
+            ->AddEdge(image, out_edge->src_output(), iter->second,
+                      out_edge->dst_input());
+      }
+    }
+  }
+
+  // Find all the function call nodes, and inline them.
+  std::vector<Node*> function_nodes;
+  for (auto node : (*pruned_graph)->nodes()) {
+    const OpRegistrationData* op_reg_data;
+    TF_RETURN_IF_ERROR(library->LookUp(node->type_string(), &op_reg_data));
+    if (op_reg_data->is_function_op) {
+      function_nodes.push_back(node);
+    }
+  }
+  for (auto node : function_nodes) {
+    VLOG(2) << "Inlining function " << node->name();
+    const FunctionDef* fdef = library->Find(node->type_string());
+    if (fdef == nullptr) {
+      return errors::Internal("Failed to find function ", node->type_string(),
+                              " in function library.");
+    }
+    FunctionBody* fbody = nullptr;
+    TF_RETURN_IF_ERROR(
+        FunctionDefToBodyHelper(*fdef, node->attrs(), library,
+                                [library](const string& op, const OpDef** sig) {
+                                  return library->LookUpOpDef(op, sig);
+                                },
+                                &fbody));
+    InlineFunctionBody(*library, pruned_graph->get(), node, fbody);
+    delete fbody;
+  }
+
+  return Status::OK();
+}
+
+Status Encapsulator::MakeGraphForOutsideCompilationSends(
+    const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
+    ShapeRefiner* shape_refiner,
+    std::unordered_map<const Node*, Node*>* node_images,
+    FunctionLibraryDefinition* library) {
+  // Find all the send_from_host nodes in all subgraphs, to use as roots for the
+  // pruning.
+  std::vector<Node*> send_from_host_nodes;
+  for (auto& subgraph_entry : subgraphs_) {
+    Subgraph& subgraph = subgraph_entry.second;
+    std::vector<string> outside_compilation_names;
+    subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names);
+    for (const auto& name : outside_compilation_names) {
+      Node* send_node = subgraph.GetSendFromHostNode(name);
+      if (send_node != nullptr) {
+        send_from_host_nodes.push_back(send_node);
+      }
+    }
+  }
+
+  // Make a copy of all the graph nodes needed to evaluate the send_from_host
+  // nodes, inlining any functions as needed.
+  TF_RETURN_IF_ERROR(MakePrunedGraphCopyAndInline(
+      graph, send_from_host_nodes, pruned_graph, node_images, library));
+
+  // Perform shape inference on the pruned graph.
+  shape_refiner->set_require_shape_inference_fns(false);
+  FixupSourceAndSinkEdges(pruned_graph->get());
+  std::vector<Node*> post_order;
+  GetReversePostOrder(*(*pruned_graph), &post_order);
+  for (auto node : post_order) {
+    // Ignore the status returned by the shape_refiner. At this point we want
+    // the best effort shapes, even if no shape function is registered for a
+    // node.
+    Status status = shape_refiner->AddNode(node);
+    VLOG_IF(1, !status.ok()) << "Shape inference failed for node: " << status;
+  }
+
+  return Status::OK();
+}
+
+Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
+    Graph* graph_out, FunctionLibraryDefinition* library) {
+  std::unique_ptr<Graph> pruned_graph;
+  ShapeRefiner shape_refiner(graph_out->versions(), graph_out->op_registry());
+  std::unordered_map<const Node*, Node*> node_images;
+  TF_RETURN_IF_ERROR(MakeGraphForOutsideCompilationSends(
+      *graph_out, &pruned_graph, &shape_refiner, &node_images, library));
+
+  for (auto& subgraph_entry : subgraphs_) {
+    Subgraph& subgraph = subgraph_entry.second;
+    // Find all the recv_at_host nodes in this subgraph.
+    std::vector<string> outside_compilation_names;
+    subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names);
+    std::unordered_set<string> recv_at_host_names;
+    for (const auto& name : outside_compilation_names) {
+      Node* recv_node = subgraph.GetRecvAtHostNode(name);
+      if (recv_node != nullptr) {
+        recv_at_host_names.insert(recv_node->name());
+      }
+    }
+    // For each send_from_host node, do as much shape inference as possible
+    // without knowing the shape of the recv_at_host nodes, and store the
+    // result, along with enough information to complete the job at compile time
+    // once the recv_at_host shapes are known.
+    for (const auto& name : outside_compilation_names) {
+      Node* send_node = subgraph.GetSendFromHostNode(name);
+      std::vector<TensorShapeProto> static_shape;
+      std::unique_ptr<GraphDef> graphdef;
+      if (send_node != nullptr) {
+        TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend(
+            *pruned_graph, shape_refiner, recv_at_host_names,
+            node_images[send_node], library, &static_shape, &graphdef));
+        if (graphdef == nullptr) {
+          VLOG(2) << "Send node  " << send_node->name() << " shapes";
+          for (int i = 0; i < static_shape.size(); ++i) {
+            VLOG(2) << static_shape[i].DebugString();
+          }
+        } else {
+          VLOG(2) << "Send node " << send_node->name() << " graph\n"
+                  << graphdef->DebugString();
+        }
+      }
+      TF_RETURN_IF_ERROR(
+          subgraph.AddShapeInferenceInfo(name, static_shape, graphdef.get()));
+    }
+    if (!outside_compilation_names.empty()) {
+      TF_RETURN_IF_ERROR(subgraph.ReplaceFunctionDef(library));
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Encapsulator::BuildOutputGraph(bool parallel_checking, Graph* graph_out,
+                                      FunctionLibraryDefinition* library) {
   // Map from nodes in the input graph to nodes in the output graph.
   std::unordered_map<const Node*, Node*> node_images;
 
@@ -1522,6 +1966,9 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking,
   TF_RETURN_IF_ERROR(
       AddEdgesToOutputGraph(node_images, parallel_checking, graph_out));
 
+  TF_RETURN_IF_ERROR(
+      GetShapeInfoForOutsideCompilationSends(graph_out, library));
+
   return Status::OK();
 }
 
@@ -1545,7 +1992,7 @@ Status EncapsulateSubgraphsInFunctions(
   std::unique_ptr<Graph> out(new Graph(library));
   out->set_versions(graph_in.versions());
   TF_RETURN_IF_ERROR(
-      encapsulator.BuildOutputGraph(parallel_checking, out.get()));
+      encapsulator.BuildOutputGraph(parallel_checking, out.get(), library));
 
   *graph_out = std::move(out);
   return Status::OK();
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index b100861d5e..5032a0935d 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -29,17 +29,181 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+template <class Tkey, class Tvalue>
+bool EqualProtoMap(const proto2::Map<Tkey, Tvalue>& a,
+                   const proto2::Map<Tkey, Tvalue>& b,
+                   const std::function<string(const Tkey&)>& key_to_string,
+                   const std::function<string(const Tvalue&)>& value_to_string,
+                   const std::function<bool(const Tkey&, const Tvalue&,
+                                            const Tvalue&)>& compare,
+                   const string& map_name, string* diff) {
+  for (const auto& elt_a : a) {
+    const auto iter = b.find(elt_a.first);
+    if (iter == b.end()) {
+      if (diff) {
+        *diff = strings::StrCat(
+            map_name, " expected: contains element with key '",
+            key_to_string(elt_a.first), "' got: map has no such element");
+      }
+      return false;
+    }
+    if (!compare(elt_a.first, elt_a.second, iter->second)) {
+      if (diff) {
+        *diff = strings::StrCat(map_name, " expected: element with key '",
+                                key_to_string(elt_a.first), " has value '",
+                                value_to_string(elt_a.second), "' got: '",
+                                value_to_string(iter->second), "'");
+      }
+      return false;
+    }
+  }
+  for (const auto& elt_b : b) {
+    const auto iter = a.find(elt_b.first);
+    if (iter == a.end()) {
+      if (diff) {
+        *diff = strings::StrCat(map_name, " got: contains element with key '",
+                                key_to_string(elt_b.first),
+                                "' expected: map has no such element");
+      }
+      return false;
+    }
+  }
+  return true;
+}
+
+bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
+                          const string& diff_preamble, string* diff) {
+  if (a.op() != b.op()) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              ", expected op '", a.op(), "' got '", b.op());
+    }
+    return false;
+  }
+  if (a.device() != b.device()) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              ", expected device '", a.device(), "' got '",
+                              b.device());
+    }
+    return false;
+  }
+  if (a.input_size() != b.input_size()) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              ", expected ", a.input_size(), " inputs got ",
+                              b.input_size(), " expected:\n", a.DebugString(),
+                              "\ngot:\n", b.DebugString());
+    }
+    return false;
+  }
+  for (int i = 0; i < a.input_size(); ++i) {
+    if (a.input(i) != b.input(i)) {
+      if (diff) {
+        *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                                " input ", i, ", expected ", a.input(i),
+                                " got ", b.input(i), " expected:\n",
+                                a.DebugString(), "\ngot:\n", b.DebugString());
+      }
+      return false;
+    }
+  }
+  return EqualProtoMap<string, AttrValue>(
+      a.attr(), b.attr(), [](const string& s) { return s; },
+      [](const AttrValue& v) { return v.DebugString(); },
+      [](const string& key, const AttrValue& av, const AttrValue& bv) {
+        if (key == "shape_inference_graph") {
+          // Default serialization of GraphDef is unstable because maps don't
+          // serialize deterministically. Rather than go through the hoops to
+          // turn on deterministic serialization of this attr just for this
+          // test, add logic here to compare determinstically.
+          GraphDef ga;
+          if (!ga.ParseFromString(av.s())) {
+            return false;
+          }
+          GraphDef gb;
+          if (!gb.ParseFromString(bv.s())) {
+            return false;
+          }
+          return EqualGraphDef(ga, gb, nullptr);
+        } else {
+          return av.DebugString() == bv.DebugString();
+        }
+      },
+      strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()),
+      diff);
+}
+
 bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b,
                       string* diff) {
-  // TODO(phawkins) use a more sophisticated equality test.
-  if (a.DebugString() != b.DebugString()) {
+  if (a.signature().DebugString() != b.signature().DebugString()) {
     if (diff) {
-      *diff = strings::StrCat("Definition mismatch for function ",
+      *diff = strings::StrCat("Signature mismatch for function ",
                               a.signature().name(), ", expected:\n",
-                              a.DebugString(), "\ngot:\n", b.DebugString());
+                              a.signature().DebugString(), "\ngot:\n",
+                              b.signature().DebugString());
     }
     return false;
   }
+  if (!EqualProtoMap<string, AttrValue>(
+          a.attr(), b.attr(), [](const string& s) { return s; },
+          [](const AttrValue& v) { return v.DebugString(); },
+          [](const string& key, const AttrValue& av, const AttrValue& bv) {
+            return av.DebugString() == bv.DebugString();
+          },
+          strings::StrCat("attr mismatch for function ", a.signature().name()),
+          diff)) {
+    return false;
+  }
+  if (!EqualProtoMap<string, string>(
+          a.ret(), b.ret(), [](const string& s) { return s; },
+          [](const string& s) { return s; },
+          [](const string& key, const string& av, const string& bv) {
+            return av == bv;
+          },
+          strings::StrCat("ret mismatch for function ", a.signature().name()),
+          diff)) {
+    return false;
+  }
+  for (int i = 0; i < a.node_def_size(); ++i) {
+    bool found = false;
+    for (int j = 0; j < b.node_def_size(); ++j) {
+      if (a.node_def(i).name() == b.node_def(j).name()) {
+        if (!EqualFunctionNodeDef(
+                a.node_def(i), b.node_def(j),
+                strings::StrCat("Function ", a.signature().name()), diff)) {
+          return false;
+        }
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      if (diff) {
+        *diff = strings::StrCat("Function ", a.signature().name(),
+                                ", expected: has node '", a.node_def(i).name(),
+                                "' got: no node of that name");
+      }
+      return false;
+    }
+  }
+  for (int i = 0; i < b.node_def_size(); ++i) {
+    bool found = false;
+    for (int j = 0; j < a.node_def_size(); ++j) {
+      if (b.node_def(i).name() == a.node_def(j).name()) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      if (diff) {
+        *diff = strings::StrCat("Function ", a.signature().name(),
+                                ", got: has node '", b.node_def(i).name(),
+                                "' expected: no node of that name");
+      }
+      return false;
+    }
+  }
   return true;
 }
 
@@ -84,29 +248,64 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected,
 
 // TODO(misard): remove these fake registrations once there are real Ops to be
 // compiled.
-REGISTER_OP("_XlaSendToHost")
-    .Input("input: dtypes")
-    .Attr("dtypes: list(type) >= 0");
-
-REGISTER_OP("_XlaRecvFromHost")
-    .Output("output: dtypes")
-    .Attr("dtypes: list(type) >= 0");
+REGISTER_OP("_XlaHostCompute")
+    .Input("inputs: Tinputs")
+    .Output("outputs: Toutputs")
+    .Attr("Tinputs: list(type) >= 0")
+    .Attr("Toutputs: list(type) >= 0")
+    .Attr("key: string")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
 REGISTER_OP("_XlaSendFromHost")
-    .Input("input: dtypes")
-    .Attr("dtypes: list(type) >= 0");
+    .Input("input: Tinputs")
+    .Attr("Tinputs: list(type) >= 0")
+    .Attr("key: string")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
 REGISTER_OP("_XlaRecvAtHost")
-    .Output("output: dtypes")
-    .Attr("dtypes: list(type) >= 0");
-
-REGISTER_OP("InputTest").Output("o: float");
-
-REGISTER_OP("UnaryTest").Input("a: float").Output("o: float");
+    .Output("output: Toutputs")
+    .Attr("Toutputs: list(type) >= 0")
+    .Attr("key: string")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
+
+REGISTER_OP("InputTest")
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    });
+
+REGISTER_OP("InputTestShaped")
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Vector(2));
+      return Status::OK();
+    });
+
+REGISTER_OP("UnaryTest")
+    .Input("a: float")
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      ::tensorflow::shape_inference::ShapeHandle o;
+      TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o));
+      c->set_output(0, o);
+      return Status::OK();
+    });
 REGISTER_OP("BinaryTest")
     .Input("a: float")
     .Input("b: float")
-    .Output("o: float");
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      ::tensorflow::shape_inference::ShapeHandle o;
+      TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o));
+      c->set_output(0, o);
+      return Status::OK();
+    });
+REGISTER_OP("BinaryTest2")
+    .Input("a: float")
+    .Input("b: float")
+    .Output("o: float")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
 REGISTER_OP("AddNLikeTest")
     .Input("inputs: N * T")
@@ -124,22 +323,48 @@ Node* Input(const GraphDefBuilder::Options& opts) {
   return ops::SourceOp("InputTest", opts);
 }
 
-Node* RecvAtHost(const gtl::ArraySlice<DataType>& dtypes,
+Node* InputShaped(const GraphDefBuilder::Options& opts) {
+  return ops::SourceOp("InputTestShaped", opts);
+}
+
+Node* KnownShape(const gtl::ArraySlice<int>& shape,
+                 const GraphDefBuilder::Options& opts) {
+  if (opts.HaveError()) return nullptr;
+  NodeBuilder node_builder(opts.GetNameForOp("Const"), "Const",
+                           opts.op_registry());
+  TensorProto value;
+  value.set_dtype(DT_FLOAT);
+  for (int dim : shape) {
+    value.mutable_tensor_shape()->add_dim()->set_size(dim);
+  }
+  return opts.WithAttr("value", value)
+      .WithAttr("dtype", DT_FLOAT)
+      .FinalizeBuilder(&node_builder);
+}
+
+Node* RecvAtHost(const string& key, const gtl::ArraySlice<DataType>& dtypes,
                  const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp("_XlaRecvAtHost"),
                            "_XlaRecvAtHost", opts.op_registry());
-  return opts.WithAttr("dtypes", dtypes).FinalizeBuilder(&node_builder);
+  return opts.WithAttr("Toutputs", dtypes)
+      .WithAttr("key", key)
+      .FinalizeBuilder(&node_builder);
 }
 
-Node* SendFromHost(const std::vector<ops::NodeOut>& inputs,
-                   const gtl::ArraySlice<DataType>& dtypes,
+Node* SendFromHost(const string& key, const std::vector<ops::NodeOut>& inputs,
                    const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp("_XlaSendFromHost"),
                            "_XlaSendFromHost", opts.op_registry());
   node_builder.Input(inputs);
-  return opts.WithAttr("dtypes", dtypes).FinalizeBuilder(&node_builder);
+  std::vector<DataType> dtypes;
+  for (const auto& node : inputs) {
+    dtypes.push_back(node.dt);
+  }
+  return opts.WithAttr("key", key)
+      .WithAttr("Tinputs", dtypes)
+      .FinalizeBuilder(&node_builder);
 }
 
 Node* Unary(ops::NodeOut a, const GraphDefBuilder::Options& opts) {
@@ -151,6 +376,11 @@ Node* Binary(ops::NodeOut a, ops::NodeOut b,
   return ops::BinaryOp("BinaryTest", std::move(a), std::move(b), opts);
 }
 
+Node* BinaryUnknownShape(ops::NodeOut a, ops::NodeOut b,
+                         const GraphDefBuilder::Options& opts) {
+  return ops::BinaryOp("BinaryTest2", std::move(a), std::move(b), opts);
+}
+
 Node* AddNLike(const std::vector<ops::NodeOut>& inputs,
                const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
@@ -576,6 +806,21 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  string shape_string_expected;
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape_graph;
+    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
+    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+  }
+
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
@@ -584,19 +829,18 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
           {{"c"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}, {"C"}},
           {{"F"},
            "BinaryTest",
-           {"C:o:0", "outside_compilation_O1_recv:output:0"},
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
            {},
-           {"outside_compilation_O1_recv"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {"C:o:0", "c:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
            {"c"}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O1_send"}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -612,11 +856,11 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
     Node* call = b2.opts().FinalizeBuilder(&node_builder);
 
     Node* recv =
-        RecvAtHost({DT_FLOAT, DT_FLOAT},
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
                      b2.opts().WithName("E").WithControlInputs({recv, b}));
-    Node* send = SendFromHost({e}, {DT_FLOAT},
+    Node* send = SendFromHost("host_compute_channel_F1_O1", {e},
                               b2.opts()
                                   .WithName("outside_compilation_F1_O1_send")
                                   .WithControlInput(e));
@@ -674,37 +918,71 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  string shape_string_expected_1;
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape1.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape1.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape1.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape1_graph;
+    TF_EXPECT_OK(shape1.ToGraphDef(&shape1_graph));
+    EXPECT_TRUE(shape1_graph.SerializeToString(&shape_string_expected_1));
+  }
+
+  string shape_string_expected_2;
+  {
+    GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
+                     shape2.opts().WithName("E"));
+    Node* recv2 =
+        RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT},
+                   shape2.opts().WithName("outside_compilation_F1_O2_recv"));
+    Node* h = Binary(ops::NodeOut(recv2, 0), e, shape2.opts().WithName("H"));
+    SendFromHost("host_compute_channel_F1_O2", {h},
+                 shape2.opts().WithName("outside_compilation_F1_O2_send"));
+    GraphDef shape2_graph;
+    TF_EXPECT_OK(shape2.ToGraphDef(&shape2_graph));
+    EXPECT_TRUE(shape2_graph.SerializeToString(&shape_string_expected_2));
+  }
+
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}},
-          {{"I"}, "UnaryTest", {"outside_compilation_O2_recv:output:0"}},
+          {{"I"},
+           "UnaryTest",
+           {"outside_compilation_O2_host_compute:outputs:0"}},
           {{"F"},
            "BinaryTest",
-           {"C:o:0", "outside_compilation_O1_recv:output:0"},
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
            {},
-           {"outside_compilation_O1_recv"}},
-          {{"outside_compilation_O2_send"},
-           "_XlaSendToHost",
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O2_host_compute"},
+           "_XlaHostCompute",
            {"D:o:0", "F:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O2"},
+            {"shape_inference_graph", shape_string_expected_2},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
            {"F"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {"C:o:0", "D:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected_1},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
            {"D"}},
-          {{"outside_compilation_O2_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O2_send"}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O1_send"}},
       },
       {{"i_0_retval", "I:o:0"}});
 
@@ -720,23 +998,24 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
     Node* call = b2.opts().FinalizeBuilder(&node_builder);
 
     Node* recv1 =
-        RecvAtHost({DT_FLOAT, DT_FLOAT},
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost({e}, {DT_FLOAT},
+    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e},
                                b2.opts()
                                    .WithName("outside_compilation_F1_O1_send")
                                    .WithControlInput(e));
 
     Node* recv2 =
-        RecvAtHost({DT_FLOAT, DT_FLOAT},
+        RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O2_recv"));
     Node* g = Binary(e, ops::NodeOut(recv2, 1),
                      b2.opts().WithName("G").WithControlInputs({recv2, e}));
     Node* h = Binary(ops::NodeOut(recv2, 0), e, b2.opts().WithName("H"));
-    Node* send2 = SendFromHost(
-        {h}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O2_send"));
+    Node* send2 =
+        SendFromHost("host_compute_channel_F1_O2", {h},
+                     b2.opts().WithName("outside_compilation_F1_O2_send"));
 
     Node* s = NoOp(b2.opts()
                        .WithName("F1_sequencer")
@@ -758,8 +1037,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
 
   {
     GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
-    Node* a = Input(b1.opts().WithName("A"));
-    Node* b = Input(b1.opts().WithName("B"));
+    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* b = InputShaped(b1.opts().WithName("B"));
     Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
     Node* d =
         Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
@@ -791,6 +1070,24 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  string shape_string_expected;
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape_graph;
+    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
+    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+  }
+
+  TensorShapeProto shape_proto_expected;
+  shape_proto_expected.add_dim()->set_size(2);
+
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"},
       {"f_0_retval:float", "d_0_retval:float"}, {},
@@ -799,19 +1096,18 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"},
            "BinaryTest",
-           {"C:o:0", "outside_compilation_O1_recv:output:0"},
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
            {},
-           {"outside_compilation_O1_recv"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {"C:o:0", "D:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
            {"D"}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O1_send"}},
       },
       {{"d_0_retval", "D:o:0"}, {"f_0_retval", "F:o:0"}});
 
@@ -822,16 +1118,16 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
           {{"G"}, "BinaryTest", {"e_0_arg", "f_0_arg"}},
           {{"I"},
            "BinaryTest",
-           {"f_0_arg", "outside_compilation_O1_recv:output:0"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+           {"f_0_arg", "outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {"G:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O1_send"}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F2_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
       },
       {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}});
 
@@ -839,15 +1135,15 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
         new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
     GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
-    Node* a = Input(b2.opts().WithName("A"));
-    Node* b = Input(b2.opts().WithName("B"));
+    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* b = InputShaped(b2.opts().WithName("B"));
 
     Node* recv1 =
-        RecvAtHost({DT_FLOAT, DT_FLOAT},
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost({e}, {DT_FLOAT},
+    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e},
                                b2.opts()
                                    .WithName("outside_compilation_F1_O1_send")
                                    .WithControlInput(e));
@@ -857,12 +1153,14 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
     Node* s1 = NoOp(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
 
-    Node* recv2 = RecvAtHost(
-        {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_recv"));
+    Node* recv2 =
+        RecvAtHost("host_compute_channel_F2_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F2_O1_recv"));
     Node* h = Binary(ops::NodeOut(call1, 1), recv2,
                      b2.opts().WithName("H").WithControlInput(s1));
-    Node* send2 = SendFromHost(
-        {h}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_send"));
+    Node* send2 =
+        SendFromHost("host_compute_channel_F2_O1", {h},
+                     b2.opts().WithName("outside_compilation_F2_O1_send"));
 
     NodeBuilder node_builder2("F2", "F2", lib_def.get());
     node_builder2.Input(e).Input(call1);
@@ -888,7 +1186,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
 
   {
     GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
-    Node* a = Input(b1.opts().WithName("A"));
+    Node* a = InputShaped(b1.opts().WithName("A"));
     Node* b = Input(b1.opts().WithName("B"));
     Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
     Node* d =
@@ -908,6 +1206,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  TensorShapeProto shape_proto_expected;
+  shape_proto_expected.add_dim()->set_size(2);
+
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
       {
@@ -915,11 +1216,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"},
            "BinaryTest",
-           {"D:o:0", "outside_compilation_O1_recv:output:0"}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
+           {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -927,12 +1233,13 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
         new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
     GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
-    Node* a = Input(b2.opts().WithName("A"));
+    Node* a = InputShaped(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
     Node* e = Unary(a, b2.opts().WithName("E"));
-    Node* send1 = SendFromHost(
-        {e}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_send"));
+    Node* send1 =
+        SendFromHost("host_compute_channel_F1_O1", {e},
+                     b2.opts().WithName("outside_compilation_F1_O1_send"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
@@ -954,7 +1261,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
 
   {
     GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
-    Node* a = Input(b1.opts().WithName("A"));
+    Node* a = InputShaped(b1.opts().WithName("A"));
     Node* b = Input(b1.opts().WithName("B"));
     Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
     Node* d =
@@ -975,6 +1282,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  TensorShapeProto shape_proto_expected;
+  shape_proto_expected.add_dim()->set_size(2);
+
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
       {
@@ -982,17 +1292,17 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"},
            "BinaryTest",
-           {"D:o:0", "outside_compilation_O1_recv:output:0"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+           {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {},
-           {{"dtypes", gtl::ArraySlice<DataType>({})}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}},
            {"D"}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O1_send"}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1000,14 +1310,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
         new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
     GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
-    Node* a = Input(b2.opts().WithName("A"));
+    Node* a = InputShaped(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
     Node* recv1 =
-        RecvAtHost({}, b2.opts().WithName("outside_compilation_F1_O1_recv"));
+        RecvAtHost("host_compute_channel_F1_O1", {},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Unary(a, b2.opts().WithName("E").WithControlInput(recv1));
-    Node* send1 = SendFromHost(
-        {e}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_send"));
+    Node* send1 =
+        SendFromHost("host_compute_channel_F1_O1", {e},
+                     b2.opts().WithName("outside_compilation_F1_O1_send"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
@@ -1055,10 +1367,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"}, "UnaryTest", {"D:o:0"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {"D:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1069,8 +1385,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* recv1 = RecvAtHost(
-        {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Unary(recv1, b2.opts().WithName("E"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
@@ -1118,16 +1435,19 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
-          {{"F"}, "UnaryTest", {"D:o:0"}, {}, {"outside_compilation_O1_recv"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+          {{"F"},
+           "UnaryTest",
            {"D:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
            {},
-           {{"dtypes", gtl::ArraySlice<DataType>({})}},
-           {"outside_compilation_O1_send"}},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {"D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1138,10 +1458,11 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* recv1 = RecvAtHost(
-        {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Unary(recv1, b2.opts().WithName("E"));
-    Node* send1 = SendFromHost({}, {},
+    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {},
                                b2.opts()
                                    .WithName("outside_compilation_F1_O1_send")
                                    .WithControlInput(e));
@@ -1215,5 +1536,110 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
   TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
 }
 
+// Test for shape inference of outside compilation.
+TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    *library.add_function() = test::function::XTimesTwo();
+
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    // Give nodes 'c' and 'd' names that collide after lowercasing.
+    Node* c = Unary(a, b1.opts().WithName("C"));
+    Node* d = Unary(b, b1.opts().WithName("c").WithControlInput(c).WithAttr(
+                           "_encapsulate", "F1"));
+    Node* e = BinaryUnknownShape(c, d,
+                                 b1.opts()
+                                     .WithName("E")
+                                     .WithControlInputs({b, d})
+                                     .WithAttr("_encapsulate", "F1")
+                                     .WithAttr("_outside", "O1"));
+    Node* f = Binary(c, e,
+                     b1.opts().WithName("F").WithControlInput(e).WithAttr(
+                         "_encapsulate", "F1"));
+    Binary(a, f, b1.opts().WithName("G").WithControlInput(e));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  string shape_string_expected;
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* known = KnownShape({2}, shape.opts().WithName("KnownShape/_0"));
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = BinaryUnknownShape(known, recv, shape.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape_graph;
+    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
+    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+  }
+
+  *library_expected.add_function() = test::function::XTimesTwo();
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"b_0_arg:float", "c_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"c"}, "UnaryTest", {"b_0_arg"}, {}, {}},
+          {{"F"},
+           "BinaryTest",
+           {"c_0_arg", "outside_compilation_O1_host_compute:outputs:0"},
+           {},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {"c:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
+           {"c"}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+    Node* c = Unary(a, b2.opts().WithName("C"));
+
+    NodeBuilder node_builder("F1", "F1", lib_def.get());
+    node_builder.Input(b).Input(c);
+    Node* call =
+        b2.opts().WithControlInputs({c}).FinalizeBuilder(&node_builder);
+
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = BinaryUnknownShape(
+        c, ops::NodeOut(recv, 0),
+        b2.opts().WithName("E").WithControlInputs({recv, b}));
+    Node* send = SendFromHost("host_compute_channel_F1_O1", {e},
+                              b2.opts()
+                                  .WithName("outside_compilation_F1_O1_send")
+                                  .WithControlInput(e));
+
+    Node* s = NoOp(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}));
+
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({s, e}));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index d6b576166c..eae8e6c3c1 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -1064,26 +1064,36 @@ Status FunctionLibraryDefinition::AddLibrary(
   return Status::OK();
 }
 
-void FunctionLibraryDefinition::RemoveFunction(const string& func) {
+Status FunctionLibraryDefinition::RemoveFunction(const string& func) {
   const auto& i = function_defs_.find(func);
-  DCHECK(i != function_defs_.end());
+  if (i == function_defs_.end()) {
+    return errors::InvalidArgument("Tried to remove non-existent function ",
+                                   func);
+  }
   function_defs_.erase(i);
+  return Status::OK();
 }
 
-void FunctionLibraryDefinition::RemoveGradient(const string& func) {
+Status FunctionLibraryDefinition::RemoveGradient(const string& func) {
   const auto& i = func_grad_.find(func);
-  DCHECK(i != func_grad_.end());
+  if (i == func_grad_.end()) {
+    return errors::InvalidArgument("Tried to remove non-existent gradient ",
+                                   func);
+  }
   func_grad_.erase(i);
+  return Status::OK();
 }
 
 void FunctionLibraryDefinition::Remove(
     const std::vector<string>& funcs,
     const std::vector<string>& funcs_with_grads) {
   for (const string& f : funcs) {
-    RemoveFunction(f);
+    Status s = RemoveFunction(f);
+    DCHECK(s.ok());
   }
   for (const string& f : funcs_with_grads) {
-    RemoveGradient(f);
+    Status s = RemoveGradient(f);
+    DCHECK(s.ok());
   }
 }
 
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index b933ee0b0e..7d0e15641d 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -312,6 +312,14 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // This operation is atomic.
   Status AddGradientDef(const GradientDef& grad);
 
+  // Remove function `func` from the library. Returns non-OK Status unless
+  // `func` is in the library.
+  Status RemoveFunction(const string& func);
+
+  // Remove gradient of function `func` from the library. Returns non-OK Status
+  // unless `func` has a gradient.
+  Status RemoveGradient(const string& func);
+
   // Adds the functions and gradients in 'other' to this function library.
   // Duplicate functions and gradients are ignored.
   // This operation is atomic.
@@ -384,13 +392,6 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // attr from.
   const FunctionDef* GetAttrImpl(const NodeDef& ndef) const;
 
-  // Remove function `func` from the library. `func` must be in the library.
-  void RemoveFunction(const string& func);
-
-  // Remove gradient of function `func` from the library. `func` must have
-  // a gradient.
-  void RemoveGradient(const string& func);
-
   // Remove all functions in `funcs` and all gradients of
   // functions in `funcs_with_grads` from this library.
   void Remove(const std::vector<string>& funcs,
-- 
GitLab


From 69655f34611747e51fc2644d1888301ecfcc4c96 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 10:38:14 -0800
Subject: [PATCH 528/966] Fix nest bug with different dictionary key orderings.

PiperOrigin-RevId: 184160009
---
 tensorflow/python/data/util/nest.py      | 4 ++--
 tensorflow/python/data/util/nest_test.py | 4 ++++
 tensorflow/python/util/nest.py           | 4 ++--
 tensorflow/python/util/nest_test.py      | 4 ++++
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index 2455395635..e387e35740 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -383,8 +383,8 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
             "structure has keys %s, while shallow structure has keys %s." %
             (list(_six.iterkeys(input_tree)),
              list(_six.iterkeys(shallow_tree))))
-      input_tree = list(_six.iteritems(input_tree))
-      shallow_tree = list(_six.iteritems(shallow_tree))
+      input_tree = list(sorted(_six.iteritems(input_tree)))
+      shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
 
     for shallow_branch, input_branch in zip(shallow_tree, input_tree):
       assert_shallow_structure(shallow_branch, input_branch,
diff --git a/tensorflow/python/data/util/nest_test.py b/tensorflow/python/data/util/nest_test.py
index 90dd7dfe77..ff380815a4 100644
--- a/tensorflow/python/data/util/nest_test.py
+++ b/tensorflow/python/data/util/nest_test.py
@@ -277,6 +277,10 @@ class NestTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, expected_message):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
 
+    inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))])
+    inp_ba = collections.OrderedDict([("b", (2, 3)), ("a", 1)])
+    nest.assert_shallow_structure(inp_ab, inp_ba)
+
   def testFlattenUpTo(self):
     input_tree = (((2, 2), (3, 3)), ((4, 9), (5, 5)))
     shallow_tree = ((True, True), (False, True))
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 874df3d108..c8525ed420 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -532,8 +532,8 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
             (list(_six.iterkeys(input_tree)),
              list(_six.iterkeys(shallow_tree))))
 
-      input_tree = list(_six.iteritems(input_tree))
-      shallow_tree = list(_six.iteritems(shallow_tree))
+      input_tree = list(sorted(_six.iteritems(input_tree)))
+      shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
 
     for shallow_branch, input_branch in zip(shallow_tree, input_tree):
       assert_shallow_structure(shallow_branch, input_branch,
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 6bec397db5..8aaf799fd0 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -425,6 +425,10 @@ class NestTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, expected_message):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
 
+    inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))])
+    inp_ba = collections.OrderedDict([("b", (2, 3)), ("a", 1)])
+    nest.assert_shallow_structure(inp_ab, inp_ba)
+
   def testFlattenUpTo(self):
     # Shallow tree ends at scalar.
     input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
-- 
GitLab


From a1a34b1440c4c4792f945275529e6c5b3c7aa2ca Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Thu, 1 Feb 2018 10:43:29 -0800
Subject: [PATCH 529/966] Add function paths to their signatures.

fixes #16167

PiperOrigin-RevId: 184160925
---
 tensorflow/tools/docs/pretty_docs.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py
index c033c16ae9..ac04f566d0 100644
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@@ -162,7 +162,7 @@ def _build_class_page(page_info):
       parts.append(h3.format(**method_info.__dict__))
 
       if method_info.signature is not None:
-        parts.append(_build_signature(method_info))
+        parts.append(_build_signature(method_info, use_full_name=False))
 
       parts.append(method_info.doc.docstring)
       parts.append(_build_function_details(method_info.doc.function_details))
@@ -259,14 +259,14 @@ def _build_module_page(page_info):
   return ''.join(parts)
 
 
-def _build_signature(obj_info):
+def _build_signature(obj_info, use_full_name=True):
   """Returns a md code block showing the function signature."""
   # Special case tf.range, since it has an optional first argument
   if obj_info.full_name == 'tf.range':
     return (
         '``` python\n'
-        "range(limit, delta=1, dtype=None, name='range')\n"
-        "range(start, limit, delta=1, dtype=None, name='range')\n"
+        "tf.range(limit, delta=1, dtype=None, name='range')\n"
+        "tf.range(start, limit, delta=1, dtype=None, name='range')\n"
         '```\n\n')
 
   parts = ['``` python']
@@ -281,7 +281,11 @@ def _build_signature(obj_info):
     sig = ',\n'.join('    %s' % sig_item for sig_item in obj_info.signature)
     sig = '\n'+sig+'\n'
 
-  parts.append(signature_template.format(name=obj_info.short_name, sig=sig))
+  if use_full_name:
+    obj_name = obj_info.full_name
+  else:
+    obj_name = obj_info.short_name
+  parts.append(signature_template.format(name=obj_name, sig=sig))
   parts.append('```\n\n')
 
   return '\n'.join(parts)
-- 
GitLab


From 14e0e7fe1eafd286f3813ba839b5f3236394a0a1 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 1 Feb 2018 11:06:53 -0800
Subject: [PATCH 530/966] Internal change.

PiperOrigin-RevId: 184165180
---
 tensorflow/contrib/lite/kernels/BUILD        | 1 +
 tensorflow/contrib/lite/kernels/conv_test.cc | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index d9051f3516..8c40adfae5 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -249,6 +249,7 @@ tf_cc_test(
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index 461efffe39..7550f7cc0d 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include <cstdarg>
 
 #include <gtest/gtest.h>
+#include "absl/memory/memory.h"
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
-- 
GitLab


From 4adfae9fe10968063cf55cea1bfef9b405c407c0 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 1 Feb 2018 11:33:34 -0800
Subject: [PATCH 531/966] Automated g4 rollback of changelist 184153187

PiperOrigin-RevId: 184169668
---
 .../jit/encapsulate_subgraphs_pass.cc         | 647 +++--------------
 .../jit/encapsulate_subgraphs_pass_test.cc    | 682 ++++--------------
 tensorflow/core/framework/function.cc         |  22 +-
 tensorflow/core/framework/function.h          |  15 +-
 4 files changed, 241 insertions(+), 1125 deletions(-)

diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 8edae9fc9c..0de163d3a8 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -30,14 +30,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
-#include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -143,7 +141,8 @@ struct NodeSlot {
 // everything to use it.
 static const char* const kArgOp = "_Arg";
 static const char* const kRetValOp = "_Retval";
-static const char* const kHostComputeOp = "_XlaHostCompute";
+static const char* const kSendToHostOp = "_XlaSendToHost";
+static const char* const kRecvFromHostOp = "_XlaRecvFromHost";
 static const char* const kSendFromHostOp = "_XlaSendFromHost";
 static const char* const kRecvAtHostOp = "_XlaRecvAtHost";
 
@@ -172,8 +171,7 @@ class Encapsulator {
 
   // Write a copy of the input graph to 'graph_out', where the subgraphs are
   // replaced with calls to the new functions.
-  Status BuildOutputGraph(bool parallel_checking, Graph* graph_out,
-                          FunctionLibraryDefinition* library);
+  Status BuildOutputGraph(bool parallel_checking, Graph* graph_out);
 
  private:
   // A subgraph of the input, all marked with a common 'group_attribute'
@@ -203,29 +201,21 @@ class Encapsulator {
   //     ..             .
   //  RAH -->  C  --> SFH
   //
-  // The compiled cluster is as follows. HC is a HostCompute node which is the
-  // source of a channel to the RAH node above and the destination of a channel
-  // from the SFH node above.
+  // The compiled cluster is as follows. STH is a SendToHost node which is the
+  // source of a channel to the RAH node above. RFH is a RecvFromHost node which
+  // is the destination of a channel from the SFH node above. There is a control
+  // edge that ensures RFH follows STH, which is used in shape inference to
+  // ensure that the shapes on the STH host channel are known before the RFH
+  // channel is compiled.
   //
-  //  Arg  --> B  --> HC  --> D --> Retval
+  //  Arg  --> B  --> STH  ..>  RFH  --> D --> Retval
   //
-  // The channels HC/RAH and SFH/HC each transmit multiple tensors, so there is
-  // at most one RAH and SFH in each outside_compilation cluster. This design is
-  // preferred over adding separate Arg/Retval nodes for each transmitted value
-  // because it allows optimizations to the host code that would like to limit
-  // communication between host and device and, e.g., raise only one interrupt
-  // per channel rather than one per transmitted value.
-  //
-  // The shapes of the outputs from the HC node in general cannot be determined
-  // until the shapes of its inputs are known at compile time, since e.g.,
-  // above, the shape of C's outputs aren't known until the shape of its inputs
-  // are known. If the shapes of the HC's outputs can be determined during the
-  // rewrite, they are stored in the node's 'shapes' attr. Otherwise a minimal
-  // graph is stored in the shape_inference_graph attr. This graph can be used
-  // when compiling the HC Op to determined the shape of the SFH inputs given
-  // the shapes of any ancestor RAH outputs. If it can be determined that the
-  // shape of the SFH inputs will not be inferrable even once the shapes of the
-  // RAH outputs are known, an error is returned by the rewriter.
+  // The channels STH/RAH and SFH/RFH each transmit a tuple, so there is at most
+  // one RAH and SFH in each compiled cluster. This design is preferred over
+  // adding separate Arg/Retval nodes for each transmitted value because it
+  // simplifies the host code that would like to limit communication between
+  // host and device and, e.g., raise only one interrupt per channel rather than
+  // one per transmitted value.
   class Subgraph {
    public:
     // Creates a graph to build the subgraph in, if it doesn't already exist,
@@ -256,10 +246,6 @@ class Encapsulator {
         const std::unordered_map<const Node*, Node*>& node_images,
         Graph* graph_out);
 
-    // Returns the names of all the outside_compilation subgraphs in this
-    // Subgraph.
-    void GetOutsideCompilationSubgraphNames(std::vector<string>* names) const;
-
     // Returns the Node that inputs to the function should be wired up to.
     Node* GetCallNodeForInputs() const;
 
@@ -319,9 +305,15 @@ class Encapsulator {
     void RecordOutsideCompilationOutputOrControl(
         const string& outside_compilation_id, const Edge* edge);
 
-    // Adds the HostCompute nodes for each outside_compilation subgraph.
-    Status AddHostComputes(
-        const string& subgraph_name,
+    // Adds the SendToHost nodes for each outside_compilation subgraph once the
+    // edges have all been recorded via RecordOutsideCompilationInputOrControl.
+    Status AddSendsToOutsideCompilation(
+        const std::unordered_map<const Node*, Node*>& node_images);
+
+    // Adds the RecvFromHost nodes for each outside_compilation subgraph once
+    // the edges have all been recorded via
+    // RecordOutsideCompilationOutputOrControl.
+    Status AddRecvsFromOutsideCompilation(
         const std::unordered_map<const Node*, Node*>& node_images);
 
     // Creates the sequencer node if it doesn't exist, adding it to graph_out.
@@ -331,16 +323,10 @@ class Encapsulator {
     // all the downstream nodes of call_node_outputs.
     void ConnectSequencerToOutputs(Graph* graph_out);
 
-    Status AddShapeInferenceInfo(
-        const string& outside_compilation_subgraph_name,
-        const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph);
-
-    Status ReplaceFunctionDef(FunctionLibraryDefinition* library);
-
    private:
     struct OutsideCompilationSubgraph {
       // Map from source (producer node/slot) tensors in the original graph to
-      // input index (slot number in the HostCompute/RecvAtHost nodes that will
+      // input index (slot number in the SendToHost/RecvAtHost nodes that will
       // be created) for the outside_compilation subgraph.
       std::unordered_map<NodeSlot, int, NodeSlot::Hasher> inputs;
 
@@ -349,14 +335,14 @@ class Encapsulator {
       // outside_compilation subgraph. These are recorded by
       // RecordOutsideCompilationInputOrControl while walking all the subgraph
       // edges, and lifted control edges within the subgraph are added by
-      // AddSendsToOutsideCompilation once the _HostCompute node has been
+      // AddSendsToOutsideCompilation once the _SendToHost node has been
       // created. The matching control edge from _RecvAtHost to the
       // destination is added by CopyEdgeToOutputGraph.
       std::unordered_set<const Node*> control_inputs;
 
       // Maps from source (producer node/slot) and destination (consumer
       // node/slot) tensors in the original graph to output index (slot number
-      // in the SendFromHost/HostCompute nodes that will be created) for the
+      // in the SendFromHost/RecvFromHost nodes that will be created) for the
       // outside_compilation subgraph.
       std::unordered_map<NodeSlot, int, NodeSlot::Hasher> outputs_by_src;
       std::unordered_map<NodeSlot, int, NodeSlot::Hasher> outputs_by_dst;
@@ -366,13 +352,13 @@ class Encapsulator {
       // containing compiled subgraph. These are recorded by
       // RecordOutsideCompilationOutputOrControl while walking all the subgraph
       // edges, and lifted control edges within the subgraph are added by
-      // AddRecvsFromToOutsideCompilation once the _HostCompute node has been
+      // AddRecvsFromToOutsideCompilation once the _RecvFromHost node has been
       // created. The matching control edge from the source to _SendFromHost to
       // the destination is added by CopyEdgeToOutputGraph.
       std::unordered_set<const Node*> control_outputs;
 
-      // Name of the _HostCompute node in the subgraph.
-      string host_compute_name;
+      // _SendToHost node in the subgraph. Not owned.
+      Node* send_to_host = nullptr;
 
       // _RecvAtHost node in the output graph. Not owned.
       Node* recv_at_host = nullptr;
@@ -530,59 +516,6 @@ class Encapsulator {
       const std::unordered_map<const Node*, Node*>& node_images,
       bool parallel_checking, Graph* graph_out);
 
-  // Constructs a minimal shape inference graph that can be used to determine
-  // the shape of send_node at the time that the subgraph is compiled.
-  // recv_at_host_nodes contains the names of all the recv_at_host nodes that
-  // send_node might depend on. These recv_at_host nodes have shapes that are
-  // not known during the rewrite pass, but will be known at compile time.
-  //
-  // If the shapes of all the inputs to send_node can be determined during the
-  // rewrite pass, on exit graphdef_out is empty and the shapes are returned in
-  // static_shape_out. Otherwise graphdef_out contains a graph that can be used
-  // for shape inference at compile time, where all the source nodes of the
-  // graph are either constants with known shapes, or nodes named in
-  // recv_at_host_nodes.
-  //
-  // A non-OK status is returned if neither of the above conditions can be
-  // satisfied, e.g., because send_node depends on a node that doesn't have a
-  // registered shape inference function.
-  Status DoStaticShapeInferenceForOutsideCompilationSend(
-      const Graph& graph_in, const ShapeRefiner& shape_refiner,
-      const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
-      FunctionLibraryDefinition* library,
-      std::vector<TensorShapeProto>* static_shape_out,
-      std::unique_ptr<GraphDef>* graphdef_out);
-
-  // Makes a copy of graph containing only nodes that are ancestors of at least
-  // one node in send_from_host_nodes and store it in pruned_graph. On exit
-  // nodes_images contains a mapping from nodes in graph to nodes in
-  // pruned_graph. All functions in the copied graph are inlined.
-  Status MakePrunedGraphCopyAndInline(
-      const Graph& graph, const std::vector<Node*>& sink_nodes,
-      std::unique_ptr<Graph>* pruned_graph,
-      std::unordered_map<const Node*, Node*>* node_images,
-      FunctionLibraryDefinition* library);
-
-  // Makes a copy of graph containing only nodes that are ancestors of a
-  // send_from_host node in an outside_compilation subgraph, and store it in
-  // pruned_graph. Also perform shape inference on the pruned graph, using
-  // shape_refiner. On exit node_images contains a mapping from nodes in graph
-  // to nodes in pruned_graph.
-  Status MakeGraphForOutsideCompilationSends(
-      const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
-      ShapeRefiner* shape_refiner,
-      std::unordered_map<const Node*, Node*>* node_images,
-      FunctionLibraryDefinition* library);
-
-  // Performs static shape inference, as far as possible, for the send_from_host
-  // nodes in each outside_compilation subgraph. Where it is not possible to
-  // determine the shape statically, stores a serialized GraphDef in the
-  // HostCompute 'shape_inference_graph' attr, to be used at compile time for
-  // final inference. If the shapes are known statically they are stored in the
-  // HostCompute 'shapes' attr.
-  Status GetShapeInfoForOutsideCompilationSends(
-      Graph* graph_out, FunctionLibraryDefinition* library);
-
   const string group_attribute_;
   const string outside_compilation_attribute_;
   const Graph* graph_in_;
@@ -749,20 +682,16 @@ void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl(
   }
 }
 
-Status Encapsulator::Subgraph::AddHostComputes(
-    const string& subgraph_name,
+Status Encapsulator::Subgraph::AddSendsToOutsideCompilation(
     const std::unordered_map<const Node*, Node*>& node_images) {
   for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) {
     const string& oc_subgraph_name = oc_subgraph_iter.first;
     OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second;
-    if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() ||
-        !oc_subgraph.outputs_by_src.empty() ||
-        !oc_subgraph.control_outputs.empty()) {
-      // Build a _HostCompute node.
+    if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty()) {
+      // Build a _SendToHost node sending all the args of the appropriate
+      // types.
+      std::vector<DataType> dtypes(oc_subgraph.inputs.size(), DT_INVALID);
       std::vector<NodeDefBuilder::NodeOut> inputs(oc_subgraph.inputs.size());
-      std::vector<DataType> input_dtypes(oc_subgraph.inputs.size(), DT_INVALID);
-      std::vector<DataType> output_dtypes(oc_subgraph.outputs_by_src.size(),
-                                          DT_INVALID);
 
       for (const auto& input_src : oc_subgraph.inputs) {
         const Node* src_node = input_src.first.node;
@@ -771,64 +700,94 @@ Status Encapsulator::Subgraph::AddHostComputes(
         int input_index = input_src.second;
 
         DataType dtype = src_node->output_type(src_slot);
+        dtypes[input_index] = dtype;
         inputs[input_index].Reset(src_image->name(), src_slot, dtype);
-        input_dtypes[input_index] = dtype;
       }
 
-      for (const auto& output : oc_subgraph.outputs_by_src) {
-        DataType dtype = output.first.dtype;
-        int output_index = output.second;
-        output_dtypes[output_index] = dtype;
-      }
-
-      NodeDef host_compute_def;
-      NodeDefBuilder builder(strings::StrCat("outside_compilation_",
-                                             oc_subgraph_name, "_host_compute"),
-                             kHostComputeOp);
+      NodeDef send_def;
+      NodeDefBuilder builder(
+          strings::StrCat("outside_compilation_", oc_subgraph_name, "_send"),
+          kSendToHostOp);
+      builder.Attr("dtypes", dtypes);
       builder.Input(inputs);
-      builder.Attr("Tinputs", input_dtypes);
-      builder.Attr("Toutputs", output_dtypes);
-      builder.Attr("key",
-                   strings::StrCat("host_compute_channel_", subgraph_name, "_",
-                                   oc_subgraph_name));
-      Status s = builder.Finalize(&host_compute_def);
+      Status s = builder.Finalize(&send_def);
       if (!s.ok()) return s;
 
-      Node* host_compute = graph_->AddNode(host_compute_def, &s);
+      oc_subgraph.send_to_host = graph_->AddNode(send_def, &s);
       if (!s.ok()) return s;
-      oc_subgraph.host_compute_name = host_compute->name();
 
-      // Connect the _HostCompute node to its producers in the subgraph.
+      // Connect the _SendToHost node to its producers in the subgraph.
       for (auto& input_src : oc_subgraph.inputs) {
         const Node* src_node = input_src.first.node;
         Node* src_image = node_images.at(src_node);
         int src_slot = input_src.first.slot;
         int input_index = input_src.second;
-        graph_->AddEdge(src_image, src_slot, host_compute, input_index);
+        graph_->AddEdge(src_image, src_slot, oc_subgraph.send_to_host,
+                        input_index);
       }
 
-      // Connect the _HostCompute node to its control edge producers in the
+      // Connect the _SendToHost node to its control edge producers in the
       // subgraph.
       for (const auto& src_node : oc_subgraph.control_inputs) {
         Node* src_image = node_images.at(src_node);
-        graph_->AddControlEdge(src_image, host_compute);
+        graph_->AddControlEdge(src_image, oc_subgraph.send_to_host);
       }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Encapsulator::Subgraph::AddRecvsFromOutsideCompilation(
+    const std::unordered_map<const Node*, Node*>& node_images) {
+  for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) {
+    const string& oc_subgraph_name = oc_subgraph_iter.first;
+    OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second;
+    if (!oc_subgraph.outputs_by_src.empty() ||
+        !oc_subgraph.control_outputs.empty()) {
+      // Build a _RecvFromHost node producing all the outputs of the appropriate
+      // types.
+      std::vector<DataType> dtypes(oc_subgraph.outputs_by_src.size(),
+                                   DT_INVALID);
+
+      for (const auto& output : oc_subgraph.outputs_by_src) {
+        DataType dtype = output.first.dtype;
+        int output_index = output.second;
+        dtypes[output_index] = dtype;
+      }
+
+      NodeDef recv_def;
+      NodeDefBuilder builder(
+          strings::StrCat("outside_compilation_", oc_subgraph_name, "_recv"),
+          kRecvFromHostOp);
+      builder.Attr("dtypes", dtypes);
+      Status s = builder.Finalize(&recv_def);
+      if (!s.ok()) return s;
 
-      // Connect the consumers in the subgraph to the _HostCompute node.
+      Node* recv = graph_->AddNode(recv_def, &s);
+      if (!s.ok()) return s;
+
+      // Connect the consumers in the subgraph to the _RecvFromHost node.
       for (const auto& output : oc_subgraph.outputs_by_dst) {
         const Node* dst_node = output.first.node;
         Node* dst_image = node_images.at(dst_node);
         int dst_slot = output.first.slot;
         int output_index = output.second;
 
-        graph_->AddEdge(host_compute, output_index, dst_image, dst_slot);
+        graph_->AddEdge(recv, output_index, dst_image, dst_slot);
       }
 
-      // Connect the control edge consumers in the subgraph to the _HostCompute
+      // Connect the control edge consumers in the subgraph to the _RecvFromHost
       // node.
       for (const auto& dst_node : oc_subgraph.control_outputs) {
         Node* dst_image = node_images.at(dst_node);
-        graph_->AddControlEdge(host_compute, dst_image);
+        graph_->AddControlEdge(recv, dst_image);
+      }
+
+      // Add a control edge in the subgraph so that the _SendToHost node, if
+      // any, is compiled before the _RecvFromHost node.
+      if (oc_subgraph.send_to_host != nullptr) {
+        graph_->AddControlEdge(oc_subgraph.send_to_host, recv);
       }
     }
   }
@@ -923,63 +882,6 @@ Status Encapsulator::Subgraph::BuildFunctionDef(
   return Status::OK();
 }
 
-Status Encapsulator::Subgraph::AddShapeInferenceInfo(
-    const string& outside_compilation_subgraph_name,
-    const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph) {
-  OutsideCompilationSubgraph& oc_subgraph =
-      outside_compilation_subgraphs_.at(outside_compilation_subgraph_name);
-
-  Node* host_compute = nullptr;
-  for (Node* n : graph_->nodes()) {
-    if (n->name() == oc_subgraph.host_compute_name) {
-      host_compute = n;
-      break;
-    }
-  }
-  if (host_compute == nullptr) {
-    return errors::InvalidArgument(
-        "After rewriting subgraph ", outside_compilation_subgraph_name,
-        " there is no HostCompute Op for outside compilation subgraph ",
-        oc_subgraph.host_compute_name);
-  }
-
-  if (inference_graph == nullptr) {
-    host_compute->AddAttr("shape_inference_graph", "");
-    host_compute->AddAttr("shapes", shapes);
-  } else {
-    string serialized_graph;
-    if (!inference_graph->SerializeToString(&serialized_graph)) {
-      return errors::Internal(
-          "Failed to serialize graph for outside compilation subgraph ",
-          oc_subgraph.host_compute_name);
-    }
-    host_compute->AddAttr("shape_inference_graph", serialized_graph);
-    host_compute->AddAttr("shapes", std::vector<TensorShapeProto>());
-  }
-  return Status::OK();
-}
-
-Status Encapsulator::Subgraph::ReplaceFunctionDef(
-    FunctionLibraryDefinition* library) {
-  const string& name = call_node_def_.name();
-
-  FunctionDef fdef;
-  TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef));
-
-  if (VLOG_IS_ON(1)) {
-    VLOG(2) << "Replace function def " << name;
-    dump_graph::DumpGraphToFile(
-        strings::StrCat("replace_encapsulate_fdef_graph_", name), *graph_,
-        library);
-    dump_graph::DumpFunctionDefToFile(
-        strings::StrCat("replace_encapsulate_fdef_", name), fdef);
-  }
-
-  TF_RETURN_IF_ERROR(library->RemoveFunction(name));
-  TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
-  return Status::OK();
-}
-
 Status Encapsulator::Subgraph::BuildParallelCheckOp(
     const std::unordered_map<const Node*, Node*>& node_images,
     Graph* graph_out) {
@@ -1078,9 +980,7 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
   NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
                                          "_", oc_subgraph_name, "_recv"),
                          kRecvAtHostOp);
-  builder.Attr("Toutputs", dtypes);
-  builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
-                                      "_", oc_subgraph_name));
+  builder.Attr("dtypes", dtypes);
   Status s = builder.Finalize(&recv_def);
   if (!s.ok()) return s;
 
@@ -1120,9 +1020,7 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
   NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
                                          "_", oc_subgraph_name, "_send"),
                          kSendFromHostOp);
-  builder.Attr("Tinputs", dtypes);
-  builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
-                                      "_", oc_subgraph_name));
+  builder.Attr("dtypes", dtypes);
   builder.Input(inputs);
   Status s = builder.Finalize(&send_def);
   if (!s.ok()) return s;
@@ -1164,13 +1062,6 @@ Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes(
   return Status::OK();
 }
 
-void Encapsulator::Subgraph::GetOutsideCompilationSubgraphNames(
-    std::vector<string>* names) const {
-  for (auto& entry : outside_compilation_subgraphs_) {
-    names->push_back(entry.first);
-  }
-}
-
 Status Encapsulator::GetFunctionNameAttr(
     Node const* node, string* attr, string* outside_compilation_attr) const {
   Status s = GetNodeAttr(node->attrs(), group_attribute_, attr);
@@ -1329,7 +1220,8 @@ Status Encapsulator::SplitIntoSubgraphs() {
   // single input and output node for it.
   for (auto& entry : subgraphs_) {
     Subgraph& subgraph = entry.second;
-    TF_RETURN_IF_ERROR(subgraph.AddHostComputes(entry.first, node_images));
+    TF_RETURN_IF_ERROR(subgraph.AddSendsToOutsideCompilation(node_images));
+    TF_RETURN_IF_ERROR(subgraph.AddRecvsFromOutsideCompilation(node_images));
   }
 
   MarkGuaranteedConstants(*graph_in_, src_arg_pairs);
@@ -1617,344 +1509,8 @@ Status Encapsulator::AddEdgesToOutputGraph(
   return Status::OK();
 }
 
-namespace {
-
-// Adds a dummy Const node to graph_out. The "constant" has the type of
-// data_type and the shape indicated in 'shape'. The dummy node is not a valid
-// Const node because it does not have any value defined, but this doesn't
-// matter because it will only be used subsequently for shape inference. (It
-// would be possible to add a switch statement over data_type to create a value
-// for the constant, but that would entail maintaining the logic as new types
-// are added, and is not necessary.)
-Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape,
-                         Graph* graph_out) {
-  TensorProto dummy_proto;
-  dummy_proto.set_dtype(data_type);
-  *dummy_proto.mutable_tensor_shape() = shape;
-  // Don't set any value field in the proto, since it is only going to be used
-  // for shape inference.
-
-  GraphDefBuilder::Options options(graph_out, /*status=*/nullptr);
-  NodeBuilder node_builder(options.GetNameForOp("KnownShape"), "Const",
-                           options.op_registry());
-  node_builder.Attr("dtype", data_type).Attr("value", dummy_proto);
-  return options.FinalizeBuilder(&node_builder);
-}
-
-// Adds a copy of node_in to graph_out and adds the mapping to
-// copied_node_images.
-Status CopyShapeInferenceNodeToGraph(
-    Node* node_in, const Node* send_node,
-    const std::unordered_map<Node*, Node*>& dummy_node_images,
-    FunctionLibraryDefinition* library,
-    std::unordered_map<Node*, Node*>* copied_node_images, Graph* graph_out) {
-  // Once all the ancestor nodes have been added to graph_out, add this node
-  // and connect it to its ancestors.
-  Node* node_out = graph_out->CopyNode(node_in);
-  (*copied_node_images)[node_in] = node_out;
-  // Don't bother to build the shape inference graph if there's a node with no
-  // shape inference function, since it would just result in an error later at
-  // compile time.
-  const OpRegistrationData* op_reg_data;
-  TF_RETURN_IF_ERROR(library->LookUp(node_in->type_string(), &op_reg_data));
-  if (op_reg_data->shape_inference_fn == nullptr) {
-    return errors::InvalidArgument(
-        "Shape inference is not possible for outside_compilation "
-        "SendFromHost node ",
-        send_node->name(), " because it depends on node ", node_in->name(),
-        " which does not have a shape inference function registered.");
-  }
-  // Add all the edges to the newly copied node.
-  for (const Edge* in_edge : node_in->in_edges()) {
-    if (!in_edge->IsControlEdge()) {
-      Node* src = in_edge->src();
-      const auto iter = dummy_node_images.find(src);
-      if (iter == dummy_node_images.end()) {
-        // The src is a copied node so use the original output port.
-        graph_out->AddEdge((*copied_node_images)[in_edge->src()],
-                           in_edge->src_output(), node_out,
-                           in_edge->dst_input());
-      } else {
-        // The src is a dummy node so use output port 0.
-        graph_out->AddEdge(iter->second, 0, node_out, in_edge->dst_input());
-      }
-    }
-  }
-  return Status::OK();
-}
-
-}  // namespace
-
-Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
-    const Graph& graph_in, const ShapeRefiner& shape_refiner,
-    const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
-    FunctionLibraryDefinition* library,
-    std::vector<TensorShapeProto>* static_shape_out,
-    std::unique_ptr<GraphDef>* graphdef_out) {
-  // Maps from nodes in graph_in to nodes in graph_out.
-  //
-  // When an edge has fully defined shape the source node in graph_in is
-  // replaced in graph_out by a dummy constant node. The mapping from nodes
-  // in graph_in to dummy nodes is stored in dummy_node_images.
-  //
-  // When a node in graph_in has at least one ancestor that doesn't have fully
-  // defined shape, it is copied into graph_out. The mapping from nodes in
-  // graph_in to copied nodes is stored in copied_node_images.
-  //
-  // The two types of node are treated differently because, when adding edges to
-  // graph_out, an output from a dummy node always uses port 0, whereas an
-  // output from a copied node uses the same port that was used in graph_in.
-  std::unordered_map<Node*, Node*> dummy_node_images;
-  std::unordered_map<Node*, Node*> copied_node_images;
-
-  std::unique_ptr<Graph> graph_out(new Graph(graph_in.op_registry()));
-  graph_out->set_versions(graph_in.versions());
-  static_shape_out->resize(send_node->num_inputs());
-
-  // We don't use the standard ReverseDFS because we want to cut off traversal
-  // whenever we find an output with fully defined shape.
-  // TODO(misard) make this work properly in the presence of control flow.
-  struct Work {
-    Node* node;
-    bool leave;  // Are we entering or leaving node?
-  };
-  std::vector<Work> stack({{send_node, false}});
-  std::vector<bool> visited(graph_in.num_node_ids(), false);
-  while (!stack.empty()) {
-    Work w = stack.back();
-    stack.pop_back();
-    Node* n = w.node;
-
-    if (w.leave) {
-      TF_RETURN_IF_ERROR(CopyShapeInferenceNodeToGraph(
-          n, send_node, dummy_node_images, library, &copied_node_images,
-          graph_out.get()));
-    } else {
-      if (visited[n->id()]) continue;
-      visited[n->id()] = true;
-
-      // Arrange to revisit when all done with all inputs.
-      stack.push_back(Work{n, true});
-
-      bool has_parent_with_unknown_shape = false;
-      for (const Edge* in_edge : n->in_edges()) {
-        if (!in_edge->IsControlEdge()) {
-          Node* src_node = in_edge->src();
-          int src_port = in_edge->src_output();
-          shape_inference::InferenceContext* context =
-              shape_refiner.GetContext(src_node);
-          shape_inference::ShapeHandle shape = context->output(src_port);
-          if (context->FullyDefined(shape)) {
-            // This ancestor has known shape, so instead of adding it to the
-            // stack, add a dummy node with that shape to graph_out and
-            // continue.
-            TensorShapeProto proto;
-            context->ShapeHandleToProto(shape, &proto);
-            dummy_node_images[src_node] = AddDummyShapedNode(
-                src_node->output_type(src_port), proto, graph_out.get());
-            if (n == send_node) {
-              (*static_shape_out)[in_edge->dst_input()] = proto;
-            }
-          } else {
-            if (!visited[src_node->id()]) {
-              has_parent_with_unknown_shape = true;
-              stack.push_back({src_node, false});
-            }
-          }
-        }
-      }
-      if (!has_parent_with_unknown_shape) {
-        if (n == send_node) {
-          // The shapes of all the inputs to send_node are statically known. We
-          // won't have to do any inference at compile time so return now: the
-          // shapes were stored in static_shape_out above.
-          graphdef_out->reset();
-          return Status::OK();
-        } else {
-          // Any shape that is being processed is either the original send node
-          // or has at least one output with statically-unknown shape. If the
-          // latter and it doesn't have any inputs with statically-unknown
-          // shape, then check that it is of the recv nodes that we can fill in
-          // the shape of at run-time later. If it isn't one of those, then we
-          // won't have any additional knowledge at compile time, so we already
-          // know we won't be able to do shape inference and we can return an
-          // error now.
-          if (recv_at_host_nodes.find(n->name()) == recv_at_host_nodes.end()) {
-            return errors::InvalidArgument(
-                "Shape inference is not possible for outside_compilation "
-                "SendFromHost node ",
-                send_node->name(), " because shape of node ", n->name(),
-                " will not be known at compilation time.");
-          }
-        }
-      }
-    }
-  }
-
-  graphdef_out->reset(new GraphDef());
-  graph_out->ToGraphDef(graphdef_out->get());
-
-  return Status::OK();
-}
-
-Status Encapsulator::MakePrunedGraphCopyAndInline(
-    const Graph& graph, const std::vector<Node*>& sink_nodes,
-    std::unique_ptr<Graph>* pruned_graph,
-    std::unordered_map<const Node*, Node*>* node_images,
-    FunctionLibraryDefinition* library) {
-  // First copy all ancestor nodes of sink_nodes into a new graph.
-  pruned_graph->reset(new Graph(library));
-  (*pruned_graph)->set_versions(graph.versions());
-  ReverseDFSFrom(graph, sink_nodes,
-                 /*enter=*/nullptr,
-                 /*leave=*/[&](Node* n) {
-                   if (!n->IsSource()) {
-                     Node* copied = (*pruned_graph)->CopyNode(n);
-                     node_images->emplace(n, copied);
-                   }
-                 });
-
-  // Add all the edges between copied nodes.
-  for (auto entry : *node_images) {
-    const Node* orig = entry.first;
-    Node* image = entry.second;
-    for (const Edge* out_edge : orig->out_edges()) {
-      auto iter = node_images->find(out_edge->dst());
-      if (iter != node_images->end()) {
-        // The source and destination are both in the copied graph.
-        (*pruned_graph)
-            ->AddEdge(image, out_edge->src_output(), iter->second,
-                      out_edge->dst_input());
-      }
-    }
-  }
-
-  // Find all the function call nodes, and inline them.
-  std::vector<Node*> function_nodes;
-  for (auto node : (*pruned_graph)->nodes()) {
-    const OpRegistrationData* op_reg_data;
-    TF_RETURN_IF_ERROR(library->LookUp(node->type_string(), &op_reg_data));
-    if (op_reg_data->is_function_op) {
-      function_nodes.push_back(node);
-    }
-  }
-  for (auto node : function_nodes) {
-    VLOG(2) << "Inlining function " << node->name();
-    const FunctionDef* fdef = library->Find(node->type_string());
-    if (fdef == nullptr) {
-      return errors::Internal("Failed to find function ", node->type_string(),
-                              " in function library.");
-    }
-    FunctionBody* fbody = nullptr;
-    TF_RETURN_IF_ERROR(
-        FunctionDefToBodyHelper(*fdef, node->attrs(), library,
-                                [library](const string& op, const OpDef** sig) {
-                                  return library->LookUpOpDef(op, sig);
-                                },
-                                &fbody));
-    InlineFunctionBody(*library, pruned_graph->get(), node, fbody);
-    delete fbody;
-  }
-
-  return Status::OK();
-}
-
-Status Encapsulator::MakeGraphForOutsideCompilationSends(
-    const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
-    ShapeRefiner* shape_refiner,
-    std::unordered_map<const Node*, Node*>* node_images,
-    FunctionLibraryDefinition* library) {
-  // Find all the send_from_host nodes in all subgraphs, to use as roots for the
-  // pruning.
-  std::vector<Node*> send_from_host_nodes;
-  for (auto& subgraph_entry : subgraphs_) {
-    Subgraph& subgraph = subgraph_entry.second;
-    std::vector<string> outside_compilation_names;
-    subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names);
-    for (const auto& name : outside_compilation_names) {
-      Node* send_node = subgraph.GetSendFromHostNode(name);
-      if (send_node != nullptr) {
-        send_from_host_nodes.push_back(send_node);
-      }
-    }
-  }
-
-  // Make a copy of all the graph nodes needed to evaluate the send_from_host
-  // nodes, inlining any functions as needed.
-  TF_RETURN_IF_ERROR(MakePrunedGraphCopyAndInline(
-      graph, send_from_host_nodes, pruned_graph, node_images, library));
-
-  // Perform shape inference on the pruned graph.
-  shape_refiner->set_require_shape_inference_fns(false);
-  FixupSourceAndSinkEdges(pruned_graph->get());
-  std::vector<Node*> post_order;
-  GetReversePostOrder(*(*pruned_graph), &post_order);
-  for (auto node : post_order) {
-    // Ignore the status returned by the shape_refiner. At this point we want
-    // the best effort shapes, even if no shape function is registered for a
-    // node.
-    Status status = shape_refiner->AddNode(node);
-    VLOG_IF(1, !status.ok()) << "Shape inference failed for node: " << status;
-  }
-
-  return Status::OK();
-}
-
-Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
-    Graph* graph_out, FunctionLibraryDefinition* library) {
-  std::unique_ptr<Graph> pruned_graph;
-  ShapeRefiner shape_refiner(graph_out->versions(), graph_out->op_registry());
-  std::unordered_map<const Node*, Node*> node_images;
-  TF_RETURN_IF_ERROR(MakeGraphForOutsideCompilationSends(
-      *graph_out, &pruned_graph, &shape_refiner, &node_images, library));
-
-  for (auto& subgraph_entry : subgraphs_) {
-    Subgraph& subgraph = subgraph_entry.second;
-    // Find all the recv_at_host nodes in this subgraph.
-    std::vector<string> outside_compilation_names;
-    subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names);
-    std::unordered_set<string> recv_at_host_names;
-    for (const auto& name : outside_compilation_names) {
-      Node* recv_node = subgraph.GetRecvAtHostNode(name);
-      if (recv_node != nullptr) {
-        recv_at_host_names.insert(recv_node->name());
-      }
-    }
-    // For each send_from_host node, do as much shape inference as possible
-    // without knowing the shape of the recv_at_host nodes, and store the
-    // result, along with enough information to complete the job at compile time
-    // once the recv_at_host shapes are known.
-    for (const auto& name : outside_compilation_names) {
-      Node* send_node = subgraph.GetSendFromHostNode(name);
-      std::vector<TensorShapeProto> static_shape;
-      std::unique_ptr<GraphDef> graphdef;
-      if (send_node != nullptr) {
-        TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend(
-            *pruned_graph, shape_refiner, recv_at_host_names,
-            node_images[send_node], library, &static_shape, &graphdef));
-        if (graphdef == nullptr) {
-          VLOG(2) << "Send node  " << send_node->name() << " shapes";
-          for (int i = 0; i < static_shape.size(); ++i) {
-            VLOG(2) << static_shape[i].DebugString();
-          }
-        } else {
-          VLOG(2) << "Send node " << send_node->name() << " graph\n"
-                  << graphdef->DebugString();
-        }
-      }
-      TF_RETURN_IF_ERROR(
-          subgraph.AddShapeInferenceInfo(name, static_shape, graphdef.get()));
-    }
-    if (!outside_compilation_names.empty()) {
-      TF_RETURN_IF_ERROR(subgraph.ReplaceFunctionDef(library));
-    }
-  }
-
-  return Status::OK();
-}
-
-Status Encapsulator::BuildOutputGraph(bool parallel_checking, Graph* graph_out,
-                                      FunctionLibraryDefinition* library) {
+Status Encapsulator::BuildOutputGraph(bool parallel_checking,
+                                      Graph* graph_out) {
   // Map from nodes in the input graph to nodes in the output graph.
   std::unordered_map<const Node*, Node*> node_images;
 
@@ -1966,9 +1522,6 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking, Graph* graph_out,
   TF_RETURN_IF_ERROR(
       AddEdgesToOutputGraph(node_images, parallel_checking, graph_out));
 
-  TF_RETURN_IF_ERROR(
-      GetShapeInfoForOutsideCompilationSends(graph_out, library));
-
   return Status::OK();
 }
 
@@ -1992,7 +1545,7 @@ Status EncapsulateSubgraphsInFunctions(
   std::unique_ptr<Graph> out(new Graph(library));
   out->set_versions(graph_in.versions());
   TF_RETURN_IF_ERROR(
-      encapsulator.BuildOutputGraph(parallel_checking, out.get(), library));
+      encapsulator.BuildOutputGraph(parallel_checking, out.get()));
 
   *graph_out = std::move(out);
   return Status::OK();
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 5032a0935d..b100861d5e 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -29,181 +29,17 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-template <class Tkey, class Tvalue>
-bool EqualProtoMap(const proto2::Map<Tkey, Tvalue>& a,
-                   const proto2::Map<Tkey, Tvalue>& b,
-                   const std::function<string(const Tkey&)>& key_to_string,
-                   const std::function<string(const Tvalue&)>& value_to_string,
-                   const std::function<bool(const Tkey&, const Tvalue&,
-                                            const Tvalue&)>& compare,
-                   const string& map_name, string* diff) {
-  for (const auto& elt_a : a) {
-    const auto iter = b.find(elt_a.first);
-    if (iter == b.end()) {
-      if (diff) {
-        *diff = strings::StrCat(
-            map_name, " expected: contains element with key '",
-            key_to_string(elt_a.first), "' got: map has no such element");
-      }
-      return false;
-    }
-    if (!compare(elt_a.first, elt_a.second, iter->second)) {
-      if (diff) {
-        *diff = strings::StrCat(map_name, " expected: element with key '",
-                                key_to_string(elt_a.first), " has value '",
-                                value_to_string(elt_a.second), "' got: '",
-                                value_to_string(iter->second), "'");
-      }
-      return false;
-    }
-  }
-  for (const auto& elt_b : b) {
-    const auto iter = a.find(elt_b.first);
-    if (iter == a.end()) {
-      if (diff) {
-        *diff = strings::StrCat(map_name, " got: contains element with key '",
-                                key_to_string(elt_b.first),
-                                "' expected: map has no such element");
-      }
-      return false;
-    }
-  }
-  return true;
-}
-
-bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
-                          const string& diff_preamble, string* diff) {
-  if (a.op() != b.op()) {
-    if (diff) {
-      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
-                              ", expected op '", a.op(), "' got '", b.op());
-    }
-    return false;
-  }
-  if (a.device() != b.device()) {
-    if (diff) {
-      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
-                              ", expected device '", a.device(), "' got '",
-                              b.device());
-    }
-    return false;
-  }
-  if (a.input_size() != b.input_size()) {
-    if (diff) {
-      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
-                              ", expected ", a.input_size(), " inputs got ",
-                              b.input_size(), " expected:\n", a.DebugString(),
-                              "\ngot:\n", b.DebugString());
-    }
-    return false;
-  }
-  for (int i = 0; i < a.input_size(); ++i) {
-    if (a.input(i) != b.input(i)) {
-      if (diff) {
-        *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
-                                " input ", i, ", expected ", a.input(i),
-                                " got ", b.input(i), " expected:\n",
-                                a.DebugString(), "\ngot:\n", b.DebugString());
-      }
-      return false;
-    }
-  }
-  return EqualProtoMap<string, AttrValue>(
-      a.attr(), b.attr(), [](const string& s) { return s; },
-      [](const AttrValue& v) { return v.DebugString(); },
-      [](const string& key, const AttrValue& av, const AttrValue& bv) {
-        if (key == "shape_inference_graph") {
-          // Default serialization of GraphDef is unstable because maps don't
-          // serialize deterministically. Rather than go through the hoops to
-          // turn on deterministic serialization of this attr just for this
-          // test, add logic here to compare determinstically.
-          GraphDef ga;
-          if (!ga.ParseFromString(av.s())) {
-            return false;
-          }
-          GraphDef gb;
-          if (!gb.ParseFromString(bv.s())) {
-            return false;
-          }
-          return EqualGraphDef(ga, gb, nullptr);
-        } else {
-          return av.DebugString() == bv.DebugString();
-        }
-      },
-      strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()),
-      diff);
-}
-
 bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b,
                       string* diff) {
-  if (a.signature().DebugString() != b.signature().DebugString()) {
+  // TODO(phawkins) use a more sophisticated equality test.
+  if (a.DebugString() != b.DebugString()) {
     if (diff) {
-      *diff = strings::StrCat("Signature mismatch for function ",
+      *diff = strings::StrCat("Definition mismatch for function ",
                               a.signature().name(), ", expected:\n",
-                              a.signature().DebugString(), "\ngot:\n",
-                              b.signature().DebugString());
+                              a.DebugString(), "\ngot:\n", b.DebugString());
     }
     return false;
   }
-  if (!EqualProtoMap<string, AttrValue>(
-          a.attr(), b.attr(), [](const string& s) { return s; },
-          [](const AttrValue& v) { return v.DebugString(); },
-          [](const string& key, const AttrValue& av, const AttrValue& bv) {
-            return av.DebugString() == bv.DebugString();
-          },
-          strings::StrCat("attr mismatch for function ", a.signature().name()),
-          diff)) {
-    return false;
-  }
-  if (!EqualProtoMap<string, string>(
-          a.ret(), b.ret(), [](const string& s) { return s; },
-          [](const string& s) { return s; },
-          [](const string& key, const string& av, const string& bv) {
-            return av == bv;
-          },
-          strings::StrCat("ret mismatch for function ", a.signature().name()),
-          diff)) {
-    return false;
-  }
-  for (int i = 0; i < a.node_def_size(); ++i) {
-    bool found = false;
-    for (int j = 0; j < b.node_def_size(); ++j) {
-      if (a.node_def(i).name() == b.node_def(j).name()) {
-        if (!EqualFunctionNodeDef(
-                a.node_def(i), b.node_def(j),
-                strings::StrCat("Function ", a.signature().name()), diff)) {
-          return false;
-        }
-        found = true;
-        break;
-      }
-    }
-    if (!found) {
-      if (diff) {
-        *diff = strings::StrCat("Function ", a.signature().name(),
-                                ", expected: has node '", a.node_def(i).name(),
-                                "' got: no node of that name");
-      }
-      return false;
-    }
-  }
-  for (int i = 0; i < b.node_def_size(); ++i) {
-    bool found = false;
-    for (int j = 0; j < a.node_def_size(); ++j) {
-      if (b.node_def(i).name() == a.node_def(j).name()) {
-        found = true;
-        break;
-      }
-    }
-    if (!found) {
-      if (diff) {
-        *diff = strings::StrCat("Function ", a.signature().name(),
-                                ", got: has node '", b.node_def(i).name(),
-                                "' expected: no node of that name");
-      }
-      return false;
-    }
-  }
   return true;
 }
 
@@ -248,64 +84,29 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected,
 
 // TODO(misard): remove these fake registrations once there are real Ops to be
 // compiled.
-REGISTER_OP("_XlaHostCompute")
-    .Input("inputs: Tinputs")
-    .Output("outputs: Toutputs")
-    .Attr("Tinputs: list(type) >= 0")
-    .Attr("Toutputs: list(type) >= 0")
-    .Attr("key: string")
-    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
+REGISTER_OP("_XlaSendToHost")
+    .Input("input: dtypes")
+    .Attr("dtypes: list(type) >= 0");
+
+REGISTER_OP("_XlaRecvFromHost")
+    .Output("output: dtypes")
+    .Attr("dtypes: list(type) >= 0");
 
 REGISTER_OP("_XlaSendFromHost")
-    .Input("input: Tinputs")
-    .Attr("Tinputs: list(type) >= 0")
-    .Attr("key: string")
-    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
+    .Input("input: dtypes")
+    .Attr("dtypes: list(type) >= 0");
 
 REGISTER_OP("_XlaRecvAtHost")
-    .Output("output: Toutputs")
-    .Attr("Toutputs: list(type) >= 0")
-    .Attr("key: string")
-    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
-
-REGISTER_OP("InputTest")
-    .Output("o: float")
-    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
-      c->set_output(0, c->UnknownShape());
-      return Status::OK();
-    });
-
-REGISTER_OP("InputTestShaped")
-    .Output("o: float")
-    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
-      c->set_output(0, c->Vector(2));
-      return Status::OK();
-    });
-
-REGISTER_OP("UnaryTest")
-    .Input("a: float")
-    .Output("o: float")
-    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
-      ::tensorflow::shape_inference::ShapeHandle o;
-      TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o));
-      c->set_output(0, o);
-      return Status::OK();
-    });
+    .Output("output: dtypes")
+    .Attr("dtypes: list(type) >= 0");
+
+REGISTER_OP("InputTest").Output("o: float");
+
+REGISTER_OP("UnaryTest").Input("a: float").Output("o: float");
 REGISTER_OP("BinaryTest")
     .Input("a: float")
     .Input("b: float")
-    .Output("o: float")
-    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
-      ::tensorflow::shape_inference::ShapeHandle o;
-      TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o));
-      c->set_output(0, o);
-      return Status::OK();
-    });
-REGISTER_OP("BinaryTest2")
-    .Input("a: float")
-    .Input("b: float")
-    .Output("o: float")
-    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
+    .Output("o: float");
 
 REGISTER_OP("AddNLikeTest")
     .Input("inputs: N * T")
@@ -323,48 +124,22 @@ Node* Input(const GraphDefBuilder::Options& opts) {
   return ops::SourceOp("InputTest", opts);
 }
 
-Node* InputShaped(const GraphDefBuilder::Options& opts) {
-  return ops::SourceOp("InputTestShaped", opts);
-}
-
-Node* KnownShape(const gtl::ArraySlice<int>& shape,
-                 const GraphDefBuilder::Options& opts) {
-  if (opts.HaveError()) return nullptr;
-  NodeBuilder node_builder(opts.GetNameForOp("Const"), "Const",
-                           opts.op_registry());
-  TensorProto value;
-  value.set_dtype(DT_FLOAT);
-  for (int dim : shape) {
-    value.mutable_tensor_shape()->add_dim()->set_size(dim);
-  }
-  return opts.WithAttr("value", value)
-      .WithAttr("dtype", DT_FLOAT)
-      .FinalizeBuilder(&node_builder);
-}
-
-Node* RecvAtHost(const string& key, const gtl::ArraySlice<DataType>& dtypes,
+Node* RecvAtHost(const gtl::ArraySlice<DataType>& dtypes,
                  const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp("_XlaRecvAtHost"),
                            "_XlaRecvAtHost", opts.op_registry());
-  return opts.WithAttr("Toutputs", dtypes)
-      .WithAttr("key", key)
-      .FinalizeBuilder(&node_builder);
+  return opts.WithAttr("dtypes", dtypes).FinalizeBuilder(&node_builder);
 }
 
-Node* SendFromHost(const string& key, const std::vector<ops::NodeOut>& inputs,
+Node* SendFromHost(const std::vector<ops::NodeOut>& inputs,
+                   const gtl::ArraySlice<DataType>& dtypes,
                    const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp("_XlaSendFromHost"),
                            "_XlaSendFromHost", opts.op_registry());
   node_builder.Input(inputs);
-  std::vector<DataType> dtypes;
-  for (const auto& node : inputs) {
-    dtypes.push_back(node.dt);
-  }
-  return opts.WithAttr("key", key)
-      .WithAttr("Tinputs", dtypes)
-      .FinalizeBuilder(&node_builder);
+  return opts.WithAttr("dtypes", dtypes).FinalizeBuilder(&node_builder);
 }
 
 Node* Unary(ops::NodeOut a, const GraphDefBuilder::Options& opts) {
@@ -376,11 +151,6 @@ Node* Binary(ops::NodeOut a, ops::NodeOut b,
   return ops::BinaryOp("BinaryTest", std::move(a), std::move(b), opts);
 }
 
-Node* BinaryUnknownShape(ops::NodeOut a, ops::NodeOut b,
-                         const GraphDefBuilder::Options& opts) {
-  return ops::BinaryOp("BinaryTest2", std::move(a), std::move(b), opts);
-}
-
 Node* AddNLike(const std::vector<ops::NodeOut>& inputs,
                const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
@@ -806,21 +576,6 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  string shape_string_expected;
-  {
-    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
-    Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
-                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape.opts().WithName("E"));
-    SendFromHost("host_compute_channel_F1_O1", {e},
-                 shape.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape_graph;
-    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
-    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
-  }
-
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
@@ -829,18 +584,19 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
           {{"c"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}, {"C"}},
           {{"F"},
            "BinaryTest",
-           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
+           {"C:o:0", "outside_compilation_O1_recv:output:0"},
            {},
-           {"outside_compilation_O1_host_compute"}},
-          {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           {"outside_compilation_O1_recv"}},
+          {{"outside_compilation_O1_send"},
+           "_XlaSendToHost",
            {"C:o:0", "c:o:0"},
-           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
-            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
            {"c"}},
+          {{"outside_compilation_O1_recv"},
+           "_XlaRecvFromHost",
+           {},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
+           {"outside_compilation_O1_send"}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -856,11 +612,11 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
     Node* call = b2.opts().FinalizeBuilder(&node_builder);
 
     Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+        RecvAtHost({DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
                      b2.opts().WithName("E").WithControlInputs({recv, b}));
-    Node* send = SendFromHost("host_compute_channel_F1_O1", {e},
+    Node* send = SendFromHost({e}, {DT_FLOAT},
                               b2.opts()
                                   .WithName("outside_compilation_F1_O1_send")
                                   .WithControlInput(e));
@@ -918,71 +674,37 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  string shape_string_expected_1;
-  {
-    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
-    Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
-                   shape1.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape1.opts().WithName("E"));
-    SendFromHost("host_compute_channel_F1_O1", {e},
-                 shape1.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape1_graph;
-    TF_EXPECT_OK(shape1.ToGraphDef(&shape1_graph));
-    EXPECT_TRUE(shape1_graph.SerializeToString(&shape_string_expected_1));
-  }
-
-  string shape_string_expected_2;
-  {
-    GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
-    Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
-                   shape2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
-                     shape2.opts().WithName("E"));
-    Node* recv2 =
-        RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT},
-                   shape2.opts().WithName("outside_compilation_F1_O2_recv"));
-    Node* h = Binary(ops::NodeOut(recv2, 0), e, shape2.opts().WithName("H"));
-    SendFromHost("host_compute_channel_F1_O2", {h},
-                 shape2.opts().WithName("outside_compilation_F1_O2_send"));
-    GraphDef shape2_graph;
-    TF_EXPECT_OK(shape2.ToGraphDef(&shape2_graph));
-    EXPECT_TRUE(shape2_graph.SerializeToString(&shape_string_expected_2));
-  }
-
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}},
-          {{"I"},
-           "UnaryTest",
-           {"outside_compilation_O2_host_compute:outputs:0"}},
+          {{"I"}, "UnaryTest", {"outside_compilation_O2_recv:output:0"}},
           {{"F"},
            "BinaryTest",
-           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
+           {"C:o:0", "outside_compilation_O1_recv:output:0"},
            {},
-           {"outside_compilation_O1_host_compute"}},
-          {{"outside_compilation_O2_host_compute"},
-           "_XlaHostCompute",
+           {"outside_compilation_O1_recv"}},
+          {{"outside_compilation_O2_send"},
+           "_XlaSendToHost",
            {"D:o:0", "F:o:0"},
-           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
-            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"key", "host_compute_channel_F1_O2"},
-            {"shape_inference_graph", shape_string_expected_2},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
            {"F"}},
-          {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+          {{"outside_compilation_O1_send"},
+           "_XlaSendToHost",
            {"C:o:0", "D:o:0"},
-           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
-            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected_1},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
            {"D"}},
+          {{"outside_compilation_O2_recv"},
+           "_XlaRecvFromHost",
+           {},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
+           {"outside_compilation_O2_send"}},
+          {{"outside_compilation_O1_recv"},
+           "_XlaRecvFromHost",
+           {},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
+           {"outside_compilation_O1_send"}},
       },
       {{"i_0_retval", "I:o:0"}});
 
@@ -998,24 +720,23 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
     Node* call = b2.opts().FinalizeBuilder(&node_builder);
 
     Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+        RecvAtHost({DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e},
+    Node* send1 = SendFromHost({e}, {DT_FLOAT},
                                b2.opts()
                                    .WithName("outside_compilation_F1_O1_send")
                                    .WithControlInput(e));
 
     Node* recv2 =
-        RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT},
+        RecvAtHost({DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O2_recv"));
     Node* g = Binary(e, ops::NodeOut(recv2, 1),
                      b2.opts().WithName("G").WithControlInputs({recv2, e}));
     Node* h = Binary(ops::NodeOut(recv2, 0), e, b2.opts().WithName("H"));
-    Node* send2 =
-        SendFromHost("host_compute_channel_F1_O2", {h},
-                     b2.opts().WithName("outside_compilation_F1_O2_send"));
+    Node* send2 = SendFromHost(
+        {h}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O2_send"));
 
     Node* s = NoOp(b2.opts()
                        .WithName("F1_sequencer")
@@ -1037,8 +758,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
 
   {
     GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
-    Node* a = InputShaped(b1.opts().WithName("A"));
-    Node* b = InputShaped(b1.opts().WithName("B"));
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
     Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
     Node* d =
         Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
@@ -1070,24 +791,6 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  string shape_string_expected;
-  {
-    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
-    Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
-                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape.opts().WithName("E"));
-    SendFromHost("host_compute_channel_F1_O1", {e},
-                 shape.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape_graph;
-    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
-    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
-  }
-
-  TensorShapeProto shape_proto_expected;
-  shape_proto_expected.add_dim()->set_size(2);
-
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"},
       {"f_0_retval:float", "d_0_retval:float"}, {},
@@ -1096,18 +799,19 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"},
            "BinaryTest",
-           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
+           {"C:o:0", "outside_compilation_O1_recv:output:0"},
            {},
-           {"outside_compilation_O1_host_compute"}},
-          {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           {"outside_compilation_O1_recv"}},
+          {{"outside_compilation_O1_send"},
+           "_XlaSendToHost",
            {"C:o:0", "D:o:0"},
-           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
-            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
            {"D"}},
+          {{"outside_compilation_O1_recv"},
+           "_XlaRecvFromHost",
+           {},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
+           {"outside_compilation_O1_send"}},
       },
       {{"d_0_retval", "D:o:0"}, {"f_0_retval", "F:o:0"}});
 
@@ -1118,16 +822,16 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
           {{"G"}, "BinaryTest", {"e_0_arg", "f_0_arg"}},
           {{"I"},
            "BinaryTest",
-           {"f_0_arg", "outside_compilation_O1_host_compute:outputs:0"}},
-          {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           {"f_0_arg", "outside_compilation_O1_recv:output:0"}},
+          {{"outside_compilation_O1_send"},
+           "_XlaSendToHost",
            {"G:o:0"},
-           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"key", "host_compute_channel_F2_O1"},
-            {"shape_inference_graph", ""},
-            {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
+          {{"outside_compilation_O1_recv"},
+           "_XlaRecvFromHost",
+           {},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
+           {"outside_compilation_O1_send"}},
       },
       {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}});
 
@@ -1135,15 +839,15 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
         new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
     GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
-    Node* a = InputShaped(b2.opts().WithName("A"));
-    Node* b = InputShaped(b2.opts().WithName("B"));
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
 
     Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+        RecvAtHost({DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e},
+    Node* send1 = SendFromHost({e}, {DT_FLOAT},
                                b2.opts()
                                    .WithName("outside_compilation_F1_O1_send")
                                    .WithControlInput(e));
@@ -1153,14 +857,12 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
     Node* s1 = NoOp(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
 
-    Node* recv2 =
-        RecvAtHost("host_compute_channel_F2_O1", {DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F2_O1_recv"));
+    Node* recv2 = RecvAtHost(
+        {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_recv"));
     Node* h = Binary(ops::NodeOut(call1, 1), recv2,
                      b2.opts().WithName("H").WithControlInput(s1));
-    Node* send2 =
-        SendFromHost("host_compute_channel_F2_O1", {h},
-                     b2.opts().WithName("outside_compilation_F2_O1_send"));
+    Node* send2 = SendFromHost(
+        {h}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_send"));
 
     NodeBuilder node_builder2("F2", "F2", lib_def.get());
     node_builder2.Input(e).Input(call1);
@@ -1186,7 +888,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
 
   {
     GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
-    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* a = Input(b1.opts().WithName("A"));
     Node* b = Input(b1.opts().WithName("B"));
     Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
     Node* d =
@@ -1206,9 +908,6 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  TensorShapeProto shape_proto_expected;
-  shape_proto_expected.add_dim()->set_size(2);
-
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
       {
@@ -1216,16 +915,11 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"},
            "BinaryTest",
-           {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
-          {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           {"D:o:0", "outside_compilation_O1_recv:output:0"}},
+          {{"outside_compilation_O1_recv"},
+           "_XlaRecvFromHost",
            {},
-           {{"Tinputs", gtl::ArraySlice<DataType>({})},
-            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
-            {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1233,13 +927,12 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
         new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
     GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
-    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
     Node* e = Unary(a, b2.opts().WithName("E"));
-    Node* send1 =
-        SendFromHost("host_compute_channel_F1_O1", {e},
-                     b2.opts().WithName("outside_compilation_F1_O1_send"));
+    Node* send1 = SendFromHost(
+        {e}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_send"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
@@ -1261,7 +954,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
 
   {
     GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
-    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* a = Input(b1.opts().WithName("A"));
     Node* b = Input(b1.opts().WithName("B"));
     Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
     Node* d =
@@ -1282,9 +975,6 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  TensorShapeProto shape_proto_expected;
-  shape_proto_expected.add_dim()->set_size(2);
-
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
       {
@@ -1292,17 +982,17 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"},
            "BinaryTest",
-           {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
-          {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+           {"D:o:0", "outside_compilation_O1_recv:output:0"}},
+          {{"outside_compilation_O1_send"},
+           "_XlaSendToHost",
            {},
-           {{"Tinputs", gtl::ArraySlice<DataType>({})},
-            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
-            {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}},
+           {{"dtypes", gtl::ArraySlice<DataType>({})}},
            {"D"}},
+          {{"outside_compilation_O1_recv"},
+           "_XlaRecvFromHost",
+           {},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
+           {"outside_compilation_O1_send"}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1310,16 +1000,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
         new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
     GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
-    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
     Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+        RecvAtHost({}, b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Unary(a, b2.opts().WithName("E").WithControlInput(recv1));
-    Node* send1 =
-        SendFromHost("host_compute_channel_F1_O1", {e},
-                     b2.opts().WithName("outside_compilation_F1_O1_send"));
+    Node* send1 = SendFromHost(
+        {e}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_send"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
@@ -1367,14 +1055,10 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"}, "UnaryTest", {"D:o:0"}},
-          {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
+          {{"outside_compilation_O1_send"},
+           "_XlaSendToHost",
            {"D:o:0"},
-           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"Toutputs", gtl::ArraySlice<DataType>({})},
-            {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
-            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1385,9 +1069,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 = RecvAtHost(
+        {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Unary(recv1, b2.opts().WithName("E"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
@@ -1435,19 +1118,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
-          {{"F"},
-           "UnaryTest",
+          {{"F"}, "UnaryTest", {"D:o:0"}, {}, {"outside_compilation_O1_recv"}},
+          {{"outside_compilation_O1_send"},
+           "_XlaSendToHost",
            {"D:o:0"},
+           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
+          {{"outside_compilation_O1_recv"},
+           "_XlaRecvFromHost",
            {},
-           {"outside_compilation_O1_host_compute"}},
-          {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
-           {"D:o:0"},
-           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"Toutputs", gtl::ArraySlice<DataType>({})},
-            {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
-            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+           {{"dtypes", gtl::ArraySlice<DataType>({})}},
+           {"outside_compilation_O1_send"}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1458,11 +1138,10 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* recv1 =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 = RecvAtHost(
+        {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Unary(recv1, b2.opts().WithName("E"));
-    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {},
+    Node* send1 = SendFromHost({}, {},
                                b2.opts()
                                    .WithName("outside_compilation_F1_O1_send")
                                    .WithControlInput(e));
@@ -1536,110 +1215,5 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
   TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
 }
 
-// Test for shape inference of outside compilation.
-TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
-  FunctionDefLibrary library;
-  GraphDef graphdef;
-
-  {
-    *library.add_function() = test::function::XTimesTwo();
-
-    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
-    Node* a = InputShaped(b1.opts().WithName("A"));
-    Node* b = Input(b1.opts().WithName("B"));
-    // Give nodes 'c' and 'd' names that collide after lowercasing.
-    Node* c = Unary(a, b1.opts().WithName("C"));
-    Node* d = Unary(b, b1.opts().WithName("c").WithControlInput(c).WithAttr(
-                           "_encapsulate", "F1"));
-    Node* e = BinaryUnknownShape(c, d,
-                                 b1.opts()
-                                     .WithName("E")
-                                     .WithControlInputs({b, d})
-                                     .WithAttr("_encapsulate", "F1")
-                                     .WithAttr("_outside", "O1"));
-    Node* f = Binary(c, e,
-                     b1.opts().WithName("F").WithControlInput(e).WithAttr(
-                         "_encapsulate", "F1"));
-    Binary(a, f, b1.opts().WithName("G").WithControlInput(e));
-    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
-  }
-
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
-
-  FunctionDefLibrary library_expected;
-  GraphDef graphdef_expected;
-
-  string shape_string_expected;
-  {
-    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
-    Node* known = KnownShape({2}, shape.opts().WithName("KnownShape/_0"));
-    Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
-                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = BinaryUnknownShape(known, recv, shape.opts().WithName("E"));
-    SendFromHost("host_compute_channel_F1_O1", {e},
-                 shape.opts().WithName("outside_compilation_F1_O1_send"));
-    GraphDef shape_graph;
-    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
-    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
-  }
-
-  *library_expected.add_function() = test::function::XTimesTwo();
-  *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"b_0_arg:float", "c_0_arg:float"}, {"f_0_retval:float"}, {},
-      {
-          {{"c"}, "UnaryTest", {"b_0_arg"}, {}, {}},
-          {{"F"},
-           "BinaryTest",
-           {"c_0_arg", "outside_compilation_O1_host_compute:outputs:0"},
-           {},
-           {"outside_compilation_O1_host_compute"}},
-          {{"outside_compilation_O1_host_compute"},
-           "_XlaHostCompute",
-           {"c:o:0"},
-           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
-            {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", shape_string_expected},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
-           {"c"}},
-      },
-      {{"f_0_retval", "F:o:0"}});
-
-  {
-    std::unique_ptr<FunctionLibraryDefinition> lib_def(
-        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
-    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
-    Node* a = InputShaped(b2.opts().WithName("A"));
-    Node* b = Input(b2.opts().WithName("B"));
-    Node* c = Unary(a, b2.opts().WithName("C"));
-
-    NodeBuilder node_builder("F1", "F1", lib_def.get());
-    node_builder.Input(b).Input(c);
-    Node* call =
-        b2.opts().WithControlInputs({c}).FinalizeBuilder(&node_builder);
-
-    Node* recv =
-        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = BinaryUnknownShape(
-        c, ops::NodeOut(recv, 0),
-        b2.opts().WithName("E").WithControlInputs({recv, b}));
-    Node* send = SendFromHost("host_compute_channel_F1_O1", {e},
-                              b2.opts()
-                                  .WithName("outside_compilation_F1_O1_send")
-                                  .WithControlInput(e));
-
-    Node* s = NoOp(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}));
-
-    Binary(a, call, b2.opts().WithName("G").WithControlInputs({s, e}));
-    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
-  }
-
-  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
-  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
-}
-
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index eae8e6c3c1..d6b576166c 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -1064,36 +1064,26 @@ Status FunctionLibraryDefinition::AddLibrary(
   return Status::OK();
 }
 
-Status FunctionLibraryDefinition::RemoveFunction(const string& func) {
+void FunctionLibraryDefinition::RemoveFunction(const string& func) {
   const auto& i = function_defs_.find(func);
-  if (i == function_defs_.end()) {
-    return errors::InvalidArgument("Tried to remove non-existent function ",
-                                   func);
-  }
+  DCHECK(i != function_defs_.end());
   function_defs_.erase(i);
-  return Status::OK();
 }
 
-Status FunctionLibraryDefinition::RemoveGradient(const string& func) {
+void FunctionLibraryDefinition::RemoveGradient(const string& func) {
   const auto& i = func_grad_.find(func);
-  if (i == func_grad_.end()) {
-    return errors::InvalidArgument("Tried to remove non-existent gradient ",
-                                   func);
-  }
+  DCHECK(i != func_grad_.end());
   func_grad_.erase(i);
-  return Status::OK();
 }
 
 void FunctionLibraryDefinition::Remove(
     const std::vector<string>& funcs,
     const std::vector<string>& funcs_with_grads) {
   for (const string& f : funcs) {
-    Status s = RemoveFunction(f);
-    DCHECK(s.ok());
+    RemoveFunction(f);
   }
   for (const string& f : funcs_with_grads) {
-    Status s = RemoveGradient(f);
-    DCHECK(s.ok());
+    RemoveGradient(f);
   }
 }
 
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 7d0e15641d..b933ee0b0e 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -312,14 +312,6 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // This operation is atomic.
   Status AddGradientDef(const GradientDef& grad);
 
-  // Remove function `func` from the library. Returns non-OK Status unless
-  // `func` is in the library.
-  Status RemoveFunction(const string& func);
-
-  // Remove gradient of function `func` from the library. Returns non-OK Status
-  // unless `func` has a gradient.
-  Status RemoveGradient(const string& func);
-
   // Adds the functions and gradients in 'other' to this function library.
   // Duplicate functions and gradients are ignored.
   // This operation is atomic.
@@ -392,6 +384,13 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // attr from.
   const FunctionDef* GetAttrImpl(const NodeDef& ndef) const;
 
+  // Remove function `func` from the library. `func` must be in the library.
+  void RemoveFunction(const string& func);
+
+  // Remove gradient of function `func` from the library. `func` must have
+  // a gradient.
+  void RemoveGradient(const string& func);
+
   // Remove all functions in `funcs` and all gradients of
   // functions in `funcs_with_grads` from this library.
   void Remove(const std::vector<string>& funcs,
-- 
GitLab


From 997c209f9b8210f4bdc44a0172e0b64f5f7761c0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 1 Feb 2018 11:50:14 -0800
Subject: [PATCH 532/966] Added a utility to traverse the graph in reverse DFS
 order, identifying loops in the process.

PiperOrigin-RevId: 184172483
---
 tensorflow/core/grappler/graph_view.h         |   1 +
 tensorflow/core/grappler/utils/BUILD          |  26 +++++
 tensorflow/core/grappler/utils/traversal.cc   |  80 ++++++++++++++
 tensorflow/core/grappler/utils/traversal.h    |  39 +++++++
 .../core/grappler/utils/traversal_test.cc     | 101 ++++++++++++++++++
 5 files changed, 247 insertions(+)
 create mode 100644 tensorflow/core/grappler/utils/traversal.cc
 create mode 100644 tensorflow/core/grappler/utils/traversal.h
 create mode 100644 tensorflow/core/grappler/utils/traversal_test.cc

diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index f4e2de75a6..173ce9c09c 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -46,6 +46,7 @@ class GraphView {
   };
 
   explicit GraphView(GraphDef* graph);
+  GraphDef* GetGraph() const { return graph_; }
   NodeDef* GetNode(const string& node_name) const;
   // Get the specified input port. Note that the special '-1' port_id can be
   // used to access the controlling nodes (i.e. the nodes connected to node_name
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 534f7a063f..137d51790d 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -99,3 +99,29 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
+
+cc_library(
+    name = "traversal",
+    srcs = ["traversal.cc"],
+    hdrs = ["traversal.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+tf_cc_test(
+    name = "traversal_test",
+    srcs = ["traversal_test.cc"],
+    deps = [
+        ":traversal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/core/grappler/utils/traversal.cc b/tensorflow/core/grappler/utils/traversal.cc
new file mode 100644
index 0000000000..f44f53c4e6
--- /dev/null
+++ b/tensorflow/core/grappler/utils/traversal.cc
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/traversal.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+void ReverseDfs(const GraphView& graph_view, const std::vector<NodeDef*>& from,
+                const std::function<void(NodeDef*)>& pre_order,
+                const std::function<void(NodeDef*)>& post_order,
+                const std::function<void(NodeDef*, NodeDef*)>& on_back_edge) {
+  // Stack of work to do.
+  struct StackElem {
+    NodeDef* node;
+    bool children_visited;
+    NodeDef* src;
+  };
+  std::vector<StackElem> stack;
+
+  stack.reserve(from.size());
+  for (NodeDef* node : from) {
+    stack.push_back(StackElem{node, false});
+  }
+
+  enum NodeState { NOT_VISITED = 0, VISITING = 1, DONE = 2 };
+  std::unordered_map<NodeDef*, NodeState> node_state;
+  while (!stack.empty()) {
+    StackElem w = stack.back();
+    stack.pop_back();
+
+    if (w.children_visited) {
+      // We've processed all the children of this node
+      node_state[w.node] = DONE;
+      if (post_order) {
+        post_order(w.node);
+      }
+      continue;
+    }
+
+    auto& rslt = node_state[w.node];
+    if (rslt == DONE) {
+      continue;
+    } else if (rslt == VISITING) {
+      // Loop detected
+      if (on_back_edge) {
+        on_back_edge(w.src, w.node);
+      }
+      continue;
+    }
+    rslt = VISITING;
+    if (pre_order) {
+      pre_order(w.node);
+    }
+
+    // Enqueue the node again with the children_visited flag set to true.
+    stack.push_back(StackElem{w.node, true, w.src});
+
+    // Now enqueu the node children.
+    for (const auto fanin : graph_view.GetFanins(*w.node, true)) {
+      stack.push_back(StackElem{fanin.node, false, w.node});
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal.h b/tensorflow/core/grappler/utils/traversal.h
new file mode 100644
index 0000000000..bb3fa090e8
--- /dev/null
+++ b/tensorflow/core/grappler/utils/traversal.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
+
+#include <functional>
+#include "tensorflow/core/grappler/graph_view.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Traverse the graph in reverse dfs order, starting from the list of nodes
+// specified in the 'from' argument. The pre_order and post_order functors will
+// be called on each reachable node (including the 'from' nodes) in pre and post
+// order. If loops are found, the on_back_edge functor will be called on the
+// corresponding back edges. Moreover, the pre and post order will assume that
+// these back edges will be cut.
+void ReverseDfs(const GraphView& graph_view, const std::vector<NodeDef*>& from,
+                const std::function<void(NodeDef*)>& pre_order,
+                const std::function<void(NodeDef*)>& post_order,
+                const std::function<void(NodeDef*, NodeDef*)>& on_back_edge);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
diff --git a/tensorflow/core/grappler/utils/traversal_test.cc b/tensorflow/core/grappler/utils/traversal_test.cc
new file mode 100644
index 0000000000..cc68bd1a96
--- /dev/null
+++ b/tensorflow/core/grappler/utils/traversal_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/traversal.h"
+//#include "tensorflow/core/framework/node_def.pb.h"
+//#include "tensorflow/core/lib/core/status_test_util.h"
+//#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class TraversalTest : public ::testing::Test {
+ protected:
+  static NodeDef CreateNode(const string& name,
+                            const std::vector<string>& inputs) {
+    return CreateNode(name, "", inputs);
+  }
+  static NodeDef CreateNode(const string& name, const string& op,
+                            const std::vector<string>& inputs) {
+    NodeDef node;
+    node.set_name(name);
+    if (!op.empty()) {
+      node.set_op(op);
+    }
+    for (const string& input : inputs) {
+      node.add_input(input);
+    }
+    return node;
+  }
+};
+
+TEST_F(TraversalTest, ReverseDfsNoLoop) {
+  GraphDef graph;
+  *graph.add_node() = CreateNode("2", {"5"});
+  *graph.add_node() = CreateNode("0", {"5", "4"});
+  *graph.add_node() = CreateNode("1", {"4", "3"});
+  *graph.add_node() = CreateNode("3", {"2"});
+  *graph.add_node() = CreateNode("5", {});
+  *graph.add_node() = CreateNode("4", {});
+
+  std::vector<NodeDef*> start_nodes = {graph.mutable_node(1),
+                                       graph.mutable_node(2)};
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  bool found_back_edge = false;
+  ReverseDfs(
+      GraphView(&graph), start_nodes,
+      [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); },
+      [&post_order](NodeDef* n) { post_order.push_back(n->name()); },
+      [&found_back_edge](NodeDef*, NodeDef*) { found_back_edge = true; });
+
+  EXPECT_EQ(std::vector<string>({"1", "4", "3", "2", "5", "0"}), pre_order);
+  EXPECT_EQ(std::vector<string>({"4", "5", "2", "3", "1", "0"}), post_order);
+  EXPECT_FALSE(found_back_edge);
+}
+
+TEST_F(TraversalTest, ReverseDfsWithLoop) {
+  GraphDef graph;
+  // Create a loop
+  *graph.add_node() = CreateNode("2", "Merge", {"1", "5"});
+  *graph.add_node() = CreateNode("3", "Switch", {"2"});
+  *graph.add_node() = CreateNode("4", "Identity", {"3"});
+  *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
+  *graph.add_node() = CreateNode("1", "Enter", {});
+  *graph.add_node() = CreateNode("6", "Exit", {"3"});
+
+  std::vector<NodeDef*> start_nodes = {graph.mutable_node(5)};
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+  ReverseDfs(
+      GraphView(&graph), start_nodes,
+      [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); },
+      [&post_order](NodeDef* n) { post_order.push_back(n->name()); },
+      [&back_edges](NodeDef* src, NodeDef* dst) {
+        back_edges.push_back(strings::StrCat(src->name(), "->", dst->name()));
+      });
+
+  EXPECT_EQ(std::vector<string>({"6", "3", "2", "1", "5", "4"}), pre_order);
+  EXPECT_EQ(std::vector<string>({"1", "4", "5", "2", "3", "6"}), post_order);
+  EXPECT_EQ(std::vector<string>({"4->3"}), back_edges);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
-- 
GitLab


From 77e6a452188e83ae4498cc3ae23e20e60061b367 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 1 Feb 2018 11:50:23 -0800
Subject: [PATCH 533/966] [tf.data] Fix bug where captured resources in shared
 iterators were invisible.

This change ensures that a shared iterator (which requires a private
FunctionLibraryRuntime that outlasts the calling op's runtime, because
it can outlive a single session) uses the same Device as a non-shared
iterator, and hence capturing resources from the creating graph will
work as intended.

Fixes #16481.

PiperOrigin-RevId: 184172498
---
 tensorflow/core/kernels/data/iterator_ops.cc  | 30 ++++++++++++---
 tensorflow/python/data/kernel_tests/BUILD     |  3 ++
 .../kernel_tests/iterator_ops_cluster_test.py | 37 +++++++++++++++++++
 3 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index b37bd672ad..dd5f4a4554 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/common_runtime/renamed_device.h"
 #include "tensorflow/core/common_runtime/threadpool_device.h"
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
@@ -516,15 +517,32 @@ class IteratorHandleOp : public OpKernel {
     return Status::OK();
   }
 
+  template <typename To, typename From>  // use like this: down_cast<T*>(foo);
+  static inline To down_cast(From* f) {  // so we only accept pointers
+    static_assert(
+        (std::is_base_of<From, typename std::remove_pointer<To>::type>::value),
+        "target type not derived from source type");
+
+    // We skip the assert and hence the dynamic_cast if RTTI is disabled.
+#if !defined(__GNUC__) || defined(__GXX_RTTI)
+    // Uses RTTI in dbg and fastbuild. asserts are disabled in opt builds.
+    assert(f == nullptr || dynamic_cast<To>(f) != nullptr);
+#endif  // !defined(__GNUC__) || defined(__GXX_RTTI)
+    return static_cast<To>(f);
+  }
+
   FunctionLibraryRuntime* CreatePrivateFLR(
       OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
       std::unique_ptr<FunctionLibraryDefinition>* flib_def,
       std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) {
-    Device* device = new ThreadPoolDevice(
-        SessionOptions(), ctx->device()->attributes().name(), Bytes(256 << 20),
-        DeviceLocality(), cpu_allocator());
-
-    device_mgr->reset(new DeviceMgr({device}));
+    // Wrap the existing device in order to see any captured resources
+    // in its resource manager. The existing device will outlive the
+    // IteratorResource, because we are storing the IteratorResource
+    // in that device's resourc manager.
+    Device* wrapped_device = RenamedDevice::NewRenamedDevice(
+        ctx->device()->name(), down_cast<Device*>(ctx->device()),
+        false /* owns_underlying */, false /* isolate_session_state */);
+    device_mgr->reset(new DeviceMgr({wrapped_device}));
     flib_def->reset(new FunctionLibraryDefinition(
         *ctx->function_library()->GetFunctionLibraryDefinition()));
     pflr->reset(new ProcessFunctionLibraryRuntime(
@@ -532,7 +550,7 @@ class IteratorHandleOp : public OpKernel {
         {} /* TODO(mrry): OptimizerOptions? */,
         nullptr /* TODO(mrry): ClusterFLR */));
 
-    return (*pflr)->GetFLR(device->name());
+    return (*pflr)->GetFLR(ctx->device()->name());
   }
 
   mutex mu_;
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 43cbde69d9..8b8adefa65 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -357,6 +357,9 @@ tf_py_test(
         "//tensorflow/python:session",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:lookup_ops",
     ],
     grpc_enabled = True,
     tags = [
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
index 45dfa13720..2c65c49ebd 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
@@ -21,6 +21,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
@@ -28,6 +29,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
@@ -103,6 +106,40 @@ class IteratorClusterTest(test.TestCase):
                                    "/job:worker/replica:0/task:1/cpu:0",
                                    workers[0].target)
 
+  def testCaptureHashTableInSharedIterator(self):
+    worker, _ = test_util.create_local_cluster(1, 1)
+
+    # NOTE(mrry): We must use the V2 variants of `HashTable`
+    # etc. because these produce a `tf.resource`-typed output that is
+    # compatible with the in-graph function implementation.
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup_ops.HashTable(
+        lookup_ops.KeyValueTensorInitializer(keys, values),
+        default_val,
+        shared_name="shared_table")
+
+    input_sentences = dataset_ops.Dataset.from_tensor_slices(
+        ["brain brain tank salad surgery", "surgery brain"])
+
+    iterator = (
+        input_sentences.map(lambda x: string_ops.string_split([x]).values).map(
+            table.lookup)
+        .make_initializable_iterator(shared_name="shared_iterator"))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with session.Session(worker[0].target) as sess:
+      sess.run(table.init)
+      sess.run(init_op)
+      self.assertAllEqual([0, 0, -1, 1, 2], sess.run(get_next))
+
+    with session.Session(worker[0].target) as sess:
+      self.assertAllEqual([2, 0], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From ccedcbe14c798fb3b227030cf85b4fe89406f0d8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 11:54:13 -0800
Subject: [PATCH 534/966] Update deprecated API use

PiperOrigin-RevId: 184173047
---
 tensorflow/core/distributed_runtime/tensor_coding.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc
index fe2d1a1293..34a4013547 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding.cc
@@ -81,7 +81,7 @@ void TensorResponse::InitPartial(const RecvTensorResponse& response) {
 Status TensorResponse::ParseFrom(Source* source) {
   if (!on_host_) {
     protobuf::io::CodedInputStream input(source->contents());
-    input.SetTotalBytesLimit(INT_MAX, INT_MAX);  // Unlimited
+    input.SetTotalBytesLimit(INT_MAX);  // Unlimited
 
     // Pre-parse into local storage, then delegate to device.
     if (!meta_.ParseFromCodedStream(&input) || !input.ConsumedEntireMessage()) {
@@ -217,7 +217,7 @@ bool TensorResponse::ParseTensorSubmessage(
 
 bool TensorResponse::ParseFast(Source* source) {
   protobuf::io::CodedInputStream input(source->contents());
-  input.SetTotalBytesLimit(INT_MAX, INT_MAX);  // Unlimited
+  input.SetTotalBytesLimit(INT_MAX);  // Unlimited
   while (true) {
     auto p = input.ReadTagWithCutoff(127);
     int tag = GetTagFieldNumber(p.first);
-- 
GitLab


From 9ba944b79dff684954a4d4591e792d4fa1e858c2 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 1 Feb 2018 12:05:23 -0800
Subject: [PATCH 535/966] Internal change.

PiperOrigin-RevId: 184174800
---
 tensorflow/python/keras/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index a9dd8d8e9d..fdac22bb53 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -482,6 +482,7 @@ py_test(
     size = "small",
     srcs = ["_impl/keras/layers/normalization_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
-- 
GitLab


From 6bca4c6c21e2602b707f22b2ea29ef8cee27ec9c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 12:17:08 -0800
Subject: [PATCH 536/966] Add a utility module that contains helper functions
 usable from within generated code. Add a helper for the control dependencies
 context manager.

PiperOrigin-RevId: 184176409
---
 tensorflow/BUILD                              |  1 +
 tensorflow/contrib/py2tf/BUILD                |  1 +
 tensorflow/contrib/py2tf/__init__.py          |  3 +-
 tensorflow/contrib/py2tf/converters/BUILD     |  1 +
 .../py2tf/converters/side_effect_guards.py    | 19 ++------
 .../converters/side_effect_guards_test.py     |  2 +
 tensorflow/contrib/py2tf/impl/config.py       |  3 +-
 tensorflow/contrib/py2tf/utils/BUILD          | 37 ++++++++++++++++
 tensorflow/contrib/py2tf/utils/__init__.py    | 21 +++++++++
 .../contrib/py2tf/utils/context_managers.py   | 41 ++++++++++++++++++
 .../py2tf/utils/context_managers_test.py      | 43 +++++++++++++++++++
 11 files changed, 155 insertions(+), 17 deletions(-)
 create mode 100644 tensorflow/contrib/py2tf/utils/BUILD
 create mode 100644 tensorflow/contrib/py2tf/utils/__init__.py
 create mode 100644 tensorflow/contrib/py2tf/utils/context_managers.py
 create mode 100644 tensorflow/contrib/py2tf/utils/context_managers_test.py

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 66a2ecd8b5..bda0a83af3 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -539,6 +539,7 @@ filegroup(
         "//tensorflow/contrib/py2tf/impl:all_files",
         "//tensorflow/contrib/py2tf/pyct:all_files",
         "//tensorflow/contrib/py2tf/pyct/static_analysis:all_files",
+        "//tensorflow/contrib/py2tf/utils:all_files",
         "//tensorflow/contrib/quantize:all_files",
         "//tensorflow/contrib/receptive_field:all_files",
         "//tensorflow/contrib/reduce_slice_ops:all_files",
diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/py2tf/BUILD
index cea3738499..479ea9beca 100644
--- a/tensorflow/contrib/py2tf/BUILD
+++ b/tensorflow/contrib/py2tf/BUILD
@@ -23,6 +23,7 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/contrib/py2tf/impl",
+        "//tensorflow/contrib/py2tf/utils",
         "@gast_archive//:gast",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/contrib/py2tf/__init__.py b/tensorflow/contrib/py2tf/__init__.py
index 878941b3a3..0d51bf0bf2 100644
--- a/tensorflow/contrib/py2tf/__init__.py
+++ b/tensorflow/contrib/py2tf/__init__.py
@@ -21,12 +21,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.py2tf import utils
 from tensorflow.contrib.py2tf.impl.api import convert
 from tensorflow.contrib.py2tf.impl.api import graph_ready
 from tensorflow.contrib.py2tf.impl.api import to_code
 from tensorflow.contrib.py2tf.impl.api import to_graph
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['to_graph', 'to_code', 'convert', 'graph_ready']
+_allowed_symbols = ['to_graph', 'to_code', 'convert', 'graph_ready', 'utils']
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
index b61fda3e91..3853c60f99 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -46,6 +46,7 @@ py_library(
     deps = [
         ":converters",
         "//tensorflow/contrib/py2tf/pyct/static_analysis",
+        "//tensorflow/contrib/py2tf/utils",
         "@gast_archive//:gast",
     ],
 )
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
index 46a2269c20..ffca743542 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
@@ -109,31 +109,20 @@ class SideEffectGuardTransformer(gast.NodeTransformer):
       #   opt.minimize(loss)
       # or:
       #   tf.py_func(...)
-      args_scope = anno.getanno(node.value, 'args_scope')
-      temp_name = self.namer.new_symbol('temp', args_scope.parent.referenced)
-      # TODO(mdan): Unsafe reference modification!
-      args_scope.mark_write(temp_name)
       template = """
-        temp_result = call
-        if temp_result is not None:
-          if not isinstance(temp_result, (list, tuple)):
-            temp_result = (temp_result,)
-          ctx = tf.control_dependencies(temp_result)
-        else:
-          ctx = contextmanager(lambda: (yield))()
-        with ctx:
-          # TODO(mdan): Also insert ops to re-fetch if variables are involved.
+        with py2tf_utils.control_dependency_on_returns(tf, call):
+          # TODO(mdan): Also insert ops to re-fetch if variables are involved?
           pass  # Will be removed below.
       """
       # TODO(mdan): This is brittle. Reorganize the mechanism.
-      statements = templates.replace(
-          template, call=node.value, temp_result=temp_name)
+      statements = templates.replace(template, call=node.value)
       control_deps_guard = statements[-1]
       control_deps_guard.body = []
 
       # First, attempt to gate future evaluation of args. If that's not
       # possible, gate all remaining statements (and that may fail too, see
       # _visit_and_reindent.
+      args_scope = anno.getanno(node.value, 'args_scope')
       guarded_args = tuple(args_scope.used & (args_scope.parent.modified
                                               | args_scope.parent.returned))
       if guarded_args:
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
index 5c56973dc2..452d7ab2be 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.py2tf import utils
 from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.converters import side_effect_guards
 from tensorflow.contrib.py2tf.pyct import compiler
@@ -46,6 +47,7 @@ class SideEffectGuardsTest(converter_test_base.TestCase):
     node = side_effect_guards.transform(node, TestNamer())
     result = compiler.ast_to_object(node)
     setattr(result, 'state_ops', state_ops)
+    setattr(result, 'py2tf_utils', utils)
 
     # TODO(mdan): Configure the namespaces instead of doing these hacks.
     ops.identity = array_ops.identity
diff --git a/tensorflow/contrib/py2tf/impl/config.py b/tensorflow/contrib/py2tf/impl/config.py
index 0892241983..6525806a09 100644
--- a/tensorflow/contrib/py2tf/impl/config.py
+++ b/tensorflow/contrib/py2tf/impl/config.py
@@ -36,4 +36,5 @@ NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 # TODO(mdan): Make sure copybara renames the reference below.
 COMPILED_IMPORT_STATEMENTS = (
     'import tensorflow as tf',
-)
+    'from tensorflow.contrib.py2tf import utils as '
+    'py2tf_utils')
diff --git a/tensorflow/contrib/py2tf/utils/BUILD b/tensorflow/contrib/py2tf/utils/BUILD
new file mode 100644
index 0000000000..01804aa883
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/BUILD
@@ -0,0 +1,37 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "utils",
+    srcs = [
+        "__init__.py",
+        "context_managers.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+    ],
+)
+
+py_test(
+    name = "context_managers_test",
+    srcs = ["context_managers_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/utils/__init__.py b/tensorflow/contrib/py2tf/utils/__init__.py
new file mode 100644
index 0000000000..bca33e89e9
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/__init__.py
@@ -0,0 +1,21 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility module that contains APIs usable in the generated code."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.utils.context_managers import control_dependency_on_returns
diff --git a/tensorflow/contrib/py2tf/utils/context_managers.py b/tensorflow/contrib/py2tf/utils/context_managers.py
new file mode 100644
index 0000000000..47d9839997
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/context_managers.py
@@ -0,0 +1,41 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various context managers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+
+def control_dependency_on_returns(tf, return_value):
+  """Create a TF control dependency on the return values of a function.
+
+  If the function had no return value, a no-op context is returned.
+
+  Args:
+    tf: The TensorFlow module.
+    return_value: The return value to set as control dependency.
+
+  Returns:
+    A context manager.
+  """
+  if return_value is None:
+    return contextlib.contextmanager(lambda: (yield))()
+  # TODO(mdan): Filter to tensor objects.
+  if not isinstance(return_value, (list, tuple)):
+    return_value = (return_value,)
+  return tf.control_dependencies(return_value)
diff --git a/tensorflow/contrib/py2tf/utils/context_managers_test.py b/tensorflow/contrib/py2tf/utils/context_managers_test.py
new file mode 100644
index 0000000000..c903f08252
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/context_managers_test.py
@@ -0,0 +1,43 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for context_managers module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.utils import context_managers
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+class ContextManagersTest(test.TestCase):
+
+  def test_control_dependency_on_returns(self):
+    # Just dry run them.
+    with context_managers.control_dependency_on_returns(ops, None):
+      pass
+    with context_managers.control_dependency_on_returns(
+        ops, constant_op.constant(1)):
+      pass
+    with context_managers.control_dependency_on_returns(
+        ops, [constant_op.constant(1),
+              constant_op.constant(2)]):
+      pass
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From 448f6c70fa9fac05ecf291e59b20cf2451c65a9f Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 1 Feb 2018 12:38:55 -0800
Subject: [PATCH 537/966] Made the addn optimization aware of the graph
 topology

PiperOrigin-RevId: 184179246
---
 tensorflow/core/grappler/optimizers/BUILD     |  1 +
 .../grappler/optimizers/memory_optimizer.cc   | 67 +++++++++++++++++--
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 68de03e81c..8b9885e4c1 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -289,6 +289,7 @@ cc_library(
         "//tensorflow/core/grappler/costs:graph_memory",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/grappler/utils:traversal",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 6f95a00fa3..ffa03db262 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/static_schedule.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -497,7 +498,7 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
     if (!IsAddN(node)) {
       continue;
     }
-    // There is nothing to gain by optimizing nodes with 2 inputs of fewer.
+    // There is nothing to gain by optimizing nodes with 2 or fewer inputs.
     if (view.NumFanins(node, false) <= 2) {
       continue;
     }
@@ -559,6 +560,54 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
       VLOG(1) << "Missing properties for " << node->name();
       continue;
     }
+
+    // Compute a topological ordering for the node fanin.
+    std::unordered_map<NodeDef*, int> topo_order;
+    ReverseDfs(view, {node}, nullptr,
+               [&topo_order](NodeDef* n) {
+                 int topo_index = topo_order.size();
+                 topo_order[n] = topo_index;
+               },
+               nullptr);
+
+    std::vector<int> input_topo_index;
+
+    for (int i = 0; i < node->input_size(); ++i) {
+      const string& input = node->input(i);
+      const string node_name = NodeName(input);
+      NodeDef* node = view.GetNode(node_name);
+      input_topo_index.push_back(topo_order.at(node));
+    }
+    int min_input_topo_index = INT_MAX;
+    int min_input_id = -1;
+    for (int i = 0; i < node->input_size(); ++i) {
+      if (IsControlInput(node->input(i))) {
+        // control inputs are always last.
+        break;
+      }
+      const int current = input_topo_index[i];
+      if (current < min_input_topo_index) {
+        min_input_topo_index = current;
+        min_input_id = i;
+      }
+    }
+    CHECK_LE(0, min_input_id);
+    std::vector<string> pre_ctrl_deps;
+    std::vector<string> post_ctrl_deps;
+    for (int i = node->input_size() - 1; i >= 0; --i) {
+      if (!IsControlInput(node->input(i))) {
+        // control inputs are always last.
+        break;
+      }
+      if (input_topo_index[i] < min_input_topo_index) {
+        // These control dependencies can be executed before the node.
+        pre_ctrl_deps.push_back(node->input(i));
+      } else {
+        // These control dependencies should be executed after the node.
+        post_ctrl_deps.push_back(node->input(i));
+      }
+    }
+
     const TensorShapeProto& shape =
         properties.GetOutputProperties(node->name())[0].shape();
     DataType dtype = node->attr().at("T").type();
@@ -573,13 +622,19 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
     *(*tmp_var->mutable_attr())["shape"].mutable_shape() = shape;
     (*tmp_var->mutable_attr())["var_name"].set_s(tmp_var->name());
 
+    for (const string& ctrl_dep : pre_ctrl_deps) {
+      *tmp_var->add_input() = ctrl_dep;
+    }
+    *tmp_var->add_input() =
+        AsControlDependency(NodeName(node->input(min_input_id)));
+
     // Initialize it to zero
     NodeDef* zeros = item->graph.add_node();
     zeros->set_name(strings::StrCat(node->name(), "/tmp_var_zeros"));
     zeros->set_op("ZerosLike");
     zeros->set_device(device);
     (*zeros->mutable_attr())["T"].set_type(dtype);
-    *zeros->add_input() = node->input(0);
+    *zeros->add_input() = node->input(min_input_id);
 
     NodeDef* initialize = item->graph.add_node();
     initialize->set_name(strings::StrCat(node->name(), "/tmp_var_initializer"));
@@ -593,9 +648,7 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
     std::vector<NodeDef*> accumulates;
     for (int i = 0; i < node->input_size(); ++i) {
       const string& input = node->input(i);
-      if (IsControlInput(input)) {
-        *zeros->add_input() = input;
-      } else {
+      if (!IsControlInput(input)) {
         NodeDef* accumulate = item->graph.add_node();
         accumulate->set_name(
             strings::StrCat(node->name(), "/tmp_var_accum_", i));
@@ -618,6 +671,10 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
     for (const NodeDef* accum : accumulates) {
       *node->add_input() = AsControlDependency(accum->name());
     }
+    for (const string& ctrl_dep : post_ctrl_deps) {
+      *node->add_input() = ctrl_dep;
+    }
+
     updated_graph = true;
   }
 
-- 
GitLab


From 1453d4c61178dbb4dea9e48790ea8fd7c58cd1d5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 13:13:10 -0800
Subject: [PATCH 538/966] Verify tensor contents of tflite model

PiperOrigin-RevId: 184183725
---
 tensorflow/contrib/lite/tools/BUILD           |   4 +
 tensorflow/contrib/lite/tools/verifier.cc     | 170 ++++++++++++++-
 tensorflow/contrib/lite/tools/verifier.h      |   4 +-
 .../contrib/lite/tools/verifier_test.cc       | 200 +++++++++++++-----
 4 files changed, 324 insertions(+), 54 deletions(-)

diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 1bffcfb987..4d3b553b22 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -99,8 +99,11 @@ cc_library(
     srcs = ["verifier.cc"],
     hdrs = ["verifier.h"],
     deps = [
+        "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
+        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
@@ -112,6 +115,7 @@ cc_test(
         ":verifier",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/contrib/lite/tools/verifier.cc
index 95a0895379..726e2aaa31 100644
--- a/tensorflow/contrib/lite/tools/verifier.cc
+++ b/tensorflow/contrib/lite/tools/verifier.cc
@@ -14,13 +14,32 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/lite/tools/verifier.h"
+#include <climits>
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/string_util.h"
 #include "tensorflow/contrib/lite/version.h"
 
 namespace tflite {
 
 namespace {
 
+// Reports error message when the reporter is set.
+void ReportError(ErrorReporter* error_reporter, const char* format, ...) {
+  if (error_reporter) {
+    va_list args;
+    va_start(args, format);
+    error_reporter->Report(format, args);
+    va_end(args);
+  }
+}
+
+// Returns the int32_t value pointed by ptr.
+const uint32_t* GetIntPtr(const char* ptr) {
+  return reinterpret_cast<const uint32_t*>(ptr);
+}
+
+// Verifies flatbuffer format of the model contents and returns the in-memory
+// model.
 const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) {
   ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
   if (VerifyModelBuffer(verifier)) {
@@ -30,14 +49,159 @@ const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) {
   }
 }
 
+const uint32_t kMaxNumString = UINT_MAX / sizeof(int32_t) - 2;
+
+// Verifies string tensor has legit buffer contents that follow the schema
+// defined in lite/string_util.h
+bool VerifyStringTensorBuffer(const Buffer& buffer,
+                              ErrorReporter* error_reporter) {
+  uint32_t buffer_size = buffer.data()->size();
+  const char* buffer_ptr = reinterpret_cast<const char*>(buffer.data()->data());
+
+  uint32_t num_strings = *GetIntPtr(buffer_ptr);
+  if (num_strings > kMaxNumString) {
+    ReportError(error_reporter,
+                "String tensor has invalid num of string set: %d", num_strings);
+    return false;
+  }
+  uint32_t header_offsets =
+      static_cast<uint32_t>(num_strings + 2) * sizeof(int32_t);
+
+  if (buffer_size < header_offsets) {
+    ReportError(error_reporter,
+                "String tensor buffer requires at least %d bytes, but is "
+                "allocated with %d bytes",
+                header_offsets, buffer_size);
+    return false;
+  }
+
+  uint32_t prev_ptr = header_offsets;
+  uint32_t offset = sizeof(int32_t);
+
+  if (*GetIntPtr(buffer_ptr + offset) != header_offsets) {
+    ReportError(error_reporter,
+                "String tensor buffer initial offset must be: %d",
+                header_offsets);
+    return false;
+  }
+  offset += sizeof(int32_t);
+  for (int i = 1; i <= num_strings; i++, offset += sizeof(int32_t)) {
+    int string_offset = *GetIntPtr(buffer_ptr + offset);
+    if (string_offset < prev_ptr || string_offset > buffer_size) {
+      ReportError(error_reporter, "String tensor buffer is invalid: index %d",
+                  i);
+      return false;
+    }
+  }
+  if (*GetIntPtr(buffer_ptr + offset - sizeof(int32_t)) != buffer_size) {
+    ReportError(error_reporter, "String tensor buffer last offset must be %d",
+                buffer_size);
+    return false;
+  }
+  return true;
+}
+
+// Verifies numeric tensor has legit buffer.
+bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
+                               ErrorReporter* error_reporter) {
+  uint64_t bytes_required = 1;
+  for (int dim : *tensor.shape()) {
+    bytes_required *= dim;
+    if (bytes_required > UINT_MAX) {
+      ReportError(error_reporter, "Tensor dimension overflow");
+      return false;
+    }
+  }
+  switch (tensor.type()) {
+    case TensorType_FLOAT32:
+      bytes_required *= sizeof(float);
+      break;
+    case TensorType_INT32:
+      bytes_required *= sizeof(int32_t);
+      break;
+    case TensorType_UINT8:
+      bytes_required *= sizeof(uint8_t);
+      break;
+    case TensorType_INT64:
+      bytes_required *= sizeof(int64_t);
+      break;
+    case TensorType_FLOAT16:
+      // FALLTHROUGH_INTENDED;
+    default:
+      ReportError(error_reporter, "Invalid tensor type: %d", tensor.type());
+      return false;
+  }
+  if (bytes_required > UINT_MAX) {
+    ReportError(error_reporter, "Tensor dimension overflow");
+    return false;
+  }
+
+  if (bytes_required != buffer.data()->size()) {
+    ReportError(
+        error_reporter,
+        "Tensor requires %d bytes, but is allocated with %d bytes buffer",
+        bytes_required, buffer.data()->size());
+    return false;
+  }
+  return true;
+
+  // TODO(yichengfan): verify quantized tensors.
+}
+
+// Verifies tensors have valid properties and legit buffer if set.
+bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
+  if (!model.subgraphs()) {
+    return true;
+  }
+  for (const auto& subgraph : *model.subgraphs()) {
+    if (!subgraph->tensors()) {
+      return true;
+    }
+    for (const auto& tensor : *subgraph->tensors()) {
+      if (!tensor->buffer()) {
+        return true;
+      }
+      if (tensor->buffer() >= model.buffers()->size()) {
+        ReportError(error_reporter, "Invalid tensor buffer index: %d",
+                    tensor->buffer());
+        return false;
+      }
+      auto* buffer = model.buffers()->Get(tensor->buffer());
+      if (!buffer || !buffer->data()) {
+        ReportError(error_reporter, "Tensor buffer %d not set",
+                    tensor->buffer());
+        return false;
+      }
+
+      if (tensor->type() == TensorType_STRING) {
+        if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
+          return false;
+        }
+      } else {
+        if (!VerifyNumericTensorBuffer(*tensor, *buffer, error_reporter)) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
 }  // namespace
 
-bool Verify(const void* buf, size_t len) {
+bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter) {
   const Model* model = VerifyFlatbufferAndGetModel(buf, len);
   if (model == nullptr) {
+    ReportError(error_reporter, "Invalid flatbuffer format");
     return false;
   }
-
-  return model->version() == TFLITE_SCHEMA_VERSION;
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    ReportError(error_reporter, "Invalid model version %d", model->version());
+    return false;
+  }
+  if (!VerifyTensors(*model, error_reporter)) {
+    return false;
+  }
+  return true;
 }
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/contrib/lite/tools/verifier.h
index 03e1f22b7e..d2bf3c91d5 100644
--- a/tensorflow/contrib/lite/tools/verifier.h
+++ b/tensorflow/contrib/lite/tools/verifier.h
@@ -18,13 +18,15 @@ limitations under the License.
 
 #include <stdio.h>
 
+#include "tensorflow/contrib/lite/error_reporter.h"
+
 namespace tflite {
 
 // Verifies the integrity of a Tensorflow Lite flatbuffer model file.
 // Currently, it verifies:
 // * The file is following a legit flatbuffer schema.
 // * The model is in supported version.
-bool Verify(const void* buf, size_t len);
+bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter);
 
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
index 0481a55a78..244d4f0396 100644
--- a/tensorflow/contrib/lite/tools/verifier_test.cc
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -28,31 +28,62 @@ using flatbuffers::FlatBufferBuilder;
 using flatbuffers::Offset;
 using flatbuffers::Vector;
 
-// Class that abstracts the list of buffers at the end of the TF Lite structure
-class DeferredBufferWriter {
+// Build single subgraph model.
+class TfLiteFlatbufferModelBuilder {
  public:
-  DeferredBufferWriter() {
-    data_.push_back({});  // sentinel empty buffer.
+  TfLiteFlatbufferModelBuilder() {
+    buffers_.push_back(
+        CreateBuffer(builder_, builder_.CreateVector(std::vector<uint8_t>{})));
   }
 
-  Offset<Vector<Offset<Buffer>>> BuildBuffers(FlatBufferBuilder *builder) {
-    std::vector<Offset<Buffer>> buffer_vector;
-    for (const auto &vec : data_) {
-      auto data_buffer = builder->CreateVector(vec.data(), vec.size());
-      buffer_vector.push_back(tflite::CreateBuffer(*builder, data_buffer));
+  void AddTensor(const std::vector<int>& shape, tflite::TensorType type,
+                 const std::vector<uint8_t>& buffer, const char* name) {
+    int buffer_index = 0;
+    if (!buffer.empty()) {
+      buffer_index = buffers_.size();
+      buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector(buffer)));
     }
-    return builder->CreateVector(buffer_vector);
+    tensors_.push_back(CreateTensorDirect(builder_, &shape, type, buffer_index,
+                                          name, /*quantization=*/0));
   }
 
-  // Registers a buffer index and takes ownership of the data to write to it.
-  int Record(std::vector<uint8_t> data) {
-    int buffer_index = data_.size();
-    data_.emplace_back(std::move(data));
-    return buffer_index;
+  void AddOperator(const std::vector<int32_t>& inputs,
+                   const std::vector<int32_t>& outputs,
+                   tflite::BuiltinOperator builtin_op, const char* custom_op) {
+    operator_codes_.push_back(
+        CreateOperatorCodeDirect(builder_, builtin_op, custom_op));
+    operators_.push_back(CreateOperator(
+        builder_, operator_codes_.size() - 1, builder_.CreateVector(inputs),
+        builder_.CreateVector(outputs), BuiltinOptions_NONE,
+        /*builtin_options=*/0,
+        /*custom_options=*/0, tflite::CustomOptionsFormat_FLEXBUFFERS));
+  }
+
+  void FinishModel(const std::vector<int32_t>& inputs,
+                   const std::vector<int32_t>& outputs) {
+    auto subgraph = std::vector<Offset<SubGraph>>({CreateSubGraph(
+        builder_, builder_.CreateVector(tensors_),
+        builder_.CreateVector(inputs), builder_.CreateVector(outputs),
+        builder_.CreateVector(operators_),
+        builder_.CreateString("test_subgraph"))});
+    auto result = CreateModel(
+        builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(operator_codes_),
+        builder_.CreateVector(subgraph), builder_.CreateString("test_model"),
+        builder_.CreateVector(buffers_));
+    tflite::FinishModelBuffer(builder_, result);
+  }
+
+  bool Verify() {
+    return tflite::Verify(builder_.GetBufferPointer(), builder_.GetSize(),
+                          DefaultErrorReporter());
   }
 
  private:
-  std::vector<std::vector<unsigned char>> data_;
+  FlatBufferBuilder builder_;
+  std::vector<Offset<Operator>> operators_;
+  std::vector<Offset<OperatorCode>> operator_codes_;
+  std::vector<Offset<Tensor>> tensors_;
+  std::vector<Offset<Buffer>> buffers_;
 };
 
 TEST(VerifyModel, TestEmptyModel) {
@@ -62,43 +93,26 @@ TEST(VerifyModel, TestEmptyModel) {
                            /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
-  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize()));
+  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                     DefaultErrorReporter()));
 }
 
 TEST(VerifyModel, TestSimpleModel) {
-  FlatBufferBuilder builder;
-  auto inputs = builder.CreateVector<int32_t>({0});
-  auto outputs = builder.CreateVector<int32_t>({1});
-  auto operator_codes = builder.CreateVector(std::vector<Offset<OperatorCode>>{
-      CreateOperatorCodeDirect(builder, BuiltinOperator_CUSTOM, "test")});
-  auto operators =
-      builder.CreateVector(std::vector<Offset<Operator>>{CreateOperator(
-          builder, /*opcode_index=*/0,
-          /*inputs=*/builder.CreateVector<int32_t>({0}),
-          /*outputs=*/builder.CreateVector<int32_t>({1}), BuiltinOptions_NONE,
-          /*builtin_options=*/0,
-          /*custom_options=*/0, ::tflite::CustomOptionsFormat_FLEXBUFFERS)});
-  std::vector<int> shape;
-  auto tensors = builder.CreateVector(std::vector<Offset<Tensor>>{
-      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/0,
-                         "input", /*quantization=*/0),
-      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/0,
-                         "output", /*quantization=*/0)});
-  auto subgraph = std::vector<Offset<SubGraph>>(
-      {CreateSubGraph(builder, tensors, inputs, outputs, operators,
-                      builder.CreateString("Main"))});
-
-  auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, operator_codes,
-                           builder.CreateVector(subgraph),
-                           builder.CreateString("SmartReply"), /*buffers=*/0);
-
-  ::tflite::FinishModelBuffer(builder, model);
-  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize()));
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_TRUE(builder.Verify());
 }
 
 TEST(VerifyModel, TestCorruptedData) {
   string model = "123";
-  ASSERT_FALSE(Verify(model.data(), model.size()));
+  ASSERT_FALSE(Verify(model.data(), model.size(), /*error_reporter=*/nullptr));
 }
 
 TEST(VerifyModel, TestUnsupportedVersion) {
@@ -106,7 +120,8 @@ TEST(VerifyModel, TestUnsupportedVersion) {
   auto model = CreateModel(builder, /*version=*/1, /*operator_codes=*/0,
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
-  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize()));
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                      DefaultErrorReporter()));
 }
 
 TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
@@ -116,20 +131,105 @@ TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
-  string model_content(reinterpret_cast<char *>(builder.GetBufferPointer()),
+  string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
                        builder.GetSize());
   for (int i = 0; i < model_content.size(); i++) {
     model_content[i] = (model_content[i] + 137) % 255;
-    EXPECT_FALSE(Verify(model_content.data(), model_content.size()))
+    EXPECT_FALSE(Verify(model_content.data(), model_content.size(),
+                        DefaultErrorReporter()))
         << "Fail at position: " << i;
   }
 }
 
+TEST(VerifyModel, TestIntTensorShapeIsGreaterThanBuffer) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({2, 1}, TensorType_UINT8, {1, 2, 3, 4}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TestIntTensorShapeOverflow) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({1024, 2048, 4096}, TensorType_UINT8, {1, 2, 3, 4},
+                    "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TensorBufferIsNotValid) {
+  FlatBufferBuilder builder;
+  std::vector<int> shape = {2, 3};
+  auto tensors = builder.CreateVector(std::vector<Offset<Tensor>>{
+      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/2,
+                         "input", /*quantization=*/0)});
+  auto subgraph = std::vector<Offset<SubGraph>>(
+      {CreateSubGraph(builder, tensors, /*inputs=*/0, /*outputs=*/0,
+                      /*operators=*/0, builder.CreateString("Main"))});
+
+  auto buffers = builder.CreateVector(std::vector<Offset<Buffer>>{
+      CreateBuffer(builder,
+                   builder.CreateVector(std::vector<uint8>{1, 2, 3, 4, 5, 6})),
+  });
+
+  auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, /*operator_codes=*/0,
+                           builder.CreateVector(subgraph),
+                           builder.CreateString("SmartReply"), buffers);
+
+  ::tflite::FinishModelBuffer(builder, model);
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                      DefaultErrorReporter()));
+}
+
+TEST(VerifyModel, StringTensorHasInvalidNumString) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {0x00, 0x00, 0x00, 0x20, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'},
+      "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorOffsetTooSmall) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 12, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorOffsetOutOfRange) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 22, 0, 0, 0, 'A', 'B'}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorIsLargerThanRequired) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B', 'C'},
+      "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
 // TODO(yichengfan): make up malicious files to test with.
 
 }  // namespace tflite
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
-- 
GitLab


From c460a245a25467a66d7319544afb92407057b424 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 13:13:12 -0800
Subject: [PATCH 539/966] Fix segfault when Softmax is first in graph

PiperOrigin-RevId: 184183730
---
 tensorflow/contrib/lite/toco/export_tensorflow.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 529df3cd2e..4c70b01a9d 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -621,7 +621,8 @@ void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op,
                             GraphDef* tensorflow_graph) {
   string softmax_input;
   Operator* providing_op = GetOpWithOutput(model, src_op.inputs[0]);
-  if (providing_op->type == OperatorType::kTensorFlowReshape) {
+  if (providing_op != nullptr &&
+      providing_op->type == OperatorType::kTensorFlowReshape) {
     softmax_input = src_op.inputs[0];
   } else {
     // Insert a reshape operator that reduces the dimensions down to the 2 that
-- 
GitLab


From 7092e612c1ec51b4aeafe9201706331dd4c3199e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 13:33:20 -0800
Subject: [PATCH 540/966] Fixes a type conversion bug in
 losses.compute_weighted_loss for reduction=SUM_OVER_BATCH_SIZE.

PiperOrigin-RevId: 184186573
---
 tensorflow/python/kernel_tests/losses_test.py | 28 +++++++++++++++++++
 tensorflow/python/ops/losses/losses_impl.py   |  2 +-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index 81af3a0887..00c6706593 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -1345,6 +1345,34 @@ class ComputeWeightedLossTest(test.TestCase):
               self.assertAllClose(
                   np.mean(self._raw_losses), unweighted_loss.eval())
 
+  def testUnweightedFromPlaceholder(self):
+    for reduction in losses.Reduction.all():
+      with ops.Graph().as_default() as g:
+        self.assertEqual(0, len(util.get_losses()))
+        raw_losses = array_ops.placeholder(dtype=dtypes.float32)
+        feed_dict = {raw_losses: self._raw_losses}
+        unweighted_losses = (
+            losses.compute_weighted_loss(raw_losses, reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((1, 1, 1)), reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((1, 1, 4)), reduction=reduction),
+        )
+        self.assertEqual(3, len(util.get_losses()))
+        with self.test_session(g):
+          for unweighted_loss in unweighted_losses:
+            if reduction == losses.Reduction.NONE:
+              self.assertAllClose(
+                  self._raw_losses, unweighted_loss.eval(feed_dict))
+            elif reduction == losses.Reduction.SUM:
+              self.assertAllClose(
+                  np.sum(self._raw_losses), unweighted_loss.eval(feed_dict))
+            else:
+              # reduction one of MEAN, SUM_OVER_NONZERO_WEIGHTS,
+              # SUM_BY_NONZERO_WEIGHTS or SUM_OVER_BATCH_SIZE.
+              self.assertAllClose(
+                  np.mean(self._raw_losses), unweighted_loss.eval(feed_dict))
+
   def testScalarWeight(self):
     with ops.Graph().as_default():
       self.assertEqual(0, len(util.get_losses()))
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 73563486e1..e75a9b22e4 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -151,7 +151,7 @@ def _num_present(losses, weights, per_batch=False):
 def _num_elements(losses):
   """Computes the number of elements in `losses` tensor."""
   with ops.name_scope(None, "num_elements", values=[losses]) as scope:
-    return array_ops.size(losses, name=scope, out_type=losses.dtype)
+    return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype)
 
 
 @tf_export("losses.compute_weighted_loss")
-- 
GitLab


From 44de67f366d37db2b5483734b0cbf9e312ca9d8e Mon Sep 17 00:00:00 2001
From: Noah Fiedel <nfiedel@google.com>
Date: Thu, 1 Feb 2018 13:48:13 -0800
Subject: [PATCH 541/966] Adding documentation on how to load & serve a model
 with the TensorFlow Serving Model Server.

PiperOrigin-RevId: 184188752
---
 .../docs_src/programmers_guide/saved_model.md | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index 9f50be5b31..f27a658342 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -285,7 +285,7 @@ with tf.Session(graph=tf.Graph()) as sess:
 ```
 
 
-### Loading a Savedmodel in C++
+### Loading a SavedModel in C++
 
 The C++ version of the SavedModel
 [loader](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/loader.h)
@@ -303,6 +303,30 @@ LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagTrain},
                &bundle);
 ```
 
+### Loading and Serving a SavedModel in TensorFlow Serving
+
+You can easily load and serve a SavedModel with the TensorFlow Serving Model
+Server binary. See [instructions](https://www.tensorflow.org/serving/setup#installing_using_apt-get)
+on how to install the server, or build it if you wish.
+
+Once you have the Model Server, run it with:
+```
+tensorflow_model_server --port=port-numbers --model_name=your-model-name --model_base_path=your_model_base_path
+```
+Set the port and model_name flags to values of your choosing. The
+model_base_path flag expects to be to a base directory, with each version of
+your model residing in a numerically named subdirectory. If you only have a
+single version of your model, simply place it in a subdirectory like so:
+* Place the model in /tmp/model/0001
+* Set model_base_path to /tmp/model
+
+Store different versions of your model in numerically named subdirectories of a
+common base directory. For example, suppose the base directory is `/tmp/model`.
+If you have only one version of your model, store it in `/tmp/model/0001`. If
+you have two versions of your model, store the second version in
+`/tmp/model/0002`, and so on.  Set the `--model-base_path` flag to the base
+directory (`/tmp/model`, in this example).  TensorFlow Model Server will serve
+the model in the highest numbered subdirectory of that base directory.
 
 ### Standard constants
 
-- 
GitLab


From 87aab43770cacbb73706ad6b11a28e9b19c1df0b Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Thu, 1 Feb 2018 13:48:33 -0800
Subject: [PATCH 542/966] [TFXLA] Use data flow to determine switch grouping.

* Change how switch grouping works:
  - This is an intermediate step, next is combining
    DetermineBranchMapAndFrontier into one traversal.
* Homogeneous the naming (switch_nodes -> switches);
* Change graph dumping to be due to class member - currently still performed when vlog-level is sufficiently high;
* Pass in correct library when dumping graphs;

PiperOrigin-RevId: 184188816
---
 .../tf2xla/functionalize_control_flow.cc      | 279 +++++++++++++-----
 .../tf2xla/functionalize_control_flow_test.cc |  10 +-
 tensorflow/compiler/tf2xla/graph_compiler.cc  |   2 +-
 3 files changed, 209 insertions(+), 82 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 1d9e0fb33e..7a4fa79078 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -285,7 +285,8 @@ Status BuildLoopBody(const Graph& graph, Frame* frame,
 Status FunctionalizeLoop(Graph* graph, Frame* frame,
                          FunctionLibraryDefinition* library) {
   VLOG(2) << "Frame " << frame->name << " before: "
-          << dump_graph::DumpGraphToFile("functionalize_before", *graph);
+          << dump_graph::DumpGraphToFile("functionalize_before", *graph,
+                                         library);
 
   // Split loop-varying Enter nodes with multiple successors. If the same
   // Tensor is fed as input to multiple loop arguments, we may end up with a
@@ -450,7 +451,7 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
   TF_RETURN_IF_ERROR(BuildLoopBody(*graph, frame, &arg_types, &body_graph));
 
   VLOG(2) << "Frame " << frame->name << " condition: "
-          << dump_graph::DumpGraphToFile("loop_condition", *cond_graph)
+          << dump_graph::DumpGraphToFile("loop_condition", *cond_graph, library)
           << " body: " << dump_graph::DumpGraphToFile("loop_body", *body_graph);
 
   static std::atomic<int64> sequence_num(0LL);
@@ -531,7 +532,8 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
   frame->parent->nodes.insert(while_node);
 
   VLOG(2) << "Frame " << frame->name << " after: "
-          << dump_graph::DumpGraphToFile("functionalize_after", *graph);
+          << dump_graph::DumpGraphToFile("functionalize_after", *graph,
+                                         library);
 
   return Status::OK();
 }
@@ -564,11 +566,11 @@ class FunctionalizeCond {
     explicit CondArgNode(Node* input) : input(input) {}
     string ToString() const {
       return strings::StrCat("input=", input->name(),
-                             " switches=", NodesToString(switch_nodes));
+                             " switches=", NodesToString(switches));
     }
 
     Node* input;
-    std::vector<Node*> switch_nodes;
+    std::vector<Node*> switches;
   };
   using CondArgNodes = std::vector<CondArgNode>;
 
@@ -582,15 +584,22 @@ class FunctionalizeCond {
     int count;
   };
 
-  struct PredicateSwitches {
-    explicit PredicateSwitches(Node* predicate) : predicate(predicate) {}
+  // Group of switch nodes that will be part of the same XlaIf.
+  struct SwitchCluster {
+    explicit SwitchCluster(Node* predicate) : predicate(predicate) {}
+    string ToString() const {
+      return strings::StrCat(name, " predicate=", predicate->name(),
+                             " switches=", NodesToString(switches));
+    }
 
+    string name;
     Node* predicate;
     std::vector<Node*> switches;
   };
 
-  FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library)
-      : library_(library), graph_(graph) {}
+  FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library,
+                    bool dump_graphs)
+      : library_(library), graph_(graph), dump_graphs_(dump_graphs) {}
 
   // Perform the actual cond functionalization. Iterate over groups of switch
   // nodes (linked by common predicate), from innermost to outermost, and
@@ -601,27 +610,25 @@ class FunctionalizeCond {
   // frontier (the nodes where the cond ends).
   StatusOr<std::pair<std::unordered_map<Node*, ForwardFlowNode>,
                      std::unordered_set<Node*>>>
-  DetermineBranchMapAndFrontier(const std::vector<Node*>& switches);
+  DetermineBranchMapAndFrontier(const SwitchCluster& switch_cluster);
 
   // Returns XlaIf node created from subgraph of merge and switch nodes. This
   // encapsulates the process of extracting the bodies needed for the then and
   // else branch, creates a XlaIf node, removing the nodes of the branches from
   // the graph and replacing the merge node with a XlaIf.
   StatusOr<Node*> ConvertToXlaIf(const CondArgNodes& cond_arg_nodes,
-                                 const std::vector<Node*>& switch_nodes,
-                                 const std::vector<Node*>& merge_nodes,
-                                 Node* predicate);
+                                 const SwitchCluster& switch_cluster,
+                                 const std::vector<Node*>& switches);
 
   // Builds a XlaIfOp to replace the Switch-Graph-Merge cluster with.
   StatusOr<Node*> BuildAndAddXlaIfOp(const CondArgNodes& cond_arg_nodes,
-                                     const std::vector<Node*>& switch_nodes,
-                                     const std::vector<Node*>& merge_nodes,
-                                     Node* predicate);
+                                     const SwitchCluster& switch_cluster,
+                                     const std::vector<Node*>& merge_nodes);
 
   // Extracts a function body corresponding to the given input edge of the merge
   // node.
   Status ExtractBody(const CondArgNodes& cond_arg_nodes,
-                     const std::vector<Node*>& switch_nodes,
+                     const std::vector<Node*>& switches,
                      const std::vector<Node*>& merge_nodes, int input_edge,
                      Graph* body);
 
@@ -632,9 +639,9 @@ class FunctionalizeCond {
   // Adds all output edges from the `if_node`.
   Status AddOutputEdges(const std::vector<Node*>& outputs, Node* if_node);
 
-  // Returns the switches of graph_ (along with grouping predicates) in
-  // postorder. Dead switch nodes are skipped and removed from the graph.
-  std::vector<PredicateSwitches> DeterminePredicateSwitchOrder();
+  // Returns the switch clusters of graph_ in postorder. Dead switch nodes are
+  // skipped and removed from the graph.
+  std::vector<SwitchCluster> DeterminePredicateSwitchOrder();
 
   // Update the state for destination based on the state of source and the node
   // being updated.
@@ -657,6 +664,7 @@ class FunctionalizeCond {
 
   FunctionLibraryDefinition* library_;
   Graph* graph_;
+  bool dump_graphs_;
 };
 
 bool IsDeadSwitch(const Node* node) {
@@ -704,10 +712,13 @@ Status FunctionalizeCond::ValidateFrontier(
           ") in both Else and Then branch should be in Both.");
     }
   }
-  if (pending[kBoth].empty() && pending[kThenBranch].empty() &&
-      pending[kElseBranch].empty()) {
-    return errors::Internal("Unexpected empty frontier for switch nodes");
-  }
+  // An empty frontier indicates a dead switch. Above we attempt to remove dead
+  // switch nodes, but not all are removed so don't treat it as an error yet.
+  // TODO(jpienaar): Find out why dead switch nodes remain.
+  // if (pending[kBoth].empty() && pending[kThenBranch].empty() &&
+  //     pending[kElseBranch].empty()) {
+  //   return errors::Internal("Unexpected empty frontier for switch nodes");
+  // }
   return Status::OK();
 }
 
@@ -734,33 +745,138 @@ Status FunctionalizeCond::Join(const ForwardFlowNode& src_state,
   return Status::OK();
 }
 
-std::vector<FunctionalizeCond::PredicateSwitches>
+std::vector<FunctionalizeCond::SwitchCluster>
 FunctionalizeCond::DeterminePredicateSwitchOrder() {
+  struct Cluster {
+    bool operator==(const Cluster& other) const {
+      return representative == other.representative;
+    }
+    int representative = -1;
+  };
+
+  // Perform a DFS over the graph and
+  // * Determine the reverse topological order of the nodes (there should be no
+  //   cycles at this point so the post-order numbering corresponds to the
+  //   reverse topological sorting);
+  // * Identify dead switches;
+  // * Initialize the cluster's representative;
+  std::vector<UnionFind<Cluster>> clusters(graph_->num_node_ids());
   std::vector<Node*> dead_switches;
   std::vector<Node*> switch_order;
-  DFS(*graph_, nullptr, [this, &dead_switches, &switch_order](Node* n) {
+  std::vector<Node*> rev_topo_sorted_nodes;
+  DFS(*graph_, nullptr, [&](Node* n) {
+    clusters[n->id()].Get().representative = n->id();
     if (IsSwitch(n)) {
       if (IsDeadSwitch(n)) {
         dead_switches.push_back(n);
       } else {
+        rev_topo_sorted_nodes.push_back(n);
         switch_order.push_back(n);
       }
+    } else if (n->IsOp()) {
+      // Exclude src and sink nodes from further consideration.
+      rev_topo_sorted_nodes.push_back(n);
     }
   });
 
+  std::vector<SwitchCluster> switch_clusters;
+  // Return early if there are no switches in the graph.
+  if (switch_order.empty()) {
+    return switch_clusters;
+  }
+
   // Remove all dead switch nodes.
   for (Node* n : dead_switches) {
     VLOG(2) << "Removing dead switch: " << n->DebugString();
     graph_->RemoveNode(n);
   }
 
-  std::vector<PredicateSwitches> predicate_switch_order;
-  if (switch_order.empty()) {
-    return predicate_switch_order;
+  // Identify switch nodes that are part of the same control flow context by
+  // considering the operands of operations: an operation is part of the same
+  // control context as its operands unless the operation is a switch. Control
+  // dependencies are considered part of the same control flow context if the
+  // switch depth is the same (see comment below).
+  // TODO(jpienaar): This could be combined with DetermineBranchMapAndFrontier.
+  std::vector<int> switch_depth(graph_->num_node_ids());
+  // entry_cluster records the input cluster to a switch node. This is used when
+  // merging with a merge node where the dst's cluster is merged with the entry
+  // cluster of the merge node's cluster (which corresponds to a switch cluster
+  // and so has an entry cluster).
+  std::unordered_map<int, UnionFind<Cluster>*> entry_cluster;
+  for (auto it = rev_topo_sorted_nodes.rbegin();
+       it != rev_topo_sorted_nodes.rend(); ++it) {
+    Node* n = *it;
+
+    // Compute switch depth.
+    int new_switch_depth = 0;
+    for (const Edge* e : n->in_edges()) {
+      Node* src = e->src();
+      new_switch_depth = std::max(
+          new_switch_depth, switch_depth[src->id()] + (IsSwitch(src) ? 1 : 0) -
+                                (IsMerge(src) ? 1 : 0));
+    }
+    switch_depth[n->id()] = new_switch_depth;
+
+    // Only merge the input operands of a switch. The switch's clustering itself
+    // is determined by the interaction of the switch's outputs.
+    if (IsSwitch(n)) {
+      Node* input;
+      TF_CHECK_OK(n->input_node(0, &input));
+      UnionFind<Cluster>& cluster = clusters[input->id()];
+      entry_cluster[n->id()] = &cluster;
+      // Merge the inputs of the switch node with one another. This results in
+      // predicates and control input residing in the same cluster.
+      for (const Edge* e : n->in_edges()) {
+        Node* src = e->src();
+        cluster.Merge(&clusters[src->id()]);
+      }
+      continue;
+    }
+
+    for (const Edge* e : n->in_edges()) {
+      Node* src = e->src();
+      if (!src->IsOp()) continue;
+      UnionFind<Cluster>* cluster = &clusters[src->id()];
+      if (IsMerge(src)) {
+        cluster = entry_cluster.at(clusters[src->id()].Get().representative);
+      }
+      // Merge a node with its data operands and with its control operands if
+      // the src and dst are in the same ControlContext. The ControlContext is
+      // not explicitly available here, and instead the switch depth is used as
+      // a proxy here. Due to the invariant that control edges can only be from
+      // a containing scope to an inner scope or from the inner scope to its
+      // containing scope (for exit nodes), the switch depth will only match if
+      // the src and dst are in the same ControlContext. Control edges between
+      // ControlContexts are handled during the extraction.
+      if (!e->IsControlEdge() ||
+          new_switch_depth ==
+              switch_depth[src->id()] + (IsSwitch(src) ? 1 : 0)) {
+        cluster->Merge(&clusters[n->id()]);
+      }
+    }
   }
 
+  if (dump_graphs_) {
+    // Mark the switch cluster each node is part of.
+    for (Node* n : graph_->nodes()) {
+      n->ClearAttr("_XlaFunctionalizeSwitchGroup");
+      n->AddAttr("_XlaFunctionalizeSwitchGroup",
+                 clusters[n->id()].Get().representative);
+    }
+    LOG(INFO) << "FunctionalizeControlFlow (with_clusters): "
+              << dump_graph::DumpGraphToFile("functionalize_clustered", *graph_,
+                                             library_);
+  }
+
+  struct Hash {
+    size_t operator()(const std::pair<Node*, Cluster>& item) const {
+      return Hash64Combine(hash<Node*>()(item.first),
+                           std::hash<int>()(item.second.representative));
+    }
+  };
+
   // Merge Switch nodes with common predicate.
-  std::unordered_map<Node*, int> predicate_index;
+  std::unordered_map<std::pair<Node*, Cluster>, int, Hash> predicate_index;
   // The nodes in switch_order are in reverse topological order, but the
   // clustered switches need not be (i.e., when considered as a cluster one
   // element of a cluster may be later in the topological order than another
@@ -769,13 +885,19 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
   for (auto it = switch_order.rbegin(); it != switch_order.rend(); ++it) {
     Node* pred;
     TF_CHECK_OK((*it)->input_node(1, &pred));
-    if (predicate_index.find(pred) == predicate_index.end()) {
-      predicate_index[pred] = predicate_switch_order.size();
-      predicate_switch_order.emplace_back(pred);
+    auto repr = std::make_pair(pred, clusters[(*it)->id()].Get());
+    if (predicate_index.find(repr) == predicate_index.end()) {
+      predicate_index[repr] = switch_clusters.size();
+      switch_clusters.emplace_back(pred);
+      // Generate a name by concating with the cluster representative as there
+      // could be multiple switch clusters with the same predicate.
+      switch_clusters[predicate_index[repr]].name =
+          strings::StrCat(pred->name(), "_", repr.second.representative, "_If");
     }
-    predicate_switch_order[predicate_index[pred]].switches.push_back(*it);
+    switch_clusters[predicate_index[repr]].switches.push_back(*it);
   }
-  return predicate_switch_order;
+
+  return switch_clusters;
 }
 
 StatusOr<std::vector<Node*>>
@@ -823,10 +945,10 @@ StatusOr<
     std::pair<std::unordered_map<Node*, FunctionalizeCond::ForwardFlowNode>,
               std::unordered_set<Node*>>>
 FunctionalizeCond::DetermineBranchMapAndFrontier(
-    const std::vector<Node*>& switches) {
+    const SwitchCluster& switch_cluster) {
   std::unordered_map<Node*, ForwardFlowNode> branch_map;
   std::unordered_set<Node*> frontier;
-  std::vector<Node*> stack = switches;
+  std::vector<Node*> stack = switch_cluster.switches;
   std::vector<bool> visited(graph_->num_node_ids(), false);
   while (!stack.empty()) {
     Node* n = stack.back();
@@ -868,7 +990,7 @@ FunctionalizeCond::DetermineBranchMapAndFrontier(
     }
   }
 
-  if (VLOG_IS_ON(2)) {
+  if (dump_graphs_) {
     for (const auto& kv : branch_map) {
       // Append attribute to the graph if running with logging to make the
       // changes clearer in the visualization.
@@ -880,7 +1002,7 @@ FunctionalizeCond::DetermineBranchMapAndFrontier(
 }
 
 Status FunctionalizeCond::FunctionalizeInternal() {
-  std::vector<PredicateSwitches> predicate_switch_order =
+  std::vector<SwitchCluster> predicate_switch_order =
       DeterminePredicateSwitchOrder();
 
   // Iterate from innermost set of clustered switches to outermost, replacing
@@ -894,10 +1016,12 @@ Status FunctionalizeCond::FunctionalizeInternal() {
     std::unordered_map<Node*, ForwardFlowNode> branch_map;
     std::unordered_set<Node*> frontier;
     TF_ASSIGN_OR_RETURN(std::tie(branch_map, frontier),
-                        DetermineBranchMapAndFrontier(ps.switches));
+                        DetermineBranchMapAndFrontier(ps));
 
-    VLOG(2) << "FunctionalizeControlFlow (before XlaIf conversion): "
-            << dump_graph::DumpGraphToFile("functionalize_bc", *graph_);
+    if (dump_graphs_)
+      LOG(INFO) << "FunctionalizeControlFlow (before XlaIf conversion): "
+                << dump_graph::DumpGraphToFile("functionalize_bc", *graph_,
+                                               library_);
     TF_RETURN_IF_ERROR(ValidateFrontier(branch_map, frontier));
 
     // Sort the merge and switch nodes using NodeCmp. The switch-nodes are
@@ -914,7 +1038,7 @@ Status FunctionalizeCond::FunctionalizeInternal() {
         input_index[in] = cond_arg_nodes.size();
         cond_arg_nodes.emplace_back(in);
       }
-      cond_arg_nodes.at(input_index.at(in)).switch_nodes.push_back(switch_node);
+      cond_arg_nodes.at(input_index.at(in)).switches.push_back(switch_node);
     }
     std::vector<Node*> merge_nodes(frontier.begin(), frontier.end());
     std::sort(merge_nodes.begin(), merge_nodes.end(), NodeCmp());
@@ -923,9 +1047,8 @@ Status FunctionalizeCond::FunctionalizeInternal() {
                         EnsureDominanceAndReturnNonDominatedControlNodes(
                             branch_map, ps.switches));
 
-    TF_ASSIGN_OR_RETURN(
-        Node * if_node,
-        ConvertToXlaIf(cond_arg_nodes, ps.switches, merge_nodes, ps.predicate));
+    TF_ASSIGN_OR_RETURN(Node * if_node,
+                        ConvertToXlaIf(cond_arg_nodes, ps, merge_nodes));
     for (Node* old : old_control_nodes) {
       graph_->AddControlEdge(old, if_node);
     }
@@ -934,25 +1057,26 @@ Status FunctionalizeCond::FunctionalizeInternal() {
       graph_->RemoveNode(del_kv.first);
     }
     for (auto& kv : cond_arg_nodes) {
-      for (Node* node : kv.switch_nodes) {
+      for (Node* node : kv.switches) {
         graph_->RemoveNode(node);
       }
     }
-    VLOG(2) << "FunctionalizeControlFlow (after XlaIf conversion): "
-            << dump_graph::DumpGraphToFile("functionalize_ac", *graph_);
+    if (dump_graphs_)
+      LOG(INFO) << "FunctionalizeControlFlow (after XlaIf conversion): "
+                << dump_graph::DumpGraphToFile("functionalize_ac", *graph_,
+                                               library_);
   }
   return Status::OK();
 }
 
 StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
-    const CondArgNodes& cond_arg_nodes, const std::vector<Node*>& switch_nodes,
-    const std::vector<Node*>& merge_nodes, Node* predicate) {
-  VLOG(2) << "Build if op for " << NodesToString(merge_nodes) << " with input "
-          << NodesToString(switch_nodes);
+    const CondArgNodes& cond_arg_nodes, const SwitchCluster& switch_cluster,
+    const std::vector<Node*>& merge_nodes) {
+  VLOG(2) << "Build if op for " << switch_cluster.name;
 
   NodeDef if_def;
   // Create a new If node using the name of the merge node.
-  NodeDefBuilder builder(strings::StrCat(predicate->name(), "_If"), "XlaIf");
+  NodeDefBuilder builder(switch_cluster.name, "XlaIf");
   string branch[] = {"else_branch", "then_branch"};
   for (int i = 0; i < 2; ++i) {
     static std::atomic<int64> sequence_num(0LL);
@@ -962,12 +1086,9 @@ StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
     body_name.set_name(
         strings::StrCat("_functionalize_if_", branch[i], "_", id));
     auto body = xla::MakeUnique<Graph>(graph_->op_registry());
-    TF_RETURN_IF_ERROR(
-        ExtractBody(cond_arg_nodes, switch_nodes, merge_nodes, i, body.get()));
+    TF_RETURN_IF_ERROR(ExtractBody(cond_arg_nodes, switch_cluster.switches,
+                                   merge_nodes, i, body.get()));
     VLOG(3) << "Body " << branch[i] << ": " << DebugString(body.get());
-    VLOG(4) << "FunctionalizeControlFlow (" << branch[i] << "): "
-            << dump_graph::DumpGraphToFile(
-                   strings::StrCat("functionalize_", branch[i]), *body);
     FunctionDef body_fdef;
     TF_RETURN_IF_ERROR(GraphToFunctionDef(*body, body_name.name(), &body_fdef));
     TF_RETURN_IF_ERROR(library_->AddFunctionDef(body_fdef));
@@ -979,7 +1100,7 @@ StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
   DataTypeVector in_arg_types;
   for (auto& kv : cond_arg_nodes) {
     bool inserted = false;
-    for (const Node* arg : kv.switch_nodes) {
+    for (const Node* arg : kv.switches) {
       const Edge* in_edge;
       TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
       if (in_edge->IsControlEdge()) {
@@ -1006,10 +1127,11 @@ StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
   builder.Attr("Tout", out_type);
 
   builder.Attr("Tcond", DT_BOOL);
-  builder.Device(predicate->assigned_device_name());
+  builder.Device(switch_cluster.predicate->assigned_device_name());
   // Conditional should be the first input ...
   builder.Input(
-      NodeDefBuilder::NodeOut(predicate->name(), 0, predicate->output_type(0)));
+      NodeDefBuilder::NodeOut(switch_cluster.predicate->name(), 0,
+                              switch_cluster.predicate->output_type(0)));
   // ... followed by the other inputs.
   builder.Input(inputs);
 
@@ -1019,7 +1141,7 @@ StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
 }
 
 Status FunctionalizeCond::ExtractBody(const CondArgNodes& cond_arg_nodes,
-                                      const std::vector<Node*>& switch_nodes,
+                                      const std::vector<Node*>& switches,
                                       const std::vector<Node*>& merge_nodes,
                                       int input_edge, Graph* body) {
   VLOG(2) << "ExtractBody for " << NodesToString(merge_nodes) << " along edge "
@@ -1029,7 +1151,7 @@ Status FunctionalizeCond::ExtractBody(const CondArgNodes& cond_arg_nodes,
   int arg_count = 0;
   for (auto& kv : cond_arg_nodes) {
     Node* arg_node = nullptr;
-    for (const auto* arg : kv.switch_nodes) {
+    for (const auto* arg : kv.switches) {
       DataType dtype = arg->input_type(0);
       if (arg_node == nullptr) {
         TF_ASSIGN_OR_RETURN(arg_node, BuildArgNode(body, dtype, arg_count++));
@@ -1053,8 +1175,7 @@ Status FunctionalizeCond::ExtractBody(const CondArgNodes& cond_arg_nodes,
       node_map.at(in->id()) = body->CopyNode(in);
     }
 
-    if (std::find(switch_nodes.begin(), switch_nodes.end(), in) ==
-        switch_nodes.end()) {
+    if (std::find(switches.begin(), switches.end(), in) == switches.end()) {
       body->AddEdge(node_map.at(in->id()), in_edge->src_output(),
                     node_map.at(node->id()), 0);
     } else {
@@ -1076,7 +1197,7 @@ Status FunctionalizeCond::AddInputEdges(const CondArgNodes& cond_arg_nodes,
   graph_->AddEdge(predicate, 0, if_node, index++);
   for (auto& kv : cond_arg_nodes) {
     bool inserted = false;
-    for (const Node* arg : kv.switch_nodes) {
+    for (const Node* arg : kv.switches) {
       const Edge* in_edge;
       TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
       if (in_edge->IsControlEdge()) {
@@ -1119,16 +1240,17 @@ Status FunctionalizeCond::AddOutputEdges(const std::vector<Node*>& outputs,
 }
 
 StatusOr<Node*> FunctionalizeCond::ConvertToXlaIf(
-    const CondArgNodes& cond_arg_nodes, const std::vector<Node*>& switch_nodes,
-    const std::vector<Node*>& merge_nodes, Node* predicate) {
-  VLOG(1) << "ConvertToXlaIf for " << NodesToString(switch_nodes) << " -> "
+    const CondArgNodes& cond_arg_nodes, const SwitchCluster& switch_cluster,
+    const std::vector<Node*>& merge_nodes) {
+  VLOG(1) << "ConvertToXlaIf for " << switch_cluster.ToString() << " -> "
           << NodesToString(merge_nodes);
 
   // Extract bodies and builds a If operator.
   TF_ASSIGN_OR_RETURN(
       Node * if_node,
-      BuildAndAddXlaIfOp(cond_arg_nodes, switch_nodes, merge_nodes, predicate));
-  TF_RETURN_IF_ERROR(AddInputEdges(cond_arg_nodes, predicate, if_node));
+      BuildAndAddXlaIfOp(cond_arg_nodes, switch_cluster, merge_nodes));
+  TF_RETURN_IF_ERROR(
+      AddInputEdges(cond_arg_nodes, switch_cluster.predicate, if_node));
   TF_RETURN_IF_ERROR(AddOutputEdges(merge_nodes, if_node));
 
   return if_node;
@@ -1137,18 +1259,19 @@ StatusOr<Node*> FunctionalizeCond::ConvertToXlaIf(
 Status FunctionalizeCond::Functionalize(Graph* graph,
                                         FunctionLibraryDefinition* library) {
   VLOG(1) << "FunctionalizeCond::Functionalize";
-  FunctionalizeCond fc(graph, library);
+  FunctionalizeCond fc(graph, library, /*dump_graphs=*/VLOG_IS_ON(2));
   return fc.FunctionalizeInternal();
 }
 
 }  // namespace
 
-// Transformation that converts Tensorflow's graph control flow constructs into
+// Transformation that converts TensorFlow's graph control flow constructs into
 // functional equivalents.
 Status FunctionalizeControlFlow(Graph* graph,
                                 FunctionLibraryDefinition* library) {
   VLOG(2) << "FunctionalizeControlFlow (initial): "
-          << dump_graph::DumpGraphToFile("functionalize_initial", *graph);
+          << dump_graph::DumpGraphToFile("functionalize_initial", *graph,
+                                         library);
   // Note: BuildControlFlowInfo() requires that the graph's source node is
   // connected to all source nodes in the graph. Many graphs violate this
   // invariant.
@@ -1160,7 +1283,8 @@ Status FunctionalizeControlFlow(Graph* graph,
   for (Node* node : graph->op_nodes()) {
     const ControlFlowInfo& cf = cf_info[node->id()];
 
-    VLOG(2) << "node: " << node->name() << " frame_name: " << cf.frame_name
+    VLOG(2) << "node: " << node->name() << " (" << node->id()
+            << ") frame_name: " << cf.frame_name
             << " frame: " << (cf.frame ? cf.frame->name() : "---")
             << " parent_frame: "
             << (cf.parent_frame ? cf.parent_frame->name() : "---");
@@ -1228,7 +1352,8 @@ Status FunctionalizeControlFlow(Graph* graph,
   TF_RETURN_IF_ERROR(FunctionalizeCond::Functionalize(graph, library));
 
   VLOG(2) << "FunctionalizeControlFlow (final): "
-          << dump_graph::DumpGraphToFile("functionalize_final", *graph);
+          << dump_graph::DumpGraphToFile("functionalize_final", *graph,
+                                         library);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index 71f12a1333..bc7276c3af 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -38,10 +38,11 @@ namespace {
 
 // Returns the names of the "then" and "else" functions for the XlaIf node in a
 // graph.
-Status FindIfThenAndElse(const GraphDef& graph, NameAttrList* then_fn,
-                         NameAttrList* else_fn) {
+Status FindIfThenAndElse(const GraphDef& graph, string* op_name,
+                         NameAttrList* then_fn, NameAttrList* else_fn) {
   for (const NodeDef& node : graph.node()) {
     if (node.op() == "XlaIf") {
+      *op_name = node.name();
       const NameAttrList* result;
       TF_RETURN_IF_ERROR(GetNodeAttr(node, "then_branch", &result));
       *then_fn = *result;
@@ -96,9 +97,10 @@ TEST(FunctionalizeControlFlow, Conditional) {
 
   GraphDef graph_def;
   graph.ToGraphDef(&graph_def);
+  string op_name;
   NameAttrList then_fn;
   NameAttrList else_fn;
-  TF_EXPECT_OK(FindIfThenAndElse(graph_def, &then_fn, &else_fn));
+  TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn));
   InstantiationResultForTest else_result;
   TF_EXPECT_OK(
       InstantiateFunctionForTest(else_fn.name(), library, &else_result));
@@ -109,7 +111,7 @@ TEST(FunctionalizeControlFlow, Conditional) {
     auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32);
     auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
     auto less = ops::Less(scope.WithOpName("cond/Less"), y, x);
-    auto if_op = ops::XlaIf(scope.WithOpName("cond/Less_If"), less,
+    auto if_op = ops::XlaIf(scope.WithOpName(op_name), less,
                             std::initializer_list<Input>{less, y, x}, then_fn,
                             else_fn, {DT_INT32});
     GraphDef expected;
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index 02215b5112..c90ea09e17 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -136,7 +136,7 @@ Status GraphCompiler::Compile() {
       TF_RET_CHECK(src->id() < output_registry.size());
       const NodeOutputs& src_outputs = output_registry[src->id()];
 
-      tensor_inputs_[e->dst_input()] = src_outputs[e->src_output()];
+      tensor_inputs_.at(e->dst_input()) = src_outputs.at(e->src_output());
     }
 
     OpKernelContext op_context(&params, n->num_outputs());
-- 
GitLab


From a964248ae9aaee99165594e80427152576e803fe Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 1 Feb 2018 14:11:08 -0800
Subject: [PATCH 543/966] Return an error instead of assertion when processing
 an ill-formed graph or an invalid set of fetch nodes

PiperOrigin-RevId: 184192790
---
 tensorflow/core/grappler/costs/virtual_scheduler.cc | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index d7d07ee7a5..020492a3e9 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -323,8 +323,13 @@ Status VirtualScheduler::Init() {
   }
 
   // Get the nodes that would run to output fetch_nodes.
+  bool ill_formed = false;
   std::vector<const NodeDef*> nodes =
-      ComputeTransitiveFanin(graph, fetch_nodes);
+      ComputeTransitiveFanin(graph, fetch_nodes, &ill_formed);
+  if (ill_formed) {
+    return errors::InvalidArgument(
+        "Ill formed graph or invalid set of fetch nodes specified");
+  }
 
   // TODO(dyoon): this is a bit inefficient as name_to_node is already built in
   // ComputeTransitiveFanin().
-- 
GitLab


From f1f1d6d482e332f11452d9103a29149e2adc7125 Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Thu, 1 Feb 2018 14:11:08 -0800
Subject: [PATCH 544/966] Throw an exception when the user's batch size isn't
 divisible by GPUs.

The alternative to this is to have an adaptive approach that would unevenly split input into per-tower batches.  The concern with that was that all towers will be as slow as the one with more input reducing the performance.  Batch size seems to be commonly tailored to the available hardware.

PiperOrigin-RevId: 184192793
---
 .../python/estimator/replicate_model_fn.py    | 10 +++
 .../estimator/replicate_model_fn_test.py      | 79 ++++++++++++++++++-
 2 files changed, 85 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index caa9dd8323..c9153c9352 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -457,6 +457,13 @@ def _get_local_devices(device_type):
 def _split_batch(features, labels, number_of_shards, device):
   """Split input features and labes into batches."""
 
+  def ensure_divisible_by_shards(sequence):
+    batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0]
+    if batch_size % number_of_shards != 0:
+      raise ValueError(
+          'Batch size {} needs to be divisible by the number of GPUs, which '
+          'is {}.'.format(batch_size, number_of_shards))
+
   def split_dictionary(dictionary):
     """Split a dictionary into shards."""
     shards = [{} for _ in range(number_of_shards)]
@@ -467,6 +474,7 @@ def _split_batch(features, labels, number_of_shards, device):
                 sp_input=tensor, num_split=number_of_shards, axis=0)):
           shards[i][name] = shard
       else:
+        ensure_divisible_by_shards(tensor)
         for i, shard in enumerate(array_ops.split(tensor, number_of_shards)):
           shards[i][name] = shard
     return shards
@@ -476,6 +484,7 @@ def _split_batch(features, labels, number_of_shards, device):
       if isinstance(features, dict):
         feature_shards = split_dictionary(features)
       else:
+        ensure_divisible_by_shards(features)
         feature_shards = array_ops.split(features, number_of_shards)
 
       if labels is None:
@@ -483,6 +492,7 @@ def _split_batch(features, labels, number_of_shards, device):
       elif isinstance(labels, dict):
         label_shards = split_dictionary(labels)
       else:
+        ensure_divisible_by_shards(labels)
         label_shards = array_ops.split(labels, number_of_shards)
   return feature_shards, label_shards
 
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
index 03d31226af..6936f8a131 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
@@ -37,6 +37,7 @@ from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -433,6 +434,17 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
           'probabilities': np.array([[0.1], [0.02]])
       }, session.run(estimator_spec.predictions))
 
+  def test_batch_size_that_is_not_divisible_by_the_number_of_gpus(self):
+    features = np.array([[1.0], [2.0], [3.0]])
+    labels = np.array([[1.0], [2.0], [3.0]])
+
+    with self.assertRaisesRegexp(
+        ValueError, '.*Batch.+size.+needs.+to.+be.+divisible.+by.+GPUs.+'):
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, devices=['/gpu:0', '/gpu:1'])
+      _ = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+
   def test_unsupported_loss_reduction(self):
     with self.assertRaisesRegexp(ValueError,
                                  '.+none.+reduction.+is.+specified.+'):
@@ -981,8 +993,13 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
     return list(map(evaluate_items, first_list)), list(
         map(evaluate_items, second_list))
 
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
   def test_simple_half_split(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -995,7 +1012,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0, 11.0], [12.0, 13.0]], label_shards)
 
   def test_to_each_their_own(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -1008,7 +1025,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0], [11.0], [12.0], [13.0]], label_shards)
 
   def test_one_batch(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -1021,7 +1038,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0, 11.0, 12.0, 13.0]], label_shards)
 
   def test_half_split_in_dictionary(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
       labels = [10.0, 11.0, 12.0, 13.0]
 
@@ -1035,6 +1052,60 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([10.0, 11.0], label_shards[0].eval())
       self.assertAllEqual([12.0, 13.0], label_shards[1].eval())
 
+  def test_sparse_tensor_can_be_split_unevenly(self):
+    with self.test_session():
+      features = {
+          'x':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [1, 2], [2, 2]],
+                  values=[1.0, 2.0, 3.0],
+                  dense_shape=[3, 4])
+      }
+      labels = np.array([[1.0], [2.0]])
+
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 2, device='/gpu:0')
+
+      self.assertSparseValuesEqual(
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [1, 2]], values=[1., 2.], dense_shape=[2, 4]),
+          feature_shards[0]['x'].eval())
+      self.assertSparseValuesEqual(
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 2]], values=[3.], dense_shape=[1, 4]),
+          feature_shards[1]['x'].eval())
+      self.assertAllEqual([[1.0]], label_shards[0].eval())
+      self.assertAllEqual([[2.0]], label_shards[1].eval())
+
+  def test_sparse_tensor_can_be_split_unevenly_repeated_row(self):
+    with self.test_session():
+      features = {
+          'x':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [1, 0], [1, 1]],
+                  values=[1.0, 2.0, 3.0],
+                  dense_shape=[3, 4])
+      }
+      labels = np.array([[1.0], [2.0]])
+
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 2, device='/gpu:0')
+
+      print(feature_shards[0]['x'].eval())
+      print(feature_shards[1]['x'].eval())
+      self.assertSparseValuesEqual(
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [1, 0], [1, 1]],
+              values=[1., 2., 3.],
+              dense_shape=[2, 4]), feature_shards[0]['x'].eval())
+
+      second_batch = feature_shards[1]['x'].eval()
+      self.assertFalse(len(second_batch.indices))
+      self.assertFalse(len(second_batch.values))
+      self.assertAllEqual([1, 4], second_batch.dense_shape)
+      self.assertAllEqual([[1.0]], label_shards[0].eval())
+      self.assertAllEqual([[2.0]], label_shards[1].eval())
+
   def test_one_batch_in_dictionary(self):
     with self.test_session() as session:  # pylint: disable=unused-variable
       features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
-- 
GitLab


From 31edddcc025b95fd6fec419d4372d3f3a4f89af8 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 1 Feb 2018 14:23:29 -0800
Subject: [PATCH 545/966]  Internal change.

PiperOrigin-RevId: 184194895
---
 tensorflow/contrib/bayesflow/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index 11c3c037c4..6e0f0a0572 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -217,6 +217,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_seed",
     ],
+    tags = ["notsan"],
 )
 
 cuda_py_test(
-- 
GitLab


From 1f7352e0e5354f35e6f181071f791612257dc026 Mon Sep 17 00:00:00 2001
From: Nick Felt <nickfelt@google.com>
Date: Thu, 1 Feb 2018 15:05:41 -0800
Subject: [PATCH 546/966] Revert TensorBoard entry point back to run_main

PiperOrigin-RevId: 184201506
---
 tensorflow/tools/pip_package/setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 1b1d60c4f3..3cd4d12100 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -80,13 +80,13 @@ CONSOLE_SCRIPTS = [
     # is now declared by the tensorboard pip package. If we remove the
     # TensorBoard command, pip will inappropriately remove it during install,
     # even though the command is not removed, just moved to a different wheel.
-    'tensorboard = tensorboard.main:main',
+    'tensorboard = tensorboard.main:run_main',
 ]
 # pylint: enable=line-too-long
 
 # remove the tensorboard console script if building tf_nightly
 if 'tf_nightly' in project_name:
-  CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:main')
+  CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:run_main')
 
 TEST_PACKAGES = [
     'scipy >= 0.15.1',
-- 
GitLab


From 16ce8ed3f9c2eafb7e1f96ea620698da382e621c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 15:11:20 -0800
Subject: [PATCH 547/966] [XLA] add DotGeneral to the local Python XLA client.

PiperOrigin-RevId: 184202425
---
 .../xla/python/local_computation_builder.cc   |   6 +
 .../xla/python/local_computation_builder.h    |   4 +
 .../xla/python/local_computation_builder.i    | 131 ++++++++++++++++++
 tensorflow/compiler/xla/python/xla_client.py  |  39 +++++-
 .../compiler/xla/python/xla_client_test.py    |  24 ++++
 5 files changed, 203 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 9e3cf79383..8386acf0cd 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -404,6 +404,12 @@ ComputationDataHandle LocalComputationBuilder::Dot(
   return builder_.Dot(lhs, rhs);
 }
 
+ComputationDataHandle LocalComputationBuilder::DotGeneral(
+    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  return builder_.DotGeneral(lhs, rhs, dimension_numbers);
+}
+
 ComputationDataHandle LocalComputationBuilder::ConvGeneralDilated(
     const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index ae5bbc03cb..f39d15cff7 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -192,6 +192,10 @@ class LocalComputationBuilder {
   ComputationDataHandle Dot(const ComputationDataHandle& lhs,
                             const ComputationDataHandle& rhs);
 
+  ComputationDataHandle DotGeneral(
+      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+      const DotDimensionNumbers& dimension_numbers);
+
   ComputationDataHandle ConvGeneralDilated(
       const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
       tensorflow::gtl::ArraySlice<int64> window_strides,
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index fdd05692fc..5ea75550c9 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -34,6 +34,7 @@ limitations under the License.
 //  ArraySlice<pair<int64, in64>>      <-  sequence of int pairs
 //  PaddingConfig proto                <-  corresponding Python proto
 //  ConvolutionDimensionNumbers proto  <-  corresponding Python proto
+//  DotDimensionNumbers proto          <-  corresponding Python proto
 //
 // Arrows indicate whether a conversion only ever occurs in one
 // direction, or whether it is maintained bidirectionally.
@@ -511,6 +512,135 @@ tensorflow::ImportNumpy();
   $1 = temps;
 }
 
+// DotDimensionNumbers
+
+%typemap(in) const DotDimensionNumbers&
+    (DotDimensionNumbers dimension_numbers) {
+  int length;
+
+  /* lhs_contracting_dimensions */
+  PyObject* lhs_contracting_dimensions = PyObject_GetAttrString(
+      $input, "lhs_contracting_dimensions");
+  if (!lhs_contracting_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(lhs_contracting_dimensions);
+  if (length == -1) {
+    Py_DECREF(lhs_contracting_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i);
+    if (!item) {
+      Py_DECREF(lhs_contracting_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(lhs_contracting_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_lhs_contracting_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(lhs_contracting_dimensions);
+
+  /* rhs_contracting_dimensions */
+  PyObject* rhs_contracting_dimensions = PyObject_GetAttrString(
+      $input, "rhs_contracting_dimensions");
+  if (!lhs_contracting_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(rhs_contracting_dimensions);
+  if (length == -1) {
+    Py_DECREF(rhs_contracting_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i);
+    if (!item) {
+      Py_DECREF(rhs_contracting_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(rhs_contracting_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_rhs_contracting_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(rhs_contracting_dimensions);
+
+  /* lhs_batch_dimensions */
+  PyObject* lhs_batch_dimensions = PyObject_GetAttrString(
+      $input, "lhs_batch_dimensions");
+  if (!lhs_batch_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(lhs_batch_dimensions);
+  if (length == -1) {
+    Py_DECREF(lhs_batch_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i);
+    if (!item) {
+      Py_DECREF(lhs_batch_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(lhs_batch_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_lhs_batch_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(lhs_batch_dimensions);
+
+  /* rhs_batch_dimensions */
+  PyObject* rhs_batch_dimensions = PyObject_GetAttrString(
+      $input, "rhs_batch_dimensions");
+  if (!rhs_batch_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(rhs_batch_dimensions);
+  if (length == -1) {
+    Py_DECREF(rhs_batch_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i);
+    if (!item) {
+      Py_DECREF(rhs_batch_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(rhs_batch_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_rhs_batch_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(rhs_batch_dimensions);
+
+  $1 = &dimension_numbers;
+}
+
 // PaddingConfig
 
 %typemap(in) const PaddingConfig&
@@ -756,6 +886,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Lt;
 %unignore xla::swig::LocalComputationBuilder::Le;
 %unignore xla::swig::LocalComputationBuilder::Dot;
+%unignore xla::swig::LocalComputationBuilder::DotGeneral;
 %unignore xla::swig::LocalComputationBuilder::ConvGeneralDilated;
 %unignore xla::swig::LocalComputationBuilder::Add;
 %unignore xla::swig::LocalComputationBuilder::Sub;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index bb42e8d703..b890980955 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -931,10 +931,37 @@ class ComputationBuilder(object):
                            _unwrap_data_handle(init)))
 
   def Dot(self, lhs, rhs):
-    """Matrix multiplication between lhs and rhs."""
+    """Enqueues a dot operation onto the computation.
+
+    Args:
+      lhs: ComputationDataHandle for the rank 1 or rank 2 left-hand-side array.
+      rhs: ComputationDataHandle for the rank 1 or rank 2 right-hand-side array.
+
+    Returns: a ComputationDataHandle representing the Dot operation.
+    """
     return _wrap_data_handle(
         self._client.Dot(_unwrap_data_handle(lhs), _unwrap_data_handle(rhs)))
 
+  def DotGeneral(self, lhs, rhs, dimension_numbers):
+    """Enqueues a general dot operation onto the computation.
+
+    Args:
+      lhs: ComputationDataHandle for the left-hand-side array.
+      rhs: ComputationDataHandle for the right-hand-side array.
+      dimension_numbers: either an xla_data_pb2.DotDimensionNumbers or a nested
+        tuple ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)) of lists of
+        integers representing the dimensions to treat as contracting dimensions
+        and batch dimensions on each input operand.
+
+    Returns: a ComputationDataHandle representing the DotGeneral operation.
+    """
+    if not isinstance(dimension_numbers, xla_data_pb2.DotDimensionNumbers):
+      dimension_numbers = GetDotDimensionsFromLists(dimension_numbers)
+    return _wrap_data_handle(
+        self._client.DotGeneral(
+            _unwrap_data_handle(lhs), _unwrap_data_handle(rhs),
+            dimension_numbers))
+
   def Conv(self, lhs, rhs, window_strides, padding):
     """Enqueues a Conv operation onto the computation.
 
@@ -1071,3 +1098,13 @@ def GetPaddingConfigFromTriples(triples):
     dimension.edge_padding_high = hi
     dimension.interior_padding = interior
   return padding_config
+
+
+def GetDotDimensionsFromLists(dimension_numbers):
+  (lhs_contract, rhs_contract), (lhs_batch, rhs_batch) = dimension_numbers
+  dot_dims_proto = xla_data_pb2.DotDimensionNumbers()
+  dot_dims_proto.lhs_contracting_dimensions.extend(lhs_contract)
+  dot_dims_proto.rhs_contracting_dimensions.extend(rhs_contract)
+  dot_dims_proto.lhs_batch_dimensions.extend(lhs_batch)
+  dot_dims_proto.rhs_batch_dimensions.extend(rhs_batch)
+  return dot_dims_proto
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 3b5bbfd786..421fba40e3 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -444,6 +444,30 @@ class SingleOpTest(LocalComputationTest):
     c.Dot(c.Constant(lhs), c.Constant(rhs))
     self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs))
 
+  def testDotGeneral(self):
+    c = self._NewComputation()
+    rng = np.random.RandomState(0)
+    lhs = NumpyArrayF32(rng.randn(10, 3, 4))
+    rhs = NumpyArrayF32(rng.randn(10, 4, 5))
+    dimension_numbers = (([2], [1]), ([0], [0]))
+    c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers)
+    self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs))
+
+  def testDotGeneralWithDotDimensionNumbersProto(self):
+    c = self._NewComputation()
+    rng = np.random.RandomState(0)
+    lhs = NumpyArrayF32(rng.randn(10, 3, 4))
+    rhs = NumpyArrayF32(rng.randn(10, 4, 5))
+
+    dimension_numbers = xla_client.xla_data_pb2.DotDimensionNumbers()
+    dimension_numbers.lhs_contracting_dimensions.append(2)
+    dimension_numbers.rhs_contracting_dimensions.append(1)
+    dimension_numbers.lhs_batch_dimensions.append(0)
+    dimension_numbers.rhs_batch_dimensions.append(0)
+
+    c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers)
+    self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs))
+
   def testConvF32Same(self):
     c = self._NewComputation()
     a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32")
-- 
GitLab


From a7398af84e30eda2cf47496c82bdfe1c9e36381d Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Thu, 1 Feb 2018 15:11:35 -0800
Subject: [PATCH 548/966] Make jit_test.py work with C API enabled.

PiperOrigin-RevId: 184202470
---
 tensorflow/contrib/compiler/jit_test.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/compiler/jit_test.py b/tensorflow/contrib/compiler/jit_test.py
index 2108e42bce..29a593f6bc 100644
--- a/tensorflow/contrib/compiler/jit_test.py
+++ b/tensorflow/contrib/compiler/jit_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -169,6 +170,7 @@ class JITTest(test.TestCase):
       self.assertEqual(b"jit_scope_0", func_attrs["_XlaScope"].s)
 
 
+@test_util.with_c_api
 class CompilationEnabledInGradientTest(test.TestCase):
 
   def testCompilationInGradient(self):
@@ -188,7 +190,7 @@ class CompilationEnabledInGradientTest(test.TestCase):
       for cg in c_grad_ops:
         self.assertTrue(cg.get_attr("_XlaCompile"))
       for ncg in nc_grad_ops:
-        with self.assertRaisesRegexp(ValueError, "No attr named"):
+        with self.assertRaisesRegexp(ValueError, "[Nn]o attr named"):
           ncg.get_attr("_XlaCompile")
 
       # d/dx (x ** 4) = 4 * (x ** 3)
-- 
GitLab


From d719238f1ddedf5569bfd0ca13fa3a29bfecdd78 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 15:32:20 -0800
Subject: [PATCH 549/966] Add iterate_batches arg to Estimator.predict

PiperOrigin-RevId: 184205196
---
 .../contrib/learn/python/learn/estimators/estimator.py   | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 8d59fe66d9..63d0f1e1d4 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -600,7 +600,8 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
               input_fn=None,
               batch_size=None,
               outputs=None,
-              as_iterable=True):
+              as_iterable=True,
+              iterate_batches=False):
     """Returns predictions for given features.
 
     Args:
@@ -616,6 +617,9 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
         for each example until inputs are exhausted. Note: The inputs must
         terminate if you want the iterable to terminate (e.g. be sure to pass
         num_epochs=1 if you are using something like read_batch_features).
+      iterate_batches: If True, yield the whole batch at once instead of
+        decomposing the batch into individual samples. Only relevant when
+        as_iterable is True.
 
     Returns:
       A numpy array of predicted classes or regression values if the
@@ -635,7 +639,8 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
         input_fn=input_fn,
         feed_fn=feed_fn,
         outputs=outputs,
-        as_iterable=as_iterable)
+        as_iterable=as_iterable,
+        iterate_batches=iterate_batches)
 
   def get_variable_value(self, name):
     """Returns value of the variable given by name.
-- 
GitLab


From e7cf813d810a8f0061d1f31d869a331687090904 Mon Sep 17 00:00:00 2001
From: Raghuraman Krishnamoorthi <raghuramank@google.com>
Date: Thu, 1 Feb 2018 16:14:51 -0800
Subject: [PATCH 550/966] Add functionality to fold batch norm (supporting both
 fused and unfused batch norm) to support quantized training. The weights are
 always now scaled by gamma/sigma, where sigma is the moving standard
 deviation for stability prior to quantization.  For improved performance, the
 moving means and variances are frozen and the training graph modified
 accordingly.

An additional parameter freeze_batch_norm_delay is added to foldbatchnorm function to set the delay at which training switches from regular batch norm to frozen mean and variances.

Remove placement options within FoldBatchNorm as this causes folded training to place all ops on a single GPU. The modification now significantly speeds up distributed training.

The tests for folding batch norms are also updated to reflect the additional topological changes to the graph.

PiperOrigin-RevId: 184211434
---
 tensorflow/contrib/quantize/BUILD             |   4 +
 .../quantize/python/fold_batch_norms.py       | 430 ++++++++++++++++--
 .../quantize/python/fold_batch_norms_test.py  |  97 ++--
 .../contrib/quantize/python/quantize_graph.py |  12 +-
 4 files changed, 467 insertions(+), 76 deletions(-)

diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 3c5b34a0a6..b7d525a1fa 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -77,9 +77,13 @@ py_library(
         "//tensorflow/contrib/graph_editor:graph_editor_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
     ],
 )
 
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index aa605e6caa..8ec5334a39 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -17,7 +17,6 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 import re
 from tensorflow.contrib import graph_editor
 from tensorflow.contrib.quantize.python import common
@@ -26,14 +25,16 @@ from tensorflow.contrib.quantize.python import input_to_ops
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
 
 
-def FoldBatchNorms(graph):
+def FoldBatchNorms(graph, freeze_batch_norm_delay=None, is_training=True):
   """Finds batch norm layers and folds them into preceding layers.
 
   Folding only affects the following layers: Conv2D, fully connected, depthwise
@@ -41,15 +42,25 @@ def FoldBatchNorms(graph):
 
   Args:
     graph: Graph to walk and modify.
+    freeze_batch_norm_delay: How many steps to wait before freezing
+    moving mean and variance and using them for batch normalization. This value
+    is used only when is_training is True.
+    is_training: Bool, true if training
 
   Raises:
     ValueError: When batch norm folding fails.
   """
-  _FoldFusedBatchNorms(graph)
-  _FoldUnfusedBatchNorms(graph)
+  _FoldFusedBatchNorms(
+      graph,
+      freeze_batch_norm_delay=freeze_batch_norm_delay,
+      is_training=is_training)
+  _FoldUnfusedBatchNorms(
+      graph,
+      freeze_batch_norm_delay=freeze_batch_norm_delay,
+      is_training=is_training)
 
 
-def _FoldFusedBatchNorms(graph):
+def _FoldFusedBatchNorms(graph, freeze_batch_norm_delay, is_training):
   """Finds fused batch norm layers and folds them into preceding layers.
 
   Folding only affects the following layers: Conv2D, fully connected, depthwise
@@ -57,6 +68,9 @@ def _FoldFusedBatchNorms(graph):
 
   Args:
     graph: Graph to walk and modify.
+    freeze_batch_norm_delay: How many steps to wait before freezing
+    moving mean and variance and using them for batch normalization
+    is_training: Bool, true if training
 
   Raises:
     ValueError: When batch norm folding fails.
@@ -67,8 +81,7 @@ def _FoldFusedBatchNorms(graph):
     # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
     # named `scope`. Otherwise, TF creates a unique scope whose name starts with
     # `scope`.
-    with graph.as_default(), graph.name_scope(scope + sep), ops.device(
-        match.bn_op.device):
+    with graph.as_default(), graph.name_scope(scope + sep):
       with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
         # new weights = old weights * gamma / sqrt(variance + epsilon)
         # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
@@ -79,9 +92,18 @@ def _FoldFusedBatchNorms(graph):
             match.mean_tensor * multiplier_tensor,
             name='bias')
 
+        correction_scale, correction_recip, correction_offset = None, None, None
+        if is_training:
+          correction_scale, correction_recip, correction_offset = (
+              _ComputeBatchNormCorrections(
+                  context='',
+                  match=match,
+                  freeze_batch_norm_delay=freeze_batch_norm_delay,
+                  fused_batch_norm=True))
         # The shape of depthwise weights is different, so we need to reshape the
         # multiplier_tensor to ensure that the scaled_weight_tensor has the
         # expected shape.
+        weights = match.weight_tensor
         if match.layer_op.type == 'DepthwiseConv2dNative':
           new_shape = [
               match.weight_tensor.get_shape().as_list()[2],
@@ -90,15 +112,29 @@ def _FoldFusedBatchNorms(graph):
           multiplier_tensor = array_ops.reshape(
               multiplier_tensor, new_shape, name='scale_reshape')
 
+          if correction_scale is not None:
+            correction_scale = array_ops.reshape(
+                correction_scale, new_shape, name='correction_reshape')
+
+      if correction_scale is not None:
+        weights = math_ops.multiply(
+            correction_scale, weights, name='correction_mult')
+
       # TODO(suharshs): This naming of the following ops needs to carefully
       # follow the naming expected by quantize.py. Generalize the quantize code
       # to not require these delicate naming conventions.
       scaled_weight_tensor = math_ops.multiply(
-          match.weight_tensor, multiplier_tensor, name='mul_fold')
+          weights, multiplier_tensor, name='mul_fold')
 
       new_layer_tensor = _CloneWithNewOperands(
           match.layer_op, match.input_tensor, scaled_weight_tensor)
 
+      if correction_recip is not None:
+        new_layer_tensor = math_ops.multiply(
+            correction_recip, new_layer_tensor, name='post_conv_mul')
+        new_layer_tensor = math_ops.add(new_layer_tensor, (correction_offset),
+                                        'correction_add')
+
       bias_add_tensor = math_ops.add(
           new_layer_tensor, bias_tensor, name='add_fold')
 
@@ -165,6 +201,8 @@ def _FindFusedBatchNorms(graph):
   mean_pattern = graph_matcher.OpTypePattern('*')
   variance_pattern = graph_matcher.OpTypePattern('*')
 
+  moving_average_pattern = graph_matcher.OpTypePattern('*')
+  bn_decay_pattern = graph_matcher.OpTypePattern('*')
   conv_pattern = graph_matcher.OpTypePattern(
       'Conv2D|DepthwiseConv2dNative', inputs=[input_pattern, weight_pattern])
   # MatMul has a Reshape between it and FusedBatchNorm.
@@ -180,6 +218,11 @@ def _FindFusedBatchNorms(graph):
           conv_pattern, gamma_pattern, beta_pattern, mean_pattern,
           variance_pattern
       ])
+  conv_moving_average_sub_pattern = graph_matcher.OpTypePattern(
+      'Sub', inputs=[moving_average_pattern, conv_batch_norm_pattern])
+  # TODO(suharshs): Use a OneofPattern here when available
+  conv_moving_average_mul_pattern = graph_matcher.OpTypePattern(
+      'Mul', inputs=[conv_moving_average_sub_pattern, bn_decay_pattern])
   matmul_batch_norm_pattern = graph_matcher.OpTypePattern(
       'FusedBatchNorm',
       inputs=[
@@ -191,8 +234,34 @@ def _FindFusedBatchNorms(graph):
       inputs=[matmul_batch_norm_pattern,
               graph_matcher.OpTypePattern('*')])
 
+  matmul_moving_average_sub_pattern = graph_matcher.OpTypePattern(
+      'Sub', inputs=[moving_average_pattern, matmul_batch_norm_pattern])
+  matmul_moving_average_mul_pattern = graph_matcher.OpTypePattern(
+      'Mul', inputs=[matmul_moving_average_sub_pattern, bn_decay_pattern])
+
   conv_matcher = graph_matcher.GraphMatcher(conv_batch_norm_pattern)
   matmul_matcher = graph_matcher.GraphMatcher(matmul_bn_output_reshape_pattern)
+  conv_moving_average_mul_matcher = graph_matcher.GraphMatcher(
+      conv_moving_average_mul_pattern)
+  matmul_moving_average_mul_matcher = graph_matcher.GraphMatcher(
+      matmul_moving_average_mul_pattern)
+
+  def _GetMovingAverageTensors(graph, moving_avg_mul_matcher,
+                               moving_avg_sub_pattern, bn_op):
+    """Gets the moving mean and variance tensors and the batch norm momentum."""
+    for mul_match_result in moving_avg_mul_matcher.match_graph(graph):
+      sub_op = mul_match_result.get_op(moving_avg_sub_pattern)
+
+      if sub_op.inputs[1].name == bn_op.outputs[1].name:
+        # During training: Batch Mean is bn_op.outputs[1]
+        moving_mean_tensor = sub_op.inputs[0]
+        bn_decay_mean_tensor = mul_match_result.get_tensor(bn_decay_pattern)
+      if sub_op.inputs[1].name == bn_op.outputs[2].name:
+        # During training: Batch Var is bn_op.outputs[2]
+        moving_variance_tensor = sub_op.inputs[0]
+        bn_decay_var_tensor = mul_match_result.get_tensor(bn_decay_pattern)
+    return (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor,
+            bn_decay_var_tensor)
 
   def _GetCommonTensors(match_result, bn_op, bn_input_tensor):
     """Gets tensors needed for FusedBatchNormMatch from match_result."""
@@ -222,10 +291,14 @@ def _FindFusedBatchNorms(graph):
       # calculation, the variance is corrected by the term N/N-1 (Bessel's
       # correction). The variance tensor read from FuseBatchNorm has bessel's
       # correction applied, so we undo it here.
-      n = math_ops.cast(
-          array_ops.size(bn_input_tensor) / array_ops.size(mean_tensor),
-          dtypes.float32)
-      variance_tensor = bn_op.outputs[2] * (n - 1) / n
+      scope, sep, _ = bn_op.name.rpartition('/')
+      g = ops.get_default_graph()
+      with g.as_default(), g.name_scope(scope + sep):
+        n = math_ops.cast(
+            array_ops.size(bn_input_tensor) / array_ops.size(mean_tensor),
+            dtypes.float32)
+        variance_tensor = math_ops.multiply(
+            bn_op.outputs[2], (n - 1) / n, name='Undo_Bessel_Correction')
     else:
       mean_tensor = match_result.get_tensor(mean_pattern)
       variance_tensor = match_result.get_tensor(variance_pattern)
@@ -233,15 +306,30 @@ def _FindFusedBatchNorms(graph):
             variance_tensor)
 
   for match_result in conv_matcher.match_graph(graph):
+    moving_mean_tensor = None
+    moving_variance_tensor = None
+    bn_decay_mean_tensor = None
+    bn_decay_var_tensor = None
     layer_op = match_result.get_op(conv_pattern)
     layer_tensor = match_result.get_tensor(conv_pattern)
     bn_op = match_result.get_op(conv_batch_norm_pattern)
-    # In the case of convolution the output_tensor is the output of bn_op.
-    output_tensor = bn_op.outputs[0]
+    if bn_op.get_attr('is_training'):
+      (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor,
+       bn_decay_var_tensor) = _GetMovingAverageTensors(
+           graph,
+           moving_avg_mul_matcher=conv_moving_average_mul_matcher,
+           moving_avg_sub_pattern=conv_moving_average_sub_pattern,
+           bn_op=bn_op)
 
+    output_tensor = bn_op.outputs[0]
+    batch_epsilon_tensor = bn_op.get_attr('epsilon')
     (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-     variance_tensor) = _GetCommonTensors(match_result, bn_op, layer_tensor)
-    yield _FusedBatchNormMatch(
+     variance_tensor) = _GetCommonTensors(
+         match_result,
+         bn_op,
+         layer_tensor,
+     )
+    yield _BatchNormMatch(
         layer_op=layer_op,
         bn_op=bn_op,
         output_tensor=output_tensor,
@@ -250,20 +338,38 @@ def _FindFusedBatchNorms(graph):
         gamma_tensor=gamma_tensor,
         beta_tensor=beta_tensor,
         mean_tensor=mean_tensor,
-        variance_tensor=variance_tensor)
+        variance_tensor=variance_tensor,
+        moving_mean_tensor=moving_mean_tensor,
+        moving_variance_tensor=moving_variance_tensor,
+        bn_decay_mean_tensor=bn_decay_mean_tensor,
+        bn_decay_var_tensor=bn_decay_var_tensor,
+        batch_epsilon_tensor=batch_epsilon_tensor)
 
   for match_result in matmul_matcher.match_graph(graph):
+    moving_mean_tensor = None
+    moving_variance_tensor = None
+    bn_decay_mean_tensor = None
+    bn_decay_var_tensor = None
     layer_op = match_result.get_op(matmul_pattern)
     layer_tensor = match_result.get_tensor(matmul_pattern)
     bn_op = match_result.get_op(matmul_batch_norm_pattern)
+    if bn_op.get_attr('is_training'):
+      (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor,
+       bn_decay_var_tensor) = _GetMovingAverageTensors(
+           graph,
+           moving_avg_mul_matcher=matmul_moving_average_mul_matcher,
+           moving_avg_sub_pattern=matmul_moving_average_sub_pattern,
+           bn_op=bn_op)
+
     # In the MatMul case, the output of batch norm is reshaped back into a
     # 2D tensor, so the output_tensor is the output of the Reshape op.
     output_reshape_op = match_result.get_op(matmul_bn_output_reshape_pattern)
     output_tensor = output_reshape_op.outputs[0]
+    batch_epsilon_tensor = bn_op.get_attr('epsilon')
 
     (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
      variance_tensor) = _GetCommonTensors(match_result, bn_op, layer_tensor)
-    yield _FusedBatchNormMatch(
+    yield _BatchNormMatch(
         layer_op=layer_op,
         bn_op=bn_op,
         output_tensor=output_tensor,
@@ -272,15 +378,21 @@ def _FindFusedBatchNorms(graph):
         gamma_tensor=gamma_tensor,
         beta_tensor=beta_tensor,
         mean_tensor=mean_tensor,
-        variance_tensor=variance_tensor)
+        variance_tensor=variance_tensor,
+        moving_mean_tensor=moving_mean_tensor,
+        moving_variance_tensor=moving_variance_tensor,
+        bn_decay_mean_tensor=bn_decay_mean_tensor,
+        bn_decay_var_tensor=bn_decay_var_tensor,
+        batch_epsilon_tensor=batch_epsilon_tensor)
 
 
-class _FusedBatchNormMatch(object):
-  """Contains all information related to a found FusedBatchNorm."""
+class _BatchNormMatch(object):
+  """Contains all information related to a found Fused/UnfusedBatchNorm."""
 
   def __init__(self, layer_op, bn_op, output_tensor, input_tensor,
                weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-               variance_tensor):
+               variance_tensor, moving_mean_tensor, moving_variance_tensor,
+               bn_decay_mean_tensor, bn_decay_var_tensor, batch_epsilon_tensor):
     self._layer_op = layer_op
     self._bn_op = bn_op
     self._output_tensor = output_tensor
@@ -290,6 +402,11 @@ class _FusedBatchNormMatch(object):
     self._beta_tensor = beta_tensor
     self._mean_tensor = mean_tensor
     self._variance_tensor = variance_tensor
+    self._moving_mean_tensor = moving_mean_tensor
+    self._moving_variance_tensor = moving_variance_tensor
+    self._bn_decay_mean_tensor = bn_decay_mean_tensor
+    self._bn_decay_var_tensor = bn_decay_var_tensor
+    self._batch_epsilon_tensor = batch_epsilon_tensor
 
   @property
   def layer_op(self):
@@ -327,8 +444,28 @@ class _FusedBatchNormMatch(object):
   def variance_tensor(self):
     return self._variance_tensor
 
+  @property
+  def moving_mean_tensor(self):
+    return self._moving_mean_tensor
+
+  @property
+  def moving_variance_tensor(self):
+    return self._moving_variance_tensor
+
+  @property
+  def batch_epsilon_tensor(self):
+    return self._batch_epsilon_tensor
+
+  @property
+  def bn_decay_mean_tensor(self):
+    return self._bn_decay_mean_tensor
+
+  @property
+  def bn_decay_var_tensor(self):
+    return self._bn_decay_var_tensor
+
 
-def _FoldUnfusedBatchNorms(graph):
+def _FoldUnfusedBatchNorms(graph, freeze_batch_norm_delay, is_training):
   """Finds unfused batch norm layers and folds them into preceding layers.
 
   Folding only affects the following layers: Conv2D, fully connected, depthwise
@@ -336,6 +473,9 @@ def _FoldUnfusedBatchNorms(graph):
 
   Args:
     graph: Graph to walk and modify.
+    freeze_batch_norm_delay: How many steps to wait before freezing
+    moving mean and variance and using them for batch normalization
+    is_training: Bool, True if training
 
   Raises:
     ValueError: When batch norm folding fails.
@@ -346,7 +486,12 @@ def _FoldUnfusedBatchNorms(graph):
     has_scaling = _HasScaling(graph, input_to_ops_map, bn)
 
     # The mangling code intimately depends on BatchNorm node's internals.
-    original_op, folded_op = _CreateFoldedOp(graph, bn, has_scaling=has_scaling)
+    original_op, folded_op = _CreateFoldedOp(
+        graph,
+        bn,
+        has_scaling=has_scaling,
+        freeze_batch_norm_delay=freeze_batch_norm_delay,
+        is_training=is_training)
 
     activation = common.GetEndpointActivationOp(graph, bn)
     if activation:
@@ -407,7 +552,186 @@ def _HasScaling(graph, input_to_ops_map, bn):
   return sum(1 for op in rsqrt_consumers if op.type == 'Mul') == 1
 
 
-def _CreateFoldedOp(graph, context, has_scaling):
+def _GetBatchNormParams(graph, context, has_scaling):
+  """Extracts relevant tensors for folding batch norms.
+
+  Args:
+    graph: Graph to inspect.
+    context: The scope under which we look for batch norm params
+    has_scaling: Bool that specifies if scaling is done as part of batch
+    norm
+
+  Returns:
+   _BatchNormMatch containing all required batch norm parameters
+  """
+  gamma_tensor = None
+  batch_mean_tensor = None
+  batch_variance_tensor = None
+  moving_mean_tensor = None
+  moving_variance_tensor = None
+  batch_epsilon_tensor = None
+  bn_decay_mean_tensor = None
+  bn_decay_var_tensor = None
+
+  split_context = context.split('/')
+  base_context = split_context[-1]
+
+  oplist = graph.get_operations()
+  op_suffix_gamma = base_context + '/BatchNorm/gamma'
+  op_suffix_mean = base_context + '/BatchNorm/moments/Squeeze'
+  op_suffix_variance = base_context + '/BatchNorm/moments/Squeeze_1'
+  op_suffix_moving_variance = base_context + '/BatchNorm/moving_variance/read'
+  op_suffix_moving_mean = base_context + '/BatchNorm/moving_mean/read'
+  op_suffix_epsilon = base_context + '/BatchNorm/batchnorm/add/y'
+  op_suffix_bn_decay_mean = base_context + '/BatchNorm/AssignMovingAvg/decay'
+  op_suffix_bn_decay_var = base_context + '/BatchNorm/AssignMovingAvg_1/decay'
+
+  # Parse through list of ops to find relevant ops
+  for op in oplist:
+    if op.name.endswith(op_suffix_mean):
+      # This is an efficient way to check for two things:
+      # Is batch norm present and is it training mode?
+      # Batch statistics are computed only during batch norm in training
+      batch_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_variance):
+      batch_variance_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_moving_mean):
+      moving_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_moving_variance):
+      moving_variance_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_epsilon):
+      batch_epsilon_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_bn_decay_mean):
+      bn_decay_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_bn_decay_var):
+      bn_decay_var_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if has_scaling:
+      if op.name.endswith(op_suffix_gamma):
+        gamma_tensor = graph.get_tensor_by_name(op.name + ':0')
+
+  if not has_scaling:
+    gamma_tensor = array_ops.ones(batch_mean_tensor.shape)
+
+  return _BatchNormMatch(
+      layer_op=None,
+      bn_op=None,
+      output_tensor=None,
+      input_tensor=None,
+      weight_tensor=None,
+      gamma_tensor=gamma_tensor,
+      beta_tensor=None,
+      mean_tensor=batch_mean_tensor,
+      variance_tensor=batch_variance_tensor,
+      moving_mean_tensor=moving_mean_tensor,
+      moving_variance_tensor=moving_variance_tensor,
+      bn_decay_mean_tensor=bn_decay_mean_tensor,
+      bn_decay_var_tensor=bn_decay_var_tensor,
+      batch_epsilon_tensor=batch_epsilon_tensor)
+
+
+def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
+                                 fused_batch_norm):
+  """Computes batch norm correction params.
+
+     Before batch normalization is frozen:
+     We use batch statistics for batch norm.
+       correction_scale = sigma_b/sigma_mv
+       correction_recip = 1/correction_scale
+       correction_offset = 0
+
+     After batch normalization is frozen:
+      correction_scale = sigma_b/sigma_mv
+      correction_recip = 1
+      correction_offset =  gamma*(mu_b/sigma_b-mu_mv/sigma_mv).
+
+     Batch norm is frozen if global_step > bn_freeze_delay.
+     The corrections ensure that:
+     a) The weights are quantized after scaling by gamma/sigma_mv. This enables
+     smoother training as the scaling on the weights changes slowly, rather than
+     jump across mini-batches
+     b) Changing the values of the corrections allows for one to switch between
+     using batch statistics to using moving mean and average, without requiring
+     changes to batch_norm
+
+
+  Args:
+    context: The scope under which we look for batch norm params
+    match: Object containg required batch norm tensors for correction
+      computation
+    freeze_batch_norm_delay: Delay in steps at which computation switches
+      from regular batch norm to frozen mean and variance.
+    fused_batch_norm: Bool, true if fused batch norm is used
+
+  Returns:
+    A tuple of correction_scale, correction_recip, correction_offset
+  """
+
+  g = ops.get_default_graph()
+  with g.name_scope(context + 'batch_norm_correction'):
+    recip_sigma_mv = math_ops.rsqrt(
+        match.moving_variance_tensor + match.batch_epsilon_tensor)
+    recip_sigma = math_ops.rsqrt(
+        match.variance_tensor + match.batch_epsilon_tensor)
+    correction_scale = math_ops.divide(
+        recip_sigma_mv, recip_sigma, name='scale_compute')
+    correction_scale = array_ops.identity(
+        correction_scale, name='correction_scale')
+    correction_recip = math_ops.reciprocal(
+        correction_scale, name='reciprocal_compute')
+    correction_offset = math_ops.multiply(
+        match.gamma_tensor,
+        match.mean_tensor * recip_sigma -
+        match.moving_mean_tensor * recip_sigma_mv,
+        name='offset_compute')
+
+    if freeze_batch_norm_delay is not None:
+      use_mv_avg = math_ops.greater_equal(
+          training_util.get_or_create_global_step(),
+          freeze_batch_norm_delay,
+          name='use_moving_average')
+    else:
+      use_mv_avg = False
+
+    bn_decay_zero = 0.0
+    bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())
+    bn_decay_var_consumers = list(match.bn_decay_mean_tensor.consumers())
+
+    bn_decay_mean_out = utils.smart_cond(
+        use_mv_avg,
+        lambda: bn_decay_zero,
+        lambda: match.bn_decay_mean_tensor,
+        name='freeze_moving_mean')
+    graph_editor.reroute_ts(
+        [bn_decay_mean_out], [match.bn_decay_mean_tensor],
+        can_modify=bn_decay_mean_consumers)
+
+    if fused_batch_norm is False:
+      bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers())
+      bn_decay_var_out = utils.smart_cond(
+          use_mv_avg,
+          lambda: bn_decay_zero,
+          lambda: match.bn_decay_var_tensor,
+          name='freeze_moving_var')
+      graph_editor.reroute_ts(
+          [bn_decay_var_out], [match.bn_decay_var_tensor],
+          can_modify=bn_decay_var_consumers)
+
+    correction_recip = utils.smart_cond(
+        use_mv_avg,
+        lambda: array_ops.ones(correction_scale.shape),
+        lambda: correction_recip,
+        name='correction_recip')
+
+    correction_offset = utils.smart_cond(
+        use_mv_avg,
+        lambda: correction_offset,
+        lambda: array_ops.zeros(correction_offset.shape),
+        name='correction_offset')
+  return correction_scale, correction_recip, correction_offset
+
+
+def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay,
+                    is_training):
   """Folds in batch norm layer into preceding convolution or FC layer.
 
   Creates 3 new nodes, connects their inputs and adds them to the graph:
@@ -419,6 +743,9 @@ def _CreateFoldedOp(graph, context, has_scaling):
     context: String, batch norm context, i.e. node into which BatchNorm is
         nested.
     has_scaling: Whether the batch norm has scaling enabled.
+    freeze_batch_norm_delay: How many steps to wait before freezing
+    moving mean and variance and using them for batch normalization
+    is_training: Bool, true if training
 
   Raises:
     ValueError: When operation type is not supported, or input and output tensor
@@ -435,19 +762,43 @@ def _CreateFoldedOp(graph, context, has_scaling):
                                           mul_scale_name)
   op_below = mul_scale.inputs[0].op
   weights = op_below.inputs[1]
-
+  match = _GetBatchNormParams(
+      graph=graph, context=context, has_scaling=has_scaling)
+  correction_scale, correction_recip, correction_offset = None, None, None
+  if is_training:
+    correction_scale, correction_recip, correction_offset = (
+        _ComputeBatchNormCorrections(
+            context=context,
+            match=match,
+            freeze_batch_norm_delay=freeze_batch_norm_delay,
+            fused_batch_norm=False))
   # Special handling for weights of depthwise convolution.
   if op_below.type == 'DepthwiseConv2dNative':
-    new_shape = [weights.get_shape().as_list()[2],
-                 weights.get_shape().as_list()[3]]
+    new_shape = [
+        weights.get_shape().as_list()[2],
+        weights.get_shape().as_list()[3]
+    ]
     scale_name = 'mul' if has_scaling else 'Rsqrt'
-    scale = graph.get_operation_by_name(context + '/BatchNorm/batchnorm/' +
-                                        scale_name)
+    scale = graph.get_operation_by_name(
+        context + '/BatchNorm/batchnorm/' + scale_name)
     scale = array_ops.reshape(scale.outputs[0], new_shape,
                               context + '/scale_reshape')
-    mul_fold = _CloneOp(mul_scale, context + '/mul_fold',
-                        [(0, weights), (1, scale)])
+
+    if correction_scale is not None:
+      correction_scale = array_ops.reshape(correction_scale, new_shape,
+                                           context + '/correction_reshape')
+      with ops.device(mul_scale.device):
+        weights = math_ops.multiply(correction_scale, weights,
+                                    context + '/correction_mult')
+
+    mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights),
+                                                           (1, scale)])
   elif op_below.type in ['Conv2D', 'MatMul']:
+
+    if correction_scale is not None:
+      with ops.device(mul_scale.device):
+        weights = math_ops.multiply(correction_scale, weights,
+                                    context + '/correction_mult')
     mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights)])
   else:
     raise ValueError('Cannot handle operation of type: %s' % op_below.op)
@@ -456,10 +807,17 @@ def _CreateFoldedOp(graph, context, has_scaling):
   conv_or_fc_folded = _CloneOp(op_below, op_below.name + '_Fold',
                                [(1, mul_fold.outputs[0])])
 
-  add_shift = graph.get_operation_by_name(context +
-                                          '/BatchNorm/batchnorm/add_1')
-  add_fold = _CloneOp(add_shift, context + '/add_fold',
-                      [(0, conv_or_fc_folded.outputs[0])])
+  add_shift = graph.get_operation_by_name(
+      context + '/BatchNorm/batchnorm/add_1')
+
+  corrected_output = conv_or_fc_folded.outputs[0]
+  if correction_offset is not None:
+    with ops.device(conv_or_fc_folded.device):
+      corrected_output = math_ops.multiply(correction_recip, corrected_output,
+                                           context + '/post_conv_mul')
+      corrected_output = math_ops.add(corrected_output, (correction_offset),
+                                      context + '/correction_add')
+  add_fold = _CloneOp(add_shift, context + '/add_fold', [(0, corrected_output)])
   _AssertShapesMatch('add_fold', add_fold.inputs[0], add_fold.outputs[0])
   return add_shift, add_fold
 
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
index ecf321ff57..330bd8a647 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
@@ -46,26 +46,27 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
 
   def _RunTestOverParameters(self, test_fn):
     parameters_list = [
-        # (relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm)
-        (nn_ops.relu6, 'Relu6', False, False, False),
-        (nn_ops.relu, 'Relu', False, False, False),
-        (nn_ops.relu6, 'Relu6', True, False, False),
-        (nn_ops.relu, 'Relu', True, False, False),
-        (nn_ops.relu6, 'Relu6', False, True, False),
-        (nn_ops.relu, 'Relu', False, True, False),
-        (nn_ops.relu6, 'Relu6', True, True, False),
-        (nn_ops.relu, 'Relu', True, True, False),
+        # (relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm,
+        # freeze_batch_norm_delay)
+        (nn_ops.relu6, 'Relu6', False, False, False, 100),
+        (nn_ops.relu, 'Relu', False, False, False, None),
+        (nn_ops.relu6, 'Relu6', True, False, False, 100),
+        (nn_ops.relu, 'Relu', True, False, False, None),
+        (nn_ops.relu6, 'Relu6', False, True, False, 100),
+        (nn_ops.relu, 'Relu', False, True, False, None),
+        (nn_ops.relu6, 'Relu6', True, True, False, 100),
+        (nn_ops.relu, 'Relu', True, True, False, None),
         # Fused batch norm always has scaling enabled.
-        (nn_ops.relu6, 'Relu6', False, True, True),
-        (nn_ops.relu, 'Relu', False, True, True),
-        (nn_ops.relu6, 'Relu6', True, True, True),
-        (nn_ops.relu, 'Relu', True, True, True),
+        (nn_ops.relu6, 'Relu6', False, True, True, None),
+        (nn_ops.relu, 'Relu', False, True, True, 100),
+        (nn_ops.relu6, 'Relu6', True, True, True, None),
+        (nn_ops.relu, 'Relu', True, True, True, 100),
     ]
     for params in parameters_list:
-      test_fn(params[0], params[1], params[2], params[3], params[4])
+      test_fn(params[0], params[1], params[2], params[3], params[4], params[5])
 
   def _TestFoldConv2d(self, relu, relu_op_name, with_bypass, has_scaling,
-                      fused_batch_norm):
+                      fused_batch_norm, freeze_batch_norm_delay):
     """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*.
 
     Args:
@@ -75,6 +76,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -99,12 +102,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     self._AssertInputOpsAre(folded_mul, [
-        scope + '/weights/read',
+        scope + '/correction_mult',
         self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
     ])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold'])
@@ -113,12 +117,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self.assertEqual(folded_conv.type, 'Conv2D')
     self._AssertInputOpsAre(folded_conv,
                             [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/Conv2D_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -128,7 +132,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldConv2d)
 
   def _TestFoldConv2dUnknownShape(self, relu, relu_op_name, with_bypass,
-                                  has_scaling, fused_batch_norm):
+                                  has_scaling, fused_batch_norm,
+                                  freeze_batch_norm_delay):
     """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*.
 
     Tests that folding works even with an input shape where some dimensions are
@@ -141,6 +146,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -164,12 +171,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     self._AssertInputOpsAre(folded_mul, [
-        scope + '/weights/read',
+        scope + '/correction_mult',
         self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
     ])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold'])
@@ -177,12 +185,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     folded_conv = g.get_operation_by_name(scope + '/Conv2D_Fold')
     self.assertEqual(folded_conv.type, 'Conv2D')
     self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/Conv2D_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -192,7 +200,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldConv2dUnknownShape)
 
   def _TestFoldFullyConnectedLayer(self, relu, relu_op_name, with_bypass,
-                                   has_scaling, fused_batch_norm):
+                                   has_scaling, fused_batch_norm,
+                                   freeze_batch_norm_delay):
     """Tests folding cases: inputs -> FC with batch norm -> Relu*.
 
     Args:
@@ -202,6 +211,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -223,12 +234,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     self._AssertInputOpsAre(folded_mul, [
-        scope + '/weights/read',
+        scope + '/correction_mult',
         self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
     ])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/MatMul_Fold'])
@@ -237,12 +249,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self.assertEqual(folded_conv.type, 'MatMul')
     self._AssertInputOpsAre(folded_conv,
                             [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/MatMul_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -252,7 +264,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldFullyConnectedLayer)
 
   def _TestFoldDepthwiseConv2d(self, relu, relu_op_name, with_bypass,
-                               has_scaling, fused_batch_norm):
+                               has_scaling, fused_batch_norm,
+                               freeze_batch_norm_delay):
     """Tests folding: inputs -> DepthwiseConv2d with batch norm -> Relu*.
 
     Args:
@@ -262,6 +275,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -286,7 +301,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
@@ -295,8 +311,7 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     else:
       scale_reshape_op_name = scope + '/scale_reshape'
     self._AssertInputOpsAre(folded_mul,
-                            [scope + '/depthwise_weights/read',
-                             scale_reshape_op_name])
+                            [scope + '/correction_mult', scale_reshape_op_name])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/depthwise_Fold'])
 
     scale_reshape = g.get_operation_by_name(scale_reshape_op_name)
@@ -311,12 +326,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self.assertEqual(folded_conv.type, 'DepthwiseConv2dNative')
     self._AssertInputOpsAre(folded_conv,
                             [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/depthwise_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -326,7 +341,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldDepthwiseConv2d)
 
   def _TestCompareFoldAndUnfolded(self, relu, relu_op_name, with_bypass,
-                                  has_scaling, fused_batch_norm):
+                                  has_scaling, fused_batch_norm,
+                                  freeze_batch_norm_delay):
     """Tests that running folded and unfolded BN returns the same results.
 
     Args:
@@ -336,6 +352,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     random_seed.set_random_seed(1234)
     unfolded_g = ops.Graph()
@@ -361,11 +379,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
       relu_node = relu(node, name='test/' + relu_op_name)
-
     folded_g = copy_graph.CopyGraph(unfolded_g)
     with folded_g.as_default():
-      fold_batch_norms.FoldBatchNorms(folded_g)
-
+      fold_batch_norms.FoldBatchNorms(
+          folded_g,
+          is_training=True,
+          freeze_batch_norm_delay=freeze_batch_norm_delay)
     with session.Session(graph=unfolded_g) as sess:
       sess.run(variables.global_variables_initializer())
       grad_node = gradients.gradients(relu_node, inputs)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index bbd9743d80..89b744c559 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -52,9 +52,19 @@ def _create_graph(input_graph,
   """
   # TODO(suharshs): Describe the process in more detail in the doc string.
   g = copy_graph.CopyGraph(input_graph)
+  if is_training:
+    # TODO(raghuramank): Need to make freeze_batch_norm_delay
+    # a function of the batch size. For now setting this to 250 epochs
+    # This corresponds to 5 million steps at a batch size of 64.
+    freeze_batch_norm_delay = 5000000
+  else:
+    freeze_batch_norm_delay = None
   with g.as_default():
     with ops.device(device_name_or_function):
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g,
+          freeze_batch_norm_delay=freeze_batch_norm_delay,
+          is_training=is_training)
       quantize.Quantize(g, is_training=is_training)
   if elements is None:
     return g
-- 
GitLab


From 922bc83735a5ce8f58245427efb17f87735c2848 Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Thu, 1 Feb 2018 16:22:24 -0800
Subject: [PATCH 551/966] GCS Throttle: 1 token == 1 Kb

Previously, 1 token was approximately 256 bytes. This is slightly less
intuitive than 1 kb.

PiperOrigin-RevId: 184212503
---
 tensorflow/core/platform/cloud/gcs_throttle.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/platform/cloud/gcs_throttle.h b/tensorflow/core/platform/cloud/gcs_throttle.h
index 8e46fca6ca..1a89daef08 100644
--- a/tensorflow/core/platform/cloud/gcs_throttle.h
+++ b/tensorflow/core/platform/cloud/gcs_throttle.h
@@ -126,7 +126,7 @@ class GcsThrottle {
   void UpdateState() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   inline uint64 request_bytes_to_tokens(size_t num_bytes) {
-    return num_bytes >> 8;
+    return num_bytes >> 10;
   }
 
   mutex mu_;
-- 
GitLab


From c5e02bc8fd71d73c5d05f583ce5391f26ad937d7 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Thu, 1 Feb 2018 16:29:30 -0800
Subject: [PATCH 552/966] Automated g4 rollback of changelist 184188816

PiperOrigin-RevId: 184213576
---
 .../tf2xla/functionalize_control_flow.cc      | 279 +++++-------------
 .../tf2xla/functionalize_control_flow_test.cc |  10 +-
 tensorflow/compiler/tf2xla/graph_compiler.cc  |   2 +-
 3 files changed, 82 insertions(+), 209 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 7a4fa79078..1d9e0fb33e 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -285,8 +285,7 @@ Status BuildLoopBody(const Graph& graph, Frame* frame,
 Status FunctionalizeLoop(Graph* graph, Frame* frame,
                          FunctionLibraryDefinition* library) {
   VLOG(2) << "Frame " << frame->name << " before: "
-          << dump_graph::DumpGraphToFile("functionalize_before", *graph,
-                                         library);
+          << dump_graph::DumpGraphToFile("functionalize_before", *graph);
 
   // Split loop-varying Enter nodes with multiple successors. If the same
   // Tensor is fed as input to multiple loop arguments, we may end up with a
@@ -451,7 +450,7 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
   TF_RETURN_IF_ERROR(BuildLoopBody(*graph, frame, &arg_types, &body_graph));
 
   VLOG(2) << "Frame " << frame->name << " condition: "
-          << dump_graph::DumpGraphToFile("loop_condition", *cond_graph, library)
+          << dump_graph::DumpGraphToFile("loop_condition", *cond_graph)
           << " body: " << dump_graph::DumpGraphToFile("loop_body", *body_graph);
 
   static std::atomic<int64> sequence_num(0LL);
@@ -532,8 +531,7 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
   frame->parent->nodes.insert(while_node);
 
   VLOG(2) << "Frame " << frame->name << " after: "
-          << dump_graph::DumpGraphToFile("functionalize_after", *graph,
-                                         library);
+          << dump_graph::DumpGraphToFile("functionalize_after", *graph);
 
   return Status::OK();
 }
@@ -566,11 +564,11 @@ class FunctionalizeCond {
     explicit CondArgNode(Node* input) : input(input) {}
     string ToString() const {
       return strings::StrCat("input=", input->name(),
-                             " switches=", NodesToString(switches));
+                             " switches=", NodesToString(switch_nodes));
     }
 
     Node* input;
-    std::vector<Node*> switches;
+    std::vector<Node*> switch_nodes;
   };
   using CondArgNodes = std::vector<CondArgNode>;
 
@@ -584,22 +582,15 @@ class FunctionalizeCond {
     int count;
   };
 
-  // Group of switch nodes that will be part of the same XlaIf.
-  struct SwitchCluster {
-    explicit SwitchCluster(Node* predicate) : predicate(predicate) {}
-    string ToString() const {
-      return strings::StrCat(name, " predicate=", predicate->name(),
-                             " switches=", NodesToString(switches));
-    }
+  struct PredicateSwitches {
+    explicit PredicateSwitches(Node* predicate) : predicate(predicate) {}
 
-    string name;
     Node* predicate;
     std::vector<Node*> switches;
   };
 
-  FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library,
-                    bool dump_graphs)
-      : library_(library), graph_(graph), dump_graphs_(dump_graphs) {}
+  FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library)
+      : library_(library), graph_(graph) {}
 
   // Perform the actual cond functionalization. Iterate over groups of switch
   // nodes (linked by common predicate), from innermost to outermost, and
@@ -610,25 +601,27 @@ class FunctionalizeCond {
   // frontier (the nodes where the cond ends).
   StatusOr<std::pair<std::unordered_map<Node*, ForwardFlowNode>,
                      std::unordered_set<Node*>>>
-  DetermineBranchMapAndFrontier(const SwitchCluster& switch_cluster);
+  DetermineBranchMapAndFrontier(const std::vector<Node*>& switches);
 
   // Returns XlaIf node created from subgraph of merge and switch nodes. This
   // encapsulates the process of extracting the bodies needed for the then and
   // else branch, creates a XlaIf node, removing the nodes of the branches from
   // the graph and replacing the merge node with a XlaIf.
   StatusOr<Node*> ConvertToXlaIf(const CondArgNodes& cond_arg_nodes,
-                                 const SwitchCluster& switch_cluster,
-                                 const std::vector<Node*>& switches);
+                                 const std::vector<Node*>& switch_nodes,
+                                 const std::vector<Node*>& merge_nodes,
+                                 Node* predicate);
 
   // Builds a XlaIfOp to replace the Switch-Graph-Merge cluster with.
   StatusOr<Node*> BuildAndAddXlaIfOp(const CondArgNodes& cond_arg_nodes,
-                                     const SwitchCluster& switch_cluster,
-                                     const std::vector<Node*>& merge_nodes);
+                                     const std::vector<Node*>& switch_nodes,
+                                     const std::vector<Node*>& merge_nodes,
+                                     Node* predicate);
 
   // Extracts a function body corresponding to the given input edge of the merge
   // node.
   Status ExtractBody(const CondArgNodes& cond_arg_nodes,
-                     const std::vector<Node*>& switches,
+                     const std::vector<Node*>& switch_nodes,
                      const std::vector<Node*>& merge_nodes, int input_edge,
                      Graph* body);
 
@@ -639,9 +632,9 @@ class FunctionalizeCond {
   // Adds all output edges from the `if_node`.
   Status AddOutputEdges(const std::vector<Node*>& outputs, Node* if_node);
 
-  // Returns the switch clusters of graph_ in postorder. Dead switch nodes are
-  // skipped and removed from the graph.
-  std::vector<SwitchCluster> DeterminePredicateSwitchOrder();
+  // Returns the switches of graph_ (along with grouping predicates) in
+  // postorder. Dead switch nodes are skipped and removed from the graph.
+  std::vector<PredicateSwitches> DeterminePredicateSwitchOrder();
 
   // Update the state for destination based on the state of source and the node
   // being updated.
@@ -664,7 +657,6 @@ class FunctionalizeCond {
 
   FunctionLibraryDefinition* library_;
   Graph* graph_;
-  bool dump_graphs_;
 };
 
 bool IsDeadSwitch(const Node* node) {
@@ -712,13 +704,10 @@ Status FunctionalizeCond::ValidateFrontier(
           ") in both Else and Then branch should be in Both.");
     }
   }
-  // An empty frontier indicates a dead switch. Above we attempt to remove dead
-  // switch nodes, but not all are removed so don't treat it as an error yet.
-  // TODO(jpienaar): Find out why dead switch nodes remain.
-  // if (pending[kBoth].empty() && pending[kThenBranch].empty() &&
-  //     pending[kElseBranch].empty()) {
-  //   return errors::Internal("Unexpected empty frontier for switch nodes");
-  // }
+  if (pending[kBoth].empty() && pending[kThenBranch].empty() &&
+      pending[kElseBranch].empty()) {
+    return errors::Internal("Unexpected empty frontier for switch nodes");
+  }
   return Status::OK();
 }
 
@@ -745,138 +734,33 @@ Status FunctionalizeCond::Join(const ForwardFlowNode& src_state,
   return Status::OK();
 }
 
-std::vector<FunctionalizeCond::SwitchCluster>
+std::vector<FunctionalizeCond::PredicateSwitches>
 FunctionalizeCond::DeterminePredicateSwitchOrder() {
-  struct Cluster {
-    bool operator==(const Cluster& other) const {
-      return representative == other.representative;
-    }
-    int representative = -1;
-  };
-
-  // Perform a DFS over the graph and
-  // * Determine the reverse topological order of the nodes (there should be no
-  //   cycles at this point so the post-order numbering corresponds to the
-  //   reverse topological sorting);
-  // * Identify dead switches;
-  // * Initialize the cluster's representative;
-  std::vector<UnionFind<Cluster>> clusters(graph_->num_node_ids());
   std::vector<Node*> dead_switches;
   std::vector<Node*> switch_order;
-  std::vector<Node*> rev_topo_sorted_nodes;
-  DFS(*graph_, nullptr, [&](Node* n) {
-    clusters[n->id()].Get().representative = n->id();
+  DFS(*graph_, nullptr, [this, &dead_switches, &switch_order](Node* n) {
     if (IsSwitch(n)) {
       if (IsDeadSwitch(n)) {
         dead_switches.push_back(n);
       } else {
-        rev_topo_sorted_nodes.push_back(n);
         switch_order.push_back(n);
       }
-    } else if (n->IsOp()) {
-      // Exclude src and sink nodes from further consideration.
-      rev_topo_sorted_nodes.push_back(n);
     }
   });
 
-  std::vector<SwitchCluster> switch_clusters;
-  // Return early if there are no switches in the graph.
-  if (switch_order.empty()) {
-    return switch_clusters;
-  }
-
   // Remove all dead switch nodes.
   for (Node* n : dead_switches) {
     VLOG(2) << "Removing dead switch: " << n->DebugString();
     graph_->RemoveNode(n);
   }
 
-  // Identify switch nodes that are part of the same control flow context by
-  // considering the operands of operations: an operation is part of the same
-  // control context as its operands unless the operation is a switch. Control
-  // dependencies are considered part of the same control flow context if the
-  // switch depth is the same (see comment below).
-  // TODO(jpienaar): This could be combined with DetermineBranchMapAndFrontier.
-  std::vector<int> switch_depth(graph_->num_node_ids());
-  // entry_cluster records the input cluster to a switch node. This is used when
-  // merging with a merge node where the dst's cluster is merged with the entry
-  // cluster of the merge node's cluster (which corresponds to a switch cluster
-  // and so has an entry cluster).
-  std::unordered_map<int, UnionFind<Cluster>*> entry_cluster;
-  for (auto it = rev_topo_sorted_nodes.rbegin();
-       it != rev_topo_sorted_nodes.rend(); ++it) {
-    Node* n = *it;
-
-    // Compute switch depth.
-    int new_switch_depth = 0;
-    for (const Edge* e : n->in_edges()) {
-      Node* src = e->src();
-      new_switch_depth = std::max(
-          new_switch_depth, switch_depth[src->id()] + (IsSwitch(src) ? 1 : 0) -
-                                (IsMerge(src) ? 1 : 0));
-    }
-    switch_depth[n->id()] = new_switch_depth;
-
-    // Only merge the input operands of a switch. The switch's clustering itself
-    // is determined by the interaction of the switch's outputs.
-    if (IsSwitch(n)) {
-      Node* input;
-      TF_CHECK_OK(n->input_node(0, &input));
-      UnionFind<Cluster>& cluster = clusters[input->id()];
-      entry_cluster[n->id()] = &cluster;
-      // Merge the inputs of the switch node with one another. This results in
-      // predicates and control input residing in the same cluster.
-      for (const Edge* e : n->in_edges()) {
-        Node* src = e->src();
-        cluster.Merge(&clusters[src->id()]);
-      }
-      continue;
-    }
-
-    for (const Edge* e : n->in_edges()) {
-      Node* src = e->src();
-      if (!src->IsOp()) continue;
-      UnionFind<Cluster>* cluster = &clusters[src->id()];
-      if (IsMerge(src)) {
-        cluster = entry_cluster.at(clusters[src->id()].Get().representative);
-      }
-      // Merge a node with its data operands and with its control operands if
-      // the src and dst are in the same ControlContext. The ControlContext is
-      // not explicitly available here, and instead the switch depth is used as
-      // a proxy here. Due to the invariant that control edges can only be from
-      // a containing scope to an inner scope or from the inner scope to its
-      // containing scope (for exit nodes), the switch depth will only match if
-      // the src and dst are in the same ControlContext. Control edges between
-      // ControlContexts are handled during the extraction.
-      if (!e->IsControlEdge() ||
-          new_switch_depth ==
-              switch_depth[src->id()] + (IsSwitch(src) ? 1 : 0)) {
-        cluster->Merge(&clusters[n->id()]);
-      }
-    }
-  }
-
-  if (dump_graphs_) {
-    // Mark the switch cluster each node is part of.
-    for (Node* n : graph_->nodes()) {
-      n->ClearAttr("_XlaFunctionalizeSwitchGroup");
-      n->AddAttr("_XlaFunctionalizeSwitchGroup",
-                 clusters[n->id()].Get().representative);
-    }
-    LOG(INFO) << "FunctionalizeControlFlow (with_clusters): "
-              << dump_graph::DumpGraphToFile("functionalize_clustered", *graph_,
-                                             library_);
+  std::vector<PredicateSwitches> predicate_switch_order;
+  if (switch_order.empty()) {
+    return predicate_switch_order;
   }
 
-  struct Hash {
-    size_t operator()(const std::pair<Node*, Cluster>& item) const {
-      return Hash64Combine(hash<Node*>()(item.first),
-                           std::hash<int>()(item.second.representative));
-    }
-  };
-
   // Merge Switch nodes with common predicate.
-  std::unordered_map<std::pair<Node*, Cluster>, int, Hash> predicate_index;
+  std::unordered_map<Node*, int> predicate_index;
   // The nodes in switch_order are in reverse topological order, but the
   // clustered switches need not be (i.e., when considered as a cluster one
   // element of a cluster may be later in the topological order than another
@@ -885,19 +769,13 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
   for (auto it = switch_order.rbegin(); it != switch_order.rend(); ++it) {
     Node* pred;
     TF_CHECK_OK((*it)->input_node(1, &pred));
-    auto repr = std::make_pair(pred, clusters[(*it)->id()].Get());
-    if (predicate_index.find(repr) == predicate_index.end()) {
-      predicate_index[repr] = switch_clusters.size();
-      switch_clusters.emplace_back(pred);
-      // Generate a name by concating with the cluster representative as there
-      // could be multiple switch clusters with the same predicate.
-      switch_clusters[predicate_index[repr]].name =
-          strings::StrCat(pred->name(), "_", repr.second.representative, "_If");
+    if (predicate_index.find(pred) == predicate_index.end()) {
+      predicate_index[pred] = predicate_switch_order.size();
+      predicate_switch_order.emplace_back(pred);
     }
-    switch_clusters[predicate_index[repr]].switches.push_back(*it);
+    predicate_switch_order[predicate_index[pred]].switches.push_back(*it);
   }
-
-  return switch_clusters;
+  return predicate_switch_order;
 }
 
 StatusOr<std::vector<Node*>>
@@ -945,10 +823,10 @@ StatusOr<
     std::pair<std::unordered_map<Node*, FunctionalizeCond::ForwardFlowNode>,
               std::unordered_set<Node*>>>
 FunctionalizeCond::DetermineBranchMapAndFrontier(
-    const SwitchCluster& switch_cluster) {
+    const std::vector<Node*>& switches) {
   std::unordered_map<Node*, ForwardFlowNode> branch_map;
   std::unordered_set<Node*> frontier;
-  std::vector<Node*> stack = switch_cluster.switches;
+  std::vector<Node*> stack = switches;
   std::vector<bool> visited(graph_->num_node_ids(), false);
   while (!stack.empty()) {
     Node* n = stack.back();
@@ -990,7 +868,7 @@ FunctionalizeCond::DetermineBranchMapAndFrontier(
     }
   }
 
-  if (dump_graphs_) {
+  if (VLOG_IS_ON(2)) {
     for (const auto& kv : branch_map) {
       // Append attribute to the graph if running with logging to make the
       // changes clearer in the visualization.
@@ -1002,7 +880,7 @@ FunctionalizeCond::DetermineBranchMapAndFrontier(
 }
 
 Status FunctionalizeCond::FunctionalizeInternal() {
-  std::vector<SwitchCluster> predicate_switch_order =
+  std::vector<PredicateSwitches> predicate_switch_order =
       DeterminePredicateSwitchOrder();
 
   // Iterate from innermost set of clustered switches to outermost, replacing
@@ -1016,12 +894,10 @@ Status FunctionalizeCond::FunctionalizeInternal() {
     std::unordered_map<Node*, ForwardFlowNode> branch_map;
     std::unordered_set<Node*> frontier;
     TF_ASSIGN_OR_RETURN(std::tie(branch_map, frontier),
-                        DetermineBranchMapAndFrontier(ps));
+                        DetermineBranchMapAndFrontier(ps.switches));
 
-    if (dump_graphs_)
-      LOG(INFO) << "FunctionalizeControlFlow (before XlaIf conversion): "
-                << dump_graph::DumpGraphToFile("functionalize_bc", *graph_,
-                                               library_);
+    VLOG(2) << "FunctionalizeControlFlow (before XlaIf conversion): "
+            << dump_graph::DumpGraphToFile("functionalize_bc", *graph_);
     TF_RETURN_IF_ERROR(ValidateFrontier(branch_map, frontier));
 
     // Sort the merge and switch nodes using NodeCmp. The switch-nodes are
@@ -1038,7 +914,7 @@ Status FunctionalizeCond::FunctionalizeInternal() {
         input_index[in] = cond_arg_nodes.size();
         cond_arg_nodes.emplace_back(in);
       }
-      cond_arg_nodes.at(input_index.at(in)).switches.push_back(switch_node);
+      cond_arg_nodes.at(input_index.at(in)).switch_nodes.push_back(switch_node);
     }
     std::vector<Node*> merge_nodes(frontier.begin(), frontier.end());
     std::sort(merge_nodes.begin(), merge_nodes.end(), NodeCmp());
@@ -1047,8 +923,9 @@ Status FunctionalizeCond::FunctionalizeInternal() {
                         EnsureDominanceAndReturnNonDominatedControlNodes(
                             branch_map, ps.switches));
 
-    TF_ASSIGN_OR_RETURN(Node * if_node,
-                        ConvertToXlaIf(cond_arg_nodes, ps, merge_nodes));
+    TF_ASSIGN_OR_RETURN(
+        Node * if_node,
+        ConvertToXlaIf(cond_arg_nodes, ps.switches, merge_nodes, ps.predicate));
     for (Node* old : old_control_nodes) {
       graph_->AddControlEdge(old, if_node);
     }
@@ -1057,26 +934,25 @@ Status FunctionalizeCond::FunctionalizeInternal() {
       graph_->RemoveNode(del_kv.first);
     }
     for (auto& kv : cond_arg_nodes) {
-      for (Node* node : kv.switches) {
+      for (Node* node : kv.switch_nodes) {
         graph_->RemoveNode(node);
       }
     }
-    if (dump_graphs_)
-      LOG(INFO) << "FunctionalizeControlFlow (after XlaIf conversion): "
-                << dump_graph::DumpGraphToFile("functionalize_ac", *graph_,
-                                               library_);
+    VLOG(2) << "FunctionalizeControlFlow (after XlaIf conversion): "
+            << dump_graph::DumpGraphToFile("functionalize_ac", *graph_);
   }
   return Status::OK();
 }
 
 StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
-    const CondArgNodes& cond_arg_nodes, const SwitchCluster& switch_cluster,
-    const std::vector<Node*>& merge_nodes) {
-  VLOG(2) << "Build if op for " << switch_cluster.name;
+    const CondArgNodes& cond_arg_nodes, const std::vector<Node*>& switch_nodes,
+    const std::vector<Node*>& merge_nodes, Node* predicate) {
+  VLOG(2) << "Build if op for " << NodesToString(merge_nodes) << " with input "
+          << NodesToString(switch_nodes);
 
   NodeDef if_def;
   // Create a new If node using the name of the merge node.
-  NodeDefBuilder builder(switch_cluster.name, "XlaIf");
+  NodeDefBuilder builder(strings::StrCat(predicate->name(), "_If"), "XlaIf");
   string branch[] = {"else_branch", "then_branch"};
   for (int i = 0; i < 2; ++i) {
     static std::atomic<int64> sequence_num(0LL);
@@ -1086,9 +962,12 @@ StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
     body_name.set_name(
         strings::StrCat("_functionalize_if_", branch[i], "_", id));
     auto body = xla::MakeUnique<Graph>(graph_->op_registry());
-    TF_RETURN_IF_ERROR(ExtractBody(cond_arg_nodes, switch_cluster.switches,
-                                   merge_nodes, i, body.get()));
+    TF_RETURN_IF_ERROR(
+        ExtractBody(cond_arg_nodes, switch_nodes, merge_nodes, i, body.get()));
     VLOG(3) << "Body " << branch[i] << ": " << DebugString(body.get());
+    VLOG(4) << "FunctionalizeControlFlow (" << branch[i] << "): "
+            << dump_graph::DumpGraphToFile(
+                   strings::StrCat("functionalize_", branch[i]), *body);
     FunctionDef body_fdef;
     TF_RETURN_IF_ERROR(GraphToFunctionDef(*body, body_name.name(), &body_fdef));
     TF_RETURN_IF_ERROR(library_->AddFunctionDef(body_fdef));
@@ -1100,7 +979,7 @@ StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
   DataTypeVector in_arg_types;
   for (auto& kv : cond_arg_nodes) {
     bool inserted = false;
-    for (const Node* arg : kv.switches) {
+    for (const Node* arg : kv.switch_nodes) {
       const Edge* in_edge;
       TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
       if (in_edge->IsControlEdge()) {
@@ -1127,11 +1006,10 @@ StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
   builder.Attr("Tout", out_type);
 
   builder.Attr("Tcond", DT_BOOL);
-  builder.Device(switch_cluster.predicate->assigned_device_name());
+  builder.Device(predicate->assigned_device_name());
   // Conditional should be the first input ...
   builder.Input(
-      NodeDefBuilder::NodeOut(switch_cluster.predicate->name(), 0,
-                              switch_cluster.predicate->output_type(0)));
+      NodeDefBuilder::NodeOut(predicate->name(), 0, predicate->output_type(0)));
   // ... followed by the other inputs.
   builder.Input(inputs);
 
@@ -1141,7 +1019,7 @@ StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
 }
 
 Status FunctionalizeCond::ExtractBody(const CondArgNodes& cond_arg_nodes,
-                                      const std::vector<Node*>& switches,
+                                      const std::vector<Node*>& switch_nodes,
                                       const std::vector<Node*>& merge_nodes,
                                       int input_edge, Graph* body) {
   VLOG(2) << "ExtractBody for " << NodesToString(merge_nodes) << " along edge "
@@ -1151,7 +1029,7 @@ Status FunctionalizeCond::ExtractBody(const CondArgNodes& cond_arg_nodes,
   int arg_count = 0;
   for (auto& kv : cond_arg_nodes) {
     Node* arg_node = nullptr;
-    for (const auto* arg : kv.switches) {
+    for (const auto* arg : kv.switch_nodes) {
       DataType dtype = arg->input_type(0);
       if (arg_node == nullptr) {
         TF_ASSIGN_OR_RETURN(arg_node, BuildArgNode(body, dtype, arg_count++));
@@ -1175,7 +1053,8 @@ Status FunctionalizeCond::ExtractBody(const CondArgNodes& cond_arg_nodes,
       node_map.at(in->id()) = body->CopyNode(in);
     }
 
-    if (std::find(switches.begin(), switches.end(), in) == switches.end()) {
+    if (std::find(switch_nodes.begin(), switch_nodes.end(), in) ==
+        switch_nodes.end()) {
       body->AddEdge(node_map.at(in->id()), in_edge->src_output(),
                     node_map.at(node->id()), 0);
     } else {
@@ -1197,7 +1076,7 @@ Status FunctionalizeCond::AddInputEdges(const CondArgNodes& cond_arg_nodes,
   graph_->AddEdge(predicate, 0, if_node, index++);
   for (auto& kv : cond_arg_nodes) {
     bool inserted = false;
-    for (const Node* arg : kv.switches) {
+    for (const Node* arg : kv.switch_nodes) {
       const Edge* in_edge;
       TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
       if (in_edge->IsControlEdge()) {
@@ -1240,17 +1119,16 @@ Status FunctionalizeCond::AddOutputEdges(const std::vector<Node*>& outputs,
 }
 
 StatusOr<Node*> FunctionalizeCond::ConvertToXlaIf(
-    const CondArgNodes& cond_arg_nodes, const SwitchCluster& switch_cluster,
-    const std::vector<Node*>& merge_nodes) {
-  VLOG(1) << "ConvertToXlaIf for " << switch_cluster.ToString() << " -> "
+    const CondArgNodes& cond_arg_nodes, const std::vector<Node*>& switch_nodes,
+    const std::vector<Node*>& merge_nodes, Node* predicate) {
+  VLOG(1) << "ConvertToXlaIf for " << NodesToString(switch_nodes) << " -> "
           << NodesToString(merge_nodes);
 
   // Extract bodies and builds a If operator.
   TF_ASSIGN_OR_RETURN(
       Node * if_node,
-      BuildAndAddXlaIfOp(cond_arg_nodes, switch_cluster, merge_nodes));
-  TF_RETURN_IF_ERROR(
-      AddInputEdges(cond_arg_nodes, switch_cluster.predicate, if_node));
+      BuildAndAddXlaIfOp(cond_arg_nodes, switch_nodes, merge_nodes, predicate));
+  TF_RETURN_IF_ERROR(AddInputEdges(cond_arg_nodes, predicate, if_node));
   TF_RETURN_IF_ERROR(AddOutputEdges(merge_nodes, if_node));
 
   return if_node;
@@ -1259,19 +1137,18 @@ StatusOr<Node*> FunctionalizeCond::ConvertToXlaIf(
 Status FunctionalizeCond::Functionalize(Graph* graph,
                                         FunctionLibraryDefinition* library) {
   VLOG(1) << "FunctionalizeCond::Functionalize";
-  FunctionalizeCond fc(graph, library, /*dump_graphs=*/VLOG_IS_ON(2));
+  FunctionalizeCond fc(graph, library);
   return fc.FunctionalizeInternal();
 }
 
 }  // namespace
 
-// Transformation that converts TensorFlow's graph control flow constructs into
+// Transformation that converts Tensorflow's graph control flow constructs into
 // functional equivalents.
 Status FunctionalizeControlFlow(Graph* graph,
                                 FunctionLibraryDefinition* library) {
   VLOG(2) << "FunctionalizeControlFlow (initial): "
-          << dump_graph::DumpGraphToFile("functionalize_initial", *graph,
-                                         library);
+          << dump_graph::DumpGraphToFile("functionalize_initial", *graph);
   // Note: BuildControlFlowInfo() requires that the graph's source node is
   // connected to all source nodes in the graph. Many graphs violate this
   // invariant.
@@ -1283,8 +1160,7 @@ Status FunctionalizeControlFlow(Graph* graph,
   for (Node* node : graph->op_nodes()) {
     const ControlFlowInfo& cf = cf_info[node->id()];
 
-    VLOG(2) << "node: " << node->name() << " (" << node->id()
-            << ") frame_name: " << cf.frame_name
+    VLOG(2) << "node: " << node->name() << " frame_name: " << cf.frame_name
             << " frame: " << (cf.frame ? cf.frame->name() : "---")
             << " parent_frame: "
             << (cf.parent_frame ? cf.parent_frame->name() : "---");
@@ -1352,8 +1228,7 @@ Status FunctionalizeControlFlow(Graph* graph,
   TF_RETURN_IF_ERROR(FunctionalizeCond::Functionalize(graph, library));
 
   VLOG(2) << "FunctionalizeControlFlow (final): "
-          << dump_graph::DumpGraphToFile("functionalize_final", *graph,
-                                         library);
+          << dump_graph::DumpGraphToFile("functionalize_final", *graph);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index bc7276c3af..71f12a1333 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -38,11 +38,10 @@ namespace {
 
 // Returns the names of the "then" and "else" functions for the XlaIf node in a
 // graph.
-Status FindIfThenAndElse(const GraphDef& graph, string* op_name,
-                         NameAttrList* then_fn, NameAttrList* else_fn) {
+Status FindIfThenAndElse(const GraphDef& graph, NameAttrList* then_fn,
+                         NameAttrList* else_fn) {
   for (const NodeDef& node : graph.node()) {
     if (node.op() == "XlaIf") {
-      *op_name = node.name();
       const NameAttrList* result;
       TF_RETURN_IF_ERROR(GetNodeAttr(node, "then_branch", &result));
       *then_fn = *result;
@@ -97,10 +96,9 @@ TEST(FunctionalizeControlFlow, Conditional) {
 
   GraphDef graph_def;
   graph.ToGraphDef(&graph_def);
-  string op_name;
   NameAttrList then_fn;
   NameAttrList else_fn;
-  TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn));
+  TF_EXPECT_OK(FindIfThenAndElse(graph_def, &then_fn, &else_fn));
   InstantiationResultForTest else_result;
   TF_EXPECT_OK(
       InstantiateFunctionForTest(else_fn.name(), library, &else_result));
@@ -111,7 +109,7 @@ TEST(FunctionalizeControlFlow, Conditional) {
     auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32);
     auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
     auto less = ops::Less(scope.WithOpName("cond/Less"), y, x);
-    auto if_op = ops::XlaIf(scope.WithOpName(op_name), less,
+    auto if_op = ops::XlaIf(scope.WithOpName("cond/Less_If"), less,
                             std::initializer_list<Input>{less, y, x}, then_fn,
                             else_fn, {DT_INT32});
     GraphDef expected;
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index c90ea09e17..02215b5112 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -136,7 +136,7 @@ Status GraphCompiler::Compile() {
       TF_RET_CHECK(src->id() < output_registry.size());
       const NodeOutputs& src_outputs = output_registry[src->id()];
 
-      tensor_inputs_.at(e->dst_input()) = src_outputs.at(e->src_output());
+      tensor_inputs_[e->dst_input()] = src_outputs[e->src_output()];
     }
 
     OpKernelContext op_context(&params, n->num_outputs());
-- 
GitLab


From 1af63d6a6d738762c363ad05107d0b6959d53e76 Mon Sep 17 00:00:00 2001
From: Andrew Selle <aselle@google.com>
Date: Thu, 1 Feb 2018 16:51:59 -0800
Subject: [PATCH 553/966] Allow reordering of execution order of nodes with
 indirect execution_plan.

Now whenever we want to operate in dependency order we use execution_plan.
It begins as identity map (0, ..., nodes_size()) but can be changed
in the future. This is the basis for more pluggable delegation.

PiperOrigin-RevId: 184216885
---
 tensorflow/contrib/lite/interpreter.cc      |  68 +++++++----
 tensorflow/contrib/lite/interpreter.h       |  19 ++-
 tensorflow/contrib/lite/interpreter_test.cc | 127 ++++++++++++++++++++
 3 files changed, 190 insertions(+), 24 deletions(-)

diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 5f5981e45a..a8db149eaa 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -36,6 +36,10 @@ constexpr const int kSlotsToReserve = 128;
 namespace tflite {
 
 // A trivial implementation of GraphInfo around the Interpreter.
+// NOTE: this interpreter info represents the subset of the
+// graph that is executed according to execution plan. Thus,
+// the indices are execution plan indices rather than raw node
+// indices.
 class InterpreterInfo : public GraphInfo {
  public:
   explicit InterpreterInfo(Interpreter* interpreter)
@@ -45,9 +49,12 @@ class InterpreterInfo : public GraphInfo {
   TfLiteTensor* tensor(size_t index) override {
     return interpreter_->tensor(index);
   }
-  size_t num_nodes() const override { return interpreter_->nodes_size(); }
+  size_t num_nodes() const override {
+    return interpreter_->execution_plan().size();
+  }
   const TfLiteNode& node(size_t index) const override {
-    return interpreter_->node_and_registration(index)->first;
+    int node_index = interpreter_->execution_plan()[index];
+    return interpreter_->node_and_registration(node_index)->first;
   }
   const std::vector<int>& inputs() const override {
     return interpreter_->inputs();
@@ -73,7 +80,7 @@ Interpreter::Interpreter(ErrorReporter* error_reporter)
   // Reserve some space for the tensors to avoid excessive resizing.
   tensors_.reserve(kSlotsToReserve);
   nodes_and_registration_.reserve(kSlotsToReserve);
-  next_node_to_prepare_ = 0;
+  next_execution_plan_index_to_prepare_ = 0;
   UseNNAPI(false);
 }
 
@@ -160,7 +167,7 @@ TfLiteIntArray* convertVectorToTfLiteIntArray(const std::vector<int>& x) {
 }  // namespace
 
 TfLiteStatus Interpreter::AllocateTensors() {
-  next_node_to_prepare_ = 0;
+  next_execution_plan_index_to_prepare_ = 0;
   if (memory_planner_) {
     TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations());
   }
@@ -190,7 +197,8 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
       &context_,
       CheckTensorIndices("node outputs", outputs.data(), outputs.size()));
 
-  if (node_index) *node_index = nodes_and_registration_.size();
+  int new_node_index = nodes_and_registration_.size();
+  if (node_index) *node_index = new_node_index;
   nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
   auto& node_and_reg = nodes_and_registration_.back();
   TfLiteNode& node = node_and_reg.first;
@@ -213,6 +221,7 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
   }
   node.builtin_data = builtin_data_deleter.release();
   node_and_reg.second = *registration;
+  execution_plan_.push_back(new_node_index);
   return kTfLiteOk;
 }
 
@@ -240,16 +249,19 @@ bool HasDynamicTensor(const TfLiteContext& context,
   return false;
 }
 
-TfLiteStatus Interpreter::PrepareOpsStartingAt(int first_node,
-                                               int* last_node_prepared) {
-  for (int i = first_node; i < nodes_and_registration_.size(); i++) {
-    TfLiteNode& node = nodes_and_registration_[i].first;
-    const TfLiteRegistration& registration = nodes_and_registration_[i].second;
+TfLiteStatus Interpreter::PrepareOpsStartingAt(
+    int first_execution_plan_index, int* last_execution_plan_index_prepared) {
+  for (int execution_plan_index = first_execution_plan_index;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
     if (OpPrepare(registration, &node) == kTfLiteError) {
       return kTfLiteError;
     }
 
-    *last_node_prepared = i;
+    *last_execution_plan_index_prepared = execution_plan_index;
 
     // Discontinue if the node has dynamic outputs. Note that we don't
     // stop for dynamic temporary tensors since they won't affect the
@@ -268,14 +280,14 @@ TfLiteStatus Interpreter::PrepareOpsAndTensors() {
     memory_planner_->PlanAllocations();
   }
 
-  int last_node_prepared = 0;
+  int last_exec_plan_index_prepared = 0;
 
-  TF_LITE_ENSURE_STATUS(
-      PrepareOpsStartingAt(next_node_to_prepare_, &last_node_prepared));
+  TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt(
+      next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared));
   TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations(
-      next_node_to_prepare_, last_node_prepared));
+      next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared));
 
-  next_node_to_prepare_ = last_node_prepared + 1;
+  next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1;
   return kTfLiteOk;
 }
 
@@ -292,7 +304,7 @@ TfLiteStatus Interpreter::Invoke() {
   TfLiteStatus status = kTfLiteOk;
   if (nnapi_delegate_) {
     TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
-    if (next_node_to_prepare_ == nodes_and_registration_.size()) {
+    if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) {
       TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this));
       return kTfLiteOk;
     } else {
@@ -312,13 +324,17 @@ TfLiteStatus Interpreter::Invoke() {
   // TODO(b/71913981): we should force recalculation in the presence of dynamic
   // tensors, because they may have new value which in turn may affect shapes
   // and allocations.
-  for (int i = 0; i < nodes_and_registration_.size(); i++) {
-    if (i == next_node_to_prepare_) {
+  for (int execution_plan_index = 0;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    if (execution_plan_index == next_execution_plan_index_to_prepare_) {
       TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
-      TF_LITE_ENSURE(&context_, next_node_to_prepare_ >= i);
+      TF_LITE_ENSURE(&context_, next_execution_plan_index_to_prepare_ >=
+                                    execution_plan_index);
     }
-    TfLiteNode& node = nodes_and_registration_[i].first;
-    const TfLiteRegistration& registration = nodes_and_registration_[i].second;
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
     if (OpInvoke(registration, &node) == kTfLiteError) {
       status = kTfLiteError;
     }
@@ -422,6 +438,14 @@ TfLiteStatus Interpreter::SetTensorParametersReadWrite(
   return kTfLiteOk;
 }
 
+TfLiteStatus Interpreter::SetExecutionPlan(const std::vector<int>& new_plan) {
+  for (int node_index : new_plan) {
+    TF_LITE_ENSURE(&context_, node_index >= 0 && node_index < nodes_size());
+  }
+  execution_plan_ = new_plan;
+  return kTfLiteOk;
+}
+
 TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor,
                                            TfLiteIntArray* new_size) {
   // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too.
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 52e52df1b6..c822557d02 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -166,6 +166,13 @@ class Interpreter {
   // Return the number of ops in the model.
   int nodes_size() const { return nodes_and_registration_.size(); }
 
+  // WARNING: Experimental interface, subject to change
+  const std::vector<int>& execution_plan() const { return execution_plan_; }
+
+  // WARNING: Experimental interface, subject to change
+  // Overrides execution plan. This bounds checks indices sent in.
+  TfLiteStatus SetExecutionPlan(const std::vector<int>& new_plan);
+
   // Get a tensor data structure.
   // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
   // read/write access to structure
@@ -279,7 +286,8 @@ class Interpreter {
   // dynamic tensors is found or all ops have been prepared. Fill
   // 'last_node_prepared' with the id of the op containing dynamic tensors, or
   // the last in the graph.
-  TfLiteStatus PrepareOpsStartingAt(int first_node, int* last_node_prepared);
+  TfLiteStatus PrepareOpsStartingAt(int first_execution_plan_index,
+                                    int* last_execution_plan_index_prepared);
 
   // Tensors needed by the interpreter. Use `AddTensors` to add more blank
   // tensor entries. Note, `tensors_.data()` needs to be synchronized to the
@@ -354,7 +362,14 @@ class Interpreter {
   // node id, and execute the node to generate the output tensor before continue
   // to allocate successors. This process repeats until all nodes are executed.
   // NOTE: this relies on the order of nodes that is in topological order.
-  int next_node_to_prepare_;
+  int next_execution_plan_index_to_prepare_;
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // This is a list of node indices (to index into nodes_and_registration).
+  // This represents a valid topological sort (dependency ordered) execution
+  // plan. In particular, it is valid for this ordering to contain only a
+  // subset of the node indices.
+  std::vector<int> execution_plan_;
 
   // Whether to delegate to NN API
   std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index edff210943..2ab4bb6567 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -514,6 +514,133 @@ TEST(BasicInterpreter, TestCustomErrorReporter) {
   ASSERT_EQ(reporter.calls, 1);
 }
 
+// Test fixture that allows playing with execution plans. It creates a two
+// node graph that can be executed in either [0,1] order or [1,0] order.
+// The CopyOp records when it is invoked in the class member run_order_
+// so we can test whether the execution plan was honored.
+class TestExecutionPlan : public ::testing::Test {
+  // Encapsulates the node ids and provides them to a C primitive data type
+  // Allocatable with placement new, but never destructed, so make sure this
+  // doesn't own any heap allocated data. This is then is used as op local
+  // data to allow access to the test fixture data.
+  class CallReporting {
+   public:
+    CallReporting(int node_id, std::vector<int>* run_order)
+        : node_id_(node_id), run_order_(run_order) {}
+
+    void Record() { run_order_->push_back(node_id_); }
+
+   private:
+    // The node id for this particular node
+    int node_id_;
+    // A pointer to the global run-order
+    std::vector<int>* run_order_;
+  };
+
+  // Build a kernel registration for an op that copies its one input
+  // to an output
+  TfLiteRegistration CopyOpRegistration() {
+    TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+    reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+      // Set output size to input size
+      TfLiteTensor* tensor0 = &context->tensors[node->inputs->data[0]];
+      TfLiteTensor* tensor1 = &context->tensors[node->outputs->data[0]];
+      TfLiteIntArray* newSize = TfLiteIntArrayCopy(tensor0->dims);
+      return context->ResizeTensor(context, tensor1, newSize);
+    };
+
+    reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+      CallReporting* call_reporting =
+          reinterpret_cast<CallReporting*>(node->builtin_data);
+      // Copy input data to output data.
+      TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]];
+      TfLiteTensor* a1 = &context->tensors[node->outputs->data[0]];
+      int num = a0->dims->data[0];
+      for (int i = 0; i < num; i++) {
+        a1->data.f[i] = a0->data.f[i];
+      }
+      call_reporting->Record();
+      return kTfLiteOk;
+    };
+    return reg;
+  }
+
+  // Adds a copy node going from tensor `input` to output tensor `output`.
+  // Note, input is used as the node_id. Inject run_order as op accessible
+  // data. Note: this is a little strange of a way to do this, but it is
+  // using op functionality to avoid static global variables.
+  void MakeCopyNode(int input, int output) {
+    // Ownership of call_reporting is taken by interpreter (malloc is used due
+    // to nodes being a C99 interface so free() is used).
+    TfLiteRegistration copy_op = CopyOpRegistration();
+    CallReporting* call_reporting_1 =
+        reinterpret_cast<CallReporting*>(malloc(sizeof(CallReporting)));
+    new (call_reporting_1) CallReporting(input, &run_order_);
+    ASSERT_EQ(interpreter_.AddNodeWithParameters(
+                  {0}, {2}, nullptr, 0,
+                  reinterpret_cast<void*>(call_reporting_1), &copy_op),
+              kTfLiteOk);
+    ASSERT_EQ(interpreter_.ResizeInputTensor(input, {3}), kTfLiteOk);
+  }
+
+  void SetUp() final {
+    // Add two inputs and two outputs that don't depend on each other
+    ASSERT_EQ(interpreter_.AddTensors(4), kTfLiteOk);
+    interpreter_.SetInputs({0, 1});
+    interpreter_.SetOutputs({2, 3});
+    TfLiteQuantizationParams quantized;
+    for (int tensor_index = 0; tensor_index < 4; tensor_index++) {
+      ASSERT_EQ(interpreter_.SetTensorParametersReadWrite(
+                    tensor_index, kTfLiteFloat32, "", {3}, quantized),
+                kTfLiteOk);
+    }
+
+    // Define two copy functions that also use the user_data to report that
+    // they were called.
+    // i.e. tensor[2] = copy(tensor[0]); tensor[3] = copy(tensor[1]);
+    // thus we can reorder the two nodes arbitrary and still satisfy dependency
+    // order.
+    MakeCopyNode(0, 2);
+    MakeCopyNode(1, 3);
+
+    ASSERT_EQ(interpreter_.AllocateTensors(), kTfLiteOk);
+  }
+
+ protected:
+  Interpreter interpreter_;
+
+  // list of node_ids that were run
+  std::vector<int> run_order_;
+};
+
+TEST_F(TestExecutionPlan, DefaultExecutionPlan) {
+  // Check default order
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>({0, 1}));
+}
+
+TEST_F(TestExecutionPlan, ReversedExecutionPlan) {
+  // Check reversed order
+  interpreter_.SetExecutionPlan({1, 0});
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>({1, 0}));
+}
+
+TEST_F(TestExecutionPlan, SubsetExecutionPlan) {
+  // Check running only node index 1
+  interpreter_.SetExecutionPlan({1});
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>({1}));
+}
+
+TEST_F(TestExecutionPlan, NullExecutionPlan) {
+  // Check nothing executed.
+  interpreter_.SetExecutionPlan({});
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>());
+}
+
 }  // namespace
 }  // namespace tflite
 
-- 
GitLab


From c21282004f535edd7d1d56e2c72a17ca0880bcaf Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 1 Feb 2018 17:19:32 -0800
Subject: [PATCH 554/966] Skip unknown devices since we can't optimize for them

PiperOrigin-RevId: 184220515
---
 tensorflow/core/common_runtime/graph_execution_state.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 3b309e915c..33a5d60eb7 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -340,8 +340,11 @@ Status GraphExecutionState::OptimizeGraph(
     std::unordered_map<string, DeviceProperties> device_map;
     Device* cpu_device = nullptr;
     for (const auto& device : device_set_->devices()) {
-      device_map[device->name()] =
-          grappler::GetDeviceInfo(device->parsed_name());
+      DeviceProperties props = grappler::GetDeviceInfo(device->parsed_name());
+      if (props.type() == "UNKNOWN") {
+        continue;
+      }
+      device_map[device->name()] = props;
       if (device->parsed_name().id == 0 &&
           StringPiece(device->parsed_name().type) == "CPU" &&
           device->GetAllocator(AllocatorAttributes()) != nullptr) {
-- 
GitLab


From ff81ca3d1303ec3ad178113a3398f8f1cac0304d Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Thu, 1 Feb 2018 17:20:11 -0800
Subject: [PATCH 555/966] Fix tests

PiperOrigin-RevId: 184220615
---
 tensorflow/core/platform/cloud/gcs_throttle_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/platform/cloud/gcs_throttle_test.cc b/tensorflow/core/platform/cloud/gcs_throttle_test.cc
index a1e8167c27..694756022e 100644
--- a/tensorflow/core/platform/cloud/gcs_throttle_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_throttle_test.cc
@@ -68,7 +68,7 @@ TEST_F(GcsThrottleTest, RejectRequest) {
 TEST_F(GcsThrottleTest, MarkResponses) {
   time_.AdvanceSeconds(1);
   EXPECT_TRUE(throttle_.AdmitRequest());
-  throttle_.RecordResponse(32000000);  // 32 MB response
+  throttle_.RecordResponse(128000000);  // 128 MB response
   EXPECT_EQ(-25100, throttle_.available_tokens());
   EXPECT_FALSE(throttle_.AdmitRequest());
   time_.AdvanceSeconds(1);
-- 
GitLab


From ae805bd8ca3bb3e385b145ca8439e4150f4aae51 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 1 Feb 2018 10:57:29 -0800
Subject: [PATCH 556/966] Fix sanity.

---
 tensorflow/tools/ci_build/ci_sanity.sh    |  5 ++---
 tensorflow/tools/pip_package/BUILD        | 14 ++++++++++++++
 third_party/flatbuffers/flatbuffers.BUILD |  2 ++
 third_party/pcre.BUILD                    |  2 +-
 third_party/termcolor.BUILD               |  2 +-
 5 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 6e4b821463..c1612201a3 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -523,9 +523,8 @@ do_check_file_name_test() {
 }
 
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity" "do_check_file_name_test")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency" "Check file names for cases")
-
+SANITY_STEPS=("do_pip_package_licenses_check")
+SANITY_STEPS_DESC=("hihi")
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS="--config=hdfs --config=gcp"
 
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index e4fa6694d8..6381b9144f 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -88,13 +88,23 @@ filegroup(
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
+        "@absl_py//absl/flags:LICENSE",
+        "@arm_neon_2_x86_sse//:LICENSE",
+        "@astor_archive//:LICENSE",
+        "@aws//:LICENSE",
+        "@bazel_tools//third_party/def_parser:Copyright.txt",
+        "@bazel_tools//third_party/ijar:LICENSE",
+        "@bazel_tools//third_party/zlib:LICENSE.txt",
         "@boringssl//:LICENSE",
+        "@com_google_absl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
+        "@flatbuffers//:LICENSE.txt",
+        "@gast_archive//:LICENSE",
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@grpc//:LICENSE",
@@ -105,11 +115,15 @@ filegroup(
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@grpc//third_party/nanopb:LICENSE.txt",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
+        "@pcre//:LICENCE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
         "@six_archive//:LICENSE",
         "@snappy//:COPYING",
+        "@swig//:LICENSE",
+        "@termcolor_archive//:COPYING.txt",
         "@zlib_archive//:zlib.h",
         "@org_python_pypi_backports_weakref//:LICENSE",
     ] + if_mkl([
diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/flatbuffers.BUILD
index f6b8e6ddb0..824c97be60 100644
--- a/third_party/flatbuffers/flatbuffers.BUILD
+++ b/third_party/flatbuffers/flatbuffers.BUILD
@@ -4,6 +4,8 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+exports_files(["LICENSE.txt"])
+
 config_setting(
     name = "freebsd",
     values = {"cpu": "freebsd"},
diff --git a/third_party/pcre.BUILD b/third_party/pcre.BUILD
index e2cdec4029..3a8e7a10b4 100644
--- a/third_party/pcre.BUILD
+++ b/third_party/pcre.BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # BSD
 
-exports_files(["COPYING"])
+exports_files(["LICENCE"])
 
 cc_library(
     name = "pcre",
diff --git a/third_party/termcolor.BUILD b/third_party/termcolor.BUILD
index 6000e3289d..655d7cb85e 100644
--- a/third_party/termcolor.BUILD
+++ b/third_party/termcolor.BUILD
@@ -3,7 +3,7 @@
 
 licenses(["notice"])  # MIT
 
-exports_files(["LICENSE"])
+exports_files(["COPYING.txt"])
 
 py_library(
     name = "termcolor",
-- 
GitLab


From d6c449fc8dbaa2a69107d74ef98b9850c799ab6e Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 1 Feb 2018 10:59:16 -0800
Subject: [PATCH 557/966] Add back other sanity tests.

---
 tensorflow/tools/ci_build/ci_sanity.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index c1612201a3..f0d38db915 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -523,8 +523,8 @@ do_check_file_name_test() {
 }
 
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pip_package_licenses_check")
-SANITY_STEPS_DESC=("hihi")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity" "do_check_file_name_test")    +SANITY_STEPS=("do_pip_package_licenses_check")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency" "Check file names for cases")
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS="--config=hdfs --config=gcp"
 
-- 
GitLab


From 33d88a86ab0c97bcd1fe936057ea7a2dccec3e97 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 1 Feb 2018 11:00:14 -0800
Subject: [PATCH 558/966] Add sanity test back.

---
 tensorflow/tools/ci_build/ci_sanity.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index f0d38db915..035b2d2546 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -523,7 +523,7 @@ do_check_file_name_test() {
 }
 
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity" "do_check_file_name_test")    +SANITY_STEPS=("do_pip_package_licenses_check")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity" "do_check_file_name_test")
 SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency" "Check file names for cases")
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS="--config=hdfs --config=gcp"
-- 
GitLab


From 8a5fc0e5ddcf0e19b14e78ba9f9c8dc783da4207 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 1 Feb 2018 13:23:12 -0800
Subject: [PATCH 559/966] Fix build error.

---
 tensorflow/tools/pip_package/BUILD |  5 +----
 tensorflow/workspace.bzl           |  1 +
 third_party/com_google_absl.BUILD  | 17 +++++++++++++++++
 third_party/gast.BUILD             |  2 +-
 4 files changed, 20 insertions(+), 5 deletions(-)
 create mode 100644 third_party/com_google_absl.BUILD

diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 6381b9144f..a9c4a8de42 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -92,9 +92,6 @@ filegroup(
         "@arm_neon_2_x86_sse//:LICENSE",
         "@astor_archive//:LICENSE",
         "@aws//:LICENSE",
-        "@bazel_tools//third_party/def_parser:Copyright.txt",
-        "@bazel_tools//third_party/ijar:LICENSE",
-        "@bazel_tools//third_party/zlib:LICENSE.txt",
         "@boringssl//:LICENSE",
         "@com_google_absl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
@@ -104,7 +101,7 @@ filegroup(
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
         "@flatbuffers//:LICENSE.txt",
-        "@gast_archive//:LICENSE",
+        "@gast_archive//:PKG-INFO",
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@grpc//:LICENSE",
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index f965bd696f..b6bba78401 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -114,6 +114,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
      sha256 = "5996380e3e8b981f55d1c8d58e709c00dbb4806ba367be75d0925a68cc2f6478",
      strip_prefix = "abseil-cpp-720c017e30339fd1786ce4aac68bc8559736e53f",
+     build_file = str(Label("//third_party:com_google_absl.BUILD")),
   )
 
   tf_http_archive(
diff --git a/third_party/com_google_absl.BUILD b/third_party/com_google_absl.BUILD
new file mode 100644
index 0000000000..0c8d327c1f
--- /dev/null
+++ b/third_party/com_google_absl.BUILD
@@ -0,0 +1,17 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/third_party/gast.BUILD b/third_party/gast.BUILD
index 06db528ada..4866982e1f 100644
--- a/third_party/gast.BUILD
+++ b/third_party/gast.BUILD
@@ -3,7 +3,7 @@
 
 licenses(["notice"])  # BSD 3-clause
 
-exports_files(["LICENSE"])
+exports_files(["PKG-INFO"])
 
 py_library(
     name = "gast",
-- 
GitLab


From 4b137b3510b0cdd266e2a87138ca7c16daac73ab Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 1 Feb 2018 15:45:39 -0800
Subject: [PATCH 560/966] Add blacklist.

---
 tensorflow/tools/ci_build/ci_sanity.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 035b2d2546..87ca159062 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -343,6 +343,13 @@ do_external_licenses_check(){
   comm -2 -3 ${EXTERNAL_DEPENDENCIES_FILE}  ${LICENSES_FILE} 2>&1 | tee ${MISSING_LICENSES_FILE}
 
   EXTERNAL_LICENSES_CHECK_END_TIME=$(date +'%s')
+  
+  
+  # Things okay to miss
+  echo ${MISSING_LICENSES_FILE}
+  grep -e "@bazel_tools//third_party/" -e "@com_google_absl//absl" -e "@org_tensorflow//" -v ${MISSING_LICENSES_FILE} > temp.txt
+  mv temp.txt ${MISSING_LICENSES_FILE}
+
 
   echo
   echo "do_external_licenses_check took $((EXTERNAL_LICENSES_CHECK_END_TIME - EXTERNAL_LICENSES_CHECK_START_TIME)) s"
@@ -525,6 +532,7 @@ do_check_file_name_test() {
 # Supply all sanity step commands and descriptions
 SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity" "do_check_file_name_test")
 SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency" "Check file names for cases")
+
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS="--config=hdfs --config=gcp"
 
-- 
GitLab


From 73019bc43d81c781b591407f97f409b8570c6115 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Thu, 1 Feb 2018 16:27:49 -0800
Subject: [PATCH 561/966] Add whitelist.

---
 tensorflow/tools/ci_build/ci_sanity.sh | 11 ++++++++---
 tensorflow/tools/lib_package/BUILD     |  4 ++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 87ca159062..b3a8ff2ac7 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -343,13 +343,18 @@ do_external_licenses_check(){
   comm -2 -3 ${EXTERNAL_DEPENDENCIES_FILE}  ${LICENSES_FILE} 2>&1 | tee ${MISSING_LICENSES_FILE}
 
   EXTERNAL_LICENSES_CHECK_END_TIME=$(date +'%s')
-  
-  
-  # Things okay to miss
+
+  # Blacklist
   echo ${MISSING_LICENSES_FILE}
   grep -e "@bazel_tools//third_party/" -e "@com_google_absl//absl" -e "@org_tensorflow//" -v ${MISSING_LICENSES_FILE} > temp.txt
   mv temp.txt ${MISSING_LICENSES_FILE}
 
+  # Whitelist
+  echo ${EXTRA_LICENSE_FILE}
+  grep -e "@bazel_tools//src/" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -v ${EXTRA_LICENSES_FILE} > temp.txt
+  mv temp.txt ${EXTRA_LICENSES_FILE}
+
+
 
   echo
   echo "do_external_licenses_check took $((EXTERNAL_LICENSES_CHECK_END_TIME - EXTERNAL_LICENSES_CHECK_START_TIME)) s"
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index dbc81599de..7717d8d7de 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -99,6 +99,7 @@ genrule(
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
+        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
@@ -114,6 +115,7 @@ genrule(
         "@libxsmm_archive//:LICENSE",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
@@ -134,6 +136,7 @@ genrule(
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
+        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
@@ -149,6 +152,7 @@ genrule(
         "@libxsmm_archive//:LICENSE",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
-- 
GitLab


From b0d432dcc8c52177ebbaeb3a74d488f0af702f21 Mon Sep 17 00:00:00 2001
From: lissyx <lissyx+github@lissyx.dyndns.org>
Date: Fri, 2 Feb 2018 03:03:31 +0100
Subject: [PATCH 562/966] Force sorting of CUDA and Python headers to avoid
 spurious rebuilds (#16586)

If one does try to re-use Bazel cache of a TensorFlow CUDA-enabled or
Python-enabled build, then it might happen that readdir() syscall behind
the use of find in _read_dir() will generate a different ordering of the
very same list of headers. This will make new genrules for symlinking
the CUDA headers and in the end it will result in different actionKey
computed by Bazel, hence invalidating the action cache.

Fixes #16585
---
 third_party/gpus/cuda_configure.bzl | 2 +-
 third_party/py/python_configure.bzl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 8e1dd8a54f..255ae01190 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -826,7 +826,7 @@ def symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
   if src_dir != None:
     src_dir = _norm_path(src_dir)
     dest_dir = _norm_path(dest_dir)
-    files = _read_dir(repository_ctx, src_dir)
+    files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
     # Create a list with the src_dir stripped to use for outputs.
     dest_files = files.replace(src_dir, '').splitlines()
     src_files = files.splitlines()
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index c16eb3a12a..954f21f5f8 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -118,7 +118,7 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
   if src_dir != None:
     src_dir = _norm_path(src_dir)
     dest_dir = _norm_path(dest_dir)
-    files = _read_dir(repository_ctx, src_dir)
+    files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
     # Create a list with the src_dir stripped to use for outputs.
     dest_files = files.replace(src_dir, '').splitlines()
     src_files = files.splitlines()
-- 
GitLab


From 67cf0604df88b260b6adf3393e9481d1e31eb0f0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Thu, 1 Feb 2018 18:05:15 -0800
Subject: [PATCH 563/966] Fixed the description of the fake GPU device to avoid
 a division by 0

PiperOrigin-RevId: 184225409
---
 tensorflow/core/grappler/costs/op_level_cost_estimator.cc | 4 ++++
 tensorflow/python/grappler/layout_optimizer_test.py       | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index cf317374cf..5600267f6a 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -353,6 +353,9 @@ OpLevelCostEstimator::DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
   VLOG(1) << "Device: " << device.type() << " gflops: " << gflops
           << " gb_per_sec: " << gb_per_sec;
 
+  DCHECK_LT(0, gflops) << device.DebugString();
+  DCHECK_LT(0, gb_per_sec) << device.DebugString();
+
   return {gflops, gb_per_sec};
 }
 
@@ -408,6 +411,7 @@ Costs OpLevelCostEstimator::PredictCostOfAnUnknownOp(
 Costs OpLevelCostEstimator::PredictOpCountBasedCost(
     double operations, const OpInfo& op_features) const {
   DeviceInfo device_perf = GetDeviceInfo(op_features.device());
+
   Costs::NanoSeconds compute_cost(std::ceil(operations / device_perf.gigaops));
   VLOG(1) << "Op:" << op_features.op() << " GOps:" << operations / 1e9
           << " Execution Time (ns):" << compute_cost.count();
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 578f86ca5a..332ee725f0 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -179,6 +179,8 @@ def _get_cluster():
   named_device = device_properties_pb2.NamedDevice()
   named_device.name = '/GPU:0'
   named_device.properties.type = 'GPU'
+  named_device.properties.num_cores = 24
+  named_device.properties.frequency = 1000
   named_device.properties.environment['architecture'] = '4'
   cluster = gcluster.Cluster(devices=[named_device])
   return cluster
-- 
GitLab


From 930b347273c1b9ac32b7c96a8d35e92704c4e8b7 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Thu, 1 Feb 2018 18:27:31 -0800
Subject: [PATCH 564/966] Add darwin_x86_64 config in TF Lite BUILD file.

PiperOrigin-RevId: 184227786
---
 tensorflow/contrib/lite/kernels/internal/BUILD | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 21118fc96d..de635cfb4a 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -124,6 +124,13 @@ config_setting(
     },
 )
 
+config_setting(
+    name = "darwin_x86_64",
+    values = {
+        "cpu": "darwin_x86_64",
+    },
+)
+
 config_setting(
     name = "freebsd",
     values = {
@@ -154,6 +161,7 @@ cc_library(
         ":x86": tflite_deps_intel,
         ":x86_64": tflite_deps_intel,
         ":darwin": tflite_deps_intel,
+        ":darwin_x86_64": tflite_deps_intel,
         ":freebsd": tflite_deps_intel,
         "//conditions:default": [],
     }),
@@ -232,6 +240,7 @@ cc_library(
         ":x86": tflite_deps_intel,
         ":x86_64": tflite_deps_intel,
         ":darwin": tflite_deps_intel,
+        ":darwin_x86_64": tflite_deps_intel,
         ":freebsd": tflite_deps_intel,
         "//conditions:default": [],
     }),
-- 
GitLab


From c39d141948174b94213848d9b95541ee09af5e53 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 19:50:34 -0800
Subject: [PATCH 565/966] Supporting new saving op structure.

PiperOrigin-RevId: 184233513
---
 .../tools/graph_transforms/sparsify_gather.cc | 53 +++++++++++-----
 .../graph_transforms/sparsify_gather_test.cc  | 63 +++++++++----------
 2 files changed, 68 insertions(+), 48 deletions(-)

diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 9c583d83ca..214ec721e2 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -86,8 +86,17 @@ void CreateConstNode(const Tensor& tensor, const string& name,
   SetNodeTensorAttr<float>("value", tensor, node_def);
 }
 
+string GetMonolithicTensorKey(const string& tensor_slice_name) {
+  std::vector<string> names = Split(tensor_slice_name, "/");
+  if (StringPiece(names[names.size() - 1]).starts_with("part_")) {
+    CHECK_GE(names.size(), 2);
+    names.pop_back();
+  }
+  return Join(names, "/");
+}
+
 Status ObtainTensorSlice(const GraphDef& input_graph_def,
-                         const string& tensor_name,
+                         const string& target_name,
                          string* shape_slice_string) {
   string restore_node_name;
   for (const auto& node : input_graph_def.node()) {
@@ -95,39 +104,53 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
     if (node_name_parts.size() == 2 &&
         StringPiece(node_name_parts[0]).starts_with("save") &&
         StringPiece(node_name_parts[1]).starts_with("Assign") &&
-        node.input(0) == tensor_name) {
+        node.input(0) == target_name) {
       restore_node_name = node.input(1);
       break;
     }
   }
+
+  std::vector<string> restore_node_parts = Split(restore_node_name, ":");
+  CHECK_LE(restore_node_parts.size(), 2);
+  string tensor_names_node;
   string shape_and_slices_node;
   for (const auto& node : input_graph_def.node()) {
-    if ((node.name() == restore_node_name) && (node.op() == "RestoreV2")) {
+    if ((node.name() == restore_node_parts[0]) && (node.op() == "RestoreV2")) {
+      tensor_names_node = node.input(1);
       shape_and_slices_node = node.input(2);
       break;
     }
   }
+
+  int offset = -1;
+  for (const auto& node : input_graph_def.node()) {
+    if (node.name() == tensor_names_node) {
+      Tensor tensor_names_tensor;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node, "value", &tensor_names_tensor));
+      const auto& tensor_names_value = tensor_names_tensor.flat<string>();
+      for (int i = 0; i < tensor_names_value.size(); i++) {
+        if (tensor_names_value(i) == GetMonolithicTensorKey(target_name)) {
+          offset = i;
+          break;
+        }
+      }
+    }
+  }
+  if (offset == -1) {
+    return errors::Internal("Unable to find RestoreV2 entry for variable: ",
+                            target_name);
+  }
   for (const auto& node : input_graph_def.node()) {
     if (node.name() == shape_and_slices_node) {
       Tensor shape_and_slices_tensor;
       TF_RETURN_IF_ERROR(GetNodeAttr(node, "value", &shape_and_slices_tensor));
       const auto& shape_and_slices_value =
           shape_and_slices_tensor.flat<string>();
-      *shape_slice_string = shape_and_slices_value(0);
+      *shape_slice_string = shape_and_slices_value(offset);
       return Status::OK();
     }
   }
-  return errors::Internal("Unable to find slice for variable: ", tensor_name);
-}
-
-string GetMonolithicTensorKey(const string& tensor_slice_name) {
-  std::vector<string> names = Split(tensor_slice_name, "/");
-  CHECK_GE(names.size(), 2);
-  CHECK(StringPiece(names[names.size() - 1]).starts_with("part_"));
-
-  // Remove the "part_x" suffix
-  names.pop_back();
-  return Join(names, "/");
+  return errors::Internal("Unable to find slice for variable: ", target_name);
 }
 
 Status ReadTensorFromCheckpoint(
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
index 203ed3e0f9..d41321c9a6 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
@@ -106,11 +106,15 @@ class SparsifyGatherTest : public ::testing::Test {
       NodeDef* save_const_node =
           CreateNode("save/Const", "Const", {}, &graph_def);
 
+      Tensor tensor_names_values(DT_STRING, TensorShape({1}));
+      test::FillValues<string>(&tensor_names_values, {"w"});
       NodeDef* tensor_names_node =
           CreateNode("save/RestoreV2/tensor_names", "Const", {}, &graph_def);
+      SetNodeTensorAttr<string>("value", tensor_names_values,
+                                tensor_names_node);
+
       NodeDef* tensor_shapes_slices_node = CreateNode(
           "save/RestoreV2/shape_and_slices", "Const", {}, &graph_def);
-
       Tensor shapes_slices_val(DT_STRING, TensorShape({1}));
       shapes_slices_val.flat<string>()(0) = "4 1 0,4:0,1";
       SetNodeTensorAttr<string>("value", shapes_slices_val,
@@ -310,6 +314,29 @@ class SparsifyGatherTest : public ::testing::Test {
       SetNodeTensorAttr<float>("value", weights, w_node1);
       SetNodeTensorAttr<float>("value", weights, w_node2);
     } else {
+      NodeDef* save_const_node =
+          CreateNode("save/Const", "Const", {}, &graph_def);
+
+      NodeDef* tensor_names_node =
+          CreateNode("save/RestoreV2/tensor_names", "Const", {}, &graph_def);
+      Tensor tensor_names_values(DT_STRING, TensorShape({2}));
+      test::FillValues<string>(&tensor_names_values, {"w1", "w2"});
+      SetNodeTensorAttr<string>("value", tensor_names_values,
+                                tensor_names_node);
+
+      NodeDef* tensor_shapes_slices_node = CreateNode(
+          "save/RestoreV2/shape_and_slices", "Const", {}, &graph_def);
+      Tensor shapes_slices_val(DT_STRING, TensorShape({2}));
+      shapes_slices_val.flat<string>()(0) = "4 1 0,4:0,1";
+      shapes_slices_val.flat<string>()(1) = "4 1 0,4:0,1";
+      SetNodeTensorAttr<string>("value", shapes_slices_val,
+                                tensor_shapes_slices_node);
+
+      NodeDef* restore_node = CreateNode(
+          "save/RestoreV2", "RestoreV2",
+          {save_const_node, tensor_names_node, tensor_shapes_slices_node},
+          &graph_def);
+
       w_node1 = CreateNode("w1/part_1", "VariableV2", {}, &graph_def);
 
       zeros_shape1 = CreateNode("w1/part_1/Initializer/zeros/shape_as_tensor",
@@ -321,23 +348,7 @@ class SparsifyGatherTest : public ::testing::Test {
       assign_node1 = CreateNode("w1/part_1/Assign", "Assign",
                                 {w_node1, zeros_node1}, &graph_def);
 
-      NodeDef* save_const_node =
-          CreateNode("save/Const", "Const", {}, &graph_def);
-      NodeDef* tensor_names_node1 =
-          CreateNode("save/RestoreV2/tensor_names", "Const", {}, &graph_def);
-      NodeDef* tensor_shapes_slices_node1 = CreateNode(
-          "save/RestoreV2/shape_and_slices", "Const", {}, &graph_def);
-
-      Tensor shapes_slices_val1(DT_STRING, TensorShape({1}));
-      shapes_slices_val1.flat<string>()(0) = "4 1 0,4:0,1";
-      SetNodeTensorAttr<string>("value", shapes_slices_val1,
-                                tensor_shapes_slices_node1);
-
-      NodeDef* restore_node1 = CreateNode(
-          "save/RestoreV2", "RestoreV2",
-          {save_const_node, tensor_names_node1, tensor_shapes_slices_node1},
-          &graph_def);
-      CreateNode("save/Assign", "Assign", {w_node1, restore_node1}, &graph_def);
+      CreateNode("save/Assign", "Assign", {w_node1, restore_node}, &graph_def);
 
       w_node2 = CreateNode("w2/part_1", "VariableV2", {}, &graph_def);
       zeros_shape2 = CreateNode("w2/part_1/Initializer/zeros/shape_as_tensor",
@@ -349,21 +360,7 @@ class SparsifyGatherTest : public ::testing::Test {
       assign_node2 = CreateNode("w2/part_1/Assign", "Assign",
                                 {w_node2, zeros_node2}, &graph_def);
 
-      NodeDef* tensor_names_node2 =
-          CreateNode("save/RestoreV2_1/tensor_names", "Const", {}, &graph_def);
-      NodeDef* tensor_shapes_slices_node2 = CreateNode(
-          "save/RestoreV2_1/shape_and_slices", "Const", {}, &graph_def);
-
-      Tensor shapes_slices_val2(DT_STRING, TensorShape({1}));
-      shapes_slices_val2.flat<string>()(0) = "4 1 0,4:0,1";
-      SetNodeTensorAttr<string>("value", shapes_slices_val2,
-                                tensor_shapes_slices_node2);
-
-      NodeDef* restore_node2 = CreateNode(
-          "save/RestoreV2_1", "RestoreV2",
-          {save_const_node, tensor_names_node2, tensor_shapes_slices_node2},
-          &graph_def);
-      CreateNode("save/Assign_1", "Assign", {w_node2, restore_node2},
+      CreateNode("save/Assign_1", "Assign", {w_node2, restore_node},
                  &graph_def);
 
       BundleWriter writer(Env::Default(), checkpoint_path);
-- 
GitLab


From d7fcf5a865570073569817fffafc07c8c74ec66d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 20:35:29 -0800
Subject: [PATCH 566/966] Automated g4 rollback of changelist 183874527

PiperOrigin-RevId: 184236409
---
 tensorflow/cc/saved_model/loader.cc      |  4 +++-
 tensorflow/cc/saved_model/loader_test.cc | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index acef098c7d..faa1e378d0 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -96,7 +96,9 @@ Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto,
 Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def,
                                 const SessionOptions& session_options,
                                 std::unique_ptr<Session>* session) {
-  session->reset(NewSession(session_options));
+  Session* session_p = nullptr;
+  TF_RETURN_IF_ERROR(NewSession(session_options, &session_p));
+  session->reset(session_p);
   return (*session)->Create(meta_graph_def.graph_def());
 }
 
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
index 0ad6b33bba..4c64d2cfe3 100644
--- a/tensorflow/cc/saved_model/loader_test.cc
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -155,6 +155,24 @@ TEST_F(LoaderTest, NoTagMatchMultiple) {
       << st.error_message();
 }
 
+TEST_F(LoaderTest, SessionCreationFailure) {
+  SavedModelBundle bundle;
+  // Use invalid SessionOptions to cause session creation to fail.  Default
+  // options work, so provide an invalid value for the target field.
+  SessionOptions session_options;
+  constexpr char kInvalidTarget[] = "invalid target";
+  session_options.target = kInvalidTarget;
+  RunOptions run_options;
+
+  const string export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded);
+  Status st = LoadSavedModel(session_options, run_options, export_dir,
+                             {kSavedModelTagServe}, &bundle);
+  EXPECT_FALSE(st.ok());
+  EXPECT_TRUE(StringPiece(st.error_message()).contains(kInvalidTarget))
+      << st.error_message();
+}
+
 TEST_F(LoaderTest, PbtxtFormat) {
   SavedModelBundle bundle;
   SessionOptions session_options;
-- 
GitLab


From dbe690c679138837f07f99e04e747cdcc1ce8fd2 Mon Sep 17 00:00:00 2001
From: Yifei Feng <fengyifei2026@gmail.com>
Date: Thu, 1 Feb 2018 22:51:33 -0800
Subject: [PATCH 567/966] Fix sanity build (#16674)

* Fix sanity.

* Add back other sanity tests.

* Add sanity test back.

* Fix build error.

* Add blacklist.

* Add whitelist.
-- 
GitLab


From 97aa1856bc6dc75afd65dc4ad2a5e5a48d1ea6d6 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Fri, 2 Feb 2018 10:13:41 -0800
Subject: [PATCH 568/966] Fix py3 conversion

---
 configure.py                                  |  1 -
 .../contrib/tensorrt/python/trt_convert.py    | 23 +++++++++++--------
 .../contrib/tensorrt/test/test_tftrt.py       |  1 +
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/configure.py b/configure.py
index 34de1ad2d5..94827891c3 100644
--- a/configure.py
+++ b/configure.py
@@ -1450,7 +1450,6 @@ def main():
         'more details.')
   config_info_line('mkl', 'Build with MKL support.')
   config_info_line('monolithic', 'Config for mostly static monolithic build.')
-  config_info_line('tensorrt', 'Build with TensorRT support.')
 
 if __name__ == '__main__':
   main()
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index b17b07e296..2d056610cd 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -22,13 +22,8 @@ from tensorflow.core.framework import graph_pb2
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import errors_impl as _impl
 from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert
-from tensorflow.python.util import compat
-import tensorflow as tf
-from tensorflow.python.grappler import tf_optimizer
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
-
+import six as _six
 
 # TODO(skama): get outputs from session when implemented as c++
 # optimization pass
@@ -48,13 +43,21 @@ def CreateInferenceGraph(input_graph_def,
   Returns:
     New GraphDef with TRTEngineOps placed in graph replacing subgraphs.
   """
+  def py2bytes(inp):
+    return inp
+  def py3bytes(inp):
+    return inp.encode('utf-8',errors='surrogateescape')
+  if _six.PY2:
+    to_bytes=py2bytes
+  else:
+    to_bytes=py3bytes
 
   out_names = []
   for i in outputs:
     if isinstance(i, ops.Tensor):
-      out_names.append(i.name)
+      out_names.append(to_bytes(i.name))
     else:
-      out_names.append(i)
+      out_names.append(to_bytes(i))
 
   input_graph_def_str = input_graph_def.SerializeToString()
 
@@ -63,10 +66,10 @@ def CreateInferenceGraph(input_graph_def,
   # allow us to return a status object from C++.  Thus we return a
   # pair or strings where first one is encoded status and the second
   # one is the transformed graphs protobuf string.
-  out = trt_convert(input_graph_def_str, outputs, max_batch_size,
+  out = trt_convert(input_graph_def_str, out_names, max_batch_size,
                     max_workspace_size_bytes)
   status = out[0]
-  output_graph_def_string = out[1]
+  output_graph_def_string = to_bytes(out[1])
   del input_graph_def_str  #save some memory
   if len(status) < 2:
     raise _impl.UnknownError(None, None, status)
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 32f839e624..ad7a85cf5a 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -68,3 +68,4 @@ if "__main__" in __name__:
   o1 = runGraph(gdef, dummy_input)
   o2 = runGraph(trt_graph, dummy_input)
   assert (np.array_equal(o1, o2))
+  print("Pass")
-- 
GitLab


From 3be90a490c31d5a8fad70713e059bbb3e723e664 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 1 Feb 2018 21:34:18 -0800
Subject: [PATCH 569/966] Internal change

PiperOrigin-RevId: 184239740
---
 tensorflow/compiler/xla/service/gpu/BUILD     |  54 ++-
 .../xla/service/gpu/convolution_thunk.cc      | 378 ++----------------
 .../xla/service/gpu/convolution_thunk.h       | 126 ++----
 .../gpu/cudnn_convolution_algorithm_picker.cc | 370 +++++++++++++++++
 .../gpu/cudnn_convolution_algorithm_picker.h  |  62 +++
 ...lding.cc => cudnn_convolution_rewriter.cc} | 149 ++++---
 ...folding.h => cudnn_convolution_rewriter.h} |  14 +-
 ....cc => cudnn_convolution_rewriter_test.cc} | 167 ++++----
 .../service/gpu/cudnn_convolution_runner.cc   | 221 ++++++++++
 .../service/gpu/cudnn_convolution_runner.h    |  97 +++++
 .../compiler/xla/service/gpu/gpu_compiler.cc  |  80 +++-
 .../xla/service/gpu/gpu_copy_insertion.cc     |   6 +
 .../xla/service/gpu/gpu_layout_assignment.cc  | 227 +++++------
 .../service/gpu/instruction_fusion_test.cc    |  43 --
 .../xla/service/gpu/ir_emission_utils.cc      | 108 +++--
 .../xla/service/gpu/ir_emission_utils.h       |  60 ++-
 .../compiler/xla/service/gpu/ir_emitter.h     |   3 -
 .../xla/service/gpu/ir_emitter_unnested.cc    | 120 +++---
 .../compiler/xla/service/gpu/pad_insertion.cc | 157 ++++----
 .../compiler/xla/service/hlo_computation.cc   |  21 +-
 .../compiler/xla/service/hlo_computation.h    |   9 -
 .../compiler/xla/service/hlo_cost_analysis.cc |   8 +-
 .../compiler/xla/service/hlo_instruction.cc   |  48 +--
 .../compiler/xla/service/hlo_instruction.h    |  41 +-
 24 files changed, 1554 insertions(+), 1015 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
 create mode 100644 tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
 rename tensorflow/compiler/xla/service/gpu/{convolution_folding.cc => cudnn_convolution_rewriter.cc} (83%)
 rename tensorflow/compiler/xla/service/gpu/{convolution_folding.h => cudnn_convolution_rewriter.h} (63%)
 rename tensorflow/compiler/xla/service/gpu/{convolution_folding_test.cc => cudnn_convolution_rewriter_test.cc} (82%)
 create mode 100644 tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
 create mode 100644 tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h

diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 80c2eed109..7df01f7edd 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -131,6 +131,7 @@ cc_library(
         "ir_emitter_context.h",
     ],
     deps = [
+        ":cudnn_convolution_runner",
         ":elemental_ir_emitter",
         ":gpu_constants",
         ":gpu_executable",
@@ -262,6 +263,7 @@ cc_library(
     ],
     deps = [
         ":buffer_allocations",
+        ":cudnn_convolution_runner",
         ":infeed_manager",
         ":ir_emission_utils",
         ":partition_assignment",
@@ -309,9 +311,41 @@ cc_library(
 )
 
 cc_library(
-    name = "convolution_folding",
-    srcs = ["convolution_folding.cc"],
-    hdrs = ["convolution_folding.h"],
+    name = "cudnn_convolution_algorithm_picker",
+    srcs = ["cudnn_convolution_algorithm_picker.cc"],
+    hdrs = ["cudnn_convolution_algorithm_picker.h"],
+    deps = [
+        ":cudnn_convolution_runner",
+        ":gpu_executable",
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
+cc_library(
+    name = "cudnn_convolution_runner",
+    srcs = ["cudnn_convolution_runner.cc"],
+    hdrs = ["cudnn_convolution_runner.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
+cc_library(
+    name = "cudnn_convolution_rewriter",
+    srcs = ["cudnn_convolution_rewriter.cc"],
+    hdrs = ["cudnn_convolution_rewriter.h"],
     deps = [
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:literal_util",
@@ -325,15 +359,18 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "convolution_folding_test",
-    srcs = ["convolution_folding_test.cc"],
+    name = "cudnn_convolution_rewriter_test",
+    srcs = ["cudnn_convolution_rewriter_test.cc"],
     deps = [
-        ":convolution_folding",
+        ":cudnn_convolution_rewriter",
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/service:shape_inference",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:test",
     ],
 )
@@ -446,7 +483,8 @@ cc_library(
     srcs = ["gpu_compiler.cc"],
     hdrs = ["gpu_compiler.h"],
     deps = [
-        ":convolution_folding",
+        ":cudnn_convolution_algorithm_picker",
+        ":cudnn_convolution_rewriter",
         ":fusion_merger",
         ":gpu_constants",
         ":gpu_copy_insertion",
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 899cc5c83b..f76f15929d 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -36,366 +37,69 @@ using se::dnn::DataLayout;
 using se::dnn::FilterDescriptor;
 using se::dnn::FilterLayout;
 
-ConvolveScratchAllocator::ConvolveScratchAllocator(
-    int device_ordinal, DeviceMemoryAllocator* memory_allocator)
-    : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
-
-ConvolveScratchAllocator::~ConvolveScratchAllocator() {
-  for (auto& allocated_buffer : allocated_buffers_) {
-    if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer)
-             .ok()) {
-      // The program can still continue with failed deallocation.
-      LOG(ERROR) << "Failed to deallocate the allocated buffer: "
-                 << allocated_buffer.opaque();
-    }
-  }
-}
-
-int64 ConvolveScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) {
-  constexpr int64 kConvolveScratchSize = 1LL << 32;  // 4GB by default.
-  return kConvolveScratchSize;
-}
-
-se::port::StatusOr<se::DeviceMemory<uint8>>
-ConvolveScratchAllocator::AllocateBytes(se::Stream* stream, int64 byte_size) {
-  CHECK_GE(byte_size, 0) << "byte_size must be positive.";
-  if (byte_size > GetMemoryLimitInBytes(stream)) {
-    return se::port::Status(
-        se::port::error::RESOURCE_EXHAUSTED,
-        tensorflow::strings::Printf(
-            "Allocating %lld bytes exceeds the memory limit of %lld bytes.",
-            byte_size, GetMemoryLimitInBytes(stream)));
-  }
-
-  auto status_or_memory =
-      memory_allocator_->Allocate(device_ordinal_, byte_size,
-                                  /*retry_on_failure=*/false);
-  if (!status_or_memory.ok()) {
-    return se::port::Status(se::port::error::RESOURCE_EXHAUSTED,
-                            tensorflow::strings::Printf(
-                                "Failed to allocate %lld bytes on device %d.",
-                                byte_size, device_ordinal_));
-  }
-  se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie();
-  allocated_buffers_.push_back(allocated_buffer);
-  total_allocated_bytes_ += byte_size;
-  return se::DeviceMemory<uint8>(allocated_buffer);
-}
-
-string ConvolutionKindToString(
-    ConvolutionThunk::ConvolutionKind convolution_kind) {
-  switch (convolution_kind) {
-    case ConvolutionThunk::ConvolutionKind::kForward:
-      return "forward";
-    case ConvolutionThunk::ConvolutionKind::kBackwardFilter:
-      return "backward_filter";
-    case ConvolutionThunk::ConvolutionKind::kBackwardInput:
-      return "backward_input";
-  }
-  return "unknown convolution kind";
-}
-
 ConvolutionThunk::ConvolutionThunk(
-    ConvolutionKind convolution_kind,
-    const BufferAllocation::Slice& input_buffer,
+    CudnnConvKind convolution_kind, const BufferAllocation::Slice& input_buffer,
     const BufferAllocation::Slice& filter_buffer,
-    const BufferAllocation::Slice& output_buffer, const Shape& input_shape,
+    const BufferAllocation::Slice& output_buffer,
+    const BufferAllocation::Slice& tuple_result_buffer,
+    const BufferAllocation::Slice& scratch_buffer, const Shape& input_shape,
     const Shape& filter_shape, const Shape& output_shape, const Window& window,
-    const ConvolutionDimensionNumbers& dim_nums, const HloInstruction* hlo)
+    const ConvolutionDimensionNumbers& dim_nums, int64 algorithm,
+    const HloInstruction* hlo)
     : Thunk(Kind::kConvolution, hlo),
       convolution_kind_(convolution_kind),
       input_buffer_(input_buffer),
       filter_buffer_(filter_buffer),
       output_buffer_(output_buffer),
+      tuple_result_buffer_(tuple_result_buffer),
+      scratch_buffer_(scratch_buffer),
       input_shape_(input_shape),
       filter_shape_(filter_shape),
       output_shape_(output_shape),
       window_(window),
-      dim_nums_(dim_nums) {}
+      dim_nums_(dim_nums),
+      algorithm_(algorithm) {}
 
-tensorflow::Status ConvolutionThunk::ExecuteOnStream(
+Status ConvolutionThunk::ExecuteOnStream(
     const BufferAllocations& buffer_allocations, se::Stream* stream) {
-  VLOG(3) << "Convolution kind: " << ConvolutionKindToString(convolution_kind_);
-  VLOG(3) << "input shape: { " << input_shape_.ShortDebugString() << " }";
-  VLOG(3) << "filter shape: { " << filter_shape_.ShortDebugString() << " }";
-  VLOG(3) << "Output shape: { " << output_shape_.ShortDebugString() << " }";
-  VLOG(3) << "Dim nums: { " << dim_nums_.ShortDebugString() << " }";
-  VLOG(3) << "Window: { " << window_.ShortDebugString() << " }";
-
-  const int num_dimensions = window_.dimensions_size();
-  CHECK_LE(num_dimensions, 3);
-  // cuDNN does not support 1D convolutions. We therefore express 1D
-  // convolutions as 2D convolutions where the first spatial dimension is 1.
-  // This matches the behavior of TF (see definition of conv1d in
-  // tensorflow/python/ops/nn_ops.py).
-  const int effective_num_dimensions = std::max(2, num_dimensions);
-
-  CHECK_EQ(F32, output_shape_.element_type());
-  CHECK_EQ(num_dimensions, dim_nums_.input_spatial_dimensions_size());
-  CHECK_EQ(num_dimensions, dim_nums_.kernel_spatial_dimensions_size());
-  CHECK_EQ(num_dimensions, dim_nums_.output_spatial_dimensions_size());
-  for (const WindowDimension& dim : window_.dimensions()) {
-    CHECK_EQ(dim.padding_low(), dim.padding_high());
-  }
-
-  // cuDNN's convolution APIs support the BDYX layout for activations/output and
-  // the OIYX layout for weights.
-  BatchDescriptor input_descriptor(effective_num_dimensions);
-  input_descriptor.set_layout(DataLayout::kBatchDepthYX)
-      .set_feature_map_count(
-          input_shape_.dimensions(dim_nums_.input_feature_dimension()))
-      .set_count(input_shape_.dimensions(dim_nums_.input_batch_dimension()));
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    // Note that the dimensions are reversed. The same holds below.
-    input_descriptor.set_spatial_dim(
-        static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-        input_shape_.dimensions(dim_nums_.input_spatial_dimensions(dim)));
-  }
-
-  FilterDescriptor filter_descriptor(effective_num_dimensions);
-  filter_descriptor.set_layout(FilterLayout::kOutputInputYX)
-      .set_input_feature_map_count(
-          filter_shape_.dimensions(dim_nums_.kernel_input_feature_dimension()))
-      .set_output_feature_map_count(filter_shape_.dimensions(
-          dim_nums_.kernel_output_feature_dimension()));
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    filter_descriptor.set_spatial_dim(
-        static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-        filter_shape_.dimensions(dim_nums_.kernel_spatial_dimensions(dim)));
-  }
-
-  ConvolutionDescriptor convolution_descriptor(effective_num_dimensions);
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    convolution_descriptor
-        .set_zero_padding(
-            static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-            window_.dimensions(dim).padding_low())
-        .set_filter_stride(
-            static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-            window_.dimensions(dim).stride());
-  }
-
-  BatchDescriptor output_descriptor(effective_num_dimensions);
-  output_descriptor.set_layout(DataLayout::kBatchDepthYX)
-      .set_feature_map_count(
-          output_shape_.dimensions(dim_nums_.output_feature_dimension()))
-      .set_count(output_shape_.dimensions(dim_nums_.output_batch_dimension()));
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    output_descriptor.set_spatial_dim(
-        static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-        output_shape_.dimensions(dim_nums_.output_spatial_dimensions(dim)));
-  }
-
-  // Add a singleton dimension in the 1D convolution case.
-  if (num_dimensions == 1) {
-    input_descriptor.set_spatial_dim(static_cast<se::dnn::DimIndex>(0), 1);
-    output_descriptor.set_spatial_dim(static_cast<se::dnn::DimIndex>(0), 1);
-    filter_descriptor.set_spatial_dim(static_cast<se::dnn::DimIndex>(0), 1);
-    convolution_descriptor
-        .set_zero_padding(static_cast<se::dnn::DimIndex>(0), 0)
-        .set_filter_stride(static_cast<se::dnn::DimIndex>(0), 1);
-  }
-
   se::DeviceMemory<float> input_data(
       buffer_allocations.GetDeviceAddress(input_buffer_));
   se::DeviceMemory<float> filter_data(
       buffer_allocations.GetDeviceAddress(filter_buffer_));
   se::DeviceMemory<float> output_data(
       buffer_allocations.GetDeviceAddress(output_buffer_));
-  return ConvolveWithTune(input_descriptor, input_data, filter_descriptor,
-                          filter_data, output_descriptor, output_data,
-                          convolution_descriptor, buffer_allocations, stream);
-}
-
-tensorflow::Status ConvolutionThunk::Convolve(
-    const BatchDescriptor& input_descriptor, se::DeviceMemory<float> input_data,
-    const FilterDescriptor& filter_descriptor,
-    se::DeviceMemory<float> filter_data,
-    const BatchDescriptor& output_descriptor,
-    se::DeviceMemory<float> output_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const se::dnn::AlgorithmConfig& algorithm_config, se::Stream* stream,
-    ConvolveScratchAllocator* scratch_allocator,
-    se::dnn::ProfileResult* profile_result) {
-  bool launch_ok;
-  switch (convolution_kind_) {
-    case ConvolutionKind::kBackwardFilter:
-      launch_ok =
-          stream
-              ->ThenConvolveBackwardFilterWithAlgorithm(
-                  input_descriptor, input_data, output_descriptor, output_data,
-                  convolution_descriptor, filter_descriptor, &filter_data,
-                  scratch_allocator, algorithm_config, profile_result)
-              .ok();
-      break;
-    case ConvolutionKind::kBackwardInput:
-      launch_ok = stream
-                      ->ThenConvolveBackwardDataWithAlgorithm(
-                          filter_descriptor, filter_data, output_descriptor,
-                          output_data, convolution_descriptor, input_descriptor,
-                          &input_data, scratch_allocator, algorithm_config,
-                          profile_result)
-                      .ok();
-      break;
-    case ConvolutionKind::kForward:
-      launch_ok =
-          stream
-              ->ThenConvolveWithAlgorithm(
-                  input_descriptor, input_data, filter_descriptor, filter_data,
-                  convolution_descriptor, output_descriptor, &output_data,
-                  scratch_allocator, algorithm_config, profile_result)
-              .ok();
-      break;
-  }
-  if (launch_ok) {
-    return tensorflow::Status::OK();
-  }
-  return InternalError(
-      "Unable to launch convolution for thunk %p with type %s and algorithm "
-      "(%lld, %lld)",
-      this, ConvolutionKindToString(convolution_kind_).c_str(),
-      algorithm_config.algorithm().algo_id(),
-      algorithm_config.algorithm_no_scratch().algo_id());
-}
-
-std::vector<AlgorithmDesc> ConvolutionThunk::GetAlgorithms(
-    bool with_winograd_nonfused, se::StreamExecutor* stream_exec) const {
-  std::vector<AlgorithmDesc> algorithms;
-  switch (convolution_kind_) {
-    case ConvolutionKind::kBackwardFilter:
-      CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(
-          with_winograd_nonfused, &algorithms));
-      break;
-    case ConvolutionKind::kBackwardInput:
-      CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(
-          with_winograd_nonfused, &algorithms));
-      break;
-    case ConvolutionKind::kForward:
-      CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused,
-                                               &algorithms));
-      break;
-  }
-  return algorithms;
-}
-
-static string AlgorithmToString(const se::dnn::AlgorithmDesc& algo) {
-  if (algo.tensor_ops_enabled()) {
-    return tensorflow::strings::StrCat(algo.algo_id(), "+TC");
-  }
-  return tensorflow::strings::StrCat(algo.algo_id());
-}
-
-// Determines whether we can safely perform a winograd non-fused convolution for
-// the given input and output descriptors.  This works around b/68264959, an
-// integer overflow in cuDNNv5 and cuDNNv6.
-static bool ShouldIncludeWinogradNonfusedAlgo(
-    const BatchDescriptor& input_descriptor,
-    const BatchDescriptor& output_descriptor) {
-  int64 batch = input_descriptor.count();
-  int64 in_depths = input_descriptor.feature_map_count();
-  int64 in_rows = input_descriptor.height();
-  int64 in_cols = input_descriptor.width();
-  int64 out_depths = output_descriptor.feature_map_count();
-
-  int64 total_size = 16 * std::ceil(batch / 16.0) *
-                     std::max(in_depths, out_depths) * in_cols * in_rows *
-                     sizeof(float);
-  int64 threshold = 1L << 31;
-
-  return total_size < threshold;
-}
-
-tensorflow::Status ConvolutionThunk::ConvolveWithTune(
-    const BatchDescriptor& input_descriptor, se::DeviceMemory<float> input_data,
-    const FilterDescriptor& filter_descriptor,
-    se::DeviceMemory<float> filter_data,
-    const BatchDescriptor& output_descriptor,
-    se::DeviceMemory<float> output_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const BufferAllocations& buffer_allocations, se::Stream* stream) {
-  // TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out.
-  if (!best_algorithm_.has_value()) {
-    best_algorithm_.emplace();
-
-    // Auto-tuning either is disabled or only happens in the first run of this
-    // function.
-    VLOG(2) << "Profiling for best convolution algorithm used for "
-               "ConvolutionThunk: "
-            << this;
-
-    bool with_winograd_nonfused =
-        ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor);
-
-    se::dnn::ProfileResult best_result;
-    se::dnn::ProfileResult best_result_without_scratch;
-    std::vector<AlgorithmDesc> algorithms =
-        GetAlgorithms(with_winograd_nonfused, stream->parent());
-    for (auto algorithm : algorithms) {
-      ConvolveScratchAllocator scratch_allocator(
-          buffer_allocations.device_ordinal(),
-          buffer_allocations.memory_allocator());
-      se::dnn::ProfileResult profile_result;
-      VLOG(3) << "Trying algorithm " << AlgorithmToString(algorithm)
-              << " for ConvolutionThunk: " << this;
-      bool launch_ok =
-          Convolve(input_descriptor, input_data, filter_descriptor, filter_data,
-                   output_descriptor, output_data, convolution_descriptor,
-                   se::dnn::AlgorithmConfig(algorithm, algorithm), stream,
-                   &scratch_allocator, &profile_result)
-              .ok();
-      if (launch_ok && profile_result.is_valid()) {
-        VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm)
-                << " for ConvolutionThunk " << this << " succeeded, taking "
-                << profile_result.elapsed_time_in_ms()
-                << "ms. (Best result: " << best_result.elapsed_time_in_ms()
-                << "ms)";
-        if (profile_result.elapsed_time_in_ms() <
-            best_result.elapsed_time_in_ms()) {
-          best_result = profile_result;
-        }
-        if (scratch_allocator.TotalAllocatedBytes() == 0 &&
-            profile_result.elapsed_time_in_ms() <
-                best_result_without_scratch.elapsed_time_in_ms()) {
-          best_result_without_scratch = profile_result;
-        }
-      } else {
-        VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm)
-                << " for ConvolutionThunk " << this << " failed.";
-      }
-    }
-
-    if (best_result.is_valid()) {
-      best_algorithm_->set_algorithm(best_result.algorithm());
-    } else {
-      LOG(ERROR) << "No convolution algorithm works with profiling. Fall back "
-                    "to the default algorithm.";
-      best_algorithm_->set_algorithm(AlgorithmDesc());
+  se::DeviceMemoryBase scratch =
+      buffer_allocations.GetDeviceAddress(scratch_buffer_);
+
+  se::dnn::AlgorithmConfig algorithm_config(
+      se::dnn::AlgorithmDesc(algorithm_, /*use_tensor_ops=*/false));
+
+  TF_RETURN_IF_ERROR(RunCudnnConvolution(
+      convolution_kind_, input_shape_, filter_shape_, output_shape_, input_data,
+      filter_data, output_data, scratch, window_, dim_nums_, algorithm_config,
+      stream));
+
+  // Figure out which of output/input/filter is the result produced by this op,
+  // and write the result tuple.
+  void* result_ptr = [&] {
+    switch (convolution_kind_) {
+      case CudnnConvKind::kForward:
+        return output_data.opaque();
+      case CudnnConvKind::kBackwardInput:
+        return input_data.opaque();
+      case CudnnConvKind::kBackwardFilter:
+        return filter_data.opaque();
     }
+  }();
+  void* ptrs[] = {result_ptr, scratch.opaque()};
+  se::DeviceMemory<void*> tuple_addr(
+      buffer_allocations.GetDeviceAddress(tuple_result_buffer_));
+  stream->ThenMemcpyH2D<void*>(ptrs, &tuple_addr);
 
-    if (best_result_without_scratch.is_valid()) {
-      best_algorithm_->set_algorithm_no_scratch(
-          best_result_without_scratch.algorithm());
-    } else {
-      LOG(ERROR) << "No convolution algorithm without scratch works with "
-                    "profiling. Fall back "
-                    "to the default algorithm.";
-      best_algorithm_->set_algorithm_no_scratch(AlgorithmDesc());
-    }
-  }
-
-  {
-    VLOG(2) << "Using convolution algorithm ("
-            << AlgorithmToString(best_algorithm_->algorithm()) << ", "
-            << AlgorithmToString(best_algorithm_->algorithm_no_scratch())
-            << ") for ConvolutionThunk: " << this;
-    ConvolveScratchAllocator scratch_allocator(
-        buffer_allocations.device_ordinal(),
-        buffer_allocations.memory_allocator());
-    return Convolve(input_descriptor, input_data, filter_descriptor,
-                    filter_data, output_descriptor, output_data,
-                    convolution_descriptor, *best_algorithm_, stream,
-                    &scratch_allocator, nullptr);
+  if (!stream->ok()) {
+    return InternalError("ConvolutionThunk::ExecuteOnStream failed.");
   }
+  return Status::OK();
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index 46c94d0bf1..ca9ef5277b 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -30,106 +31,47 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-// A one-time scratch allocator for forward and backward convolution. The
-// scratch buffers allocated are released on destruction.
-//
-// Not thread-safe.
-class ConvolveScratchAllocator : public perftools::gputools::ScratchAllocator {
- public:
-  ConvolveScratchAllocator(int device_ordinal,
-                           DeviceMemoryAllocator* memory_allocator);
-
-  ~ConvolveScratchAllocator() override;
-
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override;
-
-  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
-
-  perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
-  AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override;
-
- private:
-  const int device_ordinal_;
-  DeviceMemoryAllocator* memory_allocator_;
-  std::vector<perftools::gputools::DeviceMemoryBase> allocated_buffers_;
-  int64 total_allocated_bytes_ = 0;
-};
-
 // This class stores everything that StreamExecutor needs to launch a BNN
 // convolution. It is generated by IrEmitter.
 //
 // This is thread-compatible.
 class ConvolutionThunk : public Thunk {
  public:
-  // ConvolutionThunk performs one of the following types of convolution.
-  enum class ConvolutionKind {
-    kBackwardFilter,  // Backward convolution for filter.
-    kBackwardInput,   // Backward convolution for input.
-    kForward,         // Forward convolution.
-  };
-
-  // Constructs a thunk for launching a DNN convolution.
+  // Constructs a thunk for launching a DNN convolution.  When run, it will
+  // write a tuple (result, scratch_memory) into `tuple_result_buffer`.
+  //
+  // `algorithm` is a cudnn algorithm number.  `algorithm == -1` indicates that
+  // we should use the default (i.e. baseline) cudnn algorithm.
+  //
+  // Note that "output" here doesn't refer to the output from running this
+  // thunk, but rather to the "output" of a hypothetical forward convolution
+  // that corresponds to this input+filter+output triple.  That is, the result
+  // generated by this thunk is "output" for forward convs, "input" for
+  // backward-input convs, and "filter" for backward-filter convs.
+  //
   // Semantics of null hlo_instruction argument are as in Thunk.
-  ConvolutionThunk(ConvolutionKind convolution_kind,
+  ConvolutionThunk(CudnnConvKind convolution_kind,
                    const BufferAllocation::Slice& input_buffer,
                    const BufferAllocation::Slice& filter_buffer,
                    const BufferAllocation::Slice& output_buffer,
+                   const BufferAllocation::Slice& tuple_result_buffer,
+                   const BufferAllocation::Slice& scratch_buffer,
                    const Shape& input_shape, const Shape& filter_shape,
                    const Shape& output_shape, const Window& window,
-                   const ConvolutionDimensionNumbers& dnums,
+                   const ConvolutionDimensionNumbers& dim_nums, int64 algorithm,
                    const HloInstruction* hlo);
 
   ConvolutionThunk(const ConvolutionThunk&) = delete;
   ConvolutionThunk& operator=(const ConvolutionThunk&) = delete;
 
-  // Does the convolution for the thunk on "stream". Auto-tuning happens on the
-  // first run of this function.
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
-
-  // Returns true if the next run of ExecuteOnStream will do autotuning.  If so,
-  // we want the GPU to be quiescent during autotuning, so as not to introduce
-  // noise in our results.
-  bool ShouldHaltAllActivityBeforeRunning(
-      perftools::gputools::Stream*) override {
-    return !best_algorithm_.has_value();
-  }
-
-  // Return true if scratch memory is needed to execute the thunk, that is
-  // either the best algorithm hasn't been chosen or the best algorithm is not
-  // the same as the no-scratch algorithm. This is because that the execution
-  // of the thunk is asynchronous, and the scratch allocator goes out of
-  // scope before the thunk finishes execution. Returning true tells the stream
-  // executor to make future thunks wait for this thunk to avoid reusing the
-  // deallocated scratch memory until this thunk is done with it.
-  bool ShouldBlockFutureThunks() {
-    if (!best_algorithm_.has_value()) {
-      return true;
-    }
-
-    const perftools::gputools::dnn::AlgorithmDesc& best_alg =
-        best_algorithm_->algorithm();
-    const perftools::gputools::dnn::AlgorithmDesc& no_scratch_best_alg =
-        best_algorithm_->algorithm_no_scratch();
-    return (!best_alg.is_default() || !no_scratch_best_alg.is_default() ||
-            !(best_alg == no_scratch_best_alg));
-  }
+  // Does the convolution for the thunk on "stream".
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
 
  private:
-  tensorflow::Status ConvolveWithTune(
-      const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
-      perftools::gputools::DeviceMemory<float> input_data,
-      const perftools::gputools::dnn::FilterDescriptor& filter_descriptor,
-      perftools::gputools::DeviceMemory<float> filter_data,
-      const perftools::gputools::dnn::BatchDescriptor& output_descriptor,
-      perftools::gputools::DeviceMemory<float> output_data,
-      const perftools::gputools::dnn::ConvolutionDescriptor&
-          convolution_descriptor,
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream);
+  class ScratchAllocator;
 
-  tensorflow::Status Convolve(
+  Status Convolve(
       const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
       perftools::gputools::DeviceMemory<float> input_data,
       const perftools::gputools::dnn::FilterDescriptor& filter_descriptor,
@@ -139,40 +81,26 @@ class ConvolutionThunk : public Thunk {
       const perftools::gputools::dnn::ConvolutionDescriptor&
           convolution_descriptor,
       const perftools::gputools::dnn::AlgorithmConfig& algorithm_config,
-      perftools::gputools::Stream* stream,
-      ConvolveScratchAllocator* scratch_allocator,
+      perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator,
       perftools::gputools::dnn::ProfileResult* profile_result);
 
-  // Returns the convolve algorithms that can be used for this ConvolutionThunk.
-  std::vector<perftools::gputools::dnn::AlgorithmDesc> GetAlgorithms(
-      bool with_winograd_nonfused,
-      perftools::gputools::StreamExecutor* stream_exec) const;
-
-  // Fastest cuDNN convolution algorithm for this thunk learned from
-  // auto-tuning. If auto-tuning is disabled or failed, best_algorithm_ is set
-  // to the default value, indicating cuDNN's convolution will choose the best
-  // algorithm from some heuristics based on its parameters.
-  tensorflow::gtl::optional<perftools::gputools::dnn::AlgorithmConfig>
-      best_algorithm_;
-
-  const ConvolutionKind convolution_kind_;
+  const CudnnConvKind convolution_kind_;
 
   const BufferAllocation::Slice input_buffer_;
   const BufferAllocation::Slice filter_buffer_;
   const BufferAllocation::Slice output_buffer_;
+  const BufferAllocation::Slice tuple_result_buffer_;
+  const BufferAllocation::Slice scratch_buffer_;
 
   const Shape input_shape_;
   const Shape filter_shape_;
   const Shape output_shape_;
 
   const Window window_;
-
   const ConvolutionDimensionNumbers dim_nums_;
+  int64 algorithm_;
 };
 
-string ConvolutionKindToString(
-    ConvolutionThunk::ConvolutionKind convolution_kind);
-
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
new file mode 100644
index 0000000000..621b2d510f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -0,0 +1,370 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
+#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace se = perftools::gputools;
+
+using se::DeviceMemoryBase;
+using se::dnn::AlgorithmConfig;
+using se::dnn::AlgorithmDesc;
+using tensorflow::gtl::nullopt;
+using tensorflow::gtl::optional;
+
+class ScratchAllocator : public se::ScratchAllocator {
+ public:
+  ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
+      : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
+
+  ~ScratchAllocator() override;
+
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
+    return 1LL << 32;  // 4GB.  TODO(jlebar): Tune this?
+  }
+  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
+
+  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override;
+
+ private:
+  const int device_ordinal_;
+  DeviceMemoryAllocator* memory_allocator_;
+  std::vector<se::DeviceMemoryBase> allocated_buffers_;
+  int64 total_allocated_bytes_ = 0;
+};
+
+ScratchAllocator::~ScratchAllocator() {
+  for (auto& allocated_buffer : allocated_buffers_) {
+    if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer)
+             .ok()) {
+      // The program can still continue with failed deallocation.
+      LOG(ERROR) << "Failed to deallocate the allocated buffer: "
+                 << allocated_buffer.opaque();
+    }
+  }
+}
+
+se::port::StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
+    se::Stream* stream, int64 byte_size) {
+  CHECK_GE(byte_size, 0) << "byte_size must be positive.";
+  if (byte_size > GetMemoryLimitInBytes(stream)) {
+    return se::port::Status(
+        se::port::error::RESOURCE_EXHAUSTED,
+        tensorflow::strings::Printf(
+            "Allocating %lld bytes exceeds the memory limit of %lld bytes.",
+            byte_size, GetMemoryLimitInBytes(stream)));
+  }
+
+  auto status_or_memory =
+      memory_allocator_->Allocate(device_ordinal_, byte_size,
+                                  /*retry_on_failure=*/false);
+  if (!status_or_memory.ok()) {
+    return se::port::Status(se::port::error::RESOURCE_EXHAUSTED,
+                            tensorflow::strings::Printf(
+                                "Failed to allocate %lld bytes on device %d.",
+                                byte_size, device_ordinal_));
+  }
+  se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie();
+  allocated_buffers_.push_back(allocated_buffer);
+  total_allocated_bytes_ += byte_size;
+  return se::DeviceMemory<uint8>(allocated_buffer);
+}
+
+// Determines whether we can safely perform a winograd non-fused convolution for
+// the given input and output shapes.  This works around b/68264959, an integer
+// overflow in cuDNNv5 and cuDNNv6.
+//
+// TODO(jlebar): We shouldn't need this check for cuDNNv7.
+bool ShouldIncludeWinogradNonfusedAlgo(
+    const Shape& input_shape, const Shape& output_shape,
+    const ConvolutionDimensionNumbers& dnums) {
+  int64 batch = input_shape.dimensions(dnums.input_batch_dimension());
+  int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension());
+  int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0));
+  int64 in_cols =
+      dnums.input_spatial_dimensions_size() == 1
+          ? 1
+          : input_shape.dimensions(dnums.input_spatial_dimensions(1));
+  int64 out_depths = output_shape.dimensions(dnums.output_feature_dimension());
+
+  int64 total_size = CeilOfRatio(batch, int64{16}) *
+                     std::max(in_depths, out_depths) * in_cols * in_rows *
+                     sizeof(float);
+
+  const int64 threshold = 1L << 31;
+  return total_size < threshold;
+}
+
+std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
+                                         bool with_winograd_nonfused,
+                                         se::StreamExecutor* stream_exec_) {
+  std::vector<AlgorithmDesc> algorithms;
+  switch (kind) {
+    case CudnnConvKind::kBackwardFilter:
+      CHECK(stream_exec_->GetConvolveBackwardFilterAlgorithms(
+          with_winograd_nonfused, &algorithms));
+      break;
+    case CudnnConvKind::kBackwardInput:
+      CHECK(stream_exec_->GetConvolveBackwardDataAlgorithms(
+          with_winograd_nonfused, &algorithms));
+      break;
+    case CudnnConvKind::kForward:
+      CHECK(stream_exec_->GetConvolveAlgorithms(with_winograd_nonfused,
+                                                &algorithms));
+      break;
+  }
+
+  // Remove any algorithms with tensor math enabled.  These have lower precision
+  // than regular algorithms, and we don't yet have a way to turn this on/off in
+  // XLA.
+  algorithms.erase(std::remove_if(algorithms.begin(), algorithms.end(),
+                                  [&](const AlgorithmDesc& a) {
+                                    return a.tensor_ops_enabled();
+                                  }),
+                   algorithms.end());
+
+  return algorithms;
+}
+
+string AlgorithmToString(const AlgorithmDesc& algo) {
+  if (algo.tensor_ops_enabled()) {
+    return tensorflow::strings::StrCat(algo.algo_id(), "+TC");
+  }
+  return tensorflow::strings::StrCat(algo.algo_id());
+}
+
+string NumBytesToString(int64 bytes) {
+  return tensorflow::strings::StrCat(
+      tensorflow::strings::HumanReadableNumBytes(bytes), " (", bytes, "B)");
+}
+
+}  // anonymous namespace
+
+// We could have caching here so that we don't redo this work for two identical
+// convolutions.  Unfortunately our cache key would have to be a tuple
+// containing the protos passed to this function, and we have no utility for
+// hashing protos.  We could write our own hash functions, but they'd silently
+// break if we ever added a field to one of the protos.  Perhaps we could hack
+// using the binary-encoded proto as the hash key, on the assumption that two
+// protos being binary-equal is a sufficient, if not necessary, condition for
+// proper equality.  But that would still leave us open to having unnecessary
+// cache misses and doing extra work.  Overall, caching doesn't seem worth the
+// trouble, but we may want to revisit this if we ever find a model where
+// caching would speed up compilation a lot.
+optional<std::pair<int64, int64>>
+CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
+    CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
+    const Shape& output_shape, const Window& window,
+    const ConvolutionDimensionNumbers& dnums, HloInstruction* instr) {
+  // Create a stream for us to do our work on.
+  se::Stream stream{stream_exec_};
+  stream.Init();
+  const auto device_ordinal = stream_exec_->device_ordinal();
+
+  // allocator either points to this->allocator_ or, if that's null, to a
+  // StreamExecutorMemoryAllocator for stream_exec_.
+  DeviceMemoryAllocator* allocator;
+  optional<StreamExecutorMemoryAllocator> se_allocator;
+  if (allocator_ != nullptr) {
+    allocator = allocator_;
+  } else {
+    se_allocator.emplace(
+        stream_exec_->platform(),
+        tensorflow::gtl::ArraySlice<se::StreamExecutor*>({stream_exec_}));
+    allocator = &*se_allocator;
+  }
+
+  // Allocate space for the input, filter, and output of the convolution.  We
+  // use a ScratchAllocator for this instead of calling allocator_ directly so
+  // that our allocations don't leak.
+  //
+  // We don't put any data in these buffers, because (in theory, anyway) the
+  // speed of a conv isn't affected by the data being convolved.
+  ScratchAllocator input_output_allocator(device_ordinal, allocator);
+  se::port::StatusOr<DeviceMemoryBase> input_buf =
+      input_output_allocator.AllocateBytes(&stream,
+                                           ShapeUtil::ByteSizeOf(input_shape));
+  se::port::StatusOr<DeviceMemoryBase> filter_buf =
+      input_output_allocator.AllocateBytes(&stream,
+                                           ShapeUtil::ByteSizeOf(filter_shape));
+  se::port::StatusOr<DeviceMemoryBase> output_buf =
+      input_output_allocator.AllocateBytes(&stream,
+                                           ShapeUtil::ByteSizeOf(output_shape));
+  if (!input_buf.ok() || !filter_buf.ok() || !output_buf.ok()) {
+    LOG(WARNING)
+        << "Couldn't allocate space for input/filter/output of convolution "
+        << instr->ToString() << ".  Falling back to default algorithm.";
+    return nullopt;
+  }
+
+  const bool use_winograd_nonfused =
+      ShouldIncludeWinogradNonfusedAlgo(input_shape, output_shape, dnums);
+  se::dnn::ProfileResult best_result;
+  int64 best_result_bytes_used = 0;
+  for (const AlgorithmDesc& alg :
+       GetAlgorithms(kind, use_winograd_nonfused, stream_exec_)) {
+    ScratchAllocator scratch_allocator(device_ordinal, allocator);
+    se::dnn::ProfileResult profile_result;
+    VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
+            << instr->ToString();
+
+    bool launch_ok =
+        RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                            se::DeviceMemory<float>(input_buf.ValueOrDie()),
+                            se::DeviceMemory<float>(filter_buf.ValueOrDie()),
+                            se::DeviceMemory<float>(output_buf.ValueOrDie()),
+                            &scratch_allocator, window, dnums,
+                            AlgorithmConfig(alg), &stream, &profile_result)
+            .ok();
+
+    if (launch_ok && profile_result.is_valid()) {
+      int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
+      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
+              << " succeeded, taking " << profile_result.elapsed_time_in_ms()
+              << "ms and using " << NumBytesToString(scratch_bytes_used)
+              << " of scratch (Best result: "
+              << best_result.elapsed_time_in_ms() << "ms, "
+              << NumBytesToString(best_result_bytes_used) << " of scratch)";
+      if (profile_result.elapsed_time_in_ms() <
+          best_result.elapsed_time_in_ms()) {
+        best_result = profile_result;
+        best_result_bytes_used = scratch_bytes_used;
+      }
+    } else {
+      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " failed.";
+    }
+  }
+  if (best_result.is_valid()) {
+    VLOG(2) << "Best algorithm for " << instr->ToString() << ": "
+            << AlgorithmToString(best_result.algorithm()) << ", takes "
+            << best_result.elapsed_time_in_ms() << "ms, and uses "
+            << best_result_bytes_used << "B of scratch memory.";
+    return std::make_pair(best_result.algorithm().algo_id(),
+                          best_result_bytes_used);
+  }
+
+  LOG(WARNING) << "All algorithms tried for convolution " << instr->ToString()
+               << " failed.  Falling back to default algorithm.";
+  return nullopt;
+}
+
+StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
+    HloInstruction* instr) {
+  CHECK(IsCustomCallToDnnConvolution(*instr));
+
+  const auto& call_target = instr->custom_call_target();
+  const auto& lhs_shape = instr->operand(0)->shape();
+  const auto& rhs_shape = instr->operand(1)->shape();
+  const auto& conv_result_shape = instr->shape().tuple_shapes(0);
+  optional<std::pair<int64, int64>> alg_and_scratch_bytes;
+  if (call_target == kCudnnConvForwardCallTarget) {
+    alg_and_scratch_bytes = PickBestAlgorithm(
+        CudnnConvKind::kForward, /*input_shape=*/lhs_shape,
+        /*filter_shape=*/rhs_shape, /*output_shape=*/conv_result_shape,
+        instr->window(), instr->convolution_dimension_numbers(), instr);
+  } else if (call_target == kCudnnConvBackwardInputCallTarget) {
+    alg_and_scratch_bytes = PickBestAlgorithm(
+        CudnnConvKind::kBackwardInput, /*input_shape=*/conv_result_shape,
+        /*filter_shape=*/rhs_shape, /*output_shape=*/lhs_shape, instr->window(),
+        instr->convolution_dimension_numbers(), instr);
+  } else if (call_target == kCudnnConvBackwardFilterCallTarget) {
+    alg_and_scratch_bytes = PickBestAlgorithm(
+        CudnnConvKind::kBackwardFilter, /*input_shape=*/lhs_shape,
+        /*filter_shape=*/conv_result_shape, /*output_shape=*/rhs_shape,
+        instr->window(), instr->convolution_dimension_numbers(), instr);
+  } else {
+    LOG(FATAL) << "Unknown custom call target for cudnn conv: "
+               << instr->ToString();
+  }
+
+  if (!alg_and_scratch_bytes.has_value()) {
+    return false;
+  }
+
+  int64 algorithm;
+  int64 scratch_bytes;
+  std::tie(algorithm, scratch_bytes) = *alg_and_scratch_bytes;
+
+  VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and "
+          << NumBytesToString(scratch_bytes)
+          << " of scratch memory: " << instr->ToString();
+
+  // Replace instr with a new CustomCall which has the correct algorithm, and
+  // whose output shape has the appropriate amount of scratch memory.
+  HloComputation* computation = instr->parent();
+  Shape new_call_shape =
+      ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0),
+                                 ShapeUtil::MakeShape(U8, {scratch_bytes})});
+  HloInstruction* algorithm_hlo = computation->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int64>(algorithm)));
+  HloInstruction* new_call =
+      computation->AddInstruction(HloInstruction::CreateCustomCall(
+          new_call_shape,
+          {instr->mutable_operand(0), instr->mutable_operand(1), algorithm_hlo},
+          instr->custom_call_target()));
+  new_call->set_window(instr->window());
+  new_call->set_convolution_dimension_numbers(
+      instr->convolution_dimension_numbers());
+
+  // Repackage new_call so it has the same shape as the original call, namely
+  // (conv_result, u8[0]).
+  HloInstruction* new_tuple =
+      computation->AddInstruction(HloInstruction::CreateTuple(
+          {computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+               new_call_shape.tuple_shapes(0), new_call, 0)),
+           computation->AddInstruction(
+               HloInstruction::CreateConstant(Literal::CreateR1<uint8>({})))}));
+
+  TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple));
+  return true;
+}
+
+StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnComputation(
+    HloComputation* computation) {
+  std::vector<HloInstruction*> convs;
+  for (auto* instr : computation->instructions()) {
+    if (IsCustomCallToDnnConvolution(*instr)) {
+      convs.push_back(instr);
+    }
+  }
+
+  bool changed = false;
+  for (auto* instr : convs) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr));
+    changed |= result;
+  }
+  return changed;
+}
+
+StatusOr<bool> CudnnConvolutionAlgorithmPicker::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
+    changed |= result;
+  }
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
new file mode 100644
index 0000000000..10e49daee5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
+
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// Modifies CustomCalls to cudnn convolutions, choosing the best algorithm for
+// each and adding explicit scratch space to the CustomCalls.
+class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
+ public:
+  // If the `allocator` parameter is not null, we will use it to allocate temp
+  // memory while timing the various convolution algorithms.  If it's null,
+  // we'll use the default allocator on the StreamExecutor.
+  CudnnConvolutionAlgorithmPicker(
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* allocator)
+      : stream_exec_(stream_exec), allocator_(allocator) {}
+
+  tensorflow::StringPiece name() const override {
+    return "cudnn-convolution-algorithm-picker";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  StatusOr<bool> RunOnComputation(HloComputation* computation);
+  StatusOr<bool> RunOnInstruction(HloInstruction* instr);
+  tensorflow::gtl::optional<std::pair<int64, int64>> PickBestAlgorithm(
+      CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
+      const Shape& output_shape, const Window& window,
+      const ConvolutionDimensionNumbers& dnums, HloInstruction* instr);
+
+  perftools::gputools::StreamExecutor* stream_exec_;  // never null
+  DeviceMemoryAllocator* allocator_;                  // may be null
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc
similarity index 83%
rename from tensorflow/compiler/xla/service/gpu/convolution_folding.cc
rename to tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc
index b0626ca3bc..e0c73aa73a 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h"
 
 #include <numeric>
 #include <vector>
@@ -33,14 +33,32 @@ namespace xla {
 namespace gpu {
 
 namespace {
+
+bool CanImplementAsCudnnForwardConv(HloInstruction* conv) {
+  const ConvolutionDimensionNumbers& dnums =
+      conv->convolution_dimension_numbers();
+  if (dnums.input_spatial_dimensions_size() > 3) {
+    return false;
+  }
+
+  // CuDNN does not accept zero-element arguments
+  if (ShapeUtil::HasZeroElements(conv->operand(0)->shape()) ||
+      ShapeUtil::HasZeroElements(conv->operand(1)->shape())) {
+    return false;
+  }
+
+  if (window_util::HasWindowReversal(conv->window())) {
+    return false;
+  }
+  return true;
+}
+
 // Try to match a backward filter pattern that contains "conv".
 // Precondition: "conv" is a kConvolution.
-std::tuple<bool, std::vector<HloInstruction*>, Window,
-           ConvolutionDimensionNumbers>
-MatchBackwardFilter(HloInstruction* conv) {
+std::tuple<bool, Window, ConvolutionDimensionNumbers> MatchBackwardFilter(
+    HloInstruction* conv) {
   const auto no_match_result =
-      std::make_tuple(false, std::vector<HloInstruction*>(), Window(),
-                      ConvolutionDimensionNumbers());
+      std::make_tuple(false, Window(), ConvolutionDimensionNumbers());
   // Step 1: match the instruction pattern without considering the paddings and
   // dimension numbers just yet. We may need some generic pattern matcher
   // similar to third_party/llvm/llvm/include/llvm/IR/PatternMatch.h
@@ -190,18 +208,15 @@ MatchBackwardFilter(HloInstruction* conv) {
     backward_conv_dnums.add_kernel_spatial_dimensions(output_spatial_dims[i]);
   }
 
-  return std::make_tuple(true, std::vector<HloInstruction*>({conv}),
-                         backward_conv_window, backward_conv_dnums);
+  return std::make_tuple(true, backward_conv_window, backward_conv_dnums);
 }
 
 // Try to match a backward input pattern that contains "conv".
 // Precondition: "conv" is a kConvolution.
-std::tuple<bool, std::vector<HloInstruction*>, Window,
-           ConvolutionDimensionNumbers>
-MatchBackwardInput(HloInstruction* conv) {
+std::tuple<bool, Window, ConvolutionDimensionNumbers> MatchBackwardInput(
+    HloInstruction* conv) {
   const auto no_match_result =
-      std::make_tuple(false, std::vector<HloInstruction*>(), Window(),
-                      ConvolutionDimensionNumbers());
+      std::make_tuple(false, Window(), ConvolutionDimensionNumbers());
 
   // Match instruction pattern.
   CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
@@ -374,58 +389,82 @@ MatchBackwardInput(HloInstruction* conv) {
   dnums.set_kernel_output_feature_dimension(
       conv->convolution_dimension_numbers().kernel_input_feature_dimension());
 
-  return std::make_tuple(true,
-                         std::vector<HloInstruction*>({conv, reverse_filter}),
-                         new_window, dnums);
+  return std::make_tuple(true, new_window, dnums);
 }
-}  // namespace
 
-StatusOr<bool> ConvolutionFolding::Run(HloModule* module) {
-  HloComputation* entry_computation = module->entry_computation();
-  std::vector<HloInstruction*> convs;
-  for (auto* hlo : entry_computation->instructions()) {
-    if (hlo->opcode() == HloOpcode::kConvolution) {
-      convs.push_back(hlo);
-    }
-  }
+// Tries to rewrite a single convolution into a call to cudnn.
+StatusOr<bool> RunOnInstruction(HloInstruction* conv) {
+  CHECK_EQ(conv->opcode(), HloOpcode::kConvolution);
 
-  bool changed = false;
-  for (HloInstruction* conv : convs) {
+  HloInstruction* custom_call = [&]() -> HloInstruction* {
     bool match;
-    std::vector<HloInstruction*> hlos_to_fuse;
     Window window;
     ConvolutionDimensionNumbers dnums;
-    std::tie(match, hlos_to_fuse, window, dnums) = MatchBackwardFilter(conv);
+
+    std::tie(match, window, dnums) = MatchBackwardFilter(conv);
     if (match) {
-      VLOG(2) << "Fuse instructions";
-      for (HloInstruction* hlo_to_fuse : hlos_to_fuse) {
-        VLOG(2) << "  " << hlo_to_fuse->ToString();
-      }
-      HloInstruction* backward_convolution =
-          entry_computation->CreateFusionInstructionForBackwardConvolution(
-              hlos_to_fuse, HloInstruction::FusionKind::kConvBackwardFilter,
-              window, dnums);
-      VLOG(2) << "to backward filter convolution";
-      VLOG(2) << "  " << backward_convolution->ToString();
-      changed = true;
-      continue;
+      return CreateCudnnConvBackwardFilter(
+          conv->shape(), conv->mutable_operand(0), conv->mutable_operand(1),
+          window, dnums);
     }
 
-    std::tie(match, hlos_to_fuse, window, dnums) = MatchBackwardInput(conv);
+    std::tie(match, window, dnums) = MatchBackwardInput(conv);
     if (match) {
-      VLOG(2) << "Fuse instructions";
-      for (HloInstruction* hlo_to_fuse : hlos_to_fuse) {
-        VLOG(2) << "  " << hlo_to_fuse->ToString();
-      }
-      HloInstruction* backward_convolution =
-          entry_computation->CreateFusionInstructionForBackwardConvolution(
-              hlos_to_fuse, HloInstruction::FusionKind::kConvBackwardInput,
-              window, dnums);
-      VLOG(2) << "to backward input convolution";
-      VLOG(2) << "  " << backward_convolution->ToString();
-      changed = true;
-      continue;
+      // Backward input conv subsumes the conv plus the reverse in operand 1.
+      HloInstruction* reverse = conv->mutable_operand(1);
+      CHECK_EQ(reverse->opcode(), HloOpcode::kReverse);
+      HloInstruction* rhs = reverse->mutable_operand(0);
+
+      return CreateCudnnConvBackwardInput(
+          conv->shape(), conv->mutable_operand(0), rhs, window, dnums);
     }
+
+    // If all else fails, try a forward convolution.
+    if (CanImplementAsCudnnForwardConv(conv)) {
+      return CreateCudnnConvForward(conv->shape(), conv->mutable_operand(0),
+                                    conv->mutable_operand(1), conv->window(),
+                                    conv->convolution_dimension_numbers());
+    }
+
+    return nullptr;
+  }();
+
+  if (custom_call == nullptr) {
+    return false;
+  }
+
+  // The CustomCall returns a tuple (conv_result, scratch_memory).  Extract out
+  // the conv result and replace `conv` with it.
+  TF_RETURN_IF_ERROR(conv->parent()->ReplaceWithNewInstruction(
+      conv,
+      HloInstruction::CreateGetTupleElement(conv->shape(), custom_call, 0)));
+  return true;
+}
+
+// Rewrites the convolutions in the given computation into calls to cudnn.
+// Returns true if it made any changes.
+StatusOr<bool> RunOnComputation(HloComputation* computation) {
+  std::vector<HloInstruction*> convs;
+  for (auto* hlo : computation->instructions()) {
+    if (hlo->opcode() == HloOpcode::kConvolution) {
+      convs.push_back(hlo);
+    }
+  }
+
+  bool changed = false;
+  for (HloInstruction* conv : convs) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(conv));
+    changed |= result;
+  }
+  return changed;
+}
+}  // namespace
+
+StatusOr<bool> CudnnConvolutionRewriter::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
+    changed |= result;
   }
   return changed;
 }
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h
similarity index 63%
rename from tensorflow/compiler/xla/service/gpu/convolution_folding.h
rename to tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h
index f9c898721f..0c0578d888 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
@@ -22,10 +22,12 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-class ConvolutionFolding : public HloPassInterface {
+// Rewrites plain convolutions, backwards-filter convolutions, and
+// backwards-input convolutions into CustomCall HLOs that call into cuDNN.
+class CudnnConvolutionRewriter : public HloPassInterface {
  public:
   tensorflow::StringPiece name() const override {
-    return "convolution-folding";
+    return "cudnn-convolution-rewriter";
   }
 
   StatusOr<bool> Run(HloModule* module) override;
@@ -34,4 +36,4 @@ class ConvolutionFolding : public HloPassInterface {
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc
similarity index 82%
rename from tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
rename to tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc
index 34e6bdb117..65588b6aaf 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,23 +13,29 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h"
 
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace gpu {
+namespace {
 
-class ConvolutionFoldingTest : public HloTestBase {
+namespace op = xla::testing::opcode_matchers;
+
+class CudnnConvolutionRewriterTest : public HloTestBase {
  public:
-  ConvolutionFoldingTest() {
+  CudnnConvolutionRewriterTest() {
     for (int i = 0; i < 2; ++i) {
       WindowDimension* window_dim = default_conv_window_.add_dimensions();
       window_dim->set_size(1);
@@ -44,7 +50,8 @@ class ConvolutionFoldingTest : public HloTestBase {
     // the batch and feature dimension in the activations, and treat the batch
     // dimension in gradients as the input feature dimension in the filter.
     //
-    // TODO(jingyue): Add more tests on NCHW input order which TF also supports.
+    // TODO(jingyue): Add more tests on NCHW input order, which TF also
+    // supports.
     tf_default_dnums_for_backward_filter_.set_input_batch_dimension(3);
     tf_default_dnums_for_backward_filter_.set_input_feature_dimension(0);
     tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(1);
@@ -74,9 +81,8 @@ class ConvolutionFoldingTest : public HloTestBase {
   }
 
  protected:
-  bool FoldConvolution(HloModule* module) {
-    ConvolutionFolding convolution_folding;
-    return convolution_folding.Run(module).ValueOrDie();
+  bool RunPass(HloModule* module) {
+    return CudnnConvolutionRewriter().Run(module).ValueOrDie();
   }
 
   // A convolution window with stride 1 and zero padding. The size fields are
@@ -86,7 +92,7 @@ class ConvolutionFoldingTest : public HloTestBase {
   ConvolutionDimensionNumbers tf_default_dnums_for_backward_input_;
 };
 
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolve) {
+TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolve) {
   HloComputation::Builder builder(TestName());
   HloInstruction* activations =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -108,14 +114,13 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolve) {
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
 }
 
-TEST_F(ConvolutionFoldingTest,
+TEST_F(CudnnConvolutionRewriterTest,
        BackwardFilterConvolveEquivalentToForwardConvolution) {
   HloComputation::Builder builder(TestName());
   HloInstruction* activations =
@@ -135,12 +140,17 @@ TEST_F(ConvolutionFoldingTest,
       tf_default_dnums_for_backward_filter_));
 
   auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
 }
 
 // Extracted from block35 training.
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedActivations) {
+TEST_F(CudnnConvolutionRewriterTest,
+       BackwardFilterConvolveWithPaddedActivations) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* activations =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -162,15 +172,15 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedActivations) {
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
 }
 
 // Extracted from inception v3 training.
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedGradients) {
+TEST_F(CudnnConvolutionRewriterTest,
+       BackwardFilterConvolveWithPaddedGradients) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* activations =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -192,14 +202,13 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedGradients) {
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
 }
 
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithUnevenPadding) {
+TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolveWithUnevenPadding) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* activations =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -221,14 +230,13 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithUnevenPadding) {
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
 }
 
-TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) {
+TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveEvenPadding) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* output =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -272,14 +280,15 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) {
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+
+  ASSERT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
+  const HloInstruction* custom_call =
+      entry_computation->root_instruction()->operand(0);
   for (int i = 0; i < 2; ++i) {
-    const WindowDimension& window_dim =
-        entry_computation->root_instruction()->window().dimensions(i);
+    const WindowDimension& window_dim = custom_call->window().dimensions(i);
     // Low padding of the backward input convolution
     //   = kernel_size - 1 - low padding on gradients.
     EXPECT_EQ(3, window_dim.padding_low());
@@ -291,7 +300,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) {
 // Convolve([abc], [x], base_dilation=2)
 //   = Convolve([abc], Reverse([x]), base_dilation=2)
 //   = BackwardInputConvolve([abc], [x], stride=2)
-TEST_F(ConvolutionFoldingTest, BackwardInputConvolve1x1Filter) {
+TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolve1x1Filter) {
   auto builder = HloComputation::Builder(TestName());
   // NHWC dimension order.
   HloInstruction* output =
@@ -316,17 +325,16 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolve1x1Filter) {
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
 }
 
 // BackwardInputConvolve([abc], [x], stride=1) is equivalent to
 // ForwardConvolve([abc], [x], stride=1). No need to fold it into backward input
 // convolution.
-TEST_F(ConvolutionFoldingTest,
+TEST_F(CudnnConvolutionRewriterTest,
        BackwardInputConvolve1x1FilterEquivalentToForwardConvolve) {
   auto builder = HloComputation::Builder(TestName());
   // NHWC dimension order.
@@ -347,8 +355,12 @@ TEST_F(ConvolutionFoldingTest,
       tf_default_dnums_for_backward_input_));
 
   auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_FALSE(FoldConvolution(module.get()));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(
+      entry_computation->root_instruction(),
+      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
 }
 
 // Extracted from Inception V3 training.
@@ -365,7 +377,8 @@ TEST_F(ConvolutionFoldingTest,
 //                     20x10x10x192
 //
 // Gradients are padded unevenly.
-TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) {
+TEST_F(CudnnConvolutionRewriterTest,
+       BackwardInputConvolveUnevenPaddingOnGradients) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* output =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -397,14 +410,14 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) {
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  ASSERT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
+  const HloInstruction* custom_call =
+      entry_computation->root_instruction()->operand(0);
   for (int i = 0; i < 2; ++i) {
-    const WindowDimension& window_dim =
-        entry_computation->root_instruction()->window().dimensions(i);
+    const WindowDimension& window_dim = custom_call->window().dimensions(i);
     EXPECT_EQ(0, window_dim.padding_low());
     EXPECT_EQ(0, window_dim.padding_high());
     EXPECT_EQ(2, window_dim.stride());
@@ -413,7 +426,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) {
 
 // Similar to BackwardInputConvolveUnevenPadding, but the low padding of the
 // gradients exceeds kernel_size - 1. Therefore, this pattern cannot be fused.
-TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) {
+TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveLowPaddingTooLarge) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* output =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -442,8 +455,12 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) {
                          .ValueOrDie()));
 
   auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_FALSE(FoldConvolution(module.get()));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(
+      entry_computation->root_instruction(),
+      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
 }
 
 // Extracted from //learning/brain/google/xla/benchmarks/resnet.py
@@ -460,7 +477,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) {
 //
 // We should fuse BC even though padding on activations is uneven, because
 // PadInsertion will canonicalize the fusion HLO.
-TEST_F(ConvolutionFoldingTest,
+TEST_F(CudnnConvolutionRewriterTest,
        BackwardInputConvolveUnevenPaddingOnActivations) {
   auto builder = HloComputation::Builder(TestName());
   // The gradients are in NCHW layout.
@@ -493,13 +510,12 @@ TEST_F(ConvolutionFoldingTest,
   auto module = CreateNewModule();
   const HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  const HloInstruction* backward_conv = entry_computation->root_instruction();
-  EXPECT_EQ(HloOpcode::kFusion, backward_conv->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput ==
-              backward_conv->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  ASSERT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
   const WindowDimension& backward_conv_col_dim =
-      backward_conv->window().dimensions(1);
+      entry_computation->root_instruction()->operand(0)->window().dimensions(1);
   EXPECT_EQ(0, backward_conv_col_dim.padding_low());
   EXPECT_EQ(1, backward_conv_col_dim.padding_high());
 }
@@ -515,7 +531,7 @@ TEST_F(ConvolutionFoldingTest,
 //
 // We currently don't fuse BC because PadInsertion doesn't support negative
 // padding on the gradients of backward convolution (b/32744257).
-TEST_F(ConvolutionFoldingTest,
+TEST_F(CudnnConvolutionRewriterTest,
        BackwardInputConvolveNegativePaddingHighOnActivations) {
   auto builder = HloComputation::Builder(TestName());
   // The gradients are in NCHW layout.
@@ -544,9 +560,14 @@ TEST_F(ConvolutionFoldingTest,
                          .ValueOrDie()));
 
   auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_FALSE(FoldConvolution(module.get()));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(
+      entry_computation->root_instruction(),
+      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
 }
 
+}  // anonymous namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
new file mode 100644
index 0000000000..f5f52cf62b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
@@ -0,0 +1,221 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace se = ::perftools::gputools;
+
+using se::DeviceMemory;
+using se::DeviceMemoryBase;
+using se::Stream;
+using se::dnn::AlgorithmConfig;
+using se::dnn::BatchDescriptor;
+using se::dnn::ConvolutionDescriptor;
+using se::dnn::DataLayout;
+using se::dnn::DimIndex;
+using se::dnn::FilterDescriptor;
+using se::dnn::FilterLayout;
+using se::dnn::ProfileResult;
+
+// A StreamExecutor ScratchAllocator that wraps a single XLA allocation,
+// returning it (in its entirety) the first time Allocate() is called.
+class ScratchBufAllocator : public se::ScratchAllocator {
+ public:
+  explicit ScratchBufAllocator(se::DeviceMemoryBase scratch)
+      : scratch_(scratch) {}
+
+  ~ScratchBufAllocator() override = default;
+
+  int64 GetMemoryLimitInBytes(se::Stream* /*stream*/) override {
+    return scratch_.size();
+  }
+
+  se::port::StatusOr<DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override {
+    if (allocated_) {
+      return se::port::InternalError(
+          "Can't allocate twice from a ScratchBufAllocator.");
+    }
+    if (byte_size > scratch_.size()) {
+      return se::port::InternalError(tensorflow::strings::StrCat(
+          "Can't allocate ", byte_size,
+          " bytes from a ScratchBufAllocator of size ", scratch_.size()));
+    }
+
+    allocated_ = true;
+    return se::DeviceMemory<uint8>(scratch_);
+  }
+
+ private:
+  se::DeviceMemoryBase scratch_;
+  bool allocated_ = false;
+};
+
+}  // anonymous namespace
+
+string CudnnConvKindToString(CudnnConvKind kind) {
+  switch (kind) {
+    case CudnnConvKind::kForward:
+      return "forward";
+    case CudnnConvKind::kBackwardFilter:
+      return "backward_filter";
+    case CudnnConvKind::kBackwardInput:
+      return "backward_input";
+  }
+}
+
+Status RunCudnnConvolution(CudnnConvKind kind, const Shape& input_shape,
+                           const Shape& filter_shape, const Shape& output_shape,
+                           DeviceMemory<float> input_buf,
+                           DeviceMemory<float> filter_buf,
+                           DeviceMemory<float> output_buf,
+                           DeviceMemoryBase scratch_buf, const Window& window,
+                           const ConvolutionDimensionNumbers& dnums,
+                           AlgorithmConfig algorithm, Stream* stream,
+                           ProfileResult* profile_result /*= nullptr*/) {
+  ScratchBufAllocator scratch_allocator(scratch_buf);
+  return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                             input_buf, filter_buf, output_buf,
+                             &scratch_allocator, window, dnums, algorithm,
+                             stream, profile_result);
+}
+
+Status RunCudnnConvolution(
+    CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
+    const Shape& output_shape, DeviceMemory<float> input_buf,
+    DeviceMemory<float> filter_buf, DeviceMemory<float> output_buf,
+    se::ScratchAllocator* scratch_allocator, const Window& window,
+    const ConvolutionDimensionNumbers& dnums, AlgorithmConfig algorithm,
+    Stream* stream, ProfileResult* profile_result /*= nullptr*/) {
+  VLOG(3) << "Convolution kind: " << CudnnConvKindToString(kind);
+  VLOG(3) << "input shape: { " << ShapeUtil::HumanString(input_shape) << " }";
+  VLOG(3) << "filter shape: { " << ShapeUtil::HumanString(filter_shape) << " }";
+  VLOG(3) << "Output shape: { " << ShapeUtil::HumanString(output_shape) << " }";
+  VLOG(3) << "Window: { " << window.ShortDebugString() << " }";
+  VLOG(3) << "Dim nums: { " << dnums.ShortDebugString() << " }";
+
+  const int num_dimensions = window.dimensions_size();
+  CHECK_LE(num_dimensions, 3);
+  // cuDNN does not support 1D convolutions. We therefore express 1D
+  // convolutions as 2D convolutions where the first spatial dimension is 1.
+  // This matches the behavior of TF (see definition of conv1d in
+  // tensorflow/python/ops/nn_ops.py).
+  const int effective_num_dimensions = std::max(2, num_dimensions);
+
+  CHECK_EQ(F32, output_shape.element_type())
+      << ShapeUtil::HumanString(output_shape);
+  CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size());
+  CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size());
+  CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size());
+  for (const WindowDimension& dim : window.dimensions()) {
+    CHECK_EQ(dim.padding_low(), dim.padding_high());
+  }
+
+  // cuDNN's convolution APIs support the BDYX layout for activations/output and
+  // the OIYX layout for weights.
+  BatchDescriptor input_descriptor(effective_num_dimensions);
+  input_descriptor.set_layout(DataLayout::kBatchDepthYX)
+      .set_feature_map_count(
+          input_shape.dimensions(dnums.input_feature_dimension()))
+      .set_count(input_shape.dimensions(dnums.input_batch_dimension()));
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    // Note that the dimensions are reversed. The same holds below.
+    input_descriptor.set_spatial_dim(
+        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+        input_shape.dimensions(dnums.input_spatial_dimensions(dim)));
+  }
+
+  FilterDescriptor filter_descriptor(effective_num_dimensions);
+  filter_descriptor.set_layout(FilterLayout::kOutputInputYX)
+      .set_input_feature_map_count(
+          filter_shape.dimensions(dnums.kernel_input_feature_dimension()))
+      .set_output_feature_map_count(
+          filter_shape.dimensions(dnums.kernel_output_feature_dimension()));
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    filter_descriptor.set_spatial_dim(
+        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+        filter_shape.dimensions(dnums.kernel_spatial_dimensions(dim)));
+  }
+
+  ConvolutionDescriptor convolution_descriptor(effective_num_dimensions);
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    convolution_descriptor
+        .set_zero_padding(
+            static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+            window.dimensions(dim).padding_low())
+        .set_filter_stride(
+            static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+            window.dimensions(dim).stride());
+  }
+
+  BatchDescriptor output_descriptor(effective_num_dimensions);
+  output_descriptor.set_layout(DataLayout::kBatchDepthYX)
+      .set_feature_map_count(
+          output_shape.dimensions(dnums.output_feature_dimension()))
+      .set_count(output_shape.dimensions(dnums.output_batch_dimension()));
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    output_descriptor.set_spatial_dim(
+        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+        output_shape.dimensions(dnums.output_spatial_dimensions(dim)));
+  }
+
+  // Add a singleton dimension in the 1D convolution case.
+  if (num_dimensions == 1) {
+    input_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
+    output_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
+    filter_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
+    convolution_descriptor.set_zero_padding(static_cast<DimIndex>(0), 0)
+        .set_filter_stride(static_cast<DimIndex>(0), 1);
+  }
+
+  switch (kind) {
+    case CudnnConvKind::kForward:
+      stream->ThenConvolveWithAlgorithm(
+          input_descriptor, input_buf, filter_descriptor, filter_buf,
+          convolution_descriptor, output_descriptor, &output_buf,
+          scratch_allocator, algorithm, profile_result);
+      break;
+    case CudnnConvKind::kBackwardInput:
+      stream->ThenConvolveBackwardDataWithAlgorithm(
+          filter_descriptor, filter_buf, output_descriptor, output_buf,
+          convolution_descriptor, input_descriptor, &input_buf,
+          scratch_allocator, algorithm, profile_result);
+      break;
+    case CudnnConvKind::kBackwardFilter:
+      stream->ThenConvolveBackwardFilterWithAlgorithm(
+          input_descriptor, input_buf, output_descriptor, output_buf,
+          convolution_descriptor, filter_descriptor, &filter_buf,
+          scratch_allocator, algorithm, profile_result);
+      break;
+  }
+
+  if (!stream->ok()) {
+    return InternalError(
+        "Unable to launch convolution with type %s and algorithm (%lld, %lld)",
+        CudnnConvKindToString(kind).c_str(), algorithm.algorithm().algo_id(),
+        algorithm.algorithm_no_scratch().algo_id());
+  }
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
new file mode 100644
index 0000000000..b101f76510
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
@@ -0,0 +1,97 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_
+
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// This file contains low-level routines for running cudnn convolutions.
+
+// Different types of convolutions supported by cudnn.
+//
+// A way to think about these is that a convolution is defined by three arrays
+// -- the "input", the "filter", and the "output" -- and given any two of these,
+// we can compute the third.  For example, a backward-input convolution takes as
+// input a filter and an "output" and produces an "input" such that if one were
+// to do a forward convolution of "input" using filter, the result would be
+// something with the same shape as "output".
+//
+// This way of thinking is not correct if you look at the values produced. For
+// example, a backward-input convolution is not actually the mathematical
+// inverse of a forward convolution.  But it's right as far as the shapes and
+// "connectivity" (i.e. which elements of the input affect which elements of
+// the output) are concerned.
+enum class CudnnConvKind {
+  kForward,         // input  + filter => output
+  kBackwardInput,   // filter + output => input
+  kBackwardFilter,  // input  + output => filter
+};
+
+// Converts a CudnnConvKind value to a string.
+string CudnnConvKindToString(CudnnConvKind kind);
+
+// Calls into cudnn to run the specified convolution.
+//
+// Note that depending on the value of CudnnConvKind, the result of this call
+// may be written into input_buf, filter_buf, or output_buf!
+//
+// At the moment we only support cudnn convolutions over floats.
+//
+// We provide one overload which takes a scratch buffer, and another which takes
+// an allocator which is responsible for allocating the scratch space.  In
+// theory the second one shouldn't be necessary -- users of this function could
+// just ask cudnn how much scratch space it needs for a particular convolution.
+// But in practice, StreamExecutor does not expose such an API, and in the name
+// of parsimony, perhaps it's better not to add it.  Instead, the first time you
+// call a convolution, you should call the version that takes a scratch
+// allocator and take note of how much memory is used.  The next time you call
+// the same conv, you can provide an explicitly preallocated scratch buffer of
+// that size, if you like.
+Status RunCudnnConvolution(
+    CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
+    const Shape& output_shape,
+    perftools::gputools::DeviceMemory<float> input_buf,
+    perftools::gputools::DeviceMemory<float> filter_buf,
+    perftools::gputools::DeviceMemory<float> output_buf,
+    perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
+    const ConvolutionDimensionNumbers& dnums,
+    perftools::gputools::dnn::AlgorithmConfig algorithm,
+    perftools::gputools::Stream* stream,
+    perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
+
+Status RunCudnnConvolution(
+    CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
+    const Shape& output_shape,
+    perftools::gputools::DeviceMemory<float> input_buf,
+    perftools::gputools::DeviceMemory<float> filter_buf,
+    perftools::gputools::DeviceMemory<float> output_buf,
+    perftools::gputools::ScratchAllocator* scratch_allocator,
+    const Window& window, const ConvolutionDimensionNumbers& dnums,
+    perftools::gputools::dnn::AlgorithmConfig algorithm,
+    perftools::gputools::Stream* stream,
+    perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 07543d42e3..12ec266ff3 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -35,8 +35,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
-#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h"
@@ -127,7 +128,9 @@ string GetLibdeviceDir(const string& config_cuda_data_dir) {
 }
 
 // Runs optimization passes on the given HLO module.
-tensorflow::Status OptimizeHloModule(HloModule* hlo_module) {
+tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
+                                     se::StreamExecutor* stream_exec,
+                                     DeviceMemoryAllocator* device_allocator) {
   {
     HloPassPipeline pipeline("optimization");
     pipeline.AddInvariantChecker<HloVerifier>();
@@ -143,6 +146,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) {
     // most ops.
     pipeline.AddPass<HloElementTypeConverter>(BF16, F32);
     pipeline.AddPass<DotDecomposer>();
+
     {
       auto& pass =
           pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
@@ -173,7 +177,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) {
       pass.AddPass<ReshapeMover>();
       pass.AddPass<HloConstantFolding>();
     }
-    pipeline.AddPass<ConvolutionFolding>();
+
     pipeline.AddPass<TransposeFolding>(
         [](const HloInstruction& dot,
            const TransposeFolding::OperandIndices& candidate_operands) {
@@ -185,6 +189,58 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) {
     pipeline.AddPass<HloDCE>();
     TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
   }
+
+  {
+    // Convert convolutions into CustomCalls to cudnn, then canonicalize them
+    // (PadInsertion).
+    HloPassPipeline pipeline("conv_canonicalization");
+    pipeline.AddInvariantChecker<HloVerifier>();
+    pipeline.AddPass<CudnnConvolutionRewriter>();
+    pipeline.AddPass<PadInsertion>();
+
+    // Choose the fastest algorithm for each conv.
+    //
+    // In theory doing this here is way too early: It needs to happen after
+    // layout assignment, because the layout of the inputs/outputs affects the
+    // speed of the conv.  But currently we only allow only one input/output
+    // layout when calling cudnn, so there's no ambiguity.
+    //
+    // We pick the algorithm at this early stage so we can generate better HLO.
+    // After CudnnConvolutionRewriter, our convolutions are CustomCalls which
+    // return a tuple (conv_result, scratch_memory), and the each conv uses 0
+    // bytes of scratch:
+    //
+    //   customcall = (f32[...], f32[0])
+    //   return gte(customcall, 0)
+    //
+    // The algorithm picker then chooses the best algorithm, and potentially
+    // increases the scratch space.  It replaces customcall with new_tuple,
+    // giving us the following:
+    //
+    //   new_customcall = (f32[...], f32[N])
+    //   new_tuple = tuple(gte(new_customcall, 0), constant f32[0])
+    //   return gte(new_tuple, 0)
+    //
+    // The new tuple and gte instructions then be simplified away, because
+    // nobody is expected to use the scratch value.
+    //
+    // However, if we were to run CudnnConvolutionAlgorithmPicker after layout
+    // assignment, fusion would already have run, and the gte(customcall, 0)
+    // would probably already be into a fusion node.  We can't simplify across
+    // HloComputation boundaries, so in this case we wouldn't be able to
+    // simplify away the new_tuple bits.
+    //
+    // We'll need to revisit this if we ever allow multiple layouts for the
+    // inputs/outputs of a cudnn convolution.
+    pipeline.AddPass<CudnnConvolutionAlgorithmPicker>(stream_exec,
+                                                      device_allocator);
+    // Clean up new_tuple described above.
+    pipeline.AddPass<TupleSimplifier>();
+    pipeline.AddPass<HloDCE>();
+
+    TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
+  }
+
   {
     HloPassFix<HloPassPipeline> fusion("fusion");
     fusion.AddInvariantChecker<HloVerifier>();
@@ -212,9 +268,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) {
 
 // Modifies the given HLO module so that it will be accepted by IrEmitter.
 // Unlike optimization passes, the passes are necessary for correctness.
-tensorflow::Status PrepareHloModuleForIrEmitting(
-    HloModule* hlo_module, se::StreamExecutor* stream_exec,
-    DeviceMemoryAllocator* /*device_allocator*/) {
+tensorflow::Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
   // In some cases, we have to place the result of an instruction in a temporary
   // buffer. For instance, the buffer that holds an external parameter is
   // assumed immutable at this point, and should not be reused for output
@@ -222,9 +276,10 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
   // the parameter.
   HloPassPipeline pipeline("GPU-ir-emit-prepare");
   pipeline.AddInvariantChecker<HloVerifier>();
-  pipeline.AddPass<PadInsertion>();
+
   pipeline.AddPass<GpuLayoutAssignment>(
       hlo_module->mutable_entry_computation_layout());
+
   // The LayoutAssignment pass may leave behind kCopy instructions which are
   // duplicate or NOPs, so remove them with algebraic simplification and CSE.
   pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
@@ -417,7 +472,8 @@ StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
   XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses");
   Tracing::TraceMe annotation("HLO Transforms", module->name(),
                               /*is_expensive=*/true);
-  TF_RETURN_IF_ERROR(OptimizeHloModule(module.get()));
+  TF_RETURN_IF_ERROR(
+      OptimizeHloModule(module.get(), stream_exec, device_allocator));
   return std::move(module);
 }
 
@@ -428,8 +484,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
 
   TF_RET_CHECK(stream_exec != nullptr);
 
-  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get(), stream_exec,
-                                                   device_allocator));
+  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get()));
 
   llvm::LLVMContext llvm_context;
   std::string buffer;
@@ -464,8 +519,9 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
                           /*color_alignment=*/[](LogicalBuffer::Color) {
                             return kCudaMallocAlignBytes;
                           }));
-  // BufferAssignment::ToString() includes a header, so no need for us to
-  // print one ourselves.
+  // BufferAssignment::Stats::ToString() and BufferAssignment::ToString()
+  // include headers, so no need for us to print them ourselves.
+  XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString());
   XLA_VLOG_LINES(2, buffer_assignment->ToString());
   XLA_VLOG_LINES(2, module->ToString());
   const string xla_dump_optimized_hlo_proto_to =
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
index e3b493c663..88bf5a74fa 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
@@ -78,6 +78,12 @@ StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
       for (int64 i = 0; i < hlo->operand_count() - 2; ++i) {
         TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
       }
+    } else if (IsCustomCallToDnnConvolution(*hlo)) {
+      // The last argument to a CUDNN convolution is its algorithm, which must
+      // be an HLO constant -- it shouldn't be copied.
+      for (int64 i = 0; i < hlo->operand_count() - 1; ++i) {
+        TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
+      }
     } else if (ImplementedAsLibraryCall(*hlo)) {
       // For all other library calls, materialize all the operands into memory.
       for (int64 i = 0; i < hlo->operand_count(); ++i) {
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 58915f1f62..89f1e62588 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -28,122 +28,114 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+// cuDNN convolutions are called with specific layouts on the input, output,
+// and filter:
+//
+//   input: DataLayout::kBatchDepthYX
+//   output: DataLayout::kBatchDepthYX
+//   filter: FilterLayout::kOutputInputYX
+//
+// The order dimensions in the constant name is major-to-minor (eg, the
+// most-major dimension of the input is batch, most-minor is X). The
+// specific dimension numbers these named dimensions correspond to is
+// determined by the ConvolutionDimensionNumbers argument. Y is spatial
+// dimension 0, and X is spatial dimension 1.
+//
+// TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls.
+static Status AddBackendConstraintsToDnnConvCustomCall(
+    HloInstruction* instr, LayoutConstraints* constraints) {
+  CHECK(IsCustomCallToDnnConvolution(*instr)) << instr->ToString();
+  Shape input_shape;
+  Shape filter_shape;
+  Shape output_shape;
+  const auto& target = instr->custom_call_target();
+  if (target == kCudnnConvForwardCallTarget) {
+    input_shape = instr->operand(0)->shape();
+    filter_shape = instr->operand(1)->shape();
+    output_shape = instr->shape().tuple_shapes(0);
+  } else if (target == kCudnnConvBackwardInputCallTarget) {
+    input_shape = instr->shape().tuple_shapes(0);
+    filter_shape = instr->operand(1)->shape();
+    output_shape = instr->operand(0)->shape();
+  } else if (target == kCudnnConvBackwardFilterCallTarget) {
+    input_shape = instr->operand(0)->shape();
+    filter_shape = instr->shape().tuple_shapes(0);
+    output_shape = instr->operand(1)->shape();
+  } else {
+    LOG(FATAL) << "Unexpected custom call target: "
+               << instr->custom_call_target();
+  }
+
+  // Construct minor-to-major dimension orders for operands and result.
+  // cuDNN's convolution APIs support the BDYX layout for activations/output
+  // and the OIYX layout for weights.
+  // TODO(b/29399649): Be more flexible about handling layouts of cuDNN
+  // calls after we switch to cuDNN v5.
+  const ConvolutionDimensionNumbers& dimension_numbers =
+      instr->convolution_dimension_numbers();
+  std::vector<int64> input_layout;
+  for (int i = dimension_numbers.input_spatial_dimensions_size() - 1; i >= 0;
+       --i) {
+    input_layout.push_back(dimension_numbers.input_spatial_dimensions(i));
+  }
+  input_layout.push_back(dimension_numbers.input_feature_dimension());
+  input_layout.push_back(dimension_numbers.input_batch_dimension());
+  *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout);
+
+  std::vector<int64> filter_layout;
+  for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1; i >= 0;
+       --i) {
+    filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i));
+  }
+  filter_layout.push_back(dimension_numbers.kernel_input_feature_dimension());
+  filter_layout.push_back(dimension_numbers.kernel_output_feature_dimension());
+  *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout);
+
+  std::vector<int64> output_layout;
+  for (int i = dimension_numbers.output_spatial_dimensions_size() - 1; i >= 0;
+       --i) {
+    output_layout.push_back(dimension_numbers.output_spatial_dimensions(i));
+  }
+  output_layout.push_back(dimension_numbers.output_feature_dimension());
+  output_layout.push_back(dimension_numbers.output_batch_dimension());
+  *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout);
+
+  // The custom call returns a tuple of (actual_result, scratch_buffer);
+  // call_result_buf is the logical buffer for actual_result, the thing that
+  // contains the result of the conv call.
+  TF_ASSIGN_OR_RETURN(const LogicalBuffer* call_result_buf,
+                      constraints->points_to_analysis().GetBufferDefinedAt(
+                          instr, /*index=*/{0}));
+
+  // Set layouts of the instructions' shapes.
+  if (target == kCudnnConvForwardCallTarget) {
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0));
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1));
+    TF_RETURN_IF_ERROR(
+        constraints->SetBufferLayout(output_shape.layout(), *call_result_buf));
+  } else if (target == kCudnnConvBackwardInputCallTarget) {
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 0));
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1));
+    TF_RETURN_IF_ERROR(
+        constraints->SetBufferLayout(input_shape.layout(), *call_result_buf));
+  } else if (target == kCudnnConvBackwardFilterCallTarget) {
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0));
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 1));
+    TF_RETURN_IF_ERROR(
+        constraints->SetBufferLayout(filter_shape.layout(), *call_result_buf));
+  } else {
+    LOG(FATAL) << "Unexpected custom call target: "
+               << instr->custom_call_target();
+  }
+  return Status::OK();
+}
+
 Status GpuLayoutAssignment::AddBackendConstraints(
     LayoutConstraints* constraints) {
   for (auto* instruction : constraints->computation()->instructions()) {
-    // cuDNN is called with specific layouts on the input, output, and filter:
-    //
-    //   input: DataLayout::kBatchDepthYX
-    //   output: DataLayout::kBatchDepthYX
-    //   filter: FilterLayout::kOutputInputYX
-    //
-    // The order dimensions in the constant name is major-to-minor (eg, the
-    // most-major dimension of the input is batch, most-minor is X). The
-    // specific dimension numbers these named dimensions correspond to is
-    // determined by the ConvolutionDimensionNumbers argument. Y is spatial
-    // dimension 0, and X is spatial dimension 1.
-    //
-    // TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls.
-    if (ImplementedAsDnnConvolution(*instruction)) {
-      HloInstruction* input = nullptr;
-      HloInstruction* filter = nullptr;
-      HloInstruction* output = nullptr;
-      if (instruction->opcode() == HloOpcode::kConvolution) {
-        input = instruction->mutable_operand(0);
-        filter = instruction->mutable_operand(1);
-        output = instruction;
-      } else {
-        CHECK_EQ(HloOpcode::kFusion, instruction->opcode());
-        switch (instruction->fusion_kind()) {
-          case HloInstruction::FusionKind::kConvBackwardFilter:
-            // filter = BackwardFilterConvolve(input, output)
-            input = instruction->mutable_operand(0);
-            filter = instruction;
-            output = instruction->mutable_operand(1);
-            break;
-          case HloInstruction::FusionKind::kConvBackwardInput:
-            // input = BackwardInputConvolve(output, filter)
-            input = instruction;
-            filter = instruction->mutable_operand(1);
-            output = instruction->mutable_operand(0);
-            break;
-          default:
-            LOG(FATAL) << "Not a convolution-fusion";
-        }
-      }
-
-      // Construct minor-to-major dimension orders for operands and result.
-      // cuDNN's convolution APIs support the BDYX layout for activations/output
-      // and the OIYX layout for weights.
-      // TODO(b/29399649): Be more flexible about handling layouts of cuDNN
-      // calls after we switch to cuDNN v5.
-      const ConvolutionDimensionNumbers& dimension_numbers =
-          instruction->convolution_dimension_numbers();
-      std::vector<int64> input_layout;
-      for (int i = dimension_numbers.input_spatial_dimensions_size() - 1;
-           i >= 0; --i) {
-        input_layout.push_back(dimension_numbers.input_spatial_dimensions(i));
-      }
-      input_layout.push_back(dimension_numbers.input_feature_dimension());
-      input_layout.push_back(dimension_numbers.input_batch_dimension());
-      Shape input_shape(input->shape());
-      *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout);
-
-      std::vector<int64> filter_layout;
-      for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1;
-           i >= 0; --i) {
-        filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i));
-      }
-      filter_layout.push_back(
-          dimension_numbers.kernel_input_feature_dimension());
-      filter_layout.push_back(
-          dimension_numbers.kernel_output_feature_dimension());
-      Shape filter_shape(filter->shape());
-      *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout);
-
-      std::vector<int64> output_layout;
-      for (int i = dimension_numbers.output_spatial_dimensions_size() - 1;
-           i >= 0; --i) {
-        output_layout.push_back(dimension_numbers.output_spatial_dimensions(i));
-      }
-      output_layout.push_back(dimension_numbers.output_feature_dimension());
-      output_layout.push_back(dimension_numbers.output_batch_dimension());
-      Shape output_shape(output->shape());
-      *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout);
-
-      // Set layouts of the instructions' shapes.
-      if (instruction->opcode() == HloOpcode::kConvolution) {
-        TF_RETURN_IF_ERROR(
-            constraints->SetOperandLayout(input_shape, output, 0));
-        TF_RETURN_IF_ERROR(
-            constraints->SetOperandLayout(filter_shape, output, 1));
-        TF_RETURN_IF_ERROR(
-            constraints->SetInstructionLayout(output_shape, output));
-      } else {
-        CHECK_EQ(HloOpcode::kFusion, instruction->opcode());
-        switch (instruction->fusion_kind()) {
-          case HloInstruction::FusionKind::kConvBackwardFilter:
-            // filter = BackwardFilterConvolve(input, output)
-            TF_RETURN_IF_ERROR(
-                constraints->SetOperandLayout(input_shape, filter, 0));
-            TF_RETURN_IF_ERROR(
-                constraints->SetInstructionLayout(filter_shape, filter));
-            TF_RETURN_IF_ERROR(
-                constraints->SetOperandLayout(output_shape, filter, 1));
-            break;
-          case HloInstruction::FusionKind::kConvBackwardInput:
-            // input = BackwardInputConvolve(output, filter)
-            TF_RETURN_IF_ERROR(
-                constraints->SetInstructionLayout(input_shape, input));
-            TF_RETURN_IF_ERROR(
-                constraints->SetOperandLayout(output_shape, input, 0));
-            TF_RETURN_IF_ERROR(
-                constraints->SetOperandLayout(filter_shape, input, 1));
-            break;
-          default:
-            LOG(FATAL) << "Not a convolution-fusion";
-        }
-      }
+    if (IsCustomCallToDnnConvolution(*instruction)) {
+      TF_RETURN_IF_ERROR(
+          AddBackendConstraintsToDnnConvCustomCall(instruction, constraints));
     }
   }
   return Status::OK();
@@ -151,9 +143,12 @@ Status GpuLayoutAssignment::AddBackendConstraints(
 
 bool GpuLayoutAssignment::CustomCallRequiresMajorFirstLayout(
     const HloInstruction* instruction) {
-  // Inputs to cudnn batchnorm custom calls don't need the major-first layout
-  // (i.e. {n, n-1, ...0}) -- we can handle any layout.
-  return !IsCustomCallToDnnBatchNorm(*instruction);
+  // - Inputs to cudnn batchnorm custom calls don't need the major-first layout
+  //   (i.e. {n, n-1, ...0}) -- we can handle any layout.
+  // - Inputs to cudnn convolution require custom layouts handled in
+  //   AddBackendConstraints.
+  return !IsCustomCallToDnnBatchNorm(*instruction) &&
+         !IsCustomCallToDnnConvolution(*instruction);
 }
 
 Status GpuLayoutAssignment::PropagateOperandConstraint(
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 1d47ffde43..2d6dad27a5 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -137,49 +137,6 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) {
                    .ValueOrDie());
 }
 
-TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfConvolutionUnfused) {
-  HloComputation::Builder builder(TestName());
-  auto input = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {1, 1, 1, 3}), "input"));
-  auto filter = builder.AddInstruction(HloInstruction::CreateParameter(
-      1, ShapeUtil::MakeShape(F32, {1, 1, 1, 2}), "filter"));
-
-  Window conv_window;
-  WindowDimension* conv_window_row = conv_window.add_dimensions();
-  conv_window_row->set_size(1);
-  WindowDimension* conv_window_col = conv_window.add_dimensions();
-  conv_window_col->set_size(2);
-  conv_window_col->set_padding_high(1);
-
-  ConvolutionDimensionNumbers conv_dnums;
-  conv_dnums.set_input_batch_dimension(0);
-  conv_dnums.set_output_batch_dimension(0);
-  conv_dnums.set_input_feature_dimension(1);
-  conv_dnums.set_output_feature_dimension(1);
-  conv_dnums.add_input_spatial_dimensions(2);
-  conv_dnums.add_output_spatial_dimensions(2);
-  conv_dnums.add_input_spatial_dimensions(3);
-  conv_dnums.add_output_spatial_dimensions(3);
-  conv_dnums.set_kernel_output_feature_dimension(0);
-  conv_dnums.set_kernel_input_feature_dimension(1);
-  conv_dnums.add_kernel_spatial_dimensions(2);
-  conv_dnums.add_kernel_spatial_dimensions(3);
-
-  auto conv = builder.AddInstruction(
-      HloInstruction::CreateConvolve(ShapeUtil::MakeShape(F32, {1, 1, 1, 3}),
-                                     input, filter, conv_window, conv_dnums));
-  auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose(
-      ShapeUtil::MakeShape(F32, {3, 1, 1, 1}), conv, {3, 2, 1, 0}));
-  builder.AddInstruction(
-      HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), transpose));
-
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module.get())
-                   .ValueOrDie());
-}
-
 TEST_F(InstructionFusionTest, GetTupleElementFused) {
   HloComputation::Builder builder(TestName());
   Shape data_shape = ShapeUtil::MakeShape(F32, {8});
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 76566a9e3d..2f65edffea 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -90,43 +90,6 @@ bool ImplementedAsGemm(const HloInstruction& hlo) {
   return false;
 }
 
-bool ImplementedAsDnnConvolution(const HloInstruction& hlo) {
-  // We can only do this if the HLO is unnested.
-  if (hlo.parent() != hlo.GetModule()->entry_computation()) {
-    return false;
-  }
-
-  // Forward convolution.
-  if (hlo.opcode() == HloOpcode::kConvolution) {
-    const ConvolutionDimensionNumbers& dnums =
-        hlo.convolution_dimension_numbers();
-    if (dnums.input_spatial_dimensions_size() > 3) {
-      return false;
-    }
-
-    // CuDNN does not accept zero-element arguments
-    if (ShapeUtil::HasZeroElements(hlo.operand(0)->shape()) ||
-        ShapeUtil::HasZeroElements(hlo.operand(1)->shape())) {
-      return false;
-    }
-
-    if (window_util::HasWindowReversal(hlo.window())) {
-      return false;
-    }
-
-    return true;
-  }
-
-  // Backward convolution.
-  if (hlo.opcode() == HloOpcode::kFusion &&
-      (hlo.fusion_kind() == HloInstruction::FusionKind::kConvBackwardFilter ||
-       hlo.fusion_kind() == HloInstruction::FusionKind::kConvBackwardInput)) {
-    return true;
-  }
-
-  return false;
-}
-
 const char* const kCudnnBatchNormForwardInferenceCallTarget =
     "__cudnn$batchNormalizationForwardInference";
 const char* const kCudnnBatchNormForwardTrainingCallTarget =
@@ -144,9 +107,76 @@ bool IsCustomCallToDnnBatchNorm(const HloInstruction& hlo) {
          target == kCudnnBatchNormBackwardCallTarget;
 }
 
+const char* const kCudnnConvForwardCallTarget = "__cudnn$convForward";
+const char* const kCudnnConvBackwardInputCallTarget =
+    "__cudnn$convBackwardInput";
+const char* const kCudnnConvBackwardFilterCallTarget =
+    "__cudnn$convBackwardFilter";
+
+bool IsCustomCallToDnnConvolution(const HloInstruction& hlo) {
+  if (hlo.opcode() != HloOpcode::kCustomCall) {
+    return false;
+  }
+  const auto& target = hlo.custom_call_target();
+  return target == kCudnnConvForwardCallTarget ||
+         target == kCudnnConvBackwardInputCallTarget ||
+         target == kCudnnConvBackwardFilterCallTarget;
+}
+
 bool ImplementedAsLibraryCall(const HloInstruction& hlo) {
-  return ImplementedAsGemm(hlo) || ImplementedAsDnnConvolution(hlo) ||
-         IsCustomCallToDnnBatchNorm(hlo);
+  return ImplementedAsGemm(hlo) || IsCustomCallToDnnBatchNorm(hlo) ||
+         IsCustomCallToDnnConvolution(hlo);
+}
+
+static HloInstruction* CreateCudnnConv(
+    const char* call_target, const Shape& shape, HloInstruction* lhs,
+    HloInstruction* rhs, const Window& window,
+    const ConvolutionDimensionNumbers& dnums) {
+  HloComputation* computation = lhs->parent();
+
+  // This call returns a tuple of (conv_result, scratch_memory), where
+  // conv_result is the actual result of the convolution, and scratch_memory is
+  // temporary memory used by cudnn.
+  //
+  // At the moment, we don't know how much scratch memory this conv is going to
+  // use, so we put u8[0] in this place.  Later on another pass will choose
+  // which conv algorithm to use, and at that point we'll modify the shape of
+  // this second tuple element.
+  Shape call_shape =
+      ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})});
+
+  // Our CustomCall takes three arguments: The conv lhs and rhs, and the cudnn
+  // algorithm to use.  It's up to a later pass to choose the algorithm, so to
+  // indicate that we haven't yet made a choice, we speicfy -1 for that arg.
+  HloInstruction* negative_one = computation->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int64>(-1)));
+  HloInstruction* custom_call =
+      computation->AddInstruction(HloInstruction::CreateCustomCall(
+          call_shape, {lhs, rhs, negative_one}, call_target));
+  custom_call->set_window(window);
+  custom_call->set_convolution_dimension_numbers(dnums);
+  return custom_call;
+}
+
+HloInstruction* CreateCudnnConvForward(
+    const Shape& shape, HloInstruction* input, HloInstruction* kernel,
+    const Window& window, const ConvolutionDimensionNumbers& dnums) {
+  return CreateCudnnConv(kCudnnConvForwardCallTarget, shape, input, kernel,
+                         window, dnums);
+}
+
+HloInstruction* CreateCudnnConvBackwardInput(
+    const Shape& shape, HloInstruction* output, HloInstruction* reverse_filter,
+    const Window& window, const ConvolutionDimensionNumbers& dnums) {
+  return CreateCudnnConv(kCudnnConvBackwardInputCallTarget, shape, output,
+                         reverse_filter, window, dnums);
+}
+
+HloInstruction* CreateCudnnConvBackwardFilter(
+    const Shape& shape, HloInstruction* input, HloInstruction* output,
+    const Window& window, const ConvolutionDimensionNumbers& dnums) {
+  return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, shape, input,
+                         output, window, dnums);
 }
 
 bool IsReductionToVector(const HloInstruction& reduce) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
index d24ed9879d..7ad9680bfb 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -22,6 +22,9 @@ limitations under the License.
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 
+// TODO(jlebar): Move functions related to cublas/cudnn to a separate file; they
+// don't belong in "ir_emission_utils".
+
 namespace xla {
 namespace gpu {
 
@@ -30,9 +33,6 @@ constexpr int64 kWarpSize = 32;
 // Returns true if `hlo` will be implemented as a call to BLAS gemm.
 bool ImplementedAsGemm(const HloInstruction& hlo);
 
-// Returns true if `hlo` will be implemented as a call to cuDNN convolution.
-bool ImplementedAsDnnConvolution(const HloInstruction& hlo);
-
 // A call to cuDNN for batch normalization is represented as CustomCall HLO with
 // a call target equal to one of these strings.
 //
@@ -58,6 +58,60 @@ extern const char* const kCudnnBatchNormBackwardCallTarget;
 // sequence of generic HLOs or to a cuDNN CustomCall.
 bool IsCustomCallToDnnBatchNorm(const HloInstruction& hlo);
 
+// A call to cuDNN for convolution (forward, backward filter, or backward input)
+// is represented as a CustomCall HLO with a call target equal to one of these
+// strings.
+//
+// These CustomCalls have window() and convolution_dimension_numbers() set like
+// regular convolution ops.  They have the same LHS and RHS operands, plus one
+// additional int64 operand, representing which cudnn algorithm to run.  This
+// operand must be an HLO constant.  A value of -1 means that the implementation
+// is free to choose the best algorithm it can.
+//
+// These calls output a tuple (conv_result, scratch_memory), where conv_result
+// is the actual result of the convolution, and scratch_memory is temporary
+// memory used by cudnn.  Callers shouldn't inspect scratch_memory, as its value
+// is not well-defined.
+//
+// CudnnConvolutionRewriter lowers kConvolution HLOs to these custom calls.
+// When it does so, it chooses algorithm -1 and 0 bytes of scratch space.  Later
+// on in the pipeline, CudnnConvolutionAlgorithmChooser chooses an explicit
+// algorithm for each conv and sets the amount of scratch space needed.
+//
+// (Representing the scratch memory as an output may seem strange at first, but
+// it's quite sensible, from a certain point of view.  The scratch buffer is a
+// location in memory that the conv can write into, but which it can't legally
+// read from, at least until it's written something first.  But that's exactly
+// the definition of an output buffer.)
+extern const char* const kCudnnConvForwardCallTarget;
+extern const char* const kCudnnConvBackwardInputCallTarget;
+extern const char* const kCudnnConvBackwardFilterCallTarget;
+
+// Returns true if `hlo` will be implemented as a call to a cuDNN convolution
+// routine.
+//
+// This returns true if `hlo` is a CustomCall HLO with a call target equal to
+// one of the kCudnnConvFoo constants above, but returns *false* for HLOs with a
+// kConvolution opcode.
+bool IsCustomCallToDnnConvolution(const HloInstruction& hlo);
+
+// Creates a CustomCall for a cudnn forward/backward-input/backward-filter conv.
+// Note that these CustomCalls return a tuple (conv_result, scratch_memory).  If
+// you want just the conv result, you'll need to get-tuple-element the value
+// returned by this function.
+//
+// The created cudnn call will use the default cudnn algorithm and no scratch
+// space.
+HloInstruction* CreateCudnnConvForward(
+    const Shape& shape, HloInstruction* input, HloInstruction* kernel,
+    const Window& window, const ConvolutionDimensionNumbers& dnums);
+HloInstruction* CreateCudnnConvBackwardInput(
+    const Shape& shape, HloInstruction* output, HloInstruction* reverse_filter,
+    const Window& window, const ConvolutionDimensionNumbers& dnums);
+HloInstruction* CreateCudnnConvBackwardFilter(
+    const Shape& shape, HloInstruction* input, HloInstruction* output,
+    const Window& window, const ConvolutionDimensionNumbers& dnums);
+
 // Returns true if `hlo` will be implemented as a library call, e.g. cuBLAS gemm
 // or cuDNN convolution.
 bool ImplementedAsLibraryCall(const HloInstruction& hlo);
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 3aa178410f..9031a838f9 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -336,9 +336,6 @@ class IrEmitterUnnested : public IrEmitter {
   // Thunk object.
   std::unique_ptr<Thunk> BuildKernelThunk(const HloInstruction* inst);
 
-  // Returns a ConvolutionThunk that calls DNN to implement `inst`.
-  std::unique_ptr<Thunk> BuildConvolutionThunk(const HloInstruction* inst);
-
   // Returns a FftThunk that calls cuFFT to implement `inst`.
   std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index bd428f8028..40725739e2 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
 #include "tensorflow/compiler/xla/service/gpu/fft_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/for_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h"
@@ -278,10 +279,6 @@ Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) {
 }
 
 Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution) {
-  if (ImplementedAsDnnConvolution(*convolution)) {
-    thunk_sequence_->emplace_back(BuildConvolutionThunk(convolution));
-    return Status::OK();
-  }
   thunk_sequence_->emplace_back(BuildKernelThunk(convolution));
   return IrEmitter::HandleConvolution(convolution);
 }
@@ -380,6 +377,71 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
     return Status::OK();
   }
 
+  if (IsCustomCallToDnnConvolution(*custom_call)) {
+    const auto& assn = ir_emitter_context_->buffer_assignment();
+    const auto& lhs_shape = custom_call->operand(0)->shape();
+    const auto& rhs_shape = custom_call->operand(1)->shape();
+    const auto& conv_result_shape = custom_call->shape().tuple_shapes(0);
+    auto lhs_slice = GetAllocationSlice(*custom_call->operand(0));
+    auto rhs_slice = GetAllocationSlice(*custom_call->operand(1));
+    auto tuple_result_slice = GetAllocationSlice(*custom_call);
+    auto conv_result_slice = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie();
+    auto scratch_slice = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie();
+
+    const HloInstruction* algorithm_inst = custom_call->operand(2);
+    CHECK(algorithm_inst->IsConstant()) << algorithm_inst->ToString();
+    int64 algorithm = algorithm_inst->literal().Get<int64>({});
+
+    const auto& target = custom_call->custom_call_target();
+    std::unique_ptr<ConvolutionThunk> thunk;
+    if (target == kCudnnConvForwardCallTarget) {
+      thunk = MakeUnique<ConvolutionThunk>(
+          CudnnConvKind::kForward,
+          /*input_buffer=*/lhs_slice,
+          /*filter_buffer=*/rhs_slice,
+          /*output_buffer=*/conv_result_slice,
+          /*tuple_result_buffer=*/tuple_result_slice,
+          /*scratch_buffer=*/scratch_slice,
+          /*input_shape=*/lhs_shape,
+          /*filter_shape=*/rhs_shape,
+          /*output_shape=*/conv_result_shape,  //
+          custom_call->window(), custom_call->convolution_dimension_numbers(),
+          algorithm, custom_call);
+    } else if (target == kCudnnConvBackwardInputCallTarget) {
+      thunk = MakeUnique<ConvolutionThunk>(
+          CudnnConvKind::kBackwardInput,
+          /*input_buffer=*/conv_result_slice,
+          /*filter_buffer=*/rhs_slice,
+          /*output_buffer=*/lhs_slice,
+          /*tuple_result_buffer=*/tuple_result_slice,
+          /*scratch_buffer=*/scratch_slice,
+          /*input_shape=*/conv_result_shape,
+          /*filter_shape=*/rhs_shape,
+          /*output_shape=*/lhs_shape,  //
+          custom_call->window(), custom_call->convolution_dimension_numbers(),
+          algorithm, custom_call);
+    } else if (target == kCudnnConvBackwardFilterCallTarget) {
+      thunk = MakeUnique<ConvolutionThunk>(
+          CudnnConvKind::kBackwardFilter,
+          /*input_buffer=*/lhs_slice,
+          /*filter_buffer=*/conv_result_slice,
+          /*output_buffer=*/rhs_slice,
+          /*tuple_result_buffer=*/tuple_result_slice,
+          /*scratch_buffer=*/scratch_slice,
+          /*input_shape=*/lhs_shape,
+          /*filter_shape=*/conv_result_shape,
+          /*output_shape=*/rhs_shape,  //
+          custom_call->window(), custom_call->convolution_dimension_numbers(),
+          algorithm, custom_call);
+    } else {
+      LOG(FATAL) << "Unexpected custom call target: "
+                 << custom_call->custom_call_target();
+    }
+
+    thunk_sequence_->emplace_back(std::move(thunk));
+    return Status::OK();
+  }
+
   return IrEmitter::HandleCustomCall(custom_call);
 }
 
@@ -500,10 +562,6 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     thunk_sequence_->emplace_back(BuildGemmThunk(fusion));
     return Status::OK();
   }
-  if (ImplementedAsDnnConvolution(*fusion)) {
-    thunk_sequence_->emplace_back(BuildConvolutionThunk(fusion));
-    return Status::OK();
-  }
   thunk_sequence_->emplace_back(BuildKernelThunk(fusion));
   return IrEmitter::HandleFusion(fusion);
 }
@@ -2011,52 +2069,6 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
   LOG(FATAL) << "Cannot build a GemmThunk for " << inst->ToString();
 }
 
-std::unique_ptr<Thunk> IrEmitterUnnested::BuildConvolutionThunk(
-    const HloInstruction* inst) {
-  const HloInstruction* lhs = inst->operand(0);
-  const HloInstruction* rhs = inst->operand(1);
-  if (inst->opcode() == HloOpcode::kConvolution) {
-    // Forward covolution.
-    return MakeUnique<ConvolutionThunk>(
-        ConvolutionThunk::ConvolutionKind::kForward,
-        /*input_buffer=*/GetAllocationSlice(*lhs),
-        /*filter_buffer=*/GetAllocationSlice(*rhs),
-        /*output_buffer=*/GetAllocationSlice(*inst),
-        /*input_shape=*/lhs->shape(),
-        /*filter_shape=*/rhs->shape(),
-        /*output_shape=*/inst->shape(), inst->window(),
-        inst->convolution_dimension_numbers(), inst);
-  }
-
-  // Backward filter convolution, which takes the input (activations) and the
-  // gradients, and computes the filter.
-  CHECK_EQ(HloOpcode::kFusion, inst->opcode());
-  switch (inst->fusion_kind()) {
-    case HloInstruction::FusionKind::kConvBackwardFilter:
-      return MakeUnique<ConvolutionThunk>(
-          ConvolutionThunk::ConvolutionKind::kBackwardFilter,
-          /*input_buffer=*/GetAllocationSlice(*lhs),
-          /*filter_buffer=*/GetAllocationSlice(*inst),
-          /*output_buffer=*/GetAllocationSlice(*rhs),
-          /*input_shape=*/lhs->shape(),
-          /*filter_shape=*/inst->shape(),
-          /*output_shape=*/rhs->shape(), inst->window(),
-          inst->convolution_dimension_numbers(), inst);
-    case HloInstruction::FusionKind::kConvBackwardInput:
-      return MakeUnique<ConvolutionThunk>(
-          ConvolutionThunk::ConvolutionKind::kBackwardInput,
-          /*input_buffer=*/GetAllocationSlice(*inst),
-          /*filter_buffer=*/GetAllocationSlice(*rhs),
-          /*output_buffer=*/GetAllocationSlice(*lhs),
-          /*input_shape=*/inst->shape(),
-          /*filter_shape=*/rhs->shape(),
-          /*output_shape=*/lhs->shape(), inst->window(),
-          inst->convolution_dimension_numbers(), inst);
-    default:
-      LOG(FATAL) << "Not a convolution-fusion";
-  }
-}
-
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildFftThunk(
     const HloInstruction* inst) {
   const HloInstruction* operand = inst->operand(0);
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
index 2923a79af0..25846dc6cd 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
@@ -27,7 +27,7 @@ namespace gpu {
 
 namespace {
 bool IsForwardConvolutionCanonical(const HloInstruction& conv) {
-  CHECK_EQ(HloOpcode::kConvolution, conv.opcode());
+  CHECK_EQ(conv.custom_call_target(), kCudnnConvForwardCallTarget);
   return window_util::HasSymmetricPadding(conv.window()) &&
          !window_util::HasNegativePadding(conv.window()) &&
          !window_util::HasDilation(conv.window());
@@ -47,6 +47,12 @@ HloInstruction* MaybePaddedAndSlicedInput(
       window_util::HasBaseDilation(conv_window)) {
     // If padding is uneven or has dilation, we insert a kPad instruction that
     // applies positive padding and dilation.
+    //
+    // TODO(phawkins): If conv_window has asymmetric padding, perhaps instead of
+    // moving all the padding into an explicit pad op, we should keep as much
+    // padding inside of cudnn as possible, on the assumption that padding
+    // within cudnn is basically free, whereas a kPad's cost increases as the
+    // amount of padding increases.
     PaddingConfig padding_config =
         MakeNoPaddingConfig(input->shape().dimensions_size());
     for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
@@ -167,14 +173,17 @@ bool PadInsertion::CanonicalizeForwardConvolution(HloInstruction* conv) {
     dim->set_window_dilation(1);
   }
 
+  // The conv CustomCall returns a tuple (conv_result, scratch_buffer).  Extract
+  // out the shape of conv_result.
+  Shape old_conv_shape = conv->shape().tuple_shapes(0);
+
   VLOG(1) << "Canonicalizing forward conv";
-  auto new_conv = HloInstruction::CreateConvolve(
-      conv->shape(), new_input, new_kernel, new_conv_window,
-      conv->convolution_dimension_numbers());
+  auto new_conv = CreateCudnnConvForward(old_conv_shape, new_input, new_kernel,
+                                         new_conv_window,
+                                         conv->convolution_dimension_numbers());
   VLOG(1) << "Replacing:\n  " << conv->ToString() << "\nwith:\n  "
           << new_conv->ToString();
-  TF_CHECK_OK(
-      conv->parent()->ReplaceWithNewInstruction(conv, std::move(new_conv)));
+  TF_CHECK_OK(conv->parent()->ReplaceInstruction(conv, new_conv));
   return true;
 }
 
@@ -190,6 +199,8 @@ void IncreasePaddingHighBy(int64 delta, WindowDimension* window_dim) {
 
 bool PadInsertion::CanonicalizeBackwardFilterConvolution(
     HloInstruction* backward_conv) {
+  CHECK_EQ(backward_conv->custom_call_target(),
+           kCudnnConvBackwardFilterCallTarget);
   if (window_util::HasSymmetricPadding(backward_conv->window())) {
     return false;
   }
@@ -202,15 +213,11 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
   //   ABCD0 = Pad(ABCD, padding_high=1)
   //   BackwardFilterConv(ABCD0, xyz, padding_low=pading_high=1)
   // We choose the lesser of padding_low and padding_high as the new padding.
-  HloInstruction* forward_conv = backward_conv->fused_expression_root();
   HloInstruction* input = backward_conv->mutable_operand(0);
-  Window new_forward_conv_window = forward_conv->window();
   Window new_backward_conv_window = backward_conv->window();
   // input_padding_config is the config of the kPad to be inserted.
   PaddingConfig input_padding_config =
       MakeNoPaddingConfig(ShapeUtil::Rank(input->shape()));
-  ConvolutionDimensionNumbers forward_conv_dnums =
-      forward_conv->convolution_dimension_numbers();
   ConvolutionDimensionNumbers backward_conv_dnums =
       backward_conv->convolution_dimension_numbers();
   for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
@@ -222,11 +229,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
       // cuDNN convolution (which doesn't support negative padding) to fail.
       return false;
     }
-    // If the backward convolution has uneven padding on the activations, we
-    // move some padding on the larger end to "internal" padding, so that the
-    // backward convolution produces larger weight gradients which get sliced
-    // later. Therefore, the amount of new padding (low or high) is the minimum
-    // of the amount of old padding low and old padding high.
+    // Compute the new, even padding for the backward conv operation.
     int64 new_conv_padding = std::min(padding_low, padding_high);
     int64 dim = backward_conv_dnums.input_spatial_dimensions(i);
     input_padding_config.mutable_dimensions(dim)->set_edge_padding_low(
@@ -237,14 +240,9 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
     // Since we move some padding from the backward convolution to the kPad, we
     // need to accordingly reduce the padding amount of the backward convolution
     // and its inner forward convolution.
-    IncreasePaddingLowBy(-(padding_low - new_conv_padding),
-                         new_backward_conv_window.mutable_dimensions(i));
-    IncreasePaddingHighBy(-(padding_high - new_conv_padding),
-                          new_backward_conv_window.mutable_dimensions(i));
-    IncreasePaddingLowBy(-(padding_low - new_conv_padding),
-                         new_forward_conv_window.mutable_dimensions(i));
-    IncreasePaddingHighBy(-(padding_high - new_conv_padding),
-                          new_forward_conv_window.mutable_dimensions(i));
+    auto* new_dim = new_backward_conv_window.mutable_dimensions(i);
+    new_dim->set_padding_low(new_conv_padding);
+    new_dim->set_padding_high(new_conv_padding);
   }
 
   // Create a new backward convolution replacing the old one.
@@ -260,19 +258,12 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
               .ConsumeValueOrDie(),
           input, padding, input_padding_config));
 
-  HloInstruction* new_forward_conv =
-      computation->AddInstruction(HloInstruction::CreateConvolve(
-          ShapeInference::InferConvolveShape(
-              padded_input->shape(), output->shape(), new_forward_conv_window,
-              forward_conv_dnums)
-              .ConsumeValueOrDie(),
-          padded_input, output, new_forward_conv_window, forward_conv_dnums));
-
-  // Fuse the new forward convolution to the new backward convolution.
-  HloInstruction* new_backward_conv =
-      computation->CreateFusionInstructionForBackwardConvolution(
-          {new_forward_conv}, HloInstruction::FusionKind::kConvBackwardFilter,
-          new_backward_conv_window, backward_conv_dnums);
+  // The shape of the backward_conv CustomCall is a tuple (conv_result,
+  // scratch_buffer).  Extract out the shape of conv_result.
+  Shape backward_conv_shape = backward_conv->shape().tuple_shapes(0);
+  HloInstruction* new_backward_conv = CreateCudnnConvBackwardFilter(
+      backward_conv_shape, padded_input, output, new_backward_conv_window,
+      backward_conv_dnums);
 
   VLOG(1) << "Canonicalizing backward filter conv";
   VLOG(1) << "Replacing:\n  " << backward_conv->ToString() << "\nwith:\n  "
@@ -289,14 +280,15 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
     return false;
   }
 
-  HloInstruction* forward_conv = backward_conv->fused_expression_root();
-  HloInstruction* reverse_filter = forward_conv->mutable_operand(1);
-  Window new_forward_conv_window = forward_conv->window();
   Window new_backward_conv_window = backward_conv->window();
-  ConvolutionDimensionNumbers forward_conv_dnums =
-      forward_conv->convolution_dimension_numbers();
   ConvolutionDimensionNumbers backward_conv_dnums =
       backward_conv->convolution_dimension_numbers();
+
+  // The backward_conv CustomCall returns a tuple (conv_result, scratch_memory).
+  // Get the shape of conv_result.
+  Shape backward_conv_shape = backward_conv->shape().tuple_shapes(0);
+
+  Shape new_backward_conv_shape = backward_conv_shape;
   for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
     int64 padding_low = backward_conv->window().dimensions(i).padding_low();
     int64 padding_high = backward_conv->window().dimensions(i).padding_high();
@@ -315,41 +307,38 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
     // where the amount of padding low is larger, we can canonicalize it to
     //   [B A] = BackwardInputConvolve([a b], [x y z], padding=(low=1,high=1))
     //   [A] = Slice([B A])
-    // For consistency, we need to increase the low padding of the inner
-    // convolution by 1 as well because the input is larger now.
     if (padding_low > padding_high) {
       IncreasePaddingLowBy(padding_high - padding_low,
                            new_backward_conv_window.mutable_dimensions(i));
-      IncreasePaddingLowBy(padding_low - padding_high,
-                           new_forward_conv_window.mutable_dimensions(i));
     } else if (padding_low < padding_high) {
       IncreasePaddingHighBy(padding_low - padding_high,
                             new_backward_conv_window.mutable_dimensions(i));
-      IncreasePaddingHighBy(padding_high - padding_low,
-                            new_forward_conv_window.mutable_dimensions(i));
     }
+    // Decreasing the padding by X *increases* the size of our output by X.
+    int64 dim = backward_conv_dnums.output_spatial_dimensions(i);
+    new_backward_conv_shape.set_dimensions(
+        dim, new_backward_conv_shape.dimensions(dim) +
+                 std::abs(padding_low - padding_high));
   }
 
   // Create a new backward convolution replacing the old one.
   HloComputation* computation = backward_conv->parent();
   HloInstruction* output = backward_conv->mutable_operand(0);
   HloInstruction* filter = backward_conv->mutable_operand(1);
-  HloInstruction* new_reverse_filter =
-      computation->AddInstruction(HloInstruction::CreateReverse(
-          filter->shape(), filter, reverse_filter->dimensions()));
-  HloInstruction* new_forward_conv =
-      computation->AddInstruction(HloInstruction::CreateConvolve(
-          ShapeInference::InferConvolveShape(
-              output->shape(), new_reverse_filter->shape(),
-              new_forward_conv_window, forward_conv_dnums)
-              .ConsumeValueOrDie(),
-          output, new_reverse_filter, new_forward_conv_window,
-          forward_conv_dnums));
+
+  HloInstruction* new_backward_conv_call = CreateCudnnConvBackwardInput(
+      new_backward_conv_shape, output, filter, new_backward_conv_window,
+      backward_conv_dnums);
+
+  // The CustomCall created above returns a tuple (conv_result, scratch_memory).
+  // Extract out the two elements.
   HloInstruction* new_backward_conv =
-      computation->CreateFusionInstructionForBackwardConvolution(
-          {new_forward_conv, new_reverse_filter},
-          HloInstruction::FusionKind::kConvBackwardInput,
-          new_backward_conv_window, backward_conv_dnums);
+      computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+          new_backward_conv_shape, new_backward_conv_call, 0));
+  HloInstruction* new_backward_conv_scratch =
+      computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+          new_backward_conv_call->shape().tuple_shapes(1),
+          new_backward_conv_call, 1));
 
   // Slice the new backward convolution.
   //
@@ -377,22 +366,25 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
   }
 
   // Replace the old backward convolution with the slice.
-  CHECK(ShapeUtil::Compatible(
+  Shape slice_shape =
       ShapeInference::InferSliceShape(new_backward_conv->shape(), start_indices,
                                       limit_indices, strides)
-          .ConsumeValueOrDie(),
-      backward_conv->shape()));
+          .ConsumeValueOrDie();
+  CHECK(ShapeUtil::Compatible(slice_shape, backward_conv_shape))
+      << ShapeUtil::HumanString(slice_shape) << " vs "
+      << ShapeUtil::HumanString(backward_conv_shape);
 
-  auto slice =
-      HloInstruction::CreateSlice(backward_conv->shape(), new_backward_conv,
-                                  start_indices, limit_indices, strides);
+  HloInstruction* slice = computation->AddInstruction(
+      HloInstruction::CreateSlice(backward_conv_shape, new_backward_conv,
+                                  start_indices, limit_indices, strides));
+  HloInstruction* new_tuple = computation->AddInstruction(
+      HloInstruction::CreateTuple({slice, new_backward_conv_scratch}));
 
   VLOG(1) << "Canonicalizing backward input conv";
   VLOG(1) << "Replacing:\n  " << backward_conv->ToString() << "\nwith:\n  "
-          << slice->ToString();
+          << new_tuple->ToString();
 
-  TF_CHECK_OK(
-      computation->ReplaceWithNewInstruction(backward_conv, std::move(slice)));
+  TF_CHECK_OK(computation->ReplaceInstruction(backward_conv, new_tuple));
   return true;
 }
 
@@ -400,18 +392,17 @@ StatusOr<bool> PadInsertion::Run(HloModule* module) {
   bool changed = false;
   for (HloInstruction* instruction :
        module->entry_computation()->MakeInstructionPostOrder()) {
-    if (instruction->opcode() == HloOpcode::kConvolution) {
-      changed |= CanonicalizeForwardConvolution(instruction);
-    } else if (instruction->opcode() == HloOpcode::kFusion) {
-      switch (instruction->fusion_kind()) {
-        case HloInstruction::FusionKind::kConvBackwardFilter:
-          changed |= CanonicalizeBackwardFilterConvolution(instruction);
-          break;
-        case HloInstruction::FusionKind::kConvBackwardInput:
-          changed |= CanonicalizeBackwardInputConvolution(instruction);
-          break;
-        default:
-          break;
+    if (IsCustomCallToDnnConvolution(*instruction)) {
+      const auto& target = instruction->custom_call_target();
+      if (target == kCudnnConvForwardCallTarget) {
+        changed |= CanonicalizeForwardConvolution(instruction);
+      } else if (target == kCudnnConvBackwardFilterCallTarget) {
+        changed |= CanonicalizeBackwardFilterConvolution(instruction);
+      } else if (target == kCudnnConvBackwardInputCallTarget) {
+        changed |= CanonicalizeBackwardInputConvolution(instruction);
+      } else {
+        LOG(FATAL) << "Unknown custom call target for cudnn conv: "
+                   << instruction->ToString();
       }
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index a63affa06c..5432419e4a 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -461,20 +461,6 @@ HloInstruction* HloComputation::CreateFusionInstruction(
   return fusion_instruction;
 }
 
-HloInstruction* HloComputation::CreateFusionInstructionForBackwardConvolution(
-    tensorflow::gtl::ArraySlice<HloInstruction*> instructions_to_fuse,
-    HloInstruction::FusionKind fusion_kind, const Window& window,
-    const ConvolutionDimensionNumbers& conv_dnums) {
-  CHECK(HloInstruction::FusionKind::kConvBackwardFilter == fusion_kind ||
-        HloInstruction::FusionKind::kConvBackwardInput == fusion_kind);
-  HloInstruction* root = instructions_to_fuse.front();
-  HloInstruction* fusion_instruction =
-      AddInstruction(HloInstruction::CreateFusionForBackwardConvolution(
-          root->shape(), fusion_kind, window, conv_dnums, root));
-  FuseInstructionsInto(instructions_to_fuse, fusion_instruction);
-  return fusion_instruction;
-}
-
 StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
     HloInstruction* instruction, const ShapeTree<bool>* indices_to_copy,
     ShapeTree<HloInstruction*>* copies_added, ShapeIndex* index) {
@@ -577,8 +563,11 @@ Status HloComputation::ReplaceWithNewInstruction(
 
 Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction,
                                           HloInstruction* new_instruction) {
-  TF_RET_CHECK(ShapeUtil::Compatible(old_instruction->shape(),
-                                     new_instruction->shape()));
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(old_instruction->shape(), new_instruction->shape()))
+      << ShapeUtil::HumanString(old_instruction->shape()) << " vs "
+      << ShapeUtil::HumanString(new_instruction->shape());
+
   VLOG(10) << "transformed " << old_instruction->ToString() << " to "
            << new_instruction->ToString();
   // Try to add metadata for HLO instructions that are created to replace
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index 6436815f91..061c59abe5 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -224,15 +224,6 @@ class HloComputation {
       tensorflow::gtl::ArraySlice<HloInstruction*> instructions_to_fuse,
       HloInstruction::FusionKind fusion_kind);
 
-  // Creates a fusion instruction that represents a backward convolution. This
-  // is similar to CreateFusionInstruction but takes window and conv_dnums which
-  // indicate the window and convolution dimension numbers of the backward
-  // convolution.
-  HloInstruction* CreateFusionInstructionForBackwardConvolution(
-      tensorflow::gtl::ArraySlice<HloInstruction*> instructions_to_fuse,
-      HloInstruction::FusionKind fusion_kind, const Window& window,
-      const ConvolutionDimensionNumbers& conv_dnums);
-
   // Create a deep copy of the given instruction and return the instruction
   // producing the copied result. All instructions performing the copy are added
   // to the computation. For array-shaped values, this method trivially returns
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index cd54eb74d1..9cd5a1e2b7 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -469,7 +469,13 @@ Status HloCostAnalysis::HandleCall(const HloInstruction* call) {
 }
 
 Status HloCostAnalysis::HandleCustomCall(const HloInstruction*) {
-  return Unimplemented("Custom-call is not implemented for HLO cost analysis.");
+  // We can't do anything sane with CustomCalls, since we don't know what they
+  // do, and returning an error status will stop iteration over this
+  // computation, which is probably also not what we want.  So just punt and
+  // return OK.  This will cause all of the properties to be reported as 0,
+  // which is fine.
+  current_should_compute_bottleneck_time_ = false;
+  return Status::OK();
 }
 
 Status HloCostAnalysis::HandleSort(const HloInstruction* sort) {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index a889c35aeb..fac6b43405 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -763,16 +763,13 @@ HloInstruction::CreateBroadcastSequence(
   return instruction;
 }
 
-// We put the fusion kind into the instruction's name for transpose-dot and
-// backward-conv fusions, since those fusions are really just describing a type
-// of dot/conv rather than generating a novel computation.
+// We put the fusion kind into the instruction's name for transpose-dot fusions,
+// since those fusions are really just describing a type of dot rather than
+// generating a novel computation.
 static string FusionNodeName(HloInstruction::FusionKind fusion_kind) {
   switch (fusion_kind) {
     case HloInstruction::FusionKind::kTransposeDot:
       return "dot_fusion";
-    case HloInstruction::FusionKind::kConvBackwardInput:
-    case HloInstruction::FusionKind::kConvBackwardFilter:
-      return "conv_fusion";
     default:
       return "fusion";
   }
@@ -804,18 +801,6 @@ static string FusionNodeName(HloInstruction::FusionKind fusion_kind) {
   return instruction;
 }
 
-/* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateFusionForBackwardConvolution(
-    const Shape& shape, FusionKind fusion_kind, const Window& window,
-    const ConvolutionDimensionNumbers& conv_dnums, HloInstruction* fused_root) {
-  std::unique_ptr<HloInstruction> fusion =
-      CreateFusion(shape, fusion_kind, fused_root);
-  fusion->window_ = MakeUnique<Window>(window);
-  fusion->convolution_dimension_numbers_ =
-      MakeUnique<ConvolutionDimensionNumbers>(conv_dnums);
-  return fusion;
-}
-
 void HloInstruction::MergeFusionInstruction(
     HloInstruction* instruction_to_merge) {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
@@ -2318,7 +2303,7 @@ string HloInstruction::ToCategory() const {
     return "data formatting";
   }
 
-  auto conv_category = [&] {
+  if (opcode() == HloOpcode::kConvolution) {
     string category = "convolution";
     if (window_util::HasBaseDilation(window())) {
       category += " base-dilated";
@@ -2327,10 +2312,6 @@ string HloInstruction::ToCategory() const {
       category += " window-dilated";
     }
     return category;
-  };
-
-  if (opcode() == HloOpcode::kConvolution) {
-    return conv_category();
   }
 
   // Give transpose-dot and backwards-conv fusions the categories "dot" and
@@ -2348,9 +2329,6 @@ string HloInstruction::ToCategory() const {
         return "output fusion";
       case FusionKind::kTransposeDot:
         return "dot";
-      case FusionKind::kConvBackwardFilter:
-      case FusionKind::kConvBackwardInput:
-        return conv_category();
       case FusionKind::kCustom:
         return "custom fusion";
     }
@@ -3125,10 +3103,6 @@ string ToString(HloInstruction::FusionKind kind) {
       return "kOutput";
     case HloInstruction::FusionKind::kTransposeDot:
       return "kTransposeDot";
-    case HloInstruction::FusionKind::kConvBackwardFilter:
-      return "kConvBackwardFilter";
-    case HloInstruction::FusionKind::kConvBackwardInput:
-      return "kConvBackwardInput";
     case HloInstruction::FusionKind::kCustom:
       return "kCustom";
   }
@@ -3148,12 +3122,6 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
   if (kind_name == "kTransposeDot") {
     return HloInstruction::FusionKind::kTransposeDot;
   }
-  if (kind_name == "kConvBackwardFilter") {
-    return HloInstruction::FusionKind::kConvBackwardFilter;
-  }
-  if (kind_name == "kConvBackwardInput") {
-    return HloInstruction::FusionKind::kConvBackwardInput;
-  }
   if (kind_name == "kCustom") {
     return HloInstruction::FusionKind::kCustom;
   }
@@ -3261,7 +3229,13 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const {
   result += "_";
   append_dims(rhs_dims, operand(1)->shape());
   result += "->";
-  append_dims(output_dims, shape());
+
+  // A convolution can be represented as a kConvolution HLO or as a CustomCall
+  // that returns a tuple, the first element of which is the result of the
+  // convolution.
+  Shape this_shape =
+      ShapeUtil::IsTuple(shape()) ? shape().tuple_shapes(0) : shape();
+  append_dims(output_dims, this_shape);
   return result;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 5e89dc79be..84b469688b 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -162,17 +162,14 @@ class HloPrintOptions {
 class HloInstruction {
  public:
   enum class FusionKind {
-    kLoop,                // Fused into a loop.
-    kInput,               // Op's input is fused into the op itself.
-    kOutput,              // Op's output is fused into the op itself.
-                          // REQUIRES: At least one operand buffer must be able
-                          // to alias the output buffer.
-    kTransposeDot,        // Fused into a dot with transposed operands.
-    kConvBackwardFilter,  // Fused into a backward filter convolution.
-    kConvBackwardInput,   // Fused into a backward input convolution.
-
-    kCustom,  // Custom category for backend-specific fusions that
-              // do not match any of the more specific ones.
+    kLoop,          // Fused into a loop.
+    kInput,         // Op's input is fused into the op itself.
+    kOutput,        // Op's output is fused into the op itself.
+                    // REQUIRES: At least one operand buffer must be able
+                    // to alias the output buffer.
+    kTransposeDot,  // Fused into a dot with transposed operands.
+    kCustom,        // Custom category for backend-specific fusions that
+                    // do not match any of the more specific ones.
   };
 
   ~HloInstruction();
@@ -466,14 +463,6 @@ class HloInstruction {
       tensorflow::gtl::ArraySlice<HloInstruction*> operands,
       HloComputation* fusion_computation);
 
-  // Creates a fusion instruction that represents backward convolution. This is
-  // similar to CreateFusion, but with extra arguments indicating the window and
-  // dimemsion mapping of the backward convolution.
-  static std::unique_ptr<HloInstruction> CreateFusionForBackwardConvolution(
-      const Shape& shape, FusionKind fusion_kind, const Window& window,
-      const ConvolutionDimensionNumbers& conv_dnums,
-      HloInstruction* fused_root);
-
   // Creates a call instruction that applies the given computation on the given
   // operands. "shape" is the resultant shape.
   static std::unique_ptr<HloInstruction> CreateCall(
@@ -1052,13 +1041,23 @@ class HloInstruction {
     return *padding_config_;
   }
 
-  // Returns data on the dimension numbers used for a convolution
-  // operation.
+  // Returns data on the dimension numbers used for a convolution operation,
+  // which may be a kConvolution instruction or a kCustomCall that implements a
+  // convolution.
   const ConvolutionDimensionNumbers& convolution_dimension_numbers() const {
     CHECK(convolution_dimension_numbers_ != nullptr);
     return *convolution_dimension_numbers_;
   }
 
+  // Sets the convolution dimension numbers on this instruction.  In general you
+  // shouldn't need to call this; instead, specify the convolution dimension
+  // numbers when you create the instruction.
+  void set_convolution_dimension_numbers(
+      const ConvolutionDimensionNumbers& dnums) {
+    convolution_dimension_numbers_ =
+        MakeUnique<ConvolutionDimensionNumbers>(dnums);
+  }
+
   FftType fft_type() const {
     CHECK_EQ(HloOpcode::kFft, opcode_);
     return fft_type_;
-- 
GitLab


From e46e0459c2e5135db3ff72918a463ce9f48d204e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 21:43:12 -0800
Subject: [PATCH 570/966] Fix tolerance too tight for Wasserstein distance
 test.

PiperOrigin-RevId: 184240222
---
 .../contrib/gan/python/eval/python/sliced_wasserstein_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py
index b960af28ea..871f1ad54e 100644
--- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py
@@ -84,11 +84,11 @@ class ClassifierMetricsTest(test.TestCase):
     self.assertAllClose(
         np.array([0.014, 0.014], 'f'),
         np.array([x[0] for x in wscores], 'f'),
-        rtol=0.1)
+        rtol=0.15)
     self.assertAllClose(
         np.array([0.014, 0.020], 'f'),
         np.array([x[1] for x in wscores], 'f'),
-        rtol=0.1)
+        rtol=0.15)
 
   def test_sliced_wasserstein_distance_svd(self):
     """Test the distance."""
-- 
GitLab


From 4e1863b76676afb863131f09ce8f9ac44f04e816 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Thu, 1 Feb 2018 23:47:33 -0800
Subject: [PATCH 571/966] Fix some tf-lite tests

PiperOrigin-RevId: 184247187
---
 tensorflow/contrib/lite/BUILD                        |  2 ++
 tensorflow/contrib/lite/arena_planner_test.cc        |  6 +++---
 .../contrib/lite/kernels/strided_slice_test.cc       |  1 +
 tensorflow/contrib/lite/tools/BUILD                  |  1 +
 tensorflow/contrib/lite/tools/verifier_test.cc       | 12 ++++++++----
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 13350c5a43..2009b3b05d 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -53,6 +53,8 @@ cc_test(
     srcs = ["arena_planner_test.cc"],
     deps = [
         ":arena_planner",
+        "//tensorflow/contrib/lite/testing:util",
+        "//tensorflow/core:lib",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/contrib/lite/arena_planner_test.cc b/tensorflow/contrib/lite/arena_planner_test.cc
index c27c327abc..e10611e6d4 100644
--- a/tensorflow/contrib/lite/arena_planner_test.cc
+++ b/tensorflow/contrib/lite/arena_planner_test.cc
@@ -18,6 +18,8 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace tflite {
 namespace {
@@ -464,9 +466,7 @@ TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // ::tflite::LogToStderr();
-  FLAGS_logtostderr = true;
-
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
index 5bc7dc353b..5cac04b383 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 namespace tflite {
 namespace {
 
+using ::int32;
 using ::testing::ElementsAreArray;
 
 class StridedSliceOpModel : public SingleOpModel {
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 4d3b553b22..6786b16184 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -118,6 +118,7 @@ cc_test(
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/testing:util",
+        "//tensorflow/core:framework_lite",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
index 244d4f0396..87f6854e9e 100644
--- a/tensorflow/contrib/lite/tools/verifier_test.cc
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/tools/verifier.h"
+#include <string>
+#include <vector>
+
 #include "flatbuffers/flatbuffers.h"
 #include "flatbuffers/util.h"
 #include <gtest/gtest.h>
@@ -20,7 +22,9 @@ limitations under the License.
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/contrib/lite/tools/verifier.h"
 #include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/core/framework/numeric_types.h"
 
 namespace tflite {
 
@@ -111,7 +115,7 @@ TEST(VerifyModel, TestSimpleModel) {
 }
 
 TEST(VerifyModel, TestCorruptedData) {
-  string model = "123";
+  std::string model = "123";
   ASSERT_FALSE(Verify(model.data(), model.size(), /*error_reporter=*/nullptr));
 }
 
@@ -131,8 +135,8 @@ TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
-  string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
-                       builder.GetSize());
+  std::string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
+                            builder.GetSize());
   for (int i = 0; i < model_content.size(); i++) {
     model_content[i] = (model_content[i] + 137) % 255;
     EXPECT_FALSE(Verify(model_content.data(), model_content.size(),
-- 
GitLab


From 0612bc8fb566d813d35050a33a7109d67bb6136d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 1 Feb 2018 21:43:12 -0800
Subject: [PATCH 572/966] Fix tolerance too tight for Wasserstein distance
 test.

PiperOrigin-RevId: 184240222
---
 tensorflow/contrib/lite/BUILD                        |  2 --
 tensorflow/contrib/lite/arena_planner_test.cc        |  6 +++---
 .../contrib/lite/kernels/strided_slice_test.cc       |  1 -
 tensorflow/contrib/lite/tools/BUILD                  |  1 -
 tensorflow/contrib/lite/tools/verifier_test.cc       | 12 ++++--------
 5 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 2009b3b05d..13350c5a43 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -53,8 +53,6 @@ cc_test(
     srcs = ["arena_planner_test.cc"],
     deps = [
         ":arena_planner",
-        "//tensorflow/contrib/lite/testing:util",
-        "//tensorflow/core:lib",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/contrib/lite/arena_planner_test.cc b/tensorflow/contrib/lite/arena_planner_test.cc
index e10611e6d4..c27c327abc 100644
--- a/tensorflow/contrib/lite/arena_planner_test.cc
+++ b/tensorflow/contrib/lite/arena_planner_test.cc
@@ -18,8 +18,6 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/testing/util.h"
-#include "tensorflow/core/platform/logging.h"
 
 namespace tflite {
 namespace {
@@ -466,7 +464,9 @@ TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
+  // ::tflite::LogToStderr();
+  FLAGS_logtostderr = true;
+
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
index 5cac04b383..5bc7dc353b 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 namespace tflite {
 namespace {
 
-using ::int32;
 using ::testing::ElementsAreArray;
 
 class StridedSliceOpModel : public SingleOpModel {
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 6786b16184..4d3b553b22 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -118,7 +118,6 @@ cc_test(
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/testing:util",
-        "//tensorflow/core:framework_lite",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
index 87f6854e9e..244d4f0396 100644
--- a/tensorflow/contrib/lite/tools/verifier_test.cc
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -12,9 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <string>
-#include <vector>
-
+#include "tensorflow/contrib/lite/tools/verifier.h"
 #include "flatbuffers/flatbuffers.h"
 #include "flatbuffers/util.h"
 #include <gtest/gtest.h>
@@ -22,9 +20,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/testing/util.h"
-#include "tensorflow/contrib/lite/tools/verifier.h"
 #include "tensorflow/contrib/lite/version.h"
-#include "tensorflow/core/framework/numeric_types.h"
 
 namespace tflite {
 
@@ -115,7 +111,7 @@ TEST(VerifyModel, TestSimpleModel) {
 }
 
 TEST(VerifyModel, TestCorruptedData) {
-  std::string model = "123";
+  string model = "123";
   ASSERT_FALSE(Verify(model.data(), model.size(), /*error_reporter=*/nullptr));
 }
 
@@ -135,8 +131,8 @@ TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
-  std::string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
-                            builder.GetSize());
+  string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
+                       builder.GetSize());
   for (int i = 0; i < model_content.size(); i++) {
     model_content[i] = (model_content[i] + 137) % 255;
     EXPECT_FALSE(Verify(model_content.data(), model_content.size(),
-- 
GitLab


From ffa37312ab71d338adfb035a5ab71beb0da842ef Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Thu, 1 Feb 2018 23:47:33 -0800
Subject: [PATCH 573/966] Fix some tf-lite tests

PiperOrigin-RevId: 184247187
---
 tensorflow/contrib/lite/BUILD                        |  2 ++
 tensorflow/contrib/lite/arena_planner_test.cc        |  6 +++---
 .../contrib/lite/kernels/strided_slice_test.cc       |  1 +
 tensorflow/contrib/lite/tools/BUILD                  |  1 +
 tensorflow/contrib/lite/tools/verifier_test.cc       | 12 ++++++++----
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 13350c5a43..2009b3b05d 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -53,6 +53,8 @@ cc_test(
     srcs = ["arena_planner_test.cc"],
     deps = [
         ":arena_planner",
+        "//tensorflow/contrib/lite/testing:util",
+        "//tensorflow/core:lib",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/contrib/lite/arena_planner_test.cc b/tensorflow/contrib/lite/arena_planner_test.cc
index c27c327abc..e10611e6d4 100644
--- a/tensorflow/contrib/lite/arena_planner_test.cc
+++ b/tensorflow/contrib/lite/arena_planner_test.cc
@@ -18,6 +18,8 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace tflite {
 namespace {
@@ -464,9 +466,7 @@ TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // ::tflite::LogToStderr();
-  FLAGS_logtostderr = true;
-
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
index 5bc7dc353b..5cac04b383 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 namespace tflite {
 namespace {
 
+using ::int32;
 using ::testing::ElementsAreArray;
 
 class StridedSliceOpModel : public SingleOpModel {
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 4d3b553b22..6786b16184 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -118,6 +118,7 @@ cc_test(
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/testing:util",
+        "//tensorflow/core:framework_lite",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
index 244d4f0396..87f6854e9e 100644
--- a/tensorflow/contrib/lite/tools/verifier_test.cc
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/tools/verifier.h"
+#include <string>
+#include <vector>
+
 #include "flatbuffers/flatbuffers.h"
 #include "flatbuffers/util.h"
 #include <gtest/gtest.h>
@@ -20,7 +22,9 @@ limitations under the License.
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/contrib/lite/tools/verifier.h"
 #include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/core/framework/numeric_types.h"
 
 namespace tflite {
 
@@ -111,7 +115,7 @@ TEST(VerifyModel, TestSimpleModel) {
 }
 
 TEST(VerifyModel, TestCorruptedData) {
-  string model = "123";
+  std::string model = "123";
   ASSERT_FALSE(Verify(model.data(), model.size(), /*error_reporter=*/nullptr));
 }
 
@@ -131,8 +135,8 @@ TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
-  string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
-                       builder.GetSize());
+  std::string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
+                            builder.GetSize());
   for (int i = 0; i < model_content.size(); i++) {
     model_content[i] = (model_content[i] + 137) % 255;
     EXPECT_FALSE(Verify(model_content.data(), model_content.size(),
-- 
GitLab


From 67f0b8f9ee6b003d6267b375bd8daf76d98015ab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 05:38:50 -0800
Subject: [PATCH 574/966] Enabling partitioned variables to work with TPU. When
 partitioned variables are used in a TPU training loop, concat gradient
 operations get generated for which XLA requires the concat dimension argument
 to be a constant (or foldable to a constant). However since such constant is
 defined outside of the train while context an Enter node is generated in
 order to pass it. The fix consists in detecting such case, and to duplicate
 the (scalar) constant inside the while context, so that XLA can succesfully
 process the resulting graph.

PiperOrigin-RevId: 184273245
---
 tensorflow/python/ops/array_grad.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index c9292184e6..9745d38dc2 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
@@ -115,6 +116,19 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
                                                         non_neg_concat_dim)
       out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
     else:
+      if constant_op.is_constant(concat_dim):
+        # If concat_dim is a constant defined in a different context,
+        # then we duplicate it in the current context to avoid passing it
+        # through an Enter node.
+        # This is a small optimization in general, but it is required when
+        # compiling with XLA, as XLA needs the concat input to be folded into a
+        # constant.
+        grad_context = control_flow_util.GetOutputContext(grad.op)
+        dim_context = control_flow_util.GetOutputContext(concat_dim.op)
+        if dim_context != grad_context:
+          value = tensor_util.constant_value(concat_dim)
+          concat_dim = constant_op.constant(value=value, dtype=concat_dim.dtype)
+
       # Using mod here for convenience since concat_dim is already verified
       # in concat implementation to be within the allowed [-rank, rank) range.
       non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])
-- 
GitLab


From 1804be16f41f5217d1ad53a8ba992d9a132d4d79 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 2 Feb 2018 08:09:13 -0800
Subject: [PATCH 575/966] Fix a bug in function inlining when the argument is
 an implicitly dereferenced ref tensor.

Previously the inliner would add an identity node with an invalid ref-type attr when the actual parameter had ref type. The changed version removes the reference.

PiperOrigin-RevId: 184285084
---
 tensorflow/core/common_runtime/function.cc   |  3 +--
 tensorflow/python/framework/function_test.py | 19 +++++++++++++++++--
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 150fb85c70..e1b5404b79 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -97,12 +97,11 @@ static Node* AddNoOp(Graph* g) {
 
 static Node* AddIdentity(Graph* g, Endpoint input) {
   DCHECK_LT(0, input.dtype());
-  DCHECK_LT(input.dtype(), DT_FLOAT_REF);
   NodeDef ndef;
   ndef.set_name(g->NewName(kNodeLabel));
   ndef.set_op("Identity");
   ndef.add_input(input.name());
-  AddNodeAttr("T", input.dtype(), &ndef);
+  AddNodeAttr("T", BaseType(input.dtype()), &ndef);
   Status s;
   Node* ret = g->AddNode(ndef, &s);
   TF_CHECK_OK(s);
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index a4ca3f9a89..b35cee0111 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -19,8 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 import re
-import time
 import sys
+import time
 
 import numpy as np
 
@@ -86,6 +86,21 @@ class FunctionTest(test.TestCase):
       with session.Session() as sess:
         self.assertAllEqual([18.0], sess.run(call))
 
+  def testIdentityImplicitDeref(self):
+
+    @function.Defun(dtypes.float32, func_name="MyIdentity")
+    def MyIdentityFunc(a):
+      return a
+
+    with ops.Graph().as_default():
+      var = variables.Variable([18.0])
+      call = MyIdentityFunc(var._ref())  # pylint: disable=protected-access
+      self.assertEqual("MyIdentity", call.op.name)
+      for cfg in _OptimizerOptions():
+        with session.Session(config=cfg) as sess:
+          sess.run(var.initializer)
+          self.assertAllEqual([18.0], sess.run(call))
+
   def testIdentityOutputName(self):
 
     @function.Defun(
@@ -771,7 +786,7 @@ class FunctionTest(test.TestCase):
     # We added more randomness to function names in C API.
     # TODO(iga): Remove this if statement when we switch to C API.
     if ops._USE_C_API:  # pylint: disable=protected-access
-      if sys.byteorder == 'big':
+      if sys.byteorder == "big":
         self.assertEqual("Foo_kEdkAG8SJvg",
                          Foo.instantiate([dtypes.float32] * 3).name)
       else:
-- 
GitLab


From 28c80ca9d230fb8b235044702b1d96f9be6c1f10 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 08:09:48 -0800
Subject: [PATCH 576/966] [comment-only change]: Fix grammar error.

PiperOrigin-RevId: 184285125
---
 tensorflow/contrib/data/python/ops/batching.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index 76c07b2c99..6eb512dec6 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -403,7 +403,7 @@ def map_and_batch(map_func, batch_size, num_parallel_batches=1):
     num_parallel_batches: A `tf.int64` scalar `tf.Tensor`, representing the
       number of batches to create in parallel. On one hand, higher values can
       help mitigate the effect of stragglers. On the other hand, higher values
-      can increasing contention if CPU is scarce.
+      can increase contention if CPU is scarce.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-- 
GitLab


From 355c6ac2bdaab1c1748c2a64a851283e205a90ba Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Fri, 2 Feb 2018 09:00:41 -0800
Subject: [PATCH 577/966] Disable graph optimizations (CSE) in test so that
 constant nodes are not deduped.

PiperOrigin-RevId: 184289685
---
 tensorflow/python/grappler/layout_optimizer_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 332ee725f0..5bc9e4b803 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -157,6 +157,7 @@ def _get_config(layout_optimizer=True):
   graph_options = config_pb2.GraphOptions(
       rewrite_options=rewrite_options, build_cost_model=1)
   config = config_pb2.ConfigProto(graph_options=graph_options)
+  config.graph_options.optimizer_options.opt_level = -1
   return config
 
 
@@ -1171,7 +1172,7 @@ class LayoutOptimizerTest(test.TestCase):
           num_transposes += 1
         nodes.append(node.name)
 
-      expected_num_transposes = 2
+      expected_num_transposes = 3
       self.assertEqual(expected_num_transposes, num_transposes)
       self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes)
       self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes)
-- 
GitLab


From 9cfdf520d997e73011ee7bd5a183ced622a913aa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 09:20:26 -0800
Subject: [PATCH 578/966] Fix latent bug in dependency optimizer.

PiperOrigin-RevId: 184291701
---
 .../core/grappler/optimizers/dependency_optimizer.cc       | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 0842fc92a8..7b4ca1496c 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -97,7 +97,12 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) {
       // TODO(rmlarsen): Try to remove this artificial contraint.
       return false;
     }
-    for (auto consumer : node_map_->GetOutputs(node.name())) {
+  }
+  for (auto consumer : node_map_->GetOutputs(node.name())) {
+    if (node.input_size() > 1 && IsMerge(*consumer)) {
+      return false;
+    }
+    if (IsSwitch(*input)) {
       for (const string& consumer_input : consumer->input()) {
         if (consumer_input == AsControlDependency(node.name())) {
           return false;
-- 
GitLab


From b38575ea9b12534e9c63833782a46e43e1c9a8af Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 10:01:40 -0800
Subject: [PATCH 579/966] Show that we must over reallocate after resizing
 dynamic tensors.

PiperOrigin-RevId: 184296680
---
 tensorflow/contrib/lite/BUILD                 |  1 +
 tensorflow/contrib/lite/interpreter_test.cc   | 50 +++++++++++++++++--
 tensorflow/contrib/lite/kernels/kernel_util.h |  5 +-
 tensorflow/contrib/lite/testing/BUILD         |  2 +-
 4 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 2009b3b05d..cc0e20f75e 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -169,6 +169,7 @@ cc_test(
     deps = [
         ":framework",
         ":string_util",
+        "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index 2ab4bb6567..cfda19d72c 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/error_reporter.h"
 #include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
 namespace {
@@ -282,6 +283,51 @@ TEST(BasicInterpreter, NoOpInterpreter) {
   ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
 }
 
+TEST(BasicInterpreter, ResizingTensors) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(1), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetOutputs({0}), kTfLiteOk);
+
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(
+                0, kTfLiteFloat32, "", {3}, TfLiteQuantizationParams()),
+            kTfLiteOk);
+
+  int t = interpreter.inputs()[0];
+  TfLiteTensor* tensor = interpreter.tensor(t);
+
+  ASSERT_EQ(interpreter.ResizeInputTensor(t, {1, 2, 3}), kTfLiteOk);
+  EXPECT_EQ(tensor->bytes, 6 * sizeof(float));
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+
+  tensor->data.f[5] = 0.123f;
+
+  // Changing from kTfLiteArenaRw to kTfLiteDynamic is quite complicate: we need
+  // to unset data.raw, otherwise Realloc will try to free that memory.
+  tensor->data.raw = nullptr;
+  tensor->allocation_type = kTfLiteDynamic;
+
+  ASSERT_EQ(interpreter.ResizeInputTensor(t, {1, 2, 4}), kTfLiteOk);
+  EXPECT_EQ(tensor->bytes, 8 * sizeof(float));
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+
+  // TODO(ahentz): We shouldn't have to force reallocation, but
+  // ResizeInputTensor doesn't realloc dynamic tensors. Also note that
+  // TfLiteTensorRealloc(tensor->bytes, tensor) is a no-op.
+  TfLiteTensorRealloc(9 * sizeof(float), tensor);
+  tensor->data.f[7] = 0.123f;
+
+  ASSERT_EQ(interpreter.ResizeInputTensor(t, {2, 2, 4}), kTfLiteOk);
+  EXPECT_EQ(tensor->bytes, 16 * sizeof(float));
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+
+  // TODO(ahentz): We shouldn't have to force reallocation, but
+  // ResizeInputTensor doesn't realloc dynamic tensors. Also note that
+  // TfLiteTensorRealloc(tensor->bytes, tensor) is a no-op.
+  TfLiteTensorRealloc(17 * sizeof(float), tensor);
+  tensor->data.f[15] = 0.123f;
+}
+
 TEST(BasicInterpreter, OneOpInterpreter) {
   Interpreter interpreter;
   ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
@@ -645,9 +691,7 @@ TEST_F(TestExecutionPlan, NullExecutionPlan) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-#ifdef OS_LINUX
-  FLAGS_logtostderr = true;
-#endif
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
index 3cfa72615a..28f53b9fbb 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -65,7 +65,10 @@ inline bool IsDynamicTensor(TfLiteTensor* tensor) {
 
 // Sets tensor to dynamic.
 inline void SetTensorToDynamic(TfLiteTensor* tensor) {
-  tensor->allocation_type = kTfLiteDynamic;
+  if (tensor->allocation_type != kTfLiteDynamic) {
+    tensor->allocation_type = kTfLiteDynamic;
+    tensor->data.raw = nullptr;
+  }
 }
 
 // Calculates the multiplication factor for a quantized convolution (or
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 7f84a0ab9b..b949045128 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -206,7 +206,7 @@ tf_cc_test(
         "--unzip_binary_path=/usr/bin/unzip",
     ],
     data = [":optest"],
-    shard_count = 10,
+    shard_count = 20,
     tags = ["no_oss"],
     deps = [
         ":parse_testdata_lib",
-- 
GitLab


From 417e7d953e538e79002c59751a13f265de15bf2c Mon Sep 17 00:00:00 2001
From: Yifei Feng <fengyifei2026@gmail.com>
Date: Fri, 2 Feb 2018 11:02:34 -0800
Subject: [PATCH 580/966] Remove all_files rule from com_google_absl.BUILD

---
 third_party/com_google_absl.BUILD | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/third_party/com_google_absl.BUILD b/third_party/com_google_absl.BUILD
index 0c8d327c1f..8fca145f75 100644
--- a/third_party/com_google_absl.BUILD
+++ b/third_party/com_google_absl.BUILD
@@ -3,15 +3,3 @@ package(default_visibility = ["//visibility:public"])
 licenses(["notice"])  # Apache
 
 exports_files(["LICENSE"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-- 
GitLab


From b70a3794b6e06418b6d5b4d7142edeb78494fe7b Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Fri, 2 Feb 2018 10:04:15 -0800
Subject: [PATCH 581/966] Consider beyond immediate neighbors to find exit
 node.

Most of the exit nodes are immediate neighbors of the switch, except we do have
cases where the switch feeds into an identity that feeds into a exit.

PiperOrigin-RevId: 184297180
---
 .../tf2xla/functionalize_control_flow.cc      | 28 ++++++++++++++++---
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 1d9e0fb33e..bf304102ed 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -427,16 +427,36 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
       //   identity nodes are values used by the loop body or condition.
       //   The Identity node may have the wrong device so copy the device from
       //   one of its outputs instead.
+      std::deque<const Edge*> possible_exit;
       for (const Edge* edge : arg.switch_node->out_edges()) {
-        if (edge->src_output() == 0 && IsExit(edge->dst())) {
+        if (edge->src_output() == 0) {
+          possible_exit.push_back(edge);
+        }
+        if (IsIdentity(edge->dst())) {
+          TF_RETURN_IF_ERROR(
+              SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true));
+        }
+      }
+      // TODO(b/67425339): Allow general graph between switch and exit.
+      while (!possible_exit.empty()) {
+        const Edge* edge = possible_exit.front();
+        possible_exit.pop_front();
+        if (IsExit(edge->dst())) {
           if (arg.exit != nullptr) {
             return errors::InvalidArgument("Duplicate Exit successors to ",
                                            arg.switch_node->name());
           }
           arg.exit = edge->dst();
-        } else if (StringPiece(edge->dst()->type_string()) == "Identity") {
-          TF_RETURN_IF_ERROR(
-              SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true));
+        } else {
+          if (!IsIdentity(edge->dst())) {
+            return errors::Unimplemented("General graph between switch (",
+                                         arg.switch_node->name(),
+                                         ") and exit node of frame ",
+                                         frame->name, " not supported yet.");
+          }
+          for (const Edge* out : edge->dst()->out_edges()) {
+            possible_exit.push_back(out);
+          }
         }
       }
     }
-- 
GitLab


From 16594f81af70eb33e9bbda1f1f26082a64919b2d Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 2 Feb 2018 10:28:20 -0800
Subject: [PATCH 582/966] Remove hidden_ops.txt file. Instead, switch to use
 visibility attribute in ApiDef proto.

PiperOrigin-RevId: 184301076
---
 tensorflow/contrib/cmake/tf_python.cmake      |   2 +-
 tensorflow/core/api_def/BUILD                 |   1 +
 tensorflow/core/api_def/api_test.cc           | 197 ++++++++++-----
 tensorflow/python/BUILD                       |   6 -
 tensorflow/python/build_defs.bzl              |   1 -
 tensorflow/python/client/session_test.py      |   7 +-
 .../python/eager/python_eager_op_gen.cc       |  21 +-
 tensorflow/python/framework/python_op_gen.cc  |  24 +-
 .../kernel_tests/control_flow_ops_py_test.py  |  32 +--
 .../kernel_tests/control_flow_util_test.py    |  10 +-
 tensorflow/python/ops/control_flow_ops.py     |  10 +-
 tensorflow/tools/api/tests/BUILD              |   5 -
 .../tools/api/tests/api_compatibility_test.py | 239 ------------------
 13 files changed, 198 insertions(+), 357 deletions(-)

diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 8862390d2b..294b9c5941 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -307,7 +307,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
     # containing the wrappers.
     add_custom_command(
       OUTPUT ${GENERATE_PYTHON_OP_LIB_DESTINATION}
-      COMMAND ${tf_python_op_lib_name}_gen_python ${tensorflow_source_dir}/tensorflow/core/api_def/base_api,${tensorflow_source_dir}/tensorflow/core/api_def/python_api @${tensorflow_source_dir}/tensorflow/python/ops/hidden_ops.txt ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION}
+      COMMAND ${tf_python_op_lib_name}_gen_python ${tensorflow_source_dir}/tensorflow/core/api_def/base_api,${tensorflow_source_dir}/tensorflow/core/api_def/python_api ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION}
       DEPENDS ${tf_python_op_lib_name}_gen_python
     )
 
diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD
index 81187ff6b7..58dbac4e8e 100644
--- a/tensorflow/core/api_def/BUILD
+++ b/tensorflow/core/api_def/BUILD
@@ -96,6 +96,7 @@ tf_cc_test(
     srcs = ["api_test.cc"],
     data = [
         ":base_api_def",
+        ":python_api_def",
     ],
     deps = [
         ":excluded_ops_lib",
diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc
index 112c55ccc3..477a0b670e 100644
--- a/tensorflow/core/api_def/api_test.cc
+++ b/tensorflow/core/api_def/api_test.cc
@@ -41,8 +41,9 @@ namespace tensorflow {
 namespace {
 constexpr char kDefaultApiDefDir[] =
     "tensorflow/core/api_def/base_api";
+constexpr char kPythonApiDefDir[] =
+    "tensorflow/core/api_def/python_api";
 constexpr char kApiDefFilePattern[] = "api_def_*.pbtxt";
-}  // namespace
 
 // Reads golden ApiDef files and returns a map from file name to ApiDef file
 // contents.
@@ -66,9 +67,93 @@ void GetGoldenApiDefs(Env* env, const string& api_files_dir,
   }
 }
 
-class ApiTest : public ::testing::Test {
+void TestAllApiDefsHaveCorrespondingOp(
+    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
+  std::unordered_set<string> op_names;
+  for (const auto& op : ops.op()) {
+    op_names.insert(op.name());
+  }
+  for (const auto& name_and_api_def : api_defs_map) {
+    ASSERT_TRUE(op_names.find(name_and_api_def.first) != op_names.end())
+        << name_and_api_def.first << " op has ApiDef but missing from ops. "
+        << "Does api_def_" << name_and_api_def.first << " need to be deleted?";
+  }
+}
+
+void TestAllApiDefInputArgsAreValid(
+    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
+  for (const auto& op : ops.op()) {
+    const auto api_def_iter = api_defs_map.find(op.name());
+    if (api_def_iter == api_defs_map.end()) {
+      continue;
+    }
+    const auto& api_def = api_def_iter->second;
+    for (const auto& api_def_arg : api_def.in_arg()) {
+      bool found_arg = false;
+      for (const auto& op_arg : op.input_arg()) {
+        if (api_def_arg.name() == op_arg.name()) {
+          found_arg = true;
+          break;
+        }
+      }
+      ASSERT_TRUE(found_arg)
+          << "Input argument " << api_def_arg.name()
+          << " (overwritten in api_def_" << op.name()
+          << ".pbtxt) is not defined in OpDef for " << op.name();
+    }
+  }
+}
+
+void TestAllApiDefOutputArgsAreValid(
+    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
+  for (const auto& op : ops.op()) {
+    const auto api_def_iter = api_defs_map.find(op.name());
+    if (api_def_iter == api_defs_map.end()) {
+      continue;
+    }
+    const auto& api_def = api_def_iter->second;
+    for (const auto& api_def_arg : api_def.out_arg()) {
+      bool found_arg = false;
+      for (const auto& op_arg : op.output_arg()) {
+        if (api_def_arg.name() == op_arg.name()) {
+          found_arg = true;
+          break;
+        }
+      }
+      ASSERT_TRUE(found_arg)
+          << "Output argument " << api_def_arg.name()
+          << " (overwritten in api_def_" << op.name()
+          << ".pbtxt) is not defined in OpDef for " << op.name();
+    }
+  }
+}
+
+void TestAllApiDefAttributeNamesAreValid(
+    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
+  for (const auto& op : ops.op()) {
+    const auto api_def_iter = api_defs_map.find(op.name());
+    if (api_def_iter == api_defs_map.end()) {
+      continue;
+    }
+    const auto& api_def = api_def_iter->second;
+    for (const auto& api_def_attr : api_def.attr()) {
+      bool found_attr = false;
+      for (const auto& op_attr : op.attr()) {
+        if (api_def_attr.name() == op_attr.name()) {
+          found_attr = true;
+        }
+      }
+      ASSERT_TRUE(found_attr)
+          << "Attribute " << api_def_attr.name() << " (overwritten in api_def_"
+          << op.name() << ".pbtxt) is not defined in OpDef for " << op.name();
+    }
+  }
+}
+}  // namespace
+
+class BaseApiTest : public ::testing::Test {
  protected:
-  ApiTest() {
+  BaseApiTest() {
     OpRegistry::Global()->Export(false, &ops_);
     const std::vector<string> multi_line_fields = {"description"};
 
@@ -80,7 +165,7 @@ class ApiTest : public ::testing::Test {
 };
 
 // Check that all ops have an ApiDef.
-TEST_F(ApiTest, AllOpsAreInApiDef) {
+TEST_F(BaseApiTest, AllOpsAreInApiDef) {
   auto* excluded_ops = GetExcludedOps();
   for (const auto& op : ops_.op()) {
     if (excluded_ops->find(op.name()) != excluded_ops->end()) {
@@ -94,16 +179,8 @@ TEST_F(ApiTest, AllOpsAreInApiDef) {
 }
 
 // Check that ApiDefs have a corresponding op.
-TEST_F(ApiTest, AllApiDefsHaveCorrespondingOp) {
-  std::unordered_set<string> op_names;
-  for (const auto& op : ops_.op()) {
-    op_names.insert(op.name());
-  }
-  for (const auto& name_and_api_def : api_defs_map_) {
-    ASSERT_TRUE(op_names.find(name_and_api_def.first) != op_names.end())
-        << name_and_api_def.first << " op has ApiDef but missing from ops. "
-        << "Does api_def_" << name_and_api_def.first << " need to be deleted?";
-  }
+TEST_F(BaseApiTest, AllApiDefsHaveCorrespondingOp) {
+  TestAllApiDefsHaveCorrespondingOp(ops_, api_defs_map_);
 }
 
 string GetOpDefHasDocStringError(const string& op_name) {
@@ -117,7 +194,7 @@ string GetOpDefHasDocStringError(const string& op_name) {
 
 // Check that OpDef's do not have descriptions and summaries.
 // Descriptions and summaries must be in corresponding ApiDefs.
-TEST_F(ApiTest, OpDefsShouldNotHaveDocs) {
+TEST_F(BaseApiTest, OpDefsShouldNotHaveDocs) {
   auto* excluded_ops = GetExcludedOps();
   for (const auto& op : ops_.op()) {
     if (excluded_ops->find(op.name()) != excluded_ops->end()) {
@@ -143,62 +220,56 @@ TEST_F(ApiTest, OpDefsShouldNotHaveDocs) {
 
 // Checks that input arg names in an ApiDef match input
 // arg names in corresponding OpDef.
-TEST_F(ApiTest, AllApiDefInputArgsAreValid) {
-  for (const auto& op : ops_.op()) {
-    const auto& api_def = api_defs_map_[op.name()];
-    for (const auto& api_def_arg : api_def.in_arg()) {
-      bool found_arg = false;
-      for (const auto& op_arg : op.input_arg()) {
-        if (api_def_arg.name() == op_arg.name()) {
-          found_arg = true;
-          break;
-        }
-      }
-      ASSERT_TRUE(found_arg)
-          << "Input argument " << api_def_arg.name()
-          << " (overwritten in api_def_" << op.name()
-          << ".pbtxt) is not defined in OpDef for " << op.name();
-    }
-  }
+TEST_F(BaseApiTest, AllApiDefInputArgsAreValid) {
+  TestAllApiDefInputArgsAreValid(ops_, api_defs_map_);
 }
 
 // Checks that output arg names in an ApiDef match output
 // arg names in corresponding OpDef.
-TEST_F(ApiTest, AllApiDefOutputArgsAreValid) {
-  for (const auto& op : ops_.op()) {
-    const auto& api_def = api_defs_map_[op.name()];
-    for (const auto& api_def_arg : api_def.out_arg()) {
-      bool found_arg = false;
-      for (const auto& op_arg : op.output_arg()) {
-        if (api_def_arg.name() == op_arg.name()) {
-          found_arg = true;
-          break;
-        }
-      }
-      ASSERT_TRUE(found_arg)
-          << "Output argument " << api_def_arg.name()
-          << " (overwritten in api_def_" << op.name()
-          << ".pbtxt) is not defined in OpDef for " << op.name();
-    }
-  }
+TEST_F(BaseApiTest, AllApiDefOutputArgsAreValid) {
+  TestAllApiDefOutputArgsAreValid(ops_, api_defs_map_);
 }
 
 // Checks that attribute names in an ApiDef match attribute
 // names in corresponding OpDef.
-TEST_F(ApiTest, AllApiDefAttributeNamesAreValid) {
-  for (const auto& op : ops_.op()) {
-    const auto& api_def = api_defs_map_[op.name()];
-    for (const auto& api_def_attr : api_def.attr()) {
-      bool found_attr = false;
-      for (const auto& op_attr : op.attr()) {
-        if (api_def_attr.name() == op_attr.name()) {
-          found_attr = true;
-        }
-      }
-      ASSERT_TRUE(found_attr)
-          << "Attribute " << api_def_attr.name() << " (overwritten in api_def_"
-          << op.name() << ".pbtxt) is not defined in OpDef for " << op.name();
-    }
+TEST_F(BaseApiTest, AllApiDefAttributeNamesAreValid) {
+  TestAllApiDefAttributeNamesAreValid(ops_, api_defs_map_);
+}
+
+class PythonApiTest : public ::testing::Test {
+ protected:
+  PythonApiTest() {
+    OpRegistry::Global()->Export(false, &ops_);
+    const std::vector<string> multi_line_fields = {"description"};
+
+    Env* env = Env::Default();
+    GetGoldenApiDefs(env, kPythonApiDefDir, &api_defs_map_);
   }
+  OpList ops_;
+  std::unordered_map<string, ApiDef> api_defs_map_;
+};
+
+// Check that ApiDefs have a corresponding op.
+TEST_F(PythonApiTest, AllApiDefsHaveCorrespondingOp) {
+  TestAllApiDefsHaveCorrespondingOp(ops_, api_defs_map_);
 }
+
+// Checks that input arg names in an ApiDef match input
+// arg names in corresponding OpDef.
+TEST_F(PythonApiTest, AllApiDefInputArgsAreValid) {
+  TestAllApiDefInputArgsAreValid(ops_, api_defs_map_);
+}
+
+// Checks that output arg names in an ApiDef match output
+// arg names in corresponding OpDef.
+TEST_F(PythonApiTest, AllApiDefOutputArgsAreValid) {
+  TestAllApiDefOutputArgsAreValid(ops_, api_defs_map_);
+}
+
+// Checks that attribute names in an ApiDef match attribute
+// names in corresponding OpDef.
+TEST_F(PythonApiTest, AllApiDefAttributeNamesAreValid) {
+  TestAllApiDefAttributeNamesAreValid(ops_, api_defs_map_);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 363ff6fae9..a89142d814 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -4230,12 +4230,6 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-filegroup(
-    name = "hidden_ops",
-    srcs = ["ops/hidden_ops.txt"],
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cuda_py_test(
     name = "accumulate_n_benchmark",
     size = "large",
diff --git a/tensorflow/python/build_defs.bzl b/tensorflow/python/build_defs.bzl
index 7f29adc06f..b9056f86e6 100644
--- a/tensorflow/python/build_defs.bzl
+++ b/tensorflow/python/build_defs.bzl
@@ -22,7 +22,6 @@ def tf_gen_op_wrapper_private_py(name, out=None, deps=[],
   bare_op_name = name[:-4] # Strip off the _gen
   tf_gen_op_wrapper_py(name=bare_op_name,
     out=out,
-    hidden_file="ops/hidden_ops.txt",
     visibility=visibility,
     deps=deps,
     require_shape_functions=require_shape_functions,
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 768a5db88a..f12c005511 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -46,6 +46,7 @@ from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 # Import resource_variable_ops for the variables-to-tensor implicit conversion.
@@ -1745,8 +1746,10 @@ class SessionTest(test_util.TensorFlowTestCase):
   def runTestBuildGraphError(self, sess):
     # Ensure that errors from building the graph get propagated.
     data = array_ops.placeholder(dtypes.float32, shape=[])
-    enter_1 = control_flow_ops.enter(data, 'foo_1', False)
-    enter_2 = control_flow_ops.enter(data, 'foo_2', False)
+    # pylint: disable=protected-access
+    enter_1 = gen_control_flow_ops._enter(data, 'foo_1', False)
+    enter_2 = gen_control_flow_ops._enter(data, 'foo_2', False)
+    # pylint: enable=protected-access
     res = math_ops.add(enter_1, enter_2)
     with self.assertRaisesOpError('has inputs from different frames'):
       sess.run(res, feed_dict={data: 1.0})
diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index 90a8779ff8..0f18f28c95 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -756,11 +756,21 @@ from tensorflow.python.util.tf_export import tf_export
   auto out = cleaned_ops.mutable_op();
   out->Reserve(ops.op_size());
   for (const auto& op_def : ops.op()) {
-    bool is_hidden = false;
-    for (const string& hidden : hidden_ops) {
-      if (op_def.name() == hidden) {
-        is_hidden = true;
-        break;
+    const auto* api_def = api_defs.GetApiDef(op_def.name());
+
+    if (api_def->visibility() == ApiDef::SKIP) {
+      continue;
+    }
+
+    // An op is hidden if either its ApiDef visibility is HIDDEN
+    // or it is in the hidden_ops list.
+    bool is_hidden = api_def->visibility() == ApiDef::HIDDEN;
+    if (!is_hidden) {
+      for (const string& hidden : hidden_ops) {
+        if (op_def.name() == hidden) {
+          is_hidden = true;
+          break;
+        }
       }
     }
 
@@ -777,7 +787,6 @@ from tensorflow.python.util.tf_export import tf_export
       continue;
     }
 
-    const auto* api_def = api_defs.GetApiDef(op_def.name());
     strings::StrAppend(&result,
                        GetEagerPythonOp(op_def, *api_def, function_name));
 
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 65810fa709..85cba59be4 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -476,9 +476,6 @@ GenPythonOp::GenPythonOp(const OpDef& op_def, const ApiDef& api_def,
 GenPythonOp::~GenPythonOp() {}
 
 string GenPythonOp::Code() {
-  if (api_def_.visibility() == ApiDef::SKIP) {
-    return "";
-  }
   // This has all the input args followed by those attrs that don't have
   // defaults.
   std::vector<ParamNames> params_no_default;
@@ -805,11 +802,21 @@ from tensorflow.python.util.tf_export import tf_export
   auto out = cleaned_ops.mutable_op();
   out->Reserve(ops.op_size());
   for (const auto& op_def : ops.op()) {
-    bool is_hidden = false;
-    for (const string& hidden : hidden_ops) {
-      if (op_def.name() == hidden) {
-        is_hidden = true;
-        break;
+    const auto* api_def = api_defs.GetApiDef(op_def.name());
+
+    if (api_def->visibility() == ApiDef::SKIP) {
+      continue;
+    }
+
+    // An op is hidden if either its ApiDef visibility is HIDDEN
+    // or it is in the hidden_ops list.
+    bool is_hidden = api_def->visibility() == ApiDef::HIDDEN;
+    if (!is_hidden) {
+      for (const string& hidden : hidden_ops) {
+        if (op_def.name() == hidden) {
+          is_hidden = true;
+          break;
+        }
       }
     }
 
@@ -826,7 +833,6 @@ from tensorflow.python.util.tf_export import tf_export
       continue;
     }
 
-    const auto* api_def = api_defs.GetApiDef(op_def.name());
     strings::StrAppend(&result, GetPythonOp(op_def, *api_def, function_name));
 
     if (!require_shapes) {
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 5d648bb235..4fafc36014 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_logging_ops
 from tensorflow.python.ops import gen_state_ops
@@ -143,7 +144,7 @@ class ControlFlowTest(test.TestCase):
 
       enter_v = control_flow_ops._Enter(v, "foo_1", is_constant=True)
       nine = constant_op.constant(9)
-      enter_nine = control_flow_ops.enter(nine, "foo_1")
+      enter_nine = gen_control_flow_ops._enter(nine, "foo_1")
       op = state_ops.assign(enter_v, enter_nine)
       v2 = control_flow_ops.with_dependencies([op], enter_v)
       v3 = control_flow_ops.exit(v2)
@@ -163,9 +164,9 @@ class ControlFlowTest(test.TestCase):
   def testEnterMulExit(self):
     with self.test_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
-      enter_data = control_flow_ops.enter(data, "foo_1", False)
+      enter_data = gen_control_flow_ops._enter(data, "foo_1", False)
       five = constant_op.constant(5)
-      enter_five = control_flow_ops.enter(five, "foo_1", False)
+      enter_five = gen_control_flow_ops._enter(five, "foo_1", False)
       mul_op = math_ops.multiply(enter_data, enter_five)
       exit_op = control_flow_ops.exit(mul_op)
 
@@ -177,11 +178,12 @@ class ControlFlowTest(test.TestCase):
       v = variables.Variable([0.0, 0.0], dtype=dtypes.float32)
 
       # If is_constant=True, the shape information should be propagated.
-      enter_v_constant = control_flow_ops.enter(v, "frame1", is_constant=True)
+      enter_v_constant = gen_control_flow_ops._enter(
+          v, "frame1", is_constant=True)
       self.assertEqual(enter_v_constant.shape, [2])
 
       # Otherwise, the shape should be unknown.
-      enter_v_non_constant = control_flow_ops.enter(
+      enter_v_non_constant = gen_control_flow_ops._enter(
           v, "frame2", is_constant=False)
       self.assertEqual(enter_v_non_constant.shape, None)
 
@@ -255,8 +257,8 @@ class ControlFlowTest(test.TestCase):
       false = ops.convert_to_tensor(False)
       n = constant_op.constant(10)
 
-      enter_false = control_flow_ops.enter(false, "foo_1", False)
-      enter_n = control_flow_ops.enter(n, "foo_1", False)
+      enter_false = gen_control_flow_ops._enter(false, "foo_1", False)
+      enter_n = gen_control_flow_ops._enter(n, "foo_1", False)
 
       merge_n = control_flow_ops.merge([enter_n, enter_n], name="merge_n")[0]
       switch_n = control_flow_ops.switch(merge_n, enter_false)
@@ -273,9 +275,9 @@ class ControlFlowTest(test.TestCase):
       one = constant_op.constant(1)
       n = constant_op.constant(10)
 
-      enter_i = control_flow_ops.enter(zero, "foo", False)
-      enter_one = control_flow_ops.enter(one, "foo", True)
-      enter_n = control_flow_ops.enter(n, "foo", True)
+      enter_i = gen_control_flow_ops._enter(zero, "foo", False)
+      enter_one = gen_control_flow_ops._enter(one, "foo", True)
+      enter_n = gen_control_flow_ops._enter(n, "foo", True)
 
       with ops.device(test.gpu_device_name()):
         merge_i = control_flow_ops.merge([enter_i, enter_i])[0]
@@ -299,9 +301,9 @@ class ControlFlowTest(test.TestCase):
       one = constant_op.constant(1)
       n = constant_op.constant(10)
 
-      enter_i = control_flow_ops.enter(zero, "foo", False)
-      enter_one = control_flow_ops.enter(one, "foo", True)
-      enter_n = control_flow_ops.enter(n, "foo", True)
+      enter_i = gen_control_flow_ops._enter(zero, "foo", False)
+      enter_one = gen_control_flow_ops._enter(one, "foo", True)
+      enter_n = gen_control_flow_ops._enter(n, "foo", True)
 
       merge_i = control_flow_ops.merge([enter_i, enter_i])[0]
 
@@ -322,8 +324,8 @@ class ControlFlowTest(test.TestCase):
   def testDifferentFrame(self):
     with self.test_session():
       data = array_ops.placeholder(dtypes.float32, shape=[])
-      enter_1 = control_flow_ops.enter(data, "foo_1", False)
-      enter_2 = control_flow_ops.enter(data, "foo_2", False)
+      enter_1 = gen_control_flow_ops._enter(data, "foo_1", False)
+      enter_2 = gen_control_flow_ops._enter(data, "foo_2", False)
       res = math_ops.add(enter_1, enter_2)
       with self.assertRaisesOpError("has inputs from different frames"):
         res.eval(feed_dict={data: 1.0})
diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py
index 39e96f74b0..23185eaeec 100644
--- a/tensorflow/python/kernel_tests/control_flow_util_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_util_test.py
@@ -41,17 +41,17 @@ class ControlFlowUtilTest(test.TestCase):
     self.assertFalse(control_flow_util.IsSwitch(test_ops.int_output().op))
 
   def testIsLoopEnter(self):
-    enter = gen_control_flow_ops.enter(1, frame_name="name").op
+    enter = gen_control_flow_ops._enter(1, frame_name="name").op
     self.assertTrue(control_flow_util.IsLoopEnter(enter))
     self.assertFalse(control_flow_util.IsLoopConstantEnter(enter))
 
-    ref_enter = gen_control_flow_ops.ref_enter(test_ops.ref_output(),
-                                               frame_name="name").op
+    ref_enter = gen_control_flow_ops._ref_enter(test_ops.ref_output(),
+                                                frame_name="name").op
     self.assertTrue(control_flow_util.IsLoopEnter(ref_enter))
     self.assertFalse(control_flow_util.IsLoopConstantEnter(ref_enter))
 
-    const_enter = gen_control_flow_ops.enter(1, frame_name="name",
-                                             is_constant=True).op
+    const_enter = gen_control_flow_ops._enter(1, frame_name="name",
+                                              is_constant=True).op
     self.assertTrue(control_flow_util.IsLoopEnter(const_enter))
     self.assertTrue(control_flow_util.IsLoopConstantEnter(const_enter))
 
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 49191c647d..33a92631d0 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -261,10 +261,10 @@ def _Enter(data,
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype and use_ref:  # pylint: disable=protected-access
-      result = ref_enter(
+      result = gen_control_flow_ops._ref_enter(
           data, frame_name, is_constant, parallel_iterations, name=name)
     else:
-      result = enter(
+      result = gen_control_flow_ops._enter(
           data, frame_name, is_constant, parallel_iterations, name=name)
     if use_input_shape:
       result.set_shape(data.get_shape())
@@ -279,7 +279,7 @@ def _Enter(data,
         parallel_iterations=parallel_iterations,
         use_input_shape=use_input_shape,
         name=name)
-    indices = enter(
+    indices = gen_control_flow_ops._enter(
         data.indices,
         frame_name,
         is_constant,
@@ -290,7 +290,7 @@ def _Enter(data,
     if isinstance(data, ops.IndexedSlices):
       dense_shape = data.dense_shape
       if dense_shape is not None:
-        dense_shape = enter(
+        dense_shape = gen_control_flow_ops._enter(
             dense_shape,
             frame_name,
             is_constant,
@@ -300,7 +300,7 @@ def _Enter(data,
           dense_shape.set_shape(data.dense_shape.get_shape())
       return ops.IndexedSlices(values, indices, dense_shape)
     else:
-      dense_shape = enter(
+      dense_shape = gen_control_flow_ops._enter(
           data.dense_shape,
           frame_name,
           is_constant,
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 8fb6b1cdfd..608a34ab7b 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -17,10 +17,6 @@ py_test(
     name = "api_compatibility_test",
     srcs = ["api_compatibility_test.py"],
     data = [
-        ":convert_from_multiline",
-        "//tensorflow/core/api_def:base_api_def",
-        "//tensorflow/core/api_def:python_api_def",
-        "//tensorflow/python:hidden_ops",
         "//tensorflow/tools/api/golden:api_golden",
         "//tensorflow/tools/api/tests:API_UPDATE_WARNING.txt",
         "//tensorflow/tools/api/tests:README.txt",
@@ -29,7 +25,6 @@ py_test(
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
         "//tensorflow/tools/api/lib:python_object_to_proto_visitor",
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index afcbf50944..c1e09cc531 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -28,10 +28,8 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-from collections import defaultdict
 import os
 import re
-import subprocess
 import sys
 import unittest
 
@@ -39,7 +37,6 @@ import tensorflow as tf
 
 from google.protobuf import text_format
 
-from tensorflow.core.framework import api_def_pb2
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
@@ -67,11 +64,6 @@ _API_GOLDEN_FOLDER = 'tensorflow/tools/api/golden'
 _TEST_README_FILE = 'tensorflow/tools/api/tests/README.txt'
 _UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt'
 
-_CONVERT_FROM_MULTILINE_SCRIPT = 'tensorflow/tools/api/tests/convert_from_multiline'
-_BASE_API_DIR = 'tensorflow/core/api_def/base_api'
-_PYTHON_API_DIR = 'tensorflow/core/api_def/python_api'
-_HIDDEN_OPS_FILE = 'tensorflow/python/ops/hidden_ops.txt'
-
 
 def _KeyToFilePath(key):
   """From a given key, construct a filepath."""
@@ -96,55 +88,6 @@ def _FileNameToKey(filename):
   return api_object_key
 
 
-def _GetSymbol(symbol_id):
-  """Get TensorFlow symbol based on the given identifier.
-
-  Args:
-    symbol_id: Symbol identifier in the form module1.module2. ... .sym.
-
-  Returns:
-    Symbol corresponding to the given id.
-  """
-  # Ignore first module which should be tensorflow
-  symbol_id_split = symbol_id.split('.')[1:]
-  symbol = tf
-  for sym in symbol_id_split:
-    symbol = getattr(symbol, sym)
-  return symbol
-
-
-def _IsGenModule(module_name):
-  if not module_name:
-    return False
-  module_name_split = module_name.split('.')
-  return module_name_split[-1].startswith('gen_')
-
-
-def _GetHiddenOps():
-  hidden_ops_file = file_io.FileIO(_HIDDEN_OPS_FILE, 'r')
-  hidden_ops = set()
-  for line in hidden_ops_file:
-    line = line.strip()
-    if not line:
-      continue
-    if line[0] == '#':  # comment line
-      continue
-    # If line is of the form "op_name # comment", only keep the op_name.
-    line_split = line.split('#')
-    hidden_ops.add(line_split[0].strip())
-  return hidden_ops
-
-
-def _GetGoldenApiDefs():
-  old_api_def_files = file_io.get_matching_files(_GetApiDefFilePath('*'))
-  return {file_path: file_io.read_file_to_string(file_path)
-          for file_path in old_api_def_files}
-
-
-def _GetApiDefFilePath(graph_op_name):
-  return os.path.join(_PYTHON_API_DIR, 'api_def_%s.pbtxt' % graph_op_name)
-
-
 class ApiCompatibilityTest(test.TestCase):
 
   def __init__(self, *args, **kwargs):
@@ -287,188 +230,6 @@ class ApiCompatibilityTest(test.TestCase):
         update_goldens=FLAGS.update_goldens)
 
 
-class ApiDefTest(test.TestCase):
-
-  def __init__(self, *args, **kwargs):
-    super(ApiDefTest, self).__init__(*args, **kwargs)
-    self._first_cap_pattern = re.compile('(.)([A-Z][a-z]+)')
-    self._all_cap_pattern = re.compile('([a-z0-9])([A-Z])')
-
-  def _GenerateLowerCaseOpName(self, op_name):
-    lower_case_name = self._first_cap_pattern.sub(r'\1_\2', op_name)
-    return self._all_cap_pattern.sub(r'\1_\2', lower_case_name).lower()
-
-  def _CreatePythonApiDef(self, base_api_def, endpoint_names):
-    """Creates Python ApiDef that overrides base_api_def if needed.
-
-    Args:
-      base_api_def: (api_def_pb2.ApiDef) base ApiDef instance.
-      endpoint_names: List of Python endpoint names.
-
-    Returns:
-      api_def_pb2.ApiDef instance with overrides for base_api_def
-      if module.name endpoint is different from any existing
-      endpoints in base_api_def. Otherwise, returns None.
-    """
-    endpoint_names_set = set(endpoint_names)
-
-    # If the only endpoint is equal to graph_op_name then
-    # it is equivalent to having no endpoints.
-    if (not base_api_def.endpoint and len(endpoint_names) == 1
-        and endpoint_names[0] ==
-        self._GenerateLowerCaseOpName(base_api_def.graph_op_name)):
-      return None
-
-    base_endpoint_names_set = {
-        self._GenerateLowerCaseOpName(endpoint.name)
-        for endpoint in base_api_def.endpoint}
-
-    if endpoint_names_set == base_endpoint_names_set:
-      return None  # All endpoints are the same
-
-    api_def = api_def_pb2.ApiDef()
-    api_def.graph_op_name = base_api_def.graph_op_name
-
-    for endpoint_name in sorted(endpoint_names):
-      new_endpoint = api_def.endpoint.add()
-      new_endpoint.name = endpoint_name
-
-    return api_def
-
-  def _GetBaseApiMap(self):
-    """Get a map from graph op name to its base ApiDef.
-
-    Returns:
-      Dictionary mapping graph op name to corresponding ApiDef.
-    """
-    # Convert base ApiDef in Multiline format to Proto format.
-    converted_base_api_dir = os.path.join(
-        test.get_temp_dir(), 'temp_base_api_defs')
-    subprocess.check_call(
-        [os.path.join(resource_loader.get_root_dir_with_all_resources(),
-                      _CONVERT_FROM_MULTILINE_SCRIPT),
-         _BASE_API_DIR, converted_base_api_dir])
-
-    name_to_base_api_def = {}
-    base_api_files = file_io.get_matching_files(
-        os.path.join(converted_base_api_dir, 'api_def_*.pbtxt'))
-    for base_api_file in base_api_files:
-      if file_io.file_exists(base_api_file):
-        api_defs = api_def_pb2.ApiDefs()
-        text_format.Merge(
-            file_io.read_file_to_string(base_api_file), api_defs)
-        for api_def in api_defs.op:
-          name_to_base_api_def[api_def.graph_op_name] = api_def
-    return name_to_base_api_def
-
-  def _AddHiddenOpOverrides(self, name_to_base_api_def, api_def_map):
-    """Adds ApiDef overrides to api_def_map for hidden Python ops.
-
-    Args:
-      name_to_base_api_def: Map from op name to base api_def_pb2.ApiDef.
-      api_def_map: Map from file path to api_def_pb2.ApiDefs for Python API
-        overrides.
-    """
-    hidden_ops = _GetHiddenOps()
-    for hidden_op in hidden_ops:
-      if hidden_op not in name_to_base_api_def:
-        logging.warning('Unexpected hidden op name: %s' % hidden_op)
-        continue
-
-      base_api_def = name_to_base_api_def[hidden_op]
-      if base_api_def.visibility != api_def_pb2.ApiDef.HIDDEN:
-        api_def = api_def_pb2.ApiDef()
-        api_def.graph_op_name = base_api_def.graph_op_name
-        api_def.visibility = api_def_pb2.ApiDef.HIDDEN
-
-        file_path = _GetApiDefFilePath(base_api_def.graph_op_name)
-        api_def_map[file_path].op.extend([api_def])
-
-  @unittest.skipUnless(
-      sys.version_info.major == 2 and os.uname()[0] == 'Linux',
-      'API compabitility test goldens are generated using python2 on Linux.')
-  def testAPIDefCompatibility(self):
-    # Get base ApiDef
-    name_to_base_api_def = self._GetBaseApiMap()
-    snake_to_camel_graph_op_names = {
-        self._GenerateLowerCaseOpName(name): name
-        for name in name_to_base_api_def.keys()}
-    # Extract Python API
-    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
-    public_api_visitor = public_api.PublicAPIVisitor(visitor)
-    public_api_visitor.do_not_descend_map['tf'].append('contrib')
-    traverse.traverse(tf, public_api_visitor)
-    proto_dict = visitor.GetProtos()
-
-    # Map from file path to Python ApiDefs.
-    new_api_defs_map = defaultdict(api_def_pb2.ApiDefs)
-    # We need to override all endpoints even if 1 endpoint differs from base
-    # ApiDef. So, we first create a map from an op to all its endpoints.
-    op_to_endpoint_name = defaultdict(list)
-
-    # Generate map from generated python op to endpoint names.
-    for public_module, value in proto_dict.items():
-      module_obj = _GetSymbol(public_module)
-      for sym in value.tf_module.member_method:
-        obj = getattr(module_obj, sym.name)
-
-        # Check if object is defined in gen_* module. That is,
-        # the object has been generated from OpDef.
-        if hasattr(obj, '__module__') and _IsGenModule(obj.__module__):
-          if obj.__name__ not in snake_to_camel_graph_op_names:
-            # Symbol might be defined only in Python and not generated from
-            # C++ api.
-            continue
-          relative_public_module = public_module[len('tensorflow.'):]
-          full_name = (relative_public_module + '.' + sym.name
-                       if relative_public_module else sym.name)
-          op_to_endpoint_name[obj].append(full_name)
-
-    # Generate Python ApiDef overrides.
-    for op, endpoint_names in op_to_endpoint_name.items():
-      graph_op_name = snake_to_camel_graph_op_names[op.__name__]
-      api_def = self._CreatePythonApiDef(
-          name_to_base_api_def[graph_op_name], endpoint_names)
-
-      if api_def:
-        file_path = _GetApiDefFilePath(graph_op_name)
-        api_defs = new_api_defs_map[file_path]
-        api_defs.op.extend([api_def])
-
-    self._AddHiddenOpOverrides(name_to_base_api_def, new_api_defs_map)
-
-    old_api_defs_map = _GetGoldenApiDefs()
-    for file_path, new_api_defs in new_api_defs_map.items():
-      # Get new ApiDef string.
-      new_api_defs_str = str(new_api_defs)
-
-      # Get current ApiDef for the given file.
-      old_api_defs_str = (
-          old_api_defs_map[file_path] if file_path in old_api_defs_map else '')
-
-      if old_api_defs_str == new_api_defs_str:
-        continue
-
-      if FLAGS.update_goldens:
-        logging.info('Updating %s...' % file_path)
-        file_io.write_string_to_file(file_path, new_api_defs_str)
-      else:
-        self.assertMultiLineEqual(
-            old_api_defs_str, new_api_defs_str,
-            'To update golden API files, run api_compatibility_test locally '
-            'with --update_goldens=True flag.')
-
-    for file_path in set(old_api_defs_map) - set(new_api_defs_map):
-      if FLAGS.update_goldens:
-        logging.info('Deleting %s...' % file_path)
-        file_io.delete_file(file_path)
-      else:
-        self.fail(
-            '%s file is no longer needed and should be removed.'
-            'To update golden API files, run api_compatibility_test locally '
-            'with --update_goldens=True flag.' % file_path)
-
-
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
-- 
GitLab


From 08b68c87a718cc8d32f4725d053cc192b61f3c64 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 10:46:34 -0800
Subject: [PATCH 583/966] Automated g4 rollback of changelist 184273245

PiperOrigin-RevId: 184303789
---
 tensorflow/python/ops/array_grad.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 9745d38dc2..c9292184e6 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -27,7 +27,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
@@ -116,19 +115,6 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
                                                         non_neg_concat_dim)
       out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
     else:
-      if constant_op.is_constant(concat_dim):
-        # If concat_dim is a constant defined in a different context,
-        # then we duplicate it in the current context to avoid passing it
-        # through an Enter node.
-        # This is a small optimization in general, but it is required when
-        # compiling with XLA, as XLA needs the concat input to be folded into a
-        # constant.
-        grad_context = control_flow_util.GetOutputContext(grad.op)
-        dim_context = control_flow_util.GetOutputContext(concat_dim.op)
-        if dim_context != grad_context:
-          value = tensor_util.constant_value(concat_dim)
-          concat_dim = constant_op.constant(value=value, dtype=concat_dim.dtype)
-
       # Using mod here for convenience since concat_dim is already verified
       # in concat implementation to be within the allowed [-rank, rank) range.
       non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])
-- 
GitLab


From bc66b6a4c3a11231b0d19e6a218a42999ee3cb77 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Fri, 2 Feb 2018 11:13:44 -0800
Subject: [PATCH 584/966] Revert "Updating the version to 1.6.0-rc0." (#16700)

---
 .../eager/python/examples/mnist/mnist.py      |  2 +-
 .../contrib/tpu/profiler/pip_package/setup.py |  2 +-
 tensorflow/core/public/version.h              |  4 ++--
 tensorflow/docs_src/install/install_c.md      |  2 +-
 tensorflow/docs_src/install/install_go.md     |  2 +-
 tensorflow/docs_src/install/install_java.md   | 22 +++++++++----------
 tensorflow/docs_src/install/install_linux.md  | 22 +++++++++----------
 tensorflow/docs_src/install/install_mac.md    | 10 ++++-----
 .../docs_src/install/install_sources.md       | 10 +++------
 tensorflow/tools/docker/Dockerfile.devel      |  2 +-
 .../tools/docker/Dockerfile.devel-cpu-mkl     |  2 +-
 tensorflow/tools/docker/Dockerfile.devel-gpu  |  2 +-
 tensorflow/tools/pip_package/setup.py         |  2 +-
 13 files changed, 40 insertions(+), 44 deletions(-)

diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
index 772f59562b..ed7dbc8904 100644
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
@@ -39,7 +39,7 @@ class MNISTModel(tfe.Network):
   """MNIST Network.
 
   Network structure is equivalent to:
-  https://github.com/tensorflow/tensorflow/blob/r1.6/tensorflow/examples/tutorials/mnist/mnist_deep.py
+  https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
   and
   https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
 
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index cb61984799..3dffebe668 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from setuptools import setup
 
-_VERSION = '1.6.0-rc0'
+_VERSION = '1.5.0-rc1'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 50bfa91267..b02f899b87 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 6
+#define TF_MINOR_VERSION 5
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc0"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index a783205b4a..14add7c77e 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 5249e04615..d2af9d9843 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.6.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 0c6c773e62..e5388c4b1e 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.6.0-rc0</version>
+  <version>1.5.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.6.0-rc0</version>
+                 <version>1.5.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.6.0-rc0</version>
+  <version>1.5.0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.6.0-rc0</version>
+  <version>1.5.0</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0-rc0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.6.0-rc0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.5.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.6.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.5.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.6.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.5.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 105b225177..cd8c14599f 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -293,7 +293,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -705,14 +705,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index a6ea548cfb..f49d3a2f08 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -115,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -238,7 +238,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -347,7 +347,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -520,7 +520,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl
 </pre>
 
 
@@ -528,5 +528,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 36dffd85dc..bc7d2080dc 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -359,10 +359,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.6.0rc0 on Linux:
+for TensorFlow 1.5.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.6.0rc0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -460,8 +460,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
+
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
@@ -479,7 +478,6 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@@ -493,8 +491,6 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index d16761c367..5dc4a053fd 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -70,7 +70,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.5 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 3690e7dfe5..96b260ad3a 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.6
+ARG TF_BRANCH=r1.5
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 4ef37881bc..07ffd3839a 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -79,7 +79,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.5 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 2002786999..d7fab2b93a 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.6.0-rc0'
+_VERSION = '1.5.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
-- 
GitLab


From 123c613fd8ac5b1f11681849e4e009d4bcb89aa4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 10:58:12 -0800
Subject: [PATCH 585/966] Register GPU host kernels for Identity and
 RefIdentity.

PiperOrigin-RevId: 184305574
---
 tensorflow/core/kernels/identity_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
index 1db9263e5d..a18a72c66d 100644
--- a/tensorflow/core/kernels/identity_op.cc
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -128,6 +128,7 @@ REGISTER_GPU_KERNEL(Variant);
 
 REGISTER_GPU_HOST_KERNEL(int32);
 REGISTER_GPU_HOST_KERNEL(bool);
+REGISTER_GPU_HOST_KERNEL(string);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
-- 
GitLab


From 4b83dea191761967e5d4c705caac6078f8360a1e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 11:06:09 -0800
Subject: [PATCH 586/966] Add shape inference for outside_compilation graph
 rewrite. Pull out enough of the graph to enable inference of the shape of a
 SendFromHost Op once the shape of corresponding RecvAtHost Ops are known.
 END_PUBLIC

Fixed open source build breaks.

BEGIN_PUBLIC
Automated g4 rollback of changelist 184169668

PiperOrigin-RevId: 184306845
---
 .../jit/encapsulate_subgraphs_pass.cc         | 649 ++++++++++++++---
 .../jit/encapsulate_subgraphs_pass_test.cc    | 682 ++++++++++++++----
 tensorflow/core/framework/function.cc         |  22 +-
 tensorflow/core/framework/function.h          |  15 +-
 4 files changed, 1127 insertions(+), 241 deletions(-)

diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 0de163d3a8..9c372a0127 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -30,12 +30,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -141,8 +143,7 @@ struct NodeSlot {
 // everything to use it.
 static const char* const kArgOp = "_Arg";
 static const char* const kRetValOp = "_Retval";
-static const char* const kSendToHostOp = "_XlaSendToHost";
-static const char* const kRecvFromHostOp = "_XlaRecvFromHost";
+static const char* const kHostComputeOp = "_XlaHostCompute";
 static const char* const kSendFromHostOp = "_XlaSendFromHost";
 static const char* const kRecvAtHostOp = "_XlaRecvAtHost";
 
@@ -171,7 +172,8 @@ class Encapsulator {
 
   // Write a copy of the input graph to 'graph_out', where the subgraphs are
   // replaced with calls to the new functions.
-  Status BuildOutputGraph(bool parallel_checking, Graph* graph_out);
+  Status BuildOutputGraph(bool parallel_checking, Graph* graph_out,
+                          FunctionLibraryDefinition* library);
 
  private:
   // A subgraph of the input, all marked with a common 'group_attribute'
@@ -201,21 +203,29 @@ class Encapsulator {
   //     ..             .
   //  RAH -->  C  --> SFH
   //
-  // The compiled cluster is as follows. STH is a SendToHost node which is the
-  // source of a channel to the RAH node above. RFH is a RecvFromHost node which
-  // is the destination of a channel from the SFH node above. There is a control
-  // edge that ensures RFH follows STH, which is used in shape inference to
-  // ensure that the shapes on the STH host channel are known before the RFH
-  // channel is compiled.
+  // The compiled cluster is as follows. HC is a HostCompute node which is the
+  // source of a channel to the RAH node above and the destination of a channel
+  // from the SFH node above.
   //
-  //  Arg  --> B  --> STH  ..>  RFH  --> D --> Retval
+  //  Arg  --> B  --> HC  --> D --> Retval
   //
-  // The channels STH/RAH and SFH/RFH each transmit a tuple, so there is at most
-  // one RAH and SFH in each compiled cluster. This design is preferred over
-  // adding separate Arg/Retval nodes for each transmitted value because it
-  // simplifies the host code that would like to limit communication between
-  // host and device and, e.g., raise only one interrupt per channel rather than
-  // one per transmitted value.
+  // The channels HC/RAH and SFH/HC each transmit multiple tensors, so there is
+  // at most one RAH and SFH in each outside_compilation cluster. This design is
+  // preferred over adding separate Arg/Retval nodes for each transmitted value
+  // because it allows optimizations to the host code that would like to limit
+  // communication between host and device and, e.g., raise only one interrupt
+  // per channel rather than one per transmitted value.
+  //
+  // The shapes of the outputs from the HC node in general cannot be determined
+  // until the shapes of its inputs are known at compile time, since e.g.,
+  // above, the shape of C's outputs aren't known until the shape of its inputs
+  // are known. If the shapes of the HC's outputs can be determined during the
+  // rewrite, they are stored in the node's 'shapes' attr. Otherwise a minimal
+  // graph is stored in the shape_inference_graph attr. This graph can be used
+  // when compiling the HC Op to determined the shape of the SFH inputs given
+  // the shapes of any ancestor RAH outputs. If it can be determined that the
+  // shape of the SFH inputs will not be inferrable even once the shapes of the
+  // RAH outputs are known, an error is returned by the rewriter.
   class Subgraph {
    public:
     // Creates a graph to build the subgraph in, if it doesn't already exist,
@@ -246,6 +256,10 @@ class Encapsulator {
         const std::unordered_map<const Node*, Node*>& node_images,
         Graph* graph_out);
 
+    // Returns the names of all the outside_compilation subgraphs in this
+    // Subgraph.
+    void GetOutsideCompilationSubgraphNames(std::vector<string>* names) const;
+
     // Returns the Node that inputs to the function should be wired up to.
     Node* GetCallNodeForInputs() const;
 
@@ -305,15 +319,9 @@ class Encapsulator {
     void RecordOutsideCompilationOutputOrControl(
         const string& outside_compilation_id, const Edge* edge);
 
-    // Adds the SendToHost nodes for each outside_compilation subgraph once the
-    // edges have all been recorded via RecordOutsideCompilationInputOrControl.
-    Status AddSendsToOutsideCompilation(
-        const std::unordered_map<const Node*, Node*>& node_images);
-
-    // Adds the RecvFromHost nodes for each outside_compilation subgraph once
-    // the edges have all been recorded via
-    // RecordOutsideCompilationOutputOrControl.
-    Status AddRecvsFromOutsideCompilation(
+    // Adds the HostCompute nodes for each outside_compilation subgraph.
+    Status AddHostComputes(
+        const string& subgraph_name,
         const std::unordered_map<const Node*, Node*>& node_images);
 
     // Creates the sequencer node if it doesn't exist, adding it to graph_out.
@@ -323,10 +331,16 @@ class Encapsulator {
     // all the downstream nodes of call_node_outputs.
     void ConnectSequencerToOutputs(Graph* graph_out);
 
+    Status AddShapeInferenceInfo(
+        const string& outside_compilation_subgraph_name,
+        const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph);
+
+    Status ReplaceFunctionDef(FunctionLibraryDefinition* library);
+
    private:
     struct OutsideCompilationSubgraph {
       // Map from source (producer node/slot) tensors in the original graph to
-      // input index (slot number in the SendToHost/RecvAtHost nodes that will
+      // input index (slot number in the HostCompute/RecvAtHost nodes that will
       // be created) for the outside_compilation subgraph.
       std::unordered_map<NodeSlot, int, NodeSlot::Hasher> inputs;
 
@@ -335,14 +349,14 @@ class Encapsulator {
       // outside_compilation subgraph. These are recorded by
       // RecordOutsideCompilationInputOrControl while walking all the subgraph
       // edges, and lifted control edges within the subgraph are added by
-      // AddSendsToOutsideCompilation once the _SendToHost node has been
+      // AddSendsToOutsideCompilation once the _HostCompute node has been
       // created. The matching control edge from _RecvAtHost to the
       // destination is added by CopyEdgeToOutputGraph.
       std::unordered_set<const Node*> control_inputs;
 
       // Maps from source (producer node/slot) and destination (consumer
       // node/slot) tensors in the original graph to output index (slot number
-      // in the SendFromHost/RecvFromHost nodes that will be created) for the
+      // in the SendFromHost/HostCompute nodes that will be created) for the
       // outside_compilation subgraph.
       std::unordered_map<NodeSlot, int, NodeSlot::Hasher> outputs_by_src;
       std::unordered_map<NodeSlot, int, NodeSlot::Hasher> outputs_by_dst;
@@ -352,13 +366,13 @@ class Encapsulator {
       // containing compiled subgraph. These are recorded by
       // RecordOutsideCompilationOutputOrControl while walking all the subgraph
       // edges, and lifted control edges within the subgraph are added by
-      // AddRecvsFromToOutsideCompilation once the _RecvFromHost node has been
+      // AddRecvsFromToOutsideCompilation once the _HostCompute node has been
       // created. The matching control edge from the source to _SendFromHost to
       // the destination is added by CopyEdgeToOutputGraph.
       std::unordered_set<const Node*> control_outputs;
 
-      // _SendToHost node in the subgraph. Not owned.
-      Node* send_to_host = nullptr;
+      // Name of the _HostCompute node in the subgraph.
+      string host_compute_name;
 
       // _RecvAtHost node in the output graph. Not owned.
       Node* recv_at_host = nullptr;
@@ -516,6 +530,59 @@ class Encapsulator {
       const std::unordered_map<const Node*, Node*>& node_images,
       bool parallel_checking, Graph* graph_out);
 
+  // Constructs a minimal shape inference graph that can be used to determine
+  // the shape of send_node at the time that the subgraph is compiled.
+  // recv_at_host_nodes contains the names of all the recv_at_host nodes that
+  // send_node might depend on. These recv_at_host nodes have shapes that are
+  // not known during the rewrite pass, but will be known at compile time.
+  //
+  // If the shapes of all the inputs to send_node can be determined during the
+  // rewrite pass, on exit graphdef_out is empty and the shapes are returned in
+  // static_shape_out. Otherwise graphdef_out contains a graph that can be used
+  // for shape inference at compile time, where all the source nodes of the
+  // graph are either constants with known shapes, or nodes named in
+  // recv_at_host_nodes.
+  //
+  // A non-OK status is returned if neither of the above conditions can be
+  // satisfied, e.g., because send_node depends on a node that doesn't have a
+  // registered shape inference function.
+  Status DoStaticShapeInferenceForOutsideCompilationSend(
+      const Graph& graph_in, const ShapeRefiner& shape_refiner,
+      const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
+      FunctionLibraryDefinition* library,
+      std::vector<TensorShapeProto>* static_shape_out,
+      std::unique_ptr<GraphDef>* graphdef_out);
+
+  // Makes a copy of graph containing only nodes that are ancestors of at least
+  // one node in send_from_host_nodes and store it in pruned_graph. On exit
+  // nodes_images contains a mapping from nodes in graph to nodes in
+  // pruned_graph. All functions in the copied graph are inlined.
+  Status MakePrunedGraphCopyAndInline(
+      const Graph& graph, const std::vector<Node*>& sink_nodes,
+      std::unique_ptr<Graph>* pruned_graph,
+      std::unordered_map<const Node*, Node*>* node_images,
+      FunctionLibraryDefinition* library);
+
+  // Makes a copy of graph containing only nodes that are ancestors of a
+  // send_from_host node in an outside_compilation subgraph, and store it in
+  // pruned_graph. Also perform shape inference on the pruned graph, using
+  // shape_refiner. On exit node_images contains a mapping from nodes in graph
+  // to nodes in pruned_graph.
+  Status MakeGraphForOutsideCompilationSends(
+      const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
+      ShapeRefiner* shape_refiner,
+      std::unordered_map<const Node*, Node*>* node_images,
+      FunctionLibraryDefinition* library);
+
+  // Performs static shape inference, as far as possible, for the send_from_host
+  // nodes in each outside_compilation subgraph. Where it is not possible to
+  // determine the shape statically, stores a serialized GraphDef in the
+  // HostCompute 'shape_inference_graph' attr, to be used at compile time for
+  // final inference. If the shapes are known statically they are stored in the
+  // HostCompute 'shapes' attr.
+  Status GetShapeInfoForOutsideCompilationSends(
+      Graph* graph_out, FunctionLibraryDefinition* library);
+
   const string group_attribute_;
   const string outside_compilation_attribute_;
   const Graph* graph_in_;
@@ -682,16 +749,20 @@ void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl(
   }
 }
 
-Status Encapsulator::Subgraph::AddSendsToOutsideCompilation(
+Status Encapsulator::Subgraph::AddHostComputes(
+    const string& subgraph_name,
     const std::unordered_map<const Node*, Node*>& node_images) {
   for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) {
     const string& oc_subgraph_name = oc_subgraph_iter.first;
     OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second;
-    if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty()) {
-      // Build a _SendToHost node sending all the args of the appropriate
-      // types.
-      std::vector<DataType> dtypes(oc_subgraph.inputs.size(), DT_INVALID);
+    if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() ||
+        !oc_subgraph.outputs_by_src.empty() ||
+        !oc_subgraph.control_outputs.empty()) {
+      // Build a _HostCompute node.
       std::vector<NodeDefBuilder::NodeOut> inputs(oc_subgraph.inputs.size());
+      std::vector<DataType> input_dtypes(oc_subgraph.inputs.size(), DT_INVALID);
+      std::vector<DataType> output_dtypes(oc_subgraph.outputs_by_src.size(),
+                                          DT_INVALID);
 
       for (const auto& input_src : oc_subgraph.inputs) {
         const Node* src_node = input_src.first.node;
@@ -700,94 +771,64 @@ Status Encapsulator::Subgraph::AddSendsToOutsideCompilation(
         int input_index = input_src.second;
 
         DataType dtype = src_node->output_type(src_slot);
-        dtypes[input_index] = dtype;
         inputs[input_index].Reset(src_image->name(), src_slot, dtype);
+        input_dtypes[input_index] = dtype;
       }
 
-      NodeDef send_def;
-      NodeDefBuilder builder(
-          strings::StrCat("outside_compilation_", oc_subgraph_name, "_send"),
-          kSendToHostOp);
-      builder.Attr("dtypes", dtypes);
+      for (const auto& output : oc_subgraph.outputs_by_src) {
+        DataType dtype = output.first.dtype;
+        int output_index = output.second;
+        output_dtypes[output_index] = dtype;
+      }
+
+      NodeDef host_compute_def;
+      NodeDefBuilder builder(strings::StrCat("outside_compilation_",
+                                             oc_subgraph_name, "_host_compute"),
+                             kHostComputeOp);
       builder.Input(inputs);
-      Status s = builder.Finalize(&send_def);
+      builder.Attr("Tinputs", input_dtypes);
+      builder.Attr("Toutputs", output_dtypes);
+      builder.Attr("key",
+                   strings::StrCat("host_compute_channel_", subgraph_name, "_",
+                                   oc_subgraph_name));
+      Status s = builder.Finalize(&host_compute_def);
       if (!s.ok()) return s;
 
-      oc_subgraph.send_to_host = graph_->AddNode(send_def, &s);
+      Node* host_compute = graph_->AddNode(host_compute_def, &s);
       if (!s.ok()) return s;
+      oc_subgraph.host_compute_name = host_compute->name();
 
-      // Connect the _SendToHost node to its producers in the subgraph.
+      // Connect the _HostCompute node to its producers in the subgraph.
       for (auto& input_src : oc_subgraph.inputs) {
         const Node* src_node = input_src.first.node;
         Node* src_image = node_images.at(src_node);
         int src_slot = input_src.first.slot;
         int input_index = input_src.second;
-        graph_->AddEdge(src_image, src_slot, oc_subgraph.send_to_host,
-                        input_index);
+        graph_->AddEdge(src_image, src_slot, host_compute, input_index);
       }
 
-      // Connect the _SendToHost node to its control edge producers in the
+      // Connect the _HostCompute node to its control edge producers in the
       // subgraph.
       for (const auto& src_node : oc_subgraph.control_inputs) {
         Node* src_image = node_images.at(src_node);
-        graph_->AddControlEdge(src_image, oc_subgraph.send_to_host);
-      }
-    }
-  }
-
-  return Status::OK();
-}
-
-Status Encapsulator::Subgraph::AddRecvsFromOutsideCompilation(
-    const std::unordered_map<const Node*, Node*>& node_images) {
-  for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) {
-    const string& oc_subgraph_name = oc_subgraph_iter.first;
-    OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second;
-    if (!oc_subgraph.outputs_by_src.empty() ||
-        !oc_subgraph.control_outputs.empty()) {
-      // Build a _RecvFromHost node producing all the outputs of the appropriate
-      // types.
-      std::vector<DataType> dtypes(oc_subgraph.outputs_by_src.size(),
-                                   DT_INVALID);
-
-      for (const auto& output : oc_subgraph.outputs_by_src) {
-        DataType dtype = output.first.dtype;
-        int output_index = output.second;
-        dtypes[output_index] = dtype;
+        graph_->AddControlEdge(src_image, host_compute);
       }
 
-      NodeDef recv_def;
-      NodeDefBuilder builder(
-          strings::StrCat("outside_compilation_", oc_subgraph_name, "_recv"),
-          kRecvFromHostOp);
-      builder.Attr("dtypes", dtypes);
-      Status s = builder.Finalize(&recv_def);
-      if (!s.ok()) return s;
-
-      Node* recv = graph_->AddNode(recv_def, &s);
-      if (!s.ok()) return s;
-
-      // Connect the consumers in the subgraph to the _RecvFromHost node.
+      // Connect the consumers in the subgraph to the _HostCompute node.
       for (const auto& output : oc_subgraph.outputs_by_dst) {
         const Node* dst_node = output.first.node;
         Node* dst_image = node_images.at(dst_node);
         int dst_slot = output.first.slot;
         int output_index = output.second;
 
-        graph_->AddEdge(recv, output_index, dst_image, dst_slot);
+        graph_->AddEdge(host_compute, output_index, dst_image, dst_slot);
       }
 
-      // Connect the control edge consumers in the subgraph to the _RecvFromHost
+      // Connect the control edge consumers in the subgraph to the _HostCompute
       // node.
       for (const auto& dst_node : oc_subgraph.control_outputs) {
         Node* dst_image = node_images.at(dst_node);
-        graph_->AddControlEdge(recv, dst_image);
-      }
-
-      // Add a control edge in the subgraph so that the _SendToHost node, if
-      // any, is compiled before the _RecvFromHost node.
-      if (oc_subgraph.send_to_host != nullptr) {
-        graph_->AddControlEdge(oc_subgraph.send_to_host, recv);
+        graph_->AddControlEdge(host_compute, dst_image);
       }
     }
   }
@@ -882,6 +923,63 @@ Status Encapsulator::Subgraph::BuildFunctionDef(
   return Status::OK();
 }
 
+Status Encapsulator::Subgraph::AddShapeInferenceInfo(
+    const string& outside_compilation_subgraph_name,
+    const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph) {
+  OutsideCompilationSubgraph& oc_subgraph =
+      outside_compilation_subgraphs_.at(outside_compilation_subgraph_name);
+
+  Node* host_compute = nullptr;
+  for (Node* n : graph_->nodes()) {
+    if (n->name() == oc_subgraph.host_compute_name) {
+      host_compute = n;
+      break;
+    }
+  }
+  if (host_compute == nullptr) {
+    return errors::InvalidArgument(
+        "After rewriting subgraph ", outside_compilation_subgraph_name,
+        " there is no HostCompute Op for outside compilation subgraph ",
+        oc_subgraph.host_compute_name);
+  }
+
+  if (inference_graph == nullptr) {
+    host_compute->AddAttr("shape_inference_graph", "");
+    host_compute->AddAttr("shapes", shapes);
+  } else {
+    string serialized_graph;
+    if (!inference_graph->SerializeToString(&serialized_graph)) {
+      return errors::Internal(
+          "Failed to serialize graph for outside compilation subgraph ",
+          oc_subgraph.host_compute_name);
+    }
+    host_compute->AddAttr("shape_inference_graph", serialized_graph);
+    host_compute->AddAttr("shapes", std::vector<TensorShapeProto>());
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::Subgraph::ReplaceFunctionDef(
+    FunctionLibraryDefinition* library) {
+  const string& name = call_node_def_.name();
+
+  FunctionDef fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef));
+
+  if (VLOG_IS_ON(1)) {
+    VLOG(2) << "Replace function def " << name;
+    dump_graph::DumpGraphToFile(
+        strings::StrCat("replace_encapsulate_fdef_graph_", name), *graph_,
+        library);
+    dump_graph::DumpFunctionDefToFile(
+        strings::StrCat("replace_encapsulate_fdef_", name), fdef);
+  }
+
+  TF_RETURN_IF_ERROR(library->RemoveFunction(name));
+  TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+  return Status::OK();
+}
+
 Status Encapsulator::Subgraph::BuildParallelCheckOp(
     const std::unordered_map<const Node*, Node*>& node_images,
     Graph* graph_out) {
@@ -980,7 +1078,9 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
   NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
                                          "_", oc_subgraph_name, "_recv"),
                          kRecvAtHostOp);
-  builder.Attr("dtypes", dtypes);
+  builder.Attr("Toutputs", dtypes);
+  builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
+                                      "_", oc_subgraph_name));
   Status s = builder.Finalize(&recv_def);
   if (!s.ok()) return s;
 
@@ -1020,7 +1120,9 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
   NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
                                          "_", oc_subgraph_name, "_send"),
                          kSendFromHostOp);
-  builder.Attr("dtypes", dtypes);
+  builder.Attr("Tinputs", dtypes);
+  builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
+                                      "_", oc_subgraph_name));
   builder.Input(inputs);
   Status s = builder.Finalize(&send_def);
   if (!s.ok()) return s;
@@ -1062,6 +1164,13 @@ Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes(
   return Status::OK();
 }
 
+void Encapsulator::Subgraph::GetOutsideCompilationSubgraphNames(
+    std::vector<string>* names) const {
+  for (auto& entry : outside_compilation_subgraphs_) {
+    names->push_back(entry.first);
+  }
+}
+
 Status Encapsulator::GetFunctionNameAttr(
     Node const* node, string* attr, string* outside_compilation_attr) const {
   Status s = GetNodeAttr(node->attrs(), group_attribute_, attr);
@@ -1220,8 +1329,7 @@ Status Encapsulator::SplitIntoSubgraphs() {
   // single input and output node for it.
   for (auto& entry : subgraphs_) {
     Subgraph& subgraph = entry.second;
-    TF_RETURN_IF_ERROR(subgraph.AddSendsToOutsideCompilation(node_images));
-    TF_RETURN_IF_ERROR(subgraph.AddRecvsFromOutsideCompilation(node_images));
+    TF_RETURN_IF_ERROR(subgraph.AddHostComputes(entry.first, node_images));
   }
 
   MarkGuaranteedConstants(*graph_in_, src_arg_pairs);
@@ -1509,8 +1617,346 @@ Status Encapsulator::AddEdgesToOutputGraph(
   return Status::OK();
 }
 
-Status Encapsulator::BuildOutputGraph(bool parallel_checking,
-                                      Graph* graph_out) {
+namespace {
+
+// Adds a dummy Const node to graph_out. The "constant" has the type of
+// data_type and the shape indicated in 'shape'. The dummy node is not a valid
+// Const node because it does not have any value defined, but this doesn't
+// matter because it will only be used subsequently for shape inference. (It
+// would be possible to add a switch statement over data_type to create a value
+// for the constant, but that would entail maintaining the logic as new types
+// are added, and is not necessary.)
+Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape,
+                         Graph* graph_out) {
+  TensorProto dummy_proto;
+  dummy_proto.set_dtype(data_type);
+  *dummy_proto.mutable_tensor_shape() = shape;
+  // Don't set any value field in the proto, since it is only going to be used
+  // for shape inference.
+
+  GraphDefBuilder::Options options(graph_out, /*status=*/nullptr);
+  NodeBuilder node_builder(options.GetNameForOp("KnownShape"), "Const",
+                           options.op_registry());
+  node_builder.Attr("dtype", data_type).Attr("value", dummy_proto);
+  return options.FinalizeBuilder(&node_builder);
+}
+
+// Adds a copy of node_in to graph_out and adds the mapping to
+// copied_node_images.
+Status CopyShapeInferenceNodeToGraph(
+    Node* node_in, const Node* send_node,
+    const std::unordered_map<Node*, Node*>& dummy_node_images,
+    FunctionLibraryDefinition* library,
+    std::unordered_map<Node*, Node*>* copied_node_images, Graph* graph_out) {
+  // Once all the ancestor nodes have been added to graph_out, add this node
+  // and connect it to its ancestors.
+  Node* node_out = graph_out->CopyNode(node_in);
+  (*copied_node_images)[node_in] = node_out;
+  // Don't bother to build the shape inference graph if there's a node with no
+  // shape inference function, since it would just result in an error later at
+  // compile time.
+  const OpRegistrationData* op_reg_data;
+  TF_RETURN_IF_ERROR(library->LookUp(node_in->type_string(), &op_reg_data));
+  if (op_reg_data->shape_inference_fn == nullptr) {
+    return errors::InvalidArgument(
+        "Shape inference is not possible for outside_compilation "
+        "SendFromHost node ",
+        send_node->name(), " because it depends on node ", node_in->name(),
+        " which does not have a shape inference function registered.");
+  }
+  // Add all the edges to the newly copied node.
+  for (const Edge* in_edge : node_in->in_edges()) {
+    if (!in_edge->IsControlEdge()) {
+      Node* src = in_edge->src();
+      const auto iter = dummy_node_images.find(src);
+      if (iter == dummy_node_images.end()) {
+        // The src is a copied node so use the original output port.
+        graph_out->AddEdge((*copied_node_images)[in_edge->src()],
+                           in_edge->src_output(), node_out,
+                           in_edge->dst_input());
+      } else {
+        // The src is a dummy node so use output port 0.
+        graph_out->AddEdge(iter->second, 0, node_out, in_edge->dst_input());
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
+    const Graph& graph_in, const ShapeRefiner& shape_refiner,
+    const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
+    FunctionLibraryDefinition* library,
+    std::vector<TensorShapeProto>* static_shape_out,
+    std::unique_ptr<GraphDef>* graphdef_out) {
+  // Maps from nodes in graph_in to nodes in graph_out.
+  //
+  // When an edge has fully defined shape the source node in graph_in is
+  // replaced in graph_out by a dummy constant node. The mapping from nodes
+  // in graph_in to dummy nodes is stored in dummy_node_images.
+  //
+  // When a node in graph_in has at least one ancestor that doesn't have fully
+  // defined shape, it is copied into graph_out. The mapping from nodes in
+  // graph_in to copied nodes is stored in copied_node_images.
+  //
+  // The two types of node are treated differently because, when adding edges to
+  // graph_out, an output from a dummy node always uses port 0, whereas an
+  // output from a copied node uses the same port that was used in graph_in.
+  std::unordered_map<Node*, Node*> dummy_node_images;
+  std::unordered_map<Node*, Node*> copied_node_images;
+
+  std::unique_ptr<Graph> graph_out(new Graph(graph_in.op_registry()));
+  graph_out->set_versions(graph_in.versions());
+  static_shape_out->resize(send_node->num_inputs());
+
+  // We don't use the standard ReverseDFS because we want to cut off traversal
+  // whenever we find an output with fully defined shape.
+  // TODO(misard) make this work properly in the presence of control flow.
+  struct Work {
+    Node* node;
+    bool leave;  // Are we entering or leaving node?
+  };
+  std::vector<Work> stack({{send_node, false}});
+  std::vector<bool> visited(graph_in.num_node_ids(), false);
+  while (!stack.empty()) {
+    Work w = stack.back();
+    stack.pop_back();
+    Node* n = w.node;
+
+    if (w.leave) {
+      TF_RETURN_IF_ERROR(CopyShapeInferenceNodeToGraph(
+          n, send_node, dummy_node_images, library, &copied_node_images,
+          graph_out.get()));
+    } else {
+      if (visited[n->id()]) continue;
+      visited[n->id()] = true;
+
+      // Arrange to revisit when all done with all inputs.
+      stack.push_back(Work{n, true});
+
+      bool has_parent_with_unknown_shape = false;
+      for (const Edge* in_edge : n->in_edges()) {
+        if (!in_edge->IsControlEdge()) {
+          Node* src_node = in_edge->src();
+          int src_port = in_edge->src_output();
+          shape_inference::InferenceContext* context =
+              shape_refiner.GetContext(src_node);
+          shape_inference::ShapeHandle shape = context->output(src_port);
+          if (context->FullyDefined(shape)) {
+            // This ancestor has known shape, so instead of adding it to the
+            // stack, add a dummy node with that shape to graph_out and
+            // continue.
+            TensorShapeProto proto;
+            context->ShapeHandleToProto(shape, &proto);
+            dummy_node_images[src_node] = AddDummyShapedNode(
+                src_node->output_type(src_port), proto, graph_out.get());
+            if (n == send_node) {
+              (*static_shape_out)[in_edge->dst_input()] = proto;
+            }
+          } else {
+            if (!visited[src_node->id()]) {
+              has_parent_with_unknown_shape = true;
+              stack.push_back({src_node, false});
+            }
+          }
+        }
+      }
+      if (!has_parent_with_unknown_shape) {
+        if (n == send_node) {
+          // The shapes of all the inputs to send_node are statically known. We
+          // won't have to do any inference at compile time so return now: the
+          // shapes were stored in static_shape_out above.
+          graphdef_out->reset();
+          return Status::OK();
+        } else {
+          // Any shape that is being processed is either the original send node
+          // or has at least one output with statically-unknown shape. If the
+          // latter and it doesn't have any inputs with statically-unknown
+          // shape, then check that it is of the recv nodes that we can fill in
+          // the shape of at run-time later. If it isn't one of those, then we
+          // won't have any additional knowledge at compile time, so we already
+          // know we won't be able to do shape inference and we can return an
+          // error now.
+          if (recv_at_host_nodes.find(n->name()) == recv_at_host_nodes.end()) {
+            return errors::InvalidArgument(
+                "Shape inference is not possible for outside_compilation "
+                "SendFromHost node ",
+                send_node->name(), " because shape of node ", n->name(),
+                " will not be known at compilation time.");
+          }
+        }
+      }
+    }
+  }
+
+  graphdef_out->reset(new GraphDef());
+  graph_out->ToGraphDef(graphdef_out->get());
+
+  return Status::OK();
+}
+
+Status Encapsulator::MakePrunedGraphCopyAndInline(
+    const Graph& graph, const std::vector<Node*>& sink_nodes,
+    std::unique_ptr<Graph>* pruned_graph,
+    std::unordered_map<const Node*, Node*>* node_images,
+    FunctionLibraryDefinition* library) {
+  // First copy all ancestor nodes of sink_nodes into a new graph.
+  pruned_graph->reset(new Graph(library));
+  (*pruned_graph)->set_versions(graph.versions());
+  ReverseDFSFrom(graph, sink_nodes,
+                 /*enter=*/nullptr,
+                 /*leave=*/[&](Node* n) {
+                   if (!n->IsSource()) {
+                     Node* copied = (*pruned_graph)->CopyNode(n);
+                     node_images->emplace(n, copied);
+                   }
+                 });
+
+  // Add all the edges between copied nodes.
+  for (auto entry : *node_images) {
+    const Node* orig = entry.first;
+    Node* image = entry.second;
+    for (const Edge* out_edge : orig->out_edges()) {
+      auto iter = node_images->find(out_edge->dst());
+      if (iter != node_images->end()) {
+        // The source and destination are both in the copied graph.
+        (*pruned_graph)
+            ->AddEdge(image, out_edge->src_output(), iter->second,
+                      out_edge->dst_input());
+      }
+    }
+  }
+
+  // Find all the function call nodes, and inline them.
+  std::vector<Node*> function_nodes;
+  for (auto node : (*pruned_graph)->nodes()) {
+    const OpRegistrationData* op_reg_data;
+    TF_RETURN_IF_ERROR(library->LookUp(node->type_string(), &op_reg_data));
+    if (op_reg_data->is_function_op) {
+      function_nodes.push_back(node);
+    }
+  }
+  for (auto node : function_nodes) {
+    VLOG(2) << "Inlining function " << node->name();
+    const FunctionDef* fdef = library->Find(node->type_string());
+    if (fdef == nullptr) {
+      return errors::Internal("Failed to find function ", node->type_string(),
+                              " in function library.");
+    }
+    FunctionBody* fbody = nullptr;
+    TF_RETURN_IF_ERROR(
+        FunctionDefToBodyHelper(*fdef, node->attrs(), library,
+                                [library](const string& op, const OpDef** sig) {
+                                  return library->LookUpOpDef(op, sig);
+                                },
+                                &fbody));
+    InlineFunctionBody(*library, pruned_graph->get(), node, fbody);
+    delete fbody;
+  }
+
+  return Status::OK();
+}
+
+Status Encapsulator::MakeGraphForOutsideCompilationSends(
+    const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
+    ShapeRefiner* shape_refiner,
+    std::unordered_map<const Node*, Node*>* node_images,
+    FunctionLibraryDefinition* library) {
+  // Find all the send_from_host nodes in all subgraphs, to use as roots for the
+  // pruning.
+  std::vector<Node*> send_from_host_nodes;
+  for (auto& subgraph_entry : subgraphs_) {
+    Subgraph& subgraph = subgraph_entry.second;
+    std::vector<string> outside_compilation_names;
+    subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names);
+    for (const auto& name : outside_compilation_names) {
+      Node* send_node = subgraph.GetSendFromHostNode(name);
+      if (send_node != nullptr) {
+        send_from_host_nodes.push_back(send_node);
+      }
+    }
+  }
+
+  // Make a copy of all the graph nodes needed to evaluate the send_from_host
+  // nodes, inlining any functions as needed.
+  TF_RETURN_IF_ERROR(MakePrunedGraphCopyAndInline(
+      graph, send_from_host_nodes, pruned_graph, node_images, library));
+
+  // Perform shape inference on the pruned graph.
+  shape_refiner->set_require_shape_inference_fns(false);
+  FixupSourceAndSinkEdges(pruned_graph->get());
+  std::vector<Node*> post_order;
+  GetReversePostOrder(*(*pruned_graph), &post_order);
+  for (auto node : post_order) {
+    // Ignore the status returned by the shape_refiner. At this point we want
+    // the best effort shapes, even if no shape function is registered for a
+    // node.
+    Status status = shape_refiner->AddNode(node);
+    if (!status.ok()) {
+      VLOG(1) << "Shape inference failed for node: " << status;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
+    Graph* graph_out, FunctionLibraryDefinition* library) {
+  std::unique_ptr<Graph> pruned_graph;
+  ShapeRefiner shape_refiner(graph_out->versions(), graph_out->op_registry());
+  std::unordered_map<const Node*, Node*> node_images;
+  TF_RETURN_IF_ERROR(MakeGraphForOutsideCompilationSends(
+      *graph_out, &pruned_graph, &shape_refiner, &node_images, library));
+
+  for (auto& subgraph_entry : subgraphs_) {
+    Subgraph& subgraph = subgraph_entry.second;
+    // Find all the recv_at_host nodes in this subgraph.
+    std::vector<string> outside_compilation_names;
+    subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names);
+    std::unordered_set<string> recv_at_host_names;
+    for (const auto& name : outside_compilation_names) {
+      Node* recv_node = subgraph.GetRecvAtHostNode(name);
+      if (recv_node != nullptr) {
+        recv_at_host_names.insert(recv_node->name());
+      }
+    }
+    // For each send_from_host node, do as much shape inference as possible
+    // without knowing the shape of the recv_at_host nodes, and store the
+    // result, along with enough information to complete the job at compile time
+    // once the recv_at_host shapes are known.
+    for (const auto& name : outside_compilation_names) {
+      Node* send_node = subgraph.GetSendFromHostNode(name);
+      std::vector<TensorShapeProto> static_shape;
+      std::unique_ptr<GraphDef> graphdef;
+      if (send_node != nullptr) {
+        TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend(
+            *pruned_graph, shape_refiner, recv_at_host_names,
+            node_images[send_node], library, &static_shape, &graphdef));
+        if (graphdef == nullptr) {
+          VLOG(2) << "Send node  " << send_node->name() << " shapes";
+          for (int i = 0; i < static_shape.size(); ++i) {
+            VLOG(2) << static_shape[i].DebugString();
+          }
+        } else {
+          VLOG(2) << "Send node " << send_node->name() << " graph\n"
+                  << graphdef->DebugString();
+        }
+      }
+      TF_RETURN_IF_ERROR(
+          subgraph.AddShapeInferenceInfo(name, static_shape, graphdef.get()));
+    }
+    if (!outside_compilation_names.empty()) {
+      TF_RETURN_IF_ERROR(subgraph.ReplaceFunctionDef(library));
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Encapsulator::BuildOutputGraph(bool parallel_checking, Graph* graph_out,
+                                      FunctionLibraryDefinition* library) {
   // Map from nodes in the input graph to nodes in the output graph.
   std::unordered_map<const Node*, Node*> node_images;
 
@@ -1522,6 +1968,9 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking,
   TF_RETURN_IF_ERROR(
       AddEdgesToOutputGraph(node_images, parallel_checking, graph_out));
 
+  TF_RETURN_IF_ERROR(
+      GetShapeInfoForOutsideCompilationSends(graph_out, library));
+
   return Status::OK();
 }
 
@@ -1545,7 +1994,7 @@ Status EncapsulateSubgraphsInFunctions(
   std::unique_ptr<Graph> out(new Graph(library));
   out->set_versions(graph_in.versions());
   TF_RETURN_IF_ERROR(
-      encapsulator.BuildOutputGraph(parallel_checking, out.get()));
+      encapsulator.BuildOutputGraph(parallel_checking, out.get(), library));
 
   *graph_out = std::move(out);
   return Status::OK();
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index b100861d5e..aed9cae0f1 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -29,17 +29,181 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+template <class Tkey, class Tvalue>
+bool EqualProtoMap(const ::tensorflow::protobuf::Map<Tkey, Tvalue>& a,
+                   const ::tensorflow::protobuf::Map<Tkey, Tvalue>& b,
+                   const std::function<string(const Tkey&)>& key_to_string,
+                   const std::function<string(const Tvalue&)>& value_to_string,
+                   const std::function<bool(const Tkey&, const Tvalue&,
+                                            const Tvalue&)>& compare,
+                   const string& map_name, string* diff) {
+  for (const auto& elt_a : a) {
+    const auto iter = b.find(elt_a.first);
+    if (iter == b.end()) {
+      if (diff) {
+        *diff = strings::StrCat(
+            map_name, " expected: contains element with key '",
+            key_to_string(elt_a.first), "' got: map has no such element");
+      }
+      return false;
+    }
+    if (!compare(elt_a.first, elt_a.second, iter->second)) {
+      if (diff) {
+        *diff = strings::StrCat(map_name, " expected: element with key '",
+                                key_to_string(elt_a.first), " has value '",
+                                value_to_string(elt_a.second), "' got: '",
+                                value_to_string(iter->second), "'");
+      }
+      return false;
+    }
+  }
+  for (const auto& elt_b : b) {
+    const auto iter = a.find(elt_b.first);
+    if (iter == a.end()) {
+      if (diff) {
+        *diff = strings::StrCat(map_name, " got: contains element with key '",
+                                key_to_string(elt_b.first),
+                                "' expected: map has no such element");
+      }
+      return false;
+    }
+  }
+  return true;
+}
+
+bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
+                          const string& diff_preamble, string* diff) {
+  if (a.op() != b.op()) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              ", expected op '", a.op(), "' got '", b.op());
+    }
+    return false;
+  }
+  if (a.device() != b.device()) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              ", expected device '", a.device(), "' got '",
+                              b.device());
+    }
+    return false;
+  }
+  if (a.input_size() != b.input_size()) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              ", expected ", a.input_size(), " inputs got ",
+                              b.input_size(), " expected:\n", a.DebugString(),
+                              "\ngot:\n", b.DebugString());
+    }
+    return false;
+  }
+  for (int i = 0; i < a.input_size(); ++i) {
+    if (a.input(i) != b.input(i)) {
+      if (diff) {
+        *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                                " input ", i, ", expected ", a.input(i),
+                                " got ", b.input(i), " expected:\n",
+                                a.DebugString(), "\ngot:\n", b.DebugString());
+      }
+      return false;
+    }
+  }
+  return EqualProtoMap<string, AttrValue>(
+      a.attr(), b.attr(), [](const string& s) { return s; },
+      [](const AttrValue& v) { return v.DebugString(); },
+      [](const string& key, const AttrValue& av, const AttrValue& bv) {
+        if (key == "shape_inference_graph") {
+          // Default serialization of GraphDef is unstable because maps don't
+          // serialize deterministically. Rather than go through the hoops to
+          // turn on deterministic serialization of this attr just for this
+          // test, add logic here to compare determinstically.
+          GraphDef ga;
+          if (!ga.ParseFromString(av.s())) {
+            return false;
+          }
+          GraphDef gb;
+          if (!gb.ParseFromString(bv.s())) {
+            return false;
+          }
+          return EqualGraphDef(ga, gb, nullptr);
+        } else {
+          return av.DebugString() == bv.DebugString();
+        }
+      },
+      strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()),
+      diff);
+}
+
 bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b,
                       string* diff) {
-  // TODO(phawkins) use a more sophisticated equality test.
-  if (a.DebugString() != b.DebugString()) {
+  if (a.signature().DebugString() != b.signature().DebugString()) {
     if (diff) {
-      *diff = strings::StrCat("Definition mismatch for function ",
+      *diff = strings::StrCat("Signature mismatch for function ",
                               a.signature().name(), ", expected:\n",
-                              a.DebugString(), "\ngot:\n", b.DebugString());
+                              a.signature().DebugString(), "\ngot:\n",
+                              b.signature().DebugString());
     }
     return false;
   }
+  if (!EqualProtoMap<string, AttrValue>(
+          a.attr(), b.attr(), [](const string& s) { return s; },
+          [](const AttrValue& v) { return v.DebugString(); },
+          [](const string& key, const AttrValue& av, const AttrValue& bv) {
+            return av.DebugString() == bv.DebugString();
+          },
+          strings::StrCat("attr mismatch for function ", a.signature().name()),
+          diff)) {
+    return false;
+  }
+  if (!EqualProtoMap<string, string>(
+          a.ret(), b.ret(), [](const string& s) { return s; },
+          [](const string& s) { return s; },
+          [](const string& key, const string& av, const string& bv) {
+            return av == bv;
+          },
+          strings::StrCat("ret mismatch for function ", a.signature().name()),
+          diff)) {
+    return false;
+  }
+  for (int i = 0; i < a.node_def_size(); ++i) {
+    bool found = false;
+    for (int j = 0; j < b.node_def_size(); ++j) {
+      if (a.node_def(i).name() == b.node_def(j).name()) {
+        if (!EqualFunctionNodeDef(
+                a.node_def(i), b.node_def(j),
+                strings::StrCat("Function ", a.signature().name()), diff)) {
+          return false;
+        }
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      if (diff) {
+        *diff = strings::StrCat("Function ", a.signature().name(),
+                                ", expected: has node '", a.node_def(i).name(),
+                                "' got: no node of that name");
+      }
+      return false;
+    }
+  }
+  for (int i = 0; i < b.node_def_size(); ++i) {
+    bool found = false;
+    for (int j = 0; j < a.node_def_size(); ++j) {
+      if (b.node_def(i).name() == a.node_def(j).name()) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      if (diff) {
+        *diff = strings::StrCat("Function ", a.signature().name(),
+                                ", got: has node '", b.node_def(i).name(),
+                                "' expected: no node of that name");
+      }
+      return false;
+    }
+  }
   return true;
 }
 
@@ -84,29 +248,64 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected,
 
 // TODO(misard): remove these fake registrations once there are real Ops to be
 // compiled.
-REGISTER_OP("_XlaSendToHost")
-    .Input("input: dtypes")
-    .Attr("dtypes: list(type) >= 0");
-
-REGISTER_OP("_XlaRecvFromHost")
-    .Output("output: dtypes")
-    .Attr("dtypes: list(type) >= 0");
+REGISTER_OP("_XlaHostCompute")
+    .Input("inputs: Tinputs")
+    .Output("outputs: Toutputs")
+    .Attr("Tinputs: list(type) >= 0")
+    .Attr("Toutputs: list(type) >= 0")
+    .Attr("key: string")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
 REGISTER_OP("_XlaSendFromHost")
-    .Input("input: dtypes")
-    .Attr("dtypes: list(type) >= 0");
+    .Input("input: Tinputs")
+    .Attr("Tinputs: list(type) >= 0")
+    .Attr("key: string")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
 REGISTER_OP("_XlaRecvAtHost")
-    .Output("output: dtypes")
-    .Attr("dtypes: list(type) >= 0");
-
-REGISTER_OP("InputTest").Output("o: float");
-
-REGISTER_OP("UnaryTest").Input("a: float").Output("o: float");
+    .Output("output: Toutputs")
+    .Attr("Toutputs: list(type) >= 0")
+    .Attr("key: string")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
+
+REGISTER_OP("InputTest")
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    });
+
+REGISTER_OP("InputTestShaped")
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Vector(2));
+      return Status::OK();
+    });
+
+REGISTER_OP("UnaryTest")
+    .Input("a: float")
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      ::tensorflow::shape_inference::ShapeHandle o;
+      TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o));
+      c->set_output(0, o);
+      return Status::OK();
+    });
 REGISTER_OP("BinaryTest")
     .Input("a: float")
     .Input("b: float")
-    .Output("o: float");
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      ::tensorflow::shape_inference::ShapeHandle o;
+      TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o));
+      c->set_output(0, o);
+      return Status::OK();
+    });
+REGISTER_OP("BinaryTest2")
+    .Input("a: float")
+    .Input("b: float")
+    .Output("o: float")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
 REGISTER_OP("AddNLikeTest")
     .Input("inputs: N * T")
@@ -124,22 +323,48 @@ Node* Input(const GraphDefBuilder::Options& opts) {
   return ops::SourceOp("InputTest", opts);
 }
 
-Node* RecvAtHost(const gtl::ArraySlice<DataType>& dtypes,
+Node* InputShaped(const GraphDefBuilder::Options& opts) {
+  return ops::SourceOp("InputTestShaped", opts);
+}
+
+Node* KnownShape(const gtl::ArraySlice<int>& shape,
+                 const GraphDefBuilder::Options& opts) {
+  if (opts.HaveError()) return nullptr;
+  NodeBuilder node_builder(opts.GetNameForOp("Const"), "Const",
+                           opts.op_registry());
+  TensorProto value;
+  value.set_dtype(DT_FLOAT);
+  for (int dim : shape) {
+    value.mutable_tensor_shape()->add_dim()->set_size(dim);
+  }
+  return opts.WithAttr("value", value)
+      .WithAttr("dtype", DT_FLOAT)
+      .FinalizeBuilder(&node_builder);
+}
+
+Node* RecvAtHost(const string& key, const gtl::ArraySlice<DataType>& dtypes,
                  const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp("_XlaRecvAtHost"),
                            "_XlaRecvAtHost", opts.op_registry());
-  return opts.WithAttr("dtypes", dtypes).FinalizeBuilder(&node_builder);
+  return opts.WithAttr("Toutputs", dtypes)
+      .WithAttr("key", key)
+      .FinalizeBuilder(&node_builder);
 }
 
-Node* SendFromHost(const std::vector<ops::NodeOut>& inputs,
-                   const gtl::ArraySlice<DataType>& dtypes,
+Node* SendFromHost(const string& key, const std::vector<ops::NodeOut>& inputs,
                    const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp("_XlaSendFromHost"),
                            "_XlaSendFromHost", opts.op_registry());
   node_builder.Input(inputs);
-  return opts.WithAttr("dtypes", dtypes).FinalizeBuilder(&node_builder);
+  std::vector<DataType> dtypes;
+  for (const auto& node : inputs) {
+    dtypes.push_back(node.dt);
+  }
+  return opts.WithAttr("key", key)
+      .WithAttr("Tinputs", dtypes)
+      .FinalizeBuilder(&node_builder);
 }
 
 Node* Unary(ops::NodeOut a, const GraphDefBuilder::Options& opts) {
@@ -151,6 +376,11 @@ Node* Binary(ops::NodeOut a, ops::NodeOut b,
   return ops::BinaryOp("BinaryTest", std::move(a), std::move(b), opts);
 }
 
+Node* BinaryUnknownShape(ops::NodeOut a, ops::NodeOut b,
+                         const GraphDefBuilder::Options& opts) {
+  return ops::BinaryOp("BinaryTest2", std::move(a), std::move(b), opts);
+}
+
 Node* AddNLike(const std::vector<ops::NodeOut>& inputs,
                const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
@@ -576,6 +806,21 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  string shape_string_expected;
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape_graph;
+    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
+    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+  }
+
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
@@ -584,19 +829,18 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
           {{"c"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}, {"C"}},
           {{"F"},
            "BinaryTest",
-           {"C:o:0", "outside_compilation_O1_recv:output:0"},
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
            {},
-           {"outside_compilation_O1_recv"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {"C:o:0", "c:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
            {"c"}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O1_send"}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -612,11 +856,11 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
     Node* call = b2.opts().FinalizeBuilder(&node_builder);
 
     Node* recv =
-        RecvAtHost({DT_FLOAT, DT_FLOAT},
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
                      b2.opts().WithName("E").WithControlInputs({recv, b}));
-    Node* send = SendFromHost({e}, {DT_FLOAT},
+    Node* send = SendFromHost("host_compute_channel_F1_O1", {e},
                               b2.opts()
                                   .WithName("outside_compilation_F1_O1_send")
                                   .WithControlInput(e));
@@ -674,37 +918,71 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  string shape_string_expected_1;
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape1.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape1.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape1.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape1_graph;
+    TF_EXPECT_OK(shape1.ToGraphDef(&shape1_graph));
+    EXPECT_TRUE(shape1_graph.SerializeToString(&shape_string_expected_1));
+  }
+
+  string shape_string_expected_2;
+  {
+    GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
+                     shape2.opts().WithName("E"));
+    Node* recv2 =
+        RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT},
+                   shape2.opts().WithName("outside_compilation_F1_O2_recv"));
+    Node* h = Binary(ops::NodeOut(recv2, 0), e, shape2.opts().WithName("H"));
+    SendFromHost("host_compute_channel_F1_O2", {h},
+                 shape2.opts().WithName("outside_compilation_F1_O2_send"));
+    GraphDef shape2_graph;
+    TF_EXPECT_OK(shape2.ToGraphDef(&shape2_graph));
+    EXPECT_TRUE(shape2_graph.SerializeToString(&shape_string_expected_2));
+  }
+
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}},
-          {{"I"}, "UnaryTest", {"outside_compilation_O2_recv:output:0"}},
+          {{"I"},
+           "UnaryTest",
+           {"outside_compilation_O2_host_compute:outputs:0"}},
           {{"F"},
            "BinaryTest",
-           {"C:o:0", "outside_compilation_O1_recv:output:0"},
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
            {},
-           {"outside_compilation_O1_recv"}},
-          {{"outside_compilation_O2_send"},
-           "_XlaSendToHost",
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O2_host_compute"},
+           "_XlaHostCompute",
            {"D:o:0", "F:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O2"},
+            {"shape_inference_graph", shape_string_expected_2},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
            {"F"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {"C:o:0", "D:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected_1},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
            {"D"}},
-          {{"outside_compilation_O2_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O2_send"}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O1_send"}},
       },
       {{"i_0_retval", "I:o:0"}});
 
@@ -720,23 +998,24 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
     Node* call = b2.opts().FinalizeBuilder(&node_builder);
 
     Node* recv1 =
-        RecvAtHost({DT_FLOAT, DT_FLOAT},
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost({e}, {DT_FLOAT},
+    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e},
                                b2.opts()
                                    .WithName("outside_compilation_F1_O1_send")
                                    .WithControlInput(e));
 
     Node* recv2 =
-        RecvAtHost({DT_FLOAT, DT_FLOAT},
+        RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O2_recv"));
     Node* g = Binary(e, ops::NodeOut(recv2, 1),
                      b2.opts().WithName("G").WithControlInputs({recv2, e}));
     Node* h = Binary(ops::NodeOut(recv2, 0), e, b2.opts().WithName("H"));
-    Node* send2 = SendFromHost(
-        {h}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O2_send"));
+    Node* send2 =
+        SendFromHost("host_compute_channel_F1_O2", {h},
+                     b2.opts().WithName("outside_compilation_F1_O2_send"));
 
     Node* s = NoOp(b2.opts()
                        .WithName("F1_sequencer")
@@ -758,8 +1037,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
 
   {
     GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
-    Node* a = Input(b1.opts().WithName("A"));
-    Node* b = Input(b1.opts().WithName("B"));
+    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* b = InputShaped(b1.opts().WithName("B"));
     Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
     Node* d =
         Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
@@ -791,6 +1070,24 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  string shape_string_expected;
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape_graph;
+    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
+    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+  }
+
+  TensorShapeProto shape_proto_expected;
+  shape_proto_expected.add_dim()->set_size(2);
+
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"},
       {"f_0_retval:float", "d_0_retval:float"}, {},
@@ -799,19 +1096,18 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"},
            "BinaryTest",
-           {"C:o:0", "outside_compilation_O1_recv:output:0"},
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
            {},
-           {"outside_compilation_O1_recv"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {"C:o:0", "D:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
            {"D"}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O1_send"}},
       },
       {{"d_0_retval", "D:o:0"}, {"f_0_retval", "F:o:0"}});
 
@@ -822,16 +1118,16 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
           {{"G"}, "BinaryTest", {"e_0_arg", "f_0_arg"}},
           {{"I"},
            "BinaryTest",
-           {"f_0_arg", "outside_compilation_O1_recv:output:0"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+           {"f_0_arg", "outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {"G:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O1_send"}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F2_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
       },
       {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}});
 
@@ -839,15 +1135,15 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
         new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
     GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
-    Node* a = Input(b2.opts().WithName("A"));
-    Node* b = Input(b2.opts().WithName("B"));
+    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* b = InputShaped(b2.opts().WithName("B"));
 
     Node* recv1 =
-        RecvAtHost({DT_FLOAT, DT_FLOAT},
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
                    b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost({e}, {DT_FLOAT},
+    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e},
                                b2.opts()
                                    .WithName("outside_compilation_F1_O1_send")
                                    .WithControlInput(e));
@@ -857,12 +1153,14 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
     Node* s1 = NoOp(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
 
-    Node* recv2 = RecvAtHost(
-        {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_recv"));
+    Node* recv2 =
+        RecvAtHost("host_compute_channel_F2_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F2_O1_recv"));
     Node* h = Binary(ops::NodeOut(call1, 1), recv2,
                      b2.opts().WithName("H").WithControlInput(s1));
-    Node* send2 = SendFromHost(
-        {h}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_send"));
+    Node* send2 =
+        SendFromHost("host_compute_channel_F2_O1", {h},
+                     b2.opts().WithName("outside_compilation_F2_O1_send"));
 
     NodeBuilder node_builder2("F2", "F2", lib_def.get());
     node_builder2.Input(e).Input(call1);
@@ -888,7 +1186,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
 
   {
     GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
-    Node* a = Input(b1.opts().WithName("A"));
+    Node* a = InputShaped(b1.opts().WithName("A"));
     Node* b = Input(b1.opts().WithName("B"));
     Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
     Node* d =
@@ -908,6 +1206,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  TensorShapeProto shape_proto_expected;
+  shape_proto_expected.add_dim()->set_size(2);
+
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
       {
@@ -915,11 +1216,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"},
            "BinaryTest",
-           {"D:o:0", "outside_compilation_O1_recv:output:0"}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
+           {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -927,12 +1233,13 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
         new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
     GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
-    Node* a = Input(b2.opts().WithName("A"));
+    Node* a = InputShaped(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
     Node* e = Unary(a, b2.opts().WithName("E"));
-    Node* send1 = SendFromHost(
-        {e}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_send"));
+    Node* send1 =
+        SendFromHost("host_compute_channel_F1_O1", {e},
+                     b2.opts().WithName("outside_compilation_F1_O1_send"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
@@ -954,7 +1261,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
 
   {
     GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
-    Node* a = Input(b1.opts().WithName("A"));
+    Node* a = InputShaped(b1.opts().WithName("A"));
     Node* b = Input(b1.opts().WithName("B"));
     Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
     Node* d =
@@ -975,6 +1282,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  TensorShapeProto shape_proto_expected;
+  shape_proto_expected.add_dim()->set_size(2);
+
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
       {
@@ -982,17 +1292,17 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"},
            "BinaryTest",
-           {"D:o:0", "outside_compilation_O1_recv:output:0"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+           {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {},
-           {{"dtypes", gtl::ArraySlice<DataType>({})}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}},
            {"D"}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
-           {},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}},
-           {"outside_compilation_O1_send"}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1000,14 +1310,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
         new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
     GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
-    Node* a = Input(b2.opts().WithName("A"));
+    Node* a = InputShaped(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
     Node* recv1 =
-        RecvAtHost({}, b2.opts().WithName("outside_compilation_F1_O1_recv"));
+        RecvAtHost("host_compute_channel_F1_O1", {},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Unary(a, b2.opts().WithName("E").WithControlInput(recv1));
-    Node* send1 = SendFromHost(
-        {e}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_send"));
+    Node* send1 =
+        SendFromHost("host_compute_channel_F1_O1", {e},
+                     b2.opts().WithName("outside_compilation_F1_O1_send"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
@@ -1055,10 +1367,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"}, "UnaryTest", {"D:o:0"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
            {"D:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1069,8 +1385,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* recv1 = RecvAtHost(
-        {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Unary(recv1, b2.opts().WithName("E"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
@@ -1118,16 +1435,19 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
-          {{"F"}, "UnaryTest", {"D:o:0"}, {}, {"outside_compilation_O1_recv"}},
-          {{"outside_compilation_O1_send"},
-           "_XlaSendToHost",
+          {{"F"},
+           "UnaryTest",
            {"D:o:0"},
-           {{"dtypes", gtl::ArraySlice<DataType>({DT_FLOAT})}}},
-          {{"outside_compilation_O1_recv"},
-           "_XlaRecvFromHost",
            {},
-           {{"dtypes", gtl::ArraySlice<DataType>({})}},
-           {"outside_compilation_O1_send"}},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {"D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1138,10 +1458,11 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* recv1 = RecvAtHost(
-        {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
     Node* e = Unary(recv1, b2.opts().WithName("E"));
-    Node* send1 = SendFromHost({}, {},
+    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {},
                                b2.opts()
                                    .WithName("outside_compilation_F1_O1_send")
                                    .WithControlInput(e));
@@ -1215,5 +1536,110 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
   TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
 }
 
+// Test for shape inference of outside compilation.
+TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    *library.add_function() = test::function::XTimesTwo();
+
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    // Give nodes 'c' and 'd' names that collide after lowercasing.
+    Node* c = Unary(a, b1.opts().WithName("C"));
+    Node* d = Unary(b, b1.opts().WithName("c").WithControlInput(c).WithAttr(
+                           "_encapsulate", "F1"));
+    Node* e = BinaryUnknownShape(c, d,
+                                 b1.opts()
+                                     .WithName("E")
+                                     .WithControlInputs({b, d})
+                                     .WithAttr("_encapsulate", "F1")
+                                     .WithAttr("_outside", "O1"));
+    Node* f = Binary(c, e,
+                     b1.opts().WithName("F").WithControlInput(e).WithAttr(
+                         "_encapsulate", "F1"));
+    Binary(a, f, b1.opts().WithName("G").WithControlInput(e));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  string shape_string_expected;
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* known = KnownShape({2}, shape.opts().WithName("KnownShape/_0"));
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = BinaryUnknownShape(known, recv, shape.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape_graph;
+    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
+    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+  }
+
+  *library_expected.add_function() = test::function::XTimesTwo();
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"b_0_arg:float", "c_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"c"}, "UnaryTest", {"b_0_arg"}, {}, {}},
+          {{"F"},
+           "BinaryTest",
+           {"c_0_arg", "outside_compilation_O1_host_compute:outputs:0"},
+           {},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {"c:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
+           {"c"}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+    Node* c = Unary(a, b2.opts().WithName("C"));
+
+    NodeBuilder node_builder("F1", "F1", lib_def.get());
+    node_builder.Input(b).Input(c);
+    Node* call =
+        b2.opts().WithControlInputs({c}).FinalizeBuilder(&node_builder);
+
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = BinaryUnknownShape(
+        c, ops::NodeOut(recv, 0),
+        b2.opts().WithName("E").WithControlInputs({recv, b}));
+    Node* send = SendFromHost("host_compute_channel_F1_O1", {e},
+                              b2.opts()
+                                  .WithName("outside_compilation_F1_O1_send")
+                                  .WithControlInput(e));
+
+    Node* s = NoOp(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}));
+
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({s, e}));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index d6b576166c..eae8e6c3c1 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -1064,26 +1064,36 @@ Status FunctionLibraryDefinition::AddLibrary(
   return Status::OK();
 }
 
-void FunctionLibraryDefinition::RemoveFunction(const string& func) {
+Status FunctionLibraryDefinition::RemoveFunction(const string& func) {
   const auto& i = function_defs_.find(func);
-  DCHECK(i != function_defs_.end());
+  if (i == function_defs_.end()) {
+    return errors::InvalidArgument("Tried to remove non-existent function ",
+                                   func);
+  }
   function_defs_.erase(i);
+  return Status::OK();
 }
 
-void FunctionLibraryDefinition::RemoveGradient(const string& func) {
+Status FunctionLibraryDefinition::RemoveGradient(const string& func) {
   const auto& i = func_grad_.find(func);
-  DCHECK(i != func_grad_.end());
+  if (i == func_grad_.end()) {
+    return errors::InvalidArgument("Tried to remove non-existent gradient ",
+                                   func);
+  }
   func_grad_.erase(i);
+  return Status::OK();
 }
 
 void FunctionLibraryDefinition::Remove(
     const std::vector<string>& funcs,
     const std::vector<string>& funcs_with_grads) {
   for (const string& f : funcs) {
-    RemoveFunction(f);
+    Status s = RemoveFunction(f);
+    DCHECK(s.ok());
   }
   for (const string& f : funcs_with_grads) {
-    RemoveGradient(f);
+    Status s = RemoveGradient(f);
+    DCHECK(s.ok());
   }
 }
 
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index b933ee0b0e..7d0e15641d 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -312,6 +312,14 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // This operation is atomic.
   Status AddGradientDef(const GradientDef& grad);
 
+  // Remove function `func` from the library. Returns non-OK Status unless
+  // `func` is in the library.
+  Status RemoveFunction(const string& func);
+
+  // Remove gradient of function `func` from the library. Returns non-OK Status
+  // unless `func` has a gradient.
+  Status RemoveGradient(const string& func);
+
   // Adds the functions and gradients in 'other' to this function library.
   // Duplicate functions and gradients are ignored.
   // This operation is atomic.
@@ -384,13 +392,6 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // attr from.
   const FunctionDef* GetAttrImpl(const NodeDef& ndef) const;
 
-  // Remove function `func` from the library. `func` must be in the library.
-  void RemoveFunction(const string& func);
-
-  // Remove gradient of function `func` from the library. `func` must have
-  // a gradient.
-  void RemoveGradient(const string& func);
-
   // Remove all functions in `funcs` and all gradients of
   // functions in `funcs_with_grads` from this library.
   void Remove(const std::vector<string>& funcs,
-- 
GitLab


From 51792c887abd425693a3c36f16ea221b949f7277 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Fri, 2 Feb 2018 11:24:11 -0800
Subject: [PATCH 587/966] Register resource_scatter_update for string types.

PiperOrigin-RevId: 184309674
---
 tensorflow/core/kernels/resource_variable_ops.cc     |  3 +++
 tensorflow/core/ops/resource_variable_ops.cc         |  2 +-
 .../kernel_tests/resource_variable_ops_test.py       | 12 ++++++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 9cc8e03e3a..6ce53e725f 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -635,6 +635,9 @@ class ResourceScatterUpdateOp : public OpKernel {
 
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHEMTIC_CPU);
 
+REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate",
+                        scatter_op::UpdateOp::ASSIGN);
+
 // Registers GPU kernels.
 #if GOOGLE_CUDA
 #define REGISTER_SCATTER_ARITHEMTIC_GPU(type) \
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index f6cfbf873a..8dae7e1ff5 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -193,7 +193,7 @@ REGISTER_OP("ResourceScatterUpdate")
     .Input("resource: resource")
     .Input("indices: Tindices")
     .Input("updates: dtype")
-    .Attr("dtype: numbertype")
+    .Attr("dtype: type")
     .Attr("Tindices: {int32, int64}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeAndType handle_shape_and_type;
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index b4b555591d..cd94579688 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat
 
 
 @test_util.with_c_api
@@ -170,6 +171,17 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
     self.assertEqual(self.evaluate(read), [[3]])
 
+  def testScatterUpdateString(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.string, shape=[1, 1])
+    self.evaluate(resource_variable_ops.assign_variable_op(
+        handle, constant_op.constant([["a"]], dtype=dtypes.string)))
+    self.evaluate(resource_variable_ops.resource_scatter_update(
+        handle, [0], constant_op.constant([["b"]], dtype=dtypes.string)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.string)
+    self.assertEqual(compat.as_bytes(self.evaluate(read)[0][0]),
+                     compat.as_bytes("b"))
+
   # TODO(alive): get this to work in Eager mode.
   def testGPU(self):
     with self.test_session(use_gpu=True):
-- 
GitLab


From 224874002f93fec471e401488e23d97d4f36c4fc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 11:24:17 -0800
Subject: [PATCH 588/966] Allow ResizeBilinear to resize the output tensor in
 Prepare(), if the size tensor is const.

PiperOrigin-RevId: 184309687
---
 .../contrib/lite/kernels/resize_bilinear.cc   | 33 +++++---
 .../lite/kernels/resize_bilinear_test.cc      | 81 ++++++++++++++++---
 2 files changed, 92 insertions(+), 22 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
index 9a419af023..4a2101f246 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
@@ -36,6 +36,17 @@ constexpr int kInputTensor = 0;
 constexpr int kSizeTensor = 1;
 constexpr int kOutputTensor = 0;
 
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context, TfLiteTensor* input,
+                                TfLiteTensor* size, TfLiteTensor* output) {
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+  output_size->data[0] = input->dims->data[0];
+  const int32* size_data = GetTensorData<int32>(size);
+  output_size->data[1] = size_data[0];
+  output_size->data[2] = size_data[1];
+  output_size->data[3] = input->dims->data[3];
+  return context->ResizeTensor(context, output, output_size);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -55,9 +66,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // integers.
   output->type = kTfLiteFloat32;
 
-  // TODO(ahentz): if the input is constant, we can allocate here.
-  output->allocation_type = kTfLiteDynamic;
-  return kTfLiteOk;
+  if (!IsConstantTensor(size)) {
+    SetTensorToDynamic(output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, input, size, output);
 }
 
 template <KernelType kernel_type>
@@ -66,15 +79,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   TfLiteTensor* size = GetInput(context, node, kSizeTensor);
 
-  // TODO(ahentz): we only need to do this here if it wasn't done in Eval().
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
-  output_size->data[0] = input->dims->data[0];
-  const int32* size_data = GetTensorData<int32>(size);
-  output_size->data[1] = size_data[0];
-  output_size->data[2] = size_data[1];
-  output_size->data[3] = input->dims->data[3];
-  context->ResizeTensor(context, output, output_size);
-  TfLiteTensorRealloc(output->bytes, output);
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeOutputTensor(context, input, size, output));
+    TfLiteTensorRealloc(output->bytes, output);
+  }
 
   if (output->type == kTfLiteFloat32) {
 #define TF_LITE_RESIZE_BILINEAR(type)                                     \
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
index 2b1aaf654f..4e03f3820a 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
@@ -25,14 +25,24 @@ using ::testing::ElementsAreArray;
 
 class ResizeBilinearOpModel : public SingleOpModel {
  public:
-  ResizeBilinearOpModel(std::initializer_list<int> input_shape) {
-    input_ = AddInput(TensorType_FLOAT32);
-    size_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+  ResizeBilinearOpModel(const TensorData& input,
+                        std::initializer_list<int> size_data = {}) {
+    bool const_size = size_data.size() != 0;
+    input_ = AddInput(input);
+    if (const_size) {
+      size_ = AddConstInput(TensorType_INT32, size_data, {2});
+    } else {
+      size_ = AddInput({TensorType_INT32, {2}});
+    }
+    output_ = AddOutput(TensorType_FLOAT32);  // Always float.
     SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR,
                  BuiltinOptions_ResizeBilinearOptions,
                  CreateResizeBilinearOptions(builder_).Union());
-    BuildInterpreter({input_shape, {2}});
+    if (const_size) {
+      BuildInterpreter({GetShape(input_)});
+    } else {
+      BuildInterpreter({GetShape(input_), GetShape(size_)});
+    }
   }
 
   void SetInput(std::initializer_list<float> data) {
@@ -49,23 +59,33 @@ class ResizeBilinearOpModel : public SingleOpModel {
 };
 
 TEST(ResizeBilinearOpTest, HorizontalResize) {
-  ResizeBilinearOpModel m({1, 1, 2, 1});
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}});
   m.SetInput({3, 6});
   m.SetSize({1, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput({3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6})));
 }
 
 TEST(ResizeBilinearOpTest, VerticalResize) {
-  ResizeBilinearOpModel m({1, 2, 1, 1});
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
   m.SetInput({3, 9});
   m.SetSize({3, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput({3, 9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9})));
 }
 
 TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
-  ResizeBilinearOpModel m({1, 2, 2, 1});
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
   m.SetInput({
       3, 6,  //
       9, 12  //
@@ -77,10 +97,22 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
                                  7, 9, 10,   //
                                  9, 11, 12,  //
                              })));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput({
+      3, 6,  //
+      9, 12  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                       3, 5, 6,    //
+                                       7, 9, 10,   //
+                                       9, 11, 12,  //
+                                   })));
 }
 
 TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
-  ResizeBilinearOpModel m({2, 2, 2, 1});
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
   m.SetInput({
       3, 6,   //
       9, 12,  //
@@ -97,10 +129,27 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
                                  8, 12, 14,   //
                                  10, 14, 16,  //
                              })));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {2, 2, 2, 1}}, {3, 3});
+  const_m.SetInput({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      10, 16  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                       3, 5, 6,     //
+                                       7, 9, 10,    //
+                                       9, 11, 12,   //
+                                       4, 8, 10,    //
+                                       8, 12, 14,   //
+                                       10, 14, 16,  //
+                                   })));
 }
 
 TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
-  ResizeBilinearOpModel m({1, 2, 2, 2});
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}});
   m.SetInput({
       3, 4, 6, 10,    //
       9, 10, 12, 16,  //
@@ -112,6 +161,18 @@ TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
                                  7, 8, 9, 12, 10, 14,    //
                                  9, 10, 11, 14, 12, 16,  //
                              })));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 2}}, {3, 3});
+  const_m.SetInput({
+      3, 4, 6, 10,    //
+      9, 10, 12, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                       3, 4, 5, 8, 6, 10,      //
+                                       7, 8, 9, 12, 10, 14,    //
+                                       9, 10, 11, 14, 12, 16,  //
+                                   })));
 }
 
 }  // namespace
-- 
GitLab


From 22116459b258d5753aa76410ab6f4d3cbc928a5a Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 2 Feb 2018 11:31:06 -0800
Subject: [PATCH 589/966] [TF:XLA] Improve/refactor the handling of resource
 types/shapes.

Previously we used an xla::Shape to track the shape of a resource (Variable, TensorArray, Stack) shape. The xla::Shape described how the resource was represented to XLA, e.g., as a (buffer, size) pair for a Stack resource.

Instead, separate the TensorFlow abstract shape representation from the XLA shape representation and track it separately. This leads to simpler and more readable code.

PiperOrigin-RevId: 184310694
---
 .../compiler/jit/kernels/xla_launch_op.cc     |   4 +-
 .../compiler/jit/xla_compilation_cache.cc     |  11 +-
 tensorflow/compiler/tf2xla/graph_compiler.cc  |   4 +-
 .../compiler/tf2xla/kernels/stack_ops.cc      |  20 +--
 .../tf2xla/kernels/strided_slice_op.cc        |  11 +-
 .../tf2xla/kernels/tensor_array_ops.cc        |  33 ++---
 .../compiler/tf2xla/kernels/training_ops.cc   | 113 +++++---------
 .../compiler/tf2xla/kernels/variable_ops.cc   |  51 ++++---
 .../compiler/tf2xla/kernels/while_op.cc       |  33 ++---
 tensorflow/compiler/tf2xla/tf2xla.cc          |   4 +-
 tensorflow/compiler/tf2xla/xla_compiler.cc    | 133 ++++++++++++-----
 tensorflow/compiler/tf2xla/xla_compiler.h     |  26 ++--
 .../compiler/tf2xla/xla_compiler_test.cc      |  26 ++--
 tensorflow/compiler/tf2xla/xla_context.cc     |  12 +-
 tensorflow/compiler/tf2xla/xla_context.h      |  10 +-
 tensorflow/compiler/tf2xla/xla_op_kernel.cc   |  30 +++-
 tensorflow/compiler/tf2xla/xla_op_kernel.h    |  11 +-
 tensorflow/compiler/tf2xla/xla_resource.cc    | 139 +++++++++++-------
 tensorflow/compiler/tf2xla/xla_resource.h     |  38 +++--
 19 files changed, 385 insertions(+), 324 deletions(-)

diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 17ae2bb25c..6353149e4a 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -376,8 +376,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
     OP_REQUIRES(ctx,
                 write.input_index >= 0 && write.input_index < ctx->num_inputs(),
                 errors::Internal("Invalid input index for variable write."));
-    TensorShape write_shape;
-    OP_REQUIRES_OK(ctx, XLAShapeToTensorShape(write.shape, &write_shape));
 
     gpu::DeviceMemoryBase buffer = output->buffer({output_num});
 
@@ -399,7 +397,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
     // Looks up the owning Tensor by buffer address.
     OP_REQUIRES_OK(
-        ctx, xla_allocator.MakeTensorFromBuffer(buffer, write.type, write_shape,
+        ctx, xla_allocator.MakeTensorFromBuffer(buffer, write.type, write.shape,
                                                 variable->tensor()));
     ++output_num;
   }
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 21d3a54f1b..6d854a920e 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -148,8 +148,7 @@ Status BuildArguments(int num_constant_args,
     XlaCompiler::Argument& arg = (*args)[input_num];
     arg.kind = XlaCompiler::Argument::kConstant;
     arg.type = input.dtype();
-    TF_RETURN_IF_ERROR(
-        TensorShapeToXLAShape(input.dtype(), input.shape(), &arg.shape));
+    arg.shape = input.shape();
     arg.constant_value = input;
     ++input_num;
   }
@@ -170,8 +169,7 @@ Status BuildArguments(int num_constant_args,
       arg.constant_value = input;
     }
     arg.type = input.dtype();
-    TF_RETURN_IF_ERROR(
-        TensorShapeToXLAShape(input.dtype(), input.shape(), &arg.shape));
+    arg.shape = input.shape();
     ++input_num;
   }
 
@@ -189,8 +187,7 @@ Status BuildArguments(int num_constant_args,
     if (variable_args[variable_id].present) {
       const Tensor& value = variable_args[variable_id].value;
       arg.type = value.dtype();
-      TF_RETURN_IF_ERROR(
-          TensorShapeToXLAShape(value.dtype(), value.shape(), &arg.shape));
+      arg.shape = value.shape();
       arg.initialized = true;
     } else {
       // The values of uninitialized variables are not passed as inputs, since
@@ -199,7 +196,7 @@ Status BuildArguments(int num_constant_args,
       // uninitialized variables.
       arg.initialized = false;
       arg.type = DT_INVALID;
-      arg.shape = xla::Shape();
+      arg.shape = TensorShape();
     }
     ++input_num;
   }
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index 02215b5112..1418d95956 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -60,9 +60,7 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
   for (int i = 0; i < args->size(); ++i) {
     XlaCompiler::Argument& arg = (*args)[i];
     arg.type = ctx->input_type(i);
-
-    TF_RETURN_IF_ERROR(
-        TensorShapeToXLAShape(arg.type, ctx->InputShape(i), &arg.shape));
+    arg.shape = ctx->InputShape(i);
 
     if (arg.type == DT_RESOURCE) {
       return errors::InvalidArgument(
diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
index d77fb768ef..1a78c7ab9b 100644
--- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
@@ -77,10 +77,8 @@ Status MaybeInitializeStack(xla::ComputationBuilder* builder,
     // Stack has not been initialized.
     xla::ComputationDataHandle zero =
         XlaHelpers::Zero(builder, resource->type());
-    TF_RETURN_IF_ERROR(resource->SetValue(
-        dtype,
-        builder->Tuple({builder->Broadcast(zero, stack_shape.dim_sizes()),
-                        builder->ConstantR0<int32>(0)})));
+    TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape));
+    TF_RETURN_IF_ERROR(resource->SetZeroValue(builder));
   } else {
     // Checks the expected shape matches the actual shape.
     TensorShape actual_shape;
@@ -119,8 +117,8 @@ class StackOp : public XlaOpKernel {
     string name = strings::StrCat("Stack: ", stack_name_);
     OP_REQUIRES_OK(
         ctx, xc.CreateResource(XlaResource::kStack, -1, std::move(name), dtype_,
-                               value, &resource));
-    resource->set_tensor_array_size(size);
+                               TensorShape(), value, /*tensor_array_size=*/size,
+                               /*tensor_array_gradients=*/{}, &resource));
     ctx->SetResourceOutput(0, resource);
   }
 
@@ -164,11 +162,9 @@ class StackPushOp : public XlaOpKernel {
 
     // TODO(phawkins): We don't check the index is in bounds --- there is no
     // error mechanism in XLA.
-    OP_REQUIRES_OK(
-        ctx,
-        resource->SetValue(
-            dtype_, b->Tuple({b->DynamicUpdateSlice(ta, update, start_indices),
-                              b->Add(index, b->ConstantR0<int32>(1))})));
+    OP_REQUIRES_OK(ctx, resource->SetValue(b->Tuple(
+                            {b->DynamicUpdateSlice(ta, update, start_indices),
+                             b->Add(index, b->ConstantR0<int32>(1))})));
 
     ctx->SetOutput(0, value);
   }
@@ -208,7 +204,7 @@ class StackPopOp : public XlaOpKernel {
     xla::ComputationDataHandle index = b->GetTupleElement(state, 1);
 
     index = b->Sub(index, b->ConstantR0<int32>(1));
-    OP_REQUIRES_OK(ctx, resource->SetValue(dtype_, b->Tuple({ta, index})));
+    OP_REQUIRES_OK(ctx, resource->SetValue(b->Tuple({ta, index})));
 
     // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
     auto start_indices =
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index f0525a5fb8..91c169428c 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -231,6 +231,7 @@ class StridedSliceAssignOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("new_axis_mask", &new_axis_mask_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shrink_axis_mask", &shrink_axis_mask_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("Index", &index_type_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -252,9 +253,9 @@ class StridedSliceAssignOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, LiteralToHostTensor(strides_literal, index_type_,
                                             &strides_tensor));
 
-    DataType lhs_type;
     TensorShape lhs_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &lhs_type, &lhs_shape));
+    xla::ComputationDataHandle lhs;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &lhs_shape, &lhs));
 
     const TensorShape rhs_shape = ctx->InputShape(4);
 
@@ -282,9 +283,6 @@ class StridedSliceAssignOp : public XlaOpKernel {
                     " does not match r-value shape ", rhs_shape.DebugString(),
                     ". Automatic broadcasting not yet implemented."));
 
-    xla::ComputationDataHandle lhs;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &lhs));
-
     xla::ComputationDataHandle rhs = ctx->Input(4);
 
     gtl::InlinedVector<int64, 4> dimensions_to_reverse;
@@ -320,13 +318,14 @@ class StridedSliceAssignOp : public XlaOpKernel {
           lhs, rhs, ctx->builder()->ConstantR1<int64>(slice_begin));
     }
 
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, lhs_type, lhs));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, lhs));
   }
 
  private:
   int32 begin_mask_, end_mask_;
   int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_;
   DataType index_type_;
+  DataType dtype_;
 };
 
 REGISTER_XLA_OP(Name("ResourceStridedSliceAssign")
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 9224072a3c..7cf9b796b9 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -62,15 +62,13 @@ Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder,
 
   TF_RET_CHECK(resource->tensor_array_size() >= 0)
       << resource->name() << " size " << resource->tensor_array_size();
-  TensorShape ta_shape;
-  ta_shape.AddDim(resource->tensor_array_size());
-  ta_shape.AppendShape(elem_shape);
 
   if (!resource->initialized()) {
     xla::ComputationDataHandle zero =
         XlaHelpers::Zero(builder, resource->type());
-    TF_RETURN_IF_ERROR(resource->SetValue(
-        dtype, builder->Broadcast(zero, ta_shape.dim_sizes())));
+
+    TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape));
+    TF_RETURN_IF_ERROR(resource->SetZeroValue(builder));
   } else {
     // Checks the elem_shape matches the TensorArray shape.
     auto shape_or_status = builder->GetShape(resource->value());
@@ -80,6 +78,10 @@ Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder,
     TensorShape shape;
     TF_RETURN_IF_ERROR(
         XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), &shape));
+
+    TensorShape ta_shape;
+    ta_shape.AddDim(resource->tensor_array_size());
+    ta_shape.AppendShape(elem_shape);
     if (ta_shape != shape) {
       return errors::InvalidArgument(
           "Mismatched TensorArray sizes: ", ta_shape.DebugString(), " vs ",
@@ -114,10 +116,8 @@ Status CheckTensorArrayIsInitialized(const string& op_name,
 Status GetTensorArrayShape(const XlaResource* resource,
                            xla::ComputationBuilder* builder,
                            TensorShape* shape) {
-  TF_RETURN_IF_ERROR(resource->GetShape(builder, shape));
-  if (shape->dims() < 1) {
-    return errors::InvalidArgument("TensorArray rank must be >= 1");
-  }
+  *shape = resource->shape();
+  shape->InsertDim(0, resource->tensor_array_size());
   return Status::OK();
 }
 
@@ -160,8 +160,8 @@ class TensorArrayOp : public XlaOpKernel {
     // Initializes the TensorArray value if we know the element shape.
     // Otherwise, defer initialization to the first write.
     xla::ComputationDataHandle value;
+    TensorShape shape;
     if (element_shape_.IsFullyDefined()) {
-      TensorShape shape;
       CHECK(element_shape_.AsTensorShape(&shape));
       TensorShape ta_shape;
       ta_shape.AddDim(size);
@@ -175,8 +175,8 @@ class TensorArrayOp : public XlaOpKernel {
     string name = strings::StrCat("TensorArray: ", tensor_array_name_);
     OP_REQUIRES_OK(
         ctx, xc.CreateResource(XlaResource::kTensorArray, -1, std::move(name),
-                               dtype_, value, &var));
-    var->set_tensor_array_size(size);
+                               dtype_, shape, value, /*tensor_array_size=*/size,
+                               /*tensor_array_gradients=*/{}, &var));
     ctx->SetResourceOutput(0, var);
 
     Tensor flow(DT_FLOAT, TensorShape({}));
@@ -230,7 +230,7 @@ class TensorArrayWriteOp : public XlaOpKernel {
     xla::ComputationDataHandle written =
         DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(), start_indices);
 
-    OP_REQUIRES_OK(ctx, resource->SetValue(dtype_, written));
+    OP_REQUIRES_OK(ctx, resource->SetValue(written));
     ctx->SetOutput(0, flow);
   }
 
@@ -421,7 +421,7 @@ class TensorArrayScatterOp : public XlaOpKernel {
       }
     }
 
-    OP_REQUIRES_OK(ctx, resource->SetValue(dtype_, ta));
+    OP_REQUIRES_OK(ctx, resource->SetValue(ta));
     ctx->SetOutput(0, flow);
   }
 
@@ -525,9 +525,8 @@ class TensorArraySplitOp : public XlaOpKernel {
                                         value_shape.DebugString(), " vs. ",
                                         ta_shape.DebugString()));
 
-    OP_REQUIRES_OK(
-        ctx, resource->SetValue(
-                 dtype_, b->Add(ta, b->Reshape(value, ta_shape.dim_sizes()))));
+    OP_REQUIRES_OK(ctx, resource->SetValue(b->Add(
+                            ta, b->Reshape(value, ta_shape.dim_sizes()))));
 
     ctx->SetOutput(0, flow);
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index 5534d1bfa1..f750f7003b 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -32,9 +32,24 @@ class ResourceApplyGradientDescent : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationDataHandle handle;
     xla::ComputationBuilder* b = ctx->builder();
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle));
+    DataType type = ctx->input_type(1);
+    TensorShape var_shape;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &handle));
+
+    TensorShape alpha_shape = ctx->InputShape(1);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha_shape),
+                errors::InvalidArgument("alpha is not a scalar: ",
+                                        alpha_shape.DebugString()));
+
+    TensorShape delta_shape = ctx->InputShape(2);
+    OP_REQUIRES(
+        ctx, var_shape.IsSameSize(delta_shape),
+        errors::InvalidArgument("var and delta do not have the same shape: ",
+                                var_shape.DebugString(), " vs ",
+                                delta_shape.DebugString()));
+
     handle = b->Sub(handle, b->Mul(ctx->Input(1), ctx->Input(2)));
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle));
   }
 };
 REGISTER_XLA_OP(
@@ -52,18 +67,10 @@ class ResourceApplyMomentum : public XlaOpKernel {
 
     DataType type = ctx->input_type(2);
 
-    DataType var_type, accum_type;
     TensorShape var_shape, accum_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
-    OP_REQUIRES_OK(ctx,
-                   ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape));
-
-    OP_REQUIRES(
-        ctx, type == var_type && type == accum_type,
-        errors::InvalidArgument(
-            "Types of variable arguments to ResourceApplyMomentum must match: ",
-            DataTypeString(type), " vs. ", DataTypeString(var_type), " and ",
-            DataTypeString(accum_type)));
+    xla::ComputationDataHandle var, accum;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum));
 
     OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape),
                 errors::InvalidArgument(
@@ -86,10 +93,6 @@ class ResourceApplyMomentum : public XlaOpKernel {
                 errors::InvalidArgument("momentum is not a scalar: ",
                                         momentum_shape.DebugString()));
 
-    xla::ComputationDataHandle var, accum;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum));
-
     xla::ComputationDataHandle lr = ctx->Input(2);
     xla::ComputationDataHandle grad = ctx->Input(3);
     xla::ComputationDataHandle momentum = ctx->Input(4);
@@ -122,18 +125,10 @@ class ResourceApplyAdagrad : public XlaOpKernel {
 
     DataType type = ctx->input_type(2);
 
-    DataType var_type, accum_type;
     TensorShape var_shape, accum_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
-    OP_REQUIRES_OK(ctx,
-                   ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape));
-
-    OP_REQUIRES(
-        ctx, type == var_type && type == accum_type,
-        errors::InvalidArgument(
-            "Types of variable arguments to ResourceApplyAdagrad must match: ",
-            DataTypeString(type), " vs. ", DataTypeString(var_type), " and ",
-            DataTypeString(accum_type)));
+    xla::ComputationDataHandle var, accum;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum));
 
     OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape),
                 errors::InvalidArgument(
@@ -151,9 +146,6 @@ class ResourceApplyAdagrad : public XlaOpKernel {
                     "var and grad do not have the same shape",
                     var_shape.DebugString(), " ", grad_shape.DebugString()));
 
-    xla::ComputationDataHandle var, accum;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum));
     xla::ComputationDataHandle lr = ctx->Input(2);
     xla::ComputationDataHandle grad = ctx->Input(3);
 
@@ -175,18 +167,11 @@ class ResourceApplyAdam : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    DataType var_type, m_type, v_type;
     TensorShape var_shape, m_shape, v_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &m_type, &m_shape));
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &v_type, &v_shape));
-
-    OP_REQUIRES(
-        ctx, dtype_ == var_type && dtype_ == m_type && dtype_ == v_type,
-        errors::InvalidArgument(
-            "Types of variable arguments to ResourceApplyRMSProp must match: ",
-            DataTypeString(dtype_), " vs. ", DataTypeString(var_type), " vs. ",
-            DataTypeString(m_type), " vs. ", DataTypeString(v_type)));
+    xla::ComputationDataHandle var, m, v;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &var_shape, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype_, &m_shape, &m));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype_, &v_shape, &v));
 
     TensorShape beta1_power_shape = ctx->InputShape(3);
     TensorShape beta2_power_shape = ctx->InputShape(4);
@@ -228,10 +213,6 @@ class ResourceApplyAdam : public XlaOpKernel {
                     "var and grad do not have the same shape",
                     var_shape.DebugString(), " ", grad_shape.DebugString()));
 
-    xla::ComputationDataHandle var, m, v;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &m));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &v));
     xla::ComputationDataHandle beta1_power = ctx->Input(3);
     xla::ComputationDataHandle beta2_power = ctx->Input(4);
     xla::ComputationDataHandle lr = ctx->Input(5);
@@ -278,18 +259,11 @@ class ResourceApplyRMSProp : public XlaOpKernel {
 
     DataType type = ctx->input_type(3);
 
-    DataType var_type, ms_type, mom_type;
     TensorShape var_shape, ms_shape, mom_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &ms_type, &ms_shape));
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &mom_type, &mom_shape));
-
-    OP_REQUIRES(
-        ctx, type == var_type && type == ms_type && type == mom_type,
-        errors::InvalidArgument(
-            "Types of variable arguments to ResourceApplyRMSProp must match: ",
-            DataTypeString(type), " vs. ", DataTypeString(var_type), " vs. ",
-            DataTypeString(ms_type), " vs. ", DataTypeString(mom_type)));
+    xla::ComputationDataHandle var, ms, mom;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &ms_shape, &ms));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, type, &mom_shape, &mom));
 
     TensorShape lr_shape = ctx->InputShape(3);
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape),
@@ -323,10 +297,6 @@ class ResourceApplyRMSProp : public XlaOpKernel {
                     "var and grad do not have the same shape",
                     var_shape.DebugString(), " ", grad_shape.DebugString()));
 
-    xla::ComputationDataHandle var, ms, mom;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &ms));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &mom));
     xla::ComputationDataHandle lr = ctx->Input(3);
     xla::ComputationDataHandle rho = ctx->Input(4);
     xla::ComputationDataHandle momentum = ctx->Input(5);
@@ -373,20 +343,11 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype,
                  bool has_l2_shrinkage) {
   xla::ComputationBuilder* b = ctx->builder();
 
-  DataType var_type, accum_type, linear_type;
   TensorShape var_shape, accum_shape, linear_shape;
-  OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
-  OP_REQUIRES_OK(ctx,
-                 ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape));
-  OP_REQUIRES_OK(ctx,
-                 ctx->GetVariableTypeAndShape(2, &linear_type, &linear_shape));
-
-  OP_REQUIRES(
-      ctx, dtype == var_type && dtype == accum_type && dtype == linear_type,
-      errors::InvalidArgument(
-          "Types of variable arguments to ResourceApplyFtrlV2 must match: ",
-          DataTypeString(dtype), " vs. ", DataTypeString(var_type), " and ",
-          DataTypeString(accum_type), " and ", DataTypeString(linear_type)));
+  xla::ComputationDataHandle var, accum, linear;
+  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype, &var_shape, &var));
+  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype, &accum_shape, &accum));
+  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype, &linear_shape, &linear));
 
   OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape),
               errors::InvalidArgument(
@@ -438,10 +399,6 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype,
               errors::InvalidArgument("lr_power is not a scalar: ",
                                       lr_power_shape.DebugString()));
 
-  xla::ComputationDataHandle var, accum, linear;
-  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
-  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum));
-  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &linear));
   xla::ComputationDataHandle grad = ctx->Input(3);
   xla::ComputationDataHandle lr = ctx->Input(4);
   xla::ComputationDataHandle l1 = ctx->Input(5);
diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
index 68847ae7a2..e4079ebf0b 100644
--- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
@@ -33,21 +33,29 @@ class VarIsInitializedOp : public XlaOpKernel {
  public:
   explicit VarIsInitializedOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::ComputationDataHandle handle;
-    bool initialized = ctx->ReadVariableInput(0, &handle).ok();
-    ctx->SetOutput(0, ctx->builder()->ConstantR0<bool>(initialized));
+    XlaResource* variable;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &variable));
+    ctx->SetOutput(0,
+                   ctx->builder()->ConstantR0<bool>(variable->initialized()));
   }
 };
 REGISTER_XLA_OP(Name("VarIsInitializedOp"), VarIsInitializedOp);
 
 class ReadVariableOp : public XlaOpKernel {
  public:
-  explicit ReadVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  explicit ReadVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+  }
+
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationDataHandle handle;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle));
+    OP_REQUIRES_OK(
+        ctx, ctx->ReadVariableInput(0, dtype_, /*shape=*/nullptr, &handle));
     ctx->SetOutput(0, handle);
   }
+
+ private:
+  DataType dtype_;
 };
 REGISTER_XLA_OP(Name("ReadVariableOp"), ReadVariableOp);
 
@@ -65,10 +73,12 @@ class AssignAddVariableOp : public XlaOpKernel {
  public:
   explicit AssignAddVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
   void Compile(XlaOpKernelContext* ctx) override {
+    DataType type = ctx->input_type(1);
     xla::ComputationDataHandle handle;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle));
+    OP_REQUIRES_OK(ctx,
+                   ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle));
     handle = ctx->builder()->Add(handle, ctx->Input(1));
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle));
   }
 };
 REGISTER_XLA_OP(
@@ -79,10 +89,12 @@ class AssignSubVariableOp : public XlaOpKernel {
  public:
   explicit AssignSubVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
   void Compile(XlaOpKernelContext* ctx) override {
+    DataType type = ctx->input_type(1);
     xla::ComputationDataHandle handle;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle));
+    OP_REQUIRES_OK(ctx,
+                   ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle));
     handle = ctx->builder()->Sub(handle, ctx->Input(1));
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle));
   }
 };
 REGISTER_XLA_OP(
@@ -95,28 +107,19 @@ class ResourceGatherOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationBuilder* builder = ctx->builder();
 
-    // Get the shape of the resource tensor.
-    TensorShape resource_shape;
-    DataType resource_dtype;
-    OP_REQUIRES_OK(
-        ctx, ctx->GetVariableTypeAndShape(0, &resource_dtype, &resource_shape));
-
-    DataType expected_output_dtype = ctx->expected_output_dtype(0);
-    OP_REQUIRES(ctx, resource_dtype == expected_output_dtype,
-                errors::InvalidArgument(
-                    "Variable dtype is ", DataTypeString(resource_dtype),
-                    " but expected output dtype is ",
-                    DataTypeString(expected_output_dtype), "."));
+    DataType type = ctx->expected_output_dtype(0);
 
+    TensorShape resource_shape;
     xla::ComputationDataHandle resource_handle;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &resource_handle));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &resource_shape,
+                                               &resource_handle));
 
     auto indices = ctx->Input(1);
     auto indices_shape = ctx->InputShape(1);
     DataType index_type = ctx->input_type(1);
     xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice(
-        ctx, resource_handle, resource_shape, indices, indices_shape, 0,
-        resource_dtype, index_type, builder);
+        ctx, resource_handle, resource_shape, indices, indices_shape, 0, type,
+        index_type, builder);
     ctx->SetOutput(0, gather);
   }
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index 4a711e4d9b..0ff1b65ae9 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -58,9 +58,8 @@ Status MakeXlaCompilerArgumentsFromInputs(
       }
 
       arg.type = resource->type();
-      if (arg.initialized) {
-        TF_RETURN_IF_ERROR(resource->PackedShape(ctx->builder(), &arg.shape));
-      } else {
+      arg.shape = resource->shape();
+      if (!arg.initialized) {
         *has_uninitialized_vars = true;
       }
       arg.tensor_array_size = resource->tensor_array_size();
@@ -70,14 +69,13 @@ Status MakeXlaCompilerArgumentsFromInputs(
       arg.name = resource->name();
       VLOG(2) << "    resource " << resource->name()
               << " type: " << DataTypeString(arg.type)
-              << " shape: " << xla::ShapeUtil::HumanString(arg.shape)
+              << " shape: " << arg.shape.DebugString()
               << " initialized: " << arg.initialized;
 
     } else {
       arg.kind = XlaCompiler::Argument::kParameter;
       arg.type = ctx->input_type(i);
-      TF_RETURN_IF_ERROR(
-          TensorShapeToXLAShape(arg.type, ctx->InputShape(i), &arg.shape));
+      arg.shape = ctx->InputShape(i);
     }
   }
   return Status::OK();
@@ -154,17 +152,14 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
       XlaCompiler::Argument& arg = arguments[update.input_index];
       if (!arg.initialized) {
         VLOG(2) << "Update shape for argument " << update.input_index << " "
-                << xla::ShapeUtil::HumanString(update.shape);
+                << update.shape.DebugString();
         arg.initialized = true;
 
-        xla::Shape shape = update.shape;
-        if (!update.tensor_array_gradients_accessed.empty()) {
-          shape = xla::ShapeUtil::GetTupleElementShape(shape, 0);
-        }
-        std::unique_ptr<xla::Literal> zero =
-            xla::Literal::CreateFromShape(shape);
-        OP_REQUIRES_OK(ctx, resource->SetValue(
-                                update.type, builder->ConstantLiteral(*zero)));
+        arg.shape = update.shape;
+        OP_REQUIRES_OK(ctx,
+                       resource->SetTypeAndShape(update.type, update.shape));
+
+        OP_REQUIRES_OK(ctx, resource->SetZeroValue(builder));
       }
 
       // Add any TensorArray gradients touched by the body to the enclosing
@@ -182,9 +177,6 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
       for (const auto& gradient : resource->tensor_array_gradients()) {
         arg.tensor_array_gradients.insert(gradient.first);
       }
-
-      // Recompute the argument shape.
-      OP_REQUIRES_OK(ctx, resource->PackedShape(ctx->builder(), &arg.shape));
     }
     // Recompile the body with the "correct" resource shapes.
     VLOG(1) << "Recompiling body with corrected resource shapes";
@@ -292,13 +284,12 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
       OP_REQUIRES_OK(ctx,
                      resource->SetFromPack(
                          arguments[update.input_index].tensor_array_gradients,
-                         builder->GetTupleElement(while_result, pos),
-                         /*reset_initial_values=*/false, builder));
+                         builder->GetTupleElement(while_result, pos), builder));
     }
     VLOG(2) << "Loop-carried variable: pos: " << update.input_index
             << " name: " << resource->name() << " modified: " << update.modified
             << " type: " << DataTypeString(update.type)
-            << " shape: " << xla::ShapeUtil::HumanString(update.shape);
+            << " shape: " << update.shape.DebugString();
     // Copies the identity of the resource variable from input to output
     // unchanged, even if the variable was not modified.
     ctx->op_kernel_context()->set_output(
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index 906f229043..6051d7dffd 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -241,9 +241,7 @@ Status CreateXlaArgs(const Graph& graph,
     XlaCompiler::Argument arg;
     arg.kind = XlaCompiler::Argument::kParameter;
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &arg.type));
-    TensorShape shape;
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &shape));
-    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, &arg.shape));
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &arg.shape));
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name));
     xla_args->push_back(arg);
   }
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 69b265436b..c5b4ec5b15 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -66,13 +66,14 @@ Status CheckSignature(const DataTypeVector& types,
 
 bool XlaCompiler::Argument::operator==(
     const XlaCompiler::Argument& other) const {
-  if (std::tie(kind, resource_kind, type, name, tensor_array_size,
+  if (std::tie(kind, resource_kind, type, name, initialized, tensor_array_size,
                tensor_array_gradients) !=
       std::tie(other.kind, other.resource_kind, other.type, other.name,
-               other.tensor_array_size, other.tensor_array_gradients)) {
+               other.initialized, other.tensor_array_size,
+               other.tensor_array_gradients)) {
     return false;
   }
-  if (!xla::ShapeUtil::Equal(shape, other.shape)) {
+  if (shape != other.shape) {
     return false;
   }
   if (constant_value.shape() != other.constant_value.shape()) {
@@ -230,6 +231,64 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
   return Status::OK();
 }
 
+// Computes the XLA shape for argument 'arg'.
+/*static*/ Status XlaCompiler::XLAShapeForArgument(
+    const XlaCompiler::Argument& arg, xla::Shape* xla_shape) {
+  switch (arg.kind) {
+    case XlaCompiler::Argument::kConstant:
+      return TensorShapeToXLAShape(arg.type, arg.constant_value.shape(),
+                                   xla_shape);
+    case XlaCompiler::Argument::kParameter:
+      return TensorShapeToXLAShape(arg.type, arg.shape, xla_shape);
+    case XlaCompiler::Argument::kResource: {
+      TF_RET_CHECK(arg.initialized);
+
+      switch (arg.resource_kind) {
+        case XlaResource::kVariable:
+          return TensorShapeToXLAShape(arg.type, arg.shape, xla_shape);
+        case XlaResource::kTensorArray: {
+          if (arg.tensor_array_size < 0) {
+            return errors::InvalidArgument(
+                "Negative tensor_array_size in XLAShapeForArgument");
+          }
+          TensorShape shape;
+          shape.AddDim(arg.tensor_array_size);
+          shape.AppendShape(arg.shape);
+          TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, xla_shape));
+
+          if (!arg.tensor_array_gradients.empty()) {
+            std::vector<xla::Shape> tuple_shape(
+                arg.tensor_array_gradients.size() + 1, *xla_shape);
+            *xla_shape = xla::ShapeUtil::MakeTupleShape(tuple_shape);
+          }
+          return Status::OK();
+        }
+        case XlaResource::kStack: {
+          if (arg.tensor_array_size < 0) {
+            return errors::InvalidArgument(
+                "Negative tensor_array_size in XLAShapeForArgument");
+          }
+          TensorShape shape;
+          shape.AddDim(arg.tensor_array_size);
+          shape.AppendShape(arg.shape);
+          xla::Shape buffer_shape;
+          TF_RETURN_IF_ERROR(
+              TensorShapeToXLAShape(arg.type, shape, &buffer_shape));
+          *xla_shape = xla::ShapeUtil::MakeTupleShape(
+              {buffer_shape, xla::ShapeUtil::MakeShape(xla::S32, {})});
+          return Status::OK();
+        }
+
+        case XlaResource::kInvalid:
+          return errors::Internal(
+              "Invalid resource type in XLAShapeForArgument()");
+      }
+    }
+    case XlaCompiler::Argument::kInvalid:
+      return errors::Internal("Invalid argument type in XLAShapeForArgument()");
+  }
+}
+
 namespace {
 
 Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr<Graph> graph,
@@ -275,8 +334,9 @@ Status BuildArguments(const Graph& graph,
 
   // Argument numbers of arguments and resources that are to be passed to the
   // XLA computation as runtime parameters.
-  std::vector<int> parameters, resources;
-  parameters.reserve(args.size());
+  input_mapping->clear();
+  input_mapping->reserve(args.size());
+  std::vector<int> resources;
   resources.reserve(args.size());
 
   // Fills in constant arguments, and computes non-constant argument order.
@@ -290,18 +350,20 @@ Status BuildArguments(const Graph& graph,
         // TODO(phawkins): this code assumes that resource arguments do not
         // alias.
         XlaResource* resource;
-        TF_RETURN_IF_ERROR(
-            context->CreateResource(arg.resource_kind, i, arg.name, arg.type,
-                                    xla::ComputationDataHandle(), &resource));
-        resource->set_tensor_array_size(arg.tensor_array_size);
+        TF_RETURN_IF_ERROR(context->CreateResource(
+            arg.resource_kind, i, arg.name, arg.type, arg.shape,
+            xla::ComputationDataHandle(),
+            /*tensor_array_size=*/arg.tensor_array_size,
+            /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource));
         arg_expression.set_resource(resource);
         if (arg.initialized) {
           resources.push_back(i);
         }
         break;
-      case XlaCompiler::Argument::kParameter:
-        parameters.push_back(i);
+      case XlaCompiler::Argument::kParameter: {
+        input_mapping->push_back(i);
         break;
+      }
       case XlaCompiler::Argument::kConstant:
         arg_expression.set_constant_value(arg.constant_value);
         break;
@@ -312,19 +374,17 @@ Status BuildArguments(const Graph& graph,
 
   // Append parameters containing variable values after the other runtime
   // parameters.
-  parameters.insert(parameters.end(), resources.begin(), resources.end());
-  if (parameters.empty()) {
+  input_mapping->insert(input_mapping->end(), resources.begin(),
+                        resources.end());
+  if (input_mapping->empty()) {
     return Status::OK();
   }
 
-  std::vector<xla::Shape> arg_shapes;
-  arg_shapes.reserve(parameters.size());
-  input_mapping->resize(parameters.size());
-  for (std::vector<int>::size_type i = 0; i < parameters.size(); ++i) {
-    const XlaCompiler::Argument& arg = args[parameters[i]];
+  std::vector<xla::Shape> arg_shapes(input_mapping->size());
+  for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
     // Computes the shapes of non-constant arguments.
-    arg_shapes.push_back(arg.shape);
-    (*input_mapping)[i] = parameters[i];
+    TF_RETURN_IF_ERROR(XlaCompiler::XLAShapeForArgument(
+        args[(*input_mapping)[i]], &arg_shapes[i]));
   }
 
   if (use_tuple_arg) {
@@ -354,13 +414,13 @@ Status BuildArguments(const Graph& graph,
   }
 
   // Build parameter handles for non-constant arguments.
-  std::vector<xla::ComputationDataHandle> arg_handles(parameters.size());
+  std::vector<xla::ComputationDataHandle> arg_handles(input_mapping->size());
   if (use_tuple_arg) {
     xla::ComputationDataHandle tuple;
     if (is_entry_computation) {
       xla::OpSharding tuple_sharding;
       tuple_sharding.set_type(xla::OpSharding::Type::OpSharding_Type_TUPLE);
-      for (int64 parameter : parameters) {
+      for (int64 parameter : *input_mapping) {
         const int core = (*arg_cores)[parameter];
         const int root_device = 0;
         *tuple_sharding.add_tuple_shardings() =
@@ -373,16 +433,16 @@ Status BuildArguments(const Graph& graph,
     } else {
       tuple = builder->Parameter(0, (*input_shapes)[0], "arg_tuple");
     }
-    for (std::vector<int>::size_type i = 0; i < parameters.size(); ++i) {
-      const int core = (*arg_cores)[parameters[i]];
+    for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+      const int core = (*arg_cores)[input_mapping->at(i)];
       xla::ScopedShardingAssignment assign_sharding(
           builder, core == -1 ? tensorflow::gtl::optional<xla::OpSharding>()
                               : xla::sharding_builder::AssignDevice(core));
       arg_handles[i] = builder->GetTupleElement(tuple, i);
     }
   } else {
-    for (std::vector<int>::size_type i = 0; i < parameters.size(); ++i) {
-      const int core = (*arg_cores)[parameters[i]];
+    for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+      const int core = (*arg_cores)[input_mapping->at(i)];
       xla::ScopedShardingAssignment assign_sharding(
           builder, core == -1 ? tensorflow::gtl::optional<xla::OpSharding>()
                               : xla::sharding_builder::AssignDevice(core));
@@ -393,19 +453,18 @@ Status BuildArguments(const Graph& graph,
 
   // Fill in the handles in non-constant arguments.
   VLOG(2) << "XLA computation inputs:";
-  for (std::vector<int>::size_type i = 0; i < parameters.size(); ++i) {
-    const XlaCompiler::Argument& arg = args[parameters[i]];
+  for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+    const XlaCompiler::Argument& arg = args[input_mapping->at(i)];
     VLOG(2) << "  XLA arg " << i
             << " shape: " << xla::ShapeUtil::HumanString(arg_shapes[i])
-            << " name: " << arg.name << " TF arg " << parameters[i];
-    XlaExpression& arg_expression = (*arg_expressions)[parameters[i]];
+            << " name: " << arg.name << " TF arg " << input_mapping->at(i);
+    XlaExpression& arg_expression = (*arg_expressions)[input_mapping->at(i)];
     switch (arg.kind) {
       case XlaCompiler::Argument::kResource: {
         TF_RET_CHECK(arg.initialized);
         XlaResource* resource = arg_expression.resource();
-        TF_RETURN_IF_ERROR(
-            resource->SetFromPack(arg.tensor_array_gradients, arg_handles[i],
-                                  /*reset_initial_values=*/true, builder));
+        TF_RETURN_IF_ERROR(resource->SetFromPack(arg.tensor_array_gradients,
+                                                 arg_handles[i], builder));
         VLOG(2) << "    resource: num_gradients: "
                 << arg.tensor_array_gradients.size();
         break;
@@ -486,6 +545,7 @@ Status BuildComputation(
       XlaCompiler::ResourceUpdate& update = resource_updates->back();
       update.input_index = resource->arg_num();
       update.type = resource->type();
+      update.shape = resource->shape();
       update.modified = modified;
       for (const auto& grad : resource->tensor_array_gradients()) {
         update.tensor_array_gradients_accessed.insert(grad.first);
@@ -616,13 +676,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
       ++computation_output;
     }
   }
-
-  for (std::vector<ResourceUpdate>::size_type i = 0;
-       i < result->resource_updates.size(); ++i) {
-    result->resource_updates[i].shape = xla::ShapeUtil::GetTupleElementShape(
-        result->xla_output_shape, computation_output);
-    ++computation_output;
-  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 30d3c05ee9..b86c82c0ab 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -104,9 +104,17 @@ class XlaCompiler {
     // is the type of the variable's value, not DT_RESOURCE.
     DataType type;
 
-    // The shape of the argument. If the argument is a resource, this is the
-    // shape of the resource's value.
-    xla::Shape shape;
+    // The shape of the argument. For:
+    // * a parameter: the shape of the parameter.
+    // * a constant: ignored; the shape given by constant_value is used
+    //     instead.
+    // * an uninitialized resource: ignored. We don't yet know the shape of an
+    //     uninitialized resource (otherwise we would have initialized it!)
+    // * an initialized variable: the shape of the variable's value.
+    // * an initialized TensorArray or Stack resource: the shape of an entry in
+    //   the TensorArray/Stack. Note this is the size of a single entry, not the
+    //   XLA data structure that represents the complete stack/array.
+    TensorShape shape;
 
     // The value of the argument, if it is a compile-time constant. Must be a
     // host-memory tensor.
@@ -175,8 +183,9 @@ class XlaCompiler {
     int input_index;
 
     // Type and shape of the tensor to be written back.
+    // The `shape` field has the same meaning as the Argument::shape field.
     DataType type;
-    xla::Shape shape;
+    TensorShape shape;
 
     // Was the value of the variable modified by the computation?
     // (Always true, unless `return_updated_values_for_all_resources` is true.)
@@ -266,11 +275,10 @@ class XlaCompiler {
                       const std::vector<Argument>& args,
                       CompilationResult* result);
 
-  Status PrepareArguments(xla::ComputationBuilder* builder, NameAttrList func,
-                          const std::vector<DataType>& types,
-                          const std::vector<TensorShape>& shapes,
-                          const std::vector<const XlaExpression*>& expressions,
-                          std::vector<Argument>* args);
+  // Returns the shape of the XLA parameter for an argument 'arg'.
+  // See the class comment for more details about the argument passing
+  // convention.
+  static Status XLAShapeForArgument(const Argument& arg, xla::Shape* xla_shape);
 
   // Retrieves the channel handle associated with `key`. Allocates
   // a new channel handle if none exists.
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 7ebe4b75bc..65de4dbad7 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -191,10 +191,10 @@ TEST_F(XlaCompilerTest, Simple) {
   std::vector<XlaCompiler::Argument> args(2);
   args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+  args[0].shape = TensorShape({2});
   args[1].kind = XlaCompiler::Argument::kParameter;
   args[1].type = DT_INT32;
-  args[1].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+  args[1].shape = TensorShape({2});
 
   // Compiles the graph.
   XlaCompiler compiler(DefaultOptions());
@@ -242,10 +242,10 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
   std::vector<XlaCompiler::Argument> args(2);
   args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+  args[0].shape = TensorShape({2});
   args[1].kind = XlaCompiler::Argument::kParameter;
   args[1].type = DT_INT32;
-  args[1].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+  args[1].shape = TensorShape({2});
 
   // Compiles the graph.
   XlaCompiler compiler(DefaultOptions());
@@ -281,7 +281,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
   std::vector<XlaCompiler::Argument> args(1);
   args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+  args[0].shape = TensorShape({2});
 
   XlaCompiler::Options options = DefaultOptions();
   XlaCompiler compiler(options);
@@ -373,7 +373,7 @@ TEST_F(XlaCompilerTest, ResourceManager) {
   std::vector<XlaCompiler::Argument> args(1);
   args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+  args[0].shape = TensorShape({2});
 
   DummyResourceForTest* resource = new DummyResourceForTest();
 
@@ -420,7 +420,7 @@ TEST_F(XlaCompilerTest, DeterministicCompilation) {
     std::vector<XlaCompiler::Argument> args(1);
     args[0].kind = XlaCompiler::Argument::kParameter;
     args[0].type = DT_INT32;
-    args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+    args[0].shape = TensorShape({2});
 
     // Compiles the graph.
     auto options = DefaultOptions();
@@ -472,9 +472,7 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) {
   args[0].resource_kind = XlaResource::kTensorArray;
   args[0].initialized = true;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeTupleShape(
-      {xla::ShapeUtil::MakeShape(xla::S32, {2}),
-       xla::ShapeUtil::MakeShape(xla::S32, {2})});
+  args[0].shape = TensorShape({});
   args[0].tensor_array_size = 2;
   args[0].tensor_array_gradients = {"grad2"};
 
@@ -540,9 +538,7 @@ TEST_F(XlaCompilerTest, UnwrittenTensorArrayGradientsAreNotComputationOutputs) {
   args[0].resource_kind = XlaResource::kTensorArray;
   args[0].initialized = true;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeTupleShape(
-      {xla::ShapeUtil::MakeShape(xla::S32, {2}),
-       xla::ShapeUtil::MakeShape(xla::S32, {2})});
+  args[0].shape = TensorShape({});
   args[0].tensor_array_size = 2;
   args[0].tensor_array_gradients = {"grad1"};
 
@@ -574,9 +570,7 @@ TEST_F(XlaCompilerTest, NewTensorArrayGradientsAreComputationOutputs) {
   args[0].resource_kind = XlaResource::kTensorArray;
   args[0].initialized = true;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeTupleShape(
-      {xla::ShapeUtil::MakeShape(xla::S32, {2}),
-       xla::ShapeUtil::MakeShape(xla::S32, {2})});
+  args[0].shape = TensorShape({});
   args[0].tensor_array_size = 2;
   args[0].tensor_array_gradients = {"grad1"};
 
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index e8d17e2e0a..73878955e3 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -103,12 +103,14 @@ Status XlaContext::AddConstRetval(int retval_index, DataType dtype,
 
 xla::ComputationBuilder* XlaContext::builder() { return builder_; }
 
-Status XlaContext::CreateResource(XlaResource::Kind kind, int arg_num,
-                                  string name, DataType type,
-                                  const xla::ComputationDataHandle& handle,
-                                  XlaResource** resource) {
+Status XlaContext::CreateResource(
+    XlaResource::Kind kind, int arg_num, string name, DataType type,
+    TensorShape shape, const xla::ComputationDataHandle& handle,
+    int64 tensor_array_size, const std::set<string>& tensor_array_gradients,
+    XlaResource** resource) {
   resources_.emplace_back(
-      new XlaResource(kind, arg_num, std::move(name), type, handle));
+      new XlaResource(kind, arg_num, std::move(name), type, std::move(shape),
+                      handle, tensor_array_size, tensor_array_gradients));
   *resource = resources_.back().get();
   return Status::OK();
 }
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 1a7dafe8cd..fac0352ae8 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -71,11 +71,15 @@ class XlaContext : public ResourceBase {
   Status AddConstRetval(int retval_index, DataType dtype,
                         const xla::Literal& literal);
 
-  // Creates a resource with resource `kind` and initial type `type` and
-  // value `handle`. `name` is a descriptive name for use in error messages.
+  // Creates a resource with resource `kind` and initial value `handle`. `name`
+  // is a descriptive name for use in error messages. See the `XlaResource`
+  // constructor for a description of the remaining arguments.
   // Fails if the resource already exists.
   Status CreateResource(XlaResource::Kind kind, int arg_num, string name,
-                        DataType type, const xla::ComputationDataHandle& handle,
+                        DataType type, TensorShape shape,
+                        const xla::ComputationDataHandle& handle,
+                        int64 tensor_array_size,
+                        const std::set<string>& tensor_array_gradients,
                         XlaResource** resource);
 
   const std::vector<std::unique_ptr<XlaResource>>& resources() {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index ee0aed672e..ee29158646 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -286,7 +286,8 @@ Status XlaOpKernelContext::ConstantInputList(
 }
 
 Status XlaOpKernelContext::ReadVariableInput(
-    int index, xla::ComputationDataHandle* value) {
+    int index, DataType type, TensorShape* shape,
+    xla::ComputationDataHandle* value) {
   const Tensor& tensor = context_->input(index);
   const XlaExpression* expression = CastExpressionFromTensor(tensor);
   XlaResource* variable = expression->resource();
@@ -296,7 +297,15 @@ Status XlaOpKernelContext::ReadVariableInput(
     return errors::InvalidArgument("Read of uninitialized variable ",
                                    variable->name());
   }
+  if (variable->type() != type) {
+    return errors::InvalidArgument(
+        "Type mismatch for read of variable ", variable->name(), ". Expected ",
+        DataTypeString(type), "; got ", DataTypeString(variable->type()));
+  }
   *value = variable->value();
+  if (shape) {
+    *shape = variable->shape();
+  }
   return Status::OK();
 }
 
@@ -312,12 +321,7 @@ Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type,
                                    variable->name());
   }
   *type = variable->type();
-  auto shape_or_status = builder()->GetShape(variable->value());
-  if (!shape_or_status.ok()) {
-    return shape_or_status.status();
-  }
-  TF_RETURN_IF_ERROR(
-      XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), shape));
+  *shape = variable->shape();
   return Status::OK();
 }
 
@@ -405,7 +409,17 @@ Status XlaOpKernelContext::AssignVariable(
   XlaResource* variable = expression->resource();
   TF_RET_CHECK(variable != nullptr);
   TF_RET_CHECK(variable->kind() == XlaResource::kVariable);
-  return variable->SetValue(type, handle);
+
+  auto shape_or_status = builder()->GetShape(handle);
+  if (!shape_or_status.ok()) {
+    return shape_or_status.status();
+  }
+  TensorShape shape;
+  TF_RETURN_IF_ERROR(
+      XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), &shape));
+
+  TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape));
+  return variable->SetValue(handle);
 }
 
 XlaCompiler* XlaOpKernelContext::compiler() const {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 6d3b6db228..e1fd0f55c6 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -164,11 +164,16 @@ class XlaOpKernelContext {
                                  TensorShape* shape) const;
 
   // Reads the current value of the resouce variable referred to by input
-  // 'index'.
-  Status ReadVariableInput(int index, xla::ComputationDataHandle* value);
+  // 'index'. If `shape` is not nullptr, sets `*shape` to the shape of the
+  // variable. Returns an error if the variable has not been initialized, or if
+  // its type does not match `type`.
+  Status ReadVariableInput(int index, DataType type, TensorShape* shape,
+                           xla::ComputationDataHandle* value);
 
   // Assigns the value `handle` to the variable referenced by input
-  // `input_index`. Marks the operator as having side effects.
+  // `input_index`. The variable must be of `type`. Returns an error if the
+  // variable has been initialized with a different type or with a
+  // different shape.
   Status AssignVariable(int input_index, DataType type,
                         const xla::ComputationDataHandle& handle);
 
diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc
index 9abac8bdaa..c2075b44b8 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.cc
+++ b/tensorflow/compiler/tf2xla/xla_resource.cc
@@ -25,51 +25,99 @@ limitations under the License.
 
 namespace tensorflow {
 
-XlaResource::XlaResource(Kind kind, int arg_num, string name,
-                         DataType initial_type,
-                         const xla::ComputationDataHandle& initial_value)
+XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type,
+                         TensorShape shape,
+                         const xla::ComputationDataHandle& initial_value,
+                         int64 tensor_array_size,
+                         const std::set<string>& tensor_array_gradients)
     : kind_(kind),
       arg_num_(arg_num),
       name_(std::move(name)),
-      type_(initial_type),
+      type_(type),
+      shape_(std::move(shape)),
       value_(initial_value),
-      initial_value_(initial_value) {
+      initial_value_(initial_value),
+      tensor_array_size_(tensor_array_size) {
   CHECK(kind_ != kInvalid);
+
+  for (const string& gradient : tensor_array_gradients) {
+    tensor_array_gradients_[gradient].reset(
+        new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1,
+                        /*name=*/strings::StrCat("TensorArrayGrad: ", name_),
+                        type_, shape_, xla::ComputationDataHandle(),
+                        tensor_array_size_, /*tensor_array_gradients=*/{}));
+  }
 }
 
-Status XlaResource::SetValue(DataType type,
-                             const xla::ComputationDataHandle& value) {
-  if (type_ == DT_INVALID && type == DT_INVALID) {
-    return errors::InvalidArgument("Attempted to initialized resource ", name_,
-                                   " to an invalid type");
+Status XlaResource::SetTypeAndShape(DataType type, const TensorShape& shape) {
+  if (type == DT_INVALID) {
+    return errors::InvalidArgument("Attempted to set type of resource '", name_,
+                                   "'' to an invalid type");
   }
-  if (type_ != DT_INVALID && type_ != type) {
+  if (initialized() && type_ != type) {
     return errors::InvalidArgument("Type of resource ", name_,
                                    " cannot be changed after initialization: "
                                    "old type was ",
                                    DataTypeString(type_), ", new type is ",
                                    DataTypeString(type));
   }
+  if (initialized() && shape_ != shape) {
+    return errors::InvalidArgument("Shape of resource ", name_,
+                                   " cannot be changed after initialization: "
+                                   "old shape was ",
+                                   shape_.DebugString(), ", new shape is ",
+                                   shape.DebugString());
+  }
   type_ = type;
-  value_ = value;
+  shape_ = shape;
   return Status::OK();
 }
 
-Status XlaResource::GetXlaShape(xla::ComputationBuilder* builder,
-                                xla::Shape* shape) const {
-  auto shape_or_status = builder->GetShape(value_);
-  if (!shape_or_status.ok()) {
-    return shape_or_status.status();
+Status XlaResource::SetValue(const xla::ComputationDataHandle& value) {
+  if (type_ == DT_INVALID) {
+    return errors::InvalidArgument(
+        "Resource '", name_,
+        "' must be initialized with a valid type before use.");
   }
-  *shape = *shape_or_status.ValueOrDie();
+  value_ = value;
   return Status::OK();
 }
 
-Status XlaResource::GetShape(xla::ComputationBuilder* builder,
-                             TensorShape* shape) const {
-  xla::Shape xla_shape;
-  TF_RETURN_IF_ERROR(GetXlaShape(builder, &xla_shape));
-  TF_RETURN_IF_ERROR(XLAShapeToTensorShape(xla_shape, shape));
+Status XlaResource::SetZeroValue(xla::ComputationBuilder* builder) {
+  if (type_ == DT_INVALID) {
+    return errors::InvalidArgument(
+        "Resource '", name_,
+        "' must be initialized with a valid type before use.");
+  }
+  switch (kind_) {
+    case kVariable: {
+      value_ = builder->Broadcast(XlaHelpers::Zero(builder, type_),
+                                  shape_.dim_sizes());
+      break;
+    }
+    case kTensorArray: {
+      TensorShape ta_shape;
+      ta_shape.AddDim(tensor_array_size_);
+      ta_shape.AppendShape(shape_);
+      value_ = builder->Broadcast(XlaHelpers::Zero(builder, type_),
+                                  ta_shape.dim_sizes());
+      break;
+    }
+    case kStack: {
+      TensorShape ta_shape;
+      ta_shape.AddDim(tensor_array_size_);
+      ta_shape.AppendShape(shape_);
+      value_ =
+          builder->Tuple({builder->Broadcast(XlaHelpers::Zero(builder, type_),
+                                             ta_shape.dim_sizes()),
+                          builder->ConstantR0<int32>(0)});
+      break;
+    }
+
+    case kInvalid:
+    default:
+      LOG(FATAL) << "Invalid resource type";
+  }
   return Status::OK();
 }
 
@@ -82,36 +130,20 @@ Status XlaResource::GetOrCreateTensorArrayGradient(
   std::unique_ptr<XlaResource>& gradient = tensor_array_gradients_[source];
   if (!gradient) {
     TensorShape ta_shape;
-    TF_RETURN_IF_ERROR(GetShape(builder, &ta_shape));
+    ta_shape.AddDim(tensor_array_size_);
+    ta_shape.AppendShape(shape_);
     xla::ComputationDataHandle gradient_value = builder->Broadcast(
         XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes());
     gradient.reset(
         new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1,
                         /*name=*/strings::StrCat("TensorArrayGrad: ", name_),
-                        type_, gradient_value));
-    gradient->tensor_array_size_ = tensor_array_size_;
+                        type_, shape_, gradient_value, tensor_array_size_,
+                        /*tensor_array_gradients=*/{}));
   }
   *gradient_out = gradient.get();
   return Status::OK();
 }
 
-Status XlaResource::PackedShape(xla::ComputationBuilder* builder,
-                                xla::Shape* packed_shape) const {
-  if (tensor_array_gradients_.empty()) {
-    return GetXlaShape(builder, packed_shape);
-  }
-  TF_RET_CHECK(kind_ == kTensorArray);
-  std::vector<xla::Shape> elem_shapes(1 + tensor_array_gradients_.size());
-  int pos = 0;
-  TF_RETURN_IF_ERROR(GetXlaShape(builder, &elem_shapes[pos++]));
-  for (const auto& gradient : tensor_array_gradients_) {
-    TF_RETURN_IF_ERROR(
-        gradient.second->GetXlaShape(builder, &elem_shapes[pos++]));
-  }
-  *packed_shape = xla::ShapeUtil::MakeTupleShape(elem_shapes);
-  return Status::OK();
-}
-
 Status XlaResource::Pack(xla::ComputationDataHandle* pack,
                          xla::ComputationBuilder* builder) const {
   if (tensor_array_gradients_.empty()) {
@@ -130,27 +162,32 @@ Status XlaResource::Pack(xla::ComputationDataHandle* pack,
 
 Status XlaResource::SetFromPack(const std::set<string>& gradient_sources,
                                 const xla::ComputationDataHandle& pack,
-                                bool reset_initial_values,
                                 xla::ComputationBuilder* builder) {
   if (gradient_sources.empty()) {
+    if (!initialized()) {
+      initial_value_ = pack;
+    }
     value_ = pack;
   } else {
     TF_RET_CHECK(kind_ == kTensorArray);
     int pos = 0;
-    value_ = builder->GetTupleElement(pack, pos++);
+    auto v = builder->GetTupleElement(pack, pos++);
+    if (!initialized()) {
+      initial_value_ = v;
+    }
+    value_ = v;
+
     for (const auto& source : gradient_sources) {
       XlaResource* gradient;
       TF_RETURN_IF_ERROR(
           GetOrCreateTensorArrayGradient(source, builder, &gradient));
-      gradient->value_ = builder->GetTupleElement(pack, pos++);
-      if (reset_initial_values) {
-        gradient->initial_value_ = gradient->value_;
+      auto v = builder->GetTupleElement(pack, pos++);
+      if (!gradient->initialized()) {
+        gradient->initial_value_ = v;
       }
+      gradient->value_ = v;
     }
   }
-  if (reset_initial_values) {
-    initial_value_ = value_;
-  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h
index 6b46089e4f..1bb2c7274e 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.h
+++ b/tensorflow/compiler/tf2xla/xla_resource.h
@@ -36,8 +36,11 @@ class XlaResource {
     kStack,
   };
 
-  XlaResource(Kind kind, int arg_num, string name, DataType initial_type,
-              const xla::ComputationDataHandle& initial_value);
+  XlaResource(Kind kind, int arg_num, string name, DataType type,
+              TensorShape shape,
+              const xla::ComputationDataHandle& initial_value,
+              int64 tensor_array_size,
+              const std::set<string>& tensor_array_gradients);
 
   XlaResource(const XlaResource&) = delete;
   XlaResource(XlaResource&&) = delete;
@@ -60,6 +63,12 @@ class XlaResource {
   // a resource is first initialized we do not yet know its type, so we keep
   // track of its type dynamically.
   DataType type() const { return type_; }
+
+  // Shape of the resource. For an uninitialized resource, this is ignored.
+  // For a Variable, this is the shape of the value. For a TensorArray or Stack
+  // this is the shape of each entry in the TensorArray/Stack.
+  const TensorShape& shape() const { return shape_; }
+
   const xla::ComputationDataHandle& value() const { return value_; }
 
   // Value of the resource at computation entry. Used to detect which
@@ -68,17 +77,19 @@ class XlaResource {
     return initial_value_;
   }
 
+  // A variable is initialized if it has a value.
   bool initialized() const { return value_.handle() > 0; }
 
-  // Sets the current type/value of the resource.
-  Status SetValue(DataType type, const xla::ComputationDataHandle& value);
+  // Sets the type and shape of the resource. The type and shape of a resource
+  // must not change once the variable has been initialized.
+  Status SetTypeAndShape(DataType type, const TensorShape& shape);
 
-  // Returns the shape of the resource as an xla::Shape.
-  Status GetXlaShape(xla::ComputationBuilder* builder, xla::Shape* shape) const;
+  // Sets the current value of the resource. Returns an error if the type is not
+  // set to a valid value.
+  Status SetValue(const xla::ComputationDataHandle& value);
 
-  // Returns the shape of the resource as an TensorShape. Fails if the shape is
-  // not representable as a TensorShape.
-  Status GetShape(xla::ComputationBuilder* builder, TensorShape* shape) const;
+  // Sets the current value of the resource to an all-zero value.
+  Status SetZeroValue(xla::ComputationBuilder* builder);
 
   // Looks up the gradient for `source`, or creates it if it does not already
   // exist. The call target must be an initialized TensorArray resource. A
@@ -96,10 +107,6 @@ class XlaResource {
   Status Pack(xla::ComputationDataHandle* pack,
               xla::ComputationBuilder* builder) const;
 
-  // Returns the shape of the `pack` value computed by `Pack()`.
-  Status PackedShape(xla::ComputationBuilder* builder,
-                     xla::Shape* packed_shape) const;
-
   // Updates the resource with values from `pack`. If `gradient_sources` is
   // non-empty, treats `pack` as a tuple that represents a TensorArray and
   // its gradients, and unpacks and updates the gradient resources.
@@ -108,14 +115,14 @@ class XlaResource {
   // Opposite of Pack().
   Status SetFromPack(const std::set<string>& gradient_sources,
                      const xla::ComputationDataHandle& pack,
-                     bool reset_initial_values,
                      xla::ComputationBuilder* builder);
 
-  // TensorArray-specific fields
+  // TensorArray and Stack specific fields
 
   // 'tensor_array_size' stores the expected size of the TensorArray or Stack.
   // We need to store this since sometimes TensorArrays must be initialized
   // lazily since we do not know the element shape at construction time.
+  // Used by both TensorArrays and Stacks.
   int64 tensor_array_size() const { return tensor_array_size_; }
   void set_tensor_array_size(int64 size) { tensor_array_size_ = size; }
 
@@ -136,6 +143,7 @@ class XlaResource {
   const string name_;
 
   DataType type_;
+  TensorShape shape_;
   xla::ComputationDataHandle value_;
   xla::ComputationDataHandle initial_value_;
 
-- 
GitLab


From 31427937f9f4bfb777648819bb87db7f0d36ecd8 Mon Sep 17 00:00:00 2001
From: Kenji <kenji.toyama@gmail.com>
Date: Fri, 2 Feb 2018 20:16:49 +0000
Subject: [PATCH 590/966] Fix "Define the model" link. (#16661)

The link syntax was inverted, that is, round brackets were coming before square brackets, but Markdown doesn't like it.
---
 tensorflow/docs_src/get_started/custom_estimators.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
index 79c4ee75d0..42a246678a 100644
--- a/tensorflow/docs_src/get_started/custom_estimators.md
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -161,7 +161,7 @@ classifier = tf.estimator.Estimator(
 
 To implement a typical model function, you must do the following:
 
-* (Define the model)[#define_the_model].
+* [Define the model](#define_the_model).
 * Specify additional calculations for each of
   the [three different modes](#modes):
   * [Predict](#predict)
-- 
GitLab


From d4f3be511506b8577546b53cc9fb20b2b6a0c3c3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 12:19:43 -0800
Subject: [PATCH 591/966] Update ops-related pbtxt files.

PiperOrigin-RevId: 184317521
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 30 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 21 -------------
 2 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 177561161e..2580eaf987 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -42916,6 +42916,36 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterUpdate"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceSparseApplyAdadelta"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 2cd8d8a03b..8df126735b 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -20998,27 +20998,6 @@ op {
   attr {
     name: "dtype"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
   }
   attr {
     name: "Tindices"
-- 
GitLab


From 6360a0cc28b7e82c1582e700b46940aeafff96a6 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Fri, 2 Feb 2018 12:23:47 -0800
Subject: [PATCH 592/966] [TF:XLA] Bump open source llvm revision to r324073

PiperOrigin-RevId: 184318036
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 34ec291256..971e30bac8 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -472,11 +472,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/f135378ec6365e852f7d5a3cfcdce342f08cb5f3.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/f135378ec6365e852f7d5a3cfcdce342f08cb5f3.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/299f8c346e1ab483463da5f02536ffd00b7ad9c6.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/299f8c346e1ab483463da5f02536ffd00b7ad9c6.tar.gz",
       ],
-      sha256 = "296ab832167e6c46eb65ef1f9a2b5fc31c77fcd2248799b306aa2d5d2e4edbfe",
-      strip_prefix = "llvm-f135378ec6365e852f7d5a3cfcdce342f08cb5f3",
+      sha256 = "0556bc6a85000c573d92fe00946b6418cbcd3844912696a81055e4768299dda4",
+      strip_prefix = "llvm-299f8c346e1ab483463da5f02536ffd00b7ad9c6",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
   )
 
-- 
GitLab


From 3a131265528582b99efa202a957f73cac6d5e23b Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 2 Feb 2018 12:31:09 -0800
Subject: [PATCH 593/966] Update Eigen library to 2355b229ea4c and fixes conv2d
 padding issue (#16705)

This fix updates Eigen to 2355b229ea4c, so that the issue raised in
14601 could be fixed.

This fix fixes 14601.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index b6bba78401..93d44c6319 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -120,11 +120,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "eigen_archive",
       urls = [
-          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/14e1418fcf12.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/14e1418fcf12.tar.gz",
+          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
+          "https://bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
       ],
-      sha256 = "2b526c6888639025323fd4f2600533c0f982d304ea48e4f1663e8066bd9f6368",
-      strip_prefix = "eigen-eigen-14e1418fcf12",
+      sha256 = "0cadb31a35b514bf2dfd6b5d38205da94ef326ec6908fc3fd7c269948467214f",
+      strip_prefix = "eigen-eigen-2355b229ea4c",
       build_file = str(Label("//third_party:eigen.BUILD")),
   )
 
-- 
GitLab


From 2863d44374c226444785f81ffe385e5dd5b9c295 Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Fri, 2 Feb 2018 12:46:34 -0800
Subject: [PATCH 594/966] Skip the node that has unexpected number of outputs.

PiperOrigin-RevId: 184320865
---
 tensorflow/core/graph/costmodel.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc
index f47c983086..4f3a6ec38c 100644
--- a/tensorflow/core/graph/costmodel.cc
+++ b/tensorflow/core/graph/costmodel.cc
@@ -252,9 +252,12 @@ void CostModel::RecordMaxMemorySize(const Node* node, int output_slot,
                                     const DataType& dtype) {
   const int id = Id(node);
   if (id < 0) return;
-  CHECK_LT(output_slot, node->num_outputs())
-      << "Unexpected output slot for node " << node->DebugString() << ". Got "
-      << output_slot << " but its num_outputs is " << node->num_outputs();
+  if (output_slot >= node->num_outputs()) {
+    LOG(ERROR) << "Unexpected output slot for node " << node->DebugString()
+               << ". Got " << output_slot << " but its num_outputs is "
+               << node->num_outputs();
+    return;
+  }
   Ensure(id, node->num_outputs());
   auto& current_max = max_mem_usage_[id].output_port_mem[output_slot];
   // If the memory allocator doesn't track memory usage, let's infer a lower
-- 
GitLab


From af9afcfe44ed97dc13422445b1f1c91eaa98d583 Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Fri, 2 Feb 2018 13:07:37 -0800
Subject: [PATCH 595/966] Breaking change: Revise HMC interface to accept a
 list of Tensors representing a partitioning of chain state.

PiperOrigin-RevId: 184323369
---
 .../bayesflow/python/kernel_tests/hmc_test.py |  680 +++++---
 .../contrib/bayesflow/python/ops/hmc.py       |   11 +-
 .../contrib/bayesflow/python/ops/hmc_impl.py  | 1546 +++++++++++------
 3 files changed, 1455 insertions(+), 782 deletions(-)

diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
index cbc66b6dc1..d9d0dfce44 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
@@ -19,29 +19,36 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from scipy import special
 from scipy import stats
 
 from tensorflow.contrib.bayesflow.python.ops import hmc
+from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _compute_energy_change
+from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _leapfrog_integrator
 
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_impl as gradients_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import gamma as gamma_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.platform import tf_logging as logging_ops
+
+
+def _reduce_variance(x, axis=None, keepdims=False):
+  sample_mean = math_ops.reduce_mean(x, axis, keepdims=True)
+  return math_ops.reduce_mean(
+      math_ops.squared_difference(x, sample_mean), axis, keepdims)
 
 
-# TODO(b/66964210): Test float16.
 class HMCTest(test.TestCase):
 
   def setUp(self):
     self._shape_param = 5.
     self._rate_param = 10.
-    self._expected_x = (special.digamma(self._shape_param)
-                        - np.log(self._rate_param))
-    self._expected_exp_x = self._shape_param / self._rate_param
 
     random_seed.set_random_seed(10003)
     np.random.seed(10003)
@@ -63,63 +70,46 @@ class HMCTest(test.TestCase):
                                self._rate_param * math_ops.exp(x),
                                event_dims)
 
-  def _log_gamma_log_prob_grad(self, x, event_dims=()):
-    """Computes log-pdf and gradient of a log-gamma random variable.
-
-    Args:
-      x: Value of the random variable.
-      event_dims: Dimensions not to treat as independent. Default is (),
-        i.e., all dimensions are independent.
-
-    Returns:
-      log_prob: The log-pdf up to a normalizing constant.
-      grad: The gradient of the log-pdf with respect to x.
-    """
-    return (math_ops.reduce_sum(self._shape_param * x -
-                                self._rate_param * math_ops.exp(x),
-                                event_dims),
-            self._shape_param - self._rate_param * math_ops.exp(x))
-
-  def _n_event_dims(self, x_shape, event_dims):
-    return np.prod([int(x_shape[i]) for i in event_dims])
-
-  def _integrator_conserves_energy(self, x, event_dims, sess,
+  def _integrator_conserves_energy(self, x, independent_chain_ndims, sess,
                                    feed_dict=None):
-    def potential_and_grad(x):
-      log_prob, grad = self._log_gamma_log_prob_grad(x, event_dims)
-      return -log_prob, -grad
-
-    step_size = array_ops.placeholder(np.float32, [], name='step_size')
-    hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps')
+    step_size = array_ops.placeholder(np.float32, [], name="step_size")
+    hmc_lf_steps = array_ops.placeholder(np.int32, [], name="hmc_lf_steps")
 
     if feed_dict is None:
       feed_dict = {}
     feed_dict[hmc_lf_steps] = 1000
 
-    m = random_ops.random_normal(array_ops.shape(x))
-    potential_0, grad_0 = potential_and_grad(x)
-    old_energy = potential_0 + 0.5 * math_ops.reduce_sum(m * m,
-                                                         event_dims)
-
-    _, new_m, potential_1, _ = (
-        hmc.leapfrog_integrator(step_size, hmc_lf_steps, x,
-                                m, potential_and_grad, grad_0))
+    event_dims = math_ops.range(independent_chain_ndims,
+                                array_ops.rank(x))
 
-    new_energy = potential_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
+    m = random_ops.random_normal(array_ops.shape(x))
+    log_prob_0 = self._log_gamma_log_prob(x, event_dims)
+    grad_0 = gradients_ops.gradients(log_prob_0, x)
+    old_energy = -log_prob_0 + 0.5 * math_ops.reduce_sum(m**2., event_dims)
+
+    new_m, _, log_prob_1, _ = _leapfrog_integrator(
+        current_momentums=[m],
+        target_log_prob_fn=lambda x: self._log_gamma_log_prob(x, event_dims),
+        current_state_parts=[x],
+        step_sizes=[step_size],
+        num_leapfrog_steps=hmc_lf_steps,
+        current_target_log_prob=log_prob_0,
+        current_grads_target_log_prob=grad_0)
+    new_m = new_m[0]
+
+    new_energy = -log_prob_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
                                                          event_dims)
 
     x_shape = sess.run(x, feed_dict).shape
-    n_event_dims = self._n_event_dims(x_shape, event_dims)
-    feed_dict[step_size] = 0.1 / n_event_dims
-    old_energy_val, new_energy_val = sess.run([old_energy, new_energy],
-                                              feed_dict)
-    logging.vlog(1, 'average energy change: {}'.format(
-        abs(old_energy_val - new_energy_val).mean()))
-
-    self.assertAllEqual(np.ones_like(new_energy_val, dtype=np.bool),
-                        abs(old_energy_val - new_energy_val) < 1.)
-
-  def _integrator_conserves_energy_wrapper(self, event_dims):
+    event_size = np.prod(x_shape[independent_chain_ndims:])
+    feed_dict[step_size] = 0.1 / event_size
+    old_energy_, new_energy_ = sess.run([old_energy, new_energy],
+                                        feed_dict)
+    logging_ops.vlog(1, "average energy relative change: {}".format(
+        (1. - new_energy_ / old_energy_).mean()))
+    self.assertAllClose(old_energy_, new_energy_, atol=0., rtol=0.02)
+
+  def _integrator_conserves_energy_wrapper(self, independent_chain_ndims):
     """Tests the long-term energy conservation of the leapfrog integrator.
 
     The leapfrog integrator is symplectic, so for sufficiently small step
@@ -127,135 +117,167 @@ class HMCTest(test.TestCase):
     the energy of the system blowing up or collapsing.
 
     Args:
-      event_dims: A tuple of dimensions that should not be treated as
-        independent. This allows for multiple chains to be run independently
-        in parallel. Default is (), i.e., all dimensions are independent.
+      independent_chain_ndims: Python `int` scalar representing the number of
+        dims associated with independent chains.
     """
     with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
-
-      feed_dict = {x_ph: np.zeros([50, 10, 2])}
-      self._integrator_conserves_energy(x_ph, event_dims, sess, feed_dict)
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
+      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
+      self._integrator_conserves_energy(x_ph, independent_chain_ndims,
+                                        sess, feed_dict)
 
   def testIntegratorEnergyConservationNullShape(self):
-    self._integrator_conserves_energy_wrapper([])
+    self._integrator_conserves_energy_wrapper(0)
 
   def testIntegratorEnergyConservation1(self):
-    self._integrator_conserves_energy_wrapper([1])
+    self._integrator_conserves_energy_wrapper(1)
 
   def testIntegratorEnergyConservation2(self):
-    self._integrator_conserves_energy_wrapper([2])
+    self._integrator_conserves_energy_wrapper(2)
 
-  def testIntegratorEnergyConservation12(self):
-    self._integrator_conserves_energy_wrapper([1, 2])
+  def testIntegratorEnergyConservation3(self):
+    self._integrator_conserves_energy_wrapper(3)
 
-  def testIntegratorEnergyConservation012(self):
-    self._integrator_conserves_energy_wrapper([0, 1, 2])
-
-  def _chain_gets_correct_expectations(self, x, event_dims, sess,
-                                       feed_dict=None):
+  def _chain_gets_correct_expectations(self, x, independent_chain_ndims,
+                                       sess, feed_dict=None):
     def log_gamma_log_prob(x):
+      event_dims = math_ops.range(independent_chain_ndims,
+                                  array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
-    step_size = array_ops.placeholder(np.float32, [], name='step_size')
-    hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps')
-    hmc_n_steps = array_ops.placeholder(np.int32, [], name='hmc_n_steps')
+    num_results = array_ops.placeholder(
+        np.int32, [], name="num_results")
+    step_size = array_ops.placeholder(
+        np.float32, [], name="step_size")
+    num_leapfrog_steps = array_ops.placeholder(
+        np.int32, [], name="num_leapfrog_steps")
 
     if feed_dict is None:
       feed_dict = {}
-    feed_dict.update({step_size: 0.1,
-                      hmc_lf_steps: 2,
-                      hmc_n_steps: 300})
-
-    sample_chain, acceptance_prob_chain = hmc.chain([hmc_n_steps],
-                                                    step_size,
-                                                    hmc_lf_steps,
-                                                    x, log_gamma_log_prob,
-                                                    event_dims)
-
-    acceptance_probs, samples = sess.run([acceptance_prob_chain, sample_chain],
-                                         feed_dict)
-    samples = samples[feed_dict[hmc_n_steps] // 2:]
-    expected_x_est = samples.mean()
-    expected_exp_x_est = np.exp(samples).mean()
-
-    logging.vlog(1, 'True      E[x, exp(x)]: {}\t{}'.format(
-        self._expected_x, self._expected_exp_x))
-    logging.vlog(1, 'Estimated E[x, exp(x)]: {}\t{}'.format(
-        expected_x_est, expected_exp_x_est))
-    self.assertNear(expected_x_est, self._expected_x, 2e-2)
-    self.assertNear(expected_exp_x_est, self._expected_exp_x, 2e-2)
-    self.assertTrue((acceptance_probs > 0.5).all())
-    self.assertTrue((acceptance_probs <= 1.0).all())
-
-  def _chain_gets_correct_expectations_wrapper(self, event_dims):
+    feed_dict.update({num_results: 150,
+                      step_size: 0.1,
+                      num_leapfrog_steps: 2})
+
+    samples, kernel_results = hmc.sample_chain(
+        num_results=num_results,
+        target_log_prob_fn=log_gamma_log_prob,
+        current_state=x,
+        step_size=step_size,
+        num_leapfrog_steps=num_leapfrog_steps,
+        num_burnin_steps=150,
+        seed=42)
+
+    expected_x = (math_ops.digamma(self._shape_param)
+                  - np.log(self._rate_param))
+
+    expected_exp_x = self._shape_param / self._rate_param
+
+    acceptance_probs_, samples_, expected_x_ = sess.run(
+        [kernel_results.acceptance_probs, samples, expected_x],
+        feed_dict)
+
+    actual_x = samples_.mean()
+    actual_exp_x = np.exp(samples_).mean()
+
+    logging_ops.vlog(1, "True      E[x, exp(x)]: {}\t{}".format(
+        expected_x_, expected_exp_x))
+    logging_ops.vlog(1, "Estimated E[x, exp(x)]: {}\t{}".format(
+        actual_x, actual_exp_x))
+    self.assertNear(actual_x, expected_x_, 2e-2)
+    self.assertNear(actual_exp_x, expected_exp_x, 2e-2)
+    self.assertTrue((acceptance_probs_ > 0.5).all())
+    self.assertTrue((acceptance_probs_ <= 1.0).all())
+
+  def _chain_gets_correct_expectations_wrapper(self, independent_chain_ndims):
     with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
-
-      feed_dict = {x_ph: np.zeros([50, 10, 2])}
-      self._chain_gets_correct_expectations(x_ph, event_dims, sess,
-                                            feed_dict)
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
+      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
+      self._chain_gets_correct_expectations(x_ph, independent_chain_ndims,
+                                            sess, feed_dict)
 
   def testHMCChainExpectationsNullShape(self):
-    self._chain_gets_correct_expectations_wrapper([])
+    self._chain_gets_correct_expectations_wrapper(0)
 
   def testHMCChainExpectations1(self):
-    self._chain_gets_correct_expectations_wrapper([1])
+    self._chain_gets_correct_expectations_wrapper(1)
 
   def testHMCChainExpectations2(self):
-    self._chain_gets_correct_expectations_wrapper([2])
-
-  def testHMCChainExpectations12(self):
-    self._chain_gets_correct_expectations_wrapper([1, 2])
+    self._chain_gets_correct_expectations_wrapper(2)
 
-  def _kernel_leaves_target_invariant(self, initial_draws, event_dims,
+  def _kernel_leaves_target_invariant(self, initial_draws,
+                                      independent_chain_ndims,
                                       sess, feed_dict=None):
     def log_gamma_log_prob(x):
+      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
     def fake_log_prob(x):
       """Cooled version of the target distribution."""
       return 1.1 * log_gamma_log_prob(x)
 
-    step_size = array_ops.placeholder(np.float32, [], name='step_size')
+    step_size = array_ops.placeholder(np.float32, [], name="step_size")
 
     if feed_dict is None:
       feed_dict = {}
 
     feed_dict[step_size] = 0.4
 
-    sample, acceptance_probs, _, _ = hmc.kernel(step_size, 5, initial_draws,
-                                                log_gamma_log_prob, event_dims)
-    bad_sample, bad_acceptance_probs, _, _ = hmc.kernel(
-        step_size, 5, initial_draws, fake_log_prob, event_dims)
-    (acceptance_probs_val, bad_acceptance_probs_val, initial_draws_val,
-     updated_draws_val, fake_draws_val) = sess.run([acceptance_probs,
-                                                    bad_acceptance_probs,
-                                                    initial_draws, sample,
-                                                    bad_sample], feed_dict)
+    sample, kernel_results = hmc.kernel(
+        target_log_prob_fn=log_gamma_log_prob,
+        current_state=initial_draws,
+        step_size=step_size,
+        num_leapfrog_steps=5,
+        seed=43)
+
+    bad_sample, bad_kernel_results = hmc.kernel(
+        target_log_prob_fn=fake_log_prob,
+        current_state=initial_draws,
+        step_size=step_size,
+        num_leapfrog_steps=5,
+        seed=44)
+
+    [
+        acceptance_probs_,
+        bad_acceptance_probs_,
+        initial_draws_,
+        updated_draws_,
+        fake_draws_,
+    ] = sess.run([
+        kernel_results.acceptance_probs,
+        bad_kernel_results.acceptance_probs,
+        initial_draws,
+        sample,
+        bad_sample,
+    ], feed_dict)
+
     # Confirm step size is small enough that we usually accept.
-    self.assertGreater(acceptance_probs_val.mean(), 0.5)
-    self.assertGreater(bad_acceptance_probs_val.mean(), 0.5)
+    self.assertGreater(acceptance_probs_.mean(), 0.5)
+    self.assertGreater(bad_acceptance_probs_.mean(), 0.5)
+
     # Confirm step size is large enough that we sometimes reject.
-    self.assertLess(acceptance_probs_val.mean(), 0.99)
-    self.assertLess(bad_acceptance_probs_val.mean(), 0.99)
-    _, ks_p_value_true = stats.ks_2samp(initial_draws_val.flatten(),
-                                        updated_draws_val.flatten())
-    _, ks_p_value_fake = stats.ks_2samp(initial_draws_val.flatten(),
-                                        fake_draws_val.flatten())
-    logging.vlog(1, 'acceptance rate for true target: {}'.format(
-        acceptance_probs_val.mean()))
-    logging.vlog(1, 'acceptance rate for fake target: {}'.format(
-        bad_acceptance_probs_val.mean()))
-    logging.vlog(1, 'K-S p-value for true target: {}'.format(ks_p_value_true))
-    logging.vlog(1, 'K-S p-value for fake target: {}'.format(ks_p_value_fake))
+    self.assertLess(acceptance_probs_.mean(), 0.99)
+    self.assertLess(bad_acceptance_probs_.mean(), 0.99)
+
+    _, ks_p_value_true = stats.ks_2samp(initial_draws_.flatten(),
+                                        updated_draws_.flatten())
+    _, ks_p_value_fake = stats.ks_2samp(initial_draws_.flatten(),
+                                        fake_draws_.flatten())
+
+    logging_ops.vlog(1, "acceptance rate for true target: {}".format(
+        acceptance_probs_.mean()))
+    logging_ops.vlog(1, "acceptance rate for fake target: {}".format(
+        bad_acceptance_probs_.mean()))
+    logging_ops.vlog(1, "K-S p-value for true target: {}".format(
+        ks_p_value_true))
+    logging_ops.vlog(1, "K-S p-value for fake target: {}".format(
+        ks_p_value_fake))
     # Make sure that the MCMC update hasn't changed the empirical CDF much.
     self.assertGreater(ks_p_value_true, 1e-3)
     # Confirm that targeting the wrong distribution does
     # significantly change the empirical CDF.
     self.assertLess(ks_p_value_fake, 1e-6)
 
-  def _kernel_leaves_target_invariant_wrapper(self, event_dims):
+  def _kernel_leaves_target_invariant_wrapper(self, independent_chain_ndims):
     """Tests that the kernel leaves the target distribution invariant.
 
     Draws some independent samples from the target distribution,
@@ -267,86 +289,116 @@ class HMCTest(test.TestCase):
     does change the target distribution. (And that we can detect that.)
 
     Args:
-      event_dims: A tuple of dimensions that should not be treated as
-        independent. This allows for multiple chains to be run independently
-        in parallel. Default is (), i.e., all dimensions are independent.
+      independent_chain_ndims: Python `int` scalar representing the number of
+        dims associated with independent chains.
     """
     with self.test_session() as sess:
       initial_draws = np.log(np.random.gamma(self._shape_param,
                                              size=[50000, 2, 2]))
       initial_draws -= np.log(self._rate_param)
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
 
       feed_dict = {x_ph: initial_draws}
 
-      self._kernel_leaves_target_invariant(x_ph, event_dims, sess,
-                                           feed_dict)
-
-  def testKernelLeavesTargetInvariantNullShape(self):
-    self._kernel_leaves_target_invariant_wrapper([])
+      self._kernel_leaves_target_invariant(x_ph, independent_chain_ndims,
+                                           sess, feed_dict)
 
   def testKernelLeavesTargetInvariant1(self):
-    self._kernel_leaves_target_invariant_wrapper([1])
+    self._kernel_leaves_target_invariant_wrapper(1)
 
   def testKernelLeavesTargetInvariant2(self):
-    self._kernel_leaves_target_invariant_wrapper([2])
+    self._kernel_leaves_target_invariant_wrapper(2)
 
-  def testKernelLeavesTargetInvariant12(self):
-    self._kernel_leaves_target_invariant_wrapper([1, 2])
+  def testKernelLeavesTargetInvariant3(self):
+    self._kernel_leaves_target_invariant_wrapper(3)
 
-  def _ais_gets_correct_log_normalizer(self, init, event_dims, sess,
-                                       feed_dict=None):
+  def _ais_gets_correct_log_normalizer(self, init, independent_chain_ndims,
+                                       sess, feed_dict=None):
     def proposal_log_prob(x):
-      return math_ops.reduce_sum(-0.5 * x * x - 0.5 * np.log(2*np.pi),
-                                 event_dims)
+      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
+      return -0.5 * math_ops.reduce_sum(x**2. + np.log(2 * np.pi),
+                                        axis=event_dims)
 
     def target_log_prob(x):
+      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
     if feed_dict is None:
       feed_dict = {}
 
-    w, _, _ = hmc.ais_chain(200, 0.5, 2, init, target_log_prob,
-                            proposal_log_prob, event_dims)
-
-    w_val = sess.run(w, feed_dict)
-    init_shape = sess.run(init, feed_dict).shape
-    normalizer_multiplier = np.prod([init_shape[i] for i in event_dims])
-
-    true_normalizer = -self._shape_param * np.log(self._rate_param)
-    true_normalizer += special.gammaln(self._shape_param)
-    true_normalizer *= normalizer_multiplier
-
-    n_weights = np.prod(w_val.shape)
-    normalized_w = np.exp(w_val - true_normalizer)
-    standard_error = np.std(normalized_w) / np.sqrt(n_weights)
-    logging.vlog(1, 'True normalizer {}, estimated {}, n_weights {}'.format(
-        true_normalizer, np.log(normalized_w.mean()) + true_normalizer,
-        n_weights))
-    self.assertNear(normalized_w.mean(), 1.0, 4.0 * standard_error)
-
-  def _ais_gets_correct_log_normalizer_wrapper(self, event_dims):
+    num_steps = 200
+
+    _, ais_weights, _ = hmc.sample_annealed_importance_chain(
+        proposal_log_prob_fn=proposal_log_prob,
+        num_steps=num_steps,
+        target_log_prob_fn=target_log_prob,
+        step_size=0.5,
+        current_state=init,
+        num_leapfrog_steps=2,
+        seed=45)
+
+    event_shape = array_ops.shape(init)[independent_chain_ndims:]
+    event_size = math_ops.reduce_prod(event_shape)
+
+    log_true_normalizer = (
+        -self._shape_param * math_ops.log(self._rate_param)
+        + math_ops.lgamma(self._shape_param))
+    log_true_normalizer *= math_ops.cast(event_size, log_true_normalizer.dtype)
+
+    log_estimated_normalizer = (math_ops.reduce_logsumexp(ais_weights)
+                                - np.log(num_steps))
+
+    ratio_estimate_true = math_ops.exp(ais_weights - log_true_normalizer)
+    ais_weights_size = array_ops.size(ais_weights)
+    standard_error = math_ops.sqrt(
+        _reduce_variance(ratio_estimate_true)
+        / math_ops.cast(ais_weights_size, ratio_estimate_true.dtype))
+
+    [
+        ratio_estimate_true_,
+        log_true_normalizer_,
+        log_estimated_normalizer_,
+        standard_error_,
+        ais_weights_size_,
+        event_size_,
+    ] = sess.run([
+        ratio_estimate_true,
+        log_true_normalizer,
+        log_estimated_normalizer,
+        standard_error,
+        ais_weights_size,
+        event_size,
+    ], feed_dict)
+
+    logging_ops.vlog(1, "        log_true_normalizer: {}\n"
+                        "   log_estimated_normalizer: {}\n"
+                        "           ais_weights_size: {}\n"
+                        "                 event_size: {}\n".format(
+                            log_true_normalizer_,
+                            log_estimated_normalizer_,
+                            ais_weights_size_,
+                            event_size_))
+    self.assertNear(ratio_estimate_true_.mean(), 1., 4. * standard_error_)
+
+  def _ais_gets_correct_log_normalizer_wrapper(self, independent_chain_ndims):
     """Tests that AIS yields reasonable estimates of normalizers."""
     with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
-
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
       initial_draws = np.random.normal(size=[30, 2, 1])
-      feed_dict = {x_ph: initial_draws}
-
-      self._ais_gets_correct_log_normalizer(x_ph, event_dims, sess,
-                                            feed_dict)
-
-  def testAISNullShape(self):
-    self._ais_gets_correct_log_normalizer_wrapper([])
+      self._ais_gets_correct_log_normalizer(
+          x_ph,
+          independent_chain_ndims,
+          sess,
+          feed_dict={x_ph: initial_draws})
 
   def testAIS1(self):
-    self._ais_gets_correct_log_normalizer_wrapper([1])
+    self._ais_gets_correct_log_normalizer_wrapper(1)
 
   def testAIS2(self):
-    self._ais_gets_correct_log_normalizer_wrapper([2])
+    self._ais_gets_correct_log_normalizer_wrapper(2)
 
-  def testAIS12(self):
-    self._ais_gets_correct_log_normalizer_wrapper([1, 2])
+  def testAIS3(self):
+    self._ais_gets_correct_log_normalizer_wrapper(3)
 
   def testNanRejection(self):
     """Tests that an update that yields NaN potentials gets rejected.
@@ -359,24 +411,29 @@ class HMCTest(test.TestCase):
     """
     def _unbounded_exponential_log_prob(x):
       """An exponential distribution with log-likelihood NaN for x < 0."""
-      per_element_potentials = array_ops.where(x < 0,
-                                               np.nan * array_ops.ones_like(x),
-                                               -x)
+      per_element_potentials = array_ops.where(
+          x < 0.,
+          array_ops.fill(array_ops.shape(x), x.dtype.as_numpy_dtype(np.nan)),
+          -x)
       return math_ops.reduce_sum(per_element_potentials)
 
     with self.test_session() as sess:
       initial_x = math_ops.linspace(0.01, 5, 10)
-      updated_x, acceptance_probs, _, _ = hmc.kernel(
-          2., 5, initial_x, _unbounded_exponential_log_prob, [0])
-      initial_x_val, updated_x_val, acceptance_probs_val = sess.run(
-          [initial_x, updated_x, acceptance_probs])
-
-      logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
-      logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
-      logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))
-
-      self.assertAllEqual(initial_x_val, updated_x_val)
-      self.assertEqual(acceptance_probs_val, 0.)
+      updated_x, kernel_results = hmc.kernel(
+          target_log_prob_fn=_unbounded_exponential_log_prob,
+          current_state=initial_x,
+          step_size=2.,
+          num_leapfrog_steps=5,
+          seed=46)
+      initial_x_, updated_x_, acceptance_probs_ = sess.run(
+          [initial_x, updated_x, kernel_results.acceptance_probs])
+
+      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
+      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
+      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))
+
+      self.assertAllEqual(initial_x_, updated_x_)
+      self.assertEqual(acceptance_probs_, 0.)
 
   def testNanFromGradsDontPropagate(self):
     """Test that update with NaN gradients does not cause NaN in results."""
@@ -385,60 +442,195 @@ class HMCTest(test.TestCase):
 
     with self.test_session() as sess:
       initial_x = math_ops.linspace(0.01, 5, 10)
-      updated_x, acceptance_probs, new_log_prob, new_grad = hmc.kernel(
-          2., 5, initial_x, _nan_log_prob_with_nan_gradient, [0])
-      initial_x_val, updated_x_val, acceptance_probs_val = sess.run(
-          [initial_x, updated_x, acceptance_probs])
-
-      logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
-      logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
-      logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))
-
-      self.assertAllEqual(initial_x_val, updated_x_val)
-      self.assertEqual(acceptance_probs_val, 0.)
+      updated_x, kernel_results = hmc.kernel(
+          target_log_prob_fn=_nan_log_prob_with_nan_gradient,
+          current_state=initial_x,
+          step_size=2.,
+          num_leapfrog_steps=5,
+          seed=47)
+      initial_x_, updated_x_, acceptance_probs_ = sess.run(
+          [initial_x, updated_x, kernel_results.acceptance_probs])
+
+      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
+      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
+      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))
+
+      self.assertAllEqual(initial_x_, updated_x_)
+      self.assertEqual(acceptance_probs_, 0.)
 
       self.assertAllFinite(
-          gradients_impl.gradients(updated_x, initial_x)[0].eval())
-      self.assertTrue(
-          gradients_impl.gradients(new_grad, initial_x)[0] is None)
+          gradients_ops.gradients(updated_x, initial_x)[0].eval())
+      self.assertAllEqual([True], [g is None for g in gradients_ops.gradients(
+          kernel_results.proposed_grads_target_log_prob, initial_x)])
+      self.assertAllEqual([False], [g is None for g in gradients_ops.gradients(
+          kernel_results.proposed_grads_target_log_prob,
+          kernel_results.proposed_state)])
 
       # Gradients of the acceptance probs and new log prob are not finite.
-      _ = new_log_prob  # Prevent unused arg error.
       # self.assertAllFinite(
-      #     gradients_impl.gradients(acceptance_probs, initial_x)[0].eval())
+      #     gradients_ops.gradients(acceptance_probs, initial_x)[0].eval())
       # self.assertAllFinite(
-      #     gradients_impl.gradients(new_log_prob, initial_x)[0].eval())
+      #     gradients_ops.gradients(new_log_prob, initial_x)[0].eval())
+
+  def _testChainWorksDtype(self, dtype):
+    states, kernel_results = hmc.sample_chain(
+        num_results=10,
+        target_log_prob_fn=lambda x: -math_ops.reduce_sum(x**2., axis=-1),
+        current_state=np.zeros(5).astype(dtype),
+        step_size=0.01,
+        num_leapfrog_steps=10,
+        seed=48)
+    with self.test_session() as sess:
+      states_, acceptance_probs_ = sess.run(
+          [states, kernel_results.acceptance_probs])
+    self.assertEqual(dtype, states_.dtype)
+    self.assertEqual(dtype, acceptance_probs_.dtype)
 
   def testChainWorksIn64Bit(self):
-    def log_prob(x):
-      return - math_ops.reduce_sum(x * x, axis=-1)
-    states, acceptance_probs = hmc.chain(
-        n_iterations=10,
-        step_size=np.float64(0.01),
-        n_leapfrog_steps=10,
-        initial_x=np.zeros(5).astype(np.float64),
-        target_log_prob_fn=log_prob,
-        event_dims=[-1])
-    with self.test_session() as sess:
-      states_, acceptance_probs_ = sess.run([states, acceptance_probs])
-    self.assertEqual(np.float64, states_.dtype)
-    self.assertEqual(np.float64, acceptance_probs_.dtype)
+    self._testChainWorksDtype(np.float64)
 
   def testChainWorksIn16Bit(self):
-    def log_prob(x):
-      return - math_ops.reduce_sum(x * x, axis=-1)
-    states, acceptance_probs = hmc.chain(
-        n_iterations=10,
-        step_size=np.float16(0.01),
-        n_leapfrog_steps=10,
-        initial_x=np.zeros(5).astype(np.float16),
-        target_log_prob_fn=log_prob,
-        event_dims=[-1])
+    self._testChainWorksDtype(np.float16)
+
+
+class _EnergyComputationTest(object):
+
+  def testHandlesNanFromPotential(self):
+    with self.test_session() as sess:
+      x = [1, np.inf, -np.inf, np.nan]
+      target_log_prob, proposed_target_log_prob = [
+          self.dtype(x.flatten()) for x in np.meshgrid(x, x)]
+      num_chains = len(target_log_prob)
+      dummy_momentums = [-1, 1]
+      momentums = [self.dtype([dummy_momentums] * num_chains)]
+      proposed_momentums = [self.dtype([dummy_momentums] * num_chains)]
+
+      target_log_prob = ops.convert_to_tensor(target_log_prob)
+      momentums = [ops.convert_to_tensor(momentums[0])]
+      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
+      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
+
+      energy = _compute_energy_change(
+          target_log_prob,
+          momentums,
+          proposed_target_log_prob,
+          proposed_momentums,
+          independent_chain_ndims=1)
+      grads = gradients_ops.gradients(energy, momentums)
+
+      [actual_energy, grads_] = sess.run([energy, grads])
+
+      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
+      # finite otherwise.
+      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
+      self.assertAllEqual(expected_energy, actual_energy)
+
+      # Ensure gradient is finite.
+      self.assertAllEqual(np.ones_like(grads_).astype(np.bool),
+                          np.isfinite(grads_))
+
+  def testHandlesNanFromKinetic(self):
     with self.test_session() as sess:
-      states_, acceptance_probs_ = sess.run([states, acceptance_probs])
-    self.assertEqual(np.float16, states_.dtype)
-    self.assertEqual(np.float16, acceptance_probs_.dtype)
+      x = [1, np.inf, -np.inf, np.nan]
+      momentums, proposed_momentums = [
+          [np.reshape(self.dtype(x), [-1, 1])]
+          for x in np.meshgrid(x, x)]
+      num_chains = len(momentums[0])
+      target_log_prob = np.ones(num_chains, self.dtype)
+      proposed_target_log_prob = np.ones(num_chains, self.dtype)
 
+      target_log_prob = ops.convert_to_tensor(target_log_prob)
+      momentums = [ops.convert_to_tensor(momentums[0])]
+      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
+      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
 
-if __name__ == '__main__':
+      energy = _compute_energy_change(
+          target_log_prob,
+          momentums,
+          proposed_target_log_prob,
+          proposed_momentums,
+          independent_chain_ndims=1)
+      grads = gradients_ops.gradients(energy, momentums)
+
+      [actual_energy, grads_] = sess.run([energy, grads])
+
+      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
+      # finite otherwise.
+      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
+      self.assertAllEqual(expected_energy, actual_energy)
+
+      # Ensure gradient is finite.
+      g = grads_[0].reshape([len(x), len(x)])[:, 0]
+      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isfinite(g))
+
+      # The remaining gradients are nan because the momentum was itself nan or
+      # inf.
+      g = grads_[0].reshape([len(x), len(x)])[:, 1:]
+      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isnan(g))
+
+
+class EnergyComputationTest16(test.TestCase, _EnergyComputationTest):
+  dtype = np.float16
+
+
+class EnergyComputationTest32(test.TestCase, _EnergyComputationTest):
+  dtype = np.float32
+
+
+class EnergyComputationTest64(test.TestCase, _EnergyComputationTest):
+  dtype = np.float64
+
+
+class _HMCHandlesLists(object):
+
+  def testStateParts(self):
+    with self.test_session() as sess:
+      dist_x = normal_lib.Normal(loc=self.dtype(0), scale=self.dtype(1))
+      dist_y = independent_lib.Independent(
+          gamma_lib.Gamma(concentration=self.dtype([1, 2]),
+                          rate=self.dtype([0.5, 0.75])),
+          reinterpreted_batch_ndims=1)
+      def target_log_prob(x, y):
+        return dist_x.log_prob(x) + dist_y.log_prob(y)
+      x0 = [dist_x.sample(seed=1), dist_y.sample(seed=2)]
+      samples, _ = hmc.sample_chain(
+          num_results=int(2e3),
+          target_log_prob_fn=target_log_prob,
+          current_state=x0,
+          step_size=0.85,
+          num_leapfrog_steps=3,
+          num_burnin_steps=int(250),
+          seed=49)
+      actual_means = [math_ops.reduce_mean(s, axis=0) for s in samples]
+      actual_vars = [_reduce_variance(s, axis=0) for s in samples]
+      expected_means = [dist_x.mean(), dist_y.mean()]
+      expected_vars = [dist_x.variance(), dist_y.variance()]
+      [
+          actual_means_,
+          actual_vars_,
+          expected_means_,
+          expected_vars_,
+      ] = sess.run([
+          actual_means,
+          actual_vars,
+          expected_means,
+          expected_vars,
+      ])
+      self.assertAllClose(expected_means_, actual_means_, atol=0.05, rtol=0.16)
+      self.assertAllClose(expected_vars_, actual_vars_, atol=0., rtol=0.30)
+
+
+class HMCHandlesLists16(_HMCHandlesLists, test.TestCase):
+  dtype = np.float16
+
+
+class HMCHandlesLists32(_HMCHandlesLists, test.TestCase):
+  dtype = np.float32
+
+
+class HMCHandlesLists64(_HMCHandlesLists, test.TestCase):
+  dtype = np.float64
+
+
+if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc.py b/tensorflow/contrib/bayesflow/python/ops/hmc.py
index 977d42fc16..7fd5652c5c 100644
--- a/tensorflow/contrib/bayesflow/python/ops/hmc.py
+++ b/tensorflow/contrib/bayesflow/python/ops/hmc.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.
-"""
+"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,11 +23,9 @@ from tensorflow.contrib.bayesflow.python.ops.hmc_impl import *  # pylint: disabl
 from tensorflow.python.util import all_util
 
 _allowed_symbols = [
-    'chain',
-    'kernel',
-    'leapfrog_integrator',
-    'leapfrog_step',
-    'ais_chain'
+    "sample_chain",
+    "sample_annealed_importance_chain",
+    "kernel",
 ]
 
 all_util.remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
index 5685a942e9..f7a11c21d8 100644
--- a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
@@ -14,17 +14,16 @@
 # ==============================================================================
 """Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.
 
-@@chain
-@@update
-@@leapfrog_integrator
-@@leapfrog_step
-@@ais_chain
+@@sample_chain
+@@sample_annealed_importance_chain
+@@kernel
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import numpy as np
 
 from tensorflow.python.framework import dtypes
@@ -32,168 +31,292 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_impl as gradients_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.ops.distributions import util as distributions_util
 
 __all__ = [
-    'chain',
-    'kernel',
-    'leapfrog_integrator',
-    'leapfrog_step',
-    'ais_chain'
+    "sample_chain",
+    "sample_annealed_importance_chain",
+    "kernel",
 ]
 
 
-def _make_potential_and_grad(target_log_prob_fn):
-  def potential_and_grad(x):
-    log_prob_result = -target_log_prob_fn(x)
-    grad_result = gradients_impl.gradients(math_ops.reduce_sum(log_prob_result),
-                                           x)[0]
-    return log_prob_result, grad_result
-  return potential_and_grad
-
-
-def chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
-          target_log_prob_fn, event_dims=(), name=None):
+KernelResults = collections.namedtuple(
+    "KernelResults",
+    [
+        "acceptance_probs",
+        "current_grads_target_log_prob",  # "Current result" means "accepted".
+        "current_target_log_prob",  # "Current result" means "accepted".
+        "energy_change",
+        "is_accepted",
+        "proposed_grads_target_log_prob",
+        "proposed_state",
+        "proposed_target_log_prob",
+        "random_positive",
+    ])
+
+
+def _make_dummy_kernel_results(
+    dummy_state,
+    dummy_target_log_prob,
+    dummy_grads_target_log_prob):
+  return KernelResults(
+      acceptance_probs=dummy_target_log_prob,
+      current_grads_target_log_prob=dummy_grads_target_log_prob,
+      current_target_log_prob=dummy_target_log_prob,
+      energy_change=dummy_target_log_prob,
+      is_accepted=array_ops.ones_like(dummy_target_log_prob, dtypes.bool),
+      proposed_grads_target_log_prob=dummy_grads_target_log_prob,
+      proposed_state=dummy_state,
+      proposed_target_log_prob=dummy_target_log_prob,
+      random_positive=dummy_target_log_prob,
+  )
+
+
+def sample_chain(
+    num_results,
+    target_log_prob_fn,
+    current_state,
+    step_size,
+    num_leapfrog_steps,
+    num_burnin_steps=0,
+    num_steps_between_results=0,
+    seed=None,
+    current_target_log_prob=None,
+    current_grads_target_log_prob=None,
+    name=None):
   """Runs multiple iterations of one or more Hamiltonian Monte Carlo chains.
 
-  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
-  algorithm that takes a series of gradient-informed steps to produce
-  a Metropolis proposal. This function samples from an HMC Markov
-  chain whose initial state is `initial_x` and whose stationary
-  distribution has log-density `target_log_prob_fn()`.
-
-  This function can update multiple chains in parallel. It assumes
-  that all dimensions of `initial_x` not specified in `event_dims` are
-  independent, and should therefore be updated independently. The
-  output of `target_log_prob_fn()` should sum log-probabilities across
-  all event dimensions. Slices along dimensions not in `event_dims`
-  may have different target distributions; this is up to
+  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) algorithm
+  that takes a series of gradient-informed steps to produce a Metropolis
+  proposal. This function samples from an HMC Markov chain at `current_state`
+  and whose stationary distribution has log-unnormalized-density
   `target_log_prob_fn()`.
 
-  This function basically just wraps `hmc.kernel()` in a tf.scan() loop.
+  This function samples from multiple chains in parallel. It assumes that the
+  the leftmost dimensions of (each) `current_state` (part) index an independent
+  chain.  The function `target_log_prob_fn()` sums log-probabilities across
+  event dimensions (i.e., current state (part) rightmost dimensions). Each
+  element of the output of `target_log_prob_fn()` represents the (possibly
+  unnormalized) log-probability of the joint distribution over (all) the current
+  state (parts).
 
-  Args:
-    n_iterations: Integer number of Markov chain updates to run.
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `initial_x`. Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely.
-      When possible, it's often helpful to match per-variable step
-      sizes to the standard deviations of the target distribution in
-      each variable.
-    n_leapfrog_steps: Integer number of steps to run the leapfrog
-      integrator for. Total progress per HMC step is roughly
-      proportional to step_size * n_leapfrog_steps.
-    initial_x: Tensor of initial state(s) of the Markov chain(s).
-    target_log_prob_fn: Python callable which takes an argument like `initial_x`
-      and returns its (possibly unnormalized) log-density under the target
-      distribution.
-    event_dims: List of dimensions that should not be treated as
-      independent. This allows for multiple chains to be run independently
-      in parallel. Default is (), i.e., all dimensions are independent.
-    name: Python `str` name prefixed to Ops created by this function.
+  The `current_state` can be represented as a single `Tensor` or a `list` of
+  `Tensors` which collectively represent the current state. When specifying a
+  `list`, one must also specify a list of `step_size`s.
 
-  Returns:
-    acceptance_probs: Tensor with the acceptance probabilities for each
-      iteration. Has shape matching `target_log_prob_fn(initial_x)`.
-    chain_states: Tensor with the state of the Markov chain at each iteration.
-      Has shape `[n_iterations, initial_x.shape[0],...,initial_x.shape[-1]`.
+  Only one out of every `num_steps_between_samples + 1` steps is included in the
+  returned results. This "thinning" comes at a cost of reduced statistical
+  power, while reducing memory requirements and autocorrelation. For more
+  discussion see [1].
+
+  [1]: "Statistically efficient thinning of a Markov chain sampler."
+       Art B. Owen. April 2017.
+       http://statweb.stanford.edu/~owen/reports/bestthinning.pdf
 
   #### Examples:
 
-  ```python
-  # Sampling from a standard normal (note `log_joint()` is unnormalized):
-  def log_joint(x):
-    return tf.reduce_sum(-0.5 * tf.square(x))
-  chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint,
-                                      event_dims=[0])
-  # Discard first half of chain as warmup/burn-in
-  warmed_up = chain[500:]
-  mean_est = tf.reduce_mean(warmed_up, 0)
-  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
-  ```
+  ##### Sample from a diagonal-variance Gaussian.
 
   ```python
-  # Sampling from a diagonal-variance Gaussian:
-  variances = tf.linspace(1., 3., 10)
-  def log_joint(x):
-    return tf.reduce_sum(-0.5 / variances * tf.square(x))
-  chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint,
-                                      event_dims=[0])
-  # Discard first half of chain as warmup/burn-in
-  warmed_up = chain[500:]
-  mean_est = tf.reduce_mean(warmed_up, 0)
-  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
+  tfd = tf.contrib.distributions
+
+  def make_likelihood(true_variances):
+    return tfd.MultivariateNormalDiag(
+        scale_diag=tf.sqrt(true_variances))
+
+  dims = 10
+  dtype = np.float32
+  true_variances = tf.linspace(dtype(1), dtype(3), dims)
+  likelihood = make_likelihood(true_variances)
+
+  states, kernel_results = hmc.sample_chain(
+      num_results=1000,
+      target_log_prob_fn=likelihood.log_prob,
+      current_state=tf.zeros(dims),
+      step_size=0.5,
+      num_leapfrog_steps=2,
+      num_burnin_steps=500)
+
+  # Compute sample stats.
+  sample_mean = tf.reduce_mean(states, axis=0)
+  sample_var = tf.reduce_mean(
+      tf.squared_difference(states, sample_mean),
+      axis=0)
   ```
 
-  ```python
-  # Sampling from factor-analysis posteriors with known factors W:
-  # mu[i, j] ~ Normal(0, 1)
-  # x[i] ~ Normal(matmul(mu[i], W), I)
-  def log_joint(mu, x, W):
-    prior = -0.5 * tf.reduce_sum(tf.square(mu), 1)
-    x_mean = tf.matmul(mu, W)
-    likelihood = -0.5 * tf.reduce_sum(tf.square(x - x_mean), 1)
-    return prior + likelihood
-  chain, acceptance_probs = hmc.chain(1000, 0.1, 2,
-                                      tf.zeros([x.shape[0], W.shape[0]]),
-                                      lambda mu: log_joint(mu, x, W),
-                                      event_dims=[1])
-  # Discard first half of chain as warmup/burn-in
-  warmed_up = chain[500:]
-  mean_est = tf.reduce_mean(warmed_up, 0)
-  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
+  ##### Sampling from factor-analysis posteriors with known factors.
+
+  I.e.,
+
+  ```none
+  for i=1..n:
+    w[i] ~ Normal(0, eye(d))            # prior
+    x[i] ~ Normal(loc=matmul(w[i], F))  # likelihood
   ```
 
+  where `F` denotes factors.
+
   ```python
-  # Sampling from the posterior of a Bayesian regression model.:
-
-  # Run 100 chains in parallel, each with a different initialization.
-  initial_beta = tf.random_normal([100, x.shape[1]])
-  chain, acceptance_probs = hmc.chain(1000, 0.1, 10, initial_beta,
-                                      log_joint_partial, event_dims=[1])
-  # Discard first halves of chains as warmup/burn-in
-  warmed_up = chain[500:]
-  # Averaging across samples within a chain and across chains
-  mean_est = tf.reduce_mean(warmed_up, [0, 1])
-  var_est = tf.reduce_mean(tf.square(warmed_up), [0, 1]) - tf.square(mean_est)
+  tfd = tf.contrib.distributions
+
+  def make_prior(dims, dtype):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.zeros(dims, dtype))
+
+  def make_likelihood(weights, factors):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.tensordot(weights, factors, axes=[[0], [-1]]))
+
+  # Setup data.
+  num_weights = 10
+  num_factors = 4
+  num_chains = 100
+  dtype = np.float32
+
+  prior = make_prior(num_weights, dtype)
+  weights = prior.sample(num_chains)
+  factors = np.random.randn(num_factors, num_weights).astype(dtype)
+  x = make_likelihood(weights, factors).sample(num_chains)
+
+  def target_log_prob(w):
+    # Target joint is: `f(w) = p(w, x | factors)`.
+    return prior.log_prob(w) + make_likelihood(w, factors).log_prob(x)
+
+  # Get `num_results` samples from `num_chains` independent chains.
+  chains_states, kernels_results = hmc.sample_chain(
+      num_results=1000,
+      target_log_prob_fn=target_log_prob,
+      current_state=tf.zeros([num_chains, dims], dtype),
+      step_size=0.1,
+      num_leapfrog_steps=2,
+      num_burnin_steps=500)
+
+  # Compute sample stats.
+  sample_mean = tf.reduce_mean(chains_states, axis=[0, 1])
+  sample_var = tf.reduce_mean(
+      tf.squared_difference(chains_states, sample_mean),
+      axis=[0, 1])
   ```
-  """
-  with ops.name_scope(name, 'hmc_chain', [n_iterations, step_size,
-                                          n_leapfrog_steps, initial_x]):
-    initial_x = ops.convert_to_tensor(initial_x, name='initial_x')
-    non_event_shape = array_ops.shape(target_log_prob_fn(initial_x))
-
-    def body(a, _):
-      updated_x, acceptance_probs, log_prob, grad = kernel(
-          step_size, n_leapfrog_steps, a[0], target_log_prob_fn, event_dims,
-          a[2], a[3])
-      return updated_x, acceptance_probs, log_prob, grad
-
-    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
-    potential, grad = potential_and_grad(initial_x)
-    return functional_ops.scan(
-        body, array_ops.zeros(n_iterations, dtype=initial_x.dtype),
-        (initial_x,
-         array_ops.zeros(non_event_shape, dtype=initial_x.dtype),
-         -potential, -grad))[:2]
 
+  Args:
+    num_results: Integer number of Markov chain draws.
+    target_log_prob_fn: Python callable which takes an argument like
+      `current_state` (or `*current_state` if it's a list) and returns its
+      (possibly unnormalized) log-density under the target distribution.
+    current_state: `Tensor` or Python `list` of `Tensor`s representing the
+      current state(s) of the Markov chain(s). The first `r` dimensions index
+      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
+    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
+      for the leapfrog integrator. Must broadcast with the shape of
+      `current_state`. Larger step sizes lead to faster progress, but too-large
+      step sizes make rejection exponentially more likely. When possible, it's
+      often helpful to match per-variable step sizes to the standard deviations
+      of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    num_burnin_steps: Integer number of chain steps to take before starting to
+      collect results.
+      Default value: 0 (i.e., no burn-in).
+    num_steps_between_results: Integer number of chain steps between collecting
+      a result. Only one out of every `num_steps_between_samples + 1` steps is
+      included in the returned results. This "thinning" comes at a cost of
+      reduced statistical power, while reducing memory requirements and
+      autocorrelation. For more discussion see [1].
+      Default value: 0 (i.e., no subsampling).
+    seed: Python integer to seed the random number generator.
+    current_target_log_prob: (Optional) `Tensor` representing the value of
+      `target_log_prob_fn` at the `current_state`. The only reason to specify
+      this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
+      representing gradient of `target_log_prob` at the `current_state` and wrt
+      the `current_state`. Must have same shape as `current_state`. The only
+      reason to specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_sample_chain").
+
+  Returns:
+    accepted_states: Tensor or Python list of `Tensor`s representing the
+      state(s) of the Markov chain(s) at each result step. Has same shape as
+      input `current_state` but with a prepended `num_results`-size dimension.
+    kernel_results: `collections.namedtuple` of internal calculations used to
+      advance the chain.
+  """
+  with ops.name_scope(
+      name, "hmc_sample_chain",
+      [num_results, current_state, step_size, num_leapfrog_steps,
+       num_burnin_steps, num_steps_between_results, seed,
+       current_target_log_prob, current_grads_target_log_prob]):
+    with ops.name_scope("initialize"):
+      [
+          current_state,
+          step_size,
+          current_target_log_prob,
+          current_grads_target_log_prob,
+      ] = _prepare_args(
+          target_log_prob_fn, current_state, step_size,
+          current_target_log_prob, current_grads_target_log_prob)
+    def _run_chain(num_steps, current_state, seed, kernel_results):
+      """Runs the chain(s) for `num_steps`."""
+      def _loop_body(iter_, current_state, kernel_results):
+        return [iter_ + 1] + list(kernel(
+            target_log_prob_fn,
+            current_state,
+            step_size,
+            num_leapfrog_steps,
+            seed,
+            kernel_results.current_target_log_prob,
+            kernel_results.current_grads_target_log_prob))
+      return control_flow_ops.while_loop(
+          cond=lambda iter_, *args: iter_ < num_steps,
+          body=_loop_body,
+          loop_vars=[0, current_state, kernel_results])[1:]  # Lop-off "iter_".
+
+    def _scan_body(args_list, _):
+      """Closure which implements `tf.scan` body."""
+      current_state, kernel_results = args_list
+      return _run_chain(num_steps_between_results + 1, current_state, seed,
+                        kernel_results)
+
+    current_state, kernel_results = _run_chain(
+        num_burnin_steps,
+        current_state,
+        distributions_util.gen_new_seed(
+            seed, salt="hmc_sample_chain_burnin"),
+        _make_dummy_kernel_results(
+            current_state,
+            current_target_log_prob,
+            current_grads_target_log_prob))
 
-def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
-              target_log_prob_fn, proposal_log_prob_fn, event_dims=(),
-              name=None):
+    return functional_ops.scan(
+        fn=_scan_body,
+        elems=array_ops.zeros(num_results, dtype=dtypes.bool),  # Dummy arg.
+        initializer=[current_state, kernel_results])
+
+
+def sample_annealed_importance_chain(
+    proposal_log_prob_fn,
+    num_steps,
+    target_log_prob_fn,
+    current_state,
+    step_size,
+    num_leapfrog_steps,
+    seed=None,
+    name=None):
   """Runs annealed importance sampling (AIS) to estimate normalizing constants.
 
-  This routine uses Hamiltonian Monte Carlo to sample from a series of
+  This function uses Hamiltonian Monte Carlo to sample from a series of
   distributions that slowly interpolates between an initial "proposal"
-  distribution
+  distribution:
 
   `exp(proposal_log_prob_fn(x) - proposal_log_normalizer)`
 
-  and the target distribution
+  and the target distribution:
 
   `exp(target_log_prob_fn(x) - target_log_normalizer)`,
 
@@ -202,113 +325,183 @@ def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
   normalizing constants of the initial distribution and the target
   distribution:
 
-  E[exp(w)] = exp(target_log_normalizer - proposal_log_normalizer).
-
-  Args:
-    n_iterations: Integer number of Markov chain updates to run. More
-      iterations means more expense, but smoother annealing between q
-      and p, which in turn means exponentially lower variance for the
-      normalizing constant estimator.
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `initial_x`. Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely.
-      When possible, it's often helpful to match per-variable step
-      sizes to the standard deviations of the target distribution in
-      each variable.
-    n_leapfrog_steps: Integer number of steps to run the leapfrog
-      integrator for. Total progress per HMC step is roughly
-      proportional to step_size * n_leapfrog_steps.
-    initial_x: Tensor of initial state(s) of the Markov chain(s). Must
-      be a sample from q, or results will be incorrect.
-    target_log_prob_fn: Python callable which takes an argument like `initial_x`
-      and returns its (possibly unnormalized) log-density under the target
-      distribution.
-    proposal_log_prob_fn: Python callable that returns the log density of the
-      initial distribution.
-    event_dims: List of dimensions that should not be treated as
-      independent. This allows for multiple chains to be run independently
-      in parallel. Default is (), i.e., all dimensions are independent.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    ais_weights: Tensor with the estimated weight(s). Has shape matching
-      `target_log_prob_fn(initial_x)`.
-    chain_states: Tensor with the state(s) of the Markov chain(s) the final
-      iteration. Has shape matching `initial_x`.
-    acceptance_probs: Tensor with the acceptance probabilities for the final
-      iteration. Has shape matching `target_log_prob_fn(initial_x)`.
+  `E[exp(ais_weights)] = exp(target_log_normalizer - proposal_log_normalizer)`.
 
   #### Examples:
 
+  ##### Estimate the normalizing constant of a log-gamma distribution.
+
   ```python
-  # Estimating the normalizing constant of a log-gamma distribution:
-  def proposal_log_prob(x):
-    # Standard normal log-probability. This is properly normalized.
-    return tf.reduce_sum(-0.5 * tf.square(x) - 0.5 * np.log(2 * np.pi), 1)
-  def target_log_prob(x):
-    # Unnormalized log-gamma(2, 3) distribution.
-    # True normalizer is (lgamma(2) - 2 * log(3)) * x.shape[1]
-    return tf.reduce_sum(2. * x - 3. * tf.exp(x), 1)
+  tfd = tf.contrib.distributions
+
   # Run 100 AIS chains in parallel
-  initial_x = tf.random_normal([100, 20])
-  w, _, _ = hmc.ais_chain(1000, 0.2, 2, initial_x, target_log_prob,
-                          proposal_log_prob, event_dims=[1])
-  log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100)
+  num_chains = 100
+  dims = 20
+  dtype = np.float32
+
+  proposal = tfd.MultivatiateNormalDiag(
+     loc=tf.zeros([dims], dtype=dtype))
+
+  target = tfd.TransformedDistribution(
+    distribution=tfd.Gamma(concentration=dtype(2),
+                           rate=dtype(3)),
+    bijector=tfd.bijectors.Invert(tfd.bijectors.Exp()),
+    event_shape=[dims])
+
+  chains_state, ais_weights, kernels_results = (
+      hmc.sample_annealed_importance_chain(
+          proposal_log_prob_fn=proposal.log_prob,
+          num_steps=1000,
+          target_log_prob_fn=target.log_prob,
+          step_size=0.2,
+          current_state=proposal.sample(num_chains),
+          num_leapfrog_steps=2))
+
+  log_estimated_normalizer = (tf.reduce_logsumexp(ais_weights)
+                              - np.log(num_chains))
+  log_true_normalizer = tf.lgamma(2.) - 2. * tf.log(3.)
   ```
 
+  ##### Estimate marginal likelihood of a Bayesian regression model.
+
   ```python
-  # Estimating the marginal likelihood of a Bayesian regression model:
-  base_measure = -0.5 * np.log(2 * np.pi)
-  def proposal_log_prob(x):
-    # Standard normal log-probability. This is properly normalized.
-    return tf.reduce_sum(-0.5 * tf.square(x) + base_measure, 1)
-  def regression_log_joint(beta, x, y):
-    # This function returns a vector whose ith element is log p(beta[i], y | x).
-    # Each row of beta corresponds to the state of an independent Markov chain.
-    log_prior = tf.reduce_sum(-0.5 * tf.square(beta) + base_measure, 1)
-    means = tf.matmul(beta, x, transpose_b=True)
-    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means) +
-                                   base_measure, 1)
-    return log_prior + log_likelihood
-  def log_joint_partial(beta):
-    return regression_log_joint(beta, x, y)
+  tfd = tf.contrib.distributions
+
+  def make_prior(dims, dtype):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.zeros(dims, dtype))
+
+  def make_likelihood(weights, x):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.tensordot(weights, x, axes=[[0], [-1]]))
+
   # Run 100 AIS chains in parallel
-  initial_beta = tf.random_normal([100, x.shape[1]])
-  w, beta_samples, _ = hmc.ais_chain(1000, 0.1, 2, initial_beta,
-                                     log_joint_partial, proposal_log_prob,
-                                     event_dims=[1])
-  log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100)
+  num_chains = 100
+  dims = 10
+  dtype = np.float32
+
+  # Make training data.
+  x = np.random.randn(num_chains, dims).astype(dtype)
+  true_weights = np.random.randn(dims).astype(dtype)
+  y = np.dot(x, true_weights) + np.random.randn(num_chains)
+
+  # Setup model.
+  prior = make_prior(dims, dtype)
+  def target_log_prob_fn(weights):
+    return prior.log_prob(weights) + make_likelihood(weights, x).log_prob(y)
+
+  proposal = tfd.MultivariateNormalDiag(
+      loc=tf.zeros(dims, dtype))
+
+  weight_samples, ais_weights, kernel_results = (
+      hmc.sample_annealed_importance_chain(
+        num_steps=1000,
+        proposal_log_prob_fn=proposal.log_prob,
+        target_log_prob_fn=target_log_prob_fn
+        current_state=tf.zeros([num_chains, dims], dtype),
+        step_size=0.1,
+        num_leapfrog_steps=2))
+  log_normalizer_estimate = (tf.reduce_logsumexp(ais_weights)
+                             - np.log(num_chains))
   ```
+
+  Args:
+    proposal_log_prob_fn: Python callable that returns the log density of the
+      initial distribution.
+    num_steps: Integer number of Markov chain updates to run. More
+      iterations means more expense, but smoother annealing between q
+      and p, which in turn means exponentially lower variance for the
+      normalizing constant estimator.
+    target_log_prob_fn: Python callable which takes an argument like
+      `current_state` (or `*current_state` if it's a list) and returns its
+      (possibly unnormalized) log-density under the target distribution.
+    current_state: `Tensor` or Python `list` of `Tensor`s representing the
+      current state(s) of the Markov chain(s). The first `r` dimensions index
+      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
+    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
+      for the leapfrog integrator. Must broadcast with the shape of
+      `current_state`. Larger step sizes lead to faster progress, but too-large
+      step sizes make rejection exponentially more likely. When possible, it's
+      often helpful to match per-variable step sizes to the standard deviations
+      of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    seed: Python integer to seed the random number generator.
+    name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_sample_annealed_importance_chain").
+
+  Returns:
+    accepted_state: `Tensor` or Python list of `Tensor`s representing the
+      state(s) of the Markov chain(s) at the final iteration. Has same shape as
+      input `current_state`.
+    ais_weights: Tensor with the estimated weight(s). Has shape matching
+      `target_log_prob_fn(current_state)`.
   """
-  with ops.name_scope(name, 'hmc_ais_chain',
-                      [n_iterations, step_size, n_leapfrog_steps, initial_x]):
-    non_event_shape = array_ops.shape(target_log_prob_fn(initial_x))
-
-    beta_series = math_ops.linspace(0., 1., n_iterations+1)[1:]
-    def _body(a, beta):  # pylint: disable=missing-docstring
-      def log_prob_beta(x):
-        return ((1 - beta) * proposal_log_prob_fn(x) +
-                beta * target_log_prob_fn(x))
-      last_x = a[0]
-      w = a[2]
-      w += (1. / n_iterations) * (target_log_prob_fn(last_x) -
-                                  proposal_log_prob_fn(last_x))
-      # TODO(b/66917083): There's an opportunity for gradient reuse here.
-      updated_x, acceptance_probs, _, _ = kernel(step_size, n_leapfrog_steps,
-                                                 last_x, log_prob_beta,
-                                                 event_dims)
-      return updated_x, acceptance_probs, w
-
-    x, acceptance_probs, w = functional_ops.scan(
-        _body, beta_series,
-        (initial_x, array_ops.zeros(non_event_shape, dtype=initial_x.dtype),
-         array_ops.zeros(non_event_shape, dtype=initial_x.dtype)))
-  return w[-1], x[-1], acceptance_probs[-1]
-
-
-def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
-           x_log_prob=None, x_grad=None, name=None):
+  def make_convex_combined_log_prob_fn(iter_):
+    def _fn(*args):
+      p = proposal_log_prob_fn(*args)
+      t = target_log_prob_fn(*args)
+      dtype = p.dtype.base_dtype
+      beta = (math_ops.cast(iter_ + 1, dtype)
+              / math_ops.cast(num_steps, dtype))
+      return (1. - beta) * p + beta * t
+    return _fn
+
+  with ops.name_scope(
+      name, "hmc_sample_annealed_importance_chain",
+      [num_steps, current_state, step_size, num_leapfrog_steps, seed]):
+    with ops.name_scope("initialize"):
+      [
+          current_state,
+          step_size,
+          current_log_prob,
+          current_grads_log_prob,
+      ] = _prepare_args(
+          make_convex_combined_log_prob_fn(iter_=0),
+          current_state,
+          step_size,
+          description="convex_combined_log_prob")
+    def _loop_body(iter_, ais_weights, current_state, kernel_results):
+      """Closure which implements `tf.while_loop` body."""
+      current_state_parts = (list(current_state)
+                             if _is_list_like(current_state)
+                             else [current_state])
+      ais_weights += ((target_log_prob_fn(*current_state_parts)
+                       - proposal_log_prob_fn(*current_state_parts))
+                      / math_ops.cast(num_steps, ais_weights.dtype))
+      return [iter_ + 1, ais_weights] + list(kernel(
+          make_convex_combined_log_prob_fn(iter_),
+          current_state,
+          step_size,
+          num_leapfrog_steps,
+          seed,
+          kernel_results.current_target_log_prob,
+          kernel_results.current_grads_target_log_prob))
+
+    [ais_weights, current_state, kernel_results] = control_flow_ops.while_loop(
+        cond=lambda iter_, *args: iter_ < num_steps,
+        body=_loop_body,
+        loop_vars=[
+            0,  # iter_
+            array_ops.zeros_like(current_log_prob),  # ais_weights
+            current_state,
+            _make_dummy_kernel_results(current_state,
+                                       current_log_prob,
+                                       current_grads_log_prob),
+        ])[1:]  # Lop-off "iter_".
+
+    return [current_state, ais_weights, kernel_results]
+
+
+def kernel(target_log_prob_fn,
+           current_state,
+           step_size,
+           num_leapfrog_steps,
+           seed=None,
+           current_target_log_prob=None,
+           current_grads_target_log_prob=None,
+           name=None):
   """Runs one iteration of Hamiltonian Monte Carlo.
 
   Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
@@ -316,334 +509,625 @@ def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
   a Metropolis proposal. This function applies one step of HMC to
   randomly update the variable `x`.
 
-  This function can update multiple chains in parallel. It assumes
-  that all dimensions of `x` not specified in `event_dims` are
-  independent, and should therefore be updated independently. The
-  output of `target_log_prob_fn()` should sum log-probabilities across
-  all event dimensions. Slices along dimensions not in `event_dims`
-  may have different target distributions; for example, if
-  `event_dims == (1,)`, then `x[0, :]` could have a different target
-  distribution from x[1, :]. This is up to `target_log_prob_fn()`.
-
-  Args:
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `x`. Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely.
-      When possible, it's often helpful to match per-variable step
-      sizes to the standard deviations of the target distribution in
-      each variable.
-    n_leapfrog_steps: Integer number of steps to run the leapfrog
-      integrator for. Total progress per HMC step is roughly
-      proportional to step_size * n_leapfrog_steps.
-    x: Tensor containing the value(s) of the random variable(s) to update.
-    target_log_prob_fn: Python callable which takes an argument like `initial_x`
-      and returns its (possibly unnormalized) log-density under the target
-      distribution.
-    event_dims: List of dimensions that should not be treated as
-      independent. This allows for multiple chains to be run independently
-      in parallel. Default is (), i.e., all dimensions are independent.
-    x_log_prob (optional): Tensor containing the cached output of a previous
-      call to `target_log_prob_fn()` evaluated at `x` (such as that provided by
-      a previous call to `kernel()`). Providing `x_log_prob` and
-      `x_grad` saves one gradient computation per call to `kernel()`.
-    x_grad (optional): Tensor containing the cached gradient of
-      `target_log_prob_fn()` evaluated at `x` (such as that provided by
-      a previous call to `kernel()`). Providing `x_log_prob` and
-      `x_grad` saves one gradient computation per call to `kernel()`.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    updated_x: The updated variable(s) x. Has shape matching `initial_x`.
-    acceptance_probs: Tensor with the acceptance probabilities for the final
-      iteration. This is useful for diagnosing step size problems etc. Has
-      shape matching `target_log_prob_fn(initial_x)`.
-    new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`.
-    new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at
-      `updated_x`.
+  This function can update multiple chains in parallel. It assumes that all
+  leftmost dimensions of `current_state` index independent chain states (and are
+  therefore updated independently). The output of `target_log_prob_fn()` should
+  sum log-probabilities across all event dimensions. Slices along the rightmost
+  dimensions may have different target distributions; for example,
+  `current_state[0, :]` could have a different target distribution from
+  `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of
+  independent chains is `tf.size(target_log_prob_fn(*current_state))`.)
 
   #### Examples:
 
+  ##### Simple chain with warm-up.
+
   ```python
+  tfd = tf.contrib.distributions
+
   # Tuning acceptance rates:
+  dtype = np.float32
   target_accept_rate = 0.631
-  def target_log_prob(x):
-    # Standard normal
-    return tf.reduce_sum(-0.5 * tf.square(x))
-  initial_x = tf.zeros([10])
-  initial_log_prob = target_log_prob(initial_x)
-  initial_grad = tf.gradients(initial_log_prob, initial_x)[0]
-  # Algorithm state
-  x = tf.Variable(initial_x, name='x')
-  step_size = tf.Variable(1., name='step_size')
-  last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob')
-  last_grad = tf.Variable(initial_grad, name='last_grad')
-  # Compute updates
-  new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x,
-                                                      target_log_prob,
-                                                      event_dims=[0],
-                                                      x_log_prob=last_log_prob)
-  x_update = tf.assign(x, new_x)
-  log_prob_update = tf.assign(last_log_prob, log_prob)
-  grad_update = tf.assign(last_grad, grad)
-  step_size_update = tf.assign(step_size,
-                               tf.where(acceptance_prob > target_accept_rate,
-                                        step_size * 1.01, step_size / 1.01))
-  adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update]
-  sampling_updates = [x_update, log_prob_update, grad_update]
-
-  sess = tf.Session()
-  sess.run(tf.global_variables_initializer())
+  num_warmup_iter = 500
+  num_chain_iter = 500
+
+  x = tf.get_variable(name="x", initializer=dtype(1))
+  step_size = tf.get_variable(name="step_size", initializer=dtype(1))
+
+  target = tfd.Normal(loc=dtype(0), scale=dtype(1))
+
+  new_x, other_results = hmc.kernel(
+      target_log_prob_fn=target.log_prob,
+      current_state=x,
+      step_size=step_size,
+      num_leapfrog_steps=3)[:4]
+
+  x_update = x.assign(new_x)
+
+  step_size_update = step_size.assign_add(
+      step_size * tf.where(
+        other_results.acceptance_probs > target_accept_rate,
+        0.01, -0.01))
+
+  warmup = tf.group([x_update, step_size_update])
+
+  tf.global_variables_initializer().run()
+
+  sess.graph.finalize()  # No more graph building.
+
   # Warm up the sampler and adapt the step size
-  for i in xrange(500):
-    sess.run(adaptive_updates)
+  for _ in xrange(num_warmup_iter):
+    sess.run(warmup)
+
   # Collect samples without adapting step size
-  samples = np.zeros([500, 10])
-  for i in xrange(500):
-    x_val, _ = sess.run([new_x, sampling_updates])
-    samples[i] = x_val
+  samples = np.zeros([num_chain_iter])
+  for i in xrange(num_chain_iter):
+    _, x_, target_log_prob_, grad_ = sess.run([
+        x_update,
+        x,
+        other_results.target_log_prob,
+        other_results.grads_target_log_prob])
+    samples[i] = x_
+
+  print(samples.mean(), samples.std())
   ```
 
-  ```python
-  # Empirical-Bayes estimation of a hyperparameter by MCMC-EM:
-
-  # Problem setup
-  N = 150
-  D = 10
-  x = np.random.randn(N, D).astype(np.float32)
-  true_sigma = 0.5
-  true_beta = true_sigma * np.random.randn(D).astype(np.float32)
-  y = x.dot(true_beta) + np.random.randn(N).astype(np.float32)
-
-  def log_prior(beta, log_sigma):
-    return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) -
-                         log_sigma)
-  def regression_log_joint(beta, log_sigma, x, y):
-    # This function returns log p(beta | log_sigma) + log p(y | x, beta).
-    means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True)
-    means = tf.squeeze(means)
-    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means))
-    return log_prior(beta, log_sigma) + log_likelihood
-  def log_joint_partial(beta):
-    return regression_log_joint(beta, log_sigma, x, y)
-  # Our estimate of log(sigma)
-  log_sigma = tf.Variable(0., name='log_sigma')
-  # The state of the Markov chain
-  beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta')
-  new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial,
-                                 event_dims=[0])
-  beta_update = tf.assign(beta, new_beta)
-  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-  with tf.control_dependencies([beta_update]):
-    log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma),
-                                          var_list=[log_sigma])
-
-  sess = tf.Session()
-  sess.run(tf.global_variables_initializer())
-  log_sigma_history = np.zeros(1000)
-  for i in xrange(1000):
-    log_sigma_val, _ = sess.run([log_sigma, log_sigma_update])
-    log_sigma_history[i] = log_sigma_val
-  # Should converge to something close to true_sigma
-  plt.plot(np.exp(log_sigma_history))
-  ```
-  """
-  with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]):
-    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
-    x = ops.convert_to_tensor(x, name='x')
-
-    x_shape = array_ops.shape(x)
-    m = random_ops.random_normal(x_shape, dtype=x.dtype)
-
-    kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims)
-
-    if (x_log_prob is not None) and (x_grad is not None):
-      log_potential_0, grad_0 = -x_log_prob, -x_grad  # pylint: disable=invalid-unary-operand-type
-    else:
-      if x_log_prob is not None:
-        logging.warn('x_log_prob was provided, but x_grad was not,'
-                     ' so x_log_prob was not used.')
-      if x_grad is not None:
-        logging.warn('x_grad was provided, but x_log_prob was not,'
-                     ' so x_grad was not used.')
-      log_potential_0, grad_0 = potential_and_grad(x)
-
-    new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator(
-        step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0)
-
-    kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims)
-
-    energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0
-    # Treat NaN as infinite energy (and therefore guaranteed rejection).
-    energy_change = array_ops.where(
-        math_ops.is_nan(energy_change),
-        array_ops.fill(array_ops.shape(energy_change),
-                       energy_change.dtype.as_numpy_dtype(np.inf)),
-        energy_change)
-    acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.))
-    accepted = (
-        random_ops.random_uniform(
-            array_ops.shape(acceptance_probs), dtype=x.dtype)
-        < acceptance_probs)
-    new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0)
-
-    # TODO(b/65738010): This should work, but it doesn't for now.
-    # reduced_shape = math_ops.reduced_shape(x_shape, event_dims)
-    reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims,
-                                                        keep_dims=True))
-    accepted = array_ops.reshape(accepted, reduced_shape)
-    accepted = math_ops.logical_or(
-        accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool))
-    new_x = array_ops.where(accepted, new_x, x)
-    new_grad = -array_ops.where(accepted, grad_1, grad_0)
-
-  # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect
-  # to initial_x will propagate NaNs (see testNanFromGradsDontPropagate).  This
-  # should be fixed.
-  return new_x, acceptance_probs, new_log_prob, new_grad
-
-
-def leapfrog_integrator(step_size, n_steps, initial_position, initial_momentum,
-                        potential_and_grad, initial_grad, name=None):
-  """Applies `n_steps` steps of the leapfrog integrator.
-
-  This just wraps `leapfrog_step()` in a `tf.while_loop()`, reusing
-  gradient computations where possible.
+  ##### Sample from more complicated posterior.
 
-  Args:
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `initial_position`. Larger step sizes lead to faster progress, but
-      too-large step sizes lead to larger discretization error and
-      worse energy conservation.
-    n_steps: Number of steps to run the leapfrog integrator.
-    initial_position: Tensor containing the value(s) of the position variable(s)
-      to update.
-    initial_momentum: Tensor containing the value(s) of the momentum variable(s)
-      to update.
-    potential_and_grad: Python callable that takes a position tensor like
-      `initial_position` and returns the potential energy and its gradient at
-      that position.
-    initial_grad: Tensor with the value of the gradient of the potential energy
-      at `initial_position`.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    updated_position: Updated value of the position.
-    updated_momentum: Updated value of the momentum.
-    new_potential: Potential energy of the new position. Has shape matching
-      `potential_and_grad(initial_position)`.
-    new_grad: Gradient from potential_and_grad() evaluated at the new position.
-      Has shape matching `initial_position`.
+  I.e.,
 
-  Example: Simple quadratic potential.
+  ```none
+    W ~ MVN(loc=0, scale=sigma * eye(dims))
+    for i=1...num_samples:
+        X[i] ~ MVN(loc=0, scale=eye(dims))
+      eps[i] ~ Normal(loc=0, scale=1)
+        Y[i] = X[i].T * W + eps[i]
+  ```
 
   ```python
-  def potential_and_grad(position):
-    return tf.reduce_sum(0.5 * tf.square(position)), position
-  position = tf.placeholder(np.float32)
-  momentum = tf.placeholder(np.float32)
-  potential, grad = potential_and_grad(position)
-  new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_integrator(
-    0.1, 3, position, momentum, potential_and_grad, grad)
-
-  sess = tf.Session()
-  position_val = np.random.randn(10)
-  momentum_val = np.random.randn(10)
-  potential_val, grad_val = sess.run([potential, grad],
-                                     {position: position_val})
-  positions = np.zeros([100, 10])
-  for i in xrange(100):
-    position_val, momentum_val, potential_val, grad_val = sess.run(
-      [new_position, new_momentum, new_potential, new_grad],
-      {position: position_val, momentum: momentum_val})
-    positions[i] = position_val
-  # Should trace out sinusoidal dynamics.
-  plt.plot(positions[:, 0])
-  ```
-  """
-  def leapfrog_wrapper(step_size, x, m, grad, l):
-    x, m, _, grad = leapfrog_step(step_size, x, m, potential_and_grad, grad)
-    return step_size, x, m, grad, l + 1
+  tfd = tf.contrib.distributions
+
+  def make_training_data(num_samples, dims, sigma):
+    dt = np.asarray(sigma).dtype
+    zeros = tf.zeros(dims, dtype=dt)
+    x = tfd.MultivariateNormalDiag(
+        loc=zeros).sample(num_samples, seed=1)
+    w = tfd.MultivariateNormalDiag(
+        loc=zeros,
+        scale_identity_multiplier=sigma).sample(seed=2)
+    noise = tfd.Normal(
+        loc=dt(0),
+        scale=dt(1)).sample(num_samples, seed=3)
+    y = tf.tensordot(x, w, axes=[[1], [0]]) + noise
+    return y, x, w
+
+  def make_prior(sigma, dims):
+    # p(w | sigma)
+    return tfd.MultivariateNormalDiag(
+        loc=tf.zeros([dims], dtype=sigma.dtype),
+        scale_identity_multiplier=sigma)
+
+  def make_likelihood(x, w):
+    # p(y | x, w)
+    return tfd.MultivariateNormalDiag(
+        loc=tf.tensordot(x, w, axes=[[1], [0]]))
+
+  # Setup assumptions.
+  dtype = np.float32
+  num_samples = 150
+  dims = 10
+  num_iters = int(5e3)
+
+  true_sigma = dtype(0.5)
+  y, x, true_weights = make_training_data(num_samples, dims, true_sigma)
+
+  # Estimate of `log(true_sigma)`.
+  log_sigma = tf.get_variable(name="log_sigma", initializer=dtype(0))
+  sigma = tf.exp(log_sigma)
+
+  # State of the Markov chain.
+  weights = tf.get_variable(
+      name="weights",
+      initializer=np.random.randn(dims).astype(dtype))
+
+  prior = make_prior(sigma, dims)
+
+  def joint_log_prob_fn(w):
+    # f(w) = log p(w, y | x)
+    return prior.log_prob(w) + make_likelihood(x, w).log_prob(y)
+
+  weights_update = weights.assign(
+      hmc.kernel(target_log_prob_fn=joint_log_prob,
+                 current_state=weights,
+                 step_size=0.1,
+                 num_leapfrog_steps=5)[0])
+
+  with tf.control_dependencies([weights_update]):
+    loss = -prior.log_prob(weights)
+
+  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+  log_sigma_update = optimizer.minimize(loss, var_list=[log_sigma])
+
+  sess.graph.finalize()  # No more graph building.
 
-  def counter_fn(a, b, c, d, counter):  # pylint: disable=unused-argument
-    return counter < n_steps
+  tf.global_variables_initializer().run()
 
-  with ops.name_scope(name, 'leapfrog_integrator',
-                      [step_size, n_steps, initial_position, initial_momentum,
-                       initial_grad]):
-    _, new_x, new_m, new_grad, _ = control_flow_ops.while_loop(
-        counter_fn, leapfrog_wrapper, [step_size, initial_position,
-                                       initial_momentum, initial_grad,
-                                       array_ops.constant(0)], back_prop=False)
-    # We're counting on the runtime to eliminate this redundant computation.
-    new_potential, new_grad = potential_and_grad(new_x)
-  return new_x, new_m, new_potential, new_grad
+  sigma_history = np.zeros(num_iters, dtype)
+  weights_history = np.zeros([num_iters, dims], dtype)
 
+  for i in xrange(num_iters):
+    _, sigma_, weights_, _ = sess.run([log_sigma_update, sigma, weights])
+    weights_history[i, :] = weights_
+    sigma_history[i] = sigma_
 
-def leapfrog_step(step_size, position, momentum, potential_and_grad, grad,
-                  name=None):
-  """Applies one step of the leapfrog integrator.
+  true_weights_ = sess.run(true_weights)
 
-  Assumes a simple quadratic kinetic energy function: 0.5 * ||momentum||^2.
+  # Should converge to something close to true_sigma.
+  plt.plot(sigma_history);
+  plt.ylabel("sigma");
+  plt.xlabel("iteration");
+  ```
 
   Args:
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `position`. Larger step sizes lead to faster progress, but
-      too-large step sizes lead to larger discretization error and
-      worse energy conservation.
-    position: Tensor containing the value(s) of the position variable(s)
-      to update.
-    momentum: Tensor containing the value(s) of the momentum variable(s)
-      to update.
-    potential_and_grad: Python callable that takes a position tensor like
-      `position` and returns the potential energy and its gradient at that
-      position.
-    grad: Tensor with the value of the gradient of the potential energy
-      at `position`.
+    target_log_prob_fn: Python callable which takes an argument like
+      `current_state` (or `*current_state` if it's a list) and returns its
+      (possibly unnormalized) log-density under the target distribution.
+    current_state: `Tensor` or Python `list` of `Tensor`s representing the
+      current state(s) of the Markov chain(s). The first `r` dimensions index
+      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
+    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
+      for the leapfrog integrator. Must broadcast with the shape of
+      `current_state`. Larger step sizes lead to faster progress, but too-large
+      step sizes make rejection exponentially more likely. When possible, it's
+      often helpful to match per-variable step sizes to the standard deviations
+      of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    seed: Python integer to seed the random number generator.
+    current_target_log_prob: (Optional) `Tensor` representing the value of
+      `target_log_prob_fn` at the `current_state`. The only reason to
+      specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
+      representing gradient of `current_target_log_prob` at the `current_state`
+      and wrt the `current_state`. Must have same shape as `current_state`. The
+      only reason to specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
     name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_kernel").
 
   Returns:
-    updated_position: Updated value of the position.
-    updated_momentum: Updated value of the momentum.
-    new_potential: Potential energy of the new position. Has shape matching
-      `potential_and_grad(position)`.
-    new_grad: Gradient from potential_and_grad() evaluated at the new position.
-      Has shape matching `position`.
+    accepted_state: Tensor or Python list of `Tensor`s representing the state(s)
+      of the Markov chain(s) at each result step. Has same shape as
+      `current_state`.
+    acceptance_probs: Tensor with the acceptance probabilities for each
+      iteration. Has shape matching `target_log_prob_fn(current_state)`.
+    accepted_target_log_prob: `Tensor` representing the value of
+      `target_log_prob_fn` at `accepted_state`.
+    accepted_grads_target_log_prob: Python `list` of `Tensor`s representing the
+      gradient of `accepted_target_log_prob` wrt each `accepted_state`.
+
+  Raises:
+    ValueError: if there isn't one `step_size` or a list with same length as
+      `current_state`.
+  """
+  with ops.name_scope(
+      name, "hmc_kernel",
+      [current_state, step_size, num_leapfrog_steps, seed,
+       current_target_log_prob, current_grads_target_log_prob]):
+    with ops.name_scope("initialize"):
+      [current_state_parts, step_sizes, current_target_log_prob,
+       current_grads_target_log_prob] = _prepare_args(
+           target_log_prob_fn, current_state, step_size,
+           current_target_log_prob, current_grads_target_log_prob,
+           maybe_expand=True)
+      independent_chain_ndims = distributions_util.prefer_static_rank(
+          current_target_log_prob)
+      def init_momentum(s):
+        return random_ops.random_normal(
+            shape=array_ops.shape(s),
+            dtype=s.dtype.base_dtype,
+            seed=distributions_util.gen_new_seed(
+                seed, salt="hmc_kernel_momentums"))
+      current_momentums = [init_momentum(s) for s in current_state_parts]
+
+    [
+        proposed_momentums,
+        proposed_state_parts,
+        proposed_target_log_prob,
+        proposed_grads_target_log_prob,
+    ] = _leapfrog_integrator(current_momentums,
+                             target_log_prob_fn,
+                             current_state_parts,
+                             step_sizes,
+                             num_leapfrog_steps,
+                             current_target_log_prob,
+                             current_grads_target_log_prob)
+
+    energy_change = _compute_energy_change(current_target_log_prob,
+                                           current_momentums,
+                                           proposed_target_log_prob,
+                                           proposed_momentums,
+                                           independent_chain_ndims)
+
+    # u < exp(min(-energy, 0)),  where u~Uniform[0,1)
+    # ==> -log(u) >= max(e, 0)
+    # ==> -log(u) >= e
+    # (Perhaps surprisingly, we don't have a better way to obtain a random
+    # uniform from positive reals, i.e., `tf.random_uniform(minval=0,
+    # maxval=np.inf)` won't work.)
+    random_uniform = random_ops.random_uniform(
+        shape=array_ops.shape(energy_change),
+        dtype=energy_change.dtype,
+        seed=seed)
+    random_positive = -math_ops.log(random_uniform)
+    is_accepted = random_positive >= energy_change
+
+    accepted_target_log_prob = array_ops.where(is_accepted,
+                                               proposed_target_log_prob,
+                                               current_target_log_prob)
+
+    accepted_state_parts = [_choose(is_accepted,
+                                    proposed_state_part,
+                                    current_state_part,
+                                    independent_chain_ndims)
+                            for current_state_part, proposed_state_part
+                            in zip(current_state_parts, proposed_state_parts)]
+
+    accepted_grads_target_log_prob = [
+        _choose(is_accepted,
+                proposed_grad,
+                grad,
+                independent_chain_ndims)
+        for proposed_grad, grad
+        in zip(proposed_grads_target_log_prob, current_grads_target_log_prob)]
+
+    maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0]
+    return [
+        maybe_flatten(accepted_state_parts),
+        KernelResults(
+            acceptance_probs=math_ops.exp(math_ops.minimum(-energy_change, 0.)),
+            current_grads_target_log_prob=accepted_grads_target_log_prob,
+            current_target_log_prob=accepted_target_log_prob,
+            energy_change=energy_change,
+            is_accepted=is_accepted,
+            proposed_grads_target_log_prob=proposed_grads_target_log_prob,
+            proposed_state=maybe_flatten(proposed_state_parts),
+            proposed_target_log_prob=proposed_target_log_prob,
+            random_positive=random_positive,
+        ),
+    ]
+
+
+def _leapfrog_integrator(current_momentums,
+                         target_log_prob_fn,
+                         current_state_parts,
+                         step_sizes,
+                         num_leapfrog_steps,
+                         current_target_log_prob=None,
+                         current_grads_target_log_prob=None,
+                         name=None):
+  """Applies `num_leapfrog_steps` of the leapfrog integrator.
+
+  Assumes a simple quadratic kinetic energy function: `0.5 ||momentum||**2`.
+
+  #### Examples:
 
-  Example: Simple quadratic potential.
+  ##### Simple quadratic potential.
 
   ```python
-  def potential_and_grad(position):
-    # Simple quadratic potential
-    return tf.reduce_sum(0.5 * tf.square(position)), position
+  tfd = tf.contrib.distributions
+
+  dims = 10
+  num_iter = int(1e3)
+  dtype = np.float32
+
   position = tf.placeholder(np.float32)
   momentum = tf.placeholder(np.float32)
-  potential, grad = potential_and_grad(position)
-  new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_step(
-    0.1, position, momentum, potential_and_grad, grad)
-
-  sess = tf.Session()
-  position_val = np.random.randn(10)
-  momentum_val = np.random.randn(10)
-  potential_val, grad_val = sess.run([potential, grad],
-                                     {position: position_val})
-  positions = np.zeros([100, 10])
-  for i in xrange(100):
-    position_val, momentum_val, potential_val, grad_val = sess.run(
-      [new_position, new_momentum, new_potential, new_grad],
-      {position: position_val, momentum: momentum_val})
-    positions[i] = position_val
-  # Should trace out sinusoidal dynamics.
-  plt.plot(positions[:, 0])
+
+  [
+      new_momentums,
+      new_positions,
+  ] = hmc._leapfrog_integrator(
+      current_momentums=[momentum],
+      target_log_prob_fn=tfd.MultivariateNormalDiag(
+          loc=tf.zeros(dims, dtype)).log_prob,
+      current_state_parts=[position],
+      step_sizes=0.1,
+      num_leapfrog_steps=3)[:2]
+
+  sess.graph.finalize()  # No more graph building.
+
+  momentum_ = np.random.randn(dims).astype(dtype)
+  position_ = np.random.randn(dims).astype(dtype)
+
+  positions = np.zeros([num_iter, dims], dtype)
+  for i in xrange(num_iter):
+    position_, momentum_ = sess.run(
+        [new_momentums[0], new_position[0]],
+        feed_dict={position: position_, momentum: momentum_})
+    positions[i] = position_
+
+  plt.plot(positions[:, 0]);  # Sinusoidal.
   ```
+
+  Args:
+    current_momentums: Tensor containing the value(s) of the momentum
+      variable(s) to update.
+    target_log_prob_fn: Python callable which takes an argument like
+      `*current_state_parts` and returns its (possibly unnormalized) log-density
+      under the target distribution.
+    current_state_parts: Python `list` of `Tensor`s representing the current
+      state(s) of the Markov chain(s). The first `independent_chain_ndims` of
+      the `Tensor`(s) index different chains.
+    step_sizes: Python `list` of `Tensor`s representing the step size for the
+      leapfrog integrator. Must broadcast with the shape of
+      `current_state_parts`.  Larger step sizes lead to faster progress, but
+      too-large step sizes make rejection exponentially more likely. When
+      possible, it's often helpful to match per-variable step sizes to the
+      standard deviations of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    current_target_log_prob: (Optional) `Tensor` representing the value of
+      `target_log_prob_fn(*current_state_parts)`. The only reason to specify
+      this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
+      representing gradient of `target_log_prob_fn(*current_state_parts`) wrt
+      `current_state_parts`. Must have same shape as `current_state_parts`. The
+      only reason to specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_leapfrog_integrator").
+
+  Returns:
+    proposed_momentums: Updated value of the momentum.
+    proposed_state_parts: Tensor or Python list of `Tensor`s representing the
+      state(s) of the Markov chain(s) at each result step. Has same shape as
+      input `current_state_parts`.
+    proposed_target_log_prob: `Tensor` representing the value of
+      `target_log_prob_fn` at `accepted_state`.
+    proposed_grads_target_log_prob: Gradient of `proposed_target_log_prob` wrt
+      `accepted_state`.
+
+  Raises:
+    ValueError: if `len(momentums) != len(state_parts)`.
+    ValueError: if `len(state_parts) != len(step_sizes)`.
+    ValueError: if `len(state_parts) != len(grads_target_log_prob)`.
+    TypeError: if `not target_log_prob.dtype.is_floating`.
   """
-  with ops.name_scope(name, 'leapfrog_step', [step_size, position, momentum,
-                                              grad]):
-    momentum -= 0.5 * step_size * grad
-    position += step_size * momentum
-    potential, grad = potential_and_grad(position)
-    momentum -= 0.5 * step_size * grad
-
-  return position, momentum, potential, grad
+  def _loop_body(step,
+                 current_momentums,
+                 current_state_parts,
+                 ignore_current_target_log_prob,  # pylint: disable=unused-argument
+                 current_grads_target_log_prob):
+    return [step + 1] + list(_leapfrog_step(current_momentums,
+                                            target_log_prob_fn,
+                                            current_state_parts,
+                                            step_sizes,
+                                            current_grads_target_log_prob))
+
+  with ops.name_scope(
+      name, "hmc_leapfrog_integrator",
+      [current_momentums, current_state_parts, step_sizes, num_leapfrog_steps,
+       current_target_log_prob, current_grads_target_log_prob]):
+    if len(current_momentums) != len(current_state_parts):
+      raise ValueError("`momentums` must be in one-to-one correspondence "
+                       "with `state_parts`")
+    num_leapfrog_steps = ops.convert_to_tensor(num_leapfrog_steps,
+                                               name="num_leapfrog_steps")
+    current_target_log_prob, current_grads_target_log_prob = (
+        _maybe_call_fn_and_grads(
+            target_log_prob_fn,
+            current_state_parts,
+            current_target_log_prob,
+            current_grads_target_log_prob))
+    return control_flow_ops.while_loop(
+        cond=lambda iter_, *args: iter_ < num_leapfrog_steps,
+        body=_loop_body,
+        loop_vars=[
+            0,  # iter_
+            current_momentums,
+            current_state_parts,
+            current_target_log_prob,
+            current_grads_target_log_prob,
+        ],
+        back_prop=False)[1:]  # Lop-off "iter_".
+
+
+def _leapfrog_step(current_momentums,
+                   target_log_prob_fn,
+                   current_state_parts,
+                   step_sizes,
+                   current_grads_target_log_prob,
+                   name=None):
+  """Applies one step of the leapfrog integrator."""
+  with ops.name_scope(
+      name, "_leapfrog_step",
+      [current_momentums, current_state_parts, step_sizes,
+       current_grads_target_log_prob]):
+    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
+                          in zip(current_momentums,
+                                 step_sizes,
+                                 current_grads_target_log_prob)]
+    proposed_state_parts = [x + ss * m for x, ss, m
+                            in zip(current_state_parts,
+                                   step_sizes,
+                                   proposed_momentums)]
+    proposed_target_log_prob = target_log_prob_fn(*proposed_state_parts)
+    if not proposed_target_log_prob.dtype.is_floating:
+      raise TypeError("`target_log_prob_fn` must produce a `Tensor` "
+                      "with `float` `dtype`.")
+    proposed_grads_target_log_prob = gradients_ops.gradients(
+        proposed_target_log_prob, proposed_state_parts)
+    if any(g is None for g in proposed_grads_target_log_prob):
+      raise ValueError(
+          "Encountered `None` gradient. Does your target `target_log_prob_fn` "
+          "access all `tf.Variable`s via `tf.get_variable`?\n"
+          "  current_state_parts: {}\n"
+          "  proposed_state_parts: {}\n"
+          "  proposed_grads_target_log_prob: {}".format(
+              current_state_parts,
+              proposed_state_parts,
+              proposed_grads_target_log_prob))
+    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
+                          in zip(proposed_momentums,
+                                 step_sizes,
+                                 proposed_grads_target_log_prob)]
+    return [
+        proposed_momentums,
+        proposed_state_parts,
+        proposed_target_log_prob,
+        proposed_grads_target_log_prob,
+    ]
+
+
+def _compute_energy_change(current_target_log_prob,
+                           current_momentums,
+                           proposed_target_log_prob,
+                           proposed_momentums,
+                           independent_chain_ndims,
+                           name=None):
+  """Helper to `kernel` which computes the energy change."""
+  with ops.name_scope(
+      name, "compute_energy_change",
+      ([current_target_log_prob, proposed_target_log_prob,
+        independent_chain_ndims] +
+       current_momentums + proposed_momentums)):
+    # Abbreviate lk0=log_kinetic_energy and lk1=proposed_log_kinetic_energy
+    # since they're a mouthful and lets us inline more.
+    lk0, lk1 = [], []
+    for current_momentum, proposed_momentum in zip(current_momentums,
+                                                   proposed_momentums):
+      axis = math_ops.range(independent_chain_ndims,
+                            array_ops.rank(current_momentum))
+      lk0.append(_log_sum_sq(current_momentum, axis))
+      lk1.append(_log_sum_sq(proposed_momentum, axis))
+
+    lk0 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk0, axis=-1),
+                                                  axis=-1)
+    lk1 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk1, axis=-1),
+                                                  axis=-1)
+    lp0 = -current_target_log_prob   # log_potential
+    lp1 = -proposed_target_log_prob  # proposed_log_potential
+    x = array_ops.stack([lp1, math_ops.exp(lk1), -lp0, -math_ops.exp(lk0)],
+                        axis=-1)
+
+    # The sum is NaN if any element is NaN or we see both +Inf and -Inf.
+    # Thus we will replace such rows with infinite energy change which implies
+    # rejection. Recall that float-comparisons with NaN are always False.
+    is_sum_determinate = (
+        math_ops.reduce_all(math_ops.is_finite(x) | (x >= 0.), axis=-1) &
+        math_ops.reduce_all(math_ops.is_finite(x) | (x <= 0.), axis=-1))
+    is_sum_determinate = array_ops.tile(
+        is_sum_determinate[..., array_ops.newaxis],
+        multiples=array_ops.concat([
+            array_ops.ones(array_ops.rank(is_sum_determinate),
+                           dtype=dtypes.int32),
+            [4],
+        ], axis=0))
+    x = array_ops.where(is_sum_determinate,
+                        x,
+                        array_ops.fill(array_ops.shape(x),
+                                       value=x.dtype.as_numpy_dtype(np.inf)))
+
+    return math_ops.reduce_sum(x, axis=-1)
+
+
+def _choose(is_accepted,
+            accepted,
+            rejected,
+            independent_chain_ndims,
+            name=None):
+  """Helper to `kernel` which expand_dims `is_accepted` to apply tf.where."""
+  def _expand_is_accepted_like(x):
+    with ops.name_scope("_choose"):
+      expand_shape = array_ops.concat([
+          array_ops.shape(is_accepted),
+          array_ops.ones([array_ops.rank(x) - array_ops.rank(is_accepted)],
+                         dtype=dtypes.int32),
+      ], axis=0)
+      multiples = array_ops.concat([
+          array_ops.ones([array_ops.rank(is_accepted)], dtype=dtypes.int32),
+          array_ops.shape(x)[independent_chain_ndims:],
+      ], axis=0)
+      m = array_ops.tile(array_ops.reshape(is_accepted, expand_shape),
+                         multiples)
+      m.set_shape(x.shape)
+      return m
+  with ops.name_scope(name, "_choose", values=[
+      is_accepted, accepted, rejected, independent_chain_ndims]):
+    return array_ops.where(_expand_is_accepted_like(accepted),
+                           accepted,
+                           rejected)
+
+
+def _maybe_call_fn_and_grads(fn,
+                             fn_arg_list,
+                             fn_result=None,
+                             grads_fn_result=None,
+                             description="target_log_prob"):
+  """Helper which computes `fn_result` and `grads` if needed."""
+  fn_arg_list = (list(fn_arg_list) if _is_list_like(fn_arg_list)
+                 else [fn_arg_list])
+  if fn_result is None:
+    fn_result = fn(*fn_arg_list)
+  if not fn_result.dtype.is_floating:
+    raise TypeError("`{}` must be a `Tensor` with `float` `dtype`.".format(
+        description))
+  if grads_fn_result is None:
+    grads_fn_result = gradients_ops.gradients(
+        fn_result, fn_arg_list)
+  if len(fn_arg_list) != len(grads_fn_result):
+    raise ValueError("`{}` must be in one-to-one correspondence with "
+                     "`grads_{}`".format(*[description]*2))
+  if any(g is None for g in grads_fn_result):
+    raise ValueError("Encountered `None` gradient.")
+  return fn_result, grads_fn_result
+
+
+def _prepare_args(target_log_prob_fn, state, step_size,
+                  target_log_prob=None, grads_target_log_prob=None,
+                  maybe_expand=False, description="target_log_prob"):
+  """Helper which processes input args to meet list-like assumptions."""
+  state_parts = list(state) if _is_list_like(state) else [state]
+  state_parts = [ops.convert_to_tensor(s, name="state")
+                 for s in state_parts]
+  target_log_prob, grads_target_log_prob = _maybe_call_fn_and_grads(
+      target_log_prob_fn,
+      state_parts,
+      target_log_prob,
+      grads_target_log_prob,
+      description)
+  step_sizes = list(step_size) if _is_list_like(step_size) else [step_size]
+  step_sizes = [
+      ops.convert_to_tensor(
+          s, name="step_size", dtype=target_log_prob.dtype)
+      for s in step_sizes]
+  if len(step_sizes) == 1:
+    step_sizes *= len(state_parts)
+  if len(state_parts) != len(step_sizes):
+    raise ValueError("There should be exactly one `step_size` or it should "
+                     "have same length as `current_state`.")
+  if maybe_expand:
+    maybe_flatten = lambda x: x
+  else:
+    maybe_flatten = lambda x: x if _is_list_like(state) else x[0]
+  return [
+      maybe_flatten(state_parts),
+      maybe_flatten(step_sizes),
+      target_log_prob,
+      grads_target_log_prob,
+  ]
+
+
+def _is_list_like(x):
+  """Helper which returns `True` if input is `list`-like."""
+  return isinstance(x, (tuple, list))
+
+
+def _log_sum_sq(x, axis=None):
+  """Computes log(sum(x**2))."""
+  return math_ops.reduce_logsumexp(2. * math_ops.log(math_ops.abs(x)), axis)
-- 
GitLab


From f85f23bea87ea856ac839806b5d62f4257fb684b Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Fri, 2 Feb 2018 13:08:31 -0800
Subject: [PATCH 596/966] Support `float16` `dtype` in `tf.linalg.*`. Note: not
 all `LinearOperator` functions will support `float16`. This change merely
 enables constructing the `LinearOperator` object(s) using this `dtype`.

PiperOrigin-RevId: 184323477
---
 tensorflow/python/ops/linalg/linalg_impl.py           |  8 ++++----
 tensorflow/python/ops/linalg/linear_operator_diag.py  | 11 ++++++++---
 .../python/ops/linalg/linear_operator_full_matrix.py  | 10 ++++++++--
 .../ops/linalg/linear_operator_low_rank_update.py     | 10 +++++++---
 .../ops/linalg/linear_operator_lower_triangular.py    |  9 +++++++--
 5 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index db33a08137..a5096ffdd9 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -65,8 +65,8 @@ def logdet(matrix, name=None):
   ```
 
   Args:
-    matrix:  A `Tensor`. Must be `float32`, `float64`, `complex64`, or
-      `complex128` with shape `[..., M, M]`.
+    matrix:  A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`,
+      or `complex128` with shape `[..., M, M]`.
     name:  A name to give this `Op`.  Defaults to `logdet`.
 
   Returns:
@@ -99,8 +99,8 @@ def adjoint(matrix, name=None):
                         #  [3 - 3j, 6 - 6j]]
 
   Args:
-    matrix:  A `Tensor`. Must be `float32`, `float64`, `complex64`, or
-      `complex128` with shape `[..., M, M]`.
+    matrix:  A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`,
+      or `complex128` with shape `[..., M, M]`.
     name:  A name to give this `Op` (optional).
 
   Returns:
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index a4724d030f..2217bfd545 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -121,8 +121,8 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
 
     Args:
       diag:  Shape `[B1,...,Bb, N]` `Tensor` with `b >= 0` `N >= 0`.
-        The diagonal of the operator.  Allowed dtypes: `float32`, `float64`,
-          `complex64`, `complex128`.
+        The diagonal of the operator.  Allowed dtypes: `float16`, `float32`,
+          `float64`, `complex64`, `complex128`.
       is_non_singular:  Expect that this operator is non-singular.
       is_self_adjoint:  Expect that this operator is equal to its hermitian
         transpose.  If `diag.dtype` is real, this is auto-set to `True`.
@@ -167,7 +167,12 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   def _check_diag(self, diag):
     """Static check of diag."""
     allowed_dtypes = [
-        dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
+        dtypes.float16,
+        dtypes.float32,
+        dtypes.float64,
+        dtypes.complex64,
+        dtypes.complex128,
+    ]
 
     dtype = diag.dtype
     if dtype not in allowed_dtypes:
diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
index dd4c7cb041..8fb59ca1a7 100644
--- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
+++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
@@ -114,7 +114,8 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
 
     Args:
       matrix:  Shape `[B1,...,Bb, M, N]` with `b >= 0`, `M, N >= 0`.
-        Allowed dtypes: `float32`, `float64`, `complex64`, `complex128`.
+        Allowed dtypes: `float16`, `float32`, `float64`, `complex64`,
+        `complex128`.
       is_non_singular:  Expect that this operator is non-singular.
       is_self_adjoint:  Expect that this operator is equal to its hermitian
         transpose.
@@ -147,7 +148,12 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   def _check_matrix(self, matrix):
     """Static check of the `matrix` argument."""
     allowed_dtypes = [
-        dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
+        dtypes.float16,
+        dtypes.float32,
+        dtypes.float64,
+        dtypes.complex64,
+        dtypes.complex128,
+    ]
 
     matrix = ops.convert_to_tensor(matrix, name="matrix")
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
index ad3bb2efa9..36eed89db6 100644
--- a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
+++ b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
@@ -150,8 +150,8 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     `is_X` matrix property hints, which will trigger the appropriate code path.
 
     Args:
-      base_operator:  Shape `[B1,...,Bb, M, N]` real `float32` or `float64`
-        `LinearOperator`.  This is `L` above.
+      base_operator:  Shape `[B1,...,Bb, M, N]` real `float16`, `float32` or
+        `float64` `LinearOperator`.  This is `L` above.
       u:  Shape `[B1,...,Bb, M, K]` `Tensor` of same `dtype` as `base_operator`.
         This is `U` above.
       diag_update:  Optional shape `[B1,...,Bb, K]` `Tensor` with same `dtype`
@@ -188,7 +188,11 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     #    because if diag has non-zero imaginary part, it will not be
     #    self-adjoint positive definite.
     dtype = base_operator.dtype
-    allowed_dtypes = [dtypes.float32, dtypes.float64]
+    allowed_dtypes = [
+        dtypes.float16,
+        dtypes.float32,
+        dtypes.float64,
+    ]
     if dtype not in allowed_dtypes:
       raise TypeError(
           "Argument matrix must have dtype in %s.  Found: %s"
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index 6ea55f0367..6419030755 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -118,7 +118,8 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
     Args:
       tril:  Shape `[B1,...,Bb, N, N]` with `b >= 0`, `N >= 0`.
         The lower triangular part of `tril` defines this operator.  The strictly
-        upper triangle is ignored.  Allowed dtypes: `float32`, `float64`.
+        upper triangle is ignored.  Allowed dtypes: `float16`, `float32`,
+        `float64`.
       is_non_singular:  Expect that this operator is non-singular.
         This operator is non-singular if and only if its diagonal elements are
         all non-zero.
@@ -164,7 +165,11 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
     """Static check of the `tril` argument."""
     # TODO(langmore) Add complex types once matrix_triangular_solve works for
     # them.
-    allowed_dtypes = [dtypes.float32, dtypes.float64]
+    allowed_dtypes = [
+        dtypes.float16,
+        dtypes.float32,
+        dtypes.float64,
+    ]
     dtype = tril.dtype
     if dtype not in allowed_dtypes:
       raise TypeError(
-- 
GitLab


From cf330efa207d635efecfb1b703ee65ccfbc3f98c Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 2 Feb 2018 13:39:38 -0800
Subject: [PATCH 597/966] [tf.data] Move framework/dataset.h to
 framework/dataset_stateful_op_whitelist.h.

This will make way for the move of kernels/data/dataset.h to
framework/dataset.h, while preserving version control history, after
which we might recombine the headers.

PiperOrigin-RevId: 184327481
---
 tensorflow/core/BUILD                                       | 2 +-
 .../{dataset.h => dataset_stateful_op_whitelist.h}          | 6 +++---
 tensorflow/core/kernels/data/dataset.h                      | 2 +-
 tensorflow/core/ops/lookup_ops.cc                           | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)
 rename tensorflow/core/framework/{dataset.h => dataset_stateful_op_whitelist.h} (93%)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 1fcdfa73ad..68c82ea303 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -432,7 +432,7 @@ tf_cuda_library(
         "framework/cancellation.h",
         "framework/common_shape_fns.h",
         "framework/control_flow.h",  # TODO(josh11b): Make internal?
-        "framework/dataset.h",
+        "framework/dataset_stateful_op_whitelist.h",
         "framework/device_base.h",
         "framework/function.h",
         "framework/graph_def_util.h",
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset_stateful_op_whitelist.h
similarity index 93%
rename from tensorflow/core/framework/dataset.h
rename to tensorflow/core/framework/dataset_stateful_op_whitelist.h
index f866183f61..3b48999edb 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset_stateful_op_whitelist.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_DATASET_H_
-#define TENSORFLOW_FRAMEWORK_DATASET_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_
+#define TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_
 
 #include "tensorflow/core/lib/core/status.h"
 
@@ -74,4 +74,4 @@ class WhitelistedStatefulOpRegistry {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_DATASET_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_
diff --git a/tensorflow/core/kernels/data/dataset.h b/tensorflow/core/kernels/data/dataset.h
index 08c3ca82ea..8238661b87 100644
--- a/tensorflow/core/kernels/data/dataset.h
+++ b/tensorflow/core/kernels/data/dataset.h
@@ -19,7 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
-#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/dataset_stateful_op_whitelist.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc
index 50ea8ad01a..444aa8b954 100644
--- a/tensorflow/core/ops/lookup_ops.cc
+++ b/tensorflow/core/ops/lookup_ops.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/dataset_stateful_op_whitelist.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/shape_inference.h"
-- 
GitLab


From 1a92f45677ee66af24f2219c6b1cbaeee87056b7 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Fri, 2 Feb 2018 13:49:35 -0800
Subject: [PATCH 598/966] TFLite: Conv CBLAS kernel PiperOrigin-RevId:
 184328848

---
 tensorflow/contrib/lite/kernels/conv.cc       | 32 ++++++-
 tensorflow/contrib/lite/kernels/conv_test.cc  |  2 +
 .../contrib/lite/kernels/internal/BUILD       |  2 +
 .../kernels/internal/optimized/cblas_conv.h   | 92 +++++++++++++++++++
 .../internal/optimized/cblas_reference.h      | 66 +++++++++++++
 5 files changed, 193 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
 create mode 100644 tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h

diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index a5095e1e64..7a45647434 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/gemm_support.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
@@ -38,11 +39,16 @@ namespace ops {
 namespace builtin {
 namespace conv {
 
-// This file has three implementation of Conv.
+// This file has 4 implementation of Conv.
 enum KernelType {
   kReference,
   kGenericOptimized,  // Neon-free
   kMultithreadOptimized,
+  // The kernel uses use CBLAS interface for matrix multiplication.
+  // It's fast when an optimized CBLAS implementation is available (e.g. Apple
+  // Accelerate Framework), and it's slow when falling back to naive
+  // implementation.
+  kCblasOptimized,
 };
 
 struct OpData {
@@ -305,6 +311,7 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
       break;
     case kGenericOptimized:
     case kMultithreadOptimized:
+    case kCblasOptimized:
       // There is only one optimized implementation for Quantized Conv.
       optimized_ops::Conv(
           GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
@@ -369,6 +376,17 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
           GetTensorData<float>(im2col), GetTensorDims(im2col));
       break;
     }
+    case kCblasOptimized: {
+      cblas_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
+                      GetTensorData<float>(filter), GetTensorDims(filter),
+                      GetTensorData<float>(bias), GetTensorDims(bias),
+                      params->stride_width, params->stride_height,
+                      data->padding.width, data->padding.height,
+                      output_activation_min, output_activation_max,
+                      GetTensorData<float>(output), GetTensorDims(output),
+                      GetTensorData<float>(im2col), GetTensorDims(im2col));
+      break;
+    }
   }
 }
 
@@ -435,8 +453,20 @@ TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
   return &r;
 }
 
+TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
+  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+                                 conv::Eval<conv::kCblasOptimized>};
+  return &r;
+}
+
 TfLiteRegistration* Register_CONV_2D() {
+// TODO(ycling): Define a compilation flag and use CBLAS kernel when a
+// fast CBLAS implementatino is available.
+#ifdef TFLITE_USE_CBLAS_CONVOLUTION_KERNEL
+  return Register_CONVOLUTION_CBLAS_OPT();
+#else
   return Register_CONVOLUTION_MULTITHREADED_OPT();
+#endif
 }
 
 }  // namespace builtin
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index 7550f7cc0d..d2393c3c97 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -29,6 +29,7 @@ namespace builtin {
 TfLiteRegistration* Register_CONVOLUTION_REF();
 TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT();
 TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT();
+TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT();
 
 }  // namespace builtin
 }  // namespace ops
@@ -105,6 +106,7 @@ const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
     {"GenericOptimized", ops::builtin::Register_CONVOLUTION_GENERIC_OPT()},
     {"MultithreadedOptimized",
      ops::builtin::Register_CONVOLUTION_MULTITHREADED_OPT()},
+    {"CblasOptimized", ops::builtin::Register_CONVOLUTION_CBLAS_OPT()},
 });
 
 class ConvolutionOpTest : public SingleOpTest {
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index de635cfb4a..288f1f8bbc 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -170,6 +170,8 @@ cc_library(
 cc_library(
     name = "optimized",
     hdrs = [
+        "optimized/cblas_conv.h",
+        "optimized/cblas_reference.h",
         "optimized/eigen_spatial_convolutions.h",
         "optimized/eigen_tensor_reduced_instantiations_oss.h",
         "optimized/multithreaded_conv.h",
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
new file mode 100644
index 0000000000..a9b6663122
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
+
+// The Conv implementation based on CBLAS interface. This is only used on iOS
+// for now, utilizing Apple's Accelerate framework.
+
+#if defined(__APPLE__)
+#include <Accelerate/Accelerate.h>
+#else
+#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h"
+#endif  // __APPLE__
+
+#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+
+namespace tflite {
+namespace cblas_ops {
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+                 const float* filter_data, const Dims<4>& filter_dims,
+                 const float* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, float output_activation_min,
+                 float output_activation_max, float* output_data,
+                 const Dims<4>& output_dims, float* im2col_data,
+                 const Dims<4>& im2col_dims) {
+  gemmlowp::ScopedProfilingLabel label("Conv/cblas");
+
+  const float* gemm_input_data = nullptr;
+  const Dims<4>* gemm_input_dims = nullptr;
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+                           filter_width != 1 || filter_height != 1;
+  if (need_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    optimized_ops::Im2col(input_data, input_dims, stride_width, stride_height,
+                          pad_width, pad_height, filter_height, filter_width, 0,
+                          im2col_data, im2col_dims);
+    gemm_input_data = im2col_data;
+    gemm_input_dims = &im2col_dims;
+  } else {
+    TFLITE_DCHECK(!im2col_data);
+    gemm_input_data = input_data;
+    gemm_input_dims = &input_dims;
+  }
+
+  // The following code computes matrix multiplication c = a * transponse(b)
+  // with CBLAS, where:
+  // * `a` is a matrix with dimensions (m, k).
+  // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+  // * `c` is a matrix with dimensions (m, n).
+  // The naming of variables are aligned with CBLAS specification here.
+  const float* a = gemm_input_data;
+  const float* b = filter_data;
+  float* c = output_data;
+  int m = gemm_input_dims->sizes[1] * gemm_input_dims->sizes[2] *
+          gemm_input_dims->sizes[3];
+  int n = output_dims.sizes[0];
+  int k = gemm_input_dims->sizes[0];
+  // The stride of matrix a, b and c respectively.
+  int stride_a = k;
+  int stride_b = k;
+  int stride_c = n;
+
+  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
+              stride_a, b, stride_b, 0.0f, c, stride_c);
+
+  optimized_ops::AddBiasAndEvalActivationFunction(
+      bias_data, bias_dims, output_data, output_dims, output_activation_min,
+      output_activation_max);
+}
+
+}  // namespace cblas_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
new file mode 100644
index 0000000000..6578915743
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+
+// The reference implementation for a small subset of CBLAS interface.
+// This is only used for testing CBLAS implementation, and should never be used
+// in production code.
+
+namespace tflite {
+namespace cblas_ops {
+
+// The following code follows the original CBLAS specification, and it might
+// conflict with the TensorFlow naming convention.
+// TODO(ycling): Find another way to test CBLAS with bazel, without writing
+// a reference implementation by ourselves.
+enum CBLAS_ORDER { CblasRowMajor = 0, CblasColMajor = 1 };
+
+enum CBLAS_TRANSPOSE { CblasNoTrans = 0, CblasTrans = 1, CblasConjTrans = 2 };
+
+// A reference implementation for matrix multiplication.
+// The following code computes, c = a * transponse(b) matrix multiplication
+// with CBLAS, where:
+// * `a` is a matrix with dimensions (m, k).
+// * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+// * `c` is a matrix with dimensions (m, n).
+// The naming of variables is aligned with CBLAS specification here.
+void cblas_sgemm(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE trans_a,
+                 const enum CBLAS_TRANSPOSE trans_b, const int m, const int n,
+                 const int k, const float alpha, const float *a,
+                 const int stride_a, const float *b, const int stride_b,
+                 const float beta, float *c, const int stride_c) {
+  TFLITE_DCHECK(order == CblasRowMajor);
+  TFLITE_DCHECK(trans_a == CblasNoTrans);
+  TFLITE_DCHECK(trans_b == CblasTrans);
+  for (int row = 0; row < m; ++row) {
+    for (int col = 0; col < n; ++col) {
+      float value = beta * c[stride_c * row + col];
+      for (int idx = 0; idx < k; ++idx) {
+        value += alpha * a[stride_a * row + idx] * b[stride_b * col + idx];
+      }
+      c[stride_c * row + col] = value;
+    }
+  }
+}
+
+}  // namespace cblas_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
-- 
GitLab


From 9332ce7e8d394f3e40b7def72da7b1f761db2803 Mon Sep 17 00:00:00 2001
From: Mingsheng Hong <hongm@google.com>
Date: Fri, 2 Feb 2018 13:53:08 -0800
Subject: [PATCH 599/966] A few misc tweaks to TFE APIs.

PiperOrigin-RevId: 184329345
---
 tensorflow/c/eager/c_api.cc         | 2 +-
 tensorflow/c/eager/c_api_internal.h | 2 ++
 tensorflow/c/eager/runtime.cc       | 9 ++++-----
 tensorflow/c/eager/runtime.h        | 2 +-
 tensorflow/c/eager/runtime_test.cc  | 6 +++---
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index fd6cecd77b..d5b9bff505 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -290,7 +290,7 @@ TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
     return TF_ATTR_INT;  // The compiler requires that we return something.
   }
   status->status =
-      tensorflow::AttrTypeByName(op->attr_types, attr_name, &ret, is_list);
+      tensorflow::AttrTypeByName(*op->attr_types, attr_name, &ret, is_list);
   return ret;
 }
 
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index dda68471a8..f2abffb7bc 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -107,6 +107,8 @@ struct TFE_TensorHandle {
 };
 
 struct TFE_Op {
+  // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a
+  // primitive operation.
   TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t)
       : ctx(ctx), name(op), attrs(op), attr_types(t), device(nullptr) {}
 
diff --git a/tensorflow/c/eager/runtime.cc b/tensorflow/c/eager/runtime.cc
index 3a9951e14d..12abfcba2f 100644
--- a/tensorflow/c/eager/runtime.cc
+++ b/tensorflow/c/eager/runtime.cc
@@ -86,10 +86,9 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) {
   return Status::OK();
 }
 
-Status AttrTypeByName(const AttrTypeMap* m, const string& attr_name,
+Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
                       TF_AttrType* out, unsigned char* is_list) {
-  CHECK(m);
-  auto* t = gtl::FindOrNull(*m, attr_name);
+  auto* t = gtl::FindOrNull(m, attr_name);
   if (t == nullptr) {
     return errors::InvalidArgument("Attribute '", attr_name,
                                    "' does not exist for this operation");
@@ -173,14 +172,14 @@ void CombineUnordered(const tensorflow::Fprint128& a,
   b->high64 += a.high64;
 }
 
-inline tensorflow::Fprint128 CacheKeyHelper(const StringPiece& s,
+inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s,
                                             const tensorflow::Fprint128& b) {
   // TODO(agarwal): avoid ToString().
   tensorflow::Fprint128 a = tensorflow::Fingerprint128(s.ToString());
   return FingerprintCat128(a, b);
 }
 
-inline tensorflow::Fprint128 CacheKeyHelper(const StringPiece& s, uint64 b) {
+inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s, uint64 b) {
   return CacheKeyHelper(s, {b, b});
 }
 
diff --git a/tensorflow/c/eager/runtime.h b/tensorflow/c/eager/runtime.h
index e28a416e67..4d20b5244a 100644
--- a/tensorflow/c/eager/runtime.h
+++ b/tensorflow/c/eager/runtime.h
@@ -43,7 +43,7 @@ typedef std::unordered_map<string, uint32> AttrTypeMap;
 Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out);
 
 // Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
-Status AttrTypeByName(const AttrTypeMap* m, const string& attr_name,
+Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
                       TF_AttrType* out, unsigned char* is_list);
 
 // KernelAndDevice::Init needs a NodeDef only to pass the attribute map through.
diff --git a/tensorflow/c/eager/runtime_test.cc b/tensorflow/c/eager/runtime_test.cc
index 2ccca66f67..643153058c 100644
--- a/tensorflow/c/eager/runtime_test.cc
+++ b/tensorflow/c/eager/runtime_test.cc
@@ -63,17 +63,17 @@ TEST(AttrTypeMap, Lookup) {
 
   TF_AttrType t;
   unsigned char is_list = 1;
-  s = AttrTypeByName(m, "ThisAttribyteCannotPossiblyExist", &t, &is_list);
+  s = AttrTypeByName(*m, "ThisAttribyteCannotPossiblyExist", &t, &is_list);
   EXPECT_FALSE(s.ok());
   EXPECT_NE(is_list, 0);
-  s = AttrTypeByName(m, "transpose_a", &t, &is_list);
+  s = AttrTypeByName(*m, "transpose_a", &t, &is_list);
   ASSERT_TRUE(s.ok()) << s;
   EXPECT_EQ(TF_ATTR_BOOL, t);
   EXPECT_EQ(is_list, 0);
 
   s = AttrTypeMapForOp("Squeeze", &m);
   ASSERT_TRUE(s.ok()) << s;
-  s = AttrTypeByName(m, "squeeze_dims", &t, &is_list);
+  s = AttrTypeByName(*m, "squeeze_dims", &t, &is_list);
   ASSERT_TRUE(s.ok()) << s;
   EXPECT_EQ(TF_ATTR_INT, t);
   EXPECT_NE(is_list, 0);
-- 
GitLab


From af4782345cc4b800d4d7d7918e74061f162a7c19 Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Fri, 2 Feb 2018 14:08:35 -0800
Subject: [PATCH 600/966] [update]   removing commented code & unused code

---
 tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc | 1 -
 tensorflow/contrib/tensorrt/segment/segment.cc       | 9 ---------
 2 files changed, 10 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 080e246458..50c9e64deb 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "cuda/include/cuda_runtime_api.h"
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/core/platform/logging.h"
-//#include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index c9d3840606..1672226643 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -223,15 +223,6 @@ tensorflow::Status SegmentGraph(
     }
   }
 
-  // Cleanup the graph to remove disconnected nodes before outputting
-  if (VLOG_IS_ON(2)) {
-    for (tensorflow::Node* node : graph.nodes()) {
-      if ((node->in_edges().size() == 0) && (node->out_edges().size() == 0)) {
-        graph.RemoveNode(node);
-      }
-    }
-  }
-
   // Convert the segments into the expected return format
   for (const auto& itr : sg_map) {
     const auto& segment_node_names = itr.second;
-- 
GitLab


From cbda8a9c36aeaa2b98c17019e9743134eecc89c1 Mon Sep 17 00:00:00 2001
From: Tatiana Shpeisman <shpeisman@google.com>
Date: Fri, 2 Feb 2018 14:14:13 -0800
Subject: [PATCH 601/966] Changed minimal required version of bleach Python
 package from 1.5 to 2.0. bleach 1.5 causes problems with Jupyter as reported
 in #16424

PiperOrigin-RevId: 184332663
---
 tensorflow/tools/ci_build/install/install_pip_packages.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 71744c04f2..d406b83a62 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -43,8 +43,8 @@ pip2 install --upgrade werkzeug==0.11.10
 pip3 install --upgrade werkzeug==0.11.10
 
 # Install bleach. html5lib will be picked up as a dependency.
-pip2 install --upgrade bleach==1.5.0
-pip3 install --upgrade bleach==1.5.0
+pip2 install --upgrade bleach==2.0.0
+pip3 install --upgrade bleach==2.0.0
 
 # Install markdown.
 pip2 install --upgrade markdown==2.6.8
-- 
GitLab


From 1d172a20f63fbcf33c75a7793374e11df1a9db1a Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Fri, 2 Feb 2018 14:30:50 -0800
Subject: [PATCH 602/966] [TF RNN] Small optimization to rnn: only calculate
 copy_cond once.

PiperOrigin-RevId: 184335231
---
 tensorflow/python/ops/rnn.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index a10e1963d1..24c6f64f0a 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -171,11 +171,11 @@ def _rnn_step(
   return (final_output, final_state)
 
   Args:
-    time: Python int, the current time step
-    sequence_length: int32 `Tensor` vector of size [batch_size]
-    min_sequence_length: int32 `Tensor` scalar, min of sequence_length
-    max_sequence_length: int32 `Tensor` scalar, max of sequence_length
-    zero_output: `Tensor` vector of shape [output_size]
+    time: int32 `Tensor` scalar.
+    sequence_length: int32 `Tensor` vector of size [batch_size].
+    min_sequence_length: int32 `Tensor` scalar, min of sequence_length.
+    max_sequence_length: int32 `Tensor` scalar, max of sequence_length.
+    zero_output: `Tensor` vector of shape [output_size].
     state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`,
       or a list/tuple of such tensors.
     call_cell: lambda returning tuple of (new_output, new_state) where
@@ -202,6 +202,9 @@ def _rnn_step(
   flat_state = nest.flatten(state)
   flat_zero_output = nest.flatten(zero_output)
 
+  # Vector describing which batch entries are finished.
+  copy_cond = time >= sequence_length
+
   def _copy_one_through(output, new_output):
     # TensorArray and scalar get passed through.
     if isinstance(output, tensor_array_ops.TensorArray):
@@ -209,7 +212,6 @@ def _rnn_step(
     if output.shape.ndims == 0:
       return new_output
     # Otherwise propagate the old or the new value.
-    copy_cond = (time >= sequence_length)
     with ops.colocate_with(new_output):
       return array_ops.where(copy_cond, output, new_output)
 
-- 
GitLab


From 45fcb8f2d3e24ec52d418b93303f1afa0fac6496 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Fri, 2 Feb 2018 14:36:07 -0800
Subject: [PATCH 603/966] Fixes for PR comments and build failures

---
 tensorflow/contrib/BUILD                      |  5 +-
 tensorflow/contrib/cmake/python_modules.txt   |  2 +
 .../contrib/tensorrt/python/trt_convert.py    |  6 +-
 tensorflow/tensorflow.bzl                     | 55 ++++++++++---------
 third_party/gpus/cuda_configure.bzl           | 14 ++---
 third_party/tensorrt/BUILD.tpl                |  2 -
 6 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 738b74c929..93a0308ddd 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -105,8 +105,9 @@ py_library(
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:util",
-    ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"])
-    + if_tensorrt(["//tensorflow/contrib/tensorrt:init_py"]),
+    ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_tensorrt([
+      "//tensorflow/contrib/tensorrt:init_py",
+    ]),
 
 )
 
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 9ce8b3cc9c..52f37b1e3b 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -405,6 +405,8 @@ tensorflow/contrib/summary
 tensorflow/contrib/tensorboard
 tensorflow/contrib/tensorboard/plugins
 tensorflow/contrib/tensorboard/plugins/projector
+# TODO(sami): Add cmake implementations
+# tensorflow/contrib/tensorrt/python
 tensorflow/contrib/tensor_forest
 tensorflow/contrib/tensor_forest/client
 tensorflow/contrib/tensor_forest/hybrid
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 2d056610cd..5161831282 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -46,11 +46,11 @@ def CreateInferenceGraph(input_graph_def,
   def py2bytes(inp):
     return inp
   def py3bytes(inp):
-    return inp.encode('utf-8',errors='surrogateescape')
+    return inp.encode('utf-8', errors='surrogateescape')
   if _six.PY2:
-    to_bytes=py2bytes
+    to_bytes = py2bytes
   else:
-    to_bytes=py3bytes
+    to_bytes = py3bytes
 
   out_names = []
   for i in outputs:
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 2534c88c18..f78bee5630 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1303,36 +1303,37 @@ def tf_extension_copts():
 # This function attempts to append init_module_name to list of
 # exported functions in version script
 def _append_init_to_versionscript_impl(ctx):
-    modName=ctx.attr.module_name
-    isVS=ctx.attr.is_version_script
-    if isVS:
-        ctx.actions.expand_template(
-            template=ctx.file.template_file,
-            output=ctx.outputs.versionscript,
-            substitutions={
-                "global:":"global:\n     init_%s;\n     PyInit_*;"%(modName),
-            },
-            is_executable=False,
-        )
-    else:
-        ctx.actions.expand_template(
-            template=ctx.file.template_file,
-            output=ctx.outputs.versionscript,
-            substitutions={
-                "*tensorflow*":"*tensorflow*\ninit_%s\nPyInit_*\n"%(modName),
-            },
-            is_executable=False,
-        )
+  mod_name = ctx.attr.module_name
+  if ctx.attr.is_version_script:
+    ctx.actions.expand_template(
+      template=ctx.file.template_file,
+      output=ctx.outputs.versionscript,
+      substitutions={
+        "global:":"global:\n     init_%s;\n     PyInit_*;"%(mod_name),
+      },
+      is_executable=False,
+    )
+  else:
+    ctx.actions.expand_template(
+      template=ctx.file.template_file,
+      output=ctx.outputs.versionscript,
+      substitutions={
+        "*tensorflow*":"*tensorflow*\ninit_%s\nPyInit_*\n"%(mod_name),
+      },
+      is_executable=False,
+    )
 
 
 _append_init_to_versionscript= rule(
-    implementation=_append_init_to_versionscript_impl,
-    attrs={
-        "module_name":attr.string(mandatory=True),
-        "template_file":attr.label(allow_files=True,single_file=True,mandatory=True),
-        "is_version_script":attr.bool(default=True,doc='whether target is a ld version script or exported symbol list',mandatory=False),
-    },
-    outputs={"versionscript":"%{name}.lds"},
+  implementation=_append_init_to_versionscript_impl,
+  attrs={
+    "module_name":attr.string(mandatory=True),
+    "template_file":attr.label(allow_files=True,single_file=True,mandatory=True),
+    "is_version_script":attr.bool(default=True,
+      doc='whether target is a ld version script or exported symbol list',
+      mandatory=False),
+  },
+  outputs={"versionscript":"%{name}.lds"},
 )
 
 def tf_py_wrap_cc(name,
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 7504154735..47cc4a5e69 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -358,8 +358,8 @@ def find_cuda_define(repository_ctx, header_dir, header_file, define):
   if not h_path.exists:
     auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
   result = repository_ctx.execute(
-    # Grep one more lines as some #defines are splitted into two lines.
-    ["grep", "--color=never", "-A1", "-E", define, str(h_path)])
+      # Grep one more lines as some #defines are splitted into two lines.
+      ["grep", "--color=never", "-A1", "-E", define, str(h_path)])
   if result.stderr:
     auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
 
@@ -368,14 +368,14 @@ def find_cuda_define(repository_ctx, header_dir, header_file, define):
     auto_configure_fail("Cannot find line containing '%s' in %s" %
                         (define, h_path))
   #split results to lines
-  lines=result.stdout.split('\n')
-  lenLines=len(lines)
+  lines = result.stdout.split('\n')
+  lenLines = len(lines)
   for l in range(lenLines):
-    line=lines[l]
+    line = lines[l]
     if define in line: # find the line with define
-      version=line
+      version = line
       if l != lenLines-1 and line[-1] == '\\': # add next line, if multiline
-        version=version[:-1]+lines[l+1]
+        version = version[:-1] + lines[l+1]
       break
   #remove any comments
   version = version.split("//")[0]
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
index dc7fe0c8c8..354f17cda5 100644
--- a/third_party/tensorrt/BUILD.tpl
+++ b/third_party/tensorrt/BUILD.tpl
@@ -5,8 +5,6 @@ licenses(["notice"])
 
 exports_files(["LICENSE"])
 
-load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda")
-
 package(default_visibility = ["//visibility:public"])
 
 cc_library(
-- 
GitLab


From 137ddf01f370028cb6014916fc409b1840f4e356 Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Fri, 2 Feb 2018 15:00:17 -0800
Subject: [PATCH 604/966] Adds batch inference support on TPU with
 TPUEstimator.predict.

PiperOrigin-RevId: 184339842
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 627 +++++++++++++++---
 1 file changed, 541 insertions(+), 86 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index c7008533f3..b5082fc823 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -47,6 +47,7 @@ from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -170,10 +171,12 @@ class _TPUContext(object):
   ```
   """
 
-  def __init__(self, config, train_batch_size, eval_batch_size, use_tpu):
+  def __init__(self, config, train_batch_size, eval_batch_size,
+               predict_batch_size, use_tpu):
     self._config = config
     self._train_batch_size = train_batch_size
     self._eval_batch_size = eval_batch_size
+    self._predict_batch_size = predict_batch_size
     self._use_tpu = use_tpu
     self._num_shards_or_none = self._config.tpu_config.num_shards
     self._mode = None
@@ -218,31 +221,66 @@ class _TPUContext(object):
     return (self._mode == model_fn_lib.ModeKeys.TRAIN and
             not self._config.tpu_config.per_host_input_for_training)
 
-  def is_running_on_cpu(self):
-    """Determines whether the input_fn and model_fn should be invoked on CPU."""
+  def is_running_on_cpu(self, is_export_mode=False):
+    """Determines whether the input_fn and model_fn should be invoked on CPU.
+
+    Args:
+      is_export_mode: Indicates whether the current mode is for exporting the
+        model, when mode == PREDICT. Only with this bool, we could
+        tell whether user is calling the Estimator.predict or
+        Estimator.export_savedmodel, which are running on TPU and CPU
+        respectively. Parent class Estimator does not distingush these two.
+
+    Returns:
+      bool, whether current input_fn or model_fn should be running on CPU.
+
+    Raises:
+      ValueError: any configuration is invalid.
+    """
     mode = self._assert_mode()
-    return (not self._use_tpu) or mode == model_fn_lib.ModeKeys.PREDICT
+
+    if not self._use_tpu:
+      return True
+
+    if mode != model_fn_lib.ModeKeys.PREDICT:
+      return False
+
+    # There are actually 2 use cases when running with mode.PREDICT: prediction
+    # and saving the model.  We run actual predictions on the TPU, but
+    # model export is run on the CPU.
+    if is_export_mode:
+      return True
+
+    if self._predict_batch_size is None:
+      raise ValueError(
+          'predict_batch_size in TPUEstimator constructor should not be '
+          '`None` if .predict is running on TPU.')
+    if self.num_hosts > 1:
+      raise ValueError(
+          'TPUEstimator.predict should be running on single host.')
+
+    return False
 
   @property
   def global_batch_size(self):
     mode = self._assert_mode()
-    return (self._train_batch_size
-            if mode == model_fn_lib.ModeKeys.TRAIN else self._eval_batch_size)
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      return self._train_batch_size
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      return self._eval_batch_size
+    elif mode == model_fn_lib.ModeKeys.PREDICT:
+      return self._predict_batch_size
+    else:
+      return None
 
   @property
   def batch_size_for_input_fn(self):
     """Returns the shard batch size for `input_fn`."""
-    mode = self._assert_mode()
+    global_batch_size = self.global_batch_size
+
     if self.is_running_on_cpu():
-      if mode == model_fn_lib.ModeKeys.TRAIN:
-        return self._train_batch_size
-      if mode == model_fn_lib.ModeKeys.EVAL:
-        return self._eval_batch_size
-      return None
+      return global_batch_size
 
-    global_batch_size = (
-        self._train_batch_size
-        if mode == model_fn_lib.ModeKeys.TRAIN else self._eval_batch_size)
     # On TPU
     if self.is_input_sharded_per_core():
       return global_batch_size // self.num_cores
@@ -252,19 +290,13 @@ class _TPUContext(object):
   @property
   def batch_size_for_model_fn(self):
     """Returns the shard batch size for `model_fn`."""
-    mode = self._assert_mode()
+    global_batch_size = self.global_batch_size
+
     if self.is_running_on_cpu():
-      if mode == model_fn_lib.ModeKeys.TRAIN:
-        return self._train_batch_size
-      if mode == model_fn_lib.ModeKeys.EVAL:
-        return self._eval_batch_size
-      return None
+      return global_batch_size
 
     # On TPU. always sharded per core.
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      return self._train_batch_size // self.num_cores
-    else:
-      return self._eval_batch_size // self.num_cores
+    return global_batch_size // self.num_cores
 
   @property
   def master_job(self):
@@ -506,6 +538,22 @@ class _OpQueueContext(object):
     self._thread.join()
 
 
+class _OpSignalOnceQueueContext(_OpQueueContext):
+  """Manages work queue and thread for a infeed/outfeed thread.
+
+  This subclass only signals once.
+  """
+
+  def __init__(self, name, target, args):
+    super(_OpSignalOnceQueueContext, self).__init__(name, target, args)
+    self._has_signaled = False
+
+  def send_next_batch_signal(self, iterations):
+    if not self._has_signaled:
+      self._queue.put(iterations)
+      self._has_signaled = True
+
+
 class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
   """A Session hook setting up the TPU initialization, infeed, and outfeed.
 
@@ -633,13 +681,16 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     except Exception as e:  # pylint: disable=broad-except
       self._log_error(session, e)
 
+  def _create_infeed_controller(self, name, target, args):
+    return _OpQueueContext(name=name, target=target, args=args)
+
   def after_create_session(self, session, coord):
     logging.info('Init TPU system')
     session.run(self._init_ops,
                 options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
 
     logging.info('Start infeed thread controller')
-    self._infeed_controller = _OpQueueContext(
+    self._infeed_controller = self._create_infeed_controller(
         name='InfeedController', target=self._run_infeed, args=(session,))
 
     logging.info('Start outfeed thread controller')
@@ -677,6 +728,16 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     session.run(self._finalize_ops)
 
 
+class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
+
+  def __init__(self, ctx, enqueue_ops, dequeue_ops):
+    super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
+        ctx, enqueue_ops, dequeue_ops, run_infeed_loop_on_coordinator=False)
+
+  def _create_infeed_controller(self, name, target, args):
+    return _OpSignalOnceQueueContext(name=name, target=target, args=args)
+
+
 class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
   """Hook that requests stop at a specified step.
 
@@ -764,6 +825,47 @@ class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
     self._iterations_per_loop_var.load(self._num_steps, session=session)
 
 
+class _StoppingPredictHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop according to the stopping signal in prediction."""
+
+  def __init__(self, scalar_stopping_signal):
+    self._scalar_stopping_signal = scalar_stopping_signal
+
+  def begin(self):
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  def after_create_session(self, session, coord):
+    # This is not necessary as we do not run infeed enqueue and outfeed dequeue
+    # in side threads for prediction model. But it makes the
+    # TPUInfeedOutfeedSessionHook prints nice message.
+    self._iterations_per_loop_var.load(1, session=session)
+
+  def before_run(self, run_context):
+    return session_run_hook.SessionRunArgs(self._scalar_stopping_signal)
+
+  def after_run(self, run_context, run_values):
+    _ = run_context
+    scalar_stopping_signal = run_values.results
+    if _StopSignals.should_stop(scalar_stopping_signal):
+      # NOTE(xiejw): In prediction, stopping signals are inserted for each
+      # batch. And we append one more batch to signal the system it should stop.
+      # The data flow might look like
+      #
+      #  batch   0: images, labels, stop = 0  (user provideded)
+      #  batch   1: images, labels, stop = 0  (user provideded)
+      #  ...
+      #  batch  99: images, labels, stop = 0  (user provideded)
+      #  batch 100: images, labels, stop = 1  (TPUEstimator appended)
+      #
+      # where the final batch (id = 100) is appended by TPUEstimator, so we
+      # should drop it before returning the predictions to user.
+      # To achieve that, we throw the OutOfRangeError in after_run. Once
+      # Monitored Session sees this error in SessionRunHook.after_run, the
+      # "current" prediciton, i.e., batch with id=100, will be discarded
+      # immediately
+      raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
+
+
 def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn,
                                               inputs_structure_recorder):
   """Generates infeed enqueue ops for per-core input_fn on a single host."""
@@ -815,6 +917,14 @@ def generate_per_host_enqueue_ops_fn_for_host(
     inputs = _Inputs.from_input_fn(input_fn())
 
     is_dataset = inputs.is_dataset
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      if not is_dataset:
+        raise TypeError(
+            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+            '`features` and `labels`.')
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset, batch_size=ctx.batch_size_for_input_fn)
+
     if is_dataset:
       hooks.append(inputs.dataset_initializer_hook())
 
@@ -825,11 +935,13 @@ def generate_per_host_enqueue_ops_fn_for_host(
       # dataset, it is initialized and the features and labels extracted via
       # `dataset.iterator.get_next()`
       features, labels = inputs.features_and_labels()
+      signals = inputs.signals()
 
-      inputs_structure_recorder.validate_and_record_structure(features, labels)
+      inputs_structure_recorder.validate_and_record_structure(
+          features, labels, signals)
       unsharded_tensor_list = (
           inputs_structure_recorder.flatten_features_and_labels(
-              features, labels))
+              features, labels, signals))
 
       infeed_queue = tpu_feed.InfeedQueue(
           tuple_types=[t.dtype for t in unsharded_tensor_list],
@@ -841,7 +953,13 @@ def generate_per_host_enqueue_ops_fn_for_host(
       per_host_enqueue_ops = (
           infeed_queue.split_inputs_and_generate_enqueue_ops(
               unsharded_tensor_list, placement_function=lambda x: device))
-      return per_host_enqueue_ops
+      if signals is None:
+        return per_host_enqueue_ops
+      else:
+        return {
+            'ops': per_host_enqueue_ops,
+            'signals': signals,
+        }
 
   return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
 
@@ -883,6 +1001,7 @@ class _InputPipeline(object):
       self._feature_names = []
       self._label_names = []
       self._has_labels = False
+      self._signals_helper = None
 
       # Internal state.
       self._initialized = False
@@ -890,7 +1009,7 @@ class _InputPipeline(object):
     def has_labels(self):
       return self._has_labels
 
-    def validate_and_record_structure(self, features, labels):
+    def validate_and_record_structure(self, features, labels, signals=None):
       """Validates and records the structure of features` and `labels`."""
 
       def _extract_key_names(tensor_or_dict):
@@ -903,6 +1022,10 @@ class _InputPipeline(object):
       feature_names = _extract_key_names(features)
       label_names = _extract_key_names(labels)
 
+      if signals is not None and self._signals_helper is None:
+        # Record signals helper.
+        self._signals_helper = _SignalsHelper(signals)
+
       if self._initialized:
         # Verify the structure is same. The following should never happen.
         assert feature_names == self._feature_names, 'feature keys mismatched'
@@ -915,7 +1038,7 @@ class _InputPipeline(object):
         self._label_names = label_names
         self._has_labels = has_labels
 
-    def flatten_features_and_labels(self, features, labels):
+    def flatten_features_and_labels(self, features, labels, signals=None):
       """Flattens the `features` and `labels` to a single tensor list."""
       flattened_inputs = []
       if self._feature_names:
@@ -931,6 +1054,9 @@ class _InputPipeline(object):
           flattened_inputs.extend([labels[name] for name in self._label_names])
         else:
           flattened_inputs.append(labels)
+
+      if signals is not None:
+        flattened_inputs.extend(_SignalsHelper.as_tensor_list(signals))
       return flattened_inputs
 
     def unflatten_features_and_labels(self, flattened_inputs):
@@ -956,7 +1082,11 @@ class _InputPipeline(object):
       else:
         expected_num_labels = 0
 
-      expected_num_tensors = expected_num_features + expected_num_labels
+      expected_num_signals = (
+          self._signals_helper.num_signals if self._signals_helper else 0)
+
+      expected_num_tensors = (
+          expected_num_features + expected_num_labels + expected_num_signals)
 
       if expected_num_tensors != len(flattened_inputs):
         raise ValueError(
@@ -973,13 +1103,20 @@ class _InputPipeline(object):
       if expected_num_labels == 0:
         unflattened_label = None
       elif self._label_names:
-        unflattened_label = dict(
-            zip(self._label_names, flattened_inputs[expected_num_features:]))
+        label_list = flattened_inputs[
+            expected_num_features:expected_num_features + expected_num_labels]
+        unflattened_label = dict(zip(self._label_names, label_list))
       else:
         # Single tensor case.
         unflattened_label = flattened_inputs[expected_num_features]
 
-      return _Inputs(unflattened_features, unflattened_label)
+      signals = None
+      if expected_num_signals != 0:
+        tensor_list_for_signals = flattened_inputs[
+            expected_num_features + expected_num_labels:]
+        signals = self._signals_helper.unflatten(tensor_list_for_signals)
+
+      return _Inputs(unflattened_features, unflattened_label, signals=signals)
 
   def __init__(self, input_fn, batch_axis, ctx):
     """Constructor.
@@ -1078,9 +1215,12 @@ class _InputPipeline(object):
             # slow compared to the previous case.
             if is_dataset:
               run_infeed_loop_on_coordinator = False
+              wrap_fn = (
+                  _wrap_computation_in_while_loop
+                  if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+                  _wrap_computation_in_while_loop_with_stopping_signals)
               enqueue_ops.append(
-                  _wrap_computation_in_while_loop(
-                      device=host_device, op_fn=enqueue_ops_fn))
+                  wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
             else:
               enqueue_ops.append(enqueue_ops_fn())
             infeed_queues.append(captured_infeed_queue.get())
@@ -1145,7 +1285,8 @@ class _ModelFnWrapper(object):
         infeed dequeue channel.
 
     Returns:
-      A Fn representing the train step for TPU.
+      A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn
+      representing the train step for TPU.
     """
 
     host_call = _OutfeedHostCall(self._ctx)
@@ -1198,8 +1339,8 @@ class _ModelFnWrapper(object):
         infeed dequeue channel.
 
     Returns:
-      A tuple of eval_fn and eval_metrics. The eval_fn representing the eval
-      step for TPU. and eval_metrics is an `_OutfeedHostCall` instance.
+      A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn
+      representing the eval step for TPU.
     """
     host_calls = _OutfeedHostCall(self._ctx)
     captured_scaffold_fn = _CapturedObject()
@@ -1228,7 +1369,55 @@ class _ModelFnWrapper(object):
 
     return eval_step, host_calls, captured_scaffold_fn
 
-  def _call_model_fn(self, features, labels):
+  def convert_to_single_tpu_predict_step(self, dequeue_fn):
+    """Converts user provided model_fn` as a single predict step on TPU.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
+        infeed dequeue channel.
+
+    Returns:
+      A tuple of predict_fn, host_calls, and captured scaffold_fn. The
+      predict_fn representing the predict step for TPU.
+    """
+    host_calls = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+
+    def predict_step(unused_scalar_stopping_signal):
+      """Evaluation step function for use inside a while loop."""
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      stopping_signals = inputs.signals()
+
+      assert stopping_signals is not None, (
+          'Internal Error: `signals` is missing.')
+
+      tpu_estimator_spec = self._call_model_fn(
+          features, labels, is_export_mode=False)
+      if not isinstance(tpu_estimator_spec, TPUEstimatorSpec):
+        raise RuntimeError(
+            'estimator_spec used by TPU prediction must have type'
+            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
+
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      to_record = {}
+      identity_fn = lambda **kwargs: kwargs
+      # TODO(xiejw): Adds validation for prediction dictionrary.
+      # TODO(xiejw): Adds support for single tensor as predictions.
+      if not isinstance(tpu_estimator_spec.predictions, dict):
+        raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.')
+      to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
+      to_record['signals'] = [identity_fn, stopping_signals]
+      if tpu_estimator_spec.host_call is not None:
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
+
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
+        return _StopSignals.as_scalar_stopping_signal(stopping_signals)
+
+    return predict_step, host_calls, captured_scaffold_fn
+
+  def _call_model_fn(self, features, labels, is_export_mode=True):
     """Calls the model_fn with required parameters."""
     model_fn_args = util.fn_args(self._model_fn)
     kwargs = {}
@@ -1259,7 +1448,7 @@ class _ModelFnWrapper(object):
       params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
 
     estimator_spec = self._model_fn(features=features, **kwargs)
-    if (self._ctx.is_running_on_cpu() and
+    if (self._ctx.is_running_on_cpu(is_export_mode) and
         isinstance(estimator_spec, TPUEstimatorSpec)):
       # The estimator_spec will be passed to `Estimator` directly, which expects
       # type `EstimatorSpec`.
@@ -1614,6 +1803,7 @@ class TPUEstimator(estimator_lib.Estimator):
                use_tpu=True,
                train_batch_size=None,
                eval_batch_size=None,
+               predict_batch_size=None,
                batch_axis=None):
     """Constructs an `TPUEstimator` instance.
 
@@ -1639,7 +1829,9 @@ class TPUEstimator(estimator_lib.Estimator):
         size, as params['batch_size'], when calling `input_fn` and `model_fn`.
         Cannot be `None` if `use_tpu` is `True`. Must be divisible by
         `config.tpu_config.num_shards`.
-      eval_batch_size: An int representing the global training batch size.
+      eval_batch_size: An int representing evaluation batch size.
+        Must be divisible by `config.tpu_config.num_shards`.
+      predict_batch_size: An int representing the prediction batch size.
         Must be divisible by `config.tpu_config.num_shards`.
       batch_axis: A python tuple of int values describing how each tensor
         produced by the Estimator `input_fn` should be split across the TPU
@@ -1689,6 +1881,16 @@ class TPUEstimator(estimator_lib.Estimator):
               'eval batch size {} must be divisible by number of shards {}'
               .format(eval_batch_size, config.tpu_config.num_shards))
 
+      if predict_batch_size is not None:
+        if not isinstance(predict_batch_size, int):
+          raise ValueError('`predict_batch_size` must be an int')
+        if predict_batch_size < 1:
+          raise ValueError('`predict_batch_size` must be positive')
+        if predict_batch_size % config.tpu_config.num_shards != 0:
+          raise ValueError(
+              'predict batch size {} must be divisible by number of shards {}'
+              .format(predict_batch_size, config.tpu_config.num_shards))
+
     # Verifies the model_fn signature according to Estimator framework.
     estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
     # We cannot store config and params in this constructor as parent
@@ -1708,7 +1910,7 @@ class TPUEstimator(estimator_lib.Estimator):
 
     # All properties passed to _TPUContext are immutable.
     self._ctx = _TPUContext(self._config, train_batch_size, eval_batch_size,
-                            use_tpu)
+                            predict_batch_size, use_tpu)
 
   def _create_global_step(self, graph):
     """Creates a global step suitable for TPUs.
@@ -1804,7 +2006,9 @@ class TPUEstimator(estimator_lib.Estimator):
       if batch_size_for_input_fn is not None:
         kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn
 
-      if ctx.is_running_on_cpu():
+      # For export_savedmodel, input_fn is never passed to Estimator. So,
+      # `is_export_mode` must be False.
+      if ctx.is_running_on_cpu(is_export_mode=False):
         with ops.device('/device:CPU:0'):
           return input_fn(**kwargs)
 
@@ -1831,8 +2035,13 @@ class TPUEstimator(estimator_lib.Estimator):
       with self._ctx.with_mode(mode) as ctx:
         model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
 
-        # TODO(jhseu): Move to PREDICT to TPU.
-        if ctx.is_running_on_cpu():
+        # For export_savedmodel, input_fn is never passed to Estimator. So,
+        # if features is callable, it means it is the input_fn passed by
+        # TPUEstimator._call_input_fn. Then we can know if the mode == PREDICT,
+        # it implies, it is the .predict API, not export_savedmodel API.
+        is_export_mode = not callable(features)
+
+        if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
           logging.info('Running %s on CPU', mode)
           return model_fn_wrapper.call_without_tpu(features, labels)
 
@@ -1881,53 +2090,114 @@ class TPUEstimator(estimator_lib.Estimator):
               train_op=control_flow_ops.group(*update_ops),
               scaffold=scaffold)
 
-        # Now eval.
-        total_loss, host_calls, scaffold = _eval_on_tpu_system(
+        if mode == model_fn_lib.ModeKeys.EVAL:
+          total_loss, host_calls, scaffold = _eval_on_tpu_system(
+              ctx, model_fn_wrapper, dequeue_fn)
+          iterations_per_loop_var = _create_or_get_iterations_per_loop()
+          mean_loss = math_ops.div(total_loss,
+                                   math_ops.cast(
+                                       iterations_per_loop_var,
+                                       dtype=total_loss.dtype))
+
+          # Creates a dummy metric update_op for all metrics. Estimator expects
+          # all metrics in eval_metric_ops have update_op and calls them one by
+          # one. The real metric update_ops are invoked in a separated thread.
+          # So, here give Estimator the dummy op for all metrics.
+          with ops.control_dependencies([mean_loss]):
+            # After TPU evaluation computation is done (the mean_loss tensor),
+            # reads all variables back from TPU and updates the eval step
+            # counter properly
+            internal_ops_to_run = _sync_variables_ops()
+            internal_ops_to_run.append(
+                _increase_eval_step_op(iterations_per_loop_var))
+            with ops.control_dependencies(internal_ops_to_run):
+              dummy_update_op = control_flow_ops.no_op()
+
+          host_call_ret = host_calls.create_tpu_hostcall()
+          eval_metric_ops = {}
+          eval_update_ops = []
+          for k, v in host_call_ret['eval_metrics'].items():
+            eval_metric_ops[k] = (v[0], dummy_update_op)
+            eval_update_ops.append(v[1])
+
+          if 'host_call' not in host_call_ret:
+            host_ops = []
+          else:
+            host_ops = host_call_ret['host_call']
+          hooks = [
+              TPUInfeedOutfeedSessionHook(
+                  ctx,
+                  enqueue_ops,
+                  eval_update_ops + host_ops,
+                  run_infeed_loop_on_coordinator=(
+                      run_infeed_loop_on_coordinator)),
+          ] + input_hooks
+
+          return model_fn_lib.EstimatorSpec(
+              mode,
+              loss=mean_loss,
+              evaluation_hooks=hooks,
+              eval_metric_ops=eval_metric_ops,
+              scaffold=scaffold)
+
+        # Predict
+        assert mode == model_fn_lib.ModeKeys.PREDICT
+
+        dummy_predict_op, host_calls, scaffold = _predict_on_tpu_system(
             ctx, model_fn_wrapper, dequeue_fn)
-        iterations_per_loop_var = _create_or_get_iterations_per_loop()
-        mean_loss = math_ops.div(total_loss,
-                                 math_ops.cast(
-                                     iterations_per_loop_var,
-                                     dtype=total_loss.dtype))
-
-        # Creates a dummy metric update_op for all metrics. Estimator expects
-        # all metrics in eval_metric_ops have update_op and calls them one by
-        # one. The real metric update_ops are invoked in a separated thread. So,
-        # here give Estimator the dummy op for all metrics.
-        with ops.control_dependencies([mean_loss]):
-          # After TPU evaluation computation is done (the mean_loss tensor),
-          # reads all variables back from TPU and updates the eval step counter
-          # properly
+        with ops.control_dependencies([dummy_predict_op]):
           internal_ops_to_run = _sync_variables_ops()
-          internal_ops_to_run.append(
-              _increase_eval_step_op(iterations_per_loop_var))
           with ops.control_dependencies(internal_ops_to_run):
-            dummy_update_op = control_flow_ops.no_op()
+            dummy_predict_op = control_flow_ops.no_op()
+
+        # In train and evaluation, the main TPU program is passed to monitored
+        # training session to run. Infeed enqueue and outfeed dequeue are
+        # executed in side threads. This is not the configuration for
+        # prediction mode.
+        #
+        # For prediction, the Estimator executes the EstimatorSpec.predictions
+        # directly and yield the element (via generator) to call site. So, the
+        # outfeed based prediction must be passed to MonitoredSession directly.
+        # Other parts of the TPU execution are organized as follows.
+        #
+        # 1. All outfeed based Tensors must be grouped with predictions Tensors
+        #    to form a single invocation. This avoid the issue we might trigger
+        #    multiple outfeeds incorrectly. To achieve this, `host_call` is
+        #    placed in control_dependencies of `stopping_signals`, and
+        #    `stopping_signals` is passed into _StoppingPredictHook, which sets
+        #    the `stopping_signals` as SessionRunArgs. MonitoredSession merges
+        #    all SessionRunArgs with the fetch in session.run together.
+        #
+        # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue)
+        #    are grouped together. They will be launched once and only once in
+        #    side threads and they quit naturally according to the SAME stopping
+        #    condition.
+        enqueue_ops.append(dummy_predict_op)
 
         host_call_ret = host_calls.create_tpu_hostcall()
-        eval_metric_ops = {}
-        eval_update_ops = []
-        for k, v in host_call_ret['eval_metrics'].items():
-          eval_metric_ops[k] = (v[0], dummy_update_op)
-          eval_update_ops.append(v[1])
-
         if 'host_call' not in host_call_ret:
           host_ops = []
         else:
           host_ops = host_call_ret['host_call']
+
+        predictions = host_call_ret['predictions']
+        stopping_signals = host_call_ret['signals']
+
+        with ops.control_dependencies(host_ops):
+          host_ops = []  # Empty, we do do not need it anymore.
+          scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal(
+              stopping_signals)
+
         hooks = [
-            TPUInfeedOutfeedSessionHook(
-                ctx,
-                enqueue_ops,
-                eval_update_ops + host_ops,
-                run_infeed_loop_on_coordinator=run_infeed_loop_on_coordinator),
+            _StoppingPredictHook(scalar_stopping_signal),
+            TPUInfeedOutfeedSessionHookForPrediction(ctx, enqueue_ops,
+                                                     host_ops),
         ] + input_hooks
 
         return model_fn_lib.EstimatorSpec(
             mode,
-            loss=mean_loss,
-            evaluation_hooks=hooks,
-            eval_metric_ops=eval_metric_ops,
+            prediction_hooks=hooks,
+            predictions=predictions,
             scaffold=scaffold)
 
     return _model_fn
@@ -1981,6 +2251,34 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   return loss, host_call, scaffold
 
 
+def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  num_cores = ctx.num_cores
+
+  single_tpu_predict_step, host_calls, captured_scaffold_fn = (
+      model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn))
+
+  def multi_tpu_predict_steps_on_single_shard():
+
+    def cond(scalar_stopping_signal):
+      return math_ops.logical_not(
+          _StopSignals.should_stop(scalar_stopping_signal))
+
+    inputs = [_StopSignals.NON_STOPPING_SIGNAL]
+    outputs = training_loop.while_loop(
+        cond, single_tpu_predict_step, inputs=inputs, name=b'loop')
+    return outputs
+
+  (dummy_predict_op,) = tpu.shard(
+      multi_tpu_predict_steps_on_single_shard,
+      inputs=[],
+      num_shards=num_cores,
+      outputs_from_all_shards=False)
+
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return dummy_predict_op, host_calls, scaffold
+
+
 def _wrap_computation_in_while_loop(device, op_fn):
   """Wraps the ops generated by `op_fn` in tf.while_loop."""
 
@@ -1999,6 +2297,29 @@ def _wrap_computation_in_while_loop(device, op_fn):
         parallel_iterations=1)
 
 
+def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn):
+  """Wraps the ops generated by `op_fn` in tf.while_loop."""
+
+  def cond(scalar_stopping_signal):
+    return math_ops.logical_not(
+        _StopSignals.should_stop(scalar_stopping_signal))
+
+  def computation(unused_scalar_stopping_signal):
+    return_value = op_fn()
+    execute_ops = return_value['ops']
+    signals = return_value['signals']
+    with ops.control_dependencies(execute_ops):
+      return _StopSignals.as_scalar_stopping_signal(signals)
+
+  # By setting parallel_iterations=1, the parallel execution in while_loop is
+  # basically turned off.
+  with ops.device(device):
+    return control_flow_ops.while_loop(
+        cond,
+        computation, [_StopSignals.NON_STOPPING_SIGNAL],
+        parallel_iterations=1)
+
+
 def _validate_tpu_training_graph():
   """Validate graph before running distributed training.
 
@@ -2091,21 +2412,22 @@ class _CapturingContext(control_flow_ops.ControlFlowContext):
     self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
 
 
-# TODO(xiejw): Extend this to support internal signal.
 class _Inputs(object):
   """A data structure representing the input_fn returned values.
 
   This also supports the returned value from input_fn as `Dataset`.
   """
 
-  def __init__(self, features=None, labels=None, dataset=None):
-    if dataset is not None and (features is not None or labels is not None):
+  def __init__(self, features=None, labels=None, dataset=None, signals=None):
+    if dataset is not None and (features is not None or labels is not None or
+                                signals is not None):
       raise RuntimeError('Internal Error: Either (features and labels) or '
                          'dataset should be provided, not both. Please file '
                          'bug')
 
     self._features = features
     self._labels = labels
+    self._signals = signals
 
     self._dataset = dataset
     self._iterator = None
@@ -2117,11 +2439,16 @@ class _Inputs(object):
       dataset = return_values
       return _Inputs(dataset=dataset)
 
+    features, labels = _Inputs._parse_inputs(return_values)
+    return _Inputs(features, labels)
+
+  @staticmethod
+  def _parse_inputs(return_values):
     if isinstance(return_values, tuple):
       features, labels = return_values
     else:
       features, labels = return_values, None
-    return _Inputs(features, labels)
+    return features, labels
 
   @property
   def is_dataset(self):
@@ -2142,7 +2469,135 @@ class _Inputs(object):
   def features_and_labels(self):
     """Gets `features` and `labels`."""
     if self.is_dataset:
-      return (_Inputs.from_input_fn(
-          self._iterator.get_next()).features_and_labels())
+      return _Inputs._parse_inputs(self._iterator.get_next())
 
     return (self._features, self._labels)
+
+  def signals(self):
+    return self._signals
+
+  @property
+  def dataset(self):
+    return self._dataset
+
+
+# TODO(xiejw): Extend this to support final partial batch.
+class _InputsWithStoppingSignals(_Inputs):
+  """Inputs with `_StopSignals` inserted into the dataset."""
+
+  def __init__(self, dataset, batch_size):
+
+    assert dataset is not None
+
+    user_provided_dataset = dataset.map(
+        _InputsWithStoppingSignals.insert_stopping_signal(
+            stop=False, batch_size=batch_size))
+    final_batch_dataset = dataset.take(1).map(
+        _InputsWithStoppingSignals.insert_stopping_signal(
+            stop=True, batch_size=batch_size))
+    dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
+
+    super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
+    self._current_inputs = None
+
+  def features_and_labels(self):
+    if self._current_inputs is not None:
+      raise RuntimeError(
+          'Internal Error: The previous inputs have not been properly '
+          'consumed. First call features_and_labels, then call signals.')
+
+    inputs_with_signals = self._iterator.get_next()
+    features = inputs_with_signals['features']
+    labels = inputs_with_signals.get('labels')
+
+    self._current_inputs = inputs_with_signals
+    return features, labels
+
+  def signals(self):
+    """Returns the `Signals` from `_Inputs`."""
+    if self._current_inputs is None:
+      raise RuntimeError(
+          'Internal Error: The current inputs have not been properly '
+          'generated. First call features_and_labels, then call signals.')
+    signals = self._current_inputs['signals']
+    self._current_inputs = None
+    return signals
+
+  @staticmethod
+  def insert_stopping_signal(stop, batch_size):
+    """Inserts stopping_signal into dataset via _map_fn.
+
+    Here we change the data structure in the dataset, such that the return value
+    is a dictionary now and `features`, `labels`, and `signals` are three
+    distinguished keys in that dict. This provides a better structure, which
+    eases the process to decompose the inputs (see `features_and_labels`).
+
+    Args:
+      stop: bool, state of current stopping signals.
+      batch_size: int, batch size.
+
+    Returns:
+      A map_fn passed to dataset.map API.
+    """
+
+    def _map_fn(*args):
+      features, labels = _Inputs._parse_inputs(args)
+      new_input_dict = {}
+      new_input_dict['features'] = features
+      if labels is not None:
+        new_input_dict['labels'] = labels
+      new_input_dict['signals'] = _StopSignals(
+          stop=stop, batch_size=batch_size).as_dict()
+      return new_input_dict
+
+    return _map_fn
+
+
+class _StopSignals(object):
+  """Signals class holding all logic to handle TPU stopping condition."""
+
+  NON_STOPPING_SIGNAL = 0.0
+  STOPPING_SIGNAL = 1.0
+
+  def __init__(self, stop, batch_size):
+    self._stop = stop
+    self._batch_size = batch_size
+
+  def as_dict(self):
+    shape = [self._batch_size, 1]
+    dtype = dtypes.float32
+
+    if self._stop:
+      stopping = array_ops.ones(shape=shape, dtype=dtype)
+    else:
+      stopping = array_ops.zeros(shape=shape, dtype=dtype)
+
+    return {'stopping': stopping}
+
+  @staticmethod
+  def as_scalar_stopping_signal(signals):
+    return array_ops.identity(signals['stopping'][0][0])
+
+  @staticmethod
+  def should_stop(scalar_stopping_signal):
+    return scalar_stopping_signal >= _StopSignals.STOPPING_SIGNAL
+
+
+class _SignalsHelper(object):
+  """A general helper class to handle common signals manipulation."""
+
+  def __init__(self, signals):
+    self._signal_keys = []
+    for key in sorted(signals.iterkeys()):
+      self._signal_keys.append(key)
+
+  @property
+  def num_signals(self):
+    return len(self._signal_keys)
+
+  def unflatten(self, tensor_list):
+    return dict(zip(self._signal_keys, tensor_list))
+
+  @staticmethod
+  def as_tensor_list(signals):
+    return [signals[key] for key in sorted(signals.iterkeys())]
-- 
GitLab


From 29d118793b3a08b87b4019da56cdf3579cd88694 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Fri, 2 Feb 2018 15:51:41 -0800
Subject: [PATCH 605/966] Rollback of recent changes to tensor.{h,cc}

PiperOrigin-RevId: 184347012
---
 tensorflow/core/framework/tensor.cc  | 4 ++--
 tensorflow/core/framework/tensor.h   | 6 ++----
 tensorflow/python/kernel_tests/BUILD | 1 +
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 77a3edcc10..4f08cdc1d7 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -615,11 +615,11 @@ void Tensor::CheckType(DataType expected_dtype) const {
 
 void Tensor::CheckTypeAndIsAligned(DataType expected_dtype) const {
   CHECK_EQ(dtype(), expected_dtype);
-  CHECK(IsAligned()) << "CheckTypeAndIsAligned";
+  CHECK(IsAligned());
 }
 
 void Tensor::CheckIsAlignedAndSingleElement() const {
-  CHECK(IsAligned()) << "Aligned and single element";
+  CHECK(IsAligned());
   CHECK_EQ(1, NumElements()) << "Must have a one element tensor";
 }
 
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 94c39c53a6..92d10f0d8c 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -660,8 +660,7 @@ void Tensor::FillDimsAndValidateCompatibleShape(
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::shaped(
     gtl::ArraySlice<int64> new_sizes) {
-  CheckType(DataTypeToEnum<T>::v());
-  CHECK(IsAligned());
+  CheckTypeAndIsAligned(DataTypeToEnum<T>::v());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
   FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims);
@@ -688,8 +687,7 @@ typename TTypes<T, NDIMS>::UnalignedTensor Tensor::unaligned_shaped(
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::shaped(
     gtl::ArraySlice<int64> new_sizes) const {
-  CheckType(DataTypeToEnum<T>::v());
-  CHECK(IsAligned());
+  CheckTypeAndIsAligned(DataTypeToEnum<T>::v());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
   FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::ConstTensor(base<T>(), dims);
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index c87b7652ad..54011b2b4d 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -96,6 +96,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     grpc_enabled = True,
+    tags = ["no_oss_gpu_py3"],
 )
 
 cuda_py_test(
-- 
GitLab


From 026c90c2ebae46cc68d7a9f05474bdd18bb18a6f Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Fri, 2 Feb 2018 15:52:18 -0800
Subject: [PATCH 606/966] TF_CALL_ALL_TYPES should include variant

PiperOrigin-RevId: 184347081
---
 tensorflow/core/framework/register_types.h       | 2 +-
 tensorflow/core/kernels/batch_util.cc            | 2 --
 tensorflow/core/kernels/concat_lib_cpu.cc        | 1 -
 tensorflow/core/kernels/gather_op.cc             | 1 -
 tensorflow/core/kernels/pack_op.cc               | 1 -
 tensorflow/core/kernels/parse_tensor_op.cc       | 1 -
 tensorflow/core/kernels/resource_variable_ops.cc | 1 -
 tensorflow/core/kernels/serialize_sparse_op.cc   | 1 -
 tensorflow/core/kernels/strided_slice_op.cc      | 1 -
 tensorflow/core/kernels/strided_slice_op.h       | 1 +
 tensorflow/core/kernels/strided_slice_op_impl.h  | 1 -
 11 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index f88025fd33..e448a60f5e 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -179,7 +179,7 @@ limitations under the License.
 
 // Call "m" on all types.
 #define TF_CALL_ALL_TYPES(m) \
-  TF_CALL_POD_TYPES(m) TF_CALL_string(m) TF_CALL_resource(m)
+  TF_CALL_POD_TYPES(m) TF_CALL_string(m) TF_CALL_resource(m) TF_CALL_variant(m)
 
 // Call "m" on POD and string types.
 #define TF_CALL_POD_STRING_TYPES(m) TF_CALL_POD_TYPES(m) TF_CALL_string(m)
diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/kernels/batch_util.cc
index 87d455faa7..1a45212ad2 100644
--- a/tensorflow/core/kernels/batch_util.cc
+++ b/tensorflow/core/kernels/batch_util.cc
@@ -104,7 +104,6 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
   switch (element.dtype()) {
     TF_CALL_ALL_TYPES(HANDLE_TYPE);
     TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
-    TF_CALL_variant(HANDLE_TYPE);
 #undef HANDLE_TYPE
     default:
       return errors::Unimplemented("CopyElementToSlice Unhandled data type: ",
@@ -124,7 +123,6 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   switch (parent.dtype()) {
     TF_CALL_ALL_TYPES(HANDLE_TYPE);
     TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
-    TF_CALL_variant(HANDLE_TYPE);
 #undef HANDLE_TYPE
     default:
       return errors::Unimplemented("CopySliceToElement Unhandled data type: ",
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index fc5a3e6288..547a7b40b9 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -73,7 +73,6 @@ REGISTER(qint8)
 REGISTER(quint16)
 REGISTER(qint16)
 REGISTER(qint32)
-TF_CALL_variant(REGISTER)
 
 #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) && \
     !defined(__ANDROID_TYPES_FULL__)
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 0a38d3d4af..08adf4badb 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -143,7 +143,6 @@ TF_CALL_ALL_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_quint16(REGISTER_GATHER_CPU);
 TF_CALL_qint16(REGISTER_GATHER_CPU);
-TF_CALL_variant(REGISTER_GATHER_CPU);
 
 #undef REGISTER_GATHER_CPU
 
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index e0ae5de0f4..5645275cfa 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -139,7 +139,6 @@ class PackOp : public OpKernel {
 
 TF_CALL_ALL_TYPES(REGISTER_PACK);
 TF_CALL_QUANTIZED_TYPES(REGISTER_PACK);
-TF_CALL_variant(REGISTER_PACK);
 
 #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION)
 // Primarily used for SavedModel support on mobile.
diff --git a/tensorflow/core/kernels/parse_tensor_op.cc b/tensorflow/core/kernels/parse_tensor_op.cc
index dd41744f02..8e175fe8d4 100644
--- a/tensorflow/core/kernels/parse_tensor_op.cc
+++ b/tensorflow/core/kernels/parse_tensor_op.cc
@@ -91,7 +91,6 @@ class SerializeTensorOp : public OpKernel {
       Name("SerializeTensor").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       SerializeTensorOp<T>);
 TF_CALL_ALL_TYPES(REGISTER)
-TF_CALL_variant(REGISTER)
 #undef REGISTER
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 6ce53e725f..5b4aad3cdd 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -387,7 +387,6 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
 
 TF_CALL_ALL_TYPES(REGISTER_KERNELS);
 TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
-TF_CALL_variant(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/serialize_sparse_op.cc b/tensorflow/core/kernels/serialize_sparse_op.cc
index 61e40caef9..799c574d15 100644
--- a/tensorflow/core/kernels/serialize_sparse_op.cc
+++ b/tensorflow/core/kernels/serialize_sparse_op.cc
@@ -426,7 +426,6 @@ class DeserializeSparseOp : public OpKernel {
     switch (dtype_) {
       TF_CALL_ALL_TYPES(HANDLE_TYPE);
       TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
-      TF_CALL_variant(HANDLE_TYPE);
 #undef HANDLE_TYPE
       default:
         OP_REQUIRES(context, false,
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index e0b85c6d06..7745effe2a 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -391,7 +391,6 @@ class StridedSliceAssignOp : public OpKernel {
                           StridedSliceAssignOp<CPUDevice, type>)
 
 TF_CALL_ALL_TYPES(REGISTER_STRIDED_SLICE);
-TF_CALL_variant(REGISTER_STRIDED_SLICE);
 
 #undef REGISTER_STRIDED_SLICE
 
diff --git a/tensorflow/core/kernels/strided_slice_op.h b/tensorflow/core/kernels/strided_slice_op.h
index 0f72c4b771..2b58632298 100644
--- a/tensorflow/core/kernels/strided_slice_op.h
+++ b/tensorflow/core/kernels/strided_slice_op.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index c3187e49ce..1c4472bb1a 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -290,7 +290,6 @@ DECLARE_FOR_N_GPU(int64);
 #endif  // END GOOGLE_CUDA
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
-TF_CALL_variant(DECLARE_FOR_N_CPU);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define PREVENT_FOR_N_SYCL(T) \
-- 
GitLab


From e0be56b9ccc5ef372ceea632c5d606e3dcfb3cec Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Fri, 2 Feb 2018 16:02:31 -0800
Subject: [PATCH 607/966] Fix issues with the tests and add mistakenly taken
 out macro back

---
 tensorflow/contrib/BUILD                    | 2 +-
 tensorflow/contrib/cmake/python_modules.txt | 1 +
 tensorflow/contrib/tensorrt/BUILD           | 4 ++--
 tensorflow/tools/pip_package/BUILD          | 5 +++--
 third_party/tensorrt/BUILD.tpl              | 1 +
 5 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 93a0308ddd..e04ce01f43 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -106,7 +106,7 @@ py_library(
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:util",
     ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_tensorrt([
-      "//tensorflow/contrib/tensorrt:init_py",
+        "//tensorflow/contrib/tensorrt:init_py",
     ]),
 
 )
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 52f37b1e3b..1260b81b07 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -407,6 +407,7 @@ tensorflow/contrib/tensorboard/plugins
 tensorflow/contrib/tensorboard/plugins/projector
 # TODO(sami): Add cmake implementations
 # tensorflow/contrib/tensorrt/python
+# tensorflow/contrib/tensorrt/python/ops
 tensorflow/contrib/tensor_forest
 tensorflow/contrib/tensor_forest/client
 tensorflow/contrib/tensor_forest/hybrid
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 6e46f95fcb..d29ecbb2e6 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -78,10 +78,10 @@ cc_library(
     copts = tf_copts(),
     deps = [
         ":trt_logging",
-        "//tensorflow/core:stream_executor_headers_lib",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:gpu_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:stream_executor_headers_lib",
         "//third_party/eigen3",
         "@local_config_tensorrt//:nv_infer",
         "@nsync//:nsync_headers",
@@ -186,9 +186,9 @@ tf_cuda_library(
     deps = [
         ":segment",
         ":trt_logging",
-        "//tensorflow/core:graph",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:framework_lite",
+        "//tensorflow/core:graph",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 63bceaa8a6..c87a3048fa 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -182,8 +182,9 @@ sh_binary(
             "//tensorflow/python:test_ops",
             "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
         ],
-    }) + if_mkl(["//third_party/mkl:intel_binary_blob"])
-    + if_tensorrt(["//tensorflow/contrib/tensorrt:init_py"]),
+    }) + if_mkl(["//third_party/mkl:intel_binary_blob"]) + if_tensorrt([
+      "//tensorflow/contrib/tensorrt:init_py",
+    ]),
 )
 
 # A genrule for generating a marker file for the pip package on Windows
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
index 354f17cda5..12d0115091 100644
--- a/third_party/tensorrt/BUILD.tpl
+++ b/third_party/tensorrt/BUILD.tpl
@@ -1,6 +1,7 @@
 # NVIDIA TensorRT
 # A high-performance deep learning inference optimizer and runtime.
 
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
 licenses(["notice"])
 
 exports_files(["LICENSE"])
-- 
GitLab


From 048b19f11e1ba1fa76c0fd508fed9007c852e8df Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Fri, 2 Feb 2018 16:02:22 -0800
Subject: [PATCH 608/966] Replacing _container_prefix with _graph_key to
 preserve its use in optimizer and deprecate others.

PiperOrigin-RevId: 184348303
---
 tensorflow/contrib/eager/python/saver_test.py | 10 ---
 tensorflow/contrib/eager/python/tfe.py        |  2 -
 tensorflow/python/eager/function.py           | 10 ++-
 tensorflow/python/eager/graph_callable.py     | 12 ++--
 tensorflow/python/framework/ops.py            | 17 ++---
 tensorflow/python/framework/test_util.py      | 71 ++-----------------
 tensorflow/python/framework/test_util_test.py | 68 ------------------
 .../python/ops/resource_variable_ops.py       | 14 +---
 tensorflow/python/training/optimizer.py       |  2 +-
 9 files changed, 24 insertions(+), 182 deletions(-)

diff --git a/tensorflow/contrib/eager/python/saver_test.py b/tensorflow/contrib/eager/python/saver_test.py
index abc7e3690c..1a7f7b85e6 100644
--- a/tensorflow/contrib/eager/python/saver_test.py
+++ b/tensorflow/contrib/eager/python/saver_test.py
@@ -73,16 +73,6 @@ class SaverTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, 'v1'):
         saver.save(ckpt_prefix)
 
-  def testDifferentGraphError(self):
-    with ops.device(self._dev()):
-      with ops.Graph().as_default():
-        v1 = resource_variable_ops.ResourceVariable(1.0, name='v1')
-      with ops.Graph().as_default():
-        saver = _saver.Saver([v1])
-        ckpt_prefix = os.path.join(test.get_temp_dir(), 'ckpt')
-        with self.assertRaisesRegexp(ValueError, 'Graph'):
-          saver.save(ckpt_prefix)
-
   def testSameObjectOK(self):
     with ops.device(self._dev()):
       v1 = resource_variable_ops.ResourceVariable(1.0, name='v1')
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index 712d1cb94d..d32bebf90c 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -59,7 +59,6 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
 @@in_eager_mode
 @@in_graph_mode
 
-@@IsolateTest
 @@run_test_in_graph_and_eager_modes
 
 @@DEVICE_PLACEMENT_EXPLICIT
@@ -101,7 +100,6 @@ from tensorflow.python.eager.execution_callbacks import nan_callback
 from tensorflow.python.eager.execution_callbacks import seterr
 from tensorflow.python.framework.ops import enable_eager_execution
 from tensorflow.python.framework.ops import eager_run as run
-from tensorflow.python.framework.test_util import IsolateTest
 from tensorflow.python.framework.test_util import run_in_graph_and_eager_modes as run_test_in_graph_and_eager_modes
 from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Variable
 from tensorflow.python.ops.variable_scope import EagerVariableStore
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index f5d0759bdc..246df9afef 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -575,15 +575,13 @@ def _get_defun_inputs(args):
 
 def _defun_internal(name, func, args, kwds):
   """Defines and returns graph-mode version of func."""
-  container_prefix = ops.get_default_graph()._container_prefix  # pylint: disable=protected-access
+  graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
   with context.graph_mode():
     captures = {}
     tmp_graph = CapturingGraph(captures)
-    # Inherit the container prefix, since this is used for error checking when
-    # isolating eager execution (the container prefix at creation must match the
-    # container prefix when used, and variables accessed in the defun will be
-    # used in the outside context).
-    tmp_graph._container_prefix = container_prefix  # pylint: disable=protected-access
+    # Inherit the graph key, since this is used for matching variables in
+    # optimizers.
+    tmp_graph._graph_key = graph_key  # pylint: disable=protected-access
     # Copy the graph collections to ensure summaries and other things work. This
     # lets the function access (but not mutate) collections of the containing
     # graph, such as the global step and the summary writer collections.
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
index 5c13ea8908..62106bf0e2 100644
--- a/tensorflow/python/eager/graph_callable.py
+++ b/tensorflow/python/eager/graph_callable.py
@@ -252,21 +252,17 @@ def _graph_callable_internal(func, shape_and_dtypes):
     Callable graph object.
   """
   container = tf_ops.get_default_graph()._container  # pylint: disable=protected-access
-  container_prefix = tf_ops.get_default_graph()._container_prefix  # pylint: disable=protected-access
+  graph_key = tf_ops.get_default_graph()._graph_key  # pylint: disable=protected-access
   with context.graph_mode():
     # This graph will store both the initialization and the call version of the
     # wrapped function. It will later be used by the backprop code to build the
     # backprop graph, if necessary.
     captures = {}
     tmp_graph = function.CapturingGraph(captures)
-    # Inherit the container from the original graph to create resources at user
-    # expected containers. Also inherits the container prefix, since this is
-    # used for error checking when isolating Eager execution (the container
-    # prefix at creation must match the container prefix when used, and
-    # variables returned from the graph callable will be used in the outside
-    # context).
+    # Inherit the graph key from the original graph to ensure optimizers don't
+    # misbehave.
     tmp_graph._container = container  # pylint: disable=protected-access
-    tmp_graph._container_prefix = container_prefix  # pylint: disable=protected-access
+    tmp_graph._graph_key = graph_key  # pylint: disable=protected-access
     with tmp_graph.as_default():
       # Placeholders for the non-variable inputs.
       func_inputs = _get_graph_callable_inputs(shape_and_dtypes)
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index d5786cac68..ea589cc4d4 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -2760,15 +2760,12 @@ class Graph(object):
     self._handle_movers = {}
     # A map from tensor handle to its delete op.
     self._handle_deleters = {}
-    # Resource container.
-    if context.in_graph_mode():
-      self._container_prefix = ""
-    else:
-      # In Eager mode, isolate resources (particularly ResourceVariables) in
-      # Graphs by default. This prevents unintended variable sharing. Graph mode
-      # gets this kind of isolation from Sessions.
-      self._container_prefix = "eager-execution-%d/" % (uid(),)
-    self._container = self._container_prefix
+    # Allow optimizers and other objects to pseudo-uniquely key graphs (this key
+    # will be shared when defining function graphs, for example, so optimizers
+    # being called inside function definitions behave as if they were seeing the
+    # actual outside graph).
+    self._graph_key = "grap-key-%d/" % (uid(),)
+    self._container = ""
     self._registered_ops = op_def_registry.get_registered_ops()
 
     # TODO(skyewm): fold as much of the above as possible into the C
@@ -4229,7 +4226,7 @@ class Graph(object):
     """
     original_container = self._container
     try:
-      self._container = self._container_prefix + container_name
+      self._container = container_name
       yield self._container
     finally:
       self._container = original_container
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 70f6a2acba..15e8f5a38d 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -425,66 +425,6 @@ def with_c_api(cls):
   return cls
 
 
-class IsolateTest(object):
-  """A context manager which isolates resources in its block.
-
-  Provides an Eager-agnostic abstraction for preventing the sharing of
-  variables and other resources.
-
-  In graph mode, resource handle ops are only executed in a particular Session,
-  isolating them from resources with the same name in other Graphs. In Eager,
-  separate Sessions do not exist, so resources (particularly ResourceVariables)
-  would be shared implicitly if a resource of the same name were created
-  anywhere in a Python process. Multiple handles to the same resource would
-  cause several issues, and so this type of sharing will raise an exception.
-
-  Using resources with the same name in a single Python process may be useful
-  (especially for unit tests), so this context manager provides an abstraction
-  for isolating resources. Using a resource created in one Isolation environment
-  in another is an error.
-
-  Example usage in Eager mode:
-
-  ```python
-  import tensorflow as tf
-  # Import subject to change
-  from tensorflow.contrib.eager.python import tfe
-
-  tfe.enable_eager_execution()
-
-  for hyperparameter in [1, 2, 3]:
-    with tfe.IsolateTest():
-      v = tfe.Variable(name="v", initial_value=hyperparameter)
-      # train model, test results ...
-  ```
-
-  IsolateTest is currently exposed through contrib.eager, but it creates a new
-  default Graph and provides equivalent safety in graph mode.
-  """
-
-  def __init__(self):
-    if context.in_eager_mode() and tape.could_possibly_record():
-      raise ValueError("Cannot isolate Eager execution with an active tape.")
-    # In Eager, Graphs set a container which isolates resources, and maintain a
-    # VariableStore which caches ResourceVariable objects created through
-    # get_variable. So setting the default Graph has the side effect of
-    # isolating Eager resources.
-    with context.eager_mode():
-      # Create the graph in Eager mode, as this provides stricter semantics
-      # (i.e. has a unique container prefix). This prevents implicit sharing
-      # when a Graph-mode graph is created and then Eager mode is enabled (an
-      # error through enable_eager_execution, but common with context managers
-      # in unit tests).
-      self._graph_as_default_context_manager = ops.Graph().as_default()
-
-  def __enter__(self):
-    self._graph_as_default_context_manager.__enter__()
-
-  def __exit__(self, type_arg, value_arg, traceback_arg):
-    return self._graph_as_default_context_manager.__exit__(
-        type_arg, value_arg, traceback_arg)
-
-
 def assert_no_new_tensors(f):
   """Decorator for asserting that no new Tensors persist after a test.
 
@@ -515,12 +455,11 @@ def assert_no_new_tensors(f):
         return False
 
     tensors_before = set(id(obj) for obj in gc.get_objects() if _is_tensor(obj))
-    outside_container_prefix = ops.get_default_graph()._container_prefix
-    with IsolateTest():
+    outside_graph_key = ops.get_default_graph()._graph_key
+    with ops.Graph().as_default():
       # Run the test in a new graph so that collections get cleared when it's
-      # done, but inherit the container prefix so that we can print the values
-      # of variables which get leaked when executing eagerly.
-      ops.get_default_graph()._container_prefix = outside_container_prefix
+      # done, but inherit the graph key so optimizers behave.
+      ops.get_default_graph()._graph_key = outside_graph_key
       f(self, **kwargs)
     # Make an effort to clear caches, which would otherwise look like leaked
     # Tensors.
@@ -642,7 +581,7 @@ def run_in_graph_and_eager_modes(__unused__=None,
             assert_no_garbage_created(run_eager_mode))
 
       with context.eager_mode():
-        with IsolateTest():
+        with ops.Graph().as_default():
           run_eager_mode(self, **kwargs)
 
     return decorated
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index 3594d125bf..a717eb3951 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -29,7 +29,6 @@ from google.protobuf import text_format
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
-from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
@@ -39,7 +38,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
@@ -443,71 +441,5 @@ class GarbageCollectionTest(test_util.TensorFlowTestCase):
     LeakedTensorTest().test_has_no_leak()
 
 
-@test_util.with_c_api
-class IsolationTest(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes()
-  def test_variable_reuse_exception(self):
-    with test_util.IsolateTest(), session.Session():
-      first_container_variable = resource_variable_ops.ResourceVariable(
-          name="first_container_variable",
-          initial_value=1)
-      if context.in_graph_mode():
-        self.evaluate([variables.global_variables_initializer()])
-    with test_util.IsolateTest():
-      if context.in_graph_mode():
-        with self.assertRaises(RuntimeError):
-          self.evaluate(first_container_variable.read_value())
-      else:
-        with self.assertRaises(ValueError):
-          first_container_variable.read_value()
-
-  @test_util.run_in_graph_and_eager_modes()
-  def test_variable_reuse_exception_nested(self):
-    with test_util.IsolateTest(), session.Session():
-      first_container_variable = resource_variable_ops.ResourceVariable(
-          name="first_container_variable",
-          initial_value=1)
-      if context.in_graph_mode():
-        self.evaluate([variables.global_variables_initializer()])
-      with test_util.IsolateTest(), session.Session():
-        if context.in_graph_mode():
-          with self.assertRaises(RuntimeError):
-            self.evaluate(first_container_variable.read_value())
-        else:
-          with self.assertRaises(ValueError):
-            first_container_variable.read_value()
-
-  @test_util.run_in_graph_and_eager_modes()
-  def test_no_sharing(self):
-    with test_util.IsolateTest(), session.Session():
-      first_container_variable = resource_variable_ops.ResourceVariable(
-          name="same_name",
-          initial_value=1)
-      if context.in_graph_mode():
-        self.evaluate([variables.global_variables_initializer()])
-      with test_util.IsolateTest(), session.Session():
-        second_container_variable = resource_variable_ops.ResourceVariable(
-            name="same_name",
-            initial_value=2)
-        if context.in_graph_mode():
-          self.evaluate([variables.global_variables_initializer()])
-        self.assertEqual(
-            2, self.evaluate(second_container_variable.read_value()))
-      self.assertEqual(1, self.evaluate(first_container_variable.read_value()))
-
-  def test_graph_mode_isolation(self):
-    with context.graph_mode():
-      # Even if we've (accidentally) called IsolateTest in Graph mode, it should
-      # provide Eager isolation.
-      with test_util.IsolateTest():
-        with context.eager_mode():
-          first_container_variable = resource_variable_ops.ResourceVariable(
-              name="first_container_variable",
-              initial_value=1)
-      with context.eager_mode():
-        with self.assertRaises(ValueError):
-          first_container_variable.read_value()
-
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index bdf41cd75d..cc9f7981e4 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -348,9 +348,9 @@ class ResourceVariable(variables.Variable):
     if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
       collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
     self._save_slice_info = None
-    # Save the graph's container prefix for error checking. Reading the value of
-    # the ResourceVariable from another Graph in Eager mode is an error.
-    self._container_prefix = ops.get_default_graph()._container_prefix  # pylint: disable=protected-access
+    # Store the graph key so optimizers know how to only retrieve variables from
+    # this graph.
+    self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
     with ops.init_scope():
       self._in_graph_mode = context.in_graph_mode()
       with ops.name_scope(name, "Variable", []
@@ -662,15 +662,7 @@ class ResourceVariable(variables.Variable):
 
     Returns:
      the read operation.
-    Raises:
-      ValueError: if the ResourceVariable was created in another isolation
-        environment or graph.
     """
-    if (not self._in_graph_mode and
-        self._container_prefix != ops.get_default_graph()._container_prefix):  # pylint: disable=protected-access
-      raise ValueError(
-          "Attempted to read a variable from another isolation environment"
-          " or Graph")
     with ops.name_scope("Read"):
       # Ensure we read the variable in the same device as the handle.
       with ops.device(self._handle_device):
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 425dbd8313..9ec588bac9 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -602,7 +602,7 @@ class Optimizer(object):
       if executing_eagerly:
         # No variable.op in eager mode. We don't expect lots of eager graphs,
         # but behavior should be consistent with graph mode.
-        return variable._container_prefix == current_graph._container_prefix  # pylint: disable=protected-access
+        return variable._graph_key == current_graph._graph_key  # pylint: disable=protected-access
       else:
         return variable.op.graph is current_graph
 
-- 
GitLab


From 23f0529f52f2ae9615e465f804e5622cad4aeb8f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 16:18:59 -0800
Subject: [PATCH 609/966] Tpu cluster resolver only returns TF server addresses
 for 'HEALTHY' tpu nodes.

PiperOrigin-RevId: 184350480
---
 .../python/training/tpu_cluster_resolver.py   |  5 +-
 .../training/tpu_cluster_resolver_test.py     | 55 +++++++++++++++++--
 2 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 2e75ac226e..a6a6e642e4 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -143,7 +143,8 @@ class TPUClusterResolver(ClusterResolver):
       request = self._service.projects().locations().nodes().get(name=full_name)
       response = request.execute()
 
-      instance_url = '%s:%s' % (response['ipAddress'], response['port'])
-      worker_list.append(instance_url)
+      if 'health' in response and response['health'] == 'HEALTHY':
+        instance_url = '%s:%s' % (response['ipAddress'], response['port'])
+        worker_list.append(instance_url)
 
     return ClusterSpec({self._job_name: worker_list})
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index 0c4730613a..4fd34629cf 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -105,7 +105,8 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
             'ipAddress': '10.1.2.3',
-            'port': '8470'
+            'port': '8470',
+            'health': 'HEALTHY'
         }
     }
 
@@ -126,7 +127,8 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
             'ipAddress': '10.1.2.3',
-            'port': '8470'
+            'port': '8470',
+            'health': 'HEALTHY'
         }
     }
 
@@ -147,11 +149,13 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
             'ipAddress': '10.1.2.3',
-            'port': '8470'
+            'port': '8470',
+            'health': 'HEALTHY'
         },
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': {
             'ipAddress': '10.4.5.6',
-            'port': '8470'
+            'port': '8470',
+            'health': 'HEALTHY'
         }
     }
 
@@ -169,15 +173,54 @@ class TPUClusterResolverTest(test.TestCase):
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
+  def testHealthyTpuNodeRetrieval(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'ipAddress': '10.1.2.3',
+            'port': '8470',
+            'health': 'HEALTHY'
+        },
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': {
+            'ipAddress': '10.4.5.6',
+            'port': '8470',
+        },
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-3': {
+            'ipAddress': '10.7.8.9',
+            'port': '8470',
+            'health': 'UNHEALTHY'
+        }
+    }
+
+    tpu_cluster_resolver = TPUClusterResolver(
+        project='test-project',
+        zone='us-central1-c',
+        tpu_names=['test-tpu-2', 'test-tpu-1', 'test-tpu-3'],
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'tpu_worker'
+      tasks {
+        key: 0
+        value: '10.1.2.3:8470'
+      }
+    }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
   def testGetMasterMultipleEntries(self):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
             'ipAddress': '10.1.2.3',
-            'port': '8470'
+            'port': '8470',
+            'health': 'HEALTHY'
         },
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': {
             'ipAddress': '10.4.5.6',
-            'port': '8470'
+            'port': '8470',
+            'health': 'HEALTHY'
         }
     }
 
-- 
GitLab


From 9e45772d16bbcb3adb3c5faa298969e183cdc89e Mon Sep 17 00:00:00 2001
From: Chris Drake <cjdrake@users.noreply.github.com>
Date: Fri, 2 Feb 2018 16:34:55 -0800
Subject: [PATCH 610/966] Update metric_ops.py (#16712)

---
 tensorflow/contrib/metrics/python/ops/metric_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 55946c128b..c2340d0377 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -739,7 +739,7 @@ def _streaming_confusion_matrix_at_thresholds(predictions,
   else:
     for include in includes:
       if include not in all_includes:
-        raise ValueError('Invaild key: %s.' % include)
+        raise ValueError('Invalid key: %s.' % include)
 
   predictions, labels, weights = metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
       predictions, labels, weights)
-- 
GitLab


From 1f2a68267ebe08f71e59d86c544fb42e46ba0f62 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 2 Feb 2018 16:36:41 -0800
Subject: [PATCH 611/966] Fix a broken link in regression_examples.md (#16685)

This fix fixes a broken link in regression_examples.md
---
 tensorflow/docs_src/api_guides/python/regression_examples.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/api_guides/python/regression_examples.md b/tensorflow/docs_src/api_guides/python/regression_examples.md
index dae50a8f03..7de2be0552 100644
--- a/tensorflow/docs_src/api_guides/python/regression_examples.md
+++ b/tensorflow/docs_src/api_guides/python/regression_examples.md
@@ -38,7 +38,7 @@ The preceding examples rely on the following data set utility:
   <tr> <th>Utility</th> <th>Description</th></tr>
 
   <tr>
-    <td><a href="../../examples/get_started/regression/imports85.py">imports85.py</a></td>
+    <td><a href="https://www.tensorflow.org/code/tensorflow/examples/get_started/regression/imports85.py">imports85.py</a></td>
     <td>This program provides utility functions that load the
         <tt>imports85</tt> data set into formats that other TensorFlow
         programs (for example, <tt>linear_regression.py</tt> and
-- 
GitLab


From 83ad2e8ade5dceaec466d2f7f4d5545cee6ba79a Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Fri, 2 Feb 2018 16:37:50 -0800
Subject: [PATCH 612/966] Convert unicode to six.string_types for python 3
 (#16676)

In Python 3, there is no unicode type. This fix converts
unicode to use six.string_types instead, while maintaining
python 2/3 compatibility.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/learn/python/learn/monitors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index 0948dee7e2..51381a7427 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -879,7 +879,7 @@ class GraphDump(BaseMonitor):
     this_output = self.data[step] if step in self.data else {}
     other_output = other_dump.data[step] if step in other_dump.data else {}
     for key in this_output:
-      if not isinstance(key, str) and not isinstance(key, unicode):
+      if not isinstance(key, six.string_types):
         continue
       if key not in other_output:
         raise ValueError("%s missing at step %s.", (key, step))
-- 
GitLab


From ed040a1981d49e4cee6c8adbd2d20852d925daf5 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Sat, 3 Feb 2018 09:38:55 +0900
Subject: [PATCH 613/966] fix typo (#16679)

---
 tensorflow/compiler/xla/python/xla_client.py       | 2 +-
 tensorflow/contrib/nearest_neighbor/kernels/heap.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index b890980955..2c693f0581 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -1072,7 +1072,7 @@ def initialize_replica_count(replica_count):
 
   Args:
     replica_count: number of replicas that are desired for set up during XLA
-      initalization.
+      initialization.
 
   Raises:
     A runtime exception if the XLA service has already been initialized.
diff --git a/tensorflow/contrib/nearest_neighbor/kernels/heap.h b/tensorflow/contrib/nearest_neighbor/kernels/heap.h
index 32925569a8..a2dbb8052b 100644
--- a/tensorflow/contrib/nearest_neighbor/kernels/heap.h
+++ b/tensorflow/contrib/nearest_neighbor/kernels/heap.h
@@ -56,7 +56,7 @@ class HeapBase {
 
   // This method adds an element at the end of the internal array without
   // "heapifying" the array afterwards. This is useful for setting up a heap
-  // where a single call to heapify at the end of the inital insertion
+  // where a single call to heapify at the end of the initial insertion
   // operations suffices.
   void InsertUnsorted(const KeyType& key, const DataType& data) {
     if (v_.size() == static_cast<size_t>(num_elements_)) {
-- 
GitLab


From 3ddf1d6902333a02733431f6ed5b089adcd13089 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Fri, 2 Feb 2018 16:35:10 -0800
Subject: [PATCH 614/966] Update global_step by default if the user specifies a
 host_call.

PiperOrigin-RevId: 184352399
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 50 +++++++++++++++++--
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index b5082fc823..56793f11d9 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -63,6 +63,7 @@ from tensorflow.python.training import evaluation
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
+from tensorflow.python.util import tf_inspect
 
 _INITIAL_LOSS = 1e7
 _ZERO_LOSS = 0.
@@ -484,7 +485,7 @@ class TPUEstimatorSpec(
     if self.eval_metrics is not None:
       host_calls['eval_metrics'] = self.eval_metrics
     if self.host_call is not None:
-      host_calls['host_call'] = self.host_call
+      host_calls['host_call'] = wrap_hostcall_with_global_step(self.host_call)
     host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
     eval_metric_ops = None
     if self.eval_metrics is not None:
@@ -1306,7 +1307,9 @@ class _ModelFnWrapper(object):
       if isinstance(estimator_spec, TPUEstimatorSpec):
         captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
         if estimator_spec.host_call is not None:
-          host_call.record({'host_call': estimator_spec.host_call})
+          host_call.record({
+              'host_call': wrap_hostcall_with_global_step(
+                  estimator_spec.host_call)})
           host_call_outfeed_ops = host_call.create_enqueue_op()
       else:
         captured_scaffold_fn.capture(None)
@@ -1361,6 +1364,8 @@ class _ModelFnWrapper(object):
       to_record = {}
       to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
       if tpu_estimator_spec.host_call is not None:
+        # We assume that evaluate won't update global step, so we don't wrap
+        # this host_call.
         to_record['host_call'] = tpu_estimator_spec.host_call
       host_calls.record(to_record)
 
@@ -1503,11 +1508,14 @@ class _OutfeedHostCall(object):
         raise ValueError('{}[1] should be tuple or list, or dict.'.format(name))
 
       if isinstance(host_call[1], (tuple, list)):
+        fullargspec = tf_inspect.getfullargspec(host_call[0])
         fn_args = util.fn_args(host_call[0])
-        if len(host_call[1]) != len(fn_args):
+        # wrapped_hostcall_with_global_step uses varargs, so we allow that.
+        if fullargspec.varargs is None and len(host_call[1]) != len(fn_args):
           raise RuntimeError(
-              'In TPUEstimatorSpec.{}, length of tensors does not '
-              'match method args of metric_fn.'.format(name))
+              'In TPUEstimatorSpec.{}, length of tensors {} does not match '
+              'method args of the function, which takes {}.'.format(
+                  name, len(host_call[1]), len(fn_args)))
 
   @staticmethod
   def create_cpu_hostcall(host_calls):
@@ -1649,6 +1657,38 @@ class _OutfeedHostCall(object):
     return ret
 
 
+def wrap_hostcall_with_global_step(hostcall):
+  """Wrap the hostcall so that we update the global step upon every call."""
+  if hostcall is None:
+    return None
+  host_fn, tensors = hostcall
+
+  def global_step_host_fn(_global_step, *args, **kwargs):  # pylint: disable=invalid-name
+    # Note that we don't have any ordering here, so the graph may see a
+    # global_step that's off by 1.
+    state_ops.assign(
+        training.get_global_step(),
+        math_ops.cast(_global_step[0], dtypes.int64))
+    return host_fn(*args, **kwargs)
+  # Give the global step tensor a batch dimension. Reshape is not supported for
+  # int64, so we cast it to int32.
+  # TODO(jhseu): Remove the cast once int64 is supported.
+  global_step_tensor = array_ops.reshape(
+      math_ops.cast(training.get_global_step(), dtypes.int32), [1])
+  if isinstance(tensors, dict):
+    outfeed_tensors = {'_global_step': global_step_tensor}
+    outfeed_tensors.update(tensors)
+    return global_step_host_fn, outfeed_tensors
+  else:
+    fn_args = util.fn_args(host_fn)
+    if len(tensors) != len(fn_args):
+      raise RuntimeError(
+          'In TPUEstimatorSpec.host_call, length of tensors {} does not match '
+          'method args of the function, which takes {}.'.format(
+              len(tensors), len(fn_args)))
+    return global_step_host_fn, [global_step_tensor] + list(tensors)
+
+
 class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
   """Hook to run host calls when use_tpu=False."""
 
-- 
GitLab


From d947acb2ca24474533aa13c054171ec8b4d291ce Mon Sep 17 00:00:00 2001
From: DosLin <doslino@gmail.com>
Date: Sat, 3 Feb 2018 08:39:18 +0800
Subject: [PATCH 615/966] Fix docs (#16653)

---
 tensorflow/docs_src/install/install_sources.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index bc7d2080dc..ccf62a169f 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -221,7 +221,7 @@ problem, do either of the following:
   * Download Xcode 7.2 and select it as your default by issuing the following
     command:
 
-    <pre> $ <b>sudo xcode-select -s /Application/Xcode-7.2/Xcode.app</b></pre>
+    <pre> $ <b>sudo xcode-select -s /Applications/Xcode-7.2/Xcode.app</b></pre>
 
 **NOTE:** Your system must fulfill the NVIDIA software requirements described
 in one of the following documents:
-- 
GitLab


From cb2828a844ccaf0394e602d15fd95e45073729a2 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 2 Feb 2018 17:09:16 -0800
Subject: [PATCH 616/966] Check that the type of an implicitly dereferenced
 tensor matches the expected input type.

The dtype of a tensor reference can change between the point when it is "produced" by an
operation and consumed by the next operation. This evades checks in the executor that the
type of tensor on each edge matches the type signatures of the producing and consuming operation, which could lead to undefined behavior. Although there is no existing operation that changes the type of a tensor reference, it is possible to use the OpKernelContext API to do so, so we add a further check in the runtime to defend against operations that might be added in the future.

PiperOrigin-RevId: 184356242
---
 tensorflow/core/common_runtime/executor.cc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index f515590b28..e3416da988 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1776,6 +1776,19 @@ Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input,
         entry->ref_mu = nullptr;
 
         inp->tensor = entry->val.get();
+        // The dtype of entry->ref could have been changed by another operation
+        // that ran after the operation that "produced" it executed, so
+        // re-validate that the type of the dereferenced tensor matches the
+        // expected input type.
+        if (item.input_type(i) != inp->tensor->dtype()) {
+          return AttachDef(
+              errors::InvalidArgument(
+                  i, "-th input expects type ",
+                  DataTypeString(item.input_type(i)),
+                  " but automatically dereferenced input tensor has type ",
+                  DataTypeString(inp->tensor->dtype())),
+              item.kernel->def());
+        }
       }
     }
   }
-- 
GitLab


From 245f6c840c86f79a0e42d81ca84b8ce5eeb4c396 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 2 Feb 2018 17:09:59 -0800
Subject: [PATCH 617/966] [tf.data] Add public header
 "tensorflow/core/framework/dataset.h".

This adds the ability to create a custom C++ Dataset implementation
without linking it statically into the TensorFlow library.  Note that
this internal API is experimental and subject to change between
versions of TensorFlow.

Fixes #16682.

PiperOrigin-RevId: 184356318
---
 tensorflow/core/BUILD                  |   1 +
 tensorflow/core/framework/dataset.h    | 614 +++++++++++++++++++++++++
 tensorflow/core/kernels/data/dataset.h | 596 +-----------------------
 3 files changed, 616 insertions(+), 595 deletions(-)
 create mode 100644 tensorflow/core/framework/dataset.h

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 68c82ea303..3aa3018cd2 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -432,6 +432,7 @@ tf_cuda_library(
         "framework/cancellation.h",
         "framework/common_shape_fns.h",
         "framework/control_flow.h",  # TODO(josh11b): Make internal?
+        "framework/dataset.h",
         "framework/dataset_stateful_op_whitelist.h",
         "framework/device_base.h",
         "framework/function.h",
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
new file mode 100644
index 0000000000..96566c285a
--- /dev/null
+++ b/tensorflow/core/framework/dataset.h
@@ -0,0 +1,614 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_FRAMEWORK_DATASET_H_
+#define TENSORFLOW_CORE_FRAMEWORK_DATASET_H_
+
+#include <memory>
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/dataset_stateful_op_whitelist.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/tracing.h"
+
+// Polymorphic datasets should support all primitive TensorFlow
+// types. Use this macro to expand `m(T)` once for each primitive type
+// `T`, e.g. to build a `switch` statement.
+#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m)
+
+namespace tensorflow {
+
+// Interface for reading values from a key-value store.
+// Used for restoring iterator state.
+class IteratorStateReader {
+ public:
+  virtual Status ReadScalar(StringPiece key, int64* val) = 0;
+  virtual Status ReadScalar(StringPiece key, string* val) = 0;
+  virtual Status ReadTensor(StringPiece key, Tensor* val) = 0;
+  virtual bool Contains(StringPiece key) = 0;
+
+  virtual ~IteratorStateReader() {}
+};
+
+// Interface for writing values to a key-value store.
+// Used for saving iterator state.
+class IteratorStateWriter {
+ public:
+  virtual Status WriteScalar(StringPiece key, const int64 val) = 0;
+  virtual Status WriteScalar(StringPiece key, const string& val) = 0;
+  virtual Status WriteTensor(StringPiece key, const Tensor& val) = 0;
+
+  virtual ~IteratorStateWriter() {}
+};
+
+// Forward declarations to avoid introducing a dependency on headers in
+// "tensorflow/core/graph/...".
+class GraphDefBuilder;
+class GraphDatasetBase;
+class Node;
+
+// Wrapper around GraphDefBuilder. Used to serialize Dataset graph.
+class GraphDefBuilderWrapper {
+ public:
+  explicit GraphDefBuilderWrapper(GraphDefBuilder* b) : b_(b) {}
+
+  // Adds a Const node with scalar value to the Graph.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  template <typename T>
+  Status AddScalar(const T& val, Node** output) {
+    Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
+    val_t.scalar<T>()() = val;
+    AddTensorInternal(val_t, output);
+    if (*output == nullptr) {
+      return errors::Internal("AddScalar: Failed to build Const op.");
+    }
+    return Status::OK();
+  }
+
+  // Adds a Const node with vector value to the Graph.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice?
+  template <typename T>
+  Status AddVector(const std::vector<T>& val, Node** output) {
+    Tensor val_t = Tensor(DataTypeToEnum<T>::v(),
+                          TensorShape({static_cast<int64>(val.size())}));
+    for (int i = 0; i < val.size(); i++) {
+      val_t.flat<T>()(i) = val[i];
+    }
+    AddTensorInternal(val_t, output);
+    if (*output == nullptr) {
+      return errors::Internal("AddVector: Failed to build Const op.");
+    }
+    return Status::OK();
+  }
+
+  // Adds a Const node with Tensor value to the Graph.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  Status AddTensor(const Tensor& val, Node** output) {
+    AddTensorInternal(val, output);
+    if (*output == nullptr) {
+      return errors::Internal("AddTensor: Failed to build Const op.");
+    }
+    return Status::OK();
+  }
+
+  Status AddDataset(const GraphDatasetBase* dataset,
+                    const std::vector<Node*>& inputs, Node** output) {
+    return AddDataset(dataset, inputs, {}, output);
+  }
+
+  // Adds a node corresponding to the `DatasetType` to the Graph.
+  // Return value of `DatasetType::op_name()` is used as the op type for the
+  // node.
+  // Values for the output_types and output_shapes node attributes are also
+  // written if those attributes are defined in the OpDef.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  Status AddDataset(const GraphDatasetBase* dataset,
+                    const std::vector<Node*>& inputs,
+                    const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
+                    Node** output) {
+    std::vector<std::pair<size_t, Node*>> enumerated_inputs(inputs.size());
+    for (int i = 0; i < inputs.size(); i++) {
+      enumerated_inputs[i] = std::make_pair(i, inputs[i]);
+    }
+    return AddDataset(dataset, enumerated_inputs, {}, attrs, output);
+  }
+
+  Status AddDataset(
+      const GraphDatasetBase* dataset,
+      const std::vector<std::pair<size_t, Node*>>& inputs,
+      const std::vector<std::pair<size_t, gtl::ArraySlice<Node*>>>& list_inputs,
+      const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
+      Node** output);
+
+  // Adds a user-defined function with name `function_name` to the graph and
+  // recursively adds all functions it references. If a function with a matching
+  // name has already been added, returns with OK status. If a user-defined with
+  // name `function_name` is not found in the FunctionLibraryDefinition, returns
+  // an InvalidArgumentError. If the function with name `function_name` or any
+  // of its dependent functions are stateful, returns an InvalidArgument error.
+  Status AddFunction(OpKernelContext* ctx, const string& function_name);
+
+  template <typename T>
+  void BuildAttrValue(const T& value, AttrValue* attr) {
+    SetAttrValue(value, attr);
+  }
+
+ private:
+  void AddTensorInternal(const Tensor& val, Node** output);
+
+  Status EnsureFunctionIsStateless(OpKernelContext* ctx,
+                                   const string& function_name) const {
+    const FunctionLibraryDefinition* lib_def =
+        ctx->function_library()->GetFunctionLibraryDefinition();
+    const FunctionDef* function_def = lib_def->Find(function_name);
+    if (!function_def) {
+      return errors::InvalidArgument("Unable to find FunctionDef for ",
+                                     function_name, " in registry.");
+    }
+    for (const NodeDef& node_def : function_def->node_def()) {
+      const OpDef* op_def;
+      TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def));
+      // TODO(b/65524810): Hack to allow functions to capture Dataset op
+      // nodes needed for FlatMap. Currently, source datasets nodes have been
+      // marked stateful to avoid constant folding since we do not have a
+      // good way of serializing them.
+      if (IsOpWhitelisted(op_def)) {
+        continue;
+      }
+      if (op_def->is_stateful()) {
+        return errors::InvalidArgument(
+            "Op[name: ", node_def.name(), ", type: ", node_def.op(), "] ",
+            "in function ", function_name, " is stateful. ",
+            "Saving stateful functions is not supported yet.");
+      }
+    }
+    return Status::OK();
+  }
+
+  // Returns whether an op has been whitelisted for use inside map_fns.
+  // Uses a heuristic to whitelist source dataset ops which have been
+  // marked stateful due to b/65524810.
+  // Also looks up the `op_def->name` in the global
+  // `WhitelistedStatefulOpRegistry`.
+  bool IsOpWhitelisted(const OpDef* op_def) const {
+    return (StringPiece(op_def->name()).ends_with("Dataset") &&
+            op_def->output_arg_size() == 1 &&
+            op_def->output_arg(0).type() == DT_VARIANT) ||
+           dataset::WhitelistedStatefulOpRegistry::Global()->Contains(
+               op_def->name());
+  }
+
+  bool HasAttr(const string& op_type_name, const string& attr_name) const;
+
+  bool HasAttr(const OpDef* op_def, const string& attr_name) const {
+    for (auto attr : op_def->attr()) {
+      if (attr.name() == attr_name) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  Status AddAttrFunctions(const AttrValue& attr_value, OpKernelContext* ctx) {
+    if (attr_value.has_func()) {
+      TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name()));
+    } else if (attr_value.has_list()) {
+      for (const NameAttrList& name_attr_list : attr_value.list().func()) {
+        TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name()));
+      }
+    }
+    return Status::OK();
+  }
+
+  GraphDefBuilder* b_;
+};
+
+class StatsAggregator;
+
+// A cut-down version of OpKernelContext for running computations in
+// iterators. Note that we cannot simply use OpKernelContext here
+// because we might run computation in an iterator whose lifetime is
+// not nested within the lifetime of a single OpKernelContext
+// (e.g. asynchronous prefetching).
+//
+// TODO(mrry): We will probably need to support more of
+// OpKernelContext here. For example, should allocation be handled by
+// the IteratorContext?
+// TODO(mrry): We're making some daring assumptions about the lifetime
+// of the runner passed in here. A runner will be deleted when the original
+// step ends, but all existing runners only close over session-lifetime (or
+// longer-lived) state, so we can make a copy of the function. There's nothing
+// in the definition of the API from which we took the runner to guarantee that
+// what we are doing is safe. We should formalize the properties here.
+class IteratorContext {
+ public:
+  struct Params {
+    // Interface to operating system functionality.
+    Env* env;
+
+    // Function call support.
+    std::function<void(std::function<void()>)> runner = nullptr;
+
+    // A function that returns the current `StatsAggregator` instance to be
+    // used when recording statistics about the iterator.
+    //
+    // NOTE(mrry): This is somewhat awkward, because (i) the `StatsAggregator`
+    // is a property of the `IteratorResource` (which this class does not know
+    // about), and (ii) it can change after the `IteratorContext` has been
+    // created. Better suggestions are welcome!
+    std::function<std::shared_ptr<StatsAggregator>()> stats_aggregator_getter =
+        nullptr;
+
+    // The FunctionLibraryRuntime object to be used to make function calls.
+    FunctionLibraryRuntime* lib = nullptr;
+    std::shared_ptr<const FunctionLibraryDefinition> function_library = nullptr;
+
+    // The Allocator to be used to allocate the output of an iterator.
+    Allocator* allocator = nullptr;
+  };
+
+  explicit IteratorContext(Params params) : params_(std::move(params)) {}
+
+  Env* env() const { return params_.env; }
+
+  std::function<void(std::function<void()>)>* runner() {
+    return &params_.runner;
+  }
+
+  std::shared_ptr<StatsAggregator> stats_aggregator() {
+    if (params_.stats_aggregator_getter) {
+      return params_.stats_aggregator_getter();
+    } else {
+      return nullptr;
+    }
+  }
+
+  std::shared_ptr<const FunctionLibraryDefinition> function_library() {
+    return params_.function_library;
+  }
+
+  FunctionLibraryRuntime* lib() { return params_.lib; }
+
+  void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; }
+
+  Allocator* allocator(AllocatorAttributes attrs);
+
+ private:
+  Params params_;
+};
+
+// Represents the current position in a range of outputs, where the
+// range of outputs is typically represented by an `DatasetBase`,
+// defined below.
+class IteratorBase {
+ public:
+  virtual ~IteratorBase() {}
+
+  // Gets the next output from the range that this iterator is traversing.
+  //
+  // If at least one output remains in this iterator's range, that
+  // output will be stored in `*out_tensors` and `false` will be
+  // stored in `*end_of_sequence`.
+  //
+  // If no more outputs remain in this iterator's range, `true` will
+  // be stored in `*end_of_sequence`, and the content of
+  // `*out_tensors` will be undefined.
+  //
+  // This method is thread-safe.
+  //
+  // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and
+  // potentially remove this method.
+  virtual Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                         bool* end_of_sequence) = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this
+  // iterator.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this iterator.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+
+  // Saves the state of this iterator.
+  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) {
+    return SaveInternal(writer);
+  }
+
+  // Restores the state of this iterator.
+  virtual Status Restore(IteratorContext* ctx, IteratorStateReader* reader) {
+    return RestoreInternal(ctx, reader);
+  }
+
+ protected:
+  // This is needed so that sub-classes of IteratorBase can call
+  // `SaveInternal` on their parent iterators, e.g., in
+  // `RepeatDataasetOp::Dataset`.
+  Status SaveParent(IteratorStateWriter* writer,
+                    const std::unique_ptr<IteratorBase>& parent) {
+    return parent->SaveInternal(writer);
+  }
+
+  // This is needed so that sub-classes of IteratorBase can call
+  // `RestoreInternal` on their parent iterators, e.g., in
+  // `RepeatDataasetOp::Dataset`.
+  Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader,
+                       const std::unique_ptr<IteratorBase>& parent) {
+    return parent->RestoreInternal(ctx, reader);
+  }
+
+  // Saves the state of this iterator recursively.
+  virtual Status SaveInternal(IteratorStateWriter* writer) {
+    return errors::Unimplemented("SaveInternal");
+  }
+
+  // Restores the state of this iterator recursively.
+  virtual Status RestoreInternal(IteratorContext* ctx,
+                                 IteratorStateReader* reader) {
+    return errors::Unimplemented("RestoreInternal");
+  }
+};
+
+// Represents a (potentially infinite) range of outputs, where each
+// output is a tuple of tensors.
+class DatasetBase : public core::RefCounted {
+ public:
+  // Returns a new iterator for iterating over the range of elements in
+  // this dataset.
+  //
+  // This method may be called multiple times on the same instance,
+  // and the resulting iterators will have distinct state. Each
+  // iterator will traverse all elements in this dataset from the
+  // start.
+  //
+  // Ownership of the created iterator will be transferred to the caller.
+  //
+  // The prefix identifies the sequence of iterators leading up to the newly
+  // created iterator.
+  virtual std::unique_ptr<IteratorBase> MakeIterator(
+      const string& prefix) const = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this
+  // dataset.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this dataset.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+
+  // A human-readable debug string for this dataset.
+  virtual string DebugString() = 0;
+
+  // Serializes the dataset and writes it to the `writer`.
+  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const {
+    return errors::Unimplemented("DatasetBase::Save");
+  }
+
+ protected:
+  // TODO(srbs): Ideally all graph related logic should reside in
+  // GraphDatasetBase. However, that would require Datasets defined in all ops
+  // to derive from GraphDatasetBase. Once that is done we can move
+  // DatasetGraphDefBuilder and AsGraphDefInternal to GraphDatasetBase.
+  class DatasetGraphDefBuilder : public GraphDefBuilderWrapper {
+   public:
+    DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {}
+    Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset,
+                            Node** output) {
+      return dataset->AsGraphDefInternal(ctx, this, output);
+    }
+  };
+
+  virtual Status AsGraphDefInternal(OpKernelContext* ctx,
+                                    DatasetGraphDefBuilder* b,
+                                    Node** node) const {
+    return AsGraphDefInternal(b, node);
+  }
+
+  virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                                    Node** node) const {
+    return errors::Unimplemented("AsGraphDefInternal");
+  }
+};
+
+// Base-class for datasets that are built by ops.
+class GraphDatasetBase : public DatasetBase {
+ public:
+  GraphDatasetBase(OpKernelContext* ctx)
+      : op_name_(ctx->op_kernel().type_string()) {}
+
+  const string op_name() const { return op_name_; }
+
+  Status Save(OpKernelContext* ctx,
+              IteratorStateWriter* writer) const override {
+    string serialized_graph_def;
+    string output_node;
+    TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node));
+    TF_RETURN_IF_ERROR(
+        writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
+    TF_RETURN_IF_ERROR(
+        writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
+    return Status::OK();
+  }
+
+  // Key for storing the Dataset graph in the serialized format.
+  static const char kDatasetGraphKey[];
+
+  // Key for storing the output node of the Dataset graph in the serialized
+  // format.
+  static const char kDatasetGraphOutputNodeKey[];
+
+ private:
+  Status Serialize(OpKernelContext* ctx, string* serialized_graph_def,
+                   string* output_node) const;
+
+  const string op_name_;
+};
+
+// Represents an iterator that is associated with a particular parent dataset.
+template <class DatasetType>
+class DatasetIterator : public IteratorBase {
+ public:
+  struct Params {
+    // Owns one reference on the shared dataset resource.
+    const DatasetType* dataset;
+
+    // Identifies the sequence of iterators leading up to this iterator.
+    const string prefix;
+  };
+
+  explicit DatasetIterator(const Params& params) : params_(params) {
+    params_.dataset->Ref();
+  }
+
+  ~DatasetIterator() override { params_.dataset->Unref(); }
+
+  // The dataset from which this iterator was created.
+  const DatasetType* dataset() const { return params_.dataset; }
+
+  // The sequence of iterators leading up to this iterator.
+  const string prefix() const { return params_.prefix; }
+
+  const DataTypeVector& output_dtypes() const override {
+    return params_.dataset->output_dtypes();
+  }
+
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return params_.dataset->output_shapes();
+  }
+
+  Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                 bool* end_of_sequence) final {
+    port::Tracing::TraceMe activity(params_.prefix);
+    Status s = GetNextInternal(ctx, out_tensors, end_of_sequence);
+    if (TF_PREDICT_FALSE(errors::IsOutOfRange(s) && !*end_of_sequence)) {
+      s = errors::Internal(
+          "Iterator \"", params_.prefix,
+          "\" returned OutOfRange without setting `*end_of_sequence`. This "
+          "indicates that an error may have occurred. Original message: ",
+          s.error_message());
+      LOG(ERROR) << s;
+    }
+    return s;
+  }
+
+  Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final {
+    TF_RETURN_IF_ERROR(dataset()->Save(ctx, writer));
+    return IteratorBase::Save(ctx, writer);
+  }
+
+ protected:
+  // Internal implementation of GetNext that is wrapped in tracing logic.
+  virtual Status GetNextInternal(IteratorContext* ctx,
+                                 std::vector<Tensor>* out_tensors,
+                                 bool* end_of_sequence) = 0;
+
+  string full_name(const string& name) const {
+    return strings::StrCat(prefix(), ":", name);
+  }
+
+ private:
+  Params params_;
+};
+
+// Encapsulates the work required to plug a DatasetBase into the core TensorFlow
+// graph execution engine.
+class DatasetOpKernel : public OpKernel {
+ public:
+  DatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) final;
+
+ protected:
+  // Subclasses should implement this method. It will be called during Compute
+  // execution.
+  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase** output) = 0;
+
+  template <typename T>
+  Status ParseScalarArgument(OpKernelContext* ctx,
+                             const StringPiece& argument_name, T* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a scalar");
+    }
+    *output = argument_t->scalar<T>()();
+    return Status::OK();
+  }
+};
+
+// Encapsulates the work required to plug unary Datasets into the core
+// TensorFlow graph execution engine.
+class UnaryDatasetOpKernel : public DatasetOpKernel {
+ public:
+  UnaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
+  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                           DatasetBase** output) = 0;
+};
+
+// Encapsulates the work required to plug binary Datasets into the core
+// TensorFlow graph execution engine.
+class BinaryDatasetOpKernel : public DatasetOpKernel {
+ public:
+  BinaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
+  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                           DatasetBase* another_input,
+                           DatasetBase** output) = 0;
+};
+
+// Validates and extracts a `DatasetBase` object from `tensor`.
+//
+// `tensor` must have been written by a call to SetVariantTensorToDataset().
+//
+// The retrieved pointer is a borrowed reference to the dataset, which is owned
+// by the tensor. The consumer must either acquire its own reference to the
+// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
+// destroyed or mutated while the retrieved pointer is in use.
+Status GetDatasetFromVariantTensor(const Tensor& tensor,
+                                   DatasetBase** out_dataset);
+
+// Stores a `DatasetBase` object in `tensor`.
+//
+// The ownership of `dataset` is transferred to `tensor`.
+Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_DATASET_H_
diff --git a/tensorflow/core/kernels/data/dataset.h b/tensorflow/core/kernels/data/dataset.h
index 8238661b87..2c6fc8d5b4 100644
--- a/tensorflow/core/kernels/data/dataset.h
+++ b/tensorflow/core/kernels/data/dataset.h
@@ -15,600 +15,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
 #define TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
 
-#include <memory>
-
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/attr_value_util.h"
-#include "tensorflow/core/framework/dataset_stateful_op_whitelist.h"
-#include "tensorflow/core/framework/function.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/framework/variant_tensor_data.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/tracing.h"
-
-// Polymorphic datasets should support all primitive TensorFlow
-// types. Use this macro to expand `m(T)` once for each primitive type
-// `T`, e.g. to build a `switch` statement.
-#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m)
-
-namespace tensorflow {
-
-// Interface for reading values from a key-value store.
-// Used for restoring iterator state.
-class IteratorStateReader {
- public:
-  virtual Status ReadScalar(StringPiece key, int64* val) = 0;
-  virtual Status ReadScalar(StringPiece key, string* val) = 0;
-  virtual Status ReadTensor(StringPiece key, Tensor* val) = 0;
-  virtual bool Contains(StringPiece key) = 0;
-
-  virtual ~IteratorStateReader() {}
-};
-
-// Interface for writing values to a key-value store.
-// Used for saving iterator state.
-class IteratorStateWriter {
- public:
-  virtual Status WriteScalar(StringPiece key, const int64 val) = 0;
-  virtual Status WriteScalar(StringPiece key, const string& val) = 0;
-  virtual Status WriteTensor(StringPiece key, const Tensor& val) = 0;
-
-  virtual ~IteratorStateWriter() {}
-};
-
-// Forward declarations to avoid introducing a dependency on headers in
-// "tensorflow/core/graph/...".
-class GraphDefBuilder;
-class GraphDatasetBase;
-class Node;
-
-// Wrapper around GraphDefBuilder. Used to serialize Dataset graph.
-class GraphDefBuilderWrapper {
- public:
-  explicit GraphDefBuilderWrapper(GraphDefBuilder* b) : b_(b) {}
-
-  // Adds a Const node with scalar value to the Graph.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  template <typename T>
-  Status AddScalar(const T& val, Node** output) {
-    Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
-    val_t.scalar<T>()() = val;
-    AddTensorInternal(val_t, output);
-    if (*output == nullptr) {
-      return errors::Internal("AddScalar: Failed to build Const op.");
-    }
-    return Status::OK();
-  }
-
-  // Adds a Const node with vector value to the Graph.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice?
-  template <typename T>
-  Status AddVector(const std::vector<T>& val, Node** output) {
-    Tensor val_t = Tensor(DataTypeToEnum<T>::v(),
-                          TensorShape({static_cast<int64>(val.size())}));
-    for (int i = 0; i < val.size(); i++) {
-      val_t.flat<T>()(i) = val[i];
-    }
-    AddTensorInternal(val_t, output);
-    if (*output == nullptr) {
-      return errors::Internal("AddVector: Failed to build Const op.");
-    }
-    return Status::OK();
-  }
-
-  // Adds a Const node with Tensor value to the Graph.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  Status AddTensor(const Tensor& val, Node** output) {
-    AddTensorInternal(val, output);
-    if (*output == nullptr) {
-      return errors::Internal("AddTensor: Failed to build Const op.");
-    }
-    return Status::OK();
-  }
-
-  Status AddDataset(const GraphDatasetBase* dataset,
-                    const std::vector<Node*>& inputs, Node** output) {
-    return AddDataset(dataset, inputs, {}, output);
-  }
-
-  // Adds a node corresponding to the `DatasetType` to the Graph.
-  // Return value of `DatasetType::op_name()` is used as the op type for the
-  // node.
-  // Values for the output_types and output_shapes node attributes are also
-  // written if those attributes are defined in the OpDef.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  Status AddDataset(const GraphDatasetBase* dataset,
-                    const std::vector<Node*>& inputs,
-                    const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
-                    Node** output) {
-    std::vector<std::pair<size_t, Node*>> enumerated_inputs(inputs.size());
-    for (int i = 0; i < inputs.size(); i++) {
-      enumerated_inputs[i] = std::make_pair(i, inputs[i]);
-    }
-    return AddDataset(dataset, enumerated_inputs, {}, attrs, output);
-  }
-
-  Status AddDataset(
-      const GraphDatasetBase* dataset,
-      const std::vector<std::pair<size_t, Node*>>& inputs,
-      const std::vector<std::pair<size_t, gtl::ArraySlice<Node*>>>& list_inputs,
-      const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
-      Node** output);
-
-  // Adds a user-defined function with name `function_name` to the graph and
-  // recursively adds all functions it references. If a function with a matching
-  // name has already been added, returns with OK status. If a user-defined with
-  // name `function_name` is not found in the FunctionLibraryDefinition, returns
-  // an InvalidArgumentError. If the function with name `function_name` or any
-  // of its dependent functions are stateful, returns an InvalidArgument error.
-  Status AddFunction(OpKernelContext* ctx, const string& function_name);
-
-  template <typename T>
-  void BuildAttrValue(const T& value, AttrValue* attr) {
-    SetAttrValue(value, attr);
-  }
-
- private:
-  void AddTensorInternal(const Tensor& val, Node** output);
-
-  Status EnsureFunctionIsStateless(OpKernelContext* ctx,
-                                   const string& function_name) const {
-    const FunctionLibraryDefinition* lib_def =
-        ctx->function_library()->GetFunctionLibraryDefinition();
-    const FunctionDef* function_def = lib_def->Find(function_name);
-    if (!function_def) {
-      return errors::InvalidArgument("Unable to find FunctionDef for ",
-                                     function_name, " in registry.");
-    }
-    for (const NodeDef& node_def : function_def->node_def()) {
-      const OpDef* op_def;
-      TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def));
-      // TODO(b/65524810): Hack to allow functions to capture Dataset op
-      // nodes needed for FlatMap. Currently, source datasets nodes have been
-      // marked stateful to avoid constant folding since we do not have a
-      // good way of serializing them.
-      if (IsOpWhitelisted(op_def)) {
-        continue;
-      }
-      if (op_def->is_stateful()) {
-        return errors::InvalidArgument(
-            "Op[name: ", node_def.name(), ", type: ", node_def.op(), "] ",
-            "in function ", function_name, " is stateful. ",
-            "Saving stateful functions is not supported yet.");
-      }
-    }
-    return Status::OK();
-  }
-
-  // Returns whether an op has been whitelisted for use inside map_fns.
-  // Uses a heuristic to whitelist source dataset ops which have been
-  // marked stateful due to b/65524810.
-  // Also looks up the `op_def->name` in the global
-  // `WhitelistedStatefulOpRegistry`.
-  bool IsOpWhitelisted(const OpDef* op_def) const {
-    return (StringPiece(op_def->name()).ends_with("Dataset") &&
-            op_def->output_arg_size() == 1 &&
-            op_def->output_arg(0).type() == DT_VARIANT) ||
-           dataset::WhitelistedStatefulOpRegistry::Global()->Contains(
-               op_def->name());
-  }
-
-  bool HasAttr(const string& op_type_name, const string& attr_name) const;
-
-  bool HasAttr(const OpDef* op_def, const string& attr_name) const {
-    for (auto attr : op_def->attr()) {
-      if (attr.name() == attr_name) {
-        return true;
-      }
-    }
-    return false;
-  }
-
-  Status AddAttrFunctions(const AttrValue& attr_value, OpKernelContext* ctx) {
-    if (attr_value.has_func()) {
-      TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name()));
-    } else if (attr_value.has_list()) {
-      for (const NameAttrList& name_attr_list : attr_value.list().func()) {
-        TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name()));
-      }
-    }
-    return Status::OK();
-  }
-
-  GraphDefBuilder* b_;
-};
-
-class StatsAggregator;
-
-// A cut-down version of OpKernelContext for running computations in
-// iterators. Note that we cannot simply use OpKernelContext here
-// because we might run computation in an iterator whose lifetime is
-// not nested within the lifetime of a single OpKernelContext
-// (e.g. asynchronous prefetching).
-//
-// TODO(mrry): We will probably need to support more of
-// OpKernelContext here. For example, should allocation be handled by
-// the IteratorContext?
-// TODO(mrry): We're making some daring assumptions about the lifetime
-// of the runner passed in here. A runner will be deleted when the original
-// step ends, but all existing runners only close over session-lifetime (or
-// longer-lived) state, so we can make a copy of the function. There's nothing
-// in the definition of the API from which we took the runner to guarantee that
-// what we are doing is safe. We should formalize the properties here.
-class IteratorContext {
- public:
-  struct Params {
-    // Interface to operating system functionality.
-    Env* env;
-
-    // Function call support.
-    std::function<void(std::function<void()>)> runner = nullptr;
-
-    // A function that returns the current `StatsAggregator` instance to be
-    // used when recording statistics about the iterator.
-    //
-    // NOTE(mrry): This is somewhat awkward, because (i) the `StatsAggregator`
-    // is a property of the `IteratorResource` (which this class does not know
-    // about), and (ii) it can change after the `IteratorContext` has been
-    // created. Better suggestions are welcome!
-    std::function<std::shared_ptr<StatsAggregator>()> stats_aggregator_getter =
-        nullptr;
-
-    // The FunctionLibraryRuntime object to be used to make function calls.
-    FunctionLibraryRuntime* lib = nullptr;
-    std::shared_ptr<const FunctionLibraryDefinition> function_library = nullptr;
-
-    // The Allocator to be used to allocate the output of an iterator.
-    Allocator* allocator = nullptr;
-  };
-
-  explicit IteratorContext(Params params) : params_(std::move(params)) {}
-
-  Env* env() const { return params_.env; }
-
-  std::function<void(std::function<void()>)>* runner() {
-    return &params_.runner;
-  }
-
-  std::shared_ptr<StatsAggregator> stats_aggregator() {
-    if (params_.stats_aggregator_getter) {
-      return params_.stats_aggregator_getter();
-    } else {
-      return nullptr;
-    }
-  }
-
-  std::shared_ptr<const FunctionLibraryDefinition> function_library() {
-    return params_.function_library;
-  }
-
-  FunctionLibraryRuntime* lib() { return params_.lib; }
-
-  void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; }
-
-  Allocator* allocator(AllocatorAttributes attrs);
-
- private:
-  Params params_;
-};
-
-// Represents the current position in a range of outputs, where the
-// range of outputs is typically represented by an `DatasetBase`,
-// defined below.
-class IteratorBase {
- public:
-  virtual ~IteratorBase() {}
-
-  // Gets the next output from the range that this iterator is traversing.
-  //
-  // If at least one output remains in this iterator's range, that
-  // output will be stored in `*out_tensors` and `false` will be
-  // stored in `*end_of_sequence`.
-  //
-  // If no more outputs remain in this iterator's range, `true` will
-  // be stored in `*end_of_sequence`, and the content of
-  // `*out_tensors` will be undefined.
-  //
-  // This method is thread-safe.
-  //
-  // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and
-  // potentially remove this method.
-  virtual Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
-                         bool* end_of_sequence) = 0;
-
-  // Returns a vector of DataType values, representing the respective
-  // element types of each tuple component in the outputs of this
-  // iterator.
-  virtual const DataTypeVector& output_dtypes() const = 0;
-
-  // Returns a vector of tensor shapes, representing the respective
-  // (and possibly partially defined) shapes of each tuple component
-  // in the outputs of this iterator.
-  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
-
-  // Saves the state of this iterator.
-  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) {
-    return SaveInternal(writer);
-  }
-
-  // Restores the state of this iterator.
-  virtual Status Restore(IteratorContext* ctx, IteratorStateReader* reader) {
-    return RestoreInternal(ctx, reader);
-  }
-
- protected:
-  // This is needed so that sub-classes of IteratorBase can call
-  // `SaveInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
-  Status SaveParent(IteratorStateWriter* writer,
-                    const std::unique_ptr<IteratorBase>& parent) {
-    return parent->SaveInternal(writer);
-  }
-
-  // This is needed so that sub-classes of IteratorBase can call
-  // `RestoreInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
-  Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader,
-                       const std::unique_ptr<IteratorBase>& parent) {
-    return parent->RestoreInternal(ctx, reader);
-  }
-
-  // Saves the state of this iterator recursively.
-  virtual Status SaveInternal(IteratorStateWriter* writer) {
-    return errors::Unimplemented("SaveInternal");
-  }
-
-  // Restores the state of this iterator recursively.
-  virtual Status RestoreInternal(IteratorContext* ctx,
-                                 IteratorStateReader* reader) {
-    return errors::Unimplemented("RestoreInternal");
-  }
-};
-
-// Represents a (potentially infinite) range of outputs, where each
-// output is a tuple of tensors.
-class DatasetBase : public core::RefCounted {
- public:
-  // Returns a new iterator for iterating over the range of elements in
-  // this dataset.
-  //
-  // This method may be called multiple times on the same instance,
-  // and the resulting iterators will have distinct state. Each
-  // iterator will traverse all elements in this dataset from the
-  // start.
-  //
-  // Ownership of the created iterator will be transferred to the caller.
-  //
-  // The prefix identifies the sequence of iterators leading up to the newly
-  // created iterator.
-  virtual std::unique_ptr<IteratorBase> MakeIterator(
-      const string& prefix) const = 0;
-
-  // Returns a vector of DataType values, representing the respective
-  // element types of each tuple component in the outputs of this
-  // dataset.
-  virtual const DataTypeVector& output_dtypes() const = 0;
-
-  // Returns a vector of tensor shapes, representing the respective
-  // (and possibly partially defined) shapes of each tuple component
-  // in the outputs of this dataset.
-  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
-
-  // A human-readable debug string for this dataset.
-  virtual string DebugString() = 0;
-
-  // Serializes the dataset and writes it to the `writer`.
-  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const {
-    return errors::Unimplemented("DatasetBase::Save");
-  }
-
- protected:
-  // TODO(srbs): Ideally all graph related logic should reside in
-  // GraphDatasetBase. However, that would require Datasets defined in all ops
-  // to derive from GraphDatasetBase. Once that is done we can move
-  // DatasetGraphDefBuilder and AsGraphDefInternal to GraphDatasetBase.
-  class DatasetGraphDefBuilder : public GraphDefBuilderWrapper {
-   public:
-    DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {}
-    Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset,
-                            Node** output) {
-      return dataset->AsGraphDefInternal(ctx, this, output);
-    }
-  };
-
-  virtual Status AsGraphDefInternal(OpKernelContext* ctx,
-                                    DatasetGraphDefBuilder* b,
-                                    Node** node) const {
-    return AsGraphDefInternal(b, node);
-  }
-
-  virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
-                                    Node** node) const {
-    return errors::Unimplemented("AsGraphDefInternal");
-  }
-};
-
-// Base-class for datasets that are built by ops.
-class GraphDatasetBase : public DatasetBase {
- public:
-  GraphDatasetBase(OpKernelContext* ctx)
-      : op_name_(ctx->op_kernel().type_string()) {}
-
-  const string op_name() const { return op_name_; }
-
-  Status Save(OpKernelContext* ctx,
-              IteratorStateWriter* writer) const override {
-    string serialized_graph_def;
-    string output_node;
-    TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node));
-    TF_RETURN_IF_ERROR(
-        writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
-    TF_RETURN_IF_ERROR(
-        writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
-    return Status::OK();
-  }
-
-  // Key for storing the Dataset graph in the serialized format.
-  static const char kDatasetGraphKey[];
-
-  // Key for storing the output node of the Dataset graph in the serialized
-  // format.
-  static const char kDatasetGraphOutputNodeKey[];
-
- private:
-  Status Serialize(OpKernelContext* ctx, string* serialized_graph_def,
-                   string* output_node) const;
-
-  const string op_name_;
-};
-
-// Represents an iterator that is associated with a particular parent dataset.
-template <class DatasetType>
-class DatasetIterator : public IteratorBase {
- public:
-  struct Params {
-    // Owns one reference on the shared dataset resource.
-    const DatasetType* dataset;
-
-    // Identifies the sequence of iterators leading up to this iterator.
-    const string prefix;
-  };
-
-  explicit DatasetIterator(const Params& params) : params_(params) {
-    params_.dataset->Ref();
-  }
-
-  ~DatasetIterator() override { params_.dataset->Unref(); }
-
-  // The dataset from which this iterator was created.
-  const DatasetType* dataset() const { return params_.dataset; }
-
-  // The sequence of iterators leading up to this iterator.
-  const string prefix() const { return params_.prefix; }
-
-  const DataTypeVector& output_dtypes() const override {
-    return params_.dataset->output_dtypes();
-  }
-
-  const std::vector<PartialTensorShape>& output_shapes() const override {
-    return params_.dataset->output_shapes();
-  }
-
-  Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
-                 bool* end_of_sequence) final {
-    port::Tracing::TraceMe activity(params_.prefix);
-    Status s = GetNextInternal(ctx, out_tensors, end_of_sequence);
-    if (TF_PREDICT_FALSE(errors::IsOutOfRange(s) && !*end_of_sequence)) {
-      s = errors::Internal(
-          "Iterator \"", params_.prefix,
-          "\" returned OutOfRange without setting `*end_of_sequence`. This "
-          "indicates that an error may have occurred. Original message: ",
-          s.error_message());
-      LOG(ERROR) << s;
-    }
-    return s;
-  }
-
-  Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final {
-    TF_RETURN_IF_ERROR(dataset()->Save(ctx, writer));
-    return IteratorBase::Save(ctx, writer);
-  }
-
- protected:
-  // Internal implementation of GetNext that is wrapped in tracing logic.
-  virtual Status GetNextInternal(IteratorContext* ctx,
-                                 std::vector<Tensor>* out_tensors,
-                                 bool* end_of_sequence) = 0;
-
-  string full_name(const string& name) const {
-    return strings::StrCat(prefix(), ":", name);
-  }
-
- private:
-  Params params_;
-};
-
-// Encapsulates the work required to plug a DatasetBase into the core TensorFlow
-// graph execution engine.
-class DatasetOpKernel : public OpKernel {
- public:
-  DatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-  void Compute(OpKernelContext* ctx) final;
-
- protected:
-  // Subclasses should implement this method. It will be called during Compute
-  // execution.
-  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase** output) = 0;
-
-  template <typename T>
-  Status ParseScalarArgument(OpKernelContext* ctx,
-                             const StringPiece& argument_name, T* output) {
-    const Tensor* argument_t;
-    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
-    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
-      return errors::InvalidArgument(argument_name, " must be a scalar");
-    }
-    *output = argument_t->scalar<T>()();
-    return Status::OK();
-  }
-};
-
-// Encapsulates the work required to plug unary Datasets into the core
-// TensorFlow graph execution engine.
-class UnaryDatasetOpKernel : public DatasetOpKernel {
- public:
-  UnaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
-
- protected:
-  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
-  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                           DatasetBase** output) = 0;
-};
-
-// Encapsulates the work required to plug binary Datasets into the core
-// TensorFlow graph execution engine.
-class BinaryDatasetOpKernel : public DatasetOpKernel {
- public:
-  BinaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
-
- protected:
-  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
-  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                           DatasetBase* another_input,
-                           DatasetBase** output) = 0;
-};
-
-// Validates and extracts a `DatasetBase` object from `tensor`.
-//
-// `tensor` must have been written by a call to SetVariantTensorToDataset().
-//
-// The retrieved pointer is a borrowed reference to the dataset, which is owned
-// by the tensor. The consumer must either acquire its own reference to the
-// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
-// destroyed or mutated while the retrieved pointer is in use.
-Status GetDatasetFromVariantTensor(const Tensor& tensor,
-                                   DatasetBase** out_dataset);
-
-// Stores a `DatasetBase` object in `tensor`.
-//
-// The ownership of `dataset` is transferred to `tensor`.
-Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
-
-}  // namespace tensorflow
+#include "tensorflow/core/framework/dataset.h"
 
 #endif  // TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
-- 
GitLab


From ca22df219cee6ae1daf137fa174f9d0f3874d364 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 2 Feb 2018 17:59:16 -0800
Subject: [PATCH 618/966] Automated g4 rollback of changelist 184347012

PiperOrigin-RevId: 184360818
---
 tensorflow/core/framework/tensor.cc  | 4 ++--
 tensorflow/core/framework/tensor.h   | 6 ++++--
 tensorflow/python/kernel_tests/BUILD | 1 -
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 4f08cdc1d7..77a3edcc10 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -615,11 +615,11 @@ void Tensor::CheckType(DataType expected_dtype) const {
 
 void Tensor::CheckTypeAndIsAligned(DataType expected_dtype) const {
   CHECK_EQ(dtype(), expected_dtype);
-  CHECK(IsAligned());
+  CHECK(IsAligned()) << "CheckTypeAndIsAligned";
 }
 
 void Tensor::CheckIsAlignedAndSingleElement() const {
-  CHECK(IsAligned());
+  CHECK(IsAligned()) << "Aligned and single element";
   CHECK_EQ(1, NumElements()) << "Must have a one element tensor";
 }
 
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 92d10f0d8c..94c39c53a6 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -660,7 +660,8 @@ void Tensor::FillDimsAndValidateCompatibleShape(
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::shaped(
     gtl::ArraySlice<int64> new_sizes) {
-  CheckTypeAndIsAligned(DataTypeToEnum<T>::v());
+  CheckType(DataTypeToEnum<T>::v());
+  CHECK(IsAligned());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
   FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims);
@@ -687,7 +688,8 @@ typename TTypes<T, NDIMS>::UnalignedTensor Tensor::unaligned_shaped(
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::shaped(
     gtl::ArraySlice<int64> new_sizes) const {
-  CheckTypeAndIsAligned(DataTypeToEnum<T>::v());
+  CheckType(DataTypeToEnum<T>::v());
+  CHECK(IsAligned());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
   FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::ConstTensor(base<T>(), dims);
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 54011b2b4d..c87b7652ad 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -96,7 +96,6 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     grpc_enabled = True,
-    tags = ["no_oss_gpu_py3"],
 )
 
 cuda_py_test(
-- 
GitLab


From 09d2efecf44bf313d2e03abdc1c8884cf48e23ae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 18:03:11 -0800
Subject: [PATCH 619/966] Propagate outfeed sharding, if specified from
 TensorFlow.

PiperOrigin-RevId: 184361221
---
 tensorflow/compiler/xla/service/service.cc               | 6 +++---
 tensorflow/compiler/xla/service/user_computation.cc      | 6 ++----
 tensorflow/compiler/xla/service/user_computation.h       | 3 ++-
 tensorflow/compiler/xla/service/user_computation_test.cc | 3 ++-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index a57b7e5717..98dfc89867 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -1453,9 +1453,9 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status = computation->AddInfeedInstruction(arg->infeed_request());
       break;
     case OpRequest::kOutfeedRequest:
-      TF_RETURN_IF_ERROR(
-          computation->AddOutfeedInstruction(arg->outfeed_request()));
-      return tensorflow::Status::OK();
+      handle_status =
+          computation->AddOutfeedInstruction(arg->outfeed_request());
+      break;
     case OpRequest::kMapRequest: {
       TF_ASSIGN_OR_RETURN(
           UserComputation * to_apply,
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 2ea6507900..ef9c80b043 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -1185,7 +1185,7 @@ StatusOr<ComputationDataHandle> UserComputation::AddInfeedInstruction(
   return handle;
 }
 
-Status UserComputation::AddOutfeedInstruction(
+StatusOr<ComputationDataHandle> UserComputation::AddOutfeedInstruction(
     const OutfeedRequest& outfeed_request) {
   tensorflow::mutex_lock lock(mutex_);
 
@@ -1197,8 +1197,6 @@ Status UserComputation::AddOutfeedInstruction(
   // Verify that operand is valid.
   TF_RETURN_IF_ERROR(LookUpRequest(outfeed_request.operand()).status());
 
-  // No handle is returned, but a handle must be assigned to this instruction
-  // for computation versioning.
   ComputationDataHandle handle = CreateComputationDataHandle();
   OperationRequest& request =
       (*session_computation_.mutable_requests())[handle.handle()];
@@ -1209,7 +1207,7 @@ Status UserComputation::AddOutfeedInstruction(
   VLOG(1) << "AddOutfeedInstruction (" << GetVersionedHandleInternal()
           << "), data handle " << handle.handle() << ": "
           << outfeed_request.ShortDebugString();
-  return Status::OK();
+  return handle;
 }
 
 StatusOr<ComputationDataHandle> UserComputation::AddCallInstruction(
diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h
index 4f92e58877..54bb24d6d7 100644
--- a/tensorflow/compiler/xla/service/user_computation.h
+++ b/tensorflow/compiler/xla/service/user_computation.h
@@ -146,7 +146,8 @@ class UserComputation {
       const InfeedRequest& infeed_request);
 
   // Enqueues an outfeed instruction onto this user computation.
-  Status AddOutfeedInstruction(const OutfeedRequest& outfeed_request);
+  StatusOr<ComputationDataHandle> AddOutfeedInstruction(
+      const OutfeedRequest& outfeed_request);
 
   // Enqueues a call instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddCallInstruction(
diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc
index ca02115863..2fa163953f 100644
--- a/tensorflow/compiler/xla/service/user_computation_test.cc
+++ b/tensorflow/compiler/xla/service/user_computation_test.cc
@@ -67,7 +67,8 @@ TEST_F(UserComputationTest, SimpleComputation) {
   *outfeed_request.mutable_operand() = constant_handle;
   *outfeed_request.mutable_shape() = kVectorShape;
   outfeed_request.set_outfeed_config("abc");
-  TF_ASSERT_OK(computation.AddOutfeedInstruction(outfeed_request));
+  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle outfeed_handle,
+                          computation.AddOutfeedInstruction(outfeed_request));
 
   auto hlo_resolver = [](const VersionedComputationHandle& handle) {
     return nullptr;
-- 
GitLab


From 2b932c28d60ec4f248d950cd2ef69a7eb98bb66d Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Fri, 2 Feb 2018 18:57:10 -0800
Subject: [PATCH 620/966] [XLA] Minor cleanups related to multi-output fusion.

- Add some comments about preexisting invariants, and add some CHECKs.

- In the LoopEmitter constructor, materialize the given
  ArraySlice<IrArray> to a vector, so we don't rely on the given
  ArraySlice having any particular lifetime.

- Add the invariant that the LoopEmitter constructor which takes a
  list of IrArrays is only for multi-output fusion.  Previously it said:
  If you only pass one array, then treat it as regular fusion.  But this
  results in an LLVM type mismatch, because the given
  target_element_generator should be passing a struct with one element.

PiperOrigin-RevId: 184365310
---
 .../xla/service/gpu/hlo_to_ir_bindings.cc     |  6 +-
 .../xla/service/gpu/ir_emitter_unnested.cc    | 22 +++----
 .../xla/service/gpu/parallel_loop_emitter.h   |  5 ++
 .../compiler/xla/service/hlo_instruction.h    |  4 +-
 .../xla/service/llvm_ir/fused_ir_emitter.h    |  6 ++
 .../xla/service/llvm_ir/loop_emitter.cc       | 57 ++++++++++---------
 .../xla/service/llvm_ir/loop_emitter.h        |  8 ++-
 7 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index c2115c4999..dd4426ca7b 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -191,7 +191,11 @@ static bool BuffersInvariantWithinConsumer(
 llvm_ir::IrArray HloToIrBindings::GetIrArray(const HloInstruction& hlo,
                                              const HloInstruction& consumer,
                                              const ShapeIndex& shape_index) {
-  llvm_ir::IrArray ir_array(GetBasePointer(hlo, shape_index),
+  llvm::Value* base_ptr = GetBasePointer(hlo, shape_index);
+  CHECK_NE(base_ptr, nullptr)
+      << "Buffer not assigned for shape_index " << shape_index.ToString()
+      << " of " << hlo.ToString();
+  llvm_ir::IrArray ir_array(base_ptr,
                             ShapeUtil::GetSubshape(hlo.shape(), shape_index));
   alias_analysis_.AddAliasingInformationToIrArray(hlo, &ir_array);
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 40725739e2..a4847f6ca9 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -1657,24 +1657,24 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
 }
 
 Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
-  tensorflow::gtl::ArraySlice<HloInstruction*> operands(tuple->operands());
-  bool all_tuple_elements_have_buffer = std::all_of(
-      operands.begin(), operands.end(), [this](HloInstruction* tuple_element) {
+  bool all_tuple_elements_have_buffer =
+      c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) {
         return ir_emitter_context_->buffer_assignment().HasTopLevelAllocation(
             tuple_element);
       });
-  // Tuples (especially output tuples) can take too many tuple elements,
-  // causing the kernel emitted exceeds the parameter space limit
-  // (b/31336476). As an optimization, if all tuple elements have a buffer, we
-  // collect their buffer addresses in a host array, and then copy that array
-  // to the tuple's buffer.
+  // Tuples (especially tuples that are the final result of a computation) can
+  // be so huge that if we were to emit a kernel that took each tuple element as
+  // a parameter, we would exceed the max allowable number of parameters to a
+  // GPU kernel, b/31336476. As an optimization, if all tuple elements have a
+  // buffer, we collect their buffer addresses in a host array, and then copy
+  // that array to the tuple's buffer.
   //
   // Some tuple elements (e.g. const or bitcast of const) might not have a
-  // buffer -- their contents are stored in code. In that case, we fall back
-  // to emitting kernels which have access to their buffer addresses in code.
+  // buffer -- their contents are stored in code. In that case, we fall back to
+  // emitting kernels which have access to their buffer addresses in code.
   if (all_tuple_elements_have_buffer) {
     std::vector<BufferAllocation::Slice> tuple_element_buffers;
-    for (const HloInstruction* tuple_element : operands) {
+    for (const HloInstruction* tuple_element : tuple->operands()) {
       tuple_element_buffers.push_back(GetAllocationSlice(*tuple_element));
     }
     thunk_sequence_->emplace_back(MakeUnique<TupleThunk>(
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
index 934e7e1919..8ed63a854a 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
@@ -42,6 +42,11 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
                       const LaunchDimensions& launch_dimensions,
                       llvm::IRBuilder<>* ir_builder);
 
+  // Constructs a loop emitter for a loop that generates on element of each of N
+  // arrays on each iteration.
+  //
+  // This is used in multi-output fusion.  target_element_generator should
+  // produce a struct with N elements, one for each of target_arrays.
   ParallelLoopEmitter(
       const llvm_ir::ElementGenerator& target_element_generator,
       tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 84b469688b..bce9ebdda8 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -874,8 +874,8 @@ class HloInstruction {
   // Returns true if this instruction is a fusion instruction that generates
   // multiple outputs.
   const bool IsMultiOutputFusion() const {
-    return (opcode() == HloOpcode::kFusion &&
-            fused_expression_root()->opcode() == HloOpcode::kTuple);
+    return opcode() == HloOpcode::kFusion &&
+           fused_expression_root()->opcode() == HloOpcode::kTuple;
   }
 
   FusionKind fusion_kind() const {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index 9ad7cd82cb..242062e616 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -34,6 +34,12 @@ namespace xla {
 
 // Unlike IrEmitter, this creates host functions which emit IR to generate the
 // output element at the given index. It is used to generate fused operations.
+//
+// This class handles both vanilla fusion and multi-output fusion.  In the MOF
+// case, the fusion node ends with a kTuple instruction, and the root generator
+// returned by this emitter returns an LLVM struct with N elements, one for each
+// element of the arrays in the tuple.  It follows that the arrays in the tuple
+// must have the same length.
 class FusedIrEmitter : public DfsHloVisitorWithDefault {
  public:
   using Generator = llvm_ir::ElementGenerator;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index a5f7c850c3..b6b918ec78 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -51,37 +51,40 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
       shape_(target_array.GetShape()),
       ir_builder_(ir_builder) {}
 
+static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion(
+    const ElementGenerator& target_element_generator,
+    const std::vector<IrArray>& target_arrays, llvm::IRBuilder<>* ir_builder) {
+  return [=](const llvm_ir::IrArray::Index array_index) {
+    TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
+                        target_element_generator(array_index));
+    CHECK(target_element->getType()->isStructTy())
+        << "This BodyEmitter is for multi-output fusion, but target element "
+           "generator does not produce values of struct type.";
+    CHECK_EQ(target_element->getType()->getStructNumElements(),
+             target_arrays.size());
+
+    for (int64 i = 0; i < target_arrays.size(); ++i) {
+      target_arrays[i].EmitWriteArrayElement(
+          array_index, ir_builder->CreateExtractValue(target_element, i),
+          ir_builder);
+    }
+    return Status::OK();
+  };
+}
+
 LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
                          tensorflow::gtl::ArraySlice<IrArray> target_arrays,
                          llvm::IRBuilder<>* ir_builder)
-    : body_emitter_([=](const llvm_ir::IrArray::Index array_index)
-                        -> ::tensorflow::Status {
-        // Convert target_element_generator to a BodyEmitter.
-        TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
-                            target_element_generator(array_index));
-        if (target_arrays.size() == 1) {
-          target_arrays[0].EmitWriteArrayElement(array_index, target_element,
-                                                 ir_builder);
-          return tensorflow::Status::OK();
-        }
-
-        for (int64 i = 0; i < target_arrays.size(); ++i) {
-          target_arrays[i].EmitWriteArrayElement(
-              array_index, ir_builder_->CreateExtractValue(target_element, i),
-              ir_builder);
-        }
-        return tensorflow::Status::OK();
-      }),
+    : body_emitter_(MakeBodyEmitterForMultiOutputFusion(
+          target_element_generator,
+          std::vector<IrArray>(target_arrays.begin(), target_arrays.end()),
+          ir_builder)),
+      shape_(target_arrays[0].GetShape()),
       ir_builder_(ir_builder) {
-  if (target_arrays.size() > 1) {
-    // The sanity check for multiple outputs.
-    shape_ = target_arrays[0].GetShape();
-    for (int64 i = 1; i < target_arrays.size(); ++i) {
-      const Shape& element_shape = target_arrays[i].GetShape();
-      CHECK(ShapeUtil::SameDimensions(shape_, element_shape));
-    }
-  } else {
-    shape_ = target_arrays[0].GetShape();
+  // Sanity check: In multi-output fusion, all shapes produced must have the
+  // same dimensions.
+  for (const IrArray& array : target_arrays) {
+    CHECK(ShapeUtil::SameDimensions(shape_, array.GetShape()));
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
index 1ef1dc2464..0fc528439a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -47,10 +47,16 @@ class LoopEmitter {
   // element of the given target array.
   LoopEmitter(const ElementGenerator& target_element_generator,
               const IrArray& target_array, llvm::IRBuilder<>* ir_builder);
-  // Same as previous method except emits multiple targets in an array.
+
+  // Constructs a LoopEmitter that emits one element into each of N separate
+  // arrays on each iteration of the loop.
+  //
+  // This is used for multi-output fusion.  target_element_generator must
+  // produce an LLVM struct with N elements.
   LoopEmitter(const ElementGenerator& target_element_generator,
               tensorflow::gtl::ArraySlice<IrArray> target_arrays,
               llvm::IRBuilder<>* ir_builder);
+
   LoopEmitter(const LoopEmitter&) = delete;
   LoopEmitter& operator=(const LoopEmitter&) = delete;
   virtual ~LoopEmitter() = default;
-- 
GitLab


From 0fab6e888c5f90de3e878566123c1906261ce27e Mon Sep 17 00:00:00 2001
From: Mingsheng Hong <hongm@google.com>
Date: Fri, 2 Feb 2018 19:10:39 -0800
Subject: [PATCH 621/966] Extended TFE_OpSetDevice() with the ability to set an
 op device from non-GPU back to GPU.

Added unit testing, and also refined unit test logic for checking the presence
of a GPU device. The latter is needed when we add XLA device support.

PiperOrigin-RevId: 184366172
---
 tensorflow/c/eager/c_api.cc      | 26 +++++----
 tensorflow/c/eager/c_api.h       |  3 ++
 tensorflow/c/eager/c_api_test.cc | 92 +++++++++++++++++++++++---------
 3 files changed, 84 insertions(+), 37 deletions(-)

diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index d5b9bff505..d65b592895 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -253,15 +253,6 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
 
 void TFE_DeleteOp(TFE_Op* op) { delete op; }
 
-static void TFE_OpSetDeviceHelper(TFE_Op* op, tensorflow::Device* device,
-                                  TF_Status* status) {
-  // Questionable heuristic: Place the op on the same device as the first input
-  // placed outside of host memory?
-  if (IsCPU(op->device) && !IsCPU(device)) {
-    op->device = device;
-  }
-}
-
 void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) {
   tensorflow::Device* d = nullptr;
   if (device_name != nullptr && strlen(device_name) > 0) {
@@ -269,11 +260,24 @@ void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) {
         op->ctx->session->device_mgr->LookupDevice(device_name, &d);
     if (!status->status.ok()) return;
   }
-  TFE_OpSetDeviceHelper(op, d, status);
+  op->device = d;
+}
+
+const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) {
+  tensorflow::Device* device =
+      (op->device == nullptr) ? op->ctx->devices()[0] : op->device;
+  return device->name().c_str();
 }
 
 void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
-  TFE_OpSetDeviceHelper(op, h->d, status);
+  // Questionable heuristic ...
+  //
+  // Motivation: After an 'op' is placed on GPU because some of its earlier
+  // inputs are on GPU, we want to keep the 'op' there, even if some later
+  // inputs of it are not on GPU.
+  if (IsCPU(op->device) && !IsCPU(h->d)) {
+    op->device = h->d;
+  }
   if (!status->status.ok()) return;
   op->inputs.push_back(h->t);
   op->input_devices.push_back(h->d);
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 387de07894..6a2aff1591 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -154,6 +154,9 @@ TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op);
 
 TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, const char* device_name,
                                            TF_Status* status);
+// The returned string remains valid throughout the lifetime of 'op'.
+TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(TFE_Op* op,
+                                                  TF_Status* status);
 
 TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status);
 
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 18e7a64435..b0409af87c 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -60,6 +60,31 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) {
   return op;
 }
 
+// If there is a GPU device, returns true and sets 'gpu_device_name'
+// accordingly.
+bool GetGPUDeviceName(TFE_Context* ctx, string* gpu_device_name) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
+  CHECK_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  const int num_devices = TF_DeviceListCount(devices);
+  for (int i = 0; i < num_devices; ++i) {
+    const string device_type(TF_DeviceListType(devices, i, status.get()));
+    CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get());
+    const string device_name(TF_DeviceListName(devices, i, status.get()));
+    CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get());
+    if (device_type == "GPU") {
+      *gpu_device_name = device_name;
+      LOG(INFO) << "Found GPU device " << device_name;
+      TF_DeleteDeviceList(devices);
+      return true;
+    }
+  }
+  TF_DeleteDeviceList(devices);
+  return false;
+}
+
 void BM_InitOp(int iters) {
   tensorflow::testing::StopTiming();
   TF_Status* status = TF_NewStatus();
@@ -288,22 +313,15 @@ TEST(CAPI, TensorHandleSilentCopy) {
   TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get());
   ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 
-  TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
-  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
-  const int num_devices = TF_DeviceListCount(devices);
-
   // Disable the test if no GPU is present.
-  if (num_devices > 1) {
-    const int device_to_use = 1;
-    const string name(TF_DeviceListName(devices, device_to_use, status.get()));
-    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
-
-    TFE_TensorHandle* hgpu =
-        TFE_TensorHandleCopyToDevice(hcpu, ctx, name.c_str(), status.get());
+  string gpu_device_name;
+  if (GetGPUDeviceName(ctx, &gpu_device_name)) {
+    TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice(
+        hcpu, ctx, gpu_device_name.c_str(), status.get());
     ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
 
     TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu);
-    TFE_OpSetDevice(matmul, name.c_str(), status.get());
+    TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status.get());
     ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
     TFE_TensorHandle* retvals[1];
     int num_retvals = 1;
@@ -314,7 +332,6 @@ TEST(CAPI, TensorHandleSilentCopy) {
     TFE_DeleteTensorHandle(hgpu);
   }
 
-  TF_DeleteDeviceList(devices);
   TF_DeleteTensor(t);
   TFE_DeleteTensorHandle(hcpu);
   TFE_DeleteContext(ctx, status.get());
@@ -337,22 +354,15 @@ TEST(CAPI, TensorHandleSilentCopyLocal) {
   TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get());
   ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 
-  TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
-  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
-  const int num_devices = TF_DeviceListCount(devices);
-
   // Disable the test if no GPU is present.
-  if (num_devices > 1) {
-    const int device_to_use = 1;
-    const string name(TF_DeviceListName(devices, device_to_use, status.get()));
-    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
-
-    TFE_TensorHandle* hgpu =
-        TFE_TensorHandleCopyToDevice(hcpu, ctx, name.c_str(), status.get());
+  string gpu_device_name;
+  if (GetGPUDeviceName(ctx, &gpu_device_name)) {
+    TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice(
+        hcpu, ctx, gpu_device_name.c_str(), status.get());
     ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
 
     TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu);
-    TFE_OpSetDevice(matmul, name.c_str(), status.get());
+    TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status.get());
     ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
     TFE_TensorHandle* retvals[1];
     int num_retvals = 1;
@@ -363,13 +373,43 @@ TEST(CAPI, TensorHandleSilentCopyLocal) {
     TFE_DeleteTensorHandle(hgpu);
   }
 
-  TF_DeleteDeviceList(devices);
   TF_DeleteTensor(t);
   TFE_DeleteTensorHandle(hcpu);
   TFE_DeleteContext(ctx, status.get());
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 }
 
+TEST(CAPI, SetAndGetOpDevices) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  TFE_Op* matmul = MatMulOp(ctx, m, m);
+
+  // Disable the test if no GPU is present.
+  string gpu_device_name;
+  if (GetGPUDeviceName(ctx, &gpu_device_name)) {
+    TFE_OpSetDevice(matmul, "GPU:0", status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    const char* device_name = TFE_OpGetDevice(matmul, status);
+    ASSERT_TRUE(strstr(device_name, "GPU:0") != nullptr);
+
+    TFE_OpSetDevice(matmul, "CPU:0", status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    device_name = TFE_OpGetDevice(matmul, status);
+    ASSERT_TRUE(strstr(device_name, "CPU:0") != nullptr);
+  }
+
+  TFE_DeleteOp(matmul);
+  TFE_DeleteTensorHandle(m);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+}
+
 TEST(CAPI, Execute) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
-- 
GitLab


From 39010bef7f72709a87a275060878baac815744c2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 19:40:30 -0800
Subject: [PATCH 622/966] A more efficient implementation of the Op using batch
 operations.

PiperOrigin-RevId: 184367562
---
 tensorflow/contrib/lite/kernels/BUILD         |  1 +
 tensorflow/contrib/lite/kernels/basic_rnn.cc  | 57 ++++----------
 .../kernels/bidirectional_sequence_rnn.cc     | 62 +++-------------
 .../contrib/lite/kernels/internal/BUILD       | 10 +++
 .../lite/kernels/internal/kernel_utils.cc     | 44 +++++++++++
 .../lite/kernels/internal/kernel_utils.h      | 40 ++++++++++
 .../kernels/unidirectional_sequence_rnn.cc    | 74 +++++--------------
 7 files changed, 135 insertions(+), 153 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
 create mode 100644 tensorflow/contrib/lite/kernels/internal/kernel_utils.h

diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 8c40adfae5..a8ef0daede 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -156,6 +156,7 @@ cc_library(
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:gemm_support",
+        "//tensorflow/contrib/lite/kernels/internal:kernel_utils",
         "//tensorflow/contrib/lite/kernels/internal:optimized",
         "//tensorflow/contrib/lite/kernels/internal:optimized_base",
         "//tensorflow/contrib/lite/kernels/internal:quantization_util",
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc
index a0391e030f..2c5074eca3 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 namespace tflite {
@@ -101,50 +102,20 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const int batch_size = input->dims->data[0];
   const int num_units = input_weights->dims->data[0];
   const int input_size = input->dims->data[1];
-  const int input_weights_stride = input_weights->dims->data[1];
-  const int recurrent_weights_stride = recurrent_weights->dims->data[1];
-
-  // For each batch
-  for (int b = 0; b < batch_size; b++) {
-    // Initialize the pointer to input, output and bias.
-    const float* input_ptr_batch = input->data.f + b * input_size;
-    float* output_ptr_batch = output->data.f + b * num_units;
-    float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units;
-
-    // Initialize input_weights and recurrent_weights.
-    const float* input_weights_ptr = input_weights->data.f;
-    const float* recurrent_weights_ptr = recurrent_weights->data.f;
-
-    // Output = bias
-    for (int o = 0; o < num_units; o++) {
-      output_ptr_batch[o] = bias_ptr[o];
-    }
-
-    // Output += input * input_weights
-    for (int o = 0; o < num_units; o++) {
-      for (int i = 0; i < input_size; i++) {
-        output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i];
-      }
-      input_weights_ptr += input_weights_stride;
-    }
-
-    // Output += recurrent_weights * hidden_state
-    for (int o = 0; o < num_units; o++) {
-      for (int h = 0; h < num_units; h++) {
-        output_ptr_batch[o] +=
-            hidden_state_ptr_batch[h] * recurrent_weights_ptr[h];
-      }
-      recurrent_weights_ptr += recurrent_weights_stride;
-    }
-
-    // Output = activation(Output) and update hidden_state
-    for (int o = 0; o < num_units; o++) {
-      output_ptr_batch[o] =
-          (ActivationFunctor(params->activation))(output_ptr_batch[o]);
-      hidden_state_ptr_batch[o] = output_ptr_batch[o];
-    }
-  }
 
+  // Initialize the pointer to hidden state.
+  float* hidden_state_ptr_batch = hidden_state->data.f;
+  // Initialize the pointer to input and output.
+  const float* input_ptr_batch = input->data.f;
+  float* output_ptr_batch = output->data.f;
+  // Initialize input_weights and recurrent_weights.
+  const float* input_weights_ptr = input_weights->data.f;
+  const float* recurrent_weights_ptr = recurrent_weights->data.f;
+
+  kernel_utils::RnnBatchStep(input_ptr_batch, input_weights_ptr,
+                             recurrent_weights_ptr, bias_ptr, input_size,
+                             num_units, batch_size, params->activation,
+                             hidden_state_ptr_batch, output_ptr_batch);
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
index f540816235..aa24c1f34c 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 namespace tflite {
@@ -119,47 +120,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-namespace {
-// Performs one RNN computation step for the input specified by input_ptr_batch.
-// The RNN cell is specified by the pointers to its weights and biases, along
-// with the input size, number of units, strides, activation.
-// The pointers to the hidden state and the output are updated as a result.
-// TODO(mirkov): factor out this function to a shared library.
-void RnnStep(const float* input_ptr_batch, const float* input_weights_ptr,
-             const float* recurrent_weights_ptr, const float* bias_ptr,
-             int input_size, int num_units, int input_weights_stride,
-             int recurrent_weights_stride, TfLiteFusedActivation activation,
-             float* hidden_state_ptr_batch, float* output_ptr_batch) {
-  // Output = bias
-  for (int o = 0; o < num_units; o++) {
-    output_ptr_batch[o] = bias_ptr[o];
-  }
-
-  // Output += input * input_weights
-  for (int o = 0; o < num_units; o++) {
-    for (int i = 0; i < input_size; i++) {
-      output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i];
-    }
-    input_weights_ptr += input_weights_stride;
-  }
-
-  // Output += recurrent_weights * hidden_state
-  for (int o = 0; o < num_units; o++) {
-    for (int h = 0; h < num_units; h++) {
-      output_ptr_batch[o] +=
-          hidden_state_ptr_batch[h] * recurrent_weights_ptr[h];
-    }
-    recurrent_weights_ptr += recurrent_weights_stride;
-  }
-
-  // Output = activation(Output) and update hidden_state
-  for (int o = 0; o < num_units; o++) {
-    output_ptr_batch[o] = (ActivationFunctor(activation))(output_ptr_batch[o]);
-    hidden_state_ptr_batch[o] = output_ptr_batch[o];
-  }
-}
-}  // namespace
-
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteSequenceRNNParams*>(node->builtin_data);
 
@@ -189,15 +149,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const int input_size = input->dims->data[2];
 
   const int fw_num_units = fw_input_weights->dims->data[0];
-  const int fw_input_weights_stride = fw_input_weights->dims->data[1];
-  const int fw_recurrent_weights_stride = fw_recurrent_weights->dims->data[1];
   const float* fw_bias_ptr = fw_bias->data.f;
   const float* fw_input_weights_ptr = fw_input_weights->data.f;
   const float* fw_recurrent_weights_ptr = fw_recurrent_weights->data.f;
 
   const int bw_num_units = bw_input_weights->dims->data[0];
-  const int bw_input_weights_stride = bw_input_weights->dims->data[1];
-  const int bw_recurrent_weights_stride = bw_recurrent_weights->dims->data[1];
   const float* bw_bias_ptr = bw_bias->data.f;
   const float* bw_input_weights_ptr = bw_input_weights->data.f;
   const float* bw_recurrent_weights_ptr = bw_recurrent_weights->data.f;
@@ -212,10 +168,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       float* output_ptr_batch =
           fw_output->data.f + b * fw_num_units * max_time + s * fw_num_units;
 
-      RnnStep(input_ptr_batch, fw_input_weights_ptr, fw_recurrent_weights_ptr,
-              fw_bias_ptr, input_size, fw_num_units, fw_input_weights_stride,
-              fw_recurrent_weights_stride, params->activation,
-              fw_hidden_state_ptr_batch, output_ptr_batch);
+      kernel_utils::RnnBatchStep(
+          input_ptr_batch, fw_input_weights_ptr, fw_recurrent_weights_ptr,
+          fw_bias_ptr, input_size, fw_num_units, /*batch_size=*/1,
+          params->activation, fw_hidden_state_ptr_batch, output_ptr_batch);
     }
     // Backward cell.
     float* bw_hidden_state_ptr_batch =
@@ -226,10 +182,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       float* output_ptr_batch =
           bw_output->data.f + b * bw_num_units * max_time + s * bw_num_units;
 
-      RnnStep(input_ptr_batch, bw_input_weights_ptr, bw_recurrent_weights_ptr,
-              bw_bias_ptr, input_size, bw_num_units, bw_input_weights_stride,
-              bw_recurrent_weights_stride, params->activation,
-              bw_hidden_state_ptr_batch, output_ptr_batch);
+      kernel_utils::RnnBatchStep(
+          input_ptr_batch, bw_input_weights_ptr, bw_recurrent_weights_ptr,
+          bw_bias_ptr, input_size, bw_num_units, /*batch_size=*/1,
+          params->activation, bw_hidden_state_ptr_batch, output_ptr_batch);
     }
   }
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 288f1f8bbc..4691a543e9 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -290,6 +290,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "kernel_utils",
+    srcs = ["kernel_utils.cc"],
+    hdrs = ["kernel_utils.h"],
+    deps = [
+        ":tensor_utils",
+        "//tensorflow/contrib/lite:builtin_op_data",
+    ],
+)
+
 cc_library(
     name = "tensor_utils",
     srcs = [
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
new file mode 100644
index 0000000000..510395126c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+
+namespace tflite {
+namespace kernel_utils {
+
+void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
+                  const float* recurrent_weights_ptr, const float* bias_ptr,
+                  int input_size, int num_units, int batch_size,
+                  TfLiteFusedActivation activation,
+                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
+  // Output = bias
+  tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
+                                        output_ptr_batch);
+  // Output += input * input_weights
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size,
+      output_ptr_batch, /*result_stride=*/1);
+  // Output += recurrent_weights * hidden_state
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch,
+      batch_size, output_ptr_batch, /*result_stride=*/1);
+  // Output = activation(Output) and update hidden_state
+  tensor_utils::ApplyActivationToVector(
+      output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
+  tensor_utils::VectorBatchVectorAssign(output_ptr_batch, num_units, batch_size,
+                                        hidden_state_ptr_batch);
+}
+
+}  // namespace kernel_utils
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
new file mode 100644
index 0000000000..9872d4500b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+
+namespace tflite {
+namespace kernel_utils {
+
+// Performs an RNN batch inference step for inputs specified by input_ptr_batch.
+// The RNN cell is specified by the pointers to its input and recurrent weights,
+// and biases, along with the input size, number of units, activation.
+//
+// The pointers to the hidden state and the output are updated as a result.
+//
+// The pointers with the suffix "_batch" point to data aligned in batch_major
+// order, and each step processes batch_size many inputs from input_ptr_batch,
+// and updates batch_size many outputs and hidden states.
+void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
+                  const float* recurrent_weights_ptr, const float* bias_ptr,
+                  int input_size, int num_units, int batch_size,
+                  TfLiteFusedActivation activation,
+                  float* hidden_state_ptr_batch, float* output_ptr_batch);
+
+}  // namespace kernel_utils
+}  // namespace tflite
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
index 7ce87e4deb..ac00c37b67 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 namespace tflite {
@@ -88,42 +89,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-namespace {
-void RnnStep(const float* input_ptr_batch, const float* input_weights_ptr,
-             const float* recurrent_weights_ptr, const float* bias_ptr,
-             int input_size, int num_units, int input_weights_stride,
-             int recurrent_weights_stride, TfLiteFusedActivation activation,
-             float* hidden_state_ptr_batch, float* output_ptr_batch) {
-  // Output = bias
-  for (int o = 0; o < num_units; o++) {
-    output_ptr_batch[o] = bias_ptr[o];
-  }
-
-  // Output += input * input_weights
-  for (int o = 0; o < num_units; o++) {
-    for (int i = 0; i < input_size; i++) {
-      output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i];
-    }
-    input_weights_ptr += input_weights_stride;
-  }
-
-  // Output += recurrent_weights * hidden_state
-  for (int o = 0; o < num_units; o++) {
-    for (int h = 0; h < num_units; h++) {
-      output_ptr_batch[o] +=
-          hidden_state_ptr_batch[h] * recurrent_weights_ptr[h];
-    }
-    recurrent_weights_ptr += recurrent_weights_stride;
-  }
-
-  // Output = activation(Output) and update hidden_state
-  for (int o = 0; o < num_units; o++) {
-    output_ptr_batch[o] = (ActivationFunctor(activation))(output_ptr_batch[o]);
-    hidden_state_ptr_batch[o] = output_ptr_batch[o];
-  }
-}
-}  // namespace
-
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteSequenceRNNParams*>(node->builtin_data);
 
@@ -147,30 +112,25 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       (time_major) ? input->dims->data[0] : input->dims->data[1];
   const int num_units = input_weights->dims->data[0];
   const int input_size = input->dims->data[2];
-  const int input_weights_stride = input_weights->dims->data[1];
-  const int recurrent_weights_stride = recurrent_weights->dims->data[1];
 
   // Initialize input_weights and recurrent_weights.
   const float* input_weights_ptr = input_weights->data.f;
   const float* recurrent_weights_ptr = recurrent_weights->data.f;
 
   if (time_major) {
-    // Unroll the sequence
+    // Initialize the pointer to hidden state.
+    float* hidden_state_ptr_batch = hidden_state->data.f;
+    // Unroll the sequence and use batch batch operations for efficiency.
     for (int s = 0; s < max_time; s++) {
-      for (int b = 0; b < batch_size; b++) {
-        // Initialize the pointer to hidden state.
-        float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units;
-        // Initialize the pointer to input and output.
-        const float* input_ptr_batch =
-            input->data.f + s * input_size * batch_size + b * input_size;
-        float* output_ptr_batch =
-            output->data.f + s * num_units * batch_size + b * num_units;
-
-        RnnStep(input_ptr_batch, input_weights_ptr, recurrent_weights_ptr,
-                bias_ptr, input_size, num_units, input_weights_stride,
-                recurrent_weights_stride, params->activation,
-                hidden_state_ptr_batch, output_ptr_batch);
-      }
+      // Initialize the pointer to input and output.
+      const float* input_ptr_batch =
+          input->data.f + s * input_size * batch_size;
+      float* output_ptr_batch = output->data.f + s * num_units * batch_size;
+
+      kernel_utils::RnnBatchStep(input_ptr_batch, input_weights_ptr,
+                                 recurrent_weights_ptr, bias_ptr, input_size,
+                                 num_units, batch_size, params->activation,
+                                 hidden_state_ptr_batch, output_ptr_batch);
     }
   } else {
     // For each batch
@@ -184,10 +144,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         float* output_ptr_batch =
             output->data.f + b * num_units * max_time + s * num_units;
 
-        RnnStep(input_ptr_batch, input_weights_ptr, recurrent_weights_ptr,
-                bias_ptr, input_size, num_units, input_weights_stride,
-                recurrent_weights_stride, params->activation,
-                hidden_state_ptr_batch, output_ptr_batch);
+        kernel_utils::RnnBatchStep(
+            input_ptr_batch, input_weights_ptr, recurrent_weights_ptr, bias_ptr,
+            input_size, num_units, /*batch_size=*/1, params->activation,
+            hidden_state_ptr_batch, output_ptr_batch);
       }
     }
   }
-- 
GitLab


From a4f4d3131cdd63c871e864d631f8c8466924f10c Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Fri, 2 Feb 2018 22:39:45 -0800
Subject: [PATCH 623/966] Fix breakage: Can't build TFLite with Bazel for Mac /
 iOS.

PiperOrigin-RevId: 184375534
---
 .../contrib/lite/kernels/internal/optimized/cblas_conv.h   | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
index a9b6663122..fcb9fac671 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
@@ -19,12 +19,9 @@ limitations under the License.
 // The Conv implementation based on CBLAS interface. This is only used on iOS
 // for now, utilizing Apple's Accelerate framework.
 
-#if defined(__APPLE__)
-#include <Accelerate/Accelerate.h>
-#else
+// TODO(ycling): Update the BUILD file and integrate with Apple Accelerate
+// Famework when it's available.
 #include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h"
-#endif  // __APPLE__
-
 #include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
-- 
GitLab


From 34bff30979896879815dd6fc4d77c1a37d9b98a0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 2 Feb 2018 23:02:16 -0800
Subject: [PATCH 624/966] [XLA] Add tests for Clamp with scalars S32 and U32.

PiperOrigin-RevId: 184376425
---
 .../xla/tests/scalar_computations_test.cc     | 60 ++++++++++++++++++-
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index 43e4d891a1..4da6ee9160 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -737,7 +737,61 @@ XLA_TEST_F(ScalarComputationsTest, PowScalar) {
   ComputeAndCompareR0<float>(&builder, 8.0, {}, error_spec_);
 }
 
-XLA_TEST_F(ScalarComputationsTest, ClampScalarHigh) {
+XLA_TEST_F(ScalarComputationsTest, ClampScalarHighS32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<int32>(-1),  // The lower bound.
+                builder.ConstantR0<int32>(5),   // The operand to be clamped.
+                builder.ConstantR0<int32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<int32>(&builder, 3, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleS32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<int32>(-1),  // The lower bound.
+                builder.ConstantR0<int32>(2),   // The operand to be clamped.
+                builder.ConstantR0<int32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<int32>(&builder, 2, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarLowS32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<int32>(-1),  // The lower bound.
+                builder.ConstantR0<int32>(-5),  // The operand to be clamped.
+                builder.ConstantR0<int32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<int32>(&builder, -1, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarHighU32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<uint32>(1),   // The lower bound.
+                builder.ConstantR0<uint32>(5),   // The operand to be clamped.
+                builder.ConstantR0<uint32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<uint32>(&builder, 3, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleU32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<uint32>(1),   // The lower bound.
+                builder.ConstantR0<uint32>(2),   // The operand to be clamped.
+                builder.ConstantR0<uint32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<uint32>(&builder, 2, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarLowU32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<uint32>(1),   // The lower bound.
+                builder.ConstantR0<uint32>(0),   // The operand to be clamped.
+                builder.ConstantR0<uint32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<uint32>(&builder, 1, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarHighF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(5.0f),   // The operand to be clamped.
@@ -746,7 +800,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarHigh) {
   ComputeAndCompareR0<float>(&builder, 3.0, {}, error_spec_);
 }
 
-XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddle) {
+XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(2.5f),   // The operand to be clamped.
@@ -755,7 +809,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddle) {
   ComputeAndCompareR0<float>(&builder, 2.5, {}, error_spec_);
 }
 
-XLA_TEST_F(ScalarComputationsTest, ClampScalarLow) {
+XLA_TEST_F(ScalarComputationsTest, ClampScalarLowF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(-5.0f),  // The operand to be clamped.
-- 
GitLab


From 3eef7708e92f80c73ba18abe702cd6b1abeb58f7 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Sat, 3 Feb 2018 18:39:42 +0900
Subject: [PATCH 625/966] fix typo

---
 tensorflow/c/c_api.cc                     | 2 +-
 tensorflow/core/framework/numeric_types.h | 2 +-
 tensorflow/python/kernel_tests/BUILD      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 3c7f041b39..cb1cc178d1 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -2144,7 +2144,7 @@ Status CopyGraph(Graph* src_graph, Graph* dst_graph,
     opts.return_tensors.push_back(ToTensorId(nodes_to_return[i]));
   }
 
-  // TOOD(skyewm): change to OutputTensor
+  // TODO(skyewm): change to OutputTensor
   tensorflow::ImportGraphDefResults results;
   TF_RETURN_IF_ERROR(
       ImportGraphDef(opts, gdef, dst_graph, dst_refiner, &results));
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index 99a5d0a054..8249059c29 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -44,7 +44,7 @@ typedef Eigen::QUInt16 quint16;
 }  // namespace tensorflow
 
 namespace Eigen {
-// TOOD(xpan): We probably need to overwrite more methods to have correct eigen
+// TODO(xpan): We probably need to overwrite more methods to have correct eigen
 // behavior. E.g. loest(), is_integer, etc. See NumTraits.h in eigen.
 template <>
 struct NumTraits<tensorflow::bfloat16>
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 3c4d962995..d041423a2e 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1295,7 +1295,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "control_flow_ops_py_test",
-    # TOOD(b/70473603): change this back to "small" once the C API is
+    # TODO(b/70473603): change this back to "small" once the C API is
     # permanently enabled
     size = "medium",
     srcs = ["control_flow_ops_py_test.py"],
-- 
GitLab


From a42450a76e43154cc3bf8977c2e9c8afb1d08621 Mon Sep 17 00:00:00 2001
From: RJ Ryan <rjryan@google.com>
Date: Sat, 3 Feb 2018 09:46:31 -0800
Subject: [PATCH 626/966] [tf-signal] Fix exception when input shape is unknown
 in mfccs_from_log_mel_spectrograms.

PiperOrigin-RevId: 184400783
---
 .../contrib/signal/python/kernel_tests/mfcc_ops_test.py  | 9 +++++++++
 tensorflow/contrib/signal/python/ops/mfcc_ops.py         | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
index c04f1cf5ba..e7743bdcba 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.signal.python.ops import mfcc_ops
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import spectral_ops_test_util
@@ -49,6 +50,14 @@ class MFCCTest(test.TestCase):
         signal = random_ops.random_normal((2, 3, 5))
         mfcc_ops.mfccs_from_log_mel_spectrograms(signal).eval()
 
+  def test_unknown_shape(self):
+    """A test that the op runs when shape and rank are unknown."""
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session(use_gpu=True):
+        signal = array_ops.placeholder_with_default(
+            random_ops.random_normal((2, 3, 5)), tensor_shape.TensorShape(None))
+        self.assertIsNone(signal.shape.ndims)
+        mfcc_ops.mfccs_from_log_mel_spectrograms(signal).eval()
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/signal/python/ops/mfcc_ops.py b/tensorflow/contrib/signal/python/ops/mfcc_ops.py
index 6cef95f742..4e842f7f10 100644
--- a/tensorflow/contrib/signal/python/ops/mfcc_ops.py
+++ b/tensorflow/contrib/signal/python/ops/mfcc_ops.py
@@ -105,4 +105,4 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
       num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]
 
     dct2 = spectral_ops.dct(log_mel_spectrograms)
-    return dct2 * math_ops.rsqrt(num_mel_bins * 2.0)
+    return dct2 * math_ops.rsqrt(math_ops.to_float(num_mel_bins) * 2.0)
-- 
GitLab


From 2ef28496c7ae6029291d83fba69e506514c5c62d Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Sat, 3 Feb 2018 09:46:13 -0800
Subject: [PATCH 627/966] Delete device_functions.h include.

---
 tensorflow/core/util/cuda_device_functions.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h
index f787687f66..d67663c669 100644
--- a/tensorflow/core/util/cuda_device_functions.h
+++ b/tensorflow/core/util/cuda_device_functions.h
@@ -29,7 +29,6 @@ limitations under the License.
 #include <algorithm>
 #include <complex>
 #include "cuda/include/cuda.h"
-#include "cuda/include/device_functions.h"
 #include "tensorflow/core/platform/types.h"
 
 #if CUDA_VERSION >= 7050
-- 
GitLab


From cb7f4fe74165d764981ee1750187d0c0fdca4d5d Mon Sep 17 00:00:00 2001
From: MandarJKulkarni <33712629+MandarJKulkarni@users.noreply.github.com>
Date: Sun, 4 Feb 2018 04:52:22 +0530
Subject: [PATCH 628/966] Fixed a couple of typos (#16731)

Fixed a couple of typos
---
 tensorflow/c/c_api.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 3c7f041b39..b7986ec5cc 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -195,10 +195,10 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
       reinterpret_cast<intptr_t>(data) % EIGEN_MAX_ALIGN_BYTES != 0) {
     // TF_STRING and TF_RESOURCE tensors have a different representation in
     // TF_Tensor than they do in tensorflow::Tensor. So a copy here is a waste
-    // (any alignement requirements will be taken care of by TF_TensorToTensor
+    // (any alignment requirements will be taken care of by TF_TensorToTensor
     // and TF_TensorFromTensor).
     //
-    // Other types have the same represntation, so copy only if it is safe to do
+    // Other types have the same representation, so copy only if it is safe to do
     // so.
     buf->data_ = allocate_tensor("TF_NewTensor", len);
     std::memcpy(buf->data_, data, len);
-- 
GitLab


From b46db3ccd1167f0e4e404a91455b3b7b5cc6cf59 Mon Sep 17 00:00:00 2001
From: Rohin Mohanadas <RohinMohanadas@users.noreply.github.com>
Date: Sun, 4 Feb 2018 00:22:48 +0100
Subject: [PATCH 629/966] change to hyper link (#16690)

-- 
GitLab


From 203733d565e41add7c897c7dfbda3498e3a21f9e Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Sat, 3 Feb 2018 17:24:29 -0800
Subject: [PATCH 630/966] Making the for_canonicalization_test only for py2

http://ci.tensorflow.org/view/Release/job/release-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/239/consoleFull
---
 tensorflow/contrib/py2tf/converters/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
index 3853c60f99..e1849374be 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -131,6 +131,7 @@ py_test(
 py_test(
     name = "for_canonicalization_test",
     srcs = ["for_canonicalization_test.py"],
+    srcs_version = "PY2",
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-- 
GitLab


From 764a39826ee3e994961c331f824ac2cd3bd18542 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Sat, 3 Feb 2018 18:16:24 -0800
Subject: [PATCH 631/966] Update BUILD

---
 tensorflow/contrib/py2tf/converters/BUILD | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
index e1849374be..03ded49022 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -131,7 +131,8 @@ py_test(
 py_test(
     name = "for_canonicalization_test",
     srcs = ["for_canonicalization_test.py"],
-    srcs_version = "PY2",
+    srcs_version = "PY2AND3",
+    tags = ["nomac"],
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
-- 
GitLab


From 0662ba4a230055ca379fbde5526e7435a762f0b1 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Sat, 3 Feb 2018 22:02:53 -0800
Subject: [PATCH 632/966] Fix the Windows GPU build #2

---
 tensorflow/core/util/cuda_device_functions.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h
index f787687f66..c66727c364 100644
--- a/tensorflow/core/util/cuda_device_functions.h
+++ b/tensorflow/core/util/cuda_device_functions.h
@@ -32,10 +32,6 @@ limitations under the License.
 #include "cuda/include/device_functions.h"
 #include "tensorflow/core/platform/types.h"
 
-#if CUDA_VERSION >= 7050
-#include "cuda/include/cuda_fp16.h"
-#endif  // CUDA_VERSION >= 7050
-
 namespace tensorflow {
 
 namespace detail {
-- 
GitLab


From de6037d7505cd24b24fd937d1c011c8d24ca0e81 Mon Sep 17 00:00:00 2001
From: Blake Hechtman <blakehechtman@google.com>
Date: Sat, 3 Feb 2018 23:45:00 -0800
Subject: [PATCH 633/966] [XLA] Assign mandatory constraints in a DFS order and
 non-manatory constraints in a BFS order.

PiperOrigin-RevId: 184429818
---
 .../compiler/xla/service/layout_assignment.cc | 70 +++++++++++--------
 .../compiler/xla/service/layout_assignment.h  | 29 +++++---
 2 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 5413b95cfb..fce135ef61 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -61,8 +61,8 @@ std::ostream& operator<<(std::ostream& out,
 
 BufferLayoutConstraint::BufferLayoutConstraint(const Layout& layout,
                                                const LogicalBuffer& buffer,
-                                               bool mandatory)
-    : LayoutConstraint(mandatory), layout_(layout), buffer_(&buffer) {
+                                               bool mandatory, bool dfs)
+    : LayoutConstraint(mandatory, dfs), layout_(layout), buffer_(&buffer) {
   CHECK(LayoutUtil::ValidateLayoutForShape(layout, buffer.shape()).ok());
 }
 
@@ -74,8 +74,8 @@ string BufferLayoutConstraint::ToString() const {
 
 OperandLayoutConstraint::OperandLayoutConstraint(
     const ShapeLayout& shape_layout, const HloInstruction* instruction,
-    int64 operand_no, bool mandatory)
-    : LayoutConstraint(mandatory),
+    int64 operand_no, bool mandatory, bool dfs)
+    : LayoutConstraint(mandatory, dfs),
       shape_layout_(shape_layout),
       instruction_(instruction),
       operand_no_(operand_no) {
@@ -134,7 +134,7 @@ bool LayoutConstraints::OperandBufferForwarded(
 
 Status LayoutConstraints::SetBufferLayout(const Layout& layout,
                                           const LogicalBuffer& buffer,
-                                          bool mandatory) {
+                                          bool mandatory, bool dfs) {
   VLOG(3) << "SetBufferLayout : " << buffer << " : "
           << LayoutUtil::HumanString(layout);
 
@@ -171,10 +171,11 @@ Status LayoutConstraints::SetBufferLayout(const Layout& layout,
   if (!overwrite) {
     iter = buffer_constraints_
                .insert(std::make_pair(
-                   &buffer, BufferLayoutConstraint(layout, buffer, mandatory)))
+                   &buffer,
+                   BufferLayoutConstraint(layout, buffer, mandatory, dfs)))
                .first;
   } else {
-    iter->second = BufferLayoutConstraint(layout, buffer, /*mandatory=*/true);
+    iter->second = BufferLayoutConstraint(layout, buffer, mandatory, dfs);
   }
   added_constraints_.push_back(&iter->second);
 
@@ -188,7 +189,8 @@ Status LayoutConstraints::SetBufferLayout(const Layout& layout,
 
 Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout,
                                            const HloInstruction* instruction,
-                                           int64 operand_no, bool mandatory) {
+                                           int64 operand_no, bool mandatory,
+                                           bool dfs) {
   VLOG(3) << "SetOperandLayout : " << instruction->name() << ", operand "
           << operand_no << " : "
           << ShapeUtil::HumanStringWithLayout(shape_with_layout);
@@ -226,12 +228,12 @@ Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout,
   if (iter == operand_constraints_.end()) {
     auto pair = std::make_pair(
         key, OperandLayoutConstraint(ShapeLayout(shape_with_layout),
-                                     instruction, operand_no, mandatory));
+                                     instruction, operand_no, mandatory, dfs));
     iter = operand_constraints_.insert(pair).first;
   } else {
     iter->second =
         OperandLayoutConstraint(ShapeLayout(shape_with_layout), instruction,
-                                operand_no, /*mandatory=*/true);
+                                operand_no, mandatory, dfs);
   }
   added_constraints_.push_back(&iter->second);
 
@@ -240,16 +242,17 @@ Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout,
 
 Status LayoutConstraints::SetArrayOperandLayout(
     const Layout& layout, const HloInstruction* instruction, int64 operand_no,
-    bool mandatory) {
+    bool mandatory, bool dfs) {
   const HloInstruction* operand = instruction->operand(operand_no);
   TF_RET_CHECK(ShapeUtil::IsArray(operand->shape()));
   Shape shape(operand->shape());
   *shape.mutable_layout() = layout;
   TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutInShape(shape));
-  return SetOperandLayout(shape, instruction, operand_no, mandatory);
+  return SetOperandLayout(shape, instruction, operand_no, mandatory, dfs);
 }
 
-Status LayoutConstraints::SetResultLayout(const Shape& shape_with_layout) {
+Status LayoutConstraints::SetResultLayout(const Shape& shape_with_layout,
+                                          bool dfs) {
   VLOG(3) << "SetResultLayout : "
           << ShapeUtil::HumanStringWithLayout(shape_with_layout);
 
@@ -267,14 +270,15 @@ Status LayoutConstraints::SetResultLayout(const Shape& shape_with_layout) {
   }
 
   result_constraint_.reset(
-      new ResultLayoutConstraint(ShapeLayout(shape_with_layout)));
+      new ResultLayoutConstraint(ShapeLayout(shape_with_layout), dfs));
   added_constraints_.push_back(result_constraint_.get());
 
   return Status::OK();
 }
 
 Status LayoutConstraints::SetInstructionLayout(
-    const Shape& shape_with_layout, const HloInstruction* instruction) {
+    const Shape& shape_with_layout, const HloInstruction* instruction,
+    bool mandatory, bool dfs) {
   VLOG(3) << "SetInstructionLayout : " << instruction->name() << ", "
           << ShapeUtil::HumanStringWithLayout(shape_with_layout);
 
@@ -290,8 +294,8 @@ Status LayoutConstraints::SetInstructionLayout(
   // instruction.
   return ShapeUtil::ForEachSubshapeWithStatus(
       shape_with_layout,
-      [this, instruction](const Shape& subshape,
-                          const ShapeIndex& index) -> Status {
+      [this, instruction, mandatory](const Shape& subshape,
+                                     const ShapeIndex& index) -> Status {
         // The precondition for this method is that the instruction defines all
         // buffers in its output.
         auto buffers =
@@ -300,7 +304,7 @@ Status LayoutConstraints::SetInstructionLayout(
         CHECK_EQ(buffers[0]->instruction(), instruction);
 
         if (ShapeUtil::IsArray(subshape)) {
-          return SetBufferLayout(subshape.layout(), *buffers[0]);
+          return SetBufferLayout(subshape.layout(), *buffers[0], mandatory);
         } else {
           return Status::OK();
         }
@@ -394,8 +398,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       // Constrain the input to the Outfeed instruction to be the expected
       // layout of the Outfeed.
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-          instruction->outfeed_shape(), instruction, 0,
-          /*mandatory=*/true));
+          instruction->outfeed_shape(), instruction, 0));
     } else if (instruction->opcode() == HloOpcode::kParameter) {
       // Parameter layouts must match the respective layout in
       // ComputationLayout.
@@ -434,8 +437,8 @@ Status LayoutAssignment::AddMandatoryConstraints(
                                                                  {0}));
         Shape new_shape = channel_constraints->LayoutShapeForChannel(
             recv_buffer_shape, instruction->channel_id());
-        TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
-            new_shape.layout(), *buffer, /*mandatory=*/true));
+        TF_RETURN_IF_ERROR(
+            constraints->SetBufferLayout(new_shape.layout(), *buffer));
       }
     }
   }
@@ -457,7 +460,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       for (int64 i = 0; i < instruction->operand_count(); ++i) {
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
             called_computation_layout.parameter_layout(i).shape(), instruction,
-            i, /*mandatory=*/true));
+            i));
       }
     } else if (instruction->opcode() == HloOpcode::kWhile) {
       // Layout of input and output of kWhile instruction must be equal and must
@@ -508,8 +511,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
           body_layout.result_shape(), instruction));
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-          body_layout.result_shape(), instruction, 0,
-          /*mandatory=*/true));
+          body_layout.result_shape(), instruction, 0));
     } else if (instruction->opcode() == HloOpcode::kCustomCall) {
       if (!CustomCallRequiresMajorFirstLayout(instruction)) {
         continue;
@@ -533,7 +535,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
                 operand_shape.element_type(),
                 AsInt64Slice(operand_shape.dimensions()));
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-            row_major_operand_shape, instruction, i, /*mandatory=*/true));
+            row_major_operand_shape, instruction, i));
       }
     }
   }
@@ -907,7 +909,11 @@ Status LayoutAssignment::PropagateConstraints(LayoutConstraints* constraints) {
   auto add_new_constraints_to_worklist = [constraints, &worklist]() {
     // Add constraints to the front of the deque for DFS ordering.
     for (auto* constraint : constraints->ConsumeAddedConstraints()) {
-      worklist.push_front(constraint);
+      if (constraint->dfs()) {
+        worklist.push_front(constraint);
+      } else {
+        worklist.push_back(constraint);
+      }
     }
   };
   add_new_constraints_to_worklist();
@@ -1390,7 +1396,7 @@ Status LayoutAssignment::RunOnComputation(
   // Add any backend-specific constraints.
   TF_RETURN_IF_ERROR(AddBackendConstraints(&constraints));
 
-  // Propagates layouts from an HLO to its neighbors.
+  // Propagates layouts from mandatory and backend constraints.
   TF_RETURN_IF_ERROR(PropagateConstraints(&constraints));
 
   // While any unconstrained buffers remain, pick an arbitrary buffer, give it a
@@ -1455,7 +1461,12 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   // Assign layouts to computations in an order such that a callee computation
   // is handled before its caller computation. This ensures that the layout of
   // all callers of a computation will agree.
+  std::list<HloComputation*> computation_post_order =
+      module->MakeComputationPostOrder();
   for (auto* computation : module->MakeComputationPostOrder()) {
+    if (computation->IsFusionComputation()) {
+      continue;
+    }
     // Clear existing layouts of the instructions.  All layouts must be assigned
     // by the LayoutAssignment pass, except for those on infeeds, parameters,
     // and the computation result. The latter two are specified in
@@ -1467,13 +1478,10 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
         LayoutUtil::ClearLayout(instruction->mutable_shape());
       }
     }
-
     if (computation == module->entry_computation()) {
       TF_RETURN_IF_ERROR(RunOnComputation(
           *entry_computation_layout_, *points_to_analysis,
           module->entry_computation(), channel_layout_constraints_));
-    } else if (computation->IsFusionComputation()) {
-      continue;
     } else {
       ComputationLayout computation_layout(computation->ComputeProgramShape());
       // Setting all embedded computations to the default layout is potentially
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 6bfae29986..2901858448 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -46,7 +46,8 @@ namespace xla {
 // gathered together in LayoutConstraints object.
 class LayoutConstraint {
  public:
-  LayoutConstraint(bool mandatory) : mandatory_(mandatory) {}
+  LayoutConstraint(bool mandatory, bool dfs)
+      : mandatory_(mandatory), dfs_(dfs) {}
   virtual ~LayoutConstraint() = default;
 
   virtual string ToString() const = 0;
@@ -54,8 +55,12 @@ class LayoutConstraint {
   // True if this constraint cannot be overwritten by a different constraint.
   bool mandatory() const { return mandatory_; }
 
+  // When true, propagate in DFS. When false, constraint will propagate in BFS.
+  bool dfs() const { return dfs_; }
+
  private:
   bool mandatory_;
+  bool dfs_;
 };
 
 std::ostream& operator<<(std::ostream& out, const LayoutConstraint& constraint);
@@ -65,7 +70,7 @@ std::ostream& operator<<(std::ostream& out, const LayoutConstraint& constraint);
 class BufferLayoutConstraint : public LayoutConstraint {
  public:
   BufferLayoutConstraint(const Layout& layout, const LogicalBuffer& buffer,
-                         bool mandatory);
+                         bool mandatory, bool dfs);
 
   const LogicalBuffer& buffer() const { return *buffer_; }
   const Layout& layout() const { return layout_; }
@@ -86,7 +91,7 @@ class OperandLayoutConstraint : public LayoutConstraint {
  public:
   OperandLayoutConstraint(const ShapeLayout& shape_layout,
                           const HloInstruction* instruction, int64 operand_no,
-                          bool mandatory);
+                          bool mandatory, bool dfs);
 
   const ShapeLayout& shape_layout() const { return shape_layout_; }
   const HloInstruction* instruction() const { return instruction_; }
@@ -106,8 +111,10 @@ class OperandLayoutConstraint : public LayoutConstraint {
 // Constraint on the layout of the result of the entry computation.
 class ResultLayoutConstraint : public LayoutConstraint {
  public:
-  explicit ResultLayoutConstraint(const ShapeLayout& shape_layout)
-      : LayoutConstraint(/*mandatory=*/true), shape_layout_(shape_layout) {}
+  explicit ResultLayoutConstraint(const ShapeLayout& shape_layout,
+                                  bool dfs = false)
+      : LayoutConstraint(/*mandatory=*/true, dfs),
+        shape_layout_(shape_layout) {}
 
   const ShapeLayout& shape_layout() const { return shape_layout_; }
   string ToString() const override;
@@ -157,23 +164,25 @@ class LayoutConstraints {
   // operand of the instruction, or the layout of the result of the computation,
   // respectively.
   Status SetBufferLayout(const Layout& layout, const LogicalBuffer& buffer,
-                         bool mandatory = true);
+                         bool mandatory = true, bool dfs = true);
   Status SetOperandLayout(const Shape& shape_with_layout,
                           const HloInstruction* instruction, int64 operand_no,
-                          bool mandatory = true);
-  Status SetResultLayout(const Shape& shape_with_layout);
+                          bool mandatory = true, bool dfs = true);
+  Status SetResultLayout(const Shape& shape_with_layout, bool dfs = true);
 
   // Convenience wrapper around SetOperandLayout for setting the layout of a
   // operand using a Layout object. The operand must be array-shaped.
   Status SetArrayOperandLayout(const Layout& layout,
                                const HloInstruction* instruction,
-                               int64 operand_no, bool mandatory = true);
+                               int64 operand_no, bool mandatory = true,
+                               bool dfs = true);
 
   // Convenience wrapper around SetBufferLayout. Sets the layouts of all buffers
   // created by the instruction to the layouts in the given shape. The
   // instruction must define every logical buffer in its output.
   Status SetInstructionLayout(const Shape& shape_with_layout,
-                              const HloInstruction* instruction);
+                              const HloInstruction* instruction,
+                              bool mandatory = true, bool dfs = true);
 
   // Returns true if any buffer in the given operand is forwarded to the output
   // of the given instruction. For example, the Tuple instruction forwards the
-- 
GitLab


From a5069f07f75c00dbc742e57a0aa294739137ba95 Mon Sep 17 00:00:00 2001
From: Rohin Mohanadas <RohinMohanadas@users.noreply.github.com>
Date: Sun, 4 Feb 2018 15:42:33 +0100
Subject: [PATCH 634/966] Grammatical error fixed (#16746)

---
 tensorflow/docs_src/programmers_guide/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index d45e666ce7..7a5e90081d 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -13,7 +13,7 @@ works. The units are as follows:
 ## Low Level APIs
 
   * @{$programmers_guide/low_level_intro}, which introduces the
-    basics of how you can to use TensorFlow outside of the high Level APIs.
+    basics of how you can use TensorFlow outside of the high Level APIs.
   * @{$programmers_guide/tensors}, which explains how to create,
     manipulate, and access Tensors--the fundamental object in TensorFlow.
   * @{$programmers_guide/variables}, which details how
-- 
GitLab


From ba032db13627945e7cc772dbd3d85d257aef3ab9 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Mon, 5 Feb 2018 03:49:48 +0900
Subject: [PATCH 635/966] fix typo (#16722)

---
 tensorflow/c/c_api.cc                     | 2 +-
 tensorflow/core/framework/numeric_types.h | 2 +-
 tensorflow/python/kernel_tests/BUILD      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index b7986ec5cc..e7fb1dec53 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -2144,7 +2144,7 @@ Status CopyGraph(Graph* src_graph, Graph* dst_graph,
     opts.return_tensors.push_back(ToTensorId(nodes_to_return[i]));
   }
 
-  // TOOD(skyewm): change to OutputTensor
+  // TODO(skyewm): change to OutputTensor
   tensorflow::ImportGraphDefResults results;
   TF_RETURN_IF_ERROR(
       ImportGraphDef(opts, gdef, dst_graph, dst_refiner, &results));
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index 99a5d0a054..8249059c29 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -44,7 +44,7 @@ typedef Eigen::QUInt16 quint16;
 }  // namespace tensorflow
 
 namespace Eigen {
-// TOOD(xpan): We probably need to overwrite more methods to have correct eigen
+// TODO(xpan): We probably need to overwrite more methods to have correct eigen
 // behavior. E.g. loest(), is_integer, etc. See NumTraits.h in eigen.
 template <>
 struct NumTraits<tensorflow::bfloat16>
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 3a6058054b..d4ceb2e489 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1294,7 +1294,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "control_flow_ops_py_test",
-    # TOOD(b/70473603): change this back to "small" once the C API is
+    # TODO(b/70473603): change this back to "small" once the C API is
     # permanently enabled
     size = "medium",
     srcs = ["control_flow_ops_py_test.py"],
-- 
GitLab


From fc09f65a5d283baa9af182536e3e3652c7a41dd7 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Sun, 4 Feb 2018 11:24:51 -0800
Subject: [PATCH 636/966] Avoid retaining two copies of each constant in
 `ConstantOp`.

Presently, the kernel keeps two copies of the constant tensor value, which can be large:
1. In the `ConstantOp::tensor_` field.
2. In the `OpKernel::def_` field (as an attr of the `NodeDef`).

Since we can be sure that `ConstantOp` will never need to access the
tensor value from `OpKernel::def_`, this change introduces a mechanism
for `OpKernel` implementations to store a stripped `NodeDef` in the
base class, and uses it in `ConstantOp` to avoid storing the tensor
value attr.

PiperOrigin-RevId: 184455793
---
 tensorflow/core/framework/op_kernel.cc |  8 +++++++-
 tensorflow/core/framework/op_kernel.h  |  8 ++++++++
 tensorflow/core/kernels/constant_op.cc | 28 +++++++++++++++++++++++++-
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index fd2d06be98..56c013db9d 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -79,8 +79,14 @@ Status MatchSignatureHelper(const DataTypeSlice expected_inputs,
 
 // OpKernel ------------------------------------------------------------------
 
+// TODO(mrry): Convert to std::make_unique when available.
 OpKernel::OpKernel(OpKernelConstruction* context)
-    : def_(new NodeDef(context->def())),
+    : OpKernel(context,
+               std::unique_ptr<const NodeDef>(new NodeDef(context->def()))) {}
+
+OpKernel::OpKernel(OpKernelConstruction* context,
+                   std::unique_ptr<const NodeDef> node_def)
+    : def_(std::move(node_def)),
       input_types_(context->input_types().begin(),
                    context->input_types().end()),
       input_memory_types_(context->input_memory_types().begin(),
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index b72f1405cf..a3dc96b3de 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -75,6 +75,14 @@ class OpKernel {
   // OpKernel won't be instantiated by the scheduler, so you may perform
   // expensive initialization in the descendant's constructor.
   explicit OpKernel(OpKernelConstruction* context);
+
+  // Specialized constructor that enables the descendant to provide a different
+  // `NodeDef` value. For example, this constructor can be used to provide a
+  // stripped-down `NodeDef` that does not contain the full set of attrs (such
+  // as tensor values) if the descendant stores them in a different form.
+  explicit OpKernel(OpKernelConstruction* context,
+                    std::unique_ptr<const NodeDef> node_def);
+
   virtual ~OpKernel();
 
   // An OpKernel's computation can be either synchronous or
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 920cd87858..4ab6fdbca1 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/constant_op.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
@@ -41,8 +42,33 @@ limitations under the License.
 
 namespace tensorflow {
 
+namespace {
+
+std::unique_ptr<const NodeDef> StripTensorDataFromNodeDef(
+    OpKernelConstruction* ctx) {
+#ifndef __ANDROID__
+  DCHECK_EQ(NodeDef::descriptor()->field_count(), 5)
+      << "The NodeDef format has changed, and the attr-stripping code may need "
+      << "to be updated.";
+#endif
+  const NodeDef& original = ctx->def();
+  NodeDef* ret = new NodeDef;
+  ret->set_name(original.name());
+  ret->set_op(original.op());
+  ret->set_device(original.device());
+  // Strip the "value" attr from the returned NodeDef.
+  // NOTE(mrry): The present implementation of `OpKernel::OpKernel()` only uses
+  // attrs that affect the cardinality of list-typed inputs and outputs, so it
+  // is safe to drop other attrs from the NodeDef.
+  AddNodeAttr("dtype", ctx->output_type(0), ret);
+  return std::unique_ptr<const NodeDef>(ret);
+}
+
+}  // namespace
+
 ConstantOp::ConstantOp(OpKernelConstruction* ctx)
-    : OpKernel(ctx), tensor_(ctx->output_type(0)) {
+    : OpKernel(ctx, StripTensorDataFromNodeDef(ctx)),
+      tensor_(ctx->output_type(0)) {
   const TensorProto* proto = nullptr;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
   OP_REQUIRES_OK(ctx, ctx->device()->MakeTensorFromProto(
-- 
GitLab


From 1de28439143bee41ee87ff3970913ee88723e5e8 Mon Sep 17 00:00:00 2001
From: Mingsheng Hong <hongm@google.com>
Date: Sun, 4 Feb 2018 11:54:35 -0800
Subject: [PATCH 637/966] Minor fixes to the get started doc.

PiperOrigin-RevId: 184457074
---
 .../docs_src/get_started/get_started_for_beginners.md       | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/get_started/get_started_for_beginners.md b/tensorflow/docs_src/get_started/get_started_for_beginners.md
index ea1c2fb3f4..cf514ceaf0 100644
--- a/tensorflow/docs_src/get_started/get_started_for_beginners.md
+++ b/tensorflow/docs_src/get_started/get_started_for_beginners.md
@@ -357,7 +357,7 @@ my_feature_columns = [
 
 ### Select the type of model
 
-We need the select the kind of model that will be trained.
+We need to select the kind of model that will be trained.
 Lots of model types exist; picking the ideal type takes experience.
 We've selected a neural network to solve the Iris problem.  [**Neural
 networks**](https://developers.google.com/machine-learning/glossary/#neural_network)
@@ -655,7 +655,9 @@ calls as follows:
 
 ```python
 predictions = classifier.predict(
-    input_fn=lambda:eval_input_fn(predict_x, batch_size=args.batch_size))
+    input_fn=lambda:eval_input_fn(predict_x,
+                                  labels=None,
+                                  batch_size=args.batch_size))
 ```
 
 As with the `evaluate` method, our `predict` method also gathers examples
-- 
GitLab


From da4583c26254ea568ea133f80e33bb5ae8508da0 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Mon, 5 Feb 2018 09:23:07 +0900
Subject: [PATCH 638/966] FIx Typo

---
 .../graph_transformations/resolve_constant_concatenation.cc     | 2 +-
 .../core/distributed_runtime/rpc/grpc_serialization_traits.h    | 2 +-
 tensorflow/docs_src/programmers_guide/debugger.md               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index 5ac449749a..db68968bad 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -73,7 +73,7 @@ void CopyTensorSegments(const std::vector<Array*>& input_arrays,
 
 // Receives a series of input arrays of type Array and an integer showing the
 // axis on which those arrays will be concatenated. It returns the concatenated
-// arrray.
+// array.
 template <ArrayDataType A>
 void ConcatenateTensorBuffers(const std::vector<Array*>& input_arrays,
                               int concatenation_axis,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
index dd114d39c6..730124c25e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
@@ -66,7 +66,7 @@ class GrpcBufferWriter final
     }
     // It's dangerous to keep an inlined grpc_slice as the backup slice, since
     // on a following Next() call, a reference will be returned to this slice
-    // via GRPC_SLICE_START_PTR, which will not be an adddress held by
+    // via GRPC_SLICE_START_PTR, which will not be an address held by
     // slice_buffer_.
     have_backup_ = backup_slice_.refcount != NULL;
     byte_count_ -= count;
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index 9eaee27028..c1a90dee0a 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -214,7 +214,7 @@ navigate between these screens by clicking the `<--` and
 ### Other Features of the tfdbg CLI
 
 In addition to the commands listed above, the tfdbg CLI provides the following
-addditional features:
+additional features:
 
 *   To navigate through previous tfdbg commands, type in a few characters
     followed by the Up or Down arrow keys. tfdbg will show you the history of
-- 
GitLab


From bd0d204700df1a1a245b0593a11efd8ede139311 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 4 Feb 2018 20:01:28 -0800
Subject: [PATCH 639/966] Fix logging format error in retrain.py (#16738)

This fix fixes the logging format error in
`tensorflow/examples/image_retraining/retrain.py`.

This fix fixes 16735.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/examples/image_retraining/retrain.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index ec22684eaf..58c5f87884 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -344,8 +344,8 @@ def maybe_download_and_extract(data_url):
     filepath, _ = urllib.request.urlretrieve(data_url, filepath, _progress)
     print()
     statinfo = os.stat(filepath)
-    tf.logging.info('Successfully downloaded', filename, statinfo.st_size,
-                    'bytes.')
+    tf.logging.info('Successfully downloaded %s %d bytes.',
+                    filename, statinfo.st_size)
     print('Extracting file from ', filepath)
     tarfile.open(filepath, 'r:gz').extractall(dest_directory)
   else:
-- 
GitLab


From 3cf771cc92402b1b52ea3d1f4b29046e4a7927b2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 4 Feb 2018 19:58:18 -0800
Subject: [PATCH 640/966] Support for quantized LSTM models.

PiperOrigin-RevId: 184476753
---
 .../internal/optimized/optimized_ops.h        | 49 +++++++++++++++++++
 .../internal/reference/reference_ops.h        | 49 +++++++++++++++++++
 .../graph_transformations/hardcode_min_max.cc |  6 +++
 .../toco/graph_transformations/quantize.cc    | 32 ++++++++++--
 tensorflow/contrib/lite/toco/tooling_util.cc  | 18 +++++++
 tensorflow/contrib/lite/toco/tooling_util.h   |  3 ++
 6 files changed, 154 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 8163c76cfd..9ffa72b401 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -2938,6 +2938,55 @@ inline void Tanh(const float* input_data, const Dims<4>& input_dims,
   output_map.array() = input_map.array().tanh();
 }
 
+inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_zero_point, int32 input_range_radius,
+                 int32 input_multiplier, int input_left_shift,
+                 uint8* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
+          const int32 input_val_centered =
+              static_cast<int32>(input_val_u8) - input_zero_point;
+          uint8 output_val;
+          if (input_val_centered <= -input_range_radius) {
+            output_val = 0;
+          } else if (input_val_centered >= input_range_radius) {
+            output_val = 255;
+          } else {
+            const int32 input_val_rescaled =
+                MultiplyByQuantizedMultiplierGreaterThanOne(
+                    input_val_centered, input_multiplier, input_left_shift);
+            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+            const FixedPoint4 input_val_f4 =
+                FixedPoint4::FromRaw(input_val_rescaled);
+            const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
+
+            using gemmlowp::RoundingDivideByPOT;
+            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+            // TODO(mjmatthews): properly wire through this zero offset
+            output_val_s32 += 127;
+            if (output_val_s32 == -1) {
+              // May underflow since we cannot properly represent -1.0f
+              output_val_s32 = 0;
+            }
+            TFLITE_DCHECK_GE(output_val_s32, 0);
+            TFLITE_DCHECK_LE(output_val_s32, 255);
+            output_val = static_cast<uint8>(output_val_s32);
+          }
+          output_data[Offset(output_dims, c, x, y, b)] = output_val;
+        }
+      }
+    }
+  }
+}
+
 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        int32 zero_point, double scale, float* output_data,
                        const Dims<4>& output_dims) {
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 4bcf4993e9..730e6f2ad2 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -2043,6 +2043,55 @@ inline void Tanh(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_zero_point, int32 input_range_radius,
+                 int32 input_multiplier, int input_left_shift,
+                 uint8* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
+          const int32 input_val_centered =
+              static_cast<int32>(input_val_u8) - input_zero_point;
+          uint8 output_val;
+          if (input_val_centered <= -input_range_radius) {
+            output_val = 0;
+          } else if (input_val_centered >= input_range_radius) {
+            output_val = 255;
+          } else {
+            const int32 input_val_rescaled =
+                MultiplyByQuantizedMultiplierGreaterThanOne(
+                    input_val_centered, input_multiplier, input_left_shift);
+            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+            const FixedPoint4 input_val_f4 =
+                FixedPoint4::FromRaw(input_val_rescaled);
+            const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
+
+            using gemmlowp::RoundingDivideByPOT;
+            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+            // TODO(mjmatthews): properly wire through this zero offset
+            output_val_s32 += 127;
+            if (output_val_s32 == -1) {
+              // May underflow since we cannot properly represent -1.0f
+              output_val_s32 = 0;
+            }
+            TFLITE_DCHECK_GE(output_val_s32, 0);
+            TFLITE_DCHECK_LE(output_val_s32, 255);
+            output_val = static_cast<uint8>(output_val_s32);
+          }
+          output_data[Offset(output_dims, c, x, y, b)] = output_val;
+        }
+      }
+    }
+  }
+}
+
 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        int32 zero_point, double scale, float* output_data,
                        const Dims<4>& output_dims) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index 9689b205cd..f1892136cf 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -219,6 +219,12 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
       changed = HardcodeMinMaxForOutput(model, op, 0, 255. / 256.);
       break;
 
+    case OperatorType::kTanh:
+      // We hardcode quantization_params to: zero_point=127, scale=1/128.
+      // This choice of minmax is the one that is equivalent to that.
+      changed = HardcodeMinMaxForOutput(model, op, -127. / 128., 1.0);
+      break;
+
     default:
       break;
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index b973b2b813..139c19022e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -41,10 +41,14 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kConcatenation ||
          type == OperatorType::kL2Normalization || type == OperatorType::kAdd ||
          type == OperatorType::kAveragePool || type == OperatorType::kMaxPool ||
+         type == OperatorType::kTensorFlowMinimum ||
+         type == OperatorType::kTensorFlowMaximum ||
          type == OperatorType::kLogistic || type == OperatorType::kSoftmax ||
+         type == OperatorType::kTensorFlowSplit || type == OperatorType::kSub ||
          type == OperatorType::kSqueeze || type == OperatorType::kPad ||
          type == OperatorType::kTensorFlowReshape ||
-         type == OperatorType::kMul || type == OperatorType::kSpaceToDepth ||
+         type == OperatorType::kTanh || type == OperatorType::kMul ||
+         type == OperatorType::kSpaceToDepth ||
          type == OperatorType::kDepthToSpace;
 }
 
@@ -258,6 +262,17 @@ bool ChooseHardcodedQuantizationForOperatorOutput(
                                  *quantization_params));
     return true;
   }
+  if (op.type == OperatorType::kTanh) {
+    // Tanh has the range: [-1, 1].
+    *quantized_data_type = ArrayDataType::kUint8;
+    quantization_params->zero_point = 127;
+    quantization_params->scale = 1. / 128.;
+    // 0 should be exactly representable, as values will typically be centered
+    // around 0, with many values near 0.
+    CHECK(
+        IsExactlyRepresentable(0., *quantized_data_type, *quantization_params));
+    return true;
+  }
   return false;
 }
 
@@ -395,11 +410,22 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
       if (IsConstantParameterArray(*model, input)) {
         QuantizeArray(this, model, input, quantized_data_type,
                       quantization_params);
+      } else if (toco::IsRnnStateArray(*model, input)) {
+        // Simply Quantize the Array
+        auto& array = model->GetArray(op.inputs[input_index]);
+        array.GetOrCreateQuantizationParams() = quantization_params;
+        array.data_type = quantized_data_type;
       } else {
         auto dequantize_it = FindOpWithOutput(*model, input);
-        CHECK(dequantize_it != model->operators.end());
+        CHECK(dequantize_it != model->operators.end())
+            << "Cannot quantize input \"" << input
+            << "\" on operator with output \"" << op.outputs[0]
+            << "\". Nothing feeding input.";
         auto* dequantize_op = dequantize_it->get();
-        CHECK(dequantize_op->type == OperatorType::kDequantize);
+        CHECK(dequantize_op->type == OperatorType::kDequantize)
+            << "Cannot quantize input \"" << input
+            << "\" on operator with output \"" << op.outputs[0]
+            << "\". Input is not fed by a Dequantize operator.";
         op.inputs[input_index] = dequantize_op->inputs[0];
         // Check if the output of that Dequantize op was not used by any
         // other operator. We will then erase that Dequantize op.
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index d2741a5e9b..0a168c671e 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1757,4 +1757,22 @@ void UseArraysExtraInfo(Model* model) {
   }
 }
 
+bool IsRnnSourceArray(const toco::Model& model, const string& array_name) {
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    if (array_name == rnn_state.back_edge_source_array()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool IsRnnStateArray(const toco::Model& model, const string& array_name) {
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    if (array_name == rnn_state.state_array()) {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index a7e77a02eb..a023bab1a0 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -285,6 +285,9 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
 
 void UseArraysExtraInfo(Model* model);
 
+bool IsRnnSourceArray(const toco::Model& model, const string& array_name);
+bool IsRnnStateArray(const toco::Model& model, const string& array_name);
+
 }  // namespace toco
 
 #endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
-- 
GitLab


From ab06a9c865abe58b882ea76a3bf008799c4c0ea4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 01:03:32 -0800
Subject: [PATCH 641/966] Fixed sequence_mask behavior on unknown shape.

`sequence_mask` crashed when fed with a tensor of unknown rank.
Added a test for that, and expanded a bit existing tests.

Also fixed pre-existing lint errors.

PiperOrigin-RevId: 184493239
---
 .../python/kernel_tests/array_ops_test.py     | 33 +++++++++++++++----
 tensorflow/python/ops/array_ops.py            |  7 ++--
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 82dbe90002..aae6d0a36e 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -414,7 +414,7 @@ class MeshgridTest(test_util.TensorFlowTestCase):
   def _compareDiffType(self, n, np_dtype, use_gpu):
     inputs = []
     for index in ("ij", "xy"):
-      for i in range(n):
+      for _ in range(n):
         x = np.linspace(-10, 10, 5).astype(np_dtype)
         if np_dtype in (np.complex64, np.complex128):
           x += 1j
@@ -422,8 +422,8 @@ class MeshgridTest(test_util.TensorFlowTestCase):
       numpy_out = np.meshgrid(*inputs, indexing=index)
       with self.test_session(use_gpu=use_gpu):
         tf_out = array_ops.meshgrid(*inputs, indexing=index)
-        for X, _X in zip(numpy_out, tf_out):
-          self.assertAllEqual(X, _X.eval())
+        for x_np, x_tf in zip(numpy_out, tf_out):
+          self.assertAllEqual(x_np, x_tf.eval())
 
   def testCompare(self):
     for t in (np.float16, np.float32, np.float64, np.int32, np.int64,
@@ -1015,7 +1015,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, "maxlen must be scalar"):
         array_ops.sequence_mask([10, 20], [10, 20])
 
-  def testOneDimensional(self):
+  def testOneDimensionalWithMaxlen(self):
     with self.test_session():
       res = array_ops.sequence_mask(constant_op.constant([1, 3, 2]), 5)
       self.assertAllEqual(res.get_shape(), [3, 5])
@@ -1024,9 +1024,11 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           [[True, False, False, False, False], [True, True, True, False, False],
            [True, True, False, False, False]])
 
+  def testOneDimensionalDtypeWithoutMaxlen(self):
+    with self.test_session():
       # test dtype and default maxlen:
-      res = array_ops.sequence_mask(
-          constant_op.constant([0, 1, 4]), dtype=dtypes.float32)
+      res = array_ops.sequence_mask(constant_op.constant([0, 1, 4]),
+                                    dtype=dtypes.float32)
       if ops._USE_C_API:
         self.assertAllEqual(res.get_shape().as_list(), [3, 4])
       else:
@@ -1035,6 +1037,20 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           res.eval(),
           [[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
 
+  def testOneDimensionalWithoutMaxlen(self):
+    with self.test_session():
+      res = array_ops.sequence_mask(
+          constant_op.constant([0, 1, 4]))
+      if ops._USE_C_API:
+        self.assertAllEqual(res.get_shape().as_list(), [3, 4])
+      else:
+        self.assertAllEqual(res.get_shape().as_list(), [3, None])
+      self.assertAllEqual(
+          res.eval(),
+          [[False, False, False, False],
+           [True, False, False, False],
+           [True, True, True, True]])
+
   def testTwoDimensional(self):
     with self.test_session():
       res = array_ops.sequence_mask(constant_op.constant([[1, 3, 2]]), 5)
@@ -1055,6 +1071,11 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           [[[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]],
            [[1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0]]])
 
+  def testUnknownShape(self):
+    lengths = array_ops.placeholder(dtype=dtypes.int32)
+    res = array_ops.sequence_mask(lengths)
+    self.assertEqual(res.shape, None)
+
   def testDtypes(self):
 
     def check_dtypes(lengths_dtype, maxlen_dtype):
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 6210204d80..e3902f5a8a 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# Tests for this file live in python/kernel_tests/array_ops_test.py
 """Support for manipulating tensors.
 
 See the @{$python/array_ops} guide.
@@ -2450,8 +2451,8 @@ def _all_dimensions(x):
     r = x.dense_shape.get_shape()[0].value  # sparse.dense_shape is 1-D.
     return constant_op.constant(np.arange(r), dtype=dtypes.int32)
 
-  # Otherwise, we rely on Range and Rank to do the right thing at run-time.
-  return range(0, rank(x))
+  # Otherwise, we rely on `range` and `rank` to do the right thing at runtime.
+  return gen_math_ops._range(0, rank(x), 1)
 
 
 @tf_export("sequence_mask")
@@ -2496,7 +2497,7 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
       maxlen = gen_math_ops._max(lengths, _all_dimensions(lengths))
     else:
       maxlen = ops.convert_to_tensor(maxlen)
-    if maxlen.get_shape().ndims != 0:
+    if maxlen.get_shape().ndims is not None and maxlen.get_shape().ndims != 0:
       raise ValueError("maxlen must be scalar for sequence_mask")
 
     # The basic idea is to compare a range row vector of size maxlen:
-- 
GitLab


From c73d03525aadedfaa34d054ea752ba99d616d4a4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 01:42:16 -0800
Subject: [PATCH 642/966] mini documentation fix

PiperOrigin-RevId: 184496843
---
 tensorflow/python/ops/confusion_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py
index 50690cd891..e4ce2ab28a 100644
--- a/tensorflow/python/ops/confusion_matrix.py
+++ b/tensorflow/python/ops/confusion_matrix.py
@@ -119,7 +119,7 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
   For example:
 
   ```python
-    tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
+    tf.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
         [[0 0 0 0 0]
          [0 0 1 0 0]
          [0 0 1 0 0]
-- 
GitLab


From 6a822c373818948037baacfbae1c7355e0fc2c48 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 07:56:27 -0800
Subject: [PATCH 643/966] Expand the activity analysis to composite names. Fix
 a bug in the cond template that caused bad syntax when there it no symbol
 that needs aliasing. More refactoring in the process, including:  * introduce
 the QN (qualified name) class to hold symbol information; it has value
 semantics and can generate the original symbol, a corresponding AST tree or a
 single-symbol form (e.g. "a.b" -> a_b)  * allow the template mechanism to use
 QNs for substitutions  * annotate *all* symbol nodes with their corresponding
 QN object; this is done as first step during static analysis, and
 automatically performed on all template expansions  * start using typed
 annotation keys (Enum values) instead of plain strings  * rename access.py to
 activity.py  * sanitize nodes in template expansion by deep copying the AST
 without annotations, to avoid common references

PiperOrigin-RevId: 184528586
---
 tensorflow/contrib/py2tf/BUILD                |   1 +
 tensorflow/contrib/py2tf/__init__.py          |   5 +-
 tensorflow/contrib/py2tf/converters/BUILD     |  13 -
 .../converters/break_canonicalization.py      |  24 +-
 .../converters/break_canonicalization_test.py |  12 +-
 .../py2tf/converters/builtin_functions.py     |  19 +-
 .../converters/builtin_functions_test.py      |  16 +-
 .../contrib/py2tf/converters/call_trees.py    |  11 +-
 .../converters/continue_canonicalization.py   |  22 +-
 .../continue_canonicalization_test.py         |  12 +-
 .../contrib/py2tf/converters/control_flow.py  | 161 +++++++----
 .../py2tf/converters/control_flow_test.py     |  16 +-
 .../py2tf/converters/converter_test_base.py   |   6 +-
 .../py2tf/converters/for_canonicalization.py  |  32 +--
 .../converters/for_canonicalization_test.py   |   4 +-
 .../py2tf/converters/print_functions.py       |  51 ----
 .../py2tf/converters/side_effect_guards.py    |  17 +-
 .../converters/side_effect_guards_test.py     |   5 +-
 tensorflow/contrib/py2tf/impl/conversion.py   |  30 +-
 tensorflow/contrib/py2tf/impl/naming.py       |  17 +-
 tensorflow/contrib/py2tf/pyct/BUILD           |  13 +
 tensorflow/contrib/py2tf/pyct/anno.py         |  19 ++
 tensorflow/contrib/py2tf/pyct/copier.py       |  68 +++++
 .../copier_test.py}                           |  44 +--
 .../contrib/py2tf/pyct/pretty_printer.py      |  32 ++-
 .../contrib/py2tf/pyct/pretty_printer_test.py |   4 -
 tensorflow/contrib/py2tf/pyct/qual_names.py   |  99 +++++++
 .../contrib/py2tf/pyct/static_analysis/BUILD  |   7 +-
 .../py2tf/pyct/static_analysis/access_test.py | 236 ---------------
 .../{access.py => activity.py}                |  88 +++---
 .../pyct/static_analysis/activity_test.py     | 271 ++++++++++++++++++
 .../py2tf/pyct/static_analysis/annos.py       |  50 ++++
 .../py2tf/pyct/static_analysis/live_values.py |  23 +-
 .../pyct/static_analysis/live_values_test.py  |   6 +-
 .../py2tf/pyct/static_analysis/type_info.py   |  31 +-
 .../pyct/static_analysis/type_info_test.py    |   6 +-
 tensorflow/contrib/py2tf/pyct/templates.py    |  66 +++--
 .../contrib/py2tf/pyct/templates_test.py      |  10 +
 tensorflow/contrib/py2tf/pyct/transformer.py  |  14 +-
 39 files changed, 978 insertions(+), 583 deletions(-)
 delete mode 100644 tensorflow/contrib/py2tf/converters/print_functions.py
 create mode 100644 tensorflow/contrib/py2tf/pyct/copier.py
 rename tensorflow/contrib/py2tf/{converters/print_functions_test.py => pyct/copier_test.py} (53%)
 create mode 100644 tensorflow/contrib/py2tf/pyct/qual_names.py
 delete mode 100644 tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py
 rename tensorflow/contrib/py2tf/pyct/static_analysis/{access.py => activity.py} (75%)
 create mode 100644 tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py
 create mode 100644 tensorflow/contrib/py2tf/pyct/static_analysis/annos.py

diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/py2tf/BUILD
index 479ea9beca..d91220f6dd 100644
--- a/tensorflow/contrib/py2tf/BUILD
+++ b/tensorflow/contrib/py2tf/BUILD
@@ -23,6 +23,7 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/contrib/py2tf/impl",
+        "//tensorflow/contrib/py2tf/pyct",
         "//tensorflow/contrib/py2tf/utils",
         "@gast_archive//:gast",
         "@six_archive//:six",
diff --git a/tensorflow/contrib/py2tf/__init__.py b/tensorflow/contrib/py2tf/__init__.py
index 0d51bf0bf2..379fa7fd5c 100644
--- a/tensorflow/contrib/py2tf/__init__.py
+++ b/tensorflow/contrib/py2tf/__init__.py
@@ -26,8 +26,11 @@ from tensorflow.contrib.py2tf.impl.api import convert
 from tensorflow.contrib.py2tf.impl.api import graph_ready
 from tensorflow.contrib.py2tf.impl.api import to_code
 from tensorflow.contrib.py2tf.impl.api import to_graph
+from tensorflow.contrib.py2tf.pyct.transformer import PyFlowParseError
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['to_graph', 'to_code', 'convert', 'graph_ready', 'utils']
+_allowed_symbols = [
+    'to_graph', 'to_code', 'convert', 'graph_ready', 'utils', 'PyFlowParseError'
+]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
index 3853c60f99..68ea247778 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -26,7 +26,6 @@ py_library(
         "decorators.py",
         "for_canonicalization.py",
         "logical_expressions.py",
-        "print_functions.py",
         "side_effect_guards.py",
     ],
     srcs_version = "PY2AND3",
@@ -149,18 +148,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "print_functions_test",
-    srcs = ["print_functions_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":test_lib",
-        "//tensorflow/contrib/py2tf/pyct",
-        "//tensorflow/python:client_testlib",
-        "@gast_archive//:gast",
-    ],
-)
-
 py_test(
     name = "side_effect_guards_test",
     srcs = ["side_effect_guards_test.py"],
diff --git a/tensorflow/contrib/py2tf/converters/break_canonicalization.py b/tensorflow/contrib/py2tf/converters/break_canonicalization.py
index 2ae65e3007..bfb709c5e3 100644
--- a/tensorflow/contrib/py2tf/converters/break_canonicalization.py
+++ b/tensorflow/contrib/py2tf/converters/break_canonicalization.py
@@ -22,13 +22,15 @@ import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
 
 
-class BreakCanonicalizationTransformer(gast.NodeTransformer):
+class BreakCanonicalizationTransformer(transformer.Base):
   """Canonicalizes continue statements into additional conditionals."""
 
-  def __init__(self, namer):
-    self.namer = namer
+  def __init__(self, context):
+    super(BreakCanonicalizationTransformer, self).__init__(context)
     # This is a stack structure, to correctly process nested loops.
     self.break_uses = []
 
@@ -67,9 +69,10 @@ class BreakCanonicalizationTransformer(gast.NodeTransformer):
 
   def visit_While(self, node):
     self.generic_visit(node.test)
-    scope = anno.getanno(node, 'body_scope')
+    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
 
-    break_var = self.namer.new_symbol('break_requested', scope.referenced)
+    break_var = self.context.namer.new_symbol('break_requested',
+                                              scope.referenced)
     self.break_uses.append([False, break_var])
     node.body = self._manual_visit_list(node.body)
     if self.break_uses[-1][0]:
@@ -89,9 +92,10 @@ class BreakCanonicalizationTransformer(gast.NodeTransformer):
   def visit_For(self, node):
     self.generic_visit(node.target)
     self.generic_visit(node.iter)
-    scope = anno.getanno(node, 'body_scope')
+    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
 
-    break_var = self.namer.new_symbol('break_requested', scope.referenced)
+    break_var = self.context.namer.new_symbol('break_requested',
+                                              scope.referenced)
     self.break_uses.append([False, break_var])
     node.body = self._manual_visit_list(node.body)
     if self.break_uses[-1][0]:
@@ -112,7 +116,5 @@ class BreakCanonicalizationTransformer(gast.NodeTransformer):
     return self._create_break_trigger()
 
 
-def transform(node, namer):
-  transformer = BreakCanonicalizationTransformer(namer)
-  node = transformer.visit(node)
-  return node
+def transform(node, context):
+  return BreakCanonicalizationTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
index b5ba2ad923..54c4d99361 100644
--- a/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
@@ -44,8 +44,8 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v
 
-    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
-    node = break_canonicalization.transform(node, TestNamer())
+    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = break_canonicalization.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
 
     self.assertEqual(test_fn(0), result.test_fn(0))
@@ -76,8 +76,8 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v
 
-    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
-    node = break_canonicalization.transform(node, TestNamer())
+    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = break_canonicalization.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
 
     # The break is incompletely canonicalized. Everything is in place, but
@@ -104,8 +104,8 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v, u, w
 
-    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
-    node = break_canonicalization.transform(node, TestNamer())
+    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = break_canonicalization.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
 
     self.assertEqual(test_fn(0), result.test_fn(0))
diff --git a/tensorflow/contrib/py2tf/converters/builtin_functions.py b/tensorflow/contrib/py2tf/converters/builtin_functions.py
index 7f6b64a34c..3e56634106 100644
--- a/tensorflow/contrib/py2tf/converters/builtin_functions.py
+++ b/tensorflow/contrib/py2tf/converters/builtin_functions.py
@@ -21,12 +21,14 @@ from __future__ import print_function
 import gast
 
 from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
 
 
-class BuiltinFunctionTransformer(gast.NodeTransformer):
+class BuiltinFunctionTransformer(transformer.Base):
   """Transforms Print nodes to Call so they can be handled as functions."""
 
-  # TODO(mdan): Bring print_functions in here.
+  def __init__(self, context):
+    super(BuiltinFunctionTransformer, self).__init__(context)
 
   def _convert_len(self, node):
     template = """
@@ -44,10 +46,15 @@ class BuiltinFunctionTransformer(gast.NodeTransformer):
       return self._convert_len(node)
     return node
 
+  def visit_Print(self, node):
+    self.generic_visit(node)
+    template = """
+      fname(args)
+    """
+    return templates.replace(template, fname='print', args=node.values)
+
   # pylint:enable=invalid-name
 
 
-def transform(node):
-  transformer = BuiltinFunctionTransformer()
-  node = transformer.visit(node)
-  return node
+def transform(node, context):
+  return BuiltinFunctionTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/builtin_functions_test.py b/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
index b5358da6bc..be76066242 100644
--- a/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
+++ b/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import gast
+
 from tensorflow.contrib.py2tf.converters import builtin_functions
 from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
@@ -34,7 +36,7 @@ class BuiltinFunctionsTest(converter_test_base.TestCase):
       return len(a)
 
     node = self.parse_and_analyze(test_fn, {'len': len})
-    node = builtin_functions.transform(node)
+    node = builtin_functions.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', array_ops)
 
@@ -43,6 +45,18 @@ class BuiltinFunctionsTest(converter_test_base.TestCase):
                        sess.run(
                            result.test_fn(constant_op.constant([0, 0, 0]))))
 
+  def test_print(self):
+
+    def test_fn(a):
+      print(a)
+
+    node = self.parse_and_analyze(test_fn, {'print': print})
+    node = builtin_functions.transform(node, self.ctx)
+    result = compiler.ast_to_object(node)
+
+    result.test_fn('a')
+    self.assertTrue(isinstance(node.body[0].body[0].value, gast.Call))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/converters/call_trees.py b/tensorflow/contrib/py2tf/converters/call_trees.py
index 4c238b7fb9..834baf258d 100644
--- a/tensorflow/contrib/py2tf/converters/call_trees.py
+++ b/tensorflow/contrib/py2tf/converters/call_trees.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct import templates
 from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
 from tensorflow.python.util import tf_inspect
 
 
@@ -192,11 +193,11 @@ class CallTreeTransformer(transformer.Base):
           # The renaming process will transform it into a regular function.
           # TODO(mdan): Is this complete? How does it work with nested members?
           node.args = [node.func.value] + node.args
-      node.func = gast.Name(new_name, gast.Load(), None)
+      node.func = templates.replace('func_name', func_name=new_name)[0]
     return node
 
   def _wrap_to_py_func_no_return(self, node):
-    args_scope = anno.getanno(node, 'args_scope')
+    args_scope = anno.getanno(node, NodeAnno.ARGS_SCOPE)
     # TODO(mdan): Properly handle varargs, kwargs, etc.
     template = """
       def wrapper(args):
@@ -208,10 +209,10 @@ class CallTreeTransformer(transformer.Base):
         template,
         call=node.func,
         wrapper=self.context.namer.compiled_function_name(node.func.id)[0],
-        args=tuple(gast.Name(n, gast.Load(), None) for n in args_scope.used))
-    anno.setanno(call_expr.value, 'args_scope', args_scope)
+        args=tuple(args_scope.used))
+    anno.setanno(call_expr.value, NodeAnno.ARGS_SCOPE, args_scope)
     # TODO(mdan): Rename this annotation to 'graph_ready'
-    anno.setanno(wrapper_def, 'skip_processing', True)
+    anno.setanno(wrapper_def, anno.Basic.SKIP_PROCESSING, True)
 
     return (wrapper_def, call_expr)
 
diff --git a/tensorflow/contrib/py2tf/converters/continue_canonicalization.py b/tensorflow/contrib/py2tf/converters/continue_canonicalization.py
index 486f0f6509..4069a678b1 100644
--- a/tensorflow/contrib/py2tf/converters/continue_canonicalization.py
+++ b/tensorflow/contrib/py2tf/converters/continue_canonicalization.py
@@ -18,17 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import gast
-
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
 
 
-class ContinueCanonicalizationTransformer(gast.NodeTransformer):
+class ContinueCanonicalizationTransformer(transformer.Base):
   """Canonicalizes continue statements into additional conditionals."""
 
-  def __init__(self, namer):
-    self.namer = namer
+  def __init__(self, context):
+    super(ContinueCanonicalizationTransformer, self).__init__(context)
     # This is a stack structure, to correctly process nested loops.
     self.continuation_uses = []
 
@@ -76,7 +76,7 @@ class ContinueCanonicalizationTransformer(gast.NodeTransformer):
     return reorganized_nodes
 
   def _process_loop_block(self, block, scope):
-    cont_var = self.namer.new_symbol('cont_requested', scope.referenced)
+    cont_var = self.context.namer.new_symbol('cont_requested', scope.referenced)
     self.continuation_uses.append([False, cont_var])
     block = self._visit_and_reindent_if_necessary(block)
     if self.continuation_uses[-1][0]:
@@ -87,7 +87,8 @@ class ContinueCanonicalizationTransformer(gast.NodeTransformer):
   def visit_While(self, node):
     self.generic_visit(node.test)
     node.body = self._process_loop_block(node.body,
-                                         anno.getanno(node, 'body_scope'))
+                                         anno.getanno(node,
+                                                      NodeAnno.BODY_SCOPE))
     for n in node.orelse:
       self.generic_visit(n)
     return node
@@ -96,7 +97,8 @@ class ContinueCanonicalizationTransformer(gast.NodeTransformer):
     self.generic_visit(node.target)
     self.generic_visit(node.iter)
     node.body = self._process_loop_block(node.body,
-                                         anno.getanno(node, 'body_scope'))
+                                         anno.getanno(node,
+                                                      NodeAnno.BODY_SCOPE))
     for n in node.orelse:
       self.generic_visit(n)
     return node
@@ -122,6 +124,4 @@ class ContinueCanonicalizationTransformer(gast.NodeTransformer):
 
 
 def transform(node, namer):
-  transformer = ContinueCanonicalizationTransformer(namer)
-  node = transformer.visit(node)
-  return node
+  return ContinueCanonicalizationTransformer(namer).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
index c1fe903a2d..4b18819559 100644
--- a/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
@@ -44,8 +44,8 @@ class ContinueCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v
 
-    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
-    node = continue_canonicalization.transform(node, TestNamer())
+    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = continue_canonicalization.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
 
     self.assertEqual(test_fn(0), result.test_fn(0))
@@ -65,8 +65,8 @@ class ContinueCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v
 
-    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
-    node = continue_canonicalization.transform(node, TestNamer())
+    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = continue_canonicalization.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
 
     self.assertEqual(test_fn([]), result.test_fn([]))
@@ -91,8 +91,8 @@ class ContinueCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v, u, w
 
-    node = self.parse_and_analyze(test_fn, {}, include_type_analysis=False)
-    node = continue_canonicalization.transform(node, TestNamer())
+    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = continue_canonicalization.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
 
     self.assertEqual(test_fn(0), result.test_fn(0))
diff --git a/tensorflow/contrib/py2tf/converters/control_flow.py b/tensorflow/contrib/py2tf/converters/control_flow.py
index a40c7b28f7..a256c074a8 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow.py
+++ b/tensorflow/contrib/py2tf/converters/control_flow.py
@@ -22,6 +22,8 @@ import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
 
 
 class SymbolNamer(object):
@@ -41,21 +43,29 @@ class SymbolNamer(object):
 
 
 class SymbolRenamer(gast.NodeTransformer):
+  """Transformer that can rename symbols to a simple names."""
 
   def __init__(self, name_map):
     self.name_map = name_map
 
-  def visit_Name(self, node):
-    if node.id in self.name_map:
-      node.id = self.name_map[node.id]
+  def _process(self, node):
+    qn = anno.getanno(node, anno.Basic.QN)
+    if qn in self.name_map:
+      return gast.Name(self.name_map[qn], node.ctx, None)
     return node
 
+  def visit_Name(self, node):
+    return self._process(node)
+
+  def visit_Attribute(self, node):
+    return self._process(node)
+
 
-class ControlFlowTransformer(gast.NodeTransformer):
+class ControlFlowTransformer(transformer.Base):
   """Transforms control flow structures like loops an conditionals."""
 
-  def __init__(self, namer):
-    self.namer = namer
+  def __init__(self, context):
+    super(ControlFlowTransformer, self).__init__(context)
 
   # pylint:disable=invalid-name
 
@@ -65,8 +75,8 @@ class ControlFlowTransformer(gast.NodeTransformer):
   def visit_If(self, node):
     self.generic_visit(node)
 
-    body_scope = anno.getanno(node, 'body_scope')
-    orelse_scope = anno.getanno(node, 'orelse_scope')
+    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    orelse_scope = anno.getanno(node, NodeAnno.ORELSE_SCOPE)
 
     if body_scope.created - orelse_scope.created:
       raise ValueError(
@@ -86,7 +96,8 @@ class ControlFlowTransformer(gast.NodeTransformer):
         (body_scope.created | orelse_scope.created))
     aliased_orig_names = tuple(need_alias)
     aliased_new_names = tuple(
-        self.namer.new_symbol(s, all_referenced) for s in aliased_orig_names)
+        self.context.namer.new_symbol(s.ssf(), all_referenced)
+        for s in aliased_orig_names)
     alias_map = dict(zip(aliased_orig_names, aliased_new_names))
     node_body = node.body
     node_body = [SymbolRenamer(alias_map).visit(n) for n in node_body]
@@ -94,72 +105,112 @@ class ControlFlowTransformer(gast.NodeTransformer):
     node_orelse = [SymbolRenamer(alias_map).visit(n) for n in node_orelse]
 
     if len(all_modified) == 1:
-      results = gast.Name(all_modified[0], None, None)
+      results = all_modified[0]
     else:
-      results = gast.Tuple(
-          tuple(gast.Name(s, None, None) for s in all_modified), None)
-
-    template = """
-      def body_name():
-        aliased_new_names, = aliased_orig_names,
-        body
-        return (all_results,)
-      def orelse_name():
-        aliased_new_names, = aliased_orig_names,
-        orelse
-        return (all_results,)
-      results = tf.cond(test, body_name, orelse_name)
-    """
-    body_name = self.namer.new_symbol('if_true', all_referenced)
-    return templates.replace(
-        template,
-        test=node.test,
-        body_name=body_name,
-        body=node_body,
-        orelse_name=self.namer.new_symbol('if_false', all_referenced),
-        orelse=node_orelse,
-        aliased_orig_names=tuple(aliased_orig_names),
-        aliased_new_names=tuple(aliased_new_names),
-        all_results=tuple(alias_map[s] if s in aliased_orig_names else s
-                          for s in all_modified),
-        results=results)
+      results = gast.Tuple([s.ast() for s in all_modified], None)
+
+    if aliased_orig_names:
+      template = """
+        def body_name():
+          aliased_new_names, = aliased_orig_names,
+          body
+          return (all_results,)
+        def orelse_name():
+          aliased_new_names, = aliased_orig_names,
+          orelse
+          return (all_results,)
+        results = tf.cond(test, body_name, orelse_name)
+      """
+      body_name = self.context.namer.new_symbol('if_true', all_referenced)
+      return templates.replace(
+          template,
+          test=node.test,
+          body_name=body_name,
+          body=node_body,
+          orelse_name=self.context.namer.new_symbol('if_false', all_referenced),
+          orelse=node_orelse,
+          aliased_orig_names=tuple(aliased_orig_names),
+          aliased_new_names=tuple(aliased_new_names),
+          all_results=tuple(alias_map[s] if s in aliased_orig_names else s
+                            for s in all_modified),
+          results=results)
+    else:
+      template = """
+        def body_name():
+          body
+          return (all_results,)
+        def orelse_name():
+          orelse
+          return (all_results,)
+        results = tf.cond(test, body_name, orelse_name)
+      """
+      body_name = self.context.namer.new_symbol('if_true', all_referenced)
+      return templates.replace(
+          template,
+          test=node.test,
+          body_name=body_name,
+          body=node_body,
+          orelse_name=self.context.namer.new_symbol('if_false', all_referenced),
+          orelse=node_orelse,
+          all_results=tuple(s for s in all_modified),
+          results=results)
 
   def visit_While(self, node):
     self.generic_visit(node)
 
-    body_scope = anno.getanno(node, 'body_scope')
-    body_closure = tuple(body_scope.modified - body_scope.created)
-
-    if len(body_closure) == 1:
-      state = body_closure[0]
+    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    body_closure = body_scope.modified - body_scope.created
+    all_referenced = body_scope.referenced
+
+    state = list(body_closure)
+    state_ssf = [
+        self.context.namer.new_symbol(s.ssf(), all_referenced) for s in state
+    ]
+    ssf_map = {
+        name: ssf
+        for name, ssf in zip(state, state_ssf)
+        if str(name) != ssf
+    }
+
+    if len(state) == 1:
+      state = state[0]
+      state_ssf = state_ssf[0]
       state_ast_tuple = state
     else:
-      state = tuple(body_closure)
-      state_ast_tuple = gast.Tuple(
-          tuple(gast.Name(n, None, None) for n in state), None)
+      state_ast_tuple = gast.Tuple([n.ast() for n in state], None)
+
+    node_body = node.body
+    node_body = [SymbolRenamer(ssf_map).visit(n) for n in node_body]
+
+    test = node.test
+    test = SymbolRenamer(ssf_map).visit(test)
+
     template = """
-      def test_name(state):
+      def test_name(state_ssf):
         return test
-      def body_name(state):
+      def body_name(state_ssf):
         body
-        return state,
+        return state_ssf,
       state_ast_tuple = tf.while_loop(test_name, body_name, [state])
     """
     node = templates.replace(
         template,
         state=state,
+        state_ssf=state_ssf,
         state_ast_tuple=state_ast_tuple,
-        test_name=self.namer.new_symbol('loop_test', body_scope.referenced),
-        test=node.test,
-        body_name=self.namer.new_symbol('loop_body', body_scope.referenced),
-        body=node.body)
+        test_name=self.context.namer.new_symbol('loop_test',
+                                                body_scope.referenced),
+        test=test,
+        body_name=self.context.namer.new_symbol('loop_body',
+                                                body_scope.referenced),
+        body=node_body)
 
     return node
 
   # pylint:enable=invalid-name
 
 
-def transform(node, namer):
-  transformer = ControlFlowTransformer(namer)
-  node = transformer.visit(node)
+def transform(node, context):
+  t = ControlFlowTransformer(context)
+  node = t.visit(node)
   return node
diff --git a/tensorflow/contrib/py2tf/converters/control_flow_test.py b/tensorflow/contrib/py2tf/converters/control_flow_test.py
index 054e33750d..f192bf1b46 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow_test.py
+++ b/tensorflow/contrib/py2tf/converters/control_flow_test.py
@@ -49,8 +49,8 @@ class ControlFlowTest(converter_test_base.TestCase):
         i += 1
       return s, i, n
 
-    node = self.parse_and_analyze(test_fn, {})
-    node = control_flow.transform(node, TestNamer())
+    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = control_flow.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
 
@@ -65,8 +65,8 @@ class ControlFlowTest(converter_test_base.TestCase):
         n -= 1
       return n
 
-    node = self.parse_and_analyze(test_fn, {})
-    node = control_flow.transform(node, TestNamer())
+    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = control_flow.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
 
@@ -84,8 +84,8 @@ class ControlFlowTest(converter_test_base.TestCase):
         b = 2 * n
       return a, b
 
-    node = self.parse_and_analyze(test_fn, {})
-    node = control_flow.transform(node, TestNamer())
+    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = control_flow.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
 
@@ -102,8 +102,8 @@ class ControlFlowTest(converter_test_base.TestCase):
         n = -n
       return n
 
-    node = self.parse_and_analyze(test_fn, {})
-    node = control_flow.transform(node, TestNamer())
+    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = control_flow.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
 
diff --git a/tensorflow/contrib/py2tf/converters/converter_test_base.py b/tensorflow/contrib/py2tf/converters/converter_test_base.py
index 6bfa55443c..bcb96c81ae 100644
--- a/tensorflow/contrib/py2tf/converters/converter_test_base.py
+++ b/tensorflow/contrib/py2tf/converters/converter_test_base.py
@@ -20,7 +20,8 @@ from __future__ import print_function
 
 from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.py2tf.pyct.static_analysis import activity
 from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
 from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.platform import test
@@ -44,7 +45,8 @@ class TestCase(test.TestCase):
         arg_values=None,
         arg_types=arg_types,
         recursive=recursive)
-    node = access.resolve(node, ctx)
+    node = qual_names.resolve(node)
+    node = activity.resolve(node, ctx)
     node = live_values.resolve(node, ctx, {})
     if include_type_analysis:
       node = type_info.resolve(node, ctx)
diff --git a/tensorflow/contrib/py2tf/converters/for_canonicalization.py b/tensorflow/contrib/py2tf/converters/for_canonicalization.py
index c284689b90..935dade0ed 100644
--- a/tensorflow/contrib/py2tf/converters/for_canonicalization.py
+++ b/tensorflow/contrib/py2tf/converters/for_canonicalization.py
@@ -22,24 +22,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import gast
-
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
 
 
-class ForLoopCanonicalizationTransformer(gast.NodeTransformer):
+class ForLoopCanonicalizationTransformer(transformer.Base):
   """Canonicalizes for loops (e.g. into while loops)."""
 
-  def __init__(self, namer):
-    self.namer = namer
+  def __init__(self, context):
+    super(ForLoopCanonicalizationTransformer, self).__init__(context)
 
   def visit_For(self, node):
     self.generic_visit(node)
-    body_scope = anno.getanno(node, 'body_scope')
-
-    # TODO(mdan): Distinguish between `for i in n` and `for i in range(n)`
-    # Or maybe we should replace range with tf.range?
+    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
 
     if anno.hasanno(node, 'extra_cond'):
       template = """
@@ -56,8 +53,8 @@ class ForLoopCanonicalizationTransformer(gast.NodeTransformer):
           loop_iter=node.iter,
           target=node.target,
           body=node.body,
-          i=self.namer.new_symbol('i', body_scope.referenced),
-          n=self.namer.new_symbol('n', body_scope.referenced),
+          i=self.context.namer.new_symbol('i', body_scope.referenced),
+          n=self.context.namer.new_symbol('n', body_scope.referenced),
           extra_cond=anno.getanno(node, 'extra_cond'))
     else:
       template = """
@@ -69,13 +66,14 @@ class ForLoopCanonicalizationTransformer(gast.NodeTransformer):
           body  # pylint:disable=pointless-statement
           i += 1
       """
-      return templates.replace(
+      repl = templates.replace(
           template,
           loop_iter=node.iter,
           target=node.target,
           body=node.body,
-          i=self.namer.new_symbol('i', body_scope.referenced),
-          n=self.namer.new_symbol('n', body_scope.referenced))
+          i=self.context.namer.new_symbol('i', body_scope.referenced),
+          n=self.context.namer.new_symbol('n', body_scope.referenced))
+      return repl
 
   def visit_Continue(self, node):
     assert False, 'continue statement should be desugared at this point'
@@ -84,7 +82,5 @@ class ForLoopCanonicalizationTransformer(gast.NodeTransformer):
     assert False, 'break statement should be desugared at this point'
 
 
-def transform(node, namer):
-  transformer = ForLoopCanonicalizationTransformer(namer)
-  node = transformer.visit(node)
-  return node
+def transform(node, context):
+  return ForLoopCanonicalizationTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
index a6e6350fd4..142bd4aea1 100644
--- a/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
@@ -41,8 +41,8 @@ class ControlFlowTest(converter_test_base.TestCase):
         s += e
       return s
 
-    node = self.parse_and_analyze(test_fn, {})
-    node = for_canonicalization.transform(node, TestNamer())
+    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = for_canonicalization.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
 
     l = [1, 2, 3]
diff --git a/tensorflow/contrib/py2tf/converters/print_functions.py b/tensorflow/contrib/py2tf/converters/print_functions.py
deleted file mode 100644
index 5da738c495..0000000000
--- a/tensorflow/contrib/py2tf/converters/print_functions.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Compatibility support. Converts Print nodes to function calls."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-
-from tensorflow.contrib.py2tf.pyct import anno
-
-
-class PrintFunctionTransformer(gast.NodeTransformer):
-  """Transforms Print nodes to Call so they can be handled as functions."""
-
-  # pylint:disable=invalid-name
-
-  def visit_Print(self, node):
-    self.generic_visit(node)
-    for n in node.values:
-      n.ctx = gast.Param()
-    call_node = gast.Call(
-        func=gast.Name('print', gast.Load(), None),
-        args=node.values,
-        keywords=[])
-    anno.setanno(call_node.func, 'live_val', print)
-    anno.setanno(call_node.func, 'fqn', 'print')
-    anno.setanno(call_node, 'args_scope', anno.getanno(node, 'args_scope'))
-    node = gast.Expr(call_node)
-    return node
-
-  # pylint:enable=invalid-name
-
-
-def transform(node):
-  transformer = PrintFunctionTransformer()
-  node = transformer.visit(node)
-  return node
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
index ffca743542..c73388d06e 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
@@ -38,6 +38,8 @@ import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
 
 
 class SymbolNamer(object):
@@ -55,11 +57,11 @@ class SymbolNamer(object):
     raise NotImplementedError()
 
 
-class SideEffectGuardTransformer(gast.NodeTransformer):
+class SideEffectGuardTransformer(transformer.Base):
   """Adds control dependencies to functions with side effects."""
 
-  def __init__(self, namer):
-    self.namer = namer
+  def __init__(self, context):
+    super(SideEffectGuardTransformer, self).__init__(context)
     self.indent_next = False
     self.next_indent_owner = None
 
@@ -88,8 +90,6 @@ class SideEffectGuardTransformer(gast.NodeTransformer):
     return new_nodes
 
   def visit_FunctionDef(self, node):
-    if anno.hasanno(node, 'skip_processing'):
-      return node
     node.body = self._visit_and_reindent(node.body)
     return node
 
@@ -122,7 +122,7 @@ class SideEffectGuardTransformer(gast.NodeTransformer):
       # First, attempt to gate future evaluation of args. If that's not
       # possible, gate all remaining statements (and that may fail too, see
       # _visit_and_reindent.
-      args_scope = anno.getanno(node.value, 'args_scope')
+      args_scope = anno.getanno(node.value, NodeAnno.ARGS_SCOPE)
       guarded_args = tuple(args_scope.used & (args_scope.parent.modified
                                               | args_scope.parent.returned))
       if guarded_args:
@@ -138,6 +138,5 @@ class SideEffectGuardTransformer(gast.NodeTransformer):
   # pylint:enable=invalid-name
 
 
-def transform(node, namer):
-  transformer = SideEffectGuardTransformer(namer)
-  return transformer.visit(node)
+def transform(node, context):
+  return SideEffectGuardTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
index 452d7ab2be..dea09ecc3f 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
@@ -43,8 +43,9 @@ class SideEffectGuardsTest(converter_test_base.TestCase):
       state_ops.assign(a, a + 1)
       return a
 
-    node = self.parse_and_analyze(test_fn, {'state_ops': state_ops})
-    node = side_effect_guards.transform(node, TestNamer())
+    node = self.parse_and_analyze(
+        test_fn, {'state_ops': state_ops}, namer=TestNamer())
+    node = side_effect_guards.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
     setattr(result, 'state_ops', state_ops)
     setattr(result, 'py2tf_utils', utils)
diff --git a/tensorflow/contrib/py2tf/impl/conversion.py b/tensorflow/contrib/py2tf/impl/conversion.py
index ed71ff5c06..ff4f159975 100644
--- a/tensorflow/contrib/py2tf/impl/conversion.py
+++ b/tensorflow/contrib/py2tf/impl/conversion.py
@@ -30,13 +30,13 @@ from tensorflow.contrib.py2tf.converters import control_flow
 from tensorflow.contrib.py2tf.converters import decorators
 from tensorflow.contrib.py2tf.converters import for_canonicalization
 from tensorflow.contrib.py2tf.converters import logical_expressions
-from tensorflow.contrib.py2tf.converters import print_functions
 from tensorflow.contrib.py2tf.converters import side_effect_guards
 from tensorflow.contrib.py2tf.impl import config
 from tensorflow.contrib.py2tf.impl import naming
 from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.py2tf.pyct.static_analysis import activity
 from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
 from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.util import tf_inspect
@@ -208,7 +208,8 @@ def function_to_graph(f, conversion_map, arg_values, arg_types,
 
 
 def _static_analysis_pass(node, ctx):
-  node = access.resolve(node, ctx)
+  node = qual_names.resolve(node)
+  node = activity.resolve(node, ctx, None)
   node = live_values.resolve(node, ctx, config.PYTHON_LITERALS)
   node = type_info.resolve(node, ctx)
   return node
@@ -233,10 +234,7 @@ def node_to_graph(node, ctx, nocompile_decorators):
 
   # TODO(mdan): Factor out common elements.
   # These include:
-  #   * keeping track of symbols that have been created
-  #   * marking nodes (e.g. py_func wrappers) to suppress further processing
   #   * code move between blocks
-  #   * insertion of new global references
   #   * visiting blocks in transformers
 
   # Certain steps, especially canonicalization, insert new symbols into the
@@ -244,29 +242,35 @@ def node_to_graph(node, ctx, nocompile_decorators):
   # to re-run the analysis.
 
   node = _static_analysis_pass(node, ctx)
+  # Past this point, line numbers are no longer accurate so we ignore the
+  # source.
+  # TODO(mdan): Is it feasible to reconstruct intermediate source code?
+  ctx.source_code = None
   node = decorators.transform(node, nocompile_decorators)
-  node = break_canonicalization.transform(node, ctx.namer)
+  node = break_canonicalization.transform(node, ctx)
   node = asserts.transform(node, ctx)
 
   # Note: sequencing continue canonicalization before for loop one avoids
   # dealing with the extra loop increment operation that the for
   # canonicalization creates.
-  node = continue_canonicalization.transform(node, ctx.namer)
+  node = continue_canonicalization.transform(node, ctx)
   ctx.namespace['len'] = len
 
   node = _static_analysis_pass(node, ctx)
-  node = for_canonicalization.transform(node, ctx.namer)
+  node = for_canonicalization.transform(node, ctx)
   # for_canonicalization may insert new global references.
-  node = builtin_functions.transform(node)
+  node = builtin_functions.transform(node, ctx)
   # builtin_functions may insert new global references.
   ctx.namespace['print'] = print
 
   node = _static_analysis_pass(node, ctx)
-  node = print_functions.transform(node)
   node = call_trees.transform(node, ctx, config.DEFAULT_UNCOMPILED_MODULES,
                               nocompile_decorators)
-  node = control_flow.transform(node, ctx.namer)
+  node = control_flow.transform(node, ctx)
+
+  # control_flow may create new symbols and change scopes.
+  node = _static_analysis_pass(node, ctx)
   node = logical_expressions.transform(node)
-  node = side_effect_guards.transform(node, ctx.namer)
+  node = side_effect_guards.transform(node, ctx)
 
   return node
diff --git a/tensorflow/contrib/py2tf/impl/naming.py b/tensorflow/contrib/py2tf/impl/naming.py
index 5c7e4c5f95..d31462cba0 100644
--- a/tensorflow/contrib/py2tf/impl/naming.py
+++ b/tensorflow/contrib/py2tf/impl/naming.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.py2tf.pyct import qual_names
+
 
 class Namer(object):
   """Implementation of the namer interfaces required by various converters.
@@ -103,11 +105,20 @@ class Namer(object):
 
   def new_symbol(self, name_root, reserved_locals):
     """See control_flow.SymbolNamer.new_symbol."""
+    # reserved_locals may contain QNs.
+    all_reserved_locals = set()
+    for s in reserved_locals:
+      if isinstance(s, qual_names.QN):
+        all_reserved_locals.update(s.qn)
+      elif isinstance(s, str):
+        all_reserved_locals.add(s)
+      else:
+        raise ValueError('Unexpected symbol type "%s"' % type(s))
+
     new_name = name_root
     n = 0
-    while (new_name in self.global_namespace
-           or new_name in reserved_locals
-           or new_name in self.generated_names):
+    while (new_name in self.global_namespace or
+           new_name in all_reserved_locals or new_name in self.generated_names):
       n += 1
       new_name = '%s_%d' % (name_root, n)
 
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD
index 1b2408ba0e..054eb17fb6 100644
--- a/tensorflow/contrib/py2tf/pyct/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/BUILD
@@ -21,8 +21,10 @@ py_library(
         "anno.py",
         "compiler.py",
         "context.py",
+        "copier.py",
         "parser.py",
         "pretty_printer.py",
+        "qual_names.py",
         "templates.py",
         "transformer.py",
     ],
@@ -57,6 +59,17 @@ py_test(
     ],
 )
 
+py_test(
+    name = "copier_test",
+    srcs = ["copier_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
+
 py_test(
     name = "parser_test",
     srcs = ["parser_test.py"],
diff --git a/tensorflow/contrib/py2tf/pyct/anno.py b/tensorflow/contrib/py2tf/pyct/anno.py
index 889e4ba4ff..c6d41f9e12 100644
--- a/tensorflow/contrib/py2tf/pyct/anno.py
+++ b/tensorflow/contrib/py2tf/pyct/anno.py
@@ -21,6 +21,25 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from enum import Enum
+
+
+class NoValue(Enum):
+
+  def __repr__(self):
+    return self.name
+
+
+class Basic(NoValue):
+  """Container for annotation keys.
+
+  The enum values are used strictly for documentation purposes.
+  """
+
+  QN = 'Qualified name, as it appeared in the code.'
+  SKIP_PROCESSING = (
+      'This node should be preserved as is and not processed any further.')
+
 
 def getanno(node, key, field_name='___pyct_anno'):
   return getattr(node, field_name)[key]
diff --git a/tensorflow/contrib/py2tf/pyct/copier.py b/tensorflow/contrib/py2tf/pyct/copier.py
new file mode 100644
index 0000000000..41598fdc99
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/copier.py
@@ -0,0 +1,68 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Copy an AST tree, discarding annotations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ast
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+
+
+class CleanCopier(gast.NodeVisitor):
+  """Copy AST nodes.
+
+  The copied nodes will ignore almost all fields that prefixed by '__'.
+  Exceptions make some annotations.
+  """
+
+  # TODO(mdan): Parametrize which annotations get carried over.
+
+  def generic_visit(self, node):
+    new_fields = {}
+    for f in node._fields:
+      if f.startswith('__'):
+        continue
+      if not hasattr(node, f):
+        continue
+      v = getattr(node, f)
+      if isinstance(v, list):
+        v = [self.generic_visit(n) for n in v]
+      elif isinstance(v, tuple):
+        v = tuple(self.generic_visit(n) for n in v)
+      elif isinstance(v, (gast.AST, ast.AST)):
+        v = self.generic_visit(v)
+      else:
+        # Assume everything else is a value type.
+        pass
+      new_fields[f] = v
+    new_node = type(node)(**new_fields)
+    if anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
+      anno.setanno(new_node, anno.Basic.SKIP_PROCESSING, True)
+    return new_node
+
+
+def copy_clean(node):
+  copier = CleanCopier()
+  if isinstance(node, list):
+    return [copier.visit(n) for n in node]
+  elif isinstance(node, tuple):
+    return tuple(copier.visit(n) for n in node)
+  else:
+    return copier.visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/print_functions_test.py b/tensorflow/contrib/py2tf/pyct/copier_test.py
similarity index 53%
rename from tensorflow/contrib/py2tf/converters/print_functions_test.py
rename to tensorflow/contrib/py2tf/pyct/copier_test.py
index 475196ce10..a6b35eda14 100644
--- a/tensorflow/contrib/py2tf/converters/print_functions_test.py
+++ b/tensorflow/contrib/py2tf/pyct/copier_test.py
@@ -12,33 +12,41 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for print_functions module."""
+"""Tests for copier module."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import gast
+import ast
 
-from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.converters import print_functions
-from tensorflow.contrib.py2tf.pyct import compiler
+from tensorflow.contrib.py2tf.pyct import copier
 from tensorflow.python.platform import test
 
 
-class PrintFunctionsTest(converter_test_base.TestCase):
-
-  def test_transform(self):
-
-    def test_fn(a):
-      print(a)
-
-    node = self.parse_and_analyze(test_fn, {'print': print})
-    node = print_functions.transform(node)
-    result = compiler.ast_to_object(node)
-
-    result.test_fn('a')
-    self.assertTrue(isinstance(node.body[0].body[0].value, gast.Call))
+class CopierTest(test.TestCase):
+
+  def test_copy_clean(self):
+    ret = ast.Return(
+        ast.BinOp(
+            op=ast.Add(),
+            left=ast.Name(id='a', ctx=ast.Load()),
+            right=ast.Num(1)))
+    setattr(ret, '__foo', 'bar')
+    node = ast.FunctionDef(
+        name='f',
+        args=ast.arguments(
+            args=[ast.Name(id='a', ctx=ast.Param())],
+            vararg=None,
+            kwarg=None,
+            defaults=[]),
+        body=[ret],
+        decorator_list=[],
+        returns=None)
+    new_node = copier.copy_clean(node)
+    self.assertFalse(node is new_node)
+    self.assertFalse(ret is new_node.body[0])
+    self.assertFalse(hasattr(new_node.body[0], '__foo'))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/pyct/pretty_printer.py b/tensorflow/contrib/py2tf/pyct/pretty_printer.py
index 5e70c0ed83..bacc1e4a77 100644
--- a/tensorflow/contrib/py2tf/pyct/pretty_printer.py
+++ b/tensorflow/contrib/py2tf/pyct/pretty_printer.py
@@ -25,24 +25,30 @@ import termcolor
 class PrettyPrinter(gast.NodeVisitor):
   """Print AST nodes."""
 
-  def __init__(self):
+  def __init__(self, color):
     self.indent_lvl = 0
     self.result = ''
+    self.color = color
+
+  def _color(self, string, color, attrs=None):
+    if self.color:
+      return termcolor.colored(string, color, attrs=attrs)
+    return string
 
   def _type(self, node):
-    return termcolor.colored(node.__class__.__name__, None, attrs=['bold'])
+    return self._color(node.__class__.__name__, None, ['bold'])
 
   def _field(self, name):
-    return termcolor.colored(name, 'blue')
+    return self._color(name, 'blue')
 
   def _value(self, name):
-    return termcolor.colored(name, 'magenta')
+    return self._color(name, 'magenta')
 
   def _warning(self, name):
-    return termcolor.colored(name, 'red')
+    return self._color(name, 'red')
 
   def _indent(self):
-    return termcolor.colored('| ' * self.indent_lvl, None, attrs=['dark'])
+    return self._color('| ' * self.indent_lvl, None, ['dark'])
 
   def _print(self, s):
     self.result += s
@@ -76,6 +82,16 @@ class PrettyPrinter(gast.NodeVisitor):
           self._print('%s]' % (self._indent()))
         else:
           self._print('%s%s=[]' % (self._indent(), self._field(f)))
+      elif isinstance(v, tuple):
+        if v:
+          self._print('%s%s=(' % (self._indent(), self._field(f)))
+          self.indent_lvl += 1
+          for n in v:
+            self.generic_visit(n)
+          self.indent_lvl -= 1
+          self._print('%s)' % (self._indent()))
+        else:
+          self._print('%s%s=()' % (self._indent(), self._field(f)))
       elif isinstance(v, gast.AST):
         self.generic_visit(v, f)
       elif isinstance(v, str):
@@ -87,8 +103,8 @@ class PrettyPrinter(gast.NodeVisitor):
     self.indent_lvl -= 1
 
 
-def fmt(node):
-  printer = PrettyPrinter()
+def fmt(node, color=True):
+  printer = PrettyPrinter(color)
   if isinstance(node, (list, tuple)):
     for n in node:
       printer.visit(n)
diff --git a/tensorflow/contrib/py2tf/pyct/pretty_printer_test.py b/tensorflow/contrib/py2tf/pyct/pretty_printer_test.py
index 65e5b1d919..81e3f47b80 100644
--- a/tensorflow/contrib/py2tf/pyct/pretty_printer_test.py
+++ b/tensorflow/contrib/py2tf/pyct/pretty_printer_test.py
@@ -24,10 +24,6 @@ from tensorflow.contrib.py2tf.pyct import pretty_printer
 from tensorflow.python.platform import test
 
 
-def f(x):
-  return x + 1
-
-
 class PrettyPrinterTest(test.TestCase):
 
   def test_format(self):
diff --git a/tensorflow/contrib/py2tf/pyct/qual_names.py b/tensorflow/contrib/py2tf/pyct/qual_names.py
new file mode 100644
index 0000000000..11e3838467
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/qual_names.py
@@ -0,0 +1,99 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for manipulating qualified names.
+
+A qualified name is a uniform way to refer to simple (e.g. 'foo') and composite
+(e.g. 'foo.bar') syntactic symbols.
+
+This is *not* related to the __qualname__ attribute used by inspect, which
+refers to scopes.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+
+
+class QN(object):
+  """Represents a qualified name.
+
+  """
+
+  def __init__(self, base, attr=None):
+    if attr:
+      if not isinstance(base, QN):
+        raise ValueError('For attribute QNs, base must be a QN.')
+      self._parent = base
+      self.qn = base.qn + (attr,)
+    else:
+      self._parent = None
+      self.qn = tuple(base.split('.'))
+
+  def is_composite(self):
+    return len(self.qn) > 1
+
+  @property
+  def parent(self):
+    if self._parent is None:
+      raise ValueError('Cannot get parent of simple name "%s".' % self.qn[0])
+    return self._parent
+
+  def __hash__(self):
+    return hash(self.qn)
+
+  def __eq__(self, other):
+    return self.qn == other.qn
+
+  def __str__(self):
+    return '.'.join(self.qn)
+
+  def __repr__(self):
+    return str(self)
+
+  def ssf(self):
+    """Simple symbol form."""
+    return '_'.join(self.qn)
+
+  def ast(self):
+    # The caller must adjust the context appropriately.
+    if self.is_composite():
+      return gast.Attribute(self.parent.ast(), self.qn[-1], None)
+    return gast.Name(self.qn[0], None, None)
+
+
+class QnResolver(gast.NodeTransformer):
+  """Annotates nodes with QN information.
+
+  Note: Not using NodeAnnos to avoid circular dependencies.
+  """
+
+  def visit_Name(self, node):
+    self.generic_visit(node)
+    anno.setanno(node, anno.Basic.QN, QN(node.id))
+    return node
+
+  def visit_Attribute(self, node):
+    self.generic_visit(node)
+    anno.setanno(node, anno.Basic.QN,
+                 QN(anno.getanno(node.value, anno.Basic.QN), node.attr))
+    return node
+
+
+def resolve(node):
+  return QnResolver().visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
index 32e2954fff..fbfce18c60 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
@@ -17,7 +17,8 @@ filegroup(
 py_library(
     name = "static_analysis",
     srcs = [
-        "access.py",
+        "activity.py",
+        "annos.py",
         "live_values.py",
         "type_info.py",
     ],
@@ -30,8 +31,8 @@ py_library(
 )
 
 py_test(
-    name = "access_test",
-    srcs = ["access_test.py"],
+    name = "activity_test",
+    srcs = ["activity_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":static_analysis",
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py
deleted file mode 100644
index df0283b54d..0000000000
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for access module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-
-from tensorflow.contrib.py2tf.pyct import anno
-from tensorflow.contrib.py2tf.pyct import context
-from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
-from tensorflow.python.platform import test
-
-
-class ScopeTest(test.TestCase):
-
-  def test_basic(self):
-    scope = access.Scope(None)
-    self.assertFalse(scope.has('foo'))
-
-    scope.mark_read('foo')
-    self.assertFalse(scope.has('foo'))
-
-    scope.mark_write('foo')
-    self.assertTrue(scope.has('foo'))
-
-    scope.mark_read('bar')
-    self.assertFalse(scope.has('bar'))
-
-  def test_copy(self):
-    scope = access.Scope(None)
-    scope.mark_write('foo')
-
-    other = access.Scope(None)
-    other.copy_from(scope)
-
-    self.assertTrue('foo' in other.created)
-
-    scope.mark_write('bar')
-    scope.copy_from(other)
-
-    self.assertFalse('bar' in scope.created)
-
-    scope.mark_write('bar')
-    scope.merge_from(other)
-
-    self.assertTrue('bar' in scope.created)
-    self.assertFalse('bar' in other.created)
-
-  def test_nesting(self):
-    scope = access.Scope(None)
-    scope.mark_write('foo')
-    scope.mark_read('bar')
-
-    child = access.Scope(scope)
-    self.assertTrue(child.has('foo'))
-    self.assertTrue(scope.has('foo'))
-
-    child.mark_write('bar')
-    self.assertTrue(child.has('bar'))
-    self.assertFalse(scope.has('bar'))
-
-  def test_referenced(self):
-    scope = access.Scope(None)
-    scope.mark_read('a')
-
-    child = access.Scope(scope)
-    child.mark_read('b')
-
-    child2 = access.Scope(child, isolated=False)
-    child2.mark_read('c')
-
-    self.assertTrue('c' in child2.referenced)
-    self.assertTrue('b' in child2.referenced)
-    self.assertFalse('a' in child2.referenced)
-
-    self.assertTrue('c' in child.referenced)
-    self.assertTrue('b' in child.referenced)
-    self.assertFalse('a' in child.referenced)
-
-
-class AccessResolverTest(test.TestCase):
-
-  def _parse_and_analyze(self, test_fn):
-    node, source = parser.parse_entity(test_fn)
-    ctx = context.EntityContext(
-        namer=None,
-        source_code=source,
-        source_file=None,
-        namespace={},
-        arg_values=None,
-        arg_types=None,
-        recursive=True)
-    node = access.resolve(node, ctx)
-    return node
-
-  def test_local_markers(self):
-
-    def test_fn(a):  # pylint:disable=unused-argument
-      b = c  # pylint:disable=undefined-variable
-      while b > 0:
-        b -= 1
-      return b
-
-    node = self._parse_and_analyze(test_fn)
-    self.assertFalse(anno.getanno(node.body[0].body[0].value,
-                                  'is_local'))  # c in b = c
-    self.assertTrue(anno.getanno(node.body[0].body[1].test.left,
-                                 'is_local'))  # b in b > 0
-    self.assertTrue(anno.getanno(node.body[0].body[2].value,
-                                 'is_local'))  # b in return b
-
-  def assertScopeIs(self, scope, used, modified, created):
-    self.assertItemsEqual(used, scope.used)
-    self.assertItemsEqual(modified, scope.modified)
-    self.assertItemsEqual(created, scope.created)
-
-  def test_print_statement(self):
-
-    def test_fn(a):
-      b = 0
-      c = 1
-      print(a, b)
-      return c
-
-    node = self._parse_and_analyze(test_fn)
-    print_node = node.body[0].body[2]
-    if isinstance(print_node, gast.Print):
-      # Python 2
-      print_args_scope = anno.getanno(print_node, 'args_scope')
-    else:
-      # Python 3
-      assert isinstance(print_node, gast.Expr)
-      # The call node should be the one being annotated.
-      print_node = print_node.value
-      print_args_scope = anno.getanno(print_node, 'args_scope')
-    # We basically need to detect which variables are captured by the call
-    # arguments.
-    self.assertScopeIs(print_args_scope, ('a', 'b'), (), ())
-
-  def test_call(self):
-
-    def test_fn(a):
-      b = 0
-      c = 1
-      foo(a, b)  # pylint:disable=undefined-variable
-      return c
-
-    node = self._parse_and_analyze(test_fn)
-    call_node = node.body[0].body[2].value
-    # We basically need to detect which variables are captured by the call
-    # arguments.
-    self.assertScopeIs(
-        anno.getanno(call_node, 'args_scope'), ('a', 'b'), (), ())
-
-  def test_while(self):
-
-    def test_fn(a):
-      b = a
-      while b > 0:
-        c = b
-        b -= 1
-      return b, c
-
-    node = self._parse_and_analyze(test_fn)
-    while_node = node.body[0].body[1]
-    self.assertScopeIs(
-        anno.getanno(while_node, 'body_scope'), ('b',), ('b', 'c'), ('c',))
-    self.assertScopeIs(
-        anno.getanno(while_node, 'body_parent_scope'), ('a', 'b', 'c'),
-        ('b', 'c'), ('a', 'b', 'c'))
-
-  def test_for(self):
-
-    def test_fn(a):
-      b = a
-      for _ in a:
-        c = b
-        b -= 1
-      return b, c
-
-    node = self._parse_and_analyze(test_fn)
-    for_node = node.body[0].body[1]
-    self.assertScopeIs(
-        anno.getanno(for_node, 'body_scope'), ('b',), ('b', 'c'), ('c',))
-    self.assertScopeIs(
-        anno.getanno(for_node, 'body_parent_scope'), ('a', 'b', 'c'),
-        ('b', 'c', '_'), ('a', 'b', 'c', '_'))
-
-  def test_if(self):
-
-    def test_fn(x):
-      if x > 0:
-        x = -x
-        y = 2 * x
-        z = -y
-      else:
-        x = 2 * x
-        y = -x
-        u = -y
-      return z, u
-
-    node = self._parse_and_analyze(test_fn)
-    if_node = node.body[0].body[0]
-    self.assertScopeIs(
-        anno.getanno(if_node, 'body_scope'), ('x', 'y'), ('x', 'y', 'z'),
-        ('y', 'z'))
-    # TODO(mdan): Double check: is it ok to not mark a local symbol as not read?
-    self.assertScopeIs(
-        anno.getanno(if_node, 'body_parent_scope'), ('x', 'z', 'u'),
-        ('x', 'y', 'z', 'u'), ('x', 'y', 'z', 'u'))
-    self.assertScopeIs(
-        anno.getanno(if_node, 'orelse_scope'), ('x', 'y'), ('x', 'y', 'u'),
-        ('y', 'u'))
-    self.assertScopeIs(
-        anno.getanno(if_node, 'body_parent_scope'), ('x', 'z', 'u'),
-        ('x', 'y', 'z', 'u'), ('x', 'y', 'z', 'u'))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/access.py b/tensorflow/contrib/py2tf/pyct/static_analysis/activity.py
similarity index 75%
rename from tensorflow/contrib/py2tf/pyct/static_analysis/access.py
rename to tensorflow/contrib/py2tf/pyct/static_analysis/activity.py
index 33629f87d1..1c93e16031 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/access.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/activity.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Access information (reads, writes) resolution."""
+"""Activity analysis."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,6 +24,7 @@ import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
 
 # TODO(mdan): Add support for PY3 (e.g. Param vs arg).
 
@@ -112,6 +113,14 @@ class Scope(object):
     self.params.add(name)
 
   def mark_creation(self, name):
+    if name.is_composite():
+      parent = name.parent
+      if self.has(parent):
+        # This is considered mutation of the parent, not creation.
+        # TODO(mdan): Is that really so?
+        return
+      else:
+        raise ValueError('Unknown symbol "%s".' % parent)
     self.created.add(name)
 
   def mark_write(self, name):
@@ -132,39 +141,48 @@ class Scope(object):
       self.parent.mark_returned(name)
 
 
-class AccessResolver(transformer.Base):
+class ActivityAnalizer(transformer.Base):
   """Annotates nodes with local scope information. See Scope."""
 
-  def __init__(self, context):
-    super(AccessResolver, self).__init__(context)
-    self.scope = Scope(None)
+  def __init__(self, context, parent_scope):
+    super(ActivityAnalizer, self).__init__(context)
+    self.scope = Scope(parent_scope)
     self._in_return_statement = False
 
-  def visit_Name(self, node):
-    # TODO(mdan): This is insufficient for object fields, e.g. hp.learning_rate.
-    self.generic_visit(node)
+  def _track_symbol(self, node):
+    qn = anno.getanno(node, anno.Basic.QN)
+
     if isinstance(node.ctx, gast.Store):
-      self.scope.mark_write(node.id)
+      self.scope.mark_write(qn)
     elif isinstance(node.ctx, gast.Load):
-      anno.setanno(node, 'is_local', self.scope.has(node.id))
-      self.scope.mark_read(node.id)
+      self.scope.mark_read(qn)
     elif isinstance(node.ctx, gast.Param):
       # Param contexts appear in function defs, so they have the meaning of
       # defining a variable.
       # TODO(mdan): This bay be incorrect with nested functions.
       # For nested functions, we'll have to add the notion of hiding args from
       # the parent scope, not writing to them.
-      self.scope.mark_creation(node.id)
-      self.scope.mark_param(node.id)
+      self.scope.mark_creation(qn)
+      self.scope.mark_param(qn)
     else:
-      raise ValueError('Unknown context %s for node %s.' % (type(node.ctx),
-                                                            node.id))
-    anno.setanno(node, 'is_modified_since_entry',
-                 self.scope.is_modified_since_entry(node.id))
-    anno.setanno(node, 'is_param', self.scope.is_param(node.id))
+      raise ValueError('Unknown context %s for node %s.' % (type(node.ctx), qn))
+
+    anno.setanno(node, NodeAnno.IS_LOCAL, self.scope.has(qn))
+    anno.setanno(node, NodeAnno.IS_MODIFIED_SINCE_ENTRY,
+                 self.scope.is_modified_since_entry(qn))
+    anno.setanno(node, NodeAnno.IS_PARAM, self.scope.is_param(qn))
 
     if self._in_return_statement:
-      self.scope.mark_returned(node.id)
+      self.scope.mark_returned(qn)
+
+  def visit_Name(self, node):
+    self.generic_visit(node)
+    self._track_symbol(node)
+    return node
+
+  def visit_Attribute(self, node):
+    self.generic_visit(node)
+    self._track_symbol(node)
     return node
 
   def visit_Print(self, node):
@@ -173,7 +191,7 @@ class AccessResolver(transformer.Base):
     self.scope = args_scope
     for n in node.values:
       self.visit(n)
-    anno.setanno(node, 'args_scope', args_scope)
+    anno.setanno(node, NodeAnno.ARGS_SCOPE, args_scope)
     self.scope = current_scope
     return node
 
@@ -186,7 +204,7 @@ class AccessResolver(transformer.Base):
     # TODO(mdan): Account starargs, kwargs
     for n in node.keywords:
       self.visit(n)
-    anno.setanno(node, 'args_scope', args_scope)
+    anno.setanno(node, NodeAnno.ARGS_SCOPE, args_scope)
     self.scope = current_scope
     self.visit(node.func)
     return node
@@ -197,7 +215,7 @@ class AccessResolver(transformer.Base):
     self.scope = block_scope
     for n in block:
       self.visit(n)
-    anno.setanno(node, '%s_scope' % scope_name, block_scope)
+    anno.setanno(node, scope_name, block_scope)
     self.scope = current_scope
     return node
 
@@ -209,36 +227,36 @@ class AccessResolver(transformer.Base):
     before_parent = Scope(None)
     before_parent.copy_from(self.scope)
     after_children = []
-    for child, name in children:
+    for child, scope_name in children:
       self.scope.copy_from(before_parent)
-      parent = self._process_block_node(parent, child, name)
+      parent = self._process_block_node(parent, child, scope_name)
       after_child = Scope(None)
       after_child.copy_from(self.scope)
       after_children.append(after_child)
     for after_child in after_children:
       self.scope.merge_from(after_child)
-    for child, name in children:
-      # TODO(mdan): We don't need this - we have the parent link from scope.
-      anno.setanno(parent, '%s_parent_scope' % name, self.scope)
     return parent
 
   def visit_If(self, node):
     self.visit(node.test)
-    node = self._process_parallel_blocks(
-        node, ((node.body, 'body'), (node.orelse, 'orelse')))
+    node = self._process_parallel_blocks(node,
+                                         ((node.body, NodeAnno.BODY_SCOPE),
+                                          (node.orelse, NodeAnno.ORELSE_SCOPE)))
     return node
 
   def visit_For(self, node):
     self.visit(node.target)
     self.visit(node.iter)
-    node = self._process_parallel_blocks(
-        node, ((node.body, 'body'), (node.orelse, 'orelse')))
+    node = self._process_parallel_blocks(node,
+                                         ((node.body, NodeAnno.BODY_SCOPE),
+                                          (node.orelse, NodeAnno.ORELSE_SCOPE)))
     return node
 
   def visit_While(self, node):
     self.visit(node.test)
-    node = self._process_parallel_blocks(
-        node, ((node.body, 'body'), (node.orelse, 'orelse')))
+    node = self._process_parallel_blocks(node,
+                                         ((node.body, NodeAnno.BODY_SCOPE),
+                                          (node.orelse, NodeAnno.ORELSE_SCOPE)))
     return node
 
   def visit_Return(self, node):
@@ -248,5 +266,5 @@ class AccessResolver(transformer.Base):
     return node
 
 
-def resolve(node, context):
-  return AccessResolver(context).visit(node)
+def resolve(node, context, parent_scope=None):
+  return ActivityAnalizer(context, parent_scope).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py
new file mode 100644
index 0000000000..e1eb954a5e
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py
@@ -0,0 +1,271 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for activity module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import context
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.py2tf.pyct.qual_names import QN
+from tensorflow.contrib.py2tf.pyct.static_analysis import activity
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.python.platform import test
+
+
+class ScopeTest(test.TestCase):
+
+  def test_basic(self):
+    scope = activity.Scope(None)
+    self.assertFalse(scope.has(QN('foo')))
+
+    scope.mark_read(QN('foo'))
+    self.assertFalse(scope.has(QN('foo')))
+
+    scope.mark_write(QN('foo'))
+    self.assertTrue(scope.has(QN('foo')))
+
+    scope.mark_read(QN('bar'))
+    self.assertFalse(scope.has(QN('bar')))
+
+  def test_copy(self):
+    scope = activity.Scope(None)
+    scope.mark_write(QN('foo'))
+
+    other = activity.Scope(None)
+    other.copy_from(scope)
+
+    self.assertTrue(QN('foo') in other.created)
+
+    scope.mark_write(QN('bar'))
+    scope.copy_from(other)
+
+    self.assertFalse(QN('bar') in scope.created)
+
+    scope.mark_write(QN('bar'))
+    scope.merge_from(other)
+
+    self.assertTrue(QN('bar') in scope.created)
+    self.assertFalse(QN('bar') in other.created)
+
+  def test_nesting(self):
+    scope = activity.Scope(None)
+    scope.mark_write(QN('foo'))
+    scope.mark_read(QN('bar'))
+
+    child = activity.Scope(scope)
+    self.assertTrue(child.has(QN('foo')))
+    self.assertTrue(scope.has(QN('foo')))
+
+    child.mark_write(QN('bar'))
+    self.assertTrue(child.has(QN('bar')))
+    self.assertFalse(scope.has(QN('bar')))
+
+  def test_referenced(self):
+    scope = activity.Scope(None)
+    scope.mark_read(QN('a'))
+
+    child = activity.Scope(scope)
+    child.mark_read(QN('b'))
+
+    child2 = activity.Scope(child, isolated=False)
+    child2.mark_read(QN('c'))
+
+    self.assertTrue(QN('c') in child2.referenced)
+    self.assertTrue(QN('b') in child2.referenced)
+    self.assertFalse(QN('a') in child2.referenced)
+
+    self.assertTrue(QN('c') in child.referenced)
+    self.assertTrue(QN('b') in child.referenced)
+    self.assertFalse(QN('a') in child.referenced)
+
+
+class ActivityAnalizerTest(test.TestCase):
+
+  def _parse_and_analyze(self, test_fn):
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=source,
+        source_file=None,
+        namespace={},
+        arg_values=None,
+        arg_types=None,
+        recursive=True)
+    node = qual_names.resolve(node)
+    node = activity.resolve(node, ctx)
+    return node
+
+  def test_local_markers(self):
+
+    def test_fn(a):  # pylint:disable=unused-argument
+      b = c  # pylint:disable=undefined-variable
+      while b > 0:
+        b -= 1
+      return b
+
+    node = self._parse_and_analyze(test_fn)
+    self.assertFalse(
+        anno.getanno(node.body[0].body[0].value,
+                     NodeAnno.IS_LOCAL))  # c in b = c
+    self.assertTrue(
+        anno.getanno(node.body[0].body[1].test.left,
+                     NodeAnno.IS_LOCAL))  # b in b > 0
+    self.assertTrue(
+        anno.getanno(node.body[0].body[2].value,
+                     NodeAnno.IS_LOCAL))  # b in return b
+
+  def assertScopeIs(self, scope, used, modified, created):
+    self.assertItemsEqual(used, tuple(str(s) for s in scope.used))
+    self.assertItemsEqual(modified, tuple(str(s) for s in scope.modified))
+    self.assertItemsEqual(created, tuple(str(s) for s in scope.created))
+
+  def test_print_statement(self):
+
+    def test_fn(a):
+      b = 0
+      c = 1
+      print(a, b)
+      return c
+
+    node = self._parse_and_analyze(test_fn)
+    print_node = node.body[0].body[2]
+    if isinstance(print_node, gast.Print):
+      # Python 2
+      print_args_scope = anno.getanno(print_node, NodeAnno.ARGS_SCOPE)
+    else:
+      # Python 3
+      assert isinstance(print_node, gast.Expr)
+      # The call node should be the one being annotated.
+      print_node = print_node.value
+      print_args_scope = anno.getanno(print_node, NodeAnno.ARGS_SCOPE)
+    # We basically need to detect which variables are captured by the call
+    # arguments.
+    self.assertScopeIs(print_args_scope, ('a', 'b'), (), ())
+
+  def test_call(self):
+
+    def test_fn(a):
+      b = 0
+      c = 1
+      foo(a, b)  # pylint:disable=undefined-variable
+      return c
+
+    node = self._parse_and_analyze(test_fn)
+    call_node = node.body[0].body[2].value
+    # We basically need to detect which variables are captured by the call
+    # arguments.
+    self.assertScopeIs(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'b'), (), ())
+
+  def test_while(self):
+
+    def test_fn(a):
+      b = a
+      while b > 0:
+        c = b
+        b -= 1
+      return b, c
+
+    node = self._parse_and_analyze(test_fn)
+    while_node = node.body[0].body[1]
+    self.assertScopeIs(
+        anno.getanno(while_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'),
+        ('c',))
+    self.assertScopeIs(
+        anno.getanno(while_node, NodeAnno.BODY_SCOPE).parent, ('a', 'b', 'c'),
+        ('b', 'c'), ('a', 'b', 'c'))
+
+  def test_for(self):
+
+    def test_fn(a):
+      b = a
+      for _ in a:
+        c = b
+        b -= 1
+      return b, c
+
+    node = self._parse_and_analyze(test_fn)
+    for_node = node.body[0].body[1]
+    self.assertScopeIs(
+        anno.getanno(for_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'), ('c',))
+    self.assertScopeIs(
+        anno.getanno(for_node, NodeAnno.BODY_SCOPE).parent, ('a', 'b', 'c'),
+        ('b', 'c', '_'), ('a', 'b', 'c', '_'))
+
+  def test_if(self):
+
+    def test_fn(x):
+      if x > 0:
+        x = -x
+        y = 2 * x
+        z = -y
+      else:
+        x = 2 * x
+        y = -x
+        u = -y
+      return z, u
+
+    node = self._parse_and_analyze(test_fn)
+    if_node = node.body[0].body[0]
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('x', 'y', 'z'),
+        ('y', 'z'))
+    # TODO(mdan): Double check: is it ok to not mark a local symbol as not read?
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE).parent, ('x', 'z', 'u'),
+        ('x', 'y', 'z', 'u'), ('x', 'y', 'z', 'u'))
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE), ('x', 'y'),
+        ('x', 'y', 'u'), ('y', 'u'))
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE).parent, ('x', 'z', 'u'),
+        ('x', 'y', 'z', 'u'), ('x', 'y', 'z', 'u'))
+
+  def test_call_with_composite_names(self):
+
+    def foo(*_):
+      pass
+
+    def test_fn(a):
+      foo(a.b, a.c)
+      if a > 0:
+        a.b = 2
+      else:
+        d = 2
+        d.e = a.c
+        f = d.e + 1
+        a.c = f
+
+    node = self._parse_and_analyze(test_fn)
+    call_node = node.body[0].body[0].value
+    self.assertScopeIs(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'a.b', 'a.c'), (),
+        ())
+    if_node = node.body[0].body[1]
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('a',), ('a.b',), ())
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE),
+        ('a', 'a.c', 'd', 'd.e', 'f'), ('a.c', 'd', 'd.e', 'f'), ('d', 'f'))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/annos.py b/tensorflow/contrib/py2tf/pyct/static_analysis/annos.py
new file mode 100644
index 0000000000..2d8e494423
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/annos.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Annotations used by the static analizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from enum import Enum
+
+
+class NoValue(Enum):
+
+  def __repr__(self):
+    return self.name
+
+
+class NodeAnno(NoValue):
+  """Additionnal annotations used by the static analyzer.
+
+  These are in addition to the basic annotations declared in anno.py.
+  """
+
+  # Symbols
+
+  IS_LOCAL = 'Symbol is local to the function scope being analized.'
+  IS_PARAM = 'Symbol is a parameter to the function being analized.'
+  IS_MODIFIED_SINCE_ENTRY = (
+      'Symbol has been explicitly replaced in the current function scope.')
+
+  # Scopes
+  ARGS_SCOPE = 'The scope for the argument list of a function call.'
+  BODY_SCOPE = (
+      'The scope for the main body of a statement (True branch for if '
+      'statements, main body for loops).')
+  ORELSE_SCOPE = (
+      'The scope for the orelse body of a statement (False branch for if '
+      'statements, orelse body for loops).')
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
index 5a2903e6b5..9c0a9a9e74 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
@@ -16,7 +16,7 @@
 
 Live values are extracted from the known execution context.
 
-Requires annotations generated by AccessResolver.
+Requires activity analysis annotations.
 """
 
 from __future__ import absolute_import
@@ -27,6 +27,7 @@ import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
 
 
 class LiveValueResolver(transformer.Base):
@@ -44,12 +45,12 @@ class LiveValueResolver(transformer.Base):
   def visit_Name(self, node):
     self.generic_visit(node)
     if isinstance(node.ctx, gast.Load):
-      assert anno.hasanno(node, 'is_local'), node
-      symbol_is_local = anno.getanno(node, 'is_local')
-      assert anno.hasanno(node, 'is_modified_since_entry'), node
-      symbol_is_modified = anno.getanno(node, 'is_modified_since_entry')
-      assert anno.hasanno(node, 'is_param'), node
-      symbol_is_param = anno.getanno(node, 'is_param')
+      assert anno.hasanno(node, NodeAnno.IS_LOCAL), node
+      symbol_is_local = anno.getanno(node, NodeAnno.IS_LOCAL)
+      assert anno.hasanno(node, NodeAnno.IS_MODIFIED_SINCE_ENTRY), node
+      symbol_is_modified = anno.getanno(node, NodeAnno.IS_MODIFIED_SINCE_ENTRY)
+      assert anno.hasanno(node, NodeAnno.IS_PARAM), node
+      symbol_is_param = anno.getanno(node, NodeAnno.IS_PARAM)
 
       if not symbol_is_local and not symbol_is_param:
         if node.id in self.literals:
@@ -60,7 +61,11 @@ class LiveValueResolver(transformer.Base):
           anno.setanno(node, 'live_val', obj)
           anno.setanno(node, 'fqn', (obj.__name__,))
         else:
-          raise ValueError('Could not resolve symbol "%s".' % node.id)
+          pass
+          # TODO(mdan): Should we raise an error here?
+          # Can encounter this when:
+          #  * a symbol truly lacks reference
+          #  * a symbol is new, like the new name of a function we just renamed.
       else:
         pass
         # TODO(mdan): Attempt to trace its value through the local chain.
@@ -97,7 +102,7 @@ class LiveValueResolver(transformer.Base):
     elif isinstance(node.value, gast.Name):
       stem_name = node.value
       # All nonlocal symbols should be fully resolved.
-      assert anno.hasanno(stem_name, 'is_local'), stem_name
+      assert anno.hasanno(stem_name, NodeAnno.IS_LOCAL), stem_name
       # TODO(mdan): Figure out what to do when calling attribute on local object
       # Maybe just leave as-is?
     return node
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
index f3057b3466..9f64689401 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
@@ -21,7 +21,8 @@ from __future__ import print_function
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.py2tf.pyct.static_analysis import activity
 from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
 from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.framework import constant_op
@@ -46,7 +47,8 @@ class LiveValuesResolverTest(test.TestCase):
         arg_values=None,
         arg_types=arg_types,
         recursive=True)
-    node = access.resolve(node, ctx)
+    node = qual_names.resolve(node)
+    node = activity.resolve(node, ctx)
     node = live_values.resolve(node, ctx, literals)
     node = type_info.resolve(node, ctx)
     node = live_values.resolve(node, ctx, literals)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
index cf74142cbe..8203bda0f9 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
@@ -116,28 +116,30 @@ class TypeInfoResolver(transformer.Base):
     return node
 
   def _process_function_arg(self, arg_name):
-    if self.function_level == 1 and arg_name in self.context.arg_types:
+    str_name = str(arg_name)
+    if self.function_level == 1 and str_name in self.context.arg_types:
       # Forge a node to hold the type information, so that method calls on
       # it can resolve the type.
-      type_holder = gast.Name(arg_name, gast.Load(), None)
-      type_string, type_obj = self.context.arg_types[arg_name]
+      type_holder = arg_name.ast()
+      type_string, type_obj = self.context.arg_types[str_name]
       anno.setanno(type_holder, 'type', type_obj)
       anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.')))
       self.scope.setval(arg_name, type_holder)
 
   def visit_arg(self, node):
-    self._process_function_arg(node.arg)
+    self._process_function_arg(anno.getanno(node.arg, anno.Basic.QN))
     return node
 
   def visit_Name(self, node):
     self.generic_visit(node)
+    qn = anno.getanno(node, anno.Basic.QN)
     if isinstance(node.ctx, gast.Param):
-      self._process_function_arg(node.id)
-    elif isinstance(node.ctx, gast.Load) and self.scope.hasval(node.id):
+      self._process_function_arg(qn)
+    elif isinstance(node.ctx, gast.Load) and self.scope.hasval(qn):
       # E.g. if we had
       # a = b
       # then for future references to `a` we should have traced_source = `b`
-      traced_source = self.scope.getval(node.id)
+      traced_source = self.scope.getval(qn)
       if anno.hasanno(traced_source, 'type'):
         anno.setanno(node, 'type', anno.getanno(traced_source, 'type'))
         anno.setanno(node, 'type_fqn', anno.getanno(traced_source, 'type_fqn'))
@@ -159,16 +161,11 @@ class TypeInfoResolver(transformer.Base):
     for t in targets:
       if isinstance(t, gast.Tuple):
         for i, e in enumerate(t.elts):
-          self.scope.setval(e.id,
-                            gast.Subscript(
-                                source, gast.Index(i), ctx=gast.Store()))
-      elif isinstance(t, gast.Name):
-        self.scope.setval(t.id, source)
-      elif isinstance(t, gast.Attribute):
-        if not (isinstance(t.value, gast.Name) and t.value.id == 'self'):
-          raise ValueError(
-              'Dont know how to handle assignment to attributes of objects'
-              ' other than "self": [%s].%s' % (t.value, t.attr))
+          self.scope.setval(
+              anno.getanno(e, anno.Basic.QN),
+              gast.Subscript(source, gast.Index(i), ctx=gast.Store()))
+      elif isinstance(t, (gast.Name, gast.Attribute)):
+        self.scope.setval(anno.getanno(t, anno.Basic.QN), source)
       else:
         raise ValueError('Dont know how to handle assignment to %s' % t)
 
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
index 68fa1ee92a..3659f949db 100644
--- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
@@ -21,7 +21,8 @@ from __future__ import print_function
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
-from tensorflow.contrib.py2tf.pyct.static_analysis import access
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.py2tf.pyct.static_analysis import activity
 from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
 from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.client import session
@@ -65,7 +66,8 @@ class TypeInfoResolverTest(test.TestCase):
         arg_values=None,
         arg_types=arg_types,
         recursive=True)
-    node = access.resolve(node, ctx)
+    node = qual_names.resolve(node)
+    node = activity.resolve(node, ctx)
     node = live_values.resolve(node, ctx, {})
     node = type_info.resolve(node, ctx)
     node = live_values.resolve(node, ctx, {})
diff --git a/tensorflow/contrib/py2tf/pyct/templates.py b/tensorflow/contrib/py2tf/pyct/templates.py
index 6be526f20d..1039fc8713 100644
--- a/tensorflow/contrib/py2tf/pyct/templates.py
+++ b/tensorflow/contrib/py2tf/pyct/templates.py
@@ -22,12 +22,13 @@ from __future__ import division
 from __future__ import print_function
 
 import ast
-import copy
 import textwrap
 
 import gast
 
+from tensorflow.contrib.py2tf.pyct import copier
 from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import qual_names
 
 
 class ReplaceTransformer(gast.NodeTransformer):
@@ -41,6 +42,7 @@ class ReplaceTransformer(gast.NodeTransformer):
           that these placeholders will be replaced by.
     """
     self.replacements = replacements
+    self.in_replacements = False
 
   # TODO(mdan): Make a more detailed pass and clean up if needed.
 
@@ -62,34 +64,53 @@ class ReplaceTransformer(gast.NodeTransformer):
       node.name = repl.id
     return node
 
-  def visit_Name(self, node):
-    if node.id in self.replacements:
-      # TODO(mdan): Sanitize the nodes by erasing scope-dependent annotations.
-      new_nodes = copy.copy(self.replacements[node.id])
-      if isinstance(new_nodes, gast.AST):
-        new_nodes = [new_nodes]
-      # Preserve the target context.
-      for n in new_nodes:
-        if isinstance(n, gast.Tuple):
-          for e in n.elts:
-            e.ctx = node.ctx
-        n.ctx = node.ctx
-      if len(new_nodes) == 1:
-        new_nodes, = new_nodes
-      return new_nodes
+  def _set_inner_child_context(self, node, ctx):
+    if isinstance(node, gast.Attribute):
+      self._set_inner_child_context(node.value, ctx)
+      node.ctx = gast.Load()
+    elif isinstance(node, gast.Name):
+      node.ctx = ctx
     else:
+      raise ValueError('unexpected node type "%s"' % node)
+
+  def visit_Name(self, node):
+    if node.id not in self.replacements:
       return node
 
+    new_nodes = copier.copy_clean(self.replacements[node.id])
+    if isinstance(new_nodes, gast.AST):
+      new_nodes = [new_nodes]
+
+    # Preserve the target context.
+    for n in new_nodes:
+      if isinstance(n, gast.Tuple):
+        for e in n.elts:
+          self._set_inner_child_context(e, node.ctx)
+      if isinstance(n, gast.Attribute):
+        # For attributes, the inner Name node receives the context, while the
+        # outer ones have it set to Load.
+        self._set_inner_child_context(n, node.ctx)
+      else:
+        n.ctx = node.ctx
+
+    if len(new_nodes) == 1:
+      new_nodes, = new_nodes
+
+    return new_nodes
+
 
-def _strings_to_names(n):
+def _convert_to_ast(n):
+  """Convert from a known data type to AST."""
   if isinstance(n, str):
     # Note: the node will receive the ctx value from the template, see
     # ReplaceTransformer.visit_Name.
     return gast.Name(id=n, ctx=None, annotation=None)
+  if isinstance(n, qual_names.QN):
+    return n.ast()
   if isinstance(n, list):
-    return [_strings_to_names(e) for e in n]
+    return [_convert_to_ast(e) for e in n]
   if isinstance(n, tuple):
-    return tuple(_strings_to_names(e) for e in n)
+    return tuple(_convert_to_ast(e) for e in n)
   return n
 
 
@@ -122,5 +143,8 @@ def replace(template, **replacements):
     raise ValueError('Expected string template, got %s' % type(template))
   tree = parser.parse_str(textwrap.dedent(template))
   for k in replacements:
-    replacements[k] = _strings_to_names(replacements[k])
-  return ReplaceTransformer(replacements).visit(tree).body
+    replacements[k] = _convert_to_ast(replacements[k])
+  results = ReplaceTransformer(replacements).visit(tree).body
+  if isinstance(results, list):
+    return [qual_names.resolve(r) for r in results]
+  return qual_names.resolve(results)
diff --git a/tensorflow/contrib/py2tf/pyct/templates_test.py b/tensorflow/contrib/py2tf/pyct/templates_test.py
index 1143131283..0e3d07e378 100644
--- a/tensorflow/contrib/py2tf/pyct/templates_test.py
+++ b/tensorflow/contrib/py2tf/pyct/templates_test.py
@@ -27,6 +27,16 @@ from tensorflow.python.platform import test
 
 class TemplatesTest(test.TestCase):
 
+  def test_replace_tuple(self):
+    template = """
+      def test_fn(a, c):
+        return b,
+    """
+
+    node = templates.replace(template, b=('a', 'c'))[0]
+    result = compiler.ast_to_object(node)
+    self.assertEquals((2, 3), result.test_fn(2, 3))
+
   def test_replace_variable(self):
     template = """
       def test_fn(a):
diff --git a/tensorflow/contrib/py2tf/pyct/transformer.py b/tensorflow/contrib/py2tf/pyct/transformer.py
index 8a836b7c1b..877d52af01 100644
--- a/tensorflow/contrib/py2tf/pyct/transformer.py
+++ b/tensorflow/contrib/py2tf/pyct/transformer.py
@@ -23,6 +23,7 @@ import sys
 import gast
 import six
 
+from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import pretty_printer
 
 
@@ -44,16 +45,19 @@ class Base(gast.NodeTransformer):
     self.context = context
 
   def visit(self, node):
+    source_code = self.context.source_code
+    source_file = self.context.source_file
     try:
-      source_code = self.context.source_code
-      source_file = self.context.source_file
       if source_code and hasattr(node, 'lineno'):
         self._lineno = node.lineno
         self._col_offset = node.col_offset
+      if anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
+        return node
       return super(Base, self).visit(node)
-    except (ValueError, AttributeError, NotImplementedError) as e:
-      msg = '%s: %s\nOccurred at node:\n%s' % (e.__class__.__name__, str(e),
-                                               pretty_printer.fmt(node))
+    except (ValueError, AttributeError, KeyError, NotImplementedError,
+            AssertionError) as e:
+      msg = '%s: %s\nOccurred at node:\n%s' % (
+          e.__class__.__name__, str(e), pretty_printer.fmt(node, color=False))
       if source_code:
         line = source_code.splitlines()[self._lineno - 1]
       else:
-- 
GitLab


From 95ed84d5eaa4acccb2f28bda5e5e459921568c3b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 08:32:18 -0800
Subject: [PATCH 644/966] Internal Change

PiperOrigin-RevId: 184532417
---
 .../internal/optimized/optimized_ops.h        | 20 +++++++++++++++++--
 .../internal/reference/reference_ops.h        | 17 +++++++++++++++-
 .../contrib/lite/toco/export_tensorflow.cc    |  1 +
 .../contrib/lite/toco/import_tensorflow.cc    |  6 ++++++
 tensorflow/contrib/lite/toco/model.h          |  2 ++
 5 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 9ffa72b401..d5b0f45fd8 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -3459,7 +3459,7 @@ inline void ResizeBilinearGeneric(const float* input_data,
 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
                            const Dims<4>& output_size_dims, float* output_data,
-                           const Dims<4>& output_dims) {
+                           const Dims<4>& output_dims, bool align_corners) {
   gemmlowp::ScopedProfilingLabel label("ResizeBilinear");
   int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   int32 input_height = ArraySize(input_dims, 2);
@@ -3474,13 +3474,20 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
   int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
 
   // Specialize for 2x2 upsample.
-  if (output_height == 2 * input_height && output_width == 2 * input_width) {
+  if (!align_corners && output_height == 2 * input_height &&
+      output_width == 2 * input_width) {
     ResizeBilinear2x2(input_data, input_dims, output_data, output_dims, batches,
                       input_height, input_width, depth, output_height,
                       output_width);
   } else {
     float height_scale = static_cast<float>(input_height) / output_height;
     float width_scale = static_cast<float>(input_width) / output_width;
+    if (align_corners && output_height > 1) {
+      height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
+    }
+    if (align_corners && output_width > 1) {
+      width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
+    }
 
     ResizeBilinearGeneric(input_data, input_dims, output_data, output_dims,
                           batches, input_height, input_width, depth,
@@ -3489,6 +3496,15 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+// legacy, for compatibility with old checked-in code
+inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, float* output_data,
+                           const Dims<4>& output_dims) {
+  ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims,
+                 output_data, output_dims, /*align_corners=*/false);
+}
+
 template <typename T>
 inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 730e6f2ad2..40e5c48a4c 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -2251,7 +2251,7 @@ inline void Gather(const T* input_data, const Dims<4>& input_dims,
 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
                            const Dims<4>& output_size_dims, float* output_data,
-                           const Dims<4>& output_dims) {
+                           const Dims<4>& output_dims, bool align_corners) {
   int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   int32 input_height = ArraySize(input_dims, 2);
   int32 input_width = ArraySize(input_dims, 1);
@@ -2265,6 +2265,12 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
   int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
   float height_scale = static_cast<float>(input_height) / output_height;
   float width_scale = static_cast<float>(input_width) / output_width;
+  if (align_corners && output_height > 1) {
+    height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
+  }
+  if (align_corners && output_width > 1) {
+    width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
+  }
 
   for (int b = 0; b < batches; ++b) {
     for (int y = 0; y < output_height; ++y) {
@@ -2292,6 +2298,15 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+// legacy, for compatibility with old checked-in code
+inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, float* output_data,
+                           const Dims<4>& output_dims) {
+  ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims,
+                 output_data, output_dims, /*align_corners=*/false);
+}
+
 template <typename T>
 inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 4c70b01a9d..be6d506bf3 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -992,6 +992,7 @@ void ConvertResizeBilinearOperator(const Model& model,
   *resize_op->add_input() = src_op.inputs[0];
   *resize_op->add_input() = src_op.inputs[1];
   (*resize_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*resize_op->mutable_attr())["align_corners"].set_b(src_op.align_corners);
 }
 
 namespace {
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 9862dbe99d..c12706e52d 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -1312,6 +1312,12 @@ void ConvertResizeBilinearOperator(const NodeDef& node,
   CHECK_EQ(node.op(), "ResizeBilinear");
   CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new ResizeBilinearOperator;
+
+  op->align_corners = false;
+  if (HasAttr(node, "align_corners")) {
+    op->align_corners = GetBoolAttr(node, "align_corners");
+  }
+
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
   op->outputs.push_back(node.name());
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index d1af371fd4..17d5007450 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -1272,6 +1272,8 @@ struct ArgMaxOperator : Operator {
 // TensorFlow equivalent: ResizeBilinear
 struct ResizeBilinearOperator : Operator {
   ResizeBilinearOperator() : Operator(OperatorType::kResizeBilinear) {}
+
+  bool align_corners = false;
 };
 
 // SpaceToBatchND operator. It divides spatial dimensions into a grid of
-- 
GitLab


From 23851760b7b099214bdd4f1b88156d7ac2bdd2a2 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 5 Feb 2018 08:41:54 -0800
Subject: [PATCH 645/966] Get control_flow_ops.py ready to support
 de/serializing nested control flow.

With this change, ControlFlowContexts keep track of their nested
contexts (the reverse lookup as ControlFlowContext.outer_context).
This is to enable de/serializing the nested contexts of each "root"
context, and only adding the root contexts to collections. This allows
for simple deserialization of each root context by recursively
deserializing its nested contexts.

The de/serialization logic is disabled and the corresponding
control_flow.proto changes are omitted for now for forwards
compatability (i.e. three-week-old binaries must be ready to accept
the new proto format once its commited). After this is committed for
three weeks, I'll commit a follow-up change enabling the new behavior.

Design note: I chose to serialize the nested contexts, rather than the
outer contexts, because it makes it easy to deserialize the contexts
in topological order and to assign the right outer context. If we
serialized the outer contexts, there'd need to be some mechanism for
either sorting all the serialized contexts first, or deserializing all
of them and then doing another pass to assign the outer contexts.

PiperOrigin-RevId: 184533406
---
 tensorflow/python/ops/control_flow_ops.py     | 113 +++++++++++++++---
 .../python/ops/control_flow_ops_test.py       |   8 +-
 2 files changed, 102 insertions(+), 19 deletions(-)

diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 33a92631d0..bcd187d821 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -50,6 +50,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 import collections
 import functools
 
@@ -1498,7 +1499,10 @@ class ControlFlowContext(object):
   """
 
   def __init__(self, values_def=None, import_scope=None):
+    self._nested_contexts = []
     self._outer_context = ops.get_default_graph()._get_control_flow_context()
+    if self._outer_context:
+      self._outer_context._nested_contexts.append(self)  # pylint: disable=protected-access
     self._context_stack = []
     if values_def:
       self._init_values_from_proto(values_def, import_scope=import_scope)
@@ -1551,7 +1555,17 @@ class ControlFlowContext(object):
   def back_prop(self):
     raise NotImplementedError("Abstract method")
 
-  def _to_proto(self, export_scope=None):
+  @abc.abstractmethod
+  def to_control_flow_context_def(self, context_def, export_scope=None):
+    """Serializes this into `context_def`.
+
+    Args:
+      context_def: a `ControlFlowContextDef` protocol buffer.
+      export_scope: Optional `string`. Name scope to remove.
+    """
+    raise NotImplementedError("Abstract method")
+
+  def _to_values_def(self, export_scope=None):
     """Converts the values to a `ValuesDef` protocol buffer.
 
     Args:
@@ -1568,11 +1582,6 @@ class ControlFlowContext(object):
       values_def.external_values[k] = ops.strip_name_scope(v.name, export_scope)
     return values_def
 
-  @staticmethod
-  def _from_proto(values_def, import_scope=None):
-    """Returns a `ControlFlowContext` created from `values_def`."""
-    return ControlFlowContext(values_def=values_def, import_scope=import_scope)
-
   def AddName(self, name):
     self._values.add(name)
 
@@ -1751,8 +1760,15 @@ class CondContext(ControlFlowContext):
       context_def.pivot_name = ops.strip_name_scope(self._pivot.name,
                                                     export_scope)
       context_def.branch = self._branch
-      context_def.values_def.MergeFrom(
-          super(CondContext, self)._to_proto(export_scope))
+      context_def.values_def.MergeFrom(super(CondContext, self)._to_values_def(
+          export_scope))
+      # TODO(b/72868227): enable this once the corresponding control_flow.proto
+      # changes have been checked in (they aren't checked in and this is
+      # disabled for now to ensure forwards compatibility).
+      if False:  # pylint: disable=using-constant-test
+        for nested in self._nested_contexts:
+          nested_def = context_def.nested_contexts.add()
+          nested.to_control_flow_context_def(nested_def)
 
       return context_def
     else:
@@ -1761,7 +1777,21 @@ class CondContext(ControlFlowContext):
   @staticmethod
   def from_proto(context_def, import_scope=None):
     """Returns a `CondContext` object created from `context_def`."""
-    return CondContext(context_def=context_def, import_scope=import_scope)
+    ret = CondContext(context_def=context_def,
+                      import_scope=import_scope)
+
+    # TODO(b/72868227): remove "if hasattr(...)" once the corresponding
+    # control_flow.proto changes have been checked in (they aren't checked in
+    # and this is here for now to ensure forwards compatibility).
+    if hasattr(context_def, "nested_contexts"):
+      ret.Enter()
+      for nested_def in context_def.nested_contexts:
+        from_control_flow_context_def(nested_def)
+      ret.Exit()
+    return ret
+
+  def to_control_flow_context_def(self, context_def, export_scope=None):
+    context_def.cond_ctxt.CopyFrom(self.to_proto(export_scope=export_scope))
 
   def AddValue(self, val):
     """Add `val` to the current context and its outer context recursively."""
@@ -2067,9 +2097,15 @@ def cond(pred,
     merges = [merge(pair)[0] for pair in zip(res_f_flat, res_t_flat)]
     merges = _convert_flows_to_tensorarrays(nest.flatten(orig_res_t), merges)
 
-    # Add to collections
-    ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_t)
-    ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_f)
+    # Only add non-nested conds to the collection. Any nested control flow will
+    # be encapsulated in the root context.
+    assert context_t.outer_context == context_f.outer_context
+    # TODO(b/72868227): remove "if True..." once the corresponding
+    # control_flow.proto changes have been checked in (they aren't checked in
+    # and this is disabled for now to ensure forwards compatibility).
+    if True or context_t.outer_context is None:
+      ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_t)
+      ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_f)
 
     merges = nest.pack_sequence_as(structure=orig_res_t, flat_sequence=merges)
 
@@ -2277,12 +2313,23 @@ class WhileContext(ControlFlowContext):
           ops.strip_name_scope(l.name, export_scope) for l in self._loop_enters
       ])
       context_def.values_def.MergeFrom(
-          super(WhileContext, self)._to_proto(export_scope=export_scope))
+          super(WhileContext, self)._to_values_def(
+              export_scope=export_scope))
+      # TODO(b/72868227): remove "if True..." once the corresponding
+      # control_flow.proto changes have been checked in (they aren't checked in
+      # and this is disabled for now to ensure forwards compatibility).
+      if False:  # pylint: disable=using-constant-test
+        for nested in self._nested_contexts:
+          nested_def = context_def.nested_contexts.add()
+          nested.to_control_flow_context_def(nested_def)
 
       return context_def
     else:
       return None
 
+  def to_control_flow_context_def(self, context_def, export_scope=None):
+    context_def.while_ctxt.CopyFrom(self.to_proto(export_scope=export_scope))
+
   @staticmethod
   def from_proto(context_def, import_scope=None):
     """Returns a `WhileContext` object created from `context_def`.
@@ -2294,7 +2341,17 @@ class WhileContext(ControlFlowContext):
     Returns:
       A `WhileContext` Python object.
     """
-    return WhileContext(context_def=context_def, import_scope=import_scope)
+    ret = WhileContext(context_def=context_def,
+                       import_scope=import_scope)
+    # TODO(b/72868227): remove "if hasattr(...)" once the corresponding
+    # control_flow.proto changes have been checked in (they aren't checked in
+    # and this is disabled for now to ensure forwards compatibility).
+    if hasattr(context_def, "nested_contexts"):
+      ret.Enter()
+      for nested_def in context_def.nested_contexts:
+        from_control_flow_context_def(nested_def, import_scope=import_scope)
+      ret.Exit()
+    return ret
 
   def GetWhileContext(self):
     return self
@@ -3092,7 +3149,13 @@ def while_loop(cond,
         parallel_iterations=parallel_iterations,
         back_prop=back_prop,
         swap_memory=swap_memory)
-    ops.add_to_collection(ops.GraphKeys.WHILE_CONTEXT, loop_context)
+    # Only add non-nested loops to the collection. Any nested control flow will
+    # be encapsulated in the root context.
+    # TODO(b/72868227): enable condition once the corresponding
+    # control_flow.proto changes have been checked in (they aren't checked in
+    # and this is disabled for now to ensure forwards compatibility).
+    if True or loop_context.outer_context is None:
+      ops.add_to_collection(ops.GraphKeys.WHILE_CONTEXT, loop_context)
     result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
     if maximum_iterations is not None:
       return result[1]
@@ -3540,6 +3603,26 @@ class XLAControlFlowContext(ControlFlowContext):
     return x
 
 
+def from_control_flow_context_def(context_def, import_scope=None):
+  """Deserializes `context_def` into the appropriate ControlFlowContext.
+
+  Args:
+    context_def: ControlFlowContextDef proto
+    import_scope: Optional `string`. Name scope to add.
+
+  Returns:
+    A ControlFlowContext subclass
+  """
+  if context_def.HasField("cond_ctxt"):
+    return CondContext.from_proto(context_def.cond_ctxt,
+                                  import_scope=import_scope)
+  if context_def.HasField("while_ctxt"):
+    return WhileContext.from_proto(context_def.while_ctxt,
+                                   import_scope=import_scope)
+  raise NotImplementedError("Unknown ControlFlowContextDef field: %s"
+                            % context_def.WhichOneof("ctxt"))
+
+
 ops.register_proto_function(
     ops.GraphKeys.COND_CONTEXT,
     proto_type=control_flow_pb2.CondContextDef,
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index cc5a42bf3d..f942f478f2 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -483,8 +483,8 @@ class ContextTest(test_util.TensorFlowTestCase):
       c._values = ["a", "b"]
       c._external_values = {"a": b1}
 
-      c_with_scope = control_flow_ops.ControlFlowContext._from_proto(
-          c._to_proto(), import_scope="test_scope")
+      c_with_scope = control_flow_ops.ControlFlowContext(
+          values_def=c._to_values_def(), import_scope="test_scope")
 
       # _values and _external_values should be have scope prepended.
       self.assertEquals(
@@ -494,8 +494,8 @@ class ContextTest(test_util.TensorFlowTestCase):
 
       # Calling _to_proto() with export_scope should remove "test_scope".
       self.assertProtoEquals(
-          c._to_proto(),
-          c_with_scope._to_proto(export_scope="test_scope"))
+          c._to_values_def(),
+          c_with_scope._to_values_def(export_scope="test_scope"))
 
 
 def _GetNestedShape(nested):
-- 
GitLab


From 83a2e03627bffa0f8cad1a9b11886aba7206a988 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 09:33:42 -0800
Subject: [PATCH 646/966] Enable aggressive identity node pruning in dependency
 optimizer.

PiperOrigin-RevId: 184539756
---
 .../core/grappler/optimizers/dependency_optimizer.cc   |  3 +--
 .../grappler/optimizers/dependency_optimizer_test.cc   | 10 +++++-----
 tensorflow/python/grappler/cluster_test.py             |  4 ++--
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 7b4ca1496c..db64e53026 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -156,7 +156,6 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
 void DependencyOptimizer::OptimizeNode(int node_idx,
                                        SetVector<int>* nodes_to_simplify,
                                        std::set<int>* nodes_to_delete) {
-  const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
   NodeDef* node = optimized_graph_->mutable_node(node_idx);
   const bool is_noop = IsNoOp(*node);
   const bool is_identity = IsIdentity(*node);
@@ -280,7 +279,7 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
   //    y --^> |          | --^> b       /\    +---+
   //           +----------+             y --^> b
 
-  if (is_noop || (is_identity && is_aggressive)) {
+  if (is_noop || is_identity) {
     const auto& output_node_set = node_map_->GetOutputs(node_name);
     const std::vector<NodeDef*> output_nodes(output_node_set.begin(),
                                              output_node_set.end());
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index b8facb9dea..33d6b992d2 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -176,7 +176,7 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_SwitchIdentity) {
   item.fetch.push_back("neg1");
   item.fetch.push_back("neg2");
 
-  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  DependencyOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -360,7 +360,7 @@ TEST_F(DependencyOptimizerTest, RemoveIdentity) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"a_a", "a_b", "a_c", "a_d", "b_a", "c_a", "c_b"};
 
-  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  DependencyOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -420,7 +420,7 @@ TEST_F(DependencyOptimizerTest, RemoveIdentity_RepeatedInputs) {
   item.fetch.push_back("or0");
   item.fetch.push_back("or1");
   item.fetch.push_back("or2");
-  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  DependencyOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -459,7 +459,7 @@ TEST_F(DependencyOptimizerTest, Transitive_Reduction_Simple) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch.push_back("neg2");
-  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  DependencyOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -495,7 +495,7 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) {
   item.fetch.push_back("id2");
   item.fetch.push_back("fetch");
 
-  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  DependencyOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
diff --git a/tensorflow/python/grappler/cluster_test.py b/tensorflow/python/grappler/cluster_test.py
index 2292b2c732..10d515a364 100644
--- a/tensorflow/python/grappler/cluster_test.py
+++ b/tensorflow/python/grappler/cluster_test.py
@@ -45,7 +45,7 @@ class ClusterTest(test.TestCase):
       op_perfs, run_time, step_stats = grappler_cluster.MeasureCosts(
           grappler_item)
       self.assertTrue(run_time > 0)
-      self.assertEqual(len(op_perfs), 9)
+      self.assertEqual(len(op_perfs), 7)
       self.assertTrue(step_stats.dev_stats)
 
   def testNoDetailedStats(self):
@@ -125,7 +125,7 @@ class ClusterTest(test.TestCase):
         disable_detailed_stats=False, disable_timeline=False) as gcluster:
       op_perfs, run_time, step_stats = gcluster.MeasureCosts(grappler_item)
       self.assertTrue(run_time > 0)
-      self.assertEqual(len(op_perfs), 9)
+      self.assertEqual(len(op_perfs), 7)
       self.assertTrue(step_stats.dev_stats)
 
   def testAvailableOps(self):
-- 
GitLab


From dcffcefacac05f25d1c676fec226e2356227b684 Mon Sep 17 00:00:00 2001
From: Dan Ringwalt <ringwalt@google.com>
Date: Mon, 5 Feb 2018 09:49:01 -0800
Subject: [PATCH 647/966] Clarify that tf.contrib.image.rotate angles are
 counterclockwise.

PiperOrigin-RevId: 184541776
---
 tensorflow/contrib/image/python/ops/image_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index 63377ae503..6122ee5805 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -40,7 +40,7 @@ ops.RegisterShape("ImageProjectiveTransform")(common_shapes.call_cpp_shape_fn)
 
 
 def rotate(images, angles, interpolation="NEAREST", name=None):
-  """Rotate image(s) by the passed angle(s) in radians.
+  """Rotate image(s) counterclockwise by the passed angle(s) in radians.
 
   Args:
     images: A tensor of shape (num_images, num_rows, num_columns, num_channels)
-- 
GitLab


From 5c631db07f69d8747b62f05056364ea393578b62 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 10:06:10 -0800
Subject: [PATCH 648/966] [XLA] add Conditional to the local Python XLA client.

PiperOrigin-RevId: 184544483
---
 .../xla/python/local_computation_builder.cc   | 11 ++++++++++
 .../xla/python/local_computation_builder.h    |  6 +++++
 .../xla/python/local_computation_builder.i    |  1 +
 tensorflow/compiler/xla/python/xla_client.py  | 22 ++++++++++++++++++-
 .../compiler/xla/python/xla_client_test.py    | 22 +++++++++++++++++++
 5 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 8386acf0cd..3b0d837739 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -493,6 +493,17 @@ ComputationDataHandle LocalComputationBuilder::While(
   return builder_.While(condition.computation(), body.computation(), init);
 }
 
+ComputationDataHandle LocalComputationBuilder::Conditional(
+    const ComputationDataHandle& predicate,
+    const ComputationDataHandle& true_operand,
+    const LocalComputation& true_computation,
+    const ComputationDataHandle& false_operand,
+    const LocalComputation& false_computation) {
+  return builder_.Conditional(predicate, true_operand,
+                              true_computation.computation(), false_operand,
+                              false_computation.computation());
+}
+
 #define _FORWARD(method_name, return_sig, args_sig, args)    \
   return_sig LocalComputationBuilder::method_name args_sig { \
     return builder_.method_name args;                        \
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index f39d15cff7..4c6a504f4c 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -250,6 +250,12 @@ class LocalComputationBuilder {
                               const LocalComputation& body,
                               const ComputationDataHandle& init);
 
+  ComputationDataHandle Conditional(const ComputationDataHandle& predicate,
+                                    const ComputationDataHandle& true_operand,
+                                    const LocalComputation& true_computation,
+                                    const ComputationDataHandle& false_operand,
+                                    const LocalComputation& false_computation);
+
 #define _FORWARD(method_name, return_sig, args_sig) \
   return_sig method_name args_sig;
 
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 5ea75550c9..114754bde4 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -879,6 +879,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::RngUniform;
 %unignore xla::swig::LocalComputationBuilder::RngBernoulli;
 %unignore xla::swig::LocalComputationBuilder::While;
+%unignore xla::swig::LocalComputationBuilder::Conditional;
 %unignore xla::swig::LocalComputationBuilder::Eq;
 %unignore xla::swig::LocalComputationBuilder::Ne;
 %unignore xla::swig::LocalComputationBuilder::Ge;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index b890980955..b3b2b1c7ff 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -921,7 +921,7 @@ class ComputationBuilder(object):
     Args:
       cond: a Computation for the loop condition, which has type T -> PRED
       body: a Computation for the loop body, which has type T -> T
-      init: an ComputationDataHandle for the initial parameter, which has type T
+      init: a ComputationDataHandle for the initial parameter, which has type T
 
     Returns: a ComputationDataHandle representing the While operation.
     """
@@ -930,6 +930,26 @@ class ComputationBuilder(object):
                            body.c_local_computation,
                            _unwrap_data_handle(init)))
 
+  def Conditional(self, pred, true_operand, true_computation, false_operand,
+                  false_computation):
+    """Enqueues a Conditional operation onto the computation.
+
+    Args:
+      predicate: a ComputationDataHandle to test, which has scalar type PRED
+      true_operand: a ComputationDataHandle of type T_0
+      true_computation: a Computation to apply to true_operand, type T_0 -> S
+      false_operand: a ComputationDatahandle of type T_1
+      false_computation: a Computation to apply to false_operand, type T_1 -> S
+
+    Returns: a ComputationDataHandle representing the Conditional operation.
+    """
+    return _wrap_data_handle(
+        self._client.Conditional(
+            _unwrap_data_handle(pred), _unwrap_data_handle(true_operand),
+            true_computation.c_local_computation,
+            _unwrap_data_handle(false_operand),
+            false_computation.c_local_computation))
+
   def Dot(self, lhs, rhs):
     """Enqueues a dot operation onto the computation.
 
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 421fba40e3..65720c6ef9 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -1219,6 +1219,28 @@ class EmbeddedComputationsTest(LocalComputationTest):
     c.While(cond, body, init)
     self._ExecuteAndCompareClose(c, expected=16.)
 
+  def testConditionalTrue(self):
+    c = self._NewComputation()
+    pred = c.ConstantPredScalar(True)
+    true_operand = c.ConstantF32Scalar(3.)
+    true_computation = self._CreateMulF32By2Computation()
+    false_operand = c.ConstantF32Scalar(2.)
+    false_computation = self._CreateConstantF32Computation()
+    c.Conditional(pred, true_operand, true_computation, false_operand,
+                  false_computation)
+    self._ExecuteAndCompareClose(c, expected=6.)
+
+  def testConditionalFalse(self):
+    c = self._NewComputation()
+    pred = c.ConstantPredScalar(False)
+    true_operand = c.ConstantF32Scalar(3.)
+    true_computation = self._CreateMulF32By2Computation()
+    false_operand = c.ConstantF32Scalar(2.)
+    false_computation = self._CreateConstantF32Computation()
+    c.Conditional(pred, true_operand, true_computation, false_operand,
+                  false_computation)
+    self._ExecuteAndCompareClose(c, expected=1.)
+
   def testInfeedS32Values(self):
     to_infeed = NumpyArrayS32([1, 2, 3, 4])
     c = self._NewComputation()
-- 
GitLab


From 2fde0f219c98dca9e75c9e5f95157d7b0edce746 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Mon, 5 Feb 2018 10:16:02 -0800
Subject: [PATCH 649/966] Changing the link to point to new android job.

PiperOrigin-RevId: 184546160
---
 tensorflow/java/maven/tensorflow-android/update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/java/maven/tensorflow-android/update.py b/tensorflow/java/maven/tensorflow-android/update.py
index 7c25071834..4ae666e4e5 100644
--- a/tensorflow/java/maven/tensorflow-android/update.py
+++ b/tensorflow/java/maven/tensorflow-android/update.py
@@ -95,7 +95,7 @@ def main():
     release_prefix = 'https://storage.googleapis.com/tensorflow/libtensorflow'
     info_url = '%s/android_buildinfo-%s.json' % (release_prefix, args.version)
     aar_url = '%s/tensorflow-%s.aar' % (release_prefix, args.version)
-    build_type = 'release-matrix-android'
+    build_type = 'release-matrix-android2'
 
   # Retrieve build information
   build_info = get_json(info_url)
-- 
GitLab


From a9034babfd0100f0b998e900d074dd3d839d18bc Mon Sep 17 00:00:00 2001
From: Dan Ringwalt <ringwalt@google.com>
Date: Mon, 5 Feb 2018 10:36:24 -0800
Subject: [PATCH 650/966] Make flat_transforms_to_matrices and
 matrices_to_flat_transforms public (#781).

PiperOrigin-RevId: 184549704
---
 .../contrib/image/python/ops/image_ops.py     | 87 ++++++++++++++-----
 1 file changed, 66 insertions(+), 21 deletions(-)

diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index 6122ee5805..c139ae89d8 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -290,31 +290,76 @@ def compose_transforms(*transforms):
   """
   assert transforms, "transforms cannot be empty"
   with ops.name_scope("compose_transforms"):
-    composed = _flat_transforms_to_matrices(transforms[0])
+    composed = flat_transforms_to_matrices(transforms[0])
     for tr in transforms[1:]:
       # Multiply batches of matrices.
-      composed = math_ops.matmul(composed, _flat_transforms_to_matrices(tr))
-    return _transform_matrices_to_flat(composed)
+      composed = math_ops.matmul(composed, flat_transforms_to_matrices(tr))
+    return matrices_to_flat_transforms(composed)
 
 
-def _flat_transforms_to_matrices(transforms):
-  # Make the transform(s) 2D in case the input is a single transform.
-  transforms = array_ops.reshape(transforms, constant_op.constant([-1, 8]))
-  num_transforms = array_ops.shape(transforms)[0]
-  # Add a column of ones for the implicit last entry in the matrix.
-  return array_ops.reshape(
-      array_ops.concat(
-          [transforms, array_ops.ones([num_transforms, 1])], axis=1),
-      constant_op.constant([-1, 3, 3]))
+def flat_transforms_to_matrices(transforms):
+  """Converts `tf.contrib.image` projective transforms to affine matrices.
 
+  Note that the output matrices map output coordinates to input coordinates. For
+  the forward transformation matrix, call `tf.linalg.inv` on the result.
 
-def _transform_matrices_to_flat(transform_matrices):
-  # Flatten each matrix.
-  transforms = array_ops.reshape(transform_matrices,
-                                 constant_op.constant([-1, 9]))
-  # Divide each matrix by the last entry (normally 1).
-  transforms /= transforms[:, 8:9]
-  return transforms[:, :8]
+  Args:
+    transforms: Vector of length 8, or batches of transforms with shape
+      `(N, 8)`.
+
+  Returns:
+    3D tensor of matrices with shape `(N, 3, 3)`. The output matrices map the
+      *output coordinates* (in homogeneous coordinates) of each transform to the
+      corresponding *input coordinates*.
+
+  Raises:
+    ValueError: If `transforms` have an invalid shape.
+  """
+  with ops.name_scope("flat_transforms_to_matrices"):
+    transforms = ops.convert_to_tensor(transforms, name="transforms")
+    if transforms.shape.ndims not in (1, 2):
+      raise ValueError("Transforms should be 1D or 2D, got: %s" % transforms)
+    # Make the transform(s) 2D in case the input is a single transform.
+    transforms = array_ops.reshape(transforms, constant_op.constant([-1, 8]))
+    num_transforms = array_ops.shape(transforms)[0]
+    # Add a column of ones for the implicit last entry in the matrix.
+    return array_ops.reshape(
+        array_ops.concat(
+            [transforms, array_ops.ones([num_transforms, 1])], axis=1),
+        constant_op.constant([-1, 3, 3]))
+
+
+def matrices_to_flat_transforms(transform_matrices):
+  """Converts affine matrices to `tf.contrib.image` projective transforms.
+
+  Note that we expect matrices that map output coordinates to input coordinates.
+  To convert forward transformation matrices, call `tf.linalg.inv` on the
+  matrices and use the result here.
+
+  Args:
+    transform_matrices: One or more affine transformation matrices, for the
+      reverse transformation in homogeneous coordinates. Shape `(3, 3)` or
+      `(N, 3, 3)`.
+
+  Returns:
+    2D tensor of flat transforms with shape `(N, 8)`, which may be passed into
+      `tf.contrib.image.transform`.
+
+  Raises:
+    ValueError: If `transform_matrices` have an invalid shape.
+  """
+  with ops.name_scope("matrices_to_flat_transforms"):
+    transform_matrices = ops.convert_to_tensor(
+        transform_matrices, name="transform_matrices")
+    if transform_matrices.shape.ndims not in (2, 3):
+      raise ValueError(
+          "Matrices should be 2D or 3D, got: %s" % transform_matrices)
+    # Flatten each matrix.
+    transforms = array_ops.reshape(transform_matrices,
+                                   constant_op.constant([-1, 9]))
+    # Divide each matrix by the last entry (normally 1).
+    transforms /= transforms[:, 8:9]
+    return transforms[:, :8]
 
 
 @ops.RegisterGradient("ImageProjectiveTransform")
@@ -346,9 +391,9 @@ def _image_projective_transform_grad(op, grad):
     raise TypeError("Transforms should have rank 1 or 2.")
 
   # Invert transformations
-  transforms = _flat_transforms_to_matrices(transforms=transforms)
+  transforms = flat_transforms_to_matrices(transforms=transforms)
   inverse = linalg_ops.matrix_inverse(transforms)
-  transforms = _transform_matrices_to_flat(inverse)
+  transforms = matrices_to_flat_transforms(inverse)
   output = gen_image_ops.image_projective_transform(
       grad, transforms, interpolation=interpolation)
   if len(image_or_images.get_shape()) == 2:
-- 
GitLab


From bccd80c45dab8f2856c176d45df7e7c72f35e14e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 10:39:18 -0800
Subject: [PATCH 651/966] Proper reallocation of dynamic tensors.

PiperOrigin-RevId: 184550199
---
 tensorflow/contrib/lite/arena_planner.cc      |  8 ++++++--
 tensorflow/contrib/lite/arena_planner_test.cc | 10 +++-------
 tensorflow/contrib/lite/interpreter.cc        |  3 +++
 tensorflow/contrib/lite/interpreter.h         |  3 ++-
 tensorflow/contrib/lite/kernels/conv.cc       |  5 ++++-
 5 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/lite/arena_planner.cc b/tensorflow/contrib/lite/arena_planner.cc
index bf1bcdd1a7..87b17c338e 100644
--- a/tensorflow/contrib/lite/arena_planner.cc
+++ b/tensorflow/contrib/lite/arena_planner.cc
@@ -185,8 +185,12 @@ TfLiteStatus ArenaPlanner::CalculateAllocations(int first_node, int last_node) {
 TfLiteStatus ArenaPlanner::ResolveTensorAllocation(int tensor_index) {
   TfLiteTensor& tensor = *graph_info_->tensor(tensor_index);
   if (tensor.allocation_type == kTfLiteArenaRw) {
-    TF_LITE_ENSURE_STATUS(
-        arena_.ResolveAlloc(context_, allocs_[tensor_index], &tensor.data.raw));
+    // Skip resolution if the size of the tensor is zero, leaving it as a
+    // nullptr.
+    if (allocs_[tensor_index].size != 0) {
+      TF_LITE_ENSURE_STATUS(arena_.ResolveAlloc(context_, allocs_[tensor_index],
+                                                &tensor.data.raw));
+    }
   }
   if (tensor.allocation_type == kTfLiteArenaRwPersistent) {
     TF_LITE_ENSURE_STATUS(persistent_arena_.ResolveAlloc(
diff --git a/tensorflow/contrib/lite/arena_planner_test.cc b/tensorflow/contrib/lite/arena_planner_test.cc
index e10611e6d4..a8a8755e2c 100644
--- a/tensorflow/contrib/lite/arena_planner_test.cc
+++ b/tensorflow/contrib/lite/arena_planner_test.cc
@@ -193,8 +193,8 @@ TEST_F(ArenaPlannerTest, GraphWithNoOps) {
   EXPECT_EQ(GetOffset(10), GetOffsetAfter(0));
   // The outputs are never allocated because they are not connected to any
   // inputs.
-  EXPECT_EQ(GetOffset(5), 0);
-  EXPECT_EQ(GetOffset(11), 0);
+  EXPECT_TRUE((*graph.tensors())[5].data.raw == nullptr);
+  EXPECT_TRUE((*graph.tensors())[11].data.raw == nullptr);
 }
 
 TEST_F(ArenaPlannerTest, GraphWithOneOp) {
@@ -373,11 +373,7 @@ TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) {
   SetGraph(&graph);
 
   auto is_unallocated = [&](int tensor_index) {
-    // TODO(ahentz): We'd to use nullptr to represent unallocated tensors, but
-    // the current code still points them all to the beginning fo the alloc
-    // (that is, zero offset).
-    // return (*graph.tensors())[tensor_index].data.raw == nullptr;
-    return GetOffset(tensor_index) == 0;
+    return (*graph.tensors())[tensor_index].data.raw == nullptr;
   };
 
   // The allocation plan is made at the beginning and is independent of
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index a8db149eaa..9dd60abc86 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -459,6 +459,9 @@ TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor,
         TfLiteIntArrayFree(new_size);
         return kTfLiteError;
       }
+
+      // Realloc space for kTfLiteDynamic tensors.
+      TfLiteTensorRealloc(bytesRequired, tensor);
       tensor->bytes = bytesRequired;
     }
     if (tensor->dims) TfLiteIntArrayFree(tensor->dims);
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index c822557d02..3b077c7a35 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -307,7 +307,8 @@ class Interpreter {
   TfLiteStatus BytesRequired(TfLiteType type, const int* dims, int dims_size,
                              size_t* bytes);
 
-  // Request an tensor be resized implementation.
+  // Request an tensor be resized implementation. If the given tensor is of
+  // type kTfLiteDynamic it will also be allocated new memory.
   TfLiteStatus ResizeTensorImpl(TfLiteTensor* tensor, TfLiteIntArray* new_size);
 
   // Report a detailed error string (will be printed to stderr).
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 7a45647434..1fba3cbbce 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -271,10 +271,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
       free(hwcn_weights->data.raw);
       hwcn_weights->data.raw = nullptr;
     }
+
+    // Note that hwcn_weights_status is a kTfLiteDynamic tensor, and
+    // ResizeTensor will actually allocate space for it. The would be more
+    // efficient if we placed hwcn_weights_status in the persistent arena.
     auto hwcn_weights_status =
         context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
     if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;
-    hwcn_weights->data.raw = static_cast<char*>(malloc(hwcn_weights->bytes));
 
     // TODO(petewarden): If Resize() is called when the size hasn't actually
     // changed, this will do extra redundant work.
-- 
GitLab


From 16963086368463552d2e53b8cdac11c65482d13f Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Mon, 5 Feb 2018 10:45:33 -0800
Subject: [PATCH 652/966] Automated g4 rollback of changelist 184323369

PiperOrigin-RevId: 184551259
---
 .../bayesflow/python/kernel_tests/hmc_test.py |  680 +++-----
 .../contrib/bayesflow/python/ops/hmc.py       |   11 +-
 .../contrib/bayesflow/python/ops/hmc_impl.py  | 1546 ++++++-----------
 3 files changed, 782 insertions(+), 1455 deletions(-)

diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
index d9d0dfce44..cbc66b6dc1 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
@@ -19,36 +19,29 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from scipy import special
 from scipy import stats
 
 from tensorflow.contrib.bayesflow.python.ops import hmc
-from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _compute_energy_change
-from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _leapfrog_integrator
 
-from tensorflow.contrib.distributions.python.ops import independent as independent_lib
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradients_impl as gradients_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops.distributions import gamma as gamma_lib
-from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging as logging_ops
-
-
-def _reduce_variance(x, axis=None, keepdims=False):
-  sample_mean = math_ops.reduce_mean(x, axis, keepdims=True)
-  return math_ops.reduce_mean(
-      math_ops.squared_difference(x, sample_mean), axis, keepdims)
+from tensorflow.python.platform import tf_logging as logging
 
 
+# TODO(b/66964210): Test float16.
 class HMCTest(test.TestCase):
 
   def setUp(self):
     self._shape_param = 5.
     self._rate_param = 10.
+    self._expected_x = (special.digamma(self._shape_param)
+                        - np.log(self._rate_param))
+    self._expected_exp_x = self._shape_param / self._rate_param
 
     random_seed.set_random_seed(10003)
     np.random.seed(10003)
@@ -70,46 +63,63 @@ class HMCTest(test.TestCase):
                                self._rate_param * math_ops.exp(x),
                                event_dims)
 
-  def _integrator_conserves_energy(self, x, independent_chain_ndims, sess,
+  def _log_gamma_log_prob_grad(self, x, event_dims=()):
+    """Computes log-pdf and gradient of a log-gamma random variable.
+
+    Args:
+      x: Value of the random variable.
+      event_dims: Dimensions not to treat as independent. Default is (),
+        i.e., all dimensions are independent.
+
+    Returns:
+      log_prob: The log-pdf up to a normalizing constant.
+      grad: The gradient of the log-pdf with respect to x.
+    """
+    return (math_ops.reduce_sum(self._shape_param * x -
+                                self._rate_param * math_ops.exp(x),
+                                event_dims),
+            self._shape_param - self._rate_param * math_ops.exp(x))
+
+  def _n_event_dims(self, x_shape, event_dims):
+    return np.prod([int(x_shape[i]) for i in event_dims])
+
+  def _integrator_conserves_energy(self, x, event_dims, sess,
                                    feed_dict=None):
-    step_size = array_ops.placeholder(np.float32, [], name="step_size")
-    hmc_lf_steps = array_ops.placeholder(np.int32, [], name="hmc_lf_steps")
+    def potential_and_grad(x):
+      log_prob, grad = self._log_gamma_log_prob_grad(x, event_dims)
+      return -log_prob, -grad
+
+    step_size = array_ops.placeholder(np.float32, [], name='step_size')
+    hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps')
 
     if feed_dict is None:
       feed_dict = {}
     feed_dict[hmc_lf_steps] = 1000
 
-    event_dims = math_ops.range(independent_chain_ndims,
-                                array_ops.rank(x))
-
     m = random_ops.random_normal(array_ops.shape(x))
-    log_prob_0 = self._log_gamma_log_prob(x, event_dims)
-    grad_0 = gradients_ops.gradients(log_prob_0, x)
-    old_energy = -log_prob_0 + 0.5 * math_ops.reduce_sum(m**2., event_dims)
-
-    new_m, _, log_prob_1, _ = _leapfrog_integrator(
-        current_momentums=[m],
-        target_log_prob_fn=lambda x: self._log_gamma_log_prob(x, event_dims),
-        current_state_parts=[x],
-        step_sizes=[step_size],
-        num_leapfrog_steps=hmc_lf_steps,
-        current_target_log_prob=log_prob_0,
-        current_grads_target_log_prob=grad_0)
-    new_m = new_m[0]
-
-    new_energy = -log_prob_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
+    potential_0, grad_0 = potential_and_grad(x)
+    old_energy = potential_0 + 0.5 * math_ops.reduce_sum(m * m,
+                                                         event_dims)
+
+    _, new_m, potential_1, _ = (
+        hmc.leapfrog_integrator(step_size, hmc_lf_steps, x,
+                                m, potential_and_grad, grad_0))
+
+    new_energy = potential_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
                                                          event_dims)
 
     x_shape = sess.run(x, feed_dict).shape
-    event_size = np.prod(x_shape[independent_chain_ndims:])
-    feed_dict[step_size] = 0.1 / event_size
-    old_energy_, new_energy_ = sess.run([old_energy, new_energy],
-                                        feed_dict)
-    logging_ops.vlog(1, "average energy relative change: {}".format(
-        (1. - new_energy_ / old_energy_).mean()))
-    self.assertAllClose(old_energy_, new_energy_, atol=0., rtol=0.02)
-
-  def _integrator_conserves_energy_wrapper(self, independent_chain_ndims):
+    n_event_dims = self._n_event_dims(x_shape, event_dims)
+    feed_dict[step_size] = 0.1 / n_event_dims
+    old_energy_val, new_energy_val = sess.run([old_energy, new_energy],
+                                              feed_dict)
+    logging.vlog(1, 'average energy change: {}'.format(
+        abs(old_energy_val - new_energy_val).mean()))
+
+    self.assertAllEqual(np.ones_like(new_energy_val, dtype=np.bool),
+                        abs(old_energy_val - new_energy_val) < 1.)
+
+  def _integrator_conserves_energy_wrapper(self, event_dims):
     """Tests the long-term energy conservation of the leapfrog integrator.
 
     The leapfrog integrator is symplectic, so for sufficiently small step
@@ -117,167 +127,135 @@ class HMCTest(test.TestCase):
     the energy of the system blowing up or collapsing.
 
     Args:
-      independent_chain_ndims: Python `int` scalar representing the number of
-        dims associated with independent chains.
+      event_dims: A tuple of dimensions that should not be treated as
+        independent. This allows for multiple chains to be run independently
+        in parallel. Default is (), i.e., all dimensions are independent.
     """
     with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name="x_ph")
-      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
-      self._integrator_conserves_energy(x_ph, independent_chain_ndims,
-                                        sess, feed_dict)
+      x_ph = array_ops.placeholder(np.float32, name='x_ph')
+
+      feed_dict = {x_ph: np.zeros([50, 10, 2])}
+      self._integrator_conserves_energy(x_ph, event_dims, sess, feed_dict)
 
   def testIntegratorEnergyConservationNullShape(self):
-    self._integrator_conserves_energy_wrapper(0)
+    self._integrator_conserves_energy_wrapper([])
 
   def testIntegratorEnergyConservation1(self):
-    self._integrator_conserves_energy_wrapper(1)
+    self._integrator_conserves_energy_wrapper([1])
 
   def testIntegratorEnergyConservation2(self):
-    self._integrator_conserves_energy_wrapper(2)
+    self._integrator_conserves_energy_wrapper([2])
 
-  def testIntegratorEnergyConservation3(self):
-    self._integrator_conserves_energy_wrapper(3)
+  def testIntegratorEnergyConservation12(self):
+    self._integrator_conserves_energy_wrapper([1, 2])
 
-  def _chain_gets_correct_expectations(self, x, independent_chain_ndims,
-                                       sess, feed_dict=None):
+  def testIntegratorEnergyConservation012(self):
+    self._integrator_conserves_energy_wrapper([0, 1, 2])
+
+  def _chain_gets_correct_expectations(self, x, event_dims, sess,
+                                       feed_dict=None):
     def log_gamma_log_prob(x):
-      event_dims = math_ops.range(independent_chain_ndims,
-                                  array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
-    num_results = array_ops.placeholder(
-        np.int32, [], name="num_results")
-    step_size = array_ops.placeholder(
-        np.float32, [], name="step_size")
-    num_leapfrog_steps = array_ops.placeholder(
-        np.int32, [], name="num_leapfrog_steps")
+    step_size = array_ops.placeholder(np.float32, [], name='step_size')
+    hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps')
+    hmc_n_steps = array_ops.placeholder(np.int32, [], name='hmc_n_steps')
 
     if feed_dict is None:
       feed_dict = {}
-    feed_dict.update({num_results: 150,
-                      step_size: 0.1,
-                      num_leapfrog_steps: 2})
-
-    samples, kernel_results = hmc.sample_chain(
-        num_results=num_results,
-        target_log_prob_fn=log_gamma_log_prob,
-        current_state=x,
-        step_size=step_size,
-        num_leapfrog_steps=num_leapfrog_steps,
-        num_burnin_steps=150,
-        seed=42)
-
-    expected_x = (math_ops.digamma(self._shape_param)
-                  - np.log(self._rate_param))
-
-    expected_exp_x = self._shape_param / self._rate_param
-
-    acceptance_probs_, samples_, expected_x_ = sess.run(
-        [kernel_results.acceptance_probs, samples, expected_x],
-        feed_dict)
-
-    actual_x = samples_.mean()
-    actual_exp_x = np.exp(samples_).mean()
-
-    logging_ops.vlog(1, "True      E[x, exp(x)]: {}\t{}".format(
-        expected_x_, expected_exp_x))
-    logging_ops.vlog(1, "Estimated E[x, exp(x)]: {}\t{}".format(
-        actual_x, actual_exp_x))
-    self.assertNear(actual_x, expected_x_, 2e-2)
-    self.assertNear(actual_exp_x, expected_exp_x, 2e-2)
-    self.assertTrue((acceptance_probs_ > 0.5).all())
-    self.assertTrue((acceptance_probs_ <= 1.0).all())
-
-  def _chain_gets_correct_expectations_wrapper(self, independent_chain_ndims):
+    feed_dict.update({step_size: 0.1,
+                      hmc_lf_steps: 2,
+                      hmc_n_steps: 300})
+
+    sample_chain, acceptance_prob_chain = hmc.chain([hmc_n_steps],
+                                                    step_size,
+                                                    hmc_lf_steps,
+                                                    x, log_gamma_log_prob,
+                                                    event_dims)
+
+    acceptance_probs, samples = sess.run([acceptance_prob_chain, sample_chain],
+                                         feed_dict)
+    samples = samples[feed_dict[hmc_n_steps] // 2:]
+    expected_x_est = samples.mean()
+    expected_exp_x_est = np.exp(samples).mean()
+
+    logging.vlog(1, 'True      E[x, exp(x)]: {}\t{}'.format(
+        self._expected_x, self._expected_exp_x))
+    logging.vlog(1, 'Estimated E[x, exp(x)]: {}\t{}'.format(
+        expected_x_est, expected_exp_x_est))
+    self.assertNear(expected_x_est, self._expected_x, 2e-2)
+    self.assertNear(expected_exp_x_est, self._expected_exp_x, 2e-2)
+    self.assertTrue((acceptance_probs > 0.5).all())
+    self.assertTrue((acceptance_probs <= 1.0).all())
+
+  def _chain_gets_correct_expectations_wrapper(self, event_dims):
     with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name="x_ph")
-      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
-      self._chain_gets_correct_expectations(x_ph, independent_chain_ndims,
-                                            sess, feed_dict)
+      x_ph = array_ops.placeholder(np.float32, name='x_ph')
+
+      feed_dict = {x_ph: np.zeros([50, 10, 2])}
+      self._chain_gets_correct_expectations(x_ph, event_dims, sess,
+                                            feed_dict)
 
   def testHMCChainExpectationsNullShape(self):
-    self._chain_gets_correct_expectations_wrapper(0)
+    self._chain_gets_correct_expectations_wrapper([])
 
   def testHMCChainExpectations1(self):
-    self._chain_gets_correct_expectations_wrapper(1)
+    self._chain_gets_correct_expectations_wrapper([1])
 
   def testHMCChainExpectations2(self):
-    self._chain_gets_correct_expectations_wrapper(2)
+    self._chain_gets_correct_expectations_wrapper([2])
+
+  def testHMCChainExpectations12(self):
+    self._chain_gets_correct_expectations_wrapper([1, 2])
 
-  def _kernel_leaves_target_invariant(self, initial_draws,
-                                      independent_chain_ndims,
+  def _kernel_leaves_target_invariant(self, initial_draws, event_dims,
                                       sess, feed_dict=None):
     def log_gamma_log_prob(x):
-      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
     def fake_log_prob(x):
       """Cooled version of the target distribution."""
       return 1.1 * log_gamma_log_prob(x)
 
-    step_size = array_ops.placeholder(np.float32, [], name="step_size")
+    step_size = array_ops.placeholder(np.float32, [], name='step_size')
 
     if feed_dict is None:
       feed_dict = {}
 
     feed_dict[step_size] = 0.4
 
-    sample, kernel_results = hmc.kernel(
-        target_log_prob_fn=log_gamma_log_prob,
-        current_state=initial_draws,
-        step_size=step_size,
-        num_leapfrog_steps=5,
-        seed=43)
-
-    bad_sample, bad_kernel_results = hmc.kernel(
-        target_log_prob_fn=fake_log_prob,
-        current_state=initial_draws,
-        step_size=step_size,
-        num_leapfrog_steps=5,
-        seed=44)
-
-    [
-        acceptance_probs_,
-        bad_acceptance_probs_,
-        initial_draws_,
-        updated_draws_,
-        fake_draws_,
-    ] = sess.run([
-        kernel_results.acceptance_probs,
-        bad_kernel_results.acceptance_probs,
-        initial_draws,
-        sample,
-        bad_sample,
-    ], feed_dict)
-
+    sample, acceptance_probs, _, _ = hmc.kernel(step_size, 5, initial_draws,
+                                                log_gamma_log_prob, event_dims)
+    bad_sample, bad_acceptance_probs, _, _ = hmc.kernel(
+        step_size, 5, initial_draws, fake_log_prob, event_dims)
+    (acceptance_probs_val, bad_acceptance_probs_val, initial_draws_val,
+     updated_draws_val, fake_draws_val) = sess.run([acceptance_probs,
+                                                    bad_acceptance_probs,
+                                                    initial_draws, sample,
+                                                    bad_sample], feed_dict)
     # Confirm step size is small enough that we usually accept.
-    self.assertGreater(acceptance_probs_.mean(), 0.5)
-    self.assertGreater(bad_acceptance_probs_.mean(), 0.5)
-
+    self.assertGreater(acceptance_probs_val.mean(), 0.5)
+    self.assertGreater(bad_acceptance_probs_val.mean(), 0.5)
     # Confirm step size is large enough that we sometimes reject.
-    self.assertLess(acceptance_probs_.mean(), 0.99)
-    self.assertLess(bad_acceptance_probs_.mean(), 0.99)
-
-    _, ks_p_value_true = stats.ks_2samp(initial_draws_.flatten(),
-                                        updated_draws_.flatten())
-    _, ks_p_value_fake = stats.ks_2samp(initial_draws_.flatten(),
-                                        fake_draws_.flatten())
-
-    logging_ops.vlog(1, "acceptance rate for true target: {}".format(
-        acceptance_probs_.mean()))
-    logging_ops.vlog(1, "acceptance rate for fake target: {}".format(
-        bad_acceptance_probs_.mean()))
-    logging_ops.vlog(1, "K-S p-value for true target: {}".format(
-        ks_p_value_true))
-    logging_ops.vlog(1, "K-S p-value for fake target: {}".format(
-        ks_p_value_fake))
+    self.assertLess(acceptance_probs_val.mean(), 0.99)
+    self.assertLess(bad_acceptance_probs_val.mean(), 0.99)
+    _, ks_p_value_true = stats.ks_2samp(initial_draws_val.flatten(),
+                                        updated_draws_val.flatten())
+    _, ks_p_value_fake = stats.ks_2samp(initial_draws_val.flatten(),
+                                        fake_draws_val.flatten())
+    logging.vlog(1, 'acceptance rate for true target: {}'.format(
+        acceptance_probs_val.mean()))
+    logging.vlog(1, 'acceptance rate for fake target: {}'.format(
+        bad_acceptance_probs_val.mean()))
+    logging.vlog(1, 'K-S p-value for true target: {}'.format(ks_p_value_true))
+    logging.vlog(1, 'K-S p-value for fake target: {}'.format(ks_p_value_fake))
     # Make sure that the MCMC update hasn't changed the empirical CDF much.
     self.assertGreater(ks_p_value_true, 1e-3)
     # Confirm that targeting the wrong distribution does
     # significantly change the empirical CDF.
     self.assertLess(ks_p_value_fake, 1e-6)
 
-  def _kernel_leaves_target_invariant_wrapper(self, independent_chain_ndims):
+  def _kernel_leaves_target_invariant_wrapper(self, event_dims):
     """Tests that the kernel leaves the target distribution invariant.
 
     Draws some independent samples from the target distribution,
@@ -289,116 +267,86 @@ class HMCTest(test.TestCase):
     does change the target distribution. (And that we can detect that.)
 
     Args:
-      independent_chain_ndims: Python `int` scalar representing the number of
-        dims associated with independent chains.
+      event_dims: A tuple of dimensions that should not be treated as
+        independent. This allows for multiple chains to be run independently
+        in parallel. Default is (), i.e., all dimensions are independent.
     """
     with self.test_session() as sess:
       initial_draws = np.log(np.random.gamma(self._shape_param,
                                              size=[50000, 2, 2]))
       initial_draws -= np.log(self._rate_param)
-      x_ph = array_ops.placeholder(np.float32, name="x_ph")
+      x_ph = array_ops.placeholder(np.float32, name='x_ph')
 
       feed_dict = {x_ph: initial_draws}
 
-      self._kernel_leaves_target_invariant(x_ph, independent_chain_ndims,
-                                           sess, feed_dict)
+      self._kernel_leaves_target_invariant(x_ph, event_dims, sess,
+                                           feed_dict)
+
+  def testKernelLeavesTargetInvariantNullShape(self):
+    self._kernel_leaves_target_invariant_wrapper([])
 
   def testKernelLeavesTargetInvariant1(self):
-    self._kernel_leaves_target_invariant_wrapper(1)
+    self._kernel_leaves_target_invariant_wrapper([1])
 
   def testKernelLeavesTargetInvariant2(self):
-    self._kernel_leaves_target_invariant_wrapper(2)
+    self._kernel_leaves_target_invariant_wrapper([2])
 
-  def testKernelLeavesTargetInvariant3(self):
-    self._kernel_leaves_target_invariant_wrapper(3)
+  def testKernelLeavesTargetInvariant12(self):
+    self._kernel_leaves_target_invariant_wrapper([1, 2])
 
-  def _ais_gets_correct_log_normalizer(self, init, independent_chain_ndims,
-                                       sess, feed_dict=None):
+  def _ais_gets_correct_log_normalizer(self, init, event_dims, sess,
+                                       feed_dict=None):
     def proposal_log_prob(x):
-      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
-      return -0.5 * math_ops.reduce_sum(x**2. + np.log(2 * np.pi),
-                                        axis=event_dims)
+      return math_ops.reduce_sum(-0.5 * x * x - 0.5 * np.log(2*np.pi),
+                                 event_dims)
 
     def target_log_prob(x):
-      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
     if feed_dict is None:
       feed_dict = {}
 
-    num_steps = 200
-
-    _, ais_weights, _ = hmc.sample_annealed_importance_chain(
-        proposal_log_prob_fn=proposal_log_prob,
-        num_steps=num_steps,
-        target_log_prob_fn=target_log_prob,
-        step_size=0.5,
-        current_state=init,
-        num_leapfrog_steps=2,
-        seed=45)
-
-    event_shape = array_ops.shape(init)[independent_chain_ndims:]
-    event_size = math_ops.reduce_prod(event_shape)
-
-    log_true_normalizer = (
-        -self._shape_param * math_ops.log(self._rate_param)
-        + math_ops.lgamma(self._shape_param))
-    log_true_normalizer *= math_ops.cast(event_size, log_true_normalizer.dtype)
-
-    log_estimated_normalizer = (math_ops.reduce_logsumexp(ais_weights)
-                                - np.log(num_steps))
-
-    ratio_estimate_true = math_ops.exp(ais_weights - log_true_normalizer)
-    ais_weights_size = array_ops.size(ais_weights)
-    standard_error = math_ops.sqrt(
-        _reduce_variance(ratio_estimate_true)
-        / math_ops.cast(ais_weights_size, ratio_estimate_true.dtype))
-
-    [
-        ratio_estimate_true_,
-        log_true_normalizer_,
-        log_estimated_normalizer_,
-        standard_error_,
-        ais_weights_size_,
-        event_size_,
-    ] = sess.run([
-        ratio_estimate_true,
-        log_true_normalizer,
-        log_estimated_normalizer,
-        standard_error,
-        ais_weights_size,
-        event_size,
-    ], feed_dict)
-
-    logging_ops.vlog(1, "        log_true_normalizer: {}\n"
-                        "   log_estimated_normalizer: {}\n"
-                        "           ais_weights_size: {}\n"
-                        "                 event_size: {}\n".format(
-                            log_true_normalizer_,
-                            log_estimated_normalizer_,
-                            ais_weights_size_,
-                            event_size_))
-    self.assertNear(ratio_estimate_true_.mean(), 1., 4. * standard_error_)
-
-  def _ais_gets_correct_log_normalizer_wrapper(self, independent_chain_ndims):
+    w, _, _ = hmc.ais_chain(200, 0.5, 2, init, target_log_prob,
+                            proposal_log_prob, event_dims)
+
+    w_val = sess.run(w, feed_dict)
+    init_shape = sess.run(init, feed_dict).shape
+    normalizer_multiplier = np.prod([init_shape[i] for i in event_dims])
+
+    true_normalizer = -self._shape_param * np.log(self._rate_param)
+    true_normalizer += special.gammaln(self._shape_param)
+    true_normalizer *= normalizer_multiplier
+
+    n_weights = np.prod(w_val.shape)
+    normalized_w = np.exp(w_val - true_normalizer)
+    standard_error = np.std(normalized_w) / np.sqrt(n_weights)
+    logging.vlog(1, 'True normalizer {}, estimated {}, n_weights {}'.format(
+        true_normalizer, np.log(normalized_w.mean()) + true_normalizer,
+        n_weights))
+    self.assertNear(normalized_w.mean(), 1.0, 4.0 * standard_error)
+
+  def _ais_gets_correct_log_normalizer_wrapper(self, event_dims):
     """Tests that AIS yields reasonable estimates of normalizers."""
     with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name="x_ph")
+      x_ph = array_ops.placeholder(np.float32, name='x_ph')
+
       initial_draws = np.random.normal(size=[30, 2, 1])
-      self._ais_gets_correct_log_normalizer(
-          x_ph,
-          independent_chain_ndims,
-          sess,
-          feed_dict={x_ph: initial_draws})
+      feed_dict = {x_ph: initial_draws}
+
+      self._ais_gets_correct_log_normalizer(x_ph, event_dims, sess,
+                                            feed_dict)
+
+  def testAISNullShape(self):
+    self._ais_gets_correct_log_normalizer_wrapper([])
 
   def testAIS1(self):
-    self._ais_gets_correct_log_normalizer_wrapper(1)
+    self._ais_gets_correct_log_normalizer_wrapper([1])
 
   def testAIS2(self):
-    self._ais_gets_correct_log_normalizer_wrapper(2)
+    self._ais_gets_correct_log_normalizer_wrapper([2])
 
-  def testAIS3(self):
-    self._ais_gets_correct_log_normalizer_wrapper(3)
+  def testAIS12(self):
+    self._ais_gets_correct_log_normalizer_wrapper([1, 2])
 
   def testNanRejection(self):
     """Tests that an update that yields NaN potentials gets rejected.
@@ -411,29 +359,24 @@ class HMCTest(test.TestCase):
     """
     def _unbounded_exponential_log_prob(x):
       """An exponential distribution with log-likelihood NaN for x < 0."""
-      per_element_potentials = array_ops.where(
-          x < 0.,
-          array_ops.fill(array_ops.shape(x), x.dtype.as_numpy_dtype(np.nan)),
-          -x)
+      per_element_potentials = array_ops.where(x < 0,
+                                               np.nan * array_ops.ones_like(x),
+                                               -x)
       return math_ops.reduce_sum(per_element_potentials)
 
     with self.test_session() as sess:
       initial_x = math_ops.linspace(0.01, 5, 10)
-      updated_x, kernel_results = hmc.kernel(
-          target_log_prob_fn=_unbounded_exponential_log_prob,
-          current_state=initial_x,
-          step_size=2.,
-          num_leapfrog_steps=5,
-          seed=46)
-      initial_x_, updated_x_, acceptance_probs_ = sess.run(
-          [initial_x, updated_x, kernel_results.acceptance_probs])
-
-      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
-      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
-      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))
-
-      self.assertAllEqual(initial_x_, updated_x_)
-      self.assertEqual(acceptance_probs_, 0.)
+      updated_x, acceptance_probs, _, _ = hmc.kernel(
+          2., 5, initial_x, _unbounded_exponential_log_prob, [0])
+      initial_x_val, updated_x_val, acceptance_probs_val = sess.run(
+          [initial_x, updated_x, acceptance_probs])
+
+      logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
+      logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
+      logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))
+
+      self.assertAllEqual(initial_x_val, updated_x_val)
+      self.assertEqual(acceptance_probs_val, 0.)
 
   def testNanFromGradsDontPropagate(self):
     """Test that update with NaN gradients does not cause NaN in results."""
@@ -442,195 +385,60 @@ class HMCTest(test.TestCase):
 
     with self.test_session() as sess:
       initial_x = math_ops.linspace(0.01, 5, 10)
-      updated_x, kernel_results = hmc.kernel(
-          target_log_prob_fn=_nan_log_prob_with_nan_gradient,
-          current_state=initial_x,
-          step_size=2.,
-          num_leapfrog_steps=5,
-          seed=47)
-      initial_x_, updated_x_, acceptance_probs_ = sess.run(
-          [initial_x, updated_x, kernel_results.acceptance_probs])
-
-      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
-      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
-      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))
-
-      self.assertAllEqual(initial_x_, updated_x_)
-      self.assertEqual(acceptance_probs_, 0.)
+      updated_x, acceptance_probs, new_log_prob, new_grad = hmc.kernel(
+          2., 5, initial_x, _nan_log_prob_with_nan_gradient, [0])
+      initial_x_val, updated_x_val, acceptance_probs_val = sess.run(
+          [initial_x, updated_x, acceptance_probs])
+
+      logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
+      logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
+      logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))
+
+      self.assertAllEqual(initial_x_val, updated_x_val)
+      self.assertEqual(acceptance_probs_val, 0.)
 
       self.assertAllFinite(
-          gradients_ops.gradients(updated_x, initial_x)[0].eval())
-      self.assertAllEqual([True], [g is None for g in gradients_ops.gradients(
-          kernel_results.proposed_grads_target_log_prob, initial_x)])
-      self.assertAllEqual([False], [g is None for g in gradients_ops.gradients(
-          kernel_results.proposed_grads_target_log_prob,
-          kernel_results.proposed_state)])
+          gradients_impl.gradients(updated_x, initial_x)[0].eval())
+      self.assertTrue(
+          gradients_impl.gradients(new_grad, initial_x)[0] is None)
 
       # Gradients of the acceptance probs and new log prob are not finite.
+      _ = new_log_prob  # Prevent unused arg error.
       # self.assertAllFinite(
-      #     gradients_ops.gradients(acceptance_probs, initial_x)[0].eval())
+      #     gradients_impl.gradients(acceptance_probs, initial_x)[0].eval())
       # self.assertAllFinite(
-      #     gradients_ops.gradients(new_log_prob, initial_x)[0].eval())
-
-  def _testChainWorksDtype(self, dtype):
-    states, kernel_results = hmc.sample_chain(
-        num_results=10,
-        target_log_prob_fn=lambda x: -math_ops.reduce_sum(x**2., axis=-1),
-        current_state=np.zeros(5).astype(dtype),
-        step_size=0.01,
-        num_leapfrog_steps=10,
-        seed=48)
-    with self.test_session() as sess:
-      states_, acceptance_probs_ = sess.run(
-          [states, kernel_results.acceptance_probs])
-    self.assertEqual(dtype, states_.dtype)
-    self.assertEqual(dtype, acceptance_probs_.dtype)
+      #     gradients_impl.gradients(new_log_prob, initial_x)[0].eval())
 
   def testChainWorksIn64Bit(self):
-    self._testChainWorksDtype(np.float64)
+    def log_prob(x):
+      return - math_ops.reduce_sum(x * x, axis=-1)
+    states, acceptance_probs = hmc.chain(
+        n_iterations=10,
+        step_size=np.float64(0.01),
+        n_leapfrog_steps=10,
+        initial_x=np.zeros(5).astype(np.float64),
+        target_log_prob_fn=log_prob,
+        event_dims=[-1])
+    with self.test_session() as sess:
+      states_, acceptance_probs_ = sess.run([states, acceptance_probs])
+    self.assertEqual(np.float64, states_.dtype)
+    self.assertEqual(np.float64, acceptance_probs_.dtype)
 
   def testChainWorksIn16Bit(self):
-    self._testChainWorksDtype(np.float16)
-
-
-class _EnergyComputationTest(object):
-
-  def testHandlesNanFromPotential(self):
-    with self.test_session() as sess:
-      x = [1, np.inf, -np.inf, np.nan]
-      target_log_prob, proposed_target_log_prob = [
-          self.dtype(x.flatten()) for x in np.meshgrid(x, x)]
-      num_chains = len(target_log_prob)
-      dummy_momentums = [-1, 1]
-      momentums = [self.dtype([dummy_momentums] * num_chains)]
-      proposed_momentums = [self.dtype([dummy_momentums] * num_chains)]
-
-      target_log_prob = ops.convert_to_tensor(target_log_prob)
-      momentums = [ops.convert_to_tensor(momentums[0])]
-      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
-      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
-
-      energy = _compute_energy_change(
-          target_log_prob,
-          momentums,
-          proposed_target_log_prob,
-          proposed_momentums,
-          independent_chain_ndims=1)
-      grads = gradients_ops.gradients(energy, momentums)
-
-      [actual_energy, grads_] = sess.run([energy, grads])
-
-      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
-      # finite otherwise.
-      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
-      self.assertAllEqual(expected_energy, actual_energy)
-
-      # Ensure gradient is finite.
-      self.assertAllEqual(np.ones_like(grads_).astype(np.bool),
-                          np.isfinite(grads_))
-
-  def testHandlesNanFromKinetic(self):
+    def log_prob(x):
+      return - math_ops.reduce_sum(x * x, axis=-1)
+    states, acceptance_probs = hmc.chain(
+        n_iterations=10,
+        step_size=np.float16(0.01),
+        n_leapfrog_steps=10,
+        initial_x=np.zeros(5).astype(np.float16),
+        target_log_prob_fn=log_prob,
+        event_dims=[-1])
     with self.test_session() as sess:
-      x = [1, np.inf, -np.inf, np.nan]
-      momentums, proposed_momentums = [
-          [np.reshape(self.dtype(x), [-1, 1])]
-          for x in np.meshgrid(x, x)]
-      num_chains = len(momentums[0])
-      target_log_prob = np.ones(num_chains, self.dtype)
-      proposed_target_log_prob = np.ones(num_chains, self.dtype)
+      states_, acceptance_probs_ = sess.run([states, acceptance_probs])
+    self.assertEqual(np.float16, states_.dtype)
+    self.assertEqual(np.float16, acceptance_probs_.dtype)
 
-      target_log_prob = ops.convert_to_tensor(target_log_prob)
-      momentums = [ops.convert_to_tensor(momentums[0])]
-      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
-      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
 
-      energy = _compute_energy_change(
-          target_log_prob,
-          momentums,
-          proposed_target_log_prob,
-          proposed_momentums,
-          independent_chain_ndims=1)
-      grads = gradients_ops.gradients(energy, momentums)
-
-      [actual_energy, grads_] = sess.run([energy, grads])
-
-      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
-      # finite otherwise.
-      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
-      self.assertAllEqual(expected_energy, actual_energy)
-
-      # Ensure gradient is finite.
-      g = grads_[0].reshape([len(x), len(x)])[:, 0]
-      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isfinite(g))
-
-      # The remaining gradients are nan because the momentum was itself nan or
-      # inf.
-      g = grads_[0].reshape([len(x), len(x)])[:, 1:]
-      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isnan(g))
-
-
-class EnergyComputationTest16(test.TestCase, _EnergyComputationTest):
-  dtype = np.float16
-
-
-class EnergyComputationTest32(test.TestCase, _EnergyComputationTest):
-  dtype = np.float32
-
-
-class EnergyComputationTest64(test.TestCase, _EnergyComputationTest):
-  dtype = np.float64
-
-
-class _HMCHandlesLists(object):
-
-  def testStateParts(self):
-    with self.test_session() as sess:
-      dist_x = normal_lib.Normal(loc=self.dtype(0), scale=self.dtype(1))
-      dist_y = independent_lib.Independent(
-          gamma_lib.Gamma(concentration=self.dtype([1, 2]),
-                          rate=self.dtype([0.5, 0.75])),
-          reinterpreted_batch_ndims=1)
-      def target_log_prob(x, y):
-        return dist_x.log_prob(x) + dist_y.log_prob(y)
-      x0 = [dist_x.sample(seed=1), dist_y.sample(seed=2)]
-      samples, _ = hmc.sample_chain(
-          num_results=int(2e3),
-          target_log_prob_fn=target_log_prob,
-          current_state=x0,
-          step_size=0.85,
-          num_leapfrog_steps=3,
-          num_burnin_steps=int(250),
-          seed=49)
-      actual_means = [math_ops.reduce_mean(s, axis=0) for s in samples]
-      actual_vars = [_reduce_variance(s, axis=0) for s in samples]
-      expected_means = [dist_x.mean(), dist_y.mean()]
-      expected_vars = [dist_x.variance(), dist_y.variance()]
-      [
-          actual_means_,
-          actual_vars_,
-          expected_means_,
-          expected_vars_,
-      ] = sess.run([
-          actual_means,
-          actual_vars,
-          expected_means,
-          expected_vars,
-      ])
-      self.assertAllClose(expected_means_, actual_means_, atol=0.05, rtol=0.16)
-      self.assertAllClose(expected_vars_, actual_vars_, atol=0., rtol=0.30)
-
-
-class HMCHandlesLists16(_HMCHandlesLists, test.TestCase):
-  dtype = np.float16
-
-
-class HMCHandlesLists32(_HMCHandlesLists, test.TestCase):
-  dtype = np.float32
-
-
-class HMCHandlesLists64(_HMCHandlesLists, test.TestCase):
-  dtype = np.float64
-
-
-if __name__ == "__main__":
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc.py b/tensorflow/contrib/bayesflow/python/ops/hmc.py
index 7fd5652c5c..977d42fc16 100644
--- a/tensorflow/contrib/bayesflow/python/ops/hmc.py
+++ b/tensorflow/contrib/bayesflow/python/ops/hmc.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm."""
+"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,9 +24,11 @@ from tensorflow.contrib.bayesflow.python.ops.hmc_impl import *  # pylint: disabl
 from tensorflow.python.util import all_util
 
 _allowed_symbols = [
-    "sample_chain",
-    "sample_annealed_importance_chain",
-    "kernel",
+    'chain',
+    'kernel',
+    'leapfrog_integrator',
+    'leapfrog_step',
+    'ais_chain'
 ]
 
 all_util.remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
index f7a11c21d8..5685a942e9 100644
--- a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
@@ -14,16 +14,17 @@
 # ==============================================================================
 """Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.
 
-@@sample_chain
-@@sample_annealed_importance_chain
-@@kernel
+@@chain
+@@update
+@@leapfrog_integrator
+@@leapfrog_step
+@@ais_chain
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import numpy as np
 
 from tensorflow.python.framework import dtypes
@@ -31,292 +32,168 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gradients_impl as gradients_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops.distributions import util as distributions_util
+from tensorflow.python.platform import tf_logging as logging
 
 __all__ = [
-    "sample_chain",
-    "sample_annealed_importance_chain",
-    "kernel",
+    'chain',
+    'kernel',
+    'leapfrog_integrator',
+    'leapfrog_step',
+    'ais_chain'
 ]
 
 
-KernelResults = collections.namedtuple(
-    "KernelResults",
-    [
-        "acceptance_probs",
-        "current_grads_target_log_prob",  # "Current result" means "accepted".
-        "current_target_log_prob",  # "Current result" means "accepted".
-        "energy_change",
-        "is_accepted",
-        "proposed_grads_target_log_prob",
-        "proposed_state",
-        "proposed_target_log_prob",
-        "random_positive",
-    ])
-
-
-def _make_dummy_kernel_results(
-    dummy_state,
-    dummy_target_log_prob,
-    dummy_grads_target_log_prob):
-  return KernelResults(
-      acceptance_probs=dummy_target_log_prob,
-      current_grads_target_log_prob=dummy_grads_target_log_prob,
-      current_target_log_prob=dummy_target_log_prob,
-      energy_change=dummy_target_log_prob,
-      is_accepted=array_ops.ones_like(dummy_target_log_prob, dtypes.bool),
-      proposed_grads_target_log_prob=dummy_grads_target_log_prob,
-      proposed_state=dummy_state,
-      proposed_target_log_prob=dummy_target_log_prob,
-      random_positive=dummy_target_log_prob,
-  )
-
-
-def sample_chain(
-    num_results,
-    target_log_prob_fn,
-    current_state,
-    step_size,
-    num_leapfrog_steps,
-    num_burnin_steps=0,
-    num_steps_between_results=0,
-    seed=None,
-    current_target_log_prob=None,
-    current_grads_target_log_prob=None,
-    name=None):
+def _make_potential_and_grad(target_log_prob_fn):
+  def potential_and_grad(x):
+    log_prob_result = -target_log_prob_fn(x)
+    grad_result = gradients_impl.gradients(math_ops.reduce_sum(log_prob_result),
+                                           x)[0]
+    return log_prob_result, grad_result
+  return potential_and_grad
+
+
+def chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
+          target_log_prob_fn, event_dims=(), name=None):
   """Runs multiple iterations of one or more Hamiltonian Monte Carlo chains.
 
-  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) algorithm
-  that takes a series of gradient-informed steps to produce a Metropolis
-  proposal. This function samples from an HMC Markov chain at `current_state`
-  and whose stationary distribution has log-unnormalized-density
+  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
+  algorithm that takes a series of gradient-informed steps to produce
+  a Metropolis proposal. This function samples from an HMC Markov
+  chain whose initial state is `initial_x` and whose stationary
+  distribution has log-density `target_log_prob_fn()`.
+
+  This function can update multiple chains in parallel. It assumes
+  that all dimensions of `initial_x` not specified in `event_dims` are
+  independent, and should therefore be updated independently. The
+  output of `target_log_prob_fn()` should sum log-probabilities across
+  all event dimensions. Slices along dimensions not in `event_dims`
+  may have different target distributions; this is up to
   `target_log_prob_fn()`.
 
-  This function samples from multiple chains in parallel. It assumes that the
-  the leftmost dimensions of (each) `current_state` (part) index an independent
-  chain.  The function `target_log_prob_fn()` sums log-probabilities across
-  event dimensions (i.e., current state (part) rightmost dimensions). Each
-  element of the output of `target_log_prob_fn()` represents the (possibly
-  unnormalized) log-probability of the joint distribution over (all) the current
-  state (parts).
+  This function basically just wraps `hmc.kernel()` in a tf.scan() loop.
 
-  The `current_state` can be represented as a single `Tensor` or a `list` of
-  `Tensors` which collectively represent the current state. When specifying a
-  `list`, one must also specify a list of `step_size`s.
-
-  Only one out of every `num_steps_between_samples + 1` steps is included in the
-  returned results. This "thinning" comes at a cost of reduced statistical
-  power, while reducing memory requirements and autocorrelation. For more
-  discussion see [1].
+  Args:
+    n_iterations: Integer number of Markov chain updates to run.
+    step_size: Scalar step size or array of step sizes for the
+      leapfrog integrator. Broadcasts to the shape of
+      `initial_x`. Larger step sizes lead to faster progress, but
+      too-large step sizes make rejection exponentially more likely.
+      When possible, it's often helpful to match per-variable step
+      sizes to the standard deviations of the target distribution in
+      each variable.
+    n_leapfrog_steps: Integer number of steps to run the leapfrog
+      integrator for. Total progress per HMC step is roughly
+      proportional to step_size * n_leapfrog_steps.
+    initial_x: Tensor of initial state(s) of the Markov chain(s).
+    target_log_prob_fn: Python callable which takes an argument like `initial_x`
+      and returns its (possibly unnormalized) log-density under the target
+      distribution.
+    event_dims: List of dimensions that should not be treated as
+      independent. This allows for multiple chains to be run independently
+      in parallel. Default is (), i.e., all dimensions are independent.
+    name: Python `str` name prefixed to Ops created by this function.
 
-  [1]: "Statistically efficient thinning of a Markov chain sampler."
-       Art B. Owen. April 2017.
-       http://statweb.stanford.edu/~owen/reports/bestthinning.pdf
+  Returns:
+    acceptance_probs: Tensor with the acceptance probabilities for each
+      iteration. Has shape matching `target_log_prob_fn(initial_x)`.
+    chain_states: Tensor with the state of the Markov chain at each iteration.
+      Has shape `[n_iterations, initial_x.shape[0],...,initial_x.shape[-1]`.
 
   #### Examples:
 
-  ##### Sample from a diagonal-variance Gaussian.
-
   ```python
-  tfd = tf.contrib.distributions
-
-  def make_likelihood(true_variances):
-    return tfd.MultivariateNormalDiag(
-        scale_diag=tf.sqrt(true_variances))
-
-  dims = 10
-  dtype = np.float32
-  true_variances = tf.linspace(dtype(1), dtype(3), dims)
-  likelihood = make_likelihood(true_variances)
-
-  states, kernel_results = hmc.sample_chain(
-      num_results=1000,
-      target_log_prob_fn=likelihood.log_prob,
-      current_state=tf.zeros(dims),
-      step_size=0.5,
-      num_leapfrog_steps=2,
-      num_burnin_steps=500)
-
-  # Compute sample stats.
-  sample_mean = tf.reduce_mean(states, axis=0)
-  sample_var = tf.reduce_mean(
-      tf.squared_difference(states, sample_mean),
-      axis=0)
+  # Sampling from a standard normal (note `log_joint()` is unnormalized):
+  def log_joint(x):
+    return tf.reduce_sum(-0.5 * tf.square(x))
+  chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint,
+                                      event_dims=[0])
+  # Discard first half of chain as warmup/burn-in
+  warmed_up = chain[500:]
+  mean_est = tf.reduce_mean(warmed_up, 0)
+  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
   ```
 
-  ##### Sampling from factor-analysis posteriors with known factors.
-
-  I.e.,
-
-  ```none
-  for i=1..n:
-    w[i] ~ Normal(0, eye(d))            # prior
-    x[i] ~ Normal(loc=matmul(w[i], F))  # likelihood
+  ```python
+  # Sampling from a diagonal-variance Gaussian:
+  variances = tf.linspace(1., 3., 10)
+  def log_joint(x):
+    return tf.reduce_sum(-0.5 / variances * tf.square(x))
+  chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint,
+                                      event_dims=[0])
+  # Discard first half of chain as warmup/burn-in
+  warmed_up = chain[500:]
+  mean_est = tf.reduce_mean(warmed_up, 0)
+  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
   ```
 
-  where `F` denotes factors.
-
   ```python
-  tfd = tf.contrib.distributions
-
-  def make_prior(dims, dtype):
-    return tfd.MultivariateNormalDiag(
-        loc=tf.zeros(dims, dtype))
-
-  def make_likelihood(weights, factors):
-    return tfd.MultivariateNormalDiag(
-        loc=tf.tensordot(weights, factors, axes=[[0], [-1]]))
-
-  # Setup data.
-  num_weights = 10
-  num_factors = 4
-  num_chains = 100
-  dtype = np.float32
-
-  prior = make_prior(num_weights, dtype)
-  weights = prior.sample(num_chains)
-  factors = np.random.randn(num_factors, num_weights).astype(dtype)
-  x = make_likelihood(weights, factors).sample(num_chains)
-
-  def target_log_prob(w):
-    # Target joint is: `f(w) = p(w, x | factors)`.
-    return prior.log_prob(w) + make_likelihood(w, factors).log_prob(x)
-
-  # Get `num_results` samples from `num_chains` independent chains.
-  chains_states, kernels_results = hmc.sample_chain(
-      num_results=1000,
-      target_log_prob_fn=target_log_prob,
-      current_state=tf.zeros([num_chains, dims], dtype),
-      step_size=0.1,
-      num_leapfrog_steps=2,
-      num_burnin_steps=500)
-
-  # Compute sample stats.
-  sample_mean = tf.reduce_mean(chains_states, axis=[0, 1])
-  sample_var = tf.reduce_mean(
-      tf.squared_difference(chains_states, sample_mean),
-      axis=[0, 1])
+  # Sampling from factor-analysis posteriors with known factors W:
+  # mu[i, j] ~ Normal(0, 1)
+  # x[i] ~ Normal(matmul(mu[i], W), I)
+  def log_joint(mu, x, W):
+    prior = -0.5 * tf.reduce_sum(tf.square(mu), 1)
+    x_mean = tf.matmul(mu, W)
+    likelihood = -0.5 * tf.reduce_sum(tf.square(x - x_mean), 1)
+    return prior + likelihood
+  chain, acceptance_probs = hmc.chain(1000, 0.1, 2,
+                                      tf.zeros([x.shape[0], W.shape[0]]),
+                                      lambda mu: log_joint(mu, x, W),
+                                      event_dims=[1])
+  # Discard first half of chain as warmup/burn-in
+  warmed_up = chain[500:]
+  mean_est = tf.reduce_mean(warmed_up, 0)
+  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
   ```
 
-  Args:
-    num_results: Integer number of Markov chain draws.
-    target_log_prob_fn: Python callable which takes an argument like
-      `current_state` (or `*current_state` if it's a list) and returns its
-      (possibly unnormalized) log-density under the target distribution.
-    current_state: `Tensor` or Python `list` of `Tensor`s representing the
-      current state(s) of the Markov chain(s). The first `r` dimensions index
-      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
-    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
-      for the leapfrog integrator. Must broadcast with the shape of
-      `current_state`. Larger step sizes lead to faster progress, but too-large
-      step sizes make rejection exponentially more likely. When possible, it's
-      often helpful to match per-variable step sizes to the standard deviations
-      of the target distribution in each variable.
-    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
-      for. Total progress per HMC step is roughly proportional to `step_size *
-      num_leapfrog_steps`.
-    num_burnin_steps: Integer number of chain steps to take before starting to
-      collect results.
-      Default value: 0 (i.e., no burn-in).
-    num_steps_between_results: Integer number of chain steps between collecting
-      a result. Only one out of every `num_steps_between_samples + 1` steps is
-      included in the returned results. This "thinning" comes at a cost of
-      reduced statistical power, while reducing memory requirements and
-      autocorrelation. For more discussion see [1].
-      Default value: 0 (i.e., no subsampling).
-    seed: Python integer to seed the random number generator.
-    current_target_log_prob: (Optional) `Tensor` representing the value of
-      `target_log_prob_fn` at the `current_state`. The only reason to specify
-      this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
-      representing gradient of `target_log_prob` at the `current_state` and wrt
-      the `current_state`. Must have same shape as `current_state`. The only
-      reason to specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "hmc_sample_chain").
-
-  Returns:
-    accepted_states: Tensor or Python list of `Tensor`s representing the
-      state(s) of the Markov chain(s) at each result step. Has same shape as
-      input `current_state` but with a prepended `num_results`-size dimension.
-    kernel_results: `collections.namedtuple` of internal calculations used to
-      advance the chain.
+  ```python
+  # Sampling from the posterior of a Bayesian regression model.:
+
+  # Run 100 chains in parallel, each with a different initialization.
+  initial_beta = tf.random_normal([100, x.shape[1]])
+  chain, acceptance_probs = hmc.chain(1000, 0.1, 10, initial_beta,
+                                      log_joint_partial, event_dims=[1])
+  # Discard first halves of chains as warmup/burn-in
+  warmed_up = chain[500:]
+  # Averaging across samples within a chain and across chains
+  mean_est = tf.reduce_mean(warmed_up, [0, 1])
+  var_est = tf.reduce_mean(tf.square(warmed_up), [0, 1]) - tf.square(mean_est)
+  ```
   """
-  with ops.name_scope(
-      name, "hmc_sample_chain",
-      [num_results, current_state, step_size, num_leapfrog_steps,
-       num_burnin_steps, num_steps_between_results, seed,
-       current_target_log_prob, current_grads_target_log_prob]):
-    with ops.name_scope("initialize"):
-      [
-          current_state,
-          step_size,
-          current_target_log_prob,
-          current_grads_target_log_prob,
-      ] = _prepare_args(
-          target_log_prob_fn, current_state, step_size,
-          current_target_log_prob, current_grads_target_log_prob)
-    def _run_chain(num_steps, current_state, seed, kernel_results):
-      """Runs the chain(s) for `num_steps`."""
-      def _loop_body(iter_, current_state, kernel_results):
-        return [iter_ + 1] + list(kernel(
-            target_log_prob_fn,
-            current_state,
-            step_size,
-            num_leapfrog_steps,
-            seed,
-            kernel_results.current_target_log_prob,
-            kernel_results.current_grads_target_log_prob))
-      return control_flow_ops.while_loop(
-          cond=lambda iter_, *args: iter_ < num_steps,
-          body=_loop_body,
-          loop_vars=[0, current_state, kernel_results])[1:]  # Lop-off "iter_".
-
-    def _scan_body(args_list, _):
-      """Closure which implements `tf.scan` body."""
-      current_state, kernel_results = args_list
-      return _run_chain(num_steps_between_results + 1, current_state, seed,
-                        kernel_results)
-
-    current_state, kernel_results = _run_chain(
-        num_burnin_steps,
-        current_state,
-        distributions_util.gen_new_seed(
-            seed, salt="hmc_sample_chain_burnin"),
-        _make_dummy_kernel_results(
-            current_state,
-            current_target_log_prob,
-            current_grads_target_log_prob))
-
+  with ops.name_scope(name, 'hmc_chain', [n_iterations, step_size,
+                                          n_leapfrog_steps, initial_x]):
+    initial_x = ops.convert_to_tensor(initial_x, name='initial_x')
+    non_event_shape = array_ops.shape(target_log_prob_fn(initial_x))
+
+    def body(a, _):
+      updated_x, acceptance_probs, log_prob, grad = kernel(
+          step_size, n_leapfrog_steps, a[0], target_log_prob_fn, event_dims,
+          a[2], a[3])
+      return updated_x, acceptance_probs, log_prob, grad
+
+    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
+    potential, grad = potential_and_grad(initial_x)
     return functional_ops.scan(
-        fn=_scan_body,
-        elems=array_ops.zeros(num_results, dtype=dtypes.bool),  # Dummy arg.
-        initializer=[current_state, kernel_results])
-
-
-def sample_annealed_importance_chain(
-    proposal_log_prob_fn,
-    num_steps,
-    target_log_prob_fn,
-    current_state,
-    step_size,
-    num_leapfrog_steps,
-    seed=None,
-    name=None):
+        body, array_ops.zeros(n_iterations, dtype=initial_x.dtype),
+        (initial_x,
+         array_ops.zeros(non_event_shape, dtype=initial_x.dtype),
+         -potential, -grad))[:2]
+
+
+def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
+              target_log_prob_fn, proposal_log_prob_fn, event_dims=(),
+              name=None):
   """Runs annealed importance sampling (AIS) to estimate normalizing constants.
 
-  This function uses Hamiltonian Monte Carlo to sample from a series of
+  This routine uses Hamiltonian Monte Carlo to sample from a series of
   distributions that slowly interpolates between an initial "proposal"
-  distribution:
+  distribution
 
   `exp(proposal_log_prob_fn(x) - proposal_log_normalizer)`
 
-  and the target distribution:
+  and the target distribution
 
   `exp(target_log_prob_fn(x) - target_log_normalizer)`,
 
@@ -325,183 +202,113 @@ def sample_annealed_importance_chain(
   normalizing constants of the initial distribution and the target
   distribution:
 
-  `E[exp(ais_weights)] = exp(target_log_normalizer - proposal_log_normalizer)`.
+  E[exp(w)] = exp(target_log_normalizer - proposal_log_normalizer).
 
-  #### Examples:
+  Args:
+    n_iterations: Integer number of Markov chain updates to run. More
+      iterations means more expense, but smoother annealing between q
+      and p, which in turn means exponentially lower variance for the
+      normalizing constant estimator.
+    step_size: Scalar step size or array of step sizes for the
+      leapfrog integrator. Broadcasts to the shape of
+      `initial_x`. Larger step sizes lead to faster progress, but
+      too-large step sizes make rejection exponentially more likely.
+      When possible, it's often helpful to match per-variable step
+      sizes to the standard deviations of the target distribution in
+      each variable.
+    n_leapfrog_steps: Integer number of steps to run the leapfrog
+      integrator for. Total progress per HMC step is roughly
+      proportional to step_size * n_leapfrog_steps.
+    initial_x: Tensor of initial state(s) of the Markov chain(s). Must
+      be a sample from q, or results will be incorrect.
+    target_log_prob_fn: Python callable which takes an argument like `initial_x`
+      and returns its (possibly unnormalized) log-density under the target
+      distribution.
+    proposal_log_prob_fn: Python callable that returns the log density of the
+      initial distribution.
+    event_dims: List of dimensions that should not be treated as
+      independent. This allows for multiple chains to be run independently
+      in parallel. Default is (), i.e., all dimensions are independent.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    ais_weights: Tensor with the estimated weight(s). Has shape matching
+      `target_log_prob_fn(initial_x)`.
+    chain_states: Tensor with the state(s) of the Markov chain(s) the final
+      iteration. Has shape matching `initial_x`.
+    acceptance_probs: Tensor with the acceptance probabilities for the final
+      iteration. Has shape matching `target_log_prob_fn(initial_x)`.
 
-  ##### Estimate the normalizing constant of a log-gamma distribution.
+  #### Examples:
 
   ```python
-  tfd = tf.contrib.distributions
-
+  # Estimating the normalizing constant of a log-gamma distribution:
+  def proposal_log_prob(x):
+    # Standard normal log-probability. This is properly normalized.
+    return tf.reduce_sum(-0.5 * tf.square(x) - 0.5 * np.log(2 * np.pi), 1)
+  def target_log_prob(x):
+    # Unnormalized log-gamma(2, 3) distribution.
+    # True normalizer is (lgamma(2) - 2 * log(3)) * x.shape[1]
+    return tf.reduce_sum(2. * x - 3. * tf.exp(x), 1)
   # Run 100 AIS chains in parallel
-  num_chains = 100
-  dims = 20
-  dtype = np.float32
-
-  proposal = tfd.MultivatiateNormalDiag(
-     loc=tf.zeros([dims], dtype=dtype))
-
-  target = tfd.TransformedDistribution(
-    distribution=tfd.Gamma(concentration=dtype(2),
-                           rate=dtype(3)),
-    bijector=tfd.bijectors.Invert(tfd.bijectors.Exp()),
-    event_shape=[dims])
-
-  chains_state, ais_weights, kernels_results = (
-      hmc.sample_annealed_importance_chain(
-          proposal_log_prob_fn=proposal.log_prob,
-          num_steps=1000,
-          target_log_prob_fn=target.log_prob,
-          step_size=0.2,
-          current_state=proposal.sample(num_chains),
-          num_leapfrog_steps=2))
-
-  log_estimated_normalizer = (tf.reduce_logsumexp(ais_weights)
-                              - np.log(num_chains))
-  log_true_normalizer = tf.lgamma(2.) - 2. * tf.log(3.)
+  initial_x = tf.random_normal([100, 20])
+  w, _, _ = hmc.ais_chain(1000, 0.2, 2, initial_x, target_log_prob,
+                          proposal_log_prob, event_dims=[1])
+  log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100)
   ```
 
-  ##### Estimate marginal likelihood of a Bayesian regression model.
-
   ```python
-  tfd = tf.contrib.distributions
-
-  def make_prior(dims, dtype):
-    return tfd.MultivariateNormalDiag(
-        loc=tf.zeros(dims, dtype))
-
-  def make_likelihood(weights, x):
-    return tfd.MultivariateNormalDiag(
-        loc=tf.tensordot(weights, x, axes=[[0], [-1]]))
-
+  # Estimating the marginal likelihood of a Bayesian regression model:
+  base_measure = -0.5 * np.log(2 * np.pi)
+  def proposal_log_prob(x):
+    # Standard normal log-probability. This is properly normalized.
+    return tf.reduce_sum(-0.5 * tf.square(x) + base_measure, 1)
+  def regression_log_joint(beta, x, y):
+    # This function returns a vector whose ith element is log p(beta[i], y | x).
+    # Each row of beta corresponds to the state of an independent Markov chain.
+    log_prior = tf.reduce_sum(-0.5 * tf.square(beta) + base_measure, 1)
+    means = tf.matmul(beta, x, transpose_b=True)
+    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means) +
+                                   base_measure, 1)
+    return log_prior + log_likelihood
+  def log_joint_partial(beta):
+    return regression_log_joint(beta, x, y)
   # Run 100 AIS chains in parallel
-  num_chains = 100
-  dims = 10
-  dtype = np.float32
-
-  # Make training data.
-  x = np.random.randn(num_chains, dims).astype(dtype)
-  true_weights = np.random.randn(dims).astype(dtype)
-  y = np.dot(x, true_weights) + np.random.randn(num_chains)
-
-  # Setup model.
-  prior = make_prior(dims, dtype)
-  def target_log_prob_fn(weights):
-    return prior.log_prob(weights) + make_likelihood(weights, x).log_prob(y)
-
-  proposal = tfd.MultivariateNormalDiag(
-      loc=tf.zeros(dims, dtype))
-
-  weight_samples, ais_weights, kernel_results = (
-      hmc.sample_annealed_importance_chain(
-        num_steps=1000,
-        proposal_log_prob_fn=proposal.log_prob,
-        target_log_prob_fn=target_log_prob_fn
-        current_state=tf.zeros([num_chains, dims], dtype),
-        step_size=0.1,
-        num_leapfrog_steps=2))
-  log_normalizer_estimate = (tf.reduce_logsumexp(ais_weights)
-                             - np.log(num_chains))
+  initial_beta = tf.random_normal([100, x.shape[1]])
+  w, beta_samples, _ = hmc.ais_chain(1000, 0.1, 2, initial_beta,
+                                     log_joint_partial, proposal_log_prob,
+                                     event_dims=[1])
+  log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100)
   ```
-
-  Args:
-    proposal_log_prob_fn: Python callable that returns the log density of the
-      initial distribution.
-    num_steps: Integer number of Markov chain updates to run. More
-      iterations means more expense, but smoother annealing between q
-      and p, which in turn means exponentially lower variance for the
-      normalizing constant estimator.
-    target_log_prob_fn: Python callable which takes an argument like
-      `current_state` (or `*current_state` if it's a list) and returns its
-      (possibly unnormalized) log-density under the target distribution.
-    current_state: `Tensor` or Python `list` of `Tensor`s representing the
-      current state(s) of the Markov chain(s). The first `r` dimensions index
-      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
-    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
-      for the leapfrog integrator. Must broadcast with the shape of
-      `current_state`. Larger step sizes lead to faster progress, but too-large
-      step sizes make rejection exponentially more likely. When possible, it's
-      often helpful to match per-variable step sizes to the standard deviations
-      of the target distribution in each variable.
-    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
-      for. Total progress per HMC step is roughly proportional to `step_size *
-      num_leapfrog_steps`.
-    seed: Python integer to seed the random number generator.
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "hmc_sample_annealed_importance_chain").
-
-  Returns:
-    accepted_state: `Tensor` or Python list of `Tensor`s representing the
-      state(s) of the Markov chain(s) at the final iteration. Has same shape as
-      input `current_state`.
-    ais_weights: Tensor with the estimated weight(s). Has shape matching
-      `target_log_prob_fn(current_state)`.
   """
-  def make_convex_combined_log_prob_fn(iter_):
-    def _fn(*args):
-      p = proposal_log_prob_fn(*args)
-      t = target_log_prob_fn(*args)
-      dtype = p.dtype.base_dtype
-      beta = (math_ops.cast(iter_ + 1, dtype)
-              / math_ops.cast(num_steps, dtype))
-      return (1. - beta) * p + beta * t
-    return _fn
-
-  with ops.name_scope(
-      name, "hmc_sample_annealed_importance_chain",
-      [num_steps, current_state, step_size, num_leapfrog_steps, seed]):
-    with ops.name_scope("initialize"):
-      [
-          current_state,
-          step_size,
-          current_log_prob,
-          current_grads_log_prob,
-      ] = _prepare_args(
-          make_convex_combined_log_prob_fn(iter_=0),
-          current_state,
-          step_size,
-          description="convex_combined_log_prob")
-    def _loop_body(iter_, ais_weights, current_state, kernel_results):
-      """Closure which implements `tf.while_loop` body."""
-      current_state_parts = (list(current_state)
-                             if _is_list_like(current_state)
-                             else [current_state])
-      ais_weights += ((target_log_prob_fn(*current_state_parts)
-                       - proposal_log_prob_fn(*current_state_parts))
-                      / math_ops.cast(num_steps, ais_weights.dtype))
-      return [iter_ + 1, ais_weights] + list(kernel(
-          make_convex_combined_log_prob_fn(iter_),
-          current_state,
-          step_size,
-          num_leapfrog_steps,
-          seed,
-          kernel_results.current_target_log_prob,
-          kernel_results.current_grads_target_log_prob))
-
-    [ais_weights, current_state, kernel_results] = control_flow_ops.while_loop(
-        cond=lambda iter_, *args: iter_ < num_steps,
-        body=_loop_body,
-        loop_vars=[
-            0,  # iter_
-            array_ops.zeros_like(current_log_prob),  # ais_weights
-            current_state,
-            _make_dummy_kernel_results(current_state,
-                                       current_log_prob,
-                                       current_grads_log_prob),
-        ])[1:]  # Lop-off "iter_".
-
-    return [current_state, ais_weights, kernel_results]
-
-
-def kernel(target_log_prob_fn,
-           current_state,
-           step_size,
-           num_leapfrog_steps,
-           seed=None,
-           current_target_log_prob=None,
-           current_grads_target_log_prob=None,
-           name=None):
+  with ops.name_scope(name, 'hmc_ais_chain',
+                      [n_iterations, step_size, n_leapfrog_steps, initial_x]):
+    non_event_shape = array_ops.shape(target_log_prob_fn(initial_x))
+
+    beta_series = math_ops.linspace(0., 1., n_iterations+1)[1:]
+    def _body(a, beta):  # pylint: disable=missing-docstring
+      def log_prob_beta(x):
+        return ((1 - beta) * proposal_log_prob_fn(x) +
+                beta * target_log_prob_fn(x))
+      last_x = a[0]
+      w = a[2]
+      w += (1. / n_iterations) * (target_log_prob_fn(last_x) -
+                                  proposal_log_prob_fn(last_x))
+      # TODO(b/66917083): There's an opportunity for gradient reuse here.
+      updated_x, acceptance_probs, _, _ = kernel(step_size, n_leapfrog_steps,
+                                                 last_x, log_prob_beta,
+                                                 event_dims)
+      return updated_x, acceptance_probs, w
+
+    x, acceptance_probs, w = functional_ops.scan(
+        _body, beta_series,
+        (initial_x, array_ops.zeros(non_event_shape, dtype=initial_x.dtype),
+         array_ops.zeros(non_event_shape, dtype=initial_x.dtype)))
+  return w[-1], x[-1], acceptance_probs[-1]
+
+
+def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
+           x_log_prob=None, x_grad=None, name=None):
   """Runs one iteration of Hamiltonian Monte Carlo.
 
   Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
@@ -509,625 +316,334 @@ def kernel(target_log_prob_fn,
   a Metropolis proposal. This function applies one step of HMC to
   randomly update the variable `x`.
 
-  This function can update multiple chains in parallel. It assumes that all
-  leftmost dimensions of `current_state` index independent chain states (and are
-  therefore updated independently). The output of `target_log_prob_fn()` should
-  sum log-probabilities across all event dimensions. Slices along the rightmost
-  dimensions may have different target distributions; for example,
-  `current_state[0, :]` could have a different target distribution from
-  `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of
-  independent chains is `tf.size(target_log_prob_fn(*current_state))`.)
+  This function can update multiple chains in parallel. It assumes
+  that all dimensions of `x` not specified in `event_dims` are
+  independent, and should therefore be updated independently. The
+  output of `target_log_prob_fn()` should sum log-probabilities across
+  all event dimensions. Slices along dimensions not in `event_dims`
+  may have different target distributions; for example, if
+  `event_dims == (1,)`, then `x[0, :]` could have a different target
+  distribution from x[1, :]. This is up to `target_log_prob_fn()`.
 
-  #### Examples:
+  Args:
+    step_size: Scalar step size or array of step sizes for the
+      leapfrog integrator. Broadcasts to the shape of
+      `x`. Larger step sizes lead to faster progress, but
+      too-large step sizes make rejection exponentially more likely.
+      When possible, it's often helpful to match per-variable step
+      sizes to the standard deviations of the target distribution in
+      each variable.
+    n_leapfrog_steps: Integer number of steps to run the leapfrog
+      integrator for. Total progress per HMC step is roughly
+      proportional to step_size * n_leapfrog_steps.
+    x: Tensor containing the value(s) of the random variable(s) to update.
+    target_log_prob_fn: Python callable which takes an argument like `initial_x`
+      and returns its (possibly unnormalized) log-density under the target
+      distribution.
+    event_dims: List of dimensions that should not be treated as
+      independent. This allows for multiple chains to be run independently
+      in parallel. Default is (), i.e., all dimensions are independent.
+    x_log_prob (optional): Tensor containing the cached output of a previous
+      call to `target_log_prob_fn()` evaluated at `x` (such as that provided by
+      a previous call to `kernel()`). Providing `x_log_prob` and
+      `x_grad` saves one gradient computation per call to `kernel()`.
+    x_grad (optional): Tensor containing the cached gradient of
+      `target_log_prob_fn()` evaluated at `x` (such as that provided by
+      a previous call to `kernel()`). Providing `x_log_prob` and
+      `x_grad` saves one gradient computation per call to `kernel()`.
+    name: Python `str` name prefixed to Ops created by this function.
 
-  ##### Simple chain with warm-up.
+  Returns:
+    updated_x: The updated variable(s) x. Has shape matching `initial_x`.
+    acceptance_probs: Tensor with the acceptance probabilities for the final
+      iteration. This is useful for diagnosing step size problems etc. Has
+      shape matching `target_log_prob_fn(initial_x)`.
+    new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`.
+    new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at
+      `updated_x`.
 
-  ```python
-  tfd = tf.contrib.distributions
+  #### Examples:
 
+  ```python
   # Tuning acceptance rates:
-  dtype = np.float32
   target_accept_rate = 0.631
-  num_warmup_iter = 500
-  num_chain_iter = 500
-
-  x = tf.get_variable(name="x", initializer=dtype(1))
-  step_size = tf.get_variable(name="step_size", initializer=dtype(1))
-
-  target = tfd.Normal(loc=dtype(0), scale=dtype(1))
-
-  new_x, other_results = hmc.kernel(
-      target_log_prob_fn=target.log_prob,
-      current_state=x,
-      step_size=step_size,
-      num_leapfrog_steps=3)[:4]
-
-  x_update = x.assign(new_x)
-
-  step_size_update = step_size.assign_add(
-      step_size * tf.where(
-        other_results.acceptance_probs > target_accept_rate,
-        0.01, -0.01))
-
-  warmup = tf.group([x_update, step_size_update])
-
-  tf.global_variables_initializer().run()
-
-  sess.graph.finalize()  # No more graph building.
-
+  def target_log_prob(x):
+    # Standard normal
+    return tf.reduce_sum(-0.5 * tf.square(x))
+  initial_x = tf.zeros([10])
+  initial_log_prob = target_log_prob(initial_x)
+  initial_grad = tf.gradients(initial_log_prob, initial_x)[0]
+  # Algorithm state
+  x = tf.Variable(initial_x, name='x')
+  step_size = tf.Variable(1., name='step_size')
+  last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob')
+  last_grad = tf.Variable(initial_grad, name='last_grad')
+  # Compute updates
+  new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x,
+                                                      target_log_prob,
+                                                      event_dims=[0],
+                                                      x_log_prob=last_log_prob)
+  x_update = tf.assign(x, new_x)
+  log_prob_update = tf.assign(last_log_prob, log_prob)
+  grad_update = tf.assign(last_grad, grad)
+  step_size_update = tf.assign(step_size,
+                               tf.where(acceptance_prob > target_accept_rate,
+                                        step_size * 1.01, step_size / 1.01))
+  adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update]
+  sampling_updates = [x_update, log_prob_update, grad_update]
+
+  sess = tf.Session()
+  sess.run(tf.global_variables_initializer())
   # Warm up the sampler and adapt the step size
-  for _ in xrange(num_warmup_iter):
-    sess.run(warmup)
-
+  for i in xrange(500):
+    sess.run(adaptive_updates)
   # Collect samples without adapting step size
-  samples = np.zeros([num_chain_iter])
-  for i in xrange(num_chain_iter):
-    _, x_, target_log_prob_, grad_ = sess.run([
-        x_update,
-        x,
-        other_results.target_log_prob,
-        other_results.grads_target_log_prob])
-    samples[i] = x_
-
-  print(samples.mean(), samples.std())
+  samples = np.zeros([500, 10])
+  for i in xrange(500):
+    x_val, _ = sess.run([new_x, sampling_updates])
+    samples[i] = x_val
   ```
 
-  ##### Sample from more complicated posterior.
-
-  I.e.,
-
-  ```none
-    W ~ MVN(loc=0, scale=sigma * eye(dims))
-    for i=1...num_samples:
-        X[i] ~ MVN(loc=0, scale=eye(dims))
-      eps[i] ~ Normal(loc=0, scale=1)
-        Y[i] = X[i].T * W + eps[i]
+  ```python
+  # Empirical-Bayes estimation of a hyperparameter by MCMC-EM:
+
+  # Problem setup
+  N = 150
+  D = 10
+  x = np.random.randn(N, D).astype(np.float32)
+  true_sigma = 0.5
+  true_beta = true_sigma * np.random.randn(D).astype(np.float32)
+  y = x.dot(true_beta) + np.random.randn(N).astype(np.float32)
+
+  def log_prior(beta, log_sigma):
+    return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) -
+                         log_sigma)
+  def regression_log_joint(beta, log_sigma, x, y):
+    # This function returns log p(beta | log_sigma) + log p(y | x, beta).
+    means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True)
+    means = tf.squeeze(means)
+    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means))
+    return log_prior(beta, log_sigma) + log_likelihood
+  def log_joint_partial(beta):
+    return regression_log_joint(beta, log_sigma, x, y)
+  # Our estimate of log(sigma)
+  log_sigma = tf.Variable(0., name='log_sigma')
+  # The state of the Markov chain
+  beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta')
+  new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial,
+                                 event_dims=[0])
+  beta_update = tf.assign(beta, new_beta)
+  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+  with tf.control_dependencies([beta_update]):
+    log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma),
+                                          var_list=[log_sigma])
+
+  sess = tf.Session()
+  sess.run(tf.global_variables_initializer())
+  log_sigma_history = np.zeros(1000)
+  for i in xrange(1000):
+    log_sigma_val, _ = sess.run([log_sigma, log_sigma_update])
+    log_sigma_history[i] = log_sigma_val
+  # Should converge to something close to true_sigma
+  plt.plot(np.exp(log_sigma_history))
   ```
+  """
+  with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]):
+    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
+    x = ops.convert_to_tensor(x, name='x')
+
+    x_shape = array_ops.shape(x)
+    m = random_ops.random_normal(x_shape, dtype=x.dtype)
+
+    kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims)
+
+    if (x_log_prob is not None) and (x_grad is not None):
+      log_potential_0, grad_0 = -x_log_prob, -x_grad  # pylint: disable=invalid-unary-operand-type
+    else:
+      if x_log_prob is not None:
+        logging.warn('x_log_prob was provided, but x_grad was not,'
+                     ' so x_log_prob was not used.')
+      if x_grad is not None:
+        logging.warn('x_grad was provided, but x_log_prob was not,'
+                     ' so x_grad was not used.')
+      log_potential_0, grad_0 = potential_and_grad(x)
+
+    new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator(
+        step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0)
+
+    kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims)
+
+    energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0
+    # Treat NaN as infinite energy (and therefore guaranteed rejection).
+    energy_change = array_ops.where(
+        math_ops.is_nan(energy_change),
+        array_ops.fill(array_ops.shape(energy_change),
+                       energy_change.dtype.as_numpy_dtype(np.inf)),
+        energy_change)
+    acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.))
+    accepted = (
+        random_ops.random_uniform(
+            array_ops.shape(acceptance_probs), dtype=x.dtype)
+        < acceptance_probs)
+    new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0)
+
+    # TODO(b/65738010): This should work, but it doesn't for now.
+    # reduced_shape = math_ops.reduced_shape(x_shape, event_dims)
+    reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims,
+                                                        keep_dims=True))
+    accepted = array_ops.reshape(accepted, reduced_shape)
+    accepted = math_ops.logical_or(
+        accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool))
+    new_x = array_ops.where(accepted, new_x, x)
+    new_grad = -array_ops.where(accepted, grad_1, grad_0)
+
+  # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect
+  # to initial_x will propagate NaNs (see testNanFromGradsDontPropagate).  This
+  # should be fixed.
+  return new_x, acceptance_probs, new_log_prob, new_grad
+
+
+def leapfrog_integrator(step_size, n_steps, initial_position, initial_momentum,
+                        potential_and_grad, initial_grad, name=None):
+  """Applies `n_steps` steps of the leapfrog integrator.
+
+  This just wraps `leapfrog_step()` in a `tf.while_loop()`, reusing
+  gradient computations where possible.
 
-  ```python
-  tfd = tf.contrib.distributions
-
-  def make_training_data(num_samples, dims, sigma):
-    dt = np.asarray(sigma).dtype
-    zeros = tf.zeros(dims, dtype=dt)
-    x = tfd.MultivariateNormalDiag(
-        loc=zeros).sample(num_samples, seed=1)
-    w = tfd.MultivariateNormalDiag(
-        loc=zeros,
-        scale_identity_multiplier=sigma).sample(seed=2)
-    noise = tfd.Normal(
-        loc=dt(0),
-        scale=dt(1)).sample(num_samples, seed=3)
-    y = tf.tensordot(x, w, axes=[[1], [0]]) + noise
-    return y, x, w
-
-  def make_prior(sigma, dims):
-    # p(w | sigma)
-    return tfd.MultivariateNormalDiag(
-        loc=tf.zeros([dims], dtype=sigma.dtype),
-        scale_identity_multiplier=sigma)
-
-  def make_likelihood(x, w):
-    # p(y | x, w)
-    return tfd.MultivariateNormalDiag(
-        loc=tf.tensordot(x, w, axes=[[1], [0]]))
-
-  # Setup assumptions.
-  dtype = np.float32
-  num_samples = 150
-  dims = 10
-  num_iters = int(5e3)
-
-  true_sigma = dtype(0.5)
-  y, x, true_weights = make_training_data(num_samples, dims, true_sigma)
-
-  # Estimate of `log(true_sigma)`.
-  log_sigma = tf.get_variable(name="log_sigma", initializer=dtype(0))
-  sigma = tf.exp(log_sigma)
-
-  # State of the Markov chain.
-  weights = tf.get_variable(
-      name="weights",
-      initializer=np.random.randn(dims).astype(dtype))
-
-  prior = make_prior(sigma, dims)
-
-  def joint_log_prob_fn(w):
-    # f(w) = log p(w, y | x)
-    return prior.log_prob(w) + make_likelihood(x, w).log_prob(y)
-
-  weights_update = weights.assign(
-      hmc.kernel(target_log_prob_fn=joint_log_prob,
-                 current_state=weights,
-                 step_size=0.1,
-                 num_leapfrog_steps=5)[0])
-
-  with tf.control_dependencies([weights_update]):
-    loss = -prior.log_prob(weights)
+  Args:
+    step_size: Scalar step size or array of step sizes for the
+      leapfrog integrator. Broadcasts to the shape of
+      `initial_position`. Larger step sizes lead to faster progress, but
+      too-large step sizes lead to larger discretization error and
+      worse energy conservation.
+    n_steps: Number of steps to run the leapfrog integrator.
+    initial_position: Tensor containing the value(s) of the position variable(s)
+      to update.
+    initial_momentum: Tensor containing the value(s) of the momentum variable(s)
+      to update.
+    potential_and_grad: Python callable that takes a position tensor like
+      `initial_position` and returns the potential energy and its gradient at
+      that position.
+    initial_grad: Tensor with the value of the gradient of the potential energy
+      at `initial_position`.
+    name: Python `str` name prefixed to Ops created by this function.
 
-  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-  log_sigma_update = optimizer.minimize(loss, var_list=[log_sigma])
+  Returns:
+    updated_position: Updated value of the position.
+    updated_momentum: Updated value of the momentum.
+    new_potential: Potential energy of the new position. Has shape matching
+      `potential_and_grad(initial_position)`.
+    new_grad: Gradient from potential_and_grad() evaluated at the new position.
+      Has shape matching `initial_position`.
+
+  Example: Simple quadratic potential.
 
-  sess.graph.finalize()  # No more graph building.
+  ```python
+  def potential_and_grad(position):
+    return tf.reduce_sum(0.5 * tf.square(position)), position
+  position = tf.placeholder(np.float32)
+  momentum = tf.placeholder(np.float32)
+  potential, grad = potential_and_grad(position)
+  new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_integrator(
+    0.1, 3, position, momentum, potential_and_grad, grad)
+
+  sess = tf.Session()
+  position_val = np.random.randn(10)
+  momentum_val = np.random.randn(10)
+  potential_val, grad_val = sess.run([potential, grad],
+                                     {position: position_val})
+  positions = np.zeros([100, 10])
+  for i in xrange(100):
+    position_val, momentum_val, potential_val, grad_val = sess.run(
+      [new_position, new_momentum, new_potential, new_grad],
+      {position: position_val, momentum: momentum_val})
+    positions[i] = position_val
+  # Should trace out sinusoidal dynamics.
+  plt.plot(positions[:, 0])
+  ```
+  """
+  def leapfrog_wrapper(step_size, x, m, grad, l):
+    x, m, _, grad = leapfrog_step(step_size, x, m, potential_and_grad, grad)
+    return step_size, x, m, grad, l + 1
 
-  tf.global_variables_initializer().run()
+  def counter_fn(a, b, c, d, counter):  # pylint: disable=unused-argument
+    return counter < n_steps
 
-  sigma_history = np.zeros(num_iters, dtype)
-  weights_history = np.zeros([num_iters, dims], dtype)
+  with ops.name_scope(name, 'leapfrog_integrator',
+                      [step_size, n_steps, initial_position, initial_momentum,
+                       initial_grad]):
+    _, new_x, new_m, new_grad, _ = control_flow_ops.while_loop(
+        counter_fn, leapfrog_wrapper, [step_size, initial_position,
+                                       initial_momentum, initial_grad,
+                                       array_ops.constant(0)], back_prop=False)
+    # We're counting on the runtime to eliminate this redundant computation.
+    new_potential, new_grad = potential_and_grad(new_x)
+  return new_x, new_m, new_potential, new_grad
 
-  for i in xrange(num_iters):
-    _, sigma_, weights_, _ = sess.run([log_sigma_update, sigma, weights])
-    weights_history[i, :] = weights_
-    sigma_history[i] = sigma_
 
-  true_weights_ = sess.run(true_weights)
+def leapfrog_step(step_size, position, momentum, potential_and_grad, grad,
+                  name=None):
+  """Applies one step of the leapfrog integrator.
 
-  # Should converge to something close to true_sigma.
-  plt.plot(sigma_history);
-  plt.ylabel("sigma");
-  plt.xlabel("iteration");
-  ```
+  Assumes a simple quadratic kinetic energy function: 0.5 * ||momentum||^2.
 
   Args:
-    target_log_prob_fn: Python callable which takes an argument like
-      `current_state` (or `*current_state` if it's a list) and returns its
-      (possibly unnormalized) log-density under the target distribution.
-    current_state: `Tensor` or Python `list` of `Tensor`s representing the
-      current state(s) of the Markov chain(s). The first `r` dimensions index
-      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
-    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
-      for the leapfrog integrator. Must broadcast with the shape of
-      `current_state`. Larger step sizes lead to faster progress, but too-large
-      step sizes make rejection exponentially more likely. When possible, it's
-      often helpful to match per-variable step sizes to the standard deviations
-      of the target distribution in each variable.
-    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
-      for. Total progress per HMC step is roughly proportional to `step_size *
-      num_leapfrog_steps`.
-    seed: Python integer to seed the random number generator.
-    current_target_log_prob: (Optional) `Tensor` representing the value of
-      `target_log_prob_fn` at the `current_state`. The only reason to
-      specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
-      representing gradient of `current_target_log_prob` at the `current_state`
-      and wrt the `current_state`. Must have same shape as `current_state`. The
-      only reason to specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
+    step_size: Scalar step size or array of step sizes for the
+      leapfrog integrator. Broadcasts to the shape of
+      `position`. Larger step sizes lead to faster progress, but
+      too-large step sizes lead to larger discretization error and
+      worse energy conservation.
+    position: Tensor containing the value(s) of the position variable(s)
+      to update.
+    momentum: Tensor containing the value(s) of the momentum variable(s)
+      to update.
+    potential_and_grad: Python callable that takes a position tensor like
+      `position` and returns the potential energy and its gradient at that
+      position.
+    grad: Tensor with the value of the gradient of the potential energy
+      at `position`.
     name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "hmc_kernel").
 
   Returns:
-    accepted_state: Tensor or Python list of `Tensor`s representing the state(s)
-      of the Markov chain(s) at each result step. Has same shape as
-      `current_state`.
-    acceptance_probs: Tensor with the acceptance probabilities for each
-      iteration. Has shape matching `target_log_prob_fn(current_state)`.
-    accepted_target_log_prob: `Tensor` representing the value of
-      `target_log_prob_fn` at `accepted_state`.
-    accepted_grads_target_log_prob: Python `list` of `Tensor`s representing the
-      gradient of `accepted_target_log_prob` wrt each `accepted_state`.
-
-  Raises:
-    ValueError: if there isn't one `step_size` or a list with same length as
-      `current_state`.
-  """
-  with ops.name_scope(
-      name, "hmc_kernel",
-      [current_state, step_size, num_leapfrog_steps, seed,
-       current_target_log_prob, current_grads_target_log_prob]):
-    with ops.name_scope("initialize"):
-      [current_state_parts, step_sizes, current_target_log_prob,
-       current_grads_target_log_prob] = _prepare_args(
-           target_log_prob_fn, current_state, step_size,
-           current_target_log_prob, current_grads_target_log_prob,
-           maybe_expand=True)
-      independent_chain_ndims = distributions_util.prefer_static_rank(
-          current_target_log_prob)
-      def init_momentum(s):
-        return random_ops.random_normal(
-            shape=array_ops.shape(s),
-            dtype=s.dtype.base_dtype,
-            seed=distributions_util.gen_new_seed(
-                seed, salt="hmc_kernel_momentums"))
-      current_momentums = [init_momentum(s) for s in current_state_parts]
-
-    [
-        proposed_momentums,
-        proposed_state_parts,
-        proposed_target_log_prob,
-        proposed_grads_target_log_prob,
-    ] = _leapfrog_integrator(current_momentums,
-                             target_log_prob_fn,
-                             current_state_parts,
-                             step_sizes,
-                             num_leapfrog_steps,
-                             current_target_log_prob,
-                             current_grads_target_log_prob)
-
-    energy_change = _compute_energy_change(current_target_log_prob,
-                                           current_momentums,
-                                           proposed_target_log_prob,
-                                           proposed_momentums,
-                                           independent_chain_ndims)
-
-    # u < exp(min(-energy, 0)),  where u~Uniform[0,1)
-    # ==> -log(u) >= max(e, 0)
-    # ==> -log(u) >= e
-    # (Perhaps surprisingly, we don't have a better way to obtain a random
-    # uniform from positive reals, i.e., `tf.random_uniform(minval=0,
-    # maxval=np.inf)` won't work.)
-    random_uniform = random_ops.random_uniform(
-        shape=array_ops.shape(energy_change),
-        dtype=energy_change.dtype,
-        seed=seed)
-    random_positive = -math_ops.log(random_uniform)
-    is_accepted = random_positive >= energy_change
-
-    accepted_target_log_prob = array_ops.where(is_accepted,
-                                               proposed_target_log_prob,
-                                               current_target_log_prob)
-
-    accepted_state_parts = [_choose(is_accepted,
-                                    proposed_state_part,
-                                    current_state_part,
-                                    independent_chain_ndims)
-                            for current_state_part, proposed_state_part
-                            in zip(current_state_parts, proposed_state_parts)]
-
-    accepted_grads_target_log_prob = [
-        _choose(is_accepted,
-                proposed_grad,
-                grad,
-                independent_chain_ndims)
-        for proposed_grad, grad
-        in zip(proposed_grads_target_log_prob, current_grads_target_log_prob)]
-
-    maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0]
-    return [
-        maybe_flatten(accepted_state_parts),
-        KernelResults(
-            acceptance_probs=math_ops.exp(math_ops.minimum(-energy_change, 0.)),
-            current_grads_target_log_prob=accepted_grads_target_log_prob,
-            current_target_log_prob=accepted_target_log_prob,
-            energy_change=energy_change,
-            is_accepted=is_accepted,
-            proposed_grads_target_log_prob=proposed_grads_target_log_prob,
-            proposed_state=maybe_flatten(proposed_state_parts),
-            proposed_target_log_prob=proposed_target_log_prob,
-            random_positive=random_positive,
-        ),
-    ]
-
-
-def _leapfrog_integrator(current_momentums,
-                         target_log_prob_fn,
-                         current_state_parts,
-                         step_sizes,
-                         num_leapfrog_steps,
-                         current_target_log_prob=None,
-                         current_grads_target_log_prob=None,
-                         name=None):
-  """Applies `num_leapfrog_steps` of the leapfrog integrator.
-
-  Assumes a simple quadratic kinetic energy function: `0.5 ||momentum||**2`.
-
-  #### Examples:
+    updated_position: Updated value of the position.
+    updated_momentum: Updated value of the momentum.
+    new_potential: Potential energy of the new position. Has shape matching
+      `potential_and_grad(position)`.
+    new_grad: Gradient from potential_and_grad() evaluated at the new position.
+      Has shape matching `position`.
 
-  ##### Simple quadratic potential.
+  Example: Simple quadratic potential.
 
   ```python
-  tfd = tf.contrib.distributions
-
-  dims = 10
-  num_iter = int(1e3)
-  dtype = np.float32
-
+  def potential_and_grad(position):
+    # Simple quadratic potential
+    return tf.reduce_sum(0.5 * tf.square(position)), position
   position = tf.placeholder(np.float32)
   momentum = tf.placeholder(np.float32)
-
-  [
-      new_momentums,
-      new_positions,
-  ] = hmc._leapfrog_integrator(
-      current_momentums=[momentum],
-      target_log_prob_fn=tfd.MultivariateNormalDiag(
-          loc=tf.zeros(dims, dtype)).log_prob,
-      current_state_parts=[position],
-      step_sizes=0.1,
-      num_leapfrog_steps=3)[:2]
-
-  sess.graph.finalize()  # No more graph building.
-
-  momentum_ = np.random.randn(dims).astype(dtype)
-  position_ = np.random.randn(dims).astype(dtype)
-
-  positions = np.zeros([num_iter, dims], dtype)
-  for i in xrange(num_iter):
-    position_, momentum_ = sess.run(
-        [new_momentums[0], new_position[0]],
-        feed_dict={position: position_, momentum: momentum_})
-    positions[i] = position_
-
-  plt.plot(positions[:, 0]);  # Sinusoidal.
+  potential, grad = potential_and_grad(position)
+  new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_step(
+    0.1, position, momentum, potential_and_grad, grad)
+
+  sess = tf.Session()
+  position_val = np.random.randn(10)
+  momentum_val = np.random.randn(10)
+  potential_val, grad_val = sess.run([potential, grad],
+                                     {position: position_val})
+  positions = np.zeros([100, 10])
+  for i in xrange(100):
+    position_val, momentum_val, potential_val, grad_val = sess.run(
+      [new_position, new_momentum, new_potential, new_grad],
+      {position: position_val, momentum: momentum_val})
+    positions[i] = position_val
+  # Should trace out sinusoidal dynamics.
+  plt.plot(positions[:, 0])
   ```
-
-  Args:
-    current_momentums: Tensor containing the value(s) of the momentum
-      variable(s) to update.
-    target_log_prob_fn: Python callable which takes an argument like
-      `*current_state_parts` and returns its (possibly unnormalized) log-density
-      under the target distribution.
-    current_state_parts: Python `list` of `Tensor`s representing the current
-      state(s) of the Markov chain(s). The first `independent_chain_ndims` of
-      the `Tensor`(s) index different chains.
-    step_sizes: Python `list` of `Tensor`s representing the step size for the
-      leapfrog integrator. Must broadcast with the shape of
-      `current_state_parts`.  Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely. When
-      possible, it's often helpful to match per-variable step sizes to the
-      standard deviations of the target distribution in each variable.
-    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
-      for. Total progress per HMC step is roughly proportional to `step_size *
-      num_leapfrog_steps`.
-    current_target_log_prob: (Optional) `Tensor` representing the value of
-      `target_log_prob_fn(*current_state_parts)`. The only reason to specify
-      this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
-      representing gradient of `target_log_prob_fn(*current_state_parts`) wrt
-      `current_state_parts`. Must have same shape as `current_state_parts`. The
-      only reason to specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "hmc_leapfrog_integrator").
-
-  Returns:
-    proposed_momentums: Updated value of the momentum.
-    proposed_state_parts: Tensor or Python list of `Tensor`s representing the
-      state(s) of the Markov chain(s) at each result step. Has same shape as
-      input `current_state_parts`.
-    proposed_target_log_prob: `Tensor` representing the value of
-      `target_log_prob_fn` at `accepted_state`.
-    proposed_grads_target_log_prob: Gradient of `proposed_target_log_prob` wrt
-      `accepted_state`.
-
-  Raises:
-    ValueError: if `len(momentums) != len(state_parts)`.
-    ValueError: if `len(state_parts) != len(step_sizes)`.
-    ValueError: if `len(state_parts) != len(grads_target_log_prob)`.
-    TypeError: if `not target_log_prob.dtype.is_floating`.
   """
-  def _loop_body(step,
-                 current_momentums,
-                 current_state_parts,
-                 ignore_current_target_log_prob,  # pylint: disable=unused-argument
-                 current_grads_target_log_prob):
-    return [step + 1] + list(_leapfrog_step(current_momentums,
-                                            target_log_prob_fn,
-                                            current_state_parts,
-                                            step_sizes,
-                                            current_grads_target_log_prob))
-
-  with ops.name_scope(
-      name, "hmc_leapfrog_integrator",
-      [current_momentums, current_state_parts, step_sizes, num_leapfrog_steps,
-       current_target_log_prob, current_grads_target_log_prob]):
-    if len(current_momentums) != len(current_state_parts):
-      raise ValueError("`momentums` must be in one-to-one correspondence "
-                       "with `state_parts`")
-    num_leapfrog_steps = ops.convert_to_tensor(num_leapfrog_steps,
-                                               name="num_leapfrog_steps")
-    current_target_log_prob, current_grads_target_log_prob = (
-        _maybe_call_fn_and_grads(
-            target_log_prob_fn,
-            current_state_parts,
-            current_target_log_prob,
-            current_grads_target_log_prob))
-    return control_flow_ops.while_loop(
-        cond=lambda iter_, *args: iter_ < num_leapfrog_steps,
-        body=_loop_body,
-        loop_vars=[
-            0,  # iter_
-            current_momentums,
-            current_state_parts,
-            current_target_log_prob,
-            current_grads_target_log_prob,
-        ],
-        back_prop=False)[1:]  # Lop-off "iter_".
-
-
-def _leapfrog_step(current_momentums,
-                   target_log_prob_fn,
-                   current_state_parts,
-                   step_sizes,
-                   current_grads_target_log_prob,
-                   name=None):
-  """Applies one step of the leapfrog integrator."""
-  with ops.name_scope(
-      name, "_leapfrog_step",
-      [current_momentums, current_state_parts, step_sizes,
-       current_grads_target_log_prob]):
-    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
-                          in zip(current_momentums,
-                                 step_sizes,
-                                 current_grads_target_log_prob)]
-    proposed_state_parts = [x + ss * m for x, ss, m
-                            in zip(current_state_parts,
-                                   step_sizes,
-                                   proposed_momentums)]
-    proposed_target_log_prob = target_log_prob_fn(*proposed_state_parts)
-    if not proposed_target_log_prob.dtype.is_floating:
-      raise TypeError("`target_log_prob_fn` must produce a `Tensor` "
-                      "with `float` `dtype`.")
-    proposed_grads_target_log_prob = gradients_ops.gradients(
-        proposed_target_log_prob, proposed_state_parts)
-    if any(g is None for g in proposed_grads_target_log_prob):
-      raise ValueError(
-          "Encountered `None` gradient. Does your target `target_log_prob_fn` "
-          "access all `tf.Variable`s via `tf.get_variable`?\n"
-          "  current_state_parts: {}\n"
-          "  proposed_state_parts: {}\n"
-          "  proposed_grads_target_log_prob: {}".format(
-              current_state_parts,
-              proposed_state_parts,
-              proposed_grads_target_log_prob))
-    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
-                          in zip(proposed_momentums,
-                                 step_sizes,
-                                 proposed_grads_target_log_prob)]
-    return [
-        proposed_momentums,
-        proposed_state_parts,
-        proposed_target_log_prob,
-        proposed_grads_target_log_prob,
-    ]
-
-
-def _compute_energy_change(current_target_log_prob,
-                           current_momentums,
-                           proposed_target_log_prob,
-                           proposed_momentums,
-                           independent_chain_ndims,
-                           name=None):
-  """Helper to `kernel` which computes the energy change."""
-  with ops.name_scope(
-      name, "compute_energy_change",
-      ([current_target_log_prob, proposed_target_log_prob,
-        independent_chain_ndims] +
-       current_momentums + proposed_momentums)):
-    # Abbreviate lk0=log_kinetic_energy and lk1=proposed_log_kinetic_energy
-    # since they're a mouthful and lets us inline more.
-    lk0, lk1 = [], []
-    for current_momentum, proposed_momentum in zip(current_momentums,
-                                                   proposed_momentums):
-      axis = math_ops.range(independent_chain_ndims,
-                            array_ops.rank(current_momentum))
-      lk0.append(_log_sum_sq(current_momentum, axis))
-      lk1.append(_log_sum_sq(proposed_momentum, axis))
-
-    lk0 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk0, axis=-1),
-                                                  axis=-1)
-    lk1 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk1, axis=-1),
-                                                  axis=-1)
-    lp0 = -current_target_log_prob   # log_potential
-    lp1 = -proposed_target_log_prob  # proposed_log_potential
-    x = array_ops.stack([lp1, math_ops.exp(lk1), -lp0, -math_ops.exp(lk0)],
-                        axis=-1)
-
-    # The sum is NaN if any element is NaN or we see both +Inf and -Inf.
-    # Thus we will replace such rows with infinite energy change which implies
-    # rejection. Recall that float-comparisons with NaN are always False.
-    is_sum_determinate = (
-        math_ops.reduce_all(math_ops.is_finite(x) | (x >= 0.), axis=-1) &
-        math_ops.reduce_all(math_ops.is_finite(x) | (x <= 0.), axis=-1))
-    is_sum_determinate = array_ops.tile(
-        is_sum_determinate[..., array_ops.newaxis],
-        multiples=array_ops.concat([
-            array_ops.ones(array_ops.rank(is_sum_determinate),
-                           dtype=dtypes.int32),
-            [4],
-        ], axis=0))
-    x = array_ops.where(is_sum_determinate,
-                        x,
-                        array_ops.fill(array_ops.shape(x),
-                                       value=x.dtype.as_numpy_dtype(np.inf)))
-
-    return math_ops.reduce_sum(x, axis=-1)
-
-
-def _choose(is_accepted,
-            accepted,
-            rejected,
-            independent_chain_ndims,
-            name=None):
-  """Helper to `kernel` which expand_dims `is_accepted` to apply tf.where."""
-  def _expand_is_accepted_like(x):
-    with ops.name_scope("_choose"):
-      expand_shape = array_ops.concat([
-          array_ops.shape(is_accepted),
-          array_ops.ones([array_ops.rank(x) - array_ops.rank(is_accepted)],
-                         dtype=dtypes.int32),
-      ], axis=0)
-      multiples = array_ops.concat([
-          array_ops.ones([array_ops.rank(is_accepted)], dtype=dtypes.int32),
-          array_ops.shape(x)[independent_chain_ndims:],
-      ], axis=0)
-      m = array_ops.tile(array_ops.reshape(is_accepted, expand_shape),
-                         multiples)
-      m.set_shape(x.shape)
-      return m
-  with ops.name_scope(name, "_choose", values=[
-      is_accepted, accepted, rejected, independent_chain_ndims]):
-    return array_ops.where(_expand_is_accepted_like(accepted),
-                           accepted,
-                           rejected)
-
-
-def _maybe_call_fn_and_grads(fn,
-                             fn_arg_list,
-                             fn_result=None,
-                             grads_fn_result=None,
-                             description="target_log_prob"):
-  """Helper which computes `fn_result` and `grads` if needed."""
-  fn_arg_list = (list(fn_arg_list) if _is_list_like(fn_arg_list)
-                 else [fn_arg_list])
-  if fn_result is None:
-    fn_result = fn(*fn_arg_list)
-  if not fn_result.dtype.is_floating:
-    raise TypeError("`{}` must be a `Tensor` with `float` `dtype`.".format(
-        description))
-  if grads_fn_result is None:
-    grads_fn_result = gradients_ops.gradients(
-        fn_result, fn_arg_list)
-  if len(fn_arg_list) != len(grads_fn_result):
-    raise ValueError("`{}` must be in one-to-one correspondence with "
-                     "`grads_{}`".format(*[description]*2))
-  if any(g is None for g in grads_fn_result):
-    raise ValueError("Encountered `None` gradient.")
-  return fn_result, grads_fn_result
-
-
-def _prepare_args(target_log_prob_fn, state, step_size,
-                  target_log_prob=None, grads_target_log_prob=None,
-                  maybe_expand=False, description="target_log_prob"):
-  """Helper which processes input args to meet list-like assumptions."""
-  state_parts = list(state) if _is_list_like(state) else [state]
-  state_parts = [ops.convert_to_tensor(s, name="state")
-                 for s in state_parts]
-  target_log_prob, grads_target_log_prob = _maybe_call_fn_and_grads(
-      target_log_prob_fn,
-      state_parts,
-      target_log_prob,
-      grads_target_log_prob,
-      description)
-  step_sizes = list(step_size) if _is_list_like(step_size) else [step_size]
-  step_sizes = [
-      ops.convert_to_tensor(
-          s, name="step_size", dtype=target_log_prob.dtype)
-      for s in step_sizes]
-  if len(step_sizes) == 1:
-    step_sizes *= len(state_parts)
-  if len(state_parts) != len(step_sizes):
-    raise ValueError("There should be exactly one `step_size` or it should "
-                     "have same length as `current_state`.")
-  if maybe_expand:
-    maybe_flatten = lambda x: x
-  else:
-    maybe_flatten = lambda x: x if _is_list_like(state) else x[0]
-  return [
-      maybe_flatten(state_parts),
-      maybe_flatten(step_sizes),
-      target_log_prob,
-      grads_target_log_prob,
-  ]
-
-
-def _is_list_like(x):
-  """Helper which returns `True` if input is `list`-like."""
-  return isinstance(x, (tuple, list))
-
-
-def _log_sum_sq(x, axis=None):
-  """Computes log(sum(x**2))."""
-  return math_ops.reduce_logsumexp(2. * math_ops.log(math_ops.abs(x)), axis)
+  with ops.name_scope(name, 'leapfrog_step', [step_size, position, momentum,
+                                              grad]):
+    momentum -= 0.5 * step_size * grad
+    position += step_size * momentum
+    potential, grad = potential_and_grad(position)
+    momentum -= 0.5 * step_size * grad
+
+  return position, momentum, potential, grad
-- 
GitLab


From d0c2b84c2d6cd3c235671e67f22e3639b4b2b54c Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Mon, 5 Feb 2018 11:19:46 -0800
Subject: [PATCH 653/966] Remove LICENSE file. It will be added internally
 instead.

---
 third_party/tensorrt/LICENSE | 203 -----------------------------------
 1 file changed, 203 deletions(-)
 delete mode 100644 third_party/tensorrt/LICENSE

diff --git a/third_party/tensorrt/LICENSE b/third_party/tensorrt/LICENSE
deleted file mode 100644
index 146d9b765c..0000000000
--- a/third_party/tensorrt/LICENSE
+++ /dev/null
@@ -1,203 +0,0 @@
-Copyright 2018 The TensorFlow Authors.  All rights reserved.
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright 2018, The TensorFlow Authors.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-- 
GitLab


From 13fb1cf9942839a13683f946d2058697163831e8 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Mon, 5 Feb 2018 11:23:40 -0800
Subject: [PATCH 654/966] Support parsing from text and fused op in contrib.

PiperOrigin-RevId: 184558131
---
 tensorflow/python/grappler/cost_analyzer_tool.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/grappler/cost_analyzer_tool.py b/tensorflow/python/grappler/cost_analyzer_tool.py
index 61dc4e2afb..ac251f2bbd 100644
--- a/tensorflow/python/grappler/cost_analyzer_tool.py
+++ b/tensorflow/python/grappler/cost_analyzer_tool.py
@@ -22,7 +22,7 @@ import argparse
 import sys
 
 from google.protobuf import text_format
-
+from tensorflow.contrib.fused_conv.ops import gen_fused_conv2d_bias_activation_op  # pylint: disable=unused-import
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
@@ -43,7 +43,10 @@ def main(_):
   else:
     with gfile.GFile(FLAGS.graphdef) as graph_file:
       graph_def = graph_pb2.GraphDef()
-      graph_def.ParseFromString(graph_file.read())
+      if FLAGS.graphdef.endswith(".pbtxt"):
+        text_format.Merge(graph_file.read(), graph_def)
+      else:
+        graph_def.ParseFromString(graph_file.read())
       importer.import_graph_def(graph_def, name="")
       graph = ops.get_default_graph()
       fetch = graph.get_operation_by_name(FLAGS.fetch)
-- 
GitLab


From 2eacd726ed2317cb1c0887ed58d4bde935608274 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Mon, 5 Feb 2018 12:08:36 -0800
Subject: [PATCH 655/966] Serialize the evaluation of the AssignAdd nodes to
 make the result more deterministic Improved testing

PiperOrigin-RevId: 184565483
---
 tensorflow/core/grappler/optimizers/BUILD     |  3 ++
 .../optimizers/constant_folding_test.cc       | 19 +-------
 .../grappler/optimizers/memory_optimizer.cc   |  1 +
 .../optimizers/memory_optimizer_test.cc       | 45 ++++++++++++++-----
 tensorflow/core/grappler/utils/BUILD          | 20 +++++++++
 .../core/grappler/utils/grappler_test.cc      | 39 ++++++++++++++++
 .../core/grappler/utils/grappler_test.h       | 37 +++++++++++++++
 7 files changed, 135 insertions(+), 29 deletions(-)
 create mode 100644 tensorflow/core/grappler/utils/grappler_test.cc
 create mode 100644 tensorflow/core/grappler/utils/grappler_test.h

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 8b9885e4c1..2ac31ebf6a 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -125,6 +125,7 @@ tf_cc_test(
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -301,11 +302,13 @@ tf_cc_test(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 849a88770a..46998dcc91 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -20,30 +20,15 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-class ConstantFoldingTest : public ::testing::Test {
- protected:
-  std::vector<Tensor> EvaluateNodes(const GraphDef& graph,
-                                    const std::vector<string>& fetch) {
-    SessionOptions options;
-    std::unique_ptr<tensorflow::Session> session(NewSession(options));
-    TF_CHECK_OK(session->Create(graph));
-    RunOptions run_options;
-    std::vector<Tensor> output_tensors;
-    TF_CHECK_OK(
-        session->Run(run_options, {}, fetch, fetch, &output_tensors, nullptr));
-    TF_CHECK_OK(session->Close());
-    return output_tensors;
-  }
-};
+class ConstantFoldingTest : public GrapplerTest {};
 
 TEST_F(ConstantFoldingTest, SimpleFolding) {
   // Build a simple graph with a few trivially prunable ops.
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index ffa03db262..be18e7d6f0 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -655,6 +655,7 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
         accumulate->set_op("AssignAdd");
         accumulate->set_device(device);
         (*accumulate->mutable_attr())["T"].set_type(dtype);
+        (*accumulate->mutable_attr())["use_locking"].set_b(true);
         *accumulate->add_input() = initialize->name();
         *accumulate->add_input() = input;
         accumulates.push_back(accumulate);
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index f5d9c87992..5d7913e0c0 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -19,17 +19,18 @@ limitations under the License.
 
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-class RecomputeSubgraphTest : public ::testing::Test {};
+class RecomputeSubgraphTest : public GrapplerTest {};
 
 TEST_F(RecomputeSubgraphTest, SimpleSubgraph) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -193,7 +194,7 @@ TEST_F(RecomputeSubgraphTest, MultiNode) {
   EXPECT_EQ("^gradients/BN1Grad", recompute_trigger_c->input(0));
 }
 
-class MemoryOptimizerTest : public ::testing::Test {
+class MemoryOptimizerTest : public GrapplerTest {
  public:
   static std::unique_ptr<VirtualCluster> CreateVirtualCluster() {
     DeviceProperties cpu_device;
@@ -201,6 +202,7 @@ class MemoryOptimizerTest : public ::testing::Test {
     cpu_device.set_frequency(1000);
     cpu_device.set_num_cores(4);
     cpu_device.set_bandwidth(32);
+    cpu_device.set_memory_size(1024 * 1024);
     DeviceProperties gpu_device;
     gpu_device.set_type("GPU");
     gpu_device.set_frequency(1000);
@@ -346,17 +348,18 @@ TEST_F(MemoryOptimizerTest, UnswappableInputs) {
 
 TEST_F(MemoryOptimizerTest, AccumulationRewrites) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output a = ops::Variable(s.WithOpName("a").WithDevice("/gpu:0"),
-                           {128, 128, 8}, DT_FLOAT);
-  Output b = ops::Variable(s.WithOpName("b").WithDevice("/gpu:0"),
-                           {128, 128, 8}, DT_FLOAT);
-  Output c = ops::Variable(s.WithOpName("c").WithDevice("/gpu:0"),
-                           {128, 128, 8}, DT_FLOAT);
-  Output d = ops::AddN(s.WithOpName("d").WithDevice("/gpu:0"), {a, b, c});
+  Output a = ops::RandomNormal(s.WithOpName("a").WithDevice("/cpu:0"),
+                               {128, 128, 8}, DT_FLOAT);
+  Output b = ops::RandomNormal(s.WithOpName("b").WithDevice("/cpu:0"),
+                               {128, 128, 8}, DT_FLOAT);
+  Output c = ops::RandomNormal(s.WithOpName("c").WithDevice("/cpu:0"),
+                               {128, 128, 8}, DT_FLOAT);
+  Output d = ops::AddN(s.WithOpName("d").WithDevice("/cpu:0"), {a, b, c});
+  Output e = ops::Square(s.WithOpName("e").WithDevice("/cpu:0"), d);
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  item.fetch = {"d"};
+  item.fetch = {"e"};
 
   std::unique_ptr<VirtualCluster> cluster(CreateVirtualCluster());
   MemoryOptimizer optimizer(RewriterConfig::SCHEDULING_HEURISTICS);
@@ -375,9 +378,27 @@ TEST_F(MemoryOptimizerTest, AccumulationRewrites) {
     } else if (node.name() == "d/tmp_var") {
       EXPECT_EQ("TemporaryVariable", node.op());
       count++;
+    } else if (node.name() == "e") {
+      EXPECT_EQ("Square", node.op());
+      EXPECT_EQ("d", node.input(0));
+      count++;
+    }
+  }
+  EXPECT_EQ(4, count);
+
+  std::vector<string> fetch = {"a", "b", "c", "e"};
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(4, tensors.size());
+
+  for (int i = 0; i < tensors[0].NumElements(); ++i) {
+    float actual = tensors[3].flat<float>()(i);
+    float expected = 0.0f;
+    for (int j = 0; j < 3; ++j) {
+      expected += tensors[j].flat<float>()(i);
     }
+    expected *= expected;
+    EXPECT_NEAR(actual, expected, 1e-4);
   }
-  EXPECT_EQ(3, count);
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 137d51790d..0a9dbe22cf 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -125,3 +125,23 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
+
+cc_library(
+    name = "grappler_test",
+    testonly = 1,
+    srcs = [
+        "grappler_test.cc",
+    ],
+    hdrs = ["grappler_test.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:direct_session",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
new file mode 100644
index 0000000000..813f65f825
--- /dev/null
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include <memory>
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace grappler {
+
+std::vector<Tensor> GrapplerTest::EvaluateNodes(
+    const GraphDef& graph, const std::vector<string>& node_names) {
+  SessionOptions options;
+  std::unique_ptr<tensorflow::Session> session(NewSession(options));
+  TF_CHECK_OK(session->Create(graph));
+  RunOptions run_options;
+  std::vector<Tensor> output_tensors;
+  TF_CHECK_OK(session->Run(run_options, {}, node_names, node_names,
+                           &output_tensors, nullptr));
+  TF_CHECK_OK(session->Close());
+  return output_tensors;
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
new file mode 100644
index 0000000000..46ce47c8c3
--- /dev/null
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_GRAPPLER_TEST_H_
+#define TENSORFLOW_GRAPPLER_GRAPPLER_TEST_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class GrapplerTest : public ::testing::Test {
+ protected:
+  std::vector<Tensor> EvaluateNodes(const GraphDef& graph,
+                                    const std::vector<string>& node_names);
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_GRAPPLER_TEST_H_
-- 
GitLab


From 11167ab9f4a27c0ec65c53698d66c710f0801194 Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Mon, 5 Feb 2018 12:15:10 -0800
Subject: [PATCH 656/966] [TF:XLA] Making constant folding deterministic.

Making constant folding deterministic by doing DFS deterministically and inserting a serialization point based on nodes' names.

This is the last source of non-determinism remaining in the TF:XLA stack.

RELNOTES: Constant folding pass is now deterministic.
PiperOrigin-RevId: 184566644
---
 .../core/common_runtime/constant_folding.cc   | 53 ++++++++++++-------
 .../core/common_runtime/constant_folding.h    | 10 ++++
 .../common_runtime/constant_folding_test.cc   | 52 ++++++++++++++++++
 .../core/common_runtime/function_test.cc      |  6 +--
 4 files changed, 99 insertions(+), 22 deletions(-)

diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index 0398c2a60d..b5a51d2526 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -328,7 +328,8 @@ void FindConstantFoldableNodes(
                ConsiderConstantFoldableNode(
                    n, opts, nodes, constant_control_deps, shape_replacement_map,
                    &internal_node_inserted);
-             });
+             },
+             NodeComparatorName());
   // If we have inserted just leaf level nodes, then there is nothing to fold.
   if (!internal_node_inserted) {
     nodes->clear();
@@ -339,8 +340,8 @@ void FindConstantFoldableNodes(
 typedef std::pair<Node*, int> NodeAndOutput;
 
 int64 UniqueConstantId() {
-  static std::atomic_int_fast64_t id;
-  return id.fetch_add(1);
+  static std::atomic_int_fast64_t unique_constant_id;
+  return unique_constant_id.fetch_add(1);
 }
 
 // Adds n to constant_graph which is being built up for subsequent evaluation of
@@ -386,14 +387,12 @@ void AddShapeNodeToConstantGraph(
     const std::unordered_map<const Node*, std::vector<Tensor>>&
         shape_replacement_map,
     std::unordered_map<Node*, std::vector<Node*>>* node_map,
-    Graph* constant_graph) {
+    const ConstantFoldNameGenerator& generate_new_name, Graph* constant_graph) {
   std::vector<Node*>& added = (*node_map)[n];
   const string& node_name = n->name();
   for (const Tensor& t : shape_replacement_map.at(n)) {
     auto builder =
-        NodeDefBuilder(strings::StrCat(constant_graph->NewName(node_name),
-                                       "__cf__", UniqueConstantId()),
-                       "Const")
+        NodeDefBuilder(generate_new_name(constant_graph, node_name), "Const")
             .Attr("dtype", t.dtype())
             .Attr("value", t);
     NodeDef def;
@@ -414,7 +413,8 @@ Graph* GetConstantGraph(
     const Graph* orig_graph, const std::vector<Node*>& nodes,
     const std::unordered_map<const Node*, std::vector<Tensor>>&
         shape_replacement_map,
-    std::map<NodeAndOutput, Node*>* tensors_to_fetch) {
+    std::map<NodeAndOutput, Node*>* tensors_to_fetch,
+    const ConstantFoldNameGenerator& generate_new_name) {
   Graph* constant_graph = new Graph(orig_graph->op_registry());
   std::unordered_map<Node*, std::vector<Node*>> node_map;
   node_map[orig_graph->source_node()] = {constant_graph->source_node()};
@@ -424,7 +424,7 @@ Graph* GetConstantGraph(
       AddNodeToConstantGraph(n, &node_map, constant_graph);
     } else {
       AddShapeNodeToConstantGraph(n, shape_replacement_map, &node_map,
-                                  constant_graph);
+                                  generate_new_name, constant_graph);
     }
   }
 
@@ -458,10 +458,11 @@ Graph* GetConstantGraph(
 // replacement was successful, false otherwise.
 // 'control_deps' is the set of nodes that should be control predecessors of the
 // new constant node.
-bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
-                               NodeAndOutput tensor, const Tensor& constant,
-                               const gtl::FlatSet<Node*>& control_deps,
-                               int64 max_constant_size_in_bytes) {
+bool ReplaceTensorWithConstant(
+    Graph* graph, Device* partition_device, NodeAndOutput tensor,
+    const Tensor& constant, const gtl::FlatSet<Node*>& control_deps,
+    int64 max_constant_size_in_bytes,
+    const ConstantFoldNameGenerator& generate_new_name) {
   // Be conservative when replacing a tensor with a constant, when not
   // running on CPU.
   // 1) If the destination tensor is not an int32 tensor, and has HOST_MEMORY
@@ -509,9 +510,7 @@ bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
   }
   const string& node_name = n->name();
   Node* constant_node;
-  auto builder = NodeDefBuilder(strings::StrCat(graph->NewName(node_name),
-                                                "__cf__", UniqueConstantId()),
-                                "Const")
+  auto builder = NodeDefBuilder(generate_new_name(graph, node_name), "Const")
                      .Attr("dtype", constant.dtype())
                      .Attr("value", constant);
   if (partition_device) {
@@ -555,6 +554,13 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
                     FunctionLibraryRuntime* function_library, Env* env,
                     Device* partition_device, Graph* graph, bool* was_mutated) {
   DumpGraph("Before", graph);
+  ConstantFoldNameGenerator generate_new_name = opts.generate_new_name;
+  if (generate_new_name == nullptr) {
+    generate_new_name = [](Graph* graph, string old_name) {
+      return strings::StrCat(graph->NewName(old_name), "__cf__",
+                             UniqueConstantId());
+    };
+  }
 
   std::vector<Node*> constant_foldable_nodes;
   std::unordered_map<const Node*, gtl::FlatSet<Node*>> constant_control_deps;
@@ -571,7 +577,7 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
   std::map<NodeAndOutput, Node*> tensors_to_fetch;
   std::unique_ptr<Graph> constant_graph(
       GetConstantGraph(graph, constant_foldable_nodes, shape_replacement_map,
-                       &tensors_to_fetch));
+                       &tensors_to_fetch, generate_new_name));
   DumpGraph("Constant graph", constant_graph.get());
 
   if (tensors_to_fetch.empty()) {
@@ -585,7 +591,16 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
 
   std::vector<string> tensors_to_fetch_names;
   std::vector<NodeAndOutput> tensors_to_replace;
-  for (auto n : tensors_to_fetch) {
+  // Sorting the nodes based on the name gives us a stable ordering between runs
+  // for the same graph.
+  std::vector<std::pair<NodeAndOutput, Node*>> tensors_to_fetch_sorted(
+      tensors_to_fetch.begin(), tensors_to_fetch.end());
+  std::sort(tensors_to_fetch_sorted.begin(), tensors_to_fetch_sorted.end(),
+            [](const std::pair<NodeAndOutput, Node*>& n1,
+               const std::pair<NodeAndOutput, Node*>& n2) {
+              return n1.first.first->name() < n2.first.first->name();
+            });
+  for (auto n : tensors_to_fetch_sorted) {
     tensors_to_fetch_names.push_back(
         strings::StrCat(n.first.first->name(), ":", n.first.second));
     tensors_to_replace.push_back({n.second, n.first.second});
@@ -617,7 +632,7 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
         constant_control_deps[tensors_to_replace[c].first];
     if (ReplaceTensorWithConstant(
             graph, partition_device, tensors_to_replace[c], outputs[c],
-            control_deps, opts.max_constant_size_in_bytes)) {
+            control_deps, opts.max_constant_size_in_bytes, generate_new_name)) {
       ++num_nodes_replaced;
     }
   }
diff --git a/tensorflow/core/common_runtime/constant_folding.h b/tensorflow/core/common_runtime/constant_folding.h
index e4d724c58a..b1e1fb8319 100644
--- a/tensorflow/core/common_runtime/constant_folding.h
+++ b/tensorflow/core/common_runtime/constant_folding.h
@@ -24,6 +24,11 @@ limitations under the License.
 
 namespace tensorflow {
 
+// This generator type is used to generate a name for the newly folded node
+// based on the node's old name.
+using ConstantFoldNameGenerator =
+    std::function<string(Graph* graph, string old_name)>;
+
 // Options specific to constant folding optimizations.
 struct ConstantFoldingOptions {
   // If "consider" is not a nullptr, then only constant fold a node "n" if
@@ -37,6 +42,11 @@ struct ConstantFoldingOptions {
   // The maximum size of each constant created during constant folding
   // optimization.
   int64 max_constant_size_in_bytes = 10 * 1024 * 1024;
+
+  // A generator for the name suffix of constant folded nodes. A
+  // default id generator that monotonically increases is used if nullptr is
+  // passed.
+  ConstantFoldNameGenerator generate_new_name = nullptr;
 };
 
 // Perform constant folding optimization on "graph".
diff --git a/tensorflow/core/common_runtime/constant_folding_test.cc b/tensorflow/core/common_runtime/constant_folding_test.cc
index 923a4d9249..6ac9319ad1 100644
--- a/tensorflow/core/common_runtime/constant_folding_test.cc
+++ b/tensorflow/core/common_runtime/constant_folding_test.cc
@@ -121,6 +121,58 @@ TEST_F(ConstantFoldingTest, Basic) {
                          {2, 2});
 }
 
+// Tests that different node creation ordering creates same graph after constant
+// folding.
+TEST_F(ConstantFoldingTest, DeterministicFolding) {
+  auto build_graph_and_constant_folding = [](Graph& g, bool swap) -> Status {
+    Scope s = Scope::NewRootScope();
+    auto a = ops::Const<float>(s, {1.0}, {});
+    auto b = ops::Const<float>(s, {2.0}, {});
+
+    if (swap) {
+      auto add1 = ops::Add(s.WithOpName("add1"), a, b);
+      auto add2 = ops::Add(s.WithOpName("add2"), a, b);
+      auto s1 =
+          ops::_Send(s.WithOpName("s1"), add1, "add1", "sender", 0, "receiver");
+      auto s2 =
+          ops::_Send(s.WithOpName("s2"), add2, "add2", "sender", 0, "receiver");
+    } else {
+      // Swap the order of node creation.
+      auto add2 = ops::Add(s.WithOpName("add2"), a, b);
+      auto add1 = ops::Add(s.WithOpName("add1"), a, b);
+      auto s1 =
+          ops::_Send(s.WithOpName("s1"), add1, "add1", "sender", 0, "receiver");
+      auto s2 =
+          ops::_Send(s.WithOpName("s2"), add2, "add2", "sender", 0, "receiver");
+    }
+
+    TF_CHECK_OK(s.ToGraph(&g));
+    bool was_mutated;
+    int64 unique_id = 0;
+    auto generate_new_name = [&unique_id](Graph* graph, string old_name) {
+      return strings::StrCat(graph->NewName(old_name), "__cf__", unique_id++);
+    };
+    ConstantFoldingOptions opt{};
+    opt.generate_new_name = generate_new_name;
+    TF_CHECK_OK(
+        ConstantFold(opt, nullptr, Env::Default(), nullptr, &g, &was_mutated));
+    return Status::OK();
+  };
+
+  Graph g1(OpRegistry::Global());
+  TF_ASSERT_OK(build_graph_and_constant_folding(g1, false));
+  Graph g2(OpRegistry::Global());
+  TF_ASSERT_OK(build_graph_and_constant_folding(g2, true));
+  EXPECT_EQ(g1.num_nodes(), g2.num_nodes());
+  auto index = NodeNameIndex(g2);
+
+  // All the nodes in g1 are expected to be present in g2.
+  for (int64 i = 0; i < g1.num_nodes(); ++i) {
+    Node* n1 = g1.FindNodeId(i);
+    EXPECT_GT(index.count(n1->name()), 0);
+  }
+}
+
 TEST_F(ConstantFoldingTest, ConsiderFunction) {
   Scope s = Scope::NewRootScope();
   BuildSimpleGraph(&s);
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index cad3b3801e..8b05146299 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -787,7 +787,7 @@ TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
     Scope s = Scope::NewRootScope();
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto x4_x2_scale = ops::Const<float>(
-        s.WithOpName("x4/x2/scale/_15__cf__9")
+        s.WithOpName("x4/x2/scale/_12__cf__6")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
@@ -993,13 +993,13 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
     auto scale = ops::Const(
-        s.WithOpName("scale/_5__cf__10")
+        s.WithOpName("scale/_6__cf__11")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto func1_gx = ops::Mul(s.WithOpName("Func/_1/gx"), func0, scale);
     auto func1_sx = ops::Shape(s.WithOpName("Func/_1/sx"), x);
     auto const0 = ops::Const(
-        s.WithOpName("Func/_1/sy/_6__cf__11")
+        s.WithOpName("Func/_1/sy/_5__cf__10")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         0, {0});
     auto func1_rx = ops::internal::BroadcastGradientArgs(
-- 
GitLab


From fc8d9c38692d3dddf474dba7e43c666105c08a3d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 12:16:04 -0800
Subject: [PATCH 657/966] Bug fix: Don't dereference nullptr in
 OpKernelContext::input_alloc_attr().

PiperOrigin-RevId: 184566770
---
 tensorflow/core/framework/op_kernel.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index a3dc96b3de..c45026c6af 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -909,9 +909,13 @@ class OpKernelContext {
   }
 
   AllocatorAttributes input_alloc_attr(int index) const {
-    DCHECK_GE(index, 0);
-    DCHECK_LT(index, params_->input_alloc_attrs->size());
-    return (*params_->input_alloc_attrs)[index];
+    if (params_->input_alloc_attrs == nullptr) {
+      return AllocatorAttributes();
+    } else {
+      DCHECK_GE(index, 0);
+      DCHECK_LT(index, params_->input_alloc_attrs->size());
+      return (*params_->input_alloc_attrs)[index];
+    }
   }
 
   AllocatorAttributes output_alloc_attr(int index) const {
-- 
GitLab


From d0904cbe01c88332acb4faa8bede21adb5fa1de7 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Mon, 5 Feb 2018 12:40:51 -0800
Subject: [PATCH 658/966] contrib/rnn: Fix #16703 (Bug introduced in
 https://github.com/tensorflow/tensorflow/commit/3f579020bab8f00e4621e9c7c740cbf13136a809)

Kudos to @akhti for pointing this out.

PiperOrigin-RevId: 184570448
---
 tensorflow/contrib/rnn/python/ops/rnn_cell.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 8adf5dce6e..eb8fd0c1cd 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -2285,7 +2285,7 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
     else:
       self._state_size = rnn_cell_impl.LSTMStateTuple(num_units, num_units)
       self._output_size = num_units
-    self._linear1 = None
+    self._linear1 = [None] * number_of_groups
     self._linear2 = None
 
   @property
@@ -2359,9 +2359,11 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
                                             self._group_shape[0])
               ],
               axis=1)
-          if self._linear1 is None:
-            self._linear1 = _Linear(x_g_id, 4 * self._group_shape[1], False)
-          R_k = self._linear1(x_g_id)  # pylint: disable=invalid-name
+          linear = self._linear1[group_id]
+          if linear is None:
+            linear = _Linear(x_g_id, 4 * self._group_shape[1], False)
+            self._linear1[group_id] = linear
+          R_k = linear(x_g_id)  # pylint: disable=invalid-name
           i_k, j_k, f_k, o_k = array_ops.split(R_k, 4, 1)
 
         i_parts.append(i_k)
-- 
GitLab


From f8f921c828fb2c97da7c7b80c01390ccec90ae40 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 5 Feb 2018 13:03:53 -0800
Subject: [PATCH 659/966] Fixes issue where external control dependencies in
 while loops are dropped.

Fixes #15891

PiperOrigin-RevId: 184573795
---
 .../kernel_tests/control_flow_ops_py_test.py  | 30 +++++++++++++++++++
 tensorflow/python/ops/control_flow_ops.py     | 25 ++++++++++------
 2 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 4fafc36014..15ff0ec09b 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -704,6 +704,36 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
       self.assertEqual(10000, r.eval())
 
+  def testWhileExternalControlDependencies(self):
+    with self.test_session():
+      v = variables.Variable(0.0)
+      v.initializer.run()
+      increment = v.assign_add(1.0)
+
+      def body_fn(i):
+        with ops.control_dependencies([increment]):
+          return i + i
+
+      result = control_flow_ops.while_loop(cond=lambda i: i < 1,
+                                           body=body_fn, loop_vars=[1])
+      result.eval()
+      self.assertAllEqual(v.eval(), 1.0)
+
+  def testWhileExternalControlDependenciesNoInput(self):
+    with self.test_session():
+      v = variables.Variable(0.0)
+      v.initializer.run()
+      increment = v.assign_add(1.0)
+
+      def body_fn(unused_i):
+        with ops.control_dependencies([increment]):
+          return constant_op.constant(5, name="five")
+
+      result = control_flow_ops.while_loop(cond=lambda i: i < 5,
+                                           body=body_fn, loop_vars=[0])
+      result.eval()
+      self.assertAllEqual(v.eval(), 1.0)
+
   def testWhileWithRefs_1(self):
     with self.test_session() as sess:
       x = variables.Variable(0)._ref()  # pylint: disable=protected-access
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index bcd187d821..87ff0aba0a 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1631,10 +1631,13 @@ class ControlFlowContext(object):
         ctxt = util.GetOutputContext(x)
         if ctxt is not None and ctxt.GetWhileContext() == while_ctxt:
           internal_control_inputs.append(x)
+    external_control_inputs = []
     if len(internal_control_inputs) != len(op.control_inputs):
+      external_control_inputs = list(set(op.control_inputs)
+                                     - set(internal_control_inputs))
       op._remove_all_control_inputs()
       op._add_control_inputs(internal_control_inputs)
-    return internal_control_inputs
+    return internal_control_inputs, external_control_inputs
 
   # pylint: enable=protected-access
 
@@ -2432,14 +2435,12 @@ class WhileContext(ControlFlowContext):
   def _AddOpInternal(self, op):
     """Add `op` to the current context.
 
-    In the case that op has only external data inputs, we remove all of its
-    external control inputs so all its inputs are in the same while loop
-    context. This is valid because op now has an Enter input that has all
-    the right control dependency.
+    We move any external control dependencies of the op to the loop pivot, to
+    ensure they get executed.
     """
     if not op.inputs:
       # Remove any external control dependency on this op
-      control_inputs = self._RemoveExternalControlEdges(op)
+      control_inputs, external_inputs = self._RemoveExternalControlEdges(op)
       # Add a control edge from the control pivot to this op.
       if not control_inputs:
         # pylint: disable=protected-access
@@ -2452,14 +2453,20 @@ class WhileContext(ControlFlowContext):
         x = op.inputs[index]
         real_x = self.AddValue(x)
         if real_x != x:
-          op._update_input(index, real_x)
-      # Remove any external control dependency on this op.
-      self._RemoveExternalControlEdges(op)
+          op._update_input(index, real_x)  # pylint: disable=protected-access
+      # Remove any external control dependency on this op and move then to an
+      # Enter node.
+      _, external_inputs = self._RemoveExternalControlEdges(op)
       # Add a control dependency to prevent loop invariants from
       # enabling ops that should not be executed.
       self._MaybeAddControlDependency(op)
       for x in op.outputs:
         self._values.add(x.name)
+    if external_inputs:
+      # Make the pivot depend on external control inputs
+      pred = self._pivot_for_pred.op.inputs[0]
+      assert util.IsLoopEnter(pred.op)
+      pred.op._add_control_inputs(external_inputs)  # pylint: disable=protected-access
     if self._outer_context or not util.IsLoopExit(op):
       op.graph.prevent_fetching(op)
       for x in op.outputs:
-- 
GitLab


From 1f915bf88b282aae50da6fe3b4204df5509d0542 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 5 Feb 2018 13:43:21 -0800
Subject: [PATCH 660/966] Fix the incorrect link to vulnerability reporting
 (#16778)

This fix fixes the incorrect link to vulnerability reporting.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/docs_src/community/welcome.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md
index d2d3f9edae..9f6fe91b14 100644
--- a/tensorflow/docs_src/community/welcome.md
+++ b/tensorflow/docs_src/community/welcome.md
@@ -65,5 +65,5 @@ please read the following list carefully:
     on GitHub.  For example, use the issue tracker to request a
     new operation in TensorFlow.
   * To report vulnerabilities, please follow our
-    [vulnerability disclosure guidelines](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md).
+    [vulnerability disclosure guidelines](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/SECURITY.md).
 
-- 
GitLab


From b3360e040602a2aa46fb9e27c7af940d651a704b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 13:43:08 -0800
Subject: [PATCH 661/966] [XLA] Add tests for Clamp of S32 and U32 vectors with
 broadcasted scalars.

PiperOrigin-RevId: 184579375
---
 .../xla/tests/array_elementwise_ops_test.cc   | 41 +++++++++++++++++--
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 52e14a1f7b..b788631fa3 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -1879,17 +1879,16 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) {
   auto min_scalar = builder.ConstantR0<float>(0.0f);
   auto min_vector = builder.ConstantR1<float>({1.0f, -6.5f, 1.0f, 2.25f, 0.0f});
   auto arg_vector = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 4.0f});
-  auto arg_scalar = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 4.0f});
   auto max_scalar = builder.ConstantR0<float>(3.0f);
   auto max_vector = builder.ConstantR1<float>({3.0f, 0.5f, 25.5f, 5.0f, 123.0});
   // Perform clamp with broadcasted scalar and vector.
   auto clamp = builder.Add(
       builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
                   builder.Clamp(min_scalar, arg_vector, max_vector)),
-      builder.Add(builder.Clamp(min_vector, arg_scalar, max_vector),
-                  builder.Clamp(min_scalar, arg_scalar, max_vector)));
+      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
 
-  ComputeAndCompareR1<float>(&builder, {8.0f, 4.5f, 2.0f, 6.5f, 15.0f}, {},
+  ComputeAndCompareR1<float>(&builder, {8.0f, 7.0f, 2.0f, 6.5f, 14.0f}, {},
                              error_spec_);
 }
 
@@ -1903,6 +1902,23 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClampS32Vector) {
   ComputeAndCompareR1<int32>(&builder, {2, 0, 1, 2, 4, -1}, {});
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClampS32ScalarVector) {
+  ComputationBuilder builder(client_, TestName());
+  auto min_scalar = builder.ConstantR0<int32>(0);
+  auto min_vector = builder.ConstantR1<int32>({1, -6, 1, 2, 0});
+  auto arg_vector = builder.ConstantR1<int32>({2, 10, -5, 1, 4});
+  auto max_scalar = builder.ConstantR0<int32>(3);
+  auto max_vector = builder.ConstantR1<int32>({3, 1, 25, 5, 123});
+  // Perform clamp with broadcasted scalar and vector.
+  auto clamp = builder.Add(
+      builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
+                  builder.Clamp(min_scalar, arg_vector, max_vector)),
+      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
+
+  ComputeAndCompareR1<int32>(&builder, {8, 8, 2, 6, 14}, {});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, ClampU32Vector) {
   ComputationBuilder builder(client_, TestName());
   auto min_vector = builder.ConstantR1<uint32>({1, 2, 1, 2, 0, ~0u - 4});
@@ -1913,6 +1929,23 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClampU32Vector) {
   ComputeAndCompareR1<uint32>(&builder, {2, 5, 5, 2, 4, ~0u - 4}, {});
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClampU32ScalarVector) {
+  ComputationBuilder builder(client_, TestName());
+  auto min_scalar = builder.ConstantR0<uint32>(0);
+  auto min_vector = builder.ConstantR1<uint32>({1, 0, 1, 2, 0});
+  auto arg_vector = builder.ConstantR1<uint32>({2, 10, 0, 1, 4});
+  auto max_scalar = builder.ConstantR0<uint32>(3);
+  auto max_vector = builder.ConstantR1<uint32>({3, 1, 25, 5, 123});
+  // Perform clamp with broadcasted scalar and vector.
+  auto clamp = builder.Add(
+      builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
+                  builder.Clamp(min_scalar, arg_vector, max_vector)),
+      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
+
+  ComputeAndCompareR1<uint32>(&builder, {8, 8, 2, 6, 14}, {});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
   ComputationBuilder builder(client_, TestName());
 
-- 
GitLab


From 1bbfc0c9cebd1808fa46024f738e56d10d57d97e Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 5 Feb 2018 13:43:52 -0800
Subject: [PATCH 662/966] [tf.data] Fix use-after-free bug when closing down an
 input pipeline.

This fix affects the distributed runtime; DirectSession use is unaffected.

Before this change, an iterator that used a background prefetching
thread might attempt to use a captured FunctionLibraryRuntime from a
subgraph that had been deregistered (and hence its
FunctionLibraryRuntime would have been deleted). This change
introduces a mechanism for "cloning" the necessary parts of the
FunctionLibraryRuntime so that it can be owned by the
IteratorResource.

PiperOrigin-RevId: 184579490
---
 tensorflow/core/common_runtime/function.cc    | 19 ++++++++++++
 .../core/common_runtime/graph_optimizer.h     |  2 ++
 .../process_function_library_runtime.cc       | 12 ++++++++
 .../process_function_library_runtime.h        |  6 ++++
 tensorflow/core/framework/function.h          |  5 ++++
 tensorflow/core/kernels/data/iterator_ops.cc  |  7 +++--
 .../kernel_tests/iterator_ops_cluster_test.py | 30 +++++++++++++++++++
 7 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index e1b5404b79..d349d2bb12 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -182,6 +182,10 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
 
   string DebugString(Handle h) override;
 
+  Status Clone(std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+               std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr,
+               FunctionLibraryRuntime** out_flr) override;
+
  private:
   typedef FunctionLibraryRuntimeImpl ME;
 
@@ -894,6 +898,21 @@ string FunctionLibraryRuntimeImpl::DebugString(Handle handle) {
   }
 }
 
+Status FunctionLibraryRuntimeImpl::Clone(
+    std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+    std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr,
+    FunctionLibraryRuntime** out_flr) {
+  TF_RETURN_IF_ERROR(
+      parent_->Clone(env_, graph_def_version_, optimizer_.options(),
+                     custom_kernel_creator_, out_lib_def, out_pflr));
+  *out_flr = (*out_pflr)->GetFLR(device_->name());
+  if (out_flr != nullptr) {
+    return Status::OK();
+  } else {
+    return errors::Internal("Cloning FunctionLibraryRuntime failed.");
+  }
+}
+
 namespace {
 
 struct CustomCreatorSingleton {
diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h
index 8477cea126..80246281cd 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.h
+++ b/tensorflow/core/common_runtime/graph_optimizer.h
@@ -52,6 +52,8 @@ class GraphOptimizer {
           shape_map,
       const std::function<bool(const Node*)>& cse_consider_fn = nullptr);
 
+  const OptimizerOptions& options() { return opts_; }
+
  private:
   OptimizerOptions opts_;
 
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index dd4bf6a345..41e1ce8c15 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -350,4 +350,16 @@ void ProcessFunctionLibraryRuntime::Run(
   done(errors::Internal("Could not find device"));
 }
 
+Status ProcessFunctionLibraryRuntime::Clone(
+    Env* env, int graph_def_version, const OptimizerOptions& optimizer_options,
+    CustomKernelCreator custom_kernel_creator,
+    std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+    std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr) {
+  out_lib_def->reset(new FunctionLibraryDefinition(*lib_def_));
+  out_pflr->reset(new ProcessFunctionLibraryRuntime(
+      device_mgr_, env, graph_def_version, out_lib_def->get(),
+      optimizer_options, std::move(custom_kernel_creator), parent_));
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index 9c9c92f1ea..4296f9449f 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -145,6 +145,12 @@ class ProcessFunctionLibraryRuntime {
   // Removes handle from the state owned by this object.
   Status RemoveHandle(FunctionLibraryRuntime::Handle handle);
 
+  Status Clone(Env* env, int graph_def_version,
+               const OptimizerOptions& optimizer_options,
+               CustomKernelCreator custom_kernel_creator,
+               std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+               std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr);
+
   friend class FunctionLibraryRuntimeImpl;
 
   mutable mutex mu_;
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 7d0e15641d..e27001133b 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -35,6 +35,7 @@ namespace tensorflow {
 class CancellationManager;
 class GraphDef;
 class OpKernel;
+class ProcessFunctionLibraryRuntime;
 class ResourceMgr;
 class Rendezvous;
 class ScopedStepContainer;
@@ -535,6 +536,10 @@ class FunctionLibraryRuntime {
   virtual int graph_def_version() = 0;
 
   typedef uint64 LocalHandle;
+
+  virtual Status Clone(std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+                       std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr,
+                       FunctionLibraryRuntime** out_flr) = 0;
 };
 
 // Returns a canonicalized string for the instantiation of the
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index dd5f4a4554..8a420ac26d 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -459,7 +459,7 @@ class IteratorHandleOp : public OpKernel {
     {
       mutex_lock l(mu_);
       if (resource_ == nullptr) {
-        FunctionLibraryRuntime* lib = context->function_library();
+        FunctionLibraryRuntime* lib;
         std::unique_ptr<DeviceMgr> device_mgr(nullptr);
         std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
         std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
@@ -469,6 +469,9 @@ class IteratorHandleOp : public OpKernel {
         // is sufficient demand, but it will require a significant refactoring.
         if (!name_.empty()) {
           lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr);
+        } else {
+          OP_REQUIRES_OK(context, context->function_library()->Clone(
+                                      &flib_def, &pflr, &lib));
         }
 
         ResourceMgr* mgr = context->resource_manager();
@@ -538,7 +541,7 @@ class IteratorHandleOp : public OpKernel {
     // Wrap the existing device in order to see any captured resources
     // in its resource manager. The existing device will outlive the
     // IteratorResource, because we are storing the IteratorResource
-    // in that device's resourc manager.
+    // in that device's resource manager.
     Device* wrapped_device = RenamedDevice::NewRenamedDevice(
         ctx->device()->name(), down_cast<Device*>(ctx->device()),
         false /* owns_underlying */, false /* isolate_session_state */);
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
index 2c65c49ebd..25c91b42dc 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
@@ -30,6 +32,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
@@ -140,6 +143,33 @@ class IteratorClusterTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testImplicitDisposeParallelMapDataset(self):
+    # Tests whether a parallel map dataset will be cleaned up correctly when
+    # the pipeline does not run it until exhaustion.
+    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
+    # RepeatDataset(None) -> PrefetchDataset(100).
+    worker, _ = test_util.create_local_cluster(1, 1)
+
+    components = (np.arange(1000),
+                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
+                  np.array(37.0) * np.arange(1000))
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+        .repeat(None).prefetch(10000))
+
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with session.Session(worker[0].target) as sess:
+      sess.run(init_op)
+      for _ in range(3):
+        sess.run(get_next)
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From a2d007b6bcdc76488692310e4c12ae010b6c9d32 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 5 Feb 2018 14:01:26 -0800
Subject: [PATCH 663/966] Fix incorrect reference DOI number/link for GDR
 (#16734)

This fix fixes the incorrect reference DOI number/link for GDR:
`https://doi.org/10.1145/3123878.3123907` -> `https://doi.org/10.1145/3123878.3131975`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/gdr/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/gdr/README.md b/tensorflow/contrib/gdr/README.md
index 34ce60b360..8242d93f12 100644
--- a/tensorflow/contrib/gdr/README.md
+++ b/tensorflow/contrib/gdr/README.md
@@ -119,4 +119,4 @@ In the original design (as in the reference), tensor buffers are only registered
 Reference
 ===
 
-Bairen Yi, Jiacheng Xia, Li Chen, and Kai Chen. 2017. Towards Zero Copy Dataflows using RDMA. In Proceedings of SIGCOMM Posters and Demos'17, Los Angeles, CA, USA, August 22-24, 2017, 3 pages. https://doi.org/10.1145/3123878.3123907
+Bairen Yi, Jiacheng Xia, Li Chen, and Kai Chen. 2017. Towards Zero Copy Dataflows using RDMA. In Proceedings of SIGCOMM Posters and Demos'17, Los Angeles, CA, USA, August 22-24, 2017, 3 pages. https://doi.org/10.1145/3123878.3131975
-- 
GitLab


From ce6c6c4c12175490a05dc73a4d9122e551f662eb Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 5 Feb 2018 14:02:36 -0800
Subject: [PATCH 664/966] Propagate the name on resource variable assign
 (#16714)

---
 tensorflow/python/ops/state_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 3cc76fdbf3..f00213eb88 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -278,7 +278,7 @@ def assign(ref, value, validate_shape=None, use_locking=None, name=None):
     return gen_state_ops.assign(
         ref, value, use_locking=use_locking, name=name,
         validate_shape=validate_shape)
-  return ref.assign(value)
+  return ref.assign(value, name=name)
 
 
 @tf_export("count_up_to")
-- 
GitLab


From 473bc3580510e2da299662da200791cf4a9fb086 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Mon, 5 Feb 2018 14:12:02 -0800
Subject: [PATCH 665/966] [TF:XLA] Implement GatherNd.

PiperOrigin-RevId: 184584104
---
 tensorflow/compiler/tests/BUILD               |  12 +
 .../compiler/tests/gather_nd_op_test.py       | 147 ++++++++++
 tensorflow/compiler/tf2xla/kernels/BUILD      |   1 -
 .../compiler/tf2xla/kernels/gather_op.cc      | 251 ++++++++++++------
 .../compiler/tf2xla/kernels/gather_op.h       |  41 ---
 .../tf2xla/kernels/gather_op_helpers.h        |  15 +-
 .../tf2xla/kernels/tensor_array_ops.cc        |   7 +-
 .../compiler/tf2xla/kernels/variable_ops.cc   |   8 +-
 8 files changed, 346 insertions(+), 136 deletions(-)
 create mode 100644 tensorflow/compiler/tests/gather_nd_op_test.py
 delete mode 100644 tensorflow/compiler/tf2xla/kernels/gather_op.h

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index b0b038775f..25e329b6aa 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -665,6 +665,18 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "gather_nd_op_test",
+    size = "medium",
+    srcs = ["gather_nd_op_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "xla_device_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/gather_nd_op_test.py b/tensorflow/compiler/tests/gather_nd_op_test.py
new file mode 100644
index 0000000000..9378b1db72
--- /dev/null
+++ b/tensorflow/compiler/tests/gather_nd_op_test.py
@@ -0,0 +1,147 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.gather_nd."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class GatherNdTest(XLATestCase):
+
+  def _runGather(self, params, indices):
+    with self.test_session():
+      paramsp = array_ops.placeholder(params.dtype)
+      indicesp = array_ops.placeholder(indices.dtype)
+      with self.test_scope():
+        gather_nd_t = array_ops.gather_nd(paramsp, indicesp)
+      feed_dict = {paramsp: params, indicesp: indices}
+      return gather_nd_t.eval(feed_dict=feed_dict)
+
+  def testSimpleDtype(self):
+    for dtype in self.numeric_types:
+      self.assertAllEqual(
+          np.array([7, 7, 8], dtype=dtype),
+          self._runGather(
+              np.array([8, 1, 2, 3, 7, 5], dtype=dtype),
+              np.array([[4], [4], [0]], np.int32)))
+
+  def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self):
+    with self.test_session():
+      params = np.ones((3, 3), dtype=np.float32)
+
+      indices_empty = np.empty((0, 2), dtype=np.int32)
+      gather_nd_ok_val = self._runGather(params, indices_empty)
+      self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
+
+      indices_empty = np.empty((0, 1), dtype=np.int32)
+      gather_nd_ok_val = self._runGather(params, indices_empty)
+      self.assertAllClose(np.empty((0, 3), dtype=np.float32), gather_nd_ok_val)
+
+      params_empty = np.empty((0, 3), dtype=np.float32)
+      indices_empty = np.empty((0, 2), dtype=np.int32)
+      gather_nd_ok_val = self._runGather(params_empty, indices_empty)
+      self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
+
+      params_empty = np.empty((0, 3), dtype=np.float32)
+      indices_nonempty = np.zeros((1, 2), dtype=np.int32)
+      with self.assertRaisesWithPredicateMatch(
+          errors.InvalidArgumentError, r"Gather dimension 0 is of size zero"):
+        self._runGather(params_empty, indices_nonempty)
+
+  def testIndexScalar(self):
+    params = np.array(
+        [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
+    indices = np.array([4, 1], dtype=np.int32)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(np.array(7), gather_nd_val)
+
+  def testParamsRankLargerThanIndexIndexScalarSlices(self):
+    params = np.array(
+        [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
+    indices = np.array(
+        [
+            4,
+        ], dtype=np.int32)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(np.array([-7, 7]), gather_nd_val)
+
+  def testParamsRankLargerThanIndexSlices(self):
+    params = np.array(
+        [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
+    indices = np.array([[4], [4], [0]], np.int32)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(np.array([[-7, 7], [-7, 7], [-8, 8]]), gather_nd_val)
+
+  def testHigherRankParamsLargerThanIndexSlices(self):
+    params = np.array(
+        [[[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]],
+         [[-80, -10, -20, -30, -70, -50], [80, 10, 20, 30, 70, 50]]],
+        dtype=np.float32).T
+    indices = np.array([[4], [4], [0]], np.int32)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(params[[4, 4, 0]], gather_nd_val)
+
+  def testEmptyIndicesLastRankMeansCopyEntireTensor(self):
+    params = np.array(
+        [[[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]],
+         [[-80, -10, -20, -30, -70, -50], [80, 10, 20, 30, 70, 50]]],
+        dtype=np.float32).T
+    indices = np.array([[], []], dtype=np.int32)  # Size (2, 0)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(
+        np.vstack((params[np.newaxis, :], params[np.newaxis, :])),
+        gather_nd_val)
+
+  def testHigherRankParamsAndIndicesLargerThanIndexSlices(self):
+    params = np.array(
+        [[[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]],
+         [[-80, -10, -20, -30, -70, -50], [80, 10, 20, 30, 70, 50]]],
+        dtype=np.float32).T
+    indices = np.array([[[3], [2], [1]], [[4], [4], [0]]], np.int32)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(params[[3, 2, 1, 4, 4, 0]].reshape(2, 3, 2, 2),
+                        gather_nd_val)
+
+  def testHigherRankParams(self):
+    shape = (10, 20, 5, 1, 17)
+    params = np.random.rand(*shape).astype(np.float32)
+    indices = np.vstack(
+        [np.random.randint(0, s, size=2000, dtype=np.int32) for s in shape]).T
+    gather_nd_val = self._runGather(params, indices)
+
+    expected = params[tuple(indices.T)]
+    self.assertAllEqual(expected, gather_nd_val)
+
+  def testHigherRankParamsAndIndices(self):
+    shape = (10, 20, 5, 1, 17)
+    params = np.random.rand(*shape).astype(np.float32)
+    indices = np.vstack(
+        [np.random.randint(0, s, size=2000, dtype=np.int32) for s in shape]).T
+    indices_reshaped = indices.reshape([10, 10, 20, 5])
+    gather_nd_val = self._runGather(params, indices_reshaped)
+    expected = params[tuple(indices.T)]
+    self.assertAllEqual(expected.reshape([10, 10, 20]), gather_nd_val)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index e9be6f8476..4c6b29bd01 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -87,7 +87,6 @@ tf_kernel_library(
         "variable_ops.cc",
     ],
     hdrs = [
-        "gather_op.h",
         "index_ops.h",
         "shape_util.h",
     ],
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index ffed382494..e9af1e9c2f 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/kernels/gather_op.h"
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
@@ -26,25 +25,38 @@ limitations under the License.
 
 namespace tensorflow {
 
-xla::ComputationDataHandle XlaComputeGatherDynamicSlice(
-    XlaOpKernelContext* context, const xla::ComputationDataHandle& input,
-    const TensorShape& input_shape, const xla::ComputationDataHandle& indices,
-    const TensorShape& indices_shape, int64 axis, DataType dtype,
-    DataType index_type, xla::ComputationBuilder* builder) {
+Status XlaGather(const xla::ComputationDataHandle& input,
+                 const TensorShape& input_shape,
+                 const xla::ComputationDataHandle& indices,
+                 TensorShape indices_shape, int64 axis, bool indices_are_nd,
+                 DataType dtype, DataType index_type,
+                 xla::ComputationBuilder* builder,
+                 xla::ComputationDataHandle* gather_output) {
+  // If the indices are N-dimensional, then the last dimension of indices should
+  // be of size N and correspond to the N indices.
+  int64 num_axes = 1;
+  if (indices_are_nd) {
+    CHECK_GE(indices_shape.dims(), 1);
+    num_axes = indices_shape.dim_size(indices_shape.dims() - 1);
+    indices_shape.RemoveLastDims(1);
+  }
+
   // Although the indices Tensor is flattened into rank 1 during the lookup,
   // and each scalar entry is used as an index into the first dimension of the
   // input, the output is returned with shape:
   // input.shape[:axis] + indices.shape + input.shape[axis+1:]
+
   const int num_indices = indices_shape.num_elements();
   TensorShape input_shape_pre_axis(input_shape);
   input_shape_pre_axis.RemoveDimRange(axis, input_shape.dims());
   TensorShape input_shape_post_axis(input_shape);
-  input_shape_post_axis.RemoveDimRange(0, axis + 1);
-
+  input_shape_post_axis.RemoveDimRange(0, axis + num_axes);
   // Each slice of the input tensor has shape:
-  // [<input_shape_pre_axis>, 1, <input shape_post_axis>]
+  // [<input_shape_pre_axis>, 1, ..., 1, <input shape_post_axis>]
   TensorShape slice_shape(input_shape);
-  slice_shape.set_dim(axis, 1);
+  for (int64 i = 0; i < num_axes; ++i) {
+    slice_shape.set_dim(axis + i, 1);
+  }
 
   TensorShape loop_out_shape;
   loop_out_shape.AppendShape(input_shape_pre_axis);
@@ -62,8 +74,24 @@ xla::ComputationDataHandle XlaComputeGatherDynamicSlice(
 
   // Degenerate case: empty indices.
   if (num_indices == 0) {
-    return builder->Broadcast(XlaHelpers::Zero(builder, dtype),
-                              out_shape.dim_sizes());
+    *gather_output = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
+                                        out_shape.dim_sizes());
+    return Status::OK();
+  }
+
+  for (int64 i = 0; i < num_axes; ++i) {
+    if (input_shape.dim_size(axis + i) == 0) {
+      return errors::InvalidArgument("Gather dimension ", axis + i,
+                                     " is of size zero in tensor with shape ",
+                                     input_shape.DebugString());
+    }
+  }
+
+  // Flatten the major dimensions of indices into a single dimension for ease of
+  // iteration. If there is an axis dimension, we must leave it alone.
+  std::vector<int64> flat_indices_shape = {num_indices};
+  if (indices_are_nd) {
+    flat_indices_shape.push_back(num_axes);
   }
 
   // Specify the shape of the loop-carried Tensor tuple.
@@ -76,8 +104,8 @@ xla::ComputationDataHandle XlaComputeGatherDynamicSlice(
        xla::ShapeUtil::MakeShape(idxtype, {}),
        // The input array has shape input_shape. Loop invariant.
        xla::ShapeUtil::MakeShape(ptype, input_shape.dim_sizes()),
-       // The gather indices are reshaped to rank 1. Loop invariant.
-       xla::ShapeUtil::MakeShape(idxtype, {num_indices}),
+       // The gather indices are reshaped to flat_indices_shape. Loop invariant.
+       xla::ShapeUtil::MakeShape(idxtype, flat_indices_shape),
        // The output array, which is updated on each loop iteration.
        xla::ShapeUtil::MakeShape(ptype, loop_out_shape.dim_sizes())});
   xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
@@ -86,17 +114,16 @@ xla::ComputationDataHandle XlaComputeGatherDynamicSlice(
   auto init_i = XlaHelpers::Zero(builder, index_type);
   auto init_out = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
                                      loop_out_shape.dim_sizes());
-  // Flatten the indices into 1-D for ease of iteration.
-  auto indices_1d = builder->Reshape(indices, {num_indices});
-  auto init = builder->Tuple({init_i, input, indices_1d, init_out});
+  auto flat_indices = builder->Reshape(indices, flat_indices_shape);
+  auto init = builder->Tuple({init_i, input, flat_indices, init_out});
 
   // Construct the while loop condition (i < num_indices)
-  xla::ComputationBuilder condb(context->builder()->client(),
-                                "GatherWhileCond");
-  condb.Lt(condb.GetTupleElement(
-               condb.Parameter(0, tuple_shape, "GatherWhileTuple"), 0),
-           XlaHelpers::IntegerLiteral(&condb, index_type, num_indices));
-  auto cond_status = condb.Build();
+  std::unique_ptr<xla::ComputationBuilder> condb =
+      builder->CreateSubBuilder("GatherWhileCond");
+  condb->Lt(condb->GetTupleElement(
+                condb->Parameter(0, tuple_shape, "GatherWhileTuple"), 0),
+            XlaHelpers::IntegerLiteral(condb.get(), index_type, num_indices));
+  auto cond_status = condb->Build();
   auto cond = cond_status.ConsumeValueOrDie();
 
   // Construct the while loop body's function. The implementation of gather is:
@@ -104,89 +131,145 @@ xla::ComputationDataHandle XlaComputeGatherDynamicSlice(
   //   index = dynamic-slice(indices, i)
   //   xi = dynamic-slice(input, index)
   //   output = dynamic-update-slice(output, xi, i)
-  xla::ComputationBuilder bodyb(context->builder()->client(),
-                                "GatherWhileBody");
+  std::unique_ptr<xla::ComputationBuilder> bodyb =
+      builder->CreateSubBuilder("GatherWhileBody");
   {
     // The four loop carried values.
-    auto loop_tuple = bodyb.Parameter(0, tuple_shape, "GatherWhileTuple");
-    auto i = bodyb.GetTupleElement(loop_tuple, 0);
-    auto input = bodyb.GetTupleElement(loop_tuple, 1);
-    auto indices = bodyb.GetTupleElement(loop_tuple, 2);
-    auto output = bodyb.GetTupleElement(loop_tuple, 3);
-
-    // Slice from the input array.
-    auto index = bodyb.DynamicSlice(indices, bodyb.Reshape(i, {1}), {1});
-    auto start_indices = bodyb.Pad(
-        bodyb.Reshape(index, {1}), XlaHelpers::Zero(&bodyb, index_type),
+    auto loop_tuple = bodyb->Parameter(0, tuple_shape, "GatherWhileTuple");
+    auto i = bodyb->GetTupleElement(loop_tuple, 0);
+    auto input = bodyb->GetTupleElement(loop_tuple, 1);
+    auto indices = bodyb->GetTupleElement(loop_tuple, 2);
+    auto output = bodyb->GetTupleElement(loop_tuple, 3);
+
+    auto zero_index = XlaHelpers::Zero(bodyb.get(), index_type);
+
+    // Slice the i-th index from the indices array.
+    xla::ComputationDataHandle index;
+    auto indices_offset = bodyb->Reshape(i, {1});
+    if (indices_are_nd) {
+      // Slice out the entire nd index, if applicable.
+      indices_offset = bodyb->Pad(indices_offset, zero_index,
+                                  xla::MakeEdgePaddingConfig({{0, 1}}));
+      index = bodyb->DynamicSlice(indices, indices_offset, {1, num_axes});
+      index = bodyb->Collapse(index, {0, 1});
+    } else {
+      index = bodyb->DynamicSlice(indices, indices_offset, {1});
+    }
+
+    // Slice the corresponding data from the input array.
+    auto start_indices = bodyb->Pad(
+        index, zero_index,
         xla::MakeEdgePaddingConfig(
             {{input_shape_pre_axis.dims(), input_shape_post_axis.dims()}}));
-    auto slice_i = bodyb.Reshape(
-        bodyb.DynamicSlice(input, start_indices, slice_shape.dim_sizes()),
+    auto slice_i = bodyb->Reshape(
+        bodyb->DynamicSlice(input, start_indices, slice_shape.dim_sizes()),
         loop_out_slice_shape.dim_sizes());
 
     // Construct the index into the output Tensor 0, ..., <index>, 0, ...
     std::vector<xla::ComputationDataHandle> out_index_vals(
-        loop_out_shape.dims(),
-        bodyb.Reshape(XlaHelpers::Zero(&bodyb, index_type), {1}));
-    out_index_vals[input_shape_pre_axis.dims()] = bodyb.Reshape(i, {1});
-    auto out_index = bodyb.ConcatInDim(out_index_vals, 0);
+        loop_out_shape.dims(), bodyb->Reshape(zero_index, {1}));
+    out_index_vals[input_shape_pre_axis.dims()] = bodyb->Reshape(i, {1});
+    auto out_index = bodyb->ConcatInDim(out_index_vals, 0);
 
     // Update the output Tensor
-    auto updated_output = bodyb.DynamicUpdateSlice(output, slice_i, out_index);
+    auto updated_output = bodyb->DynamicUpdateSlice(output, slice_i, out_index);
 
-    bodyb.Tuple({bodyb.Add(i, XlaHelpers::One(&bodyb, index_type)), input,
-                 indices, updated_output});
+    bodyb->Tuple({bodyb->Add(i, XlaHelpers::One(bodyb.get(), index_type)),
+                  input, indices, updated_output});
   }
-  auto body_status = bodyb.Build();
+  auto body_status = bodyb->Build();
   auto body = body_status.ConsumeValueOrDie();
 
   // Construct the While loop, extract and reshape the output.
   auto gather_while = builder->While(cond, body, init);
-  auto gather_output = builder->GetTupleElement(gather_while, 3);
-  return builder->Reshape(gather_output, out_shape.dim_sizes());
+  auto result = builder->GetTupleElement(gather_while, 3);
+  *gather_output = builder->Reshape(result, out_shape.dim_sizes());
+  return Status::OK();
 }
 
-GatherOpDynamicSlice::GatherOpDynamicSlice(OpKernelConstruction* context)
-    : XlaOpKernel(context) {}
-
-void GatherOpDynamicSlice::Compile(XlaOpKernelContext* context) {
-  xla::ComputationBuilder* builder = context->builder();
-  auto input = context->Input(0);
-  auto input_shape = context->InputShape(0);
-  auto indices = context->Input(1);
-  auto indices_shape = context->InputShape(1);
-  int64 axis = 0;
-  if (context->num_inputs() == 3) {
-    const TensorShape axis_shape = context->InputShape(2);
-    OP_REQUIRES(context, TensorShapeUtils::IsScalar(axis_shape),
-                errors::InvalidArgument("axis must be scalar"));
-    DataType axis_type = input_type(2);
-    OP_REQUIRES(context, axis_type == DT_INT32 || axis_type == DT_INT64,
-                errors::InvalidArgument("axis must be int32 or int64"));
-
-    OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &axis));
-    const auto params_dims = input_shape.dims();
-    if (axis < 0) {
-      axis += params_dims;
+class GatherOp : public XlaOpKernel {
+ public:
+  explicit GatherOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    xla::ComputationBuilder* builder = context->builder();
+    auto input = context->Input(0);
+    auto input_shape = context->InputShape(0);
+    auto indices = context->Input(1);
+    auto indices_shape = context->InputShape(1);
+    int64 axis = 0;
+    if (context->num_inputs() == 3) {
+      const TensorShape axis_shape = context->InputShape(2);
+      OP_REQUIRES(context, TensorShapeUtils::IsScalar(axis_shape),
+                  errors::InvalidArgument("axis must be scalar"));
+      DataType axis_type = input_type(2);
+      OP_REQUIRES(context, axis_type == DT_INT32 || axis_type == DT_INT64,
+                  errors::InvalidArgument("axis must be int32 or int64"));
+
+      OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &axis));
+      const auto params_dims = input_shape.dims();
+      if (axis < 0) {
+        axis += params_dims;
+      }
+      OP_REQUIRES(
+          context, 0 <= axis && axis < params_dims,
+          errors::InvalidArgument("Expected axis in the range [", -params_dims,
+                                  ", ", params_dims, "), but got ", axis));
     }
-    OP_REQUIRES(
-        context, 0 <= axis && axis < params_dims,
-        errors::InvalidArgument("Expected axis in the range [", -params_dims,
-                                ", ", params_dims, "), but got ", axis));
+
+    DataType index_type = input_type(1);
+    OP_REQUIRES(context, index_type == DT_INT32 || index_type == DT_INT64,
+                errors::InvalidArgument("indices must be int32 or int64"));
+
+    xla::ComputationDataHandle gather;
+    OP_REQUIRES_OK(
+        context, XlaGather(input, input_shape, indices, indices_shape, axis,
+                           /*indices_are_nd=*/false, input_type(0), index_type,
+                           builder, &gather));
+    context->SetOutput(0, gather);
   }
 
-  DataType index_type = input_type(1);
-  OP_REQUIRES(context, index_type == DT_INT32 || index_type == DT_INT64,
-              errors::InvalidArgument("indices must be int32 or int64"));
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(GatherOp);
+};
 
-  xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice(
-      context, input, input_shape, indices, indices_shape, axis, input_type(0),
-      index_type, builder);
-  context->SetOutput(0, gather);
-}
+REGISTER_XLA_OP(Name("Gather"), GatherOp);
+REGISTER_XLA_OP(Name("GatherV2").CompileTimeConstInput("axis"), GatherOp);
+
+class GatherNdOp : public XlaOpKernel {
+ public:
+  explicit GatherNdOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    DataType params_type = context->input_type(0);
+    DataType indices_type = context->input_type(1);
+
+    TensorShape params_shape = context->InputShape(0);
+    TensorShape indices_shape = context->InputShape(1);
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(params_shape),
+                errors::InvalidArgument("params must be at least a vector"));
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(indices_shape),
+                errors::InvalidArgument("indices must be at least a vector"));
+    const int64 num_axes = indices_shape.dim_size(indices_shape.dims() - 1);
+    OP_REQUIRES(
+        context, num_axes <= params_shape.dims(),
+        errors::InvalidArgument(
+            "index innermost dimension length must be <= params rank; saw: ",
+            indices_shape.dim_size(indices_shape.dims() - 1), " vs. ",
+            params_shape.dims()));
+
+    xla::ComputationBuilder* builder = context->builder();
+    auto params = context->Input(0);
+    auto indices = context->Input(1);
+    xla::ComputationDataHandle gather;
+    OP_REQUIRES_OK(context, XlaGather(params, params_shape, indices,
+                                      indices_shape, /*axis=*/0,
+                                      /*indices_are_nd=*/true, params_type,
+                                      indices_type, builder, &gather));
+    context->SetOutput(0, gather);
+  }
+};
 
-REGISTER_XLA_OP(Name("Gather"), GatherOpDynamicSlice);
-REGISTER_XLA_OP(Name("GatherV2").CompileTimeConstInput("axis"),
-                GatherOpDynamicSlice);
+REGISTER_XLA_OP(Name("GatherNd"), GatherNdOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.h b/tensorflow/compiler/tf2xla/kernels/gather_op.h
deleted file mode 100644
index df86e1fcdd..0000000000
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Declaration of the Gather Op using the XLA dynamic slice implementation.
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_GATHER_OP_H_
-#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_GATHER_OP_H_
-
-#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/util/bcast.h"
-
-namespace tensorflow {
-
-class GatherOpDynamicSlice : public XlaOpKernel {
- public:
-  explicit GatherOpDynamicSlice(OpKernelConstruction* context);
-
-  void Compile(XlaOpKernelContext* context) override;
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(GatherOpDynamicSlice);
-};
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_GATHER_OP_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
index 2c80395c56..bd8b92c22d 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
@@ -30,11 +30,16 @@ namespace tensorflow {
 // shape input_shape) keyed on indices (of shape indices_shape).
 //
 // index_type must be must be DT_INT32 or DT_INT64.
-xla::ComputationDataHandle XlaComputeGatherDynamicSlice(
-    XlaOpKernelContext* ctx, const xla::ComputationDataHandle& input,
-    const TensorShape& input_shape, const xla::ComputationDataHandle& indices,
-    const TensorShape& indices_shape, int64 axis, DataType dtype,
-    DataType index_type, xla::ComputationBuilder* builder);
+// If `indices_are_nd` is true, the last dimension of `indices` are treated as
+// a multidimensional index values. Otherwise, `indices` is treated as a tensor
+// of scalar indices.
+Status XlaGather(const xla::ComputationDataHandle& input,
+                 const TensorShape& input_shape,
+                 const xla::ComputationDataHandle& indices,
+                 TensorShape indices_shape, int64 axis, bool indices_are_nd,
+                 DataType dtype, DataType index_type,
+                 xla::ComputationBuilder* builder,
+                 xla::ComputationDataHandle* gather_output);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 7cf9b796b9..000b50af6b 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -337,8 +337,11 @@ class TensorArrayGatherOp : public XlaOpKernel {
       }
     }
 
-    xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice(
-        ctx, ta, ta_shape, indices, indices_shape, 0, dtype_, index_type, b);
+    xla::ComputationDataHandle gather;
+    OP_REQUIRES_OK(
+        ctx,
+        XlaGather(ta, ta_shape, indices, indices_shape, /*axis=*/0,
+                  /*indices_are_nd=*/false, dtype_, index_type, b, &gather));
     ctx->SetOutput(0, gather);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
index e4079ebf0b..71173f5aea 100644
--- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
@@ -117,9 +117,11 @@ class ResourceGatherOp : public XlaOpKernel {
     auto indices = ctx->Input(1);
     auto indices_shape = ctx->InputShape(1);
     DataType index_type = ctx->input_type(1);
-    xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice(
-        ctx, resource_handle, resource_shape, indices, indices_shape, 0, type,
-        index_type, builder);
+    xla::ComputationDataHandle gather;
+    OP_REQUIRES_OK(
+        ctx, XlaGather(resource_handle, resource_shape, indices, indices_shape,
+                       /*axis=*/0, /*indices_are_nd=*/false, type, index_type,
+                       builder, &gather));
     ctx->SetOutput(0, gather);
   }
 };
-- 
GitLab


From d53202bc9498a9045747282530380f92b4d66a00 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 14:38:07 -0800
Subject: [PATCH 666/966] [XLA] Fix documentation for Clamp.

PiperOrigin-RevId: 184588630
---
 tensorflow/docs_src/performance/xla/operation_semantics.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 1e9b8b35db..f865c30aa8 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -252,7 +252,7 @@ Clamps an operand to within the range between a minimum and maximum value.
 Given an operand and minimum and maximum values, returns the operand if it is in
 the range between the minimum and maximum, else returns the minimum value if the
 operand is below this range or the maximum value if the operand is above this
-range.  That is, `clamp(a, x, b) =  max(min(a, x), b)`.
+range.  That is, `clamp(a, x, b) =  min(max(a, x), b)`.
 
 All three arrays must be the same shape. Alternately, as a restricted form of
 [broadcasting](broadcasting.md), `min` and/or `max` can be a scalar of type `T`.
-- 
GitLab


From 1c762f70caf7004470cdfa599b4eb7a76e5bcc78 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 14:43:35 -0800
Subject: [PATCH 667/966] Backward pass implementation for fusion optimizer.

PiperOrigin-RevId: 184589487
---
 tensorflow/core/grappler/optimizers/constant_folding.cc | 2 +-
 tensorflow/core/grappler/utils.cc                       | 9 ++++++++-
 tensorflow/core/grappler/utils.h                        | 2 +-
 tensorflow/core/grappler/utils_test.cc                  | 7 ++++---
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 0aeff6222c..37a4759fdd 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -1658,7 +1658,7 @@ Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
   // more with the original node name.
   for (const auto& fetch : item.fetch) {
     const NodeDef* fetch_node = node_map_->GetNode(fetch);
-    if (fetch_node && NumOutputs(*fetch_node) == 1) {
+    if (fetch_node && NumOutputs(*fetch_node, graph_) == 1) {
       nodes_whitelist_.insert(fetch_node->name());
     }
   }
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 8099214c2b..634577ed30 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/types.h"
@@ -207,7 +208,7 @@ string AsControlDependency(const string& node_name) {
              : strings::StrCat("^", node_name);
 }
 
-int NumOutputs(const NodeDef& node) {
+int NumOutputs(const NodeDef& node, GraphDef* graph) {
   int num_outputs = 0;
   const OpDef* op_def = nullptr;
   auto status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
@@ -222,6 +223,12 @@ int NumOutputs(const NodeDef& node) {
         num_outputs++;
       }
     }
+  } else {
+    FunctionLibraryDefinition fdef(OpRegistry::Global(), graph->library());
+    auto status = fdef.LookUpOpDef(node.op(), &op_def);
+    if (status.ok()) {
+      num_outputs = op_def->output_arg_size();
+    }
   }
   return num_outputs;
 }
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index c04a9a666d..8840c44d05 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -135,7 +135,7 @@ string AsControlDependency(const string& node);
 
 // Returns the number of outputs of a node according to its OpDef. Note that
 // some of the outputs may be unconnected.
-int NumOutputs(const NodeDef& node);
+int NumOutputs(const NodeDef& node, GraphDef* graph);
 
 // Number of connected non-control inputs.
 int NumNonControlInputs(const NodeDef& node);
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index 77371c399e..ba4e6b1bae 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -177,9 +177,10 @@ TEST_F(UtilsTest, ExecuteWithTimeout) {
 }
 
 TEST_F(UtilsTest, NumOutputs) {
-  EXPECT_EQ(2, NumOutputs(CreateConcatOffsetNode()));
-  EXPECT_EQ(5, NumOutputs(CreateFusedBatchNormNode()));
-  EXPECT_EQ(1, NumOutputs(CreateDequeueNode()));
+  GraphDef graph;
+  EXPECT_EQ(2, NumOutputs(CreateConcatOffsetNode(), &graph));
+  EXPECT_EQ(5, NumOutputs(CreateFusedBatchNormNode(), &graph));
+  EXPECT_EQ(1, NumOutputs(CreateDequeueNode(), &graph));
 }
 
 TEST_F(UtilsTest, AsControlDependency) {
-- 
GitLab


From 036284bfbf78066ef74663bf3750bd728e03459a Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Mon, 5 Feb 2018 14:48:51 -0800
Subject: [PATCH 668/966] Merging the 1.6 branch back into master.

---
 RELEASE.md                                    | 99 ++++++++++++++++++-
 tensorflow/contrib/cmake/tf_tests.cmake       |  2 +
 .../eager/python/examples/mnist/mnist.py      |  2 +-
 .../contrib/lite/kernels/internal/BUILD       |  3 +
 tensorflow/contrib/py2tf/converters/BUILD     |  1 -
 .../contrib/tpu/profiler/pip_package/setup.py |  2 +-
 tensorflow/core/public/version.h              |  4 +-
 tensorflow/docs_src/install/install_c.md      |  2 +-
 tensorflow/docs_src/install/install_go.md     |  2 +-
 tensorflow/docs_src/install/install_java.md   | 22 ++---
 tensorflow/docs_src/install/install_linux.md  | 22 ++---
 tensorflow/docs_src/install/install_mac.md    | 10 +-
 .../docs_src/install/install_sources.md       | 10 +-
 tensorflow/python/client/session.py           | 18 +++-
 tensorflow/tools/ci_build/ci_sanity.sh        |  2 +-
 tensorflow/tools/docker/Dockerfile.devel      |  2 +-
 .../tools/docker/Dockerfile.devel-cpu-mkl     |  2 +-
 tensorflow/tools/docker/Dockerfile.devel-gpu  |  2 +-
 tensorflow/tools/pip_package/setup.py         |  2 +-
 19 files changed, 160 insertions(+), 49 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index af6440acef..0fad3b5d41 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,11 +1,43 @@
-# Release 1.5.0
+# Release 1.6.0
 
 ## Breaking Changes
 * Prebuilt binaries are now built against CUDA 9.0 and cuDNN 7.
-* Our Linux binaries are built using ubuntu 16 containers, potentially
-  introducing glibc incompatibility issues with ubuntu 14.
-* Starting from 1.6 release, our prebuilt binaries will use AVX instructions.
-  This may break TF on older CPUs.
+* Prebuilt binaries will use AVX instructions. This may break TF on older CPUs.
+
+## Major Features And Improvements
+* New Optimizer internal API for non-slot variables. Descendants of AdamOptimizer that access _beta[12]_power will need to be updated.
+* `tf.estimator.{FinalExporter,LatestExporter}` now export stripped SavedModels. This improves forward compatibility of the SavedModel.
+* FFT support added to XLA CPU/GPU.
+
+## Bug Fixes and Other Changes
+* Documentation updates:
+  * Added a second version of Getting Started, which is aimed at ML
+newcomers.
+  * Clarified documentation on `resize_images.align_corners` parameter.
+  * Additional documentation for TPUs.
+* Google Cloud Storage (GCS):
+  * Add client-side throttle.
+  * Add a `FlushCaches()` method to the FileSystem interface, with an implementation for GcsFileSystem.
+* Other:
+  * Add `tf.contrib.distributions.Kumaraswamy`.
+  * `RetryingFileSystem::FlushCaches()` calls the base FileSystem's `FlushCaches()`.
+  * Add auto_correlation to distributions.
+  * Add `tf.contrib.distributions.Autoregressive`.
+  * Add SeparableConv1D layer.
+  * Add convolutional Flipout layers.
+  * When both inputs of `tf.matmul` are bfloat16, it returns bfloat16, instead of float32.
+  * Added `tf.contrib.image.connected_components`.
+  * Add `tf.contrib.framework.CriticalSection` that allows atomic variable access.
+  * Output variance over trees predictions for classifications tasks.
+  * For `pt` and `eval` commands, allow writing tensor values to filesystem as numpy files.
+  * gRPC: Propagate truncated errors (instead of returning gRPC internal error).
+  * Augment parallel_interleave to support 2 kinds of prefetching.
+  * Improved XLA support for C64-related ops log, pow, atan2, tanh.
+  * Add probabilistic convolutional layers.
+
+## API Changes
+* Introducing prepare_variance boolean with default setting to False for backward compatibility.
+* Move `layers_dense_variational_impl.py` to `layers_dense_variational.py`.
 
 ## Known Bugs
 * Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or
@@ -28,6 +60,42 @@
   TensorFlow will print a warning if you use XLA:GPU with a known-bad version of
   CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122.
 
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4d55397500, Ag Ramesh, Aiden Scandella, Akimasa Kimura, Alex Rothberg, Allen Goodman,
+amilioto, Andrei Costinescu, Andrei Nigmatulin, Anjum Sayed, Anthony Platanios,
+Anush Elangovan, Armando Fandango, Ashish Kumar Ram, Ashwini Shukla, Ben, Bhavani Subramanian,
+Brett Koonce, Carl Thomé, cclauss, Cesc, Changming Sun, Christoph Boeddeker, Clayne Robison,
+Clemens Schulz, Clint (Woonhyuk Baek), codrut3, Cole Gerdemann, Colin Raffel, Daniel Trebbien,
+Daniel Ylitalo, Daniel Zhang, Daniyar, Darjan Salaj, Dave Maclachlan, David Norman, Dong--Jian,
+dongsamb, dssgsra, Edward H, eladweiss, elilienstein, Eric Lilienstein, error.d, Eunji Jeong, fanlu,
+Florian Courtial, fo40225, Fred, Gregg Helt, Guozhong Zhuang, Hanchen Li, hsm207, hyunyoung2,
+ImSheridan, Ishant Mrinal Haloi, Jacky Ko, Jay Young, Jean Flaherty, Jerome, JerrikEph, Jesse
+Kinkead, jfaath, Jian Lin, jinghuangintel, Jiongyan Zhang, Joel Hestness, Joel Shor, Johnny Chan,
+Julian Niedermeier, Julian Wolff, JxKing, K-W-W, Karl Lessard, Kasper Marstal, Keiji Ariyama,
+Koan-Sin Tan, Loki Der Quaeler, Loo Rong Jie, Luke Schaefer, Lynn Jackson, ManHyuk, Matt Basta,
+Matt Smith, Matthew Schulkind, Michael, michaelkhan3, Miguel Piedrafita, Mikalai Drabovich,
+Mike Knapp, mjwen, mktozk, Mohamed Aly, Mohammad Ashraf Bhuiyan, Myungjoo Ham, Naman Bhalla,
+Namrata-Ibm, Nathan Luehr, nathansilberman, Netzeband, Niranjan Hasabnis, Omar Aflak, Ozge
+Yalcinkaya, Parth P Panchal, patrickzzy, Patryk Chrabaszcz, Paul Van Eck, Paweł Kapica, Peng Yu,
+Philip Yang, Pierre Blondeau, Po-Hsien Chu, powderluv, Puyu Wang, Rajendra Arora, Rasmus, Renat
+Idrisov, resec, Robin Richtsfeld, Ronald Eddy Jr, Sahil Singh, Sam Matzek, Sami Kama, sandipmgiri,
+Santiago Castro, Sayed Hadi Hashemi, Scott Tseng, Sergii Khomenko, Shahid, Shengpeng Liu, Shreyash
+Sharma, Shrinidhi Kl, Simone Cirillo, simsicon, Stanislav Levental, starsblinking, Stephen Lumenta,
+Steven Hickson, Su Tang, Taehoon Lee, Takuya Wakisaka, Ted Chang, Ted Ying, Tijmen Verhulsdonck,
+Timofey Kondrashov, vade, vaibhav, Valentin Khrulkov, vchigrin, Victor Costan, Viraj Navkal,
+Vivek Rane, wagonhelm, Yan Facai (颜发才), Yanbo Liang, Yaroslav Bulatov, yegord, Yong Tang,
+Yoni Tsafir, yordun, Yuan (Terry) Tang, Yuxin Wu, zhengdi, Zhengsheng Wei, 田传武
+
+# Release 1.5.0
+
+## Breaking Changes
+* Prebuilt binaries are now built against CUDA 9.0 and cuDNN 7.
+* Starting from 1.6 release, our prebuilt binaries will use AVX instructions.
+  This may break TF on older CPUs.
+
 ## Major Features And Improvements
 * [Eager execution](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/eager)
   preview version is now available.
@@ -146,6 +214,27 @@
 * Minor refactor: move stats files from `stochastic` to `common` and remove
   `stochastic`.
 
+## Known Bugs
+* Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or
+  `CUDA_ILLEGAL_ADDRESS` failures.
+
+  Google discovered in mid-December 2017 that the PTX-to-SASS compiler in CUDA 9
+  and CUDA 9.1 sometimes does not properly compute the carry bit when
+  decomposing 64-bit address calculations with large offsets (e.g. `load [x +
+  large_constant]`) into 32-bit arithmetic in SASS.
+
+  As a result, these versions of `ptxas` miscompile most XLA programs which use
+  more than 4GB of temp memory.  This results in garbage results and/or
+  `CUDA_ERROR_ILLEGAL_ADDRESS` failures.
+
+  A fix in CUDA 9.1.121 is expected in late February 2018.  We do not expect a
+  fix for CUDA 9.0.x.  Until the fix is available, the only workaround is to
+  [downgrade](https://developer.nvidia.com/cuda-toolkit-archive) to CUDA 8.0.x
+  or disable XLA:GPU.
+
+  TensorFlow will print a warning if you use XLA:GPU with a known-bad version of
+  CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122.
+
 ## Thanks to our Contributors
 
 This release contains contributions from many people at Google, as well as:
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 2e79eadf7f..73edd616ea 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -310,6 +310,8 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/control_flow_util_test.py"
       # Flaky replicate_model_fn_test
       "${tensorflow_source_dir}/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py"  # b/71901810
+      # Broken io_utils_test
+      "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/utils/io_utils_test.py"  # b/72894325
   )
   endif()
   list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude})
diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
index ed7dbc8904..772f59562b 100644
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
@@ -39,7 +39,7 @@ class MNISTModel(tfe.Network):
   """MNIST Network.
 
   Network structure is equivalent to:
-  https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
+  https://github.com/tensorflow/tensorflow/blob/r1.6/tensorflow/examples/tutorials/mnist/mnist_deep.py
   and
   https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
 
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 404c7d3718..adedd58ff4 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -351,6 +351,9 @@ cc_library(
         ":x86": [
             ":neon_tensor_utils",
         ],
+        ":k8": [
+            ":neon_tensor_utils",
+        ],
         ":darwin": [
             ":neon_tensor_utils",
         ],
diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
index 03ded49022..fc6781d50e 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -132,7 +132,6 @@ py_test(
     name = "for_canonicalization_test",
     srcs = ["for_canonicalization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["nomac"],
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/pyct",
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index 3dffebe668..cb61984799 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from setuptools import setup
 
-_VERSION = '1.5.0-rc1'
+_VERSION = '1.6.0-rc0'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index b02f899b87..50bfa91267 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 5
+#define TF_MINOR_VERSION 6
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX ""
+#define TF_VERSION_SUFFIX "-rc0"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 14add7c77e..a783205b4a 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index d2af9d9843..5249e04615 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.6.0-rc0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index e5388c4b1e..0c6c773e62 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.5.0</version>
+  <version>1.6.0-rc0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.5.0</version>
+                 <version>1.6.0-rc0</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.5.0</version>
+  <version>1.6.0-rc0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.5.0</version>
+  <version>1.6.0-rc0</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0-rc0.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.5.0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.6.0-rc0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.5.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.6.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.5.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.6.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index cd8c14599f..105b225177 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -293,7 +293,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -705,14 +705,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index f49d3a2f08..a6ea548cfb 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -115,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -238,7 +238,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -347,7 +347,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -520,7 +520,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl
 </pre>
 
 
@@ -528,5 +528,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index ccf62a169f..0dbb15188e 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -359,10 +359,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.5.0 on Linux:
+for TensorFlow 1.6.0rc0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.6.0rc0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -460,7 +460,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
@@ -478,6 +479,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@@ -491,6 +493,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 6befeb846d..f3c4fecdc0 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -1539,8 +1539,22 @@ class Session(BaseSession):
   def __exit__(self, exec_type, exec_value, exec_tb):
     if exec_type is errors.OpError:
       logging.error('Session closing due to OpError: %s', (exec_value,))
-    self._default_session_context_manager.__exit__(exec_type, exec_value,
-                                                   exec_tb)
+    try:
+      self._default_session_context_manager.__exit__(exec_type, exec_value,
+                                                     exec_tb)
+    except RuntimeError as error:
+      if error == exec_value:
+        # NOTE(skyewm): for some reason, in Python3,
+        # _default_session_context_manager.__exit__ will re-raise the "not
+        # re-entrant" exception raised in __enter__ above (note that if we're
+        # here, we're in the outer session context manager, since __exit__ is
+        # not called when __enter__ raises an exception). We still want to
+        # continue cleaning up this context manager before the exception is
+        # further propagated, so we ignore it here (note that it'll continue
+        # being propagated after this method completes).
+        pass
+      else:
+        raise
     self._default_graph_context_manager.__exit__(exec_type, exec_value, exec_tb)
 
     self._default_session_context_manager = None
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index b3a8ff2ac7..fd5d005844 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -351,7 +351,7 @@ do_external_licenses_check(){
 
   # Whitelist
   echo ${EXTRA_LICENSE_FILE}
-  grep -e "@bazel_tools//src/" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -v ${EXTRA_LICENSES_FILE} > temp.txt
+  grep -e "@bazel_tools//src" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -v ${EXTRA_LICENSES_FILE} > temp.txt
   mv temp.txt ${EXTRA_LICENSES_FILE}
 
 
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 5dc4a053fd..d16761c367 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -70,7 +70,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.5 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 96b260ad3a..3690e7dfe5 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.5
+ARG TF_BRANCH=r1.6
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 07ffd3839a..4ef37881bc 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -79,7 +79,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.5 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index d7fab2b93a..2002786999 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.5.0'
+_VERSION = '1.6.0-rc0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
-- 
GitLab


From 3374d3a6e5b9f77fa4229c41b233f1c0a229216f Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 5 Feb 2018 14:47:23 -0800
Subject: [PATCH 669/966] Automated g4 rollback of changelist 184573795

PiperOrigin-RevId: 184590080
---
 .../kernel_tests/control_flow_ops_py_test.py  | 30 -------------------
 tensorflow/python/ops/control_flow_ops.py     | 25 ++++++----------
 2 files changed, 9 insertions(+), 46 deletions(-)

diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 15ff0ec09b..4fafc36014 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -704,36 +704,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
       self.assertEqual(10000, r.eval())
 
-  def testWhileExternalControlDependencies(self):
-    with self.test_session():
-      v = variables.Variable(0.0)
-      v.initializer.run()
-      increment = v.assign_add(1.0)
-
-      def body_fn(i):
-        with ops.control_dependencies([increment]):
-          return i + i
-
-      result = control_flow_ops.while_loop(cond=lambda i: i < 1,
-                                           body=body_fn, loop_vars=[1])
-      result.eval()
-      self.assertAllEqual(v.eval(), 1.0)
-
-  def testWhileExternalControlDependenciesNoInput(self):
-    with self.test_session():
-      v = variables.Variable(0.0)
-      v.initializer.run()
-      increment = v.assign_add(1.0)
-
-      def body_fn(unused_i):
-        with ops.control_dependencies([increment]):
-          return constant_op.constant(5, name="five")
-
-      result = control_flow_ops.while_loop(cond=lambda i: i < 5,
-                                           body=body_fn, loop_vars=[0])
-      result.eval()
-      self.assertAllEqual(v.eval(), 1.0)
-
   def testWhileWithRefs_1(self):
     with self.test_session() as sess:
       x = variables.Variable(0)._ref()  # pylint: disable=protected-access
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 87ff0aba0a..bcd187d821 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1631,13 +1631,10 @@ class ControlFlowContext(object):
         ctxt = util.GetOutputContext(x)
         if ctxt is not None and ctxt.GetWhileContext() == while_ctxt:
           internal_control_inputs.append(x)
-    external_control_inputs = []
     if len(internal_control_inputs) != len(op.control_inputs):
-      external_control_inputs = list(set(op.control_inputs)
-                                     - set(internal_control_inputs))
       op._remove_all_control_inputs()
       op._add_control_inputs(internal_control_inputs)
-    return internal_control_inputs, external_control_inputs
+    return internal_control_inputs
 
   # pylint: enable=protected-access
 
@@ -2435,12 +2432,14 @@ class WhileContext(ControlFlowContext):
   def _AddOpInternal(self, op):
     """Add `op` to the current context.
 
-    We move any external control dependencies of the op to the loop pivot, to
-    ensure they get executed.
+    In the case that op has only external data inputs, we remove all of its
+    external control inputs so all its inputs are in the same while loop
+    context. This is valid because op now has an Enter input that has all
+    the right control dependency.
     """
     if not op.inputs:
       # Remove any external control dependency on this op
-      control_inputs, external_inputs = self._RemoveExternalControlEdges(op)
+      control_inputs = self._RemoveExternalControlEdges(op)
       # Add a control edge from the control pivot to this op.
       if not control_inputs:
         # pylint: disable=protected-access
@@ -2453,20 +2452,14 @@ class WhileContext(ControlFlowContext):
         x = op.inputs[index]
         real_x = self.AddValue(x)
         if real_x != x:
-          op._update_input(index, real_x)  # pylint: disable=protected-access
-      # Remove any external control dependency on this op and move then to an
-      # Enter node.
-      _, external_inputs = self._RemoveExternalControlEdges(op)
+          op._update_input(index, real_x)
+      # Remove any external control dependency on this op.
+      self._RemoveExternalControlEdges(op)
       # Add a control dependency to prevent loop invariants from
       # enabling ops that should not be executed.
       self._MaybeAddControlDependency(op)
       for x in op.outputs:
         self._values.add(x.name)
-    if external_inputs:
-      # Make the pivot depend on external control inputs
-      pred = self._pivot_for_pred.op.inputs[0]
-      assert util.IsLoopEnter(pred.op)
-      pred.op._add_control_inputs(external_inputs)  # pylint: disable=protected-access
     if self._outer_context or not util.IsLoopExit(op):
       op.graph.prevent_fetching(op)
       for x in op.outputs:
-- 
GitLab


From 85344e3366e440ecdb160b9bb3be71212f19bacd Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Mon, 5 Feb 2018 15:05:43 -0800
Subject: [PATCH 670/966] Fix CBLAS Conv reference implementation in TFLite.

PiperOrigin-RevId: 184592951
---
 .../lite/kernels/internal/optimized/cblas_reference.h        | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
index 6578915743..6acc513805 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
@@ -49,9 +49,12 @@ void cblas_sgemm(const enum CBLAS_ORDER order,
   TFLITE_DCHECK(order == CblasRowMajor);
   TFLITE_DCHECK(trans_a == CblasNoTrans);
   TFLITE_DCHECK(trans_b == CblasTrans);
+  TFLITE_DCHECK(beta == 0.0f);
   for (int row = 0; row < m; ++row) {
     for (int col = 0; col < n; ++col) {
-      float value = beta * c[stride_c * row + col];
+      // If `beta` non-zero, multiple it with the original values in output.
+      // Otherwise, ignore the original value in output completely.
+      float value = 0.0f;
       for (int idx = 0; idx < k; ++idx) {
         value += alpha * a[stride_a * row + idx] * b[stride_b * col + idx];
       }
-- 
GitLab


From ee8742b949f8fcd52bfac4e5266e518975b34d28 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Tue, 6 Feb 2018 08:11:17 +0900
Subject: [PATCH 671/966] Fix typo (#16759)

* fix typo

* FIx Typo
---
 .../graph_transformations/resolve_constant_concatenation.cc     | 2 +-
 .../core/distributed_runtime/rpc/grpc_serialization_traits.h    | 2 +-
 tensorflow/docs_src/programmers_guide/debugger.md               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index 5ac449749a..db68968bad 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -73,7 +73,7 @@ void CopyTensorSegments(const std::vector<Array*>& input_arrays,
 
 // Receives a series of input arrays of type Array and an integer showing the
 // axis on which those arrays will be concatenated. It returns the concatenated
-// arrray.
+// array.
 template <ArrayDataType A>
 void ConcatenateTensorBuffers(const std::vector<Array*>& input_arrays,
                               int concatenation_axis,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
index dd114d39c6..730124c25e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
@@ -66,7 +66,7 @@ class GrpcBufferWriter final
     }
     // It's dangerous to keep an inlined grpc_slice as the backup slice, since
     // on a following Next() call, a reference will be returned to this slice
-    // via GRPC_SLICE_START_PTR, which will not be an adddress held by
+    // via GRPC_SLICE_START_PTR, which will not be an address held by
     // slice_buffer_.
     have_backup_ = backup_slice_.refcount != NULL;
     byte_count_ -= count;
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index 9eaee27028..c1a90dee0a 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -214,7 +214,7 @@ navigate between these screens by clicking the `<--` and
 ### Other Features of the tfdbg CLI
 
 In addition to the commands listed above, the tfdbg CLI provides the following
-addditional features:
+additional features:
 
 *   To navigate through previous tfdbg commands, type in a few characters
     followed by the Up or Down arrow keys. tfdbg will show you the history of
-- 
GitLab


From 1b19bc9edcd993e19e01cd935e9787e1672c4391 Mon Sep 17 00:00:00 2001
From: Max Galkin <maxgalkin@google.com>
Date: Mon, 5 Feb 2018 15:10:44 -0800
Subject: [PATCH 672/966] Add logging to diagnose device properties parsing
 problem in Grappler.

PiperOrigin-RevId: 184594084
---
 tensorflow/core/grappler/costs/op_level_cost_estimator.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 5600267f6a..1af973855e 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -411,6 +411,11 @@ Costs OpLevelCostEstimator::PredictCostOfAnUnknownOp(
 Costs OpLevelCostEstimator::PredictOpCountBasedCost(
     double operations, const OpInfo& op_features) const {
   DeviceInfo device_perf = GetDeviceInfo(op_features.device());
+  if (device_perf.gigaops <= 0 || device_perf.gb_per_sec <= 0) {
+    VLOG(1) << "BAD DEVICE. Op:" << op_features.op()
+            << " device type:" << op_features.device().type()
+            << " device model:" << op_features.device().model();
+  }
 
   Costs::NanoSeconds compute_cost(std::ceil(operations / device_perf.gigaops));
   VLOG(1) << "Op:" << op_features.op() << " GOps:" << operations / 1e9
-- 
GitLab


From e78aff41a14e000a4f68508b2c08c742f37ba59c Mon Sep 17 00:00:00 2001
From: Mahesh Bhosale <maheshbhosale24051996@gmail.com>
Date: Tue, 6 Feb 2018 04:45:12 +0530
Subject: [PATCH 673/966] Resolve Programmatic mistake. (#16765)

SPECIES should have been either imported here or it should have referenced from iris_data. The corresponding code is correct but the conflict in the documentation.
---
 tensorflow/docs_src/get_started/get_started_for_beginners.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/get_started/get_started_for_beginners.md b/tensorflow/docs_src/get_started/get_started_for_beginners.md
index ea1c2fb3f4..390cc81eef 100644
--- a/tensorflow/docs_src/get_started/get_started_for_beginners.md
+++ b/tensorflow/docs_src/get_started/get_started_for_beginners.md
@@ -700,7 +700,7 @@ for pred_dict, expec in zip(predictions, expected):
 
     class_id = pred_dict['class_ids'][0]
     probability = pred_dict['probabilities'][class_id]
-    print(template.format(SPECIES[class_id], 100 * probability, expec))
+    print(template.format(iris_data.SPECIES[class_id], 100 * probability, expec))
 ```
 
 Running the program yields the following output:
-- 
GitLab


From c8674c8a755525fb87073c0ac74dd21267a7f22f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 15:14:20 -0800
Subject: [PATCH 674/966] Verify tflite model in TFLite Java API

PiperOrigin-RevId: 184594561
---
 tensorflow/contrib/lite/java/BUILD            | 20 ++++++++++++++++
 .../native/nativeinterpreterwrapper_jni.cc    | 23 +++++++++++++++++++
 .../lite/NativeInterpreterWrapperTest.java    |  4 +---
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
index 9a1a888b93..35aacb7000 100644
--- a/tensorflow/contrib/lite/java/BUILD
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -111,6 +111,26 @@ java_test(
     ],
 )
 
+# TODO: generate large models at runtime, instead of storing them.
+java_test(
+    name = "InterpreterTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/InterpreterTest.java"],
+    data = [
+        "src/testdata/add.bin",
+        "src/testdata/mobilenet.tflite.bin",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.InterpreterTest",
+    visibility = ["//visibility:private"],
+    deps = [
+        ":libtensorflowlite_jni.so",
+        ":tensorflowlitelib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
 java_test(
     name = "TensorTest",
     size = "small",
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index f3f51b668f..c346f9f92e 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -200,6 +200,12 @@ TfLiteStatus setInputs(JNIEnv* env, tflite::Interpreter* interpreter,
   return kTfLiteOk;
 }
 
+// TODO(yichengfan): evaluate the benefit to use tflite verifier.
+bool VerifyModel(const void* buf, size_t len) {
+  flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
+  return tflite::VerifyModelBuffer(verifier);
+}
+
 }  // namespace
 
 JNIEXPORT jobjectArray JNICALL
@@ -271,6 +277,17 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel(
       convertLongToErrorReporter(env, error_handle);
   if (error_reporter == nullptr) return 0;
   const char* path = env->GetStringUTFChars(model_file, nullptr);
+
+  {
+    tflite::FileCopyAllocation allocation(path, nullptr);
+    if (!VerifyModel(allocation.base(), allocation.bytes())) {
+      throwException(env, kIllegalArgumentException,
+                     "Contents of %s is not a valid flatbuffer model", path);
+      env->ReleaseStringUTFChars(model_file, path);
+      return 0;
+    }
+  }
+
   auto model = tflite::FlatBufferModel::BuildFromFile(path, error_reporter);
   if (!model) {
     throwException(env, kIllegalArgumentException,
@@ -293,6 +310,12 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
   const char* buf =
       static_cast<char*>(env->GetDirectBufferAddress(model_buffer));
   jlong capacity = env->GetDirectBufferCapacity(model_buffer);
+  if (!VerifyModel(buf, capacity)) {
+    throwException(env, kIllegalArgumentException,
+                   "MappedByteBuffer is not a valid flatbuffer model");
+    return 0;
+  }
+
   auto model = tflite::FlatBufferModel::BuildFromBuffer(
       buf, static_cast<size_t>(capacity), error_reporter);
   if (!model) {
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
index 473f73816f..90323555d8 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
@@ -60,9 +60,7 @@ public final class NativeInterpreterWrapperTest {
       NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(INVALID_MODEL_PATH);
       fail();
     } catch (IllegalArgumentException e) {
-      assertThat(e)
-          .hasMessageThat()
-          .contains("Model provided has model identifier ' is ', should be 'TFL3'");
+      assertThat(e).hasMessageThat().contains("is not a valid flatbuffer model");
     }
   }
 
-- 
GitLab


From 2074a568810ea20c267e9d063a066b81ad491eed Mon Sep 17 00:00:00 2001
From: Sergio Guadarrama <sguada@google.com>
Date: Mon, 5 Feb 2018 15:16:29 -0800
Subject: [PATCH 675/966] Adding TensorSpec to represent the specification of
 Tensors.

PiperOrigin-RevId: 184594856
---
 tensorflow/contrib/framework/__init__.py      |   6 +
 tensorflow/python/BUILD                       |  28 +++
 tensorflow/python/framework/tensor_spec.py    | 201 ++++++++++++++++
 .../python/framework/tensor_spec_test.py      | 227 ++++++++++++++++++
 4 files changed, 462 insertions(+)
 create mode 100644 tensorflow/python/framework/tensor_spec.py
 create mode 100644 tensorflow/python/framework/tensor_spec_test.py

diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 503b868aaa..fb101c3653 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -86,6 +86,9 @@ See the @{$python/contrib.framework} guide.
 @@sort
 
 @@CriticalSection
+
+@@BoundedTensorSpec
+@@TensorSpec
 """
 
 from __future__ import absolute_import
@@ -100,6 +103,9 @@ from tensorflow.contrib.framework.python.ops import *
 from tensorflow.python.framework.ops import prepend_name_scope
 from tensorflow.python.framework.ops import strip_name_scope
 
+from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
+from tensorflow.python.framework.tensor_spec import TensorSpec
+
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = ['nest']
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a89142d814..2b4d5b8e0f 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -576,6 +576,7 @@ py_library(
         ":pywrap_tensorflow",
         ":random_seed",
         ":sparse_tensor",
+        ":tensor_spec",
         ":tensor_util",
         ":util",
         "//tensorflow/python/eager:context",
@@ -780,6 +781,18 @@ py_library(
     ],
 )
 
+py_library(
+    name = "tensor_spec",
+    srcs = ["framework/tensor_spec.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":common_shapes",
+        ":dtypes",
+        ":tensor_shape",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "tensor_util",
     srcs = ["framework/tensor_util.py"],
@@ -1148,6 +1161,21 @@ py_test(
     ],
 )
 
+py_test(
+    name = "framework_tensor_spec_test",
+    size = "small",
+    srcs = ["framework/tensor_spec_test.py"],
+    main = "framework/tensor_spec_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework_for_generated_wrappers",
+        ":framework_test_lib",
+        ":platform_test",
+        ":tensor_spec",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "framework_sparse_tensor_test",
     size = "small",
diff --git a/tensorflow/python/framework/tensor_spec.py b/tensorflow/python/framework/tensor_spec.py
new file mode 100644
index 0000000000..a0411bc3d9
--- /dev/null
+++ b/tensorflow/python/framework/tensor_spec.py
@@ -0,0 +1,201 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A TensorSpec class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+
+
+class TensorSpec(object):
+  """Describes a tf.Tensor.
+
+  A TensorSpec allows an API to describe the Tensors that it accepts or
+  returns, before that Tensor exists. This allows dynamic and flexible graph
+  construction and configuration.
+  """
+
+  __slots__ = ["_shape", "_dtype", "_name"]
+
+  def __init__(self, shape, dtype, name=None):
+    """Creates a TensorSpec.
+
+    Args:
+      shape: Value convertible to `tf.TensorShape`. The shape of the tensor.
+      dtype: Value convertible to `tf.DType`. The type of the tensor values.
+      name: Optional name for the Tensor.
+
+    Raises:
+      TypeError: If shape is not convertible to a `tf.TensorShape`, or dtype is
+        not convertible to a `tf.DType`.
+    """
+    self._shape = tensor_shape.TensorShape(shape)
+    self._dtype = dtypes.as_dtype(dtype)
+    self._name = name
+
+  @classmethod
+  def from_spec(cls, spec, name=None):
+    return cls(spec.shape, spec.dtype, name or spec.name)
+
+  @classmethod
+  def from_tensor(cls, tensor, name=None):
+    if isinstance(tensor, ops.EagerTensor):
+      return TensorSpec(tensor.shape, tensor.dtype, name)
+    elif isinstance(tensor, ops.Tensor):
+      return TensorSpec(tensor.shape, tensor.dtype, name or tensor.op.name)
+    else:
+      raise ValueError("`tensor` should be a tf.Tensor")
+
+  @property
+  def shape(self):
+    """Returns the `TensorShape` that represents the shape of the tensor."""
+    return self._shape
+
+  @property
+  def dtype(self):
+    """Returns the `dtype` of elements in the tensor."""
+    return self._dtype
+
+  @property
+  def name(self):
+    """Returns the name of the described tensor."""
+    return self._name
+
+  def is_compatible_with(self, spec_or_tensor):
+    """True if the shape and dtype of `spec_or_tensor` are compatible."""
+    return (self._dtype.is_compatible_with(spec_or_tensor.dtype) and
+            self._shape.is_compatible_with(spec_or_tensor.shape))
+
+  def __repr__(self):
+    return "TensorSpec(shape={}, dtype={}, name={})".format(
+        self.shape, repr(self.dtype), repr(self.name))
+
+  def __eq__(self, other):
+    return self.shape == other.shape and self.dtype == other.dtype
+
+  def __ne__(self, other):
+    return not self == other
+
+
+class BoundedTensorSpec(TensorSpec):
+  """A `TensorSpec` that specifies minimum and maximum values.
+
+  Example usage:
+  ```python
+  spec = tensor_spec.BoundedTensorSpec((1, 2, 3), tf.float32, 0, (5, 5, 5))
+  tf_minimum = tf.convert_to_tensor(spec.minimum, dtype=spec.dtype)
+  tf_maximum = tf.convert_to_tensor(spec.maximum, dtype=spec.dtype)
+  ```
+
+  Bounds are meant to be inclusive. This is especially important for
+  integer types. The following spec will be satisfied by tensors
+  with values in the set {0, 1, 2}:
+  ```python
+  spec = tensor_spec.BoundedTensorSpec((3, 5), tf.int32, 0, 2)
+  ```
+  """
+
+  __slots__ = ("_minimum", "_maximum")
+
+  def __init__(self, shape, dtype, minimum, maximum, name=None):
+    """Initializes a new `BoundedTensorSpec`.
+
+    Args:
+      shape: Value convertible to `tf.TensorShape`. The shape of the tensor.
+      dtype: Value convertible to `tf.DType`. The type of the tensor values.
+      minimum: Number or sequence specifying the minimum element bounds
+        (inclusive). Must be broadcastable to `shape`.
+      maximum: Number or sequence specifying the maximum element bounds
+        (inclusive). Must be broadcastable to `shape`.
+      name: Optional string containing a semantic name for the corresponding
+        array. Defaults to `None`.
+
+    Raises:
+      ValueError: If `minimum` or `maximum` are not provided or not
+        broadcastable to `shape`.
+      TypeError: If the shape is not an iterable or if the `dtype` is an invalid
+        numpy dtype.
+    """
+    super(BoundedTensorSpec, self).__init__(shape, dtype, name)
+
+    if minimum is None or maximum is None:
+      raise ValueError("minimum and maximum must be provided; but saw "
+                       "'%s' and '%s'" % (minimum, maximum))
+
+    try:
+      minimum_shape = np.shape(minimum)
+      common_shapes.broadcast_shape(
+          tensor_shape.TensorShape(minimum_shape), self.shape)
+    except ValueError as exception:
+      raise ValueError("minimum is not compatible with shape. "
+                       "Message: {!r}.".format(exception))
+
+    try:
+      maximum_shape = np.shape(maximum)
+      common_shapes.broadcast_shape(
+          tensor_shape.TensorShape(maximum_shape), self.shape)
+    except ValueError as exception:
+      raise ValueError("maximum is not compatible with shape. "
+                       "Message: {!r}.".format(exception))
+
+    self._minimum = np.array(minimum, dtype=self.dtype.as_numpy_dtype())
+    self._minimum.setflags(write=False)
+
+    self._maximum = np.array(maximum, dtype=self.dtype.as_numpy_dtype())
+    self._maximum.setflags(write=False)
+
+  @classmethod
+  def from_spec(cls, spec):
+    dtype = dtypes.as_dtype(spec.dtype)
+    if dtype in [dtypes.float64, dtypes.float32]:
+      # Avoid under/over-flow for `dtype.maximum - dtype.minimum`.
+      low = dtype.min / 2
+      high = dtype.max / 2
+    else:
+      low = dtype.min
+      high = dtype.max
+
+    minimum = getattr(spec, "minimum", low)
+    maximum = getattr(spec, "maximum", high)
+    return BoundedTensorSpec(spec.shape, dtype, minimum, maximum, spec.name)
+
+  @property
+  def minimum(self):
+    """Returns a NumPy array specifying the minimum bounds (inclusive)."""
+    return self._minimum
+
+  @property
+  def maximum(self):
+    """Returns a NumPy array specifying the maximum bounds (inclusive)."""
+    return self._maximum
+
+  def __repr__(self):
+    s = "BoundedTensorSpec(shape={}, dtype={}, name={}, minimum={}, maximum={})"
+    return s.format(self.shape, repr(self.dtype), repr(self.name),
+                    repr(self.minimum), repr(self.maximum))
+
+  def __eq__(self, other):
+    tensor_spec_eq = super(BoundedTensorSpec, self).__eq__(other)
+    return (tensor_spec_eq and np.allclose(self.minimum, other.minimum) and
+            np.allclose(self.maximum, other.maximum))
+
+
diff --git a/tensorflow/python/framework/tensor_spec_test.py b/tensorflow/python/framework/tensor_spec_test.py
new file mode 100644
index 0000000000..54ca4d9a19
--- /dev/null
+++ b/tensorflow/python/framework/tensor_spec_test.py
@@ -0,0 +1,227 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensor_spec."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class TensorSpecTest(test_util.TensorFlowTestCase):
+
+  def testAcceptsNumpyDType(self):
+    desc = tensor_spec.TensorSpec([1], np.float32)
+    self.assertEqual(desc.dtype, dtypes.float32)
+
+  def testAcceptsTensorShape(self):
+    desc = tensor_spec.TensorSpec(tensor_shape.TensorShape([1]), dtypes.float32)
+    self.assertEqual(desc.shape, tensor_shape.TensorShape([1]))
+
+  def testUnknownShape(self):
+    desc = tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32)
+    self.assertEqual(desc.shape, tensor_shape.TensorShape(None))
+
+  def testShapeCompatibility(self):
+    unknown = array_ops.placeholder(dtypes.int64)
+    partial = array_ops.placeholder(dtypes.int64, shape=[None, 1])
+    full = array_ops.placeholder(dtypes.int64, shape=[2, 3])
+    rank3 = array_ops.placeholder(dtypes.int64, shape=[4, 5, 6])
+
+    desc_unknown = tensor_spec.TensorSpec(None, dtypes.int64)
+    self.assertTrue(desc_unknown.is_compatible_with(unknown))
+    self.assertTrue(desc_unknown.is_compatible_with(partial))
+    self.assertTrue(desc_unknown.is_compatible_with(full))
+    self.assertTrue(desc_unknown.is_compatible_with(rank3))
+
+    desc_partial = tensor_spec.TensorSpec([2, None], dtypes.int64)
+    self.assertTrue(desc_partial.is_compatible_with(unknown))
+    self.assertTrue(desc_partial.is_compatible_with(partial))
+    self.assertTrue(desc_partial.is_compatible_with(full))
+    self.assertFalse(desc_partial.is_compatible_with(rank3))
+
+    desc_full = tensor_spec.TensorSpec([2, 3], dtypes.int64)
+    self.assertTrue(desc_full.is_compatible_with(unknown))
+    self.assertFalse(desc_full.is_compatible_with(partial))
+    self.assertTrue(desc_full.is_compatible_with(full))
+    self.assertFalse(desc_full.is_compatible_with(rank3))
+
+    desc_rank3 = tensor_spec.TensorSpec([4, 5, 6], dtypes.int64)
+    self.assertTrue(desc_rank3.is_compatible_with(unknown))
+    self.assertFalse(desc_rank3.is_compatible_with(partial))
+    self.assertFalse(desc_rank3.is_compatible_with(full))
+    self.assertTrue(desc_rank3.is_compatible_with(rank3))
+
+  def testTypeCompatibility(self):
+    floats = array_ops.placeholder(dtypes.float32, shape=[10, 10])
+    ints = array_ops.placeholder(dtypes.int32, shape=[10, 10])
+    desc = tensor_spec.TensorSpec(shape=(10, 10), dtype=dtypes.float32)
+    self.assertTrue(desc.is_compatible_with(floats))
+    self.assertFalse(desc.is_compatible_with(ints))
+
+  def testName(self):
+    desc = tensor_spec.TensorSpec([1], dtypes.float32, name="beep")
+    self.assertEqual(desc.name, "beep")
+
+  def testRepr(self):
+    desc1 = tensor_spec.TensorSpec([1], dtypes.float32, name="beep")
+    self.assertEqual(
+        repr(desc1),
+        "TensorSpec(shape=(1,), dtype=tf.float32, name='beep')")
+    desc2 = tensor_spec.TensorSpec([1, None], dtypes.int32)
+    self.assertEqual(
+        repr(desc2),
+        "TensorSpec(shape=(1, ?), dtype=tf.int32, name=None)")
+
+  def testFromTensorSpec(self):
+    spec_1 = tensor_spec.TensorSpec((1, 2), dtypes.int32)
+    spec_2 = tensor_spec.TensorSpec.from_spec(spec_1)
+    self.assertEqual(spec_1, spec_2)
+
+  def testFromTensor(self):
+    zero = constant_op.constant(0)
+    spec = tensor_spec.TensorSpec.from_tensor(zero)
+    self.assertEqual(spec.dtype, dtypes.int32)
+    self.assertEqual(spec.shape, [])
+    self.assertEqual(spec.name, "Const")
+
+  def testFromPlaceholder(self):
+    unknown = array_ops.placeholder(dtypes.int64, name="unknown")
+    partial = array_ops.placeholder(dtypes.float32,
+                                    shape=[None, 1],
+                                    name="partial")
+    spec_1 = tensor_spec.TensorSpec.from_tensor(unknown)
+    self.assertEqual(spec_1.dtype, dtypes.int64)
+    self.assertEqual(spec_1.shape, None)
+    self.assertEqual(spec_1.name, "unknown")
+    spec_2 = tensor_spec.TensorSpec.from_tensor(partial)
+    self.assertEqual(spec_2.dtype, dtypes.float32)
+    self.assertEqual(spec_2.shape.as_list(), [None, 1])
+    self.assertEqual(spec_2.name, "partial")
+
+  def testFromBoundedTensorSpec(self):
+    bounded_spec = tensor_spec.BoundedTensorSpec((1, 2), dtypes.int32, 0, 1)
+    spec = tensor_spec.TensorSpec.from_spec(bounded_spec)
+    self.assertEqual(bounded_spec.shape, spec.shape)
+    self.assertEqual(bounded_spec.dtype, spec.dtype)
+    self.assertEqual(bounded_spec.name, spec.name)
+
+
+class BoundedTensorSpecTest(test_util.TensorFlowTestCase):
+
+  def testInvalidMinimum(self):
+    with self.assertRaisesRegexp(ValueError, "not compatible"):
+      tensor_spec.BoundedTensorSpec((3, 5), dtypes.uint8, (0, 0, 0), (1, 1))
+
+  def testInvalidMaximum(self):
+    with self.assertRaisesRegexp(ValueError, "not compatible"):
+      tensor_spec.BoundedTensorSpec((3, 5), dtypes.uint8, 0, (1, 1, 1))
+
+  def testMinimumMaximumAttributes(self):
+    spec = tensor_spec.BoundedTensorSpec(
+        (1, 2, 3), dtypes.float32, 0, (5, 5, 5))
+    self.assertEqual(type(spec.minimum), np.ndarray)
+    self.assertEqual(type(spec.maximum), np.ndarray)
+    self.assertAllEqual(spec.minimum, np.array(0, dtype=np.float32))
+    self.assertAllEqual(spec.maximum, np.array([5, 5, 5], dtype=np.float32))
+
+  def testNotWriteableNP(self):
+    spec = tensor_spec.BoundedTensorSpec(
+        (1, 2, 3), dtypes.float32, 0, (5, 5, 5))
+    with self.assertRaisesRegexp(ValueError, "read-only"):
+      spec.minimum[0] = -1
+    with self.assertRaisesRegexp(ValueError, "read-only"):
+      spec.maximum[0] = 100
+
+  def testReuseSpec(self):
+    spec_1 = tensor_spec.BoundedTensorSpec((1, 2), dtypes.int32,
+                                           minimum=0, maximum=1)
+    spec_2 = tensor_spec.BoundedTensorSpec(
+        spec_1.shape, spec_1.dtype, spec_1.minimum, spec_1.maximum)
+    self.assertEqual(spec_1, spec_2)
+
+  def testScalarBounds(self):
+    spec = tensor_spec.BoundedTensorSpec(
+        (), dtypes.float32, minimum=0.0, maximum=1.0)
+
+    self.assertIsInstance(spec.minimum, np.ndarray)
+    self.assertIsInstance(spec.maximum, np.ndarray)
+
+    # Sanity check that numpy compares correctly to a scalar for an empty shape.
+    self.assertEqual(0.0, spec.minimum)
+    self.assertEqual(1.0, spec.maximum)
+
+    # Check that the spec doesn't fail its own input validation.
+    _ = tensor_spec.BoundedTensorSpec(
+        spec.shape, spec.dtype, spec.minimum, spec.maximum)
+
+  def testFromBoundedTensorSpec(self):
+    spec_1 = tensor_spec.BoundedTensorSpec((1, 2), dtypes.int32,
+                                           minimum=0, maximum=1)
+    spec_2 = tensor_spec.BoundedTensorSpec.from_spec(spec_1)
+    self.assertEqual(spec_1, spec_2)
+
+  def testEquality(self):
+    spec_1_1 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             0, (5, 5, 5))
+    spec_1_2 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             0.00000001,
+                                             (5, 5, 5.00000000000000001))
+    spec_2_1 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             1, (5, 5, 5))
+    spec_2_2 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             (1, 1, 1), (5, 5, 5))
+    spec_2_3 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             (1, 1, 1), 5)
+    spec_3_1 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             (2, 1, 1), (5, 5, 5))
+
+    self.assertEqual(spec_1_1, spec_1_2)
+    self.assertEqual(spec_1_2, spec_1_1)
+
+    self.assertNotEqual(spec_1_1, spec_2_2)
+    self.assertNotEqual(spec_1_1, spec_2_1)
+    self.assertNotEqual(spec_2_2, spec_1_1)
+    self.assertNotEqual(spec_2_1, spec_1_1)
+
+    self.assertEqual(spec_2_1, spec_2_2)
+    self.assertEqual(spec_2_2, spec_2_1)
+    self.assertEqual(spec_2_2, spec_2_3)
+
+    self.assertNotEqual(spec_1_1, spec_3_1)
+    self.assertNotEqual(spec_2_1, spec_3_1)
+    self.assertNotEqual(spec_2_2, spec_3_1)
+
+  def testFromTensorSpec(self):
+    spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
+    bounded_spec = tensor_spec.BoundedTensorSpec.from_spec(spec)
+    self.assertEqual(spec.shape, bounded_spec.shape)
+    self.assertEqual(spec.dtype, bounded_spec.dtype)
+    self.assertEqual(spec.dtype.min, bounded_spec.minimum)
+    self.assertEqual(spec.dtype.max, bounded_spec.maximum)
+    self.assertEqual(spec.name, bounded_spec.name)
+
+
+if __name__ == "__main__":
+  googletest.main()
-- 
GitLab


From a9485a4b59477c722ed480baec9043d04cc25ea0 Mon Sep 17 00:00:00 2001
From: Tod <todjiang@gmail.com>
Date: Tue, 6 Feb 2018 07:30:17 +0800
Subject: [PATCH 676/966] Fix the relative path issue of JNI lib loader
 (#16754)

---
 tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
index 499757e8cf..cf773e1686 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
@@ -88,7 +88,7 @@ final class NativeLibrary {
       // Deletions are in the reverse order of requests, so we need to request that the directory be
       // deleted first, so that it is empty when the request is fulfilled.
       tempPath.deleteOnExit();
-      final String tempDirectory = tempPath.toString();
+      final String tempDirectory = tempPath.getCanonicalPath();
       if (frameworkResource != null) {
         extractResource(frameworkResource, frameworkLibName, tempDirectory);
       } else {
-- 
GitLab


From 4e1ebbc2871f02cc99e8a571004bf48cfd517e80 Mon Sep 17 00:00:00 2001
From: Andy Kernahan <andrew.kernahan@gmail.com>
Date: Mon, 5 Feb 2018 23:33:50 +0000
Subject: [PATCH 677/966] Ensure bash is invoked as a login shell on windows
 otherwise fixups fail. (#16580)

---
 third_party/repo.bzl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/repo.bzl b/third_party/repo.bzl
index 11e9c842d2..aa178fa8ca 100644
--- a/third_party/repo.bzl
+++ b/third_party/repo.bzl
@@ -27,7 +27,7 @@ def _wrap_bash_cmd(ctx, cmd):
     bazel_sh = _get_env_var(ctx, "BAZEL_SH")
     if not bazel_sh:
       fail("BAZEL_SH environment variable is not set")
-    cmd = [bazel_sh, "-c", " ".join(cmd)]
+    cmd = [bazel_sh, "-l", "-c", " ".join(cmd)]
   return cmd
 
 def _get_env_var(ctx, name):
-- 
GitLab


From 395550bc423f6a5d9c96233f34f21fce6bd23b4e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 15:30:54 -0800
Subject: [PATCH 678/966] Assign total_loss in order not to crash if training
 loop exists early.

PiperOrigin-RevId: 184596877
---
 tensorflow/contrib/slim/python/slim/learning.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 54362c87b5..83f33806e0 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -738,6 +738,7 @@ def train(train_op,
   if summary_writer is not None:
     train_step_kwargs['summary_writer'] = sv.summary_writer
 
+  total_loss = 0
   should_retry = True
   while should_retry:
     try:
-- 
GitLab


From e95fbc14b7b5e48f221acf57e1cf73752172f4ce Mon Sep 17 00:00:00 2001
From: Marshal Hayes <mdhayes.07@gmail.com>
Date: Mon, 5 Feb 2018 17:35:46 -0600
Subject: [PATCH 679/966] Removing duplicate code block that raises exception
 (#16568)

For TensorFlow version 1.5.0-rc1, the code block below raises a `ValueError`. Simply remove the duplication (lines 274 - 277 are exactly the same) and the issue is resolved.

```
# Add returned summaries to writer in each step.
writer.add_summary(summary, step)
# Add metadata to visualize the graph for the last run.
if step == (num_steps - 1):
      writer.add_run_metadata(run_metadata, 'step%d' % step)
```
-- 
GitLab


From 2271f0f8c463a01af86c9e17be38e3cfc12eae11 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Mon, 5 Feb 2018 15:32:32 -0800
Subject: [PATCH 680/966] Make fold batch norm code use OneofPattern and
 rearrange functions to (maybe) be more readable.

PiperOrigin-RevId: 184597111
---
 .../quantize/python/fold_batch_norms.py       | 727 ++++++++----------
 1 file changed, 329 insertions(+), 398 deletions(-)

diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 8ec5334a39..7fa0d484ec 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -17,6 +17,7 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
 import re
 from tensorflow.contrib import graph_editor
 from tensorflow.contrib.quantize.python import common
@@ -120,12 +121,8 @@ def _FoldFusedBatchNorms(graph, freeze_batch_norm_delay, is_training):
         weights = math_ops.multiply(
             correction_scale, weights, name='correction_mult')
 
-      # TODO(suharshs): This naming of the following ops needs to carefully
-      # follow the naming expected by quantize.py. Generalize the quantize code
-      # to not require these delicate naming conventions.
       scaled_weight_tensor = math_ops.multiply(
           weights, multiplier_tensor, name='mul_fold')
-
       new_layer_tensor = _CloneWithNewOperands(
           match.layer_op, match.input_tensor, scaled_weight_tensor)
 
@@ -145,46 +142,6 @@ def _FoldFusedBatchNorms(graph, freeze_batch_norm_delay, is_training):
             'Unexpected inputs to op: %s' % match.output_tensor.name)
 
 
-def _CloneWithNewOperands(layer_op, input_tensor, weight_tensor):
-  """Clones layer_op with input_tensor and weight_tensor as new inputs."""
-  new_layer_name = layer_op.name.split('/')[-1] + '_Fold'
-  if layer_op.type == 'Conv2D':
-    return nn_ops.conv2d(
-        input_tensor,
-        weight_tensor,
-        strides=layer_op.get_attr('strides'),
-        padding=layer_op.get_attr('padding'),
-        use_cudnn_on_gpu=layer_op.get_attr('use_cudnn_on_gpu'),
-        data_format=layer_op.get_attr('data_format'),
-        name=new_layer_name)
-  elif layer_op.type == 'MatMul':
-    return math_ops.matmul(
-        input_tensor,
-        weight_tensor,
-        transpose_a=layer_op.get_attr('transpose_a'),
-        transpose_b=layer_op.get_attr('transpose_b'),
-        name=new_layer_name)
-  elif layer_op.type == 'DepthwiseConv2dNative':
-    return nn.depthwise_conv2d(
-        input_tensor,
-        weight_tensor,
-        strides=layer_op.get_attr('strides'),
-        padding=layer_op.get_attr('padding'),
-        name=new_layer_name)
-  else:
-    raise ValueError('Cannot handle operation of type: %s' % layer_op.type)
-
-
-@ops.RegisterGradient('FoldFusedBatchNormGrad')
-def _FoldFusedBatchNormGrad(op, unused_grad_y, grad_mean, grad_var, unused_1,
-                            unused_2):
-  x = op.inputs[0]
-  n = x.get_shape().num_elements() / grad_mean.get_shape().num_elements()
-  dmean_dx = grad_mean / n
-  dvar_dx = 2 * grad_var * (x - op.outputs[1]) / (n - 1)
-  return (dmean_dx + dvar_dx), None, None, None, None
-
-
 def _FindFusedBatchNorms(graph):
   """Finds all ops and tensors related to found FusedBatchNorms.
 
@@ -203,68 +160,57 @@ def _FindFusedBatchNorms(graph):
 
   moving_average_pattern = graph_matcher.OpTypePattern('*')
   bn_decay_pattern = graph_matcher.OpTypePattern('*')
-  conv_pattern = graph_matcher.OpTypePattern(
-      'Conv2D|DepthwiseConv2dNative', inputs=[input_pattern, weight_pattern])
+  layer_pattern = graph_matcher.OpTypePattern(
+      'Conv2D|DepthwiseConv2dNative|MatMul',
+      inputs=[input_pattern, weight_pattern])
   # MatMul has a Reshape between it and FusedBatchNorm.
-  matmul_pattern = graph_matcher.OpTypePattern(
-      'MatMul', inputs=[input_pattern, weight_pattern])
   matmul_reshape_pattern = graph_matcher.OpTypePattern(
-      'Reshape', inputs=[matmul_pattern,
+      'Reshape', inputs=[layer_pattern,
                          graph_matcher.OpTypePattern('*')])
 
-  conv_batch_norm_pattern = graph_matcher.OpTypePattern(
+  batch_norm_pattern = graph_matcher.OpTypePattern(
       'FusedBatchNorm',
       inputs=[
-          conv_pattern, gamma_pattern, beta_pattern, mean_pattern,
-          variance_pattern
-      ])
-  conv_moving_average_sub_pattern = graph_matcher.OpTypePattern(
-      'Sub', inputs=[moving_average_pattern, conv_batch_norm_pattern])
-  # TODO(suharshs): Use a OneofPattern here when available
-  conv_moving_average_mul_pattern = graph_matcher.OpTypePattern(
-      'Mul', inputs=[conv_moving_average_sub_pattern, bn_decay_pattern])
-  matmul_batch_norm_pattern = graph_matcher.OpTypePattern(
-      'FusedBatchNorm',
-      inputs=[
-          matmul_reshape_pattern, gamma_pattern, beta_pattern, mean_pattern,
-          variance_pattern
+          graph_matcher.OneofPattern([matmul_reshape_pattern, layer_pattern]),
+          gamma_pattern, beta_pattern, mean_pattern, variance_pattern
       ])
   matmul_bn_output_reshape_pattern = graph_matcher.OpTypePattern(
-      'Reshape',
-      inputs=[matmul_batch_norm_pattern,
-              graph_matcher.OpTypePattern('*')])
-
-  matmul_moving_average_sub_pattern = graph_matcher.OpTypePattern(
-      'Sub', inputs=[moving_average_pattern, matmul_batch_norm_pattern])
-  matmul_moving_average_mul_pattern = graph_matcher.OpTypePattern(
-      'Mul', inputs=[matmul_moving_average_sub_pattern, bn_decay_pattern])
-
-  conv_matcher = graph_matcher.GraphMatcher(conv_batch_norm_pattern)
-  matmul_matcher = graph_matcher.GraphMatcher(matmul_bn_output_reshape_pattern)
-  conv_moving_average_mul_matcher = graph_matcher.GraphMatcher(
-      conv_moving_average_mul_pattern)
-  matmul_moving_average_mul_matcher = graph_matcher.GraphMatcher(
-      matmul_moving_average_mul_pattern)
-
-  def _GetMovingAverageTensors(graph, moving_avg_mul_matcher,
-                               moving_avg_sub_pattern, bn_op):
-    """Gets the moving mean and variance tensors and the batch norm momentum."""
-    for mul_match_result in moving_avg_mul_matcher.match_graph(graph):
-      sub_op = mul_match_result.get_op(moving_avg_sub_pattern)
-
-      if sub_op.inputs[1].name == bn_op.outputs[1].name:
-        # During training: Batch Mean is bn_op.outputs[1]
-        moving_mean_tensor = sub_op.inputs[0]
-        bn_decay_mean_tensor = mul_match_result.get_tensor(bn_decay_pattern)
-      if sub_op.inputs[1].name == bn_op.outputs[2].name:
-        # During training: Batch Var is bn_op.outputs[2]
-        moving_variance_tensor = sub_op.inputs[0]
-        bn_decay_var_tensor = mul_match_result.get_tensor(bn_decay_pattern)
-    return (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor,
-            bn_decay_var_tensor)
-
-  def _GetCommonTensors(match_result, bn_op, bn_input_tensor):
-    """Gets tensors needed for FusedBatchNormMatch from match_result."""
+      'Reshape', inputs=[batch_norm_pattern,
+                         graph_matcher.OpTypePattern('*')])
+
+  bn_matcher = graph_matcher.GraphMatcher(
+      graph_matcher.OneofPattern(
+          [matmul_bn_output_reshape_pattern, batch_norm_pattern]))
+
+  moving_average_sub_pattern = graph_matcher.OpTypePattern(
+      'Sub', inputs=[moving_average_pattern, batch_norm_pattern])
+  moving_average_mul_pattern = graph_matcher.OpTypePattern(
+      'Mul', inputs=[moving_average_sub_pattern, bn_decay_pattern])
+
+  moving_avg_mul_matcher = graph_matcher.GraphMatcher(
+      moving_average_mul_pattern)
+
+  for match_result in bn_matcher.match_graph(graph):
+    moving_mean_tensor = None
+    moving_variance_tensor = None
+    bn_decay_mean_tensor = None
+    bn_decay_var_tensor = None
+    layer_op = match_result.get_op(layer_pattern)
+    layer_tensor = match_result.get_tensor(layer_pattern)
+    bn_op = match_result.get_op(batch_norm_pattern)
+    batch_epsilon_tensor = bn_op.get_attr('epsilon')
+
+    # In the MatMul case, the output of batch norm is reshaped back into a
+    # 2D tensor, so the output_tensor is the output of the Reshape op.
+    output_tensor = bn_op.outputs[0]
+    if layer_op.type == 'MatMul':
+      output_reshape_op = match_result.get_op(matmul_bn_output_reshape_pattern)
+      # If the matcher didn't match matmul_bn_output_reshape, there will be
+      # another match for this 'MatMul' later, so we can skip this one.
+      if output_reshape_op is None:
+        continue
+      output_tensor = output_reshape_op.outputs[0]
+
     input_tensor = match_result.get_tensor(input_pattern)
     weight_tensor = match_result.get_tensor(weight_pattern)
     gamma_tensor = match_result.get_tensor(gamma_pattern)
@@ -295,40 +241,25 @@ def _FindFusedBatchNorms(graph):
       g = ops.get_default_graph()
       with g.as_default(), g.name_scope(scope + sep):
         n = math_ops.cast(
-            array_ops.size(bn_input_tensor) / array_ops.size(mean_tensor),
+            array_ops.size(layer_tensor) / array_ops.size(mean_tensor),
             dtypes.float32)
         variance_tensor = math_ops.multiply(
             bn_op.outputs[2], (n - 1) / n, name='Undo_Bessel_Correction')
+      # TODO(suharshs): Find a way to get rid of this inner match.
+      for mul_match_result in moving_avg_mul_matcher.match_graph(graph):
+        sub_op = mul_match_result.get_op(moving_average_sub_pattern)
+        if sub_op.inputs[1].name == bn_op.outputs[1].name:
+          # During training: Batch Mean is bn_op.outputs[1]
+          moving_mean_tensor = sub_op.inputs[0]
+          bn_decay_mean_tensor = mul_match_result.get_tensor(bn_decay_pattern)
+        if sub_op.inputs[1].name == bn_op.outputs[2].name:
+          # During training: Batch Var is bn_op.outputs[2]
+          moving_variance_tensor = sub_op.inputs[0]
+          bn_decay_var_tensor = mul_match_result.get_tensor(bn_decay_pattern)
     else:
       mean_tensor = match_result.get_tensor(mean_pattern)
       variance_tensor = match_result.get_tensor(variance_pattern)
-    return (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-            variance_tensor)
-
-  for match_result in conv_matcher.match_graph(graph):
-    moving_mean_tensor = None
-    moving_variance_tensor = None
-    bn_decay_mean_tensor = None
-    bn_decay_var_tensor = None
-    layer_op = match_result.get_op(conv_pattern)
-    layer_tensor = match_result.get_tensor(conv_pattern)
-    bn_op = match_result.get_op(conv_batch_norm_pattern)
-    if bn_op.get_attr('is_training'):
-      (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor,
-       bn_decay_var_tensor) = _GetMovingAverageTensors(
-           graph,
-           moving_avg_mul_matcher=conv_moving_average_mul_matcher,
-           moving_avg_sub_pattern=conv_moving_average_sub_pattern,
-           bn_op=bn_op)
 
-    output_tensor = bn_op.outputs[0]
-    batch_epsilon_tensor = bn_op.get_attr('epsilon')
-    (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-     variance_tensor) = _GetCommonTensors(
-         match_result,
-         bn_op,
-         layer_tensor,
-     )
     yield _BatchNormMatch(
         layer_op=layer_op,
         bn_op=bn_op,
@@ -345,124 +276,146 @@ def _FindFusedBatchNorms(graph):
         bn_decay_var_tensor=bn_decay_var_tensor,
         batch_epsilon_tensor=batch_epsilon_tensor)
 
-  for match_result in matmul_matcher.match_graph(graph):
-    moving_mean_tensor = None
-    moving_variance_tensor = None
-    bn_decay_mean_tensor = None
-    bn_decay_var_tensor = None
-    layer_op = match_result.get_op(matmul_pattern)
-    layer_tensor = match_result.get_tensor(matmul_pattern)
-    bn_op = match_result.get_op(matmul_batch_norm_pattern)
-    if bn_op.get_attr('is_training'):
-      (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor,
-       bn_decay_var_tensor) = _GetMovingAverageTensors(
-           graph,
-           moving_avg_mul_matcher=matmul_moving_average_mul_matcher,
-           moving_avg_sub_pattern=matmul_moving_average_sub_pattern,
-           bn_op=bn_op)
-
-    # In the MatMul case, the output of batch norm is reshaped back into a
-    # 2D tensor, so the output_tensor is the output of the Reshape op.
-    output_reshape_op = match_result.get_op(matmul_bn_output_reshape_pattern)
-    output_tensor = output_reshape_op.outputs[0]
-    batch_epsilon_tensor = bn_op.get_attr('epsilon')
 
-    (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-     variance_tensor) = _GetCommonTensors(match_result, bn_op, layer_tensor)
-    yield _BatchNormMatch(
-        layer_op=layer_op,
-        bn_op=bn_op,
-        output_tensor=output_tensor,
-        input_tensor=input_tensor,
-        weight_tensor=weight_tensor,
-        gamma_tensor=gamma_tensor,
-        beta_tensor=beta_tensor,
-        mean_tensor=mean_tensor,
-        variance_tensor=variance_tensor,
-        moving_mean_tensor=moving_mean_tensor,
-        moving_variance_tensor=moving_variance_tensor,
-        bn_decay_mean_tensor=bn_decay_mean_tensor,
-        bn_decay_var_tensor=bn_decay_var_tensor,
-        batch_epsilon_tensor=batch_epsilon_tensor)
+def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
+                                 fused_batch_norm):
+  """Computes batch norm correction params.
 
+     Before batch normalization is frozen:
+     We use batch statistics for batch norm.
+       correction_scale = sigma_b/sigma_mv
+       correction_recip = 1/correction_scale
+       correction_offset = 0
 
-class _BatchNormMatch(object):
-  """Contains all information related to a found Fused/UnfusedBatchNorm."""
+     After batch normalization is frozen:
+      correction_scale = sigma_b/sigma_mv
+      correction_recip = 1
+      correction_offset =  gamma*(mu_b/sigma_b-mu_mv/sigma_mv).
 
-  def __init__(self, layer_op, bn_op, output_tensor, input_tensor,
-               weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-               variance_tensor, moving_mean_tensor, moving_variance_tensor,
-               bn_decay_mean_tensor, bn_decay_var_tensor, batch_epsilon_tensor):
-    self._layer_op = layer_op
-    self._bn_op = bn_op
-    self._output_tensor = output_tensor
-    self._input_tensor = input_tensor
-    self._weight_tensor = weight_tensor
-    self._gamma_tensor = gamma_tensor
-    self._beta_tensor = beta_tensor
-    self._mean_tensor = mean_tensor
-    self._variance_tensor = variance_tensor
-    self._moving_mean_tensor = moving_mean_tensor
-    self._moving_variance_tensor = moving_variance_tensor
-    self._bn_decay_mean_tensor = bn_decay_mean_tensor
-    self._bn_decay_var_tensor = bn_decay_var_tensor
-    self._batch_epsilon_tensor = batch_epsilon_tensor
+     Batch norm is frozen if global_step > bn_freeze_delay.
+     The corrections ensure that:
+     a) The weights are quantized after scaling by gamma/sigma_mv. This enables
+     smoother training as the scaling on the weights changes slowly, rather than
+     jump across mini-batches
+     b) Changing the values of the corrections allows for one to switch between
+     using batch statistics to using moving mean and average, without requiring
+     changes to batch_norm
 
-  @property
-  def layer_op(self):
-    return self._layer_op
 
-  @property
-  def bn_op(self):
-    return self._bn_op
+  Args:
+    context: The scope under which we look for batch norm params
+    match: Object containg required batch norm tensors for correction
+      computation
+    freeze_batch_norm_delay: Delay in steps at which computation switches
+      from regular batch norm to frozen mean and variance.
+    fused_batch_norm: Bool, true if fused batch norm is used
 
-  @property
-  def output_tensor(self):
-    return self._output_tensor
+  Returns:
+    A tuple of correction_scale, correction_recip, correction_offset
+  """
 
-  @property
-  def input_tensor(self):
-    return self._input_tensor
+  g = ops.get_default_graph()
+  with g.name_scope(context + '/batch_norm_correction'):
+    recip_sigma_mv = math_ops.rsqrt(
+        match.moving_variance_tensor + match.batch_epsilon_tensor)
+    recip_sigma = math_ops.rsqrt(
+        match.variance_tensor + match.batch_epsilon_tensor)
+    correction_scale = math_ops.divide(
+        recip_sigma_mv, recip_sigma, name='scale_compute')
+    correction_scale = array_ops.identity(
+        correction_scale, name='correction_scale')
+    correction_recip = math_ops.reciprocal(
+        correction_scale, name='reciprocal_compute')
+    correction_offset = math_ops.multiply(
+        match.gamma_tensor,
+        match.mean_tensor * recip_sigma -
+        match.moving_mean_tensor * recip_sigma_mv,
+        name='offset_compute')
 
-  @property
-  def weight_tensor(self):
-    return self._weight_tensor
+    if freeze_batch_norm_delay is not None:
+      use_mv_avg = math_ops.greater_equal(
+          training_util.get_or_create_global_step(),
+          freeze_batch_norm_delay,
+          name='use_moving_average')
+    else:
+      use_mv_avg = False
 
-  @property
-  def gamma_tensor(self):
-    return self._gamma_tensor
+    bn_decay_zero = 0.0
+    bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())
+    bn_decay_var_consumers = list(match.bn_decay_mean_tensor.consumers())
 
-  @property
-  def beta_tensor(self):
-    return self._beta_tensor
+    bn_decay_mean_out = utils.smart_cond(
+        use_mv_avg,
+        lambda: bn_decay_zero,
+        lambda: match.bn_decay_mean_tensor,
+        name='freeze_moving_mean')
+    graph_editor.reroute_ts(
+        [bn_decay_mean_out], [match.bn_decay_mean_tensor],
+        can_modify=bn_decay_mean_consumers)
 
-  @property
-  def mean_tensor(self):
-    return self._mean_tensor
+    if fused_batch_norm is False:
+      bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers())
+      bn_decay_var_out = utils.smart_cond(
+          use_mv_avg,
+          lambda: bn_decay_zero,
+          lambda: match.bn_decay_var_tensor,
+          name='freeze_moving_var')
+      graph_editor.reroute_ts(
+          [bn_decay_var_out], [match.bn_decay_var_tensor],
+          can_modify=bn_decay_var_consumers)
 
-  @property
-  def variance_tensor(self):
-    return self._variance_tensor
+    correction_recip = utils.smart_cond(
+        use_mv_avg,
+        lambda: array_ops.ones(correction_scale.shape),
+        lambda: correction_recip,
+        name='correction_recip')
 
-  @property
-  def moving_mean_tensor(self):
-    return self._moving_mean_tensor
+    correction_offset = utils.smart_cond(
+        use_mv_avg,
+        lambda: correction_offset,
+        lambda: array_ops.zeros(correction_offset.shape),
+        name='correction_offset')
+  return correction_scale, correction_recip, correction_offset
 
-  @property
-  def moving_variance_tensor(self):
-    return self._moving_variance_tensor
 
-  @property
-  def batch_epsilon_tensor(self):
-    return self._batch_epsilon_tensor
+def _CloneWithNewOperands(layer_op, input_tensor, weight_tensor):
+  """Clones layer_op with input_tensor and weight_tensor as new inputs."""
+  new_layer_name = layer_op.name.split('/')[-1] + '_Fold'
+  if layer_op.type == 'Conv2D':
+    return nn_ops.conv2d(
+        input_tensor,
+        weight_tensor,
+        strides=layer_op.get_attr('strides'),
+        padding=layer_op.get_attr('padding'),
+        use_cudnn_on_gpu=layer_op.get_attr('use_cudnn_on_gpu'),
+        data_format=layer_op.get_attr('data_format'),
+        name=new_layer_name)
+  elif layer_op.type == 'MatMul':
+    return math_ops.matmul(
+        input_tensor,
+        weight_tensor,
+        transpose_a=layer_op.get_attr('transpose_a'),
+        transpose_b=layer_op.get_attr('transpose_b'),
+        name=new_layer_name)
+  elif layer_op.type == 'DepthwiseConv2dNative':
+    return nn.depthwise_conv2d(
+        input_tensor,
+        weight_tensor,
+        strides=layer_op.get_attr('strides'),
+        padding=layer_op.get_attr('padding'),
+        name=new_layer_name)
+  else:
+    raise ValueError('Cannot handle operation of type: %s' % layer_op.type)
 
-  @property
-  def bn_decay_mean_tensor(self):
-    return self._bn_decay_mean_tensor
 
-  @property
-  def bn_decay_var_tensor(self):
-    return self._bn_decay_var_tensor
+@ops.RegisterGradient('FoldFusedBatchNormGrad')
+def _FoldFusedBatchNormGrad(op, unused_grad_y, grad_mean, grad_var, unused_1,
+                            unused_2):
+  x = op.inputs[0]
+  n = x.get_shape().num_elements() / grad_mean.get_shape().num_elements()
+  dmean_dx = grad_mean / n
+  dvar_dx = 2 * grad_var * (x - op.outputs[1]) / (n - 1)
+  return (dmean_dx + dvar_dx), None, None, None, None
 
 
 def _FoldUnfusedBatchNorms(graph, freeze_batch_norm_delay, is_training):
@@ -475,81 +428,42 @@ def _FoldUnfusedBatchNorms(graph, freeze_batch_norm_delay, is_training):
     graph: Graph to walk and modify.
     freeze_batch_norm_delay: How many steps to wait before freezing
     moving mean and variance and using them for batch normalization
-    is_training: Bool, True if training
-
-  Raises:
-    ValueError: When batch norm folding fails.
-  """
-  input_to_ops_map = input_to_ops.InputToOps(graph)
-
-  for bn in common.BatchNormGroups(graph):
-    has_scaling = _HasScaling(graph, input_to_ops_map, bn)
-
-    # The mangling code intimately depends on BatchNorm node's internals.
-    original_op, folded_op = _CreateFoldedOp(
-        graph,
-        bn,
-        has_scaling=has_scaling,
-        freeze_batch_norm_delay=freeze_batch_norm_delay,
-        is_training=is_training)
-
-    activation = common.GetEndpointActivationOp(graph, bn)
-    if activation:
-      nodes_modified_count = graph_editor.reroute_ts([folded_op.outputs[0]],
-                                                     [original_op.outputs[0]],
-                                                     can_modify=[activation])
-      if nodes_modified_count != 1:
-        raise ValueError('Unexpected inputs to op: %s' % activation.name)
-      continue
-
-    # Treat consumer ops in bypass modules differently since they have Add
-    # operations instead of Relu* above.
-    add_bypass_ctx = re.search(r'^(.*)/([^/]+)', bn).group(1)
-    add_bypass = graph.get_operation_by_name(add_bypass_ctx + '/Add')
-    nodes_modified_count = graph_editor.reroute_ts([folded_op.outputs[0]],
-                                                   [original_op.outputs[0]],
-                                                   can_modify=[add_bypass])
-    if nodes_modified_count != 1:
-      raise ValueError('Unexpected inputs to op: %s' % add_bypass.name)
-
-
-def _HasScaling(graph, input_to_ops_map, bn):
-  r"""Checks if batch norm  has scaling enabled.
-
-  Difference between batch norm with scaling and without is that with scaling:
-
-  Rsqrt -> mul -> mul_1
-              \-> mul_2
-
-  where
-    mul multiplies gamma by inverse square root of EMA of batch variance,
-    mul_1 multiplies output of mul with output from the base operation
-      (convolution, FC or depthwise convolution),
-    mul_2 multiplies output of mul with EMA of batch mean,
-  and without scaling:
-
-  Rsqrt -> mul
-       \-> mul_1
-
-  where
-    mul multiplies the inverse square root of EMA of batch variance with output
-      from the base operation,
-    mul_1 multiplies inverse square root of EMA of batch variance with EMA
-      of batch mean.
-
-  Args:
-    graph: Graph to inspect.
-    input_to_ops_map: InputToOps object containing mapping from tensor's name
-      to ops that take it as input.
-    bn: Batch norm layer prefix string.
+    is_training: Bool, True if training
 
-  Returns:
-    A boolean indicating whether this batch norm layer has scaling enabled.
+  Raises:
+    ValueError: When batch norm folding fails.
   """
-  rsqrt_op = graph.get_operation_by_name(bn + '/BatchNorm/batchnorm/Rsqrt')
-  rsqrt_consumers = input_to_ops_map.ConsumerOperations(rsqrt_op)
+  input_to_ops_map = input_to_ops.InputToOps(graph)
 
-  return sum(1 for op in rsqrt_consumers if op.type == 'Mul') == 1
+  for bn in common.BatchNormGroups(graph):
+    has_scaling = _HasScaling(graph, input_to_ops_map, bn)
+
+    # The mangling code intimately depends on BatchNorm node's internals.
+    original_op, folded_op = _CreateFoldedOp(
+        graph,
+        bn,
+        has_scaling=has_scaling,
+        freeze_batch_norm_delay=freeze_batch_norm_delay,
+        is_training=is_training)
+
+    activation = common.GetEndpointActivationOp(graph, bn)
+    if activation:
+      nodes_modified_count = graph_editor.reroute_ts([folded_op.outputs[0]],
+                                                     [original_op.outputs[0]],
+                                                     can_modify=[activation])
+      if nodes_modified_count != 1:
+        raise ValueError('Unexpected inputs to op: %s' % activation.name)
+      continue
+
+    # Treat consumer ops in bypass modules differently since they have Add
+    # operations instead of Relu* above.
+    add_bypass_ctx = re.search(r'^(.*)/([^/]+)', bn).group(1)
+    add_bypass = graph.get_operation_by_name(add_bypass_ctx + '/Add')
+    nodes_modified_count = graph_editor.reroute_ts([folded_op.outputs[0]],
+                                                   [original_op.outputs[0]],
+                                                   can_modify=[add_bypass])
+    if nodes_modified_count != 1:
+      raise ValueError('Unexpected inputs to op: %s' % add_bypass.name)
 
 
 def _GetBatchNormParams(graph, context, has_scaling):
@@ -629,107 +543,6 @@ def _GetBatchNormParams(graph, context, has_scaling):
       batch_epsilon_tensor=batch_epsilon_tensor)
 
 
-def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
-                                 fused_batch_norm):
-  """Computes batch norm correction params.
-
-     Before batch normalization is frozen:
-     We use batch statistics for batch norm.
-       correction_scale = sigma_b/sigma_mv
-       correction_recip = 1/correction_scale
-       correction_offset = 0
-
-     After batch normalization is frozen:
-      correction_scale = sigma_b/sigma_mv
-      correction_recip = 1
-      correction_offset =  gamma*(mu_b/sigma_b-mu_mv/sigma_mv).
-
-     Batch norm is frozen if global_step > bn_freeze_delay.
-     The corrections ensure that:
-     a) The weights are quantized after scaling by gamma/sigma_mv. This enables
-     smoother training as the scaling on the weights changes slowly, rather than
-     jump across mini-batches
-     b) Changing the values of the corrections allows for one to switch between
-     using batch statistics to using moving mean and average, without requiring
-     changes to batch_norm
-
-
-  Args:
-    context: The scope under which we look for batch norm params
-    match: Object containg required batch norm tensors for correction
-      computation
-    freeze_batch_norm_delay: Delay in steps at which computation switches
-      from regular batch norm to frozen mean and variance.
-    fused_batch_norm: Bool, true if fused batch norm is used
-
-  Returns:
-    A tuple of correction_scale, correction_recip, correction_offset
-  """
-
-  g = ops.get_default_graph()
-  with g.name_scope(context + 'batch_norm_correction'):
-    recip_sigma_mv = math_ops.rsqrt(
-        match.moving_variance_tensor + match.batch_epsilon_tensor)
-    recip_sigma = math_ops.rsqrt(
-        match.variance_tensor + match.batch_epsilon_tensor)
-    correction_scale = math_ops.divide(
-        recip_sigma_mv, recip_sigma, name='scale_compute')
-    correction_scale = array_ops.identity(
-        correction_scale, name='correction_scale')
-    correction_recip = math_ops.reciprocal(
-        correction_scale, name='reciprocal_compute')
-    correction_offset = math_ops.multiply(
-        match.gamma_tensor,
-        match.mean_tensor * recip_sigma -
-        match.moving_mean_tensor * recip_sigma_mv,
-        name='offset_compute')
-
-    if freeze_batch_norm_delay is not None:
-      use_mv_avg = math_ops.greater_equal(
-          training_util.get_or_create_global_step(),
-          freeze_batch_norm_delay,
-          name='use_moving_average')
-    else:
-      use_mv_avg = False
-
-    bn_decay_zero = 0.0
-    bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())
-    bn_decay_var_consumers = list(match.bn_decay_mean_tensor.consumers())
-
-    bn_decay_mean_out = utils.smart_cond(
-        use_mv_avg,
-        lambda: bn_decay_zero,
-        lambda: match.bn_decay_mean_tensor,
-        name='freeze_moving_mean')
-    graph_editor.reroute_ts(
-        [bn_decay_mean_out], [match.bn_decay_mean_tensor],
-        can_modify=bn_decay_mean_consumers)
-
-    if fused_batch_norm is False:
-      bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers())
-      bn_decay_var_out = utils.smart_cond(
-          use_mv_avg,
-          lambda: bn_decay_zero,
-          lambda: match.bn_decay_var_tensor,
-          name='freeze_moving_var')
-      graph_editor.reroute_ts(
-          [bn_decay_var_out], [match.bn_decay_var_tensor],
-          can_modify=bn_decay_var_consumers)
-
-    correction_recip = utils.smart_cond(
-        use_mv_avg,
-        lambda: array_ops.ones(correction_scale.shape),
-        lambda: correction_recip,
-        name='correction_recip')
-
-    correction_offset = utils.smart_cond(
-        use_mv_avg,
-        lambda: correction_offset,
-        lambda: array_ops.zeros(correction_offset.shape),
-        name='correction_offset')
-  return correction_scale, correction_recip, correction_offset
-
-
 def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay,
                     is_training):
   """Folds in batch norm layer into preceding convolution or FC layer.
@@ -961,3 +774,121 @@ def _AssertShapesMatch(op_name, in_tensor, out_tensor):
   if not in_shape.is_compatible_with(out_shape):
     raise ValueError('%s should not change tensor shape: input %s, '
                      'output %s' % (op_name, in_shape, out_shape))
+
+
+def _HasScaling(graph, input_to_ops_map, bn):
+  r"""Checks if batch norm  has scaling enabled.
+
+  Difference between batch norm with scaling and without is that with scaling:
+
+  Rsqrt -> mul -> mul_1
+              \-> mul_2
+
+  where
+    mul multiplies gamma by inverse square root of EMA of batch variance,
+    mul_1 multiplies output of mul with output from the base operation
+      (convolution, FC or depthwise convolution),
+    mul_2 multiplies output of mul with EMA of batch mean,
+  and without scaling:
+
+  Rsqrt -> mul
+       \-> mul_1
+
+  where
+    mul multiplies the inverse square root of EMA of batch variance with output
+      from the base operation,
+    mul_1 multiplies inverse square root of EMA of batch variance with EMA
+      of batch mean.
+
+  Args:
+    graph: Graph to inspect.
+    input_to_ops_map: InputToOps object containing mapping from tensor's name
+      to ops that take it as input.
+    bn: Batch norm layer prefix string.
+
+  Returns:
+    A boolean indicating whether this batch norm layer has scaling enabled.
+  """
+  rsqrt_op = graph.get_operation_by_name(bn + '/BatchNorm/batchnorm/Rsqrt')
+  rsqrt_consumers = input_to_ops_map.ConsumerOperations(rsqrt_op)
+
+  return sum(1 for op in rsqrt_consumers if op.type == 'Mul') == 1
+
+
+class _BatchNormMatch(object):
+  """Contains all information related to a found Fused/UnfusedBatchNorm."""
+
+  def __init__(self, layer_op, bn_op, output_tensor, input_tensor,
+               weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
+               variance_tensor, moving_mean_tensor, moving_variance_tensor,
+               bn_decay_mean_tensor, bn_decay_var_tensor, batch_epsilon_tensor):
+    self._layer_op = layer_op
+    self._bn_op = bn_op
+    self._output_tensor = output_tensor
+    self._input_tensor = input_tensor
+    self._weight_tensor = weight_tensor
+    self._gamma_tensor = gamma_tensor
+    self._beta_tensor = beta_tensor
+    self._mean_tensor = mean_tensor
+    self._variance_tensor = variance_tensor
+    self._moving_mean_tensor = moving_mean_tensor
+    self._moving_variance_tensor = moving_variance_tensor
+    self._bn_decay_mean_tensor = bn_decay_mean_tensor
+    self._bn_decay_var_tensor = bn_decay_var_tensor
+    self._batch_epsilon_tensor = batch_epsilon_tensor
+
+  @property
+  def layer_op(self):
+    return self._layer_op
+
+  @property
+  def bn_op(self):
+    return self._bn_op
+
+  @property
+  def output_tensor(self):
+    return self._output_tensor
+
+  @property
+  def input_tensor(self):
+    return self._input_tensor
+
+  @property
+  def weight_tensor(self):
+    return self._weight_tensor
+
+  @property
+  def gamma_tensor(self):
+    return self._gamma_tensor
+
+  @property
+  def beta_tensor(self):
+    return self._beta_tensor
+
+  @property
+  def mean_tensor(self):
+    return self._mean_tensor
+
+  @property
+  def variance_tensor(self):
+    return self._variance_tensor
+
+  @property
+  def moving_mean_tensor(self):
+    return self._moving_mean_tensor
+
+  @property
+  def moving_variance_tensor(self):
+    return self._moving_variance_tensor
+
+  @property
+  def batch_epsilon_tensor(self):
+    return self._batch_epsilon_tensor
+
+  @property
+  def bn_decay_mean_tensor(self):
+    return self._bn_decay_mean_tensor
+
+  @property
+  def bn_decay_var_tensor(self):
+    return self._bn_decay_var_tensor
-- 
GitLab


From 5476489053f0523b8aebab05bc39a02c089300e0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 15:46:50 -0800
Subject: [PATCH 681/966] [XLA] Sink layout sensitivity from CSE into
 HloInstruction::Identical, and make it the default.

PiperOrigin-RevId: 184598903
---
 tensorflow/compiler/xla/service/hlo_cse.cc    |  5 ++--
 .../compiler/xla/service/hlo_instruction.cc   | 17 ++++++------
 .../compiler/xla/service/hlo_instruction.h    | 27 ++++++++++++++-----
 3 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index 7feda2b3b0..279edd4ba8 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -119,9 +119,8 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
           equivalent_instructions;
       for (HloInstruction* user : operand->users()) {
         if (user != instruction &&
-            user->Identical(*instruction, eq_instructions, eq_computations) &&
-            (!is_layout_sensitive_ ||
-             ShapeUtil::Equal(user->shape(), instruction->shape()))) {
+            user->Identical(*instruction, eq_instructions, eq_computations,
+                            is_layout_sensitive_)) {
           equivalent_instructions.push_back(user);
         }
       }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index fac6b43405..277648f072 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -1612,7 +1612,8 @@ bool HloInstruction::HasConstantOperand() const {
 bool HloInstruction::IdenticalSlowPath(
     const HloInstruction& other,
     const std::function<bool(const HloComputation*, const HloComputation*)>&
-        eq_computations) const {
+        eq_computations,
+    const std::function<bool(const Shape&, const Shape&)>& eq_shapes) const {
   // Perform opcode specific checks.
   switch (opcode()) {
     // The result of these instructions only depend upon their opcode and
@@ -1671,7 +1672,7 @@ bool HloInstruction::IdenticalSlowPath(
       return parameter_number() == other.parameter_number() &&
              // Check the shape too because `this` and `other` may be in
              // different HloComputations.
-             ShapeUtil::Compatible(shape(), other.shape());
+             eq_shapes(shape(), other.shape());
 
     case HloOpcode::kBatchNormTraining:
     case HloOpcode::kBatchNormInference:
@@ -1727,18 +1728,18 @@ bool HloInstruction::IdenticalSlowPath(
              protobuf_util::ProtobufEquals(window(), other.window());
 
     case HloOpcode::kReshape:
-      return ShapeUtil::Compatible(shape(), other.shape());
+      return eq_shapes(shape(), other.shape());
 
     // Transpose result is determined by the final shape and the permutation.
     case HloOpcode::kTranspose:
-      return ShapeUtil::Compatible(shape(), other.shape()) &&
+      return eq_shapes(shape(), other.shape()) &&
              dimensions() == other.dimensions();
 
     // Remaining instructions with special values.
     case HloOpcode::kBitcast:
-      return ShapeUtil::Equal(shape(), other.shape());
+      return eq_shapes(shape(), other.shape());
     case HloOpcode::kBroadcast:
-      return ShapeUtil::Compatible(shape(), other.shape()) &&
+      return eq_shapes(shape(), other.shape()) &&
              dimensions() == other.dimensions();
     case HloOpcode::kConcatenate:
       return dimensions() == other.dimensions();
@@ -1752,10 +1753,10 @@ bool HloInstruction::IdenticalSlowPath(
              slice_limits_ == other.slice_limits_ &&
              slice_strides_ == other.slice_strides_;
     case HloOpcode::kDynamicSlice:
-      return ShapeUtil::Compatible(shape(), other.shape()) &&
+      return eq_shapes(shape(), other.shape()) &&
              dynamic_slice_sizes_ == other.dynamic_slice_sizes_;
     case HloOpcode::kDynamicUpdateSlice:
-      return ShapeUtil::Compatible(shape(), other.shape());
+      return eq_shapes(shape(), other.shape());
     case HloOpcode::kCall:
     case HloOpcode::kMap:
       return eq_computations(to_apply(), other.to_apply());
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index bce9ebdda8..50931c563a 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -554,27 +554,36 @@ class HloInstruction {
   }
 
   // Returns true if "other" performs the same computation as this instruction.
-  // Layout of the instructions' output array is not considered.
   bool Identical(
       const HloInstruction& other,
       const std::function<bool(const HloInstruction*, const HloInstruction*)>&
           eq_operands = std::equal_to<const HloInstruction*>(),
       const std::function<bool(const HloComputation*, const HloComputation*)>&
-          eq_computations = std::equal_to<const HloComputation*>()) const {
+          eq_computations = std::equal_to<const HloComputation*>(),
+      bool layout_sensitive = true) const {
     // An instruction is always identical to itself.
     if (this == &other) {
       return true;
     }
 
-    // Identical instruction must have the same opcode and identical operands.
-    // In general, there is no need to check shape because shape is inferred
-    // from the shape of the operands.
+    // Identical instruction must have the same opcode, shape, and identical
+    // operands.
     if (opcode() != other.opcode()) {
       return false;
     }
+    auto eq_shapes = layout_sensitive
+                         ? [](const Shape& a,
+                              const Shape& b) { return ShapeUtil::Equal(a, b); }
+                         : [](const Shape& a, const Shape& b) {
+                             return ShapeUtil::Compatible(a, b);
+                           };
+    if (!eq_shapes(shape(), other.shape())) {
+      return false;
+    }
     if (operands().size() != other.operands().size()) {
       return false;
     }
+
     // Use an explicit loop rather than ContainerEquals, because copying around
     // std::functions may be too expensive in some cases.
     for (size_t i = 0; i < operands().size(); ++i) {
@@ -583,7 +592,7 @@ class HloInstruction {
       }
     }
 
-    return IdenticalSlowPath(other, eq_computations);
+    return IdenticalSlowPath(other, eq_computations, eq_shapes);
   }
 
   // Returns whether the instruction has a constant operand.
@@ -1232,10 +1241,14 @@ class HloInstruction {
   class FusionReusesParamElements;
 
   // See comments on Identical().
+  // eq_shapes() is used to check shapes for equality, and would normally be
+  // expected to be ShapeUtil::Equals or ShapeUtil::Compatible, depending on
+  // whether we want a layout-sensitive check or not.
   bool IdenticalSlowPath(
       const HloInstruction& other,
       const std::function<bool(const HloComputation*, const HloComputation*)>&
-          eq_computations) const;
+          eq_computations,
+      const std::function<bool(const Shape&, const Shape&)>& eq_shapes) const;
 
   // Creates an n-ary elementwise operation.
   static std::unique_ptr<HloInstruction> CreateNary(
-- 
GitLab


From 8fe70dc41f269b266b2adce7bed96b3833dda60e Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 5 Feb 2018 15:49:57 -0800
Subject: [PATCH 682/966] "frame_name" attr must be altered when
 importing/exporting MetaGraphDefs.

The frame_name attr of Enter operations must be the name of the
associated WhileLoopContext, otherwise taking the gradient of the loop
will result in frame errors.

I'm not happy that the export logic is in meta_graph.py (all other
control flow de/serialization is in control_flow_ops.py). However, I
can't think of how else to do it, since only export_scoped_meta_graph
has access to the NodeDefs being exported.

PiperOrigin-RevId: 184599323
---
 tensorflow/python/framework/meta_graph.py     |  6 ++-
 .../python/framework/meta_graph_test.py       | 52 +++++++++++++++++++
 tensorflow/python/ops/control_flow_ops.py     | 13 +++++
 3 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index fc1a82361b..8c03a5f19d 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -87,6 +87,10 @@ def _node_def(from_node_def, export_scope, unbound_inputs, clear_devices=False):
                compat.as_str(s).split("@")[1].startswith(export_scope)]
       node_def.attr[k].CopyFrom(attr_value_pb2.AttrValue(
           list=attr_value_pb2.AttrValue.ListValue(s=new_s)))
+    elif node_def.op in ("Enter", "RefEnter") and k == "frame_name":
+      if not export_scope or compat.as_str(v.s).startswith(export_scope):
+        new_s = compat.as_bytes(ops.strip_name_scope(v.s, export_scope))
+      node_def.attr[k].CopyFrom(attr_value_pb2.AttrValue(s=new_s))
     else:
       node_def.attr[k].CopyFrom(v)
 
@@ -959,5 +963,3 @@ def copy_scoped_meta_graph(from_scope, to_scope,
                                       graph=to_graph,
                                       import_scope=to_scope)
   return var_list
-
-
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index b5ed135284..f2f1e83da1 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -25,6 +25,7 @@ import shutil
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
@@ -34,6 +35,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn_ops
@@ -447,6 +449,56 @@ class ScopedMetaGraphTest(test.TestCase):
       del b.collection_def["unbound_inputs"]
       test_util.assert_meta_graph_protos_equal(self, a, b)
 
+  def testWhileLoopGradients(self):
+    # Create a simple while loop.
+    with ops.Graph().as_default():
+      with ops.name_scope("export"):
+        var = variables.Variable(0)
+        var_name = var.name
+        _, output = control_flow_ops.while_loop(lambda i, x: i < 5,
+                                                lambda i, x: (i + 1, x + i),
+                                                [0, var])
+        output_name = output.name
+
+      # Generate a MetaGraphDef containing the while loop with an export scope.
+      meta_graph_def, _ = meta_graph.export_scoped_meta_graph(
+          export_scope="export")
+
+      # Build and run the gradients of the while loop. We use this below to
+      # verify that the gradients are correct with the imported MetaGraphDef.
+      init_op = variables.global_variables_initializer()
+      grad = gradients_impl.gradients([output], [var])
+      with session.Session() as sess:
+        sess.run(init_op)
+        expected_grad_value = sess.run(grad)
+
+    # Restore the MetaGraphDef into a new Graph with an import scope.
+    with ops.Graph().as_default():
+      meta_graph.import_scoped_meta_graph(meta_graph_def, import_scope="import")
+
+      # Re-export and make sure we get the same MetaGraphDef.
+      new_meta_graph_def, _ = meta_graph.export_scoped_meta_graph(
+          export_scope="import")
+      test_util.assert_meta_graph_protos_equal(
+          self, meta_graph_def, new_meta_graph_def)
+
+      # Make sure we can still build gradients and get the same result.
+
+      def new_name(tensor_name):
+        base_tensor_name = tensor_name.replace("export/", "")
+        return "import/" + base_tensor_name
+
+      var = ops.get_default_graph().get_tensor_by_name(new_name(var_name))
+      output = ops.get_default_graph().get_tensor_by_name(new_name(output_name))
+      grad = gradients_impl.gradients([output], [var])
+
+      init_op = variables.global_variables_initializer()
+
+      with session.Session() as sess:
+        sess.run(init_op)
+        actual_grad_value = sess.run(grad)
+        self.assertEqual(expected_grad_value, actual_grad_value)
+
   def testScopedImportUnderNameScope(self):
     graph = ops.Graph()
     with graph.as_default():
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index bcd187d821..e75eb0843f 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -57,6 +57,7 @@ import functools
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import control_flow_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -79,6 +80,7 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.gen_control_flow_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_should_use
@@ -2242,6 +2244,17 @@ class WhileContext(ControlFlowContext):
     super(WhileContext, self).__init__(
         values_def=context_def.values_def, import_scope=import_scope)
 
+    # import_scope causes self.name to be different from the original serialized
+    # context's name. Rewrite "frame_name" attrs with the new name.
+    if import_scope:
+      for tensor_name in self._values:
+        op = g.as_graph_element(tensor_name).op
+        if util.IsLoopEnter(op):
+          # pylint: disable=protected-access
+          op._set_attr("frame_name",
+                       attr_value_pb2.AttrValue(s=compat.as_bytes(self.name)))
+          # pylint: enable=protected-access
+
   @property
   def maximum_iterations(self):
     """The maximum number of iterations that will be executed."""
-- 
GitLab


From ca27418eec4f6302ff25bb5c1ef143b207bb5a4a Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <vomjom@vomjom.net>
Date: Mon, 5 Feb 2018 16:05:25 -0800
Subject: [PATCH 683/966] Bump the rtol in hmc_test (#16787)

---
 tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
index d9d0dfce44..d244d2f4f5 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
@@ -617,7 +617,7 @@ class _HMCHandlesLists(object):
           expected_vars,
       ])
       self.assertAllClose(expected_means_, actual_means_, atol=0.05, rtol=0.16)
-      self.assertAllClose(expected_vars_, actual_vars_, atol=0., rtol=0.30)
+      self.assertAllClose(expected_vars_, actual_vars_, atol=0., rtol=0.40)
 
 
 class HMCHandlesLists16(_HMCHandlesLists, test.TestCase):
-- 
GitLab


From aea9333d5bee349f1daff0c551a38b8bf73beffa Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Mon, 5 Feb 2018 16:04:06 -0800
Subject: [PATCH 684/966] Remove makefile build dependency on
 all_opensource_files, as part of the effort to remove all_opensource_files
 #15758.

PiperOrigin-RevId: 184601439
---
 tensorflow/contrib/makefile/BUILD | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/tensorflow/contrib/makefile/BUILD b/tensorflow/contrib/makefile/BUILD
index a8dd59f32a..701eeb44fe 100644
--- a/tensorflow/contrib/makefile/BUILD
+++ b/tensorflow/contrib/makefile/BUILD
@@ -12,20 +12,3 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
-
-sh_test(
-    name = "build_all_linux",
-    size = "enormous",
-    srcs = ["build_all_linux.sh"],
-    data = [
-        "//tensorflow:all_opensource_files",
-        "//third_party/eigen3:all_files",
-        "//third_party/fft2d:all_files",
-    ],
-    tags = [
-        "manual",
-        "no_gpu",
-        "no_oss",
-        "notap",
-    ],
-)
-- 
GitLab


From 0375ffcf83e16c3d6818fa67c9c13de810c1dacf Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Mon, 5 Feb 2018 16:08:16 -0800
Subject: [PATCH 685/966] Add filepaths to test_local support.

PiperOrigin-RevId: 184602010
---
 tensorflow/tools/dist_test/local_test.sh | 33 ++++++++++++++----------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh
index 7d7f92d246..435f9d0dc9 100755
--- a/tensorflow/tools/dist_test/local_test.sh
+++ b/tensorflow/tools/dist_test/local_test.sh
@@ -24,19 +24,20 @@
 # 3) Call a script to launch a k8s TensorFlow GRPC cluster inside the container
 #    and run the distributed test suite.
 #
-# Usage: local_test.sh <whl_url>
+# Usage: local_test.sh <whl_file_location>
 #                      [--leave_container_running]
 #                      [--model_name <MODEL_NAME>]
 #                      [--num_workers <NUM_WORKERS>]
 #                      [--num_parameter_servers <NUM_PARAMETER_SERVERS>]
 #                      [--sync_replicas]
 #
-# E.g., local_test.sh <whl_url> --model_name CENSUS_WIDENDEEP
-#       local_test.sh <whl_url> --num_workers 3 --num_parameter_servers 3
+# E.g., local_test.sh <whl_file_location> --model_name CENSUS_WIDENDEEP
+#       local_test.sh <whl_file_location> --num_workers 3 --num_parameter_servers 3
 #
 # Arguments:
-# <whl_url>
-#   Specify custom TensorFlow whl file URL to install in the test Docker image.
+# whl_file_location: URL from which the TensorFlow whl file will be acquired.
+#   E.g.: https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
+#   E.g.: /path/to/folder/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 #
 # --leave_container_running:  Do not stop the docker-in-docker container after
 #                             the termination of the tests, e.g., for debugging
@@ -81,9 +82,9 @@ NUM_WORKERS=2
 NUM_PARAMETER_SERVERS=2
 SYNC_REPLICAS_FLAG=""
 
-WHL_URL=${1}
-if [[ -z "${WHL_URL}" ]]; then
-  die "whl file URL is not specified"
+WHL_FILE_LOCATION=${1}
+if [[ -z "${WHL_FILE_LOCATION}" ]]; then
+  die "whl file location is not specified"
 fi
 
 while true; do
@@ -98,8 +99,8 @@ while true; do
     NUM_PARAMETER_SERVERS=$2
   elif [[ $1 == "--sync_replicas" ]]; then
     SYNC_REPLICAS_FLAG="--sync_replicas"
-  elif [[ $1 == "--whl_url" ]]; then
-    WHL_URL=$2
+  elif [[ $1 == "--WHL_FILE_LOCATION" ]]; then
+    WHL_FILE_LOCATION=$2
   fi
 
   shift
@@ -130,15 +131,19 @@ fi
 # Create docker build context directory.
 BUILD_DIR=$(mktemp -d)
 echo ""
-echo "Using whl file URL: ${WHL_URL}"
+echo "Using whl file location: ${WHL_FILE_LOCATION}"
 echo "Building in temporary directory: ${BUILD_DIR}"
 
 cp -r ${DIR}/* "${BUILD_DIR}"/ || \
   die "Failed to copy files to ${BUILD_DIR}"
 
-# Download whl file into the build context directory.
-wget -P "${BUILD_DIR}" ${WHL_URL} || \
-  die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
+if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then
+    # Download whl file into the build context directory.
+    wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \
+        die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}"
+else
+    cp "${WHL_FILE_LOCATION}" "${BUILD_DIR}"
+fi
 
 # Build docker image for test.
 docker build ${NO_CACHE_FLAG} -t ${DOCKER_IMG_NAME} \
-- 
GitLab


From fe7bbd8565f63d30d6a96c7f8c3c4326488e613d Mon Sep 17 00:00:00 2001
From: Glenn Weidner <gweidner@us.ibm.com>
Date: Mon, 5 Feb 2018 16:15:21 -0800
Subject: [PATCH 686/966] Remove invalid exception in linear operator (#16678)

---
 tensorflow/python/ops/linalg/linear_operator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 27e0f17020..8339c940af 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -478,7 +478,6 @@ class LinearOperator(object):
           cond,
           self._max_condition_number_to_be_non_singular(),
           message="Singular matrix up to precision epsilon.")
-    raise NotImplementedError("assert_non_singular is not implemented.")
 
   def _max_condition_number_to_be_non_singular(self):
     """Return the maximum condition number that we consider nonsingular."""
-- 
GitLab


From 283f03c825312efd3319cb37dc1a412288a536ec Mon Sep 17 00:00:00 2001
From: ImSheridan <xiaoyudong-512@163.com>
Date: Tue, 6 Feb 2018 08:15:33 +0800
Subject: [PATCH 687/966] Fix the mean the description of the output shape,
 which should be [batch_size, 1, max_time] (#16642)

---
 tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index d6b5eceb47..0a53fd66db 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -331,7 +331,7 @@ def _luong_score(query, keys, scale):
   # batched matmul on:
   #   [batch_size, 1, depth] . [batch_size, depth, max_time]
   # resulting in an output shape of:
-  #   [batch_time, 1, max_time].
+  #   [batch_size, 1, max_time].
   # we then squeeze out the center singleton dimension.
   score = math_ops.matmul(query, keys, transpose_b=True)
   score = array_ops.squeeze(score, [1])
-- 
GitLab


From dcefe9bc65de55b6b3bc81c01adc3e41ec2d33aa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 16:12:19 -0800
Subject: [PATCH 688/966] Shard linear operator tests.

PiperOrigin-RevId: 184602704
---
 tensorflow/python/kernel_tests/linalg/BUILD | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index 4e18eaa4e8..fd1b5bab6f 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -39,6 +39,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    shard_count = 5,
     tags = ["noasan"],  # times out b/63678675
 )
 
@@ -57,6 +58,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
     ],
+    shard_count = 5,
 )
 
 cuda_py_test(
@@ -73,6 +75,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
     ],
+    shard_count = 5,
 )
 
 cuda_py_test(
@@ -88,6 +91,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
+    shard_count = 5,
 )
 
 cuda_py_test(
@@ -134,6 +138,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    shard_count = 5,
 )
 
 filegroup(
-- 
GitLab


From 179795c0067f05abe54904797288efebf6958b35 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 16:32:38 -0800
Subject: [PATCH 689/966] Support negative axis in concatenation

PiperOrigin-RevId: 184605786
---
 .../contrib/lite/kernels/concatenation.cc      |  6 ++++--
 .../contrib/lite/kernels/concatenation_test.cc | 18 +++++++++++++++++-
 .../contrib/lite/testing/generate_examples.py  |  4 +++-
 .../propagate_fixed_sizes.cc                   | 11 +++++++----
 4 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc
index 9e7a1233da..7ff9075318 100644
--- a/tensorflow/contrib/lite/kernels/concatenation.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation.cc
@@ -49,6 +49,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // dimensions except 'axis' must be equal.
   TfLiteTensor* t0 = &context->tensors[node->inputs->data[0]];
   TfLiteType input_type = t0->type;
+  if (axis < 0) axis += t0->dims->size;
   TF_LITE_ENSURE(context, axis >= 0);
   TF_LITE_ENSURE(context, axis < t0->dims->size);
 
@@ -131,8 +132,9 @@ template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params =
       reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
-
+  int axis = params->axis;
   TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+  if (axis < 0) axis += output->dims->size;
 
 // TODO(ahentz): Creating 'all_inputs' below is not very efficient. We should
 // allocate and populate these during Prepare().
@@ -141,7 +143,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 #define TF_LITE_CONCATENATION(type, scalar)                                 \
   VectorOfInputs<scalar> all_inputs(*context, *node->inputs);               \
   type::Concatenation<FusedActivationFunctionType::kNone, scalar>(          \
-      RemapDim(NumDimensions(output), params->axis), all_inputs.data(),     \
+      RemapDim(NumDimensions(output), axis), all_inputs.data(),             \
       all_inputs.dims(), node->inputs->size, GetTensorData<scalar>(output), \
       GetTensorDims(output))
 
diff --git a/tensorflow/contrib/lite/kernels/concatenation_test.cc b/tensorflow/contrib/lite/kernels/concatenation_test.cc
index 499856a93c..ba1ffc5f84 100644
--- a/tensorflow/contrib/lite/kernels/concatenation_test.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation_test.cc
@@ -94,7 +94,7 @@ TEST(ConcatenationOpTest, TwoDimensionalOneInput) {
   EXPECT_THAT(m0.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
 }
 
-TEST(ConcatenationOpTest, TwoInputsTwoAxis) {
+TEST(ConcatenationOpTest, TwoInputsTwoAxesNegativeAxes) {
   // We will concatenate two tensors along different dimensions.
   auto tensor0 = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   auto tensor1 = {7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
@@ -107,6 +107,14 @@ TEST(ConcatenationOpTest, TwoInputsTwoAxis) {
   EXPECT_THAT(m0.GetOutput(),
               ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
 
+  ConcatenationOpModel m0_negative({TensorType_FLOAT32, {2, 3}}, /*axis=*/-2,
+                                   /*num_inputs=*/2);
+  m0_negative.SetInput(0, tensor0);
+  m0_negative.SetInput(1, tensor1);
+  m0_negative.Invoke();
+  EXPECT_THAT(m0_negative.GetOutput(),
+              ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+
   ConcatenationOpModel m1({TensorType_FLOAT32, {2, 3}}, /*axis=*/1,
                           /*num_inputs=*/2);
   m1.SetInput(0, tensor0);
@@ -114,6 +122,14 @@ TEST(ConcatenationOpTest, TwoInputsTwoAxis) {
   m1.Invoke();
   EXPECT_THAT(m1.GetOutput(),
               ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+
+  ConcatenationOpModel m1_negative({TensorType_FLOAT32, {2, 3}}, /*axis=*/-1,
+                                   /*num_inputs=*/2);
+  m1_negative.SetInput(0, tensor0);
+  m1_negative.SetInput(1, tensor1);
+  m1_negative.Invoke();
+  EXPECT_THAT(m1_negative.GetOutput(),
+              ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
 }
 
 TEST(ConcatenationOpTest, FourInputs) {
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index b2227a7c98..6264daa988 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -1001,13 +1001,15 @@ def make_concatenation_tests(zip_path):
   test_parameters = [{
       "base_shape": [[1, 3, 4, 3], [3, 4]],
       "num_tensors": [1, 2, 3, 4, 5, 6],
-      "axis": [0, 1, 2, 3],
+      "axis": [0, 1, 2, 3, -3, -2, -1],
   }]
 
   def get_shape(parameters, delta):
     """Return a tweaked version of 'base_shape'."""
     axis = parameters["axis"]
     shape = parameters["base_shape"][:]
+    if axis < 0:
+      axis += len(shape)
     if axis < len(shape):
       shape[axis] += delta
     return shape
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 7f26884bc1..fa7e70d90b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -546,6 +546,9 @@ void ProcessConcatenationOperator(Model* model, ConcatenationOperator* op) {
   // Use 0 input as basis for output dimensions.
   const auto& first_input_array = model->GetArray(op->inputs[0]);
   output_array.copy_shape(first_input_array.shape());
+  // Negative axis means the count starts at the back of the dims().
+  int axis = op->axis;
+  if (axis < 0) axis += first_input_array.shape().dims().size();
   // Determine the concat size, and enfore that all inputs have
   // the same dimensions count.
   int concat_size = 0;
@@ -558,14 +561,14 @@ void ProcessConcatenationOperator(Model* model, ConcatenationOperator* op) {
     CHECK_EQ(input_array.shape().dimensions_count(),
              output_array.shape().dimensions_count());
     const std::vector<int>& input_dims = input_array.shape().dims();
-    CHECK_LT(op->axis, input_dims.size());
-    concat_size += input_dims[op->axis];
+    CHECK_LT(axis, input_dims.size());
+    concat_size += input_dims[axis];
   }
   // Write out the concat_size on the output array shape.
   auto& output_shape = *output_array.mutable_shape();
   auto& output_dims = *output_shape.mutable_dims();
-  CHECK_LT(op->axis, output_shape.dimensions_count());
-  output_dims[op->axis] = concat_size;
+  CHECK_LT(axis, output_shape.dimensions_count());
+  output_dims[axis] = concat_size;
 }
 
 void ProcessRangeOperator(Model* model, RangeOperator* op) {
-- 
GitLab


From 076e004c912c133367f4a457e3df95192d276814 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Tue, 6 Feb 2018 09:53:07 +0900
Subject: [PATCH 690/966] fix typo

---
 .../contrib/lite/kernels/internal/optimized/tensor_utils_impl.h | 2 +-
 .../lite/kernels/internal/reference/portable_tensor_utils.h     | 2 +-
 tensorflow/contrib/lite/testing/generate_examples.py            | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
index f8be99e82f..4e324a5e10 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -15,7 +15,7 @@ limitations under the License.
 #ifndef TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
 #define TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
 
-// TDOD(ghodrat): Remove this header file and the dependency to internal data
+// TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
index afc3e26e79..c05c21b472 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -15,7 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
 #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
 
-// TDOD(ghodrat): Remove this header file and the dependency to internal data
+// TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index b2227a7c98..b1519c4d47 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -619,7 +619,7 @@ def make_constant_tests(zip_path):
 
   def build_graph(parameters):
     # Since Toco & Tflite can't have a single constant op in the entire graph,
-    # this test adds a zero tesnor with a constant op tensor.
+    # this test adds a zero tensor with a constant op tensor.
     input1 = tf.placeholder(dtype=parameters["dtype"], name="input1",
                             shape=parameters["input_shape"])
     out = tf.ones(parameters["input_shape"], dtype=parameters["dtype"]) + input1
-- 
GitLab


From fa99ec49976c92cd8ea15d2a487f52554de3659d Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Mon, 5 Feb 2018 17:11:39 -0800
Subject: [PATCH 691/966] [XLA:GPU] Split IrEmitter{Unn,N}ested out of
 ir_emitter.h.

Also add a bunch of clarifying comments.

PiperOrigin-RevId: 184610674
---
 tensorflow/compiler/xla/service/gpu/BUILD     |   2 +
 .../compiler/xla/service/gpu/gpu_compiler.cc  |   2 +-
 .../compiler/xla/service/gpu/ir_emitter.cc    |   2 +
 .../compiler/xla/service/gpu/ir_emitter.h     | 229 ++----------------
 .../xla/service/gpu/ir_emitter_nested.cc      |   3 +-
 .../xla/service/gpu/ir_emitter_nested.h       |  72 ++++++
 .../xla/service/gpu/ir_emitter_unnested.cc    |   3 +-
 .../xla/service/gpu/ir_emitter_unnested.h     | 209 ++++++++++++++++
 .../xla/service/llvm_ir/fused_ir_emitter.h    |  21 +-
 9 files changed, 321 insertions(+), 222 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h
 create mode 100644 tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h

diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 7df01f7edd..9da4fb97fa 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -129,6 +129,8 @@ cc_library(
     hdrs = [
         "ir_emitter.h",
         "ir_emitter_context.h",
+        "ir_emitter_nested.h",
+        "ir_emitter_unnested.h",
     ],
     deps = [
         ":cudnn_convolution_runner",
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 12ec266ff3..28ebd034ee 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -47,8 +47,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
 #include "tensorflow/compiler/xla/service/gpu/pad_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index affd2ffa8e..a3df67a873 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -27,6 +27,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 9031a838f9..b0accc08d4 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -13,19 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// An XLA HLO graph may contain multiple computations. These computations
-// fall into two types, nested and unnested. We translate each nested
-// computation (e.g. the computation operand of a Map operator) to a device
-// function. For each unnested computation composed of top-level
-// HloInstructions, we generate a CUDA kernel for each HloInstruction.
-//
-// This file declares classes that translate an XLA HLO graph to LLVM IR for
-// GPUs. IrEmitterNested emits LLVM IR for nested computations, and
-// IrEmitterUnnested for unnested computations. The logic of emitting LLVM IR
-// for each individual HloInstruction is largely the same between these two
-// classes. Therefore, we implement the common logic in the Handle* functions in
-// the superclass IrEmitter.
-
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_H_
 
@@ -60,19 +47,28 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-// This class is the top-level API for the XLA HLO --> LLVM IR compiler.
-// It implements the DfsHloVisitor interface and emits an LLVM IR program that
-// implements the input HLO graph.
+// Abstract base class for translating HLO graphs to LLVM IR for a GPU.
+//
+// There are two concrete subclasses of IrEmitter: IrEmitterNested and
+// IrEmitterUnnested.  In the unnested variety, each HLO gets its own kernel
+// function, whereas in the nested version the whole computation is emitted as
+// one *non-kernel* function.
+//
+// In XLA, kernel functions never call other kernel functions.  This means that
+// if we have a kernel -- e.g. implementing a kReduce HLO -- that wants to use
+// an HLO computation as a "subroutine" -- e.g. the HLO computation that
+// specifies how to reduce two elements -- then the subroutine computation must
+// be emitted using IrEmitterNested.
 //
-// Note: if `T` is a subclass of `IrEmitter` and a handler is not overridden in
-//       either `IrEmitter` or `T`, the handler in `DfsHloVisitorWithDefault`
-//       calls `T::DefaultAction`.
+// Fusion nodes are a special case.  A fusion node is emitted using
+// IrEmitterUnnested, but the code is generated using FusedIrEmitter, which is
+// not a subclass of gpu::IrEmitter, and in fact is better understood as an IR
+// generator generator.  See comments on that class.
 class IrEmitter : public DfsHloVisitorWithDefault {
  public:
   IrEmitter(const IrEmitter&) = delete;
   IrEmitter& operator=(const IrEmitter&) = delete;
 
-  // The following methods implement the DfsHloVisitorWithDefault interface.
   Status DefaultAction(HloInstruction* hlo) override;
   Status HandleConstant(HloInstruction* constant) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
@@ -217,199 +213,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   std::map<const HloComputation*, llvm::Function*> computation_to_ir_function_;
 };
 
-// Emits LLVM IR for unnested computations. Each HloInstruction is translated to
-// a separate CUDA kernel. These kernels are inserted into the resultant module
-// sorted in reverse postorder of the XLA HLO graph.
-class IrEmitterUnnested : public IrEmitter {
- public:
-  IrEmitterUnnested(const HloModuleConfig& hlo_module_config,
-                    const HloComputation* hlo_computation,
-                    IrEmitterContext* ir_emitter_context);
-  IrEmitterUnnested(const IrEmitterUnnested&) = delete;
-  IrEmitterUnnested& operator=(const IrEmitterUnnested&) = delete;
-
-  // Transfers the ownship of thunk_sequence_ out.
-  std::unique_ptr<ThunkSequence> ConsumeThunkSequence() {
-    return std::move(thunk_sequence_);
-  }
-
-  Status DefaultAction(HloInstruction* hlo) override;
-
-  // IrEmitterUnnested handles the following instructions differently from
-  // IrEmitter.
-  Status HandleCopy(HloInstruction* copy) override;
-  Status HandleConditional(HloInstruction* conditional) override;
-  Status HandleConvolution(HloInstruction* convolution) override;
-  Status HandleCustomCall(HloInstruction* custom_call) override;
-  Status HandleDot(HloInstruction* dot) override;
-  Status HandleFft(HloInstruction* fft) override;
-  Status HandleFusion(HloInstruction* fusion) override;
-  Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
-  Status HandleReduce(HloInstruction* reduce) override;
-  Status HandleSelectAndScatter(HloInstruction* instruction) override;
-  Status HandleTuple(HloInstruction* tuple) override;
-  Status HandleWhile(HloInstruction* xla_while) override;
-  Status HandleInfeed(HloInstruction* xla_infeed) override;
-  Status HandleRng(HloInstruction* random) override;
-  Status HandleSelect(HloInstruction* select) override;
-
-  Status EmitTargetElementLoop(
-      const HloInstruction& hlo,
-      const llvm_ir::ElementGenerator& body_emitter) override;
-
-  // Same as `EmitTargetElementLoop`, but in given `thunk` rather than
-  // `LastThunk()`.
-  Status EmitTargetElementLoopInThunk(
-      const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter,
-      KernelThunk* thunk);
-
- private:
-  // Builds the appropriate thunk for the instruction hlo and returns the owning
-  // pointer to it. The caller needs to make sure `inst` outlives the lifetime
-  // of the returned Thunk object.
-  std::unique_ptr<Thunk> BuildThunk(const HloInstruction* hlo);
-
-  // Builds the prototype of the IR kernel for `inst` and adds it to the module.
-  llvm::Function* BuildKernelPrototype(
-      const HloInstruction& inst,
-      tensorflow::gtl::ArraySlice<const HloInstruction*> escaped_hlos);
-
-  // Emits the base pointers for `hlo` and its operands. `io_hlos` will store
-  // all input/output HLOs among `hlo` and its operands.
-  llvm::Function* EmitBasePointersForHloAndItsOperands(
-      const HloInstruction& hlo, std::vector<const HloInstruction*>* io_hlos);
-
-  // EmitColumnReduction and EmitRowReduction emit code for column and row
-  // reduction of a matrix and/or 3D tensor. Row and column reduction have
-  // different memory access pattern, so for performance their implementations
-  // are significantly different.
-  //
-  // Emits code that reduces a matrix of shape [height x width] to a vector of
-  // [width]. Other parameters have the same meaning as those of
-  // `EmitReductionToVector`. Note that input shape might not be
-  // [height x width], but can be bitcast to [height x weight] with "height"
-  // being the major dimension.
-  Status EmitColumnReduction(int64 height, int64 width, HloInstruction* reduce,
-                             const Shape& input_shape,
-                             const llvm_ir::ElementGenerator& input_gen,
-                             const llvm_ir::ElementGenerator& init_value_gen,
-                             HloComputation* reducer);
-
-  // Emits code that reduces a 3D tensor of shape [depth x height x width] to a
-  // vector of shape [height]. Other parameters have the same meaning as those
-  // of `EmitReductionToVector`. Note that input shape might not be
-  // [depth x height x width], but can be bitcast to [depth x height x weight]
-  // with "depth" being the most major dimension.
-  Status EmitRowReduction(int64 depth, int64 height, int64 width,
-                          HloInstruction* reduce, const Shape& input_shape,
-                          const llvm_ir::ElementGenerator& input_gen,
-                          const llvm_ir::ElementGenerator& init_value_gen,
-                          HloComputation* reducer);
-
-  // Emits code that reduces a tensor of arbitrary rank to a scalar.
-  Status EmitReductionToScalar(HloInstruction* reduce, const Shape& input_shape,
-                               const llvm_ir::ElementGenerator& input_gen,
-                               const llvm_ir::ElementGenerator& init_value_gen,
-                               HloComputation* reducer);
-
-  // Figures out whether `reduce` is a row or column reduction, and which
-  // dimensions to reduce, and calls either `EmitRowReduction` or
-  // `EmitColumnReduction` as appropriate. `input_shape` is the shape of the
-  // input array, which is the operand of the Reduce instruction if unfused or
-  // of the Fusion instruction if fused. `input_gen` and `init_value_gen`
-  // generate elements of the input and the initial value. Other parameters mean
-  // the same as for `HandleReduce`.
-  //
-  // Prerequisite: `IsReductionToVector(*reduce)`
-  Status EmitReductionToVector(
-      HloInstruction* reduce, const Shape& input_shape,
-      const llvm_ir::ElementGenerator& input_gen,
-      const llvm_ir::ElementGenerator& init_value_gen,
-      tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
-      HloComputation* reducer);
-
-  // Emits code to initialize buffer of `inst` in given `thunk`.
-  Status EmitInitializer(const HloInstruction* inst, KernelThunk* thunk);
-
-  // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
-  // caller needs to make sure `inst` outlives the lifetime of the returned
-  // Thunk object.
-  std::unique_ptr<Thunk> BuildKernelThunk(const HloInstruction* inst);
-
-  // Returns a FftThunk that calls cuFFT to implement `inst`.
-  std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
-
-  // Returns a GemmThunk that calls gemm to implement `inst`. The caller needs
-  // to make sure `inst` outlives the lifetime of the returned Thunk object.
-  std::unique_ptr<Thunk> BuildGemmThunk(const HloInstruction* inst);
-
-  // Returns a thunk that calls host-to-device cuMemcpy to implement `inst`.
-  std::unique_ptr<Thunk> BuildHostToDeviceCopyThunk(const HloInstruction* inst);
-
-  // Returns a thunk that calls device-to-device cuMemcpy to implement `inst`.
-  std::unique_ptr<Thunk> BuildDeviceToDeviceCopyThunk(
-      const HloInstruction* inst);
-
-  // Returns an InfeedThunk that performs device-to-device memcpy to implement
-  // `inst`.
-  std::unique_ptr<Thunk> BuildInfeedThunk(const HloInstruction* inst);
-
-  // Returns a WhileThunk that invokes thunk sequences for 'condition' and
-  // 'body' sub-computations of while instruction 'hlo'.
-  std::unique_ptr<Thunk> BuildWhileThunk(const HloInstruction* hlo);
-
-  // Returns a ForThunk which executes 'loop_limit' invocations of a thunk
-  // sequence from the 'body' sub-computation of the while instruction 'hlo'.
-  std::unique_ptr<Thunk> BuildForThunk(const HloInstruction* hlo,
-                                       const int64 loop_limit);
-
-  // Returns a ConditionalThunk that executes the thunk sequence for
-  // 'true_computation' or 'false_computation' depending on the value of the
-  // predicate in the given conditional instruction.
-  std::unique_ptr<Thunk> BuildConditionalThunk(const HloInstruction* hlo);
-
-  Status Postprocess(HloInstruction* hlo) override;
-
-  // Returns the last generated thunk.
-  Thunk* LastThunk() const { return thunk_sequence_->back().get(); }
-
-  // The thunk sequence this IrEmitter generates for the input computation.
-  std::unique_ptr<ThunkSequence> thunk_sequence_;
-
-  // The HloComputation that this IrEmitter emits code for.
-  const HloComputation* hlo_computation_;
-};
-
-// Emits LLVM IR for a nested computation to the resultant function.
-class IrEmitterNested : public IrEmitter {
- public:
-  // Constructs an LLVM IR emitter for a nested HLO computation. `function` is
-  // the containing IR function this emitter produces IR to. See
-  // IrEmitter::IrEmitter for the meanings of other arguments.
-  IrEmitterNested(const HloModuleConfig& hlo_module_config,
-                  const HloComputation& nested_computation,
-                  IrEmitterContext* ir_emitter_context);
-  IrEmitterNested(const IrEmitterNested&) = delete;
-  IrEmitterNested& operator=(const IrEmitterNested&) = delete;
-
-  // Overrides the default empty implementation. Binds the given instruction
-  // "parameter" with the parameter of the IR function.
-  Status HandleParameter(HloInstruction* parameter) override;
-
-  llvm::Function* GetEmittedFunction() const { return emitted_function_; }
-
-  Status EmitTargetElementLoop(
-      const HloInstruction& hlo,
-      const llvm_ir::ElementGenerator& body_emitter) override;
-
- private:
-  llvm::Function* EmitBasePointersForNestedComputation(
-      const HloComputation& nested_computation,
-      std::vector<const HloInstruction*>* io_hlos);
-
-  llvm::Function* emitted_function_;
-};
-
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
index 5225ff36ff..71aada080a 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
@@ -16,12 +16,13 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h"
+
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h
new file mode 100644
index 0000000000..ca11cf2c18
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_NESTED_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_NESTED_H_
+
+#include "llvm/IR/Function.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
+
+namespace xla {
+namespace gpu {
+
+// Emits LLVM IR for a "nested computation" into a non-kernel device function.
+//
+// This is used to emit code for HloComputations that don't require a separate
+// kernel call.  For example, IrEmitterNested is used to emit code for a kReduce
+// HLO's elementwise reduction computation.  Notably, IrEmitterNested is *not*
+// used to emit code for fusion nodes -- fusion nodes use FusedIrEmitter, which
+// is a different beast altogether.
+//
+// IrEmitterNested generates a non-kernel function with the following
+// parameters:
+//
+//   - N pointers to the buffers of each of the N parameters to the computation,
+//   - a pointer to the output buffer of the computation, and
+//   - a pointer to the top-level temp buffer.
+//
+class IrEmitterNested : public IrEmitter {
+ public:
+  // Constructs an LLVM IR emitter for a nested HLO computation. `function` is
+  // the containing IR function this emitter produces IR to. See
+  // IrEmitter::IrEmitter for the meanings of other arguments.
+  IrEmitterNested(const HloModuleConfig& hlo_module_config,
+                  const HloComputation& nested_computation,
+                  IrEmitterContext* ir_emitter_context);
+  IrEmitterNested(const IrEmitterNested&) = delete;
+  IrEmitterNested& operator=(const IrEmitterNested&) = delete;
+
+  // Overrides the default empty implementation. Binds the given instruction
+  // "parameter" with the parameter of the IR function.
+  Status HandleParameter(HloInstruction* parameter) override;
+
+  llvm::Function* GetEmittedFunction() const { return emitted_function_; }
+
+  Status EmitTargetElementLoop(
+      const HloInstruction& hlo,
+      const llvm_ir::ElementGenerator& body_emitter) override;
+
+ private:
+  llvm::Function* EmitBasePointersForNestedComputation(
+      const HloComputation& nested_computation,
+      std::vector<const HloInstruction*>* io_hlos);
+
+  llvm::Function* emitted_function_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_NESTED_H_
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index a4847f6ca9..08fea34a70 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -17,6 +17,8 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
+
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
@@ -40,7 +42,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
 #include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
 #include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
new file mode 100644
index 0000000000..56ab8208ce
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -0,0 +1,209 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_
+
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+
+namespace xla {
+namespace gpu {
+
+// Emits LLVM IR for an "unnested computation".
+//
+// An unnested computation is an HloComputation which you run by executing one
+// or more kernels for each HloInstruction it contains.  Examples of unnested
+// computations:
+//
+//  - An HloModule's root computation,
+//  - The body of an HLO while loop,
+//  - The true/false computation of an HLO conditional.
+//
+// Note the opportunity for confusion -- the while loop's computation is nested
+// within the root computation, but it's emitted using IrEmitterUnnested!  Don't
+// think about it too hard.
+//
+// Examples of things that are not unnested computations:
+//
+//  - The reducer of a kReduce HLO.  This is emited using IrEmitterNested.
+//  - The body of a fusion node.  IrEmitterUnenested emits the relevant code
+//    within a kernel function using FusedIrEmitter.  (FusedIrEmitter is not
+//    really an IrEmitter, but is more an "IR generator generator".)
+//
+class IrEmitterUnnested : public IrEmitter {
+ public:
+  IrEmitterUnnested(const HloModuleConfig& hlo_module_config,
+                    const HloComputation* hlo_computation,
+                    IrEmitterContext* ir_emitter_context);
+  IrEmitterUnnested(const IrEmitterUnnested&) = delete;
+  IrEmitterUnnested& operator=(const IrEmitterUnnested&) = delete;
+
+  // Transfers the ownship of thunk_sequence_ out.
+  std::unique_ptr<ThunkSequence> ConsumeThunkSequence() {
+    return std::move(thunk_sequence_);
+  }
+
+  Status DefaultAction(HloInstruction* hlo) override;
+
+  // IrEmitterUnnested handles the following instructions differently from
+  // IrEmitter.
+  Status HandleCopy(HloInstruction* copy) override;
+  Status HandleConditional(HloInstruction* conditional) override;
+  Status HandleConvolution(HloInstruction* convolution) override;
+  Status HandleCustomCall(HloInstruction* custom_call) override;
+  Status HandleDot(HloInstruction* dot) override;
+  Status HandleFft(HloInstruction* fft) override;
+  Status HandleFusion(HloInstruction* fusion) override;
+  Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
+  Status HandleReduce(HloInstruction* reduce) override;
+  Status HandleSelectAndScatter(HloInstruction* instruction) override;
+  Status HandleTuple(HloInstruction* tuple) override;
+  Status HandleWhile(HloInstruction* xla_while) override;
+  Status HandleInfeed(HloInstruction* xla_infeed) override;
+  Status HandleRng(HloInstruction* random) override;
+  Status HandleSelect(HloInstruction* select) override;
+
+  Status EmitTargetElementLoop(
+      const HloInstruction& hlo,
+      const llvm_ir::ElementGenerator& body_emitter) override;
+
+  // Same as `EmitTargetElementLoop`, but in given `thunk` rather than
+  // `LastThunk()`.
+  Status EmitTargetElementLoopInThunk(
+      const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter,
+      KernelThunk* thunk);
+
+ private:
+  // Builds the appropriate thunk for the instruction hlo and returns the owning
+  // pointer to it. The caller needs to make sure `inst` outlives the lifetime
+  // of the returned Thunk object.
+  std::unique_ptr<Thunk> BuildThunk(const HloInstruction* hlo);
+
+  // Builds the prototype of the IR kernel for `inst` and adds it to the module.
+  llvm::Function* BuildKernelPrototype(
+      const HloInstruction& inst,
+      tensorflow::gtl::ArraySlice<const HloInstruction*> escaped_hlos);
+
+  // Emits the base pointers for `hlo` and its operands. `io_hlos` will store
+  // all input/output HLOs among `hlo` and its operands.
+  llvm::Function* EmitBasePointersForHloAndItsOperands(
+      const HloInstruction& hlo, std::vector<const HloInstruction*>* io_hlos);
+
+  // EmitColumnReduction and EmitRowReduction emit code for column and row
+  // reduction of a matrix and/or 3D tensor. Row and column reduction have
+  // different memory access pattern, so for performance their implementations
+  // are significantly different.
+  //
+  // Emits code that reduces a matrix of shape [height x width] to a vector of
+  // [width]. Other parameters have the same meaning as those of
+  // `EmitReductionToVector`. Note that input shape might not be
+  // [height x width], but can be bitcast to [height x weight] with "height"
+  // being the major dimension.
+  Status EmitColumnReduction(int64 height, int64 width, HloInstruction* reduce,
+                             const Shape& input_shape,
+                             const llvm_ir::ElementGenerator& input_gen,
+                             const llvm_ir::ElementGenerator& init_value_gen,
+                             HloComputation* reducer);
+
+  // Emits code that reduces a 3D tensor of shape [depth x height x width] to a
+  // vector of shape [height]. Other parameters have the same meaning as those
+  // of `EmitReductionToVector`. Note that input shape might not be
+  // [depth x height x width], but can be bitcast to [depth x height x weight]
+  // with "depth" being the most major dimension.
+  Status EmitRowReduction(int64 depth, int64 height, int64 width,
+                          HloInstruction* reduce, const Shape& input_shape,
+                          const llvm_ir::ElementGenerator& input_gen,
+                          const llvm_ir::ElementGenerator& init_value_gen,
+                          HloComputation* reducer);
+
+  // Emits code that reduces a tensor of arbitrary rank to a scalar.
+  Status EmitReductionToScalar(HloInstruction* reduce, const Shape& input_shape,
+                               const llvm_ir::ElementGenerator& input_gen,
+                               const llvm_ir::ElementGenerator& init_value_gen,
+                               HloComputation* reducer);
+
+  // Figures out whether `reduce` is a row or column reduction, and which
+  // dimensions to reduce, and calls either `EmitRowReduction` or
+  // `EmitColumnReduction` as appropriate. `input_shape` is the shape of the
+  // input array, which is the operand of the Reduce instruction if unfused or
+  // of the Fusion instruction if fused. `input_gen` and `init_value_gen`
+  // generate elements of the input and the initial value. Other parameters mean
+  // the same as for `HandleReduce`.
+  //
+  // Prerequisite: `IsReductionToVector(*reduce)`
+  Status EmitReductionToVector(
+      HloInstruction* reduce, const Shape& input_shape,
+      const llvm_ir::ElementGenerator& input_gen,
+      const llvm_ir::ElementGenerator& init_value_gen,
+      tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
+      HloComputation* reducer);
+
+  // Emits code to initialize buffer of `inst` in given `thunk`.
+  Status EmitInitializer(const HloInstruction* inst, KernelThunk* thunk);
+
+  // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
+  // caller needs to make sure `inst` outlives the lifetime of the returned
+  // Thunk object.
+  std::unique_ptr<Thunk> BuildKernelThunk(const HloInstruction* inst);
+
+  // Returns a FftThunk that calls cuFFT to implement `inst`.
+  std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
+
+  // Returns a GemmThunk that calls gemm to implement `inst`. The caller needs
+  // to make sure `inst` outlives the lifetime of the returned Thunk object.
+  std::unique_ptr<Thunk> BuildGemmThunk(const HloInstruction* inst);
+
+  // Returns a thunk that calls host-to-device cuMemcpy to implement `inst`.
+  std::unique_ptr<Thunk> BuildHostToDeviceCopyThunk(const HloInstruction* inst);
+
+  // Returns a thunk that calls device-to-device cuMemcpy to implement `inst`.
+  std::unique_ptr<Thunk> BuildDeviceToDeviceCopyThunk(
+      const HloInstruction* inst);
+
+  // Returns an InfeedThunk that performs device-to-device memcpy to implement
+  // `inst`.
+  std::unique_ptr<Thunk> BuildInfeedThunk(const HloInstruction* inst);
+
+  // Returns a WhileThunk that invokes thunk sequences for 'condition' and
+  // 'body' sub-computations of while instruction 'hlo'.
+  std::unique_ptr<Thunk> BuildWhileThunk(const HloInstruction* hlo);
+
+  // Returns a ForThunk which executes 'loop_limit' invocations of a thunk
+  // sequence from the 'body' sub-computation of the while instruction 'hlo'.
+  std::unique_ptr<Thunk> BuildForThunk(const HloInstruction* hlo,
+                                       const int64 loop_limit);
+
+  // Returns a ConditionalThunk that executes the thunk sequence for
+  // 'true_computation' or 'false_computation' depending on the value of the
+  // predicate in the given conditional instruction.
+  std::unique_ptr<Thunk> BuildConditionalThunk(const HloInstruction* hlo);
+
+  Status Postprocess(HloInstruction* hlo) override;
+
+  // Returns the last generated thunk.
+  Thunk* LastThunk() const { return thunk_sequence_->back().get(); }
+
+  // The thunk sequence this IrEmitter generates for the input computation.
+  std::unique_ptr<ThunkSequence> thunk_sequence_;
+
+  // The HloComputation that this IrEmitter emits code for.
+  const HloComputation* hlo_computation_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index 242062e616..b3b6026ef1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -32,14 +32,23 @@ limitations under the License.
 
 namespace xla {
 
-// Unlike IrEmitter, this creates host functions which emit IR to generate the
-// output element at the given index. It is used to generate fused operations.
+// FusedIrEmitter is used to generate code for fusion nodes.
+//
+// Unlike IrEmitter and its ilk, which directly create LLVM IR in an LLVM
+// Module, FusedIrEmitter is better understood as "IR generator generator".
+// FusedIrEmitter recursively creates a generator (a host function) which the
+// compiler can invoke at a later time.  Invoking the generator emits LLVM IR
+// that, when run, produces the value at a particular index of the output.
+//
+// After building this generator, the compiler creates a loop (or its moral
+// equivalent, e.g. a GPU kernel) and calls the generator from within the loop.
+// This generates code that produces each element of the output.
 //
 // This class handles both vanilla fusion and multi-output fusion.  In the MOF
-// case, the fusion node ends with a kTuple instruction, and the root generator
-// returned by this emitter returns an LLVM struct with N elements, one for each
-// element of the arrays in the tuple.  It follows that the arrays in the tuple
-// must have the same length.
+// case, the fusion node ends with a kTuple instruction, and the generator
+// created produces an LLVM struct with N elements, one for each element of the
+// arrays in the tuple.  It follows that the arrays in the tuple must have the
+// same length.
 class FusedIrEmitter : public DfsHloVisitorWithDefault {
  public:
   using Generator = llvm_ir::ElementGenerator;
-- 
GitLab


From 238bae4426729a0af555aecfda5e2de123443ccd Mon Sep 17 00:00:00 2001
From: Mingsheng Hong <hongm@google.com>
Date: Mon, 5 Feb 2018 17:16:27 -0800
Subject: [PATCH 692/966] Misc cleanups and tweaks: 1. Removed obsolete
 constructors in ProcessLunctionLibraryRuntime 2. Add const annotations to
 Tensor::PrintOneDim(), and removed unnecessary vector copy.

PiperOrigin-RevId: 184611531
---
 .../process_function_library_runtime.cc       | 17 --------------
 .../process_function_library_runtime.h        | 22 +++++--------------
 tensorflow/core/framework/tensor.cc           |  5 +++--
 3 files changed, 9 insertions(+), 35 deletions(-)

diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 41e1ce8c15..f9d9633bee 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -70,23 +70,6 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
   }
 }
 
-ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
-    const DeviceMgr* device_mgr, Env* env, int graph_def_version,
-    const FunctionLibraryDefinition* lib_def,
-    const OptimizerOptions& optimizer_options)
-    : ProcessFunctionLibraryRuntime(device_mgr, env, graph_def_version, lib_def,
-                                    optimizer_options,
-                                    nullptr /* cluster_flr */) {}
-
-ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
-    const DeviceMgr* device_mgr, Env* env, int graph_def_version,
-    const FunctionLibraryDefinition* lib_def,
-    const OptimizerOptions& optimizer_options,
-    CustomKernelCreator custom_kernel_creator)
-    : ProcessFunctionLibraryRuntime(
-          device_mgr, env, graph_def_version, lib_def, optimizer_options,
-          std::move(custom_kernel_creator), nullptr /* cluster_flr */) {}
-
 /* static */
 Status ProcessFunctionLibraryRuntime::SendTensors(
     const string& source_device, const string& target_device,
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index 4296f9449f..0473e16d24 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -29,12 +29,13 @@ class ProcessFunctionLibraryRuntime {
   // Creates FunctionLibraryRuntime objects for each device in the provided
   // DeviceMgr. Caller needs to make sure that device_mgr, lib_def and parent
   // (if provided) outlive this object.
-  ProcessFunctionLibraryRuntime(const DeviceMgr* device_mgr, Env* env,
-                                int graph_def_version,
-                                const FunctionLibraryDefinition* lib_def,
-                                const OptimizerOptions& optimizer_options,
-                                DistributedFunctionLibraryRuntime* parent);
+  ProcessFunctionLibraryRuntime(
+      const DeviceMgr* device_mgr, Env* env, int graph_def_version,
+      const FunctionLibraryDefinition* lib_def,
+      const OptimizerOptions& optimizer_options,
+      DistributedFunctionLibraryRuntime* parent = nullptr);
 
+  // With `custom_kernel_creator`.
   ProcessFunctionLibraryRuntime(const DeviceMgr* device_mgr, Env* env,
                                 int graph_def_version,
                                 const FunctionLibraryDefinition* lib_def,
@@ -42,17 +43,6 @@ class ProcessFunctionLibraryRuntime {
                                 CustomKernelCreator custom_kernel_creator,
                                 DistributedFunctionLibraryRuntime* parent);
 
-  ProcessFunctionLibraryRuntime(const DeviceMgr* device_mgr, Env* env,
-                                int graph_def_version,
-                                const FunctionLibraryDefinition* lib_def,
-                                const OptimizerOptions& optimizer_options);
-
-  ProcessFunctionLibraryRuntime(const DeviceMgr* device_mgr, Env* env,
-                                int graph_def_version,
-                                const FunctionLibraryDefinition* lib_def,
-                                const OptimizerOptions& optimizer_options,
-                                CustomKernelCreator custom_kernel_creator);
-
   // Sends `tensors_to_send` from `source_device` to `target_device` using
   // `rendezvous`. `key_prefix` is used as a prefix for the keys sent to the
   // Rendezvous. `device_context` should be the DeviceContext of the device
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 77a3edcc10..0645ec4282 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -886,8 +886,9 @@ bool Tensor::CanUseDMA() const {
 namespace {
 // Print from left dim to right dim recursively.
 template <typename T>
-void PrintOneDim(int dim_index, gtl::InlinedVector<int64, 4> shape, int64 limit,
-                 int shape_size, T* data, int64* data_index, string* result) {
+void PrintOneDim(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
+                 int64 limit, int shape_size, const T* data, int64* data_index,
+                 string* result) {
   if (*data_index >= limit) return;
   int64 element_count = shape[dim_index];
   // We have reached the right-most dimension of the tensor.
-- 
GitLab


From 78af5c7e2cf9e7a09acfceb368d6bd3818405353 Mon Sep 17 00:00:00 2001
From: Igor Saprykin <isaprykin@google.com>
Date: Mon, 5 Feb 2018 17:24:31 -0800
Subject: [PATCH 693/966] Correctly treat "devices=/gpu:0" argument of
 replicate_model_fn.

At the moment if "devices=/GPU:0" are specified by the user, then variables are going to be placed on the GPU.  However, if "devices=/gpu:0" are given, then they are going to be placed on the CPU.  Instead, the latter case should be equivalent to the former case.

PiperOrigin-RevId: 184612823
---
 .../python/estimator/replicate_model_fn.py    |  2 +-
 .../estimator/replicate_model_fn_test.py      | 28 +++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index c9153c9352..dfae034afc 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -195,7 +195,7 @@ def _replicate_model_fn_with_mode(
   if not devices:
     devices = _get_local_devices('GPU') or _get_local_devices('CPU')
 
-  is_a_single_gpu_case = len(devices) == 1 and 'GPU' in devices[0]
+  is_a_single_gpu_case = len(devices) == 1 and 'GPU' in devices[0].upper()
   consolidation_device = devices[0] if is_a_single_gpu_case else '/CPU:0'
 
   ps_devices = [consolidation_device]
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
index 6936f8a131..ab117e61a7 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
@@ -451,6 +451,34 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
       _ = replicate_model_fn.replicate_model_fn(self.model_fn,
                                                 losses.Reduction.NONE)
 
+  def test_places_on_gpu_with_upper_case_spelling(self):
+    features = np.array([[0.01], [0.002]])
+    labels = np.array([[0.01], [0.02]])
+
+    with self.test_session():
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, devices=['/GPU:0'])
+      _ = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:0', c.device)
+
+  def test_places_on_gpu_with_lower_case_spelling(self):
+    features = np.array([[0.01], [0.002]])
+    labels = np.array([[0.01], [0.02]])
+
+    with self.test_session():
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, devices=['/gpu:0'])
+      _ = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:0', c.device)
+
 
 class ReplicateAcrossASingleDeviceWithoutTowerOptimizer(
     test_util.TensorFlowTestCase):
-- 
GitLab


From f92d4e8bc878245379ed2c04123b20758a261af0 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Mon, 5 Feb 2018 17:45:26 -0800
Subject: [PATCH 694/966] [XLA] Add HloBindings::ToString().

PiperOrigin-RevId: 184615306
---
 .../xla/service/gpu/hlo_to_ir_bindings.cc     | 54 +++++++++++++++++++
 .../xla/service/gpu/hlo_to_ir_bindings.h      |  2 +
 .../xla/service/gpu/ir_emitter_unnested.cc    |  2 +
 3 files changed, 58 insertions(+)

diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index dd4426ca7b..061210352c 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -22,12 +22,17 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace gpu {
 
+using tensorflow::strings::StrAppend;
+using tensorflow::strings::StrCat;
+
 void HloToIrBindings::EmitBasePointersForHlos(
     tensorflow::gtl::ArraySlice<const HloInstruction*> io_hlos,
     tensorflow::gtl::ArraySlice<const HloInstruction*> non_io_hlos) {
@@ -227,5 +232,54 @@ void HloToIrBindings::UnbindAllLocalIrValues() {
   }
 }
 
+string HloToIrBindings::ToString() const {
+  string s = StrCat("** HloToIrBindings **\n");
+  StrAppend(&s, "  is_nested_=", is_nested_, "\n");
+  StrAppend(&s,
+            "  temp_buffer_base_=", llvm_ir::DumpToString(*temp_buffer_base_),
+            "\n");
+
+  if (base_ptrs_.empty()) {
+    return s;
+  }
+
+  // Iterate over all computations in the module in topological order, and print
+  // out the base pointers we have in each computation in topological order.
+  for (const HloComputation* computation :
+       base_ptrs_.begin()->first->GetModule()->MakeComputationPostOrder()) {
+    bool is_first = true;
+    for (const HloInstruction* instr :
+         computation->MakeInstructionPostOrder()) {
+      auto it = base_ptrs_.find(instr);
+      if (it == base_ptrs_.end()) {
+        continue;
+      }
+      if (is_first) {
+        StrAppend(&s, "  Base pointers for computation ", computation->name(),
+                  ":\n");
+        is_first = false;
+      }
+      StrAppend(&s, "    ", instr->ToString());
+
+      const ShapeTree<llvm::Value*>& shape_tree = it->second;
+      if (!ShapeUtil::IsTuple(instr->shape())) {
+        const llvm::Value* val = shape_tree.begin()->second;
+        StrAppend(&s, " -> ", llvm_ir::DumpToString(*val), "\n");
+        continue;
+      }
+
+      StrAppend(&s, "\n");
+      for (auto shape_it = shape_tree.begin(); shape_it != shape_tree.end();
+           ++shape_it) {
+        llvm::Value* val = shape_it->second;
+        StrAppend(&s, "      ", shape_it->first.ToString(), " -> ",
+                  (val != nullptr ? llvm_ir::DumpToString(*val) : "null"),
+                  "\n");
+      }
+    }
+  }
+  return s;
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
index 62ae1769a1..1fe7970e7d 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
@@ -87,6 +87,8 @@ class HloToIrBindings {
                               const HloInstruction& consumer,
                               const ShapeIndex& shape_index = {});
 
+  string ToString() const;
+
  private:
   // Emits IR to resolve (possibly) recursive GetTupleElement instructions.
   llvm::Value* EmitGetTupleElement(const HloInstruction* gte,
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 08fea34a70..c81dfbf6c2 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -2271,6 +2271,8 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
 Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator, KernelThunk* thunk) {
+  VLOG(3) << bindings_.ToString();
+
   const Shape& element_shape = hlo.IsMultiOutputFusion()
                                    ? ShapeUtil::GetSubshape(hlo.shape(), {0})
                                    : hlo.shape();
-- 
GitLab


From 54dd9c9d38ad93f5e2d9c880d57a5df57fbe26ab Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Mon, 5 Feb 2018 17:55:26 -0800
Subject: [PATCH 695/966] Cleanup markdown errors in `Bijector`.

PiperOrigin-RevId: 184616392
---
 .../python/ops/distributions/bijector_impl.py | 29 +++++++++----------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/tensorflow/python/ops/distributions/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
index 44d64070ce..ed435557fd 100644
--- a/tensorflow/python/ops/distributions/bijector_impl.py
+++ b/tensorflow/python/ops/distributions/bijector_impl.py
@@ -114,7 +114,7 @@ class _Mapping(collections.namedtuple(
 @six.add_metaclass(abc.ABCMeta)
 @tf_export("distributions.bijectors.Bijector")
 class Bijector(object):
-  """Interface for transformations of a `Distribution` sample.
+  r"""Interface for transformations of a `Distribution` sample.
 
   Bijectors can be used to represent any differentiable and injective
   (one to one) function defined on an open subset of `R^n`.  Some non-injective
@@ -122,27 +122,24 @@ class Bijector(object):
 
   #### Mathematical Details
 
-  A `Bijector` implements a
-  [diffeomorphism](https://en.wikipedia.org/wiki/Diffeomorphism), i.e., a
-  bijective, differentiable function. A `Bijector` is used by
-  `TransformedDistribution` but can be generally used for transforming a
-  `Distribution` generated `Tensor`. A `Bijector` is characterized by three
-  operations:
-
-  1. Forward Evaluation
+  A `Bijector` implements a [smooth covering map](
+  https://en.wikipedia.org/wiki/Local_diffeomorphism), i.e., a local
+  diffeomorphism such that every point in the target has a neighborhood evenly
+  covered by a map ([see also](
+  https://en.wikipedia.org/wiki/Covering_space#Covering_of_a_manifold)).
+  A `Bijector` is used by `TransformedDistribution` but can be generally used
+  for transforming a `Distribution` generated `Tensor`. A `Bijector` is
+  characterized by three operations:
 
+  1. Forward\
      Useful for turning one random outcome into another random outcome from a
      different distribution.
-
-  2. Inverse Evaluation
-
+  2. Inverse\
      Useful for "reversing" a transformation to compute one probability in
      terms of another.
-
-  3. (log o det o Jacobian o inverse)(x)
-
+  3. `(log o det o Jacobian o inverse)(x)`\
      "The log of the determinant of the matrix of all first-order partial
-     derivatives of the inverse function."
+     derivatives of the inverse function."\
      Useful for inverting a transformation to compute one probability in terms
      of another. Geometrically, the det(Jacobian) is the volume of the
      transformation and is used to scale the probability.
-- 
GitLab


From 09138338ed81b56aeaff88b9ee5a11c3f6d77b70 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Mon, 5 Feb 2018 18:22:37 -0800
Subject: [PATCH 696/966] Added the ability to query the amount of RAM
 available Also added the ability to query the CPU frequency on Windows.

PiperOrigin-RevId: 184620390
---
 tensorflow/core/grappler/clusters/utils.cc |  6 ++++++
 tensorflow/core/platform/mem.h             |  3 +++
 tensorflow/core/platform/posix/port.cc     | 12 ++++++++++++
 tensorflow/core/platform/windows/port.cc   | 14 +++++++++++++-
 4 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index 592e4b789d..aacd2ccb72 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/mem.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -48,6 +49,11 @@ DeviceProperties GetLocalCPUInfo() {
   device.set_l2_cache_size(Eigen::l2CacheSize());
   device.set_l3_cache_size(Eigen::l3CacheSize());
 
+  int64 free_mem = port::AvailableRam();
+  if (free_mem < INT64_MAX) {
+    device.set_memory_size(free_mem);
+  }
+
   (*device.mutable_environment())["cpu_instruction_set"] =
       Eigen::SimdInstructionSetsInUse();
 
diff --git a/tensorflow/core/platform/mem.h b/tensorflow/core/platform/mem.h
index dc389a8741..7bb9fc264f 100644
--- a/tensorflow/core/platform/mem.h
+++ b/tensorflow/core/platform/mem.h
@@ -59,6 +59,9 @@ void MallocExtension_ReleaseToSystem(std::size_t num_bytes);
 // routine, this routine returns 0.
 std::size_t MallocExtension_GetAllocatedSize(const void* p);
 
+// Returns the amount of RAM available in kB, or INT64_MAX if unknown.
+int64 AvailableRam();
+
 }  // namespace port
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 614ee00b01..494acde803 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -29,6 +29,7 @@ limitations under the License.
 
 #if defined(__linux__) && !defined(__ANDROID__)
 #include <sched.h>
+#include <sys/sysinfo.h>
 #endif
 #include <stdio.h>
 #include <stdlib.h>
@@ -171,5 +172,16 @@ double NominalCPUFrequency() {
 #endif
 }
 
+int64 AvailableRam() {
+#if defined(__linux__) && !defined(__ANDROID__)
+  struct sysinfo info;
+  int err = sysinfo(&info);
+  if (err == 0) {
+    return info.freeram / 1024;
+  }
+#endif
+  return INT64_MAX;
+}
+
 }  // namespace port
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index e327d53949..582b232054 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -149,8 +149,20 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output) {
 string Demangle(const char* mangled) { return mangled; }
 
 double NominalCPUFrequency() {
-  // TODO(yuefengz): implement it for this platform.
+#ifdef TENSORFLOW_USE_ABSL
+  return absl::base_internal::NominalCPUFrequency();
+#else
   return 1.0;
+#endif
+}
+
+int64 AvailableRam() {
+  MEMORYSTATUSEX statex;
+  statex.dwLength = sizeof(statex);
+  if (GlobalMemoryStatusEx(&statex)) {
+    return statex.ullAvailPhys / 1024;
+  }
+  return INT64_MAX;
 }
 
 }  // namespace port
-- 
GitLab


From 1a0b637df8d082301118dd0f85ec63704f862aeb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 5 Feb 2018 18:45:13 -0800
Subject: [PATCH 697/966] Export align_corners to TF Lite

PiperOrigin-RevId: 184622482
---
 tensorflow/contrib/lite/builtin_op_data.h     |  1 +
 .../contrib/lite/kernels/resize_bilinear.cc   | 12 ++++--
 tensorflow/contrib/lite/model.cc              |  1 +
 tensorflow/contrib/lite/schema/schema.fbs     |  3 ++
 .../contrib/lite/schema/schema_generated.h    | 40 +++++++++++--------
 .../testing/generated_examples_zip_test.cc    |  3 --
 .../contrib/lite/toco/tflite/operator.cc      | 22 +++++++++-
 .../contrib/lite/toco/tflite/operator_test.cc | 10 ++++-
 8 files changed, 65 insertions(+), 27 deletions(-)

diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index a1037a525c..5dbeadd165 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -151,6 +151,7 @@ typedef struct {
 } TfLiteLSTMParams;
 
 typedef struct {
+  bool align_corners;
 } TfLiteResizeBilinearParams;
 
 typedef struct {
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
index 4a2101f246..c5d60cae3a 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
@@ -75,6 +75,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteResizeBilinearParams*>(node->builtin_data);
+
   TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   TfLiteTensor* size = GetInput(context, node, kSizeTensor);
@@ -86,10 +89,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   }
 
   if (output->type == kTfLiteFloat32) {
-#define TF_LITE_RESIZE_BILINEAR(type)                                     \
-  type::ResizeBilinear(GetTensorData<float>(input), GetTensorDims(input), \
-                       GetTensorData<int32>(size), GetTensorDims(size),   \
-                       GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_RESIZE_BILINEAR(type)                                       \
+  type::ResizeBilinear(GetTensorData<float>(input), GetTensorDims(input),   \
+                       GetTensorData<int32>(size), GetTensorDims(size),     \
+                       GetTensorData<float>(output), GetTensorDims(output), \
+                       params->align_corners)
 
     if (kernel_type == kReference) {
       TF_LITE_RESIZE_BILINEAR(reference_ops);
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index b36bfcef84..14b6709964 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -469,6 +469,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       auto* params = MallocPOD<TfLiteResizeBilinearParams>();
       if (auto* schema_params =
               op->builtin_options_as_ResizeBilinearOptions()) {
+        params->align_corners = schema_params->align_corners();
       }
       builtin_data = reinterpret_cast<void*>(params);
       break;
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index c0b220e872..36cc2724eb 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -273,6 +273,9 @@ table LSTMOptions {
 }
 
 table ResizeBilinearOptions {
+  new_height: int (deprecated);
+  new_width: int (deprecated);
+  align_corners: bool;
 }
 
 // A call operation options
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 29f3a17be7..e2ac0b9d1e 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -2626,28 +2626,36 @@ flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
 
 struct ResizeBilinearOptionsT : public flatbuffers::NativeTable {
   typedef ResizeBilinearOptions TableType;
-  ResizeBilinearOptionsT() {}
+  bool align_corners;
+  ResizeBilinearOptionsT()
+      : align_corners(false) {
+  }
 };
 
-struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS
-    : private flatbuffers::Table {
+struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef ResizeBilinearOptionsT NativeTableType;
+  enum {
+    VT_ALIGN_CORNERS = 8
+  };
+  bool align_corners() const {
+    return GetField<uint8_t>(VT_ALIGN_CORNERS, 0) != 0;
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) && verifier.EndTable();
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint8_t>(verifier, VT_ALIGN_CORNERS) &&
+           verifier.EndTable();
   }
-  ResizeBilinearOptionsT *UnPack(
-      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(
-      ResizeBilinearOptionsT *_o,
-      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<ResizeBilinearOptions> Pack(
-      flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
-      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  ResizeBilinearOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ResizeBilinearOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ResizeBilinearOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct ResizeBilinearOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
+  void add_align_corners(bool align_corners) {
+    fbb_.AddElement<uint8_t>(ResizeBilinearOptions::VT_ALIGN_CORNERS, static_cast<uint8_t>(align_corners), 0);
+  }
   explicit ResizeBilinearOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
       : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2661,14 +2669,14 @@ struct ResizeBilinearOptionsBuilder {
 };
 
 inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
-    flatbuffers::FlatBufferBuilder &_fbb) {
+    flatbuffers::FlatBufferBuilder &_fbb,
+    bool align_corners = false) {
   ResizeBilinearOptionsBuilder builder_(_fbb);
+  builder_.add_align_corners(align_corners);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
-    flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
-    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct CallOptionsT : public flatbuffers::NativeTable {
   typedef CallOptions TableType;
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index e8b425a592..5ea3e21f6a 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -89,9 +89,6 @@ std::map<string, string> kBrokenTests = {
 
     // ResizeBilinear looks completely incompatible with Tensorflow
     {R"(^\/resize_bilinear.*dtype=tf.int32)", "72401107"},
-    {R"(^\/resize_bilinearalign_corners=True,.*,size=\[2,2\])", "72401483"},
-    {R"(^\/resize_bilinearalign_corners=True,.*,size=\[4,3\])", "72401483"},
-    {R"(^\/resize_bilinearalign_corners=True,.*,size=\[5,6\])", "72401483"},
 
     // Transpose only supports 1D-4D input tensors.
     {R"(^\/transpose.*input_shape=\[.,.,.,.,.\])", "71545879"},
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 461494fd99..04aaedd59d 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -540,6 +540,24 @@ class Mean : public BuiltinOperator<MeanOperator, ::tflite::MeanOptions,
   }
 };
 
+class ResizeBilinear
+    : public BuiltinOperator<ResizeBilinearOperator,
+                             ::tflite::ResizeBilinearOptions,
+                             ::tflite::BuiltinOptions_ResizeBilinearOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateResizeBilinearOptions(*builder, op.align_corners);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->align_corners = options.align_corners();
+  }
+};
+
 class Squeeze
     : public BuiltinOperator<SqueezeOperator, ::tflite::SqueezeOptions,
                              ::tflite::BuiltinOptions_SqueezeOptions> {
@@ -755,6 +773,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
                                  OperatorType::kTranspose));
   ops.emplace_back(
       new Mean(::tflite::BuiltinOperator_MEAN, OperatorType::kMean));
+  ops.emplace_back(new ResizeBilinear(::tflite::BuiltinOperator_RESIZE_BILINEAR,
+                                      OperatorType::kResizeBilinear));
   ops.emplace_back(
       new Squeeze(::tflite::BuiltinOperator_SQUEEZE, OperatorType::kSqueeze));
   ops.emplace_back(new StridedSlice(::tflite::BuiltinOperator_STRIDED_SLICE,
@@ -787,8 +807,6 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new SimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1));
   ops.emplace_back(
       new SimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6));
-  ops.emplace_back(new SimpleOperator<ResizeBilinearOperator>(
-      "RESIZE_BILINEAR", OperatorType::kResizeBilinear));
   ops.emplace_back(new SimpleOperator<LogisticOperator>(
       "LOGISTIC", OperatorType::kLogistic));
   ops.emplace_back(
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 6daa296282..796534be53 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -104,8 +104,6 @@ TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<ReluOperator>("RELU", OperatorType::kRelu);
   CheckSimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1);
   CheckSimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6);
-  CheckSimpleOperator<ResizeBilinearOperator>("RESIZE_BILINEAR",
-                                              OperatorType::kResizeBilinear);
   CheckSimpleOperator<LogisticOperator>("LOGISTIC", OperatorType::kLogistic);
   CheckSimpleOperator<TanhOperator>("TANH", OperatorType::kTanh);
 }
@@ -331,6 +329,14 @@ TEST_F(OperatorTest, BuiltinMul) {
             output_toco_op->fused_activation_function);
 }
 
+TEST_F(OperatorTest, ResizeBilinear) {
+  ResizeBilinearOperator op;
+  op.align_corners = true;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("RESIZE_BILINEAR", OperatorType::kResizeBilinear), op);
+  EXPECT_EQ(op.align_corners, output_toco_op->align_corners);
+}
+
 TEST_F(OperatorTest, Svdf) {
   SvdfOperator op;
   op.fused_activation_function = FusedActivationFunctionType::kRelu;
-- 
GitLab


From 60b4211873388ac8518a8397cc28e553b22420ea Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 6 Feb 2018 00:24:51 -0800
Subject: [PATCH 698/966] Rollback of CL 183879566, and fix Windows CMAKE build
 by copying one additional CUDA header file.

PiperOrigin-RevId: 184644220
---
 tensorflow/contrib/cmake/CMakeLists.txt | 2 ++
 tensorflow/core/BUILD                   | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 12bfd3c62b..b732b23320 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -364,6 +364,8 @@ if (tensorflow_ENABLE_GPU)
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_runtime_api.h
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_fp16.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/device_functions.h
       DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
     )
   else(WIN32)
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 3aa3018cd2..7fade697de 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1906,7 +1906,6 @@ cc_library(
 tf_cuda_library(
     name = "cuda_device_functions",
     hdrs = ["util/cuda_device_functions.h"],
-    cuda_deps = ["//third_party_gpus/cuda:cuda_headers"],
     visibility = ["//visibility:public"],
     deps = [":framework_lite"],
 )
-- 
GitLab


From 2341e3f09f8453c64d96f9c1fd870825bb2a3755 Mon Sep 17 00:00:00 2001
From: Ian Langmore <langmore@google.com>
Date: Tue, 6 Feb 2018 00:29:29 -0800
Subject: [PATCH 699/966] mcmc_diagnostics.py added to contrib/bayesflow/. 
 potential_scale_reduction function added. .

PiperOrigin-RevId: 184644450
---
 tensorflow/contrib/bayesflow/BUILD            |  19 ++
 tensorflow/contrib/bayesflow/__init__.py      |   2 +
 .../kernel_tests/mcmc_diagnostics_test.py     | 230 ++++++++++++++++++
 .../bayesflow/python/ops/mcmc_diagnostics.py  |  31 +++
 .../python/ops/mcmc_diagnostics_impl.py       | 228 +++++++++++++++++
 5 files changed, 510 insertions(+)
 create mode 100644 tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py
 create mode 100644 tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py
 create mode 100644 tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py

diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index 6e0f0a0572..82944f5363 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -137,6 +137,25 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "mcmc_diagnostics_test",
+    size = "small",
+    srcs = ["python/kernel_tests/mcmc_diagnostics_test.py"],
+    additional_deps = [
+        ":bayesflow_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/python/ops/distributions",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_seed",
+    ],
+)
+
 cuda_py_test(
     name = "monte_carlo_test",
     size = "small",
diff --git a/tensorflow/contrib/bayesflow/__init__.py b/tensorflow/contrib/bayesflow/__init__.py
index 95b9452b1a..c411026346 100644
--- a/tensorflow/contrib/bayesflow/__init__.py
+++ b/tensorflow/contrib/bayesflow/__init__.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.bayesflow.python.ops import custom_grad
 from tensorflow.contrib.bayesflow.python.ops import halton_sequence
 from tensorflow.contrib.bayesflow.python.ops import hmc
 from tensorflow.contrib.bayesflow.python.ops import layers
+from tensorflow.contrib.bayesflow.python.ops import mcmc_diagnostics
 from tensorflow.contrib.bayesflow.python.ops import metropolis_hastings
 from tensorflow.contrib.bayesflow.python.ops import monte_carlo
 from tensorflow.contrib.bayesflow.python.ops import optimizers
@@ -42,6 +43,7 @@ _allowed_symbols = [
     'hmc',
     'layers',
     'metropolis_hastings',
+    'mcmc_diagnostics',
     'monte_carlo',
     'optimizers',
     'special_math',
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py
new file mode 100644
index 0000000000..7652b6a7ce
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py
@@ -0,0 +1,230 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MCMC diagnostic utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.bayesflow.python.ops import mcmc_diagnostics_impl as mcmc_diagnostics
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+rng = np.random.RandomState(42)
+
+
+class _PotentialScaleReductionTest(object):
+
+  @property
+  def use_static_shape(self):
+    raise NotImplementedError(
+        "Subclass failed to impliment `use_static_shape`.")
+
+  def testListOfStatesWhereFirstPassesSecondFails(self):
+    """Simple test showing API with two states.  Read first!."""
+    n_samples = 1000
+
+    # state_0 is two scalar chains taken from iid Normal(0, 1).  Will pass.
+    state_0 = rng.randn(n_samples, 2)
+
+    # state_1 is three 4-variate chains taken from Normal(0, 1) that have been
+    # shifted.  Since every chain is shifted, they are not the same, and the
+    # test should fail.
+    offset = np.array([1., -1., 2.]).reshape(3, 1)
+    state_1 = rng.randn(n_samples, 3, 4) + offset
+
+    rhat = mcmc_diagnostics.potential_scale_reduction(
+        state=[state_0, state_1], independent_chain_ndims=1)
+
+    self.assertIsInstance(rhat, list)
+    with self.test_session() as sess:
+      rhat_0_, rhat_1_ = sess.run(rhat)
+
+    # r_hat_0 should be close to 1, meaning test is passed.
+    self.assertAllEqual((), rhat_0_.shape)
+    self.assertAllClose(1., rhat_0_, rtol=0.02)
+
+    # r_hat_1 should be greater than 1.2, meaning test has failed.
+    self.assertAllEqual((4,), rhat_1_.shape)
+    self.assertAllEqual(np.ones_like(rhat_1_).astype(bool), rhat_1_ > 1.2)
+
+  def check_results(self, state_, independent_chain_shape, should_pass):
+    sample_ndims = 1
+    independent_chain_ndims = len(independent_chain_shape)
+    with self.test_session():
+      state = array_ops.placeholder_with_default(
+          input=state_, shape=state_.shape if self.use_static_shape else None)
+
+      rhat = mcmc_diagnostics.potential_scale_reduction(
+          state, independent_chain_ndims=independent_chain_ndims)
+
+      if self.use_static_shape:
+        self.assertAllEqual(
+            state_.shape[sample_ndims + independent_chain_ndims:], rhat.shape)
+
+      rhat_ = rhat.eval()
+      if should_pass:
+        self.assertAllClose(np.ones_like(rhat_), rhat_, atol=0, rtol=0.02)
+      else:
+        self.assertAllEqual(np.ones_like(rhat_).astype(bool), rhat_ > 1.2)
+
+  def iid_normal_chains_should_pass_wrapper(self,
+                                            sample_shape,
+                                            independent_chain_shape,
+                                            other_shape,
+                                            dtype=np.float32):
+    """Check results with iid normal chains."""
+
+    state_shape = sample_shape + independent_chain_shape + other_shape
+    state_ = rng.randn(*state_shape).astype(dtype)
+
+    # The "other" dimensions do not have to be identical, just independent, so
+    # force them to not be identical.
+    if other_shape:
+      state_ *= rng.rand(*other_shape).astype(dtype)
+
+    self.check_results(state_, independent_chain_shape, should_pass=True)
+
+  def testPassingIIDNdimsAreIndependentOneOtherZero(self):
+    self.iid_normal_chains_should_pass_wrapper(
+        sample_shape=[10000], independent_chain_shape=[4], other_shape=[])
+
+  def testPassingIIDNdimsAreIndependentOneOtherOne(self):
+    self.iid_normal_chains_should_pass_wrapper(
+        sample_shape=[10000], independent_chain_shape=[3], other_shape=[7])
+
+  def testPassingIIDNdimsAreIndependentOneOtherTwo(self):
+    self.iid_normal_chains_should_pass_wrapper(
+        sample_shape=[10000], independent_chain_shape=[2], other_shape=[5, 7])
+
+  def testPassingIIDNdimsAreIndependentTwoOtherTwo64Bit(self):
+    self.iid_normal_chains_should_pass_wrapper(
+        sample_shape=[10000],
+        independent_chain_shape=[2, 3],
+        other_shape=[5, 7],
+        dtype=np.float64)
+
+  def offset_normal_chains_should_fail_wrapper(
+      self, sample_shape, independent_chain_shape, other_shape):
+    """Check results with normal chains that are offset from each other."""
+
+    state_shape = sample_shape + independent_chain_shape + other_shape
+    state_ = rng.randn(*state_shape)
+
+    # Add a significant offset to the different (formerly iid) chains.
+    offset = np.linspace(
+        0, 2, num=np.prod(independent_chain_shape)).reshape([1] * len(
+            sample_shape) + independent_chain_shape + [1] * len(other_shape))
+    state_ += offset
+
+    self.check_results(state_, independent_chain_shape, should_pass=False)
+
+  def testFailingOffsetNdimsAreSampleOneIndependentOneOtherOne(self):
+    self.offset_normal_chains_should_fail_wrapper(
+        sample_shape=[10000], independent_chain_shape=[2], other_shape=[5])
+
+
+class PotentialScaleReductionStaticTest(test.TestCase,
+                                        _PotentialScaleReductionTest):
+
+  @property
+  def use_static_shape(self):
+    return True
+
+  def testIndependentNdimsLessThanOneRaises(self):
+    with self.assertRaisesRegexp(ValueError, "independent_chain_ndims"):
+      mcmc_diagnostics.potential_scale_reduction(
+          rng.rand(2, 3, 4), independent_chain_ndims=0)
+
+
+class PotentialScaleReductionDynamicTest(test.TestCase,
+                                         _PotentialScaleReductionTest):
+
+  @property
+  def use_static_shape(self):
+    return False
+
+
+class _ReduceVarianceTest(object):
+
+  @property
+  def use_static_shape(self):
+    raise NotImplementedError(
+        "Subclass failed to impliment `use_static_shape`.")
+
+  def check_versus_numpy(self, x_, axis, biased, keepdims):
+    with self.test_session():
+      x_ = np.asarray(x_)
+      x = array_ops.placeholder_with_default(
+          input=x_, shape=x_.shape if self.use_static_shape else None)
+      var = mcmc_diagnostics._reduce_variance(
+          x, axis=axis, biased=biased, keepdims=keepdims)
+      np_var = np.var(x_, axis=axis, ddof=0 if biased else 1, keepdims=keepdims)
+
+      if self.use_static_shape:
+        self.assertAllEqual(np_var.shape, var.shape)
+
+      var_ = var.eval()
+      # We will mask below, which changes shape, so check shape explicitly here.
+      self.assertAllEqual(np_var.shape, var_.shape)
+
+      # We get NaN when we divide by zero due to the size being the same as ddof
+      nan_mask = np.isnan(np_var)
+      if nan_mask.any():
+        self.assertTrue(np.isnan(var_[nan_mask]).all())
+      self.assertAllClose(np_var[~nan_mask], var_[~nan_mask], atol=0, rtol=0.02)
+
+  def testScalarBiasedTrue(self):
+    self.check_versus_numpy(x_=-1.234, axis=None, biased=True, keepdims=False)
+
+  def testScalarBiasedFalse(self):
+    # This should result in NaN.
+    self.check_versus_numpy(x_=-1.234, axis=None, biased=False, keepdims=False)
+
+  def testShape2x3x4AxisNoneBiasedFalseKeepdimsFalse(self):
+    self.check_versus_numpy(
+        x_=rng.randn(2, 3, 4), axis=None, biased=True, keepdims=False)
+
+  def testShape2x3x4Axis1BiasedFalseKeepdimsTrue(self):
+    self.check_versus_numpy(
+        x_=rng.randn(2, 3, 4), axis=1, biased=True, keepdims=True)
+
+  def testShape2x3x4x5Axis13BiasedFalseKeepdimsTrue(self):
+    self.check_versus_numpy(
+        x_=rng.randn(2, 3, 4, 5), axis=1, biased=True, keepdims=True)
+
+  def testShape2x3x4x5Axis13BiasedFalseKeepdimsFalse(self):
+    self.check_versus_numpy(
+        x_=rng.randn(2, 3, 4, 5), axis=1, biased=False, keepdims=False)
+
+
+class ReduceVarianceTestStaticShape(test.TestCase, _ReduceVarianceTest):
+
+  @property
+  def use_static_shape(self):
+    return True
+
+
+class ReduceVarianceTestDynamicShape(test.TestCase, _ReduceVarianceTest):
+
+  @property
+  def use_static_shape(self):
+    return False
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py
new file mode 100644
index 0000000000..5f3e6ade70
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py
@@ -0,0 +1,31 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for Markov Chain Monte Carlo (MCMC) sampling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.contrib.bayesflow.python.ops.mcmc_diagnostics_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "potential_scale_reduction",
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py
new file mode 100644
index 0000000000..3b6f92463e
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py
@@ -0,0 +1,228 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for Markov Chain Monte Carlo (MCMC) sampling.
+
+@@potential_scale_reduction
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+__all__ = [
+    "potential_scale_reduction",
+]
+
+
+def potential_scale_reduction(state, independent_chain_ndims=1, name=None):
+  """Gelman and Rubin's potential scale reduction factor for chain convergence.
+
+  Given `N > 1` samples from each of `C > 1` independent chains, the potential
+  scale reduction factor, commonly referred to as R-hat, measures convergence of
+  the chains (to the same target) by testing for equality of means.
+  Specifically, R-hat measures the degree to which variance (of the means)
+  between chains exceeds what one would expect if the chains were identically
+  distributed.  See [1], [2].
+
+  Some guidelines:
+
+  * The initial state of the chains should be drawn from a distribution
+    overdispersed with respect to the target.
+  * If all chains converge to the target, then as `N --> infinity`, R-hat --> 1.
+    Before that, R-hat > 1 (except in pathological cases, e.g. if the chain
+    paths were identical).
+  * The above holds for any number of chains `C > 1`.  Increasing `C` does
+    improves effectiveness of the diagnostic.
+  * Sometimes, R-hat < 1.2 is used to indicate approximate convergence, but of
+    course this is problem depedendent.  See [2].
+  * R-hat only measures non-convergence of the mean. If higher moments, or other
+    statistics are desired, a different diagnostic should be used.  See [2].
+
+  #### Examples
+
+  Diagnosing convergence by monitoring 10 chains that each attempt to
+  sample from a 2-variate normal.
+
+  ```python
+  tfd = tf.contrib.distributions
+  tfb = tf.contrib.bayesflow
+
+  target = tfd.MultivariateNormalDiag(scale_diag=[1., 2.])
+
+  # Get 10 (2x) overdispersed initial states.
+  initial_state = target.sample(10) * 2.
+  ==> (10, 2)
+
+  # Get 1000 samples from the 10 independent chains.
+  state = tfb.hmc.sample_chain(
+      num_results=1000,
+      target_log_prob_fn=target.log_prob,
+      current_state=initial_state,
+      step_size=0.05,
+      num_leapfrog_steps=20,
+      num_burnin_steps=200)
+  state.shape
+  ==> (1000, 10, 2)
+
+  rhat = tfb.mcmc_diagnostics.potential_scale_reduction(
+      state, independent_chain_ndims=1)
+
+  # The second dimension needed a longer burn-in.
+  rhat.eval()
+  ==> [1.05, 1.3]
+  ```
+
+  To see why R-hat is reasonable, let `X` be a random variable drawn uniformly
+  from the combined states (combined over all chains).  Then, in the limit
+  `N, C --> infinity`, with `E`, `Var` denoting expectation and variance,
+
+  ```R-hat = ( E[Var[X | chain]] + Var[E[X | chain]] ) / E[Var[X | chain]].```
+
+  Using the law of total variance, the numerator is the variance of the combined
+  states, and the denominator is the total variance minus the variance of the
+  the individual chain means.  If the chains are all drawing from the same
+  distribution, they will have the same mean, and thus the ratio should be one.
+
+  [1] "Inference from Iterative Simulation Using Multiple Sequences"
+      Andrew Gelman and Donald B. Rubin
+      Statist. Sci. Volume 7, Number 4 (1992), 457-472.
+  [2] "General Methods for Monitoring Convergence of Iterative Simulations"
+      Stephen P. Brooks and Andrew Gelman
+      Journal of Computational and Graphical Statistics, 1998. Vol 7, No. 4.
+
+  Args:
+    state:  `Tensor` or Python `list` of `Tensor`s representing the state(s) of
+      a Markov Chain at each result step.  The `ith` state is assumed to have
+      shape `[Ni, Ci1, Ci2,...,CiD] + A`.
+      Dimension `0` indexes the `Ni > 1` result steps of the Markov Chain.
+      Dimensions `1` through `D` index the `Ci1 x ... x CiD` independent
+      chains to be tested for convergence to the same target.
+      The remaining dimensions, `A`, can have any shape (even empty).
+    independent_chain_ndims: Integer type `Tensor` with value `>= 1` giving the
+      number of giving the number of dimensions, from `dim = 1` to `dim = D`,
+      holding independent chain results to be tested for convergence.
+    name: `String` name to prepend to created ops.  Default:
+      `potential_scale_reduction`.
+
+  Returns:
+    `Tensor` or Python `list` of `Tensor`s representing the R-hat statistic for
+    the state(s).  Same `dtype` as `state`, and shape equal to
+    `state.shape[1 + independent_chain_ndims:]`.
+
+  Raises:
+    ValueError:  If `independent_chain_ndims < 1`.
+  """
+  # tensor_util.constant_value returns None iff a constant value (as a numpy
+  # array) is not efficiently computable.  Therefore, we try constant_value then
+  # check for None.
+  icn_const_ = tensor_util.constant_value(
+      ops.convert_to_tensor(independent_chain_ndims))
+  if icn_const_ is not None:
+    independent_chain_ndims = icn_const_
+    if icn_const_ < 1:
+      raise ValueError(
+          "Argument `independent_chain_ndims` must be `>= 1`, found: {}".format(
+              independent_chain_ndims))
+  with ops.name_scope(
+      name,
+      "potential_scale_reduction",
+      values=[state, independent_chain_ndims]):
+    if _is_list_like(state):
+      return [
+          _potential_scale_reduction_single_state(s, independent_chain_ndims)
+          for s in state
+      ]
+    return _potential_scale_reduction_single_state(state,
+                                                   independent_chain_ndims)
+
+
+def _potential_scale_reduction_single_state(state, independent_chain_ndims):
+  """potential_scale_reduction for one single state `Tensor`."""
+  # We assume exactly one leading dimension indexes e.g. correlated samples from
+  # each Markov chain.
+  state = ops.convert_to_tensor(state, name="state")
+  sample_ndims = 1
+
+  sample_axis = math_ops.range(0, sample_ndims)
+  chain_axis = math_ops.range(sample_ndims,
+                              sample_ndims + independent_chain_ndims)
+  sample_and_chain_axis = math_ops.range(0,
+                                         sample_ndims + independent_chain_ndims)
+
+  n = _axis_size(state, sample_axis)
+  m = _axis_size(state, chain_axis)
+
+  # In the language of [2],
+  # B / n is the between chain variance, the variance of the chain means.
+  # W is the within sequence variance, the mean of the chain variances.
+  b_div_n = _reduce_variance(
+      math_ops.reduce_mean(state, sample_axis, keepdims=True),
+      sample_and_chain_axis,
+      biased=False)
+  w = math_ops.reduce_mean(
+      _reduce_variance(state, sample_axis, keepdims=True, biased=True),
+      sample_and_chain_axis)
+
+  # sigma^2_+ is an estimate of the true variance, which would be unbiased if
+  # each chain was drawn from the target.  c.f. "law of total variance."
+  sigma_2_plus = w + b_div_n
+
+  return ((m + 1.) / m) * sigma_2_plus / w - (n - 1.) / (m * n)
+
+
+def effective_sample_size(state,
+                          independent_chain_ndims=1,
+                          max_lags=None,
+                          max_lags_threshold=None,
+                          name="effective_sample_size"):
+  if max_lags is not None and max_lags_threshold is not None:
+    raise ValueError(
+        "Expected at most one of max_lags, max_lags_threshold to be provided.  "
+        "Found: {}, {}".format(max_lags, max_lags_threshold))
+  with ops.name_scope(
+      name,
+      values=[state, independent_chain_ndims, max_lags, max_lags_threshold]):
+    pass
+
+
+# TODO(b/72873233) Move some variant of this to sample_stats.
+def _reduce_variance(x, axis=None, biased=True, keepdims=False):
+  with ops.name_scope("reduce_variance"):
+    x = ops.convert_to_tensor(x, name="x")
+    mean = math_ops.reduce_mean(x, axis=axis, keepdims=True)
+    biased_var = math_ops.reduce_mean(
+        math_ops.squared_difference(x, mean), axis=axis, keepdims=keepdims)
+    if biased:
+      return biased_var
+    n = _axis_size(x, axis)
+    return (n / (n - 1.)) * biased_var
+
+
+def _axis_size(x, axis=None):
+  """Get number of elements of `x` in `axis`, as type `x.dtype`."""
+  if axis is None:
+    return math_ops.cast(array_ops.size(x), x.dtype)
+  return math_ops.cast(
+      math_ops.reduce_prod(array_ops.gather(array_ops.shape(x), axis)), x.dtype)
+
+
+def _is_list_like(x):
+  """Helper which returns `True` if input is `list`-like."""
+  return isinstance(x, (tuple, list))
-- 
GitLab


From 4babfb446d3b904f413652df773ec87d2bd9cb0b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 6 Feb 2018 03:11:59 -0800
Subject: [PATCH 700/966] Makes ResourceVariables return correct
 initialized_value and initial_value for objects created from VariableDef
 protos.

Previously self._initial_value wasn't set in such cases which causes accessing var.initial_value to fail for variables in the imported meta graphs.

PiperOrigin-RevId: 184657191
---
 .../resource_variable_ops_test.py             | 26 +++++++++++++++++++
 .../python/ops/resource_variable_ops.py       | 14 ++++++++++
 2 files changed, 40 insertions(+)

diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index cd94579688..dc6e73bd5b 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -276,6 +276,32 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     v.load(2.0)
     self.assertEqual(2.0, self.evaluate(v.value()))
 
+  def testVariableDefInitializedInstances(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      v_def = resource_variable_ops.ResourceVariable(
+          initial_value=constant_op.constant(3.0)).to_proto()
+
+    with ops.Graph().as_default(), self.test_session() as sess:
+      # v describes a VariableDef-based variable without an initial value.
+      v = resource_variable_ops.ResourceVariable(variable_def=v_def)
+      self.assertEqual(3.0, sess.run(v.initialized_value()))
+
+      # initialized_value should not rerun the initializer_op if the variable
+      # has already been initialized elsewhere.
+      sess.run(v.assign(1.0))
+      self.assertEqual(1.0, v.initialized_value().eval())
+
+    v_def.ClearField("initial_value_name")
+    with ops.Graph().as_default(), self.test_session() as sess:
+      # Restoring a legacy VariableDef proto that does not have
+      # initial_value_name set should still work.
+      v = resource_variable_ops.ResourceVariable(variable_def=v_def)
+      # We should also be able to re-export the variable to a new meta graph.
+      self.assertProtoEquals(v_def, v.to_proto())
+      # But attempts to use initialized_value will result in errors.
+      with self.assertRaises(ValueError):
+        sess.run(v.initialized_value())
+
   @test_util.run_in_graph_and_eager_modes()
   def testSparseRead(self):
     with self.test_session():
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index cc9f7981e4..75cb57f16f 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -503,6 +503,14 @@ class ResourceVariable(variables.Variable):
     self._initializer_op = g.as_graph_element(
         ops.prepend_name_scope(
             variable_def.initializer_name, import_scope=import_scope))
+    # Check whether initial_value_name exists for backwards compatibility.
+    if (hasattr(variable_def, "initial_value_name") and
+        variable_def.initial_value_name):
+      self._initial_value = g.as_graph_element(
+          ops.prepend_name_scope(variable_def.initial_value_name,
+                                 import_scope=import_scope))
+    else:
+      self._initial_value = None
     if variable_def.snapshot_name:
       self._cached_value = g.as_graph_element(
           ops.prepend_name_scope(
@@ -699,6 +707,12 @@ class ResourceVariable(variables.Variable):
       var_def = variable_pb2.VariableDef()
       var_def.variable_name = ops.strip_name_scope(self.handle.name,
                                                    export_scope)
+      if self._initial_value is not None:
+        # This is inside an if-statement for backwards compatibility, since
+        # self._initial_value might be None for variables constructed from old
+        # protos.
+        var_def.initial_value_name = ops.strip_name_scope(
+            self._initial_value.name, export_scope)
       var_def.initializer_name = ops.strip_name_scope(self.initializer.name,
                                                       export_scope)
       if self._cached_value is not None:
-- 
GitLab


From 883559fbf5a86b1088bc9537adc5c8ad7cc25fe6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 6 Feb 2018 08:11:41 -0800
Subject: [PATCH 701/966] Remove redundant calls to realloc dynamic tensors.

PiperOrigin-RevId: 184682942
---
 tensorflow/contrib/lite/kernels/batch_to_space_nd.cc | 1 -
 tensorflow/contrib/lite/kernels/mean.cc              | 2 --
 tensorflow/contrib/lite/kernels/pad.cc               | 1 -
 tensorflow/contrib/lite/kernels/resize_bilinear.cc   | 1 -
 tensorflow/contrib/lite/kernels/space_to_batch_nd.cc | 1 -
 tensorflow/contrib/lite/kernels/strided_slice.cc     | 1 -
 tensorflow/contrib/lite/kernels/transpose.cc         | 1 -
 7 files changed, 8 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
index 889239f932..bc438f99c6 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
@@ -116,7 +116,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Resize the output tensor if the output tensor is dynamic.
   if (IsDynamicTensor(op_context.output)) {
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
-    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
   }
 
 #define TF_LITE_BATCH_TO_SPACE_ND(type, scalar)                        \
diff --git a/tensorflow/contrib/lite/kernels/mean.cc b/tensorflow/contrib/lite/kernels/mean.cc
index ec1c402027..aff19581ea 100644
--- a/tensorflow/contrib/lite/kernels/mean.cc
+++ b/tensorflow/contrib/lite/kernels/mean.cc
@@ -183,8 +183,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     TF_LITE_ENSURE_OK(context,
                       ResizeTempAxis(context, &op_context, resolved_axis));
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
-    TfLiteTensorRealloc(resolved_axis->bytes, resolved_axis);
-    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
   }
 
 #define TF_LITE_MEAN(kernel_type, data_type)                        \
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index 48114e5a40..c29da3862e 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -101,7 +101,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Resize the output tensor if the output tensor is dynamic.
   if (IsDynamicTensor(op_context.output)) {
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
-    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
   }
 
   // TODO(nupurgarg): Change kernel implementation to take in int* instead of
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
index c5d60cae3a..9e3e19c09a 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
@@ -85,7 +85,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   if (IsDynamicTensor(output)) {
     TF_LITE_ENSURE_OK(context,
                       ResizeOutputTensor(context, input, size, output));
-    TfLiteTensorRealloc(output->bytes, output);
   }
 
   if (output->type == kTfLiteFloat32) {
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
index e2e1873f77..d8c9e352f0 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
@@ -111,7 +111,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Resize the output tensor if the output tensor is dynamic.
   if (IsDynamicTensor(op_context.output)) {
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
-    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
   }
 
 #define TF_LITE_SPACE_TO_BATCH_ND(type, scalar)                        \
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
index c4ffdf79d3..fb1e11e0ca 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -181,7 +181,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   if (IsDynamicTensor(op_context.output)) {
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
-    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
   }
 
   std::vector<int32_t> starts;
diff --git a/tensorflow/contrib/lite/kernels/transpose.cc b/tensorflow/contrib/lite/kernels/transpose.cc
index 093814bc44..d3c10a9bb7 100644
--- a/tensorflow/contrib/lite/kernels/transpose.cc
+++ b/tensorflow/contrib/lite/kernels/transpose.cc
@@ -90,7 +90,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Resize the output tensor if the output tensor is dynamic.
   if (IsDynamicTensor(op_context.output)) {
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
-    TfLiteTensorRealloc(op_context.output->bytes, op_context.output);
   }
 
   // Reverse the permuted axes and convert to 4D due to the way Dims are
-- 
GitLab


From ef8c0863ca653f235ec2b79beaea32fe6ddee7a9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 6 Feb 2018 08:50:41 -0800
Subject: [PATCH 702/966] Make namedtuples with identical name and field names
 to be considered as the same shallow structure in assert_shallow_structure

PiperOrigin-RevId: 184687609
---
 tensorflow/python/util/nest.py      | 23 ++++++++++++++++++-----
 tensorflow/python/util/nest_test.py |  9 +++++++++
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index c8525ed420..23c2c48f4b 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -497,7 +497,9 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
     shallow_tree: an arbitrarily nested structure.
     input_tree: an arbitrarily nested structure.
     check_types: if `True` (default) the sequence types of `shallow_tree` and
-      `input_tree` have to be the same.
+      `input_tree` have to be the same. Note that even with check_types==True,
+      this function will consider two different namedtuple classes with the same
+      name and _fields attribute to be the same class.
 
   Raises:
     TypeError: If `shallow_tree` is a sequence but `input_tree` is not.
@@ -513,10 +515,21 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
           "Input has type: %s." % type(input_tree))
 
     if check_types and not isinstance(input_tree, type(shallow_tree)):
-      raise TypeError(
-          "The two structures don't have the same sequence type. Input "
-          "structure has type %s, while shallow structure has type %s."
-          % (type(input_tree), type(shallow_tree)))
+      # Duck-typing means that nest should be fine with two different
+      # namedtuples with identical name and fields.
+      shallow_is_namedtuple = _is_namedtuple(shallow_tree, False)
+      input_is_namedtuple = _is_namedtuple(input_tree, False)
+      if shallow_is_namedtuple and input_is_namedtuple:
+        if not _same_namedtuples(shallow_tree, input_tree):
+          raise TypeError(
+              "The two namedtuples don't have the same sequence type. Input "
+              "structure has type %s, while shallow structure has type %s."
+              % (type(input_tree), type(shallow_tree)))
+      else:
+        raise TypeError(
+            "The two structures don't have the same sequence type. Input "
+            "structure has type %s, while shallow structure has type %s."
+            % (type(input_tree), type(shallow_tree)))
 
     if len(input_tree) != len(shallow_tree):
       raise ValueError(
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 8aaf799fd0..4439d6241e 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -429,6 +429,15 @@ class NestTest(test.TestCase):
     inp_ba = collections.OrderedDict([("b", (2, 3)), ("a", 1)])
     nest.assert_shallow_structure(inp_ab, inp_ba)
 
+    # This assertion is expected to pass: two namedtuples with the same
+    # name and field names are considered to be identical.
+    same_name_type_0 = collections.namedtuple("same_name", ("a", "b"))
+    same_name_type_1 = collections.namedtuple("same_name", ("a", "b"))
+    inp_shallow = same_name_type_0(1, 2)
+    inp_deep = same_name_type_1(1, [1, 2, 3])
+    nest.assert_shallow_structure(inp_shallow, inp_deep, check_types=False)
+    nest.assert_shallow_structure(inp_shallow, inp_deep, check_types=True)
+
   def testFlattenUpTo(self):
     # Shallow tree ends at scalar.
     input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
-- 
GitLab


From ec6e45e9e9a7044c6aab7097ae1f08f62a59449b Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Tue, 6 Feb 2018 08:53:58 -0800
Subject: [PATCH 703/966] Java: Update to 1.6.0-rc0

PiperOrigin-RevId: 184687994
---
 tensorflow/java/maven/libtensorflow/pom.xml         | 2 +-
 tensorflow/java/maven/libtensorflow_jni/pom.xml     | 2 +-
 tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml | 2 +-
 tensorflow/java/maven/pom.xml                       | 2 +-
 tensorflow/java/maven/proto/pom.xml                 | 2 +-
 tensorflow/java/maven/tensorflow/pom.xml            | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index a9ce5372ae..99add51069 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0</version>
+    <version>1.6.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index fe34ca83ff..7bb9879f68 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0</version>
+    <version>1.6.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index 390152808e..268e1bae1f 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0</version>
+    <version>1.6.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 524ec45f48..6a3abcbc11 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.5.0</version>
+  <version>1.6.0-rc0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 9cf3217f51..54a4fd577a 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0</version>
+    <version>1.6.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index d619f986a9..76e0fecae4 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.5.0</version>
+    <version>1.6.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
-- 
GitLab


From 37bcd5af97c822e99f617813269351447c79f161 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Tue, 6 Feb 2018 09:26:20 -0800
Subject: [PATCH 704/966] Fix some BUILD files

---
 tensorflow/contrib/tensorrt/BUILD           |  1 -
 third_party/gpus/cuda_configure.bzl         | 14 +++++++-------
 third_party/tensorrt/BUILD.tpl              |  3 ++-
 third_party/tensorrt/tensorrt_configure.bzl |  5 +----
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index d29ecbb2e6..bcd3c8c547 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -9,7 +9,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_custom_op_library",
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index e23a533716..b7c47a19dd 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -367,19 +367,19 @@ def find_cuda_define(repository_ctx, header_dir, header_file, define):
   if result.stdout.find(define) == -1:
     auto_configure_fail("Cannot find line containing '%s' in %s" %
                         (define, h_path))
-  #split results to lines
+  # Split results to lines
   lines = result.stdout.split('\n')
-  lenLines = len(lines)
-  for l in range(lenLines):
+  num_lines = len(lines)
+  for l in range(num_lines):
     line = lines[l]
-    if define in line: # find the line with define
+    if define in line:  # Find the line with define
       version = line
-      if l != lenLines-1 and line[-1] == '\\': # add next line, if multiline
+      if l != num_lines-1 and line[-1] == '\\':  # Add next line, if multiline
         version = version[:-1] + lines[l+1]
       break
-  #remove any comments
+  # Remove any comments
   version = version.split("//")[0]
-  # remove define name
+  # Remove define name
   version = version.replace(define, "").strip()
   # Remove the code after the version number.
   version_end = version.find(" ")
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
index 12d0115091..57682e8735 100644
--- a/third_party/tensorrt/BUILD.tpl
+++ b/third_party/tensorrt/BUILD.tpl
@@ -1,11 +1,12 @@
 # NVIDIA TensorRT
 # A high-performance deep learning inference optimizer and runtime.
 
-load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
 licenses(["notice"])
 
 exports_files(["LICENSE"])
 
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
+
 package(default_visibility = ["//visibility:public"])
 
 cc_library(
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
index 4a1441500a..8e76e5d02a 100644
--- a/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -20,10 +20,7 @@ _TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
 _TF_TENSORRT_VERSION = "TF_TENSORRT_VERSION"
 
 _TF_TENSORRT_LIBS = ["nvinfer"]
-_TF_TENSORRT_HEADERS = [
-    "NvInfer.h",
-    "NvUtils.h"
-]
+_TF_TENSORRT_HEADERS = ["NvInfer.h", "NvUtils.h"]
 
 _DEFINE_TENSORRT_SONAME_MAJOR = "#define NV_TENSORRT_SONAME_MAJOR"
 _DEFINE_TENSORRT_SONAME_MINOR = "#define NV_TENSORRT_SONAME_MINOR"
-- 
GitLab


From 59c8e2a276cacb30790e57c403781b185815a5e5 Mon Sep 17 00:00:00 2001
From: Andrew Selle <aselle@google.com>
Date: Tue, 6 Feb 2018 09:28:24 -0800
Subject: [PATCH 705/966] Add a utility function to partition TensorFlow Lite
 graphs into subgraphs.

This will be used for the forthcoming delegate interface. The delegate
interface will allow parts of the TensorFlow lite graph to be sent to
other accelerators. NNAPI will be implemented to move to this interface.
That will allow partial delegation to NNAPI.

PiperOrigin-RevId: 184692126
---
 tensorflow/contrib/lite/BUILD              |   7 +-
 tensorflow/contrib/lite/graph_info.cc      | 224 +++++++++++++++++
 tensorflow/contrib/lite/graph_info.h       |  26 ++
 tensorflow/contrib/lite/graph_info_test.cc | 270 +++++++++++++++++++++
 4 files changed, 522 insertions(+), 5 deletions(-)
 create mode 100644 tensorflow/contrib/lite/graph_info.cc
 create mode 100644 tensorflow/contrib/lite/graph_info_test.cc

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index cc0e20f75e..5f89cbbe43 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -6,6 +6,8 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
 
+exports_files(["LICENSE"])
+
 exports_files(glob([
     "testdata/*.bin",
     "models/testdata/*",
@@ -25,11 +27,6 @@ config_setting(
     },
 )
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
-)
-
 cc_library(
     name = "schema_fbs_version",
     hdrs = ["version.h"],
diff --git a/tensorflow/contrib/lite/graph_info.cc b/tensorflow/contrib/lite/graph_info.cc
new file mode 100644
index 0000000000..186b80fe82
--- /dev/null
+++ b/tensorflow/contrib/lite/graph_info.cc
@@ -0,0 +1,224 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/graph_info.h"
+#include <algorithm>
+
+namespace tflite {
+
+namespace {
+
+// Provide a range iterable wrapper for TfLiteIntArray* (C lists that TfLite
+// C api uses. Can't use the google array_view, since we can't depend on even
+// absl for embedded device reasons.
+// TODO(aselle): Move this into central utilities.
+class TfLiteIntArrayView {
+ public:
+  // Construct a view of a TfLiteIntArray*. Note, `int_array` should be non-null
+  // and this view does not take ownership of it.
+  explicit TfLiteIntArrayView(const TfLiteIntArray* int_array)
+      : int_array_(int_array) {}
+
+  typedef const int* const_iterator;
+  const_iterator begin() const { return int_array_->data; }
+  const_iterator end() const { return &int_array_->data[int_array_->size]; }
+
+  TfLiteIntArrayView(const TfLiteIntArrayView&) = default;
+  TfLiteIntArrayView& operator=(const TfLiteIntArrayView& rhs) = default;
+
+ private:
+  const TfLiteIntArray* int_array_;
+};
+
+// Helper class that actually performs partitioning by subgraph.
+// Outputs to a provided `subgraphs` structure.
+//
+// Example usage:
+// PartitionGraphIntoIndependentSubgraphsImpl partitioner(
+//     info, nodes_to_part, subgraphs);
+// partitioner.Partition();
+class PartitionGraphIntoIndependentSubgraphsImpl {
+ public:
+  PartitionGraphIntoIndependentSubgraphsImpl(
+      const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
+      std::vector<Subgraph>* subgraphs)
+      : info_(info),
+        subgraphs_(subgraphs),
+        node_type_(info->num_nodes(), Subgraph::kTfNonPartition) {
+    // Populate the node_type_ map.
+    for (auto node_index : TfLiteIntArrayView(nodes_to_partition)) {
+      node_type_[node_index] = Subgraph::kTfPartition;
+    }
+  }
+
+  // Actually partition the graph.
+  void Partition() {
+    // Initialize here to make Partition() re-entrant.
+    subgraphs_->clear();
+    tensor_epochs_.clear();
+    tensor_epochs_.resize(info_->num_tensors(), kEpochAlwaysReady);
+    node_epochs_.clear();
+    node_epochs_.resize(info_->num_nodes(), kEpochNotReady);
+    // Set computed tensors to be kEpochNotReady (initializer set everything to
+    // AlwaysReady).
+    for (int node_index = 0; node_index < info_->num_nodes(); node_index++) {
+      const TfLiteNode& node = info_->node(node_index);
+      for (int output_tensor_index : TfLiteIntArrayView(node.outputs)) {
+        tensor_epochs_[output_tensor_index] = kEpochNotReady;
+      }
+    }
+
+    // Do a graph traversal where each iteration in the loop is an epoch
+    // that corresponds to a subgraph that only contains nodes that are of
+    // the same node_type_.
+    while (true) {
+      BuildSubgraph();
+      if (subgraphs_->back().nodes.empty()) {
+        subgraphs_->pop_back();
+        break;
+      }
+    }
+
+    // Mark model outputs as subgraph outputs. All the rest have already been
+    // identified.
+    for (int output_index : info_->outputs()) {
+      int output_epoch = tensor_epochs_[output_index];
+      Subgraph& output_subgraph = (*subgraphs_)[output_epoch];
+      output_subgraph.output_tensors.push_back(output_index);
+    }
+    // Make sure every subgraph's inputs and outputs are unique. Since the
+    // list of inputs and outputs is generated in a way that produces
+    // duplicates.
+    for (Subgraph& subgraph : *subgraphs_) {
+      // Sort and uniquefy using standard library algorithms.
+      auto uniquefy = [](std::vector<int>* items) {
+        std::sort(items->begin(), items->end());
+        auto last = std::unique(items->begin(), items->end());
+        items->erase(last, items->end());
+      };
+      uniquefy(&subgraph.input_tensors);
+      uniquefy(&subgraph.output_tensors);
+    }
+  }
+
+ private:
+  // Special integer values needed for tensor_epochs_ and node_epochs_.
+  enum {
+    // The node or tensor is not ready to be assigned an epoch. e.g. a node's
+    // inputs have not all been assigned epochs.
+    kEpochNotReady = -1,
+    // Used for tensor_epochs_. This means that the tensor is always ready.
+    // e.g. an input to the whole model or a constant that has no dependencies.
+    kEpochAlwaysReady = -2
+  };
+
+  // Updates the  node `node_index` and returns true if it is assigned to an
+  // epoch. False is returned if the node is already set to an epoch, its inputs
+  // are not all assigned to epochs, or if it cannot be assigned to the current
+  // epoch since the epoch's node_type doesn't match.
+  bool UpdateNode(int node_index) {
+    const TfLiteNode& node = info_->node(node_index);
+    Subgraph& current_subgraph = subgraphs_->back();
+    int current_epoch = subgraphs_->size() - 1;
+    // Check if node is already done.
+    if (node_epochs_[node_index] != kEpochNotReady) {
+      return false;
+    }
+    // See if all dependencies of this node are already assigned to a
+    // subgraph.
+    for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
+      if (tensor_epochs_[input_tensor_index] == kEpochNotReady) {
+        return false;
+      }
+    }
+    // When we are starting a new epoch, the first ready node defines
+    // the type of that epoch.
+    if (current_subgraph.type == Subgraph::kTfUnexplored) {
+      current_subgraph.type = node_type_[node_index];
+    }
+    // The node gets assigned to this epoch if it is the same type as
+    // the epoch's assigned type. Note, if this is the current ready
+    // node encountered during this epoch, this condition will be
+    // automatically true.
+    if (current_subgraph.type == node_type_[node_index]) {
+      node_epochs_[node_index] = current_epoch;
+      current_subgraph.nodes.push_back(node_index);
+      // All outputs of this node now are assigned to this epoch as
+      // well.
+      for (int output_tensor_index : TfLiteIntArrayView(node.outputs)) {
+        tensor_epochs_[output_tensor_index] = current_epoch;
+      }
+      // Look at our inputs one more time to update that tensor's
+      // epochs' outputs
+      for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
+        int input_epoch = tensor_epochs_[input_tensor_index];
+        int node_epoch = current_epoch;
+        if (input_epoch != node_epoch) {
+          current_subgraph.input_tensors.push_back(input_tensor_index);
+          // Set inputs to be outputs of the subgraph where they reside.
+          // the if condition makes sure inputs to the whole computation
+          // are not included (i.e. those initialized to -2 above).
+          if (input_epoch >= 0) {
+            Subgraph& input_subgraph = (*subgraphs_)[input_epoch];
+            input_subgraph.output_tensors.push_back(input_tensor_index);
+          }
+        }
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  // Completely populates the current subgraph by doing graph traversal
+  void BuildSubgraph() {
+    subgraphs_->emplace_back(Subgraph());
+    // loop until no more nodes can be updated.
+    while (true) {
+      bool did_something = false;
+      for (int node_index = 0; node_index < info_->num_nodes(); node_index++) {
+        if (UpdateNode(node_index)) {
+          did_something = true;
+        }
+      }
+      if (!did_something) return;
+    }
+  }
+
+  // Temporary data needed for partitioning.
+  const GraphInfo* info_;
+  // List of subgraphs to populate
+  std::vector<Subgraph>* subgraphs_;
+  std::vector<Subgraph::Type> node_type_;
+  // Maps from tensor index to the epoch in which it is assigned. Also special
+  // negative values of kEpochNotAssigned if not assigned, kEpochNotReady if it
+  // is an input or constant.
+  std::vector<int> tensor_epochs_;
+  // Maps from tensor index to the epoch in which it is assigned. Also special
+  // negative values of kEpochNotAssigned if not assigned.
+  std::vector<int> node_epochs_;
+};
+
+}  // namespace
+
+TfLiteStatus PartitionGraphIntoIndependentSubgraphs(
+    const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
+    std::vector<Subgraph>* subgraphs) {
+  PartitionGraphIntoIndependentSubgraphsImpl(info, nodes_to_partition,
+                                             subgraphs)
+      .Partition();
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/graph_info.h b/tensorflow/contrib/lite/graph_info.h
index 57690058c4..313af5fb75 100644
--- a/tensorflow/contrib/lite/graph_info.h
+++ b/tensorflow/contrib/lite/graph_info.h
@@ -48,6 +48,32 @@ class GraphInfo {
   virtual const std::vector<int>& outputs() const = 0;
 };
 
+// Represents a subgraph of a TensorFlow Lite graph.
+struct Subgraph {
+  enum Type {
+    kTfUnexplored = 0,  // temporarily used during creation
+    kTfPartition,
+    kTfNonPartition
+  };
+  Type type = kTfUnexplored;
+  // Nodes within the subgraph
+  std::vector<int> nodes;
+  // Tensors that stride output from another subgraph that this depends on,
+  // or global inputs to the TensorFlow Lite full graph.
+  std::vector<int> input_tensors;
+  // Outputs that are consumed by other subgraphs or are global output tensors.
+  // All output tensors of the nodes in the subgraph that do not appear in this
+  // list are intermediate results that can be potentially elided.
+  std::vector<int> output_tensors;
+};
+
+// Partitions a list of node indices `nodes_to_partition` into subgraphs.
+// Each subgraph is in dependency order (i.e. all members of the subgraph).
+// `subgraphs` is assumed to be empty.
+TfLiteStatus PartitionGraphIntoIndependentSubgraphs(
+    const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
+    std::vector<Subgraph>* subgraphs);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
diff --git a/tensorflow/contrib/lite/graph_info_test.cc b/tensorflow/contrib/lite/graph_info_test.cc
new file mode 100644
index 0000000000..ea38b43993
--- /dev/null
+++ b/tensorflow/contrib/lite/graph_info_test.cc
@@ -0,0 +1,270 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "tensorflow/contrib/lite/graph_info.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace {
+
+// Makes a TfLiteIntArray* from std::vector, must free with TfLiteIntFree().
+TfLiteIntArray* ConvertVector(const std::vector<int>& x) {
+  TfLiteIntArray* lite = TfLiteIntArrayCreate(x.size());
+  for (size_t i = 0; i < x.size(); i++) lite->data[i] = x[i];
+  return lite;
+}
+
+// A very simple test graph that supports setting in/out tensors on nodes.
+class SimpleTestGraph : public GraphInfo {
+ public:
+  ~SimpleTestGraph() override {
+    for (auto& node : nodes_) {
+      TfLiteIntArrayFree(node.inputs);
+      TfLiteIntArrayFree(node.outputs);
+    }
+  }
+
+  size_t num_tensors() const override { return tensors_.size(); }
+  size_t num_nodes() const override { return nodes_.size(); }
+  const TfLiteNode& node(size_t index) const override { return nodes_[index]; }
+  TfLiteTensor* tensor(size_t index) override { return &tensors_[index]; }
+  const std::vector<int>& inputs() const override { return inputs_; }
+  const std::vector<int>& outputs() const override { return outputs_; }
+
+  void AddNode(const std::vector<int>& inputs,
+               const std::vector<int>& outputs) {
+    nodes_.push_back(TfLiteNode());
+    TfLiteNode& node = nodes_.back();
+    node.inputs = ConvertVector(inputs);
+    node.outputs = ConvertVector(outputs);
+  }
+
+  void AddTensors(int count) { tensors_.resize(count + tensors_.size()); }
+
+  void SetInputsAndOutputs(const std::vector<int>& inputs,
+                           const std::vector<int>& outputs) {
+    inputs_ = inputs;
+    outputs_ = outputs;
+  }
+
+ private:
+  std::vector<TfLiteNode> nodes_;
+  std::vector<TfLiteTensor> tensors_;
+  std::vector<int> inputs_;
+  std::vector<int> outputs_;
+};
+
+// Partition a graph to generate a list of subgraphs. This wraps the API call
+// we are testing and handles memory management and conversion to
+// TfLiteIntArray. Populates `subgraphs` with resulting generated subgraphs.
+void PartitionGraph(const SimpleTestGraph& graph,
+                    const std::vector<int>& nodes_to_partition,
+                    std::vector<Subgraph>* subgraphs) {
+  TfLiteIntArray* nodes_to_partition_int_array =
+      ConvertVector(nodes_to_partition);
+  PartitionGraphIntoIndependentSubgraphs(&graph, nodes_to_partition_int_array,
+                                         subgraphs);
+  TfLiteIntArrayFree(nodes_to_partition_int_array);
+}
+
+// Check a generated list of subgraphs against the expected list of subgraphs.
+void CheckPartitionSubgraphs(const std::vector<Subgraph>& generated_subgraphs,
+                             const std::vector<Subgraph>& expected_subgraphs) {
+  ASSERT_EQ(generated_subgraphs.size(), expected_subgraphs.size());
+  for (int subgraph_index = 0; subgraph_index < generated_subgraphs.size();
+       subgraph_index++) {
+    EXPECT_EQ(generated_subgraphs[subgraph_index].nodes,
+              expected_subgraphs[subgraph_index].nodes);
+    EXPECT_EQ(generated_subgraphs[subgraph_index].input_tensors,
+              expected_subgraphs[subgraph_index].input_tensors);
+    EXPECT_EQ(generated_subgraphs[subgraph_index].output_tensors,
+              expected_subgraphs[subgraph_index].output_tensors);
+  }
+}
+
+// Test an empty trivial graph with no partitions.
+TEST(PartitionTest, Nodes0_PartitionNodes0) {
+  SimpleTestGraph graph;
+  std::vector<int> nodes_to_partition = {};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+  CheckPartitionSubgraphs(generated_subgraphs, {});
+}
+
+// Test a 1 node graph with no partitions.
+// Input: tensor(0) -> node(0) -> tensor(1), nodes_to_partition=[]
+// Output: [kTfNoPartition, tensor(0) -> node(0) -> tensor(1)]
+TEST(PartitionTest, Nodes1PartitionNodes0) {
+  SimpleTestGraph graph;
+  graph.AddTensors(2);
+  graph.AddNode({0}, {1});
+  graph.SetInputsAndOutputs({0}, {1});
+  std::vector<int> nodes_to_partition = {};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph;
+  expected_subgraph.type = Subgraph::kTfNonPartition;
+  expected_subgraph.nodes = {0};
+  expected_subgraph.input_tensors = {0};
+  expected_subgraph.output_tensors = {1};
+  CheckPartitionSubgraphs(generated_subgraphs, {expected_subgraph});
+}
+
+// Test a 1 node graph with no inputs that is fully partitioned.
+// Input: node(0) -> tensor(1), nodes_to_partition=[node0]
+// Output: [kTfPartition, node(0) -> tensor(1)]
+TEST(PartitionTest, Nodes1PartitionNodes0Inputs0) {
+  SimpleTestGraph graph;
+  graph.AddTensors(1);
+  graph.AddNode({}, {0});
+  graph.SetInputsAndOutputs({}, {0});
+  std::vector<Subgraph> generated_subgraphs;
+  std::vector<int> nodes_to_partition = {0};
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph;
+  expected_subgraph.type = Subgraph::kTfPartition;
+  expected_subgraph.nodes = {0};
+  expected_subgraph.input_tensors = {};
+  expected_subgraph.output_tensors = {0};
+  CheckPartitionSubgraphs(generated_subgraphs, {expected_subgraph});
+}
+
+// Test a 1 node graph that is partitioned completely.
+// Input: tensor(0) -> node(0) -> tensor(1), nodes_to_partition=[node0]
+// Output: [kTfPartition, tensor(0) -> node(0) -> tensor(1)]
+TEST(PartitionTest, Nodes1PartitionNodes1) {
+  SimpleTestGraph graph;
+  graph.AddTensors(2);
+  graph.AddNode({0}, {1});
+  graph.SetInputsAndOutputs({0}, {1});
+  std::vector<int> nodes_to_partition = {0};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph;
+  expected_subgraph.type = Subgraph::kTfPartition;
+  expected_subgraph.nodes = {0};
+  expected_subgraph.input_tensors = {0};
+  expected_subgraph.output_tensors = {1};
+  CheckPartitionSubgraphs(generated_subgraphs, {expected_subgraph});
+}
+
+// Test a 2 node graph where 1 node is partitioned and the other is not.
+// Input: tensor(0) -> node(0) -> tensor(1) -> node(1) -> tensor(2),
+//    nodes_to_partition = [1]
+// Output: [kTfNonPartition, tensor(0) -> node(0) -> tensor(1),
+//          kTfPartition, tensor(1) -> node(1), tensor(2)]
+TEST(PartitionTest, Nodes2PartitionNodes1) {
+  SimpleTestGraph graph;
+  graph.AddTensors(3);
+  graph.AddNode({0}, {1});
+  graph.AddNode({1}, {2});
+  graph.SetInputsAndOutputs({0}, {2});
+  std::vector<int> nodes_to_partition = {1};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph0;
+  expected_subgraph0.type = Subgraph::kTfPartition;
+  expected_subgraph0.nodes = {0};
+  expected_subgraph0.input_tensors = {0};
+  expected_subgraph0.output_tensors = {1};
+  Subgraph expected_subgraph1;
+  expected_subgraph1.type = Subgraph::kTfPartition;
+  expected_subgraph1.nodes = {1};
+  expected_subgraph1.input_tensors = {1};
+  expected_subgraph1.output_tensors = {2};
+  CheckPartitionSubgraphs(generated_subgraphs,
+                          {expected_subgraph0, expected_subgraph1});
+}
+
+// Test a 2 node graph where both nodes are fully partitioned.
+// Input: tensor(0) -> node(0) -> tensor(1) -> node(1) -> tensor(2),
+//    nodes_to_partition = [0, 1]
+// Output: [kTfPartition, tensor(0) -> node(0) -> node(1) -> tensor(1)]
+TEST(PartitionTest, Nodes2PartitionNodes2) {
+  SimpleTestGraph graph;
+  graph.AddTensors(3);
+  graph.AddNode({0}, {1});
+  graph.AddNode({1}, {2});
+  graph.SetInputsAndOutputs({0}, {2});
+  std::vector<int> nodes_to_partition = {0, 1};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph0;
+  expected_subgraph0.type = Subgraph::kTfPartition;
+  expected_subgraph0.nodes = {0, 1};
+  expected_subgraph0.input_tensors = {0};
+  expected_subgraph0.output_tensors = {2};
+  CheckPartitionSubgraphs(generated_subgraphs, {expected_subgraph0});
+}
+
+// Test a three node model where we want to partition nodes 0 and nodes
+// 2, but nodes 0 and nodes 2 cannot be in the same subgraph since node 2
+// depends on node 1 which depends on node 0. Thus, we need to produce three
+// subgraphs.
+//
+// Input: tensor(0) -> node(0) -> tensor(1)
+//        tensor(1) -> node(1) -> tensor(2)
+//        [tensor(2), tensor(1)] -> node(2) -> tensor(3)
+//    nodes_to_partition = [0, 2]
+// Output: [[kTfPartition, tensor(0) -> node(0) -> tensor(1),
+//          [kTfNonPartition, tensor(1) -> node(1) -> tensor(2)],
+//          [kTfPartition, [tensor(2), tensor(1)] -> node(2) -> node(3)]
+TEST(PartitionTest, Nodes3PartitionNodes2) {
+  SimpleTestGraph graph;
+  graph.AddTensors(4);
+  graph.AddNode({0}, {1});
+  graph.AddNode({1}, {2});
+  graph.AddNode({1, 2}, {3});
+  graph.SetInputsAndOutputs({0}, {3});
+  std::vector<int> nodes_to_partition = {0, 2};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph0;
+  expected_subgraph0.type = Subgraph::kTfPartition;
+  expected_subgraph0.nodes = {0};
+  expected_subgraph0.input_tensors = {0};
+  expected_subgraph0.output_tensors = {1};
+  Subgraph expected_subgraph1;
+  expected_subgraph1.type = Subgraph::kTfNonPartition;
+  expected_subgraph1.nodes = {1};
+  expected_subgraph1.input_tensors = {1};
+  expected_subgraph1.output_tensors = {2};
+  Subgraph expected_subgraph2;
+  expected_subgraph2.type = Subgraph::kTfPartition;
+  expected_subgraph2.nodes = {2};
+  expected_subgraph2.input_tensors = {1, 2};
+  expected_subgraph2.output_tensors = {3};
+  CheckPartitionSubgraphs(
+      generated_subgraphs,
+      {expected_subgraph0, expected_subgraph1, expected_subgraph2});
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
-- 
GitLab


From 360ae1cd2c7a73f96e8fb4fd601521dc108246cf Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Tue, 6 Feb 2018 09:34:17 -0800
Subject: [PATCH 706/966] Revert "Remove LICENSE file. It will be added
 internally instead."

This reverts commit d0c2b84c2d6cd3c235671e67f22e3639b4b2b54c.
---
 third_party/tensorrt/LICENSE | 203 +++++++++++++++++++++++++++++++++++
 1 file changed, 203 insertions(+)
 create mode 100644 third_party/tensorrt/LICENSE

diff --git a/third_party/tensorrt/LICENSE b/third_party/tensorrt/LICENSE
new file mode 100644
index 0000000000..146d9b765c
--- /dev/null
+++ b/third_party/tensorrt/LICENSE
@@ -0,0 +1,203 @@
+Copyright 2018 The TensorFlow Authors.  All rights reserved.
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2018, The TensorFlow Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
-- 
GitLab


From c446422e3857344d9b94a1521ff86734b700f1ae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 6 Feb 2018 09:40:53 -0800
Subject: [PATCH 707/966] Fix bug in and speed up
 ConstantFolding::CreateNodeDef():   * Fix bug trying to store more than
 kintmax32 values in a repeated proto field.   * Speed up populating
 compressed format. Example:
 tensorflow/python/kernel_tests/large_concat_op_test with size = 2**29+6 goes
 from ~30 seconds to ~15 seconds. The fraction of time spent in
 ConstantFolding::CreateNodeDef() goes down from about 35% to about 12%.

PiperOrigin-RevId: 184693749
---
 .../grappler/optimizers/constant_folding.cc   | 34 +++++++++++--------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 37a4759fdd..1e6f11c8aa 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -808,20 +808,26 @@ NodeDef ConstantFolding::CreateNodeDef(const string& name,
   // Use the packed representation whenever possible to avoid generating large
   // graphdefs. Moreover, avoid repeating the last values if they're equal.
   if (tensor->NumElements() > 4) {
-#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)         \
-  optimized = true;                                          \
-  TYPE last = tensor->flat<TYPE>()(0);                       \
-  int last_index = 0;                                        \
-  for (int i = 0; i < tensor->NumElements(); ++i) {          \
-    TYPE cur = tensor->flat<TYPE>()(i);                      \
-    t->add_##NAME##_val(cur);                                \
-    if (cur != last) {                                       \
-      last = cur;                                            \
-      last_index = i;                                        \
-    }                                                        \
-  }                                                          \
-  /* Remove all identical trailing values to save memory. */ \
-  t->mutable_##NAME##_val()->Truncate(last_index + 1);
+#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)                \
+  const TYPE* val_ptr = tensor->flat<TYPE>().data();                \
+  TYPE last = *val_ptr;                                             \
+  int64 last_index = 0;                                             \
+  for (int64 i = 0; i < tensor->NumElements(); ++i) {               \
+    TYPE cur = *val_ptr++;                                          \
+    if (cur != last) {                                              \
+      last = cur;                                                   \
+      last_index = i;                                               \
+    }                                                               \
+  }                                                                 \
+  if (last_index < kint32max) {                                     \
+    optimized = true;                                               \
+    t->mutable_##NAME##_val()->Reserve(last_index + 1);             \
+    t->mutable_##NAME##_val()->AddNAlreadyReserved(last_index + 1); \
+    val_ptr = tensor->flat<TYPE>().data();                          \
+    for (int64 i = 0; i <= last_index; ++i) {                       \
+      t->set_##NAME##_val(i, *val_ptr++);                           \
+    }                                                               \
+  }
 
     if (tensor->dtype() == DT_FLOAT) {
       POPULATE_TENSOR_PROTO(tensor, t, float, float)
-- 
GitLab


From c9527bf5caee75192e697008a7d2d732e92060b2 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Tue, 6 Feb 2018 10:11:26 -0800
Subject: [PATCH 708/966] Explicitly specify the value of all the attributes
 for the newly created Assign nodes since we can't always rely on TF calling
 AddDefaultAttrsToGraphDef.

PiperOrigin-RevId: 184698463
---
 tensorflow/core/grappler/optimizers/memory_optimizer.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index be18e7d6f0..9f3e94052f 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -641,6 +641,8 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
     initialize->set_op("Assign");
     initialize->set_device(device);
     (*initialize->mutable_attr())["T"].set_type(dtype);
+    (*initialize->mutable_attr())["use_locking"].set_b(false);
+    (*initialize->mutable_attr())["validate_shape"].set_b(false);
     *initialize->add_input() = tmp_var->name();
     *initialize->add_input() = zeros->name();
 
-- 
GitLab


From 993398d23d60efbc153c6ee7af33b0ecab85d33b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 6 Feb 2018 10:13:04 -0800
Subject: [PATCH 709/966] Add local interconnect data to DeviceLocality.

This information can be used within a distributed implementation
when deciding how to route data transfers that might involve more
than one hop.  By default the new fields are populated according to
StreamExecutor::CanEnablePeerAccessTo(), however a platform-specific
implementation can augment them with more detailed values.

Do some refactoring of gpu_device and gpu_device_factory, making
GetDeviceLocalities() and GetInterconnectMaps() into virtual
functions.

PiperOrigin-RevId: 184698821
---
 .../core/common_runtime/gpu/gpu_device.cc     | 251 ++++++++++++------
 .../core/common_runtime/gpu/gpu_device.h      |  37 ++-
 .../common_runtime/gpu/gpu_device_test.cc     |  12 +
 .../core/framework/device_attributes.proto    |  16 ++
 4 files changed, 237 insertions(+), 79 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index e26cb0fc3e..04b5541863 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -834,6 +834,9 @@ void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
   }
 }
 
+const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000;
+const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1;
+
 Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                                            const string& name_prefix,
                                            std::vector<Device*>* devices) {
@@ -893,6 +896,35 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
     }
   }
 
+  std::vector<InterconnectMap> interconnect_maps;
+  TF_RETURN_IF_ERROR(
+      GetInterconnectMaps(visible_gpu_order, gpu_manager, &interconnect_maps));
+
+  // Print each interconnect map to the log.
+  for (const InterconnectMap& im : interconnect_maps) {
+    LOG(INFO) << "Device interconnect " << im.name << " with strength "
+              << im.strength << " edge matrix:";
+    string line_buf = "     ";
+    for (int i = 0; i < visible_gpu_order.size(); ++i) {
+      strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " ");
+    }
+    LOG(INFO) << line_buf;
+    for (int i = 0; i < visible_gpu_order.size(); ++i) {
+      line_buf = strings::StrCat(visible_gpu_order[i].value(), ":   ");
+      CudaGpuId cuda_id_i = visible_gpu_order[i];
+      for (int j = 0; j < visible_gpu_order.size(); ++j) {
+        CudaGpuId cuda_id_j = visible_gpu_order[j];
+        if (im.directed_links.find({cuda_id_i, cuda_id_j}) !=
+            im.directed_links.end()) {
+          line_buf.append("Y ");
+        } else {
+          line_buf.append("N ");
+        }
+      }
+      LOG(INFO) << line_buf;
+    }
+  }
+
   const auto& virtual_devices = gpu_options.experimental().virtual_devices();
   if (!virtual_devices.empty()) {
     TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
@@ -903,9 +935,9 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
           valid_cuda_gpu_ids == visible_gpu_order);
   }
   int next_tf_gpu_id = 0;
+  std::vector<int64> memory_limit_bytes;
   for (int i = 0; i < num_gpus_to_use; ++i) {
     const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i];
-    std::vector<int64> memory_limit_bytes;
     if (virtual_devices.empty() ||
         virtual_devices.Get(i).memory_limit_mb_size() == 0) {
       int64 single_virtual_device_memory_limit = 0;
@@ -919,14 +951,31 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                        return static_cast<int64>(mb) * (1ll << 20);
                      });
     }
-    for (int64 bytes : memory_limit_bytes) {
+    while (next_tf_gpu_id < memory_limit_bytes.size()) {
       TfGpuId tf_gpu_id(next_tf_gpu_id);
       ++next_tf_gpu_id;
       GpuIdUtil::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id);
-      TF_RETURN_IF_ERROR(
-          CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes, devices));
     }
   }
+  const int num_tf_gpus = next_tf_gpu_id;
+
+  LocalityMap device_localities;
+  TF_RETURN_IF_ERROR(
+      GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));
+
+  // Build the GPUDevices
+  CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size());
+  for (int di = 0; di < num_tf_gpus; ++di) {
+    TfGpuId tf_gpu_id(di);
+    int64 bytes = memory_limit_bytes[di];
+    auto it = device_localities.find(tf_gpu_id);
+    if (it == device_localities.end()) {
+      return errors::Internal("Failed to find DeviceLocality for GPU device ",
+                              tf_gpu_id.value());
+    }
+    TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
+                                       it->second, devices));
+  }
   return Status::OK();
 }
 
@@ -950,41 +999,19 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
                                              const string& name_prefix,
                                              TfGpuId tf_gpu_id,
                                              int64 memory_limit,
+                                             const DeviceLocality& dev_locality,
                                              std::vector<Device*>* devices) {
   CHECK_GE(tf_gpu_id.value(), 0);
   const string device_name =
       strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
-
-  // Look up the device, to see its attributes.
   GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
-  gpu::StreamExecutor* se =
-      GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
-  const gpu::DeviceDescription& desc = se->GetDeviceDescription();
-  int numa_node = desc.numa_node();
-  if (numa_node < 0) {
-    // For some reason the StreamExecutor couldn't get the NUMA
-    // affinity of the GPU.  If this is not a multi-socket mobo with
-    // GPUs local to different buses, it doesn't matter.  If it is, we
-    // may run into trouble later with data transfer operations.  The
-    // trouble may manifest as slower than expected performance, or
-    // outright failures.
-    LOG(INFO) << "Could not identify NUMA node of " << device_name
-              << ", defaulting to 0.  Your kernel may not have been built "
-              << "with NUMA support.";
-    numa_node = 0;
-  }
+  CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
+  int numa_node = dev_locality.numa_node();
   Bytes allocated_bytes = static_cast<Bytes>(memory_limit);
 
-  // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
-  // virtualized in some environments, we can't just use the GPU id.
-  // NUMA locales are indexed from 0, buses are indexed from 1.
-  DeviceLocality dev_locality;
-  dev_locality.set_bus_id(numa_node + 1);
-  const CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
-  VLOG(1) << "GPUDevice id " << cuda_gpu_id << " on bus "
-          << dev_locality.bus_id() << " numa: " << numa_node
-          << " pci: " << desc.pci_bus_id();
-
+  gpu::StreamExecutor* se =
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
+  const gpu::DeviceDescription& desc = se->GetDeviceDescription();
   LOG(INFO) << "Creating TensorFlow device (" << device_name << " with "
             << (memory_limit >> 20) << " MB memory) -> physical GPU ("
             << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
@@ -1001,6 +1028,116 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
   return Status::OK();
 }
 
+namespace {
+std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>>
+GetPeerAccessMap(gpu::Platform* platform,
+                 const std::vector<CudaGpuId>& visible_gpu_order) {
+  std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> map(
+      new std::map<std::pair<CudaGpuId, CudaGpuId>, bool>);
+  for (CudaGpuId cuda_gpu_i : visible_gpu_order) {
+    for (CudaGpuId cuda_gpu_j : visible_gpu_order) {
+      gpu::StreamExecutor* from =
+          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
+      gpu::StreamExecutor* to =
+          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
+      (*map)[{cuda_gpu_i, cuda_gpu_j}] = from->CanEnablePeerAccessTo(to);
+    }
+  }
+
+  return map;
+}
+
+}  // namespace
+
+Status BaseGPUDeviceFactory::GetInterconnectMaps(
+    const std::vector<CudaGpuId>& visible_gpu_order, gpu::Platform* gpu_manager,
+    std::vector<InterconnectMap>* maps) {
+  // The default interconnect map is obtained from the StreamExecutor.
+  auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
+  maps->resize(1);
+  InterconnectMap& imap = maps->at(0);
+  imap.name = "StreamExecutor";
+  imap.strength = InterconnectMap::kStreamExecutorStrength;
+  for (CudaGpuId cuda_id_i : visible_gpu_order) {
+    for (CudaGpuId cuda_id_j : visible_gpu_order) {
+      if (cuda_id_i == cuda_id_j) continue;
+      if ((*access_map)[{cuda_id_i, cuda_id_j}]) {
+        imap.directed_links.insert({cuda_id_i, cuda_id_j});
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status BaseGPUDeviceFactory::GetDeviceLocalities(
+    int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
+    LocalityMap* localities) {
+  std::vector<TfGpuId> all_tf_gpu_ids;
+  for (int i = 0; i < num_tf_gpus; ++i) {
+    all_tf_gpu_ids.push_back(TfGpuId(i));
+  }
+  for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
+    CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
+    // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
+    // virtualized in some environments, we can't just use the GPU id.
+    // NUMA locales are indexed from 0, buses are indexed from 1.
+    gpu::StreamExecutor* se =
+        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
+    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+    int numa_node = desc.numa_node();
+    if (numa_node < 0) {
+      // For some reason the StreamExecutor couldn't get the NUMA
+      // affinity of the GPU.  If this is not a multi-socket mobo with
+      // GPUs local to different buses, it doesn't matter.  If it is, we
+      // may run into trouble later with data transfer operations.  The
+      // trouble may manifest as slower than expected performance, or
+      // outright failures.
+      LOG(INFO) << "Could not identify NUMA node of CUDA gpu id " << cuda_gpu_id
+                << ", defaulting to 0.  Your kernel may not have been built "
+                << "with NUMA support.";
+      numa_node = 0;
+    }
+    DeviceLocality dev_locality;
+    dev_locality.set_numa_node(numa_node);
+    dev_locality.set_bus_id(numa_node + 1);
+
+    // Set LocalLinks from InterconnectMaps.
+    LocalLinks* links = dev_locality.mutable_links();
+    for (const InterconnectMap& imap : interconnects) {
+      for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
+        CudaGpuId cuda_gpu_dst = GpuIdUtil::TfToCudaGpuId(tf_gpu_dst);
+        if (imap.directed_links.find({cuda_gpu_id, cuda_gpu_dst}) !=
+            imap.directed_links.end()) {
+          InterconnectLink* ilink = links->add_link();
+          ilink->set_device_id(tf_gpu_dst.value());
+          ilink->set_type(imap.name);
+          ilink->set_strength(imap.strength);
+        }
+      }
+    }
+
+    // If this is one of multiple virtual GPUs on the same physical GPU
+    // add high strength links to the others.
+    for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
+      if (tf_gpu_id == tf_gpu_dst) continue;
+      CudaGpuId cuda_gpu_dst = GpuIdUtil::TfToCudaGpuId(tf_gpu_dst);
+      if (cuda_gpu_id == cuda_gpu_dst) {
+        InterconnectLink* ilink = links->add_link();
+        ilink->set_device_id(tf_gpu_dst.value());
+        ilink->set_type("SAME_DEVICE");
+        ilink->set_strength(InterconnectMap::kSameDeviceStrength);
+      }
+    }
+
+    (*localities)[tf_gpu_id] = dev_locality;
+    VLOG(1) << "GPUDevice CudaGpuId " << cuda_gpu_id << " TfGpuId " << tf_gpu_id
+            << " on bus " << dev_locality.bus_id() << " numa: " << numa_node
+            << " pci: " << desc.pci_bus_id()
+            << " DeviceLocality: " << dev_locality.DebugString();
+  }
+  return Status::OK();
+}
+
 static int GetDefaultMinGPUMultiprocessorCount(
     gpu::Platform* gpu_manager,
     const std::vector<CudaGpuId>& visible_gpu_order) {
@@ -1106,38 +1243,19 @@ std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
   return cuda_caps;
 }
 
-std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
-    gpu::Platform* platform, const std::vector<CudaGpuId>& visible_gpu_order) {
-  std::unique_ptr<std::map<std::pair<int, int>, bool>> map(
-      new std::map<std::pair<int, int>, bool>);
-  for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    const CudaGpuId i_gpu_id = visible_gpu_order[i];
-    for (int j = 0; j < visible_gpu_order.size(); ++j) {
-      const CudaGpuId j_gpu_id = visible_gpu_order[j];
-      gpu::StreamExecutor* from =
-          GpuIdUtil::ExecutorForCudaGpuId(platform, i_gpu_id).ValueOrDie();
-      gpu::StreamExecutor* to =
-          GpuIdUtil::ExecutorForCudaGpuId(platform, j_gpu_id).ValueOrDie();
-      (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
-    }
-  }
-
-  return map;
-}
-
 Status EnablePeerAccess(gpu::Platform* platform,
                         const std::vector<CudaGpuId>& visible_gpu_order) {
   int possible_peer_count = 0;
   int enabled_peer_count = 0;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    const CudaGpuId i_gpu_id = visible_gpu_order[i];
+    const CudaGpuId cuda_gpu_i = visible_gpu_order[i];
     for (int j = 0; j < visible_gpu_order.size(); ++j) {
-      const CudaGpuId j_gpu_id = visible_gpu_order[j];
+      const CudaGpuId cuda_gpu_j = visible_gpu_order[j];
       // We have already validated that ExecutorForDevice() calls return OK.
       gpu::StreamExecutor* from =
-          GpuIdUtil::ExecutorForCudaGpuId(platform, i_gpu_id).ValueOrDie();
+          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
       gpu::StreamExecutor* to =
-          GpuIdUtil::ExecutorForCudaGpuId(platform, j_gpu_id).ValueOrDie();
+          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
 
       if (from->CanEnablePeerAccessTo(to)) {
         ++possible_peer_count;
@@ -1145,7 +1263,7 @@ Status EnablePeerAccess(gpu::Platform* platform,
         if (!status.ok()) {
           LOG(WARNING)
               << "Unable to enable peer access between device ordinals "
-              << i_gpu_id << " and " << j_gpu_id << ", status: " << status;
+              << cuda_gpu_i << " and " << cuda_gpu_j << ", status: " << status;
         } else {
           ++enabled_peer_count;
         }
@@ -1216,27 +1334,6 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
   if (new_gpu_found && visible_gpu_order.size() > 1) {
     // Enable peer access
     TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
-
-    // Print out a matrix showing which devices can DMA to one
-    // another.
-    LOG(INFO) << "Device peer to peer matrix";
-    auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
-    string line_buf = "DMA: ";
-    for (int i = 0; i < visible_gpu_order.size(); ++i) {
-      strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " ");
-    }
-    LOG(INFO) << line_buf;
-    for (int i = 0; i < visible_gpu_order.size(); ++i) {
-      line_buf = strings::StrCat(visible_gpu_order[i].value(), ":   ");
-      for (int j = 0; j < visible_gpu_order.size(); ++j) {
-        if ((*access_map)[{i, j}]) {
-          line_buf.append("Y ");
-        } else {
-          line_buf.append("N ");
-        }
-      }
-      LOG(INFO) << line_buf;
-    }
   }
 
   auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 41e60b4884..82ce3a2e9d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -140,17 +140,50 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
                        std::vector<Device*>* devices) override;
 
+  struct InterconnectMap {
+    // Name of interconnect technology, if known.
+    string name;
+    // If possible, strength should approximate Gb/sec bandwidth rate.
+    // Where architecture-specific subclassing is not done that won't
+    // always be possible.  The minimum expectation is that
+    // faster links should have a higher value than slower links.
+    int32 strength;
+    static const int kSameDeviceStrength;
+    static const int kStreamExecutorStrength;
+    std::set<std::pair<CudaGpuId, CudaGpuId>> directed_links;
+  };
+
+ protected:
+  // Populates *maps with interconnect maps for all local direct access
+  // pathways between GPUs.
+  virtual Status GetInterconnectMaps(
+      const std::vector<CudaGpuId>& visible_gpu_order,
+      gpu::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
+
+  struct TfGpuIdHash {
+    std::size_t operator()(const TfGpuId& id) const noexcept {
+      return std::hash<int>{}(id.value());
+    }
+  };
+  typedef std::unordered_map<TfGpuId, DeviceLocality, TfGpuIdHash> LocalityMap;
+  // Populates *localities with the DeviceLocality descriptor for
+  // every TfGpuId.
+  virtual Status GetDeviceLocalities(
+      int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
+      LocalityMap* localities);
+
  private:
   // Creates a BaseGPUDevice associated with 'tf_gpu_id', allocates (strictly)
   // 'memory_limit' bytes of GPU memory to it, and adds it to the 'devices'
   // vector.
   Status CreateGPUDevice(const SessionOptions& options,
                          const string& name_prefix, TfGpuId tf_gpu_id,
-                         int64 memory_limit, std::vector<Device*>* devices);
+                         int64 memory_limit, const DeviceLocality& dev_locality,
+                         std::vector<Device*>* devices);
 
   virtual BaseGPUDevice* CreateGPUDevice(const SessionOptions& options,
                                          const string& name, Bytes memory_limit,
-                                         const DeviceLocality& locality,
+                                         const DeviceLocality& dev_locality,
                                          TfGpuId tf_gpu_id,
                                          const string& physical_device_desc,
                                          Allocator* gpu_allocator,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index ff46be9c01..b56823204a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -180,6 +180,18 @@ TEST(GPUDeviceTest, MultipleVirtualDevices) {
   EXPECT_EQ(2, devices.size());
   EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
   EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
+  ASSERT_EQ(1, devices[0]->attributes().locality().links().link_size());
+  ASSERT_EQ(1, devices[1]->attributes().locality().links().link_size());
+  EXPECT_EQ(1, devices[0]->attributes().locality().links().link(0).device_id());
+  EXPECT_EQ("SAME_DEVICE",
+            devices[0]->attributes().locality().links().link(0).type());
+  EXPECT_EQ(BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength,
+            devices[0]->attributes().locality().links().link(0).strength());
+  EXPECT_EQ(0, devices[1]->attributes().locality().links().link(0).device_id());
+  EXPECT_EQ("SAME_DEVICE",
+            devices[1]->attributes().locality().links().link(0).type());
+  EXPECT_EQ(BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength,
+            devices[1]->attributes().locality().links().link(0).strength());
   for (auto d : devices) delete d;
 }
 
diff --git a/tensorflow/core/framework/device_attributes.proto b/tensorflow/core/framework/device_attributes.proto
index 9983bcb6be..0b3c0d5bdf 100644
--- a/tensorflow/core/framework/device_attributes.proto
+++ b/tensorflow/core/framework/device_attributes.proto
@@ -6,10 +6,26 @@ option java_outer_classname = "DeviceAttributesProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
 
+message InterconnectLink {
+  int32 device_id = 1;
+  string type = 2;
+  int32 strength = 3;
+};
+
+message LocalLinks {
+  repeated InterconnectLink link = 1;
+};
+
 message DeviceLocality {
   // Optional bus locality of device.  Default value of 0 means
   // no specific locality.  Specific localities are indexed from 1.
   int32 bus_id = 1;
+
+  // Optional NUMA locality of device.
+  int32 numa_node = 2;
+
+  // Optional local interconnect links to other devices.
+  LocalLinks links = 3;
 };
 
 message DeviceAttributes {
-- 
GitLab


From c8717d589d6822419537008e9254f06b25ce7e68 Mon Sep 17 00:00:00 2001
From: ted chang <htchang@us.ibm.com>
Date: Tue, 6 Feb 2018 10:44:48 -0800
Subject: [PATCH 710/966] remove write_version=saver_pb2.SaverDef.V1 (#15966)

-- 
GitLab


From 1f512c719052096fc8a1a8c8f52cc26d13e7e9ba Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Tue, 6 Feb 2018 10:43:45 -0800
Subject: [PATCH 711/966] Use the function name as the grappler item id

PiperOrigin-RevId: 184704131
---
 tensorflow/core/grappler/grappler_item_builder.cc      | 8 ++++----
 tensorflow/core/grappler/grappler_item_builder.h       | 2 +-
 tensorflow/core/grappler/grappler_item_builder_test.cc | 6 ++++--
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 7a9ad50519..88fddaf019 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -511,14 +511,14 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
 }
 
 std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
-    const string& id, const FunctionDef& func,
+    const FunctionDef& func,
     const std::unordered_map<string, AttrValue>& func_attr) {
-  if (id.empty()) {
-    LOG(ERROR) << "id must be non-empty.";
+  if (func.signature().name().empty()) {
+    LOG(ERROR) << "function name must be specified.";
     return nullptr;
   }
   std::unique_ptr<GrapplerItem> new_item(new GrapplerItem());
-  new_item->id = id;
+  new_item->id = func.signature().name();
 
   std::unordered_map<string, string> port_map;
 
diff --git a/tensorflow/core/grappler/grappler_item_builder.h b/tensorflow/core/grappler/grappler_item_builder.h
index fa6f9faa09..cb116840c1 100644
--- a/tensorflow/core/grappler/grappler_item_builder.h
+++ b/tensorflow/core/grappler/grappler_item_builder.h
@@ -61,7 +61,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
 // Factory method for creating a GrapplerItem from a FunctionDef.
 // Returns nullptr if the given function def cannot be converted.
 std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
-    const string& id, const FunctionDef& func,
+    const FunctionDef& func,
     const std::unordered_map<string, AttrValue>& func_attr);
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/grappler_item_builder_test.cc b/tensorflow/core/grappler/grappler_item_builder_test.cc
index 87377a0258..8dcc4ec0f9 100644
--- a/tensorflow/core/grappler/grappler_item_builder_test.cc
+++ b/tensorflow/core/grappler/grappler_item_builder_test.cc
@@ -301,8 +301,9 @@ TEST_F(GrapplerItemBuilderTest, FromSimpleFunctionDef) {
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
   std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef("test", func, func_attr);
+      GrapplerItemFromFunctionDef(func, func_attr);
   CHECK(item);
+  EXPECT_EQ("XTimesTwo", item->id);
   EXPECT_EQ(4, item->graph.node_size());
   EXPECT_EQ(std::vector<string>({"y"}), item->fetch);
   EXPECT_EQ(1, item->feed.size());
@@ -366,8 +367,9 @@ TEST_F(GrapplerItemBuilderTest, FromFunctionDefWithMultiOutputNodes) {
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
   std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef("test", func, func_attr);
+      GrapplerItemFromFunctionDef(func, func_attr);
   CHECK(item);
+  EXPECT_EQ("SubGrad", item->id);
   EXPECT_EQ(12, item->graph.node_size());
   EXPECT_EQ(std::vector<string>({"dx", "dy"}), item->fetch);
   EXPECT_EQ(3, item->feed.size());
-- 
GitLab


From 78b0e872cd370a941069352e9e4160ee55138725 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 6 Feb 2018 10:48:28 -0800
Subject: [PATCH 712/966] Support training with model parallelism in
 tpu_estimator.

PiperOrigin-RevId: 184704902
---
 .../tpu/python/tpu/device_assignment.py       |  55 +++++++
 .../contrib/tpu/python/tpu/tpu_config.py      |  52 ++++++-
 .../contrib/tpu/python/tpu/tpu_config_test.py |  15 ++
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 145 ++++++++++++++----
 4 files changed, 235 insertions(+), 32 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/device_assignment.py b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
index ee202610a8..bdd9b88af5 100644
--- a/tensorflow/contrib/tpu/python/tpu/device_assignment.py
+++ b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
@@ -87,6 +87,8 @@ class DeviceAssignment(object):
                                            core_assignment.shape))
 
     self._core_assignment = core_assignment
+    self._task_and_cores_to_replicas = self._compute_task_and_cores_to_replicas(
+        self._core_assignment, self._topology_tasks)
 
   def _invert_topology(self, topology):
     """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
@@ -100,6 +102,34 @@ class DeviceAssignment(object):
         devices[x, y, z] = device
     return tasks, devices
 
+  def _compute_task_and_cores_to_replicas(self, core_assignment,
+                                          topology_tasks):
+    """Computes a nested dict which maps task and logical core to replicas."""
+    task_and_cores_to_replicas = {}
+    for replica in xrange(core_assignment.shape[0]):
+      for dx in xrange(core_assignment.shape[1]):
+        for dy in xrange(core_assignment.shape[2]):
+          for dz in xrange(core_assignment.shape[3]):
+            x, y, z = core_assignment[replica, dx, dy, dz, :]
+            task_id = topology_tasks[x, y, z]
+            if task_id not in task_and_cores_to_replicas:
+              task_and_cores_to_replicas[task_id] = {}
+            logical_core = (dx, dy, dz)
+            if logical_core not in task_and_cores_to_replicas[task_id]:
+              task_and_cores_to_replicas[task_id][logical_core] = set()
+
+            task_and_cores_to_replicas[task_id][logical_core].add(replica)
+
+    task_to_sorted_replica_id = {}
+
+    for task, core_to_replicas in task_and_cores_to_replicas.items():
+      core_to_sorted_replicas = {}
+      for core, replicas in core_to_replicas.items():
+        core_to_sorted_replicas[core] = sorted(replicas)
+
+      task_to_sorted_replica_id[task] = core_to_sorted_replicas
+    return task_to_sorted_replica_id
+
   @property
   def topology(self):
     """A `Topology` that describes the TPU topology."""
@@ -119,6 +149,11 @@ class DeviceAssignment(object):
     """
     return self._computation_shape
 
+  @property
+  def num_cores_per_replica(self):
+    """The number of cores per replica."""
+    return np.prod(self.computation_shape)
+
   @property
   def num_replicas(self):
     """The number of replicas of the computation."""
@@ -148,6 +183,26 @@ class DeviceAssignment(object):
     logical_offset = tuple([replica] + logical_core.tolist() + [slice(3)])
     return tuple(self.core_assignment[logical_offset])
 
+  def lookup_replicas(self, task_id, logical_core):
+    """Lookup replica ids by task number and logical core.
+
+    Args:
+      task_id: TensorFlow task number.
+      logical_core: A tuple of three integers which represents a logical core.
+    Returns:
+      A sorted list of the replicas that are attached to that task and
+      loical_core.
+    Raises:
+      ValueError: If no replica exisis in the task which contains the logical
+      core.
+    """
+    try:
+      return self._task_and_cores_to_replicas[task_id][logical_core]
+    except KeyError:
+      raise ValueError(
+          "Can not find any replica in task: {} contains logical_core: {} ".
+          format(task_id, logical_core))
+
   def tpu_ordinal(self, replica=0, logical_core=None):
     """Returns the ordinal of the TPU device assigned to a logical core."""
     coordinates = self._coordinates(replica, logical_core)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 188db6e2f0..b8ae2bf798 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -23,6 +23,8 @@ import collections
 import json
 import os
 
+import numpy as np
+
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.platform import tf_logging as logging
@@ -31,26 +33,39 @@ from tensorflow.python.platform import tf_logging as logging
 _TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
 _SERVICE_KEY = run_config_lib._SERVICE_KEY
 _TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
+_NUM_CORES_PER_HOST = 8
 
 # pylint: enable=protected-access
 
 
+# TODO(b/72511246) Provide a simplified api to configure model parallelism.
 class TPUConfig(
     collections.namedtuple('TPUConfig', [
         'iterations_per_loop',
         'num_shards',
+        'computation_shape',
         'per_host_input_for_training',
         'tpu_job_name',
         'initial_infeed_sleep_secs',
     ])):
-  """TPU related configuration required by `TPUEstimator`.
+  r"""TPU related configuration required by `TPUEstimator`.
 
   Args:
     iterations_per_loop: This is the number of train steps runnining in TPU
       system before returning to CPU host for each `Session.run`. This means
       global step is increased `iterations_per_loop` times in one `Session.run`.
       It is recommended to be set as number of global steps for next checkpoint.
-    num_shards: The number of TPU shards in the system.
+    num_shards: The number of model replicas in the system. For
+      non-model-parallelism case, this number equals the total number of TPU
+      cores. For model-parallelism, the total number of TPU cores equals
+      product(computation_shape) * num_shards.
+    computation_shape: A list of size 3 which describes the shape of a model
+      replica's block of cores. This is required by model-parallelism which
+      enables partitioning the model to multiple cores. For example, [2, 2, 1]
+        means the model
+      is partitioned across 4 cores which span two cores in both x and y
+      coordinates. Set it to `None` for non-model-parallelism. Please refer to
+      ${tf.contrib.tpu.TopologyProto} for the geometry of a TPU mesh.
     per_host_input_for_training: If `True`, `input_fn` is invoked Per-Host
       rather than Per-Core. With Per-Host input pipeline deployment, `input_fn`
       is invoked once on each host. With Per-Core input pipeline deployment, it
@@ -65,11 +80,14 @@ class TPUConfig(
     initial_infeed_sleep_secs: The number of seconds the infeed thread should
       wait before enqueueing the first batch. This helps avoid timeouts for
       models that require a long compilation time.
+    Raises:
+      ValueError: If `computation_shape` or `computation_shape` are invalid.
   """
 
   def __new__(cls,
               iterations_per_loop=2,
               num_shards=2,
+              computation_shape=None,
               per_host_input_for_training=True,
               tpu_job_name=None,
               initial_infeed_sleep_secs=None):
@@ -81,6 +99,32 @@ class TPUConfig(
     # Check num_shards.
     util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
 
+    # Check computation_shape
+    if computation_shape is not None and len(computation_shape) != 3:
+      raise ValueError(
+          'computation_shape must be a list with length 3 or None; got {}'.
+          format(str(computation_shape)))
+
+    if computation_shape is not None:
+      computation_shape_array = np.asarray(computation_shape, dtype=np.int32)
+      # This prevents any computation being replicated across multiple hosts, so
+      # that each host feeds the same number of computations.
+      if any(computation_shape_array < 1) or any(computation_shape_array > 2):
+        raise ValueError('computation_shape elements can only be 1 or 2; got '
+                         'computation_shape={}'.format(computation_shape))
+      max_replicas_per_host = (
+          _NUM_CORES_PER_HOST // np.prod(computation_shape_array))
+      if num_shards > max_replicas_per_host and (
+          num_shards % max_replicas_per_host != 0):
+        raise ValueError(
+            '{0} shards can not be evenly distributed across'
+            ' multiple hosts. Each shard needs {1} cores and each'
+            ' host has {2} cores. Thus {0} shards needs {3} hosts.'
+            ' Please adjust num shards so that num_shards is'
+            ' divisible by {4} or <= {4}.'.format(
+                num_shards, np.prod(computation_shape), _NUM_CORES_PER_HOST,
+                num_shards / max_replicas_per_host, max_replicas_per_host))
+
     # Check initial_infeed_sleep_secs.
     if initial_infeed_sleep_secs:
       util_lib.check_positive_integer(initial_infeed_sleep_secs,
@@ -92,6 +136,7 @@ class TPUConfig(
         cls,
         iterations_per_loop=iterations_per_loop,
         num_shards=num_shards,
+        computation_shape=computation_shape,
         per_host_input_for_training=per_host_input_for_training,
         tpu_job_name=tpu_job_name,
         initial_infeed_sleep_secs=initial_infeed_sleep_secs)
@@ -112,8 +157,7 @@ class RunConfig(run_config_lib.RunConfig):
       evaluation_master: a string. The address of the master to use for eval.
         Defaults to master if not set.
       master: a string. The address of the master to use for training.
-      tf_random_seed: an int. Sets the TensorFlow random seed. Defaults to None,
-        which initializes it randomly based on the environment.
+      **kwargs: keyword config parameters.
     """
     super(RunConfig, self).__init__(**kwargs)
     self._tpu_config = tpu_config or TPUConfig()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
index 60884aa32f..2c9d7be232 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
@@ -43,6 +43,21 @@ class TPURunConfigTest(test.TestCase):
       tpu_config_lib.RunConfig(
           tpu_config=tpu_config_lib.TPUConfig(iterations_per_loop=0))
 
+  def test_fail_with_invalid_computation_shape(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'computation_shape must be a list with length'
+                                 ' 3 or None'):
+      tpu_config_lib.TPUConfig(computation_shape=[2, 1])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'computation_shape elements can only be'):
+      tpu_config_lib.TPUConfig(computation_shape=[1, 3, 1])
+
+  def test_fail_with_invalid_shards(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'shards can not be evenly distributed across'):
+      tpu_config_lib.TPUConfig(num_shards=6, computation_shape=[1, 1, 2])
+
 
 class TPURunConfigMasterTest(test.TestCase):
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 56793f11d9..a236c08991 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -37,10 +37,10 @@ from tensorflow.contrib.tpu.python.tpu import tpu_config
 from tensorflow.contrib.tpu.python.tpu import tpu_feed
 from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
-
+from tensorflow.contrib.tpu.python.tpu.device_assignment import device_assignment as tpu_device_assignment
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
-
+from tensorflow.python.client import session as tf_session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
@@ -71,6 +71,7 @@ _TPU_ESTIMATOR = 'tpu_estimator'
 _ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
 _BATCH_SIZE_KEY = 'batch_size'
 _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
+_PINGING_MASTER_TIMEOUT_IN_MS = 300 * 1000  # 5 minutes
 _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 
 
@@ -173,14 +174,15 @@ class _TPUContext(object):
   """
 
   def __init__(self, config, train_batch_size, eval_batch_size,
-               predict_batch_size, use_tpu):
+               predict_batch_size, use_tpu, device_assignment):
     self._config = config
     self._train_batch_size = train_batch_size
     self._eval_batch_size = eval_batch_size
     self._predict_batch_size = predict_batch_size
     self._use_tpu = use_tpu
-    self._num_shards_or_none = self._config.tpu_config.num_shards
     self._mode = None
+    self._device_assignment = device_assignment
+    self._max_cores_per_host = 8
 
   def _assert_mode(self):
     if self._mode is None:
@@ -191,7 +193,18 @@ class _TPUContext(object):
   @property
   def num_of_cores_per_host(self):
     num_cores = self.num_cores
-    return min(num_cores, 8)
+    return min(num_cores, self._max_cores_per_host)
+
+  @property
+  def num_of_shards_per_host(self):
+    if self._device_assignment:
+      maximum_shards_per_host = (
+          self._max_cores_per_host //
+          self._device_assignment.num_cores_per_replica)
+      return min(self.num_shards, maximum_shards_per_host)
+    else:
+      num_cores = self.num_cores
+      return min(num_cores, self._max_cores_per_host)
 
   @contextmanager
   def with_mode(self, mode):
@@ -206,7 +219,14 @@ class _TPUContext(object):
   @property
   def num_cores(self):
     # TODO(xiejw): Adds lazy num_shards initialization.
-    return self._num_shards_or_none
+    if self._device_assignment:
+      return self._device_assignment.num_cores_per_replica * self.num_shards
+    else:
+      return self.num_shards
+
+  @property
+  def num_shards(self):
+    return self._config.tpu_config.num_shards
 
   @property
   def num_hosts(self):
@@ -284,6 +304,8 @@ class _TPUContext(object):
 
     # On TPU
     if self.is_input_sharded_per_core():
+      # We prohibit per core input sharding for the model parallelism case,
+      # therefore it is safe to use num_cores here.
       return global_batch_size // self.num_cores
     else:
       return global_batch_size // self.num_hosts
@@ -296,8 +318,7 @@ class _TPUContext(object):
     if self.is_running_on_cpu():
       return global_batch_size
 
-    # On TPU. always sharded per core.
-    return global_batch_size // self.num_cores
+    return global_batch_size // self.num_shards
 
   @property
   def master_job(self):
@@ -368,11 +389,15 @@ class _TPUContext(object):
 
   @property
   def tpu_device_placement_function(self):
+    """Returns the TPU device place function."""
     master = self.master_job
     job_device = '' if master is None else ('/job:%s' % master)
 
     def _placement_function(i):
-      return '%s/task:%d/device:TPU:%d' % (job_device, i / 8, i % 8)
+      if self._device_assignment:
+        return self._device_assignment.tpu_device(replica=i, job=master)
+      else:
+        return '%s/task:%d/device:TPU:%d' % (job_device, i / 8, i % 8)
 
     return _placement_function
 
@@ -391,10 +416,17 @@ class _TPUContext(object):
       Returns:
         The ordinal of the TPU device the shard's infeed should be placed on.
       """
-      return index % 8
+      if self._device_assignment:
+        return self._device_assignment.tpu_ordinal(replica=index)
+      else:
+        return index % 8
 
     return _tpu_ordinal_function
 
+  @property
+  def device_assignment(self):
+    return self._device_assignment
+
 
 class _SIGNAL(object):
   """Signal used to control the thread of infeed/outfeed.
@@ -908,7 +940,7 @@ def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn,
 
 
 def generate_per_host_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, batch_axis, device):
+    ctx, input_fn, inputs_structure_recorder, batch_axis, device, host_id):
   """Generates infeed enqueue ops for per-host input_fn on a single host."""
   captured_infeed_queue = _CapturedObject()
 
@@ -929,9 +961,20 @@ def generate_per_host_enqueue_ops_fn_for_host(
     if is_dataset:
       hooks.append(inputs.dataset_initializer_hook())
 
+  def _tpu_ordinal_function_impl(shard_index_in_host):
+    # We put both enqueue/dequeue op at tpu.core(0) in each replica.
+    replica = ctx.device_assignment.lookup_replicas(
+        host_id, (0, 0, 0))[shard_index_in_host]
+    return ctx.device_assignment.tpu_ordinal(replica=replica)
+
+  if ctx.device_assignment:
+    tpu_ordinal_function = _tpu_ordinal_function_impl
+  else:
+    tpu_ordinal_function = None
+
   def enqueue_ops_fn():
     with ops.device(device):
-      num_cores_per_host = ctx.num_of_cores_per_host
+      num_of_shards_per_host = ctx.num_of_shards_per_host
       # Convert user input to features and labels.  If the user returns a
       # dataset, it is initialized and the features and labels extracted via
       # `dataset.iterator.get_next()`
@@ -949,11 +992,12 @@ def generate_per_host_enqueue_ops_fn_for_host(
           tuple_shapes=[t.shape for t in unsharded_tensor_list],
           shard_dimensions=batch_axis)
       captured_infeed_queue.capture(infeed_queue)
-      infeed_queue.set_number_of_shards(num_cores_per_host)
-
+      infeed_queue.set_number_of_shards(num_of_shards_per_host)
       per_host_enqueue_ops = (
           infeed_queue.split_inputs_and_generate_enqueue_ops(
-              unsharded_tensor_list, placement_function=lambda x: device))
+              unsharded_tensor_list,
+              placement_function=lambda x: device,
+              tpu_ordinal_function=tpu_ordinal_function))
       if signals is None:
         return per_host_enqueue_ops
       else:
@@ -1152,7 +1196,11 @@ class _InputPipeline(object):
 
     def dequeue_fn():
       """dequeue_fn is used by TPU to retrieve the tensors."""
-      values = self._infeed_queue.generate_dequeue_op()
+      # In the model-parallel case, both the host-side and device-side
+      # computations must agree on the core on which infeed takes place. We
+      # choose to perform infeed on logical core 0 of each replica.
+      with ops.device(tpu.core(0)):
+        values = self._infeed_queue.generate_dequeue_op()
       # The unflatten process uses the structure information recorded above.
       return self._inputs_structure_recorder.unflatten_features_and_labels(
           values)
@@ -1199,7 +1247,7 @@ class _InputPipeline(object):
             enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
                 generate_per_host_enqueue_ops_fn_for_host(
                     self._ctx, self._input_fn, self._inputs_structure_recorder,
-                    self._batch_axis, host_device))
+                    self._batch_axis, host_device, host_id))
             all_hooks.extend(hooks)
 
             # NOTE(xiejw): We dispatch here based on the return type of the
@@ -1574,7 +1622,9 @@ class _OutfeedHostCall(object):
     # TODO(jhseu): Consider deduping tensors.
     for name in self._names:
       tensors.extend(self._tensors[name])
-    return [tpu_ops.outfeed_enqueue_tuple(tensors)]
+
+    with ops.device(tpu.core(0)):
+      return [tpu_ops.outfeed_enqueue_tuple(tensors)]
 
   def create_tpu_hostcall(self):
     """Sends the tensors through outfeed and runs the host_fn on CPU.
@@ -1607,10 +1657,11 @@ class _OutfeedHostCall(object):
       for shape in self._tensor_shapes[name]:
         tensor_shapes.append(shape)
 
-    # Outfeed ops execute on each JF node. Note: we must constraint it such that
-    # we have at most one outfeed dequeue and enqueue.
+    # Outfeed ops execute on each replica's first logical core. Note: we must
+    # constraint it such that we have at most one outfeed dequeue and enqueue
+    # per replica.
     tpu_device_placement_fn = self._ctx.tpu_device_placement_function
-    for i in xrange(self._ctx.num_cores):
+    for i in xrange(self._ctx.num_shards):
       with ops.device(tpu_device_placement_fn(i)):
         outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
             dtypes=tensor_dtypes, shapes=tensor_shapes)
@@ -1911,6 +1962,11 @@ class TPUEstimator(estimator_lib.Estimator):
             'train batch size {} must be divisible by number of shards {}'
             .format(train_batch_size, config.tpu_config.num_shards))
 
+      if (not config.tpu_config.per_host_input_for_training and
+          config.tpu_config.computation_shape):
+        raise ValueError(
+            'Model parallelism only supports per host input for training.')
+
       if eval_batch_size is not None:
         if not isinstance(eval_batch_size, int):
           raise ValueError('`eval_batch_size` must be an int')
@@ -1948,9 +2004,35 @@ class TPUEstimator(estimator_lib.Estimator):
     self._iterations_per_training_loop = (
         self._config.tpu_config.iterations_per_loop)
 
+    if use_tpu and self._config.tpu_config.computation_shape:
+      try:
+        with tf_session.Session(
+            self._config.master,
+            config=config_pb2.ConfigProto(
+                operation_timeout_in_ms=_PINGING_MASTER_TIMEOUT_IN_MS)) as sess:
+          logging.info('Initializing TPU system to fetch topology for model '
+                       'parallelism.')
+          topology = sess.run(tpu.initialize_system())
+          device_assignment = tpu_device_assignment(
+              topology,
+              computation_shape=self._config.tpu_config.computation_shape,
+              num_replicas=self._config.tpu_config.num_shards)
+          logging.info('computation_shape: %s',
+                       str(self._config.tpu_config.computation_shape))
+          logging.info('num_replicas: %d', self._config.tpu_config.num_shards)
+          logging.info('device_assignment.topology.device_coordinates: %s',
+                       str(device_assignment.topology.device_coordinates))
+          logging.info('device_assignment.core_assignment: %s',
+                       str(device_assignment.core_assignment))
+      except errors.DeadlineExceededError:
+        raise ValueError(
+            'Fail to connect master (%s). Please double check %s is '
+            'correct.' % (self._config.master, self._config.master))
+    else:
+      device_assignment = None
     # All properties passed to _TPUContext are immutable.
     self._ctx = _TPUContext(self._config, train_batch_size, eval_batch_size,
-                            predict_batch_size, use_tpu)
+                            predict_batch_size, use_tpu, device_assignment)
 
   def _create_global_step(self, graph):
     """Creates a global step suitable for TPUs.
@@ -1998,6 +2080,13 @@ class TPUEstimator(estimator_lib.Estimator):
 
     util_lib.check_positive_integer(steps, 'Eval steps')
 
+    # TODO(ylc): Support evaluating with model parallelism in different cluster.
+    if ctx.device_assignment and (self._config.evaluation_master !=
+                                  self._config.master):
+      raise ValueError(
+          'In the model-parallel case, both training and evaluation must run '
+          'in the same cluster.')
+
     if self._config.tpu_config.num_shards > 8:
       raise NotImplementedError(
           'TPU evaluation is only supported with one host.')
@@ -2245,7 +2334,6 @@ class TPUEstimator(estimator_lib.Estimator):
 
 def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  num_cores = ctx.num_cores
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
   single_tpu_eval_step, host_calls, captured_scaffold_fn = (
@@ -2260,8 +2348,9 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   (loss,) = tpu.shard(
       multi_tpu_eval_steps_on_single_shard,
       inputs=[],
-      num_shards=num_cores,
-      outputs_from_all_shards=False)
+      num_shards=ctx.num_shards,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
 
   scaffold = _get_scaffold(captured_scaffold_fn)
   return loss, host_calls, scaffold
@@ -2269,7 +2358,6 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
 
 def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  num_cores = ctx.num_cores
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
   single_tpu_train_step, host_call, captured_scaffold_fn = (
@@ -2284,8 +2372,9 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   (loss,) = tpu.shard(
       multi_tpu_train_steps_on_single_shard,
       inputs=[],
-      num_shards=num_cores,
-      outputs_from_all_shards=False)
+      num_shards=ctx.num_shards,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
 
   scaffold = _get_scaffold(captured_scaffold_fn)
   return loss, host_call, scaffold
-- 
GitLab


From 8ca4366f16650bc694132e15b7fd866191db3c77 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Tue, 6 Feb 2018 11:00:58 -0800
Subject: [PATCH 713/966] Enable layout optimizer by default

PiperOrigin-RevId: 184707084
---
 tensorflow/core/grappler/optimizers/meta_optimizer.cc | 4 ++--
 tensorflow/core/protobuf/rewriter_config.proto        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 4228e7baba..6d93f74186 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -97,7 +97,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
           new DependencyOptimizer(cfg_.dependency_optimization())));
     }
-    if (cfg_.layout_optimizer() == RewriterConfig::ON) {
+    if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
       optimizers.push_back(
           std::unique_ptr<GraphOptimizer>(new LayoutOptimizer()));
     }
@@ -201,7 +201,7 @@ void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
   return !cfg.disable_model_pruning() ||
-         cfg.layout_optimizer() == RewriterConfig::ON ||
+         cfg.layout_optimizer() != RewriterConfig::OFF ||
          cfg.constant_folding() != RewriterConfig::OFF ||
          cfg.dependency_optimization() != RewriterConfig::OFF ||
          cfg.arithmetic_optimization() != RewriterConfig::OFF ||
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index d3c3d432a3..dddadceeb5 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -29,7 +29,7 @@ message RewriterConfig {
     AGGRESSIVE = 3;
   }
 
-  // Optimize tensor layouts
+  // Optimize tensor layouts (default is ON)
   Toggle layout_optimizer = 1;
   // Fold constants (default is ON)
   Toggle constant_folding = 3;
-- 
GitLab


From 5fb3c20b2f864bf88540fe5aaef3df4e70c2a6ad Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 6 Feb 2018 11:07:04 -0800
Subject: [PATCH 714/966] Adding tf_export decorators/calls to TensorFlow
 functions and constants.

PiperOrigin-RevId: 184708277
---
 tensorflow/python/ops/linalg/linalg.py        |  1 +
 tensorflow/python/ops/linalg/linalg_impl.py   |  3 ++
 .../python/ops/linalg/linear_operator.py      |  2 ++
 .../ops/linalg/linear_operator_composition.py |  2 ++
 .../python/ops/linalg/linear_operator_diag.py |  2 ++
 .../ops/linalg/linear_operator_full_matrix.py |  2 ++
 .../ops/linalg/linear_operator_identity.py    |  3 ++
 .../linalg/linear_operator_low_rank_update.py |  2 ++
 .../linear_operator_lower_triangular.py       |  2 ++
 tensorflow/python/saved_model/builder_impl.py |  2 ++
 tensorflow/python/saved_model/constants.py    | 19 +++++++++++++
 tensorflow/python/saved_model/loader_impl.py  |  3 ++
 tensorflow/python/saved_model/main_op_impl.py |  3 ++
 .../python/saved_model/signature_constants.py | 28 +++++++++++++++++++
 .../saved_model/signature_def_utils_impl.py   |  6 ++++
 tensorflow/python/saved_model/simple_save.py  |  2 ++
 .../python/saved_model/tag_constants.py       |  7 +++++
 tensorflow/python/saved_model/utils_impl.py   |  3 ++
 tensorflow/tools/api/generator/BUILD          |  9 ++++++
 19 files changed, 101 insertions(+)

diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index 5369007a56..14319025ff 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -41,4 +41,5 @@ del gen_linalg_ops
 del linalg_ops
 del math_ops
 del special_math_ops
+del tf_export
 # pylint: enable=undefined-variable
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index a5096ffdd9..d5bd916f80 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -24,6 +24,7 @@ from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import special_math_ops
+from tensorflow.python.util.tf_export import tf_export
 
 # Linear algebra ops.
 band_part = array_ops.matrix_band_part
@@ -54,6 +55,7 @@ transpose = array_ops.matrix_transpose
 triangular_solve = linalg_ops.matrix_triangular_solve
 
 
+@tf_export('linalg.logdet')
 def logdet(matrix, name=None):
   """Computes log of the determinant of a hermitian positive definite matrix.
 
@@ -86,6 +88,7 @@ def logdet(matrix, name=None):
         reduction_indices=[-1])
 
 
+@tf_export('linalg.adjoint')
 def adjoint(matrix, name=None):
   """Transposes the last two dimensions of and conjugates tensor `matrix`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 27e0f17020..2c24a29567 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -32,11 +32,13 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperator"]
 
 
 # TODO(langmore) Use matrix_solve_ls for singular or non-square matrices.
+@tf_export("linalg.LinearOperator")
 class LinearOperator(object):
   """Base class defining a [batch of] linear operator[s].
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_composition.py b/tensorflow/python/ops/linalg/linear_operator_composition.py
index 14411291d4..ecd30e4d7e 100644
--- a/tensorflow/python/ops/linalg/linear_operator_composition.py
+++ b/tensorflow/python/ops/linalg/linear_operator_composition.py
@@ -25,10 +25,12 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperatorComposition"]
 
 
+@tf_export("linalg.LinearOperatorComposition")
 class LinearOperatorComposition(linear_operator.LinearOperator):
   """Composes one or more `LinearOperators`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index 2217bfd545..b3ec3d5b7c 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -26,10 +26,12 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperatorDiag",]
 
 
+@tf_export("linalg.LinearOperatorDiag")
 class LinearOperatorDiag(linear_operator.LinearOperator):
   """`LinearOperator` acting like a [batch] square diagonal matrix.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
index 8fb59ca1a7..f979fb37d6 100644
--- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
+++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
@@ -23,10 +23,12 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperatorFullMatrix"]
 
 
+@tf_export("linalg.LinearOperatorFullMatrix")
 class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   """`LinearOperator` that wraps a [batch] matrix.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py
index 740c6c811f..50f3d407e8 100644
--- a/tensorflow/python/ops/linalg/linear_operator_identity.py
+++ b/tensorflow/python/ops/linalg/linear_operator_identity.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
     "LinearOperatorIdentity",
@@ -97,6 +98,7 @@ class BaseLinearOperatorIdentity(linear_operator.LinearOperator):
     return array_ops.ones(shape=d_shape, dtype=self.dtype)
 
 
+@tf_export("linalg.LinearOperatorIdentity")
 class LinearOperatorIdentity(BaseLinearOperatorIdentity):
   """`LinearOperator` acting like a [batch] square identity matrix.
 
@@ -460,6 +462,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
                        "%s" % self._batch_shape_static)
 
 
+@tf_export("linalg.LinearOperatorScaledIdentity")
 class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
   """`LinearOperator` acting like a scaled [batch] identity matrix `A = c I`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
index 36eed89db6..be91102909 100644
--- a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
+++ b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
@@ -27,12 +27,14 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_diag
 from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
     "LinearOperatorLowRankUpdate",
 ]
 
 
+@tf_export("linalg.LinearOperatorLowRankUpdate")
 class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
   """Perturb a `LinearOperator` with a rank `K` update.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index 6419030755..a5130188b6 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -26,12 +26,14 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
     "LinearOperatorLowerTriangular",
 ]
 
 
+@tf_export("linalg.LinearOperatorLowerTriangular")
 class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
   """`LinearOperator` acting like a [batch] square lower triangular matrix.
 
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 62ee53b816..7347da7536 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -34,8 +34,10 @@ from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import constants
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("saved_model.builder.SavedModelBuilder")
 class SavedModelBuilder(object):
   """Builds the `SavedModel` protocol buffer and saves variables and assets.
 
diff --git a/tensorflow/python/saved_model/constants.py b/tensorflow/python/saved_model/constants.py
index 7e3e8df47f..ec49a0539f 100644
--- a/tensorflow/python/saved_model/constants.py
+++ b/tensorflow/python/saved_model/constants.py
@@ -20,33 +20,52 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 # Subdirectory name containing the asset files.
 ASSETS_DIRECTORY = "assets"
+tf_export("saved_model.constants.ASSETS_DIRECTORY").export_constant(
+    __name__, "ASSETS_DIRECTORY")
 
 # CollectionDef key containing SavedModel assets.
 ASSETS_KEY = "saved_model_assets"
+tf_export("saved_model.constants.ASSETS_KEY").export_constant(
+    __name__, "ASSETS_KEY")
 
 # CollectionDef key for the legacy init op.
 LEGACY_INIT_OP_KEY = "legacy_init_op"
+tf_export("saved_model.constants.LEGACY_INIT_OP_KEY").export_constant(
+    __name__, "LEGACY_INIT_OP_KEY")
 
 # CollectionDef key for the SavedModel main op.
 MAIN_OP_KEY = "saved_model_main_op"
+tf_export("saved_model.constants.MAIN_OP_KEY").export_constant(
+    __name__, "MAIN_OP_KEY")
 
 # Schema version for SavedModel.
 SAVED_MODEL_SCHEMA_VERSION = 1
+tf_export("saved_model.constants.SAVED_MODEL_SCHEMA_VERSION").export_constant(
+    __name__, "SAVED_MODEL_SCHEMA_VERSION")
 
 # File name for SavedModel protocol buffer.
 SAVED_MODEL_FILENAME_PB = "saved_model.pb"
+tf_export("saved_model.constants.SAVED_MODEL_FILENAME_PB").export_constant(
+    __name__, "SAVED_MODEL_FILENAME_PB")
 
 # File name for text version of SavedModel protocol buffer.
 SAVED_MODEL_FILENAME_PBTXT = "saved_model.pbtxt"
+tf_export("saved_model.constants.SAVED_MODEL_FILENAME_PBTXT").export_constant(
+    __name__, "SAVED_MODEL_FILENAME_PBTXT")
 
 # Subdirectory name containing the variables/checkpoint files.
 VARIABLES_DIRECTORY = "variables"
+tf_export("saved_model.constants.VARIABLES_DIRECTORY").export_constant(
+    __name__, "VARIABLES_DIRECTORY")
 
 # File name used for variables.
 VARIABLES_FILENAME = "variables"
+tf_export("saved_model.constants.VARIABLES_FILENAME").export_constant(
+    __name__, "VARIABLES_FILENAME")
 
 
 _allowed_symbols = [
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index 5ff954fd9f..ddfd6be6da 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import constants
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _parse_saved_model(export_dir):
@@ -156,6 +157,7 @@ def _get_legacy_init_op_tensor(meta_graph_def_to_load):
   return legacy_init_op_tensor
 
 
+@tf_export("saved_model.loader.maybe_saved_model_directory")
 def maybe_saved_model_directory(export_dir):
   """Checks whether the provided export directory could contain a SavedModel.
 
@@ -176,6 +178,7 @@ def maybe_saved_model_directory(export_dir):
   return file_io.file_exists(txt_path) or file_io.file_exists(pb_path)
 
 
+@tf_export("saved_model.loader.load")
 def load(sess, tags, export_dir, **saver_kwargs):
   """Loads the model from a SavedModel as specified by tags.
 
diff --git a/tensorflow/python/saved_model/main_op_impl.py b/tensorflow/python/saved_model/main_op_impl.py
index 355fd57bf1..631ee63729 100644
--- a/tensorflow/python/saved_model/main_op_impl.py
+++ b/tensorflow/python/saved_model/main_op_impl.py
@@ -22,8 +22,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('saved_model.main_op.main_op')
 def main_op():
   """Returns a main op to init variables and tables.
 
@@ -40,6 +42,7 @@ def main_op():
 
 
 # TODO(sukritiramesh): Integrate with Saver for complete restore functionality.
+@tf_export('saved_model.main_op.main_op_with_restore')
 def main_op_with_restore(restore_op_name):
   """Returns a main op to init variables, tables and restore the graph.
 
diff --git a/tensorflow/python/saved_model/signature_constants.py b/tensorflow/python/saved_model/signature_constants.py
index 935a124645..6461fe8a7e 100644
--- a/tensorflow/python/saved_model/signature_constants.py
+++ b/tensorflow/python/saved_model/signature_constants.py
@@ -20,51 +20,79 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Key in the signature def map for `default` serving signatures. The default
 # signature is used in inference requests where a specific signature was not
 # specified.
 DEFAULT_SERVING_SIGNATURE_DEF_KEY = "serving_default"
+tf_export("saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY"
+         ).export_constant(__name__, "DEFAULT_SERVING_SIGNATURE_DEF_KEY")
 
 ################################################################################
 # Classification API constants.
 
 # Classification inputs.
 CLASSIFY_INPUTS = "inputs"
+tf_export("saved_model.signature_constants.CLASSIFY_INPUTS").export_constant(
+    __name__, "CLASSIFY_INPUTS")
 
 # Classification method name used in a SignatureDef.
 CLASSIFY_METHOD_NAME = "tensorflow/serving/classify"
+tf_export(
+    "saved_model.signature_constants.CLASSIFY_METHOD_NAME").export_constant(
+        __name__, "CLASSIFY_METHOD_NAME")
 
 # Classification classes output.
 CLASSIFY_OUTPUT_CLASSES = "classes"
+tf_export(
+    "saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES").export_constant(
+        __name__, "CLASSIFY_OUTPUT_CLASSES")
 
 # Classification scores output.
 CLASSIFY_OUTPUT_SCORES = "scores"
+tf_export(
+    "saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES").export_constant(
+        __name__, "CLASSIFY_OUTPUT_SCORES")
 
 ################################################################################
 # Prediction API constants.
 
 # Predict inputs.
 PREDICT_INPUTS = "inputs"
+tf_export("saved_model.signature_constants.PREDICT_INPUTS").export_constant(
+    __name__, "PREDICT_INPUTS")
 
 # Prediction method name used in a SignatureDef.
 PREDICT_METHOD_NAME = "tensorflow/serving/predict"
+tf_export(
+    "saved_model.signature_constants.PREDICT_METHOD_NAME").export_constant(
+        __name__, "PREDICT_METHOD_NAME")
 
 # Predict outputs.
 PREDICT_OUTPUTS = "outputs"
+tf_export("saved_model.signature_constants.PREDICT_OUTPUTS").export_constant(
+    __name__, "PREDICT_OUTPUTS")
 
 ################################################################################
 # Regression API constants.
 
 # Regression inputs.
 REGRESS_INPUTS = "inputs"
+tf_export("saved_model.signature_constants.REGRESS_INPUTS").export_constant(
+    __name__, "REGRESS_INPUTS")
 
 # Regression method name used in a SignatureDef.
 REGRESS_METHOD_NAME = "tensorflow/serving/regress"
+tf_export(
+    "saved_model.signature_constants.REGRESS_METHOD_NAME").export_constant(
+        __name__, "REGRESS_METHOD_NAME")
 
 # Regression outputs.
 REGRESS_OUTPUTS = "outputs"
+tf_export("saved_model.signature_constants.REGRESS_OUTPUTS").export_constant(
+    __name__, "REGRESS_OUTPUTS")
 
 ################################################################################
 
diff --git a/tensorflow/python/saved_model/signature_def_utils_impl.py b/tensorflow/python/saved_model/signature_def_utils_impl.py
index 240ea61aa5..d033159188 100644
--- a/tensorflow/python/saved_model/signature_def_utils_impl.py
+++ b/tensorflow/python/saved_model/signature_def_utils_impl.py
@@ -26,8 +26,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import utils
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('saved_model.signature_def_utils.build_signature_def')
 def build_signature_def(inputs=None, outputs=None, method_name=None):
   """Utility function to build a SignatureDef protocol buffer.
 
@@ -53,6 +55,7 @@ def build_signature_def(inputs=None, outputs=None, method_name=None):
   return signature_def
 
 
+@tf_export('saved_model.signature_def_utils.regression_signature_def')
 def regression_signature_def(examples, predictions):
   """Creates regression signature from given examples and predictions.
 
@@ -94,6 +97,7 @@ def regression_signature_def(examples, predictions):
   return signature_def
 
 
+@tf_export('saved_model.signature_def_utils.classification_signature_def')
 def classification_signature_def(examples, classes, scores):
   """Creates classification signature from given examples and predictions.
 
@@ -146,6 +150,7 @@ def classification_signature_def(examples, classes, scores):
   return signature_def
 
 
+@tf_export('saved_model.signature_def_utils.predict_signature_def')
 def predict_signature_def(inputs, outputs):
   """Creates prediction signature from given inputs and outputs.
 
@@ -180,6 +185,7 @@ def predict_signature_def(inputs, outputs):
   return signature_def
 
 
+@tf_export('saved_model.signature_def_utils.is_valid_signature')
 def is_valid_signature(signature_def):
   """Determine whether a SignatureDef can be served by TensorFlow Serving."""
   if signature_def is None:
diff --git a/tensorflow/python/saved_model/simple_save.py b/tensorflow/python/saved_model/simple_save.py
index 1e4cc73370..042b8fa8e2 100644
--- a/tensorflow/python/saved_model/simple_save.py
+++ b/tensorflow/python/saved_model/simple_save.py
@@ -23,8 +23,10 @@ from tensorflow.python.saved_model import builder
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('saved_model.simple_save')
 def simple_save(session, export_dir, inputs, outputs, legacy_init_op=None):
   """Convenience function to build a SavedModel suitable for serving.
 
diff --git a/tensorflow/python/saved_model/tag_constants.py b/tensorflow/python/saved_model/tag_constants.py
index e2facafda5..d164e2c23f 100644
--- a/tensorflow/python/saved_model/tag_constants.py
+++ b/tensorflow/python/saved_model/tag_constants.py
@@ -20,19 +20,26 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Tag for the `serving` graph.
 SERVING = "serve"
+tf_export("saved_model.tag_constants.SERVING").export_constant(
+    __name__, "SERVING")
 
 # Tag for the `training` graph.
 TRAINING = "train"
+tf_export("saved_model.tag_constants.TRAINING").export_constant(
+    __name__, "TRAINING")
 
 # Tag for the `gpu` graph.
 GPU = "gpu"
+tf_export("saved_model.tag_constants.GPU").export_constant(__name__, "GPU")
 
 # Tag for the `tpu` graph.
 TPU = "tpu"
+tf_export("saved_model.tag_constants.TPU").export_constant(__name__, "TPU")
 
 _allowed_symbols = [
     "SERVING",
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index 73ca8c9c1c..cddce29a08 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -22,11 +22,13 @@ from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TensorInfo helpers.
 
 
+@tf_export("saved_model.utils.build_tensor_info")
 def build_tensor_info(tensor):
   """Utility function to build TensorInfo proto.
 
@@ -50,6 +52,7 @@ def build_tensor_info(tensor):
   return tensor_info
 
 
+@tf_export("saved_model.utils.get_tensor_from_tensor_info")
 def get_tensor_from_tensor_info(tensor_info, graph=None, import_scope=None):
   """Returns the Tensor or SparseTensor described by a TensorInfo proto.
 
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 66bbd572a6..dab1b4b8f4 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -87,6 +87,15 @@ genrule(
         "api/losses/__init__.py",
         "api/profiler/__init__.py",
         "api/python_io/__init__.py",
+        "api/saved_model/__init__.py",
+        "api/saved_model/builder/__init__.py",
+        "api/saved_model/constants/__init__.py",
+        "api/saved_model/loader/__init__.py",
+        "api/saved_model/main_op/__init__.py",
+        "api/saved_model/signature_constants/__init__.py",
+        "api/saved_model/signature_def_utils/__init__.py",
+        "api/saved_model/tag_constants/__init__.py",
+        "api/saved_model/utils/__init__.py",
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
-- 
GitLab


From 893a4a51732932ec05ad933463f46974721389d3 Mon Sep 17 00:00:00 2001
From: Mingsheng Hong <hongm@google.com>
Date: Tue, 6 Feb 2018 11:51:18 -0800
Subject: [PATCH 715/966] Internal change

PiperOrigin-RevId: 184715822
---
 tensorflow/c/eager/BUILD    | 14 ++++++++++++--
 tensorflow/c/eager/c_api.cc | 14 +++++++++++++-
 tensorflow/tensorflow.bzl   | 10 ++++++++++
 3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index e62310d811..3505f70dc1 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -6,6 +6,7 @@ load(
     "tf_cuda_cc_test",
     "tf_cc_test",
     "tf_copts",
+    "tfe_xla_copts",
     "tf_cuda_library",
 )
 
@@ -16,7 +17,7 @@ tf_cuda_library(
         "c_api_internal.h",
     ],
     hdrs = ["c_api.h"],
-    copts = tf_copts(),
+    copts = tf_copts() + tfe_xla_copts(),
     visibility = ["//visibility:public"],
     deps = select({
         "//tensorflow:android": [
@@ -33,7 +34,15 @@ tf_cuda_library(
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
         ],
-    }) + ["//tensorflow/core:gpu_runtime"],
+    }) + select({
+        "//tensorflow:with_xla_support": [
+            "//tensorflow/compiler/tf2xla:xla_compiler",
+            "//tensorflow/compiler/jit",
+        ],
+        "//conditions:default": [],
+    }) + [
+        "//tensorflow/core:gpu_runtime",
+    ],
 )
 
 tf_cuda_library(
@@ -56,6 +65,7 @@ tf_cuda_library(
 tf_cuda_cc_test(
     name = "c_api_test",
     srcs = ["c_api_test.cc"],
+    extra_copts = tfe_xla_copts(),
     tags = [
         "guitar",
         "multi_gpu",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index d65b592895..3a6d2ce45b 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -25,6 +25,9 @@ limitations under the License.
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/eager/runtime.h"
+#ifdef TENSORFLOW_EAGER_USE_XLA
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#endif  // TENSORFLOW_EAGER_USE_XLA
 #include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -48,6 +51,12 @@ bool IsCPU(tensorflow::Device* d) {
   return d == nullptr || d->tensorflow_gpu_device_info() == nullptr;
 }
 
+bool IsXLA(tensorflow::Device* d) {
+  if (d == nullptr) return false;
+  const auto& device_type = d->attributes().device_type();
+  return device_type.find("XLA") != std::string::npos;
+}
+
 string DeviceName(tensorflow::Device* d) {
   return (d == nullptr) ? "cpu:0" : d->name();
 }
@@ -183,7 +192,10 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
       (srcd == dstd) || (DeviceName(srcd) == DeviceName(dstd));
   const bool dst_cpu = IsCPU(dstd);
   const bool src_cpu = IsCPU(srcd);
-  if (is_same_device) {
+  // both_on_cpu can be true and yet is_same_device is false, if one of src/dst
+  // has device type XLA_CPU, and the other CPU.
+  const bool both_on_cpu = src_cpu && dst_cpu;
+  if (is_same_device || both_on_cpu) {
     return new TFE_TensorHandle(h->t, dst_cpu ? nullptr : dstd);
   }
   tensorflow::Tensor* src = &(h->t);
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 7fe9c98726..bf4a9fe6ce 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -219,6 +219,13 @@ def tf_copts(android_optimization_level_override="-O2", is_external=False):
             "//conditions:default": ["-pthread"]
       }))
 
+
+def tfe_xla_copts():
+  return select({
+      "//tensorflow:with_xla_support": ["-DTENSORFLOW_EAGER_USE_XLA"],
+      "//conditions:default": [],
+  })
+
 def tf_opts_nortti_if_android():
   return if_android([
       "-fno-rtti",
@@ -666,6 +673,7 @@ def tf_cuda_cc_test(name,
                     tags=[],
                     data=[],
                     size="medium",
+                    extra_copts=[],
                     linkstatic=0,
                     args=[],
                     linkopts=[]):
@@ -676,6 +684,7 @@ def tf_cuda_cc_test(name,
       tags=tags + ["manual"],
       data=data,
       size=size,
+      extra_copts=extra_copts,
       linkstatic=linkstatic,
       linkopts=linkopts,
       args=args)
@@ -696,6 +705,7 @@ def tf_cuda_cc_test(name,
       tags=tags + tf_cuda_tests_tags(),
       data=data,
       size=size,
+      extra_copts=extra_copts,
       linkopts=linkopts,
       args=args)
 
-- 
GitLab


From 1af9e837e04ca5eaf46be5b32a6fe23bbc856923 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Tue, 6 Feb 2018 11:51:47 -0800
Subject: [PATCH 716/966] Contrib estimator should handle ResourceVariables
 properly.

PiperOrigin-RevId: 184715896
---
 tensorflow/contrib/learn/python/learn/estimators/estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 63d0f1e1d4..4b63e08ab3 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -211,7 +211,7 @@ def _get_replica_device_setter(config):
       'Variable', 'VariableV2', 'AutoReloadVariable', 'MutableHashTable',
       'MutableHashTableV2', 'MutableHashTableOfTensors',
       'MutableHashTableOfTensorsV2', 'MutableDenseHashTable',
-      'MutableDenseHashTableV2'
+      'MutableDenseHashTableV2', 'VarHandleOp'
   ]
 
   if config.task_type:
-- 
GitLab


From 7b2e60382bad9cf73b9a68c99c0d32e28fe17485 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Tue, 6 Feb 2018 11:58:36 -0800
Subject: [PATCH 717/966] Support resource variables in moving averages.

PiperOrigin-RevId: 184716895
---
 tensorflow/python/training/moving_averages.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 2d89082ad7..b9ecb27df1 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -400,7 +400,9 @@ class ExponentialMovingAverage(object):
           avg = slot_creator.create_zeros_slot(
               var,
               self._name,
-              colocate_with_primary=(var.op.type in ["Variable", "VariableV2"]))
+              colocate_with_primary=(var.op.type in ["Variable",
+                                                     "VariableV2",
+                                                     "VarHandleOp"]))
           if self._zero_debias:
             zero_debias_true.add(avg)
       self._averages[var] = avg
-- 
GitLab


From 51d608259c50cb60633607d3edb7d5dc73aa4e34 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Tue, 6 Feb 2018 12:01:22 -0800
Subject: [PATCH 718/966] Estimator should handle ResourceVariables correctly.

PiperOrigin-RevId: 184717305
---
 tensorflow/python/estimator/estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 8d1f1afcff..6da890cd22 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -999,7 +999,7 @@ def _get_replica_device_setter(config):
       'Variable', 'VariableV2', 'AutoReloadVariable', 'MutableHashTable',
       'MutableHashTableV2', 'MutableHashTableOfTensors',
       'MutableHashTableOfTensorsV2', 'MutableDenseHashTable',
-      'MutableDenseHashTableV2'
+      'MutableDenseHashTableV2', 'VarHandleOp'
   ]
 
   if config.task_type:
-- 
GitLab


From d9df4313a98fdc62187a94c5ab6d8955b699e9f2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 6 Feb 2018 12:02:18 -0800
Subject: [PATCH 719/966] Improve side_effect_guards to (1) avoid aliasing
 non-tensors and (2) combine aliasing with re-indenting. Move the renaming
 visitor into a generic utility API. This loses potential efficiency by
 risking sequencing ops where there is no risk of a race. On the other hand
 it's still not entirely robust, and we need to raise an error where we can't
 guarantee that.

PiperOrigin-RevId: 184717456
---
 .../contrib/py2tf/converters/control_flow.py  |  33 +----
 .../py2tf/converters/converter_test_base.py   |  13 ++
 .../py2tf/converters/side_effect_guards.py    | 117 +++++++++++-----
 .../converters/side_effect_guards_test.py     | 131 ++++++++++++++++--
 tensorflow/contrib/py2tf/pyct/BUILD           |  12 +-
 tensorflow/contrib/py2tf/pyct/anno.py         |  13 ++
 tensorflow/contrib/py2tf/pyct/anno_test.py    |   5 +
 .../py2tf/pyct/{copier.py => ast_util.py}     |  28 ++++
 .../pyct/{copier_test.py => ast_util_test.py} |  34 ++++-
 tensorflow/contrib/py2tf/pyct/templates.py    |   4 +-
 tensorflow/contrib/py2tf/utils/BUILD          |  13 ++
 tensorflow/contrib/py2tf/utils/__init__.py    |   1 +
 tensorflow/contrib/py2tf/utils/misc.py        |  48 +++++++
 tensorflow/contrib/py2tf/utils/misc_test.py   |  64 +++++++++
 14 files changed, 429 insertions(+), 87 deletions(-)
 rename tensorflow/contrib/py2tf/pyct/{copier.py => ast_util.py} (73%)
 rename tensorflow/contrib/py2tf/pyct/{copier_test.py => ast_util_test.py} (57%)
 create mode 100644 tensorflow/contrib/py2tf/utils/misc.py
 create mode 100644 tensorflow/contrib/py2tf/utils/misc_test.py

diff --git a/tensorflow/contrib/py2tf/converters/control_flow.py b/tensorflow/contrib/py2tf/converters/control_flow.py
index a256c074a8..6a94258103 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow.py
+++ b/tensorflow/contrib/py2tf/converters/control_flow.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import ast_util
 from tensorflow.contrib.py2tf.pyct import templates
 from tensorflow.contrib.py2tf.pyct import transformer
 from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
@@ -42,25 +43,6 @@ class SymbolNamer(object):
     raise NotImplementedError()
 
 
-class SymbolRenamer(gast.NodeTransformer):
-  """Transformer that can rename symbols to a simple names."""
-
-  def __init__(self, name_map):
-    self.name_map = name_map
-
-  def _process(self, node):
-    qn = anno.getanno(node, anno.Basic.QN)
-    if qn in self.name_map:
-      return gast.Name(self.name_map[qn], node.ctx, None)
-    return node
-
-  def visit_Name(self, node):
-    return self._process(node)
-
-  def visit_Attribute(self, node):
-    return self._process(node)
-
-
 class ControlFlowTransformer(transformer.Base):
   """Transforms control flow structures like loops an conditionals."""
 
@@ -99,10 +81,8 @@ class ControlFlowTransformer(transformer.Base):
         self.context.namer.new_symbol(s.ssf(), all_referenced)
         for s in aliased_orig_names)
     alias_map = dict(zip(aliased_orig_names, aliased_new_names))
-    node_body = node.body
-    node_body = [SymbolRenamer(alias_map).visit(n) for n in node_body]
-    node_orelse = node.orelse
-    node_orelse = [SymbolRenamer(alias_map).visit(n) for n in node_orelse]
+    node_body = ast_util.rename_symbols(node.body, alias_map)
+    node_orelse = ast_util.rename_symbols(node.orelse, alias_map)
 
     if len(all_modified) == 1:
       results = all_modified[0]
@@ -179,11 +159,8 @@ class ControlFlowTransformer(transformer.Base):
     else:
       state_ast_tuple = gast.Tuple([n.ast() for n in state], None)
 
-    node_body = node.body
-    node_body = [SymbolRenamer(ssf_map).visit(n) for n in node_body]
-
-    test = node.test
-    test = SymbolRenamer(ssf_map).visit(test)
+    node_body = ast_util.rename_symbols(node.body, ssf_map)
+    test = ast_util.rename_symbols(node.test, ssf_map)
 
     template = """
       def test_name(state_ssf):
diff --git a/tensorflow/contrib/py2tf/converters/converter_test_base.py b/tensorflow/contrib/py2tf/converters/converter_test_base.py
index bcb96c81ae..5b23db33e1 100644
--- a/tensorflow/contrib/py2tf/converters/converter_test_base.py
+++ b/tensorflow/contrib/py2tf/converters/converter_test_base.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import imp
+
 from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct import qual_names
@@ -28,6 +30,17 @@ from tensorflow.python.platform import test
 
 
 class TestCase(test.TestCase):
+  """Base class for unit tests in this module. Contains relevant utilities."""
+
+  def make_fake_tf(self, *symbols):
+    fake_tf = imp.new_module('fake_tf')
+    for s in symbols:
+      setattr(fake_tf, s.__name__, s)
+    return fake_tf
+
+  def attach_namespace(self, module, **ns):
+    for k, v in ns.items():
+      setattr(module, k, v)
 
   def parse_and_analyze(self,
                         test_fn,
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
index c73388d06e..948cb96c3f 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
@@ -37,6 +37,8 @@ from __future__ import print_function
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import ast_util
+from tensorflow.contrib.py2tf.pyct import qual_names
 from tensorflow.contrib.py2tf.pyct import templates
 from tensorflow.contrib.py2tf.pyct import transformer
 from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
@@ -62,45 +64,56 @@ class SideEffectGuardTransformer(transformer.Base):
 
   def __init__(self, context):
     super(SideEffectGuardTransformer, self).__init__(context)
-    self.indent_next = False
-    self.next_indent_owner = None
 
   # pylint:disable=invalid-name
 
   def _visit_and_reindent(self, nodes):
     new_nodes = []
     current_dest = new_nodes
+    alias_map = {}
+    reindent_requested = False
     for n in nodes:
       n = self.visit(n)
+      # NOTE: the order in which these statements execute is important; in
+      # particular, watch out for ending up with cycles in the AST.
+      if alias_map:
+        n = ast_util.rename_symbols(n, alias_map)
       if isinstance(n, (list, tuple)):
         current_dest.extend(n)
       else:
         current_dest.append(n)
-      if self.indent_next:
-        assert self.next_indent_owner is not None
-        current_dest.append(self.next_indent_owner)
-        current_dest = self.next_indent_owner.body
-        self.next_indent_owner = None
-        self.indent_next = False
-    if not current_dest:
+      if anno.hasanno(n, anno.Basic.INDENT_BLOCK_REMAINDER):
+        reindent_requested = True
+        new_dest, new_alias_map = anno.getanno(
+            n, anno.Basic.INDENT_BLOCK_REMAINDER)
+        anno.delanno(n, anno.Basic.INDENT_BLOCK_REMAINDER)
+        new_alias_map.update(alias_map)
+        alias_map = new_alias_map
+        current_dest = new_dest
+    if reindent_requested and not current_dest:
       # TODO(mdan): There may still be something that could be done.
       raise ValueError('Unable to insert statement into the computation flow: '
-                       'it is not followed by any computation that can we can '
-                       'condition on the statement.')
+                       'it is not followed by any computation which '
+                       'the statement could gate.')
     return new_nodes
 
   def visit_FunctionDef(self, node):
     node.body = self._visit_and_reindent(node.body)
     return node
 
-  def _gate_symbols(self, guard_statement, guarded_args):
-    # TODO(mdan): This won't work for variables.
-    template = """
-      (args,) = (tf.identity(a) for a in (args,))
-    """
-    guards = templates.replace(template, args=tuple(guarded_args))
-    guard_statement.body.extend(guards)
-    return guard_statement
+  def visit_With(self, node):
+    node.body = self._visit_and_reindent(node.body)
+    return node
+
+  def visit_If(self, node):
+    node.body = self._visit_and_reindent(node.body)
+    node.orelse = self._visit_and_reindent(node.orelse)
+    return node
+
+  def visit_While(self, node):
+    node.body = self._visit_and_reindent(node.body)
+    node.orelse = self._visit_and_reindent(node.orelse)
+    return node
 
   def visit_Expr(self, node):
     self.generic_visit(node)
@@ -109,30 +122,62 @@ class SideEffectGuardTransformer(transformer.Base):
       #   opt.minimize(loss)
       # or:
       #   tf.py_func(...)
-      template = """
-        with py2tf_utils.control_dependency_on_returns(tf, call):
-          # TODO(mdan): Also insert ops to re-fetch if variables are involved?
-          pass  # Will be removed below.
-      """
-      # TODO(mdan): This is brittle. Reorganize the mechanism.
-      statements = templates.replace(template, call=node.value)
-      control_deps_guard = statements[-1]
-      control_deps_guard.body = []
 
       # First, attempt to gate future evaluation of args. If that's not
       # possible, gate all remaining statements (and that may fail too, see
       # _visit_and_reindent.
       args_scope = anno.getanno(node.value, NodeAnno.ARGS_SCOPE)
-      guarded_args = tuple(args_scope.used & (args_scope.parent.modified
-                                              | args_scope.parent.returned))
+      # NOTE: We can't guard object attributes because they may not be writable.
+      guarded_args = tuple(
+          s for s in args_scope.used if not s.is_composite())
+
+      # TODO(mdan): Include all arguments which depended on guarded_args too.
+      # For example, the following will still cause a race:
+      #   tf.assign(a, a + 1)
+      #   b = a + 1
+      #   tf.assign(a, a + 1)  # Control deps here should include `b`
+      #   c = b + 1
+      # Or maybe we should just raise an "unsafe assign" error?
+
       if guarded_args:
-        node = tuple(statements[:-1]) + (
-            self._gate_symbols(control_deps_guard, guarded_args),)
+        # The aliases may need new names to avoid incorrectly making them local.
+        # TODO(mdan): This is brutal. It will even rename modules - any fix?
+        need_alias = tuple(
+            s for s in guarded_args if s not in args_scope.parent.modified)
+        aliased_new_names = tuple(
+            qual_names.QN(
+                self.context.namer.new_symbol(
+                    s.ssf(), args_scope.parent.referenced)) for s in need_alias)
+        alias_map = dict(zip(need_alias, aliased_new_names))
+        if len(guarded_args) == 1:
+          s, = guarded_args
+          aliased_guarded_args = alias_map.get(s, s)
+        else:
+          aliased_guarded_args = gast.Tuple(
+              [alias_map.get(s, s).ast() for s in guarded_args], None)
+
+        template = """
+          with py2tf_utils.control_dependency_on_returns(tf, call):
+            aliased_guarded_args = py2tf_utils.alias_tensors(tf, guarded_args)
+        """
+        control_deps_guard = templates.replace(
+            template,
+            call=node.value,
+            aliased_guarded_args=aliased_guarded_args,
+            guarded_args=guarded_args)[-1]
       else:
-        node = tuple(statements[:-1])
-        # The mechanism will insert the guard statement later.
-        self.indent_next = True
-        self.next_indent_owner = control_deps_guard
+        alias_map = {}
+
+        template = """
+          with py2tf_utils.control_dependency_on_returns(tf, call):
+            pass
+        """
+        control_deps_guard = templates.replace(template, call=node.value)[-1]
+        control_deps_guard.body = []
+
+      node = control_deps_guard
+      anno.setanno(node, anno.Basic.INDENT_BLOCK_REMAINDER,
+                   (node.body, alias_map))
     return node
 
   # pylint:enable=invalid-name
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
index dea09ecc3f..409c8b02c5 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
@@ -22,8 +22,12 @@ from tensorflow.contrib.py2tf import utils
 from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.converters import side_effect_guards
 from tensorflow.contrib.py2tf.pyct import compiler
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -32,32 +36,135 @@ from tensorflow.python.platform import test
 class TestNamer(side_effect_guards.SymbolNamer):
 
   def new_symbol(self, name_root, _):
-    return name_root
+    return 'renamed_%s' % name_root
 
 
 class SideEffectGuardsTest(converter_test_base.TestCase):
 
-  def test_transform(self):
+  def _transform_and_compile(self, test_fn):
+    ns = {
+        'control_flow_ops': control_flow_ops,
+        'constant_op': constant_op,
+        'gen_math_ops': gen_math_ops,
+        'ops': ops,
+        'state_ops': state_ops,
+    }
+    node = self.parse_and_analyze(
+        test_fn, ns,
+        namer=TestNamer())
+    node = side_effect_guards.transform(node, self.ctx)
+    result = compiler.ast_to_object(node)
+    self.attach_namespace(result, **ns)
+    result.tf = self.make_fake_tf(array_ops.identity, control_flow_ops.Assert,
+                                  gen_math_ops.greater,
+                                  ops.control_dependencies, ops.Tensor)
+    result.py2tf_utils = utils
+    return result.test_fn, node
+
+  def test_side_effect_on_return_only_variable(self):
 
     def test_fn(a):
       state_ops.assign(a, a + 1)
       return a
 
-    node = self.parse_and_analyze(
-        test_fn, {'state_ops': state_ops}, namer=TestNamer())
-    node = side_effect_guards.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
-    setattr(result, 'state_ops', state_ops)
-    setattr(result, 'py2tf_utils', utils)
+    tf_test_fn, node = self._transform_and_compile(test_fn)
+
+    self.assertEqual(len(node.body[0].body), 1)
+    with self.test_session() as sess:
+      v = variables.Variable(2)
+      sess.run(v.initializer)
+      # NOTE: We don't expect the assignment to execute in this case, because
+      # variables cannot be reliably guarded.
+      self.assertEqual(2, sess.run(tf_test_fn(v)))
+
+  def test_side_effect_on_used_variable(self):
+
+    def test_fn(a):
+      state_ops.assign(a, a + 1)
+      return a + 1
+
+    tf_test_fn, node = self._transform_and_compile(test_fn)
+
+    self.assertEqual(len(node.body[0].body), 1)
+    with self.test_session() as sess:
+      v = variables.Variable(2)
+      sess.run(v.initializer)
+      # NOTE: Unlike test_side_effect_on_return_only_variable, the variable was
+      # used in the local scope and so we could catch the assign's side effect.
+      self.assertEqual(4, sess.run(tf_test_fn(v)))
+
+  def test_side_effect_on_tensor(self):
+
+    def test_fn(a):
+      control_flow_ops.Assert(gen_math_ops.greater(a, 0), ['expected in throw'])
+      return a
+
+    tf_test_fn, node = self._transform_and_compile(test_fn)
+
+    self.assertEqual(len(node.body[0].body), 1)
+    with self.test_session() as sess:
+      # NOTE: In this case we can also capture the side effect because the
+      # argument is a tensor ans we can wrap it inside an identity.
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   'expected in throw'):
+        sess.run(tf_test_fn(constant_op.constant(-1)))
+
+  def test_multiline_block(self):
+
+    def test_fn(a):
+      state_ops.assign(a, a + 1)
+      b = a + 1
+      state_ops.assign(a, b + 1)
+      c = b + 1
+      d = c + 1
+      return d
+
+    tf_test_fn, node = self._transform_and_compile(test_fn)
+
+    self.assertEqual(len(node.body[0].body), 1)
+    with self.test_session() as sess:
+      v = variables.Variable(2)
+      sess.run(v.initializer)
+      self.assertEqual(6, sess.run(tf_test_fn(v)))
+
+  def test_multiline_nested_block(self):
+
+    def test_fn(a):
+      with ops.name_scope('foo'):
+        state_ops.assign(a, a + 1)
+        b = a + 1
+        # state_ops.assign(a, b + 1)
+        c = b + 1
+        d = c + 1
+      return d
+
+    tf_test_fn, node = self._transform_and_compile(test_fn)
+
+    self.assertEqual(len(node.body[0].body[0].body), 1)
+    with self.test_session() as sess:
+      v = variables.Variable(2)
+      sess.run(v.initializer)
+      self.assertEqual(6, sess.run(tf_test_fn(v)))
+
+  def test_multiline_block_unsafe(self):
+
+    def test_fn(a):
+      state_ops.assign(a, a + 1)
+      b = a + 1
+      state_ops.assign(a, a + 1)
+      c = b + 1
+      d = c + 1
+      return d
 
-    # TODO(mdan): Configure the namespaces instead of doing these hacks.
-    ops.identity = array_ops.identity
-    setattr(result, 'tf', ops)
+    tf_test_fn, node = self._transform_and_compile(test_fn)
 
+    self.assertEqual(len(node.body[0].body), 1)
     with self.test_session() as sess:
       v = variables.Variable(2)
       sess.run(v.initializer)
-      self.assertEqual(3, sess.run(result.test_fn(v)))
+      # NOTE: This intentionally highlights the flakiness. The test should be
+      # tightened down once that is solved.
+      self.assertTrue(sess.run(tf_test_fn(v)) in (6, 7))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD
index 054eb17fb6..91054fe61d 100644
--- a/tensorflow/contrib/py2tf/pyct/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/BUILD
@@ -1,5 +1,7 @@
 licenses(["notice"])  # Apache 2.0
 
+exports_files(["LICENSE"])
+
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 filegroup(
@@ -19,9 +21,9 @@ py_library(
     srcs = [
         "__init__.py",
         "anno.py",
+        "ast_util.py",
         "compiler.py",
         "context.py",
-        "copier.py",
         "parser.py",
         "pretty_printer.py",
         "qual_names.py",
@@ -49,8 +51,8 @@ py_test(
 )
 
 py_test(
-    name = "compiler_test",
-    srcs = ["compiler_test.py"],
+    name = "ast_util_test",
+    srcs = ["ast_util_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
@@ -60,8 +62,8 @@ py_test(
 )
 
 py_test(
-    name = "copier_test",
-    srcs = ["copier_test.py"],
+    name = "compiler_test",
+    srcs = ["compiler_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":pyct",
diff --git a/tensorflow/contrib/py2tf/pyct/anno.py b/tensorflow/contrib/py2tf/pyct/anno.py
index c6d41f9e12..7a0528b6d0 100644
--- a/tensorflow/contrib/py2tf/pyct/anno.py
+++ b/tensorflow/contrib/py2tf/pyct/anno.py
@@ -39,6 +39,11 @@ class Basic(NoValue):
   QN = 'Qualified name, as it appeared in the code.'
   SKIP_PROCESSING = (
       'This node should be preserved as is and not processed any further.')
+  INDENT_BLOCK_REMAINDER = (
+      'When a node is annotated with this, the remainder of the block should '
+      'be indented below it. The annotation contains a tuple '
+      '(new_body, name_map), where `new_body` is the new indented block and '
+      '`name_map` allows renaming symbols.')
 
 
 def getanno(node, key, field_name='___pyct_anno'):
@@ -57,3 +62,11 @@ def setanno(node, key, value, field_name='___pyct_anno'):
   # So that the annotations survive gast_to_ast() and ast_to_gast()
   if field_name not in node._fields:
     node._fields += (field_name,)
+
+
+def delanno(node, key, field_name='___pyct_anno'):
+  annotations = getattr(node, field_name)
+  del annotations[key]
+  if not annotations:
+    delattr(node, field_name)
+    node._fields = tuple(f for f in node._fields if f != field_name)
diff --git a/tensorflow/contrib/py2tf/pyct/anno_test.py b/tensorflow/contrib/py2tf/pyct/anno_test.py
index 19e3b45762..ff40bfe1f5 100644
--- a/tensorflow/contrib/py2tf/pyct/anno_test.py
+++ b/tensorflow/contrib/py2tf/pyct/anno_test.py
@@ -37,6 +37,11 @@ class AnnoTest(test.TestCase):
     self.assertTrue(anno.hasanno(node, 'foo'))
     self.assertEqual(3, anno.getanno(node, 'foo'))
 
+    anno.delanno(node, 'foo')
+    self.assertFalse(anno.hasanno(node, 'foo'))
+    with self.assertRaises(AttributeError):
+      anno.getanno(node, 'foo')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/copier.py b/tensorflow/contrib/py2tf/pyct/ast_util.py
similarity index 73%
rename from tensorflow/contrib/py2tf/pyct/copier.py
rename to tensorflow/contrib/py2tf/pyct/ast_util.py
index 41598fdc99..f916775b9c 100644
--- a/tensorflow/contrib/py2tf/pyct/copier.py
+++ b/tensorflow/contrib/py2tf/pyct/ast_util.py
@@ -66,3 +66,31 @@ def copy_clean(node):
     return tuple(copier.visit(n) for n in node)
   else:
     return copier.visit(node)
+
+
+class SymbolRenamer(gast.NodeTransformer):
+  """Transformer that can rename symbols to a simple names."""
+
+  def __init__(self, name_map):
+    self.name_map = name_map
+
+  def _process(self, node):
+    qn = anno.getanno(node, anno.Basic.QN)
+    if qn in self.name_map:
+      return gast.Name(str(self.name_map[qn]), node.ctx, None)
+    return self.generic_visit(node)
+
+  def visit_Name(self, node):
+    return self._process(node)
+
+  def visit_Attribute(self, node):
+    return self._process(node)
+
+
+def rename_symbols(node, name_map):
+  renamer = SymbolRenamer(name_map)
+  if isinstance(node, list):
+    return [renamer.visit(n) for n in node]
+  elif isinstance(node, tuple):
+    return tuple(renamer.visit(n) for n in node)
+  return renamer.visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/copier_test.py b/tensorflow/contrib/py2tf/pyct/ast_util_test.py
similarity index 57%
rename from tensorflow/contrib/py2tf/pyct/copier_test.py
rename to tensorflow/contrib/py2tf/pyct/ast_util_test.py
index a6b35eda14..e0b00c1781 100644
--- a/tensorflow/contrib/py2tf/pyct/copier_test.py
+++ b/tensorflow/contrib/py2tf/pyct/ast_util_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for copier module."""
+"""Tests for ast_util module."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,11 +20,37 @@ from __future__ import print_function
 
 import ast
 
-from tensorflow.contrib.py2tf.pyct import copier
+from tensorflow.contrib.py2tf.pyct import ast_util
+from tensorflow.contrib.py2tf.pyct import qual_names
 from tensorflow.python.platform import test
 
 
-class CopierTest(test.TestCase):
+class AstUtilTest(test.TestCase):
+
+  def test_rename_symbols(self):
+    node = ast.Tuple([
+        ast.Name('a', ast.Load()),
+        ast.Name('b', ast.Load()),
+        ast.Attribute(ast.Name('b', None), 'c', ast.Store()),
+        ast.Attribute(
+            ast.Attribute(ast.Name('b', None), 'c', ast.Load()), 'd',
+            None)
+    ], None)
+    node = qual_names.resolve(node)
+    node = ast_util.rename_symbols(
+        node,
+        {
+            qual_names.QN('a'): qual_names.QN('renamed_a'),
+            qual_names.QN('b.c'): qual_names.QN('renamed_b_c'),
+        })
+
+    self.assertEqual(node.elts[0].id, 'renamed_a')
+    self.assertTrue(isinstance(node.elts[0].ctx, ast.Load))
+    self.assertEqual(node.elts[1].id, 'b')
+    self.assertEqual(node.elts[2].id, 'renamed_b_c')
+    self.assertTrue(isinstance(node.elts[2].ctx, ast.Store))
+    self.assertEqual(node.elts[3].value.id, 'renamed_b_c')
+    self.assertTrue(isinstance(node.elts[3].value.ctx, ast.Load))
 
   def test_copy_clean(self):
     ret = ast.Return(
@@ -43,7 +69,7 @@ class CopierTest(test.TestCase):
         body=[ret],
         decorator_list=[],
         returns=None)
-    new_node = copier.copy_clean(node)
+    new_node = ast_util.copy_clean(node)
     self.assertFalse(node is new_node)
     self.assertFalse(ret is new_node.body[0])
     self.assertFalse(hasattr(new_node.body[0], '__foo'))
diff --git a/tensorflow/contrib/py2tf/pyct/templates.py b/tensorflow/contrib/py2tf/pyct/templates.py
index 1039fc8713..5fd5252619 100644
--- a/tensorflow/contrib/py2tf/pyct/templates.py
+++ b/tensorflow/contrib/py2tf/pyct/templates.py
@@ -26,7 +26,7 @@ import textwrap
 
 import gast
 
-from tensorflow.contrib.py2tf.pyct import copier
+from tensorflow.contrib.py2tf.pyct import ast_util
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct import qual_names
 
@@ -77,7 +77,7 @@ class ReplaceTransformer(gast.NodeTransformer):
     if node.id not in self.replacements:
       return node
 
-    new_nodes = copier.copy_clean(self.replacements[node.id])
+    new_nodes = ast_util.copy_clean(self.replacements[node.id])
     if isinstance(new_nodes, gast.AST):
       new_nodes = [new_nodes]
 
diff --git a/tensorflow/contrib/py2tf/utils/BUILD b/tensorflow/contrib/py2tf/utils/BUILD
index 01804aa883..502720047e 100644
--- a/tensorflow/contrib/py2tf/utils/BUILD
+++ b/tensorflow/contrib/py2tf/utils/BUILD
@@ -1,5 +1,7 @@
 licenses(["notice"])  # Apache 2.0
 
+exports_files(["LICENSE"])
+
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 filegroup(
@@ -19,6 +21,7 @@ py_library(
     srcs = [
         "__init__.py",
         "context_managers.py",
+        "misc.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
@@ -35,3 +38,13 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
+
+py_test(
+    name = "misc_test",
+    srcs = ["misc_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/utils/__init__.py b/tensorflow/contrib/py2tf/utils/__init__.py
index bca33e89e9..2ab0d7b9fd 100644
--- a/tensorflow/contrib/py2tf/utils/__init__.py
+++ b/tensorflow/contrib/py2tf/utils/__init__.py
@@ -19,3 +19,4 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.py2tf.utils.context_managers import control_dependency_on_returns
+from tensorflow.contrib.py2tf.utils.misc import alias_tensors
diff --git a/tensorflow/contrib/py2tf/utils/misc.py b/tensorflow/contrib/py2tf/utils/misc.py
new file mode 100644
index 0000000000..ee5cedd080
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/misc.py
@@ -0,0 +1,48 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Miscellaneous utilities that don't fit anywhere else."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+def alias_tensors(tf, *args):
+  """Wrap any Tensor arguments with an identity op.
+
+  Any other argument, including Variables, is returned unchanged.
+
+  Args:
+    tf: The TensorFlow module.
+    *args: Any arguments. Must contain at least one element.
+
+  Returns:
+    Same as *args, with Tensor instances replaced as described.
+
+  Raises:
+    ValueError: If args doesn't meet the requirements.
+  """
+
+  def alias_if_tensor(a):
+    return tf.identity(a) if isinstance(a, tf.Tensor) else a
+
+  # TODO(mdan): Recurse into containers?
+  # TODO(mdan): Anything we can do about variables? Fake a scope reuse?
+  if len(args) > 1:
+    return (alias_if_tensor(a) for a in args)
+  elif len(args) == 1:
+    return alias_if_tensor(args[0])
+
+  raise ValueError('at least one argument required')
diff --git a/tensorflow/contrib/py2tf/utils/misc_test.py b/tensorflow/contrib/py2tf/utils/misc_test.py
new file mode 100644
index 0000000000..5613cc052a
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/misc_test.py
@@ -0,0 +1,64 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for misc module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import imp
+
+from tensorflow.contrib.py2tf.utils import misc
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class ContextManagersTest(test.TestCase):
+
+  def test_alias_single_tensor(self):
+    a = constant_op.constant(1)
+    fake_tf = imp.new_module('fake_tf')
+    fake_tf.identity = array_ops.identity
+    fake_tf.Tensor = ops.Tensor
+
+    new_a = misc.alias_tensors(fake_tf, a)
+    self.assertFalse(new_a is a)
+    with self.test_session() as sess:
+      self.assertEqual(1, sess.run(new_a))
+
+  def test_alias_tensors(self):
+    a = constant_op.constant(1)
+    v = variables.Variable(2)
+    s = 'a'
+    l = [1, 2, 3]
+    fake_tf = imp.new_module('fake_tf')
+    fake_tf.identity = array_ops.identity
+    fake_tf.Tensor = ops.Tensor
+
+    new_a, new_v, new_s, new_l = misc.alias_tensors(fake_tf, a, v, s, l)
+
+    self.assertFalse(new_a is a)
+    self.assertTrue(new_v is v)
+    self.assertTrue(new_s is s)
+    self.assertTrue(new_l is l)
+    with self.test_session() as sess:
+      self.assertEqual(1, sess.run(new_a))
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From 87b29b7f3768328f02170aeb4338dd232be00248 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Tue, 6 Feb 2018 12:04:48 -0800
Subject: [PATCH 720/966] Second, cleaner, attempt at external control
 dependency handling.

PiperOrigin-RevId: 184718016
---
 tensorflow/python/framework/ops_test.py       |  1 -
 .../kernel_tests/control_flow_ops_py_test.py  | 30 +++++++++++++++++++
 tensorflow/python/ops/control_flow_ops.py     | 23 +++++++++-----
 3 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index c5e177d521..c6deafd89e 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -916,7 +916,6 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
 
     op = g.get_operation_by_name("myloop/myop")
     self.assertIsNotNone(op)
-    self.assertEqual(len(op.control_inputs), 1)
     # External control dep is removed and replaced with internal control dep
     self.assertNotEqual(op.control_inputs[0], c.op)
     self.assertIsNotNone(op.control_inputs[0]._get_control_flow_context())
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 4fafc36014..15ff0ec09b 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -704,6 +704,36 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
       self.assertEqual(10000, r.eval())
 
+  def testWhileExternalControlDependencies(self):
+    with self.test_session():
+      v = variables.Variable(0.0)
+      v.initializer.run()
+      increment = v.assign_add(1.0)
+
+      def body_fn(i):
+        with ops.control_dependencies([increment]):
+          return i + i
+
+      result = control_flow_ops.while_loop(cond=lambda i: i < 1,
+                                           body=body_fn, loop_vars=[1])
+      result.eval()
+      self.assertAllEqual(v.eval(), 1.0)
+
+  def testWhileExternalControlDependenciesNoInput(self):
+    with self.test_session():
+      v = variables.Variable(0.0)
+      v.initializer.run()
+      increment = v.assign_add(1.0)
+
+      def body_fn(unused_i):
+        with ops.control_dependencies([increment]):
+          return constant_op.constant(5, name="five")
+
+      result = control_flow_ops.while_loop(cond=lambda i: i < 5,
+                                           body=body_fn, loop_vars=[0])
+      result.eval()
+      self.assertAllEqual(v.eval(), 1.0)
+
   def testWhileWithRefs_1(self):
     with self.test_session() as sess:
       x = variables.Variable(0)._ref()  # pylint: disable=protected-access
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index e75eb0843f..3a5c2f210a 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1633,10 +1633,13 @@ class ControlFlowContext(object):
         ctxt = util.GetOutputContext(x)
         if ctxt is not None and ctxt.GetWhileContext() == while_ctxt:
           internal_control_inputs.append(x)
+    external_control_inputs = []
     if len(internal_control_inputs) != len(op.control_inputs):
+      external_control_inputs = list(set(op.control_inputs)
+                                     - set(internal_control_inputs))
       op._remove_all_control_inputs()
       op._add_control_inputs(internal_control_inputs)
-    return internal_control_inputs
+    return internal_control_inputs, external_control_inputs
 
   # pylint: enable=protected-access
 
@@ -2445,14 +2448,12 @@ class WhileContext(ControlFlowContext):
   def _AddOpInternal(self, op):
     """Add `op` to the current context.
 
-    In the case that op has only external data inputs, we remove all of its
-    external control inputs so all its inputs are in the same while loop
-    context. This is valid because op now has an Enter input that has all
-    the right control dependency.
+    We move any external control dependencies of the op to the loop pivot, to
+    ensure they get executed.
     """
     if not op.inputs:
       # Remove any external control dependency on this op
-      control_inputs = self._RemoveExternalControlEdges(op)
+      control_inputs, external_inputs = self._RemoveExternalControlEdges(op)
       # Add a control edge from the control pivot to this op.
       if not control_inputs:
         # pylint: disable=protected-access
@@ -2465,14 +2466,20 @@ class WhileContext(ControlFlowContext):
         x = op.inputs[index]
         real_x = self.AddValue(x)
         if real_x != x:
-          op._update_input(index, real_x)
+          op._update_input(index, real_x)  # pylint: disable=protected-access
       # Remove any external control dependency on this op.
-      self._RemoveExternalControlEdges(op)
+      _, external_inputs = self._RemoveExternalControlEdges(op)
       # Add a control dependency to prevent loop invariants from
       # enabling ops that should not be executed.
       self._MaybeAddControlDependency(op)
       for x in op.outputs:
         self._values.add(x.name)
+    if external_inputs:
+      # Use an identity to pull control inputs as data inputs. Note that we
+      # ignore ops which don't have outputs. TODO(apassos): fix that
+      external_inputs = [array_ops.identity(x.outputs[0]).op
+                         for x in external_inputs if x.outputs]
+      op._add_control_inputs(external_inputs)  # pylint: disable=protected-access
     if self._outer_context or not util.IsLoopExit(op):
       op.graph.prevent_fetching(op)
       for x in op.outputs:
-- 
GitLab


From a9a8dbc3d28c99eccb0cc5bccb32f06594b279db Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Tue, 6 Feb 2018 12:27:02 -0800
Subject: [PATCH 721/966] tfdbg: deflake session_debug_file_test

by disabling grappler in the test.

PiperOrigin-RevId: 184721353
---
 tensorflow/python/debug/lib/session_debug_testlib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index 20a40018bf..f4fac14019 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -988,7 +988,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
   def testWatchingVariableUpdateOpsSeesUpdatedValues(self):
     """Watch output slots on Variable-updating ops, with no emitted edges."""
 
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       u_init = constant_op.constant(10.0)
       u = variables.Variable(u_init, name="gdo/u")
       v_init = constant_op.constant(20.0)
-- 
GitLab


From ff647be633bd5038e7f9450ea899ac40d4da944b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 6 Feb 2018 12:30:05 -0800
Subject: [PATCH 722/966] Fixed typo

PiperOrigin-RevId: 184721743
---
 tensorflow/contrib/tpu/python/tpu/tpu_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index b8ae2bf798..a1076b763d 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -51,7 +51,7 @@ class TPUConfig(
   r"""TPU related configuration required by `TPUEstimator`.
 
   Args:
-    iterations_per_loop: This is the number of train steps runnining in TPU
+    iterations_per_loop: This is the number of train steps running in TPU
       system before returning to CPU host for each `Session.run`. This means
       global step is increased `iterations_per_loop` times in one `Session.run`.
       It is recommended to be set as number of global steps for next checkpoint.
-- 
GitLab


From 6dbbb2883edf119cc0788d0a792e69c6ec07d437 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 6 Feb 2018 12:44:12 -0800
Subject: [PATCH 723/966] Fetch OpDefs from C API instead of using python op
 registry in ops.py.

PiperOrigin-RevId: 184723558
---
 tensorflow/python/framework/ops.py | 33 ++++++++++++++++++------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index ea589cc4d4..1fdaf9664c 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -1657,7 +1657,7 @@ class Operation(object):
       self._c_op = c_op
     elif self._graph._c_graph:  # pylint: disable=protected-access
       if op_def is None:
-        op_def = self._graph._registered_ops[node_def.op]
+        op_def = self._graph._get_op_def(node_def.op)
       # TODO(skyewm): op_def_library.apply_op() flattens the incoming inputs.
       # Refactor so we don't have to do this here.
       grouped_inputs = self._reconstruct_sequence_inputs(
@@ -2164,16 +2164,7 @@ class Operation(object):
     """
     # pylint: enable=line-too-long
     if self._c_op:
-      with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          # pylint: disable=protected-access
-          c_api.TF_GraphGetOpDef(self._graph._c_graph,
-                                 compat.as_bytes(self.type), buf, status)
-          # pylint: enable=protected-access
-        data = c_api.TF_GetBuffer(buf)
-      op_def = op_def_pb2.OpDef()
-      op_def.ParseFromString(compat.as_bytes(data))
-      return op_def
+      return self._graph._get_op_def(self.type)
     else:
       return self._op_def_val
 
@@ -3377,8 +3368,8 @@ class Graph(object):
     # (2) "is_stateful" is set in OpDef
     # (3) "container" attribute is in OpDef
     # (4) "container" attribute is None
-    if (self._container and op.type in self._registered_ops and
-        self._registered_ops[op.type].is_stateful):
+    # TODO(skyewm): remove op.op_def check when _USE_C_API is removed.
+    if self._container and op.op_def and op.op_def.is_stateful:
       try:
         container_attr = op.get_attr("container")
       except ValueError:
@@ -3658,6 +3649,22 @@ class Graph(object):
   def _last_id(self):
     return self._next_id_counter
 
+  def _get_op_def(self, type):
+    """Returns the `OpDef` proto for `type`. `type` is a string."""
+    if self._c_graph:
+      with c_api_util.tf_buffer() as buf:
+        with errors.raise_exception_on_not_ok_status() as status:
+          # pylint: disable=protected-access
+          c_api.TF_GraphGetOpDef(self._c_graph,
+                                 compat.as_bytes(type), buf, status)
+          # pylint: enable=protected-access
+        data = c_api.TF_GetBuffer(buf)
+      op_def = op_def_pb2.OpDef()
+      op_def.ParseFromString(compat.as_bytes(data))
+      return op_def
+    else:
+      return self._registered_ops[type]
+
   def as_default(self):
     """Returns a context manager that makes this `Graph` the default graph.
 
-- 
GitLab


From 97983eaeec3dab9da2b063d187e87f5a1670613d Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Tue, 6 Feb 2018 12:57:07 -0800
Subject: [PATCH 724/966] [TF:XLA] Bump open source llvm revision to r324323

PiperOrigin-RevId: 184725126
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 971e30bac8..31b71c85ce 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -472,11 +472,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/299f8c346e1ab483463da5f02536ffd00b7ad9c6.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/299f8c346e1ab483463da5f02536ffd00b7ad9c6.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/ee84001a30d264f1f2acc6f8245b9886a3c0401b.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/ee84001a30d264f1f2acc6f8245b9886a3c0401b.tar.gz",
       ],
-      sha256 = "0556bc6a85000c573d92fe00946b6418cbcd3844912696a81055e4768299dda4",
-      strip_prefix = "llvm-299f8c346e1ab483463da5f02536ffd00b7ad9c6",
+      sha256 = "d2b18ec6f1838d837c4cae8adce218630b028a6033aad2b06f2554b2132b264f",
+      strip_prefix = "llvm-ee84001a30d264f1f2acc6f8245b9886a3c0401b",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
   )
 
-- 
GitLab


From 6f7c6763b80088152b6f9de5ba1046e75d28a26a Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 6 Feb 2018 12:58:56 -0800
Subject: [PATCH 725/966] Fixing the issue where an escape character was being
 included in the branch name changes.

PiperOrigin-RevId: 184725332
---
 tensorflow/tools/ci_build/update_version.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py
index 347d0769a9..52a0da9a14 100755
--- a/tensorflow/tools/ci_build/update_version.py
+++ b/tensorflow/tools/ci_build/update_version.py
@@ -261,14 +261,12 @@ def major_minor_change(old_version, new_version):
 def update_dockerfiles(old_version, new_version):
   """Update dockerfiles if there was a major change."""
   if major_minor_change(old_version, new_version):
-    old_r_major_minor = r"r%s\.%s" % (old_version.major, old_version.minor)
-    old_r_major_minor_string = old_r_major_minor.replace("\\", "")
-    r_major_minor = r"r%s\.%s" % (new_version.major, new_version.minor)
-    r_major_minor_string = r_major_minor.replace("\\", "")
+    old_r_major_minor = "r%s.%s" % (old_version.major, old_version.minor)
+    r_major_minor = "r%s.%s" % (new_version.major, new_version.minor)
 
     print("Detected Major.Minor change.")
     print("Updating pattern %s to %s in additional files"
-          % (old_r_major_minor_string, r_major_minor_string))
+          % (old_r_major_minor, r_major_minor))
 
     # Update dockerfiles
     replace_string_in_line(old_r_major_minor, r_major_minor, DEVEL_DOCKERFILE)
-- 
GitLab


From d7b8bce5c6e84f2a3cb5e7f5f5255a1d30afffb2 Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Tue, 6 Feb 2018 13:03:03 -0800
Subject: [PATCH 726/966] Add utility function which makes implicit
 `tf.get_variable` dependencies an explicit argument of a callable.

PiperOrigin-RevId: 184725878
---
 tensorflow/contrib/bayesflow/BUILD            |  17 ++
 tensorflow/contrib/bayesflow/__init__.py      |   2 +
 .../kernel_tests/variable_utils_test.py       | 135 +++++++++++++++
 .../bayesflow/python/ops/variable_utils.py    |  29 ++++
 .../python/ops/variable_utils_impl.py         | 157 ++++++++++++++++++
 5 files changed, 340 insertions(+)
 create mode 100644 tensorflow/contrib/bayesflow/python/kernel_tests/variable_utils_test.py
 create mode 100644 tensorflow/contrib/bayesflow/python/ops/variable_utils.py
 create mode 100644 tensorflow/contrib/bayesflow/python/ops/variable_utils_impl.py

diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index 82944f5363..34156c28fe 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -239,6 +239,23 @@ cuda_py_test(
     tags = ["notsan"],
 )
 
+cuda_py_test(
+    name = "variable_utils_test",
+    size = "small",
+    srcs = ["python/kernel_tests/variable_utils_test.py"],
+    additional_deps = [
+        ":bayesflow_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "variational_sgd_optimizer_test",
     size = "small",
diff --git a/tensorflow/contrib/bayesflow/__init__.py b/tensorflow/contrib/bayesflow/__init__.py
index c411026346..528c4fbacd 100644
--- a/tensorflow/contrib/bayesflow/__init__.py
+++ b/tensorflow/contrib/bayesflow/__init__.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.bayesflow.python.ops import mcmc_diagnostics
 from tensorflow.contrib.bayesflow.python.ops import metropolis_hastings
 from tensorflow.contrib.bayesflow.python.ops import monte_carlo
 from tensorflow.contrib.bayesflow.python.ops import optimizers
+from tensorflow.contrib.bayesflow.python.ops import variable_utils
 # pylint: enable=unused-import,line-too-long
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -48,6 +49,7 @@ _allowed_symbols = [
     'optimizers',
     'special_math',
     'stochastic_variables',
+    'variable_utils',
     'variational_inference',
 ]
 
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/variable_utils_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/variable_utils_test.py
new file mode 100644
index 0000000000..f978cf8641
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/variable_utils_test.py
@@ -0,0 +1,135 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utility functions related to managing `tf.Variable`s."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+import numpy as np
+
+from tensorflow.contrib.bayesflow.python.ops import variable_utils
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variable_scope as varscope_ops
+from tensorflow.python.ops import variables as variables_ops
+from tensorflow.python.platform import test
+
+
+def test_fn(x):
+  x = ops.convert_to_tensor(x, name="x")
+  dtype = x.dtype.as_numpy_dtype
+  s = x.shape.as_list()
+  z = varscope_ops.get_variable(
+      name="z",
+      dtype=dtype,
+      initializer=np.arange(np.prod(s)).reshape(s).astype(dtype))
+  y = varscope_ops.get_variable(
+      name="y",
+      dtype=dtype,
+      initializer=np.arange(np.prod(s)).reshape(s).astype(dtype)**2)
+  return x + y + z
+
+
+class _WrapCallableTest(object):
+
+  def testDefaultArgsWorkCorrectly(self):
+    with self.test_session():
+      x = constant_op.constant(self.dtype([0.1, 0.2]))
+      wrapped_fn, vars_args = variable_utils.externalize_variables_as_args(
+          test_fn, [x])
+
+      varscope_ops.get_variable_scope().reuse_variables()
+
+      result = wrapped_fn(self.dtype(2), [3, 4, 5], 0.5)
+
+      y_actual = varscope_ops.get_variable("y", dtype=self.dtype)
+      z_actual = varscope_ops.get_variable("z", dtype=self.dtype)
+
+      variables_ops.global_variables_initializer().run()
+      result_ = result.eval()
+
+      self.assertEqual(self.dtype, result_.dtype)
+      self.assertAllEqual([5.5, 6.5, 7.5], result_)
+      self.assertAllEqual([y_actual, z_actual], vars_args)
+
+  def testNonDefaultArgsWorkCorrectly(self):
+    with self.test_session():
+      x = constant_op.constant(self.dtype([0.1, 0.2]))
+
+      _ = test_fn(self.dtype([0., 0.]))   # Needed to create vars.
+      varscope_ops.get_variable_scope().reuse_variables()
+
+      y_actual = varscope_ops.get_variable("y", dtype=self.dtype)
+
+      wrapped_fn, vars_args = variable_utils.externalize_variables_as_args(
+          test_fn, [x], possible_ancestor_vars=[y_actual])
+
+      result = wrapped_fn(self.dtype([2, 3]), 0.5)  # x, y
+
+      variables_ops.global_variables_initializer().run()
+      result_ = result.eval()
+
+      self.assertEqual(self.dtype, result_.dtype)
+      self.assertAllEqual([2.5, 4.5], result_)
+      self.assertAllEqual([y_actual], vars_args)
+
+  def testWarnings(self):
+    with self.test_session():
+      x = constant_op.constant(self.dtype([0.1, 0.2]))
+      wrapped_fn, _ = variable_utils.externalize_variables_as_args(
+          test_fn, [x], possible_ancestor_vars=[])
+      varscope_ops.get_variable_scope().reuse_variables()
+      with warnings.catch_warnings(record=True) as w:
+        wrapped_fn(self.dtype(2))
+      w = sorted(w, key=lambda w: str(w.message))
+      self.assertEqual(2, len(w))
+      self.assertRegexpMatches(
+          str(w[0].message),
+          r"Variable .* 'y:0' .* not found in bypass dict.")
+      self.assertRegexpMatches(
+          str(w[1].message),
+          r"Variable .* 'z:0' .* not found in bypass dict.")
+
+  def testExceptions(self):
+    with self.test_session():
+      x = constant_op.constant(self.dtype([0.1, 0.2]))
+      wrapped_fn, _ = variable_utils.externalize_variables_as_args(
+          test_fn,
+          [x],
+          possible_ancestor_vars=[],
+          assert_variable_override=True)
+      varscope_ops.get_variable_scope().reuse_variables()
+      with self.assertRaisesRegexp(ValueError, r"not found"):
+        wrapped_fn(self.dtype(2))
+
+
+class WrapCallableTest16(test.TestCase, _WrapCallableTest):
+  dtype = np.float16
+
+
+class WrapCallableTest32(test.TestCase, _WrapCallableTest):
+  dtype = np.float32
+
+
+class WrapCallableTest64(test.TestCase, _WrapCallableTest):
+  dtype = np.float64
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/variable_utils.py b/tensorflow/contrib/bayesflow/python/ops/variable_utils.py
new file mode 100644
index 0000000000..eadf6f4d5f
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/variable_utils.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility functions related to managing `tf.Variable`s."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# go/tf-wildcard-import
+from tensorflow.contrib.bayesflow.python.ops.variable_utils_impl import *  # pylint: disable=wildcard-import,unused-wildcard-import,g-importing-member
+from tensorflow.python.util import all_util
+
+_allowed_symbols = [
+    "externalize_variables_as_args",
+]
+
+all_util.remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/variable_utils_impl.py b/tensorflow/contrib/bayesflow/python/ops/variable_utils_impl.py
new file mode 100644
index 0000000000..ca3d75b5bf
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/variable_utils_impl.py
@@ -0,0 +1,157 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility functions related to managing `tf.Variable`s.
+
+@@externalize_variables_as_args
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gradients_impl as gradients_ops
+from tensorflow.python.ops import variable_scope as varscope_ops
+from tensorflow.python.ops import variables as variables_ops
+
+__all__ = [
+    "externalize_variables_as_args",
+]
+
+
+# Cause all warnings to always be triggered.
+# Not having this means subsequent calls wont trigger the warning.
+warnings.simplefilter("always")
+
+
+def externalize_variables_as_args(fn,
+                                  fn_args=(),
+                                  ancestor_variables=None,
+                                  possible_ancestor_vars=None,
+                                  assert_variable_override=False,
+                                  name=None):
+  """"Converts variables within a callable into explicit args.
+
+  Makes a new callable from `fn` which has arguments `list(fn_args) +
+  list(ancestor_variables)`. If `ancestor_variables` is not specified, it is
+  inferred by checking which of `possible_ancestor_vars` actually influences the
+  return value of `fn` (concretely, gradient of `fn(*fn_args)` is not `None`).
+  By default `possible_ancestor_vars` is `tf.trainable_variables() +
+  tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)`.
+
+  #### Examples:
+
+  ```python
+  num_samples = 2
+  num_dims = 1
+  dtype = np.float32
+
+  def foo(x):
+    x = tf.convert_to_tensor(x, dtype=dtype, name="x")
+    s = x.shape.as_list()
+    y = tf.get_variable(
+        name="y",
+        dtype=dtype,
+        initializer=np.arange(np.prod(s)).reshape(s).astype(dtype))
+    return x + y
+
+  x = tf.constant(dtype([0.1, 0.2]))
+
+  wrapped_foo, discovered_ancestor_variables = (
+      externalize_variables_as_args(foo, [x]))
+
+  new_x = dtype([[1.], [2.]])
+  new_y = dtype([[3.], [4.]])
+  new_result = wrapped_foo(new_x, new_y)
+  # ==> [[4.], [6.]]
+
+  discovered_ancestor_variables == [tf.get_variable("y", dtype)]
+  # ==> [True]
+  ```
+
+  Args:
+    fn: Python callable which returns a `Tensor` and accepts `*fn_args`.
+    fn_args: Python list of args to `fn`. Represents dummy arguments passed to
+      `fn` to trace its execution; actual values are unimportant. These args are
+      only used to construct the output of `fn` and to resolve the ancestor
+      `tf.Variable`s.
+      Default value: `()` (i.e., `fn` takes no args).
+    ancestor_variables: Python list of `tf.Variable`s. When `None` the list is
+      expanded to non-`None` gradients of `fn(*fn_args)`. By directly providing
+      the `ancestor_variables` the internal call to `fn` is avoided.
+      Default value: `None` (i.e., `tf.Variable` dependencies are discovered).
+    possible_ancestor_vars: Python list of possible `tf.Variable`s which might
+      be a dependency of computing `fn(*fn_args)`.
+      Default value: `None` (i.e., expanded as described above).
+    assert_variable_override: Python `bool` indicating that not finding a
+      `tf.Variable` in the override list is an exception.
+      Default value: `False` (i.e., missing a `Variable` triggers a `warning`).
+    name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "externalize_variables_as_args").
+
+  Returns:
+    wrapped_fn: Python callable taking arguments like
+      `*(list(fn_args) + discovered_ancestor_variables)`.
+    discovered_ancestor_variables: Python list of `tf.Variable`s known to be a
+      dependency of `fn(*fn_args)`.
+
+  Raises:
+    ValueError: if `assert_variable_override` is `True` and `Variable` is
+      requested but not overridden.
+  """
+  def _make_bypassing_custom_getter_fn(new_var_dict):
+    """Return dict value rather than what would otherwise be dict key."""
+    def _custom_getter(getter, *args, **kwargs):
+      v = getter(*args, **kwargs)
+      new_v = new_var_dict.get(v, None)
+      if new_v is None:
+        msg = "Variable \"{}\" not found in bypass dict.".format(v)
+        if assert_variable_override:
+          raise ValueError(msg)
+        warnings.warn(msg)
+        return v
+      return new_v
+    return _custom_getter
+
+  with ops.name_scope(name, "externalize_variables_as_args"):
+    if ancestor_variables is not None and not ancestor_variables:
+      return fn, ()
+    if ancestor_variables is None:
+      y = fn(*fn_args)  # Side-effect: adds trainable vars.
+      if possible_ancestor_vars is None:
+        possible_ancestor_vars = (
+            variables_ops.trainable_variables() +
+            ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+      # TODO(b/72873296): Add a dedicated op for identifying ancestors.
+      ancestors = [v for g, v
+                   in zip(gradients_ops.gradients(y, possible_ancestor_vars),
+                          possible_ancestor_vars)
+                   if g is not None]
+      ancestor_variables = sorted(ancestors, key=lambda v: v.name)
+  n = len(fn_args)
+  def _fn(*args):
+    with ops.name_scope("wrapped_fn"):
+      vars_dict = dict(
+          (k, ops.convert_to_tensor(
+              v, dtype=k.dtype.base_dtype, name=k.op.name))
+          for k, v in zip(ancestor_variables, args[n:]))
+      with varscope_ops.variable_scope(
+          varscope_ops.get_variable_scope(),
+          reuse=True,
+          custom_getter=_make_bypassing_custom_getter_fn(vars_dict)):
+        return fn(*args[:n])
+  return _fn, ancestor_variables
-- 
GitLab


From d55d0d863c4e705846fc61ba3af7569cca509d67 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Tue, 6 Feb 2018 13:48:55 -0800
Subject: [PATCH 727/966] Added support for nested functions Properly handle
 the case of control dependencies

PiperOrigin-RevId: 184733444
---
 .../core/grappler/grappler_item_builder.cc    | 28 +++++--
 .../core/grappler/grappler_item_builder.h     |  3 +-
 .../grappler/grappler_item_builder_test.cc    | 82 ++++++++++++++++++-
 3 files changed, 101 insertions(+), 12 deletions(-)

diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 88fddaf019..7ba498dd06 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -512,7 +512,8 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
 
 std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
     const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_attr) {
+    const std::unordered_map<string, AttrValue>& func_attr,
+    const FunctionDefLibrary& library) {
   if (func.signature().name().empty()) {
     LOG(ERROR) << "function name must be specified.";
     return nullptr;
@@ -543,6 +544,8 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
   }
 
   // Add the function body to the graph.
+  FunctionLibraryDefinition func_def(OpRegistry::Global(), library);
+
   for (const NodeDef& node : func.node_def()) {
     NodeDef* new_node = new_item->graph.add_node();
     *new_node = node;
@@ -557,15 +560,17 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
 
     // Functions use a custom format to encode connectivity. Map these custom
     // strings to regular ones.
-    const OpDef* op_def = nullptr;
-    Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
+    const OpRegistrationData* registration;
+    Status status = func_def.LookUp(node.op(), &registration);
     if (!status.ok()) {
       LOG(ERROR) << "Op " << node.op() << " not registered: " << status;
       return nullptr;
     }
+
     tensorflow::NameRangeMap inputs;
     tensorflow::NameRangeMap outputs;
-    status = tensorflow::NameRangesForNode(node, *op_def, &inputs, &outputs);
+    status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs,
+                                           &outputs);
     if (!status.ok()) {
       LOG(ERROR) << "Op " << node.op() << " invalid: " << status;
       return nullptr;
@@ -587,12 +592,17 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
     // Rewrite the inputs to use the normal naming convention.
     for (int i = 0; i < node.input_size(); ++i) {
       const string& input = node.input(i);
-      auto it = port_map.find(input);
-      if (it == port_map.end()) {
-        LOG(ERROR) << "Unknown input: " << input;
-        return nullptr;
+      if (IsControlInput(input)) {
+        // No need to remap control dependencies.
+        continue;
+      } else {
+        auto it = port_map.find(input);
+        if (it == port_map.end()) {
+          LOG(ERROR) << "Unknown input: " << input;
+          return nullptr;
+        }
+        node.set_input(i, it->second);
       }
-      node.set_input(i, it->second);
     }
   }
 
diff --git a/tensorflow/core/grappler/grappler_item_builder.h b/tensorflow/core/grappler/grappler_item_builder.h
index cb116840c1..e892a3f556 100644
--- a/tensorflow/core/grappler/grappler_item_builder.h
+++ b/tensorflow/core/grappler/grappler_item_builder.h
@@ -62,7 +62,8 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
 // Returns nullptr if the given function def cannot be converted.
 std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
     const FunctionDef& func,
-    const std::unordered_map<string, AttrValue>& func_attr);
+    const std::unordered_map<string, AttrValue>& func_attr,
+    const FunctionDefLibrary& library);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/grappler_item_builder_test.cc b/tensorflow/core/grappler/grappler_item_builder_test.cc
index 8dcc4ec0f9..68437b6041 100644
--- a/tensorflow/core/grappler/grappler_item_builder_test.cc
+++ b/tensorflow/core/grappler/grappler_item_builder_test.cc
@@ -300,8 +300,9 @@ TEST_F(GrapplerItemBuilderTest, FromSimpleFunctionDef) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
+  FunctionDefLibrary library;
   std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr);
+      GrapplerItemFromFunctionDef(func, func_attr, library);
   CHECK(item);
   EXPECT_EQ("XTimesTwo", item->id);
   EXPECT_EQ(4, item->graph.node_size());
@@ -366,8 +367,9 @@ TEST_F(GrapplerItemBuilderTest, FromFunctionDefWithMultiOutputNodes) {
 
   std::unordered_map<string, AttrValue> func_attr;
   func_attr["T"].set_type(DT_FLOAT);
+  FunctionDefLibrary library;
   std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, func_attr);
+      GrapplerItemFromFunctionDef(func, func_attr, library);
   CHECK(item);
   EXPECT_EQ("SubGrad", item->id);
   EXPECT_EQ(12, item->graph.node_size());
@@ -401,6 +403,82 @@ TEST_F(GrapplerItemBuilderTest, FromFunctionDefWithMultiOutputNodes) {
   }
 }
 
+TEST_F(GrapplerItemBuilderTest, FromFunctionDefWithNestedFuncs) {
+  FunctionDefLibrary library;
+  *library.add_function() = FunctionDefHelper::Define(
+      // Name
+      "Swap",
+      // Args
+      {"i0: T", "i1: T"},
+      // Return values
+      {"o0: T", "o1: T"},
+      // Attr def
+      {"T: {float, double}"},
+      // Nodes
+      {{{"o0"}, "Identity", {"i1"}, {{"T", "$T"}}},
+       {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}});
+
+  FunctionDef func = FunctionDefHelper::Create(
+      // Name
+      "ManySwapsFirst",
+      // Args
+      {"x: float", "y: float"},
+      // Return values
+      {"o: float"},
+      // attr def
+      {},
+      // Nodes
+      // o = x*x + y*y.  Furthermore, The 1st swap depends on x2, and
+      // y2 depends on the 2nd swap.  The 2nd swap has data dependency
+      // on the 1st swap.
+      {{{"a0"}, "Swap", {"x", "y"}, {{"T", DT_FLOAT}}, {"x2"}},
+       {{"a1"}, "Swap", {"a0:o0:0", "a0:o1:0"}, {{"T", DT_FLOAT}}},
+       {{"x2"}, "Mul", {"x", "x"}, {{"T", DT_FLOAT}}},
+       {{"y2"}, "Mul", {"y", "y"}, {{"T", DT_FLOAT}}, {"a1"}},
+       {{"o"}, "Add", {"x2:z:0", "y2:z:0"}, {{"T", DT_FLOAT}}}},
+      {{"o", "o:z:0"}});
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+  std::unique_ptr<GrapplerItem> item =
+      GrapplerItemFromFunctionDef(func, func_attr, library);
+
+  for (const NodeDef &node : item->graph.node()) {
+    if (node.name() == "x" || node.name() == "y") {
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "a0") {
+      EXPECT_EQ("Swap", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^x2", node.input(2));
+    } else if (node.name() == "a1") {
+      EXPECT_EQ("Swap", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("a0:0", node.input(0));
+      EXPECT_EQ("a0:1", node.input(1));
+    } else if (node.name() == "x2") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+    } else if (node.name() == "y2") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^a1", node.input(2));
+    } else if (node.name() == "o") {
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x2:0", node.input(0));
+      EXPECT_EQ("y2:0", node.input(1));
+    }
+  }
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From ee5a6641e3a2db7807a655bef21614e97b82c769 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Tue, 6 Feb 2018 13:53:56 -0800
Subject: [PATCH 728/966] Make TFE_Py_FastpathExecute work for all types of ops

MatMul benchmarks:
entry {
  name: "MicroBenchmarks.benchmark_gen_math_ops_matmul_2_by_2_CPU"
  iters: 30000
  wall_time: 11.580435435
  extras {
    key: "examples_per_sec"
    value {
      double_value: 86352.538781
    }
  }
}

entry {
  name: "MicroBenchmarks.benchmark_tfe_py_fastpath_execute_matmul_2_by_2_CPU"
  iters: 30000
  wall_time: 7.02576637268
  extras {
    key: "examples_per_sec"
    value {
      double_value: 142333.227004
    }
  }
}

PiperOrigin-RevId: 184734289
---
 tensorflow/c/c_api.cc                      |   4 +
 tensorflow/python/eager/benchmarks_test.py |  29 +-
 tensorflow/python/eager/core.py            |  14 +
 tensorflow/python/eager/pywrap_tfe.h       |  20 +-
 tensorflow/python/eager/pywrap_tfe_src.cc  | 630 ++++++++++++++++++---
 tensorflow/python/eager/pywrap_tfe_test.py | 114 +++-
 tensorflow/python/pywrap_tfe.i             |   1 +
 7 files changed, 690 insertions(+), 122 deletions(-)

diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 3c7f041b39..b10af0f060 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -109,6 +109,10 @@ TF_Status* TF_NewStatus() { return new TF_Status; }
 void TF_DeleteStatus(TF_Status* s) { delete s; }
 
 void TF_SetStatus(TF_Status* s, TF_Code code, const char* msg) {
+  if (code == TF_OK) {
+    s->status = Status::OK();
+    return;
+  }
   s->status = Status(static_cast<Code>(code), tensorflow::StringPiece(msg));
 }
 
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 75526ba9c1..7a60bd68e4 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -28,11 +28,14 @@ from __future__ import print_function
 import time
 
 import numpy as np
+import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop  # pylint: disable=unused-import
 from tensorflow.python.eager import context
+from tensorflow.python.eager import core
+from tensorflow.python.eager import execute
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
@@ -46,19 +49,25 @@ CPU = "/device:CPU:0"
 GPU = "/device:GPU:0"
 
 
-def record_gradient_callback(inputs, attrs, results):
-  return backprop._record_gradient("MatMul", inputs, attrs, results, None)
-
-
-def c_tfe_py_fastpath_execute(a, b, transpose_a=False, transpose_b=False):
+def c_tfe_py_fastpath_execute(a,
+                              b,
+                              transpose_a=False,
+                              transpose_b=False,
+                              name=None):
   ctx = context.context()
   assert not ctx.in_graph_mode(
   ), "The prototype doesn't contain C code for graph construction"
-  ctx_handle = ctx._handle  # pylint: disable=protected-access
-
-  return pywrap_tensorflow.TFE_Py_FastPathExecute(
-      ctx_handle, None, "MatMul", record_gradient_callback, a, b,
-      "transpose_a", transpose_a, "transpose_b", transpose_b)[0]
+  try:
+    return pywrap_tensorflow.TFE_Py_FastPathExecute(
+        ctx._handle, ctx.device_name, "MatMul", execute.record_gradient, name,
+        ctx._post_execution_callbacks, a, b, "transpose_a", transpose_a,
+        "transpose_b", transpose_b)
+  except core._NotOkStatusException as e:
+    if name is not None:
+      message = e.message + " name: " + name
+    else:
+      message = e.message
+    six.raise_from(core._status_to_exception(e.code, message), None)
 
 
 class MicroBenchmarks(test.Benchmark):
diff --git a/tensorflow/python/eager/core.py b/tensorflow/python/eager/core.py
index 483b717210..8fb6930020 100644
--- a/tensorflow/python/eager/core.py
+++ b/tensorflow/python/eager/core.py
@@ -47,3 +47,17 @@ class _NotOkStatusException(Exception):
 
 
 pywrap_tensorflow.TFE_Py_RegisterExceptionClass(_NotOkStatusException)
+
+
+class _FallbackException(Exception):
+  """Exception class to handle fallback from the fastpath.
+
+  The fastpath that we refer to here is the one implemented to reduce per-op
+  overheads (TFE_Py_FastPathExecute_C). If the conditions for executing the op
+  on the fastpath are not met, we fallback to a safer (and more complete)
+  slowpath, and this Exception is raised to signal that transition.
+  """
+  pass
+
+
+pywrap_tensorflow.TFE_Py_RegisterFallbackExceptionClass(_FallbackException)
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 4aea134fa9..925eeb553c 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -45,9 +45,17 @@ void TFE_Py_Execute(TFE_Context* ctx, const char* device_name,
                     PyObject* attrs, TFE_OutputTensorHandles* outputs,
                     TF_Status* out_status);
 
+// Registers e as the Exception to be raised when the conditions of
+// TFE_Py_FastPathExecute_C have not been met. When this exception is set, it
+// is a signal to the calling code that it should fall back to the safer (and
+// more complete) code path.
+PyObject* TFE_Py_RegisterExceptionClass(PyObject* e);
+
 // Registers e as the Exception class for handling not ok Status. Returns
 // Py_None if registration succeeds, else throws a TypeError and returns NULL.
-PyObject* TFE_Py_RegisterExceptionClass(PyObject* e);
+//
+// This function is not thread-safe.
+PyObject* TFE_Py_RegisterFallbackExceptionClass(PyObject* e);
 
 // Returns 0 if 'status' is TF_OK. Otherwise, raises an exception (using
 // `exception` if not nullptr, else using the class registered via
@@ -142,10 +150,12 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
 //          or NULL for automatic selection.
 //  Item 3: op_name: Name of the TensorFlow op to execute.
 //  Item 4: record_gradient_callback: Callback that records the gradient of the
-//          result.
-//          The callback takes (inputs, attrs, result) - all sequences and
-//          records the gradient.
-//  Item 5 onwards: inputs - This is a list of inputs followed by a list of
+//          result. The callback takes (op_name, inputs, attrs, result, name)
+//          - all sequences and records the gradient.
+//  Item 5: name: An optional name for the operation.
+//  Item 6: List representing all callbacks to execute after successful
+//  op execute.
+//  Item 7 onwards: inputs - This is a list of inputs followed by a list of
 //        attrs. It is not necessary for type attrs to be present.
 //
 // This is named _C since there doesn't seem to be any way to make it visible
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index d927f3abed..6e96cf6ebe 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -23,9 +23,11 @@ limitations under the License.
 #include "tensorflow/c/eager/tape.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/compactptrset.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/eager/pywrap_tensor.h"
 
@@ -56,6 +58,7 @@ PARSE_VALUE(ParseInt64Value, int64_t, PyLong_Check, PyLong_AsLong)
 #else
 PARSE_VALUE(ParseIntValue, int, PyInt_Check, PyInt_AsLong)
 PARSE_VALUE(ParseInt64Value, int64_t, PyInt_Check, PyInt_AsLong)
+PARSE_VALUE(ParseInt64LongValue, int64_t, PyLong_Check, PyLong_AsLong)
 #endif
 PARSE_VALUE(ParseFloatValue, float, PyFloat_Check, PyFloat_AsDouble)
 #undef PARSE_VALUE
@@ -86,8 +89,40 @@ bool ParseBoolValue(const string& key, PyObject* py_value, TF_Status* status,
   return true;
 }
 
-bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
-                   TF_AttrType type, TF_Status* status) {
+bool IsInteger(PyObject* py_value) {
+#if PY_MAJOR_VERSION >= 3
+  return PyLong_Check(py_value);
+#else
+  return PyInt_Check(py_value);
+#endif
+}
+
+// The passed in py_value is expected to be an object of the python type
+// dtypes.DType or an int.
+bool ParseTypeValue(const string& key, PyObject* py_value, TF_Status* status,
+                    int* value) {
+  if (IsInteger(py_value)) {
+    return ParseIntValue(key, py_value, status, value);
+  }
+
+  PyObject* py_type_enum = PyObject_GetAttrString(py_value, "_type_enum");
+  if (py_type_enum == nullptr) {
+    return false;
+  }
+
+  if (!ParseIntValue(key, py_type_enum, status, value)) {
+    Py_DECREF(py_type_enum);
+    return false;
+  }
+
+  Py_DECREF(py_type_enum);
+  return true;
+}
+
+bool SetOpAttrList(
+    TFE_Op* op, const char* key, PyObject* py_list, TF_AttrType type,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status) {
   if (!PySequence_Check(py_list)) {
     TF_SetStatus(
         status, TF_INVALID_ARGUMENT,
@@ -97,6 +132,7 @@ bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
     return false;
   }
   const int num_values = PySequence_Size(py_list);
+  if (attr_list_sizes != nullptr) (*attr_list_sizes)[key] = num_values;
 
 #define PARSE_LIST(c_type, parse_fn)                                \
   std::unique_ptr<c_type[]> values(new c_type[num_values]);         \
@@ -118,7 +154,7 @@ bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
     PARSE_LIST(unsigned char, ParseBoolValue);
     TFE_OpSetAttrBoolList(op, key, values.get(), num_values);
   } else if (type == TF_ATTR_TYPE) {
-    PARSE_LIST(int, ParseIntValue);
+    PARSE_LIST(int, ParseTypeValue);
     TFE_OpSetAttrTypeList(op, key,
                           reinterpret_cast<const TF_DataType*>(values.get()),
                           num_values);
@@ -183,8 +219,65 @@ bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
   return true;
 }
 
-bool SetOpAttrScalar(TFE_Context* ctx, TFE_Op* op, const char* key,
-                     PyObject* py_value, TF_AttrType type, TF_Status* status) {
+void SetOpAttrListDefault(
+    TFE_Op* op, const tensorflow::OpDef::AttrDef& attr, const char* key,
+    TF_AttrType type,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status) {
+  if (type == TF_ATTR_STRING) {
+    int num_values = attr.default_value().list().s_size();
+    std::unique_ptr<const char* []> values(new const char*[num_values]);
+    (*attr_list_sizes)[key] = num_values;
+    for (int i = 0; i < num_values; i++) {
+      values[i] = attr.default_value().list().s(i).data();
+    }
+    TFE_OpSetAttrStringList(op, key, values.get(), num_values);
+  } else if (type == TF_ATTR_INT) {
+    int num_values = attr.default_value().list().i_size();
+    std::unique_ptr<int64_t[]> values(new int64_t[num_values]);
+    (*attr_list_sizes)[key] = num_values;
+    for (int i = 0; i < num_values; i++) {
+      values[i] = attr.default_value().list().i(i);
+    }
+    TFE_OpSetAttrIntList(op, key, values.get(), num_values);
+  } else if (type == TF_ATTR_FLOAT) {
+    int num_values = attr.default_value().list().f_size();
+    std::unique_ptr<float[]> values(new float[num_values]);
+    (*attr_list_sizes)[key] = num_values;
+    for (int i = 0; i < num_values; i++) {
+      values[i] = attr.default_value().list().f(i);
+    }
+    TFE_OpSetAttrFloatList(op, key, values.get(), num_values);
+  } else if (type == TF_ATTR_BOOL) {
+    int num_values = attr.default_value().list().b_size();
+    std::unique_ptr<unsigned char[]> values(new unsigned char[num_values]);
+    (*attr_list_sizes)[key] = num_values;
+    for (int i = 0; i < num_values; i++) {
+      values[i] = attr.default_value().list().b(i);
+    }
+    TFE_OpSetAttrBoolList(op, key, values.get(), num_values);
+  } else if (type == TF_ATTR_TYPE) {
+    int num_values = attr.default_value().list().type_size();
+    std::unique_ptr<int[]> values(new int[num_values]);
+    (*attr_list_sizes)[key] = num_values;
+    for (int i = 0; i < num_values; i++) {
+      values[i] = attr.default_value().list().type(i);
+    }
+    TFE_OpSetAttrTypeList(op, key,
+                          reinterpret_cast<const TF_DataType*>(values.get()),
+                          attr.default_value().list().type_size());
+  } else {
+    TF_SetStatus(status, TF_UNIMPLEMENTED,
+                 "Shapes, tensors and lists are not yet implemented for "
+                 "default valued attributes for an operation.");
+  }
+}
+
+bool SetOpAttrScalar(
+    TFE_Context* ctx, TFE_Op* op, const char* key, PyObject* py_value,
+    TF_AttrType type,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status) {
   if (type == TF_ATTR_STRING) {
     const char* value;
     if (!ParseStringValue(key, py_value, status, &value)) return false;
@@ -193,6 +286,10 @@ bool SetOpAttrScalar(TFE_Context* ctx, TFE_Op* op, const char* key,
     int64_t value;
     if (!ParseInt64Value(key, py_value, status, &value)) return false;
     TFE_OpSetAttrInt(op, key, value);
+    // attr_list_sizes is set for all int attributes (since at this point we are
+    // not aware if that attribute might be used to calculate the size of an
+    // output list or not).
+    if (attr_list_sizes != nullptr) (*attr_list_sizes)[key] = value;
   } else if (type == TF_ATTR_FLOAT) {
     float value;
     if (!ParseFloatValue(key, py_value, status, &value)) return false;
@@ -203,7 +300,7 @@ bool SetOpAttrScalar(TFE_Context* ctx, TFE_Op* op, const char* key,
     TFE_OpSetAttrBool(op, key, value);
   } else if (type == TF_ATTR_TYPE) {
     int value;
-    if (!ParseIntValue(key, py_value, status, &value)) return false;
+    if (!ParseTypeValue(key, py_value, status, &value)) return false;
     TFE_OpSetAttrType(op, key, static_cast<TF_DataType>(value));
   } else if (type == TF_ATTR_SHAPE) {
     if (py_value == Py_None) {
@@ -269,6 +366,40 @@ bool SetOpAttrScalar(TFE_Context* ctx, TFE_Op* op, const char* key,
   return true;
 }
 
+void SetOpAttrScalarDefault(
+    TFE_Op* op, const tensorflow::OpDef::AttrDef& attr, const char* attr_name,
+    TF_AttrType type,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status) {
+  if (type == TF_ATTR_STRING) {
+    if (attr.default_value().value_case() == tensorflow::AttrValue::kS) {
+      TFE_OpSetAttrString(op, attr_name, attr.default_value().s().data());
+    }
+  } else if (type == TF_ATTR_INT) {
+    if (attr.default_value().value_case() == tensorflow::AttrValue::kI) {
+      TFE_OpSetAttrInt(op, attr_name,
+                       static_cast<int64_t>(attr.default_value().i()));
+      (*attr_list_sizes)[attr.name()] = attr.default_value().i();
+    }
+  } else if (type == TF_ATTR_FLOAT) {
+    if (attr.default_value().value_case() == tensorflow::AttrValue::kF) {
+      TFE_OpSetAttrFloat(op, attr_name, attr.default_value().f());
+    }
+  } else if (type == TF_ATTR_BOOL) {
+    if (attr.default_value().value_case() == tensorflow::AttrValue::kB) {
+      TFE_OpSetAttrBool(op, attr_name, attr.default_value().b());
+    }
+  } else if (type == TF_ATTR_TYPE) {
+    if (attr.default_value().value_case() == tensorflow::AttrValue::kType) {
+      TFE_OpSetAttrType(op, attr_name,
+                        static_cast<TF_DataType>(attr.default_value().type()));
+    }
+  } else {
+    TF_SetStatus(status, TF_UNIMPLEMENTED,
+                 "Shapes, tensors and lists are not yet implemented.");
+  }
+}
+
 // start_index is the index at which the Tuple/List attrs will start getting
 // processed.
 void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index,
@@ -294,9 +425,38 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index,
     const TF_AttrType type = TFE_OpGetAttrType(op, key, &is_list, out_status);
     if (TF_GetCode(out_status) != TF_OK) return;
     if (is_list != 0) {
-      if (!SetOpAttrList(op, key, py_value, type, out_status)) return;
+      if (!SetOpAttrList(op, key, py_value, type, nullptr, out_status)) return;
+    } else {
+      if (!SetOpAttrScalar(ctx, op, key, py_value, type, nullptr, out_status))
+        return;
+    }
+  }
+}
+
+// This function will set the op attrs required. If an attr has the value of
+// None, then it will read the AttrDef to get the default value and set that
+// instead. Any failure in this function will simply fall back to the slow path.
+void SetOpAttrWithDefaults(
+    TFE_Context* ctx, TFE_Op* op, const tensorflow::OpDef::AttrDef& attr,
+    const char* attr_name, PyObject* attr_value,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status) {
+  unsigned char is_list = 0;
+  const TF_AttrType type = TFE_OpGetAttrType(op, attr_name, &is_list, status);
+  if (TF_GetCode(status) != TF_OK) return;
+  if (attr_value == Py_None) {
+    if (is_list != 0) {
+      SetOpAttrListDefault(op, attr, attr_name, type, attr_list_sizes, status);
     } else {
-      if (!SetOpAttrScalar(ctx, op, key, py_value, type, out_status)) return;
+      SetOpAttrScalarDefault(op, attr, attr_name, type, attr_list_sizes,
+                             status);
+    }
+  } else {
+    if (is_list != 0) {
+      SetOpAttrList(op, attr_name, attr_value, type, attr_list_sizes, status);
+    } else {
+      SetOpAttrScalar(ctx, op, attr_name, attr_value, type, attr_list_sizes,
+                      status);
     }
   }
 }
@@ -305,6 +465,9 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index,
 tensorflow::mutex exception_class_mutex(tensorflow::LINKER_INITIALIZED);
 PyObject* exception_class GUARDED_BY(exception_class_mutex) = nullptr;
 
+// Python subclass of Exception that is created to signal fallback.
+PyObject* fallback_exception_class = nullptr;
+
 tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED);
 tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0;
 
@@ -360,6 +523,37 @@ PyObject* TFE_Py_RegisterExceptionClass(PyObject* e) {
   }
 }
 
+PyObject* TFE_Py_RegisterFallbackExceptionClass(PyObject* e) {
+  if (fallback_exception_class != nullptr) {
+    Py_DECREF(fallback_exception_class);
+  }
+  if (PyObject_IsSubclass(e, PyExc_Exception) <= 0) {
+    fallback_exception_class = nullptr;
+    PyErr_SetString(PyExc_TypeError,
+                    "TFE_Py_RegisterFallbackExceptionClass: "
+                    "Registered class should be subclass of Exception.");
+    return nullptr;
+  } else {
+    Py_INCREF(e);
+    fallback_exception_class = e;
+    Py_RETURN_NONE;
+  }
+}
+
+void RaiseFallbackException(const char* message) {
+  if (fallback_exception_class != nullptr) {
+    PyErr_SetObject(fallback_exception_class, Py_BuildValue("s", message));
+    return;
+  }
+
+  PyErr_SetString(
+      PyExc_RuntimeError,
+      tensorflow::strings::StrCat(
+          "Fallback exception type not set, attempting to fallback due to ",
+          message)
+          .data());
+}
+
 int MaybeRaiseExceptionFromTFStatus(TF_Status* status, PyObject* exception) {
   if (TF_GetCode(status) == TF_OK) return 0;
   const char* msg = TF_Message(status);
@@ -1036,17 +1230,62 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
 }
 
 namespace {
-static const int kFastPathExecuteInputStartIndex = 4;
+static const int kFastPathExecuteInputStartIndex = 6;
+
+PyObject* GetPythonObjectFromString(const char* s) {
+#if PY_MAJOR_VERSION >= 3
+  return PyUnicode_FromString(s);
+#else
+  return PyBytes_FromString(s);
+#endif
+}
 
-bool CheckEagerTensors(PyObject* seq, int start_index, int num_to_check) {
-  for (int i = start_index; i < start_index + num_to_check; i++) {
-    PyObject* item = PyTuple_GET_ITEM(seq, i);
-    if (!EagerTensor_CheckExact(item)) return false;
+bool CheckEagerTensors(PyObject* seq, int start_index,
+                       const tensorflow::OpDef& op_def) {
+  for (int i = 0; i < op_def.input_arg_size(); i++) {
+    PyObject* item = PyTuple_GET_ITEM(seq, i + start_index);
+    if (!op_def.input_arg(i).number_attr().empty() ||
+        !op_def.input_arg(i).type_list_attr().empty()) {
+      // This item should be a list input.
+      if (!PyList_Check(item)) return false;
+      for (Py_ssize_t j = 0; j < PyList_Size(item); j++) {
+        if (!EagerTensor_CheckExact(PyList_GET_ITEM(item, j))) return false;
+      }
+    } else if (!EagerTensor_CheckExact(item)) {
+      return false;
+    }
   }
 
   return true;
 }
 
+// Adds input and type attr to the op, and to the list of flattened
+// inputs/attrs.
+bool AddInputToOp(PyObject* input, const tensorflow::OpDef::ArgDef* input_arg,
+                  std::vector<PyObject*>* flattened_attrs,
+                  std::vector<PyObject*>* flattened_inputs, TFE_Op* op,
+                  TF_Status* status) {
+  TFE_TensorHandle* input_handle = EagerTensor_Handle(input);
+  if (input_arg != nullptr && !input_arg->type_attr().empty()) {
+    auto dtype = TFE_TensorHandleDataType(input_handle);
+    TFE_OpSetAttrType(op, input_arg->type_attr().data(), dtype);
+    if (flattened_attrs != nullptr) {
+      flattened_attrs->push_back(
+          GetPythonObjectFromString(input_arg->type_attr().data()));
+      flattened_attrs->push_back(PyLong_FromLong(dtype));
+    }
+  }
+
+  if (flattened_inputs != nullptr) {
+    flattened_inputs->push_back(input);
+  }
+  TFE_OpAddInput(op, input_handle, status);
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
+    return false;
+  }
+  return true;
+}
+
 const tensorflow::OpDef* GetOpDef(PyObject* py_op_name) {
   const char* op_name = TFE_GetPythonString(py_op_name);
   if (op_name == nullptr) {
@@ -1073,59 +1312,109 @@ const char* GetDeviceName(PyObject* py_device_name) {
   return nullptr;
 }
 
-bool MaybeRunRecordGradientCallback(const tensorflow::OpDef* op_def,
-                                    PyObject* args, PyObject* result,
-                                    PyObject* record_gradient_callback) {
-  if (*ThreadTapeIsStopped() || GetTapeSet()->empty() ||
-      record_gradient_callback == Py_None) {
-    return true;
-  }
-  if (!PyCallable_Check(record_gradient_callback)) {
-    PyErr_SetString(
-        PyExc_TypeError,
-        Printf(
-            "expected a function for record_gradient_callback, got %s instead",
-            record_gradient_callback->ob_type->tp_name)
-            .c_str());
+bool RaiseIfNotPyList(PyObject* list, const string& attr_name) {
+  if (!PyList_Check(list)) {
+    PyErr_SetString(PyExc_TypeError,
+                    Printf("expected a list for attr %s, got %s instead",
+                           attr_name.data(), list->ob_type->tp_name)
+                        .data());
+
     return false;
   }
+  return true;
+}
 
-  PyObject* inputs = PyTuple_New(op_def->input_arg_size());
-  for (int i = 0; i < op_def->input_arg_size(); i++) {
-    auto* input = PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex + i);
+bool RunCallbacks(bool run_gradient_callback, bool run_post_exec_callbacks,
+                  const tensorflow::OpDef* op_def, PyObject* args,
+                  const std::vector<PyObject*>& flattened_inputs,
+                  const std::vector<PyObject*>& flattened_attrs,
+                  PyObject* flattened_result, PyObject* op_name, PyObject* name,
+                  PyObject* record_gradient_callback, PyObject* callbacks) {
+  PyObject* inputs = PyTuple_New(flattened_inputs.size());
+  for (int i = 0; i < flattened_inputs.size(); i++) {
+    PyObject* input = flattened_inputs[i];
     Py_INCREF(input);
     PyTuple_SET_ITEM(inputs, i, input);
   }
 
-  int args_size = PyTuple_GET_SIZE(args);
-  int num_attrs =
-      args_size - op_def->input_arg_size() - kFastPathExecuteInputStartIndex;
+  int num_non_inferred_attrs = PyTuple_GET_SIZE(args) -
+                               op_def->input_arg_size() -
+                               kFastPathExecuteInputStartIndex;
+  int num_attrs = flattened_attrs.size() + num_non_inferred_attrs;
   PyObject* attrs = PyTuple_New(num_attrs);
-  for (int i = 0; i < num_attrs; i++) {
+
+  for (int i = 0; i < num_non_inferred_attrs; i++) {
     auto* attr = PyTuple_GET_ITEM(
         args, kFastPathExecuteInputStartIndex + op_def->input_arg_size() + i);
     Py_INCREF(attr);
     PyTuple_SET_ITEM(attrs, i, attr);
   }
+  for (int i = num_non_inferred_attrs; i < num_attrs; i++) {
+    // Not INCREFing anything in flattened_attrs as each of those is a new
+    // reference, so allow the attrs tuple to steal the reference.
+    PyTuple_SET_ITEM(attrs, i, flattened_attrs.at(i - num_non_inferred_attrs));
+  }
 
-  PyObject* callback_args = Py_BuildValue("OOO", inputs, attrs, result);
-  PyObject_CallObject(record_gradient_callback, callback_args);
+  auto cleaner = tensorflow::gtl::MakeCleanup([inputs, attrs] {
+    Py_DECREF(inputs);
+    Py_DECREF(attrs);
+  });
+
+  if (run_gradient_callback) {
+    if (!PyCallable_Check(record_gradient_callback)) {
+      PyErr_SetString(PyExc_TypeError,
+                      Printf("expected a function for "
+                             "record_gradient_callback, got %s instead",
+                             record_gradient_callback->ob_type->tp_name)
+                          .c_str());
+      return false;
+    }
+
+    PyObject* callback_args =
+        Py_BuildValue("OOOOO", op_name, inputs, attrs, flattened_result, name);
+    PyObject* callback_result =
+        PyObject_CallObject(record_gradient_callback, callback_args);
+    Py_DECREF(callback_args);
+    if (!callback_result) {
+      return false;
+    }
+    Py_DECREF(callback_result);
+  }
+
+  if (run_post_exec_callbacks) {
+    // TODO(nareshmodi): update the execution callback interface to match the
+    // record_gradient interface so that this doesn't need to be rebuilt.
+    PyObject* callback_args =
+        Py_BuildValue("OOOOO", op_name, name, attrs, inputs, flattened_result);
+
+    for (Py_ssize_t i = 0; i < PyList_Size(callbacks); i++) {
+      PyObject* callback_fn = PyList_GET_ITEM(callbacks, i);
+      if (!PyCallable_Check(callback_fn)) {
+        PyErr_SetString(
+            PyExc_TypeError,
+            Printf("expected a function for "
+                   "post execution callback in index %ld, got %s instead",
+                   i, callback_fn->ob_type->tp_name)
+                .c_str());
+        return false;
+      }
+      PyObject* callback_result =
+          PyObject_CallObject(callback_fn, callback_args);
+      if (!callback_result) {
+        Py_DECREF(callback_args);
+        return false;
+      }
+      Py_DECREF(callback_result);
+    }
+    Py_DECREF(callback_args);
+  }
 
-  Py_DECREF(inputs);
-  Py_DECREF(callback_args);
-  Py_DECREF(attrs);
   return true;
 }
+
 }  // namespace
 
 PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
-  TFE_Context* ctx = reinterpret_cast<TFE_Context*>(
-      PyCapsule_GetPointer(PyTuple_GET_ITEM(args, 0), nullptr));
-  const tensorflow::OpDef* op_def = GetOpDef(PyTuple_GET_ITEM(args, 2));
-  if (op_def == nullptr) return nullptr;
-  const char* device_name = GetDeviceName(PyTuple_GET_ITEM(args, 1));
-  PyObject* record_gradient_callback = PyTuple_GET_ITEM(args, 3);
-
   Py_ssize_t args_size = PyTuple_GET_SIZE(args);
   if (args_size < kFastPathExecuteInputStartIndex) {
     PyErr_SetString(
@@ -1136,6 +1425,16 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
+  TFE_Context* ctx = reinterpret_cast<TFE_Context*>(
+      PyCapsule_GetPointer(PyTuple_GET_ITEM(args, 0), nullptr));
+  const char* device_name = GetDeviceName(PyTuple_GET_ITEM(args, 1));
+  PyObject* op_name = PyTuple_GET_ITEM(args, 2);
+  const tensorflow::OpDef* op_def = GetOpDef(op_name);
+  if (op_def == nullptr) return nullptr;
+  PyObject* record_gradient_callback = PyTuple_GET_ITEM(args, 3);
+  PyObject* name = PyTuple_GET_ITEM(args, 4);
+  PyObject* callbacks = PyTuple_GET_ITEM(args, 5);
+
   if (args_size < kFastPathExecuteInputStartIndex + op_def->input_arg_size()) {
     PyErr_SetString(
         PyExc_ValueError,
@@ -1147,13 +1446,10 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
-  if (!CheckEagerTensors(args, kFastPathExecuteInputStartIndex,
-                         op_def->input_arg_size())) {
-    // TODO(nareshmodi): Maybe some other way of signalling that this should
-    // fall back?
-    PyErr_SetString(PyExc_NotImplementedError,
-                    "This function does not handle the case of the path where "
-                    "all inputs are not already EagerTensors.");
+  if (!CheckEagerTensors(args, kFastPathExecuteInputStartIndex, *op_def)) {
+    RaiseFallbackException(
+        "This function does not handle the case of the path where "
+        "all inputs are not already EagerTensors.");
     return nullptr;
   }
 
@@ -1167,62 +1463,236 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     return nullptr;
   }
 
-  TFE_OpSetDevice(op, device_name, status);
-  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
-    return nullptr;
+  // Mapping of attr name to size - used to calculate the number of values
+  // to be expected by the TFE_Execute run.
+  tensorflow::gtl::FlatMap<string, tensorflow::int64> attr_list_sizes;
+
+  // Set non-inferred attrs, including setting defaults if the attr is passed in
+  // as None.
+  for (int i = kFastPathExecuteInputStartIndex + op_def->input_arg_size();
+       i < args_size; i += 2) {
+    PyObject* py_attr_name = PyTuple_GET_ITEM(args, i);
+    const tensorflow::StringPiece attr_name(TFE_GetPythonString(py_attr_name));
+    PyObject* py_attr_value = PyTuple_GET_ITEM(args, i + 1);
+
+    // Not creating an index since most of the time there are not more than a
+    // few attrs.
+    // TODO(nareshmodi): Maybe include the index as part of the
+    // OpRegistrationData.
+    for (const auto& attr : op_def->attr()) {
+      if (attr_name == attr.name()) {
+        SetOpAttrWithDefaults(ctx, op, attr, attr_name.data(), py_attr_value,
+                              &attr_list_sizes, status);
+
+        if (TF_GetCode(status) != TF_OK) {
+          RaiseFallbackException(TF_Message(status));
+          return nullptr;
+        }
+
+        break;
+      }
+    }
   }
 
-  // Add non-type attrs.
-  SetOpAttrs(ctx, op, args,
-             kFastPathExecuteInputStartIndex + op_def->input_arg_size(),
-             status);
+  TFE_OpSetDevice(op, device_name, status);
   if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
     return nullptr;
   }
 
-  // Add type attrs and inputs.
+  // TODO(nareshmodi): Add a benchmark for the fast-path with gradient callbacks
+  // (similar to benchmark_tf_gradient_function_*). Also consider using an
+  // InlinedVector for flattened_attrs and flattened_inputs if the benchmarks
+  // point out problems with heap allocs.
+  bool run_gradient_callback = !*ThreadTapeIsStopped() &&
+                               !GetTapeSet()->empty() &&
+                               record_gradient_callback != Py_None;
+  bool run_post_exec_callbacks =
+      callbacks != Py_None && PyList_Size(callbacks) > 0;
+  bool run_callbacks = run_gradient_callback || run_post_exec_callbacks;
+  // Flat attrs and inputs as required by the record_gradient call. The attrs
+  // here only contain inferred attrs (non-inferred attrs are added directly
+  // from the input args).
+  // All items in flattened_attrs contain new references.
+  // All items in flattened_inputs contain borrowed references.
+  // TODO(nareshmodi): figure out why PyList_New/PyList_Append don't work
+  // directly.
+  std::unique_ptr<std::vector<PyObject*>> flattened_attrs = nullptr;
+  std::unique_ptr<std::vector<PyObject*>> flattened_inputs = nullptr;
+
+  if (run_callbacks) {
+    flattened_attrs.reset(new std::vector<PyObject*>);
+    flattened_inputs.reset(new std::vector<PyObject*>);
+  }
+
+  // Add inferred attrs and inputs.
+  // The following code might set duplicate type attrs. This will result in
+  // the CacheKey for the generated AttrBuilder possibly differing from
+  // those where the type attrs are correctly set. Inconsistent CacheKeys
+  // for ops means that there might be unnecessarily duplicated kernels.
+  // TODO(nareshmodi): Fix this.
   for (int i = 0; i < op_def->input_arg_size(); i++) {
     const auto& input_arg = op_def->input_arg(i);
 
     PyObject* input =
         PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex + i);
-    TFE_TensorHandle* input_handle = EagerTensor_Handle(input);
-
-    // The following code might set duplicate type attrs. This will result in
-    // the CacheKey for the generated AttrBuilder possibly differing from those
-    // where the type attrs are correctly set. Inconsistent CacheKeys for ops
-    // means that there might be unnecessarily duplicated kernels.
-    // TODO(nareshmodi): Fix this.
-    if (!input_arg.type_attr().empty()) {
-      TFE_OpSetAttrType(op, input_arg.type_attr().data(),
-                        TFE_TensorHandleDataType(input_handle));
+    if (!input_arg.number_attr().empty()) {
+      // The item is a homogeneous list.
+      if (!RaiseIfNotPyList(input, input_arg.number_attr())) return nullptr;
+      Py_ssize_t len = PyList_Size(input);
+
+      TFE_OpSetAttrInt(op, input_arg.number_attr().data(), len);
+      if (run_callbacks) {
+        flattened_attrs->push_back(
+            GetPythonObjectFromString(input_arg.number_attr().data()));
+        flattened_attrs->push_back(PyLong_FromLong(len));
+      }
+      attr_list_sizes[input_arg.number_attr()] = len;
+
+      if (len > 0) {
+        // First item adds the type attr.
+        if (!AddInputToOp(PyList_GET_ITEM(input, 0), &input_arg,
+                          flattened_attrs.get(), flattened_inputs.get(), op,
+                          status)) {
+          return nullptr;
+        }
+
+        for (Py_ssize_t j = 1; j < len; j++) {
+          // Since the list is homogeneous, we don't need to re-add the attr.
+          if (!AddInputToOp(PyList_GET_ITEM(input, j), nullptr /* input_arg */,
+                            nullptr /* flattened_attrs */,
+                            flattened_inputs.get(), op, status)) {
+            return nullptr;
+          }
+        }
+      }
+    } else if (!input_arg.type_list_attr().empty()) {
+      // The item is a heterogeneous list.
+      if (!RaiseIfNotPyList(input, input_arg.type_list_attr())) return nullptr;
+      const string& attr_name = input_arg.type_list_attr();
+      Py_ssize_t len = PyList_Size(input);
+      tensorflow::gtl::InlinedVector<TF_DataType, 4> attr_value(len);
+      PyObject* py_attr_value = nullptr;
+      if (run_callbacks) {
+        py_attr_value = PyTuple_New(len);
+      }
+      for (Py_ssize_t j = 0; j < len; j++) {
+        PyObject* py_input = PyList_GET_ITEM(input, j);
+        TFE_TensorHandle* input_handle = EagerTensor_Handle(py_input);
+        attr_value[j] = TFE_TensorHandleDataType(input_handle);
+
+        TFE_OpAddInput(op, input_handle, status);
+        if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
+          return nullptr;
+        }
+
+        if (run_callbacks) {
+          flattened_inputs->push_back(py_input);
+
+          PyTuple_SET_ITEM(py_attr_value, j, PyLong_FromLong(attr_value[j]));
+        }
+      }
+      if (run_callbacks) {
+        flattened_attrs->push_back(GetPythonObjectFromString(attr_name.data()));
+        flattened_attrs->push_back(py_attr_value);
+      }
+      TFE_OpSetAttrTypeList(op, attr_name.data(), attr_value.data(),
+                            attr_value.size());
+      attr_list_sizes[attr_name] = len;
+    } else {
+      // The item is a single item.
+      if (!AddInputToOp(input, &input_arg, flattened_attrs.get(),
+                        flattened_inputs.get(), op, status)) {
+        return nullptr;
+      }
     }
+  }
 
-    TFE_OpAddInput(op, input_handle, status);
-    if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
-      return nullptr;
+  int num_retvals = 0;
+  for (int i = 0; i < op_def->output_arg_size(); i++) {
+    const auto& output_arg = op_def->output_arg(i);
+    if (!output_arg.number_attr().empty()) {
+      num_retvals += attr_list_sizes[output_arg.number_attr()];
+    } else if (!output_arg.type_list_attr().empty()) {
+      num_retvals += attr_list_sizes[output_arg.type_list_attr()];
+    } else {
+      num_retvals++;
     }
   }
 
-  int num_retvals = op_def->output_arg_size();
   tensorflow::gtl::InlinedVector<TFE_TensorHandle*, 2> retvals(num_retvals);
 
   Py_BEGIN_ALLOW_THREADS;
   TFE_Execute(op, retvals.data(), &num_retvals, status);
   Py_END_ALLOW_THREADS;
-  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
+  if (TF_GetCode(status) != TF_OK) {
+    // Augment the status with the op_name for easier debugging similar to
+    // TFE_Py_Execute.
+    TF_SetStatus(status, TF_GetCode(status),
+                 tensorflow::strings::StrCat(TF_Message(status), " [Op:",
+                                             TFE_GetPythonString(op_name), "]")
+                     .c_str());
+
+    MaybeRaiseExceptionFromTFStatus(status, nullptr);
     return nullptr;
   }
 
-  PyObject* result = PyTuple_New(num_retvals);
+  PyObject* flat_result = PyList_New(num_retvals);
   for (int i = 0; i < num_retvals; ++i) {
-    PyTuple_SET_ITEM(result, i, EagerTensorFromHandle(retvals[i]));
+    PyList_SET_ITEM(flat_result, i, EagerTensorFromHandle(retvals[i]));
   }
 
-  if (!MaybeRunRecordGradientCallback(op_def, args, result,
-                                      record_gradient_callback)) {
+  if (run_callbacks &&
+      !RunCallbacks(run_gradient_callback, run_post_exec_callbacks, op_def,
+                    args, *flattened_inputs, *flattened_attrs, flat_result,
+                    op_name, name, record_gradient_callback, callbacks)) {
     return nullptr;
   }
 
+  // Unflatten results.
+  if (op_def->output_arg_size() == 0) {
+    Py_RETURN_NONE;
+  }
+
+  if (op_def->output_arg_size() == 1) {
+    if (!op_def->output_arg(0).number_attr().empty() ||
+        !op_def->output_arg(0).type_list_attr().empty()) {
+      return flat_result;
+    } else {
+      auto* result = PyList_GET_ITEM(flat_result, 0);
+      Py_INCREF(result);
+      Py_DECREF(flat_result);
+      return result;
+    }
+  }
+
+  // Correctly output the results that are made into a namedtuple.
+  PyObject* result = PyList_New(op_def->output_arg_size());
+  int flat_result_index = 0;
+  for (int i = 0; i < op_def->output_arg_size(); i++) {
+    if (!op_def->output_arg(i).number_attr().empty()) {
+      int list_length = attr_list_sizes[op_def->output_arg(i).number_attr()];
+      PyObject* inner_list = PyList_New(list_length);
+      for (int j = 0; j < list_length; j++) {
+        PyObject* obj = PyList_GET_ITEM(flat_result, flat_result_index++);
+        Py_INCREF(obj);
+        PyList_SET_ITEM(inner_list, j, obj);
+      }
+      PyList_SET_ITEM(result, i, inner_list);
+    } else if (!op_def->output_arg(i).type_list_attr().empty()) {
+      int list_length = attr_list_sizes[op_def->output_arg(i).type_list_attr()];
+      PyObject* inner_list = PyList_New(list_length);
+      for (int j = 0; j < list_length; j++) {
+        PyObject* obj = PyList_GET_ITEM(flat_result, flat_result_index++);
+        Py_INCREF(obj);
+        PyList_SET_ITEM(inner_list, j, obj);
+      }
+      PyList_SET_ITEM(result, i, inner_list);
+    } else {
+      PyObject* obj = PyList_GET_ITEM(flat_result, flat_result_index++);
+      Py_INCREF(obj);
+      PyList_SET_ITEM(result, i, obj);
+    }
+  }
+  Py_DECREF(flat_result);
   return result;
 }
diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py
index d4f4ed592f..49323e6640 100644
--- a/tensorflow/python/eager/pywrap_tfe_test.py
+++ b/tensorflow/python/eager/pywrap_tfe_test.py
@@ -21,28 +21,15 @@ from __future__ import print_function
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import execute
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 
 
-def record_gradient_callback(inputs, attrs, results):
-  return backprop._record_gradient("MatMul", inputs, attrs, results, None)
-
-
-def c_tfe_py_fastpath_execute(a, b, transpose_a=False, transpose_b=False):
-  ctx = context.context()
-  assert not ctx.in_graph_mode(
-  ), "The prototype doesn't contain C code for graph construction"
-  ctx_handle = ctx._handle  # pylint: disable=protected-access
-
-  return pywrap_tensorflow.TFE_Py_FastPathExecute(
-      ctx_handle, ctx.device_name, "MatMul", record_gradient_callback, a, b,
-      "transpose_a", transpose_a, "transpose_b", transpose_b)[0]
-
-
 class Tests(test.TestCase):
 
   @test_util.assert_no_new_tensors
@@ -54,31 +41,100 @@ class Tests(test.TestCase):
     a_100_by_784 = random_ops.random_uniform((100, 784))
     b_100_by_784 = random_ops.random_uniform((100, 784))
 
+    ctx = context.context()
+
     self.assertAllClose(
         math_ops.matmul(a_2_by_2, b_2_by_2),
-        c_tfe_py_fastpath_execute(a_2_by_2, b_2_by_2))
+        pywrap_tensorflow.TFE_Py_FastPathExecute(
+            ctx._handle, ctx.device_name, "MatMul", execute.record_gradient,
+            None, None, a_2_by_2, b_2_by_2, "transpose_a", False, "transpose_b",
+            False))
     self.assertAllClose(
         math_ops.matmul(a_100_by_784, b_100_by_784, transpose_b=True),
-        c_tfe_py_fastpath_execute(a_100_by_784, b_100_by_784, transpose_b=True))
+        pywrap_tensorflow.TFE_Py_FastPathExecute(
+            ctx._handle, ctx.device_name, "MatMul", execute.record_gradient,
+            None, None, a_100_by_784, b_100_by_784, "transpose_a", False,
+            "transpose_b", True))
 
   @test_util.assert_no_new_tensors
   @test_util.assert_no_garbage_created
   def testFastpathExecute_TapeWrite(self):
+    ctx = context.context()
     with backprop.GradientTape(persistent=True) as tape:
       a_2_by_2 = constant_op.constant(1.0, shape=[2, 2])
       tape.watch(a_2_by_2)
-      z = c_tfe_py_fastpath_execute(a_2_by_2, a_2_by_2)
+      z = pywrap_tensorflow.TFE_Py_FastPathExecute(
+          ctx._handle, ctx.device_name, "MatMul", execute.record_gradient, None,
+          None, a_2_by_2, a_2_by_2, "transpose_a", False, "transpose_b", False)
     dz_dy = tape.gradient(z, [a_2_by_2])[0]
     self.assertAllEqual(dz_dy.numpy(),
                         constant_op.constant(4.0, shape=[2, 2]).numpy())
 
+  # Tests homogeneous list op
   @test_util.assert_no_new_tensors
   @test_util.assert_no_garbage_created
-  def testFastpathExecute_MatMulSlowPath(self):
-    a_2_by_2 = random_ops.random_uniform((2, 2)).cpu().numpy()
+  def testFastpathExecute_AddNCorrectResponse(self):
+    ctx = context.context()
+    a_2_by_2 = random_ops.random_uniform((2, 2))
+    b_2_by_2 = random_ops.random_uniform((2, 2))
+
+    self.assertAllClose(
+        math_ops.add_n([a_2_by_2, b_2_by_2]),
+        pywrap_tensorflow.TFE_Py_FastPathExecute(
+            ctx._handle, ctx.device_name, "AddN", execute.record_gradient, None,
+            None, [a_2_by_2, b_2_by_2]))
+
+  # Tests homogeneous list op
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_AddNTapeWrite(self):
+    ctx = context.context()
+    a_2_by_2 = random_ops.random_uniform((2, 2))
+    b_2_by_2 = random_ops.random_uniform((2, 2))
 
-    with self.assertRaises(NotImplementedError):
-      c_tfe_py_fastpath_execute(a_2_by_2, a_2_by_2)
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(a_2_by_2)
+      tape.watch(b_2_by_2)
+      z1 = pywrap_tensorflow.TFE_Py_FastPathExecute(
+          ctx._handle, ctx.device_name, "AddN", execute.record_gradient, None,
+          None, [a_2_by_2, b_2_by_2])
+      z2 = math_ops.add_n([a_2_by_2, b_2_by_2])
+    dz1_dy = tape.gradient(z1, [a_2_by_2])[0]
+    dz2_dy = tape.gradient(z2, [a_2_by_2])[0]
+    self.assertAllEqual(dz1_dy.numpy(), dz2_dy.numpy())
+
+  # Tests heterogeneous list op
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_IdentityNCorrectResponse(self):
+    ctx = context.context()
+    a_2_by_2 = random_ops.random_uniform((2, 2))
+    b_2_by_2 = random_ops.random_uniform((2, 2))
+
+    self.assertAllClose(
+        array_ops.identity_n([a_2_by_2, b_2_by_2]),
+        pywrap_tensorflow.TFE_Py_FastPathExecute(
+            ctx._handle, ctx.device_name, "IdentityN", execute.record_gradient,
+            None, None, [a_2_by_2, b_2_by_2]))
+
+  # Tests heterogeneous list op
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_IdentityNTapeWrite(self):
+    ctx = context.context()
+    a_2_by_2 = random_ops.random_uniform((2, 2))
+    b_2_by_2 = random_ops.random_uniform((2, 2))
+
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(a_2_by_2)
+      tape.watch(b_2_by_2)
+      z1 = pywrap_tensorflow.TFE_Py_FastPathExecute(
+          ctx._handle, ctx.device_name, "IdentityN", execute.record_gradient,
+          None, None, [a_2_by_2, b_2_by_2])
+      z2 = array_ops.identity_n([a_2_by_2, b_2_by_2])
+    dz1_dy = tape.gradient(z1[0], [a_2_by_2])[0]
+    dz2_dy = tape.gradient(z2[0], [a_2_by_2])[0]
+    self.assertAllEqual(dz1_dy.numpy(), dz2_dy.numpy())
 
   @test_util.assert_no_new_tensors
   @test_util.assert_no_garbage_created
@@ -89,20 +145,24 @@ class Tests(test.TestCase):
     ), "The prototype doesn't contain C code for graph construction"
     ctx_handle = ctx._handle  # pylint: disable=protected-access
 
+    # Not enough base params
     with self.assertRaisesRegexp(ValueError,
-                                 "at least 4 items in the input tuple"):
+                                 "at least 6 items in the input tuple"):
       pywrap_tensorflow.TFE_Py_FastPathExecute(ctx_handle, ctx.device_name,
                                                "Identity")
 
+    # Not enough inputs
     with self.assertRaisesRegexp(ValueError,
-                                 "Expected to be at least 5, was 4"):
+                                 "Expected to be at least 7, was 6"):
       pywrap_tensorflow.TFE_Py_FastPathExecute(
-          ctx_handle, ctx_handle, "Identity", record_gradient_callback)
+          ctx_handle, ctx_handle, "Identity", backprop._record_gradient, None,
+          [])
 
+    # Bad type
     with self.assertRaisesRegexp(TypeError, "expected a string for op_name"):
       pywrap_tensorflow.TFE_Py_FastPathExecute(
-          ctx_handle, ctx.device_name, ctx_handle, record_gradient_callback,
-          a_2_by_2)
+          ctx_handle, ctx.device_name, ctx_handle, backprop._record_gradient,
+          None, [], a_2_by_2)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 3f25311a83..50f481d29e 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -29,6 +29,7 @@ limitations under the License.
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
 %rename("%s") TFE_Py_RegisterExceptionClass;
+%rename("%s") TFE_Py_RegisterFallbackExceptionClass;
 %rename("%s") TFE_Py_Execute;
 %rename("%s") TFE_Py_FastPathExecute;
 %rename("%s") TFE_Py_UID;
-- 
GitLab


From 5eba03ef1098de5918973d6e691e5cda31e3147a Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Tue, 6 Feb 2018 13:54:56 -0800
Subject: [PATCH 729/966] Another rolling back of performance regression.

PiperOrigin-RevId: 184734426
---
 tensorflow/core/framework/tensor.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 94c39c53a6..62c42ba652 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -660,8 +660,7 @@ void Tensor::FillDimsAndValidateCompatibleShape(
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::shaped(
     gtl::ArraySlice<int64> new_sizes) {
-  CheckType(DataTypeToEnum<T>::v());
-  CHECK(IsAligned());
+  CheckTypeAndIsAligned(DataTypeToEnum<T>::v());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
   FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims);
-- 
GitLab


From bfe8b85cad3be1a82234500fce3064c98dd20d09 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Tue, 6 Feb 2018 14:04:35 -0800
Subject: [PATCH 730/966] 1. Use tensorflow::string instead of std::string 2.
 Fix swig problem in trt_conversion.i 3. Fix c++ styles 4. Fix BUILD
 dependencies

---
 tensorflow/contrib/tensorrt/BUILD             |  29 +-
 .../contrib/tensorrt/convert/convert_graph.cc |  57 +-
 .../contrib/tensorrt/convert/convert_graph.h  |   4 +-
 .../contrib/tensorrt/convert/convert_nodes.cc | 595 +++++++++---------
 .../contrib/tensorrt/kernels/trt_engine_op.cc |  10 +-
 tensorflow/contrib/tensorrt/log/trt_logger.h  |   4 +-
 .../contrib/tensorrt/segment/segment.cc       |   6 +-
 tensorflow/contrib/tensorrt/segment/segment.h |   6 +-
 .../contrib/tensorrt/segment/segment_test.cc  |  22 +-
 tensorflow/contrib/tensorrt/trt_conversion.i  |   6 +-
 10 files changed, 377 insertions(+), 362 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index bcd3c8c547..3fb4553a51 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -11,16 +11,17 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+    "tf_copts",
+    "tf_cuda_library",
     "tf_custom_op_library",
+    "tf_custom_op_library_additional_deps",
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
-    "tf_py_wrap_cc",
-    "tf_cc_test",
-    "tf_cuda_cc_test",
-    "tf_cuda_library",
-    "tf_custom_op_py_library",
-    "tf_copts",
 )
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load(
     "@local_config_tensorrt//:build_defs.bzl",
     "if_tensorrt",
@@ -62,12 +63,8 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = [
         ":trt_logging",
-        "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
         "@local_config_tensorrt//:nv_infer",
-        "@nsync//:nsync_headers",
-        "@protobuf_archive//:protobuf",
-    ],
+    ] + tf_custom_op_library_additional_deps(),
 )
 
 cc_library(
@@ -77,14 +74,11 @@ cc_library(
     copts = tf_copts(),
     deps = [
         ":trt_logging",
-        "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:gpu_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:stream_executor_headers_lib",
-        "//third_party/eigen3",
         "@local_config_tensorrt//:nv_infer",
-        "@nsync//:nsync_headers",
-    ],
+    ] + tf_custom_op_library_additional_deps(),
     alwayslink = 1,
 )
 
@@ -185,7 +179,6 @@ tf_cuda_library(
     deps = [
         ":segment",
         ":trt_logging",
-        "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:graph",
         "//tensorflow/core:protos_all_cc",
@@ -195,9 +188,7 @@ tf_cuda_library(
         "//tensorflow/core/grappler/optimizers:constant_folding",
         "//tensorflow/core/grappler/optimizers:layout_optimizer",
         "@local_config_tensorrt//:nv_infer",
-        "@nsync//:nsync_headers",
-        "@protobuf_archive//:protobuf_headers",
-    ],
+    ] + tf_custom_op_library_additional_deps(),
 )
 
 # Library for the segmenting portion of TensorRT operation creation
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index d94f8ac4ba..e3364467f7 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -15,16 +15,14 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
 
-#include <list>
 #include <map>
 #include <set>
-#include <sstream>
-#include <string>
 #include <unordered_map>
-#include <unordered_set>
 #include <utility>
 #include <vector>
 
+#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -40,15 +38,13 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorrt/include/NvInfer.h"
 
-//------------------------------------------------------------------------------
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
@@ -58,7 +54,7 @@ static bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
   // LINT.IfChange
   // TODO(jie): Segmentation shouldn't associated with op name.
   //            Split it into a registration for each kernel.
-  static const std::set<std::string> candidate_ops = {
+  static const std::set<string> candidate_ops = {
       "Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu",
       "Add",      "Mul",   "Sub",    "Rsqrt",   "Pad"  // "Placeholder" ,"Mean"
   };
@@ -95,22 +91,21 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
   }
 }
 
-std::pair<std::string, int> ParseTensorName(std::string name,
-                                            int default_idx = 0) {
+std::pair<string, int> ParseTensorName(string name, int default_idx = 0) {
   int idx = default_idx;
   size_t sep = name.find_last_of(':');
-  if (sep != std::string::npos) {
+  if (sep != string::npos) {
     name = name.substr(0, sep);
     idx = std::stoi(name.substr(sep + 1));
   }
   return std::make_pair(name, idx);
 }
 
-std::unordered_map<std::string, std::vector<int>> BuildTensorNameMap(
-    const std::vector<std::string>& tensor_names) {
-  std::unordered_map<std::string, std::vector<int>> result;
-  for (std::string const& tensor_name : tensor_names) {
-    std::string node_name;
+std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
+    const std::vector<string>& tensor_names) {
+  std::unordered_map<string, std::vector<int>> result;
+  for (string const& tensor_name : tensor_names) {
+    string node_name;
     int index;
     std::tie(node_name, index) = ParseTensorName(tensor_name);
     result[node_name].push_back(index);
@@ -119,10 +114,10 @@ std::unordered_map<std::string, std::vector<int>> BuildTensorNameMap(
 }
 
 tensorflow::Status ConvertSubGraphToTensorRT(
-    const std::vector<std::string>& output_names,
+    const std::vector<string>& output_names,
     const std::set<int>& subgraph_node_ids,
-    size_t max_batch_size,  // max batch size that engine will be created for
-    // max amount of memory that engine will be allowed to consume, in bytes
+    size_t max_batch_size,  // Max batch size that engine will be created for
+    // Max amount of memory that engine will be allowed to consume, in bytes
     size_t max_workspace_size_bytes,
     const tensorflow::grappler::GraphProperties& graph_properties,
     tensorflow::Graph* graph) {
@@ -163,7 +158,6 @@ tensorflow::Status ConvertSubGraphToTensorRT(
       &trt_node_def));
   tensorflow::Status status;
   tensorflow::Node* trt_node = graph->AddNode(trt_node_def, &status);
-
   TF_RETURN_IF_ERROR(status);
 
   // Re-map outgoing edges to use the new TRT node instead of the orig subgraph
@@ -175,7 +169,8 @@ tensorflow::Status ConvertSubGraphToTensorRT(
   for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
     int new_src_output = subgraph_edge_to_output_map.at(old_src);
-    graph->UpdateEdge(trt_node, new_src_output, edge->dst(), edge->dst_input());
+    TF_RETURN_IF_ERROR(graph->UpdateEdge(trt_node, new_src_output, edge->dst(),
+                                         edge->dst_input()));
   }
   // Remove the original subgraph
   for (int node_id : subgraph_node_ids) {
@@ -191,7 +186,7 @@ tensorflow::Status ConvertSubGraphToTensorRT(
 
 tensorflow::Status BuildNodeMap(
     const tensorflow::Graph& graph,
-    std::unordered_map<std::string, tensorflow::Node*>* node_map) {
+    std::unordered_map<string, tensorflow::Node*>* node_map) {
   for (auto* node : graph.op_nodes()) {
     if (!node_map->insert({node->name(), node}).second) {
       return tensorflow::errors::AlreadyExists(
@@ -205,19 +200,19 @@ tensorflow::Status BuildNodeMap(
 
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
-    const std::vector<std::string>& output_names, size_t max_batch_size,
+    const std::vector<string>& output_names, size_t max_batch_size,
     size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def) {
-  // optimization pass
+  // Optimization pass
   tensorflow::grappler::GrapplerItem item;
   item.fetch = output_names;
   tensorflow::GraphDef gdef;
 
-  // layout optimization
+  // Layout optimization
   item.graph = graph_def;
   tensorflow::grappler::LayoutOptimizer optimizer;
   tensorflow::grappler::Cluster* cluster;
 
-  // virtual cluster
+  // Virtual cluster
   tensorflow::DeviceProperties device_properties;
   device_properties.set_type("GPU");
   device_properties.mutable_environment()->insert({"architecture", "6"});
@@ -226,14 +221,14 @@ tensorflow::Status ConvertGraphDefToTensorRT(
 
   TF_RETURN_IF_ERROR(optimizer.Optimize(cluster, item, &gdef));
 
-  // constant folding
+  // Constant folding
   item.graph = gdef;
   tensorflow::grappler::ConstantFolding fold(nullptr);
   TF_RETURN_IF_ERROR(fold.Optimize(nullptr, item, &gdef));
 
   // AJ refactoring shape inference through grappler/GraphProperties.
   tensorflow::grappler::GraphProperties static_graph_properties(item);
-  static_graph_properties.InferStatically(false);
+  TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(false));
 
   // Build full graph
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
@@ -258,11 +253,11 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   if (segments.size() > 1) {
     VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
   }
-  std::unordered_map<std::string, tensorflow::Node*> node_map;
+  std::unordered_map<string, tensorflow::Node*> node_map;
   TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
-  for (const std::set<std::string>& subgraph_node_names : segments) {
+  for (const std::set<string>& subgraph_node_names : segments) {
     std::set<int> subgraph_node_ids;
-    for (const std::string& node_name : subgraph_node_names) {
+    for (const string& node_name : subgraph_node_names) {
       subgraph_node_ids.insert(node_map.at(node_name)->id());
     }
     TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index fc918a9ec2..154ad3f2e8 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -15,11 +15,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
 #define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
 
-#include <string>
 #include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -34,7 +34,7 @@ namespace convert {
 //                 engine building.
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
-    const std::vector<std::string>& output_names, size_t max_batch_size,
+    const std::vector<string>& output_names, size_t max_batch_size,
     size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def);
 
 }  // namespace convert
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index c84684d485..779d15a4b1 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -20,21 +20,26 @@ limitations under the License.
 #include <map>
 #include <memory>
 #include <set>
-#include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/tensor_coding.h"
+#include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -51,8 +56,8 @@ namespace convert {
 
 namespace {
 
-inline tensorflow::Status convert_dtype(tensorflow::DataType tf_dtype,
-                                        nvinfer1::DataType* trt_dtype) {
+inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
+                                       nvinfer1::DataType* trt_dtype) {
   switch (tf_dtype) {
     case tensorflow::DataType::DT_FLOAT:
       *trt_dtype = nvinfer1::DataType::kFLOAT;
@@ -69,7 +74,7 @@ inline tensorflow::Status convert_dtype(tensorflow::DataType tf_dtype,
   return tensorflow::Status::OK();
 }
 
-inline nvinfer1::Dims get_tensor_shape(const tensorflow::Tensor& tensor) {
+inline nvinfer1::Dims GetTensorShape(const tensorflow::Tensor& tensor) {
   nvinfer1::Dims dims;
   dims.nbDims = tensor.dims();
   for (int i = 0; i < dims.nbDims; i++) {
@@ -78,7 +83,7 @@ inline nvinfer1::Dims get_tensor_shape(const tensorflow::Tensor& tensor) {
   return dims;
 }
 
-inline int64_t get_shape_size(nvinfer1::Dims shape) {
+inline int64_t GetShapeSize(nvinfer1::Dims shape) {
   // Returns total number of elements in shape
   int64_t count = 1;
   for (int d = 0; d < shape.nbDims; ++d) {
@@ -87,24 +92,24 @@ inline int64_t get_shape_size(nvinfer1::Dims shape) {
   return count;
 }
 
-static std::vector<std::pair<int, int>> createSamePadding(
-    nvinfer1::DimsHW& stride, nvinfer1::DimsHW& kernel,
-    std::vector<int64_t> inputDims) {
-  std::vector<std::pair<int, int>> padding(inputDims.size());
-  CHECK_EQ((size_t)stride.nbDims, inputDims.size());  // TODO(jie): N+C? NC+?
+static std::vector<std::pair<int, int>> CreateSamePadding(
+    const nvinfer1::DimsHW& stride, const nvinfer1::DimsHW& kernel,
+    const std::vector<int64_t>& input_dims) {
+  std::vector<std::pair<int, int>> padding(input_dims.size());
+  CHECK_EQ((size_t)stride.nbDims, input_dims.size());  // TODO(jie): N+C? NC+?
 
-  for (size_t i = 0; i < inputDims.size(); ++i) {
-    // formula to calculate the padding
-    int p = ((inputDims[i] - 1) / stride.d[i]) * stride.d[i] + kernel.d[i] -
-            inputDims[i];
+  for (size_t i = 0; i < input_dims.size(); ++i) {
+    // Formula to calculate the padding
+    int p = ((input_dims[i] - 1) / stride.d[i]) * stride.d[i] + kernel.d[i] -
+            input_dims[i];
     p = (p > 0) ? p : 0;
 
-    // right precedence padding, like in TensorFlow
+    // Right precedence padding, like in TensorFlow
     int left = p / 2;
     int right = p - left;
 
     VLOG(2) << "PADDING_" << i << " pre: " << left << ", post: " << right
-            << "paras: " << inputDims[i] << ", " << stride.d[i] << ", "
+            << "paras: " << input_dims[i] << ", " << stride.d[i] << ", "
             << "kernel: " << kernel.d[i];
     padding[i] = {left, right};
   }
@@ -113,53 +118,69 @@ static std::vector<std::pair<int, int>> createSamePadding(
 
 class TRT_ShapedWeights {
  public:
-  nvinfer1::Dims shape_;
-  tensorflow::DataType type_;
-  const void* values_;
-  bool dummy_flag_;
+  TRT_ShapedWeights(tensorflow::DataType type, const void* values,
+                    nvinfer1::Dims shape, bool owned_values = false)
+      : shape_(shape),
+        type_(type),
+        values_(values),
+        owned_values_(owned_values),
+        dummy_flag_(false) {
+    // Note: this->shape.type[] is not used
+  }
+
+  explicit TRT_ShapedWeights(tensorflow::DataType type)
+      : type_(type),
+        values_(nullptr),
+        owned_values_(false),
+        dummy_flag_(true) {}
+
+  ~TRT_ShapedWeights() {
+    if (values_ && owned_values_) delete static_cast<const char*>(values_);
+  }
+
+  TRT_ShapedWeights(const TRT_ShapedWeights&) = default;
+
   int64_t count() const {
     int64_t c = 1;
     for (int i = 0; i < shape_.nbDims; i++) c *= shape_.d[i];
     return c;
   }
-  TRT_ShapedWeights(tensorflow::DataType type, const void* values,
-                    nvinfer1::Dims shape)
-      : shape_(shape), type_(type), values_(values), dummy_flag_(false) {
-    // Note: this->shape.type[] is not used
-  }
-  explicit TRT_ShapedWeights(tensorflow::DataType type)
-      : type_(type), values_(nullptr), dummy_flag_(true) {}
-  nvinfer1::Weights getWeightsForTRT() const {
+
+  nvinfer1::Weights GetWeightsForTRT() const {
     nvinfer1::DataType trt_type(nvinfer1::DataType::kFLOAT);
-    TF_CHECK_OK(convert_dtype(type_, &trt_type));
+    TF_CHECK_OK(ConvertDType(type_, &trt_type));
     if (dummy_flag_) return nvinfer1::Weights{trt_type, nullptr, 0};
 
     // Note: this->shape.type[] is not used
-    return nvinfer1::Weights{trt_type, values_, get_shape_size(shape_)};
+    return nvinfer1::Weights{trt_type, values_, GetShapeSize(shape_)};
   }
+
   size_t size_bytes() const {
     int type_size = tensorflow::DataTypeSize(this->type_);
     return this->count() * type_size;
   }
-  // default converter
-  operator nvinfer1::Weights() const { return getWeightsForTRT(); }
+
+  // Default converter
+  operator nvinfer1::Weights() const { return GetWeightsForTRT(); }
+
+  nvinfer1::Dims shape_;
+  tensorflow::DataType type_;
+  const void* values_;
+  bool owned_values_;
+  bool dummy_flag_;
 };
 
 class TRT_TensorOrWeights {
-  union {
-    nvinfer1::ITensor* _tensor_;
-    TRT_ShapedWeights _weights_;
-  };
-  enum { TRT_NODE_TENSOR, TRT_NODE_WEIGHTS } _variant_;
-
  public:
   explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor)
       : _tensor_(tensor), _variant_(TRT_NODE_TENSOR) {}
-  explicit TRT_TensorOrWeights(TRT_ShapedWeights const& weights)
+  TRT_TensorOrWeights(const TRT_ShapedWeights& weights)
       : _weights_(weights), _variant_(TRT_NODE_WEIGHTS) {}
-  TRT_TensorOrWeights() = delete;
+  ~TRT_TensorOrWeights() {}
+
   bool is_tensor() const { return _variant_ == TRT_NODE_TENSOR; }
   bool is_weights() const { return _variant_ == TRT_NODE_WEIGHTS; }
+
   nvinfer1::ITensor* tensor() {
     CHECK_EQ(this->is_tensor(), true);
     return _tensor_;
@@ -172,7 +193,7 @@ class TRT_TensorOrWeights {
     CHECK_EQ(this->is_weights(), true);
     return _weights_;
   }
-  TRT_ShapedWeights const& weights() const {
+  const TRT_ShapedWeights& weights() const {
     CHECK_EQ(this->is_weights(), true);
     return _weights_;
   }
@@ -183,19 +204,20 @@ class TRT_TensorOrWeights {
       return this->weights().shape_;
     }
   }
-};
 
-class TRT_LayerOrWeights {
+ private:
   union {
-    nvinfer1::ILayer* _layer_;
+    nvinfer1::ITensor* _tensor_;
     TRT_ShapedWeights _weights_;
   };
-  enum { TRT_NODE_LAYER, TRT_NODE_WEIGHTS } _variant_;
+  enum { TRT_NODE_TENSOR, TRT_NODE_WEIGHTS } _variant_;
+};
 
+class TRT_LayerOrWeights {
  public:
   explicit TRT_LayerOrWeights(nvinfer1::ILayer* layer)
       : _layer_(layer), _variant_(TRT_NODE_LAYER) {}
-  explicit TRT_LayerOrWeights(TRT_ShapedWeights const& weights)
+  explicit TRT_LayerOrWeights(const TRT_ShapedWeights& weights)
       : _weights_(weights), _variant_(TRT_NODE_WEIGHTS) {}
   bool is_layer() const { return _variant_ == TRT_NODE_LAYER; }
   bool is_weights() const { return _variant_ == TRT_NODE_WEIGHTS; }
@@ -216,46 +238,54 @@ class TRT_LayerOrWeights {
       return TRT_TensorOrWeights(_weights_);
     }
   }
+
+ private:
+  union {
+    nvinfer1::ILayer* _layer_;
+    TRT_ShapedWeights _weights_;
+  };
+  enum { TRT_NODE_LAYER, TRT_NODE_WEIGHTS } _variant_;
 };
 
 class TFAttrs {
-  typedef std::map<std::string, tensorflow::AttrValue const*> AttrMap;
-  AttrMap _attrs;
-
  public:
-  explicit TFAttrs(tensorflow::NodeDef const& tf_node) {
-    for (auto const& attr : tf_node.attr()) {
+  explicit TFAttrs(const tensorflow::NodeDef& tf_node) {
+    for (const auto& attr : tf_node.attr()) {
       _attrs.insert({attr.first, &attr.second});
     }
   }
-  bool count(std::string key) const { return _attrs.count(key); }
-  tensorflow::AttrValue const* at(std::string key) const {
+  bool count(string key) const { return _attrs.count(key); }
+  tensorflow::AttrValue const* at(string key) const {
     if (!_attrs.count(key)) {
       LOG(FATAL) << "Attribute not found: " << key;
     }
     return _attrs.at(key);
   }
   template <typename T>
-  T get(std::string key) const;
+  T get(string key) const;
   template <typename T>
-  T getShape(std::string key) const;
-  template <typename T>
-  T get(std::string key, T const& default_value) const {
+  T get(string key, const T& default_value) const {
     return _attrs.count(key) ? this->get<T>(key) : default_value;
   }
+
+ private:
+  typedef std::map<string, tensorflow::AttrValue const*> AttrMap;
+  AttrMap _attrs;
 };
 
 template <>
-std::string TFAttrs::get<std::string>(std::string key) const {
+string TFAttrs::get<string>(string key) const {
   return this->at(key)->s();
 }
+
 template <>
-std::vector<int> TFAttrs::get<std::vector<int>>(std::string key) const {
+std::vector<int> TFAttrs::get<std::vector<int>>(string key) const {
   auto attr = this->at(key)->list().i();
   return std::vector<int>(attr.begin(), attr.end());
 }
+
 template <>
-nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(std::string key) const {
+nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
   auto values = this->get<std::vector<int>>(key);
   nvinfer1::Dims dims;
   dims.nbDims = values.size();
@@ -265,19 +295,19 @@ nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(std::string key) const {
 }
 
 template <>
-nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(std::string key) const {
+nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(string key) const {
   nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
-  TF_CHECK_OK(convert_dtype(this->at(key)->type(), &trt_dtype));
+  TF_CHECK_OK(ConvertDType(this->at(key)->type(), &trt_dtype));
   return trt_dtype;
 }
 
 template <>
-tensorflow::DataType TFAttrs::get<tensorflow::DataType>(std::string key) const {
+tensorflow::DataType TFAttrs::get<tensorflow::DataType>(string key) const {
   return this->at(key)->type();
 }
 
 template <typename T>
-void reorder4(nvinfer1::DimsNCHW shape, T const* idata,
+void Reorder4(nvinfer1::DimsNCHW shape, T const* idata,
               nvinfer1::DimsNCHW istrides, T* odata,
               nvinfer1::DimsNCHW ostrides) {
   for (int n = 0; n < shape.n(); ++n) {
@@ -293,8 +323,8 @@ void reorder4(nvinfer1::DimsNCHW shape, T const* idata,
   }
 }
 
-void reorder_rsck_to_kcrs(TRT_ShapedWeights const& iweights,
-                          TRT_ShapedWeights* oweights) {
+void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
+                       TRT_ShapedWeights* oweights) {
   CHECK_EQ(iweights.type_, oweights->type_);
   CHECK_EQ(iweights.size_bytes(), oweights->size_bytes());
   int r = iweights.shape_.d[0];
@@ -309,7 +339,7 @@ void reorder_rsck_to_kcrs(TRT_ShapedWeights const& iweights,
   nvinfer1::DimsNCHW ostrides = {c * r * s, r * s, s, 1};
   switch (iweights.type_) {
     case tensorflow::DataType::DT_FLOAT:
-      reorder4(
+      Reorder4(
           {k, c, r, s}, static_cast<float const*>(iweights.values_), istrides,
           static_cast<float*>(const_cast<void*>(oweights->values_)), ostrides);
       break;
@@ -336,23 +366,23 @@ inline std::shared_ptr<T> infer_object(T* obj) {
 class Converter;
 
 using OpConverter =
-    std::function<tensorflow::Status(Converter&, tensorflow::NodeDef const&,
+    std::function<tensorflow::Status(Converter&, const tensorflow::NodeDef&,
                                      std::vector<TRT_TensorOrWeights> const&,
                                      std::vector<TRT_TensorOrWeights>*)>;
 
 class Converter {
-  std::unordered_map<std::string, TRT_TensorOrWeights> _trt_tensors;
-  std::unordered_map<std::string, OpConverter> _op_registry;
+  std::unordered_map<string, TRT_TensorOrWeights> _trt_tensors;
+  std::unordered_map<string, OpConverter> _op_registry;
   nvinfer1::INetworkDefinition* _trt_network;
   std::list<std::vector<uint8_t>> _temp_bufs;
 
   void register_op_converters();
 
   std::vector<TRT_TensorOrWeights> get_inputs(
-      tensorflow::NodeDef const& node_def) {
+      const tensorflow::NodeDef& node_def) {
     std::vector<TRT_TensorOrWeights> inputs;
-    for (auto const& input_name : node_def.input()) {
-      VLOG(2) << "retrieve input: " << input_name;
+    for (const auto& input_name : node_def.input()) {
+      VLOG(2) << "Retrieve input: " << input_name;
       inputs.push_back(_trt_tensors.at(input_name));
     }
     return inputs;
@@ -373,16 +403,16 @@ class Converter {
     return weights;
   }
 
-  TRT_ShapedWeights get_temp_weights_like(TRT_ShapedWeights const& weights) {
+  TRT_ShapedWeights get_temp_weights_like(const TRT_ShapedWeights& weights) {
     return this->get_temp_weights(weights.type_, weights.shape_);
   }
 
-  tensorflow::Status convert_node(tensorflow::NodeDef const& node_def) {
+  tensorflow::Status convert_node(const tensorflow::NodeDef& node_def) {
     std::vector<TRT_TensorOrWeights> inputs = this->get_inputs(node_def);
-    std::string op = node_def.op();
+    string op = node_def.op();
     if (!_op_registry.count(op)) {
       return tensorflow::errors::Unimplemented(
-          "no converter registered for op: " + op);
+          "No converter registered for op: " + op);
     }
     OpConverter op_converter = _op_registry.at(op);
     std::vector<TRT_TensorOrWeights> outputs;
@@ -390,15 +420,15 @@ class Converter {
     for (size_t i = 0; i < outputs.size(); ++i) {
       TRT_TensorOrWeights output = outputs.at(i);
       // TODO(jie): tf protobuf seems to be omitting the :0 suffix
-      std::string output_name = node_def.name();
+      string output_name = node_def.name();
       if (i != 0) output_name = output_name + ":" + std::to_string(i);
       if (output.is_tensor()) {
         output.tensor()->setName(output_name.c_str());
       }
-      VLOG(2) << "write out tensor: " << output_name;
+      VLOG(2) << "Write out tensor: " << output_name;
       if (!_trt_tensors.insert({output_name, output}).second) {
         return tensorflow::errors::AlreadyExists(
-            "output tensor already exists for op: " + op);
+            "Output tensor already exists for op: " + op);
       }
     }
     return tensorflow::Status::OK();
@@ -406,24 +436,24 @@ class Converter {
 
   nvinfer1::INetworkDefinition* network() { return _trt_network; }
 
-  TRT_TensorOrWeights get_tensor(std::string name) {
+  TRT_TensorOrWeights get_tensor(string name) {
     if (!_trt_tensors.count(name)) {
       return TRT_TensorOrWeights(nullptr);
     }
     return _trt_tensors.at(name);
   }
 
-  bool insert_input_tensor(std::string name, nvinfer1::ITensor* tensor) {
+  bool insert_input_tensor(string name, nvinfer1::ITensor* tensor) {
     return _trt_tensors.insert({name, TRT_TensorOrWeights(tensor)}).second;
   }
 
-  nvinfer1::ITensor* transposeTensor(nvinfer1::ITensor* input_tensor,
+  nvinfer1::ITensor* TransposeTensor(nvinfer1::ITensor* input_tensor,
                                      std::vector<int> order) {
     auto dims = input_tensor->getDimensions();
 
     // TODO(jie): change the return to status and properly exit
     if (order.size() - 1 != size_t(dims.nbDims))
-      LOG(ERROR) << "dimension does not match, fail gracefully";
+      LOG(ERROR) << "Dimension does not match, fail gracefully";
 
     nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor);
     nvinfer1::Permutation permutation;
@@ -432,13 +462,13 @@ class Converter {
     }
     layer->setFirstTranspose(permutation);
 
-    nvinfer1::Dims reshapeDims;
-    reshapeDims.nbDims = dims.nbDims;
-    for (int32_t i = 0; i < reshapeDims.nbDims; ++i) {
-      reshapeDims.d[i] = 0;
-      reshapeDims.type[i] = dims.type[i];
+    nvinfer1::Dims reshape_dims;
+    reshape_dims.nbDims = dims.nbDims;
+    for (int32_t i = 0; i < reshape_dims.nbDims; ++i) {
+      reshape_dims.d[i] = 0;
+      reshape_dims.type[i] = dims.type[i];
     }
-    layer->setReshapeDimensions(reshapeDims);
+    layer->setReshapeDimensions(reshape_dims);
     return layer->getOutput(0);
   }
 };
@@ -462,7 +492,7 @@ struct LambdaFactory {
       case OP_CATEGORY::NEG:
         return [](T t) -> T { return -t; };
       default:
-        VLOG(2) << "not supported op for unary: " << static_cast<int>(op);
+        VLOG(2) << "Not supported op for unary: " << static_cast<int>(op);
         return nullptr;
     }
   }
@@ -477,7 +507,7 @@ struct LambdaFactory {
       case OP_CATEGORY::MUL:
         return [](T l, T r) -> T { return l * r; };
       default:
-        LOG(WARNING) << "not supported op for binary: " << static_cast<int>(op);
+        LOG(WARNING) << "Not supported op for binary: " << static_cast<int>(op);
     }
     return [](T l, T r) -> T {
       LOG(FATAL) << "Unsupported op type ";
@@ -494,7 +524,7 @@ struct LambdaFactory {
           VLOG(2) << "LAMBDA VAL : " << val;
           return l + val;
         };
-      // return [val](T l)-> T {return l+val;};
+      // Return [val](T l)-> T {return l+val;};
       case OP_CATEGORY::SUB:
         return [val](T l) -> T {
           VLOG(2) << "LAMBDA VAL : " << val;
@@ -506,7 +536,7 @@ struct LambdaFactory {
           return l * val;
         };
       default:
-        LOG(WARNING) << "not supported op for binary: " << static_cast<int>(op);
+        LOG(WARNING) << "Not supported op for binary: " << static_cast<int>(op);
     }
     return [val](T l) -> T {
       LOG(FATAL) << "Unsupported op type ";
@@ -534,7 +564,7 @@ struct LambdaFactory {
           return val * l;
         };
       default:
-        LOG(ERROR) << "not supported op for binary: " << static_cast<int>(op);
+        LOG(ERROR) << "Not supported op for binary: " << static_cast<int>(op);
     }
     return [val](T l) -> T {
       LOG(FATAL) << "Unsupported op type ";
@@ -543,12 +573,10 @@ struct LambdaFactory {
   }
 };
 
-tensorflow::Status UnaryCompute(TRT_ShapedWeights const& iweights,
+tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
                                 TRT_ShapedWeights* oweights,
                                 LambdaFactory unary_op) {
-  // assume iweights.type == oweights.type
   CHECK_EQ(iweights.type_, oweights->type_);
-
   switch (iweights.type_) {
     case tensorflow::DataType::DT_FLOAT: {
       auto inp = static_cast<float const*>(iweights.values_);
@@ -557,17 +585,18 @@ tensorflow::Status UnaryCompute(TRT_ShapedWeights const& iweights,
       break;
     }
     default:
-      return tensorflow::errors::Unimplemented("data type not supported: " +
-                                               iweights.type_);
+      return tensorflow::errors::Unimplemented(
+          "Data type not supported: " +
+          tensorflow::DataTypeString(iweights.type_));
   }
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status BinaryCompute(TRT_ShapedWeights const& iweights_l,
-                                 TRT_ShapedWeights const& iweights_r,
+tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l,
+                                 const TRT_ShapedWeights& iweights_r,
                                  TRT_ShapedWeights* oweights,
                                  LambdaFactory binary_op) {
-  // assume iweights_l.type == iweight_r.type
+  // Assume iweights_l.type == iweight_r.type
   CHECK_EQ(iweights_l.type_, oweights->type_);
   CHECK_EQ(iweights_r.type_, oweights->type_);
   VLOG(2) << "SANITY CHECK!";
@@ -579,7 +608,7 @@ tensorflow::Status BinaryCompute(TRT_ShapedWeights const& iweights_l,
       auto oup = static_cast<float*>(const_cast<void*>(oweights->values_));
 
       if (iweights_l.count() != iweights_r.count()) {
-        // we only supports broadcast of RankZero
+        // We only supports broadcast of RankZero
         if (iweights_l.count() == 1) {
           VLOG(2) << "I bet it is not working!" << (*inp_l);
           std::transform(inp_r, inp_r + iweights_r.count(), oup,
@@ -599,36 +628,37 @@ tensorflow::Status BinaryCompute(TRT_ShapedWeights const& iweights_l,
       break;
     }
     default:
-      return tensorflow::errors::Unimplemented("data type not supported: " +
-                                               iweights_l.type_);
+      return tensorflow::errors::Unimplemented(
+          "Data type not supported: " +
+          tensorflow::DataTypeString(iweights_l.type_));
   }
 
   return tensorflow::Status::OK();
 }
 
 tensorflow::Status ConstantFoldUnary(
-    Converter& ctx, tensorflow::NodeDef const& node_def,
+    Converter& ctx, const tensorflow::NodeDef& node_def,
     std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   TRT_ShapedWeights weights_input = inputs.at(0).weights();
 
-  // allocate output weights
+  // Allocate output weights
   TRT_ShapedWeights weights_output = ctx.get_temp_weights_like(weights_input);
 
   // FIXME assume type matches input weights
-  // get trt type & shape
-  // maybe this part has to be moved into the block of rsqrt later
-  // check type consistency
+  // Get trt type & shape
+  // Maybe this part has to be moved into the block of rsqrt later
+  // Check type consistency
   CHECK_EQ(weights_input.type_,
            TFAttrs(node_def).get<tensorflow::DataType>("T"));
 
   // Maybe I should do a switch
   LambdaFactory unary_op;
   if (node_def.op() == "Rsqrt") {
-    // compute rsqrt
+    // Compute rsqrt
     unary_op.op = LambdaFactory::OP_CATEGORY::RSQRT;
     auto ret = UnaryCompute(weights_input, &weights_output, unary_op);
-    // pass the output
+    // PAss the output
     if (ret == tensorflow::Status::OK()) {
       outputs->push_back(TRT_TensorOrWeights(weights_output));
     }
@@ -643,13 +673,13 @@ tensorflow::Status ConstantFoldUnary(
 // Let's get the simple stuff working first. Maybe we should fall bakc to TF
 //   approach for constant folding
 tensorflow::Status ConstantFoldBinary(
-    Converter& ctx, tensorflow::NodeDef const& node_def,
+    Converter& ctx, const tensorflow::NodeDef& node_def,
     std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   TRT_ShapedWeights weights_input_l = inputs.at(0).weights();
   TRT_ShapedWeights weights_input_r = inputs.at(1).weights();
 
-  // check type consistency
+  // Check type consistency
   CHECK_EQ(weights_input_l.type_, weights_input_r.type_);
 
   if (weights_input_l.shape_.nbDims != weights_input_r.shape_.nbDims)
@@ -657,12 +687,12 @@ tensorflow::Status ConstantFoldBinary(
         "Binary op implicit broadcast not supported: " + node_def.op());
 
   // TODO(jie): constant fold should really fall back to TF.
-  int nbDims = weights_input_l.shape_.nbDims;
+  int nb_dims = weights_input_l.shape_.nbDims;
   nvinfer1::Dims output_shape;
-  output_shape.nbDims = nbDims;
-  VLOG(2) << "nbDims: " << nbDims
-          << "the other: " << weights_input_r.shape_.nbDims;
-  for (int i = 0; i < nbDims; i++) {
+  output_shape.nbDims = nb_dims;
+  VLOG(2) << "nb_dims: " << nb_dims
+          << ", the other: " << weights_input_r.shape_.nbDims;
+  for (int i = 0; i < nb_dims; i++) {
     if (weights_input_l.shape_.d[i] == weights_input_r.shape_.d[i]) {
       output_shape.d[i] = weights_input_l.shape_.d[i];
     } else if (weights_input_l.shape_.d[i] == 1 ||
@@ -679,12 +709,12 @@ tensorflow::Status ConstantFoldBinary(
   }
 
   // FIXME assume type matches input weights
-  // get trt type & shape
+  // Get trt type & shape
   TFAttrs attrs(node_def);
-  // maybe this part has to be moved into the block of rsqrt later
+  // Maybe this part has to be moved into the block of rsqrt later
   tensorflow::DataType dtype = attrs.get<tensorflow::DataType>("T");
 
-  // allocate output weights
+  // Allocate output weights
   TRT_ShapedWeights weights_output = ctx.get_temp_weights(dtype, output_shape);
 
   // Maybe I should do a switch
@@ -702,7 +732,7 @@ tensorflow::Status ConstantFoldBinary(
   auto ret = BinaryCompute(weights_input_l, weights_input_r, &weights_output,
                            binary_op);
 
-  // pass the output
+  // Pass the output
   if (ret == tensorflow::Status::OK()) {
     outputs->push_back(TRT_TensorOrWeights(weights_output));
   }
@@ -710,42 +740,42 @@ tensorflow::Status ConstantFoldBinary(
   return ret;
 }
 
-// TODO(jie): broadcast is needed yet not implemented
-// only implemented channel wise for the time being
+// TODO(jie): broadcast is needed yet not implemented.
+// Only implemented channel wise for the time being
 tensorflow::Status BinaryTensorOpWeight(
-    Converter& ctx, tensorflow::NodeDef const& node_def,
+    Converter& ctx, const tensorflow::NodeDef& node_def,
     const nvinfer1::ITensor* tensor, TRT_ShapedWeights weights,
     std::vector<TRT_TensorOrWeights>* outputs) {
   // FIXME assume type matches input weights
-  // get trt type & shape
-  // maybe this part has to be moved into the block of rsqrt later
+  // Get trt type & shape
+  // Maybe this part has to be moved into the block of rsqrt later
 
-  // check type consistency
+  // Check type consistency
   auto dtype = TFAttrs(node_def).get<nvinfer1::DataType>("T");
-  CHECK_EQ_TYPE(tensor->getType(), dtype);  // cast to int for error messages
+  CHECK_EQ_TYPE(tensor->getType(), dtype);  // Cast to int for error messages
   nvinfer1::DataType ttype;
-  TF_CHECK_OK(convert_dtype(weights.type_, &ttype));
-  CHECK_EQ_TYPE(ttype, dtype);  // cast to int for error message
+  TF_CHECK_OK(ConvertDType(weights.type_, &ttype));
+  CHECK_EQ_TYPE(ttype, dtype);  // Cast to int for error message
 
-  // check scale mode
+  // Check scale mode
   auto dims_w = weights.shape_;
   auto dims_t = tensor->getDimensions();
 
-  // default to channel-wise
+  // Default to channel-wise
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
   if (weights.count() == 1) {
     VLOG(2) << "UNIFORM";
     scale_mode = nvinfer1::ScaleMode::kUNIFORM;
   } else {
-    // no broadcasting on Batch dimension;
+    // No broadcasting on Batch dimension;
     assert(dims_w.d[0] == 1);
 
-    // broadcasting on Channel dimension only allowed in kUNIFORM
+    // Broadcasting on Channel dimension only allowed in kUNIFORM
     assert(dims_w.d[1] == dims_t.d[0]);
     assert(dims_w.nbDims == dims_t.nbDims);
 
-    // default is element;
+    // Default is element;
     for (int i = 2; i < dims_w.nbDims; i++) {
       if (dims_w.d[i] != dims_t.d[i - 1]) {
         scale_mode = nvinfer1::ScaleMode::kCHANNEL;
@@ -762,59 +792,58 @@ tensorflow::Status BinaryTensorOpWeight(
     }
   }
 
-  // prepare weights
-  TRT_ShapedWeights shiftWeights(weights.type_);
-  TRT_ShapedWeights scaleWeights(weights.type_);
-  TRT_ShapedWeights powerWeights(weights.type_);
+  // Prepare weights
+  TRT_ShapedWeights shift_weights(weights.type_);
+  TRT_ShapedWeights scale_weights(weights.type_);
+  TRT_ShapedWeights power_weights(weights.type_);
 
   // Maybe I should do a switch
   if (node_def.op() == "Sub") {
     TRT_ShapedWeights neg_weights = ctx.get_temp_weights_like(weights);
     LambdaFactory unary_op;
     unary_op.op = LambdaFactory::OP_CATEGORY::NEG;
-    UnaryCompute(weights, &neg_weights, unary_op);
-    shiftWeights = neg_weights;
+    TF_RETURN_IF_ERROR(UnaryCompute(weights, &neg_weights, unary_op));
+    shift_weights = neg_weights;
   } else if (node_def.op() == "Mul") {
-    scaleWeights = weights;
+    scale_weights = weights;
   } else if (node_def.op() == "Add") {
-    shiftWeights = weights;
+    shift_weights = weights;
   } else {
     return tensorflow::errors::Unimplemented("Binary op not supported: " +
                                              node_def.op());
   }
 
   nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
-      *const_cast<nvinfer1::ITensor*>(tensor), scale_mode, shiftWeights,
-      scaleWeights, powerWeights);
+      *const_cast<nvinfer1::ITensor*>(tensor), scale_mode, shift_weights,
+      scale_weights, power_weights);
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
-  // pass the output
+  // Pass the output
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
 tensorflow::Status BinaryTensorOpTensor(
-    Converter& ctx, tensorflow::NodeDef const& node_def,
+    Converter& ctx, const tensorflow::NodeDef& node_def,
     const nvinfer1::ITensor* tensor_l, const nvinfer1::ITensor* tensor_r,
     std::vector<TRT_TensorOrWeights>* outputs) {
-  static const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
-      ops{
-          {"Add", nvinfer1::ElementWiseOperation::kSUM},
-          {"Mul", nvinfer1::ElementWiseOperation::kPROD},
-          // {"max", nvinfer1::ElementWiseOperation::kMAX},
-          // {"min", nvinfer1::ElementWiseOperation::kMIN},
-          {"Sub", nvinfer1::ElementWiseOperation::kSUB},
-          {"Div", nvinfer1::ElementWiseOperation::kDIV},
-      };
+  static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
+      {"Add", nvinfer1::ElementWiseOperation::kSUM},
+      {"Mul", nvinfer1::ElementWiseOperation::kPROD},
+      // {"max", nvinfer1::ElementWiseOperation::kMAX},
+      // {"min", nvinfer1::ElementWiseOperation::kMIN},
+      {"Sub", nvinfer1::ElementWiseOperation::kSUB},
+      {"Div", nvinfer1::ElementWiseOperation::kDIV},
+  };
 
   // FIXME assume type matches input weights
-  // get trt type & shape
+  // Get trt type & shape
   TFAttrs attrs(node_def);
-  // maybe this part has to be moved into the block of rsqrt later
+  // Maybe this part has to be moved into the block of rsqrt later
   nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
 
-  // check type consistency
+  // Check type consistency
   CHECK_EQ_TYPE(tensor_l->getType(), dtype);
   CHECK_EQ_TYPE(tensor_r->getType(), dtype);
   auto op_pair = ops.find(node_def.op());
@@ -829,17 +858,17 @@ tensorflow::Status BinaryTensorOpTensor(
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
-  // pass the output
+  // Pass the output
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
 tensorflow::Status ConvertPlaceholder(
-    Converter& ctx, tensorflow::NodeDef const& node_def,
+    Converter& ctx, const tensorflow::NodeDef& node_def,
     std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   VLOG(2) << "Placeholder should have been replace already";
-  return tensorflow::errors::Unimplemented("cannot convert Placeholder op");
+  return tensorflow::errors::Unimplemented(", cannot convert Placeholder op");
   // OK this make sense since we are supposed to replace it with input
   TFAttrs attrs(node_def);
   nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype");
@@ -858,14 +887,14 @@ tensorflow::Status ConvertPlaceholder(
 }
 
 tensorflow::Status ConvertConv2D(Converter& ctx,
-                                 tensorflow::NodeDef const& node_def,
-                                 std::vector<TRT_TensorOrWeights> const& inputs,
+                                 const tensorflow::NodeDef& node_def,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
                                  std::vector<TRT_TensorOrWeights>* outputs) {
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   // TODO(jie): handle NHWC/NCHW transpose;
   TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
   TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
-  reorder_rsck_to_kcrs(weights_rsck, &weights);
+  ReorderRSCKToKCRS(weights_rsck, &weights);
   TRT_ShapedWeights biases(weights.type_);
   int noutput = weights.shape_.d[0];
   nvinfer1::DimsHW kernel_size;
@@ -875,9 +904,9 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
 
   int h_index = 2;
   int w_index = 3;
-  auto data_format = attrs.get<std::string>("data_format");
+  auto data_format = attrs.get<string>("data_format");
   if (data_format == "NHWC") {
-    tensor = ctx.transposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
                                  {0, 3, 1, 2});
     h_index = 1;
     w_index = 2;
@@ -891,11 +920,11 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
   auto tensor_dim = tensor->getDimensions();
   std::vector<std::pair<int, int>> padding;
   // TODO(jie): padding.
-  if (attrs.get<std::string>("padding") == "SAME") {
+  if (attrs.get<string>("padding") == "SAME") {
     // This is NCHW tensor with no batch dimension.
     //  1 -> h
     //  2 -> w
-    padding = createSamePadding(
+    padding = CreateSamePadding(
         stride, kernel_size,
         {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
   } else {
@@ -905,18 +934,18 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
-    VLOG(2) << "padding!!!: " << padding[0].first << padding[0].second
+    VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second
             << padding[1].first << padding[1].second;
 
     auto dim_before = tensor->getDimensions();
     VLOG(2) << "TENSOR before: " << dim_before.d[0] << ", " << dim_before.d[1]
             << dim_before.d[2] << ", " << dim_before.d[3];
-    auto padLayer = ctx.network()->addPadding(
+    auto pad_layer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
         nvinfer1::DimsHW(padding[0].second, padding[1].second));
     padding = {{0, 0}, {0, 0}};
-    tensor = padLayer->getOutput(0);
+    tensor = pad_layer->getOutput(0);
     auto dim_after = tensor->getDimensions();
     VLOG(2) << "TENSOR after: " << dim_after.d[0] << ", " << dim_after.d[1]
             << dim_after.d[2] << ", " << dim_after.d[3];
@@ -937,7 +966,7 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
 
   if (data_format == "NHWC") {
     // TODO(jie): transpose it back!
-    output_tensor = ctx.transposeTensor(output_tensor, {0, 2, 3, 1});
+    output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1});
   } else {
     VLOG(2) << "NCHW !!!!";
   }
@@ -946,7 +975,7 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
 }
 
 tensorflow::Status ConvertPool(Converter& ctx,
-                               tensorflow::NodeDef const& node_def,
+                               const tensorflow::NodeDef& node_def,
                                std::vector<TRT_TensorOrWeights> const& inputs,
                                std::vector<TRT_TensorOrWeights>* outputs) {
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
@@ -954,11 +983,11 @@ tensorflow::Status ConvertPool(Converter& ctx,
 
   int h_index = 2;
   int w_index = 3;
-  auto data_format = attrs.get<std::string>("data_format");
+  auto data_format = attrs.get<string>("data_format");
   if (data_format == "NHWC") {
     h_index = 1;
     w_index = 2;
-    tensor = ctx.transposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
                                  {0, 3, 1, 2});
   } else {
     VLOG(2) << "NCHW !!!!";
@@ -968,7 +997,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
   if (node_def.op() == "MaxPool")
     type = nvinfer1::PoolingType::kMAX;
   else
-    return tensorflow::errors::Unimplemented("only supports Max pool");
+    return tensorflow::errors::Unimplemented("Only supports Max pool");
 
   // TODO(jie): NCHW
   auto tf_stride = attrs.get<std::vector<int>>("strides");
@@ -980,16 +1009,16 @@ tensorflow::Status ConvertPool(Converter& ctx,
   auto tensor_dim = tensor->getDimensions();
   std::vector<std::pair<int, int>> padding;
   // TODO(jie): padding.
-  if (attrs.get<std::string>("padding") == "SAME") {
+  if (attrs.get<string>("padding") == "SAME") {
     // This is NCHW tensor with no batch dimension.
     //  1 -> h
     //  2 -> w
-    padding = createSamePadding(
+    padding = CreateSamePadding(
         stride, ksize,
         {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
-  } else if (attrs.get<std::string>("padding") == "VALID") {
+  } else if (attrs.get<string>("padding") == "VALID") {
     // No padding for valid padding here
-    VLOG(2) << "no padding added for VALID padding in pool" << node_def.name();
+    VLOG(2) << "No padding added for VALID padding in pool" << node_def.name();
     padding = {{0, 0}, {0, 0}};
   } else {
     return tensorflow::errors::Unimplemented(
@@ -999,14 +1028,14 @@ tensorflow::Status ConvertPool(Converter& ctx,
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
     // TODO(jie): handle asymmetric padding
-    VLOG(2) << "padding!!!: " << padding[0].first << padding[0].second
+    VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second
             << padding[1].first << padding[1].second;
-    auto padLayer = ctx.network()->addPadding(
+    auto pad_layer = ctx.network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
         nvinfer1::DimsHW(padding[0].second, padding[1].second));
     padding = {{0, 0}, {0, 0}};
-    tensor = padLayer->getOutput(0);
+    tensor = pad_layer->getOutput(0);
   }
 
   nvinfer1::IPoolingLayer* layer = ctx.network()->addPooling(
@@ -1019,7 +1048,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
 
   if (data_format == "NHWC") {
     // TODO(jie): transpose it back!
-    output_tensor = ctx.transposeTensor(output_tensor, {0, 2, 3, 1});
+    output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1});
   } else {
     VLOG(2) << "NCHW !!!!";
   }
@@ -1028,7 +1057,7 @@ tensorflow::Status ConvertPool(Converter& ctx,
 }
 
 tensorflow::Status ConvertActivation(
-    Converter& ctx, tensorflow::NodeDef const& node_def,
+    Converter& ctx, const tensorflow::NodeDef& node_def,
     std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
@@ -1040,14 +1069,14 @@ tensorflow::Status ConvertActivation(
 }
 
 tensorflow::Status ConvertScale(Converter& ctx,
-                                tensorflow::NodeDef const& node_def,
+                                const tensorflow::NodeDef& node_def,
                                 std::vector<TRT_TensorOrWeights> const& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights())
     return tensorflow::errors::Unimplemented(
-        "only supports tensor op weight for now, at " + node_def.name());
-  // implement tensor binaryOp weight [channel wise] for now;
+        "Only supports tensor op weight for now, at " + node_def.name());
+  // Implement tensor binaryOp weight [channel wise] for now;
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
 
   // TODO(jie): handle NHWC/NCHW transpose;
@@ -1056,10 +1085,10 @@ tensorflow::Status ConvertScale(Converter& ctx,
 
   TFAttrs attrs(node_def);
 
-  // transpose NHWC
-  auto data_format = attrs.get<std::string>("data_format");
+  // Transpose NHWC
+  auto data_format = attrs.get<string>("data_format");
   if (data_format == "NHWC") {
-    tensor = ctx.transposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
                                  {0, 3, 1, 2});
     // TODO(jie): transpose it
   } else {
@@ -1072,7 +1101,7 @@ tensorflow::Status ConvertScale(Converter& ctx,
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
   if (data_format == "NHWC") {
     // TODO(jie): transpose it back!
-    output_tensor = ctx.transposeTensor(output_tensor, {0, 2, 3, 1});
+    output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1});
   } else {
     VLOG(2) << "NCHW !!!!";
   }
@@ -1081,20 +1110,19 @@ tensorflow::Status ConvertScale(Converter& ctx,
 }
 
 tensorflow::Status ConvertConst(Converter& ctx,
-                                tensorflow::NodeDef const& node_def,
+                                const tensorflow::NodeDef& node_def,
                                 std::vector<TRT_TensorOrWeights> const& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
-  auto const& weights_tensor = node_def.attr().at("value").tensor();
+  const auto& weights_tensor = node_def.attr().at("value").tensor();
 
-  // get trt type & shape
+  // Get trt type & shape
   TFAttrs attrs(node_def);
-  // nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype");
   tensorflow::DataType dtype = attrs.get<tensorflow::DataType>("dtype");
 
-  // create shaped weights as output
+  // Create shaped weights as output
   tensorflow::Tensor tensor;
   if (!tensor.FromProto(weights_tensor))
-    return tensorflow::errors::Internal("cannot parse weight tensor proto: " +
+    return tensorflow::errors::Internal("Cannot parse weight tensor proto: " +
                                         node_def.name());
 
   TRT_ShapedWeights weights(dtype);
@@ -1102,11 +1130,11 @@ tensorflow::Status ConvertConst(Converter& ctx,
     VLOG(2) << "SCALAR!!!" << node_def.name();
     nvinfer1::Dims scalar_shape;
     if (tensor.dims() > 0) {
-      VLOG(2) << "dimensions: " << tensor.dims();
+      VLOG(2) << "Dimensions: " << tensor.dims();
       weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
-                                  get_tensor_shape(tensor));
+                                  GetTensorShape(tensor));
     } else {
-      VLOG(2) << "dimensions: " << tensor.dims();
+      VLOG(2) << "Dimensions: " << tensor.dims();
       scalar_shape.nbDims = 1;
       scalar_shape.d[0] = 1;
       scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
@@ -1119,19 +1147,23 @@ tensorflow::Status ConvertConst(Converter& ctx,
     }
   } else if (!weights_tensor.tensor_content().empty()) {
     VLOG(2) << "TENSOR!!!" << node_def.name();
-    weights = TRT_ShapedWeights(dtype, weights_tensor.tensor_content().data(),
-                                get_tensor_shape(tensor));
+    const auto& content = weights_tensor.tensor_content();
+    char* buf = new char[content.size() + 1];
+    buf[content.size()] = 0;
+    port::CopyToArray(content, buf);
+    weights = TRT_ShapedWeights(dtype, buf, GetTensorShape(tensor),
+                                /*owned_values=*/true);
   } else {
     return tensorflow::errors::Unimplemented(
-        "not supported constant type, at " + node_def.name());
+        "Not supported constant type, at " + node_def.name());
   }
-  // pass the output
+  // Pass the output
   outputs->push_back(TRT_TensorOrWeights(weights));
   return tensorflow::Status::OK();
 }
 
 tensorflow::Status ConvertIdentity(
-    Converter& ctx, tensorflow::NodeDef const& node_def,
+    Converter& ctx, const tensorflow::NodeDef& node_def,
     std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   outputs->push_back(inputs.at(0));
@@ -1139,7 +1171,7 @@ tensorflow::Status ConvertIdentity(
 }
 
 tensorflow::Status ConvertBinary(Converter& ctx,
-                                 tensorflow::NodeDef const& node_def,
+                                 const tensorflow::NodeDef& node_def,
                                  std::vector<TRT_TensorOrWeights> const& inputs,
                                  std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2)
@@ -1166,7 +1198,7 @@ tensorflow::Status ConvertBinary(Converter& ctx,
 }
 
 tensorflow::Status ConvertUnary(Converter& ctx,
-                                tensorflow::NodeDef const& node_def,
+                                const tensorflow::NodeDef& node_def,
                                 std::vector<TRT_TensorOrWeights> const& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 1)
@@ -1184,7 +1216,7 @@ tensorflow::Status ConvertUnary(Converter& ctx,
 }
 
 tensorflow::Status ConvertReduce(Converter& ctx,
-                                 tensorflow::NodeDef const& node_def,
+                                 const tensorflow::NodeDef& node_def,
                                  std::vector<TRT_TensorOrWeights> const& inputs,
                                  std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
@@ -1192,18 +1224,18 @@ tensorflow::Status ConvertReduce(Converter& ctx,
     return tensorflow::errors::InvalidArgument(
         "Input expects tensor and weights, at" + node_def.name());
 
-  // implement tensor binaryOp weight [channel wise] for now;
+  // Implement tensor binaryOp weight [channel wise] for now;
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   auto dims = tensor->getDimensions();
-  // restore implicit batch dimension
-  int nbDims = dims.nbDims + 1;
+  // Restore implicit batch dimension
+  int nb_dims = dims.nbDims + 1;
 
   TRT_ShapedWeights index_list = inputs.at(1).weights();
 
   TFAttrs attrs(node_def);
-  // TODO(jie): handle data type
-  // index type here is done through TF type
-  //   so I can leverage their EnumToDataType for my cast
+  // TODO(jie): handle data type.
+  // Index type here is done through TF type, so I can leverage their
+  // EnumToDataType for my cast
   auto index_type = attrs.get<tensorflow::DataType>("Tidx");
 
   // Only expect to handle INT32 as attributes for now
@@ -1212,9 +1244,9 @@ tensorflow::Status ConvertReduce(Converter& ctx,
   auto index_list_data =
       static_cast<int*>(const_cast<void*>(index_list.values_));
 
-  // hack warning:
-  //   have to fall back to pool layer since reduce is not in public TRT yet.
-  if (nbDims != 4)
+  // Hack warning: have to fall back to pool layer since reduce is not in public
+  // TRT yet.
+  if (nb_dims != 4)
     return tensorflow::errors::InvalidArgument(
         "TRT only support reduce on 4 dimensional tensors, at" +
         node_def.name());
@@ -1224,7 +1256,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
         node_def.name());
 
   std::set<int> idx_set;
-  // we cannot operate on Channel. permutation flag used to transpose tensor
+  // We cannot operate on Channel. permutation flag used to transpose tensor
   int permuted_index = -1;
   for (int i = 0; i < index_list.count(); i++) {
     if (index_list_data[i] == 0)
@@ -1234,26 +1266,26 @@ tensorflow::Status ConvertReduce(Converter& ctx,
     idx_set.emplace(index_list_data[i]);
   }
 
-  std::vector<int> permutation_order(nbDims);
+  std::vector<int> permutation_order(nb_dims);
   nvinfer1::DimsHW pool_kernel;
   if (permuted_index == 1) {
-    for (int i = 2; i < nbDims; i++) {
+    for (int i = 2; i < nb_dims; i++) {
       if (idx_set.count(i)) {
         permuted_index = i;
         break;
       }
     }
-    for (int i = 0; i < nbDims; i++) permutation_order[i] = i;
+    for (int i = 0; i < nb_dims; i++) permutation_order[i] = i;
 
     permutation_order[permuted_index] = 1;
     permutation_order[1] = permuted_index;
 
-    // apply permutation before extracting dimension for pool_kernel
-    tensor = ctx.transposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+    // Apply permutation before extracting dimension for pool_kernel
+    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
                                  permutation_order);
   }
 
-  // apply permutation before extracting dimension for pool_kernel
+  // Apply permutation before extracting dimension for pool_kernel
   pool_kernel.d[0] = (idx_set.count(2) || permuted_index == 2) ? dims.d[1] : 1;
   pool_kernel.d[1] = (idx_set.count(3) || permuted_index == 3) ? dims.d[2] : 1;
 
@@ -1269,15 +1301,15 @@ tensorflow::Status ConvertReduce(Converter& ctx,
         "Op not supported " + node_def.op() + " , at " + node_def.name());
   }
   if (permuted_index != -1) {
-    // apply permutation before extracting dimension for pool_kernel
-    output_tensor = ctx.transposeTensor(
+    // Apply permutation before extracting dimension for pool_kernel
+    output_tensor = ctx.TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), permutation_order);
   }
   return tensorflow::Status::OK();
 }
 
 tensorflow::Status ConvertPad(Converter& ctx,
-                              tensorflow::NodeDef const& node_def,
+                              const tensorflow::NodeDef& node_def,
                               std::vector<TRT_TensorOrWeights> const& inputs,
                               std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
@@ -1285,21 +1317,21 @@ tensorflow::Status ConvertPad(Converter& ctx,
     return tensorflow::errors::InvalidArgument(
         "Input expects tensor and weights, at" + node_def.name());
 
-  // implement tensor binaryOp weight [channel wise] for now;
+  // Implement tensor binaryOp weight [channel wise] for now;
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   auto dims = tensor->getDimensions();
-  // restore implicit batch dimension
-  int nbDims = dims.nbDims + 1;
+  // Restore implicit batch dimension
+  int nb_dims = dims.nbDims + 1;
 
   TRT_ShapedWeights pads = inputs.at(1).weights();
 
   TFAttrs attrs(node_def);
-  // padding type here is done through TF type
+  // Padding type here is done through TF type
   //   so I can leverage their EnumToDataType for my cast
   auto padding_type = attrs.get<tensorflow::DataType>("Tpaddings");
   // TODO(jie): handle data type conversion for TRT?
 
-  if (pads.shape_.d[0] != nbDims || pads.shape_.d[1] != 2)
+  if (pads.shape_.d[0] != nb_dims || pads.shape_.d[1] != 2)
     return tensorflow::errors::InvalidArgument(
         "Pad only supports explicit padding on 4 dimensional tensor, at " +
         node_def.name());
@@ -1311,28 +1343,28 @@ tensorflow::Status ConvertPad(Converter& ctx,
   auto pad_data = static_cast<int*>(const_cast<void*>(pads.values_));
 
   std::vector<int32_t> pad_index;
-  for (int i = 0; i < nbDims; i++) {
+  for (int i = 0; i < nb_dims; i++) {
     if (pad_data[2 * i] != 0 || pad_data[2 * i + 1] != 0)
       pad_index.push_back(i);
   }
 
-  // no padding at all, we should exit
+  // No padding at all, we should exit
   if (pad_index.size() == 0) {
     outputs->push_back(inputs.at(0));
     return tensorflow::Status::OK();
   }
 
-  // only supports padding on less than 2 axis GIE-2579
+  // Only supports padding on less than 2 axis GIE-2579
   if (pad_index.size() > 2)
     return tensorflow::errors::InvalidArgument(
         "Padding layer does not support padding on > 2");
 
-  // padding on batch dimension is not supported
+  // Padding on batch dimension is not supported
   if (pad_index[0] == 0)
     return tensorflow::errors::InvalidArgument(
         "Padding layer does not support padding on batch dimension");
 
-  // not doing the legit thing here. ignoring padding on dim 1 and 3;
+  // Not doing the legit thing here. ignoring padding on dim 1 and 3;
   // TODO(jie): implement pad as uff parser
   if (pad_index.size() == 2 && pad_index[0] == 0 && pad_index[1] == 3)
     return tensorflow::errors::Unimplemented(
@@ -1345,7 +1377,7 @@ tensorflow::Status ConvertPad(Converter& ctx,
   std::vector<int32_t> permuted_pad_index(pad_index);
   if (pad_index[0] == 1) {
     legit_pad = false;
-    tensor = ctx.transposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
                                  {0, 3, 2, 1});
     permuted_pad_index[0] = 3;
   }
@@ -1366,7 +1398,7 @@ tensorflow::Status ConvertPad(Converter& ctx,
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
   if (!legit_pad)
-    output_tensor = ctx.transposeTensor(
+    output_tensor = ctx.TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), {0, 3, 2, 1});
 
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
@@ -1382,7 +1414,7 @@ void Converter::register_op_converters() {
   // This could be really handled as ConvertBinary
   _op_registry["BiasAdd"] = ConvertScale;
   _op_registry["Const"] = ConvertConst;
-  // _op_registry["MatMul"] = ConvertFullyConnected; // not used in vgg
+  // _op_registry["MatMul"] = ConvertFullyConnected;  // Not used in vgg
   // TODO(ben,jie): this is a temp hack.
   _op_registry["Identity"] = ConvertIdentity;  // Identity should be removed
   // _op_registry["AvgPool"] = ConvertPool;
@@ -1415,47 +1447,48 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   std::list<tensorflow::Node*> order;
   for (tensorflow::Node* node : order_vec) {
     if (subgraph_node_ids.count(node->id())) {
-      order.push_front(node);  // we want topological order to contstruct the
-                               // network layer by layer
+      // We want topological order to contstruct the
+      // network layer by layer
+      order.push_front(node);
     }
   }
-  // topological order is needed to build TRT network
+  // Topological order is needed to build TRT network
 
   tensorflow::tensorrt::Logger trt_logger;
 
   auto trt_builder = infer_object(nvinfer1::createInferBuilder(trt_logger));
   if (!trt_builder) {
     return tensorflow::errors::Internal(
-        "failed to create TensorRT builder object");
+        "Failed to create TensorRT builder object");
   }
 
   auto trt_network = infer_object(trt_builder->createNetwork());
   if (!trt_network) {
     return tensorflow::errors::Internal(
-        "failed to create TensorRT network object");
+        "Failed to create TensorRT network object");
   }
 
   // Build the network
   Converter converter(trt_network.get());
 
-  std::vector<std::string> input_names;
+  std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
   for (std::pair<int, int> const& input : input_inds) {
     int node_id = input.first;
     int output_idx = input.second;
     tensorflow::Node* node = graph.FindNodeId(node_id);
     auto node_name = node->name();
-    input_names.push_back(node_name);  // insert original node name without port
+    input_names.push_back(node_name);  // Insert original node name without port
     // TODO(jie): alternative :)
     if (!graph_properties.HasOutputProperties(node_name))
-      return tensorflow::errors::Internal("failed to find input node: " +
+      return tensorflow::errors::Internal("Failed to find input node: " +
                                           node_name);
 
     auto op_info_vec = graph_properties.GetOutputProperties(node_name);
     if (static_cast<int>(op_info_vec.size()) < output_idx)
       return tensorflow::errors::Internal(
-          "accessing output index of: " + std::to_string(output_idx) +
-          ", at node: " + node_name + "with output entry from shape_map: " +
+          "Accessing output index of: " + std::to_string(output_idx) +
+          ", at node: " + node_name + " with output entry from shape_map: " +
           std::to_string(op_info_vec.size()));
 
     auto op_info = op_info_vec.at(output_idx);
@@ -1464,11 +1497,11 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     input_dtypes.push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
-    TF_CHECK_OK(convert_dtype(tf_dtype, &dtype));
+    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
 
-    VLOG(2) << "accessing output index of: " << std::to_string(output_idx)
+    VLOG(2) << "Accessing output index of: " << std::to_string(output_idx)
             << ", at node: " << node_name
-            << "with output entry from shape_map: "
+            << " with output entry from shape_map: "
             << std::to_string(op_info_vec.size());
 
     // TODO(ben,jie): update TRT input format/dimension
@@ -1492,35 +1525,35 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     if (!input_tensor)
       return tensorflow::errors::InvalidArgument(
           "Failed to create Input layer");
-    VLOG(2) << "input tensor name :" << input_tensor_name;
+    VLOG(2) << "Input tensor name :" << input_tensor_name;
 
     if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
       return tensorflow::errors::AlreadyExists(
-          "output tensor already exists for op: " + input_tensor_name);
+          "Output tensor already exists for op: " + input_tensor_name);
   }
 
-  VLOG(2) << "finished sorting";
+  VLOG(2) << "Finished sorting";
 
   for (const tensorflow::Node* node : order) {
-    tensorflow::NodeDef const& node_def = node->def();
-    VLOG(2) << "converting node: " << node_def.name() << " , " << node_def.op();
+    const tensorflow::NodeDef& node_def = node->def();
+    VLOG(2) << "Converting node: " << node_def.name() << " , " << node_def.op();
     TF_RETURN_IF_ERROR(converter.convert_node(node_def));
   }
 
-  VLOG(2) << "finished conversion";
+  VLOG(2) << "Finished conversion";
 
   // Gather output metadata
-  std::vector<std::string> output_names;
+  std::vector<string> output_names;
   std::vector<tensorflow::DataType> output_dtypes;
   for (std::pair<int, int> const& output : output_inds) {
     int node_id = output.first;
     int output_idx = output.second;
     tensorflow::Node* node = graph.FindNodeId(node_id);
-    std::string op_name = node->name();
-    std::string tensor_name = op_name;
+    string op_name = node->name();
+    string tensor_name = op_name;
     if (output_idx != 0)
       tensor_name = tensor_name + ":" + std::to_string(output_idx);
-    VLOG(2) << "output tensor name: " << tensor_name;
+    VLOG(2) << "Output tensor name: " << tensor_name;
     output_names.push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
@@ -1536,42 +1569,42 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     tensorflow::DataType tf_dtype = node->output_type(output_idx);
     output_dtypes.push_back(tf_dtype);
     nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
-    TF_RETURN_IF_ERROR(convert_dtype(tf_dtype, &trt_dtype));
+    TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
     tensor->setType(trt_dtype);
   }
 
-  VLOG(2) << "finished output";
+  VLOG(2) << "Finished output";
   static int static_id = 0;
 
   // Build the engine
   trt_builder->setMaxBatchSize(max_batch_size);
   trt_builder->setMaxWorkspaceSize(max_workspace_size_bytes);
-  VLOG(0) << "starting build engine " << static_id;
+  VLOG(0) << "Starting build engine " << static_id;
   // TODO(ben,jie): half2 and int8 mode support
-  std::string engine_plan_string;
+  string engine_plan_string;
   {
     auto trt_engine =
         infer_object(trt_builder->buildCudaEngine(*converter.network()));
-    VLOG(0) << "built network";
+    VLOG(0) << "Built network";
     auto engine_plan = infer_object(trt_engine->serialize());
-    VLOG(0) << "serialized engine";
+    VLOG(0) << "Serialized engine";
     const char* engine_plan_data =
         static_cast<const char*>(engine_plan->data());
-    engine_plan_string = std::move(
-        std::string(engine_plan_data, engine_plan_data + engine_plan->size()));
+    engine_plan_string =
+        string(engine_plan_data, engine_plan_data + engine_plan->size());
   }
 
-  VLOG(0) << "finished engine";
+  VLOG(0) << "Finished engine";
 
   // Build the TRT op
   // TODO(sami,ben,jie): proper naming!
   tensorflow::NodeDefBuilder op_builder(
-      "my_trt_op" + std::to_string(static_id++), "TRTEngineOp");
+      tensorflow::strings::StrCat("my_trt_op", static_id++), "TRTEngineOp");
   std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
   for (size_t i = 0; i < input_names.size(); ++i) {
     int output_idx = input_inds.at(i).second;
-    // we wired up the input here already, it is redundant to do it again in
-    //  ConvertSubGraphToTensorRT(convert_graph.cc)
+    // We wired up the input here already, it is redundant to do it again in
+    // ConvertSubGraphToTensorRT(convert_graph.cc)
     auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
         input_names.at(i), output_idx, input_dtypes.at(i));
     income_edges.push_back(incoming_edge);
@@ -1580,7 +1613,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
       income_edges);
   op_builder.Input(input_list);
 
-  VLOG(0) << "finished op preparation";
+  VLOG(0) << "Finished op preparation";
 
   auto status = op_builder.Attr("serialized_engine", engine_plan_string)
                     .Attr("input_nodes", input_names)
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 50c9e64deb..1f25048f83 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -14,12 +14,14 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
-#include "cuda/include/cuda_runtime_api.h"
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
 
 namespace tensorflow {
 namespace tensorrt {
@@ -27,7 +29,7 @@ static ::tensorflow::tensorrt::Logger logger;
 
 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   // read serialized_engine
-  std::string serialized_engine;
+  string serialized_engine;
   OP_REQUIRES_OK(context,
                  context->GetAttr("serialized_engine", &serialized_engine));
 
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index 32bfcf62af..d71f66b933 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
 #define TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
 
-#include <string>
+#include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -30,7 +30,7 @@ class Logger : public nvinfer1::ILogger {
  private:
   void log(nvinfer1::ILogger::Severity severity, const char* msg) override;
 
-  std::string name_;
+  string name_;
 };
 
 }  // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 1672226643..6193f0b0a1 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 
 #include <set>
-#include <string>
 #include <unordered_map>
 #include <vector>
 
@@ -26,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace tensorrt {
@@ -216,7 +216,7 @@ tensorflow::Status SegmentGraph(
 
   // Collect the segments/subgraphs. Each subgraph is represented by a
   // set of the names of the nodes in that subgraph.
-  std::unordered_map<std::string, std::set<std::string>> sg_map;
+  std::unordered_map<string, std::set<string>> sg_map;
   for (auto& u : node_segments) {
     if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) {
       sg_map[u.ParentValue()->name()].insert(u.Value()->name());
@@ -227,7 +227,7 @@ tensorflow::Status SegmentGraph(
   for (const auto& itr : sg_map) {
     const auto& segment_node_names = itr.second;
     if (VLOG_IS_ON(1)) {
-      std::string s;
+      string s;
       for (const auto& name : segment_node_names) {
         s += " " + name;
       }
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index 1c7ccde7d3..ee6e2b3ed2 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -17,22 +17,22 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
 
 #include <set>
-#include <string>
 #include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 
-using SegmentNodesVector = std::vector<std::set<std::string>>;
+using SegmentNodesVector = std::vector<std::set<string>>;
 
 struct SegmentOptions {
   // Segment must contain at least this many nodes.
   int minimum_segment_size = 2;
-  std::set<std::string> exclude_node_list;
+  std::set<string> exclude_node_list;
 };
 
 // Get the subgraphs of a graph that can be handled by TensorRT.
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index bac115facd..d7e10c123d 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorflow/c/c_api.h"
+#include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace tensorrt {
@@ -35,7 +36,7 @@ class SegmentTest : public ::testing::Test {
                     TF_Status* s, const char* name);
 
   std::function<bool(const NodeDef&)> MakeCandidateFn(
-      const std::set<std::string>& node_names);
+      const std::set<string>& node_names);
 
  protected:
   void PlaceholderHelper(TF_Graph* graph, TF_Status* s, const char* name,
@@ -60,7 +61,7 @@ bool SegmentTest::GetGraphDef(TF_Graph* graph,
 }
 
 std::function<bool(const NodeDef&)> SegmentTest::MakeCandidateFn(
-    const std::set<std::string>& node_names) {
+    const std::set<string>& node_names) {
   return [node_names](const NodeDef& node) -> bool {
     return node_names.find(node.name()) != node_names.end();
   };
@@ -103,7 +104,6 @@ TF_Operation* SegmentTest::Add(TF_Operation* l, TF_Operation* r,
   return op;
 }
 
-//------------------------------------------------------------------------------
 TEST_F(SegmentTest, Empty) {
   TF_Graph* graph = TF_NewGraph();
 
@@ -119,7 +119,6 @@ TEST_F(SegmentTest, Empty) {
   EXPECT_TRUE(segments.empty());
 }
 
-//------------------------------------------------------------------------------
 TEST_F(SegmentTest, Simple) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
@@ -163,14 +162,13 @@ TEST_F(SegmentTest, Simple) {
 
   // Expect all Add operations to be collapsed into a single segment
   ASSERT_EQ(segments.size(), 1);
-  std::vector<std::string> expected{"add0", "add1", "add2", "add3", "add4"};
+  std::vector<string> expected{"add0", "add1", "add2", "add3", "add4"};
   for (const auto& ex : expected) {
     EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
         << "Missing expected node " << ex;
   }
 }
 
-//------------------------------------------------------------------------------
 TEST_F(SegmentTest, AvoidCycle) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
@@ -218,7 +216,6 @@ TEST_F(SegmentTest, AvoidCycle) {
   EXPECT_EQ(segments.size(), 0);
 }
 
-//------------------------------------------------------------------------------
 TEST_F(SegmentTest, Multiple) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
@@ -274,20 +271,19 @@ TEST_F(SegmentTest, Multiple) {
   // Expect two subgraphs
   EXPECT_EQ(segments.size(), 2);
 
-  std::vector<std::string> expected0{"add0", "add1", "add2", "add3"};
+  std::vector<string> expected0{"add0", "add1", "add2", "add3"};
   for (const auto& ex : expected0) {
     EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
         << "Missing expected node " << ex;
   }
 
-  std::vector<std::string> expected1{"add6", "add8"};
+  std::vector<string> expected1{"add6", "add8"};
   for (const auto& ex : expected1) {
     EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
         << "Missing expected node " << ex;
   }
 }
 
-//------------------------------------------------------------------------------
 TEST_F(SegmentTest, BigIfElse) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
@@ -343,13 +339,13 @@ TEST_F(SegmentTest, BigIfElse) {
   // Expect 2 subgraphs
   EXPECT_EQ(segments.size(), 2);
 
-  std::vector<std::string> expected0{"add3", "add4", "add5", "add6", "add7"};
+  std::vector<string> expected0{"add3", "add4", "add5", "add6", "add7"};
   for (const auto& ex : expected0) {
     EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
         << "Missing expected node " << ex;
   }
 
-  std::vector<std::string> expected1{"add0", "add1"};
+  std::vector<string> expected1{"add0", "add1"};
   for (const auto& ex : expected1) {
     EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
         << "Missing expected node " << ex;
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index f085380054..1570ee6f04 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -13,15 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-/* wrap trt_conversion */
+/* Wrap trt_conversion */
 %{
 #define SWIG_FILE_WITH_INIT
 %}
-%include "std_string.i"
 %include "std_pair.i"
-%include "tensorflow/python/lib/core/strings.i"
 %include "tensorflow/python/platform/base.i"
-%template(StringPair) std::pair<string,string>;
+%template(StringPair) std::pair<string, string>;
 %template() std::pair<swig::SwigPtr_PyObject, swig::SwigPtr_PyObject>;
 
 %{
-- 
GitLab


From 497b4fb1440d95161a7e7c577557fa4c101a6b98 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Tue, 6 Feb 2018 14:05:01 -0800
Subject: [PATCH 731/966] Export CXX11_ABI_FLAG and MONOLITHIC_BUILD constants.

PiperOrigin-RevId: 184736216
---
 tensorflow/python/framework/versions.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index bdcbc15af6..06955b8858 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -35,7 +35,9 @@ tf_export("GIT_VERSION").export_constant(__name__, "GIT_VERSION")
 COMPILER_VERSION = __compiler_version__
 tf_export("COMPILER_VERSION").export_constant(__name__, "COMPILER_VERSION")
 CXX11_ABI_FLAG = __cxx11_abi_flag__
+tf_export("CXX11_ABI_FLAG").export_constant(__name__, "CXX11_ABI_FLAG")
 MONOLITHIC_BUILD = __monolithic_build__
+tf_export("MONOLITHIC_BUILD").export_constant(__name__, "MONOLITHIC_BUILD")
 
 GRAPH_DEF_VERSION = pywrap_tensorflow.GRAPH_DEF_VERSION
 tf_export("GRAPH_DEF_VERSION").export_constant(__name__, "GRAPH_DEF_VERSION")
-- 
GitLab


From 71248e6f4f79f7d9b6f35854e6bab2caeabfb555 Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Tue, 6 Feb 2018 14:17:52 -0800
Subject: [PATCH 732/966] Automated g4 rollback of changelist 184551259

PiperOrigin-RevId: 184738583
---
 .../bayesflow/python/kernel_tests/hmc_test.py |  831 ++++++---
 .../contrib/bayesflow/python/ops/hmc.py       |   11 +-
 .../contrib/bayesflow/python/ops/hmc_impl.py  | 1598 +++++++++++------
 3 files changed, 1650 insertions(+), 790 deletions(-)

diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
index cbc66b6dc1..51aed6438d 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
@@ -18,30 +18,40 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 import numpy as np
-from scipy import special
 from scipy import stats
 
 from tensorflow.contrib.bayesflow.python.ops import hmc
+from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _compute_energy_change
+from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _leapfrog_integrator
 
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gen_linalg_ops
+from tensorflow.python.ops import gradients_impl as gradients_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import gamma as gamma_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.platform import tf_logging as logging_ops
+
+
+def _reduce_variance(x, axis=None, keepdims=False):
+  sample_mean = math_ops.reduce_mean(x, axis, keepdims=True)
+  return math_ops.reduce_mean(
+      math_ops.squared_difference(x, sample_mean), axis, keepdims)
 
 
-# TODO(b/66964210): Test float16.
 class HMCTest(test.TestCase):
 
   def setUp(self):
     self._shape_param = 5.
     self._rate_param = 10.
-    self._expected_x = (special.digamma(self._shape_param)
-                        - np.log(self._rate_param))
-    self._expected_exp_x = self._shape_param / self._rate_param
 
     random_seed.set_random_seed(10003)
     np.random.seed(10003)
@@ -63,63 +73,46 @@ class HMCTest(test.TestCase):
                                self._rate_param * math_ops.exp(x),
                                event_dims)
 
-  def _log_gamma_log_prob_grad(self, x, event_dims=()):
-    """Computes log-pdf and gradient of a log-gamma random variable.
-
-    Args:
-      x: Value of the random variable.
-      event_dims: Dimensions not to treat as independent. Default is (),
-        i.e., all dimensions are independent.
-
-    Returns:
-      log_prob: The log-pdf up to a normalizing constant.
-      grad: The gradient of the log-pdf with respect to x.
-    """
-    return (math_ops.reduce_sum(self._shape_param * x -
-                                self._rate_param * math_ops.exp(x),
-                                event_dims),
-            self._shape_param - self._rate_param * math_ops.exp(x))
-
-  def _n_event_dims(self, x_shape, event_dims):
-    return np.prod([int(x_shape[i]) for i in event_dims])
-
-  def _integrator_conserves_energy(self, x, event_dims, sess,
+  def _integrator_conserves_energy(self, x, independent_chain_ndims, sess,
                                    feed_dict=None):
-    def potential_and_grad(x):
-      log_prob, grad = self._log_gamma_log_prob_grad(x, event_dims)
-      return -log_prob, -grad
-
-    step_size = array_ops.placeholder(np.float32, [], name='step_size')
-    hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps')
+    step_size = array_ops.placeholder(np.float32, [], name="step_size")
+    hmc_lf_steps = array_ops.placeholder(np.int32, [], name="hmc_lf_steps")
 
     if feed_dict is None:
       feed_dict = {}
     feed_dict[hmc_lf_steps] = 1000
 
-    m = random_ops.random_normal(array_ops.shape(x))
-    potential_0, grad_0 = potential_and_grad(x)
-    old_energy = potential_0 + 0.5 * math_ops.reduce_sum(m * m,
-                                                         event_dims)
-
-    _, new_m, potential_1, _ = (
-        hmc.leapfrog_integrator(step_size, hmc_lf_steps, x,
-                                m, potential_and_grad, grad_0))
+    event_dims = math_ops.range(independent_chain_ndims,
+                                array_ops.rank(x))
 
-    new_energy = potential_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
+    m = random_ops.random_normal(array_ops.shape(x))
+    log_prob_0 = self._log_gamma_log_prob(x, event_dims)
+    grad_0 = gradients_ops.gradients(log_prob_0, x)
+    old_energy = -log_prob_0 + 0.5 * math_ops.reduce_sum(m**2., event_dims)
+
+    new_m, _, log_prob_1, _ = _leapfrog_integrator(
+        current_momentums=[m],
+        target_log_prob_fn=lambda x: self._log_gamma_log_prob(x, event_dims),
+        current_state_parts=[x],
+        step_sizes=[step_size],
+        num_leapfrog_steps=hmc_lf_steps,
+        current_target_log_prob=log_prob_0,
+        current_grads_target_log_prob=grad_0)
+    new_m = new_m[0]
+
+    new_energy = -log_prob_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
                                                          event_dims)
 
     x_shape = sess.run(x, feed_dict).shape
-    n_event_dims = self._n_event_dims(x_shape, event_dims)
-    feed_dict[step_size] = 0.1 / n_event_dims
-    old_energy_val, new_energy_val = sess.run([old_energy, new_energy],
-                                              feed_dict)
-    logging.vlog(1, 'average energy change: {}'.format(
-        abs(old_energy_val - new_energy_val).mean()))
-
-    self.assertAllEqual(np.ones_like(new_energy_val, dtype=np.bool),
-                        abs(old_energy_val - new_energy_val) < 1.)
-
-  def _integrator_conserves_energy_wrapper(self, event_dims):
+    event_size = np.prod(x_shape[independent_chain_ndims:])
+    feed_dict[step_size] = 0.1 / event_size
+    old_energy_, new_energy_ = sess.run([old_energy, new_energy],
+                                        feed_dict)
+    logging_ops.vlog(1, "average energy relative change: {}".format(
+        (1. - new_energy_ / old_energy_).mean()))
+    self.assertAllClose(old_energy_, new_energy_, atol=0., rtol=0.02)
+
+  def _integrator_conserves_energy_wrapper(self, independent_chain_ndims):
     """Tests the long-term energy conservation of the leapfrog integrator.
 
     The leapfrog integrator is symplectic, so for sufficiently small step
@@ -127,135 +120,218 @@ class HMCTest(test.TestCase):
     the energy of the system blowing up or collapsing.
 
     Args:
-      event_dims: A tuple of dimensions that should not be treated as
-        independent. This allows for multiple chains to be run independently
-        in parallel. Default is (), i.e., all dimensions are independent.
+      independent_chain_ndims: Python `int` scalar representing the number of
+        dims associated with independent chains.
     """
-    with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
-
-      feed_dict = {x_ph: np.zeros([50, 10, 2])}
-      self._integrator_conserves_energy(x_ph, event_dims, sess, feed_dict)
+    with self.test_session(graph=ops.Graph()) as sess:
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
+      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
+      self._integrator_conserves_energy(x_ph, independent_chain_ndims,
+                                        sess, feed_dict)
 
   def testIntegratorEnergyConservationNullShape(self):
-    self._integrator_conserves_energy_wrapper([])
+    self._integrator_conserves_energy_wrapper(0)
 
   def testIntegratorEnergyConservation1(self):
-    self._integrator_conserves_energy_wrapper([1])
+    self._integrator_conserves_energy_wrapper(1)
 
   def testIntegratorEnergyConservation2(self):
-    self._integrator_conserves_energy_wrapper([2])
-
-  def testIntegratorEnergyConservation12(self):
-    self._integrator_conserves_energy_wrapper([1, 2])
-
-  def testIntegratorEnergyConservation012(self):
-    self._integrator_conserves_energy_wrapper([0, 1, 2])
-
-  def _chain_gets_correct_expectations(self, x, event_dims, sess,
-                                       feed_dict=None):
+    self._integrator_conserves_energy_wrapper(2)
+
+  def testIntegratorEnergyConservation3(self):
+    self._integrator_conserves_energy_wrapper(3)
+
+  def testSampleChainSeedReproducibleWorksCorrectly(self):
+    with self.test_session(graph=ops.Graph()) as sess:
+      num_results = 10
+      independent_chain_ndims = 1
+
+      def log_gamma_log_prob(x):
+        event_dims = math_ops.range(independent_chain_ndims,
+                                    array_ops.rank(x))
+        return self._log_gamma_log_prob(x, event_dims)
+
+      kwargs = dict(
+          target_log_prob_fn=log_gamma_log_prob,
+          current_state=np.random.rand(4, 3, 2),
+          step_size=0.1,
+          num_leapfrog_steps=2,
+          num_burnin_steps=150,
+          seed=52,
+      )
+
+      samples0, kernel_results0 = hmc.sample_chain(
+          **dict(list(kwargs.items()) + list(dict(
+              num_results=2 * num_results,
+              num_steps_between_results=0).items())))
+
+      samples1, kernel_results1 = hmc.sample_chain(
+          **dict(list(kwargs.items()) + list(dict(
+              num_results=num_results,
+              num_steps_between_results=1).items())))
+
+      [
+          samples0_,
+          samples1_,
+          target_log_prob0_,
+          target_log_prob1_,
+      ] = sess.run([
+          samples0,
+          samples1,
+          kernel_results0.current_target_log_prob,
+          kernel_results1.current_target_log_prob,
+      ])
+      self.assertAllClose(samples0_[::2], samples1_,
+                          atol=1e-5, rtol=1e-5)
+      self.assertAllClose(target_log_prob0_[::2], target_log_prob1_,
+                          atol=1e-5, rtol=1e-5)
+
+  def _chain_gets_correct_expectations(self, x, independent_chain_ndims,
+                                       sess, feed_dict=None):
+    counter = collections.Counter()
     def log_gamma_log_prob(x):
+      counter["target_calls"] += 1
+      event_dims = math_ops.range(independent_chain_ndims,
+                                  array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
-    step_size = array_ops.placeholder(np.float32, [], name='step_size')
-    hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps')
-    hmc_n_steps = array_ops.placeholder(np.int32, [], name='hmc_n_steps')
+    num_results = array_ops.placeholder(
+        np.int32, [], name="num_results")
+    step_size = array_ops.placeholder(
+        np.float32, [], name="step_size")
+    num_leapfrog_steps = array_ops.placeholder(
+        np.int32, [], name="num_leapfrog_steps")
 
     if feed_dict is None:
       feed_dict = {}
-    feed_dict.update({step_size: 0.1,
-                      hmc_lf_steps: 2,
-                      hmc_n_steps: 300})
-
-    sample_chain, acceptance_prob_chain = hmc.chain([hmc_n_steps],
-                                                    step_size,
-                                                    hmc_lf_steps,
-                                                    x, log_gamma_log_prob,
-                                                    event_dims)
-
-    acceptance_probs, samples = sess.run([acceptance_prob_chain, sample_chain],
-                                         feed_dict)
-    samples = samples[feed_dict[hmc_n_steps] // 2:]
-    expected_x_est = samples.mean()
-    expected_exp_x_est = np.exp(samples).mean()
-
-    logging.vlog(1, 'True      E[x, exp(x)]: {}\t{}'.format(
-        self._expected_x, self._expected_exp_x))
-    logging.vlog(1, 'Estimated E[x, exp(x)]: {}\t{}'.format(
-        expected_x_est, expected_exp_x_est))
-    self.assertNear(expected_x_est, self._expected_x, 2e-2)
-    self.assertNear(expected_exp_x_est, self._expected_exp_x, 2e-2)
-    self.assertTrue((acceptance_probs > 0.5).all())
-    self.assertTrue((acceptance_probs <= 1.0).all())
-
-  def _chain_gets_correct_expectations_wrapper(self, event_dims):
-    with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
-
-      feed_dict = {x_ph: np.zeros([50, 10, 2])}
-      self._chain_gets_correct_expectations(x_ph, event_dims, sess,
-                                            feed_dict)
+    feed_dict.update({num_results: 150,
+                      step_size: 0.05,
+                      num_leapfrog_steps: 2})
+
+    samples, kernel_results = hmc.sample_chain(
+        num_results=num_results,
+        target_log_prob_fn=log_gamma_log_prob,
+        current_state=x,
+        step_size=step_size,
+        num_leapfrog_steps=num_leapfrog_steps,
+        num_burnin_steps=150,
+        seed=42)
+
+    self.assertAllEqual(dict(target_calls=2), counter)
+
+    expected_x = (math_ops.digamma(self._shape_param)
+                  - np.log(self._rate_param))
+
+    expected_exp_x = self._shape_param / self._rate_param
+
+    acceptance_probs_, samples_, expected_x_ = sess.run(
+        [kernel_results.acceptance_probs, samples, expected_x],
+        feed_dict)
+
+    actual_x = samples_.mean()
+    actual_exp_x = np.exp(samples_).mean()
+
+    logging_ops.vlog(1, "True      E[x, exp(x)]: {}\t{}".format(
+        expected_x_, expected_exp_x))
+    logging_ops.vlog(1, "Estimated E[x, exp(x)]: {}\t{}".format(
+        actual_x, actual_exp_x))
+    self.assertNear(actual_x, expected_x_, 2e-2)
+    self.assertNear(actual_exp_x, expected_exp_x, 2e-2)
+    self.assertAllEqual(np.ones_like(acceptance_probs_, np.bool),
+                        acceptance_probs_ > 0.5)
+    self.assertAllEqual(np.ones_like(acceptance_probs_, np.bool),
+                        acceptance_probs_ <= 1.)
+
+  def _chain_gets_correct_expectations_wrapper(self, independent_chain_ndims):
+    with self.test_session(graph=ops.Graph()) as sess:
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
+      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
+      self._chain_gets_correct_expectations(x_ph, independent_chain_ndims,
+                                            sess, feed_dict)
 
   def testHMCChainExpectationsNullShape(self):
-    self._chain_gets_correct_expectations_wrapper([])
+    self._chain_gets_correct_expectations_wrapper(0)
 
   def testHMCChainExpectations1(self):
-    self._chain_gets_correct_expectations_wrapper([1])
+    self._chain_gets_correct_expectations_wrapper(1)
 
   def testHMCChainExpectations2(self):
-    self._chain_gets_correct_expectations_wrapper([2])
-
-  def testHMCChainExpectations12(self):
-    self._chain_gets_correct_expectations_wrapper([1, 2])
+    self._chain_gets_correct_expectations_wrapper(2)
 
-  def _kernel_leaves_target_invariant(self, initial_draws, event_dims,
+  def _kernel_leaves_target_invariant(self, initial_draws,
+                                      independent_chain_ndims,
                                       sess, feed_dict=None):
     def log_gamma_log_prob(x):
+      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
     def fake_log_prob(x):
       """Cooled version of the target distribution."""
       return 1.1 * log_gamma_log_prob(x)
 
-    step_size = array_ops.placeholder(np.float32, [], name='step_size')
+    step_size = array_ops.placeholder(np.float32, [], name="step_size")
 
     if feed_dict is None:
       feed_dict = {}
 
     feed_dict[step_size] = 0.4
 
-    sample, acceptance_probs, _, _ = hmc.kernel(step_size, 5, initial_draws,
-                                                log_gamma_log_prob, event_dims)
-    bad_sample, bad_acceptance_probs, _, _ = hmc.kernel(
-        step_size, 5, initial_draws, fake_log_prob, event_dims)
-    (acceptance_probs_val, bad_acceptance_probs_val, initial_draws_val,
-     updated_draws_val, fake_draws_val) = sess.run([acceptance_probs,
-                                                    bad_acceptance_probs,
-                                                    initial_draws, sample,
-                                                    bad_sample], feed_dict)
+    sample, kernel_results = hmc.kernel(
+        target_log_prob_fn=log_gamma_log_prob,
+        current_state=initial_draws,
+        step_size=step_size,
+        num_leapfrog_steps=5,
+        seed=43)
+
+    bad_sample, bad_kernel_results = hmc.kernel(
+        target_log_prob_fn=fake_log_prob,
+        current_state=initial_draws,
+        step_size=step_size,
+        num_leapfrog_steps=5,
+        seed=44)
+
+    [
+        acceptance_probs_,
+        bad_acceptance_probs_,
+        initial_draws_,
+        updated_draws_,
+        fake_draws_,
+    ] = sess.run([
+        kernel_results.acceptance_probs,
+        bad_kernel_results.acceptance_probs,
+        initial_draws,
+        sample,
+        bad_sample,
+    ], feed_dict)
+
     # Confirm step size is small enough that we usually accept.
-    self.assertGreater(acceptance_probs_val.mean(), 0.5)
-    self.assertGreater(bad_acceptance_probs_val.mean(), 0.5)
+    self.assertGreater(acceptance_probs_.mean(), 0.5)
+    self.assertGreater(bad_acceptance_probs_.mean(), 0.5)
+
     # Confirm step size is large enough that we sometimes reject.
-    self.assertLess(acceptance_probs_val.mean(), 0.99)
-    self.assertLess(bad_acceptance_probs_val.mean(), 0.99)
-    _, ks_p_value_true = stats.ks_2samp(initial_draws_val.flatten(),
-                                        updated_draws_val.flatten())
-    _, ks_p_value_fake = stats.ks_2samp(initial_draws_val.flatten(),
-                                        fake_draws_val.flatten())
-    logging.vlog(1, 'acceptance rate for true target: {}'.format(
-        acceptance_probs_val.mean()))
-    logging.vlog(1, 'acceptance rate for fake target: {}'.format(
-        bad_acceptance_probs_val.mean()))
-    logging.vlog(1, 'K-S p-value for true target: {}'.format(ks_p_value_true))
-    logging.vlog(1, 'K-S p-value for fake target: {}'.format(ks_p_value_fake))
+    self.assertLess(acceptance_probs_.mean(), 0.99)
+    self.assertLess(bad_acceptance_probs_.mean(), 0.99)
+
+    _, ks_p_value_true = stats.ks_2samp(initial_draws_.flatten(),
+                                        updated_draws_.flatten())
+    _, ks_p_value_fake = stats.ks_2samp(initial_draws_.flatten(),
+                                        fake_draws_.flatten())
+
+    logging_ops.vlog(1, "acceptance rate for true target: {}".format(
+        acceptance_probs_.mean()))
+    logging_ops.vlog(1, "acceptance rate for fake target: {}".format(
+        bad_acceptance_probs_.mean()))
+    logging_ops.vlog(1, "K-S p-value for true target: {}".format(
+        ks_p_value_true))
+    logging_ops.vlog(1, "K-S p-value for fake target: {}".format(
+        ks_p_value_fake))
     # Make sure that the MCMC update hasn't changed the empirical CDF much.
     self.assertGreater(ks_p_value_true, 1e-3)
     # Confirm that targeting the wrong distribution does
     # significantly change the empirical CDF.
     self.assertLess(ks_p_value_fake, 1e-6)
 
-  def _kernel_leaves_target_invariant_wrapper(self, event_dims):
+  def _kernel_leaves_target_invariant_wrapper(self, independent_chain_ndims):
     """Tests that the kernel leaves the target distribution invariant.
 
     Draws some independent samples from the target distribution,
@@ -267,86 +343,160 @@ class HMCTest(test.TestCase):
     does change the target distribution. (And that we can detect that.)
 
     Args:
-      event_dims: A tuple of dimensions that should not be treated as
-        independent. This allows for multiple chains to be run independently
-        in parallel. Default is (), i.e., all dimensions are independent.
+      independent_chain_ndims: Python `int` scalar representing the number of
+        dims associated with independent chains.
     """
-    with self.test_session() as sess:
+    with self.test_session(graph=ops.Graph()) as sess:
       initial_draws = np.log(np.random.gamma(self._shape_param,
                                              size=[50000, 2, 2]))
       initial_draws -= np.log(self._rate_param)
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
 
       feed_dict = {x_ph: initial_draws}
 
-      self._kernel_leaves_target_invariant(x_ph, event_dims, sess,
-                                           feed_dict)
-
-  def testKernelLeavesTargetInvariantNullShape(self):
-    self._kernel_leaves_target_invariant_wrapper([])
+      self._kernel_leaves_target_invariant(x_ph, independent_chain_ndims,
+                                           sess, feed_dict)
 
   def testKernelLeavesTargetInvariant1(self):
-    self._kernel_leaves_target_invariant_wrapper([1])
+    self._kernel_leaves_target_invariant_wrapper(1)
 
   def testKernelLeavesTargetInvariant2(self):
-    self._kernel_leaves_target_invariant_wrapper([2])
+    self._kernel_leaves_target_invariant_wrapper(2)
 
-  def testKernelLeavesTargetInvariant12(self):
-    self._kernel_leaves_target_invariant_wrapper([1, 2])
+  def testKernelLeavesTargetInvariant3(self):
+    self._kernel_leaves_target_invariant_wrapper(3)
+
+  def _ais_gets_correct_log_normalizer(self, init, independent_chain_ndims,
+                                       sess, feed_dict=None):
+    counter = collections.Counter()
 
-  def _ais_gets_correct_log_normalizer(self, init, event_dims, sess,
-                                       feed_dict=None):
     def proposal_log_prob(x):
-      return math_ops.reduce_sum(-0.5 * x * x - 0.5 * np.log(2*np.pi),
-                                 event_dims)
+      counter["proposal_calls"] += 1
+      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
+      return -0.5 * math_ops.reduce_sum(x**2. + np.log(2 * np.pi),
+                                        axis=event_dims)
 
     def target_log_prob(x):
+      counter["target_calls"] += 1
+      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
     if feed_dict is None:
       feed_dict = {}
 
-    w, _, _ = hmc.ais_chain(200, 0.5, 2, init, target_log_prob,
-                            proposal_log_prob, event_dims)
+    num_steps = 200
+
+    _, ais_weights, _ = hmc.sample_annealed_importance_chain(
+        proposal_log_prob_fn=proposal_log_prob,
+        num_steps=num_steps,
+        target_log_prob_fn=target_log_prob,
+        step_size=0.5,
+        current_state=init,
+        num_leapfrog_steps=2,
+        seed=45)
+
+    # We have three calls because the calculation of `ais_weights` entails
+    # another call to the `convex_combined_log_prob_fn`. We could refactor
+    # things to avoid this, if needed (eg, b/72994218).
+    self.assertAllEqual(dict(target_calls=3, proposal_calls=3), counter)
+
+    event_shape = array_ops.shape(init)[independent_chain_ndims:]
+    event_size = math_ops.reduce_prod(event_shape)
+
+    log_true_normalizer = (
+        -self._shape_param * math_ops.log(self._rate_param)
+        + math_ops.lgamma(self._shape_param))
+    log_true_normalizer *= math_ops.cast(event_size, log_true_normalizer.dtype)
+
+    log_estimated_normalizer = (math_ops.reduce_logsumexp(ais_weights)
+                                - np.log(num_steps))
+
+    ratio_estimate_true = math_ops.exp(ais_weights - log_true_normalizer)
+    ais_weights_size = array_ops.size(ais_weights)
+    standard_error = math_ops.sqrt(
+        _reduce_variance(ratio_estimate_true)
+        / math_ops.cast(ais_weights_size, ratio_estimate_true.dtype))
+
+    [
+        ratio_estimate_true_,
+        log_true_normalizer_,
+        log_estimated_normalizer_,
+        standard_error_,
+        ais_weights_size_,
+        event_size_,
+    ] = sess.run([
+        ratio_estimate_true,
+        log_true_normalizer,
+        log_estimated_normalizer,
+        standard_error,
+        ais_weights_size,
+        event_size,
+    ], feed_dict)
+
+    logging_ops.vlog(1, "        log_true_normalizer: {}\n"
+                        "   log_estimated_normalizer: {}\n"
+                        "           ais_weights_size: {}\n"
+                        "                 event_size: {}\n".format(
+                            log_true_normalizer_,
+                            log_estimated_normalizer_,
+                            ais_weights_size_,
+                            event_size_))
+    self.assertNear(ratio_estimate_true_.mean(), 1., 4. * standard_error_)
+
+  def _ais_gets_correct_log_normalizer_wrapper(self, independent_chain_ndims):
+    """Tests that AIS yields reasonable estimates of normalizers."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
+      initial_draws = np.random.normal(size=[30, 2, 1])
+      self._ais_gets_correct_log_normalizer(
+          x_ph,
+          independent_chain_ndims,
+          sess,
+          feed_dict={x_ph: initial_draws})
 
-    w_val = sess.run(w, feed_dict)
-    init_shape = sess.run(init, feed_dict).shape
-    normalizer_multiplier = np.prod([init_shape[i] for i in event_dims])
+  def testAIS1(self):
+    self._ais_gets_correct_log_normalizer_wrapper(1)
 
-    true_normalizer = -self._shape_param * np.log(self._rate_param)
-    true_normalizer += special.gammaln(self._shape_param)
-    true_normalizer *= normalizer_multiplier
+  def testAIS2(self):
+    self._ais_gets_correct_log_normalizer_wrapper(2)
 
-    n_weights = np.prod(w_val.shape)
-    normalized_w = np.exp(w_val - true_normalizer)
-    standard_error = np.std(normalized_w) / np.sqrt(n_weights)
-    logging.vlog(1, 'True normalizer {}, estimated {}, n_weights {}'.format(
-        true_normalizer, np.log(normalized_w.mean()) + true_normalizer,
-        n_weights))
-    self.assertNear(normalized_w.mean(), 1.0, 4.0 * standard_error)
+  def testAIS3(self):
+    self._ais_gets_correct_log_normalizer_wrapper(3)
 
-  def _ais_gets_correct_log_normalizer_wrapper(self, event_dims):
-    """Tests that AIS yields reasonable estimates of normalizers."""
-    with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
+  def testSampleAIChainSeedReproducibleWorksCorrectly(self):
+    with self.test_session(graph=ops.Graph()) as sess:
+      independent_chain_ndims = 1
+      x = np.random.rand(4, 3, 2)
 
-      initial_draws = np.random.normal(size=[30, 2, 1])
-      feed_dict = {x_ph: initial_draws}
+      def proposal_log_prob(x):
+        event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
+        return -0.5 * math_ops.reduce_sum(x**2. + np.log(2 * np.pi),
+                                          axis=event_dims)
 
-      self._ais_gets_correct_log_normalizer(x_ph, event_dims, sess,
-                                            feed_dict)
+      def target_log_prob(x):
+        event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
+        return self._log_gamma_log_prob(x, event_dims)
 
-  def testAISNullShape(self):
-    self._ais_gets_correct_log_normalizer_wrapper([])
+      ais_kwargs = dict(
+          proposal_log_prob_fn=proposal_log_prob,
+          num_steps=200,
+          target_log_prob_fn=target_log_prob,
+          step_size=0.5,
+          current_state=x,
+          num_leapfrog_steps=2,
+          seed=53)
 
-  def testAIS1(self):
-    self._ais_gets_correct_log_normalizer_wrapper([1])
+      _, ais_weights0, _ = hmc.sample_annealed_importance_chain(
+          **ais_kwargs)
 
-  def testAIS2(self):
-    self._ais_gets_correct_log_normalizer_wrapper([2])
+      _, ais_weights1, _ = hmc.sample_annealed_importance_chain(
+          **ais_kwargs)
 
-  def testAIS12(self):
-    self._ais_gets_correct_log_normalizer_wrapper([1, 2])
+      [ais_weights0_, ais_weights1_] = sess.run([
+          ais_weights0, ais_weights1])
+
+      self.assertAllClose(ais_weights0_, ais_weights1_,
+                          atol=1e-5, rtol=1e-5)
 
   def testNanRejection(self):
     """Tests that an update that yields NaN potentials gets rejected.
@@ -359,86 +509,263 @@ class HMCTest(test.TestCase):
     """
     def _unbounded_exponential_log_prob(x):
       """An exponential distribution with log-likelihood NaN for x < 0."""
-      per_element_potentials = array_ops.where(x < 0,
-                                               np.nan * array_ops.ones_like(x),
-                                               -x)
+      per_element_potentials = array_ops.where(
+          x < 0.,
+          array_ops.fill(array_ops.shape(x), x.dtype.as_numpy_dtype(np.nan)),
+          -x)
       return math_ops.reduce_sum(per_element_potentials)
 
-    with self.test_session() as sess:
+    with self.test_session(graph=ops.Graph()) as sess:
       initial_x = math_ops.linspace(0.01, 5, 10)
-      updated_x, acceptance_probs, _, _ = hmc.kernel(
-          2., 5, initial_x, _unbounded_exponential_log_prob, [0])
-      initial_x_val, updated_x_val, acceptance_probs_val = sess.run(
-          [initial_x, updated_x, acceptance_probs])
-
-      logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
-      logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
-      logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))
-
-      self.assertAllEqual(initial_x_val, updated_x_val)
-      self.assertEqual(acceptance_probs_val, 0.)
+      updated_x, kernel_results = hmc.kernel(
+          target_log_prob_fn=_unbounded_exponential_log_prob,
+          current_state=initial_x,
+          step_size=2.,
+          num_leapfrog_steps=5,
+          seed=46)
+      initial_x_, updated_x_, acceptance_probs_ = sess.run(
+          [initial_x, updated_x, kernel_results.acceptance_probs])
+
+      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
+      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
+      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))
+
+      self.assertAllEqual(initial_x_, updated_x_)
+      self.assertEqual(acceptance_probs_, 0.)
 
   def testNanFromGradsDontPropagate(self):
     """Test that update with NaN gradients does not cause NaN in results."""
     def _nan_log_prob_with_nan_gradient(x):
       return np.nan * math_ops.reduce_sum(x)
 
-    with self.test_session() as sess:
+    with self.test_session(graph=ops.Graph()) as sess:
       initial_x = math_ops.linspace(0.01, 5, 10)
-      updated_x, acceptance_probs, new_log_prob, new_grad = hmc.kernel(
-          2., 5, initial_x, _nan_log_prob_with_nan_gradient, [0])
-      initial_x_val, updated_x_val, acceptance_probs_val = sess.run(
-          [initial_x, updated_x, acceptance_probs])
-
-      logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
-      logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
-      logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))
-
-      self.assertAllEqual(initial_x_val, updated_x_val)
-      self.assertEqual(acceptance_probs_val, 0.)
+      updated_x, kernel_results = hmc.kernel(
+          target_log_prob_fn=_nan_log_prob_with_nan_gradient,
+          current_state=initial_x,
+          step_size=2.,
+          num_leapfrog_steps=5,
+          seed=47)
+      initial_x_, updated_x_, acceptance_probs_ = sess.run(
+          [initial_x, updated_x, kernel_results.acceptance_probs])
+
+      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
+      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
+      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))
+
+      self.assertAllEqual(initial_x_, updated_x_)
+      self.assertEqual(acceptance_probs_, 0.)
 
       self.assertAllFinite(
-          gradients_impl.gradients(updated_x, initial_x)[0].eval())
-      self.assertTrue(
-          gradients_impl.gradients(new_grad, initial_x)[0] is None)
+          gradients_ops.gradients(updated_x, initial_x)[0].eval())
+      self.assertAllEqual([True], [g is None for g in gradients_ops.gradients(
+          kernel_results.proposed_grads_target_log_prob, initial_x)])
+      self.assertAllEqual([False], [g is None for g in gradients_ops.gradients(
+          kernel_results.proposed_grads_target_log_prob,
+          kernel_results.proposed_state)])
 
       # Gradients of the acceptance probs and new log prob are not finite.
-      _ = new_log_prob  # Prevent unused arg error.
       # self.assertAllFinite(
-      #     gradients_impl.gradients(acceptance_probs, initial_x)[0].eval())
+      #     gradients_ops.gradients(acceptance_probs, initial_x)[0].eval())
       # self.assertAllFinite(
-      #     gradients_impl.gradients(new_log_prob, initial_x)[0].eval())
+      #     gradients_ops.gradients(new_log_prob, initial_x)[0].eval())
+
+  def _testChainWorksDtype(self, dtype):
+    with self.test_session(graph=ops.Graph()) as sess:
+      states, kernel_results = hmc.sample_chain(
+          num_results=10,
+          target_log_prob_fn=lambda x: -math_ops.reduce_sum(x**2., axis=-1),
+          current_state=np.zeros(5).astype(dtype),
+          step_size=0.01,
+          num_leapfrog_steps=10,
+          seed=48)
+      states_, acceptance_probs_ = sess.run(
+          [states, kernel_results.acceptance_probs])
+      self.assertEqual(dtype, states_.dtype)
+      self.assertEqual(dtype, acceptance_probs_.dtype)
 
   def testChainWorksIn64Bit(self):
-    def log_prob(x):
-      return - math_ops.reduce_sum(x * x, axis=-1)
-    states, acceptance_probs = hmc.chain(
-        n_iterations=10,
-        step_size=np.float64(0.01),
-        n_leapfrog_steps=10,
-        initial_x=np.zeros(5).astype(np.float64),
-        target_log_prob_fn=log_prob,
-        event_dims=[-1])
-    with self.test_session() as sess:
-      states_, acceptance_probs_ = sess.run([states, acceptance_probs])
-    self.assertEqual(np.float64, states_.dtype)
-    self.assertEqual(np.float64, acceptance_probs_.dtype)
+    self._testChainWorksDtype(np.float64)
 
   def testChainWorksIn16Bit(self):
-    def log_prob(x):
-      return - math_ops.reduce_sum(x * x, axis=-1)
-    states, acceptance_probs = hmc.chain(
-        n_iterations=10,
-        step_size=np.float16(0.01),
-        n_leapfrog_steps=10,
-        initial_x=np.zeros(5).astype(np.float16),
-        target_log_prob_fn=log_prob,
-        event_dims=[-1])
-    with self.test_session() as sess:
-      states_, acceptance_probs_ = sess.run([states, acceptance_probs])
-    self.assertEqual(np.float16, states_.dtype)
-    self.assertEqual(np.float16, acceptance_probs_.dtype)
-
-
-if __name__ == '__main__':
+    self._testChainWorksDtype(np.float16)
+
+  def testChainWorksCorrelatedMultivariate(self):
+    dtype = np.float32
+    true_mean = dtype([0, 0])
+    true_cov = dtype([[1, 0.5],
+                      [0.5, 1]])
+    num_results = 2000
+    counter = collections.Counter()
+    with self.test_session(graph=ops.Graph()) as sess:
+      def target_log_prob(x, y):
+        counter["target_calls"] += 1
+        # Corresponds to unnormalized MVN.
+        # z = matmul(inv(chol(true_cov)), [x, y] - true_mean)
+        z = array_ops.stack([x, y], axis=-1) - true_mean
+        z = array_ops.squeeze(
+            gen_linalg_ops.matrix_triangular_solve(
+                np.linalg.cholesky(true_cov),
+                z[..., array_ops.newaxis]),
+            axis=-1)
+        return -0.5 * math_ops.reduce_sum(z**2., axis=-1)
+      states, _ = hmc.sample_chain(
+          num_results=num_results,
+          target_log_prob_fn=target_log_prob,
+          current_state=[dtype(-2), dtype(2)],
+          step_size=[0.5, 0.5],
+          num_leapfrog_steps=2,
+          num_burnin_steps=200,
+          num_steps_between_results=1,
+          seed=54)
+      self.assertAllEqual(dict(target_calls=2), counter)
+      states = array_ops.stack(states, axis=-1)
+      self.assertEqual(num_results, states.shape[0].value)
+      sample_mean = math_ops.reduce_mean(states, axis=0)
+      x = states - sample_mean
+      sample_cov = math_ops.matmul(x, x, transpose_a=True) / dtype(num_results)
+      [sample_mean_, sample_cov_] = sess.run([
+          sample_mean, sample_cov])
+      self.assertAllClose(true_mean, sample_mean_,
+                          atol=0.05, rtol=0.)
+      self.assertAllClose(true_cov, sample_cov_,
+                          atol=0., rtol=0.1)
+
+
+class _EnergyComputationTest(object):
+
+  def testHandlesNanFromPotential(self):
+    with self.test_session(graph=ops.Graph()) as sess:
+      x = [1, np.inf, -np.inf, np.nan]
+      target_log_prob, proposed_target_log_prob = [
+          self.dtype(x.flatten()) for x in np.meshgrid(x, x)]
+      num_chains = len(target_log_prob)
+      dummy_momentums = [-1, 1]
+      momentums = [self.dtype([dummy_momentums] * num_chains)]
+      proposed_momentums = [self.dtype([dummy_momentums] * num_chains)]
+
+      target_log_prob = ops.convert_to_tensor(target_log_prob)
+      momentums = [ops.convert_to_tensor(momentums[0])]
+      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
+      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
+
+      energy = _compute_energy_change(
+          target_log_prob,
+          momentums,
+          proposed_target_log_prob,
+          proposed_momentums,
+          independent_chain_ndims=1)
+      grads = gradients_ops.gradients(energy, momentums)
+
+      [actual_energy, grads_] = sess.run([energy, grads])
+
+      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
+      # finite otherwise.
+      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
+      self.assertAllEqual(expected_energy, actual_energy)
+
+      # Ensure gradient is finite.
+      self.assertAllEqual(np.ones_like(grads_).astype(np.bool),
+                          np.isfinite(grads_))
+
+  def testHandlesNanFromKinetic(self):
+    with self.test_session(graph=ops.Graph()) as sess:
+      x = [1, np.inf, -np.inf, np.nan]
+      momentums, proposed_momentums = [
+          [np.reshape(self.dtype(x), [-1, 1])]
+          for x in np.meshgrid(x, x)]
+      num_chains = len(momentums[0])
+      target_log_prob = np.ones(num_chains, self.dtype)
+      proposed_target_log_prob = np.ones(num_chains, self.dtype)
+
+      target_log_prob = ops.convert_to_tensor(target_log_prob)
+      momentums = [ops.convert_to_tensor(momentums[0])]
+      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
+      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
+
+      energy = _compute_energy_change(
+          target_log_prob,
+          momentums,
+          proposed_target_log_prob,
+          proposed_momentums,
+          independent_chain_ndims=1)
+      grads = gradients_ops.gradients(energy, momentums)
+
+      [actual_energy, grads_] = sess.run([energy, grads])
+
+      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
+      # finite otherwise.
+      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
+      self.assertAllEqual(expected_energy, actual_energy)
+
+      # Ensure gradient is finite.
+      g = grads_[0].reshape([len(x), len(x)])[:, 0]
+      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isfinite(g))
+
+      # The remaining gradients are nan because the momentum was itself nan or
+      # inf.
+      g = grads_[0].reshape([len(x), len(x)])[:, 1:]
+      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isnan(g))
+
+
+class EnergyComputationTest16(test.TestCase, _EnergyComputationTest):
+  dtype = np.float16
+
+
+class EnergyComputationTest32(test.TestCase, _EnergyComputationTest):
+  dtype = np.float32
+
+
+class EnergyComputationTest64(test.TestCase, _EnergyComputationTest):
+  dtype = np.float64
+
+
+class _HMCHandlesLists(object):
+
+  def testStateParts(self):
+    with self.test_session(graph=ops.Graph()) as sess:
+      dist_x = normal_lib.Normal(loc=self.dtype(0), scale=self.dtype(1))
+      dist_y = independent_lib.Independent(
+          gamma_lib.Gamma(concentration=self.dtype([1, 2]),
+                          rate=self.dtype([0.5, 0.75])),
+          reinterpreted_batch_ndims=1)
+      def target_log_prob(x, y):
+        return dist_x.log_prob(x) + dist_y.log_prob(y)
+      x0 = [dist_x.sample(seed=1), dist_y.sample(seed=2)]
+      samples, _ = hmc.sample_chain(
+          num_results=int(2e3),
+          target_log_prob_fn=target_log_prob,
+          current_state=x0,
+          step_size=0.85,
+          num_leapfrog_steps=3,
+          num_burnin_steps=int(250),
+          seed=49)
+      actual_means = [math_ops.reduce_mean(s, axis=0) for s in samples]
+      actual_vars = [_reduce_variance(s, axis=0) for s in samples]
+      expected_means = [dist_x.mean(), dist_y.mean()]
+      expected_vars = [dist_x.variance(), dist_y.variance()]
+      [
+          actual_means_,
+          actual_vars_,
+          expected_means_,
+          expected_vars_,
+      ] = sess.run([
+          actual_means,
+          actual_vars,
+          expected_means,
+          expected_vars,
+      ])
+      self.assertAllClose(expected_means_, actual_means_, atol=0.05, rtol=0.16)
+      self.assertAllClose(expected_vars_, actual_vars_, atol=0., rtol=0.25)
+
+
+class HMCHandlesLists32(_HMCHandlesLists, test.TestCase):
+  dtype = np.float32
+
+
+class HMCHandlesLists64(_HMCHandlesLists, test.TestCase):
+  dtype = np.float64
+
+
+if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc.py b/tensorflow/contrib/bayesflow/python/ops/hmc.py
index 977d42fc16..7fd5652c5c 100644
--- a/tensorflow/contrib/bayesflow/python/ops/hmc.py
+++ b/tensorflow/contrib/bayesflow/python/ops/hmc.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.
-"""
+"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,11 +23,9 @@ from tensorflow.contrib.bayesflow.python.ops.hmc_impl import *  # pylint: disabl
 from tensorflow.python.util import all_util
 
 _allowed_symbols = [
-    'chain',
-    'kernel',
-    'leapfrog_integrator',
-    'leapfrog_step',
-    'ais_chain'
+    "sample_chain",
+    "sample_annealed_importance_chain",
+    "kernel",
 ]
 
 all_util.remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
index 5685a942e9..f724910c59 100644
--- a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
@@ -14,17 +14,16 @@
 # ==============================================================================
 """Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.
 
-@@chain
-@@update
-@@leapfrog_integrator
-@@leapfrog_step
-@@ais_chain
+@@sample_chain
+@@sample_annealed_importance_chain
+@@kernel
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import numpy as np
 
 from tensorflow.python.framework import dtypes
@@ -32,168 +31,326 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_impl as gradients_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.ops.distributions import util as distributions_util
 
 __all__ = [
-    'chain',
-    'kernel',
-    'leapfrog_integrator',
-    'leapfrog_step',
-    'ais_chain'
+    "sample_chain",
+    "sample_annealed_importance_chain",
+    "kernel",
 ]
 
 
-def _make_potential_and_grad(target_log_prob_fn):
-  def potential_and_grad(x):
-    log_prob_result = -target_log_prob_fn(x)
-    grad_result = gradients_impl.gradients(math_ops.reduce_sum(log_prob_result),
-                                           x)[0]
-    return log_prob_result, grad_result
-  return potential_and_grad
-
-
-def chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
-          target_log_prob_fn, event_dims=(), name=None):
+KernelResults = collections.namedtuple(
+    "KernelResults",
+    [
+        "acceptance_probs",
+        "current_grads_target_log_prob",  # "Current result" means "accepted".
+        "current_target_log_prob",  # "Current result" means "accepted".
+        "energy_change",
+        "is_accepted",
+        "proposed_grads_target_log_prob",
+        "proposed_state",
+        "proposed_target_log_prob",
+        "random_positive",
+    ])
+
+
+def _make_dummy_kernel_results(
+    dummy_state,
+    dummy_target_log_prob,
+    dummy_grads_target_log_prob):
+  return KernelResults(
+      acceptance_probs=dummy_target_log_prob,
+      current_grads_target_log_prob=dummy_grads_target_log_prob,
+      current_target_log_prob=dummy_target_log_prob,
+      energy_change=dummy_target_log_prob,
+      is_accepted=array_ops.ones_like(dummy_target_log_prob, dtypes.bool),
+      proposed_grads_target_log_prob=dummy_grads_target_log_prob,
+      proposed_state=dummy_state,
+      proposed_target_log_prob=dummy_target_log_prob,
+      random_positive=dummy_target_log_prob,
+  )
+
+
+def sample_chain(
+    num_results,
+    target_log_prob_fn,
+    current_state,
+    step_size,
+    num_leapfrog_steps,
+    num_burnin_steps=0,
+    num_steps_between_results=0,
+    seed=None,
+    current_target_log_prob=None,
+    current_grads_target_log_prob=None,
+    name=None):
   """Runs multiple iterations of one or more Hamiltonian Monte Carlo chains.
 
-  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
-  algorithm that takes a series of gradient-informed steps to produce
-  a Metropolis proposal. This function samples from an HMC Markov
-  chain whose initial state is `initial_x` and whose stationary
-  distribution has log-density `target_log_prob_fn()`.
-
-  This function can update multiple chains in parallel. It assumes
-  that all dimensions of `initial_x` not specified in `event_dims` are
-  independent, and should therefore be updated independently. The
-  output of `target_log_prob_fn()` should sum log-probabilities across
-  all event dimensions. Slices along dimensions not in `event_dims`
-  may have different target distributions; this is up to
+  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) algorithm
+  that takes a series of gradient-informed steps to produce a Metropolis
+  proposal. This function samples from an HMC Markov chain at `current_state`
+  and whose stationary distribution has log-unnormalized-density
   `target_log_prob_fn()`.
 
-  This function basically just wraps `hmc.kernel()` in a tf.scan() loop.
+  This function samples from multiple chains in parallel. It assumes that the
+  the leftmost dimensions of (each) `current_state` (part) index an independent
+  chain.  The function `target_log_prob_fn()` sums log-probabilities across
+  event dimensions (i.e., current state (part) rightmost dimensions). Each
+  element of the output of `target_log_prob_fn()` represents the (possibly
+  unnormalized) log-probability of the joint distribution over (all) the current
+  state (parts).
 
-  Args:
-    n_iterations: Integer number of Markov chain updates to run.
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `initial_x`. Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely.
-      When possible, it's often helpful to match per-variable step
-      sizes to the standard deviations of the target distribution in
-      each variable.
-    n_leapfrog_steps: Integer number of steps to run the leapfrog
-      integrator for. Total progress per HMC step is roughly
-      proportional to step_size * n_leapfrog_steps.
-    initial_x: Tensor of initial state(s) of the Markov chain(s).
-    target_log_prob_fn: Python callable which takes an argument like `initial_x`
-      and returns its (possibly unnormalized) log-density under the target
-      distribution.
-    event_dims: List of dimensions that should not be treated as
-      independent. This allows for multiple chains to be run independently
-      in parallel. Default is (), i.e., all dimensions are independent.
-    name: Python `str` name prefixed to Ops created by this function.
+  The `current_state` can be represented as a single `Tensor` or a `list` of
+  `Tensors` which collectively represent the current state. When specifying a
+  `list`, one must also specify a list of `step_size`s.
 
-  Returns:
-    acceptance_probs: Tensor with the acceptance probabilities for each
-      iteration. Has shape matching `target_log_prob_fn(initial_x)`.
-    chain_states: Tensor with the state of the Markov chain at each iteration.
-      Has shape `[n_iterations, initial_x.shape[0],...,initial_x.shape[-1]`.
+  Note: `target_log_prob_fn` is called exactly twice.
+
+  Only one out of every `num_steps_between_samples + 1` steps is included in the
+  returned results. This "thinning" comes at a cost of reduced statistical
+  power, while reducing memory requirements and autocorrelation. For more
+  discussion see [1].
+
+  [1]: "Statistically efficient thinning of a Markov chain sampler."
+       Art B. Owen. April 2017.
+       http://statweb.stanford.edu/~owen/reports/bestthinning.pdf
 
   #### Examples:
 
-  ```python
-  # Sampling from a standard normal (note `log_joint()` is unnormalized):
-  def log_joint(x):
-    return tf.reduce_sum(-0.5 * tf.square(x))
-  chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint,
-                                      event_dims=[0])
-  # Discard first half of chain as warmup/burn-in
-  warmed_up = chain[500:]
-  mean_est = tf.reduce_mean(warmed_up, 0)
-  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
-  ```
+  ##### Sample from a diagonal-variance Gaussian.
 
   ```python
-  # Sampling from a diagonal-variance Gaussian:
-  variances = tf.linspace(1., 3., 10)
-  def log_joint(x):
-    return tf.reduce_sum(-0.5 / variances * tf.square(x))
-  chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint,
-                                      event_dims=[0])
-  # Discard first half of chain as warmup/burn-in
-  warmed_up = chain[500:]
-  mean_est = tf.reduce_mean(warmed_up, 0)
-  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
+  tfd = tf.contrib.distributions
+
+  def make_likelihood(true_variances):
+    return tfd.MultivariateNormalDiag(
+        scale_diag=tf.sqrt(true_variances))
+
+  dims = 10
+  dtype = np.float32
+  true_variances = tf.linspace(dtype(1), dtype(3), dims)
+  likelihood = make_likelihood(true_variances)
+
+  states, kernel_results = hmc.sample_chain(
+      num_results=1000,
+      target_log_prob_fn=likelihood.log_prob,
+      current_state=tf.zeros(dims),
+      step_size=0.5,
+      num_leapfrog_steps=2,
+      num_burnin_steps=500)
+
+  # Compute sample stats.
+  sample_mean = tf.reduce_mean(states, axis=0)
+  sample_var = tf.reduce_mean(
+      tf.squared_difference(states, sample_mean),
+      axis=0)
   ```
 
-  ```python
-  # Sampling from factor-analysis posteriors with known factors W:
-  # mu[i, j] ~ Normal(0, 1)
-  # x[i] ~ Normal(matmul(mu[i], W), I)
-  def log_joint(mu, x, W):
-    prior = -0.5 * tf.reduce_sum(tf.square(mu), 1)
-    x_mean = tf.matmul(mu, W)
-    likelihood = -0.5 * tf.reduce_sum(tf.square(x - x_mean), 1)
-    return prior + likelihood
-  chain, acceptance_probs = hmc.chain(1000, 0.1, 2,
-                                      tf.zeros([x.shape[0], W.shape[0]]),
-                                      lambda mu: log_joint(mu, x, W),
-                                      event_dims=[1])
-  # Discard first half of chain as warmup/burn-in
-  warmed_up = chain[500:]
-  mean_est = tf.reduce_mean(warmed_up, 0)
-  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
+  ##### Sampling from factor-analysis posteriors with known factors.
+
+  I.e.,
+
+  ```none
+  for i=1..n:
+    w[i] ~ Normal(0, eye(d))            # prior
+    x[i] ~ Normal(loc=matmul(w[i], F))  # likelihood
   ```
 
+  where `F` denotes factors.
+
   ```python
-  # Sampling from the posterior of a Bayesian regression model.:
-
-  # Run 100 chains in parallel, each with a different initialization.
-  initial_beta = tf.random_normal([100, x.shape[1]])
-  chain, acceptance_probs = hmc.chain(1000, 0.1, 10, initial_beta,
-                                      log_joint_partial, event_dims=[1])
-  # Discard first halves of chains as warmup/burn-in
-  warmed_up = chain[500:]
-  # Averaging across samples within a chain and across chains
-  mean_est = tf.reduce_mean(warmed_up, [0, 1])
-  var_est = tf.reduce_mean(tf.square(warmed_up), [0, 1]) - tf.square(mean_est)
+  tfd = tf.contrib.distributions
+
+  def make_prior(dims, dtype):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.zeros(dims, dtype))
+
+  def make_likelihood(weights, factors):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.tensordot(weights, factors, axes=[[0], [-1]]))
+
+  # Setup data.
+  num_weights = 10
+  num_factors = 4
+  num_chains = 100
+  dtype = np.float32
+
+  prior = make_prior(num_weights, dtype)
+  weights = prior.sample(num_chains)
+  factors = np.random.randn(num_factors, num_weights).astype(dtype)
+  x = make_likelihood(weights, factors).sample(num_chains)
+
+  def target_log_prob(w):
+    # Target joint is: `f(w) = p(w, x | factors)`.
+    return prior.log_prob(w) + make_likelihood(w, factors).log_prob(x)
+
+  # Get `num_results` samples from `num_chains` independent chains.
+  chains_states, kernels_results = hmc.sample_chain(
+      num_results=1000,
+      target_log_prob_fn=target_log_prob,
+      current_state=tf.zeros([num_chains, dims], dtype),
+      step_size=0.1,
+      num_leapfrog_steps=2,
+      num_burnin_steps=500)
+
+  # Compute sample stats.
+  sample_mean = tf.reduce_mean(chains_states, axis=[0, 1])
+  sample_var = tf.reduce_mean(
+      tf.squared_difference(chains_states, sample_mean),
+      axis=[0, 1])
   ```
+
+  Args:
+    num_results: Integer number of Markov chain draws.
+    target_log_prob_fn: Python callable which takes an argument like
+      `current_state` (or `*current_state` if it's a list) and returns its
+      (possibly unnormalized) log-density under the target distribution.
+    current_state: `Tensor` or Python `list` of `Tensor`s representing the
+      current state(s) of the Markov chain(s). The first `r` dimensions index
+      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
+    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
+      for the leapfrog integrator. Must broadcast with the shape of
+      `current_state`. Larger step sizes lead to faster progress, but too-large
+      step sizes make rejection exponentially more likely. When possible, it's
+      often helpful to match per-variable step sizes to the standard deviations
+      of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    num_burnin_steps: Integer number of chain steps to take before starting to
+      collect results.
+      Default value: 0 (i.e., no burn-in).
+    num_steps_between_results: Integer number of chain steps between collecting
+      a result. Only one out of every `num_steps_between_samples + 1` steps is
+      included in the returned results. This "thinning" comes at a cost of
+      reduced statistical power, while reducing memory requirements and
+      autocorrelation. For more discussion see [1].
+      Default value: 0 (i.e., no subsampling).
+    seed: Python integer to seed the random number generator.
+    current_target_log_prob: (Optional) `Tensor` representing the value of
+      `target_log_prob_fn` at the `current_state`. The only reason to specify
+      this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
+      representing gradient of `target_log_prob` at the `current_state` and wrt
+      the `current_state`. Must have same shape as `current_state`. The only
+      reason to specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_sample_chain").
+
+  Returns:
+    accepted_states: Tensor or Python list of `Tensor`s representing the
+      state(s) of the Markov chain(s) at each result step. Has same shape as
+      input `current_state` but with a prepended `num_results`-size dimension.
+    kernel_results: `collections.namedtuple` of internal calculations used to
+      advance the chain.
   """
-  with ops.name_scope(name, 'hmc_chain', [n_iterations, step_size,
-                                          n_leapfrog_steps, initial_x]):
-    initial_x = ops.convert_to_tensor(initial_x, name='initial_x')
-    non_event_shape = array_ops.shape(target_log_prob_fn(initial_x))
-
-    def body(a, _):
-      updated_x, acceptance_probs, log_prob, grad = kernel(
-          step_size, n_leapfrog_steps, a[0], target_log_prob_fn, event_dims,
-          a[2], a[3])
-      return updated_x, acceptance_probs, log_prob, grad
-
-    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
-    potential, grad = potential_and_grad(initial_x)
-    return functional_ops.scan(
-        body, array_ops.zeros(n_iterations, dtype=initial_x.dtype),
-        (initial_x,
-         array_ops.zeros(non_event_shape, dtype=initial_x.dtype),
-         -potential, -grad))[:2]
-
-
-def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
-              target_log_prob_fn, proposal_log_prob_fn, event_dims=(),
-              name=None):
+  with ops.name_scope(
+      name, "hmc_sample_chain",
+      [num_results, current_state, step_size, num_leapfrog_steps,
+       num_burnin_steps, num_steps_between_results, seed,
+       current_target_log_prob, current_grads_target_log_prob]):
+    with ops.name_scope("initialize"):
+      [
+          current_state,
+          step_size,
+          current_target_log_prob,
+          current_grads_target_log_prob,
+      ] = _prepare_args(
+          target_log_prob_fn,
+          current_state,
+          step_size,
+          current_target_log_prob,
+          current_grads_target_log_prob)
+      num_results = ops.convert_to_tensor(
+          num_results,
+          dtype=dtypes.int32,
+          name="num_results")
+      num_leapfrog_steps = ops.convert_to_tensor(
+          num_leapfrog_steps,
+          dtype=dtypes.int32,
+          name="num_leapfrog_steps")
+      num_burnin_steps = ops.convert_to_tensor(
+          num_burnin_steps,
+          dtype=dtypes.int32,
+          name="num_burnin_steps")
+      num_steps_between_results = ops.convert_to_tensor(
+          num_steps_between_results,
+          dtype=dtypes.int32,
+          name="num_steps_between_results")
+
+    def _run_chain(num_steps, current_state, kernel_results):
+      """Runs the chain(s) for `num_steps`."""
+      def _loop_body(iter_, current_state, kernel_results):
+        return [iter_ + 1] + list(kernel(
+            target_log_prob_fn,
+            current_state,
+            step_size,
+            num_leapfrog_steps,
+            seed,
+            kernel_results.current_target_log_prob,
+            kernel_results.current_grads_target_log_prob))
+      while_loop_kwargs = dict(
+          cond=lambda iter_, *args: iter_ < num_steps,
+          body=_loop_body,
+          loop_vars=[
+              np.int32(0),
+              current_state,
+              kernel_results,
+          ],
+      )
+      if seed is not None:
+        while_loop_kwargs["parallel_iterations"] = 1
+      return control_flow_ops.while_loop(
+          **while_loop_kwargs)[1:]  # Lop-off "iter_".
+
+    def _scan_body(args_list, iter_):
+      """Closure which implements `tf.scan` body."""
+      current_state, kernel_results = args_list
+      return _run_chain(
+          1 + array_ops.where(math_ops.equal(iter_, 0),
+                              num_burnin_steps,
+                              num_steps_between_results),
+          current_state,
+          kernel_results)
+
+    scan_kwargs = dict(
+        fn=_scan_body,
+        elems=math_ops.range(num_results),  # iter_: used to choose burnin.
+        initializer=[
+            current_state,
+            _make_dummy_kernel_results(
+                current_state,
+                current_target_log_prob,
+                current_grads_target_log_prob),
+        ])
+    if seed is not None:
+      scan_kwargs["parallel_iterations"] = 1
+    return functional_ops.scan(**scan_kwargs)
+
+
+def sample_annealed_importance_chain(
+    proposal_log_prob_fn,
+    num_steps,
+    target_log_prob_fn,
+    current_state,
+    step_size,
+    num_leapfrog_steps,
+    seed=None,
+    name=None):
   """Runs annealed importance sampling (AIS) to estimate normalizing constants.
 
-  This routine uses Hamiltonian Monte Carlo to sample from a series of
+  This function uses Hamiltonian Monte Carlo to sample from a series of
   distributions that slowly interpolates between an initial "proposal"
-  distribution
+  distribution:
 
   `exp(proposal_log_prob_fn(x) - proposal_log_normalizer)`
 
-  and the target distribution
+  and the target distribution:
 
   `exp(target_log_prob_fn(x) - target_log_normalizer)`,
 
@@ -202,113 +359,203 @@ def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
   normalizing constants of the initial distribution and the target
   distribution:
 
-  E[exp(w)] = exp(target_log_normalizer - proposal_log_normalizer).
+  `E[exp(ais_weights)] = exp(target_log_normalizer - proposal_log_normalizer)`.
 
-  Args:
-    n_iterations: Integer number of Markov chain updates to run. More
-      iterations means more expense, but smoother annealing between q
-      and p, which in turn means exponentially lower variance for the
-      normalizing constant estimator.
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `initial_x`. Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely.
-      When possible, it's often helpful to match per-variable step
-      sizes to the standard deviations of the target distribution in
-      each variable.
-    n_leapfrog_steps: Integer number of steps to run the leapfrog
-      integrator for. Total progress per HMC step is roughly
-      proportional to step_size * n_leapfrog_steps.
-    initial_x: Tensor of initial state(s) of the Markov chain(s). Must
-      be a sample from q, or results will be incorrect.
-    target_log_prob_fn: Python callable which takes an argument like `initial_x`
-      and returns its (possibly unnormalized) log-density under the target
-      distribution.
-    proposal_log_prob_fn: Python callable that returns the log density of the
-      initial distribution.
-    event_dims: List of dimensions that should not be treated as
-      independent. This allows for multiple chains to be run independently
-      in parallel. Default is (), i.e., all dimensions are independent.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    ais_weights: Tensor with the estimated weight(s). Has shape matching
-      `target_log_prob_fn(initial_x)`.
-    chain_states: Tensor with the state(s) of the Markov chain(s) the final
-      iteration. Has shape matching `initial_x`.
-    acceptance_probs: Tensor with the acceptance probabilities for the final
-      iteration. Has shape matching `target_log_prob_fn(initial_x)`.
+  Note: `proposal_log_prob_fn` and `target_log_prob_fn` are called exactly three
+  times (although this may be reduced to two times, in the future).
 
   #### Examples:
 
+  ##### Estimate the normalizing constant of a log-gamma distribution.
+
   ```python
-  # Estimating the normalizing constant of a log-gamma distribution:
-  def proposal_log_prob(x):
-    # Standard normal log-probability. This is properly normalized.
-    return tf.reduce_sum(-0.5 * tf.square(x) - 0.5 * np.log(2 * np.pi), 1)
-  def target_log_prob(x):
-    # Unnormalized log-gamma(2, 3) distribution.
-    # True normalizer is (lgamma(2) - 2 * log(3)) * x.shape[1]
-    return tf.reduce_sum(2. * x - 3. * tf.exp(x), 1)
+  tfd = tf.contrib.distributions
+
   # Run 100 AIS chains in parallel
-  initial_x = tf.random_normal([100, 20])
-  w, _, _ = hmc.ais_chain(1000, 0.2, 2, initial_x, target_log_prob,
-                          proposal_log_prob, event_dims=[1])
-  log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100)
+  num_chains = 100
+  dims = 20
+  dtype = np.float32
+
+  proposal = tfd.MultivatiateNormalDiag(
+     loc=tf.zeros([dims], dtype=dtype))
+
+  target = tfd.TransformedDistribution(
+    distribution=tfd.Gamma(concentration=dtype(2),
+                           rate=dtype(3)),
+    bijector=tfd.bijectors.Invert(tfd.bijectors.Exp()),
+    event_shape=[dims])
+
+  chains_state, ais_weights, kernels_results = (
+      hmc.sample_annealed_importance_chain(
+          proposal_log_prob_fn=proposal.log_prob,
+          num_steps=1000,
+          target_log_prob_fn=target.log_prob,
+          step_size=0.2,
+          current_state=proposal.sample(num_chains),
+          num_leapfrog_steps=2))
+
+  log_estimated_normalizer = (tf.reduce_logsumexp(ais_weights)
+                              - np.log(num_chains))
+  log_true_normalizer = tf.lgamma(2.) - 2. * tf.log(3.)
   ```
 
+  ##### Estimate marginal likelihood of a Bayesian regression model.
+
   ```python
-  # Estimating the marginal likelihood of a Bayesian regression model:
-  base_measure = -0.5 * np.log(2 * np.pi)
-  def proposal_log_prob(x):
-    # Standard normal log-probability. This is properly normalized.
-    return tf.reduce_sum(-0.5 * tf.square(x) + base_measure, 1)
-  def regression_log_joint(beta, x, y):
-    # This function returns a vector whose ith element is log p(beta[i], y | x).
-    # Each row of beta corresponds to the state of an independent Markov chain.
-    log_prior = tf.reduce_sum(-0.5 * tf.square(beta) + base_measure, 1)
-    means = tf.matmul(beta, x, transpose_b=True)
-    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means) +
-                                   base_measure, 1)
-    return log_prior + log_likelihood
-  def log_joint_partial(beta):
-    return regression_log_joint(beta, x, y)
+  tfd = tf.contrib.distributions
+
+  def make_prior(dims, dtype):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.zeros(dims, dtype))
+
+  def make_likelihood(weights, x):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.tensordot(weights, x, axes=[[0], [-1]]))
+
   # Run 100 AIS chains in parallel
-  initial_beta = tf.random_normal([100, x.shape[1]])
-  w, beta_samples, _ = hmc.ais_chain(1000, 0.1, 2, initial_beta,
-                                     log_joint_partial, proposal_log_prob,
-                                     event_dims=[1])
-  log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100)
+  num_chains = 100
+  dims = 10
+  dtype = np.float32
+
+  # Make training data.
+  x = np.random.randn(num_chains, dims).astype(dtype)
+  true_weights = np.random.randn(dims).astype(dtype)
+  y = np.dot(x, true_weights) + np.random.randn(num_chains)
+
+  # Setup model.
+  prior = make_prior(dims, dtype)
+  def target_log_prob_fn(weights):
+    return prior.log_prob(weights) + make_likelihood(weights, x).log_prob(y)
+
+  proposal = tfd.MultivariateNormalDiag(
+      loc=tf.zeros(dims, dtype))
+
+  weight_samples, ais_weights, kernel_results = (
+      hmc.sample_annealed_importance_chain(
+        num_steps=1000,
+        proposal_log_prob_fn=proposal.log_prob,
+        target_log_prob_fn=target_log_prob_fn
+        current_state=tf.zeros([num_chains, dims], dtype),
+        step_size=0.1,
+        num_leapfrog_steps=2))
+  log_normalizer_estimate = (tf.reduce_logsumexp(ais_weights)
+                             - np.log(num_chains))
   ```
+
+  Args:
+    proposal_log_prob_fn: Python callable that returns the log density of the
+      initial distribution.
+    num_steps: Integer number of Markov chain updates to run. More
+      iterations means more expense, but smoother annealing between q
+      and p, which in turn means exponentially lower variance for the
+      normalizing constant estimator.
+    target_log_prob_fn: Python callable which takes an argument like
+      `current_state` (or `*current_state` if it's a list) and returns its
+      (possibly unnormalized) log-density under the target distribution.
+    current_state: `Tensor` or Python `list` of `Tensor`s representing the
+      current state(s) of the Markov chain(s). The first `r` dimensions index
+      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
+    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
+      for the leapfrog integrator. Must broadcast with the shape of
+      `current_state`. Larger step sizes lead to faster progress, but too-large
+      step sizes make rejection exponentially more likely. When possible, it's
+      often helpful to match per-variable step sizes to the standard deviations
+      of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    seed: Python integer to seed the random number generator.
+    name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_sample_annealed_importance_chain").
+
+  Returns:
+    accepted_state: `Tensor` or Python list of `Tensor`s representing the
+      state(s) of the Markov chain(s) at the final iteration. Has same shape as
+      input `current_state`.
+    ais_weights: Tensor with the estimated weight(s). Has shape matching
+      `target_log_prob_fn(current_state)`.
+    kernel_results: `collections.namedtuple` of internal calculations used to
+      advance the chain.
   """
-  with ops.name_scope(name, 'hmc_ais_chain',
-                      [n_iterations, step_size, n_leapfrog_steps, initial_x]):
-    non_event_shape = array_ops.shape(target_log_prob_fn(initial_x))
-
-    beta_series = math_ops.linspace(0., 1., n_iterations+1)[1:]
-    def _body(a, beta):  # pylint: disable=missing-docstring
-      def log_prob_beta(x):
-        return ((1 - beta) * proposal_log_prob_fn(x) +
-                beta * target_log_prob_fn(x))
-      last_x = a[0]
-      w = a[2]
-      w += (1. / n_iterations) * (target_log_prob_fn(last_x) -
-                                  proposal_log_prob_fn(last_x))
-      # TODO(b/66917083): There's an opportunity for gradient reuse here.
-      updated_x, acceptance_probs, _, _ = kernel(step_size, n_leapfrog_steps,
-                                                 last_x, log_prob_beta,
-                                                 event_dims)
-      return updated_x, acceptance_probs, w
-
-    x, acceptance_probs, w = functional_ops.scan(
-        _body, beta_series,
-        (initial_x, array_ops.zeros(non_event_shape, dtype=initial_x.dtype),
-         array_ops.zeros(non_event_shape, dtype=initial_x.dtype)))
-  return w[-1], x[-1], acceptance_probs[-1]
-
-
-def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
-           x_log_prob=None, x_grad=None, name=None):
+  def make_convex_combined_log_prob_fn(iter_):
+    def _fn(*args):
+      p = proposal_log_prob_fn(*args)
+      t = target_log_prob_fn(*args)
+      dtype = p.dtype.base_dtype
+      beta = (math_ops.cast(iter_ + 1, dtype)
+              / math_ops.cast(num_steps, dtype))
+      return (1. - beta) * p + beta * t
+    return _fn
+
+  with ops.name_scope(
+      name, "hmc_sample_annealed_importance_chain",
+      [num_steps, current_state, step_size, num_leapfrog_steps, seed]):
+    with ops.name_scope("initialize"):
+      [
+          current_state,
+          step_size,
+          current_log_prob,
+          current_grads_log_prob,
+      ] = _prepare_args(
+          make_convex_combined_log_prob_fn(iter_=0),
+          current_state,
+          step_size,
+          description="convex_combined_log_prob")
+      num_steps = ops.convert_to_tensor(
+          num_steps,
+          dtype=dtypes.int32,
+          name="num_steps")
+      num_leapfrog_steps = ops.convert_to_tensor(
+          num_leapfrog_steps,
+          dtype=dtypes.int32,
+          name="num_leapfrog_steps")
+    def _loop_body(iter_, ais_weights, current_state, kernel_results):
+      """Closure which implements `tf.while_loop` body."""
+      current_state_parts = (list(current_state)
+                             if _is_list_like(current_state)
+                             else [current_state])
+      # TODO(b/72994218): Consider refactoring things to avoid this unecessary
+      # call.
+      ais_weights += ((target_log_prob_fn(*current_state_parts)
+                       - proposal_log_prob_fn(*current_state_parts))
+                      / math_ops.cast(num_steps, ais_weights.dtype))
+      return [iter_ + 1, ais_weights] + list(kernel(
+          make_convex_combined_log_prob_fn(iter_),
+          current_state,
+          step_size,
+          num_leapfrog_steps,
+          seed,
+          kernel_results.current_target_log_prob,
+          kernel_results.current_grads_target_log_prob))
+
+    while_loop_kwargs = dict(
+        cond=lambda iter_, *args: iter_ < num_steps,
+        body=_loop_body,
+        loop_vars=[
+            np.int32(0),  # iter_
+            array_ops.zeros_like(current_log_prob),  # ais_weights
+            current_state,
+            _make_dummy_kernel_results(current_state,
+                                       current_log_prob,
+                                       current_grads_log_prob),
+        ])
+    if seed is not None:
+      while_loop_kwargs["parallel_iterations"] = 1
+
+    [ais_weights, current_state, kernel_results] = control_flow_ops.while_loop(
+        **while_loop_kwargs)[1:]  # Lop-off "iter_".
+
+    return [current_state, ais_weights, kernel_results]
+
+
+def kernel(target_log_prob_fn,
+           current_state,
+           step_size,
+           num_leapfrog_steps,
+           seed=None,
+           current_target_log_prob=None,
+           current_grads_target_log_prob=None,
+           name=None):
   """Runs one iteration of Hamiltonian Monte Carlo.
 
   Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
@@ -316,334 +563,623 @@ def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
   a Metropolis proposal. This function applies one step of HMC to
   randomly update the variable `x`.
 
-  This function can update multiple chains in parallel. It assumes
-  that all dimensions of `x` not specified in `event_dims` are
-  independent, and should therefore be updated independently. The
-  output of `target_log_prob_fn()` should sum log-probabilities across
-  all event dimensions. Slices along dimensions not in `event_dims`
-  may have different target distributions; for example, if
-  `event_dims == (1,)`, then `x[0, :]` could have a different target
-  distribution from x[1, :]. This is up to `target_log_prob_fn()`.
-
-  Args:
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `x`. Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely.
-      When possible, it's often helpful to match per-variable step
-      sizes to the standard deviations of the target distribution in
-      each variable.
-    n_leapfrog_steps: Integer number of steps to run the leapfrog
-      integrator for. Total progress per HMC step is roughly
-      proportional to step_size * n_leapfrog_steps.
-    x: Tensor containing the value(s) of the random variable(s) to update.
-    target_log_prob_fn: Python callable which takes an argument like `initial_x`
-      and returns its (possibly unnormalized) log-density under the target
-      distribution.
-    event_dims: List of dimensions that should not be treated as
-      independent. This allows for multiple chains to be run independently
-      in parallel. Default is (), i.e., all dimensions are independent.
-    x_log_prob (optional): Tensor containing the cached output of a previous
-      call to `target_log_prob_fn()` evaluated at `x` (such as that provided by
-      a previous call to `kernel()`). Providing `x_log_prob` and
-      `x_grad` saves one gradient computation per call to `kernel()`.
-    x_grad (optional): Tensor containing the cached gradient of
-      `target_log_prob_fn()` evaluated at `x` (such as that provided by
-      a previous call to `kernel()`). Providing `x_log_prob` and
-      `x_grad` saves one gradient computation per call to `kernel()`.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    updated_x: The updated variable(s) x. Has shape matching `initial_x`.
-    acceptance_probs: Tensor with the acceptance probabilities for the final
-      iteration. This is useful for diagnosing step size problems etc. Has
-      shape matching `target_log_prob_fn(initial_x)`.
-    new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`.
-    new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at
-      `updated_x`.
+  This function can update multiple chains in parallel. It assumes that all
+  leftmost dimensions of `current_state` index independent chain states (and are
+  therefore updated independently). The output of `target_log_prob_fn()` should
+  sum log-probabilities across all event dimensions. Slices along the rightmost
+  dimensions may have different target distributions; for example,
+  `current_state[0, :]` could have a different target distribution from
+  `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of
+  independent chains is `tf.size(target_log_prob_fn(*current_state))`.)
 
   #### Examples:
 
+  ##### Simple chain with warm-up.
+
   ```python
+  tfd = tf.contrib.distributions
+
   # Tuning acceptance rates:
+  dtype = np.float32
   target_accept_rate = 0.631
-  def target_log_prob(x):
-    # Standard normal
-    return tf.reduce_sum(-0.5 * tf.square(x))
-  initial_x = tf.zeros([10])
-  initial_log_prob = target_log_prob(initial_x)
-  initial_grad = tf.gradients(initial_log_prob, initial_x)[0]
-  # Algorithm state
-  x = tf.Variable(initial_x, name='x')
-  step_size = tf.Variable(1., name='step_size')
-  last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob')
-  last_grad = tf.Variable(initial_grad, name='last_grad')
-  # Compute updates
-  new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x,
-                                                      target_log_prob,
-                                                      event_dims=[0],
-                                                      x_log_prob=last_log_prob)
-  x_update = tf.assign(x, new_x)
-  log_prob_update = tf.assign(last_log_prob, log_prob)
-  grad_update = tf.assign(last_grad, grad)
-  step_size_update = tf.assign(step_size,
-                               tf.where(acceptance_prob > target_accept_rate,
-                                        step_size * 1.01, step_size / 1.01))
-  adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update]
-  sampling_updates = [x_update, log_prob_update, grad_update]
-
-  sess = tf.Session()
-  sess.run(tf.global_variables_initializer())
+  num_warmup_iter = 500
+  num_chain_iter = 500
+
+  x = tf.get_variable(name="x", initializer=dtype(1))
+  step_size = tf.get_variable(name="step_size", initializer=dtype(1))
+
+  target = tfd.Normal(loc=dtype(0), scale=dtype(1))
+
+  new_x, other_results = hmc.kernel(
+      target_log_prob_fn=target.log_prob,
+      current_state=x,
+      step_size=step_size,
+      num_leapfrog_steps=3)[:4]
+
+  x_update = x.assign(new_x)
+
+  step_size_update = step_size.assign_add(
+      step_size * tf.where(
+        other_results.acceptance_probs > target_accept_rate,
+        0.01, -0.01))
+
+  warmup = tf.group([x_update, step_size_update])
+
+  tf.global_variables_initializer().run()
+
+  sess.graph.finalize()  # No more graph building.
+
   # Warm up the sampler and adapt the step size
-  for i in xrange(500):
-    sess.run(adaptive_updates)
-  # Collect samples without adapting step size
-  samples = np.zeros([500, 10])
-  for i in xrange(500):
-    x_val, _ = sess.run([new_x, sampling_updates])
-    samples[i] = x_val
-  ```
+  for _ in xrange(num_warmup_iter):
+    sess.run(warmup)
 
-  ```python
-  # Empirical-Bayes estimation of a hyperparameter by MCMC-EM:
-
-  # Problem setup
-  N = 150
-  D = 10
-  x = np.random.randn(N, D).astype(np.float32)
-  true_sigma = 0.5
-  true_beta = true_sigma * np.random.randn(D).astype(np.float32)
-  y = x.dot(true_beta) + np.random.randn(N).astype(np.float32)
-
-  def log_prior(beta, log_sigma):
-    return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) -
-                         log_sigma)
-  def regression_log_joint(beta, log_sigma, x, y):
-    # This function returns log p(beta | log_sigma) + log p(y | x, beta).
-    means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True)
-    means = tf.squeeze(means)
-    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means))
-    return log_prior(beta, log_sigma) + log_likelihood
-  def log_joint_partial(beta):
-    return regression_log_joint(beta, log_sigma, x, y)
-  # Our estimate of log(sigma)
-  log_sigma = tf.Variable(0., name='log_sigma')
-  # The state of the Markov chain
-  beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta')
-  new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial,
-                                 event_dims=[0])
-  beta_update = tf.assign(beta, new_beta)
-  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-  with tf.control_dependencies([beta_update]):
-    log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma),
-                                          var_list=[log_sigma])
-
-  sess = tf.Session()
-  sess.run(tf.global_variables_initializer())
-  log_sigma_history = np.zeros(1000)
-  for i in xrange(1000):
-    log_sigma_val, _ = sess.run([log_sigma, log_sigma_update])
-    log_sigma_history[i] = log_sigma_val
-  # Should converge to something close to true_sigma
-  plt.plot(np.exp(log_sigma_history))
+  # Collect samples without adapting step size
+  samples = np.zeros([num_chain_iter])
+  for i in xrange(num_chain_iter):
+    _, x_, target_log_prob_, grad_ = sess.run([
+        x_update,
+        x,
+        other_results.target_log_prob,
+        other_results.grads_target_log_prob])
+    samples[i] = x_
+
+  print(samples.mean(), samples.std())
   ```
-  """
-  with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]):
-    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
-    x = ops.convert_to_tensor(x, name='x')
-
-    x_shape = array_ops.shape(x)
-    m = random_ops.random_normal(x_shape, dtype=x.dtype)
-
-    kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims)
-
-    if (x_log_prob is not None) and (x_grad is not None):
-      log_potential_0, grad_0 = -x_log_prob, -x_grad  # pylint: disable=invalid-unary-operand-type
-    else:
-      if x_log_prob is not None:
-        logging.warn('x_log_prob was provided, but x_grad was not,'
-                     ' so x_log_prob was not used.')
-      if x_grad is not None:
-        logging.warn('x_grad was provided, but x_log_prob was not,'
-                     ' so x_grad was not used.')
-      log_potential_0, grad_0 = potential_and_grad(x)
-
-    new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator(
-        step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0)
-
-    kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims)
-
-    energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0
-    # Treat NaN as infinite energy (and therefore guaranteed rejection).
-    energy_change = array_ops.where(
-        math_ops.is_nan(energy_change),
-        array_ops.fill(array_ops.shape(energy_change),
-                       energy_change.dtype.as_numpy_dtype(np.inf)),
-        energy_change)
-    acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.))
-    accepted = (
-        random_ops.random_uniform(
-            array_ops.shape(acceptance_probs), dtype=x.dtype)
-        < acceptance_probs)
-    new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0)
-
-    # TODO(b/65738010): This should work, but it doesn't for now.
-    # reduced_shape = math_ops.reduced_shape(x_shape, event_dims)
-    reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims,
-                                                        keep_dims=True))
-    accepted = array_ops.reshape(accepted, reduced_shape)
-    accepted = math_ops.logical_or(
-        accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool))
-    new_x = array_ops.where(accepted, new_x, x)
-    new_grad = -array_ops.where(accepted, grad_1, grad_0)
-
-  # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect
-  # to initial_x will propagate NaNs (see testNanFromGradsDontPropagate).  This
-  # should be fixed.
-  return new_x, acceptance_probs, new_log_prob, new_grad
-
-
-def leapfrog_integrator(step_size, n_steps, initial_position, initial_momentum,
-                        potential_and_grad, initial_grad, name=None):
-  """Applies `n_steps` steps of the leapfrog integrator.
-
-  This just wraps `leapfrog_step()` in a `tf.while_loop()`, reusing
-  gradient computations where possible.
 
-  Args:
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `initial_position`. Larger step sizes lead to faster progress, but
-      too-large step sizes lead to larger discretization error and
-      worse energy conservation.
-    n_steps: Number of steps to run the leapfrog integrator.
-    initial_position: Tensor containing the value(s) of the position variable(s)
-      to update.
-    initial_momentum: Tensor containing the value(s) of the momentum variable(s)
-      to update.
-    potential_and_grad: Python callable that takes a position tensor like
-      `initial_position` and returns the potential energy and its gradient at
-      that position.
-    initial_grad: Tensor with the value of the gradient of the potential energy
-      at `initial_position`.
-    name: Python `str` name prefixed to Ops created by this function.
+  ##### Sample from more complicated posterior.
 
-  Returns:
-    updated_position: Updated value of the position.
-    updated_momentum: Updated value of the momentum.
-    new_potential: Potential energy of the new position. Has shape matching
-      `potential_and_grad(initial_position)`.
-    new_grad: Gradient from potential_and_grad() evaluated at the new position.
-      Has shape matching `initial_position`.
+  I.e.,
 
-  Example: Simple quadratic potential.
+  ```none
+    W ~ MVN(loc=0, scale=sigma * eye(dims))
+    for i=1...num_samples:
+        X[i] ~ MVN(loc=0, scale=eye(dims))
+      eps[i] ~ Normal(loc=0, scale=1)
+        Y[i] = X[i].T * W + eps[i]
+  ```
 
   ```python
-  def potential_and_grad(position):
-    return tf.reduce_sum(0.5 * tf.square(position)), position
-  position = tf.placeholder(np.float32)
-  momentum = tf.placeholder(np.float32)
-  potential, grad = potential_and_grad(position)
-  new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_integrator(
-    0.1, 3, position, momentum, potential_and_grad, grad)
-
-  sess = tf.Session()
-  position_val = np.random.randn(10)
-  momentum_val = np.random.randn(10)
-  potential_val, grad_val = sess.run([potential, grad],
-                                     {position: position_val})
-  positions = np.zeros([100, 10])
-  for i in xrange(100):
-    position_val, momentum_val, potential_val, grad_val = sess.run(
-      [new_position, new_momentum, new_potential, new_grad],
-      {position: position_val, momentum: momentum_val})
-    positions[i] = position_val
-  # Should trace out sinusoidal dynamics.
-  plt.plot(positions[:, 0])
-  ```
-  """
-  def leapfrog_wrapper(step_size, x, m, grad, l):
-    x, m, _, grad = leapfrog_step(step_size, x, m, potential_and_grad, grad)
-    return step_size, x, m, grad, l + 1
+  tfd = tf.contrib.distributions
+
+  def make_training_data(num_samples, dims, sigma):
+    dt = np.asarray(sigma).dtype
+    zeros = tf.zeros(dims, dtype=dt)
+    x = tfd.MultivariateNormalDiag(
+        loc=zeros).sample(num_samples, seed=1)
+    w = tfd.MultivariateNormalDiag(
+        loc=zeros,
+        scale_identity_multiplier=sigma).sample(seed=2)
+    noise = tfd.Normal(
+        loc=dt(0),
+        scale=dt(1)).sample(num_samples, seed=3)
+    y = tf.tensordot(x, w, axes=[[1], [0]]) + noise
+    return y, x, w
+
+  def make_prior(sigma, dims):
+    # p(w | sigma)
+    return tfd.MultivariateNormalDiag(
+        loc=tf.zeros([dims], dtype=sigma.dtype),
+        scale_identity_multiplier=sigma)
+
+  def make_likelihood(x, w):
+    # p(y | x, w)
+    return tfd.MultivariateNormalDiag(
+        loc=tf.tensordot(x, w, axes=[[1], [0]]))
+
+  # Setup assumptions.
+  dtype = np.float32
+  num_samples = 150
+  dims = 10
+  num_iters = int(5e3)
+
+  true_sigma = dtype(0.5)
+  y, x, true_weights = make_training_data(num_samples, dims, true_sigma)
+
+  # Estimate of `log(true_sigma)`.
+  log_sigma = tf.get_variable(name="log_sigma", initializer=dtype(0))
+  sigma = tf.exp(log_sigma)
+
+  # State of the Markov chain.
+  weights = tf.get_variable(
+      name="weights",
+      initializer=np.random.randn(dims).astype(dtype))
+
+  prior = make_prior(sigma, dims)
+
+  def joint_log_prob_fn(w):
+    # f(w) = log p(w, y | x)
+    return prior.log_prob(w) + make_likelihood(x, w).log_prob(y)
+
+  weights_update = weights.assign(
+      hmc.kernel(target_log_prob_fn=joint_log_prob,
+                 current_state=weights,
+                 step_size=0.1,
+                 num_leapfrog_steps=5)[0])
+
+  with tf.control_dependencies([weights_update]):
+    loss = -prior.log_prob(weights)
+
+  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+  log_sigma_update = optimizer.minimize(loss, var_list=[log_sigma])
+
+  sess.graph.finalize()  # No more graph building.
 
-  def counter_fn(a, b, c, d, counter):  # pylint: disable=unused-argument
-    return counter < n_steps
+  tf.global_variables_initializer().run()
 
-  with ops.name_scope(name, 'leapfrog_integrator',
-                      [step_size, n_steps, initial_position, initial_momentum,
-                       initial_grad]):
-    _, new_x, new_m, new_grad, _ = control_flow_ops.while_loop(
-        counter_fn, leapfrog_wrapper, [step_size, initial_position,
-                                       initial_momentum, initial_grad,
-                                       array_ops.constant(0)], back_prop=False)
-    # We're counting on the runtime to eliminate this redundant computation.
-    new_potential, new_grad = potential_and_grad(new_x)
-  return new_x, new_m, new_potential, new_grad
+  sigma_history = np.zeros(num_iters, dtype)
+  weights_history = np.zeros([num_iters, dims], dtype)
 
+  for i in xrange(num_iters):
+    _, sigma_, weights_, _ = sess.run([log_sigma_update, sigma, weights])
+    weights_history[i, :] = weights_
+    sigma_history[i] = sigma_
 
-def leapfrog_step(step_size, position, momentum, potential_and_grad, grad,
-                  name=None):
-  """Applies one step of the leapfrog integrator.
+  true_weights_ = sess.run(true_weights)
 
-  Assumes a simple quadratic kinetic energy function: 0.5 * ||momentum||^2.
+  # Should converge to something close to true_sigma.
+  plt.plot(sigma_history);
+  plt.ylabel("sigma");
+  plt.xlabel("iteration");
+  ```
 
   Args:
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `position`. Larger step sizes lead to faster progress, but
-      too-large step sizes lead to larger discretization error and
-      worse energy conservation.
-    position: Tensor containing the value(s) of the position variable(s)
-      to update.
-    momentum: Tensor containing the value(s) of the momentum variable(s)
-      to update.
-    potential_and_grad: Python callable that takes a position tensor like
-      `position` and returns the potential energy and its gradient at that
-      position.
-    grad: Tensor with the value of the gradient of the potential energy
-      at `position`.
+    target_log_prob_fn: Python callable which takes an argument like
+      `current_state` (or `*current_state` if it's a list) and returns its
+      (possibly unnormalized) log-density under the target distribution.
+    current_state: `Tensor` or Python `list` of `Tensor`s representing the
+      current state(s) of the Markov chain(s). The first `r` dimensions index
+      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
+    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
+      for the leapfrog integrator. Must broadcast with the shape of
+      `current_state`. Larger step sizes lead to faster progress, but too-large
+      step sizes make rejection exponentially more likely. When possible, it's
+      often helpful to match per-variable step sizes to the standard deviations
+      of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    seed: Python integer to seed the random number generator.
+    current_target_log_prob: (Optional) `Tensor` representing the value of
+      `target_log_prob_fn` at the `current_state`. The only reason to
+      specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
+      representing gradient of `current_target_log_prob` at the `current_state`
+      and wrt the `current_state`. Must have same shape as `current_state`. The
+      only reason to specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
     name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_kernel").
 
   Returns:
-    updated_position: Updated value of the position.
-    updated_momentum: Updated value of the momentum.
-    new_potential: Potential energy of the new position. Has shape matching
-      `potential_and_grad(position)`.
-    new_grad: Gradient from potential_and_grad() evaluated at the new position.
-      Has shape matching `position`.
+    accepted_state: Tensor or Python list of `Tensor`s representing the state(s)
+      of the Markov chain(s) at each result step. Has same shape as
+      `current_state`.
+    kernel_results: `collections.namedtuple` of internal calculations used to
+      advance the chain.
+
+  Raises:
+    ValueError: if there isn't one `step_size` or a list with same length as
+      `current_state`.
+  """
+  with ops.name_scope(
+      name, "hmc_kernel",
+      [current_state, step_size, num_leapfrog_steps, seed,
+       current_target_log_prob, current_grads_target_log_prob]):
+    with ops.name_scope("initialize"):
+      [current_state_parts, step_sizes, current_target_log_prob,
+       current_grads_target_log_prob] = _prepare_args(
+           target_log_prob_fn, current_state, step_size,
+           current_target_log_prob, current_grads_target_log_prob,
+           maybe_expand=True)
+      independent_chain_ndims = distributions_util.prefer_static_rank(
+          current_target_log_prob)
+      current_momentums = []
+      for s in current_state_parts:
+        current_momentums.append(random_ops.random_normal(
+            shape=array_ops.shape(s),
+            dtype=s.dtype.base_dtype,
+            seed=seed))
+        seed = distributions_util.gen_new_seed(
+            seed, salt="hmc_kernel_momentums")
+
+      num_leapfrog_steps = ops.convert_to_tensor(
+          num_leapfrog_steps,
+          dtype=dtypes.int32,
+          name="num_leapfrog_steps")
+    [
+        proposed_momentums,
+        proposed_state_parts,
+        proposed_target_log_prob,
+        proposed_grads_target_log_prob,
+    ] = _leapfrog_integrator(current_momentums,
+                             target_log_prob_fn,
+                             current_state_parts,
+                             step_sizes,
+                             num_leapfrog_steps,
+                             current_target_log_prob,
+                             current_grads_target_log_prob)
+
+    energy_change = _compute_energy_change(current_target_log_prob,
+                                           current_momentums,
+                                           proposed_target_log_prob,
+                                           proposed_momentums,
+                                           independent_chain_ndims)
+
+    # u < exp(min(-energy, 0)),  where u~Uniform[0,1)
+    # ==> -log(u) >= max(e, 0)
+    # ==> -log(u) >= e
+    # (Perhaps surprisingly, we don't have a better way to obtain a random
+    # uniform from positive reals, i.e., `tf.random_uniform(minval=0,
+    # maxval=np.inf)` won't work.)
+    random_uniform = random_ops.random_uniform(
+        shape=array_ops.shape(energy_change),
+        dtype=energy_change.dtype,
+        seed=seed)
+    random_positive = -math_ops.log(random_uniform)
+    is_accepted = random_positive >= energy_change
+
+    accepted_target_log_prob = array_ops.where(is_accepted,
+                                               proposed_target_log_prob,
+                                               current_target_log_prob)
+
+    accepted_state_parts = [_choose(is_accepted,
+                                    proposed_state_part,
+                                    current_state_part,
+                                    independent_chain_ndims)
+                            for current_state_part, proposed_state_part
+                            in zip(current_state_parts, proposed_state_parts)]
+
+    accepted_grads_target_log_prob = [
+        _choose(is_accepted,
+                proposed_grad,
+                grad,
+                independent_chain_ndims)
+        for proposed_grad, grad
+        in zip(proposed_grads_target_log_prob, current_grads_target_log_prob)]
+
+    maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0]
+    return [
+        maybe_flatten(accepted_state_parts),
+        KernelResults(
+            acceptance_probs=math_ops.exp(math_ops.minimum(-energy_change, 0.)),
+            current_grads_target_log_prob=accepted_grads_target_log_prob,
+            current_target_log_prob=accepted_target_log_prob,
+            energy_change=energy_change,
+            is_accepted=is_accepted,
+            proposed_grads_target_log_prob=proposed_grads_target_log_prob,
+            proposed_state=maybe_flatten(proposed_state_parts),
+            proposed_target_log_prob=proposed_target_log_prob,
+            random_positive=random_positive,
+        ),
+    ]
+
+
+def _leapfrog_integrator(current_momentums,
+                         target_log_prob_fn,
+                         current_state_parts,
+                         step_sizes,
+                         num_leapfrog_steps,
+                         current_target_log_prob=None,
+                         current_grads_target_log_prob=None,
+                         name=None):
+  """Applies `num_leapfrog_steps` of the leapfrog integrator.
+
+  Assumes a simple quadratic kinetic energy function: `0.5 ||momentum||**2`.
+
+  #### Examples:
 
-  Example: Simple quadratic potential.
+  ##### Simple quadratic potential.
 
   ```python
-  def potential_and_grad(position):
-    # Simple quadratic potential
-    return tf.reduce_sum(0.5 * tf.square(position)), position
+  tfd = tf.contrib.distributions
+
+  dims = 10
+  num_iter = int(1e3)
+  dtype = np.float32
+
   position = tf.placeholder(np.float32)
   momentum = tf.placeholder(np.float32)
-  potential, grad = potential_and_grad(position)
-  new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_step(
-    0.1, position, momentum, potential_and_grad, grad)
-
-  sess = tf.Session()
-  position_val = np.random.randn(10)
-  momentum_val = np.random.randn(10)
-  potential_val, grad_val = sess.run([potential, grad],
-                                     {position: position_val})
-  positions = np.zeros([100, 10])
-  for i in xrange(100):
-    position_val, momentum_val, potential_val, grad_val = sess.run(
-      [new_position, new_momentum, new_potential, new_grad],
-      {position: position_val, momentum: momentum_val})
-    positions[i] = position_val
-  # Should trace out sinusoidal dynamics.
-  plt.plot(positions[:, 0])
+
+  [
+      new_momentums,
+      new_positions,
+  ] = hmc._leapfrog_integrator(
+      current_momentums=[momentum],
+      target_log_prob_fn=tfd.MultivariateNormalDiag(
+          loc=tf.zeros(dims, dtype)).log_prob,
+      current_state_parts=[position],
+      step_sizes=0.1,
+      num_leapfrog_steps=3)[:2]
+
+  sess.graph.finalize()  # No more graph building.
+
+  momentum_ = np.random.randn(dims).astype(dtype)
+  position_ = np.random.randn(dims).astype(dtype)
+
+  positions = np.zeros([num_iter, dims], dtype)
+  for i in xrange(num_iter):
+    position_, momentum_ = sess.run(
+        [new_momentums[0], new_position[0]],
+        feed_dict={position: position_, momentum: momentum_})
+    positions[i] = position_
+
+  plt.plot(positions[:, 0]);  # Sinusoidal.
   ```
+
+  Args:
+    current_momentums: Tensor containing the value(s) of the momentum
+      variable(s) to update.
+    target_log_prob_fn: Python callable which takes an argument like
+      `*current_state_parts` and returns its (possibly unnormalized) log-density
+      under the target distribution.
+    current_state_parts: Python `list` of `Tensor`s representing the current
+      state(s) of the Markov chain(s). The first `independent_chain_ndims` of
+      the `Tensor`(s) index different chains.
+    step_sizes: Python `list` of `Tensor`s representing the step size for the
+      leapfrog integrator. Must broadcast with the shape of
+      `current_state_parts`.  Larger step sizes lead to faster progress, but
+      too-large step sizes make rejection exponentially more likely. When
+      possible, it's often helpful to match per-variable step sizes to the
+      standard deviations of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    current_target_log_prob: (Optional) `Tensor` representing the value of
+      `target_log_prob_fn(*current_state_parts)`. The only reason to specify
+      this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
+      representing gradient of `target_log_prob_fn(*current_state_parts`) wrt
+      `current_state_parts`. Must have same shape as `current_state_parts`. The
+      only reason to specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_leapfrog_integrator").
+
+  Returns:
+    proposed_momentums: Updated value of the momentum.
+    proposed_state_parts: Tensor or Python list of `Tensor`s representing the
+      state(s) of the Markov chain(s) at each result step. Has same shape as
+      input `current_state_parts`.
+    proposed_target_log_prob: `Tensor` representing the value of
+      `target_log_prob_fn` at `accepted_state`.
+    proposed_grads_target_log_prob: Gradient of `proposed_target_log_prob` wrt
+      `accepted_state`.
+
+  Raises:
+    ValueError: if `len(momentums) != len(state_parts)`.
+    ValueError: if `len(state_parts) != len(step_sizes)`.
+    ValueError: if `len(state_parts) != len(grads_target_log_prob)`.
+    TypeError: if `not target_log_prob.dtype.is_floating`.
   """
-  with ops.name_scope(name, 'leapfrog_step', [step_size, position, momentum,
-                                              grad]):
-    momentum -= 0.5 * step_size * grad
-    position += step_size * momentum
-    potential, grad = potential_and_grad(position)
-    momentum -= 0.5 * step_size * grad
-
-  return position, momentum, potential, grad
+  def _loop_body(step,
+                 current_momentums,
+                 current_state_parts,
+                 ignore_current_target_log_prob,  # pylint: disable=unused-argument
+                 current_grads_target_log_prob):
+    return [step + 1] + list(_leapfrog_step(current_momentums,
+                                            target_log_prob_fn,
+                                            current_state_parts,
+                                            step_sizes,
+                                            current_grads_target_log_prob))
+
+  with ops.name_scope(
+      name, "hmc_leapfrog_integrator",
+      [current_momentums, current_state_parts, step_sizes, num_leapfrog_steps,
+       current_target_log_prob, current_grads_target_log_prob]):
+    if len(current_momentums) != len(current_state_parts):
+      raise ValueError("`momentums` must be in one-to-one correspondence "
+                       "with `state_parts`")
+    num_leapfrog_steps = ops.convert_to_tensor(num_leapfrog_steps,
+                                               name="num_leapfrog_steps")
+    current_target_log_prob, current_grads_target_log_prob = (
+        _maybe_call_fn_and_grads(
+            target_log_prob_fn,
+            current_state_parts,
+            current_target_log_prob,
+            current_grads_target_log_prob))
+    return control_flow_ops.while_loop(
+        cond=lambda iter_, *args: iter_ < num_leapfrog_steps,
+        body=_loop_body,
+        loop_vars=[
+            np.int32(0),  # iter_
+            current_momentums,
+            current_state_parts,
+            current_target_log_prob,
+            current_grads_target_log_prob,
+        ],
+        back_prop=False)[1:]  # Lop-off "iter_".
+
+
+def _leapfrog_step(current_momentums,
+                   target_log_prob_fn,
+                   current_state_parts,
+                   step_sizes,
+                   current_grads_target_log_prob,
+                   name=None):
+  """Applies one step of the leapfrog integrator."""
+  with ops.name_scope(
+      name, "_leapfrog_step",
+      [current_momentums, current_state_parts, step_sizes,
+       current_grads_target_log_prob]):
+    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
+                          in zip(current_momentums,
+                                 step_sizes,
+                                 current_grads_target_log_prob)]
+    proposed_state_parts = [x + ss * m for x, ss, m
+                            in zip(current_state_parts,
+                                   step_sizes,
+                                   proposed_momentums)]
+    proposed_target_log_prob = target_log_prob_fn(*proposed_state_parts)
+    if not proposed_target_log_prob.dtype.is_floating:
+      raise TypeError("`target_log_prob_fn` must produce a `Tensor` "
+                      "with `float` `dtype`.")
+    proposed_grads_target_log_prob = gradients_ops.gradients(
+        proposed_target_log_prob, proposed_state_parts)
+    if any(g is None for g in proposed_grads_target_log_prob):
+      raise ValueError(
+          "Encountered `None` gradient. Does your target `target_log_prob_fn` "
+          "access all `tf.Variable`s via `tf.get_variable`?\n"
+          "  current_state_parts: {}\n"
+          "  proposed_state_parts: {}\n"
+          "  proposed_grads_target_log_prob: {}".format(
+              current_state_parts,
+              proposed_state_parts,
+              proposed_grads_target_log_prob))
+    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
+                          in zip(proposed_momentums,
+                                 step_sizes,
+                                 proposed_grads_target_log_prob)]
+    return [
+        proposed_momentums,
+        proposed_state_parts,
+        proposed_target_log_prob,
+        proposed_grads_target_log_prob,
+    ]
+
+
+def _compute_energy_change(current_target_log_prob,
+                           current_momentums,
+                           proposed_target_log_prob,
+                           proposed_momentums,
+                           independent_chain_ndims,
+                           name=None):
+  """Helper to `kernel` which computes the energy change."""
+  with ops.name_scope(
+      name, "compute_energy_change",
+      ([current_target_log_prob, proposed_target_log_prob,
+        independent_chain_ndims] +
+       current_momentums + proposed_momentums)):
+    # Abbreviate lk0=log_kinetic_energy and lk1=proposed_log_kinetic_energy
+    # since they're a mouthful and lets us inline more.
+    lk0, lk1 = [], []
+    for current_momentum, proposed_momentum in zip(current_momentums,
+                                                   proposed_momentums):
+      axis = math_ops.range(independent_chain_ndims,
+                            array_ops.rank(current_momentum))
+      lk0.append(_log_sum_sq(current_momentum, axis))
+      lk1.append(_log_sum_sq(proposed_momentum, axis))
+
+    lk0 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk0, axis=-1),
+                                                  axis=-1)
+    lk1 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk1, axis=-1),
+                                                  axis=-1)
+    lp0 = -current_target_log_prob   # log_potential
+    lp1 = -proposed_target_log_prob  # proposed_log_potential
+    x = array_ops.stack([lp1, math_ops.exp(lk1), -lp0, -math_ops.exp(lk0)],
+                        axis=-1)
+
+    # The sum is NaN if any element is NaN or we see both +Inf and -Inf.
+    # Thus we will replace such rows with infinite energy change which implies
+    # rejection. Recall that float-comparisons with NaN are always False.
+    is_sum_determinate = (
+        math_ops.reduce_all(math_ops.is_finite(x) | (x >= 0.), axis=-1) &
+        math_ops.reduce_all(math_ops.is_finite(x) | (x <= 0.), axis=-1))
+    is_sum_determinate = array_ops.tile(
+        is_sum_determinate[..., array_ops.newaxis],
+        multiples=array_ops.concat([
+            array_ops.ones(array_ops.rank(is_sum_determinate),
+                           dtype=dtypes.int32),
+            [4],
+        ], axis=0))
+    x = array_ops.where(is_sum_determinate,
+                        x,
+                        array_ops.fill(array_ops.shape(x),
+                                       value=x.dtype.as_numpy_dtype(np.inf)))
+
+    return math_ops.reduce_sum(x, axis=-1)
+
+
+def _choose(is_accepted,
+            accepted,
+            rejected,
+            independent_chain_ndims,
+            name=None):
+  """Helper to `kernel` which expand_dims `is_accepted` to apply tf.where."""
+  def _expand_is_accepted_like(x):
+    with ops.name_scope("_choose"):
+      expand_shape = array_ops.concat([
+          array_ops.shape(is_accepted),
+          array_ops.ones([array_ops.rank(x) - array_ops.rank(is_accepted)],
+                         dtype=dtypes.int32),
+      ], axis=0)
+      multiples = array_ops.concat([
+          array_ops.ones([array_ops.rank(is_accepted)], dtype=dtypes.int32),
+          array_ops.shape(x)[independent_chain_ndims:],
+      ], axis=0)
+      m = array_ops.tile(array_ops.reshape(is_accepted, expand_shape),
+                         multiples)
+      m.set_shape(x.shape)
+      return m
+  with ops.name_scope(name, "_choose", values=[
+      is_accepted, accepted, rejected, independent_chain_ndims]):
+    return array_ops.where(_expand_is_accepted_like(accepted),
+                           accepted,
+                           rejected)
+
+
+def _maybe_call_fn_and_grads(fn,
+                             fn_arg_list,
+                             fn_result=None,
+                             grads_fn_result=None,
+                             description="target_log_prob"):
+  """Helper which computes `fn_result` and `grads` if needed."""
+  fn_arg_list = (list(fn_arg_list) if _is_list_like(fn_arg_list)
+                 else [fn_arg_list])
+  if fn_result is None:
+    fn_result = fn(*fn_arg_list)
+  if not fn_result.dtype.is_floating:
+    raise TypeError("`{}` must be a `Tensor` with `float` `dtype`.".format(
+        description))
+  if grads_fn_result is None:
+    grads_fn_result = gradients_ops.gradients(
+        fn_result, fn_arg_list)
+  if len(fn_arg_list) != len(grads_fn_result):
+    raise ValueError("`{}` must be in one-to-one correspondence with "
+                     "`grads_{}`".format(*[description]*2))
+  if any(g is None for g in grads_fn_result):
+    raise ValueError("Encountered `None` gradient.")
+  return fn_result, grads_fn_result
+
+
+def _prepare_args(target_log_prob_fn, state, step_size,
+                  target_log_prob=None, grads_target_log_prob=None,
+                  maybe_expand=False, description="target_log_prob"):
+  """Helper which processes input args to meet list-like assumptions."""
+  state_parts = list(state) if _is_list_like(state) else [state]
+  state_parts = [ops.convert_to_tensor(s, name="state")
+                 for s in state_parts]
+  target_log_prob, grads_target_log_prob = _maybe_call_fn_and_grads(
+      target_log_prob_fn,
+      state_parts,
+      target_log_prob,
+      grads_target_log_prob,
+      description)
+  step_sizes = list(step_size) if _is_list_like(step_size) else [step_size]
+  step_sizes = [
+      ops.convert_to_tensor(
+          s, name="step_size", dtype=target_log_prob.dtype)
+      for s in step_sizes]
+  if len(step_sizes) == 1:
+    step_sizes *= len(state_parts)
+  if len(state_parts) != len(step_sizes):
+    raise ValueError("There should be exactly one `step_size` or it should "
+                     "have same length as `current_state`.")
+  maybe_flatten = lambda x: x if maybe_expand or _is_list_like(state) else x[0]
+  return [
+      maybe_flatten(state_parts),
+      maybe_flatten(step_sizes),
+      target_log_prob,
+      grads_target_log_prob,
+  ]
+
+
+def _is_list_like(x):
+  """Helper which returns `True` if input is `list`-like."""
+  return isinstance(x, (tuple, list))
+
+
+def _log_sum_sq(x, axis=None):
+  """Computes log(sum(x**2))."""
+  return math_ops.reduce_logsumexp(2. * math_ops.log(math_ops.abs(x)), axis)
-- 
GitLab


From 2139e6ee7c181db11bec61742336304017cbbd59 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Tue, 6 Feb 2018 14:37:21 -0800
Subject: [PATCH 733/966] [TF Ops] Bugfix to Operation initializer: error
 message uses node_def.

self.node_def may not yet be accessible when using the C api.

PiperOrigin-RevId: 184742074
---
 tensorflow/python/framework/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 1fdaf9664c..4d7dcdbee1 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -1618,7 +1618,7 @@ class Operation(object):
           for i, x in zip(inputs, input_types)):
         raise TypeError("In op '%s', input types (%s) are not compatible "
                         "with expected types (%s)" %
-                        (self.node_def.name, [i.dtype for i in inputs],
+                        (node_def.name, [i.dtype for i in inputs],
                          input_types))
 
     # Build the list of control inputs.
-- 
GitLab


From 340a9b0b02caa7ba2ccca97205ae17c48372d391 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Tue, 6 Feb 2018 14:43:18 -0800
Subject: [PATCH 734/966] Handles possible infinite recursion in while loop
 fix.

PiperOrigin-RevId: 184743192
---
 tensorflow/python/ops/control_flow_ops.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 3a5c2f210a..9ae9a71e4b 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -2477,8 +2477,11 @@ class WhileContext(ControlFlowContext):
     if external_inputs:
       # Use an identity to pull control inputs as data inputs. Note that we
       # ignore ops which don't have outputs. TODO(apassos): fix that
-      external_inputs = [array_ops.identity(x.outputs[0]).op
-                         for x in external_inputs if x.outputs]
+      with ops.control_dependencies(None):
+        self.Enter()
+        external_inputs = [array_ops.identity(x.outputs[0]).op
+                           for x in external_inputs if x.outputs]
+        self.Exit()
       op._add_control_inputs(external_inputs)  # pylint: disable=protected-access
     if self._outer_context or not util.IsLoopExit(op):
       op.graph.prevent_fetching(op)
-- 
GitLab


From af46e2c2e8a060ce2d6d315d77e2a26ff23d5b05 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 6 Feb 2018 15:09:51 -0800
Subject: [PATCH 735/966] Creates tf.contrib.feature_column module.

PiperOrigin-RevId: 184747924
---
 tensorflow/BUILD                              |  1 +
 tensorflow/contrib/BUILD                      |  1 +
 tensorflow/contrib/__init__.py                |  1 +
 tensorflow/contrib/cmake/python_modules.txt   |  3 ++
 tensorflow/contrib/cmake/tf_tests.cmake       |  1 +
 tensorflow/contrib/feature_column/BUILD       | 37 +++++++++++++++++++
 tensorflow/contrib/feature_column/__init__.py | 30 +++++++++++++++
 .../sequential_feature_column.py              | 19 ++++++++++
 8 files changed, 93 insertions(+)
 create mode 100644 tensorflow/contrib/feature_column/BUILD
 create mode 100644 tensorflow/contrib/feature_column/__init__.py
 create mode 100644 tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index bda0a83af3..e89667cbfd 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -469,6 +469,7 @@ filegroup(
         "//tensorflow/contrib/factorization:all_files",
         "//tensorflow/contrib/factorization/examples:all_files",
         "//tensorflow/contrib/factorization/kernels:all_files",
+        "//tensorflow/contrib/feature_column:all_files",
         "//tensorflow/contrib/ffmpeg:all_files",
         "//tensorflow/contrib/ffmpeg/default:all_files",
         "//tensorflow/contrib/framework:all_files",
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index ac6f01365b..0451f00629 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -37,6 +37,7 @@ py_library(
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow/contrib/estimator:estimator_py",
         "//tensorflow/contrib/factorization:factorization_py",
+        "//tensorflow/contrib/feature_column:feature_column_py",
         "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/fused_conv:fused_conv_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 8f6a3cb1ca..46b579b889 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -33,6 +33,7 @@ from tensorflow.contrib import deprecated
 from tensorflow.contrib import distributions
 from tensorflow.contrib import estimator
 from tensorflow.contrib import factorization
+from tensorflow.contrib import feature_column
 from tensorflow.contrib import framework
 from tensorflow.contrib import gan
 from tensorflow.contrib import graph_editor
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 9ce8b3cc9c..ad8c995eef 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -174,6 +174,9 @@ tensorflow/contrib/factorization/kernels
 tensorflow/contrib/factorization/ops
 tensorflow/contrib/factorization/python
 tensorflow/contrib/factorization/python/ops
+tensorflow/contrib/feature_column
+tensorflow/contrib/feature_column/python
+tensorflow/contrib/feature_column/python/feature_column
 tensorflow/contrib/ffmpeg
 tensorflow/contrib/ffmpeg/default
 tensorflow/contrib/framework
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 2e79eadf7f..f9a7e6e8b9 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -156,6 +156,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/contrib/coder/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/data/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/factorization/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/feature_column/python/feature_column/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/image/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/python/kernel_tests/*_test.py"
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
new file mode 100644
index 0000000000..6fc053759c
--- /dev/null
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -0,0 +1,37 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "feature_column_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":sequential_feature_column",
+    ],
+)
+
+py_library(
+    name = "sequential_feature_column",
+    srcs = ["python/feature_column/sequential_feature_column.py"],
+    srcs_version = "PY2AND3",
+    deps = [],
+)
diff --git a/tensorflow/contrib/feature_column/__init__.py b/tensorflow/contrib/feature_column/__init__.py
new file mode 100644
index 0000000000..6da7b12693
--- /dev/null
+++ b/tensorflow/contrib/feature_column/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental utilities for tf.feature_column."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,line-too-long,wildcard-import
+from tensorflow.contrib.feature_column.python.feature_column.sequential_feature_column import *
+
+from tensorflow.python.util.all_util import remove_undocumented
+# pylint: enable=unused-import,line-too-long,wildcard-import
+
+_allowed_symbols = [
+]
+
+remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py
new file mode 100644
index 0000000000..690a44ff43
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py
@@ -0,0 +1,19 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental methods for tf.feature_column sequential input."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
-- 
GitLab


From 956fb325c72be9ac3c0c9d1f42ab7aad37d3223c Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Tue, 6 Feb 2018 15:13:41 -0800
Subject: [PATCH 736/966] Add read resource variable benchmarks.

Initial benchmarks:
entry {
  name: "MicroBenchmarks.benchmark_read_variable_op_2_by_2_CPU"
  iters: 30000
  wall_time: 22.9616721471
  extras {
    key: "examples_per_sec"
    value {
      double_value: 43550.8352176
    }
  }
}

entry {
  name: "MicroBenchmarks.benchmark_read_variable_op_with_tape_2_by_2_CPU"
  iters: 30000
  wall_time: 27.3616631826
  extras {
    key: "examples_per_sec"
    value {
      double_value: 36547.4859232
    }
  }
}

PiperOrigin-RevId: 184748548
---
 tensorflow/python/eager/benchmarks_test.py | 35 ++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 7a60bd68e4..9aceaba4d5 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 
 CPU = "/device:CPU:0"
 GPU = "/device:GPU:0"
@@ -271,6 +272,14 @@ class MicroBenchmarks(test.Benchmark):
     func = lambda: f(m, m, transpose_b)
     self._run(func, num_iters)
 
+  def _benchmark_read_variable(self, m, num_iters):
+    self._run(m.value, num_iters)
+
+  def _benchmark_read_variable_with_tape(self, m, num_iters):
+    with backprop.GradientTape() as tape:
+      tape.watch(m)
+      self._run(m.value, num_iters)
+
   # Benchmarks for A^2, A of dimension 2 by 2.
   def benchmark_np_matmul_2_by_2(self):
     self._benchmark_np_matmul(
@@ -407,6 +416,32 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_defun_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_read_variable_op_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      self._benchmark_read_variable(m, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_read_variable_op_2_by_2_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      self._benchmark_read_variable(m, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_read_variable_op_with_tape_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      self._benchmark_read_variable_with_tape(
+          m, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_read_variable_op_with_tape_2_by_2_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      self._benchmark_read_variable_with_tape(
+          m, num_iters=self._num_iters_2_by_2)
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 901d119b938d9ff4239f27fbede488ae3d05d598 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 6 Feb 2018 15:30:40 -0800
Subject: [PATCH 737/966] [XLA] Add and use new Literal::MakeTupleOwned
 overload.

Previously MakeTupleOwned was cumbersome to use, because you had to
explicitly materialize a vector<unique_ptr<Literal>>.  With this new
overload, you can pass unique_ptrs directly.

PiperOrigin-RevId: 184751119
---
 tensorflow/compiler/xla/literal_util.cc | 12 +++++++++---
 tensorflow/compiler/xla/literal_util.h  | 21 +++++++++++++++++++++
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 89279b659c..09db011719 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -1257,11 +1257,17 @@ string Literal::ToString(bool print_layout) const {
 
 /* static */ std::unique_ptr<Literal> Literal::MakeTupleOwned(
     std::vector<std::unique_ptr<Literal>> elements) {
-  std::vector<const Literal*> element_ptrs;
+  std::vector<Shape> element_shapes;
+  element_shapes.reserve(elements.size());
   for (const auto& element : elements) {
-    element_ptrs.push_back(element.get());
+    element_shapes.push_back(element->shape());
+  }
+  auto literal = MakeUnique<Literal>(ShapeUtil::MakeTupleShape(element_shapes));
+  for (int64 i = 0; i < elements.size(); ++i) {
+    TF_CHECK_OK(
+        literal->MoveFrom(std::move(*elements[i]), /*dest_shape_index=*/{i}));
   }
-  return MakeTuple(element_ptrs);
+  return literal;
 }
 
 void Literal::EachCellAsString(
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index 2b68b8f177..d996004888 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -485,6 +485,27 @@ class Literal {
   static std::unique_ptr<Literal> MakeTupleOwned(
       std::vector<std::unique_ptr<Literal>> elements);
 
+  // This overload lets you pass a braced list of unique_ptr<Literal>s to
+  // MakeTupleOwned:
+  //
+  //   Literal::MakeTupleOwned(Literal::CreateR1(...), ...).
+  //
+  // Simply relying on the MakeTupleOwned(std::vector<unique_ptr<Literal>>)
+  // overload doesn't work because std::initializer_list's elements are always
+  // const.
+  //
+  // The arguments to this function must all be unique_ptr<Literal>.
+  template <typename... Ts>
+  static std::unique_ptr<Literal> MakeTupleOwned(
+      std::unique_ptr<Ts>... elements) {
+    std::array<std::unique_ptr<Literal>, sizeof...(Ts)> arr{
+        std::move(elements)...};
+    std::vector<std::unique_ptr<Literal>> v;
+    v.insert(v.begin(), std::make_move_iterator(arr.begin()),
+             std::make_move_iterator(arr.end()));
+    return MakeTupleOwned(std::move(v));
+  }
+
   // Returns a string representation of the literal value.
   // Warning: this function can take minutes for multi-million element Literals.
   string ToString(bool print_layout = false) const;
-- 
GitLab


From 0282f080fa0555a0f4af1a0293e529327d09b153 Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Tue, 6 Feb 2018 16:25:59 -0800
Subject: [PATCH 738/966] Address Adagrad/RMSProp incompatibility with CudnnRNN

CudnnRNN layers have variables of unknown shapes, which Adagrad/RMSProp didn't handle before.

This fixes 6620(#6620).

PiperOrigin-RevId: 184759579
---
 .../python/kernel_tests/cudnn_rnn_test.py     | 53 +++++++++++++++++++
 tensorflow/python/training/rmsprop.py         |  9 +++-
 tensorflow/python/training/slot_creator.py    |  3 ++
 3 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 49d305cb0d..9897c31a98 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -51,7 +51,11 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import adagrad
+from tensorflow.python.training import adam
 from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import momentum
+from tensorflow.python.training import rmsprop
 from tensorflow.python.training import saver as saver_lib
 
 
@@ -316,6 +320,55 @@ class CudnnRNNTestBasic(TensorFlowTestCase):
       self.assertEqual(0, total_sum2_v)
       self.assertEqual(0, total_sum3_v)
 
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testOptimizersSupport(self):
+    for opt in ("adagrad", "adam", "rmsprop", "momentum", "sgd"):
+      self._TestOptimizerSupportHelper(opt)
+
+  def _GetOptimizer(self, opt):
+    if opt == "adagrad":
+      return adagrad.AdagradOptimizer(learning_rate=1e-2)
+    elif opt == "adam":
+      return adam.AdamOptimizer(learning_rate=1e-2)
+    elif opt == "rmsprop":
+      return rmsprop.RMSPropOptimizer(learning_rate=1e-2)
+    elif opt == "momentum":
+      return momentum.MomentumOptimizer(learning_rate=1e-2, momentum=0.9)
+    elif opt == "sgd":
+      return gradient_descent.GradientDescentOptimizer(learning_rate=1e-2)
+    else:
+      raise ValueError("Unsupported optimizer: %s" % opt)
+
+  def _TestOptimizerSupportHelper(self, opt):
+    num_layers = 4
+    num_units = 2
+    batch_size = 8
+    direction = CUDNN_RNN_UNIDIRECTION
+    dir_count = 1
+
+    with ops.Graph().as_default() as g:
+      kernel_initializer = init_ops.constant_initializer(0.)
+      bias_initializer = init_ops.constant_initializer(0.)
+      inputs = random_ops.random_uniform([
+          num_layers * dir_count, batch_size, num_units], dtype=dtypes.float32)
+
+      lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units,
+                                 direction=direction,
+                                 kernel_initializer=kernel_initializer,
+                                 bias_initializer=bias_initializer,
+                                 name="awesome_lstm")
+      outputs, _ = lstm(inputs)
+      loss = math_ops.reduce_sum(outputs)
+      optimizer = self._GetOptimizer(opt)
+      train_op = optimizer.minimize(loss)
+
+    with self.test_session(use_gpu=True, graph=g) as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(train_op)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
   def testSaveableGraphDeviceAssignment(self):
     num_layers = 4
     num_units = 2
diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py
index 89d1099a49..341b970c92 100644
--- a/tensorflow/python/training/rmsprop.py
+++ b/tensorflow/python/training/rmsprop.py
@@ -42,6 +42,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
@@ -107,9 +108,13 @@ class RMSPropOptimizer(optimizer.Optimizer):
 
   def _create_slots(self, var_list):
     for v in var_list:
-      init_rms = init_ops.ones_initializer(dtype=v.dtype)
+      if v.get_shape().is_fully_defined():
+        init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype)
+      else:
+        init_rms = array_ops.ones_like(v)
       self._get_or_make_slot_with_initializer(v, init_rms, v.get_shape(),
-                                              v.dtype, "rms", self._name)
+                                              v.dtype.base_dtype, "rms",
+                                              self._name)
       if self._centered:
         self._zeros_slot(v, "mg", self._name)
       self._zeros_slot(v, "momentum", self._name)
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index ea28b5ddfc..731fe34273 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -60,6 +60,9 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
   # scope.
   current_partitioner = variable_scope.get_variable_scope().partitioner
   variable_scope.get_variable_scope().set_partitioner(None)
+  # When init from val instead of callable initializer, the shape is expected to
+  # be None, not <unknown> or any fully defined shape.
+  shape = shape if callable(val) else None
   slot = variable_scope.get_variable(
       scope, initializer=val, trainable=False,
       use_resource=_is_resource(primary),
-- 
GitLab


From 87b5c8f011324384e4f4916d22f75b3c4bd7d7b1 Mon Sep 17 00:00:00 2001
From: Andrew Selle <aselle@google.com>
Date: Tue, 6 Feb 2018 17:10:31 -0800
Subject: [PATCH 739/966] Sync the opensource and non-opensource build

PiperOrigin-RevId: 184765632
---
 tensorflow/contrib/lite/BUILD | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 5f89cbbe43..d303154cc2 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -106,6 +106,7 @@ cc_library(
     srcs = [
         "allocation.cc",
         "error_reporter.cc",
+        "graph_info.cc",
         "interpreter.cc",
         "model.cc",
         "nnapi_delegate.cc",
@@ -115,6 +116,7 @@ cc_library(
         "allocation.h",
         "context.h",
         "error_reporter.h",
+        "graph_info.h",
         "interpreter.h",
         "model.h",
         "nnapi_delegate.h",
@@ -171,6 +173,19 @@ cc_test(
     ],
 )
 
+# Test graph utils
+cc_test(
+    name = "graph_info_test",
+    size = "small",
+    srcs = ["graph_info_test.cc"],
+    deps = [
+        ":framework",
+        ":string_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 # Test arena allocator
 cc_test(
     name = "simple_memory_arena_test",
-- 
GitLab


From 9126444b41b243ca9bc2359d8e91a05fc0039e71 Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Tue, 6 Feb 2018 17:32:50 -0800
Subject: [PATCH 740/966] Fix memory tracking in the case where temp memory is
 used as output memory. Track all persistent memory allocation in
 allocate_persistent call except for variables and queues where persistent
 memory is tracked in variables ops and queue ops. Deallocation of persistent
 memory is ignored.

PiperOrigin-RevId: 184768231
---
 tensorflow/c/eager/runtime.cc                 |  2 +-
 .../core/common_runtime/bfc_allocator.cc      |  6 +-
 .../core/common_runtime/bfc_allocator.h       |  6 +-
 tensorflow/core/common_runtime/executor.cc    |  2 +-
 .../common_runtime/gpu/gpu_debug_allocator.cc | 20 +++--
 .../common_runtime/gpu/gpu_debug_allocator.h  | 10 +--
 .../core/common_runtime/gpu/process_state.h   |  4 +-
 tensorflow/core/framework/allocator.cc        |  2 +-
 tensorflow/core/framework/allocator.h         | 18 ++--
 tensorflow/core/framework/op_kernel.cc        | 85 ++++++++++++++++---
 tensorflow/core/framework/op_kernel.h         | 33 ++++---
 .../core/framework/tracking_allocator.cc      |  6 +-
 .../core/framework/tracking_allocator.h       |  8 +-
 .../core/framework/tracking_allocator_test.cc |  4 +-
 tensorflow/core/kernels/assign_op.h           |  3 +
 .../core/kernels/reduction_ops_common.h       |  3 -
 16 files changed, 145 insertions(+), 67 deletions(-)

diff --git a/tensorflow/c/eager/runtime.cc b/tensorflow/c/eager/runtime.cc
index 12abfcba2f..f77a937f1f 100644
--- a/tensorflow/c/eager/runtime.cc
+++ b/tensorflow/c/eager/runtime.cc
@@ -315,7 +315,7 @@ Status KernelAndDevice::Run(std::vector<Tensor>* input_tensors,
       allocator_pair.second->GetRecordsAndUnRef();
     }
     auto* ms = stats->mutable_memory_stats();
-    ms->set_temp_memory_size(context.temp_memory_size());
+    ms->set_temp_memory_size(context.temp_memory_allocated());
     for (const auto& alloc_id : context.persistent_alloc_ids()) {
       ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
     }
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 63594e83fa..e9f839289a 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -521,7 +521,7 @@ void BFCAllocator::AddAllocVisitor(Visitor visitor) {
 
 bool BFCAllocator::TracksAllocationSizes() { return true; }
 
-size_t BFCAllocator::RequestedSize(void* ptr) {
+size_t BFCAllocator::RequestedSize(const void* ptr) {
   mutex_lock l(lock_);
   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
   CHECK(h != kInvalidChunkHandle)
@@ -530,7 +530,7 @@ size_t BFCAllocator::RequestedSize(void* ptr) {
   return c->requested_size;
 }
 
-size_t BFCAllocator::AllocatedSize(void* ptr) {
+size_t BFCAllocator::AllocatedSize(const void* ptr) {
   mutex_lock l(lock_);
   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
   CHECK(h != kInvalidChunkHandle)
@@ -539,7 +539,7 @@ size_t BFCAllocator::AllocatedSize(void* ptr) {
   return c->size;
 }
 
-int64 BFCAllocator::AllocationId(void* ptr) {
+int64 BFCAllocator::AllocationId(const void* ptr) {
   mutex_lock l(lock_);
   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
   CHECK(h != kInvalidChunkHandle)
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 9353997753..b8e773503c 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -62,11 +62,11 @@ class BFCAllocator : public VisitableAllocator {
 
   bool TracksAllocationSizes() override;
 
-  size_t RequestedSize(void* ptr) override;
+  size_t RequestedSize(const void* ptr) override;
 
-  size_t AllocatedSize(void* ptr) override;
+  size_t AllocatedSize(const void* ptr) override;
 
-  int64 AllocationId(void* ptr) override;
+  int64 AllocationId(const void* ptr) override;
 
   void GetStats(AllocatorStats* stats) override;
 
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index e3416da988..6998cbecee 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -172,7 +172,7 @@ void SetMemory(NodeExecStatsWrapper* stats, OpKernelContext* ctx) {
     stats->AddAllocation(allocator_pair.first, allocator_pair.second);
   }
   auto* ms = stats->stats()->mutable_memory_stats();
-  ms->set_temp_memory_size(ctx->temp_memory_size());
+  ms->set_temp_memory_size(ctx->temp_memory_allocated());
   for (const auto& alloc_id : ctx->persistent_alloc_ids()) {
     ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index cd29a5c50b..63ed0b8be1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -121,18 +121,20 @@ void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) {
 
 bool GPUDebugAllocator::TracksAllocationSizes() { return true; }
 
-size_t GPUDebugAllocator::RequestedSize(void* ptr) {
-  auto req_size =
-      base_allocator_->RequestedSize(static_cast<char*>(ptr) - MASK_BYTES);
+size_t GPUDebugAllocator::RequestedSize(const void* ptr) {
+  auto req_size = base_allocator_->RequestedSize(static_cast<const char*>(ptr) -
+                                                 MASK_BYTES);
   return req_size - 2 * MASK_BYTES;
 }
 
-size_t GPUDebugAllocator::AllocatedSize(void* ptr) {
-  return base_allocator_->AllocatedSize(static_cast<char*>(ptr) - MASK_BYTES);
+size_t GPUDebugAllocator::AllocatedSize(const void* ptr) {
+  return base_allocator_->AllocatedSize(static_cast<const char*>(ptr) -
+                                        MASK_BYTES);
 }
 
-int64 GPUDebugAllocator::AllocationId(void* ptr) {
-  return base_allocator_->AllocationId(static_cast<char*>(ptr) - MASK_BYTES);
+int64 GPUDebugAllocator::AllocationId(const void* ptr) {
+  return base_allocator_->AllocationId(static_cast<const char*>(ptr) -
+                                       MASK_BYTES);
 }
 
 void GPUDebugAllocator::GetStats(AllocatorStats* stats) {
@@ -201,11 +203,11 @@ void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) {
   return base_allocator_->AddFreeVisitor(visitor);
 }
 
-size_t GPUNanResetAllocator::RequestedSize(void* ptr) {
+size_t GPUNanResetAllocator::RequestedSize(const void* ptr) {
   return base_allocator_->RequestedSize(ptr);
 }
 
-size_t GPUNanResetAllocator::AllocatedSize(void* ptr) {
+size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) {
   return base_allocator_->AllocatedSize(ptr);
 }
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 139fa2847e..adce3a8436 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -42,9 +42,9 @@ class GPUDebugAllocator : public VisitableAllocator {
   void AddAllocVisitor(Visitor visitor) override;
   void AddFreeVisitor(Visitor visitor) override;
   bool TracksAllocationSizes() override;
-  size_t RequestedSize(void* ptr) override;
-  size_t AllocatedSize(void* ptr) override;
-  int64 AllocationId(void* ptr) override;
+  size_t RequestedSize(const void* ptr) override;
+  size_t AllocatedSize(const void* ptr) override;
+  int64 AllocationId(const void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
   void ClearStats() override;
 
@@ -73,8 +73,8 @@ class GPUNanResetAllocator : public VisitableAllocator {
   void DeallocateRaw(void* ptr) override;
   void AddAllocVisitor(Visitor visitor) override;
   void AddFreeVisitor(Visitor visitor) override;
-  size_t RequestedSize(void* ptr) override;
-  size_t AllocatedSize(void* ptr) override;
+  size_t RequestedSize(const void* ptr) override;
+  size_t AllocatedSize(const void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
   void ClearStats() override;
 
diff --git a/tensorflow/core/common_runtime/gpu/process_state.h b/tensorflow/core/common_runtime/gpu/process_state.h
index abe458f685..f6e2349673 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.h
+++ b/tensorflow/core/common_runtime/gpu/process_state.h
@@ -155,8 +155,8 @@ class RecordingAllocator : public Allocator {
     a_->DeallocateRaw(p);
   }
   bool TracksAllocationSizes() override { return a_->TracksAllocationSizes(); }
-  size_t RequestedSize(void* p) override { return a_->RequestedSize(p); }
-  size_t AllocatedSize(void* p) override { return a_->AllocatedSize(p); }
+  size_t RequestedSize(const void* p) override { return a_->RequestedSize(p); }
+  size_t AllocatedSize(const void* p) override { return a_->AllocatedSize(p); }
   void GetStats(AllocatorStats* stats) override { a_->GetStats(stats); }
   void ClearStats() override { a_->ClearStats(); }
   ProcessState::MDMap* mm_;  // not owned
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 2bd19663fc..94bf34afa4 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -113,7 +113,7 @@ class CPUAllocator : public Allocator {
     stats_.max_alloc_size = 0;
   }
 
-  size_t AllocatedSizeSlow(void* ptr) override {
+  size_t AllocatedSizeSlow(const void* ptr) override {
     return port::MallocExtension_GetAllocatedSize(ptr);
   }
 
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 5a95d3a15d..3ce1b61246 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -156,7 +156,7 @@ class Allocator {
   //
   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
   // allocated by this allocator.
-  virtual size_t RequestedSize(void* ptr) {
+  virtual size_t RequestedSize(const void* ptr) {
     CHECK(false) << "allocator doesn't track sizes";
     return size_t(0);
   }
@@ -169,7 +169,7 @@ class Allocator {
   //
   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
   // allocated by this allocator.
-  virtual size_t AllocatedSize(void* ptr) { return RequestedSize(ptr); }
+  virtual size_t AllocatedSize(const void* ptr) { return RequestedSize(ptr); }
 
   // Returns either 0 or an identifier assigned to the buffer at 'ptr'
   // when the buffer was returned by AllocateRaw. If non-zero, the
@@ -180,7 +180,7 @@ class Allocator {
   //
   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
   // allocated by this allocator.
-  virtual int64 AllocationId(void* ptr) { return 0; }
+  virtual int64 AllocationId(const void* ptr) { return 0; }
 
   // Returns the allocated size of the buffer at 'ptr' if known,
   // otherwise returns 0. This method can be called when
@@ -188,7 +188,7 @@ class Allocator {
   //
   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
   // allocated by this allocator.
-  virtual size_t AllocatedSizeSlow(void* ptr) {
+  virtual size_t AllocatedSizeSlow(const void* ptr) {
     if (TracksAllocationSizes()) {
       return AllocatedSize(ptr);
     }
@@ -312,17 +312,19 @@ class AllocatorWrapper : public Allocator {
     return wrapped_->TracksAllocationSizes();
   }
 
-  size_t RequestedSize(void* ptr) override {
+  size_t RequestedSize(const void* ptr) override {
     return wrapped_->RequestedSize(ptr);
   }
 
-  size_t AllocatedSize(void* ptr) override {
+  size_t AllocatedSize(const void* ptr) override {
     return wrapped_->AllocatedSize(ptr);
   }
 
-  int64 AllocationId(void* ptr) override { return wrapped_->AllocationId(ptr); }
+  int64 AllocationId(const void* ptr) override {
+    return wrapped_->AllocationId(ptr);
+  }
 
-  size_t AllocatedSizeSlow(void* ptr) override {
+  size_t AllocatedSizeSlow(const void* ptr) override {
     return wrapped_->AllocatedSizeSlow(ptr);
   }
 
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 56c013db9d..5d58d3e78e 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -260,7 +260,7 @@ OpKernelContext::OpKernelContext(Params* params)
 OpKernelContext::OpKernelContext(Params* params, int num_outputs)
     : params_(params),
       outputs_(num_outputs),
-      temp_memory_size_(0),
+      temp_memory_allocated_(0),
       persistent_memory_allocated_(0) {
   Allocator* eigen_gpu_allocator = get_allocator(AllocatorAttributes());
   params_->ensure_eigen_gpu_device();
@@ -669,12 +669,11 @@ Status OpKernelContext::allocate_temp(
     const AllocationAttributes& allocation_attr) {
   Status s =
       allocate_tensor(type, shape, out_temp, allocator_attr, allocation_attr);
-  if (track_allocations() && out_temp->TotalBytes() > 0) {
+  if (track_allocations() && s.ok() && out_temp->TotalBytes() > 0) {
     Allocator* a = get_allocator(allocator_attr);
     if (a->TracksAllocationSizes()) {
-      int64 alloc_size =
-          a->AllocatedSize(const_cast<char*>(out_temp->tensor_data().data()));
-      record_temp_memory_size(alloc_size);
+      int64 alloc_size = a->AllocatedSize(out_temp->tensor_data().data());
+      record_temp_memory_allocation(alloc_size, *out_temp);
     }
   }
   return s;
@@ -692,6 +691,15 @@ Status OpKernelContext::allocate_persistent(DataType type,
     if (out_tensor) {
       *out_tensor = out_persistent->AccessTensor(this);
     }
+    if (track_allocations()) {
+      Tensor* t = out_persistent->AccessTensor(this);
+      Allocator* a = get_allocator(attr);
+      if (a->TracksAllocationSizes()) {
+        int64 alloc_size = a->AllocatedSize(t->tensor_data().data());
+        int64 alloc_id = a->AllocationId(t->tensor_data().data());
+        record_persistent_memory_allocation(alloc_size, alloc_id);
+      }
+    }
   }
   return s;
 }
@@ -716,6 +724,22 @@ void OpKernelContext::set_output(int index, const Tensor& tensor) {
   DCHECK_EQ(mutable_output(index), nullptr);
   record_tensor_reference(tensor);
   outputs_[index] = TensorValue(new Tensor(tensor));
+  if (track_allocations() && tensor.TotalBytes() > 0) {
+    mutex_lock l(stats_mu_);
+    if (!temp_tensor_buffer_and_size_) {
+      return;
+    }
+    auto it = std::find_if(temp_tensor_buffer_and_size_->begin(),
+                           temp_tensor_buffer_and_size_->end(),
+                           [&tensor](const std::pair<const void*, int64>& e) {
+                             return e.first == static_cast<const void*>(
+                                                   tensor.tensor_data().data());
+                           });
+    if (it != temp_tensor_buffer_and_size_->end()) {
+      temp_memory_allocated_ -= it->second;
+      temp_tensor_buffer_and_size_->erase(it);
+    }
+  }
 }
 
 void OpKernelContext::set_output_ref(int index, mutex* mu,
@@ -793,19 +817,60 @@ Status OpKernelContext::MatchSignature(const DataTypeSlice expected_inputs,
                               outputs);
 }
 
-bool OpKernelContext::allocate_on_host(AllocatorAttributes alloc_attr) const {
-  return alloc_attr.on_host() || device()->attributes().device_type() == "CPU";
+void OpKernelContext::record_temp_memory_allocation(int64 size,
+                                                    const Tensor& t) {
+  mutex_lock l(stats_mu_);
+  temp_memory_allocated_ += size;
+  if (!temp_tensor_buffer_and_size_) {
+    temp_tensor_buffer_and_size_.reset(
+        new gtl::InlinedVector<std::pair<const void*, int64>, 2>());
+  }
+  temp_tensor_buffer_and_size_->emplace_back(
+      static_cast<const void*>(t.tensor_data().data()), size);
+}
+
+int64 OpKernelContext::temp_memory_allocated() const {
+  mutex_lock l(stats_mu_);
+  return temp_memory_allocated_;
 }
 
 void OpKernelContext::record_persistent_memory_allocation(int64 size,
                                                           int64 alloc_id) {
+  mutex_lock l(stats_mu_);
   persistent_memory_allocated_ += size;
-  persistent_alloc_ids_.push_back(alloc_id);
+  if (alloc_id >= 0) {
+    if (!persistent_alloc_ids_) {
+      persistent_alloc_ids_.reset(new gtl::InlinedVector<int64, 2>());
+    }
+    persistent_alloc_ids_->push_back(alloc_id);
+  }
+}
+
+int64 OpKernelContext::persistent_memory_allocated() const {
+  mutex_lock l(stats_mu_);
+  return persistent_memory_allocated_;
 }
 
 std::vector<int64> OpKernelContext::persistent_alloc_ids() const {
-  return std::vector<int64>(persistent_alloc_ids_.begin(),
-                            persistent_alloc_ids_.end());
+  mutex_lock l(stats_mu_);
+  if (persistent_alloc_ids_) {
+    return std::vector<int64>(persistent_alloc_ids_->begin(),
+                              persistent_alloc_ids_->end());
+  } else {
+    return std::vector<int64>();
+  }
+}
+
+void OpKernelContext::clear_recorded_memory() {
+  mutex_lock l(stats_mu_);
+  temp_memory_allocated_ = 0;
+  persistent_memory_allocated_ = 0;
+  if (temp_tensor_buffer_and_size_) {
+    temp_tensor_buffer_and_size_->clear();
+  }
+  if (persistent_alloc_ids_) {
+    persistent_alloc_ids_->clear();
+  }
 }
 
 // OpKernel registration ------------------------------------------------------
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index c45026c6af..5ccd45efc9 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -1046,24 +1046,27 @@ class OpKernelContext {
   TensorValue release_output(int index);
 
   bool track_allocations() const { return params_->track_allocations; }
-  bool allocate_on_host(AllocatorAttributes alloc_attr) const;
 
-  // Records temporary memory sizes.
-  void record_temp_memory_size(int64 size) { temp_memory_size_ += size; }
+  // Records temp memory allocation. Tensor object is recorded to identify the
+  // case where temp memory is used as output memory.
+  void record_temp_memory_allocation(int64 size, const Tensor& t)
+      LOCKS_EXCLUDED(stats_mu_);
 
   // Returns recorded size of temporary memory;
-  int64 temp_memory_size() const { return temp_memory_size_; }
+  int64 temp_memory_allocated() const LOCKS_EXCLUDED(stats_mu_);
 
   // Records persistent memory allocation, size can be negative indicating
   // deallocation.
-  void record_persistent_memory_allocation(int64 size, int64 alloc_id = -1);
+  void record_persistent_memory_allocation(int64 size, int64 alloc_id = -1)
+      LOCKS_EXCLUDED(stats_mu_);
 
   // Returns recorded size and ids of persistent memory.
-  int64 persistent_memory_allocated() const {
-    return persistent_memory_allocated_;
-  }
+  int64 persistent_memory_allocated() const LOCKS_EXCLUDED(stats_mu_);
+
+  std::vector<int64> persistent_alloc_ids() const LOCKS_EXCLUDED(stats_mu_);
 
-  std::vector<int64> persistent_alloc_ids() const;
+  // Resets counters for temp and persistent memory and recorded ids.
+  void clear_recorded_memory() LOCKS_EXCLUDED(stats_mu_);
 
   bool input_is_ref(int index) const;
 
@@ -1108,9 +1111,15 @@ class OpKernelContext {
 
   bool is_output_dead_ = false;
 
-  int64 temp_memory_size_;
-  gtl::InlinedVector<int64, 2> persistent_alloc_ids_;
-  int64 persistent_memory_allocated_;
+  // The following data members are only used when allocation tracking is
+  // enabled.
+  mutable mutex stats_mu_;
+  int64 temp_memory_allocated_ GUARDED_BY(stats_mu_);
+  int64 persistent_memory_allocated_ GUARDED_BY(stats_mu_);
+  std::unique_ptr<gtl::InlinedVector<std::pair<const void*, int64>, 2>>
+      temp_tensor_buffer_and_size_ GUARDED_BY(stats_mu_);
+  std::unique_ptr<gtl::InlinedVector<int64, 2>> persistent_alloc_ids_
+      GUARDED_BY(stats_mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(OpKernelContext);
 };
diff --git a/tensorflow/core/framework/tracking_allocator.cc b/tensorflow/core/framework/tracking_allocator.cc
index 65c98ad1ee..2df402573a 100644
--- a/tensorflow/core/framework/tracking_allocator.cc
+++ b/tensorflow/core/framework/tracking_allocator.cc
@@ -113,7 +113,7 @@ bool TrackingAllocator::TracksAllocationSizes() {
   return track_sizes_locally_ || allocator_->TracksAllocationSizes();
 }
 
-size_t TrackingAllocator::RequestedSize(void* ptr) {
+size_t TrackingAllocator::RequestedSize(const void* ptr) {
   if (track_sizes_locally_) {
     mutex_lock lock(mu_);
     auto it = in_use_.find(ptr);
@@ -126,7 +126,7 @@ size_t TrackingAllocator::RequestedSize(void* ptr) {
   }
 }
 
-size_t TrackingAllocator::AllocatedSize(void* ptr) {
+size_t TrackingAllocator::AllocatedSize(const void* ptr) {
   if (track_sizes_locally_) {
     mutex_lock lock(mu_);
     auto it = in_use_.find(ptr);
@@ -139,7 +139,7 @@ size_t TrackingAllocator::AllocatedSize(void* ptr) {
   }
 }
 
-int64 TrackingAllocator::AllocationId(void* ptr) {
+int64 TrackingAllocator::AllocationId(const void* ptr) {
   if (track_sizes_locally_) {
     mutex_lock lock(mu_);
     auto it = in_use_.find(ptr);
diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h
index 4825ed414f..f6c3c0b71b 100644
--- a/tensorflow/core/framework/tracking_allocator.h
+++ b/tensorflow/core/framework/tracking_allocator.h
@@ -64,9 +64,9 @@ class TrackingAllocator : public Allocator {
                     const AllocationAttributes& allocation_attr) override;
   void DeallocateRaw(void* ptr) override;
   bool TracksAllocationSizes() override;
-  size_t RequestedSize(void* ptr) override;
-  size_t AllocatedSize(void* ptr) override;
-  int64 AllocationId(void* ptr) override;
+  size_t RequestedSize(const void* ptr) override;
+  size_t AllocatedSize(const void* ptr) override;
+  int64 AllocationId(const void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
   void ClearStats() override;
 
@@ -125,7 +125,7 @@ class TrackingAllocator : public Allocator {
     size_t allocated_size;
     int64 allocation_id;
   };
-  std::unordered_map<void*, Chunk> in_use_ GUARDED_BY(mu_);
+  std::unordered_map<const void*, Chunk> in_use_ GUARDED_BY(mu_);
   int64 next_allocation_id_ GUARDED_BY(mu_);
 };
 
diff --git a/tensorflow/core/framework/tracking_allocator_test.cc b/tensorflow/core/framework/tracking_allocator_test.cc
index 4e32a907f2..2cdc7edd2d 100644
--- a/tensorflow/core/framework/tracking_allocator_test.cc
+++ b/tensorflow/core/framework/tracking_allocator_test.cc
@@ -39,7 +39,7 @@ class TestableSizeTrackingAllocator : public Allocator {
     port::Free(ptr);
   }
   bool TracksAllocationSizes() override { return true; }
-  size_t RequestedSize(void* ptr) override {
+  size_t RequestedSize(const void* ptr) override {
     const auto& iter = size_map_.find(ptr);
     EXPECT_NE(size_map_.end(), iter);
     return iter->second;
@@ -47,7 +47,7 @@ class TestableSizeTrackingAllocator : public Allocator {
   void GetStats(AllocatorStats* stats) override { stats->Clear(); }
 
  private:
-  std::unordered_map<void*, size_t> size_map_;
+  std::unordered_map<const void*, size_t> size_map_;
 };
 
 class NoMemoryAllocator : public Allocator {
diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h
index 1d2e1c8c9a..a312e8e8a4 100644
--- a/tensorflow/core/kernels/assign_op.h
+++ b/tensorflow/core/kernels/assign_op.h
@@ -109,6 +109,9 @@ class AssignOp : public OpKernel {
         OP_REQUIRES_OK(
             context, context->allocate_persistent(old_lhs.dtype(), rhs.shape(),
                                                   &copy, &copyTensor, attr));
+        // We track memory of variables in variable ops instead of in this
+        // assign op.
+        context->clear_recorded_memory();
         context->replace_ref_input(0, *copyTensor, /* lock_held */ true);
         if (use_exclusive_lock_) {
           Copy(context, copyTensor, rhs);
diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h
index d7bebfb24c..03d6e82e01 100644
--- a/tensorflow/core/kernels/reduction_ops_common.h
+++ b/tensorflow/core/kernels/reduction_ops_common.h
@@ -239,9 +239,6 @@ class ReductionOp : public OpKernel {
     if (!out.CopyFrom(tmp_out, helper.out_shape())) {
       ctx->SetStatus(errors::Internal("Error during reduction copy."));
     }
-    if (ctx->track_allocations()) {
-      ctx->record_temp_memory_size(-static_cast<int64>(out.AllocatedBytes()));
-    }
     ctx->set_output(0, out);
   }
 
-- 
GitLab


From 24134b189e9846ab222ef9723d9b030dfd665ed7 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Tue, 6 Feb 2018 17:37:02 -0800
Subject: [PATCH 741/966] [tf.data] Fix a memory leak when an iterator is
 reinitialized many times in a session.

Previously, we would instantiate a new function handle for each
function in a dataset each time an iterator on that dataset was
initialized. These would only be deleted at session closure, which
could lead to an apparent leak of memory over the lifetime of session.

PiperOrigin-RevId: 184768730
---
 .../core/kernels/data/captured_function.cc    |  6 +++++-
 tensorflow/core/kernels/data/iterator_ops.cc  | 19 ++++++++++++-------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index f3e4f1cd3f..f248f7897f 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -32,7 +32,11 @@ Status CapturedFunction::Create(
   return Status::OK();
 }
 
-CapturedFunction::~CapturedFunction() {}
+CapturedFunction::~CapturedFunction() {
+  if (lib_ != nullptr) {
+    lib_->ReleaseHandle(f_handle_).IgnoreError();
+  }
+}
 
 namespace {
 class CallFrameBase : public CallFrameInterface {
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 8a420ac26d..fc3e291afb 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -725,18 +725,23 @@ class OneShotIteratorOp : public AsyncOpKernel {
   Status TryInit(OpKernelContext* ctx, IteratorResource** iterator,
                  ContainerInfo* cinfo) {
     TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def()));
-    FunctionLibraryRuntime* lib = ctx->function_library();
+
+    FunctionLibraryRuntime* lib;
+    std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
+    TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def, &pflr, &lib));
 
     // Create an IteratorResource that will hold the iterator for this op.
     TF_RETURN_IF_ERROR(
         ctx->resource_manager()->LookupOrCreate<IteratorResource>(
             cinfo->container(), cinfo->name(), iterator,
-            [lib, this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-              *ret = new IteratorResource(output_dtypes_, output_shapes_,
-                                          graph_def_version_, nullptr, nullptr,
-                                          nullptr, lib);
-              return Status::OK();
-            }));
+            [lib, this, &flib_def, &pflr](IteratorResource** ret)
+                EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                  *ret = new IteratorResource(
+                      output_dtypes_, output_shapes_, graph_def_version_,
+                      nullptr, std::move(flib_def), std::move(pflr), lib);
+                  return Status::OK();
+                }));
 
     core::ScopedUnref unref_iterator(*iterator);
 
-- 
GitLab


From 60d5caeb2d506401d480503e21cc97c9a784c81b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 6 Feb 2018 18:26:02 -0800
Subject: [PATCH 742/966] We used to bypass fake-quant nodes in
 resolve_reorder_axes, as a hack as we needed to preserve fake-quant nodes on
 constant weights as the only way to encode min-max information when exporting
 to GraphDef.

Not anymore. Now we unconditionally enable the resolve_constant_fake_quant
transformation, and we don't do this bypass anymore; instead, when exporting
to GraphDef, we re-add FakeQuant nodes just before exporting, around
constant arrays that have minmax.

PiperOrigin-RevId: 184774680
---
 .../contrib/lite/toco/export_tensorflow.cc    | 24 ++++++++++++++++++
 .../contrib/lite/toco/export_tensorflow.h     |  2 ++
 .../resolve_constant_fake_quant.cc            |  1 +
 .../resolve_reorder_axes.cc                   | 25 ++++++-------------
 tensorflow/contrib/lite/toco/toco_tooling.cc  |  8 +++---
 tensorflow/contrib/lite/toco/tooling_util.cc  | 14 ++++++++++-
 tensorflow/contrib/lite/toco/tooling_util.h   |  5 ++++
 7 files changed, 58 insertions(+), 21 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index be6d506bf3..0bfbe0e3a5 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -1624,6 +1624,30 @@ void ExportTensorFlowGraphDefImplementation(const Model& model,
 }
 }  // namespace
 
+void EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(Model* model) {
+  for (const auto& array_kv : model->GetArrayMap()) {
+    const string& array_name = array_kv.first;
+    Array& array = *array_kv.second;
+    if (!array.buffer || !array.minmax) {
+      continue;
+    }
+    const string& wrapped_array_name =
+        AvailableArrayName(*model, array_name + "/data");
+    Array& wrapped_array = model->GetOrCreateArray(wrapped_array_name);
+    wrapped_array.data_type = array.data_type;
+    wrapped_array.copy_shape(array.shape());
+    wrapped_array.buffer = std::move(array.buffer);
+    FakeQuantOperator* fakequant_op = new FakeQuantOperator;
+    fakequant_op->inputs = {wrapped_array_name};
+    fakequant_op->outputs = {array_name};
+    fakequant_op->minmax.reset(new MinMax);
+    *fakequant_op->minmax = *array.minmax;
+    const auto& it = FindOpWithInput(*model, array_name);
+    model->operators.emplace(it, fakequant_op);
+  }
+  CheckInvariants(*model);
+}
+
 void ExportTensorFlowGraphDef(const Model& model,
                               string* output_file_contents) {
   CHECK(output_file_contents->empty());
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.h b/tensorflow/contrib/lite/toco/export_tensorflow.h
index 79682153a8..d7310bb75f 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.h
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.h
@@ -22,6 +22,8 @@ namespace toco {
 
 void ExportTensorFlowGraphDef(const Model& model, string* output_file_contents);
 
+void EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(Model* model);
+
 }  // namespace toco
 
 #endif  // TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
index 81fe37d7e0..944901ece7 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
@@ -50,6 +50,7 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
   output_array.data_type = ArrayDataType::kFloat;
   CHECK(!output_array.buffer);
   const auto& input_buffer = input_array.GetBuffer<ArrayDataType::kFloat>();
+  output_array.GetOrCreateMinMax() = *fakequant_op->minmax;
   auto& output_buffer = output_array.GetMutableBuffer<ArrayDataType::kFloat>();
   const int size = input_buffer.data.size();
   output_buffer.data.resize(size);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
index 5c68f87f6c..bc70db0bd8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
@@ -60,16 +60,7 @@ bool ResolveReorderAxes::Run(Model* model, std::size_t op_index) {
   const auto& output_array_name = reorder_op->outputs[0];
   auto& input_array = model->GetArray(input_array_name);
   auto& output_array = model->GetArray(output_array_name);
-  string constant_input_array_name = input_array_name;
   if (!input_array.buffer) {
-    const auto* op_producing_input = GetOpWithOutput(*model, input_array_name);
-    if (op_producing_input &&
-        op_producing_input->type == OperatorType::kFakeQuant) {
-      constant_input_array_name = op_producing_input->inputs[0];
-    }
-  }
-  auto& constant_input_array = model->GetArray(constant_input_array_name);
-  if (!constant_input_array.buffer) {
     return false;
   }
   // Yield until output dims have been resolved.
@@ -77,14 +68,14 @@ bool ResolveReorderAxes::Run(Model* model, std::size_t op_index) {
     return false;
   }
   // Reorder the input array dims and buffer data
-  if (constant_input_array.buffer->type == ArrayDataType::kFloat) {
-    ReorderAxes<float, ArrayDataType::kFloat>(
-        reorder_op->input_axes_order, reorder_op->output_axes_order,
-        &constant_input_array, &output_array);
-  } else if (constant_input_array.buffer->type == ArrayDataType::kInt32) {
-    ReorderAxes<uint8, ArrayDataType::kUint8>(
-        reorder_op->input_axes_order, reorder_op->output_axes_order,
-        &constant_input_array, &output_array);
+  if (input_array.buffer->type == ArrayDataType::kFloat) {
+    ReorderAxes<float, ArrayDataType::kFloat>(reorder_op->input_axes_order,
+                                              reorder_op->output_axes_order,
+                                              &input_array, &output_array);
+  } else if (input_array.buffer->type == ArrayDataType::kInt32) {
+    ReorderAxes<uint8, ArrayDataType::kUint8>(reorder_op->input_axes_order,
+                                              reorder_op->output_axes_order,
+                                              &input_array, &output_array);
   } else {
     LOG(FATAL) << "Cannot ReorderAxes unless input buffer is float or uint8.";
   }
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index b715881774..ac77284be0 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -95,6 +95,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveTransposeAttributes);
   transformations->Add(new ResolveConstantShapeOrRank);
   transformations->Add(new MakeInitialDequantizeOperator);
+  transformations->Add(new ResolveConstantFakeQuant);
 }
 
 bool SupportsQuantization(FileFormat format) {
@@ -212,9 +213,6 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   } else {
     transformations.Add(new UnfuseActivationFunctions);
   }
-  if (output_format != TENSORFLOW_GRAPHDEF) {
-    transformations.Add(new ResolveConstantFakeQuant);
-  }
   if (toco_flags.drop_fake_quant()) {
     transformations.Add(new DropFakeQuant);
   } else {
@@ -267,6 +265,10 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
                             dequantization_transformations);
   }
 
+  if (output_format == TENSORFLOW_GRAPHDEF) {
+    EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(model);
+  }
+
   LogDump(kLogLevelModelChanged, "AFTER TRANSFORMATIONS", *model);
 
   if (output_format != GRAPHVIZ_DOT && output_format != TFLITE) {
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 0a168c671e..e82a851935 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -159,6 +159,18 @@ std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithInput(
   return model.operators.end();
 }
 
+std::vector<std::unique_ptr<Operator>>::iterator FindOpWithInput(
+    Model& model, const string& array_name) {
+  for (auto it = model.operators.begin(); it != model.operators.end(); ++it) {
+    for (auto& input : it->get()->inputs) {
+      if (input == array_name) {
+        return it;
+      }
+    }
+  }
+  return model.operators.end();
+}
+
 std::vector<std::unique_ptr<Operator>>::const_iterator FindOp(
     const Model& model, const Operator* op) {
   for (auto it = model.operators.begin(); it != model.operators.end(); ++it) {
@@ -406,7 +418,7 @@ void LogArray(int log_level, const Model& model, const string& name) {
   }
   if (array.quantization_params) {
     VLOG(log_level) << "  QuantizationParams: zero_point="
-                    << array.quantization_params->zero_point
+                    << static_cast<int>(array.quantization_params->zero_point)
                     << ", scale=" << array.quantization_params->scale;
   }
 }
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index a023bab1a0..3c952e9c61 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -67,10 +67,15 @@ Operator* GetOpWithOutput(const Model& model, const string& array_name);
 
 std::vector<std::unique_ptr<Operator>>::iterator FindOpWithOutput(
     Model& model, const string& array_name);
+
 Operator* GetOpWithOutput(const Model& model, const string& array_name);
 
 std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithInput(
     const Model& model, const string& array_name);
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOpWithInput(
+    Model& model, const string& array_name);
+
 Operator* GetOpWithInput(const Model& model, const string& array_name);
 Operator* GetFirstOpWithInput(const Model& model, const string& array_name);
 
-- 
GitLab


From 5bfd3e3c9433492aa4aed97d2a8a6c9284bdd77e Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Tue, 6 Feb 2018 18:28:00 -0800
Subject: [PATCH 743/966] [XLA:CPU] Use VectorSupportLibrary for LLVM IR
 implementation of tanh

No behavioral change intended; this is only refactoring.

VectorSupportLibrary was added after the LLVM IR implementation of tanh so the
tanh implementation was not using VectorSupportLibrary.

The main impetus for this change is that I'm about to add LLVM IR
implementations of Exp and Log, and those are going to use VectorSupportLibrary.
I did not want to have an inconsistency between the tanh and exp, log.

PiperOrigin-RevId: 184774860
---
 tensorflow/compiler/xla/service/cpu/BUILD     |  1 +
 .../xla/service/cpu/llvm_ir_runtime.cc        | 36 +++++++------------
 .../xla/service/cpu/vector_support_library.cc | 20 +++++++++++
 .../xla/service/cpu/vector_support_library.h  | 14 ++++++++
 4 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 2f02591631..7228e8da42 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -497,6 +497,7 @@ cc_library(
         "llvm_ir_runtime.h",
     ],
     deps = [
+        ":vector_support_library",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
         "@llvm//:core",
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
index 0336fa6131..3cd6b6e853 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -42,27 +43,22 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
   }
 
   llvm::LLVMContext* context = &module->getContext();
-  llvm::Type* float_type = llvm::Type::getFloatTy(*context);
-  llvm::VectorType* vector_type =
-      llvm::VectorType::get(float_type, vector_width);
 
   llvm::BasicBlock* vector_tanh_body =
       llvm::BasicBlock::Create(*context, "body", vector_tanh_function);
 
   llvm::IRBuilder<> ir_builder(vector_tanh_body);
-
   llvm::FastMathFlags fast_math_flags;
   fast_math_flags.setFast();
   ir_builder.setFastMathFlags(fast_math_flags);
 
+  VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "tanh_f32");
+
   llvm::Value* input = &*vector_tanh_function->arg_begin();
-  CHECK_EQ(input->getType(), vector_type);
+  CHECK_EQ(input->getType(), vsl.vector_type());
 
   // This implements the same rational interpolant as implemented in Eigen3.
-  llvm::Value* input_clamped = llvm_ir::EmitFloatMin(
-      llvm_ir::EmitFloatMax(input, llvm::ConstantFP::get(vector_type, -9.0),
-                            &ir_builder),
-      llvm::ConstantFP::get(vector_type, 9.0), &ir_builder);
+  llvm::Value* input_clamped = vsl.Clamp(input, /*low=*/-9.0, /*high=*/9.0);
 
   std::array<float, 7> numerator_coeffs{
       -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
@@ -73,26 +69,20 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
       1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
       4.89352518554385e-03f};
 
-  llvm::Value* input_squared =
-      ir_builder.CreateFMul(input_clamped, input_clamped);
-  llvm::Value* numerator =
-      llvm::ConstantFP::get(vector_type, numerator_coeffs[0]);
+  llvm::Value* input_squared = vsl.Mul(input_clamped, input_clamped);
+  llvm::Value* numerator = vsl.SplatFloat(numerator_coeffs[0]);
   for (int i = 1; i < numerator_coeffs.size(); i++) {
-    numerator = ir_builder.CreateFAdd(
-        ir_builder.CreateFMul(input_squared, numerator),
-        llvm::ConstantFP::get(vector_type, numerator_coeffs[i]));
+    numerator = vsl.MulAdd(input_squared, numerator, numerator_coeffs[i]);
   }
-  numerator = ir_builder.CreateFMul(input_clamped, numerator);
 
-  llvm::Value* denominator =
-      llvm::ConstantFP::get(vector_type, denominator_coeffs[0]);
+  numerator = vsl.Mul(input_clamped, numerator);
+
+  llvm::Value* denominator = vsl.SplatFloat(denominator_coeffs[0]);
   for (int i = 1; i < denominator_coeffs.size(); i++) {
-    denominator = ir_builder.CreateFAdd(
-        ir_builder.CreateFMul(input_squared, denominator),
-        llvm::ConstantFP::get(vector_type, denominator_coeffs[i]));
+    denominator = vsl.MulAdd(input_squared, denominator, denominator_coeffs[i]);
   }
 
-  llvm::Value* result = ir_builder.CreateFDiv(numerator, denominator);
+  llvm::Value* result = vsl.Div(numerator, denominator);
   ir_builder.CreateRet(result);
 
   DCHECK(!llvm::verifyFunction(*vector_tanh_function));
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
index 128b465be2..a910e41050 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
@@ -54,6 +54,26 @@ llvm::Value* VectorSupportLibrary::Add(llvm::Value* lhs, llvm::Value* rhs) {
   return AddInternal(lhs, rhs);
 }
 
+llvm::Value* VectorSupportLibrary::Div(llvm::Value* lhs, llvm::Value* rhs) {
+  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  if (scalar_type_->isFloatingPointTy()) {
+    return ir_builder()->CreateFDiv(lhs, rhs, name());
+  } else {
+    LOG(FATAL) << "Division for integers is unimplemented";
+  }
+}
+
+llvm::Value* VectorSupportLibrary::Clamp(llvm::Value* a, double low,
+                                         double high) {
+  CHECK_LT(low, high);
+  CHECK(scalar_type_->isFloatingPointTy());
+  llvm::Type* type = a->getType();
+  CHECK(type == vector_type() || type == scalar_type());
+  return llvm_ir::EmitFloatMin(
+      llvm_ir::EmitFloatMax(a, llvm::ConstantFP::get(type, low), ir_builder_),
+      llvm::ConstantFP::get(type, high), ir_builder_);
+}
+
 llvm::Value* VectorSupportLibrary::AddInternal(llvm::Value* lhs,
                                                llvm::Value* rhs) {
   if (scalar_type_->isFloatingPointTy()) {
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
index 8fbac2a667..6091146824 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -46,11 +46,25 @@ class VectorSupportLibrary {
   llvm::Value* Add(int64 lhs, llvm::Value* rhs) {
     return Add(ir_builder()->getInt64(lhs), rhs);
   }
+  llvm::Value* Add(double lhs, llvm::Value* rhs) {
+    return Add(llvm::ConstantFP::get(vector_type(), lhs), rhs);
+  }
+
+  llvm::Value* Div(llvm::Value* lhs, llvm::Value* rhs);
 
   llvm::Value* MulAdd(llvm::Value* a, llvm::Value* b, llvm::Value* c) {
     return Add(c, Mul(a, b));
   }
 
+  llvm::Value* MulAdd(llvm::Value* a, llvm::Value* b, double c) {
+    return Add(llvm::ConstantFP::get(vector_type(), c), Mul(a, b));
+  }
+
+  llvm::Value* Clamp(llvm::Value* a, double low, double high);
+  llvm::Value* SplatFloat(double d) {
+    return llvm::ConstantFP::get(vector_type(), d);
+  }
+
   llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
                                     llvm::Value* offset_elements);
   llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
-- 
GitLab


From 746f9b967ec4e8d7cabd40d6f2ee568abe9dc14d Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Tue, 6 Feb 2018 20:40:00 -0800
Subject: [PATCH 744/966] TPUEstimator: Revert the global_step change and
 require the user to explicitly pass it.

PiperOrigin-RevId: 184784330
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 52 ++++---------------
 1 file changed, 11 insertions(+), 41 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index a236c08991..7ebf1c6469 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -517,7 +517,7 @@ class TPUEstimatorSpec(
     if self.eval_metrics is not None:
       host_calls['eval_metrics'] = self.eval_metrics
     if self.host_call is not None:
-      host_calls['host_call'] = wrap_hostcall_with_global_step(self.host_call)
+      host_calls['host_call'] = self.host_call
     host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
     eval_metric_ops = None
     if self.eval_metrics is not None:
@@ -1351,19 +1351,21 @@ class _ModelFnWrapper(object):
           self._call_model_fn(features, labels))
       loss, train_op = estimator_spec.loss, estimator_spec.train_op
 
-      host_call_outfeed_ops = []
       if isinstance(estimator_spec, TPUEstimatorSpec):
         captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
-        if estimator_spec.host_call is not None:
-          host_call.record({
-              'host_call': wrap_hostcall_with_global_step(
-                  estimator_spec.host_call)})
-          host_call_outfeed_ops = host_call.create_enqueue_op()
       else:
         captured_scaffold_fn.capture(None)
 
-      with ops.control_dependencies([train_op] + host_call_outfeed_ops):
-        return array_ops.identity(loss)
+      # We must run train_op to update the variables prior to running the
+      # outfeed.
+      with ops.control_dependencies([train_op]):
+        host_call_outfeed_ops = []
+        if (isinstance(estimator_spec, TPUEstimatorSpec) and
+            estimator_spec.host_call is not None):
+          host_call.record({'host_call': estimator_spec.host_call})
+          host_call_outfeed_ops = host_call.create_enqueue_op()
+        with ops.control_dependencies(host_call_outfeed_ops):
+          return array_ops.identity(loss)
 
     return train_step, host_call, captured_scaffold_fn
 
@@ -1708,38 +1710,6 @@ class _OutfeedHostCall(object):
     return ret
 
 
-def wrap_hostcall_with_global_step(hostcall):
-  """Wrap the hostcall so that we update the global step upon every call."""
-  if hostcall is None:
-    return None
-  host_fn, tensors = hostcall
-
-  def global_step_host_fn(_global_step, *args, **kwargs):  # pylint: disable=invalid-name
-    # Note that we don't have any ordering here, so the graph may see a
-    # global_step that's off by 1.
-    state_ops.assign(
-        training.get_global_step(),
-        math_ops.cast(_global_step[0], dtypes.int64))
-    return host_fn(*args, **kwargs)
-  # Give the global step tensor a batch dimension. Reshape is not supported for
-  # int64, so we cast it to int32.
-  # TODO(jhseu): Remove the cast once int64 is supported.
-  global_step_tensor = array_ops.reshape(
-      math_ops.cast(training.get_global_step(), dtypes.int32), [1])
-  if isinstance(tensors, dict):
-    outfeed_tensors = {'_global_step': global_step_tensor}
-    outfeed_tensors.update(tensors)
-    return global_step_host_fn, outfeed_tensors
-  else:
-    fn_args = util.fn_args(host_fn)
-    if len(tensors) != len(fn_args):
-      raise RuntimeError(
-          'In TPUEstimatorSpec.host_call, length of tensors {} does not match '
-          'method args of the function, which takes {}.'.format(
-              len(tensors), len(fn_args)))
-    return global_step_host_fn, [global_step_tensor] + list(tensors)
-
-
 class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
   """Hook to run host calls when use_tpu=False."""
 
-- 
GitLab


From a93ee9a9ae54685ef2812ba7421bf0dec9056d3a Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Tue, 6 Feb 2018 21:19:50 -0800
Subject: [PATCH 745/966] Clear feed error on session start.

An existing hook can be re-used on a different session: rather than terminating
the session we should clear the error state.

PiperOrigin-RevId: 184787368
---
 tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 7ebf1c6469..7d2f6556fb 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -660,7 +660,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
       # for TPU computation waits for the infeed enqueue forever. Close the
       # Session to cancel the main thread Session.run execution.
       #
-      # However, sleep for 2 minutes before explicit closing to give some time
+      # We sleep for a few seconds before closing to give some time
       # for the TPU compilation error, if any, propagating, from TPU to CPU
       # host. Compilation errors should be reported by the main thread so that
       # the program can be interrupted and users can take action.  Due to a race
@@ -673,7 +673,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
 
       # If the main session is still running, the infeed/outfeed errors are
       # legitimate, and should be logged.
-      if not self._finished:
+      if not self._finished and self._feed_error:
         logging.error('Feed error: %s', self._feed_error)
         logging.error('Closing session.  A RuntimeError should follow.')
         session.close()
@@ -731,10 +731,12 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
         name='OutfeedController', target=self._run_outfeed, args=(session,))
 
   def before_run(self, run_context):
-    if self._feed_error:
-      logging.warning('Feed error occurred, terminating session.')
-      run_context.request_stop()
-      return
+    self._feed_error = None
+
+    # Wait for the cancellation timer to complete before continuing.
+    if self._session_cancel_timer:
+      self._session_cancel_timer.join()
+      self._session_cancel_timer = None
 
     iterations = run_context.session.run(self._iterations_per_loop_var)
 
-- 
GitLab


From 2ec89055c17cd17dc1b1259d9eb76d1177c9b0e8 Mon Sep 17 00:00:00 2001
From: Zhixian Yan <zhixianyan@google.com>
Date: Tue, 6 Feb 2018 21:32:48 -0800
Subject: [PATCH 746/966] LSTM for TFlite/Toco

PiperOrigin-RevId: 184788311
---
 tensorflow/contrib/lite/toco/BUILD            |   5 +
 .../graph_transformations.h                   |   2 +
 .../identify_lstm_merge_inputs.cc             | 185 ++++++++
 .../identify_lstm_split_inputs.cc             | 171 +++++++
 .../toco/graph_transformations/lstm_utils.cc  |  97 ++++
 .../toco/graph_transformations/lstm_utils.h   | 102 ++++
 .../propagate_fixed_sizes.cc                  |   5 +-
 .../toco/graph_transformations/tests/BUILD    |  11 +
 .../tests/lstm_utils_test.cc                  | 442 ++++++++++++++++++
 .../contrib/lite/toco/tflite/operator.cc      |  24 +
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   9 +-
 11 files changed, 1048 insertions(+), 5 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.cc
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/tests/lstm_utils_test.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 20c156a932..864d0254f2 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -188,7 +188,10 @@ cc_library(
         "graph_transformations/identify_l2_normalization.cc",
         "graph_transformations/identify_l2_pool.cc",
         "graph_transformations/identify_lstm.cc",
+        "graph_transformations/identify_lstm_merge_inputs.cc",
+        "graph_transformations/identify_lstm_split_inputs.cc",
         "graph_transformations/identify_relu1.cc",
+        "graph_transformations/lstm_utils.cc",
         "graph_transformations/make_initial_dequantize_operator.cc",
         "graph_transformations/propagate_array_data_types.cc",
         "graph_transformations/propagate_fixed_sizes.cc",
@@ -235,6 +238,7 @@ cc_library(
     ],
     hdrs = [
         "graph_transformations/graph_transformations.h",
+        "graph_transformations/lstm_utils.h",
     ],
     visibility = ["//visibility:public"],
     deps = [
@@ -245,6 +249,7 @@ cc_library(
         ":tooling_util",
         ":types_proto_cc",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index cf90ebe996..5d7ada1b74 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -124,6 +124,8 @@ DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoPrecedingAffine)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Normalization)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Pool)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyLstmCell)
+DECLARE_GRAPH_TRANSFORMATION(SplitLstmCellInputs)
+DECLARE_GRAPH_TRANSFORMATION(MergeLstmCellInputs)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyRelu1)
 DECLARE_GRAPH_TRANSFORMATION(MakeInitialDequantizeOperator)
 DECLARE_GRAPH_TRANSFORMATION(PropagateArrayDataTypes)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
new file mode 100644
index 0000000000..45335fd78c
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+bool MergeLstmCellInputs::Run(Model* model, std::size_t op_index) {
+  // Find lstm cell.
+  auto op_it = model->operators.begin() + op_index;
+  auto src_op = op_it->get();
+  if (src_op->type != OperatorType::kLstmCell) {
+    return false;
+  }
+
+  // Already a compact LstmCell with LstmCellOperator::NUM_INPUTS of inputs,
+  // do not need to merge cell inputs.
+  if (src_op->inputs.size() == LstmCellOperator::NUM_INPUTS) {
+    return false;
+  }
+
+  // Identify prev_activ_input, prev_state_input as required Op inputs,
+  // using the rnn_states in the model flag.
+  string prev_activ_input;
+  if (!GetMatchingRnnArray(model, src_op->outputs[kOutputTensor],
+                           &prev_activ_input)) {
+    return false;
+  }
+  string prev_state_input;
+  if (!GetMatchingRnnArray(model, src_op->outputs[kCellStateTensor],
+                           &prev_state_input)) {
+    return false;
+  }
+
+  // Get LstmCell's cell, input, output size.
+  int num_cell = model->GetArray(src_op->inputs[kInputToInputWeightsTensor])
+                     .shape()
+                     .dims(0);
+  int num_input = model->GetArray(src_op->inputs[kInputToInputWeightsTensor])
+                      .shape()
+                      .dims(1);
+  int num_output =
+      model->GetArray(src_op->inputs[kRecurrentToInputWeightsTensor])
+          .shape()
+          .dims(1);
+
+  // Make sure n_cell and n_output are equal as there is no projection.
+  CHECK_EQ(num_cell, num_output);
+
+  // Create tensorflow_graphdef style's one big weight tensor.
+  const string base_name(FindLongestCommonPrefix(
+      src_op->outputs[kOutputTensor], src_op->outputs[kCellStateTensor]));
+  string merged_weights = AvailableArrayName(*model, base_name + "weights");
+  auto& array = model->GetOrCreateArray(merged_weights);
+  array.data_type = ArrayDataType::kFloat;
+  int weights_dim1 = 4 * num_cell;
+  int weights_dim2 = num_input + num_output;
+  Shape shape = Shape({weights_dim1, weights_dim2});
+  array.copy_shape(shape);
+  auto& buffer = array.GetMutableBuffer<ArrayDataType::kFloat>();
+  buffer.data.resize(weights_dim1 * weights_dim2);
+
+  // Merge 8 small weight tensors to 1 weight tensor.
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kInputToInputWeightsTensor]), 0, 0);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kInputToCellWeightsTensor]), num_cell, 0);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kInputToForgetWeightsTensor]),
+      num_cell * 2, 0);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kInputToOutputWeightsTensor]),
+      num_cell * 3, 0);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kRecurrentToInputWeightsTensor]), 0,
+      num_input);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kRecurrentToCellWeightsTensor]), num_cell,
+      num_input);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kRecurrentToForgetWeightsTensor]),
+      num_cell * 2, num_input);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kRecurrentToOutputWeightsTensor]),
+      num_cell * 3, num_input);
+
+  // Create tensorflow_graphdef style's one big bias tensor.
+  string merged_biases = AvailableArrayName(*model, base_name + "biases");
+  auto& bias_array = model->GetOrCreateArray(merged_biases);
+  bias_array.data_type = ArrayDataType::kFloat;
+  bias_array.copy_shape(Shape({weights_dim1}));
+  auto& bias_buffer = bias_array.GetMutableBuffer<ArrayDataType::kFloat>();
+  bias_buffer.data.resize(weights_dim1);
+
+  // Merge 4 small bias tensors into a big one.
+  CopyArrayToSubArray(bias_buffer, weights_dim2,
+                      model->GetArray(src_op->inputs[kInputGateBiasTensor]), 0,
+                      0);
+  CopyArrayToSubArray(bias_buffer, weights_dim2,
+                      model->GetArray(src_op->inputs[kCellGateBiasTensor]),
+                      num_cell, 0);
+  CopyArrayToSubArray(bias_buffer, weights_dim2,
+                      model->GetArray(src_op->inputs[kForgetGateBiasTensor]),
+                      num_cell * 2, 0);
+  CopyArrayToSubArray(bias_buffer, weights_dim2,
+                      model->GetArray(src_op->inputs[kOutputGateBiasTensor]),
+                      num_cell * 3, 0);
+
+  // Emplace a new LSTM cell operator (use basic 5 inputs kernel).
+  auto lstm_cell_op = absl::make_unique<LstmCellOperator>();
+
+  // Compact LstmCell's 5 inputs.
+  lstm_cell_op->inputs.resize(LstmCellOperator::NUM_INPUTS);
+  lstm_cell_op->inputs[LstmCellOperator::DATA_INPUT] =
+      src_op->inputs[kInputTensor];
+  lstm_cell_op->inputs[LstmCellOperator::WEIGHTS_INPUT] = merged_weights;
+  lstm_cell_op->inputs[LstmCellOperator::BIASES_INPUT] = merged_biases;
+  lstm_cell_op->inputs[LstmCellOperator::PREV_ACTIV_INPUT] = prev_activ_input;
+  lstm_cell_op->inputs[LstmCellOperator::PREV_STATE_INPUT] = prev_state_input;
+
+  // Reorder LstmCell's 4 outputs.
+  lstm_cell_op->outputs.resize(LstmCellOperator::NUM_OUTPUTS);
+  lstm_cell_op->outputs[LstmCellOperator::ACTIV_OUTPUT] =
+      src_op->outputs[kOutputTensor];
+  lstm_cell_op->outputs[LstmCellOperator::STATE_OUTPUT] =
+      src_op->outputs[kCellStateTensor];
+  lstm_cell_op->outputs[LstmCellOperator::CONCAT_TEMP] =
+      src_op->outputs[kScratchBufferTensor];
+  lstm_cell_op->outputs[LstmCellOperator::ACTIV_TEMP] =
+      src_op->outputs[kOutputStateTensor];
+
+  // Add the op into model.
+  model->operators.emplace(op_it, std::move(lstm_cell_op));
+  AddMessageF("Creating compact LstmCell replacing previous lstm cell");
+
+  // Delete arrays and operators replaced by the LSTM cell operator. Order is
+  // important - DeleteArrayIfUnused() only succeeds if dependent operators
+  // have been removed first. Start at the output and work towards the input.
+  // Erase curr lstm op being replaced.
+  DeleteArrayIfUnused(src_op->inputs[kInputToInputWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kInputToForgetWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kInputToCellWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kInputToOutputWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kRecurrentToInputWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kRecurrentToForgetWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kRecurrentToCellWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kRecurrentToOutputWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kInputGateBiasTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kForgetGateBiasTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kCellGateBiasTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kOutputGateBiasTensor], model);
+  model->operators.erase(FindOp(*model, src_op));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
new file mode 100644
index 0000000000..eca717680a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
@@ -0,0 +1,171 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+bool SplitLstmCellInputs::Run(Model* model, std::size_t op_index) {
+  // Find lstm cell.
+  auto op_it = model->operators.begin() + op_index;
+  auto curr_op = op_it->get();
+  if (curr_op->type != OperatorType::kLstmCell) {
+    return false;
+  }
+
+  // Already an extended LstmCell with kExtendedLstmInputCount of inputs,
+  // do not need to split cell inputs.
+  if (curr_op->inputs.size() == kExtendedLstmInputCount) {
+    return false;
+  }
+
+  // Make sure the WEIGHTS_INPUT and BIASES_INPUT are constant arrays,
+  // that are able to be split into smaller weight and bias tensors.
+  if (!IsConstantParameterArray(
+          *model, curr_op->inputs[LstmCellOperator::WEIGHTS_INPUT]) ||
+      !IsConstantParameterArray(
+          *model, curr_op->inputs[LstmCellOperator::BIASES_INPUT])) {
+    return false;
+  }
+
+  // Make sure propagate_fixed_sizes has defined the size of the output.
+  if (!model->GetArray(curr_op->outputs[LstmCellOperator::ACTIV_OUTPUT])
+           .has_shape()) {
+    return false;
+  }
+
+  // Emplace a new LstmCell operator with extended inputs (kernel/lstm.cc).
+  auto lstm_cell_op = absl::make_unique<LstmCellOperator>();
+  lstm_cell_op->inputs.resize(kExtendedLstmInputCount);
+  int num_input = model->GetArray(curr_op->inputs[LstmCellOperator::DATA_INPUT])
+                      .shape()
+                      .dims(1);
+
+  // n_cell and n_output have the same size when there is no projection.
+  int num_cell =
+      model->GetArray(curr_op->outputs[LstmCellOperator::ACTIV_OUTPUT])
+          .shape()
+          .dims(1);
+  int num_output = num_cell;
+
+  // Data input.
+  lstm_cell_op->inputs[kInputTensor] =
+      curr_op->inputs[LstmCellOperator::ACTIV_OUTPUT];
+
+  // Get original weight tensor and decompose 1 tensor to 8 sub tensors.
+  Array& kernel =
+      model->GetArray(curr_op->inputs[LstmCellOperator::WEIGHTS_INPUT]);
+  const string base_name(FindLongestCommonPrefix(
+      curr_op->outputs[LstmCellOperator::ACTIV_OUTPUT],
+      curr_op->outputs[LstmCellOperator::STATE_OUTPUT]));
+
+  // Input weight tensors of size {n_cell, n_input}.
+  CopySubArrayToArray(
+      model, &(lstm_cell_op->inputs[kInputToInputWeightsTensor]),
+      base_name + "weight_i_i", num_cell, num_input, kernel, 0, 0);
+  CopySubArrayToArray(model, &(lstm_cell_op->inputs[kInputToCellWeightsTensor]),
+                      base_name + "weight_c_i", num_cell, num_input, kernel,
+                      num_cell, 0);
+  CopySubArrayToArray(
+      model, &(lstm_cell_op->inputs[kInputToForgetWeightsTensor]),
+      base_name + "weight_f_i", num_cell, num_input, kernel, num_cell * 2, 0);
+  CopySubArrayToArray(
+      model, &(lstm_cell_op->inputs[kInputToOutputWeightsTensor]),
+      base_name + "weight_o_i", num_cell, num_input, kernel, num_cell * 3, 0);
+
+  // Recurrent weight tensors of size {n_cell, n_output}.
+  CopySubArrayToArray(
+      model, &(lstm_cell_op->inputs[kRecurrentToInputWeightsTensor]),
+      base_name + "weight_i_r", num_cell, num_output, kernel, 0, num_input);
+  CopySubArrayToArray(model,
+                      &(lstm_cell_op->inputs[kRecurrentToCellWeightsTensor]),
+                      base_name + "weight_c_r", num_cell, num_output, kernel,
+                      num_cell, num_input);
+  CopySubArrayToArray(model,
+                      &(lstm_cell_op->inputs[kRecurrentToForgetWeightsTensor]),
+                      base_name + "weight_f_r", num_cell, num_output, kernel,
+                      num_cell * 2, num_input);
+  CopySubArrayToArray(model,
+                      &(lstm_cell_op->inputs[kRecurrentToOutputWeightsTensor]),
+                      base_name + "weight_o_r", num_cell, num_output, kernel,
+                      num_cell * 3, num_input);
+
+  // Peephole (optional).
+  CreateOptionalArray(model, &(lstm_cell_op->inputs[kCellToInputWeightsTensor]),
+                      base_name + "peephole_c_i");
+  CreateOptionalArray(model,
+                      &(lstm_cell_op->inputs[kCellToForgetWeightsTensor]),
+                      base_name + "peephole_c_f");
+  CreateOptionalArray(model,
+                      &(lstm_cell_op->inputs[kCellToOutputWeightsTensor]),
+                      base_name + "peephole_c_o");
+
+  // Get original bias tensor and decompose 1 tensor to 4 sub tensors
+  Array& bias =
+      model->GetArray(curr_op->inputs[LstmCellOperator::BIASES_INPUT]);
+  CopySubArrayToArray(model, &(lstm_cell_op->inputs[kInputGateBiasTensor]),
+                      base_name + "bias_i", num_cell, 1, bias, 0, 0);
+  CopySubArrayToArray(model, &(lstm_cell_op->inputs[kCellGateBiasTensor]),
+                      base_name + "bias_c", num_cell, 1, bias, num_cell, 0);
+  CopySubArrayToArray(model, &(lstm_cell_op->inputs[kForgetGateBiasTensor]),
+                      base_name + "bias_f", num_cell, 1, bias, num_cell * 2, 0);
+  CopySubArrayToArray(model, &(lstm_cell_op->inputs[kOutputGateBiasTensor]),
+                      base_name + "bias_o", num_cell, 1, bias, num_cell * 3, 0);
+
+  // Projection (optional).
+  CreateOptionalArray(model, &(lstm_cell_op->inputs[kProjectionWeightsTensor]),
+                      base_name + "proj_weight");
+  CreateOptionalArray(model, &(lstm_cell_op->inputs[kProjectionBiasTensor]),
+                      base_name + "proj_bias");
+
+  // Reorder LstmCell's outputs.
+  lstm_cell_op->outputs.resize(LstmCellOperator::NUM_OUTPUTS);
+  lstm_cell_op->outputs[kScratchBufferTensor] =
+      curr_op->outputs[LstmCellOperator::CONCAT_TEMP];
+  lstm_cell_op->outputs[kOutputStateTensor] =
+      curr_op->outputs[LstmCellOperator::ACTIV_TEMP];
+  lstm_cell_op->outputs[kCellStateTensor] =
+      curr_op->outputs[LstmCellOperator::STATE_OUTPUT];
+  lstm_cell_op->outputs[kOutputTensor] =
+      curr_op->outputs[LstmCellOperator::ACTIV_OUTPUT];
+
+  // Add the op into model.
+  model->operators.emplace(op_it, std::move(lstm_cell_op));
+  AddMessageF("Creating extended LstmCell replacing previous lstm cell");
+
+  // Delete arrays and operators replaced by the LSTM cell operator. Order is
+  // important - DeleteArrayIfUnused() only succeeds if dependent operators
+  // have been removed first. Start at the output and work towards the input.
+  // Erase curr lstm op being replaced.
+  DeleteArrayIfUnused(curr_op->inputs[LstmCellOperator::WEIGHTS_INPUT], model);
+  DeleteArrayIfUnused(curr_op->inputs[LstmCellOperator::BIASES_INPUT], model);
+  DeleteArrayIfUnused(curr_op->inputs[LstmCellOperator::PREV_ACTIV_INPUT],
+                      model);
+  DeleteArrayIfUnused(curr_op->inputs[LstmCellOperator::PREV_STATE_INPUT],
+                      model);
+  model->operators.erase(FindOp(*model, curr_op));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.cc b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.cc
new file mode 100644
index 0000000000..910a960589
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.cc
@@ -0,0 +1,97 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
+
+namespace toco {
+
+void CreateOptionalArray(Model* model, string* input_array_buffer,
+                         const string& array_name) {
+  *input_array_buffer = array_name;
+  model->CreateOptionalArray(array_name);
+}
+
+void CopyArrayData(const Buffer<ArrayDataType::kFloat>& src_buffer,
+                   int src_stride, int src_start_idx1, int src_start_idx2,
+                   Buffer<ArrayDataType::kFloat>* dst_buffer, int dst_stride,
+                   int dst_start_idx1, int dst_start_idx2, int dim1_copy_size,
+                   int dim2_copy_size) {
+  int src_offset = src_start_idx1 * src_stride + src_start_idx2;
+  int dst_offset = dst_start_idx1 * dst_stride + dst_start_idx2;
+  for (int i = 0; i < dim1_copy_size; i++) {
+    for (int j = 0; j < dim2_copy_size; j++) {
+      int idx_src = src_offset + i * src_stride + j;
+      int idx_dst = dst_offset + i * dst_stride + j;
+      dst_buffer->data[idx_dst] = src_buffer.data[idx_src];
+    }
+  }
+}
+
+Buffer<ArrayDataType::kFloat>* CreateFloatArrayBuffer(Model* model,
+                                                      string* array_name,
+                                                      const Shape& shape) {
+  *array_name = AvailableArrayName(*model, *array_name);
+  auto& array = model->GetOrCreateArray(*array_name);
+  array.data_type = ArrayDataType::kFloat;
+  array.copy_shape(shape);
+  Buffer<ArrayDataType::kFloat>* buffer =
+      &(array.GetMutableBuffer<ArrayDataType::kFloat>());
+  buffer->data.resize(RequiredBufferSizeForShape(shape));
+  return buffer;
+}
+
+void CopySubArrayToArray(Model* model, string* array_name,
+                         const string& tensor_name, int dim1_size,
+                         int dim2_size, const Array& original_array,
+                         int start_idx1, int start_idx2) {
+  // Determine whether it's bias or not, create shape, buffer.
+  bool is_bias = dim2_size == 1;
+  Shape shape = is_bias ? Shape({dim1_size}) : Shape({dim1_size, dim2_size});
+  Buffer<ArrayDataType::kFloat>* buffer =
+      CreateFloatArrayBuffer(model, array_name, shape);
+  auto& orig_buffer = original_array.GetBuffer<ArrayDataType::kFloat>();
+
+  // Copy data from big tensor.
+  CopyArrayData(orig_buffer, is_bias ? 1 : original_array.shape().dims(1),
+                start_idx1, start_idx2, buffer, dim2_size, 0, 0, dim1_size,
+                dim2_size);
+}
+
+void CopyArrayToSubArray(Buffer<ArrayDataType::kFloat>& tensor_buffer,
+                         int tensor_stride, const Array& sub_array,
+                         int start_idx1, int start_idx2) {
+  // Get tensor data.
+  bool is_bias = sub_array.shape().dims().size() == 1;
+  int dim1_copy_size = sub_array.shape().dims()[0];
+  int dim2_copy_size = is_bias ? 1 : sub_array.shape().dims(1);
+  auto& sub_buffer = sub_array.GetBuffer<ArrayDataType::kFloat>();
+
+  // Copy data from sub tensor.
+  CopyArrayData(sub_buffer, dim2_copy_size, 0, 0, &tensor_buffer,
+                is_bias ? 1 : tensor_stride, start_idx1, start_idx2,
+                dim1_copy_size, dim2_copy_size);
+}
+
+bool GetMatchingRnnArray(Model* model, const string& back_edge_source_array,
+                         string* rnn_array) {
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    if (rnn_state.back_edge_source_array() == back_edge_source_array) {
+      *rnn_array = rnn_state.state_array();
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
new file mode 100644
index 0000000000..881c2d4dc8
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+// For consistency with the parameters defined in extended LstmCell's kernel
+// (tensorflow/contrib/lite/kernels/lstm.cc),
+// use lowercase for these constants.
+
+enum ExtendedLstmCellInputs {
+  kInputTensor = 0,
+  kInputToInputWeightsTensor = 1,  // Optional
+  kInputToForgetWeightsTensor = 2,
+  kInputToCellWeightsTensor = 3,
+  kInputToOutputWeightsTensor = 4,
+  kRecurrentToInputWeightsTensor = 5,  // Optional
+  kRecurrentToForgetWeightsTensor = 6,
+  kRecurrentToCellWeightsTensor = 7,
+  kRecurrentToOutputWeightsTensor = 8,
+  kCellToInputWeightsTensor = 9,    // Optional
+  kCellToForgetWeightsTensor = 10,  // Optional
+  kCellToOutputWeightsTensor = 11,  // Optional
+  kInputGateBiasTensor = 12,        // Optional
+  kForgetGateBiasTensor = 13,
+  kCellGateBiasTensor = 14,
+  kOutputGateBiasTensor = 15,
+  kProjectionWeightsTensor = 16,  // Optional
+  kProjectionBiasTensor = 17,     // Optional
+  kExtendedLstmInputCount = 18
+};
+
+enum ExtendedLstmCellOutputs {
+  kScratchBufferTensor = 0,
+  kOutputStateTensor = 1,
+  kCellStateTensor = 2,
+  kOutputTensor = 3
+};
+
+// Create optional array used for optional tensor in ExtendedLstmCell inputs.
+void CreateOptionalArray(Model* model, string* input_array_buffer,
+                         const string& array_name);
+
+// Create float array and get its buffer.
+Buffer<ArrayDataType::kFloat>* CreateFloatArrayBuffer(Model* model,
+                                                      string* array_name,
+                                                      const Shape& shape);
+
+// Copy data from one array to the other one (supports 1D and 2D array),
+// for 1D array, the 2nd dim's size is 1.
+// Arguments:
+//   src_buffer: the source buffer
+//   src_stride: the stride of source buffer, i.e., 2nd dim's size
+//   src_start_idx1: the 1st dim index of start point in src matrix
+//   src_start_idx2: the 2nd dim index of start point in src matrix
+//   dst_buffer: the destination buffer
+//   dst_stride: the stride of destination buffer, i.e., 2nd dim's size
+//   dst_start_idx1: the 1st dim index of start point in dst matrix
+//   dst_start_idx2: the 2nd dim index of start point in dst matrix
+//   dim1_copy_size: 1st dim size of copy data
+//   dim2_copy_size: 2nd dim size of copy data
+void CopyArrayData(const Buffer<ArrayDataType::kFloat>& src_buffer,
+                   int src_stride, int src_start_idx1, int src_start_idx2,
+                   Buffer<ArrayDataType::kFloat>* dst_buffer, int dst_stride,
+                   int dst_start_idx1, int dst_start_idx2, int dim1_copy_size,
+                   int dim2_copy_size);
+
+// Copy a subset of array data and create a smaller array,
+// mostly used for spliting weights and bias for Lstm cell.
+void CopySubArrayToArray(Model* model, string* array_name,
+                         const string& tensor_name, int dim1_size,
+                         int dim2_size, const Array& original_array,
+                         int start_idx1, int start_idx2);
+
+// Copy array data to a large array's submatrix,
+// mostly used for merging weights and bias for Lstm cell.
+void CopyArrayToSubArray(Buffer<ArrayDataType::kFloat>& tensor_buffer,
+                         int tensor_stride, const Array& sub_array,
+                         int start_idx1, int start_idx2);
+
+// Get mating rnn array inputs using rnn_states flag.
+bool GetMatchingRnnArray(Model* model, const string& back_edge_source_array,
+                         string* rnn_array);
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index fa7e70d90b..2a44849ffa 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -728,9 +728,8 @@ void ProcessResizeBilinearOperator(Model* model, ResizeBilinearOperator* op) {
 }
 
 void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
-  // I/O arrays should be allocated on creation of op.
-  QCHECK_EQ(op->inputs.size(), LstmCellOperator::NUM_INPUTS);
-  QCHECK_EQ(op->outputs.size(), LstmCellOperator::NUM_OUTPUTS);
+  // Only required for compact LstmCell with default NUM_INPUTS of inputs.
+  if (op->inputs.size() != LstmCellOperator::NUM_INPUTS) return;
 
   const auto& input_array =
       model->GetArray(op->inputs[LstmCellOperator::DATA_INPUT]);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
index 8931498782..2f94f9cd8a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
@@ -18,6 +18,17 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "lstm_utils_test",
+    srcs = ["lstm_utils_test.cc"],
+    deps = [
+        "//tensorflow/contrib/lite/toco:graph_transformations",
+        "//tensorflow/contrib/lite/toco:model",
+        "//tensorflow/contrib/lite/toco:tooling_util",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/lstm_utils_test.cc b/tensorflow/contrib/lite/toco/graph_transformations/tests/lstm_utils_test.cc
new file mode 100644
index 0000000000..6aae0775d3
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/lstm_utils_test.cc
@@ -0,0 +1,442 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <tuple>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+namespace {
+
+// A gmock matcher that check that elements of a float vector match to a given
+// tolerance.
+std::vector<testing::Matcher<float>> ArrayFloatNear(
+    const std::vector<float>& values, float max_abs_error = 1e-5) {
+  std::vector<testing::Matcher<float>> matchers;
+  matchers.reserve(values.size());
+  for (const float& v : values) {
+    matchers.emplace_back(testing::FloatNear(v, max_abs_error));
+  }
+  return matchers;
+}
+}  // namespace
+
+class CopyArrayDataTest : public ::testing::Test {
+ public:
+  CopyArrayDataTest() {}
+
+  void PrepareBuffers(Model* model, std::initializer_list<float> src_data,
+                      int src_dim_1, int src_dim_2,
+                      std::initializer_list<float> dst_data, int dst_dim_1,
+                      int dst_dim_2) {
+    string src_array = "src_array";
+    src_buffer_ = CreateFloatArrayBuffer(
+        model, &src_array,
+        src_dim_2 == 1 ? Shape({src_dim_1}) : Shape({src_dim_1, src_dim_2}));
+    PopulateBuffer(src_buffer_, src_data);
+    string dst_array = "dst_array";
+    dst_buffer_ = CreateFloatArrayBuffer(
+        model, &dst_array,
+        dst_dim_2 == 1 ? Shape({dst_dim_1}) : Shape({dst_dim_1, dst_dim_2}));
+    PopulateBuffer(dst_buffer_, dst_data);
+  }
+
+  Buffer<ArrayDataType::kFloat>* GetSrcBuffer() { return src_buffer_; }
+  Buffer<ArrayDataType::kFloat>* GetDstBuffer() { return dst_buffer_; }
+
+  void PopulateBuffer(Buffer<ArrayDataType::kFloat>* buffer,
+                      const std::vector<float>& init_data) {
+    for (int i = 0; i < init_data.size(); i++) {
+      buffer->data[i] = init_data[i];
+    }
+  }
+  void UpdateBuffer(Buffer<ArrayDataType::kFloat>* buffer,
+                    std::initializer_list<float> data) {
+    buffer->data.resize(data.size());
+    PopulateBuffer(buffer, data);
+  }
+
+ private:
+  Buffer<ArrayDataType::kFloat>* src_buffer_;
+  Buffer<ArrayDataType::kFloat>* dst_buffer_;
+};
+
+// Copy from 1 big 2D array to 8 smaller ones.
+TEST_F(CopyArrayDataTest, CopyFromBigArrayToSmallerArrayes2D) {
+  // Init src_buffer, dst_buffer.
+  Model model;
+  std::initializer_list<float> large_tf_weight_data = {
+      -0.320407, -0.108683, 0.406358,  -0.410811, -0.285786, -0.15769,
+      -0.194201, 0.170866,  0.084135,  0.201878,  0.21519,   -0.284458,
+      0.495906,  -0.073818, 0.045578,  0.149816,  -0.447073, -0.453578,
+      0.116766,  0.21808,   0.047326,  -0.001985, 0.402193,  0.315517,
+      0.38258,   0.43599,   0.11986,   0.465195,  0.33548,   -0.118789,
+      -0.414159, 0.049269,  0.156108,  0.093459,  -0.129103, -0.086274,
+      0.186188,  -0.324923, 0.4117,    -0.344439, 0.240465,  -0.343331,
+      -0.463082, -0.231706, -0.487465, -0.186592, -0.020756, -0.239007,
+      0.364817,  0.459106,  -0.171447, -0.006542, 0.204032,  -0.375317,
+      -0.041911, 0.051664,  0.320483,  0.155899,  0.156555,  -0.249823,
+      -0.353107, 0.031563,  -0.340771, -0.052532, 0.134631,  -0.257957,
+      -0.50141,  0.486939,  -0.43853,  0.268426,  -0.08754,  -0.109447,
+      -0.502462, -0.028055, -0.121838, -0.046016, 0.105309,  -0.070774,
+      0.495683,  -0.475088, 0.048654,  -0.38582,  0.411018,  -0.315606,
+      0.349628,  0.21698,   0.258989,  -0.097902, 0.331218,  0.034602,
+      0.418069,  -0.089025, -0.417513, 0.07609,   0.393821,  0.404733,
+      -0.055418, -0.43903,  -0.447049, 0.013125,  0.278503,  0.459869,
+      0.143755,  -0.177335, -0.162247, -0.432371, 0.153714,  -0.047403,
+      -0.446775, -0.418363, 0.019743,  0.042025};
+  std::initializer_list<float> tflite_lstm_input_weight = {0, 0, 0, 0, 0, 0,
+                                                           0, 0, 0, 0, 0, 0};
+  PrepareBuffers(&model, large_tf_weight_data, /*src_dim_1=*/16,
+                 /*src_dim_2=*/7, tflite_lstm_input_weight,
+                 /*dst_dim_1=*/4, /*dst_dim_2=*/3);
+
+  // Copy src starts at (0,0), size (4,3).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/3,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+  std::vector<float> expected = {-0.320407, -0.108683, 0.406358, 0.170866,
+                                 0.084135,  0.201878,  0.045578, 0.149816,
+                                 -0.447073, -0.001985, 0.402193, 0.315517};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (4,0), size (4,3).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/4,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/3,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+  expected = {0.33548,   -0.118789, -0.414159, -0.086274, 0.186188,  -0.324923,
+              -0.463082, -0.231706, -0.487465, 0.459106,  -0.171447, -0.006542};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (8,0), size (4,3).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/8,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/3,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+  expected = {0.320483, 0.155899,  0.156555,  -0.052532, 0.134631, -0.257957,
+              -0.08754, -0.109447, -0.502462, -0.070774, 0.495683, -0.475088};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (12,0), size (4,3).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/12,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/3,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+  expected = {0.349628,  0.21698,  0.258989, -0.089025, -0.417513, 0.07609,
+              -0.447049, 0.013125, 0.278503, -0.432371, 0.153714,  -0.047403};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // New dst_buffer with size 16.
+  std::initializer_list<float> tflite_lstm_recurrent_weight = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  PrepareBuffers(&model, large_tf_weight_data, /*src_dim_1=*/16,
+                 /*src_dim_2=*/7, tflite_lstm_recurrent_weight,
+                 /*dst_dim_1=*/4, /*dst_dim_2=*/4);
+
+  // Copy src starts at (0,3), size (4,4).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/3, GetDstBuffer(), /*dst_stride=*/4,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+  expected = {-0.410811, -0.285786, -0.15769,  -0.194201, 0.21519, -0.284458,
+              0.495906,  -0.073818, -0.453578, 0.116766,  0.21808, 0.047326,
+              0.38258,   0.43599,   0.11986,   0.465195};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (4,3), size (4,4).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/4,
+                /*src_start_idx2=*/3, GetDstBuffer(), /*dst_stride=*/4,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+  expected = {0.049269, 0.156108,  0.093459,  -0.129103, 0.4117,    -0.344439,
+              0.240465, -0.343331, -0.186592, -0.020756, -0.239007, 0.364817,
+              0.204032, -0.375317, -0.041911, 0.051664};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (8,3), size (4,4).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/8,
+                /*src_start_idx2=*/3, GetDstBuffer(), /*dst_stride=*/4,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+  expected = {-0.249823, -0.353107, 0.031563,  -0.340771, -0.50141,  0.486939,
+              -0.43853,  0.268426,  -0.028055, -0.121838, -0.046016, 0.105309,
+              0.048654,  -0.38582,  0.411018,  -0.315606};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (12,3), size (4,4).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/12,
+                /*src_start_idx2=*/3, GetDstBuffer(), /*dst_stride=*/4,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+  expected = {-0.097902, 0.331218,  0.034602, 0.418069, 0.393821,  0.404733,
+              -0.055418, -0.43903,  0.459869, 0.143755, -0.177335, -0.162247,
+              -0.446775, -0.418363, 0.019743, 0.042025};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+// Copy from 1 big 1D array to 4 small ones.
+TEST_F(CopyArrayDataTest, CopyFromBigArrayToSmallerArrayes1D) {
+  // Init src_buffer, dst_buffer.
+  Model model;
+  std::initializer_list<float> large_tf_bias_data = {
+      0.980304, 0.419808, 0.080278, 0.728548, 0.581674, 0.672433,
+      0.434190, 0.844357, 0.229587, 0.785629, 0.022065, 0.753082,
+      0.422080, 0.539481, 0.878386, 0.168965};
+  std::initializer_list<float> tflite_lstm_i_bias = {0, 0, 0, 0};
+  PrepareBuffers(&model, large_tf_bias_data, /*src_dim_1=*/16,
+                 /*src_dim_2=*/1, tflite_lstm_i_bias,
+                 /*dst_dim_1=*/4, /*dst_dim_2=*/1);
+
+  // Copy starts at (0,), size (4,).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+  std::vector<float> expected = {0.980304, 0.419808, 0.080278, 0.728548};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy starts at (4,), size (4,).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/4,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+  expected = {0.581674, 0.672433, 0.434190, 0.844357};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy starts at (8,), size (4,).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/8,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+  expected = {0.229587, 0.785629, 0.022065, 0.753082};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy starts at (12,), size (4,).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/12,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+  expected = {0.422080, 0.539481, 0.878386, 0.168965};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+// Copy from 8 small 2D arrayes to 1 big one.
+TEST_F(CopyArrayDataTest, CopyFromSmallArrayesToBigArray2D) {
+  // Init src_buffer, dst_buffer.
+  Model model;
+  std::initializer_list<float> large_tf_weights_data = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  // Copy dst starts (0, 0), size (4, 3).
+  std::initializer_list<float> tflite_lstm_i2i_weight = {
+      -0.320407, -0.108683, 0.406358,  0.170866,  0.084135, 0.201878,
+      0.045578,  0.149816,  -0.447073, -0.001985, 0.402193, 0.315517};
+  PrepareBuffers(&model, tflite_lstm_i2i_weight, /*src_dim_1=*/4,
+                 /*src_dim_2=*/3, large_tf_weights_data,
+                 /*dst_dim_1=*/16, /*dst_dim_2=*/7);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/3, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+
+  // Copy dst starts (4, 0), size (4, 3).
+  std::initializer_list<float> tflite_lstm_i2c_weight = {
+      0.33548,   -0.118789, -0.414159, -0.086274, 0.186188,  -0.324923,
+      -0.463082, -0.231706, -0.487465, 0.459106,  -0.171447, -0.006542};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_i2c_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/3, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/4, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+
+  // Copy dst starts (8, 0), size (4, 3).
+  std::initializer_list<float> tflite_lstm_i2f_weight = {
+      0.320483, 0.155899,  0.156555,  -0.052532, 0.134631, -0.257957,
+      -0.08754, -0.109447, -0.502462, -0.070774, 0.495683, -0.475088};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_i2f_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/3, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/8, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+
+  // Copy dst starts (12, 0), size (4, 3).
+  std::initializer_list<float> tflite_lstm_i2o_weight = {
+      0.349628,  0.21698,  0.258989, -0.089025, -0.417513, 0.07609,
+      -0.447049, 0.013125, 0.278503, -0.432371, 0.153714,  -0.047403};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_i2o_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/3, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/12, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+
+  // Copy dst starts (0, 3), size (4, 4).
+  std::initializer_list<float> tflite_lstm_i2r_weight = {
+      -0.410811, -0.285786, -0.15769,  -0.194201, 0.21519, -0.284458,
+      0.495906,  -0.073818, -0.453578, 0.116766,  0.21808, 0.047326,
+      0.38258,   0.43599,   0.11986,   0.465195};
+  UpdateBuffer(GetSrcBuffer(), tflite_lstm_i2r_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/4, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/3,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+
+  // Copy dst starts (4, 3), size (4, 4).
+  std::initializer_list<float> tflite_lstm_c2r_weight = {
+      0.049269, 0.156108,  0.093459,  -0.129103, 0.4117,    -0.344439,
+      0.240465, -0.343331, -0.186592, -0.020756, -0.239007, 0.364817,
+      0.204032, -0.375317, -0.041911, 0.051664};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_c2r_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/4, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/4, /*dst_start_idx2=*/3,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+
+  // Copy dst starts (8, 3), size (4, 4).
+  std::initializer_list<float> tflite_lstm_f2r_weight = {
+      -0.249823, -0.353107, 0.031563,  -0.340771, -0.50141,  0.486939,
+      -0.43853,  0.268426,  -0.028055, -0.121838, -0.046016, 0.105309,
+      0.048654,  -0.38582,  0.411018,  -0.315606};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_f2r_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/4, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/8, /*dst_start_idx2=*/3,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+
+  // Copy dst starts (12, 3), size (4, 4).
+  std::initializer_list<float> tflite_lstm_o2r_weight = {
+      -0.097902, 0.331218,  0.034602, 0.418069, 0.393821,  0.404733,
+      -0.055418, -0.43903,  0.459869, 0.143755, -0.177335, -0.162247,
+      -0.446775, -0.418363, 0.019743, 0.042025};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_o2r_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/4, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/12, /*dst_start_idx2=*/3,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+
+  std::vector<float> expected = {
+      -0.320407, -0.108683, 0.406358,  -0.410811, -0.285786, -0.15769,
+      -0.194201, 0.170866,  0.084135,  0.201878,  0.21519,   -0.284458,
+      0.495906,  -0.073818, 0.045578,  0.149816,  -0.447073, -0.453578,
+      0.116766,  0.21808,   0.047326,  -0.001985, 0.402193,  0.315517,
+      0.38258,   0.43599,   0.11986,   0.465195,  0.33548,   -0.118789,
+      -0.414159, 0.049269,  0.156108,  0.093459,  -0.129103, -0.086274,
+      0.186188,  -0.324923, 0.4117,    -0.344439, 0.240465,  -0.343331,
+      -0.463082, -0.231706, -0.487465, -0.186592, -0.020756, -0.239007,
+      0.364817,  0.459106,  -0.171447, -0.006542, 0.204032,  -0.375317,
+      -0.041911, 0.051664,  0.320483,  0.155899,  0.156555,  -0.249823,
+      -0.353107, 0.031563,  -0.340771, -0.052532, 0.134631,  -0.257957,
+      -0.50141,  0.486939,  -0.43853,  0.268426,  -0.08754,  -0.109447,
+      -0.502462, -0.028055, -0.121838, -0.046016, 0.105309,  -0.070774,
+      0.495683,  -0.475088, 0.048654,  -0.38582,  0.411018,  -0.315606,
+      0.349628,  0.21698,   0.258989,  -0.097902, 0.331218,  0.034602,
+      0.418069,  -0.089025, -0.417513, 0.07609,   0.393821,  0.404733,
+      -0.055418, -0.43903,  -0.447049, 0.013125,  0.278503,  0.459869,
+      0.143755,  -0.177335, -0.162247, -0.432371, 0.153714,  -0.047403,
+      -0.446775, -0.418363, 0.019743,  0.042025};
+
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+// Copy from 4 small 1D arrayes to 1 big one.
+TEST_F(CopyArrayDataTest, CopyFromSmallArrayesToBigArray1D) {
+  // Init src_buffer, dst_buffer.
+  Model model;
+  std::initializer_list<float> large_tf_bias_data = {0, 0, 0, 0, 0, 0, 0, 0,
+                                                     0, 0, 0, 0, 0, 0, 0, 0};
+
+  std::initializer_list<float> tflite_lstm_i_bias = {0.980304, 0.419808,
+                                                     0.080278, 0.728548};
+
+  PrepareBuffers(&model, tflite_lstm_i_bias, /*src_dim_1=*/4,
+                 /*src_dim_2=*/1, large_tf_bias_data,
+                 /*dst_dim_1=*/16, /*dst_dim_2=*/1);
+
+  // Copy starts at (0,), size (4,).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+
+  // Copy starts at (4,), size (4,).
+  std::initializer_list<float> tflite_lstm_cell_bias = {0.581674, 0.672433,
+                                                        0.434190, 0.844357};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_cell_bias);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/4, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+
+  // Copy starts at (8,0), size (4,).
+  std::initializer_list<float> tflite_lstm_forget_bias = {0.229587, 0.785629,
+                                                          0.022065, 0.753082};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_forget_bias);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/8, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+
+  // Copy starts at (12,), size (4,).
+  std::initializer_list<float> tflite_lstm_output_bias = {0.422080, 0.539481,
+                                                          0.878386, 0.168965};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_output_bias);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/12, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+
+  std::vector<float> expected = {0.980304, 0.419808, 0.080278, 0.728548,
+                                 0.581674, 0.672433, 0.434190, 0.844357,
+                                 0.229587, 0.785629, 0.022065, 0.753082,
+                                 0.422080, 0.539481, 0.878386, 0.168965};
+
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 04aaedd59d..ff54b350bf 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -524,6 +524,28 @@ class Transpose
                    TocoOperator* op) const override {}
 };
 
+class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
+                                    ::tflite::BuiltinOptions_LSTMOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    // Current toco converter only supports tanh, no clip.
+    return ::tflite::CreateLSTMOptions(*builder, /*fused_activation_function=*/
+                                       ::tflite::ActivationFunctionType_TANH,
+                                       /*cell_clip=*/0.0,
+                                       /*proj_clip=*/0.0);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    // Only support tanh activation, so check that tflite type is tanh.
+    CHECK(options.fused_activation_function() ==
+          ::tflite::ActivationFunctionType_TANH);
+  }
+};
+
 class Mean : public BuiltinOperator<MeanOperator, ::tflite::MeanOptions,
                                     ::tflite::BuiltinOptions_MeanOptions> {
  public:
@@ -779,6 +801,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new Squeeze(::tflite::BuiltinOperator_SQUEEZE, OperatorType::kSqueeze));
   ops.emplace_back(new StridedSlice(::tflite::BuiltinOperator_STRIDED_SLICE,
                                     OperatorType::kStridedSlice));
+  ops.emplace_back(
+      new Lstm(::tflite::BuiltinOperator_LSTM, OperatorType::kLstmCell));
 
   // Custom Operators.
   ops.emplace_back(new Cast("CAST", OperatorType::kCast));
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index ac77284be0..47da8b68ea 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -107,7 +107,8 @@ bool SupportsFusedActivationFunction(FileFormat format) {
 }
 
 bool SupportsLstmCell(FileFormat format) {
-  return (format == TENSORFLOW_GRAPHDEF || format == GRAPHVIZ_DOT);
+  return (format == TENSORFLOW_GRAPHDEF || format == GRAPHVIZ_DOT ||
+          format == TFLITE);
 }
 
 bool SupportsPreallocatedWorkspace(FileFormat format) {
@@ -225,9 +226,13 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     }
   }
   transformations.Add(new ConvertPureConvToDepthwise);
-  // TFLite export does not yet support fused LSTM cell.
   if (SupportsLstmCell(output_format)) {
     transformations.Add(new IdentifyLstmCell);
+    if (output_format == TFLITE) {
+      transformations.Add(new toco::SplitLstmCellInputs);
+    } else {
+      transformations.Add(new toco::MergeLstmCellInputs);
+    }
   }
   transformations.Add(new ResolveConstantConcatenation);
   RunGraphTransformations(model, "general graph transformations",
-- 
GitLab


From b9018073ec1afc7dfc302ab171db8bf5b177c2dd Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Tue, 6 Feb 2018 22:05:26 -0800
Subject: [PATCH 747/966] Add pylint check for W0611 unused-import in
 ci_sanity.sh and fix existing pylint errors.

PiperOrigin-RevId: 184790548
---
 tensorflow/contrib/copy_graph/python/util/copy_test.py       | 2 --
 .../eager/python/examples/rnn_colorbot/rnn_colorbot.py       | 1 -
 .../framework/python/ops/accumulate_n_v2_eager_test.py       | 5 -----
 .../contrib/framework/python/ops/accumulate_n_v2_test.py     | 1 -
 tensorflow/contrib/framework/python/ops/variables.py         | 3 +--
 .../single_image_random_dot_stereograms_ops_test.py          | 5 -----
 tensorflow/contrib/learn/python/learn/datasets/base.py       | 2 --
 tensorflow/contrib/learn/python/learn/ops/ops_test.py        | 1 -
 tensorflow/contrib/model_pruning/python/layers/layers.py     | 1 -
 tensorflow/contrib/nn/python/ops/alpha_dropout.py            | 2 --
 tensorflow/contrib/nn/python/ops/alpha_dropout_test.py       | 1 -
 .../contrib/opt/python/training/nadam_optimizer_test.py      | 3 ---
 .../periodic_resample/python/ops/periodic_resample_op.py     | 3 ++-
 .../python/kernel_tests/reduce_slice_ops_test.py             | 1 -
 .../contrib/rnn/python/kernel_tests/core_rnn_cell_test.py    | 2 --
 tensorflow/contrib/rnn/python/ops/rnn_cell.py                | 4 ++--
 tensorflow/contrib/session_bundle/gc.py                      | 1 -
 tensorflow/contrib/sparsemax/python/ops/sparsemax.py         | 1 -
 tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py    | 2 --
 tensorflow/examples/label_image/label_image.py               | 1 -
 tensorflow/examples/tutorials/mnist/input_data.py            | 2 ++
 tensorflow/python/client/session_test.py                     | 2 --
 tensorflow/python/framework/load_library.py                  | 4 ++--
 tensorflow/python/framework/test_util.py                     | 2 +-
 tensorflow/python/kernel_tests/conv2d_transpose_test.py      | 1 -
 tensorflow/python/kernel_tests/decode_bmp_op_test.py         | 1 -
 tensorflow/python/kernel_tests/decode_raw_op_test.py         | 1 -
 tensorflow/python/kernel_tests/fifo_queue_test.py            | 1 -
 .../python/kernel_tests/random/random_shuffle_queue_test.py  | 1 -
 tensorflow/python/kernel_tests/softmax_op_test.py            | 2 --
 tensorflow/python/ops/candidate_sampling_ops.py              | 4 ++--
 tensorflow/python/ops/control_flow_ops.py                    | 1 -
 tensorflow/python/ops/gradients_impl.py                      | 2 +-
 tensorflow/python/ops/nn_grad_test.py                        | 2 +-
 tensorflow/python/ops/nn_impl.py                             | 2 +-
 tensorflow/python/tools/saved_model_cli.py                   | 2 +-
 tensorflow/python/training/training_ops.py                   | 2 +-
 tensorflow/tools/ci_build/ci_sanity.sh                       | 3 ++-
 38 files changed, 19 insertions(+), 58 deletions(-)

diff --git a/tensorflow/contrib/copy_graph/python/util/copy_test.py b/tensorflow/contrib/copy_graph/python/util/copy_test.py
index 2798d31229..05744bec4e 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_test.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_test.py
@@ -17,9 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 from tensorflow.contrib.copy_graph.python.util import copy_elements
-from tensorflow.contrib.framework.python.framework import tensor_util
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
index 40919f2d4c..aa87b94e7b 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
@@ -65,7 +65,6 @@ import six
 import tensorflow as tf
 
 from tensorflow.contrib.eager.python import tfe
-from tensorflow.python.eager import context
 
 try:
   import matplotlib.pyplot as plt  # pylint: disable=g-import-not-at-top
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
index 8f44698da8..35974b9e21 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
@@ -27,16 +27,11 @@ import numpy as np
 from tensorflow.contrib.framework.python.ops import accumulate_n_v2 as av2
 
 from tensorflow.python.eager import backprop
-from tensorflow.python.eager import context as eager_context
-from tensorflow.python.eager import tape
 
 
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import gradients
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
index 6f65fe771e..45962098e9 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
@@ -22,7 +22,6 @@ import numpy as np
 
 from tensorflow.contrib.framework.python.ops import accumulate_n_v2 as av2
 
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index 3f1ece4510..a9d47ac9b9 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -32,9 +32,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import gen_state_ops
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.training import training_util
 from tensorflow.python.util.deprecation import deprecated
diff --git a/tensorflow/contrib/image/python/kernel_tests/single_image_random_dot_stereograms_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/single_image_random_dot_stereograms_ops_test.py
index bf0c97245f..3f4029e558 100644
--- a/tensorflow/contrib/image/python/kernel_tests/single_image_random_dot_stereograms_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/single_image_random_dot_stereograms_ops_test.py
@@ -18,13 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
 from tensorflow.contrib.image.python.ops.single_image_random_dot_stereograms \
     import single_image_random_dot_stereograms
-from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py
index 18bf16e246..ca720ae5ed 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/base.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/base.py
@@ -23,13 +23,11 @@ import csv
 import os
 from os import path
 import random
-import tempfile
 import time
 
 import numpy as np
 from six.moves import urllib
 
-from tensorflow.contrib.framework import deprecated
 from tensorflow.python.platform import gfile
 
 Dataset = collections.namedtuple('Dataset', ['data', 'target'])
diff --git a/tensorflow/contrib/learn/python/learn/ops/ops_test.py b/tensorflow/contrib/learn/python/learn/ops/ops_test.py
index d0b9eb8abc..80d4923db3 100644
--- a/tensorflow/contrib/learn/python/learn/ops/ops_test.py
+++ b/tensorflow/contrib/learn/python/learn/ops/ops_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.layers import conv2d
 from tensorflow.contrib.learn.python.learn import ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/model_pruning/python/layers/layers.py b/tensorflow/contrib/model_pruning/python/layers/layers.py
index dfebb9a679..988748ad75 100644
--- a/tensorflow/contrib/model_pruning/python/layers/layers.py
+++ b/tensorflow/contrib/model_pruning/python/layers/layers.py
@@ -21,7 +21,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 import six
 
 from tensorflow.contrib.framework.python.ops import add_arg_scope
diff --git a/tensorflow/contrib/nn/python/ops/alpha_dropout.py b/tensorflow/contrib/nn/python/ops/alpha_dropout.py
index d7b61a5844..2f92d05ba8 100644
--- a/tensorflow/contrib/nn/python/ops/alpha_dropout.py
+++ b/tensorflow/contrib/nn/python/ops/alpha_dropout.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 
 import numbers
 
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@@ -26,7 +25,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_impl
 
 
 def alpha_dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylint: disable=invalid-name
diff --git a/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py b/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py
index 2ff978ab89..54a98e6f14 100644
--- a/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py
+++ b/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 from tensorflow.contrib.nn.python.ops.alpha_dropout import alpha_dropout
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import nn_impl
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
index b0a257d264..825c08a09a 100644
--- a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
@@ -21,12 +21,9 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.opt.python.training import nadam_optimizer
-from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
index 6a09f70f44..348623d8f8 100644
--- a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
+++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
@@ -18,13 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
+# pylint: disable=unused-import
 from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op
 
 from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample
 
 from tensorflow.contrib.util import loader
 from tensorflow.python.platform import resource_loader
+# pylint: enable=unused-import
 
 _periodic_resample_op = loader.load_op_library(
     resource_loader.get_path_to_datafile('_periodic_resample_op.so'))
diff --git a/tensorflow/contrib/reduce_slice_ops/python/kernel_tests/reduce_slice_ops_test.py b/tensorflow/contrib/reduce_slice_ops/python/kernel_tests/reduce_slice_ops_test.py
index 60a193db4c..468886da20 100644
--- a/tensorflow/contrib/reduce_slice_ops/python/kernel_tests/reduce_slice_ops_test.py
+++ b/tensorflow/contrib/reduce_slice_ops/python/kernel_tests/reduce_slice_ops_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-import unittest
 
 from tensorflow.contrib.reduce_slice_ops.python.ops import reduce_slice_ops
 from tensorflow.python.framework.test_util import TensorFlowTestCase
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index e1838c1739..09527e8473 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -39,8 +39,6 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
-from tensorflow.python.framework import test_util
-from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
 
 # pylint: enable=protected-access
 Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index eb8fd0c1cd..124e841fc2 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -32,12 +32,12 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_impl  # pylint: disable=unused-import
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import partitioned_variables  # pylint: disable=unused-import
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import nn_impl
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
diff --git a/tensorflow/contrib/session_bundle/gc.py b/tensorflow/contrib/session_bundle/gc.py
index 249c23c88f..514cc0f652 100644
--- a/tensorflow/contrib/session_bundle/gc.py
+++ b/tensorflow/contrib/session_bundle/gc.py
@@ -70,7 +70,6 @@ import heapq
 import math
 import os
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.platform import gfile
 from tensorflow.python.util.deprecation import deprecated
 
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
index 73a5cf1e92..890ca20f4c 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
@@ -23,7 +23,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.platform import resource_loader
 
 __all__ = ["sparsemax"]
 
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
index ba18f89e16..582d1e6136 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.util import loader
-from tensorflow.python.platform import resource_loader
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
diff --git a/tensorflow/examples/label_image/label_image.py b/tensorflow/examples/label_image/label_image.py
index 1c1bd57d71..fe5e0fc684 100644
--- a/tensorflow/examples/label_image/label_image.py
+++ b/tensorflow/examples/label_image/label_image.py
@@ -18,7 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-import sys
 
 import numpy as np
 import tensorflow as tf
diff --git a/tensorflow/examples/tutorials/mnist/input_data.py b/tensorflow/examples/tutorials/mnist/input_data.py
index f1a7e1c4af..fa148ae3e6 100644
--- a/tensorflow/examples/tutorials/mnist/input_data.py
+++ b/tensorflow/examples/tutorials/mnist/input_data.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# pylint: disable=unused-import
 import gzip
 import os
 import tempfile
@@ -27,3 +28,4 @@ from six.moves import urllib
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
+# pylint: enable=unused-import
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index f12c005511..490572254b 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -31,7 +31,6 @@ from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
@@ -48,7 +47,6 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
 # Import resource_variable_ops for the variables-to-tensor implicit conversion.
 from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import state_ops
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index c997ead829..1f2aa264c1 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -21,10 +21,10 @@ from __future__ import print_function
 import hashlib
 import imp
 import sys
-import threading
+import threading  # pylint: disable=unused-import
 
 from tensorflow.core.framework import op_def_pb2
-from tensorflow.core.lib.core import error_codes_pb2
+from tensorflow.core.lib.core import error_codes_pb2  # pylint: disable=unused-import
 from tensorflow.python import pywrap_tensorflow as py_tf
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.util import compat
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 15e8f5a38d..3864a4aa9e 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -49,7 +49,7 @@ from tensorflow.python.client import device_lib
 from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.eager import tape
+from tensorflow.python.eager import tape  # pylint: disable=unused-import
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
diff --git a/tensorflow/python/kernel_tests/conv2d_transpose_test.py b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
index 7d0bc54b69..1a65c3f429 100644
--- a/tensorflow/python/kernel_tests/conv2d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.python.client import device_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/python/kernel_tests/decode_bmp_op_test.py b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
index c67c26b7be..35f8f76991 100644
--- a/tensorflow/python/kernel_tests/decode_bmp_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors_impl
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py
index 0c7025f54e..122a9ed469 100644
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-import sys
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/python/kernel_tests/fifo_queue_test.py b/tensorflow/python/kernel_tests/fifo_queue_test.py
index 748135440e..ce73e7ad3e 100644
--- a/tensorflow/python/kernel_tests/fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/fifo_queue_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import random
-import re
 import time
 
 import numpy as np
diff --git a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
index c4e16ff628..b7a79f239c 100644
--- a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
+++ b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import random
-import re
 import time
 
 import numpy as np
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index bb3f6970e4..ac08f2aec0 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys
-
 import numpy as np
 
 from tensorflow.python.framework import constant_op
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py
index 20445c78a2..220ef1754d 100644
--- a/tensorflow/python/ops/candidate_sampling_ops.py
+++ b/tensorflow/python/ops/candidate_sampling_ops.py
@@ -20,9 +20,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import random_seed
-from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import array_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import gen_candidate_sampling_ops
-from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import math_ops  # pylint: disable=unused-import
 from tensorflow.python.util.tf_export import tf_export
 
 
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 9ae9a71e4b..3a6fdaafb9 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -55,7 +55,6 @@ import collections
 import functools
 
 import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import control_flow_pb2
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 314726ede6..28b26a09a5 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -35,7 +35,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import check_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py
index aa7539ae9f..49d54beb20 100644
--- a/tensorflow/python/ops/nn_grad_test.py
+++ b/tensorflow/python/ops/nn_grad_test.py
@@ -24,7 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import nn_grad
+from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 55fcd176d6..5fa5708114 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -27,7 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import candidate_sampling_ops
 from tensorflow.python.ops import embedding_ops
-from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_array_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index 21e8e803fc..667a4b1db8 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -38,7 +38,7 @@ from tensorflow.core.framework import types_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.wrappers import local_cli_wrapper
 from tensorflow.python.framework import ops as ops_lib
-from tensorflow.python.platform import app
+from tensorflow.python.platform import app  # pylint: disable=unused-import
 from tensorflow.python.saved_model import loader
 from tensorflow.python.tools import saved_model_utils
 
diff --git a/tensorflow/python/training/training_ops.py b/tensorflow/python/training/training_ops.py
index e98c32b614..d7133cfb50 100644
--- a/tensorflow/python/training/training_ops.py
+++ b/tensorflow/python/training/training_ops.py
@@ -19,7 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.training import gen_training_ops
+from tensorflow.python.training import gen_training_ops  # pylint: disable=unused-import
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.training.gen_training_ops import *
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index a58db51cb8..636d5a1e81 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -185,7 +185,8 @@ do_pylint() {
   # C0330 bad-continuation
   # C0301 line-too-long
   # C0326 bad-whitespace
-  grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326)' ${OUTPUT_FILE} > ${ERRORS_FILE}
+  # W0611 unused-import
+  grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326|\[W0611)' ${OUTPUT_FILE} > ${ERRORS_FILE}
 
   N_ERRORS=0
   while read -r LINE; do
-- 
GitLab


From 9b08301f474bb1175acf6ef77e6eb2b6552339ba Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Tue, 6 Feb 2018 22:48:48 -0800
Subject: [PATCH 748/966] [XLA:CPU] Add an LLVM IR implementation of Exp

This lets us avoid the usual set of issues that crop up when XLA generated code
has to call into C++.

PiperOrigin-RevId: 184793093
---
 .../xla/service/cpu/compiler_functor.cc       | 15 ++-
 .../xla/service/cpu/cpu_runtime_avx.cc        |  6 --
 .../xla/service/cpu/cpu_runtime_avx.h         |  1 -
 .../xla/service/cpu/cpu_runtime_sse4_1.cc     |  7 --
 .../xla/service/cpu/cpu_runtime_sse4_1.h      |  4 -
 .../xla/service/cpu/llvm_ir_runtime.cc        | 93 ++++++++++++++++++-
 .../xla/service/cpu/llvm_ir_runtime.h         |  2 +
 .../xla/service/cpu/simple_orc_jit.cc         |  3 -
 .../xla/service/cpu/vector_support_library.cc | 20 ++++
 .../xla/service/cpu/vector_support_library.h  | 12 +++
 .../xla/tests/array_elementwise_ops_test.cc   | 38 +++++++-
 11 files changed, 168 insertions(+), 33 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 04b4a8c5c8..2723661712 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -200,25 +200,16 @@ std::vector<llvm::VecDesc> VectorFunctionsForTargetLibraryInfoImpl(
   std::vector<llvm::VecDesc> vector_functions;
 
   const llvm::VecDesc four_wide_vector_functions_neon[] = {
-      {"expf", runtime::kExpV4F32NEONSymbolName, 4},
-      {"llvm.exp.f32", runtime::kExpV4F32NEONSymbolName, 4},
-
       {"logf", runtime::kLogV4F32NEONSymbolName, 4},
       {"llvm.log.f32", runtime::kLogV4F32NEONSymbolName, 4},
   };
 
   const llvm::VecDesc four_wide_vector_functions_sse[] = {
-      {"expf", runtime::kExpV4F32SSESymbolName, 4},
-      {"llvm.exp.f32", runtime::kExpV4F32SSESymbolName, 4},
-
       {"logf", runtime::kLogV4F32SSESymbolName, 4},
       {"llvm.log.f32", runtime::kLogV4F32SSESymbolName, 4},
   };
 
   const llvm::VecDesc eight_wide_vector_functions_avx[] = {
-      {"expf", runtime::kExpV8F32AVXSymbolName, 8},
-      {"llvm.exp.f32", runtime::kExpV8F32AVXSymbolName, 8},
-
       {"logf", runtime::kLogV8F32AVXSymbolName, 8},
       {"llvm.log.f32", runtime::kLogV8F32AVXSymbolName, 8},
   };
@@ -231,6 +222,12 @@ std::vector<llvm::VecDesc> VectorFunctionsForTargetLibraryInfoImpl(
 
       {"tanhf", runtime::kTanhV8F32SymbolName, 8},
       {"llvm.tanh.f32", runtime::kTanhV8F32SymbolName, 8},
+
+      {"expf", runtime::kExpV4F32SymbolName, 4},
+      {"llvm.exp.f32", runtime::kExpV4F32SymbolName, 4},
+
+      {"expf", runtime::kExpV8F32SymbolName, 8},
+      {"llvm.exp.f32", runtime::kExpV8F32SymbolName, 8},
   };
 
   llvm::SmallVector<llvm::StringRef, 32> features;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc
index b1c1142e8d..62bb87f2b0 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc
@@ -20,11 +20,6 @@ limitations under the License.
 #include "third_party/eigen3/Eigen/Core"
 
 #ifdef TF_XLA_HAS_AVX
-xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_ExpV8F32AVX(
-    xla::cpu::runtime::V8F32AVX x) {
-  return Eigen::internal::pexp(x);
-}
-
 xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_LogV8F32AVX(
     xla::cpu::runtime::V8F32AVX x) {
   return Eigen::internal::plog(x);
@@ -35,7 +30,6 @@ namespace xla {
 namespace cpu {
 namespace runtime {
 
-const char *const kExpV8F32AVXSymbolName = "__xla_cpu_runtime_ExpV8F32AVX";
 const char *const kLogV8F32AVXSymbolName = "__xla_cpu_runtime_LogV8F32AVX";
 
 }  // namespace runtime
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
index e5c782f93f..f473c689f2 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
@@ -33,7 +33,6 @@ namespace xla {
 namespace cpu {
 namespace runtime {
 
-extern const char *const kExpV8F32AVXSymbolName;
 extern const char *const kLogV8F32AVXSymbolName;
 
 #ifdef TF_XLA_HAS_AVX
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc
index d8ecf231cc..1d5b5c2c1e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc
@@ -21,12 +21,6 @@ limitations under the License.
 
 #ifdef TF_XLA_HAS_SSE4_1
 
-xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_ExpV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x) {
-  Eigen::internal::Packet4f p = x;
-  return Eigen::internal::pexp(p);
-}
-
 xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_LogV4F32SSE(
     xla::cpu::runtime::V4F32SSE x) {
   Eigen::internal::Packet4f p = x;
@@ -39,7 +33,6 @@ namespace xla {
 namespace cpu {
 namespace runtime {
 
-const char *const kExpV4F32SSESymbolName = "__xla_cpu_runtime_ExpV4F32SSE";
 const char *const kLogV4F32SSESymbolName = "__xla_cpu_runtime_LogV4F32SSE";
 
 }  // namespace runtime
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
index aeb1eda23f..3b3d18112a 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
@@ -35,7 +35,6 @@ namespace xla {
 namespace cpu {
 namespace runtime {
 
-extern const char *const kExpV4F32SSESymbolName;
 extern const char *const kLogV4F32SSESymbolName;
 
 #ifdef TF_XLA_HAS_SSE4_1
@@ -52,9 +51,6 @@ extern "C" {
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
-xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_ExpV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x);
-
 xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_LogV4F32SSE(
     xla::cpu::runtime::V4F32SSE x);
 #endif
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
index 3cd6b6e853..38fcd278e9 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -29,6 +29,8 @@ namespace runtime {
 
 const char* const kTanhV4F32SymbolName = "__xla_cpu_runtime_TanhV4F32";
 const char* const kTanhV8F32SymbolName = "__xla_cpu_runtime_TanhV8F32";
+const char* const kExpV4F32SymbolName = "__xla_cpu_runtime_ExpV4F32";
+const char* const kExpV8F32SymbolName = "__xla_cpu_runtime_ExpV8F32";
 
 namespace {
 llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
@@ -88,6 +90,86 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
   DCHECK(!llvm::verifyFunction(*vector_tanh_function));
   return vector_tanh_function;
 }
+
+llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
+                                         llvm::StringRef function_name,
+                                         int vector_width,
+                                         bool enable_fast_math) {
+  llvm::Function* vector_exp_function = module->getFunction(function_name);
+  if (vector_exp_function == nullptr) {
+    // If the function declaration is not present in the module, there can't be
+    // any calls to resolve.  Don't emit the function in this case.
+    return nullptr;
+  }
+
+  llvm::LLVMContext* context = &module->getContext();
+
+  llvm::BasicBlock* vector_exp_body =
+      llvm::BasicBlock::Create(*context, "body", vector_exp_function);
+
+  llvm::IRBuilder<> ir_builder(vector_exp_body);
+  llvm::FastMathFlags fast_math_flags;
+  fast_math_flags.setFast();
+  ir_builder.setFastMathFlags(fast_math_flags);
+
+  VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "exp_f32");
+
+  // This implements the same polynomial approximation as implemented in Eigen3.
+
+  const double exp_hi = 88.3762626647950;
+  const double exp_lo = -88.3762626647949;
+
+  const double cephes_LOG2EF = 1.44269504088896341;
+  const double cephes_exp_C1 = 0.693359375;
+  const double cephes_exp_C2 = -2.12194440e-4;
+
+  const double cephes_exp_p0 = 1.9875691500E-4;
+  const double cephes_exp_p1 = 1.3981999507E-3;
+  const double cephes_exp_p2 = 8.3334519073E-3;
+  const double cephes_exp_p3 = 4.1665795894E-2;
+  const double cephes_exp_p4 = 1.6666665459E-1;
+  const double cephes_exp_p5 = 5.0000001201E-1;
+
+  llvm::Value* input = &*vector_exp_function->arg_begin();
+  llvm::Value* input_clamped =
+      vsl.Clamp(input, /*low=*/exp_lo, /*high=*/exp_hi);
+  llvm::Value* fx = vsl.Floor(vsl.MulAdd(input_clamped, cephes_LOG2EF, 0.5));
+  llvm::Value* tmp = vsl.Mul(cephes_exp_C1, fx);
+  llvm::Value* z = vsl.Mul(cephes_exp_C2, fx);
+  llvm::Value* x = vsl.Sub(input_clamped, tmp);
+  x = vsl.Sub(x, z);
+  z = vsl.Mul(x, x);
+
+  llvm::Value* y = vsl.MulAdd(x, cephes_exp_p0, cephes_exp_p1);
+  y = vsl.MulAdd(y, x, cephes_exp_p2);
+  y = vsl.MulAdd(y, x, cephes_exp_p3);
+  y = vsl.MulAdd(y, x, cephes_exp_p4);
+  y = vsl.MulAdd(y, x, cephes_exp_p5);
+  y = vsl.MulAdd(y, z, x);
+  y = vsl.Add(1.0, y);
+
+  // VectorSupportLibrary (intentionally) can't juggle more than one type at a
+  // time so drop down to IRBuilder for this bit.
+  llvm::Value* vector_constant_0x7f =
+      ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(0x7f));
+  llvm::Value* vector_constant_23 =
+      ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(23));
+  llvm::Type* i32_vector_type =
+      llvm::VectorType::get(ir_builder.getInt32Ty(), vector_width);
+  // fx is clamped so we don't have to worry about it being out of range for
+  // i32.
+  llvm::Value* emm0 = ir_builder.CreateFPToSI(fx, i32_vector_type);
+  emm0 = ir_builder.CreateAdd(emm0, vector_constant_0x7f);
+  emm0 = ir_builder.CreateShl(emm0, vector_constant_23);
+  llvm::Value* emm0_f32 = ir_builder.CreateBitCast(emm0, vsl.vector_type());
+
+  llvm::Value* result = vsl.Max(vsl.Mul(y, emm0_f32), input);
+
+  ir_builder.CreateRet(result);
+
+  CHECK(!llvm::verifyFunction(*vector_exp_function));
+  return vector_exp_function;
+}
 }  // namespace
 
 void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math) {
@@ -98,11 +180,18 @@ void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math) {
       EmitVectorF32TanhIfNeeded(module, kTanhV8F32SymbolName,
                                 /*vector_width=*/8, enable_fast_math);
 
+  auto* exp_v4f32 =
+      EmitVectorF32ExpIfNeeded(module, kExpV4F32SymbolName,
+                               /*vector_width=*/4, enable_fast_math);
+  auto* exp_v8f32 =
+      EmitVectorF32ExpIfNeeded(module, kExpV8F32SymbolName,
+                               /*vector_width=*/8, enable_fast_math);
+
   // Gather all the call sites, force inline them and then delete the vector
   // function bodies.
 
   std::vector<llvm::CallInst*> calls_to_inline;
-  for (auto* function : {tanh_v4f32, tanh_v8f32}) {
+  for (auto* function : {tanh_v4f32, tanh_v8f32, exp_v4f32, exp_v8f32}) {
     if (function != nullptr) {
       for (auto* user : function->users()) {
         calls_to_inline.push_back(llvm::cast<llvm::CallInst>(user));
@@ -115,7 +204,7 @@ void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math) {
     CHECK(llvm::InlineFunction(call_to_inline, inline_function_info));
   }
 
-  for (auto* function : {tanh_v4f32, tanh_v8f32}) {
+  for (auto* function : {tanh_v4f32, tanh_v8f32, exp_v4f32, exp_v8f32}) {
     if (function != nullptr) {
       function->eraseFromParent();
     }
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h
index 7f31fb98b0..afb9ac71fa 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h
@@ -25,6 +25,8 @@ namespace runtime {
 
 extern const char* const kTanhV4F32SymbolName;
 extern const char* const kTanhV8F32SymbolName;
+extern const char* const kExpV4F32SymbolName;
+extern const char* const kExpV8F32SymbolName;
 
 // The following CPU runtime functions have LLVM-IR only implementations:
 //
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index de5e9b4119..19250b8e7e 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -216,15 +216,12 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64);
 #ifdef TF_XLA_HAS_NEON
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32NEON);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32NEON);
 #endif
 #ifdef TF_XLA_HAS_SSE4_1
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32SSE);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32SSE);
 #endif
 #ifdef TF_XLA_HAS_AVX
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV8F32AVX);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV8F32AVX);
 #endif
   REGISTER_CPU_RUNTIME_SYMBOL(ParallelForkJoin);
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
index a910e41050..098d89d8c9 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
@@ -54,6 +54,26 @@ llvm::Value* VectorSupportLibrary::Add(llvm::Value* lhs, llvm::Value* rhs) {
   return AddInternal(lhs, rhs);
 }
 
+llvm::Value* VectorSupportLibrary::Sub(llvm::Value* lhs, llvm::Value* rhs) {
+  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  return ir_builder()->CreateFSub(lhs, rhs);
+}
+
+llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) {
+  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  if (scalar_type_->isFloatingPointTy()) {
+    return llvm_ir::EmitFloatMax(lhs, rhs, ir_builder_);
+  } else {
+    LOG(FATAL) << "Max for integers is unimplemented";
+  }
+}
+
+llvm::Value* VectorSupportLibrary::Floor(llvm::Value* a) {
+  CHECK(a->getType() == scalar_type() || a->getType() == vector_type());
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor, {a},
+                                      {a->getType()}, ir_builder());
+}
+
 llvm::Value* VectorSupportLibrary::Div(llvm::Value* lhs, llvm::Value* rhs) {
   CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
   if (scalar_type_->isFloatingPointTy()) {
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
index 6091146824..5c6e2ecad9 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -41,6 +41,9 @@ class VectorSupportLibrary {
   llvm::Value* Mul(int64 lhs, llvm::Value* rhs) {
     return Mul(ir_builder()->getInt64(lhs), rhs);
   }
+  llvm::Value* Mul(double lhs, llvm::Value* rhs) {
+    return Mul(llvm::ConstantFP::get(rhs->getType(), lhs), rhs);
+  }
 
   llvm::Value* Add(llvm::Value* lhs, llvm::Value* rhs);
   llvm::Value* Add(int64 lhs, llvm::Value* rhs) {
@@ -50,6 +53,8 @@ class VectorSupportLibrary {
     return Add(llvm::ConstantFP::get(vector_type(), lhs), rhs);
   }
 
+  llvm::Value* Sub(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* Max(llvm::Value* lhs, llvm::Value* rhs);
   llvm::Value* Div(llvm::Value* lhs, llvm::Value* rhs);
 
   llvm::Value* MulAdd(llvm::Value* a, llvm::Value* b, llvm::Value* c) {
@@ -60,6 +65,13 @@ class VectorSupportLibrary {
     return Add(llvm::ConstantFP::get(vector_type(), c), Mul(a, b));
   }
 
+  llvm::Value* MulAdd(llvm::Value* a, double b, double c) {
+    return Add(llvm::ConstantFP::get(a->getType(), c),
+               Mul(a, llvm::ConstantFP::get(a->getType(), b)));
+  }
+
+  llvm::Value* Floor(llvm::Value* a);
+
   llvm::Value* Clamp(llvm::Value* a, double low, double high);
   llvm::Value* SplatFloat(double d) {
     return llvm::ConstantFP::get(vector_type(), d);
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index b788631fa3..177b58979c 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -2048,7 +2048,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, TanhF32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
   // This is like the test ArrayElementwiseOpTest.TanhF32s above, except that
   // the input tensor is large enough to exercise the vectorized tanh
-  // implementation.
+  // implementation on XLA CPU.
   ComputationBuilder builder(client_, TestName());
   auto input_literal = Literal::CreateR2<float>(
       {{1.02, -0.32, 0.85, 0.90, 1.23, -0.91, -0.49, 0.80},
@@ -2089,6 +2089,42 @@ XLA_TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
       ErrorSpec(0.004, 0.004));
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ExpF32sVector) {
+  // The input tensor is large enough to exercise the vectorized exp
+  // implementation on XLA CPU.
+  ComputationBuilder builder(client_, TestName());
+
+  // Just to help make sense of the scales here -- exp(89) saturates float32 and
+  // exp(-10) is smaller than our error spec.
+  std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>(
+      {{1.02, -0.32, 0.85, 0.9, 1.23, -0.91, -0.49, 0.8},
+       {-1.31, -1.44, -0.13, -1.31, -0.79, 1.41, 1.21, 1.05},
+       {-195.6, -194.5, -193.4, -192.3, -191.2, -190.1, -189.0, -187.9},
+       {-19.6, -18.5, -17.4, -16.3, -15.2, -14.1, -13.0, -11.9},
+       {-10.8, -9.7, -8.6, -7.5, -6.4, -5.3, -4.2, -3.1},
+       {-2.0, -0.9, 0.2, 1.3, 2.4, 3.5, 4.6, 5.7},
+       {6.8, 7.9, 9.0, 10.1, 11.2, 12.3, 13.4, 14.5},
+       {15.6, 16.7, 17.8, 18.9, 20.0, 21.1, 22.2, 23.3},
+       {24.4, 25.5, 26.6, 27.7, 28.8, 29.9, 31.0, 32.1},
+       {68.4, 69.5, 70.6, 71.7, 72.8, 73.9, 75.0, 76.1},
+       {77.2, 78.3, 79.4, 80.5, 81.6, 82.7, 83.8, 84.9},
+       {85.2, 86.3, 86.4, 86.5, 87.6, 87.7, 87.8, 87.9}});
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
+                          client_->TransferToServer(*input_literal));
+
+  auto input = builder.Parameter(0, input_literal->shape(), "input");
+  builder.Exp(input);
+
+  Array2D<float> expected_result(input_literal->shape().dimensions(0),
+                                 input_literal->shape().dimensions(1));
+  expected_result.Each([&](int64 r, int64 c, float* result) {
+    *result = std::exp(input_literal->Get<float>({r, c}));
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected_result, {input_data.get()},
+                             error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) {
   // a ------ (add) --------- (add)
   //         /               /
-- 
GitLab


From 39f499b18ad2c2aa1b96b793d5e2404ba765b36f Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 6 Feb 2018 22:53:33 -0800
Subject: [PATCH 749/966] Fix incorrect links in CONTRIBUTING.md (#16814)

This fix fixes two incorrect links in CONTRIBUTING.md
about license examples. The reason for broken links
is because tensorboard is in another repo.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 CONTRIBUTING.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index de4fded6ae..16d7051343 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -68,8 +68,8 @@ Include a license at the top of new files.
 * [Java license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/java/src/main/java/org/tensorflow/Graph.java#L1)
 * [Go license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/go/operation.go#L1)
 * [Bash license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/ci_build/ci_sanity.sh#L2)
-* [HTML license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/dist/index.html#L2)
-* [JavaScript/TypeScript license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/components/tf_backend/backend.ts#L1)
+* [HTML license example](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/components/tf_backend/tf-backend.html#L2)
+* [JavaScript/TypeScript license example](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/components/tf_backend/backend.ts#L1)
 
 Bazel BUILD files also need to include a license section, e.g.,
 [BUILD example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/BUILD#L61).
-- 
GitLab


From 8aa14cd682053e1e643f0a74ec25cf3b87bf2712 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Wed, 7 Feb 2018 07:54:21 +0100
Subject: [PATCH 750/966] Typo in variable name: BETA --> self.BETA (#16666)

__BETA__ is defined on line 118 as a class member so it can only be accessed via __self__ or via the __ElasticAverageOptimizer__.

flake8 testing of https://github.com/tensorflow/tensorflow

$ __flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics__
```
./tensorflow/contrib/opt/python/training/elastic_average_optimizer.py:153:27: F821 undefined name 'BETA'
      self._moving_rate = BETA / communication_period / num_worker
                          ^
```
---
 .../contrib/opt/python/training/elastic_average_optimizer.py    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
index 716ee9cdf7..5763593b81 100644
--- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
@@ -150,7 +150,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     self._global_map = ea_custom_getter._global_map
 
     if moving_rate is None:
-      self._moving_rate = BETA / communication_period / num_worker
+      self._moving_rate = self.BETA / communication_period / num_worker
     else:
       self._moving_rate = moving_rate
     if rho is None:
-- 
GitLab


From 802080ac2a529598df484f54058cd75023ff3f8c Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Tue, 6 Feb 2018 23:54:26 -0800
Subject: [PATCH 751/966] [XLA] Use HloVerifiedTestBase in
 AlgebraicSimplifierTest

And fix the fallout.  Thanks to asbirlea@ for noticing this!

PiperOrigin-RevId: 184796949
---
 .../xla/service/algebraic_simplifier_test.cc  | 413 ++++++++----------
 tensorflow/compiler/xla/window_util.cc        |   2 +
 2 files changed, 175 insertions(+), 240 deletions(-)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index e43ea50af4..0f08eb3a32 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -61,13 +61,12 @@ TEST_F(AlgebraicSimplifierTest, AddZero) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param0, zero));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -83,13 +82,12 @@ TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, constant, param0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(param0, op::Constant()));
 }
@@ -110,13 +108,12 @@ TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, add1, constant2));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(param0, op::Add(constant1, constant2)));
 }
@@ -133,13 +130,12 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -156,13 +152,12 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -178,13 +173,12 @@ TEST_F(AlgebraicSimplifierTest, SubZero) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kSubtract, param0, zero));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSubtract);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -200,13 +194,12 @@ TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kSubtract, param0, constant));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSubtract);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(param0, op::Negate(constant)));
 }
@@ -226,15 +219,14 @@ TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, div, param2));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(op::Divide(param0, param1), param2));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Multiply(param1, param2)));
@@ -255,15 +247,14 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, div));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Divide(param1, param2)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(op::Multiply(param0, param2), param1));
@@ -289,8 +280,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, div0, div1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -298,7 +288,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -320,15 +310,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfExp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, exp));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Exp(param1)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(param0, op::Exp(op::Negate(param1))));
@@ -349,15 +338,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, power));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Power(param1, param2)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(param0, op::Power(param1, op::Negate(param2))));
@@ -380,15 +368,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kDivide, param0, power));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Power(param1, param2)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   ASSERT_THAT(computation->root_instruction(),
               op::Multiply(param0, op::Power(param1, op::Negate(param2))));
@@ -411,12 +398,11 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) {
   builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kDivide,
                                                       param0, constant));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(param0, op::Divide(op::Constant(), constant)));
@@ -438,11 +424,10 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) {
   builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kPower,
                                                       inner_power, exp2));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
               op::Power(base, op::Multiply(exp1, exp2)));
 }
@@ -451,24 +436,23 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) {
 // numbers.
 TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) {
   Shape r0c64 = ShapeUtil::MakeShape(C64, {});
-  Shape r1f32 = ShapeUtil::MakeShape(F32, {7});
+  Shape r1c64 = ShapeUtil::MakeShape(C64, {7});
   HloComputation::Builder builder(TestName());
   HloInstruction* base = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r1f32, "param0"));
+      HloInstruction::CreateParameter(0, r1c64, "param0"));
   HloInstruction* exp1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, r0c64, "param1"));
   HloInstruction* exp2 = builder.AddInstruction(
       HloInstruction::CreateParameter(2, r0c64, "param2"));
   HloInstruction* inner_power = builder.AddInstruction(
-      HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, base, exp1));
-  builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kPower,
+      HloInstruction::CreateBinary(r1c64, HloOpcode::kPower, base, exp1));
+  builder.AddInstruction(HloInstruction::CreateBinary(r1c64, HloOpcode::kPower,
                                                       inner_power, exp2));
 
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
+  module().AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_FALSE(simplifier.Run(&module()).ValueOrDie());
 }
 
 // Test that A/1 is simplified to A for a scalar.
@@ -482,13 +466,12 @@ TEST_F(AlgebraicSimplifierTest, DivOneScalar) {
   HloInstruction* div = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, one));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, div);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -504,13 +487,12 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) {
   HloInstruction* div = builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, param0, one));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, div);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -529,13 +511,12 @@ TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) {
   HloInstruction* cplx = builder.AddInstruction(
       HloInstruction::CreateBinary(r2c64, HloOpcode::kComplex, real, imag));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, cplx);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -554,13 +535,12 @@ TEST_F(AlgebraicSimplifierTest, RealOfComplex) {
   HloInstruction* real = builder.AddInstruction(
       HloInstruction::CreateUnary(r2f32, HloOpcode::kReal, cplx));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, real);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -579,13 +559,12 @@ TEST_F(AlgebraicSimplifierTest, ImagOfComplex) {
   HloInstruction* imag = builder.AddInstruction(
       HloInstruction::CreateUnary(r2f32, HloOpcode::kImag, cplx));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, imag);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param1);
 }
@@ -607,13 +586,12 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) {
   HloInstruction* add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, get, param2));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, add);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(param1, param2));
 }
@@ -633,15 +611,14 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, exp0, exp1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(op::Exp(param0), op::Exp(param1)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Exp(op::Subtract(param0, param1)));
@@ -662,15 +639,14 @@ TEST_F(AlgebraicSimplifierTest, ExpMul) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kMultiply, exp0, exp1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(op::Exp(param0), op::Exp(param1)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Exp(op::Add(param0, param1)));
@@ -689,15 +665,14 @@ TEST_F(AlgebraicSimplifierTest, PowExp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, exp0, param1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Power(op::Exp(param0), param1));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Exp(op::Multiply(param0, param1)));
@@ -716,15 +691,14 @@ TEST_F(AlgebraicSimplifierTest, LnPow) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, pow));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Log(op::Power(param0, param1)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(op::Log(param0), param1));
@@ -741,14 +715,13 @@ TEST_F(AlgebraicSimplifierTest, LnExp) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, exp0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Log(op::Exp(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), param0);
 }
@@ -770,15 +743,14 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, div));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Log(op::Divide(op::Exp(param0), op::Exp(param1))));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Subtract(param0, param1));
 }
@@ -795,14 +767,13 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, zero));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Constant());
@@ -820,14 +791,13 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, param0, zero));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast());
@@ -849,14 +819,13 @@ TEST_F(AlgebraicSimplifierTest, Pow1) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, one));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, one));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), param0);
 }
@@ -872,14 +841,13 @@ TEST_F(AlgebraicSimplifierTest, Pow2) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, two));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, two));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, param0));
 }
@@ -895,14 +863,13 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32, HloOpcode::kPower,
                                                       param0, negative_one));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, negative_one));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Divide(op::Broadcast(), param0));
@@ -941,16 +908,15 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) {
   dim->set_base_dilation(1);
   dim->set_window_reversal(false);
   // Create add computation.
-  std::unique_ptr<HloModule> module = CreateNewModule();
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {3, 3, 3}), lhs, rhs, window, dnums));
-  module->AddEntryComputation(builder.Build());
+  module().AddEntryComputation(builder.Build());
   HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
                                              non_bitcasting_callback());
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
               op::Convolution(lhs, rhs));
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
               op::Broadcast(op::Constant()));
 }
 
@@ -969,7 +935,6 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
     dim->set_base_dilation(1);
   }
   // Create add computation.
-  std::unique_ptr<HloModule> module = CreateNewModule();
   HloComputation* add_computation = nullptr;
   {
     HloComputation::Builder builder(TestName() + ".add");
@@ -980,20 +945,20 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module->AddEmbeddedComputation(builder.Build());
+    add_computation = module().AddEmbeddedComputation(builder.Build());
   }
   builder.AddInstruction(HloInstruction::CreateReduceWindow(
       ShapeUtil::MakeShape(F32, {5, 2}), param,
       builder.AddInstruction(
           HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f))),
       window, add_computation));
-  module->AddEntryComputation(builder.Build());
+  module().AddEntryComputation(builder.Build());
   HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
                                              non_bitcasting_callback());
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
               op::ReduceWindow(param, op::Constant()));
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
               op::Broadcast(op::Constant()));
 }
 
@@ -1014,14 +979,13 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) {
       builder.AddInstruction(
           HloInstruction::CreateConstant(Literal::CreateR0(0.0f))),
       padding));
-  std::unique_ptr<HloModule> module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
+  module().AddEntryComputation(builder.Build());
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
               op::Pad(param, op::Constant()));
   HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
                                              non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
               op::Broadcast(op::Constant()));
 }
 
@@ -1039,17 +1003,16 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
       ShapeUtil::MakeShape(F32, {3, 2}), broadcast));
 
   auto computation = builder.Build();
-  auto module = CreateNewModule();
-  module->AddEntryComputation(std::move(computation));
+  module().AddEntryComputation(std::move(computation));
 
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
               op::Reshape(op::Broadcast(op::Reshape(op))));
 
   HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
                                              non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
-  EXPECT_THAT(module->entry_computation()->root_instruction(), op);
+  EXPECT_THAT(module().entry_computation()->root_instruction(), op);
 }
 
 // Test that convert(A, $TYPE) is simplified to A if A is of type $TYPE.
@@ -1060,14 +1023,13 @@ TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) {
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Convert(input));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), input);
 }
@@ -1081,14 +1043,13 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param0);
 }
@@ -1102,14 +1063,13 @@ TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
   builder.AddInstruction(
       HloInstruction::CreateConcatenate(param0->shape(), {param0}, 0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param0);
 }
@@ -1132,8 +1092,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -1141,7 +1100,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Concatenate(param0, param0, param1));
@@ -1163,15 +1122,14 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, empty_slice}, 0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Concatenate(empty_literal, empty_slice));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), empty_literal);
 }
@@ -1188,14 +1146,13 @@ TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) {
   HloInstruction* broadcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(r1f32, param1, {}));
   builder.AddInstruction(HloInstruction::CreateConcatenate(
-      param0->shape(), {broadcast, param0}, 0));
+      ShapeUtil::MakeShape(F32, {200}), {broadcast, param0}, 0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), op::Pad(param0, param1));
 }
 
@@ -1209,8 +1166,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
   HloInstruction* copy = builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   // Set to different layouts.
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
@@ -1220,7 +1176,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
 
   // Copy has not been removed.
   EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
@@ -1236,8 +1192,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
   HloInstruction* copy = builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   // Set to same layouts.
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
@@ -1247,7 +1202,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   // Copy has been removed.
   EXPECT_THAT(computation->root_instruction(), param0);
@@ -1268,14 +1223,13 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
   *reshape->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({0, 1, 2, 3, 4, 5});
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
 
   // Reshape is not replaced with a bitcast.
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
@@ -1314,8 +1268,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
   builder.AddInstruction(HloInstruction::CreateTuple(
       {transformable_reshape, dimensions_wrong_reshape, layout_wrong_reshape}));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Tuple(transformable_reshape, dimensions_wrong_reshape,
@@ -1323,7 +1276,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
-  simplifier.Run(module.get()).ValueOrDie();
+  simplifier.Run(&module()).ValueOrDie();
 
   // Verify that only the first reshape is replaced.
   EXPECT_THAT(
@@ -1344,8 +1297,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}),
                                    HloOpcode::kMaximum, movable_reshape, zero));
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Maximum(op::Reshape(param), zero));
@@ -1353,7 +1305,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) {
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  bitcasting_callback());
 
-  simplifier.Run(module.get()).ValueOrDie();
+  simplifier.Run(&module()).ValueOrDie();
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Maximum(param, zero)));
 }
@@ -1371,8 +1323,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeToScalarNotHoistedAfterEffectiveUnary) {
       HloInstruction::CreateConstant(Literal::CreateR1<float>({1., 2., 3.})));
   builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(F32, {3}), HloOpcode::kMaximum, reshape, zero));
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Maximum(op::Reshape(param), zero));
@@ -1380,7 +1331,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeToScalarNotHoistedAfterEffectiveUnary) {
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  bitcasting_callback());
 
-  simplifier.Run(module.get()).ValueOrDie();
+  simplifier.Run(&module()).ValueOrDie();
 
   EXPECT_THAT(computation->root_instruction(),
               op::Maximum(op::Reshape(param), zero));
@@ -1405,9 +1356,8 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  bitcasting_callback());
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  module().AddEntryComputation(builder.Build());
+  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
 }
 
 // Regression test for a bug where if we failed to sink a reshape, we'd set the
@@ -1424,14 +1374,14 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) {
       builder.AddInstruction(HloInstruction::CreateConstant(
           Literal::CreateR2<float>({{0, 0}, {0, 0}})))));
 
-  builder.AddInstruction(HloInstruction::CreateBroadcast(
-      ShapeUtil::MakeShape(F32, {2, 2, 2}), add, /*broadcast_dimensions=*/{0}));
+  builder.AddInstruction(
+      HloInstruction::CreateBroadcast(ShapeUtil::MakeShape(F32, {2, 2, 2}), add,
+                                      /*broadcast_dimensions=*/{0, 1}));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  bitcasting_callback());
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  module().AddEntryComputation(builder.Build());
+  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
@@ -1448,14 +1398,13 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
   *transpose->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({0, 1, 2, 3});
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   // Verify that the reshape is replaced.
   EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
@@ -1475,14 +1424,13 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
   *transpose->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({3, 1, 2, 0});
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   // Verify that the reshape is replaced.
   EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
@@ -1501,15 +1449,14 @@ TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {1, 2, 1, 1, 2, 1}), reshape1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Reshape(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 }
@@ -1529,14 +1476,13 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 2, 1}),
       HloOpcode::kCopy, copy1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Copy(op::Copy(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 }
@@ -1554,14 +1500,13 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
   builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(F32, {4, 3, 2}), transpose1, {1, 0, 2}));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(transpose1));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(param0));
   EXPECT_EQ(std::vector<int64>({2, 1, 0}),
@@ -1576,17 +1521,16 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {1, 5, 1}), param0));
   builder.AddInstruction(HloInstruction::CreateBroadcast(
-      ShapeUtil::MakeShape(F32, {1, 2, 3, 5, 1}), reshape1, {0, 2, 3}));
+      ShapeUtil::MakeShape(F32, {1, 2, 3, 5, 1}), reshape1, {0, 3, 2}));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Broadcast(op::Reshape(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
 }
@@ -1601,15 +1545,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {2, 3, 7, 2, 1, 3, 2}), broadcast1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
 }
@@ -1623,15 +1566,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), broadcast));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
@@ -1646,15 +1588,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 4}), broadcast));
 
-  auto module = CreateNewModule();
-  HloComputation* computation = module->AddEntryComputation(builder.Build());
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
   EXPECT_THAT(computation->root_instruction()->dimensions(),
@@ -1670,15 +1611,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 1}), broadcast));
 
-  auto module = CreateNewModule();
-  HloComputation* computation = module->AddEntryComputation(builder.Build());
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
   const std::vector<int64> broadcast_dims =
@@ -1696,15 +1636,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 8}), broadcast));
 
-  auto module = CreateNewModule();
-  HloComputation* computation = module->AddEntryComputation(builder.Build());
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
@@ -2410,12 +2349,11 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
   call_builder.AddInstruction(
       HloInstruction::CreateCall(r1f32, {zero, one}, dot_computation.get()));
 
-  auto module = CreateNewModule();
-  module->AddEmbeddedComputation(std::move(dot_computation));
-  module->AddEntryComputation(call_builder.Build());
+  module().AddEmbeddedComputation(std::move(dot_computation));
+  module().AddEntryComputation(call_builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 }
 
 // Test that a constant with tuple shape becomes a tuple of constants.
@@ -2428,12 +2366,11 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) {
                           Literal::CreateR1<float>(constant_vector).get()});
   builder.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
               op::Tuple(op::Constant(), op::Constant()));
 }
@@ -2453,11 +2390,10 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
           HloInstruction::CreateConstant(Literal::CreateR1<int>({0, 0, 0}))),
       /*slice_sizes=*/{10, 100, 1000}));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), op::Parameter());
 }
 
@@ -2487,11 +2423,10 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
       builder.AddInstruction(
           HloInstruction::CreateConstant(Literal::CreateR1<int>({0, 0, 0})))));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
               op::DynamicSlice(op::Parameter(), op::Parameter()));
 }
@@ -2554,15 +2489,16 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
 
   PaddingConfig padding = window_util::MakeSymmetricPadding(
       decorate_spatials(param.symmetric_pad_spatials, 0, 0));
+  TF_ASSERT_OK_AND_ASSIGN(
+      const Shape pad_shape,
+      ShapeInference::InferPadShape(input->shape(),
+                                    ShapeUtil::MakeShape(F32, {}), padding));
   HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
-      ShapeUtil::MakeShape(
-          F32, decorate_spatials(param.reduce_window_spatials, 128, 2048)),
-      input,
+      pad_shape, input,
       builder.AddInstruction(
           HloInstruction::CreateConstant(Literal::CreateR0(0.0f))),
       padding));
 
-  std::unique_ptr<HloModule> module = CreateNewModule();
   HloComputation* add_computation = nullptr;
   {
     HloComputation::Builder builder(TestName() + ".add");
@@ -2573,24 +2509,24 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module->AddEmbeddedComputation(builder.Build());
+    add_computation = module().AddEmbeddedComputation(builder.Build());
   }
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      const Shape output_shape,
-      ShapeInference::InferPadShape(input_shape, ShapeUtil::MakeShape(F32, {}),
-                                    padding));
   Window window = window_util::MakeWindow(
       decorate_spatials(param.reduce_window_spatials, 1, 1));
   auto zero = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_shape,
+                          ShapeInference::InferReduceWindowShape(
+                              pad->shape(), zero->shape(), window,
+                              add_computation->ComputeProgramShape()));
   builder.AddInstruction(HloInstruction::CreateReduceWindow(
       output_shape, pad, zero, window, add_computation));
 
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(module.get()));
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
   ASSERT_TRUE(run_successful);
 
   EXPECT_TRUE(
@@ -2667,11 +2603,10 @@ TEST_P(DotStrengthReductionTest, DotStrengthReduction) {
   dot_dnums.add_rhs_contracting_dimensions(0);
   builder.AddInstruction(
       HloInstruction::CreateDot(dot_shape, lhs, rhs, dot_dnums));
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get()));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(&module()));
   const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1;
   const bool computation_should_be_modified =
       dot_should_be_transformed || (transpose_lhs && transpose_rhs);
@@ -2699,7 +2634,7 @@ struct DotOfConcatTestSpec {
 };
 
 class DotOfConcatSimplificationTest
-    : public HloTestBase,
+    : public HloVerifiedTestBase,
       public ::testing::WithParamInterface<DotOfConcatTestSpec> {};
 
 // Test that we transform
@@ -2745,11 +2680,10 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) {
   builder.AddInstruction(
       HloInstruction::CreateDot(dot_shape, lhs, rhs, dot_dnums));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(module.get()));
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
   ASSERT_TRUE(run_successful);
 
   EXPECT_TRUE(
@@ -2790,17 +2724,17 @@ TEST_P(DotOfConcatSimplificationTest, ConstantRHS) {
   HloInstruction* lhs2 = builder.AddInstruction(
       HloInstruction::CreateParameter(2, lhs2_shape, "lhs2"));
   HloInstruction* lhs3 = builder.AddInstruction(
-      HloInstruction::CreateParameter(3, lhs2_shape, "lhs3"));
+      HloInstruction::CreateParameter(3, lhs3_shape, "lhs3"));
 
   Shape lhs_shape = ShapeUtil::MakeShape(F32, {spec.m, spec.k});
   HloInstruction* lhs =
       builder.AddInstruction(HloInstruction::CreateConcatenate(
           lhs_shape, {lhs0, lhs1, lhs2, lhs3}, 1));
 
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {spec.k, spec.m});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {spec.k, spec.n});
   auto* rhs = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
-          /*from=*/10.0, /*to=*/10000.0, /*rows=*/spec.k, /*cols=*/spec.m)));
+          /*from=*/10.0, /*to=*/10000.0, /*rows=*/spec.k, /*cols=*/spec.n)));
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
@@ -2810,11 +2744,10 @@ TEST_P(DotOfConcatSimplificationTest, ConstantRHS) {
   builder.AddInstruction(
       HloInstruction::CreateDot(dot_shape, lhs, rhs, dot_dnums));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(module.get()));
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
   ASSERT_TRUE(run_successful);
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 55f42ed3a4..93284b80f9 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -32,6 +32,8 @@ Window MakeWindow(tensorflow::gtl::ArraySlice<int64> sizes) {
     auto* dimension = window.add_dimensions();
     dimension->set_size(size);
     dimension->set_stride(1);
+    dimension->set_base_dilation(1);
+    dimension->set_window_dilation(1);
   }
   return window;
 }
-- 
GitLab


From 68a7f292b324e08fefd2c47dde444720db361f4d Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Tue, 6 Feb 2018 23:55:28 -0800
Subject: [PATCH 752/966] [XLA:CPU] Assert more thoroughly on preconditions in
 VectorSupportlibrary

No behavior change intended.

PiperOrigin-RevId: 184797003
---
 tensorflow/compiler/xla/service/cpu/BUILD     |  1 +
 .../xla/service/cpu/vector_support_library.cc | 38 +++++++++++++++----
 .../xla/service/cpu/vector_support_library.h  |  5 +++
 3 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 7228e8da42..1a91dd8ff7 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -853,6 +853,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "@llvm//:core",
+        "@llvm//:support",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
index 098d89d8c9..ec4215b468 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
 
+#include "llvm/Support/raw_ostream.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 
@@ -35,8 +36,27 @@ VectorSupportLibrary::VectorSupportLibrary(PrimitiveType primitive_type,
   vector_pointer_type_ = llvm::PointerType::getUnqual(vector_type_);
 }
 
+static string TypeToString(llvm::Type* type) {
+  std::string o;
+  llvm::raw_string_ostream ostream(o);
+  type->print(ostream);
+  return ostream.str();
+}
+
+void VectorSupportLibrary::AssertCorrectTypes(
+    std::initializer_list<llvm::Value*> values) {
+  for (llvm::Value* v : values) {
+    llvm::Type* type = v->getType();
+    if (type != scalar_type() && type != vector_type()) {
+      LOG(FATAL) << "Expected either " << TypeToString(scalar_type()) << " or "
+                 << TypeToString(vector_type()) << " but got "
+                 << TypeToString(type);
+    }
+  }
+}
+
 llvm::Value* VectorSupportLibrary::Mul(llvm::Value* lhs, llvm::Value* rhs) {
-  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  AssertCorrectTypes({lhs, rhs});
   return MulInternal(lhs, rhs);
 }
 
@@ -50,17 +70,17 @@ llvm::Value* VectorSupportLibrary::MulInternal(llvm::Value* lhs,
 }
 
 llvm::Value* VectorSupportLibrary::Add(llvm::Value* lhs, llvm::Value* rhs) {
-  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  AssertCorrectTypes({lhs, rhs});
   return AddInternal(lhs, rhs);
 }
 
 llvm::Value* VectorSupportLibrary::Sub(llvm::Value* lhs, llvm::Value* rhs) {
-  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  AssertCorrectTypes({lhs, rhs});
   return ir_builder()->CreateFSub(lhs, rhs);
 }
 
 llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) {
-  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  AssertCorrectTypes({lhs, rhs});
   if (scalar_type_->isFloatingPointTy()) {
     return llvm_ir::EmitFloatMax(lhs, rhs, ir_builder_);
   } else {
@@ -69,13 +89,13 @@ llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) {
 }
 
 llvm::Value* VectorSupportLibrary::Floor(llvm::Value* a) {
-  CHECK(a->getType() == scalar_type() || a->getType() == vector_type());
+  AssertCorrectTypes({a});
   return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor, {a},
                                       {a->getType()}, ir_builder());
 }
 
 llvm::Value* VectorSupportLibrary::Div(llvm::Value* lhs, llvm::Value* rhs) {
-  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  AssertCorrectTypes({lhs, rhs});
   if (scalar_type_->isFloatingPointTy()) {
     return ir_builder()->CreateFDiv(lhs, rhs, name());
   } else {
@@ -85,10 +105,10 @@ llvm::Value* VectorSupportLibrary::Div(llvm::Value* lhs, llvm::Value* rhs) {
 
 llvm::Value* VectorSupportLibrary::Clamp(llvm::Value* a, double low,
                                          double high) {
+  AssertCorrectTypes({a});
+  llvm::Type* type = a->getType();
   CHECK_LT(low, high);
   CHECK(scalar_type_->isFloatingPointTy());
-  llvm::Type* type = a->getType();
-  CHECK(type == vector_type() || type == scalar_type());
   return llvm_ir::EmitFloatMin(
       llvm_ir::EmitFloatMax(a, llvm::ConstantFP::get(type, low), ir_builder_),
       llvm::ConstantFP::get(type, high), ir_builder_);
@@ -133,6 +153,7 @@ llvm::Value* VectorSupportLibrary::LoadScalar(llvm::Value* pointer) {
 
 void VectorSupportLibrary::StoreVector(llvm::Value* value,
                                        llvm::Value* pointer) {
+  AssertCorrectTypes({value});
   if (pointer->getType() != vector_pointer_type()) {
     pointer = ir_builder()->CreateBitCast(pointer, vector_pointer_type());
   }
@@ -142,6 +163,7 @@ void VectorSupportLibrary::StoreVector(llvm::Value* value,
 
 void VectorSupportLibrary::StoreScalar(llvm::Value* value,
                                        llvm::Value* pointer) {
+  AssertCorrectTypes({value});
   if (pointer->getType() != scalar_pointer_type()) {
     pointer =
         ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
index 5c6e2ecad9..5c5d703db5 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -170,6 +170,11 @@ class VectorSupportLibrary {
 
   llvm::Value* AddReduce(llvm::Value* vector);
 
+  // Checks that each value in `values` is either of type scalar_type() or
+  // vector_type().  This LOG(FATAL)'s so it should only be called in cases
+  // where a mismatching type is a programmer bug.
+  void AssertCorrectTypes(std::initializer_list<llvm::Value*> values);
+
   // Perform an X86 AVX style horizontal add between `lhs` and `rhs`.  The
   // resulting IR for an 8-float wide vector is expected to lower to a single
   // vhaddps instruction on a CPU that supports vhaddps, and not be too bad in
-- 
GitLab


From 0255d9a9ca66d233ca4befcceff5d6e70b16d0e0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 06:30:32 -0800
Subject: [PATCH 753/966] Support for quantized LstmCell, with initial
 reference runtime code. The current 'optimized' code is just a copy of the
 reference code, a true optimized implementation will follow separately.

PiperOrigin-RevId: 184830223
---
 .../contrib/lite/kernels/internal/common.h    |  11 +
 .../internal/optimized/optimized_ops.h        | 160 ++++++++++++
 .../kernels/internal/quantization_util.cc     |  40 ++-
 .../lite/kernels/internal/quantization_util.h |  13 +-
 .../internal/quantization_util_test.cc        |   2 +-
 .../internal/reference/reference_ops.h        | 232 ++++++++++++++++++
 .../graph_transformations/hardcode_min_max.cc | 104 ++++++++
 .../toco/graph_transformations/quantize.cc    | 149 +++++++----
 tensorflow/contrib/lite/toco/model.h          |  25 ++
 tensorflow/contrib/lite/toco/tooling_util.cc  |  33 +--
 tensorflow/contrib/lite/toco/tooling_util.h   |   3 -
 tensorflow/workspace.bzl                      |   8 +-
 12 files changed, 679 insertions(+), 101 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/common.h b/tensorflow/contrib/lite/kernels/internal/common.h
index fdeacedace..18601df22c 100644
--- a/tensorflow/contrib/lite/kernels/internal/common.h
+++ b/tensorflow/contrib/lite/kernels/internal/common.h
@@ -102,6 +102,17 @@ inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
                                            quantized_multiplier);
 }
 
+inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
+                                           int shift) {
+  using gemmlowp::RoundingDivideByPOT;
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ? 0 : -shift;
+  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
+                                 x * (1 << left_shift), quantized_multiplier),
+                             right_shift);
+}
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index d5b0f45fd8..5a8df67812 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -2081,6 +2081,166 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
       output_state_map.tanh();
 }
 
+// Quantized LSTM cell. Currently just a copy of the reference impl in
+// reference_ops.h. See the big function comment there, not replicating it
+// here.
+template <int StateIntegerBits>
+void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
+              const uint8* prev_activ_data_uint8,
+              const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
+              const Dims<4>& weights_dims, const int32* bias_data_int32,
+              const Dims<4>& bias_dims, const int16* prev_state_data_int16,
+              const Dims<4>& prev_state_dims, int16* output_state_data_int16,
+              const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
+              const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
+              const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
+              const Dims<4>& activ_temp_dims, int32 weights_zero_point,
+              int32 accum_multiplier, int accum_shift) {
+  gemmlowp::ScopedProfilingLabel label(
+      "LstmCell/quantized (8bit external, 16bit internal)");
+  // Gather dimensions information, and perform consistency checks.
+  const int batches =
+      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
+                        output_state_dims, 3, output_activ_dims, 3);
+  const int height =
+      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
+                        output_state_dims, 2, output_activ_dims, 2);
+  const int width =
+      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
+                        output_state_dims, 1, output_activ_dims, 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
+  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
+                  1);
+  const int intern_activ_depth =
+      MatchingArraySize(weights_dims, 1, bias_dims, 0);
+  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth =
+      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
+                        output_state_dims, 0, output_activ_dims, 0);
+  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
+  const int fc_batches = ArraySize(activ_temp_dims, 1) *
+                         ArraySize(activ_temp_dims, 2) *
+                         ArraySize(activ_temp_dims, 3);
+  const int fc_output_depth =
+      MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
+  const int fc_accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_CHECK_EQ(fc_output_depth, 4 * output_depth);
+
+  // Depth-concatenate prev_activ and input data together.
+  uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
+                                              prev_activ_data_uint8};
+  Dims<4> const* concat_input_arrays_dims[2] = {&input_dims, &prev_activ_dims};
+  Concatenation<FusedActivationFunctionType::kNone, uint8>(
+      0, concat_input_arrays_data, concat_input_arrays_dims, 2,
+      concat_temp_data_uint8, concat_temp_dims);
+
+  // Implementation of the fully connected node inside the LSTM cell.
+  // The operands are 8-bit integers, the accumulators are internally 32bit
+  // integers, and the output is 16-bit fixed-point with 3 integer bits so
+  // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+  // is explained in the function comment above.
+  for (int b = 0; b < fc_batches; ++b) {
+    for (int out_c = 0; out_c < fc_output_depth; ++out_c) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum = bias_data_int32[out_c];
+      // Accumulation loop.
+      for (int d = 0; d < fc_accum_depth; ++d) {
+        int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
+        int16 weights_val =
+            weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
+        accum += input_val * weights_val;
+      }
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, using 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      // Note that the implicit assumption here, that this multiplier is smaller
+      // than one, is equivalent to the assumption that the fully-connected
+      // weights min-max is enclosed within [-4, 4] (it may be narrower).
+      // If that eventually fails, offline tools (e.g. toco) will fail early
+      // and that will be easy to support as needed. For now, assuming that
+      // this multiplier is less than one allows us to use a simpler, more
+      // accurate implementation.
+      accum =
+          MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
+      // Saturate, cast to int16, and store to the temporary activations array.
+      accum = std::max(-32768, std::min(32767, accum));
+      activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
+    }
+  }
+
+  // Rest of the LSTM cell: tanh and logistic math functions, and some adds
+  // and muls, all done in 16-bit fixed-point.
+  const int outer_size = batches * width * height;
+  for (int b = 0; b < outer_size; ++b) {
+    for (int c = 0; c < output_depth; ++c) {
+      // Define the fixed-point data types that we will use here. All use
+      // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
+      // They only differ by the number of integral vs. fractional bits,
+      // determining the range of values that they can represent.
+      //
+      // F0 uses 0 integer bits, range [-1, 1].
+      // This is the return type of math functions such as tanh, logistic,
+      // whose range is in [-1, 1].
+      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+      // F3 uses 3 integer bits, range [-8, 8].
+      // This is the range of the previous fully-connected node's output,
+      // which is our input here.
+      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+      // 2^StateIntegerBits]. It's used to represent the internal state, whose
+      // number of integer bits is currently dictated by the model. See comment
+      // on the StateIntegerBits template parameter above.
+      using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
+      // Implementation of input gate, using fixed-point logistic function.
+      F3 input_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
+      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+      // Implementation of input modulation gate, using fixed-point tanh
+      // function.
+      F3 input_modulation_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
+      F0 input_modulation_gate_output =
+          gemmlowp::tanh(input_modulation_gate_input);
+      // Implementation of forget gate, using fixed-point logistic function.
+      F3 forget_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
+      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+      // Implementation of output gate, using fixed-point logistic function.
+      F3 output_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
+      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+      // Implementation of internal multiplication nodes, still in fixed-point.
+      F0 input_times_input_modulation =
+          input_gate_output * input_modulation_gate_output;
+      FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
+      FS prev_state_times_forget_state = forget_gate_output * prev_state;
+      // Implementation of internal addition node, saturating.
+      FS new_state = gemmlowp::SaturatingAdd(
+          gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+          prev_state_times_forget_state);
+      // Implementation of last internal tanh node, still in fixed-point.
+      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state);
+      // Store the new internal state back to memory, as 16-bit integers.
+      output_state_data_int16[b * output_depth + c] = new_state.raw();
+      // Down-scale the output activations to 8-bit integers, saturating,
+      // and store back to memory.
+      int16 rescaled_output_activ =
+          gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
+      int16 clamped_output_activ =
+          std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
+      output_activ_data_uint8[b * output_depth + c] =
+          128 + clamped_output_activ;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
                      int outputs_count, Scalar* const* output_data,
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
index 98f2e365c5..18be6777a5 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
@@ -22,27 +22,20 @@ limitations under the License.
 
 namespace tflite {
 
-void QuantizeMultiplierSmallerThanOne(double double_multiplier,
-                                      int32_t* quantized_multiplier,
-                                      int* right_shift) {
-  TFLITE_CHECK(double_multiplier >= 0.);
-  TFLITE_CHECK(double_multiplier < 1.);
+void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
+                        int* shift) {
   if (double_multiplier == 0.) {
     *quantized_multiplier = 0;
-    *right_shift = 0;
+    *shift = 0;
     return;
   }
-  TFLITE_CHECK(double_multiplier > 0.);
-  const double q = std::frexp(double_multiplier, right_shift);
-  *right_shift *= -1;
-
+  const double q = std::frexp(double_multiplier, shift);
   auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
   TFLITE_CHECK(q_fixed <= (1ll << 31));
   if (q_fixed == (1ll << 31)) {
     q_fixed /= 2;
-    --*right_shift;
+    ++*shift;
   }
-  TFLITE_CHECK_GE(*right_shift, 0);
   TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
   *quantized_multiplier = static_cast<int32_t>(q_fixed);
 }
@@ -50,17 +43,20 @@ void QuantizeMultiplierSmallerThanOne(double double_multiplier,
 void QuantizeMultiplierGreaterThanOne(double double_multiplier,
                                       int32_t* quantized_multiplier,
                                       int* left_shift) {
-  TFLITE_CHECK(double_multiplier > 1.);
-  const double q = std::frexp(double_multiplier, left_shift);
-  auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
-  TFLITE_CHECK(q_fixed <= (1ll << 31));
-  if (q_fixed == (1ll << 31)) {
-    q_fixed /= 2;
-    ++*left_shift;
-  }
+  TFLITE_CHECK_GT(double_multiplier, 1.);
+  QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
   TFLITE_CHECK_GE(*left_shift, 0);
-  TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
-  *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+void QuantizeMultiplierSmallerThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int* right_shift) {
+  TFLITE_CHECK_LT(double_multiplier, 1.);
+  TFLITE_CHECK_GT(double_multiplier, 0.);
+  int shift;
+  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
+  TFLITE_CHECK_LE(shift, 0);
+  *right_shift = -shift;
 }
 
 void PreprocessSoftmaxScaling(double beta, double input_scale,
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
index efb7191c8d..ba06bc0975 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
@@ -20,7 +20,8 @@ limitations under the License.
 namespace tflite {
 
 // Decompose a double multiplier into a Q0.31 int32 representation of its
-// significand, and shift representation of its exponent.
+// significand, and shift representation of NEGATIVE its exponent ---
+// this is intended as a RIGHT-shift.
 //
 // Restricted to the case where the multiplier < 1 (and non-negative).
 void QuantizeMultiplierSmallerThanOne(double double_multiplier,
@@ -35,6 +36,16 @@ void QuantizeMultiplierGreaterThanOne(double double_multiplier,
                                       int32_t* quantized_multiplier,
                                       int* left_shift);
 
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of its exponent.
+//
+// Handles an arbitrary positive multiplier. The 'shift' output-value is
+// basically the 'floating-point exponent' of the multiplier:
+// Negative for a right-shift (when the multiplier is <1), positive for a
+// left-shift (when the multiplier is >1)
+void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
+                        int* shift);
+
 // This first creates a multiplier in a double equivalent of
 // Q(input_integer_bits).(31-input_integer_bits) representation, with extra
 // precision in the double's fractional bits.  It then splits the result into
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
index d6f306e2cb..19b1b408ec 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
@@ -31,7 +31,7 @@ TEST(QuantizationUtilTest, QuantizeMultiplierSmallerThanOne) {
   };
 
   EXPECT_DEATH(quantize(-0.1), "");
-  EXPECT_THAT(quantize(0.0), Pair(0, 0));
+  EXPECT_DEATH(quantize(0.0), "");
   EXPECT_THAT(quantize(0.25), Pair(1073741824, 1));
 
   // Around 0.5 we can see the change in exponent and how we try hard to
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 40e5c48a4c..c5f09af8fe 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1358,6 +1358,238 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Quantized LSTM cell implementation.
+// The quantization of the input, output arrays is as follows:
+//  - The input activations are quantized as uint8 on the interval
+//    [-1, 127/128].
+//    The rationale for that is that that is the natural interval for output
+//    activations (see next point) and these need to be concatenated together.
+//    We could accommodate different ranges by re-scaling, but we empirically
+//    found that setting the input activations range to be [-1, 127/128] in the
+//    first place, removing the need for re-scaling, greatly improves accuracy.
+//  - The output activations are quantized as uint8 on the interval
+//    [-1, 127/128].
+//    The rationale for that is that the definition of a LSTM cell makes them
+//    intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
+//    makes for simpler, more accurate fixed-point arithmetic.
+//  - The output-at-previous-timestep state array is obviously quantized as
+//    the output activations.
+//  - The internal LSTM memory (not the output-at-previous-timestep, the other
+//    internal state array) is int16-quantized and may use any power-of-two,
+//    symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
+//    StateIntegerBits below, see the below discussion of that template
+//    parameter ("The StateIntegerBits template parameter").
+//  - The output of the internal fully-connected node is int16-quantized
+//    on the interval [-8, 8 * 32767/32768], the rationale for which is
+//    explained just below ("Why [-8, 8] for fully-connected output?").
+//
+//
+// === The StateIntegerBits template parameter ===
+//
+// The StateIntegerBits template parameter controls the fixed-point format used
+// to represent the internal memory of the LSTM cell (not the
+// output-at-previous-timestep, the other internal state array). It's currently
+// a template parameter so that the model can control that. The most typical
+// value for StateIntegerBits is 4. Other plausible values are anywhere between
+// 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
+// and drop that template parameter. The reason why it can't be a runtime
+// parameter is that this controls the fixed-point format used, i.e. we need to
+// generate actually different code based on it. In particular, we generate code
+// for a fixed-point tanh() implementation for that format, which internally
+// uses a fixed-point exp() implementation, which internally uses a
+// barrel-shifter with a number of steps that depends on StateIntegerBits.
+// Another consequence of that is that a higher value of StateIntegerBits
+// results in a more expensive implementation (more barrel shifter steps
+// needed).
+//
+//
+// === Why [-8, 8] for fully-connected output? ===
+//
+// This array is only fed to Logistic and Tanh functions, for which
+// the quantized implementation will want to use fixed-point arithmetic,
+// requiring a power-of-two representation interval. Thus, we should right
+// away quantize this array to a power-of-two interval; otherwise,
+// implementation will need to rescale that, losing any benefit that a tighter
+// representation interval might otherwise yield, while introducting some
+// numerical error and computational overhead.
+//
+// Now, Logistic and Tanh
+// are nearly constant (nearly equal to their horizontal asymptotes)
+// outside of a small bounded interval around 0:
+//
+//   Logistic(4) = 1 - 1.8e-2     Tanh(4) = 1 - 6.7e-4
+//   Logistic(8) = 1 - 3.4e-4     Tanh(8) = 1 - 2.3e-7
+//   Logistic(16) = 1 - 1.1e-7    Tanh(16) = 1 - 2.5e-14
+//
+// From this, we see that clamping to [-4, 4] would be too inaccurate
+// (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
+// while clamping to [-16, 16] would make no difference even in float32.
+// However, for a fixed-point implementation in 16-bit integers, using 5
+// integer bits to represent the [-16, 16] range would leave only 11
+// fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
+// representable values. Notice that that is higher than the
+// worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
+// Using [-8, 8] thus seems like the better compromise overall, enjoying
+// an increment of 2.4e-4 between representable values and a worst-case
+// clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
+// [-16, 16].
+//
+// Moreover, all other things being equal, it is nice to choose the narrower
+// representation range, as that makes the implementation of fixed-point
+// math functions a little cheaper (each integer bit requires an additional
+// barrel-shifter atep in the implementation of exp(-x)). That is further
+// reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
+// sense for 32-bit float or 32-bit fixed-point quantization, but we are
+// aiming for 16-bit fixed-point quantization of these internal nodes here.
+//
+template <int StateIntegerBits>
+void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
+              const uint8* prev_activ_data_uint8,
+              const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
+              const Dims<4>& weights_dims, const int32* bias_data_int32,
+              const Dims<4>& bias_dims, const int16* prev_state_data_int16,
+              const Dims<4>& prev_state_dims, int16* output_state_data_int16,
+              const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
+              const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
+              const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
+              const Dims<4>& activ_temp_dims, int32 weights_zero_point,
+              int32 accum_multiplier, int accum_shift) {
+  // Gather dimensions information, and perform consistency checks.
+  const int batches =
+      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
+                        output_state_dims, 3, output_activ_dims, 3);
+  const int height =
+      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
+                        output_state_dims, 2, output_activ_dims, 2);
+  const int width =
+      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
+                        output_state_dims, 1, output_activ_dims, 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
+  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
+                  1);
+  const int intern_activ_depth =
+      MatchingArraySize(weights_dims, 1, bias_dims, 0);
+  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth =
+      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
+                        output_state_dims, 0, output_activ_dims, 0);
+  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
+  const int fc_batches = ArraySize(activ_temp_dims, 1) *
+                         ArraySize(activ_temp_dims, 2) *
+                         ArraySize(activ_temp_dims, 3);
+  const int fc_output_depth =
+      MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
+  const int fc_accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_CHECK_EQ(fc_output_depth, 4 * output_depth);
+
+  // Depth-concatenate prev_activ and input data together.
+  uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
+                                              prev_activ_data_uint8};
+  Dims<4> const* concat_input_arrays_dims[2] = {&input_dims, &prev_activ_dims};
+  Concatenation<FusedActivationFunctionType::kNone, uint8>(
+      0, concat_input_arrays_data, concat_input_arrays_dims, 2,
+      concat_temp_data_uint8, concat_temp_dims);
+
+  // Implementation of the fully connected node inside the LSTM cell.
+  // The operands are 8-bit integers, the accumulators are internally 32bit
+  // integers, and the output is 16-bit fixed-point with 3 integer bits so
+  // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+  // is explained in the function comment above.
+  for (int b = 0; b < fc_batches; ++b) {
+    for (int out_c = 0; out_c < fc_output_depth; ++out_c) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum = bias_data_int32[out_c];
+      // Accumulation loop.
+      for (int d = 0; d < fc_accum_depth; ++d) {
+        int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
+        int16 weights_val =
+            weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
+        accum += input_val * weights_val;
+      }
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, using 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      accum =
+          MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
+      // Saturate, cast to int16, and store to the temporary activations array.
+      accum = std::max(-32768, std::min(32767, accum));
+      activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
+    }
+  }
+
+  // Rest of the LSTM cell: tanh and logistic math functions, and some adds
+  // and muls, all done in 16-bit fixed-point.
+  const int outer_size = batches * width * height;
+  for (int b = 0; b < outer_size; ++b) {
+    for (int c = 0; c < output_depth; ++c) {
+      // Define the fixed-point data types that we will use here. All use
+      // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
+      // They only differ by the number of integral vs. fractional bits,
+      // determining the range of values that they can represent.
+      //
+      // F0 uses 0 integer bits, range [-1, 1].
+      // This is the return type of math functions such as tanh, logistic,
+      // whose range is in [-1, 1].
+      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+      // F3 uses 3 integer bits, range [-8, 8].
+      // This is the range of the previous fully-connected node's output,
+      // which is our input here.
+      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+      // 2^StateIntegerBits]. It's used to represent the internal state, whose
+      // number of integer bits is currently dictated by the model. See comment
+      // on the StateIntegerBits template parameter above.
+      using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
+      // Implementation of input gate, using fixed-point logistic function.
+      F3 input_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
+      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+      // Implementation of input modulation gate, using fixed-point tanh
+      // function.
+      F3 input_modulation_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
+      F0 input_modulation_gate_output =
+          gemmlowp::tanh(input_modulation_gate_input);
+      // Implementation of forget gate, using fixed-point logistic function.
+      F3 forget_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
+      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+      // Implementation of output gate, using fixed-point logistic function.
+      F3 output_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
+      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+      // Implementation of internal multiplication nodes, still in fixed-point.
+      F0 input_times_input_modulation =
+          input_gate_output * input_modulation_gate_output;
+      FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
+      FS prev_state_times_forget_state = forget_gate_output * prev_state;
+      // Implementation of internal addition node, saturating.
+      FS new_state = gemmlowp::SaturatingAdd(
+          gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+          prev_state_times_forget_state);
+      // Implementation of last internal tanh node, still in fixed-point.
+      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state);
+      // Store the new internal state back to memory, as 16-bit integers.
+      output_state_data_int16[b * output_depth + c] = new_state.raw();
+      // Down-scale the output activations to 8-bit integers, saturating,
+      // and store back to memory.
+      int16 rescaled_output_activ =
+          gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
+      int16 clamped_output_activ =
+          std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
+      output_activ_data_uint8[b * output_depth + c] =
+          128 + clamped_output_activ;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
                      int outputs_count, Scalar* const* output_data,
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index f1892136cf..1b0be85810 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -177,6 +177,106 @@ bool HardcodeMinMaxForOutput(Model* model, Operator* op, double min,
   output_minmax.max = max;
   return true;
 }
+
+// Propagates MinMax from any of the listed arrays, to all others.
+// If multiple of these arrays have MinMax, then these are required
+// to agree with each other.
+bool PropagateMinMaxAmongArrays(Model* model,
+                                const std::vector<string> array_names) {
+  string reference_array_name;
+  MinMax* reference_minmax = nullptr;
+  for (const string& array_name : array_names) {
+    if (model->GetArray(array_name).minmax) {
+      reference_array_name = array_name;
+      reference_minmax = model->GetArray(array_name).minmax.get();
+      break;
+    }
+  }
+  // No MinMax info is available to propagate.
+  if (!reference_minmax) {
+    return false;
+  }
+  bool changed = false;
+  for (const string& array_name : array_names) {
+    auto& array = model->GetArray(array_name);
+    if (array.minmax) {
+      CHECK(*array.minmax == *reference_minmax)
+          << "Both the following arrays have minmax, and they disagree: "
+          << reference_array_name << " and " << array_name
+          << ". Expected that either only one of them would have minmax, or at "
+             "least that they would agree.";
+    } else {
+      array.GetOrCreateMinMax() = *reference_minmax;
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+bool HardcodeMinMaxForLstmCell(Model* model, Operator* op) {
+  CHECK_EQ(op->inputs.size(), LstmCellOperator::NUM_INPUTS);
+  CHECK_EQ(op->outputs.size(), LstmCellOperator::NUM_OUTPUTS);
+
+  bool changed = false;
+  changed |= PropagateMinMaxAmongArrays(
+      model, {op->inputs[LstmCellOperator::PREV_STATE_INPUT],
+              op->outputs[LstmCellOperator::STATE_OUTPUT]});
+
+  auto& input_activations =
+      model->GetArray(op->inputs[LstmCellOperator::DATA_INPUT]);
+  if (!input_activations.minmax) {
+    auto& minmax = input_activations.GetOrCreateMinMax();
+    minmax.min = -1;
+    minmax.max = 127. / 128.;
+    changed = true;
+  }
+
+  auto& prev_output_activations =
+      model->GetArray(op->inputs[LstmCellOperator::PREV_ACTIV_INPUT]);
+  if (!prev_output_activations.minmax) {
+    auto& minmax = prev_output_activations.GetOrCreateMinMax();
+    minmax.min = -1;
+    minmax.max = 127. / 128.;
+    changed = true;
+  }
+
+  auto& output_concat_temp =
+      model->GetArray(op->outputs[LstmCellOperator::CONCAT_TEMP]);
+  if (!output_concat_temp.minmax) {
+    auto& minmax = output_concat_temp.GetOrCreateMinMax();
+    minmax.min = -1;
+    minmax.max = 127. / 128.;
+    changed = true;
+  }
+
+  auto& output_activations =
+      model->GetArray(op->outputs[LstmCellOperator::ACTIV_OUTPUT]);
+  if (!output_activations.minmax) {
+    auto& minmax = output_activations.GetOrCreateMinMax();
+    minmax.min = -1;
+    minmax.max = 127. / 128.;
+    changed = true;
+  }
+
+  // (This comment should morph into proper documentation for
+  // quantization of LSTM models. It isn't just a local implementation detail,
+  // the training code for LSTM models needs to be adjusted to that.)
+  //
+  // Finally, output_activations_temp holds the output of the fully-connected
+  // node inside the LSTM cell. For it, we hardcode a minmax of [-8, 8].
+  // The rationale for that is given in a lengthy comment on the LstmCell
+  // quantized runtime implementation in reference_ops.h.
+  auto& output_activations_temp =
+      model->GetArray(op->outputs[LstmCellOperator::ACTIV_TEMP]);
+  if (!output_activations_temp.minmax) {
+    auto& minmax = output_activations_temp.GetOrCreateMinMax();
+    minmax.min = -8;
+    minmax.max = 8 * 32767. / 32768.;
+    changed = true;
+  }
+
+  return changed;
+}
 }  // namespace
 
 bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
@@ -225,6 +325,10 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
       changed = HardcodeMinMaxForOutput(model, op, -127. / 128., 1.0);
       break;
 
+    case OperatorType::kLstmCell:
+      changed = HardcodeMinMaxForLstmCell(model, op);
+      break;
+
     default:
       break;
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 139c19022e..3a674f884d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -49,7 +49,7 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kTensorFlowReshape ||
          type == OperatorType::kTanh || type == OperatorType::kMul ||
          type == OperatorType::kSpaceToDepth ||
-         type == OperatorType::kDepthToSpace;
+         type == OperatorType::kDepthToSpace || type == OperatorType::kLstmCell;
 }
 
 template <ArrayDataType A>
@@ -104,6 +104,9 @@ void QuantizeArray(GraphTransformation* transformation, Model* model,
     case ArrayDataType::kUint8:
       return QuantizeArray<ArrayDataType::kUint8>(transformation, model, name,
                                                   quantization_params);
+    case ArrayDataType::kInt16:
+      return QuantizeArray<ArrayDataType::kInt16>(transformation, model, name,
+                                                  quantization_params);
     case ArrayDataType::kInt32:
       return QuantizeArray<ArrayDataType::kInt32>(transformation, model, name,
                                                   quantization_params);
@@ -172,36 +175,62 @@ bool ChooseQuantizationForOperatorInput(
   if (array.data_type != ArrayDataType::kFloat) {
     return false;
   }
+
+  // Quantization of bias vectors
+  bool is_bias_vector = false;
+  int activations_input_index;
+  int weights_input_index;
   if (op.type == OperatorType::kConv ||
       op.type == OperatorType::kDepthwiseConv ||
       op.type == OperatorType::kFullyConnected) {
     if (input_index == 2) {
-      // Quantization of bias vector.
-      // We need both of the mandatory inputs (input activations and weights) to
-      // have
-      // been already quantized.
-      const auto& input_activations = model->GetArray(op.inputs[0]);
-      const auto& input_weights = model->GetArray(op.inputs[1]);
-      if (!input_activations.quantization_params ||
-          !input_weights.quantization_params) {
-        return false;
-      }
-      const auto input_activations_scale =
-          input_activations.quantization_params->scale;
-      const auto input_weights_scale = input_weights.quantization_params->scale;
-      quantization_params->scale =
-          input_activations_scale * input_weights_scale;
-      quantization_params->zero_point = 0;
-      *quantized_data_type = ArrayDataType::kInt32;
-      transformation->AddMessageF(
-          "Input array %s is a bias vector. Choosing quantization params "
-          "accordingly.",
-          input);
-      return true;
+      is_bias_vector = true;
+      activations_input_index = 0;
+      weights_input_index = 1;
+    }
+  }
+  if (op.type == OperatorType::kLstmCell) {
+    if (input_index == LstmCellOperator::BIASES_INPUT) {
+      is_bias_vector = true;
+      activations_input_index = LstmCellOperator::DATA_INPUT;
+      weights_input_index = LstmCellOperator::WEIGHTS_INPUT;
+    }
+  }
+  if (is_bias_vector) {
+    // Quantization of bias vector.
+    // We need both of the mandatory inputs (input activations and weights) to
+    // have been already quantized.
+    const auto& input_activations =
+        model->GetArray(op.inputs[activations_input_index]);
+    const auto& input_weights = model->GetArray(op.inputs[weights_input_index]);
+    if (!input_activations.quantization_params ||
+        !input_weights.quantization_params) {
+      return false;
     }
+    const auto input_activations_scale =
+        input_activations.quantization_params->scale;
+    const auto input_weights_scale = input_weights.quantization_params->scale;
+    quantization_params->scale = input_activations_scale * input_weights_scale;
+    quantization_params->zero_point = 0;
+    *quantized_data_type = ArrayDataType::kInt32;
+    transformation->AddMessageF(
+        "Input array %s is a bias vector. Choosing quantization params "
+        "accordingly.",
+        input);
+    return true;
   }
 
   const MinMax& minmax = GetOrComputeMinMax(model, input);
+
+  if (op.type == OperatorType::kLstmCell) {
+    if (input_index == LstmCellOperator::PREV_STATE_INPUT) {
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
+          model->flags, minmax, quantization_params);
+      *quantized_data_type = ArrayDataType::kInt16;
+      return true;
+    }
+  }
+
   GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(model->flags, minmax,
                                                          quantization_params);
   transformation->AddMessageF(
@@ -310,6 +339,15 @@ bool ChooseQuantizationForOperatorOutput(
     return true;
   }
   const MinMax& minmax = GetOrComputeMinMax(model, output);
+  if (op.type == OperatorType::kLstmCell) {
+    if (output_index == LstmCellOperator::STATE_OUTPUT ||
+        output_index == LstmCellOperator::ACTIV_TEMP) {
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
+          model->flags, minmax, quantization_params);
+      *quantized_data_type = ArrayDataType::kInt16;
+      return true;
+    }
+  }
   GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(model->flags, minmax,
                                                          quantization_params);
   *quantized_data_type = ArrayDataType::kUint8;
@@ -405,41 +443,52 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
     if (ChooseQuantizationForOperatorInput(this, model, op, input_index,
                                            &quantized_data_type,
                                            &quantization_params)) {
-      changed = true;
       const auto& input = op.inputs[input_index];
       if (IsConstantParameterArray(*model, input)) {
         QuantizeArray(this, model, input, quantized_data_type,
                       quantization_params);
-      } else if (toco::IsRnnStateArray(*model, input)) {
-        // Simply Quantize the Array
-        auto& array = model->GetArray(op.inputs[input_index]);
-        array.GetOrCreateQuantizationParams() = quantization_params;
-        array.data_type = quantized_data_type;
+        changed = true;
       } else {
         auto dequantize_it = FindOpWithOutput(*model, input);
-        CHECK(dequantize_it != model->operators.end())
-            << "Cannot quantize input \"" << input
-            << "\" on operator with output \"" << op.outputs[0]
-            << "\". Nothing feeding input.";
-        auto* dequantize_op = dequantize_it->get();
-        CHECK(dequantize_op->type == OperatorType::kDequantize)
-            << "Cannot quantize input \"" << input
-            << "\" on operator with output \"" << op.outputs[0]
-            << "\". Input is not fed by a Dequantize operator.";
-        op.inputs[input_index] = dequantize_op->inputs[0];
-        // Check if the output of that Dequantize op was not used by any
-        // other operator. We will then erase that Dequantize op.
-        if (!CountOpsWithInput(*model, dequantize_op->outputs[0])) {
-          // If any of the model's output_arrays was pointing to the
-          // Dequantize op's output, let it point to the Dequantize op's
-          // input instead.
-          for (int i = 0; i < model->flags.output_arrays_size(); i++) {
-            if (model->flags.output_arrays(i) == dequantize_op->outputs[0]) {
-              model->flags.set_output_arrays(i, dequantize_op->inputs[0]);
+        if (dequantize_it != model->operators.end()) {
+          auto* dequantize_op = dequantize_it->get();
+          CHECK(dequantize_op->type == OperatorType::kDequantize);
+          op.inputs[input_index] = dequantize_op->inputs[0];
+          // Check if the output of that Dequantize op was not used by any
+          // other operator. We will then erase that Dequantize op.
+          if (!CountOpsWithInput(*model, dequantize_op->outputs[0])) {
+            // If any of the model's output_arrays was pointing to the
+            // Dequantize op's output, let it point to the Dequantize op's
+            // input instead.
+            for (int i = 0; i < model->flags.output_arrays_size(); i++) {
+              if (model->flags.output_arrays(i) == dequantize_op->outputs[0]) {
+                model->flags.set_output_arrays(i, dequantize_op->inputs[0]);
+              }
+            }
+            model->EraseArray(dequantize_op->outputs[0]);
+            model->operators.erase(dequantize_it);
+          }
+          changed = true;
+        } else {
+          // This input array is not produced by a Dequantize op.
+          // We have encountered this situation in RNN graphs, whose cyclic
+          // nature defeats the basic assumption underlying the quantization
+          // algorithm implemented here. For now, when we have seen this
+          // happening, the array in question was a RNN state array itself,
+          // so let us just implement this case here, and guard that assumption
+          // with a CHECK. A more general fix would involve revisiting the
+          // design of this whole Quantization transformation.
+          bool is_rnn_state_array = false;
+          for (const auto& rnn_state : model->flags.rnn_states()) {
+            if (rnn_state.state_array() == input) {
+              is_rnn_state_array = true;
+              break;
             }
           }
-          model->EraseArray(dequantize_op->outputs[0]);
-          model->operators.erase(dequantize_it);
+          CHECK(is_rnn_state_array);
+          QuantizeArray(this, model, input, quantized_data_type,
+                        quantization_params);
+          changed = true;
         }
       }
     }
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 17d5007450..b2a70eac3e 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -158,9 +158,14 @@ enum class ArrayDataType {
   kNone,
   kBool,
   kFloat,
+  kInt8,
   kUint8,
+  kInt16,
+  kUint16,
   kInt32,
+  kUint32,
   kInt64,
+  kUint64,
   kString
 };
 
@@ -180,18 +185,38 @@ struct DataTypeImpl<ArrayDataType::kFloat> {
   typedef float Type;
 };
 template <>
+struct DataTypeImpl<ArrayDataType::kInt8> {
+  typedef int8 Type;
+};
+template <>
 struct DataTypeImpl<ArrayDataType::kUint8> {
   typedef uint8 Type;
 };
 template <>
+struct DataTypeImpl<ArrayDataType::kInt16> {
+  typedef int16 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kUint16> {
+  typedef uint16 Type;
+};
+template <>
 struct DataTypeImpl<ArrayDataType::kInt32> {
   typedef int32 Type;
 };
 template <>
+struct DataTypeImpl<ArrayDataType::kUint32> {
+  typedef uint32 Type;
+};
+template <>
 struct DataTypeImpl<ArrayDataType::kInt64> {
   typedef int64 Type;
 };
 template <>
+struct DataTypeImpl<ArrayDataType::kUint64> {
+  typedef uint64 Type;
+};
+template <>
 struct DataTypeImpl<ArrayDataType::kString> {
   typedef string Type;
 };
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index e82a851935..bfab486b26 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1305,12 +1305,23 @@ int ElementSize(ArrayDataType data_type) {
   switch (data_type) {
     case ArrayDataType::kFloat:
       return 4;
-    case ArrayDataType::kInt32:
-      return 4;
+    case ArrayDataType::kInt8:
+      return 1;
     case ArrayDataType::kUint8:
       return 1;
+    case ArrayDataType::kInt16:
+      return 2;
+    case ArrayDataType::kUint16:
+      return 2;
+    case ArrayDataType::kInt32:
+      return 4;
+    case ArrayDataType::kUint32:
+      return 4;
     case ArrayDataType::kInt64:
       return 8;
+    case ArrayDataType::kUint64:
+      return 8;
+
     // Usually not critical limitation because strings are only input and/or
     // output.
     case ArrayDataType::kString:
@@ -1769,22 +1780,4 @@ void UseArraysExtraInfo(Model* model) {
   }
 }
 
-bool IsRnnSourceArray(const toco::Model& model, const string& array_name) {
-  for (const auto& rnn_state : model.flags.rnn_states()) {
-    if (array_name == rnn_state.back_edge_source_array()) {
-      return true;
-    }
-  }
-  return false;
-}
-
-bool IsRnnStateArray(const toco::Model& model, const string& array_name) {
-  for (const auto& rnn_state : model.flags.rnn_states()) {
-    if (array_name == rnn_state.state_array()) {
-      return true;
-    }
-  }
-  return false;
-}
-
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 3c952e9c61..7397fa3eda 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -290,9 +290,6 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
 
 void UseArraysExtraInfo(Model* model);
 
-bool IsRnnSourceArray(const toco::Model& model, const string& array_name);
-bool IsRnnStateArray(const toco::Model& model, const string& array_name);
-
 }  // namespace toco
 
 #endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 31b71c85ce..2c320cf68a 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -178,11 +178,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "gemmlowp",
       urls = [
-          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip",
-          "https://github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip",
+          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/d4d1e29a62192d8defdc057b913ef36ca582ac98.zip",
+          "https://github.com/google/gemmlowp/archive/d4d1e29a62192d8defdc057b913ef36ca582ac98.zip",
       ],
-      sha256 = "dd2557072bde12141419cb8320a9c25e6ec41a8ae53c2ac78c076a347bb46d9d",
-      strip_prefix = "gemmlowp-010bb3e71a26ca1d0884a167081d092b43563996",
+      sha256 = "e2bee7afd3c43028f23dd0d7f85ddd8b21aaf79c572b658e56164ef502b2b9c7",
+      strip_prefix = "gemmlowp-d4d1e29a62192d8defdc057b913ef36ca582ac98",
   )
 
   tf_http_archive(
-- 
GitLab


From 7edb75615655b112f31228f5746224cec08cc450 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 07:16:17 -0800
Subject: [PATCH 754/966] Utilities for type checking and multiple dispatch.

PiperOrigin-RevId: 184834434
---
 tensorflow/contrib/py2tf/utils/BUILD          | 22 ++++++++
 tensorflow/contrib/py2tf/utils/__init__.py    |  3 ++
 .../contrib/py2tf/utils/multiple_dispatch.py  | 54 +++++++++++++++++++
 .../py2tf/utils/multiple_dispatch_test.py     | 49 +++++++++++++++++
 tensorflow/contrib/py2tf/utils/type_check.py  | 33 ++++++++++++
 .../contrib/py2tf/utils/type_check_test.py    | 43 +++++++++++++++
 6 files changed, 204 insertions(+)
 create mode 100644 tensorflow/contrib/py2tf/utils/multiple_dispatch.py
 create mode 100644 tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
 create mode 100644 tensorflow/contrib/py2tf/utils/type_check.py
 create mode 100644 tensorflow/contrib/py2tf/utils/type_check_test.py

diff --git a/tensorflow/contrib/py2tf/utils/BUILD b/tensorflow/contrib/py2tf/utils/BUILD
index 502720047e..4b7a4b16c7 100644
--- a/tensorflow/contrib/py2tf/utils/BUILD
+++ b/tensorflow/contrib/py2tf/utils/BUILD
@@ -22,6 +22,8 @@ py_library(
         "__init__.py",
         "context_managers.py",
         "misc.py",
+        "multiple_dispatch.py",
+        "type_check.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
@@ -48,3 +50,23 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
+
+py_test(
+    name = "multiple_dispatch_test",
+    srcs = ["multiple_dispatch_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "type_check_test",
+    srcs = ["type_check_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/utils/__init__.py b/tensorflow/contrib/py2tf/utils/__init__.py
index 2ab0d7b9fd..1cbb0e0029 100644
--- a/tensorflow/contrib/py2tf/utils/__init__.py
+++ b/tensorflow/contrib/py2tf/utils/__init__.py
@@ -20,3 +20,6 @@ from __future__ import print_function
 
 from tensorflow.contrib.py2tf.utils.context_managers import control_dependency_on_returns
 from tensorflow.contrib.py2tf.utils.misc import alias_tensors
+from tensorflow.contrib.py2tf.utils.multiple_dispatch import run_cond
+from tensorflow.contrib.py2tf.utils.multiple_dispatch import run_while
+from tensorflow.contrib.py2tf.utils.type_check import is_tensor
diff --git a/tensorflow/contrib/py2tf/utils/multiple_dispatch.py b/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
new file mode 100644
index 0000000000..7ea3a1b0a8
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
@@ -0,0 +1,54 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for type-dependent behavior used in py2tf-generated code."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.utils.type_check import is_tensor
+from tensorflow.python.ops import control_flow_ops
+
+
+def run_cond(condition, true_fn, false_fn):
+  if is_tensor(condition):
+    return control_flow_ops.cond(condition, true_fn, false_fn)
+  else:
+    return py_cond(condition, true_fn, false_fn)
+
+
+def py_cond(condition, true_fn, false_fn):
+  if condition:
+    return true_fn()
+  else:
+    return false_fn()
+
+
+def run_while(cond_fn, body_fn, init_args):
+  if not isinstance(init_args, (tuple, list)) or not init_args:
+    raise ValueError(
+        'init_args must be a non-empty list or tuple, found %s' % init_args)
+
+  if is_tensor(init_args):
+    return control_flow_ops.while_loop(cond_fn, body_fn, init_args)
+  else:
+    return py_while_loop(cond_fn, body_fn, init_args)
+
+
+def py_while_loop(cond_fn, body_fn, init_args):
+  state = init_args
+  while cond_fn(state):
+    state = body_fn(state)
+  return state
diff --git a/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py b/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
new file mode 100644
index 0000000000..7cc9defd14
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
@@ -0,0 +1,49 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for multiple_dispatch."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from tensorflow.contrib.py2tf.utils import multiple_dispatch
+from tensorflow.python.client.session import Session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework.constant_op import constant
+from tensorflow.python.ops.control_flow_ops import cond
+from tensorflow.python.platform import test
+
+
+class MultipleDispatchTest(test.TestCase):
+
+  def test_run_cond_python(self):
+    true_fn = lambda: 2.0
+    false_fn = lambda: 3.0
+    self.assertEqual(multiple_dispatch.run_cond(True, true_fn, false_fn), 2.0)
+    self.assertEqual(multiple_dispatch.run_cond(False, true_fn, false_fn), 3.0)
+
+  def test_run_cond_tf(self):
+
+    true_fn = lambda: constant([2.0])
+    false_fn = lambda: constant([3.0])
+    with Session() as sess:
+
+      out = cond(constant(True), true_fn, false_fn)
+      self.assertEqual(sess.run(out), 2.0)
+      out = cond(constant(False, dtype=dtypes.bool), true_fn, false_fn)
+      self.assertEqual(sess.run(out), 3.0)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/utils/type_check.py b/tensorflow/contrib/py2tf/utils/type_check.py
new file mode 100644
index 0000000000..9ca2dec872
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/type_check.py
@@ -0,0 +1,33 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities used in py2tf-generated code."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import tensor_util
+
+
+def is_tensor(*args):
+  """Check if all arguments are tensors.
+
+  Args:
+    *args: Python objects that may or may not be tensors.
+
+  Returns:
+    True if all *args are TensorFlow types, False if one or more are not.
+  """
+  return any([tensor_util.is_tensor(a) for a in args])
diff --git a/tensorflow/contrib/py2tf/utils/type_check_test.py b/tensorflow/contrib/py2tf/utils/type_check_test.py
new file mode 100644
index 0000000000..7d0428e9cc
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/type_check_test.py
@@ -0,0 +1,43 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for type_check."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib.py2tf.utils import type_check
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class TypeCheckTest(test.TestCase):
+
+  def test_checks(self):
+    self.assertTrue(type_check.is_tensor(constant_op.constant([1, 2, 3])))
+    self.assertTrue(
+        type_check.is_tensor(test_util.variables.Variable([1, 2, 3])))
+    self.assertTrue(
+        type_check.is_tensor(
+            test_util.array_ops.placeholder(test_util.dtypes.float32)))
+    self.assertFalse(type_check.is_tensor(3))
+    self.assertFalse(type_check.is_tensor(numpy.eye(3)))
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From cfa374cefe132be886c26a374c51454177c68868 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Wed, 7 Feb 2018 08:12:05 -0800
Subject: [PATCH 755/966] Fix the segfault in convert_nodes.cc

---
 .../contrib/tensorrt/convert/convert_nodes.cc | 114 +++++++++++-------
 1 file changed, 70 insertions(+), 44 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 779d15a4b1..5b8cf903e8 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -119,26 +119,29 @@ static std::vector<std::pair<int, int>> CreateSamePadding(
 class TRT_ShapedWeights {
  public:
   TRT_ShapedWeights(tensorflow::DataType type, const void* values,
-                    nvinfer1::Dims shape, bool owned_values = false)
+                    nvinfer1::Dims shape,
+                    const std::vector<char>* owned_values = nullptr)
       : shape_(shape),
         type_(type),
         values_(values),
-        owned_values_(owned_values),
+        owned_values_(owned_values ? *owned_values : std::vector<char>({})),
         dummy_flag_(false) {
     // Note: this->shape.type[] is not used
   }
 
   explicit TRT_ShapedWeights(tensorflow::DataType type)
-      : type_(type),
+      : shape_(),
+        type_(type),
         values_(nullptr),
-        owned_values_(false),
+        owned_values_(),
         dummy_flag_(true) {}
 
-  ~TRT_ShapedWeights() {
-    if (values_ && owned_values_) delete static_cast<const char*>(values_);
-  }
-
-  TRT_ShapedWeights(const TRT_ShapedWeights&) = default;
+  TRT_ShapedWeights(const TRT_ShapedWeights& rhs)
+      : shape_(rhs.shape_),
+        type_(rhs.type_),
+        values_(rhs.values_),
+        owned_values_(rhs.owned_values_),
+        dummy_flag_(rhs.dummy_flag_) {}
 
   int64_t count() const {
     int64_t c = 1;
@@ -152,7 +155,18 @@ class TRT_ShapedWeights {
     if (dummy_flag_) return nvinfer1::Weights{trt_type, nullptr, 0};
 
     // Note: this->shape.type[] is not used
-    return nvinfer1::Weights{trt_type, values_, GetShapeSize(shape_)};
+    return nvinfer1::Weights{trt_type, GetValues(), GetShapeSize(shape_)};
+  }
+
+  const void* GetValues() const {
+    if (values_) return values_;
+    if (owned_values_.size()) return owned_values_.data();
+    return nullptr;
+  }
+
+  void SetValues(const void* values) {
+    values_ = values;
+    owned_values_.clear();
   }
 
   size_t size_bytes() const {
@@ -165,51 +179,55 @@ class TRT_ShapedWeights {
 
   nvinfer1::Dims shape_;
   tensorflow::DataType type_;
+
+ private:
   const void* values_;
-  bool owned_values_;
+  std::vector<char> owned_values_;
   bool dummy_flag_;
 };
 
 class TRT_TensorOrWeights {
  public:
   explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor)
-      : _tensor_(tensor), _variant_(TRT_NODE_TENSOR) {}
-  TRT_TensorOrWeights(const TRT_ShapedWeights& weights)
-      : _weights_(weights), _variant_(TRT_NODE_WEIGHTS) {}
+      : _tensor_(tensor), _weights_(DT_FLOAT), _variant_(TRT_NODE_TENSOR) {}
+  explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights)
+      : _tensor_(nullptr), _weights_(weights), _variant_(TRT_NODE_WEIGHTS) {}
+  TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs)
+      : _tensor_(rhs._tensor_),
+        _weights_(rhs._weights_),
+        _variant_(rhs._variant_) {}
   ~TRT_TensorOrWeights() {}
 
   bool is_tensor() const { return _variant_ == TRT_NODE_TENSOR; }
   bool is_weights() const { return _variant_ == TRT_NODE_WEIGHTS; }
 
   nvinfer1::ITensor* tensor() {
-    CHECK_EQ(this->is_tensor(), true);
+    CHECK_EQ(is_tensor(), true);
     return _tensor_;
   }
-  nvinfer1::ITensor const* tensor() const {
-    CHECK_EQ(this->is_tensor(), true);
+  const nvinfer1::ITensor* tensor() const {
+    CHECK_EQ(is_tensor(), true);
     return _tensor_;
   }
   TRT_ShapedWeights& weights() {
-    CHECK_EQ(this->is_weights(), true);
+    CHECK_EQ(is_weights(), true);
     return _weights_;
   }
   const TRT_ShapedWeights& weights() const {
-    CHECK_EQ(this->is_weights(), true);
+    CHECK_EQ(is_weights(), true);
     return _weights_;
   }
   nvinfer1::Dims shape() const {
-    if (this->is_tensor()) {
-      return this->tensor()->getDimensions();
+    if (is_tensor()) {
+      return tensor()->getDimensions();
     } else {
-      return this->weights().shape_;
+      return weights().shape_;
     }
   }
 
  private:
-  union {
-    nvinfer1::ITensor* _tensor_;
-    TRT_ShapedWeights _weights_;
-  };
+  nvinfer1::ITensor* _tensor_;
+  TRT_ShapedWeights _weights_;
   enum { TRT_NODE_TENSOR, TRT_NODE_WEIGHTS } _variant_;
 };
 
@@ -307,7 +325,7 @@ tensorflow::DataType TFAttrs::get<tensorflow::DataType>(string key) const {
 }
 
 template <typename T>
-void Reorder4(nvinfer1::DimsNCHW shape, T const* idata,
+void Reorder4(nvinfer1::DimsNCHW shape, const T* idata,
               nvinfer1::DimsNCHW istrides, T* odata,
               nvinfer1::DimsNCHW ostrides) {
   for (int n = 0; n < shape.n(); ++n) {
@@ -339,9 +357,10 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
   nvinfer1::DimsNCHW ostrides = {c * r * s, r * s, s, 1};
   switch (iweights.type_) {
     case tensorflow::DataType::DT_FLOAT:
-      Reorder4(
-          {k, c, r, s}, static_cast<float const*>(iweights.values_), istrides,
-          static_cast<float*>(const_cast<void*>(oweights->values_)), ostrides);
+      Reorder4({k, c, r, s}, static_cast<float const*>(iweights.GetValues()),
+               istrides,
+               static_cast<float*>(const_cast<void*>(oweights->GetValues())),
+               ostrides);
       break;
     default:
       LOG(FATAL) << "!!!!!!!!!!!!!!!!!!!!!!!!broke!!!!!!!!!!!!";
@@ -399,7 +418,7 @@ class Converter {
     TRT_ShapedWeights weights(type, nullptr, shape);
     // TODO(jie): check weights size_bytes. 0 means type error
     _temp_bufs.push_back(std::vector<uint8_t>(weights.size_bytes()));
-    weights.values_ = _temp_bufs.back().data();
+    weights.SetValues(_temp_bufs.back().data());
     return weights;
   }
 
@@ -579,8 +598,8 @@ tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
   CHECK_EQ(iweights.type_, oweights->type_);
   switch (iweights.type_) {
     case tensorflow::DataType::DT_FLOAT: {
-      auto inp = static_cast<float const*>(iweights.values_);
-      auto oup = static_cast<float*>(const_cast<void*>(oweights->values_));
+      auto inp = static_cast<float const*>(iweights.GetValues());
+      auto oup = static_cast<float*>(const_cast<void*>(oweights->GetValues()));
       std::transform(inp, inp + iweights.count(), oup, unary_op.unary<float>());
       break;
     }
@@ -603,9 +622,9 @@ tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l,
 
   switch (iweights_l.type_) {
     case tensorflow::DataType::DT_FLOAT: {
-      auto inp_l = static_cast<float const*>(iweights_l.values_);
-      auto inp_r = static_cast<float const*>(iweights_r.values_);
-      auto oup = static_cast<float*>(const_cast<void*>(oweights->values_));
+      auto inp_l = static_cast<const float*>(iweights_l.GetValues());
+      auto inp_r = static_cast<const float*>(iweights_r.GetValues());
+      auto oup = static_cast<float*>(const_cast<void*>(oweights->GetValues()));
 
       if (iweights_l.count() != iweights_r.count()) {
         // We only supports broadcast of RankZero
@@ -1117,7 +1136,7 @@ tensorflow::Status ConvertConst(Converter& ctx,
 
   // Get trt type & shape
   TFAttrs attrs(node_def);
-  tensorflow::DataType dtype = attrs.get<tensorflow::DataType>("dtype");
+  const tensorflow::DataType dtype = attrs.get<tensorflow::DataType>("dtype");
 
   // Create shaped weights as output
   tensorflow::Tensor tensor;
@@ -1148,11 +1167,18 @@ tensorflow::Status ConvertConst(Converter& ctx,
   } else if (!weights_tensor.tensor_content().empty()) {
     VLOG(2) << "TENSOR!!!" << node_def.name();
     const auto& content = weights_tensor.tensor_content();
-    char* buf = new char[content.size() + 1];
-    buf[content.size()] = 0;
-    port::CopyToArray(content, buf);
-    weights = TRT_ShapedWeights(dtype, buf, GetTensorShape(tensor),
-                                /*owned_values=*/true);
+
+    std::vector<char> values;
+    if (content.size() > 0) {
+      const int dtype_size = tensorflow::DataTypeSize(dtype);
+      CHECK_EQ(0, content.size() % dtype_size)
+          << "Tensor content size (" << content.size()
+          << ") is not a multiple of " << dtype_size;
+      values.resize(content.size());
+      port::CopyToArray(content, values.data());
+    }
+    weights =
+        TRT_ShapedWeights(dtype, nullptr, GetTensorShape(tensor), &values);
   } else {
     return tensorflow::errors::Unimplemented(
         "Not supported constant type, at " + node_def.name());
@@ -1242,7 +1268,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
   if (index_type != tensorflow::DataType::DT_INT32)
     return tensorflow::errors::Unimplemented("Tidx supports only DT_INT32");
   auto index_list_data =
-      static_cast<int*>(const_cast<void*>(index_list.values_));
+      static_cast<int*>(const_cast<void*>(index_list.GetValues()));
 
   // Hack warning: have to fall back to pool layer since reduce is not in public
   // TRT yet.
@@ -1340,7 +1366,7 @@ tensorflow::Status ConvertPad(Converter& ctx,
   if (padding_type != tensorflow::DataType::DT_INT32)
     return tensorflow::errors::Unimplemented(
         "Tpaddings supports only DT_INT32");
-  auto pad_data = static_cast<int*>(const_cast<void*>(pads.values_));
+  auto pad_data = static_cast<int*>(const_cast<void*>(pads.GetValues()));
 
   std::vector<int32_t> pad_index;
   for (int i = 0; i < nb_dims; i++) {
-- 
GitLab


From a7c225c89e4169492f0eef57c913463b976dd44e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 7 Feb 2018 16:24:34 +0000
Subject: [PATCH 756/966] Remove warnings in tf.losses.softmax_cross_entropy

This fix tries to address the issue raised in 16534 where
tf.losses.softmax_cross_entropy causes warnings due
to the calling of tf.nn.softmax_cross_entropy_with_logits.

This fix switches to tf.nn.softmax_cross_entropy_with_logits_v2
to remove the warning.

This fix fixes 16534.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/losses/losses_impl.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 8b3c61b933..0907ea69eb 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -726,9 +726,12 @@ def softmax_cross_entropy(
       smooth_negatives = label_smoothing / num_classes
       onehot_labels = onehot_labels * smooth_positives + smooth_negatives
 
-    losses = nn.softmax_cross_entropy_with_logits(labels=onehot_labels,
-                                                  logits=logits,
-                                                  name="xentropy")
+    onehot_labels = array_ops.stop_gradient(
+        onehot_labels, name="labels_stop_gradient")
+    losses = nn.softmax_cross_entropy_with_logits_v2(
+        labels=onehot_labels, logits=logits, name="xentropy")
+
+
     return compute_weighted_loss(
         losses, weights, scope, loss_collection, reduction=reduction)
 
-- 
GitLab


From 840c30b0f38bcf0d94fe86e50a619465f935adda Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 08:48:05 -0800
Subject: [PATCH 757/966] Refactor score definition in GMM operations. This is
 simplified to be the per-sample likelihood of the data.

PiperOrigin-RevId: 184843634
---
 .../contrib/factorization/python/ops/gmm.py   | 24 +++---
 .../factorization/python/ops/gmm_ops.py       | 79 ++++++-------------
 .../factorization/python/ops/gmm_ops_test.py  |  8 +-
 .../factorization/python/ops/gmm_test.py      | 37 +--------
 4 files changed, 44 insertions(+), 104 deletions(-)

diff --git a/tensorflow/contrib/factorization/python/ops/gmm.py b/tensorflow/contrib/factorization/python/ops/gmm.py
index f72280c4ec..b2dfe48b2d 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm.py
@@ -24,17 +24,16 @@ import numpy as np
 from tensorflow.contrib import framework
 from tensorflow.contrib.factorization.python.ops import gmm_ops
 from tensorflow.contrib.framework.python.framework import checkpoint_utils
-from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import logging_ops as logging
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops.control_flow_ops import with_dependencies
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
 
 
 def _streaming_sum(scalar_tensor):
@@ -70,8 +69,8 @@ class _InitializeClustersHook(session_run_hook.SessionRunHook):
 class GMM(estimator.Estimator):
   """An estimator for GMM clustering."""
   SCORES = 'scores'
+  LOG_LIKELIHOOD = 'loss'
   ASSIGNMENTS = 'assignments'
-  ALL_SCORES = 'all_scores'
 
   def __init__(self,
                num_clusters,
@@ -113,10 +112,7 @@ class GMM(estimator.Estimator):
       yield result[GMM.ASSIGNMENTS]
 
   def score(self, input_fn=None, batch_size=None, steps=None):
-    """Predict total sum of distances to nearest clusters.
-
-    Note that this function is different from the corresponding one in sklearn
-    which returns the negative of the sum of distances.
+    """Predict total log-likelihood.
 
     Args:
       input_fn: see predict.
@@ -124,11 +120,11 @@ class GMM(estimator.Estimator):
       steps: see predict.
 
     Returns:
-      Total sum of distances to nearest clusters.
+      Total log-likelihood.
     """
     results = self.evaluate(input_fn=input_fn, batch_size=batch_size,
                             steps=steps)
-    return np.sum(results[GMM.SCORES])
+    return np.log(np.sum(np.exp(results[GMM.SCORES])))
 
   def weights(self):
     """Returns the cluster weights."""
@@ -158,9 +154,10 @@ class GMM(estimator.Estimator):
     def _model_fn(features, labels, mode, config):
       """Model function."""
       assert labels is None, labels
-      (all_scores,
+      (loss,
+       scores,
        model_predictions,
-       losses, training_op,
+       training_op,
        init_op,
        is_initialized) = gmm_ops.gmm(self._parse_tensor_or_dict(features),
                                      self._training_initial_clusters,
@@ -168,16 +165,15 @@ class GMM(estimator.Estimator):
                                      self._covariance_type,
                                      self._params)
       incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
-      loss = math_ops.reduce_sum(losses)
       training_op = with_dependencies([training_op, incr_step], loss)
       training_hooks = [_InitializeClustersHook(
           init_op, is_initialized, config.is_chief)]
       predictions = {
-          GMM.ALL_SCORES: all_scores[0],
           GMM.ASSIGNMENTS: model_predictions[0][0],
       }
       eval_metric_ops = {
-          GMM.SCORES: _streaming_sum(loss),
+          GMM.SCORES: scores,
+          GMM.LOG_LIKELIHOOD: _streaming_sum(loss),
       }
       return model_fn_lib.ModelFnOps(mode=mode, predictions=predictions,
                                      eval_metric_ops=eval_metric_ops,
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index a61681c7f5..98d6434f47 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -21,7 +21,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -36,7 +35,6 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.embedding_ops import embedding_lookup
-from tensorflow.python.summary import summary
 
 # Machine epsilon.
 MEPS = np.finfo(float).eps
@@ -253,14 +251,16 @@ class GmmAlgorithm(object):
     return ret
 
   def scores(self):
-    """Returns the distances to each class.
+    """Returns the per-sample likelihood fo the data.
 
     Returns:
-      A tuple with two Tensors. The first contains the distance to
-    each class. The second contains the distance to the assigned
-    class.
+      Log probabilities of each data point.
     """
-    return (self._all_scores, self._scores)
+    return self._scores
+
+  def log_likelihood_op(self):
+    """Returns the log-likelihood operation."""
+    return self._log_likelihood_op
 
   def _define_graph(self, data):
     """Define graph for a single iteration.
@@ -276,7 +276,8 @@ class GmmAlgorithm(object):
       self._define_expectation_operation(shard_id)
       self._define_partial_maximization_operation(shard_id, shard)
     self._define_maximization_operation(len(data))
-    self._define_distance_to_clusters(data)
+    self._define_loglikelihood_operation()
+    self._define_score_samples()
 
   def _define_full_covariance_probs(self, shard_id, shard):
     """Defines the full covariance probabilties per example in a class.
@@ -440,50 +441,20 @@ class GmmAlgorithm(object):
                 state_ops.assign(
                     self._covs, new_covs, validate_shape=False))
 
-  def _define_distance_to_clusters(self, data):
-    """Defines the Mahalanobis distance to the assigned Gaussian."""
-    # TODO(xavigonzalvo): reuse (input - mean) * cov^-1 * (input -
-    # mean) from log probability function.
-    self._all_scores = []
-    for shard in data:
-      all_scores = []
-      shard = array_ops.expand_dims(shard, 0)
-      for c in xrange(self._num_classes):
-        if self._covariance_type == FULL_COVARIANCE:
-          cov = self._covs[c, :, :]
-        elif self._covariance_type == DIAG_COVARIANCE:
-          cov = array_ops.diag(self._covs[c, :])
-        inverse = linalg_ops.matrix_inverse(cov + self._min_var)
-        inv_cov = array_ops.tile(
-            array_ops.expand_dims(inverse, 0),
-            array_ops.stack([self._num_examples, 1, 1]))
-        diff = array_ops.transpose(shard - self._means[c, :, :], perm=[1, 0, 2])
-        m_left = math_ops.matmul(diff, inv_cov)
-        all_scores.append(
-            math_ops.sqrt(
-                math_ops.matmul(
-                    m_left, array_ops.transpose(
-                        diff, perm=[0, 2, 1]))))
-      self._all_scores.append(
-          array_ops.reshape(
-              array_ops.concat(all_scores, 1),
-              array_ops.stack([self._num_examples, self._num_classes])))
-
-    # Distance to the associated class.
-    self._all_scores = array_ops.concat(self._all_scores, 0)
-    assignments = array_ops.concat(self.assignments(), 0)
-    rows = math_ops.to_int64(math_ops.range(0, self._num_examples))
-    indices = array_ops.concat(
-        [array_ops.expand_dims(rows, 1), array_ops.expand_dims(assignments, 1)],
-        1)
-    self._scores = array_ops.gather_nd(self._all_scores, indices)
-
   def _define_loglikelihood_operation(self):
     """Defines the total log-likelihood of current iteration."""
-    self._ll_op = []
+    op = []
     for prior_probs in self._prior_probs:
-      self._ll_op.append(math_ops.reduce_sum(math_ops.log(prior_probs)))
-    summary.scalar('ll', math_ops.reduce_sum(self._ll_op))
+      op.append(math_ops.reduce_logsumexp(prior_probs))
+    self._log_likelihood_op = math_ops.reduce_logsumexp(op)
+
+  def _define_score_samples(self):
+    """Defines the likelihood of each data sample."""
+    op = []
+    for shard_id, prior_probs in enumerate(self._prior_probs):
+      op.append(prior_probs + math_ops.log(self._w[shard_id]))
+    self._scores = array_ops.squeeze(
+        math_ops.reduce_logsumexp(op, axis=2, keep_dims=True), axis=0)
 
 
 def gmm(inp,
@@ -511,14 +482,9 @@ def gmm(inp,
   Returns:
     Note: tuple of lists returned to be consistent with skflow
     A tuple consisting of:
-    all_scores: A matrix (or list of matrices) of dimensions (num_input,
-      num_clusters) where the value is the distance of an input vector and a
-      cluster center.
     assignments: A vector (or list of vectors). Each element in the vector
       corresponds to an input row in 'inp' and specifies the cluster id
       corresponding to the input.
-    scores: Similar to assignments but specifies the distance to the
-      assigned cluster instead.
     training_op: an op that runs an iteration of training.
     init_op: an op that runs the initialization.
   """
@@ -532,6 +498,7 @@ def gmm(inp,
   gmm_tool = GmmAlgorithm(inp, num_clusters, initial_means, params,
                           covariance_type, random_seed)
   assignments = gmm_tool.assignments()
-  all_scores, scores = gmm_tool.scores()
-  return ([all_scores], [assignments], [scores], gmm_tool.training_ops(),
+  scores = gmm_tool.scores()
+  loss = gmm_tool.log_likelihood_op()
+  return (loss, scores, [assignments], gmm_tool.training_ops(),
           gmm_tool.init_ops(), gmm_tool.is_initialized())
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py b/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py
index c50e82db8a..888c3c238c 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py
@@ -122,17 +122,23 @@ class GmmOpsTest(test.TestCase):
       g.seed = 5
       with self.test_session() as sess:
         data = constant_op.constant(self.data, dtype=dtypes.float32)
-        _, assignments, _, training_op, init_op, _ = gmm_ops.gmm(
+        loss_op, scores, assignments, training_op, init_op, _ = gmm_ops.gmm(
             data, 'random', num_classes, random_seed=self.seed)
 
         variables.global_variables_initializer().run()
         sess.run(init_op)
+        first_loss = sess.run(loss_op)
         for _ in xrange(self.iterations):
           sess.run(training_op)
         assignments = sess.run(assignments)
+        end_loss = sess.run(loss_op)
+        scores = sess.run(scores)
+        self.assertEqual((self.num_examples, 1), scores.shape)
         accuracy = np.mean(
             np.asarray(self.true_assignments) == np.squeeze(assignments))
         logging.info('Accuracy: %f', accuracy)
+        logging.info('First loss: %f, end loss: %f', first_loss, end_loss)
+        self.assertGreater(end_loss, first_loss)
         self.assertGreater(accuracy, 0.98)
 
   def testParams(self):
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_test.py b/tensorflow/contrib/factorization/python/ops/gmm_test.py
index 7717b47dae..00a4734eb6 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_test.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.factorization.python.ops import gmm as gmm_lib
 from tensorflow.contrib.learn.python.learn.estimators import kmeans
@@ -30,12 +29,9 @@ from tensorflow.python.framework import random_seed as random_seed_lib
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.platform import flags
 from tensorflow.python.platform import test
 from tensorflow.python.training import queue_runner
 
-FLAGS = flags.FLAGS
-
 
 class GMMTest(test.TestCase):
 
@@ -64,9 +60,8 @@ class GMMTest(test.TestCase):
     self.batch_size = self.num_points
     self.true_centers = self.make_random_centers(self.num_centers,
                                                  self.num_dims)
-    self.points, self.assignments, self.scores = self.make_random_points(
+    self.points, self.assignments = self.make_random_points(
         self.true_centers, self.num_points)
-    self.true_score = np.add.reduce(self.scores)
 
     # Use initial means from kmeans (just like scikit-learn does).
     clusterer = kmeans.KMeansClustering(num_clusters=self.num_centers)
@@ -86,24 +81,7 @@ class GMMTest(test.TestCase):
     offsets = np.round(
         np.random.randn(num_points, num_dims).astype(np.float32) * 20)
     points = centers[assignments] + offsets
-    means = [
-        np.mean(
-            points[assignments == center], axis=0)
-        for center in xrange(num_centers)
-    ]
-    covs = [
-        np.cov(points[assignments == center].T)
-        for center in xrange(num_centers)
-    ]
-    scores = []
-    for r in xrange(num_points):
-      scores.append(
-          np.sqrt(
-              np.dot(
-                  np.dot(points[r, :] - means[assignments[r]],
-                         np.linalg.inv(covs[assignments[r]])), points[r, :] -
-                  means[assignments[r]])))
-    return (points, assignments, scores)
+    return (points, assignments)
 
   def test_weights(self):
     """Tests the shape of the weights."""
@@ -136,8 +114,7 @@ class GMMTest(test.TestCase):
     gmm.fit(input_fn=self.input_fn(), steps=10)
     score2 = gmm.score(input_fn=self.input_fn(batch_size=self.num_points),
                        steps=1)
-    self.assertGreater(score1, score2)
-    self.assertNear(self.true_score, score2, self.true_score * 0.15)
+    self.assertLess(score1, score2)
 
   def test_infer(self):
     gmm = gmm_lib.GMM(self.num_centers,
@@ -149,8 +126,7 @@ class GMMTest(test.TestCase):
 
     # Make a small test set
     num_points = 40
-    points, true_assignments, true_offsets = (
-        self.make_random_points(clusters, num_points))
+    points, true_assignments = self.make_random_points(clusters, num_points)
 
     assignments = []
     for item in gmm.predict_assignments(
@@ -159,11 +135,6 @@ class GMMTest(test.TestCase):
     assignments = np.ravel(assignments)
     self.assertAllEqual(true_assignments, assignments)
 
-    # Test score
-    score = gmm.score(input_fn=self.input_fn(points=points,
-                                             batch_size=num_points), steps=1)
-    self.assertNear(score, np.sum(true_offsets), 4.05)
-
   def _compare_with_sklearn(self, cov_type):
     # sklearn version.
     iterations = 40
-- 
GitLab


From 39944db497317824ccfd6e9d04d3ea1a6b8c7851 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 09:01:44 -0800
Subject: [PATCH 758/966] Adding support for standalone Tanh operator.

PiperOrigin-RevId: 184845130
---
 .../internal/optimized/optimized_ops.h        | 183 ++++++++++++++----
 .../internal/reference/reference_ops.h        |   9 +-
 .../toco/graph_transformations/quantize.cc    |   2 +-
 3 files changed, 147 insertions(+), 47 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 5a8df67812..cd52385f41 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -3102,51 +3102,152 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
                  int32 input_zero_point, int32 input_range_radius,
                  int32 input_multiplier, int input_left_shift,
                  uint8* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
-          const int32 input_val_centered =
-              static_cast<int32>(input_val_u8) - input_zero_point;
-          uint8 output_val;
-          if (input_val_centered <= -input_range_radius) {
-            output_val = 0;
-          } else if (input_val_centered >= input_range_radius) {
-            output_val = 255;
-          } else {
-            const int32 input_val_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_val_centered, input_multiplier, input_left_shift);
-            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
-            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-            const FixedPoint4 input_val_f4 =
-                FixedPoint4::FromRaw(input_val_rescaled);
-            const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
-
-            using gemmlowp::RoundingDivideByPOT;
-            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
-            // TODO(mjmatthews): properly wire through this zero offset
-            output_val_s32 += 127;
-            if (output_val_s32 == -1) {
-              // May underflow since we cannot properly represent -1.0f
-              output_val_s32 = 0;
-            }
-            TFLITE_DCHECK_GE(output_val_s32, 0);
-            TFLITE_DCHECK_LE(output_val_s32, 255);
-            output_val = static_cast<uint8>(output_val_s32);
-          }
-          output_data[Offset(output_dims, c, x, y, b)] = output_val;
-        }
+  // Note that this is almost the exact same code as in Logistic().
+  gemmlowp::ScopedProfilingLabel label("Tanh");
+  /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
+  /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
+  /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
+  /* depth */ MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int size = RequiredBufferSizeForDims(input_dims);
+
+  int c = 0;
+  int32_t output_zero_point = 128;
+#ifdef USE_NEON
+  // Handle 16 values at a time
+  for (; c <= size - 16; c += 16) {
+    // Read input uint8 values, cast to int16 and subtract input_zero_point
+    uint8x16_t input_val_u8 = vld1q_u8(input_data + c);
+    int16x8_t input_val_centered_0 =
+        vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input_val_u8))),
+                  vdupq_n_s16(input_zero_point));
+    int16x8_t input_val_centered_1 =
+        vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input_val_u8))),
+                  vdupq_n_s16(input_zero_point));
+
+    // Prepare the bit masks that we will use at the end to implement the logic
+    // that was expressed in the scalar code with branching:
+    //   if (input_val_centered < -input_range_radius) {
+    //     output_val = 0;
+    //   } else if (input_val_centered > input_range_radius) {
+    //     output_val = 255;
+    //   } else {
+    //     ...
+    uint16x8_t mask_rightclamp_0 =
+        vcgtq_s16(input_val_centered_0, vdupq_n_s16(input_range_radius));
+    uint16x8_t mask_rightclamp_1 =
+        vcgtq_s16(input_val_centered_1, vdupq_n_s16(input_range_radius));
+    uint16x8_t mask_leftclamp_0 =
+        vcgeq_s16(input_val_centered_0, vdupq_n_s16(-input_range_radius));
+    uint16x8_t mask_leftclamp_1 =
+        vcgeq_s16(input_val_centered_1, vdupq_n_s16(-input_range_radius));
+    uint8x16_t mask_rightclamp = vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
+                                             vshrn_n_u16(mask_rightclamp_1, 8));
+    uint8x16_t mask_leftclamp = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
+                                            vshrn_n_u16(mask_leftclamp_1, 8));
+
+    // This performs what is expressed in the scalar code as
+    // const int32 input_val_rescaled =
+    //     MultiplyByQuantizedMultiplierGreaterThanOne(
+    //         input_val_centered, input_multiplier, input_left_shift);
+    int32x4_t input_val_rescaled_0 =
+        vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_0)),
+                  vdupq_n_s32(input_left_shift));
+    int32x4_t input_val_rescaled_1 =
+        vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_0)),
+                  vdupq_n_s32(input_left_shift));
+    int32x4_t input_val_rescaled_2 =
+        vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_1)),
+                  vdupq_n_s32(input_left_shift));
+    int32x4_t input_val_rescaled_3 =
+        vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_1)),
+                  vdupq_n_s32(input_left_shift));
+    input_val_rescaled_0 =
+        vqrdmulhq_n_s32(input_val_rescaled_0, input_multiplier);
+    input_val_rescaled_1 =
+        vqrdmulhq_n_s32(input_val_rescaled_1, input_multiplier);
+    input_val_rescaled_2 =
+        vqrdmulhq_n_s32(input_val_rescaled_2, input_multiplier);
+    input_val_rescaled_3 =
+        vqrdmulhq_n_s32(input_val_rescaled_3, input_multiplier);
+
+    // Invoke gemmlowp::tanh on FixedPoint wrapping int32x4_t
+    using FixedPoint4 = gemmlowp::FixedPoint<int32x4_t, 4>;
+    using FixedPoint0 = gemmlowp::FixedPoint<int32x4_t, 0>;
+    const FixedPoint4 input_val_f4_0 =
+        FixedPoint4::FromRaw(input_val_rescaled_0);
+    const FixedPoint4 input_val_f4_1 =
+        FixedPoint4::FromRaw(input_val_rescaled_1);
+    const FixedPoint4 input_val_f4_2 =
+        FixedPoint4::FromRaw(input_val_rescaled_2);
+    const FixedPoint4 input_val_f4_3 =
+        FixedPoint4::FromRaw(input_val_rescaled_3);
+    const FixedPoint0 output_val_f0_0 = gemmlowp::tanh(input_val_f4_0);
+    const FixedPoint0 output_val_f0_1 = gemmlowp::tanh(input_val_f4_1);
+    const FixedPoint0 output_val_f0_2 = gemmlowp::tanh(input_val_f4_2);
+    const FixedPoint0 output_val_f0_3 = gemmlowp::tanh(input_val_f4_3);
+
+    // Divide by 2^24 as in the scalar code
+    using gemmlowp::RoundingDivideByPOT;
+    int32x4_t output_val_s32_0 = RoundingDivideByPOT(output_val_f0_0.raw(), 24);
+    int32x4_t output_val_s32_1 = RoundingDivideByPOT(output_val_f0_1.raw(), 24);
+    int32x4_t output_val_s32_2 = RoundingDivideByPOT(output_val_f0_2.raw(), 24);
+    int32x4_t output_val_s32_3 = RoundingDivideByPOT(output_val_f0_3.raw(), 24);
+
+    // Add the output zero point
+    int32x4_t output_zero_point_s32 = vdupq_n_s32(output_zero_point);
+    output_val_s32_0 = vaddq_s32(output_val_s32_0, output_zero_point_s32);
+    output_val_s32_1 = vaddq_s32(output_val_s32_1, output_zero_point_s32);
+    output_val_s32_2 = vaddq_s32(output_val_s32_2, output_zero_point_s32);
+    output_val_s32_3 = vaddq_s32(output_val_s32_3, output_zero_point_s32);
+
+    // Cast output values to uint8, saturating
+    int16x8_t output_val_s16_0 = vcombine_s16(vqmovn_s32(output_val_s32_0),
+                                              vqmovn_s32(output_val_s32_1));
+    int16x8_t output_val_s16_1 = vcombine_s16(vqmovn_s32(output_val_s32_2),
+                                              vqmovn_s32(output_val_s32_3));
+    uint8x16_t output_val_u8 = vcombine_u8(vqmovun_s16(output_val_s16_0),
+                                           vqmovun_s16(output_val_s16_1));
+
+    // Perform the bit-masking with the bit masks computed at the beginning,
+    // see the comment there.
+    output_val_u8 = vorrq_u8(output_val_u8, mask_rightclamp);
+    output_val_u8 = vandq_u8(output_val_u8, mask_leftclamp);
+
+    // Store back to memory
+    vst1q_u8(output_data + c, output_val_u8);
+  }
+#endif
+  // Leftover loop: handle one value at a time with scalar code.
+  for (; c < size; ++c) {
+    const uint8 input_val_u8 = input_data[c];
+    const int32 input_val_centered =
+        static_cast<int32>(input_val_u8) - input_zero_point;
+    uint8 output_val;
+    if (input_val_centered < -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered > input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32 input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
+      using gemmlowp::RoundingDivideByPOT;
+      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+      output_val_s32 += output_zero_point;
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
       }
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8>(output_val_s32);
     }
+    output_data[c] = output_val;
   }
 }
-
 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        int32 zero_point, double scale, float* output_data,
                        const Dims<4>& output_dims) {
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index c5f09af8fe..f18543f4e4 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -2279,6 +2279,7 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
                  int32 input_zero_point, int32 input_range_radius,
                  int32 input_multiplier, int input_left_shift,
                  uint8* output_data, const Dims<4>& output_dims) {
+  const int32 output_zero_point = 128;
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
@@ -2307,11 +2308,9 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
 
             using gemmlowp::RoundingDivideByPOT;
             int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
-            // TODO(mjmatthews): properly wire through this zero offset
-            output_val_s32 += 127;
-            if (output_val_s32 == -1) {
-              // May underflow since we cannot properly represent -1.0f
-              output_val_s32 = 0;
+            output_val_s32 += output_zero_point;
+            if (output_val_s32 == 256) {
+              output_val_s32 = 255;
             }
             TFLITE_DCHECK_GE(output_val_s32, 0);
             TFLITE_DCHECK_LE(output_val_s32, 255);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 3a674f884d..d7f804ee43 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -294,7 +294,7 @@ bool ChooseHardcodedQuantizationForOperatorOutput(
   if (op.type == OperatorType::kTanh) {
     // Tanh has the range: [-1, 1].
     *quantized_data_type = ArrayDataType::kUint8;
-    quantization_params->zero_point = 127;
+    quantization_params->zero_point = 128;
     quantization_params->scale = 1. / 128.;
     // 0 should be exactly representable, as values will typically be centered
     // around 0, with many values near 0.
-- 
GitLab


From 8b801656734a9df8f198ec2146c4b94f48084793 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 09:08:51 -0800
Subject: [PATCH 759/966] Make TypeError more explicit on
 _assertAllCloseRecursive

PiperOrigin-RevId: 184846656
---
 tensorflow/python/framework/test_util.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 3864a4aa9e..bfdd98819e 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1146,13 +1146,19 @@ class TensorFlowTestCase(googletest.TestCase):
           del path[-1]
     # a and b are ndarray like objects
     else:
-      self._assertArrayLikeAllClose(
-          a,
-          b,
-          rtol=rtol,
-          atol=atol,
-          msg="Mismatched value: a%s is different from b%s." % (path_str,
-                                                                path_str))
+      try:
+        self._assertArrayLikeAllClose(
+            a,
+            b,
+            rtol=rtol,
+            atol=atol,
+            msg="Mismatched value: a%s is different from b%s." % (path_str,
+                                                                  path_str))
+      except TypeError as e:
+        msg = "Error: a%s has %s, but b%s has %s" % (
+            path_str, type(a), path_str, type(b))
+        e.args = ((e.args[0] + ' : ' + msg,) + e.args[1:])
+        raise
 
   def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
     """Asserts that two structures of numpy arrays, have near values.
-- 
GitLab


From f6010282b872b2c74eaf710a879b5b4a1756ae17 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 7 Feb 2018 10:18:53 -0800
Subject: [PATCH 760/966] [XLA:CPU] Fix/suppress issues caught by the C++
 linter

PiperOrigin-RevId: 184856538
---
 tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h |  6 +++---
 tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc | 11 ++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h
index afb9ac71fa..90050c4459 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTINE_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTINE_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTIME_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTIME_H_
 
 #include "llvm/IR/Module.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
@@ -42,4 +42,4 @@ void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math);
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTINE_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTIME_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 19250b8e7e..34c8e31060 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -270,15 +270,15 @@ bool RegisterKnownJITSymbols() {
   REGISTER_LIBM_SYMBOL(ilogb, int (*)(double));
   REGISTER_LIBM_SYMBOL(ldexp, double (*)(double, int));
   REGISTER_LIBM_SYMBOL(lgamma, double (*)(double));
-  REGISTER_LIBM_SYMBOL(llrint, long long (*)(double));
-  REGISTER_LIBM_SYMBOL(llround, long long (*)(double));
+  REGISTER_LIBM_SYMBOL(llrint, long long (*)(double));   // NOLINT(runtime/int)
+  REGISTER_LIBM_SYMBOL(llround, long long (*)(double));  // NOLINT(runtime/int)
   REGISTER_LIBM_SYMBOL(log, double (*)(double));
   REGISTER_LIBM_SYMBOL(log10, double (*)(double));
   REGISTER_LIBM_SYMBOL(log1p, double (*)(double));
   REGISTER_LIBM_SYMBOL(log2, double (*)(double));
   REGISTER_LIBM_SYMBOL(logb, double (*)(double));
-  REGISTER_LIBM_SYMBOL(lrint, long (*)(double));
-  REGISTER_LIBM_SYMBOL(lround, long (*)(double));
+  REGISTER_LIBM_SYMBOL(lrint, long (*)(double));   // NOLINT(runtime/int)
+  REGISTER_LIBM_SYMBOL(lround, long (*)(double));  // NOLINT(runtime/int)
   REGISTER_LIBM_SYMBOL(modf, double (*)(double, double*));
   REGISTER_LIBM_SYMBOL(nan, double (*)(const char*));
   REGISTER_LIBM_SYMBOL(nearbyint, double (*)(double));
@@ -289,7 +289,8 @@ bool RegisterKnownJITSymbols() {
   REGISTER_LIBM_SYMBOL(remquo, double (*)(double, double, int*));
   REGISTER_LIBM_SYMBOL(rint, double (*)(double));
   REGISTER_LIBM_SYMBOL(round, double (*)(double));
-  REGISTER_LIBM_SYMBOL(scalbln, double (*)(double, long));
+  REGISTER_LIBM_SYMBOL(scalbln,
+                       double (*)(double, long));  // NOLINT(runtime/int)
   REGISTER_LIBM_SYMBOL(scalbn, double (*)(double, int));
   REGISTER_LIBM_SYMBOL(sin, double (*)(double));
 #ifdef __APPLE__
-- 
GitLab


From 0e3b2b5805871f6d3dffff486a113b3c26c33b1f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 10:30:39 -0800
Subject: [PATCH 761/966] Improve model_pruner:   * Actually remove nodes
 marked for removal if fetches are known.   * Remove trivial nodes even in the
 presence of control inputs, except     for Identity nodes when       a) they
 are anchored on an Identity following a Switch node and          removal
 would require anchoring a control identity on the Switch, or       b) they
 have control inputs and feed a Merge node.   * Remove nodes only when
 in_degree * out_degree <= in_degree + out_degree.

Move input deduping utility function to utils.{h,cc}.

PiperOrigin-RevId: 184858685
---
 tensorflow/core/grappler/optimizers/BUILD     |   1 +
 .../optimizers/dependency_optimizer.cc        |  43 ++---
 .../optimizers/dependency_optimizer.h         |   2 -
 .../grappler/optimizers/graph_rewriter.cc     |  62 ++++++-
 .../core/grappler/optimizers/graph_rewriter.h |  16 +-
 .../core/grappler/optimizers/model_pruner.cc  |  31 ++--
 .../grappler/optimizers/model_pruner_test.cc  | 119 ++++++------
 tensorflow/core/grappler/utils.cc             |  14 ++
 tensorflow/core/grappler/utils.h              |   3 +
 tensorflow/core/grappler/utils_test.cc        | 174 +++++++++++-------
 .../python/debug/lib/debug_gradients_test.py  |  42 ++---
 11 files changed, 301 insertions(+), 206 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 2ac31ebf6a..3432de9dcd 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -140,6 +140,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
     ],
 )
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index db64e53026..edb0db65e9 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -52,21 +52,14 @@ bool RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
   return removed_input;
 }
 
-// Remove duplicate control inputs.
-void PruneControlInputs(NodeDef* node) {
-  std::unordered_set<string> inputs;
-  int pos = 0;
-  while (pos < node->input_size()) {
-    const string& input = node->input(pos);
-    if (!inputs.insert(NodeName(input)).second && IsControlInput(input)) {
-      VLOG(1) << "**** Removing duplicate control input: " << input
-              << " from node " << node->DebugString();
-      node->mutable_input()->SwapElements(pos, node->input_size() - 1);
-      node->mutable_input()->RemoveLast();
-    } else {
-      ++pos;
-    }
+void DeleteNodes(const std::set<int>& nodes_to_delete, GraphDef* graph) {
+  int last = graph->node_size() - 1;
+  for (auto it = nodes_to_delete.rbegin(); it != nodes_to_delete.rend(); ++it) {
+    const int index = *it;
+    graph->mutable_node()->SwapElements(index, last);
+    last--;
   }
+  graph->mutable_node()->DeleteSubrange(last + 1, nodes_to_delete.size());
 }
 
 }  // namespace
@@ -75,6 +68,7 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) {
   if (!IsIdentity(node)) {
     return true;
   }
+
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
   }
@@ -397,22 +391,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
 
 void DependencyOptimizer::CleanControlInputs() {
   for (int i = 0; i < optimized_graph_->node_size(); ++i) {
-    PruneControlInputs(optimized_graph_->mutable_node(i));
-  }
-}
-
-void DependencyOptimizer::DeleteNodes(const std::set<int>& nodes_to_delete) {
-  int last = optimized_graph_->node_size() - 1;
-  for (auto it = nodes_to_delete.rbegin(); it != nodes_to_delete.rend(); ++it) {
-    const int index = *it;
-    optimized_graph_->mutable_node()->SwapElements(index, last);
-    last--;
+    DedupControlInputs(optimized_graph_->mutable_node(i));
   }
-  optimized_graph_->mutable_node()->DeleteSubrange(last + 1,
-                                                   nodes_to_delete.size());
-  // Rebuild the NodeMap which was invalidated by the node swapping above.
-  node_map_.reset(new NodeMap(optimized_graph_));
-  BuildNodeToIdx();
 }
 
 Status DependencyOptimizer::OptimizeDependencies() {
@@ -437,7 +417,9 @@ Status DependencyOptimizer::OptimizeDependencies() {
   if (fetch_nodes_known_) {
     VLOG(1) << "Deleted " << nodes_to_delete.size() << " out of "
             << optimized_graph_->node_size() << " nodes.";
-    DeleteNodes(nodes_to_delete);
+    DeleteNodes(nodes_to_delete, optimized_graph_);
+    node_map_.reset(new NodeMap(optimized_graph_));
+    BuildNodeToIdx();
   }
   return Status::OK();
 }
@@ -576,7 +558,6 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     Status topo_sort_status;
     // Perform topological sort to prepare the graph for transitive reduction.
     topo_sort_status = TopologicalSort(optimized_graph_);
-
     // Set up index-based graph datastructures to speed up analysis steps below.
     node_map_.reset(new NodeMap(optimized_graph_));
     BuildNodeToIdx();
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index 0f47528a04..61ed154793 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -52,8 +52,6 @@ class DependencyOptimizer : public GraphOptimizer {
   void CleanControlInputs();
   // Builds a map from the &optimized_graph_->node(i) to i.
   void BuildNodeToIdx();
-  // Removes the given set of nodes from the graph.
-  void DeleteNodes(const std::set<int>& nodes_to_delete);
   // Tries to optimize the node with the given index, possibly additional
   // optimizations by inserting nodes in nodes_to_simplify, and pruning nodes by
   // inserting them in nodes_to_delete.
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.cc b/tensorflow/core/grappler/optimizers/graph_rewriter.cc
index 2d47ded156..b45ceb12a7 100644
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.cc
+++ b/tensorflow/core/grappler/optimizers/graph_rewriter.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
@@ -61,10 +62,19 @@ void GraphRewriter::ForwardInputs(
     const NodeDef& original_node,
     const std::unordered_set<const NodeDef*>& nodes_to_delete,
     NodeDef* new_node) {
-  ForwardInputsInternal(original_node, nodes_to_delete, new_node);
+  ForwardInputsInternal(original_node, nodes_to_delete, false, new_node);
   if (!new_node->name().empty()) {
     optimized_nodes_[new_node->name()] = new_node;
   }
+  // Reorder inputs such that control inputs come after regular inputs.
+  int pos = 0;
+  for (int i = 0; i < new_node->input_size(); ++i) {
+    if (!IsControlInput(new_node->input(i))) {
+      new_node->mutable_input()->SwapElements(pos, i);
+      ++pos;
+    }
+  }
+  DedupControlInputs(new_node);
 }
 
 bool GraphRewriter::DrivesControlDependency(const NodeDef& node) const {
@@ -72,6 +82,10 @@ bool GraphRewriter::DrivesControlDependency(const NodeDef& node) const {
          control_dependency_drivers_.end();
 }
 
+bool GraphRewriter::FeedsMerge(const NodeDef& node) const {
+  return merge_feeders_.find(&node) != merge_feeders_.end();
+}
+
 bool GraphRewriter::IsDrivenByControlDependency(const NodeDef& node) const {
   for (const auto& input : node.input()) {
     CHECK(!input.empty());
@@ -94,12 +108,27 @@ bool GraphRewriter::ReceivesRefValue(const NodeDef& node) const {
   return ref_receivers_.find(&node) != ref_receivers_.end();
 }
 
+bool GraphRewriter::IsDrivenBySwitch(const NodeDef& node) const {
+  return switch_receivers_.find(&node) != switch_receivers_.end();
+}
+
+bool GraphRewriter::RemovalIncreasesEdgeCount(const NodeDef& node) const {
+  const int in_degree = node.input_size();
+  auto itr = nodes_.find(node.name());
+  if (itr == nodes_.end()) {
+    return true;
+  }
+  const int out_degree = itr->second->out_degree;
+  return in_degree * out_degree > in_degree + out_degree;
+}
+
 void GraphRewriter::RecordConnectivity(
     const NodeDef& node, const std::unordered_set<string>& function_names) {
   const bool is_function =
       function_names.find(node.op()) != function_names.end();
 
   bool ref_receiver = false;
+  bool switch_receiver = false;
   for (const auto& input : node.input()) {
     int position = 0;
     string input_node_name = ParseNodeName(input, &position);
@@ -107,8 +136,14 @@ void GraphRewriter::RecordConnectivity(
     if (itr == nodes_.end()) {
       continue;
     }
-    const NodeInfo* fanin_info = itr->second.get();
+
+    NodeInfo* fanin_info = itr->second.get();
     const NodeDef* fanin = fanin_info->def;
+    if (IsMerge(node)) {
+      merge_feeders_.insert(fanin);
+    }
+    // Update out_degree of fanin.
+    ++fanin_info->out_degree;
     if (position < 0) {
       // This is a control edge
       control_dependency_drivers_.insert(fanin);
@@ -120,7 +155,9 @@ void GraphRewriter::RecordConnectivity(
       if (is_function) {
         function_neighbors_.insert(fanin);
       }
-
+      if (IsSwitch(*fanin)) {
+        switch_receiver = true;
+      }
       if (position < fanin_info->outputs.size() &&
           IsRefType(fanin_info->outputs[position])) {
         ref_receiver = true;
@@ -134,34 +171,41 @@ void GraphRewriter::RecordConnectivity(
   if (ref_receiver) {
     ref_receivers_.insert(&node);
   }
+  if (switch_receiver) {
+    switch_receivers_.insert(&node);
+  }
 }
 
 void GraphRewriter::ForwardInputsInternal(
     const NodeDef& node,
     const std::unordered_set<const NodeDef*>& nodes_to_delete,
-    NodeDef* new_node) {
+    bool add_as_control, NodeDef* new_node) {
   // To speed things up, use the optimized version of the node if
   // available.
   auto itr = optimized_nodes_.find(node.name());
   if (itr != optimized_nodes_.end()) {
     for (const string& input : itr->second->input()) {
-      *new_node->add_input() = input;
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
     }
     return;
   }
   for (const auto& input : node.input()) {
-    string input_node_name = NodeName(input);
+    const string input_node_name = NodeName(input);
     auto itr = nodes_.find(input_node_name);
     if (itr == nodes_.end()) {
       // Invalid input, preserve it as is.
-      *new_node->add_input() = input;
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
       continue;
     }
     const NodeDef* input_node = itr->second->def;
     if (nodes_to_delete.find(input_node) != nodes_to_delete.end()) {
-      ForwardInputsInternal(*input_node, nodes_to_delete, new_node);
+      ForwardInputsInternal(*input_node, nodes_to_delete,
+                            add_as_control || IsControlInput(input), new_node);
     } else {
-      *new_node->add_input() = input;
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
     }
   }
 }
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.h b/tensorflow/core/grappler/optimizers/graph_rewriter.h
index 4b9c9feef8..3d48d628e2 100644
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.h
+++ b/tensorflow/core/grappler/optimizers/graph_rewriter.h
@@ -58,15 +58,27 @@ class GraphRewriter {
   // Returns true if the node has input from a stateful op.
   bool ReceivesRefValue(const NodeDef& node) const;
 
+  // Returns true if the node is driven by a Switch node.
+  bool IsDrivenBySwitch(const NodeDef& node) const;
+
+  // Returns true if the node feeds a Merge node.
+  bool FeedsMerge(const NodeDef& node) const;
+
+  // Returns true if removal of this degree would increase edge count, i.e. if
+  // in-degree * out-degree > in-degree + out-degree or if the condition could
+  // not be verified.
+  bool RemovalIncreasesEdgeCount(const NodeDef& node) const;
+
  private:
   void RecordConnectivity(const NodeDef& node,
                           const std::unordered_set<string>& function_names);
   void ForwardInputsInternal(
       const NodeDef& original_node,
       const std::unordered_set<const NodeDef*>& nodes_to_delete,
-      NodeDef* new_node);
+      bool add_as_control, NodeDef* new_node);
 
   struct NodeInfo {
+    int out_degree = 0;
     const NodeDef* def;
 
     // These are filled in when the NodeInfo is built, but not that they
@@ -80,6 +92,8 @@ class GraphRewriter {
   std::unordered_set<const NodeDef*> function_neighbors_;
   std::unordered_set<const NodeDef*> cross_device_receivers_;
   std::unordered_set<const NodeDef*> ref_receivers_;
+  std::unordered_set<const NodeDef*> switch_receivers_;
+  std::unordered_set<const NodeDef*> merge_feeders_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc
index c9bec7890e..01282401a3 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner.cc
@@ -26,10 +26,17 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-bool IsTrivialOp(const NodeDef& node) {
+bool IsTrivialOp(const NodeDef& node, const GraphRewriter& rewriter) {
   // Remove the stop gradient nodes since they serve no purpose once the graph
   // is built. Also remove Identity ops.
-  if (IsStopGradient(node) || IsIdentity(node)) {
+  if (IsStopGradient(node)) {
+    return true;
+  }
+  if (IsIdentity(node) &&
+      !(rewriter.FeedsMerge(node) &&
+        rewriter.IsDrivenByControlDependency(node)) &&
+      !(rewriter.IsDrivenBySwitch(node) &&
+        rewriter.DrivesControlDependency(node))) {
     return true;
   }
   if (IsAddN(node) && NumNonControlInputs(node) <= 1) {
@@ -41,7 +48,7 @@ bool IsTrivialOp(const NodeDef& node) {
 
 Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
                              GraphDef* pruned_graph) {
-  std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve();
+  const std::unordered_set<string>& nodes_to_preserve = item.NodesToPreserve();
 
   // Prune all the nodes that won't be executed, ie all the nodes that aren't in
   // the fanin of a fetch node. If fetch nodes aren't specified, we'll assume
@@ -72,7 +79,7 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
   // Check if we can further prune the graph, by removing the trivial ops.
   std::unordered_set<const NodeDef*> nodes_to_delete;
   for (auto& node : runnable_item.graph.node()) {
-    if (!IsTrivialOp(node)) {
+    if (!IsTrivialOp(node, rewriter)) {
       continue;
     }
 
@@ -95,8 +102,7 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
     //   converting references to non-references. It is important to preserve
     //   these non-references since the partitioner will avoid sending
     //   non-references across partitions more than once.
-    if (!rewriter.DrivesControlDependency(node) &&
-        !rewriter.IsDrivenByControlDependency(node) &&
+    if (!rewriter.RemovalIncreasesEdgeCount(node) &&
         !rewriter.IsConnectedToFunction(node) &&
         !rewriter.IsDrivenByAnotherDevice(node) &&
         !rewriter.ReceivesRefValue(node)) {
@@ -112,13 +118,16 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
     return Status::OK();
   }
 
+  const bool fetches_are_known = !item.fetch.empty();
   for (auto& node : runnable_item.graph.node()) {
-    NodeDef* new_node = pruned_graph->add_node();
-    *new_node = node;
-    new_node->clear_input();
-    rewriter.ForwardInputs(node, nodes_to_delete, new_node);
+    if (!fetches_are_known ||
+        nodes_to_delete.find(&node) == nodes_to_delete.end()) {
+      NodeDef* new_node = pruned_graph->add_node();
+      *new_node = node;
+      new_node->clear_input();
+      rewriter.ForwardInputs(node, nodes_to_delete, new_node);
+    }
   }
-
   VLOG(1) << "Pruned " << nodes_to_delete.size()
           << " nodes from the graph. The graph now contains "
           << pruned_graph->node_size() << " nodes.";
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index ee722f311e..c39444299e 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -156,47 +156,42 @@ TEST_F(ModelPrunerTest, NoOpPruning) {
   const NodeDef& new_e = output.node(4);
   EXPECT_EQ(NodeName(e.name()), new_e.name());
 
-  EXPECT_EQ(1, new_e.input_size());
-  EXPECT_EQ(NodeName(d.name()), new_e.input(0));
-  EXPECT_EQ(2, new_d.input_size());
-  EXPECT_EQ(NodeName(b.name()), new_d.input(0));
-  EXPECT_EQ(1, new_c.input_size());
-  EXPECT_EQ(NodeName(b.name()), new_c.input(0));
+  for (const auto& new_node : output.node()) {
+    if (new_node.name() != "a") {
+      EXPECT_EQ(1, new_node.input_size());
+      EXPECT_EQ("a", new_node.input(0));
+    }
+  }
 }
 
-TEST_F(ModelPrunerTest, PruningSkipsCtrlDependencies) {
-  // Build a simple graph with a few trivially prunable ops.
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-
-  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
-  Output b = ops::Sqrt(s.WithOpName("b"), {a});
-  Output c = ops::Identity(s.WithOpName("c"), b);
-  Output d = ops::Identity(s.WithOpName("d"), c);
-  Output e = ops::Sqrt(s.WithOpName("e").WithControlDependencies(c), {d});
+TEST_F(ModelPrunerTest, PreserveIdentities) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  ops::Variable v_in(scope.WithOpName("v_in"), {3}, DT_FLOAT);
+  ops::Variable v_ctrl(scope.WithOpName("v_ctrl"), {}, DT_BOOL);
+  ops::Switch s(scope.WithOpName("switch"), v_in, v_ctrl);
+  // id0 is preserved because it is fed by a Switch and drives a
+  // control dependency.
+  Output id0 = ops::Identity(scope.WithOpName("id0"), s.output_true);
+  // id1 is preserved because it feeds a Merge.
+  Output id1 = ops::Identity(
+      scope.WithOpName("id1").WithControlDependencies(v_ctrl), s.output_false);
+  Output id2 = ops::Identity(scope.WithOpName("id2"), id0);
+  Output id3 =
+      ops::Identity(scope.WithOpName("id3").WithControlDependencies(id0), id1);
+  auto merge = ops::Merge(scope.WithOpName("merge"), {id0, id1});
 
   GrapplerItem item;
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch.push_back("id2");
+  item.fetch.push_back("id3");
+  item.fetch.push_back("merge");
 
   ModelPruner pruner;
   GraphDef output;
   Status status = pruner.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
 
-  EXPECT_EQ(5, output.node_size());
-  const NodeDef& new_a = output.node(0);
-  EXPECT_EQ(NodeName(a.name()), new_a.name());
-  const NodeDef& new_b = output.node(1);
-  EXPECT_EQ(NodeName(b.name()), new_b.name());
-  const NodeDef& new_c = output.node(2);
-  EXPECT_EQ(NodeName(c.name()), new_c.name());
-  const NodeDef& new_d = output.node(3);
-  EXPECT_EQ(NodeName(d.name()), new_d.name());
-  const NodeDef& new_e = output.node(4);
-  EXPECT_EQ(NodeName(e.name()), new_e.name());
-
-  EXPECT_EQ(2, new_e.input_size());
-  EXPECT_EQ(NodeName(c.name()), new_e.input(0));
-  EXPECT_EQ("^c", new_e.input(1));
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
 }
 
 TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) {
@@ -239,54 +234,47 @@ TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) {
   EXPECT_EQ("b", new_e.input(0));
 }
 
-TEST_F(ModelPrunerTest, PruningPerservesCtrlDependencies) {
+TEST_F(ModelPrunerTest, PruningForwardsCtrlDependencies) {
   // Build a simple graph with a few trivially prunable ops.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
   Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
   Output b = ops::Sqrt(s.WithOpName("b"), {a});
   Output c = ops::Sqrt(s.WithOpName("c"), {a});
-  Output d = ops::Identity(s.WithOpName("d"), c);
-  Output e = ops::Identity(s.WithOpName("e"), d);
-  Output f = ops::Sqrt(s.WithOpName("f"), {e});
+  Output d = ops::Identity(s.WithOpName("d").WithControlDependencies(b), c);
+  Output e = ops::Identity(s.WithOpName("e").WithControlDependencies(c), d);
+  Output f = ops::Sqrt(s.WithOpName("f"), {d});
+  Output g = ops::Sqrt(s.WithOpName("g"), {e});
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
-  // Add a control dependency between b and d and another one between c and e.
-  // They should be properly forwarded.
-  EXPECT_EQ("d", item.graph.node(3).name());
-  EXPECT_EQ("e", item.graph.node(4).name());
-  *item.graph.mutable_node(3)->add_input() = "^b";
-  *item.graph.mutable_node(4)->add_input() = "^c";
+  item.fetch.push_back("f");
+  item.fetch.push_back("g");
 
   ModelPruner pruner;
   GraphDef output;
   Status status = pruner.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
+  LOG(INFO) << "After: " << output.DebugString();
 
-  EXPECT_EQ(6, output.node_size());
-  const NodeDef& new_a = output.node(0);
-  EXPECT_EQ(NodeName(a.name()), new_a.name());
-  const NodeDef& new_b = output.node(1);
-  EXPECT_EQ(NodeName(b.name()), new_b.name());
-  const NodeDef& new_c = output.node(2);
-  EXPECT_EQ(NodeName(c.name()), new_c.name());
-  const NodeDef& new_d = output.node(3);
-  EXPECT_EQ(NodeName(d.name()), new_d.name());
-  const NodeDef& new_e = output.node(4);
-  EXPECT_EQ(NodeName(e.name()), new_e.name());
-  const NodeDef& new_f = output.node(5);
-  EXPECT_EQ(NodeName(f.name()), new_f.name());
-
-  EXPECT_EQ(1, new_f.input_size());
-  EXPECT_EQ(NodeName(e.name()), new_f.input(0));
-  EXPECT_EQ(2, new_e.input_size());
-  EXPECT_EQ(NodeName(d.name()), new_e.input(0));
-  EXPECT_EQ("^c", new_e.input(1));
-  EXPECT_EQ(2, new_d.input_size());
-  EXPECT_EQ(NodeName(c.name()), new_d.input(0));
-  EXPECT_EQ("^b", new_d.input(1));
+  EXPECT_EQ(5, output.node_size());
+  for (const auto& new_node : output.node()) {
+    // "d" and "e" should be removed.
+    EXPECT_NE("d", new_node.name());
+    EXPECT_NE("e", new_node.name());
+    if (new_node.name() == "g") {
+      EXPECT_EQ(2, new_node.input_size());
+      // The input from switch should be forwarded to id3.
+      EXPECT_EQ("c", new_node.input(0));
+      EXPECT_EQ("^b", new_node.input(1));
+    }
+    if (new_node.name() == "f") {
+      EXPECT_EQ(2, new_node.input_size());
+      // The input from switch should be forwarded to id3.
+      EXPECT_EQ("c", new_node.input(0));
+      EXPECT_EQ("^b", new_node.input(1));
+    }
+  }
 }
 
 TEST_F(ModelPrunerTest, PruningPerservesFetch) {
@@ -296,6 +284,7 @@ TEST_F(ModelPrunerTest, PruningPerservesFetch) {
   Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
   Output b = ops::Sqrt(s.WithOpName("b"), {a});
   Output c = ops::Identity(s.WithOpName("c"), b);
+  Output d = ops::Identity(s.WithOpName("d"), c);
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 634577ed30..7bfaf36865 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -312,6 +312,20 @@ void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation,
   }
 }
 
+void DedupControlInputs(NodeDef* node) {
+  std::unordered_set<string> inputs;
+  int pos = 0;
+  while (pos < node->input_size()) {
+    const string& input = node->input(pos);
+    if (!inputs.insert(NodeName(input)).second && IsControlInput(input)) {
+      node->mutable_input()->SwapElements(pos, node->input_size() - 1);
+      node->mutable_input()->RemoveLast();
+    } else {
+      ++pos;
+    }
+  }
+}
+
 namespace {
 template <typename T>
 inline void STLSortAndRemoveDuplicates(T* v) {
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 8840c44d05..4ecb28f681 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -143,6 +143,9 @@ int NumNonControlInputs(const NodeDef& node);
 // Number of connected non-control outputs.
 int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map);
 
+// Removes redundant control inputs from node.
+void DedupControlInputs(NodeDef* node);
+
 // Returns the data type in attribute `attr_name` of `node`. If that attribute
 // doesn't exist, returns DT_INVALID.
 DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name);
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index ba4e6b1bae..eabce5b5ee 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -29,83 +29,84 @@ namespace {
 class UtilsTest : public ::testing::Test {
  protected:
   NodeDef CreateConcatOffsetNode() const {
-    const string gdef_ascii = R"EOF(
-name: "gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/ConcatOffset"
-op: "ConcatOffset"
-input: "InceptionV3/Mixed_7c/Branch_1/concat_v2/axis"
-input: "gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/Shape"
-input: "gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/Shape_1"
-attr {
-  key: "N"
-  value {
-    i: 2
-  }
-}
-    )EOF";
+    const string gdef_ascii =
+        " name: 'gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/"
+        "ConcatOffset'"
+        " op: 'ConcatOffset'"
+        " input: 'InceptionV3/Mixed_7c/Branch_1/concat_v2/axis'"
+        " input: 'gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/Shape'"
+        " input: "
+        " 'gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/Shape_1'"
+        " attr {"
+        "  key: 'N'"
+        "  value {"
+        "    i: 2"
+        "  }"
+        " }";
     NodeDef node;
     CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &node));
     return node;
   }
 
   NodeDef CreateDequeueNode() const {
-    const string gdef_ascii = R"EOF(
-name: "Train/TrainInput/input_producer_Dequeue"
-op: "QueueDequeueV2"
-input: "Train/TrainInput/input_producer"
-attr {
-  key: "component_types"
-  value {
-    list {
-      type: DT_INT32
-    }
-  }
-}
-attr {
-  key: "timeout_ms"
-  value {
-    i: -1
-  }
-}
-    )EOF";
+    const string gdef_ascii =
+        " name: 'Train/TrainInput/input_producer_Dequeue'"
+        " op: 'QueueDequeueV2'"
+        " input: 'Train/TrainInput/input_producer'"
+        " attr {"
+        "  key: 'component_types'"
+        "   value {"
+        "     list {"
+        "       type: DT_INT32"
+        "     }"
+        "   }"
+        " }"
+        " attr {"
+        "   key: 'timeout_ms'"
+        "   value {"
+        "     i: -1"
+        "   }"
+        " }";
+
     NodeDef node;
     CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &node));
     return node;
   }
 
   NodeDef CreateFusedBatchNormNode() const {
-    const string gdef_ascii = R"EOF(
-name: "InceptionV3/Conv2d_1a_3x3/BatchNorm/FusedBatchNorm"
-op: "FusedBatchNorm"
-input: "InceptionV3/Conv2d_1a_3x3/BatchNorm/FusedBatchNorm"
-input: "InceptionV3/Conv2d_1a_3x3/BatchNorm/gamma/read"
-input: "InceptionV3/Conv2d_1a_3x3/BatchNorm/beta/read"
-input: "InceptionV3/Conv2d_1a_3x3/BatchNorm/Const"
-input: "InceptionV3/Conv2d_1a_3x3/BatchNorm/Const_1"
-attr {
-  key: "T"
-  value {
-    type: DT_FLOAT
-  }
-}
-attr {
-  key: "data_format"
-  value {
-    s: "NHWC"
-  }
-}
-attr {
-  key: "epsilon"
-  value {
-    f: 0.001
-  }
-}
-attr {
-  key: "is_training"
-  value {
-    b: true
-  }
-}
-    )EOF";
+    const string gdef_ascii =
+        " name: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/FusedBatchNorm'"
+        " op: 'FusedBatchNorm'"
+        " input: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/FusedBatchNorm'"
+        " input: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/gamma/read'"
+        " input: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/beta/read'"
+        " input: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/Const'"
+        " input: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/Const_1'"
+        " attr {"
+        "   key: 'T'"
+        "   value {"
+        "     type: DT_FLOAT"
+        "   }"
+        " }"
+        " attr {"
+        "   key: 'data_format'"
+        "   value {"
+        "     s: 'NHWC'"
+        "   }"
+        " }"
+        " attr {"
+        "   key: 'epsilon'"
+        "   value {"
+        "     f: 0.001"
+        "   }"
+        " }"
+        " attr {"
+        "   key: 'is_training'"
+        "   value {"
+        "     b: true"
+        "   }"
+        " }";
+
     NodeDef node;
     CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &node));
     return node;
@@ -250,6 +251,49 @@ TEST_F(UtilsTest, GetTailOfChain) {
   EXPECT_EQ("noop", tail->name());
 }
 
+TEST_F(UtilsTest, DedupControlInputs) {
+  NodeDef foo;
+  foo.set_name("foo");
+  foo.add_input("bar");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(1, foo.input_size());
+  EXPECT_EQ("bar", foo.input(0));
+
+  foo.set_input(0, "^bar");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(1, foo.input_size());
+  EXPECT_EQ("^bar", foo.input(0));
+
+  foo.set_input(0, "bar");
+  foo.add_input("bar");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(2, foo.input_size());
+  EXPECT_EQ("bar", foo.input(0));
+  EXPECT_EQ("bar", foo.input(1));
+
+  foo.set_input(1, "^bar");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(1, foo.input_size());
+  EXPECT_EQ("bar", foo.input(0));
+
+  foo.set_input(0, "^bar");
+  foo.add_input("^bar");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(1, foo.input_size());
+  EXPECT_EQ("^bar", foo.input(0));
+
+  foo.set_input(0, "bar");
+  foo.add_input("gnu");
+  foo.add_input("^bar");
+  foo.add_input("^gnu");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(2, foo.input_size());
+  EXPECT_EQ("bar", foo.input(0));
+  EXPECT_EQ("gnu", foo.input(1));
+}
+
+TEST_F(UtilsTest, DeleteNodes) {}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py
index c1e9869d97..01867fc69d 100644
--- a/tensorflow/python/debug/lib/debug_gradients_test.py
+++ b/tensorflow/python/debug/lib/debug_gradients_test.py
@@ -40,6 +40,7 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
     rewriter_config = rewriter_config_pb2.RewriterConfig(
+        disable_model_pruning=True,
         dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
     graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
     config = config_pb2.ConfigProto(graph_options=graph_options)
@@ -117,8 +118,8 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
   def testCallingIdentifyGradientTwiceWithTheSameGradientsDebuggerErrors(self):
     grad_debugger = debug_gradients.GradientsDebugger()
     grad_debugger.identify_gradient(self.w)
-    with self.assertRaisesRegexp(
-        ValueError, "The graph already contains an op named .*"):
+    with self.assertRaisesRegexp(ValueError,
+                                 "The graph already contains an op named .*"):
       grad_debugger.identify_gradient(self.w)
 
   def testIdentifyGradientWorksOnMultipleLosses(self):
@@ -144,10 +145,10 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertIsNot(dz1_dy, dz2_dy)
 
     self.sess.run(variables.global_variables_initializer())
-    self.assertAllClose(5.0 ** 2, self.sess.run(z1))
-    self.assertAllClose(5.0 ** 0.5, self.sess.run(z2))
+    self.assertAllClose(5.0**2, self.sess.run(z1))
+    self.assertAllClose(5.0**0.5, self.sess.run(z2))
     self.assertAllClose(2.0 * 5.0, self.sess.run(dz1_dy))
-    self.assertAllClose(0.5 * (5.0 ** -0.5), self.sess.run(dz2_dy))
+    self.assertAllClose(0.5 * (5.0**-0.5), self.sess.run(dz2_dy))
 
   def testIdentifyGradientRaisesLookupErrorForUnknownXTensor(self):
     grad_debugger_1 = debug_gradients.GradientsDebugger()
@@ -259,8 +260,8 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.sess.run(variables.global_variables_initializer())
     self.assertAllClose(3.0, self.sess.run(u_grad))
     self.assertAllClose(2.0, self.sess.run(v_grad))
-    self.assertAllClose(
-        3.0, self.sess.run(grad_debugger.gradient_tensor("u:0")))
+    self.assertAllClose(3.0, self.sess.run(
+        grad_debugger.gradient_tensor("u:0")))
 
   def testWatchGradientsWorksOnMultipleTensors(self):
     y = math_ops.add(self.w, -1.0, name="y")
@@ -277,10 +278,10 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertIsInstance(grad_debugger.gradient_tensor("w:0"), ops.Tensor)
 
     self.sess.run(variables.global_variables_initializer())
-    self.assertAllClose(
-        1.0, self.sess.run(grad_debugger.gradient_tensor("w:0")))
-    self.assertAllClose(
-        3.0, self.sess.run(grad_debugger.gradient_tensor("u:0")))
+    self.assertAllClose(1.0, self.sess.run(
+        grad_debugger.gradient_tensor("w:0")))
+    self.assertAllClose(3.0, self.sess.run(
+        grad_debugger.gradient_tensor("u:0")))
 
   def testWatchGradientsByXTensorsWorks(self):
     y = math_ops.add(self.w, -1.0, name="foo/y")
@@ -290,8 +291,8 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     # But we can still get the gradient tensors by using
     # watch_gradients_by_x_tensors().
     grad_debugger = debug_gradients.GradientsDebugger()
-    with grad_debugger.watch_gradients_by_tensors(
-        self.sess.graph, [self.w, self.u, y]):
+    with grad_debugger.watch_gradients_by_tensors(self.sess.graph,
+                                                  [self.w, self.u, y]):
       gradient_descent.GradientDescentOptimizer(0.1).minimize(z)
 
     self.assertEqual(3, len(grad_debugger.gradient_tensors()))
@@ -324,18 +325,18 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertIsNot(dz1_dy, dz2_dy)
 
     self.sess.run(variables.global_variables_initializer())
-    self.assertAllClose(5.0 ** 2, self.sess.run(z1))
-    self.assertAllClose(5.0 ** 0.5, self.sess.run(z2))
+    self.assertAllClose(5.0**2, self.sess.run(z1))
+    self.assertAllClose(5.0**0.5, self.sess.run(z2))
     self.assertAllClose(2.0 * 5.0, self.sess.run(dz1_dy))
-    self.assertAllClose(0.5 * (5.0 ** -0.5), self.sess.run(dz2_dy))
+    self.assertAllClose(0.5 * (5.0**-0.5), self.sess.run(dz2_dy))
 
   def testGradientsValuesFromDumpWorks(self):
     y = math_ops.add(self.w, -1.0, name="y")
     z = math_ops.square(y, name="z")
 
     grad_debugger = debug_gradients.GradientsDebugger()
-    with grad_debugger.watch_gradients_by_tensors(
-        self.sess.graph, [self.w, self.u, y]):
+    with grad_debugger.watch_gradients_by_tensors(self.sess.graph,
+                                                  [self.w, self.u, y]):
       train_op = gradient_descent.GradientDescentOptimizer(0.1).minimize(z)
 
     self.sess.run(variables.global_variables_initializer())
@@ -343,10 +344,7 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     run_options = config_pb2.RunOptions(output_partition_graphs=True)
     dump_dir = tempfile.mkdtemp()
     debug_url = "file://" + dump_dir
-    debug_utils.watch_graph(
-        run_options,
-        self.sess.graph,
-        debug_urls=debug_url)
+    debug_utils.watch_graph(run_options, self.sess.graph, debug_urls=debug_url)
     run_metadata = config_pb2.RunMetadata()
     self.assertAllClose(2.0, self.sess.run(self.u))
     self.sess.run(train_op, options=run_options, run_metadata=run_metadata)
-- 
GitLab


From 3d86d8ce14989ca65a59ad4cf37f690694bf6267 Mon Sep 17 00:00:00 2001
From: Phil <ijund.phil@gmail.com>
Date: Wed, 7 Feb 2018 19:59:59 +0100
Subject: [PATCH 762/966] Add unsortedsegment(prod/min/max/sqrt_n/mean).
 (#15858)

* Add unsortedsegment(prod/min/max/sqrt_n/mean).

This commit adds CPU/GPU implementations for prod/min/max
ops and python implementations for mean/sqrt_n. Also, it adapts and unifies the
corresponding tests of all unsorted reductions.
Note: The new gradient of unsorted_segment_max fixes the crash occuring when
negative indices on CPU are used.

* update golden API

* Fix compilation of atomicAdd for cuda_arch < 600. \n This commit moves the std::complex specialization of atomicAdd below the double specialization of atomicAdd for cuda_arch 600.

* Enable bfloat16, change inline to EIGEN_STRONG_INLINE.

* fix includes of cuda_device_functions; fix typo
---
 .../base_api/api_def_UnsortedSegmentMax.pbtxt |  13 +-
 .../base_api/api_def_UnsortedSegmentMin.pbtxt |  33 ++
 .../api_def_UnsortedSegmentProd.pbtxt         |  32 ++
 tensorflow/core/framework/numeric_types.h     |  40 ++-
 tensorflow/core/kernels/cwise_op_maximum.cc   |   4 +-
 tensorflow/core/kernels/reshape_op.cc         |   1 -
 .../core/kernels/segment_reduction_ops.cc     | 305 +++++++++++-------
 .../core/kernels/segment_reduction_ops.h      | 116 ++++---
 .../kernels/segment_reduction_ops_gpu.cu.cc   | 138 ++++----
 tensorflow/core/ops/math_ops.cc               |  20 ++
 tensorflow/core/util/cuda_device_functions.h  | 143 +++++++-
 tensorflow/core/util/cuda_kernel_helper.h     |  54 ----
 tensorflow/python/eager/backprop.py           |   2 +
 .../segment_reduction_ops_test.py             | 165 ++++++----
 tensorflow/python/ops/math_grad.py            | 136 ++++++--
 tensorflow/python/ops/math_ops.py             |  84 +++++
 tensorflow/tools/api/golden/tensorflow.pbtxt  |  12 +
 17 files changed, 900 insertions(+), 398 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index 8298d62f25..c6b22be30c 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -14,20 +14,21 @@ Has same shape as data, except for dimension 0 which
 has size `num_segments`.
 END
   }
-  summary: "Computes the Max along segments of a tensor."
+  summary: "Computes the maximum along segments of a tensor."
   description: <<END
 Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
 segments.
 
-This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-Instead of computing the sum over segments, it computes the maximum
-such that:
+This operator is similar to the unsorted segment sum operator found
+[(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the maximum such that:
 
 \\(output_i = \max_j data_j\\) where max is over `j` such
 that `segment_ids[j] == i`.
 
-If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
- `output[i] = numeric_limits<T>::min()`.
+If the maximum is empty for a given segment ID `i`, it outputs the smallest
+possible value for the specific numeric type,
+`output[i] = numeric_limits<T>::lowest()`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
new file mode 100644
index 0000000000..55ea69b5dd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "UnsortedSegmentMin"
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+  }
+  summary: "Computes the minimum along segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+This operator is similar to the unsorted segment sum operator found
+[(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the minimum such that:
+
+\\(output_i = \min_j data_j\\) where min is over `j` such
+that `segment_ids[j] == i`.
+
+If the minimum is empty for a given segment ID `i`, it outputs the largest
+possible value for the specific numeric type,
+`output[i] = numeric_limits<T>::max()`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
new file mode 100644
index 0000000000..577ff53d60
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "UnsortedSegmentProd"
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+  }
+  summary: "Computes the product along segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+This operator is similar to the unsorted segment sum operator found
+[(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the product of all
+entries belonging to a segment such that:
+
+\\(output_i = \prod_j data_j\\) where the product is over `j` such
+that `segment_ids[j] == i`.
+
+If there is no entry for a given segment ID `i`, it outputs 1.
+END
+}
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index 8249059c29..4c38fbbe59 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_FRAMEWORK_NUMERIC_TYPES_H_
 
 #include <complex>
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 // Disable clang-format to prevent 'FixedPoint' header from being included
 // before 'Tensor' header on which it depends.
@@ -43,12 +42,47 @@ typedef Eigen::QUInt16 quint16;
 
 }  // namespace tensorflow
 
+
+
+
+static inline tensorflow::bfloat16 FloatToBFloat16(float float_val) {
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    return *reinterpret_cast<tensorflow::bfloat16*>(
+        reinterpret_cast<uint16_t*>(&float_val));
+#else
+    return *reinterpret_cast<tensorflow::bfloat16*>(
+        &(reinterpret_cast<uint16_t*>(&float_val)[1]));
+#endif
+}
+    
 namespace Eigen {
 // TODO(xpan): We probably need to overwrite more methods to have correct eigen
-// behavior. E.g. loest(), is_integer, etc. See NumTraits.h in eigen.
+// behavior. E.g. epsilon(), dummy_precision, etc. See NumTraits.h in eigen.
 template <>
 struct NumTraits<tensorflow::bfloat16>
-    : GenericNumTraits<tensorflow::bfloat16> {};
+    : GenericNumTraits<tensorflow::bfloat16> {
+  enum {
+    IsInteger = 0,
+    IsSigned = 1,
+    RequireInitialization = 0
+  };
+  static EIGEN_STRONG_INLINE tensorflow::bfloat16 highest() {
+    return FloatToBFloat16(NumTraits<float>::highest());
+  }
+
+  static EIGEN_STRONG_INLINE tensorflow::bfloat16 lowest() {
+    return FloatToBFloat16(NumTraits<float>::lowest());
+  }
+
+  static EIGEN_STRONG_INLINE tensorflow::bfloat16 infinity() {
+    return FloatToBFloat16(NumTraits<float>::infinity());
+  }
+
+  static EIGEN_STRONG_INLINE tensorflow::bfloat16 quiet_NaN() {
+    return FloatToBFloat16(NumTraits<float>::quiet_NaN());
+  }
+};
+
 
 using ::tensorflow::operator==;
 using ::tensorflow::operator!=;
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
index 8c54f22f10..e8a58eea80 100644
--- a/tensorflow/core/kernels/cwise_op_maximum.cc
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Maximum", functor::maximum, float, Eigen::half,
-          double, int32, int64);
+REGISTER6(BinaryOp, CPU, "Maximum", functor::maximum, float, Eigen::half,
+          bfloat16, double, int32, int64);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Maximum", functor::maximum, float, Eigen::half,
           double, int64);
diff --git a/tensorflow/core/kernels/reshape_op.cc b/tensorflow/core/kernels/reshape_op.cc
index 8b86596721..33c63e7050 100644
--- a/tensorflow/core/kernels/reshape_op.cc
+++ b/tensorflow/core/kernels/reshape_op.cc
@@ -43,7 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Reshape")
                               .TypeConstraint<int64>("Tshape"), \
                           ReshapeOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
-TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
 TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 27b8081eb8..6c4685a50a 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -20,10 +20,10 @@ limitations under the License.
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA
 
-#include "tensorflow/core/kernels/segment_reduction_ops.h"
-#include <vector>
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/segment_reduction_ops.h"
+#include <vector>
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -356,158 +356,180 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL);
 #undef REGISTER_GPU_SORTED_KERNELS_ALL
 #endif  // GOOGLE_CUDA
 
+// ____________________________________________________________________________
+// Unsorted segment reduction ops.
+
 namespace functor {
 
-// UnsortedSegmentSumFunctor implementation for CPUDevice.
-// todo: Remove duplicate code in UnsortedSegmentSumFunctor and
-// UnsortedSegmentMaxFunctor.
-template <typename T, typename Index>
-struct UnsortedSegmentSumFunctor<CPUDevice, T, Index>
-    : UnsortedSegmentBaseFunctor<CPUDevice, T, Index> {
-  void operator()(OpKernelContext* ctx, const CPUDevice& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
+// The ReductionFunctor implementation for CPU.
+template <typename T, typename Index, typename InitialValueF,
+          typename ReductionF>
+struct UnsortedSegmentFunctor<CPUDevice, T, Index, InitialValueF, ReductionF> {
+  void operator()(OpKernelContext* ctx, const Index num_segments,
+                  const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
                   const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output) override {
-    output.setZero();
+                  typename TTypes<T, 2>::Tensor output) {
+    output.setConstant(InitialValueF()());
     if (data_size == 0) {
       return;
     }
     const int64 N = segment_ids.dimension(0);
+    ReductionF reduction;
     auto data_flat = typename TTypes<T, 2>::ConstTensor(data, N, data_size / N);
     for (int64 i = 0; i < N; ++i) {
       Index j = internal::SubtleMustCopy(segment_ids(i));
       if (j < 0) {
         continue;
       }
-      OP_REQUIRES(ctx, FastBoundsCheck(j, output_rows),
+      OP_REQUIRES(ctx, FastBoundsCheck(j, num_segments),
                   errors::InvalidArgument(
                       "segment_ids", SliceDebugString(segment_ids_shape, i),
-                      " = ", j, " is out of range [0, ", output_rows, ")"));
-      output.template chip<0>(j) += data_flat.template chip<0>(i);
+                      " = ", j, " is out of range [0, ", num_segments, ")"));
+      reduction(data_flat.template chip<0>(i), output.template chip<0>(j));
     }
   }
 };
-// UnsortedSegmentMaxFunctor implementation for CPUDevice.
-template <typename T, typename Index>
-struct UnsortedSegmentMaxFunctor<CPUDevice, T, Index>
-    : UnsortedSegmentBaseFunctor<CPUDevice, T, Index> {
-  void operator()(OpKernelContext* ctx, const CPUDevice& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
-                  typename TTypes<Index>::ConstFlat segment_ids,
-                  const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output) override {
-    output.setConstant(std::numeric_limits<T>::lowest());
-    if (data_size == 0) {
-      return;
-    }
-    const int64 N = segment_ids.dimension(0);
-    auto data_flat = typename TTypes<T, 2>::ConstTensor(data, N, data_size / N);
-    for (int64 i = 0; i < N; ++i) {
-      Index j = internal::SubtleMustCopy(segment_ids(i));
-      OP_REQUIRES(ctx, FastBoundsCheck(j, output_rows),
-                  errors::InvalidArgument(
-                      "segment_ids", SliceDebugString(segment_ids_shape, i),
-                      " = ", j, " is out of range [0, ", output_rows, ")"));
-      output.template chip<0>(j) =
-          data_flat.template chip<0>(i).cwiseMax(output.template chip<0>(j));
-    }
+
+template <typename T>
+using MatrixChip = Eigen::TensorChippingOp<0l, typename TTypes<T, 2>::Matrix>;
+
+template <typename T>
+using constMatrixChip =
+    Eigen::TensorChippingOp<0l, const typename TTypes<T, 2>::ConstMatrix>;
+
+// reduction functors
+template <typename T>
+struct SumOp {
+  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+    output += data;
+  }
+};
+
+template <typename T>
+struct MaxOp {
+  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+    output = data.cwiseMax(output);
+  }
+};
+
+template <typename T>
+struct MinOp {
+  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+    output = data.cwiseMin(output);
+  }
+};
+
+template <typename T>
+struct ProdOp {
+  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+    output *= data;
   }
 };
 }  // namespace functor
 
-// Base class for SegmentReductionOps that can handle unsorted segment
-// definitions
-// and specifying the size of the output in addition to a reduction function
-template <typename Device, class T, class Index>
-class UnsortedSegmentBaseOp : public OpKernel {
+// Static check routines not in the templated class to reduce code size
+static void UnsortedSegmentReductionValidation(OpKernel* op_kernel,
+                                               OpKernelContext* context,
+                                               const Tensor& data,
+                                               const Tensor& segment_ids,
+                                               const Tensor& num_segments) {
+  OP_REQUIRES(
+      context, op_kernel->IsLegacyScalar(num_segments.shape()),
+      errors::InvalidArgument("num_segments should be a scalar, not shape ",
+                              num_segments.shape().DebugString()));
+  OP_REQUIRES(
+      context, TensorShapeUtils::StartsWith(data.shape(), segment_ids.shape()),
+      errors::InvalidArgument("data.shape = ", data.shape().DebugString(),
+                              " does not start with segment_ids.shape = ",
+                              segment_ids.shape().DebugString()));
+}
+
+static bool UnsortedSegmentReductionDoValidation(OpKernel* op_kernel,
+                                                 OpKernelContext* context,
+                                                 const Tensor& data,
+                                                 const Tensor& segment_ids,
+                                                 const Tensor& num_segments) {
+  UnsortedSegmentReductionValidation(op_kernel, context, data, segment_ids,
+                                     num_segments);
+  return context->status().ok();
+}
+
+// The UnsortedSegmentReduction OpKernel. The DeviceReductionFunctor
+// is the device specific implementation of the reduction. These device
+// specific implementations are templated themselves with the corresponding
+// initial value functors and reduction functors.
+template <typename T, typename Index, typename DeviceReductionFunctor>
+class UnsortedSegmentReductionOp : public OpKernel {
  public:
-  explicit UnsortedSegmentBaseOp(
-      OpKernelConstruction* context,
-      functor::UnsortedSegmentBaseFunctor<Device, T, Index>& functor)
-      : OpKernel(context), reduction_functor_(functor) {}
+  explicit UnsortedSegmentReductionOp(OpKernelConstruction* context)
+      : OpKernel(context), reduction_functor_(DeviceReductionFunctor()) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& data = context->input(0);
     const Tensor& segment_ids = context->input(1);
     const Tensor& num_segments = context->input(2);
-
-    OP_REQUIRES(
-        context, IsLegacyScalar(num_segments.shape()),
-        errors::InvalidArgument("num_segments should be a scalar, not shape ",
-                                num_segments.shape().DebugString()));
-    OP_REQUIRES(
-        context,
-        TensorShapeUtils::StartsWith(data.shape(), segment_ids.shape()),
-        errors::InvalidArgument("data.shape = ", data.shape().DebugString(),
-                                " does not start with segment_ids.shape = ",
-                                segment_ids.shape().DebugString()));
-
+    if (!UnsortedSegmentReductionDoValidation(this, context, data, segment_ids,
+                                              num_segments)) {
+      return;
+    }
     const auto segment_flat = segment_ids.flat<Index>();
     const Index output_rows =
         internal::SubtleMustCopy(num_segments.scalar<int32>()());
     OP_REQUIRES(context, output_rows >= 0,
                 errors::InvalidArgument("Input num_segments == ", output_rows,
                                         " must not be negative."));
-
     TensorShape output_shape;
     output_shape.AddDim(output_rows);
     for (int i = segment_ids.dims(); i < data.dims(); i++) {
       output_shape.AddDim(data.dim_size(i));
     }
-
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
     auto output_flat = output->flat_outer_dims<T>();
-
     auto data_ptr = data.template flat<T>().data();
-    reduction_functor_(context, context->template eigen_device<Device>(),
-                       output_rows, segment_ids.shape(), segment_flat,
+    reduction_functor_(context, output_rows, segment_ids.shape(), segment_flat,
                        data.NumElements(), data_ptr, output_flat);
   }
 
- private:
-  functor::UnsortedSegmentBaseFunctor<Device, T, Index>& reduction_functor_;
-};
-
-template <typename Device, class T, class Index>
-class UnsortedSegmentSumOp : public UnsortedSegmentBaseOp<Device, T, Index> {
- public:
-  explicit UnsortedSegmentSumOp(OpKernelConstruction* context)
-      : UnsortedSegmentBaseOp<Device, T, Index>(context, sum_functor_) {}
-
- private:
-  functor::UnsortedSegmentSumFunctor<Device, T, Index> sum_functor_;
+ protected:
+  DeviceReductionFunctor reduction_functor_;
 };
 
-template <typename Device, class T, class Index>
-class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp<Device, T, Index> {
- public:
-  explicit UnsortedSegmentMaxOp(OpKernelConstruction* context)
-      : UnsortedSegmentBaseOp<Device, T, Index>(context, max_functor_) {}
-
- private:
-  functor::UnsortedSegmentMaxFunctor<Device, T, Index> max_functor_;
-};
-
-#define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type)                  \
-  REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum")                          \
-                              .Device(DEVICE_CPU)                             \
-                              .TypeConstraint<type>("T")                      \
-                              .TypeConstraint<index_type>("Tindices"),        \
-                          UnsortedSegmentSumOp<CPUDevice, type, index_type>); \
-  REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentMax")                          \
-                              .Device(DEVICE_CPU)                             \
-                              .TypeConstraint<type>("T")                      \
-                              .TypeConstraint<index_type>("Tindices"),        \
-                          UnsortedSegmentMaxOp<CPUDevice, type, index_type>);
-
-#define REGISTER_COMPLEX_CPU_UNSORTED_KERNELS(type, index_type)        \
-  REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum")                   \
-                              .Device(DEVICE_CPU)                      \
-                              .TypeConstraint<type>("T")               \
-                              .TypeConstraint<index_type>("Tindices"), \
-                          UnsortedSegmentSumOp<CPUDevice, type, index_type>);
+#define REGISTER_CPU_KERNEL_UNSORTEDSEGMENT(                           \
+    name, type, index_type, initial_value_functor, reduction_functor)  \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name(name)                                                       \
+          .Device(DEVICE_CPU)                                          \
+          .TypeConstraint<type>("T")                                   \
+          .TypeConstraint<index_type>("Tindices"),                     \
+      UnsortedSegmentReductionOp<                                      \
+          type, index_type,                                            \
+          functor::UnsortedSegmentFunctor<CPUDevice, type, index_type, \
+                                          initial_value_functor,       \
+                                          reduction_functor> >)
+
+#define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type)                   \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type,  \
+                                      functor::Zero<type>,                     \
+                                      functor::SumOp<type>);                   \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMax", type, index_type,  \
+                                      functor::Lowest<type>,                   \
+                                      functor::MaxOp<type>);                   \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMin", type, index_type,  \
+                                      functor::Highest<type>,                  \
+                                      functor::MinOp<type>);                   \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+                                      functor::One<type>,                      \
+                                      functor::ProdOp<type>);
+
+#define REGISTER_COMPLEX_CPU_UNSORTED_KERNELS(type, index_type)                \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type,  \
+                                      functor::Zero<type>,                     \
+                                      functor::SumOp<type>);                   \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+                                      functor::One<type>,                      \
+                                      functor::ProdOp<type>)
 
 #define REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL(type) \
   REGISTER_REAL_CPU_UNSORTED_KERNELS(type, int32);   \
@@ -520,31 +542,72 @@ class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp<Device, T, Index> {
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL);
 REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex64);
 REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex128);
+
 #undef REGISTER_REAL_CPU_UNSORTED_KERNELS
+#undef REGISTER_CPU_KERNEL_UNSORTEDSEGMENT
 #undef REGISTER_COMPLEX_CPU_UNSORTED_KERNELS
 #undef REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL
 #undef REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL
 
 #if GOOGLE_CUDA
-#define REGISTER_GPU_UNSORTED_KERNELS(type, index_type)                \
-  REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum")                   \
-                              .Device(DEVICE_GPU)                      \
-                              .HostMemory("num_segments")              \
-                              .TypeConstraint<type>("T")               \
-                              .TypeConstraint<index_type>("Tindices"), \
-                          UnsortedSegmentSumOp<GPUDevice, type, index_type>);
-
-#define REGISTER_GPU_UNSORTED_KERNELS_ALL(type) \
-  REGISTER_GPU_UNSORTED_KERNELS(type, int32);   \
-  REGISTER_GPU_UNSORTED_KERNELS(type, int64);
+#define REGISTER_GPU_KERNEL_UNSORTEDSEGMENT(                                 \
+    name, type, index_type, initial_value_functor, reduction_kernel_functor) \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name(name)                                                             \
+          .Device(DEVICE_GPU)                                                \
+          .HostMemory("num_segments")                                        \
+          .TypeConstraint<type>("T")                                         \
+          .TypeConstraint<index_type>("Tindices"),                           \
+      UnsortedSegmentReductionOp<                                            \
+          type, index_type,                                                  \
+          functor::UnsortedSegmentFunctor<GPUDevice, type, index_type,       \
+                                          initial_value_functor,             \
+                                          reduction_kernel_functor> >)
+
+// sum is the only op that supports all input types currently
+#define REGISTER_REAL_GPU_UNSORTED_KERNELS(type, index_type)                   \
+  REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMax", type, index_type,  \
+                                      functor::Lowest<type>,                   \
+                                      functor::MaxOpGpu<type>);                \
+  REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMin", type, index_type,  \
+                                      functor::Highest<type>,                  \
+                                      functor::MinOpGpu<type>);                \
+  REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+                                      functor::One<type>,                      \
+                                      functor::ProdOpGpu<type>);
+
+#define REGISTER_SUM_GPU_UNSORTED_KERNELS(type, index_type)                   \
+  REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type, \
+                                      functor::Zero<type>,                    \
+                                      functor::SumOpGpu<type>);
+
+#define REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL(type) \
+  REGISTER_REAL_GPU_UNSORTED_KERNELS(type, int32);   \
+  REGISTER_REAL_GPU_UNSORTED_KERNELS(type, int64);
+
+#define REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL(type) \
+  REGISTER_SUM_GPU_UNSORTED_KERNELS(type, int32);   \
+  REGISTER_SUM_GPU_UNSORTED_KERNELS(type, int64);
+
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_int32(REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_int32(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_complex64(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_complex128(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+
+#undef REGISTER_GPU_KERNEL_UNSORTEDSEGMENT
+#undef REGISTER_REAL_GPU_UNSORTED_KERNELS
+#undef REGISTER_SUM_GPU_UNSORTED_KERNELS
+#undef REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL
+#undef REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL
 
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-TF_CALL_complex64(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-TF_CALL_complex128(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-#undef REGISTER_GPU_UNSORTED_KERNELS
-#undef REGISTER_GPU_UNSORTED_KERNELS_ALL
 #endif  // GOOGLE_CUDA
 
+// ____________________________________________________________________________
+// Sparse segment reduction ops.
+
 // Same as SegmentReductionOp but takes as input a "sparse" tensor, represented
 // by two dense tensors, one containing the data, and the other containing
 // indices into the data.
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index 5c9cfe0906..51814273b3 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
-#define TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
@@ -46,59 +46,81 @@ struct SegmentSumFunctor {
                   const Index data_size, const T* data,
                   typename TTypes<T, 2>::Tensor output);
 };
-#endif
 
-// BaseFunctor for definition of UnsorteSegmentReductionOp
-// for usage without templates.
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentBaseFunctor {
-  virtual ~UnsortedSegmentBaseFunctor() {}
-  virtual void operator()(OpKernelContext* ctx, const Device& d,
-                          const Index output_rows,
-                          const TensorShape& segment_ids_shape,
-                          typename TTypes<Index>::ConstFlat segment_ids,
-                          const Index data_size, const T* data,
-                          typename TTypes<T, 2>::Tensor output){};
-};
+#endif
 
-// Functor for UnsortedSegmentSumOp.
-// output_rows: the number of output segments (unique segment ids in
-//                'segment_ids').
-// segment_ids_shape: shape of 'segment_ids' tensor.
-// segment_ids: unsorted map from input to output segment ids at which to
-//                perform segment sum operation.
-// data_size: size of input data tensor.
-// data: input data tensor.
-// output: output reshaped to {output_rows, output.size/output_rows}
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentSumFunctor
-    : public UnsortedSegmentBaseFunctor<Device, T, Index> {
-  void operator()(OpKernelContext* ctx, const Device& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
+template <typename Device, typename T, typename Index, typename InitialValueF,
+          typename ReductionF>
+struct UnsortedSegmentFunctor {
+  void operator()(OpKernelContext* ctx, const Index num_segments,
+                  const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
                   const Index data_size, const T* data,
                   typename TTypes<T, 2>::Tensor output);
 };
 
-// Functor for UnsortedSegmentMaxOp.
-// output_rows: the number of output segments (unique segment ids in
-//                'segment_ids').
-// segment_ids_shape: shape of 'segment_ids' tensor.
-// segment_ids: unsorted map from input to output segment ids at which to
-//                perform segment sum operation.
-// data_size: size of input data tensor.
-// data: input data tensor.
-// output: output reshaped to {output_rows, output.size/output_rows}
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentMaxFunctor
-    : public UnsortedSegmentBaseFunctor<Device, T, Index> {
-  void operator()(OpKernelContext* ctx, const Device& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
-                  typename TTypes<Index>::ConstFlat segment_ids,
-                  const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output);
+#ifdef GOOGLE_CUDA
+// reduction functors for the gpu
+template <typename T>
+struct SumOpGpu {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+                                                        const T& value) {
+    CudaAtomicAdd(dest, value);
+  }
+};
+
+template <typename T>
+struct ProdOpGpu {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+                                                        const T& value) {
+    CudaAtomicMul(dest, value);
+  }
+};
+
+template <typename T>
+struct MaxOpGpu {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+                                                        const T& value) {
+    CudaAtomicMax(dest, value);
+  }
+};
+
+template <typename T>
+struct MinOpGpu {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+                                                        const T& value) {
+    CudaAtomicMin(dest, value);
+  }
 };
+
+#endif  // GOOGLE_CUDA
+
+// initial value functors
+template <typename T>
+struct Zero {
+  EIGEN_STRONG_INLINE T operator()() const { return T(0); }
+};
+
+template <typename T>
+struct One {
+  EIGEN_STRONG_INLINE T operator()() const { return T(1); }
+};
+
+template <typename T>
+struct Lowest {
+  EIGEN_STRONG_INLINE T operator()() const {
+    return Eigen::NumTraits<T>::lowest();
+  }
+};
+
+template <typename T>
+struct Highest {
+  EIGEN_STRONG_INLINE T operator()() const {
+    return Eigen::NumTraits<T>::highest();
+  }
+};
+
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
index 39d520698e..ba979e6bb2 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
@@ -18,42 +18,15 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "tensorflow/core/kernels/segment_reduction_ops.h"
-
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/util/cuda_device_functions.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
+
 namespace tensorflow {
 
 using GPUDevice = Eigen::GpuDevice;
 
-// Helper for UnusortedSegmentSumCustomKernel that adds value into dest
-// atomically.
-template <typename T>
-static __device__ __forceinline__ void AccumulateInto(T* dest, const T& value) {
-  CudaAtomicAdd(dest, value);
-}
-
-// Specializations of AccumulateInto for complex types, which CudaAtomicAdd does
-// not support. We treat a std::complex<T>* as a T* (the C++ standard section
-// 26.4.4 allows this explicitly) and atomic add the real and imaginary
-// components individually. The operation as a whole is not atomic, but we can
-// safely treat the components independently for the purpose of accumulating.
-template <>
-__device__ __forceinline__ void AccumulateInto(
-    std::complex<float>* dest, const std::complex<float>& value) {
-  auto dest_scalar = reinterpret_cast<float*>(dest);
-  CudaAtomicAdd(dest_scalar, value.real());
-  CudaAtomicAdd(dest_scalar + 1, value.imag());
-}
-
-template <>
-__device__ __forceinline__ void AccumulateInto(
-    std::complex<double>* dest, const std::complex<double>& value) {
-  auto dest_scalar = reinterpret_cast<double*>(dest);
-  CudaAtomicAdd(dest_scalar, value.real());
-  CudaAtomicAdd(dest_scalar + 1, value.imag());
-}
-
 // SortedSegmentSumFunctor kernel reduces input data just as
 // UnsortedSegmentSumCustomKernel does except that input data
 // is partitioned along the outer reduction dimension. This is
@@ -81,7 +54,7 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
                                              const Index* segment_ids,
                                              const T* input, T* output,
                                              const Index total_stripe_count) {
-  CUDA_1D_KERNEL_LOOP(stripe_index, total_stripe_count) {
+  for (int stripe_index : CudaGridRangeX(total_stripe_count)) {
     const Index segment_offset = stripe_index % inner_dim_size;
     const Index input_outer_dim_index_base =
         stripe_index / inner_dim_size * Index(OuterDimTileSize);
@@ -106,7 +79,7 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
         // decide whether to write result to global memory using atomic
         // operations
         if (last_output_segment_id == first_segment_id) {
-          AccumulateInto<T>(output + output_index, sum);
+          CudaAtomicAdd(output + output_index, sum);
         } else {
           *(output + output_index) = sum;
         }
@@ -121,31 +94,31 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
     // the following strip.
     const Index output_index =
         last_output_segment_id * inner_dim_size + segment_offset;
-    AccumulateInto<T>(output + output_index, sum);
+    CudaAtomicAdd(output + output_index, sum);
   }
 }
 
-// UnsortedSegmentSumFunctor kernel processes 'input_total_size' elements.
+// UnsortedSegmentSumKernel processes 'input_total_size' elements.
 // Each element is mapped from input to output by a combination of its
 // 'segment_ids' mapping and 'inner_dim_size'.
-template <typename T, typename Index>
-__global__ void UnsortedSegmentSumCustomKernel(
-    const Index input_outer_dim_size, const Index inner_dim_size,
-    const Index output_outer_dim_size, const Index* segment_ids, const T* input,
-    T* output) {
+template <typename T, typename Index, typename KernelReductionFunctor>
+__global__ void UnsortedSegmentCustomKernel(const Index input_outer_dim_size,
+                                            const Index inner_dim_size,
+                                            const Index output_outer_dim_size,
+                                            const Index* segment_ids,
+                                            const T* input, T* output) {
   const Index input_total_size = input_outer_dim_size * inner_dim_size;
   const Index output_total_size = output_outer_dim_size * inner_dim_size;
-  CUDA_1D_KERNEL_LOOP(input_index, input_total_size) {
+  for (int input_index : CudaGridRangeX(input_total_size)) {
     const Index input_segment_index = input_index / inner_dim_size;
     const Index segment_offset = input_index % inner_dim_size;
     const Index output_segment_index = segment_ids[input_segment_index];
-
     if (output_segment_index < 0 || output_segment_index >= output_total_size) {
       continue;
     }
     const Index output_index =
         output_segment_index * inner_dim_size + segment_offset;
-    AccumulateInto<T>(output + output_index, ldg(input + input_index));
+    KernelReductionFunctor()(output + output_index, ldg(input + input_index));
   }
 }
 
@@ -190,41 +163,39 @@ void SegmentSumFunctor<T, Index>::operator()(
       <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
           input_outer_dim_size, input_inner_dim_size, output_rows,
           segment_ids.data(), data, output.data(), total_stripe_count);
-};
+}
 
-// UnsortedSegmentSumFunctor implementation for GPUDevice.
-template <typename T, typename Index>
-struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
-    : UnsortedSegmentBaseFunctor<GPUDevice, T, Index> {
-  void operator()(OpKernelContext* ctx, const GPUDevice& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
+template <typename T, typename Index, typename InitialValueF,
+          typename ReductionF>
+struct UnsortedSegmentFunctor<GPUDevice, T, Index, InitialValueF, ReductionF> {
+  void operator()(OpKernelContext* ctx, const Index num_segments,
+                  const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
                   const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output) override {
+                  typename TTypes<T, 2>::Tensor output) {
     if (output.size() == 0) {
       return;
     }
-    // Set 'output' to zeros.
+    // Set 'output' to initial value.
+    GPUDevice d = ctx->template eigen_device<GPUDevice>();
     CudaLaunchConfig config = GetCudaLaunchConfig(output.size(), d);
-    SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        output.size(), output.data());
+    SetToValue<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        output.size(), output.data(), InitialValueF()());
     if (data_size == 0 || segment_ids_shape.num_elements() == 0) {
       return;
     }
-
-    // Launch kernel to compute unsorted segment sum.
+    // Launch kernel to compute unsorted segment reduction.
     // Notes:
-    // *) 'input_total_size' is the total number of elements to process.
+    // *) 'data_size' is the total number of elements to process.
     // *) 'segment_ids.shape' is a prefix of data's shape.
     // *) 'input_outer_dim_size' is the total number of segments to process.
-    const Index input_total_size = data_size;
     const Index input_outer_dim_size = segment_ids.dimension(0);
-    const Index input_inner_dim_size = input_total_size / input_outer_dim_size;
+    const Index input_inner_dim_size = data_size / input_outer_dim_size;
+    config = GetCudaLaunchConfig(data_size, d);
 
-    config = GetCudaLaunchConfig(input_total_size, d);
-    UnsortedSegmentSumCustomKernel<T, Index>
+    UnsortedSegmentCustomKernel<T, Index, ReductionF>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            input_outer_dim_size, input_inner_dim_size, output_rows,
+            input_outer_dim_size, input_inner_dim_size, num_segments,
             segment_ids.data(), data, output.data());
   }
 };
@@ -238,19 +209,40 @@ struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_SORTED_GPU_SPECS);
 
-#define DEFINE_GPU_SPECS_INDEX(T, Index) \
-  template struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
-
-#define DEFINE_GPU_SPECS(T)         \
-  DEFINE_GPU_SPECS_INDEX(T, int32); \
-  DEFINE_GPU_SPECS_INDEX(T, int64);
-
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
-TF_CALL_complex64(DEFINE_GPU_SPECS);
-TF_CALL_complex128(DEFINE_GPU_SPECS);
-
-#undef DEFINE_GPU_SPECS
-#undef DEFINE_GPU_SPECS_INDEX
+#define DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, Index)                         \
+  template struct UnsortedSegmentFunctor<                                      \
+      GPUDevice, T, Index, functor::Lowest<T>, functor::MaxOpGpu<T>>;          \
+  template struct UnsortedSegmentFunctor<                                      \
+      GPUDevice, T, Index, functor::Highest<T>, functor::MinOpGpu<T>>;         \
+  template struct UnsortedSegmentFunctor<GPUDevice, T, Index, functor::One<T>, \
+                                         functor::ProdOpGpu<T>>;
+
+// sum is the only op that supports all input types currently
+#define DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, Index) \
+  template struct UnsortedSegmentFunctor<             \
+      GPUDevice, T, Index, functor::Zero<T>, functor::SumOpGpu<T>>;
+
+#define DEFINE_REAL_GPU_SPECS(T)                  \
+  DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, int32); \
+  DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, int64);
+
+#define DEFINE_SUM_GPU_SPECS(T)                  \
+  DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, int32); \
+  DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, int64);
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_REAL_GPU_SPECS);
+TF_CALL_int32(DEFINE_REAL_GPU_SPECS);
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_SUM_GPU_SPECS);
+TF_CALL_int32(DEFINE_SUM_GPU_SPECS);
+TF_CALL_complex64(DEFINE_SUM_GPU_SPECS);
+TF_CALL_complex128(DEFINE_SUM_GPU_SPECS);
+
+#undef DEFINE_SORTED_GPU_SPECS_INDEX
+#undef DEFINE_SORTED_GPU_SPECS
+#undef DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX
+#undef DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX
+#undef DEFINE_REAL_GPU_SPECS
+#undef DEFINE_SUM_GPU_SPECS
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 872ebe98c1..8f33d51d5a 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1065,6 +1065,26 @@ REGISTER_OP("UnsortedSegmentMax")
     .Attr("Tnumsegments: {int32,int64} = DT_INT32")
     .SetShapeFn(UnsortedSegmentReductionShapeFn);
 
+REGISTER_OP("UnsortedSegmentMin")
+    .Input("data: T")
+    .Input("segment_ids: Tindices")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: realnumbertype")
+    .Attr("Tindices: {int32,int64}")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(UnsortedSegmentReductionShapeFn);
+
+REGISTER_OP("UnsortedSegmentProd")
+    .Input("data: T")
+    .Input("segment_ids: Tindices")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: realnumbertype")
+    .Attr("Tindices: {int32,int64}")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(UnsortedSegmentReductionShapeFn);
+
 REGISTER_OP("SparseSegmentSum")
     .Input("data: T")
     .Input("indices: Tidx")
diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h
index 525bef16a0..f2d4e470c8 100644
--- a/tensorflow/core/util/cuda_device_functions.h
+++ b/tensorflow/core/util/cuda_device_functions.h
@@ -28,6 +28,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <complex>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "cuda/include/cuda.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -389,6 +390,17 @@ __global__ void SetZero(const int count, T* ptr) {
   }
 }
 
+// Helper to set all tensor entries to a specific value.
+template <typename T>
+__global__ void SetToValue(const int count, T* ptr, T value) {
+  // Check that the grid is one dimensional and index doesn't overflow.
+  assert(blockDim.y == 1 && blockDim.z == 1);
+  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
+  for (int i : CudaGridRangeX(count)) {
+    ptr[i] = value;
+  }
+}
+
 namespace detail {
 // Helper function for atomic accumulation implemented as CAS.
 template <typename T, typename F>
@@ -420,6 +432,47 @@ __device__ double CudaAtomicCasHelper(double* ptr, F accumulate) {
       }));
 }
 
+// Overload of above function for half. Note that we don't have
+// atomicCAS() for anything less than 32 bits, so we need to include the
+// other 16 bits in the operation.
+//
+// This version is going to be very slow
+// under high concurrency, since most threads will be spinning on failing
+// their compare-and-swap tests. (The fact that we get false sharing on the
+// neighboring fp16 makes this even worse.) If you are doing a large reduction,
+// you are much better off with doing the intermediate steps in fp32 and then
+// switching to fp16 as late as you can in the calculations.
+//
+// Note: Assumes little endian.
+template <typename F>
+__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
+  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
+#endif
+  namespace half_impl = Eigen::half_impl;
+  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
+  assert(!(intptr & 0x1));  // should be 2-aligned.
+  if (intptr & 0x2) {
+    // The half is in the second part of the uint32 (upper 16 bits).
+    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
+    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+      unsigned short high = static_cast<unsigned short>(arg >> 16);
+      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
+      return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
+    });
+    return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
+  } else {
+    // The half is in the first part of the uint32 (lower 16 bits).
+    uint32* address = reinterpret_cast<uint32*>(intptr);
+    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
+      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
+      return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
+    });
+    return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
+  }
+}
+
 template <typename From, typename To>
 using ToTypeIfConvertible =
     typename std::enable_if<std::is_convertible<From, To>::value, To>::type;
@@ -433,6 +486,14 @@ template <typename T, typename U>
 __device__ detail::ToTypeIfConvertible<U, T> CudaAtomicAdd(T* ptr, U value) {
   return atomicAdd(ptr, value);
 }
+
+__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return a + value; });
+}
+
+
 #if __CUDA_ARCH__ < 600
 __device__ inline double CudaAtomicAdd(double* ptr, double value) {
   return detail::CudaAtomicCasHelper(ptr,
@@ -450,27 +511,74 @@ __device__ inline double CudaAtomicAdd(double* ptr, double value) {
   return result;
 }
 #endif
-
+// CudaAtomicAdd
+// Specializations of CudaAtomicAdd for complex types, which CudaAtomicAdd does
+// not support. We treat a std::complex<T>* as a T* (the C++ standard section
+// 26.4.4 allows this explicitly) and atomic add the real and imaginary
+// components individually. The operation as a whole is not atomic, but we can
+// safely treat the components independently for the purpose of accumulating.
+__device__ inline std::complex<float> CudaAtomicAdd(std::complex<float>* ptr,
+                                                    std::complex<float> value) {
+  auto ptr_scalar = reinterpret_cast<float*>(ptr);
+  return std::complex<float>(CudaAtomicAdd(ptr_scalar, value.real()),
+                             CudaAtomicAdd(ptr_scalar + 1, value.imag()));
+}
+
+__device__ inline std::complex<double> CudaAtomicAdd(
+    std::complex<double>* ptr, std::complex<double> value) {
+  auto ptr_scalar = reinterpret_cast<double*>(ptr);
+  return std::complex<double>(CudaAtomicAdd(ptr_scalar, value.real()),
+                              CudaAtomicAdd(ptr_scalar + 1, value.imag()));
+}
+
+// CudaAtomicSub
 template <typename T, typename U>
 __device__ detail::ToTypeIfConvertible<U, T> CudaAtomicSub(T* ptr, U value) {
   return atomicSub(ptr, value);
 }
+
 // Specializations of substraction which add the negative value.
 __device__ inline float CudaAtomicSub(float* ptr, float value) {
   return CudaAtomicAdd(ptr, -value);
 }
+
 __device__ inline double CudaAtomicSub(double* ptr, double value) {
   return CudaAtomicAdd(ptr, -value);
 }
+
 __device__ inline tensorflow::uint64 CudaAtomicSub(tensorflow::uint64* ptr,
                                                    tensorflow::uint64 value) {
   return CudaAtomicAdd(ptr, -value);
 }
 
+__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return a - value; });
+}
+
+// CudaAtomicMax
 template <typename T, typename U>
 __device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMax(T* ptr, U value) {
   return atomicMax(ptr, value);
 }
+
+__device__ inline float CudaAtomicMax(float* ptr, float value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](float a) { return max(a, value); });
+}
+
+__device__ inline double CudaAtomicMax(double* ptr, double value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](double a) { return max(a, value); });
+}
+
+__device__ inline Eigen::half CudaAtomicMax(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return max(a, value); });
+}
+
 #if __CUDA_ARCH__ < 320
 __device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
                                                    tensorflow::uint64 value) {
@@ -479,10 +587,43 @@ __device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
 }
 #endif
 
+// CudaAtomicMin
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMin(T* ptr, U value) {
+  return atomicMin(ptr, value);
+}
+
+__device__ inline float CudaAtomicMin(float* ptr, float value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](float a) { return min(a, value); });
+}
+
+__device__ inline double CudaAtomicMin(double* ptr, double value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](double a) { return min(a, value); });
+}
+
+__device__ inline Eigen::half CudaAtomicMin(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return min(a, value); });
+}
+
+#if __CUDA_ARCH__ < 320
+__device__ inline tensorflow::uint64 CudaAtomicMin(tensorflow::uint64* ptr,
+                                                   tensorflow::uint64 value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](tensorflow::uint64 a) { return min(a, value); });
+}
+#endif
+
+// CudaAtomicMul
 template <typename T, typename U>
 __device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMul(T* ptr, U value) {
   return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a * value; });
 }
+
+// CudaAtomicDiv
 template <typename T, typename U>
 __device__ detail::ToTypeIfConvertible<U, T> CudaAtomicDiv(T* ptr, U value) {
   return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a / value; });
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 18a4c008f1..3c59524cb6 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -90,60 +90,6 @@ __device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXorSync(
       CudaShuffleXorSync(mask, static_cast<uint16>(value), lane_mask, width));
 }
 
-namespace detail {
-// Overload of above function for half. Note that we don't have
-// atomicCAS() for anything less than 32 bits, so we need to include the
-// other 16 bits in the operation.
-//
-// This version is going to be very slow
-// under high concurrency, since most threads will be spinning on failing
-// their compare-and-swap tests. (The fact that we get false sharing on the
-// neighboring fp16 makes this even worse.) If you are doing a large reduction,
-// you are much better off with doing the intermediate steps in fp32 and then
-// switching to fp16 as late as you can in the calculations.
-//
-// Note: Assumes little endian.
-template <typename F>
-__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
-#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
-  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
-#endif
-  namespace half_impl = Eigen::half_impl;
-  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
-  assert(!(intptr & 0x1));  // should be 2-aligned.
-  if (intptr & 0x2) {
-    // The half is in the second part of the uint32 (upper 16 bits).
-    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
-    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
-      unsigned short high = static_cast<unsigned short>(arg >> 16);
-      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
-      return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
-    });
-    return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
-  } else {
-    // The half is in the first part of the uint32 (lower 16 bits).
-    uint32* address = reinterpret_cast<uint32*>(intptr);
-    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
-      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
-      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
-      return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
-    });
-    return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
-  }
-}
-}  // namespace detail
-
-__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
-                                            Eigen::half value) {
-  return detail::CudaAtomicCasHelper(
-      ptr, [value](Eigen::half a) { return a + value; });
-}
-__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
-                                            Eigen::half value) {
-  return detail::CudaAtomicCasHelper(
-      ptr, [value](Eigen::half a) { return a - value; });
-}
-
 namespace cuda_helper {
 template <typename IntType>
 __device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index d79d1fc0a6..898f5e90f3 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -157,6 +157,8 @@ _ops_which_dont_need_outputs = set([
     "SegmentMax",
     "UnsortedSegmentSum",
     "UnsortedSegmentMax",
+    "UnsortedSegmentMin",
+    "UnsortedSegmentProd",
     "Abs",
     "Neg",
     "ReciprocalGrad",
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 5a54f448d0..bbce6b7d47 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -46,7 +46,8 @@ class SegmentReductionHelper(test.TestCase):
     return constant_op.constant(
         np_values, shape=input_shape, dtype=dtype), np_values
 
-  def _segmentReduce(self, indices, x, op1, op2=None, num_segments=None):
+  def _segmentReduce(self, indices, x, op1, op2=None, num_segments=None,
+                     initial_value=0):
     if not x.size:
       return np.array([])
     indices = np.asarray(indices)
@@ -64,13 +65,8 @@ class SegmentReductionHelper(test.TestCase):
       else:
         output[index] = x_flat[i]
     # zero initialize values that are still uncalcuated.
-    # output = [o if o is not None else np.zeros(slice_shape) for o in output]
-    if not op1 == np.max:
-      output = [o if o is not None else np.zeros(slice_shape) for o in output]
-    else:
-      zeroslice = np.zeros(slice_shape)
-      zeroslice.fill(dtype.min)
-      output = [o if o is not None else zeroslice for o in output]
+    initial_value_slice = np.ones(slice_shape) * initial_value
+    output = [o if o is not None else initial_value_slice for o in output]
     if op2 is not None:
       output = [op2(o) for o in output]
     output = [o.reshape(slice_shape) for o in output]
@@ -82,6 +78,9 @@ class SegmentReductionHelper(test.TestCase):
   def _mean_reduce_op(self, x):
     return x[0] / x[1] if isinstance(x, tuple) else x
 
+  def _sqrt_n_reduce_op(self, x):
+    return x[0] / np.sqrt(x[1]) if isinstance(x, tuple) else x
+
 
 class SegmentReductionOpTest(SegmentReductionHelper):
 
@@ -244,27 +243,61 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       self.assertAllClose(jacob_t, jacob_n)
 
 
-class UnsortedSegmentSumTest(SegmentReductionHelper):
+class UnsortedSegmentTest(SegmentReductionHelper):
+
+  def __init__(self, methodName='runTest'):
+    # Each item is np_op1, np_op2, tf_op, initial_value functor
+    self.ops_list = [(np.add, None,
+                      math_ops.unsorted_segment_sum, lambda t: 0),
+                     (self._mean_cum_op, self._mean_reduce_op,
+                      math_ops.unsorted_segment_mean, lambda t: 0),
+                     (self._mean_cum_op, self._sqrt_n_reduce_op,
+                      math_ops.unsorted_segment_sqrt_n, lambda t: 0),
+                     (np.ndarray.__mul__, None,
+                      math_ops.unsorted_segment_prod, lambda t: 1),
+                     (np.minimum, None,
+                      math_ops.unsorted_segment_min, lambda t: t.max),
+                     (np.maximum, None,
+                      math_ops.unsorted_segment_max, lambda t: t.min)]
+
+    # A subset of ops has been enabled for complex numbers
+    self.complex_ops_list = [(np.add, None,
+                              math_ops.unsorted_segment_sum, lambda t: 0)]
+    self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32,
+                                  dtypes_lib.float64]
+    self.all_dtypes = (self.differentiable_dtypes +
+                       [dtypes_lib.bfloat16,
+                        dtypes_lib.int64, dtypes_lib.int32,
+                        dtypes_lib.complex64, dtypes_lib.complex128])
+    super(UnsortedSegmentTest, self).__init__(methodName=methodName)
 
   def testValues(self):
-    dtypes = [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int64,
-        dtypes_lib.int32, dtypes_lib.complex64, dtypes_lib.complex128
-    ]
     indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
     num_segments = 12
     for indices in indices_flat, indices_flat.reshape(5, 2):
       shape = indices.shape + (2,)
-      for dtype in dtypes:
-        with self.test_session(use_gpu=True):
-          tf_x, np_x = self._input(shape, dtype=dtype)
-          np_ans = self._segmentReduce(
-              indices, np_x, np.add, op2=None, num_segments=num_segments)
-          s = math_ops.unsorted_segment_sum(
-              data=tf_x, segment_ids=indices, num_segments=num_segments)
-          tf_ans = s.eval()
-        self.assertAllClose(np_ans, tf_ans)
-        self.assertShapeEqual(np_ans, s)
+      for dtype in self.all_dtypes:
+        ops_list = self.complex_ops_list if dtype.is_complex else self.ops_list
+        tf_x, np_x = self._input(shape, dtype=dtype)
+        for use_gpu in [True, False]:
+          with self.test_session(use_gpu=True):
+            for np_op1, np_op2, tf_op, init_op in ops_list:
+              # sqrt_n doesn't support integers
+              if (np_op2 == self._sqrt_n_reduce_op and dtype.is_integer):
+                continue
+              # todo(philjd): enable this test once real_div supports bfloat16
+              if (np_op2 in [self._sqrt_n_reduce_op, self._mean_reduce_op] and
+                  dtype == dtypes_lib.bfloat16):
+                continue
+              np_ans = self._segmentReduce(
+                  indices, np_x, np_op1, np_op2, num_segments=num_segments,
+                  initial_value=init_op(dtype))
+              s = tf_op(tf_x, segment_ids=indices, num_segments=num_segments)
+              tf_ans = s.eval()
+              if dtype is dtypes_lib.bfloat16:
+                tf_ans = tf_ans.astype(np.float32)
+              self.assertAllClose(np_ans, tf_ans)
+              self.assertShapeEqual(np_ans, s)
 
   def testNumSegmentsTypes(self):
     dtypes = [dtypes_lib.int32, dtypes_lib.int64]
@@ -287,25 +320,51 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
         self.assertAllClose(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, s)
 
-  def testGradientSegmentSum(self):
+  def testGradients(self):
     num_cols = 2
-    indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
+    indices_flat = np.array([0, 4, 0, -1, 3, -1, 4, 7, 7, 3])
     num_segments = max(indices_flat) + 3
-    for dtype in [dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64,
-                  dtypes_lib.complex128]:
+    for dtype in self.differentiable_dtypes:
+      ops_list = self.complex_ops_list if dtype.is_complex else self.ops_list
       for indices in indices_flat, indices_flat.reshape(5, 2):
         shape = indices.shape + (num_cols,)
-        with self.test_session(use_gpu=True):
-          tf_x, np_x = self._input(shape, dtype=dtype)
-          s = math_ops.unsorted_segment_sum(
-              data=tf_x, segment_ids=indices, num_segments=num_segments)
+        # test CPU and GPU as tf.gather behaves differently on each device
+        for use_gpu in [False, True]:
+          with self.test_session(use_gpu=use_gpu):
+            for _, _, tf_op, _ in ops_list:
+              tf_x, np_x = self._input(shape, dtype=dtype)
+              s = tf_op(tf_x, indices, num_segments)
+              jacob_t, jacob_n = gradient_checker.compute_gradient(
+                  tf_x,
+                  shape,
+                  s, [num_segments, num_cols],
+                  x_init_value=np_x,
+                  delta=1)
+            self.assertAllClose(jacob_t, jacob_n)
+
+  def testProdGrad(self):
+    # additional test for the prod gradient to ensure correct handling of zeros
+    values = np.array([0, 0, 1, 0, 2, 2, 3, 3, 3], dtype=np.float32)
+    indices = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2], dtype=np.int32)
+    indices_neg = np.array([-1, 0, 0, -1, 1, 1, -1, 2, 2], dtype=np.int32)
+    values_tf = constant_op.constant(values)
+    # ground truth partial derivatives
+    gradients_indices = np.zeros((9, 3), dtype=np.float32)
+    gradients_indices_neg = np.zeros((9, 3), dtype=np.float32)
+    # the derivative w.r.t. to the other segments is zero, so here we only
+    # explicitly set the grad values for the corresponding segment
+    gradients_indices[range(9), indices] = [0, 0, 0, 4, 0, 0, 9, 9, 9]
+    gradients_indices_neg[range(9), indices_neg] = [0, 1, 0, 0, 2, 2, 0, 3, 3]
+    for use_gpu in [False, True]:
+      with self.test_session(use_gpu=use_gpu):
+        for ind, grad_gt in [(indices, gradients_indices),
+                             (indices_neg, gradients_indices_neg)]:
+          s = math_ops.unsorted_segment_prod(values_tf,
+                                             constant_op.constant(ind), 3)
           jacob_t, jacob_n = gradient_checker.compute_gradient(
-              tf_x,
-              shape,
-              s, [num_segments, num_cols],
-              x_init_value=np_x,
-              delta=1)
-        self.assertAllClose(jacob_t, jacob_n)
+              values_tf, (9,), s, (3,), x_init_value=values, delta=1)
+          self.assertAllClose(jacob_t, jacob_n)
+          self.assertAllClose(jacob_t, grad_gt)
 
   def testGradientMatchesSegmentSum(self):
     # Strategy: compute the gradient for UnsortedSegmentSum and SegmentSum
@@ -318,8 +377,7 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
     num_cols = 2
     shape = [n, num_cols]
     num_segments = max(indices) + 1
-    for dtype in [dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64,
-                  dtypes_lib.complex128]:
+    for dtype in self.differentiable_dtypes:
       with self.test_session(use_gpu=True):
         tf_x, np_x = self._input(shape, dtype=dtype)
         # Results from UnsortedSegmentSum
@@ -353,9 +411,8 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
           unsorted.eval()
 
   def testEmptySecondDimension(self):
-    dtypes = [
-        np.float32, np.float64, np.int64, np.int32, np.complex64, np.complex128
-    ]
+    dtypes = [np.float16, np.float32, np.float64, np.int64, np.int32,
+              np.complex64, np.complex128]
     with self.test_session(use_gpu=True):
       for dtype in dtypes:
         for itype in (np.int32, np.int64):
@@ -364,36 +421,14 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
           unsorted = math_ops.unsorted_segment_sum(data, segment_ids, 2)
           self.assertAllEqual(unsorted.eval(), np.zeros((2, 0), dtype=dtype))
 
-  def testGradientSegmentMax(self):
-    num_cols = 2
-    indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
-    num_segments = max(indices_flat) + 3
-    for indices in indices_flat, indices_flat.reshape(5, 2):
-      shape = indices.shape + (num_cols,)
-      with self.test_session(use_gpu=True):
-        tf_x, np_x = self._input(shape, dtype=dtypes_lib.float64)
-        s = math_ops.unsorted_segment_max(
-            data=tf_x, segment_ids=indices, num_segments=num_segments)
-        jacob_t, jacob_n = gradient_checker.compute_gradient(
-            tf_x,
-            shape,
-            s,
-            [num_segments, num_cols],
-            x_init_value=np_x.astype(np.double), delta=1)
-      self.assertAllClose(jacob_t, jacob_n)
-
   def testDropNegatives(self):
     # Note: the test is done by replacing segment_ids with 8 to -1
     # for index  and replace values generated by numpy with 0.
-    dtypes = [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int64,
-        dtypes_lib.int32, dtypes_lib.complex64, dtypes_lib.complex128
-    ]
     indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
     num_segments = 12
     for indices in indices_flat, indices_flat.reshape(5, 2):
       shape = indices.shape + (2,)
-      for dtype in dtypes:
+      for dtype in self.all_dtypes:
         with self.test_session(use_gpu=True):
           tf_x, np_x = self._input(shape, dtype=dtype)
           np_ans = self._segmentReduce(
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 53308484c4..c6cc4e1860 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -228,56 +228,142 @@ def _SparseSegmentSqrtNWithNumSegmentsGrad(op, grad):
                                               dim0), None, None, None)
 
 
-def _SegmentMinOrMaxGrad(op, grad, is_sorted):
-  """Gradient for SegmentMin and (unsorted) SegmentMax.
-
-  They share similar code.
-  """
-  zeros = array_ops.zeros(
-      array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype)
-
+def _SegmentMinOrMaxGrad(op, grad):
+  """ Gradient for SegmentMin and SegmentMax. """
+  zeros = array_ops.zeros_like(op.inputs[0], dtype=op.inputs[0].dtype)
   # Get the number of selected (minimum or maximum) elements in each segment.
   gathered_outputs = array_ops.gather(op.outputs[0], op.inputs[1])
   is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
-  if is_sorted:
-    num_selected = math_ops.segment_sum(
-        math_ops.cast(is_selected, grad.dtype), op.inputs[1])
-  else:
-    num_selected = math_ops.unsorted_segment_sum(
-        math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
-
+  num_selected = math_ops.segment_sum(math_ops.cast(is_selected, grad.dtype),
+                                      op.inputs[1])
   # Compute the gradient for each segment. The gradient for the ith segment is
   # divided evenly among the selected elements in that segment.
   weighted_grads = math_ops.div(grad, num_selected)
   gathered_grads = array_ops.gather(weighted_grads, op.inputs[1])
-
-  if is_sorted:
-    return array_ops.where(is_selected, gathered_grads, zeros), None
-  else:
-    return array_ops.where(is_selected, gathered_grads, zeros), None, None
+  return array_ops.where(is_selected, gathered_grads, zeros), None
 
 
 @ops.RegisterGradient("SegmentMin")
 def _SegmentMinGrad(op, grad):
   """Gradient for SegmentMin."""
-  return _SegmentMinOrMaxGrad(op, grad, True)
+  return _SegmentMinOrMaxGrad(op, grad)
 
 
 @ops.RegisterGradient("SegmentMax")
 def _SegmentMaxGrad(op, grad):
   """Gradient for SegmentMax."""
-  return _SegmentMinOrMaxGrad(op, grad, True)
+  return _SegmentMinOrMaxGrad(op, grad)
+
+
+def _GatherDropNegatives(params, ids, zero_clipped_indices=None,
+                         is_positive=None):
+  """ Helper function for unsorted segment ops. Gathers params for
+      positive segment ids and gathers 0 for inputs with negative segment id.
+      Also returns the clipped indices and a boolean mask with the same shape
+      as ids where a positive id is masked as true. With this, the latter two
+      can be passed as arguments to this function to reuse them.
+  """
+  if zero_clipped_indices is None:
+    zero_clipped_indices = math_ops.maximum(ids, array_ops.zeros_like(ids))
+  gathered = array_ops.gather(params, zero_clipped_indices)
+  if is_positive is None:
+    is_positive = math_ops.greater_equal(ids, 0)
+    # tf.where(condition, x, y) requires condition to have the same shape as x
+    # and y.
+    # todo(philjd): remove this if tf.where supports broadcasting (#9284)
+    for _ in range(gathered.shape.ndims - is_positive.shape.ndims):
+      is_positive = array_ops.expand_dims(is_positive, -1)
+    is_positive = (is_positive &
+                   array_ops.ones_like(gathered, dtype=dtypes.bool))
+  # replace gathered params of negative indices with 0
+  zero_slice = array_ops.zeros_like(gathered)
+  return (array_ops.where(is_positive, gathered, zero_slice),
+          zero_clipped_indices, is_positive)
+
+
+def _UnsortedSegmentMinOrMaxGrad(op, grad):
+  """ Gradient for UnsortedSegmentMin and UnsortedSegmentMax. """
+  # Get the number of selected (minimum or maximum) elements in each segment.
+  gathered_outputs, zero_clipped_indices, is_positive = \
+      _GatherDropNegatives(op.outputs[0], op.inputs[1])
+  is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
+  is_selected = math_ops.logical_and(is_selected, is_positive)
+  num_selected = math_ops.unsorted_segment_sum(
+      math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
+  # Compute the gradient for each segment. The gradient for the ith segment is
+  # divided evenly among the selected elements in that segment.
+  weighted_grads = math_ops.div(grad, num_selected)
+  gathered_grads, _, _ = _GatherDropNegatives(weighted_grads, None,
+                                              zero_clipped_indices,
+                                              is_positive)
+  zeros = array_ops.zeros_like(gathered_grads)
+  return array_ops.where(is_selected, gathered_grads, zeros), None, None
 
 
 @ops.RegisterGradient("UnsortedSegmentSum")
 def _UnsortedSegmentSumGrad(op, grad):
-  """Gradient for SegmentSum."""
-  return array_ops.gather(grad, op.inputs[1]), None, None
+  """Gradient for UnsortedSegmentSum."""
+  return _GatherDropNegatives(grad, op.inputs[1])[0], None, None
 
 
 @ops.RegisterGradient("UnsortedSegmentMax")
 def _UnsortedSegmentMaxGrad(op, grad):
-  return _SegmentMinOrMaxGrad(op, grad, False)
+  """ Gradient for UnsortedSegmentMax. """
+  return _UnsortedSegmentMinOrMaxGrad(op, grad)
+
+
+@ops.RegisterGradient("UnsortedSegmentMin")
+def _UnsortedSegmentMinGrad(op, grad):
+  """ Gradient for UnsortedSegmentMin. """
+  return _UnsortedSegmentMinOrMaxGrad(op, grad)
+
+
+@ops.RegisterGradient("UnsortedSegmentProd")
+def _UnsortedSegmentProdGrad(op, grad):
+  """ Gradient for UnsortedSegmentProd.
+  The gradient can be expressed for each segment by dividing the segment's
+  product by each element of the segment input tensor, but this approach can't
+  deal with zeros in the input.
+  Unlike reduce_prod we can't use cumsum here as individual segments may have
+  a different number of elements. Therefore we consider three cases:
+  1) A segment input contains no zeros and we can safely divide by the input
+     tensor.
+  2) A segment contains exactly one zero. Then the gradient of each input of
+     the segment is zero except for the 0-input, there the gradient is
+     the product of the remaining segment entries.
+  3) A segment contains at least two zeros. The gradient is zero for all
+     segment inputs.
+  """
+  # Note that unsorted_segment_sum will filter out the negative indices,
+  # so we don't need to do a logical_and with is_positive here
+  is_zero = math_ops.equal(op.inputs[0], 0)
+  num_zeros = gen_math_ops.unsorted_segment_sum(
+      math_ops.cast(is_zero, dtype=dtypes.int32), op.inputs[1], op.inputs[2])
+  # handle case 3 and set the gradient to 0 for segments with more than one
+  # 0 as input
+  grad = array_ops.where(math_ops.greater(num_zeros, 1),
+                         array_ops.zeros_like(grad), grad)
+  # replace all zeros with ones and compute the unsorted_segment_prod
+  non_zero_data = array_ops.where(is_zero, array_ops.ones_like(op.inputs[0]),
+                                  op.inputs[0])
+  non_zero_prod = gen_math_ops.unsorted_segment_prod(
+      non_zero_data, op.inputs[1], op.inputs[2])
+  # clip the indices for gather to be positive
+  zero_clipped_indices = math_ops.maximum(op.inputs[1],
+                                          array_ops.zeros_like(op.inputs[1]))
+  gathered_prod = array_ops.gather(op.outputs[0], zero_clipped_indices)
+  gathered_non_zero_prod = array_ops.gather(non_zero_prod,
+                                            zero_clipped_indices)
+  prod_divided_by_el = gathered_prod / op.inputs[0]  # May contain nan/inf.
+  # Now fetch the individual results for segments containing 0 and those that
+  # don't. is_zero will also fetch results for entries with negative index
+  # but the following gather_drop_negatives sets the corresponding entry in
+  # grad to 0 for these
+  partial_derivative = array_ops.where(is_zero, gathered_non_zero_prod,
+                                       prod_divided_by_el)
+  gathered_grad = _GatherDropNegatives(grad, op.inputs[1],
+                                       zero_clipped_indices)[0]
+  return gathered_grad * partial_derivative, None, None
 
 
 @ops.RegisterGradient("Abs")
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 9a8ac93de9..aac72b331e 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -131,6 +131,9 @@ See the @{$python/math_ops} guide.
 @@segment_mean
 @@unsorted_segment_sum
 @@unsorted_segment_max
+@@unsorted_segment_min
+@@unsorted_segment_prod
+@@unsorted_segment_sqrt_n
 @@sparse_segment_sum
 @@sparse_segment_mean
 @@sparse_segment_sqrt_n
@@ -2552,6 +2555,87 @@ def reduced_shape(input_shape, axes):
       ])  # [1, 1]
 
 
+def _unsorted_segment_N(data, segment_ids, num_segments):
+  """ Helper function for unsorted_segment_mean/_sqrtN. Computes the number
+      of segment entries with 0-entries set to 1 to allow division by N.
+  """
+  # bincount doesn't support negative indices so we use unsorted_segment_sum
+  ones_tensor = array_ops.ones(segment_ids.shape, dtype=data.dtype)
+  N = gen_math_ops.unsorted_segment_sum(ones_tensor, segment_ids, num_segments)
+  # add dimensions for all non-reduced axes
+  ndims_output = data.shape.ndims - segment_ids.shape.ndims
+  broadcast_shape = [num_segments] + [1] * ndims_output
+  N = array_ops.reshape(N, broadcast_shape)
+  return gen_math_ops.maximum(N, 1)
+
+
+@tf_export("unsorted_segment_mean")
+def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
+  r""" Computes the mean along segments of a tensor.
+
+  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+  of segments.
+
+  This operator is similar to the unsorted segment sum operator found
+  [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+  Instead of computing the sum over segments, it computes the mean of all
+  entries belonging to a segment such that:
+
+  \\(output_i = 1/N_i \sum data_j\\) where the sum is over `j` such
+  that `segment_ids[j] == i` with \\N_i\\ being the number of occurrences
+  of id \\i\\.
+
+  If there is no entry for a given segment ID `i`, it outputs 0.
+
+  segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+  first dimension.
+
+  output: Has same shape as data, except for dimension 0 which
+  has size `num_segments`.
+  """
+  with ops.name_scope(name, "UnsortedSegmentMean"):
+    data = ops.convert_to_tensor(data)
+    segment_ids = ops.convert_to_tensor(segment_ids)
+    N = _unsorted_segment_N(data, segment_ids, num_segments)
+    summed = gen_math_ops.unsorted_segment_sum(data, segment_ids, num_segments)
+    return summed / N
+
+
+@tf_export("unsorted_segment_sqrt_n")
+def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
+  r"""Computes the sum along segments of a tensor divided by the sqrt(N).
+
+  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+  of segments.
+
+  This operator is similar to the unsorted segment sum operator found
+  [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+  Additionally to computing the sum over segments, it divides the results by
+  sqrt(N).
+
+  \\(output_i = 1/sqrt(N_i) \sum data_j\\) where the sum is over `j` such
+  that `segment_ids[j] == i` with \\N_i\\ being the number of occurrences
+  of id \\i\\.
+
+  If there is no entry for a given segment ID `i`, it outputs 0.
+
+  Note that this op only supports floating point and complex dtypes,
+  due to tf.sqrt only supporting these types.
+
+  segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+  first dimension.
+
+  output: Has same shape as data, except for dimension 0 which
+  has size `num_segments`.
+  """
+  with ops.name_scope(name, "UnsortedSegmentSqrtN"):
+    data = ops.convert_to_tensor(data)
+    segment_ids = ops.convert_to_tensor(segment_ids)
+    N = _unsorted_segment_N(data, segment_ids, num_segments)
+    summed = gen_math_ops.unsorted_segment_sum(data, segment_ids, num_segments)
+    return summed / gen_math_ops.sqrt(N)
+
+
 @tf_export("sparse_segment_sum")
 def sparse_segment_sum(data, indices, segment_ids, name=None,
                        num_segments=None):
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index e8890e9cc0..066c4513ff 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -2056,6 +2056,18 @@ tf_module {
     name: "unsorted_segment_max"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unsorted_segment_min"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "unsorted_segment_prod"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "unsorted_segment_sqrt_n"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_sum"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-- 
GitLab


From 39ee0bc175633691234691e126c46f2cb38effaa Mon Sep 17 00:00:00 2001
From: Ankur Taly <ataly@google.com>
Date: Wed, 7 Feb 2018 11:15:47 -0800
Subject: [PATCH 763/966] Updated copyright from 2017 to 2018

---
 tensorflow/contrib/lite/graph_info.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/graph_info.cc b/tensorflow/contrib/lite/graph_info.cc
index 186b80fe82..e60ed2c246 100644
--- a/tensorflow/contrib/lite/graph_info.cc
+++ b/tensorflow/contrib/lite/graph_info.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
-- 
GitLab


From 27ffe518682584804c4b95f3aad85480b6ec9349 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 11:17:01 -0800
Subject: [PATCH 764/966] make calling NNAPI work again (this a copy of #16256
 which apparently got lost in the one of the merges)

PiperOrigin-RevId: 184866202
---
 tensorflow/contrib/lite/interpreter.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 9dd60abc86..5aa0cbafd6 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -303,7 +303,6 @@ TfLiteStatus Interpreter::Invoke() {
 
   TfLiteStatus status = kTfLiteOk;
   if (nnapi_delegate_) {
-    TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
     if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) {
       TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this));
       return kTfLiteOk;
-- 
GitLab


From e9890e0adf063d2d3d3e1fab6a2b1e5e75a80d1b Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Wed, 7 Feb 2018 11:24:09 -0800
Subject: [PATCH 765/966] Fix #includes and namespace scope, and add missing
 TF_RETURN_IF_ERROR, to make internal build happy.

---
 tensorflow/contrib/tensorrt/convert/convert_nodes.cc | 5 -----
 tensorflow/contrib/tensorrt/ops/trt_engine_op.cc     | 4 ++--
 tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc     | 9 +++++----
 3 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 5b8cf903e8..68ee403c4e 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -24,13 +24,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
index fa72bce039..079d73f7be 100644
--- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
@@ -37,7 +37,7 @@ REGISTER_OP("TRTEngineOp")
     .Output("out_tensor: OutT")
     .SetShapeFn(shape_inference::TRTEngineOpShapeInference);
 
+}  // namespace tensorflow
+
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index ebaf996a29..8b475177bc 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
@@ -29,14 +30,14 @@ namespace shape_inference {
 tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) {
   tensorflow::tensorrt::Logger logger;
   string serialized_engine;
-  context->GetAttr("serialized_engine", &serialized_engine);
+  TF_RETURN_IF_ERROR(context->GetAttr("serialized_engine", &serialized_engine));
   nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
   nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
       serialized_engine.c_str(), serialized_engine.size(), nullptr);
 
   int num_batch = -1;
   std::vector<::tensorflow::DataType> input_type;
-  context->GetAttr("InT", &input_type);
+  TF_RETURN_IF_ERROR(context->GetAttr("InT", &input_type));
   for (size_t i = 0; i < context->num_inputs(); i++) {
     // Check if input shape is legit
     auto input_shape = context->input(i);
@@ -56,11 +57,11 @@ tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) {
 
   // Arrange input here
   std::vector<string> input_nodes;
-  context->GetAttr("input_nodes", &input_nodes);
+  TF_RETURN_IF_ERROR(context->GetAttr("input_nodes", &input_nodes));
 
   // Arrange output here
   std::vector<string> output_nodes;
-  context->GetAttr("output_nodes", &output_nodes);
+  TF_RETURN_IF_ERROR(context->GetAttr("output_nodes", &output_nodes));
   for (size_t i = 0; i < output_nodes.size(); i++) {
     int binding_index = trt_engine->getBindingIndex(output_nodes[i].c_str());
     ShapeHandle output_shape;
-- 
GitLab


From af78a018a6f303f3788b3f99c30d82adeab8067f Mon Sep 17 00:00:00 2001
From: Yifei Feng <fengyifei2026@gmail.com>
Date: Wed, 7 Feb 2018 11:51:55 -0800
Subject: [PATCH 766/966] Update BUILD

---
 tensorflow/tools/lib_package/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 7717d8d7de..35e55c0d31 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -113,6 +113,7 @@ genrule(
         "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
         "@libxsmm_archive//:LICENSE",
+        "@llvm//:LICENSE.TXT",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@nasm//:LICENSE",
-- 
GitLab


From a5bc8407b92032a441867f01cf262c27cbfb5217 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Wed, 7 Feb 2018 11:48:15 -0800
Subject: [PATCH 767/966] Prototype for object-based save/restore

Includes deferred restoration (mostly useful for eager execution).

Slot variables are created with their checkpointed values as soon as the variable they're slotting for is restored, so there's no need to override slot creation (optimizers already check for existing slot variables before creating a new one).

Changes the behavior of unnamed Checkpointable dependencies so that only other unnamed dependencies can interfere (named dependencies do not get a number). This should be a bit more robust, and will support property assignment syntax sugar in a future CL. It does mean that removing a name from a dependency will break the checkpoint (just like changing its name would).

One minor fix for slot creation eager compatibility.

PiperOrigin-RevId: 184871747
---
 .../proto/checkpointable_object_graph.proto   |   7 +-
 tensorflow/contrib/eager/python/BUILD         |   6 +
 .../contrib/eager/python/checkpointable.py    | 446 ++++++++++++++++--
 .../eager/python/checkpointable_test.py       | 164 ++++++-
 tensorflow/python/training/slot_creator.py    |   3 +-
 5 files changed, 563 insertions(+), 63 deletions(-)

diff --git a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto b/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
index c962638aa1..b4a39e6c68 100644
--- a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
+++ b/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
@@ -14,7 +14,8 @@ message CheckpointableObjectGraph {
       // An index into `CheckpointableObjectGraph.nodes`, indicating the object
       // being referenced.
       int32 node_id = 1;
-      // A numeric identifier for this object within its parent.
+      // A numeric identifier for this object within its parent. Zero means
+      // unset, in which case there should be a local_name.
       int32 local_uid = 2;
       // A user-provided name for the edge. May be blank/omitted, in which case
       // there is no explicitly provided local name; fall back on local_uid.
@@ -28,6 +29,8 @@ message CheckpointableObjectGraph {
       // The full name of the variable. Used to allow name-based loading of
       // checkpoints which were saved using an object-based API.
       string full_name = 2;
+      // The generated name of the variable in the checkpoint.
+      string checkpoint_key = 3;
     }
 
     message SlotVariableReference {
@@ -42,6 +45,8 @@ message CheckpointableObjectGraph {
       // The full name of the slot variable. Used to allow name-based loading of
       // checkpoints which were saved using an object-based API.
       string full_name = 4;
+      // The generated name of the variable in the checkpoint.
+      string checkpoint_key = 5;
     }
 
     // Objects which this object depends on.
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index e984c63af7..3df056b070 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -226,8 +226,12 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/contrib/eager/proto:checkpointable_object_graph_proto_py",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
     ],
 )
 
@@ -243,6 +247,8 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:layers",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
diff --git a/tensorflow/contrib/eager/python/checkpointable.py b/tensorflow/contrib/eager/python/checkpointable.py
index b141ffb2bc..47ce5897c0 100644
--- a/tensorflow/contrib/eager/python/checkpointable.py
+++ b/tensorflow/contrib/eager/python/checkpointable.py
@@ -19,28 +19,30 @@ from __future__ import print_function
 
 import collections
 import re
+import weakref
 
 from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import slot_creator
+from tensorflow.python.training import training
 
 _CheckpointableReference = collections.namedtuple(
     "_CheckpointableReference",
     [
-        "name",  # The local name if explicitly specified, else None.
-        "local_uid",  # 0 for the first dependency, 1 for the next, ... Used for
-        # routing checkpointed variables to their correct
-        # Checkpointables when "name" is not set (see docstring of
-        # `track_checkpointable`).
-        "ref"  # The Checkpointable object being referenced.
-    ])
-
-_OwnedVariable = collections.namedtuple(
-    "_OwnedVariable",
-    [
-        "name",  # The variable's (local) name.
-        "variable"  # The owned variable object.
+        # The local name if explicitly specified, else None.
+        "name",
+        # 1 for the first dependency, 2 for the next, ... Used for routing
+        # checkpointed variables to their correct Checkpointables when "name" is
+        # not set (see docstring of `track_checkpointable`).
+        "local_uid",
+        # The Checkpointable object being referenced.
+        "ref"
     ])
 
 # Validation regular expression for the local names of Checkpointable
@@ -59,6 +61,23 @@ _VALID_LOCAL_NAME = re.compile(r"^[A-Za-z0-9.][A-Za-z0-9_.-]*$")
 _OPTIMIZER_SLOTS_NAME = "_OPTIMIZER_SLOT"
 
 
+def _assign_existing_variable(variable_to_restore, value_pointer):
+  """Set a variable from a _ValuePointer object."""
+  base_type = variable_to_restore.dtype.base_dtype
+  with ops.colocate_with(variable_to_restore):
+    # TODO(allenl): Handle partitioned variables
+    value_to_restore, = io_ops.restore_v2(
+        prefix=value_pointer.save_path,
+        tensor_names=[value_pointer.checkpoint_key],
+        shape_and_slices=[""],
+        dtypes=[base_type],
+        name="checkpoint_initializer")
+    initializer_op = state_ops.assign(variable_to_restore, value_to_restore)
+    variable_to_restore._initializer_op = initializer_op  # pylint:disable=protected-access
+    if value_pointer.session is not None:
+      value_pointer.session.run(initializer_op)
+
+
 class Checkpointable(object):
   """Manages variables and dependencies on other objects.
 
@@ -70,14 +89,18 @@ class Checkpointable(object):
   """
 
   def __init__(self):
-    # Basically less useful OrderedDicts but without the reference cycles.
-    # TODO(allenl): Switch these to OrderedDict once TensorFlow supports only
+    # Basically a less useful OrderedDict but without the reference cycles.
+    # TODO(allenl): Switch this to OrderedDict once TensorFlow supports only
     # Python 3.6+.
-    self._checkpoint_dependencies = []  # A list of _CheckpointableReference
-    # objects.
+    # A list of _CheckpointableReference objects.
+    self._checkpoint_dependencies = []
     self._dependency_names = set()
-    self._owned_variables = []  # A list of _OwnedVariable objects.
-    self._owned_variable_names = set()
+    # Start numbering at 1, since an un-set protocol buffer integer is
+    # indistinguishable from 0.
+    self._next_unnamed_checkpoint_dependency_uid = 1
+    self._owned_variables = {}  # local name -> variable object
+    self._deferred_restorations = {}  # local name -> _VariableRestoration
+                                      # object
 
   def add_variable(self, name, shape, dtype=None, initializer=None, **kwargs):
     """Create a new variable object to be saved with this `Checkpointable`.
@@ -101,7 +124,7 @@ class Checkpointable(object):
     Raises:
       ValueError: If the variable name is not unique.
     """
-    if name in self._owned_variable_names:
+    if name in self._owned_variables:
       raise ValueError(
           ("A variable named '%s' already exists in this Checkpointable, but "
            "Checkpointable.add_variable called to create another with "
@@ -114,12 +137,38 @@ class Checkpointable(object):
       getter = kwargs.pop("getter")
     else:
       getter = variable_scope.get_variable
-    # TODO(allenl): handle deferred loading
+    deferred_restoration = self._deferred_restorations.pop(name, None)
+    if deferred_restoration is not None:
+      dtype = deferred_restoration.value_pointer.dtype
+      base_type = dtype.base_dtype
+      # TODO(allenl): Handle partitioned variables here too
+      initializer, = io_ops.restore_v2(
+          prefix=deferred_restoration.value_pointer.save_path,
+          tensor_names=[deferred_restoration.value_pointer.checkpoint_key],
+          shape_and_slices=[""],
+          dtypes=[base_type],
+          name="checkpoint_initializer")
+      # We need to un-set the shape so get_variable doesn't complain, but we
+      # also need to set the static shape information on the initializer if
+      # possible so we don't get a variable with an unknown shape.
+      initializer.set_shape(shape)
+      # Un-set shape since we're using a constant initializer
+      shape = None
+
     new_variable = getter(
         name=name, shape=shape, dtype=dtype, initializer=initializer, **kwargs)
-    self._owned_variables.append(
-        _OwnedVariable(name=name, variable=new_variable))
-    self._owned_variable_names.add(name)
+    if deferred_restoration is not None:
+      if deferred_restoration.value_pointer.session is not None:
+        deferred_restoration.value_pointer.session.run(new_variable.initializer)
+      for slot_restoration in deferred_restoration.slot_restorations:
+        strong_ref = slot_restoration.optimizer_ref()
+        if strong_ref is None:
+          # If the optimizer object has been garbage collected, there's no need
+          # to create the slot variable.
+          continue
+        strong_ref._process_slot_restoration(  # pylint: disable=protected-access
+            slot_restoration, new_variable)
+    self._owned_variables[name] = new_variable
     return new_variable
 
   def track_checkpointable(self, checkpointable, name=None):
@@ -130,13 +179,15 @@ class Checkpointable(object):
 
     Variables in a checkpoint are mapped to `Checkpointable`s based on names if
     provided when the checkpoint was written, but otherwise use the order those
-    `Checkpointable`s were declared as dependencies. Both `name` arguments and
-    the dependency declaration order should be deterministic.
+    `Checkpointable`s were declared as dependencies.
 
-    There are two sufficient conditions to avoid breaking existing checkpoints
-    when modifying a class: (1) New dependencies must be declared after existing
-    dependencies, and (2) dependencies which were previously declared may never
-    be removed (a trivial placeholder with the same name may be used instead).
+    There are three sufficient conditions to avoid breaking existing checkpoints
+    when modifying a class: (1) New un-named dependencies must be declared after
+    existing un-named dependencies, (2) un-named dependencies which were
+    previously declared may never be removed (a trivial placeholder may be used
+    instead if the dependency is no longer needed), and (3) names may not change
+    (un-named dependencies may not later be named, named dependencies must keep
+    the same name).
 
     Args:
       checkpointable: A `Checkpointable` which this object depends on.
@@ -172,16 +223,62 @@ class Checkpointable(object):
              "a Checkpointable with this name is already declared as a "
              "dependency. If provided, names must be unique.") % (name,))
       self._dependency_names.add(name)
+      local_uid = None
+    else:
+      # TODO(allenl): Should this be exposed to allow users to stop depending on
+      # things and still load checkpoints when not using names?
+      local_uid = self._next_unnamed_checkpoint_dependency_uid
+      self._next_unnamed_checkpoint_dependency_uid += 1
     self._checkpoint_dependencies.append(
         _CheckpointableReference(
-            name=name,
-            ref=checkpointable,
-            # TODO(allenl): Should this be exposed to allow users to stop
-            # depending on things and still load checkpoints when not using
-            # names?
-            local_uid=len(self._checkpoint_dependencies)))
+            name=name, ref=checkpointable, local_uid=local_uid))
     return checkpointable
 
+  def _process_restoration(self, restoration):
+    """Restore a variable and its slot variables (may be deferred)."""
+    variable_to_restore = self._owned_variables.get(restoration.name, None)
+    if variable_to_restore is not None:
+      # This variable already exists, so just do an assignment for this and any
+      # slot variables which depend on it.
+      _assign_existing_variable(
+          variable_to_restore, value_pointer=restoration.value_pointer)
+      for slot_restoration in restoration.slot_restorations:
+        strong_ref = slot_restoration.optimizer_ref()
+        if strong_ref is None:
+          continue
+        strong_ref._process_slot_restoration(  # pylint: disable=protected-access
+            slot_restoration, variable_to_restore)
+    else:
+      # Save this restoration for later. This intentionally overwrites any
+      # previous deferred restorations, since that gives the same semantics as
+      # direct assignment.
+      self._deferred_restorations[restoration.name] = restoration
+
+  def _process_slot_restoration(self, slot_restoration, variable):
+    """Restore a slot variable's value (creating it if necessary)."""
+    # TODO(allenl): Move this to Optimizer
+    assert isinstance(self, optimizer_lib.Optimizer)
+    named_slots = self._slot_dict(slot_restoration.slot_name)
+    variable_key = optimizer_lib._var_key(variable)  # pylint: disable=protected-access
+    existing_slot_variable = named_slots.get(variable_key, None)
+    if existing_slot_variable is None:
+      base_dtype = slot_restoration.value_pointer.dtype.base_dtype
+      initializer, = io_ops.restore_v2(
+          prefix=slot_restoration.value_pointer.save_path,
+          tensor_names=[slot_restoration.value_pointer.checkpoint_key],
+          shape_and_slices=[""],
+          dtypes=[base_dtype],
+          name="checkpoint_initializer")
+      new_slot_variable = slot_creator.create_slot(variable, initializer,
+                                                   slot_restoration.slot_name)
+      if slot_restoration.value_pointer.session is not None:
+        slot_restoration.value_pointer.session.run(
+            new_slot_variable.initializer)
+      named_slots[variable_key] = new_slot_variable
+    else:
+      _assign_existing_variable(
+          existing_slot_variable, value_pointer=slot_restoration.value_pointer)
+
   @property
   def checkpoint_dependencies(self):
     """Other `Checkpointable` objects on which this object depends."""
@@ -237,9 +334,9 @@ def _variable_naming_for_object(path_to_root):
   if object_prefix:
     object_prefix += "/"
 
-  def _name_single_variable(owned_variable):
+  def _name_single_variable(local_name):
     """Names a variable within an object."""
-    return object_prefix + _escape_variable_name(owned_variable.name)
+    return object_prefix + _escape_variable_name(local_name)
 
   return _name_single_variable
 
@@ -289,26 +386,31 @@ def _serialize_non_slot_variables(checkpointable_objects, path_to_root,
   for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
     naming_scheme = _variable_naming_for_object(path_to_root[checkpointable])
     object_proto = object_graph_proto.nodes.add()
-    for owned_variable in checkpointable.ref._owned_variables:  # pylint: disable=protected-access
-      variable_name = naming_scheme(owned_variable)
-      named_variables[variable_name] = owned_variable.variable
+    for (local_name, owned_variable) in sorted(
+        checkpointable.ref._owned_variables.items(),  # pylint: disable=protected-access
+        key=lambda x: x[0]):
+      variable_name = naming_scheme(local_name)
+      named_variables[variable_name] = owned_variable
       non_slot_variables.append((
           variable_name,  # The variable's full checkpoint name
-          owned_variable,  # The variable's _OwnedVariable object
+          owned_variable,  # The variable object
+          local_name,  # The variable's local name
           checkpoint_id))  # The checkpoint ID of the node which owns this
       # variable.
       variable_proto = object_proto.variables.add()
-      variable_proto.local_name = owned_variable.name
+      variable_proto.local_name = local_name
+      variable_proto.checkpoint_key = variable_name
       # Figure out the name-based Saver's name for this variable.
       saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
-          [owned_variable.variable], convert_variable_to_tensor=False)
+          [owned_variable], convert_variable_to_tensor=False)
       variable_full_name, = saver_dict.keys()
       variable_proto.full_name = variable_full_name
 
     for child in checkpointable.ref.checkpoint_dependencies:
       child_proto = object_proto.children.add()
       child_proto.node_id = checkpoint_node_ids[child]
-      child_proto.local_uid = child.local_uid
+      if child.local_uid is not None:
+        child_proto.local_uid = child.local_uid
       if child.name is not None:
         child_proto.local_name = child.name
   return named_variables, non_slot_variables
@@ -326,24 +428,25 @@ def _serialize_slot_variables(checkpointable_objects, path_to_root,
           optimizer=checkpointable_ref.ref,
           path_to_root=path_to_root[checkpointable_ref])
       slot_names = checkpointable_ref.ref.get_slot_names()
-      for (variable_path, owned_variable,
+      for (variable_path, original_variable, original_variable_local_name,
            original_node_checkpoint_id) in non_slot_variables:
         for slot_name in slot_names:
           slot_variable = checkpointable_ref.ref.get_slot(
-              owned_variable.variable, slot_name)
+              original_variable, slot_name)
           if slot_variable is not None:
             checkpoint_name = naming_scheme(
                 variable_path=variable_path, slot_name=slot_name)
             named_slot_variables[checkpoint_name] = slot_variable
             slot_variable_proto = optimizer_object_proto.slot_variables.add()
             slot_variable_proto.slot_name = slot_name
+            slot_variable_proto.checkpoint_key = checkpoint_name
             # Figure out the name-based Saver's name for this variable.
             saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
                 [slot_variable], convert_variable_to_tensor=False)
             slot_variable_full_name, = saver_dict.keys()
             slot_variable_proto.full_name = slot_variable_full_name
             slot_variable_proto.original_variable_local_name = (
-                owned_variable.name)
+                original_variable_local_name)
             slot_variable_proto.original_variable_node_id = (
                 original_node_checkpoint_id)
   return named_slot_variables
@@ -390,3 +493,250 @@ def _serialize_object_graph(root_checkpointable):
 
   named_variables.update(named_slot_variables)
   return named_variables, object_graph_proto
+
+
+def _set_reference(reference_proto_table, key, checkpointable, parent,
+                   object_id_map):
+  """Record a checkpoint<->object correspondence, with error checking.
+
+  Args:
+    reference_proto_table: Map from names or numbers to `ObjectReference` protos
+      within the parent object.
+    key: Either a numeric or string identifier for the reference.
+    checkpointable: The object to record a correspondence for.
+    parent: The parent Python object, for creating a useful error message.
+    object_id_map: The map from `node_id` to Python object in which to record
+      the reference.
+  Returns:
+    The `node_id` of the Object proto corresponding to the specified Python
+    object.
+  Raises:
+    AssertionError: If another object is already bound to the `Object` proto.
+  """
+  reference_proto = reference_proto_table[key]
+  set_reference = object_id_map.setdefault(reference_proto.node_id,
+                                           checkpointable)
+  if set_reference is not checkpointable:
+    raise AssertionError(
+        ("Unable to load the checkpoint into this object graph. Either "
+         "the Checkpointable object references in the Python program "
+         "have changed in an incompatible way, or the checkpoint was "
+         "generated in an incompatible program.\n\nTwo checkpoint "
+         "references (one being '%s' in %s) resolved to different "
+         "objects (%s and %s).") % (key, parent, set_reference,
+                                    checkpointable))
+  return reference_proto.node_id
+
+
+def _checkpoint_object_id_map(root_checkpointable, object_graph_proto):
+  """Match a checkpointed object graph to a Python object graph.
+
+  Args:
+    root_checkpointable: A Checkpointable object.
+    object_graph_proto: A CheckpointableObjectGraph protocol buffer representing
+      a serialized object graph.
+  Returns:
+    A dictionary mapping from checkpoint node ids (indices into
+    `object_graph_proto.nodes`) to `Checkpointable` objects which are
+    dependencies of `root_checkpointable`.
+  """
+  node_list = object_graph_proto.nodes
+  # Queue of (checkpointable object, node id)
+  to_visit = collections.deque([(root_checkpointable, 0)])
+  object_id_map = {0: root_checkpointable}
+  seen = set()
+  while to_visit:
+    checkpointable, node_id = to_visit.popleft()
+    object_proto = node_list[node_id]
+    named_children = {}
+    numbered_children = {}
+    for child_reference in object_proto.children:
+      if child_reference.local_name:
+        named_children[child_reference.local_name] = child_reference
+      else:
+        if not child_reference.local_uid:
+          raise AssertionError(
+              ("The checkpointed object graph contains a reference with "
+               "neither a name nor a number (corrupted?). The reference was "
+               "from the node %s.") % (object_proto,))
+        numbered_children[child_reference.local_uid] = child_reference
+
+    for checkpointable_reference in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
+      if checkpointable_reference.name is not None:
+        child_node_id = _set_reference(
+            reference_proto_table=named_children,
+            key=checkpointable_reference.name,
+            checkpointable=checkpointable_reference.ref,
+            parent=checkpointable,
+            object_id_map=object_id_map)
+      else:
+        if checkpointable_reference.local_uid is None:
+          raise AssertionError(
+              ("A Checkpointable reference was created with no name and no "
+               "number in %s.") % (checkpointable,))
+        child_node_id = _set_reference(
+            reference_proto_table=numbered_children,
+            key=checkpointable_reference.local_uid,
+            checkpointable=checkpointable_reference.ref,
+            parent=checkpointable,
+            object_id_map=object_id_map)
+      if child_node_id not in seen:
+        seen.add(child_node_id)
+        to_visit.append((checkpointable_reference.ref, child_node_id))
+
+  return object_id_map
+
+
+_ValuePointer = collections.namedtuple(
+    "_ValuePointer",
+    [
+        # Information needed to look up the value to restore.
+        "save_path",
+        "checkpoint_key",
+        "dtype",
+        # The session to use when restoring (None when executing eagerly)
+        "session",
+    ])
+
+_SlotVariableRestoration = collections.namedtuple(
+    "_SlotVariableRestoration",
+    [
+        # A weak reference to the Optimizer object
+        "optimizer_ref",
+        # The slot name
+        "slot_name",
+        # The _ValuePointer to use when restoring
+        "value_pointer",
+    ])
+
+_VariableRestoration = collections.namedtuple(
+    "_VariableRestoration",
+    [
+        # The variable's (local) name.
+        "name",
+        # _SlotVariableRestoration objects indicating slot variables which
+        # should be created once this variable has been restored.
+        "slot_restorations",
+        # The _ValuePointer to use when restoring
+        "value_pointer",
+    ])
+
+
+def _gather_restorations(object_graph_proto, save_path, object_id_map,
+                         dtype_map, session):
+  """Iterate over variables to restore, matching with Checkpointable objects."""
+  variable_to_slot_restorations = {}
+  for node_id, node in enumerate(object_graph_proto.nodes):
+    for slot_variable in node.slot_variables:
+      original_variable_key = (slot_variable.original_variable_node_id,
+                               slot_variable.original_variable_local_name)
+      variable_to_slot_restorations.setdefault(
+          original_variable_key, []).append(
+              _SlotVariableRestoration(
+                  optimizer_ref=weakref.ref(object_id_map[node_id]),
+                  slot_name=slot_variable.slot_name,
+                  value_pointer=_ValuePointer(
+                      save_path=save_path,
+                      checkpoint_key=slot_variable.checkpoint_key,
+                      dtype=dtype_map[slot_variable.checkpoint_key],
+                      session=session)))
+
+  for node_id, node in enumerate(object_graph_proto.nodes):
+    for variable in node.variables:
+      slots_key = (node_id, variable.local_name)
+      variable_restore = _VariableRestoration(
+          name=variable.local_name,
+          slot_restorations=variable_to_slot_restorations.get(slots_key, []),
+          value_pointer=_ValuePointer(
+              save_path=save_path,
+              checkpoint_key=variable.checkpoint_key,
+              dtype=dtype_map[variable.checkpoint_key],
+              session=session))
+      yield variable_restore, object_id_map[node_id]
+
+
+def save(file_prefix, root_checkpointable, global_step=None, session=None):
+  """Save a training checkpoint.
+
+  Args:
+    file_prefix: A prefix to use for the checkpoint filenames
+      (/path/to/directory/and_a_prefix). Names are generated based on this
+      prefix and the global step, if provided.
+    root_checkpointable: A Checkpointable object to save. The checkpoint
+      includes variables created by this object and any Checkpointable objects
+      it depends on.
+    global_step: An integer variable or Tensor, used to number
+      checkpoints. Typically this value is saved along with other variables in
+      training checkpoints, which will happen automatically if it was created by
+      `root_checkpointable` or one of its dependencies (via
+      `Checkpointable.add_variable`).
+    session: The session to evaluate variables in. Ignored when executing
+      eagerly. If not provided when graph building, the default session is used.
+
+  Returns:
+    The full path to the checkpoint.
+
+    Currently also returns the serialized object graph proto, but that will go
+    away once it's saved with the checkpoint.
+  """
+  named_variables, serialized_graph = _serialize_object_graph(
+      root_checkpointable)
+  if context.in_graph_mode():
+    if session is None:
+      session = ops.get_default_session()
+  else:
+    session = None
+  with ops.device("/device:CPU:0"):
+    save_path = saver_lib.Saver(var_list=named_variables).save(
+        sess=session,
+        save_path=file_prefix,
+        write_meta_graph=False,
+        global_step=global_step)
+  # TODO(allenl): Save the graph with the checkpoint, then returning it and
+  # taking it as an argument to restore won't be necessary.
+  return serialized_graph, save_path
+
+
+# NOTE: Will be restore(file_prefix, root_checkpointable) once the object graph
+# is saved with the checkpoint.
+def restore(save_path, root_checkpointable, object_graph_proto, session=None):
+  """Restore a training checkpoint.
+
+  Restores the values of variables created with `Checkpointable.add_variable` in
+  the dependency graph of `root_checkpointable`. Either assigns values
+  immediately (if variables to restore have been created already), or defers
+  restoration until the variables are created.
+
+  When building a graph, restorations are executed in the default session if
+  `session` is `None`. Variable initializers read checkpointed values.
+
+  Args:
+    save_path: The path to the checkpoint, as returned by `save` or
+      `tf.train.latest_checkpoint`. If None (as when there is no latest
+      checkpoint for `tf.train.latest_checkpoint` to return), does nothing.
+    root_checkpointable: The root of the object graph to restore. Variables to
+      restore need not have been created yet, but all dependencies on other
+      Checkpointable objects should already be declared. Objects in the
+      dependency graph are matched to objects in the checkpointed graph, and
+      matching objects have their variables restored (or the checkpointed values
+      saved for eventual restoration when the variable is created).
+    object_graph_proto: (Temporary) the checkpointed object graph. This will
+      eventually be saved with the checkpoint, and will not be part of the final
+      API.
+    session: The session to evaluate assignment ops in. Ignored when executing
+      eagerly. If not provided when graph building, the default session is used.
+  """
+  if save_path is None:
+    return
+  object_id_map = _checkpoint_object_id_map(root_checkpointable,
+                                            object_graph_proto)
+  reader = training.NewCheckpointReader(save_path)
+  dtype_map = reader.get_variable_to_dtype_map()
+  if context.in_graph_mode():
+    if session is None:
+      session = ops.get_default_session()
+  else:
+    session = None
+  for restoration, checkpointable in _gather_restorations(
+      object_graph_proto, save_path, object_id_map, dtype_map, session=session):
+    checkpointable._process_restoration(restoration)  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/eager/python/checkpointable_test.py b/tensorflow/contrib/eager/python/checkpointable_test.py
index ff419614f5..d823053283 100644
--- a/tensorflow/contrib/eager/python/checkpointable_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_test.py
@@ -17,6 +17,8 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
+import os
+
 import six
 
 from tensorflow.contrib.eager.python import checkpointable
@@ -28,9 +30,12 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.layers import core
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import adam
+from tensorflow.python.training import saver as core_saver
 from tensorflow.python.training import training_util
 
 
@@ -101,11 +106,6 @@ class CheckpointableAdam(adam.AdamOptimizer, checkpointable.Checkpointable):
 
     return v
 
-  # TODO(allenl): Override slot variable creation (_get_or_make_slot,
-  # _get_or_make_slot_with_initializer, _zeros_slot) to allow deferred
-  # loading. Likely no need to run this through add_variable, since gathering
-  # slot variables is special cased anyway.
-
 
 class MyNetwork(CheckpointableNetwork):
   """A concrete Network for testing."""
@@ -175,8 +175,8 @@ class CheckpointNamingTests(test.TestCase):
     expected_checkpoint_names = (
         # Created in the root node, so no prefix.
         "global_step",
-        # No name provided to track_checkpointable(), so the position (1, after
-        # the named track_checkpointable() which is 0) is used instead.
+        # No name provided to track_checkpointable(), so the position is used
+        # instead (one-based).
         "network/_1/kernel",
         # track_checkpointable() with a name provided, so that's used
         "network/named_dense/kernel",
@@ -212,20 +212,158 @@ class CheckpointNamingTests(test.TestCase):
         0].node_id]
     self.assertEqual("beta1_power", optimizer_node.variables[0].local_name)
     self.assertEqual("beta1_power", optimizer_node.variables[0].full_name)
+    # Variable ordering is arbitrary but deterministic (alphabetized)
     self.assertEqual(
-        "kernel", optimizer_node.slot_variables[0].original_variable_local_name)
+        "bias", optimizer_node.slot_variables[0].original_variable_local_name)
     original_variable_owner = serialized_graph.nodes[
         optimizer_node.slot_variables[0].original_variable_node_id]
-    self.assertEqual("kernel", original_variable_owner.variables[0].local_name)
+    self.assertEqual("network/named_dense/bias",
+                     original_variable_owner.variables[0].checkpoint_key)
+    self.assertEqual("bias", original_variable_owner.variables[0].local_name)
     self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    self.assertEqual("network/named_dense/bias/_OPTIMIZER_SLOT/optimizer/m",
+                     optimizer_node.slot_variables[0].checkpoint_key)
     # We strip off the :0 suffix, as variable.name-based saving does.
-    self.assertEqual("my_network/checkpointable_dense_layer/kernel/Adam",
+    self.assertEqual("my_network/checkpointable_dense_layer/bias/Adam",
                      optimizer_node.slot_variables[0].full_name)
-    self.assertEqual("my_network/checkpointable_dense_layer/kernel/Adam:0",
+    self.assertEqual("my_network/checkpointable_dense_layer/bias/Adam:0",
                      optimizer.get_slot(
-                         var=named_variables["network/named_dense/kernel"],
+                         var=named_variables["network/named_dense/bias"],
                          name="m").name)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testSaveRestore(self):
+    network = MyNetwork()
+    optimizer = CheckpointableAdam(0.001)
+    root_checkpointable = Root(optimizer=optimizer, network=network)
+    input_value = constant_op.constant([[3.]])
+    if context.in_eager_mode():
+      optimizer.minimize(
+          lambda: network(input_value),
+          global_step=root_checkpointable.global_step)
+    else:
+      train_op = optimizer.minimize(
+          network(input_value), global_step=root_checkpointable.global_step)
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(train_op)
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    self.evaluate(state_ops.assign(network._named.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(network._named.variables[1], "m")
+    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
+    serialized_graph, save_path = checkpointable.save(
+        file_prefix=prefix,
+        root_checkpointable=root_checkpointable,
+        global_step=root_checkpointable.global_step)
+    self.evaluate(state_ops.assign(network._named.variables[1], [43.]))
+    self.evaluate(state_ops.assign(root_checkpointable.global_step, 3))
+    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
+    # Immediate restoration
+    checkpointable.restore(
+        save_path=save_path,
+        root_checkpointable=root_checkpointable,
+        object_graph_proto=serialized_graph)
+    self.assertAllEqual([42.], self.evaluate(network._named.variables[1]))
+    self.assertAllEqual(1, self.evaluate(root_checkpointable.global_step))
+    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
+    with ops.Graph().as_default():
+      on_create_network = MyNetwork()
+      on_create_optimizer = CheckpointableAdam(0.001)
+      on_create_root = Root(
+          optimizer=on_create_optimizer, network=on_create_network)
+      with self.test_session(graph=ops.get_default_graph()):
+        # Deferred restoration
+        checkpointable.restore(
+            save_path=save_path,
+            root_checkpointable=on_create_root,
+            object_graph_proto=serialized_graph)
+        on_create_network(constant_op.constant([[3.]]))  # create variables
+        self.assertAllEqual(1, self.evaluate(on_create_root.global_step))
+        self.assertAllEqual([42.],
+                            self.evaluate(
+                                on_create_network._named.variables[1]))
+        on_create_m_bias_slot = on_create_optimizer.get_slot(
+            on_create_network._named.variables[1], "m")
+        # Optimizer slot variables are created when the original variable is
+        # restored.
+        self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
+        # beta1_power and beta2_power haven't been created yet, but everything
+        # else matches.
+        self.assertAllEqual(optimizer_variables[2:],
+                            self.evaluate(on_create_optimizer.variables()))
+        on_create_optimizer._create_slots(
+            [resource_variable_ops.ResourceVariable([1.])])
+        beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
+        self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
+        self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    latest_object_graph = None  # Will be saved with the checkpoint eventually.
+    for training_continuation in range(3):
+      with ops.Graph().as_default():
+        network = MyNetwork()
+        optimizer = CheckpointableAdam(0.001)
+        root = Root(optimizer=optimizer, network=network)
+        checkpointable.restore(
+            save_path=core_saver.latest_checkpoint(checkpoint_directory),
+            root_checkpointable=root,
+            object_graph_proto=latest_object_graph)
+        for _ in range(num_training_steps):
+          # TODO(allenl): Use a Dataset and serialize/checkpoint it.
+          input_value = constant_op.constant([[3.]])
+          optimizer.minimize(
+              lambda: network(input_value),  # pylint: disable=cell-var-from-loop
+              global_step=root.global_step)
+        latest_object_graph, _ = checkpointable.save(
+            file_prefix=checkpoint_prefix,
+            root_checkpointable=root)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         root.global_step.numpy())
+
+  def testUsageGraph(self):
+    """Expected usage when graph building."""
+    with context.graph_mode():
+      num_training_steps = 10
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      latest_object_graph = None
+      for training_continuation in range(3):
+        with ops.Graph().as_default():
+          network = MyNetwork()
+          optimizer = CheckpointableAdam(0.001)
+          root = Root(optimizer=optimizer, network=network)
+          input_value = constant_op.constant([[3.]])
+          train_op = optimizer.minimize(
+              network(input_value),
+              global_step=root.global_step)
+          init_op = variables.global_variables_initializer()
+          checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+          with self.test_session(graph=ops.get_default_graph()) as session:
+            if checkpoint_path is None:
+              self.assertEqual(0, training_continuation)
+              session.run(init_op)
+              # Another alternative would be to run initializers automatically
+              # if no checkpoint is being loaded. This would make deferred
+              # loading a bit more useful with graph execution.
+            else:
+              checkpointable.restore(
+                  save_path=checkpoint_path,
+                  root_checkpointable=root,
+                  object_graph_proto=latest_object_graph,
+                  session=session)
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            latest_object_graph, _ = checkpointable.save(
+                file_prefix=checkpoint_prefix,
+                root_checkpointable=root,
+                session=session)
+            self.assertEqual((training_continuation + 1) * num_training_steps,
+                             session.run(root.global_step))
+
   def _get_checkpoint_name(self, name):
     root = checkpointable.Checkpointable()
     with variable_scope.variable_scope("get_checkpoint_name"):
@@ -255,7 +393,7 @@ class CheckpointNamingTests(test.TestCase):
     leaf.add_variable(name="v", shape=[])
     named_variables, _ = checkpointable._serialize_object_graph(root)
     variable_name, = named_variables.keys()
-    self.assertEqual(r"_0/v", variable_name)
+    self.assertEqual(r"_1/v", variable_name)
 
   @test_util.run_in_graph_and_eager_modes()
   def testLocalNameValidation(self):
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index 731fe34273..18a5b89d30 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -111,7 +111,8 @@ def create_slot(primary, val, name, colocate_with_primary=True):
   # and the same name has been previously used, the scope name will add '_N'
   # as suffix for unique identifications.
   validate_shape = val.get_shape().is_fully_defined()
-  with variable_scope.variable_scope(None, primary.op.name + "/" + name):
+  prefix = primary.op.name if context.in_graph_mode() else primary._shared_name  # pylint: disable=protected-access
+  with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
       with ops.colocate_with(primary):
         return _create_slot_var(primary, val, "", validate_shape, None, None)
-- 
GitLab


From a2909ce47d76cf43f1cd850f8dac4cd87ad87b89 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 12:01:40 -0800
Subject: [PATCH 768/966] Update to type-dependent while loop, and tests for
 it.

PiperOrigin-RevId: 184874151
---
 .../contrib/py2tf/utils/multiple_dispatch.py  |  6 ++--
 .../py2tf/utils/multiple_dispatch_test.py     | 30 +++++++++++++++----
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/py2tf/utils/multiple_dispatch.py b/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
index 7ea3a1b0a8..d8a67255a4 100644
--- a/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
+++ b/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
@@ -41,7 +41,7 @@ def run_while(cond_fn, body_fn, init_args):
     raise ValueError(
         'init_args must be a non-empty list or tuple, found %s' % init_args)
 
-  if is_tensor(init_args):
+  if is_tensor(*init_args):
     return control_flow_ops.while_loop(cond_fn, body_fn, init_args)
   else:
     return py_while_loop(cond_fn, body_fn, init_args)
@@ -49,6 +49,6 @@ def run_while(cond_fn, body_fn, init_args):
 
 def py_while_loop(cond_fn, body_fn, init_args):
   state = init_args
-  while cond_fn(state):
-    state = body_fn(state)
+  while cond_fn(*state):
+    state = body_fn(*state)
   return state
diff --git a/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py b/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
index 7cc9defd14..5bb4d4086b 100644
--- a/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
+++ b/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
@@ -19,9 +19,7 @@ from __future__ import division
 from __future__ import print_function
 from tensorflow.contrib.py2tf.utils import multiple_dispatch
 from tensorflow.python.client.session import Session
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework.constant_op import constant
-from tensorflow.python.ops.control_flow_ops import cond
 from tensorflow.python.platform import test
 
 
@@ -38,12 +36,34 @@ class MultipleDispatchTest(test.TestCase):
     true_fn = lambda: constant([2.0])
     false_fn = lambda: constant([3.0])
     with Session() as sess:
-
-      out = cond(constant(True), true_fn, false_fn)
+      out = multiple_dispatch.run_cond(constant(True), true_fn, false_fn)
       self.assertEqual(sess.run(out), 2.0)
-      out = cond(constant(False, dtype=dtypes.bool), true_fn, false_fn)
+      out = multiple_dispatch.run_cond(constant(False), true_fn, false_fn)
       self.assertEqual(sess.run(out), 3.0)
 
+  def test_run_while_python(self):
+    cond_fn = lambda x, t, s: x > t
+    body_fn = lambda x, t, s: (x * s, t, s)
+
+    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 1.0, 0.5])
+    self.assertEqual(x, 0.75)
+
+    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 4.0, 0.5])
+    self.assertEqual(x, 3.0)
+
+  def test_run_while_tf(self):
+    cond_fn = lambda x, t, s: x > t
+    body_fn = lambda x, t, s: (x * s, t, s)
+
+    with Session() as sess:
+      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
+                                            [constant(3.0), 1.0, 0.5])
+      self.assertEqual(sess.run(x), 0.75)
+
+      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
+                                            [constant(3.0), 4.0, 0.5])
+      self.assertEqual(sess.run(x), 3.0)
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 90ce80131a8b5213d9f3eb9649d63921db7874a4 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Wed, 7 Feb 2018 12:05:01 -0800
Subject: [PATCH 769/966] Update TFLite iOS Camera Example app to use TFLite
 CocoaPod.

PiperOrigin-RevId: 184874871
---
 tensorflow/contrib/lite/examples/ios/camera/Podfile         | 2 +-
 .../camera/tflite_camera_example.xcodeproj/project.pbxproj  | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/tensorflow/contrib/lite/examples/ios/camera/Podfile b/tensorflow/contrib/lite/examples/ios/camera/Podfile
index 4ae6fb6b94..c7d3b1c966 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/Podfile
+++ b/tensorflow/contrib/lite/examples/ios/camera/Podfile
@@ -2,4 +2,4 @@ platform :ios, '8.0'
 inhibit_all_warnings!
 
 target 'tflite_camera_example'
-       pod 'TensorFlow-experimental'
+       pod 'TensorFlowLite'
diff --git a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
index c98183276b..b0236e9c60 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -16,7 +16,6 @@
 		1CDB2D4E1ED3AA35007929E9 /* Info.plist in Resources */ = {isa = PBXBuildFile; fileRef = 1CDB2D4D1ED3AA35007929E9 /* Info.plist */; };
 		54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */; };
 		AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */ = {isa = PBXBuildFile; fileRef = AC1F82641FBA3CBD0052BA77 /* labels.txt */; };
-		AC1F82691FBA3F930052BA77 /* libtensorflow-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = AC1F82681FBA3F930052BA77 /* libtensorflow-lite.a */; };
 		ACA1A4CA1FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = ACA1A4C91FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite */; };
 /* End PBXBuildFile section */
 
@@ -38,7 +37,6 @@
 		3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.debug.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.debug.xcconfig"; sourceTree = "<group>"; };
 		55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.release.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.release.xcconfig"; sourceTree = "<group>"; };
 		AC1F82641FBA3CBD0052BA77 /* labels.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = labels.txt; sourceTree = "<group>"; };
-		AC1F82681FBA3F930052BA77 /* libtensorflow-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libtensorflow-lite.a"; path = "../../../gen/lib/libtensorflow-lite.a"; sourceTree = "<group>"; };
 		ACA1A4C91FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_quant_v1_224.tflite; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
@@ -47,7 +45,6 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				AC1F82691FBA3F930052BA77 /* libtensorflow-lite.a in Frameworks */,
 				1CB47D491ED3AD1700DF7666 /* AVFoundation.framework in Frameworks */,
 				1CA5EB931ED3ABFB00247A34 /* CoreMedia.framework in Frameworks */,
 				54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */,
@@ -60,7 +57,6 @@
 		24D7686C331131624F4454A0 /* Frameworks */ = {
 			isa = PBXGroup;
 			children = (
-				AC1F82681FBA3F930052BA77 /* libtensorflow-lite.a */,
 				1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */,
 				1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */,
 				1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */,
@@ -336,7 +332,6 @@
 					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
-				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
 				MTL_ENABLE_DEBUG_INFO = YES;
 				ONLY_ACTIVE_ARCH = YES;
 				SDKROOT = iphoneos;
@@ -384,7 +379,6 @@
 					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
-				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				SDKROOT = iphoneos;
 				TARGETED_DEVICE_FAMILY = "1,2";
-- 
GitLab


From 65f676426d808cf20abd6a4a30ad48668ee5fdf2 Mon Sep 17 00:00:00 2001
From: Mingsheng Hong <hongm@google.com>
Date: Wed, 7 Feb 2018 12:22:55 -0800
Subject: [PATCH 770/966] Initial XLA support for TF eager. This is
 prerequisite for TF compiler's XLA support.

This CL adds XLA support for the following TFE_Op's:

1. A TF op such as MatMul, with full support of constant and resource params.

2. A TF_Function as TFE_Op, where the function must have no constant and
resource params. Removing this restriction requires more discussion and will be
deferred to a later time.

PiperOrigin-RevId: 184877345
---
 tensorflow/c/BUILD                  |   4 +
 tensorflow/c/c_test_util.cc         |  29 +++-
 tensorflow/c/c_test_util.h          |   3 +
 tensorflow/c/eager/BUILD            |   1 +
 tensorflow/c/eager/c_api.cc         | 253 +++++++++++++++++++++++++++-
 tensorflow/c/eager/c_api.h          |   7 +
 tensorflow/c/eager/c_api_internal.h |   1 +
 tensorflow/c/eager/c_api_test.cc    | 213 ++++++++++++++++++++++-
 8 files changed, 502 insertions(+), 9 deletions(-)

diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index c46cb32aa4..314cbc657c 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -135,6 +135,10 @@ tf_cuda_library(
     testonly = 1,
     srcs = ["c_test_util.cc"],
     hdrs = ["c_test_util.h"],
+    visibility = [
+        "//learning/brain:__subpackages__",
+        "//tensorflow:__subpackages__",
+    ],
     deps = [
         ":c_api",
         "//tensorflow/core:lib",
diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc
index 37439ff0be..3c1d5b5bf8 100644
--- a/tensorflow/c/c_test_util.cc
+++ b/tensorflow/c/c_test_util.cc
@@ -124,8 +124,9 @@ TF_Operation* ScalarConst(double v, TF_Graph* graph, TF_Status* s,
   return Const(tensor.get(), graph, s, name);
 }
 
-void AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph, TF_Status* s,
-               const char* name, TF_Operation** op, bool check) {
+void AddOpHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                 TF_Status* s, const char* name, TF_Operation** op,
+                 bool check) {
   TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
   TF_Output add_inputs[2] = {{l, 0}, {r, 0}};
   TF_AddInputList(desc, add_inputs, 2);
@@ -139,14 +140,14 @@ void AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph, TF_Status* s,
 TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                   TF_Status* s, const char* name) {
   TF_Operation* op;
-  AddHelper(l, r, graph, s, name, &op, true);
+  AddOpHelper(l, r, graph, s, name, &op, true);
   return op;
 }
 
 TF_Operation* AddNoCheck(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                          TF_Status* s, const char* name) {
   TF_Operation* op;
-  AddHelper(l, r, graph, s, name, &op, false);
+  AddOpHelper(l, r, graph, s, name, &op, false);
   return op;
 }
 
@@ -160,6 +161,26 @@ TF_Operation* AddWithCtrlDependency(TF_Operation* l, TF_Operation* r,
   return TF_FinishOperation(desc, s);
 }
 
+void BinaryOpHelper(const char* op_name, TF_Operation* l, TF_Operation* r,
+                    TF_Graph* graph, TF_Status* s, const char* name,
+                    TF_Operation** op, bool check) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, op_name, name);
+  TF_AddInput(desc, {l, 0});
+  TF_AddInput(desc, {r, 0});
+  *op = TF_FinishOperation(desc, s);
+  if (check) {
+    ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    ASSERT_NE(*op, nullptr);
+  }
+}
+
+TF_Operation* Min(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                  TF_Status* s, const char* name) {
+  TF_Operation* op;
+  BinaryOpHelper("Min", l, r, graph, s, name, &op, true);
+  return op;
+}
+
 TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s,
                   const char* name) {
   TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index 6acc2fec00..77520be010 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -69,6 +69,9 @@ TF_Operation* AddWithCtrlDependency(TF_Operation* l, TF_Operation* r,
 TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s,
                   const char* name = "add");
 
+TF_Operation* Min(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                  TF_Status* s, const char* name = "min");
+
 TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s,
                   const char* name = "neg");
 
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 3505f70dc1..e55cb672e9 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -72,6 +72,7 @@ tf_cuda_cc_test(
     ],
     deps = [
         ":c_api",
+        "//tensorflow/c:c_test_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 3a6d2ce45b..9cd1accde9 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -47,19 +48,23 @@ using tensorflow::int64;
 using tensorflow::string;
 
 namespace {
-bool IsCPU(tensorflow::Device* d) {
+bool IsCPU(const tensorflow::Device* d) {
   return d == nullptr || d->tensorflow_gpu_device_info() == nullptr;
 }
 
-bool IsXLA(tensorflow::Device* d) {
+bool IsXLA(const tensorflow::Device* d) {
   if (d == nullptr) return false;
   const auto& device_type = d->attributes().device_type();
   return device_type.find("XLA") != std::string::npos;
 }
 
-string DeviceName(tensorflow::Device* d) {
+string DeviceName(const tensorflow::Device* d) {
   return (d == nullptr) ? "cpu:0" : d->name();
 }
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+std::atomic_int_fast64_t func_id_generator(0);
+#endif  // TENSORFLOW_EAGER_USE_XLA
 }  // namespace
 
 extern "C" {
@@ -281,6 +286,14 @@ const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) {
   return device->name().c_str();
 }
 
+void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
+  op->use_xla = enable;
+#ifndef TENSORFLOW_EAGER_USE_XLA
+  LOG(WARNING) << "This call is a no-op, as the TensorFlow library is not "
+                  "built with XLA support.";
+#endif  // TENSORFLOW_EAGER_USE_XLA
+}
+
 void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
   // Questionable heuristic ...
   //
@@ -523,6 +536,228 @@ tensorflow::Status ValidateInputTypeAndPlacement(
   }
   return tensorflow::Status::OK();
 }
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+// Synthesizes and returns a wrapper function over `op`, which must be a
+// primitive op (e.g. matmul).
+//
+// The wrapper function conforms to the function signature expected by
+// _XlaLaunchOp, with input params ordered by <constants, (variable) args and
+// resources>. For example, if the op has input params <Const1, Arg2, Const3,
+// Resource4, Arg5>, they will be reordered to <Const1, Const3, Arg2, Arg5,
+// Resource4> as the input params to the synthesized function.
+//
+// It populates `const_input_types`, `arg_input_types` and
+// `op_input_to_func_input` based on the reordering results, that the caller can
+// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets
+// `status` accordingly.
+const tensorflow::FunctionDef* OpToFunction(
+    TFE_Op* op, std::vector<TF_DataType>* const_input_types,
+    std::vector<TF_DataType>* arg_input_types,
+    tensorflow::gtl::FlatMap<int, int>* op_input_to_func_input,
+    TF_Status* status) {
+  DCHECK(!op->is_function());
+
+  tensorflow::FunctionDef fdef;
+
+  // Get the OpDef of the op we are trying to encapsulate.
+  TFE_Context* ctx = op->ctx;
+  const tensorflow::OpRegistrationData* op_data;
+  {
+    tensorflow::tf_shared_lock l(ctx->functions_mu);
+    status->status = ctx->func_lib_def.LookUp(op->name, &op_data);
+    if (!status->status.ok()) {
+      return nullptr;
+    }
+  }
+  const tensorflow::OpDef& op_def = op_data->op_def;
+
+  tensorflow::OpDef* signature = fdef.mutable_signature();
+
+  // Handle constant inputs.
+  const std::unordered_set<string> const_inputs(
+      *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(op->name));
+
+  // First add place holders for the input args, so that we can refer to them by
+  // position in the next loop. Also tally up the resource inputs.
+  int num_resource_inputs = 0;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    if (op_def.input_arg(i).type() == tensorflow::DT_RESOURCE) {
+      ++num_resource_inputs;
+    }
+    signature->add_input_arg();
+  }
+
+  // Now we map the input params from `op_def` to `signature`, where the param
+  // ordering for `signature` is: <constants, args, resources>.
+  int const_index = 0;
+  int arg_index = const_inputs.size();
+  int resource_index = op_def.input_arg_size() - num_resource_inputs;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    const tensorflow::OpDef::ArgDef& op_input_arg = op_def.input_arg(i);
+    tensorflow::OpDef::ArgDef* func_input_arg = nullptr;
+    if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) {
+      VLOG(1) << "For const input, mapping op input " << i << " to func input "
+              << const_index;
+      (*op_input_to_func_input)[i] = const_index;
+      func_input_arg = signature->mutable_input_arg(const_index++);
+      const_input_types->push_back(
+          static_cast<TF_DataType>(op->inputs[i].dtype()));
+    } else if (op_input_arg.type() == tensorflow::DT_RESOURCE) {
+      VLOG(1) << "For resource input, mapping op input " << i
+              << " to func input " << resource_index;
+      (*op_input_to_func_input)[i] = resource_index;
+      func_input_arg = signature->mutable_input_arg(resource_index++);
+    } else {
+      VLOG(1) << "For arg input, mapping op input " << i << " to func input "
+              << arg_index;
+      (*op_input_to_func_input)[i] = arg_index;
+      func_input_arg = signature->mutable_input_arg(arg_index++);
+      arg_input_types->push_back(
+          static_cast<TF_DataType>(op->inputs[i].dtype()));
+    }
+
+    func_input_arg->set_name(op_input_arg.name());
+    func_input_arg->set_type(op->inputs[i].dtype());
+  }
+  VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
+
+  // Resources args are at the end of the function input params, and we should
+  // have iterated over all of them.
+  DCHECK_EQ(signature->input_arg_size(), resource_index);
+
+  // Make the synthesized function's name unique.
+  signature->set_name(tensorflow::strings::StrCat(
+      op_def.name(), func_id_generator.fetch_add(1)));
+
+  // Add the node def and set its input names to match op_def's names.
+  const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
+  DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
+  *fdef.add_node_def() = ndef;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name());
+  }
+  VLOG(1) << "Added NodeDef: " << fdef.DebugString();
+
+  // Fix the output names and set output types.
+  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+    tensorflow::OpDef::ArgDef* arg = signature->add_output_arg();
+    const tensorflow::OpDef::ArgDef& op_def_arg = op_def.output_arg(i);
+    const string& out_tensor_name = tensorflow::strings::StrCat(
+        ndef.name(), ":", op_def_arg.name(), ":", 0);
+    arg->set_name(op_def_arg.name());
+    (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name;
+    const string& type_attr = op_def_arg.type_attr();
+    if (!type_attr.empty()) {
+      auto i = ndef.attr().find(type_attr);
+      if (i == ndef.attr().end()) {
+        status->status = tensorflow::errors::InvalidArgument(
+            tensorflow::strings::StrCat("Could not find attr ", type_attr,
+                                        " in NodeDef ", ndef.DebugString()));
+        return nullptr;
+      }
+      arg->set_type(i->second.type());
+    }
+  }
+  VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString();
+
+  tensorflow::mutex_lock l(ctx->functions_mu);
+  status->status = ctx->func_lib_def.AddFunctionDef(fdef);
+  if (!status->status.ok()) return nullptr;
+  const auto ret = ctx->func_lib_def.Find(signature->name());
+  DCHECK(ret != nullptr);
+  return ret;
+}
+
+// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
+// via XLA.
+std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
+  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->name;
+  auto launch_op =
+      std::unique_ptr<TFE_Op>(TFE_NewOp(op->ctx, "_XlaLaunch", status));
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  if (op->device) {
+    TFE_OpSetDevice(launch_op.get(), op->device->name().c_str(), status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+  }
+
+  const tensorflow::FunctionDef* fdef;
+  {
+    tensorflow::tf_shared_lock l(op->ctx->functions_mu);
+    fdef = op->ctx->func_lib_def.Find(op->name);
+  }
+  std::vector<TF_DataType> const_input_types;
+  std::vector<TF_DataType> arg_input_types;
+  tensorflow::gtl::FlatMap<int, int> op_input_to_func_input;
+  if (fdef == nullptr) {
+    // See if this is a primitive op, and if so create a function for it, so
+    // that _XlaLaunchOp can access it.
+    fdef = OpToFunction(op, &const_input_types, &arg_input_types,
+                        &op_input_to_func_input, status);
+    if (!status->status.ok()) return nullptr;
+  } else {
+    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
+    // functions, so we need to find another way to handle constant inputs.
+    for (int i = const_input_types.size();
+         i < fdef->signature().input_arg_size(); ++i) {
+      VLOG(1) << "Adding Targs from input arg " << i;
+      const tensorflow::OpDef::ArgDef& arg = fdef->signature().input_arg(i);
+      arg_input_types.push_back(static_cast<TF_DataType>(arg.type()));
+    }
+  }
+  DCHECK(fdef != nullptr);
+
+  // Copy inputs and their devices.
+  // Since input param reordering may have occurred between `op` and `launch_op`
+  // via `op_input_to_func_input`, adjust the actual inputs accordingly.
+  launch_op->inputs = op->inputs;
+  launch_op->input_devices = op->input_devices;
+  if (!op_input_to_func_input.empty()) {
+    DCHECK_EQ(op->inputs.size(), op_input_to_func_input.size());
+    if (!op->input_devices.empty()) {
+      DCHECK_EQ(op->input_devices.size(), op_input_to_func_input.size());
+    }
+    for (int i = 0; i < op_input_to_func_input.size(); ++i) {
+      VLOG(1) << "mapping op input " << i << " to func input "
+              << op_input_to_func_input[i];
+
+      launch_op->inputs[op_input_to_func_input[i]] = op->inputs[i];
+      if (!op->input_devices.empty()) {
+        launch_op->input_devices[op_input_to_func_input[i]] =
+            op->input_devices[i];
+      }
+    }
+  }
+  launch_op->attrs.NumInputs(op->inputs.size());
+
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
+                        const_input_types.size());
+
+  // Set Targs and Nresources attrs.
+  TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(),
+                        arg_input_types.size());
+  const int num_resource_inputs = fdef->signature().input_arg_size() -
+                                  const_input_types.size() -
+                                  arg_input_types.size();
+  TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs);
+
+  // Set Tresults attr.
+  std::vector<TF_DataType> tresults;
+  for (const tensorflow::OpDef::ArgDef& arg : fdef->signature().output_arg()) {
+    tresults.push_back(static_cast<TF_DataType>(arg.type()));
+  }
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(),
+                        tresults.size());
+
+  // Set function attr.
+  tensorflow::AttrValue attr_value;
+  tensorflow::NameAttrList* func = attr_value.mutable_func();
+  func->set_name(fdef->signature().name());
+  launch_op->attrs.Set("function", attr_value);
+
+  return launch_op;
+}
+#endif  // TENSORFLOW_EAGER_USE_XLA
 }  // namespace
 
 void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
@@ -531,6 +766,18 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
   // TODO(ashankar): ASSUMPTION: ctx->devices()[0] is always CPU
   tensorflow::Device* device =
       (op->device == nullptr) ? ctx->devices()[0] : op->device;
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+  std::unique_ptr<TFE_Op> xla_launch_op;
+  if (op->use_xla && op->name != "_XlaLaunch") {
+    xla_launch_op = BuildXlaLaunch(op, status);
+    if (!status->status.ok()) {
+      return;
+    }
+    op = xla_launch_op.get();
+  }
+#endif  // TENSORFLOW_EAGER_USE_XLA
+
   std::vector<tensorflow::Tensor> outputs(1);
   const tensorflow::MemoryTypeVector* output_memory_types = nullptr;
   tensorflow::Fprint128 cache_key = op->attrs.CacheKey(device->name());
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 6a2aff1591..9506cf7390 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -158,6 +158,13 @@ TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, const char* device_name,
 TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(TFE_Op* op,
                                                   TF_Status* status);
 
+// When 'enable' is set to 1, and if TensorFlow library is built with XLA
+// support, a subsequent TFE_Execute() call on `op` will run the op via XLA.
+//
+// If the library is not built with XLA support, this call would be a no-op.
+TF_CAPI_EXPORT extern void TFE_OpSetXLACompilation(TFE_Op* op,
+                                                   unsigned char enable);
+
 TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status);
 
 TF_CAPI_EXPORT extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index f2abffb7bc..7b9f1db02e 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -121,6 +121,7 @@ struct TFE_Op {
   std::vector<tensorflow::Tensor> inputs;
   std::vector<tensorflow::Device*> input_devices;
   tensorflow::Device* device;
+  bool use_xla = false;
 };
 
 #endif  // TENSORFLOW_C_EAGER_C_API_INTERNAL_H_
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index b0409af87c..4a3ecbc0ab 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -60,6 +60,38 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) {
   return op;
 }
 
+TFE_TensorHandle* TestAxisTensorHandle() {
+  int64_t dims[] = {1};
+  int data[] = {1};
+  TF_Tensor* t = TF_AllocateTensor(
+      TF_INT32, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data));
+  memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t));
+  TF_Status* status = TF_NewStatus();
+  TFE_TensorHandle* th = TFE_NewTensorHandle(t, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteTensor(t);
+  TF_DeleteStatus(status);
+  return th;
+}
+
+TFE_Op* MinOp(TFE_Context* ctx, TFE_TensorHandle* input,
+              TFE_TensorHandle* axis) {
+  TF_Status* status = TF_NewStatus();
+
+  TFE_Op* op = TFE_NewOp(ctx, "Min", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(op, input, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(op, axis, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpSetAttrBool(op, "keep_dims", 1);
+  TFE_OpSetAttrType(op, "Tidx", TF_INT32);
+  TF_DeleteStatus(status);
+  TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(input));
+
+  return op;
+}
+
 // If there is a GPU device, returns true and sets 'gpu_device_name'
 // accordingly.
 bool GetGPUDeviceName(TFE_Context* ctx, string* gpu_device_name) {
@@ -410,7 +442,7 @@ TEST(CAPI, SetAndGetOpDevices) {
   TF_DeleteStatus(status);
 }
 
-TEST(CAPI, Execute) {
+TEST(CAPI, Execute_MatMul_CPU) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
   TFE_Context* ctx = TFE_NewContext(opts, status);
@@ -443,6 +475,117 @@ TEST(CAPI, Execute) {
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI, Execute_Min_CPU) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* input = TestMatrixTensorHandle();
+  TFE_TensorHandle* axis = TestAxisTensorHandle();
+  TFE_Op* minOp = MinOp(ctx, input, axis);
+  TFE_TensorHandle* retvals[2] = {nullptr};
+  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_Execute(minOp, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteOp(minOp);
+  TFE_DeleteTensorHandle(input);
+  TFE_DeleteTensorHandle(axis);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(1, num_retvals);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  TFE_DeleteTensorHandle(retvals[0]);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float output[2] = {0};
+  EXPECT_EQ(sizeof(output), TF_TensorByteSize(t));
+  memcpy(&output[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(1, output[0]);
+  EXPECT_EQ(3, output[1]);
+  TF_DeleteStatus(status);
+}
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+TEST(CAPI, Execute_MatMul_XLA_CPU) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  TFE_Op* matmul = MatMulOp(ctx, m, m);
+
+  TFE_OpSetXLACompilation(matmul, true);
+
+  TFE_TensorHandle* retvals[2] = {nullptr};
+  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_Execute(matmul, &retvals[0], &num_retvals, status);
+  // Running a primitive TF operator via XLA is not yet supported.
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TFE_DeleteOp(matmul);
+  TFE_DeleteTensorHandle(m);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  EXPECT_EQ(1, num_retvals);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  TFE_DeleteTensorHandle(retvals[0]);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(7, product[0]);
+  EXPECT_EQ(10, product[1]);
+  EXPECT_EQ(15, product[2]);
+  EXPECT_EQ(22, product[3]);
+
+  TF_DeleteStatus(status);
+}
+
+TEST(CAPI, Execute_Min_XLA_CPU) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* input = TestMatrixTensorHandle();
+  TFE_TensorHandle* axis = TestAxisTensorHandle();
+  TFE_Op* minOp = MinOp(ctx, input, axis);
+
+  TFE_OpSetXLACompilation(minOp, true);
+
+  TFE_TensorHandle* retvals[2] = {nullptr};
+  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_Execute(minOp, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteOp(minOp);
+  TFE_DeleteTensorHandle(input);
+  TFE_DeleteTensorHandle(axis);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(1, num_retvals);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  TFE_DeleteTensorHandle(retvals[0]);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float output[2] = {0};
+  EXPECT_EQ(sizeof(output), TF_TensorByteSize(t));
+  memcpy(&output[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(1, output[0]);
+  EXPECT_EQ(3, output[1]);
+  TF_DeleteStatus(status);
+}
+#endif  // TENSORFLOW_EAGER_USE_XLA
+
 TEST(CAPI, ExecuteWithTracing) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
@@ -484,7 +627,7 @@ TEST(CAPI, ExecuteWithTracing) {
   TF_DeleteStatus(status);
 }
 
-TEST(CAPI, Function) {
+TEST(CAPI, Function_ident_CPU) {
   // First create a simple identity function.
   TF_Graph* function_graph = TF_NewGraph();
   TF_OperationDescription* arg_descr =
@@ -545,6 +688,72 @@ TEST(CAPI, Function) {
   TF_DeleteStatus(status);
 }
 
+#ifdef TENSORFLOW_EAGER_USE_XLA
+TEST(CAPI, Function_ident_XLA_CPU) {
+  // First create a simple identity function.
+  TF_Graph* function_graph = TF_NewGraph();
+  TF_OperationDescription* arg_descr =
+      TF_NewOperation(function_graph, "Placeholder", "arg");
+  TF_SetAttrType(arg_descr, "dtype", TF_INT32);
+  TF_Status* status = TF_NewStatus();
+  TF_Operation* arg = TF_FinishOperation(arg_descr, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_OperationDescription* id_descr =
+      TF_NewOperation(function_graph, "Identity", "id");
+  TF_SetAttrType(id_descr, "T", TF_INT32);
+  TF_AddInput(id_descr, {arg, 0});
+  TF_Operation* id = TF_FinishOperation(id_descr, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_Output input{arg, 0};
+  TF_Output output{id, 0};
+  TF_Function* fn =
+      TF_GraphToFunction(function_graph, "ident", 0, 1, &id, 1, &input, 1,
+                         &output, nullptr, nullptr, "test", status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteGraph(function_graph);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+  TFE_ContextAddFunction(ctx, fn, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteFunction(fn);
+
+  TF_Tensor* t =
+      TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32));
+  *reinterpret_cast<tensorflow::int32*>(TF_TensorData(t)) = 42;
+  TFE_TensorHandle* h = TFE_NewTensorHandle(t, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteTensor(t);
+
+  TFE_Op* op = TFE_NewOp(ctx, "ident", status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TFE_OpAddInput(op, h, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+
+  // Now run it via XLA.
+  TFE_OpSetXLACompilation(op, true);
+
+  std::vector<TFE_TensorHandle*> result;
+  result.push_back(nullptr);
+  int num_retvals = 1;
+  TFE_Execute(op, result.data(), &num_retvals, status);
+  TFE_DeleteOp(op);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  ASSERT_EQ(num_retvals, 1);
+
+  TF_Tensor* r = TFE_TensorHandleResolve(result[0], status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  EXPECT_EQ(*reinterpret_cast<tensorflow::int32*>(TF_TensorData(r)), 42);
+  TFE_DeleteTensorHandle(h);
+  TF_DeleteTensor(r);
+  TFE_DeleteTensorHandle(result[0]);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteStatus(status);
+}
+#endif  // TENSORFLOW_EAGER_USE_XLA
+
 string MatMulFunction() {
   tensorflow::FunctionDef def;
   CHECK(tensorflow::protobuf::TextFormat::ParseFromString(
-- 
GitLab


From 7ebc013313876748af71fde5af03859b9728181a Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Wed, 7 Feb 2018 12:29:25 -0800
Subject: [PATCH 771/966] Set the number of warmup steps for building the cost
 model.

PiperOrigin-RevId: 184878186
---
 tensorflow/python/grappler/cluster.i | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 0c8d04ff29..8079cb307b 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -140,6 +140,7 @@ static GCluster TF_NewCluster(bool allow_soft_placement,
           timeout_s, num_cpu_cores, num_gpus);
   cluster_->DisableDetailedStats(disable_detailed_stats);
   cluster_->AllowSoftPlacement(allow_soft_placement);
+  cluster_->SetNumWarmupSteps(10);
   tensorflow::Status status = cluster_->Provision();
   tensorflow::Set_TF_Status_from_Status(out_status, status);
   return GCluster(cluster_);
-- 
GitLab


From 528d0c5e4d148655b797368fd55fe6304730fece Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 12:32:53 -0800
Subject: [PATCH 772/966] Add BatchMatMul support and improve tensorflow
 graphdef export by adding ops and fixing typing and resolving constant
 Transpose ops.

PiperOrigin-RevId: 184878663
---
 tensorflow/contrib/lite/toco/BUILD            |   4 +
 .../contrib/lite/toco/export_tensorflow.cc    | 300 +++++++++++++++---
 .../convert_trivial_stack_to_reshape.cc       |  81 +++++
 .../graph_transformations.h                   |   4 +
 .../propagate_fixed_sizes.cc                  |  54 +++-
 .../remove_trivial_slice.cc                   |  69 ++++
 .../resolve_constant_transpose.cc             | 180 +++++++++++
 .../resolve_tensorflow_matmul.cc              |  56 +++-
 .../unroll_batch_matmul.cc                    | 172 ++++++++++
 .../contrib/lite/toco/import_tensorflow.cc    |  81 ++---
 tensorflow/contrib/lite/toco/model.h          |  27 +-
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   6 +-
 tensorflow/contrib/lite/toco/tooling_util.cc  |  24 +-
 tensorflow/contrib/lite/toco/tooling_util.h   |  10 +
 14 files changed, 954 insertions(+), 114 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_stack_to_reshape.cc
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_slice.cc
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 864d0254f2..45031de09c 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -174,6 +174,7 @@ cc_library(
         "graph_transformations/convert_pure_conv_to_depthwise.cc",
         "graph_transformations/convert_reorder_axes.cc",
         "graph_transformations/convert_trivial_addn_to_add.cc",
+        "graph_transformations/convert_trivial_stack_to_reshape.cc",
         "graph_transformations/convert_trivial_transpose_to_reshape.cc",
         "graph_transformations/create_im2col_arrays.cc",
         "graph_transformations/dequantize.cc",
@@ -207,6 +208,7 @@ cc_library(
         "graph_transformations/remove_trivial_passthrough.h",
         "graph_transformations/remove_trivial_quantized_activation_func.cc",
         "graph_transformations/remove_trivial_reshape.cc",
+        "graph_transformations/remove_trivial_slice.cc",
         "graph_transformations/remove_unused_op.cc",
         "graph_transformations/reorder_activation_functions.cc",
         "graph_transformations/resolve_batch_normalization.cc",
@@ -219,6 +221,7 @@ cc_library(
         "graph_transformations/resolve_constant_shape_or_rank.cc",
         "graph_transformations/resolve_constant_stack.cc",
         "graph_transformations/resolve_constant_strided_slice.cc",
+        "graph_transformations/resolve_constant_transpose.cc",
         "graph_transformations/resolve_constant_unary.cc",
         "graph_transformations/resolve_mean_attributes.cc",
         "graph_transformations/resolve_pad_attributes.cc",
@@ -235,6 +238,7 @@ cc_library(
         "graph_transformations/resolve_tensorflow_tile.cc",
         "graph_transformations/resolve_transpose_attributes.cc",
         "graph_transformations/unfuse_activation_functions.cc",
+        "graph_transformations/unroll_batch_matmul.cc",
     ],
     hdrs = [
         "graph_transformations/graph_transformations.h",
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 0bfbe0e3a5..70d7a9d4a5 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -46,6 +46,32 @@ using tensorflow::TensorProto;
 namespace toco {
 namespace {
 
+tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type) {
+  switch (data_type) {
+    case ArrayDataType::kBool:
+      return tensorflow::DT_BOOL;
+    case ArrayDataType::kFloat:
+      return tensorflow::DT_FLOAT;
+    case ArrayDataType::kUint8:
+      return tensorflow::DT_UINT8;
+    case ArrayDataType::kInt32:
+      return tensorflow::DT_INT32;
+    case ArrayDataType::kInt64:
+      return tensorflow::DT_INT64;
+    case ArrayDataType::kString:
+      return tensorflow::DT_STRING;
+    default:
+    case ArrayDataType::kNone:
+      LOG(FATAL) << "Unsupported data type: " << static_cast<int>(data_type);
+      return tensorflow::DT_INVALID;
+  }
+}
+
+tensorflow::DataType GetTensorFlowDataType(const Model& model,
+                                           const string& array_name) {
+  return GetTensorFlowDataType(model.GetArray(array_name).data_type);
+}
+
 // TensorFlow sometimes forbids what it calls "legacy scalars",
 // which are 1-D shapes where the unique shape size is 1.
 // See OpKernel::IsLegacyScalar and OpKernel::allow_legacy_scalars.
@@ -212,6 +238,24 @@ void ConvertIntTensorConst(const Model& model, const string& name,
   }
 }
 
+void CreateIntTensorConst(const string& name, const std::vector<int32>& data,
+                          GraphDef* tensorflow_graph) {
+  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
+    return;
+  }
+  auto* const_op = tensorflow_graph->add_node();
+  const_op->set_op("Const");
+  const_op->set_name(name);
+  (*const_op->mutable_attr())["dtype"].set_type(DT_INT32);
+  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
+  tensor->set_dtype(DT_INT32);
+  for (auto index : data) {
+    tensor->add_int_val(index);
+  }
+  auto* shape = tensor->mutable_tensor_shape();
+  shape->add_dim()->set_size(data.size());
+}
+
 void CreateMatrixShapeTensorConst(const string& name, int rows, int cols,
                                   GraphDef* tensorflow_graph) {
   if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
@@ -445,14 +489,23 @@ void ConvertSpaceToDepthOperator(const Model& model,
 void ConvertFullyConnectedOperator(const Model& model,
                                    const FullyConnectedOperator& src_op,
                                    GraphDef* tensorflow_graph) {
-  const string reshape_output = src_op.outputs[0] + "/reshape";
-  const string reshape_shape = src_op.outputs[0] + "/reshape/shape";
+  // Reshape input activations to have the shape expected by the MatMul.
+  const string reshape_output =
+      AvailableArrayName(model, src_op.outputs[0] + "/reshape");
+  const string reshape_shape =
+      AvailableArrayName(model, reshape_output + "/shape");
+  const auto& fc_weights_array = model.GetArray(src_op.inputs[1]);
+  const auto& fc_weights_shape = fc_weights_array.shape();
+  CHECK_EQ(fc_weights_shape.dimensions_count(), 2);
+  CreateMatrixShapeTensorConst(reshape_shape, fc_weights_shape.dims(1), -1,
+                               tensorflow_graph);
   auto* reshape_op = tensorflow_graph->add_node();
   reshape_op->set_op("Reshape");
   reshape_op->set_name(reshape_output);
   reshape_op->add_input(src_op.inputs[0]);
   reshape_op->add_input(reshape_shape);
-  (*reshape_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*reshape_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
 
   const bool has_bias = src_op.inputs.size() >= 3;
   string matmul_output = src_op.outputs[0];
@@ -460,38 +513,43 @@ void ConvertFullyConnectedOperator(const Model& model,
     matmul_output += "/matmul";
   }
 
+  // Transpose the RHS input from column-major to row-major to match TensorFlow
+  // expectations. This is the inverse of the transpose we do during
+  // ResolveTensorFlowMatMul.
+  const string transpose_output =
+      AvailableArrayName(model, matmul_output + "/transpose_weights");
+  const string transpose_perm =
+      AvailableArrayName(model, transpose_output + "/perm");
+  CreateIntTensorConst(transpose_perm, {1, 0}, tensorflow_graph);
+  auto transpose_op = tensorflow_graph->add_node();
+  transpose_op->set_op("Transpose");
+  transpose_op->set_name(transpose_output);
+  *transpose_op->add_input() = src_op.inputs[1];
+  *transpose_op->add_input() = transpose_perm;
+  (*transpose_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[1]));
+  (*transpose_op->mutable_attr())["Tperm"].set_type(DT_INT32);
+
   auto* matmul_op = tensorflow_graph->add_node();
   matmul_op->set_op("MatMul");
-
   matmul_op->set_name(matmul_output);
   *matmul_op->add_input() = reshape_output;
-  *matmul_op->add_input() = src_op.inputs[1];
-  (*matmul_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  *matmul_op->add_input() = transpose_op->name();
+  (*matmul_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
   (*matmul_op->mutable_attr())["transpose_a"].set_b(false);
   (*matmul_op->mutable_attr())["transpose_b"].set_b(false);
   CHECK(model.HasArray(src_op.inputs[1]));
-  const string& fc_weights_name =
-      WalkUpToConstantArray(model, src_op.inputs[1]);
-  const auto& fc_weights_array = model.GetArray(fc_weights_name);
-  const auto& fc_weights_shape = fc_weights_array.shape();
-  CHECK_EQ(fc_weights_shape.dimensions_count(), 2);
-  CreateMatrixShapeTensorConst(reshape_shape, fc_weights_shape.dims(1), -1,
-                               tensorflow_graph);
-
-  CHECK(fc_weights_array.buffer);
-  CHECK(fc_weights_array.buffer->type == ArrayDataType::kFloat);
-  const float* fc_weights_data =
-      fc_weights_array.GetBuffer<ArrayDataType::kFloat>().data.data();
-  ConvertFloatTensorConst(fc_weights_name, fc_weights_shape, fc_weights_data,
-                          AxesOrder::kCR, AxesOrder::kRC, tensorflow_graph);
 
+  // Add the bias, if it exists.
   if (has_bias) {
     auto* biasadd_op = tensorflow_graph->add_node();
     biasadd_op->set_op("BiasAdd");
     biasadd_op->set_name(src_op.outputs[0]);
     biasadd_op->add_input(matmul_output);
     biasadd_op->add_input(src_op.inputs[2]);
-    (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT);
+    (*biasadd_op->mutable_attr())["T"].set_type(
+        GetTensorFlowDataType(model, src_op.inputs[0]));
     CHECK(model.HasArray(src_op.inputs[2]));
     const auto& bias_array = model.GetArray(src_op.inputs[2]);
     // TODO(b/62904716) Bias arrays should be 1-D, and used directly.
@@ -657,6 +715,45 @@ void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op,
   (*softmax_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertLogSoftmaxOperator(const Model& model,
+                               const LogSoftmaxOperator& src_op,
+                               GraphDef* tensorflow_graph) {
+  string softmax_input;
+  Operator* providing_op = GetOpWithOutput(model, src_op.inputs[0]);
+  if (providing_op->type == OperatorType::kTensorFlowReshape) {
+    softmax_input = src_op.inputs[0];
+  } else {
+    // Insert a reshape operator that reduces the dimensions down to the 2 that
+    // are required for TensorFlow Logits.
+    const string reshape_output =
+        src_op.outputs[0] + "/log_softmax_insert_reshape";
+    const string softmax_size = src_op.outputs[0] + "/log_softmax_insert_size";
+    softmax_input = reshape_output;
+
+    auto* reshape_op = tensorflow_graph->add_node();
+    reshape_op->set_op("Reshape");
+    reshape_op->set_name(reshape_output);
+    *reshape_op->add_input() = src_op.inputs[0];
+    *reshape_op->add_input() = softmax_size;
+    (*reshape_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+    const auto& input_shape = model.GetArray(src_op.inputs[0]).shape();
+    int32 flattened_size = 1;
+    for (int i = 0; i < input_shape.dimensions_count() - 1; ++i) {
+      flattened_size *= input_shape.dims(i);
+    }
+    const std::vector<int32> shape_data = {
+        flattened_size, input_shape.dims(input_shape.dimensions_count() - 1)};
+    CreateReshapeShapeTensorConst(softmax_size, shape_data, tensorflow_graph);
+  }
+
+  auto* log_softmax_op = tensorflow_graph->add_node();
+  log_softmax_op->set_op("LogSoftmax");
+  log_softmax_op->set_name(src_op.outputs[0]);
+  *log_softmax_op->add_input() = softmax_input;
+  (*log_softmax_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertL2NormalizationOperator(const L2NormalizationOperator& src_op,
                                     GraphDef* tensorflow_graph) {
   const string square_output = src_op.outputs[0] + "/square";
@@ -799,7 +896,8 @@ void ConvertConcatenationOperator(const Model& model,
     *dc_op->add_input() = input;
   }
   *dc_op->add_input() = dummy_axis;
-  (*dc_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*dc_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
   (*dc_op->mutable_attr())["Tidx"].set_type(DT_INT32);
   (*dc_op->mutable_attr())["N"].set_i(src_op.inputs.size());
 }
@@ -813,7 +911,8 @@ void ConvertTensorFlowReshapeOperator(const Model& model,
   CHECK_EQ(src_op.inputs.size(), 2);
   *reshape_op->add_input() = src_op.inputs[0];
   *reshape_op->add_input() = src_op.inputs[1];
-  (*reshape_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*reshape_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.outputs[0]));
   const auto& shape_array = model.GetArray(src_op.inputs[1]);
   QCHECK(shape_array.data_type == ArrayDataType::kInt32)
       << "Only int32 shape is supported.";
@@ -910,24 +1009,6 @@ void ConvertSplitOperator(const Model& model,
                                   tensorflow_graph);
 }
 
-tensorflow::DataType GetTensorFlowDataType(const Model& model,
-                                           const string& array_name) {
-  auto& dtype = model.GetArray(array_name).data_type;
-  CHECK(dtype == ArrayDataType::kFloat || dtype == ArrayDataType::kInt32 ||
-        dtype == ArrayDataType::kUint8 || dtype == ArrayDataType::kInt64);
-  if (dtype == ArrayDataType::kFloat) {
-    return tensorflow::DT_FLOAT;
-  } else if (dtype == ArrayDataType::kInt32) {
-    return tensorflow::DT_INT32;
-  } else if (dtype == ArrayDataType::kUint8) {
-    return tensorflow::DT_UINT8;
-  } else if (dtype == ArrayDataType::kInt64) {
-    return tensorflow::DT_INT64;
-  } else {
-    LOG(FATAL) << "Wrong data type";
-  }
-}
-
 void ConvertCastOperator(const Model& model, const CastOperator& src_op,
                          GraphDef* tensorflow_graph) {
   auto* cast_op = tensorflow_graph->add_node();
@@ -982,6 +1063,113 @@ void ConvertArgMaxOperator(const Model& model, const ArgMaxOperator& src_op,
       GetTensorFlowDataType(model, src_op.outputs[0]));
 }
 
+void ConvertTransposeOperator(const Model& model,
+                              const TransposeOperator& src_op,
+                              GraphDef* tensorflow_graph) {
+  auto* transpose_op = tensorflow_graph->add_node();
+  transpose_op->set_op("Transpose");
+  transpose_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *transpose_op->add_input() = src_op.inputs[0];
+  *transpose_op->add_input() = src_op.inputs[1];
+  (*transpose_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*transpose_op->mutable_attr())["Tperm"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[1]));
+}
+
+void ConvertTensorFlowShapeOperator(const Model& model,
+                                    const TensorFlowShapeOperator& src_op,
+                                    GraphDef* tensorflow_graph) {
+  auto* shape_op = tensorflow_graph->add_node();
+  shape_op->set_op("Shape");
+  shape_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *shape_op->add_input() = src_op.inputs[0];
+  (*shape_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*shape_op->mutable_attr())["out_type"].set_type(
+      GetTensorFlowDataType(model, src_op.outputs[0]));
+}
+
+void ConvertRankOperator(const Model& model, const RankOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* rank_op = tensorflow_graph->add_node();
+  rank_op->set_op("Rank");
+  rank_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *rank_op->add_input() = src_op.inputs[0];
+  (*rank_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+}
+
+void ConvertRangeOperator(const Model& model, const RangeOperator& src_op,
+                          GraphDef* tensorflow_graph) {
+  auto* range_op = tensorflow_graph->add_node();
+  range_op->set_op("Range");
+  range_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 3);
+  *range_op->add_input() = src_op.inputs[0];
+  *range_op->add_input() = src_op.inputs[1];
+  *range_op->add_input() = src_op.inputs[2];
+  (*range_op->mutable_attr())["Tidx"].set_type(
+      GetTensorFlowDataType(src_op.dtype));
+}
+
+void ConvertStackOperator(const Model& model, const StackOperator& src_op,
+                          GraphDef* tensorflow_graph) {
+  auto* stack_op = tensorflow_graph->add_node();
+  stack_op->set_op("Stack");
+  stack_op->set_name(src_op.outputs[0]);
+  for (const auto& input : src_op.inputs) {
+    *stack_op->add_input() = input;
+  }
+  (*stack_op->mutable_attr())["elem_type"].set_type(
+      GetTensorFlowDataType(model, src_op.outputs[0]));
+  (*stack_op->mutable_attr())["axis"].set_i(src_op.axis);
+}
+
+void ConvertFillOperator(const Model& model, const FillOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* fill_op = tensorflow_graph->add_node();
+  fill_op->set_op("Fill");
+  fill_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *fill_op->add_input() = src_op.inputs[0];
+  *fill_op->add_input() = src_op.inputs[1];
+  (*fill_op->mutable_attr())["index_type"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*fill_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[1]));
+}
+
+void ConvertFloorDivOperator(const Model& model, const FloorDivOperator& src_op,
+                             GraphDef* tensorflow_graph) {
+  auto* floor_div_op = tensorflow_graph->add_node();
+  floor_div_op->set_op("FloorDiv");
+  floor_div_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *floor_div_op->add_input() = src_op.inputs[0];
+  *floor_div_op->add_input() = src_op.inputs[1];
+  (*floor_div_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+}
+
+void ConvertExpandDimsOperator(const Model& model,
+                               const ExpandDimsOperator& src_op,
+                               GraphDef* tensorflow_graph) {
+  auto* expand_dims_op = tensorflow_graph->add_node();
+  expand_dims_op->set_op("ExpandDims");
+  expand_dims_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *expand_dims_op->add_input() = src_op.inputs[0];
+  *expand_dims_op->add_input() = src_op.inputs[1];
+  (*expand_dims_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*expand_dims_op->mutable_attr())["Tdim"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[1]));
+}
+
 void ConvertResizeBilinearOperator(const Model& model,
                                    const ResizeBilinearOperator& src_op,
                                    GraphDef* tensorflow_graph) {
@@ -1447,6 +1635,10 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kSoftmax) {
     ConvertSoftmaxOperator(model, static_cast<const SoftmaxOperator&>(src_op),
                            tensorflow_graph);
+  } else if (src_op.type == OperatorType::kLogSoftmax) {
+    ConvertLogSoftmaxOperator(model,
+                              static_cast<const LogSoftmaxOperator&>(src_op),
+                              tensorflow_graph);
   } else if (src_op.type == OperatorType::kLocalResponseNormalization) {
     ConvertLocalResponseNormalizationOperator(
         static_cast<const LocalResponseNormalizationOperator&>(src_op),
@@ -1535,6 +1727,32 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kArgMax) {
     ConvertArgMaxOperator(model, static_cast<const ArgMaxOperator&>(src_op),
                           tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTranspose) {
+    ConvertTransposeOperator(
+        model, static_cast<const TransposeOperator&>(src_op), tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowShape) {
+    ConvertTensorFlowShapeOperator(
+        model, static_cast<const TensorFlowShapeOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRank) {
+    ConvertRankOperator(model, static_cast<const RankOperator&>(src_op),
+                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRange) {
+    ConvertRangeOperator(model, static_cast<const RangeOperator&>(src_op),
+                         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kStack) {
+    ConvertStackOperator(model, static_cast<const StackOperator&>(src_op),
+                         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kFill) {
+    ConvertFillOperator(model, static_cast<const FillOperator&>(src_op),
+                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kFloorDiv) {
+    ConvertFloorDivOperator(model, static_cast<const FloorDivOperator&>(src_op),
+                            tensorflow_graph);
+  } else if (src_op.type == OperatorType::kExpandDims) {
+    ConvertExpandDimsOperator(model,
+                              static_cast<const ExpandDimsOperator&>(src_op),
+                              tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_stack_to_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_stack_to_reshape.cc
new file mode 100644
index 0000000000..0615b5e6c6
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_stack_to_reshape.cc
@@ -0,0 +1,81 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ConvertTrivialStackToReshape::Run(Model* model, std::size_t op_index) {
+  auto stack_it = model->operators.begin() + op_index;
+  if (stack_it->get()->type != OperatorType::kStack) {
+    return false;
+  }
+  auto* stack_op = static_cast<StackOperator*>(stack_it->get());
+  if (stack_op->inputs.size() > 1) {
+    // Not trivial.
+    return false;
+  }
+  CHECK_EQ(stack_op->outputs.size(), 1);
+
+  const auto& input_array = model->GetArray(stack_op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return false;
+  }
+  if (input_array.shape().dimensions_count() == 0) {
+    // Input array cannot be 0-D.
+    // (Unsure if this is TF behavior, but was required to get a test to pass.)
+    return false;
+  }
+
+  AddMessageF("Converting trivial %s to a reshape", LogName(*stack_op));
+
+  // Note that we could convert to ExpandDims but toco prefers reshapes.
+  auto* reshape_op = new TensorFlowReshapeOperator;
+  reshape_op->inputs = {stack_op->inputs[0]};
+  reshape_op->outputs = stack_op->outputs;
+
+  // Create shape param.
+  string shape_array_name =
+      AvailableArrayName(*model, stack_op->outputs[0] + "_shape");
+  Array& shape_array = model->GetOrCreateArray(shape_array_name);
+  *(shape_array.mutable_shape()->mutable_dims()) = {
+      1 + input_array.shape().dimensions_count()};
+  reshape_op->inputs.push_back(shape_array_name);
+  shape_array.data_type = ArrayDataType::kInt32;
+  auto& shape_buffer = shape_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  shape_buffer.data.push_back(1);
+  for (int dim : input_array.shape().dims()) {
+    shape_buffer.data.push_back(dim);
+  }
+
+  // Replace the operator in the graph.
+  const auto reshape_it = model->operators.emplace(stack_it, reshape_op);
+  stack_it = reshape_it + 1;
+  CHECK_EQ(stack_it->get(), stack_op);
+  model->operators.erase(stack_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 5d7ada1b74..3ab01ae643 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -115,6 +115,7 @@ void RunGraphTransformations(Model* model, const string& message,
 DECLARE_GRAPH_TRANSFORMATION(ConvertExpandDimsToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertPureConvToDepthwise)
 DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialAddNToAdd)
+DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialStackToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialTransposeToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(EnsureBiasVectors)
@@ -138,6 +139,7 @@ DECLARE_GRAPH_TRANSFORMATION(RemoveTensorFlowIdentity)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialBinaryOperator)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenation)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenationInput)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialSlice)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialQuantizedActivationFunc)
 DECLARE_GRAPH_TRANSFORMATION(RemoveUnusedOp)
 DECLARE_GRAPH_TRANSFORMATION(ResolveBatchNormalization)
@@ -156,8 +158,10 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowSwitch)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowTile)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantConcatenation)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantTranspose)
 DECLARE_GRAPH_TRANSFORMATION(DropFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(UnfuseActivationFunctions)
+DECLARE_GRAPH_TRANSFORMATION(UnrollBatchMatMul)
 DECLARE_GRAPH_TRANSFORMATION(ResolveSpaceToBatchNDAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveBatchToSpaceNDAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolvePadAttributes)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 2a44849ffa..3de251ed70 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -61,23 +61,42 @@ void ComputeConvSizes(const Shape& input_shape, int output_depth, int kwidth,
   output_shape->ReplaceDims({batch, output_height, output_width, output_depth});
 }
 
-void ComputeBinaryOperatorOutputSize(const Shape& input_shape1,
-                                     const Shape& input_shape2,
+void ComputeBinaryOperatorOutputSize(const Shape& input_shape_x,
+                                     const Shape& input_shape_y,
                                      Array* output_array) {
-  const int size1 = RequiredBufferSizeForShape(input_shape1);
-  const int size2 = RequiredBufferSizeForShape(input_shape2);
-  if (size1 > size2) {
-    output_array->copy_shape(input_shape1);
-  } else if (size2 > size1) {
-    output_array->copy_shape(input_shape2);
-  } else {
-    CHECK_EQ(size1, size2);
-    const int dims1 = input_shape1.dimensions_count();
-    const int dims2 = input_shape2.dimensions_count();
-    if (dims1 >= dims2) {
-      output_array->copy_shape(input_shape1);
+  // This matches the code in BroadcastBinaryOpShapeFn from tensorflow.
+  // It zips together the two input shapes and pads with 1 to make them the
+  // same length. For each dimension we broadcast if either dimension is 1 and
+  // otherwise expect them to match.
+  int rank_x = input_shape_x.dimensions_count();
+  int rank_y = input_shape_y.dimensions_count();
+  int rank_out = std::max(rank_x, rank_y);
+  std::vector<int>* dims_out = output_array->mutable_shape()->mutable_dims();
+  dims_out->clear();
+  dims_out->reserve(rank_out);
+  for (int i = 0; i < rank_out; ++i) {
+    int dim_x = i < (rank_out - rank_x)
+                    ? 1
+                    : input_shape_x.dims(i - (rank_out - rank_x));
+    bool dim_y_is_one = i < (rank_out - rank_y);
+    int dim_y = dim_y_is_one ? 1 : input_shape_y.dims(i - (rank_out - rank_y));
+    if (dim_x == -1 || dim_y == -1) {
+      // One or both dimensions is unknown.
+      QCHECK(false) << "Shapes must be specified";
+    } else if (dim_x == 1 || dim_y == 1) {
+      // Broadcast one dimension to the other that is 1.
+      if (dim_x == 1 && !dim_y_is_one) {
+        // Broadcast dim_y to dim_x (1).
+        dims_out->push_back(dim_y);
+      } else {
+        // Broadcast dim_x to dim_y (1).
+        DCHECK_EQ(dim_y, 1);
+        dims_out->push_back(dim_x);
+      }
     } else {
-      output_array->copy_shape(input_shape2);
+      // Expect the dimensions to match.
+      CHECK_EQ(dim_x, dim_y) << "Dimensions must match";
+      dims_out->push_back(dim_x);
     }
   }
   CHECK(output_array->has_shape());
@@ -1217,7 +1236,8 @@ void ProcessTransposeOperator(Model* model, TransposeOperator* op) {
   std::vector<int32> const& perm =
       perm_array.GetBuffer<ArrayDataType::kInt32>().data;
   CHECK_EQ(perm.size(), input_shape.dimensions_count())
-      << "Transpose permutation input must be same length as input dimensions";
+      << "Transpose permutation input " << op->inputs[0]
+      << " must be same length as input dimensions";
   std::vector<int>* output_dims = output_array.mutable_shape()->mutable_dims();
   for (int i = 0; i < perm.size(); i++) {
     int axis = perm[i];
@@ -1274,6 +1294,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kRelu1:
     case OperatorType::kRelu6:
     case OperatorType::kSoftmax:
+    case OperatorType::kLogSoftmax:
     case OperatorType::kLogistic:
     case OperatorType::kTanh:
     case OperatorType::kLocalResponseNormalization:
@@ -1423,6 +1444,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kLstmCell:
       ProcessLstmCellOperator(model, static_cast<LstmCellOperator*>(op));
       break;
+    case OperatorType::kBatchMatMul:
     case OperatorType::kTensorFlowMatMul:
       // MatMul operators are converted to FullyConnected, after which their
       // shapes are propagated.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_slice.cc
new file mode 100644
index 0000000000..0cbbcd7c81
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_slice.cc
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsSliceTrivial(const Model& model, const Operator& op,
+                    RemoveTrivialSlice* transformation) {
+  CHECK(op.type == OperatorType::kSlice);
+
+  // Slices are trivial if they are slicing the entire input contents.
+  const auto& input_array = model.GetArray(op.inputs[0]);
+  const auto& output_array = model.GetArray(op.outputs[0]);
+  if (input_array.has_shape() && output_array.has_shape()) {
+    if (input_array.shape() == output_array.shape()) {
+      transformation->AddMessageF(
+          "%s is trivial because its input and output shapes are equal",
+          LogName(op));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+}  // namespace
+
+bool RemoveTrivialSlice::Run(Model* model, std::size_t op_index) {
+  const auto reshape_it = model->operators.begin() + op_index;
+  auto* slice_op = reshape_it->get();
+  if (slice_op->type != OperatorType::kSlice) {
+    return false;
+  }
+
+  if (!IsSliceTrivial(*model, *slice_op, this)) {
+    return false;
+  }
+
+  AddMessageF("Removing trivial %s", LogName(*slice_op));
+
+  CHECK_EQ(slice_op->inputs.size(), 3);
+  return RemoveTrivialPassthroughOp(this, model, op_index);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
new file mode 100644
index 0000000000..4f984bfde5
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
@@ -0,0 +1,180 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+// Transposes an array up to rank 4.
+// This is ShuffleArrayTemplate with non-enum permutation.
+template <ArrayDataType Type>
+void Transpose(Model* model, const Array& input_array,
+               const std::vector<int>& perm, Array* output_array) {
+  const Shape& input_shape = input_array.shape();
+  const std::vector<DataType<Type>>& input_data =
+      input_array.GetBuffer<Type>().data;
+
+  const Shape& output_shape = output_array->shape();
+  std::vector<DataType<Type>>& output_data =
+      output_array->GetMutableBuffer<Type>().data;
+  output_data.resize(RequiredBufferSizeForShape(output_shape));
+
+  CHECK(input_shape.dimensions_count() == output_shape.dimensions_count());
+  const int dim = input_shape.dimensions_count();
+  CHECK_LE(dim, 4);
+  CHECK(perm.size() >= dim);
+  for (int i = 0; i < dim; i++) {
+    CHECK(perm[i] >= 0 && perm[i] < dim);
+    CHECK(input_shape.dims(perm[i]) == output_shape.dims(i));
+  }
+  Shape extended_input_shape = input_shape;
+  ExtendShape(&extended_input_shape, 4);
+  Shape extended_output_shape = output_shape;
+  ExtendShape(&extended_output_shape, 4);
+  std::vector<int> extended_perm;
+  ExtendShuffle(perm, 4, &extended_perm);
+
+  const std::vector<int>& extended_input_dims = extended_input_shape.dims();
+  const std::vector<int>& extended_output_dims = extended_output_shape.dims();
+
+  // TODO(starka): Rework to handle different numbers of dimensions.
+  int input_strides[4];
+  input_strides[3] = 1;
+  input_strides[2] = extended_input_dims[3];
+  input_strides[1] = input_strides[2] * extended_input_dims[2];
+  input_strides[0] = input_strides[1] * extended_input_dims[1];
+  const int input_stride_0 = input_strides[extended_perm[3]];
+  const int input_stride_1 = input_strides[extended_perm[2]];
+  const int input_stride_2 = input_strides[extended_perm[1]];
+  const int input_stride_3 = input_strides[extended_perm[0]];
+
+  const int output_size_0 = extended_output_dims[3];
+  const int output_size_1 = extended_output_dims[2];
+  const int output_size_2 = extended_output_dims[1];
+  const int output_size_3 = extended_output_dims[0];
+  const int output_stride_0 = 1;
+  const int output_stride_1 = output_size_0;
+  const int output_stride_2 = output_stride_1 * output_size_1;
+  const int output_stride_3 = output_stride_2 * output_size_2;
+
+  for (int i3 = 0; i3 < output_size_3; i3++) {
+    const DataType<Type>* const input_ptr_3 =
+        input_data.data() + i3 * input_stride_3;
+    DataType<Type>* const output_ptr_3 =
+        output_data.data() + i3 * output_stride_3;
+    for (int i2 = 0; i2 < output_size_2; i2++) {
+      const DataType<Type>* const input_ptr_2 =
+          input_ptr_3 + i2 * input_stride_2;
+      DataType<Type>* const output_ptr_2 = output_ptr_3 + i2 * output_stride_2;
+      for (int i1 = 0; i1 < output_size_1; i1++) {
+        const DataType<Type>* input_ptr = input_ptr_2 + i1 * input_stride_1;
+        DataType<Type>* output_ptr = output_ptr_2 + i1 * output_stride_1;
+        DataType<Type>* const output_ptr_end =
+            output_ptr + output_size_0 * output_stride_0;
+        while (output_ptr != output_ptr_end) {
+          *output_ptr = *input_ptr;
+          input_ptr += input_stride_0;
+          output_ptr += output_stride_0;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+
+bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kTranspose) {
+    return false;
+  }
+  const auto* op = static_cast<const TransposeOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes.
+    return false;
+  }
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes.
+    return false;
+  }
+
+  // We require constant inputs.
+  if (!IsConstantParameterArray(*model, op->inputs[0]) ||
+      !IsConstantParameterArray(*model, op->inputs[1])) {
+    return false;
+  }
+  const Array& input_array = model->GetArray(op->inputs[0]);
+
+  if (input_array.minmax) {
+    output_array.GetOrCreateMinMax() = input_array.GetMinMax();
+  }
+
+  if (op->perm.empty()) {
+    // Yield until perm has been populated by ResolveTransposeAttributes.
+    return false;
+  }
+
+  // We currently only support 1-4 dimensions.
+  CHECK_LE(op->perm.size(), 4);
+
+  CHECK(!output_array.buffer);
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      Transpose<ArrayDataType::kFloat>(model, input_array, op->perm,
+                                       &output_array);
+      break;
+    case ArrayDataType::kUint8:
+      Transpose<ArrayDataType::kUint8>(model, input_array, op->perm,
+                                       &output_array);
+      break;
+    case ArrayDataType::kInt32:
+      Transpose<ArrayDataType::kInt32>(model, input_array, op->perm,
+                                       &output_array);
+      break;
+    case ArrayDataType::kInt64:
+      Transpose<ArrayDataType::kInt64>(model, input_array, op->perm,
+                                       &output_array);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type given to Transpose op with output \""
+                 << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input arrays if no longer used.
+  for (const auto& input : op->inputs) {
+    if (IsDiscardableArray(*model, input) &&
+        CountOpsWithInput(*model, input) == 1) {
+      model->EraseArray(input);
+    }
+  }
+
+  // Erase the operator.
+  model->operators.erase(it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
index ad1e56888e..f38203c80f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
@@ -29,7 +29,36 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   if (matmul_it->get()->type != OperatorType::kTensorFlowMatMul) {
     return false;
   }
-  const auto* matmul_op = matmul_it->get();
+  const auto* matmul_op =
+      static_cast<const TensorFlowMatMulOperator*>(matmul_it->get());
+
+  // Reorder the axes on the second input. TensorFlow uses row-major ordering
+  // on both inputs, however this is inefficient for the FullyConnected
+  // operator. We'll transpose the second input to be in column-major order now
+  // and let constant propagation optimize things (if possible).
+  auto* transpose_op = new TransposeOperator;
+  transpose_op->inputs = {
+      matmul_op->inputs[1],
+      CreateInt32Array(
+          model,
+          AvailableArrayName(*model, matmul_op->inputs[1] + "/transpose/perm"),
+          {1, 0})};
+  transpose_op->outputs = {
+      AvailableArrayName(*model, matmul_op->inputs[1] + "/transpose")};
+  model->GetOrCreateArray(transpose_op->outputs[0]);
+  model->operators.emplace(matmul_it, transpose_op);
+
+  // Refresh iterator.
+  matmul_it = model->operators.begin();
+  for (; matmul_it != model->operators.end(); ++matmul_it) {
+    if (matmul_it->get() == matmul_op) {
+      break;
+    }
+  }
+  DCHECK_EQ(matmul_it->get(), matmul_op);
+
+  string input_lhs = matmul_op->inputs[0];
+  string input_rhs = transpose_op->outputs[0];
 
   // Find the op producing the array passed to this MatMul
   auto previous_op_it = model->operators.begin();
@@ -47,22 +76,26 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   }
   Operator* previous_op = (found) ? previous_op_it->get() : nullptr;
 
-  // construct the new FullyConnectedOperator
+  // Construct the new FullyConnectedOperator.
   auto* fc_op = new FullyConnectedOperator;
   fc_op->outputs = matmul_op->outputs;
 
-  // insert the newly constructed FullyConnectedOperator
-  auto fc_it = model->operators.emplace(matmul_it, fc_op);
+  // Insert the newly constructed FullyConnectedOperator.
+  model->operators.emplace(matmul_it, fc_op) + 1;
 
-  // refresh invalidated iterator
-  matmul_it = fc_it + 1;
+  // Refresh iterator.
+  matmul_it = model->operators.begin();
+  for (; matmul_it != model->operators.end(); ++matmul_it) {
+    if (matmul_it->get() == matmul_op) {
+      break;
+    }
+  }
   DCHECK_EQ(matmul_it->get(), matmul_op);
 
   // The way that TensorFlow encodes FullyConnected ops is as a pair
   // (Reshape, MatMul), so we want to remove the Reshape op and rewrite the
-  // MatMul
-  // op as a FullyConnected. However, TensorFlow skips the Reshape ops if the
-  // input doesn't need reshaping, so we can't just match (Reshape, MatMul)
+  // MatMul op as a FullyConnected. However, TensorFlow skips the Reshape ops if
+  // the input doesn't need reshaping, so we can't just match (Reshape, MatMul)
   // pairs.
   if (previous_op && previous_op->type == OperatorType::kTensorFlowReshape) {
     AddMessageF("Combining %s and %s into %s", LogName(*previous_op),
@@ -72,7 +105,7 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
       model->EraseArray(previous_op_output);
     }
     CHECK_EQ(previous_op->inputs.size(), 2);
-    fc_op->inputs = {previous_op->inputs[0], matmul_op->inputs[1]};
+    input_lhs = previous_op->inputs[0];
     // Only remove Reshape node if no other node uses its output.
     if (CountOpsWithInput(*model, previous_op_output) == 1) {
       const auto& previous_op_shape = previous_op->inputs[1];
@@ -95,9 +128,10 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   } else {
     AddMessageF("Replacing %s by a FullyConnected operator",
                 LogName(*matmul_op));
-    fc_op->inputs = {matmul_op->inputs[0], matmul_op->inputs[1]};
   }
 
+  fc_op->inputs = {input_lhs, input_rhs};
+
   // erase the MatMul operator
   model->operators.erase(matmul_it);
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc b/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc
new file mode 100644
index 0000000000..da81ea2ff3
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc
@@ -0,0 +1,172 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Unrolls a BatchMatMul on the batch dimension.
+// We need to slice each batch out of the inputs, matmul them individually, then
+// stack them all back together at the end.
+//
+// This transform effectively looks like:
+//  result_slices = []
+//  for bat in B:
+//    slice_a = tf.reshape(tf.slice(a, [bat, 0, 0], [1, M, N]), [M, N])
+//    slice_b = tf.reshape(tf.slice(b, [bat, 0, 0], [1, M, N]), [M, N])
+//    slice_c = tf.matmul(slice_a, slice_b)
+//    result_slices[bat] = slice_c
+//  result = tf.stack(result_slices)
+bool UnrollBatchMatMul::Run(Model* model, std::size_t op_index) {
+  auto batch_op_it = model->operators.begin() + op_index;
+  if (batch_op_it->get()->type != OperatorType::kBatchMatMul) {
+    return false;
+  }
+  const auto* batch_op =
+      static_cast<const BatchMatMulOperator*>(batch_op_it->get());
+
+  // We must have the shape of at least one input to know our batch size.
+  const auto& input_array_a = model->GetArray(batch_op->inputs[0]);
+  const auto& input_array_b = model->GetArray(batch_op->inputs[1]);
+  if (!input_array_a.has_shape() || !input_array_b.has_shape()) return false;
+
+  // We only support the rank 3 case. If you are batching on rank > 3 you'll
+  // have to figure that out.
+  CHECK_EQ(input_array_a.shape().dimensions_count(),
+           input_array_b.shape().dimensions_count())
+      << "Input dimensions must have the same rank";
+  if (input_array_a.shape().dimensions_count() == 2) {
+    // This is really just a MatMul. This likely means that someone hand-crafted
+    // a graphdef with a BatchMatMul when they really wanted a MatMul.
+    AddMessageF("Replacing non-batch BatchMatMul %s by a MatMul operator",
+                LogName(*batch_op));
+    auto* matmul_op = new TensorFlowMatMulOperator;
+    matmul_op->inputs = batch_op->inputs;
+    matmul_op->outputs = batch_op->outputs;
+    const auto matmul_op_it = model->operators.emplace(batch_op_it, matmul_op);
+    batch_op_it = matmul_op_it + 1;
+    CHECK_EQ(batch_op_it->get(), batch_op);
+    model->operators.erase(batch_op_it);
+    return true;
+  }
+  CHECK_EQ(input_array_a.shape().dimensions_count(), 3)
+      << "Input arrays must have rank 3";
+
+  // Perform the matmul for each slice of the batch.
+  int batch_count = input_array_a.shape().dims(0);
+  AddMessageF("Unrolling BatchMatMul %s %d times", LogName(*batch_op),
+              batch_count);
+  auto tail_it = batch_op_it;
+  std::vector<string> stack_inputs;
+  for (int batch = 0; batch < batch_count; ++batch) {
+    std::string batch_name =
+        std::string(batch_op->outputs[0]) + "_b" + std::to_string(batch);
+
+    // tf.slice(a, ...).
+    auto* slice_a_op = new SliceOperator;
+    slice_a_op->inputs = {
+        batch_op->inputs[0],
+        CreateInt32Array(model, batch_name + "/slice_a/slice/begin",
+                         {batch, 0, 0}),
+        CreateInt32Array(
+            model, batch_name + "/slice_a/slice/size",
+            {1, input_array_a.shape().dims(1), input_array_a.shape().dims(2)}),
+    };
+    slice_a_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_a")};
+    auto& slice_a_op_output = model->GetOrCreateArray(slice_a_op->outputs[0]);
+    slice_a_op_output.data_type = input_array_a.data_type;
+    tail_it = model->operators.emplace(tail_it, slice_a_op) + 1;
+
+    // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
+    auto* slice_a_reshape_op = new TensorFlowReshapeOperator;
+    slice_a_reshape_op->inputs = {
+        slice_a_op->outputs[0],
+        CreateInt32Array(model, batch_name + "/slice_a/reshape/shape",
+                         {-1, input_array_a.shape().dims(2)})};
+    slice_a_reshape_op->outputs = {
+        AvailableArrayName(*model, batch_name + "/slice_a/reshape")};
+    auto& slice_a_reshape_op_output =
+        model->GetOrCreateArray(slice_a_reshape_op->outputs[0]);
+    slice_a_reshape_op_output.data_type = input_array_a.data_type;
+    tail_it = model->operators.emplace(tail_it, slice_a_reshape_op) + 1;
+
+    // tf.slice(b, ...).
+    auto* slice_b_op = new SliceOperator;
+    slice_b_op->inputs = {
+        batch_op->inputs[1],
+        CreateInt32Array(model, batch_name + "/slice_b/slice/begin", {0, 0, 0}),
+        CreateInt32Array(
+            model, batch_name + "/slice_b/slice/size",
+            {1, input_array_b.shape().dims(1), input_array_b.shape().dims(2)}),
+    };
+    slice_b_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_b")};
+    auto& slice_b_op_output = model->GetOrCreateArray(slice_b_op->outputs[0]);
+    slice_b_op_output.data_type = input_array_b.data_type;
+    tail_it = model->operators.emplace(tail_it, slice_b_op) + 1;
+
+    // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
+    auto* slice_b_reshape_op = new TensorFlowReshapeOperator;
+    slice_b_reshape_op->inputs = {
+        slice_b_op->outputs[0],
+        CreateInt32Array(model, batch_name + "/slice_b/reshape/shape",
+                         {-1, input_array_b.shape().dims(2)})};
+    slice_b_reshape_op->outputs = {
+        AvailableArrayName(*model, batch_name + "/slice_b/reshape")};
+    auto& slice_b_reshape_op_output =
+        model->GetOrCreateArray(slice_b_reshape_op->outputs[0]);
+    slice_b_reshape_op_output.data_type = input_array_b.data_type;
+    tail_it = model->operators.emplace(tail_it, slice_b_reshape_op) + 1;
+
+    // tf.matmul(slice_a, slice_b).
+    auto* matmul_op = new TensorFlowMatMulOperator;
+    matmul_op->inputs = {slice_a_reshape_op->outputs[0],
+                         slice_b_reshape_op->outputs[0]};
+    matmul_op->outputs = {AvailableArrayName(*model, batch_name)};
+    auto& matmul_op_output = model->GetOrCreateArray(matmul_op->outputs[0]);
+    matmul_op_output.data_type = input_array_a.data_type;
+    tail_it = model->operators.emplace(tail_it, matmul_op) + 1;
+
+    // Add to stack.
+    stack_inputs.push_back(matmul_op->outputs[0]);
+  }
+
+  // The stack that will join all the individual matmul results together.
+  auto* stack_op = new StackOperator;
+  stack_op->inputs = stack_inputs;
+  stack_op->outputs = {batch_op->outputs[0]};
+  stack_op->axis = 0;
+  model->operators.emplace(tail_it, stack_op);
+
+  // Remove the old batch matmul now that we've unrolled.
+  batch_op_it = model->operators.begin();
+  for (; batch_op_it != model->operators.end(); ++batch_op_it) {
+    if (batch_op_it->get() == batch_op) {
+      break;
+    }
+  }
+  CHECK(batch_op_it != model->operators.end());
+  CHECK(batch_op_it->get() == batch_op);
+  model->operators.erase(batch_op_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index c12706e52d..41d6c832f0 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -839,6 +839,7 @@ void ConvertSwitchOperator(const NodeDef& node,
   op->outputs.push_back(node.name() + ":1");
   model->operators.emplace_back(op);
 }
+
 void ConvertSoftmaxOperator(const NodeDef& node,
                             const TensorFlowImportFlags& tf_import_flags,
                             Model* model) {
@@ -854,6 +855,18 @@ void ConvertSoftmaxOperator(const NodeDef& node,
   model->operators.emplace_back(softmax);
 }
 
+void ConvertLogSoftmaxOperator(const NodeDef& node,
+                               const TensorFlowImportFlags& tf_import_flags,
+                               Model* model) {
+  CHECK_EQ(node.op(), "LogSoftmax");
+  CheckInputsCount(node, tf_import_flags, 1);
+  const auto& input_name = node.input(0);
+  auto* log_softmax = new LogSoftmaxOperator;
+  log_softmax->inputs.push_back(input_name);
+  log_softmax->outputs.push_back(node.name());
+  model->operators.emplace_back(log_softmax);
+}
+
 void ConvertLRNOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
@@ -962,49 +975,37 @@ void ConvertReshapeOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertBatchMatMulOperator(const NodeDef& node,
+                                const TensorFlowImportFlags& tf_import_flags,
+                                Model* model) {
+  CheckInputsCount(node, tf_import_flags, 2);
+
+  // https://www.tensorflow.org/versions/r0.12/api_docs/python/math_ops/matrix_math_functions
+  CHECK(!HasAttr(node, "adj_a") || (GetBoolAttr(node, "adj_a") == false));
+  CHECK(!HasAttr(node, "adj_b") || (GetBoolAttr(node, "adj_b") == false));
+
+  auto* batch_matmul = new BatchMatMulOperator;
+  batch_matmul->inputs = {node.input(0), node.input(1)};
+  batch_matmul->outputs = {node.name()};
+  model->operators.emplace_back(batch_matmul);
+}
+
 void ConvertMatMulOperator(const NodeDef& node,
                            const TensorFlowImportFlags& tf_import_flags,
                            Model* model) {
   CheckInputsCount(node, tf_import_flags, 2);
-  if (node.op() == "MatMul") {
-    // Transpose flags should be easy to support, but we don't have a
-    // GraphDef with them to test on at the moment.
-    CHECK_EQ(GetBoolAttr(node, "transpose_a"), false);
-    CHECK_EQ(GetBoolAttr(node, "transpose_b"), false);
-    CHECK(!HasAttr(node, "adjoint_a") ||
-          (GetBoolAttr(node, "adjoint_a") == false));
-    CHECK(!HasAttr(node, "adjoint_b") ||
-          (GetBoolAttr(node, "adjoint_b") == false));
-  } else if (node.op() == "BatchMatMul") {
-    // https://www.tensorflow.org/versions/r0.12/api_docs/python/math_ops/matrix_math_functions
-    CHECK(!HasAttr(node, "adj_a") || (GetBoolAttr(node, "adj_a") == false));
-    CHECK(!HasAttr(node, "adj_b") || (GetBoolAttr(node, "adj_b") == false));
-  } else {
-    LOG(FATAL) << "op must be 'MatMul' or 'BatchMatMul'";
-  }
 
-  const auto& input_name = node.input(0);
-  const auto& weights_name = node.input(1);
-  const auto& reordered_weights_name = weights_name + "_reordered";
-  // Check if a ReorderAxesOperator was already created for these weights
-  // (that happens when multiple layers share the same weights).
-  const Operator* existing_reorder =
-      GetOpWithOutput(*model, reordered_weights_name);
-  if (existing_reorder) {
-    // Check that it is safe to rely on the _reordered naming of the output
-    // array!
-    CHECK(existing_reorder->type == OperatorType::kReorderAxes);
-  } else {
-    // Create a new ReorderAxesOperator
-    auto* reorder = new ReorderAxesOperator;
-    reorder->inputs = {weights_name};
-    reorder->outputs = {reordered_weights_name};
-    reorder->input_axes_order = AxesOrder::kRC;
-    reorder->output_axes_order = AxesOrder::kCR;
-    model->operators.emplace_back(reorder);
-  }
+  // Transpose flags should be easy to support, but we don't have a
+  // GraphDef with them to test on at the moment.
+  CHECK_EQ(GetBoolAttr(node, "transpose_a"), false);
+  CHECK_EQ(GetBoolAttr(node, "transpose_b"), false);
+  CHECK(!HasAttr(node, "adjoint_a") ||
+        (GetBoolAttr(node, "adjoint_a") == false));
+  CHECK(!HasAttr(node, "adjoint_b") ||
+        (GetBoolAttr(node, "adjoint_b") == false));
+
   auto* matmul = new TensorFlowMatMulOperator;
-  matmul->inputs = {input_name, reordered_weights_name};
+  matmul->inputs = {node.input(0), node.input(1)};
   matmul->outputs = {node.name()};
   model->operators.emplace_back(matmul);
 }
@@ -1859,7 +1860,9 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertAvgPoolOperator(node, tf_import_flags, model);
     } else if (node.op() == "Reshape") {
       ConvertReshapeOperator(node, tf_import_flags, model);
-    } else if (node.op() == "MatMul" || node.op() == "BatchMatMul") {
+    } else if (node.op() == "BatchMatMul") {
+      ConvertBatchMatMulOperator(node, tf_import_flags, model);
+    } else if (node.op() == "MatMul") {
       ConvertMatMulOperator(node, tf_import_flags, model);
     } else if (node.op() == "Div" || node.op() == "RealDiv") {
       ConvertDivOperator(node, tf_import_flags, model);
@@ -1898,6 +1901,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertLRNOperator(node, tf_import_flags, model);
     } else if (node.op() == "Softmax") {
       ConvertSoftmaxOperator(node, tf_import_flags, model);
+    } else if (node.op() == "LogSoftmax") {
+      ConvertLogSoftmaxOperator(node, tf_import_flags, model);
     } else if (node.op() == "All") {
       ConvertAllOperator(node, tf_import_flags, model);
     } else if (node.op() == "Assert") {
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index b2a70eac3e..8f12bc59fb 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -34,6 +34,7 @@ enum class OperatorType {
   kAdd,
   kAddN,
   kAveragePool,
+  kBatchMatMul,
   kBatchNormalization,
   kConv,
   kConcatenation,
@@ -61,6 +62,7 @@ enum class OperatorType {
   kRelu1,
   kRelu6,
   kSoftmax,
+  kLogSoftmax,
   kSub,
   kTanh,
   kTransposeConv,
@@ -736,6 +738,19 @@ struct TensorFlowIdentityOperator : Operator {
   TensorFlowIdentityOperator() : Operator(OperatorType::kTensorFlowIdentity) {}
 };
 
+// Batch matrix multiplication operator. This comes from the (deprecated)
+// tf.batch_matmul or a tf.matmul that has rank 3. dims(0) is the batch count
+// and it can be trivially unrolled into a series of matmuls on each element.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side matrix
+//   inputs[1]: required: the right-hand side matrix
+//
+// TensorFlow equivalent: MatMul
+struct BatchMatMulOperator : Operator {
+  BatchMatMulOperator() : Operator(OperatorType::kBatchMatMul) {}
+};
+
 // General matrix multiplication operator. We don't want to support general
 // matrix multiplication at inference time, so we resolve it during tooling
 // to more specific operator types, namely, FullyConnected.
@@ -1240,6 +1255,16 @@ struct SoftmaxOperator : Operator {
   float beta = 0.f;
 };
 
+// LogSoftmax activation function.
+//
+// Inputs:
+//   inputs[0]: required: the logits input array
+//
+// TensorFlow equivalent: LogSoftmax
+struct LogSoftmaxOperator : Operator {
+  LogSoftmaxOperator() : Operator(OperatorType::kLogSoftmax) {}
+};
+
 // Cast operator.
 //
 // Inputs:
@@ -1568,7 +1593,7 @@ class Model {
 
   bool HasArray(const string& name) const { return arrays.count(name) > 0; }
   Array& GetArray(const string& name) const {
-    DCHECK(HasArray(name));
+    DCHECK(HasArray(name)) << "Array not found: " << name;
     return *arrays.at(name);
   }
   Array& GetOrCreateArray(const string& name) {
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 47da8b68ea..5472c52c96 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -53,18 +53,22 @@ void MakeGeneralGraphTransformationsSet(
   CHECK(transformations->empty());
   transformations->Add(new ConvertExpandDimsToReshape);
   transformations->Add(new ConvertTrivialAddNToAdd);
+  transformations->Add(new ConvertTrivialStackToReshape);
   transformations->Add(new ConvertTrivialTransposeToReshape);
   transformations->Add(new ConvertReorderAxes);
   transformations->Add(new ResolveReshapeAttributes);
+  transformations->Add(new ResolveTransposeAttributes);
   transformations->Add(new PropagateArrayDataTypes);
   transformations->Add(new PropagateFixedSizes);
   transformations->Add(new RemoveTensorFlowAssert);
   transformations->Add(new RemoveTensorFlowIdentity);
   transformations->Add(new RemoveTrivialConcatenation);
   transformations->Add(new RemoveTrivialConcatenationInput);
+  transformations->Add(new RemoveTrivialSlice);
   transformations->Add(new RemoveUnusedOp);
   transformations->Add(new EnsureBiasVectors);
   transformations->Add(new ResolveReorderAxes);
+  transformations->Add(new UnrollBatchMatMul);
   transformations->Add(new ResolveTensorFlowMatMul);
   transformations->Add(new FuseBinaryIntoPrecedingAffine);
   transformations->Add(new FuseBinaryIntoFollowingAffine);
@@ -75,6 +79,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveConstantRange);
   transformations->Add(new ResolveConstantStack);
   transformations->Add(new ResolveConstantStridedSlice);
+  transformations->Add(new ResolveConstantTranspose);
   transformations->Add(new ResolveConstantUnaryOperator);
   transformations->Add(new ResolveTensorFlowMerge);
   transformations->Add(new ResolveSqueezeAttributes);
@@ -92,7 +97,6 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveStridedSliceAttributes);
   transformations->Add(new ResolveSliceAttributes);
   transformations->Add(new ResolveMeanAttributes);
-  transformations->Add(new ResolveTransposeAttributes);
   transformations->Add(new ResolveConstantShapeOrRank);
   transformations->Add(new MakeInitialDequantizeOperator);
   transformations->Add(new ResolveConstantFakeQuant);
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index bfab486b26..1add90fb82 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -229,6 +229,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Add)
     HANDLE_OPERATORTYPENAME_CASE(AddN)
     HANDLE_OPERATORTYPENAME_CASE(AveragePool)
+    HANDLE_OPERATORTYPENAME_CASE(BatchMatMul)
     HANDLE_OPERATORTYPENAME_CASE(BatchNormalization)
     HANDLE_OPERATORTYPENAME_CASE(Conv)
     HANDLE_OPERATORTYPENAME_CASE(Concatenation)
@@ -250,6 +251,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Relu6)
     HANDLE_OPERATORTYPENAME_CASE(ReorderAxes)
     HANDLE_OPERATORTYPENAME_CASE(Softmax)
+    HANDLE_OPERATORTYPENAME_CASE(LogSoftmax)
     HANDLE_OPERATORTYPENAME_CASE(Div)
     HANDLE_OPERATORTYPENAME_CASE(Tanh)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowAll)
@@ -1419,6 +1421,21 @@ bool IsArrayFullyConnectedWeights(const Model& model, const string& name) {
   return is_fc_weights;
 }
 
+string CreateInt32Array(Model* model, const string& param_name,
+                        const std::vector<int>& value) {
+  auto param_array_name = AvailableArrayName(*model, param_name);
+  auto& param_array = model->GetOrCreateArray(param_array_name);
+  param_array.mutable_shape()->ReplaceDims({static_cast<int>(value.size())});
+  param_array.data_type = ArrayDataType::kInt32;
+  auto& param_array_data =
+      param_array.GetMutableBuffer<ArrayDataType::kInt32>().data;
+  param_array_data.resize(RequiredBufferSizeForShape(param_array.shape()));
+  for (int i = 0; i < value.size(); ++i) {
+    param_array_data[i] = value[i];
+  }
+  return param_array_name;
+}
+
 bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
   int64 total = 0;
   for (const auto& op : model.operators) {
@@ -1466,6 +1483,7 @@ bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
       }
       case OperatorType::kLogistic:
       case OperatorType::kSoftmax:
+      case OperatorType::kLogSoftmax:
       case OperatorType::kTanh: {
         const auto& output_array = model.GetArray(op->outputs[0]);
         if (!output_array.has_shape()) {
@@ -1565,10 +1583,6 @@ void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
   }
 }
 
-namespace {
-
-// Extend shuffle is designed to match ExtendShape, which pads the shape with
-// unit dimensions at the beginning.
 void ExtendShuffle(const std::vector<int>& input_shuffle, int newdim,
                    std::vector<int>* extended_shuffle) {
   *extended_shuffle = input_shuffle;
@@ -1583,8 +1597,6 @@ void ExtendShuffle(const std::vector<int>& input_shuffle, int newdim,
   }
 }
 
-}  // end anonymous namespace
-
 void ShuffleDims(const Shape& input_shape, AxesOrder input_axes_order,
                  AxesOrder output_axes_order, Shape* output_shape) {
   if (input_axes_order == AxesOrder::kHWIM &&
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 7397fa3eda..3addccaa10 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -260,6 +260,11 @@ void PrintArrayShape(Model* model, const string& name);
 void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
                    std::vector<int>* out_dims);
 
+// Defines a constant int32 array with the provided values formatted for use
+// as op parameters.
+string CreateInt32Array(Model* model, const string& param_name,
+                        const std::vector<int>& value);
+
 bool EstimateArithmeticOpsCount(const Model& model, int64* result);
 
 int AxesCount(AxesOrder axes_order);
@@ -269,6 +274,11 @@ int AxesCount(AxesOrder axes_order);
 void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
                      std::vector<int>* shuffle);
 
+// Extend shuffle is designed to match ExtendShape, which pads the shape with
+// unit dimensions at the beginning.
+void ExtendShuffle(const std::vector<int>& input_shuffle, int newdim,
+                   std::vector<int>* extended_shuffle);
+
 void ShuffleDims(const Shape& input_shape, AxesOrder input_axes_order,
                  AxesOrder output_axes_order, Shape* output_shape);
 void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order,
-- 
GitLab


From 42ce11f3a24133448def0c571e11cff4bd4f9b38 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Wed, 7 Feb 2018 12:45:39 -0800
Subject: [PATCH 773/966] Use assertAllClose instead of assertArrayNear.

assertArrayNear compares absolute instead of relative error. For floating point computation you want the latter.
---
 tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
index 30795eed8a..d8ce9fffbd 100644
--- a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
@@ -148,7 +148,7 @@ class DepthwiseConv2DTest(test.TestCase):
     print("depthwise conv_2d: ", tensor_in_sizes, "*", filter_in_sizes,
           ", stride:", stride, ", padding: ", padding, ", max diff: ",
           np.amax(np.absolute(native_result - interface_result)))
-    self.assertArrayNear(
+    self.assertAllClose(
         np.ravel(native_result), np.ravel(interface_result), 1e-5)
     self.assertShapeEqual(native_result, conv_native)
     self.assertShapeEqual(native_result, conv_interface)
@@ -213,7 +213,7 @@ class DepthwiseConv2DTest(test.TestCase):
             t1, t2, strides=[1, stride, stride, 1], padding=padding)
         value = sess.run(conv)
     print("value = ", value)
-    self.assertArrayNear(expected, np.ravel(value), 1e-5)
+    self.assertAllClose(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
 
   def testConv2D2x2Filter(self):
-- 
GitLab


From 1893d25a795e29d276ae3484cfe0727eb657e4ad Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Wed, 7 Feb 2018 12:56:06 -0800
Subject: [PATCH 774/966] Fix bug and speed up Grappler constant folding

Fix bug in and speed up ConstantFolding::CreateNodeDef():
  * Fix bug trying to store more than kintmax32 values in a repeated proto field.
  * Speed up populating compressed format. Example: tensorflow/python/kernel_tests/large_concat_op_test with size = 2**29+6 goes from ~30 seconds to ~15 seconds. The fraction of time spent in ConstantFolding::CreateNodeDef() goes down from about 35% to about 12%.
---
 .../grappler/optimizers/constant_folding.cc   | 34 +++++++++++--------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 37a4759fdd..1e6f11c8aa 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -808,20 +808,26 @@ NodeDef ConstantFolding::CreateNodeDef(const string& name,
   // Use the packed representation whenever possible to avoid generating large
   // graphdefs. Moreover, avoid repeating the last values if they're equal.
   if (tensor->NumElements() > 4) {
-#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)         \
-  optimized = true;                                          \
-  TYPE last = tensor->flat<TYPE>()(0);                       \
-  int last_index = 0;                                        \
-  for (int i = 0; i < tensor->NumElements(); ++i) {          \
-    TYPE cur = tensor->flat<TYPE>()(i);                      \
-    t->add_##NAME##_val(cur);                                \
-    if (cur != last) {                                       \
-      last = cur;                                            \
-      last_index = i;                                        \
-    }                                                        \
-  }                                                          \
-  /* Remove all identical trailing values to save memory. */ \
-  t->mutable_##NAME##_val()->Truncate(last_index + 1);
+#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)                \
+  const TYPE* val_ptr = tensor->flat<TYPE>().data();                \
+  TYPE last = *val_ptr;                                             \
+  int64 last_index = 0;                                             \
+  for (int64 i = 0; i < tensor->NumElements(); ++i) {               \
+    TYPE cur = *val_ptr++;                                          \
+    if (cur != last) {                                              \
+      last = cur;                                                   \
+      last_index = i;                                               \
+    }                                                               \
+  }                                                                 \
+  if (last_index < kint32max) {                                     \
+    optimized = true;                                               \
+    t->mutable_##NAME##_val()->Reserve(last_index + 1);             \
+    t->mutable_##NAME##_val()->AddNAlreadyReserved(last_index + 1); \
+    val_ptr = tensor->flat<TYPE>().data();                          \
+    for (int64 i = 0; i <= last_index; ++i) {                       \
+      t->set_##NAME##_val(i, *val_ptr++);                           \
+    }                                                               \
+  }
 
     if (tensor->dtype() == DT_FLOAT) {
       POPULATE_TENSOR_PROTO(tensor, t, float, float)
-- 
GitLab


From d1eeceb562d6defc4b517ad83cea56a894ff4c98 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Wed, 7 Feb 2018 13:39:01 -0800
Subject: [PATCH 775/966] Makefile flag to use Apple Accelerate for Conv on
 iOS.

PiperOrigin-RevId: 184888096
---
 tensorflow/contrib/lite/ios_makefile.inc                   | 2 ++
 tensorflow/contrib/lite/kernels/conv.cc                    | 4 +---
 .../contrib/lite/kernels/internal/optimized/cblas_conv.h   | 7 +++++--
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/ios_makefile.inc b/tensorflow/contrib/lite/ios_makefile.inc
index 26cfe6c3e2..fc6594c3a0 100644
--- a/tensorflow/contrib/lite/ios_makefile.inc
+++ b/tensorflow/contrib/lite/ios_makefile.inc
@@ -22,6 +22,7 @@ ifeq ($(TARGET), IOS)
 	IOS_ARCH := x86_64
 	CXXFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
 		-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+		-DTFLITE_USE_APPLE_ACCELERATE_FOR_CONV \
 		-fembed-bitcode \
 		-Wno-c++11-narrowing \
 		-mno-thumb \
@@ -42,6 +43,7 @@ ifeq ($(TARGET), IOS)
 		-O3
 	LDFLAGS := -fembed-bitcode \
 		-miphoneos-version-min=${MIN_SDK_VERSION} \
+		-framework Accelerate \
 		-arch $(IOS_ARCH)
 	OBJDIR := $(OBJDIR)ios_$(IOS_ARCH)/
 	LIBDIR := $(LIBDIR)ios_$(IOS_ARCH)/
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index 1fba3cbbce..66d2c04bba 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -463,9 +463,7 @@ TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
 }
 
 TfLiteRegistration* Register_CONV_2D() {
-// TODO(ycling): Define a compilation flag and use CBLAS kernel when a
-// fast CBLAS implementatino is available.
-#ifdef TFLITE_USE_CBLAS_CONVOLUTION_KERNEL
+#ifdef TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
   return Register_CONVOLUTION_CBLAS_OPT();
 #else
   return Register_CONVOLUTION_MULTITHREADED_OPT();
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
index fcb9fac671..4a90e7e640 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
@@ -19,9 +19,12 @@ limitations under the License.
 // The Conv implementation based on CBLAS interface. This is only used on iOS
 // for now, utilizing Apple's Accelerate framework.
 
-// TODO(ycling): Update the BUILD file and integrate with Apple Accelerate
-// Famework when it's available.
+#if TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
+#include <Accelerate/Accelerate.h>
+#else
 #include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h"
+#endif
+
 #include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 
-- 
GitLab


From a271c36b5ead4686b72d972b193bf1f534a92ffd Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 7 Feb 2018 13:44:30 -0800
Subject: [PATCH 776/966] [tf.data] Move the C++ Dataset class implementations
 to the framework library.

This enables the use of the `DatasetOpKernel` subclasses in custom op library
code. A subsequent change will move `tf.contrib.data` kernel implementations
to a custom op library.

Implementation note: This change moves some classes from
"tensorflow/core/graph/..." into the framework library, which does not
include any code in "tensorflow/core/common_runtime/...". To break the
dependency from "tensorflow/core/framework/dataset.cc" to
"tensorflow/core/common_runtime/...", the `GraphDefBuilderToGraph()`
method has been split out from the `GraphDefBuilder` class (where it
was previously exposed as the `GraphDefBuilder::ToGraph()` utility
method) and added to a new
"tensorflow/core/graph/graph_def_builder_util.h" module.  This method
depends on ".../graph/graph_constructor.cc", which depends directly on
".../common_runtime/shape_refiner.h" and indirectly on
".../common_runtime/graph_runner.h". Since this method was used only
in tests, these have been updated to point to the new utility method.

PiperOrigin-RevId: 184888903
---
 .../jit/mark_for_compilation_pass_test.cc     | 27 +++++++-------
 tensorflow/contrib/cmake/tf_core_cpu.cmake    |  6 ++++
 .../contrib/cmake/tf_core_framework.cmake     |  6 ++++
 tensorflow/core/BUILD                         | 12 +++++--
 tensorflow/core/common_runtime/placer_test.cc |  3 +-
 .../{kernels/data => framework}/dataset.cc    |  8 ++---
 tensorflow/core/framework/dataset.h           |  6 ++--
 tensorflow/core/graph/algorithm_test.cc       |  5 +--
 tensorflow/core/graph/graph_def_builder.cc    | 11 ------
 tensorflow/core/graph/graph_def_builder.h     |  8 -----
 .../core/graph/graph_def_builder_test.cc      |  3 +-
 .../core/graph/graph_def_builder_util.cc      | 28 +++++++++++++++
 .../core/graph/graph_def_builder_util.h       | 35 +++++++++++++++++++
 tensorflow/core/graph/subgraph_test.cc        |  3 +-
 tensorflow/core/kernels/data/BUILD            |  3 +-
 tensorflow/core/kernels/data/iterator_ops.cc  | 17 +++++++++
 16 files changed, 132 insertions(+), 49 deletions(-)
 rename tensorflow/core/{kernels/data => framework}/dataset.cc (97%)
 create mode 100644 tensorflow/core/graph/graph_def_builder_util.cc
 create mode 100644 tensorflow/core/graph/graph_def_builder_util.h

diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 454f0aeae9..1a8858ccce 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -80,7 +81,7 @@ TEST(XlaCompilationTest, Chains) {
         ops::UnaryOp("UncompilableUnary", c, builder.opts().WithName("D"));
     Node* e = ops::UnaryOp("Relu", d, builder.opts().WithName("E"));
     ops::UnaryOp("Relu", e, builder.opts().WithName("F"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -105,7 +106,7 @@ TEST(XlaCompilationTest, UncompilableCycles) {
     Node* b =
         ops::UnaryOp("UncompilableUnary", a, builder.opts().WithName("B"));
     ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -125,7 +126,7 @@ TEST(XlaCompilationTest, CompilableCycles) {
                                          .WithAttr("value", Tensor()));
     Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B"));
     ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -148,7 +149,7 @@ TEST(XlaCompilationTest, UnsupportedTypes) {
                      .WithAttr("value", Tensor(DT_COMPLEX128, TensorShape())));
     Node* b = ops::UnaryOp("Neg", a, builder.opts().WithName("B"));
     ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -177,7 +178,7 @@ TEST(XlaCompilationTest, ConcatWithConstArg) {
     concat_builder.Input(dim).Input({a, a}).Attr("N", 2);
     builder.opts().FinalizeBuilder(&concat_builder);
 
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -212,7 +213,7 @@ TEST(XlaCompilationTest, FunctionCalls) {
     Node* c = ops::UnaryOp("Relu", b, builder.opts().WithName("C"));
     ops::UnaryOp("UncompilableFn", c, builder.opts().WithName("D"));
     ops::BinaryOp("NoInlineFn", c, c, builder.opts().WithName("E"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph, &flib_def));
@@ -244,7 +245,7 @@ TEST(XlaCompilationTest, MetadataOpsDontStartClusters) {
     Node* c = ops::UnaryOp("Rank", b, builder.opts().WithName("C"));
     Node* d = ops::UnaryOp("Size", c, builder.opts().WithName("D"));
     ops::UnaryOp("Shape", d, builder.opts().WithName("E"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
   TF_ASSERT_OK(MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
@@ -330,7 +331,7 @@ TEST(XlaCompilationTest, SymbolicGradients) {
     d_builder.Input({c, c});
     builder.opts().FinalizeBuilder(&d_builder);
 
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -382,7 +383,7 @@ TEST(XlaCompilationTest, CyclesWithAllDifferentScopes) {
     ops::BinaryOp(
         "MatMul", a, b,
         builder.opts().WithName("C").WithAttr(kXlaScopeAttr, "ScopeC"));
-    TF_CHECK_OK(builder.ToGraph(graph.get()));
+    TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -413,7 +414,7 @@ TEST(XlaCompilationTest, CyclesWithSplittingScopes) {
     ops::BinaryOp(
         "Add", b, c,
         builder.opts().WithName("D").WithAttr(kXlaScopeAttr, "Scope2"));
-    TF_CHECK_OK(builder.ToGraph(graph.get()));
+    TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -443,7 +444,7 @@ TEST(XlaCompilationTest, CyclesWithDifferentScopesAndBridge) {
         "Relu", a,
         builder.opts().WithName("B").WithAttr(kXlaScopeAttr, "ScopeB"));
     ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
-    TF_CHECK_OK(builder.ToGraph(graph.get()));
+    TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -484,7 +485,7 @@ TEST(XlaCompilationTest, Resources) {
     Node* c = ops::UnaryOp("ResourceOutput", b, builder.opts().WithName("C"));
     Node* d = ops::UnaryOp("ResourceInput", c, builder.opts().WithName("D"));
     ops::UnaryOp("Relu", d, builder.opts().WithName("E"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
   TF_ASSERT_OK(MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
@@ -541,7 +542,7 @@ TEST(XlaCompilationTest, Retval) {
                      .WithAttr("T", DT_FLOAT)
                      .WithAttr("index", 0));
 
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake
index e4213ea2a4..96ac60d095 100644
--- a/tensorflow/contrib/cmake/tf_core_cpu.cmake
+++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake
@@ -50,6 +50,12 @@ file(GLOB_RECURSE tf_core_cpu_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/graph/edgeset.cc"
     "${tensorflow_source_dir}/tensorflow/core/graph/graph.h"
     "${tensorflow_source_dir}/tensorflow/core/graph/graph.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph_def_builder.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph_def_builder.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/node_builder.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/node_builder.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/tensor_id.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/tensor_id.cc"
     "${tensorflow_source_dir}/tensorflow/core/graph/while_context.h"
     "${tensorflow_source_dir}/tensorflow/core/graph/while_context.cc"
     "${tensorflow_source_dir}/tensorflow/core/grappler/clusters/single_machine.h"
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 129c208ecd..a1c320347f 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -292,6 +292,12 @@ file(GLOB_RECURSE tf_core_framework_srcs
     "${tensorflow_source_dir}/tensorflow/core/graph/edgeset.cc"
     "${tensorflow_source_dir}/tensorflow/core/graph/graph.h"
     "${tensorflow_source_dir}/tensorflow/core/graph/graph.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph_def_builder.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph_def_builder.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/node_builder.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/node_builder.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/tensor_id.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/tensor_id.cc"
     "${tensorflow_source_dir}/tensorflow/core/graph/while_context.h"
     "${tensorflow_source_dir}/tensorflow/core/graph/while_context.cc"
     "${tensorflow_source_dir}/tensorflow/core/util/*.h"
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 7fade697de..c25aac3acf 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -784,6 +784,7 @@ tf_cuda_library(
         "graph/graph.h",
         "graph/graph_constructor.h",
         "graph/graph_def_builder.h",
+        "graph/graph_def_builder_util.h",
         "graph/node_builder.h",
         "graph/validate.h",
         "graph/while_context.h",
@@ -1718,6 +1719,9 @@ FRAMEWORK_INTERNAL_PRIVATE_HEADERS = [
     "platform/variant_coding.h",
     "graph/edgeset.h",
     "graph/graph.h",
+    "graph/graph_def_builder.h",
+    "graph/node_builder.h",
+    "graph/tensor_id.h",
 ] + glob(
     [
         "example/**/*.h",
@@ -1804,6 +1808,9 @@ tf_cuda_library(
         ] + [
             "graph/edgeset.cc",
             "graph/graph.cc",
+            "graph/graph_def_builder.cc",
+            "graph/node_builder.cc",
+            "graph/tensor_id.cc",
             "graph/while_context.h",
             "graph/while_context.cc",
         ],
@@ -1932,6 +1939,7 @@ GRAPH_HDRS = [
     "graph/graph.h",
     "graph/graph_constructor.h",  # NOTE(mrry): Don't include the .cc since it depends on common_runtime.
     "graph/graph_def_builder.h",
+    "graph/graph_def_builder_util.h",
     "graph/graph_partition.h",
     "graph/mkl_layout_pass.h",
     "graph/mkl_tfconversion_pass.h",
@@ -1952,12 +1960,9 @@ tf_cuda_library(
         "graph/colors.cc",
         "graph/control_flow.cc",
         "graph/costmodel.cc",
-        "graph/graph_def_builder.cc",
         "graph/graph_partition.cc",
-        "graph/node_builder.cc",
         "graph/optimizer_cse.cc",
         "graph/subgraph.cc",
-        "graph/tensor_id.cc",
         "graph/validate.cc",
     ],
     hdrs = GRAPH_HDRS,
@@ -1986,6 +1991,7 @@ tf_cuda_library(
         "common_runtime/shape_refiner.h",
         "framework/versions.h",
         "graph/graph_constructor.cc",  # Depends on common_runtime.
+        "graph/graph_def_builder_util.cc",  # Depends on common_runtime.
         "public/session.h",
         "public/session_options.h",
         "public/version.h",
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 02c9cd5313..098024d219 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -193,7 +194,7 @@ class PlacerTest : public ::testing::Test {
   // Builds the given graph, and (if successful) indexes the node
   // names for use in placement, and later lookup.
   Status BuildGraph(const GraphDefBuilder& builder, Graph* out_graph) {
-    TF_RETURN_IF_ERROR(builder.ToGraph(out_graph));
+    TF_RETURN_IF_ERROR(GraphDefBuilderToGraph(builder, out_graph));
     nodes_by_name_.clear();
     for (Node* node : out_graph->nodes()) {
       nodes_by_name_[node->name()] = node->id();
diff --git a/tensorflow/core/kernels/data/dataset.cc b/tensorflow/core/framework/dataset.cc
similarity index 97%
rename from tensorflow/core/kernels/data/dataset.cc
rename to tensorflow/core/framework/dataset.cc
index d18cb16018..4145ef7bc9 100644
--- a/tensorflow/core/kernels/data/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/dataset.h"
+
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/node_builder.h"
 
@@ -265,10 +265,6 @@ void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
   MakeDataset(ctx, input, another_input, output);
 }
 
-Allocator* IteratorContext::allocator(AllocatorAttributes attrs) {
-  return params_.lib->device()->GetAllocator(attrs);
-}
-
 const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
 const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] =
     "_DATASET_GRAPH_OUTPUT_NODE";
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 96566c285a..6ab23d92a4 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -274,7 +274,7 @@ class IteratorContext {
     std::shared_ptr<const FunctionLibraryDefinition> function_library = nullptr;
 
     // The Allocator to be used to allocate the output of an iterator.
-    Allocator* allocator = nullptr;
+    std::function<Allocator*(AllocatorAttributes)> allocator_getter = nullptr;
   };
 
   explicit IteratorContext(Params params) : params_(std::move(params)) {}
@@ -301,7 +301,9 @@ class IteratorContext {
 
   void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; }
 
-  Allocator* allocator(AllocatorAttributes attrs);
+  Allocator* allocator(AllocatorAttributes attrs) {
+    return params_.allocator_getter(attrs);
+  }
 
  private:
   Params params_;
diff --git a/tensorflow/core/graph/algorithm_test.cc b/tensorflow/core/graph/algorithm_test.cc
index 0cdcdb6685..99ced0c0f5 100644
--- a/tensorflow/core/graph/algorithm_test.cc
+++ b/tensorflow/core/graph/algorithm_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -81,7 +82,7 @@ TEST(AlgorithmTest, ReversePostOrder) {
   BinaryOp("TestMul", w2, {input, 1}, b.opts().WithName("t3"));
 
   Graph g(OpRegistry::Global());
-  TF_ASSERT_OK(b.ToGraph(&g));
+  TF_ASSERT_OK(GraphDefBuilderToGraph(b, &g));
   std::vector<Node*> order;
 
   // Test reverse post order:
@@ -139,7 +140,7 @@ TEST(AlgorithmTest, ReversePostOrderStable) {
     BinaryOp("TestMul", w1, {input, 1}, b.opts().WithName("t3"));
 
     Graph g(OpRegistry::Global());
-    TF_ASSERT_OK(b.ToGraph(&g));
+    TF_ASSERT_OK(GraphDefBuilderToGraph(b, &g));
     std::vector<Node*> order;
 
     // Test reverse post order generates expected ordering.
diff --git a/tensorflow/core/graph/graph_def_builder.cc b/tensorflow/core/graph/graph_def_builder.cc
index 33d2021f38..7a58347bd1 100644
--- a/tensorflow/core/graph/graph_def_builder.cc
+++ b/tensorflow/core/graph/graph_def_builder.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <utility>
 
-#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 
@@ -72,16 +71,6 @@ Status GraphDefBuilder::ToGraphDef(GraphDef* graph_def) const {
   return status_;
 }
 
-Status GraphDefBuilder::ToGraph(Graph* graph) const {
-  if (status_.ok()) {
-    GraphDef graph_def;
-    graph_.ToGraphDef(&graph_def);
-    GraphConstructorOptions opts;
-    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, graph_def, graph));
-  }
-  return status_;
-}
-
 string GraphDefBuilder::Options::GetNameForOp(StringPiece op) const {
   if (name_.empty()) return graph_->NewName(op);
   return name_;
diff --git a/tensorflow/core/graph/graph_def_builder.h b/tensorflow/core/graph/graph_def_builder.h
index a2c0c4d553..776a74c6d8 100644
--- a/tensorflow/core/graph/graph_def_builder.h
+++ b/tensorflow/core/graph/graph_def_builder.h
@@ -161,14 +161,6 @@ class GraphDefBuilder {
   // successful, and if so fill *graph_def.
   Status ToGraphDef(GraphDef* graph_def) const;
 
-  // Like ToGraphDef(), but converts to a Graph (using the default
-  // GraphConstructorOptions).
-  // TODO(josh11b): Make this faster; right now it converts
-  // Graph->GraphDef->Graph.  This cleans up the graph (e.g. adds
-  // edges from the source and to the sink node, resolves back edges
-  // by name), and makes sure the resulting graph is valid.
-  Status ToGraph(Graph* graph) const;
-
   // Adds the function and gradient definitions in `fdef_lib` to this graph's op
   // registry. Ignores duplicate functions, and returns a bad status if an
   // imported function differs from an existing function or op with the same
diff --git a/tensorflow/core/graph/graph_def_builder_test.cc b/tensorflow/core/graph/graph_def_builder_test.cc
index e928c81b45..be3c2be800 100644
--- a/tensorflow/core/graph/graph_def_builder_test.cc
+++ b/tensorflow/core/graph/graph_def_builder_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -34,7 +35,7 @@ TEST(GraphDefBuilderTest, Version) {
 
   // Check version when we convert to a Graph
   Graph graph(OpRegistry::Global());
-  TF_EXPECT_OK(builder.ToGraph(&graph));
+  TF_EXPECT_OK(GraphDefBuilderToGraph(builder, &graph));
   ASSERT_EQ(graph.versions().producer(), TF_GRAPH_DEF_VERSION);
   ASSERT_EQ(graph.versions().min_consumer(), TF_GRAPH_DEF_VERSION_MIN_CONSUMER);
 
diff --git a/tensorflow/core/graph/graph_def_builder_util.cc b/tensorflow/core/graph/graph_def_builder_util.cc
new file mode 100644
index 0000000000..102c72185f
--- /dev/null
+++ b/tensorflow/core/graph/graph_def_builder_util.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+
+#include "tensorflow/core/graph/graph_constructor.h"
+
+namespace tensorflow {
+
+Status GraphDefBuilderToGraph(const GraphDefBuilder& builder, Graph* graph) {
+  GraphDef graph_def;
+  TF_RETURN_IF_ERROR(builder.ToGraphDef(&graph_def));
+  GraphConstructorOptions opts;
+  return ConvertGraphDefToGraph(opts, graph_def, graph);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/graph/graph_def_builder_util.h b/tensorflow/core/graph/graph_def_builder_util.h
new file mode 100644
index 0000000000..4a157e5b71
--- /dev/null
+++ b/tensorflow/core/graph/graph_def_builder_util.h
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_UTIL_H_
+#define TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_UTIL_H_
+
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class Graph;
+
+// Converts the `GraphDef` being built by `builder` to a `Graph` and
+// stores it in `*graph`.
+// TODO(josh11b): Make this faster; right now it converts
+// Graph->GraphDef->Graph.  This cleans up the graph (e.g. adds
+// edges from the source and to the sink node, resolves back edges
+// by name), and makes sure the resulting graph is valid.
+Status GraphDefBuilderToGraph(const GraphDefBuilder& builder, Graph* graph);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_UTIL_H_
diff --git a/tensorflow/core/graph/subgraph_test.cc b/tensorflow/core/graph/subgraph_test.cc
index fde1ea1743..7219d9812f 100644
--- a/tensorflow/core/graph/subgraph_test.cc
+++ b/tensorflow/core/graph/subgraph_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -361,7 +362,7 @@ static void BM_SubgraphHelper(int iters, int num_nodes,
         last_node = ops::SourceOp("In", b.opts().WithName(name));
       }
     }
-    TF_CHECK_OK(b.ToGraph(&g));
+    TF_CHECK_OK(GraphDefBuilderToGraph(b, &g));
   }
 
   std::vector<string> fed;
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index c4e21257ff..8e91baaa1c 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -44,9 +44,10 @@ tf_kernel_library(
     ],
 )
 
+# TODO(mrry): Remove this empty forwarding library.
 cc_library(
     name = "dataset",
-    srcs = ["dataset.cc"],
+    srcs = [],
     hdrs = ["dataset.h"],
     deps = [
         "//tensorflow/core:core_cpu",
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index fc3e291afb..d7d4ad5cf7 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -160,6 +160,10 @@ class IteratorResource : public ResourceBase {
       params.runner = *(ctx->runner());
       params.function_library = flib_def;
       params.lib = lib_;
+      DeviceBase* device = lib_->device();
+      params.allocator_getter = [device](AllocatorAttributes attrs) {
+        return device->GetAllocator(attrs);
+      };
       IteratorContext iter_ctx(std::move(params));
 
       TF_RETURN_IF_ERROR(captured_iterator->Restore(&iter_ctx, reader));
@@ -605,6 +609,11 @@ class ToSingleElementOp : public AsyncOpKernel {
       params.env = ctx->env();
       params.runner = *(ctx->runner());
       params.lib = ctx->function_library();
+      DeviceBase* device = ctx->function_library()->device();
+      params.allocator_getter = [device](AllocatorAttributes attrs) {
+        return device->GetAllocator(attrs);
+      };
+
       IteratorContext iter_ctx(std::move(params));
 
       std::vector<Tensor> components;
@@ -863,6 +872,10 @@ class IteratorGetNextOp : public AsyncOpKernel {
           };
           params.runner = *(ctx->runner());
           params.function_library = iterator->function_library();
+          DeviceBase* device = ctx->function_library()->device();
+          params.allocator_getter = [device](AllocatorAttributes attrs) {
+            return device->GetAllocator(attrs);
+          };
           IteratorContext iter_ctx(std::move(params));
 
           OP_REQUIRES_OK_ASYNC(
@@ -905,6 +918,10 @@ class IteratorGetNextSyncOp : public OpKernel {
     };
     params.runner = *(ctx->runner());
     params.function_library = iterator->function_library();
+    DeviceBase* device = ctx->function_library()->device();
+    params.allocator_getter = [device](AllocatorAttributes attrs) {
+      return device->GetAllocator(attrs);
+    };
     IteratorContext iter_ctx(std::move(params));
 
     OP_REQUIRES_OK(ctx,
-- 
GitLab


From ef16b9cc9a8b77ff589179eb4abda5e282070bff Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 13:46:29 -0800
Subject: [PATCH 777/966] Change output of slim.learning.train to total_loss =
 None, if no training step was actually performed.

PiperOrigin-RevId: 184889196
---
 tensorflow/contrib/slim/python/slim/learning.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 83f33806e0..6a200de1ea 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -738,7 +738,7 @@ def train(train_op,
   if summary_writer is not None:
     train_step_kwargs['summary_writer'] = sv.summary_writer
 
-  total_loss = 0
+  total_loss = None
   should_retry = True
   while should_retry:
     try:
@@ -771,10 +771,10 @@ def train(train_op,
               logging.info('Stopping Training.')
               sv.request_stop()
               break
-        except errors.OutOfRangeError:
+        except errors.OutOfRangeError as e:
           # OutOfRangeError is thrown when epoch limit per
           # tf.train.limit_epochs is reached.
-          logging.info('Caught OutOfRangeError. Stopping Training.')
+          logging.info('Caught OutOfRangeError. Stopping Training. %s', e)
         if logdir and sv.is_chief:
           logging.info('Finished training! Saving model to disk.')
           sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
-- 
GitLab


From 190b918c8c82fe43265d2d101be94715f679d747 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Wed, 7 Feb 2018 13:47:02 -0800
Subject: [PATCH 778/966] Generalize quantization rewrite to not rely on names.

It should now work with most graphs regardless if they were built slim or not.

PiperOrigin-RevId: 184889280
---
 tensorflow/contrib/quantize/BUILD             |   6 +-
 .../contrib/quantize/python/quant_ops.py      |  16 +-
 .../contrib/quantize/python/quantize.py       | 614 ++++++++----------
 .../python/quantize_parameterized_test.py     | 150 +----
 .../contrib/quantize/python/quantize_test.py  |  11 +-
 5 files changed, 313 insertions(+), 484 deletions(-)

diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index b7d525a1fa..ada336e623 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -75,7 +75,9 @@ py_library(
         ":graph_matcher",
         ":input_to_ops",
         "//tensorflow/contrib/graph_editor:graph_editor_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
@@ -83,6 +85,7 @@ py_library(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:ops",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
 )
@@ -162,7 +165,6 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:session",
         "//tensorflow/python:variables",
@@ -174,7 +176,7 @@ py_library(
     srcs = ["python/quantize.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":common",
+        ":graph_matcher",
         ":input_to_ops",
         ":quant_ops",
         "//tensorflow/contrib/graph_editor:graph_editor_py",
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index f80d427ff0..0a8e35080c 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -53,7 +53,7 @@ def LastValueQuantize(inputs,
                       init_max=6.0,
                       updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
-                      scope=None,
+                      name_prefix='LastValueQuant',
                       reuse=None,
                       is_training=True,
                       num_bits=8,
@@ -73,7 +73,7 @@ def LastValueQuantize(inputs,
       computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
-    scope: Optional scope for variable_scope.
+    name_prefix: name_prefix for created nodes.
     reuse: whether or not the layer and its variables should be reused. To be
       able to reuse the layer scope must be given.
     is_training: Whether the op is applied to a training or eval graph.
@@ -84,13 +84,13 @@ def LastValueQuantize(inputs,
     a tensor containing quantized values.
   """
   with variable_scope.variable_scope(
-      scope, 'LastValueQuantize', values=[inputs], reuse=reuse):
+      None, default_name=name_prefix, values=[inputs], reuse=reuse):
     input_shape = inputs.get_shape()
     input_dim = len(input_shape)
     if per_channel:
       # Only support quantizing 1-, 2- and 4-dimensional tensors.
       assert input_dim in [1, 2, 4], ('Expected 1D, 2D or 4D input, was: %s in '
-                                      ' scope: %s' % (input_shape, scope))
+                                      ' scope: %s' % (input_shape, name_prefix))
       min_max_shape = [input_shape[-1]]
     else:
       min_max_shape = []
@@ -165,7 +165,7 @@ def MovingAvgQuantize(inputs,
                       ema_decay=0.999,
                       updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
-                      scope=None,
+                      name_prefix='MovingAvgQuantize',
                       reuse=None,
                       is_training=True,
                       num_bits=8,
@@ -186,7 +186,7 @@ def MovingAvgQuantize(inputs,
       computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
-    scope: Optional scope for variable_scope.
+    name_prefix: name_prefix for created nodes.
     reuse: whether or not the layer and its variables should be reused. To be
       able to reuse the layer scope must be given.
     is_training: Whether the op is applied to a training or eval graph.
@@ -197,13 +197,13 @@ def MovingAvgQuantize(inputs,
     a tensor containing quantized values.
   """
   with variable_scope.variable_scope(
-      scope, 'MovingAvgQuantize', values=[inputs], reuse=reuse):
+      None, default_name=name_prefix, values=[inputs], reuse=reuse):
     input_shape = inputs.get_shape()
     input_dim = len(input_shape)
     if per_channel:
       # Only support quantizing 1-, 2- and 4-dimensional tensors.
       assert input_dim in [1, 2, 4], ('Expected 1D, 2D or 4D input, was: %s in '
-                                      ' scope: %s' % (input_shape, scope))
+                                      ' scope: %s' % (input_shape, name_prefix))
       min_max_shape = [input_shape[-1]]
     else:
       min_max_shape = []
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 50a2b4c91c..1a63b0a2ce 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Logic to update a Tensorflow model graph with quantization operations."""
+"""Logic to update a TensorFlow model graph with quantization operations."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import re
 from tensorflow.contrib import graph_editor
-from tensorflow.contrib.quantize.python import common
+from tensorflow.contrib.quantize.python import graph_matcher
 from tensorflow.contrib.quantize.python import input_to_ops
 from tensorflow.contrib.quantize.python import quant_ops
 from tensorflow.python.framework import ops
@@ -28,30 +28,29 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import training_util
 
-# Operation types used to select operations of interest.
+# Quantizable operation types that are supported by the quantization rewrite.
 _QUANTIZABLE_TYPES = {'Conv2D', 'MatMul', 'DepthwiseConv2dNative'}
 
-# Custom key for storing and retrieving update ops used by quantizing nodes.
-_UPDATE_QUANT_OPS = 'update_quant_ops'
+# Activations that are supported by the quantization rewrite.
+_ACTIVATION_TYPES = {'Relu', 'Relu6', 'Identity'}
+
+# Weight types that are supported by the quantization rewrite.
+# TODO(suharshs): Add support for ResourceVariable.
+_WEIGHT_TYPES = {'Variable', 'VariableV2'}
 
 
 def Quantize(graph,
              weight_bits=8,
-             weight_narrow_range=False,
              activation_bits=8,
              ema_decay=0.999,
              quant_delay=None,
              vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
-             is_training=True,
-             quantize_folded_weights_use_ema=False):
+             is_training=True):
   """Updates graph with quantization operations.
 
   Args:
     graph: Graph to modify.
     weight_bits: Number of bits to use for quantizing weights.
-    weight_narrow_range: Whether to use a more efficient narrow range for
-      weights quantization.  With weight_narrow_range true, the range is
-      [1; 2^weight_bits - 1], with it false [0; 2^weight_bits - 1].
     activation_bits: Number of bits to use for quantizing activations.
     ema_decay: (Optional) Float, EMA decay parameter.  EMA is used to update
       quantization intervals for quantizing activations (see here about EMA:
@@ -62,345 +61,274 @@ def Quantize(graph,
     vars_collection: (Optional) Collection where to store the variables for
       quantization interval ends.
     is_training: (Optional) Whether quantizing training graph or eval graph.
-    quantize_folded_weights_use_ema: (Optional, default False) Whether to
-      quantize weights after batchnorm-folding with exponential average
-      quantization.
   Raises:
     ValueError: When quantization fails.
   """
-  context = _QuantizeContext(graph, weight_bits, weight_narrow_range,
-                             activation_bits, ema_decay, quant_delay,
-                             vars_collection, is_training,
-                             quantize_folded_weights_use_ema)
-
-  graph_ops = graph.get_operations()
-
-  # Filter out backprop and summary related operations, leave only interesting
-  # op types.
-  def _IsInterestingOpWithWeights(op):
-    return (op.type in _QUANTIZABLE_TYPES and
-            not op.name.startswith(common.SKIPPED_PREFIXES))
-
-  for op in (op for op in graph_ops if _IsInterestingOpWithWeights(op)):
-    if op.name.endswith('/depthwise'):
-      # Separable convolution may consist of 2 convolution nodes. If so, skip
-      # .../depthwise and only quantize the top one.
-      separable_conv = context.GetOperationByNameDontThrow(
-          op.name[:-len('/depthwise')])
-      if separable_conv and separable_conv.type == 'Conv2D':
-        continue
-    # Quantize add ops that come after Conv2D or DepthwiseConv2dNative.
-    if op.type in ['Conv2D', 'DepthwiseConv2dNative']:
-      add_context_re = re.search(r'^(.*)/[^/]+/', op.name)
-      if add_context_re is not None:
-        context.add_contexts.add(add_context_re.group(1))
-    if not op.name.endswith('_Fold'):
-      folded_op = context.GetOperationByNameDontThrow(op.name + '_Fold')
-      # Do nothing if found, it will be quantized when it is iterated over.
-      if not folded_op:
-        context.QuantizeOpWithWeights(op, folded=False)
-    else:
-      context.QuantizeOpWithWeights(op, folded=True)
-
-  context.QuantizeAddContexts()
-
-  # Once all quantization ops have been inserted in the graph, collect update
-  # ops for their variables and modify the TF Slim update barrier (see
-  # https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/learning.py)
-  # to depend on them.
-  try:
-    update_barrier = graph.get_operation_by_name('update_barrier')
-  except KeyError:
-    # In evaluation graph, this barrier may not exist.
-    return None
-  update_quant_ops = graph.get_collection_ref(_UPDATE_QUANT_OPS)
-  graph_editor.add_control_inputs(update_barrier, update_quant_ops)
-
-
-class _QuantizeContext(object):
-  """Context holds references needed for quantization."""
-
-  def __init__(self,
-               graph,
-               weight_bits,
-               weight_narrow_range,
-               activation_bits,
-               ema_decay=0.999,
-               quant_delay=None,
-               vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
-               is_training=True,
-               quantize_folded_weights_use_ema=False):
-    """Initializes context to hold references needed for quantization.
-
-    Args:
-      graph: Graph to modify.
-      weight_bits: Number of bits to use for quantizing weights.
-      weight_narrow_range: Whether to use a more efficient narrow range for
-        weights quantization.  With weight_narrow_range true, the range is
-        [1; 2^weight_bits - 1], with it false [0; 2^weight_bits - 1].
-      activation_bits: Number of bits to use for quantizing activations.
-      ema_decay: (Optional) Float, EMA decay parameter.
-      quant_delay: (Optional, default None) Int, count of global steps for which
-        to delay quantization.  This helps weights stabilize at the start of
-        training.
-      vars_collection: (Optional) Collection where to store the variables for
-        quantization interval ends.
-      is_training: (Optional) Whether quantizing training or eval graph.
-      quantize_folded_weights_use_ema: (Optional, default False) Whether to
-        quantize weights after batchnorm-folding with exponential average
-        quantization.
-    """
-    self.graph = graph
-    self.weight_bits = weight_bits
-    self.weight_narrow_range = weight_narrow_range
-    self.activation_bits = activation_bits
-    self.ema_decay = ema_decay
-    self.quant_delay = quant_delay
-    self.vars_collection = vars_collection
-    self.is_training = is_training
-    self.quantize_folded_weights_use_ema = quantize_folded_weights_use_ema
-    self.input_to_ops_map = input_to_ops.InputToOps(graph)
-    self.add_contexts = set()
-
-  def QuantizeAddContexts(self):
-    """Quantizes all add ops in self.add_contexts."""
-    # Loop through sorted self.add_contexts so that op creation is
-    # deterministic. This is needed when using multiple worker replicas so that
-    # the ops can be initialized consistently.
-    for add_context in sorted(self.add_contexts):
-      add_op = self.GetOperationByNamesDontThrow([
-          add_context + '/Add', add_context + '/add'])
-      if add_op is not None:
-        self._InsertQuantOp(
-            add_context,
-            add_op,
-            self.input_to_ops_map.ConsumerOperations(add_op),
-            name='add_quant',
-            moving_avg=True,
-            bits=self.activation_bits,
-            narrow_range=False)
-
-  def QuantizeOpWithWeights(self, op, folded):
-    """Quantizes around the specific operation with or without batch norm.
-
-    Args:
-      op: Operation to quantize.
-      folded: Operation has been folded and needs special handling if True.
-    Raises:
-      ValueError: When quantization fails.
-    """
-    # Op name component before the last slash will be used as context.
-    context = re.search(r'^(.*)/([^/]+)', op.name).group(1)
-
-    # Quantize weights.
-    if folded:
-      producer_op = self.graph.get_operation_by_name(context + '/mul_fold')
-    else:
-      try:
-        input_idx = next(i for i, v in enumerate(op.inputs)
-                         if '/weights/' in v.name or
-                         '/depthwise_weights' in v.name)
-      except StopIteration:
-        raise ValueError('No inputs to quantize for op: %s' % op)
-      producer_op = op.inputs[input_idx].op
-
-    # If batch norm is used, the folded weights depend on the batch std, hence
-    # it is sensible to use EMA during training to smooth out the noise. This is
-    # controlled by the flag quantize_folded_weights_use_ema. Its default is
-    # False for backward compatibility.
-    # If there is no batch norm, weights do not depend on the batch and using
-    # the latest value of min and max is more efficient.
-    weight_use_ema = folded and self.quantize_folded_weights_use_ema
-    self._InsertQuantOp(
+  input_to_ops_map = input_to_ops.InputToOps(graph)
+  for layer_match in _FindLayersToQuantize(graph):
+    # Quantize the weights.
+    context = _GetContextFromOp(layer_match.layer_op)
+    _InsertQuantOp(
         context,
-        producer_op, [op],
+        layer_match.weight_tensor.op, [layer_match.layer_op],
         name='weights_quant',
-        moving_avg=weight_use_ema,
-        delay_requested=weight_use_ema,
-        bits=self.weight_bits,
-        narrow_range=self.weight_narrow_range)
-
-    # Important: do not quantize biases here.  During inference they are
-    # quantized to 32 bits, which is much finer than 8 bit quantization and
-    # depends on weight and input activation ranges.
-
-    # Find activation and (optionally) Add operations to quantize.
-    activation_op, add_op, add_context = self._GetReluAndAddOperations(context,
-                                                                       op)
-    if add_op:
-      original_context = context
-      context = add_context
-
-    # Quantize activation outputs.
-    consumer_ops = self.input_to_ops_map.ConsumerOperations(activation_op)
-    self._InsertQuantOp(
-        context,
-        activation_op,
+        moving_avg=False,
+        bits=weight_bits,
+        ema_decay=ema_decay,
+        quant_delay=quant_delay,
+        is_training=is_training,
+        narrow_range=True,
+        vars_collection=vars_collection)
+
+    # Quantize the activations.
+    consumer_ops = input_to_ops_map.ConsumerOperations(
+        layer_match.activation_op)
+    add_context = context
+    if layer_match.bypass_op:
+      add_context = re.search(r'^(.*)/([^/]+)', context).group(1)
+    _InsertQuantOp(
+        add_context,
+        layer_match.activation_op,
         consumer_ops,
         name='act_quant',
         moving_avg=True,
         init_min=0.0,
-        bits=self.activation_bits,
-        narrow_range=False)
-
-    # When a bypass connection was found, also quantize Add op input.
-    if add_op:
-      def _QuantizeAddInput(add_input):
-        if folded:
-          return add_input.op.name.endswith('/add_fold')
-        else:
-          return add_input.op.name.startswith(original_context + '/')
-
-      for add_input in add_op.inputs:
-        if _QuantizeAddInput(add_input):
-          self._InsertQuantOp(
-              original_context,
-              add_input.op, [add_op],
-              name='conv_quant',
-              moving_avg=True,
-              bits=self.activation_bits,
-              narrow_range=False)
-
-  def _GetReluAndAddOperations(self, context, op):
-    """Looks up a Relu* and Add operations in given context.
-
-    Args:
-      context: Context where to look for operations.
-      op: Operation to quantize.
-
-    Returns:
-      A triplet (Operation, Operation, string), the first element is an end
-      point operation, the second is Add operation (optional), the third element
-      is string context where the Add operation was found (optional).
-
-    Raises:
-      ValueError: When operations cannot be found.
-    """
-    activation_op = common.GetEndpointActivationOp(self.graph, context)
-    if activation_op:
-      return activation_op, None, None
-
-    if '/' in context:
-      # If no activation op is there, look for them one level up.
-      add_context = re.search(r'^(.*)/([^/]+)', context).group(1)
-      activation_op = common.GetEndpointActivationOp(self.graph, add_context)
-    if not activation_op:
-      # Still no Relu, can happen on the top layer, just find the next node up,
-      # make sure it is BiasAdd.
-      consumers = [c for outp in op.outputs for c in outp.consumers()]
-      if len(consumers) != 1 or consumers[0].type != 'BiasAdd':
-        raise ValueError('Failed to quantize op: %s, %s' % (op.name, op.type))
-      return consumers[0], None, None
-    if add_context:
-      add_op = self.GetOperationByNamesDontThrow([
-          add_context + '/Add', add_context + '/add'])
-      return activation_op, add_op, add_context
-    else:
-      raise ValueError('Failed to quantize op: %s, %s' % (op.name, op.type))
-
-  def GetOperationByNameDontThrow(self, name):
-    """Returns an Operation with the given name.
-
-    Args:
-      name: Name of Operation to return.
-
-    Returns:
-      The Operation with the given name. None if the name does not correspond to
-      any operation in the graph.
-    """
-    try:
-      return self.graph.get_operation_by_name(name)
-    except KeyError:
-      return None
-
-  def GetOperationByNamesDontThrow(self, names):
-    """Returns an Operation with one of the given names.
-
-    Args:
-      names: Names of Operation to return.
-
-    Returns:
-      The Operation with one of the given names. None if none of the names
-      corresponds to any operation in the graph.
-    """
-    for name in names:
-      op = self.GetOperationByNameDontThrow(name)
-      if op is not None:
-        return op
-    return None
-
-  def _InsertQuantOp(
-      self,
-      context,
-      producer,
-      consumers,
-      name,
-      moving_avg=True,
-      init_min=-6.0,
-      init_max=6.0,
-      delay_requested=True,
-      bits=8,
-      narrow_range=False,):
-    """Inserts a quant op between a producer op and (multiple) consumer ops.
-
-    Args:
-      context: Context where producer and consumer operations are nested.
-      producer: Producer operation of the pairs where quantization will be
-        inserted.
-      consumers: Consumer operations of the pairs.
-      name: Name for the new quantization op within the context.
-      moving_avg: Specifies whether to use exponential moving average or just
-        the last value seen.
-      init_min: Starting minimum value for the new quantization op.
-      init_max: Starting maximum value for the new quantization op.
-      delay_requested: If true, implement quantization delay where needed.
-        False value explicitly disables delay quantization everywhere.
-      bits: Number of bits to use for quantization, must be between 2 and 8.
-      narrow_range: Whether to use the narrow quantization range
+        ema_decay=ema_decay,
+        quant_delay=quant_delay,
+        bits=activation_bits,
+        vars_collection=vars_collection)
+
+    # Quantize the inputs and output to the bypass (if it exists). The input to
+    # the bypass is the bias add, and the output is the activation.
+    if layer_match.bypass_op is not None:
+      _InsertQuantOp(
+          context,
+          layer_match.bias_add_op, [layer_match.bypass_op],
+          name='conv_quant',
+          moving_avg=True,
+          ema_decay=ema_decay,
+          quant_delay=quant_delay,
+          vars_collection=vars_collection,
+          bits=activation_bits)
+      _InsertQuantOp(
+          add_context,
+          layer_match.bypass_op,
+          input_to_ops_map.ConsumerOperations(layer_match.bypass_op),
+          name='add_quant',
+          moving_avg=True,
+          bits=activation_bits)
+
+
+def _FindLayersToQuantize(graph):
+  """Matches layers in graph to quantize.
+
+  Args:
+    graph: Graph to perform match on.
+
+  Yields:
+    _LayerMatches.
+  """
+  input_pattern = graph_matcher.OpTypePattern('*')
+  weight_var_pattern = graph_matcher.OpTypePattern('|'.join(_WEIGHT_TYPES))
+  weight_pattern = graph_matcher.OpTypePattern(
+      'Identity', inputs=[weight_var_pattern])
+
+  folded_weight_pattern = graph_matcher.OpTypePattern('Mul')
+
+  # The weights inputs to the layer operation can either be from the Variable or
+  # the folded weight (Mul).
+  layer_pattern = graph_matcher.OpTypePattern(
+      '|'.join(_QUANTIZABLE_TYPES),
+      inputs=[
+          input_pattern,
+          graph_matcher.OneofPattern([weight_pattern, folded_weight_pattern])
+      ])
+
+  folded_bias_mul_pattern = graph_matcher.OpTypePattern(
+      'Mul', inputs=[graph_matcher.OpTypePattern('*'), layer_pattern])
+  post_layer_op_correction_pattern = graph_matcher.OpTypePattern(
+      'Add', inputs=[folded_bias_mul_pattern,
+                     graph_matcher.OpTypePattern('*')])
+  folded_bias_add_pattern = graph_matcher.OpTypePattern(
+      'Add',
+      inputs=[
+          post_layer_op_correction_pattern,
+          graph_matcher.OpTypePattern('*')
+      ])
+
+  bias_add_pattern = graph_matcher.OpTypePattern(
+      'Add|BiasAdd', inputs=[layer_pattern, '*'])
+
+  # The bias can come from the bias add or the folded bias add.
+  bypass_pattern_a = graph_matcher.OpTypePattern(
+      'Add',
+      inputs=[
+          graph_matcher.OneofPattern(
+              [bias_add_pattern, folded_bias_add_pattern]), '*'
+      ])
+  bypass_pattern_b = graph_matcher.OpTypePattern(
+      'Add',
+      inputs=[
+          '*',
+          graph_matcher.OneofPattern(
+              [bias_add_pattern, folded_bias_add_pattern])
+      ])
+
+  # The input to the activation can come from bias add, fold bias add or the
+  # bypasses.
+  activation_pattern = graph_matcher.OpTypePattern(
+      '|'.join(_ACTIVATION_TYPES),
+      inputs=[
+          graph_matcher.OneofPattern([
+              bias_add_pattern, folded_bias_add_pattern, bypass_pattern_a,
+              bypass_pattern_b
+          ])
+      ])
+
+  layer_matcher = graph_matcher.GraphMatcher(activation_pattern)
+  for match_result in layer_matcher.match_graph(graph):
+    layer_op = match_result.get_op(layer_pattern)
+    weight_tensor = match_result.get_tensor(weight_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(folded_weight_pattern)
+    activation_op = match_result.get_op(activation_pattern)
+    bias_add_op = match_result.get_op(bias_add_pattern)
+    if bias_add_op is None:
+      bias_add_op = match_result.get_op(folded_bias_add_pattern)
+    bypass_op = match_result.get_op(bypass_pattern_a)
+    if bypass_op is None:
+      bypass_op = match_result.get_op(bypass_pattern_b)
+    yield _LayerMatch(layer_op, weight_tensor, activation_op, bypass_op,
+                      bias_add_op)
+
+
+class _LayerMatch(object):
+  """Contains all information related to a matched Layer."""
+
+  def __init__(self, layer_op, weight_tensor, activation_op, bypass_op,
+               bias_add_op):
+    self._layer_op = layer_op
+    self._weight_tensor = weight_tensor
+    self._activation_op = activation_op
+    self._bypass_op = bypass_op
+    self._bias_add_op = bias_add_op
+
+  @property
+  def layer_op(self):
+    return self._layer_op
+
+  @property
+  def weight_tensor(self):
+    return self._weight_tensor
+
+  @property
+  def activation_op(self):
+    return self._activation_op
+
+  @property
+  def bypass_op(self):
+    return self._bypass_op
+
+  @property
+  def bias_add_op(self):
+    return self._bias_add_op
+
+
+def _InsertQuantOp(context,
+                   producer,
+                   consumers,
+                   name,
+                   moving_avg=True,
+                   init_min=-6.0,
+                   init_max=6.0,
+                   bits=8,
+                   ema_decay=0.999,
+                   quant_delay=None,
+                   vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
+                   is_training=True,
+                   narrow_range=False):
+  """Inserts a quant op between a producer op and (multiple) consumer ops.
+
+  Args:
+    context: Context w,here producer and consumer operations are nested.
+    producer: Producer operation of the pairs where quantization will be
+      inserted.
+    consumers: Consumer operations of the pairs.
+    name: Name for the new quantization op within the context.
+    moving_avg: Specifies whether to use exponential moving average or just
+      the last value seen.
+    init_min: Starting minimum value for the new quantization op.
+    init_max: Starting maximum value for the new quantization op.
+    bits: Number of bits to use for quantization, must be between 2 and 8.
+    ema_decay: (Optional) Float, EMA decay parameter.  EMA is used to update
+      quantization intervals for quantizing activations (see here about EMA:
+      https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
+    quant_delay: (Optional, default None) Int, count of global steps for which
+      to delay quantization.  This helps weights stabilize at the start of
+      training.
+    vars_collection: (Optional) Collection where to store the variables for
+      quantization interval ends.
+    is_training: (Optional) Whether quantizing training graph or eval graph.
+    narrow_range: Whether to use the narrow quantization range
         [1; 2^bits - 1] or wide range [0; 2^bits - 1].
-    Raises:
-      ValueError: When producer operation is not directly connected to the
-        consumer operation.
-    """
-    scope = context + '/' + name
-    inputs = producer.outputs[0]
-    if moving_avg:
-      quant = (quant_ops.MovingAvgQuantize(
-          inputs,
-          init_min=init_min,
-          init_max=init_max,
-          ema_decay=self.ema_decay,
-          is_training=self.is_training,
-          num_bits=bits,
-          narrow_range=narrow_range,
-          updates_collection=_UPDATE_QUANT_OPS,
-          vars_collection=self.vars_collection,
-          scope=scope))
-    else:
-      quant = (quant_ops.LastValueQuantize(
-          inputs,
-          init_min=init_min,
-          init_max=init_max,
-          is_training=self.is_training,
-          num_bits=bits,
-          narrow_range=narrow_range,
-          updates_collection=_UPDATE_QUANT_OPS,
-          vars_collection=self.vars_collection,
-          scope=scope))
-
-    if delay_requested and self.quant_delay and self.quant_delay > 0:
-      activate_quant = math_ops.greater_equal(
-          training_util.get_or_create_global_step(),
-          self.quant_delay,
-          name=scope + '/activate_quant')
-      quant = control_flow_ops.cond(
-          activate_quant,
-          lambda: quant,
-          lambda: inputs,
-          name=scope + '/delayed_quant')
-
-    nodes_modified_count = graph_editor.reroute_ts(
-        [quant], [inputs], can_modify=consumers)
-    if nodes_modified_count != len(consumers):
-      raise ValueError('Some inputs not quantized for ops: [%s]' %
-                       ', '.join([consumer.name for consumer in consumers]))
+  Raises:
+    ValueError: When producer operation is not directly connected to the
+      consumer operation.
+  """
+  name_prefix = _AddContextToName(context, name)
+  inputs = producer.outputs[0]
+  if moving_avg:
+    quant = (
+        quant_ops.MovingAvgQuantize(
+            inputs,
+            init_min=init_min,
+            init_max=init_max,
+            ema_decay=ema_decay,
+            is_training=is_training,
+            num_bits=bits,
+            narrow_range=narrow_range,
+            vars_collection=vars_collection,
+            name_prefix=name_prefix))
+  else:
+    quant = (
+        quant_ops.LastValueQuantize(
+            inputs,
+            init_min=init_min,
+            init_max=init_max,
+            is_training=is_training,
+            num_bits=bits,
+            narrow_range=narrow_range,
+            vars_collection=vars_collection,
+            name_prefix=name_prefix))
+
+  if quant_delay and quant_delay > 0:
+    activate_quant = math_ops.greater_equal(
+        training_util.get_or_create_global_step(),
+        quant_delay,
+        name=name_prefix + '/activate_quant')
+    quant = control_flow_ops.cond(
+        activate_quant,
+        lambda: quant,
+        lambda: inputs,
+        name=name_prefix + '/delayed_quant')
+
+  nodes_modified_count = graph_editor.reroute_ts(
+      [quant], [inputs], can_modify=consumers)
+  if nodes_modified_count != len(consumers):
+    raise ValueError('Some inputs not quantized for ops: [%s]' % ', '.join(
+        [consumer.name for consumer in consumers]))
+
+
+def _GetContextFromOp(op):
+  """Gets the root context name from the op name."""
+  context_re = re.search(r'^(.*)/([^/]+)', op.name)
+  if context_re:
+    return context_re.group(1)
+  return ''
+
+
+def _AddContextToName(context, name):
+  """Adds the context to the name if it exists."""
+  if not context:
+    return name
+  return context + '/' + name
diff --git a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
index 57dab03f16..f1fe322049 100644
--- a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
@@ -101,7 +101,9 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = scope + '/Conv2D'
+    output_op_name = (
+        scope + '/weights_quant/delayed_quant/Switch_1'
+        if delay else scope + '/Conv2D')
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -176,7 +178,9 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = scope + '/MatMul'
+    output_op_name = (
+        scope + '/weights_quant/delayed_quant/Switch_1'
+        if delay else scope + '/MatMul')
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -252,7 +256,9 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         scope + '/depthwise_weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = scope + '/depthwise'
+    output_op_name = (
+        scope + '/weights_quant/delayed_quant/Switch_1'
+        if delay else scope + '/depthwise')
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -316,40 +322,11 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     for params in parameters_list:
       test_fn(params[0], params[1], params[2], params[3], params[4])
 
-  def _TestQuantize_Conv2dWithBatchNorm(self, activation, activation_op_name,
-                                        with_bypass, delay, fused_batch_norm):
-    """Tests quantization: inputs -> Conv2d with batch norm -> Activation.
-
-    Args:
-      activation: Callable that returns an Operation, a factory method for the
-        Activation.
-      activation_op_name: String, name of the Activation operation.
-      with_bypass: Bool, when true there is an extra connection added from
-        inputs to just before Activation.
-      delay: Int (optional), delay in number of steps until quantization starts.
-      fused_batch_norm: Bool, when true use FusedBatchNorm.
-    """
-    self._testQuantize_Conv2dWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=True)
-    self._testQuantize_Conv2dWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=False)
-
   def testQuantize_Conv2dWithBatchNorm(self):
     self._RunBatchNormTestOverParameters(self._TestQuantize_Conv2dWithBatchNorm)
 
-  def _testQuantize_Conv2dWithBatchNorm(self, activation, activation_op_name,
-                                        with_bypass, delay, fused_batch_norm,
-                                        use_ema):
+  def _TestQuantize_Conv2dWithBatchNorm(self, activation, activation_op_name,
+                                        with_bypass, delay, fused_batch_norm):
     """Tests quantization: inputs -> Conv2d with batch norm -> Activation.
 
     Args:
@@ -360,7 +337,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         inputs to just before Activation.
       delay: Int (optional), delay in number of steps until quantization starts.
       fused_batch_norm: Bool, when true use FusedBatchNorm.
-      use_ema: Bool, when true uses EMA quantization for BN folded weights.
     """
     graph = ops.Graph()
     with graph.as_default():
@@ -394,23 +370,19 @@ class QuantizeTest(test_util.TensorFlowTestCase):
 
       fold_batch_norms.FoldBatchNorms(graph)
 
-      quantize.Quantize(
-          graph, quant_delay=delay, quantize_folded_weights_use_ema=use_ema)
+      quantize.Quantize(graph, quant_delay=delay)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/' + ('AssignMinEma'
-                                     if use_ema else 'AssignMinLast'),
-        scope + '/weights_quant/' + ('AssignMaxEma'
-                                     if use_ema else 'AssignMaxLast'),
-        scope + '/mul_fold'
+        scope + '/weights_quant/' + 'AssignMinLast',
+        scope + '/weights_quant/' + 'AssignMaxLast', scope + '/mul_fold'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
     output_op_name = scope + ('/weights_quant/delayed_quant/Switch_1'
-                              if (delay and use_ema) else '/Conv2D_Fold')
+                              if delay else '/Conv2D_Fold')
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -438,40 +410,11 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                       if delay else 'control_dependency')
     self._AssertOutputGoesToOps(act_quant, graph, [output_op_name])
 
-  def _TestQuantize_FCWithBatchNorm(self, activation, activation_op_name,
-                                    with_bypass, delay, fused_batch_norm):
-    """Tests quantization: inputs -> FC with batch norm -> Activation.
-
-    Args:
-      activation: Callable that returns an Operation, a factory method for the
-        Activation.
-      activation_op_name: String, name of the Activation operation.
-      with_bypass: Bool, when true there is an extra connection added from
-        inputs to just before Activation.
-      delay: Int (optional), delay in number of steps until quantization starts.
-      fused_batch_norm: Bool, when true use FusedBatchNorm.
-    """
-    self._testQuantize_FCWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=True)
-    self._testQuantize_FCWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=False)
-
   def testQuantize_FCWithBatchNorm(self):
     self._RunBatchNormTestOverParameters(self._TestQuantize_FCWithBatchNorm)
 
-  def _testQuantize_FCWithBatchNorm(self, activation, activation_op_name,
-                                    with_bypass, delay, fused_batch_norm,
-                                    use_ema):
+  def _TestQuantize_FCWithBatchNorm(self, activation, activation_op_name,
+                                    with_bypass, delay, fused_batch_norm):
     """Tests quantization: inputs -> FC with batch norm -> Activation.
 
     Args:
@@ -482,7 +425,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         inputs to just before Activation.
       delay: Int (optional), delay in number of steps until quantization starts.
       fused_batch_norm: Bool, when true use FusedBatchNorm.
-      use_ema: Bool, when true uses EMA quantization for BN folded weights.
     """
     graph = ops.Graph()
     with graph.as_default():
@@ -513,23 +455,19 @@ class QuantizeTest(test_util.TensorFlowTestCase):
 
       fold_batch_norms.FoldBatchNorms(graph)
 
-      quantize.Quantize(
-          graph, quant_delay=delay, quantize_folded_weights_use_ema=use_ema)
+      quantize.Quantize(graph, quant_delay=delay)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/' + ('AssignMinEma'
-                                     if use_ema else 'AssignMinLast'),
-        scope + '/weights_quant/' + ('AssignMaxEma'
-                                     if use_ema else 'AssignMaxLast'),
-        scope + '/mul_fold'
+        scope + '/weights_quant/' + 'AssignMinLast',
+        scope + '/weights_quant/' + 'AssignMaxLast', scope + '/mul_fold'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
     output_op_name = scope + ('/weights_quant/delayed_quant/Switch_1'
-                              if delay and use_ema else '/MatMul_Fold')
+                              if delay else '/MatMul_Fold')
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -557,42 +495,13 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                       if delay else 'control_dependency')
     self._AssertOutputGoesToOps(act_quant, graph, [output_op_name])
 
-  def _TestQuantize_DepthwiseConv2dWithBatchNorm(
-      self, activation, activation_op_name, with_bypass, delay,
-      fused_batch_norm):
-    """Tests quantization: inputs -> DWConv2d with batch norm -> Activation.
-
-    Args:
-      activation: Callable that returns an Operation, a factory method for the
-        Activation.
-      activation_op_name: String, name of the Activation operation.
-      with_bypass: Bool, when true there is an extra connection added from
-        inputs to just before Activation.
-      delay: Int (optional), delay in number of steps until quantization starts.
-      fused_batch_norm: Bool, when true use FusedBatchNorm.
-    """
-    self._testQuantize_DepthwiseConv2dWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=True)
-    self._testQuantize_DepthwiseConv2dWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=False)
-
   def testQuantize_DepthwiseConv2dWithBatchNorm(self):
     self._RunBatchNormTestOverParameters(
         self._TestQuantize_DepthwiseConv2dWithBatchNorm)
 
-  def _testQuantize_DepthwiseConv2dWithBatchNorm(
+  def _TestQuantize_DepthwiseConv2dWithBatchNorm(
       self, activation, activation_op_name, with_bypass, delay,
-      fused_batch_norm, use_ema):
+      fused_batch_norm):
     """Tests quantization: inputs -> DWConv2d with batch norm -> Activation.
 
     Args:
@@ -603,7 +512,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         inputs to just before Activation.
       delay: Int (optional), delay in number of steps until quantization starts.
       fused_batch_norm: Bool, when true use FusedBatchNorm.
-      use_ema: Bool, when true uses EMA quantization for BN folded weights.
     """
     graph = ops.Graph()
     with graph.as_default():
@@ -637,22 +545,18 @@ class QuantizeTest(test_util.TensorFlowTestCase):
 
       fold_batch_norms.FoldBatchNorms(graph)
 
-      quantize.Quantize(
-          graph, quant_delay=delay, quantize_folded_weights_use_ema=use_ema)
+      quantize.Quantize(graph, quant_delay=delay)
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/' + ('AssignMinEma'
-                                     if use_ema else 'AssignMinLast'),
-        scope + '/weights_quant/' + ('AssignMaxEma'
-                                     if use_ema else 'AssignMaxLast'),
-        scope + '/mul_fold'
+        scope + '/weights_quant/' + 'AssignMinLast',
+        scope + '/weights_quant/' + 'AssignMaxLast', scope + '/mul_fold'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
     output_op_name = scope + ('/weights_quant/delayed_quant/Switch_1'
-                              if delay and use_ema else '/depthwise_Fold')
+                              if delay else '/depthwise_Fold')
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index 1e4dd7cf67..53cbd66741 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -45,13 +45,10 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                     activation_fn=None, scope='test')
       relu = nn_ops.relu6(inputs)
 
-    context = quantize._QuantizeContext(graph=graph, weight_bits=8,
-                                        weight_narrow_range=True,
-                                        activation_bits=8)
     # Inserting a quantization op between two unconnected ops should fail with
     # ValueError.
     with self.assertRaises(ValueError) as err:
-      context._InsertQuantOp('test', conv.op, [relu.op], 'FailingQuantOp')
+      quantize._InsertQuantOp('test', conv.op, [relu.op], 'FailingQuantOp')
     self.assertEqual(
         str(err.exception), 'Some inputs not quantized for ops: [Relu6]')
 
@@ -70,8 +67,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
 
-    quantize.Quantize(graph=graph, weight_bits=8, weight_narrow_range=True,
-                      activation_bits=8)
+    quantize.Quantize(graph=graph, weight_bits=8, activation_bits=8)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     add_quant = graph.get_operation_by_name('test/add_quant/' +
@@ -94,8 +90,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
 
-    quantize.Quantize(graph=graph, weight_bits=8, weight_narrow_range=True,
-                      activation_bits=8)
+    quantize.Quantize(graph=graph, weight_bits=8, activation_bits=8)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     add_quant = graph.get_operation_by_name('test/add_quant/' +
-- 
GitLab


From 8461760f9f6cde8ed97507484d2a879140141032 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 7 Feb 2018 14:13:05 -0800
Subject: [PATCH 779/966] Better documentation for contrib summaries.

Also all_summary_ops returns None in eager mode instead of error.

PiperOrigin-RevId: 184893777
---
 tensorflow/contrib/summary/summary.py     | 36 +++++++++++++++++++++++
 tensorflow/contrib/summary/summary_ops.py | 12 ++++----
 2 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index 7d3b8b7437..2d6d7ea6a3 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -18,6 +18,42 @@ The operations in this package are safe to use with eager execution turned on or
 off. It has a more flexible API that allows summaries to be written directly
 from ops to places other than event log files, rather than propagating protos
 from @{tf.summary.merge_all} to @{tf.summary.FileWriter}.
+
+To use with eager execution enabled, write your code as follows:
+
+global_step = tf.train.get_or_create_global_step()
+summary_writer = tf.contrib.summary.create_file_writer(
+    train_dir, flush_millis=10000)
+with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
+  # model code goes here
+  # and in it call
+  tf.contrib.summary.scalar("loss", my_loss)
+  # In this case every call to tf.contrib.summary.scalar will generate a record
+  # ...
+
+To use it with graph execution, write your code as follows:
+
+global_step = tf.train.get_or_create_global_step()
+summary_writer = tf.contrib.summary.create_file_writer(
+    train_dir, flush_millis=10000)
+with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
+  # model definition code goes here
+  # and in it call
+  tf.contrib.summary.scalar("loss", my_loss)
+  # In this case every call to tf.contrib.summary.scalar will generate an op,
+  # note the need to run tf.contrib.summary.all_summary_ops() to make sure these
+  # ops get executed.
+  # ...
+  train_op = ....
+
+with tf.Session(...) as sess:
+  tf.global_variables_initializer().run()
+  tf.contrib.summary.initialize(graph=tf.get_default_graph())
+  # ...
+  while not_done_training:
+    sess.run([train_op, tf.contrib.summary.all_summary_ops()])
+    # ...
+
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py
index a6968d8b2a..068ae35c71 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/contrib/summary/summary_ops.py
@@ -154,10 +154,12 @@ def initialize(
       to @{tf.get_default_session}.
 
   Raises:
-    RuntimeError: If in eager mode, or if the current thread has no
-      default @{tf.contrib.summary.SummaryWriter}.
+    RuntimeError: If  the current thread has no default
+      @{tf.contrib.summary.SummaryWriter}.
     ValueError: If session wasn't passed and no default session.
   """
+  if context.in_eager_mode():
+    return
   if context.context().summary_writer_resource is None:
     raise RuntimeError("No default tf.contrib.summary.SummaryWriter found")
   if session is None:
@@ -292,13 +294,9 @@ def all_summary_ops():
 
   Returns:
     The summary ops.
-
-  Raises:
-    RuntimeError: If in Eager mode.
   """
   if context.in_eager_mode():
-    raise RuntimeError(
-        "tf.contrib.summary.all_summary_ops is only supported in graph mode.")
+    return None
   return ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
 
 
-- 
GitLab


From 7200ecfef82cac74ba0fb1bd58a9a7b69bba27b7 Mon Sep 17 00:00:00 2001
From: Andrew Harp <andrewharp@users.noreply.github.com>
Date: Wed, 7 Feb 2018 17:19:11 -0500
Subject: [PATCH 780/966] Bump JetPack default to 3.2 in Android build script
 (#16842)

---
 tensorflow/contrib/makefile/build_all_android.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index f67c516186..fc88f59e09 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -52,7 +52,7 @@ shift $((OPTIND - 1))
 
 if [ "$ARCH" == "tegra" ]; then
     if [[ -z "${JETPACK}" ]]; then
-        export JETPACK="$HOME/JetPack_Android_3.0"
+        export JETPACK="$HOME/JetPack_Android_3.2"
     fi
     if [ ! -d ${JETPACK} ]; then
         echo "Can't find Jetpack at ${JETPACK}"
-- 
GitLab


From 8d6f10ff1b56289ef55197b21196e40dbacb8299 Mon Sep 17 00:00:00 2001
From: Francisco Guerrero <frankgh@users.noreply.github.com>
Date: Wed, 7 Feb 2018 17:19:25 -0500
Subject: [PATCH 781/966] =?UTF-8?q?Fixes=20issue=20when=20linking=20of=20r?=
 =?UTF-8?q?ule=20'//tensorflow/contrib/lite/toco:toco=E2=80=A6=20(#16838)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fixes issue when linking of rule '//tensorflow/contrib/lite/toco:toco' fails because LD_LIBRARY_PATH is not configured

* Check if LD_LIBRARY_PATH is in environ_cp
---
 configure.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/configure.py b/configure.py
index 27519b4aba..151ad5dba8 100644
--- a/configure.py
+++ b/configure.py
@@ -1400,6 +1400,8 @@ def main():
     if is_linux():
       set_tf_tensorrt_install_path(environ_cp)
     set_tf_cuda_compute_capabilities(environ_cp)
+    if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get('LD_LIBRARY_PATH') != '1':
+      write_action_env_to_bazelrc('LD_LIBRARY_PATH', environ_cp.get('LD_LIBRARY_PATH'))
 
     set_tf_cuda_clang(environ_cp)
     if environ_cp.get('TF_CUDA_CLANG') == '1':
-- 
GitLab


From d90054e7c0f41f4bab81df0548577a73b939a87a Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 7 Feb 2018 14:36:00 -0800
Subject: [PATCH 782/966] Merge changes from github.

PiperOrigin-RevId: 184897758
---
 ISSUE_TEMPLATE.md                             |   2 +-
 README.md                                     |   4 +-
 RELEASE.md                                    |  27 +-
 WORKSPACE                                     |   8 +-
 configure.py                                  |   7 +-
 tensorflow/BUILD                              |   6 +
 tensorflow/cc/BUILD                           |   1 +
 .../cc/tools/freeze_saved_model_test.cc       |   2 +-
 tensorflow/compiler/aot/BUILD                 |  20 +-
 tensorflow/compiler/aot/tests/BUILD           |  45 +-
 tensorflow/compiler/tests/binary_ops_test.py  |   8 +-
 .../compiler/tf2xla/kernels/pooling_ops.cc    |  49 +-
 .../compiler/xla/client/computation_builder.h |   2 +-
 .../compiler/xla/tools/parser/hlo_parser.cc   |   2 +-
 tensorflow/contrib/BUILD                      |   2 +
 .../android/TensorFlowInferenceInterface.java |  10 +
 tensorflow/contrib/cmake/python_modules.txt   |   3 +
 tensorflow/contrib/cmake/tf_core_ops.cmake    |   1 +
 tensorflow/contrib/cmake/tf_python.cmake      |   1 +
 .../contrib/cmake/tools/create_def_file.py    |   6 +-
 tensorflow/contrib/coder/README.md            |   2 +-
 .../contrib/coder/kernels/range_coder.cc      |   2 +-
 .../kernel_tests/cudnn_rnn_ops_benchmark.py   |   1 +
 tensorflow/contrib/eager/python/evaluator.py  |   2 +-
 .../eager/python/examples/resnet50/README.md  |   2 +-
 .../python/examples/resnet50/resnet50.py      |   2 +-
 .../python/examples/resnet50/resnet50_test.py |   1 +
 .../eager/python/examples/rnn_ptb/README.md   |   2 +-
 .../eager/python/examples/rnn_ptb/rnn_ptb.py  |   5 +-
 .../eager/python/examples/spinn/data.py       |  10 +-
 .../eager/python/examples/spinn/spinn_test.py |   1 +
 .../contrib/eager/python/network_test.py      |   4 +-
 tensorflow/contrib/eager/python/saver.py      |   2 +-
 tensorflow/contrib/ffmpeg/decode_video_op.cc  |  12 +-
 .../contrib/framework/python/ops/variables.py |   4 +-
 .../eval/python/classifier_metrics_impl.py    |  31 +-
 .../python/losses/python/losses_impl_test.py  |   2 +-
 tensorflow/contrib/hvx/README.md              | 137 ++---
 tensorflow/contrib/kafka/BUILD                | 105 ++++
 tensorflow/contrib/kafka/__init__.py          |  32 ++
 .../kafka/kernels/kafka_dataset_ops.cc        | 321 ++++++++++++
 tensorflow/contrib/kafka/ops/kafka_ops.cc     |  44 ++
 .../kafka/python/kernel_tests/kafka_test.py   | 115 +++++
 .../kafka/python/kernel_tests/kafka_test.sh   |  48 ++
 .../kafka/python/ops/kafka_dataset_ops.py     |  74 +++
 tensorflow/contrib/layers/__init__.py         |   1 +
 .../contrib/layers/python/layers/layers.py    |  37 +-
 .../layers/python/layers/layers_test.py       |  14 +
 .../learn/python/learn/datasets/synthetic.py  |   2 +-
 .../python/learn/datasets/synthetic_test.py   |   3 +
 .../learn/python/learn/estimators/dnn_test.py |   2 +-
 tensorflow/contrib/lite/build_def.bzl         |  10 +-
 .../contrib/lite/examples/label_image/BUILD   |  10 +-
 .../examples/label_image/bitmap_helpers.h     |  16 +-
 .../label_image/bitmap_helpers_impl.h         |  87 +++-
 .../lite/examples/label_image/label_image.cc  |  48 +-
 .../lite/examples/label_image/label_image.h   |   7 +-
 .../lite/examples/label_image/label_image.md  |  12 +-
 .../contrib/lite/kernels/internal/BUILD       |  24 +
 .../kernels/internal/optimized/cpu_check.h    |   2 +-
 .../internal/optimized/neon_tensor_utils.cc   |   2 +-
 .../lite/kernels/internal/tensor_utils.cc     |   1 +
 .../contrib/lite/nnapi/NeuralNetworksShim.h   |   2 +-
 .../graph_transformations.cc                  |   1 +
 .../resolve_constant_concatenation.cc         |   5 +-
 tensorflow/contrib/lite/toco/model.h          |   1 +
 tensorflow/contrib/lite/toco/tooling_util.cc  |   5 +-
 tensorflow/contrib/makefile/Makefile          |  91 ++--
 .../contrib/makefile/build_all_android.sh     |   2 +-
 tensorflow/contrib/makefile/build_all_ios.sh  |   2 +-
 .../build_and_run_inception_hexagon.sh        |   4 +-
 .../sub_makefiles/android/Makefile.in         |   2 +-
 tensorflow/contrib/makefile/tf_op_files.txt   |   3 +
 tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc  |   2 +-
 tensorflow/contrib/mpi/mpi_rendezvous_mgr.h   |   1 +
 tensorflow/contrib/ndlstm/__init__.py         |   4 +
 tensorflow/contrib/ndlstm/python/lstm1d.py    |  12 +-
 .../opt/python/training/external_optimizer.py |   4 -
 .../training/external_optimizer_test.py       |  39 ++
 tensorflow/contrib/py2tf/impl/api.py          |   4 +-
 .../python/util/graph_compute_order.py        |   2 +-
 .../reduce_slice_ops/ops/reduce_slice_ops.cc  |  24 +-
 .../python/kernel_tests/core_rnn_cell_test.py |  15 +
 .../rnn/python/kernel_tests/rnn_cell_test.py  |   1 -
 tensorflow/contrib/rnn/python/ops/rnn_cell.py |  24 +-
 .../seq2seq/python/ops/attention_wrapper.py   |   3 +-
 .../contrib/session_bundle/bundle_shim.py     |  11 +-
 .../contrib/session_bundle/constants.py       |   3 +
 .../slim/python/slim/evaluation_test.py       |   3 +-
 .../kernel_tests/linear_equations_test.py     |  63 ++-
 .../solvers/python/kernel_tests/util_test.py  |  37 ++
 .../solvers/python/ops/linear_equations.py    |  52 +-
 tensorflow/contrib/solvers/python/ops/util.py |  17 +
 .../pip_package/cloud_tpu_profiler/main.py    |  26 +-
 .../contrib/tpu/profiler/pip_package/setup.py |  16 +-
 tensorflow/core/BUILD                         |   5 +
 .../base_api/api_def_MatchingFiles.pbtxt      |   1 +
 .../core/api_def/base_api/api_def_Roll.pbtxt  |  52 ++
 .../base_api/api_def_UnravelIndex.pbtxt       |  32 ++
 .../core/common_runtime/gpu/gpu_device.cc     |   3 +-
 tensorflow/core/distributed_runtime/BUILD     |   1 +
 .../distributed_runtime/master_session.cc     |   2 +
 .../rpc/grpc_worker_service.cc                |  18 +
 .../rpc/grpc_worker_service.h                 |   3 +
 .../core/distributed_runtime/session_mgr.cc   |  78 +++
 .../core/distributed_runtime/session_mgr.h    |   9 +
 tensorflow/core/framework/register_types.h    |   2 +-
 .../core/framework/variant_op_registry.cc     |  24 +-
 .../core/framework/variant_op_registry.h      |  41 +-
 tensorflow/core/graph/mkl_layout_pass.cc      |  17 +-
 tensorflow/core/graph/mkl_layout_pass_test.cc |   6 +-
 tensorflow/core/graph/testlib.cc              |  10 +
 tensorflow/core/graph/testlib.h               |   4 +
 tensorflow/core/kernels/BUILD                 |  46 ++
 .../core/kernels/compare_and_bitpack_op.cc    |  15 +-
 tensorflow/core/kernels/decode_bmp_op.cc      |  19 +-
 .../core/kernels/fractional_pool_common.h     |   2 +-
 tensorflow/core/kernels/mkl_aggregate_ops.cc  |  13 +-
 tensorflow/core/kernels/mkl_avgpooling_op.cc  |  31 +-
 tensorflow/core/kernels/mkl_concat_op.cc      |   6 +-
 .../core/kernels/mkl_conv_grad_filter_ops.cc  |   6 +-
 .../core/kernels/mkl_conv_grad_input_ops.cc   |   6 +-
 tensorflow/core/kernels/mkl_conv_ops.cc       |   7 +-
 tensorflow/core/kernels/mkl_conv_ops.h        |   6 +-
 .../core/kernels/mkl_cwise_ops_common.cc      |   2 +-
 .../core/kernels/mkl_fused_batch_norm_op.cc   |   6 +-
 tensorflow/core/kernels/mkl_identity_op.cc    |   4 +-
 .../core/kernels/mkl_input_conversion_op.cc   |  62 ++-
 tensorflow/core/kernels/mkl_lrn_op.cc         |   6 +-
 tensorflow/core/kernels/mkl_maxpooling_op.cc  |  10 +-
 .../core/kernels/mkl_pooling_ops_common.cc    |   6 +-
 .../core/kernels/mkl_pooling_ops_common.h     |   8 +-
 tensorflow/core/kernels/mkl_relu_op.cc        |   8 +-
 tensorflow/core/kernels/mkl_reshape_op.cc     |   6 +-
 tensorflow/core/kernels/mkl_softmax_op.cc     |   4 +-
 tensorflow/core/kernels/mkl_tfconv_op.h       |   4 +-
 tensorflow/core/kernels/roll_op.cc            | 334 ++++++++++++
 tensorflow/core/kernels/roll_op_test.cc       | 484 ++++++++++++++++++
 tensorflow/core/kernels/unravel_index_op.cc   | 122 +++++
 tensorflow/core/lib/io/random_inputstream.cc  |  37 ++
 tensorflow/core/lib/io/random_inputstream.h   |   2 +
 tensorflow/core/ops/array_ops.cc              |   7 +
 tensorflow/core/ops/image_ops.cc              |  24 +
 tensorflow/core/ops/manip_ops.cc              |  33 ++
 tensorflow/core/ops/nn_ops.cc                 |   8 +-
 tensorflow/core/platform/cpu_feature_guard.cc |   9 +-
 .../core/platform/profile_utils/cpu_utils.h   |   4 +-
 tensorflow/core/platform/s3/s3_file_system.cc | 122 +++--
 tensorflow/core/platform/s3/s3_file_system.h  |  22 +
 .../core/platform/s3/s3_file_system_test.cc   |   2 +
 tensorflow/core/platform/windows/cpu_info.h   |   2 +
 tensorflow/core/profiler/README.md            |   5 +-
 .../core/profiler/internal/tfprof_stats.h     |   4 +-
 tensorflow/core/profiler/profiler.cc          |   8 +-
 tensorflow/core/public/version.h              |   2 +-
 tensorflow/core/util/mkl_util.h               |  32 +-
 tensorflow/core/util/mkl_util_test.cc         |   4 +-
 tensorflow/docs_src/about/bib.md              |   2 +-
 .../api_guides/python/contrib.signal.md       |   6 +-
 .../api_guides/python/regression_examples.md  |   2 +-
 .../docs_src/get_started/custom_estimators.md |   4 +-
 .../get_started/datasets_quickstart.md        |   4 +-
 .../docs_src/get_started/feature_columns.md   |   4 +-
 .../get_started/premade_estimators.md         |   2 +-
 tensorflow/docs_src/install/install_c.md      |   2 +-
 tensorflow/docs_src/install/install_go.md     |   2 +-
 tensorflow/docs_src/install/install_java.md   |  22 +-
 tensorflow/docs_src/install/install_linux.md  |  28 +-
 tensorflow/docs_src/install/install_mac.md    |  10 +-
 .../docs_src/install/install_sources.md       |  24 +-
 .../docs_src/install/install_windows.md       |   6 +-
 .../docs_src/programmers_guide/graphs.md      |   4 +-
 tensorflow/examples/android/BUILD             |   2 +-
 tensorflow/examples/android/build.gradle      |   9 +-
 .../examples/android/download-models.gradle   |   2 +-
 .../demo/LegacyCameraConnectionFragment.java  |   7 +-
 .../demo/tracking/MultiBoxTracker.java        |   4 +-
 tensorflow/examples/udacity/Dockerfile        |   2 +-
 tensorflow/python/BUILD                       |  36 ++
 tensorflow/python/__init__.py                 |   2 +
 tensorflow/python/client/session_benchmark.py |   1 +
 tensorflow/python/data/ops/dataset_ops.py     |  49 +-
 tensorflow/python/data/util/nest.py           |   6 +-
 tensorflow/python/data/util/sparse.py         |   2 +-
 tensorflow/python/debug/cli/tensor_format.py  |   2 +-
 tensorflow/python/debug/lib/debug_data.py     |   2 +-
 .../python/eager/execution_callbacks.py       |   2 +-
 .../estimator/canned/dnn_testing_utils.py     |   2 +-
 .../estimator/canned/linear_testing_utils.py  |   2 +-
 tensorflow/python/estimator/estimator.py      |   3 +-
 tensorflow/python/estimator/run_config.py     |   5 +-
 .../keras/_impl/keras/layers/convolutional.py |   2 +-
 tensorflow/python/kernel_tests/BUILD          |  13 +
 .../python/kernel_tests/array_ops_test.py     |  21 +
 .../python/kernel_tests/constant_op_test.py   |  13 +-
 .../python/kernel_tests/conv_ops_test.py      |   3 +-
 .../kernel_tests/decode_jpeg_op_test.py       |   1 +
 tensorflow/python/kernel_tests/io_ops_test.py |   2 +-
 tensorflow/python/kernel_tests/losses_test.py |  16 +-
 .../python/kernel_tests/manip_ops_test.py     | 138 +++++
 tensorflow/python/kernel_tests/rnn_test.py    |   1 +
 .../python/kernel_tests/tensordot_op_test.py  |  54 +-
 .../python/kernel_tests/topk_op_test.py       |   2 +-
 tensorflow/python/layers/convolutional.py     |  11 +-
 tensorflow/python/layers/utils.py             |   2 +-
 tensorflow/python/ops/array_ops.py            |   7 +-
 tensorflow/python/ops/functional_ops.py       |   2 +-
 tensorflow/python/ops/gradients_impl.py       |   1 +
 tensorflow/python/ops/image_ops.py            |   4 +
 tensorflow/python/ops/image_ops_impl.py       | 106 +++-
 tensorflow/python/ops/image_ops_test.py       | 112 ++++
 tensorflow/python/ops/linalg_grad.py          |  59 +--
 tensorflow/python/ops/losses/losses_impl.py   |   7 +-
 tensorflow/python/ops/manip_grad.py           |  31 ++
 tensorflow/python/ops/manip_ops.py            |  38 ++
 tensorflow/python/ops/math_ops.py             |  10 +-
 tensorflow/python/ops/rnn.py                  |   6 +
 tensorflow/python/ops/standard_ops.py         |  73 +--
 tensorflow/python/saved_model/loader_impl.py  |   9 +-
 tensorflow/python/tools/freeze_graph.py       |  38 +-
 tensorflow/python/tools/freeze_graph_test.py  |  16 +-
 .../tools/optimize_for_inference_lib.py       |   1 +
 .../tools/optimize_for_inference_test.py      |  92 ++--
 tensorflow/python/tools/saved_model_cli.py    |   3 +-
 .../training/basic_session_run_hooks.py       |   4 +-
 tensorflow/python/training/input.py           |   2 +
 tensorflow/python/training/saver.py           |  12 +-
 tensorflow/python/util/compat_internal.py     |  34 ++
 .../stream_executor/cuda/cuda_diagnostics.cc  |   2 +-
 tensorflow/stream_executor/dso_loader.cc      |   9 +
 tensorflow/tensorflow.bzl                     |   7 +
 .../tools/api/golden/tensorflow.image.pbtxt   |  18 +-
 ...flow.keras.layers.-conv3-d-transpose.pbtxt |   1 +
 ...ras.layers.-convolution3-d-transpose.pbtxt |   1 +
 .../tools/api/golden/tensorflow.manip.pbtxt   |   7 +
 tensorflow/tools/api/golden/tensorflow.pbtxt  |   8 +
 tensorflow/tools/ci_build/ci_sanity.sh        |  16 +-
 .../ci_build/windows/libtensorflow_cpu.sh     |   2 +-
 .../ci_build/windows/libtensorflow_gpu.sh     |   2 +-
 .../tools/docker/jupyter_notebook_config.py   |   1 +
 tensorflow/tools/docs/pretty_docs.py          |   2 +-
 tensorflow/tools/lib_package/BUILD            |   5 +
 tensorflow/tools/pip_package/BUILD            |  11 +
 .../tools/pip_package/build_pip_package.sh    |   2 +-
 tensorflow/tools/pip_package/setup.py         |  11 +-
 tensorflow/workspace.bzl                      |  21 +-
 third_party/com_google_absl.BUILD             |   5 +
 third_party/flatbuffers/flatbuffers.BUILD     |   2 +
 third_party/gast.BUILD                        |   2 +-
 third_party/gpus/cuda_configure.bzl           |   2 +-
 third_party/jpeg/jpeg.BUILD                   |  50 ++
 third_party/kafka/BUILD                       | 147 ++++++
 third_party/kafka/config.patch                |  44 ++
 third_party/pcre.BUILD                        |   2 +-
 third_party/py/python_configure.bzl           |   2 +-
 third_party/termcolor.BUILD                   |   2 +-
 256 files changed, 4478 insertions(+), 898 deletions(-)
 create mode 100644 tensorflow/contrib/kafka/BUILD
 create mode 100644 tensorflow/contrib/kafka/__init__.py
 create mode 100644 tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
 create mode 100644 tensorflow/contrib/kafka/ops/kafka_ops.cc
 create mode 100644 tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
 create mode 100644 tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
 create mode 100644 tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt
 create mode 100644 tensorflow/core/kernels/roll_op.cc
 create mode 100644 tensorflow/core/kernels/roll_op_test.cc
 create mode 100644 tensorflow/core/kernels/unravel_index_op.cc
 create mode 100644 tensorflow/core/ops/manip_ops.cc
 create mode 100644 tensorflow/python/kernel_tests/manip_ops_test.py
 create mode 100644 tensorflow/python/ops/manip_grad.py
 create mode 100644 tensorflow/python/ops/manip_ops.py
 create mode 100644 tensorflow/python/util/compat_internal.py
 create mode 100644 tensorflow/tools/api/golden/tensorflow.manip.pbtxt
 create mode 100644 third_party/com_google_absl.BUILD
 create mode 100644 third_party/kafka/BUILD
 create mode 100644 third_party/kafka/config.patch

diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index 1a401997c6..2f3df7cda9 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -4,7 +4,7 @@ https://stackoverflow.com/questions/tagged/tensorflow
 
 If you open a GitHub issue, here is our policy:
 
-1. It must be a bug or a feature request.
+1. It must be a bug, a feature request, or a significant problem with documentation (for small docs fixes please send a PR instead).
 2. The form below must be filled out.
 3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorboard/issues).
 
diff --git a/README.md b/README.md
index 0c93813e58..916e5200b2 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 
 | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
 |-----------------|---------------------|------------------|-------------------|---------------|
-| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) |
+| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) |
 
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
@@ -27,7 +27,7 @@ guidelines](CONTRIBUTING.md). This project adheres to TensorFlow's
 uphold this code.**
 
 **We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for
-tracking requests and bugs. So please see 
+tracking requests and bugs. So please see
 [TensorFlow Discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss) for general questions
 and discussion, and please direct specific questions to [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).**
 
diff --git a/RELEASE.md b/RELEASE.md
index fdf10407fd..b11b1e40db 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,18 +1,39 @@
 # Release 1.5.0
 
 ## Breaking Changes
-* Prebuilt binaries are now built against CUDA 9 and cuDNN 7.
+* Prebuilt binaries are now built against CUDA 9.0 and cuDNN 7.
 * Our Linux binaries are built using ubuntu 16 containers, potentially
   introducing glibc incompatibility issues with ubuntu 14.
 * Starting from 1.6 release, our prebuilt binaries will use AVX instructions.
   This may break TF on older CPUs.
 
+## Known Bugs
+* Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or
+  `CUDA_ILLEGAL_ADDRESS` failures.
+
+  Google discovered in mid-December 2017 that the PTX-to-SASS compiler in CUDA 9
+  and CUDA 9.1 sometimes does not properly compute the carry bit when
+  decomposing 64-bit address calculations with large offsets (e.g. `load [x +
+  large_constant]`) into 32-bit arithmetic in SASS.
+
+  As a result, these versions of `ptxas` miscompile most XLA programs which use
+  more than 4GB of temp memory.  This results in garbage results and/or
+  `CUDA_ERROR_ILLEGAL_ADDRESS` failures.
+
+  A fix in CUDA 9.1.121 is expected in late February 2018.  We do not expect a
+  fix for CUDA 9.0.x.  Until the fix is available, the only workaround is to
+  [downgrade](https://developer.nvidia.com/cuda-toolkit-archive) to CUDA 8.0.x
+  or disable XLA:GPU.
+
+  TensorFlow will print a warning if you use XLA:GPU with a known-bad version of
+  CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122.
+
 ## Major Features And Improvements
 * [Eager execution](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/eager)
   preview version is now available.
 * [TensorFlow Lite](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/lite)
   dev preview is now available.
-* CUDA 9 and cuDNN 7 support.
+* CUDA 9.0 and cuDNN 7 support.
 * Accelerated Linear Algebra (XLA):
   * Add `complex64` support to XLA compiler.
   * `bfloat` support is now added to XLA infrastructure.
@@ -523,7 +544,7 @@ answered questions, and were part of inspiring discussions.
 * Fixed LIBXSMM integration.
 * Make decode_jpeg/decode_png/decode_gif handle all formats, since users frequently try to decode an image as the wrong type.
 * Improve implicit broadcasting lowering.
-* Improving stability of GCS/Bigquery clients by a faster retrying of stale transmissions.
+* Improving stability of GCS/BigQuery clients by a faster retrying of stale transmissions.
 * Remove OpKernelConstruction::op_def() as part of minimizing proto dependencies.
 * VectorLaplaceDiag distribution added.
 * Android demo no longer requires libtensorflow_demo.so to run (libtensorflow_inference.so still required)
diff --git a/WORKSPACE b/WORKSPACE
index 7ae39374f1..1e38a9a8cd 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -41,12 +41,12 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
 tf_workspace()
 
 new_http_archive(
-    name = "inception5h",
+    name = "inception_v1",
     build_file = "models.BUILD",
-    sha256 = "d13569f6a98159de37e92e9c8ec4dae8f674fbf475f69fe6199b514f756d4364",
+    sha256 = "7efe12a8363f09bc24d7b7a450304a15655a57a7751929b2c1593a71183bb105",
     urls = [
-        "http://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip",
-        "http://download.tensorflow.org/models/inception5h.zip",
+        "http://storage.googleapis.com/download.tensorflow.org/models/inception_v1.zip",
+        "http://download.tensorflow.org/models/inception_v1.zip",
     ],
 )
 
diff --git a/configure.py b/configure.py
index 083fed1710..27519b4aba 100644
--- a/configure.py
+++ b/configure.py
@@ -298,7 +298,7 @@ def get_var(environ_cp,
       System".
     enabled_by_default: boolean for default behavior.
     question: optional string for how to ask for user input.
-    yes_reply: optionanl string for reply when feature is enabled.
+    yes_reply: optional string for reply when feature is enabled.
     no_reply: optional string for reply when feature is disabled.
 
   Returns:
@@ -411,7 +411,7 @@ def set_action_env_var(environ_cp,
       System".
     enabled_by_default: boolean for default behavior.
     question: optional string for how to ask for user input.
-    yes_reply: optionanl string for reply when feature is enabled.
+    yes_reply: optional string for reply when feature is enabled.
     no_reply: optional string for reply when feature is disabled.
   """
   var = int(
@@ -1354,6 +1354,7 @@ def main():
     environ_cp['TF_NEED_GCP'] = '0'
     environ_cp['TF_NEED_HDFS'] = '0'
     environ_cp['TF_NEED_JEMALLOC'] = '0'
+    environ_cp['TF_NEED_KAFKA'] = '0'
     environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
     environ_cp['TF_NEED_COMPUTECPP'] = '0'
     environ_cp['TF_NEED_OPENCL'] = '0'
@@ -1372,6 +1373,8 @@ def main():
                 'with_hdfs_support', True, 'hdfs')
   set_build_var(environ_cp, 'TF_NEED_S3', 'Amazon S3 File System',
                 'with_s3_support', True, 's3')
+  set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform',
+                'with_kafka_support', False, 'kafka')
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 False, 'xla')
   set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support',
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index e89667cbfd..a73e89bc1a 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -211,6 +211,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_kafka_support",
+    define_values = {"with_kafka_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
 # Crosses between platforms and file system libraries not supported on those
 # platforms due to limitations in nested select() statements.
 config_setting(
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index c9ade5fb83..9060c19e9d 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -433,6 +433,7 @@ tf_gen_op_wrappers_cc(
         "linalg_ops",
         "logging_ops",
         "lookup_ops",
+        "manip_ops",
         "math_ops",
         "nn_ops",
         "no_op",
diff --git a/tensorflow/cc/tools/freeze_saved_model_test.cc b/tensorflow/cc/tools/freeze_saved_model_test.cc
index 57244a4f0a..52a81a5028 100644
--- a/tensorflow/cc/tools/freeze_saved_model_test.cc
+++ b/tensorflow/cc/tools/freeze_saved_model_test.cc
@@ -71,7 +71,7 @@ class FreezeTest : public ::testing::Test {
     return Status::OK();
   }
 
-  // Adds `graph_def` to `saved_model_bundle` and intializes a session with
+  // Adds `graph_def` to `saved_model_bundle` and initializes a session with
   // `init_node`.
   Status AddGraphDefToSavedModelBundle(const GraphDef& graph_def,
                                        const string& init_node,
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 0540260efd..bc46918df9 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -132,7 +132,10 @@ tf_library(
     config = "test_graph_tfadd.config.pbtxt",
     cpp_class = "AddComp",
     graph = "test_graph_tfadd.pbtxt",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 # A test of tf_library that includes a graph with an unknown op, but where
@@ -143,7 +146,10 @@ tf_library(
     config = "test_graph_tfunknownop.config.pbtxt",
     cpp_class = "UnknownOpAddComp",
     graph = "test_graph_tfunknownop.pbtxt",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 # A test of tf_library that includes a graph with an unknown op, but where
@@ -155,7 +161,10 @@ tf_library(
     config = "test_graph_tfunknownop2.config.pbtxt",
     cpp_class = "UnknownOpAddComp",
     graph = "test_graph_tfunknownop.pbtxt",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 # A test of tf_library that includes a graph with an unknown op, but where
@@ -166,7 +175,10 @@ tf_library(
     config = "test_graph_tfunknownop3.config.pbtxt",
     cpp_class = "UnknownOpAddComp",
     graph = "test_graph_tfunknownop.pbtxt",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 # Utility library for benchmark binaries, used by the *_benchmark rules that are
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 7dfd49cc3b..43d8ae4108 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -74,7 +74,10 @@ tf_library(
     # compile but the others in this directory succeed, you may need to
     # expand the "required by all tf_library targets" list in tfcompile.bzl.
     include_standard_runtime_deps = False,
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 tf_library(
@@ -84,7 +87,10 @@ tf_library(
     cpp_class = "AddWithCkptComp",
     freeze_checkpoint = "test_graph_tfadd_with_ckpt.ckpt",
     graph = "test_graph_tfadd_with_ckpt.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 tf_library(
@@ -95,7 +101,10 @@ tf_library(
     freeze_checkpoint = "test_graph_tfadd_with_ckpt_saver.ckpt",
     freeze_saver = "test_graph_tfadd_with_ckpt_saver.saver",
     graph = "test_graph_tfadd_with_ckpt_saver.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 tf_library(
@@ -104,7 +113,10 @@ tf_library(
     config = "test_graph_tffunction.config.pbtxt",
     cpp_class = "FunctionComp",
     graph = "test_graph_tffunction.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 tf_library(
@@ -113,7 +125,10 @@ tf_library(
     config = "test_graph_tfgather.config.pbtxt",
     cpp_class = "GatherComp",
     graph = "test_graph_tfgather.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 tf_library(
@@ -122,7 +137,10 @@ tf_library(
     config = "test_graph_tfmatmul.config.pbtxt",
     cpp_class = "foo::bar::MatMulComp",
     graph = "test_graph_tfmatmul.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 tf_library(
@@ -131,7 +149,10 @@ tf_library(
     config = "test_graph_tfmatmulandadd.config.pbtxt",
     cpp_class = "MatMulAndAddComp",
     graph = "test_graph_tfmatmulandadd.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
     tfcompile_flags = "--gen_name_to_index --gen_program_shape",
 )
 
@@ -141,13 +162,19 @@ tf_library(
     config = "test_graph_tfsplits.config.pbtxt",
     cpp_class = "SplitsComp",
     graph = "test_graph_tfsplits.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 tf_cc_test(
     name = "tfcompile_test",
     srcs = ["tfcompile_test.cc"],
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
     deps = [
         ":test_graph_tfadd",
         ":test_graph_tfadd_with_ckpt",
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 9d34cdfe10..30a6d3a74d 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -774,15 +774,15 @@ class BinaryOpsTest(XLATestCase):
   def DISABLED_testSparseMatMul(self):
     # Binary wrappers for sparse_matmul with different hints
     def SparseMatmulWrapperTF(a, b):
-      return tf.sparse_matmul(a, b, a_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, a_is_sparse=True)
 
     def SparseMatmulWrapperFT(a, b):
-      return tf.sparse_matmul(a, b, b_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, b_is_sparse=True)
 
     def SparseMatmulWrapperTT(a, b):
-      return tf.sparse_matmul(a, b, a_is_sparse=True, b_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, a_is_sparse=True, b_is_sparse=True)
 
-    self._testMatMul(tf.sparse_matmul)
+    self._testMatMul(math_ops.sparse_matmul)
     self._testMatMul(SparseMatmulWrapperTF)
     self._testMatMul(SparseMatmulWrapperFT)
     self._testMatMul(SparseMatmulWrapperTT)
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 2ba572fd0e..d4fb5dd4e0 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -38,8 +38,22 @@ class PoolingOp : public XlaOpKernel {
   PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
     if (ctx->num_inputs() == 1) {
-      OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_));
-      OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_));
+      std::vector<int32> ksize_int;
+      std::vector<int32> stride_int;
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int));
+      OP_REQUIRES(ctx, ksize_int.size() == num_dims(),
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int));
+      OP_REQUIRES(ctx, stride_int.size() == num_dims(),
+                  errors::InvalidArgument("Sliding window stride field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      for (int i = 0; i < num_dims(); ++i) {
+        ksize_.push_back(ksize_int[i]);
+        stride_.push_back(stride_int[i]);
+      }
     }
     Padding padding;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding));
@@ -65,28 +79,33 @@ class PoolingOp : public XlaOpKernel {
     xla::ComputationDataHandle input = ctx->Input(0);
     const TensorShape input_shape = ctx->InputShape(0);
 
+    std::vector<int64> ksize = ksize_;
+    std::vector<int64> stride = stride_;
     if (ctx->num_inputs() != 1) {
       const TensorShape ksize_shape = ctx->InputShape(1);
+      // Validate input sizes.
       OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape),
                   errors::InvalidArgument("ksize must be a vector, not shape ",
                                           ksize_shape.DebugString()));
-      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &ksize_));
+      OP_REQUIRES(ctx, ksize_shape.num_elements() == num_dims(),
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      ksize.clear();
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &ksize));
 
       const TensorShape stride_shape = ctx->InputShape(2);
+      // Validate input sizes.
       OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape),
                   errors::InvalidArgument("stride must be a vector, not shape ",
                                           stride_shape.DebugString()));
-      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride_));
+      OP_REQUIRES(ctx, stride_shape.num_elements() == num_dims(),
+                  errors::InvalidArgument("Sliding window stride field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      stride.clear();
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride));
     }
-
-    OP_REQUIRES(ctx, ksize_.size() == num_dims(),
-                errors::InvalidArgument("Sliding window ksize field must "
-                                        "specify ",
-                                        num_dims(), " dimensions"));
-    OP_REQUIRES(ctx, stride_.size() == num_dims(),
-                errors::InvalidArgument("Sliding window stride field must "
-                                        "specify ",
-                                        num_dims(), " dimensions"));
     OP_REQUIRES(ctx, input_shape.dims() == num_dims(),
                 errors::InvalidArgument("Input to ", type_string(),
                                         " operator must have ", num_dims(),
@@ -94,8 +113,8 @@ class PoolingOp : public XlaOpKernel {
 
     const DataType type = input_type(0);
     xla::ComputationDataHandle pooled = ctx->builder()->ReduceWindow(
-        input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize_,
-        stride_, padding_);
+        input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize,
+        stride, padding_);
     ctx->SetOutput(0, PostProcessOutput(ctx, pooled, type, input_shape));
   }
 
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index d82ba63e8a..ea4cdb7667 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -67,7 +67,7 @@ class ComputationBuilder {
   // OpMetadata is often applied to a series of XLA HLO instructions. As a
   // result, OpMetadata is set on the Computation Builder. All subsequent
   // instructions generated via this Computation Builder will have the same
-  // OpMetadata attached until a call to ClearOpMetdata.
+  // OpMetadata attached until a call to ClearOpMetadata.
   void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
 
   // Clears the HloMetadata state.
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index 42e7f91f26..d9c4d094b8 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -2173,7 +2173,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
 //
 //  {[2:3:4], [5:6:7], [8:9]}
 //
-// The the parsed result will be:
+// The parsed result will be:
 //
 //  {/*starts=*/{2, 5, 8}, /*limits=*/{3, 6, 9}, /*strides=*/{4, 7, 1}}
 //
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 0451f00629..3ed8cef56c 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -50,6 +50,7 @@ py_library(
         "//tensorflow/contrib/image:single_image_random_dot_stereograms_py",
         "//tensorflow/contrib/input_pipeline:input_pipeline_py",
         "//tensorflow/contrib/integrate:integrate_py",
+        "//tensorflow/contrib/kafka",
         "//tensorflow/contrib/keras",
         "//tensorflow/contrib/kernel_methods",
         "//tensorflow/contrib/kfac",
@@ -142,6 +143,7 @@ cc_library(
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
+        "//tensorflow/contrib/kafka:kafka_ops_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
         "//tensorflow/contrib/nccl:nccl_ops_op_lib",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_op_lib",
diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
index dc5b9fb887..abddadac5b 100644
--- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
+++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
@@ -194,6 +194,11 @@ public class TensorFlowInferenceInterface {
    * @param outputNames A list of output nodes which should be filled by the inference pass.
    */
   public void run(String[] outputNames, boolean enableStats) {
+    run(outputNames, enableStats, new String[] {});
+  }
+
+  /** An overloaded version of runInference that allows supplying targetNodeNames as well */
+  public void run(String[] outputNames, boolean enableStats, String[] targetNodeNames) {
     // Release any Tensors from the previous run calls.
     closeFetches();
 
@@ -204,6 +209,11 @@ public class TensorFlowInferenceInterface {
       runner.fetch(tid.name, tid.outputIndex);
     }
 
+    // Add targets.
+    for (String t : targetNodeNames) {
+      runner.addTarget(t);
+    }
+
     // Run the session.
     try {
       if (enableStats) {
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index ad8c995eef..57a52bf4ca 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -6,6 +6,7 @@ tensorflow/core/example
 tensorflow/core/framework
 tensorflow/core/lib
 tensorflow/core/lib/core
+tensorflow/core/profiler
 tensorflow/core/protobuf
 tensorflow/core/util
 tensorflow/examples
@@ -219,6 +220,8 @@ tensorflow/contrib/input_pipeline/python/ops
 tensorflow/contrib/integrate
 tensorflow/contrib/integrate/python
 tensorflow/contrib/integrate/python/ops
+tensorflow/contrib/kafka/python
+tensorflow/contrib/kafka/python/ops
 tensorflow/contrib/keras
 tensorflow/contrib/keras/api
 tensorflow/contrib/keras/api/keras
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 138993db35..c42bc35ce7 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -30,6 +30,7 @@ set(tf_op_lib_names
 		"list_ops"
     "lookup_ops"
     "logging_ops"
+    "manip_ops"
     "math_ops"
     "nn_ops"
     "no_op"
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 294b9c5941..34c466fa01 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -335,6 +335,7 @@ GENERATE_PYTHON_OP_LIB("list_ops")
 GENERATE_PYTHON_OP_LIB("logging_ops")
 GENERATE_PYTHON_OP_LIB("lookup_ops")
 GENERATE_PYTHON_OP_LIB("nn_ops")
+GENERATE_PYTHON_OP_LIB("manip_ops")
 GENERATE_PYTHON_OP_LIB("parsing_ops")
 GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index f67698eb99..53c2285699 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -31,7 +31,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-import io
+import codecs
 import os
 import re
 import subprocess
@@ -103,7 +103,7 @@ def main():
   for lib_path in args.input:
     proc = subprocess.Popen([DUMPBIN, "/nologo", "/linkermember:1", lib_path],
                             stdout=subprocess.PIPE)
-    for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
+    for line in codecs.getreader("utf-8")(proc.stdout):
       cols = line.split()
       if len(cols) < 2:
         continue
@@ -131,7 +131,7 @@ def main():
     # We compare on undname but use the decorated name from candidates.
     dupes = 0
     proc = subprocess.Popen([UNDNAME, tmpfile.name], stdout=subprocess.PIPE)
-    for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")):
+    for idx, line in enumerate(codecs.getreader("utf-8")(proc.stdout)):
       decorated = candidates[idx]
       if decorated in taken:
         # Symbol is already in output, done.
diff --git a/tensorflow/contrib/coder/README.md b/tensorflow/contrib/coder/README.md
index e1e867db5a..c6c379c458 100644
--- a/tensorflow/contrib/coder/README.md
+++ b/tensorflow/contrib/coder/README.md
@@ -30,7 +30,7 @@ following sense:
   around,
 - The number of CDF axes does not extend, i.e., `CDF.ndim == data.ndim + 1`.
 
-In the previous example where data has shape (10, 10), the followings are
+In the previous example where data has shape (10, 10), the following are
 acceptable CDF shapes:
 
 - (10, 10, 65)
diff --git a/tensorflow/contrib/coder/kernels/range_coder.cc b/tensorflow/contrib/coder/kernels/range_coder.cc
index f4f076b6c4..21b35155ff 100644
--- a/tensorflow/contrib/coder/kernels/range_coder.cc
+++ b/tensorflow/contrib/coder/kernels/range_coder.cc
@@ -276,7 +276,7 @@ void RangeEncoder::Finalize(string* sink) {
     }
   } else if (base_ != 0) {
     // If base == 0, then pick 0 from [base, base + size) and no zeros are
-    // explcitly written.
+    // explicitly written.
     //
     // Otherwise, pick (base + (2^16 - base[16:0])), i.e., round up base to the
     // next multiple of 2^16. As 2^16 < size, this value should be in the
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
index 4fc5ff1bd1..933df6d71d 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import time
 
+from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 from tensorflow.contrib.rnn.python.ops import lstm_ops
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index 3faaeef590..68e7b5421f 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -178,7 +178,7 @@ class Evaluator(object):
       call_op: An op that updates evaluation state on a mini-batch of examples.
         Must generate an tf.errors.OutOfRangeError when done.
       results_op: A dictionary of tensors that compute the final evaluation
-        results from the evaulation state.
+        results from the evaluation state.
       sess: The Session to run the evaluation in. Defaults to the default
         Session.
 
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/README.md b/tensorflow/contrib/eager/python/examples/resnet50/README.md
index db023e6c97..79e4600529 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/README.md
+++ b/tensorflow/contrib/eager/python/examples/resnet50/README.md
@@ -34,7 +34,7 @@ bazel run -c opt --config=cuda :resnet50_graph_test -- --benchmarks=.
 
 (Or remove the `--config=cuda` flag for running on CPU instead of GPU).
 
-On October 31, 2017, the benchmarks demostrated comparable performance
+On October 31, 2017, the benchmarks demonstrated comparable performance
 for eager and graph execution of this particular model when using
 a single NVIDIA Titan X (Pascal) GPU on a host with an
 Intel Xeon E5-1650 CPU @ 3.50GHz and a batch size of 32.
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
index b302a87e0e..9982fdb07e 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
@@ -97,7 +97,7 @@ class _ConvBlock(tfe.Network):
 
   Args:
       kernel_size: the kernel size of middle conv layer at main path
-      filters: list of integers, the filterss of 3 conv layer at main path
+      filters: list of integers, the filters of 3 conv layer at main path
       stage: integer, current stage label, used for generating layer names
       block: 'a','b'..., current block label, used for generating layer names
       data_format: data_format for the input ('channels_first' or
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index 76e06269b6..0ff8746884 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -22,6 +22,7 @@ import gc
 import tempfile
 import time
 
+from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
 import tensorflow.contrib.eager as tfe
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md b/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
index 743ebb68ee..966177e91c 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
@@ -40,7 +40,7 @@ bazel run -c opt --config=cuda :rnn_ptb_graph_test -- --benchmarks=.
 
 (Or remove the `--config=cuda` flag for running on CPU instead of GPU).
 
-On October 31, 2017, the benchmarks demostrated slightly better performance
+On October 31, 2017, the benchmarks demonstrated slightly better performance
 (3-6%) for graph execution over eager execution for this particular model when
 using a single NVIDIA Titan X (Pascal) GPU on a host with an Intel Xeon E5-1650
 CPU @ 3.50GHz and a batch size of 32.
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index 7b9637a9d5..5c5c59c877 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -88,7 +88,7 @@ class Embedding(tf.layers.Layer):
 
 
 class PTBModel(tfe.Network):
-  """LSTM for word language modelling.
+  """LSTM for word language modeling.
 
   Model described in:
   (Zaremba, et. al.) Recurrent Neural Network Regularization
@@ -339,8 +339,7 @@ if __name__ == "__main__":
       "http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz")
   parser.add_argument(
       "--logdir", type=str, default="", help="Directory for checkpoint.")
-  parser.add_argument(
-      "--epoch", type=int, default=20, help="Number of epoches.")
+  parser.add_argument("--epoch", type=int, default=20, help="Number of epochs.")
   parser.add_argument("--batch-size", type=int, default=20, help="Batch size.")
   parser.add_argument(
       "--seq-len", type=int, default=35, help="Sequence length.")
diff --git a/tensorflow/contrib/eager/python/examples/spinn/data.py b/tensorflow/contrib/eager/python/examples/spinn/data.py
index a6e046320f..fcaae0a4f8 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/data.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/data.py
@@ -51,11 +51,11 @@ def get_non_parenthesis_words(items):
   """Get the non-parenthesis items from a SNLI parsed sentence.
 
   Args:
-    items: Data items from a parsed SNLI setence, with parentheses. E.g.,
+    items: Data items from a parsed SNLI sentence, with parentheses. E.g.,
       ["(", "Man", "(", "(", "(", "(", "(", "wearing", "pass", ")", ...
 
   Returns:
-    A list of non-parenthis word items, all converted to lower case. E.g.,
+    A list of non-parentheses word items, all converted to lower case. E.g.,
       ["man", "wearing", "pass", ...
   """
   return [x.lower() for x in items if x not in PARENTHESES and x]
@@ -201,7 +201,7 @@ def load_word_vectors(data_root, vocab):
 
 
 def calculate_bins(length2count, min_bin_size):
-  """Cacluate bin boundaries given a histogram of lengths and mininum bin size.
+  """Calculate bin boundaries given a histogram of lengths and minimum bin size.
 
   Args:
     length2count: A `dict` mapping length to sentence count.
@@ -335,9 +335,9 @@ class SnliData(object):
         # The sorting above and the batching here makes sure that sentences of
         # similar max lengths are batched together, minimizing the inefficiency
         # due to uneven max lengths. The sentences are batched differently in
-        # each call to get_generator() due to the shuffling before sotring
+        # each call to get_generator() due to the shuffling before sorting
         # above. The pad_and_reverse_word_ids() and pad_transitions() functions
-        # take care of any remaning unevenness of the max sentence lengths.
+        # take care of any remaining unevenness of the max sentence lengths.
         end = min(begin + batch_size, len(labels))
         # Transpose, because the SPINN model requires time-major, instead of
         # batch-major.
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index 84e25cf81a..7b2f09cba1 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -26,6 +26,7 @@ import tempfile
 import time
 
 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
 # pylint: disable=g-bad-import-order
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index 8e6b947e5c..3329fc6c51 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -539,7 +539,7 @@ class NetworkTest(test.TestCase):
         # No issue here since the name is unique within its scope.
         name_conflict3 = MyNetwork(name="name_conflict")
       net2 = MyNetwork()  # name=outside_scope/my_network_2 to avoid the
-                          # variable_scope my_network_1 below.
+      # variable_scope my_network_1 below.
       vs_name_conflict = MyNetwork(name="vs_name_conflict")  # conflict below
     with variable_scope.variable_scope("intervening_scope"):
       with variable_scope.variable_scope(captured_scope):
@@ -688,7 +688,7 @@ class NetworkTest(test.TestCase):
     net2(one)
     # Layer names typically are globally unique rather than being unique within
     # the scope of their first use. However, within a Network they must be named
-    # locally so that previous Layer consutrciton does not interfere with
+    # locally so that previous Layer construction does not interfere with
     # variable naming (e.g. add a Layer construction before the Network,
     # suddenly your previously saved checkpoint is incompatible).
     self.assertEqual("dense", net1.l1.name)
diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py
index 57b070ec6e..62421849c7 100644
--- a/tensorflow/contrib/eager/python/saver.py
+++ b/tensorflow/contrib/eager/python/saver.py
@@ -82,7 +82,7 @@ def restore_variables_on_create(save_path, map_func=None):
       map_func_wrapper = lambda self, x: x
     else:
       if not callable(map_func):
-        raise ValueError("map_func must be callaled.")
+        raise ValueError("map_func must be callable.")
       map_func_wrapper = lambda self, x: map_func(x)
 
     ckpt_var_cache = dict()
diff --git a/tensorflow/contrib/ffmpeg/decode_video_op.cc b/tensorflow/contrib/ffmpeg/decode_video_op.cc
index d44032968d..6f8ad486d1 100644
--- a/tensorflow/contrib/ffmpeg/decode_video_op.cc
+++ b/tensorflow/contrib/ffmpeg/decode_video_op.cc
@@ -102,16 +102,12 @@ REGISTER_OP("DecodeVideo")
       return Status::OK();
     })
     .Doc(R"doc(
-Processes the contents of an audio file into a tensor using FFmpeg to decode
+Processes the contents of an video file into a tensor using FFmpeg to decode
 the file.
 
-One row of the tensor is created for each channel in the audio file. Each
-channel contains audio samples starting at the beginning of the audio and
-having `1/samples_per_second` time between them. If the `channel_count` is
-different from the contents of the file, channels will be merged or created.
-
-contents: The binary audio file contents, as a string or rank-0 string
-    tensor.
+contents: The binary contents of the video file to decode. This is a
+    scalar.
+output: A rank-4 `Tensor` that has `[frames, height, width, 3]` RGB as output.
 )doc");
 
 }  // namespace ffmpeg
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index a9d47ac9b9..0754c3e0e3 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -25,6 +25,7 @@ import re
 from tensorflow.contrib.framework.python.ops import add_arg_scope as contrib_add_arg_scope
 from tensorflow.contrib.framework.python.ops import gen_variable_ops
 from tensorflow.contrib.util import loader
+from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import dtypes
@@ -684,7 +685,8 @@ def assign_from_checkpoint_fn(model_path, var_list, ignore_missing_vars=False,
             'Variable %s missing in checkpoint %s', var, model_path)
     var_list = available_vars
   if var_list:
-    saver = tf_saver.Saver(var_list, reshape=reshape_variables)
+    saver = tf_saver.Saver(var_list, reshape=reshape_variables,
+                           write_version=saver_pb2.SaverDef.V1)
     def callback(session):
       saver.restore(session, model_path)
     return callback
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index 986a5ff6dc..fdfabd07c1 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -28,6 +28,7 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
+import os
 import sys
 import tarfile
 
@@ -189,20 +190,34 @@ def get_graph_def_from_resource(filename):
   return graph_pb2.GraphDef.FromString(resource_loader.load_resource(filename))
 
 
-def get_graph_def_from_url_tarball(url, filename):
-  """Get a GraphDef proto from a tarball on the web."""
-  def _progress(count, block_size, total_size):
-    sys.stdout.write('\r>> Downloading %s %.1f%%' % (
-        url, float(count * block_size) / float(total_size) * 100.0))
-    sys.stdout.flush()
-  tar_filename, _ = urllib.request.urlretrieve(url, reporthook=_progress)
+def get_graph_def_from_url_tarball(url, filename, tar_filename=None):
+  """Get a GraphDef proto from a tarball on the web.
+
+  Args:
+    url: Web address of tarball
+    filename: Filename of graph definition within tarball
+    tar_filename: Temporary download filename (None = always download)
+
+  Returns:
+    A GraphDef loaded from a file in the downloaded tarball.
+  """
+  if not (tar_filename and os.path.exists(tar_filename)):
+
+    def _progress(count, block_size, total_size):
+      sys.stdout.write('\r>> Downloading %s %.1f%%' %
+                       (url,
+                        float(count * block_size) / float(total_size) * 100.0))
+      sys.stdout.flush()
+
+    tar_filename, _ = urllib.request.urlretrieve(url, tar_filename, _progress)
   with tarfile.open(tar_filename, 'r:gz') as tar:
     proto_str = tar.extractfile(filename).read()
   return graph_pb2.GraphDef.FromString(proto_str)
 
 
 def _default_graph_def_fn():
-  return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH)
+  return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH,
+                                        os.path.basename(INCEPTION_URL))
 
 
 def run_inception(images,
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index 7d2a7a254f..56ac45554d 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -620,7 +620,7 @@ class CombineAdversarialLossTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       for _ in range(10):  # spot check closeness on more than one sample.
         gnorm_np, precond_gnorm_np = sess.run([gnorm, precond_gnorm])
-        self.assertNear(gnorm_np, precond_gnorm_np, 1e-5)
+        self.assertNear(gnorm_np, precond_gnorm_np, 1e-4)
 
 
 class CycleConsistencyLossTest(test.TestCase):
diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md
index 5a6f2f3086..163993a3f6 100644
--- a/tensorflow/contrib/hvx/README.md
+++ b/tensorflow/contrib/hvx/README.md
@@ -1,60 +1,67 @@
 # TensorFlow Runtime with HVX Acceleration
 
-## Description
+This README explain how to build and use the TensorFlow runtime with HVX Acceleration. HVX is an extension of Hexagon, a DSP provided by Qualcomm, which can compute vector calculations faster using less energy than ARM processors.
 
-This README explain how to build and use the TensorFlow Runtime with HVX Acceleration. HVX is an extension of Hexagon which is a DSP provided by qualcomm which can compute vector calculations faster using lower energy than ARM processors.
+## Dependencies
+
+* [Android SDK](https://developer.android.com/studio/index.html).
+* [Android NDK](https://developer.android.com/ndk/index.html). Save the path in `${NDK_ROOT}`.
+* A rooted Qualcomm-based Android device connected to the computer (preferably, a [Snapdragon Development Board](https://developer.qualcomm.com/hardware/additional-snapdragon), but it could be a rooted phone with a Qualcomm SoC, albeit this guide may not work with it). The device needs to be rooted for development and testing purposes, and shouldn't be needed in production. See [Behold, The Snapdragon MDP](https://developer.qualcomm.com/blog/behold-snapdragon-mdp) for more information.
+* [Hexagon SDK v3.0](https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools). Save the path in `${QUALCOMM_SDK}`.
+* The current directory should be TensorFlow source code (`git clone https://github.com/tensorflow/tensorflow.git && cd tensorflow`), and saved into `${TF_ROOT_DIR}`.
+
+You may also need to add a test signature in the device to run HVX-based binaries. Follow the instructions in `${QUALCOMM_SDK}/docs/Tools_Signing.html`, using Python 2.
+
+Note that if the device is not rooted, you may not be able to get the serial number, push the test signature and/or run binary files that call HVX libraries.
 
 ## Quick Start Guide
 
-We provides several tools to build and run inference with this runtime quickly.
+We provide several tools to build and run inference with this runtime quickly.
 
-#### All-in-one script to run inception model with prebuild hexagon library
-If you don’t need to build your own implementation of hexagon HVX, we provide a shortcut to execute graphs by using pre-compiled binaries.
+### Run inception model with a prebuilt Hexagon library
 
+If you don’t need to build your own implementation of Hexagon HVX, we provide a shortcut to execute graphs by using pre-compiled binaries.
+
+```shell
+./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh -p
 ```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-NDK_ROOT="/path/to/ndk" ./tensorflow/contrib/makefile/build_all_android.sh -X
-```
-(-X downloads dependencies to hexagon HVX and graphs, and copy all dependencies to android and execute a test)
 
-#### All-in-one script to run inception model by building entire libraries from source code
- If you want to build your own implementation of hexagon HVX, we provide a sample all-in-one script to execute graphs which downloads source and build everything for hexagon.
+The `-p` option makes the script download dependencies (i.e., Hexagon HVX binaries and graphs models), copy them to the Android device and execute a test.
 
-```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-QUALCOMM_SDK="/path/to/qualcomm/sdk" NDK_ROOT="/path/to/ndk" ./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
+### Run inception model by building all from the source code
+
+If you want to build your own implementation of Hexagon HVX, we provide a sample all-in-one script to execute graphs which downloads the source and builds everything that's necessary.
+
+```shell
+./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
 ```
 
 ## Building libraries
 
 If you've finished walking through the quick start guide, you may want to try building each binary manually.
 
-#### Build libhexagon_nn_skel.so
-Download hexagon nn library from codeaurora.org and build it.
+### Build libhexagon\_nn\_skel.so
 
-```
+Download Hexagon NN library from codeaurora.org and build it.
+
+```shell
 git clone https://source.codeaurora.org/quic/hexagon_nn/nnlib
 cd nnlib
 ```
 
-(Just follow instructions in README.HOW_TO_BUILD. You can find libhexagon_nn_skel.so in hexagon_Release_dynamic_toolv72_v60/ship)
-Then copy the generated binary to GEN_LIBS_DIR
+Just follow the instructions in `README.HOW_TO_BUILD`. You can find the file `libhexagon_nn_skel.so` in `hexagon_Release_dynamic_toolv72_v60/ship`.
+Then copy the generated binary to `${GEN_LIBS_DIR}`.
 
-```
+```shell
 GEN_LIBS_DIR="/path/to/a/dir/to/store/hexagon/libraries"
 cp -v "hexagon_Release_dynamic_toolv72_v60/ship/libhexagon_nn_skel.so" "${GEN_LIBS_DIR}"
 ```
 
-#### Build libhexagon_controller.so
+### Build libhexagon\_controller.so
+
 Download tensorflow and build hexagon controller.
 
-```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-TF_ROOT_DIR="$(pwd)"
-QUALCOMM_SDK="/path/to/qualcomm/sdk"
+```shell
 GENERATED_NNLIB_DIRECTORY="/path/to/nnlib"
 GENERATED_HEXAGON_CONTROLLER_DIRECTORY="${QUALCOMM_SDK}/examples/common/generated_hexagon_controller"
 rm -rf "${GENERATED_HEXAGON_CONTROLLER_DIRECTORY}"
@@ -70,12 +77,12 @@ make tree VERBOSE=1 V=android_Release
 cp -v "${GENERATED_HEXAGON_CONTROLLER_DIRECTORY}/android_Release/ship/libhexagon_controller.so" "${GEN_LIBS_DIR}"
 ```
 
-#### Build tensorflow linking hexagon library
-Build tensorflow with the build_all_android.sh with specifying -x option.
+### Build TensorFlow linking Hexagon library
 
-```
+Build TensorFlow with `build_all_android.sh` specifying the `-x` option.
+
+```shell
 BUILD_ALL_ANDROID_PATH="${TF_ROOT_DIR}/tensorflow/contrib/makefile/build_all_android.sh"
-NDK_ROOT="/path/to/ndk/root"
 
 CC_PREFIX=${CC_PREFIX} NDK_ROOT=${NDK_ROOT} "${BUILD_ALL_ANDROID_PATH}" \
 -x "${GEN_LIBS_DIR}" \
@@ -83,11 +90,11 @@ CC_PREFIX=${CC_PREFIX} NDK_ROOT=${NDK_ROOT} "${BUILD_ALL_ANDROID_PATH}" \
 -t hexagon_graph_execution
 ```
 
-#### Push binaries to your Android device
+### Push binaries to your Android device
 
 Before running tests on your Android device, you need to push several binaries to it.
 
-```
+```shell
 adb push "${GEN_LIBS_DIR}/libhexagon_controller.so" "/data/local/tmp"
 adb push "${GEN_LIBS_DIR}/libhexagon_nn_skel.so" "/vendor/lib/rfsa/adsp"
 adb push -p \
@@ -100,40 +107,54 @@ adb shell chmod "${ANDROID_EXEC_FILE_MODE}" \
 adb wait-for-device
 ```
 
-#### Run tests on the device
+### Run tests on the device
 
 Finally, you can run the inference tests on your device.
 
-```
+```shell
 adb shell 'LD_LIBRARY_PATH=/data/local/tmp:$LD_LIBRARY_PATH' \
 "/data/local/tmp/hexagon_graph_execution"
 ```
 
-#### Troubleshooting
-If you're using the Open-Q 820 Snapdragon development kit, you may run into an issue with running the executable due to a missing testsig library. From the Hexagon SDK documentation: *Dynamic shared objects are required to be digitally signed and then authenticated at runtime before they are allowed to be loaded and executed.* Generating a testsig library is necessary to run the unsigned sample library built from this project.
+### Troubleshooting
+
+#### Testsig issue
+
+If you're using the Open-Q 820 Snapdragon Development Kit, you may run into an issue with running the executable due to a missing `testsig` library. From the Hexagon SDK documentation: *Dynamic shared objects are required to be digitally signed and then authenticated at runtime before they are allowed to be loaded and executed.* Generating a testsig library is necessary to run the unsigned sample library built from this project.
 
-If the lack of a testsig library is your problem, you will see errors of the type:
+If the lack of a `testsig` library is your problem, you will see errors of the type:
 `vendor/qcom/proprietary/adsprpc/src/fastrpc_apps_user.c:169::error: -1: 0 == (nErr = remotectl_open(name, (int*)ph, dlerrstr, sizeof(dlerrstr), &dlerr))`
-appearing in adb logcat.
-
-There are several ways to create the testsig library, the only prerequisite is Python and the correct version of the Hexagon-SDK. The following steps is one way to create this library:
-1. Run adb as root: `adb root`
-2. Run the command `adb shell cat /sys/devices/soc0/serial_number`
-3. Convert the decimal number you get as output to hex
-4. Run the python script: `python ${QUALCOMM_SDK}/tools/elfsigner/elfsigner.py -t $(SERIAL_NUMBER_HEX_VALUE)`
-5. The output of the python script is a shared library stored in ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so
-6. Push the shared library to your device:
+appearing in `adb logcat` or ["Expected: (version) >= (1), actual: 0 vs 1" while running a binary from adb](https://github.com/tensorflow/tensorflow/issues/11210).
+
+You need to add a test signature, as described at the beginning of this README. After rebooting your device, you should be able to run the sample application.
+
+#### Qualcomm SDK Linux installation fails with "Malformed \uxxxx encoding"
+
+The installation file is based on LaunchAnywhere, which fails in Linux if the `PS1` env variable contains non-common Unicode chars:
+
 ```
-adb root
-adb wait-for-device
-adb remount
-adb wait-for-device
-adb shell mkdir /system/lib/rfsa
-adb shell mkdir /system/lib/rfsa/adsp
-adb push ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so /system/lib/rfsa/adsp/
+Preparing to install...
+Extracting the JRE from the installer archive...
+Unpacking the JRE...
+Extracting the installation resources from the installer archive...
+Configuring the installer for this system's environment...
+
+Launching installer...
+
+An internal LaunchAnywhere application error has occurred and this application cannot proceed. (LAX)
+
+Stack Trace:
+java.lang.IllegalArgumentException: Malformed \uxxxx encoding.
+  at java.util.Properties.loadConvert(Properties.java:574)
+  at java.util.Properties.load0(Properties.java:391)
+  at java.util.Properties.load(Properties.java:317)
+  at com.zerog.common.java.util.PropertiesUtil.loadProperties(Unknown Source)
+  at com.zerog.lax.LAX.<init>(Unknown Source)
+  at com.zerog.lax.LAX.main(Unknown Source)
 ```
 
-After rebooting your device, you should be able to run the sample application.
+It can be solved by temporarily assigning the `PS1` environment variable to something simple, such as '$'.
+
+## Maintainers
 
-Maintainers:
-- Satoshi Kataoka (satok@google.com, github.com/satok16)
+* Satoshi Kataoka (satok@google.com, github.com/satok16)
diff --git a/tensorflow/contrib/kafka/BUILD b/tensorflow/contrib/kafka/BUILD
new file mode 100644
index 0000000000..efb403462a
--- /dev/null
+++ b/tensorflow/contrib/kafka/BUILD
@@ -0,0 +1,105 @@
+package(
+    default_visibility = ["//visibility:private"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+tf_kernel_library(
+    name = "kafka_kernels",
+    srcs = ["kernels/kafka_dataset_ops.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:bounds_check_lib",
+        "//tensorflow/core/kernels:dataset",
+        "//third_party/eigen3",
+        "@kafka",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["kafka_ops"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_kafka_ops",
+    out = "python/ops/gen_kafka_ops.py",
+    require_shape_functions = True,
+    deps = [":kafka_ops_op_lib"],
+)
+
+py_library(
+    name = "kafka",
+    srcs = [
+        "__init__.py",
+        "python/ops/kafka_dataset_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":gen_kafka_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+# The Kafka server has to be setup before running the test.
+# The Kafka server is setup through Docker so the Docker engine
+# has to be installed.
+#
+# Once the Docker engine is ready:
+# To setup the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh start kafka
+#
+# After the test is complete:
+# To team down the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh stop kafka
+tf_py_test(
+    name = "kafka_test",
+    srcs = ["python/kernel_tests/kafka_test.py"],
+    additional_deps = [
+        ":kafka",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = [
+        "manual",
+        "notap",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/kafka/__init__.py b/tensorflow/contrib/kafka/__init__.py
new file mode 100644
index 0000000000..4d755c4056
--- /dev/null
+++ b/tensorflow/contrib/kafka/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kafka Dataset.
+
+@@KafkaDataset
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops.kafka_dataset_ops import KafkaDataset
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "KafkaDataset",
+]
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
new file mode 100644
index 0000000000..88ef5f3571
--- /dev/null
+++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
@@ -0,0 +1,321 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/tensor.h"
+
+#include "src-cpp/rdkafkacpp.h"
+
+namespace tensorflow {
+
+class KafkaDatasetOp : public DatasetOpKernel {
+ public:
+  using DatasetOpKernel::DatasetOpKernel;
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    const Tensor* topics_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("topics", &topics_tensor));
+    OP_REQUIRES(
+        ctx, topics_tensor->dims() <= 1,
+        errors::InvalidArgument("`topics` must be a scalar or a vector."));
+
+    std::vector<string> topics;
+    topics.reserve(topics_tensor->NumElements());
+    for (int i = 0; i < topics_tensor->NumElements(); ++i) {
+      topics.push_back(topics_tensor->flat<string>()(i));
+    }
+
+    std::string servers = "";
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<std::string>(ctx, "servers", &servers));
+    std::string group = "";
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<std::string>(ctx, "group", &group));
+    bool eof = false;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<bool>(ctx, "eof", &eof));
+    int64 timeout = -1;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "timeout", &timeout));
+    OP_REQUIRES(ctx, (timeout > 0),
+                errors::InvalidArgument(
+                    "Timeout value should be large than 0, got ", timeout));
+    *output = new Dataset(ctx, std::move(topics), servers, group, eof, timeout);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, std::vector<string> topics,
+            const string& servers, const string& group, const bool eof,
+            const int64 timeout)
+        : GraphDatasetBase(ctx),
+          topics_(std::move(topics)),
+          servers_(servers),
+          group_(group),
+          eof_(eof),
+          timeout_(timeout) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Kafka")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override { return "KafkaDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* topics = nullptr;
+      TF_RETURN_IF_ERROR(b->AddVector(topics_, &topics));
+      Node* servers = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(servers_, &servers));
+      Node* group = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(group_, &group));
+      Node* eof = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(eof_, &eof));
+      Node* timeout = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(timeout_, &timeout));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {topics, servers, group, eof, timeout}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          // We are currently processing a topic, so try to read the next line.
+          if (consumer_.get()) {
+            while (true) {
+              if (limit_ >= 0 &&
+                  (topic_partition_->offset() >= limit_ || offset_ >= limit_)) {
+                // EOF current topic
+                break;
+              }
+              std::unique_ptr<RdKafka::Message> message(
+                  consumer_->consume(dataset()->timeout_));
+              if (message->err() == RdKafka::ERR_NO_ERROR) {
+                // Produce the line as output.
+                Tensor line_tensor(cpu_allocator(), DT_STRING, {});
+                line_tensor.scalar<string>()() =
+                    std::string(static_cast<const char*>(message->payload()),
+                                message->len());
+                out_tensors->emplace_back(std::move(line_tensor));
+                *end_of_sequence = false;
+                // Sync offset
+                offset_ = message->offset();
+                return Status::OK();
+              }
+
+              if (message->err() == RdKafka::ERR__PARTITION_EOF &&
+                  dataset()->eof_) {
+                // EOF current topic
+                break;
+              }
+              if (message->err() != RdKafka::ERR__TIMED_OUT) {
+                return errors::Internal("Failed to consume:",
+                                        message->errstr());
+              }
+              message.reset(nullptr);
+              consumer_->poll(0);
+            }
+
+            // We have reached the end of the current topic, so maybe
+            // move on to next topic.
+            ResetStreamsLocked();
+            ++current_topic_index_;
+          }
+
+          // Iteration ends when there are no more topic to process.
+          if (current_topic_index_ == dataset()->topics_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+        } while (true);
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_topic_index"),
+                                               current_topic_index_));
+
+        // `consumer_` is empty if
+        // 1. GetNext has not been called even once.
+        // 2. All topics have been read and iterator has been exhausted.
+        if (consumer_.get()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("current_pos"), offset_));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        ResetStreamsLocked();
+        int64 current_topic_index;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("current_topic_index"),
+                                              &current_topic_index));
+        current_topic_index_ = size_t(current_topic_index);
+        // The key "current_pos" is written only if the iterator was saved
+        // with an open topic.
+        if (reader->Contains(full_name("current_pos"))) {
+          int64 current_pos;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("current_pos"), &current_pos));
+
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+          topic_partition_->set_offset(current_pos);
+          if (topic_partition_->offset() != current_pos) {
+            return errors::Internal("Failed to restore to offset ",
+                                    current_pos);
+          }
+          offset_ = current_pos;
+        }
+        return Status::OK();
+      }
+
+     private:
+      // Sets up Kafka streams to read from the topic at
+      // `current_topic_index_`.
+      Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (current_topic_index_ >= dataset()->topics_.size()) {
+          return errors::InvalidArgument(
+              "current_topic_index_:", current_topic_index_,
+              " >= topics_.size():", dataset()->topics_.size());
+        }
+
+        // Actually move on to next topic.
+        string entry = dataset()->topics_[current_topic_index_];
+
+        std::vector<string> parts = str_util::Split(entry, ":");
+        if (parts.size() < 1) {
+          return errors::InvalidArgument("Invalid parameters: ", entry);
+        }
+        string topic = parts[0];
+        int32 partition = 0;
+        if (parts.size() > 1) {
+          if (!strings::safe_strto32(parts[1], &partition)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+        int64 offset = 0;
+        if (parts.size() > 2) {
+          if (!strings::safe_strto64(parts[2], &offset)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        topic_partition_.reset(
+            RdKafka::TopicPartition::create(topic, partition, offset));
+
+        offset_ = topic_partition_->offset();
+        limit_ = -1;
+        if (parts.size() > 3) {
+          if (!strings::safe_strto64(parts[3], &limit_)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        std::unique_ptr<RdKafka::Conf> conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL));
+        std::unique_ptr<RdKafka::Conf> topic_conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_TOPIC));
+
+        std::string errstr;
+
+        RdKafka::Conf::ConfResult result =
+            conf->set("default_topic_conf", topic_conf.get(), errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set default_topic_conf:", errstr);
+        }
+
+        result = conf->set("bootstrap.servers", dataset()->servers_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set bootstrap.servers ",
+                                  dataset()->servers_, ":", errstr);
+        }
+        result = conf->set("group.id", dataset()->group_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set group.id ", dataset()->group_,
+                                  ":", errstr);
+        }
+
+        consumer_.reset(RdKafka::KafkaConsumer::create(conf.get(), errstr));
+        if (!consumer_.get()) {
+          return errors::Internal("Failed to create consumer:", errstr);
+        }
+
+        std::vector<RdKafka::TopicPartition*> partitions;
+        partitions.emplace_back(topic_partition_.get());
+        RdKafka::ErrorCode err = consumer_->assign(partitions);
+        if (err != RdKafka::ERR_NO_ERROR) {
+          return errors::Internal(
+              "Failed to assign partition [", topic_partition_->topic(), ", ",
+              topic_partition_->partition(), ", ", topic_partition_->offset(),
+              "]:", RdKafka::err2str(err));
+        }
+
+        return Status::OK();
+      }
+
+      // Resets all Kafka streams.
+      void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        consumer_->unassign();
+        consumer_->close();
+        consumer_.reset(nullptr);
+      }
+
+      mutex mu_;
+      size_t current_topic_index_ GUARDED_BY(mu_) = 0;
+      int64 offset_ GUARDED_BY(mu_) = 0;
+      int64 limit_ GUARDED_BY(mu_) = -1;
+      std::unique_ptr<RdKafka::TopicPartition> topic_partition_ GUARDED_BY(mu_);
+      std::unique_ptr<RdKafka::KafkaConsumer> consumer_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<string> topics_;
+    const std::string servers_;
+    const std::string group_;
+    const bool eof_;
+    const int64 timeout_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("KafkaDataset").Device(DEVICE_CPU),
+                        KafkaDatasetOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/ops/kafka_ops.cc b/tensorflow/contrib/kafka/ops/kafka_ops.cc
new file mode 100644
index 0000000000..8cdf16103b
--- /dev/null
+++ b/tensorflow/contrib/kafka/ops/kafka_ops.cc
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("KafkaDataset")
+    .Input("topics: string")
+    .Input("servers: string")
+    .Input("group: string")
+    .Input("eof: bool")
+    .Input("timeout: int64")
+    .Output("handle: variant")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that emits the messages of one or more Kafka topics.
+
+topics: A `tf.string` tensor containing one or more subscriptions,
+  in the format of [topic:partition:offset:length],
+  by default length is -1 for unlimited.
+servers: A list of bootstrap servers.
+group: The consumer group id.
+eof: If True, the kafka reader will stop on EOF.
+timeout: The timeout value for the Kafka Consumer to wait
+  (in millisecond).
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
new file mode 100644
index 0000000000..621911876f
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
@@ -0,0 +1,115 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for KafkaDataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops import kafka_dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class KafkaDatasetTest(test.TestCase):
+
+  def setUp(self):
+    # The Kafka server has to be setup before the test
+    # and tear down after the test manually.
+    # The docker engine has to be installed.
+    #
+    # To setup the Kafka server:
+    # $ bash kafka_test.sh start kafka
+    #
+    # To team down the Kafka server:
+    # $ bash kafka_test.sh stop kafka
+    pass
+
+  def testKafkaDataset(self):
+    topics = array_ops.placeholder(dtypes.string, shape=[None])
+    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_dataset = kafka_dataset_ops.KafkaDataset(
+        topics, group="test", eof=True).repeat(num_epochs)
+    batch_dataset = repeat_dataset.batch(batch_size)
+
+    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
+    init_op = iterator.make_initializer(repeat_dataset)
+    init_batch_op = iterator.make_initializer(batch_dataset)
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Basic test: read from topic 0.
+      sess.run(init_op, feed_dict={topics: ["test:0:0:4"], num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D" + str(i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from topic 1.
+      sess.run(init_op, feed_dict={topics: ["test:0:5:-1"], num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D" + str(i + 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from both topics.
+      sess.run(
+          init_op,
+          feed_dict={
+              topics: ["test:0:0:4", "test:0:5:-1"],
+              num_epochs: 1
+          })
+      for j in range(2):
+        for i in range(5):
+          self.assertEqual("D" + str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test repeated iteration through both files.
+      sess.run(
+          init_op,
+          feed_dict={
+              topics: ["test:0:0:4", "test:0:5:-1"],
+              num_epochs: 10
+          })
+      for _ in range(10):
+        for j in range(2):
+          for i in range(5):
+            self.assertEqual("D" + str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test batched and repeated iteration through both files.
+      sess.run(
+          init_batch_op,
+          feed_dict={
+              topics: ["test:0:0:4", "test:0:5:-1"],
+              num_epochs: 10,
+              batch_size: 5
+          })
+      for _ in range(10):
+        self.assertAllEqual(["D" + str(i) for i in range(5)],
+                            sess.run(get_next))
+        self.assertAllEqual(["D" + str(i + 5) for i in range(5)],
+                            sess.run(get_next))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
new file mode 100644
index 0000000000..adf027b8e7
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+set -o pipefail
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 start|stop <kafka container name>" >&2
+  exit 1
+fi
+
+container=$2
+if [ "$1" == "start" ]; then
+    docker run -d --rm --net=host --name=$container spotify/kafka
+    echo Wait 5 secs until kafka is up and running
+    sleep 5
+    echo Create test topic
+    docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test'
+    echo Create test message
+    docker exec $container bash -c 'echo -e "D0\nD1\nD2\nD3\nD4\nD5\nD6\nD7\nD8\nD9" > /test'
+    echo Produce test message
+    docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-console-producer.sh --topic test --broker-list 127.0.0.1:9092 < /test'
+
+    echo Container $container started successfully
+elif [ "$1" == "stop" ]; then
+    docker rm -f $container
+
+    echo Container $container stopped successfully
+else
+  echo "Usage: $0 start|stop <kafka container name>" >&2
+  exit 1
+fi
+
+
+
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
new file mode 100644
index 0000000000..8e51d27a34
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -0,0 +1,74 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kafka Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops import gen_kafka_ops
+from tensorflow.python.data.ops.readers import Dataset
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+
+
+class KafkaDataset(Dataset):
+  """A Kafka Dataset that consumes the message.
+  """
+
+  def __init__(self,
+               topics,
+               servers="localhost",
+               group="",
+               eof=False,
+               timeout=1000):
+    """Create a KafkaReader.
+
+    Args:
+      topics: A `tf.string` tensor containing one or more subscriptions,
+              in the format of [topic:partition:offset:length],
+              by default length is -1 for unlimited.
+      servers: A list of bootstrap servers.
+      group: The consumer group id.
+      eof: If True, the kafka reader will stop on EOF.
+      timeout: The timeout value for the Kafka Consumer to wait
+               (in millisecond).
+    """
+    super(KafkaDataset, self).__init__()
+    self._topics = ops.convert_to_tensor(
+        topics, dtype=dtypes.string, name="topics")
+    self._servers = ops.convert_to_tensor(
+        servers, dtype=dtypes.string, name="servers")
+    self._group = ops.convert_to_tensor(
+        group, dtype=dtypes.string, name="group")
+    self._eof = ops.convert_to_tensor(eof, dtype=dtypes.bool, name="eof")
+    self._timeout = ops.convert_to_tensor(
+        timeout, dtype=dtypes.int64, name="timeout")
+
+  def _as_variant_tensor(self):
+    return gen_kafka_ops.kafka_dataset(self._topics, self._servers, self._group,
+                                       self._eof, self._timeout)
+
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  @property
+  def output_types(self):
+    return dtypes.string
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index 6c624929f2..ef419862b4 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -27,6 +27,7 @@ See the @{$python/contrib.layers} guide.
 @@convolution2d_transpose
 @@conv3d_transpose
 @@convolution3d_transpose
+@@dense_to_sparse
 @@dropout
 @@elu
 @@embedding_lookup_unique
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 7c52da7b49..1c3af19a6c 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.layers.python.layers import initializers
 from tensorflow.contrib.layers.python.layers import utils
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
@@ -58,12 +59,12 @@ __all__ = [
     'avg_pool2d', 'avg_pool3d', 'batch_norm', 'bias_add', 'conv2d', 'conv3d',
     'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution',
     'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose',
-    'convolution3d', 'convolution3d_transpose', 'dropout', 'elu', 'flatten',
-    'fully_connected', 'GDN', 'gdn', 'layer_norm', 'linear', 'pool',
-    'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat',
-    'scale_gradient', 'separable_conv2d', 'separable_convolution2d', 'softmax',
-    'spatial_softmax', 'stack', 'unit_norm', 'legacy_fully_connected',
-    'legacy_linear', 'legacy_relu', 'maxout'
+    'convolution3d', 'convolution3d_transpose', 'dense_to_sparse', 'dropout',
+    'elu', 'flatten', 'fully_connected', 'GDN', 'gdn', 'layer_norm', 'linear',
+    'pool', 'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu', 'relu6',
+    'repeat', 'scale_gradient', 'separable_conv2d', 'separable_convolution2d',
+    'softmax', 'spatial_softmax', 'stack', 'unit_norm',
+    'legacy_fully_connected', 'legacy_linear', 'legacy_relu', 'maxout'
 ]
 
 DATA_FORMAT_NCHW = 'NCHW'
@@ -1400,6 +1401,30 @@ def convolution3d_transpose(
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
 
+@add_arg_scope
+def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
+  """Converts a dense tensor into a sparse tensor.
+  An example use would be to convert dense labels to sparse ones
+  so that they can be fed to the ctc_loss.
+
+  Args:
+     tensor: An `int` `Tensor` to be converted to a `Sparse`.
+     eos_token: An integer.
+       It is part of the target label that signfies the end of a sentence.
+     outputs_collections: Collection to add the outputs.
+     scope: Optional scope for name_scope.
+  """
+  with variable_scope.variable_scope(scope, 'dense_to_sparse', [tensor]) as sc:
+    tensor = ops.convert_to_tensor(tensor)
+    indices = array_ops.where(
+        math_ops.not_equal(tensor, constant_op.constant(eos_token,
+                                                        tensor.dtype)))
+    values = array_ops.gather_nd(tensor, indices)
+    shape = array_ops.shape(tensor, out_type=dtypes.int64)
+    outputs = sparse_tensor.SparseTensor(indices, values, shape)
+    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+
+
 @add_arg_scope
 def dropout(inputs,
             keep_prob=0.5,
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 49b23ce8fa..972ff10bf9 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
@@ -1301,6 +1302,19 @@ class ConvolutionInPlaneTest(test.TestCase):
       self.assertAllClose(result, expected, rtol=1e-5, atol=1e-5)
 
 
+class DenseToSparseTest(test.TestCase):
+
+  def testDenseFromConstantToSparse(self):
+    expected_constant = np.reshape(np.arange(24, dtype=np.int64), (3, 4, 2))
+    tensor = constant_op.constant(expected_constant)
+    sparse = _layers.dense_to_sparse(tensor)
+    dense = sparse_ops.sparse_to_dense(sparse.indices, sparse.dense_shape,
+                                       sparse.values)
+    with self.test_session() as sess:
+      constant = sess.run(dense)
+      self.assertAllEqual(expected_constant, constant)
+
+
 class DropoutTest(test.TestCase):
 
   def testCreateDropout(self):
diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
index 649996c49c..9a843168c2 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
@@ -151,7 +151,7 @@ def spirals(n_samples=100,
   # Add more points if n_samples is not divisible by n_classes (unbalanced!)
   extras = n_samples % n_classes
   if extras > 0:
-    x_exrta, y_extra = _modes[mode](np.random.rand(extras) * 2 * np.pi, *args,
+    x_extra, y_extra = _modes[mode](np.random.rand(extras) * 2 * np.pi, *args,
                                     **kwargs)
     spir_x = np.append(spir_x, x_extra)
     spir_y = np.append(spir_y, y_extra)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
index 613d8d39a3..5809995c8c 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
@@ -136,6 +136,9 @@ class SyntheticTest(test.TestCase):
       self.assertRaises(AssertionError, np.testing.assert_array_equal,
                         spir0.data, spir1.data)
 
+  def test_spirals_synthetic(self):
+    synthetic.spirals(3)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
index 12f9bba531..2bd57597c2 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
@@ -1224,7 +1224,7 @@ class DNNRegressorTest(test.TestCase):
       self, predictions, expected_shape):
     predictions_nparray = np.array(predictions)
     self.assertAllEqual(expected_shape, predictions_nparray.shape)
-    self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.float))
+    self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.floating))
 
   def testPredict_AsIterableFalse(self):
     """Tests predict method with as_iterable=False."""
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 0a097d5a69..19829e4991 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -5,25 +5,25 @@ def tflite_copts():
   copts = [
       "-DFARMHASH_NO_CXX_STRING",
   ] + select({
-          "//tensorflow:android_arm64": [
+          str(Label("//tensorflow:android_arm64")): [
               "-std=c++11",
               "-O3",
           ],
-          "//tensorflow:android_arm": [
+          str(Label("//tensorflow:android_arm")): [
               "-mfpu=neon",
               "-mfloat-abi=softfp",
               "-std=c++11",
               "-O3",
           ],
-          "//tensorflow:android_x86": [
+          str(Label("//tensorflow:android_x86")): [
               "-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK",
           ],
-          "//tensorflow:ios_x86_64": [
+          str(Label("//tensorflow:ios_x86_64")): [
               "-msse4.1",
           ],
           "//conditions:default": [],
   }) + select({
-      "//tensorflow:with_default_optimizations": [],
+      str(Label("//tensorflow:with_default_optimizations")): [],
       "//conditions:default": ["-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK"],
   })
 
diff --git a/tensorflow/contrib/lite/examples/label_image/BUILD b/tensorflow/contrib/lite/examples/label_image/BUILD
index 476d85c031..959347b549 100644
--- a/tensorflow/contrib/lite/examples/label_image/BUILD
+++ b/tensorflow/contrib/lite/examples/label_image/BUILD
@@ -42,7 +42,15 @@ cc_library(
         "bitmap_helpers_impl.h",
         "label_image.h",
     ],
-    deps = ["//tensorflow/contrib/lite:string"],
+    deps = [
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite:string",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+    ],
 )
 
 # TODO(ahentz): Test disabled as it has a memory leek from read_bmp
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
index 860e27e5ba..97343dde6b 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H
-#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H
+#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H_
+#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H_
 
 #include "tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h"
 #include "tensorflow/contrib/lite/examples/label_image/label_image.h"
@@ -26,15 +26,15 @@ uint8_t* read_bmp(const std::string& input_bmp_name, int* width, int* height,
                   int* channels, Settings* s);
 
 template <class T>
-void downsize(T* out, uint8_t* in, int image_height, int image_width,
-              int image_channels, int wanted_height, int wanted_width,
-              int wanted_channels, Settings* s);
+void resize(T* out, uint8_t* in, int image_height, int image_width,
+            int image_channels, int wanted_height, int wanted_width,
+            int wanted_channels, Settings* s);
 
 // explicit instantiation
-template void downsize<uint8_t>(uint8_t*, unsigned char*, int, int, int, int,
-                                int, int, Settings*);
-template void downsize<float>(float*, unsigned char*, int, int, int, int, int,
+template void resize<uint8_t>(uint8_t*, unsigned char*, int, int, int, int, int,
                               int, Settings*);
+template void resize<float>(float*, unsigned char*, int, int, int, int, int,
+                            int, Settings*);
 
 }  // namespace label_image
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
index 64a931082b..d57f597875 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
@@ -13,8 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H
-#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H
+#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
+#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/version.h"
 
 #include "tensorflow/contrib/lite/examples/label_image/label_image.h"
 
@@ -22,28 +28,67 @@ namespace tflite {
 namespace label_image {
 
 template <class T>
-void downsize(T* out, uint8_t* in, int image_height, int image_width,
-              int image_channels, int wanted_height, int wanted_width,
-              int wanted_channels, Settings* s) {
-  for (int y = 0; y < wanted_height; ++y) {
-    const int in_y = (y * image_height) / wanted_height;
-    uint8_t* in_row = in + (in_y * image_width * image_channels);
-    T* out_row = out + (y * wanted_width * wanted_channels);
-    for (int x = 0; x < wanted_width; ++x) {
-      const int in_x = (x * image_width) / wanted_width;
-      uint8_t* in_pixel = in_row + (in_x * image_channels);
-      T* out_pixel = out_row + (x * wanted_channels);
-      for (int c = 0; c < wanted_channels; ++c) {
-        if (s->input_floating)
-          out_pixel[c] = (in_pixel[c] - s->input_mean) / s->input_std;
-        else
-          out_pixel[c] = in_pixel[c];
-      }
-    }
+void resize(T* out, uint8_t* in, int image_height, int image_width,
+            int image_channels, int wanted_height, int wanted_width,
+            int wanted_channels, Settings* s) {
+  int number_of_pixels = image_height * image_width * image_channels;
+  std::unique_ptr<Interpreter> interpreter(new Interpreter);
+
+  int base_index = 0;
+
+  // two inputs: input and new_sizes
+  interpreter->AddTensors(2, &base_index);
+  // one output
+  interpreter->AddTensors(1, &base_index);
+  // set input and output tensors
+  interpreter->SetInputs({0, 1});
+  interpreter->SetOutputs({2});
+
+  // set parameters of tensors
+  TfLiteQuantizationParams quant;
+  interpreter->SetTensorParametersReadWrite(
+      0, kTfLiteFloat32, "input",
+      {1, image_height, image_width, image_channels}, quant);
+  interpreter->SetTensorParametersReadWrite(1, kTfLiteInt32, "new_size", {2},
+                                            quant);
+  interpreter->SetTensorParametersReadWrite(
+      2, kTfLiteFloat32, "output",
+      {1, wanted_height, wanted_width, wanted_channels}, quant);
+
+  ops::builtin::BuiltinOpResolver resolver;
+  TfLiteRegistration* resize_op =
+      resolver.FindOp(BuiltinOperator_RESIZE_BILINEAR);
+  interpreter->AddNodeWithParameters({0, 1}, {2}, nullptr, 0, nullptr,
+                                     resize_op, nullptr);
+
+  interpreter->AllocateTensors();
+
+  // fill input image
+  // in[] are integers, cannot do memcpy() directly
+  auto input = interpreter->typed_tensor<float>(0);
+  for (int i = 0; i < number_of_pixels; i++) {
+    input[i] = in[i];
+  }
+
+  // fill new_sizes
+  interpreter->typed_tensor<int>(1)[0] = wanted_height;
+  interpreter->typed_tensor<int>(1)[1] = wanted_width;
+
+  interpreter->Invoke();
+
+  auto output = interpreter->typed_tensor<float>(2);
+  auto output_number_of_pixels =
+      wanted_height * wanted_height * wanted_channels;
+
+  for (int i = 0; i < output_number_of_pixels; i++) {
+    if (s->input_floating)
+      out[i] = (output[i] - s->input_mean) / s->input_std;
+    else
+      out[i] = (uint8_t)output[i];
   }
 }
 
 }  // namespace label_image
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H
+#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc
index 4d2e1ce0bc..a91467d345 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.cc
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc
@@ -148,14 +148,22 @@ void RunInference(Settings* s) {
   int wanted_width = dims->data[2];
   int wanted_channels = dims->data[3];
 
-  if (s->input_floating) {
-    downsize<float>(interpreter->typed_tensor<float>(input), in, image_height,
+  switch (interpreter->tensor(input)->type) {
+    case kTfLiteFloat32:
+      s->input_floating = true;
+      resize<float>(interpreter->typed_tensor<float>(input), in, image_height,
                     image_width, image_channels, wanted_height, wanted_width,
                     wanted_channels, s);
-  } else {
-    downsize<uint8_t>(interpreter->typed_tensor<uint8_t>(input), in,
+      break;
+    case kTfLiteUInt8:
+      resize<uint8_t>(interpreter->typed_tensor<uint8_t>(input), in,
                       image_height, image_width, image_channels, wanted_height,
                       wanted_width, wanted_channels, s);
+      break;
+    default:
+      LOG(FATAL) << "cannot handle input type "
+                 << interpreter->tensor(input)->type << " yet";
+      exit(-1);
   }
 
   struct timeval start_time, stop_time;
@@ -177,13 +185,21 @@ void RunInference(Settings* s) {
 
   std::vector<std::pair<float, int>> top_results;
 
-  if (s->input_floating) {
-    get_top_n<float>(interpreter->typed_output_tensor<float>(0), output_size,
-                     num_results, threshold, &top_results, s->input_floating);
-  } else {
-    get_top_n<uint8_t>(interpreter->typed_output_tensor<uint8_t>(0),
-                       output_size, num_results, threshold, &top_results,
-                       s->input_floating);
+  int output = interpreter->outputs()[0];
+  switch (interpreter->tensor(output)->type) {
+    case kTfLiteFloat32:
+      get_top_n<float>(interpreter->typed_output_tensor<float>(0), output_size,
+                       num_results, threshold, &top_results, true);
+      break;
+    case kTfLiteUInt8:
+      get_top_n<uint8_t>(interpreter->typed_output_tensor<uint8_t>(0),
+                         output_size, num_results, threshold, &top_results,
+                         false);
+      break;
+    default:
+      LOG(FATAL) << "cannot handle output type "
+                 << interpreter->tensor(input)->type << " yet";
+      exit(-1);
   }
 
   std::vector<string> labels;
@@ -203,13 +219,11 @@ void display_usage() {
   LOG(INFO) << "label_image\n"
             << "--accelerated, -a: [0|1], use Android NNAPI or note\n"
             << "--count, -c: loop interpreter->Invoke() for certain times\n"
-            << "--input_floating, -f: [0|1] type of input layer is floating "
-               "point numbers\n"
             << "--input_mean, -b: input mean\n"
             << "--input_std, -s: input standard deviation\n"
             << "--image, -i: image_name.bmp\n"
             << "--labels, -l: labels for the model\n"
-            << "--tflite_mode, -m: model_name.tflite\n"
+            << "--tflite_model, -m: model_name.tflite\n"
             << "--threads, -t: number of threads\n"
             << "--verbose, -v: [0|1] print more information\n"
             << "\n";
@@ -223,7 +237,6 @@ int Main(int argc, char** argv) {
     static struct option long_options[] = {
         {"accelerated", required_argument, 0, 'a'},
         {"count", required_argument, 0, 'c'},
-        {"input_floating", required_argument, 0, 'f'},
         {"verbose", required_argument, 0, 'v'},
         {"image", required_argument, 0, 'i'},
         {"labels", required_argument, 0, 'l'},
@@ -254,11 +267,6 @@ int Main(int argc, char** argv) {
         s.loop_count = strtol(  // NOLINT(runtime/deprecated_fn)
             optarg, (char**)NULL, 10);
         break;
-      case 'f':
-        s.input_floating = strtol(  // NOLINT(runtime/deprecated_fn)
-            optarg, (char**)NULL, 10);
-        s.input_layer_type = "float";
-        break;
       case 'i':
         s.input_bmp_name = optarg;
         break;
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.h b/tensorflow/contrib/lite/examples/label_image/label_image.h
index ce98e06fc1..4de32e33fb 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.h
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.h
@@ -16,9 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
 #define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
 
-#include <string>
 #include "tensorflow/contrib/lite/string.h"
 
+namespace tflite {
+namespace label_image {
+
 struct Settings {
   bool verbose = false;
   bool accel = false;
@@ -33,4 +35,7 @@ struct Settings {
   int number_of_threads = 4;
 };
 
+}  // namespace label_image
+}  // namespace tflite
+
 #endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.md b/tensorflow/contrib/lite/examples/label_image/label_image.md
index d6019d673f..9ce32cf101 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.md
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.md
@@ -1,8 +1,12 @@
 label_image for TensorFlow Lite inspired by TensorFlow's label_image.
+
+To build label_image for Android, run $TENSORFLOW_ROOT/configure 
+and set Android NDK or configure NDK setting in 
+$TENSORFLOW_ROOT/WORKSPACE first.
  
 To build it for android ARMv8:
 ```
-> bazel build --cxxopt=-std=c++11 \
+> bazel build --config monolithic --cxxopt=-std=c++11 \
   --crosstool_top=//external:android/crosstool \
   --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
   --cpu=arm64-v8a \
@@ -10,13 +14,13 @@ To build it for android ARMv8:
 ```
 or
 ```
-> bazel build --config android_arm64 --cxxopt=-std=c++11 \
+> bazel build --config android_arm64 --config monolithic --cxxopt=-std=c++11 \
   //tensorflow/contrib/lite/examples/label_image:label_image
 ```
 
 To build it for android arm-v7a:
 ```
-> bazel build --cxxopt=-std=c++11 \
+> bazel build --config monolithic --cxxopt=-std=c++11 \
   --crosstool_top=//external:android/crosstool \
   --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
   --cpu=armeabi-v7a \
@@ -24,7 +28,7 @@ To build it for android arm-v7a:
 ```
 or
 ```
-> bazel build --config android_arm --cxxopt=-std=c++11 \
+> bazel build --config android_arm --config monolithic --cxxopt=-std=c++11 \
   //tensorflow/contrib/lite/examples/label_image:label_image
 ```
 
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 4691a543e9..a6ccc99a51 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -278,6 +278,8 @@ cc_library(
         "optimized/neon_tensor_utils.cc",
     ],
     hdrs = [
+        "common.h",
+        "optimized/cpu_check.h",
         "optimized/neon_tensor_utils.h",
         "optimized/tensor_utils_impl.h",
     ],
@@ -285,8 +287,11 @@ cc_library(
     deps = [
         ":cpu_check",
         ":portable_tensor_utils",
+        ":types",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite/kernels:activation_functor",
+        "@arm_neon_2_x86_sse",
+        "@gemmlowp",
     ],
 )
 
@@ -306,14 +311,21 @@ cc_library(
         "tensor_utils.cc",
     ],
     hdrs = [
+        "common.h",
+        "compatibility.h",
+        "optimized/cpu_check.h",
+        "optimized/neon_tensor_utils.h",
         "optimized/tensor_utils_impl.h",
         "reference/portable_tensor_utils.h",
         "tensor_utils.h",
+        "types.h",
     ],
     copts = NEON_FLAGS_IF_APPLICABLE,
     deps = [
         "//tensorflow/contrib/lite/kernels:activation_functor",
         "//tensorflow/contrib/lite:builtin_op_data",
+        "@arm_neon_2_x86_sse",
+        "@gemmlowp",
     ] + select({
         ":arm": [
             ":neon_tensor_utils",
@@ -333,6 +345,18 @@ cc_library(
         ":ios_arm64": [
             ":neon_tensor_utils",
         ],
+        ":x86_64": [
+            ":neon_tensor_utils",
+        ],
+        ":x86": [
+            ":neon_tensor_utils",
+        ],
+        ":k8": [
+            ":neon_tensor_utils",
+        ],
+        ":darwin": [
+            ":neon_tensor_utils",
+        ],
         "//conditions:default": [
             ":portable_tensor_utils",
         ],
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
index 6cb556bf45..3a53d3ab07 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
@@ -34,7 +34,7 @@ inline bool TestCPUFeatureNeon() {
 #endif  // __aarch64__
 }
 
-#elif __ARM_NEON
+#elif defined USE_NEON || defined __ARM_NEON
 
 inline bool TestCPUFeatureNeon() { return true; }
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
index bf0bdfb1fb..883c7f270d 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -16,11 +16,11 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
 
 #ifdef USE_NEON
 
-#include <arm_neon.h>
 #define kFloatWeightsPerNeonLane 4
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
index 904a97803a..f4181b18a8 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
 
 #ifndef USE_NEON
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index 7019c29959..76032771af 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -1571,7 +1571,7 @@ inline int ANeuralNetworksModel_addOperation(ANeuralNetworksModel* model,
 }
 
 /**
- * Specfifies which operands will be the model's inputs and outputs.
+ * Specifies which operands will be the model's inputs and outputs.
  *
  * An operand cannot be used for both input and output. Doing so will
  * return an error.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
index 2340f0e850..6961e23690 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
@@ -132,6 +132,7 @@ bool GraphTransformationsPass(int increment, Model* model,
   CHECK(increment == 1 || increment == -1);
   bool changed = false;
   if (model->operators.empty()) {
+    LOG(INFO) << "Model is empty!!!";
     return false;
   }
   int op_index = increment == 1 ? 0 : model->operators.size() - 1;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index 833c97c758..e79e2a32fc 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -189,7 +189,10 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
 
   // Remove all the resolved arrays.
   for (const string& input_name : concat_op->inputs) {
-    model->EraseArray(input_name);
+    // Check to prevent removal of shared tensors
+    if (CountOpsWithInput(*model, input_name) == 1) {
+      model->EraseArray(input_name);
+    }
   }
 
   // Remove concatenate operator
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 8f12bc59fb..0bee694387 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
 #define TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
 
+#include <functional>
 #include <initializer_list>
 #include <memory>
 #include <string>
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 1add90fb82..ce0fde57f4 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -698,10 +698,11 @@ void CheckNonExistentIOArrays(const Model& model) {
 void CheckNoMissingArray(const Model& model) {
   for (const auto& op : model.operators) {
     for (const auto& input : op->inputs) {
-      CHECK(model.HasArray(input) || model.optional_arrays.count(input));
+      CHECK(model.HasArray(input) || model.optional_arrays.count(input))
+          << "Input: " << input << " missing for op: " << op->outputs[0] << ".";
     }
     for (const auto& output : op->outputs) {
-      CHECK(model.HasArray(output));
+      CHECK(model.HasArray(output)) << "Output: " << output << " missing.";
     }
   }
   CheckNonExistentIOArrays(model);
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index dd5770dc99..81327407d4 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -377,10 +377,10 @@ $(MARCH_OPTION) \
 
 	ifeq ($(BUILD_FOR_TEGRA),1)
 		NVCC := $(JETPACK)/cuda/bin/nvcc
-		NVCCFLAGS := -x=cu -D__CUDACC__ -DNVCC -DNVIDIA_TEGRA -ccbin $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-g++ --std c++11 --expt-relaxed-constexpr -m64 -gencode arch=compute_53,\"code=sm_53\" -gencode arch=compute_62,\"code=sm_62\" -DEIGEN_AVOID_STL_ARRAY -DTENSORFLOW_USE_EIGEN_THREADPOOL -DLANG_CXX11 -DEIGEN_HAS_C99_MATH -DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=5.3
+		NVCCFLAGS := -x=cu -D__CUDACC__ -DNVCC -DANDROID_TEGRA -ccbin $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-g++ --std c++11 --expt-relaxed-constexpr -m64 -gencode arch=compute_53,\"code=sm_53\" -gencode arch=compute_62,\"code=sm_62\" -DEIGEN_AVOID_STL_ARRAY -DTENSORFLOW_USE_EIGEN_THREADPOOL -DLANG_CXX11 -DEIGEN_HAS_C99_MATH -DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=5.3
 		CXXFLAGS4NVCC =\
 -DIS_SLIM_BUILD \
--DNVIDIA_TEGRA \
+-DANDROID_TEGRA \
 -fno-exceptions \
 -DNDEBUG $(OPTFLAGS) \
 -march=armv8-a \
@@ -391,7 +391,7 @@ $(MARCH_OPTION) \
 		CXXFLAGS +=\
 -DGOOGLE_CUDA=1 \
 -D__ANDROID_TYPES_FULL__ \
--DNVIDIA_TEGRA \
+-DANDROID_TEGRA \
 -DEIGEN_AVOID_STL_ARRAY \
 -DEIGEN_HAS_C99_MATH \
 -DLANG_CXX11 -DTENSORFLOW_USE_EIGEN_THREADPOOL -DTF_EXTRA_CUDA_CAPABILITIES=5.3
@@ -407,7 +407,7 @@ $(MARCH_OPTION) \
 -I$(JETPACK)/cuda/extras/CUPTI/include
 
 
-		LIBS += \
+		CUDA_LIBS := \
 -ltfcuda \
 -lcudart_static \
 -lcudnn \
@@ -420,10 +420,10 @@ $(MARCH_OPTION) \
 -lculibos \
 -lcurand_static
 
-		OBJDIR := $(OBJDIR)Tegra/
-		LIBDIR := $(LIBDIR)Tegra/
-		BINDIR := $(BINDIR)Tegra/
-		DEPDIR := $(DEPDIR)Tegra/
+		OBJDIR := $(OBJDIR)android_arm64-v8a/
+		LIBDIR := $(LIBDIR)android_arm64-v8a/
+		BINDIR := $(BINDIR)android_arm64-v8a/
+		DEPDIR := $(DEPDIR)android_arm64-v8a/
 
 		TEGRA_LIBS := \
 -L$(JETPACK)/cuda/targets/aarch64-linux-androideabi/lib \
@@ -606,7 +606,8 @@ $(wildcard tensorflow/core/util/*/*.cc) \
 tensorflow/core/util/version_info.cc
 # Remove duplicates (for version_info.cc)
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
-CORE_CC_EXCLUDE_SRCS := \
+
+CORE_CC_EXCLUDE_SRCS_NON_GPU := \
 $(wildcard tensorflow/core/*/*test.cc) \
 $(wildcard tensorflow/core/*/*testutil*) \
 $(wildcard tensorflow/core/*/*testlib*) \
@@ -626,49 +627,31 @@ $(wildcard tensorflow/core/lib/jpeg/*) \
 $(wildcard tensorflow/core/lib/png/*) \
 $(wildcard tensorflow/core/util/events_writer.*) \
 $(wildcard tensorflow/core/util/reporter.*) \
-$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \
-$(wildcard tensorflow/core/platform/default/stream_executor.*) \
 $(wildcard tensorflow/core/platform/default/test_benchmark.*) \
-$(wildcard tensorflow/core/platform/cuda.h) \
-$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \
 $(wildcard tensorflow/core/platform/cloud/*) \
 $(wildcard tensorflow/core/platform/google/*) \
 $(wildcard tensorflow/core/platform/google/*/*) \
 $(wildcard tensorflow/core/platform/jpeg.*) \
 $(wildcard tensorflow/core/platform/png.*) \
 $(wildcard tensorflow/core/platform/s3/*) \
-$(wildcard tensorflow/core/platform/stream_executor.*) \
 $(wildcard tensorflow/core/platform/windows/*) \
-$(wildcard tensorflow/core/user_ops/*.cu.cc) \
-$(wildcard tensorflow/core/common_runtime/gpu/*) \
-$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*) \
 $(wildcard tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.*) \
 $(wildcard tensorflow/core/grappler/inputs/file_input_yielder.*) \
-$(wildcard tensorflow/core/grappler/clusters/single_machine.*)
+$(wildcard tensorflow/core/grappler/clusters/single_machine.*) \
+tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+
+CORE_CC_EXCLUDE_SRCS := \
+$(CORE_CC_EXCLUDE_SRCS_NON_GPU) \
+$(wildcard tensorflow/core/platform/stream_executor.*) \
+$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \
+$(wildcard tensorflow/core/platform/cuda.h) \
+$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \
+$(wildcard tensorflow/core/user_ops/*.cu.cc) \
+$(wildcard tensorflow/core/common_runtime/gpu/*) \
+$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*)
 
 ifeq ($(BUILD_FOR_TEGRA),1)
-CORE_CC_ALL_SRCS := \
-$(wildcard tensorflow/core/*.cc) \
-$(wildcard tensorflow/core/common_runtime/*.cc) \
-$(wildcard tensorflow/core/common_runtime/gpu/*.cc) \
-$(wildcard tensorflow/core/framework/*.cc) \
-$(wildcard tensorflow/core/graph/*.cc) \
-$(wildcard tensorflow/core/platform/*.cc) \
-$(wildcard tensorflow/core/platform/*/*.cc) \
-$(wildcard tensorflow/core/platform/*/*/*.cc) \
-$(wildcard tensorflow/core/util/*.cc) \
-$(wildcard tensorflow/core/util/*/*.cc) \
-$(wildcard tensorflow/cc/training/*.cc) \
-$(wildcard tensorflow/stream_executor/*.cc) \
-$(wildcard tensorflow/stream_executor/*/*.cc) \
-$(wildcard tensorflow/core/grappler/optimizers/*.cc) \
-$(wildcard tensorflow/core/grappler/*.cc) \
-$(wildcard tensorflow/core/grappler/costs/*.cc) \
-$(wildcard tensorflow/core/grappler/clusters/*.cc) \
-$(wildcard tensorflow/core/grappler/utils/*.cc) \
-$(wildcard tensorflow/core/lib/core/*.cc) \
-$(wildcard tensorflow/core/lib/*/*.cc) \
-tensorflow/core/grappler/inputs/utils.cc \
+CORE_CC_ALL_SRCS := $(CORE_CC_ALL_SRCS) \
 tensorflow/core/kernels/concat_lib_gpu.cc \
 tensorflow/core/kernels/cuda_solvers.cc \
 tensorflow/core/kernels/cudnn_pooling_gpu.cc \
@@ -677,28 +660,14 @@ tensorflow/core/kernels/fractional_avg_pool_op.cc \
 tensorflow/core/kernels/fractional_max_pool_op.cc \
 tensorflow/core/kernels/fractional_pool_common.cc \
 tensorflow/core/kernels/pooling_ops_3d.cc \
-tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
+tensorflow/core/kernels/sparse_fill_empty_rows_op.cc \
+tensorflow/core/kernels/list_kernels.cc \
+$(wildcard tensorflow/core/common_runtime/gpu/*.cc) \
+$(wildcard tensorflow/stream_executor/*.cc) \
+$(wildcard tensorflow/stream_executor/*/*.cc)
 
 CORE_CC_EXCLUDE_SRCS := \
-$(wildcard tensorflow/core/*/*test.cc) \
-$(wildcard tensorflow/core/*/*testutil*) \
-$(wildcard tensorflow/core/*/*testlib*) \
-$(wildcard tensorflow/core/*/*/*test.cc) \
-$(wildcard tensorflow/core/*/*/*testutil*) \
-$(wildcard tensorflow/core/framework/op_gen_lib.cc) \
-$(wildcard tensorflow/core/lib/gif/*) \
-$(wildcard tensorflow/core/lib/jpeg/*) \
-$(wildcard tensorflow/core/lib/png/*) \
-$(wildcard tensorflow/core/lib/db/*) \
-$(wildcard tensorflow/core/platform/jpeg.*) \
-$(wildcard tensorflow/core/platform/png.*) \
-$(wildcard tensorflow/core/platform/cloud/*) \
-$(wildcard tensorflow/core/platform/s3/*) \
-$(wildcard tensorflow/core/platform/windows/*) \
-$(wildcard tensorflow/core/*/*/*testlib*) \
-$(wildcard tensorflow/cc/training/*test.cc) \
-tensorflow/core/lib/io/record_reader.cc \
-tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+$(CORE_CC_EXCLUDE_SRCS_NON_GPU)
 
 CUDA_CC_SRCS := $(wildcard tensorflow/core/kernels/*.cu.cc)
 CUDA_CC_OBJS := $(addprefix $(OBJDIR), $(CUDA_CC_SRCS:.cc=.o))
@@ -760,7 +729,7 @@ $(BENCHMARK_NAME): $(BENCHMARK_OBJS) $(LIB_PATH) $(CUDA_LIB_DEPS)
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) \
 	-o $(BENCHMARK_NAME) $(BENCHMARK_OBJS) \
-	$(LIBFLAGS) $(TEGRA_LIBS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
+	$(LIBFLAGS) $(TEGRA_LIBS) $(LIB_PATH) $(LDFLAGS) $(LIBS) $(CUDA_LIBS)
 
 # NVCC compilation rules for Tegra
 ifeq ($(BUILD_FOR_TEGRA),1)
diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index 980a44a595..f67c516186 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -18,7 +18,7 @@
 set -e
 
 usage() {
-  echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-Es:t:Tx:a:X]"
+  echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-Es:t:Tx:a]"
   echo "-E enable experimental hexnn ops"
   echo "-s [sub_makefiles] sub makefiles separated by white space"
   echo "-t [build_target] build target for Android makefile [default=all]"
diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh
index a18df256f9..2d99791839 100755
--- a/tensorflow/contrib/makefile/build_all_ios.sh
+++ b/tensorflow/contrib/makefile/build_all_ios.sh
@@ -96,7 +96,7 @@ if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
 
     if [[ -z "${BUILD_ARCH}" ]]; then
         # Compile protobuf for the target iOS device architectures.
-        tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${DEFAULT_ARCH}
+        tensorflow/contrib/makefile/compile_ios_protobuf.sh
     else
         # Compile protobuf for the target iOS device architectures.
         tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${BUILD_ARCH}
diff --git a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
index 861bb885c7..203ff4f890 100755
--- a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
+++ b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
@@ -76,6 +76,8 @@ GEN_LIBS_DIR="${GEN_DIR}/libs"
 GEN_DOWNLOAD_DIR="${GEN_DIR}/downloads"
 URL_BASE="https://storage.googleapis.com/download.tensorflow.org"
 
+ARCH="armeabi-v7a"
+
 source "${SCRIPT_DIR}/../build_helper.subr"
 
 rm -rf "${GEN_DIR}"
@@ -219,7 +221,7 @@ if [[ "${BUILD_ONLY}" != "true" ]]; then
     adb push "${GEN_LIBS_DIR}/libhexagon_nn_skel.so" "/vendor/lib/rfsa/adsp"
 
     adb push -p \
-        "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/bin/hexagon_graph_execution" \
+        "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/bin/android_${ARCH}/hexagon_graph_execution" \
         "/data/local/tmp/"
     adb wait-for-device
     adb shell chmod "${ANDROID_EXEC_FILE_MODE}" \
diff --git a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
index d9277ed60c..3081084ee7 100644
--- a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
+++ b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
@@ -54,7 +54,7 @@ $(INFERENCE_SO_PATH): $(LIB_OBJS) $(INFERENCE_OBJS) $(CUDA_LIB_DEPS)
 	-o $@ $(INFERENCE_OBJS) $(LIB_OBJS) $(TEGRA_LIBS) \
 	$(LIBFLAGS) $(LDFLAGS) \
 	-shared -Wl,-soname,$(INFERENCE_SO_NAME) \
-	$(LIBS)
+	$(LIBS) $(CUDA_LIBS)
 
 $(INFERENCE_SO_NAME): $(INFERENCE_SO_PATH)
 
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 5f27566398..5a812af4e9 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -91,6 +91,7 @@ tensorflow/core/kernels/reduction_ops_max.cc
 tensorflow/core/kernels/reduction_ops_common.cc
 tensorflow/core/kernels/reduction_ops_any.cc
 tensorflow/core/kernels/reduction_ops_all.cc
+tensorflow/core/kernels/roll_op.cc
 tensorflow/core/kernels/queue_ops.cc
 tensorflow/core/kernels/queue_base.cc
 tensorflow/core/kernels/pooling_ops_common.cc
@@ -270,6 +271,7 @@ tensorflow/core/ops/parsing_ops.cc
 tensorflow/core/ops/no_op.cc
 tensorflow/core/ops/nn_ops.cc
 tensorflow/core/ops/nn_grad.cc
+tensorflow/core/ops/manip_ops.cc
 tensorflow/core/ops/math_ops.cc
 tensorflow/core/ops/math_grad.cc
 tensorflow/core/ops/logging_ops.cc
@@ -291,3 +293,4 @@ tensorflow/core/kernels/batchtospace_op.cc
 tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
 tensorflow/core/kernels/batch_util.cc
+tensorflow/core/ops/audio_ops.cc
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
index c2c42b8ed7..6a7f5efecd 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
@@ -151,7 +151,7 @@ MPIRemoteRendezvous::~MPIRemoteRendezvous() {}
 void MPIRendezvousMgr::AddRequest(RecvTensorRequest request,
                                   const int mpi_dst) {
   TF_CHECK_OK(recv_tensor_recent_request_ids_.TrackUnique(
-      req.request_id(), "RecvTensor (MPIRendezvousMgr)", req));
+      request.request_id(), "RecvTensor (MPIRendezvousMgr)", request));
   const int64 step_id = request.step_id();
   const std::string& key = request.rendezvous_key();
   Rendezvous::ParsedKey parsed;
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
index e665922135..5596601ddb 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/contrib/mpi/mpi_msg.pb.h"
 #include "tensorflow/contrib/mpi/mpi_utils.h"
 #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/recent_request_ids.h"
 #include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
diff --git a/tensorflow/contrib/ndlstm/__init__.py b/tensorflow/contrib/ndlstm/__init__.py
index 52e83069cb..a5dd100b26 100644
--- a/tensorflow/contrib/ndlstm/__init__.py
+++ b/tensorflow/contrib/ndlstm/__init__.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Library of multidimensional LSTM models and related code."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+from tensorflow.contrib.ndlstm.python import lstm1d
+from tensorflow.contrib.ndlstm.python import lstm2d
diff --git a/tensorflow/contrib/ndlstm/python/lstm1d.py b/tensorflow/contrib/ndlstm/python/lstm1d.py
index d3c3531f40..2e2e9086c0 100644
--- a/tensorflow/contrib/ndlstm/python/lstm1d.py
+++ b/tensorflow/contrib/ndlstm/python/lstm1d.py
@@ -22,7 +22,6 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn
@@ -85,18 +84,11 @@ def ndlstm_base_dynamic(inputs, noutput, scope=None, reverse=False):
     Output sequence (length, batch_size, noutput)
   """
   with variable_scope.variable_scope(scope, "SeqLstm", [inputs]):
-    # TODO(tmb) make batch size, sequence_length dynamic
-    # example: sequence_length = tf.shape(inputs)[0]
-    _, batch_size, _ = _shape(inputs)
-    lstm_cell = rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False)
-    state = array_ops.zeros([batch_size, lstm_cell.state_size])
-    sequence_length = int(inputs.get_shape()[0])
-    sequence_lengths = math_ops.to_int64(
-        array_ops.fill([batch_size], sequence_length))
+    lstm_cell = rnn_cell.BasicLSTMCell(noutput)
     if reverse:
       inputs = array_ops.reverse_v2(inputs, [0])
     outputs, _ = rnn.dynamic_rnn(
-        lstm_cell, inputs, sequence_lengths, state, time_major=True)
+        lstm_cell, inputs, time_major=True, dtype=inputs.dtype)
     if reverse:
       outputs = array_ops.reverse_v2(outputs, [0])
     return outputs
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py
index f243317f1d..82ebca7f20 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer.py
@@ -397,10 +397,6 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface):
             'automatically and cannot be injected manually'.format(kwarg))
 
     minimize_kwargs.update(optimizer_kwargs)
-    if method == 'SLSQP':
-      # SLSQP doesn't support step callbacks. Obviate associated warning
-      # message.
-      del minimize_kwargs['callback']
 
     import scipy.optimize  # pylint: disable=g-import-not-at-top
     result = scipy.optimize.minimize(*minimize_args, **minimize_kwargs)
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
index 0f597d0a24..953586ee70 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
@@ -299,6 +299,45 @@ class ScipyOptimizerInterfaceTest(TestCase):
       method = optimizer.optimizer_kwargs.get('method')
       self.assertEqual('SLSQP', method)
 
+  def test_callbacks(self):
+    vector_val = np.array([7., -2.], dtype=np.float32)
+    vector = variables.Variable(vector_val, 'vector')
+
+    minimum_location_val = np.arange(2)
+    minimum_location = constant_op.constant(
+        minimum_location_val, dtype=dtypes.float32)
+
+    loss = math_ops.reduce_sum(math_ops.square(vector - minimum_location)) / 2.
+    loss_val_first = ((vector_val - minimum_location_val)**2).sum() / 2.
+
+    optimizer = external_optimizer.ScipyOptimizerInterface(loss, method='SLSQP')
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+
+      initial_vector_val = sess.run(vector)
+
+      extra_fetches = [loss]
+
+      step_callback = test.mock.Mock()
+      loss_callback = test.mock.Mock()
+
+      optimizer.minimize(
+          sess,
+          fetches=extra_fetches,
+          loss_callback=loss_callback,
+          step_callback=step_callback)
+
+      loss_val_last = sess.run(loss)
+
+      call_first = test.mock.call(loss_val_first)
+      call_last = test.mock.call(loss_val_last)
+      loss_calls = [call_first, call_last]
+      loss_callback.assert_has_calls(loss_calls, any_order=True)
+
+      args, _ = step_callback.call_args
+      self.assertAllClose(minimum_location_val, args[0])
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/py2tf/impl/api.py b/tensorflow/contrib/py2tf/impl/api.py
index 8ff6618912..85d40f3158 100644
--- a/tensorflow/contrib/py2tf/impl/api.py
+++ b/tensorflow/contrib/py2tf/impl/api.py
@@ -86,8 +86,8 @@ def convert_inline(f, *args, **kwargs):
 def convert(recursive=False, arg_types=None):
   """Decorator that compiles a function to graph mode.
 
-  The decorator is dynamic - invoking compilation whenever the decorated fuction
-  is called. This means the parameter values are known at compilation.
+  The decorator is dynamic - invoking compilation whenever the decorated
+  function is called. This means the parameter values are known at compilation.
 
   Args:
     recursive: Whether to recusrively convert any functions that the decorator
diff --git a/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py b/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py
index b2360fec6c..0388079f20 100644
--- a/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py
+++ b/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py
@@ -61,7 +61,7 @@ def _compute_output_resolution(input_spatial_resolution, kernel_size, stride,
     stride: Stride (int).
     total_padding: Total padding to be applied (int).
   Returns:
-    output_resolution: Ouput dimension (int) or None.
+    output_resolution: Output dimension (int) or None.
   """
   if (input_spatial_resolution is None) or (kernel_size is None) or (
       stride is None) or (total_padding is None):
diff --git a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
index b8b56c0e22..92879ab535 100644
--- a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
+++ b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
@@ -87,9 +87,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[ 1, 2, 3]
-                        [ 0, 0, 0]
-                        [41,52,63]].
+the output will be [[ 1, 2, 3]
+                    [ 0, 0, 0]
+                    [41,52,63]].
 ```
 
 The data must be at least rank 1. The indices must be of shape (?,2) where the
@@ -132,9 +132,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[ 1,  2,  3]
-                        [ 1,  1,  1]
-                        [40,100,180]].
+the output will be [[ 1,  2,  3]
+                    [ 1,  1,  1]
+                    [40,100,180]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
@@ -189,9 +189,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[          1,         20,          3]
-                        [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE]
-                        [        400,         20,         60]].
+the output will be [[          1,         20,          3]
+                    [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE]
+                    [        400,         20,         60]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
@@ -246,9 +246,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[          1,         20,          3]
-                        [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE]
-                        [          1,          5,          3]].
+the output will be [[          1,         20,          3]
+                    [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE]
+                    [          1,          5,          3]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index 09527e8473..0e62b315b6 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -157,6 +157,21 @@ class RNNCellTest(test.TestCase):
         # Smoke test
         self.assertAllClose(res[0], [[0.509682, 0.509682]])
 
+  def testSRUCellWithDiffSize(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.55255556, 0.55255556]])
+
   def testBasicLSTMCell(self):
     for dtype in [dtypes.float16, dtypes.float32]:
       np_dtype = dtype.as_numpy_dtype
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index c780e85d72..51933be29d 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -1635,6 +1635,5 @@ class WeightNormLSTMCellTest(test.TestCase):
     self.assertAllClose(expected_c, actual_c, 1e-5)
     self.assertAllClose(expected_h, actual_h, 1e-5)
 
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 124e841fc2..fe07493d0f 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -2731,25 +2731,9 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
 
     input_depth = inputs_shape[1].value
 
-    # Here the contributor believes that the following constraints
-    # are implied. The reasoning is explained here with reference to
-    # the paper https://arxiv.org/pdf/1709.02755.pdf upon which this
-    # implementation is based.
-    # In section 2.1 Equation 5, specifically:
-    # h_t = r_t \odot g(c_t) + (1 - r_t) \odot x_t
-    # the pointwise operation between r_t and x_t means they have
-    # the same shape (since we are implementing an RNN cell, braodcasting
-    # does not happen to input of a single timestep); by the same
-    # reasons, x_t has the same shape as h_t, essentially mandating that
-    # input_depth = unit_num.
-    if input_depth != self._num_units:
-      raise ValueError("SRU requires input_depth == num_units, got "
-                       "input_depth = %s, num_units = %s" % (input_depth,
-                                                             self._num_units))
-
     self._kernel = self.add_variable(
         rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-        shape=[input_depth, 3 * self._num_units])
+        shape=[input_depth, 4 * self._num_units])
 
     self._bias = self.add_variable(
         rnn_cell_impl._BIAS_VARIABLE_NAME,
@@ -2762,8 +2746,8 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
     """Simple recurrent unit (SRU) with num_units cells."""
 
     U = math_ops.matmul(inputs, self._kernel)
-    x_bar, f_intermediate, r_intermediate = array_ops.split(
-        value=U, num_or_size_splits=3, axis=1)
+    x_bar, f_intermediate, r_intermediate, x_tx = array_ops.split(
+        value=U, num_or_size_splits=4, axis=1)
 
     f_r = math_ops.sigmoid(
         nn_ops.bias_add(
@@ -2771,7 +2755,7 @@ class SRUCell(rnn_cell_impl._LayerRNNCell):
     f, r = array_ops.split(value=f_r, num_or_size_splits=2, axis=1)
 
     c = f * state + (1.0 - f) * x_bar
-    h = r * self._activation(c) + (1.0 - r) * inputs
+    h = r * self._activation(c) + (1.0 - r) * x_tx
 
     return h, c
 
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 95dea312f3..d6b5eceb47 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -924,8 +924,7 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
         _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
         seed=sigmoid_noise_seed)
     super(LuongMonotonicAttention, self).__init__(
-        query_layer=layers_core.Dense(
-            num_units, name="query_layer", use_bias=False, dtype=dtype),
+        query_layer=None,
         memory_layer=layers_core.Dense(
             num_units, name="memory_layer", use_bias=False, dtype=dtype),
         memory=memory,
diff --git a/tensorflow/contrib/session_bundle/bundle_shim.py b/tensorflow/contrib/session_bundle/bundle_shim.py
index 062c9cc680..1db97020a2 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim.py
+++ b/tensorflow/contrib/session_bundle/bundle_shim.py
@@ -82,7 +82,8 @@ def _convert_default_signature_to_signature_def(signatures):
   """
   default_signature = signatures.default_signature
   signature_def = meta_graph_pb2.SignatureDef()
-  if default_signature.WhichOneof("type") == "regression_signature":
+  if (default_signature.WhichOneof("type") ==
+      legacy_constants.REGRESSION_SIGNATURE):
     regression_signature = default_signature.regression_signature
     signature_def.method_name = signature_constants.REGRESS_METHOD_NAME
     _add_input_to_signature_def(regression_signature.input.tensor_name,
@@ -91,7 +92,8 @@ def _convert_default_signature_to_signature_def(signatures):
     _add_output_to_signature_def(regression_signature.output.tensor_name,
                                  signature_constants.REGRESS_OUTPUTS,
                                  signature_def)
-  elif default_signature.WhichOneof("type") == "classification_signature":
+  elif (default_signature.WhichOneof("type") ==
+        legacy_constants.CLASSIFICATION_SIGNATURE):
     classification_signature = default_signature.classification_signature
     signature_def.method_name = signature_constants.CLASSIFY_METHOD_NAME
     _add_input_to_signature_def(classification_signature.input.tensor_name,
@@ -132,8 +134,9 @@ def _convert_named_signatures_to_signature_def(signatures):
       signature_constants.PREDICT_OUTPUTS]
   # TODO(pdudnik): what if there are other signatures? Mimic cr/140900781 once
   # it is submitted.
-  if (input_signature.WhichOneof("type") != "generic_signature" or
-      output_signature.WhichOneof("type") != "generic_signature"):
+  if (input_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE
+      or output_signature.WhichOneof("type") !=
+      legacy_constants.GENERIC_SIGNATURE):
     raise RuntimeError("Named input and output signatures can only be "
                        "up-converted if they are generic signature. "
                        "Input signature type is %s, output signature type is "
diff --git a/tensorflow/contrib/session_bundle/constants.py b/tensorflow/contrib/session_bundle/constants.py
index 6ced73241a..e833baee79 100644
--- a/tensorflow/contrib/session_bundle/constants.py
+++ b/tensorflow/contrib/session_bundle/constants.py
@@ -32,3 +32,6 @@ INIT_OP_KEY = "serving_init_op"
 SIGNATURES_KEY = "serving_signatures"
 ASSETS_KEY = "serving_assets"
 GRAPH_KEY = "serving_graph"
+REGRESSION_SIGNATURE = "regression_signature"
+CLASSIFICATION_SIGNATURE = "classification_signature"
+GENERIC_SIGNATURE = "generic_signature"
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index 870f504d10..8a267ddac7 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -29,7 +29,6 @@ from tensorflow.contrib.framework.python.ops import variables as variables_lib
 from tensorflow.contrib.metrics.python.ops import metric_ops
 from tensorflow.contrib.slim.python.slim import evaluation
 from tensorflow.contrib.training.python.training import evaluation as evaluation_lib
-from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.wrappers import hooks
 from tensorflow.python.framework import constant_op
@@ -236,7 +235,7 @@ class SingleEvaluationTest(test.TestCase):
   def _prepareCheckpoint(self, checkpoint_path):
     init_op = control_flow_ops.group(variables.global_variables_initializer(),
                                      variables.local_variables_initializer())
-    saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
+    saver = saver_lib.Saver()
     with self.test_session() as sess:
       sess.run(init_op)
       saver.save(sess, checkpoint_path)
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
index 930df2414b..a1282847be 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
@@ -45,32 +45,67 @@ def _get_linear_equations_tests(dtype_, use_static_shape_, shape_):
         low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_)
     # Make a selfadjoint, positive definite.
     a_np = np.dot(a_np.T, a_np)
+    # jacobi preconditioner
+    jacobi_np = np.zeros_like(a_np)
+    jacobi_np[range(a_np.shape[0]), range(a_np.shape[1])] = (
+        1.0 / a_np.diagonal())
     rhs_np = np.random.uniform(
         low=-1.0, high=1.0, size=shape_[0]).astype(dtype_)
+    x_np = np.zeros_like(rhs_np)
     tol = 1e-6 if dtype_ == np.float64 else 1e-3
     max_iter = 20
     with self.test_session() as sess:
       if use_static_shape_:
         a = constant_op.constant(a_np)
         rhs = constant_op.constant(rhs_np)
+        x = constant_op.constant(x_np)
+        jacobi = constant_op.constant(jacobi_np)
       else:
         a = array_ops.placeholder(dtype_)
         rhs = array_ops.placeholder(dtype_)
+        x = array_ops.placeholder(dtype_)
+        jacobi = array_ops.placeholder(dtype_)
       operator = util.create_operator(a)
-      cg_graph = linear_equations.conjugate_gradient(
-          operator, rhs, tol=tol, max_iter=max_iter)
-      if use_static_shape_:
-        cg_val = sess.run(cg_graph)
-      else:
-        cg_val = sess.run(cg_graph, feed_dict={a: a_np, rhs: rhs_np})
-      norm_r0 = np.linalg.norm(rhs_np)
-      norm_r = np.sqrt(cg_val.gamma)
-      self.assertLessEqual(norm_r, tol * norm_r0)
-      # Validate that we get an equally small residual norm with numpy
-      # using the computed solution.
-      r_np = rhs_np - np.dot(a_np, cg_val.x)
-      norm_r_np = np.linalg.norm(r_np)
-      self.assertLessEqual(norm_r_np, tol * norm_r0)
+      preconditioners = [
+          None, util.identity_operator(a),
+          util.create_operator(jacobi)
+      ]
+      cg_results = []
+      for preconditioner in preconditioners:
+        cg_graph = linear_equations.conjugate_gradient(
+            operator,
+            rhs,
+            preconditioner=preconditioner,
+            x=x,
+            tol=tol,
+            max_iter=max_iter)
+        if use_static_shape_:
+          cg_val = sess.run(cg_graph)
+        else:
+          cg_val = sess.run(
+              cg_graph,
+              feed_dict={
+                  a: a_np,
+                  rhs: rhs_np,
+                  x: x_np,
+                  jacobi: jacobi_np
+              })
+        norm_r0 = np.linalg.norm(rhs_np)
+        norm_r = np.linalg.norm(cg_val.r)
+        self.assertLessEqual(norm_r, tol * norm_r0)
+        # Validate that we get an equally small residual norm with numpy
+        # using the computed solution.
+        r_np = rhs_np - np.dot(a_np, cg_val.x)
+        norm_r_np = np.linalg.norm(r_np)
+        self.assertLessEqual(norm_r_np, tol * norm_r0)
+        cg_results.append(cg_val)
+      # Validate that we get same results using identity_preconditioner
+      # and None
+      self.assertEqual(cg_results[0].i, cg_results[1].i)
+      self.assertAlmostEqual(cg_results[0].gamma, cg_results[1].gamma)
+      self.assertAllClose(cg_results[0].r, cg_results[1].r, rtol=tol)
+      self.assertAllClose(cg_results[0].x, cg_results[1].x, rtol=tol)
+      self.assertAllClose(cg_results[0].p, cg_results[1].p, rtol=tol)
 
   return [test_conjugate_gradient]
 
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/util_test.py b/tensorflow/contrib/solvers/python/kernel_tests/util_test.py
index 1566984b27..5d7534657b 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/util_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/util_test.py
@@ -63,6 +63,43 @@ class UtilTest(test.TestCase):
   def testCreateOperatorUnknownShape(self):
     self._testCreateOperator(False)
 
+  def _testIdentityOperator(self, use_static_shape_):
+    for dtype in np.float32, np.float64:
+      a_np = np.array([[1., 2.], [3., 4.], [5., 6.]], dtype=dtype)
+      x_np = np.array([[2.], [-3.]], dtype=dtype)
+      y_np = np.array([[2], [-3.], [5.]], dtype=dtype)
+      with self.test_session() as sess:
+        if use_static_shape_:
+          a = constant_op.constant(a_np, dtype=dtype)
+          x = constant_op.constant(x_np, dtype=dtype)
+          y = constant_op.constant(y_np, dtype=dtype)
+        else:
+          a = array_ops.placeholder(dtype)
+          x = array_ops.placeholder(dtype)
+          y = array_ops.placeholder(dtype)
+        id_op = util.identity_operator(a)
+        ax = id_op.apply(x)
+        aty = id_op.apply_adjoint(y)
+        op_shape = ops.convert_to_tensor(id_op.shape)
+        if use_static_shape_:
+          op_shape_val, ax_val, aty_val = sess.run([op_shape, ax, aty])
+        else:
+          op_shape_val, ax_val, aty_val = sess.run(
+              [op_shape, ax, aty], feed_dict={
+                  a: a_np,
+                  x: x_np,
+                  y: y_np
+              })
+      self.assertAllEqual(op_shape_val, [3, 2])
+      self.assertAllClose(ax_val, x_np)
+      self.assertAllClose(aty_val, y_np)
+
+  def testIdentityOperator(self):
+    self._testIdentityOperator(True)
+
+  def testIdentityOperatorUnknownShape(self):
+    self._testIdentityOperator(False)
+
   def testL2Norm(self):
     with self.test_session():
       x_np = np.array([[2], [-3.], [5.]])
diff --git a/tensorflow/contrib/solvers/python/ops/linear_equations.py b/tensorflow/contrib/solvers/python/ops/linear_equations.py
index 8cba56eba6..2395707257 100644
--- a/tensorflow/contrib/solvers/python/ops/linear_equations.py
+++ b/tensorflow/contrib/solvers/python/ops/linear_equations.py
@@ -26,11 +26,14 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 
 
 def conjugate_gradient(operator,
                        rhs,
+                       preconditioner=None,
+                       x=None,
                        tol=1e-4,
                        max_iter=20,
                        name="conjugate_gradient"):
@@ -55,6 +58,15 @@ def conjugate_gradient(operator,
         vector with the result of applying the operator to `x`, i.e. if
        `operator` represents matrix `A`, `apply` should return `A * x`.
     rhs: A rank-1 `Tensor` of shape `[N]` containing the right-hand size vector.
+    preconditioner: An object representing a linear operator, see `operator`
+      for detail. The preconditioner should approximate the inverse of `A`.
+      An efficient preconditioner could dramatically improve the rate of
+      convergence. If `preconditioner` represents matrix `M`(`M` approximates
+      `A^{-1}`), the algorithm uses `preconditioner.apply(x)` to estimate
+      `A^{-1}x`. For this to be useful, the cost of applying `M` should be
+      much lower than computing `A^{-1}` directly.
+    x: A rank-1 `Tensor` of shape `[N]` containing the initial guess for the
+      solution.
     tol: A float scalar convergence tolerance.
     max_iter: An integer giving the maximum number of iterations.
     name: A name scope for the operation.
@@ -65,35 +77,49 @@ def conjugate_gradient(operator,
       - x: A rank-1 `Tensor` of shape `[N]` containing the computed solution.
       - r: A rank-1 `Tensor` of shape `[M]` containing the residual vector.
       - p: A rank-1 `Tensor` of shape `[N]`. `A`-conjugate basis vector.
-      - gamma: \\(||r||_2^2\\)
+      - gamma: \\(r \dot M \dot r\\), equivalent to  \\(||r||_2^2\\) when
+        `preconditioner=None`.
   """
   # ephemeral class holding CG state.
   cg_state = collections.namedtuple("CGState", ["i", "x", "r", "p", "gamma"])
 
   def stopping_criterion(i, state):
-    return math_ops.logical_and(i < max_iter, state.gamma > tol)
+    return math_ops.logical_and(i < max_iter, linalg_ops.norm(state.r) > tol)
 
-  # TODO(rmlarsen): add preconditioning
-  def cg_step(i, state):
+  def cg_step(i, state):  # pylint: disable=missing-docstring
     z = operator.apply(state.p)
     alpha = state.gamma / util.dot(state.p, z)
     x = state.x + alpha * state.p
     r = state.r - alpha * z
-    gamma = util.l2norm_squared(r)
-    beta = gamma / state.gamma
-    p = r + beta * state.p
+    if preconditioner is None:
+      gamma = util.dot(r, r)
+      beta = gamma / state.gamma
+      p = r + beta * state.p
+    else:
+      q = preconditioner.apply(r)
+      gamma = util.dot(r, q)
+      beta = gamma / state.gamma
+      p = q + beta * state.p
     return i + 1, cg_state(i + 1, x, r, p, gamma)
 
   with ops.name_scope(name):
     n = operator.shape[1:]
     rhs = array_ops.expand_dims(rhs, -1)
-    gamma0 = util.l2norm_squared(rhs)
-    tol = tol * tol * gamma0
-    x = array_ops.expand_dims(
-        array_ops.zeros(
-            n, dtype=rhs.dtype.base_dtype), -1)
+    if x is None:
+      x = array_ops.expand_dims(
+          array_ops.zeros(n, dtype=rhs.dtype.base_dtype), -1)
+      r0 = rhs
+    else:
+      x = array_ops.expand_dims(x, -1)
+      r0 = rhs - operator.apply(x)
+    if preconditioner is None:
+      p0 = r0
+    else:
+      p0 = preconditioner.apply(r0)
+    gamma0 = util.dot(r0, p0)
+    tol *= linalg_ops.norm(r0)
     i = constant_op.constant(0, dtype=dtypes.int32)
-    state = cg_state(i=i, x=x, r=rhs, p=rhs, gamma=gamma0)
+    state = cg_state(i=i, x=x, r=r0, p=p0, gamma=gamma0)
     _, state = control_flow_ops.while_loop(stopping_criterion, cg_step,
                                            [i, state])
     return cg_state(
diff --git a/tensorflow/contrib/solvers/python/ops/util.py b/tensorflow/contrib/solvers/python/ops/util.py
index 777e0c185d..96947e8eea 100644
--- a/tensorflow/contrib/solvers/python/ops/util.py
+++ b/tensorflow/contrib/solvers/python/ops/util.py
@@ -45,6 +45,23 @@ def create_operator(matrix):
       apply_adjoint=lambda v: math_ops.matmul(matrix, v, adjoint_a=True))
 
 
+def identity_operator(matrix):
+  """Creates a linear operator from a rank-2 identity tensor."""
+
+  linear_operator = collections.namedtuple(
+      "LinearOperator", ["shape", "dtype", "apply", "apply_adjoint"])
+  shape = matrix.get_shape()
+  if shape.is_fully_defined():
+    shape = shape.as_list()
+  else:
+    shape = array_ops.shape(matrix)
+  return linear_operator(
+      shape=shape,
+      dtype=matrix.dtype,
+      apply=lambda v: v,
+      apply_adjoint=lambda v: v)
+
+
 # TODO(rmlarsen): Measure if we should just call matmul.
 def dot(x, y):
   return math_ops.reduce_sum(math_ops.conj(x) * y)
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 7970c20a26..78d237e6a2 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -17,6 +17,7 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from absl import flags
 
 import os
 import subprocess
@@ -24,13 +25,21 @@ import sys
 
 import tensorflow as tf
 
-tf.flags.DEFINE_string('service_addr', '',
-                       'Address of TPU profiler service e.g. localhost:8466')
-tf.flags.DEFINE_string('logdir', '',
-                       'Path of TensorBoard log directory e.g. /tmp/tb_log')
-tf.flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.')
+flags.DEFINE_string(
+    'service_addr', None, 'Address of TPU profiler service e.g. '
+    'localhost:8466')
+flags.DEFINE_string(
+    'logdir', None, 'Path of TensorBoard log directory e.g. /tmp/tb_log, '
+    'gs://tb_bucket')
+flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.')
+flags.DEFINE_integer(
+    'num_tracing_attempts', 3, 'Automatically retry N times when no trace '
+    'event is collected.')
+flags.DEFINE_boolean(
+    'include_dataset_ops', True, 'Set to false to profile longer TPU '
+    'device traces.')
 
-FLAGS = tf.flags.FLAGS
+FLAGS = flags.FLAGS
 EXECUTABLE = 'data/capture_tpu_profile'
 
 
@@ -42,10 +51,13 @@ def main(unused_argv=None):
   if not FLAGS.service_addr or not FLAGS.logdir:
     sys.exit('service_addr and logdir must be provided.')
   executable_path = os.path.join(os.path.dirname(__file__), EXECUTABLE)
+  logdir = os.path.expandvars(os.path.expanduser(FLAGS.logdir))
   cmd = [executable_path]
-  cmd.append('--logdir='+FLAGS.logdir)
+  cmd.append('--logdir='+logdir)
   cmd.append('--service_addr='+FLAGS.service_addr)
   cmd.append('--duration_ms='+str(FLAGS.duration_ms))
+  cmd.append('--num_tracing_attempts='+str(FLAGS.num_tracing_attempts))
+  cmd.append('--include_dataset_ops='+str(FLAGS.include_dataset_ops).lower())
   subprocess.call(cmd)
 
 
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index 179d29602b..33ade16003 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,16 +20,12 @@ from __future__ import print_function
 
 from setuptools import setup
 
-_VERSION = '1.3.0-a1'
+_VERSION = '1.5.0-rc1'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
 ]
 
-REQUIRED_PACKAGES = [
-    'tensorflow >= 1.2.0',
-]
-
 setup(
     name='cloud_tpu_profiler',
     version=_VERSION.replace('-', ''),
@@ -45,27 +41,22 @@ setup(
     entry_points={
         'console_scripts': CONSOLE_SCRIPTS,
     },
-    install_requires=REQUIRED_PACKAGES,
     classifiers=[
         # How mature is this project? Common values are
         #   3 - Alpha
         #   4 - Beta
         #   5 - Production/Stable
-        'Development Status :: 3 - Alpha',
-        
+        'Development Status :: 4 - Beta',
         'Intended Audience :: Developers',
         'Intended Audience :: Education',
         'Intended Audience :: Science/Research',
-        
         'License :: OSI Approved :: Apache Software License',
-        
         'Programming Language :: Python :: 2',
         'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
-        
         'Topic :: Scientific/Engineering',
         'Topic :: Scientific/Engineering :: Mathematics',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
@@ -74,4 +65,5 @@ setup(
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
     license='Apache 2.0',
-    keywords='tensorflow performance tpu',)
+    keywords='tensorflow performance tpu',
+)
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index c25aac3acf..7fa0b79766 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -454,6 +454,7 @@ tf_cuda_library(
         "framework/reader_interface.h",
         "framework/reader_op_kernel.h",
         "framework/register_types.h",
+        "framework/register_types_traits.h",
         "framework/resource_mgr.h",
         "framework/resource_op_kernel.h",
         "framework/selective_registration.h",
@@ -611,6 +612,7 @@ tf_gen_op_libs(
         "list_ops",
         "lookup_ops",
         "logging_ops",
+        "manip_ops",
         "math_ops",
         "nn_ops",
         "no_op",
@@ -693,6 +695,7 @@ cc_library(
         ":list_ops_op_lib",
         ":logging_ops_op_lib",
         ":lookup_ops_op_lib",
+        ":manip_ops_op_lib",
         ":math_ops_op_lib",
         ":nn_ops_op_lib",
         ":no_op_op_lib",
@@ -831,6 +834,7 @@ cc_library(
         "//tensorflow/core/kernels:list_kernels",
         "//tensorflow/core/kernels:lookup",
         "//tensorflow/core/kernels:logging",
+        "//tensorflow/core/kernels:manip",
         "//tensorflow/core/kernels:math",
         "//tensorflow/core/kernels:multinomial_op",
         "//tensorflow/core/kernels:nn",
@@ -1153,6 +1157,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
+        "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
     alwayslink = 1,
diff --git a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
index 8da76684e5..97fd39f647 100644
--- a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
@@ -16,5 +16,6 @@ END
   description: <<END
 Note that this routine only supports wildcard characters in the
 basename portion of the pattern, not in the directory portion.
+Note also that the order of filenames returned can be non-deterministic.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
new file mode 100644
index 0000000000..b308ad1f9d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "Roll"
+  in_arg {
+    name: "shift"
+    description: <<END
+Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+elements are shifted positively (towards larger indices) along the dimension
+specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+direction.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+`shift[i]` should occur. If the same axis is referenced more than once, the
+total shift for that axis will be the sum of all the shifts that belong to that
+axis.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has the same shape and size as the input. The elements are shifted
+positively (towards larger indices) by the offsets of `shift` along the
+dimensions of `axis`.
+END
+  }
+  summary: "Rolls the elements of a tensor along an axis."
+  description: <<END
+The elements are shifted positively (towards larger indices) by the offset of
+`shift` along the dimension of `axis`. Negative `shift` values will shift
+elements in the opposite direction. Elements that roll passed the last position
+will wrap around to the first and vice versa. Multiple shifts along multiple
+axes may be specified.
+
+For example:
+
+```
+# 't' is [0, 1, 2, 3, 4]
+roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+
+# shifting along multiple dimensions
+# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+
+# shifting along the same axis multiple times
+# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt
new file mode 100644
index 0000000000..97c380700a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "UnravelIndex"
+  in_arg {
+    name: "indices"
+    description: <<END
+An 0-D or 1-D `int` Tensor whose elements are indices into the
+flattened version of an array of dimensions dims.
+END
+  }
+  in_arg {
+    name: "dims"
+    description: <<END
+An 1-D `int` Tensor. The shape of the array to use for unraveling
+indices.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+An 2-D (or 1-D if indices is 0-D) tensor where each row has the
+same shape as the indices array.
+END
+  }
+  summary: "Converts a flat index or array of flat indices into a tuple of"
+  description: <<END
+coordinate arrays.
+
+@compatibility(numpy)
+Equivalent to np.unravel_index
+@end_compatibility
+END
+}
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 04b5541863..a9485a835e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -762,7 +762,8 @@ int64 MinSystemMemory(int64 available_memory) {
   // is necessary.
   min_system_memory *= 2;
 #endif
-#if defined(NVIDIA_TEGRA)
+
+#if defined(ANDROID_TEGRA)
   // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM
   // and Video RAM
   min_system_memory = 1 << 30;
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index f4ee841032..9e152aa082 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -145,6 +145,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:worker_proto_cc",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index dcc25e4426..878a1398c9 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -1448,6 +1448,7 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
     const auto count = run_state->count;
     pss.collect_timeline =
         req.options().trace_level() == RunOptions::FULL_TRACE;
+    pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE;
     pss.report_tensor_allocations_upon_oom =
         req.options().report_tensor_allocations_upon_oom();
 
@@ -1610,6 +1611,7 @@ Status MasterSession::DoRunWithLocalExecution(
   TRACEPRINTF("stepid %llu", step_id);
 
   pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
+  pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE;
   pss.report_tensor_allocations_upon_oom =
       req.options().report_tensor_allocations_upon_oom();
   // Build the cost model every 'build_cost_model_every' steps after skipping an
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index 95811476f7..b20e744a97 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -444,6 +444,24 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
       });
 }
 
+void GrpcWorker::LoggingAsync(const LoggingRequest* request,
+                              LoggingResponse* response, StatusCallback done) {
+  auto env = this->env();
+  if (env) {
+    auto session_mgr = (SessionMgr*)env->session_mgr;
+    if (session_mgr) {
+      session_mgr->SetLogging(request->rpc_logging());
+      for (const auto& step_id : request->fetch_step_id()) {
+        session_mgr->RetrieveLogs(step_id, response);
+      }
+      if (request->clear()) {
+        session_mgr->ClearLogs();
+      }
+    }
+  }
+  done(Status::OK());
+}
+
 WorkerEnv* GrpcWorker::env() { return env_; }
 
 std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* env) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index 78a21fd9f6..fbddbda9e6 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -40,6 +40,9 @@ class GrpcWorker : public Worker {
                                    ::grpc::ByteBuffer* response,
                                    StatusCallback done);
 
+  virtual void LoggingAsync(const LoggingRequest* request,
+                            LoggingResponse* response, StatusCallback done);
+
   WorkerEnv* env();
 
  private:
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 8db49e7f15..90664c3612 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -64,8 +64,13 @@ Status SessionMgr::CreateSession(const string& session,
     TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
   }
 
+  if (worker_cache != nullptr & default_worker_cache_.get() != nullptr) {
+    worker_cache->SetLogging(this->is_logging_active_);
+  }
+
   CHECK(!worker_env_->local_devices.empty())
       << "The WorkerEnv must have at least one device in `local_devices`.";
+
   std::vector<Device*> renamed_devices;
   for (Device* d : worker_env_->local_devices) {
     renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
@@ -113,4 +118,77 @@ std::shared_ptr<WorkerSession> SessionMgr::LegacySession() {
   return legacy_session_;
 }
 
+void SessionMgr::SetLogging(bool active) {
+  mutex_lock l(mu_);
+  this->is_logging_active_ = active;
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      worker_cache->SetLogging(active);
+    }
+  }
+
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        worker_cache->SetLogging(active);
+      }
+    }
+  }
+}
+
+void SessionMgr::RetrieveLogs(tensorflow::int64 step_id,
+                              LoggingResponse* response) {
+  mutex_lock l(mu_);
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      auto step_stats = StepStats();
+      if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
+        auto* labeled_step_stats = response->add_step();
+        labeled_step_stats->set_step_id(step_id);
+        labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
+      }
+    }
+  }
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        auto step_stats = StepStats();
+        if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
+          auto* labeled_step_stats = response->add_step();
+          labeled_step_stats->set_step_id(step_id);
+          labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
+        }
+      }
+    }
+  }
+}
+
+void SessionMgr::ClearLogs() {
+  mutex_lock l(mu_);
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      worker_cache->ClearLogs();
+    }
+  }
+
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        worker_cache->ClearLogs();
+      }
+    }
+  }
+}
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 3ce260d12e..4c9702d522 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
 
 namespace tensorflow {
 
@@ -56,6 +57,12 @@ class SessionMgr {
 
   static string WorkerNameFromServerDef(const ServerDef& server_def);
 
+  void SetLogging(bool active);
+
+  void RetrieveLogs(tensorflow::int64 step_id, LoggingResponse* response);
+
+  void ClearLogs();
+
  private:
   const WorkerEnv* const worker_env_;  // Not owned.
 
@@ -75,6 +82,8 @@ class SessionMgr {
   std::unique_ptr<WorkerCacheInterface> default_worker_cache_;
   std::shared_ptr<WorkerSession> legacy_session_;
 
+  bool is_logging_active_ = false;
+
   const WorkerCacheFactory worker_cache_factory_;
 
   std::shared_ptr<WorkerSession> WorkerSessionForSessionUnlocked(
diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index e448a60f5e..e90596980f 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -53,7 +53,7 @@ limitations under the License.
 */
 
 #if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION) || \
-    defined(NVIDIA_TEGRA)
+    defined(ANDROID_TEGRA)
 
 // All types are supported, so all macros are invoked.
 //
diff --git a/tensorflow/core/framework/variant_op_registry.cc b/tensorflow/core/framework/variant_op_registry.cc
index 395329da3b..ee07db1aee 100644
--- a/tensorflow/core/framework/variant_op_registry.cc
+++ b/tensorflow/core/framework/variant_op_registry.cc
@@ -182,7 +182,7 @@ Status VariantDeviceCopy(
 // Special casing UnaryOpFn per op and per device.
 UnaryVariantOpRegistry::VariantUnaryOpFn* UnaryVariantOpRegistry::GetUnaryOpFn(
     VariantUnaryOp op, StringPiece device, StringPiece type_name) {
-  auto found = unary_op_fns.find(std::make_tuple(op, device, type_name));
+  auto found = unary_op_fns.find({op, device, type_name});
   if (found == unary_op_fns.end()) return nullptr;
   return &found->second;
 }
@@ -195,12 +195,10 @@ void UnaryVariantOpRegistry::RegisterUnaryOpFn(
   CHECK_EQ(existing, nullptr)
       << "Unary VariantUnaryOpFn for type_name: " << type_name
       << " already registered for device type: " << device;
-  unary_op_fns.insert(
-      std::pair<std::tuple<VariantUnaryOp, StringPiece, StringPiece>,
-                VariantUnaryOpFn>(
-          std::make_tuple(op, GetPersistentStringPiece(device),
-                          GetPersistentStringPiece(type_name)),
-          unary_op_fn));
+  unary_op_fns.insert(std::pair<FuncTuple<VariantUnaryOp>, VariantUnaryOpFn>(
+      {op, GetPersistentStringPiece(device),
+       GetPersistentStringPiece(type_name)},
+      unary_op_fn));
 }
 
 namespace {
@@ -229,7 +227,7 @@ REGISTER_VARIANT_ZEROS_LIKE_TYPE(bool);
 UnaryVariantOpRegistry::VariantBinaryOpFn*
 UnaryVariantOpRegistry::GetBinaryOpFn(VariantBinaryOp op, StringPiece device,
                                       StringPiece type_name) {
-  auto found = binary_op_fns.find(std::make_tuple(op, device, type_name));
+  auto found = binary_op_fns.find({op, device, type_name});
   if (found == binary_op_fns.end()) return nullptr;
   return &found->second;
 }
@@ -242,12 +240,10 @@ void UnaryVariantOpRegistry::RegisterBinaryOpFn(
   CHECK_EQ(existing, nullptr)
       << "Unary VariantBinaryOpFn for type_name: " << type_name
       << " already registered for device type: " << device;
-  binary_op_fns.insert(
-      std::pair<std::tuple<VariantBinaryOp, StringPiece, StringPiece>,
-                VariantBinaryOpFn>(
-          std::make_tuple(op, GetPersistentStringPiece(device),
-                          GetPersistentStringPiece(type_name)),
-          add_fn));
+  binary_op_fns.insert(std::pair<FuncTuple<VariantBinaryOp>, VariantBinaryOpFn>(
+      {op, GetPersistentStringPiece(device),
+       GetPersistentStringPiece(type_name)},
+      add_fn));
 }
 
 namespace {
diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h
index 13f6908cae..e94100e994 100644
--- a/tensorflow/core/framework/variant_op_registry.h
+++ b/tensorflow/core/framework/variant_op_registry.h
@@ -166,6 +166,21 @@ class UnaryVariantOpRegistry {
       device_copy_fns;
 
   // Map std::tuple<Op, device, type_name> to function.
+
+  // this breaks by falling victim to "too perfect forwarding"
+  // see https://stackoverflow.com/questions/44475317/variadic-template-issue
+  // and references therein
+  template <typename Op>
+  struct FuncTuple {
+    FuncTuple(const Op& op, const StringPiece& dev, const StringPiece& tname)
+        : op_type_(op), device_(dev), typename_(tname){};
+    Op op_type_;
+    StringPiece device_, typename_;
+  };
+  // friend declaration for operator==
+  // needed for clang
+  template <typename Op>
+  friend bool operator==(const FuncTuple<Op>& l, const FuncTuple<Op>& r);
   struct TupleHash {
     template <typename Op>
     std::size_t operator()(
@@ -176,18 +191,25 @@ class UnaryVariantOpRegistry {
       ret = Hash64Combine(ret, sp_hasher_(std::get<2>(x)));
       return ret;
     }
+
+    template <typename Op>
+    std::size_t operator()(const FuncTuple<Op>& x) const {
+      // The hash of an enum is just its value as a std::size_t.
+      std::size_t ret = static_cast<std::size_t>(x.op_type_);
+      ret = Hash64Combine(ret, sp_hasher_(x.device_));
+      ret = Hash64Combine(ret, sp_hasher_(x.typename_));
+      return ret;
+    }
     StringPieceHasher sp_hasher_;
   };
-  std::unordered_map<std::tuple<VariantUnaryOp, StringPiece, StringPiece>,
-                     VariantUnaryOpFn, TupleHash>
+  std::unordered_map<FuncTuple<VariantUnaryOp>, VariantUnaryOpFn, TupleHash>
       unary_op_fns;
-  std::unordered_map<std::tuple<VariantBinaryOp, StringPiece, StringPiece>,
-                     VariantBinaryOpFn, TupleHash>
+  std::unordered_map<FuncTuple<VariantBinaryOp>, VariantBinaryOpFn, TupleHash>
       binary_op_fns;
 
   // Find or insert a string into a persistent string storage
-  // container; return the StringPiece pointing to the permanent
-  // string location.
+  // container; return the StringPiece pointing to the permanent string
+  // location.
   static StringPiece GetPersistentStringPiece(const string& str) {
     const auto string_storage = PersistentStringStorage();
     auto found = string_storage->find(str);
@@ -199,7 +221,12 @@ class UnaryVariantOpRegistry {
     }
   }
 };
-
+template <typename Op>
+inline bool operator==(const UnaryVariantOpRegistry::FuncTuple<Op>& lhs,
+                       const UnaryVariantOpRegistry::FuncTuple<Op>& rhs) {
+  return (lhs.op_type_ == rhs.op_type_) && (lhs.device_ == rhs.device_) &&
+         (lhs.typename_ == rhs.typename_);
+}
 // Gets a TensorShape from a Tensor containing a scalar Variant.
 // Returns an Internal error if the Variant does not have a registered shape
 // function, or if it's a serialized Variant that cannot be decoded.
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 68c3136019..7d3be15299 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -42,7 +42,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
@@ -2211,7 +2211,7 @@ Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
   return Status::OK();
 }
 
-#else   // INTEL_MKL_DNN
+#else   // INTEL_MKL_ML
 
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
@@ -2452,9 +2452,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     // NOTE: names are alphabetically sorted.
     rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
                       CopyAttrsAddN, AddNRewrite});
-    /* rinfo_.push_back({csinfo_.add,
-                      mkl_op_registry::GetMklOpName(csinfo_.add),
-                      CopyAttrsDataType, AlwaysRewrite}); */
+    rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add),
+                      CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
                       CopyAttrsPooling, AlwaysRewrite});
@@ -2502,14 +2501,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.max_pool_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
                       CopyAttrsPooling, AlwaysRewrite});
-    /*
+
     rinfo_.push_back({csinfo_.maximum,
                       mkl_op_registry::GetMklOpName(csinfo_.maximum),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.mul,
                       mkl_op_registry::GetMklOpName(csinfo_.mul),
                       CopyAttrsDataType, AlwaysRewrite});
-    */
     rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.relu_grad,
@@ -2529,14 +2527,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.softmax,
                       mkl_op_registry::GetMklOpName(csinfo_.softmax),
                       CopyAttrsDataType, AlwaysRewrite});
-    /*
+
     rinfo_.push_back({csinfo_.squared_difference,
                       mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.sub,
                       mkl_op_registry::GetMklOpName(csinfo_.sub),
                       CopyAttrsDataType, AlwaysRewrite});
-    */
 
     // Add info about which ops to add workspace edge to and the slots.
     wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
@@ -4317,7 +4314,7 @@ Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
 
   return Status::OK();
 }
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 320d5a48c7..5e2a465e22 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -38,7 +38,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 namespace {
 
@@ -1899,7 +1899,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
 
 }  // namespace
 
-#else  // INTEL_MKL_DNN
+#else  // INTEL_MKL_ML
 
 namespace {
 
@@ -3532,7 +3532,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
 
 }  // namespace
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index d5b026eae3..0d88d1ff72 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -273,6 +273,16 @@ Node* Reverse(Graph* g, Node* tensor, Node* axis) {
   return Binary(g, "ReverseV2", tensor, axis);
 }
 
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Roll", g->op_registry())
+                  .Input(input)
+                  .Input(shift)
+                  .Input(axis)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
 Node* Error(Graph* g, Node* input, const string& errmsg) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error")
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index 06597778bb..eb9038d619 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -117,6 +117,10 @@ Node* RandomGamma(Graph* g, Node* shape, Node* alpha);
 // Output dtype determined by lam.
 Node* RandomPoisson(Graph* g, Node* shape, Node* lam);
 
+// Rolls tensor by an offset of <shift> along the corresponding
+// <axis> dimensions.
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis);
+
 // Generates random parameters from the truncated standard normal distribution
 // of the nput shape
 Node* TruncatedNormal(Graph* g, Node* input, DataType dtype);
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index fd99409c9b..e7192ec42f 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -629,6 +629,7 @@ cc_library(
         ":transpose_op",
         ":unique_op",
         ":unpack_op",
+        ":unravel_index_op",
         ":where_op",
     ],
 )
@@ -883,6 +884,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS + [":split_lib"],
 )
 
+tf_kernel_library(
+    name = "unravel_index_op",
+    prefix = "unravel_index_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "where_op",
     srcs = ["where_op.cc"],
@@ -2582,6 +2589,45 @@ tf_cc_tests(
     ],
 )
 
+cc_library(
+    name = "manip",
+    deps = [
+        ":roll_op",
+    ],
+)
+
+MANIP_DEPS = [
+    "//tensorflow/core:framework",
+    "//tensorflow/core:lib",
+    "//tensorflow/core:manip_ops_op_lib",
+    "//third_party/eigen3",
+]
+
+tf_kernel_library(
+    name = "roll_op",
+    prefix = "roll_op",
+    deps = MANIP_DEPS,
+)
+
+tf_cc_test(
+    name = "roll_op_test",
+    size = "small",
+    srcs = ["roll_op_test.cc"],
+    deps = [
+        ":ops_testutil",
+        ":ops_util",
+        ":roll_op",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 MATH_DEPS = [
     ":bounds_check",
     ":fill_functor",
diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.cc b/tensorflow/core/kernels/compare_and_bitpack_op.cc
index 9f626a274a..224fe534e3 100644
--- a/tensorflow/core/kernels/compare_and_bitpack_op.cc
+++ b/tensorflow/core/kernels/compare_and_bitpack_op.cc
@@ -110,7 +110,19 @@ struct ComputeShard<T,
       typename TTypes<bool>::ConstMatrix input,
       typename TTypes<uint8>::Matrix output, bool /*thresh*/, int64 start,
       int64 limit) {
-    // NOTE(ebrevdo): This assumes memory is little-endian.
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    for (int64 i = start; i < limit; ++i) {
+      uint8* out = output.data() + i;
+      const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i);
+      *out = ((((block & (1LL << (7 * 8))) >> (7 * 8 - 7))) |
+              (((block & (1LL << (6 * 8))) >> (6 * 8 - 6))) |
+              (((block & (1LL << (5 * 8))) >> (5 * 8 - 5))) |
+              (((block & (1LL << (4 * 8))) >> (4 * 8 - 4))) |
+              (((block & (1LL << (3 * 8))) >> (3 * 8 - 3))) |
+              (((block & (1LL << (2 * 8))) >> (2 * 8 - 2))) |
+              (((block & (1LL << 8)) >> (1 * 8 - 1))) | (((block & (1LL)))));
+    }
+#else
     for (int64 i = start; i < limit; ++i) {
       uint8* out = output.data() + i;
       const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i);
@@ -123,6 +135,7 @@ struct ComputeShard<T,
            (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) |
            (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7)));
     }
+#endif
   }
 };
 
diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc
index c778278e8f..b7d120a617 100644
--- a/tensorflow/core/kernels/decode_bmp_op.cc
+++ b/tensorflow/core/kernels/decode_bmp_op.cc
@@ -39,6 +39,13 @@ class DecodeBmpOp : public OpKernel {
         errors::InvalidArgument("channels must be 0, 1, 3 or 4, got ",
                                 channels_));
   }
+  inline int32 ByteSwapInt32ForBigEndian(int32 x) {
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return le32toh(x);
+#else
+    return x;
+#endif
+  }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& contents = context->input(0);
@@ -56,14 +63,18 @@ class DecodeBmpOp : public OpKernel {
                                         input.size(), " bytes"));
 
     const uint8* img_bytes = reinterpret_cast<const uint8*>(input.data());
-    const int32 header_size = internal::SubtleMustCopy(
+    int32 header_size_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 10)));
-    const int32 width = internal::SubtleMustCopy(
+    const int32 header_size = ByteSwapInt32ForBigEndian(header_size_);
+    int32 width_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 18)));
-    const int32 height = internal::SubtleMustCopy(
+    const int32 width = ByteSwapInt32ForBigEndian(width_);
+    int32 height_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 22)));
-    const int32 bpp = internal::SubtleMustCopy(
+    const int32 height = ByteSwapInt32ForBigEndian(height_);
+    int32 bpp_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 28)));
+    const int32 bpp = ByteSwapInt32ForBigEndian(bpp_);
 
     if (channels_) {
       OP_REQUIRES(context, (channels_ == bpp / 8),
diff --git a/tensorflow/core/kernels/fractional_pool_common.h b/tensorflow/core/kernels/fractional_pool_common.h
index df0bbbfa06..2d7a230fc0 100644
--- a/tensorflow/core/kernels/fractional_pool_common.h
+++ b/tensorflow/core/kernels/fractional_pool_common.h
@@ -57,7 +57,7 @@ static inline void RandomShuffle(Iter first, Iter last, const Random& uniform) {
 //     * sum(generated_diff_pooling_sequence) = input_length
 //     * Let's define floor(input_length / output_length) = K, then
 //       K <= generated_diff_pooling_sequence[i] <= K+1
-// For example, when input_length = 10, output_length = 6, the followings are
+// For example, when input_length = 10, output_length = 6, the following are
 // valid pooling sequence:
 //     * [1, 2, 2, 1, 2, 2]
 //     * [1, 1, 2, 2, 2, 2]
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 89d37d2f87..b539b00009 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::stream;
 using mkldnn::sum;
@@ -37,7 +37,7 @@ using mkldnn::sum;
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
@@ -285,7 +285,7 @@ class MklAddNOp : public OpKernel {
   } MklAddNOpContext;
 };
 
-#else  // INTEL_MKL_DNN
+#else  // INTEL_MKL_ML
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
  public:
@@ -317,8 +317,11 @@ class MklAddNOp : public OpKernel {
                                                 : src2_tensor.dims();
       // if the shapes of two tensors are not same raise op error
       TensorShape src1_shape, src2_shape;
-      src1_shape = src1_tensor.shape();
-      src2_shape = src2_tensor.shape();
+      src1_shape = input1_in_mkl_format ? src1_mkl_shape.GetTfShape()
+                                        : src1_tensor.shape();
+      src2_shape = input2_in_mkl_format ? src2_mkl_shape.GetTfShape()
+                                        : src2_tensor.shape();
+
       if (!src1_shape.IsSameSize(src2_shape)) {
         ctx->SetStatus(errors::InvalidArgument(
             "Inputs to operation ", this->name(), " of type ",
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index a7c569ee05..d545d34fdf 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -24,7 +24,7 @@
 
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::algorithm;
 using mkldnn::engine;
@@ -40,8 +40,7 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklAvgPoolingOp : public OpKernel {
@@ -429,7 +428,7 @@ class MklAvgPoolingGradOp : public OpKernel {
   TensorFormat data_format_;
 };  // MklAvgPoolingGradOp
 
-#else  // INTEL_MKL_DNN is defined
+#else
 
 template <typename Device, typename T>
 class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
@@ -466,6 +465,28 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
       memory::dims output_dims_mkl_order;
       this->GetOutputDims(pool_params, &output_dims_mkl_order);
 
+      // If input is an empty tensor, allocate an empty output tensor and return
+      if (input_tensor.NumElements() == 0) {
+        MklDnnShape output_mkl_shape;
+        output_mkl_shape.SetMklTensor(false);
+        TensorShape output_tf_shape;
+        if (pool_params.data_format == TensorFormat::FORMAT_NCHW) {
+          output_tf_shape = MklDnnDimsToTFShape(output_dims_mkl_order);
+        } else {
+          memory::dims output_dims_NHWC_order;
+          output_dims_NHWC_order = {pool_params.tensor_in_batch,
+                                    static_cast<int>(pool_params.out_height),
+                                    static_cast<int>(pool_params.out_width),
+                                    pool_params.out_depth};
+          output_tf_shape = MklDnnDimsToTFShape(output_dims_NHWC_order);
+        }
+        const int kOutputIndex = 0;
+        AllocateOutputSetMklShape(context, kOutputIndex, &output_tensor,
+                                  output_tf_shape, output_mkl_shape);
+        CHECK_NOTNULL(output_tensor);
+        return;
+      }
+
       // If input is in Mkl layout, then just get the memory format from it
       // directly, instead of using input data_format to AvgPool.
       if (dnn_shape_input.IsMklTensor()) {
@@ -678,7 +699,7 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
   }
 };  // MklAvgPoolingGradOp
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
                             .Device(DEVICE_CPU)
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 7da63604d2..f1f267e849 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -30,7 +30,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::concat;
@@ -62,7 +62,7 @@ class EigenConcatBaseOp : public OpKernel {
   // we need to have empty Compute because Compute is pure virtual function.
   void Compute(OpKernelContext* c) {}
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
   void Compute(OpKernelContext* c, const std::vector<Tensor>& values) {
     const Tensor* concat_dim_tensor;
@@ -230,7 +230,7 @@ class EigenConcatBaseOp : public OpKernel {
 #endif
 };
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 // --------------------------------------------------------------------------
 //                      Mkl Concat Op
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index ef3f8cfec1..1401bc65a4 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -42,7 +42,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::convolution_backward_weights;
@@ -55,7 +55,7 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, class T>
 class MklConv2DCustomBackpropFilterOp : public OpKernel {
@@ -655,7 +655,7 @@ class MklConv2DCustomBackpropFilterOp
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index a6745489f4..eeed009531 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -44,7 +44,7 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::convolution_backward_data;
@@ -56,7 +56,7 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, class T>
 class MklConv2DCustomBackpropInputOp : public OpKernel {
@@ -493,7 +493,7 @@ class MklConv2DCustomBackpropInputOp
   }
 };
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 #define REGISTER_MKL_CPU_KERNELS(T)                                 \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")           \
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index e44fba754b..2953426d58 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -41,7 +41,8 @@ limitations under the License.
 
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
+
 #include "mkldnn.hpp"
 
 using mkldnn::prop_kind;
@@ -58,8 +59,8 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+// MKL-DNN is now default. MKL-ML must be specified explicitly.
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T, bool biasEnabled>
 class MklConv2DOp : public OpKernel {
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index 8b65eaea0d..9dd88221a8 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -40,7 +40,7 @@ limitations under the License.
 
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::prop_kind;
@@ -52,7 +52,7 @@ using mkldnn::convolution_forward;
 
 namespace tensorflow {
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 class MklDnnConvUtil {
  protected:
@@ -553,7 +553,7 @@ class MklConv2DBackpropCommonOp : public OpKernel {
   Padding padding_;
   TensorFormat data_format_;
 };
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 /////////////////////////////////////////////////////////////////////
 ///  Dummy Mkl op that is just used for operators that are intermediate
diff --git a/tensorflow/core/kernels/mkl_cwise_ops_common.cc b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
index c065724e0d..58f0c30f32 100644
--- a/tensorflow/core/kernels/mkl_cwise_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
@@ -1,4 +1,4 @@
-﻿/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0(the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 0b6d838e09..8313224d7f 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::batch_normalization_backward;
@@ -41,7 +41,7 @@ using mkldnn::use_scale_shift;
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklFusedBatchNormOp : public OpKernel {
@@ -683,7 +683,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
 };
 #endif
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklFusedBatchNormOp : public OpKernel {
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index 9ee27ee21c..6c027f8e72 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -28,14 +28,14 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 #endif
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklIdentityOp : public OpKernel {
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 73d41efce1..5a8799ae93 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/mkl_tfconv_op.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::stream;
@@ -59,7 +59,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 //     convert the TF format input to MKL format
 ///////////////////////////////////////////////////////////
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 template <typename Device, typename T>
 class MklInputConversionOp : public OpKernel {
  public:
@@ -293,14 +293,58 @@ class MklInputConversionOp : public OpKernel {
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // If both inputs are in MKL format
     if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
-      // If both have the same shape, pass them through
       if (tf_shapes_are_same) {
-        VLOG(1) << "MklInputConversionOp: No conversion needed, "
-                << "copying MKL inputs with identical shapes to output";
-
-        ForwardMklTensorInToOut(context, 0, 0);
-        ForwardMklTensorInToOut(context, 1, 1);
-        return;
+        auto input0_md = input_shape_0.GetMklLayout();
+        auto input1_md = input_shape_1.GetMklLayout();
+
+        // If both have the same shape and same format, pass them through
+        if (input0_md.data.format == input1_md.data.format) {
+          VLOG(1) << "MklInputConversionOp: No conversion needed, "
+                  << "copying MKL inputs with identical shapes to output";
+
+          ForwardMklTensorInToOut(context, 0, 0);
+          ForwardMklTensorInToOut(context, 1, 1);
+          return;
+        } else {
+          VLOG(1) << "MklInputConversionOp: Shape is same, but format is "
+                     "different, "
+                  << "need to convert to same format";
+
+          // Convert input0, and keep input1 unchanged
+          // Create MklDnnShape for output mkl tensor based on input0
+          Tensor* tensor_out;
+          MklDnnShape mkl_output_mkl_shape;
+          mkl_output_mkl_shape.SetMklTensor(true);
+          mkl_output_mkl_shape.SetElemType(MklDnnType<T>());
+          mkl_output_mkl_shape.SetTfLayout(input_shape_0.GetDimension(),
+                                           input_shape_0.GetSizesAsMklDnnDims(),
+                                           input_shape_0.GetTfDataFormat());
+
+          // Get MKL layout from input1 as destination layout
+          mkl_output_mkl_shape.SetMklLayout(&input1_md);
+
+          // Create output Mkl tensor for index 0
+          AllocateOutputSetMklShape(context, 0, &tensor_out,
+                                    input_tensor_0.shape(),
+                                    mkl_output_mkl_shape);
+
+          // Create MklDnnData object for input0 tesnsor
+          auto cpu_engine = engine(engine::cpu, 0);
+          MklDnnData<T> input(&cpu_engine);
+          input.SetUsrMem(input0_md, &input_tensor_0);
+
+          // Create reorder from input0's layout to input1's layout
+          std::vector<primitive> net;
+          CHECK_EQ(input.CheckReorderToOpMem(
+                       memory::primitive_desc(input1_md, cpu_engine),
+                       tensor_out, &net),
+                   true);
+          stream(stream::kind::eager).submit(net).wait();
+
+          // Input1 will be passed through
+          ForwardMklTensorInToOut(context, 1, 1);
+          return;
+        }
       }
 
       // Sanity check
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index a8b45004b7..5f0a12a1fb 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -38,7 +38,7 @@ limitations under the License.
 #include "tensorflow/core/util/work_sharder.h"
 #endif
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::lrn_across_channels;
 using mkldnn::lrn_backward;
@@ -67,7 +67,7 @@ void GetBandMatrix(int depth, int depth_radius,
 
 }  // namespace
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename T>
 class MklLRNOp : public OpKernel {
@@ -1343,7 +1343,7 @@ class MklLRNGradOp : public OpKernel {
   float beta_;
 };
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 #define REGISTER_MKL_LRN_CPU(T)                                     \
   REGISTER_KERNEL_BUILDER(Name("_MklLRN")                           \
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 0de27ccd60..14607f26e0 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include <algorithm>
 #include "mkldnn.hpp"
 using mkldnn::algorithm;
@@ -39,8 +39,8 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+// MKL-DNN is now default. MKL-ML must be specified explicitly.
+#ifdef INTEL_MKL_ML
 
 // An implementation of MaxPooling (forward).
 template <typename Device, typename T>
@@ -494,7 +494,7 @@ class MklMaxPoolingGradOp : public OpKernel {
   bool workspace_enabled_;
 };  // MklMaxPoolingGradOp
 
-#else  // INTEL_MKL_DNN is defined
+#else
 
 // An implementation of MaxPooling (forward).
 template <typename Device, typename T>
@@ -793,7 +793,7 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
   }
 };  // MklMaxPoolingGradOp
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
                             .Device(DEVICE_CPU)
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index ef8597b057..5ef6ce2a57 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -42,7 +42,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
   Init(context, ksize, stride, padding, data_format);
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 // Initialization for MKL format
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -72,7 +72,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
 
   Init(context, ksize, stride, padding, data_format);
 }
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 // Common Initialization for TensorFlow and MKL formats
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -107,7 +107,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
     OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                 tensor_in_cols, window_cols, col_stride,
                                 padding, &out_width, &pad_left, &pad_right));
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
     // TF can work with int64, but mkldnn only supports int32
     // Fail if the height or width are greater than MAX_INT
 
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 880e45ab1e..279167aba2 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::memory;
 using mkldnn::pooling_backward;
@@ -85,7 +85,7 @@ struct MklPoolParameters {
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const TensorShape& tensor_in_shape);
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const MklShape* mkl_in_shape);
@@ -102,7 +102,7 @@ struct MklPoolParameters {
             TensorFormat data_format);
 };
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 template <class T>
 class MklPoolingOpBase : public OpKernel {
@@ -395,7 +395,7 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
     return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md;
   }
 };
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 //-------------------------------------------------------------------
 // Utility functions
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 873aca30ca..51db3991e2 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "tensorflow/core/platform/default/logging.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::algorithm;
@@ -58,7 +58,7 @@ struct MklReluHelpers {
   }
 };
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T>
 class MklReluOp : public OpKernel {
@@ -368,7 +368,7 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   mkl_context.MklCleanup();
 }
 
-#else  // INTEL_MKL_DNN
+#else  // INTEL_MKL_ML
 
 template <typename Device, typename T, algorithm alg_kind>
 class MklReluOpBase : public OpKernel {
@@ -849,7 +849,7 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
                           MklReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 // register dnn kernels for supported operations and supported types
 #define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)              \
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 7d471e1e4c..5dbc4a2709 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::stream;
 #endif
@@ -40,7 +40,7 @@ class MklReshapeOp : public OpKernel {
  public:
   explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
   void Compute(OpKernelContext* context) override {
     const Tensor& input = MklGetInput(context, 0);
     const Tensor& sizes = MklGetInput(context, 1);
@@ -312,7 +312,7 @@ class MklReshapeOp : public OpKernel {
     }
   }
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
  private:
   const int kInputSlotIdx = 0;
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index c46eabdde1..aceef1e234 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
@@ -156,5 +156,5 @@ TF_CALL_float(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES);
 
 }  // namespace tensorflow
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index c4d5a45d3c..5fafa14b5d 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -35,7 +35,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 using mkldnn::stream;
 #endif
 
@@ -61,7 +61,7 @@ class MklToTfOp : public OpKernel {
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
   static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
                              string data_format_str, DataType op_data_type,
                              bool has_avx512f, uint input_number) {
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
new file mode 100644
index 0000000000..bcbdbee058
--- /dev/null
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -0,0 +1,334 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/register_types_traits.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+#define EIGEN_USE_THREADS
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+// dim_size - the size of each dimension
+// dim_range - the number of indices over in the flattened tensor
+//    you need to skip in order to make it over from one side of a dimension
+//    to the other. Used to make the shifts wrap around after a threshold.
+// threshold - the index for each dimension that the roll starts to wrap
+//    back to the front
+template <typename T>
+void DoRoll(OpKernelContext* context, const int64 num_elements,
+            const int num_dims, const gtl::ArraySlice<int>& dim_size,
+            const T* input, T* output, const gtl::ArraySlice<int>& threshold,
+            const gtl::ArraySlice<int64>& dim_range) {
+  auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range](
+                  int64 start, int64 end) {
+    // array of indices for each dimension
+    gtl::InlinedVector<int, 4> indices(num_dims);
+    int offset = 0;  // the shift along the flattened tensor for current element
+    // initialize indices and offset
+    for (int i = 0; i < num_dims; i++) {
+      // stride is the number of indices over in the flattened tensor
+      // you need to skip in order to make it over to an adjacent element
+      // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1)
+      const int64 stride = dim_range[i] / dim_size[i];
+      const int shift = dim_size[i] - threshold[i];
+      const int indx = (start / stride) % dim_size[i];
+      indices[i] = indx;
+      // calculate dimension index after the shift
+      const int shifted_indx = (indx + shift) % dim_size[i];
+      offset += (shifted_indx - indx) * stride;
+    }
+
+    for (int64 i = start; i < end; i++) {
+      output[i + offset] = input[i];
+      // create next combination of indices
+      // while at it adjust offset if needed
+      for (int j = num_dims - 1; j >= 0; j--) {
+        const int indx = (indices[j] + 1) % dim_size[j];
+        indices[j] = indx;
+        if (indx != 0) {
+          if (indx == threshold[j]) {  // we've reached the threshold
+            // dim_range[j] = threshold[j] + shift[j]
+            // offset = shift[j] + ... other offsets
+            // offset - dim_range[j] = -threshold[j] + ... other offsets
+            // thus we undo our previous offset as well as add a new offset of
+            // -threshold[j] in one operation
+            offset -= dim_range[j];  // now wraps around
+          }
+          break;                         // indx != 0 don't need to carry
+        } else if (threshold[j] != 0) {  // if threshold is 0 shift is 0
+          offset += dim_range[j];        // indx became 0 so reverse wrap around
+        }
+      }
+    }
+  };
+  // Shard
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  // 15 - expiramentally determined with float and bool types
+  const int cost_per_element = 15 * sizeof(T);  // rough esitmate
+  Shard(worker_threads->num_threads, worker_threads->workers, num_elements,
+        cost_per_element, std::move(work));
+}
+
+// dim_size - the size of each dimension
+// dim_range - the number of indices over in the flattened tensor
+//    you need to skip in order to make it over from one side of a dimension
+//    to the other. Used to make the shifts wrap around after a threshold.
+// threshold - the index for each dimension that the roll starts to wrap
+//    back to the front
+// isd - inner shift dimension
+template <typename T>
+// Use memcpy to copy memory in groups when the data type supports memcpy
+void DoRollWithMemcpy(OpKernelContext* context, const int64 num_elements,
+                      const int num_dims, const gtl::ArraySlice<int>& dim_size,
+                      const T* input, T* output,
+                      const gtl::ArraySlice<int>& threshold,
+                      const gtl::ArraySlice<int64>& dim_range,
+                      const int64 isd) {
+  auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range, isd](
+                  int64 start, int64 end) {
+    // the number of indices over in the flattened tensor you need to skip in
+    // order to make it over from one side of the isd to the other
+    const int64 isd_range = std::max<int>(dim_range[isd], 1);
+    // the distance along the flattend tensor to the next element in the isd
+    const int64 isd_stride = isd_range / std::max<int>(dim_size[isd], 1);
+
+    // start and end represent the i-th group currently so we will convert
+    // them into numbers representing the i-th elements.
+    // there are 2 groups per isd one for all elements before threshold[isd]
+    // and another for all elements after threshold[isd].
+    const int64 start_remainder = (start % 2) * threshold[isd] * isd_stride;
+    const int64 end_remainder = (end % 2) * threshold[isd] * isd_stride;
+    start = (start / 2) * isd_range + start_remainder;
+    end = (end / 2) * isd_range + end_remainder;
+
+    const T* in_ptr = &input[0];
+    T* out_ptr = &output[0];
+    in_ptr += start;
+    out_ptr += start;
+
+    // array of indices for each dimension
+    // indicies = [i, j, k, l, m, n]
+    gtl::InlinedVector<int, 4> indicies(num_dims);
+    // the offset needed to make all inner non-shifting dimensions become 0
+    int64 remainder_offset = 0;
+    // initialize indicies
+    for (int i = 0; i < num_dims; i++) {
+      // stride is the number of indices over in the flattened tensor
+      // you need to skip in order to make it over to an adjacent element
+      // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1)
+      const int64 stride = dim_range[i] / dim_size[i];
+      const int shift = dim_size[i] - threshold[i];
+      const int indx = (start / stride) % dim_size[i];
+      indicies[i] = indx;
+      // calculate dimension index after the shift
+      int out_indx = (indx + shift) % dim_size[i];
+      if (i > isd) {
+        // trailing zeroes for indices after the inner shifted dimension
+        out_indx = 0;
+        remainder_offset += (out_indx - indx) * stride;
+      }
+      out_ptr += (out_indx - indx) * stride;
+    }
+    // set trailing zeroes for indices after the inner shifted dimension
+    for (int i = num_dims - 1; i > isd; i--) indicies[i] = 0;
+
+    // the number of indices in the isd dimension the next group will skip
+    // to make it to the next threshold or end point
+    int isd_indx_skip = 0;
+    // the size of the next group
+    int64 group_size = 0;
+    // initialize isd_indx_skip and group_size
+    if (indicies[isd] < threshold[isd]) {
+      isd_indx_skip = threshold[isd] - indicies[isd];
+      group_size = isd_indx_skip * isd_stride + remainder_offset;
+    } else {
+      isd_indx_skip = dim_size[isd] - indicies[isd];
+      group_size = isd_indx_skip * isd_stride + remainder_offset;
+    }
+
+    int64 i = start;
+    while (i < end) {
+      // copy group of elements
+      memcpy(out_ptr, in_ptr, group_size * sizeof(T));
+
+      // shift i and the pointers over to the next group position
+      i += group_size;
+      out_ptr += group_size;
+      in_ptr += group_size;
+
+      // produce next combination of indices and adjust the out_ptr position
+      // to fix the offset if necessary
+      // the isd (inner shift dim) should skip to next threshold or endpoint
+      // all dimensions to the left increment by 1 when a digit is carried
+      // all dimensions to the right remain set to 0
+      //            +1 +1 +1 +isd_indx_skip
+      // indicies = [i, j, k, l, 0, 0]
+      //                      ^isd
+      for (int j = isd; j >= 0; j--) {
+        int inc = 1;
+        if (j == isd) inc = isd_indx_skip;
+        const int indx = (indicies[j] + inc) % dim_size[j];
+        indicies[j] = indx;
+        if (indx != 0) {
+          if (indx == threshold[j]) {
+            out_ptr -= dim_range[j];  // now wraps around
+          }
+          break;                         // indx != 0 don't need to carry
+        } else if (threshold[j] != 0) {  // if threshold is 0 shift is 0
+          out_ptr += dim_range[j];       // indx became 0 so reverse wrap around
+        }
+      }
+
+      // set isd_indx_skip and group_size for next iteration
+      if (indicies[isd] < threshold[isd]) {
+        isd_indx_skip = threshold[isd] - indicies[isd];
+        group_size = isd_indx_skip * isd_stride;
+      } else {
+        isd_indx_skip = dim_size[isd] - indicies[isd];
+        group_size = isd_indx_skip * isd_stride;
+      }
+    }
+  };
+  // Shard
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  const int64 ave_group_size = dim_range[isd] / 2;
+  const int total_work = 2 * num_elements / std::max<int>(dim_range[isd], 1);
+  // 25000 - expiramentally determined with float and bool types
+  const int cost_per_group = 25000 * sizeof(T) * ave_group_size;
+  Shard(worker_threads->num_threads, worker_threads->workers, total_work,
+        cost_per_group, std::move(work));
+}
+
+template <typename Device, typename T, typename Tshift, typename Taxis>
+class RollOp : public OpKernel {
+ public:
+  explicit RollOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Grab the input tensor
+    const Tensor& input = context->input(0);
+    const Tensor& shift = context->input(1);
+    const Tensor& axis = context->input(2);
+
+    auto shift_flat = shift.flat<Tshift>();
+    auto axis_flat = axis.flat<Taxis>();
+
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input.shape()),
+                errors::InvalidArgument("input must be 1-D or higher"));
+    OP_REQUIRES(context, shift.shape().dims() <= 1,
+                errors::InvalidArgument(
+                    "shift must be a scalar or a 1-D vector. Found: ",
+                    shift.shape().DebugString()));
+    OP_REQUIRES(context, axis.shape().dims() <= 1,
+                errors::InvalidArgument(
+                    "axis must be a scalar or a 1-D vector. Found: ",
+                    axis.shape().DebugString()));
+    OP_REQUIRES(
+        context, shift.shape() == axis.shape(),
+        errors::InvalidArgument("shift and axis must have the same size"));
+    const int64 num_elements = input.NumElements();
+    const int num_shifts = static_cast<int>(shift_flat.size());
+    const int num_dims = input.dims();
+
+    // if there are any duplicate axes, shift_mod_sum will have the
+    // total modulo sum of shifts for each dimension
+    gtl::InlinedVector<int, 4> shift_mod_sum(num_dims, 0);
+    for (int i = 0; i < num_shifts; i++) {
+      const int axis = axis_flat(i);
+      OP_REQUIRES(context, axis < num_dims,
+                  errors::InvalidArgument("axis ", axis, " is out of range"));
+      const int ds = std::max<int>(static_cast<int>(input.dim_size(axis)), 1);
+      const int sum = shift_mod_sum[axis] + static_cast<int>(shift_flat(i));
+      // modulo that works with negatives: ((x % y) + y) % y
+      shift_mod_sum[axis] = (sum % ds + ds) % ds;
+    }
+    // the size of each dimension
+    gtl::InlinedVector<int, 4> dim_size(num_dims);
+    // threshold[i] is the index that the roll starts to wrap back to the front
+    gtl::InlinedVector<int, 4> threshold(num_dims);
+    // dim_range is the number of indices over in the flattened tensor
+    // you need to skip in order to make it over from one side of a dimension
+    // to the other. Used to make the shifts wrap around after a threshold.
+    gtl::InlinedVector<int64, 4> dim_range(num_dims);
+    int64 dim_size_prod = 1;  // dimension size product
+    // inner shift dimension (inner most shifted dimension)
+    int64 isd = 0;
+    for (int i = num_dims - 1; i >= 0; i--) {
+      if (isd == 0 && shift_mod_sum[i] != 0) isd = i;
+      const int ds = std::max<int>(static_cast<int>(input.dim_size(i)), 1);
+      dim_size[i] = ds;
+      threshold[i] = (ds - shift_mod_sum[i]) % ds;
+      dim_size_prod *= static_cast<int64>(input.dim_size(i));
+      dim_range[i] = dim_size_prod;
+    }
+
+    Tensor* output = NULL;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    auto input_flat = input.flat<T>().data();
+    auto output_flat = output->flat<T>().data();
+
+    if (std::is_same<Device, CPUDevice>::value) {
+      if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+        // V2 copies memory in groups instead of element by element
+        DoRollWithMemcpy<T>(context, num_elements, num_dims, dim_size,
+                            input_flat, output_flat, threshold, dim_range, isd);
+      } else {
+        // incase memcpy does not work for current data type
+        DoRoll<T>(context, num_elements, num_dims, dim_size, input_flat,
+                  output_flat, threshold, dim_range);
+      }
+    }
+  }
+};
+
+// Register the CPU kernels.
+#define REGISTER_CPU(type)                                       \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int32>("Tshift")   \
+                              .TypeConstraint<int32>("Taxis"),   \
+                          RollOp<CPUDevice, type, int32, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("Tshift")   \
+                              .TypeConstraint<int32>("Taxis"),   \
+                          RollOp<CPUDevice, type, int64, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int32>("Tshift")   \
+                              .TypeConstraint<int64>("Taxis"),   \
+                          RollOp<CPUDevice, type, int32, int64>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("Tshift")   \
+                              .TypeConstraint<int64>("Taxis"),   \
+                          RollOp<CPUDevice, type, int64, int64>)
+
+TF_CALL_ALL_TYPES(REGISTER_CPU);
+#undef REGISTER_CPU
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc
new file mode 100644
index 0000000000..90b6f8d0f3
--- /dev/null
+++ b/tensorflow/core/kernels/roll_op_test.cc
@@ -0,0 +1,484 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+class RollOpTest : public OpsTestBase {
+ protected:
+  void MakeOp(DataType data_type, DataType index_type) {
+    TF_ASSERT_OK(NodeDefBuilder("myop", "Roll")
+                     .Input(FakeInput(data_type))
+                     .Input(FakeInput(index_type))
+                     .Input(FakeInput(index_type))
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+  }
+};
+
+TEST_F(RollOpTest, ScalarIndices) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5}));
+  test::FillValues<float>(&expected, {2, 3, 4, 0, 1});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ScalarIndices_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5}), {"a", "b", "c", "d", "e"});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5}));
+  test::FillValues<string>(&expected, {"c", "d", "e", "a", "b"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ScalarIndices_Complex) {
+  MakeOp(DT_COMPLEX64, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<std::complex<float>>(
+      TensorShape({5}), {std::complex<float>(0, 10), std::complex<float>(1, 11),
+                         std::complex<float>(2, 12), std::complex<float>(3, 13),
+                         std::complex<float>(4, 14)});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_COMPLEX64, TensorShape({5}));
+  test::FillValues<std::complex<float>>(
+      &expected, {std::complex<float>(2, 12), std::complex<float>(3, 13),
+                  std::complex<float>(4, 14), std::complex<float>(0, 10),
+                  std::complex<float>(1, 11)});
+  test::ExpectTensorEqual<std::complex<float>>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({3, 5}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int32>(TensorShape({2}), {2, -1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5}));
+  test::FillValues<float>(&expected,
+                          {6, 7, 8, 9, 5, 11, 12, 13, 14, 10, 1, 2, 3, 4, 0});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({3, 5}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int32>(TensorShape({2}), {2, -1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({3, 5}));
+  test::FillValues<string>(&expected, {"g", "h", "i", "j", "f", "l", "m", "n",
+                                       "o", "k", "b", "c", "d", "e", "a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3}));
+  test::FillValues<float>(&expected, {10, 11, 9, 7, 8, 6, 4, 5, 3, 1, 2, 0});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({2, 2, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3}));
+  test::FillValues<string>(
+      &expected, {"k", "l", "j", "h", "i", "g", "e", "f", "d", "b", "c", "a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD64) {
+  MakeOp(DT_FLOAT, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int64>(TensorShape({2}), {-1, 4});
+  AddInputFromArray<int64>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3}));
+  test::FillValues<float>(&expected,
+                          {5, 3, 4, 8, 6, 7, 11, 9, 10, 14, 12, 13, 2, 0, 1});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD64_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5, 3}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int64>(TensorShape({2}), {-1, 4});
+  AddInputFromArray<int64>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5, 3}));
+  test::FillValues<string>(&expected, {"f", "d", "e", "i", "g", "h", "l", "j",
+                                       "k", "o", "m", "n", "c", "a", "b"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD64) {
+  MakeOp(DT_FLOAT, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({4, 1, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2});
+  AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 1, 3}));
+  test::FillValues<float>(&expected, {1, 2, 0, 4, 5, 3, 7, 8, 6, 10, 11, 9});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD64_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({4, 1, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2});
+  AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({4, 1, 3}));
+  test::FillValues<string>(
+      &expected, {"b", "c", "a", "e", "f", "d", "h", "i", "g", "k", "l", "j"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroShift_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3}));
+  test::FillValues<float>(&expected, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroShift_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({2, 2, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3}));
+  test::FillValues<string>(
+      &expected, {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroSize_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5, 0, 0}), {});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 0, 0}));
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroSize_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5, 0, 0}), {});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5, 0, 0}));
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, OneSize_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({1, 1, 1}), {5});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1}));
+  test::FillValues<float>(&expected, {5});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, OneSize_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({1, 1, 1}), {"a"});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({1, 1, 1}));
+  test::FillValues<string>(&expected, {"a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, MultiShifts_TwoD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({3, 5}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1});
+  AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5}));
+  test::FillValues<float>(&expected,
+                          {11, 12, 13, 14, 10, 1, 2, 3, 4, 0, 6, 7, 8, 9, 5});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, MultiShifts_TwoD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({3, 5}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1});
+  AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({3, 5}));
+  test::FillValues<string>(&expected, {"l", "m", "n", "o", "k", "b", "c", "d",
+                                       "e", "a", "g", "h", "i", "j", "f"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({}), {7});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("axis must be a scalar or a 1-D vector"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("shift must be a scalar or a 1-D vector"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({1}), {1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("shift and axis must have the same size"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_AxisOutOfRange) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({4}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s;
+}
+
+// isd - (inner shift dimension) The inner most dimension to be shifted.
+//    All outer dimensions will also be shifted for testing.
+static Graph* RollGraph(const TensorShape& shape, int isd) {
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor input(DT_FLOAT, shape);
+  input.flat<float>().setRandom();
+  const int dims = static_cast<int>(input.dims());
+  Tensor shift(DT_INT32, TensorShape({dims}));
+  for (int i = 0; i < dims; i++) {
+    // shift the inner shift dimension and all outer dimensions
+    shift.flat<int32>()(i) = (i <= isd) ? 2 : 0;
+  }
+  Tensor axis(DT_INT32, TensorShape({dims}));
+  for (int i = 0; i < dims; i++) {
+    axis.flat<int32>()(i) = i;
+  }
+  test::graph::Roll(g, test::graph::Constant(g, input),
+                    test::graph::Constant(g, shift),
+                    test::graph::Constant(g, axis));
+  return g;
+}
+
+#define BM_ROLL_OUTER(DEVICE)                                                 \
+  static void BM_##DEVICE##_roll_outer(int iters, int rows, int columns) {    \
+    TensorShape shape{rows, columns};                                         \
+    const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
+    testing::ItemsProcessed(num_items);                                       \
+    testing::BytesProcessed(num_items * sizeof(float));                       \
+    testing::UseRealTime();                                                   \
+    test::Benchmark(#DEVICE, RollGraph(shape, 0)).Run(iters);                 \
+  }                                                                           \
+  BENCHMARK(BM_##DEVICE##_roll_outer)                                         \
+      ->ArgPair(256, 256)                                                     \
+      ->ArgPair(512, 512)                                                     \
+      ->ArgPair(1024, 1024)                                                   \
+      ->ArgPair(2048, 2048)
+
+#define BM_ROLL_ALL(DEVICE)                                                   \
+  static void BM_##DEVICE##_roll_all(int iters, int rows, int columns) {      \
+    TensorShape shape{rows, columns};                                         \
+    const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
+    testing::ItemsProcessed(num_items);                                       \
+    testing::BytesProcessed(num_items * sizeof(float));                       \
+    testing::UseRealTime();                                                   \
+    test::Benchmark(#DEVICE, RollGraph(shape, 1)).Run(iters);                 \
+  }                                                                           \
+  BENCHMARK(BM_##DEVICE##_roll_all)                                           \
+      ->ArgPair(256, 256)                                                     \
+      ->ArgPair(512, 512)                                                     \
+      ->ArgPair(1024, 1024)                                                   \
+      ->ArgPair(2048, 2048)
+
+BM_ROLL_OUTER(cpu);
+BM_ROLL_ALL(cpu);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc
new file mode 100644
index 0000000000..a61272675b
--- /dev/null
+++ b/tensorflow/core/kernels/unravel_index_op.cc
@@ -0,0 +1,122 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+
+namespace {
+template <typename T>
+struct mod_op {
+  const T operator()(const T& a, const T& b) const { return a % b; }
+};
+}  // namespace
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Tidx>
+class UnravelIndexOp : public OpKernel {
+ public:
+  explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& indices_tensor = ctx->input(0);
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(indices_tensor.shape()) ||
+                    TensorShapeUtils::IsScalar(indices_tensor.shape()),
+                errors::InvalidArgument(
+                    "The indices can only be scalar or vector, got \"",
+                    indices_tensor.shape().DebugString(), "\""));
+
+    const Tensor& dims_tensor = ctx->input(1);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVector(dims_tensor.shape()),
+        errors::InvalidArgument("The indices can only be 1-D, got \"",
+                                dims_tensor.shape().DebugString(), "\""));
+
+    auto dims = dims_tensor.vec<Tidx>();
+
+    Eigen::array<bool, 1> reverse({true});
+
+    Tensor strides_tensor;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_temp(DataTypeToEnum<Tidx>::value,
+                                      TensorShape({dims_tensor.NumElements()}),
+                                      &strides_tensor));
+
+    auto strides = strides_tensor.vec<Tidx>();
+    strides = dims.reverse(reverse)
+                  .scan(0, Eigen::internal::ProdReducer<Tidx>(), false)
+                  .reverse(reverse);
+
+    Tensor strides_shifted_tensor;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_temp(DataTypeToEnum<Tidx>::value,
+                                      TensorShape({dims_tensor.NumElements()}),
+                                      &strides_shifted_tensor));
+
+    auto strides_shifted = strides_shifted_tensor.vec<Tidx>();
+    strides_shifted = dims.reverse(reverse)
+                          .scan(0, Eigen::internal::ProdReducer<Tidx>(), true)
+                          .reverse(reverse);
+
+    Tensor* output_tensor = nullptr;
+    if (TensorShapeUtils::IsScalar(indices_tensor.shape())) {
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements()}),
+                                    &output_tensor));
+
+      auto output = output_tensor->vec<Tidx>();
+
+      output = output.constant(indices_tensor.scalar<Tidx>()());
+      output = output.binaryExpr(strides, mod_op<Tidx>()) / strides_shifted;
+    } else {
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_output(0,
+                                    TensorShape({dims_tensor.NumElements(),
+                                                 indices_tensor.NumElements()}),
+                                    &output_tensor));
+
+      auto output = output_tensor->matrix<Tidx>();
+
+      Eigen::array<int64, 2> reshape{{dims_tensor.NumElements(), 1}};
+      Eigen::array<int64, 2> bcast({1, indices_tensor.NumElements()});
+      Eigen::array<int64, 2> indices_reshape{{1, indices_tensor.NumElements()}};
+      Eigen::array<int64, 2> indices_bcast({dims_tensor.NumElements(), 1});
+
+      output = indices_tensor.vec<Tidx>()
+                   .reshape(indices_reshape)
+                   .broadcast(indices_bcast);
+      output = output.binaryExpr(strides.reshape(reshape).broadcast(bcast),
+                                 mod_op<Tidx>()) /
+               strides_shifted.reshape(reshape).broadcast(bcast);
+    }
+  }
+};
+
+#define REGISTER_KERNEL(type)                                               \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("UnravelIndex").Device(DEVICE_CPU).TypeConstraint<type>("Tidx"), \
+      UnravelIndexOp<type>);
+TF_CALL_int32(REGISTER_KERNEL) TF_CALL_int64(REGISTER_KERNEL)
+#undef REGISTER_KERNEL
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/random_inputstream.cc b/tensorflow/core/lib/io/random_inputstream.cc
index 8b8c1392a1..09336e79cd 100644
--- a/tensorflow/core/lib/io/random_inputstream.cc
+++ b/tensorflow/core/lib/io/random_inputstream.cc
@@ -57,6 +57,43 @@ Status RandomAccessInputStream::ReadNBytes(int64 bytes_to_read,
   return Status::OK();
 }
 
+// To limit memory usage, the default implementation of SkipNBytes() only reads
+// 8MB at a time.
+static constexpr int64 kMaxSkipSize = 8 * 1024 * 1024;
+
+Status RandomAccessInputStream::SkipNBytes(int64 bytes_to_skip) {
+  if (bytes_to_skip < 0) {
+    return errors::InvalidArgument("Can't skip a negative number of bytes");
+  }
+  std::unique_ptr<char[]> scratch(new char[kMaxSkipSize]);
+  // Try to read 1 bytes first, if we could complete the read then EOF is
+  // not reached yet and we could return.
+  if (bytes_to_skip > 0) {
+    StringPiece data;
+    Status s = file_->Read(pos_ + bytes_to_skip - 1, 1, &data, scratch.get());
+    if ((s.ok() || errors::IsOutOfRange(s)) && data.size() == 1) {
+      pos_ += bytes_to_skip;
+      return Status::OK();
+    }
+  }
+  // Read kDefaultSkipSize at a time till bytes_to_skip.
+  while (bytes_to_skip > 0) {
+    int64 bytes_to_read = std::min<int64>(kMaxSkipSize, bytes_to_skip);
+    StringPiece data;
+    Status s = file_->Read(pos_, bytes_to_read, &data, scratch.get());
+    if (s.ok() || errors::IsOutOfRange(s)) {
+      pos_ += data.size();
+    } else {
+      return s;
+    }
+    if (data.size() < bytes_to_read) {
+      return errors::OutOfRange("reached end of file");
+    }
+    bytes_to_skip -= bytes_to_read;
+  }
+  return Status::OK();
+}
+
 int64 RandomAccessInputStream::Tell() const { return pos_; }
 
 }  // namespace io
diff --git a/tensorflow/core/lib/io/random_inputstream.h b/tensorflow/core/lib/io/random_inputstream.h
index 09ebe9ba49..bdbdbd71ff 100644
--- a/tensorflow/core/lib/io/random_inputstream.h
+++ b/tensorflow/core/lib/io/random_inputstream.h
@@ -34,6 +34,8 @@ class RandomAccessInputStream : public InputStreamInterface {
 
   Status ReadNBytes(int64 bytes_to_read, string* result) override;
 
+  Status SkipNBytes(int64 bytes_to_skip) override;
+
   int64 Tell() const override;
 
   Status Seek(int64 position) {
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 87dfa77689..267ce88440 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -335,6 +335,13 @@ REGISTER_OP("Unpack")
       return Status::OK();
     });
 
+REGISTER_OP("UnravelIndex")
+    .Input("indices: Tidx")
+    .Input("dims: Tidx")
+    .Output("output: Tidx")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) { return Status::OK(); });
+
 // --------------------------------------------------------------------------
 // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph
 // in the N == 1 case to remove the node.
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index ef2ac267cc..a62e2d782b 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -586,6 +586,17 @@ REGISTER_OP("NonMaxSuppression")
     .Output("selected_indices: int32")
     .Attr("iou_threshold: float = 0.5")
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+      ShapeHandle scores;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+      ShapeHandle max_output_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+      // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
       c->set_output(0, c->Vector(c->UnknownDim()));
       return Status::OK();
     });
@@ -597,6 +608,19 @@ REGISTER_OP("NonMaxSuppressionV2")
     .Input("iou_threshold: float")
     .Output("selected_indices: int32")
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+      ShapeHandle scores;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+      ShapeHandle max_output_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+      ShapeHandle iou_threshold;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
+      // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
       c->set_output(0, c->Vector(c->UnknownDim()));
       return Status::OK();
     });
diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
new file mode 100644
index 0000000000..95b4774fe6
--- /dev/null
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+// --------------------------------------------------------------------------
+REGISTER_OP("Roll")
+    .Input("input: T")
+    .Input("shift: Tshift")
+    .Input("axis: Taxis")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tshift: {int32,int64}")
+    .Attr("Taxis: {int32,int64}")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 62661fe4bd..67481fd202 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1818,7 +1818,7 @@ REGISTER_OP("_MklMaxPool")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Output("workspace: T")
 #else
     .Output("workspace: uint8")
@@ -1844,7 +1844,7 @@ REGISTER_OP("_MklMaxPoolGrad")
     .Input("orig_input: T")
     .Input("orig_output: T")
     .Input("grad: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Input("workspace: T")
 #else
     .Input("workspace: uint8")
@@ -1916,7 +1916,7 @@ REGISTER_OP("_MklLRN")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Output("workspace: T")
 #else
     .Output("workspace: uint8")
@@ -1944,7 +1944,7 @@ REGISTER_OP("_MklLRNGrad")
     .Input("input_grads: T")
     .Input("input_image: T")
     .Input("output_image: T")
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
     .Input("workspace: T")
 #else
     .Input("workspace: uint8")
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index b0d7b3a67a..b570658158 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -97,14 +97,17 @@ std::once_flag g_cpu_feature_guard_warn_once_flag;
 void InfoAboutUnusedCPUFeatures() {
   std::call_once(g_cpu_feature_guard_warn_once_flag, [] {
     string missing_instructions;
-#ifdef PLATFORM_WINDOWS
+#if defined(_MSC_VER) && !defined(__clang__)
+
 #ifndef __AVX__
     CheckIfFeatureUnused(CPUFeature::AVX, "AVX", missing_instructions);
 #endif  // __AVX__
 #ifndef __AVX2__
     CheckIfFeatureUnused(CPUFeature::AVX2, "AVX2", missing_instructions);
 #endif  // __AVX2__
-#else   // ifdef platform windows
+
+#else  // if defined(_MSC_VER) && !defined(__clang__)
+
 #ifndef __SSE__
     CheckIfFeatureUnused(CPUFeature::SSE, "SSE", missing_instructions);
 #endif  // __SSE__
@@ -132,7 +135,7 @@ void InfoAboutUnusedCPUFeatures() {
 #ifndef __FMA__
     CheckIfFeatureUnused(CPUFeature::FMA, "FMA", missing_instructions);
 #endif  // __FMA__
-#endif  // else of ifdef platform windows
+#endif  // else of if defined(_MSC_VER) && !defined(__clang__)
     if (!missing_instructions.empty()) {
       LOG(INFO) << "Your CPU supports instructions that this TensorFlow "
                 << "binary was not compiled to use:" << missing_instructions;
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h
index 2da20bb1b8..7b580c8bf6 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.h
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.h
@@ -42,7 +42,7 @@ namespace profile_utils {
 class CpuUtils {
  public:
   // Constant for invalid frequency.
-  // This value is returned when the furequency is not obtained somehow.
+  // This value is returned when the frequency is not obtained somehow.
   static constexpr int64 INVALID_FREQUENCY = -1;
   static constexpr uint64 DUMMY_CYCLE_CLOCK = 1;
 
@@ -105,7 +105,7 @@ class CpuUtils {
   static int64 GetCycleCounterFrequency();
 #endif
 
-  // Return micro secound per each clock
+  // Return micro second per each clock
   // As this method caches the cpu frequency internally,
   // the first call will incur overhead, but not subsequent calls.
   static double GetMicroSecPerClock();
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index ebda3a2065..52bf0d4694 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <aws/core/Aws.h>
 #include <aws/core/config/AWSProfileConfigLoader.h>
 #include <aws/core/utils/FileSystemUtils.h>
+#include <aws/core/utils/StringUtils.h>
 #include <aws/core/utils/logging/AWSLogging.h>
 #include <aws/core/utils/logging/LogSystemInterface.h>
 #include <aws/s3/S3Client.h>
@@ -128,6 +129,15 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
   return cfg;
 };
 
+void ShutdownClient(Aws::S3::S3Client* s3_client) {
+  if (s3_client != nullptr) {
+    delete s3_client;
+    Aws::SDKOptions options;
+    Aws::ShutdownAPI(options);
+    AWSLogSystem::ShutdownAWSLogging();
+  }
+}
+
 Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
                    string* object) {
   if (!bucket || !object) {
@@ -155,12 +165,12 @@ Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
 
 class S3RandomAccessFile : public RandomAccessFile {
  public:
-  S3RandomAccessFile(const string& bucket, const string& object)
-      : bucket_(bucket), object_(object) {}
+  S3RandomAccessFile(const string& bucket, const string& object,
+                     std::shared_ptr<Aws::S3::S3Client> s3_client)
+      : bucket_(bucket), object_(object), s3_client_(s3_client) {}
 
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
-    Aws::S3::S3Client s3Client(GetDefaultClientConfig());
     Aws::S3::Model::GetObjectRequest getObjectRequest;
     getObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str());
     string bytes = strings::StrCat("bytes=", offset, "-", offset + n - 1);
@@ -168,7 +178,7 @@ class S3RandomAccessFile : public RandomAccessFile {
     getObjectRequest.SetResponseStreamFactory([]() {
       return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag);
     });
-    auto getObjectOutcome = s3Client.GetObject(getObjectRequest);
+    auto getObjectOutcome = this->s3_client_->GetObject(getObjectRequest);
     if (!getObjectOutcome.IsSuccess()) {
       n = 0;
       *result = StringPiece(scratch, n);
@@ -186,13 +196,16 @@ class S3RandomAccessFile : public RandomAccessFile {
  private:
   string bucket_;
   string object_;
+  std::shared_ptr<Aws::S3::S3Client> s3_client_;
 };
 
 class S3WritableFile : public WritableFile {
  public:
-  S3WritableFile(const string& bucket, const string& object)
+  S3WritableFile(const string& bucket, const string& object,
+                 std::shared_ptr<Aws::S3::S3Client> s3_client)
       : bucket_(bucket),
         object_(object),
+        s3_client_(s3_client),
         sync_needed_(true),
         outfile_(Aws::MakeShared<Aws::Utils::TempFile>(
             kS3FileSystemAllocationTag, "/tmp/s3_filesystem_XXXXXX",
@@ -231,17 +244,13 @@ class S3WritableFile : public WritableFile {
     if (!sync_needed_) {
       return Status::OK();
     }
-    Aws::Client::ClientConfiguration clientConfig = GetDefaultClientConfig();
-    clientConfig.connectTimeoutMs = 300000;
-    clientConfig.requestTimeoutMs = 600000;
-    Aws::S3::S3Client s3Client(clientConfig);
     Aws::S3::Model::PutObjectRequest putObjectRequest;
     putObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str());
     long offset = outfile_->tellp();
     outfile_->seekg(0);
     putObjectRequest.SetBody(outfile_);
     putObjectRequest.SetContentLength(offset);
-    auto putObjectOutcome = s3Client.PutObject(putObjectRequest);
+    auto putObjectOutcome = this->s3_client_->PutObject(putObjectRequest);
     outfile_->clear();
     outfile_->seekp(offset);
     if (!putObjectOutcome.IsSuccess()) {
@@ -256,6 +265,7 @@ class S3WritableFile : public WritableFile {
  private:
   string bucket_;
   string object_;
+  std::shared_ptr<Aws::S3::S3Client> s3_client_;
   bool sync_needed_;
   std::shared_ptr<Aws::Utils::TempFile> outfile_;
 };
@@ -274,31 +284,46 @@ class S3ReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
 
 }  // namespace
 
-S3FileSystem::S3FileSystem() {
-  AWSLogSystem::InitializeAWSLogging();
-
-  Aws::SDKOptions options;
-  options.cryptoOptions.sha256Factory_create_fn = []() {
-    return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag);
-  };
-  options.cryptoOptions.sha256HMACFactory_create_fn = []() {
-    return Aws::MakeShared<S3SHA256HmacFactory>(S3CryptoAllocationTag);
-  };
-  Aws::InitAPI(options);
-}
-
-S3FileSystem::~S3FileSystem() {
-  Aws::SDKOptions options;
-  Aws::ShutdownAPI(options);
+S3FileSystem::S3FileSystem()
+    : s3_client_(nullptr, ShutdownClient), client_lock_() {}
+
+S3FileSystem::~S3FileSystem() {}
+
+// Initializes s3_client_, if needed, and returns it.
+std::shared_ptr<Aws::S3::S3Client> S3FileSystem::GetS3Client() {
+  std::lock_guard<mutex> lock(this->client_lock_);
+
+  if (this->s3_client_.get() == nullptr) {
+    AWSLogSystem::InitializeAWSLogging();
+
+    Aws::SDKOptions options;
+    options.cryptoOptions.sha256Factory_create_fn = []() {
+      return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag);
+    };
+    options.cryptoOptions.sha256HMACFactory_create_fn = []() {
+      return Aws::MakeShared<S3SHA256HmacFactory>(S3CryptoAllocationTag);
+    };
+    Aws::InitAPI(options);
+
+    // The creation of S3Client disables virtual addressing:
+    //   S3Client(clientConfiguration, signPayloads, useVirtualAdressing = true)
+    // The purpose is to address the issue encountered when there is an `.`
+    // in the bucket name. Due to TLS hostname validation or DNS rules,
+    // the bucket may not be resolved. Disabling of virtual addressing
+    // should address the issue. See GitHub issue 16397 for details.
+    this->s3_client_ = std::shared_ptr<Aws::S3::S3Client>(new Aws::S3::S3Client(
+        GetDefaultClientConfig(),
+        Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false));
+  }
 
-  AWSLogSystem::ShutdownAWSLogging();
+  return this->s3_client_;
 }
 
 Status S3FileSystem::NewRandomAccessFile(
     const string& fname, std::unique_ptr<RandomAccessFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
-  result->reset(new S3RandomAccessFile(bucket, object));
+  result->reset(new S3RandomAccessFile(bucket, object, this->GetS3Client()));
   return Status::OK();
 }
 
@@ -306,7 +331,7 @@ Status S3FileSystem::NewWritableFile(const string& fname,
                                      std::unique_ptr<WritableFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
-  result->reset(new S3WritableFile(bucket, object));
+  result->reset(new S3WritableFile(bucket, object, this->GetS3Client()));
   return Status::OK();
 }
 
@@ -321,7 +346,7 @@ Status S3FileSystem::NewAppendableFile(const string& fname,
 
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
-  result->reset(new S3WritableFile(bucket, object));
+  result->reset(new S3WritableFile(bucket, object, this->GetS3Client()));
 
   while (true) {
     status = reader->Read(offset, kS3ReadAppendableFileBufferSize, &read_chunk,
@@ -372,7 +397,6 @@ Status S3FileSystem::GetChildren(const string& dir,
     prefix.push_back('/');
   }
 
-  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
   Aws::S3::Model::ListObjectsRequest listObjectsRequest;
   listObjectsRequest.WithBucket(bucket.c_str())
       .WithPrefix(prefix.c_str())
@@ -383,7 +407,8 @@ Status S3FileSystem::GetChildren(const string& dir,
 
   Aws::S3::Model::ListObjectsResult listObjectsResult;
   do {
-    auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+    auto listObjectsOutcome =
+        this->GetS3Client()->ListObjects(listObjectsRequest);
     if (!listObjectsOutcome.IsSuccess()) {
       string error = strings::StrCat(
           listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -417,11 +442,10 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, true, &bucket, &object));
 
-  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
   if (object.empty()) {
     Aws::S3::Model::HeadBucketRequest headBucketRequest;
     headBucketRequest.WithBucket(bucket.c_str());
-    auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest);
+    auto headBucketOutcome = this->GetS3Client()->HeadBucket(headBucketRequest);
     if (!headBucketOutcome.IsSuccess()) {
       string error = strings::StrCat(
           headBucketOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -439,7 +463,7 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
   headObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str());
   headObjectRequest.SetResponseStreamFactory(
       []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
-  auto headObjectOutcome = s3Client.HeadObject(headObjectRequest);
+  auto headObjectOutcome = this->GetS3Client()->HeadObject(headObjectRequest);
   if (headObjectOutcome.IsSuccess()) {
     stats->length = headObjectOutcome.GetResult().GetContentLength();
     stats->is_directory = 0;
@@ -457,7 +481,8 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
       .WithMaxKeys(1);
   listObjectsRequest.SetResponseStreamFactory(
       []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
-  auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+  auto listObjectsOutcome =
+      this->GetS3Client()->ListObjects(listObjectsRequest);
   if (listObjectsOutcome.IsSuccess()) {
     if (listObjectsOutcome.GetResult().GetContents().size() > 0) {
       stats->length = 0;
@@ -475,11 +500,11 @@ Status S3FileSystem::DeleteFile(const string& fname) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
 
-  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
   Aws::S3::Model::DeleteObjectRequest deleteObjectRequest;
   deleteObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str());
 
-  auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest);
+  auto deleteObjectOutcome =
+      this->GetS3Client()->DeleteObject(deleteObjectRequest);
   if (!deleteObjectOutcome.IsSuccess()) {
     string error = strings::StrCat(
         deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -494,10 +519,9 @@ Status S3FileSystem::CreateDir(const string& dirname) {
   TF_RETURN_IF_ERROR(ParseS3Path(dirname, true, &bucket, &object));
 
   if (object.empty()) {
-    Aws::S3::S3Client s3Client(GetDefaultClientConfig());
     Aws::S3::Model::HeadBucketRequest headBucketRequest;
     headBucketRequest.WithBucket(bucket.c_str());
-    auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest);
+    auto headBucketOutcome = this->GetS3Client()->HeadBucket(headBucketRequest);
     if (!headBucketOutcome.IsSuccess()) {
       return errors::NotFound("The bucket ", bucket, " was not found.");
     }
@@ -517,7 +541,6 @@ Status S3FileSystem::DeleteDir(const string& dirname) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(dirname, false, &bucket, &object));
 
-  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
   string prefix = object;
   if (prefix.back() != '/') {
     prefix.push_back('/');
@@ -528,7 +551,8 @@ Status S3FileSystem::DeleteDir(const string& dirname) {
       .WithMaxKeys(2);
   listObjectsRequest.SetResponseStreamFactory(
       []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
-  auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+  auto listObjectsOutcome =
+      this->GetS3Client()->ListObjects(listObjectsRequest);
   if (listObjectsOutcome.IsSuccess()) {
     auto contents = listObjectsOutcome.GetResult().GetContents();
     if (contents.size() > 1 ||
@@ -568,8 +592,6 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
     }
   }
 
-  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
-
   Aws::S3::Model::CopyObjectRequest copyObjectRequest;
   Aws::S3::Model::DeleteObjectRequest deleteObjectRequest;
 
@@ -582,7 +604,8 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
 
   Aws::S3::Model::ListObjectsResult listObjectsResult;
   do {
-    auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+    auto listObjectsOutcome =
+        this->GetS3Client()->ListObjects(listObjectsRequest);
     if (!listObjectsOutcome.IsSuccess()) {
       string error = strings::StrCat(
           listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -595,13 +618,15 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
       Aws::String src_key = object.GetKey();
       Aws::String target_key = src_key;
       target_key.replace(0, src_object.length(), target_object.c_str());
-      Aws::String source = Aws::String(src_bucket.c_str()) + "/" + src_key;
+      Aws::String source = Aws::String(src_bucket.c_str()) + "/" +
+                           Aws::Utils::StringUtils::URLEncode(src_key.c_str());
 
       copyObjectRequest.SetBucket(target_bucket.c_str());
       copyObjectRequest.SetKey(target_key);
       copyObjectRequest.SetCopySource(source);
 
-      auto copyObjectOutcome = s3Client.CopyObject(copyObjectRequest);
+      auto copyObjectOutcome =
+          this->GetS3Client()->CopyObject(copyObjectRequest);
       if (!copyObjectOutcome.IsSuccess()) {
         string error = strings::StrCat(
             copyObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -612,7 +637,8 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
       deleteObjectRequest.SetBucket(src_bucket.c_str());
       deleteObjectRequest.SetKey(src_key.c_str());
 
-      auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest);
+      auto deleteObjectOutcome =
+          this->GetS3Client()->DeleteObject(deleteObjectRequest);
       if (!deleteObjectOutcome.IsSuccess()) {
         string error = strings::StrCat(
             deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h
index 31ba3cecc5..31264be621 100644
--- a/tensorflow/core/platform/s3/s3_file_system.h
+++ b/tensorflow/core/platform/s3/s3_file_system.h
@@ -16,7 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
 #define TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
 
+#include <aws/s3/S3Client.h>
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
 
@@ -53,6 +55,26 @@ class S3FileSystem : public FileSystem {
   Status GetFileSize(const string& fname, uint64* size) override;
 
   Status RenameFile(const string& src, const string& target) override;
+
+ private:
+  // Returns the member S3 client, initializing as-needed.
+  // When the client tries to access the object in S3, e.g.,
+  //   s3://bucket-name/path/to/object
+  // the behavior could be controlled by various environmental
+  // variables.
+  // By default S3 access regional endpoint, with region
+  // controlled by `AWS_REGION`. The endpoint could be overridden
+  // explicitly with `S3_ENDPOINT`. S3 uses HTTPS by default.
+  // If S3_USE_HTTPS=0 is specified, HTTP is used. Also,
+  // S3_VERIFY_SSL=0 could disable SSL verification in case
+  // HTTPS is used.
+  // This S3 Client does not support Virtual Hosted–Style Method
+  // for a bucket.
+  std::shared_ptr<Aws::S3::S3Client> GetS3Client();
+
+  std::shared_ptr<Aws::S3::S3Client> s3_client_;
+  // Lock held when checking for s3_client_ initialization.
+  mutex client_lock_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/s3/s3_file_system_test.cc b/tensorflow/core/platform/s3/s3_file_system_test.cc
index 0b42f5fcec..d4411d9865 100644
--- a/tensorflow/core/platform/s3/s3_file_system_test.cc
+++ b/tensorflow/core/platform/s3/s3_file_system_test.cc
@@ -130,6 +130,8 @@ TEST_F(S3FileSystemTest, NewReadOnlyMemoryRegionFromFile) {
 
 TEST_F(S3FileSystemTest, FileExists) {
   const string fname = TmpDir("FileExists");
+  // Ensure the file doesn't yet exist.
+  TF_ASSERT_OK(s3fs.DeleteFile(fname));
   EXPECT_EQ(error::Code::NOT_FOUND, s3fs.FileExists(fname).code());
   TF_ASSERT_OK(WriteString(fname, "test"));
   TF_EXPECT_OK(s3fs.FileExists(fname));
diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h
index d6e78dbc8f..f20939d3c0 100644
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@@ -22,8 +22,10 @@ limitations under the License.
 // Byte order defines provided by gcc. MSVC doesn't define those so
 // we define them here.
 // We assume that all windows platform out there are little endian.
+#if defined(_MSC_VER) && !defined(__clang__)
 #define __ORDER_LITTLE_ENDIAN__ 0x4d2
 #define __ORDER_BIG_ENDIAN__ 0x10e1
 #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#endif
 
 #endif  // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 460f935e4a..57d76eb4cb 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -240,8 +240,9 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file.
 # can also generate memory profile using `-select bytes`
 tfprof> code -select accelerator_micros -max_depth 100000 -output pprof:outfile=<filename>  -trim_name_regexes .*apply_op.*
 
-# Use pprof to visualize the generated file.
-pprof -png --nodecount=100 --sample_index=1 <filename>
+# Use google-pprof, from the google-perftools package to visualize the generated file.
+# On Ubuntu you can install it with `apt-get install it google-perftools`.
+google-pprof --pdf --nodecount=100 <filename>
 ```
 
 ![PprofGraph](g3doc/pprof.jpg)
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h
index 0790cb0ca6..db148c936c 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.h
+++ b/tensorflow/core/profiler/internal/tfprof_stats.h
@@ -83,7 +83,7 @@ class TFStats {
   const MultiGraphNodeProto& ShowMultiGraphNode(const string& cmd,
                                                 const Options& opts) const;
 
-  // A a (partial) graph to existing graph.
+  // Add a (partial) graph to existing graph.
   void AddGraph(std::unique_ptr<GraphDef> graph);
 
   // Add a step of run time meta data.
@@ -118,7 +118,7 @@ class TFStats {
   MultiGraphNodeProto empty_multi_graph_node_;
 
   std::map<int64, string> id_to_string_;
-  // Graph nodes covered by RunMetdata, that is traced with run time stats.
+  // Graph nodes covered by RunMetadata, that is traced with run time stats.
   std::set<int64> covered_nodes_;
 };
 
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index 2cc212d589..808e3c853b 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -206,8 +206,12 @@ int Run(int argc, char** argv) {
         "graph_path,op_log_path,run_meta_path\n");
     std::unique_ptr<GraphDef> graph(new GraphDef());
     if (!FLAGS_graph_path.empty()) {
-      TF_CHECK_OK(
-          ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false));
+      s = ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false);
+      if (!s.ok()) {
+        fprintf(stderr, "Failed to read graph_path: %s\n",
+                s.ToString().c_str());
+        return 1;
+      }
     }
 
     std::unique_ptr<OpLogProto> op_log(new OpLogProto());
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 67da7bf452..b02f899b87 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 864e7e39c2..db4c5c35e3 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -35,7 +35,7 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::engine;
@@ -325,7 +325,7 @@ class MklShape {
       nullptr;  // TF dimension corresponding to this MKL dimension
 };
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 // Forward decl
 TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
@@ -660,7 +660,7 @@ class MklDnnShape {
 
 typedef std::vector<MklShape> MklShapeList;
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 typedef std::vector<MklDnnShape> MklDnnShapeList;
 #endif
 
@@ -674,7 +674,7 @@ inline bool AreAllMklTensors(const MklShapeList& shapes) {
   return true;
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
                              const MklShape& mkl_shape) {
@@ -725,7 +725,7 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
           sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) {
   mklshape->DeSerializeMklDnnShape(
       ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
@@ -749,7 +749,7 @@ inline void GetMklInputList(OpKernelContext* ctext, StringPiece name,
   ctext->input_list(name, input_tensors);
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
                             MklShapeList* mkl_shapes) {
@@ -779,7 +779,7 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
 
 #endif
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 /// Get shape of input tensor pointed by 'input_idx' in TensorShape format.
 /// If the input tensor is in MKL layout, then obtains TensorShape from
 /// MklShape.
@@ -814,7 +814,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 // Allocate the second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -851,7 +851,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 // Allocate the output tensor, create a second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -875,7 +875,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
 
 // Allocates a temp tensor and returns the data buffer for temporary storage.
 // Currently
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 template <typename T>
 inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
                            const memory::primitive_desc& pd, void** buf_out) {
@@ -994,7 +994,7 @@ inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in,
   context->set_output(idx_meta_out, meta_output);
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in,
                                          int idx_out,
                                          const TensorShape& shape) {
@@ -1032,7 +1032,7 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in,
 }
 #endif
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in,
                                    int idx_out) {
@@ -1090,7 +1090,7 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in,
   }
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
                                                 int idx_in, int idx_out,
                                                 const MklDnnShape& mkl_shape) {
@@ -1132,7 +1132,7 @@ inline void SetDummyMklShapeOutput(OpKernelContext* context,
   AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);
 }
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 // We don't need these functions in MKLDNN. We have defined equality operator
 // on MklDnnShape class directly.
 
@@ -1242,7 +1242,7 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
 
 // -------------------------------------------------------------------
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 /// Return MKL-DNN data type (memory::data_type) for input type T
 ///
@@ -1753,7 +1753,7 @@ class MklDnnData {
   }
 };
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc
index 8b73eadb40..cd1d0713ad 100644
--- a/tensorflow/core/util/mkl_util_test.cc
+++ b/tensorflow/core/util/mkl_util_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 TEST(MklUtilTest, MklDnnTfShape) {
   auto cpu_engine = engine(engine::cpu, 0);
@@ -84,7 +84,7 @@ TEST(MklUtilTest, MklDnnBlockedFormatTest) {
   EXPECT_EQ(b_md2.data.format, mkldnn_blocked);
 }
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 }  // namespace
 }  // namespace tensorflow
 
diff --git a/tensorflow/docs_src/about/bib.md b/tensorflow/docs_src/about/bib.md
index c9f0c532c6..5593a3d95c 100644
--- a/tensorflow/docs_src/about/bib.md
+++ b/tensorflow/docs_src/about/bib.md
@@ -60,7 +60,7 @@ author={
     Lukasz~Kaiser and
     Manjunath~Kudlur and
     Josh~Levenberg and
-    Dan~Man\'{e} and
+    Dandelion~Man\'{e} and
     Rajat~Monga and
     Sherry~Moore and
     Derek~Murray and
diff --git a/tensorflow/docs_src/api_guides/python/contrib.signal.md b/tensorflow/docs_src/api_guides/python/contrib.signal.md
index 85ef3ad134..0f7690f80a 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.signal.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.signal.md
@@ -28,14 +28,14 @@ The `axis` parameter to @{tf.contrib.signal.frame} allows you to frame tensors
 with inner structure (e.g. a spectrogram):
 
 ```python
-# `magnitude_spectrograms` is a [batch_size, ?, 127] tensor of spectrograms. We
+# `magnitude_spectrograms` is a [batch_size, ?, 129] tensor of spectrograms. We
 # would like to produce overlapping fixed-size spectrogram patches; for example,
 # for use in a situation where a fixed size input is needed.
 magnitude_spectrograms = tf.abs(tf.contrib.signal.stft(
     signals, frame_length=256, frame_step=64, fft_length=256))
 
-# `spectrogram_patches` is a [batch_size, ?, 64, 127] tensor containing a
-# variable number of [64, 127] spectrogram patches per batch item.
+# `spectrogram_patches` is a [batch_size, ?, 64, 129] tensor containing a
+# variable number of [64, 129] spectrogram patches per batch item.
 spectrogram_patches = tf.contrib.signal.frame(
     magnitude_spectrograms, frame_length=64, frame_step=16, axis=1)
 ```
diff --git a/tensorflow/docs_src/api_guides/python/regression_examples.md b/tensorflow/docs_src/api_guides/python/regression_examples.md
index 45cb9d829c..dae50a8f03 100644
--- a/tensorflow/docs_src/api_guides/python/regression_examples.md
+++ b/tensorflow/docs_src/api_guides/python/regression_examples.md
@@ -229,4 +229,4 @@ passed through to the `model_fn` when the `model_fn` is called.
 The `model_fn` returns an
 @{tf.estimator.EstimatorSpec$`EstimatorSpec`} which is a simple structure
 indicating to the `Estimator` which operations should be run to accomplish
-varions tasks.
+various tasks.
diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
index 6343cc4ee4..42a246678a 100644
--- a/tensorflow/docs_src/get_started/custom_estimators.md
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -15,7 +15,7 @@ git clone https://github.com/tensorflow/models/
 cd models/samples/core/get_started
 ```
 
-In this document we wil be looking at
+In this document we will be looking at
 [`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py).
 You can run it with the following command:
 
@@ -161,7 +161,7 @@ classifier = tf.estimator.Estimator(
 
 To implement a typical model function, you must do the following:
 
-* (Define the model)[#define_the_model].
+* [Define the model](#define_the_model).
 * Specify additional calculations for each of
   the [three different modes](#modes):
   * [Predict](#predict)
diff --git a/tensorflow/docs_src/get_started/datasets_quickstart.md b/tensorflow/docs_src/get_started/datasets_quickstart.md
index ecfbf160f0..a8a2ab6e56 100644
--- a/tensorflow/docs_src/get_started/datasets_quickstart.md
+++ b/tensorflow/docs_src/get_started/datasets_quickstart.md
@@ -169,7 +169,7 @@ the number of examples in the `Dataset` ensures that the data is completely
 shuffled. The Iris data set only contains 150 examples.
 
 The @{tf.data.Dataset.repeat$`repeat`} method has the `Dataset` restart when
-it reaches the end. To limit the number of epochss, set the `count` argument.
+it reaches the end. To limit the number of epochs, set the `count` argument.
 
 The @{tf.data.Dataset.repeat$`batch`} method collects a number of examples and
 stacks them, to create batches. This adds a dimension to their shape. The new
@@ -282,7 +282,7 @@ produce the necessary `(features, label)` pairs.
 
 We will start by building a function to parse a single line.
 
-The following `iris_data.parse_line` function acomplishes this taks using the
+The following `iris_data.parse_line` function accomplishes this task using the
 @{tf.decode_csv} function, and some simple python code:
 
 We must parse each of the lines in the dataset in order to generate the
diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md
index e3308ed716..ad3e1fe3e3 100644
--- a/tensorflow/docs_src/get_started/feature_columns.md
+++ b/tensorflow/docs_src/get_started/feature_columns.md
@@ -461,8 +461,8 @@ permitting a richer palette of numbers for every cell, an embedding column
 contains far fewer cells than an indicator column.
 
 Let's look at an example comparing indicator and embedding columns. Suppose our
-input examples consists of different words from a limited palette of only 81
-words. Further suppose that the data set provides provides the following input
+input examples consist of different words from a limited palette of only 81
+words. Further suppose that the data set provides the following input
 words in 4 separate examples:
 
 * `"dog"`
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
index 45850a8996..4f01f997c3 100644
--- a/tensorflow/docs_src/get_started/premade_estimators.md
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -372,7 +372,7 @@ Test set accuracy: 0.967
 
 We now have a trained model that produces good evaluation results.
 We can now use the trained model to predict the species of an Iris flower
-based on some unlabeled measurments. As with training and evaluation, we make
+based on some unlabeled measurements. As with training and evaluation, we make
 predictions using a single function call:
 
 ```python
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index ba1a4118ae..14add7c77e 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 87cc647317..d2af9d9843 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 37e109a6e4..e5388c4b1e 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.5.0-rc1</version>
+  <version>1.5.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.5.0-rc1</version>
+                 <version>1.5.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -123,12 +123,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.5.0-rc1</version>
+  <version>1.5.0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.5.0-rc1</version>
+  <version>1.5.0</version>
 </dependency>
 ```
 
@@ -147,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0-rc1.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0.zip).
   3. Extract this .zip file.
 
 
@@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.5.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.5.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.5.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.5.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.5.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.5.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 03f12dff08..cd8c14599f 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -31,13 +31,13 @@ If you are installing TensorFlow with GPU support using one of the
 mechanisms described in this guide, then the following NVIDIA software
 must be installed on your system:
 
-  * CUDA® Toolkit 8.0. For details, see
+  * CUDA® Toolkit 9.0. For details, see
     [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
     Ensure that you append the relevant Cuda pathnames to the
     `LD_LIBRARY_PATH` environment variable as described in the
     NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v6.0. For details, see
+  * The NVIDIA drivers associated with CUDA Toolkit 9.0.
+  * cuDNN v7.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
     Ensure that you create the `CUDA_HOME` environment variable as
     described in the NVIDIA documentation.
@@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -293,7 +293,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -705,14 +705,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index e13ddadab7..f49d3a2f08 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -115,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -238,7 +238,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -347,7 +347,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -520,7 +520,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl
 </pre>
 
 
@@ -528,5 +528,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 485863bf2e..bc7d2080dc 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -133,7 +133,7 @@ The following NVIDIA <i>hardware</i> must be installed on your system:
 
 The following NVIDIA <i>software</i> must be installed on your system:
 
-  * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 8.0.
+  * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 9.0.
     For details, see
     [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
     Ensure that you append the relevant Cuda pathnames to the
@@ -289,11 +289,11 @@ Do you wish to build TensorFlow with CUDA support? [y/N] <b>Y</b>
 CUDA support will be enabled for TensorFlow
 Do you want to use clang as CUDA compiler? [y/N]
 nvcc will be used as CUDA compiler
-Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: <b>8.0</b>
-Please specify the location where CUDA 8.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
+Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
+Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
 Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
-Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: <b>6</b>
-Please specify the location where cuDNN 6 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
+Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7</b>
+Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
 Please specify a list of comma-separated Cuda compute capabilities you want to build with.
 You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
 Please note that each additional compute capability significantly increases your build time and binary size.
@@ -359,10 +359,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.5.0rc1 on Linux:
+for TensorFlow 1.5.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -461,8 +461,8 @@ Stack Overflow and specify the `tensorflow` tag.
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
 
-<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.5.0-rc1</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@@ -478,7 +478,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
@@ -491,8 +491,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.5.0-rc1</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.5.0-rc1</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 8d0eb7966f..86a111c2ec 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -30,13 +30,13 @@ If you are installing TensorFlow with GPU support using one of the mechanisms
 described in this guide, then the following NVIDIA software must be
 installed on your system:
 
-  * CUDA® Toolkit 8.0. For details, see
+  * CUDA® Toolkit 9.0. For details, see
     [NVIDIA's
     documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/)
     Ensure that you append the relevant Cuda pathnames to the `%PATH%`
     environment variable as described in the NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v6.0. For details, see
+  * The NVIDIA drivers associated with CUDA Toolkit 9.0.
+  * cuDNN v7.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
     Note that cuDNN is typically installed in a different location from the
     other CUDA DLLs. Ensure that you add the directory where you installed
diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index 2b4896c381..9049a5a9f3 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -125,14 +125,14 @@ an operation:
   @{tf.Tensor} accepts an optional `name` argument. For example,
   `tf.constant(42.0, name="answer")` creates a new @{tf.Operation} named
   `"answer"` and returns a @{tf.Tensor} named `"answer:0"`. If the default graph
-  already contained an operation named `"answer"`, the TensorFlow would append
+  already contains an operation named `"answer"`, then TensorFlow would append
   `"_1"`, `"_2"`, and so on to the name, in order to make it unique.
 
 * The @{tf.name_scope} function makes it possible to add a **name scope** prefix
   to all operations created in a particular context. The current name scope
   prefix is a `"/"`-delimited list of the names of all active @{tf.name_scope}
   context managers. If a name scope has already been used in the current
-  context, TensorFlow appens `"_1"`, `"_2"`, and so on. For example:
+  context, TensorFlow appends `"_1"`, `"_2"`, and so on. For example:
 
   ```python
   c_0 = tf.constant(0, name="c")  # => operation named "c"
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index 46df5973e8..1214647797 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -92,7 +92,7 @@ android_binary(
 filegroup(
     name = "external_assets",
     srcs = [
-        "@inception5h//:model_files",
+        "@inception_v1//:model_files",
         "@mobile_ssd//:model_files",
         "@speech_commands//:model_files",
         "@stylize//:model_files",
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index f7bdf8b816..0767726aa9 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -56,10 +56,12 @@ def nativeOutDir = 'libs/' + cpuType
 def nativeBuildRule = 'buildNativeBazel'
 def demoLibPath = '../../../bazel-bin/tensorflow/examples/android/libtensorflow_demo.so'
 def inferenceLibPath = '../../../bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so'
+
+// Override for Makefile builds.
 if (nativeBuildSystem == 'makefile') {
     nativeBuildRule = 'buildNativeMake'
-    demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_demo.so'
-    inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_inference.so'
+    demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_demo.so'
+    inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_inference.so'
 }
 
 // If building with Bazel, this is the location of the bazel binary.
@@ -154,7 +156,8 @@ task buildNativeMake(type: Exec) {
          '-s',  \
          'tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in',  \
          '-t',  \
-         'libtensorflow_inference.so libtensorflow_demo.so'  \
+         'libtensorflow_inference.so libtensorflow_demo.so all'  \
+         , '-a', cpuType  \
          //, '-T'  // Uncomment to skip protobuf and speed up subsequent builds.
 }
 
diff --git a/tensorflow/examples/android/download-models.gradle b/tensorflow/examples/android/download-models.gradle
index 0e2cf65f53..d3b67eab52 100644
--- a/tensorflow/examples/android/download-models.gradle
+++ b/tensorflow/examples/android/download-models.gradle
@@ -9,7 +9,7 @@
  */
 // hard coded model files
 // LINT.IfChange
-def models = ['inception5h.zip',
+def models = ['inception_v1.zip',
               'object_detection/ssd_mobilenet_v1_android_export.zip',
               'stylize_v1.zip',
               'speech_commands_conv_actions.zip']
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java b/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
index a317273acd..068c7b0d94 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
@@ -81,8 +81,11 @@ public class LegacyCameraConnectionFragment extends Fragment {
 
           try {
             Camera.Parameters parameters = camera.getParameters();
-            parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
-
+            List<String> focusModes = parameters.getSupportedFocusModes();
+            if (focusModes != null
+                && focusModes.contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE)) {
+              parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
+            }
             List<Camera.Size> cameraSizes = parameters.getSupportedPreviewSizes();
             Size[] sizes = new Size[cameraSizes.size()];
             int i = 0;
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
index 2fe2ba539e..af6af2bc8f 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
@@ -199,7 +199,7 @@ public class MultiBoxTracker {
       final int w,
       final int h,
       final int rowStride,
-      final int sensorOrienation,
+      final int sensorOrientation,
       final byte[] frame,
       final long timestamp) {
     if (objectTracker == null && !initialized) {
@@ -209,7 +209,7 @@ public class MultiBoxTracker {
       objectTracker = ObjectTracker.getInstance(w, h, rowStride, true);
       frameWidth = w;
       frameHeight = h;
-      this.sensorOrientation = sensorOrienation;
+      this.sensorOrientation = sensorOrientation;
       initialized = true;
 
       if (objectTracker == null) {
diff --git a/tensorflow/examples/udacity/Dockerfile b/tensorflow/examples/udacity/Dockerfile
index 3ca58566c1..00eb853e52 100644
--- a/tensorflow/examples/udacity/Dockerfile
+++ b/tensorflow/examples/udacity/Dockerfile
@@ -8,7 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN pip install scikit-learn pyreadline Pillow
+RUN pip install scikit-learn pyreadline Pillow imageio
 RUN rm -rf /notebooks/*
 ADD *.ipynb /notebooks/
 WORKDIR /notebooks
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 2b4d5b8e0f..f5cd7885e7 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -76,6 +76,7 @@ py_library(
         ":layers",
         ":lib",
         ":list_ops",
+        ":manip_ops",
         ":math_ops",
         ":metrics",
         ":nn",
@@ -1423,6 +1424,14 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "manip_ops_gen",
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "math_ops_gen",
     visibility = [
@@ -1755,6 +1764,8 @@ py_library(
         ":linalg_grad",
         ":linalg_ops",
         ":logging_ops",
+        ":manip_grad",
+        ":manip_ops",
         ":math_grad",
         ":math_ops",
         ":platform",
@@ -1877,6 +1888,29 @@ py_library(
     ],
 )
 
+py_library(
+    name = "manip_grad",
+    srcs = ["ops/manip_grad.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":control_flow_ops",
+        ":framework_for_generated_wrappers",
+        ":manip_ops",
+    ],
+)
+
+py_library(
+    name = "manip_ops",
+    srcs = ["ops/manip_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework_ops",
+        ":manip_ops_gen",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "logging_ops",
     srcs = ["ops/logging_ops.py"],
@@ -2339,6 +2373,8 @@ py_library(
         ":linalg_ops",
         ":logging_ops",
         ":lookup_ops",
+        ":manip_grad",
+        ":manip_ops",
         ":math_grad",
         ":math_ops",
         ":numerics",
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index bc9ddec2a5..ea7604d30f 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -84,6 +84,7 @@ from tensorflow.python.feature_column import feature_column_lib as feature_colum
 from tensorflow.python.layers import layers
 from tensorflow.python.ops import bitwise_ops as bitwise
 from tensorflow.python.ops import image_ops as image
+from tensorflow.python.ops import manip_ops as manip
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import sets
@@ -241,6 +242,7 @@ _allowed_symbols.extend([
     'linalg',
     'logging',
     'losses',
+    'manip',
     'metrics',
     'newaxis',
     'nn',
diff --git a/tensorflow/python/client/session_benchmark.py b/tensorflow/python/client/session_benchmark.py
index 721bca91b7..da74855193 100644
--- a/tensorflow/python/client/session_benchmark.py
+++ b/tensorflow/python/client/session_benchmark.py
@@ -22,6 +22,7 @@ import time
 
 import numpy as np
 
+from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 5d318531d5..c4b7e4919b 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -203,7 +203,7 @@ class Dataset(object):
       tensors: A nested structure of tensors.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return TensorDataset(tensors)
 
@@ -216,7 +216,7 @@ class Dataset(object):
         0th dimension.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return TensorSliceDataset(tensors)
 
@@ -229,7 +229,7 @@ class Dataset(object):
       sparse_tensor: A `tf.SparseTensor`.
 
     Returns:
-      A `Dataset` of rank-(N-1) sparse tensors.
+      Dataset: A `Dataset` of rank-(N-1) sparse tensors.
     """
     return SparseTensorSliceDataset(sparse_tensor)
 
@@ -315,7 +315,7 @@ class Dataset(object):
         `generator`.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     if not callable(generator):
       raise TypeError("`generator` must be callable.")
@@ -458,7 +458,7 @@ class Dataset(object):
         len(args) == 3 -> start = args[0], stop = args[1, stop = args[2]
 
     Returns:
-      A `RangeDataset`.
+      Dataset: A `RangeDataset`.
 
     Raises:
       ValueError: if len(args) == 0.
@@ -502,7 +502,7 @@ class Dataset(object):
       datasets: A nested structure of datasets.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return ZipDataset(datasets)
 
@@ -528,7 +528,7 @@ class Dataset(object):
       dataset: `Dataset` to be concatenated.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return ConcatenateDataset(self, dataset)
 
@@ -540,7 +540,7 @@ class Dataset(object):
         maximum number elements that will be buffered when prefetching.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return PrefetchDataset(self, buffer_size)
 
@@ -558,12 +558,14 @@ class Dataset(object):
         - /path/to/dir/b.py
         - /path/to/dir/c.py
 
+    NOTE: The order of the file names returned can be non-deterministic.
+
     Args:
       file_pattern: A string or scalar string `tf.Tensor`, representing
         the filename pattern that will be matched.
 
     Returns:
-     A `Dataset` of strings corresponding to file names.
+     Dataset: A `Dataset` of strings corresponding to file names.
     """
     return Dataset.from_tensor_slices(gen_io_ops.matching_files(file_pattern))
 
@@ -580,7 +582,7 @@ class Dataset(object):
         indefinitely.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return RepeatDataset(self, count)
 
@@ -604,7 +606,7 @@ class Dataset(object):
         iterated over. (Defaults to `True`.)
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return ShuffleDataset(self, buffer_size, seed, reshuffle_each_iteration)
 
@@ -617,7 +619,7 @@ class Dataset(object):
         If a filename is not provided, the dataset will be cached in memory.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return CacheDataset(self, filename)
 
@@ -631,7 +633,7 @@ class Dataset(object):
         dataset, the new dataset will contain all elements of this dataset.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return TakeDataset(self, count)
 
@@ -646,7 +648,7 @@ class Dataset(object):
         is -1, skips the entire dataset.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return SkipDataset(self, count)
 
@@ -693,7 +695,7 @@ class Dataset(object):
       index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
 
     Raises:
       ValueError: if `num_shards` or `index` are illegal values. Note: error
@@ -737,7 +739,7 @@ class Dataset(object):
         consecutive elements of this dataset to combine in a single batch.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return BatchDataset(self, batch_size)
 
@@ -766,7 +768,7 @@ class Dataset(object):
         the empty string for string types.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return PaddedBatchDataset(self, batch_size, padded_shapes, padding_values)
 
@@ -782,7 +784,7 @@ class Dataset(object):
         specified, elements will be processed sequentially.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     if num_parallel_calls is None:
       return MapDataset(self, map_func)
@@ -798,7 +800,7 @@ class Dataset(object):
         `Dataset`.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return FlatMapDataset(self, map_func)
 
@@ -867,7 +869,7 @@ class Dataset(object):
         input element before cycling to another input element.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return InterleaveDataset(self, map_func, cycle_length, block_length)
 
@@ -880,7 +882,7 @@ class Dataset(object):
         scalar `tf.bool` tensor.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return FilterDataset(self, predicate)
 
@@ -901,10 +903,11 @@ class Dataset(object):
 
     Args:
       transformation_func: A function that takes one `Dataset` argument and
-        returns a `Dataset`.
+          returns a `Dataset`.
 
     Returns:
-      The `Dataset` returned by applying `transformation_func` to this dataset.
+      Dataset: The `Dataset` returned by applying `transformation_func` to this
+          dataset.
     """
     dataset = transformation_func(self)
     if not isinstance(dataset, Dataset):
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index e387e35740..e90ce3fb40 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -266,7 +266,7 @@ def map_structure(func, *structure, **check_types_dict):
   and the return value will contain the results in the same structure.
 
   Args:
-    func: A callable that acceps as many arguments are there are structures.
+    func: A callable that accepts as many arguments are there are structures.
     *structure: scalar, or tuple or list of constructed scalars and/or other
       tuples/lists, or scalars.  Note: numpy arrays are considered scalars.
     **check_types_dict: only valid keyword argument is `check_types`. If set to
@@ -479,8 +479,8 @@ def map_structure_up_to(shallow_tree, func, *inputs):
   The `inputs`, can be thought of as having the same structure as
   `shallow_tree`, but with leaf nodes that are themselves tree structures.
 
-  This function therefore will return something with the same base structure as
-  `shallow_tree`.
+  This function, therefore, will return something with the same base structure
+  as `shallow_tree`.
 
   Examples:
 
diff --git a/tensorflow/python/data/util/sparse.py b/tensorflow/python/data/util/sparse.py
index 5ebcb4ea81..5e6d224709 100644
--- a/tensorflow/python/data/util/sparse.py
+++ b/tensorflow/python/data/util/sparse.py
@@ -141,7 +141,7 @@ def serialize_sparse_tensors(tensors):
     tensors: a tensor structure to serialize.
 
   Returns:
-    `tensors` with any sparse tensors replaced by the their serialized version.
+    `tensors` with any sparse tensors replaced by their serialized version.
   """
 
   ret = nest.pack_sequence_as(tensors, [
diff --git a/tensorflow/python/debug/cli/tensor_format.py b/tensorflow/python/debug/cli/tensor_format.py
index d4aea76d65..e0759a8bc1 100644
--- a/tensorflow/python/debug/cli/tensor_format.py
+++ b/tensorflow/python/debug/cli/tensor_format.py
@@ -535,7 +535,7 @@ def numeric_summary(tensor):
   if not isinstance(tensor, np.ndarray) or not np.size(tensor):
     return debugger_cli_common.RichTextLines([
         "No numeric summary available due to empty tensor."])
-  elif (np.issubdtype(tensor.dtype, np.float) or
+  elif (np.issubdtype(tensor.dtype, np.floating) or
         np.issubdtype(tensor.dtype, np.complex) or
         np.issubdtype(tensor.dtype, np.integer)):
     counts = [
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index c4b13a1045..8d355aa27f 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -222,7 +222,7 @@ def has_inf_or_nan(datum, tensor):
     # Also return False for data types that cannot be represented as numpy
     # arrays.
     return False
-  elif (np.issubdtype(tensor.dtype, np.float) or
+  elif (np.issubdtype(tensor.dtype, np.floating) or
         np.issubdtype(tensor.dtype, np.complex) or
         np.issubdtype(tensor.dtype, np.integer)):
     return np.any(np.isnan(tensor)) or np.any(np.isinf(tensor))
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 2f1654dda4..988442c971 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -153,7 +153,7 @@ def inf_nan_callback(op_type,
       continue
 
     numpy_dtype = output.dtype.as_numpy_dtype
-    if (np.issubdtype(numpy_dtype, np.float) or
+    if (np.issubdtype(numpy_dtype, np.floating) or
         np.issubdtype(numpy_dtype, np.complex) or
         np.issubdtype(numpy_dtype, np.integer)):
       try:
diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index 2bdec69303..706575985f 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -877,7 +877,7 @@ class BaseDNNWarmStartingTest(object):
 
     # Create a second DNNClassifier, warm-started from the first.  Use a
     # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).  Use a a new FeatureColumn with a
+    # accumulator values that change).  Use a new FeatureColumn with a
     # different vocabulary for occupation.
     new_vocab_list = ['doctor', 'consultant', 'engineer']
     new_vocab_file = os.path.join(self._ckpt_and_vocab_dir,
diff --git a/tensorflow/python/estimator/canned/linear_testing_utils.py b/tensorflow/python/estimator/canned/linear_testing_utils.py
index cccb9af4b2..3e9183cf1b 100644
--- a/tensorflow/python/estimator/canned/linear_testing_utils.py
+++ b/tensorflow/python/estimator/canned/linear_testing_utils.py
@@ -2003,7 +2003,7 @@ class BaseLinearWarmStartingTest(object):
 
     # Create a second LinearClassifier, warm-started from the first.  Use a
     # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).  Use a a new FeatureColumn with a
+    # accumulator values that change).  Use a new FeatureColumn with a
     # different vocabulary for occupation.
     new_vocab_list = ['doctor', 'consultant', 'engineer']
     new_vocab_file = os.path.join(self._ckpt_and_vocab_dir,
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 6da890cd22..17fab3df4d 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -55,6 +55,7 @@ from tensorflow.python.training import saver
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
+from tensorflow.python.util import compat_internal
 from tensorflow.python.util import nest
 
 
@@ -179,7 +180,7 @@ class Estimator(object):
       self._config = config
 
     # Model directory.
-    model_dir = compat.path_to_str(model_dir)
+    model_dir = compat_internal.path_to_str(model_dir)
     if (model_dir is not None) and (self._config.model_dir is not None):
       if model_dir != self._config.model_dir:
         # TODO(alanyee): remove this suppression after it is no longer needed
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 61a537022b..0c636a8da1 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -27,7 +27,7 @@ import six
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
-from tensorflow.python.util import compat
+from tensorflow.python.util import compat_internal
 
 
 _USE_DEFAULT = object()
@@ -444,7 +444,8 @@ class RunConfig(object):
     if tf_config:
       logging.info('TF_CONFIG environment variable: %s', tf_config)
 
-    model_dir = _get_model_dir(tf_config, compat.path_to_str(model_dir))
+    model_dir = _get_model_dir(tf_config,
+                               compat_internal.path_to_str(model_dir))
 
     RunConfig._replace(
         self,
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index b2ad4c4b65..2ee0732775 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -563,7 +563,7 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class Conv3DTranspose(tf_convolutional_layers.Conv3D, Layer):
+class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index c87b7652ad..3a6058054b 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1601,6 +1601,19 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "manip_ops_test",
+    size = "small",
+    srcs = ["manip_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:manip_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+    tags = ["no_windows_gpu"],
+)
+
 cuda_py_test(
     name = "matmul_op_test",
     size = "small",
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index aae6d0a36e..7ec4624310 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -1162,6 +1162,27 @@ class InvertPermutationTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(y.eval(), [2, 4, 3, 0, 1])
 
 
+class UnravelIndexTest(test_util.TensorFlowTestCase):
+
+  def testUnravelIndex(self):
+    with self.test_session():
+      for dtype in [dtypes.int32, dtypes.int64]:
+        indices_1 = constant_op.constant(1621, dtype=dtype)
+        dims_1 = constant_op.constant([6, 7, 8, 9], dtype=dtype)
+        out_1 = array_ops.unravel_index(indices_1, dims_1)
+        self.assertAllEqual(out_1.eval(), [3, 1, 4, 1])
+
+        indices_2 = constant_op.constant([1621], dtype=dtype)
+        dims_2 = constant_op.constant([6, 7, 8, 9], dtype=dtype)
+        out_2 = array_ops.unravel_index(indices_2, dims_2)
+        self.assertAllEqual(out_2.eval(), [[3], [1], [4], [1]])
+
+        indices_3 = constant_op.constant([22, 41, 37], dtype=dtype)
+        dims_3 = constant_op.constant([7, 6], dtype=dtype)
+        out_3 = array_ops.unravel_index(indices_3, dims_3)
+        self.assertAllEqual(out_3.eval(), [[3, 6, 6], [4, 5, 1]])
+
+
 class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
 
   def testSimple(self):
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 030c690167..16e56349c4 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -454,18 +454,19 @@ class ZerosLikeTest(test.TestCase):
 
   def testZerosLikeCPU(self):
     for dtype in [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int8,
-        dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16, dtypes_lib.int32,
-        dtypes_lib.int64, dtypes_lib.bool, dtypes_lib.complex64,
-        dtypes_lib.complex128, dtypes_lib.string
+        dtypes_lib.half, dtypes_lib.float32, dtypes_lib.float64,
+        dtypes_lib.int8, dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16,
+        dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.bool,
+        dtypes_lib.complex64, dtypes_lib.complex128, dtypes_lib.string
     ]:
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=False)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=False)
 
   def testZerosLikeGPU(self):
     for dtype in [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int32,
-        dtypes_lib.bool, dtypes_lib.int64, dtypes_lib.string
+        dtypes_lib.half, dtypes_lib.float32, dtypes_lib.float64,
+        dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.complex64,
+        dtypes_lib.complex128, dtypes_lib.bool
     ]:
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=True)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=True)
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 3e9bd3dade..edfb20d6a2 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -24,6 +24,7 @@ import time
 
 import numpy as np
 
+from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib import layers
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
@@ -519,7 +520,7 @@ class Conv2DTest(test.TestCase):
           dilations=[2, 2],
           padding="VALID")
 
-  # TODO this currently fails.
+  # TODO(yzhwang): this currently fails.
   # self._VerifyValues(tensor_in_sizes=[1, 8, 8, 1],
   #                   filter_in_sizes=[2, 2, 1, 1],
   #                   strides=[4, 4], padding="SAME",
diff --git a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
index ead55cd03b..89fd26c544 100644
--- a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 import time
 
+from six.moves import xrange
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/python/kernel_tests/io_ops_test.py b/tensorflow/python/kernel_tests/io_ops_test.py
index f91875c6f0..61944f7e31 100644
--- a/tensorflow/python/kernel_tests/io_ops_test.py
+++ b/tensorflow/python/kernel_tests/io_ops_test.py
@@ -1,4 +1,4 @@
-﻿# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index 00c6706593..197dbf44af 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -953,14 +953,14 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     # Compute the expected loss 'manually'.
     total = np.zeros((batch_size,))
     for b in range(batch_size):
-      for i in range(dims):
-        for j in range(dims):
+      for i in range(dims - 1):
+        for j in range(i + 1, dims):
           x = self._predictions[b, i].item() - self._predictions[b, j].item()
           y = self._labels[b, i].item() - self._labels[b, j].item()
           diff = (x - y)
           total[b] += (diff * diff)
 
-    self._expected_losses = np.divide(total, 9.0)
+    self._expected_losses = np.divide(total, 3.0)
 
   def testValueErrorThrownWhenWeightIsNone(self):
     with self.test_session():
@@ -1059,8 +1059,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         [[4, 8, 12], [1, 2, 3], [4, 5, 6]],
         [[8, 1, 3], [7, 8, 9], [10, 11, 12]],
     ])
-    self._test_valid_weights(
-        labels, predictions, expected_loss=122.22222)
+    self._test_valid_weights(labels, predictions, expected_loss=137.5)
 
   def test3dWeightedScalar(self):
     labels = np.array([
@@ -1073,8 +1072,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     ])
     weight = 3.0
     self._test_valid_weights(
-        labels, predictions, expected_loss=weight * 122.22222,
-        weights=weight)
+        labels, predictions, expected_loss=weight * 137.5, weights=weight)
 
   def _test_invalid_weights(
       self, labels, predictions, weights=1.0):
@@ -1124,7 +1122,9 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     ])
     self._test_valid_weights(
         # TODO(ptucker): This doesn't look right.
-        labels, predictions, expected_loss=9 * 122.22222,
+        labels,
+        predictions,
+        expected_loss=9 * 137.5,
         weights=np.ones((2, 3, 3)))
 
   def testLossWithAllZeroBatchSpecificWeights(self):
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
new file mode 100644
index 0000000000..b8200ac0cb
--- /dev/null
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -0,0 +1,138 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for manip_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import manip_ops
+from tensorflow.python.platform import test as test_lib
+
+# pylint: disable=g-import-not-at-top
+try:
+  from distutils.version import StrictVersion as Version
+  # numpy.roll for multiple shifts was introduced in numpy version 1.12.0
+  NP_ROLL_CAN_MULTISHIFT = Version(np.version.version) >= Version("1.12.0")
+except ImportError:
+  NP_ROLL_CAN_MULTISHIFT = False
+# pylint: enable=g-import-not-at-top
+
+
+class RollTest(test_util.TensorFlowTestCase):
+
+  def _testRoll(self, np_input, shift, axis):
+    expected_roll = np.roll(np_input, shift, axis)
+    with self.test_session():
+      roll = manip_ops.roll(np_input, shift, axis)
+      self.assertAllEqual(roll.eval(), expected_roll)
+
+  def _testGradient(self, np_input, shift, axis):
+    with self.test_session():
+      inx = constant_op.constant(np_input.tolist())
+      xs = list(np_input.shape)
+      y = manip_ops.roll(inx, shift, axis)
+      # Expected y's shape to be the same
+      ys = xs
+      jacob_t, jacob_n = gradient_checker.compute_gradient(
+          inx, xs, y, ys, x_init_value=np_input)
+      self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5)
+
+  def _testAll(self, np_input, shift, axis):
+    self._testRoll(np_input, shift, axis)
+    if np_input.dtype == np.float32:
+      self._testGradient(np_input, shift, axis)
+
+  def testIntTypes(self):
+    for t in [np.int32, np.int64]:
+      self._testAll(np.random.randint(-100, 100, (5)).astype(t), 3, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        self._testAll(
+            np.random.randint(-100, 100, (4, 4, 3)).astype(t), [1, -2, 3],
+            [0, 1, 2])
+        self._testAll(
+            np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t), [0, 1, -2],
+            [1, 2, 3])
+
+  def testFloatTypes(self):
+    for t in [np.float32, np.float64]:
+      self._testAll(np.random.rand(5).astype(t), 2, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        self._testAll(np.random.rand(3, 4).astype(t), [1, 2], [1, 0])
+        self._testAll(np.random.rand(1, 3, 4).astype(t), [1, 0, -3], [0, 1, 2])
+
+  def testComplexTypes(self):
+    for t in [np.complex64, np.complex128]:
+      x = np.random.rand(4, 4).astype(t)
+      self._testAll(x + 1j * x, 2, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        x = np.random.rand(2, 5).astype(t)
+        self._testAll(x + 1j * x, [1, 2], [1, 0])
+        x = np.random.rand(3, 2, 1, 1).astype(t)
+        self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
+
+  def testRollInputMustVectorHigherRaises(self):
+    tensor = 7
+    shift = 1
+    axis = 0
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "input must be 1-D or higher"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollAxisMustBeScalarOrVectorRaises(self):
+    tensor = [[1, 2], [3, 4]]
+    shift = 1
+    axis = [[0, 1]]
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "axis must be a scalar or a 1-D vector"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollShiftMustBeScalarOrVectorRaises(self):
+    tensor = [[1, 2], [3, 4]]
+    shift = [[0, 1]]
+    axis = 1
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "shift must be a scalar or a 1-D vector"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollShiftAndAxisMustBeSameSizeRaises(self):
+    tensor = [[1, 2], [3, 4]]
+    shift = [1]
+    axis = [0, 1]
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "shift and axis must have the same size"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollAxisOutOfRangeRaises(self):
+    tensor = [1, 2]
+    shift = 1
+    axis = 1
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "is out of range"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 0c77d1db92..daa42938e6 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -23,6 +23,7 @@ import timeit
 
 import numpy as np
 
+from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py
index f1670a47f5..8ad29afd0a 100644
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@@ -66,7 +66,7 @@ class TensordotTest(test_lib.TestCase):
     a = [[1, 2], [3, 4]]
     b = [[1, 2], [3, 4]]
     # Invalid static axes.
-    for axes_value in -1, 0, [1], [[1]], [[1], [0, 1]]:
+    for axes_value in -1, 3, [1], [[1]], [[1], [0, 1]]:
       with self.assertRaises(ValueError):
         math_ops.tensordot(a, b, axes_value)
 
@@ -91,7 +91,7 @@ class TensordotTest(test_lib.TestCase):
 
   # Test case for 11950
   def test_valid_axis(self):
-    for axes_value in [1, 2], [[1], [2]]:
+    for axes_value in [1, 2], [[1], [2]], [[], []], 0:
       with self.test_session() as sess:
         np_a = np.ones((3, 3))
         np_b = np.array([2, 3, 1])[None, None]
@@ -105,29 +105,29 @@ class TensordotTest(test_lib.TestCase):
         self.assertAllEqual(tf_ans, np_ans)
 
   def test_partial_shape_inference(self):
-    a = array_ops.placeholder(dtypes.float32)
-    b = array_ops.placeholder(dtypes.float32)
-    axes = ([1], [0])
-    output = math_ops.tensordot(a, b, axes)
-    self.assertEqual(output.get_shape().ndims, None)
-    a.set_shape([None, 2])
-    b.set_shape([2, 3])
-    output = math_ops.tensordot(a, b, axes)
-    output_shape = output.get_shape()
-    self.assertEqual(output_shape.ndims, 2)
-    output_shape = output_shape.as_list()
-    self.assertEqual(output_shape[0], None)
-    self.assertEqual(output_shape[1], 3)
-    a = array_ops.placeholder(dtypes.float32)
-    b = array_ops.placeholder(dtypes.float32)
-    a.set_shape([2, 2])
-    b.set_shape([2, None])
-    output = math_ops.tensordot(a, b, axes)
-    output_shape = output.get_shape()
-    self.assertEqual(output_shape.ndims, 2)
-    output_shape = output_shape.as_list()
-    self.assertEqual(output_shape[0], 2)
-    self.assertEqual(output_shape[1], None)
+    for axes in ([1], [0]), 1:
+      a = array_ops.placeholder(dtypes.float32)
+      b = array_ops.placeholder(dtypes.float32)
+      output = math_ops.tensordot(a, b, axes)
+      self.assertEqual(output.get_shape().ndims, None)
+      a.set_shape([None, 2])
+      b.set_shape([2, 3])
+      output = math_ops.tensordot(a, b, axes)
+      output_shape = output.get_shape()
+      self.assertEqual(output_shape.ndims, 2)
+      output_shape = output_shape.as_list()
+      self.assertEqual(output_shape[0], None)
+      self.assertEqual(output_shape[1], 3)
+      a = array_ops.placeholder(dtypes.float32)
+      b = array_ops.placeholder(dtypes.float32)
+      a.set_shape([2, 2])
+      b.set_shape([2, None])
+      output = math_ops.tensordot(a, b, axes)
+      output_shape = output.get_shape()
+      self.assertEqual(output_shape.ndims, 2)
+      output_shape = output_shape.as_list()
+      self.assertEqual(output_shape[0], 2)
+      self.assertEqual(output_shape[1], None)
 
 
 def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
@@ -196,8 +196,8 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
         low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype_)
     b_np = np.random.uniform(
         low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype_)
-    all_axes = [1]
-    if a_np.ndim > 1:
+    all_axes = [0, 1]
+    if a_np.ndim > 2:
       all_axes.append(a_np.ndim - 1)
     for axes in all_axes:
       np_ans = np.tensordot(a_np, b_np, axes=axes)
diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py
index efb5b9f364..6ab931fdb9 100644
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@@ -58,7 +58,7 @@ class TopKTest(test.TestCase):
         # Do some special casing of equality of indices: if indices
         # are not the same, but values are floating type, ensure that
         # the values are within epsilon of each other.
-        if not np.issubdtype(np_expected_values.dtype, np.float):
+        if not np.issubdtype(np_expected_values.dtype, np.floating):
           # Values are not floating point type; check indices exactly
           self.assertAllEqual(np_expected_indices, indices)
         else:
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 79c421f4c9..e8dba3cea3 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -1094,7 +1094,7 @@ class SeparableConv1D(_SeparableConv):
       strides = (1, 1, 1) + self.strides
       spatial_start_dim = 2
 
-    # Explictly broadcast inputs and kernels to 4D.
+    # Explicitly broadcast inputs and kernels to 4D.
     # TODO(fchollet): refactor when a native separable_conv1d op is available.
     inputs = array_ops.expand_dims(inputs, spatial_start_dim)
     depthwise_kernel = array_ops.expand_dims(self.depthwise_kernel, 0)
@@ -1904,6 +1904,7 @@ class Conv3DTranspose(Conv3D):
           dtype=self.dtype)
     else:
       self.bias = None
+    self.built = True
 
   def call(self, inputs):
     inputs_shape = array_ops.shape(inputs)
@@ -1974,6 +1975,8 @@ class Conv3DTranspose(Conv3D):
 
     if self.use_bias:
       outputs_shape = outputs.shape.as_list()
+      if outputs_shape[0] is None:
+        outputs_shape[0] = -1
       if self.data_format == 'channels_first':
         outputs_4d = array_ops.reshape(outputs, [
             outputs_shape[0], outputs_shape[1],
@@ -2007,11 +2010,11 @@ class Conv3DTranspose(Conv3D):
 
     output_shape[c_axis] = self.filters
     output_shape[d_axis] = utils.deconv_output_length(
-        output_shape[d_axis], stride_d, kernel_d, self.padding)
+        output_shape[d_axis], kernel_d, self.padding, stride_d)
     output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], stride_h, kernel_h, self.padding)
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
     output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], stride_w, kernel_w, self.padding)
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
     return tensor_shape.TensorShape(output_shape)
 
 
diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py
index e8be347799..7407d9a7b3 100644
--- a/tensorflow/python/layers/utils.py
+++ b/tensorflow/python/layers/utils.py
@@ -81,7 +81,7 @@ def normalize_tuple(value, n, name):
     for single_value in value_tuple:
       try:
         int(single_value)
-      except ValueError:
+      except (ValueError, TypeError):
         raise ValueError('The `' + name + '` argument must be a tuple of ' +
                          str(n) + ' integers. Received: ' + str(value) + ' '
                          'including element ' + str(single_value) + ' of type' +
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index e3902f5a8a..ad409ad7e5 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -35,6 +35,7 @@ See the @{$python/array_ops} guide.
 @@reshape
 @@squeeze
 @@expand_dims
+@@unravel_index
 @@meshgrid
 @@slice
 @@strided_slice
@@ -1589,9 +1590,9 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
 
   Args:
     tensor: A `Tensor`.
-    dtype: A type for the returned `Tensor`. Must be `float32`, `float64`,
-      `int8`, `uint8`, `int16`, `uint16`, int32`, `int64`,
-      `complex64`, `complex128` or `bool`.
+    dtype: A type for the returned `Tensor`. Must be `float16`, `float32`,
+      `float64`, `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
+      `complex64`, `complex128`, `bool` or `string`.
     name: A name for the operation (optional).
     optimize: if true, attempt to statically determine the shape of 'tensor'
     and encode it as a constant.
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 7dbccf1caf..ac03d30fcd 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -458,7 +458,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
 
   For example, if `elems` is `(t1, [t2, t3])` and `initializer` is
   `[i1, i2]` then an appropriate signature for `fn` in `python2` is:
-  `fn = lambda (acc_p1, acc_p2), (t1 [t2, t3]):` and `fn` must return a list,
+  `fn = lambda (acc_p1, acc_p2), (t1, [t2, t3]):` and `fn` must return a list,
   `[acc_n1, acc_n2]`.  An alternative correct signature for `fn`, and the
    one that works in `python3`, is:
   `fn = lambda a, t:`, where `a` and `t` correspond to the input tuples.
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 28b26a09a5..9f06c0ee1f 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import image_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import logging_ops  # pylint: disable=unused-import
+from tensorflow.python.ops import manip_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index 3b0b5a978c..de12c5f63f 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -49,6 +49,10 @@ See the @{$python/image} guide.
 @@grayscale_to_rgb
 @@hsv_to_rgb
 @@rgb_to_hsv
+@@rgb_to_yiq
+@@yiq_to_rgb
+@@rgb_to_yuv
+@@yuv_to_rgb
 @@convert_image_dtype
 @@adjust_brightness
 @@random_brightness
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 2c231ef56c..14a38f25d1 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1508,7 +1508,7 @@ def sample_distorted_bounding_box(image_size,
                                   bounding_boxes,
                                   seed=None,
                                   seed2=None,
-                                  min_object_covered=None,
+                                  min_object_covered=0.1,
                                   aspect_ratio_range=None,
                                   area_range=None,
                                   max_attempts=None,
@@ -1669,3 +1669,107 @@ def non_max_suppression(boxes,
     return gen_image_ops._non_max_suppression_v2(boxes, scores, max_output_size,
                                                  iou_threshold)
     # pylint: enable=protected-access
+
+
+_rgb_to_yiq_kernel = [[0.299, 0.59590059,
+                       0.2115], [0.587, -0.27455667, -0.52273617],
+                      [0.114, -0.32134392, 0.31119955]]
+
+
+def rgb_to_yiq(images):
+  """Converts one or more images from RGB to YIQ.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the YIQ
+  value of the pixels.
+  The output is only well defined if the value in images are in [0,1].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _rgb_to_yiq_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims - 1], [0]])
+
+
+_yiq_to_rgb_kernel = [[1, 1, 1], [0.95598634, -0.27201283, -1.10674021],
+                      [0.6208248, -0.64720424, 1.70423049]]
+
+
+def yiq_to_rgb(images):
+  """Converts one or more images from YIQ to RGB.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+  value of the pixels.
+  The output is only well defined if the Y value in images are in [0,1],
+  I value are in [-0.5957,0.5957] and Q value are in [-0.5226,0.5226].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _yiq_to_rgb_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims - 1], [0]])
+
+
+_rgb_to_yuv_kernel = [[0.299, -0.14714119,
+                       0.61497538], [0.587, -0.28886916, -0.51496512],
+                      [0.114, 0.43601035, -0.10001026]]
+
+
+def rgb_to_yuv(images):
+  """Converts one or more images from RGB to YUV.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the YUV
+  value of the pixels.
+  The output is only well defined if the value in images are in [0,1].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _rgb_to_yuv_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims - 1], [0]])
+
+
+_yuv_to_rgb_kernel = [[1, 1, 1], [0, -0.394642334, 2.03206185],
+                      [1.13988303, -0.58062185, 0]]
+
+
+def yuv_to_rgb(images):
+  """Converts one or more images from YUV to RGB.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+  value of the pixels.
+  The output is only well defined if the Y value in images are in [0,1],
+  U and V value are in [-0.5,0.5].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _yuv_to_rgb_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims - 1], [0]])
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 47dd8231c0..b12bd3d5b0 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -85,6 +85,64 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
       self.assertAllClose(rgb_tf, rgb_np)
 
 
+class RGBToYIQTest(test_util.TensorFlowTestCase):
+
+  def testBatch(self):
+    # Build an arbitrary RGB image
+    np.random.seed(7)
+    batch_size = 5
+    shape = (batch_size, 2, 7, 3)
+
+    for nptype in [np.float32, np.float64]:
+      inp = np.random.rand(*shape).astype(nptype)
+
+      # Convert to YIQ and back, as a batch and individually
+      with self.test_session(use_gpu=True) as sess:
+        batch0 = constant_op.constant(inp)
+        batch1 = image_ops.rgb_to_yiq(batch0)
+        batch2 = image_ops.yiq_to_rgb(batch1)
+        split0 = array_ops.unstack(batch0)
+        split1 = list(map(image_ops.rgb_to_yiq, split0))
+        split2 = list(map(image_ops.yiq_to_rgb, split1))
+        join1 = array_ops.stack(split1)
+        join2 = array_ops.stack(split2)
+        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+
+      # Verify that processing batch elements together is the same as separate
+      self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, join2, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, inp, rtol=1e-4, atol=1e-4)
+
+
+class RGBToYUVTest(test_util.TensorFlowTestCase):
+
+  def testBatch(self):
+    # Build an arbitrary RGB image
+    np.random.seed(7)
+    batch_size = 5
+    shape = (batch_size, 2, 7, 3)
+
+    for nptype in [np.float32, np.float64]:
+      inp = np.random.rand(*shape).astype(nptype)
+
+      # Convert to YUV and back, as a batch and individually
+      with self.test_session(use_gpu=True) as sess:
+        batch0 = constant_op.constant(inp)
+        batch1 = image_ops.rgb_to_yuv(batch0)
+        batch2 = image_ops.yuv_to_rgb(batch1)
+        split0 = array_ops.unstack(batch0)
+        split1 = list(map(image_ops.rgb_to_yuv, split0))
+        split2 = list(map(image_ops.yuv_to_rgb, split1))
+        join1 = array_ops.stack(split1)
+        join2 = array_ops.stack(split2)
+        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+
+      # Verify that processing batch elements together is the same as separate
+      self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, join2, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, inp, rtol=1e-4, atol=1e-4)
+
+
 class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
 
   def _RGBToGrayscale(self, images):
@@ -1839,6 +1897,26 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
 
+  def testDefaultMinObjectCovered(self):
+    # By default min_object_covered=0.1 if not provided
+    with self.test_session(use_gpu=True):
+      image_size = constant_op.constant(
+          [40, 50, 1], shape=[3], dtype=dtypes.int32)
+      bounding_box = constant_op.constant(
+          [0.0, 0.0, 1.0, 1.0],
+          shape=[4],
+          dtype=dtypes.float32,
+      )
+      begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
+          image_size=image_size,
+          bounding_boxes=bounding_box,
+          aspect_ratio_range=(0.75, 1.33),
+          area_range=(0.05, 1.0))
+
+      self.assertAllEqual([3], begin.get_shape().as_list())
+      self.assertAllEqual([3], end.get_shape().as_list())
+      self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
+
 
 class ResizeImagesTest(test_util.TensorFlowTestCase):
 
@@ -3092,6 +3170,40 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
           boxes, scores, max_output_size, iou_threshold).eval()
       self.assertAllClose(selected_indices, [3, 0, 5])
 
+  def testInvalidShape(self):
+    # The boxes should be 2D of shape [num_boxes, 4].
+    with self.assertRaisesRegexp(ValueError,
+                                 "Shape must be rank 2 but is rank 1"):
+      boxes = constant_op.constant([0.0, 0.0, 1.0, 1.0])
+      scores = constant_op.constant([0.9])
+      image_ops.non_max_suppression(boxes, scores, 3, 0.5)
+
+    with self.assertRaisesRegexp(ValueError, "Dimension must be 4 but is 3"):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      image_ops.non_max_suppression(boxes, scores, 3, 0.5)
+
+    # The scores should be 1D of shape [num_boxes].
+    with self.assertRaisesRegexp(ValueError,
+                                 "Shape must be rank 1 but is rank 2"):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([[0.9]])
+      image_ops.non_max_suppression(boxes, scores, 3, 0.5)
+
+    # The max_output_size should be a scaler (0-D).
+    with self.assertRaisesRegexp(ValueError,
+                                 "Shape must be rank 0 but is rank 1"):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      image_ops.non_max_suppression(boxes, scores, [3], 0.5)
+
+    # The iou_threshold should be a scaler (0-D).
+    with self.assertRaisesRegexp(ValueError,
+                                 "Shape must be rank 0 but is rank 2"):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      image_ops.non_max_suppression(boxes, scores, 3, [[0.5]])
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py
index 13a32c83d9..3cbbf3412a 100644
--- a/tensorflow/python/ops/linalg_grad.py
+++ b/tensorflow/python/ops/linalg_grad.py
@@ -277,20 +277,28 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
   # https://j-towns.github.io/papers/svd-derivative.pdf
   a = op.inputs[0]
   a_shape = a.get_shape().with_rank_at_least(2)
+  grad_s_mat = array_ops.matrix_diag(grad_s)
 
-  if op.get_attr("compute_uv"):
-    # TODO(rmlarsen): Make this work with complex types.
-    if a.dtype.is_complex:
-      raise NotImplementedError(
-          "SVD gradient is not implemented for complex types and "
-          "compute_uv=True.")
-    grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
-    grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
-    m = a_shape[-2].merge_with(grad_u_shape[-2])
-    n = a_shape[-1].merge_with(grad_v_shape[-2])
-    batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
-        grad_v_shape[:-2])
-    a_shape = batch_shape.concatenate([m, n])
+  if not op.get_attr("compute_uv"):
+    s, u, v = linalg_ops.svd(a, compute_uv=True)
+    grad_a = math_ops.matmul(u, math_ops.matmul(grad_s_mat, v, adjoint_b=True))
+    grad_a.set_shape(a_shape)
+    return grad_a
+
+  full_matrices = op.get_attr("full_matrices")
+
+  # TODO(rmlarsen): Make this work with complex types.
+  if a.dtype.is_complex:
+    raise NotImplementedError(
+        "SVD gradient is not implemented for complex types and "
+        "compute_uv=True.")
+  grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
+  grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
+  m = a_shape[-2].merge_with(grad_u_shape[-2])
+  n = a_shape[-1].merge_with(grad_v_shape[-2])
+  batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
+      grad_v_shape[:-2])
+  a_shape = batch_shape.concatenate([m, n])
 
   m = a_shape[-2].value
   n = a_shape[-1].value
@@ -300,12 +308,9 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
         "SVD gradient has not been implemented for input with unknown "
         "inner matrix shape.")
 
-  if not op.get_attr("compute_uv"):
-    s, u, v = linalg_ops.svd(a, compute_uv=True, full_matrices=True)
-  else:
-    s = op.outputs[0]
-    u = op.outputs[1]
-    v = op.outputs[2]
+  s = op.outputs[0]
+  u = op.outputs[1]
+  v = op.outputs[2]
 
   use_adjoint = False
   if m > n:
@@ -317,19 +322,7 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
     grad_u, grad_v = grad_v, grad_u
 
   with ops.control_dependencies([grad_s, grad_u, grad_v]):
-    grad_s_mat = array_ops.matrix_diag(grad_s)
-    if not op.get_attr("compute_uv"):
-      if use_adjoint:
-        grad_a = math_ops.matmul(
-            v[..., :, :m], math_ops.matmul(u, grad_s_mat), adjoint_b=True)
-      else:
-        grad_a = math_ops.matmul(u,
-                                 math_ops.matmul(
-                                     grad_s_mat, v[..., :, :m], adjoint_b=True))
-      grad_a.set_shape(a_shape)
-      return grad_a
-
-    if op.get_attr("full_matrices") and abs(m - n) > 1:
+    if full_matrices and abs(m - n) > 1:
       raise NotImplementedError(
           "svd gradient is not implemented for abs(m - n) > 1 "
           "when full_matrices is True")
@@ -371,7 +364,7 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
       gv1t_v1 = math_ops.matmul(gv1t, v1)
       term2_nous = gv1t - math_ops.matmul(gv1t_v1, v1, adjoint_b=True)
 
-      if op.get_attr("full_matrices"):
+      if full_matrices:
         v2 = v[..., :, m:n]
         grad_v2 = grad_v[..., :, m:n]
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index e75a9b22e4..84afbf0627 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -547,12 +547,13 @@ def mean_pairwise_squared_error(
       num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
       term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
-                              num_present_per_batch)
+                              num_present_per_batch - 1)
 
       sum_diff = math_ops.reduce_sum(
           diffs, reduction_indices=reduction_indices, keep_dims=True)
-      term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
-                              math_ops.square(num_present_per_batch))
+      term2 = 2.0 * _safe_div(
+          math_ops.square(sum_diff),
+          math_ops.multiply(num_present_per_batch, num_present_per_batch - 1))
 
       weighted_losses = math_ops.multiply(term1 - term2, weights)
       loss = math_ops.reduce_sum(weighted_losses)
diff --git a/tensorflow/python/ops/manip_grad.py b/tensorflow/python/ops/manip_grad.py
new file mode 100644
index 0000000000..bb2069359d
--- /dev/null
+++ b/tensorflow/python/ops/manip_grad.py
@@ -0,0 +1,31 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gradients for operators defined in manip_ops.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import manip_ops
+
+
+@ops.RegisterGradient("Roll")
+def _RollGrad(op, grad):
+  # The gradient is just the roll reversed
+  shift = op.inputs[1]
+  axis = op.inputs[2]
+  roll_grad = manip_ops.roll(grad, -shift, axis)
+  return roll_grad, None, None
diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py
new file mode 100644
index 0000000000..91e15b47b9
--- /dev/null
+++ b/tensorflow/python/ops/manip_ops.py
@@ -0,0 +1,38 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operators for manipulating tensors.
+
+@@roll
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import gen_manip_ops as _gen_manip_ops
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+# pylint: disable=protected-access
+def roll(input, shift, axis):  # pylint: disable=redefined-builtin
+  return _gen_manip_ops.roll(input, shift, axis)
+
+
+roll.__doc__ = _gen_manip_ops.roll.__doc__
+# pylint: enable=protected-access
+
+_allowed_symbols = ['roll']
+
+remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 827e3caa36..9a8ac93de9 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -2826,10 +2826,14 @@ def tensordot(a, b, axes, name=None):
     """Generates two sets of contraction axes for the two tensor arguments."""
     a_shape = a.get_shape()
     if isinstance(axes, compat.integral_types):
-      if axes < 1:
-        raise ValueError("'axes' must be at least 1.")
+      if axes < 0:
+        raise ValueError("'axes' must be at least 0.")
       if a_shape.ndims is not None:
-        return range(a_shape.ndims - axes, a_shape.ndims), range(axes)
+        if axes > a_shape.ndims:
+          raise ValueError("'axes' must not be larger than the number of "
+                           "dimensions of tensor %s." % a)
+        return (list(xrange(a_shape.ndims - axes, a_shape.ndims)),
+                list(xrange(axes)))
       else:
         rank = array_ops.rank(a)
         return (range(rank - axes, rank, dtype=dtypes.int32),
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 24c6f64f0a..da80e72071 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -1127,6 +1127,12 @@ def raw_rnn(cell, loop_fn,
       def _copy_some_through(current, candidate):
         """Copy some tensors through via array_ops.where."""
         def copy_fn(cur_i, cand_i):
+          # TensorArray and scalar get passed through.
+          if isinstance(cur_i, tensor_array_ops.TensorArray):
+            return cand_i
+          if cur_i.shape.ndims == 0:
+            return cand_i
+          # Otherwise propagate the old or the new value.
           with ops.colocate_with(cand_i):
             return array_ops.where(elements_finished, cur_i, cand_i)
         return nest.map_structure(copy_fn, current, candidate)
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index 30bf4e4ef1..009d1dc3b9 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -25,6 +25,7 @@ import sys as _sys
 # Imports the following modules so that @RegisterGradient get executed.
 from tensorflow.python.ops import array_grad
 from tensorflow.python.ops import data_flow_grad
+from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import math_grad
 from tensorflow.python.ops import sparse_grad
 from tensorflow.python.ops import spectral_grad
@@ -42,11 +43,13 @@ from tensorflow.python.ops.special_math_ops import *
 # TODO(vrv): Switch to import * once we're okay with exposing the module.
 from tensorflow.python.ops.confusion_matrix import confusion_matrix
 from tensorflow.python.ops.control_flow_ops import Assert
+from tensorflow.python.ops.control_flow_ops import case
+from tensorflow.python.ops.control_flow_ops import cond
 from tensorflow.python.ops.control_flow_ops import group
 from tensorflow.python.ops.control_flow_ops import no_op
+# pylint: disable=redefined-builtin
 from tensorflow.python.ops.control_flow_ops import tuple
-from tensorflow.python.ops.control_flow_ops import cond
-from tensorflow.python.ops.control_flow_ops import case
+# pylint: enable=redefined-builtin
 from tensorflow.python.ops.control_flow_ops import while_loop
 from tensorflow.python.ops.data_flow_ops import *
 from tensorflow.python.ops.functional_ops import *
@@ -59,6 +62,7 @@ from tensorflow.python.ops.logging_ops import Print
 from tensorflow.python.ops.logging_ops import get_summary_op
 from tensorflow.python.ops.lookup_ops import initialize_all_tables
 from tensorflow.python.ops.lookup_ops import tables_initializer
+from tensorflow.python.ops.manip_ops import *
 from tensorflow.python.ops.math_ops import *
 from tensorflow.python.ops.numerics import *
 from tensorflow.python.ops.parsing_ops import *
@@ -105,6 +109,7 @@ from tensorflow.python.ops import init_ops as _init_ops
 from tensorflow.python.ops import io_ops as _io_ops
 from tensorflow.python.ops import linalg_ops as _linalg_ops
 from tensorflow.python.ops import logging_ops as _logging_ops
+from tensorflow.python.ops import manip_ops as _manip_ops
 from tensorflow.python.ops import math_ops as _math_ops
 from tensorflow.python.ops import numerics as _numerics
 from tensorflow.python.ops import parsing_ops as _parsing_ops
@@ -264,34 +269,36 @@ _allowed_symbols = (_allowed_symbols_array_ops +
                     _allowed_symbols_misc +
                     _allowed_symbols_partitioned_variables)
 
-remove_undocumented(__name__, _allowed_symbols,
-                    [_sys.modules[__name__],
-                     _array_ops,
-                     _check_ops,
-                     _clip_ops,
-                     _confusion_matrix,
-                     _control_flow_ops,
-                     _constant_op,
-                     _data_flow_ops,
-                     _functional_ops,
-                     _gradients,
-                     _histogram_ops,
-                     _init_ops,
-                     _io_ops,
-                     _linalg_ops,
-                     _logging_ops,
-                     _math_ops,
-                     _numerics,
-                     _parsing_ops,
-                     _partitioned_variables,
-                     _random_ops,
-                     _script_ops,
-                     _session_ops,
-                     _sparse_ops,
-                     _special_math_ops,
-                     _state_ops,
-                     _string_ops,
-                     _template,
-                     _tensor_array_ops,
-                     _variable_scope,
-                     _variables,])
+remove_undocumented(__name__, _allowed_symbols, [
+    _sys.modules[__name__],
+    _array_ops,
+    _check_ops,
+    _clip_ops,
+    _confusion_matrix,
+    _control_flow_ops,
+    _constant_op,
+    _data_flow_ops,
+    _functional_ops,
+    _gradients,
+    _histogram_ops,
+    _init_ops,
+    _io_ops,
+    _linalg_ops,
+    _logging_ops,
+    _manip_ops,
+    _math_ops,
+    _numerics,
+    _parsing_ops,
+    _partitioned_variables,
+    _random_ops,
+    _script_ops,
+    _session_ops,
+    _sparse_ops,
+    _special_math_ops,
+    _state_ops,
+    _string_ops,
+    _template,
+    _tensor_array_ops,
+    _variable_scope,
+    _variables,
+])
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index ddfd6be6da..bebf1d5e0d 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -235,13 +235,10 @@ def load(sess, tags, export_dir, **saver_kwargs):
     asset_tensors_dictionary = _get_asset_tensors(export_dir,
                                                   meta_graph_def_to_load)
 
-    main_op_tensor = _get_main_op_tensor(meta_graph_def_to_load)
+    main_op_tensor = (
+        _get_main_op_tensor(meta_graph_def_to_load) or
+        (_get_legacy_init_op_tensor(meta_graph_def_to_load)))
     if main_op_tensor is not None:
       sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
-    else:
-      legacy_init_op_tensor = _get_legacy_init_op_tensor(meta_graph_def_to_load)
-      if legacy_init_op_tensor is not None:
-        sess.run(
-            fetches=[legacy_init_op_tensor], feed_dict=asset_tensors_dictionary)
 
     return meta_graph_def_to_load
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index 0ddf09260b..affa97062a 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -72,7 +72,8 @@ def freeze_graph_with_def_protos(input_graph_def,
                                  variable_names_blacklist="",
                                  input_meta_graph_def=None,
                                  input_saved_model_dir=None,
-                                 saved_model_tags=None):
+                                 saved_model_tags=None,
+                                 checkpoint_version=saver_pb2.SaverDef.V2):
   """Converts all variables in a graph and checkpoint into constants."""
   del restore_op_name, filename_tensor_name  # Unused by updated loading code.
 
@@ -100,7 +101,8 @@ def freeze_graph_with_def_protos(input_graph_def,
     _ = importer.import_graph_def(input_graph_def, name="")
   with session.Session() as sess:
     if input_saver_def:
-      saver = saver_lib.Saver(saver_def=input_saver_def)
+      saver = saver_lib.Saver(
+          saver_def=input_saver_def, write_version=checkpoint_version)
       saver.restore(sess, input_checkpoint)
     elif input_meta_graph_def:
       restorer = saver_lib.import_meta_graph(
@@ -124,7 +126,8 @@ def freeze_graph_with_def_protos(input_graph_def,
           # 'global_step' or a similar housekeeping element) so skip it.
           continue
         var_list[key] = tensor
-      saver = saver_lib.Saver(var_list=var_list)
+      saver = saver_lib.Saver(
+          var_list=var_list, write_version=checkpoint_version)
       saver.restore(sess, input_checkpoint)
       if initializer_nodes:
         sess.run(initializer_nodes.split(","))
@@ -217,7 +220,8 @@ def freeze_graph(input_graph,
                  variable_names_blacklist="",
                  input_meta_graph=None,
                  input_saved_model_dir=None,
-                 saved_model_tags=tag_constants.SERVING):
+                 saved_model_tags=tag_constants.SERVING,
+                 checkpoint_version=saver_pb2.SaverDef.V2):
   """Converts all variables in a graph and checkpoint into constants."""
   input_graph_def = None
   if input_saved_model_dir:
@@ -233,10 +237,21 @@ def freeze_graph(input_graph,
   if input_saver:
     input_saver_def = _parse_input_saver_proto(input_saver, input_binary)
   freeze_graph_with_def_protos(
-      input_graph_def, input_saver_def, input_checkpoint, output_node_names,
-      restore_op_name, filename_tensor_name, output_graph, clear_devices,
-      initializer_nodes, variable_names_whitelist, variable_names_blacklist,
-      input_meta_graph_def, input_saved_model_dir, saved_model_tags.split(","))
+      input_graph_def,
+      input_saver_def,
+      input_checkpoint,
+      output_node_names,
+      restore_op_name,
+      filename_tensor_name,
+      output_graph,
+      clear_devices,
+      initializer_nodes,
+      variable_names_whitelist,
+      variable_names_blacklist,
+      input_meta_graph_def,
+      input_saved_model_dir,
+      saved_model_tags.split(","),
+      checkpoint_version=checkpoint_version)
 
 
 def main(unused_args):
@@ -246,7 +261,7 @@ def main(unused_args):
                FLAGS.output_graph, FLAGS.clear_devices, FLAGS.initializer_nodes,
                FLAGS.variable_names_whitelist, FLAGS.variable_names_blacklist,
                FLAGS.input_meta_graph, FLAGS.input_saved_model_dir,
-               FLAGS.saved_model_tags)
+               FLAGS.saved_model_tags, FLAGS.checkpoint_version)
 
 
 if __name__ == "__main__":
@@ -267,6 +282,11 @@ if __name__ == "__main__":
       type=str,
       default="",
       help="TensorFlow variables file to load.")
+  parser.add_argument(
+      "--checkpoint_version",
+      type=int,
+      default=saver_pb2.SaverDef.V2,
+      help="Tensorflow variable file format")
   parser.add_argument(
       "--output_graph",
       type=str,
diff --git a/tensorflow/python/tools/freeze_graph_test.py b/tensorflow/python/tools/freeze_graph_test.py
index feeed7102c..91f0061ebc 100644
--- a/tensorflow/python/tools/freeze_graph_test.py
+++ b/tensorflow/python/tools/freeze_graph_test.py
@@ -84,9 +84,19 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
     input_meta_graph = checkpoint_meta_graph_file
 
     freeze_graph.freeze_graph(
-        input_graph_path, input_saver_def_path, input_binary, checkpoint_path,
-        output_node_names, restore_op_name, filename_tensor_name,
-        output_graph_path, clear_devices, "", "", input_meta_graph)
+        input_graph_path,
+        input_saver_def_path,
+        input_binary,
+        checkpoint_path,
+        output_node_names,
+        restore_op_name,
+        filename_tensor_name,
+        output_graph_path,
+        clear_devices,
+        "",
+        "",
+        input_meta_graph,
+        checkpoint_version=saver_write_version)
 
     # Now we make sure the variable is now a constant, and that the graph still
     # produces the expected result.
diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py
index c2687bf557..9c19271222 100644
--- a/tensorflow/python/tools/optimize_for_inference_lib.py
+++ b/tensorflow/python/tools/optimize_for_inference_lib.py
@@ -349,6 +349,7 @@ def fold_batch_norms(input_graph_def):
     bias_add_op.op = "BiasAdd"
     bias_add_op.name = node.name
     bias_add_op.attr["T"].CopyFrom(conv_op.attr["T"])
+    bias_add_op.attr["data_format"].CopyFrom(conv_op.attr["data_format"])
     bias_add_op.input.extend([new_conv_op.name, offset_op.name])
     new_ops.extend([scaled_weights_op, new_conv_op, offset_op, bias_add_op])
 
diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py
index 7686bb0f14..084a4500f8 100644
--- a/tensorflow/python/tools/optimize_for_inference_test.py
+++ b/tensorflow/python/tools/optimize_for_inference_test.py
@@ -173,48 +173,56 @@ class OptimizeForInferenceTest(test.TestCase):
       self.assertNotEqual("BatchNormWithGlobalNormalization", node.op)
 
   def testFoldFusedBatchNorms(self):
-    with self.test_session() as sess:
-      inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
-      input_op = constant_op.constant(
-          np.array(inputs), shape=[1, 1, 6, 2], dtype=dtypes.float32)
-      weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
-      weights_op = constant_op.constant(
-          np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
-      conv_op = nn_ops.conv2d(
-          input_op, weights_op, [1, 1, 1, 1], padding="SAME", name="conv_op")
-      mean_op = constant_op.constant(
-          np.array([10, 20]), shape=[2], dtype=dtypes.float32)
-      variance_op = constant_op.constant(
-          np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32)
-      beta_op = constant_op.constant(
-          np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
-      gamma_op = constant_op.constant(
-          np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
-      ops.get_default_graph().graph_def_versions.producer = 9
-      gen_nn_ops._fused_batch_norm(
-          conv_op,
-          gamma_op,
-          beta_op,
-          mean_op,
-          variance_op,
-          0.00001,
-          is_training=False,
-          name="output")
-      original_graph_def = sess.graph_def
-      original_result = sess.run(["output:0"])
-    optimized_graph_def = optimize_for_inference_lib.fold_batch_norms(
-        original_graph_def)
-
-    with self.test_session() as sess:
-      _ = importer.import_graph_def(
-          optimized_graph_def, input_map={}, name="optimized")
-      optimized_result = sess.run(["optimized/output:0"])
-
-    self.assertAllClose(
-        original_result, optimized_result, rtol=1e-04, atol=1e-06)
-
-    for node in optimized_graph_def.node:
-      self.assertNotEqual("FusedBatchNorm", node.op)
+    for data_format, use_gpu in [("NHWC", False), ("NCHW", True)]:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
+        input_op = constant_op.constant(
+            np.array(inputs),
+            shape=[1, 1, 6, 2] if data_format == "NHWC" else [1, 2, 1, 6],
+            dtype=dtypes.float32)
+        weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
+        weights_op = constant_op.constant(
+            np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
+        conv_op = nn_ops.conv2d(
+            input_op,
+            weights_op, [1, 1, 1, 1],
+            padding="SAME",
+            data_format=data_format,
+            name="conv_op")
+        mean_op = constant_op.constant(
+            np.array([10, 20]), shape=[2], dtype=dtypes.float32)
+        variance_op = constant_op.constant(
+            np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32)
+        beta_op = constant_op.constant(
+            np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
+        gamma_op = constant_op.constant(
+            np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
+        ops.get_default_graph().graph_def_versions.producer = 9
+        gen_nn_ops._fused_batch_norm(
+            conv_op,
+            gamma_op,
+            beta_op,
+            mean_op,
+            variance_op,
+            0.00001,
+            is_training=False,
+            data_format=data_format,
+            name="output")
+        original_graph_def = sess.graph_def
+        original_result = sess.run(["output:0"])
+      optimized_graph_def = optimize_for_inference_lib.fold_batch_norms(
+          original_graph_def)
+
+      with self.test_session(use_gpu=use_gpu) as sess:
+        _ = importer.import_graph_def(
+            optimized_graph_def, input_map={}, name="optimized")
+        optimized_result = sess.run(["optimized/output:0"])
+
+      self.assertAllClose(
+          original_result, optimized_result, rtol=1e-04, atol=1e-06)
+
+      for node in optimized_graph_def.node:
+        self.assertNotEqual("FusedBatchNorm", node.op)
 
   def testFuseResizePadAndConv(self):
     with self.test_session() as sess:
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index 667a4b1db8..33f6debbcb 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -31,6 +31,7 @@ import warnings
 
 import numpy as np
 
+from six import integer_types
 from tensorflow.contrib.saved_model.python.saved_model import reader
 from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
 from tensorflow.core.example import example_pb2
@@ -440,7 +441,7 @@ def _create_example_string(example_dict):
     elif isinstance(feature_list[0], str):
       example.features.feature[feature_name].bytes_list.value.extend(
           feature_list)
-    elif isinstance(feature_list[0], (int, long)):
+    elif isinstance(feature_list[0], integer_types):
       example.features.feature[feature_name].int64_list.value.extend(
           feature_list)
     else:
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 17e07e171a..aae757b99a 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -336,7 +336,7 @@ class CheckpointSaverListener(object):
   `CheckpointSaverHook`, as in this example:
 
   ```python
-  class ExampleCheckpointSaverListerner(CheckpointSaverListener):
+  class ExampleCheckpointSaverListener(CheckpointSaverListener):
     def begin(self):
       # You can add ops to the graph here.
       print('Starting the session.')
@@ -352,7 +352,7 @@ class CheckpointSaverListener(object):
       print('Done with the session.')
 
   ...
-  listener = ExampleCheckpointSaverListerner()
+  listener = ExampleCheckpointSaverListener()
   saver_hook = tf.train.CheckpointSaverHook(
       checkpoint_dir, listeners=[listener])
   with tf.train.MonitoredTrainingSession(chief_only_hooks=[saver_hook]):
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 992184ec9e..bd9985a7c5 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -58,6 +58,8 @@ _restore_sparse = sparse_ops._take_many_sparse_from_tensors_map
 def match_filenames_once(pattern, name=None):
   """Save the list of files matching pattern, so it is only computed once.
 
+  NOTE: The order of the files returned can be non-deterministic.
+
   Args:
     pattern: A file pattern (glob), or 1D tensor of file patterns.
     name: A name for the operations (optional).
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 3888e9bba4..0c1c8e664b 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -1597,9 +1597,9 @@ class Saver(object):
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Returns:
-      A string: path prefix used for the checkpoint files.  If the saver is
-        sharded, this string ends with: '-?????-of-nnnnn' where 'nnnnn'
-        is the number of shards created.
+      A string: path prefix used for the checkpoint files. If checkpoint
+        format is V1 and the saver is sharded, this string ends with:
+         '-?????-of-nnnnn' where 'nnnnn' is the number of shards created.
       If the saver is empty, returns None.
 
     Raises:
@@ -1749,6 +1749,12 @@ class Saver(object):
       return
     if save_path is None:
       raise ValueError("Can't load save_path when it is None.")
+    if (os.path.isfile(save_path) and
+        self._write_version not in (
+            saver_pb2.SaverDef.V1, saver_pb2.SaverDef.LEGACY)):
+      raise ValueError("The specified path: %s is a file."
+                       " Please specify only the path prefix"
+                       " to the checkpoint files." % save_path)
     logging.info("Restoring parameters from %s", save_path)
     if context.in_graph_mode():
       sess.run(self.saver_def.restore_op_name,
diff --git a/tensorflow/python/util/compat_internal.py b/tensorflow/python/util/compat_internal.py
new file mode 100644
index 0000000000..fee1d6fab7
--- /dev/null
+++ b/tensorflow/python/util/compat_internal.py
@@ -0,0 +1,34 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions for Python 2 vs. 3 compatibility that are private to TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+def path_to_str(path):
+  """Returns the file system path representation of a `PathLike` object,
+  else as it is.
+
+  Args:
+    path: An object that can be converted to path representation.
+
+  Returns:
+    A `str` object.
+  """
+  if hasattr(path, "__fspath__"):
+    path = as_str_any(path.__fspath__())
+  return path
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index f35542e18f..933c103f52 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -232,7 +232,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
       result = StringToDriverVersion(version);
     }
 #else
-#if !defined(PLATFORM_WINDOWS) && !defined(NVIDIA_TEGRA)
+#if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
   // Callback used when iterating through DSOs. Looks for the driver-interfacing
   // DSO and yields its version number into the callback data, when found.
   auto iterate_phdr =
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
index 5210a81092..0c642912b1 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -96,10 +96,19 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
 }
 
 /* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
+#if defined(ANDROID_TEGRA)
+  // On Android devices the CUDA version number is not added to the library
+  // name.
+  return GetDsoHandle(
+      FindDsoPath(port::Env::Default()->FormatLibraryFileName("cupti", ""),
+                  GetCudaCuptiLibraryPath()),
+      dso_handle);
+#else
   return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                       "cupti", GetCudaVersion()),
                                   GetCudaCuptiLibraryPath()),
                       dso_handle);
+#endif
 }
 
 static mutex& GetRpathMutex() {
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index bf4a9fe6ce..411b393b0a 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -270,6 +270,8 @@ def _rpath_linkopts(name):
       clean_dep("//tensorflow:darwin"): [
           "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),),
       ],
+      clean_dep("//tensorflow:windows"): [],
+      clean_dep("//tensorflow:windows_msvc"): [],
       "//conditions:default": [
           "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),),
       ],
@@ -301,6 +303,7 @@ def tf_cc_shared_object(
               "-Wl,-install_name,@rpath/" + name.split("/")[-1],
           ],
           "//conditions:default": [
+              "-Wl,-soname," + name.split("/")[-1],
           ],
       }),
       **kwargs)
@@ -612,6 +615,8 @@ def tf_cc_test(name,
         "//tensorflow:android": [
             "-pie",
           ],
+        clean_dep("//tensorflow:windows"): [],
+        clean_dep("//tensorflow:windows_msvc"): [],
         "//conditions:default": [
             "-lpthread",
             "-lm"
@@ -1264,6 +1269,8 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
           "//conditions:default": [
               "-lm",
           ],
+          clean_dep("//tensorflow:windows"): [],
+          clean_dep("//tensorflow:windows_msvc"): [],
           clean_dep("//tensorflow:darwin"): [],
       }),)
 
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index f32353c957..baedf596e8 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -168,13 +168,21 @@ tf_module {
     name: "rgb_to_hsv"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "rgb_to_yiq"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rgb_to_yuv"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "rot90"
     argspec: "args=[\'image\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
   }
   member_method {
     name: "sample_distorted_bounding_box"
-    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "total_variation"
@@ -184,4 +192,12 @@ tf_module {
     name: "transpose_image"
     argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "yiq_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "yuv_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index d898c54627..11e05f884d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index a7001bbe34..58724a1e16 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.layers.Convolution3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.manip.pbtxt b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt
new file mode 100644
index 0000000000..0b84165285
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.manip"
+tf_module {
+  member_method {
+    name: "roll"
+    argspec: "args=[\'input\', \'shift\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index db1ed42185..e8890e9cc0 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -396,6 +396,10 @@ tf_module {
     name: "losses"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "manip"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "metrics"
     mtype: "<type \'module\'>"
@@ -2044,6 +2048,10 @@ tf_module {
     name: "unique_with_counts"
     argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
   }
+  member_method {
+    name: "unravel_index"
+    argspec: "args=[\'indices\', \'dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_max"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 636d5a1e81..310c1b6248 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -322,7 +322,7 @@ do_external_licenses_check(){
   EXTRA_LICENSES_FILE="$(mktemp)_extra_licenses.log"
 
   echo "Getting external dependencies for ${BUILD_TARGET}"
- bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --no_implicit_deps --no_host_deps --keep_going \
+ bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --keep_going \
   | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
@@ -331,7 +331,7 @@ do_external_licenses_check(){
 
   echo
   echo "Getting list of external licenses mentioned in ${LICENSES_TARGET}."
-  bazel query "deps(${LICENSES_TARGET})" --no_implicit_deps --no_host_deps --keep_going \
+  bazel query "deps(${LICENSES_TARGET})" --keep_going \
   | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
@@ -345,6 +345,18 @@ do_external_licenses_check(){
 
   EXTERNAL_LICENSES_CHECK_END_TIME=$(date +'%s')
 
+  # Blacklist
+  echo ${MISSING_LICENSES_FILE}
+  grep -e "@bazel_tools//third_party/" -e "@com_google_absl//absl" -e "@org_tensorflow//" -v ${MISSING_LICENSES_FILE} > temp.txt
+  mv temp.txt ${MISSING_LICENSES_FILE}
+
+  # Whitelist
+  echo ${EXTRA_LICENSE_FILE}
+  grep -e "@bazel_tools//src/" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -v ${EXTRA_LICENSES_FILE} > temp.txt
+  mv temp.txt ${EXTRA_LICENSES_FILE}
+
+
+
   echo
   echo "do_external_licenses_check took $((EXTERNAL_LICENSES_CHECK_END_TIME - EXTERNAL_LICENSES_CHECK_START_TIME)) s"
   echo
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
index fa28e3d79c..583d1d5f09 100755
--- a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
@@ -41,7 +41,7 @@ run_configure_for_cpu_build
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt \
+bazel build -c opt --copt=/arch:AVX \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
index 573c926203..94276c6c5c 100644
--- a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
@@ -41,7 +41,7 @@ run_configure_for_gpu_build
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt \
+bazel build -c opt --copt=/arch:AVX \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/docker/jupyter_notebook_config.py b/tensorflow/tools/docker/jupyter_notebook_config.py
index 0acbf6fcee..05dcefb099 100644
--- a/tensorflow/tools/docker/jupyter_notebook_config.py
+++ b/tensorflow/tools/docker/jupyter_notebook_config.py
@@ -15,6 +15,7 @@
 import os
 from IPython.lib import passwd
 
+c = c  # pylint:disable=undefined-variable
 c.NotebookApp.ip = '*'
 c.NotebookApp.port = int(os.getenv('PORT', 8888))
 c.NotebookApp.open_browser = False
diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py
index ac04f566d0..543b5fa6fe 100644
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@@ -327,7 +327,7 @@ class _Metadata(object):
   """
 
   def __init__(self, name):
-    """Creata a Metadata builder.
+    """Create a Metadata builder.
 
     Args:
       name: The name of the page being described by the Metadata block.
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index dbc81599de..35e55c0d31 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -99,6 +99,7 @@ genrule(
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
+        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
@@ -112,8 +113,10 @@ genrule(
         "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
         "@libxsmm_archive//:LICENSE",
+        "@llvm//:LICENSE.TXT",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
@@ -134,6 +137,7 @@ genrule(
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
+        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
@@ -149,6 +153,7 @@ genrule(
         "@libxsmm_archive//:LICENSE",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index e4fa6694d8..a9c4a8de42 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -88,13 +88,20 @@ filegroup(
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
+        "@absl_py//absl/flags:LICENSE",
+        "@arm_neon_2_x86_sse//:LICENSE",
+        "@astor_archive//:LICENSE",
+        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
+        "@com_google_absl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
+        "@flatbuffers//:LICENSE.txt",
+        "@gast_archive//:PKG-INFO",
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@grpc//:LICENSE",
@@ -105,11 +112,15 @@ filegroup(
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@grpc//third_party/nanopb:LICENSE.txt",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
+        "@pcre//:LICENCE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
         "@six_archive//:LICENSE",
         "@snappy//:COPYING",
+        "@swig//:LICENSE",
+        "@termcolor_archive//:COPYING.txt",
         "@zlib_archive//:zlib.h",
         "@org_python_pypi_backports_weakref//:LICENSE",
     ] + if_mkl([
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index ca8c272a08..dc31e4c5f7 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -137,8 +137,8 @@ function main() {
         fi
       fi
     fi
-    # Install toco as a binary in aux-bin.
     mkdir "${TMPDIR}/tensorflow/aux-bin"
+    # Install toco as a binary in aux-bin.
     cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
   fi
 
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 3cd4d12100..bc4315c600 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.5.0-rc1'
+_VERSION = '1.5.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
@@ -39,7 +39,7 @@ REQUIRED_PACKAGES = [
     'numpy >= 1.12.1',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'tensorflow-tensorboard >= 0.4.0',
+    'tensorflow-tensorboard >= 1.5.0, < 1.6.0',
     'termcolor >= 1.1.0',
 ]
 
@@ -181,9 +181,10 @@ def find_files(pattern, root):
 
 matches = ['../' + x for x in find_files('*', 'external') if '.py' not in x]
 
-so_lib_paths = [i for i in os.listdir('.')
-                if os.path.isdir(i) 
-                and fnmatch.fnmatch(i, '_solib_*')]
+so_lib_paths = [
+    i for i in os.listdir('.')
+    if os.path.isdir(i) and fnmatch.fnmatch(i, '_solib_*')
+]
 
 for path in so_lib_paths:
   matches.extend(
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 2c320cf68a..12d3c739cc 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -114,16 +114,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
      sha256 = "5996380e3e8b981f55d1c8d58e709c00dbb4806ba367be75d0925a68cc2f6478",
      strip_prefix = "abseil-cpp-720c017e30339fd1786ce4aac68bc8559736e53f",
+     build_file = str(Label("//third_party:com_google_absl.BUILD")),
   )
 
   tf_http_archive(
       name = "eigen_archive",
       urls = [
-          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/14e1418fcf12.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/14e1418fcf12.tar.gz",
+          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
+          "https://bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
       ],
-      sha256 = "2b526c6888639025323fd4f2600533c0f982d304ea48e4f1663e8066bd9f6368",
-      strip_prefix = "eigen-eigen-14e1418fcf12",
+      sha256 = "0cadb31a35b514bf2dfd6b5d38205da94ef326ec6908fc3fd7c269948467214f",
+      strip_prefix = "eigen-eigen-2355b229ea4c",
       build_file = str(Label("//third_party:eigen.BUILD")),
   )
 
@@ -555,6 +556,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:nccl.BUILD")),
   )
 
+  tf_http_archive(
+      name = "kafka",
+      urls = [
+          "https://mirror.bazel.build/github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz",
+          "https://github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz",
+      ],
+      sha256 = "dd035d57c8f19b0b612dd6eefe6e5eebad76f506e302cccb7c2066f25a83585e",
+      strip_prefix = "librdkafka-0.11.1",
+      build_file = str(Label("//third_party:kafka/BUILD")),
+      patch_file = str(Label("//third_party/kafka:config.patch")),
+  )
+
   tf_http_archive(
       name = "aws",
       urls = [
diff --git a/third_party/com_google_absl.BUILD b/third_party/com_google_absl.BUILD
new file mode 100644
index 0000000000..8fca145f75
--- /dev/null
+++ b/third_party/com_google_absl.BUILD
@@ -0,0 +1,5 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache
+
+exports_files(["LICENSE"])
diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/flatbuffers.BUILD
index f6b8e6ddb0..824c97be60 100644
--- a/third_party/flatbuffers/flatbuffers.BUILD
+++ b/third_party/flatbuffers/flatbuffers.BUILD
@@ -4,6 +4,8 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+exports_files(["LICENSE.txt"])
+
 config_setting(
     name = "freebsd",
     values = {"cpu": "freebsd"},
diff --git a/third_party/gast.BUILD b/third_party/gast.BUILD
index 06db528ada..4866982e1f 100644
--- a/third_party/gast.BUILD
+++ b/third_party/gast.BUILD
@@ -3,7 +3,7 @@
 
 licenses(["notice"])  # BSD 3-clause
 
-exports_files(["LICENSE"])
+exports_files(["PKG-INFO"])
 
 py_library(
     name = "gast",
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 8e1dd8a54f..255ae01190 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -826,7 +826,7 @@ def symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
   if src_dir != None:
     src_dir = _norm_path(src_dir)
     dest_dir = _norm_path(dest_dir)
-    files = _read_dir(repository_ctx, src_dir)
+    files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
     # Create a list with the src_dir stripped to use for outputs.
     dest_files = files.replace(src_dir, '').splitlines()
     src_files = files.splitlines()
diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD
index 37924125cf..87a23925c4 100644
--- a/third_party/jpeg/jpeg.BUILD
+++ b/third_party/jpeg/jpeg.BUILD
@@ -34,6 +34,10 @@ libjpegturbo_copts = select({
         "-mfloat-abi=softfp",
         "-fprefetch-loop-arrays",
     ],
+    ":linux_ppc64le": [
+        "-mcpu=power8",
+        "-mtune=power8",
+    ],
     "//conditions:default": [],
 })
 
@@ -123,10 +127,50 @@ cc_library(
         ":k8": [":simd_x86_64"],
         ":armeabi-v7a": [":simd_armv7a"],
         ":arm64-v8a": [":simd_armv8a"],
+        ":linux_ppc64le": [":simd_altivec"],
         "//conditions:default": [":simd_none"],
     }),
 )
 
+cc_library(
+    name = "simd_altivec",
+    srcs = [
+        "jchuff.h",
+        "jconfig.h",
+        "jdct.h",
+        "jerror.h",
+        "jinclude.h",
+        "jmorecfg.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jsimd.h",
+        "jsimddct.h",
+        "simd/jccolor-altivec.c",
+        "simd/jcgray-altivec.c",
+        "simd/jcsample.h",
+        "simd/jcsample-altivec.c",
+        "simd/jdcolor-altivec.c",
+        "simd/jdmerge-altivec.c",
+        "simd/jdsample-altivec.c",
+        "simd/jfdctfst-altivec.c",
+        "simd/jfdctint-altivec.c",
+        "simd/jidctfst-altivec.c",
+        "simd/jidctint-altivec.c",
+        "simd/jquanti-altivec.c",
+        "simd/jsimd.h",
+        "simd/jsimd_altivec.h",
+        "simd/jsimd_powerpc.c",
+    ],
+    hdrs = [
+        "simd/jccolext-altivec.c",  # should have been named .inc
+        "simd/jcgryext-altivec.c",  # should have been named .inc
+        "simd/jdcolext-altivec.c",  # should have been named .inc
+        "simd/jdmrgext-altivec.c",  # should have been named .inc
+    ],
+    copts = libjpegturbo_copts,
+    nocopts = libjpegturbo_nocopts,
+)
+
 cc_library(
     name = "simd_x86_64",
     srcs = [
@@ -381,6 +425,7 @@ genrule(
         ":k8": "cp $(location jconfig_nowin_simd.h) $@",
         ":armeabi-v7a": "cp $(location jconfig_nowin_simd.h) $@",
         ":arm64-v8a": "cp $(location jconfig_nowin_simd.h) $@",
+        ":linux_ppc64le": "cp $(location jconfig_nowin_simd.h) $@",
         "//conditions:default": "cp $(location jconfig_nowin_nosimd.h) $@",
     }),
 )
@@ -498,3 +543,8 @@ config_setting(
     name = "windows_msvc",
     values = {"cpu": "x64_windows_msvc"},
 )
+
+config_setting(
+    name = "linux_ppc64le",
+    values = {"cpu": "ppc"},
+)
diff --git a/third_party/kafka/BUILD b/third_party/kafka/BUILD
new file mode 100644
index 0000000000..a61a9e1f6c
--- /dev/null
+++ b/third_party/kafka/BUILD
@@ -0,0 +1,147 @@
+# Description:
+#   Kafka C/C++ (librdkafka) client library
+
+licenses(["notice"])  # 2-clause BSD license
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "kafka",
+    srcs = [
+        "config.h",
+        "src-cpp/ConfImpl.cpp",
+        "src-cpp/ConsumerImpl.cpp",
+        "src-cpp/HandleImpl.cpp",
+        "src-cpp/KafkaConsumerImpl.cpp",
+        "src-cpp/MessageImpl.cpp",
+        "src-cpp/MetadataImpl.cpp",
+        "src-cpp/QueueImpl.cpp",
+        "src-cpp/RdKafka.cpp",
+        "src-cpp/TopicImpl.cpp",
+        "src-cpp/TopicPartitionImpl.cpp",
+        "src/crc32c.c",
+        "src/crc32c.h",
+        "src/lz4.c",
+        "src/lz4.h",
+        "src/lz4frame.c",
+        "src/lz4frame.h",
+        "src/lz4frame_static.h",
+        "src/lz4hc.c",
+        "src/lz4hc.h",
+        "src/lz4opt.h",
+        "src/queue.h",
+        "src/rd.h",
+        "src/rdaddr.c",
+        "src/rdaddr.h",
+        "src/rdatomic.h",
+        "src/rdavg.h",
+        "src/rdavl.c",
+        "src/rdavl.h",
+        "src/rdbuf.c",
+        "src/rdbuf.h",
+        "src/rdcrc32.h",
+        "src/rddl.h",
+        "src/rdendian.h",
+        "src/rdgz.c",
+        "src/rdgz.h",
+        "src/rdinterval.h",
+        "src/rdkafka.c",
+        "src/rdkafka.h",
+        "src/rdkafka_assignor.c",
+        "src/rdkafka_assignor.h",
+        "src/rdkafka_broker.c",
+        "src/rdkafka_broker.h",
+        "src/rdkafka_buf.c",
+        "src/rdkafka_buf.h",
+        "src/rdkafka_cgrp.c",
+        "src/rdkafka_cgrp.h",
+        "src/rdkafka_conf.c",
+        "src/rdkafka_conf.h",
+        "src/rdkafka_event.h",
+        "src/rdkafka_feature.c",
+        "src/rdkafka_feature.h",
+        "src/rdkafka_int.h",
+        "src/rdkafka_interceptor.c",
+        "src/rdkafka_interceptor.h",
+        "src/rdkafka_lz4.c",
+        "src/rdkafka_lz4.h",
+        "src/rdkafka_metadata.c",
+        "src/rdkafka_metadata.h",
+        "src/rdkafka_metadata_cache.c",
+        "src/rdkafka_msg.c",
+        "src/rdkafka_msg.h",
+        "src/rdkafka_msgset.h",
+        "src/rdkafka_msgset_reader.c",
+        "src/rdkafka_msgset_writer.c",
+        "src/rdkafka_offset.c",
+        "src/rdkafka_offset.h",
+        "src/rdkafka_op.c",
+        "src/rdkafka_op.h",
+        "src/rdkafka_partition.c",
+        "src/rdkafka_partition.h",
+        "src/rdkafka_pattern.c",
+        "src/rdkafka_pattern.h",
+        "src/rdkafka_proto.h",
+        "src/rdkafka_queue.c",
+        "src/rdkafka_queue.h",
+        "src/rdkafka_range_assignor.c",
+        "src/rdkafka_request.c",
+        "src/rdkafka_request.h",
+        "src/rdkafka_roundrobin_assignor.c",
+        "src/rdkafka_sasl.c",
+        "src/rdkafka_sasl.h",
+        "src/rdkafka_sasl_int.h",
+        "src/rdkafka_sasl_plain.c",
+        "src/rdkafka_subscription.c",
+        "src/rdkafka_subscription.h",
+        "src/rdkafka_timer.c",
+        "src/rdkafka_timer.h",
+        "src/rdkafka_topic.c",
+        "src/rdkafka_topic.h",
+        "src/rdkafka_transport.c",
+        "src/rdkafka_transport.h",
+        "src/rdkafka_transport_int.h",
+        "src/rdlist.c",
+        "src/rdlist.h",
+        "src/rdlog.c",
+        "src/rdlog.h",
+        "src/rdports.c",
+        "src/rdports.h",
+        "src/rdposix.h",
+        "src/rdrand.c",
+        "src/rdrand.h",
+        "src/rdregex.c",
+        "src/rdregex.h",
+        "src/rdstring.c",
+        "src/rdstring.h",
+        "src/rdsysqueue.h",
+        "src/rdtime.h",
+        "src/rdtypes.h",
+        "src/rdunittest.c",
+        "src/rdunittest.h",
+        "src/rdvarint.c",
+        "src/rdvarint.h",
+        "src/snappy.c",
+        "src/snappy.h",
+        "src/tinycthread.c",
+        "src/tinycthread.h",
+        "src/xxhash.c",
+        "src/xxhash.h",
+    ],
+    hdrs = [
+        "config.h",
+    ],
+    defines = [
+    ],
+    includes = [
+        "src",
+        "src-cpp",
+    ],
+    linkopts = [
+        "-lpthread",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@boringssl//:ssl",
+    ],
+)
diff --git a/third_party/kafka/config.patch b/third_party/kafka/config.patch
new file mode 100644
index 0000000000..fa5c2d35b4
--- /dev/null
+++ b/third_party/kafka/config.patch
@@ -0,0 +1,44 @@
+diff -Naur a/config.h b/config.h
+--- a/config.h	1970-01-01 00:00:00.000000000 +0000
++++ b/config.h	2017-10-28 00:57:03.316957390 +0000
+@@ -0,0 +1,40 @@
++#pragma once
++#define WITHOUT_OPTIMIZATION 0
++#define ENABLE_DEVEL 0
++#define ENABLE_REFCNT_DEBUG 0
++#define ENABLE_SHAREDPTR_DEBUG 0
++
++#define HAVE_ATOMICS_32 1
++#define HAVE_ATOMICS_32_SYNC 1
++
++#if (HAVE_ATOMICS_32)
++# if (HAVE_ATOMICS_32_SYNC)
++#  define ATOMIC_OP32(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL)
++# else
++#  define ATOMIC_OP32(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST)
++# endif
++#endif
++
++#define HAVE_ATOMICS_64 1
++#define HAVE_ATOMICS_64_SYNC 1
++
++#if (HAVE_ATOMICS_64)
++# if (HAVE_ATOMICS_64_SYNC)
++#  define ATOMIC_OP64(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL)
++# else
++#  define ATOMIC_OP64(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST)
++# endif
++#endif
++
++
++#define WITH_ZLIB 1
++#define WITH_LIBDL 1
++#define WITH_PLUGINS 0
++#define WITH_SNAPPY 1
++#define WITH_SOCKEM 1
++#define WITH_SSL 1
++#define WITH_SASL 0
++#define WITH_SASL_SCRAM 0
++#define WITH_SASL_CYRUS 0
++#define HAVE_REGEX 1
++#define HAVE_STRNDUP 1
diff --git a/third_party/pcre.BUILD b/third_party/pcre.BUILD
index e2cdec4029..3a8e7a10b4 100644
--- a/third_party/pcre.BUILD
+++ b/third_party/pcre.BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # BSD
 
-exports_files(["COPYING"])
+exports_files(["LICENCE"])
 
 cc_library(
     name = "pcre",
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index c16eb3a12a..954f21f5f8 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -118,7 +118,7 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
   if src_dir != None:
     src_dir = _norm_path(src_dir)
     dest_dir = _norm_path(dest_dir)
-    files = _read_dir(repository_ctx, src_dir)
+    files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
     # Create a list with the src_dir stripped to use for outputs.
     dest_files = files.replace(src_dir, '').splitlines()
     src_files = files.splitlines()
diff --git a/third_party/termcolor.BUILD b/third_party/termcolor.BUILD
index 6000e3289d..655d7cb85e 100644
--- a/third_party/termcolor.BUILD
+++ b/third_party/termcolor.BUILD
@@ -3,7 +3,7 @@
 
 licenses(["notice"])  # MIT
 
-exports_files(["LICENSE"])
+exports_files(["COPYING.txt"])
 
 py_library(
     name = "termcolor",
-- 
GitLab


From bbc4edea4c12485894b9f38931694080a805a47f Mon Sep 17 00:00:00 2001
From: Kay Zhu <kayzhu@google.com>
Date: Wed, 7 Feb 2018 14:51:03 -0800
Subject: [PATCH 783/966] [XLA] Keep the number of HloPasses minimum for the
 interpreter backend.

This would be useful to identify bugs for these HLO passes.

PiperOrigin-RevId: 184900236
---
 .../xla/service/interpreter/compiler.cc        | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index c83880e030..9171e859c6 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -44,28 +44,12 @@ namespace interpreter {
 namespace se = ::perftools::gputools;
 namespace sep = ::perftools::gputools::interpreter;
 
-/*
- * Run optimization passes on the module. The graph is transformed by
- * each pass in the optimization pipeline. The service subdirectory
- * contains useful optimization passes.
- */
 Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
-  pipeline.AddPass<Inliner>();
-  pipeline.AddPass<HloSubcomputationUnification>();
-  pipeline.AddPass<HloCSE>(false);
-
-  pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
-      false, [](const Shape&, const Shape&) { return false; });
-  pipeline.AddPass<WhileLoopSimplifier>();
-  pipeline.AddPass<ReshapeMover>();
-  pipeline.AddPass<HloConstantFolding>();
-  pipeline.AddPass<HloCSE>(true);
+
   pipeline.AddPass<LayoutAssignment>(
       hlo_module->mutable_entry_computation_layout());
 
-  pipeline.AddPass<HloDCE>();
-  pipeline.AddPass<FlattenCallGraph>();
   return pipeline.Run(hlo_module).status();
 }
 
-- 
GitLab


From 22d4a7d19d4dccca664075c30d7c15cb291d2241 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Wed, 7 Feb 2018 14:54:49 -0800
Subject: [PATCH 784/966] Fix swig interface problems

---
 tensorflow/contrib/tensorrt/trt_conversion.i | 39 +++++++++++++++++++-
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index 1570ee6f04..40c33927ca 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -19,9 +19,43 @@ limitations under the License.
 %}
 %include "std_pair.i"
 %include "tensorflow/python/platform/base.i"
-%template(StringPair) std::pair<string, string>;
-%template() std::pair<swig::SwigPtr_PyObject, swig::SwigPtr_PyObject>;
 
+%{
+  PyObject* pair_helper(std::pair<string,string>* in){
+    PyObject *first(nullptr), *second(nullptr), *tuple(nullptr);
+    first = PyBytes_FromStringAndSize(in->first.data(), in->first.length());
+    if(!first){
+      if(!PyErr_Occurred()){
+        PyErr_SetString(PyExc_TypeError,
+                        "Pair conversion first argument failed");
+      }
+      return NULL;
+    }
+    second=PyBytes_FromStringAndSize(in->second.data(), in->second.length());
+    if(!second){
+      if(!PyErr_Occurred()){
+        PyErr_SetString(PyExc_TypeError,
+                        "Pair conversion second argument failed");
+      }
+      return NULL;
+    }
+    tuple=Py_BuildValue("(OO)", first, second);
+    if(!tuple){
+      if(!PyErr_Occurred()){
+        PyErr_SetString(PyExc_TypeError,
+                        "Tuple creation from pair<string,string> failed!");
+      }
+      return NULL;
+    }
+    return tuple;
+  }
+
+%}
+%typemap(out) std::pair<string, string> {
+  PyObject *tuple = pair_helper(&$1);
+  if(!tuple) SWIG_fail;
+  $result = tuple;
+ }
 %{
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -95,4 +129,5 @@ std::pair<string, string> trt_convert(string graph_def_string,
                                       size_t max_batch_size,
                                       size_t max_workspace_size_bytes);
 
+
 %unignoreall
-- 
GitLab


From 5b464e1d737fb9b07e5d9dfdbe3f4eebf5218987 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 15:08:24 -0800
Subject: [PATCH 785/966] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 184903402

---
 tensorflow/go/op/wrappers.go | 406 +++++++++++++++++------------------
 1 file changed, 203 insertions(+), 203 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index cb47651d7b..a7290ff117 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -629,6 +629,77 @@ func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, v
 	return scope.AddOperation(opspec)
 }
 
+// MapPeekAttr is an optional argument to MapPeek.
+type MapPeekAttr func(optionalAttr)
+
+// MapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapPeekCapacity(value int64) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapPeekMemoryLimit(value int64) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapPeekContainer(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapPeekSharedName(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
+//
+// underlying container does not contain this key
+// this op will block until it does.
+func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapPeek",
+		Input: []tf.Input{
+			key, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapPeek", err)
+		return
+	}
+	return values
+}
+
 // Returns (x - y)(x - y) element-wise.
 //
 // *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
@@ -4509,6 +4580,68 @@ func CriticalSectionOp(scope *Scope, optional ...CriticalSectionOpAttr) (resourc
 	return op.Output(0)
 }
 
+// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
+type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["min"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["max"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Compute gradients for a FakeQuantWithMinMaxArgs operation.
+//
+// Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+//
+// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
+// `gradients * (inputs >= min && inputs <= max)`.
+func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FakeQuantWithMinMaxArgsGradient",
+		Input: []tf.Input{
+			gradients, inputs,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPool3DAttr is an optional argument to AvgPool3D.
 type AvgPool3DAttr func(optionalAttr)
 
@@ -20680,68 +20813,6 @@ func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Outp
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
-type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["min"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["max"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Compute gradients for a FakeQuantWithMinMaxArgs operation.
-//
-// Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
-//
-// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
-// `gradients * (inputs >= min && inputs <= max)`.
-func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgsGradient",
-		Input: []tf.Input{
-			gradients, inputs,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // BatchToSpace for 4-D tensors of type T.
 //
 // This is a legacy version of the more general BatchToSpaceND.
@@ -22254,6 +22325,76 @@ func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	return scope.AddOperation(opspec)
 }
 
+// Forwards the value of an available tensor from `inputs` to `output`.
+//
+// `Merge` waits for at least one of the tensors in `inputs` to become available.
+// It is usually combined with `Switch` to implement branching.
+//
+// `Merge` forwards the first tensor to become available to `output`, and sets
+// `value_index` to its index in `inputs`.
+//
+// Arguments:
+//	inputs: The input tensors, exactly one of which will become available.
+//
+// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
+func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Merge",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
+
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
+//
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
+// If not specified, defaults to false
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
+	return func(m optionalAttr) {
+		m["cancel_pending_enqueues"] = value
+	}
+}
+
+// Closes the given queue.
+//
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns the created operation.
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Computes inverse hyperbolic tangent of x element-wise.
 func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
@@ -24203,147 +24344,6 @@ func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output
 	return scope.AddOperation(opspec)
 }
 
-// MapPeekAttr is an optional argument to MapPeek.
-type MapPeekAttr func(optionalAttr)
-
-// MapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapPeekCapacity(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapPeekMemoryLimit(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapPeekContainer(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapPeekSharedName(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op peeks at the values at the specified key.  If the
-//
-// underlying container does not contain this key
-// this op will block until it does.
-func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapPeek",
-		Input: []tf.Input{
-			key, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapPeek", err)
-		return
-	}
-	return values
-}
-
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
-
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
-//
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be canceled.
-// If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
-	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
-	}
-}
-
-// Closes the given queue.
-//
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
-//
-// Arguments:
-//	handle: The handle to a queue.
-//
-// Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Forwards the value of an available tensor from `inputs` to `output`.
-//
-// `Merge` waits for at least one of the tensors in `inputs` to become available.
-// It is usually combined with `Switch` to implement branching.
-//
-// `Merge` forwards the first tensor to become available to `output`, and sets
-// `value_index` to its index in `inputs`.
-//
-// Arguments:
-//	inputs: The input tensors, exactly one of which will become available.
-//
-// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
-func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Merge",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // MapUnstageAttr is an optional argument to MapUnstage.
 type MapUnstageAttr func(optionalAttr)
 
-- 
GitLab


From cd63c718be123324b6c39e0f8fbe453319799746 Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Wed, 7 Feb 2018 15:15:50 -0800
Subject: [PATCH 786/966] [update]   fix naming stuff in
 tensorflow/contrib/tensorrt/convert/convert_nodes.cc

---
 .../contrib/tensorrt/convert/convert_nodes.cc | 145 +++++++-----------
 1 file changed, 56 insertions(+), 89 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 68ee403c4e..bba69ea93a 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -120,7 +120,7 @@ class TRT_ShapedWeights {
         type_(type),
         values_(values),
         owned_values_(owned_values ? *owned_values : std::vector<char>({})),
-        dummy_flag_(false) {
+        empty_weight_flag_(false) {
     // Note: this->shape.type[] is not used
   }
 
@@ -129,14 +129,14 @@ class TRT_ShapedWeights {
         type_(type),
         values_(nullptr),
         owned_values_(),
-        dummy_flag_(true) {}
+        empty_weight_flag_(true) {}
 
   TRT_ShapedWeights(const TRT_ShapedWeights& rhs)
       : shape_(rhs.shape_),
         type_(rhs.type_),
         values_(rhs.values_),
         owned_values_(rhs.owned_values_),
-        dummy_flag_(rhs.dummy_flag_) {}
+        empty_weight_flag_(rhs.empty_weight_flag_) {}
 
   int64_t count() const {
     int64_t c = 1;
@@ -147,7 +147,7 @@ class TRT_ShapedWeights {
   nvinfer1::Weights GetWeightsForTRT() const {
     nvinfer1::DataType trt_type(nvinfer1::DataType::kFLOAT);
     TF_CHECK_OK(ConvertDType(type_, &trt_type));
-    if (dummy_flag_) return nvinfer1::Weights{trt_type, nullptr, 0};
+    if (empty_weight_flag_) return nvinfer1::Weights{trt_type, nullptr, 0};
 
     // Note: this->shape.type[] is not used
     return nvinfer1::Weights{trt_type, GetValues(), GetShapeSize(shape_)};
@@ -178,39 +178,39 @@ class TRT_ShapedWeights {
  private:
   const void* values_;
   std::vector<char> owned_values_;
-  bool dummy_flag_;
+  bool empty_weight_flag_;
 };
 
 class TRT_TensorOrWeights {
  public:
   explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor)
-      : _tensor_(tensor), _weights_(DT_FLOAT), _variant_(TRT_NODE_TENSOR) {}
+      : tensor_(tensor), weights_(DT_FLOAT), variant_(TRT_NODE_TENSOR) {}
   explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights)
-      : _tensor_(nullptr), _weights_(weights), _variant_(TRT_NODE_WEIGHTS) {}
+      : tensor_(nullptr), weights_(weights), variant_(TRT_NODE_WEIGHTS) {}
   TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs)
-      : _tensor_(rhs._tensor_),
-        _weights_(rhs._weights_),
-        _variant_(rhs._variant_) {}
+      : tensor_(rhs.tensor_),
+        weights_(rhs.weights_),
+        variant_(rhs.variant_) {}
   ~TRT_TensorOrWeights() {}
 
-  bool is_tensor() const { return _variant_ == TRT_NODE_TENSOR; }
-  bool is_weights() const { return _variant_ == TRT_NODE_WEIGHTS; }
+  bool is_tensor() const { return variant_ == TRT_NODE_TENSOR; }
+  bool is_weights() const { return variant_ == TRT_NODE_WEIGHTS; }
 
   nvinfer1::ITensor* tensor() {
     CHECK_EQ(is_tensor(), true);
-    return _tensor_;
+    return tensor_;
   }
   const nvinfer1::ITensor* tensor() const {
     CHECK_EQ(is_tensor(), true);
-    return _tensor_;
+    return tensor_;
   }
   TRT_ShapedWeights& weights() {
     CHECK_EQ(is_weights(), true);
-    return _weights_;
+    return weights_;
   }
   const TRT_ShapedWeights& weights() const {
     CHECK_EQ(is_weights(), true);
-    return _weights_;
+    return weights_;
   }
   nvinfer1::Dims shape() const {
     if (is_tensor()) {
@@ -221,69 +221,35 @@ class TRT_TensorOrWeights {
   }
 
  private:
-  nvinfer1::ITensor* _tensor_;
-  TRT_ShapedWeights _weights_;
-  enum { TRT_NODE_TENSOR, TRT_NODE_WEIGHTS } _variant_;
-};
-
-class TRT_LayerOrWeights {
- public:
-  explicit TRT_LayerOrWeights(nvinfer1::ILayer* layer)
-      : _layer_(layer), _variant_(TRT_NODE_LAYER) {}
-  explicit TRT_LayerOrWeights(const TRT_ShapedWeights& weights)
-      : _weights_(weights), _variant_(TRT_NODE_WEIGHTS) {}
-  bool is_layer() const { return _variant_ == TRT_NODE_LAYER; }
-  bool is_weights() const { return _variant_ == TRT_NODE_WEIGHTS; }
-  nvinfer1::ILayer* layer() {
-    CHECK_EQ(this->is_layer(), true);
-    return _layer_;
-  }
-  TRT_ShapedWeights& weights() {
-    CHECK_EQ(this->is_weights(), true);
-    return _weights_;
-  }
-  TRT_TensorOrWeights output(int index = 0) const {
-    if (this->is_layer()) {
-      nvinfer1::ITensor* tensor = _layer_->getOutput(index);
-      return TRT_TensorOrWeights(tensor);
-    } else {
-      CHECK_EQ(index, 0);
-      return TRT_TensorOrWeights(_weights_);
-    }
-  }
-
- private:
-  union {
-    nvinfer1::ILayer* _layer_;
-    TRT_ShapedWeights _weights_;
-  };
-  enum { TRT_NODE_LAYER, TRT_NODE_WEIGHTS } _variant_;
+  nvinfer1::ITensor* tensor_;
+  TRT_ShapedWeights weights_;
+  enum { TRT_NODE_TENSOR, TRT_NODE_WEIGHTS } variant_;
 };
 
 class TFAttrs {
  public:
   explicit TFAttrs(const tensorflow::NodeDef& tf_node) {
     for (const auto& attr : tf_node.attr()) {
-      _attrs.insert({attr.first, &attr.second});
+      attrs_.insert({attr.first, &attr.second});
     }
   }
-  bool count(string key) const { return _attrs.count(key); }
+  bool count(string key) const { return attrs_.count(key); }
   tensorflow::AttrValue const* at(string key) const {
-    if (!_attrs.count(key)) {
+    if (!attrs_.count(key)) {
       LOG(FATAL) << "Attribute not found: " << key;
     }
-    return _attrs.at(key);
+    return attrs_.at(key);
   }
   template <typename T>
   T get(string key) const;
   template <typename T>
   T get(string key, const T& default_value) const {
-    return _attrs.count(key) ? this->get<T>(key) : default_value;
+    return attrs_.count(key) ? this->get<T>(key) : default_value;
   }
 
  private:
   typedef std::map<string, tensorflow::AttrValue const*> AttrMap;
-  AttrMap _attrs;
+  AttrMap attrs_;
 };
 
 template <>
@@ -385,10 +351,10 @@ using OpConverter =
                                      std::vector<TRT_TensorOrWeights>*)>;
 
 class Converter {
-  std::unordered_map<string, TRT_TensorOrWeights> _trt_tensors;
-  std::unordered_map<string, OpConverter> _op_registry;
-  nvinfer1::INetworkDefinition* _trt_network;
-  std::list<std::vector<uint8_t>> _temp_bufs;
+  std::unordered_map<string, TRT_TensorOrWeights> trt_tensors_;
+  std::unordered_map<string, OpConverter> op_registry_;
+  nvinfer1::INetworkDefinition* trt_network_;
+  std::list<std::vector<uint8_t>> temp_bufs_;
 
   void register_op_converters();
 
@@ -397,14 +363,14 @@ class Converter {
     std::vector<TRT_TensorOrWeights> inputs;
     for (const auto& input_name : node_def.input()) {
       VLOG(2) << "Retrieve input: " << input_name;
-      inputs.push_back(_trt_tensors.at(input_name));
+      inputs.push_back(trt_tensors_.at(input_name));
     }
     return inputs;
   }
 
  public:
   explicit Converter(nvinfer1::INetworkDefinition* trt_network)
-      : _trt_network(trt_network) {
+      : trt_network_(trt_network) {
     this->register_op_converters();
   }
 
@@ -412,8 +378,8 @@ class Converter {
                                      nvinfer1::Dims shape) {
     TRT_ShapedWeights weights(type, nullptr, shape);
     // TODO(jie): check weights size_bytes. 0 means type error
-    _temp_bufs.push_back(std::vector<uint8_t>(weights.size_bytes()));
-    weights.SetValues(_temp_bufs.back().data());
+    temp_bufs_.push_back(std::vector<uint8_t>(weights.size_bytes()));
+    weights.SetValues(temp_bufs_.back().data());
     return weights;
   }
 
@@ -424,11 +390,11 @@ class Converter {
   tensorflow::Status convert_node(const tensorflow::NodeDef& node_def) {
     std::vector<TRT_TensorOrWeights> inputs = this->get_inputs(node_def);
     string op = node_def.op();
-    if (!_op_registry.count(op)) {
+    if (!op_registry_.count(op)) {
       return tensorflow::errors::Unimplemented(
           "No converter registered for op: " + op);
     }
-    OpConverter op_converter = _op_registry.at(op);
+    OpConverter op_converter = op_registry_.at(op);
     std::vector<TRT_TensorOrWeights> outputs;
     TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs));
     for (size_t i = 0; i < outputs.size(); ++i) {
@@ -440,7 +406,7 @@ class Converter {
         output.tensor()->setName(output_name.c_str());
       }
       VLOG(2) << "Write out tensor: " << output_name;
-      if (!_trt_tensors.insert({output_name, output}).second) {
+      if (!trt_tensors_.insert({output_name, output}).second) {
         return tensorflow::errors::AlreadyExists(
             "Output tensor already exists for op: " + op);
       }
@@ -448,17 +414,17 @@ class Converter {
     return tensorflow::Status::OK();
   }
 
-  nvinfer1::INetworkDefinition* network() { return _trt_network; }
+  nvinfer1::INetworkDefinition* network() { return trt_network_; }
 
   TRT_TensorOrWeights get_tensor(string name) {
-    if (!_trt_tensors.count(name)) {
+    if (!trt_tensors_.count(name)) {
       return TRT_TensorOrWeights(nullptr);
     }
-    return _trt_tensors.at(name);
+    return trt_tensors_.at(name);
   }
 
   bool insert_input_tensor(string name, nvinfer1::ITensor* tensor) {
-    return _trt_tensors.insert({name, TRT_TensorOrWeights(tensor)}).second;
+    return trt_tensors_.insert({name, TRT_TensorOrWeights(tensor)}).second;
   }
 
   nvinfer1::ITensor* TransposeTensor(nvinfer1::ITensor* input_tensor,
@@ -1428,25 +1394,25 @@ tensorflow::Status ConvertPad(Converter& ctx,
 
 void Converter::register_op_converters() {
   // vgg_16 slim implementation
-  _op_registry["Placeholder"] = ConvertPlaceholder;
-  _op_registry["Conv2D"] = ConvertConv2D;
-  _op_registry["Relu"] = ConvertActivation;
-  _op_registry["MaxPool"] = ConvertPool;
+  op_registry_["Placeholder"] = ConvertPlaceholder;
+  op_registry_["Conv2D"] = ConvertConv2D;
+  op_registry_["Relu"] = ConvertActivation;
+  op_registry_["MaxPool"] = ConvertPool;
   // This could be really handled as ConvertBinary
-  _op_registry["BiasAdd"] = ConvertScale;
-  _op_registry["Const"] = ConvertConst;
-  // _op_registry["MatMul"] = ConvertFullyConnected;  // Not used in vgg
+  op_registry_["BiasAdd"] = ConvertScale;
+  op_registry_["Const"] = ConvertConst;
+  // op_registry_["MatMul"] = ConvertFullyConnected;  // Not used in vgg
   // TODO(ben,jie): this is a temp hack.
-  _op_registry["Identity"] = ConvertIdentity;  // Identity should be removed
-  // _op_registry["AvgPool"] = ConvertPool;
+  op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
+  // op_registry_["AvgPool"] = ConvertPool;
 
   // resnet_50_v1 slim implementation
-  _op_registry["Add"] = ConvertBinary;
-  _op_registry["Mul"] = ConvertBinary;
-  _op_registry["Sub"] = ConvertBinary;
-  _op_registry["Rsqrt"] = ConvertUnary;
-  _op_registry["Mean"] = ConvertReduce;
-  _op_registry["Pad"] = ConvertPad;
+  op_registry_["Add"] = ConvertBinary;
+  op_registry_["Mul"] = ConvertBinary;
+  op_registry_["Sub"] = ConvertBinary;
+  op_registry_["Rsqrt"] = ConvertUnary;
+  op_registry_["Mean"] = ConvertReduce;
+  op_registry_["Pad"] = ConvertPad;
   // TODO(ben,jie): Add more ops
 }
 
@@ -1595,6 +1561,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   }
 
   VLOG(2) << "Finished output";
+  // TODO(jie): static_id is not thread safe.
   static int static_id = 0;
 
   // Build the engine
-- 
GitLab


From 63658bed986cd2d5332542db725628b543472e93 Mon Sep 17 00:00:00 2001
From: hsm207 <hsm207@users.noreply.github.com>
Date: Wed, 7 Feb 2018 19:20:15 -0500
Subject: [PATCH 787/966] Fix typo (#16822)

---
 tensorflow/contrib/slim/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index c7a54cb9a2..2d9df8f27e 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -145,7 +145,7 @@ regular_variables_and_model_variables = slim.get_variables()
 
 How does this work? When you create a model variable via TF-Slim's layers or
 directly via the `slim.model_variable` function, TF-Slim adds the variable to
-a the `tf.GraphKeys.MODEL_VARIABLES` collection. What if you have your own
+the `tf.GraphKeys.MODEL_VARIABLES` collection. What if you have your own
 custom layers or variable creation routine but still want TF-Slim to manage or
 be aware of your model variables? TF-Slim provides a convenience function for
 adding the model variable to its collection:
-- 
GitLab


From 163fd2ea39f550f45717dd70f26ebebdaf74411e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 7 Feb 2018 16:21:13 -0800
Subject: [PATCH 788/966] Remove obsolete BernoulliWithSigmoidProbs (#16846)

As was pointed out by 9485, BernoulliWithSigmoidProbs is covered
by Bernoulli and is obsolete. This fix removes BernoulliWithSigmoidProbs.

This fix closes 9485.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/distributions/__init__.py  |  1 -
 .../python/contrib.distributions.md           |  1 -
 .../distributions/bernoulli_test.py           |  6 ------
 .../python/ops/distributions/bernoulli.py     | 20 -------------------
 4 files changed, 28 deletions(-)

diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 60a187e541..837af20ade 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -97,7 +97,6 @@ _allowed_symbols = [
     'Autoregressive',
     'Binomial',
     'Bernoulli',
-    'BernoulliWithSigmoidProbs',
     'Beta',
     'BetaWithSoftplusConcentration',
     'Categorical',
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.md
index 7a3d509b75..533d7dac13 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.distributions.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.distributions.md
@@ -17,7 +17,6 @@ initialized with parameters that define the distributions.
 
 *   @{tf.contrib.distributions.Binomial}
 *   @{tf.contrib.distributions.Bernoulli}
-*   @{tf.contrib.distributions.BernoulliWithSigmoidProbs}
 *   @{tf.contrib.distributions.Beta}
 *   @{tf.contrib.distributions.Categorical}
 *   @{tf.contrib.distributions.Chi2}
diff --git a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
index a269d72273..dc5d63eec6 100644
--- a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
@@ -291,12 +291,6 @@ class BernoulliTest(test.TestCase):
                [np.sqrt(var(0.5)), np.sqrt(var(0.4))]],
               dtype=np.float32))
 
-  def testBernoulliWithSigmoidProbs(self):
-    p = np.array([8.3, 4.2])
-    dist = bernoulli.BernoulliWithSigmoidProbs(logits=p)
-    with self.test_session():
-      self.assertAllClose(math_ops.sigmoid(p).eval(), dist.probs.eval())
-
   def testBernoulliBernoulliKL(self):
     with self.test_session() as sess:
       batch_size = 6
diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
index 1f300b7147..553e5db8d8 100644
--- a/tensorflow/python/ops/distributions/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -167,26 +167,6 @@ class Bernoulli(distribution.Distribution):
     return math_ops.cast(self.probs > 0.5, self.dtype)
 
 
-class BernoulliWithSigmoidProbs(Bernoulli):
-  """Bernoulli with `probs = nn.sigmoid(logits)`."""
-
-  def __init__(self,
-               logits=None,
-               dtype=dtypes.int32,
-               validate_args=False,
-               allow_nan_stats=True,
-               name="BernoulliWithSigmoidProbs"):
-    parameters = locals()
-    with ops.name_scope(name):
-      super(BernoulliWithSigmoidProbs, self).__init__(
-          probs=nn.sigmoid(logits, name="sigmoid_probs"),
-          dtype=dtype,
-          validate_args=validate_args,
-          allow_nan_stats=allow_nan_stats,
-          name=name)
-    self._parameters = parameters
-
-
 @kullback_leibler.RegisterKL(Bernoulli, Bernoulli)
 def _kl_bernoulli_bernoulli(a, b, name=None):
   """Calculate the batched KL divergence KL(a || b) with a and b Bernoulli.
-- 
GitLab


From 7a5fb00a976ad1b4e3d13be6af5b9e3558499b2b Mon Sep 17 00:00:00 2001
From: brett koonce <koonce@hello.com>
Date: Wed, 7 Feb 2018 16:21:31 -0800
Subject: [PATCH 789/966] spelling fixes for contrib docs (#16811)

---
 tensorflow/contrib/makefile/README.md   |  2 +-
 tensorflow/contrib/tpu/tpu_estimator.md |  2 +-
 tensorflow/contrib/verbs/README.md      | 12 ++++++------
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index 0613de2cab..6959ca344f 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -268,7 +268,7 @@ selectively register only for the operators used in your graph.
 ```bash
 tensorflow/contrib/makefile/build_all_ios.sh -a arm64 -g $HOME/graphs/inception/tensorflow_inception_graph.pb
 ```
-Please note this is an aggresive optimization of the operators and the resulting library may not work with other graphs but will reduce the size of the final library.
+Please note this is an aggressive optimization of the operators and the resulting library may not work with other graphs but will reduce the size of the final library.
 
 The `compile_ios_tensorflow.sh` script can take optional command-line arguments.
 The first argument will be passed as a C++ optimization flag and defaults to
diff --git a/tensorflow/contrib/tpu/tpu_estimator.md b/tensorflow/contrib/tpu/tpu_estimator.md
index ca1255b16b..4ef8f9eebd 100644
--- a/tensorflow/contrib/tpu/tpu_estimator.md
+++ b/tensorflow/contrib/tpu/tpu_estimator.md
@@ -231,7 +231,7 @@ Refer to this link for all [Cloud TPU documentation](https://cloud.google.com/tp
 
 
 ### Profiling
-You can profile the `worker` by using instructions as spcified in the [Cloud TPU Tools](https://cloud.google.com/tpu/docs/cloud-tpu-tools).
+You can profile the `worker` by using instructions as specified in the [Cloud TPU Tools](https://cloud.google.com/tpu/docs/cloud-tpu-tools).
 
 
 ### Is `int64` supported?
diff --git a/tensorflow/contrib/verbs/README.md b/tensorflow/contrib/verbs/README.md
index 1b99f4ce4f..58fed4e5cb 100644
--- a/tensorflow/contrib/verbs/README.md
+++ b/tensorflow/contrib/verbs/README.md
@@ -25,9 +25,9 @@ The design is based on TensorFlow r1.0. An RDMA path is added between servers fo
 During the server setup, an RDMA manager is created to manage low-level RDMA components such as RDMA channel and RDMA adapter, an RDMA rendezvous manager is created to oversee send/recv operations between servers. Following the distributed TensorFlow design philosophy, the send operation is passive, i.e. merely placing a tensor in the local out-going table. It is the receive operation that actually initiates the tensor transfer.
 
 TensorFlow dynamically allocates memory for tensors that are to be sent or received. This causes difficulty for RDMA operations where pinned memory is required. Few remedies are possible:
-1. The memory is pinned, transfered, then unpinned for each and every tensor to be transferred. This incurs significant operation overhead since pinning and unpinning memory for each dynamically generated tensor is slow. 
+1. The memory is pinned, transferred, then unpinned for each and every tensor to be transferred. This incurs significant operation overhead since pinning and unpinning memory for each dynamically generated tensor is slow. 
 2. Buffer is pre-allocated and pinned for each tensor. This incurs large memory overhead and extra copying from the tensor to its pinned buffer, but may still be faster than the former.
-3. Following HKUST research on the use of GPU direct, and their [GDR implementation](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/gdr/README.md), there is a smart way to benefit from the TensorFlow allocation theme which is mostly pool based, i.e allocators pre-allocate a large memory block, and allocate the tensors from there. By attaching a custom Visitor to relevant alloactors, we can do a single registration of the entire memory block, which zeros the registration overhead. Once the block is registered, each new tensor allocated will be at a registred address, which will allow us to do direct RDMA writes to it.
+3. Following HKUST research on the use of GPU direct, and their [GDR implementation](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/gdr/README.md), there is a smart way to benefit from the TensorFlow allocation theme which is mostly pool based, i.e allocators pre-allocate a large memory block, and allocate the tensors from there. By attaching a custom Visitor to relevant allocators, we can do a single registration of the entire memory block, which zeros the registration overhead. Once the block is registered, each new tensor allocated will be at a registered address, which will allow us to do direct RDMA writes to it.
 
 For best performance, we will adopt HKUST 0 copies approach in our solution. This means:
 
@@ -77,7 +77,7 @@ When the receiver receives the **RDMA_MESSAGE_META_DATA_RESPONSE**, it will loca
 
 1. Update the local meta-data cache.
 2. Reallocate the result/proxy tensors. 
-3. Re-send the tensor request. For tracability, the new message has a different name: **RDMA_MESSAGE_TENSOR_RE_REQUEST**.
+3. Re-send the tensor request. For traceability, the new message has a different name: **RDMA_MESSAGE_TENSOR_RE_REQUEST**.
 
 When the sender receives a **RDMA_MESSAGE_TENSOR_RE_REQUEST**, it will locate the relevant **RdmaTensorResponse** using the request index specified in the message, and invoke its **Resume()** method, which will RDMA write the contents of the tensor that was cloned earlier, to the new remote address specified in the re-request.
 
@@ -93,7 +93,7 @@ When the receiver receives the RDMA write, it will locate the relevant **RdmaTen
 
 1. When the sender receives a tensor request, the source tensor may or may not be ready yet. The situation is handled through a process of tag matching:
 	* If the request arrives before the tensor is ready, then a callback is put in a local table, and will be invoked once the tensor arrives.
-	* If the tensor is ready before the request arives, than the tensor is put in a local table. When the request arrives, it will invoke the callback immediatly.
+	* If the tensor is ready before the request arives, than the tensor is put in a local table. When the request arrives, it will invoke the callback immediately.
    In code it is done by calling **RecvLocalAsync()**, which receives the tensor's key, step-id, and the callback.
 2. When the callback is invoked, the relevant tensor is removed from the tag matching table. In the case where we need to send the tensor's meta-data, the **RdmaTensorResponse** will store a copy of the tensor until the re-request arrives.
 3. The sending of protocol messages (**RDMA_MESSAGE_TENSOR_REQUEST**, **RDMA_MESSAGE_META_DATA_RESPONSE** and **RDMA_MESSAGE_TENSOR_RE_REQUEST**) is done by the class **RdmaMessageBuffer**. All messages are sent using RDMA writes from/to fixed messages buffers. This implies that we cannot send on a specific channel more than one message at a time. In order to synchronize the messages, the **RdmaMessageBuffer** holds the a local and remote buffer statuses which can be either busy or idle. When a write is issued, both statuses will be changed to busy. When the write-complete event is received, the local status is changed to idle. When the write is received on the remote side, the remote side will parse the message, and return an ACK back to the sending side on which the sending side will update the remote status to idle. When both the local and remote statuses are idle, the next message can be sent.
@@ -115,7 +115,7 @@ When the receiver receives the RDMA write, it will locate the relevant **RdmaTen
 		* Reallocate the result tensor (and proxy tensor if required).
 		* Re-send the request to the remote side.
 	* **RecvTensorContent()**    - Receive tensor content from the remote side (RDMA write was completed).
-		* Decode proto if required and/or move to GPU if the content was not written to it directly (GPU direct is not avaliable).
+		* Decode proto if required and/or move to GPU if the content was not written to it directly (GPU direct is not available).
 		* Invoke the done callback.
 * **class RdmaTensorResponse**   - Holds and manages information for a single tensor response throughout the entire send cycle. API:
 	* **Start()**                - Start the response sequence. 
@@ -153,7 +153,7 @@ When the receiver receives the RDMA write, it will locate the relevant **RdmaTen
 	* request_index - Request index.
 	* is_dead/data_type/tensor_shape/tensor_bytes - The up-to-date meta-data.
 	* checksum - In data validation mode, this will hold the checksum of the source tensor.
-* **RDMA_MESSAGE_TENSOR_RE_REQUEST** - (receiver ==> sender) Tensor re-requset after meta-data update and reallocation of result/proxy tensors.
+* **RDMA_MESSAGE_TENSOR_RE_REQUEST** - (receiver ==> sender) Tensor re-request after meta-data update and reallocation of result/proxy tensors.
 	* type - The message type.
 	* name (name_size) - Name of the requested tensor.
 	* step_id - Step ID.
-- 
GitLab


From d83d1a4998c3ffbbc4b303948a3c71a2e7483b2d Mon Sep 17 00:00:00 2001
From: Marcus Ong <15276837+ChaseOxide@users.noreply.github.com>
Date: Wed, 7 Feb 2018 18:25:49 -0600
Subject: [PATCH 790/966] CMake (Windows): Added support for ninja build and
 some fixes/changes (#16763)

* Added support for ninja build and some fixes/changes in CMake for Windows

* Fixed typo
---
 tensorflow/contrib/cmake/CMakeLists.txt       | 16 ++++++-
 .../contrib/cmake/external/boringssl.cmake    |  1 +
 .../contrib/cmake/external/farmhash.cmake     |  1 +
 tensorflow/contrib/cmake/external/fft2d.cmake |  1 +
 tensorflow/contrib/cmake/external/gif.cmake   |  1 +
 .../contrib/cmake/external/googletest.cmake   | 10 ++++-
 tensorflow/contrib/cmake/external/grpc.cmake  | 16 +++++--
 .../contrib/cmake/external/highwayhash.cmake  |  1 +
 .../contrib/cmake/external/jemalloc.cmake     | 15 ++++---
 tensorflow/contrib/cmake/external/jpeg.cmake  |  1 +
 .../contrib/cmake/external/jsoncpp.cmake      |  7 ++-
 tensorflow/contrib/cmake/external/lmdb.cmake  | 13 +++---
 tensorflow/contrib/cmake/external/nsync.cmake |  1 +
 tensorflow/contrib/cmake/external/png.cmake   | 17 +++++--
 .../contrib/cmake/external/protobuf.cmake     | 44 ++++++++++++++++---
 tensorflow/contrib/cmake/external/re2.cmake   |  7 ++-
 .../contrib/cmake/external/snappy.cmake       |  7 ++-
 .../contrib/cmake/external/sqlite.cmake       |  1 +
 tensorflow/contrib/cmake/external/zlib.cmake  | 17 +++++--
 .../cmake/tests/cuda/compatibility_test.c     |  7 +++
 .../cmake/tests/cuda/compatibility_test.cc    |  7 +++
 tensorflow/contrib/cmake/tf_cc_ops.cmake      |  6 ++-
 tensorflow/contrib/cmake/tf_python.cmake      | 25 ++++++++---
 tensorflow/contrib/cmake/tf_shared_lib.cmake  |  6 ++-
 24 files changed, 185 insertions(+), 43 deletions(-)
 create mode 100644 tensorflow/contrib/cmake/tests/cuda/compatibility_test.c
 create mode 100644 tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index b732b23320..26cf8b18b4 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -286,7 +286,21 @@ if (tensorflow_ENABLE_GPU)
     list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs")
   endif (NOT WIN32)
 
-  find_package(CUDA ${tensorflow_CUDA_VERSION} REQUIRED)
+  # later command will make use of the value in tensorflow_CUDA_VERSION
+  find_package(CUDA ${tensorflow_CUDA_VERSION} REQUIRED EXACT)
+
+  # Test compatibility of compiler on CUDA
+  try_compile(CUDA_TEST_COMPILE_C
+    ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda
+    ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.c
+    CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS})
+  try_compile(CUDA_TEST_COMPILE_CXX
+    ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda
+    ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.cc
+    CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS})
+  if(NOT (CUDA_TEST_COMPILE_C AND CUDA_TEST_COMPILE_CXX))
+    message(FATAL_ERROR "Selected compiler (or version) is not supported for CUDA")
+  endif()
 
   # by default we assume compute cabability 3.5 and 5.2. If you change this change it in
   # CUDA_NVCC_FLAGS and cuda_config.h below
diff --git a/tensorflow/contrib/cmake/external/boringssl.cmake b/tensorflow/contrib/cmake/external/boringssl.cmake
index 5ad477fdff..3c4bb01e24 100644
--- a/tensorflow/contrib/cmake/external/boringssl.cmake
+++ b/tensorflow/contrib/cmake/external/boringssl.cmake
@@ -37,6 +37,7 @@ ExternalProject_Add(boringssl
     GIT_TAG ${boringssl_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     # BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${boringssl_STATIC_LIBRARIES}
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
diff --git a/tensorflow/contrib/cmake/external/farmhash.cmake b/tensorflow/contrib/cmake/external/farmhash.cmake
index 0cd0c1030c..d51569bc21 100644
--- a/tensorflow/contrib/cmake/external/farmhash.cmake
+++ b/tensorflow/contrib/cmake/external/farmhash.cmake
@@ -33,6 +33,7 @@ if(WIN32)
       URL_HASH ${farmhash_HASH}
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
       BUILD_IN_SOURCE 1
+      BUILD_BYPRODUCTS ${farmhash_STATIC_LIBRARIES}
       PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/farmhash/CMakeLists.txt ${farmhash_BUILD}
       INSTALL_DIR ${farmhash_INSTALL}
       CMAKE_CACHE_ARGS
diff --git a/tensorflow/contrib/cmake/external/fft2d.cmake b/tensorflow/contrib/cmake/external/fft2d.cmake
index d3af2a4676..a7bc50d5bc 100644
--- a/tensorflow/contrib/cmake/external/fft2d.cmake
+++ b/tensorflow/contrib/cmake/external/fft2d.cmake
@@ -29,6 +29,7 @@ if(WIN32)
       URL_HASH ${fft2d_HASH}
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
       BUILD_IN_SOURCE 1
+      BUILD_BYPRODUCTS ${fft2d_STATIC_LIBRARIES}
       PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/fft2d/CMakeLists.txt ${fft2d_BUILD}/src/fft2d/CMakeLists.txt
       INSTALL_DIR ${fft2d_INSTALL}
       CMAKE_CACHE_ARGS
diff --git a/tensorflow/contrib/cmake/external/gif.cmake b/tensorflow/contrib/cmake/external/gif.cmake
index 3d53c51fff..e1f8d13f8e 100644
--- a/tensorflow/contrib/cmake/external/gif.cmake
+++ b/tensorflow/contrib/cmake/external/gif.cmake
@@ -33,6 +33,7 @@ if(WIN32)
       PREFIX gif
       URL ${gif_URL}
       URL_HASH ${gif_HASH}
+      BUILD_BYPRODUCTS ${gif_STATIC_LIBRARIES}
       PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_SOURCE_DIR}/patches/gif/CMakeLists.txt ${gif_BUILD}
       INSTALL_DIR ${gif_INSTALL}
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
diff --git a/tensorflow/contrib/cmake/external/googletest.cmake b/tensorflow/contrib/cmake/external/googletest.cmake
index d09bb02890..7cc5ae6390 100644
--- a/tensorflow/contrib/cmake/external/googletest.cmake
+++ b/tensorflow/contrib/cmake/external/googletest.cmake
@@ -20,8 +20,13 @@ set(googletest_BUILD ${CMAKE_CURRENT_BINARY_DIR}/googletest/)
 set(googletest_TAG ec44c6c1675c25b9827aacd08c02433cccde7780)
 
 if(WIN32)
-  set(googletest_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/$(Configuration)/gtest.lib)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(googletest_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/$(Configuration)/gtest.lib)
+  else()
+    set(googletest_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/gtest.lib)
+  endif()
 else()
   set(googletest_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/${CMAKE_BUILD_TYPE}/gtest.a)
@@ -33,6 +38,7 @@ ExternalProject_Add(googletest
     GIT_TAG ${googletest_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${googletest_STATIC_LIBRARIES}
     #PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_SOURCE_DIR}/patches/grpc/CMakeLists.txt ${GRPC_BUILD}
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index 28adb4fe84..a9f43a3ecb 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -20,10 +20,17 @@ set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
 set(GRPC_TAG 730b778632e79cc3c96ad237f282d687ee325ce7)
 
 if(WIN32)
-  set(grpc_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc++_unsecure.lib
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc_unsecure.lib
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/gpr.lib)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(grpc_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc++_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/gpr.lib)
+  else()
+    set(grpc_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/grpc++_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/grpc_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/gpr.lib)
+  endif()
 else()
   set(grpc_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
@@ -40,6 +47,7 @@ ExternalProject_Add(grpc
     GIT_TAG ${GRPC_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${grpc_STATIC_LIBRARIES}
     BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc++_unsecure
     COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc_cpp_plugin
     INSTALL_COMMAND ""
diff --git a/tensorflow/contrib/cmake/external/highwayhash.cmake b/tensorflow/contrib/cmake/external/highwayhash.cmake
index 2c23bef8a3..a6e8a38d8c 100644
--- a/tensorflow/contrib/cmake/external/highwayhash.cmake
+++ b/tensorflow/contrib/cmake/external/highwayhash.cmake
@@ -42,6 +42,7 @@ ExternalProject_Add(highwayhash
     GIT_TAG ${highwayhash_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${highwayhash_STATIC_LIBRARIES}
     PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/highwayhash/CMakeLists.txt ${highwayhash_BUILD}
     INSTALL_DIR ${highwayhash_INSTALL}
     CMAKE_CACHE_ARGS
diff --git a/tensorflow/contrib/cmake/external/jemalloc.cmake b/tensorflow/contrib/cmake/external/jemalloc.cmake
index 198ba13e64..afadcc007d 100644
--- a/tensorflow/contrib/cmake/external/jemalloc.cmake
+++ b/tensorflow/contrib/cmake/external/jemalloc.cmake
@@ -24,8 +24,11 @@ if (WIN32)
         ${jemalloc_INCLUDE_DIRS} 
         ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include/msvc_compat
     )
-    set(jemalloc_ADDITIONAL_CMAKE_OPTIONS -A x64)
-    set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.lib)
+    if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+        set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.lib)
+    else()
+        set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/jemalloc.lib)
+    endif()
 else()
     set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.a)
 endif()
@@ -36,12 +39,12 @@ ExternalProject_Add(jemalloc
     URL_HASH ${jemalloc_HASH}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
-    CONFIGURE_COMMAND ${CMAKE_COMMAND}
+    BUILD_BYPRODUCTS ${jemalloc_STATIC_LIBRARIES}
+    BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target jemalloc
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping install step."
+    CMAKE_CACHE_ARGS
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -Dwith-jemalloc-prefix:STRING=jemalloc_
         -Dwithout-export:BOOL=ON
-        ${jemalloc_ADDITIONAL_CMAKE_OPTIONS}
-    BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target jemalloc
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping install step."
 )
diff --git a/tensorflow/contrib/cmake/external/jpeg.cmake b/tensorflow/contrib/cmake/external/jpeg.cmake
index d9a165e856..c1c5842aa4 100644
--- a/tensorflow/contrib/cmake/external/jpeg.cmake
+++ b/tensorflow/contrib/cmake/external/jpeg.cmake
@@ -46,6 +46,7 @@ if (WIN32)
         PREFIX jpeg
         URL ${jpeg_URL}
         URL_HASH ${jpeg_HASH}
+        BUILD_BYPRODUCTS ${jpeg_STATIC_LIBRARIES}
         PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/jpeg/CMakeLists.txt ${jpeg_BUILD}
         INSTALL_DIR ${jpeg_INSTALL}
         DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
diff --git a/tensorflow/contrib/cmake/external/jsoncpp.cmake b/tensorflow/contrib/cmake/external/jsoncpp.cmake
index 861201f97e..84c52e3652 100644
--- a/tensorflow/contrib/cmake/external/jsoncpp.cmake
+++ b/tensorflow/contrib/cmake/external/jsoncpp.cmake
@@ -23,7 +23,11 @@ set(jsoncpp_LIBRARIES ${jsoncpp_BUILD}/obj/so/libjsoncpp.so)
 set(jsoncpp_INCLUDES ${jsoncpp_BUILD})
 
 if(WIN32)
-  set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/$(Configuration)/jsoncpp.lib)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/$(Configuration)/jsoncpp.lib)
+  else()
+    set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/jsoncpp.lib)
+  endif()
 else()
   set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/libjsoncpp.a)
 endif()
@@ -40,6 +44,7 @@ ExternalProject_Add(jsoncpp
     GIT_TAG ${jsoncpp_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${jsoncpp_STATIC_LIBRARIES}
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
diff --git a/tensorflow/contrib/cmake/external/lmdb.cmake b/tensorflow/contrib/cmake/external/lmdb.cmake
index 41b314e285..ed5ab788ac 100644
--- a/tensorflow/contrib/cmake/external/lmdb.cmake
+++ b/tensorflow/contrib/cmake/external/lmdb.cmake
@@ -20,10 +20,17 @@ set(lmdb_HASH SHA256=108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d
 set(lmdb_BUILD ${CMAKE_BINARY_DIR}/lmdb/src/lmdb)
 set(lmdb_INSTALL ${CMAKE_BINARY_DIR}/lmdb/install)
 
+if(WIN32)
+    set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/lmdb.lib)
+else()
+    set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/liblmdb.a)
+endif()
+
 ExternalProject_Add(lmdb
     PREFIX lmdb
     URL ${lmdb_URL}
     URL_HASH ${lmdb_HASH}
+    BUILD_BYPRODUCTS ${lmdb_STATIC_LIBRARIES}
     PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different
         ${CMAKE_CURRENT_SOURCE_DIR}/patches/lmdb/CMakeLists.txt ${lmdb_BUILD}
     INSTALL_DIR ${lmdb_INSTALL}
@@ -35,12 +42,6 @@ ExternalProject_Add(lmdb
         -DCMAKE_INSTALL_PREFIX:STRING=${lmdb_INSTALL}
 )
 
-if(WIN32)
-    set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/lmdb.lib)
-else()
-    set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/liblmdb.a)
-endif()
-
 set(lmdb_HEADERS
     "${lmdb_INSTALL}/include/lmdb.h"
     "${lmdb_INSTALL}/include/midl.h"
diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake
index 0508006047..f3a37ff508 100644
--- a/tensorflow/contrib/cmake/external/nsync.cmake
+++ b/tensorflow/contrib/cmake/external/nsync.cmake
@@ -42,6 +42,7 @@ ExternalProject_Add(nsync
     GIT_TAG ${nsync_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${nsync_STATIC_LIBRARIES}
     PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/nsync/CMakeLists.txt ${nsync_BUILD}
     INSTALL_DIR ${nsync_INSTALL}
     CMAKE_CACHE_ARGS
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index b277be5690..6cd66a6599 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -21,9 +21,19 @@ set(png_BUILD ${CMAKE_BINARY_DIR}/png/src/png)
 set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install)
 
 if(WIN32)
-  set(png_STATIC_LIBRARIES 
-    debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
-    optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(png_STATIC_LIBRARIES 
+      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
+      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+  else()
+    if(CMAKE_BUILD_TYPE EQUAL Debug)
+      set(png_STATIC_LIBRARIES 
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib)
+    else()
+      set(png_STATIC_LIBRARIES 
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+    endif()
+  endif()
 else()
   set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12.a)
 endif()
@@ -38,6 +48,7 @@ ExternalProject_Add(png
     DEPENDS zlib
     URL ${png_URL}
     URL_HASH ${png_HASH}
+    BUILD_BYPRODUCTS ${png_STATIC_LIBRARIES}
     INSTALL_DIR ${png_INSTALL}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     CMAKE_CACHE_ARGS
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index fd05fa6d47..aba8a5244e 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -19,11 +19,34 @@ set(PROTOBUF_URL https://github.com/google/protobuf.git)
 set(PROTOBUF_TAG 396336eb961b75f03b25824fe86cf6490fb75e3a)
 
 if(WIN32)
-  set(protobuf_STATIC_LIBRARIES 
-    debug ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobufd.lib
-    optimized ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobuf.lib)
-  set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/protoc.exe)
-  set(PROTOBUF_ADDITIONAL_CMAKE_OPTIONS	-Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF -A x64)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(protobuf_STATIC_LIBRARIES 
+      debug ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobufd.lib
+      optimized ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobuf.lib)
+    set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/protoc.exe)
+  else()
+    if(CMAKE_BUILD_TYPE EQUAL Debug)
+      set(protobuf_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/libprotobufd.lib)
+    else()
+      set(protobuf_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/libprotobuf.lib)
+    endif()
+    set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/protoc.exe)
+  endif()
+
+  # This section is to make sure CONFIGURE_COMMAND use the same generator settings
+  set(PROTOBUF_GENERATOR_PLATFORM)
+  if (CMAKE_GENERATOR_PLATFORM)
+    set(PROTOBUF_GENERATOR_PLATFORM -A ${CMAKE_GENERATOR_PLATFORM})
+  endif()
+  set(PROTOBUF_GENERATOR_TOOLSET)
+  if (CMAKE_GENERATOR_TOOLSET)
+  set(PROTOBUF_GENERATOR_TOOLSET -T ${CMAKE_GENERATOR_TOOLSET})
+  endif()
+  set(PROTOBUF_ADDITIONAL_CMAKE_OPTIONS	-Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF
+    -G${CMAKE_GENERATOR} ${PROTOBUF_GENERATOR_PLATFORM} ${PROTOBUF_GENERATOR_TOOLSET})
+  # End of section
 else()
   set(protobuf_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/libprotobuf.a)
   set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/protoc)
@@ -36,10 +59,15 @@ ExternalProject_Add(protobuf
     GIT_TAG ${PROTOBUF_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${PROTOBUF_PROTOC_EXECUTABLE} ${protobuf_STATIC_LIBRARIES}
     SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf
+    # SOURCE_SUBDIR cmake/ # Requires CMake 3.7, this will allow removal of CONFIGURE_COMMAND
+    # CONFIGURE_COMMAND resets some settings made in CMAKE_CACHE_ARGS and the generator used
     CONFIGURE_COMMAND ${CMAKE_COMMAND} cmake/
-        -Dprotobuf_BUILD_TESTS=OFF
-        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -Dprotobuf_BUILD_TESTS:BOOL=OFF
         -DZLIB_ROOT=${ZLIB_INSTALL}
         ${PROTOBUF_ADDITIONAL_CMAKE_OPTIONS}
     INSTALL_COMMAND ""
@@ -47,5 +75,7 @@ ExternalProject_Add(protobuf
         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -Dprotobuf_BUILD_TESTS:BOOL=OFF
+        -Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF
         -DZLIB_ROOT:STRING=${ZLIB_INSTALL}
 )
diff --git a/tensorflow/contrib/cmake/external/re2.cmake b/tensorflow/contrib/cmake/external/re2.cmake
index 371d8447f9..c4bc0b1707 100644
--- a/tensorflow/contrib/cmake/external/re2.cmake
+++ b/tensorflow/contrib/cmake/external/re2.cmake
@@ -21,7 +21,11 @@ set(re2_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/re2/install)
 set(re2_TAG e7efc48)
 
 if(WIN32)
-  set(re2_STATIC_LIBRARIES ${re2_BUILD}/$(Configuration)/re2.lib)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(re2_STATIC_LIBRARIES ${re2_BUILD}/$(Configuration)/re2.lib)
+  else()
+    set(re2_STATIC_LIBRARIES ${re2_BUILD}/re2.lib)
+  endif()
 else()
   set(re2_STATIC_LIBRARIES ${re2_BUILD}/libre2.a)
 endif()
@@ -36,6 +40,7 @@ ExternalProject_Add(re2
     GIT_TAG ${re2_TAG}
     INSTALL_DIR ${re2_INSTALL}
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${re2_STATIC_LIBRARIES}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     CMAKE_CACHE_ARGS
         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
diff --git a/tensorflow/contrib/cmake/external/snappy.cmake b/tensorflow/contrib/cmake/external/snappy.cmake
index fd57734298..f54197643b 100644
--- a/tensorflow/contrib/cmake/external/snappy.cmake
+++ b/tensorflow/contrib/cmake/external/snappy.cmake
@@ -20,7 +20,11 @@ set(snappy_BUILD ${CMAKE_CURRENT_BINARY_DIR}/snappy/src/snappy)
 set(snappy_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/snappy/src/snappy)
 
 if(WIN32)
-    set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/$(Configuration)/snappy.lib)
+    if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+        set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/$(Configuration)/snappy.lib)
+    else()
+        set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/snappy.lib)
+    endif()
 else()
     set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/libsnappy.a)
 endif()
@@ -35,6 +39,7 @@ ExternalProject_Add(snappy
     GIT_TAG ${snappy_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${snappy_STATIC_LIBRARIES}
     INSTALL_COMMAND ""
     LOG_DOWNLOAD ON
     LOG_CONFIGURE ON
diff --git a/tensorflow/contrib/cmake/external/sqlite.cmake b/tensorflow/contrib/cmake/external/sqlite.cmake
index 8297c60712..57c4ae7651 100644
--- a/tensorflow/contrib/cmake/external/sqlite.cmake
+++ b/tensorflow/contrib/cmake/external/sqlite.cmake
@@ -36,6 +36,7 @@ if (WIN32)
         PREFIX sqlite
         URL ${sqlite_URL}
         URL_HASH ${sqlite_HASH}
+        BUILD_BYPRODUCTS ${sqlite_STATIC_LIBRARIES}
         PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/sqlite/CMakeLists.txt ${sqlite_BUILD}
         INSTALL_DIR ${sqlite_INSTALL}
         DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
diff --git a/tensorflow/contrib/cmake/external/zlib.cmake b/tensorflow/contrib/cmake/external/zlib.cmake
index 5bec14fb00..c5eb0cbcc7 100644
--- a/tensorflow/contrib/cmake/external/zlib.cmake
+++ b/tensorflow/contrib/cmake/external/zlib.cmake
@@ -21,9 +21,19 @@ set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
 set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
 
 if(WIN32)
-  set(zlib_STATIC_LIBRARIES
-      debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
-      optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(zlib_STATIC_LIBRARIES
+        debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
+        optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
+  else()
+    if(CMAKE_BUILD_TYPE EQUAL Debug)
+      set(zlib_STATIC_LIBRARIES
+          ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib)
+    else()
+      set(zlib_STATIC_LIBRARIES
+          ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
+    endif()
+  endif()
 else()
   set(zlib_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
@@ -40,6 +50,7 @@ ExternalProject_Add(zlib
     GIT_TAG ${ZLIB_TAG}
     INSTALL_DIR ${ZLIB_INSTALL}
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${zlib_STATIC_LIBRARIES}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     CMAKE_CACHE_ARGS
         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
diff --git a/tensorflow/contrib/cmake/tests/cuda/compatibility_test.c b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.c
new file mode 100644
index 0000000000..968ab13a0c
--- /dev/null
+++ b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.c
@@ -0,0 +1,7 @@
+// This is a program to test if compiler is compatible with CUDA.
+#define __CUDACC__
+#include "crt/host_config.h"
+
+int main(void) {
+  return 0;
+}
diff --git a/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc
new file mode 100644
index 0000000000..968ab13a0c
--- /dev/null
+++ b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc
@@ -0,0 +1,7 @@
+// This is a program to test if compiler is compatible with CUDA.
+#define __CUDACC__
+#include "crt/host_config.h"
+
+int main(void) {
+  return 0;
+}
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index f3cf3e7044..f73da0b8ab 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -149,7 +149,11 @@ add_library(tf_cc OBJECT ${tf_cc_srcs})
 add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
 
 if (WIN32)
-  set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
+  else()
+    set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib")
+  endif()
 else (WIN32)
   set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so")
 endif (WIN32)
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 34c466fa01..27ecf78036 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -541,7 +541,11 @@ if(WIN32)
 	${nsync_STATIC_LIBRARIES}
     )
 
-    set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow.def")
+    if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+        set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow.def")
+    else()
+        set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow.def")
+    endif()
     set_source_files_properties(${pywrap_tensorflow_deffile} PROPERTIES GENERATED TRUE)
 
     add_custom_command(TARGET pywrap_tensorflow_internal_static POST_BUILD
@@ -549,6 +553,7 @@ if(WIN32)
             --input "${pywrap_tensorflow_internal_static_dependencies}"
             --output "${pywrap_tensorflow_deffile}"
             --target _pywrap_tensorflow_internal.pyd
+        BYPRODUCTS ${pywrap_tensorflow_deffile} # Required for Ninja
     )
 endif(WIN32)
 
@@ -702,11 +707,19 @@ add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/)
 
 if(WIN32)
-  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  else()
+    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  endif()
 else()
   add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 571d2b0dec..6d36d5fc5c 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -46,7 +46,11 @@ if(WIN32)
       $<TARGET_FILE:tf_protos_cc>
   )
 
-  set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/tensorflow.def")
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/tensorflow.def")
+  else()
+    set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/tensorflow.def")
+  endif()
   set_source_files_properties(${tensorflow_deffile} PROPERTIES GENERATED TRUE)
 
   add_custom_command(TARGET tensorflow_static POST_BUILD
-- 
GitLab


From 0e42ad5756e4675b513cdd7cda405620e9501612 Mon Sep 17 00:00:00 2001
From: Sergei Lebedev <superbobry@gmail.com>
Date: Thu, 8 Feb 2018 01:26:36 +0100
Subject: [PATCH 791/966] Fixed a typo in `group_by_window` documentation
 (#16711)

---
 tensorflow/contrib/data/python/ops/grouping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index ef91c56726..67b085002a 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -45,7 +45,7 @@ def group_by_window(key_func,
     key_func: A function mapping a nested structure of tensors
       (having shapes and types defined by `self.output_shapes` and
       `self.output_types`) to a scalar `tf.int64` tensor.
-    reduce_func: A function mapping a key and a dataset of up to `batch_size`
+    reduce_func: A function mapping a key and a dataset of up to `window_size`
       consecutive elements matching that key to another dataset.
     window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
       consecutive elements matching the same key to combine in a single
-- 
GitLab


From 7ee1613aad72301bdbad192747edaac1ff94ac1d Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Thu, 8 Feb 2018 01:28:06 +0100
Subject: [PATCH 792/966] Fix undefined name: import as_str_any for line 35
 (#16668)

flake8 testing of https://github.com/tensorflow/tensorflow on Python 2.7.14

$ __flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics__
```
./tensorflow/python/util/compat_internal.py:33:12: F821 undefined name 'as_str_any'
    path = as_str_any(path.__fspath__())
           ^
```
---
 tensorflow/python/util/compat_internal.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/python/util/compat_internal.py b/tensorflow/python/util/compat_internal.py
index a299b2fc3c..9e60e689d2 100644
--- a/tensorflow/python/util/compat_internal.py
+++ b/tensorflow/python/util/compat_internal.py
@@ -19,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.util.compat import as_str_any
+
 
 def path_to_str(path):
   """Returns the file system path representation of a `PathLike` object, else as it is.
-- 
GitLab


From 14ebbebc290510b6cfa491349862e6c5aca4200a Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Wed, 7 Feb 2018 16:24:34 -0800
Subject: [PATCH 793/966] Remove tf.contrib.ndlstm as it is not maintained and
 barely used.

Users can find an external implementation by the original author at:

https://github.com/tmbarchive/tfndlstm

PiperOrigin-RevId: 184914822
---
 tensorflow/BUILD                              |   1 -
 tensorflow/contrib/BUILD                      |   1 -
 tensorflow/contrib/__init__.py                |   1 -
 tensorflow/contrib/cmake/python_modules.txt   |   2 -
 tensorflow/contrib/ndlstm/BUILD               |  92 --------
 tensorflow/contrib/ndlstm/README.md           |  31 ---
 tensorflow/contrib/ndlstm/__init__.py         |  22 --
 tensorflow/contrib/ndlstm/python/__init__.py  |  25 --
 tensorflow/contrib/ndlstm/python/lstm1d.py    | 184 ---------------
 .../contrib/ndlstm/python/lstm1d_test.py      | 106 ---------
 tensorflow/contrib/ndlstm/python/lstm2d.py    | 213 ------------------
 .../contrib/ndlstm/python/lstm2d_test.py      |  98 --------
 tensorflow/contrib/ndlstm/python/misc.py      |  99 --------
 tensorflow/contrib/ndlstm/python/misc_test.py |  78 -------
 tensorflow/contrib/specs/BUILD                |   1 -
 tensorflow/contrib/specs/README.md            |  11 -
 tensorflow/contrib/specs/python/specs_ops.py  |  20 --
 tensorflow/contrib/specs/python/specs_test.py |  30 ---
 tensorflow/tools/docs/generate_1_0.py         |   1 -
 tensorflow/tools/docs/generate_lib.py         |   1 -
 tensorflow/tools/pip_package/BUILD            |   2 -
 21 files changed, 1019 deletions(-)
 delete mode 100644 tensorflow/contrib/ndlstm/BUILD
 delete mode 100644 tensorflow/contrib/ndlstm/README.md
 delete mode 100644 tensorflow/contrib/ndlstm/__init__.py
 delete mode 100644 tensorflow/contrib/ndlstm/python/__init__.py
 delete mode 100644 tensorflow/contrib/ndlstm/python/lstm1d.py
 delete mode 100644 tensorflow/contrib/ndlstm/python/lstm1d_test.py
 delete mode 100644 tensorflow/contrib/ndlstm/python/lstm2d.py
 delete mode 100644 tensorflow/contrib/ndlstm/python/lstm2d_test.py
 delete mode 100644 tensorflow/contrib/ndlstm/python/misc.py
 delete mode 100644 tensorflow/contrib/ndlstm/python/misc_test.py

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index a73e89bc1a..2e71783b0d 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -535,7 +535,6 @@ filegroup(
         "//tensorflow/contrib/model_pruning:all_files",
         "//tensorflow/contrib/model_pruning/examples/cifar10:all_files",
         "//tensorflow/contrib/nccl:all_files",
-        "//tensorflow/contrib/ndlstm:all_files",
         "//tensorflow/contrib/nearest_neighbor:all_files",
         "//tensorflow/contrib/nn:all_files",
         "//tensorflow/contrib/opt:all_files",
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 3ed8cef56c..f48c2fe92d 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -71,7 +71,6 @@ py_library(
         "//tensorflow/contrib/metrics:metrics_py",
         "//tensorflow/contrib/model_pruning",
         "//tensorflow/contrib/nccl:nccl_py",
-        "//tensorflow/contrib/ndlstm",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_py",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/opt:opt_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 46b579b889..4f6f539027 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -84,7 +84,6 @@ from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
 from tensorflow.contrib.lite.python import lite
-from tensorflow.contrib.ndlstm import python as ndlstm
 from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.contrib.remote_fused_graph import pylib as remote_fused_graph
 from tensorflow.contrib.specs import python as specs
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 57a52bf4ca..2720c43b78 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -329,8 +329,6 @@ tensorflow/contrib/nccl/kernels
 tensorflow/contrib/nccl/ops
 tensorflow/contrib/nccl/python
 tensorflow/contrib/nccl/python/ops
-tensorflow/contrib/ndlstm
-tensorflow/contrib/ndlstm/python
 tensorflow/contrib/nearest_neighbor/kernels
 tensorflow/contrib/nearest_neighbor/ops
 tensorflow/contrib/nearest_neighbor/python
diff --git a/tensorflow/contrib/ndlstm/BUILD b/tensorflow/contrib/ndlstm/BUILD
deleted file mode 100644
index 8403f84188..0000000000
--- a/tensorflow/contrib/ndlstm/BUILD
+++ /dev/null
@@ -1,92 +0,0 @@
-# Description:
-#   Contains classes implementing 1D and 2D LSTMs for image and signal
-#   processing problems.
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-package(default_visibility = ["//tensorflow:__subpackages__"])
-
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-
-py_library(
-    name = "ndlstm",
-    srcs = [
-        "__init__.py",
-        "python/__init__.py",
-        "python/lstm1d.py",
-        "python/lstm2d.py",
-        "python/misc.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/rnn:rnn_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:rnn",
-        "//tensorflow/python:rnn_cell",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-    ],
-)
-
-tf_py_test(
-    name = "lstm1d_test",
-    srcs = ["python/lstm1d_test.py"],
-    additional_deps = [
-        ":ndlstm",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:variables",
-    ],
-)
-
-tf_py_test(
-    name = "lstm2d_test",
-    srcs = ["python/lstm2d_test.py"],
-    additional_deps = [
-        ":ndlstm",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:variables",
-    ],
-)
-
-tf_py_test(
-    name = "misc_test",
-    srcs = ["python/misc_test.py"],
-    additional_deps = [
-        ":ndlstm",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:variables",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/ndlstm/README.md b/tensorflow/contrib/ndlstm/README.md
deleted file mode 100644
index 7ccb57f1b3..0000000000
--- a/tensorflow/contrib/ndlstm/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-Library of multidimensional LSTM models and related code.
-
-# 2D LSTM code
-
-The 2D LSTM layers take tensors of the form (batch_size, height, width,
-depth), compatible with convolutional layers, as inputs. The library
-transposes and reshapes these tensors in a way that allows batches of
-images to be processed by LSTMs.
-
-The library currently provides:
-
- - a separable 2D LSTM layer
- - a simple 2D convolutional layer that can be swapped out against 2D LSTM
- - layers to reduce images to sequences and images to final state vectors
- - layers for sequence classification, pixel-wise classification
-
-# Other Dimensions
-
-There is 1D LSTM code in `lstm1d.py`. This code implements 1D LSTM versions
-suitable as a basis for higher dimensional LSTMs. It is intended for constant
-batch size and uses a different layout.  Although the code is perfectly fine for
-1D use, you may find other 1D LSTM implementations to be more convenient if you
-are interested in sequence problems.
-
-# Upcoming Changes
-
- - PyramidLSTM
- - support for 3D and 4D
- - optional use of native fused LSTM op
- - easy-to-use command line drivers and examples
- - operators for patch-wise processing
diff --git a/tensorflow/contrib/ndlstm/__init__.py b/tensorflow/contrib/ndlstm/__init__.py
deleted file mode 100644
index a5dd100b26..0000000000
--- a/tensorflow/contrib/ndlstm/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Library of multidimensional LSTM models and related code."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.ndlstm.python import lstm1d
-from tensorflow.contrib.ndlstm.python import lstm2d
diff --git a/tensorflow/contrib/ndlstm/python/__init__.py b/tensorflow/contrib/ndlstm/python/__init__.py
deleted file mode 100644
index 1aa51a6ec4..0000000000
--- a/tensorflow/contrib/ndlstm/python/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Init file, giving convenient access to all ndlstm ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=wildcard-import,g-importing-member
-from tensorflow.contrib.ndlstm.python.lstm1d import *
-from tensorflow.contrib.ndlstm.python.lstm2d import *
-from tensorflow.contrib.ndlstm.python.misc import *
-# pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/ndlstm/python/lstm1d.py b/tensorflow/contrib/ndlstm/python/lstm1d.py
deleted file mode 100644
index 2e2e9086c0..0000000000
--- a/tensorflow/contrib/ndlstm/python/lstm1d.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""LSTM layers for sequences."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-from tensorflow.contrib.framework.python.ops import variables
-from tensorflow.python.framework import constant_op
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import rnn
-from tensorflow.python.ops import rnn_cell
-from tensorflow.python.ops import variable_scope
-
-
-def _shape(tensor):
-  return tensor.get_shape().as_list()
-
-
-def ndlstm_base_unrolled(inputs, noutput, scope=None, reverse=False):
-  """Run an LSTM, either forward or backward.
-
-  This is a 1D LSTM implementation using unrolling and the TensorFlow
-  LSTM op.
-
-  Args:
-    inputs: input sequence (length, batch_size, ninput)
-    noutput: depth of output
-    scope: optional scope name
-    reverse: run LSTM in reverse
-
-  Returns:
-    Output sequence (length, batch_size, noutput)
-
-  """
-  with variable_scope.variable_scope(scope, "SeqLstmUnrolled", [inputs]):
-    length, batch_size, _ = _shape(inputs)
-    lstm_cell = rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False)
-    state = array_ops.zeros([batch_size, lstm_cell.state_size])
-    output_u = []
-    inputs_u = array_ops.unstack(inputs)
-    if reverse:
-      inputs_u = list(reversed(inputs_u))
-    for i in xrange(length):
-      if i > 0:
-        variable_scope.get_variable_scope().reuse_variables()
-      output, state = lstm_cell(inputs_u[i], state)
-      output_u += [output]
-    if reverse:
-      output_u = list(reversed(output_u))
-    outputs = array_ops.stack(output_u)
-    return outputs
-
-
-def ndlstm_base_dynamic(inputs, noutput, scope=None, reverse=False):
-  """Run an LSTM, either forward or backward.
-
-  This is a 1D LSTM implementation using dynamic_rnn and
-  the TensorFlow LSTM op.
-
-  Args:
-    inputs: input sequence (length, batch_size, ninput)
-    noutput: depth of output
-    scope: optional scope name
-    reverse: run LSTM in reverse
-
-  Returns:
-    Output sequence (length, batch_size, noutput)
-  """
-  with variable_scope.variable_scope(scope, "SeqLstm", [inputs]):
-    lstm_cell = rnn_cell.BasicLSTMCell(noutput)
-    if reverse:
-      inputs = array_ops.reverse_v2(inputs, [0])
-    outputs, _ = rnn.dynamic_rnn(
-        lstm_cell, inputs, time_major=True, dtype=inputs.dtype)
-    if reverse:
-      outputs = array_ops.reverse_v2(outputs, [0])
-    return outputs
-
-
-def ndlstm_base(inputs, noutput, scope=None, reverse=False, dynamic=True):
-  """Implements a 1D LSTM, either forward or backward.
-
-  This is a base case for multidimensional LSTM implementations, which
-  tend to be used differently from sequence-to-sequence
-  implementations.  For general 1D sequence to sequence
-  transformations, you may want to consider another implementation
-  from TF slim.
-
-  Args:
-    inputs: input sequence (length, batch_size, ninput)
-    noutput: depth of output
-    scope: optional scope name
-    reverse: run LSTM in reverse
-    dynamic: use dynamic_rnn
-
-  Returns:
-    Output sequence (length, batch_size, noutput)
-
-  """
-  # TODO(tmb) maybe add option for other LSTM implementations, like
-  # slim.rnn.basic_lstm_cell
-  if dynamic:
-    return ndlstm_base_dynamic(inputs, noutput, scope=scope, reverse=reverse)
-  else:
-    return ndlstm_base_unrolled(inputs, noutput, scope=scope, reverse=reverse)
-
-
-def sequence_to_final(inputs, noutput, scope=None, name=None, reverse=False):
-  """Run an LSTM across all steps and returns only the final state.
-
-  Args:
-    inputs: (length, batch_size, depth) tensor
-    noutput: size of output vector
-    scope: optional scope name
-    name: optional name for output tensor
-    reverse: run in reverse
-
-  Returns:
-    Batch of size (batch_size, noutput).
-  """
-  with variable_scope.variable_scope(scope, "SequenceToFinal", [inputs]):
-    length, batch_size, _ = _shape(inputs)
-    lstm = rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False)
-    state = array_ops.zeros([batch_size, lstm.state_size])
-    inputs_u = array_ops.unstack(inputs)
-    if reverse:
-      inputs_u = list(reversed(inputs_u))
-    for i in xrange(length):
-      if i > 0:
-        variable_scope.get_variable_scope().reuse_variables()
-      output, state = lstm(inputs_u[i], state)
-    outputs = array_ops.reshape(output, [batch_size, noutput], name=name)
-    return outputs
-
-
-def sequence_softmax(inputs, noutput, scope=None, name=None, linear_name=None):
-  """Run a softmax layer over all the time steps of an input sequence.
-
-  Args:
-    inputs: (length, batch_size, depth) tensor
-    noutput: output depth
-    scope: optional scope name
-    name: optional name for output tensor
-    linear_name: name for linear (pre-softmax) output
-
-  Returns:
-    A tensor of size (length, batch_size, noutput).
-
-  """
-  length, _, ninputs = _shape(inputs)
-  inputs_u = array_ops.unstack(inputs)
-  output_u = []
-  with variable_scope.variable_scope(scope, "SequenceSoftmax", [inputs]):
-    initial_w = random_ops.truncated_normal([0 + ninputs, noutput], stddev=0.1)
-    initial_b = constant_op.constant(0.1, shape=[noutput])
-    w = variables.model_variable("weights", initializer=initial_w)
-    b = variables.model_variable("biases", initializer=initial_b)
-    for i in xrange(length):
-      with variable_scope.variable_scope(scope, "SequenceSoftmaxStep",
-                                         [inputs_u[i]]):
-        # TODO(tmb) consider using slim.fully_connected(...,
-        # activation_fn=tf.nn.softmax)
-        linear = nn_ops.xw_plus_b(inputs_u[i], w, b, name=linear_name)
-        output = nn_ops.softmax(linear)
-        output_u += [output]
-    outputs = array_ops.stack(output_u, name=name)
-  return outputs
diff --git a/tensorflow/contrib/ndlstm/python/lstm1d_test.py b/tensorflow/contrib/ndlstm/python/lstm1d_test.py
deleted file mode 100644
index 49b15cc814..0000000000
--- a/tensorflow/contrib/ndlstm/python/lstm1d_test.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for 1D LSTM."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.ndlstm.python import lstm1d as lstm1d_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-lstm1d = lstm1d_lib
-
-
-def _rand(*size):
-  return np.random.uniform(size=size).astype("f")
-
-
-class Lstm1DTest(test.TestCase):
-
-  def testSequenceToSequenceDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(17, 1, 5))
-      outputs = lstm1d.ndlstm_base(inputs, 8)
-      variables.global_variables_initializer().run()
-      names = [v.name for v in variables.trainable_variables()]
-      self.assertEqual(len(names), 2)
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (17, 1, 8))
-
-  def testSequenceToSequenceGradient(self):
-    with self.test_session():
-      size = (17, 1, 15)
-      output_size = (17, 1, 8)
-      inputs = constant_op.constant(_rand(*size))
-      outputs = lstm1d.ndlstm_base(inputs, 8, dynamic=False)
-      variables.global_variables_initializer().run()
-      gradients = gradients_impl.gradients(outputs, inputs)
-      if 1:  # pylint: disable=using-constant-test
-        gradients = gradients_impl.gradients(outputs, inputs)[0].eval()
-        self.assertEqual(gradients.shape, size)
-      else:
-        # TODO(tmb) tf.test.compute_gradient error is currently broken
-        # with dynamic_rnn. Enable this test case eventually.
-        err = gradient_checker.compute_gradient_error(
-            inputs, size, outputs, output_size, delta=1e-4)
-        self.assert_(not np.isnan(err))
-        self.assert_(err < 0.1)
-
-  def testSequenceToSequenceGradientReverse(self):
-    with self.test_session():
-      size = (17, 1, 15)
-      output_size = (17, 1, 8)
-      inputs = constant_op.constant(_rand(*size))
-      outputs = lstm1d.ndlstm_base(inputs, 8, reverse=1, dynamic=False)
-      variables.global_variables_initializer().run()
-      if 1:  # pylint: disable=using-constant-test
-        gradients = gradients_impl.gradients(outputs, inputs)[0].eval()
-        self.assertEqual(gradients.shape, size)
-      else:
-        # TODO(tmb) tf.test.compute_gradient error is currently broken
-        # with dynamic_rnn. Enable this test case eventually.
-        err = gradient_checker.compute_gradient_error(
-            inputs, size, outputs, output_size, delta=1e-4)
-        self.assert_(not np.isnan(err))
-        self.assert_(err < 0.1)
-
-  def testSequenceToFinalDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(17, 6, 5))
-      outputs = lstm1d.sequence_to_final(inputs, 8)
-      variables.global_variables_initializer().run()
-      names = [v.name for v in variables.trainable_variables()]
-      self.assertEqual(len(names), 2)
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (6, 8))
-
-  def testSequenceSoftmaxDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(17, 1, 5))
-      outputs = lstm1d.sequence_softmax(inputs, 8)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (17, 1, 8))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/ndlstm/python/lstm2d.py b/tensorflow/contrib/ndlstm/python/lstm2d.py
deleted file mode 100644
index ebbb4ccf11..0000000000
--- a/tensorflow/contrib/ndlstm/python/lstm2d.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""A small library of functions dealing with LSTMs applied to images.
-
-Tensors in this library generally have the shape (num_images, height, width,
-depth).
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.ndlstm.python import lstm1d
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variable_scope
-
-
-def _shape(tensor):
-  """Get the shape of a tensor as an int list."""
-  return tensor.get_shape().as_list()
-
-
-def images_to_sequence(tensor):
-  """Convert a batch of images into a batch of sequences.
-
-  Args:
-    tensor: a (num_images, height, width, depth) tensor
-
-  Returns:
-    (width, num_images*height, depth) sequence tensor
-  """
-
-  num_image_batches, height, width, depth = _shape(tensor)
-  transposed = array_ops.transpose(tensor, [2, 0, 1, 3])
-  return array_ops.reshape(transposed,
-                           [width, num_image_batches * height, depth])
-
-
-def sequence_to_images(tensor, num_image_batches):
-  """Convert a batch of sequences into a batch of images.
-
-  Args:
-    tensor: (num_steps, num_batches, depth) sequence tensor
-    num_image_batches: the number of image batches
-
-  Returns:
-    (num_images, height, width, depth) tensor
-  """
-
-  width, num_batches, depth = _shape(tensor)
-  height = num_batches // num_image_batches
-  reshaped = array_ops.reshape(tensor,
-                               [width, num_image_batches, height, depth])
-  return array_ops.transpose(reshaped, [1, 2, 0, 3])
-
-
-def horizontal_lstm(images, num_filters_out, scope=None):
-  """Run an LSTM bidirectionally over all the rows of each image.
-
-  Args:
-    images: (num_images, height, width, depth) tensor
-    num_filters_out: output depth
-    scope: optional scope name
-
-  Returns:
-    (num_images, height, width, num_filters_out) tensor, where
-    num_steps is width and new num_batches is num_image_batches * height
-  """
-  with variable_scope.variable_scope(scope, "HorizontalLstm", [images]):
-    batch_size, _, _, _ = _shape(images)
-    sequence = images_to_sequence(images)
-    with variable_scope.variable_scope("lr"):
-      hidden_sequence_lr = lstm1d.ndlstm_base(sequence, num_filters_out // 2)
-    with variable_scope.variable_scope("rl"):
-      hidden_sequence_rl = (lstm1d.ndlstm_base(
-          sequence, num_filters_out - num_filters_out // 2, reverse=1))
-    output_sequence = array_ops.concat([hidden_sequence_lr, hidden_sequence_rl],
-                                       2)
-    output = sequence_to_images(output_sequence, batch_size)
-    return output
-
-
-def get_blocks(images, kernel_size):
-  """Split images in blocks
-
-  Args:
-    images: (num_images, height, width, depth) tensor
-    kernel_size: A list of length 2 holding the [kernel_height, kernel_width] of
-      of the pooling. Can be an int if both values are the same.
-
-  Returns:
-    (num_images, height/kernel_height, width/kernel_width,
-    depth*kernel_height*kernel_width) tensor
-  """
-  with variable_scope.variable_scope("image_blocks"):
-    batch_size, height, width, chanels = _shape(images)
-
-    if height % kernel_size[0] != 0:
-      offset = array_ops.zeros([batch_size,
-                                kernel_size[0] - (height % kernel_size[0]),
-                                width,
-                                chanels])
-      images = array_ops.concat([images, offset], 1)
-      batch_size, height, width, chanels = _shape(images)
-    if width % kernel_size[1] != 0:
-      offset = array_ops.zeros([batch_size,
-                                height,
-                                kernel_size[1] - (width % kernel_size[1]),
-                                chanels])
-      images = array_ops.concat([images, offset], 2)
-      batch_size, height, width, chanels = _shape(images)
-
-    h, w = int(height / kernel_size[0]), int(width / kernel_size[1])
-    features = kernel_size[1] * kernel_size[0] * chanels
-
-    lines = array_ops.split(images, h, axis=1)
-    line_blocks = []
-    for line in lines:
-      line = array_ops.transpose(line, [0, 2, 3, 1])
-      line = array_ops.reshape(line, [batch_size, w, features])
-      line_blocks.append(line)
-
-    return array_ops.stack(line_blocks, axis=1)
-
-
-def separable_lstm(images, num_filters_out,
-                   kernel_size=None, nhidden=None, scope=None):
-  """Run bidirectional LSTMs first horizontally then vertically.
-
-  Args:
-    images: (num_images, height, width, depth) tensor
-    num_filters_out: output layer depth
-    kernel_size: A list of length 2 holding the [kernel_height, kernel_width] of
-      of the pooling. Can be an int if both values are the same. Set to None for
-      not using blocks
-    nhidden: hidden layer depth
-    scope: optional scope name
-
-  Returns:
-    (num_images, height/kernel_height, width/kernel_width,
-    num_filters_out) tensor
-  """
-  with variable_scope.variable_scope(scope, "SeparableLstm", [images]):
-    if nhidden is None:
-      nhidden = num_filters_out
-    if kernel_size is not None:
-      images = get_blocks(images, kernel_size)
-    hidden = horizontal_lstm(images, nhidden)
-    with variable_scope.variable_scope("vertical"):
-      transposed = array_ops.transpose(hidden, [0, 2, 1, 3])
-      output_transposed = horizontal_lstm(transposed, num_filters_out)
-    output = array_ops.transpose(output_transposed, [0, 2, 1, 3])
-    return output
-
-
-def reduce_to_sequence(images, num_filters_out, scope=None):
-  """Reduce an image to a sequence by scanning an LSTM vertically.
-
-  Args:
-    images: (num_images, height, width, depth) tensor
-    num_filters_out: output layer depth
-    scope: optional scope name
-
-  Returns:
-    A (width, num_images, num_filters_out) sequence.
-  """
-  with variable_scope.variable_scope(scope, "ReduceToSequence", [images]):
-    batch_size, height, width, depth = _shape(images)
-    transposed = array_ops.transpose(images, [1, 0, 2, 3])
-    reshaped = array_ops.reshape(transposed,
-                                 [height, batch_size * width, depth])
-    reduced = lstm1d.sequence_to_final(reshaped, num_filters_out)
-    output = array_ops.reshape(reduced, [batch_size, width, num_filters_out])
-    return output
-
-
-def reduce_to_final(images, num_filters_out, nhidden=None, scope=None):
-  """Reduce an image to a final state by running two LSTMs.
-
-  Args:
-    images: (num_images, height, width, depth) tensor
-    num_filters_out: output layer depth
-    nhidden: hidden layer depth (defaults to num_filters_out)
-    scope: optional scope name
-
-  Returns:
-    A (num_images, num_filters_out) batch.
-  """
-  with variable_scope.variable_scope(scope, "ReduceToFinal", [images]):
-    nhidden = nhidden or num_filters_out
-    batch_size, height, width, depth = _shape(images)
-    transposed = array_ops.transpose(images, [1, 0, 2, 3])
-    reshaped = array_ops.reshape(transposed,
-                                 [height, batch_size * width, depth])
-    with variable_scope.variable_scope("reduce1"):
-      reduced = lstm1d.sequence_to_final(reshaped, nhidden)
-      transposed_hidden = array_ops.reshape(reduced,
-                                            [batch_size, width, nhidden])
-      hidden = array_ops.transpose(transposed_hidden, [1, 0, 2])
-    with variable_scope.variable_scope("reduce2"):
-      output = lstm1d.sequence_to_final(hidden, num_filters_out)
-    return output
diff --git a/tensorflow/contrib/ndlstm/python/lstm2d_test.py b/tensorflow/contrib/ndlstm/python/lstm2d_test.py
deleted file mode 100644
index f1b37d701b..0000000000
--- a/tensorflow/contrib/ndlstm/python/lstm2d_test.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for 2D LSTMs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.ndlstm.python import lstm2d as lstm2d_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-lstm2d = lstm2d_lib
-
-
-def _rand(*size):
-  return np.random.uniform(size=size).astype("f")
-
-
-class Lstm2DTest(test_util.TensorFlowTestCase):
-
-  def testImagesToSequenceDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = lstm2d.images_to_sequence(inputs)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (11, 14, 5))
-
-  def testSequenceToImagesDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(11, 14, 5))
-      outputs = lstm2d.sequence_to_images(inputs, 2)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 7, 11, 5))
-
-  def testImagesAndSequenceDims(self):
-    with self.test_session():
-      size = (2, 7, 11, 5)
-      inputs = constant_op.constant(_rand(*size))
-      sequence = lstm2d.images_to_sequence(inputs)
-      outputs = lstm2d.sequence_to_images(sequence, size[0])
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), size)
-
-  def testSeparableLstmDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = lstm2d.separable_lstm(inputs, 8)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 7, 11, 8))
-
-  def testSeparableLstmDimsBlocks(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = lstm2d.separable_lstm(inputs, 8, kernel_size=[2, 2])
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 4, 6, 8))
-
-  def testReduceToSequenceDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = lstm2d.reduce_to_sequence(inputs, 8)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 11, 8))
-
-  def testReduceToFinalDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = lstm2d.reduce_to_final(inputs, 8, 12)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 8))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/ndlstm/python/misc.py b/tensorflow/contrib/ndlstm/python/misc.py
deleted file mode 100644
index 38eeff84ca..0000000000
--- a/tensorflow/contrib/ndlstm/python/misc.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Miscellaneous functions useful for nD-LSTM models.
-
-Some of these functions duplicate functionality in tfslim with
-slightly different interfaces.
-
-Tensors in this library generally have the shape (num_images, height, width,
-depth).
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.layers.python.layers import layers
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import sparse_ops
-
-
-def _shape(tensor):
-  """Get the shape of a tensor as an int list."""
-  return tensor.get_shape().as_list()
-
-
-def pixels_as_vector(images, scope=None):
-  """Reduce images to vectors by combining all pixels."""
-  with ops.name_scope(scope, "PixelsAsVector", [images]):
-    batch_size, height, width, depth = _shape(images)
-    return array_ops.reshape(images, [batch_size, height * width * depth])
-
-
-def pool_as_vector(images, scope=None):
-  """Reduce images to vectors by averaging all pixels."""
-  with ops.name_scope(scope, "PoolAsVector", [images]):
-    return math_ops.reduce_mean(images, [1, 2])
-
-
-def one_hot_planes(labels, num_classes, scope=None):
-  """Compute 1-hot encodings for planes.
-
-  Given a label, this computes a label image that contains
-  1 at all pixels in the plane corresponding to the target
-  class and 0 in all other planes.
-
-  Args:
-    labels: (batch_size,) tensor
-    num_classes: number of classes
-    scope: optional scope name
-
-  Returns:
-    Tensor of shape (batch_size, 1, 1, num_classes) with a 1-hot encoding.
-  """
-  with ops.name_scope(scope, "OneHotPlanes", [labels]):
-    batch_size, = _shape(labels)
-    batched = layers.one_hot_encoding(labels, num_classes)
-    return array_ops.reshape(batched, [batch_size, 1, 1, num_classes])
-
-
-def one_hot_mask(labels, num_classes, scope=None):
-  """Compute 1-hot encodings for masks.
-
-  Given a label image, this computes the one hot encoding at
-  each pixel.
-
-  Args:
-    labels: (batch_size, width, height, 1) tensor containing labels.
-    num_classes: number of classes
-    scope: optional scope name
-
-  Returns:
-    Tensor of shape (batch_size, width, height, num_classes) with
-    a 1-hot encoding.
-  """
-  with ops.name_scope(scope, "OneHotMask", [labels]):
-    height, width, depth = _shape(labels)
-    assert depth == 1
-    sparse_labels = math_ops.to_int32(array_ops.reshape(labels, [-1, 1]))
-    sparse_size, _ = _shape(sparse_labels)
-    indices = array_ops.reshape(math_ops.range(0, sparse_size, 1), [-1, 1])
-    concated = array_ops.concat([indices, sparse_labels], 1)
-    dense_result = sparse_ops.sparse_to_dense(concated,
-                                              [sparse_size, num_classes], 1.0,
-                                              0.0)
-    result = array_ops.reshape(dense_result, [height, width, num_classes])
-    return result
diff --git a/tensorflow/contrib/ndlstm/python/misc_test.py b/tensorflow/contrib/ndlstm/python/misc_test.py
deleted file mode 100644
index fac9023da3..0000000000
--- a/tensorflow/contrib/ndlstm/python/misc_test.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Miscellaneous tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.ndlstm.python import misc as misc_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-misc = misc_lib
-
-
-def _rand(*size):
-  return np.random.uniform(size=size).astype("f")
-
-
-class LstmMiscTest(test_util.TensorFlowTestCase):
-
-  def testPixelsAsVectorDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = misc.pixels_as_vector(inputs)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 7 * 11 * 5))
-
-  def testPoolAsVectorDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = misc.pool_as_vector(inputs)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 5))
-
-  def testOneHotPlanes(self):
-    with self.test_session():
-      inputs = constant_op.constant([0, 1, 3])
-      outputs = misc.one_hot_planes(inputs, 4)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (3, 1, 1, 4))
-      target = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1]])
-      self.assertAllClose(result.reshape(-1), target.reshape(-1))
-
-  def testOneHotMask(self):
-    with self.test_session():
-      data = np.array([[0, 1, 2], [2, 0, 1]]).reshape(2, 3, 1)
-      inputs = constant_op.constant(data)
-      outputs = misc.one_hot_mask(inputs, 3)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 3, 3))
-      target = np.array([[[1, 0, 0], [0, 1, 0]], [[0, 1, 0], [0, 0, 1]],
-                         [[0, 0, 1], [1, 0, 0]]]).transpose(1, 2, 0)
-      self.assertAllClose(result.reshape(-1), target.reshape(-1))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/specs/BUILD b/tensorflow/contrib/specs/BUILD
index 4b688690ae..084953a0a2 100644
--- a/tensorflow/contrib/specs/BUILD
+++ b/tensorflow/contrib/specs/BUILD
@@ -23,7 +23,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/ndlstm",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:logging_ops",
diff --git a/tensorflow/contrib/specs/README.md b/tensorflow/contrib/specs/README.md
index b764e6e714..bcf34e601f 100644
--- a/tensorflow/contrib/specs/README.md
+++ b/tensorflow/contrib/specs/README.md
@@ -59,17 +59,6 @@ Reshaping:
  - `Squeeze` = tf.squeeze
  - `Expand` = tf.expand_dims
 
-Multidimensional LSTM:
-
-These are intended as alternatives to 2D convolutions.  For sequence models,
-there will be other modeling primitives.
-
- - `Lstm2` = Fun(lstm2d.separable_lstm)  # 2D-to-2D
- - `Lstm2to1` = Fun(lstm2d.reduce_to_sequence)  # 2D-to-1D
- - `Lstm2to0` = Fun(lstm2d.reduce_to_final)  # 2D-to-vector
- - `Clstm2(n, m)` is a `Cl(n, [3,3])` followed by `Lstm2(m)`
- - `Dws(n)` is a depthwise convolution `Cs(n, [1, 1])`
-
 Other:
 
  - `Id` = identity
diff --git a/tensorflow/contrib/specs/python/specs_ops.py b/tensorflow/contrib/specs/python/specs_ops.py
index a6bd4d16c2..49b989b8d0 100644
--- a/tensorflow/contrib/specs/python/specs_ops.py
+++ b/tensorflow/contrib/specs/python/specs_ops.py
@@ -23,8 +23,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.layers.python.layers import layers
-from tensorflow.contrib.ndlstm.python import lstm1d
-from tensorflow.contrib.ndlstm.python import lstm2d
 from tensorflow.contrib.specs.python import specs_lib
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import logging_ops
@@ -122,17 +120,6 @@ Sig = Fun(math_ops.sigmoid)
 Tanh = Fun(math_ops.tanh)
 Smax = Fun(nn_ops.softmax)
 
-# 2D LSTM
-
-Lstm2 = Fun(lstm2d.separable_lstm)
-Lstm2to1 = Fun(lstm2d.reduce_to_sequence)  # 2D to 1D
-Lstm2to0 = Fun(lstm2d.reduce_to_final)  # 2D to depth-only
-
-
-def Clstm2(n, *args, **kw):
-  """2D LSTM with 3x3 pre-convolution."""
-  return Cl(n, [3, 3]) | Lstm2(*args, **kw)
-
 
 def Dws(n):
   """Depth-wise convolution + sigmoid (used after LSTM)."""
@@ -143,13 +130,6 @@ def Dwm(n):
   """Depth-wise convolution + softmax (used after LSTM)."""
   return Cm(n, [1, 1])
 
-
-# 1D LSTM
-
-Lstm1 = Fun(lstm1d.ndlstm_base)
-Lstm1to0 = Fun(lstm1d.sequence_to_final)  # 1D to depth-only
-Ssm = Fun(lstm1d.sequence_softmax)
-
 # Sharing of Variables
 
 
diff --git a/tensorflow/contrib/specs/python/specs_test.py b/tensorflow/contrib/specs/python/specs_test.py
index 41782a9fc9..9a4ad36793 100644
--- a/tensorflow/contrib/specs/python/specs_test.py
+++ b/tensorflow/contrib/specs/python/specs_test.py
@@ -149,36 +149,6 @@ class SpecsTest(test.TestCase):
       self.assertEqual(tuple(result.shape), (10, 20))
       self.assertEqual(summaries.tf_spec_structure(spec, inputs), "_ sig sig")
 
-  def testLstm2(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(1, 64, 64, 5))
-      spec = "net = Lstm2(15)"
-      outputs = specs.create_net(spec, inputs)
-      self.assertEqual(outputs.get_shape().as_list(), [1, 64, 64, 15])
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (1, 64, 64, 15))
-
-  def testLstm2to1(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(1, 64, 64, 5))
-      spec = "net = Lstm2to1(15)"
-      outputs = specs.create_net(spec, inputs)
-      self.assertEqual(outputs.get_shape().as_list(), [1, 64, 15])
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (1, 64, 15))
-
-  def testLstm2to0(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(1, 64, 64, 5))
-      spec = "net = Lstm2to0(15)"
-      outputs = specs.create_net(spec, inputs)
-      self.assertEqual(outputs.get_shape().as_list(), [1, 15])
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (1, 15))
-
   def testKeywordRestriction(self):
     with self.test_session():
       inputs = constant_op.constant(_rand(10, 20))
diff --git a/tensorflow/tools/docs/generate_1_0.py b/tensorflow/tools/docs/generate_1_0.py
index cdc03fdcac..f4384e0ced 100644
--- a/tensorflow/tools/docs/generate_1_0.py
+++ b/tensorflow/tools/docs/generate_1_0.py
@@ -53,7 +53,6 @@ if __name__ == '__main__':
           'factorization',
           'grid_rnn',
           'labeled_tensor',
-          'ndlstm',
           'quantization',
           'session_bundle',
           'slim',
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 003f972070..34dd419f15 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -215,7 +215,6 @@ def _get_default_do_not_descend_map():
           # Block contrib.keras to de-clutter the docs
           'keras',
           'labeled_tensor',
-          'ndlstm',
           'quantization',
           'session_bundle',
           'slim',
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index a9c4a8de42..3189bd09fc 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -70,7 +70,6 @@ py_binary(
         "//tensorflow/python/eager:eager_pip",
         "//tensorflow/contrib/summary:summary_test_util",
         # These targets don't build on Windows yet. Exclude them for now.
-        # "//tensorflow/contrib/ndlstm",
         # "//tensorflow/contrib/slim",
         # "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
         # "//tensorflow/contrib/specs",
@@ -159,7 +158,6 @@ sh_binary(
             "//tensorflow/contrib/lite/toco:toco",
             "//tensorflow/contrib/lite/toco/python:toco_wrapper",
             "//tensorflow/contrib/lite/toco/python:toco_from_protos",
-            "//tensorflow/contrib/ndlstm:ndlstm",
             "//tensorflow/contrib/nn:nn_py",
             "//tensorflow/contrib/predictor:predictor_pip",
             "//tensorflow/contrib/py2tf:py2tf",
-- 
GitLab


From d9764c54fa21e988e1993ac015062fafea4b30cf Mon Sep 17 00:00:00 2001
From: Yusuke Yamada <yamachu.dev@gmail.com>
Date: Thu, 8 Feb 2018 09:35:13 +0900
Subject: [PATCH 794/966] Fix document typo (#16489)

---
 tensorflow/contrib/lite/g3doc/custom_operators.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/g3doc/custom_operators.md b/tensorflow/contrib/lite/g3doc/custom_operators.md
index 204a489a93..d7cc854eba 100644
--- a/tensorflow/contrib/lite/g3doc/custom_operators.md
+++ b/tensorflow/contrib/lite/g3doc/custom_operators.md
@@ -73,7 +73,7 @@ TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteRegistration* Register_SIN() {
-  static TfLiteRegistration r = {nullptr, nullptr, SinResize, SinEval};
+  static TfLiteRegistration r = {nullptr, nullptr, SinPrepare, SinEval};
   return &r;
 }
 ```
-- 
GitLab


From 466d30632e83dd97696c2f013d6c7456554ece34 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Wed, 7 Feb 2018 16:26:55 -0800
Subject: [PATCH 795/966] Disable internally failing array_ops test.

PiperOrigin-RevId: 184915141
---
 tensorflow/python/kernel_tests/array_ops_test.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 7ec4624310..1e2ea82988 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import time
+import unittest
 
 import numpy as np
 
@@ -1164,6 +1165,8 @@ class InvertPermutationTest(test_util.TensorFlowTestCase):
 
 class UnravelIndexTest(test_util.TensorFlowTestCase):
 
+  # TODO(b/73086570): Reenable test.
+  @unittest.skip("Test does not pass internally.")
   def testUnravelIndex(self):
     with self.test_session():
       for dtype in [dtypes.int32, dtypes.int64]:
-- 
GitLab


From 17ffceb08910e78f186fd932c5d4175febc9d402 Mon Sep 17 00:00:00 2001
From: cclauss <cclauss@bluewin.ch>
Date: Thu, 8 Feb 2018 01:35:29 +0100
Subject: [PATCH 796/966] resolve undefined name array_ops (#16485)

---
 tensorflow/contrib/framework/python/ops/accumulate_n_v2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
index 2375ee4f55..476528b0dd 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 
@@ -108,4 +109,3 @@ def _AddNGrad(op, grad):
   """Same as gradient for AddN. Copies the gradient to all inputs."""
   # Not broadcasting.
   return [grad] * len(op.inputs)
-
-- 
GitLab


From 7a5664f77f5c99bed6edf391c70bb9ea852e0c5e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 16:34:57 -0800
Subject: [PATCH 797/966] Update ops-related pbtxt files.

PiperOrigin-RevId: 184916250
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 71 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 71 +++++++++++++++++++
 2 files changed, 142 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 2580eaf987..8db4373fdc 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -46689,6 +46689,49 @@ op {
     }
   }
 }
+op {
+  name: "Roll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shift"
+    type_attr: "Tshift"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tshift"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Round"
   input_arg {
@@ -65317,6 +65360,34 @@ op {
     }
   }
 }
+op {
+  name: "UnravelIndex"
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "dims"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tidx"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "UnsortedSegmentMax"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 8df126735b..2e96211fdc 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -22134,6 +22134,49 @@ op {
     }
   }
 }
+op {
+  name: "Roll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shift"
+    type_attr: "Tshift"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tshift"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Round"
   input_arg {
@@ -30887,6 +30930,34 @@ op {
     }
   }
 }
+op {
+  name: "UnravelIndex"
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "dims"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tidx"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "UnsortedSegmentMax"
   input_arg {
-- 
GitLab


From b1f5f433959406c7aad634c05e85ccd62fd06e87 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 7 Feb 2018 16:46:31 -0800
Subject: [PATCH 798/966] Support CopyFile with streaming (#12658)

* Support CopyFile with streaming

This fix tries to address the issue raised in 12641 where
it was not possible to have CopyFile with streaming. The original
implementation copies the whole content of the file to a string
buffer and write to the file. This could be an issue
if the file size is large (than memory of the host).

This fix streams the CopyFile operation.

This fix fixes 12641.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Use sendfile for CopyFile implementation in Linux

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Merge CopyFile for same fs and different fs

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* `sendfile64` -> `sendfile` to fix Android build

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add sendfile processing for Darwin

This commit adds sendfile processing for OSX Darwin.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Not using sendfile in MacOSX

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Address review feedback

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Remove the size check and test OUT_OF_RANGE instead.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Small fixes

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Rename CopyFile to FileSystemCopyFile to fix Windows build errors

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/platform/env.cc               | 37 ++++++++++
 tensorflow/core/platform/env.h                |  8 +++
 tensorflow/core/platform/file_system.cc       |  4 ++
 tensorflow/core/platform/file_system.h        |  3 +
 .../core/platform/posix/posix_file_system.cc  | 72 +++++++++++++++++++
 .../core/platform/posix/posix_file_system.h   |  2 +
 tensorflow/python/lib/io/file_io.i            | 16 ++---
 7 files changed, 131 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index 1bcca1243f..12509c250e 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -44,6 +44,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+// 128KB copy buffer
+constexpr size_t kCopyFileBufferSize = 128 * 1024;
+
 class FileSystemRegistryImpl : public FileSystemRegistry {
  public:
   Status Register(const string& scheme, Factory factory) override;
@@ -278,6 +281,17 @@ Status Env::RenameFile(const string& src, const string& target) {
   return src_fs->RenameFile(src, target);
 }
 
+Status Env::CopyFile(const string& src, const string& target) {
+  FileSystem* src_fs;
+  FileSystem* target_fs;
+  TF_RETURN_IF_ERROR(GetFileSystemForFile(src, &src_fs));
+  TF_RETURN_IF_ERROR(GetFileSystemForFile(target, &target_fs));
+  if (src_fs == target_fs) {
+    return src_fs->CopyFile(src, target);
+  }
+  return FileSystemCopyFile(src_fs, src, target_fs, target);
+}
+
 string Env::GetExecutablePath() {
   char exe_path[PATH_MAX] = {0};
 #ifdef __APPLE__
@@ -406,6 +420,29 @@ Status WriteStringToFile(Env* env, const string& fname,
   return s;
 }
 
+Status FileSystemCopyFile(FileSystem* src_fs, const string& src,
+                          FileSystem* target_fs, const string& target) {
+  std::unique_ptr<RandomAccessFile> src_file;
+  TF_RETURN_IF_ERROR(src_fs->NewRandomAccessFile(src, &src_file));
+
+  std::unique_ptr<WritableFile> target_file;
+  TF_RETURN_IF_ERROR(target_fs->NewWritableFile(target, &target_file));
+
+  uint64 offset = 0;
+  std::unique_ptr<char[]> scratch(new char[kCopyFileBufferSize]);
+  Status s = Status::OK();
+  while (s.ok()) {
+    StringPiece result;
+    s = src_file->Read(offset, kCopyFileBufferSize, &result, scratch.get());
+    if (!(s.ok() || s.code() == error::OUT_OF_RANGE)) {
+      return s;
+    }
+    TF_RETURN_IF_ERROR(target_file->Append(result));
+    offset += result.size();
+  }
+  return target_file->Close();
+}
+
 // A ZeroCopyInputStream on a RandomAccessFile.
 namespace {
 class FileStream : public ::tensorflow::protobuf::io::ZeroCopyInputStream {
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 34aaf3f78b..4ce4e0b4e0 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -214,6 +214,9 @@ class Env {
   /// replaced.
   Status RenameFile(const string& src, const string& target);
 
+  /// \brief Copy the src to target.
+  Status CopyFile(const string& src, const string& target);
+
   /// \brief Returns the absolute path of the current executable. It resolves
   /// symlinks if there is any.
   string GetExecutablePath();
@@ -381,6 +384,11 @@ struct ThreadOptions {
   size_t guard_size = 0;  // 0: use system default value
 };
 
+/// A utility routine: copy contents of `src` in file system `src_fs`
+/// to `target` in file system `target_fs`.
+Status FileSystemCopyFile(FileSystem* src_fs, const string& src,
+                          FileSystem* target_fs, const string& target);
+
 /// A utility routine: reads contents of named file into `*data`
 Status ReadFileToString(Env* env, const string& fname, string* data);
 
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index b9866cf641..271d73f5f1 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -265,4 +265,8 @@ Status FileSystem::RecursivelyCreateDir(const string& dirname) {
   return Status::OK();
 }
 
+Status FileSystem::CopyFile(const string& src, const string& target) {
+  return FileSystemCopyFile(this, src, this, target);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index d32efcea09..3085b6958f 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -189,6 +189,9 @@ class FileSystem {
   /// \brief Overwrites the target if it exists.
   virtual Status RenameFile(const string& src, const string& target) = 0;
 
+  /// \brief Copy the src to target.
+  virtual Status CopyFile(const string& src, const string& target);
+
   /// \brief Translate an URI to a filename for the FileSystem implementation.
   ///
   /// The implementation in this class cleans up the path, removing
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index fb7a5a9995..9a8021565c 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -18,6 +18,9 @@ limitations under the License.
 #include <fcntl.h>
 #include <stdio.h>
 #include <sys/mman.h>
+#if !defined(__APPLE__)
+#include <sys/sendfile.h>
+#endif
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
@@ -34,6 +37,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+// 128KB of copy buffer
+constexpr size_t kPosixCopyFileBufferSize = 128 * 1024;
+
 // pread() based random-access
 class PosixRandomAccessFile : public RandomAccessFile {
  private:
@@ -276,4 +282,70 @@ Status PosixFileSystem::RenameFile(const string& src, const string& target) {
   return result;
 }
 
+Status PosixFileSystem::CopyFile(const string& src, const string& target) {
+  string translated_src = TranslateName(src);
+  struct stat sbuf;
+  if (stat(translated_src.c_str(), &sbuf) != 0) {
+    return IOError(src, errno);
+  }
+  int src_fd = open(translated_src.c_str(), O_RDONLY);
+  if (src_fd < 0) {
+    return IOError(src, errno);
+  }
+  string translated_target = TranslateName(target);
+  // O_WRONLY | O_CREAT:
+  //   Open file for write and if file does not exist, create the file.
+  // S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH:
+  //   Create the file with permission of 0644
+  int target_fd = open(translated_target.c_str(), O_WRONLY | O_CREAT,
+                       S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+  if (target_fd < 0) {
+    close(src_fd);
+    return IOError(target, errno);
+  }
+  int rc = 0;
+  off_t offset = 0;
+  std::unique_ptr<char[]> buffer(new char[kPosixCopyFileBufferSize]);
+  while (offset < sbuf.st_size) {
+    // Use uint64 for safe compare SSIZE_MAX
+    uint64 chunk = sbuf.st_size - offset;
+    if (chunk > SSIZE_MAX) {
+      chunk = SSIZE_MAX;
+    }
+#if defined(__linux__) && !defined(__ANDROID__)
+    rc = sendfile(target_fd, src_fd, &offset, static_cast<size_t>(chunk));
+#else
+    if (chunk > kPosixCopyFileBufferSize) {
+      chunk = kPosixCopyFileBufferSize;
+    }
+    rc = read(src_fd, buffer.get(), static_cast<size_t>(chunk));
+    if (rc <= 0) {
+      break;
+    }
+    rc = write(target_fd, buffer.get(), static_cast<size_t>(chunk));
+    offset += chunk;
+#endif
+    if (rc <= 0) {
+      break;
+    }
+  }
+
+  Status result = Status::OK();
+  if (rc < 0) {
+    result = IOError(target, errno);
+  }
+
+  // Keep the error code
+  rc = close(target_fd);
+  if (rc < 0 && result == Status::OK()) {
+    result = IOError(target, errno);
+  }
+  rc = close(src_fd);
+  if (rc < 0 && result == Status::OK()) {
+    result = IOError(target, errno);
+  }
+
+  return result;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/posix/posix_file_system.h b/tensorflow/core/platform/posix/posix_file_system.h
index fe050fd5a0..98ffa43b8a 100644
--- a/tensorflow/core/platform/posix/posix_file_system.h
+++ b/tensorflow/core/platform/posix/posix_file_system.h
@@ -56,6 +56,8 @@ class PosixFileSystem : public FileSystem {
   Status GetFileSize(const string& fname, uint64* size) override;
 
   Status RenameFile(const string& src, const string& target) override;
+
+  Status CopyFile(const string& src, const string& target) override;
 };
 
 Status IOError(const string& context, int err_number);
diff --git a/tensorflow/python/lib/io/file_io.i b/tensorflow/python/lib/io/file_io.i
index c0c4e035fc..891a7b0fd0 100644
--- a/tensorflow/python/lib/io/file_io.i
+++ b/tensorflow/python/lib/io/file_io.i
@@ -110,21 +110,15 @@ void RecursivelyCreateDir(const string& dirname, TF_Status* out_status) {
   }
 }
 
-void CopyFile(const string& oldpath, const string& newpath, bool overwrite,
+void CopyFile(const string& src, const string& target, bool overwrite,
               TF_Status* out_status) {
-  // If overwrite is false and the newpath file exists then it's an error.
-  if (!overwrite && tensorflow::Env::Default()->FileExists(newpath).ok()) {
+  // If overwrite is false and the target file exists then its an error.
+  if (!overwrite && tensorflow::Env::Default()->FileExists(target).ok()) {
     TF_SetStatus(out_status, TF_ALREADY_EXISTS, "file already exists");
     return;
   }
-  string file_content;
-  tensorflow::Status status = ReadFileToString(tensorflow::Env::Default(),
-      oldpath, &file_content);
-  if (!status.ok()) {
-    Set_TF_Status_from_Status(out_status, status);
-    return;
-  }
-  status = WriteStringToFile(tensorflow::Env::Default(), newpath, file_content);
+  tensorflow::Status status =
+      tensorflow::Env::Default()->CopyFile(src, target);
   if (!status.ok()) {
     Set_TF_Status_from_Status(out_status, status);
   }
-- 
GitLab


From 6b1208c647079e822f19a4c869af52bf3a06d532 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 7 Feb 2018 16:48:00 -0800
Subject: [PATCH 799/966] Add uint32 and uint64 kernel support for `Invert`
 (#15154)

* Add uint32 and uint64 kernel support for `Invert`

This fix adds uint32 and uint64 kernel support for `Invert`.

In bitwise_ops.cc, uint32 and uint64 have been registered
for `Invert` like other bitwise ops
`BitwiseAnd`/`BitwiseOr`/`BitwiseXor`/`LeftShift`/`RightShift`.
However, no uint32 and uint64 kernels available for `Invert` yet.

This fix add uint32 and uint64 kernel for `Invert`,
and adds additional test cases to cover the changes.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test cases for uint32 and uint64 support with `Invert`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add missing uint32 and uint64 in GPU for invert

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add DEFINE_UNARY8 for invert

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc |  2 +-
 tensorflow/core/kernels/cwise_op_invert.cc        | 10 +++++-----
 tensorflow/core/kernels/cwise_ops_gpu_common.cu.h |  3 +++
 tensorflow/python/ops/bitwise_ops_test.py         |  2 +-
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc
index 62f33612db..1072ef3aa6 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_UNARY6(invert, int8, int16, int32, int64, uint8, uint16);
+DEFINE_UNARY8(invert, int8, int16, int32, int64, uint8, uint16, uint32, uint64);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc
index f5cafcc780..98c8d7e9b2 100644
--- a/tensorflow/core/kernels/cwise_op_invert.cc
+++ b/tensorflow/core/kernels/cwise_op_invert.cc
@@ -16,17 +16,17 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
-          uint8, uint16);
+REGISTER8(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
+          uint8, uint16, uint32, uint64);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
-          uint8, uint16);
+          uint8, uint16, uint32, uint64);
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER6(UnaryOp, GPU, "Invert", functor::invert, int8, int16, int32, int64,
-          uint8, uint16);
+REGISTER8(UnaryOp, GPU, "Invert", functor::invert, int8, int16, int32, int64,
+          uint8, uint16, uint32, uint64);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
index 6dd108f722..965e42dcce 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
@@ -136,6 +136,9 @@ struct ApproximateEqual<GPUDevice, T> {
 #define DEFINE_UNARY7(F, T0, T1, T2, T3, T4, T5, T6) \
   DEFINE_UNARY2(F, T0, T1);                          \
   DEFINE_UNARY5(F, T2, T3, T4, T5, T6)
+#define DEFINE_UNARY8(F, T0, T1, T2, T3, T4, T5, T6, T7) \
+  DEFINE_UNARY4(F, T0, T1, T2, T3);                      \
+  DEFINE_UNARY4(F, T4, T5, T6, T7)
 
 // Macros to explicitly instantiate kernels on GPU for multiple types
 // (T0, T1, etc.) for BinaryFunctor.
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
index f9b025b787..c4cfc0da19 100644
--- a/tensorflow/python/ops/bitwise_ops_test.py
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -71,7 +71,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
 
   def testInvertOp(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
-                  dtypes.uint8, dtypes.uint16]
+                  dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
     inputs = [0, 5, 3, 14]
     with self.test_session(use_gpu=True) as sess:
       for dtype in dtype_list:
-- 
GitLab


From f7f7036d1cdc5716aff976fae0ea4d1b9a931b56 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 7 Feb 2018 16:48:21 -0800
Subject: [PATCH 800/966] Add optimized gif support for decode_gif (#16804)

* Add optimized gif support for decode_gif

While revisiting the issue of 15838, I noticed that
currently optimized gif is not supported. However,
optimized gif is actually possible to be processed
as essentially the subsequent frame just adds
the content on top of the previous frame on canvas.

This fix adds the support for optimized gif with decode_gif.

As is shown in the added test case, optimized gif (`optimized.gif`)
could be handled the same way as original gif (`scan.gif`).

This fix fixes 15838.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Format gif_io.cc with clang-format -i --style=Google

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case to cover optimized gif support for decode_gif.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add `#include <algorithm>` to fix Windows build errors

This commit add `#include <algorithm>` to fix
build errors on Windows platform.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/lib/gif/gif_io.cc       | 42 +++++++++++++++++++++----
 tensorflow/python/ops/image_ops_test.py | 26 +++------------
 2 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
index e5deb2b873..9a5215320f 100644
--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -16,6 +16,7 @@ limitations under the License.
 // Functions to read images in GIF format.
 
 #include "tensorflow/core/lib/gif/gif_io.h"
+#include <algorithm>
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/gif.h"
@@ -89,23 +90,52 @@ uint8* Decode(const void* srcdata, int datasize,
   uint8* const dstdata = allocate_output(num_frames, width, height, channel);
   if (!dstdata) return nullptr;
   for (int k = 0; k < num_frames; k++) {
+    uint8* this_dst = dstdata + k * width * channel * height;
+
     SavedImage* this_image = &gif_file->SavedImages[k];
     GifImageDesc* img_desc = &this_image->ImageDesc;
+
+    int imgLeft = img_desc->Left;
+    int imgTop = img_desc->Top;
+    int imgRight = img_desc->Left + img_desc->Width;
+    int imgBottom = img_desc->Top + img_desc->Height;
+
     if (img_desc->Left != 0 || img_desc->Top != 0 || img_desc->Width != width ||
         img_desc->Height != height) {
-      *error_string = strings::StrCat("can't process optimized gif");
-      return nullptr;
+      // If the first frame does not fill the entire canvas then return error.
+      if (k == 0) {
+        *error_string =
+            strings::StrCat("the first frame does not fill the canvas");
+        return nullptr;
+      }
+      // Otherwise previous frame will be reused to fill the unoccupied canvas.
+      imgLeft = std::max(imgLeft, 0);
+      imgTop = std::max(imgTop, 0);
+      imgRight = std::min(imgRight, width);
+      imgBottom = std::min(imgBottom, height);
+
+      uint8* last_dst = dstdata + (k - 1) * width * channel * height;
+      for (int i = 0; i < height; ++i) {
+        uint8* p_dst = this_dst + i * width * channel;
+        uint8* l_dst = last_dst + i * width * channel;
+        for (int j = 0; j < width; ++j) {
+          p_dst[j * channel + 0] = l_dst[j * channel + 0];
+          p_dst[j * channel + 1] = l_dst[j * channel + 1];
+          p_dst[j * channel + 2] = l_dst[j * channel + 2];
+        }
+      }
     }
 
     ColorMapObject* color_map = this_image->ImageDesc.ColorMap
                                     ? this_image->ImageDesc.ColorMap
                                     : gif_file->SColorMap;
 
-    uint8* this_dst = dstdata + k * width * channel * height;
-    for (int i = 0; i < height; ++i) {
+    for (int i = imgTop; i < imgBottom; ++i) {
       uint8* p_dst = this_dst + i * width * channel;
-      for (int j = 0; j < width; ++j) {
-        GifByteType color_index = this_image->RasterBits[i * width + j];
+      for (int j = imgLeft; j < imgRight; ++j) {
+        GifByteType color_index =
+            this_image->RasterBits[(i - img_desc->Top) * (img_desc->Width) +
+                                   (j - img_desc->Left)];
         const GifColorType& gif_color = color_map->Colors[color_index];
         p_dst[j * channel + 0] = gif_color.Red;
         p_dst[j * channel + 1] = gif_color.Green;
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 82b77ee8e3..0dc1c56e7d 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -2821,20 +2821,9 @@ class PngTest(test_util.TensorFlowTestCase):
 
 class GifTest(test_util.TensorFlowTestCase):
 
-  def testOptimizedGifErrorString(self):
-    filename = "tensorflow/core/lib/gif/testdata/optimized.gif"
-
-    with self.test_session(use_gpu=True) as sess:
-      gif = io_ops.read_file(filename)
-      image = image_ops.decode_gif(gif)
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "can't process optimized gif"):
-        gif, image = sess.run([gif, image])
-
-  def testValid(self):
+  def _testValid(self, filename):
     # Read some real GIFs
     prefix = "tensorflow/core/lib/gif/testdata/"
-    filename = "scan.gif"
     WIDTH = 20
     HEIGHT = 40
     STRIDE = 5
@@ -2861,16 +2850,9 @@ class GifTest(test_util.TensorFlowTestCase):
 
         self.assertAllClose(frame, gt)
 
-  def testInValid(self):
-    # Read some real GIFs
-    prefix = "tensorflow/core/lib/gif/testdata/"
-    filename = "optimized.gif"
-
-    with self.test_session(use_gpu=True) as sess:
-      gif0 = io_ops.read_file(prefix + filename)
-      image0 = image_ops.decode_gif(gif0)
-      with self.assertRaises(errors.InvalidArgumentError):
-        gif0, image0 = sess.run([gif0, image0])
+  def testValid(self):
+    self._testValid("scan.gif")
+    self._testValid("optimized.gif")
 
   def testShape(self):
     with self.test_session(use_gpu=True) as sess:
-- 
GitLab


From 0b32f622e581e28897b91d93f6fefb47bb21b061 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 7 Feb 2018 16:48:45 -0800
Subject: [PATCH 801/966] [XLA:CPU] Fix test case for vectorized Exp and Tanh
 to actually vectorize

I just noticed that the test case for ArrayElementwiseOpTest::ExpF32sVector and
possibly for ArrayElementwiseOpTest::ExpF32sVector does not actually vectorize
the intrinsic calls.  This is most likely a very recent regression because I
remember fixing at least one issue in the emitter demonstrated by the test.
Despite that I think the current approach is better since we have unit tests
that check that we at least vectorize the vector-of-F32's case.

PiperOrigin-RevId: 184918373
---
 .../xla/tests/array_elementwise_ops_test.cc   | 90 +++++++++----------
 1 file changed, 43 insertions(+), 47 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 177b58979c..87ac7731ba 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -2050,39 +2050,35 @@ XLA_TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
   // the input tensor is large enough to exercise the vectorized tanh
   // implementation on XLA CPU.
   ComputationBuilder builder(client_, TestName());
-  auto input_literal = Literal::CreateR2<float>(
-      {{1.02, -0.32, 0.85, 0.90, 1.23, -0.91, -0.49, 0.80},
-       {-0.67, 0.16, -0.07, 0.39, -0.41, 0.04, 1.36, 1.25},
-       {0.41, 0.65, -1.08, 0.32, -1.45, -0.77, -1.09, 0.91},
-       {-1.03, -0.30, -1.11, -1.17, 1.50, -0.85, 0.04, 1.02},
-       {0.34, -0.61, 0.41, 0.07, -0.02, 1.42, -0.62, 0.81},
-       {0.08, 0.81, -0.30, 1.17, -0.65, -0.44, 0.92, 1.26},
-       {-1.29, 1.35, 0.08, -1.24, -0.92, 0.49, 1.17, -0.45},
-       {-1.31, -1.44, -0.13, -1.31, -0.79, 1.41, 1.21, 1.05}});
-  auto input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  auto input_literal = Literal::CreateR1<float>(
+      {1.02,  -0.32, 0.85,  0.90,  1.23,  -0.91, -0.49, 0.80,  -0.67, 0.16,
+       -0.07, 0.39,  -0.41, 0.04,  1.36,  1.25,  0.41,  0.65,  -1.08, 0.32,
+       -1.45, -0.77, -1.09, 0.91,  -1.03, -0.30, -1.11, -1.17, 1.50,  -0.85,
+       0.04,  1.02,  0.34,  -0.61, 0.41,  0.07,  -0.02, 1.42,  -0.62, 0.81,
+       0.08,  0.81,  -0.30, 1.17,  -0.65, -0.44, 0.92,  1.26,  -1.29, 1.35,
+       0.08,  -1.24, -0.92, 0.49,  1.17,  -0.45, -1.31, -1.44, -0.13, -1.31,
+       -0.79, 1.41,  1.21,  1.05});
+  TF_ASSERT_OK_AND_ASSIGN(auto input_data,
+                          client_->TransferToServer(*input_literal));
 
   auto input = builder.Parameter(0, input_literal->shape(), "input");
   builder.Tanh(input);
 
-  ComputeAndCompareR2<float>(
+  ComputeAndCompareR1<float>(
       &builder,
-      {{0.77009583, -0.30665702, 0.69070244, 0.71401149, 0.84400684,
-        -0.71985596, -0.45764771, 0.66664988},
-       {-0.58278900, 0.16050975, -0.06770509, 0.36843640, -0.38476998,
-        0.04018109, 0.87562293, 0.84788644},
-       {0.38603750, 0.57294142, -0.79140943, 0.31032649, -0.89590985,
-        -0.64770776, -0.79625875, 0.72234446},
-       {-0.77389336, -0.28871772, -0.80428445, -0.82541436, 0.90456349,
-        -0.68856895, 0.03877772, 0.76877952},
-       {0.32561871, -0.54546672, 0.39072621, 0.07273290, -0.01924866,
-        0.88924897, -0.55283129, 0.67183107},
-       {0.08006320, 0.66944766, -0.29068485, 0.82573754, -0.57170743,
-        -0.41581789, 0.72739530, 0.85025692},
-       {-0.85931867, 0.87357593, 0.07782833, -0.84597743, -0.72748238,
-        0.45396307, 0.82449573, -0.42462519},
-       {-0.86363792, -0.89368379, -0.12621804, -0.86445558, -0.65565848,
-        0.88789743, 0.83566397, 0.78287679}},
+      {0.77009583,  -0.30665702, 0.69070244,  0.71401149,  0.84400684,
+       -0.71985596, -0.45764771, 0.66664988,  -0.58278900, 0.16050975,
+       -0.06770509, 0.36843640,  -0.38476998, 0.04018109,  0.87562293,
+       0.84788644,  0.38603750,  0.57294142,  -0.79140943, 0.31032649,
+       -0.89590985, -0.64770776, -0.79625875, 0.72234446,  -0.77389336,
+       -0.28871772, -0.80428445, -0.82541436, 0.90456349,  -0.68856895,
+       0.03877772,  0.76877952,  0.32561871,  -0.54546672, 0.39072621,
+       0.07273290,  -0.01924866, 0.88924897,  -0.55283129, 0.67183107,
+       0.08006320,  0.66944766,  -0.29068485, 0.82573754,  -0.57170743,
+       -0.41581789, 0.72739530,  0.85025692,  -0.85931867, 0.87357593,
+       0.07782833,  -0.84597743, -0.72748238, 0.45396307,  0.82449573,
+       -0.42462519, -0.86363792, -0.89368379, -0.12621804, -0.86445558,
+       -0.65565848, 0.88789743,  0.83566397,  0.78287679},
       {input_data.get()},
       // The error spec is unusually high here to account for the fact that we
       // use a rational interpolant to approximate tanh.
@@ -2096,32 +2092,32 @@ XLA_TEST_F(ArrayElementwiseOpTest, ExpF32sVector) {
 
   // Just to help make sense of the scales here -- exp(89) saturates float32 and
   // exp(-10) is smaller than our error spec.
-  std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>(
-      {{1.02, -0.32, 0.85, 0.9, 1.23, -0.91, -0.49, 0.8},
-       {-1.31, -1.44, -0.13, -1.31, -0.79, 1.41, 1.21, 1.05},
-       {-195.6, -194.5, -193.4, -192.3, -191.2, -190.1, -189.0, -187.9},
-       {-19.6, -18.5, -17.4, -16.3, -15.2, -14.1, -13.0, -11.9},
-       {-10.8, -9.7, -8.6, -7.5, -6.4, -5.3, -4.2, -3.1},
-       {-2.0, -0.9, 0.2, 1.3, 2.4, 3.5, 4.6, 5.7},
-       {6.8, 7.9, 9.0, 10.1, 11.2, 12.3, 13.4, 14.5},
-       {15.6, 16.7, 17.8, 18.9, 20.0, 21.1, 22.2, 23.3},
-       {24.4, 25.5, 26.6, 27.7, 28.8, 29.9, 31.0, 32.1},
-       {68.4, 69.5, 70.6, 71.7, 72.8, 73.9, 75.0, 76.1},
-       {77.2, 78.3, 79.4, 80.5, 81.6, 82.7, 83.8, 84.9},
-       {85.2, 86.3, 86.4, 86.5, 87.6, 87.7, 87.8, 87.9}});
+  std::unique_ptr<Literal> input_literal = Literal::CreateR1<float>(
+      {1.02,   -0.32,  0.85,   0.9,    1.23,   -0.91,  -0.49, 0.8,    -1.31,
+       -1.44,  -0.13,  -1.31,  -0.79,  1.41,   1.21,   1.05,  -195.6, -194.5,
+       -193.4, -192.3, -191.2, -190.1, -189.0, -187.9, -19.6, -18.5,  -17.4,
+       -16.3,  -15.2,  -14.1,  -13.0,  -11.9,  -10.8,  -9.7,  -8.6,   -7.5,
+       -6.4,   -5.3,   -4.2,   -3.1,   -2.0,   -0.9,   0.2,   1.3,    2.4,
+       3.5,    4.6,    5.7,    6.8,    7.9,    9.0,    10.1,  11.2,   12.3,
+       13.4,   14.5,   15.6,   16.7,   17.8,   18.9,   20.0,  21.1,   22.2,
+       23.3,   24.4,   25.5,   26.6,   27.7,   28.8,   29.9,  31.0,   32.1,
+       68.4,   69.5,   70.6,   71.7,   72.8,   73.9,   75.0,  76.1,   77.2,
+       78.3,   79.4,   80.5,   81.6,   82.7,   83.8,   84.9,  85.2,   86.3,
+       86.4,   86.5,   87.6,   87.7,   87.8,   87.9});
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
                           client_->TransferToServer(*input_literal));
 
   auto input = builder.Parameter(0, input_literal->shape(), "input");
   builder.Exp(input);
 
-  Array2D<float> expected_result(input_literal->shape().dimensions(0),
-                                 input_literal->shape().dimensions(1));
-  expected_result.Each([&](int64 r, int64 c, float* result) {
-    *result = std::exp(input_literal->Get<float>({r, c}));
-  });
+  std::vector<float> expected_result;
+  int64 input_size = input_literal->shape().dimensions(0);
+  expected_result.reserve(input_size);
+  for (int64 i = 0; i < input_size; i++) {
+    expected_result.push_back(std::exp(input_literal->Get<float>({i})));
+  }
 
-  ComputeAndCompareR2<float>(&builder, expected_result, {input_data.get()},
+  ComputeAndCompareR1<float>(&builder, expected_result, {input_data.get()},
                              error_spec_);
 }
 
-- 
GitLab


From 4cdfdaeae2540ac2eef13fc45928ff22cf8fd83e Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Wed, 7 Feb 2018 17:12:01 -0800
Subject: [PATCH 802/966] Add operation to forward log messages from remote
 workers to a local system.

PiperOrigin-RevId: 184921657
---
 .../contrib/tpu/ops/tpu_configuration_ops.cc  | 18 ++++-
 tensorflow/core/BUILD                         | 15 +++-
 tensorflow/core/util/event.proto              |  5 ++
 tensorflow/core/util/session_message.cc       | 71 +++++++++++++++++++
 tensorflow/core/util/session_message.h        | 55 ++++++++++++++
 5 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/core/util/session_message.cc
 create mode 100644 tensorflow/core/util/session_message.h

diff --git a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
index 28417b89e0..f8de8baa65 100644
--- a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
@@ -212,4 +212,20 @@ An op that shuts down a running distributed TPU system. The Op returns
 an error if no system is running.
 )doc");
 
-}  // namespace tensorflow
+REGISTER_OP("SessionStatus")
+    .Input("fetch_start_timestamp: double")
+    .Output("status: string")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Not for public usage.
+
+Returns messages from the current session as a serialized SessionStatusProto.
+
+This includes the current state of the compiler, along with any critical
+logging or warning messages.
+
+fetch_start_timestamp: any messages earlier than this will be excluded from the
+returned proto.
+)doc");
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 7fa0b79766..d0c9a72af9 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -193,6 +193,7 @@ CORE_PROTO_SRCS = [
     "protobuf/rewriter_config.proto",
     "protobuf/tensor_bundle.proto",
     "protobuf/saver.proto",
+    "util/event.proto",
     "util/memmapped_file_system.proto",
     "util/saved_tensor_slice.proto",
 ]
@@ -211,7 +212,6 @@ ADDITIONAL_CORE_PROTO_SRCS = [
     "protobuf/named_tensor.proto",
     "protobuf/saved_model.proto",
     "protobuf/tensorflow_server.proto",
-    "util/event.proto",
     "util/test_log.proto",
 ]
 
@@ -376,6 +376,17 @@ cc_library(
     hdrs = ["platform/abi.h"],
 )
 
+cc_library(
+    name = "session_message",
+    srcs = ["util/session_message.cc"],
+    hdrs = ["util/session_message.h"],
+    deps = [
+        ":framework",
+        ":lib",
+        ":protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "stacktrace_handler",
     srcs = ["platform/stacktrace_handler.cc"],
@@ -1744,6 +1755,7 @@ FRAMEWORK_INTERNAL_PRIVATE_HEADERS = [
         "framework/reader_base.*",
         "util/memmapped_file_system.*",
         "util/memmapped_file_system_writer.*",
+        "util/session_message.*",
         "util/version_info.cc",
     ],
 ) + select({
@@ -1830,6 +1842,7 @@ tf_cuda_library(
             "framework/resource_handle.cc",
             "util/memmapped_file_system.*",
             "util/memmapped_file_system_writer.*",
+            "util/session_message.cc",
             "util/version_info.cc",
         ],
     ) + select({
diff --git a/tensorflow/core/util/event.proto b/tensorflow/core/util/event.proto
index 5c3799c132..65d2c5a09c 100644
--- a/tensorflow/core/util/event.proto
+++ b/tensorflow/core/util/event.proto
@@ -80,3 +80,8 @@ message TaggedRunMetadata {
   // deserialization.
   bytes run_metadata = 2;
 }
+
+// For communicating live events back to a coordinator
+message SessionStatus {
+  repeated Event event = 1;
+}
diff --git a/tensorflow/core/util/session_message.cc b/tensorflow/core/util/session_message.cc
new file mode 100644
index 0000000000..28a6517a1a
--- /dev/null
+++ b/tensorflow/core/util/session_message.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/session_message.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/util/event.pb.h"
+
+static const int kMaxLogEvents = 1000;
+
+namespace tensorflow {
+
+SessionLogger::SessionLogger() : status_(new SessionStatus) {}
+
+SessionLogger::~SessionLogger() {}
+
+string SessionLogger::DebugString() { return "SessionLogger"; }
+
+void SessionLogger::Log(StringPiece message) {
+  mutex_lock lock(mu_);
+
+  Event* event = status_->add_event();
+  event->set_wall_time(Env::Default()->NowMicros());
+  event->set_step(0);
+  LogMessage* log = event->mutable_log_message();
+  log->set_message(message.ToString());
+  log->set_level(LogMessage::INFO);
+
+  // Clip log events by 10% if we overflow
+  if (status_->event_size() > kMaxLogEvents) {
+    auto events = status_->mutable_event();
+    events->DeleteSubrange(0, kMaxLogEvents / 10);
+  }
+}
+
+SessionLogger* GetSessionLogger(ResourceMgr* rm) {
+  SessionLogger* logger;
+
+  std::function<Status(SessionLogger**)> status_creator =
+      [](SessionLogger** result) {
+        *result = new SessionLogger();
+        return Status::OK();
+      };
+
+  if (!rm->LookupOrCreate<SessionLogger>("session", "status", &logger,
+                                         status_creator)
+           .ok()) {
+    return nullptr;
+  }
+
+  return logger;
+}
+
+void LogSessionMessage(ResourceMgr* rm, StringPiece message) {
+  return GetSessionLogger(rm)->Log(message);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/session_message.h b/tensorflow/core/util/session_message.h
new file mode 100644
index 0000000000..c0f3d78b46
--- /dev/null
+++ b/tensorflow/core/util/session_message.h
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_SESSION_MESSAGE_H_
+#define TENSORFLOW_CORE_UTIL_SESSION_MESSAGE_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+class ResourceMgr;
+class SessionStatus;
+
+class SessionLogger : public ResourceBase {
+ public:
+  SessionLogger();
+  ~SessionLogger();
+
+  void Log(StringPiece message);
+  string DebugString() override;
+
+  const SessionStatus& status() { return *status_; }
+
+ private:
+  std::unique_ptr<SessionStatus> status_;
+  mutex mu_;
+};
+
+// Return a SessionLogger instance for the current session.  If the logger
+// will be used across multiple computations, you must explicitly acquire
+// and release references using Ref()/Unref().
+//
+// Returns nullptr if a logger cannot be created.
+SessionLogger* GetSessionLogger(ResourceMgr* rm);
+
+// Attach `message` to the logger for the current session.
+void LogSessionMessage(ResourceMgr* rm, StringPiece message);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_SESSION_MESSAGE_H
-- 
GitLab


From ddf9536dfeb358237219e27c185c729fd4c8537b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 17:16:32 -0800
Subject: [PATCH 803/966] Remove note about accumulator variables, since those
 are not added to TRAINABLE_VARIABLES.

PiperOrigin-RevId: 184922273
---
 tensorflow/python/estimator/warm_starting_util.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tensorflow/python/estimator/warm_starting_util.py b/tensorflow/python/estimator/warm_starting_util.py
index 48110ef57f..57db968d56 100644
--- a/tensorflow/python/estimator/warm_starting_util.py
+++ b/tensorflow/python/estimator/warm_starting_util.py
@@ -117,21 +117,13 @@ class WarmStartSettings(
   ws = WarmStartSettings(ckpt_to_initialize_from="/tmp/model-1000")
   ```
 
-  Warm-start only the embeddings (input layer) and their accumulator variables:
+  Warm-start only the embeddings (input layer):
 
   ```
   ws = WarmStartSettings(ckpt_to_initialize_from="/tmp",
                          vars_to_warm_start=".*input_layer.*")
   ```
 
-  Warm-start everything except the optimizer accumulator variables
-  (DNN defaults to Adagrad):
-
-  ```
-  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp",
-                         vars_to_warm_start="^(?!.*(Adagrad))")
-  ```
-
   Warm-start all weights but the embedding parameters corresponding to
   `sc_vocab_file` have a different vocab from the one used in the current
   model:
@@ -423,6 +415,8 @@ def _warm_start(warm_start_settings):
   # Both warm_start_settings.vars_to_warm_start = '.*' and
   # warm_start_settings.vars_to_warm_start = None will match everything here.
   for v in ops.get_collection(
+      # TODO(eddz): Allow for different collections here (to support
+      # warm-starting accumulators).
       ops.GraphKeys.TRAINABLE_VARIABLES,
       scope=warm_start_settings.vars_to_warm_start):
     if not isinstance(v, list):
-- 
GitLab


From 78e4ed153a853536622ff606fc5f6c48a1573ac6 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Wed, 7 Feb 2018 18:11:51 -0800
Subject: [PATCH 804/966] Reduce the number of concats to avoid test timeout.

PiperOrigin-RevId: 184929151
---
 tensorflow/python/kernel_tests/concat_op_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index a5fd3bc334..127bc6bb20 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -495,9 +495,9 @@ class ConcatOpTest(test.TestCase):
         p = []
         shape = np.array([7, 13])
         if test.is_gpu_available():
-          num_tensors = 10000
+          num_tensors = 5000
         else:
-          num_tensors = 1000
+          num_tensors = 500
         for i in np.arange(num_tensors):
           input_shape = shape
           placeholder = array_ops.placeholder(dtypes.float32, shape=input_shape)
-- 
GitLab


From 39e84da541377b6a87846f06cc66f13f670d3e6d Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 7 Feb 2018 19:24:57 -0800
Subject: [PATCH 805/966] [XLA:CPU] Fix tfcompile's use of freeze_graph.py

Not sure if I am holding it wrong, but I could not make the existing
FLAGS.check_version work.

PiperOrigin-RevId: 184935374
---
 tensorflow/compiler/aot/BUILD           |  5 -----
 tensorflow/compiler/aot/tests/BUILD     |  9 ---------
 tensorflow/compiler/aot/tfcompile.bzl   |  1 +
 tensorflow/python/tools/freeze_graph.py | 12 ++++++++++--
 4 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index bc46918df9..0900e87eba 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -134,7 +134,6 @@ tf_library(
     graph = "test_graph_tfadd.pbtxt",
     tags = [
         "manual",
-        "notap",
     ],
 )
 
@@ -148,7 +147,6 @@ tf_library(
     graph = "test_graph_tfunknownop.pbtxt",
     tags = [
         "manual",
-        "notap",
     ],
 )
 
@@ -163,7 +161,6 @@ tf_library(
     graph = "test_graph_tfunknownop.pbtxt",
     tags = [
         "manual",
-        "notap",
     ],
 )
 
@@ -177,7 +174,6 @@ tf_library(
     graph = "test_graph_tfunknownop.pbtxt",
     tags = [
         "manual",
-        "notap",
     ],
 )
 
@@ -201,7 +197,6 @@ cc_library(
     name = "benchmark_extra_android",
     tags = [
         "manual",
-        "notap",
     ],
     visibility = ["//visibility:public"],
 )
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 43d8ae4108..28aab6eb61 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -76,7 +76,6 @@ tf_library(
     include_standard_runtime_deps = False,
     tags = [
         "manual",
-        "notap",
     ],
 )
 
@@ -89,7 +88,6 @@ tf_library(
     graph = "test_graph_tfadd_with_ckpt.pb",
     tags = [
         "manual",
-        "notap",
     ],
 )
 
@@ -103,7 +101,6 @@ tf_library(
     graph = "test_graph_tfadd_with_ckpt_saver.pb",
     tags = [
         "manual",
-        "notap",
     ],
 )
 
@@ -115,7 +112,6 @@ tf_library(
     graph = "test_graph_tffunction.pb",
     tags = [
         "manual",
-        "notap",
     ],
 )
 
@@ -127,7 +123,6 @@ tf_library(
     graph = "test_graph_tfgather.pb",
     tags = [
         "manual",
-        "notap",
     ],
 )
 
@@ -139,7 +134,6 @@ tf_library(
     graph = "test_graph_tfmatmul.pb",
     tags = [
         "manual",
-        "notap",
     ],
 )
 
@@ -151,7 +145,6 @@ tf_library(
     graph = "test_graph_tfmatmulandadd.pb",
     tags = [
         "manual",
-        "notap",
     ],
     tfcompile_flags = "--gen_name_to_index --gen_program_shape",
 )
@@ -164,7 +157,6 @@ tf_library(
     graph = "test_graph_tfsplits.pb",
     tags = [
         "manual",
-        "notap",
     ],
 )
 
@@ -173,7 +165,6 @@ tf_cc_test(
     srcs = ["tfcompile_test.cc"],
     tags = [
         "manual",
-        "notap",
     ],
     deps = [
         ":test_graph_tfadd",
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 58572fea3d..eb3e632c7b 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -103,6 +103,7 @@ def tf_library(name, graph, config,
 
     # Now run freeze_graph to convert variables into constants.
     freeze_args = (" --input_graph=$(location " + graph + ")" +
+                   " --checkpoint_version=1" +
                    " --input_binary=" + str(not graph.endswith(".pbtxt")) +
                    " --input_checkpoint=$(location " + freeze_checkpoint + ")" +
                    " --output_graph=$(location " + freeze_file + ")" +
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index affa97062a..074b8e7132 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -255,13 +255,21 @@ def freeze_graph(input_graph,
 
 
 def main(unused_args):
+  if FLAGS.checkpoint_version == 1:
+    checkpoint_version = saver_pb2.SaverDef.V1
+  elif FLAGS.checkpoint_version == 2:
+    checkpoint_version = saver_pb2.SaverDef.V2
+  else:
+    print("Invalid checkpoint version (must be '1' or '2'): %d" %
+          FLAGS.checkpoint_version)
+    return -1
   freeze_graph(FLAGS.input_graph, FLAGS.input_saver, FLAGS.input_binary,
                FLAGS.input_checkpoint, FLAGS.output_node_names,
                FLAGS.restore_op_name, FLAGS.filename_tensor_name,
                FLAGS.output_graph, FLAGS.clear_devices, FLAGS.initializer_nodes,
                FLAGS.variable_names_whitelist, FLAGS.variable_names_blacklist,
                FLAGS.input_meta_graph, FLAGS.input_saved_model_dir,
-               FLAGS.saved_model_tags, FLAGS.checkpoint_version)
+               FLAGS.saved_model_tags, checkpoint_version)
 
 
 if __name__ == "__main__":
@@ -285,7 +293,7 @@ if __name__ == "__main__":
   parser.add_argument(
       "--checkpoint_version",
       type=int,
-      default=saver_pb2.SaverDef.V2,
+      default=2,
       help="Tensorflow variable file format")
   parser.add_argument(
       "--output_graph",
-- 
GitLab


From 7e3acec7cdff6175a5af8f824fe1782d7c95a556 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 7 Feb 2018 20:17:11 -0800
Subject: [PATCH 806/966] [tf.data] Move C++ code backing
 `tf.contrib.data.ignore_errors()` to contrib.

This change moves the `OpKernel` and `DatasetBase` implementations to
"tensorflow/contrib/data/kernels", where they are packaged as a custom
op library. This demonstrates (and enforces by continuous integration)
the ability to build a C++ Dataset implementation in a custom op
library.

Other contrib Dataset implementations will move in subsequent changes.

PiperOrigin-RevId: 184938885
---
 tensorflow/contrib/BUILD                      |  2 +
 .../contrib/cmake/tf_core_kernels.cmake       |  3 +-
 tensorflow/contrib/cmake/tf_core_ops.cmake    |  2 +-
 tensorflow/contrib/cmake/tf_python.cmake      |  4 +-
 tensorflow/contrib/data/BUILD                 |  8 ++--
 tensorflow/contrib/data/kernels/BUILD         | 22 +++++++++
 .../data/kernels}/ignore_errors_dataset_op.cc |  2 +-
 .../{prefetching_ops.cc => dataset_ops.cc}    | 10 ++++
 .../contrib/data/python/kernel_tests/BUILD    |  2 +-
 tensorflow/contrib/data/python/ops/BUILD      | 32 ++++++++-----
 .../data/python/ops/contrib_op_loader.py      | 24 ++++++++++
 .../contrib/data/python/ops/error_ops.py      |  3 +-
 .../data/python/ops/prefetching_ops.py        | 12 ++---
 tensorflow/contrib/eager/python/BUILD         |  2 +-
 .../api_def_IgnoreErrorsDataset.pbtxt         |  4 --
 tensorflow/core/kernels/data/BUILD            | 13 -----
 .../core/ops/compat/ops_history.v1.pbtxt      | 47 -------------------
 tensorflow/core/ops/dataset_ops.cc            |  7 ---
 tensorflow/tools/pip_package/BUILD            |  2 +-
 19 files changed, 98 insertions(+), 103 deletions(-)
 rename tensorflow/{core/kernels/data => contrib/data/kernels}/ignore_errors_dataset_op.cc (98%)
 rename tensorflow/contrib/data/ops/{prefetching_ops.cc => dataset_ops.cc} (86%)
 create mode 100644 tensorflow/contrib/data/python/ops/contrib_op_loader.py
 delete mode 100644 tensorflow/core/api_def/base_api/api_def_IgnoreErrorsDataset.pbtxt

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index f48c2fe92d..6b3343bb2f 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -117,6 +117,7 @@ cc_library(
         "//tensorflow/contrib/boosted_trees:boosted_trees_kernels",
         "//tensorflow/contrib/coder:all_kernels",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_kernels",
+        "//tensorflow/contrib/data/kernels:dataset_kernels",
         "//tensorflow/contrib/factorization/kernels:all_kernels",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_kernel",
@@ -139,6 +140,7 @@ cc_library(
         "//tensorflow/contrib/boosted_trees:boosted_trees_ops_op_lib",
         "//tensorflow/contrib/coder:all_ops",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_ops_op_lib",
+        "//tensorflow/contrib/data:dataset_ops_op_lib",
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 6927bf03f0..f219d5eb57 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -69,8 +69,9 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/ops/prefetching_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/clustering_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/wals_solver_ops.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index c42bc35ce7..59e094812a 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -85,7 +85,7 @@ GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_quantiles "${tensorflow_source_dir}/te
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_stats_accumulator "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(coder "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(cudnn_rnn "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc")
-GENERATE_CONTRIB_OP_LIBRARY(data_prefetching "${tensorflow_source_dir}/tensorflow/contrib/data/ops/prefetching_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(data_dataset "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_clustering "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/clustering_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(framework_variable "${tensorflow_source_dir}/tensorflow/contrib/framework/ops/variable_ops.cc")
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 34c466fa01..b3c6663b8b 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -368,8 +368,8 @@ GENERATE_PYTHON_OP_LIB("contrib_coder_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/coder/python/ops/gen_coder_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_cudnn_rnn_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cudnn_rnn/ops/gen_cudnn_rnn_ops.py)
-GENERATE_PYTHON_OP_LIB("contrib_data_prefetching_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_prefetching_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_data_dataset_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_dataset_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_factorization_clustering_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/factorization/python/ops/gen_clustering_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_factorization_factorization_ops"
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index 8ecc003348..0458199ff7 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -27,13 +27,13 @@ py_library(
 )
 
 tf_custom_op_library(
-    name = "_prefetching_ops.so",
-    srcs = ["ops/prefetching_ops.cc"],
-    deps = ["//tensorflow/contrib/data/kernels:prefetching_kernels"],
+    name = "_dataset_ops.so",
+    srcs = ["ops/dataset_ops.cc"],
+    deps = ["//tensorflow/contrib/data/kernels:dataset_kernels"],
 )
 
 tf_gen_op_libs(
-    op_lib_names = ["prefetching_ops"],
+    op_lib_names = ["dataset_ops"],
 )
 
 filegroup(
diff --git a/tensorflow/contrib/data/kernels/BUILD b/tensorflow/contrib/data/kernels/BUILD
index 4cb53741eb..56471911c5 100644
--- a/tensorflow/contrib/data/kernels/BUILD
+++ b/tensorflow/contrib/data/kernels/BUILD
@@ -17,6 +17,28 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "ignore_errors_dataset_op",
+    srcs = ["ignore_errors_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "dataset_kernels",
+    deps = [
+        ":ignore_errors_dataset_op",
+        ":prefetching_kernels",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/core/kernels/data/ignore_errors_dataset_op.cc b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
similarity index 98%
rename from tensorflow/core/kernels/data/ignore_errors_dataset_op.cc
rename to tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
index 99df699d71..bb29df60e8 100644
--- a/tensorflow/core/kernels/data/ignore_errors_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/data/ops/prefetching_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
similarity index 86%
rename from tensorflow/contrib/data/ops/prefetching_ops.cc
rename to tensorflow/contrib/data/ops/dataset_ops.cc
index 23cb62b6f0..289ffa1d9c 100644
--- a/tensorflow/contrib/data/ops/prefetching_ops.cc
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -17,6 +17,16 @@ limitations under the License.
 
 namespace tensorflow {
 
+REGISTER_OP("IgnoreErrorsDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that contains the elements of `input_dataset` ignoring errors.
+)doc");
+
 REGISTER_OP("FunctionBufferingResource")
     .Input("string_arg: string")
     .Input("target_device: string")
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 04a21f2b0f..3ee82933bc 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -535,7 +535,7 @@ py_test(
         "no_oss",  # b/68785503
     ],
     deps = [
-        "//tensorflow/contrib/data/python/ops:prefetching_py",
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 4349085a10..0eb5d4635d 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -109,6 +109,8 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dataset_ops_gen",
@@ -130,36 +132,44 @@ py_library(
 )
 
 tf_gen_op_wrapper_py(
-    name = "prefetching_ops",
-    out = "gen_prefetching_ops.py",
-    deps = ["//tensorflow/contrib/data:prefetching_ops_op_lib"],
+    name = "gen_dataset_ops",
+    out = "gen_dataset_ops.py",
+    deps = ["//tensorflow/contrib/data:dataset_ops_op_lib"],
 )
 
 tf_kernel_library(
-    name = "prefetching_ops_kernels",
+    name = "dataset_ops_kernels",
     deps = [
-        "//tensorflow/contrib/data/kernels:prefetching_kernels",
+        "//tensorflow/contrib/data/kernels:dataset_kernels",
         "//tensorflow/core:framework",
     ],
     alwayslink = 1,
 )
 
 tf_custom_op_py_library(
-    name = "prefetching_py",
-    srcs = ["prefetching_ops.py"],
-    dso = ["//tensorflow/contrib/data:_prefetching_ops.so"],
+    name = "contrib_op_loader",
+    srcs = ["contrib_op_loader.py"],
+    dso = ["//tensorflow/contrib/data:_dataset_ops.so"],
     kernels = [
-        ":prefetching_ops_kernels",
-        "//tensorflow/contrib/data:prefetching_ops_op_lib",
+        ":dataset_ops_kernels",
+        "//tensorflow/contrib/data:dataset_ops_op_lib",
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":prefetching_ops",
+        ":gen_dataset_ops",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:platform",
     ],
 )
 
+py_library(
+    name = "prefetching_ops",
+    srcs = ["prefetching_ops.py"],
+    deps = [
+        ":contrib_op_loader",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/data/python/ops/contrib_op_loader.py b/tensorflow/contrib/data/python/ops/contrib_op_loader.py
new file mode 100644
index 0000000000..8f495a9dc9
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/contrib_op_loader.py
@@ -0,0 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python helper for loading contrib ops and kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+_dataset_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("../../_dataset_ops.so"))
diff --git a/tensorflow/contrib/data/python/ops/error_ops.py b/tensorflow/contrib/data/python/ops/error_ops.py
index aa629cba47..6c21e489f7 100644
--- a/tensorflow/contrib/data/python/ops/error_ops.py
+++ b/tensorflow/contrib/data/python/ops/error_ops.py
@@ -17,10 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
-from tensorflow.python.ops import gen_dataset_ops
 
 
 def ignore_errors():
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index cfe8012b56..96a9e9ed66 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -17,12 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import gen_prefetching_ops
-from tensorflow.contrib.util import loader
-from tensorflow.python.platform import resource_loader
-
-_prefetching_ops = loader.load_op_library(
-    resource_loader.get_path_to_datafile("../../_prefetching_ops.so"))
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
 
 
 # TODO(rohanj): Add a python class that constructs resource in the __init__
@@ -35,7 +31,7 @@ def function_buffering_resource(string_arg,
                                 thread_pool_size=1,
                                 container="",
                                 name=None):
-  return gen_prefetching_ops.function_buffering_resource(
+  return gen_dataset_ops.function_buffering_resource(
       string_arg=string_arg,
       target_device=target_device,
       shared_name=shared_name,
@@ -49,7 +45,7 @@ def function_buffering_resource(string_arg,
 def function_buffering_resource_get_next(function_buffer_resource,
                                          output_types,
                                          name=None):
-  return gen_prefetching_ops.function_buffering_resource_get_next(
+  return gen_dataset_ops.function_buffering_resource_get_next(
       function_buffer_resource=function_buffer_resource,
       output_types=output_types,
       name=name)
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 3df056b070..46406e3908 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -52,7 +52,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/data/python/ops:prefetching_py",
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:errors",
diff --git a/tensorflow/core/api_def/base_api/api_def_IgnoreErrorsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_IgnoreErrorsDataset.pbtxt
deleted file mode 100644
index e492d90287..0000000000
--- a/tensorflow/core/api_def/base_api/api_def_IgnoreErrorsDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "IgnoreErrorsDataset"
-  summary: "Creates a dataset that contains the elements of `input_dataset` ignoring errors."
-}
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 8e91baaa1c..1e3b0c231f 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -318,18 +318,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "ignore_errors_dataset_op",
-    srcs = ["ignore_errors_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "stats_dataset_ops",
     srcs = ["stats_dataset_ops.cc"],
@@ -532,7 +520,6 @@ tf_kernel_library(
         ":filter_dataset_op",
         ":flat_map_dataset_op",
         ":group_by_window_dataset_op",
-        ":ignore_errors_dataset_op",
         ":interleave_dataset_op",
         ":iterator_ops",
         ":map_and_batch_dataset_op",
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 8db4373fdc..7a7ec5c898 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -21551,53 +21551,6 @@ op {
     }
   }
 }
-op {
-  name: "IgnoreErrorsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "IgnoreErrorsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "Imag"
   input_arg {
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 3c8e9a8a5f..9e98f56c74 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -107,13 +107,6 @@ REGISTER_OP("SkipDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("IgnoreErrorsDataset")
-    .Input("input_dataset: variant")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("BytesProducedStatsDataset")
     .Input("input_dataset: variant")
     .Input("tag: string")
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 3189bd09fc..d9d7929959 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -147,7 +147,7 @@ sh_binary(
             "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
             "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
             "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
-            "//tensorflow/contrib/data/python/ops:prefetching_py",
+            "//tensorflow/contrib/data/python/ops:contrib_op_loader",
             "//tensorflow/contrib/eager/python/examples:examples_pip",
             "//tensorflow/contrib/eager/python:checkpointable",
             "//tensorflow/contrib/eager/python:evaluator",
-- 
GitLab


From 120a110f704244cf233ab741949f601fbe4bb71c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 20:49:37 -0800
Subject: [PATCH 807/966] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 184941052

---
 tensorflow/go/op/wrappers.go | 331 +++++++++++++++++------------------
 1 file changed, 157 insertions(+), 174 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index a7290ff117..db01f1068d 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -4580,68 +4580,6 @@ func CriticalSectionOp(scope *Scope, optional ...CriticalSectionOpAttr) (resourc
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
-type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["min"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["max"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Compute gradients for a FakeQuantWithMinMaxArgs operation.
-//
-// Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
-//
-// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
-// `gradients * (inputs >= min && inputs <= max)`.
-func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgsGradient",
-		Input: []tf.Input{
-			gradients, inputs,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // AvgPool3DAttr is an optional argument to AvgPool3D.
 type AvgPool3DAttr func(optionalAttr)
 
@@ -19980,118 +19918,6 @@ func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset t
 	return op.Output(0)
 }
 
-// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
-func IgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "IgnoreErrorsDataset",
-		Input: []tf.Input{
-			input_dataset,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
-
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
-	}
-}
-
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
-//
-// Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
-//
-//
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"T": T}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
-		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReadFile",
-		Input: []tf.Input{
-			filename,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Concatenates tensors along one dimension.
-//
-// Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatV2",
-		Input: []tf.Input{
-			tf.OutputList(values), axis,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Adds a value to the current value of a variable.
 //
 // Any ReadVariableOp which depends directly or indirectly on this assign is
@@ -20813,6 +20639,68 @@ func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Outp
 	return op.Output(0)
 }
 
+// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
+type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["min"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["max"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Compute gradients for a FakeQuantWithMinMaxArgs operation.
+//
+// Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+//
+// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
+// `gradients * (inputs >= min && inputs <= max)`.
+func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FakeQuantWithMinMaxArgsGradient",
+		Input: []tf.Input{
+			gradients, inputs,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // BatchToSpace for 4-D tensors of type T.
 //
 // This is a legacy version of the more general BatchToSpaceND.
@@ -22325,6 +22213,101 @@ func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	return scope.AddOperation(opspec)
 }
 
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
+type CropAndResizeGradImageAttr func(optionalAttr)
+
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
+//
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+	return func(m optionalAttr) {
+		m["method"] = value
+	}
+}
+
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+//
+// Arguments:
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
+//
+//
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CropAndResizeGradImage",
+		Input: []tf.Input{
+			grads, boxes, box_ind, image_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReadFile",
+		Input: []tf.Input{
+			filename,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Concatenates tensors along one dimension.
+//
+// Arguments:
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConcatV2",
+		Input: []tf.Input{
+			tf.OutputList(values), axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Forwards the value of an available tensor from `inputs` to `output`.
 //
 // `Merge` waits for at least one of the tensors in `inputs` to become available.
-- 
GitLab


From e78223d585b836c134423e1fc48d0347e8833183 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Wed, 7 Feb 2018 20:50:09 -0800
Subject: [PATCH 808/966] Move TPU doc to "using_tpu". Add short titles to some
 docs.

PiperOrigin-RevId: 184941101
---
 tensorflow/docs_src/install/leftnav_files        | 14 +++++++-------
 .../docs_src/programmers_guide/debugger.md       |  2 +-
 .../docs_src/programmers_guide/leftnav_files     |  9 ++++++---
 .../using_tpu.md}                                |  0
 tensorflow/docs_src/tutorials/leftnav_files      | 16 ++++++++--------
 5 files changed, 22 insertions(+), 19 deletions(-)
 rename tensorflow/docs_src/{api_guides/python/TPUEstimator.md => programmers_guide/using_tpu.md} (100%)

diff --git a/tensorflow/docs_src/install/leftnav_files b/tensorflow/docs_src/install/leftnav_files
index 0e8b5ae7a1..e523e06f67 100644
--- a/tensorflow/docs_src/install/leftnav_files
+++ b/tensorflow/docs_src/install/leftnav_files
@@ -1,16 +1,16 @@
 index.md
 
 ### Python
-install_linux.md
-install_mac.md
-install_windows.md
-install_sources.md
+install_linux.md: Ubuntu
+install_mac.md: MacOS
+install_windows.md: Windows
+install_sources.md: From source
 >>>
 migration.md
 
 ### Other Languages
-install_java.md
-install_go.md
-install_c.md
+install_java.md: Java
+install_go.md: Go
+install_c.md: C
 
 
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index 9eaee27028..dbc4517087 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -1,4 +1,4 @@
-# Debugging TensorFlow Programs
+# TensorFlow Debugger
 
 <!-- [comment]: TODO(barryr): Links to and from sections on "Graphs" & "Monitoring Learning". -->
 
diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files
index 38de3ccc3e..3fe4cb2dda 100644
--- a/tensorflow/docs_src/programmers_guide/leftnav_files
+++ b/tensorflow/docs_src/programmers_guide/leftnav_files
@@ -10,7 +10,10 @@ tensors.md
 variables.md
 graphs.md
 saved_model.md
+
+### Accelerators
 using_gpu.md
+using_tpu.md
 
 ### ML Concepts
 embedding.md
@@ -19,9 +22,9 @@ embedding.md
 debugger.md
 
 ### TensorBoard
-summaries_and_tensorboard.md
-graph_viz.md
-tensorboard_histograms.md
+summaries_and_tensorboard.md: Visualizing Learning
+graph_viz.md: Graphs
+tensorboard_histograms.md: Histograms
 
 ### Misc
 version_compat.md
diff --git a/tensorflow/docs_src/api_guides/python/TPUEstimator.md b/tensorflow/docs_src/programmers_guide/using_tpu.md
similarity index 100%
rename from tensorflow/docs_src/api_guides/python/TPUEstimator.md
rename to tensorflow/docs_src/programmers_guide/using_tpu.md
diff --git a/tensorflow/docs_src/tutorials/leftnav_files b/tensorflow/docs_src/tutorials/leftnav_files
index 41ffdc8601..888052428f 100644
--- a/tensorflow/docs_src/tutorials/leftnav_files
+++ b/tensorflow/docs_src/tutorials/leftnav_files
@@ -1,22 +1,22 @@
 index.md
 
 ### Images
-layers.md
-image_recognition.md
-image_retraining.md
+layers.md: MNIST
+image_recognition.md: Image Recognition
+image_retraining.md: Image Retraining
 deep_cnn.md
 
 ### Sequences
 recurrent.md
-seq2seq.md
-recurrent_quickdraw.md
+seq2seq.md: Neural Machine Translation
+recurrent_quickdraw.md: Drawing Classification
 audio_recognition.md
 
 ### Data Representation
-wide.md
-wide_and_deep.md
+wide.md: Linear Models
+wide_and_deep.md: Wide & Deep Learning
 word2vec.md
-kernel_methods.md
+kernel_methods.md: Kernel Methods
 
 ### Non-ML
 mandelbrot.md
-- 
GitLab


From 5a174f605ee22ef8ecb562215051c8a11551d5bb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 21:05:41 -0800
Subject: [PATCH 809/966] Temporarily weaken Identity pruning in model_pruner
 while investigating test failure of
 //robotics/learning/sensor_predict:utils_multi_sensor_rnn_test.

PiperOrigin-RevId: 184942554
---
 .../core/grappler/optimizers/model_pruner.cc       | 14 ++++++++------
 .../core/grappler/optimizers/model_pruner_test.cc  |  5 +++++
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc
index 01282401a3..ece9df012e 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner.cc
@@ -32,12 +32,14 @@ bool IsTrivialOp(const NodeDef& node, const GraphRewriter& rewriter) {
   if (IsStopGradient(node)) {
     return true;
   }
-  if (IsIdentity(node) &&
-      !(rewriter.FeedsMerge(node) &&
-        rewriter.IsDrivenByControlDependency(node)) &&
-      !(rewriter.IsDrivenBySwitch(node) &&
-        rewriter.DrivesControlDependency(node))) {
-    return true;
+  if (IsIdentity(node)) {
+    if (rewriter.FeedsMerge(node) || rewriter.IsDrivenBySwitch(node) ||
+        rewriter.IsDrivenByControlDependency(node) ||
+        rewriter.DrivesControlDependency(node)) {
+      return false;
+    } else {
+      return true;
+    }
   }
   if (IsAddN(node) && NumNonControlInputs(node) <= 1) {
     return true;
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index c39444299e..8480a74572 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -234,6 +234,10 @@ TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) {
   EXPECT_EQ("b", new_e.input(0));
 }
 
+// TODO(rmlarsen): Reenable this test when the issues with
+// //robotics/learning/sensor_predict:utils_multi_sensor_rnn_test
+// have been resolved.
+/*
 TEST_F(ModelPrunerTest, PruningForwardsCtrlDependencies) {
   // Build a simple graph with a few trivially prunable ops.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -276,6 +280,7 @@ TEST_F(ModelPrunerTest, PruningForwardsCtrlDependencies) {
     }
   }
 }
+*/
 
 TEST_F(ModelPrunerTest, PruningPerservesFetch) {
   // Build a simple graph with a few trivially prunable ops.
-- 
GitLab


From fcfe9df7f9c63dfe451046f3bce4c78fb4c1b390 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 21:11:56 -0800
Subject: [PATCH 810/966] Fixing the anchor link for the Math ops guide on
 Segmentation (it's a capital S but our docs link to a lowercase s).

PiperOrigin-RevId: 184942994
---
 tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt     | 2 +-
 tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt    | 2 +-
 tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt     | 2 +-
 tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt    | 2 +-
 tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt     | 2 +-
 .../core/api_def/base_api/api_def_SparseSegmentMean.pbtxt     | 2 +-
 .../base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt   | 2 +-
 .../core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt    | 2 +-
 .../base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt  | 2 +-
 .../core/api_def/base_api/api_def_SparseSegmentSum.pbtxt      | 2 +-
 .../base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt    | 2 +-
 .../core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt    | 2 +-
 .../core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt    | 2 +-
 tensorflow/python/ops/math_ops.py                             | 4 ++--
 14 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
index db890cb2f5..5e2912fcdd 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
index 4713c52310..a7d85b3f4e 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the mean along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
index 6316bfd1a5..74fc598218 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
index a16d03d467..4c4363e524 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the product along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
index 0686e17f9b..583ab3904f 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the sum along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
index 18e6660595..866e04e97b 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
@@ -21,7 +21,7 @@ END
   }
   summary: "Computes the mean along sparse segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
index d6e1054003..af4bc75fa0 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -30,7 +30,7 @@ END
 Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
index 3fdeb66aed..194bcea726 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
@@ -23,7 +23,7 @@ END
   description: <<END
 N is the size of the segment being reduced.
 
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
index 9ba98b8191..8b502928a5 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -32,7 +32,7 @@ N is the size of the segment being reduced.
 Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
index cdf44a89a3..dfd50bf273 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
@@ -21,7 +21,7 @@ END
   }
   summary: "Computes the sum along sparse segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
index 3aeaba38e9..3bc16577ff 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -30,7 +30,7 @@ END
 Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 For example:
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index 8298d62f25..4e69e0bc63 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the Max along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
index 77a96d1e03..eb5d0d1247 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the sum along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 9a8ac93de9..448cc905d6 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -2557,7 +2557,7 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
                        num_segments=None):
   r"""Computes the sum along sparse segments of a tensor.
 
-  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+  Read @{$math_ops#Segmentation$the section on segmentation} for an explanation
   of segments.
 
   Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
@@ -2632,7 +2632,7 @@ def sparse_segment_mean(data, indices, segment_ids, name=None,
                         num_segments=None):
   r"""Computes the mean along sparse segments of a tensor.
 
-  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+  Read @{$math_ops#Segmentation$the section on segmentation} for an explanation
   of segments.
 
   Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-- 
GitLab


From bbe79073d63f610ec6f3448d616fe8d72c2502d4 Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Wed, 7 Feb 2018 21:31:25 -0800
Subject: [PATCH 811/966] [debug]   convert const data goes out of scope.

---
 .../contrib/tensorrt/convert/convert_nodes.cc      | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index bba69ea93a..4e9fc9efab 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -1128,18 +1128,8 @@ tensorflow::Status ConvertConst(Converter& ctx,
   } else if (!weights_tensor.tensor_content().empty()) {
     VLOG(2) << "TENSOR!!!" << node_def.name();
     const auto& content = weights_tensor.tensor_content();
-
-    std::vector<char> values;
-    if (content.size() > 0) {
-      const int dtype_size = tensorflow::DataTypeSize(dtype);
-      CHECK_EQ(0, content.size() % dtype_size)
-          << "Tensor content size (" << content.size()
-          << ") is not a multiple of " << dtype_size;
-      values.resize(content.size());
-      port::CopyToArray(content, values.data());
-    }
-    weights =
-        TRT_ShapedWeights(dtype, nullptr, GetTensorShape(tensor), &values);
+      weights = TRT_ShapedWeights(dtype, weights_tensor.tensor_content().data(),
+                                  GetTensorShape(tensor));
   } else {
     return tensorflow::errors::Unimplemented(
         "Not supported constant type, at " + node_def.name());
-- 
GitLab


From 255712a4544a404177d75d492713c0267f874f60 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Wed, 7 Feb 2018 21:56:28 -0800
Subject: [PATCH 812/966] Switch to using ResourceVariable in Keras for full
 Eager-mode compatibility.

PiperOrigin-RevId: 184945626
---
 .../python/keras/_impl/keras/backend.py       | 29 +++++++++-------
 .../python/keras/_impl/keras/constraints.py   |  9 ++---
 .../python/keras/_impl/keras/optimizers.py    | 34 +++++++++++--------
 3 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 098ea063f9..59050f24fb 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -48,6 +48,7 @@ from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
@@ -589,7 +590,7 @@ def variable(value, dtype=None, name=None, constraint=None):
     v._keras_shape = sparse_coo.shape
     v._uses_learning_phase = False
     return v
-  v = variables_module.Variable(
+  v = resource_variable_ops.ResourceVariable(
       value,
       dtype=dtypes_module.as_dtype(dtype),
       name=name,
@@ -3195,7 +3196,7 @@ def categorical_crossentropy(target, output, from_logits=False):
   # expects logits, Keras expects probabilities.
   if not from_logits:
     # scale preds so that the class probas of each sample sum to 1
-    output /= math_ops.reduce_sum(
+    output = output / math_ops.reduce_sum(  # pylint: disable=g-no-augmented-assignment
         output, len(output.get_shape()) - 1, True)
     # manual computation of crossentropy
     epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
@@ -4099,44 +4100,46 @@ def bias_add(x, bias, data_format=None):
     raise ValueError(
         'Unexpected bias dimensions %d, expect to be 1 or %d dimensions' %
         (len(bias_shape), ndim(x)))
+  # pylint: disable=g-no-augmented-assignment
   if ndim(x) == 5:
     if data_format == 'channels_first':
       if len(bias_shape) == 1:
-        x += reshape(bias, (1, bias_shape[0], 1, 1, 1))
+        x = x + reshape(bias, (1, bias_shape[0], 1, 1, 1))
       else:
-        x += reshape(bias, (1, bias_shape[3]) + bias_shape[:3])
+        x = x + reshape(bias, (1, bias_shape[3]) + bias_shape[:3])
     elif data_format == 'channels_last':
       if len(bias_shape) == 1:
-        x += reshape(bias, (1, 1, 1, bias_shape[0]))
+        x = x + reshape(bias, (1, 1, 1, bias_shape[0]))
       else:
-        x += reshape(bias, (1,) + bias_shape)
+        x = x + reshape(bias, (1,) + bias_shape)
   elif ndim(x) == 4:
     if data_format == 'channels_first':
       if len(bias_shape) == 1:
         if _has_nchw_support():
           x = nn.bias_add(x, bias, data_format='NCHW')
         else:
-          x += reshape(bias, (1, bias_shape[0], 1, 1))
+          x = x + reshape(bias, (1, bias_shape[0], 1, 1))
       else:
-        x += reshape(bias, (1, bias_shape[2]) + bias_shape[:2])
+        x = x + reshape(bias, (1, bias_shape[2]) + bias_shape[:2])
     elif data_format == 'channels_last':
       if len(bias_shape) == 1:
         x = nn.bias_add(x, bias, data_format='NHWC')
       else:
-        x += reshape(bias, (1,) + bias_shape)
+        x = x + reshape(bias, (1,) + bias_shape)
   elif ndim(x) == 3:
     if data_format == 'channels_first':
       if len(bias_shape) == 1:
-        x += reshape(bias, (1, bias_shape[0], 1))
+        x = x + reshape(bias, (1, bias_shape[0], 1))
       else:
-        x += reshape(bias, (1, bias_shape[1], bias_shape[0]))
+        x = x + reshape(bias, (1, bias_shape[1], bias_shape[0]))
     elif data_format == 'channels_last':
       if len(bias_shape) == 1:
-        x += reshape(bias, (1, 1, bias_shape[0]))
+        x = x + reshape(bias, (1, 1, bias_shape[0]))
       else:
-        x += reshape(bias, (1,) + bias_shape)
+        x = x + reshape(bias, (1,) + bias_shape)
   else:
     x = nn.bias_add(x, bias)
+  # pylint: enable=g-no-augmented-assignment
   return x
 
 
diff --git a/tensorflow/python/keras/_impl/keras/constraints.py b/tensorflow/python/keras/_impl/keras/constraints.py
index 4b051c93f3..05b834a7a9 100644
--- a/tensorflow/python/keras/_impl/keras/constraints.py
+++ b/tensorflow/python/keras/_impl/keras/constraints.py
@@ -64,8 +64,7 @@ class MaxNorm(Constraint):
   def __call__(self, w):
     norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
     desired = K.clip(norms, 0, self.max_value)
-    w *= (desired / (K.epsilon() + norms))
-    return w
+    return w * (desired / (K.epsilon() + norms))
 
   def get_config(self):
     return {'max_value': self.max_value, 'axis': self.axis}
@@ -76,8 +75,7 @@ class NonNeg(Constraint):
   """
 
   def __call__(self, w):
-    w *= K.cast(K.greater_equal(w, 0.), K.floatx())
-    return w
+    return w * K.cast(K.greater_equal(w, 0.), K.floatx())
 
 
 class UnitNorm(Constraint):
@@ -148,8 +146,7 @@ class MinMaxNorm(Constraint):
     desired = (
         self.rate * K.clip(norms, self.min_value, self.max_value) +
         (1 - self.rate) * norms)
-    w *= (desired / (K.epsilon() + norms))
-    return w
+    return w * (desired / (K.epsilon() + norms))
 
   def get_config(self):
     return {
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py
index a55a5e39a6..a082677851 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers.py
@@ -24,7 +24,6 @@ import copy
 import six
 from six.moves import zip  # pylint: disable=redefined-builtin
 
-from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
 from tensorflow.python.keras._impl.keras import backend as K
@@ -180,8 +179,9 @@ class SGD(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. /
-             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
     # momentum
     shapes = [K.int_shape(p) for p in params]
     moments = [K.zeros(shape) for shape in shapes]
@@ -251,8 +251,9 @@ class RMSprop(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. /
-             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       # update accumulator
@@ -311,8 +312,9 @@ class Adagrad(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. /
-             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       new_a = a + K.square(g)  # update accumulator
@@ -373,8 +375,9 @@ class Adadelta(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. /
-             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
 
     for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
       # update accumulator
@@ -451,8 +454,9 @@ class Adam(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. /
-             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
 
     t = K.cast(self.iterations, K.floatx()) + 1
     lr_t = lr * (
@@ -539,8 +543,9 @@ class Adamax(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. /
-             (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
 
     t = K.cast(self.iterations, K.floatx()) + 1
     lr_t = lr / (1. - K.pow(self.beta_1, t))
@@ -681,8 +686,7 @@ class TFOptimizer(Optimizer):
   def __init__(self, optimizer):  # pylint: disable=super-init-not-called
     self.optimizer = optimizer
     with K.name_scope(self.__class__.__name__):
-      if context.in_graph_mode():
-        self.iterations = K.variable(0, dtype='int64', name='iterations')
+      self.iterations = K.variable(0, dtype='int64', name='iterations')
 
   def apply_gradients(self, grads):
     self.optimizer.apply_gradients(grads)
-- 
GitLab


From ab2c71d78d0aeaefb0e3347a320c0ff94e382ace Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 7 Feb 2018 22:18:45 -0800
Subject: [PATCH 813/966] Update ops-related pbtxt files.

PiperOrigin-RevId: 184947099
---
 tensorflow/core/ops/ops.pbtxt | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 2e96211fdc..d9ac21c8ce 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -10237,29 +10237,6 @@ op {
     }
   }
 }
-op {
-  name: "IgnoreErrorsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "Imag"
   input_arg {
-- 
GitLab


From 14f5cfc159dff6855bde7ac5b0e037eec0229e89 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 03:18:36 -0800
Subject: [PATCH 814/966] Automated g4 rollback of changelist 184303789

PiperOrigin-RevId: 184970903
---
 tensorflow/python/ops/array_grad.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index c9292184e6..9745d38dc2 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
@@ -115,6 +116,19 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
                                                         non_neg_concat_dim)
       out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
     else:
+      if constant_op.is_constant(concat_dim):
+        # If concat_dim is a constant defined in a different context,
+        # then we duplicate it in the current context to avoid passing it
+        # through an Enter node.
+        # This is a small optimization in general, but it is required when
+        # compiling with XLA, as XLA needs the concat input to be folded into a
+        # constant.
+        grad_context = control_flow_util.GetOutputContext(grad.op)
+        dim_context = control_flow_util.GetOutputContext(concat_dim.op)
+        if dim_context != grad_context:
+          value = tensor_util.constant_value(concat_dim)
+          concat_dim = constant_op.constant(value=value, dtype=concat_dim.dtype)
+
       # Using mod here for convenience since concat_dim is already verified
       # in concat implementation to be within the allowed [-rank, rank) range.
       non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])
-- 
GitLab


From 122e1d9ab919415470b895aa6d440f7795c8f256 Mon Sep 17 00:00:00 2001
From: Ilya Biryukov <ibiryukov@google.com>
Date: Thu, 8 Feb 2018 04:18:40 -0800
Subject: [PATCH 815/966] Patch cub on download to fix compilation error with
 clang.

The same patch was sent via PR to cub upstream:
https://github.com/NVlabs/cub/pull/125

PiperOrigin-RevId: 184975304
---
 tensorflow/workspace.bzl                      |  3 +++
 third_party/cub/BUILD                         |  0
 .../cub/fix_compilation_in_clang.patch        | 23 +++++++++++++++++++
 3 files changed, 26 insertions(+)
 create mode 100644 third_party/cub/BUILD
 create mode 100644 third_party/cub/fix_compilation_in_clang.patch

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 12d3c739cc..afd371d016 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -670,6 +670,9 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       sha256 = "20a1a39fd97e5da7f40f5f2e7fd73fd2ea59f9dc4bb8a6c5f228aa543e727e31",
       strip_prefix = "cub-1.7.4",
       build_file = str(Label("//third_party:cub.BUILD")),
+      # TODO: remove the patch when upstream fix is accepted and released.
+      #       PR with a fix: https://github.com/NVlabs/cub/pull/125
+      patch_file = str(Label("//third_party/cub:fix_compilation_in_clang.patch")),
   )
 
   tf_http_archive(
diff --git a/third_party/cub/BUILD b/third_party/cub/BUILD
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/third_party/cub/fix_compilation_in_clang.patch b/third_party/cub/fix_compilation_in_clang.patch
new file mode 100644
index 0000000000..384e674f20
--- /dev/null
+++ b/third_party/cub/fix_compilation_in_clang.patch
@@ -0,0 +1,23 @@
+From 565b77f7c82048871a4d5e3e506dc663d53cd469 Mon Sep 17 00:00:00 2001
+From: Ilya Biryukov <ibiryukov@google.com>
+Date: Fri, 26 Jan 2018 18:46:06 +0100
+Subject: [PATCH] Added missing 'template' keyword.
+
+To unbreak compilation with clang.
+---
+ cub/device/dispatch/dispatch_radix_sort.cuh | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
+index 7fbc621f..f622e212 100644
+--- a/cub/device/dispatch/dispatch_radix_sort.cuh
++++ b/cub/device/dispatch/dispatch_radix_sort.cuh
+@@ -104,7 +104,7 @@ __global__ void DeviceRadixSortUpsweepKernel(
+     CTA_SYNC();
+ 
+     // Write out digit counts (striped)
+-    upsweep.ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
++    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
+ }
+ 
+ 
-- 
GitLab


From 474543b4ff5b9ecd2ad949e268c2a81f2986dccc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 07:36:03 -0800
Subject: [PATCH 816/966] Emitting type-dependent cond and while_loop code.

PiperOrigin-RevId: 184993166
---
 .../contrib/py2tf/converters/control_flow.py  |  6 +--
 .../py2tf/converters/control_flow_test.py     |  5 +++
 tensorflow/contrib/py2tf/impl/BUILD           |  1 +
 tensorflow/contrib/py2tf/impl/api_test.py     |  6 ++-
 tensorflow/contrib/py2tf/utils/BUILD          |  1 +
 .../contrib/py2tf/utils/multiple_dispatch.py  | 38 ++++++++++++++++++-
 6 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/py2tf/converters/control_flow.py b/tensorflow/contrib/py2tf/converters/control_flow.py
index 6a94258103..46316919c8 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow.py
+++ b/tensorflow/contrib/py2tf/converters/control_flow.py
@@ -99,7 +99,7 @@ class ControlFlowTransformer(transformer.Base):
           aliased_new_names, = aliased_orig_names,
           orelse
           return (all_results,)
-        results = tf.cond(test, body_name, orelse_name)
+        results = py2tf_utils.run_cond(test, body_name, orelse_name)
       """
       body_name = self.context.namer.new_symbol('if_true', all_referenced)
       return templates.replace(
@@ -122,7 +122,7 @@ class ControlFlowTransformer(transformer.Base):
         def orelse_name():
           orelse
           return (all_results,)
-        results = tf.cond(test, body_name, orelse_name)
+        results = py2tf_utils.run_cond(test, body_name, orelse_name)
       """
       body_name = self.context.namer.new_symbol('if_true', all_referenced)
       return templates.replace(
@@ -168,7 +168,7 @@ class ControlFlowTransformer(transformer.Base):
       def body_name(state_ssf):
         body
         return state_ssf,
-      state_ast_tuple = tf.while_loop(test_name, body_name, [state])
+      state_ast_tuple = py2tf_utils.run_while(test_name, body_name, [state])
     """
     node = templates.replace(
         template,
diff --git a/tensorflow/contrib/py2tf/converters/control_flow_test.py b/tensorflow/contrib/py2tf/converters/control_flow_test.py
index f192bf1b46..677d60e0af 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow_test.py
+++ b/tensorflow/contrib/py2tf/converters/control_flow_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.py2tf import utils
 from tensorflow.contrib.py2tf.converters import control_flow
 from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.pyct import compiler
@@ -53,6 +54,7 @@ class ControlFlowTest(converter_test_base.TestCase):
     node = control_flow.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
+    setattr(result, 'py2tf_utils', utils)
 
     with self.test_session() as sess:
       self.assertEqual((10, 5, 5),
@@ -69,6 +71,7 @@ class ControlFlowTest(converter_test_base.TestCase):
     node = control_flow.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
+    setattr(result, 'py2tf_utils', utils)
 
     with self.test_session() as sess:
       self.assertEqual(0, sess.run(result.test_fn(constant_op.constant(5))))
@@ -88,6 +91,7 @@ class ControlFlowTest(converter_test_base.TestCase):
     node = control_flow.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
+    setattr(result, 'py2tf_utils', utils)
 
     with self.test_session() as sess:
       self.assertEqual((-1, 0), sess.run(
@@ -106,6 +110,7 @@ class ControlFlowTest(converter_test_base.TestCase):
     node = control_flow.transform(node, self.ctx)
     result = compiler.ast_to_object(node)
     setattr(result, 'tf', control_flow_ops)
+    setattr(result, 'py2tf_utils', utils)
 
     with self.test_session() as sess:
       self.assertEqual(-1, sess.run(result.test_fn(constant_op.constant(1))))
diff --git a/tensorflow/contrib/py2tf/impl/BUILD b/tensorflow/contrib/py2tf/impl/BUILD
index 22f0c25cab..f5378917a3 100644
--- a/tensorflow/contrib/py2tf/impl/BUILD
+++ b/tensorflow/contrib/py2tf/impl/BUILD
@@ -39,6 +39,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":impl",
+        "//tensorflow/contrib/py2tf/utils",
         "//tensorflow/python:client_testlib",
     ],
 )
diff --git a/tensorflow/contrib/py2tf/impl/api_test.py b/tensorflow/contrib/py2tf/impl/api_test.py
index dbd079a3ca..02cd8ed2d0 100644
--- a/tensorflow/contrib/py2tf/impl/api_test.py
+++ b/tensorflow/contrib/py2tf/impl/api_test.py
@@ -32,7 +32,9 @@ class ApiTest(test.TestCase):
     config.DEFAULT_UNCOMPILED_MODULES.add((math_ops.__name__,))
     config.COMPILED_IMPORT_STATEMENTS = (
         'from tensorflow.python.ops '
-        'import control_flow_ops as tf',)
+        'import control_flow_ops as tf',
+        'from tensorflow.contrib.py2tf import utils as '
+        'py2tf_utils')
 
   def test_decorator_recurses(self):
 
@@ -183,7 +185,7 @@ class ApiTest(test.TestCase):
     compiled_code = api.to_code(test_fn)
 
     # Just check for some key words and that it is parseable Python code.
-    self.assertRegexpMatches(compiled_code, 'tf\\.while_loop')
+    self.assertRegexpMatches(compiled_code, 'py2tf_utils\\.run_while')
     self.assertIsNotNone(parser.parse_str(compiled_code))
 
 
diff --git a/tensorflow/contrib/py2tf/utils/BUILD b/tensorflow/contrib/py2tf/utils/BUILD
index 4b7a4b16c7..c2987fcace 100644
--- a/tensorflow/contrib/py2tf/utils/BUILD
+++ b/tensorflow/contrib/py2tf/utils/BUILD
@@ -28,6 +28,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/contrib/py2tf/utils/multiple_dispatch.py b/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
index d8a67255a4..a855fdc075 100644
--- a/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
+++ b/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
@@ -18,11 +18,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from tensorflow.contrib.py2tf.utils.type_check import is_tensor
 from tensorflow.python.ops import control_flow_ops
 
 
 def run_cond(condition, true_fn, false_fn):
+  """Type-dependent functional conditional.
+
+  Args:
+    condition: A Tensor or Python bool.
+    true_fn: A Python callable implementing the true branch of the conditional.
+    false_fn: A Python callable implementing the false branch of the
+      conditional.
+
+  Returns:
+    result: The result of calling the appropriate branch. If condition is a
+    Tensor, tf.cond will be used. Otherwise, a standard Python if statement will
+    be ran.
+  """
   if is_tensor(condition):
     return control_flow_ops.cond(condition, true_fn, false_fn)
   else:
@@ -37,11 +52,32 @@ def py_cond(condition, true_fn, false_fn):
 
 
 def run_while(cond_fn, body_fn, init_args):
+  """Type-dependent functional while loop.
+
+  Args:
+    cond_fn: A Python callable implementing the stop conditions of the loop.
+    body_fn: A Python callable implementing the body of the loop.
+    init_args: The initial values of the arguments that will be passed to both
+      cond_fn and body_fn.
+
+  Returns:
+    result: A list of values with the same shape and type as init_args. If any
+    of the init_args, or any variables closed-over in cond_fn are Tensors,
+    tf.while_loop will be used, otherwise a Python while loop will be ran.
+
+  Raises:
+    ValueError: if init_args is not a tuple or list with one or more elements.
+  """
   if not isinstance(init_args, (tuple, list)) or not init_args:
     raise ValueError(
         'init_args must be a non-empty list or tuple, found %s' % init_args)
 
-  if is_tensor(*init_args):
+  # TODO(alexbw): statically determine all active variables in cond_fn,
+  # and pass them directly
+  closure_vars = tuple(
+      [c.cell_contents for c in six.get_function_closure(cond_fn) or []])
+  possibly_tensors = tuple(init_args) + closure_vars
+  if is_tensor(*possibly_tensors):
     return control_flow_ops.while_loop(cond_fn, body_fn, init_args)
   else:
     return py_while_loop(cond_fn, body_fn, init_args)
-- 
GitLab


From 7c2c44d1fee37294ce34a8cfc2d4fb2a2f3f71b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Fri, 9 Feb 2018 00:54:08 +0800
Subject: [PATCH 817/966] py_func convert unicode string results to bytes for
 python2 (#16322)

* TST: add test case

* BUG: allow unicode string for python 2

* DOC: revise doc

* Fix lint error
---
 tensorflow/python/kernel_tests/py_func_test.py | 11 +++++++++++
 tensorflow/python/ops/script_ops.py            |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index c7181497d8..61fb3f12e4 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -212,6 +213,16 @@ class PyFuncTest(test.TestCase):
       value.op.run()
       self.assertAllEqual(np_array, [1.0, 2.0])
 
+  def testReturnUnicodeString(self):
+    with self.test_session():
+      correct = u"你好 世界"
+
+      def unicode_string():
+        return correct
+
+      z, = script_ops.py_func(unicode_string, [], [dtypes.string])
+      self.assertEqual(z.eval(), correct.encode("utf8"))
+
   def testBadNumpyReturnType(self):
     with self.test_session():
 
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index 1b9071ee93..551b3b0ed4 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -97,7 +97,7 @@ class FuncRegistry(object):
     components of a tensor have different lengths.  This is bad: ignoring the
     padding is wrong for text data, and removing the padding is wrong for binary
     data.  To avoid this bug, we redo the conversion using an object dtype.
-    Additionally, we convert unicode strings to (byte-)strings for Python3
+    Additionally, we convert unicode strings to (byte-)strings for
     compatibility.
 
     Args:
@@ -111,7 +111,7 @@ class FuncRegistry(object):
     if result.dtype.char == "S" and result is not value:
       return np.asarray(value, order="C", dtype=object)
     elif result.dtype.char == "U" and result is not value:
-      value = np.vectorize(lambda x: x.encode())(value)
+      value = np.vectorize(lambda x: x.encode("utf8"))(value)
       return np.asarray(value, order="C", dtype=object)
     elif result.dtype.char == "U":
       return result.astype(np.bytes_)
-- 
GitLab


From 576216ffa81a06c91e946e39862bcf33926a5237 Mon Sep 17 00:00:00 2001
From: Scott Tseng <scott.tseng@kikatech.com>
Date: Fri, 9 Feb 2018 00:58:36 +0800
Subject: [PATCH 818/966] Enable some passes for graph_transform on Windows
 (#16121)

* Enable some passes for graph_transform on Windows

Don't know why but the following passes are disabled on Windows:

* quantize_weights
* quantize_nodes
* round_weights

This patch re-enabled them.

* Fix BUILD file in response to format checker
---
 tensorflow/contrib/cmake/tf_tools.cmake |  3 ---
 tensorflow/tools/graph_transforms/BUILD | 11 +++++------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/cmake/tf_tools.cmake b/tensorflow/contrib/cmake/tf_tools.cmake
index cb58a2e7df..58c7df95c8 100644
--- a/tensorflow/contrib/cmake/tf_tools.cmake
+++ b/tensorflow/contrib/cmake/tf_tools.cmake
@@ -48,9 +48,6 @@ file(GLOB_RECURSE tf_tools_transform_graph_lib_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/compare_graphs.cc"
     "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/summarize_graph_main.cc"
     "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/transform_graph_main.cc"
-    "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/quantize_nodes.cc"
-    "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/quantize_weights.cc"
-    "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/round_weights.cc"
 )
 list(REMOVE_ITEM tf_tools_transform_graph_lib_srcs ${tf_tools_transform_graph_lib_exclude_srcs})
 
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index b5465b7fb3..8601b3d0f1 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -99,22 +99,21 @@ cc_library(
         "freeze_requantization_ranges.cc",
         "fuse_convolutions.cc",
         "insert_logging.cc",
-        "remove_ema.cc",
         "obfuscate_names.cc",
+        "quantize_nodes.cc",
+        "quantize_weights.cc",
         "remove_attribute.cc",
         "remove_device.cc",
+        "remove_ema.cc",
         "remove_nodes.cc",
         "rename_attribute.cc",
         "rename_op.cc",
+        "round_weights.cc",
         "set_device.cc",
         "sort_by_execution_order.cc",
         "sparsify_gather.cc",
         "strip_unused_nodes.cc",
-    ] + if_not_windows([
-        "quantize_nodes.cc",
-        "quantize_weights.cc",
-        "round_weights.cc",
-    ]),
+    ],
     hdrs = [
         "fold_constants_lib.h",
     ],
-- 
GitLab


From 07ec52c0ee57f10cd0e261c498a536877e439799 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Fri, 9 Feb 2018 00:59:08 +0800
Subject: [PATCH 819/966] Fix static shape inference for keras.layers.LSTM
 (#15234)

* TST: add test case

* BUG: fix static shape inference

* TST: clean code

* BUG: support dims > 3
---
 tensorflow/python/keras/_impl/keras/backend.py  |  8 ++++++++
 .../python/keras/_impl/keras/backend_test.py    |  9 +++++++++
 .../keras/_impl/keras/layers/lstm_test.py       | 17 +++++++++++++++++
 3 files changed, 34 insertions(+)

diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 098ea063f9..3b8023e938 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -2781,6 +2781,7 @@ def rnn(step_function,
   ndim = len(inputs.get_shape())
   if ndim < 3:
     raise ValueError('Input should be at least 3D.')
+  inputs_shape = inputs.get_shape()
   axes = [1, 0] + list(range(2, ndim))
   inputs = array_ops.transpose(inputs, (axes))
 
@@ -2965,6 +2966,13 @@ def rnn(step_function,
 
   axes = [1, 0] + list(range(2, len(outputs.get_shape())))
   outputs = array_ops.transpose(outputs, axes)
+
+  # Static shape inference: (samples, time, ...)
+  outputs_shape = outputs.get_shape().as_list()
+  outputs_shape[0] = inputs_shape[0]
+  outputs_shape[1] = inputs_shape[1]
+  outputs.set_shape(outputs_shape)
+
   last_output._uses_learning_phase = uses_learning_phase
   return last_output, outputs, new_states
 
diff --git a/tensorflow/python/keras/_impl/keras/backend_test.py b/tensorflow/python/keras/_impl/keras/backend_test.py
index 27833e368d..f29ca49378 100644
--- a/tensorflow/python/keras/_impl/keras/backend_test.py
+++ b/tensorflow/python/keras/_impl/keras/backend_test.py
@@ -915,6 +915,15 @@ class BackendNNOpsTest(test.TestCase):
         last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
                                                              initial_states,
                                                              **kwargs)
+        # check static shape inference
+        self.assertEquals(last_output.get_shape().as_list(),
+                          [num_samples, output_dim])
+        self.assertEquals(outputs.get_shape().as_list(),
+                          [num_samples, timesteps, output_dim])
+        for state in new_states:
+          self.assertEquals(state.get_shape().as_list(),
+                            [num_samples, output_dim])
+
         last_output_list[i].append(keras.backend.eval(last_output))
         outputs_list[i].append(keras.backend.eval(outputs))
         self.assertEqual(len(new_states), 1)
diff --git a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
index 8d359bf17c..deb1d7c0c6 100644
--- a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
@@ -39,6 +39,23 @@ class LSTMLayerTest(test.TestCase):
                   'return_sequences': True},
           input_shape=(num_samples, timesteps, embedding_dim))
 
+  def test_static_shape_inference_LSTM(self):
+    # Github issue: 15165
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+
+    model = keras.models.Sequential()
+    inputs = keras.layers.Dense(embedding_dim,
+                                input_shape=(timesteps, embedding_dim))
+    model.add(inputs)
+    layer = keras.layers.LSTM(units, return_sequences=True)
+    model.add(layer)
+    outputs = model.layers[-1].output
+    self.assertEquals(outputs.get_shape().as_list(),
+                      [None, timesteps, units])
+
   def test_dynamic_behavior_LSTM(self):
     num_samples = 2
     timesteps = 3
-- 
GitLab


From 65e8efa179bc9d75f6fe9a40d2ae1c9ec26e6b47 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 09:03:30 -0800
Subject: [PATCH 820/966] More updates:  * add a verbose flag, useful for
 debugging  * fix the canonicalization of old-style print statement  * fix
 broken py_func wrapper  * expand the conditional statement to return a dummy
 value if we cannot fine any return values, and call without an assignment so
 that the side effect guards catch it instead  * streamline the converter
 tests a bit more  * avoid aliasing "tf" and "self" in the side effect guards 
 * improve the namer to generate shorter names

PiperOrigin-RevId: 185002802
---
 tensorflow/contrib/py2tf/converters/BUILD     |  13 +-
 .../contrib/py2tf/converters/asserts.py       |   2 +-
 .../converters/break_canonicalization_test.py |  53 +++---
 .../py2tf/converters/builtin_functions.py     |  19 +-
 .../converters/builtin_functions_test.py      |  48 +++--
 .../contrib/py2tf/converters/call_trees.py    |  27 ++-
 .../py2tf/converters/call_trees_test.py       |  59 +++---
 .../continue_canonicalization_test.py         |  48 +++--
 .../contrib/py2tf/converters/control_flow.py  | 124 ++++++++-----
 .../py2tf/converters/control_flow_test.py     |  61 +++----
 .../py2tf/converters/converter_test_base.py   |  37 +++-
 .../py2tf/converters/decorators_test.py       |  18 +-
 .../converters/for_canonicalization_test.py   |  20 +--
 .../converters/logical_expressions_test.py    |  18 +-
 .../py2tf/converters/side_effect_guards.py    |   7 +-
 .../converters/side_effect_guards_test.py     | 168 +++++++++---------
 tensorflow/contrib/py2tf/impl/api.py          |  14 +-
 tensorflow/contrib/py2tf/impl/naming.py       |   8 +-
 tensorflow/contrib/py2tf/pyct/BUILD           |  10 ++
 tensorflow/contrib/py2tf/pyct/compiler.py     |   2 +-
 .../contrib/py2tf/pyct/compiler_test.py       |  19 +-
 tensorflow/contrib/py2tf/pyct/qual_names.py   |  15 +-
 .../contrib/py2tf/pyct/qual_names_test.py     | 108 +++++++++++
 tensorflow/contrib/py2tf/pyct/templates.py    |   4 +-
 .../contrib/py2tf/pyct/templates_test.py      |   9 +-
 25 files changed, 561 insertions(+), 350 deletions(-)
 create mode 100644 tensorflow/contrib/py2tf/pyct/qual_names_test.py

diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
index 68ea247778..62de8107a3 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -47,6 +47,7 @@ py_library(
         "//tensorflow/contrib/py2tf/pyct/static_analysis",
         "//tensorflow/contrib/py2tf/utils",
         "@gast_archive//:gast",
+        "@six_archive//:six",
     ],
 )
 
@@ -73,8 +74,8 @@ py_test(
 )
 
 py_test(
-    name = "call_trees_test",
-    srcs = ["call_trees_test.py"],
+    name = "builtin_functions_test",
+    srcs = ["builtin_functions_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
@@ -84,8 +85,8 @@ py_test(
 )
 
 py_test(
-    name = "decorators_test",
-    srcs = ["decorators_test.py"],
+    name = "call_trees_test",
+    srcs = ["call_trees_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
@@ -117,8 +118,8 @@ py_test(
 )
 
 py_test(
-    name = "builtin_functions_test",
-    srcs = ["builtin_functions_test.py"],
+    name = "decorators_test",
+    srcs = ["decorators_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":test_lib",
diff --git a/tensorflow/contrib/py2tf/converters/asserts.py b/tensorflow/contrib/py2tf/converters/asserts.py
index 2d6ee1d098..5b9b8e772b 100644
--- a/tensorflow/contrib/py2tf/converters/asserts.py
+++ b/tensorflow/contrib/py2tf/converters/asserts.py
@@ -35,7 +35,7 @@ class AssertsTransformer(transformer.Base):
     # Note: The lone tf.Assert call will be wrapped with control_dependencies
     # by side_effect_guards.
     template = """
-      tf.Assert(test, [tf.constant(msg)])
+      tf.Assert(test, [msg])
     """
 
     if node.msg is None:
diff --git a/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
index 54c4d99361..2243398100 100644
--- a/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/break_canonicalization_test.py
@@ -19,18 +19,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.py2tf.converters import break_canonicalization
-from tensorflow.contrib.py2tf.converters import control_flow
 from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.python.platform import test
 
 
-class TestNamer(control_flow.SymbolNamer):
-
-  def new_symbol(self, name_root, _):
-    return name_root
-
-
 class BreakCanonicalizationTest(converter_test_base.TestCase):
 
   def test_basic_break(self):
@@ -44,15 +36,15 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v
 
-    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {})
     node = break_canonicalization.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
 
-    self.assertEqual(test_fn(0), result.test_fn(0))
-    self.assertEqual(test_fn(1), result.test_fn(1))
-    self.assertEqual(test_fn(2), result.test_fn(2))
-    self.assertEqual(test_fn(3), result.test_fn(3))
-    self.assertEqual(test_fn(4), result.test_fn(4))
+    with self.compiled(node) as result:
+      self.assertEqual(test_fn(0), result.test_fn(0))
+      self.assertEqual(test_fn(1), result.test_fn(1))
+      self.assertEqual(test_fn(2), result.test_fn(2))
+      self.assertEqual(test_fn(3), result.test_fn(3))
+      self.assertEqual(test_fn(4), result.test_fn(4))
 
   def test_basic_break_for_loop(self):
 
@@ -76,16 +68,17 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v
 
-    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {})
     node = break_canonicalization.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
 
-    # The break is incompletely canonicalized. Everything is in place, but
-    # the loop does not break.
-    self.assertEqual(test_equiv_fn([]), result.test_fn([]))
-    self.assertEqual(test_equiv_fn([1]), result.test_fn([1]))
-    self.assertEqual(test_equiv_fn([2]), result.test_fn([2]))
-    self.assertEqual(test_equiv_fn([1, 2, 3, 4]), result.test_fn([1, 2, 3, 4]))
+    with self.compiled(node) as result:
+      # The break is incompletely canonicalized. Everything is in place, but
+      # the loop does not break.
+      self.assertEqual(test_equiv_fn([]), result.test_fn([]))
+      self.assertEqual(test_equiv_fn([1]), result.test_fn([1]))
+      self.assertEqual(test_equiv_fn([2]), result.test_fn([2]))
+      self.assertEqual(
+          test_equiv_fn([1, 2, 3, 4]), result.test_fn([1, 2, 3, 4]))
 
   def test_continue_deeply_nested(self):
 
@@ -104,15 +97,15 @@ class BreakCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v, u, w
 
-    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {})
     node = break_canonicalization.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
 
-    self.assertEqual(test_fn(0), result.test_fn(0))
-    self.assertEqual(test_fn(1), result.test_fn(1))
-    self.assertEqual(test_fn(2), result.test_fn(2))
-    self.assertEqual(test_fn(3), result.test_fn(3))
-    self.assertEqual(test_fn(4), result.test_fn(4))
+    with self.compiled(node) as result:
+      self.assertEqual(test_fn(0), result.test_fn(0))
+      self.assertEqual(test_fn(1), result.test_fn(1))
+      self.assertEqual(test_fn(2), result.test_fn(2))
+      self.assertEqual(test_fn(3), result.test_fn(3))
+      self.assertEqual(test_fn(4), result.test_fn(4))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/converters/builtin_functions.py b/tensorflow/contrib/py2tf/converters/builtin_functions.py
index 3e56634106..310681dd01 100644
--- a/tensorflow/contrib/py2tf/converters/builtin_functions.py
+++ b/tensorflow/contrib/py2tf/converters/builtin_functions.py
@@ -25,7 +25,18 @@ from tensorflow.contrib.py2tf.pyct import transformer
 
 
 class BuiltinFunctionTransformer(transformer.Base):
-  """Transforms Print nodes to Call so they can be handled as functions."""
+  """Handles builtin functions and canonicalizes old-style print statement.
+
+  This transformer only covers functions that are translated into a
+  TF equivalent, like `len`.
+  Note that the `print` statement is converted to a function call here, but
+  wrapping the print function to a `py_func` is done by `call_trees` as a
+  generic uncompilable function wrap.
+  """
+
+  # TODO(mdan): Handle print entirely in here.
+  # Fully handling print here makes sense especially since we're considering
+  # using tf.Print instead.
 
   def __init__(self, context):
     super(BuiltinFunctionTransformer, self).__init__(context)
@@ -48,10 +59,14 @@ class BuiltinFunctionTransformer(transformer.Base):
 
   def visit_Print(self, node):
     self.generic_visit(node)
+    args = node.values
+    # Following is the case when calling print(a, b)
+    if len(args) == 1 and isinstance(args[0], gast.Tuple):
+      args = args[0].elts
     template = """
       fname(args)
     """
-    return templates.replace(template, fname='print', args=node.values)
+    return templates.replace(template, fname='print', args=args)
 
   # pylint:enable=invalid-name
 
diff --git a/tensorflow/contrib/py2tf/converters/builtin_functions_test.py b/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
index be76066242..983d1ffc03 100644
--- a/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
+++ b/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
@@ -18,11 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import gast
+import sys
+
+import six
 
 from tensorflow.contrib.py2tf.converters import builtin_functions
 from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
@@ -37,13 +38,12 @@ class BuiltinFunctionsTest(converter_test_base.TestCase):
 
     node = self.parse_and_analyze(test_fn, {'len': len})
     node = builtin_functions.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
-    setattr(result, 'tf', array_ops)
 
-    with self.test_session() as sess:
-      self.assertEqual(3,
-                       sess.run(
-                           result.test_fn(constant_op.constant([0, 0, 0]))))
+    with self.compiled(node, array_ops.shape) as result:
+      with self.test_session() as sess:
+        self.assertEqual(3,
+                         sess.run(
+                             result.test_fn(constant_op.constant([0, 0, 0]))))
 
   def test_print(self):
 
@@ -52,10 +52,36 @@ class BuiltinFunctionsTest(converter_test_base.TestCase):
 
     node = self.parse_and_analyze(test_fn, {'print': print})
     node = builtin_functions.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
 
-    result.test_fn('a')
-    self.assertTrue(isinstance(node.body[0].body[0].value, gast.Call))
+    with self.compiled(node) as result:
+      try:
+        out_capturer = six.StringIO()
+        sys.stdout = out_capturer
+        result.test_fn('a')
+        self.assertEqual(out_capturer.getvalue(), 'a\n')
+      finally:
+        sys.stdout = sys.__stdout__
+
+  def test_print_tuple(self):
+
+    def test_fn(a, b, c):
+      print(a, b, c)
+
+    node = self.parse_and_analyze(test_fn, {'print': print})
+    node = builtin_functions.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      try:
+        out_capturer = six.StringIO()
+        sys.stdout = out_capturer
+        result.test_fn('a', 1, [2, 3])
+        # It appears that the print output looks odd only under Python 2.
+        if six.PY2:
+          self.assertEqual(out_capturer.getvalue(), "('a', 1, [2, 3])\n")
+        else:
+          self.assertEqual(out_capturer.getvalue(), 'a 1 [2, 3]\n')
+      finally:
+        sys.stdout = sys.__stdout__
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/converters/call_trees.py b/tensorflow/contrib/py2tf/converters/call_trees.py
index 834baf258d..60096d5a7b 100644
--- a/tensorflow/contrib/py2tf/converters/call_trees.py
+++ b/tensorflow/contrib/py2tf/converters/call_trees.py
@@ -28,6 +28,7 @@ import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
 from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import qual_names
 from tensorflow.contrib.py2tf.pyct import templates
 from tensorflow.contrib.py2tf.pyct import transformer
 from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
@@ -197,21 +198,33 @@ class CallTreeTransformer(transformer.Base):
     return node
 
   def _wrap_to_py_func_no_return(self, node):
+    func_qn = anno.getanno(node.func, anno.Basic.QN)
     args_scope = anno.getanno(node, NodeAnno.ARGS_SCOPE)
+    wrapper_name = self.context.namer.new_symbol(func_qn.ssf(),
+                                                 args_scope.referenced)
+    wrapper_args = []
+    for arg in node.args:
+      if anno.hasanno(arg, anno.Basic.QN):
+        arg_qn = anno.getanno(arg, anno.Basic.QN)
+      else:
+        arg_qn = qual_names.QN('arg')
+      wrapper_args.append(
+          self.context.namer.new_symbol(arg_qn.ssf(), args_scope.referenced))
     # TODO(mdan): Properly handle varargs, kwargs, etc.
+    # TODO(mdan): This is best handled as a dynamic dispatch.
+    # That way we can separate tensors from non-tensor args.
     template = """
-      def wrapper(args):
-        call(args)
+      def wrapper(wrapper_args):
+        call(wrapper_args)
         return 1
-      tf.py_func(wrapper, [args], [tf.int64])
+      tf.py_func(wrapper, original_args, [tf.int64])
     """
     wrapper_def, call_expr = templates.replace(
         template,
         call=node.func,
-        wrapper=self.context.namer.compiled_function_name(node.func.id)[0],
-        args=tuple(args_scope.used))
-    anno.setanno(call_expr.value, NodeAnno.ARGS_SCOPE, args_scope)
-    # TODO(mdan): Rename this annotation to 'graph_ready'
+        wrapper=wrapper_name,
+        original_args=gast.List(elts=node.args, ctx=None),
+        wrapper_args=wrapper_args)
     anno.setanno(wrapper_def, anno.Basic.SKIP_PROCESSING, True)
 
     return (wrapper_def, call_expr)
diff --git a/tensorflow/contrib/py2tf/converters/call_trees_test.py b/tensorflow/contrib/py2tf/converters/call_trees_test.py
index e63c10de0f..18a5c1e6e3 100644
--- a/tensorflow/contrib/py2tf/converters/call_trees_test.py
+++ b/tensorflow/contrib/py2tf/converters/call_trees_test.py
@@ -20,23 +20,11 @@ from __future__ import print_function
 
 from tensorflow.contrib.py2tf.converters import call_trees
 from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class TestNamer(call_trees.FunctionNamer):
-
-  def compiled_function_name(self,
-                             original_fqn,
-                             live_entity=None,
-                             owner_type=None):
-    if owner_type is not None:
-      return None, False
-    return ('renamed_%s' % '_'.join(original_fqn)), True
-
-
 class CallTreesTest(converter_test_base.TestCase):
 
   def test_basic(self):
@@ -50,14 +38,14 @@ class CallTreesTest(converter_test_base.TestCase):
     def test_fn_2(a):
       return test_fn_1(a) + 1
 
-    node = self.parse_and_analyze(
-        test_fn_2, {'test_fn_1': test_fn_1}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn_2, {'test_fn_1': test_fn_1})
     node = call_trees.transform(node, self.ctx, (), ())
-    result = compiler.ast_to_object(node)
-    # Only test_fn_2 is transformed, so we'll insert renamed_test_fn_1 manually.
-    setattr(result, 'renamed_test_fn_1', renamed_test_fn_1)
 
-    self.assertEquals(3, result.test_fn_2(1))
+    with self.compiled(node) as result:
+      # Only test_fn_2 is transformed, so we'll insert renamed_test_fn_1
+      # manually.
+      result.renamed_test_fn_1 = renamed_test_fn_1
+      self.assertEquals(3, result.test_fn_2(1))
 
   def test_simple_methods(self):
 
@@ -71,13 +59,12 @@ class CallTreesTest(converter_test_base.TestCase):
 
     node = self.parse_and_analyze(
         TestClass.test_fn_2, {'TestClass': TestClass},
-        namer=TestNamer(),
         arg_types={'self': (TestClass.__name__, TestClass)})
     node = call_trees.transform(node, self.ctx, (), ())
-    result = compiler.ast_to_object(node)
 
-    tc = TestClass()
-    self.assertEquals(3, result.test_fn_2(tc, 1))
+    with self.compiled(node) as result:
+      tc = TestClass()
+      self.assertEquals(3, result.test_fn_2(tc, 1))
 
   def test_uncompiled_modules(self):
 
@@ -86,26 +73,22 @@ class CallTreesTest(converter_test_base.TestCase):
       a = math_ops.add(a, constant_op.constant(1))
       return a
 
-    node = self.parse_and_analyze(
-        test_fn, {
-            'math_ops': math_ops,
-            'constant_op': constant_op
-        },
-        namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {
+        'math_ops': math_ops,
+        'constant_op': constant_op
+    })
     node = call_trees.transform(node, self.ctx,
                                 set(((math_ops.__name__,),
                                      (constant_op.__name__,))), ())
-    result = compiler.ast_to_object(node)
-    setattr(result, 'math_ops', math_ops)
-    setattr(result, 'constant_op', constant_op)
-
-    with self.test_session() as sess:
-      # Not renamed, because the converter doesn't rename the definition itself.
-      # (the caller is responsible for that).
-      result_tensor = result.test_fn(constant_op.constant(1))
-      result_val = sess.run(result_tensor)
 
-    self.assertEquals(3, result_val)
+    with self.compiled(node) as result:
+      result.math_ops = math_ops
+      result.constant_op = constant_op
+      with self.test_session() as sess:
+        # Not renamed, because the converter doesn't rename the definition
+        # itself (the caller is responsible for that).
+        result_tensor = result.test_fn(constant_op.constant(1))
+        self.assertEquals(3, sess.run(result_tensor))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
index 4b18819559..2a0fb2d88b 100644
--- a/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/continue_canonicalization_test.py
@@ -19,18 +19,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.py2tf.converters import continue_canonicalization
-from tensorflow.contrib.py2tf.converters import control_flow
 from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.python.platform import test
 
 
-class TestNamer(control_flow.SymbolNamer):
-
-  def new_symbol(self, name_root, _):
-    return name_root
-
-
 class ContinueCanonicalizationTest(converter_test_base.TestCase):
 
   def test_basic_continue(self):
@@ -44,15 +36,15 @@ class ContinueCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v
 
-    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {})
     node = continue_canonicalization.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
 
-    self.assertEqual(test_fn(0), result.test_fn(0))
-    self.assertEqual(test_fn(1), result.test_fn(1))
-    self.assertEqual(test_fn(2), result.test_fn(2))
-    self.assertEqual(test_fn(3), result.test_fn(3))
-    self.assertEqual(test_fn(4), result.test_fn(4))
+    with self.compiled(node) as result:
+      self.assertEqual(test_fn(0), result.test_fn(0))
+      self.assertEqual(test_fn(1), result.test_fn(1))
+      self.assertEqual(test_fn(2), result.test_fn(2))
+      self.assertEqual(test_fn(3), result.test_fn(3))
+      self.assertEqual(test_fn(4), result.test_fn(4))
 
   def test_basic_continue_for_loop(self):
 
@@ -65,14 +57,14 @@ class ContinueCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v
 
-    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {})
     node = continue_canonicalization.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
 
-    self.assertEqual(test_fn([]), result.test_fn([]))
-    self.assertEqual(test_fn([1]), result.test_fn([1]))
-    self.assertEqual(test_fn([2]), result.test_fn([2]))
-    self.assertEqual(test_fn([1, 2, 3]), result.test_fn([1, 2, 3]))
+    with self.compiled(node) as result:
+      self.assertEqual(test_fn([]), result.test_fn([]))
+      self.assertEqual(test_fn([1]), result.test_fn([1]))
+      self.assertEqual(test_fn([2]), result.test_fn([2]))
+      self.assertEqual(test_fn([1, 2, 3]), result.test_fn([1, 2, 3]))
 
   def test_continue_deeply_nested(self):
 
@@ -91,15 +83,15 @@ class ContinueCanonicalizationTest(converter_test_base.TestCase):
         v.append(x)
       return v, u, w
 
-    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {})
     node = continue_canonicalization.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
 
-    self.assertEqual(test_fn(0), result.test_fn(0))
-    self.assertEqual(test_fn(1), result.test_fn(1))
-    self.assertEqual(test_fn(2), result.test_fn(2))
-    self.assertEqual(test_fn(3), result.test_fn(3))
-    self.assertEqual(test_fn(4), result.test_fn(4))
+    with self.compiled(node) as result:
+      self.assertEqual(test_fn(0), result.test_fn(0))
+      self.assertEqual(test_fn(1), result.test_fn(1))
+      self.assertEqual(test_fn(2), result.test_fn(2))
+      self.assertEqual(test_fn(3), result.test_fn(3))
+      self.assertEqual(test_fn(4), result.test_fn(4))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/converters/control_flow.py b/tensorflow/contrib/py2tf/converters/control_flow.py
index 46316919c8..d53e3e4fd6 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow.py
+++ b/tensorflow/contrib/py2tf/converters/control_flow.py
@@ -54,6 +54,49 @@ class ControlFlowTransformer(transformer.Base):
   def visit_For(self, node):
     assert False, 'for statement should have been canonicalized at this point'
 
+  def _create_cond_branch(self, body_name, aliased_orig_names,
+                          aliased_new_names, body, returns):
+    if aliased_orig_names:
+      template = """
+        def body_name():
+          aliased_new_names, = aliased_orig_names,
+          body
+          return (returns,)
+      """
+      return templates.replace(
+          template,
+          body_name=body_name,
+          body=body,
+          aliased_orig_names=aliased_orig_names,
+          aliased_new_names=aliased_new_names,
+          returns=returns)
+    else:
+      template = """
+        def body_name():
+          body
+          return (returns,)
+      """
+      return templates.replace(
+          template, body_name=body_name, body=body, returns=returns)
+
+  def _create_cond_expr(self, results, test, body_name, orelse_name):
+    if results is not None:
+      template = """
+        results = py2tf_utils.run_cond(test, body_name, orelse_name)
+      """
+      return templates.replace(
+          template,
+          test=test,
+          results=results,
+          body_name=body_name,
+          orelse_name=orelse_name)
+    else:
+      template = """
+        py2tf_utils.run_cond(test, body_name, orelse_name)
+      """
+      return templates.replace(
+          template, test=test, body_name=body_name, orelse_name=orelse_name)
+
   def visit_If(self, node):
     self.generic_visit(node)
 
@@ -67,7 +110,7 @@ class ControlFlowTransformer(transformer.Base):
       raise ValueError(
           'The else branch creates new symbols that the if branch does not.')
 
-    all_modified = tuple(body_scope.modified | orelse_scope.modified)
+    modified = tuple(body_scope.modified | orelse_scope.modified)
     all_referenced = body_scope.referenced | orelse_scope.referenced
 
     # Alias the closure variables inside the conditional functions
@@ -84,56 +127,41 @@ class ControlFlowTransformer(transformer.Base):
     node_body = ast_util.rename_symbols(node.body, alias_map)
     node_orelse = ast_util.rename_symbols(node.orelse, alias_map)
 
-    if len(all_modified) == 1:
-      results = all_modified[0]
+    if not modified:
+      # When the cond would return no value, we leave the cond called without
+      # results. That in turn should trigger the side effect guards. The
+      # branch functions will return a dummy value that ensures cond
+      # actually has some return value as well.
+      results = None
+    elif len(modified) == 1:
+      results = modified[0]
     else:
-      results = gast.Tuple([s.ast() for s in all_modified], None)
+      results = gast.Tuple([s.ast() for s in modified], None)
 
-    if aliased_orig_names:
-      template = """
-        def body_name():
-          aliased_new_names, = aliased_orig_names,
-          body
-          return (all_results,)
-        def orelse_name():
-          aliased_new_names, = aliased_orig_names,
-          orelse
-          return (all_results,)
-        results = py2tf_utils.run_cond(test, body_name, orelse_name)
-      """
-      body_name = self.context.namer.new_symbol('if_true', all_referenced)
-      return templates.replace(
-          template,
-          test=node.test,
-          body_name=body_name,
-          body=node_body,
-          orelse_name=self.context.namer.new_symbol('if_false', all_referenced),
-          orelse=node_orelse,
-          aliased_orig_names=tuple(aliased_orig_names),
-          aliased_new_names=tuple(aliased_new_names),
-          all_results=tuple(alias_map[s] if s in aliased_orig_names else s
-                            for s in all_modified),
-          results=results)
+    body_name = self.context.namer.new_symbol('if_true', all_referenced)
+    orelse_name = self.context.namer.new_symbol('if_false', all_referenced)
+    if modified:
+      body_returns = tuple(
+          alias_map[s] if s in aliased_orig_names else s for s in modified)
     else:
-      template = """
-        def body_name():
-          body
-          return (all_results,)
-        def orelse_name():
-          orelse
-          return (all_results,)
-        results = py2tf_utils.run_cond(test, body_name, orelse_name)
-      """
-      body_name = self.context.namer.new_symbol('if_true', all_referenced)
-      return templates.replace(
-          template,
-          test=node.test,
-          body_name=body_name,
-          body=node_body,
-          orelse_name=self.context.namer.new_symbol('if_false', all_referenced),
-          orelse=node_orelse,
-          all_results=tuple(s for s in all_modified),
-          results=results)
+      body_returns = templates.replace('tf.ones(())')[0].value
+
+    body_def = self._create_cond_branch(
+        body_name,
+        aliased_orig_names=tuple(aliased_orig_names),
+        aliased_new_names=tuple(aliased_new_names),
+        body=node_body,
+        returns=body_returns)
+    orelse_def = self._create_cond_branch(
+        orelse_name,
+        aliased_orig_names=tuple(aliased_orig_names),
+        aliased_new_names=tuple(aliased_new_names),
+        body=node_orelse,
+        returns=body_returns)
+    cond_expr = self._create_cond_expr(results, node.test, body_name,
+                                       orelse_name)
+
+    return body_def + orelse_def + cond_expr
 
   def visit_While(self, node):
     self.generic_visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/control_flow_test.py b/tensorflow/contrib/py2tf/converters/control_flow_test.py
index 677d60e0af..b785b284a7 100644
--- a/tensorflow/contrib/py2tf/converters/control_flow_test.py
+++ b/tensorflow/contrib/py2tf/converters/control_flow_test.py
@@ -18,26 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import utils
 from tensorflow.contrib.py2tf.converters import control_flow
 from tensorflow.contrib.py2tf.converters import converter_test_base
-from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
 
 
-class TestNamer(control_flow.SymbolNamer):
-
-  def new_symbol(self, name_root, used):
-    i = 0
-    while True:
-      name = '%s%d' % (name_root, i)
-      if name not in used:
-        return name
-      i += 1
-
-
 class ControlFlowTest(converter_test_base.TestCase):
 
   def test_simple_while(self):
@@ -50,15 +37,13 @@ class ControlFlowTest(converter_test_base.TestCase):
         i += 1
       return s, i, n
 
-    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
-    setattr(result, 'tf', control_flow_ops)
-    setattr(result, 'py2tf_utils', utils)
 
-    with self.test_session() as sess:
-      self.assertEqual((10, 5, 5),
-                       sess.run(result.test_fn(constant_op.constant(5))))
+    with self.compiled(node, control_flow_ops.while_loop) as result:
+      with self.test_session() as sess:
+        self.assertEqual((10, 5, 5),
+                         sess.run(result.test_fn(constant_op.constant(5))))
 
   def test_while_single_var(self):
 
@@ -67,14 +52,12 @@ class ControlFlowTest(converter_test_base.TestCase):
         n -= 1
       return n
 
-    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
-    setattr(result, 'tf', control_flow_ops)
-    setattr(result, 'py2tf_utils', utils)
 
-    with self.test_session() as sess:
-      self.assertEqual(0, sess.run(result.test_fn(constant_op.constant(5))))
+    with self.compiled(node, control_flow_ops.while_loop) as result:
+      with self.test_session() as sess:
+        self.assertEqual(0, sess.run(result.test_fn(constant_op.constant(5))))
 
   def test_simple_if(self):
 
@@ -87,17 +70,15 @@ class ControlFlowTest(converter_test_base.TestCase):
         b = 2 * n
       return a, b
 
-    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
-    setattr(result, 'tf', control_flow_ops)
-    setattr(result, 'py2tf_utils', utils)
 
-    with self.test_session() as sess:
-      self.assertEqual((-1, 0), sess.run(
-          result.test_fn(constant_op.constant(1))))
-      self.assertEqual((0, -2),
-                       sess.run(result.test_fn(constant_op.constant(-1))))
+    with self.compiled(node, control_flow_ops.cond) as result:
+      with self.test_session() as sess:
+        self.assertEqual((-1, 0),
+                         sess.run(result.test_fn(constant_op.constant(1))))
+        self.assertEqual((0, -2),
+                         sess.run(result.test_fn(constant_op.constant(-1))))
 
   def test_if_single_var(self):
 
@@ -106,14 +87,12 @@ class ControlFlowTest(converter_test_base.TestCase):
         n = -n
       return n
 
-    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
-    setattr(result, 'tf', control_flow_ops)
-    setattr(result, 'py2tf_utils', utils)
 
-    with self.test_session() as sess:
-      self.assertEqual(-1, sess.run(result.test_fn(constant_op.constant(1))))
+    with self.compiled(node, control_flow_ops.cond) as result:
+      with self.test_session() as sess:
+        self.assertEqual(-1, sess.run(result.test_fn(constant_op.constant(1))))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/converters/converter_test_base.py b/tensorflow/contrib/py2tf/converters/converter_test_base.py
index 5b23db33e1..67747183dd 100644
--- a/tensorflow/contrib/py2tf/converters/converter_test_base.py
+++ b/tensorflow/contrib/py2tf/converters/converter_test_base.py
@@ -18,8 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
 import imp
 
+from tensorflow.contrib.py2tf import utils
+from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.contrib.py2tf.pyct import context
 from tensorflow.contrib.py2tf.pyct import parser
 from tensorflow.contrib.py2tf.pyct import qual_names
@@ -29,9 +32,41 @@ from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
 from tensorflow.python.platform import test
 
 
+class FakeNamer(object):
+
+  def new_symbol(self, name_root, used):
+    i = 0
+    while True:
+      name = '%s%d' % (name_root, i)
+      if name not in used:
+        return name
+      i += 1
+
+  def compiled_function_name(self,
+                             original_fqn,
+                             live_entity=None,
+                             owner_type=None):
+    del live_entity
+    if owner_type is not None:
+      return None, False
+    return ('renamed_%s' % '_'.join(original_fqn)), True
+
+
 class TestCase(test.TestCase):
   """Base class for unit tests in this module. Contains relevant utilities."""
 
+  @contextlib.contextmanager
+  def compiled(self, node, *symbols):
+    source = '<compile failed>'
+    try:
+      result, source = compiler.ast_to_object(node)
+      result.tf = self.make_fake_tf(*symbols)
+      result.py2tf_utils = utils
+      yield result
+    except Exception:  # pylint:disable=broad-except
+      print('Offending compiled code:\n%s' % source)
+      raise
+
   def make_fake_tf(self, *symbols):
     fake_tf = imp.new_module('fake_tf')
     for s in symbols:
@@ -51,7 +86,7 @@ class TestCase(test.TestCase):
                         recursive=True):
     node, source = parser.parse_entity(test_fn)
     ctx = context.EntityContext(
-        namer=namer,
+        namer=namer or FakeNamer(),
         source_code=source,
         source_file=None,
         namespace=namespace,
diff --git a/tensorflow/contrib/py2tf/converters/decorators_test.py b/tensorflow/contrib/py2tf/converters/decorators_test.py
index f50d593043..402fa0dda2 100644
--- a/tensorflow/contrib/py2tf/converters/decorators_test.py
+++ b/tensorflow/contrib/py2tf/converters/decorators_test.py
@@ -53,14 +53,17 @@ class DecoratorsTest(converter_test_base.TestCase):
     node = node.body[0].body[0]
 
     node = decorators.transform(node, remove_decorators=())
-    result = compiler.ast_to_object(
+    # Since the decorator is not removed, we need to include its source
+    # code. We cannot do it after the fact because decorators are executed
+    # on load.
+    result, _ = compiler.ast_to_object(
         node,
         source_prefix=textwrap.dedent(tf_inspect.getsource(function_decorator)))
     self.assertEqual(2, result.test_fn(1))
 
     node = decorators.transform(node, remove_decorators=(function_decorator,))
-    result = compiler.ast_to_object(node)
-    self.assertEqual(1, result.test_fn(1))
+    with self.compiled(node) as result:
+      self.assertEqual(1, result.test_fn(1))
 
   def test_simple_decorator(self):
 
@@ -82,14 +85,17 @@ class DecoratorsTest(converter_test_base.TestCase):
     node = node.body[0].body[0]
 
     node = decorators.transform(node, remove_decorators=())
-    result = compiler.ast_to_object(
+    # Since the decorator is not removed, we need to include its source
+    # code. We cannot do it after the fact because decorators are executed
+    # on load.
+    result, _ = compiler.ast_to_object(
         node,
         source_prefix=textwrap.dedent(tf_inspect.getsource(simple_decorator)))
     self.assertEqual(2, result.test_fn(1))
 
     node = decorators.transform(node, remove_decorators=(simple_decorator,))
-    result = compiler.ast_to_object(node)
-    self.assertEqual(1, result.test_fn(1))
+    with self.compiled(node) as result:
+      self.assertEqual(1, result.test_fn(1))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py b/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
index 142bd4aea1..910c4dcc00 100644
--- a/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
+++ b/tensorflow/contrib/py2tf/converters/for_canonicalization_test.py
@@ -18,19 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf.converters import control_flow
 from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.converters import for_canonicalization
-from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.python.platform import test
 
 
-class TestNamer(control_flow.SymbolNamer):
-
-  def new_symbol(self, name_root, _):
-    return name_root
-
-
 class ControlFlowTest(converter_test_base.TestCase):
 
   def test_basic_for(self):
@@ -41,14 +33,14 @@ class ControlFlowTest(converter_test_base.TestCase):
         s += e
       return s
 
-    node = self.parse_and_analyze(test_fn, {}, namer=TestNamer())
+    node = self.parse_and_analyze(test_fn, {})
     node = for_canonicalization.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
 
-    l = [1, 2, 3]
-    self.assertEqual(test_fn(l), result.test_fn(l))
-    l = []
-    self.assertEqual(test_fn(l), result.test_fn(l))
+    with self.compiled(node) as result:
+      l = [1, 2, 3]
+      self.assertEqual(test_fn(l), result.test_fn(l))
+      l = []
+      self.assertEqual(test_fn(l), result.test_fn(l))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/converters/logical_expressions_test.py b/tensorflow/contrib/py2tf/converters/logical_expressions_test.py
index d711065099..a28326c517 100644
--- a/tensorflow/contrib/py2tf/converters/logical_expressions_test.py
+++ b/tensorflow/contrib/py2tf/converters/logical_expressions_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.converters import logical_expressions
-from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -34,12 +33,11 @@ class GradientsFunctionTest(converter_test_base.TestCase):
 
     node = self.parse_and_analyze(test_fn, {})
     node = logical_expressions.transform(node)
-    result = compiler.ast_to_object(node)
-    setattr(result, 'tf', math_ops)
 
-    with self.test_session() as sess:
-      self.assertTrue(sess.run(result.test_fn(1, 1)))
-      self.assertFalse(sess.run(result.test_fn(1, 2)))
+    with self.compiled(node, math_ops.equal) as result:
+      with self.test_session() as sess:
+        self.assertTrue(sess.run(result.test_fn(1, 1)))
+        self.assertFalse(sess.run(result.test_fn(1, 2)))
 
   def test_bool_ops(self):
 
@@ -48,11 +46,11 @@ class GradientsFunctionTest(converter_test_base.TestCase):
 
     node = self.parse_and_analyze(test_fn, {})
     node = logical_expressions.transform(node)
-    result = compiler.ast_to_object(node)
-    setattr(result, 'tf', math_ops)
 
-    with self.test_session() as sess:
-      self.assertTrue(sess.run(result.test_fn(True, False, True)))
+    with self.compiled(node, math_ops.logical_or,
+                       math_ops.logical_and) as result:
+      with self.test_session() as sess:
+        self.assertTrue(sess.run(result.test_fn(True, False, True)))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
index 948cb96c3f..5895dc4954 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
@@ -128,8 +128,11 @@ class SideEffectGuardTransformer(transformer.Base):
       # _visit_and_reindent.
       args_scope = anno.getanno(node.value, NodeAnno.ARGS_SCOPE)
       # NOTE: We can't guard object attributes because they may not be writable.
-      guarded_args = tuple(
-          s for s in args_scope.used if not s.is_composite())
+      # In addition, avoid renaming well-known names.
+      # TODO(mdan): Move these names into config.
+      unguarded_names = (qual_names.QN('self'), qual_names.QN('tf'))
+      guarded_args = tuple(s for s in args_scope.used
+                           if not s.is_composite() and s not in unguarded_names)
 
       # TODO(mdan): Include all arguments which depended on guarded_args too.
       # For example, the following will still cause a race:
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
index 409c8b02c5..b41c6fa5b9 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
@@ -18,153 +18,155 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.py2tf import utils
 from tensorflow.contrib.py2tf.converters import converter_test_base
 from tensorflow.contrib.py2tf.converters import side_effect_guards
-from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-class TestNamer(side_effect_guards.SymbolNamer):
-
-  def new_symbol(self, name_root, _):
-    return 'renamed_%s' % name_root
-
-
 class SideEffectGuardsTest(converter_test_base.TestCase):
 
-  def _transform_and_compile(self, test_fn):
-    ns = {
-        'control_flow_ops': control_flow_ops,
-        'constant_op': constant_op,
-        'gen_math_ops': gen_math_ops,
-        'ops': ops,
-        'state_ops': state_ops,
-    }
-    node = self.parse_and_analyze(
-        test_fn, ns,
-        namer=TestNamer())
-    node = side_effect_guards.transform(node, self.ctx)
-    result = compiler.ast_to_object(node)
-    self.attach_namespace(result, **ns)
-    result.tf = self.make_fake_tf(array_ops.identity, control_flow_ops.Assert,
-                                  gen_math_ops.greater,
-                                  ops.control_dependencies, ops.Tensor)
-    result.py2tf_utils = utils
-    return result.test_fn, node
-
   def test_side_effect_on_return_only_variable(self):
 
+    tf = None
+
     def test_fn(a):
-      state_ops.assign(a, a + 1)
+      tf.assign(a, a + 1)
       return a
 
-    tf_test_fn, node = self._transform_and_compile(test_fn)
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
 
-    self.assertEqual(len(node.body[0].body), 1)
-    with self.test_session() as sess:
-      v = variables.Variable(2)
-      sess.run(v.initializer)
-      # NOTE: We don't expect the assignment to execute in this case, because
-      # variables cannot be reliably guarded.
-      self.assertEqual(2, sess.run(tf_test_fn(v)))
+    with self.compiled(node, state_ops.assign, ops.control_dependencies,
+                       ops.Tensor) as result:
+      self.assertEqual(len(node.body[0].body), 1)
+      with self.test_session() as sess:
+        v = variables.Variable(2)
+        sess.run(v.initializer)
+        # NOTE: We don't expect the assignment to execute in this case, because
+        # variables cannot be reliably guarded.
+        self.assertEqual(2, sess.run(result.test_fn(v)))
 
   def test_side_effect_on_used_variable(self):
 
+    tf = None
+
     def test_fn(a):
-      state_ops.assign(a, a + 1)
+      tf.assign(a, a + 1)
       return a + 1
 
-    tf_test_fn, node = self._transform_and_compile(test_fn)
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
 
-    self.assertEqual(len(node.body[0].body), 1)
-    with self.test_session() as sess:
-      v = variables.Variable(2)
-      sess.run(v.initializer)
-      # NOTE: Unlike test_side_effect_on_return_only_variable, the variable was
-      # used in the local scope and so we could catch the assign's side effect.
-      self.assertEqual(4, sess.run(tf_test_fn(v)))
+    with self.compiled(node, state_ops.assign, ops.control_dependencies,
+                       ops.Tensor) as result:
+      self.assertEqual(len(node.body[0].body), 1)
+      with self.test_session() as sess:
+        v = variables.Variable(2)
+        sess.run(v.initializer)
+        # NOTE: Unlike test_side_effect_on_return_only_variable, the variable
+        # was used in the local scope and so we could catch the assign's side
+        # effect.
+        self.assertEqual(4, sess.run(result.test_fn(v)))
 
   def test_side_effect_on_tensor(self):
 
+    tf = None
+
     def test_fn(a):
-      control_flow_ops.Assert(gen_math_ops.greater(a, 0), ['expected in throw'])
+      tf.Assert(a > 0, ['expected in throw'])
       return a
 
-    tf_test_fn, node = self._transform_and_compile(test_fn)
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
 
-    self.assertEqual(len(node.body[0].body), 1)
-    with self.test_session() as sess:
-      # NOTE: In this case we can also capture the side effect because the
-      # argument is a tensor ans we can wrap it inside an identity.
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   'expected in throw'):
-        sess.run(tf_test_fn(constant_op.constant(-1)))
+    with self.compiled(node, array_ops.identity, control_flow_ops.Assert,
+                       ops.control_dependencies, ops.Tensor) as result:
+      self.assertEqual(len(node.body[0].body), 1)
+      with self.test_session() as sess:
+        # NOTE: In this case we can also capture the side effect because the
+        # argument is a tensor ans we can wrap it inside an identity.
+        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                     'expected in throw'):
+          sess.run(result.test_fn(constant_op.constant(-1)))
 
   def test_multiline_block(self):
 
+    tf = None
+
     def test_fn(a):
-      state_ops.assign(a, a + 1)
+      tf.assign(a, a + 1)
       b = a + 1
-      state_ops.assign(a, b + 1)
+      tf.assign(a, b + 1)
       c = b + 1
       d = c + 1
       return d
 
-    tf_test_fn, node = self._transform_and_compile(test_fn)
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
 
-    self.assertEqual(len(node.body[0].body), 1)
-    with self.test_session() as sess:
-      v = variables.Variable(2)
-      sess.run(v.initializer)
-      self.assertEqual(6, sess.run(tf_test_fn(v)))
+    with self.compiled(node, array_ops.identity, state_ops.assign,
+                       ops.control_dependencies, ops.Tensor) as result:
+      self.assertEqual(len(node.body[0].body), 1)
+      with self.test_session() as sess:
+        v = variables.Variable(2)
+        sess.run(v.initializer)
+        self.assertEqual(6, sess.run(result.test_fn(v)))
 
   def test_multiline_nested_block(self):
 
+    tf = None
+
     def test_fn(a):
-      with ops.name_scope('foo'):
-        state_ops.assign(a, a + 1)
+      with tf.name_scope('foo'):
+        tf.assign(a, a + 1)
         b = a + 1
-        # state_ops.assign(a, b + 1)
         c = b + 1
         d = c + 1
       return d
 
-    tf_test_fn, node = self._transform_and_compile(test_fn)
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
 
-    self.assertEqual(len(node.body[0].body[0].body), 1)
-    with self.test_session() as sess:
-      v = variables.Variable(2)
-      sess.run(v.initializer)
-      self.assertEqual(6, sess.run(tf_test_fn(v)))
+    with self.compiled(node, array_ops.identity, state_ops.assign,
+                       ops.control_dependencies, ops.name_scope,
+                       ops.Tensor) as result:
+      self.assertEqual(len(node.body[0].body[0].body), 1)
+      with self.test_session() as sess:
+        v = variables.Variable(2)
+        sess.run(v.initializer)
+        self.assertEqual(6, sess.run(result.test_fn(v)))
 
   def test_multiline_block_unsafe(self):
 
+    tf = None
+
     def test_fn(a):
-      state_ops.assign(a, a + 1)
+      tf.assign(a, a + 1)
       b = a + 1
-      state_ops.assign(a, a + 1)
+      tf.assign(a, a + 1)
       c = b + 1
       d = c + 1
       return d
 
-    tf_test_fn, node = self._transform_and_compile(test_fn)
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
 
-    self.assertEqual(len(node.body[0].body), 1)
-    with self.test_session() as sess:
-      v = variables.Variable(2)
-      sess.run(v.initializer)
-      # NOTE: This intentionally highlights the flakiness. The test should be
-      # tightened down once that is solved.
-      self.assertTrue(sess.run(tf_test_fn(v)) in (6, 7))
+    with self.compiled(node, array_ops.identity, state_ops.assign,
+                       ops.control_dependencies, ops.Tensor) as result:
+      self.assertEqual(len(node.body[0].body), 1)
+      with self.test_session() as sess:
+        v = variables.Variable(2)
+        sess.run(v.initializer)
+        # NOTE: This intentionally highlights the flakiness. The test should be
+        # tightened down once that is solved.
+        self.assertTrue(sess.run(result.test_fn(v)) in (6, 7))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/py2tf/impl/api.py b/tensorflow/contrib/py2tf/impl/api.py
index 85d40f3158..8ae1c70169 100644
--- a/tensorflow/contrib/py2tf/impl/api.py
+++ b/tensorflow/contrib/py2tf/impl/api.py
@@ -27,6 +27,7 @@ from tensorflow.contrib.py2tf.impl import config
 from tensorflow.contrib.py2tf.impl import conversion
 from tensorflow.contrib.py2tf.pyct import compiler
 from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_inspect
 
 # TODO(mdan): Properly document the type hints.
@@ -83,7 +84,7 @@ def convert_inline(f, *args, **kwargs):
   return convert(arg_value_hints)(f)(*args, **kwargs)
 
 
-def convert(recursive=False, arg_types=None):
+def convert(recursive=False, verbose=False, arg_types=None):
   """Decorator that compiles a function to graph mode.
 
   The decorator is dynamic - invoking compilation whenever the decorated
@@ -92,6 +93,7 @@ def convert(recursive=False, arg_types=None):
   Args:
     recursive: Whether to recusrively convert any functions that the decorator
         function may call.
+    verbose: Whether to output the compiled code in the logs.
     arg_types: See to_graph.
 
   Returns:
@@ -125,6 +127,7 @@ def convert(recursive=False, arg_types=None):
       wrapped = to_graph(
           f,
           recursive=recursive,
+          verbose=verbose,
           arg_values=arg_values,
           arg_types=arg_types,
           partial_types=partial_types)
@@ -140,6 +143,7 @@ def convert(recursive=False, arg_types=None):
 
 def to_graph(e,
              recursive=True,
+             verbose=False,
              arg_values=None,
              arg_types=None,
              partial_types=None):
@@ -155,6 +159,7 @@ def to_graph(e,
     e: A Python entity.
     recursive: Whether to recusrively convert any functions that the decorator
         function may call.
+    verbose: Whether to output the compiled code in the logs.
     arg_values: A dict containing value hints for symbols like function
         parameters.
     arg_types: A dict containing type hints for symbols like function
@@ -178,14 +183,17 @@ def to_graph(e,
     module.body.append(parser.parse_str(import_line))
   for dep in conversion_map.dependency_cache.values():
     module.body.append(dep)
-  compiled_node = compiler.ast_to_object(module)
+  compiled_node, compiled_src = compiler.ast_to_object(module)
 
   # The compiled code should see everything the entry function saw.
   # TODO(mdan): This might not work well if the call tree spans modules?
   if tf_inspect.isfunction(e):
     compiled_node.__dict__.update(six.get_function_globals(e))
-
   compiled_fn = getattr(compiled_node, name)
+
+  if verbose:
+    logging.info('Compiled output of %s:\n\n%s\n', e, compiled_src)
+
   return compiled_fn
 
 
diff --git a/tensorflow/contrib/py2tf/impl/naming.py b/tensorflow/contrib/py2tf/impl/naming.py
index d31462cba0..51326091de 100644
--- a/tensorflow/contrib/py2tf/impl/naming.py
+++ b/tensorflow/contrib/py2tf/impl/naming.py
@@ -115,8 +115,14 @@ class Namer(object):
       else:
         raise ValueError('Unexpected symbol type "%s"' % type(s))
 
+    pieces = name_root.split('_')
+    if pieces[-1].isdigit():
+      name_root = '_'.join(pieces[:-1])
+      n = int(pieces[-1])
+    else:
+      n = 0
     new_name = name_root
-    n = 0
+
     while (new_name in self.global_namespace or
            new_name in all_reserved_locals or new_name in self.generated_names):
       n += 1
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD
index 91054fe61d..e3c0da4b10 100644
--- a/tensorflow/contrib/py2tf/pyct/BUILD
+++ b/tensorflow/contrib/py2tf/pyct/BUILD
@@ -92,6 +92,16 @@ py_test(
     ],
 )
 
+py_test(
+    name = "qual_names_test",
+    srcs = ["qual_names_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "templates_test",
     srcs = ["templates_test.py"],
diff --git a/tensorflow/contrib/py2tf/pyct/compiler.py b/tensorflow/contrib/py2tf/pyct/compiler.py
index fc71469d1e..0caadf18c0 100644
--- a/tensorflow/contrib/py2tf/pyct/compiler.py
+++ b/tensorflow/contrib/py2tf/pyct/compiler.py
@@ -63,4 +63,4 @@ def ast_to_object(node, indentation='  ', source_prefix=None):
       f.write(source_prefix)
       f.write('\n')
     f.write(source)
-  return imp.load_source(module_name, f.name)
+  return imp.load_source(module_name, f.name), source
diff --git a/tensorflow/contrib/py2tf/pyct/compiler_test.py b/tensorflow/contrib/py2tf/pyct/compiler_test.py
index e0cde43566..c1f84238ef 100644
--- a/tensorflow/contrib/py2tf/pyct/compiler_test.py
+++ b/tensorflow/contrib/py2tf/pyct/compiler_test.py
@@ -41,6 +41,7 @@ class CompilerTest(test.TestCase):
                 targets=[gast.Name('a', gast.Store(), None)],
                 value=gast.Str('c'))
         ])
+
     self.assertEqual(
         textwrap.dedent("""
             if 1:
@@ -70,15 +71,19 @@ class CompilerTest(test.TestCase):
         decorator_list=[],
         returns=None)
 
-    mod = compiler.ast_to_object(node)
+    module, source = compiler.ast_to_object(node)
 
-    self.assertEqual(2, mod.f(1))
-    with open(mod.__file__, 'r') as temp_output:
+    expected_source = """
+      def f(a):
+        return a + 1
+    """
+    self.assertEqual(
+        textwrap.dedent(expected_source).strip(),
+        source.strip())
+    self.assertEqual(2, module.f(1))
+    with open(module.__file__, 'r') as temp_output:
       self.assertEqual(
-          textwrap.dedent("""
-              def f(a):
-                return a + 1
-          """).strip(),
+          textwrap.dedent(expected_source).strip(),
           temp_output.read().strip())
 
 
diff --git a/tensorflow/contrib/py2tf/pyct/qual_names.py b/tensorflow/contrib/py2tf/pyct/qual_names.py
index 11e3838467..8717ee6cff 100644
--- a/tensorflow/contrib/py2tf/pyct/qual_names.py
+++ b/tensorflow/contrib/py2tf/pyct/qual_names.py
@@ -31,9 +31,7 @@ from tensorflow.contrib.py2tf.pyct import anno
 
 
 class QN(object):
-  """Represents a qualified name.
-
-  """
+  """Represents a qualified name."""
 
   def __init__(self, base, attr=None):
     if attr:
@@ -42,8 +40,15 @@ class QN(object):
       self._parent = base
       self.qn = base.qn + (attr,)
     else:
-      self._parent = None
-      self.qn = tuple(base.split('.'))
+      if isinstance(base, QN):
+        if base.is_composite():
+          self._parent = base.parent
+        else:
+          self._parent = None
+        self.qn = base.qn
+      else:
+        self._parent = None
+        self.qn = tuple(base.split('.'))
 
   def is_composite(self):
     return len(self.qn) > 1
diff --git a/tensorflow/contrib/py2tf/pyct/qual_names_test.py b/tensorflow/contrib/py2tf/pyct/qual_names_test.py
new file mode 100644
index 0000000000..1b1eee2dec
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/qual_names_test.py
@@ -0,0 +1,108 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for qual_names module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import textwrap
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.python.platform import test
+
+
+class QNTest(test.TestCase):
+
+  def test_basic(self):
+    a = qual_names.QN('a')
+    self.assertEqual(a.qn, ('a',))
+    self.assertEqual(str(a), 'a')
+    self.assertEqual(a.ssf(), 'a')
+    self.assertEqual(a.ast().id, 'a')
+    self.assertFalse(a.is_composite())
+    with self.assertRaises(ValueError):
+      _ = a.parent
+
+    a_b = qual_names.QN(a, 'b')
+    self.assertEqual(a_b.qn, ('a', 'b'))
+    self.assertEqual(str(a_b), 'a.b')
+    self.assertEqual(a_b.ssf(), 'a_b')
+    self.assertEqual(a_b.ast().value.id, 'a')
+    self.assertEqual(a_b.ast().attr, 'b')
+    self.assertTrue(a_b.is_composite())
+    self.assertEqual(a_b.parent.qn, ('a',))
+
+    a2 = qual_names.QN(a)
+    self.assertEqual(a2.qn, ('a',))
+    with self.assertRaises(ValueError):
+      _ = a.parent
+
+    a_b2 = qual_names.QN(a_b)
+    self.assertEqual(a_b2.qn, ('a', 'b'))
+    self.assertEqual(a_b2.parent.qn, ('a',))
+
+    self.assertTrue(a2 == a)
+    self.assertFalse(a2 is a)
+
+    self.assertTrue(a_b.parent == a)
+    self.assertTrue(a_b2.parent == a)
+
+    self.assertTrue(a_b2 == a_b)
+    self.assertFalse(a_b2 is a_b)
+    self.assertFalse(a_b2 == a)
+
+    with self.assertRaises(ValueError):
+      qual_names.QN('a', 'b')
+
+  def test_hashable(self):
+    d = {qual_names.QN('a'): 'a', qual_names.QN('b'): 'b'}
+
+    self.assertEqual(d[qual_names.QN('a')], 'a')
+    self.assertEqual(d[qual_names.QN('b')], 'b')
+    self.assertTrue(qual_names.QN('c') not in d)
+
+
+class QNResolverTest(test.TestCase):
+
+  def assertQNStringIs(self, node, qn_str):
+    self.assertEqual(str(anno.getanno(node, anno.Basic.QN)), qn_str)
+
+  def test_resolve(self):
+    samples = """
+      a
+      a.b
+      (c, d.e)
+      [f, (g.h.i)]
+      j(k, l)
+    """
+    nodes = qual_names.resolve(parser.parse_str(textwrap.dedent(samples)))
+    nodes = tuple(n.value for n in nodes.body)
+
+    self.assertQNStringIs(nodes[0], 'a')
+    self.assertQNStringIs(nodes[1], 'a.b')
+    self.assertQNStringIs(nodes[2].elts[0], 'c')
+    self.assertQNStringIs(nodes[2].elts[1], 'd.e')
+    self.assertQNStringIs(nodes[3].elts[0], 'f')
+    self.assertQNStringIs(nodes[3].elts[1], 'g.h.i')
+    self.assertQNStringIs(nodes[4].func, 'j')
+    self.assertQNStringIs(nodes[4].args[0], 'k')
+    self.assertQNStringIs(nodes[4].args[1], 'l')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/templates.py b/tensorflow/contrib/py2tf/pyct/templates.py
index 5fd5252619..c40e4d0fb7 100644
--- a/tensorflow/contrib/py2tf/pyct/templates.py
+++ b/tensorflow/contrib/py2tf/pyct/templates.py
@@ -59,7 +59,7 @@ class ReplaceTransformer(gast.NodeTransformer):
       repl = self.replacements[node.name]
       if not isinstance(repl, (gast.Name, ast.Name)):
         raise ValueError(
-            'A function name can only be replaced by a Name node. Found: %s',
+            'A function name can only be replaced by a Name node. Found: %s' %
             repl)
       node.name = repl.id
     return node
@@ -70,6 +70,8 @@ class ReplaceTransformer(gast.NodeTransformer):
       node.ctx = gast.Load()
     elif isinstance(node, gast.Name):
       node.ctx = ctx
+    elif isinstance(node, (gast.Str, gast.Num)):
+      pass
     else:
       raise ValueError('unexpected node type "%s"' % node)
 
diff --git a/tensorflow/contrib/py2tf/pyct/templates_test.py b/tensorflow/contrib/py2tf/pyct/templates_test.py
index 0e3d07e378..8ccfde8573 100644
--- a/tensorflow/contrib/py2tf/pyct/templates_test.py
+++ b/tensorflow/contrib/py2tf/pyct/templates_test.py
@@ -34,7 +34,8 @@ class TemplatesTest(test.TestCase):
     """
 
     node = templates.replace(template, b=('a', 'c'))[0]
-    result = compiler.ast_to_object(node)
+    result, _ = compiler.ast_to_object(node)
+
     self.assertEquals((2, 3), result.test_fn(2, 3))
 
   def test_replace_variable(self):
@@ -46,7 +47,7 @@ class TemplatesTest(test.TestCase):
     """
 
     node = templates.replace(template, a='b')[0]
-    result = compiler.ast_to_object(node)
+    result, _ = compiler.ast_to_object(node)
     self.assertEquals(7, result.test_fn(2))
 
   def test_replace_function_name(self):
@@ -58,7 +59,7 @@ class TemplatesTest(test.TestCase):
     """
 
     node = templates.replace(template, fname='test_fn')[0]
-    result = compiler.ast_to_object(node)
+    result, _ = compiler.ast_to_object(node)
     self.assertEquals(7, result.test_fn(2))
 
   def test_code_block(self):
@@ -75,7 +76,7 @@ class TemplatesTest(test.TestCase):
                 gast.Name('a', None, None)
             ], gast.BinOp(gast.Name('a', None, None), gast.Add(), gast.Num(1))),
         ] * 2)[0]
-    result = compiler.ast_to_object(node)
+    result, _ = compiler.ast_to_object(node)
     self.assertEquals(3, result.test_fn(1))
 
 
-- 
GitLab


From 14042ac29bc4838023ace0ce723598f35194f927 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 09:29:32 -0800
Subject: [PATCH 821/966] Enable algebraic optimizations for operations with
 neutral and absorbing elements by default - not only when feeds are absent or
 in aggressive mode.

PiperOrigin-RevId: 185006374
---
 tensorflow/core/grappler/optimizers/constant_folding.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 1e6f11c8aa..e27bd97325 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -1443,15 +1443,14 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       graph_modified_ = true;
       continue;
     }
-    const bool safe_to_use_shapes =
-        use_shape_info && (feed_nodes_.empty() || is_aggressive);
+
     const bool is_mul = IsMul(*node);
     const bool is_matmul = IsMatMul(*node);
     const bool is_add = IsAdd(*node) || IsBiasAdd(*node);
     const bool is_sub = IsSub(*node);
     const bool is_any_div = IsAnyDiv(*node);
     // Simplify arithmetic operations with ones or zeros.
-    if (safe_to_use_shapes &&
+    if (use_shape_info &&
         (is_mul || is_matmul || is_add || is_sub || is_any_div) &&
         properties.HasInputProperties(node->name()) &&
         properties.HasOutputProperties(node->name())) {
-- 
GitLab


From a4f0b3afb631f40024996c16a8bf2a146fb3dc8c Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Thu, 8 Feb 2018 09:34:49 -0800
Subject: [PATCH 822/966] Make quantization rewrites happen in place.

If no graph is provided, then the default graph is used.

PiperOrigin-RevId: 185007107
---
 tensorflow/contrib/quantize/BUILD             |  30 ---
 .../contrib/quantize/python/copy_graph.py     |  32 ----
 .../quantize/python/copy_graph_test.py        |  55 ------
 .../quantize/python/fold_batch_norms_test.py  |  14 +-
 .../contrib/quantize/python/quantize_graph.py | 177 ++++++------------
 .../quantize/python/quantize_graph_test.py    |  82 ++++----
 6 files changed, 98 insertions(+), 292 deletions(-)
 delete mode 100644 tensorflow/contrib/quantize/python/copy_graph.py
 delete mode 100644 tensorflow/contrib/quantize/python/copy_graph_test.py

diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index ada336e623..42e295e622 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -95,7 +95,6 @@ py_test(
     srcs = ["python/fold_batch_norms_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":copy_graph",
         ":fold_batch_norms",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
@@ -110,31 +109,7 @@ py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:random_seed",
         "//tensorflow/python:session",
-        "//tensorflow/python:variables",
-    ],
-)
-
-py_library(
-    name = "copy_graph",
-    srcs = ["python/copy_graph.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
-    ],
-)
-
-py_test(
-    name = "copy_graph_test",
-    size = "small",
-    srcs = ["python/copy_graph_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":copy_graph",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
     ],
 )
@@ -235,12 +210,9 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":copy_graph",
         ":fold_batch_norms",
         ":quantize",
-        "//tensorflow/python:framework_ops",
         "//tensorflow/python:util",
-        "//tensorflow/python:variables",
     ],
 )
 
@@ -253,13 +225,11 @@ py_test(
         ":quantize_graph",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
     ],
 )
 
diff --git a/tensorflow/contrib/quantize/python/copy_graph.py b/tensorflow/contrib/quantize/python/copy_graph.py
deleted file mode 100644
index 0376fcba82..0000000000
--- a/tensorflow/contrib/quantize/python/copy_graph.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utility to copy a tf.Graph."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.training import saver as saver_lib
-
-
-def CopyGraph(graph):
-  """Return a copy of graph."""
-  meta_graph = saver_lib.export_meta_graph(
-      graph=graph, collection_list=graph.get_all_collection_keys())
-  graph_copy = ops.Graph()
-  with graph_copy.as_default():
-    _ = saver_lib.import_meta_graph(meta_graph)
-  return graph_copy
diff --git a/tensorflow/contrib/quantize/python/copy_graph_test.py b/tensorflow/contrib/quantize/python/copy_graph_test.py
deleted file mode 100644
index 7ff9ad9f84..0000000000
--- a/tensorflow/contrib/quantize/python/copy_graph_test.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for copy_graph."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.quantize.python import copy_graph
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import googletest
-
-
-class CopyGraphTest(test_util.TensorFlowTestCase):
-
-  def _CompareNodeInGraph(self, node, graph):
-    graph_node = graph.get_operation_by_name(node.name)
-    self.assertEqual(str(node.node_def), str(graph_node.node_def))
-
-  def testCopyGraph(self):
-    graph = ops.Graph()
-    with graph.as_default():
-      a = constant_op.constant(1.0)
-      b = variables.Variable(2.0)
-      c = a + b
-    graph_copy = copy_graph.CopyGraph(graph)
-    # Ensure that the three original nodes are in the new graph.
-    # import_meta_graph also adds a saver node to the graph which we don't care
-    # about in this specific use case.
-    for tensor in [a, b, c]:
-      self._CompareNodeInGraph(tensor.op, graph_copy)
-    # Test that the graph collections are the same.
-    for key in graph.get_all_collection_keys():
-      self.assertEqual(
-          len(graph.get_collection(key)),
-          len(graph_copy.get_collection(key)), 'Collection %s differs.')
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
index 330bd8a647..c90a18ab03 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.layers.python.layers import layers
-from tensorflow.contrib.quantize.python import copy_graph
 from tensorflow.contrib.quantize.python import fold_batch_norms
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
@@ -34,6 +33,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
+from tensorflow.python.training import saver as saver_lib
 
 batch_norm = layers.batch_norm
 conv2d = layers.conv2d
@@ -379,7 +379,7 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
       relu_node = relu(node, name='test/' + relu_op_name)
-    folded_g = copy_graph.CopyGraph(unfolded_g)
+    folded_g = self._CopyGraph(unfolded_g)
     with folded_g.as_default():
       fold_batch_norms.FoldBatchNorms(
           folded_g,
@@ -462,5 +462,15 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       out_op = graph.get_operation_by_name(out_op_name)
       self.assertIn(op.outputs[0].name, [str(t.name) for t in out_op.inputs])
 
+  def _CopyGraph(self, graph):
+    """Return a copy of graph."""
+    meta_graph = saver_lib.export_meta_graph(
+        graph=graph, collection_list=graph.get_all_collection_keys())
+    graph_copy = ops.Graph()
+    with graph_copy.as_default():
+      _ = saver_lib.import_meta_graph(meta_graph)
+    return graph_copy
+
+
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 89b744c559..81471d4c50 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -18,40 +18,28 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.quantize.python import copy_graph
 from tensorflow.contrib.quantize.python import fold_batch_norms
 from tensorflow.contrib.quantize.python import quantize
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import variables
 
 
-def _create_graph(input_graph,
-                  is_training,
-                  elements=None,
-                  device_name_or_function=None):
-  """Returns a transformed training input_graph for simulated quantization.
+def _create_graph(input_graph=None, is_training=True):
+  """Rewrites input_graph in place for simulated quantization.
 
-  The forward pass has fake quantization ops inserted to simulate the error
-  introduced by quantization.
+  The graph has fake quantization ops inserted to simulate the error
+  introduced by quantization. Since the graph is transformed in place,
+  the expected behavior of previously held references to nodes and tensors may
+  change.
 
   Args:
-    input_graph: The tf.Graph to be transformed.
+    input_graph: The tf.Graph to be transformed, if None then defaults to the
+      default graph.
     is_training: Whether quantizing training or eval graph.
-    elements: (Optional) List of Tensors and Operations in input_graph whose
-        corresponding elements in the new graph will be returned.
-    device_name_or_function: (Optional) The device name or function to use.
-
-  Returns:
-    g is new tf.Graph that is rewritten for simulated quantization.
-    l is a list of Tensors/Operations in g corresponding to the provided input
-        elements, if elements is not None.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
         tf.Operation.
   """
-  # TODO(suharshs): Describe the process in more detail in the doc string.
-  g = copy_graph.CopyGraph(input_graph)
   if is_training:
     # TODO(raghuramank): Need to make freeze_batch_norm_delay
     # a function of the batch size. For now setting this to 250 epochs
@@ -59,146 +47,87 @@ def _create_graph(input_graph,
     freeze_batch_norm_delay = 5000000
   else:
     freeze_batch_norm_delay = None
-  with g.as_default():
-    with ops.device(device_name_or_function):
-      fold_batch_norms.FoldBatchNorms(
-          g,
-          freeze_batch_norm_delay=freeze_batch_norm_delay,
-          is_training=is_training)
-      quantize.Quantize(g, is_training=is_training)
-  if elements is None:
-    return g
-
-  return_elements = []
-  for element in elements:
-    if isinstance(element, (ops.Tensor, variables.Variable)):
-      return_elements.append(g.get_tensor_by_name(element.name))
-    elif isinstance(element, ops.Operation):
-      return_elements.append(g.get_operation_by_name(element.name))
-    else:
-      raise ValueError(
-          'elements must consist of Tensor or Operation objects, got: ',
-          str(element))
-  return g, return_elements
-
-
-def create_training_graph(input_graph,
-                          elements=None,
-                          device_name_or_function=None):
-  """Returns a transformed training input_graph for simulated quantization.
-
-  The forward pass has fake quantization ops inserted to simulate the error
-  introduced by quantization.
+  if input_graph is None:
+    input_graph = ops.get_default_graph()
+  with input_graph.as_default():
+    fold_batch_norms.FoldBatchNorms(
+        input_graph,
+        freeze_batch_norm_delay=freeze_batch_norm_delay,
+        is_training=is_training)
+    quantize.Quantize(input_graph, is_training=is_training)
+
+
+def create_training_graph(input_graph=None):
+  """Rewrites a training input_graph in place for simulated quantization.
+
+  The graph has fake quantization ops inserted to simulate the error
+  introduced by quantization. Since the graph is transformed in place,
+  the expected behavior of previously held references to nodes and tensors may
+  change.
 
   Args:
     input_graph: The tf.Graph to be transformed.
-    elements: (Optional) List of Tensors and Operations in input_graph whose
-        corresponding elements in the new graph will be returned.
-    device_name_or_function: (Optional) The device name or function to use.
-
-  Returns:
-    g is new tf.Graph that is rewritten for simulated quantization.
-    l is a list of Tensors/Operations in g corresponding to the provided input
-        elements, if elements is not None.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
         tf.Operation.
   """
-  return _create_graph(
-      input_graph=input_graph,
-      is_training=True,
-      elements=elements,
-      device_name_or_function=device_name_or_function)
+  _create_graph(input_graph=input_graph, is_training=True)
 
 
-def create_eval_graph(input_graph, elements=None, device_name_or_function=None):
-  """Returns a transformed eval input_graph for simulated quantization.
+def create_eval_graph(input_graph=None):
+  """Rewrites an eval input_graph in place for simulated quantization.
 
-  The forward pass has fake quantization ops inserted to simulate the error
-  introduced by quantization.
+  The graph has fake quantization ops inserted to simulate the error
+  introduced by quantization. Since the graph is transformed in place,
+  the expected behavior of previously held references to nodes and tensors may
+  change.
 
   Args:
-    input_graph: The tf.Graph to be transformed.
-    elements: (Optional) List of Tensors and Operations in input_graph whose
-        corresponding elements in the new graph will be returned.
-    device_name_or_function: (Optional) The device name or function to use.
+    input_graph: The tf.Graph to be transformed, if None then defaults to the
+      default graph.
 
-  Returns:
-    g is new tf.Graph that is rewritten for simulated quantization.
-    l is a list of Tensors/Operations in g corresponding to the provided input
-        elements, if elements is not None.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
         tf.Operation.
   """
-  return _create_graph(
-      input_graph=input_graph,
-      is_training=False,
-      elements=elements,
-      device_name_or_function=device_name_or_function)
+  _create_graph(input_graph=input_graph, is_training=False)
 
 
-def experimental_create_training_graph(input_graph,
-                                       elements=None,
-                                       device_name_or_function=None):
-  """Returns a transformed training input_graph for simulated quantization.
+def experimental_create_training_graph(input_graph=None):
+  """Rewrites a training input_graph in place for simulated quantization.
 
-  This function has additional experimental options not (yet) available to
-  create_training_graph. The resulting behavior may be undefined.
-  The forward pass has fake quantization ops inserted to simulate the error
-  introduced by quantization.
+  The graph has fake quantization ops inserted to simulate the error
+  introduced by quantization. Since the graph is transformed in place,
+  the expected behavior of previously held references to nodes and tensors may
+  change.
 
   Args:
-    input_graph: The tf.Graph to be transformed.
-    elements: (Optional) List of Tensors and Operations in input_graph whose
-        corresponding elements in the new graph will be returned.
-    device_name_or_function: (Optional) The device name or function to use.
-
-  Returns:
-    g is new tf.Graph that is rewritten for simulated quantization.
-    l is a list of Tensors/Operations in g corresponding to the provided input
-        elements, if elements is not None.
+    input_graph: The tf.Graph to be transformed, if None then defaults to the
+      default graph.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
         tf.Operation.
   """
-  return _create_graph(
-      input_graph=input_graph,
-      is_training=True,
-      elements=elements,
-      device_name_or_function=device_name_or_function)
+  _create_graph(input_graph=input_graph, is_training=True)
 
 
-def experimental_create_eval_graph(input_graph,
-                                   elements=None,
-                                   device_name_or_function=None):
-  """Returns a transformed eval input_graph for simulated quantization.
+def experimental_create_eval_graph(input_graph=None):
+  """Rewrites an eval input_graph in place for simulated quantization.
 
-  This function has additional experimental options not (yet) available to
-  create_eval_graph. The resulting behavior may be undefined.
-  The forward pass has fake quantization ops inserted to simulate the error
-  introduced by quantization.
+  The graph has fake quantization ops inserted to simulate the error
+  introduced by quantization. Since the graph is transformed in place,
+  the expected behavior of previously held references to nodes and tensors may
+  change.
 
   Args:
-    input_graph: The tf.Graph to be transformed.
-    elements: (Optional) List of Tensors and Operations in input_graph whose
-        corresponding elements in the new graph will be returned.
-    device_name_or_function: (Optional) The device name or function to use.
-
-  Returns:
-    g is new tf.Graph that is rewritten for simulated quantization.
-    l is a list of Tensors/Operations in g corresponding to the provided input
-        elements, if elements is not None.
+    input_graph: The tf.Graph to be transformed, if None then defaults to the
+      default graph.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
         tf.Operation.
   """
-  return _create_graph(
-      input_graph=input_graph,
-      is_training=False,
-      elements=elements,
-      device_name_or_function=device_name_or_function)
+  _create_graph(input_graph=input_graph, is_training=False)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index 514862a0ab..7e08ebcb5c 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -20,13 +20,11 @@ from __future__ import print_function
 
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.quantize.python import quantize_graph
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
@@ -44,46 +42,10 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
     for fn in rewrite_fns:
       test_fn(fn)
 
-  def testReturnedElements(self):
-    self._RunTestOverParameters(self._TestReturnElements)
+  def testRewrite(self):
+    self._RunTestOverParameters(self._TestRewrite)
 
-  def _TestReturnElements(self, fn):
-    graph = ops.Graph()
-    with graph.as_default():
-      a = constant_op.constant(1.0)
-      b = variables.Variable(2.0)
-      c = a + b
-    elements = [a, b, c.op]
-    q_graph, returned_elements = fn(graph, elements=elements)
-    # Make sure q_graph is different from graph.
-    self.assertTrue(graph != q_graph)
-    # Check that the returned elements are part of the new graph.
-    for returned_element in returned_elements:
-      self.assertEqual(q_graph, returned_element.graph)
-    # Check that the elements match with the one from the input graph.
-    for element, returned_element in zip(elements, returned_elements):
-      self.assertEqual(element.name, returned_element.name)
-
-  def testNoReturnElements(self):
-    self._RunTestOverParameters(self._TestNoReturnElements)
-
-  def _TestNoReturnElements(self, fn):
-    graph = ops.Graph()
-    with graph.as_default():
-      a = constant_op.constant(1.0)
-      b = variables.Variable(2.0)
-      _ = a + b
-    q_graph = fn(graph)
-    # Check that quantize_graph didn't return a tuple when elements isn't
-    # provided.
-    self.assertTrue(isinstance(q_graph, ops.Graph))
-    # Make sure q_graph is different from graph.
-    self.assertTrue(graph != q_graph)
-
-  def testDeviceName(self):
-    self._RunTestOverParameters(self._TestDeviceName)
-
-  def _TestDeviceName(self, fn):
+  def _TestRewrite(self, fn):
     graph = ops.Graph()
     with graph.as_default():
       batch_size, height, width, depth = 5, 128, 128, 3
@@ -98,18 +60,40 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
           scope='test')
       _ = nn_ops.relu6(conv)
 
-    device_name = '/job:oink/task:0/device:CPU:0'
-    q_graph = fn(graph, device_name_or_function=device_name)
-
     orig_variable_names = set(
         [v.name for v in graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
-    q_variables = q_graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+
+    fn(graph)
+
+    q_variables = graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     # Ensure that variables were added.
     self.assertTrue(len(orig_variable_names) < len(q_variables))
-    # All added variables should have the specified device name.
-    for var in q_variables:
-      if var.name not in orig_variable_names:
-        self.assertEqual(var.device, device_name)
+
+  def testDefaultGraph(self):
+    self._RunTestOverParameters(self._TestRewrite)
+
+  def _TestDefaultGraph(self, fn):
+    with ops.Graph().as_default() as g:
+      batch_size, height, width, depth = 5, 128, 128, 3
+      inputs = array_ops.zeros((batch_size, height, width, depth))
+      conv = layers.conv2d(
+          inputs,
+          32, [5, 5],
+          stride=2,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=None,
+          scope='test')
+      _ = nn_ops.relu6(conv)
+
+      orig_variable_names = set(
+          [v.name for v in g.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
+      fn()
+
+      q_variables = g.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+      # Ensure that variables were added.
+      self.assertTrue(len(orig_variable_names) < len(q_variables))
 
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
-- 
GitLab


From 1baac7862739525351d25202800dc04e8ec3868b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 10:20:26 -0800
Subject: [PATCH 823/966] Make MklCpuAllocator a VisitableAllocator, instead of
 just an Allocator. This allows it to work more efficiently with RDMA
 networking.

PiperOrigin-RevId: 185013628
---
 tensorflow/core/common_runtime/mkl_cpu_allocator.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index c7a2b616c7..0eb47f4e56 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -46,7 +46,7 @@ class MklSubAllocator : public SubAllocator {
 
 /// CPU allocator for MKL that wraps BFC allocator and intercepts
 /// and redirects memory allocation calls from MKL.
-class MklCPUAllocator : public Allocator {
+class MklCPUAllocator : public VisitableAllocator {
  public:
   // Constructor and other standard functions
 
@@ -119,6 +119,14 @@ class MklCPUAllocator : public Allocator {
 
   void ClearStats() override { allocator_->ClearStats(); }
 
+  void AddAllocVisitor(Visitor visitor) override {
+    allocator_->AddAllocVisitor(visitor);
+  }
+
+  void AddFreeVisitor(Visitor visitor) override {
+    allocator_->AddFreeVisitor(visitor);
+  }
+
  private:
   // Hooks provided by this allocator for memory allocation routines from MKL
 
-- 
GitLab


From 89666bb3d7eb4dee1398bd0fc37f697164fa403f Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Thu, 8 Feb 2018 10:22:16 -0800
Subject: [PATCH 824/966] Remove cudnn_type parameter from a few template
 member functions in class CudnnSupport.

The cudnn_type value in those functions is implied by the device memory
type DeviceMemory<T> and is a compile-time constant.

PiperOrigin-RevId: 185013909
---
 tensorflow/stream_executor/cuda/cuda_dnn.cc | 150 ++++++++++----------
 tensorflow/stream_executor/cuda/cuda_dnn.h  |   7 +-
 2 files changed, 77 insertions(+), 80 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 384445e6c1..b6abd42767 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -109,6 +109,24 @@ string ToString(cudnnStatus_t status) {
   }
 }
 
+template <typename T>
+cudnnDataType_t GetCudnnDataType();
+
+template <>
+cudnnDataType_t GetCudnnDataType<double>() {
+  return CUDNN_DATA_DOUBLE;
+}
+
+template <>
+cudnnDataType_t GetCudnnDataType<float>() {
+  return CUDNN_DATA_FLOAT;
+}
+
+template <>
+cudnnDataType_t GetCudnnDataType<Eigen::half>() {
+  return CUDNN_DATA_HALF;
+}
+
 namespace wrap {
 
 static port::ThreadPool* InitCudnnThreadpool() {
@@ -2106,7 +2124,6 @@ inline cudnnConvolutionFwdAlgo_t GetCudnnConvolutionForwardAlgo(
 
 dnn::AlgorithmDesc GetCudnnConvolutionForwardAlgorithm(
     Stream* stream, CUDAExecutor* parent, void* dnn_handle,
-    int cudnn_type,  // Actually cudnnDataType_t.
     const dnn::AlgorithmConfig& algorithm_config, bool is_profiling,
     const ScopedTensorDescriptor& input_nd,
     const ScopedFilterDescriptor& filter,
@@ -2264,8 +2281,8 @@ cudnnDataType_t GetConvComputeType<Eigen::half>() {
 
 template <class T>
 bool CudnnSupport::DoConvolveImpl(
-    Stream* stream, int cudnn_type,  // Actually cudnnDataType_t.
-    const BatchDescriptor& batch_descriptor, const DeviceMemory<T>& input_data,
+    Stream* stream, const BatchDescriptor& batch_descriptor,
+    const DeviceMemory<T>& input_data,
     const FilterDescriptor& filter_descriptor,
     const DeviceMemory<T>& filter_data,
     const ConvolutionDescriptor& convolution_descriptor,
@@ -2273,12 +2290,11 @@ bool CudnnSupport::DoConvolveImpl(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  ScopedTensorDescriptor input_nd{parent_, batch_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_type)};
-  ScopedTensorDescriptor output_nd{parent_, output_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_type)};
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  ScopedTensorDescriptor input_nd{parent_, batch_descriptor, cudnn_type};
+  ScopedTensorDescriptor output_nd{parent_, output_descriptor, cudnn_type};
   ScopedFilterDescriptor filter{parent_, filter_descriptor, batch_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_type)};
+                                cudnn_type};
   ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
                                    GetConvComputeType<T>()};
 
@@ -2505,9 +2521,8 @@ bool CudnnSupport::DoFusedConvolveImpl(
   const bool is_profiling = output_profile_result != nullptr;
   DeviceMemory<uint8> scratch;
   dnn::AlgorithmDesc algotype = GetCudnnConvolutionForwardAlgorithm(
-      stream, parent_, dnn_handle_, cudnn_data_type, algorithm_config,
-      is_profiling, conv_input_nd, filter, conv, output_nd, scratch_allocator,
-      &scratch);
+      stream, parent_, dnn_handle_, algorithm_config, is_profiling,
+      conv_input_nd, filter, conv, output_nd, scratch_allocator, &scratch);
   if (algotype.is_default()) {
     if (!is_profiling) {
       LOG(ERROR) << "No suitable algorithm found";
@@ -2888,9 +2903,9 @@ bool CudnnSupport::DoConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveImpl<float>(
-      stream, CUDNN_DATA_FLOAT, batch_descriptor, input_data, filter_descriptor,
-      filter_data, convolution_descriptor, output_descriptor, output_data,
-      scratch_allocator, algorithm_config, output_profile_result);
+      stream, batch_descriptor, input_data, filter_descriptor, filter_data,
+      convolution_descriptor, output_descriptor, output_data, scratch_allocator,
+      algorithm_config, output_profile_result);
 }
 
 bool CudnnSupport::DoConvolve(
@@ -2916,9 +2931,9 @@ bool CudnnSupport::DoConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveImpl<Eigen::half>(
-      stream, CUDNN_DATA_HALF, batch_descriptor, input_data, filter_descriptor,
-      filter_data, convolution_descriptor, output_descriptor, output_data,
-      scratch_allocator, algorithm_config, output_profile_result);
+      stream, batch_descriptor, input_data, filter_descriptor, filter_data,
+      convolution_descriptor, output_descriptor, output_data, scratch_allocator,
+      algorithm_config, output_profile_result);
 }
 
 bool CudnnSupport::DoFusedConvolve(
@@ -3027,7 +3042,6 @@ bool CudnnSupport::DoFusedConvolve(
 template<class T>
 DeviceMemory<T> CudnnSupport::MaybeTransformLayout(
     Stream* stream,
-    int cudnn_type,  // Actually cudnnDataType_t.
     BatchDescriptor* output_descriptor,
     DeviceMemory<T> backward_output_data,
     std::unique_ptr<TemporaryDeviceMemory<T>>* transform_scratch) {
@@ -3041,11 +3055,11 @@ DeviceMemory<T> CudnnSupport::MaybeTransformLayout(
   BatchDescriptor transformed_output_descriptor;
   transformed_output_descriptor.CloneFrom(*output_descriptor);
   transformed_output_descriptor.set_layout(dnn::DataLayout::kBatchDepthYX);
-  ScopedTensorDescriptor orig_out_back_nd{
-      parent_, *output_descriptor, static_cast<cudnnDataType_t>(cudnn_type)};
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  ScopedTensorDescriptor orig_out_back_nd{parent_, *output_descriptor,
+                                          cudnn_type};
   ScopedTensorDescriptor transformed_out_back_nd{
-      parent_, transformed_output_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_type)};
+      parent_, transformed_output_descriptor, cudnn_type};
 
   float alpha = 1.0f;
   float beta = 0.0f;
@@ -3092,7 +3106,6 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
 template <class T>
 bool CudnnSupport::DoConvolveBackwardDataImpl(
     Stream* stream,
-    int cudnn_type,  // Actually cudnnDataType_t.
     const FilterDescriptor& filter_descriptor,
     const DeviceMemory<T>& filter_data,
     const BatchDescriptor& output_descriptor_in,
@@ -3119,15 +3132,13 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
   output_descriptor.CloneFrom(output_descriptor_in);
   std::unique_ptr<TemporaryDeviceMemory<T>> transform_scratch;
   backward_output_data = MaybeTransformLayout(
-      stream, cudnn_type, &output_descriptor, backward_output_data,
-      &transform_scratch);
+      stream, &output_descriptor, backward_output_data, &transform_scratch);
 
-  ScopedTensorDescriptor out_back_nd{parent_, output_descriptor,
-                                     static_cast<cudnnDataType_t>(cudnn_type)};
-  ScopedTensorDescriptor in_back_nd{parent_, input_descriptor,
-                                    static_cast<cudnnDataType_t>(cudnn_type)};
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  ScopedTensorDescriptor out_back_nd{parent_, output_descriptor, cudnn_type};
+  ScopedTensorDescriptor in_back_nd{parent_, input_descriptor, cudnn_type};
   ScopedFilterDescriptor filter{parent_, filter_descriptor, input_descriptor,
-                                static_cast<cudnnDataType_t>(cudnn_type)};
+                                cudnn_type};
   ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
                                    GetConvComputeType<T>()};
 
@@ -3315,11 +3326,11 @@ bool CudnnSupport::DoConvolveBackwardData(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardDataImpl(
-      stream, CUDNN_DATA_FLOAT, filter_descriptor, filter_data,
-      output_descriptor_in, backward_output_data, convolution_descriptor,
-      input_descriptor, backward_input_data, scratch_allocator,
-      algorithm_config, output_profile_result);
+  return DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
+                                    output_descriptor_in, backward_output_data,
+                                    convolution_descriptor, input_descriptor,
+                                    backward_input_data, scratch_allocator,
+                                    algorithm_config, output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardData(
@@ -3333,17 +3344,16 @@ bool CudnnSupport::DoConvolveBackwardData(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardDataImpl(
-      stream, CUDNN_DATA_HALF, filter_descriptor, filter_data,
-      output_descriptor_in, backward_output_data, convolution_descriptor,
-      input_descriptor, backward_input_data, scratch_allocator,
-      algorithm_config, output_profile_result);
+  return DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
+                                    output_descriptor_in, backward_output_data,
+                                    convolution_descriptor, input_descriptor,
+                                    backward_input_data, scratch_allocator,
+                                    algorithm_config, output_profile_result);
 }
 
 template <class T>
 bool CudnnSupport::DoConvolveBackwardFilterImpl(
-    Stream* stream, int cudnn_type,  // Actually cudnnDataType_t.
-    const dnn::BatchDescriptor& input_descriptor,
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<T>& input_data,
     const dnn::BatchDescriptor& output_descriptor_in,
     DeviceMemory<T> backward_output_data,
@@ -3369,16 +3379,13 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
   output_descriptor.CloneFrom(output_descriptor_in);
   std::unique_ptr<TemporaryDeviceMemory<T>> transform_scratch;
   backward_output_data = MaybeTransformLayout(
-      stream, static_cast<cudnnDataType_t>(cudnn_type),
-      &output_descriptor, backward_output_data,
-      &transform_scratch);
-
-  ScopedTensorDescriptor out_back_nd{parent_, output_descriptor,
-        static_cast<cudnnDataType_t>(cudnn_type)};
-  ScopedTensorDescriptor input_nd{parent_, input_descriptor,
-          static_cast<cudnnDataType_t>(cudnn_type)};
+      stream, &output_descriptor, backward_output_data, &transform_scratch);
+
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  ScopedTensorDescriptor out_back_nd{parent_, output_descriptor, cudnn_type};
+  ScopedTensorDescriptor input_nd{parent_, input_descriptor, cudnn_type};
   ScopedFilterDescriptor filter{parent_, filter_descriptor, input_descriptor,
-        static_cast<cudnnDataType_t>(cudnn_type)};
+                                cudnn_type};
   ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
                                    GetConvComputeType<T>()};
 
@@ -3568,10 +3575,10 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveBackwardFilterImpl(
-      stream, CUDNN_DATA_FLOAT, input_descriptor, input_data,
-      output_descriptor_in, backward_output_data, convolution_descriptor,
-      filter_descriptor, backward_filter_data, scratch_allocator,
-      algorithm_config, output_profile_result);
+      stream, input_descriptor, input_data, output_descriptor_in,
+      backward_output_data, convolution_descriptor, filter_descriptor,
+      backward_filter_data, scratch_allocator, algorithm_config,
+      output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardFilter(
@@ -3586,16 +3593,15 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveBackwardFilterImpl(
-      stream, CUDNN_DATA_HALF, input_descriptor, input_data,
-      output_descriptor_in, backward_output_data, convolution_descriptor,
-      filter_descriptor, backward_filter_data, scratch_allocator,
-      algorithm_config, output_profile_result);
+      stream, input_descriptor, input_data, output_descriptor_in,
+      backward_output_data, convolution_descriptor, filter_descriptor,
+      backward_filter_data, scratch_allocator, algorithm_config,
+      output_profile_result);
 }
 
 template <class T>
 bool CudnnSupport::DoConvolveBackwardBiasImpl(
-    Stream* stream, int cudnn_type,  // Actually cudnnDataType_t.
-    const dnn::BatchDescriptor& input_descriptor,
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<T>& input_data,
     const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<T>* backward_bias_data) {
@@ -3606,10 +3612,9 @@ bool CudnnSupport::DoConvolveBackwardBiasImpl(
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
 
-  ScopedTensorDescriptor input_nd{parent_, input_descriptor,
-                                  static_cast<cudnnDataType_t>(cudnn_type)};
-  ScopedTensorDescriptor bias_nd{parent_, bias_descriptor,
-                                 static_cast<cudnnDataType_t>(cudnn_type)};
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  ScopedTensorDescriptor input_nd{parent_, input_descriptor, cudnn_type};
+  ScopedTensorDescriptor bias_nd{parent_, bias_descriptor, cudnn_type};
 
   // Alpha is the scaling factor for input.
   float alpha = 1.0;
@@ -3633,9 +3638,8 @@ bool CudnnSupport::DoConvolveBackwardBias(
     const DeviceMemory<double>& input_data,
     const BatchDescriptor& bias_descriptor,
     DeviceMemory<double>* backward_bias_data) {
-  return DoConvolveBackwardBiasImpl(stream, CUDNN_DATA_DOUBLE, input_descriptor,
-                                    input_data, bias_descriptor,
-                                    backward_bias_data);
+  return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
+                                    bias_descriptor, backward_bias_data);
 }
 
 bool CudnnSupport::DoConvolveBackwardBias(
@@ -3643,9 +3647,8 @@ bool CudnnSupport::DoConvolveBackwardBias(
     const DeviceMemory<float>& input_data,
     const BatchDescriptor& bias_descriptor,
     DeviceMemory<float>* backward_bias_data) {
-  return DoConvolveBackwardBiasImpl(stream, CUDNN_DATA_FLOAT, input_descriptor,
-                                    input_data, bias_descriptor,
-                                    backward_bias_data);
+  return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
+                                    bias_descriptor, backward_bias_data);
 }
 
 bool CudnnSupport::DoConvolveBackwardBias(
@@ -3653,9 +3656,8 @@ bool CudnnSupport::DoConvolveBackwardBias(
     const DeviceMemory<Eigen::half>& input_data,
     const BatchDescriptor& bias_descriptor,
     DeviceMemory<Eigen::half>* backward_bias_data) {
-  return DoConvolveBackwardBiasImpl(stream, CUDNN_DATA_HALF, input_descriptor,
-                                    input_data, bias_descriptor,
-                                    backward_bias_data);
+  return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
+                                    bias_descriptor, backward_bias_data);
 }
 
 bool CudnnSupport::DoMatMul(Stream* stream,
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index ee28c0bf57..40aa974dd9 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -611,7 +611,6 @@ class CudnnSupport : public dnn::DnnSupport {
   template<class T>
   DeviceMemory<T> MaybeTransformLayout(
       Stream* stream,
-      int cudnn_type,  // Actually cudnnDataType_t.
       dnn::BatchDescriptor* output_descriptor,
       DeviceMemory<T> backward_output_data,
       std::unique_ptr<TemporaryDeviceMemory<T>>* transform_scratch)
@@ -644,7 +643,6 @@ class CudnnSupport : public dnn::DnnSupport {
 
   template <class T>
   bool DoConvolveImpl(Stream* stream,
-                      int cudnn_type,  // Actually cudnnDataType_t.
                       const dnn::BatchDescriptor& batch_descriptor,
                       const DeviceMemory<T>& input_data,
                       const dnn::FilterDescriptor& filter_descriptor,
@@ -675,7 +673,6 @@ class CudnnSupport : public dnn::DnnSupport {
   template <class T>
   bool DoConvolveBackwardDataImpl(
       Stream* stream,
-      int cudnn_type,  // Actually cudnnDataType_t.
       const dnn::FilterDescriptor& filter_descriptor,
       const DeviceMemory<T>& filter_data,
       const dnn::BatchDescriptor& output_descriptor,
@@ -688,8 +685,7 @@ class CudnnSupport : public dnn::DnnSupport {
 
   template <class T>
   bool DoConvolveBackwardFilterImpl(
-      Stream* stream, int cudnn_type,  // Actually cudnnDataType_t.
-      const dnn::BatchDescriptor& input_descriptor,
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<T>& input_data,
       const dnn::BatchDescriptor& output_descriptor_in,
       DeviceMemory<T> backward_output_data,
@@ -702,7 +698,6 @@ class CudnnSupport : public dnn::DnnSupport {
 
   template <class T>
   bool DoConvolveBackwardBiasImpl(Stream* stream,
-                                  int cudnn_type,  // Actually cudnnDataType_t.
                                   const dnn::BatchDescriptor& input_descriptor,
                                   const DeviceMemory<T>& input_data,
                                   const dnn::BatchDescriptor& bias_descriptor,
-- 
GitLab


From b3b5d4f06129a3b3b34d1f9310c52155f80a56c0 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 8 Feb 2018 10:35:08 -0800
Subject: [PATCH 825/966] Fix broken link in CONTRIBUTING.md (#16869)

This fix fixes the broken link in CONTRIBUTING.md.
Without `https://`, the markdown will render the link incorrectly to
https://github.com/tensorflow/tensorflow/blob/master/www.docker.com

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 16d7051343..2e2182bb21 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -163,7 +163,7 @@ There are two ways to run TensorFlow unit tests.
    bazel test ${flags} //tensorflow/python/...
    ```
 
-2. Using [Docker](www.docker.com) and TensorFlow's CI scripts.
+2. Using [Docker](https://www.docker.com) and TensorFlow's CI scripts.
 
    ```bash
    # Install Docker first, then this will build and run cpu tests
-- 
GitLab


From 4d9645a687d8f35b1cd4b3f9303be584d2357142 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 8 Feb 2018 10:41:03 -0800
Subject: [PATCH 826/966] Improve shape function of SampleDistortedBoundingBox
 and fix some test cases (#16870)

* Update test case to expose the issue of sample_distorted_bounding_box

This commit expoes the issue of sample_distorted_bounding_box,
as shape function does not check the ranking like inside Compute().

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Adjust shape to pass the test.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Improve shape function of SampleDistortedBoundingBox and fix some test cases

This fix tries to improve the shape function of SampleDistortedBoundingBox
and fix several test case issues.
As is shown in the kernel of SampleDistortedBoundingBox, the shape of
SampleDistortedBoundingBox are required to be 1-D `[height, width, channels]`
for image_size, 3-D with shape `[batch, N, 4]` for bounding_boxes.

In the test case, the uses shape is incorrect but becasue of no check in shape
function, the test case passes.

This fix adds the shape check for SampleDistortedBoundingBox, and
fixes the incorrect test cases.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/image_ops.cc        | 24 ++++++++++++++++++++++++
 tensorflow/python/ops/image_ops_test.py | 17 ++++++++++++-----
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index a62e2d782b..4c05b274fe 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -455,6 +455,17 @@ REGISTER_OP("SampleDistortedBoundingBox")
     .Attr("use_image_if_no_bounding_boxes: bool = false")
     .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle image_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &image_size));
+      ShapeHandle bounding_boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &bounding_boxes));
+      // image_size: 1-D with [height, width, channels]
+      // bounding_boxes: 3-D with shape [batch, N, 4]
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(image_size, 0), 3, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(bounding_boxes, 2), 4, &unused));
+
       c->set_output(0, c->Vector(3));
       c->set_output(1, c->Vector(3));
       c->set_output(2, c->MakeShape({1, 1, 4}));
@@ -477,6 +488,19 @@ REGISTER_OP("SampleDistortedBoundingBoxV2")
     .Attr("use_image_if_no_bounding_boxes: bool = false")
     .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle image_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &image_size));
+      ShapeHandle bounding_boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &bounding_boxes));
+      ShapeHandle min_object_covered;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &min_object_covered));
+      // image_size: 1-D with [height, width, channels]
+      // bounding_boxes: 3-D with shape [batch, N, 4]
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(image_size, 0), 3, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(bounding_boxes, 2), 4, &unused));
+
       c->set_output(0, c->Vector(3));
       c->set_output(1, c->Vector(3));
       c->set_output(2, c->MakeShape({1, 1, 4}));
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 0dc1c56e7d..9aa14a86f7 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -1869,8 +1869,8 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       image_size = constant_op.constant(
           [40, 50, 1], shape=[3], dtype=dtypes.int32)
       bounding_box = constant_op.constant(
-          [0.0, 0.0, 1.0, 1.0],
-          shape=[4],
+          [[[0.0, 0.0, 1.0, 1.0]]],
+          shape=[1, 1, 4],
           dtype=dtypes.float32,
       )
       begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
@@ -1884,6 +1884,10 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], begin.get_shape().as_list())
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
+      # Actual run to make sure shape is correct inside Compute().
+      begin = begin.eval()
+      end = end.eval()
+      bbox_for_drawing = bbox_for_drawing.eval()
 
       begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
           image_size=image_size,
@@ -1903,8 +1907,8 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       image_size = constant_op.constant(
           [40, 50, 1], shape=[3], dtype=dtypes.int32)
       bounding_box = constant_op.constant(
-          [0.0, 0.0, 1.0, 1.0],
-          shape=[4],
+          [[[0.0, 0.0, 1.0, 1.0]]],
+          shape=[1, 1, 4],
           dtype=dtypes.float32,)
       begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
           image_size=image_size,
@@ -1915,7 +1919,10 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], begin.get_shape().as_list())
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
-
+      # Actual run to make sure shape is correct inside Compute().
+      begin = begin.eval()
+      end = end.eval()
+      bbox_for_drawing = bbox_for_drawing.eval()
 
 class ResizeImagesTest(test_util.TensorFlowTestCase):
 
-- 
GitLab


From 40ef7e13af4bb92585369bcf1a0f658ea6bd27b3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 10:37:40 -0800
Subject: [PATCH 827/966] Update for LLVM API change r324405

PiperOrigin-RevId: 185016276
---
 .../xla/service/cpu/simple_orc_jit.cc         | 36 ++++++++++++++++---
 .../compiler/xla/service/cpu/simple_orc_jit.h |  5 +++
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 34c8e31060..acbb7557fc 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <utility>
 
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/JITSymbol.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/Support/CodeGen.h"
@@ -139,10 +140,35 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
                                 /*MAttrs=*/DetectMachineAttributes()))),
       disassembler_(*target_machine_),
       data_layout_(target_machine_->createDataLayout()),
-      object_layer_([] {
-        return std::make_shared<llvm::SectionMemoryManager>(
-            orc_jit_memory_mapper::GetInstance());
-      }),
+      execution_session_(string_pool_),
+      symbol_resolver_(llvm::orc::createLegacyLookupResolver(
+          [this](const std::string& name) -> llvm::JITSymbol {
+            if (const uint8* from_constant_pool =
+                    external_constant_pool_.Find(string(name))) {
+              return llvm::JITEvaluatedSymbol(
+                  reinterpret_cast<uint64_t>(from_constant_pool),
+                  llvm::JITSymbolFlags::None);
+            }
+
+            void* func_addr = CustomCallTargetRegistry::Global()->Lookup(name);
+            if (func_addr == nullptr) {
+              return nullptr;
+            }
+            llvm::JITEvaluatedSymbol symbol_info(
+                reinterpret_cast<uint64_t>(func_addr),
+                llvm::JITSymbolFlags::None);
+            return symbol_info;
+          },
+          [](llvm::Error Err) {
+            cantFail(std::move(Err), "lookupFlags failed");
+          })),
+      object_layer_(
+          execution_session_,
+          [](llvm::orc::VModuleKey) {
+            return std::make_shared<llvm::SectionMemoryManager>(
+                orc_jit_memory_mapper::GetInstance());
+          },
+          [this](llvm::orc::VModuleKey K) { return symbol_resolver_; }),
       compile_layer_(
           object_layer_,
           CompilerFunctor(target_machine_.get(), &disassembler_, opt_level,
@@ -157,7 +183,7 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
 SimpleOrcJIT::ModuleHandleT SimpleOrcJIT::AddModule(
     std::unique_ptr<llvm::Module> module) {
   auto handle = cantFail(compile_layer_.addModule(
-      std::move(module), MakeUnique<SimpleResolver>(external_constant_pool())));
+      execution_session_.allocateVModule(), std::move(module)));
   module_handles_.push_back(handle);
   return handle;
 }
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index ded01e9e4d..0f6456170f 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -21,8 +21,10 @@ limitations under the License.
 #include <vector>
 
 #include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Target/TargetMachine.h"
 #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h"
@@ -100,6 +102,9 @@ class SimpleOrcJIT {
   std::unique_ptr<llvm::TargetMachine> target_machine_;
   const Disassembler disassembler_;
   const llvm::DataLayout data_layout_;
+  llvm::orc::SymbolStringPool string_pool_;
+  llvm::orc::ExecutionSession execution_session_;
+  std::shared_ptr<llvm::orc::SymbolResolver> symbol_resolver_;
   ObjLayerT object_layer_;
   CompileLayerT compile_layer_;
   ExternalConstantPool external_constant_pool_;
-- 
GitLab


From 5dbd3d79195d1763d9d5d52ccd22d1889400ca8b Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Thu, 8 Feb 2018 10:45:37 -0800
Subject: [PATCH 828/966] Make configure script more lenient to the length of
 CUDA and cuDNN versions entered. (#16853)

---
 configure.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/configure.py b/configure.py
index 151ad5dba8..5e481739a9 100644
--- a/configure.py
+++ b/configure.py
@@ -827,6 +827,28 @@ def set_gcc_host_compiler_path(environ_cp):
   write_action_env_to_bazelrc('GCC_HOST_COMPILER_PATH', gcc_host_compiler_path)
 
 
+def reformat_version_sequence(version_str, sequence_count):
+  """Reformat the version string to have the given number of sequences.
+
+  For example:
+  Given (7, 2) -> 7.0
+        (7.0.1, 2) -> 7.0
+        (5, 1) -> 5
+        (5.0.3.2, 1) -> 5
+
+  Args:
+      version_str: String, the version string.
+      sequence_count: int, an integer.
+  Returns:
+      string, reformatted version string.
+  """
+  v = version_str.split('.')
+  if len(v) < sequence_count:
+    v = v + (['0'] * (sequence_count - len(v)))
+
+  return '.'.join(v[:sequence_count])
+
+
 def set_tf_cuda_version(environ_cp):
   """Set CUDA_TOOLKIT_PATH and TF_CUDA_VERSION."""
   ask_cuda_version = (
@@ -837,6 +859,7 @@ def set_tf_cuda_version(environ_cp):
     # Configure the Cuda SDK version to use.
     tf_cuda_version = get_from_env_or_user_or_default(
         environ_cp, 'TF_CUDA_VERSION', ask_cuda_version, _DEFAULT_CUDA_VERSION)
+    tf_cuda_version = reformat_version_sequence(str(tf_cuda_version), 2)
 
     # Find out where the CUDA toolkit is installed
     default_cuda_path = _DEFAULT_CUDA_PATH
@@ -893,6 +916,7 @@ def set_tf_cudnn_version(environ_cp):
     tf_cudnn_version = get_from_env_or_user_or_default(
         environ_cp, 'TF_CUDNN_VERSION', ask_cudnn_version,
         _DEFAULT_CUDNN_VERSION)
+    tf_cudnn_version = reformat_version_sequence(str(tf_cudnn_version) ,1)
 
     default_cudnn_path = environ_cp.get('CUDA_TOOLKIT_PATH')
     ask_cudnn_path = (r'Please specify the location where cuDNN %s library is '
-- 
GitLab


From 169337f8dae87e75e4535fb22fcaac85d341036c Mon Sep 17 00:00:00 2001
From: Glenn Weidner <gweidner@us.ibm.com>
Date: Thu, 8 Feb 2018 10:46:12 -0800
Subject: [PATCH 829/966] Fix error message in record_reader (#16808)

---
 tensorflow/core/lib/io/record_reader.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index 9cc6c4034f..254fdf115d 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -49,7 +49,7 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions(
 #endif  // IS_SLIM_BUILD
   } else if (compression_type != compression::kNone) {
     LOG(ERROR) << "Unsupported compression_type:" << compression_type
-               << ". No comprression will be used.";
+               << ". No compression will be used.";
   }
   return options;
 }
-- 
GitLab


From 1287b1ce7d9af83326e8d6e8f9955d9bdd9c30ec Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 8 Feb 2018 10:39:32 -0800
Subject: [PATCH 830/966] [tf.data] Remove deprecated reader dataset classes in
 `tf.contrib.data`.

This change removes the following classes:
* `tf.contrib.data.FixedLengthRecordDataset`
* `tf.contrib.data.TextLineDataset`
* `tf.contrib.data.TFRecordDataset`

IF THIS BREAKS YOU: Replace `tf.contrib.data` with `tf.data` when constructing a
`FixedLengthRecordDataset`, `TextLineDataset`, or `TFRecordDataset`. Note that
you may have to modify downstream transformations to use the core API. See
"tensorflow/contrib/data/README.md" for details of how to update your code to use
the core API.

PiperOrigin-RevId: 185016587
---
 tensorflow/contrib/data/__init__.py           |   6 -
 .../python/kernel_tests/iterator_ops_test.py  |   2 +-
 .../kernel_tests/reader_dataset_ops_test.py   | 317 +-----------------
 tensorflow/contrib/data/python/ops/readers.py |  81 +----
 4 files changed, 11 insertions(+), 395 deletions(-)

diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index daeb6a6105..22de13b558 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -23,9 +23,6 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@Dataset
 @@Counter
 @@Iterator
-@@TFRecordDataset
-@@FixedLengthRecordDataset
-@@TextLineDataset
 
 @@batch_and_drop_remainder
 @@dense_to_sparse_batch
@@ -66,11 +63,8 @@ from tensorflow.contrib.data.python.ops.grouping import group_by_window
 from tensorflow.contrib.data.python.ops.interleave_ops import parallel_interleave
 from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave
 from tensorflow.contrib.data.python.ops.iterator_ops import make_saveable_from_iterator
-from tensorflow.contrib.data.python.ops.readers import FixedLengthRecordDataset
 from tensorflow.contrib.data.python.ops.readers import read_batch_features
 from tensorflow.contrib.data.python.ops.readers import SqlDataset
-from tensorflow.contrib.data.python.ops.readers import TextLineDataset
-from tensorflow.contrib.data.python.ops.readers import TFRecordDataset
 from tensorflow.contrib.data.python.ops.resampling import rejection_resample
 from tensorflow.contrib.data.python.ops.scan_ops import scan
 from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
index bda9a2a4a3..9d11865dda 100644
--- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
@@ -21,10 +21,10 @@ import os
 import numpy as np
 
 from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.contrib.data.python.ops import readers
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import readers
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 1c42a3d855..6efe97444a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.data.python.ops import readers
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -76,101 +77,12 @@ class TextLineDatasetTestBase(test.TestCase):
     return filenames
 
 
-class TextLineDatasetTest(TextLineDatasetTestBase):
-
-  def _testTextLineDataset(self, compression_type=None):
-    test_filenames = self._createFiles(
-        2, 5, crlf=True, compression_type=compression_type)
-    filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = readers.TextLineDataset(
-        filenames, compression_type=compression_type).repeat(num_epochs)
-    batch_dataset = repeat_dataset.batch(batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    init_op = iterator.make_initializer(repeat_dataset)
-    init_batch_op = iterator.make_initializer(batch_dataset)
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[0]],
-                              num_epochs: 1})
-      for i in range(5):
-        self.assertEqual(self._lineText(0, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[1]],
-                              num_epochs: 1})
-      for i in range(5):
-        self.assertEqual(self._lineText(1, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 1})
-      for j in range(2):
-        for i in range(5):
-          self.assertEqual(self._lineText(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test repeated iteration through both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 10})
-      for _ in range(10):
-        for j in range(2):
-          for i in range(5):
-            self.assertEqual(self._lineText(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test batched and repeated iteration through both files.
-      sess.run(
-          init_batch_op,
-          feed_dict={filenames: test_filenames,
-                     num_epochs: 10,
-                     batch_size: 5})
-      for _ in range(10):
-        self.assertAllEqual([self._lineText(0, i) for i in range(5)],
-                            sess.run(get_next))
-        self.assertAllEqual([self._lineText(1, i) for i in range(5)],
-                            sess.run(get_next))
-
-  def testTextLineDatasetNoCompression(self):
-    self._testTextLineDataset()
-
-  def testTextLineDatasetGzipCompression(self):
-    self._testTextLineDataset(compression_type="GZIP")
-
-  def testTextLineDatasetZlibCompression(self):
-    self._testTextLineDataset(compression_type="ZLIB")
-
-  def testTextLineDatasetBuffering(self):
-    test_filenames = self._createFiles(2, 5, crlf=True)
-
-    repeat_dataset = readers.TextLineDataset(test_filenames, buffer_size=10)
-    iterator = repeat_dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      for j in range(2):
-        for i in range(5):
-          self.assertEqual(self._lineText(j, i), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-
 class TextLineDatasetSerializationTest(
     TextLineDatasetTestBase,
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def _build_iterator_graph(self, test_filenames, compression_type=None):
-    return readers.TextLineDataset(
+    return core_readers.TextLineDataset(
         test_filenames, compression_type=compression_type, buffer_size=10)
 
   def testTextLineCore(self):
@@ -217,101 +129,13 @@ class FixedLengthRecordReaderTestBase(test.TestCase):
     return filenames
 
 
-class FixedLengthRecordReaderTest(FixedLengthRecordReaderTestBase):
-
-  def testFixedLengthRecordDataset(self):
-    test_filenames = self._createFiles()
-    filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = (readers.FixedLengthRecordDataset(
-        filenames, self._record_bytes, self._header_bytes, self._footer_bytes)
-                      .repeat(num_epochs))
-    batch_dataset = repeat_dataset.batch(batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    init_op = iterator.make_initializer(repeat_dataset)
-    init_batch_op = iterator.make_initializer(batch_dataset)
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[0]],
-                              num_epochs: 1})
-      for i in range(self._num_records):
-        self.assertEqual(self._record(0, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[1]],
-                              num_epochs: 1})
-      for i in range(self._num_records):
-        self.assertEqual(self._record(1, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 1})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertEqual(self._record(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test repeated iteration through both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 10})
-      for _ in range(10):
-        for j in range(self._num_files):
-          for i in range(self._num_records):
-            self.assertEqual(self._record(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test batched and repeated iteration through both files.
-      sess.run(
-          init_batch_op,
-          feed_dict={
-              filenames: test_filenames,
-              num_epochs: 10,
-              batch_size: self._num_records
-          })
-      for _ in range(10):
-        for j in range(self._num_files):
-          self.assertAllEqual(
-              [self._record(j, i) for i in range(self._num_records)],
-              sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFixedLengthRecordDatasetBuffering(self):
-    test_filenames = self._createFiles()
-    dataset = readers.FixedLengthRecordDataset(
-        test_filenames,
-        self._record_bytes,
-        self._header_bytes,
-        self._footer_bytes,
-        buffer_size=10)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertEqual(self._record(j, i), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-
 class FixedLengthRecordDatasetSerializationTest(
     FixedLengthRecordReaderTestBase,
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def _build_iterator_graph(self, num_epochs, compression_type=None):
     filenames = self._createFiles()
-    return readers.FixedLengthRecordDataset(
+    return core_readers.FixedLengthRecordDataset(
         filenames, self._record_bytes, self._header_bytes,
         self._footer_bytes).repeat(num_epochs)
 
@@ -338,9 +162,8 @@ class TFRecordDatasetTestBase(test.TestCase):
     self.compression_type = array_ops.placeholder_with_default("", shape=[])
     self.batch_size = array_ops.placeholder(dtypes.int64, shape=[])
 
-    repeat_dataset = readers.TFRecordDataset(self.filenames,
-                                             self.compression_type).repeat(
-                                                 self.num_epochs)
+    repeat_dataset = core_readers.TFRecordDataset(
+        self.filenames, self.compression_type).repeat(self.num_epochs)
     batch_dataset = repeat_dataset.batch(self.batch_size)
 
     iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
@@ -363,129 +186,6 @@ class TFRecordDatasetTestBase(test.TestCase):
     return filenames
 
 
-class TFRecordDatasetTest(TFRecordDatasetTestBase):
-
-  def testReadOneEpoch(self):
-    with self.test_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.filenames: [self.test_filenames[0]],
-              self.num_epochs: 1
-          })
-      for i in range(self._num_records):
-        self.assertAllEqual(self._record(0, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.filenames: [self.test_filenames[1]],
-              self.num_epochs: 1
-          })
-      for i in range(self._num_records):
-        self.assertAllEqual(self._record(1, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-      # Basic test: read from both files.
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: self.test_filenames,
-                     self.num_epochs: 1})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadTenEpochs(self):
-    with self.test_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: self.test_filenames,
-                     self.num_epochs: 10})
-      for _ in range(10):
-        for j in range(self._num_files):
-          for i in range(self._num_records):
-            self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadTenEpochsOfBatches(self):
-    with self.test_session() as sess:
-      sess.run(
-          self.init_batch_op,
-          feed_dict={
-              self.filenames: self.test_filenames,
-              self.num_epochs: 10,
-              self.batch_size: self._num_records
-          })
-      for _ in range(10):
-        for j in range(self._num_files):
-          values = sess.run(self.get_next)
-          self.assertAllEqual(
-              [self._record(j, i) for i in range(self._num_records)], values)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadZlibFiles(self):
-    zlib_files = []
-    for i, fn in enumerate(self.test_filenames):
-      with open(fn, "rb") as f:
-        cdata = zlib.compress(f.read())
-
-        zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.z" % i)
-        with open(zfn, "wb") as f:
-          f.write(cdata)
-        zlib_files.append(zfn)
-
-    with self.test_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: zlib_files,
-                     self.compression_type: "ZLIB"})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadGzipFiles(self):
-    gzip_files = []
-    for i, fn in enumerate(self.test_filenames):
-      with open(fn, "rb") as f:
-        gzfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % i)
-        with gzip.GzipFile(gzfn, "wb") as gzf:
-          gzf.write(f.read())
-        gzip_files.append(gzfn)
-
-    with self.test_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: gzip_files,
-                     self.compression_type: "GZIP"})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadWithBuffer(self):
-    one_mebibyte = 2**20
-    d = readers.TFRecordDataset(self.test_filenames, buffer_size=one_mebibyte)
-    iterator = d.make_one_shot_iterator()
-    with self.test_session() as sess:
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-
 class TFRecordDatasetSerializationTest(
     TFRecordDatasetTestBase,
     dataset_serialization_test_base.DatasetSerializationTestBase):
@@ -517,7 +217,7 @@ class TFRecordDatasetSerializationTest(
           gzip_files.append(gzfn)
       filenames = gzip_files
 
-    return readers.TFRecordDataset(
+    return core_readers.TFRecordDataset(
         filenames, compression_type,
         buffer_size=buffer_size).repeat(num_epochs).batch(batch_size)
 
@@ -575,7 +275,7 @@ class ReadBatchFeaturesTest(test.TestCase):
             "record": parsing_ops.FixedLenFeature([], dtypes.int64),
             "keywords": parsing_ops.VarLenFeature(dtypes.string)
         },
-        reader=readers.TFRecordDataset,
+        reader=core_readers.TFRecordDataset,
         randomize_input=False,
         num_epochs=self.num_epochs)
 
@@ -714,12 +414,11 @@ class ReadBatchFeaturesTest(test.TestCase):
               self._next_actual_batch(sess)
 
   def testReadWithEquivalentDataset(self):
-    # TODO(mrry): Add support for tf.SparseTensor as a Dataset component.
     features = {
         "file": parsing_ops.FixedLenFeature([], dtypes.int64),
         "record": parsing_ops.FixedLenFeature([], dtypes.int64),
     }
-    dataset = (readers.TFRecordDataset(self.test_filenames)
+    dataset = (core_readers.TFRecordDataset(self.test_filenames)
                .map(lambda x: parsing_ops.parse_single_example(x, features))
                .repeat(10).batch(2))
     iterator = dataset.make_initializable_iterator()
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 347e5edc7b..57f3010277 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -17,9 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import dataset_ops as contrib_dataset_ops
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -27,74 +25,6 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import gfile
-from tensorflow.python.util import deprecation
-
-
-class TextLineDataset(contrib_dataset_ops.Dataset):
-  """A `Dataset` comprising lines from one or more text files."""
-
-  @deprecation.deprecated(None, "Use `tf.data.TextLineDataset`.")
-  def __init__(self, filenames, compression_type=None, buffer_size=None):
-    """Creates a `TextLineDataset`.
-
-    Args:
-      filenames: A `tf.string` tensor containing one or more filenames.
-      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
-        `""` (no compression), `"ZLIB"`, or `"GZIP"`.
-      buffer_size: (Optional.) A `tf.int64` scalar denoting the number of bytes
-        to buffer. A value of 0 results in the default buffering values chosen
-        based on the compression type.
-    """
-    dataset = readers.TextLineDataset(filenames, compression_type,
-                                      buffer_size)
-    super(TextLineDataset, self).__init__(dataset)
-
-
-class TFRecordDataset(contrib_dataset_ops.Dataset):
-  """A `Dataset` comprising records from one or more TFRecord files."""
-
-  @deprecation.deprecated(None, "Use `tf.data.TFRecordDataset`.")
-  def __init__(self, filenames, compression_type=None, buffer_size=None):
-    """Creates a `TFRecordDataset`.
-
-    Args:
-      filenames: A `tf.string` tensor containing one or more filenames.
-      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
-        `""` (no compression), `"ZLIB"`, or `"GZIP"`.
-      buffer_size: (Optional.) A `tf.int64` scalar representing the number of
-        bytes in the read buffer. 0 means no buffering.
-    """
-    dataset = readers.TFRecordDataset(filenames, compression_type,
-                                      buffer_size)
-    super(TFRecordDataset, self).__init__(dataset)
-
-
-class FixedLengthRecordDataset(contrib_dataset_ops.Dataset):
-  """A `Dataset` of fixed-length records from one or more binary files."""
-
-  @deprecation.deprecated(None, "Use `tf.data.FixedLengthRecordDataset`.")
-  def __init__(self,
-               filenames,
-               record_bytes,
-               header_bytes=None,
-               footer_bytes=None,
-               buffer_size=None):
-    """Creates a `FixedLengthRecordDataset`.
-
-    Args:
-      filenames: A `tf.string` tensor containing one or more filenames.
-      record_bytes: A `tf.int64` scalar representing the number of bytes in
-        each record.
-      header_bytes: (Optional.) A `tf.int64` scalar representing the number of
-        bytes to skip at the start of a file.
-      footer_bytes: (Optional.) A `tf.int64` scalar representing the number of
-        bytes to ignore at the end of a file.
-      buffer_size: (Optional.) A `tf.int64` scalar representing the number of
-        bytes to buffer when reading.
-    """
-    dataset = readers.FixedLengthRecordDataset(
-        filenames, record_bytes, header_bytes, footer_bytes, buffer_size)
-    super(FixedLengthRecordDataset, self).__init__(dataset)
 
 
 def read_batch_features(file_pattern,
@@ -216,14 +146,7 @@ def _get_file_names(file_pattern, randomize_input):
   return file_names
 
 
-class SqlDataset(contrib_dataset_ops.Dataset):
-
-  def __init__(self, driver_name, data_source_name, query, output_types):
-    dataset = _SqlDataset(driver_name, data_source_name, query, output_types)
-    super(SqlDataset, self).__init__(dataset)
-
-
-class _SqlDataset(dataset_ops.Dataset):
+class SqlDataset(dataset_ops.Dataset):
   """A `Dataset` consisting of the results from a SQL query."""
 
   def __init__(self, driver_name, data_source_name, query, output_types):
@@ -255,7 +178,7 @@ class _SqlDataset(dataset_ops.Dataset):
       output_types: A tuple of `tf.DType` objects representing the types of the
         columns returned by `query`.
     """
-    super(_SqlDataset, self).__init__()
+    super(SqlDataset, self).__init__()
     self._driver_name = ops.convert_to_tensor(
         driver_name, dtype=dtypes.string, name="driver_name")
     self._data_source_name = ops.convert_to_tensor(
-- 
GitLab


From 4272366ff2152434cc5cdd94595a2352e5a0d826 Mon Sep 17 00:00:00 2001
From: Brad Wannow <wannow21@gmail.com>
Date: Thu, 8 Feb 2018 12:47:34 -0600
Subject: [PATCH 831/966] Update CONTRIBUTING.md (#16806)

* Update CONTRIBUTING.md

Edited a few grammar issues.

* Fix
---
 CONTRIBUTING.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2e2182bb21..3dad41a88c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -41,7 +41,7 @@ TensorFlow coding style.
 #### General guidelines and philosophy for contribution
 
 * Include unit tests when you contribute new features, as they help to
-  a) prove that your code works correctly, b) guard against future breaking
+  a) prove that your code works correctly, and b) guard against future breaking
   changes to lower the maintenance cost.
 * Bug fixes also generally require unit tests, because the presence of bugs
   usually indicates insufficient test coverage.
@@ -51,7 +51,7 @@ TensorFlow coding style.
   non-backward-compatible API changes without a major release. Reviewers of your
   pull request will comment on any API compatibility issues.
 * When you contribute a new feature to TensorFlow, the maintenance burden is (by
-  default) transferred to the TensorFlow team. This means that benefit of
+  default) transferred to the TensorFlow team. This means that benefit of the
   contribution must be compared against the cost of maintaining the feature.
 * Full new features (e.g., a new op implementing a cutting-edge algorithm)
   typically will live in
-- 
GitLab


From 38569baeb896c50b90d6f613f2ab2637e2708a34 Mon Sep 17 00:00:00 2001
From: Rajendra arora <rajendraarora16@yahoo.com>
Date: Fri, 9 Feb 2018 00:18:19 +0530
Subject: [PATCH 832/966] Fixes variable name (#16797)

* Fixes variable typo name

* fixes optional typo

* Fix typo

* fixes deallocations

* typo fix

* typo fix

* fixes variable name

* fixes param typo

* Fixes typo

* fix typo

* fixes
---
 tensorflow/compiler/tf2xla/graph_compiler.h          |  2 +-
 tensorflow/compiler/xla/index_util.h                 |  2 +-
 .../compiler/xla/service/buffer_assignment_test.cc   |  2 +-
 tensorflow/compiler/xla/service/cpu/ir_emitter.cc    |  2 +-
 tensorflow/compiler/xla/service/cpu/ir_function.cc   |  4 ++--
 tensorflow/contrib/all_reduce/python/all_reduce.py   |  2 +-
 tensorflow/contrib/data/python/ops/dataset_ops.py    |  2 +-
 .../contrib/eager/python/examples/gan/README.md      |  2 +-
 .../contrib/layers/python/layers/layers_test.py      |  4 ++--
 tensorflow/contrib/lite/simple_memory_arena.h        |  4 ++--
 .../tensorflow_graph_matching/resolve_svdf_test.cc   | 12 ++++++------
 tensorflow/contrib/metrics/python/ops/metric_ops.py  |  2 +-
 tensorflow/core/framework/op_def_util.h              |  2 +-
 tensorflow/core/framework/op_kernel.cc               |  2 +-
 tensorflow/core/kernels/constant_op_test.cc          |  2 +-
 tensorflow/core/lib/gtl/optional.h                   |  2 +-
 tensorflow/python/data/ops/dataset_ops.py            |  2 +-
 tensorflow/python/debug/cli/cli_shared.py            |  2 +-
 .../python/kernel_tests/conv2d_transpose_test.py     |  6 +++---
 tensorflow/stream_executor/dnn.h                     |  2 +-
 20 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/graph_compiler.h b/tensorflow/compiler/tf2xla/graph_compiler.h
index ba00160b6d..127562eb23 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.h
+++ b/tensorflow/compiler/tf2xla/graph_compiler.h
@@ -70,7 +70,7 @@ class GraphCompiler {
 
  private:
   // Partially sets params. This partially set params can be reused
-  // across multple nodes visit.
+  // across multiple nodes visit.
   void PartiallySetupParams(OpKernelContext::Params* params);
 
   // Tests if a node is a functional node. A functional node represents a
diff --git a/tensorflow/compiler/xla/index_util.h b/tensorflow/compiler/xla/index_util.h
index 0b9188e852..142006f262 100644
--- a/tensorflow/compiler/xla/index_util.h
+++ b/tensorflow/compiler/xla/index_util.h
@@ -37,7 +37,7 @@ class IndexUtil {
   static int64 MultidimensionalIndexToLinearIndex(
       const Shape& shape, tensorflow::gtl::ArraySlice<int64> multi_index);
 
-  // Coverts a linear index into multidimensional index (eg {x, y, z}) based on
+  // Converts a linear index into multidimensional index (eg {x, y, z}) based on
   // the shape and its layout. The first index in the returned multidimensional
   // index is dimension 0.
   static std::vector<int64> LinearIndexToMultidimensionalIndex(
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 6fc9d783f1..f3aeba8d61 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -614,7 +614,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) {
   BufferAllocation map_buffer = GetAssignedOutputAllocation(*buffers, map);
   EXPECT_NE(param0_buffer.index(), map_buffer.index());
 
-  // The final computation node of the map is an add of an f32 parm and a
+  // The final computation node of the map is an add of an f32 param and a
   // constant.
   EXPECT_EQ(HloOpcode::kAdd, inner_last->opcode());
   const BufferAllocation& inner_add_buffer =
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 0b2d3d4746..d9eeb1c3bd 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -1332,7 +1332,7 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
   if (ShapeUtil::ElementIsComplex(root_shape)) {
     // TODO(b/65408531): Complex add could by done via bitcast to <float x [2N]>
     // Complex multiply would be more challenging. We could perhaps use a
-    // strided load to get all reals in a vector, all imags in a vector, or use
+    // strided load to get all reals in a vector, all images in a vector, or use
     // CreateShuffleVector on a bitcast to float x [2N].
     *failure_reason = "complex values not supported";
     return nullptr;
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc
index ca8c290dd1..2d6f2f3818 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc
@@ -209,9 +209,9 @@ std::vector<llvm::Value*> GetArrayFunctionCallArguments(
         parameter_addresses[i], ir_builder->getInt8PtrTy(),
         AsStringRef(tensorflow::strings::StrCat(name, "_parameter_", i,
                                                 "_address_as_i8ptr")));
-    llvm::Value* slot_in_param_adresses = ir_builder->CreateInBoundsGEP(
+    llvm::Value* slot_in_param_addresses = ir_builder->CreateInBoundsGEP(
         parameter_addresses_buffer, {ir_builder->getInt64(i)});
-    ir_builder->CreateStore(parameter_as_i8ptr, slot_in_param_adresses);
+    ir_builder->CreateStore(parameter_as_i8ptr, slot_in_param_addresses);
   }
 
   const auto to_int8_ptr = [=](llvm::Value* ptr) {
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index 28f60b3499..4c67deb1ff 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -758,7 +758,7 @@ def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
 
 
 def _reduce_non_singleton(input_tensors, red_f, un_op):
-  """If input_tenors has more than one element apply red_f, else apply un_op."""
+  """If input_tensors has more than one element apply red_f, else apply un_op."""
   if len(input_tensors) > 1:
     return red_f(input_tensors)
   else:
diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py
index fafd231061..ff15c4451a 100644
--- a/tensorflow/contrib/data/python/ops/dataset_ops.py
+++ b/tensorflow/contrib/data/python/ops/dataset_ops.py
@@ -483,7 +483,7 @@ class Dataset(dataset_ops.Dataset):
           num_threads=None,
           output_buffer_size=None,
           num_parallel_calls=None):
-    """Maps `map_func` across this datset.
+    """Maps `map_func` across this dataset.
 
     Args:
       map_func: A function mapping a nested structure of tensors (having
diff --git a/tensorflow/contrib/eager/python/examples/gan/README.md b/tensorflow/contrib/eager/python/examples/gan/README.md
index e8c9db1a1e..208a64b05d 100644
--- a/tensorflow/contrib/eager/python/examples/gan/README.md
+++ b/tensorflow/contrib/eager/python/examples/gan/README.md
@@ -11,7 +11,7 @@ Other eager execution examples can be found under the parent directory.
 - `mnist.py`: Model definitions and training routines.
 - `mnist_test.py`: Benchmarks for training and using the models using eager
 execution.
-- `mnist_graph_test.py`: Benchmarks for trainig and using the models using
+- `mnist_graph_test.py`: Benchmarks for training and using the models using
 graph execution. The same model definitions and loss functions are used in
 all benchmarks.
 
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 8945690db8..2bd6c6eb58 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -708,7 +708,7 @@ class Convolution2dTransposeTests(test.TestCase):
         _layers.convolution2d_transpose(images, 32, 3, data_format='CHWN')
 
   def testOutputSizeWithStrideOneSamePaddingNCHW(self):
-    # `NCHW` data fomat is only supported for `GPU` device.
+    # `NCHW` data format is only supported for `GPU` device.
     if test.is_gpu_available(cuda_only=True):
       with self.test_session(use_gpu=True) as sess:
         num_filters = 32
@@ -2195,7 +2195,7 @@ class BatchNormTest(test.TestCase):
       # After initialization moving_mean == 0 and moving_variance == 1.
       self.assertAllClose(mean, [0] * 3)
       self.assertAllClose(variance, [1] * 3)
-      # Simulate assigment from saver restore.
+      # Simulate assignment from saver restore.
       init_assigns = [
           state_ops.assign(moving_mean, expected_mean),
           state_ops.assign(moving_variance, expected_var)
diff --git a/tensorflow/contrib/lite/simple_memory_arena.h b/tensorflow/contrib/lite/simple_memory_arena.h
index 0c5e00a1f2..0535522374 100644
--- a/tensorflow/contrib/lite/simple_memory_arena.h
+++ b/tensorflow/contrib/lite/simple_memory_arena.h
@@ -36,9 +36,9 @@ struct ArenaAlloc {
   }
 };
 
-// This small class is responsible for allocating, dealocating and reusing
+// This small class is responsible for allocating, deallocating and reusing
 // dynamic memory from a common underlying buffer. The arena can be used in
-// scenarios when the pattern of memory allocations and dealocations is
+// scenarios when the pattern of memory allocations and deallocations is
 // repetitive, e.g. running NN inference in multiple iterations.
 class SimpleMemoryArena {
  public:
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
index 664e828c19..646d048496 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
@@ -103,11 +103,11 @@ class ResolveSvdfTest : public ::testing::Test {
     // Add the float vector as an attribute to the node.
     (*node->mutable_attr())["dtype"].set_type(tensorflow::DT_FLOAT);
     tensorflow::TensorProto* allocated_tensor = new tensorflow::TensorProto;
-    tensorflow::TensorShapeProto* allocated_tesnor_shape =
+    tensorflow::TensorShapeProto* allocated_tensor_shape =
         new tensorflow::TensorShapeProto;
-    auto tensor_shape_dim0 = allocated_tesnor_shape->add_dim();
+    auto tensor_shape_dim0 = allocated_tensor_shape->add_dim();
     tensor_shape_dim0->set_size(values.size());
-    allocated_tensor->set_allocated_tensor_shape(allocated_tesnor_shape);
+    allocated_tensor->set_allocated_tensor_shape(allocated_tensor_shape);
     allocated_tensor->set_tensor_content(
         string(reinterpret_cast<const char*>(values.data()),
                values.size() * sizeof(float)));
@@ -122,11 +122,11 @@ class ResolveSvdfTest : public ::testing::Test {
     // Add the float vector as an attribute to the node.
     (*node->mutable_attr())["dtype"].set_type(tensorflow::DT_INT32);
     tensorflow::TensorProto* allocated_tensor = new tensorflow::TensorProto;
-    tensorflow::TensorShapeProto* allocated_tesnor_shape =
+    tensorflow::TensorShapeProto* allocated_tensor_shape =
         new tensorflow::TensorShapeProto;
-    auto tensor_shape_dim0 = allocated_tesnor_shape->add_dim();
+    auto tensor_shape_dim0 = allocated_tensor_shape->add_dim();
     tensor_shape_dim0->set_size(values.size());
-    allocated_tensor->set_allocated_tensor_shape(allocated_tesnor_shape);
+    allocated_tensor->set_allocated_tensor_shape(allocated_tensor_shape);
     allocated_tensor->set_tensor_content(
         string(reinterpret_cast<const char*>(values.data()),
                values.size() * sizeof(int)));
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index c2340d0377..d3ce51a611 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -1226,7 +1226,7 @@ def precision_recall_at_equal_thresholds(labels,
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
     weights: Optional; If provided, a `Tensor` that has the same dtype as,
-      and broadcastable to, `predictions`. This tensor is multplied by counts.
+      and broadcastable to, `predictions`. This tensor is multiplied by counts.
     num_thresholds: Optional; Number of thresholds, evenly distributed in
       `[0, 1]`. Should be `>= 2`. Defaults to 201. Note that the number of bins
       is 1 less than `num_thresholds`. Using an even `num_thresholds` value
diff --git a/tensorflow/core/framework/op_def_util.h b/tensorflow/core/framework/op_def_util.h
index d1613ee89b..0ba1325a03 100644
--- a/tensorflow/core/framework/op_def_util.h
+++ b/tensorflow/core/framework/op_def_util.h
@@ -82,7 +82,7 @@ bool AttrDefEqual(const OpDef::AttrDef& a1, const OpDef::AttrDef& a2);
 uint64 AttrDefHash(const OpDef::AttrDef& a);
 
 // Returns true if all AttrDefs in `a1` equal corresponding AttrDefs in
-// `a2`. Corrspondence is established by name.
+// `a2`. Correspondence is established by name.
 bool RepeatedAttrDefEqual(const protobuf::RepeatedPtrField<OpDef::AttrDef>& a1,
                           const protobuf::RepeatedPtrField<OpDef::AttrDef>& a2);
 
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 5d58d3e78e..8654437059 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -474,7 +474,7 @@ std::unique_ptr<Tensor> OpKernelContext::forward_input(
     return nullptr;
   }
   // Check that input and output memory types match, i.e.
-  // that they either both live in host or both live in device memmory.
+  // that they either both live in host or both live in device memory.
   if (input_memory_type(input_index) != output_memory_type) {
     return nullptr;
   }
diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc
index 7a05d9371d..a6baae73d8 100644
--- a/tensorflow/core/kernels/constant_op_test.cc
+++ b/tensorflow/core/kernels/constant_op_test.cc
@@ -77,7 +77,7 @@ void ConstantOpTest::PersistentMemoryTrackingTest(bool on_gpu) {
     EXPECT_EQ(ctx.persistent_memory_allocated(), 480);
   }
 
-  // Remove memry leak errors.
+  // Remove memory leak errors.
   for (auto allocator_pair : ctx.wrapped_allocators()) {
     allocator_pair.second->GetRecordsAndUnRef();
   }
diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h
index fa33c24c0c..4ee3f88d18 100644
--- a/tensorflow/core/lib/gtl/optional.h
+++ b/tensorflow/core/lib/gtl/optional.h
@@ -478,7 +478,7 @@ class optional : private internal_optional::optional_data<T>,
     return *this;
   }
 
-  // Copy assigment, standard semantics.
+  // Copy assignment, standard semantics.
   optional& operator=(const optional& src) = default;
 
   // Move assignment, standard semantics.
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index c4b7e4919b..b665443b7a 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -773,7 +773,7 @@ class Dataset(object):
     return PaddedBatchDataset(self, batch_size, padded_shapes, padding_values)
 
   def map(self, map_func, num_parallel_calls=None):
-    """Maps `map_func` across this datset.
+    """Maps `map_func` across this dataset.
 
     Args:
       map_func: A function mapping a nested structure of tensors (having
diff --git a/tensorflow/python/debug/cli/cli_shared.py b/tensorflow/python/debug/cli/cli_shared.py
index a0fe6066ac..dea019fef5 100644
--- a/tensorflow/python/debug/cli/cli_shared.py
+++ b/tensorflow/python/debug/cli/cli_shared.py
@@ -175,7 +175,7 @@ def format_tensor(tensor,
     include_numeric_summary: Whether a text summary of the numeric values (if
       applicable) will be included.
     write_path: A path to save the tensor value (after any slicing) to
-      (optinal). `numpy.save()` is used to save the value.
+      (optional). `numpy.save()` is used to save the value.
 
   Returns:
     An instance of `debugger_cli_common.RichTextLines` representing the
diff --git a/tensorflow/python/kernel_tests/conv2d_transpose_test.py b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
index 7d0bc54b69..7ed1b2336b 100644
--- a/tensorflow/python/kernel_tests/conv2d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
@@ -175,7 +175,7 @@ class Conv2DTransposeTest(test.TestCase):
     self.assertLess(err, err_tolerance)
 
   def testConv2DTransposeSingleStrideNCHW(self):
-    # `NCHW` data fomat is only supported for CUDA device.
+    # `NCHW` data format is only supported for CUDA device.
     if test.is_gpu_available(cuda_only=True):
       with self.test_session(use_gpu=True):
         strides = [1, 1, 1, 1]
@@ -210,7 +210,7 @@ class Conv2DTransposeTest(test.TestCase):
                 self.assertAllClose(target, value[n, k, h, w])
 
   def testConv2DTransposeSameNCHW(self):
-    # `NCHW` data fomat is only supported for CUDA device.
+    # `NCHW` data format is only supported for CUDA device.
     if test.is_gpu_available(cuda_only=True):
       with self.test_session(use_gpu=True):
         strides = [1, 1, 2, 2]
@@ -246,7 +246,7 @@ class Conv2DTransposeTest(test.TestCase):
                 self.assertAllClose(target, value[n, k, h, w])
 
   def testConv2DTransposeValidNCHW(self):
-    # `NCHW` data fomat is only supported for CUDA device.
+    # `NCHW` data format is only supported for CUDA device.
     if test.is_gpu_available(cuda_only=True):
       with self.test_session(use_gpu=True):
         strides = [1, 1, 2, 2]
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index f4162b0962..aa88fe770f 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -896,7 +896,7 @@ class DnnSupport {
   //  offset: offset parameters.
   //  estimated_mean: population mean estimated during training.
   //    Used for inference only; empty for training.
-  //  estimated_variance: population variance estimated during traning,
+  //  estimated_variance: population variance estimated during training,
   //    used for inference only; empty for training.
   //  x_desc: dimensions of the input data, which is the same as the dimensions
   //    of the output.
-- 
GitLab


From e26b24bde8e711d856685d7db4a72ed0d9b2715c Mon Sep 17 00:00:00 2001
From: Phil <ijund.phil@gmail.com>
Date: Thu, 8 Feb 2018 19:48:42 +0100
Subject: [PATCH 833/966] Adds parameter 'msg' to tf.TensorFlowTestCase.
 (#16667)

This commit adds a msg parameter that defaults to None to the following
functions:
- assertProtoEquals
- assertArrayNear
- assertNDArrayNear
- assertAllClose
- assertAllEqual
- assertShapeEqual
- assertDeviceEqual
---
 tensorflow/python/framework/test_util.py | 94 ++++++++++++++----------
 1 file changed, 55 insertions(+), 39 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 15e8f5a38d..78ed1f6864 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -692,7 +692,7 @@ class TensorFlowTestCase(googletest.TestCase):
       self._tempdir = tempfile.mkdtemp(dir=googletest.GetTempDir())
     return self._tempdir
 
-  def _AssertProtoEquals(self, a, b):
+  def _AssertProtoEquals(self, a, b, msg=None):
     """Asserts that a and b are the same proto.
 
     Uses ProtoEq() first, as it returns correct results
@@ -702,11 +702,12 @@ class TensorFlowTestCase(googletest.TestCase):
     Args:
       a: a proto.
       b: another proto.
+      msg: Optional message to report on failure.
     """
     if not compare.ProtoEq(a, b):
-      compare.assertProtoEqual(self, a, b, normalize_numbers=True)
+      compare.assertProtoEqual(self, a, b, normalize_numbers=True, msg=msg)
 
-  def assertProtoEquals(self, expected_message_maybe_ascii, message):
+  def assertProtoEquals(self, expected_message_maybe_ascii, message, msg=None):
     """Asserts that message is same as parsed expected_message_ascii.
 
     Creates another prototype of message, reads the ascii message into it and
@@ -715,8 +716,9 @@ class TensorFlowTestCase(googletest.TestCase):
     Args:
       expected_message_maybe_ascii: proto message in original or ascii form.
       message: the message to validate.
+      msg: Optional message to report on failure.
     """
-
+    msg = msg if msg else ""
     if isinstance(expected_message_maybe_ascii, type(message)):
       expected_message = expected_message_maybe_ascii
       self._AssertProtoEquals(expected_message, message)
@@ -726,20 +728,21 @@ class TensorFlowTestCase(googletest.TestCase):
           expected_message_maybe_ascii,
           expected_message,
           descriptor_pool=descriptor_pool.Default())
-      self._AssertProtoEquals(expected_message, message)
+      self._AssertProtoEquals(expected_message, message, msg=msg)
     else:
-      assert False, ("Can't compare protos of type %s and %s" %
-                     (type(expected_message_maybe_ascii), type(message)))
+      assert False, ("Can't compare protos of type %s and %s. %s" %
+                     (type(expected_message_maybe_ascii), type(message), msg))
 
   def assertProtoEqualsVersion(
       self,
       expected,
       actual,
       producer=versions.GRAPH_DEF_VERSION,
-      min_consumer=versions.GRAPH_DEF_VERSION_MIN_CONSUMER):
+      min_consumer=versions.GRAPH_DEF_VERSION_MIN_CONSUMER,
+      msg=None):
     expected = "versions { producer: %d min_consumer: %d };\n%s" % (
         producer, min_consumer, expected)
-    self.assertProtoEquals(expected, actual)
+    self.assertProtoEquals(expected, actual, msg=msg)
 
   def assertStartsWith(self, actual, expected_start, msg=None):
     """Assert that actual.startswith(expected_start) is True.
@@ -1029,7 +1032,7 @@ class TensorFlowTestCase(googletest.TestCase):
                     "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
                                            if msg is not None else ""))
 
-  def assertArrayNear(self, farray1, farray2, err):
+  def assertArrayNear(self, farray1, farray2, err, msg=None):
     """Asserts that two float arrays are near each other.
 
     Checks that for all elements of farray1 and farray2
@@ -1039,23 +1042,25 @@ class TensorFlowTestCase(googletest.TestCase):
       farray1: a list of float values.
       farray2: a list of float values.
       err: a float value.
+      msg: Optional message to report on failure.
     """
-    self.assertEqual(len(farray1), len(farray2))
+    self.assertEqual(len(farray1), len(farray2), msg=msg)
     for f1, f2 in zip(farray1, farray2):
-      self.assertNear(float(f1), float(f2), err)
+      self.assertNear(float(f1), float(f2), err, msg=msg)
 
   def _NDArrayNear(self, ndarray1, ndarray2, err):
     return np.linalg.norm(ndarray1 - ndarray2) < err
 
-  def assertNDArrayNear(self, ndarray1, ndarray2, err):
+  def assertNDArrayNear(self, ndarray1, ndarray2, err, msg=None):
     """Asserts that two numpy arrays have near values.
 
     Args:
       ndarray1: a numpy ndarray.
       ndarray2: a numpy ndarray.
       err: a float. The maximum absolute difference allowed.
+      msg: Optional message to report on failure.
     """
-    self.assertTrue(self._NDArrayNear(ndarray1, ndarray2, err))
+    self.assertTrue(self._NDArrayNear(ndarray1, ndarray2, err), msg=msg)
 
   def _GetNdArray(self, a):
     if not isinstance(a, np.ndarray):
@@ -1097,9 +1102,11 @@ class TensorFlowTestCase(googletest.TestCase):
       np.testing.assert_allclose(
           a, b, rtol=rtol, atol=atol, err_msg=msg, equal_nan=True)
 
-  def _assertAllCloseRecursive(self, a, b, rtol=1e-6, atol=1e-6, path=None):
+  def _assertAllCloseRecursive(self, a, b, rtol=1e-6, atol=1e-6, path=None,
+                               msg=None):
     path = path or []
     path_str = (("[" + "][".join([str(p) for p in path]) + "]") if path else "")
+    msg = msg if msg else ""
 
     # Check if a and/or b are namedtuples.
     if hasattr(a, "_asdict"):
@@ -1108,18 +1115,18 @@ class TensorFlowTestCase(googletest.TestCase):
       b = b._asdict()
     a_is_dict = isinstance(a, dict)
     if a_is_dict != isinstance(b, dict):
-      raise ValueError("Can't compare dict to non-dict, a%s vs b%s." %
-                       (path_str, path_str))
+      raise ValueError("Can't compare dict to non-dict, a%s vs b%s. %s" %
+                       (path_str, path_str, msg))
     if a_is_dict:
       self.assertItemsEqual(
           a.keys(),
           b.keys(),
-          msg="mismatched keys: a%s has keys %s, but b%s has keys %s" %
-          (path_str, a.keys(), path_str, b.keys()))
+          msg="mismatched keys: a%s has keys %s, but b%s has keys %s. %s" %
+          (path_str, a.keys(), path_str, b.keys(), msg))
       for k in a:
         path.append(k)
         self._assertAllCloseRecursive(
-            a[k], b[k], rtol=rtol, atol=atol, path=path)
+            a[k], b[k], rtol=rtol, atol=atol, path=path, msg=msg)
         del path[-1]
     elif isinstance(a, (list, tuple)):
       # Try to directly compare a, b as ndarrays; if not work, then traverse
@@ -1132,17 +1139,17 @@ class TensorFlowTestCase(googletest.TestCase):
             b_as_ndarray,
             rtol=rtol,
             atol=atol,
-            msg="Mismatched value: a%s is different from b%s." % (path_str,
-                                                                  path_str))
+            msg="Mismatched value: a%s is different from b%s. %s" %
+            (path_str, path_str, msg))
       except (ValueError, TypeError) as e:
         if len(a) != len(b):
           raise ValueError(
-              "Mismatched length: a%s has %d items, but b%s has %d items" %
-              (path_str, len(a), path_str, len(b)))
+              "Mismatched length: a%s has %d items, but b%s has %d items. %s" %
+              (path_str, len(a), path_str, len(b), msg))
         for idx, (a_ele, b_ele) in enumerate(zip(a, b)):
           path.append(str(idx))
           self._assertAllCloseRecursive(
-              a_ele, b_ele, rtol=rtol, atol=atol, path=path)
+              a_ele, b_ele, rtol=rtol, atol=atol, path=path, msg=msg)
           del path[-1]
     # a and b are ndarray like objects
     else:
@@ -1151,10 +1158,10 @@ class TensorFlowTestCase(googletest.TestCase):
           b,
           rtol=rtol,
           atol=atol,
-          msg="Mismatched value: a%s is different from b%s." % (path_str,
-                                                                path_str))
+          msg="Mismatched value: a%s is different from b%s. %s" %
+          (path_str, path_str, msg))
 
-  def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
+  def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
     """Asserts that two structures of numpy arrays, have near values.
 
     `a` and `b` can be arbitrarily nested structures. A layer of a nested
@@ -1167,6 +1174,7 @@ class TensorFlowTestCase(googletest.TestCase):
           numpy `ndarray`, or any arbitrarily nested of structure of these.
       rtol: relative tolerance.
       atol: absolute tolerance.
+      msg: Optional message to report on failure.
 
     Raises:
       ValueError: if only one of `a[p]` and `b[p]` is a dict or
@@ -1174,7 +1182,7 @@ class TensorFlowTestCase(googletest.TestCase):
           to the nested structure, e.g. given `a = [(1, 1), {'d': (6, 7)}]` and
           `[p] = [1]['d']`, then `a[p] = (6, 7)`.
     """
-    self._assertAllCloseRecursive(a, b, rtol=rtol, atol=atol)
+    self._assertAllCloseRecursive(a, b, rtol=rtol, atol=atol, msg=msg)
 
   def assertAllCloseAccordingToType(self,
                                     a,
@@ -1186,7 +1194,8 @@ class TensorFlowTestCase(googletest.TestCase):
                                     half_rtol=1e-3,
                                     half_atol=1e-3,
                                     bfloat16_rtol=1e-2,
-                                    bfloat16_atol=1e-2):
+                                    bfloat16_atol=1e-2,
+                                    msg=None):
     """Like assertAllClose, but also suitable for comparing fp16 arrays.
 
     In particular, the tolerance is reduced to 1e-3 if at least
@@ -1203,6 +1212,7 @@ class TensorFlowTestCase(googletest.TestCase):
       half_atol: absolute tolerance for float16.
       bfloat16_rtol: relative tolerance for bfloat16.
       bfloat16_atol: absolute tolerance for bfloat16.
+      msg: Optional message to report on failure.
     """
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
@@ -1219,19 +1229,21 @@ class TensorFlowTestCase(googletest.TestCase):
       rtol = max(rtol, bfloat16_rtol)
       atol = max(atol, bfloat16_atol)
 
-    self.assertAllClose(a, b, rtol=rtol, atol=atol)
+    self.assertAllClose(a, b, rtol=rtol, atol=atol, msg=msg)
 
-  def assertAllEqual(self, a, b):
+  def assertAllEqual(self, a, b, msg=None):
     """Asserts that two numpy arrays have the same values.
 
     Args:
       a: the expected numpy ndarray or anything can be converted to one.
       b: the actual numpy ndarray or anything can be converted to one.
+      msg: Optional message to report on failure.
     """
+    msg = msg if msg else ""
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
-    self.assertEqual(a.shape, b.shape, "Shape mismatch: expected %s, got %s." %
-                     (a.shape, b.shape))
+    self.assertEqual(a.shape, b.shape, "Shape mismatch: expected %s, got %s."
+                                       " %s" % (a.shape, b.shape, msg))
     same = (a == b)
 
     if a.dtype == np.float32 or a.dtype == np.float64:
@@ -1248,7 +1260,7 @@ class TensorFlowTestCase(googletest.TestCase):
         x, y = a, b
       print("not equal lhs = ", x)
       print("not equal rhs = ", y)
-      np.testing.assert_array_equal(a, b)
+      np.testing.assert_array_equal(a, b, err_msg=msg)
 
   # pylint: disable=g-doc-return-or-yield
   @contextlib.contextmanager
@@ -1298,12 +1310,13 @@ class TensorFlowTestCase(googletest.TestCase):
     return self.assertRaisesWithPredicateMatch(errors.OpError,
                                                expected_err_re_or_predicate)
 
-  def assertShapeEqual(self, np_array, tf_tensor):
+  def assertShapeEqual(self, np_array, tf_tensor, msg=None):
     """Asserts that a Numpy ndarray and a TensorFlow tensor have the same shape.
 
     Args:
       np_array: A Numpy ndarray or Numpy scalar.
       tf_tensor: A Tensor.
+      msg: Optional message to report on failure.
 
     Raises:
       TypeError: If the arguments have the wrong type.
@@ -1312,19 +1325,22 @@ class TensorFlowTestCase(googletest.TestCase):
       raise TypeError("np_array must be a Numpy ndarray or Numpy scalar")
     if not isinstance(tf_tensor, ops.Tensor):
       raise TypeError("tf_tensor must be a Tensor")
-    self.assertAllEqual(np_array.shape, tf_tensor.get_shape().as_list())
+    self.assertAllEqual(np_array.shape, tf_tensor.get_shape().as_list(),
+                        msg=msg)
 
-  def assertDeviceEqual(self, device1, device2):
+  def assertDeviceEqual(self, device1, device2, msg=None):
     """Asserts that the two given devices are the same.
 
     Args:
       device1: A string device name or TensorFlow `DeviceSpec` object.
       device2: A string device name or TensorFlow `DeviceSpec` object.
+      msg: Optional message to report on failure.
     """
     device1 = pydev.canonical_name(device1)
     device2 = pydev.canonical_name(device2)
     self.assertEqual(device1, device2,
-                     "Devices %s and %s are not equal" % (device1, device2))
+                     "Devices %s and %s are not equal. %s" % 
+                     (device1, device2, msg))
 
   # Fix Python 3 compatibility issues
   if six.PY3:
-- 
GitLab


From 8a6cdb125ee52f077c5f39e6a03f17cff55b29e5 Mon Sep 17 00:00:00 2001
From: Ben <bstriner@users.noreply.github.com>
Date: Thu, 8 Feb 2018 13:49:23 -0500
Subject: [PATCH 834/966] Add option to not include histograms (#16579)

* Add option to not include histograms

* Add test for model_summaries=False
---
 .../gan/python/eval/python/summaries_impl.py       |  7 +++++--
 .../gan/python/eval/python/summaries_test.py       | 14 ++++++++++----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
index 74811ff409..0d1afad72d 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
@@ -39,12 +39,13 @@ def _assert_is_image(data):
   data.shape[1:].assert_is_fully_defined()
 
 
-def add_gan_model_image_summaries(gan_model, grid_size=4):
+def add_gan_model_image_summaries(gan_model, grid_size=4, model_summaries=True):
   """Adds image summaries for real and fake images.
 
   Args:
     gan_model: A GANModel tuple.
     grid_size: The size of an image grid.
+    model_summaries: Also add summaries of the model.
 
   Raises:
     ValueError: If real and generated data aren't images.
@@ -83,7 +84,9 @@ def add_gan_model_image_summaries(gan_model, grid_size=4):
           image_shape=generated_image_shape,
           num_channels=generated_channels),
       max_outputs=1)
-  add_gan_model_summaries(gan_model)
+
+  if model_summaries:
+    add_gan_model_summaries(gan_model)
 
 
 def add_image_comparison_summaries(gan_model, num_comparisons=2,
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index a02d8772e1..7956db4334 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -72,8 +72,10 @@ def get_cyclegan_model():
 class SummariesTest(test.TestCase):
 
   def _test_add_gan_model_image_summaries_impl(self, get_model_fn,
-                                               expected_num_summary_ops):
-    summaries.add_gan_model_image_summaries(get_model_fn(), grid_size=2)
+                                               expected_num_summary_ops,
+                                               model_summaries):
+    summaries.add_gan_model_image_summaries(get_model_fn(), grid_size=2,
+                                            model_summaries=model_summaries)
 
     self.assertEquals(expected_num_summary_ops,
                       len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
@@ -82,10 +84,14 @@ class SummariesTest(test.TestCase):
       summary.merge_all().eval()
 
   def test_add_gan_model_image_summaries(self):
-    self._test_add_gan_model_image_summaries_impl(get_gan_model, 5)
+    self._test_add_gan_model_image_summaries_impl(get_gan_model, 5, True)
+
+  def test_add_gan_model_image_summaries_no_model(self):
+    self._test_add_gan_model_image_summaries_impl(get_gan_model, 2, False)
 
   def test_add_gan_model_image_summaries_for_cyclegan(self):
-    self._test_add_gan_model_image_summaries_impl(get_cyclegan_model, 10)
+    self._test_add_gan_model_image_summaries_impl(get_cyclegan_model, 10,
+                                                  True)
 
   def _test_add_gan_model_summaries_impl(self, get_model_fn,
                                          expected_num_summary_ops):
-- 
GitLab


From 75009033eafdc63b649fb65c701e5770d98eda58 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Thu, 8 Feb 2018 10:41:37 -0800
Subject: [PATCH 835/966] Fix minor bug with tf.keras model calling (related to
 mask caching).

PiperOrigin-RevId: 185016910
---
 tensorflow/python/keras/_impl/keras/engine/topology.py      | 4 ++--
 tensorflow/python/keras/_impl/keras/engine/topology_test.py | 1 +
 tensorflow/python/layers/network.py                         | 3 +--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py
index 8354a2b8fd..718c92f408 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology.py
@@ -784,8 +784,8 @@ class Network(tf_network.GraphNetwork, Layer):
       masks = [None for _ in range(len(inputs))]
     else:
       masks = _to_list(mask)
-    cache_key = ','.join([str(id(x)) for x in inputs])
-    cache_key += '_' + ','.join([str(id(x)) for x in masks])
+    cache_key = (tf_layers_util.object_list_uid(inputs)
+                 + '_' + tf_layers_util.object_list_uid(masks))
     if cache_key in self._output_mask_cache:
       return self._output_mask_cache[cache_key]
     else:
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
index 479ee877fd..85979d1745 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
@@ -340,6 +340,7 @@ class TopologyConstructionTest(test.TestCase):
       e = keras.layers.Input(shape=(32,), name='input_e')
       f = keras.layers.Input(shape=(32,), name='input_f')
       g, h = model([e, f])
+      self.assertEqual(g.name, 'model_1/dense_2/BiasAdd:0')
 
       self.assertListEqual(g.get_shape().as_list(), c.get_shape().as_list())
       self.assertListEqual(h.get_shape().as_list(), d.get_shape().as_list())
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py
index 745843975c..7bcf25064c 100644
--- a/tensorflow/python/layers/network.py
+++ b/tensorflow/python/layers/network.py
@@ -955,8 +955,7 @@ class GraphNetwork(base.Layer):
       cache_key = (layers_util.object_list_uid(inputs)
                    + '_' + layers_util.object_list_uid(masks))
       self._output_tensor_cache[cache_key] = output_tensors
-      if output_masks is not None:
-        self._output_mask_cache[cache_key] = output_masks
+      self._output_mask_cache[cache_key] = output_masks
       if output_shapes is not None:
         input_shapes = [layers_util.static_shape(x) for x in inputs]
         cache_key = layers_util.object_list_uid(input_shapes)
-- 
GitLab


From 40f0cf009641ef0d827729bb01e8ed50d97fd109 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Thu, 8 Feb 2018 10:42:01 -0800
Subject: [PATCH 836/966] Update the wrapper gen code to call out to the
 fastpath function.

PiperOrigin-RevId: 185016979
---
 .../python/eager/python_eager_op_gen.cc       | 643 ++++++++++++------
 tensorflow/python/eager/pywrap_tfe.h          |  10 +-
 tensorflow/python/eager/pywrap_tfe_src.cc     |  30 +-
 tensorflow/python/framework/python_op_gen.cc  |   7 +-
 .../python/framework/python_op_gen_internal.h |   1 +
 5 files changed, 457 insertions(+), 234 deletions(-)

diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index 0f18f28c95..34e1e4cd8f 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -42,6 +42,8 @@ namespace {
 
 const int kRightMargin = 78;
 
+constexpr char kEagerFallbackSuffix[] = "_eager_fallback";
+
 string AttrVarName(const string& attr_name,
                    std::unordered_map<string, string>* attr_expressions) {
   const string var = strings::StrCat("_attr_", attr_name);
@@ -49,11 +51,12 @@ string AttrVarName(const string& attr_name,
   return var;
 }
 
-void AddInferredAttr(const string& attr_name, const string& value_expression,
-                     string* result,
+void AddInferredAttr(const string& indentation, const string& attr_name,
+                     const string& value_expression, string* result,
                      std::unordered_map<string, string>* attr_expressions) {
-  strings::StrAppend(result, "  ", AttrVarName(attr_name, attr_expressions),
-                     " = ", value_expression, "\n");
+  strings::StrAppend(result, indentation,
+                     AttrVarName(attr_name, attr_expressions), " = ",
+                     value_expression, "\n");
 }
 
 string VectorToTuple(const std::vector<string>& l) {
@@ -121,11 +124,33 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
   string Code() override;
 
  protected:
-  void ExpectListArg(const string& arg_name);
-  void AddEagerInferredAttrs();
-  void AddEagerInputCasts();
-  void AddEagerAttrs();
-  void AddEagerExecute(const string& num_outputs_expr);
+  void HandleGraphMode(const string& function_setup);
+
+  string GetEagerNotAllowedError();
+  void ExpectListArg(const string& indentation, const string& arg_name,
+                     string* output);
+  bool GetEagerFunctionSetup(const string& indentation, string* function_setup);
+  void GetOutputSizesAndNumOutputsExpr(std::vector<string>* output_sizes,
+                                       string* num_outputs_expr);
+
+  void AddEagerFunctionTeardown(const string& indentation,
+                                const std::vector<string>& output_sizes,
+                                bool execute_record_gradient);
+
+  bool AddEagerFastPathAndGraphCode(const string& parameters,
+                                    const std::vector<string>& output_sizes,
+                                    const string& eager_not_allowed_error);
+  bool AddEagerFallbackCode(const string& parameters,
+                            const std::vector<string>& output_sizes,
+                            const string& num_outputs_expr,
+                            const string& eager_not_allowed_error);
+  void AddEagerFastPathExecute();
+
+  void AddEagerInferredAttrs(const string& indentation);
+  void AddEagerInputCasts(const string& indentation);
+  void AddEagerAttrs(const string& indentation);
+  void AddEagerExecute(const string& indentation,
+                       const string& num_outputs_expr);
 
   void AddAttrForArg(const string& attr, int arg_index) {
     gtl::InsertIfNotPresent(&inferred_attrs_, attr,
@@ -148,6 +173,13 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
   typedef std::unordered_map<string, std::vector<int>> AttrToArgMap;
   AttrToArgMap attr_to_args_;
   std::unordered_map<string, string> attr_expressions_;
+  // This has all the input args followed by those attrs that don't have
+  // defaults.
+  std::vector<python_op_gen_internal::ParamNames> params_no_default_;
+  // The parameters with defaults (these have to be listed after those without).
+  // No input args are included, just attrs.
+  std::vector<std::pair<python_op_gen_internal::ParamNames, string>>
+      params_with_default_;
 };
 
 string GetEagerPythonOp(const OpDef& op_def, const ApiDef& api_def,
@@ -207,18 +239,12 @@ string GenEagerPythonOp::Code() {
   if (api_def_.visibility() == ApiDef::SKIP) {
     return "";
   }
-  // This has all the input args followed by those attrs that don't have
-  // defaults.
-  std::vector<python_op_gen_internal::ParamNames> params_no_default;
-  // The parameters with defaults (these have to be listed after those without).
-  // No input args are included, just attrs.
-  std::vector<std::pair<python_op_gen_internal::ParamNames, string>>
-      params_with_default;
 
   for (int i = 0; i < api_def_.arg_order_size(); ++i) {
     const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
     const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
-    params_no_default.emplace_back(api_def_arg.name(), api_def_arg.rename_to());
+    params_no_default_.emplace_back(api_def_arg.name(),
+                                    api_def_arg.rename_to());
     if (!arg.type_attr().empty()) {
       AddAttrForArg(arg.type_attr(), i);
     } else if (!arg.type_list_attr().empty()) {
@@ -235,7 +261,7 @@ string GenEagerPythonOp::Code() {
     if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
       if (api_def_attr.has_default_value()) {
         if (attr.type() == "tensor") {
-          params_with_default.emplace_back(
+          params_with_default_.emplace_back(
               python_op_gen_internal::ParamNames(api_def_attr.name(),
                                                  api_def_attr.rename_to()),
               strings::StrCat(
@@ -247,22 +273,22 @@ string GenEagerPythonOp::Code() {
           for (const auto& pb : api_def_attr.default_value().list().tensor()) {
             pbtxt.emplace_back(TensorPBString(pb));
           }
-          params_with_default.emplace_back(
+          params_with_default_.emplace_back(
               python_op_gen_internal::ParamNames(api_def_attr.name(),
                                                  api_def_attr.rename_to()),
               strings::StrCat("[_execute.make_tensor(_pb, \"",
                               api_def_attr.rename_to(), "\") for _pb in ",
                               VectorToTuple(pbtxt), "]"));
         } else {
-          params_with_default.emplace_back(
+          params_with_default_.emplace_back(
               python_op_gen_internal::ParamNames(api_def_attr.name(),
                                                  api_def_attr.rename_to()),
               python_op_gen_internal::AttrValueToPython(
                   attr.type(), api_def_attr.default_value(), "_dtypes."));
         }
       } else {
-        params_no_default.emplace_back(api_def_attr.name(),
-                                       api_def_attr.rename_to());
+        params_no_default_.emplace_back(api_def_attr.name(),
+                                        api_def_attr.rename_to());
       }
     }
   }
@@ -270,29 +296,29 @@ string GenEagerPythonOp::Code() {
   // Save the list of attr parameters (attrs that won't be inferred),
   // those with defaults go at the end.
   // Get the attrs in the order we want by taking the attrs without defaults
-  // from the end of params_no_default, and adding params_no_default.
-  attrs_.reserve(params_no_default.size() - op_def_.input_arg_size() +
-                 params_with_default.size());
-  for (int i = op_def_.input_arg_size(); i < params_no_default.size(); ++i) {
-    attrs_.push_back(params_no_default[i].GetName());
+  // from the end of params_no_default_, and adding params_no_default_.
+  attrs_.reserve(params_no_default_.size() - op_def_.input_arg_size() +
+                 params_with_default_.size());
+  for (int i = op_def_.input_arg_size(); i < params_no_default_.size(); ++i) {
+    attrs_.push_back(params_no_default_[i].GetName());
   }
-  for (const auto& p : params_with_default) {
+  for (const auto& p : params_with_default_) {
     attrs_.push_back(p.first.GetName());
   }
 
-  param_names_.reserve(params_no_default.size() + params_with_default.size());
-  param_names_.insert(param_names_.begin(), params_no_default.begin(),
-                      params_no_default.end());
-  for (const auto& param_and_default : params_with_default) {
+  param_names_.reserve(params_no_default_.size() + params_with_default_.size());
+  param_names_.insert(param_names_.begin(), params_no_default_.begin(),
+                      params_no_default_.end());
+  for (const auto& param_and_default : params_with_default_) {
     param_names_.push_back(param_and_default.first);
   }
 
   string parameters;
-  for (const auto& param : params_no_default) {
+  for (const auto& param : params_no_default_) {
     if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
     strings::StrAppend(&parameters, param.GetRenameTo());
   }
-  for (const auto& param_and_default : params_with_default) {
+  for (const auto& param_and_default : params_with_default_) {
     if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
     strings::StrAppend(&parameters, param_and_default.first.GetRenameTo(), "=",
                        param_and_default.second);
@@ -300,19 +326,125 @@ string GenEagerPythonOp::Code() {
   if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
   strings::StrAppend(&parameters, "name=None");
 
-  AddExport();
-  AddDefLine(parameters);
-  AddDocStringDescription();
-  AddDocStringArgs();
-  AddDocStringInputs();
-  AddDocStringAttrs();
-  AddDocStringNameArg();
-  AddOutputGlobals();
-  AddDocStringOutputs();
-  strings::StrAppend(&result_, "  \"\"\"\n");
+  // Add attr_expressions_ for attrs that are params.
+  for (int i = 0; i < attrs_.size(); ++i) {
+    const string& attr_name = attrs_[i];
+    const string& attr_api_name =
+        param_names_[i + op_def_.input_arg_size()].GetRenameTo();
+    attr_expressions_[attr_name] = attr_api_name;
+  }
+  // Add attr_expressions_ for attrs that are inferred.
+  for (int i = 0; i < op_def_.attr_size(); ++i) {
+    const auto& attr(op_def_.attr(i));
+    if (attr.type() == "int") {
+      auto arg_list = attr_to_args_.find(attr.name());
+      if (arg_list != attr_to_args_.end()) {
+        AttrVarName(attr.name(), &attr_expressions_);
+      }
+    }
+  }
+
+  string num_outputs_expr;
+  std::vector<string> output_sizes(num_outs_);
+  GetOutputSizesAndNumOutputsExpr(&output_sizes, &num_outputs_expr);
+
+  string eager_not_allowed_error = GetEagerNotAllowedError();
+
+  if (!AddEagerFastPathAndGraphCode(parameters, output_sizes,
+                                    eager_not_allowed_error)) {
+    return result_;
+  }
+
+  if (!AddEagerFallbackCode(parameters, output_sizes, num_outputs_expr,
+                            eager_not_allowed_error)) {
+    return result_;
+  }
+
+  return prelude_ + result_;
+}
+
+void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
+  // Handle graph-mode case
+  strings::StrAppend(&result_,
+                     "  _ctx = _context.context()\n"
+                     "  if _ctx.in_graph_mode():\n",
+                     function_setup,
+                     "    _, _, _op = _op_def_lib._apply_op_helper(\n");
+  AddBodyNoReturn("        ");
+  if (num_outs_ > 0) {
+    strings::StrAppend(&result_, "    _result = _op.outputs[:]\n");
+    // Special case handling for stateful op with single list output
+    // that might be empty.
+    if (num_outs_ == 1 && op_def_.is_stateful() &&
+        (!op_def_.output_arg(0).number_attr().empty() ||
+         !op_def_.output_arg(0).type_list_attr().empty())) {
+      // TODO(josh11b): Can skip this if the number_attr/type_list_attr has
+      // a constraint indicating that this can never be empty.
+      strings::StrAppend(&result_,
+                         "    if not _result:\n"
+                         "      return _op\n");
+    }
+    strings::StrAppend(&result_, "    _inputs_flat = _op.inputs\n");
 
-  // Function body.
+    // Compute graph-mode attrs.
+    if (op_def_.attr_size() > 0) {
+      string attr_values;
+      for (int i = 0; i < op_def_.attr_size(); ++i) {
+        if (i > 0) strings::StrAppend(&attr_values, ", ");
+        const auto& attr_name(op_def_.attr(i).name());
+        strings::StrAppend(&attr_values, "\"", attr_name, "\", _op.get_attr(\"",
+                           attr_name, "\")");
+      }
+      strings::StrAppend(&attr_values, ")");
+      strings::StrAppend(&result_,
+                         WordWrap("    _attrs = (", attr_values, kRightMargin),
+                         "\n");
+    } else {
+      strings::StrAppend(&result_, "    _attrs = None\n");
+    }
+  } else {
+    strings::StrAppend(&result_, "    return _op\n");
+  }
+}
 
+string GenEagerPythonOp::GetEagerNotAllowedError() {
+  bool eager_allowed = true;
+  string ref_arg;
+  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
+    const auto& arg = op_def_.input_arg(i);
+    if (arg.is_ref()) {
+      eager_allowed = false;
+      DCHECK_EQ(op_def_.input_arg(i).name(), api_def_.in_arg(i).name());
+      ref_arg = api_def_.in_arg(i).rename_to();
+    }
+  }
+  for (int i = 0; i < op_def_.output_arg_size(); ++i) {
+    const auto& arg = op_def_.output_arg(i);
+    if (arg.is_ref()) {
+      eager_allowed = false;
+      DCHECK_EQ(op_def_.output_arg(i).name(), api_def_.out_arg(i).name());
+      ref_arg = api_def_.out_arg(i).rename_to();
+    }
+  }
+
+  if (eager_allowed) return "";
+
+  return strings::StrCat("raise RuntimeError(\"", op_name_,
+                         " op does not support eager execution. ", "Arg '",
+                         ref_arg, "' is a ref.\")\n");
+}
+
+void GenEagerPythonOp::ExpectListArg(const string& indentation,
+                                     const string& arg_name, string* output) {
+  strings::StrAppend(output, indentation, "if not isinstance(", arg_name,
+                     ", (list, tuple)):\n", indentation, "  raise TypeError(\n",
+                     indentation, "      \"Expected list for '", arg_name,
+                     "' argument to \"\n", indentation, "      \"'", op_name_,
+                     "' Op, not %r.\" % ", arg_name, ")\n");
+}
+
+bool GenEagerPythonOp::GetEagerFunctionSetup(const string& indentation,
+                                             string* function_setup) {
   // Validate list inputs, infer length attrs.
   for (int i = 0; i < op_def_.attr_size(); ++i) {
     const auto& attr(op_def_.attr(i));
@@ -324,32 +456,27 @@ string GenEagerPythonOp::Code() {
         for (auto iter = arg_list->second.begin();
              iter != arg_list->second.end(); ++iter) {
           const string& arg_api_name = param_names_[*iter].GetRenameTo();
-          ExpectListArg(arg_api_name);
+          ExpectListArg(indentation, arg_api_name, function_setup);
           if (iter == arg_list->second.begin()) {
-            AddInferredAttr(attr.name(),
+            AddInferredAttr(indentation, attr.name(),
                             strings::StrCat("len(", arg_api_name, ")"),
-                            &result_, &attr_expressions_);
+                            function_setup, &attr_expressions_);
           } else {
             const auto& attr_var = attr_expressions_[attr.name()];
-            strings::StrAppend(&result_, "  if len(", arg_api_name,
-                               ") != ", attr_var,
-                               ":\n"
-                               "    raise ValueError(\n"
-                               "        \"List argument '",
-                               arg_api_name, "' to '", op_name_,
-                               "' Op with length %d \"\n"
-                               "        \"must match length %d of argument '",
-                               inferred_attrs_[attr.name()],
-                               "'.\" %\n"
-                               "        (len(",
-                               arg_api_name, "), ", attr_var, "))\n");
+            strings::StrAppend(
+                function_setup, indentation, "if len(", arg_api_name,
+                ") != ", attr_var, ":\n", indentation, "  raise ValueError(\n",
+                indentation, "      \"List argument '", arg_api_name, "' to '",
+                op_name_, "' Op with length %d \"\n", indentation,
+                "      \"must match length %d of argument '",
+                inferred_attrs_[attr.name()], "'.\" %\n", indentation,
+                "      (len(", arg_api_name, "), ", attr_var, "))\n");
           }
         }
       }
     }
   }
 
-  // Values for non-inferred attrs.
   for (int i = 0; i < attrs_.size(); ++i) {
     const string& attr_name = attrs_[i];
     const auto& param = param_names_[i + op_def_.input_arg_size()];
@@ -357,241 +484,299 @@ string GenEagerPythonOp::Code() {
     const string& attr_api_name = param.GetRenameTo();
     StringPiece attr_type = attr.type();
     attr_expressions_[attr_name] = attr_api_name;
-    const int default_index = i - (attrs_.size() - params_with_default.size());
+    const int default_index = i - (attrs_.size() - params_with_default_.size());
     if (default_index >= 0) {
-      const string& default_value = params_with_default[default_index].second;
-      strings::StrAppend(&result_, "  if ", attr_api_name, " is None:\n");
-      strings::StrAppend(&result_, "    ", attr_api_name, " = ", default_value,
-                         "\n");
+      const string& default_value = params_with_default_[default_index].second;
+      strings::StrAppend(function_setup, indentation, "if ", attr_api_name,
+                         " is None:\n");
+      strings::StrAppend(function_setup, indentation, "  ", attr_api_name,
+                         " = ", default_value, "\n");
     }
     if (attr_type.starts_with("list(")) {
-      ExpectListArg(attr_api_name);
+      ExpectListArg(indentation, attr_api_name, function_setup);
     }
 
     if (attr_type == "string") {
-      strings::StrAppend(&result_, "  ", attr_api_name, " = _execute.make_str(",
-                         attr_api_name, ", \"", attr_api_name, "\")\n");
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_str(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(string)") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = [_execute.make_str(_s, \"", attr_api_name,
                          "\") for _s in ", attr_api_name, "]\n");
     } else if (attr_type == "int") {
-      strings::StrAppend(&result_, "  ", attr_api_name, " = _execute.make_int(",
-                         attr_api_name, ", \"", attr_api_name, "\")\n");
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_int(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(int)") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = [_execute.make_int(_i, \"", attr_api_name,
                          "\") for _i in ", attr_api_name, "]\n");
     } else if (attr_type == "float") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = _execute.make_float(", attr_api_name, ", \"",
                          attr_api_name, "\")\n");
     } else if (attr_type == "list(float)") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = [_execute.make_float(_f, \"", attr_api_name,
                          "\") for _f in ", attr_api_name, "]\n");
     } else if (attr_type == "bool") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = _execute.make_bool(", attr_api_name, ", \"",
                          attr_api_name, "\")\n");
     } else if (attr_type == "list(bool)") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = [_execute.make_bool(_b, \"", attr_api_name,
                          "\") for _b in ", attr_api_name, "]\n");
     } else if (attr_type == "type") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = _execute.make_type(", attr_api_name, ", \"",
                          attr_api_name, "\")\n");
     } else if (attr_type == "list(type)") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = [_execute.make_type(_t, \"", attr_api_name,
                          "\") for _t in ", attr_api_name, "]\n");
     } else if (attr_type == "shape") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = _execute.make_shape(", attr_api_name, ", \"",
                          attr_api_name, "\")\n");
     } else if (attr_type == "list(shape)") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = [_execute.make_shape(_s, \"", attr_api_name,
                          "\") for _s in ", attr_api_name, "]\n");
     } else if (attr_type == "tensor") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = _execute.make_tensor(", attr_api_name, ", \"",
                          attr_api_name, "\")\n");
     } else if (attr_type == "list(tensor)") {
-      strings::StrAppend(&result_, "  ", attr_api_name,
+      strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = [_execute.make_tensor(_t, \"", attr_api_name,
                          "\") for _t in ", attr_api_name, "]\n");
     } else if (attr_type != "func") {
-      return strings::StrCat("# No definition for ", function_name_,
-                             " since we don't support attrs with type\n"
-                             "# '",
-                             attr_type, "' right now.\n\n");
-    }
-  }
-
-  // Figure out the list of inputs.
-  const string inputs = FlattenInputs(nullptr, nullptr);
-
-  // Handle graph-mode case
-  strings::StrAppend(&result_,
-                     "  _ctx = _context.context()\n"
-
-                     "  if _ctx.in_graph_mode():\n"
-                     "    _, _, _op = _op_def_lib._apply_op_helper(\n");
-  AddBodyNoReturn("        ");
-  if (num_outs_ > 0) {
-    strings::StrAppend(&result_, "    _result = _op.outputs[:]\n");
-    // Special case handling for stateful op with single list output
-    // that might be empty.
-    if (num_outs_ == 1 && op_def_.is_stateful() &&
-        (!op_def_.output_arg(0).number_attr().empty() ||
-         !op_def_.output_arg(0).type_list_attr().empty())) {
-      // TODO(josh11b): Can skip this if the number_attr/type_list_attr has
-      // a constraint indicating that this can never be empty.
-      strings::StrAppend(&result_,
-                         "    if not _result:\n"
-                         "      return _op\n");
+      *function_setup =
+          strings::StrCat("# No definition for ", function_name_,
+                          " since we don't support attrs with type\n"
+                          "# '",
+                          attr_type, "' right now.\n\n");
+      return false;
     }
-    strings::StrAppend(&result_, "    _inputs_flat = _op.inputs\n");
-
-    // Compute graph-mode attrs.
-    if (op_def_.attr_size() > 0) {
-      string attr_values;
-      for (int i = 0; i < op_def_.attr_size(); ++i) {
-        if (i > 0) strings::StrAppend(&attr_values, ", ");
-        const auto& attr_name(op_def_.attr(i).name());
-        strings::StrAppend(&attr_values, "\"", attr_name, "\", _op.get_attr(\"",
-                           attr_name, "\")");
-      }
-      strings::StrAppend(&attr_values, ")");
-      strings::StrAppend(&result_,
-                         WordWrap("    _attrs = (", attr_values, kRightMargin),
-                         "\n");
-    } else {
-      strings::StrAppend(&result_, "    _attrs = None\n");
-    }
-  } else {
-    strings::StrAppend(&result_, "    return _op\n");
   }
+  return true;
+}
 
-  // Handle eager-mode case
-  strings::StrAppend(&result_, "  else:\n");
-
+// If output i is list output, output_sizes[i] will be set to a
+// string with the python expression that will evaluate to its
+// length. output_sizes[i] is empty for non-list outputs.
+void GenEagerPythonOp::GetOutputSizesAndNumOutputsExpr(
+    std::vector<string>* output_sizes, string* num_outputs_expr) {
   // Expression representing the number of outputs.
   int num_fixed_outputs = 0;
-  string num_outputs_expr;
-  // If output i is list output, output_sizes[i] will be set to a
-  // string with the python expression that will evaluate to its
-  // length. output_sizes[i] is empty for non-list outputs.
-  std::vector<string> output_sizes(num_outs_);
   for (int i = 0; i < num_outs_; ++i) {
     const auto& arg(op_def_.output_arg(i));
     if (!arg.number_attr().empty()) {
-      if (!num_outputs_expr.empty()) {
-        strings::StrAppend(&num_outputs_expr, " + ");
+      if (!num_outputs_expr->empty()) {
+        strings::StrAppend(num_outputs_expr, " + ");
       }
-      output_sizes[i] = attr_expressions_[arg.number_attr()];
-      strings::StrAppend(&num_outputs_expr, output_sizes[i]);
+      (*output_sizes)[i] = attr_expressions_[arg.number_attr()];
+      strings::StrAppend(num_outputs_expr, (*output_sizes)[i]);
     } else if (!arg.type_list_attr().empty()) {
-      if (!num_outputs_expr.empty()) {
-        strings::StrAppend(&num_outputs_expr, " + ");
+      if (!num_outputs_expr->empty()) {
+        strings::StrAppend(num_outputs_expr, " + ");
       }
       // Have to be careful to use an expression that works in both
       // graph and eager paths here.
       const auto iter = inferred_attrs_.find(arg.type_list_attr());
       if (iter == inferred_attrs_.end()) {
-        output_sizes[i] = strings::StrCat(
+        (*output_sizes)[i] = strings::StrCat(
             "len(", attr_expressions_[arg.type_list_attr()], ")");
       } else {
-        output_sizes[i] = strings::StrCat("len(", iter->second, ")");
+        (*output_sizes)[i] = strings::StrCat("len(", iter->second, ")");
       }
-      strings::StrAppend(&num_outputs_expr, output_sizes[i]);
+      strings::StrAppend(num_outputs_expr, (*output_sizes)[i]);
     } else {
       ++num_fixed_outputs;
     }
   }
   if (num_fixed_outputs > 0) {
-    if (!num_outputs_expr.empty()) {
-      strings::StrAppend(&num_outputs_expr, " + ");
-    }
-    strings::StrAppend(&num_outputs_expr, num_fixed_outputs);
-  } else if (num_outputs_expr.empty()) {
-    num_outputs_expr = "0";
-  }
-
-  bool eager_allowed = true;
-  string ref_arg;
-  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
-    const auto& arg = op_def_.input_arg(i);
-    if (arg.is_ref()) {
-      eager_allowed = false;
-      DCHECK_EQ(op_def_.input_arg(i).name(), api_def_.in_arg(i).name());
-      ref_arg = api_def_.in_arg(i).rename_to();
-    }
-  }
-  for (int i = 0; i < op_def_.output_arg_size(); ++i) {
-    const auto& arg = op_def_.output_arg(i);
-    if (arg.is_ref()) {
-      eager_allowed = false;
-      DCHECK_EQ(op_def_.output_arg(i).name(), api_def_.out_arg(i).name());
-      ref_arg = api_def_.out_arg(i).rename_to();
+    if (!num_outputs_expr->empty()) {
+      strings::StrAppend(num_outputs_expr, " + ");
     }
+    strings::StrAppend(num_outputs_expr, num_fixed_outputs);
+  } else if (num_outputs_expr->empty()) {
+    *num_outputs_expr = "0";
   }
+}
 
-  if (eager_allowed) {
-    AddEagerInferredAttrs();
-    AddEagerInputCasts();
-    strings::StrAppend(&result_, "    _inputs_flat = ", inputs, "\n");
-    AddEagerAttrs();
-    AddEagerExecute(num_outputs_expr);
-  } else {
-    strings::StrAppend(&result_,
-                       "    raise RuntimeError(\n"
-                       "        \"",
-                       op_name_, " op does not support eager execution. ",
-                       "Arg '", ref_arg, "'' is a ref.\")\n");
-  }
-
+void GenEagerPythonOp::AddEagerFunctionTeardown(
+    const string& indentation, const std::vector<string>& output_sizes,
+    bool execute_record_gradient) {
   if (num_outs_ > 0) {
-    strings::StrAppend(&result_, "  _execute.record_gradient(\n", "      \"",
-                       op_def_.name(),
-                       "\", _inputs_flat, _attrs, _result, name)\n");
+    if (execute_record_gradient) {
+      strings::StrAppend(&result_, indentation, "_execute.record_gradient(\n",
+                         "      \"", op_def_.name(),
+                         "\", _inputs_flat, _attrs, _result, name)\n");
+    }
     if (num_outs_ == 1 && !output_sizes[0].empty()) {
       // Single list result.
     } else if (num_outs_ == 1) {
       // Execute returns a single-element list which we need to destructure.
-      strings::StrAppend(&result_, "  _result, = _result\n");
+      strings::StrAppend(&result_, indentation, "_result, = _result\n");
     } else {
       // Have multiple outputs, so we will need to reformat the return
       // value of execute() to be a list with one entry per op output
       // (that entry will be a list of tensors if that output is of list
       // type).
       // For list outputs, convert the right subrange of _result into a list.
-      Unflatten("  ", output_sizes, "_result", &result_);
+      Unflatten(indentation, output_sizes, "_result", &result_);
       // Convert to a named tuple.
-      strings::StrAppend(&result_, "  _result = _", op_def_.name(),
+      strings::StrAppend(&result_, indentation, "_result = _", op_def_.name(),
                          "Output._make(_result)\n");
     }
   } else {
-    strings::StrAppend(&result_, "    _result = None\n");
+    strings::StrAppend(&result_, indentation, "_result = None\n");
   }
-  strings::StrAppend(&result_, "  return _result\n\n");
-  return prelude_ + result_;
+  strings::StrAppend(&result_, indentation, "return _result\n\n");
 }
 
-void GenEagerPythonOp::ExpectListArg(const string& arg_name) {
-  strings::StrAppend(&result_, "  if not isinstance(", arg_name,
-                     ", (list, tuple)):\n"
-                     "    raise TypeError(\n"
-                     "        \"Expected list for '",
-                     arg_name,
-                     "' argument to \"\n"
-                     "        \"'",
-                     op_name_, "' Op, not %r.\" % ", arg_name, ")\n");
+bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
+    const string& parameters, const std::vector<string>& output_sizes,
+    const string& eager_not_allowed_error) {
+  AddDefLine(function_name_, parameters);
+  AddDocStringDescription();
+  AddDocStringArgs();
+  AddDocStringInputs();
+  AddDocStringAttrs();
+  AddDocStringNameArg();
+  AddOutputGlobals();  // Added to prelude_
+  AddDocStringOutputs();
+  strings::StrAppend(&result_, "  \"\"\"\n");
+
+  // Handle graph-mode case
+  string function_setup;
+  if (!GetEagerFunctionSetup("    ", &function_setup)) {
+    result_ = function_setup;
+    return false;
+  }
+  HandleGraphMode(function_setup);
+  AddEagerFunctionTeardown("    ", output_sizes,
+                           true /* execute_record_gradient */);
+
+  // Handle eager-mode case
+  strings::StrAppend(&result_, "  else:\n");
+
+  if (eager_not_allowed_error.empty()) {
+    AddEagerFastPathExecute();
+  } else {
+    strings::StrAppend(&result_, "    ", eager_not_allowed_error);
+  }
+
+  strings::StrAppend(&result_, "\n\n");
+  return true;
 }
 
-void GenEagerPythonOp::AddEagerInferredAttrs() {
+bool GenEagerPythonOp::AddEagerFallbackCode(
+    const string& parameters, const std::vector<string>& output_sizes,
+    const string& num_outputs_expr, const string& eager_not_allowed_error) {
+  if (!eager_not_allowed_error.empty()) {
+    strings::StrAppend(&result_, "  ", eager_not_allowed_error);
+    return true;
+  }
+
+  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix), parameters);
+  strings::StrAppend(
+      &result_, "  r\"\"\"This is the slowpath function for Eager mode.\n");
+  strings::StrAppend(&result_, "  This is for function ", function_name_,
+                     "\n  \"\"\"\n");
+
+  strings::StrAppend(&result_, "  _ctx = _context.context()\n");
+
+  string function_setup;
+  if (!GetEagerFunctionSetup("  ", &function_setup)) {
+    result_ = function_setup;
+    return false;
+  }
+  strings::StrAppend(&result_, function_setup);
+
+  AddEagerInferredAttrs("  ");
+  AddEagerInputCasts("  ");
+  strings::StrAppend(
+      &result_, "  _inputs_flat = ", FlattenInputs(nullptr, nullptr), "\n");
+  AddEagerAttrs("  ");
+  AddEagerExecute("  ", num_outputs_expr);
+
+  AddEagerFunctionTeardown("  ", output_sizes,
+                           true /* execute_record_gradient */);
+
+  return true;
+}
+
+void GenEagerPythonOp::AddEagerFastPathExecute() {
+  string fastpath_execute_params = strings::StrCat(
+      "_ctx._handle, _ctx.device_name, \"", op_def_.name(), "\", ",
+      "_execute.record_gradient, name, _ctx._post_execution_callbacks");
+  string fallback_params;
+
+  for (int i = 0; i < api_def_.in_arg_size(); i++) {
+    const string param_name = param_names_[i].GetRenameTo();
+    strings::StrAppend(&fastpath_execute_params, ", ", param_name);
+    if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+    strings::StrAppend(&fallback_params, param_name);
+  }
+
+  for (const auto& attr : api_def_.attr()) {
+    if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
+      strings::StrAppend(&fastpath_execute_params, ", \"", attr.name(), "\", ",
+                         attr.rename_to());
+
+      if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+      strings::StrAppend(&fallback_params, attr.rename_to(), "=",
+                         attr.rename_to());
+    }
+  }
+
+  if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+  strings::StrAppend(&fallback_params, "name=name");
+
+  strings::StrAppend(&result_, "    try:\n");
+  strings::StrAppend(
+      &result_, "      ",
+      "_result = _pywrap_tensorflow.TFE_Py_FastPathExecute(\n",
+      WordWrap(strings::StrCat("        "),
+               strings::StrCat(fastpath_execute_params, ")"), kRightMargin),
+      "\n");
+
+  if (op_def_.output_arg_size() > 1) {
+    const string output_tuple_name =
+        strings::StrCat("_", op_def_.name(), "Output");
+    strings::StrAppend(&result_, "      ", "_result = ", output_tuple_name,
+                       "._make(_result)\n");
+  }
+  strings::StrAppend(&result_, "      ", "return _result\n");
+
+  // Handle fallback.
+  strings::StrAppend(&result_, "    ", "except _core._FallbackException:\n");
+  strings::StrAppend(
+      &result_, "      ", "return ", function_name_, kEagerFallbackSuffix,
+      "(\n",
+      WordWrap(strings::StrCat("          "),
+               strings::StrCat(fallback_params, ")"), kRightMargin),
+      "\n");
+
+  // Any errors thrown from execute need to be unwrapped from
+  // _NotOkStatusException.
+  strings::StrAppend(&result_, "    ",
+                     "except _core._NotOkStatusException as e:\n");
+  strings::StrAppend(&result_, "      ", "if name is not None:\n");
+  strings::StrAppend(&result_, "        ",
+                     "message = e.message + \" name: \" + name\n");
+  strings::StrAppend(&result_, "      ", "else:\n");
+  strings::StrAppend(&result_, "        ", "message = e.message\n");
+  strings::StrAppend(
+      &result_, "      ",
+      "_six.raise_from(_core._status_to_exception(e.code, message), None)\n");
+}
+
+void GenEagerPythonOp::AddEagerInferredAttrs(const string& indentation) {
   // Figure out values for inferred attrs, and cast to eager tensors.
   for (int i = 0; i < op_def_.attr_size(); ++i) {
     const auto& attr(op_def_.attr(i));
@@ -618,24 +803,24 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
           const string inputs_var =
               param_names_[arg_list->second.front()].GetRenameTo();
           if (output_sizes.front().empty()) {
-            strings::StrAppend(&result_, "    ", var_name, ", (", inputs_var,
-                               ",) = ", conversion, "\n");
+            strings::StrAppend(&result_, indentation, var_name, ", (",
+                               inputs_var, ",) = ", conversion, "\n");
           } else {
-            strings::StrAppend(&result_, "    ", var_name, ", ", inputs_var,
-                               " = ", conversion, "\n");
+            strings::StrAppend(&result_, indentation, var_name, ", ",
+                               inputs_var, " = ", conversion, "\n");
           }
         } else {
           const string inputs_var = strings::StrCat("_inputs_", attr.name());
-          strings::StrAppend(&result_, "    ", var_name, ", ", inputs_var,
+          strings::StrAppend(&result_, indentation, var_name, ", ", inputs_var,
                              " = ", conversion, "\n");
           // Convert from a flat list of eager tensors back to the
           // parameter variables.
-          Unflatten("    ", output_sizes, inputs_var, &result_);
+          Unflatten(indentation, output_sizes, inputs_var, &result_);
           std::vector<string> p;
           for (int j : arg_list->second) {
             p.emplace_back(param_names_[j].GetRenameTo());
           }
-          strings::StrAppend(&result_, "    ", VectorToTuple(p), " = ",
+          strings::StrAppend(&result_, indentation, VectorToTuple(p), " = ",
                              inputs_var, "\n");
         }
       } else if (attr.type() == "list(type)") {
@@ -662,14 +847,14 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
           inputs_var = param_names_[arg_list->second.front()].GetRenameTo();
           conversion = "_execute.convert_to_mixed_eager_tensors";
         }
-        strings::StrAppend(&result_, "    ", var_name, ", ", inputs_var, " = ",
-                           conversion, "(", inputs_var, ", _ctx)\n");
+        strings::StrAppend(&result_, indentation, var_name, ", ", inputs_var,
+                           " = ", conversion, "(", inputs_var, ", _ctx)\n");
       }
     }
   }
 }
 
-void GenEagerPythonOp::AddEagerInputCasts() {
+void GenEagerPythonOp::AddEagerInputCasts(const string& indentation) {
   // Cast remaining args to eager tensors
   for (int i = 0; i < op_def_.input_arg_size(); ++i) {
     const auto& arg(op_def_.input_arg(i));
@@ -678,12 +863,12 @@ void GenEagerPythonOp::AddEagerInputCasts() {
     const string fn = arg.number_attr().empty() ? "" : "n_";
     const string dtype =
         python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes.");
-    strings::StrAppend(&result_, "    ", param, " = _ops.convert_", fn,
+    strings::StrAppend(&result_, indentation, param, " = _ops.convert_", fn,
                        "to_tensor(", param, ", ", dtype, ")\n");
   }
 }
 
-void GenEagerPythonOp::AddEagerAttrs() {
+void GenEagerPythonOp::AddEagerAttrs(const string& indentation) {
   // Compute eager attrs
   if (op_def_.attr_size() > 0) {
     string attr_values;
@@ -695,14 +880,19 @@ void GenEagerPythonOp::AddEagerAttrs() {
     }
     strings::StrAppend(&attr_values, ")");
     strings::StrAppend(
-        &result_, WordWrap("    _attrs = (", attr_values, kRightMargin), "\n");
+        &result_,
+        WordWrap(indentation, strings::StrCat("_attrs = (", attr_values),
+                 kRightMargin),
+        "\n");
   } else {
-    strings::StrAppend(&result_, "    _attrs = None\n");
+    strings::StrAppend(&result_, indentation, "_attrs = None\n");
   }
 }
 
-void GenEagerPythonOp::AddEagerExecute(const string& num_outputs_expr) {
-  const string return_prefix = "    _result = _execute.execute(";
+void GenEagerPythonOp::AddEagerExecute(const string& indentation,
+                                       const string& num_outputs_expr) {
+  const string return_prefix =
+      strings::StrCat(indentation, "_result = _execute.execute(");
   const string return_args = strings::StrCat(
       "b\"", op_def_.name(), "\", ", num_outputs_expr,
       ", inputs=_inputs_flat, attrs=_attrs, ctx=_ctx, name=name)");
@@ -723,8 +913,8 @@ string GetEagerPythonOps(const OpList& ops, const ApiDefMap& api_defs,
 This file is MACHINE GENERATED! Do not edit.
 )");
 
-  // Mention the original source file so someone tracing back through generated
-  // Python code will know where to look next.
+  // Mention the original source file so someone tracing back through
+  // generated Python code will know where to look next.
   if (!source_file_name.empty()) {
     strings::StrAppend(&result, "Original C++ source file: ");
     strings::StrAppend(&result, source_file_name);
@@ -734,11 +924,14 @@ This file is MACHINE GENERATED! Do not edit.
   strings::StrAppend(&result, R"("""
 
 import collections as _collections
+import six as _six
 
-from tensorflow.python.eager import execute as _execute
+from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow
 from tensorflow.python.eager import context as _context
 from tensorflow.python.eager import core as _core
+from tensorflow.python.eager import execute as _execute
 from tensorflow.python.framework import dtypes as _dtypes
+from tensorflow.python.framework import errors as _errors
 from tensorflow.python.framework import tensor_shape as _tensor_shape
 
 from tensorflow.core.framework import op_def_pb2 as _op_def_pb2
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 925eeb553c..16b7d1a119 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -45,14 +45,16 @@ void TFE_Py_Execute(TFE_Context* ctx, const char* device_name,
                     PyObject* attrs, TFE_OutputTensorHandles* outputs,
                     TF_Status* out_status);
 
+// Registers e as the Exception class for handling not ok Status. Returns
+// Py_None if registration succeeds, else throws a TypeError and returns NULL.
+//
+// This function is not thread-safe.
+PyObject* TFE_Py_RegisterExceptionClass(PyObject* e);
+
 // Registers e as the Exception to be raised when the conditions of
 // TFE_Py_FastPathExecute_C have not been met. When this exception is set, it
 // is a signal to the calling code that it should fall back to the safer (and
 // more complete) code path.
-PyObject* TFE_Py_RegisterExceptionClass(PyObject* e);
-
-// Registers e as the Exception class for handling not ok Status. Returns
-// Py_None if registration succeeds, else throws a TypeError and returns NULL.
 //
 // This function is not thread-safe.
 PyObject* TFE_Py_RegisterFallbackExceptionClass(PyObject* e);
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 6e96cf6ebe..bb7b89ab74 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -63,6 +63,17 @@ PARSE_VALUE(ParseInt64LongValue, int64_t, PyLong_Check, PyLong_AsLong)
 PARSE_VALUE(ParseFloatValue, float, PyFloat_Check, PyFloat_AsDouble)
 #undef PARSE_VALUE
 
+Py_ssize_t TensorShapeNumDims(PyObject* value) {
+  const auto size = PySequence_Size(value);
+  if (size == -1) {
+    // TensorShape.__len__ raises an error in the scenario where the shape is an
+    // unknown, which needs to be cleared.
+    // TODO(nareshmodi): ensure that this is actually a TensorShape.
+    PyErr_Clear();
+  }
+  return size;
+}
+
 bool ParseStringValue(const string& key, PyObject* py_value, TF_Status* status,
                       const char** value) {
   if (PyBytes_Check(py_value)) {
@@ -174,8 +185,10 @@ bool SetOpAttrList(
                   .c_str());
           return false;
         }
-        const auto size = PySequence_Size(py_value);
-        total_dims += size;
+        const auto size = TensorShapeNumDims(py_value);
+        if (size >= 0) {
+          total_dims += size;
+        }
       }
     }
     // Allocate a buffer that can fit all of the dims together.
@@ -191,7 +204,12 @@ bool SetOpAttrList(
         dims[i] = nullptr;
         num_dims[i] = -1;
       } else {
-        const auto size = PySequence_Size(py_value);
+        const auto size = TensorShapeNumDims(py_value);
+        if (size == -1) {
+          dims[i] = nullptr;
+          num_dims[i] = -1;
+          continue;
+        }
         dims[i] = offset;
         num_dims[i] = size;
         for (int j = 0; j < size; ++j) {
@@ -314,7 +332,11 @@ bool SetOpAttrScalar(
                          .c_str());
         return false;
       }
-      const auto num_dims = PySequence_Size(py_value);
+      const auto num_dims = TensorShapeNumDims(py_value);
+      if (num_dims == -1) {
+        TFE_OpSetAttrShape(op, key, nullptr, -1, status);
+        return true;
+      }
       std::unique_ptr<int64_t[]> dims(new int64_t[num_dims]);
       for (int i = 0; i < num_dims; ++i) {
         auto inner_py_value = PySequence_ITEM(py_value, i);
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 85cba59be4..c95149d177 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -580,8 +580,13 @@ void GenPythonOp::AddExport() {
   strings::StrAppend(&result_, ")\n");
 }
 
+void GenPythonOp::AddDefLine(const string& function_name,
+                             const string& parameters) {
+  strings::StrAppend(&result_, "def ", function_name, "(", parameters, "):\n");
+}
+
 void GenPythonOp::AddDefLine(const string& parameters) {
-  strings::StrAppend(&result_, "def ", function_name_, "(", parameters, "):\n");
+  AddDefLine(function_name_, parameters);
 }
 
 void GenPythonOp::AddDocStringDescription() {
diff --git a/tensorflow/python/framework/python_op_gen_internal.h b/tensorflow/python/framework/python_op_gen_internal.h
index d09b36a3e8..4319e5a782 100644
--- a/tensorflow/python/framework/python_op_gen_internal.h
+++ b/tensorflow/python/framework/python_op_gen_internal.h
@@ -73,6 +73,7 @@ class GenPythonOp {
 
  protected:
   // Print: def Function(parameters):
+  void AddDefLine(const string& function_name, const string& parameters);
   void AddDefLine(const string& parameters);
 
   // Format the Op's descriptions so that it can be a Python docstring.
-- 
GitLab


From 023542fec72fffd27d52a7e77dd95f3e0004abee Mon Sep 17 00:00:00 2001
From: fo40225 <fo40225@users.noreply.github.com>
Date: Fri, 9 Feb 2018 02:54:31 +0800
Subject: [PATCH 837/966] python 2.7 unit test error repair on windows (#16725)

* python 2.7 unit test error repair on windows

* Fix lint error.
---
 tensorflow/contrib/cmake/python_modules.txt             | 7 +++++++
 tensorflow/python/debug/cli/tensor_format.py            | 2 +-
 tensorflow/python/keras/_impl/keras/layers/recurrent.py | 3 ++-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 57a52bf4ca..7eb880a478 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -299,7 +299,9 @@ tensorflow/contrib/linear_optimizer/kernels/g3doc
 tensorflow/contrib/linear_optimizer/python
 tensorflow/contrib/linear_optimizer/python/ops
 # TODO(drpngx): Fix failing imports
+# tensorflow/contrib/lite
 # tensorflow/contrib/lite/python
+# tensorflow/contrib/lite/toco
 # tensorflow/contrib/lite/toco/python
 tensorflow/contrib/lookup
 tensorflow/contrib/losses
@@ -331,6 +333,7 @@ tensorflow/contrib/nccl/python
 tensorflow/contrib/nccl/python/ops
 tensorflow/contrib/ndlstm
 tensorflow/contrib/ndlstm/python
+tensorflow/contrib/nearest_neighbor
 tensorflow/contrib/nearest_neighbor/kernels
 tensorflow/contrib/nearest_neighbor/ops
 tensorflow/contrib/nearest_neighbor/python
@@ -362,6 +365,7 @@ tensorflow/contrib/reduce_slice_ops/kernels
 tensorflow/contrib/reduce_slice_ops/ops
 tensorflow/contrib/reduce_slice_ops/python
 tensorflow/contrib/reduce_slice_ops/python/ops
+tensorflow/contrib/remote_fused_graph
 tensorflow/contrib/remote_fused_graph/pylib
 tensorflow/contrib/remote_fused_graph/pylib/python
 tensorflow/contrib/remote_fused_graph/pylib/python/ops
@@ -411,6 +415,7 @@ tensorflow/contrib/summary
 tensorflow/contrib/tensorboard
 tensorflow/contrib/tensorboard/plugins
 tensorflow/contrib/tensorboard/plugins/projector
+tensorflow/contrib/tensorboard/plugins/trace
 tensorflow/contrib/tensor_forest
 tensorflow/contrib/tensor_forest/client
 tensorflow/contrib/tensor_forest/hybrid
@@ -421,6 +426,7 @@ tensorflow/contrib/tensor_forest/hybrid/python/layers
 tensorflow/contrib/tensor_forest/hybrid/python/models
 tensorflow/contrib/tensor_forest/hybrid/python/ops
 tensorflow/contrib/tensor_forest/kernels
+tensorflow/contrib/tensor_forest/proto
 tensorflow/contrib/tensor_forest/python
 tensorflow/contrib/tensor_forest/python/ops
 tensorflow/contrib/testing
@@ -441,6 +447,7 @@ tensorflow/contrib/timeseries/python/timeseries/state_space_models
 tensorflow/contrib/tpu
 tensorflow/contrib/tpu/ops
 tensorflow/contrib/tpu/profiler
+tensorflow/contrib/tpu/proto
 tensorflow/contrib/tpu/python
 tensorflow/contrib/tpu/python/ops
 tensorflow/contrib/tpu/python/profiler
diff --git a/tensorflow/python/debug/cli/tensor_format.py b/tensorflow/python/debug/cli/tensor_format.py
index e0759a8bc1..9ba84e3f22 100644
--- a/tensorflow/python/debug/cli/tensor_format.py
+++ b/tensorflow/python/debug/cli/tensor_format.py
@@ -134,7 +134,7 @@ def format_tensor(tensor,
 
   if include_metadata:
     lines.append("  dtype: %s" % str(tensor.dtype))
-    lines.append("  shape: %s" % str(tensor.shape))
+    lines.append("  shape: %s" % str(tensor.shape).replace("L", ""))
 
   if lines:
     lines.append("")
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 1b0f6cb6cf..926e447c03 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numbers
 import numpy as np
 
 from tensorflow.python.framework import tensor_shape
@@ -400,7 +401,7 @@ class RNN(Layer):
   @property
   def states(self):
     if self._states is None:
-      if isinstance(self.cell.state_size, int):
+      if isinstance(self.cell.state_size, numbers.Integral):
         num_states = 1
       else:
         num_states = len(self.cell.state_size)
-- 
GitLab


From 8202d047cc2e47ffd3529f84ea0463a171b3b44a Mon Sep 17 00:00:00 2001
From: lazypanda1 <35884075+lazypanda1@users.noreply.github.com>
Date: Thu, 8 Feb 2018 12:54:57 -0600
Subject: [PATCH 838/966] Fixing assert message for beta distribution (#16786)

---
 tensorflow/python/kernel_tests/distributions/beta_test.py | 4 +++-
 tensorflow/python/ops/distributions/beta.py               | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/distributions/beta_test.py b/tensorflow/python/kernel_tests/distributions/beta_test.py
index 91a451f033..ab5041a6eb 100644
--- a/tensorflow/python/kernel_tests/distributions/beta_test.py
+++ b/tensorflow/python/kernel_tests/distributions/beta_test.py
@@ -107,8 +107,10 @@ class BetaTest(test.TestCase):
         dist.prob([-1., 0.1, 0.5]).eval()
       with self.assertRaisesOpError("sample must be positive"):
         dist.prob([0., 0.1, 0.5]).eval()
-      with self.assertRaisesOpError("sample must be no larger than `1`"):
+      with self.assertRaisesOpError("sample must be less than `1`"):
         dist.prob([.1, .2, 1.2]).eval()
+      with self.assertRaisesOpError("sample must be less than `1`"):
+        dist.prob([.1, .2, 1.0]).eval()
 
   def testPdfTwoBatches(self):
     with self.test_session():
diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py
index 6d6b40b045..be4ef550dd 100644
--- a/tensorflow/python/ops/distributions/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -309,7 +309,7 @@ class Beta(distribution.Distribution):
             message="sample must be positive"),
         check_ops.assert_less(
             x, array_ops.ones([], self.dtype),
-            message="sample must be no larger than `1`."),
+            message="sample must be less than `1`."),
     ], x)
 
 
-- 
GitLab


From 44484309f313ee125019fc665a16a4fd3a6134c2 Mon Sep 17 00:00:00 2001
From: "Haichen \"HC\" Li" <lihc2012@gmail.com>
Date: Thu, 8 Feb 2018 13:55:47 -0500
Subject: [PATCH 839/966] remove keep_dims warning in maxout layer (#16769)

---
 tensorflow/python/layers/maxout.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/layers/maxout.py b/tensorflow/python/layers/maxout.py
index 20ce6c9770..765a1c4fda 100644
--- a/tensorflow/python/layers/maxout.py
+++ b/tensorflow/python/layers/maxout.py
@@ -106,6 +106,6 @@ class MaxOut(base.Layer):
       if shape[i] is None:
         shape[i] = gen_array_ops.shape(inputs)[i]
     outputs = math_ops.reduce_max(
-        gen_array_ops.reshape(inputs, shape), -1, keep_dims=False)
+        gen_array_ops.reshape(inputs, shape), -1, keepdims=False)
 
     return outputs
-- 
GitLab


From e65a6202dbfd034233a9a0b453461f1e3b7fce8d Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Thu, 8 Feb 2018 11:09:53 -0800
Subject: [PATCH 840/966] [update]   Avoid access of data to comply to internal
 build

---
 tensorflow/contrib/tensorrt/convert/convert_nodes.cc | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 4e9fc9efab..96426e93fd 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -1128,8 +1128,15 @@ tensorflow::Status ConvertConst(Converter& ctx,
   } else if (!weights_tensor.tensor_content().empty()) {
     VLOG(2) << "TENSOR!!!" << node_def.name();
     const auto& content = weights_tensor.tensor_content();
-      weights = TRT_ShapedWeights(dtype, weights_tensor.tensor_content().data(),
-                                  GetTensorShape(tensor));
+
+    weights = ctx.get_temp_weights(dtype, GetTensorShape(tensor));
+    if (content.size() > 0) {
+      const int dtype_size = tensorflow::DataTypeSize(dtype);
+      CHECK_EQ(0, content.size() % dtype_size)
+          << "Tensor content size (" << content.size()
+          << ") is not a multiple of " << dtype_size;
+      port::CopyToArray(content, static_cast<char*>(const_cast<void*>(weights.GetValues())));
+    }
   } else {
     return tensorflow::errors::Unimplemented(
         "Not supported constant type, at " + node_def.name());
-- 
GitLab


From 1726dfc979de175eb093b3ee88907fbdc238ce79 Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Thu, 8 Feb 2018 11:16:08 -0800
Subject: [PATCH 841/966] [clang-format]

---
 .../contrib/tensorrt/convert/convert_nodes.cc       | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 96426e93fd..5efef61f0a 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -188,9 +188,7 @@ class TRT_TensorOrWeights {
   explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights)
       : tensor_(nullptr), weights_(weights), variant_(TRT_NODE_WEIGHTS) {}
   TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs)
-      : tensor_(rhs.tensor_),
-        weights_(rhs.weights_),
-        variant_(rhs.variant_) {}
+      : tensor_(rhs.tensor_), weights_(rhs.weights_), variant_(rhs.variant_) {}
   ~TRT_TensorOrWeights() {}
 
   bool is_tensor() const { return variant_ == TRT_NODE_TENSOR; }
@@ -828,9 +826,9 @@ tensorflow::Status BinaryTensorOpTensor(
   CHECK_EQ_TYPE(tensor_r->getType(), dtype);
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented(
-        "binary op: " + node_def.op() +
-        " not supported at: " + node_def.name());
+    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+                                             " not supported at: " +
+                                             node_def.name());
 
   nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
       *const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -1135,7 +1133,8 @@ tensorflow::Status ConvertConst(Converter& ctx,
       CHECK_EQ(0, content.size() % dtype_size)
           << "Tensor content size (" << content.size()
           << ") is not a multiple of " << dtype_size;
-      port::CopyToArray(content, static_cast<char*>(const_cast<void*>(weights.GetValues())));
+      port::CopyToArray(
+          content, static_cast<char*>(const_cast<void*>(weights.GetValues())));
     }
   } else {
     return tensorflow::errors::Unimplemented(
-- 
GitLab


From 3b25be3081d9fa1ab6976334c1a2c0f6f8d0d1a7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 11:20:39 -0800
Subject: [PATCH 842/966] Preserving order when removing nodes.

PiperOrigin-RevId: 185023366
---
 tensorflow/tools/graph_transforms/sparsify_gather.cc | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 214ec721e2..701e350fc3 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -212,6 +212,14 @@ Status RemoveInputAtIndex(NodeDef* n, int index) {
   return Status::OK();
 }
 
+Status RemoveNodeAtIndex(GraphDef* g, int index) {
+  for (int i = index; i < g->node_size() - 1; i++) {
+    g->mutable_node()->SwapElements(i, i + 1);
+  }
+  g->mutable_node()->RemoveLast();
+  return Status::OK();
+}
+
 Status SparsifyGatherInternal(
     const GraphDef& input_graph_def,
     const std::unique_ptr<std::unordered_map<string, string> >&
@@ -493,9 +501,7 @@ Status SparsifyGatherInternal(
               removed_node_names.push_back(parsed_input);
             }
           }
-          replaced_graph_def.mutable_node()->SwapElements(
-              i, replaced_graph_def.node_size() - 1);
-          replaced_graph_def.mutable_node()->RemoveLast();
+          TF_RETURN_IF_ERROR(RemoveNodeAtIndex(&replaced_graph_def, i));
           continue;
         }
         int j = 0;
-- 
GitLab


From 176d2b940ad96286b08c8f7ef8271df1639a45eb Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Fri, 9 Feb 2018 04:32:10 +0900
Subject: [PATCH 843/966] Fix two links. (#16854)

---
 tensorflow/docs_src/install/install_sources.md | 2 +-
 tensorflow/docs_src/install/install_windows.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 0dbb15188e..7853ec11f5 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -393,7 +393,7 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started/get_started$Getting Started with
+If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with
 TensorFlow}.
 
 If the system outputs an error message instead of a greeting, see [Common
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 86a111c2ec..657d37f6bc 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -153,7 +153,7 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started/get_started$Getting Started with
+If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with
 TensorFlow}.
 
 If the system outputs an error message instead of a greeting, see [Common
-- 
GitLab


From e595927d05f4d3669be29efd5f9cb5702a63b1c0 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 8 Feb 2018 11:28:55 -0800
Subject: [PATCH 844/966] [tf.data] Remove deprecated
 `tf.contrib.data.Iterator` alias.

This change removes the following class:
* `tf.contrib.data.Iterator`.

IF THIS BREAKS YOU: Replace `tf.contrib.data.Iterator` with `tf.data.Iterator`
when explicitly constructing an iterator. The API for the resulting object is
identical.

PiperOrigin-RevId: 185024771
---
 tensorflow/contrib/data/__init__.py           |   2 -
 .../contrib/data/python/kernel_tests/BUILD    |  44 +-
 .../kernel_tests/get_single_element_test.py   |  57 ++
 .../kernel_tests/iterator_ops_cluster_test.py | 108 ---
 .../python/kernel_tests/iterator_ops_test.py  | 625 ------------------
 5 files changed, 59 insertions(+), 777 deletions(-)
 create mode 100644 tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
 delete mode 100644 tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py
 delete mode 100644 tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py

diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 22de13b558..21db1044b0 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -22,7 +22,6 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 
 @@Dataset
 @@Counter
-@@Iterator
 
 @@batch_and_drop_remainder
 @@dense_to_sparse_batch
@@ -68,7 +67,6 @@ from tensorflow.contrib.data.python.ops.readers import SqlDataset
 from tensorflow.contrib.data.python.ops.resampling import rejection_resample
 from tensorflow.contrib.data.python.ops.scan_ops import scan
 from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat
-from tensorflow.python.data.ops.iterator_ops import Iterator
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 3ee82933bc..f58872f2a8 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -208,59 +208,19 @@ py_test(
 )
 
 tf_py_test(
-    name = "iterator_ops_cluster_test",
+    name = "get_single_element_test",
     size = "small",
-    srcs = ["iterator_ops_cluster_test.py"],
-    additional_deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-    grpc_enabled = True,
-    tags = [
-        "no_windows",
-        "oss_serial",
-    ],
-)
-
-tf_py_test(
-    name = "iterator_ops_test",
-    size = "small",
-    srcs = ["iterator_ops_test.py"],
+    srcs = ["get_single_element_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:readers",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:training",
-        "//tensorflow/python/data/ops:iterator_ops",
     ],
-    grpc_enabled = True,
 )
 
 py_test(
diff --git a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
new file mode 100644
index 0000000000..03d30bd100
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
@@ -0,0 +1,57 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class GetSingleElementTest(test.TestCase):
+
+  def testGetSingleElement(self):
+    skip_value = array_ops.placeholder(dtypes.int64, shape=[])
+    take_value = array_ops.placeholder_with_default(
+        constant_op.constant(1, dtype=dtypes.int64), shape=[])
+
+    dataset = (dataset_ops.Dataset.range(100)
+               .skip(skip_value)
+               .map(lambda x: x * x)
+               .take(take_value))
+
+    element = dataset_ops.get_single_element(dataset)
+
+    with self.test_session() as sess:
+      self.assertEqual(0, sess.run(element, feed_dict={skip_value: 0}))
+      self.assertEqual(25, sess.run(element, feed_dict={skip_value: 5}))
+      self.assertEqual(100, sess.run(element, feed_dict={skip_value: 10}))
+
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Dataset was empty."):
+        sess.run(element, feed_dict={skip_value: 100})
+
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Dataset had more than one element."):
+        sess.run(element, feed_dict={skip_value: 0, take_value: 2})
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py
deleted file mode 100644
index 02379d064d..0000000000
--- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops that need test_util."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.platform import test
-
-
-class IteratorClusterTest(test.TestCase):
-
-  def testRemoteIteratorWithoutRemoteCallFail(self):
-    worker_config = config_pb2.ConfigProto()
-    worker_config.device_count["CPU"] = 2
-    worker, _ = test_util.create_local_cluster(
-        1, 1, worker_config=worker_config)
-
-    with ops.device("/job:worker/replica:0/task:0/cpu:1"):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    with ops.device("/job:worker/replica:0/task:0/cpu:0"):
-      remote_it = iterator_ops.Iterator.from_string_handle(
-          iterator_3_handle, dataset_3.output_types, dataset_3.output_shapes)
-      get_next_op = remote_it.get_next()
-
-    with session.Session(worker[0].target) as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next_op)
-
-  def _testRemoteIteratorHelper(self, device0, device1, target):
-    with ops.device(device1):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, dataset_3.output_types, dataset_3.output_shapes)
-      return remote_iterator.get_next()
-
-    with ops.device(device0):
-      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-      remote_op = functional_ops.remote_call(
-          args=[iterator_3_handle],
-          Tout=[dtypes.int32],
-          f=_remote_fn,
-          target=target_placeholder)
-
-    with session.Session(target) as sess:
-      elem = sess.run(remote_op, feed_dict={target_placeholder: device1})
-      self.assertEqual(elem, [1])
-      # Fails when target is cpu:0 where the resource is not located.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(remote_op, feed_dict={target_placeholder: device0})
-      elem = sess.run(iterator_3.get_next())
-      self.assertEqual(elem, [2])
-      elem = sess.run(remote_op, feed_dict={target_placeholder: device1})
-      self.assertEqual(elem, [3])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(remote_op, feed_dict={target_placeholder: device1})
-
-  def testRemoteIteratorUsingRemoteCallOp(self):
-    worker_config = config_pb2.ConfigProto()
-    worker_config.device_count["CPU"] = 2
-    worker, _ = test_util.create_local_cluster(
-        1, 1, worker_config=worker_config)
-
-    self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
-                                   "/job:worker/replica:0/task:0/cpu:1",
-                                   worker[0].target)
-
-  def testRemoteIteratorUsingRemoteCallOpCrossProcess(self):
-    workers, _ = test_util.create_local_cluster(2, 1)
-
-    self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
-                                   "/job:worker/replica:0/task:1/cpu:0",
-                                   workers[0].target)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
deleted file mode 100644
index 9d11865dda..0000000000
--- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
+++ /dev/null
@@ -1,625 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import numpy as np
-
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import script_ops
-from tensorflow.python.platform import test
-from tensorflow.python.training import server_lib
-
-
-class IteratorTest(test.TestCase):
-
-  def testAttemptingGradientsRaiseExceptions(self):
-    component = constant_op.constant([1])
-    side = constant_op.constant(0)
-    add = lambda x: x + side
-    dataset = dataset_ops.Dataset.from_tensor_slices(component).map(add)
-    value = dataset.make_one_shot_iterator().get_next()
-    with self.assertRaisesRegexp(LookupError, "No gradient defined"):
-      gradients_impl.gradients(value, component)
-    with self.assertRaisesRegexp(LookupError, "No gradient defined"):
-      gradients_impl.gradients(value, side)
-    with self.assertRaisesRegexp(LookupError, "No gradient defined"):
-      gradients_impl.gradients(value, [component, side])
-
-  def testOneShotIterator(self):
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-                .repeat(14).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      for _ in range(14):
-        for i in range(7):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testOneShotIteratorCaptureByValue(self):
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-    tensor_components = tuple([ops.convert_to_tensor(c) for c in components])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(tensor_components)
-                .map(_map_fn).repeat(14).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      for _ in range(14):
-        for i in range(7):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testOneShotIteratorInsideContainer(self):
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    def within_container():
-      def _map_fn(x, y, z):
-        return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-      iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                  .map(_map_fn).repeat(14).make_one_shot_iterator())
-      return iterator.get_next()
-
-    server = server_lib.Server.create_local_server()
-
-    # Create two iterators within unique containers, and run them to
-    # make sure that the resources aren't shared.
-    #
-    # The test below would fail if cname were the same across both
-    # sessions.
-    for i in range(2):
-      with session.Session(server.target) as sess:
-        cname = "iteration%d" % i
-        with ops.container(cname):
-          get_next = within_container()
-
-        for _ in range(14):
-          for i in range(7):
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testOneShotIteratorNonBlocking(self):
-    dataset = dataset_ops.Dataset.from_tensors([1, 2, 3]).map(lambda x: x * x)
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    # Create a session with a single thread to ensure that the
-    # one-shot iterator initializer does not deadlock.
-    config = config_pb2.ConfigProto(inter_op_parallelism_threads=1,
-                                    use_per_session_threads=True)
-    with session.Session(config=config) as sess:
-      self.assertAllEqual([1, 4, 9], sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-    # Test with multiple threads invoking the one-shot iterator concurrently.
-    with session.Session(config=config) as sess:
-      results = []
-      def consumer_thread():
-        try:
-          results.append(sess.run(next_element))
-        except errors.OutOfRangeError:
-          results.append(None)
-
-      num_threads = 8
-      threads = [
-          self.checkedThread(consumer_thread) for _ in range(num_threads)]
-      for t in threads:
-        t.start()
-      for t in threads:
-        t.join()
-
-      self.assertEqual(num_threads, len(results))
-      self.assertEqual(num_threads - 1,
-                       len([None for r in results if r is None]))
-      self.assertAllEqual([[1, 4, 9]], [r for r in results if r is not None])
-
-  def testOneShotIteratorInitializerFails(self):
-    # Define a dataset whose initialization will always fail.
-    dataset = dataset_ops.Dataset.from_tensors(
-        array_ops.check_numerics(
-            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.test_session() as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-        sess.run(next_element)
-
-      # Test that subsequent attempts to use the iterator also fail.
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-        sess.run(next_element)
-
-    with self.test_session() as sess:
-      def consumer_thread():
-        with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-          sess.run(next_element)
-
-      num_threads = 8
-      threads = [
-          self.checkedThread(consumer_thread) for _ in range(num_threads)]
-      for t in threads:
-        t.start()
-      for t in threads:
-        t.join()
-
-  def testSimpleSharedResource(self):
-    components = (
-        np.array(1, dtype=np.int64),
-        np.array([1, 2, 3], dtype=np.int64),
-        np.array(37.0, dtype=np.float64)
-    )
-
-    server = server_lib.Server.create_local_server()
-
-    # Create two non-overlapping sessions that share the same iterator
-    # resource on the same server, and verify that an action of the
-    # first session (initializing the iterator) is visible in the
-    # second session.
-    with ops.Graph().as_default():
-      iterator = (dataset_ops.Dataset.from_tensors(components)
-                  .map(lambda x, y, z: (x, y, z)).make_initializable_iterator(
-                      shared_name="shared_iterator"))
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-
-      with session.Session(server.target) as sess:
-        sess.run(init_op)
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-        # Re-initialize the iterator in the first session.
-        sess.run(init_op)
-
-    with ops.Graph().as_default():
-      # Re-define the iterator manually, without defining any of the
-      # functions in this graph, to ensure that we are not
-      # accidentally redefining functions with the same names in the
-      # new graph.
-      iterator = iterator_ops.Iterator.from_structure(
-          shared_name="shared_iterator",
-          output_types=(dtypes.int64, dtypes.int64, dtypes.float64),
-          output_shapes=([], [3], []))
-      get_next = iterator.get_next()
-
-      with session.Session(server.target) as sess:
-        # Use the iterator without re-initializing in the second session.
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testNotInitializedError(self):
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    iterator = (dataset_ops.Dataset.from_tensors(components)
-                .make_initializable_iterator())
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      with self.assertRaisesRegexp(errors.FailedPreconditionError,
-                                   "iterator has not been initialized"):
-        sess.run(get_next)
-
-  def testReinitializableIterator(self):
-    dataset_3 = dataset_ops.Dataset.from_tensors(
-        constant_op.constant([1, 2, 3]))
-    dataset_4 = dataset_ops.Dataset.from_tensors(
-        constant_op.constant([4, 5, 6, 7]))
-    iterator = iterator_ops.Iterator.from_structure(dataset_3.output_types,
-                                                    [None])
-
-    dataset_3_init_op = iterator.make_initializer(dataset_3)
-    dataset_4_init_op = iterator.make_initializer(dataset_4)
-    get_next = iterator.get_next()
-
-    self.assertEqual(dataset_3.output_types, iterator.output_types)
-    self.assertEqual(dataset_4.output_types, iterator.output_types)
-    self.assertEqual([None], iterator.output_shapes.as_list())
-
-    with self.test_session() as sess:
-      # The iterator is initially uninitialized.
-      with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(get_next)
-
-      # Initialize with one dataset.
-      sess.run(dataset_3_init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Initialize with a different dataset.
-      sess.run(dataset_4_init_op)
-      self.assertAllEqual([4, 5, 6, 7], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Reinitialize with the first dataset.
-      sess.run(dataset_3_init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testReinitializableIteratorStaticErrors(self):
-    # Non-matching structure for types and shapes.
-    with self.assertRaises(TypeError):
-      iterator = iterator_ops.Iterator.from_structure((dtypes.int64,
-                                                       dtypes.float64), [None])
-
-    # Test validation of dataset argument.
-    iterator = iterator_ops.Iterator.from_structure((dtypes.int64,
-                                                     dtypes.float64))
-
-    # Incompatible structure.
-    with self.assertRaises(ValueError):
-      iterator.make_initializer(
-          dataset_ops.Dataset.from_tensors(((constant_op.constant(
-              [1, 2, 3], dtype=dtypes.int64),), (constant_op.constant(
-                  [4., 5., 6., 7.], dtype=dtypes.float64),))))
-
-    # Incompatible types.
-    with self.assertRaises(TypeError):
-      iterator.make_initializer(
-          dataset_ops.Dataset.from_tensors((constant_op.constant(
-              [1, 2, 3], dtype=dtypes.int32), constant_op.constant(
-                  [4., 5., 6., 7.], dtype=dtypes.float32))))
-
-    # Incompatible shapes.
-    iterator = iterator_ops.Iterator.from_structure(
-        (dtypes.int64, dtypes.float64), ([None], []))
-    with self.assertRaises(TypeError):
-      iterator.make_initializer(
-          dataset_ops.Dataset.from_tensors((constant_op.constant(
-              [1, 2, 3], dtype=dtypes.int64), constant_op.constant(
-                  [4., 5., 6., 7.], dtype=dtypes.float64))))
-
-  def testIteratorStringHandle(self):
-    dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-    dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
-
-    iterator_3 = dataset_3.make_one_shot_iterator()
-    iterator_4 = dataset_4.make_one_shot_iterator()
-
-    handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    feedable_iterator = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dataset_3.output_types, dataset_3.output_shapes)
-    next_element = feedable_iterator.get_next()
-
-    self.assertEqual(dataset_3.output_types, feedable_iterator.output_types)
-    self.assertEqual(dataset_4.output_types, feedable_iterator.output_types)
-    self.assertEqual([], feedable_iterator.output_shapes)
-
-    with self.test_session() as sess:
-      iterator_3_handle = sess.run(iterator_3.string_handle())
-      iterator_4_handle = sess.run(iterator_4.string_handle())
-
-      self.assertEqual(
-          10, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(
-          1, sess.run(next_element,
-                      feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(
-          20, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(
-          2, sess.run(next_element,
-                      feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(
-          30, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(
-          3, sess.run(next_element,
-                      feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(
-          40, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element,
-                 feed_dict={handle_placeholder: iterator_3_handle})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element,
-                 feed_dict={handle_placeholder: iterator_4_handle})
-
-  def testIteratorStringHandleError(self):
-    dataset_int_scalar = (dataset_ops.Dataset.from_tensor_slices([1, 2,
-                                                                  3]).repeat())
-    dataset_float_vector = (dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0]))
-
-    handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    feedable_int_scalar = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dtypes.int32, [])
-    feedable_int_vector = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dtypes.int32, [None])
-    feedable_int_any = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dtypes.int32)
-
-    with self.test_session() as sess:
-      handle_int_scalar = sess.run(
-          dataset_int_scalar.make_one_shot_iterator().string_handle())
-      handle_float_vector = sess.run(
-          dataset_float_vector.make_one_shot_iterator().string_handle())
-
-      self.assertEqual(1,
-                       sess.run(
-                           feedable_int_scalar.get_next(),
-                           feed_dict={handle_placeholder: handle_int_scalar}))
-
-      self.assertEqual(2,
-                       sess.run(
-                           feedable_int_any.get_next(),
-                           feed_dict={handle_placeholder: handle_int_scalar}))
-
-      with self.assertRaises(errors.InvalidArgumentError):
-        print(sess.run(
-            feedable_int_vector.get_next(),
-            feed_dict={handle_placeholder: handle_int_scalar}))
-
-      with self.assertRaises(errors.InvalidArgumentError):
-        print(sess.run(
-            feedable_int_vector.get_next(),
-            feed_dict={handle_placeholder: handle_float_vector}))
-
-  def testRemoteIteratorUsingRemoteCallOpDirectSession(self):
-    worker_config = config_pb2.ConfigProto()
-    worker_config.device_count["CPU"] = 3
-
-    with ops.device("/job:localhost/replica:0/task:0/cpu:1"):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, dataset_3.output_types, dataset_3.output_shapes)
-      return remote_iterator.get_next()
-
-    with ops.device("/job:localhost/replica:0/task:0/cpu:0"):
-      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-      remote_op = functional_ops.remote_call(
-          args=[iterator_3_handle],
-          Tout=[dtypes.int32],
-          f=_remote_fn,
-          target=target_placeholder)
-
-    with self.test_session(config=worker_config) as sess:
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-          })
-      self.assertEqual(elem, [1])
-      # Fails when target is cpu:2 where the resource is not located.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(
-            remote_op,
-            feed_dict={
-                target_placeholder: "/job:localhost/replica:0/task:0/cpu:2"
-            })
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-          })
-      self.assertEqual(elem, [2])
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-          })
-      self.assertEqual(elem, [3])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(
-            remote_op,
-            feed_dict={
-                target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-            })
-
-  def testRemoteIteratorUsingRemoteCallOpDirectSessionGPUCPU(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    with ops.device("/job:localhost/replica:0/task:0/cpu:0"):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    def _encode_raw(byte_array):
-      return bytes(bytearray(byte_array))
-
-    @function.Defun(dtypes.uint8)
-    def _remote_fn(h):
-      handle = script_ops.py_func(_encode_raw, [h], dtypes.string)
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, dataset_3.output_types, dataset_3.output_shapes)
-      return remote_iterator.get_next()
-
-    with ops.device("/job:localhost/replica:0/task:0/device:GPU:0"):
-      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-      iterator_3_handle_uint8 = parsing_ops.decode_raw(
-          bytes=iterator_3_handle, out_type=dtypes.uint8)
-      remote_op = functional_ops.remote_call(
-          args=[iterator_3_handle_uint8],
-          Tout=[dtypes.int32],
-          f=_remote_fn,
-          target=target_placeholder)
-
-    with self.test_session() as sess:
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-          })
-      self.assertEqual(elem, [1])
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-          })
-      self.assertEqual(elem, [2])
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-          })
-      self.assertEqual(elem, [3])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(
-            remote_op,
-            feed_dict={
-                target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-            })
-
-  def testIncorrectIteratorRestore(self):
-
-    def _path():
-      return os.path.join(self.get_temp_dir(), "iterator")
-
-    def _save_op(iterator_resource):
-      iterator_state_variant = gen_dataset_ops.serialize_iterator(
-          iterator_resource)
-      save_op = io_ops.write_file(
-          _path(), parsing_ops.serialize_tensor(iterator_state_variant))
-      return save_op
-
-    def _restore_op(iterator_resource):
-      iterator_state_variant = parsing_ops.parse_tensor(
-          io_ops.read_file(_path()), dtypes.variant)
-      restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
-                                                        iterator_state_variant)
-      return restore_op
-
-    def _build_range_dataset_graph():
-      start = 1
-      stop = 10
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = _save_op(iterator._iterator_resource)
-      restore_op = _restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    def _build_reader_dataset_graph():
-      filenames = ["test"]  # Does not exist but we don't care in this test.
-      iterator = readers.FixedLengthRecordDataset(
-          filenames, 1, 0, 0).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next_op = iterator.get_next()
-      save_op = _save_op(iterator._iterator_resource)
-      restore_op = _restore_op(iterator._iterator_resource)
-      return init_op, get_next_op, save_op, restore_op
-
-    # Saving iterator for RangeDataset graph.
-    with ops.Graph().as_default() as g:
-      init_op, _, save_op, _ = _build_range_dataset_graph()
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(save_op)
-
-    # Attempt to restore the saved iterator into an IteratorResource of
-    # incompatible type. An iterator of RangeDataset has output type int64,
-    # while an iterator of FixedLengthRecordDataset has output type string.
-    # So an InvalidArgumentError should be raised by
-    # IteratorResource::set_iterator.
-    with ops.Graph().as_default() as g:
-      _, _, _, restore_op = _build_reader_dataset_graph()
-      with self.test_session(graph=g) as sess:
-        with self.assertRaises(errors.InvalidArgumentError):
-          sess.run(restore_op)
-
-  def testToSingleElement(self):
-    skip_value = array_ops.placeholder(dtypes.int64, shape=[])
-    take_value = array_ops.placeholder_with_default(
-        constant_op.constant(1, dtype=dtypes.int64), shape=[])
-
-    dataset = (dataset_ops.Dataset.range(100)
-               .skip(skip_value)
-               .map(lambda x: x * x)
-               .take(take_value))
-
-    element = dataset_ops.get_single_element(dataset)
-
-    with self.test_session() as sess:
-      self.assertEqual(0, sess.run(element, feed_dict={skip_value: 0}))
-      self.assertEqual(25, sess.run(element, feed_dict={skip_value: 5}))
-      self.assertEqual(100, sess.run(element, feed_dict={skip_value: 10}))
-
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "Dataset was empty."):
-        sess.run(element, feed_dict={skip_value: 100})
-
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "Dataset had more than one element."):
-        sess.run(element, feed_dict={skip_value: 0, take_value: 2})
-
-
-if __name__ == "__main__":
-  test.main()
-- 
GitLab


From bcd57f5696e9b5ac4903042f43229dd140c5d4f8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 11:31:09 -0800
Subject: [PATCH 845/966] Update llvm revision to r324405, which introduces an
 Orc API change.

PiperOrigin-RevId: 185025122
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index afd371d016..53831db693 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -473,11 +473,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/ee84001a30d264f1f2acc6f8245b9886a3c0401b.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/ee84001a30d264f1f2acc6f8245b9886a3c0401b.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/35ca32d346a6124ec5ddf66bd113d2ffc2ae4b66.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/35ca32d346a6124ec5ddf66bd113d2ffc2ae4b66.tar.gz",
       ],
-      sha256 = "d2b18ec6f1838d837c4cae8adce218630b028a6033aad2b06f2554b2132b264f",
-      strip_prefix = "llvm-ee84001a30d264f1f2acc6f8245b9886a3c0401b",
+      sha256 = "f7f991a25f3b4acfe39520d5a548f01b687d17f0b6ba152634084a48de14be77",
+      strip_prefix = "llvm-35ca32d346a6124ec5ddf66bd113d2ffc2ae4b66",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
   )
 
-- 
GitLab


From 16a2dfc15a530c5d417cd5cf8bbf595eca3bfd4c Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Thu, 8 Feb 2018 11:40:05 -0800
Subject: [PATCH 846/966] Changing ".lite" to ".tflite" in documents.

PiperOrigin-RevId: 185026484
---
 tensorflow/contrib/lite/README.md                  | 10 +++++-----
 .../contrib/lite/toco/g3doc/cmdline_examples.md    | 14 +++++++-------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
index 55a524b207..53140b5473 100644
--- a/tensorflow/contrib/lite/README.md
+++ b/tensorflow/contrib/lite/README.md
@@ -142,7 +142,7 @@ Since we employ several formats, the following definitions may be useful:
 
  - SavedModel - A collection of GraphDef and CheckPoint together with a signature that labels input and output arguments to a model. A GraphDef and Checkpoint can be extracted from a saved model.
 
- - TensorFlow lite model (.lite) - a serialized flatbuffer, containing TensorFlow lite operators and Tensors for the TensorFlow lite interpreter. This is most analogous to TensorFlow frozen GraphDefs.
+ - TensorFlow lite model (.tflite) - a serialized flatbuffer, containing TensorFlow lite operators and Tensors for the TensorFlow lite interpreter. This is most analogous to TensorFlow frozen GraphDefs.
 
 ### Freeze Graph
 To use this .pb GraphDef file within TensorFlow Lite, the application developer will need checkpoints containing trained weight parameters. The .pb contains only the structure of the graph. The process of merging the checkpoint values with the graph structure is known as "freezing" the graph.
@@ -164,9 +164,9 @@ bazel-bin/tensorflow/python/tools/freeze_graph\
 The user has to first build the freeze_graph script using bazel and then run the script.  The input_binary flag has to be enabled to ensure that the protobuf is read and written in binary format.  The user has to input the .pb and the .ckpt files to freeze the graph The output_node_names may not be obvious outside of the code that built the model. The easiest way to find them is to visualize the graph, either with
 graphviz, or [in tensorboard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3).
 
-This frozen Graphdef is now ready to be converted to flatbuffer format (.lite) for use on Android or iOS.  On Android users have the flexibility to use either the float or quantized versions of the frozen graphdef, if available, using the Tensorflow Optimizing Converter tool.
+This frozen Graphdef is now ready to be converted to flatbuffer format (.tflite) for use on Android or iOS.  On Android users have the flexibility to use either the float or quantized versions of the frozen graphdef, if available, using the Tensorflow Optimizing Converter tool.
 
-Here is a sample command line to convert the frozen Graphdef to '.lite' format for  The Tensorflow Optimizing Converter supports both float and quantized models, however, different configuration parameters are needed depending on whether a FLOAT or QUANTIZED mode is being used.
+Here is a sample command line to convert the frozen Graphdef to '.tflite' format for  The Tensorflow Optimizing Converter supports both float and quantized models, however, different configuration parameters are needed depending on whether a FLOAT or QUANTIZED mode is being used.
 (Here is a link to the pb [file](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)).
 
 ```
@@ -175,7 +175,7 @@ bazel build tensorflow/contrib/lite/toco:toco
 bazel-bin/tensorflow/contrib/lite/toco/toco -- \
   --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
   --input_format=TENSORFLOW_GRAPHDEF  --output_format=TFLITE \
-  --output_file=/tmp/mobilenet_v1_1.0_224.lite --inference_type=FLOAT \
+  --output_file=/tmp/mobilenet_v1_1.0_224.tflite --inference_type=FLOAT \
   --input_type=FLOAT --input_arrays=input \
   --output_arrays=MobilenetV1/Predictions/Reshape_1 --input_shapes=1,224,224,3
 ```
@@ -211,7 +211,7 @@ and then visualize the resulting HTML file in a browser.
 
 ## Step 3. Use the TensorFlow Lite model for inference in a mobile app
 
-After completion of Step 2 the developer should have a .lite model.
+After completion of Step 2 the developer should have a .tflite model.
 
 ### For Android
 Because Android apps need to be written in Java, and core TensorFlow is in C++, a JNI library is provided to interface between the two. Its interface is aimed only at inference, so it provides the ability to load a graph, set up inputs, and run the model to calculate particular outputs. The full documentation for the set of methods can be seen [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/). The demo app is also open sourced on [github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
index 7e152f5ba8..372c525589 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
@@ -23,7 +23,7 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=FLOAT \
@@ -101,7 +101,7 @@ direction, let us just give an example of that:
 ```
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/foo.lite \
+  --input_file=/tmp/foo.tflite \
   --output_file=/tmp/foo.pb \
   --input_format=TFLITE \
   --output_format=TENSORFLOW_GRAPHDEF \
@@ -130,7 +130,7 @@ flatbuffer is done like this:
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/some_quantized_graph.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=QUANTIZED_UINT8 \
@@ -207,7 +207,7 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=FLOAT \
@@ -235,7 +235,7 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=FLOAT \
@@ -308,7 +308,7 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=FLOAT \
@@ -415,7 +415,7 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=FLOAT \
-- 
GitLab


From e99724b78b9f6834b918ae8a599597f863cba8d4 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 8 Feb 2018 11:40:28 -0800
Subject: [PATCH 847/966] Adding tf_export decorators/calls to TensorFlow
 functions and constants.

PiperOrigin-RevId: 185026527
---
 .../python/estimator/canned/baseline.py       |  3 ++
 tensorflow/python/estimator/canned/dnn.py     |  3 ++
 .../estimator/canned/dnn_linear_combined.py   |  3 ++
 tensorflow/python/estimator/canned/linear.py  |  3 ++
 .../python/estimator/canned/parsing_utils.py  |  3 ++
 .../python/feature_column/feature_column.py   |  2 +
 .../keras/_impl/keras/engine/topology.py      |  4 ++
 .../keras/_impl/keras/engine/training.py      |  2 +
 .../keras/_impl/keras/preprocessing/image.py  | 15 ++++++
 .../_impl/keras/preprocessing/sequence.py     |  4 ++
 .../keras/_impl/keras/preprocessing/text.py   |  4 ++
 .../keras/_impl/keras/utils/data_utils.py     |  5 ++
 .../keras/_impl/keras/utils/io_utils.py       |  2 +
 .../keras/_impl/keras/utils/layer_utils.py    |  2 +
 .../keras/_impl/keras/utils/np_utils.py       |  3 ++
 .../keras/_impl/keras/utils/vis_utils.py      |  2 +
 .../_impl/keras/wrappers/scikit_learn.py      |  3 ++
 tensorflow/python/ops/histogram_ops.py        |  2 +
 tensorflow/python/summary/summary.py          |  8 +++
 tensorflow/python/summary/summary_iterator.py |  2 +
 tensorflow/python/summary/text_summary.py     |  2 +
 tensorflow/python/util/compat.py              |  2 +
 tensorflow/tools/api/generator/BUILD          | 51 +++++++++++--------
 23 files changed, 109 insertions(+), 21 deletions(-)

diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py
index 138152ac1c..3e92a77543 100644
--- a/tensorflow/python/estimator/canned/baseline.py
+++ b/tensorflow/python/estimator/canned/baseline.py
@@ -59,6 +59,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
 
 # The default learning rate of 0.3 is a historical artifact of the initial
 # implementation, but seems a reasonable choice.
@@ -173,6 +174,7 @@ def _baseline_model_fn(features, labels, mode, head, optimizer,
       train_op_fn=train_op_fn)
 
 
+@tf_export('estimator.BaselineClassifier')
 class BaselineClassifier(estimator.Estimator):
   """A classifier that can establish a simple baseline.
 
@@ -275,6 +277,7 @@ class BaselineClassifier(estimator.Estimator):
         config=config)
 
 
+@tf_export('estimator.BaselineRegressor')
 class BaselineRegressor(estimator.Estimator):
   """A regressor that can establish a simple baseline.
 
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index 0f274a23c0..c29b5cabc7 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
 
 # The default learning rate of 0.05 is a historical artifact of the initial
 # implementation, but seems a reasonable choice.
@@ -198,6 +199,7 @@ def _dnn_model_fn(features,
         logits=logits)
 
 
+@tf_export('estimator.DNNClassifier')
 class DNNClassifier(estimator.Estimator):
   """A classifier for TensorFlow DNN models.
 
@@ -358,6 +360,7 @@ class DNNClassifier(estimator.Estimator):
         warm_start_from=warm_start_from)
 
 
+@tf_export('estimator.DNNRegressor')
 class DNNRegressor(estimator.Estimator):
   """A regressor for TensorFlow DNN models.
 
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index 1a0f4c5c39..0c54013a52 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -37,6 +37,7 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import sync_replicas_optimizer
 from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
 
 # The default learning rates are a historical artifact of the initial
 # implementation.
@@ -225,6 +226,7 @@ def _dnn_linear_combined_model_fn(features,
       logits=logits)
 
 
+@tf_export('estimator.DNNLinearCombinedClassifier')
 class DNNLinearCombinedClassifier(estimator.Estimator):
   """An estimator for TensorFlow Linear and DNN joined classification models.
 
@@ -405,6 +407,7 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
         warm_start_from=warm_start_from)
 
 
+@tf_export('estimator.DNNLinearCombinedRegressor')
 class DNNLinearCombinedRegressor(estimator.Estimator):
   """An estimator for TensorFlow Linear and DNN joined models for regression.
 
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index a5b1172e72..a2f24ef270 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import ftrl
 from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 # The default learning rate of 0.2 is a historical artifact of the initial
@@ -170,6 +171,7 @@ def _linear_model_fn(features, labels, mode, head, feature_columns, optimizer,
         logits=logits)
 
 
+@tf_export('estimator.LinearClassifier')
 class LinearClassifier(estimator.Estimator):
   """Linear classifier model.
 
@@ -322,6 +324,7 @@ class LinearClassifier(estimator.Estimator):
         warm_start_from=warm_start_from)
 
 
+@tf_export('estimator.LinearRegressor')
 class LinearRegressor(estimator.Estimator):
   """An estimator for TensorFlow Linear regression problems.
 
diff --git a/tensorflow/python/estimator/canned/parsing_utils.py b/tensorflow/python/estimator/canned/parsing_utils.py
index f153272947..74e5e5a1be 100644
--- a/tensorflow/python/estimator/canned/parsing_utils.py
+++ b/tensorflow/python/estimator/canned/parsing_utils.py
@@ -23,8 +23,10 @@ import six
 from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('estimator.classifier_parse_example_spec')
 def classifier_parse_example_spec(feature_columns,
                                   label_key,
                                   label_dtype=dtypes.int64,
@@ -164,6 +166,7 @@ def classifier_parse_example_spec(feature_columns,
   return parsing_spec
 
 
+@tf_export('estimator.regressor_parse_example_spec')
 def regressor_parse_example_spec(feature_columns,
                                  label_key,
                                  label_dtype=dtypes.float32,
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 5947d8f6e2..52e42ef018 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -158,6 +158,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _internal_input_layer(features,
@@ -662,6 +663,7 @@ def embedding_column(
       trainable=trainable)
 
 
+@tf_export('feature_column.shared_embedding_columns')
 def shared_embedding_columns(
     categorical_columns, dimension, combiner='mean', initializer=None,
     shared_embedding_collection_name=None, ckpt_to_load_from=None,
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py
index 718c92f408..d1c1d2c8c4 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology.py
@@ -39,6 +39,7 @@ from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.layers import network as tf_network
 from tensorflow.python.layers import utils as tf_layers_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=g-import-not-at-top
@@ -60,6 +61,7 @@ TFBaseLayer = tf_base_layers.Layer
 # pylint: enable=invalid-name
 
 
+@tf_export('keras.layers.Layer')
 class Layer(tf_base_layers.Layer):
   """Abstract base layer class.
 
@@ -490,6 +492,7 @@ class Layer(tf_base_layers.Layer):
     self._activity_regularizer = activity_regularizer
 
 
+@tf_export('keras.layers.InputLayer')
 class InputLayer(tf_network.InputLayer, Layer):
   """Layer to be used as an entry point into a graph.
 
@@ -552,6 +555,7 @@ class InputLayer(tf_network.InputLayer, Layer):
     return config
 
 
+@tf_export('keras.layers.Input', 'keras.Input')
 def Input(  # pylint: disable=invalid-name
     shape=None,
     batch_size=None,
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 43d95b1f19..faca964d4c 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -37,6 +37,7 @@ from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
 from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
+from tensorflow.python.util.tf_export import tf_export
 
 try:
   from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
@@ -563,6 +564,7 @@ def _standardize_weights(y,
       return np.ones((y.shape[0], y.shape[1]), dtype=K.floatx())
 
 
+@tf_export('keras.models.Model', 'keras.Model')
 class Model(Network):
   """The `Model` class adds training & evaluation routines to a `Network`.
   """
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image.py b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
index db1fdd4e6b..d12f108639 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
@@ -32,6 +32,7 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 try:
   from scipy import linalg
@@ -62,6 +63,7 @@ if pil_image is not None:
     _PIL_INTERPOLATION_METHODS['lanczos'] = pil_image.LANCZOS
 
 
+@tf_export('keras.preprocessing.image.random_rotation')
 def random_rotation(x,
                     rg,
                     row_axis=1,
@@ -96,6 +98,7 @@ def random_rotation(x,
   return x
 
 
+@tf_export('keras.preprocessing.image.random_shift')
 def random_shift(x,
                  wrg,
                  hrg,
@@ -132,6 +135,7 @@ def random_shift(x,
   return x
 
 
+@tf_export('keras.preprocessing.image.random_shear')
 def random_shear(x,
                  intensity,
                  row_axis=1,
@@ -166,6 +170,7 @@ def random_shear(x,
   return x
 
 
+@tf_export('keras.preprocessing.image.random_zoom')
 def random_zoom(x,
                 zoom_range,
                 row_axis=1,
@@ -209,6 +214,7 @@ def random_zoom(x,
   return x
 
 
+@tf_export('keras.preprocessing.image.random_channel_shift')
 def random_channel_shift(x, intensity, channel_axis=0):
   x = np.rollaxis(x, channel_axis, 0)
   min_x, max_x = np.min(x), np.max(x)
@@ -230,6 +236,7 @@ def transform_matrix_offset_center(matrix, x, y):
   return transform_matrix
 
 
+@tf_export('keras.preprocessing.image.apply_transform')
 def apply_transform(x,
                     transform_matrix,
                     channel_axis=0,
@@ -267,6 +274,7 @@ def apply_transform(x,
   return x
 
 
+@tf_export('keras.preprocessing.image.flip_axis')
 def flip_axis(x, axis):
   x = np.asarray(x).swapaxes(axis, 0)
   x = x[::-1, ...]
@@ -274,6 +282,7 @@ def flip_axis(x, axis):
   return x
 
 
+@tf_export('keras.preprocessing.image.array_to_img')
 def array_to_img(x, data_format=None, scale=True):
   """Converts a 3D Numpy array to a PIL Image instance.
 
@@ -324,6 +333,7 @@ def array_to_img(x, data_format=None, scale=True):
     raise ValueError('Unsupported channel number: ', x.shape[2])
 
 
+@tf_export('keras.preprocessing.image.img_to_array')
 def img_to_array(img, data_format=None):
   """Converts a PIL Image instance to a Numpy array.
 
@@ -358,6 +368,7 @@ def img_to_array(img, data_format=None):
   return x
 
 
+@tf_export('keras.preprocessing.image.load_img')
 def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
   """Loads an image into PIL format.
 
@@ -411,6 +422,7 @@ def list_pictures(directory, ext='jpg|jpeg|bmp|png|ppm'):
   ]
 
 
+@tf_export('keras.preprocessing.image.ImageDataGenerator')
 class ImageDataGenerator(object):
   """Generate minibatches of image data with real-time data augmentation.
 
@@ -824,6 +836,7 @@ class ImageDataGenerator(object):
           vt.T / np.sqrt(s_expand**2 + self.zca_epsilon)).dot(vt)
 
 
+@tf_export('keras.preprocessing.image.Iterator')
 class Iterator(Sequence):
   """Base class for image data iterators.
 
@@ -913,6 +926,7 @@ class Iterator(Sequence):
     raise NotImplementedError
 
 
+@tf_export('keras.preprocessing.image.NumpyArrayIterator')
 class NumpyArrayIterator(Iterator):
   """Iterator yielding data from a Numpy array.
 
@@ -1094,6 +1108,7 @@ def _list_valid_filenames_in_directory(directory, white_list_formats,
   return classes, filenames
 
 
+@tf_export('keras.preprocessing.image.DirectoryIterator')
 class DirectoryIterator(Iterator):
   """Iterator capable of reading images from a directory on disk.
 
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
index 4d59250af0..a423d96d3d 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
@@ -22,8 +22,10 @@ import random
 
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.preprocessing.sequence.pad_sequences')
 def pad_sequences(sequences,
                   maxlen=None,
                   dtype='int32',
@@ -104,6 +106,7 @@ def pad_sequences(sequences,
   return x
 
 
+@tf_export('keras.preprocessing.sequence.make_sampling_table')
 def make_sampling_table(size, sampling_factor=1e-5):
   """Generates a word rank-based probabilistic sampling table.
 
@@ -137,6 +140,7 @@ def make_sampling_table(size, sampling_factor=1e-5):
   return np.minimum(1., f / np.sqrt(f))
 
 
+@tf_export('keras.preprocessing.sequence.skipgrams')
 def skipgrams(sequence,
               vocabulary_size,
               window_size=4,
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text.py b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
index 8f7f25dc0a..1e3828ccf1 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
@@ -28,6 +28,7 @@ from six.moves import range  # pylint: disable=redefined-builtin
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 if sys.version_info < (3,):
@@ -36,6 +37,7 @@ else:
   maketrans = str.maketrans
 
 
+@tf_export('keras.preprocessing.text.text_to_word_sequence')
 def text_to_word_sequence(text,
                           filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                           lower=True,
@@ -64,6 +66,7 @@ def text_to_word_sequence(text,
   return [i for i in seq if i]
 
 
+@tf_export('keras.preprocessing.text.one_hot')
 def one_hot(text,
             n,
             filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
@@ -129,6 +132,7 @@ def hashing_trick(text,
   return [(hash_function(w) % (n - 1) + 1) for w in seq]
 
 
+@tf_export('keras.preprocessing.text.Tokenizer')
 class Tokenizer(object):
   """Text tokenization utility class.
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils.py b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
index fcee9fbcc3..e87c8f48ef 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
@@ -40,6 +40,7 @@ from six.moves.urllib.error import URLError
 from six.moves.urllib.request import urlopen
 
 from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+from tensorflow.python.util.tf_export import tf_export
 
 
 try:
@@ -138,6 +139,7 @@ def _extract_archive(file_path, path='.', archive_format='auto'):
   return False
 
 
+@tf_export('keras.utils.get_file')
 def get_file(fname,
              origin,
              untar=False,
@@ -318,6 +320,7 @@ def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535):
     return False
 
 
+@tf_export('keras.utils.Sequence')
 class Sequence(object):
   """Base object for fitting to a sequence of data, such as a dataset.
 
@@ -414,6 +417,7 @@ def get_index(uid, i):
   return _SHARED_SEQUENCES[uid][i]
 
 
+@tf_export('keras.utils.SequenceEnqueuer')
 class SequenceEnqueuer(object):
   """Base class to enqueue inputs.
 
@@ -613,6 +617,7 @@ class OrderedEnqueuer(SequenceEnqueuer):
     _SHARED_SEQUENCES[self.uid] = None
 
 
+@tf_export('keras.utils.GeneratorEnqueuer')
 class GeneratorEnqueuer(SequenceEnqueuer):
   """Builds a queue out of a data generator.
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
index b36c769843..bbf1d2a3d9 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@@ -22,6 +22,7 @@ from collections import defaultdict
 import sys
 
 import numpy as np
+from tensorflow.python.util.tf_export import tf_export
 
 
 try:
@@ -30,6 +31,7 @@ except ImportError:
   h5py = None
 
 
+@tf_export('keras.utils.HDF5Matrix')
 class HDF5Matrix(object):
   """Representation of HDF5 dataset to be used instead of a Numpy array.
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
index a2d32424b5..a9c8fa68c9 100644
--- a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
@@ -23,6 +23,7 @@ import numpy as np
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.conv_utils import convert_kernel
+from tensorflow.python.util.tf_export import tf_export
 
 
 def count_params(weights):
@@ -190,6 +191,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
   print_fn('_' * line_length)
 
 
+@tf_export('keras.utils.convert_all_kernels_in_model')
 def convert_all_kernels_in_model(model):
   """Converts all convolution kernels in a model from Theano to TensorFlow.
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/np_utils.py b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
index 231833e776..a611be08aa 100644
--- a/tensorflow/python/keras/_impl/keras/utils/np_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
@@ -18,8 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.utils.to_categorical')
 def to_categorical(y, num_classes=None):
   """Converts a class vector (integers) to binary class matrix.
 
@@ -48,6 +50,7 @@ def to_categorical(y, num_classes=None):
   return categorical
 
 
+@tf_export('keras.utils.normalize')
 def normalize(x, axis=-1, order=2):
   """Normalizes a Numpy array.
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
index 0c5f2c19c7..45c1b92075 100644
--- a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+from tensorflow.python.util.tf_export import tf_export
 
 
 try:
@@ -127,6 +128,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   return dot
 
 
+@tf_export('keras.utils.plot_model')
 def plot_model(model,
                to_file='model.png',
                show_shapes=False,
diff --git a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
index 223ceac3de..2884dc84cc 100644
--- a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
+++ b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
@@ -26,6 +26,7 @@ import numpy as np
 from tensorflow.python.keras._impl.keras.models import Sequential
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
+from tensorflow.python.util.tf_export import tf_export
 
 
 class BaseWrapper(object):
@@ -187,6 +188,7 @@ class BaseWrapper(object):
     return res
 
 
+@tf_export('keras.wrappers.scikit_learn.KerasClassifier')
 class KerasClassifier(BaseWrapper):
   """Implementation of the scikit-learn classifier API for Keras.
   """
@@ -309,6 +311,7 @@ class KerasClassifier(BaseWrapper):
                      'the `model.compile()` method.')
 
 
+@tf_export('keras.wrappers.scikit_learn.KerasRegressor')
 class KerasRegressor(BaseWrapper):
   """Implementation of the scikit-learn regressor API for Keras.
   """
diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py
index f079e56b10..6a975160b0 100644
--- a/tensorflow/python/ops/histogram_ops.py
+++ b/tensorflow/python/ops/histogram_ops.py
@@ -32,8 +32,10 @@ from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('histogram_fixed_width_bins')
 def histogram_fixed_width_bins(values,
                                value_range,
                                nbins=100,
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 92c1fcadd2..b80ad79074 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -72,8 +72,10 @@ from tensorflow.python.summary.writer.writer_cache import FileWriterCache
 
 from tensorflow.python.util import compat as _compat
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('summary.scalar')
 def scalar(name, tensor, collections=None, family=None):
   """Outputs a `Summary` protocol buffer containing a single scalar value.
 
@@ -102,6 +104,7 @@ def scalar(name, tensor, collections=None, family=None):
   return val
 
 
+@tf_export('summary.image')
 def image(name, tensor, max_outputs=3, collections=None, family=None):
   """Outputs a `Summary` protocol buffer with images.
 
@@ -156,6 +159,7 @@ def image(name, tensor, max_outputs=3, collections=None, family=None):
   return val
 
 
+@tf_export('summary.histogram')
 def histogram(name, values, collections=None, family=None):
   # pylint: disable=line-too-long
   """Outputs a `Summary` protocol buffer with a histogram.
@@ -195,6 +199,7 @@ def histogram(name, values, collections=None, family=None):
   return val
 
 
+@tf_export('summary.audio')
 def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
           family=None):
   # pylint: disable=line-too-long
@@ -242,6 +247,7 @@ def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
   return val
 
 
+@tf_export('summary.merge')
 def merge(inputs, collections=None, name=None):
   # pylint: disable=line-too-long
   """Merges summaries.
@@ -286,6 +292,7 @@ def merge(inputs, collections=None, name=None):
   return val
 
 
+@tf_export('summary.merge_all')
 def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None):
   """Merges all summaries collected in the default graph.
 
@@ -318,6 +325,7 @@ def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None):
     return merge(summary_ops)
 
 
+@tf_export('summary.get_summary_description')
 def get_summary_description(node_def):
   """Given a TensorSummary node_def, retrieve its SummaryDescription.
 
diff --git a/tensorflow/python/summary/summary_iterator.py b/tensorflow/python/summary/summary_iterator.py
index 6969c4cf15..321b11ffb7 100644
--- a/tensorflow/python/summary/summary_iterator.py
+++ b/tensorflow/python/summary/summary_iterator.py
@@ -21,8 +21,10 @@ from __future__ import print_function
 
 from tensorflow.core.util import event_pb2
 from tensorflow.python.lib.io import tf_record
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('train.summary_iterator')
 def summary_iterator(path):
   # pylint: disable=line-too-long
   """An iterator for reading `Event` protocol buffers from an event file.
diff --git a/tensorflow/python/summary/text_summary.py b/tensorflow/python/summary/text_summary.py
index 94a85d73e2..6418c847f3 100644
--- a/tensorflow/python/summary/text_summary.py
+++ b/tensorflow/python/summary/text_summary.py
@@ -26,10 +26,12 @@ from __future__ import print_function
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops.summary_ops import tensor_summary
+from tensorflow.python.util.tf_export import tf_export
 
 PLUGIN_NAME = "text"
 
 
+@tf_export("summary.text")
 def text_summary(name, tensor, collections=None):
   """Summarizes textual data.
 
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 7e5f192b8f..4163fcac79 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -42,6 +42,7 @@ import six as _six
 
 from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('compat.as_bytes', 'compat.as_str')
@@ -112,6 +113,7 @@ def as_str_any(value):
     return str(value)
 
 
+@tf_export('compat.path_to_str')
 def path_to_str(path):
   """Returns the file system path representation of a `PathLike` object, else as it is.
 
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index dab1b4b8f4..2d27b94d85 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -41,22 +41,26 @@ genrule(
     # every module exported using tf_export. For e.g. if an op is decorated with
     # @tf_export('module1.module2', 'module3'). Then, outs should include
     # api/module1/module2/__init__.py and api/module3/__init__.py.
+    # keep sorted
     outs = [
         "api/__init__.py",
+        "api/app/__init__.py",
         "api/bitwise/__init__.py",
+        "api/compat/__init__.py",
         "api/contrib/__init__.py",
         "api/contrib/stat_summarizer/__init__.py",
+        "api/data/__init__.py",
         "api/distributions/__init__.py",
         "api/distributions/bijectors/__init__.py",
         "api/errors/__init__.py",
-        "api/image/__init__.py",
-        "api/linalg/__init__.py",
-        "api/nn/__init__.py",
-        "api/spectral/__init__.py",
-        "api/train/__init__.py",
-        "api/app/__init__.py",
+        "api/estimator/__init__.py",
+        "api/estimator/export/__init__.py",
+        "api/estimator/inputs/__init__.py",
+        "api/feature_column/__init__.py",
         "api/gfile/__init__.py",
         "api/graph_util/__init__.py",
+        "api/image/__init__.py",
+        "api/initializers/__init__.py",
         "api/keras/__init__.py",
         "api/keras/backend/__init__.py",
         "api/keras/datasets/__init__.py",
@@ -66,27 +70,25 @@ genrule(
         "api/keras/datasets/imdb/__init__.py",
         "api/keras/datasets/mnist/__init__.py",
         "api/keras/datasets/reuters/__init__.py",
+        "api/keras/initializers/__init__.py",
+        "api/keras/layers/__init__.py",
+        "api/keras/models/__init__.py",
+        "api/keras/preprocessing/__init__.py",
+        "api/keras/preprocessing/image/__init__.py",
+        "api/keras/preprocessing/sequence/__init__.py",
+        "api/keras/preprocessing/text/__init__.py",
         "api/keras/utils/__init__.py",
+        "api/keras/wrappers/__init__.py",
+        "api/keras/wrappers/scikit_learn/__init__.py",
+        "api/linalg/__init__.py",
         "api/logging/__init__.py",
-        "api/resource_loader/__init__.py",
-        "api/sysconfig/__init__.py",
-        "api/test/__init__.py",
-        "api/initializers/__init__.py",
-        "api/keras/initializers/__init__.py",
+        "api/losses/__init__.py",
         "api/metrics/__init__.py",
+        "api/nn/__init__.py",
         "api/nn/rnn_cell/__init__.py",
-        "api/sets/__init__.py",
-        "api/summary/__init__.py",
-        "api/train/queue_runner/__init__.py",
-        "api/compat/__init__.py",
-        "api/data/__init__.py",
-        "api/estimator/__init__.py",
-        "api/estimator/export/__init__.py",
-        "api/estimator/inputs/__init__.py",
-        "api/feature_column/__init__.py",
-        "api/losses/__init__.py",
         "api/profiler/__init__.py",
         "api/python_io/__init__.py",
+        "api/resource_loader/__init__.py",
         "api/saved_model/__init__.py",
         "api/saved_model/builder/__init__.py",
         "api/saved_model/constants/__init__.py",
@@ -96,6 +98,13 @@ genrule(
         "api/saved_model/signature_def_utils/__init__.py",
         "api/saved_model/tag_constants/__init__.py",
         "api/saved_model/utils/__init__.py",
+        "api/sets/__init__.py",
+        "api/spectral/__init__.py",
+        "api/summary/__init__.py",
+        "api/sysconfig/__init__.py",
+        "api/test/__init__.py",
+        "api/train/__init__.py",
+        "api/train/queue_runner/__init__.py",
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
-- 
GitLab


From 52d51aae52d978a66242a4a2f3342aab7e112443 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 12:04:37 -0800
Subject: [PATCH 848/966] Expose python:platform build target.

PiperOrigin-RevId: 185030536
---
 tensorflow/python/BUILD | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index f5cd7885e7..f563d32388 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1,5 +1,8 @@
 # Description:
 # Python support for TensorFlow.
+#
+# Public targets:
+#  ":platform" - Low-level and platform-specific Python code.
 
 package(
     default_visibility = [
@@ -132,6 +135,7 @@ py_library(
         ],
     ) + ["platform/build_info.py"],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":lib",
         ":pywrap_tensorflow",
-- 
GitLab


From 88e18c3ef01ee0f63f4f0df61311773f405ba375 Mon Sep 17 00:00:00 2001
From: Jerome <jvgonzalvo@up.edu.ph>
Date: Fri, 9 Feb 2018 04:16:38 +0800
Subject: [PATCH 849/966] Move some ndlstm functions to contrib (#16816)

* Added ctc_loss_dense_labels. This does the conversion of dense labels into sparse ones to be passed into the core ctc_loss function.

* Removed constant_op from the import.

* Matched ctc_loss_dense_labels with the other layers ops.

* Added ctc_loss_dense_labels to contrib.layers __init__.py file

* Added missing comma to list of ops.

* Reordred arguments for ctc_loss_dense_labels

Labels should be first then inputs for ctc_loss.

* Removed ctc_loss_dense_labels.

Replaced it with dense_to_sparse instead so that there'll be only one ctc_loss function.

* Replaced ctc_loss_dense_labels with dense_to_sparse

* Fixed dense_to_sparse. Some of the names of the variables did not match with that of the parameters.

* Updated documentation for dense_to_sparse since it can accept a tensor of any shape.

* Added test case for dense_to_sparse.

* Updated documentation. Dense to sparse accepts int tensors.

* Fixed testDenseFromConstantToSparse.

The sparse_to_dense order of arguments in the test are wrong and the expected constant should be of int64.

* Modified implementation of ndlstm_base_dynamic.

It now uses a BasicLSTMCell that has state_is_tuple=True to address deprecation. Right now it is still unknown why it was set to false in the first place.

* Imported lstm1d and lstm2d in ndlstm __init__.py.

Makes importing ndlstm modules easier.

* Added testGetBlocks in lstm2d_test.

* Removed testGetBlocks.py

* Modified lstm1d.ndlstm_base_unrolled to use lstm_cell with state_is_tuple = True.

* Copied some lstm2d.py functions in ndlstm module to contrib.layers.

* Update lstm1d.py

Reverted changes made.

* Update layers_test.py

Fixed failing test.

* Modified layers.py and layers_test.py.

Made them pass pylint tests.
---
 tensorflow/contrib/layers/__init__.py         |  2 +
 .../contrib/layers/python/layers/layers.py    | 70 +++++++++++++++++--
 .../layers/python/layers/layers_test.py       | 49 +++++++++++++
 3 files changed, 115 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index ef419862b4..337c9e06b8 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -35,6 +35,7 @@ See the @{$python/contrib.layers} guide.
 @@fully_connected
 @@GDN
 @@gdn
+@@images_to_sequence
 @@layer_norm
 @@linear
 @@max_pool2d
@@ -50,6 +51,7 @@ See the @{$python/contrib.layers} guide.
 @@scale_gradient
 @@separable_conv2d
 @@separable_convolution2d
+@@sequence_to_images
 @@softmax
 @@spatial_softmax
 @@stack
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index fb7b2e315e..9a69d9ec04 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -60,12 +60,12 @@ __all__ = [
     'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution',
     'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose',
     'convolution3d', 'convolution3d_transpose', 'dense_to_sparse',
-    'dropout', 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn', 'layer_norm',
-    'linear', 'pool', 'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu',
-    'relu6', 'repeat', 'scale_gradient', 'separable_conv2d',
-    'separable_convolution2d', 'softmax', 'spatial_softmax', 'stack',
-    'unit_norm', 'legacy_fully_connected', 'legacy_linear', 'legacy_relu',
-    'maxout'
+    'dropout', 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn',
+    'images_to_sequence', 'layer_norm', 'linear', 'pool', 'max_pool2d',
+    'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat',
+    'scale_gradient', 'separable_conv2d', 'separable_convolution2d',
+    'sequence_to_images', 'softmax', 'spatial_softmax', 'stack', 'unit_norm',
+    'legacy_fully_connected', 'legacy_linear', 'legacy_relu', 'maxout'
 ]
 
 DATA_FORMAT_NCHW = 'NCHW'
@@ -2187,6 +2187,34 @@ def layer_norm(inputs,
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
 
+@add_arg_scope
+def images_to_sequence(inputs, data_format=DATA_FORMAT_NHWC,
+                       outputs_collections=None, scope=None):
+  """Convert a batch of images into a batch of sequences.
+  Args:
+    inputs: a (num_images, height, width, depth) tensor
+    data_format: A string. `NHWC` (default) and `NCHW` are supported.
+    outputs_collections: The collections to which the outputs are added.
+    scope: Optional scope for name_scope.
+  Returns:
+    (width, num_images*height, depth) sequence tensor
+  """
+  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
+    raise ValueError('data_format has to be either NCHW or NHWC.')
+  with ops.name_scope(scope, 'ImagesToSequence', [inputs]) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    if df == 'channels_first':
+      inputs = array_ops.transpose(inputs, [0, 2, 3, 1])
+    _, _, width, depth = inputs.get_shape().as_list()
+    s = array_ops.shape(inputs)
+    batch_size, height = s[0], s[1]
+    transposed = array_ops.transpose(inputs, [2, 0, 1, 3])
+    outputs = array_ops.reshape(transposed, [width, batch_size * height, depth])
+    return utils.collect_named_outputs(outputs_collections, sc, outputs)
+
+
 @add_arg_scope
 def max_pool2d(inputs,
                kernel_size,
@@ -2666,6 +2694,36 @@ def separable_convolution2d(
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
 
+@add_arg_scope
+def sequence_to_images(inputs, height, output_data_format='channels_last',
+                       outputs_collections=None, scope=None):
+  """Convert a batch of sequences into a batch of images.
+  Args:
+    inputs: (num_steps, num_batches, depth) sequence tensor
+    height: the height of the images
+    output_data_format: Format of output tensor.
+      Currently supports `'channels_first'` and `'channels_last'`.
+    outputs_collections: The collections to which the outputs are added.
+    scope: Optional scope for name_scope.
+  Returns:
+    A tensor representing the output of the operation.
+  """
+  with ops.name_scope(scope, 'SequenceToImages', [inputs]) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    width, num_batches, depth = inputs.get_shape().as_list()
+    if num_batches is None:
+      num_batches = -1
+    else:
+      num_batches = num_batches // height
+    reshaped = array_ops.reshape(inputs,
+                                 [width, num_batches, height, depth])
+    if output_data_format == 'channels_first':
+      outputs = array_ops.transpose(reshaped, [1, 3, 2, 0])
+    else:
+      outputs = array_ops.transpose(reshaped, [1, 2, 0, 3])
+    return utils.collect_named_outputs(outputs_collections, sc, outputs)
+
+
 @add_arg_scope
 def softmax(logits, scope=None):
   """Performs softmax on Nth dimension of N-dimensional logit tensor.
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 2bd6c6eb58..8a75607e1c 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -2949,6 +2949,28 @@ class GDNTest(test.TestCase):
     self.assertAllClose(y, x * np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6)
 
 
+class ImagesToSequenceTest(test.TestCase):
+
+  def testInvalidDataFormat(self):
+    height, width = 7, 11
+    images = np.random.uniform(size=(5, height, width, 2))
+    with self.assertRaisesRegexp(ValueError,
+                                 'data_format has to be either NCHW or NHWC.'):
+      _layers.images_to_sequence(images, data_format='CHWN')
+
+  def testImagesToSequenceDims(self):
+    height, width = 7, 11
+    images = np.random.uniform(size=(2, height, width, 5)).astype(np.float32)
+    output = _layers.images_to_sequence(images)
+    self.assertListEqual(output.get_shape().as_list(), [11, 14, 5])
+
+  def testImagesToSequenceNCHW(self):
+    height, width = 7, 11
+    images = np.random.uniform(size=(2, 5, height, width)).astype(np.float32)
+    output = _layers.images_to_sequence(images, data_format='NCHW')
+    self.assertListEqual(output.get_shape().as_list(), [11, 14, 5])
+
+
 class MaxPool2DTest(test.TestCase):
 
   def testInvalidDataFormat(self):
@@ -3417,6 +3439,33 @@ class ScaleGradientTests(test.TestCase):
       np.testing.assert_array_equal([3 * 2], g_x.eval())
 
 
+class SequenceToImagesTest(test.TestCase):
+
+  def testImagesToSequenceDims(self):
+    num_batches = 14
+    num_time_steps = 11
+    num_channels = 5
+    desired_height = 7
+    sequence = np.random.uniform(size=(num_time_steps,
+                                       num_batches,
+                                       num_channels)).astype(np.float32)
+    output = _layers.sequence_to_images(sequence, desired_height)
+    self.assertListEqual(output.get_shape().as_list(), [2, 7, 11, 5])
+
+  def testImagesToSequenceNCHW(self):
+    num_batches = 14
+    num_time_steps = 11
+    num_channels = 5
+    desired_height = 7
+    sequence = np.random.uniform(size=(num_time_steps,
+                                       num_batches,
+                                       num_channels)).astype(np.float32)
+    output = _layers.sequence_to_images(sequence,
+                                        desired_height,
+                                        output_data_format='channels_first')
+    self.assertListEqual(output.get_shape().as_list(), [2, 5, 7, 11])
+
+
 class SoftmaxTests(test.TestCase):
 
   def setUp(self):
-- 
GitLab


From d6ff644382fc6afb193dc328eea3de0e2e4f3bb4 Mon Sep 17 00:00:00 2001
From: Johnson145 <Johnson145@users.noreply.github.com>
Date: Thu, 8 Feb 2018 21:16:52 +0100
Subject: [PATCH 850/966] fix typo (#16812)

---
 tensorflow/contrib/lite/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
index 55a524b207..3b3166d823 100644
--- a/tensorflow/contrib/lite/README.md
+++ b/tensorflow/contrib/lite/README.md
@@ -172,7 +172,7 @@ Here is a sample command line to convert the frozen Graphdef to '.lite' format f
 ```
 bazel build tensorflow/contrib/lite/toco:toco
 
-bazel-bin/tensorflow/contrib/lite/toco/toco -- \
+bazel-bin/tensorflow/contrib/lite/toco/toco \
   --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
   --input_format=TENSORFLOW_GRAPHDEF  --output_format=TFLITE \
   --output_file=/tmp/mobilenet_v1_1.0_224.lite --inference_type=FLOAT \
-- 
GitLab


From 5e23338fec26f0c5ad588742b8df80e5bf1a940d Mon Sep 17 00:00:00 2001
From: matthieudelaro <matthieudelaro@users.noreply.github.com>
Date: Thu, 8 Feb 2018 21:17:37 +0100
Subject: [PATCH 851/966] Fix missing . (#16460)

---
 tensorflow/docs_src/extend/tool_developers/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/extend/tool_developers/index.md b/tensorflow/docs_src/extend/tool_developers/index.md
index f02cd23be8..f5440bc96d 100644
--- a/tensorflow/docs_src/extend/tool_developers/index.md
+++ b/tensorflow/docs_src/extend/tool_developers/index.md
@@ -156,7 +156,7 @@ of checkpoints and freezes them together into a single file.
 
 What this does is load the `GraphDef`, pull in the values for all the variables
 from the latest checkpoint file, and then replace each `Variable` op with a
-`Const` that has the numerical data for the weights stored in its attributes
+`Const` that has the numerical data for the weights stored in its attributes.
 It then strips away all the extraneous nodes that aren't used for forward
 inference, and saves out the resulting `GraphDef` into an output file.
 
-- 
GitLab


From 5e10de83a4586c85bbbca313a878987f606fb00b Mon Sep 17 00:00:00 2001
From: Ian Langmore <langmore@google.com>
Date: Thu, 8 Feb 2018 12:15:32 -0800
Subject: [PATCH 852/966] Add effective_sample_size to
 tf.contrib.bayesflow.mcmc_diagnostics. Also, start dealing with list args in
 a more regular manner.

PiperOrigin-RevId: 185032115
---
 tensorflow/contrib/bayesflow/BUILD            |   1 +
 .../kernel_tests/mcmc_diagnostics_test.py     | 156 ++++++++-
 .../bayesflow/python/ops/mcmc_diagnostics.py  |   1 +
 .../python/ops/mcmc_diagnostics_impl.py       | 299 ++++++++++++++----
 4 files changed, 393 insertions(+), 64 deletions(-)

diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index 34156c28fe..8c856bb0b7 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -144,6 +144,7 @@ cuda_py_test(
     additional_deps = [
         ":bayesflow_py",
         "//third_party/py/numpy",
+        "//tensorflow/python:spectral_ops_test_util",
         "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py
index 7652b6a7ce..d68fc9081a 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py
@@ -22,11 +22,165 @@ import numpy as np
 
 from tensorflow.contrib.bayesflow.python.ops import mcmc_diagnostics_impl as mcmc_diagnostics
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import spectral_ops_test_util
 from tensorflow.python.platform import test
 
 rng = np.random.RandomState(42)
 
 
+class _EffectiveSampleSizeTest(object):
+
+  @property
+  def use_static_shape(self):
+    raise NotImplementedError(
+        "Subclass failed to implement `use_static_shape`.")
+
+  def _check_versus_expected_effective_sample_size(self,
+                                                   x_,
+                                                   expected_ess,
+                                                   sess,
+                                                   atol=1e-2,
+                                                   rtol=1e-2,
+                                                   max_lags_threshold=None,
+                                                   max_lags=None):
+    x = array_ops.placeholder_with_default(
+        input=x_, shape=x_.shape if self.use_static_shape else None)
+    ess = mcmc_diagnostics.effective_sample_size(
+        x, max_lags_threshold=max_lags_threshold, max_lags=max_lags)
+    if self.use_static_shape:
+      self.assertAllEqual(x.shape[1:], ess.shape)
+
+    ess_ = sess.run(ess)
+
+    self.assertAllClose(
+        np.ones_like(ess_) * expected_ess, ess_, atol=atol, rtol=rtol)
+
+  def testIidRank1NormalHasFullEssMaxLags10(self):
+    # With a length 5000 iid normal sequence, and max_lags = 10, we should
+    # have a good estimate of ESS, and it should be close to the full sequence
+    # length of 5000.
+    # The choice of max_lags = 10 is a short cutoff, reasonable only since we
+    # know the correlation length should be zero right away.
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=rng.randn(5000).astype(np.float32),
+            expected_ess=5000,
+            sess=sess,
+            max_lags=10,
+            rtol=0.3)
+
+  def testIidRank2NormalHasFullEssMaxLags10(self):
+    # See similar test for Rank1Normal for reasoning.
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=rng.randn(5000, 2).astype(np.float32),
+            expected_ess=5000,
+            sess=sess,
+            max_lags=10,
+            rtol=0.3)
+
+  def testIidRank1NormalHasFullEssMaxLagThresholdZero(self):
+    # With a length 5000 iid normal sequence, and max_lags_threshold = 0,
+    # we should have a super-duper estimate of ESS, and it should be very close
+    # to the full sequence length of 5000.
+    # The choice of max_lags_cutoff = 0 means we cutoff as soon as the auto-corr
+    # is below zero.  This should happen very quickly, due to the fact that the
+    # theoretical auto-corr is [1, 0, 0,...]
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=rng.randn(5000).astype(np.float32),
+            expected_ess=5000,
+            sess=sess,
+            max_lags_threshold=0.,
+            rtol=0.1)
+
+  def testIidRank2NormalHasFullEssMaxLagThresholdZero(self):
+    # See similar test for Rank1Normal for reasoning.
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=rng.randn(5000, 2).astype(np.float32),
+            expected_ess=5000,
+            sess=sess,
+            max_lags_threshold=0.,
+            rtol=0.1)
+
+  def testLength10CorrelationHasEssOneTenthTotalLengthUsingMaxLags50(self):
+    # Create x_, such that
+    #   x_[i] = iid_x_[0], i = 0,...,9
+    #   x_[i] = iid_x_[1], i = 10,..., 19,
+    #   and so on.
+    iid_x_ = rng.randn(5000, 1).astype(np.float32)
+    x_ = (iid_x_ * np.ones((5000, 10)).astype(np.float32)).reshape((50000,))
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=x_,
+            expected_ess=50000 // 10,
+            sess=sess,
+            max_lags=50,
+            rtol=0.2)
+
+  def testLength10CorrelationHasEssOneTenthTotalLengthUsingMaxLagsThresholdZero(
+      self):
+    # Create x_, such that
+    #   x_[i] = iid_x_[0], i = 0,...,9
+    #   x_[i] = iid_x_[1], i = 10,..., 19,
+    #   and so on.
+    iid_x_ = rng.randn(5000, 1).astype(np.float32)
+    x_ = (iid_x_ * np.ones((5000, 10)).astype(np.float32)).reshape((50000,))
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=x_,
+            expected_ess=50000 // 10,
+            sess=sess,
+            max_lags_threshold=0.,
+            rtol=0.1)
+
+  def testListArgs(self):
+    # x_ has correlation length 10 ==> ESS = N / 10
+    # y_ has correlation length 1  ==> ESS = N
+    iid_x_ = rng.randn(5000, 1).astype(np.float32)
+    x_ = (iid_x_ * np.ones((5000, 10)).astype(np.float32)).reshape((50000,))
+    y_ = rng.randn(50000).astype(np.float32)
+    states = [x_, x_, y_, y_]
+    max_lags_threshold = [0., None, 0., None]
+    max_lags = [None, 5, None, 5]
+
+    # See other tests for reasoning on tolerance.
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        ess = mcmc_diagnostics.effective_sample_size(
+            states,
+            max_lags_threshold=max_lags_threshold,
+            max_lags=max_lags)
+        ess_ = sess.run(ess)
+    self.assertAllEqual(4, len(ess_))
+
+    self.assertAllClose(50000 // 10, ess_[0], rtol=0.3)
+    self.assertAllClose(50000 // 10, ess_[1], rtol=0.3)
+    self.assertAllClose(50000, ess_[2], rtol=0.1)
+    self.assertAllClose(50000, ess_[3], rtol=0.1)
+
+
+class EffectiveSampleSizeStaticTest(test.TestCase, _EffectiveSampleSizeTest):
+
+  @property
+  def use_static_shape(self):
+    return True
+
+
+class EffectiveSampleSizeDynamicTest(test.TestCase, _EffectiveSampleSizeTest):
+
+  @property
+  def use_static_shape(self):
+    return False
+
+
 class _PotentialScaleReductionTest(object):
 
   @property
@@ -48,7 +202,7 @@ class _PotentialScaleReductionTest(object):
     state_1 = rng.randn(n_samples, 3, 4) + offset
 
     rhat = mcmc_diagnostics.potential_scale_reduction(
-        state=[state_0, state_1], independent_chain_ndims=1)
+        chains_states=[state_0, state_1], independent_chain_ndims=1)
 
     self.assertIsInstance(rhat, list)
     with self.test_session() as sess:
diff --git a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py
index 5f3e6ade70..f3a645eafc 100644
--- a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py
+++ b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.bayesflow.python.ops.mcmc_diagnostics_impl import *
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
+    "effective_sample_size",
     "potential_scale_reduction",
 ]
 
diff --git a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py
index 3b6f92463e..bb8b915a9b 100644
--- a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Utilities for Markov Chain Monte Carlo (MCMC) sampling.
 
+@@effective_sample_size
 @@potential_scale_reduction
 """
 
@@ -21,20 +22,189 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.distributions.python.ops import sample_stats
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 
 __all__ = [
+    "effective_sample_size",
     "potential_scale_reduction",
 ]
 
 
-def potential_scale_reduction(state, independent_chain_ndims=1, name=None):
+def effective_sample_size(states,
+                          max_lags_threshold=None,
+                          max_lags=None,
+                          name=None):
+  """Estimate a lower bound on effective sample size for each independent chain.
+
+  Roughly speaking, the "effective sample size" (ESS) is the size of an iid
+  sample with the same variance as `state`.
+
+  More precisely, given a stationary sequence of possibly correlated random
+  variables `X_1, X_2,...,X_N`, each identically distributed ESS is the number
+  such that
+
+  ```Variance{ N**-1 * Sum{X_i} } = ESS**-1 * Variance{ X_1 }.```
+
+  If the sequence is uncorrelated, `ESS = N`.  In general, one should expect
+  `ESS <= N`, with more highly correlated sequences having smaller `ESS`.
+
+  #### Example of using ESS to estimate standard error.
+
+  ```
+  tfd = tf.contrib.distributions
+  tfb = tf.contrib.bayesflow
+
+  target = tfd.MultivariateNormalDiag(scale_diag=[1., 2.])
+
+  # Get 1000 states from one chain.
+  states = tfb.hmc.sample_chain(
+      num_results=1000,
+      target_log_prob_fn=target.log_prob,
+      current_state=tf.constant([0., 0.]),
+      step_size=0.05,
+      num_leapfrog_steps=20,
+      num_burnin_steps=200)
+  states.shape
+  ==> (1000, 2)
+
+  ess = effective_sample_size(states)
+  ==> Shape (2,) Tensor
+
+  mean, variance = tf.nn.moments(states, axis=0)
+  standard_error = tf.sqrt(variance / ess)
+  ```
+
+  Some math shows that, with `R_k` the auto-correlation sequence,
+  `R_k := Covariance{X_1, X_{1+k}} / Variance{X_1}`, we have
+
+  ```ESS(N) =  N / [ 1 + 2 * ( (N - 1) / N * R_1 + ... + 1 / N * R_{N-1}  ) ]```
+
+  This function estimates the above by first estimating the auto-correlation.
+  Since `R_k` must be estimated using only `N - k` samples, it becomes
+  progressively noisier for larger `k`.  For this reason, the summation over
+  `R_k` should be truncated at some number `max_lags < N`.  Since many MCMC
+  methods generate chains where `R_k > 0`, a reasonable critera is to truncate
+  at the first index where the estimated auto-correlation becomes negative.
+
+  Args:
+    states:  `Tensor` or list of `Tensor` objects.  Dimension zero should index
+      identically distributed states.
+    max_lags_threshold:  `Tensor` or list of `Tensor` objects.
+      Must broadcast with `state`.  The auto-correlation sequence is truncated
+      after the first appearance of a term less than `max_lags_threshold`.  If
+      both `max_lags` and `max_lags_threshold` are `None`,
+      `max_lags_threshold` defaults to `0`.
+    max_lags:  `Tensor` or list of `Tensor` objects.  Must be `int`-like and
+      scalar valued.  The auto-correlation sequence is truncated to this length.
+      May be provided only if `max_lags_threshold` is not.
+    name:  `String` name to prepend to created ops.
+
+  Returns:
+    ess:  `Tensor` or list of `Tensor` objects.  The effective sample size of
+      each component of `states`.  Shape will be `states.shape[1:]`.
+
+  Raises:
+    ValueError:  If `states` and `max_lags_threshold` or `states` and `max_lags`
+      are both lists with different lengths.
+  """
+  states_was_list = _is_list_like(states)
+
+  # Convert all args to lists.
+  if not states_was_list:
+    states = [states]
+
+  max_lags = _broadcast_maybelist_arg(states, max_lags, "max_lags")
+  max_lags_threshold = _broadcast_maybelist_arg(states, max_lags_threshold,
+                                                "max_lags_threshold")
+
+  # Process items, one at a time.
+  with ops.name_scope(name, "effective_sample_size"):
+    ess_list = [
+        _effective_sample_size_single_state(s, ml, mlt)
+        for (s, ml, mlt) in zip(states, max_lags, max_lags_threshold)
+    ]
+
+  if states_was_list:
+    return ess_list
+  return ess_list[0]
+
+
+def _effective_sample_size_single_state(states, max_lags, max_lags_threshold):
+  """ESS computation for one single Tensor argument."""
+  if max_lags is not None and max_lags_threshold is not None:
+    raise ValueError(
+        "Expected at most one of max_lags, max_lags_threshold to be provided.  "
+        "Found: {}, {}".format(max_lags, max_lags_threshold))
+
+  if max_lags_threshold is None:
+    max_lags_threshold = 0.
+
+  with ops.name_scope(
+      "effective_sample_size_single_state",
+      values=[states, max_lags, max_lags_threshold]):
+
+    states = ops.convert_to_tensor(states, name="states")
+    dt = states.dtype
+
+    if max_lags is not None:
+      auto_corr = sample_stats.auto_correlation(
+          states, axis=0, max_lags=max_lags)
+    elif max_lags_threshold is not None:
+      max_lags_threshold = ops.convert_to_tensor(
+          max_lags_threshold, dtype=dt, name="max_lags_threshold")
+      auto_corr = sample_stats.auto_correlation(states, axis=0)
+      # Get a binary mask to zero out values of auto_corr below the threshold.
+      #   mask[i, ...] = 1 if auto_corr[j, ...] > threshold for all j <= i,
+      #   mask[i, ...] = 0, otherwise.
+      # So, along dimension zero, the mask will look like [1, 1, ..., 0, 0,...]
+      # Building step by step,
+      #   Assume auto_corr = [1, 0.5, 0.0, 0.3], and max_lags_threshold = 0.2.
+      # Step 1:  mask = [False, False, True, False]
+      mask = auto_corr < max_lags_threshold
+      # Step 2:  mask = [0, 0, 1, 1]
+      mask = math_ops.cast(mask, dtype=dt)
+      # Step 3:  mask = [0, 0, 1, 2]
+      mask = math_ops.cumsum(mask, axis=0)
+      # Step 4:  mask = [1, 1, 0, 0]
+      mask = math_ops.maximum(1. - mask, 0.)
+      auto_corr *= mask
+    else:
+      auto_corr = sample_stats.auto_correlation(states, axis=0)
+
+    # With R[k] := auto_corr[k, ...],
+    # ESS = N / {1 + 2 * Sum_{k=1}^N (N - k) / N * R[k]}
+    #     = N / {-1 + 2 * Sum_{k=0}^N (N - k) / N * R[k]} (since R[0] = 1)
+    #     approx N / {-1 + 2 * Sum_{k=0}^M (N - k) / N * R[k]}
+    #, where M is the max_lags truncation point chosen above.
+
+    # Get the factor (N - k) / N, and give it shape [M, 1,...,1], having total
+    # ndims the same as auto_corr
+    n = _axis_size(states, axis=0)
+    k = math_ops.range(0., _axis_size(auto_corr, axis=0))
+    nk_factor = (n - k) / n
+    if auto_corr.shape.ndims is not None:
+      new_shape = [-1] + [1] * (auto_corr.shape.ndims - 1)
+    else:
+      new_shape = array_ops.concat(
+          ([-1],
+           array_ops.ones([array_ops.rank(auto_corr) - 1], dtype=dtypes.int32)),
+          axis=0)
+    nk_factor = array_ops.reshape(nk_factor, new_shape)
+
+    return n / (-1 + 2 * math_ops.reduce_sum(nk_factor * auto_corr, axis=0))
+
+
+def potential_scale_reduction(chains_states,
+                              independent_chain_ndims=1,
+                              name=None):
   """Gelman and Rubin's potential scale reduction factor for chain convergence.
 
-  Given `N > 1` samples from each of `C > 1` independent chains, the potential
+  Given `N > 1` states from each of `C > 1` independent chains, the potential
   scale reduction factor, commonly referred to as R-hat, measures convergence of
   the chains (to the same target) by testing for equality of means.
   Specifically, R-hat measures the degree to which variance (of the means)
@@ -71,18 +241,18 @@ def potential_scale_reduction(state, independent_chain_ndims=1, name=None):
   ==> (10, 2)
 
   # Get 1000 samples from the 10 independent chains.
-  state = tfb.hmc.sample_chain(
+  chains_states, _ = tfb.hmc.sample_chain(
       num_results=1000,
       target_log_prob_fn=target.log_prob,
       current_state=initial_state,
       step_size=0.05,
       num_leapfrog_steps=20,
       num_burnin_steps=200)
-  state.shape
+  chains_states.shape
   ==> (1000, 10, 2)
 
   rhat = tfb.mcmc_diagnostics.potential_scale_reduction(
-      state, independent_chain_ndims=1)
+      chains_states, independent_chain_ndims=1)
 
   # The second dimension needed a longer burn-in.
   rhat.eval()
@@ -108,9 +278,9 @@ def potential_scale_reduction(state, independent_chain_ndims=1, name=None):
       Journal of Computational and Graphical Statistics, 1998. Vol 7, No. 4.
 
   Args:
-    state:  `Tensor` or Python `list` of `Tensor`s representing the state(s) of
-      a Markov Chain at each result step.  The `ith` state is assumed to have
-      shape `[Ni, Ci1, Ci2,...,CiD] + A`.
+    chains_states:  `Tensor` or Python `list` of `Tensor`s representing the
+      state(s) of a Markov Chain at each result step.  The `ith` state is
+      assumed to have shape `[Ni, Ci1, Ci2,...,CiD] + A`.
       Dimension `0` indexes the `Ni > 1` result steps of the Markov Chain.
       Dimensions `1` through `D` index the `Ci1 x ... x CiD` independent
       chains to be tested for convergence to the same target.
@@ -129,6 +299,10 @@ def potential_scale_reduction(state, independent_chain_ndims=1, name=None):
   Raises:
     ValueError:  If `independent_chain_ndims < 1`.
   """
+  chains_states_was_list = _is_list_like(chains_states)
+  if not chains_states_was_list:
+    chains_states = [chains_states]
+
   # tensor_util.constant_value returns None iff a constant value (as a numpy
   # array) is not efficiently computable.  Therefore, we try constant_value then
   # check for None.
@@ -140,66 +314,53 @@ def potential_scale_reduction(state, independent_chain_ndims=1, name=None):
       raise ValueError(
           "Argument `independent_chain_ndims` must be `>= 1`, found: {}".format(
               independent_chain_ndims))
-  with ops.name_scope(
-      name,
-      "potential_scale_reduction",
-      values=[state, independent_chain_ndims]):
-    if _is_list_like(state):
-      return [
-          _potential_scale_reduction_single_state(s, independent_chain_ndims)
-          for s in state
-      ]
-    return _potential_scale_reduction_single_state(state,
-                                                   independent_chain_ndims)
+
+  with ops.name_scope(name, "potential_scale_reduction"):
+    rhat_list = [
+        _potential_scale_reduction_single_state(s, independent_chain_ndims)
+        for s in chains_states
+    ]
+
+  if chains_states_was_list:
+    return rhat_list
+  return rhat_list[0]
 
 
 def _potential_scale_reduction_single_state(state, independent_chain_ndims):
   """potential_scale_reduction for one single state `Tensor`."""
-  # We assume exactly one leading dimension indexes e.g. correlated samples from
-  # each Markov chain.
-  state = ops.convert_to_tensor(state, name="state")
-  sample_ndims = 1
-
-  sample_axis = math_ops.range(0, sample_ndims)
-  chain_axis = math_ops.range(sample_ndims,
-                              sample_ndims + independent_chain_ndims)
-  sample_and_chain_axis = math_ops.range(0,
-                                         sample_ndims + independent_chain_ndims)
-
-  n = _axis_size(state, sample_axis)
-  m = _axis_size(state, chain_axis)
-
-  # In the language of [2],
-  # B / n is the between chain variance, the variance of the chain means.
-  # W is the within sequence variance, the mean of the chain variances.
-  b_div_n = _reduce_variance(
-      math_ops.reduce_mean(state, sample_axis, keepdims=True),
-      sample_and_chain_axis,
-      biased=False)
-  w = math_ops.reduce_mean(
-      _reduce_variance(state, sample_axis, keepdims=True, biased=True),
-      sample_and_chain_axis)
-
-  # sigma^2_+ is an estimate of the true variance, which would be unbiased if
-  # each chain was drawn from the target.  c.f. "law of total variance."
-  sigma_2_plus = w + b_div_n
-
-  return ((m + 1.) / m) * sigma_2_plus / w - (n - 1.) / (m * n)
-
-
-def effective_sample_size(state,
-                          independent_chain_ndims=1,
-                          max_lags=None,
-                          max_lags_threshold=None,
-                          name="effective_sample_size"):
-  if max_lags is not None and max_lags_threshold is not None:
-    raise ValueError(
-        "Expected at most one of max_lags, max_lags_threshold to be provided.  "
-        "Found: {}, {}".format(max_lags, max_lags_threshold))
   with ops.name_scope(
-      name,
-      values=[state, independent_chain_ndims, max_lags, max_lags_threshold]):
-    pass
+      "potential_scale_reduction_single_state",
+      values=[state, independent_chain_ndims]):
+    # We assume exactly one leading dimension indexes e.g. correlated samples
+    # from each Markov chain.
+    state = ops.convert_to_tensor(state, name="state")
+    sample_ndims = 1
+
+    sample_axis = math_ops.range(0, sample_ndims)
+    chain_axis = math_ops.range(sample_ndims,
+                                sample_ndims + independent_chain_ndims)
+    sample_and_chain_axis = math_ops.range(
+        0, sample_ndims + independent_chain_ndims)
+
+    n = _axis_size(state, sample_axis)
+    m = _axis_size(state, chain_axis)
+
+    # In the language of [2],
+    # B / n is the between chain variance, the variance of the chain means.
+    # W is the within sequence variance, the mean of the chain variances.
+    b_div_n = _reduce_variance(
+        math_ops.reduce_mean(state, sample_axis, keepdims=True),
+        sample_and_chain_axis,
+        biased=False)
+    w = math_ops.reduce_mean(
+        _reduce_variance(state, sample_axis, keepdims=True, biased=True),
+        sample_and_chain_axis)
+
+    # sigma^2_+ is an estimate of the true variance, which would be unbiased if
+    # each chain was drawn from the target.  c.f. "law of total variance."
+    sigma_2_plus = w + b_div_n
+
+    return ((m + 1.) / m) * sigma_2_plus / w - (n - 1.) / (m * n)
 
 
 # TODO(b/72873233) Move some variant of this to sample_stats.
@@ -226,3 +387,15 @@ def _axis_size(x, axis=None):
 def _is_list_like(x):
   """Helper which returns `True` if input is `list`-like."""
   return isinstance(x, (tuple, list))
+
+
+def _broadcast_maybelist_arg(states, secondary_arg, name):
+  """Broadcast a listable secondary_arg to that of states."""
+  if _is_list_like(secondary_arg):
+    if len(secondary_arg) != len(states):
+      raise ValueError("Argument `%s` was a list of different length ({}) than "
+                       "`states` ({})".format(name, len(states)))
+  else:
+    secondary_arg = [secondary_arg] * len(states)
+
+  return secondary_arg
-- 
GitLab


From 6e267c6249e590fee374d93f30e78812fc5fed61 Mon Sep 17 00:00:00 2001
From: Jonathan Hseu <jhseu@google.com>
Date: Thu, 8 Feb 2018 12:30:53 -0800
Subject: [PATCH 853/966] Revert "Fix missing . (#16460)"

This reverts commit 5e23338fec26f0c5ad588742b8df80e5bf1a940d.
---
 tensorflow/docs_src/extend/tool_developers/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/extend/tool_developers/index.md b/tensorflow/docs_src/extend/tool_developers/index.md
index f5440bc96d..f02cd23be8 100644
--- a/tensorflow/docs_src/extend/tool_developers/index.md
+++ b/tensorflow/docs_src/extend/tool_developers/index.md
@@ -156,7 +156,7 @@ of checkpoints and freezes them together into a single file.
 
 What this does is load the `GraphDef`, pull in the values for all the variables
 from the latest checkpoint file, and then replace each `Variable` op with a
-`Const` that has the numerical data for the weights stored in its attributes.
+`Const` that has the numerical data for the weights stored in its attributes
 It then strips away all the extraneous nodes that aren't used for forward
 inference, and saves out the resulting `GraphDef` into an output file.
 
-- 
GitLab


From 1fede2b6a98113ac1f22d8965adb866c1f04fb8e Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 8 Feb 2018 12:29:56 -0800
Subject: [PATCH 854/966] [XLA] Improve comment about why we can't unroll loops
 with static trip count of 1 if they contain side-effecting ops.

PiperOrigin-RevId: 185034032
---
 tensorflow/compiler/xla/service/while_loop_simplifier.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index 87a7f86f4e..981de9b220 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -564,9 +564,11 @@ static StatusOr<bool> TryRemoveWhileLoop(HloInstruction* while_op) {
   //
   // This is not a fundamental limitation.  The control operands can be moved
   // onto the new HLOs after simplification, and any side-effecting ops inside
-  // the loop aren't removed, just cloned and added back to the loop.
-  // Nevertheless our infrastructure sees loop simplification as removal of
-  // these nodes and currently doesn't allow it.
+  // the loop aren't removed, just cloned and added back to the loop.  But
+  // moving an op out of the loop also removes implicit control dependencies
+  // between the op and the ops outside the loop, so we'd have to add those back
+  // for things like infeed/outfeed.  It gets complicated.  So for now we just
+  // avoid it.
   if (!while_op->parent()->IsRemovable(while_op) || while_op->HasSideEffect()) {
     VLOG(2) << "Not attempting to remove while loop it is not removable: "
             << while_op->ToShortString();
-- 
GitLab


From fe914e1b7df800cd7a1b61c8d3a5eec6c3b08b07 Mon Sep 17 00:00:00 2001
From: HyoukJoong Lee <hyouklee@google.com>
Date: Thu, 8 Feb 2018 12:30:28 -0800
Subject: [PATCH 855/966] Bug fix in buffer assignment for colocated buffers

PiperOrigin-RevId: 185034095
---
 .../compiler/xla/service/buffer_assignment.cc | 286 +++++++++---------
 .../compiler/xla/service/buffer_assignment.h  |  16 +-
 .../xla/service/buffer_assignment_test.cc     | 111 +++++++
 .../compiler/xla/service/buffer_liveness.cc   |   9 +-
 4 files changed, 266 insertions(+), 156 deletions(-)

diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 774b11478c..f0a9de5f94 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -1122,140 +1122,6 @@ void BufferAssigner::AddSetToColocatedBufferSets(
   }
 }
 
-// Conceptually the same as AddSetToColocatedBufferSets, but specific to the
-// colocated buffers for while instructions. 'colocated_set' contains the
-// buffers for a single while instruction that must be colocated. The idea here
-// is to apply a memory-saving heuristic for separate while instructions whose
-// buffers are disjoint in liveness, by using the colocation mechanism to force
-// buffer sharing. This often reduces memory for multi-layer RNNs.
-//
-// TODO(b/32491382): We should be able to remove this heuristic after we
-// implement module-level liveness analysis, which would let us directly detect
-// buffer sharing opportunities between the while instruction buffer and the
-// buffers from the predicate and body computation, as well as sharing across
-// different while instructions.
-void BufferAssigner::AddWhileSetToColocatedBufferSets(
-    const std::vector<const LogicalBuffer*>& colocated_set,
-    const LogicalBuffer* while_init_buffer,
-    const LogicalBuffer* while_result_buffer, const HloInstruction* while_hlo,
-    const HloComputation& computation, const BufferLiveness& buffer_liveness,
-    const LogicalBuffer::SizeFunction& buffer_size,
-    std::vector<ColocatedBufferSet>* colocated_buffer_sets) {
-  CHECK(!colocated_set.empty());
-  const TuplePointsToAnalysis& points_to_analysis =
-      buffer_liveness.points_to_analysis();
-
-  // Parallel while loops cannot safely share colocated buffer sets.
-  if (buffer_liveness.hlo_ordering().SequentialOrder(computation) == nullptr) {
-    AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
-    return;
-  }
-
-  // Scan 'colocated_buffer_sets' in reverse order for locality; colocated sets
-  // are added in postorder over computations and instructions.
-  const int64 init_buffer_size = buffer_size(*while_init_buffer);
-  const bool is_live_out = buffer_liveness.MaybeLiveOut(*while_result_buffer);
-  for (int i = colocated_buffer_sets->size() - 1; i >= 0; --i) {
-    const ColocatedBufferSet& predecessor_set = (*colocated_buffer_sets)[i];
-
-    // Skip predecessor sets not associated with while loops.
-    if (std::all_of(predecessor_set.begin(), predecessor_set.end(),
-                    [](const LogicalBuffer* buffer) {
-                      return buffer->instruction()->opcode() !=
-                             HloOpcode::kWhile;
-                    })) {
-      continue;
-    }
-
-    // Skip predecessor sets already associated with 'while_hlo'.
-    if (std::any_of(predecessor_set.begin(), predecessor_set.end(),
-                    [&while_hlo](const LogicalBuffer* buffer) {
-                      return buffer->instruction() == while_hlo;
-                    })) {
-      continue;
-    }
-
-    // Skip predecessor sets with entry parameter if the while result is live
-    // out.
-    if (is_live_out &&
-        std::any_of(predecessor_set.begin(), predecessor_set.end(),
-                    [](const LogicalBuffer* buffer) {
-                      auto* instruction = buffer->instruction();
-                      auto* computation = instruction->parent();
-                      auto* module = computation->parent();
-                      return instruction->opcode() == HloOpcode::kParameter &&
-                             computation == module->entry_computation();
-                    })) {
-      continue;
-    }
-
-    // Build vector of predecessor while result and init buffers, which are
-    // checked for liveness interference below. We must check both the result
-    // and init buffers because they're aliased together, but
-    // TuplePointsToAnalysis is unaware of this aliasing.
-    std::vector<const LogicalBuffer*> predecessor_while_buffers;
-    for (const LogicalBuffer* buffer : predecessor_set) {
-      const HloInstruction* instruction = buffer->instruction();
-      if (instruction->opcode() == HloOpcode::kWhile &&
-          buffer_size(*buffer) == init_buffer_size &&
-          instruction->parent() == &computation) {
-        predecessor_while_buffers.push_back(buffer);
-        // Add the init buffer at the same index, which must also exist in the
-        // predecessor set, and must be unambiguous.
-        const PointsToSet& init_points_to =
-            points_to_analysis.GetPointsToSet(instruction->operand(0));
-        const auto& init_buffers = init_points_to.element(buffer->index());
-        CHECK_EQ(init_buffers.size(), 1);
-        CHECK_GT(predecessor_set.count(init_buffers[0]), 0);
-        predecessor_while_buffers.push_back(init_buffers[0]);
-      }
-    }
-    if (predecessor_while_buffers.empty()) {
-      continue;
-    }
-
-    // Skip predecessor set if the live range of any predecessor
-    // buffers overlaps with 'while_init_buffer' or
-    // 'while_result_buffer' (we need to check both since they're
-    // aliased together, but the points-to analysis is unaware of this
-    // aliasing). Note that tuple element buffer forwarding can cause
-    // the same buffer to appear on both sides of the interference
-    // comparison below.
-    auto may_interfere_with_init_or_result = [&](const LogicalBuffer* buffer) {
-      if (while_init_buffer->id() != buffer->id() &&
-          buffer_liveness.MayInterfere(*while_init_buffer, *buffer)) {
-        return true;
-      }
-
-      if (while_result_buffer->id() != buffer->id() &&
-          buffer_liveness.MayInterfere(*while_result_buffer, *buffer)) {
-        return true;
-      }
-
-      return false;
-    };
-
-    if (std::any_of(predecessor_while_buffers.begin(),
-                    predecessor_while_buffers.end(),
-                    may_interfere_with_init_or_result)) {
-      continue;
-    }
-
-    // All our checks have passed; merge 'predecessor_set' with 'colocated_set',
-    // and add the merged set to 'colocated_buffer_sets'. This forces the
-    // colocation of buffers across different while instructions.
-    FlatSet<const LogicalBuffer*> unique;
-    unique.insert(predecessor_set.begin(), predecessor_set.end());
-    unique.insert(colocated_set.begin(), colocated_set.end());
-    std::vector<const LogicalBuffer*> merged_set(unique.begin(), unique.end());
-    AddSetToColocatedBufferSets(merged_set, colocated_buffer_sets);
-    return;
-  }
-
-  // Failed to merge into predecessor set; add 'colocated_set' as-is.
-  AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
-}
-
 namespace {
 
 // Checks that points-to set of 'instruction' is unambiguous and distinct
@@ -1272,8 +1138,130 @@ const LogicalBuffer* AddBufferToColocatedSet(
   return colocated_set->back();
 }
 
+// Given the interference map of a graph (the list of interfering node indices
+// for each node), perform graph coloring such that interfering nodes are
+// assigned to different colors. Returns the assigned color of the nodes, where
+// the colors are represented as integer values [0, color_count).
+std::vector<int64> ColorInterferenceGraph(
+    const std::vector<std::vector<int64>>& interference_map) {
+  const int64 node_count = interference_map.size();
+
+  // Sort the nodes such that we assign nodes with more interference first. This
+  // relies on the common heuristic of assigning the most constrained node
+  // first, but it would be good to investigate other ordering heuristics too.
+  std::vector<int64> nodes(node_count);
+  std::iota(nodes.begin(), nodes.end(), 0);
+  std::sort(nodes.begin(), nodes.end(),
+            [&interference_map](const int64 i, const int64 j) {
+              return interference_map[i].size() > interference_map[j].size();
+            });
+
+  const int64 kColorUnassigned = -1;
+  std::vector<int64> assigned_colors(node_count, kColorUnassigned);
+  for (int64 node : nodes) {
+    // Mark the colors that are already assigned to the neighbors.
+    std::vector<bool> available_colors(node_count, true);
+    for (int64 neighbor : interference_map[node]) {
+      int64 color = assigned_colors[neighbor];
+      if (color != kColorUnassigned) {
+        available_colors[color] = false;
+      }
+    }
+
+    // Find the color that is not yet assigned to the neighbors.
+    int64 color = kColorUnassigned;
+    for (color = 0; color < available_colors.size(); ++color) {
+      if (available_colors[color]) {
+        break;
+      }
+    }
+    CHECK_NE(color, kColorUnassigned);
+    assigned_colors[node] = color;
+  }
+  return assigned_colors;
+}
+
 }  // namespace
 
+std::vector<BufferAssigner::ColocatedBufferSet>
+BufferAssigner::MergeColocatedBufferSets(
+    const std::vector<ColocatedBufferSet>& colocated_buffer_sets,
+    const BufferLiveness& buffer_liveness,
+    const LogicalBuffer::SizeFunction& buffer_size) {
+  VLOG(1) << "colocation sets count before coalescing:"
+          << colocated_buffer_sets.size();
+
+  // Returns true if the given buffer is for the entry parameter.
+  auto is_entry_parameter = [](const LogicalBuffer& buffer) {
+    auto* instruction = buffer.instruction();
+    auto* computation = instruction->parent();
+    auto* module = computation->parent();
+    return instruction->opcode() == HloOpcode::kParameter &&
+           computation == module->entry_computation();
+  };
+
+  // Returns true if the two colocated buffer sets (specified by their indices
+  // into the colocated_buffer_sets) can be merged into a single set.
+  auto cannot_merge_buffer_sets = [&colocated_buffer_sets, &buffer_liveness,
+                                   &buffer_size,
+                                   &is_entry_parameter](int64 i, int64 j) {
+    for (auto& buffer_a : colocated_buffer_sets[i]) {
+      for (auto& buffer_b : colocated_buffer_sets[j]) {
+        // Do not merge if the set includes live outs or entry parameters.
+        if ((buffer_liveness.MaybeLiveOut(*buffer_a) &&
+             is_entry_parameter(*buffer_b)) ||
+            (buffer_liveness.MaybeLiveOut(*buffer_b) &&
+             is_entry_parameter(*buffer_a))) {
+          return true;
+        }
+        // Do not merge if the buffers interfere with each other.
+        if (buffer_a->id() != buffer_b->id() &&
+            buffer_liveness.MayInterfere(*buffer_a, *buffer_b)) {
+          return true;
+        }
+        // Do not merge if the buffer sizes are different.
+        if (buffer_size(*buffer_a) != buffer_size(*buffer_b)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  };
+
+  // Build the interference map among the colocated buffer sets (nodes), by
+  // adding an edge between any two nodes that cannot be merged into a single
+  // colocated buffer set.
+  std::vector<std::vector<int64>> interference_map(
+      colocated_buffer_sets.size());
+  for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) {
+    for (int64 j = i + 1; j < colocated_buffer_sets.size(); ++j) {
+      if (cannot_merge_buffer_sets(i, j)) {
+        interference_map[i].push_back(j);
+        interference_map[j].push_back(i);
+      }
+    }
+  }
+
+  // Assign a color to each colocation set in colocated_buffer_sets, such that
+  // the sets that can be merged are assigned with the same color.
+  auto assigned_colors = ColorInterferenceGraph(interference_map);
+
+  // Merge the buffer sets with the same color.
+  CHECK(!assigned_colors.empty());
+  int64 num_sets =
+      *std::max_element(assigned_colors.begin(), assigned_colors.end()) + 1;
+  std::vector<ColocatedBufferSet> new_colocated_buffer_sets(num_sets);
+  for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) {
+    const auto& buffer_set = colocated_buffer_sets[i];
+    new_colocated_buffer_sets[assigned_colors[i]].insert(buffer_set.begin(),
+                                                         buffer_set.end());
+  }
+
+  VLOG(1) << "colocation sets count after coalescing:"
+          << colocated_buffer_sets.size();
+  return new_colocated_buffer_sets;
+}
+
 // Builds sets of buffers in 'colocated_buffer_sets' which should be colocated
 // in the same allocation (currently just supports kWhile, kCall, and
 // kConditional).
@@ -1299,12 +1287,11 @@ void BufferAssigner::BuildColocatedBufferSets(
                 const Shape& /*subshape*/, const ShapeIndex& index) {
               std::vector<const LogicalBuffer*> colocated_set;
               // Add while.init.
-              auto* init_buffer =
-                  AddBufferToColocatedSet(while_hlo->operand(0), index,
-                                          points_to_analysis, &colocated_set);
+              AddBufferToColocatedSet(while_hlo->operand(0), index,
+                                      points_to_analysis, &colocated_set);
               // Add while.result.
-              auto* result_buffer = AddBufferToColocatedSet(
-                  while_hlo, index, points_to_analysis, &colocated_set);
+              AddBufferToColocatedSet(while_hlo, index, points_to_analysis,
+                                      &colocated_set);
               // Add while.cond.parameter.
               AddBufferToColocatedSet(
                   while_hlo->while_condition()->parameter_instruction(0), index,
@@ -1317,10 +1304,7 @@ void BufferAssigner::BuildColocatedBufferSets(
               AddBufferToColocatedSet(
                   while_hlo->while_body()->root_instruction(), index,
                   points_to_analysis, &colocated_set);
-              AddWhileSetToColocatedBufferSets(
-                  colocated_set, init_buffer, result_buffer, while_hlo,
-                  *computation, buffer_liveness, buffer_size,
-                  colocated_buffer_sets);
+              AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
             });
       } else if (opcode == HloOpcode::kCall) {
         const HloInstruction* call_hlo = instruction;
@@ -1400,6 +1384,22 @@ void BufferAssigner::BuildColocatedBufferSets(
       }
     }
   }
+
+  if (colocated_buffer_sets->empty()) {
+    return;
+  }
+
+  // Try to find more coalescing opportunities among the colocated buffer sets.
+  //
+  // TODO(b/32491382): We should be able to remove this by using the
+  // module-level liveness analysis, which would let us directly detect buffer
+  // sharing opportunities between the while instruction buffer and the buffers
+  // from the predicate and body computation, as well as sharing across
+  // different while instructions.
+  std::vector<ColocatedBufferSet> new_colocated_buffer_sets =
+      MergeColocatedBufferSets(*colocated_buffer_sets, buffer_liveness,
+                               buffer_size);
+  std::swap(*colocated_buffer_sets, new_colocated_buffer_sets);
 }
 
 // Assigns all colocated buffer sets in 'colocated_buffer_sets' to the same
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 08a40bfeb2..65019b6b17 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -528,15 +528,13 @@ class BufferAssigner {
       const std::vector<const LogicalBuffer*>& colocated_set,
       std::vector<ColocatedBufferSet>* colocated_buffer_sets);
 
-  // Conceptually the same as AddSetToColocatedBufferSets, but specific to the
-  // colocated buffers for while instructions.
-  void AddWhileSetToColocatedBufferSets(
-      const std::vector<const LogicalBuffer*>& colocated_set,
-      const LogicalBuffer* while_init_buffer,
-      const LogicalBuffer* while_result_buffer, const HloInstruction* while_hlo,
-      const HloComputation& computation, const BufferLiveness& buffer_liveness,
-      const LogicalBuffer::SizeFunction& buffer_size,
-      std::vector<ColocatedBufferSet>* colocated_buffer_sets);
+  // Given a list of colocated buffer sets (each colocated buffer set represents
+  // the logical buffers that would be assigned to the same physical buffer),
+  // try to merge the sets if the buffers can be shared. Returns the merged set.
+  std::vector<ColocatedBufferSet> MergeColocatedBufferSets(
+      const std::vector<ColocatedBufferSet>& colocated_buffer_sets,
+      const BufferLiveness& buffer_liveness,
+      const LogicalBuffer::SizeFunction& buffer_size);
 
   // Split a set of buffers into several sets, each of which contains buffers
   // colored with the same color.
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 6fc9d783f1..ef067cc31f 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -1587,6 +1587,117 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
             assignment->GetUniqueSlice(while1, {1}).ConsumeValueOrDie());
 }
 
+// Tests that the colocated buffers for while instructions are properly assigned
+// during buffer assignment such that the result tuple elements are not assigned
+// to the same buffer.
+//
+// %infeed --> %while.0 --> %while.1 --+
+//                                     +-- %tuple
+//   %zero -->   %add   --> %while.2 --+
+//
+// Execution Order:
+// %infeed -> %while.0 -> %while.1 -> %zero -> %add -> %while.2 -> %tuple
+//
+// The HLO computation used in this test requires specific ordering to expose
+// the bug (b/72496031). During buffer assignment, the visitation order of
+// colocated buffers is %while.2 -> while.0 -> while.1, and the buffer
+// assignment was coalescing the colocated buffers for all 3 while instructions,
+// therefore assigning the same buffer to the two result tuple elements.
+TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
+  const Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+
+  // Builds a condition computation: x -> x < 4
+  auto build_cond = [&]() {
+    auto builder = HloComputation::Builder("cond");
+    auto const4 = builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<int>(4)));
+    auto param =
+        builder.AddInstruction(HloInstruction::CreateParameter(0, r0s32, "x"));
+    builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, param, const4));
+    return builder.Build();
+  };
+
+  // Builds a body computation: x -> x + 9
+  auto build_body = [&]() {
+    auto builder = HloComputation::Builder("body");
+    auto const9 = builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<int>(9)));
+    auto param =
+        builder.AddInstruction(HloInstruction::CreateParameter(0, r0s32, "x"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(r0s32, HloOpcode::kAdd, param, const9));
+    return builder.Build();
+  };
+
+  // Build the entry computation as described in the comment above.
+  auto module = xla::MakeUnique<HloModule>(TestName());
+  auto builder = HloComputation::Builder("entry");
+
+  auto infeed = builder.AddInstruction(HloInstruction::CreateInfeed(r0s32, ""));
+  auto cond0 = module->AddEmbeddedComputation(build_cond());
+  auto body0 = module->AddEmbeddedComputation(build_body());
+  auto while0 = builder.AddInstruction(
+      HloInstruction::CreateWhile(r0s32, cond0, body0, infeed));
+
+  auto cond1 = module->AddEmbeddedComputation(build_cond());
+  auto body1 = module->AddEmbeddedComputation(build_body());
+  auto while1 = builder.AddInstruction(
+      HloInstruction::CreateWhile(r0s32, cond1, body1, while0));
+
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)));
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0s32, HloOpcode::kAdd, zero, zero));
+  auto cond2 = module->AddEmbeddedComputation(build_cond());
+  auto body2 = module->AddEmbeddedComputation(build_body());
+  auto while2 = builder.AddInstruction(
+      HloInstruction::CreateWhile(r0s32, cond2, body2, add));
+
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({while2, while1}));
+  module->AddEntryComputation(builder.Build());
+
+  // Run CopyInsertion and check if the graph constructed above doesn't need
+  // any copies inserted for BufferAssignment to run.
+  int64 instruction_count = module->instruction_count();
+  CopyInsertion copy_insertion;
+  ASSERT_IS_OK(copy_insertion.Run(module.get()).status());
+  ASSERT_EQ(instruction_count, module->instruction_count());
+
+  // Create a sequential order among all the instructions in the entry
+  // computation, since the issue this test stresses depends on the order the
+  // nodes are traversed during BufferAssignment.
+  SequentialHloOrdering::HloModuleSequence sequence;
+  sequence[module->entry_computation()] = {infeed, while0, while1, zero,
+                                           add,    while2, tuple};
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto assignment,
+      BufferAssigner::Run(
+          module.get(),
+          xla::MakeUnique<SequentialHloOrdering>(module.get(), sequence),
+          backend().compiler()->BufferSizeBytesFunction(),
+          [](LogicalBuffer::Color) { return 1; }));
+
+  // The result tuple elements must be assigned with different buffers.
+  TF_ASSERT_OK_AND_ASSIGN(auto slice0, assignment->GetUniqueSlice(tuple, {0}));
+  TF_ASSERT_OK_AND_ASSIGN(auto slice1, assignment->GetUniqueSlice(tuple, {1}));
+  EXPECT_NE(slice0, slice1);
+
+  // while0 and while1 result buffers must be equal to slice1.
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_while0,
+                          assignment->GetUniqueSlice(while0, {}));
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_while1,
+                          assignment->GetUniqueSlice(while1, {}));
+  EXPECT_EQ(slice1, slice_while0);
+  EXPECT_EQ(slice1, slice_while1);
+
+  // while2 result buffer must be equal to slice0.
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_while2,
+                          assignment->GetUniqueSlice(while2, {}));
+  EXPECT_EQ(slice0, slice_while2);
+}
+
 TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
   auto module = xla::MakeUnique<HloModule>(TestName());
   auto builder = HloComputation::Builder("entry");
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc
index e7749252ce..37982aaef9 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness.cc
@@ -117,11 +117,12 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
 
     // If the root instruction aliases the buffer 'a', the live range of 'a' is
     // until the end of the computation and can never be strictly before another
-    // buffer. This is needed to prevent the root instruction's buffers from
-    // being reused by later instructions even when the root is not the last
-    // instruction in the schedule.
+    // buffer defined in the same computation. This is needed to prevent the
+    // root instruction's buffers from being reused by later instructions even
+    // when the root is not the last instruction in the schedule.
     if (alias.instruction()->parent()->root_instruction() ==
-        alias.instruction()) {
+            alias.instruction() &&
+        alias.instruction()->parent() == b.instruction()->parent()) {
       return false;
     }
   }
-- 
GitLab


From 785ac4418d60e3f69115c2a05ee989d620635a71 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 13:00:09 -0800
Subject: [PATCH 856/966] [TF:XLA] Handle fusion instructions in HloInstruction
 equality comparison.

Two fusion instructions are equal iff the fusion kind matches and the fused subcomputations are structurally equal.

PiperOrigin-RevId: 185038129
---
 .../compiler/xla/service/hlo_instruction.cc   |   6 +-
 .../xla/service/hlo_instruction_test.cc       | 149 ++++++++++++++----
 2 files changed, 124 insertions(+), 31 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 277648f072..0e4437b73b 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -1661,8 +1661,12 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kTuple:
       return true;
 
-    // These opcodes have complex or special behavior so just return false.
     case HloOpcode::kFusion:
+      return fusion_kind() == other.fusion_kind() &&
+             eq_computations(fused_instructions_computation(),
+                             other.fused_instructions_computation());
+
+    // These opcodes have complex or special behavior so just return false.
     case HloOpcode::kRng:
     case HloOpcode::kTrace:
     case HloOpcode::kWhile:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 1038ab5555..94e9bfe56e 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -825,17 +825,42 @@ TEST_F(HloInstructionTest, ComplexFusionOp) {
   EXPECT_THAT(c1->users(), ElementsAre(fusion));
 }
 
-// Convenience function for comparing two HloInstructions inside of
-// std::unique_ptrs.
-static bool Identical(std::unique_ptr<HloInstruction> instruction1,
-                      std::unique_ptr<HloInstruction> instruction2) {
+// Convenience function for comparing two HloInstructions.
+static bool Identical(const HloInstruction& instruction1,
+                      const HloInstruction& instruction2) {
   // Verify Identical is reflexive for both instructions.
-  EXPECT_TRUE(instruction1->Identical(*instruction1));
-  EXPECT_TRUE(instruction2->Identical(*instruction2));
+  EXPECT_TRUE(instruction1.Identical(instruction1));
+  EXPECT_TRUE(instruction2.Identical(instruction2));
 
-  bool is_equal = instruction1->Identical(*instruction2);
+  bool is_equal = instruction1.Identical(instruction2);
   // Verify Identical is symmetric.
-  EXPECT_EQ(is_equal, instruction2->Identical(*instruction1));
+  EXPECT_EQ(is_equal, instruction2.Identical(instruction1));
+  return is_equal;
+}
+
+// Convenience function for comparing two HloInstructions for structural
+// equality.
+static bool StructuralEqual(const HloInstruction& instruction1,
+                            const HloInstruction& instruction2) {
+  auto eq_operand_shapes = [](const HloInstruction* a,
+                              const HloInstruction* b) {
+    return ShapeUtil::Equal(a->shape(), b->shape());
+  };
+  auto eq_computations = [](const HloComputation* a, const HloComputation* b) {
+    return *a == *b;
+  };
+
+  // Verify Identical is reflexive for both instructions.
+  EXPECT_TRUE(
+      instruction1.Identical(instruction1, eq_operand_shapes, eq_computations));
+  EXPECT_TRUE(
+      instruction2.Identical(instruction2, eq_operand_shapes, eq_computations));
+
+  bool is_equal =
+      instruction1.Identical(instruction2, eq_operand_shapes, eq_computations);
+  // Verify Identical is symmetric.
+  EXPECT_EQ(is_equal, instruction2.Identical(instruction1, eq_operand_shapes,
+                                             eq_computations));
   return is_equal;
 }
 
@@ -858,42 +883,42 @@ TEST_F(HloInstructionTest, IdenticalInstructions) {
 
   // Operations which only depend on their operands and opcode.
   EXPECT_TRUE(
-      Identical(HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
-                HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1)));
+      Identical(*HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
+                *HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1)));
   EXPECT_FALSE(
-      Identical(HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
-                HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op2)));
+      Identical(*HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
+                *HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op2)));
   EXPECT_FALSE(
-      Identical(HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
-                HloInstruction::CreateUnary(shape, HloOpcode::kNegate, op1)));
+      Identical(*HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
+                *HloInstruction::CreateUnary(shape, HloOpcode::kNegate, op1)));
 
   // Tuples.
-  EXPECT_TRUE(Identical(HloInstruction::CreateTuple({op1, op2}),
-                        HloInstruction::CreateTuple({op1, op2})));
-  EXPECT_FALSE(Identical(HloInstruction::CreateTuple({op1, op2}),
-                         HloInstruction::CreateTuple({op2, op1})));
+  EXPECT_TRUE(Identical(*HloInstruction::CreateTuple({op1, op2}),
+                        *HloInstruction::CreateTuple({op1, op2})));
+  EXPECT_FALSE(Identical(*HloInstruction::CreateTuple({op1, op2}),
+                         *HloInstruction::CreateTuple({op2, op1})));
 
   // Broadcasts.
-  EXPECT_TRUE(Identical(HloInstruction::CreateBroadcast(shape, op1, {0, 1}),
-                        HloInstruction::CreateBroadcast(shape, op1, {0, 1})));
-  EXPECT_FALSE(Identical(HloInstruction::CreateBroadcast(shape, op1, {0, 1}),
-                         HloInstruction::CreateBroadcast(shape, op1, {1, 0})));
+  EXPECT_TRUE(Identical(*HloInstruction::CreateBroadcast(shape, op1, {0, 1}),
+                        *HloInstruction::CreateBroadcast(shape, op1, {0, 1})));
+  EXPECT_FALSE(Identical(*HloInstruction::CreateBroadcast(shape, op1, {0, 1}),
+                         *HloInstruction::CreateBroadcast(shape, op1, {1, 0})));
   Shape bcast_shape1 = ShapeUtil::MakeShape(F32, {2, 2, 42});
   Shape bcast_shape2 = ShapeUtil::MakeShape(F32, {2, 2, 123});
   EXPECT_FALSE(
-      Identical(HloInstruction::CreateBroadcast(bcast_shape1, op1, {0, 1}),
-                HloInstruction::CreateBroadcast(bcast_shape2, op1, {0, 1})));
+      Identical(*HloInstruction::CreateBroadcast(bcast_shape1, op1, {0, 1}),
+                *HloInstruction::CreateBroadcast(bcast_shape2, op1, {0, 1})));
 
   // Binary operands.
   EXPECT_TRUE(Identical(
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2)));
+      *HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
+      *HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2)));
   EXPECT_FALSE(Identical(
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
-      HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op2, op1)));
+      *HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
+      *HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op2, op1)));
   EXPECT_FALSE(Identical(
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
-      HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op1, op2)));
+      *HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
+      *HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op1, op2)));
 }
 
 TEST_F(HloInstructionTest, FunctionVisitor) {
@@ -1089,6 +1114,70 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
       ShapeUtil::Equal(root->operand(1)->shape(), root2->operand(1)->shape()));
   EXPECT_TRUE(ShapeUtil::Equal(root->operand(1)->operand(0)->shape(),
                                root2->operand(1)->operand(0)->shape()));
+  EXPECT_TRUE(StructuralEqual(*fusion, *fusion2));
+}
+
+TEST_F(HloInstructionTest, FusionEquality) {
+  HloModule module(TestName());
+  HloComputation::Builder builder(TestName());
+
+  // Create two fusion instructions containing a single unary operation.
+  auto parameter =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "x"));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, parameter));
+  auto neg = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, parameter));
+  auto* computation = module.AddEntryComputation(builder.Build());
+  auto* fusion = computation->CreateFusionInstruction(
+      {exp}, HloInstruction::FusionKind::kLoop);
+  auto* fusion2 = computation->CreateFusionInstruction(
+      {neg}, HloInstruction::FusionKind::kLoop);
+  EXPECT_FALSE(StructuralEqual(*fusion, *fusion2));
+
+  auto clone = fusion->Clone();
+  EXPECT_TRUE(StructuralEqual(*fusion, *clone));
+}
+
+TEST_F(HloInstructionTest, NestedFusionEquality) {
+  HloModule module(TestName());
+  HloComputation::Builder builder(TestName());
+
+  // Build a nested fusion computation.
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+  auto a = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
+  auto b = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+  auto b_t = builder.AddInstruction(
+      HloInstruction::CreateTranspose(data_shape, b, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateDot(data_shape, a, b_t, dot_dnums));
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto add_operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      data_shape, HloOpcode::kAdd, dot, add_operand));
+  auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
+      data_shape, HloOpcode::kSubtract, dot, add_operand));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape, HloOpcode::kMultiply, add, sub));
+  auto computation = module.AddEntryComputation(builder.Build());
+
+  auto nested_fusion = computation->CreateFusionInstruction(
+      {dot, b_t}, HloInstruction::FusionKind::kTransposeDot);
+
+  auto fusion = computation->CreateFusionInstruction(
+      {add, nested_fusion}, HloInstruction::FusionKind::kOutput);
+  auto fusion2 = computation->CreateFusionInstruction(
+      {sub, nested_fusion}, HloInstruction::FusionKind::kOutput);
+  auto clone = fusion->Clone();
+  EXPECT_TRUE(StructuralEqual(*fusion, *clone));
+  EXPECT_FALSE(StructuralEqual(*fusion, *fusion2));
 }
 
 TEST_F(HloInstructionTest, CloneSuffixNames) {
-- 
GitLab


From 4e6fe74804d60b221d9a109cc4d798b3c79957e0 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Thu, 8 Feb 2018 13:10:23 -0800
Subject: [PATCH 857/966] Fix pylint issues.

---
 .../py2tf/converters/side_effect_guards.py    |  2 --
 .../python/kernel_tests/array_ops_test.py     | 20 -------------------
 .../distributions/bernoulli_test.py           |  1 -
 3 files changed, 23 deletions(-)

diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
index fd38c2192d..948cb96c3f 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
@@ -34,8 +34,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from contextlib import contextmanager
-
 import gast
 
 from tensorflow.contrib.py2tf.pyct import anno
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 586130f806..1e2ea82988 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -1162,26 +1162,6 @@ class InvertPermutationTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(y.get_shape(), [5])
         self.assertAllEqual(y.eval(), [2, 4, 3, 0, 1])
 
-class UnravelIndexTest(test_util.TensorFlowTestCase):
-
-  def testUnravelIndex(self):
-    with self.test_session():
-      for dtype in [dtypes.int32, dtypes.int64]:
-        indices_1 = constant_op.constant(1621, dtype=dtype)
-        dims_1 = constant_op.constant([6, 7, 8, 9], dtype=dtype)
-        out_1 = array_ops.unravel_index(indices_1, dims_1)
-        self.assertAllEqual(out_1.eval(), [3, 1, 4, 1])
-
-        indices_2 = constant_op.constant([1621], dtype=dtype)
-        dims_2 = constant_op.constant([6, 7, 8, 9], dtype=dtype)
-        out_2 = array_ops.unravel_index(indices_2, dims_2)
-        self.assertAllEqual(out_2.eval(), [[3], [1], [4], [1]])
-
-        indices_3 = constant_op.constant([22, 41, 37], dtype=dtype)
-        dims_3 = constant_op.constant([7, 6], dtype=dtype)
-        out_3 = array_ops.unravel_index(indices_3, dims_3)
-        self.assertAllEqual(out_3.eval(), [[3, 6, 6], [4, 5, 1]])
-
 
 class UnravelIndexTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
index dc5d63eec6..09812db816 100644
--- a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
@@ -25,7 +25,6 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import bernoulli
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
-- 
GitLab


From b4d59a8f82437b6da897c2c5fe773db7127efdd8 Mon Sep 17 00:00:00 2001
From: Jie <jiej@nvidia.com>
Date: Thu, 8 Feb 2018 13:12:31 -0800
Subject: [PATCH 858/966] [cleanup]   remove TRT_ShapedWeights owned_values_   
  fallback to converter temporary memory allocation to avoid redundant memcpy

---
 .../contrib/tensorrt/convert/convert_nodes.cc | 28 ++++---------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 5efef61f0a..5c22c6265e 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -114,28 +114,18 @@ static std::vector<std::pair<int, int>> CreateSamePadding(
 class TRT_ShapedWeights {
  public:
   TRT_ShapedWeights(tensorflow::DataType type, const void* values,
-                    nvinfer1::Dims shape,
-                    const std::vector<char>* owned_values = nullptr)
-      : shape_(shape),
-        type_(type),
-        values_(values),
-        owned_values_(owned_values ? *owned_values : std::vector<char>({})),
-        empty_weight_flag_(false) {
+                    nvinfer1::Dims shape)
+      : shape_(shape), type_(type), values_(values), empty_weight_flag_(false) {
     // Note: this->shape.type[] is not used
   }
 
   explicit TRT_ShapedWeights(tensorflow::DataType type)
-      : shape_(),
-        type_(type),
-        values_(nullptr),
-        owned_values_(),
-        empty_weight_flag_(true) {}
+      : shape_(), type_(type), values_(nullptr), empty_weight_flag_(true) {}
 
   TRT_ShapedWeights(const TRT_ShapedWeights& rhs)
       : shape_(rhs.shape_),
         type_(rhs.type_),
         values_(rhs.values_),
-        owned_values_(rhs.owned_values_),
         empty_weight_flag_(rhs.empty_weight_flag_) {}
 
   int64_t count() const {
@@ -153,16 +143,9 @@ class TRT_ShapedWeights {
     return nvinfer1::Weights{trt_type, GetValues(), GetShapeSize(shape_)};
   }
 
-  const void* GetValues() const {
-    if (values_) return values_;
-    if (owned_values_.size()) return owned_values_.data();
-    return nullptr;
-  }
+  const void* GetValues() const { return values_; }
 
-  void SetValues(const void* values) {
-    values_ = values;
-    owned_values_.clear();
-  }
+  void SetValues(const void* values) { values_ = values; }
 
   size_t size_bytes() const {
     int type_size = tensorflow::DataTypeSize(this->type_);
@@ -177,7 +160,6 @@ class TRT_ShapedWeights {
 
  private:
   const void* values_;
-  std::vector<char> owned_values_;
   bool empty_weight_flag_;
 };
 
-- 
GitLab


From caced55cbc205a9423a480cae0bb9e7a9a10f3a1 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 8 Feb 2018 13:17:53 -0800
Subject: [PATCH 859/966] C API: Fixes #7394

Ideally, when TF_NewTensor is provided with invalid arguments it would provide
a detailed error message. However, for now, to keep the existing API, signal
failure by returning nullptr.

PiperOrigin-RevId: 185040858
---
 tensorflow/c/c_api.cc      | 20 +++++++++++++-------
 tensorflow/c/c_api.h       |  4 ++++
 tensorflow/c/c_api_test.cc | 11 +++++++++++
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index b10af0f060..85f1d1639b 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -64,6 +64,7 @@ using tensorflow::AllocationDescription;
 using tensorflow::DataType;
 using tensorflow::Graph;
 using tensorflow::GraphDef;
+using tensorflow::mutex_lock;
 using tensorflow::NameRangeMap;
 using tensorflow::NameRangesForNode;
 using tensorflow::NewSession;
@@ -77,6 +78,7 @@ using tensorflow::RunMetadata;
 using tensorflow::RunOptions;
 using tensorflow::Session;
 using tensorflow::Status;
+using tensorflow::string;
 using tensorflow::Tensor;
 using tensorflow::TensorBuffer;
 using tensorflow::TensorId;
@@ -87,8 +89,6 @@ using tensorflow::error::Code;
 using tensorflow::errors::FailedPrecondition;
 using tensorflow::errors::InvalidArgument;
 using tensorflow::gtl::ArraySlice;
-using tensorflow::mutex_lock;
-using tensorflow::string;
 using tensorflow::strings::StrCat;
 
 extern "C" {
@@ -199,11 +199,11 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
       reinterpret_cast<intptr_t>(data) % EIGEN_MAX_ALIGN_BYTES != 0) {
     // TF_STRING and TF_RESOURCE tensors have a different representation in
     // TF_Tensor than they do in tensorflow::Tensor. So a copy here is a waste
-    // (any alignement requirements will be taken care of by TF_TensorToTensor
+    // (any alignment requirements will be taken care of by TF_TensorToTensor
     // and TF_TensorFromTensor).
     //
-    // Other types have the same represntation, so copy only if it is safe to do
-    // so.
+    // Other types have the same representation, so copy only if it is safe to
+    // do so.
     buf->data_ = allocate_tensor("TF_NewTensor", len);
     std::memcpy(buf->data_, data, len);
     buf->deallocator_ = deallocate_buffer;
@@ -215,7 +215,13 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
     buf->deallocator_ = deallocator;
     buf->deallocator_arg_ = deallocator_arg;
   }
-  return new TF_Tensor{dtype, TensorShape(dimvec), buf};
+  TF_Tensor* ret = new TF_Tensor{dtype, TensorShape(dimvec), buf};
+  size_t elem_size = TF_DataTypeSize(dtype);
+  if (elem_size > 0 && len < (elem_size * ret->shape.num_elements())) {
+    delete ret;
+    return nullptr;
+  }
+  return ret;
 }
 
 TF_Tensor* TF_TensorMaybeMove(TF_Tensor* tensor) {
@@ -2148,7 +2154,7 @@ Status CopyGraph(Graph* src_graph, Graph* dst_graph,
     opts.return_tensors.push_back(ToTensorId(nodes_to_return[i]));
   }
 
-  // TOOD(skyewm): change to OutputTensor
+  // TODO(skyewm): change to OutputTensor
   tensorflow::ImportGraphDefResults results;
   TF_RETURN_IF_ERROR(
       ImportGraphDef(opts, gdef, dst_graph, dst_refiner, &results));
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index d2e45341bf..6d30905a1a 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -226,6 +226,10 @@ typedef struct TF_Tensor TF_Tensor;
 //      (*deallocator)(data, len, deallocator_arg)
 // Clients must provide a custom deallocator function so they can pass in
 // memory managed by something like numpy.
+//
+// May return NULL (and invoke the deallocator) if the provided data buffer
+// (data, len) is inconsistent with a tensor of the given TF_DataType
+// and the shape specified by (dima, num_dims).
 TF_CAPI_EXPORT extern TF_Tensor* TF_NewTensor(
     TF_DataType, const int64_t* dims, int num_dims, void* data, size_t len,
     void (*deallocator)(void* data, size_t len, void* arg),
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 01954eb235..0f71bd8f32 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -94,6 +94,17 @@ TEST(CAPI, Tensor) {
   EXPECT_TRUE(deallocator_called);
 }
 
+void NoOpDeallocator(void* data, size_t, void*) {}
+
+TEST(CAPI, MalformedTensor) {
+  // See https://github.com/tensorflow/tensorflow/issues/7394
+  // num_dims = 0 implies a scalar, so should be backed by at least 4 bytes of
+  // data.
+  TF_Tensor* t =
+      TF_NewTensor(TF_FLOAT, nullptr, 0, nullptr, 0, &NoOpDeallocator, nullptr);
+  ASSERT_TRUE(t == nullptr);
+}
+
 TEST(CAPI, AllocateTensor) {
   const int num_bytes = 6 * sizeof(float);
   int64_t dims[] = {2, 3};
-- 
GitLab


From c52ac787c6b716f5abbcebca2a57e3dc3f157200 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Thu, 8 Feb 2018 13:30:17 -0800
Subject: [PATCH 860/966] Internal functional _If and _While ops.

PiperOrigin-RevId: 185042663
---
 tensorflow/core/BUILD                     |   1 +
 tensorflow/core/kernels/BUILD             |  11 +
 tensorflow/core/kernels/functional_ops.cc | 322 ++++++++++++++++++++++
 tensorflow/core/ops/functional_ops.cc     |  59 ++++
 4 files changed, 393 insertions(+)
 create mode 100644 tensorflow/core/kernels/functional_ops.cc

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index d0c9a72af9..a7f8533014 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -838,6 +838,7 @@ cc_library(
         "//tensorflow/core/kernels:dataset_ops",
         "//tensorflow/core/kernels:fake_quant_ops",
         "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:functional_ops",
         "//tensorflow/core/kernels:histogram_op",
         "//tensorflow/core/kernels:image",
         "//tensorflow/core/kernels:io",
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index e7192ec42f..523e395699 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -1942,6 +1942,17 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "functional_ops",
+    prefix = "functional_ops",
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
+)
+
 cc_library(
     name = "image",
     deps = [
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
new file mode 100644
index 0000000000..b687088db1
--- /dev/null
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -0,0 +1,322 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef FunctionLibraryRuntime::Handle FHandle;
+typedef std::vector<Tensor> TensorVec;
+
+namespace {
+
+// Helper to instantiate function "func" in the library "lib".
+Status Instantiate(FunctionLibraryRuntime* lib, const NameAttrList& func,
+                   FunctionLibraryRuntime::Handle* handle) {
+  return lib->Instantiate(func.name(), AttrSlice(&func.attr()), handle);
+}
+
+// If "t" is a scalar of a supported type, returns t != 0 in "*v".
+Status ToBool(gtl::ArraySlice<Tensor> t, bool* v) {
+  if (t.size() != 1) {
+    return errors::InvalidArgument(
+        "Expected a single scalar which can be converted to a boolean, got ",
+        t.size(), " tensors.");
+  }
+  if (TensorShapeUtils::IsScalar(t[0].shape())) {
+    switch (t[0].dtype()) {
+#define CASE(T)                   \
+  case DataTypeToEnum<T>::value:  \
+    *v = t[0].scalar<T>()() != 0; \
+    break;
+
+      CASE(float);
+      CASE(double);
+      CASE(int32);
+      CASE(uint8);
+      CASE(int16);
+      CASE(int8);
+      CASE(int64);
+#undef CASE
+      case DT_BOOL:
+        *v = t[0].scalar<bool>()();
+        break;
+      case DT_STRING:
+        *v = !t[0].scalar<string>()().empty();
+        break;
+      default:
+        return errors::InvalidArgument(DataTypeString(t[0].dtype()),
+                                       " cannot be converted to a boolean");
+    }
+  } else {
+    *v = t[0].NumElements() > 0;
+  }
+  return Status::OK();
+}
+
+// Sets "rets" to be the output of "ctx". Validates rets' types based
+// on "kernel".
+Status SetOutputs(const OpKernel* kernel, OpKernelContext* ctx,
+                  gtl::ArraySlice<Tensor> rets) {
+  if (rets.size() != ctx->num_outputs()) {
+    return errors::Internal("Expect to produce ", ctx->num_outputs(),
+                            " tensors, but only get ", rets.size());
+  }
+  for (int i = 0; i < rets.size(); ++i) {
+    if (rets[i].dtype() != kernel->output_type(i)) {
+      return errors::Internal("Expect ", i, "-th output is of type ",
+                              DataTypeString(kernel->output_type(i)),
+                              " but get ", DataTypeString(rets[i].dtype()));
+    }
+    ctx->set_output(i, rets[i]);
+  }
+  return Status::OK();
+}
+
+void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts,
+                   bool always_collect_stats) {
+  opts->step_id = ctx->step_id();
+  opts->rendezvous = ctx->rendezvous();
+  opts->cancellation_manager = ctx->cancellation_manager();
+  if (always_collect_stats) {
+    opts->stats_collector = ctx->stats_collector();
+  }
+  opts->runner = ctx->runner();
+}
+
+}  // end namespace
+
+class FunctionalIf : public AsyncOpKernel {
+ public:
+  explicit FunctionalIf(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    auto lib = ctx->function_library();
+    OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
+    const NameAttrList* func;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("then_branch", &func));
+    OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &then_handle_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("else_branch", &func));
+    OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &else_handle_));
+  }
+
+  ~FunctionalIf() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    bool cond;
+    OP_REQUIRES_OK(ctx, ToBool({ctx->input(0)}, &cond));
+    (new State(this, ctx, cond, done))->Start();
+  }
+
+ private:
+  FHandle then_handle_;
+  FHandle else_handle_;
+
+  class State {
+   public:
+    State(FunctionalIf* kernel, OpKernelContext* ctx, bool cond,
+          DoneCallback done)
+        : kernel_(kernel),
+          ctx_(ctx),
+          cond_(cond),
+          done_(done),
+          lib_(CHECK_NOTNULL(ctx_->function_library())) {
+      SetRunOptions(ctx_, &opts_, true /* always_collect_stats */);
+      for (int i = 1; i < ctx_->num_inputs(); ++i) {
+        args_.push_back(ctx_->input(i));
+      }
+    }
+
+    ~State() {}
+
+    void Start() {
+      FHandle handle = cond_ ? kernel_->then_handle_ : kernel_->else_handle_;
+      rets_.clear();
+      lib_->Run(
+          // Evaluate one of the branch.
+          opts_, handle, args_, &rets_,
+          // Done callback
+          [this](Status s) {
+            if (s.ok()) {
+              s = SetOutputs(kernel_, ctx_, rets_);
+            }
+            ctx_->SetStatus(s);
+            auto done = done_;
+            delete this;
+            done();
+          });
+    }
+
+   private:
+    FunctionalIf* const kernel_;
+    OpKernelContext* const ctx_;
+    const bool cond_;
+    const DoneCallback done_;
+    FunctionLibraryRuntime* const lib_;
+    FunctionLibraryRuntime::Options opts_;
+    TensorVec args_;
+    TensorVec rets_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), FunctionalIf);
+REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
+                        FunctionalIf);
+
+class FunctionalWhile : public AsyncOpKernel {
+ public:
+  explicit FunctionalWhile(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("cond", &cond_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("body", &body_func_));
+  }
+
+  ~FunctionalWhile() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    auto lib = ctx->function_library();
+    OP_REQUIRES_ASYNC(ctx, lib != nullptr,
+                      errors::Internal("No function library"), done);
+
+    // TODO(b/37549631): Because this op has `SetIsStateful()` in its
+    // op registration, this kernel may be shared by multiple
+    // subgraphs, which have different associated
+    // `FunctionLibraryRuntime` objects and hence different `FHandle`
+    // namespaces. We currently work around this by caching the map
+    // from `FunctionLibraryRuntime*` to `FHandle` pairs for the two
+    // functions this op uses.
+    FHandle cond_handle;
+    FHandle body_handle;
+    {
+      mutex_lock l(mu_);
+      const auto iter = handles_.find(lib);
+      if (iter == handles_.end()) {
+        OP_REQUIRES_OK_ASYNC(ctx, Instantiate(lib, cond_func_, &cond_handle),
+                             done);
+        OP_REQUIRES_OK_ASYNC(ctx, Instantiate(lib, body_func_, &body_handle),
+                             done);
+        handles_[lib] = {cond_handle, body_handle};
+      } else {
+        cond_handle = iter->second.first;
+        body_handle = iter->second.second;
+      }
+    }
+
+    (new State(this, ctx, cond_handle, body_handle, done))->Start();
+  }
+
+ private:
+  NameAttrList cond_func_;
+  NameAttrList body_func_;
+
+  mutex mu_;
+  std::unordered_map<FunctionLibraryRuntime*, std::pair<FHandle, FHandle>>
+      handles_ GUARDED_BY(mu_);
+
+  class State {
+   public:
+    State(FunctionalWhile* kernel, OpKernelContext* ctx, FHandle cond_handle,
+          FHandle body_handle, DoneCallback done)
+        : kernel_(kernel),
+          ctx_(ctx),
+          cond_handle_(cond_handle),
+          body_handle_(body_handle),
+          done_(done),
+          lib_(CHECK_NOTNULL(ctx_->function_library())) {
+      SetRunOptions(ctx_, &opts_, false /* always_collect_stats */);
+      for (int i = 0; i < ctx_->num_inputs(); ++i) {
+        args_.push_back(ctx_->input(i));
+      }
+    }
+
+    ~State() {}
+
+    void Start() { EvalCond(); }
+
+   private:
+    FunctionalWhile* const kernel_;
+    OpKernelContext* const ctx_;
+    const FHandle cond_handle_;
+    const FHandle body_handle_;
+    const DoneCallback done_;
+    FunctionLibraryRuntime* const lib_;
+    FunctionLibraryRuntime::Options opts_;
+    TensorVec args_;
+    TensorVec rets_;
+
+    void EvalCond() {
+      lib_->Run(
+          // Evaluate the condition.
+          opts_, cond_handle_, args_, &rets_,
+          // Done cb.
+          [this](const Status& s) {
+            if (!s.ok()) {
+              return Finish(s);
+            }
+            StartBody();
+          });
+    }
+
+    void StartBody() {
+      bool cond;
+      Status s = ToBool(rets_, &cond);
+      if (!s.ok()) {
+        return Finish(s);
+      }
+      if (!cond) {
+        return Finish(Status::OK());
+      }
+      rets_.clear();
+      lib_->Run(
+          // Evaluate the body.
+          opts_, body_handle_, args_, &rets_,
+          // Done callback
+          [this](const Status& s) {
+            if (!s.ok()) {
+              return Finish(s);
+            }
+            if (args_.size() != rets_.size()) {
+              return Finish(errors::InvalidArgument(
+                  "While loop body returned ", rets_.size(),
+                  " arguments. Expected: ", args_.size()));
+            }
+            args_.clear();
+            using std::swap;
+            swap(args_, rets_);
+            EvalCond();
+          });
+    }
+
+    void Finish(Status s) {
+      if (s.ok()) {
+        s = SetOutputs(kernel_, ctx_, args_);
+      }
+      ctx_->SetStatus(s);
+      done_();
+      delete this;
+    }
+  };
+};
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_CPU), FunctionalWhile);
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_GPU), FunctionalWhile);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index 515b31623b..9e18d20db6 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -48,4 +48,63 @@ REGISTER_OP("RemoteCall")
     .Attr("Tout: list(type)")
     .Attr("f: func")
     .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("_If")
+    .Input("cond: Tcond")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("Tcond: type")
+    .Attr("Tin: list(type)")
+    .Attr("Tout: list(type)")
+    .Attr("then_branch: func")
+    .Attr("else_branch: func")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+output = cond ? then_branch(input) : else_branch(input)
+
+cond: A Tensor. If the tensor is a scalar of non-boolean type, the
+    scalar is converted to a boolean according to the
+    following rule: if the scalar is a numerical value, non-zero means
+    True and zero means False; if the scalar is a string, non-empty
+    means True and empty means False. If the tensor is not a scalar,
+    being empty means False and being non-empty means True.
+input: A list of input tensors.
+then_branch: A function that takes 'inputs' and returns a list of
+    tensors, whose types are the same as what else_branch returns.
+else_branch: A function that takes 'inputs' and returns a list of
+    tensors.  whose types are the same as what then_branch returns.
+)doc");
+
+// TODO(b/37549631) setting the While Op to always be stateful is too
+// conservative.
+REGISTER_OP("_While")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("cond: func")
+    .Attr("body: func")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      for (int i = 0; i < c->num_outputs(); ++i) {
+        c->set_output(i, c->input(i));
+      }
+      return Status::OK();
+    })
+    .Doc(R"doc(
+output = input; While (Cond(output)) { output = Body(output) }
+
+input: A list of input tensors whose types are T.
+output: A list of output tensors whose types are T.
+cond: A function takes 'input' and returns a tensor.  If the tensor is
+    a scalar of non-boolean, the scalar is converted to a boolean
+    according to the following rule: if the scalar is a numerical
+    value, non-zero means True and zero means False; if the scalar is
+    a string, non-empty means True and empty means False. If the
+    tensor is not a scalar, non-emptiness means True and False
+    otherwise.
+body: A function that takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as specified
+      by T.
+)doc");
+
 }  // end namespace tensorflow
-- 
GitLab


From 166d6e869e4c559829c8ad4d7cc19a792c2bf444 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 13:37:48 -0800
Subject: [PATCH 861/966] Plumbs a variable from the batching function
 decorator to the queue depth control option in the underlying BatchingQueue.
 It defaults to 10 if not set, which was the original default value in the
 BatchingQueue options struct.

PiperOrigin-RevId: 185043844
---
 tensorflow/contrib/batching/python/ops/batch_ops.py | 9 +++++++--
 tensorflow/core/kernels/batch_kernels.cc            | 9 +++++++--
 tensorflow/core/ops/batch_ops.cc                    | 1 +
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/batching/python/ops/batch_ops.py b/tensorflow/contrib/batching/python/ops/batch_ops.py
index 4e0b3f9af9..921d6917a4 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops.py
@@ -53,10 +53,13 @@ def _UnbatchGrad(op, grad):   # pylint: disable=invalid-name
   ]
 
 
-def batch_function(num_batch_threads, max_batch_size, batch_timeout_micros,
+def batch_function(num_batch_threads,
+                   max_batch_size,
+                   batch_timeout_micros,
                    allowed_batch_sizes=None,
                    grad_timeout_micros=60 * 1000 * 1000,
-                   unbatch_timeout_micros=60 * 1000 * 1000):
+                   unbatch_timeout_micros=60 * 1000 * 1000,
+                   max_enqueued_batches=10):
   """Batches the computation done by the decorated function.
 
   So, for example, in the following code
@@ -94,6 +97,7 @@ def batch_function(num_batch_threads, max_batch_size, batch_timeout_micros,
      documentation of the unbatch op for more details. Defaults to 60s.
     unbatch_timeout_micros: The timeout to use for unbatching. See the
      documentation of the unbatch op for more details. Defaults to 60s.
+    max_enqueued_batches: The maximum depth of the batch queue. Defaults to 10.
 
   Returns:
     The decorated function will return the unbatched computation output Tensors.
@@ -111,6 +115,7 @@ def batch_function(num_batch_threads, max_batch_size, batch_timeout_micros,
             num_batch_threads=num_batch_threads,
             max_batch_size=max_batch_size,
             batch_timeout_micros=batch_timeout_micros,
+            max_enqueued_batches=max_enqueued_batches,
             allowed_batch_sizes=allowed_batch_sizes,
             grad_timeout_micros=grad_timeout_micros,
             shared_name=name)
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index c447db842d..546e51be53 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -207,7 +207,7 @@ Status Split(OpKernelContext* context, const Tensor& input,
 class BatchResource : public ResourceBase {
  public:
   static Status Create(int32 num_batch_threads, int32 max_batch_size,
-                       int32 batch_timeout_micros,
+                       int32 batch_timeout_micros, int32 max_enqueued_batches,
                        const std::vector<int32>& allowed_batch_sizes,
                        std::unique_ptr<BatchResource>* resource) {
     std::unique_ptr<BatchResource> new_resource(new BatchResource);
@@ -218,6 +218,8 @@ class BatchResource : public ResourceBase {
         Batcher::Create(batcher_options, &new_resource->batcher_));
 
     new_resource->batcher_queue_options_.max_batch_size = max_batch_size;
+    new_resource->batcher_queue_options_.max_enqueued_batches =
+        max_enqueued_batches;
     new_resource->batcher_queue_options_.batch_timeout_micros =
         batch_timeout_micros;
 
@@ -513,6 +515,8 @@ class BatchKernel : public AsyncOpKernel {
     OP_REQUIRES_OK(c, c->GetAttr("max_batch_size", &max_batch_size_));
     OP_REQUIRES_OK(c,
                    c->GetAttr("batch_timeout_micros", &batch_timeout_micros_));
+    OP_REQUIRES_OK(c,
+                   c->GetAttr("max_enqueued_batches", &max_enqueued_batches_));
     OP_REQUIRES_OK(c, c->GetAttr("allowed_batch_sizes", &allowed_batch_sizes_));
     OP_REQUIRES_OK(c, ValidateAllowedBatchSizes());
   }
@@ -524,7 +528,7 @@ class BatchKernel : public AsyncOpKernel {
           std::unique_ptr<BatchResource> new_resource;
           TF_RETURN_IF_ERROR(BatchResource::Create(
               num_batch_threads_, max_batch_size_, batch_timeout_micros_,
-              allowed_batch_sizes_, &new_resource));
+              max_enqueued_batches_, allowed_batch_sizes_, &new_resource));
           *r = new_resource.release();
           return Status::OK();
         };
@@ -570,6 +574,7 @@ class BatchKernel : public AsyncOpKernel {
   int32 num_batch_threads_;
   int32 max_batch_size_;
   int32 batch_timeout_micros_;
+  int32 max_enqueued_batches_;
   std::vector<int32> allowed_batch_sizes_;
 };
 
diff --git a/tensorflow/core/ops/batch_ops.cc b/tensorflow/core/ops/batch_ops.cc
index a64582acee..0a62965eed 100644
--- a/tensorflow/core/ops/batch_ops.cc
+++ b/tensorflow/core/ops/batch_ops.cc
@@ -26,6 +26,7 @@ REGISTER_OP("Batch")
     .Output("id: int64")
     .Attr("num_batch_threads: int")
     .Attr("max_batch_size: int")
+    .Attr("max_enqueued_batches: int = 10")
     .Attr("batch_timeout_micros: int")
     .Attr("allowed_batch_sizes: list(int) = []")
     .Attr("grad_timeout_micros: int")
-- 
GitLab


From 597377fca28c76306e749f78f8073f55726d54c9 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Thu, 8 Feb 2018 13:38:50 -0800
Subject: [PATCH 862/966] Prototype object-based save/restore syntax sugar

- Overrides __setattr__ to allow implicit dependencies
- Supports any valid Python 2 identifier as a Checkpointable dependency name (was messing with underscore prefixes)

PiperOrigin-RevId: 185044022
---
 .../contrib/eager/python/checkpointable.py    | 50 +++++++---
 .../eager/python/checkpointable_test.py       | 94 ++++++++++++-------
 2 files changed, 98 insertions(+), 46 deletions(-)

diff --git a/tensorflow/contrib/eager/python/checkpointable.py b/tensorflow/contrib/eager/python/checkpointable.py
index 47ce5897c0..ce4e07874e 100644
--- a/tensorflow/contrib/eager/python/checkpointable.py
+++ b/tensorflow/contrib/eager/python/checkpointable.py
@@ -46,9 +46,10 @@ _CheckpointableReference = collections.namedtuple(
     ])
 
 # Validation regular expression for the local names of Checkpointable
-# objects. In particular, disallows "/" in names, and reserves
-# underscore-prefixed names.
-_VALID_LOCAL_NAME = re.compile(r"^[A-Za-z0-9.][A-Za-z0-9_.-]*$")
+# objects. In particular, disallows "/" in names, and reserves dash-prefixed
+# names (which are not valid Python identifiers, so we're not restricting the
+# __setattr__ syntax that way).
+_VALID_LOCAL_NAME = re.compile(r"^[A-Za-z0-9_.][A-Za-z0-9_.-]*$")
 
 # Keyword for identifying that the next bit of a checkpoint variable name is a
 # slot name. May not be the local name of a checkpointable. Checkpoint names for
@@ -58,7 +59,7 @@ _VALID_LOCAL_NAME = re.compile(r"^[A-Za-z0-9.][A-Za-z0-9_.-]*$")
 #
 # Where <path to variable> is a full path from the checkpoint root to the
 # variable being slotted for.
-_OPTIMIZER_SLOTS_NAME = "_OPTIMIZER_SLOT"
+_OPTIMIZER_SLOTS_NAME = "-OPTIMIZER_SLOT"
 
 
 def _assign_existing_variable(variable_to_restore, value_pointer):
@@ -89,12 +90,12 @@ class Checkpointable(object):
   """
 
   def __init__(self):
-    # Basically a less useful OrderedDict but without the reference cycles.
-    # TODO(allenl): Switch this to OrderedDict once TensorFlow supports only
-    # Python 3.6+.
     # A list of _CheckpointableReference objects.
     self._checkpoint_dependencies = []
-    self._dependency_names = set()
+    # Maps names -> Checkpointable objects for named dependencies
+    self._dependency_names = {}
+    # Set of all tracked Checkpointables
+    self._already_tracked = set()
     # Start numbering at 1, since an un-set protocol buffer integer is
     # indistinguishable from 0.
     self._next_unnamed_checkpoint_dependency_uid = 1
@@ -102,6 +103,27 @@ class Checkpointable(object):
     self._deferred_restorations = {}  # local name -> _VariableRestoration
                                       # object
 
+  def __setattr__(self, name, value):
+    """Support self.foo = checkpointable syntax.
+
+    `self.foo = checkpointable` is equivalent to
+    `self.foo = self.track_checkpointable(checkpointable, name='foo')`.
+
+    No new tracking if `value` is not a `Checkpointable`, or if `value` is
+    already being tracked (either because of an explicit `track_checkpointable`
+    or a previous `__setattr__`).
+
+    Args:
+      name: The name of the property being set.
+      value: The new value for the property.
+    """
+    # Give child classes (e.g. Network) priority, then track only if the object
+    # hasn't been added to _already_tracked.
+    super(Checkpointable, self).__setattr__(name, value)
+    if (isinstance(value, Checkpointable)
+        and value not in self._already_tracked):
+      self.track_checkpointable(value, name=name)
+
   def add_variable(self, name, shape, dtype=None, initializer=None, **kwargs):
     """Create a new variable object to be saved with this `Checkpointable`.
 
@@ -217,12 +239,13 @@ class Checkpointable(object):
             ("Checkpointable names must match the regular expression '%s', but "
              "got an invalid name '%s' instead.") % (_VALID_LOCAL_NAME.pattern,
                                                      name))
-      if name in self._dependency_names:
+      if (name in self._dependency_names
+          and self._dependency_names[name] is not checkpointable):
         raise ValueError(
             ("Called Checkpointable.track_checkpointable() with name='%s', but "
              "a Checkpointable with this name is already declared as a "
              "dependency. If provided, names must be unique.") % (name,))
-      self._dependency_names.add(name)
+      self._dependency_names[name] = checkpointable
       local_uid = None
     else:
       # TODO(allenl): Should this be exposed to allow users to stop depending on
@@ -232,6 +255,7 @@ class Checkpointable(object):
     self._checkpoint_dependencies.append(
         _CheckpointableReference(
             name=name, ref=checkpointable, local_uid=local_uid))
+    self._already_tracked.add(checkpointable)
     return checkpointable
 
   def _process_restoration(self, restoration):
@@ -305,8 +329,10 @@ def _breadth_first_checkpointable_traversal(root_checkpointable):
 
 
 def _object_prefix_from_path(path_to_root):
-  return "/".join((checkpointable.name if checkpointable.name else "_%d" % (
-      checkpointable.local_uid,)) for checkpointable in path_to_root)
+  return "/".join(
+      (checkpointable.name if checkpointable.name
+       else "-unnamed_%d" % (checkpointable.local_uid,))
+      for checkpointable in path_to_root)
 
 
 def _escape_variable_name(variable_name):
diff --git a/tensorflow/contrib/eager/python/checkpointable_test.py b/tensorflow/contrib/eager/python/checkpointable_test.py
index d823053283..4b92ad59e7 100644
--- a/tensorflow/contrib/eager/python/checkpointable_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.layers import base
 from tensorflow.python.layers import core
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
@@ -64,6 +65,13 @@ class CheckpointableNetwork(network_lib.Network, checkpointable.Checkpointable):
     network_lib.Network.__init__(self)
     checkpointable.Checkpointable.__init__(self)
 
+  def __setattr__(self, name, value):
+    if isinstance(value, base.Layer) and value not in self._already_tracked:
+      self.track_layer(value, name=name)
+    # Checkpointable is next in the method resolution order, so this will catch
+    # Checkpointable objects which aren't Layers.
+    super(CheckpointableNetwork, self).__setattr__(name, value)
+
   def track_layer(self, layer, name=None):
     self.track_checkpointable(layer, name=name)
     return super(CheckpointableNetwork, self).track_layer(layer)
@@ -107,18 +115,34 @@ class CheckpointableAdam(adam.AdamOptimizer, checkpointable.Checkpointable):
     return v
 
 
+class NonLayerCheckpointable(checkpointable.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    with variable_scope.variable_scope(None, default_name="non_layer"):
+      # Unfortunately using tf.get_variable to implement self.add_variable
+      # (necessary for backwards compatibile naming with Layers) we can still
+      # run into duplicate variable errors (when building a graph, not when
+      # executing eagerly), thus the variable scope.
+      #
+      # TODO(allenl): Consider creating a ResourceVariable directly by
+      # default so that variable reuse isn't an issue.
+      self._a_variable = self.add_variable("a_variable", shape=[])
+
+
 class MyNetwork(CheckpointableNetwork):
   """A concrete Network for testing."""
 
   def __init__(self):
     super(MyNetwork, self).__init__()
-    self._named = self.track_layer(
-        CheckpointableDenseLayer(1, use_bias=True), name="named_dense")
+    self._named_dense = CheckpointableDenseLayer(1, use_bias=True)
     self._unnamed = self.track_layer(
         CheckpointableDenseLayer(1, use_bias=False))
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
 
   def call(self, values):
-    return self._unnamed(self._named(values))
+    return self._unnamed(self._named_dense(values))
 
 
 class Root(checkpointable.Checkpointable):
@@ -126,8 +150,8 @@ class Root(checkpointable.Checkpointable):
 
   def __init__(self, optimizer, network):
     super(Root, self).__init__()
-    self.track_checkpointable(optimizer, name="optimizer")
-    self.track_checkpointable(network, name="network")
+    self._optimizer = optimizer
+    self._network = self.track_checkpointable(network, "network")
     self._global_step = None
 
   @property
@@ -177,36 +201,38 @@ class CheckpointNamingTests(test.TestCase):
         "global_step",
         # No name provided to track_checkpointable(), so the position is used
         # instead (one-based).
-        "network/_1/kernel",
+        "network/-unnamed_1/kernel",
         # track_checkpointable() with a name provided, so that's used
-        "network/named_dense/kernel",
-        "network/named_dense/bias",
+        "network/_named_dense/kernel",
+        "network/_named_dense/bias",
+        # non-Layer dependency of the network
+        "network/_non_layer/a_variable",
         # The optimizer creates two non-slot variables
-        "optimizer/beta1_power",
-        "optimizer/beta2_power",
+        "_optimizer/beta1_power",
+        "_optimizer/beta2_power",
         # Slot variables
-        "network/_1/kernel/_OPTIMIZER_SLOT/optimizer/m",
-        "network/_1/kernel/_OPTIMIZER_SLOT/optimizer/v",
-        "network/named_dense/kernel/_OPTIMIZER_SLOT/optimizer/m",
-        "network/named_dense/kernel/_OPTIMIZER_SLOT/optimizer/v",
-        "network/named_dense/bias/_OPTIMIZER_SLOT/optimizer/m",
-        "network/named_dense/bias/_OPTIMIZER_SLOT/optimizer/v",
+        "network/-unnamed_1/kernel/-OPTIMIZER_SLOT/_optimizer/m",
+        "network/-unnamed_1/kernel/-OPTIMIZER_SLOT/_optimizer/v",
+        "network/_named_dense/kernel/-OPTIMIZER_SLOT/_optimizer/m",
+        "network/_named_dense/kernel/-OPTIMIZER_SLOT/_optimizer/v",
+        "network/_named_dense/bias/-OPTIMIZER_SLOT/_optimizer/m",
+        "network/_named_dense/bias/-OPTIMIZER_SLOT/_optimizer/v",
     )
     six.assertCountEqual(self, expected_checkpoint_names,
                          named_variables.keys())
     # Check that we've mapped to the right variable objects (not exhaustive)
     self.assertEqual("global_step:0", named_variables["global_step"].name)
     self.assertEqual("my_network/checkpointable_dense_layer_1/kernel:0",
-                     named_variables["network/_1/kernel"].name)
+                     named_variables["network/-unnamed_1/kernel"].name)
     self.assertEqual("my_network/checkpointable_dense_layer/kernel:0",
-                     named_variables["network/named_dense/kernel"].name)
+                     named_variables["network/_named_dense/kernel"].name)
     self.assertEqual("beta1_power:0",
-                     named_variables["optimizer/beta1_power"].name)
+                     named_variables["_optimizer/beta1_power"].name)
     self.assertEqual("beta2_power:0",
-                     named_variables["optimizer/beta2_power"].name)
+                     named_variables["_optimizer/beta2_power"].name)
     # Spot check the generated protocol buffers.
     self.assertEqual(0, serialized_graph.nodes[0].children[0].local_uid)
-    self.assertEqual("optimizer",
+    self.assertEqual("_optimizer",
                      serialized_graph.nodes[0].children[0].local_name)
     optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
         0].node_id]
@@ -217,18 +243,18 @@ class CheckpointNamingTests(test.TestCase):
         "bias", optimizer_node.slot_variables[0].original_variable_local_name)
     original_variable_owner = serialized_graph.nodes[
         optimizer_node.slot_variables[0].original_variable_node_id]
-    self.assertEqual("network/named_dense/bias",
+    self.assertEqual("network/_named_dense/bias",
                      original_variable_owner.variables[0].checkpoint_key)
     self.assertEqual("bias", original_variable_owner.variables[0].local_name)
     self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
-    self.assertEqual("network/named_dense/bias/_OPTIMIZER_SLOT/optimizer/m",
+    self.assertEqual("network/_named_dense/bias/-OPTIMIZER_SLOT/_optimizer/m",
                      optimizer_node.slot_variables[0].checkpoint_key)
     # We strip off the :0 suffix, as variable.name-based saving does.
     self.assertEqual("my_network/checkpointable_dense_layer/bias/Adam",
                      optimizer_node.slot_variables[0].full_name)
     self.assertEqual("my_network/checkpointable_dense_layer/bias/Adam:0",
                      optimizer.get_slot(
-                         var=named_variables["network/named_dense/bias"],
+                         var=named_variables["network/_named_dense/bias"],
                          name="m").name)
 
   @test_util.run_in_graph_and_eager_modes()
@@ -247,14 +273,14 @@ class CheckpointNamingTests(test.TestCase):
       self.evaluate(variables.global_variables_initializer())
       self.evaluate(train_op)
     prefix = os.path.join(self.get_temp_dir(), "ckpt")
-    self.evaluate(state_ops.assign(network._named.variables[1], [42.]))
-    m_bias_slot = optimizer.get_slot(network._named.variables[1], "m")
+    self.evaluate(state_ops.assign(network._named_dense.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(network._named_dense.variables[1], "m")
     self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
     serialized_graph, save_path = checkpointable.save(
         file_prefix=prefix,
         root_checkpointable=root_checkpointable,
         global_step=root_checkpointable.global_step)
-    self.evaluate(state_ops.assign(network._named.variables[1], [43.]))
+    self.evaluate(state_ops.assign(network._named_dense.variables[1], [43.]))
     self.evaluate(state_ops.assign(root_checkpointable.global_step, 3))
     optimizer_variables = self.evaluate(optimizer.variables())
     self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
@@ -263,7 +289,7 @@ class CheckpointNamingTests(test.TestCase):
         save_path=save_path,
         root_checkpointable=root_checkpointable,
         object_graph_proto=serialized_graph)
-    self.assertAllEqual([42.], self.evaluate(network._named.variables[1]))
+    self.assertAllEqual([42.], self.evaluate(network._named_dense.variables[1]))
     self.assertAllEqual(1, self.evaluate(root_checkpointable.global_step))
     self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
     with ops.Graph().as_default():
@@ -281,9 +307,9 @@ class CheckpointNamingTests(test.TestCase):
         self.assertAllEqual(1, self.evaluate(on_create_root.global_step))
         self.assertAllEqual([42.],
                             self.evaluate(
-                                on_create_network._named.variables[1]))
+                                on_create_network._named_dense.variables[1]))
         on_create_m_bias_slot = on_create_optimizer.get_slot(
-            on_create_network._named.variables[1], "m")
+            on_create_network._named_dense.variables[1], "m")
         # Optimizer slot variables are created when the original variable is
         # restored.
         self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
@@ -393,16 +419,16 @@ class CheckpointNamingTests(test.TestCase):
     leaf.add_variable(name="v", shape=[])
     named_variables, _ = checkpointable._serialize_object_graph(root)
     variable_name, = named_variables.keys()
-    self.assertEqual(r"_1/v", variable_name)
+    self.assertEqual(r"-unnamed_1/v", variable_name)
 
   @test_util.run_in_graph_and_eager_modes()
   def testLocalNameValidation(self):
     root = checkpointable.Checkpointable()
     leaf = checkpointable.Checkpointable()
     with self.assertRaisesRegexp(ValueError, "invalid name"):
-      # Leading underscores are reserved, which avoids conflicts with
-      # un-named edges in paths and the optimizer slots identifier.
-      root.track_checkpointable(leaf, name="_12")
+      # Leading dashes are reserved, which avoids conflicts with un-named edges
+      # in paths and the optimizer slots identifier.
+      root.track_checkpointable(leaf, name="-unnamed-12")
 
 
 if __name__ == "__main__":
-- 
GitLab


From fa3fb289ba6a1718f9c76b2277a58f95f5e878ab Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 8 Feb 2018 13:43:20 -0800
Subject: [PATCH 863/966] Adding tf_export decorators/calls to TensorFlow
 functions and constants.

PiperOrigin-RevId: 185044705
---
 tensorflow/python/estimator/estimator.py          | 10 +++++++---
 tensorflow/python/estimator/exporter.py           |  4 ++++
 tensorflow/python/estimator/model_fn.py           |  3 +++
 tensorflow/python/estimator/run_config.py         |  2 ++
 tensorflow/python/estimator/training.py           |  4 ++++
 tensorflow/python/estimator/warm_starting_util.py |  3 +++
 .../keras/_impl/keras/datasets/boston_housing.py  |  2 ++
 .../python/keras/_impl/keras/datasets/cifar10.py  |  2 ++
 .../python/keras/_impl/keras/datasets/cifar100.py |  2 ++
 .../python/keras/_impl/keras/datasets/imdb.py     |  3 +++
 .../python/keras/_impl/keras/datasets/mnist.py    |  2 ++
 .../python/keras/_impl/keras/datasets/reuters.py  |  3 +++
 tensorflow/python/layers/base.py                  |  3 +++
 tensorflow/python/layers/convolutional.py         | 15 +++++++++++++++
 tensorflow/python/layers/core.py                  |  7 +++++++
 tensorflow/python/layers/network.py               |  2 ++
 tensorflow/python/layers/normalization.py         |  3 +++
 tensorflow/python/layers/pooling.py               | 13 +++++++++++++
 tensorflow/tools/api/generator/BUILD              |  1 +
 19 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 17fab3df4d..5d36108bbf 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -57,12 +57,14 @@ from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
 from tensorflow.python.util import compat_internal
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 _VALID_MODEL_FN_ARGS = set(
     ['features', 'labels', 'mode', 'params', 'self', 'config'])
 
 
+@tf_export('estimator.Estimator')
 class Estimator(object):
   """Estimator class to train and evaluate TensorFlow models.
 
@@ -502,9 +504,11 @@ class Estimator(object):
 
   def _assert_members_are_not_overridden(self):
     """Asserts members of `Estimator` are not overridden."""
-    allowed_overrides = set(['_call_input_fn', '_create_global_step',
-                             '_convert_train_steps_to_hooks',
-                             '_convert_eval_steps_to_hooks'])
+    allowed_overrides = set([
+        '_call_input_fn', '_create_global_step',
+        '_convert_train_steps_to_hooks', '_convert_eval_steps_to_hooks',
+        '_tf_api_names'
+    ])
     estimator_members = set([m for m in Estimator.__dict__.keys()
                              if not m.startswith('__')])
     subclass_members = set(self.__class__.__dict__.keys())
diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py
index ba522f396d..a3f04626d1 100644
--- a/tensorflow/python/estimator/exporter.py
+++ b/tensorflow/python/estimator/exporter.py
@@ -25,8 +25,10 @@ from tensorflow.python.estimator import gc
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('estimator.Exporter')
 class Exporter(object):
   """A class representing a type of model export."""
 
@@ -123,6 +125,7 @@ class _SavedModelExporter(Exporter):
     return export_result
 
 
+@tf_export('estimator.FinalExporter')
 class FinalExporter(Exporter):
   """This class exports the serving graph and checkpoints in the end.
 
@@ -174,6 +177,7 @@ class FinalExporter(Exporter):
                                              is_the_final_export)
 
 
+@tf_export('estimator.LatestExporter')
 class LatestExporter(Exporter):
   """This class regularly exports the serving graph and checkpoints.
 
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index b08f83fc56..8111ab564c 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -31,8 +31,10 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('estimator.ModeKeys')
 class ModeKeys(object):
   """Standard names for model modes.
 
@@ -52,6 +54,7 @@ LOSS_METRIC_KEY = 'loss'
 AVERAGE_LOSS_METRIC_KEY = 'average_loss'
 
 
+@tf_export('estimator.EstimatorSpec')
 class EstimatorSpec(
     collections.namedtuple('EstimatorSpec', [
         'mode', 'predictions', 'loss', 'train_op', 'eval_metric_ops',
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 0c636a8da1..3e021242c4 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -28,6 +28,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat_internal
+from tensorflow.python.util.tf_export import tf_export
 
 
 _USE_DEFAULT = object()
@@ -286,6 +287,7 @@ class TaskType(object):
   EVALUATOR = 'evaluator'
 
 
+@tf_export('estimator.RunConfig')
 class RunConfig(object):
   """This class specifies the configurations for an `Estimator` run."""
 
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index 2e84c5014f..63328dcfb5 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -35,6 +35,7 @@ from tensorflow.python.training import basic_session_run_hooks
 from tensorflow.python.training import server_lib
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 _MAX_DELAY_SECS = 60
 _DELAY_SECS_PER_WORKER = 5
@@ -114,6 +115,7 @@ def _is_google_env():
   return tf_config.get(_ENVIRONMENT_KEY) == _ENVIRONMENT_GOOGLE_VALUE
 
 
+@tf_export('estimator.TrainSpec')
 class TrainSpec(
     collections.namedtuple('TrainSpec', ['input_fn', 'max_steps', 'hooks'])):
   """Configuration for the "train" part for the `train_and_evaluate` call.
@@ -158,6 +160,7 @@ class TrainSpec(
         cls, input_fn=input_fn, max_steps=max_steps, hooks=hooks)
 
 
+@tf_export('estimator.EvalSpec')
 class EvalSpec(
     collections.namedtuple('EvalSpec', [
         'input_fn', 'steps', 'name', 'hooks', 'exporters', 'start_delay_secs',
@@ -246,6 +249,7 @@ class EvalSpec(
         throttle_secs=throttle_secs)
 
 
+@tf_export('estimator.train_and_evaluate')
 def train_and_evaluate(estimator, train_spec, eval_spec):
   """Train and evaluate the `estimator`.
 
diff --git a/tensorflow/python/estimator/warm_starting_util.py b/tensorflow/python/estimator/warm_starting_util.py
index 57db968d56..adb013f5c6 100644
--- a/tensorflow/python/estimator/warm_starting_util.py
+++ b/tensorflow/python/estimator/warm_starting_util.py
@@ -30,8 +30,10 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_ops
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.training import saver
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("estimator.VocabInfo")
 class VocabInfo(
     collections.namedtuple("VocabInfo", [
         "new_vocab",
@@ -81,6 +83,7 @@ class VocabInfo(
     )
 
 
+@tf_export("estimator.WarmStartSettings")
 class WarmStartSettings(
     collections.namedtuple("WarmStartSettings", [
         "ckpt_to_initialize_from",
diff --git a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
index cfd7df61d5..13fa9aed2b 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
@@ -21,8 +21,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.boston_housing.load_data')
 def load_data(path='boston_housing.npz', test_split=0.2, seed=113):
   """Loads the Boston Housing dataset.
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py b/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
index fb9d98d42c..6b77243382 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
@@ -25,8 +25,10 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.datasets.cifar import load_batch
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.cifar10.load_data')
 def load_data():
   """Loads CIFAR10 dataset.
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py b/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
index 95aace599a..28d74116a5 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
@@ -25,8 +25,10 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.datasets.cifar import load_batch
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.cifar100.load_data')
 def load_data(label_mode='fine'):
   """Loads CIFAR100 dataset.
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/imdb.py b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
index 880c9c821b..e2dddf7730 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/imdb.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
@@ -25,8 +25,10 @@ import numpy as np
 from tensorflow.python.keras._impl.keras.preprocessing.sequence import _remove_long_seq
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.imdb.load_data')
 def load_data(path='imdb.npz',
               num_words=None,
               skip_top=0,
@@ -128,6 +130,7 @@ def load_data(path='imdb.npz',
   return (x_train, y_train), (x_test, y_test)
 
 
+@tf_export('keras.datasets.imdb.get_word_index')
 def get_word_index(path='imdb_word_index.json'):
   """Retrieves the dictionary mapping word indices back to words.
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/mnist.py b/tensorflow/python/keras/_impl/keras/datasets/mnist.py
index ec12a31dcf..e30691373e 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/mnist.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/mnist.py
@@ -21,8 +21,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.mnist.load_data')
 def load_data(path='mnist.npz'):
   """Loads the MNIST dataset.
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/reuters.py b/tensorflow/python/keras/_impl/keras/datasets/reuters.py
index 95cf8852a9..b711696b5e 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/reuters.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/reuters.py
@@ -25,8 +25,10 @@ import numpy as np
 from tensorflow.python.keras._impl.keras.preprocessing.sequence import _remove_long_seq
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.reuters.load_data')
 def load_data(path='reuters.npz',
               num_words=None,
               skip_top=0,
@@ -112,6 +114,7 @@ def load_data(path='reuters.npz',
   return (x_train, y_train), (x_test, y_test)
 
 
+@tf_export('keras.datasets.reuters.get_word_index')
 def get_word_index(path='reuters_word_index.json'):
   """Retrieves the dictionary mapping word indices back to words.
 
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 5dea732cba..3a3c559541 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -37,8 +37,10 @@ from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('layers.Layer')
 class Layer(object):
   """Base layer class.
 
@@ -1228,6 +1230,7 @@ class Layer(object):
                                  ', found shape=' + str(shape))
 
 
+@tf_export('keras.layers.InputSpec', 'layers.InputSpec')
 class InputSpec(object):
   """Specifies the ndim, dtype and shape of every input to a layer.
 
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index e8dba3cea3..689046fe78 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _Conv(base.Layer):
@@ -222,6 +223,7 @@ class _Conv(base.Layer):
                                       new_space)
 
 
+@tf_export('layers.Conv1D')
 class Conv1D(_Conv):
   """1D convolution layer (e.g. temporal convolution).
 
@@ -311,6 +313,7 @@ class Conv1D(_Conv):
         name=name, **kwargs)
 
 
+@tf_export('layers.conv1d')
 def conv1d(inputs,
            filters,
            kernel_size,
@@ -411,6 +414,7 @@ def conv1d(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.Conv2D')
 class Conv2D(_Conv):
   """2D convolution layer (e.g. spatial convolution over images).
 
@@ -507,6 +511,7 @@ class Conv2D(_Conv):
         name=name, **kwargs)
 
 
+@tf_export('layers.conv2d')
 def conv2d(inputs,
            filters,
            kernel_size,
@@ -614,6 +619,7 @@ def conv2d(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.Conv3D')
 class Conv3D(_Conv):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
@@ -711,6 +717,7 @@ class Conv3D(_Conv):
         name=name, **kwargs)
 
 
+@tf_export('layers.conv3d')
 def conv3d(inputs,
            filters,
            kernel_size,
@@ -980,6 +987,7 @@ class _SeparableConv(_Conv):
     raise NotImplementedError
 
 
+@tf_export('layers.SeparableConv1D')
 class SeparableConv1D(_SeparableConv):
   """Depthwise separable 1D convolution.
 
@@ -1123,6 +1131,7 @@ class SeparableConv1D(_SeparableConv):
     return outputs
 
 
+@tf_export('layers.SeparableConv2D')
 class SeparableConv2D(_SeparableConv):
   """Depthwise separable 2D convolution.
 
@@ -1260,6 +1269,7 @@ class SeparableConv2D(_SeparableConv):
     return outputs
 
 
+@tf_export('layers.separable_conv1d')
 def separable_conv1d(inputs,
                      filters,
                      kernel_size,
@@ -1376,6 +1386,7 @@ def separable_conv1d(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.separable_conv2d')
 def separable_conv2d(inputs,
                      filters,
                      kernel_size,
@@ -1497,6 +1508,7 @@ def separable_conv2d(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.Conv2DTranspose')
 class Conv2DTranspose(Conv2D):
   """Transposed 2D convolution layer (sometimes called 2D Deconvolution).
 
@@ -1695,6 +1707,7 @@ class Conv2DTranspose(Conv2D):
     return tensor_shape.TensorShape(output_shape)
 
 
+@tf_export('layers.conv2d_transpose')
 def conv2d_transpose(inputs,
                      filters,
                      kernel_size,
@@ -1790,6 +1803,7 @@ def conv2d_transpose(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.Conv3DTranspose')
 class Conv3DTranspose(Conv3D):
   """Transposed 3D convolution layer (sometimes called 3D Deconvolution).
 
@@ -2018,6 +2032,7 @@ class Conv3DTranspose(Conv3D):
     return tensor_shape.TensorShape(output_shape)
 
 
+@tf_export('layers.conv3d_transpose')
 def conv3d_transpose(inputs,
                      filters,
                      kernel_size,
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 7bf62d45b8..ec4fca78f0 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -37,8 +37,10 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import standard_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('layers.Dense')
 class Dense(base.Layer):
   """Densely-connected layer class.
 
@@ -173,6 +175,7 @@ class Dense(base.Layer):
     return input_shape[:-1].concatenate(self.units)
 
 
+@tf_export('layers.dense')
 def dense(
     inputs, units,
     activation=None,
@@ -248,6 +251,7 @@ def dense(
   return layer.apply(inputs)
 
 
+@tf_export('layers.Dropout')
 class Dropout(base.Layer):
   """Applies Dropout to the input.
 
@@ -309,6 +313,7 @@ class Dropout(base.Layer):
     return input_shape
 
 
+@tf_export('layers.dropout')
 def dropout(inputs,
             rate=0.5,
             noise_shape=None,
@@ -350,6 +355,7 @@ def dropout(inputs,
   return layer.apply(inputs, training=training)
 
 
+@tf_export('layers.Flatten')
 class Flatten(base.Layer):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
@@ -386,6 +392,7 @@ class Flatten(base.Layer):
     return tensor_shape.TensorShape(output_shape)
 
 
+@tf_export('layers.flatten')
 def flatten(inputs, name=None):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py
index 7bcf25064c..6de8f35502 100644
--- a/tensorflow/python/layers/network.py
+++ b/tensorflow/python/layers/network.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 class InputLayer(base.Layer):
@@ -117,6 +118,7 @@ class InputLayer(base.Layer):
         output_tensors=[input_tensor])
 
 
+@tf_export('layers.Input')
 def Input(  # pylint: disable=invalid-name
     shape=None,
     batch_size=None,
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 890c12f6e0..656d566ab5 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -39,8 +39,10 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import moving_averages
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('layers.BatchNormalization')
 class BatchNormalization(base.Layer):
   """Batch Normalization layer from http://arxiv.org/abs/1502.03167.
 
@@ -629,6 +631,7 @@ class BatchNormalization(base.Layer):
     return input_shape
 
 
+@tf_export('layers.batch_normalization')
 def batch_normalization(inputs,
                         axis=-1,
                         momentum=0.99,
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index ab06a3a408..50503ce093 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -26,6 +26,7 @@ from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _Pooling1D(base.Layer):
@@ -96,6 +97,7 @@ class _Pooling1D(base.Layer):
     return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
 
 
+@tf_export('layers.AveragePooling1D')
 class AveragePooling1D(_Pooling1D):
   """Average Pooling layer for 1D inputs.
 
@@ -127,6 +129,7 @@ class AveragePooling1D(_Pooling1D):
         **kwargs)
 
 
+@tf_export('layers.average_pooling1d')
 def average_pooling1d(inputs, pool_size, strides,
                       padding='valid', data_format='channels_last',
                       name=None):
@@ -161,6 +164,7 @@ def average_pooling1d(inputs, pool_size, strides,
   return layer.apply(inputs)
 
 
+@tf_export('layers.MaxPooling1D')
 class MaxPooling1D(_Pooling1D):
   """Max Pooling layer for 1D inputs.
 
@@ -192,6 +196,7 @@ class MaxPooling1D(_Pooling1D):
         **kwargs)
 
 
+@tf_export('layers.max_pooling1d')
 def max_pooling1d(inputs, pool_size, strides,
                   padding='valid', data_format='channels_last',
                   name=None):
@@ -297,6 +302,7 @@ class _Pooling2D(base.Layer):
           [input_shape[0], rows, cols, input_shape[3]])
 
 
+@tf_export('layers.AveragePooling2D')
 class AveragePooling2D(_Pooling2D):
   """Average pooling layer for 2D inputs (e.g. images).
 
@@ -328,6 +334,7 @@ class AveragePooling2D(_Pooling2D):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@tf_export('layers.average_pooling2d')
 def average_pooling2d(inputs,
                       pool_size, strides,
                       padding='valid', data_format='channels_last',
@@ -365,6 +372,7 @@ def average_pooling2d(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.MaxPooling2D')
 class MaxPooling2D(_Pooling2D):
   """Max pooling layer for 2D inputs (e.g. images).
 
@@ -396,6 +404,7 @@ class MaxPooling2D(_Pooling2D):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@tf_export('layers.max_pooling2d')
 def max_pooling2d(inputs,
                   pool_size, strides,
                   padding='valid', data_format='channels_last',
@@ -515,6 +524,7 @@ class _Pooling3D(base.Layer):
           [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
 
 
+@tf_export('layers.AveragePooling3D')
 class AveragePooling3D(_Pooling3D):
   """Average pooling layer for 3D inputs (e.g. volumes).
 
@@ -548,6 +558,7 @@ class AveragePooling3D(_Pooling3D):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@tf_export('layers.average_pooling3d')
 def average_pooling3d(inputs,
                       pool_size, strides,
                       padding='valid', data_format='channels_last',
@@ -587,6 +598,7 @@ def average_pooling3d(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.MaxPooling3D')
 class MaxPooling3D(_Pooling3D):
   """Max pooling layer for 3D inputs (e.g. volumes).
 
@@ -620,6 +632,7 @@ class MaxPooling3D(_Pooling3D):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@tf_export('layers.max_pooling3d')
 def max_pooling3d(inputs,
                   pool_size, strides,
                   padding='valid', data_format='channels_last',
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 2d27b94d85..a1c960be43 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -80,6 +80,7 @@ genrule(
         "api/keras/utils/__init__.py",
         "api/keras/wrappers/__init__.py",
         "api/keras/wrappers/scikit_learn/__init__.py",
+        "api/layers/__init__.py",
         "api/linalg/__init__.py",
         "api/logging/__init__.py",
         "api/losses/__init__.py",
-- 
GitLab


From 037895185e970e3bdc789fbf6aad643c271415d4 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Thu, 8 Feb 2018 14:10:51 -0800
Subject: [PATCH 864/966] Don't fail if control dependency is on an input of
 the function.

PiperOrigin-RevId: 185049319
---
 tensorflow/c/c_api_function.cc      | 27 +++++++++++++++++++--------
 tensorflow/c/c_api_function_test.cc | 24 +++++++++++++++++-------
 2 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index 46271e0514..384e6c8cb9 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -44,8 +44,12 @@ class NodeNameMapping {
  public:
   NodeNameMapping() = default;
 
-  // Normalize the input/output name and make it unique.
-  string GetIOName(const string& name);
+  // Normalize the input name and make it unique. This is the same as the
+  // function for output, expect that it adds a name mapping for the name.
+  string GetInputName(const string& name);
+
+  // Normalize the output name and make it unique.
+  string GetOutputName(const string& name);
 
   // Make the node name unique.
   string Uniquify(const string& name);
@@ -107,7 +111,13 @@ string NodeNameMapping::UniquifyHelper(const string& name) const {
   }
 }
 
-string NodeNameMapping::GetIOName(const string& name) {
+string NodeNameMapping::GetInputName(const string& name) {
+  const string& input_name = GetOutputName(name);
+  name_mapping_[name] = input_name;
+  return input_name;
+}
+
+string NodeNameMapping::GetOutputName(const string& name) {
   const string& input_name = UniquifyHelper(Normalize(name));
   // Record that we used this name, but don't add it to name_mapping_
   // since this name is not for a node.
@@ -214,10 +224,11 @@ Status FillFunctionBody(
 
     // Add control inputs.
     for (const Edge* edge : control_edges) {
-      // Add this control input only if the src node is in the body.
+      // Add this control input only if the src node is in the body or a part of
+      // the inputs.
       const string normalized = node_names.Lookup(edge->src()->name());
       // If we did not find a name for the source of control edge, this
-      // source must be outside of the body. Raise an error.
+      // source must be outside of the body, and not an input. Raise an error.
       if (normalized.empty()) {
         return InvalidArgument(
             "The source of control edge ", edge->DebugString(),
@@ -279,7 +290,7 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
       TF_RETURN_IF_ERROR(node_names.UseOutputName(output_names[i]));
       argdef->set_name(output_names[i]);
     } else {
-      argdef->set_name(node_names.GetIOName(node->name()));
+      argdef->set_name(node_names.GetOutputName(node->name()));
     }
   }
 
@@ -289,7 +300,7 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
     int idx = inputs[i].index;
     OpDef::ArgDef* argdef = fdef->mutable_signature()->add_input_arg();
     argdef->set_type(node->output_type(idx));
-    const string& input_name = node_names.GetIOName(node->name());
+    const string& input_name = node_names.GetInputName(node->name());
     argdef->set_name(input_name);
     tensor_renaming[strings::StrCat(node->name(), ":", idx)] = input_name;
   }
@@ -467,7 +478,7 @@ Status ComputeBodyNodes(
   return Status::OK();
 }
 
-}  // anonymous namespace
+}  // namespace
 }  // namespace tensorflow
 
 using tensorflow::Node;
diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc
index dbce66d231..7ca50119ea 100644
--- a/tensorflow/c/c_api_function_test.cc
+++ b/tensorflow/c/c_api_function_test.cc
@@ -331,6 +331,11 @@ class CApiFunctionTest : public ::testing::Test {
           << "Failed to find expected edge " << e.ToString()
           << " in fdef: " << fdef.DebugString();
     }
+    for (const EdgeSpec& e : c_edges) {
+      ASSERT_TRUE(a_edges.find(e) != a_edges.end())
+          << "Failed to find expected control edge " << e.ToString()
+          << " in fdef: " << fdef.DebugString();
+    }
 
     // If caller specified all edges, check that we have seen all
     if (is_exact_edges) {
@@ -980,7 +985,7 @@ TEST_F(CApiFunctionTest, ControlDependency) {
   VerifyFDef(
       {"add_0", "scalar"}, M({{"feed1"}, {"feed2"}}), M({{"add"}}),
       {{"feed1", "add_0:0"}, {"feed2", "add_0:1"}, {"add_0:sum:0", "add"}},
-      {{"scalar", "add_0"}});
+      {{"^scalar", "add_0:2"}});
 }
 
 TEST_F(CApiFunctionTest, ControlDependencyOutsideOfBody) {
@@ -1023,12 +1028,17 @@ TEST_F(CApiFunctionTest, ControlDependencyOutsideOfBody_FromInputNode) {
   TF_Operation* add =
       AddWithCtrlDependency(feed1, feed2, func_graph_, feed1, s_);
   EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  Define(-1, {}, {feed1, feed2}, {add}, {}, true);
-  EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_));
-  EXPECT_EQ(string("The source of control edge [id=3 feed1:-1 -> add:-1] "
-                   "is not in the body. Encountered while creating "
-                   "function 'MyFunc'"),
-            string(TF_Message(s_)));
+  Define(-1, {}, {feed1, feed2}, {add}, {});
+
+  // Use, run, and verify
+  TF_Operation* two = ScalarConst(2, host_graph_, s_);
+  TF_Operation* func_feed = Placeholder(host_graph_, s_);
+  TF_Operation* func_op = Use({two, func_feed});
+  Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 3);
+  VerifyFDef(
+      {"add_0"}, M({{"feed1"}, {"feed2"}}), M({{"add"}}),
+      {{"feed1", "add_0:0"}, {"feed2", "add_0:1"}, {"add_0:sum:0", "add"}},
+      {{"^feed1", "add_0:2"}});
 }
 
 TEST_F(CApiFunctionTest, DuplicateInputsAreNotAllowed) {
-- 
GitLab


From af7d56e660a91cdb0ce235bddf8a1f01c8c8a593 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Thu, 8 Feb 2018 14:19:23 -0800
Subject: [PATCH 865/966] Object-based saving prototype: allow only named edges

Simplifies the logic a bit, and removes one source of potential checkpoint
incompatibilities.

Classes like Sequential which need anonymous references to other objects may
generate string names containing numbers.

PiperOrigin-RevId: 185050714
---
 .../proto/checkpointable_object_graph.proto   |   8 +-
 .../contrib/eager/python/checkpointable.py    | 111 ++++++------------
 .../eager/python/checkpointable_test.py       |  21 ++--
 3 files changed, 48 insertions(+), 92 deletions(-)

diff --git a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto b/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
index b4a39e6c68..4f71aec96a 100644
--- a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
+++ b/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
@@ -14,12 +14,8 @@ message CheckpointableObjectGraph {
       // An index into `CheckpointableObjectGraph.nodes`, indicating the object
       // being referenced.
       int32 node_id = 1;
-      // A numeric identifier for this object within its parent. Zero means
-      // unset, in which case there should be a local_name.
-      int32 local_uid = 2;
-      // A user-provided name for the edge. May be blank/omitted, in which case
-      // there is no explicitly provided local name; fall back on local_uid.
-      string local_name = 3;
+      // A user-provided name for the edge.
+      string local_name = 2;
     }
 
     message VariableReference {
diff --git a/tensorflow/contrib/eager/python/checkpointable.py b/tensorflow/contrib/eager/python/checkpointable.py
index ce4e07874e..09d405475b 100644
--- a/tensorflow/contrib/eager/python/checkpointable.py
+++ b/tensorflow/contrib/eager/python/checkpointable.py
@@ -37,10 +37,6 @@ _CheckpointableReference = collections.namedtuple(
     [
         # The local name if explicitly specified, else None.
         "name",
-        # 1 for the first dependency, 2 for the next, ... Used for routing
-        # checkpointed variables to their correct Checkpointables when "name" is
-        # not set (see docstring of `track_checkpointable`).
-        "local_uid",
         # The Checkpointable object being referenced.
         "ref"
     ])
@@ -96,9 +92,6 @@ class Checkpointable(object):
     self._dependency_names = {}
     # Set of all tracked Checkpointables
     self._already_tracked = set()
-    # Start numbering at 1, since an un-set protocol buffer integer is
-    # indistinguishable from 0.
-    self._next_unnamed_checkpoint_dependency_uid = 1
     self._owned_variables = {}  # local name -> variable object
     self._deferred_restorations = {}  # local name -> _VariableRestoration
                                       # object
@@ -193,29 +186,24 @@ class Checkpointable(object):
     self._owned_variables[name] = new_variable
     return new_variable
 
-  def track_checkpointable(self, checkpointable, name=None):
+  def track_checkpointable(self, checkpointable, name):
     """Declare a dependency on another `Checkpointable` object.
 
     Indicates that checkpoints for this object should include variables from
     `checkpointable`.
 
-    Variables in a checkpoint are mapped to `Checkpointable`s based on names if
-    provided when the checkpoint was written, but otherwise use the order those
-    `Checkpointable`s were declared as dependencies.
-
-    There are three sufficient conditions to avoid breaking existing checkpoints
-    when modifying a class: (1) New un-named dependencies must be declared after
-    existing un-named dependencies, (2) un-named dependencies which were
-    previously declared may never be removed (a trivial placeholder may be used
-    instead if the dependency is no longer needed), and (3) names may not change
-    (un-named dependencies may not later be named, named dependencies must keep
-    the same name).
+    Variables in a checkpoint are mapped to `Checkpointable`s based on names. To
+    avoid breaking existing checkpoints when modifying a class, neither variable
+    names nor dependency names (the names passed to `track_checkpointable`) may
+    change.
 
     Args:
       checkpointable: A `Checkpointable` which this object depends on.
       name: A local name for `checkpointable`, used for loading checkpoints into
-        the correct objects. If provided, it must be unique within this
-        `Checkpointable`. If None, dependency declaration order is used instead.
+        the correct objects. Python 2 identifiers are valid names, with the
+        addition of leading numerals, periods anywhere, and non-leading dashes.
+        Specifically names must match the regular expression
+        `^[A-Za-z0-9_.][A-Za-z0-9_.-]*$`.
 
     Returns:
       `checkpointable`, for convenience when declaring a dependency and
@@ -233,28 +221,20 @@ class Checkpointable(object):
       raise TypeError(
           ("Checkpointable.track_checkpointable() passed type %s, not a "
            "Checkpointable.") % (type(checkpointable),))
-    if name is not None:
-      if not _VALID_LOCAL_NAME.match(name):
-        raise ValueError(
-            ("Checkpointable names must match the regular expression '%s', but "
-             "got an invalid name '%s' instead.") % (_VALID_LOCAL_NAME.pattern,
-                                                     name))
-      if (name in self._dependency_names
-          and self._dependency_names[name] is not checkpointable):
-        raise ValueError(
-            ("Called Checkpointable.track_checkpointable() with name='%s', but "
-             "a Checkpointable with this name is already declared as a "
-             "dependency. If provided, names must be unique.") % (name,))
-      self._dependency_names[name] = checkpointable
-      local_uid = None
-    else:
-      # TODO(allenl): Should this be exposed to allow users to stop depending on
-      # things and still load checkpoints when not using names?
-      local_uid = self._next_unnamed_checkpoint_dependency_uid
-      self._next_unnamed_checkpoint_dependency_uid += 1
+    if not _VALID_LOCAL_NAME.match(name):
+      raise ValueError(
+          ("Checkpointable names must match the regular expression '%s', but "
+           "got an invalid name '%s' instead.") % (_VALID_LOCAL_NAME.pattern,
+                                                   name))
+    if (name in self._dependency_names
+        and self._dependency_names[name] is not checkpointable):
+      raise ValueError(
+          ("Called Checkpointable.track_checkpointable() with name='%s', but "
+           "a Checkpointable with this name is already declared as a "
+           "dependency. Names must be unique.") % (name,))
+    self._dependency_names[name] = checkpointable
     self._checkpoint_dependencies.append(
-        _CheckpointableReference(
-            name=name, ref=checkpointable, local_uid=local_uid))
+        _CheckpointableReference(name=name, ref=checkpointable))
     self._already_tracked.add(checkpointable)
     return checkpointable
 
@@ -313,7 +293,7 @@ def _breadth_first_checkpointable_traversal(root_checkpointable):
   """Find shortest paths to all variables owned by dependencies of root."""
   bfs_sorted = []
   root_checkpointable_reference = _CheckpointableReference(
-      name=None, local_uid=0, ref=root_checkpointable)
+      name=None, ref=root_checkpointable)
   to_visit = collections.deque([root_checkpointable_reference])
   path_to_root = {root_checkpointable_reference: ()}
   while to_visit:
@@ -330,9 +310,7 @@ def _breadth_first_checkpointable_traversal(root_checkpointable):
 
 def _object_prefix_from_path(path_to_root):
   return "/".join(
-      (checkpointable.name if checkpointable.name
-       else "-unnamed_%d" % (checkpointable.local_uid,))
-      for checkpointable in path_to_root)
+      (checkpointable.name for checkpointable in path_to_root))
 
 
 def _escape_variable_name(variable_name):
@@ -435,10 +413,7 @@ def _serialize_non_slot_variables(checkpointable_objects, path_to_root,
     for child in checkpointable.ref.checkpoint_dependencies:
       child_proto = object_proto.children.add()
       child_proto.node_id = checkpoint_node_ids[child]
-      if child.local_uid is not None:
-        child_proto.local_uid = child.local_uid
-      if child.name is not None:
-        child_proto.local_name = child.name
+      child_proto.local_name = child.name
   return named_variables, non_slot_variables
 
 
@@ -575,37 +550,22 @@ def _checkpoint_object_id_map(root_checkpointable, object_graph_proto):
     checkpointable, node_id = to_visit.popleft()
     object_proto = node_list[node_id]
     named_children = {}
-    numbered_children = {}
     for child_reference in object_proto.children:
       if child_reference.local_name:
         named_children[child_reference.local_name] = child_reference
       else:
-        if not child_reference.local_uid:
-          raise AssertionError(
-              ("The checkpointed object graph contains a reference with "
-               "neither a name nor a number (corrupted?). The reference was "
-               "from the node %s.") % (object_proto,))
-        numbered_children[child_reference.local_uid] = child_reference
+        raise AssertionError(
+            ("The checkpointed object graph contains a reference without "
+             "a name (corrupted?). The reference was from the node %s.")
+            % (object_proto,))
 
     for checkpointable_reference in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
-      if checkpointable_reference.name is not None:
-        child_node_id = _set_reference(
-            reference_proto_table=named_children,
-            key=checkpointable_reference.name,
-            checkpointable=checkpointable_reference.ref,
-            parent=checkpointable,
-            object_id_map=object_id_map)
-      else:
-        if checkpointable_reference.local_uid is None:
-          raise AssertionError(
-              ("A Checkpointable reference was created with no name and no "
-               "number in %s.") % (checkpointable,))
-        child_node_id = _set_reference(
-            reference_proto_table=numbered_children,
-            key=checkpointable_reference.local_uid,
-            checkpointable=checkpointable_reference.ref,
-            parent=checkpointable,
-            object_id_map=object_id_map)
+      child_node_id = _set_reference(
+          reference_proto_table=named_children,
+          key=checkpointable_reference.name,
+          checkpointable=checkpointable_reference.ref,
+          parent=checkpointable,
+          object_id_map=object_id_map)
       if child_node_id not in seen:
         seen.add(child_node_id)
         to_visit.append((checkpointable_reference.ref, child_node_id))
@@ -766,3 +726,4 @@ def restore(save_path, root_checkpointable, object_graph_proto, session=None):
   for restoration, checkpointable in _gather_restorations(
       object_graph_proto, save_path, object_id_map, dtype_map, session=session):
     checkpointable._process_restoration(restoration)  # pylint: disable=protected-access
+
diff --git a/tensorflow/contrib/eager/python/checkpointable_test.py b/tensorflow/contrib/eager/python/checkpointable_test.py
index 4b92ad59e7..f0c3df56bc 100644
--- a/tensorflow/contrib/eager/python/checkpointable_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_test.py
@@ -72,7 +72,7 @@ class CheckpointableNetwork(network_lib.Network, checkpointable.Checkpointable):
     # Checkpointable objects which aren't Layers.
     super(CheckpointableNetwork, self).__setattr__(name, value)
 
-  def track_layer(self, layer, name=None):
+  def track_layer(self, layer, name):
     self.track_checkpointable(layer, name=name)
     return super(CheckpointableNetwork, self).track_layer(layer)
 
@@ -136,13 +136,13 @@ class MyNetwork(CheckpointableNetwork):
   def __init__(self):
     super(MyNetwork, self).__init__()
     self._named_dense = CheckpointableDenseLayer(1, use_bias=True)
-    self._unnamed = self.track_layer(
-        CheckpointableDenseLayer(1, use_bias=False))
+    self._via_track_layer = self.track_layer(
+        CheckpointableDenseLayer(1, use_bias=False), name="via_track_layer")
     # We can still track Checkpointables which aren't Layers.
     self._non_layer = NonLayerCheckpointable()
 
   def call(self, values):
-    return self._unnamed(self._named_dense(values))
+    return self._via_track_layer(self._named_dense(values))
 
 
 class Root(checkpointable.Checkpointable):
@@ -201,7 +201,7 @@ class CheckpointNamingTests(test.TestCase):
         "global_step",
         # No name provided to track_checkpointable(), so the position is used
         # instead (one-based).
-        "network/-unnamed_1/kernel",
+        "network/via_track_layer/kernel",
         # track_checkpointable() with a name provided, so that's used
         "network/_named_dense/kernel",
         "network/_named_dense/bias",
@@ -211,8 +211,8 @@ class CheckpointNamingTests(test.TestCase):
         "_optimizer/beta1_power",
         "_optimizer/beta2_power",
         # Slot variables
-        "network/-unnamed_1/kernel/-OPTIMIZER_SLOT/_optimizer/m",
-        "network/-unnamed_1/kernel/-OPTIMIZER_SLOT/_optimizer/v",
+        "network/via_track_layer/kernel/-OPTIMIZER_SLOT/_optimizer/m",
+        "network/via_track_layer/kernel/-OPTIMIZER_SLOT/_optimizer/v",
         "network/_named_dense/kernel/-OPTIMIZER_SLOT/_optimizer/m",
         "network/_named_dense/kernel/-OPTIMIZER_SLOT/_optimizer/v",
         "network/_named_dense/bias/-OPTIMIZER_SLOT/_optimizer/m",
@@ -223,7 +223,7 @@ class CheckpointNamingTests(test.TestCase):
     # Check that we've mapped to the right variable objects (not exhaustive)
     self.assertEqual("global_step:0", named_variables["global_step"].name)
     self.assertEqual("my_network/checkpointable_dense_layer_1/kernel:0",
-                     named_variables["network/-unnamed_1/kernel"].name)
+                     named_variables["network/via_track_layer/kernel"].name)
     self.assertEqual("my_network/checkpointable_dense_layer/kernel:0",
                      named_variables["network/_named_dense/kernel"].name)
     self.assertEqual("beta1_power:0",
@@ -231,7 +231,6 @@ class CheckpointNamingTests(test.TestCase):
     self.assertEqual("beta2_power:0",
                      named_variables["_optimizer/beta2_power"].name)
     # Spot check the generated protocol buffers.
-    self.assertEqual(0, serialized_graph.nodes[0].children[0].local_uid)
     self.assertEqual("_optimizer",
                      serialized_graph.nodes[0].children[0].local_name)
     optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
@@ -415,11 +414,11 @@ class CheckpointNamingTests(test.TestCase):
   def testNumberedPath(self):
     root = checkpointable.Checkpointable()
     leaf = checkpointable.Checkpointable()
-    root.track_checkpointable(leaf)
+    root.track_checkpointable(leaf, name="leaf")
     leaf.add_variable(name="v", shape=[])
     named_variables, _ = checkpointable._serialize_object_graph(root)
     variable_name, = named_variables.keys()
-    self.assertEqual(r"-unnamed_1/v", variable_name)
+    self.assertEqual(r"leaf/v", variable_name)
 
   @test_util.run_in_graph_and_eager_modes()
   def testLocalNameValidation(self):
-- 
GitLab


From 250adf3bd0eb5e031922aef4c5044e4784735997 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Thu, 8 Feb 2018 14:23:25 -0800
Subject: [PATCH 866/966] Updates to fastpath execution code

- Add default value shape handling
- Add default value func handling (including adding a new function for this in
  the eager C API)
- Update callback execution interface to match record_gradient call

PiperOrigin-RevId: 185051447
---
 tensorflow/c/eager/c_api.cc                   |  13 ++
 tensorflow/c/eager/c_api.h                    |  86 +++++----
 tensorflow/python/eager/execute.py            |   2 +-
 .../python/eager/execution_callbacks.py       |  36 ++--
 tensorflow/python/eager/pywrap_tfe_src.cc     | 171 +++++++++++++-----
 5 files changed, 218 insertions(+), 90 deletions(-)

diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 9cd1accde9..8e834eb99c 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -455,6 +455,19 @@ void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name,
                     proto.get(), num_values));
 }
 
+void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
+                               const TFE_Op** value, int num_values) {
+  std::unique_ptr<tensorflow::NameAttrList[]> funcs(
+      new tensorflow::NameAttrList[num_values]);
+  for (int i = 0; i < num_values; i++) {
+    funcs[i].set_name(value[i]->name);
+    value[i]->attrs.FillAttrValueMap(funcs[i].mutable_attr());
+  }
+  op->attrs.Set(attr_name,
+                tensorflow::gtl::ArraySlice<const tensorflow::NameAttrList>(
+                    funcs.get(), num_values));
+}
+
 namespace {
 
 tensorflow::Status ValidateInputTypeAndPlacement(
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 9506cf7390..7a321b54da 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -87,7 +87,8 @@ typedef struct TFE_Context TFE_Context;
 
 TF_CAPI_EXPORT extern TFE_Context* TFE_NewContext(
     const TFE_ContextOptions* opts, TF_Status* status);
-TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx,
+                                             TF_Status* status);
 TF_CAPI_EXPORT extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
                                                             TF_Status* status);
 
@@ -119,8 +120,10 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t,
 TF_CAPI_EXPORT extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h);
 TF_CAPI_EXPORT extern TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h);
 TF_CAPI_EXPORT extern int TFE_TensorHandleNumDims(TFE_TensorHandle* h);
-TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index);
-TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h);
+TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h,
+                                                  int dim_index);
+TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(
+    TFE_TensorHandle* h);
 TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h,
                                                          TF_Status* status);
 
@@ -130,10 +133,9 @@ TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h,
 // that shares the underlying buffer. Otherwise, it currently requires at least
 // one of the source or destination devices to be CPU (i.e., for the source or
 // destination tensor to be placed in host memory).
-TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
-                                                                     TFE_Context* ctx,
-                                                                     const char* device_name,
-                                                                     TF_Status* status);
+TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(
+    TFE_TensorHandle* h, TFE_Context* ctx, const char* device_name,
+    TF_Status* status);
 
 // Description of the TensorFlow op to execute.
 //
@@ -148,7 +150,8 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorH
 //     the additional sanity checks there seem unnecessary;
 typedef struct TFE_Op TFE_Op;
 
-TF_CAPI_EXPORT extern TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
+TF_CAPI_EXPORT extern TFE_Op* TFE_NewOp(TFE_Context* ctx,
+                                        const char* op_or_function_name,
                                         TF_Status* status);
 TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op);
 
@@ -165,10 +168,13 @@ TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(TFE_Op* op,
 TF_CAPI_EXPORT extern void TFE_OpSetXLACompilation(TFE_Op* op,
                                                    unsigned char enable);
 
-TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h,
+                                          TF_Status* status);
 
-TF_CAPI_EXPORT extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
-                                                    unsigned char* is_list, TF_Status* status);
+TF_CAPI_EXPORT extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op,
+                                                    const char* attr_name,
+                                                    unsigned char* is_list,
+                                                    TF_Status* status);
 // Get an attribute type given an op name; a fusion of TFE_NewOp and
 // TFE_OpGetAttrType for use from Python without the overhead of the individual
 // calls and memory management of TFE_Op.
@@ -176,10 +182,13 @@ TF_CAPI_EXPORT extern TF_AttrType TFE_OpNameGetAttrType(
     TFE_Context* ctx, const char* op_or_function_name, const char* attr_name,
     unsigned char* is_list, TF_Status* status);
 
-TF_CAPI_EXPORT extern void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name,
+TF_CAPI_EXPORT extern void TFE_OpSetAttrString(TFE_Op* op,
+                                               const char* attr_name,
                                                const char* value);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name,
+                                            int64_t value);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name,
+                                              float value);
 TF_CAPI_EXPORT extern void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name,
                                              unsigned char value);
 TF_CAPI_EXPORT extern void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name,
@@ -188,7 +197,8 @@ TF_CAPI_EXPORT extern void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name,
 // -1 and `dims` can be null.  If a dimension is unknown, the
 // corresponding entry in the `dims` array must be -1.
 TF_CAPI_EXPORT extern void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name,
-                                              const int64_t* dims, const int num_dims,
+                                              const int64_t* dims,
+                                              const int num_dims,
                                               TF_Status* out_status);
 
 // Sets the attribute attr_name to be a function specified by 'function'.
@@ -199,19 +209,33 @@ TF_CAPI_EXPORT extern void TFE_OpSetAttrFunction(TFE_Op* op,
                                                  const char* attr_name,
                                                  const TFE_Op* value);
 
-TF_CAPI_EXPORT extern void TFE_OpSetAttrStringList(TFE_Op* op, const char* attr_name,
-                                                   const char** value, int num_values);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name,
-                                                const int64_t* values, int num_values);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrFloatList(TFE_Op* op, const char* attr_name,
-                                                  const float* values, int num_values);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name,
-                                                 const unsigned char* values, int num_values);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name,
-                                                 const TF_DataType* values, int num_values);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name,
-                                                  const int64_t** dims, const int* num_dims,
-                                                  int num_values, TF_Status* out_status);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrStringList(TFE_Op* op,
+                                                   const char* attr_name,
+                                                   const char** value,
+                                                   int num_values);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrIntList(TFE_Op* op,
+                                                const char* attr_name,
+                                                const int64_t* values,
+                                                int num_values);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrFloatList(TFE_Op* op,
+                                                  const char* attr_name,
+                                                  const float* values,
+                                                  int num_values);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrBoolList(TFE_Op* op,
+                                                 const char* attr_name,
+                                                 const unsigned char* values,
+                                                 int num_values);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrTypeList(TFE_Op* op,
+                                                 const char* attr_name,
+                                                 const TF_DataType* values,
+                                                 int num_values);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrShapeList(
+    TFE_Op* op, const char* attr_name, const int64_t** dims,
+    const int* num_dims, int num_values, TF_Status* out_status);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrFunctionList(TFE_Op* op,
+                                                     const char* attr_name,
+                                                     const TFE_Op** value,
+                                                     int num_values);
 
 // Execute the operation defined by 'op' and return handles to computed
 // tensors in 'retvals'.
@@ -226,9 +250,9 @@ TF_CAPI_EXPORT extern void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals,
 
 // Add a function (serialized FunctionDef protocol buffer) to ctx so
 // that it can be invoked using TFE_Execute.
-TF_CAPI_EXPORT extern void TFE_ContextAddFunctionDef(TFE_Context* ctx,
-                                                     const char* serialized_function_def,
-                                                     size_t size, TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_ContextAddFunctionDef(
+    TFE_Context* ctx, const char* serialized_function_def, size_t size,
+    TF_Status* status);
 
 // Adds a function (created from TF_GraphToFunction or
 // TF_FunctionImportFunctionDef) to the context, allowing it to be executed with
diff --git a/tensorflow/python/eager/execute.py b/tensorflow/python/eager/execute.py
index 306cf07aab..2ff5b8d8f4 100644
--- a/tensorflow/python/eager/execute.py
+++ b/tensorflow/python/eager/execute.py
@@ -72,7 +72,7 @@ def execute_with_callbacks(op_name, num_outputs, inputs, attrs, ctx, name=None):
   """Monkey-patch to execute to enable execution callbacks."""
   tensors = quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
   for callback in ctx.post_execution_callbacks:
-    callback(op_name, name, attrs, inputs, tensors)
+    callback(op_name, inputs, attrs, tensors, name)
 
   return tensors
 
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 988442c971..535361498a 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -104,10 +104,10 @@ class InfOrNanError(Exception):
 
 
 def inf_nan_callback(op_type,
-                     op_name,
-                     attrs,
                      inputs,
+                     attrs,
                      outputs,
+                     op_name,
                      check_inf=True,
                      check_nan=True,
                      action=_DEFAULT_CALLBACK_ACTION):
@@ -121,14 +121,14 @@ def inf_nan_callback(op_type,
 
   Args:
     op_type: Name of the TFE operation type (e.g., `MatMul`).
-    op_name: Name of the TFE operation. This name is set by client and can be
-      `None` if it unset.
-    attrs: Attributes of the TFE operation, as a tuple of alternating attribute
-      names and attribute values.
     inputs: The `list` of input tensors to the operation, currently unused by
       this callback.
+    attrs: Attributes of the TFE operation, as a tuple of alternating attribute
+      names and attribute values.
     outputs: The `list` of output tensors from the operation, checked by this
       callback for `inf` and `nan` values.
+    op_name: Name of the TFE operation. This name is set by client and can be
+      `None` if it unset.
     check_inf: (`bool`) Whether this callback should check for `inf` values in
       the output tensor values.
     check_nan: (`bool`) Whether this callback should check for `nan` values in
@@ -187,26 +187,38 @@ def inf_nan_callback(op_type,
 
 
 def inf_callback(op_type,
-                 op_name,
-                 attrs,
                  inputs,
+                 attrs,
                  outputs,
+                 op_name,
                  action=_DEFAULT_CALLBACK_ACTION):
   """A specialization of `inf_nan_callback` that checks for `inf`s only."""
   inf_nan_callback(
-      op_type, op_name, attrs, inputs, outputs, check_inf=True, check_nan=False,
+      op_type,
+      inputs,
+      attrs,
+      outputs,
+      op_name,
+      check_inf=True,
+      check_nan=False,
       action=action)
 
 
 def nan_callback(op_type,
-                 op_name,
-                 attrs,
                  inputs,
+                 attrs,
                  outputs,
+                 op_name,
                  action=_DEFAULT_CALLBACK_ACTION):
   """A specialization of `inf_nan_callback` that checks for `nan`s only."""
   inf_nan_callback(
-      op_type, op_name, attrs, inputs, outputs, check_inf=False, check_nan=True,
+      op_type,
+      inputs,
+      attrs,
+      outputs,
+      op_name,
+      check_inf=False,
+      check_nan=True,
       action=action)
 
 
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index bb7b89ab74..77a639be3b 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -237,9 +237,29 @@ bool SetOpAttrList(
   return true;
 }
 
+// This is only declared here since GetFunc makes a recursive call to
+// SetOpAttrScalarDefault.
+void SetOpAttrScalarDefault(
+    TFE_Context* ctx, TFE_Op* op, const tensorflow::AttrValue& default_value,
+    const char* attr_name,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status);
+
+TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func,
+                TF_Status* status) {
+  TFE_Op* func_op = TFE_NewOp(ctx, func.name().data(), status);
+  for (const auto& attr : func.attr()) {
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+    SetOpAttrScalarDefault(ctx, func_op, attr.second, attr.first.data(),
+                           nullptr, status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+  }
+  return func_op;
+}
+
 void SetOpAttrListDefault(
-    TFE_Op* op, const tensorflow::OpDef::AttrDef& attr, const char* key,
-    TF_AttrType type,
+    TFE_Context* ctx, TFE_Op* op, const tensorflow::OpDef::AttrDef& attr,
+    const char* key, TF_AttrType type,
     tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
     TF_Status* status) {
   if (type == TF_ATTR_STRING) {
@@ -284,10 +304,48 @@ void SetOpAttrListDefault(
     TFE_OpSetAttrTypeList(op, key,
                           reinterpret_cast<const TF_DataType*>(values.get()),
                           attr.default_value().list().type_size());
+  } else if (type == TF_ATTR_SHAPE) {
+    int num_values = attr.default_value().list().shape_size();
+    (*attr_list_sizes)[key] = num_values;
+    int total_dims = 0;
+    for (int i = 0; i < num_values; ++i) {
+      if (!attr.default_value().list().shape(i).unknown_rank()) {
+        total_dims += attr.default_value().list().shape(i).dim_size();
+      }
+    }
+    // Allocate a buffer that can fit all of the dims together.
+    std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
+    // Copy the input dims into the buffer and set dims to point to
+    // the start of each list's dims.
+    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+    std::unique_ptr<int[]> num_dims(new int[num_values]);
+    int64_t* offset = buffer.get();
+    for (int i = 0; i < num_values; ++i) {
+      const auto& shape = attr.default_value().list().shape(i);
+      if (shape.unknown_rank()) {
+        dims[i] = nullptr;
+        num_dims[i] = -1;
+      } else {
+        for (int j = 0; j < shape.dim_size(); j++) {
+          *offset = shape.dim(j).size();
+          ++offset;
+        }
+      }
+    }
+    TFE_OpSetAttrShapeList(op, key, dims.get(), num_dims.get(), num_values,
+                           status);
+  } else if (type == TF_ATTR_FUNC) {
+    int num_values = attr.default_value().list().func_size();
+    (*attr_list_sizes)[key] = num_values;
+    std::unique_ptr<const TFE_Op* []> funcs(new const TFE_Op*[num_values]);
+    for (int i = 0; i < num_values; i++) {
+      funcs[i] = GetFunc(ctx, attr.default_value().list().func(i), status);
+    }
+    TFE_OpSetAttrFunctionList(op, key, funcs.get(), num_values);
   } else {
     TF_SetStatus(status, TF_UNIMPLEMENTED,
-                 "Shapes, tensors and lists are not yet implemented for "
-                 "default valued attributes for an operation.");
+                 "Lists of tensors are not yet implemented for default valued "
+                 "attributes for an operation.");
   }
 }
 
@@ -389,36 +447,61 @@ bool SetOpAttrScalar(
 }
 
 void SetOpAttrScalarDefault(
-    TFE_Op* op, const tensorflow::OpDef::AttrDef& attr, const char* attr_name,
-    TF_AttrType type,
+    TFE_Context* ctx, TFE_Op* op, const tensorflow::AttrValue& default_value,
+    const char* attr_name,
     tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
     TF_Status* status) {
-  if (type == TF_ATTR_STRING) {
-    if (attr.default_value().value_case() == tensorflow::AttrValue::kS) {
-      TFE_OpSetAttrString(op, attr_name, attr.default_value().s().data());
-    }
-  } else if (type == TF_ATTR_INT) {
-    if (attr.default_value().value_case() == tensorflow::AttrValue::kI) {
-      TFE_OpSetAttrInt(op, attr_name,
-                       static_cast<int64_t>(attr.default_value().i()));
-      (*attr_list_sizes)[attr.name()] = attr.default_value().i();
-    }
-  } else if (type == TF_ATTR_FLOAT) {
-    if (attr.default_value().value_case() == tensorflow::AttrValue::kF) {
-      TFE_OpSetAttrFloat(op, attr_name, attr.default_value().f());
-    }
-  } else if (type == TF_ATTR_BOOL) {
-    if (attr.default_value().value_case() == tensorflow::AttrValue::kB) {
-      TFE_OpSetAttrBool(op, attr_name, attr.default_value().b());
-    }
-  } else if (type == TF_ATTR_TYPE) {
-    if (attr.default_value().value_case() == tensorflow::AttrValue::kType) {
+  switch (default_value.value_case()) {
+    case tensorflow::AttrValue::kS:
+      TFE_OpSetAttrString(op, attr_name, default_value.s().data());
+      break;
+    case tensorflow::AttrValue::kI:
+      TFE_OpSetAttrInt(op, attr_name, static_cast<int64_t>(default_value.i()));
+      (*attr_list_sizes)[attr_name] = default_value.i();
+      break;
+    case tensorflow::AttrValue::kF:
+      TFE_OpSetAttrFloat(op, attr_name, default_value.f());
+      break;
+    case tensorflow::AttrValue::kB:
+      TFE_OpSetAttrBool(op, attr_name, default_value.b());
+      break;
+    case tensorflow::AttrValue::kType:
       TFE_OpSetAttrType(op, attr_name,
-                        static_cast<TF_DataType>(attr.default_value().type()));
-    }
-  } else {
-    TF_SetStatus(status, TF_UNIMPLEMENTED,
-                 "Shapes, tensors and lists are not yet implemented.");
+                        static_cast<TF_DataType>(default_value.type()));
+      break;
+    case tensorflow::AttrValue::kShape: {
+      const auto& tensor_shape = default_value.shape();
+      if (tensor_shape.unknown_rank()) {
+        TFE_OpSetAttrShape(op, attr_name, nullptr, -1, status);
+      } else {
+        const auto num_dims = tensor_shape.dim_size();
+        std::unique_ptr<int64_t[]> dims(new int64_t[num_dims]);
+        for (int i = 0; i < num_dims; ++i) {
+          dims[i] = tensor_shape.dim(i).size();
+        }
+        TFE_OpSetAttrShape(op, attr_name, dims.get(), num_dims, status);
+      }
+    } break;
+    case tensorflow::AttrValue::kFunc: {
+      const auto func_op = GetFunc(ctx, default_value.func(), status);
+      if (TF_GetCode(status) != TF_OK) return;
+      // TODO(nareshmodi): TFE_OpSetAttrFunction and TFE_OpSetAttrFunctionList
+      // require TFE_Op* and just convert it internally a NameAttrValue, so
+      // consider adding an overload to the C API to make this case easier.
+      TFE_OpSetAttrFunction(op, attr_name, func_op);
+    } break;
+    case tensorflow::AttrValue::kList:
+      TF_FALLTHROUGH_INTENDED;
+    case tensorflow::AttrValue::kTensor:
+      TF_FALLTHROUGH_INTENDED;
+    case tensorflow::AttrValue::kPlaceholder:
+      TF_FALLTHROUGH_INTENDED;
+    case tensorflow::AttrValue::VALUE_NOT_SET:
+      TF_SetStatus(
+          status, TF_UNIMPLEMENTED,
+          tensorflow::strings::StrCat("Unable to get setfor default value: ",
+                                      default_value.DebugString())
+              .data());
   }
 }
 
@@ -457,7 +540,8 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index,
 
 // This function will set the op attrs required. If an attr has the value of
 // None, then it will read the AttrDef to get the default value and set that
-// instead. Any failure in this function will simply fall back to the slow path.
+// instead. Any failure in this function will simply fall back to the slow
+// path.
 void SetOpAttrWithDefaults(
     TFE_Context* ctx, TFE_Op* op, const tensorflow::OpDef::AttrDef& attr,
     const char* attr_name, PyObject* attr_value,
@@ -468,10 +552,11 @@ void SetOpAttrWithDefaults(
   if (TF_GetCode(status) != TF_OK) return;
   if (attr_value == Py_None) {
     if (is_list != 0) {
-      SetOpAttrListDefault(op, attr, attr_name, type, attr_list_sizes, status);
+      SetOpAttrListDefault(ctx, op, attr, attr_name, type, attr_list_sizes,
+                           status);
     } else {
-      SetOpAttrScalarDefault(op, attr, attr_name, type, attr_list_sizes,
-                             status);
+      SetOpAttrScalarDefault(ctx, op, attr.default_value(), attr_name,
+                             attr_list_sizes, status);
     }
   } else {
     if (is_list != 0) {
@@ -1377,9 +1462,13 @@ bool RunCallbacks(bool run_gradient_callback, bool run_post_exec_callbacks,
     PyTuple_SET_ITEM(attrs, i, flattened_attrs.at(i - num_non_inferred_attrs));
   }
 
-  auto cleaner = tensorflow::gtl::MakeCleanup([inputs, attrs] {
+  PyObject* callback_args =
+      Py_BuildValue("OOOOO", op_name, inputs, attrs, flattened_result, name);
+
+  auto cleaner = tensorflow::gtl::MakeCleanup([inputs, attrs, callback_args] {
     Py_DECREF(inputs);
     Py_DECREF(attrs);
+    Py_DECREF(callback_args);
   });
 
   if (run_gradient_callback) {
@@ -1392,11 +1481,8 @@ bool RunCallbacks(bool run_gradient_callback, bool run_post_exec_callbacks,
       return false;
     }
 
-    PyObject* callback_args =
-        Py_BuildValue("OOOOO", op_name, inputs, attrs, flattened_result, name);
     PyObject* callback_result =
         PyObject_CallObject(record_gradient_callback, callback_args);
-    Py_DECREF(callback_args);
     if (!callback_result) {
       return false;
     }
@@ -1404,11 +1490,6 @@ bool RunCallbacks(bool run_gradient_callback, bool run_post_exec_callbacks,
   }
 
   if (run_post_exec_callbacks) {
-    // TODO(nareshmodi): update the execution callback interface to match the
-    // record_gradient interface so that this doesn't need to be rebuilt.
-    PyObject* callback_args =
-        Py_BuildValue("OOOOO", op_name, name, attrs, inputs, flattened_result);
-
     for (Py_ssize_t i = 0; i < PyList_Size(callbacks); i++) {
       PyObject* callback_fn = PyList_GET_ITEM(callbacks, i);
       if (!PyCallable_Check(callback_fn)) {
@@ -1423,12 +1504,10 @@ bool RunCallbacks(bool run_gradient_callback, bool run_post_exec_callbacks,
       PyObject* callback_result =
           PyObject_CallObject(callback_fn, callback_args);
       if (!callback_result) {
-        Py_DECREF(callback_args);
         return false;
       }
       Py_DECREF(callback_result);
     }
-    Py_DECREF(callback_args);
   }
 
   return true;
-- 
GitLab


From e8eb8aea16d66e52ecf77e65266f27faa460901d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 14:27:03 -0800
Subject: [PATCH 867/966] Update ops-related pbtxt files.

PiperOrigin-RevId: 185052000
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 77 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |  7 ++
 2 files changed, 84 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 7a7ec5c898..fc9e5b02a2 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -7973,6 +7973,83 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "Batch"
+  input_arg {
+    name: "in_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batched_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  attr {
+    name: "num_batch_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_batch_size"
+    type: "int"
+  }
+  attr {
+    name: "max_enqueued_batches"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+  attr {
+    name: "batch_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "allowed_batch_sizes"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "grad_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "batching_queue"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "BatchCholesky"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index d9ac21c8ce..45ff08f38b 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -2763,6 +2763,13 @@ op {
     name: "max_batch_size"
     type: "int"
   }
+  attr {
+    name: "max_enqueued_batches"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
   attr {
     name: "batch_timeout_micros"
     type: "int"
-- 
GitLab


From c6870d4848cd584dfff5950e76df56e057f5bbf1 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 8 Feb 2018 14:47:28 -0800
Subject: [PATCH 868/966] Adding tf_export decorators/calls to TensorFlow
 functions and constants.

PiperOrigin-RevId: 185055497
---
 .../python/keras/_impl/keras/activations.py   |  14 ++
 .../_impl/keras/applications/densenet.py      |   8 +
 .../keras/applications/imagenet_utils.py      |  13 ++
 .../keras/applications/inception_resnet_v2.py |   4 +
 .../_impl/keras/applications/inception_v3.py  |   5 +
 .../_impl/keras/applications/mobilenet.py     |   4 +
 .../keras/_impl/keras/applications/nasnet.py  |   5 +
 .../_impl/keras/applications/resnet50.py      |   3 +
 .../keras/_impl/keras/applications/vgg16.py   |   2 +
 .../keras/_impl/keras/applications/vgg19.py   |   2 +
 .../_impl/keras/applications/xception.py      |   4 +
 .../python/keras/_impl/keras/backend.py       | 138 ++++++++++++++++++
 .../python/keras/_impl/keras/callbacks.py     |  14 ++
 .../python/keras/_impl/keras/constraints.py   |   9 ++
 .../python/keras/_impl/keras/estimator.py     |   2 +
 .../python/keras/_impl/keras/initializers.py  |  10 ++
 tensorflow/python/keras/_impl/keras/losses.py |  26 ++++
 .../python/keras/_impl/keras/metrics.py       |   8 +
 tensorflow/python/keras/_impl/keras/models.py |   7 +
 .../python/keras/_impl/keras/optimizers.py    |  12 ++
 .../python/keras/_impl/keras/regularizers.py  |   9 ++
 tensorflow/tools/api/generator/BUILD          |  18 +++
 22 files changed, 317 insertions(+)

diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index 4852b8c36a..236e17653e 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -24,8 +24,10 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.layers.base import Layer
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.activations.softmax')
 def softmax(x, axis=-1):
   """Softmax activation function.
 
@@ -50,10 +52,12 @@ def softmax(x, axis=-1):
     raise ValueError('Cannot apply softmax to a tensor that is 1D')
 
 
+@tf_export('keras.activations.elu')
 def elu(x, alpha=1.0):
   return K.elu(x, alpha)
 
 
+@tf_export('keras.activations.selu')
 def selu(x):
   """Scaled Exponential Linear Unit. (Klambauer et al., 2017).
 
@@ -73,38 +77,47 @@ def selu(x):
   return scale * K.elu(x, alpha)
 
 
+@tf_export('keras.activations.softplus')
 def softplus(x):
   return K.softplus(x)
 
 
+@tf_export('keras.activations.softsign')
 def softsign(x):
   return K.softsign(x)
 
 
+@tf_export('keras.activations.relu')
 def relu(x, alpha=0., max_value=None):
   return K.relu(x, alpha=alpha, max_value=max_value)
 
 
+@tf_export('keras.activations.tanh')
 def tanh(x):
   return K.tanh(x)
 
 
+@tf_export('keras.activations.sigmoid')
 def sigmoid(x):
   return K.sigmoid(x)
 
 
+@tf_export('keras.activations.hard_sigmoid')
 def hard_sigmoid(x):
   return K.hard_sigmoid(x)
 
 
+@tf_export('keras.activations.linear')
 def linear(x):
   return x
 
 
+@tf_export('keras.activations.serialize')
 def serialize(activation):
   return activation.__name__
 
 
+@tf_export('keras.activations.deserialize')
 def deserialize(name, custom_objects=None):
   return deserialize_keras_object(
       name,
@@ -113,6 +126,7 @@ def deserialize(name, custom_objects=None):
       printable_module_name='activation function')
 
 
+@tf_export('keras.activations.get')
 def get(identifier):
   if identifier is None:
     return linear
diff --git a/tensorflow/python/keras/_impl/keras/applications/densenet.py b/tensorflow/python/keras/_impl/keras/applications/densenet.py
index 9e40d34930..6521f84104 100644
--- a/tensorflow/python/keras/_impl/keras/applications/densenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/densenet.py
@@ -45,6 +45,7 @@ from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.util.tf_export import tf_export
 
 
 DENSENET121_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet121_weights_tf_dim_ordering_tf_kernels.h5'
@@ -298,6 +299,8 @@ def DenseNet(blocks,
   return model
 
 
+@tf_export('keras.applications.DenseNet121',
+           'keras.applications.densenet.DenseNet121')
 def DenseNet121(include_top=True,
                 weights='imagenet',
                 input_tensor=None,
@@ -308,6 +311,8 @@ def DenseNet121(include_top=True,
                   input_shape, pooling, classes)
 
 
+@tf_export('keras.applications.DenseNet169',
+           'keras.applications.densenet.DenseNet169')
 def DenseNet169(include_top=True,
                 weights='imagenet',
                 input_tensor=None,
@@ -318,6 +323,8 @@ def DenseNet169(include_top=True,
                   input_shape, pooling, classes)
 
 
+@tf_export('keras.applications.DenseNet201',
+           'keras.applications.densenet.DenseNet201')
 def DenseNet201(include_top=True,
                 weights='imagenet',
                 input_tensor=None,
@@ -328,6 +335,7 @@ def DenseNet201(include_top=True,
                   input_shape, pooling, classes)
 
 
+@tf_export('keras.applications.densenet.preprocess_input')
 def preprocess_input(x, data_format=None):
   """Preprocesses a numpy array encoding a batch of images.
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
index f1f20f12a8..d9cb726137 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 CLASS_INDEX = None
@@ -162,6 +163,9 @@ def _preprocess_symbolic_input(x, data_format, mode):
   return x
 
 
+@tf_export('keras.applications.resnet50.preprocess_input',
+           'keras.applications.vgg19.preprocess_input',
+           'keras.applications.vgg16.preprocess_input')
 def preprocess_input(x, data_format=None, mode='caffe'):
   """Preprocesses a tensor or Numpy array encoding a batch of images.
 
@@ -193,6 +197,15 @@ def preprocess_input(x, data_format=None, mode='caffe'):
     return _preprocess_symbolic_input(x, data_format=data_format, mode=mode)
 
 
+@tf_export('keras.applications.nasnet.decode_predictions',
+           'keras.applications.resnet50.decode_predictions',
+           'keras.applications.vgg19.decode_predictions',
+           'keras.applications.vgg16.decode_predictions',
+           'keras.applications.inception_resnet_v2.decode_predictions',
+           'keras.applications.inception_v3.decode_predictions',
+           'keras.applications.densenet.decode_predictions',
+           'keras.applications.mobilenet.decode_predictions',
+           'keras.applications.xception.decode_predictions')
 def decode_predictions(preds, top=5):
   """Decodes the prediction of an ImageNet model.
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
index 1dc15b5b34..bf3901fc54 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
@@ -46,11 +46,13 @@ from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 BASE_WEIGHT_URL = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.7/'
 
 
+@tf_export('keras.applications.inception_resnet_v2.preprocess_input')
 def preprocess_input(x):
   """Preprocesses a numpy array encoding a batch of images.
 
@@ -190,6 +192,8 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
   return x
 
 
+@tf_export('keras.applications.InceptionResNetV2',
+           'keras.applications.inception_resnet_v2.InceptionResNetV2')
 def InceptionResNetV2(include_top=True,
                       weights='imagenet',
                       input_tensor=None,
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
index ff57116f2d..e268e97bc6 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
@@ -50,6 +50,7 @@ from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels.h5'
@@ -101,6 +102,8 @@ def conv2d_bn(x,
   return x
 
 
+@tf_export('keras.applications.InceptionV3',
+           'keras.applications.inception_v3.InceptionV3')
 def InceptionV3(include_top=True,
                 weights='imagenet',
                 input_tensor=None,
@@ -399,6 +402,8 @@ def InceptionV3(include_top=True,
   return model
 
 
+@tf_export('keras.applications.nasnet.preprocess_input',
+           'keras.applications.inception_v3.preprocess_input')
 def preprocess_input(x):
   """Preprocesses a numpy array encoding a batch of images.
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index 790bf8cead..027ae26113 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -93,6 +93,7 @@ from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 BASE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.6/'
@@ -102,6 +103,7 @@ def relu6(x):
   return K.relu(x, max_value=6)
 
 
+@tf_export('keras.applications.mobilenet.preprocess_input')
 def preprocess_input(x):
   """Preprocesses a numpy array encoding a batch of images.
 
@@ -303,6 +305,8 @@ class DepthwiseConv2D(Conv2D):
     return config
 
 
+@tf_export('keras.applications.MobileNet',
+           'keras.applications.mobilenet.MobileNet')
 def MobileNet(input_shape=None,
               alpha=1.0,
               depth_multiplier=1,
diff --git a/tensorflow/python/keras/_impl/keras/applications/nasnet.py b/tensorflow/python/keras/_impl/keras/applications/nasnet.py
index 5dd038c096..08dae57f00 100644
--- a/tensorflow/python/keras/_impl/keras/applications/nasnet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/nasnet.py
@@ -67,6 +67,7 @@ from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 NASNET_MOBILE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-mobile.h5'
@@ -323,6 +324,8 @@ def NASNet(input_shape=None,
   return model
 
 
+@tf_export('keras.applications.NASNetLarge',
+           'keras.applications.nasnet.NASNetLarge')
 def NASNetLarge(input_shape=None,
                 include_top=True,
                 weights='imagenet',
@@ -390,6 +393,8 @@ def NASNetLarge(input_shape=None,
       default_size=331)
 
 
+@tf_export('keras.applications.NASNetMobile',
+           'keras.applications.nasnet.NASNetMobile')
 def NASNetMobile(input_shape=None,
                  include_top=True,
                  weights='imagenet',
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index 5705b3481a..a47dd657bb 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -49,6 +49,7 @@ from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5'
@@ -146,6 +147,8 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
   return x
 
 
+@tf_export('keras.applications.ResNet50',
+           'keras.applications.resnet50.ResNet50')
 def ResNet50(include_top=True,
              weights='imagenet',
              input_tensor=None,
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg16.py b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
index c91c24e6fb..9da74253ab 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
@@ -44,12 +44,14 @@ from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5'
 WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5'
 
 
+@tf_export('keras.applications.VGG16', 'keras.applications.vgg16.VGG16')
 def VGG16(include_top=True,
           weights='imagenet',
           input_tensor=None,
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg19.py b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
index 223cd79d7b..961c1f9918 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
@@ -44,12 +44,14 @@ from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels.h5'
 WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5'
 
 
+@tf_export('keras.applications.VGG19', 'keras.applications.vgg19.VGG19')
 def VGG19(include_top=True,
           weights='imagenet',
           input_tensor=None,
diff --git a/tensorflow/python/keras/_impl/keras/applications/xception.py b/tensorflow/python/keras/_impl/keras/applications/xception.py
index 0a6eb4953a..7e7ca5a18a 100644
--- a/tensorflow/python/keras/_impl/keras/applications/xception.py
+++ b/tensorflow/python/keras/_impl/keras/applications/xception.py
@@ -57,12 +57,15 @@ from tensorflow.python.keras._impl.keras.layers import SeparableConv2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 TF_WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels.h5'
 TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels_notop.h5'
 
 
+@tf_export('keras.applications.Xception',
+           'keras.applications.xception.Xception')
 def Xception(include_top=True,
              weights='imagenet',
              input_tensor=None,
@@ -328,6 +331,7 @@ def Xception(include_top=True,
   return model
 
 
+@tf_export('keras.applications.xception.preprocess_input')
 def preprocess_input(x):
   """Preprocesses a numpy array encoding a batch of images.
 
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 59050f24fb..e1126365bf 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -56,6 +56,7 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_module
 from tensorflow.python.training import moving_averages
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 
 py_all = all
@@ -98,6 +99,7 @@ _IMAGE_DATA_FORMAT = 'channels_last'
 _LOCAL_DEVICES = None
 
 
+@tf_export('keras.backend.backend')
 def backend():
   """Publicly accessible method for determining the current backend.
 
@@ -109,6 +111,7 @@ def backend():
   return 'tensorflow'
 
 
+@tf_export('keras.backend.epsilon')
 def epsilon():
   """Returns the value of the fuzz factor used in numeric expressions.
 
@@ -124,6 +127,7 @@ def epsilon():
   return _EPSILON
 
 
+@tf_export('keras.backend.set_epsilon')
 def set_epsilon(value):
   """Sets the value of the fuzz factor used in numeric expressions.
 
@@ -144,6 +148,7 @@ def set_epsilon(value):
   _EPSILON = value
 
 
+@tf_export('keras.backend.floatx')
 def floatx():
   """Returns the default float type, as a string.
 
@@ -161,6 +166,7 @@ def floatx():
   return _FLOATX
 
 
+@tf_export('keras.backend.set_floatx')
 def set_floatx(value):
   """Sets the default float type.
 
@@ -186,6 +192,7 @@ def set_floatx(value):
   _FLOATX = str(value)
 
 
+@tf_export('keras.backend.cast_to_floatx')
 def cast_to_floatx(x):
   """Cast a Numpy array to the default Keras float type.
 
@@ -213,6 +220,7 @@ def cast_to_floatx(x):
   return np.asarray(x, dtype=_FLOATX)
 
 
+@tf_export('keras.backend.image_data_format')
 def image_data_format():
   """Returns the default image data format convention.
 
@@ -228,6 +236,7 @@ def image_data_format():
   return _IMAGE_DATA_FORMAT
 
 
+@tf_export('keras.backend.set_image_data_format')
 def set_image_data_format(data_format):
   """Sets the value of the image data format convention.
 
@@ -253,6 +262,7 @@ def set_image_data_format(data_format):
   _IMAGE_DATA_FORMAT = str(data_format)
 
 
+@tf_export('keras.backend.get_uid')
 def get_uid(prefix=''):
   """Associates a string prefix with an integer counter in a TensorFlow graph.
 
@@ -280,6 +290,7 @@ def get_uid(prefix=''):
   return layer_name_uids[prefix]
 
 
+@tf_export('keras.backend.reset_uids')
 def reset_uids():
   per_graph_layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS
   keys = list(per_graph_layer_name_uids.keys())
@@ -287,6 +298,7 @@ def reset_uids():
     del per_graph_layer_name_uids[key]
 
 
+@tf_export('keras.backend.clear_session')
 def clear_session():
   """Destroys the current TF graph and creates a new one.
 
@@ -303,6 +315,7 @@ def clear_session():
   _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = phase
 
 
+@tf_export('keras.backend.manual_variable_initialization')
 def manual_variable_initialization(value):
   """Sets the manual variable initialization flag.
 
@@ -319,6 +332,7 @@ def manual_variable_initialization(value):
   _MANUAL_VAR_INIT = value
 
 
+@tf_export('keras.backend.learning_phase')
 def learning_phase():
   """Returns the learning phase flag.
 
@@ -345,6 +359,7 @@ def learning_phase():
   return _GRAPH_LEARNING_PHASES[graph]
 
 
+@tf_export('keras.backend.set_learning_phase')
 def set_learning_phase(value):
   """Sets the learning phase to a fixed value.
 
@@ -363,6 +378,7 @@ def set_learning_phase(value):
     _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value
 
 
+@tf_export('keras.backend.get_session')
 def get_session():
   """Returns the TF session to be used by the backend.
 
@@ -398,6 +414,7 @@ def get_session():
   return session
 
 
+@tf_export('keras.backend.set_session')
 def set_session(session):
   """Sets the global TensorFlow session.
 
@@ -500,6 +517,7 @@ def _to_tensor(x, dtype):
   return ops.convert_to_tensor(x, dtype=dtype)
 
 
+@tf_export('keras.backend.is_sparse')
 def is_sparse(tensor):
   """Returns whether a tensor is a sparse tensor.
 
@@ -523,6 +541,7 @@ def is_sparse(tensor):
   return isinstance(tensor, sparse_tensor.SparseTensor)
 
 
+@tf_export('keras.backend.to_dense')
 def to_dense(tensor):
   """Converts a sparse tensor into a dense tensor and returns it.
 
@@ -552,6 +571,7 @@ def to_dense(tensor):
 name_scope = ops.name_scope
 
 
+@tf_export('keras.backend.variable')
 def variable(value, dtype=None, name=None, constraint=None):
   """Instantiates a variable and returns it.
 
@@ -624,6 +644,7 @@ def _initialize_variables(session):
       session.run(variables_module.variables_initializer(uninitialized_vars))
 
 
+@tf_export('keras.backend.constant')
 def constant(value, dtype=None, shape=None, name=None):
   """Creates a constant tensor.
 
@@ -692,6 +713,7 @@ def is_keras_tensor(x):
   return hasattr(x, '_keras_history')
 
 
+@tf_export('keras.backend.placeholder')
 def placeholder(shape=None, ndim=None, dtype=None, sparse=False, name=None):
   """Instantiates a placeholder tensor and returns it.
 
@@ -744,6 +766,7 @@ def is_placeholder(x):
     return False
 
 
+@tf_export('keras.backend.shape')
 def shape(x):
   """Returns the symbolic shape of a tensor or variable.
 
@@ -776,6 +799,7 @@ def shape(x):
   return array_ops.shape(x)
 
 
+@tf_export('keras.backend.int_shape')
 def int_shape(x):
   """Returns the shape of tensor or variable as a tuple of int or None entries.
 
@@ -803,6 +827,7 @@ def int_shape(x):
     return None
 
 
+@tf_export('keras.backend.ndim')
 def ndim(x):
   """Returns the number of axes in a tensor, as an integer.
 
@@ -830,6 +855,7 @@ def ndim(x):
   return None
 
 
+@tf_export('keras.backend.dtype')
 def dtype(x):
   """Returns the dtype of a Keras tensor or variable, as a string.
 
@@ -860,6 +886,7 @@ def dtype(x):
   return x.dtype.base_dtype.name
 
 
+@tf_export('keras.backend.eval')
 def eval(x):
   """Evaluates the value of a variable.
 
@@ -881,6 +908,7 @@ def eval(x):
   return to_dense(x).eval(session=get_session())
 
 
+@tf_export('keras.backend.zeros')
 def zeros(shape, dtype=None, name=None):
   """Instantiates an all-zeros variable and returns it.
 
@@ -913,6 +941,7 @@ def zeros(shape, dtype=None, name=None):
   return v
 
 
+@tf_export('keras.backend.ones')
 def ones(shape, dtype=None, name=None):
   """Instantiates an all-ones variable and returns it.
 
@@ -945,6 +974,7 @@ def ones(shape, dtype=None, name=None):
   return v
 
 
+@tf_export('keras.backend.eye')
 def eye(size, dtype=None, name=None):
   """Instantiate an identity matrix and returns it.
 
@@ -973,6 +1003,7 @@ def eye(size, dtype=None, name=None):
   return variable(linalg_ops.eye(size, dtype=tf_dtype), dtype, name)
 
 
+@tf_export('keras.backend.zeros_like')
 def zeros_like(x, dtype=None, name=None):
   """Instantiates an all-zeros variable of the same shape as another tensor.
 
@@ -998,6 +1029,7 @@ def zeros_like(x, dtype=None, name=None):
   return array_ops.zeros_like(x, dtype=dtype, name=name)
 
 
+@tf_export('keras.backend.ones_like')
 def ones_like(x, dtype=None, name=None):
   """Instantiates an all-ones variable of the same shape as another tensor.
 
@@ -1036,6 +1068,7 @@ def identity(x, name=None):
   return array_ops.identity(x, name=name)
 
 
+@tf_export('keras.backend.random_uniform_variable')
 def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None):
   """Instantiates a variable with values drawn from a uniform distribution.
 
@@ -1072,6 +1105,7 @@ def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None):
   return variable(value, dtype=dtype, name=name)
 
 
+@tf_export('keras.backend.random_normal_variable')
 def random_normal_variable(shape, mean, scale, dtype=None, name=None,
                            seed=None):
   """Instantiates a variable with values drawn from a normal distribution.
@@ -1109,6 +1143,7 @@ def random_normal_variable(shape, mean, scale, dtype=None, name=None,
   return variable(value, dtype=dtype, name=name)
 
 
+@tf_export('keras.backend.count_params')
 def count_params(x):
   """Returns the static number of elements in a variable or tensor.
 
@@ -1131,6 +1166,7 @@ def count_params(x):
   return np.prod(x.get_shape().as_list())
 
 
+@tf_export('keras.backend.cast')
 def cast(x, dtype):
   """Casts a tensor to a different dtype and returns it.
 
@@ -1166,10 +1202,12 @@ def cast(x, dtype):
 # UPDATES OPS
 
 
+@tf_export('keras.backend.update')
 def update(x, new_x):
   return state_ops.assign(x, new_x)
 
 
+@tf_export('keras.backend.update_add')
 def update_add(x, increment):
   """Update the value of `x` by adding `increment`.
 
@@ -1183,6 +1221,7 @@ def update_add(x, increment):
   return state_ops.assign_add(x, increment)
 
 
+@tf_export('keras.backend.update_sub')
 def update_sub(x, decrement):
   """Update the value of `x` by subtracting `decrement`.
 
@@ -1196,6 +1235,7 @@ def update_sub(x, decrement):
   return state_ops.assign_sub(x, decrement)
 
 
+@tf_export('keras.backend.moving_average_update')
 def moving_average_update(x, value, momentum):
   """Compute the moving average of a variable.
 
@@ -1214,6 +1254,7 @@ def moving_average_update(x, value, momentum):
 # LINEAR ALGEBRA
 
 
+@tf_export('keras.backend.dot')
 def dot(x, y):
   """Multiplies 2 tensors (and/or variables) and returns a *tensor*.
 
@@ -1285,6 +1326,7 @@ def dot(x, y):
   return out
 
 
+@tf_export('keras.backend.batch_dot')
 def batch_dot(x, y, axes=None):
   """Batchwise dot product.
 
@@ -1377,6 +1419,7 @@ def batch_dot(x, y, axes=None):
   return out
 
 
+@tf_export('keras.backend.transpose')
 def transpose(x):
   """Transposes a tensor and returns it.
 
@@ -1412,6 +1455,7 @@ def transpose(x):
   return array_ops.transpose(x)
 
 
+@tf_export('keras.backend.gather')
 def gather(reference, indices):
   """Retrieves the elements of indices `indices` in the tensor `reference`.
 
@@ -1428,6 +1472,7 @@ def gather(reference, indices):
 # ELEMENT-WISE OPERATIONS
 
 
+@tf_export('keras.backend.max')
 def max(x, axis=None, keepdims=False):
   """Maximum value in a tensor.
 
@@ -1445,6 +1490,7 @@ def max(x, axis=None, keepdims=False):
   return math_ops.reduce_max(x, axis, keepdims)
 
 
+@tf_export('keras.backend.min')
 def min(x, axis=None, keepdims=False):
   """Minimum value in a tensor.
 
@@ -1462,6 +1508,7 @@ def min(x, axis=None, keepdims=False):
   return math_ops.reduce_min(x, axis, keepdims)
 
 
+@tf_export('keras.backend.sum')
 def sum(x, axis=None, keepdims=False):
   """Sum of the values in a tensor, alongside the specified axis.
 
@@ -1479,6 +1526,7 @@ def sum(x, axis=None, keepdims=False):
   return math_ops.reduce_sum(x, axis, keepdims)
 
 
+@tf_export('keras.backend.prod')
 def prod(x, axis=None, keepdims=False):
   """Multiplies the values in a tensor, alongside the specified axis.
 
@@ -1522,6 +1570,7 @@ def cumprod(x, axis=0):
   return math_ops.cumprod(x, axis=axis)
 
 
+@tf_export('keras.backend.var')
 def var(x, axis=None, keepdims=False):
   """Variance of a tensor, alongside the specified axis.
 
@@ -1544,6 +1593,7 @@ def var(x, axis=None, keepdims=False):
       devs_squared, axis, keepdims)
 
 
+@tf_export('keras.backend.std')
 def std(x, axis=None, keepdims=False):
   """Standard deviation of a tensor, alongside the specified axis.
 
@@ -1561,6 +1611,7 @@ def std(x, axis=None, keepdims=False):
   return math_ops.sqrt(var(x, axis=axis, keepdims=keepdims))
 
 
+@tf_export('keras.backend.mean')
 def mean(x, axis=None, keepdims=False):
   """Mean of a tensor, alongside the specified axis.
 
@@ -1580,6 +1631,7 @@ def mean(x, axis=None, keepdims=False):
   return math_ops.reduce_mean(x, axis, keepdims)
 
 
+@tf_export('keras.backend.any')
 def any(x, axis=None, keepdims=False):
   """Bitwise reduction (logical OR).
 
@@ -1595,6 +1647,7 @@ def any(x, axis=None, keepdims=False):
   return math_ops.reduce_any(x, axis, keepdims)
 
 
+@tf_export('keras.backend.all')
 def all(x, axis=None, keepdims=False):
   """Bitwise reduction (logical AND).
 
@@ -1610,6 +1663,7 @@ def all(x, axis=None, keepdims=False):
   return math_ops.reduce_all(x, axis, keepdims)
 
 
+@tf_export('keras.backend.argmax')
 def argmax(x, axis=-1):
   """Returns the index of the maximum value along an axis.
 
@@ -1623,6 +1677,7 @@ def argmax(x, axis=-1):
   return math_ops.argmax(x, axis)
 
 
+@tf_export('keras.backend.argmin')
 def argmin(x, axis=-1):
   """Returns the index of the minimum value along an axis.
 
@@ -1636,6 +1691,7 @@ def argmin(x, axis=-1):
   return math_ops.argmin(x, axis)
 
 
+@tf_export('keras.backend.square')
 def square(x):
   """Element-wise square.
 
@@ -1648,6 +1704,7 @@ def square(x):
   return math_ops.square(x)
 
 
+@tf_export('keras.backend.abs')
 def abs(x):
   """Element-wise absolute value.
 
@@ -1660,6 +1717,7 @@ def abs(x):
   return math_ops.abs(x)
 
 
+@tf_export('keras.backend.sqrt')
 def sqrt(x):
   """Element-wise square root.
 
@@ -1675,6 +1733,7 @@ def sqrt(x):
   return math_ops.sqrt(x)
 
 
+@tf_export('keras.backend.exp')
 def exp(x):
   """Element-wise exponential.
 
@@ -1687,6 +1746,7 @@ def exp(x):
   return math_ops.exp(x)
 
 
+@tf_export('keras.backend.log')
 def log(x):
   """Element-wise log.
 
@@ -1720,6 +1780,7 @@ def logsumexp(x, axis=None, keepdims=False):
   return math_ops.reduce_logsumexp(x, axis, keepdims)
 
 
+@tf_export('keras.backend.round')
 def round(x):
   """Element-wise rounding to the closest integer.
 
@@ -1734,6 +1795,7 @@ def round(x):
   return math_ops.round(x)
 
 
+@tf_export('keras.backend.sign')
 def sign(x):
   """Element-wise sign.
 
@@ -1746,6 +1808,7 @@ def sign(x):
   return math_ops.sign(x)
 
 
+@tf_export('keras.backend.pow')
 def pow(x, a):
   """Element-wise exponentiation.
 
@@ -1759,6 +1822,7 @@ def pow(x, a):
   return math_ops.pow(x, a)
 
 
+@tf_export('keras.backend.clip')
 def clip(x, min_value, max_value):
   """Element-wise value clipping.
 
@@ -1779,6 +1843,7 @@ def clip(x, min_value, max_value):
   return clip_ops.clip_by_value(x, min_value, max_value)
 
 
+@tf_export('keras.backend.equal')
 def equal(x, y):
   """Element-wise equality between two tensors.
 
@@ -1792,6 +1857,7 @@ def equal(x, y):
   return math_ops.equal(x, y)
 
 
+@tf_export('keras.backend.not_equal')
 def not_equal(x, y):
   """Element-wise inequality between two tensors.
 
@@ -1805,6 +1871,7 @@ def not_equal(x, y):
   return math_ops.not_equal(x, y)
 
 
+@tf_export('keras.backend.greater')
 def greater(x, y):
   """Element-wise truth value of (x > y).
 
@@ -1818,6 +1885,7 @@ def greater(x, y):
   return math_ops.greater(x, y)
 
 
+@tf_export('keras.backend.greater_equal')
 def greater_equal(x, y):
   """Element-wise truth value of (x >= y).
 
@@ -1831,6 +1899,7 @@ def greater_equal(x, y):
   return math_ops.greater_equal(x, y)
 
 
+@tf_export('keras.backend.less')
 def less(x, y):
   """Element-wise truth value of (x < y).
 
@@ -1844,6 +1913,7 @@ def less(x, y):
   return math_ops.less(x, y)
 
 
+@tf_export('keras.backend.less_equal')
 def less_equal(x, y):
   """Element-wise truth value of (x <= y).
 
@@ -1857,6 +1927,7 @@ def less_equal(x, y):
   return math_ops.less_equal(x, y)
 
 
+@tf_export('keras.backend.maximum')
 def maximum(x, y):
   """Element-wise maximum of two tensors.
 
@@ -1870,6 +1941,7 @@ def maximum(x, y):
   return math_ops.maximum(x, y)
 
 
+@tf_export('keras.backend.minimum')
 def minimum(x, y):
   """Element-wise minimum of two tensors.
 
@@ -1883,6 +1955,7 @@ def minimum(x, y):
   return math_ops.minimum(x, y)
 
 
+@tf_export('keras.backend.sin')
 def sin(x):
   """Computes sin of x element-wise.
 
@@ -1895,6 +1968,7 @@ def sin(x):
   return math_ops.sin(x)
 
 
+@tf_export('keras.backend.cos')
 def cos(x):
   """Computes cos of x element-wise.
 
@@ -2009,6 +2083,7 @@ def _fused_normalize_batch_in_training(x,
       x, gamma, beta, epsilon=epsilon, data_format=tf_data_format)
 
 
+@tf_export('keras.backend.normalize_batch_in_training')
 def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
   """Computes mean and std for batch then apply batch_normalization on batch.
 
@@ -2038,6 +2113,7 @@ def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
           x, gamma, beta, reduction_axes, epsilon=epsilon)
 
 
+@tf_export('keras.backend.batch_normalization')
 def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
   """Applies batch normalization on x given mean, var, beta and gamma.
 
@@ -2061,6 +2137,7 @@ def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
 # SHAPE OPERATIONS
 
 
+@tf_export('keras.backend.concatenate')
 def concatenate(tensors, axis=-1):
   """Concatenates a list of tensors alongside the specified axis.
 
@@ -2084,6 +2161,7 @@ def concatenate(tensors, axis=-1):
     return array_ops.concat([to_dense(x) for x in tensors], axis)
 
 
+@tf_export('keras.backend.reshape')
 def reshape(x, shape):
   """Reshapes a tensor to the specified shape.
 
@@ -2097,6 +2175,7 @@ def reshape(x, shape):
   return array_ops.reshape(x, shape)
 
 
+@tf_export('keras.backend.permute_dimensions')
 def permute_dimensions(x, pattern):
   """Permutes axes in a tensor.
 
@@ -2111,6 +2190,7 @@ def permute_dimensions(x, pattern):
   return array_ops.transpose(x, perm=pattern)
 
 
+@tf_export('keras.backend.resize_images')
 def resize_images(x, height_factor, width_factor, data_format):
   """Resizes the images contained in a 4D tensor.
 
@@ -2155,6 +2235,7 @@ def resize_images(x, height_factor, width_factor, data_format):
     raise ValueError('Invalid data_format:', data_format)
 
 
+@tf_export('keras.backend.resize_volumes')
 def resize_volumes(x, depth_factor, height_factor, width_factor, data_format):
   """Resizes the volume contained in a 5D tensor.
 
@@ -2186,6 +2267,7 @@ def resize_volumes(x, depth_factor, height_factor, width_factor, data_format):
     raise ValueError('Invalid data_format:', data_format)
 
 
+@tf_export('keras.backend.repeat_elements')
 def repeat_elements(x, rep, axis):
   """Repeats the elements of a tensor along an axis, like `np.repeat`.
 
@@ -2238,6 +2320,7 @@ def repeat_elements(x, rep, axis):
   return x_rep
 
 
+@tf_export('keras.backend.repeat')
 def repeat(x, n):
   """Repeats a 2D tensor.
 
@@ -2257,6 +2340,7 @@ def repeat(x, n):
   return array_ops.tile(x, pattern)
 
 
+@tf_export('keras.backend.arange')
 def arange(start, stop=None, step=1, dtype='int32'):
   """Creates a 1D tensor containing a sequence of integers.
 
@@ -2302,6 +2386,7 @@ def tile(x, n):
   return array_ops.tile(x, n)
 
 
+@tf_export('keras.backend.flatten')
 def flatten(x):
   """Flatten a tensor.
 
@@ -2314,6 +2399,7 @@ def flatten(x):
   return array_ops.reshape(x, [-1])
 
 
+@tf_export('keras.backend.batch_flatten')
 def batch_flatten(x):
   """Turn a nD tensor into a 2D tensor with same 0th dimension.
 
@@ -2329,6 +2415,7 @@ def batch_flatten(x):
   return x
 
 
+@tf_export('keras.backend.expand_dims')
 def expand_dims(x, axis=-1):
   """Adds a 1-sized dimension at index "axis".
 
@@ -2342,6 +2429,7 @@ def expand_dims(x, axis=-1):
   return array_ops.expand_dims(x, axis)
 
 
+@tf_export('keras.backend.squeeze')
 def squeeze(x, axis):
   """Removes a 1-dimension from the tensor at index "axis".
 
@@ -2355,6 +2443,7 @@ def squeeze(x, axis):
   return array_ops.squeeze(x, [axis])
 
 
+@tf_export('keras.backend.temporal_padding')
 def temporal_padding(x, padding=(1, 1)):
   """Pads the middle dimension of a 3D tensor.
 
@@ -2371,6 +2460,7 @@ def temporal_padding(x, padding=(1, 1)):
   return array_ops.pad(x, pattern)
 
 
+@tf_export('keras.backend.spatial_2d_padding')
 def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None):
   """Pads the 2nd and 3rd dimensions of a 4D tensor.
 
@@ -2401,6 +2491,7 @@ def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None):
   return array_ops.pad(x, pattern)
 
 
+@tf_export('keras.backend.spatial_3d_padding')
 def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None):
   """Pads 5D tensor with zeros along the depth, height, width dimensions.
 
@@ -2444,6 +2535,7 @@ def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None):
   return array_ops.pad(x, pattern)
 
 
+@tf_export('keras.backend.stack')
 def stack(x, axis=0):
   """Stacks a list of rank `R` tensors into a rank `R+1` tensor.
 
@@ -2457,6 +2549,7 @@ def stack(x, axis=0):
   return array_ops.stack(x, axis=axis)
 
 
+@tf_export('keras.backend.one_hot')
 def one_hot(indices, num_classes):
   """Computes the one-hot representation of an integer tensor.
 
@@ -2475,6 +2568,7 @@ def one_hot(indices, num_classes):
   return array_ops.one_hot(indices, depth=num_classes, axis=-1)
 
 
+@tf_export('keras.backend.reverse')
 def reverse(x, axes):
   """Reverse a tensor along the specified axes.
 
@@ -2494,6 +2588,7 @@ def reverse(x, axes):
 # VALUE MANIPULATION
 
 
+@tf_export('keras.backend.get_value')
 def get_value(x):
   """Returns the value of a variable.
 
@@ -2506,6 +2601,7 @@ def get_value(x):
   return x.eval(session=get_session())
 
 
+@tf_export('keras.backend.batch_get_value')
 def batch_get_value(tensors):
   """Returns the value of more than one tensor variable.
 
@@ -2521,6 +2617,7 @@ def batch_get_value(tensors):
     return []
 
 
+@tf_export('keras.backend.set_value')
 def set_value(x, value):
   """Sets the value of a variable, from a Numpy array.
 
@@ -2542,6 +2639,7 @@ def set_value(x, value):
   get_session().run(assign_op, feed_dict={assign_placeholder: value})
 
 
+@tf_export('keras.backend.batch_set_value')
 def batch_set_value(tuples):
   """Sets the values of many tensor variables at once.
 
@@ -2568,6 +2666,7 @@ def batch_set_value(tuples):
     get_session().run(assign_ops, feed_dict=feed_dict)
 
 
+@tf_export('keras.backend.print_tensor')
 def print_tensor(x, message=''):
   """Prints `message` and the tensor value when evaluated.
 
@@ -2665,6 +2764,7 @@ class Function(object):
     return updated[:len(self.outputs)]
 
 
+@tf_export('keras.backend.function')
 def function(inputs, outputs, updates=None, **kwargs):
   """Instantiates a Keras function.
 
@@ -2690,6 +2790,7 @@ def function(inputs, outputs, updates=None, **kwargs):
   return Function(inputs, outputs, updates=updates, **kwargs)
 
 
+@tf_export('keras.backend.gradients')
 def gradients(loss, variables):
   """Returns the gradients of `variables` w.r.t. `loss`.
 
@@ -2704,6 +2805,7 @@ def gradients(loss, variables):
       loss, variables, colocate_gradients_with_ops=True)
 
 
+@tf_export('keras.backend.stop_gradient')
 def stop_gradient(variables):
   """Returns `variables` but with zero gradient w.r.t. every other variable.
 
@@ -2724,6 +2826,7 @@ def stop_gradient(variables):
 # CONTROL FLOW
 
 
+@tf_export('keras.backend.rnn')
 def rnn(step_function,
         inputs,
         initial_states,
@@ -2970,6 +3073,7 @@ def rnn(step_function,
   return last_output, outputs, new_states
 
 
+@tf_export('keras.backend.switch')
 def switch(condition, then_expression, else_expression):
   """Switches between two operations depending on a scalar value.
 
@@ -3033,6 +3137,7 @@ def switch(condition, then_expression, else_expression):
   return x
 
 
+@tf_export('keras.backend.in_train_phase')
 def in_train_phase(x, alt, training=None):
   """Selects `x` in train phase, and `alt` otherwise.
 
@@ -3076,6 +3181,7 @@ def in_train_phase(x, alt, training=None):
   return x
 
 
+@tf_export('keras.backend.in_test_phase')
 def in_test_phase(x, alt, training=None):
   """Selects `x` in test phase, and `alt` otherwise.
 
@@ -3099,6 +3205,7 @@ def in_test_phase(x, alt, training=None):
 # NN OPERATIONS
 
 
+@tf_export('keras.backend.relu')
 def relu(x, alpha=0., max_value=None):
   """Rectified linear unit.
 
@@ -3125,6 +3232,7 @@ def relu(x, alpha=0., max_value=None):
   return x
 
 
+@tf_export('keras.backend.elu')
 def elu(x, alpha=1.):
   """Exponential linear unit.
 
@@ -3142,6 +3250,7 @@ def elu(x, alpha=1.):
     return array_ops.where(x > 0, res, alpha * res)
 
 
+@tf_export('keras.backend.softmax')
 def softmax(x):
   """Softmax of a tensor.
 
@@ -3154,6 +3263,7 @@ def softmax(x):
   return nn.softmax(x)
 
 
+@tf_export('keras.backend.softplus')
 def softplus(x):
   """Softplus of a tensor.
 
@@ -3166,6 +3276,7 @@ def softplus(x):
   return nn.softplus(x)
 
 
+@tf_export('keras.backend.softsign')
 def softsign(x):
   """Softsign of a tensor.
 
@@ -3178,6 +3289,7 @@ def softsign(x):
   return nn.softsign(x)
 
 
+@tf_export('keras.backend.categorical_crossentropy')
 def categorical_crossentropy(target, output, from_logits=False):
   """Categorical crossentropy between an output tensor and a target tensor.
 
@@ -3208,6 +3320,7 @@ def categorical_crossentropy(target, output, from_logits=False):
     return nn.softmax_cross_entropy_with_logits(labels=target, logits=output)
 
 
+@tf_export('keras.backend.sparse_categorical_crossentropy')
 def sparse_categorical_crossentropy(target, output, from_logits=False):
   """Categorical crossentropy with integer targets.
 
@@ -3241,6 +3354,7 @@ def sparse_categorical_crossentropy(target, output, from_logits=False):
     return res
 
 
+@tf_export('keras.backend.binary_crossentropy')
 def binary_crossentropy(target, output, from_logits=False):
   """Binary crossentropy between an output tensor and a target tensor.
 
@@ -3264,6 +3378,7 @@ def binary_crossentropy(target, output, from_logits=False):
   return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
 
 
+@tf_export('keras.backend.sigmoid')
 def sigmoid(x):
   """Element-wise sigmoid.
 
@@ -3276,6 +3391,7 @@ def sigmoid(x):
   return nn.sigmoid(x)
 
 
+@tf_export('keras.backend.hard_sigmoid')
 def hard_sigmoid(x):
   """Segment-wise linear approximation of sigmoid.
 
@@ -3296,6 +3412,7 @@ def hard_sigmoid(x):
   return x
 
 
+@tf_export('keras.backend.tanh')
 def tanh(x):
   """Element-wise tanh.
 
@@ -3308,6 +3425,7 @@ def tanh(x):
   return nn.tanh(x)
 
 
+@tf_export('keras.backend.dropout')
 def dropout(x, level, noise_shape=None, seed=None):
   """Sets entries in `x` to zero at random, while scaling the entire tensor.
 
@@ -3330,6 +3448,7 @@ def dropout(x, level, noise_shape=None, seed=None):
   return nn.dropout(x * 1., retain_prob, noise_shape, seed=seed)
 
 
+@tf_export('keras.backend.l2_normalize')
 def l2_normalize(x, axis=None):
   """Normalizes a tensor wrt the L2 norm alongside the specified axis.
 
@@ -3343,6 +3462,7 @@ def l2_normalize(x, axis=None):
   return nn.l2_normalize(x, dim=axis)
 
 
+@tf_export('keras.backend.in_top_k')
 def in_top_k(predictions, targets, k):
   """Returns whether the `targets` are in the top `k` `predictions`.
 
@@ -3440,6 +3560,7 @@ def _preprocess_padding(padding):
   return padding
 
 
+@tf_export('keras.backend.conv1d')
 def conv1d(x,
            kernel,
            strides=1,
@@ -3489,6 +3610,7 @@ def conv1d(x,
   return x
 
 
+@tf_export('keras.backend.conv2d')
 def conv2d(x,
            kernel,
            strides=(1, 1),
@@ -3533,6 +3655,7 @@ def conv2d(x,
   return x
 
 
+@tf_export('keras.backend.conv2d_transpose')
 def conv2d_transpose(x,
                      kernel,
                      output_shape,
@@ -3654,6 +3777,7 @@ def separable_conv1d(x,
   return x
 
 
+@tf_export('keras.backend.separable_conv2d')
 def separable_conv2d(x,
                      depthwise_kernel,
                      pointwise_kernel,
@@ -3753,6 +3877,7 @@ def depthwise_conv2d(x,
   return x
 
 
+@tf_export('keras.backend.conv3d')
 def conv3d(x,
            kernel,
            strides=(1, 1, 1),
@@ -3858,6 +3983,7 @@ def conv3d_transpose(x,
   return x
 
 
+@tf_export('keras.backend.pool2d')
 def pool2d(x,
            pool_size,
            strides=(1, 1),
@@ -3910,6 +4036,7 @@ def pool2d(x,
   return x
 
 
+@tf_export('keras.backend.pool3d')
 def pool3d(x,
            pool_size,
            strides=(1, 1, 1),
@@ -4073,6 +4200,7 @@ def local_conv2d(inputs,
   return output
 
 
+@tf_export('keras.backend.bias_add')
 def bias_add(x, bias, data_format=None):
   """Adds a bias vector to a tensor.
 
@@ -4146,6 +4274,7 @@ def bias_add(x, bias, data_format=None):
 # RANDOMNESS
 
 
+@tf_export('keras.backend.random_normal')
 def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
   """Returns a tensor with normal distribution of values.
 
@@ -4168,6 +4297,7 @@ def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
       shape, mean=mean, stddev=stddev, dtype=dtype, seed=seed)
 
 
+@tf_export('keras.backend.random_uniform')
 def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None):
   """Returns a tensor with uniform distribution of values.
 
@@ -4191,6 +4321,7 @@ def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None):
       shape, minval=minval, maxval=maxval, dtype=dtype, seed=seed)
 
 
+@tf_export('keras.backend.random_binomial')
 def random_binomial(shape, p=0.0, dtype=None, seed=None):
   """Returns a tensor with random binomial distribution of values.
 
@@ -4212,6 +4343,7 @@ def random_binomial(shape, p=0.0, dtype=None, seed=None):
       array_ops.ones(shape, dtype=dtype), array_ops.zeros(shape, dtype=dtype))
 
 
+@tf_export('keras.backend.truncated_normal')
 def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
   """Returns a tensor with truncated random normal distribution of values.
 
@@ -4245,6 +4377,7 @@ def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
 # in TensorFlow's CTC implementation
 
 
+@tf_export('keras.backend.ctc_label_dense_to_sparse')
 def ctc_label_dense_to_sparse(labels, label_lengths):
   """Converts CTC labels from dense to sparse.
 
@@ -4289,6 +4422,7 @@ def ctc_label_dense_to_sparse(labels, label_lengths):
       math_ops.to_int64(indices), vals_sparse, math_ops.to_int64(label_shape))
 
 
+@tf_export('keras.backend.ctc_batch_cost')
 def ctc_batch_cost(y_true, y_pred, input_length, label_length):
   """Runs CTC loss algorithm on each batch element.
 
@@ -4318,6 +4452,7 @@ def ctc_batch_cost(y_true, y_pred, input_length, label_length):
           inputs=y_pred, labels=sparse_labels, sequence_length=input_length), 1)
 
 
+@tf_export('keras.backend.ctc_decode')
 def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
   """Decodes the output of a softmax.
 
@@ -4369,6 +4504,7 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
 # HIGH ORDER FUNCTIONS
 
 
+@tf_export('keras.backend.map_fn')
 def map_fn(fn, elems, name=None, dtype=None):
   """Map the function fn over the elements elems and return the outputs.
 
@@ -4384,6 +4520,7 @@ def map_fn(fn, elems, name=None, dtype=None):
   return functional_ops.map_fn(fn, elems, name=name, dtype=dtype)
 
 
+@tf_export('keras.backend.foldl')
 def foldl(fn, elems, initializer=None, name=None):
   """Reduce elems using fn to combine them from left to right.
 
@@ -4400,6 +4537,7 @@ def foldl(fn, elems, initializer=None, name=None):
   return functional_ops.foldl(fn, elems, initializer=initializer, name=name)
 
 
+@tf_export('keras.backend.foldr')
 def foldr(fn, elems, initializer=None, name=None):
   """Reduce elems using fn to combine them from right to left.
 
diff --git a/tensorflow/python/keras/_impl/keras/callbacks.py b/tensorflow/python/keras/_impl/keras/callbacks.py
index f0d9e0b0f5..b29bc39232 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks.py
+++ b/tensorflow/python/keras/_impl/keras/callbacks.py
@@ -35,6 +35,7 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary as tf_summary
+from tensorflow.python.util.tf_export import tf_export
 
 
 try:
@@ -159,6 +160,7 @@ class CallbackList(object):
     return iter(self.callbacks)
 
 
+@tf_export('keras.callbacks.Callback')
 class Callback(object):
   """Abstract base class used to build new callbacks.
 
@@ -215,6 +217,7 @@ class Callback(object):
     pass
 
 
+@tf_export('keras.callbacks.BaseLogger')
 class BaseLogger(Callback):
   """Callback that accumulates epoch averages of metrics.
 
@@ -244,6 +247,7 @@ class BaseLogger(Callback):
           logs[k] = self.totals[k] / self.seen
 
 
+@tf_export('keras.callbacks.TerminateOnNaN')
 class TerminateOnNaN(Callback):
   """Callback that terminates training when a NaN loss is encountered.
   """
@@ -260,6 +264,7 @@ class TerminateOnNaN(Callback):
         self.model.stop_training = True
 
 
+@tf_export('keras.callbacks.ProgbarLogger')
 class ProgbarLogger(Callback):
   """Callback that prints metrics to stdout.
 
@@ -326,6 +331,7 @@ class ProgbarLogger(Callback):
       self.progbar.update(self.seen, self.log_values, force=True)
 
 
+@tf_export('keras.callbacks.History')
 class History(Callback):
   """Callback that records events into a `History` object.
 
@@ -345,6 +351,7 @@ class History(Callback):
       self.history.setdefault(k, []).append(v)
 
 
+@tf_export('keras.callbacks.ModelCheckpoint')
 class ModelCheckpoint(Callback):
   """Save the model after every epoch.
 
@@ -448,6 +455,7 @@ class ModelCheckpoint(Callback):
           self.model.save(filepath, overwrite=True)
 
 
+@tf_export('keras.callbacks.EarlyStopping')
 class EarlyStopping(Callback):
   """Stop training when a monitored quantity has stopped improving.
 
@@ -531,6 +539,7 @@ class EarlyStopping(Callback):
       print('Epoch %05d: early stopping' % (self.stopped_epoch + 1))
 
 
+@tf_export('keras.callbacks.RemoteMonitor')
 class RemoteMonitor(Callback):
   """Callback used to stream events to a server.
 
@@ -575,6 +584,7 @@ class RemoteMonitor(Callback):
                       'root server at ' + str(self.root))
 
 
+@tf_export('keras.callbacks.LearningRateScheduler')
 class LearningRateScheduler(Callback):
   """Learning rate scheduler.
 
@@ -603,6 +613,7 @@ class LearningRateScheduler(Callback):
             'rate to %s.' % (epoch + 1, lr))
 
 
+@tf_export('keras.callbacks.TensorBoard')
 class TensorBoard(Callback):
   # pylint: disable=line-too-long
   """Tensorboard basic visualizations.
@@ -772,6 +783,7 @@ class TensorBoard(Callback):
     self.writer.close()
 
 
+@tf_export('keras.callbacks.ReduceLROnPlateau')
 class ReduceLROnPlateau(Callback):
   """Reduce learning rate when a metric has stopped improving.
 
@@ -891,6 +903,7 @@ class ReduceLROnPlateau(Callback):
     return self.cooldown_counter > 0
 
 
+@tf_export('keras.callbacks.CSVLogger')
 class CSVLogger(Callback):
   """Callback that streams epoch results to a csv file.
 
@@ -971,6 +984,7 @@ class CSVLogger(Callback):
     self.writer = None
 
 
+@tf_export('keras.callbacks.LambdaCallback')
 class LambdaCallback(Callback):
   r"""Callback for creating simple, custom callbacks on-the-fly.
 
diff --git a/tensorflow/python/keras/_impl/keras/constraints.py b/tensorflow/python/keras/_impl/keras/constraints.py
index 05b834a7a9..ab62d575e3 100644
--- a/tensorflow/python/keras/_impl/keras/constraints.py
+++ b/tensorflow/python/keras/_impl/keras/constraints.py
@@ -24,8 +24,10 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.constraints.Constraint')
 class Constraint(object):
 
   def __call__(self, w):
@@ -35,6 +37,7 @@ class Constraint(object):
     return {}
 
 
+@tf_export('keras.constraints.MaxNorm', 'keras.constraints.max_norm')
 class MaxNorm(Constraint):
   """MaxNorm weight constraint.
 
@@ -70,6 +73,7 @@ class MaxNorm(Constraint):
     return {'max_value': self.max_value, 'axis': self.axis}
 
 
+@tf_export('keras.constraints.NonNeg', 'keras.constraints.non_neg')
 class NonNeg(Constraint):
   """Constrains the weights to be non-negative.
   """
@@ -78,6 +82,7 @@ class NonNeg(Constraint):
     return w * K.cast(K.greater_equal(w, 0.), K.floatx())
 
 
+@tf_export('keras.constraints.UnitNorm', 'keras.constraints.unit_norm')
 class UnitNorm(Constraint):
   """Constrains the weights incident to each hidden unit to have unit norm.
 
@@ -106,6 +111,7 @@ class UnitNorm(Constraint):
     return {'axis': self.axis}
 
 
+@tf_export('keras.constraints.MinMaxNorm', 'keras.constraints.min_max_norm')
 class MinMaxNorm(Constraint):
   """MinMaxNorm weight constraint.
 
@@ -170,10 +176,12 @@ nonneg = non_neg
 unitnorm = unit_norm
 
 
+@tf_export('keras.constraints.serialize')
 def serialize(constraint):
   return serialize_keras_object(constraint)
 
 
+@tf_export('keras.constraints.deserialize')
 def deserialize(config, custom_objects=None):
   return deserialize_keras_object(
       config,
@@ -182,6 +190,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='constraint')
 
 
+@tf_export('keras.constraints.get')
 def get(identifier):
   if identifier is None:
     return None
diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index 624e92a04b..db0140c2df 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -37,6 +37,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
@@ -244,6 +245,7 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
         saver.save(sess, os.path.join(estimator.model_dir, 'keras_model.ckpt'))
 
 
+@tf_export('keras.estimator.model_to_estimator')
 def model_to_estimator(keras_model=None,
                        keras_model_path=None,
                        custom_objects=None,
diff --git a/tensorflow/python/keras/_impl/keras/initializers.py b/tensorflow/python/keras/_impl/keras/initializers.py
index 8752faa534..338c669f97 100644
--- a/tensorflow/python/keras/_impl/keras/initializers.py
+++ b/tensorflow/python/keras/_impl/keras/initializers.py
@@ -32,8 +32,10 @@ from tensorflow.python.ops.init_ops import RandomUniform
 from tensorflow.python.ops.init_ops import TruncatedNormal
 from tensorflow.python.ops.init_ops import VarianceScaling
 from tensorflow.python.ops.init_ops import Zeros
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.initializers.lecun_normal')
 def lecun_normal(seed=None):
   """LeCun normal initializer.
 
@@ -56,6 +58,7 @@ def lecun_normal(seed=None):
       scale=1., mode='fan_in', distribution='normal', seed=seed)
 
 
+@tf_export('keras.initializers.lecun_uniform')
 def lecun_uniform(seed=None):
   """LeCun uniform initializer.
 
@@ -77,6 +80,7 @@ def lecun_uniform(seed=None):
       scale=1., mode='fan_in', distribution='uniform', seed=seed)
 
 
+@tf_export('keras.initializers.glorot_normal')
 def glorot_normal(seed=None):
   """Glorot normal initializer, also called Xavier normal initializer.
 
@@ -99,6 +103,7 @@ def glorot_normal(seed=None):
       scale=1., mode='fan_avg', distribution='normal', seed=seed)
 
 
+@tf_export('keras.initializers.glorot_uniform')
 def glorot_uniform(seed=None):
   """Glorot uniform initializer, also called Xavier uniform initializer.
 
@@ -121,6 +126,7 @@ def glorot_uniform(seed=None):
       scale=1., mode='fan_avg', distribution='uniform', seed=seed)
 
 
+@tf_export('keras.initializers.he_normal')
 def he_normal(seed=None):
   """He normal initializer.
 
@@ -141,6 +147,7 @@ def he_normal(seed=None):
       scale=2., mode='fan_in', distribution='normal', seed=seed)
 
 
+@tf_export('keras.initializers.he_uniform')
 def he_uniform(seed=None):
   """He uniform variance scaling initializer.
 
@@ -178,10 +185,12 @@ orthogonal = Orthogonal
 # Utility functions
 
 
+@tf_export('keras.initializers.serialize')
 def serialize(initializer):
   return serialize_keras_object(initializer)
 
 
+@tf_export('keras.initializers.deserialize')
 def deserialize(config, custom_objects=None):
   return deserialize_keras_object(
       config,
@@ -190,6 +199,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='initializer')
 
 
+@tf_export('keras.initializers.get')
 def get(identifier):
   if isinstance(identifier, dict):
     return deserialize(identifier)
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/_impl/keras/losses.py
index fe0ef54360..1576ed7b99 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/_impl/keras/losses.py
@@ -24,41 +24,54 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.metrics.mean_squared_error',
+           'keras.losses.mean_squared_error')
 def mean_squared_error(y_true, y_pred):
   return K.mean(K.square(y_pred - y_true), axis=-1)
 
 
+@tf_export('keras.metrics.mean_absolute_error',
+           'keras.losses.mean_absolute_error')
 def mean_absolute_error(y_true, y_pred):
   return K.mean(K.abs(y_pred - y_true), axis=-1)
 
 
+@tf_export('keras.metrics.mean_absolute_percentage_error',
+           'keras.losses.mean_absolute_percentage_error')
 def mean_absolute_percentage_error(y_true, y_pred):
   diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None))
   return 100. * K.mean(diff, axis=-1)
 
 
+@tf_export('keras.metrics.mean_squared_logarithmic_error',
+           'keras.losses.mean_squared_logarithmic_error')
 def mean_squared_logarithmic_error(y_true, y_pred):
   first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
   second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
   return K.mean(K.square(first_log - second_log), axis=-1)
 
 
+@tf_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge')
 def squared_hinge(y_true, y_pred):
   return K.mean(K.square(K.maximum(1. - y_true * y_pred, 0.)), axis=-1)
 
 
+@tf_export('keras.metrics.hinge', 'keras.losses.hinge')
 def hinge(y_true, y_pred):
   return K.mean(K.maximum(1. - y_true * y_pred, 0.), axis=-1)
 
 
+@tf_export('keras.losses.categorical_hinge')
 def categorical_hinge(y_true, y_pred):
   pos = K.sum(y_true * y_pred, axis=-1)
   neg = K.max((1. - y_true) * y_pred, axis=-1)
   return K.maximum(0., neg - pos + 1.)
 
 
+@tf_export('keras.losses.logcosh')
 def logcosh(y_true, y_pred):
   """Logarithm of the hyperbolic cosine of the prediction error.
 
@@ -81,28 +94,38 @@ def logcosh(y_true, y_pred):
   return K.mean(_logcosh(y_pred - y_true), axis=-1)
 
 
+@tf_export('keras.metrics.categorical_crossentropy',
+           'keras.losses.categorical_crossentropy')
 def categorical_crossentropy(y_true, y_pred):
   return K.categorical_crossentropy(y_true, y_pred)
 
 
+@tf_export('keras.metrics.sparse_categorical_crossentropy',
+           'keras.losses.sparse_categorical_crossentropy')
 def sparse_categorical_crossentropy(y_true, y_pred):
   return K.sparse_categorical_crossentropy(y_true, y_pred)
 
 
+@tf_export('keras.metrics.binary_crossentropy',
+           'keras.losses.binary_crossentropy')
 def binary_crossentropy(y_true, y_pred):
   return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)
 
 
+@tf_export('keras.metrics.kullback_leibler_divergence',
+           'keras.losses.kullback_leibler_divergence')
 def kullback_leibler_divergence(y_true, y_pred):
   y_true = K.clip(y_true, K.epsilon(), 1)
   y_pred = K.clip(y_pred, K.epsilon(), 1)
   return K.sum(y_true * K.log(y_true / y_pred), axis=-1)
 
 
+@tf_export('keras.metrics.poisson', 'keras.losses.poisson')
 def poisson(y_true, y_pred):
   return K.mean(y_pred - y_true * K.log(y_pred + K.epsilon()), axis=-1)
 
 
+@tf_export('keras.metrics.cosine_proximity', 'keras.losses.cosine_proximity')
 def cosine_proximity(y_true, y_pred):
   y_true = K.l2_normalize(y_true, axis=-1)
   y_pred = K.l2_normalize(y_pred, axis=-1)
@@ -119,10 +142,12 @@ kld = KLD = kullback_leibler_divergence
 cosine = cosine_proximity
 
 
+@tf_export('keras.losses.serialize')
 def serialize(loss):
   return serialize_keras_object(loss)
 
 
+@tf_export('keras.losses.deserialize')
 def deserialize(name, custom_objects=None):
   return deserialize_keras_object(
       name,
@@ -131,6 +156,7 @@ def deserialize(name, custom_objects=None):
       printable_module_name='loss function')
 
 
+@tf_export('keras.losses.get')
 def get(identifier):
   if identifier is None:
     return None
diff --git a/tensorflow/python/keras/_impl/keras/metrics.py b/tensorflow/python/keras/_impl/keras/metrics.py
index 3c18e68260..0e2fb6365a 100644
--- a/tensorflow/python/keras/_impl/keras/metrics.py
+++ b/tensorflow/python/keras/_impl/keras/metrics.py
@@ -36,12 +36,15 @@ from tensorflow.python.keras._impl.keras.losses import poisson
 from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crossentropy
 from tensorflow.python.keras._impl.keras.losses import squared_hinge
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.metrics.binary_accuracy')
 def binary_accuracy(y_true, y_pred):
   return K.mean(K.equal(y_true, K.round(y_pred)), axis=-1)
 
 
+@tf_export('keras.metrics.categorical_accuracy')
 def categorical_accuracy(y_true, y_pred):
   return K.cast(
       K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), K.floatx())
@@ -54,10 +57,12 @@ def sparse_categorical_accuracy(y_true, y_pred):
                                          K.floatx())), K.floatx())
 
 
+@tf_export('keras.metrics.top_k_categorical_accuracy')
 def top_k_categorical_accuracy(y_true, y_pred, k=5):
   return K.mean(K.in_top_k(y_pred, K.argmax(y_true, axis=-1), k), axis=-1)
 
 
+@tf_export('keras.metrics.sparse_top_k_categorical_accuracy')
 def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
   return K.mean(
       K.in_top_k(y_pred, K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
@@ -72,10 +77,12 @@ msle = MSLE = mean_squared_logarithmic_error
 cosine = cosine_proximity
 
 
+@tf_export('keras.metrics.serialize')
 def serialize(metric):
   return metric.__name__
 
 
+@tf_export('keras.metrics.deserialize')
 def deserialize(name, custom_objects=None):
   return deserialize_keras_object(
       name,
@@ -84,6 +91,7 @@ def deserialize(name, custom_objects=None):
       printable_module_name='metric function')
 
 
+@tf_export('keras.metrics.get')
 def get(identifier):
   if isinstance(identifier, six.string_types):
     identifier = str(identifier)
diff --git a/tensorflow/python/keras/_impl/keras/models.py b/tensorflow/python/keras/_impl/keras/models.py
index 9cd547200d..20736d234e 100644
--- a/tensorflow/python/keras/_impl/keras/models.py
+++ b/tensorflow/python/keras/_impl/keras/models.py
@@ -38,6 +38,7 @@ from tensorflow.python.keras._impl.keras.engine.training import Model
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=g-import-not-at-top
@@ -53,6 +54,7 @@ except ImportError:
 # pylint: enable=g-import-not-at-top
 
 
+@tf_export('keras.models.save_model')
 def save_model(model, filepath, overwrite=True, include_optimizer=True):
   """Save a model to a HDF5 file.
 
@@ -183,6 +185,7 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
     f.flush()
 
 
+@tf_export('keras.models.load_model')
 def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=redefined-builtin
   """Loads a model saved via `save_model`.
 
@@ -302,6 +305,7 @@ def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=
   return model
 
 
+@tf_export('keras.models.model_from_config')
 def model_from_config(config, custom_objects=None):
   """Instantiates a Keras model from its config.
 
@@ -324,6 +328,7 @@ def model_from_config(config, custom_objects=None):
   return layer_module.deserialize(config, custom_objects=custom_objects)
 
 
+@tf_export('keras.models.model_from_yaml')
 def model_from_yaml(yaml_string, custom_objects=None):
   """Parses a yaml model configuration file and returns a model instance.
 
@@ -345,6 +350,7 @@ def model_from_yaml(yaml_string, custom_objects=None):
   return layer_module.deserialize(config, custom_objects=custom_objects)
 
 
+@tf_export('keras.models.model_from_json')
 def model_from_json(json_string, custom_objects=None):
   """Parses a JSON model configuration file and returns a model instance.
 
@@ -361,6 +367,7 @@ def model_from_json(json_string, custom_objects=None):
   return layer_module.deserialize(config, custom_objects=custom_objects)
 
 
+@tf_export('keras.models.Sequential', 'keras.Sequential')
 class Sequential(Model):
   """Linear stack of layers.
 
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py
index a082677851..76a97156ed 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers.py
@@ -32,6 +32,7 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_ke
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer as tf_optimizer_module
+from tensorflow.python.util.tf_export import tf_export
 
 
 def clip_norm(g, c, n):
@@ -65,6 +66,7 @@ def clip_norm(g, c, n):
   return g
 
 
+@tf_export('keras.optimizers.Optimizer')
 class Optimizer(object):
   """Abstract optimizer base class.
 
@@ -149,6 +151,7 @@ class Optimizer(object):
     return cls(**config)
 
 
+@tf_export('keras.optimizers.SGD')
 class SGD(Optimizer):
   """Stochastic gradient descent optimizer.
 
@@ -213,6 +216,7 @@ class SGD(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.RMSprop')
 class RMSprop(Optimizer):
   """RMSProp optimizer.
 
@@ -279,6 +283,7 @@ class RMSprop(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.Adagrad')
 class Adagrad(Optimizer):
   """Adagrad optimizer.
 
@@ -338,6 +343,7 @@ class Adagrad(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.Adadelta')
 class Adadelta(Optimizer):
   """Adadelta optimizer.
 
@@ -410,6 +416,7 @@ class Adadelta(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.Adam')
 class Adam(Optimizer):
   """Adam optimizer.
 
@@ -504,6 +511,7 @@ class Adam(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.Adamax')
 class Adamax(Optimizer):
   """Adamax optimizer from Adam paper's Section 7.
 
@@ -586,6 +594,7 @@ class Adamax(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.Nadam')
 class Nadam(Optimizer):
   """Nesterov Adam optimizer.
 
@@ -724,10 +733,12 @@ adamax = Adamax
 nadam = Nadam
 
 
+@tf_export('keras.optimizers.serialize')
 def serialize(optimizer):
   return serialize_keras_object(optimizer)
 
 
+@tf_export('keras.optimizers.deserialize')
 def deserialize(config, custom_objects=None):
   """Inverse of the `serialize` function.
 
@@ -761,6 +772,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='optimizer')
 
 
+@tf_export('keras.optimizers.get')
 def get(identifier):
   """Retrieves a Keras Optimizer instance.
 
diff --git a/tensorflow/python/keras/_impl/keras/regularizers.py b/tensorflow/python/keras/_impl/keras/regularizers.py
index c53ee8a1ae..2c30844647 100644
--- a/tensorflow/python/keras/_impl/keras/regularizers.py
+++ b/tensorflow/python/keras/_impl/keras/regularizers.py
@@ -23,8 +23,10 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.regularizers.Regularizer')
 class Regularizer(object):
   """Regularizer base class.
   """
@@ -37,6 +39,7 @@ class Regularizer(object):
     return cls(**config)
 
 
+@tf_export('keras.regularizers.L1L2')
 class L1L2(Regularizer):
   """Regularizer for L1 and L2 regularization.
 
@@ -64,22 +67,27 @@ class L1L2(Regularizer):
 # Aliases.
 
 
+@tf_export('keras.regularizers.l1')
 def l1(l=0.01):
   return L1L2(l1=l)
 
 
+@tf_export('keras.regularizers.l2')
 def l2(l=0.01):
   return L1L2(l2=l)
 
 
+@tf_export('keras.regularizers.l1_l2')
 def l1_l2(l1=0.01, l2=0.01):  # pylint: disable=redefined-outer-name
   return L1L2(l1=l1, l2=l2)
 
 
+@tf_export('keras.regularizers.serialize')
 def serialize(regularizer):
   return serialize_keras_object(regularizer)
 
 
+@tf_export('keras.regularizers.deserialize')
 def deserialize(config, custom_objects=None):
   return deserialize_keras_object(
       config,
@@ -88,6 +96,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='regularizer')
 
 
+@tf_export('keras.regularizers.get')
 def get(identifier):
   if identifier is None:
     return None
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index a1c960be43..e731127a63 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -62,7 +62,20 @@ genrule(
         "api/image/__init__.py",
         "api/initializers/__init__.py",
         "api/keras/__init__.py",
+        "api/keras/activations/__init__.py",
+        "api/keras/applications/__init__.py",
+        "api/keras/applications/densenet/__init__.py",
+        "api/keras/applications/inception_resnet_v2/__init__.py",
+        "api/keras/applications/inception_v3/__init__.py",
+        "api/keras/applications/mobilenet/__init__.py",
+        "api/keras/applications/nasnet/__init__.py",
+        "api/keras/applications/resnet50/__init__.py",
+        "api/keras/applications/vgg16/__init__.py",
+        "api/keras/applications/vgg19/__init__.py",
+        "api/keras/applications/xception/__init__.py",
         "api/keras/backend/__init__.py",
+        "api/keras/callbacks/__init__.py",
+        "api/keras/constraints/__init__.py",
         "api/keras/datasets/__init__.py",
         "api/keras/datasets/boston_housing/__init__.py",
         "api/keras/datasets/cifar10/__init__.py",
@@ -70,13 +83,18 @@ genrule(
         "api/keras/datasets/imdb/__init__.py",
         "api/keras/datasets/mnist/__init__.py",
         "api/keras/datasets/reuters/__init__.py",
+        "api/keras/estimator/__init__.py",
         "api/keras/initializers/__init__.py",
         "api/keras/layers/__init__.py",
+        "api/keras/losses/__init__.py",
+        "api/keras/metrics/__init__.py",
         "api/keras/models/__init__.py",
+        "api/keras/optimizers/__init__.py",
         "api/keras/preprocessing/__init__.py",
         "api/keras/preprocessing/image/__init__.py",
         "api/keras/preprocessing/sequence/__init__.py",
         "api/keras/preprocessing/text/__init__.py",
+        "api/keras/regularizers/__init__.py",
         "api/keras/utils/__init__.py",
         "api/keras/wrappers/__init__.py",
         "api/keras/wrappers/scikit_learn/__init__.py",
-- 
GitLab


From d37c028b35b88b8fb0512b2f3093e8f77760dfeb Mon Sep 17 00:00:00 2001
From: Raghuraman Krishnamoorthi <raghuramank@google.com>
Date: Thu, 8 Feb 2018 14:50:54 -0800
Subject: [PATCH 869/966] Updated API for quantize_graph: Programmable
 quant_delay to determine start of quantization in training. Remove
 delay_requested parameter in _InsertQuantOp, presence or absence of
 quant_delay is directly inferred from value of quant_delay. Any positive
 value causes insertion of delayed quantization into the graph.

Experimental APIs for quantization with more programmability, including bitwidths for weights and activations.

PiperOrigin-RevId: 185056081
---
 .../contrib/quantize/python/quantize_graph.py | 104 ++++++++++++++----
 .../quantize/python/quantize_graph_test.py    |   2 +
 .../python/quantize_parameterized_test.py     |  22 ++--
 3 files changed, 99 insertions(+), 29 deletions(-)

diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 81471d4c50..6120b89683 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -23,8 +23,13 @@ from tensorflow.contrib.quantize.python import quantize
 from tensorflow.python.framework import ops
 
 
-def _create_graph(input_graph=None, is_training=True):
-  """Rewrites input_graph in place for simulated quantization.
+def _create_graph(input_graph=None,
+                  is_training=True,
+                  weight_bits=8,
+                  activation_bits=8,
+                  quant_delay=None,
+                  freeze_bn_delay=None):
+  """Returns a transformed training input_graph for simulated quantization.
 
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
@@ -35,29 +40,37 @@ def _create_graph(input_graph=None, is_training=True):
     input_graph: The tf.Graph to be transformed, if None then defaults to the
       default graph.
     is_training: Whether quantizing training or eval graph.
+    weight_bits: Number of bits to use for quantizing weights.
+    activation_bits: Number of bits to use for quantizing activations.
+    quant_delay: Number of steps after which weights and activations are
+                 quantized during training.
+    freeze_bn_delay: Number of steps after which moving mean and variance are
+                     frozen and used instead of batch statistics during
+                     training. freeze_bn_delay should be greater than
+                     quant_delay and should correspond to the number of steps
+                     when training has almost converged
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
         tf.Operation.
   """
-  if is_training:
-    # TODO(raghuramank): Need to make freeze_batch_norm_delay
-    # a function of the batch size. For now setting this to 250 epochs
-    # This corresponds to 5 million steps at a batch size of 64.
-    freeze_batch_norm_delay = 5000000
-  else:
-    freeze_batch_norm_delay = None
+
   if input_graph is None:
     input_graph = ops.get_default_graph()
   with input_graph.as_default():
     fold_batch_norms.FoldBatchNorms(
         input_graph,
-        freeze_batch_norm_delay=freeze_batch_norm_delay,
+        freeze_batch_norm_delay=freeze_bn_delay,
         is_training=is_training)
-    quantize.Quantize(input_graph, is_training=is_training)
+    quantize.Quantize(
+        input_graph,
+        is_training=is_training,
+        quant_delay=quant_delay,
+        weight_bits=weight_bits,
+        activation_bits=activation_bits)
 
 
-def create_training_graph(input_graph=None):
+def create_training_graph(input_graph=None, quant_delay=250000):
   """Rewrites a training input_graph in place for simulated quantization.
 
   The graph has fake quantization ops inserted to simulate the error
@@ -67,12 +80,32 @@ def create_training_graph(input_graph=None):
 
   Args:
     input_graph: The tf.Graph to be transformed.
+    quant_delay: Number of steps after which weights and activations are
+                 quantized during training.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
         tf.Operation.
   """
-  _create_graph(input_graph=input_graph, is_training=True)
+  # TODO(raghuramank) Need to have freeze_bn_delay be a function of batch size
+  # Currently the values below are hardcoded for mobilenetV1 on imagenet
+  # Please use the experimental API if you need to tune these values.
+  if quant_delay == 0:
+    # Corresponds to case of restoring from a floating point checkpoint
+    # In this case, we can freeze the moving mean and variance early on and
+    # switch to using them during training. Therefore, freeze_bn_delay is set to
+    # 200000
+    freeze_bn_delay = 200000
+  else:
+    # If training from scratch, set freeze_bn_delay to 100 epochs after quant
+    # delay. With a batch size of 64, this corresponds to 20000*100=2M steps.
+    freeze_bn_delay = quant_delay + 2000000
+
+  _create_graph(
+      input_graph=input_graph,
+      is_training=True,
+      quant_delay=quant_delay,
+      freeze_bn_delay=freeze_bn_delay)
 
 
 def create_eval_graph(input_graph=None):
@@ -95,8 +128,12 @@ def create_eval_graph(input_graph=None):
   _create_graph(input_graph=input_graph, is_training=False)
 
 
-def experimental_create_training_graph(input_graph=None):
-  """Rewrites a training input_graph in place for simulated quantization.
+def experimental_create_training_graph(input_graph=None,
+                                       weight_bits=8,
+                                       activation_bits=8,
+                                       quant_delay=250000,
+                                       freeze_bn_delay=500000):
+  """Returns a transformed training input_graph for simulated quantization.
 
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
@@ -104,18 +141,37 @@ def experimental_create_training_graph(input_graph=None):
   change.
 
   Args:
-    input_graph: The tf.Graph to be transformed, if None then defaults to the
+    input_graph: The tf.Graph to be transformed,if None then defaults to the
       default graph.
+    weight_bits:     Number of bits to use for quantizing weights.
+    activation_bits: Number of bits to use for quantizing activations.
+    quant_delay:     Number of steps after which weights and
+                     activations are quantized during training.
+    freeze_bn_delay: Number of steps after which moving mean and variance are
+                     frozen and used instead of batch statistics during
+                     training. freeze_bn_delay should be greater than
+                     quant_delay and should correspond to when training
+                     has almost converged
+
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
         tf.Operation.
   """
-  _create_graph(input_graph=input_graph, is_training=True)
 
+  _create_graph(
+      input_graph=input_graph,
+      is_training=True,
+      weight_bits=weight_bits,
+      activation_bits=activation_bits,
+      quant_delay=quant_delay,
+      freeze_bn_delay=freeze_bn_delay)
 
-def experimental_create_eval_graph(input_graph=None):
-  """Rewrites an eval input_graph in place for simulated quantization.
+
+def experimental_create_eval_graph(input_graph=None,
+                                   weight_bits=8,
+                                   activation_bits=8):
+  """Returns a transformed eval input_graph for simulated quantization.
 
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
@@ -125,9 +181,17 @@ def experimental_create_eval_graph(input_graph=None):
   Args:
     input_graph: The tf.Graph to be transformed, if None then defaults to the
       default graph.
+    weight_bits: Number of bits to use for quantizing weights.
+    activation_bits: Number of bits to use for quantizing activations.
+
+
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
         tf.Operation.
   """
-  _create_graph(input_graph=input_graph, is_training=False)
+  _create_graph(
+      input_graph=input_graph,
+      is_training=False,
+      weight_bits=weight_bits,
+      activation_bits=activation_bits)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index 7e08ebcb5c..c57fcd4e4e 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -28,6 +28,8 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import googletest
 
 
+# TODO(suharshs): Add tests for testing experimental APIs and additional
+# input arguments
 class QuantizeGraphTest(test_util.TensorFlowTestCase):
   # We have a lot of other tests that test the details of the rewrite, here we
   # just the specific features of the quantize_graph API.
diff --git a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
index f1fe322049..2d5307799b 100644
--- a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
@@ -101,9 +101,11 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = (
-        scope + '/weights_quant/delayed_quant/Switch_1'
-        if delay else scope + '/Conv2D')
+    if delay and delay > 0:
+      output_op_name = scope + '/weights_quant/delayed_quant/Switch_1'
+    else:
+      output_op_name = scope + '/Conv2D'
+
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -178,9 +180,10 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = (
-        scope + '/weights_quant/delayed_quant/Switch_1'
-        if delay else scope + '/MatMul')
+    if delay and delay > 0:
+      output_op_name = scope + '/weights_quant/delayed_quant/Switch_1'
+    else:
+      output_op_name = scope + '/MatMul'
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -256,9 +259,10 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         scope + '/depthwise_weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = (
-        scope + '/weights_quant/delayed_quant/Switch_1'
-        if delay else scope + '/depthwise')
+    if delay and delay > 0:
+      output_op_name = scope + '/weights_quant/delayed_quant/Switch_1'
+    else:
+      output_op_name = scope + '/depthwise'
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
-- 
GitLab


From e47f970f8395be1428c2d7fe0bf42aa4119803fa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 14:55:09 -0800
Subject: [PATCH 870/966] Add a _TensorProcessor for computing gradients but
 not applying them.

PiperOrigin-RevId: 185056764
---
 tensorflow/python/training/optimizer.py      | 25 +++++++++++++++++++-
 tensorflow/python/training/optimizer_test.py | 16 +++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 9ec588bac9..b806251f81 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -175,16 +175,39 @@ class _StreamingModelPortProcessor(_OptimizableVariable):
     return g
 
 
+class _TensorProcessor(_OptimizableVariable):
+  """Processor for ordinary Tensors.
+
+  Even though a Tensor can't really be updated, sometimes it is useful to
+  compute the gradients with respect to a Tensor using the optimizer. Updating
+  the Tensor is, of course, unsupported.
+  """
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g):
+    raise NotImplementedError("Trying to update a Tensor ", self._v)
+
+
 def _get_processor(v):
   """The processor of v."""
   if context.in_eager_mode():
-    return _DenseResourceVariableProcessor(v)
+    if isinstance(v, ops.Tensor):
+      return _TensorProcessor(v)
+    else:
+      return _DenseResourceVariableProcessor(v)
   if v.op.type == "VarHandleOp":
     return _DenseResourceVariableProcessor(v)
   if isinstance(v, variables.Variable):
     return _RefVariableProcessor(v)
   if v.op.type == "SubmodelPort":
     return _StreamingModelPortProcessor(v)
+  if isinstance(v, ops.Tensor):
+    return _TensorProcessor(v)
   raise NotImplementedError("Trying to optimize unsupported type ", v)
 
 
diff --git a/tensorflow/python/training/optimizer_test.py b/tensorflow/python/training/optimizer_test.py
index 6bdae39073..8652c61f01 100644
--- a/tensorflow/python/training/optimizer_test.py
+++ b/tensorflow/python/training/optimizer_test.py
@@ -221,6 +221,22 @@ class OptimizerTest(test.TestCase):
       self.assertAllClose([-14., -13.], self.evaluate(var0))
       self.assertAllClose([-6., -5.], self.evaluate(var1))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testComputeGradientsWithTensors(self):
+    def f(x):
+      return x * x
+    x = ops.convert_to_tensor(1.0)
+    y = f if context.in_eager_mode() else f(x)
+    sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+    grads_and_vars = sgd_op.compute_gradients(y, [x])
+    self.assertEqual(1, len(grads_and_vars))
+    grad, x_as_var = grads_and_vars[0]
+    self.assertIs(x, x_as_var)
+    self.assertEqual(2.0, self.evaluate(grad))
+
+    with self.assertRaises(NotImplementedError):
+      sgd_op.apply_gradients(grads_and_vars)
+
   def testTrainOp(self):
     with self.test_session():
       var0 = variables.Variable([1.0, 2.0])
-- 
GitLab


From 3d432ddefe43a6527cef1ffdcdb785a4f7db4a10 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 14:57:46 -0800
Subject: [PATCH 871/966] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 185057233

---
 tensorflow/go/op/wrappers.go | 240 +++++++++++++++++------------------
 1 file changed, 120 insertions(+), 120 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index db01f1068d..13f38dfb32 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -220,6 +220,64 @@ func CreateSummaryFileWriter(scope *Scope, writer tf.Output, logdir tf.Output, m
 	return scope.AddOperation(opspec)
 }
 
+// FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient.
+type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsPerChannelGradientNumBits sets the optional num_bits attribute to value.
+//
+// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsPerChannelGradientNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange sets the optional narrow_range attribute to value.
+//
+// value: Whether to quantize into 2^num_bits - 1 distinct values.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation.
+//
+// Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation,
+// shape one of: `[d]`, `[b, d]`,  `[b, h, w, d]`.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape
+//   same as `gradients`.
+// min, max: Quantization interval, floats of shape `[d]`.
+//
+//
+//
+// Returns Backpropagated gradients w.r.t. inputs, shape same as
+// `inputs`:
+//   `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter, shape `[d]`:
+// `sum_per_d(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter, shape `[d]`:
+// `sum_per_d(gradients * (inputs > max))`.
+func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FakeQuantWithMinMaxVarsPerChannelGradient",
+		Input: []tf.Input{
+			gradients, inputs, min, max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
 // For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
@@ -4580,6 +4638,68 @@ func CriticalSectionOp(scope *Scope, optional ...CriticalSectionOpAttr) (resourc
 	return op.Output(0)
 }
 
+// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
+type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["min"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["max"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Compute gradients for a FakeQuantWithMinMaxArgs operation.
+//
+// Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+//
+// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
+// `gradients * (inputs >= min && inputs <= max)`.
+func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FakeQuantWithMinMaxArgsGradient",
+		Input: []tf.Input{
+			gradients, inputs,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPool3DAttr is an optional argument to AvgPool3D.
 type AvgPool3DAttr func(optionalAttr)
 
@@ -20639,68 +20759,6 @@ func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Outp
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
-type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["min"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["max"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Compute gradients for a FakeQuantWithMinMaxArgs operation.
-//
-// Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
-//
-// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
-// `gradients * (inputs >= min && inputs <= max)`.
-func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgsGradient",
-		Input: []tf.Input{
-			gradients, inputs,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // BatchToSpace for 4-D tensors of type T.
 //
 // This is a legacy version of the more general BatchToSpaceND.
@@ -28280,61 +28338,3 @@ func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient.
-type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsPerChannelGradientNumBits sets the optional num_bits attribute to value.
-//
-// value: The bitwidth of the quantization; between 2 and 8, inclusive.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsPerChannelGradientNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange sets the optional narrow_range attribute to value.
-//
-// value: Whether to quantize into 2^num_bits - 1 distinct values.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsPerChannelGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation.
-//
-// Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation,
-// shape one of: `[d]`, `[b, d]`,  `[b, h, w, d]`.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape
-//   same as `gradients`.
-// min, max: Quantization interval, floats of shape `[d]`.
-//
-//
-//
-// Returns Backpropagated gradients w.r.t. inputs, shape same as
-// `inputs`:
-//   `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter, shape `[d]`:
-// `sum_per_d(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter, shape `[d]`:
-// `sum_per_d(gradients * (inputs > max))`.
-func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsPerChannelGradient",
-		Input: []tf.Input{
-			gradients, inputs, min, max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-- 
GitLab


From f98264a1b9916e46a88089b605e962265ecde1a6 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo <ebrevdo@google.com>
Date: Thu, 8 Feb 2018 15:01:49 -0800
Subject: [PATCH 872/966] [TF contrib RNN] Expose some rnn classes and
 functionality in contrib.

PiperOrigin-RevId: 185057994
---
 tensorflow/contrib/rnn/__init__.py                     |  7 +++++++
 tensorflow/contrib/rnn/python/ops/gru_ops.py           |  2 +-
 tensorflow/contrib/rnn/python/ops/lstm_ops.py          |  2 +-
 tensorflow/contrib/rnn/python/ops/rnn_cell.py          |  2 +-
 tensorflow/python/ops/rnn.py                           |  5 +++--
 tensorflow/python/ops/rnn_cell_impl.py                 | 10 +++++-----
 .../tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt   |  2 +-
 .../tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt     |  2 +-
 .../golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt    |  2 +-
 .../golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt  |  2 +-
 10 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/tensorflow/contrib/rnn/__init__.py b/tensorflow/contrib/rnn/__init__.py
index c568c6760f..67f31785b5 100644
--- a/tensorflow/contrib/rnn/__init__.py
+++ b/tensorflow/contrib/rnn/__init__.py
@@ -18,6 +18,7 @@ See @{$python/contrib.rnn} guide.
 
 <!--From core-->
 @@RNNCell
+@@LayerRNNCell
 @@BasicRNNCell
 @@BasicLSTMCell
 @@GRUCell
@@ -68,6 +69,10 @@ See @{$python/contrib.rnn} guide.
 @@static_bidirectional_rnn
 @@stack_bidirectional_dynamic_rnn
 @@stack_bidirectional_rnn
+
+<!--RNN utilities-->
+@@transpose_batch_time
+@@best_effort_input_batch_size
 """
 
 from __future__ import absolute_import
@@ -85,6 +90,8 @@ from tensorflow.contrib.rnn.python.ops.lstm_ops import *
 from tensorflow.contrib.rnn.python.ops.rnn import *
 from tensorflow.contrib.rnn.python.ops.rnn_cell import *
 
+from tensorflow.python.ops.rnn import _best_effort_input_batch_size as best_effort_input_batch_size
+from tensorflow.python.ops.rnn import _transpose_batch_time as transpose_batch_time
 from tensorflow.python.ops.rnn import static_bidirectional_rnn
 from tensorflow.python.ops.rnn import static_rnn
 from tensorflow.python.ops.rnn import static_state_saving_rnn
diff --git a/tensorflow/contrib/rnn/python/ops/gru_ops.py b/tensorflow/contrib/rnn/python/ops/gru_ops.py
index 4c964ec201..81ca12317b 100644
--- a/tensorflow/contrib/rnn/python/ops/gru_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/gru_ops.py
@@ -32,7 +32,7 @@ from tensorflow.python.util.deprecation import deprecated_args
 _gru_ops_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_gru_ops.so"))
 
-LayerRNNCell = rnn_cell_impl._LayerRNNCell  # pylint: disable=invalid-name,protected-access
+LayerRNNCell = rnn_cell_impl.LayerRNNCell  # pylint: disable=invalid-name
 
 
 @ops.RegisterGradient("GRUBlockCell")
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index 04f342cd18..f700717394 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -34,7 +34,7 @@ from tensorflow.python.platform import resource_loader
 _lstm_ops_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_lstm_ops.so"))
 
-LayerRNNCell = rnn_cell_impl._LayerRNNCell  # pylint: disable=invalid-name,protected-access
+LayerRNNCell = rnn_cell_impl.LayerRNNCell  # pylint: disable=invalid-name
 
 
 # pylint: disable=invalid-name
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index fe07493d0f..dce71c393a 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -2682,7 +2682,7 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
     return m, new_state
 
 
-class SRUCell(rnn_cell_impl._LayerRNNCell):
+class SRUCell(rnn_cell_impl.LayerRNNCell):
   """SRU, Simple Recurrent Unit
 
      Implementation based on
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index da80e72071..aa8d4327d2 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -83,8 +83,9 @@ def _best_effort_input_batch_size(flat_input):
   """Get static input batch size if available, with fallback to the dynamic one.
 
   Args:
-    flat_input: An iterable of time major input Tensors of shape [max_time,
-      batch_size, ...]. All inputs should have compatible batch sizes.
+    flat_input: An iterable of time major input Tensors of shape
+      `[max_time, batch_size, ...]`.
+    All inputs should have compatible batch sizes.
 
   Returns:
     The batch size in Python integer if available, or a scalar Tensor otherwise.
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index f1ac3e9baf..923348ea44 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -255,7 +255,7 @@ class RNNCell(base_layer.Layer):
     return output
 
 
-class _LayerRNNCell(RNNCell):
+class LayerRNNCell(RNNCell):
   """Subclass of RNNCells that act like proper `tf.Layer` objects.
 
   For backwards compatibility purposes, most `RNNCell` instances allow their
@@ -297,7 +297,7 @@ class _LayerRNNCell(RNNCell):
 
 
 @tf_export("nn.rnn_cell.BasicRNNCell")
-class BasicRNNCell(_LayerRNNCell):
+class BasicRNNCell(LayerRNNCell):
   """The most basic RNN cell.
 
   Args:
@@ -355,7 +355,7 @@ class BasicRNNCell(_LayerRNNCell):
 
 
 @tf_export("nn.rnn_cell.GRUCell")
-class GRUCell(_LayerRNNCell):
+class GRUCell(LayerRNNCell):
   """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).
 
   Args:
@@ -473,7 +473,7 @@ class LSTMStateTuple(_LSTMStateTuple):
 
 
 @tf_export("nn.rnn_cell.BasicLSTMCell")
-class BasicLSTMCell(_LayerRNNCell):
+class BasicLSTMCell(LayerRNNCell):
   """Basic LSTM recurrent network cell.
 
   The implementation is based on: http://arxiv.org/abs/1409.2329.
@@ -598,7 +598,7 @@ class BasicLSTMCell(_LayerRNNCell):
 
 
 @tf_export("nn.rnn_cell.LSTMCell")
-class LSTMCell(_LayerRNNCell):
+class LSTMCell(LayerRNNCell):
   """Long short-term memory unit (LSTM) recurrent network cell.
 
   The default non-peephole implementation is based on:
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index a2e728f94b..44536787f0 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.nn.rnn_cell.BasicLSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 4211faa1ec..768565d3ca 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.nn.rnn_cell.BasicRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.BasicRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index 06fdc638c8..6ecc134d4d 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.nn.rnn_cell.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.GRUCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index ef48cff0c3..4b3ca1578b 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.nn.rnn_cell.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LSTMCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
-- 
GitLab


From a00ba3780050b8e09f80788c56e3039da328c071 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Thu, 8 Feb 2018 15:06:54 -0800
Subject: [PATCH 873/966] Add more vlogging to op level estimator.

PiperOrigin-RevId: 185058989
---
 tensorflow/core/grappler/costs/op_level_cost_estimator.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 1af973855e..76db1afd4a 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -457,10 +457,15 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
     const TensorShapeProto& original_image_shape,
     const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
     bool* found_unknown_shapes) {
+  VLOG(2) << "op features: " << op_features.DebugString();
+  VLOG(2) << "Original image shape: " << original_image_shape.DebugString();
+  VLOG(2) << "Original filter shape: " << original_filter_shape.DebugString();
   auto image_shape =
       MaybeGetMinimumShape(original_image_shape, 4, found_unknown_shapes);
   auto filter_shape =
       MaybeGetMinimumShape(original_filter_shape, 4, found_unknown_shapes);
+  VLOG(2) << "Image shape: " << image_shape.DebugString();
+  VLOG(2) << "Filter shape: " << filter_shape.DebugString();
 
   int x_index, y_index, channel_index;
   const string& data_format = GetDataFormat(op_features);
-- 
GitLab


From a5d93698ad5ae1e6488b536abb8501cd6ec70551 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Thu, 8 Feb 2018 15:06:57 -0800
Subject: [PATCH 874/966] Add memory usage report to cost analyzer tool; run
 all default optimizations.

PiperOrigin-RevId: 185058999
---
 tensorflow/python/grappler/cost_analyzer_tool.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/grappler/cost_analyzer_tool.py b/tensorflow/python/grappler/cost_analyzer_tool.py
index ac251f2bbd..51b77b471b 100644
--- a/tensorflow/python/grappler/cost_analyzer_tool.py
+++ b/tensorflow/python/grappler/cost_analyzer_tool.py
@@ -54,14 +54,16 @@ def main(_):
       metagraph = saver.export_meta_graph(
           graph_def=graph.as_graph_def(), graph=graph)
 
+  rewriter_config = rewriter_config_pb2.RewriterConfig()
   if FLAGS.rewriter_config is not None:
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
     text_format.Merge(FLAGS.rewriter_config, rewriter_config)
-    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
-    metagraph.graph_def.CopyFrom(optimized_graph)
+  optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
+  metagraph.graph_def.CopyFrom(optimized_graph)
 
   report = cost_analyzer.GenerateCostReport(metagraph, FLAGS.per_node_report)
   print(report)
+  report = cost_analyzer.GenerateMemoryReport(metagraph)
+  print(report)
 
 
 if __name__ == "__main__":
-- 
GitLab


From 2939af7253339963d0c631e46468bdc26930897a Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Thu, 8 Feb 2018 15:18:31 -0800
Subject: [PATCH 875/966] Object based saving prototype: create
 ResourceVariables directly by default.

This avoids variable reuse errors when building a graph. Where necessary for compatibility, we can still use get_variable.

PiperOrigin-RevId: 185060891
---
 tensorflow/contrib/eager/python/BUILD         |   6 +
 .../contrib/eager/python/checkpointable.py    |  62 ++++++++--
 .../eager/python/checkpointable_test.py       | 113 ++++++++++++++----
 3 files changed, 147 insertions(+), 34 deletions(-)

diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 46406e3908..cfb38a1d26 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -226,9 +226,13 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/contrib/eager/proto:checkpointable_object_graph_proto_py",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:io_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
@@ -246,7 +250,9 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:layers",
+        "//tensorflow/python:layers_base",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
diff --git a/tensorflow/contrib/eager/python/checkpointable.py b/tensorflow/contrib/eager/python/checkpointable.py
index 09d405475b..896b38a734 100644
--- a/tensorflow/contrib/eager/python/checkpointable.py
+++ b/tensorflow/contrib/eager/python/checkpointable.py
@@ -23,8 +23,12 @@ import weakref
 
 from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import io_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import optimizer as optimizer_lib
@@ -75,6 +79,40 @@ def _assign_existing_variable(variable_to_restore, value_pointer):
       value_pointer.session.run(initializer_op)
 
 
+def _default_getter(name, shape, dtype, initializer=None,
+                    partition_info=None, **kwargs):
+  """A pared-down version of get_variable which does not reuse variables."""
+  dtype = dtypes.as_dtype(dtype)
+  shape_object = tensor_shape.as_shape(shape)
+  with ops.init_scope():
+    if initializer is None:
+      initializer, initializing_from_value = (
+          variable_scope._get_default_variable_store()._get_default_initializer(  # pylint: disable=protected-access
+              name=name, shape=shape_object, dtype=dtype))
+    else:
+      initializing_from_value = not callable(initializer)
+    # Same logic as get_variable
+    if initializing_from_value:
+      if shape is not None:
+        raise ValueError("If initializer is a constant, do not specify shape.")
+      initial_value = initializer
+      variable_dtype = None
+    else:
+      # Instantiate initializer if provided initializer is a type object.
+      if isinstance(initializer, type(init_ops.Initializer)):
+        initializer = initializer(dtype=dtype)
+      def initial_value():
+        return initializer(
+            shape_object.as_list(), dtype=dtype, partition_info=partition_info)
+      variable_dtype = dtype.base_dtype
+    return resource_variable_ops.ResourceVariable(
+        initial_value=initial_value,
+        name=name,
+        dtype=variable_dtype,
+        **kwargs
+    )
+
+
 class Checkpointable(object):
   """Manages variables and dependencies on other objects.
 
@@ -117,7 +155,8 @@ class Checkpointable(object):
         and value not in self._already_tracked):
       self.track_checkpointable(value, name=name)
 
-  def add_variable(self, name, shape, dtype=None, initializer=None, **kwargs):
+  def add_variable(self, name, shape=None, dtype=dtypes.float32,
+                   initializer=None, **kwargs):
     """Create a new variable object to be saved with this `Checkpointable`.
 
     If the user has requested that this object or another `Checkpointable` which
@@ -131,14 +170,18 @@ class Checkpointable(object):
       dtype: The data type of the variable.
       initializer: The initializer to use. Ignored if deferred loading has been
         requested.
-      **kwargs: Passed to get_variable.
+      **kwargs: Passed to the ResourceVariable constructor.
 
     Returns:
       The new variable object.
 
     Raises:
       ValueError: If the variable name is not unique.
+      RuntimeError: If __init__ has not been called.
     """
+    if not hasattr(self, "_owned_variables"):
+      raise RuntimeError("Need to call Checkpointable.__init__ before adding "
+                         "variables.")
     if name in self._owned_variables:
       raise ValueError(
           ("A variable named '%s' already exists in this Checkpointable, but "
@@ -151,18 +194,19 @@ class Checkpointable(object):
       # be relatively uncommon in user code.
       getter = kwargs.pop("getter")
     else:
-      getter = variable_scope.get_variable
+      getter = _default_getter
     deferred_restoration = self._deferred_restorations.pop(name, None)
     if deferred_restoration is not None:
       dtype = deferred_restoration.value_pointer.dtype
       base_type = dtype.base_dtype
       # TODO(allenl): Handle partitioned variables here too
-      initializer, = io_ops.restore_v2(
-          prefix=deferred_restoration.value_pointer.save_path,
-          tensor_names=[deferred_restoration.value_pointer.checkpoint_key],
-          shape_and_slices=[""],
-          dtypes=[base_type],
-          name="checkpoint_initializer")
+      with ops.init_scope():
+        initializer, = io_ops.restore_v2(
+            prefix=deferred_restoration.value_pointer.save_path,
+            tensor_names=[deferred_restoration.value_pointer.checkpoint_key],
+            shape_and_slices=[""],
+            dtypes=[base_type],
+            name="checkpoint_initializer")
       # We need to un-set the shape so get_variable doesn't complain, but we
       # also need to set the static shape information on the initializer if
       # possible so we don't get a variable with an unknown shape.
diff --git a/tensorflow/contrib/eager/python/checkpointable_test.py b/tensorflow/contrib/eager/python/checkpointable_test.py
index f0c3df56bc..f7bc155dec 100644
--- a/tensorflow/contrib/eager/python/checkpointable_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.layers import base
 from tensorflow.python.layers import core
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
@@ -119,15 +120,7 @@ class NonLayerCheckpointable(checkpointable.Checkpointable):
 
   def __init__(self):
     super(NonLayerCheckpointable, self).__init__()
-    with variable_scope.variable_scope(None, default_name="non_layer"):
-      # Unfortunately using tf.get_variable to implement self.add_variable
-      # (necessary for backwards compatibile naming with Layers) we can still
-      # run into duplicate variable errors (when building a graph, not when
-      # executing eagerly), thus the variable scope.
-      #
-      # TODO(allenl): Consider creating a ResourceVariable directly by
-      # default so that variable reuse isn't an issue.
-      self._a_variable = self.add_variable("a_variable", shape=[])
+    self.a_variable = self.add_variable(name="a_variable", shape=[])
 
 
 class MyNetwork(CheckpointableNetwork):
@@ -158,17 +151,92 @@ class Root(checkpointable.Checkpointable):
   def global_step(self):
     if self._global_step is None:
       # Get the default create_global_step utility to actually call
-      # self.add_variable, by setting a custom getter.
-      def _owned_variable_as_custom_getter(getter, *args, **kwargs):
-        return self.add_variable(*args, getter=getter, **kwargs)
-
-      with variable_scope.variable_scope(
-          "", custom_getter=_owned_variable_as_custom_getter):
+      # self.add_variable, by setting a custom creator.
+      def _owned_variable_as_creator(
+          next_creator, initial_value, **kwargs):
+        def _creator_as_getter(initializer, **kwargs):
+          return next_creator(initial_value=initializer, **kwargs)
+        return self.add_variable(
+            getter=_creator_as_getter, initializer=initial_value, shape=[],
+            **kwargs)
+
+      with variable_scope.variable_creator_scope(
+          _owned_variable_as_creator):
         self._global_step = training_util.create_global_step()
     return self._global_step
 
 
-class CheckpointNamingTests(test.TestCase):
+class InterfaceTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testAddVariable(self):
+    obj = NonLayerCheckpointable()
+    with self.assertRaisesRegexp(ValueError, "do not specify shape"):
+      obj.add_variable(
+          name="shape_specified_twice", shape=[], initializer=1)
+    constant_initializer = obj.add_variable(
+        name="constant_initializer", initializer=1)
+    with variable_scope.variable_scope("some_variable_scope"):
+      ones_initializer = obj.add_variable(
+          name="ones_initializer",
+          shape=[2],
+          initializer=init_ops.ones_initializer(dtype=dtypes.float32))
+    bare_initializer = obj.add_variable(
+        name="bare_initializer",
+        shape=[2, 2],
+        dtype=dtypes.float64,
+        initializer=init_ops.zeros_initializer)
+
+    # Even in graph mode, there are no naming conflicts between objects, only
+    # naming conflicts within an object.
+    other_duplicate = resource_variable_ops.ResourceVariable(
+        name="duplicate", initial_value=1.)
+    duplicate = obj.add_variable(name="duplicate", shape=[])
+    with self.assertRaisesRegexp(ValueError, "'duplicate' already exists"):
+      obj.add_variable(name="duplicate", shape=[])
+
+    if context.in_graph_mode():
+      self.evaluate(variables.global_variables_initializer())
+    self.assertEqual("constant_initializer:0", constant_initializer.name)
+    self.assertEqual(1, self.evaluate(constant_initializer))
+    self.assertEqual("some_variable_scope/ones_initializer:0",
+                     ones_initializer.name)
+    self.assertAllEqual([1, 1], self.evaluate(ones_initializer))
+    self.assertAllEqual([[0., 0.],
+                         [0., 0.]], self.evaluate(bare_initializer))
+    self.assertEqual("a_variable:0", obj.a_variable.name)
+    self.assertEqual("duplicate:0", other_duplicate.name)
+    if context.in_graph_mode():
+      # The .name attribute may be globally influenced, but the checkpoint name
+      # won't be (tested below).
+      self.assertEqual("duplicate_1:0", duplicate.name)
+    else:
+      # When executing eagerly, there's no uniquification of variable names. The
+      # checkpoint name will be the same.
+      self.assertEqual("duplicate:0", duplicate.name)
+    named_variables, _ = checkpointable._serialize_object_graph(obj)
+    expected_checkpoint_names = (
+        "a_variable",
+        "bare_initializer",
+        "constant_initializer",
+        "duplicate",
+        "ones_initializer",
+    )
+    six.assertCountEqual(
+        self, expected_checkpoint_names, named_variables.keys())
+
+  def testInitNotCalled(self):
+
+    class NoInit(checkpointable.Checkpointable):
+
+      def __init__(self):
+        pass
+
+    with self.assertRaisesRegexp(RuntimeError, "__init__"):
+      NoInit().add_variable("var", shape=[])
+
+
+class CheckpointingTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testNamingWithOptimizer(self):
@@ -391,12 +459,7 @@ class CheckpointNamingTests(test.TestCase):
 
   def _get_checkpoint_name(self, name):
     root = checkpointable.Checkpointable()
-    with variable_scope.variable_scope("get_checkpoint_name"):
-      # Create the variable in a variable scope so that we get more relaxed
-      # naming rules (variables outside a scope may not start with "_", "/" or
-      # "-"). Since we don't use the scope part of the name, these cases are
-      # somewhat annoying.
-      root.add_variable(name=name, shape=[1, 2], dtype=dtypes.float64)
+    root.add_variable(name=name, shape=[1, 2], dtype=dtypes.float64)
     named_variables, _ = checkpointable._serialize_object_graph(root)
     checkpoint_name, = named_variables.keys()
     with ops.name_scope("root/" + checkpoint_name):
@@ -406,9 +469,9 @@ class CheckpointNamingTests(test.TestCase):
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testVariableNameEscaping(self):
     self.assertEqual(r"a_S__b_S__c", self._get_checkpoint_name(r"a/b/c"))
-    self.assertEqual(r"", self._get_checkpoint_name(r""))
-    self.assertEqual(r"_S__", self._get_checkpoint_name(r"/"))
-    self.assertEqual(r"_S___S_._", self._get_checkpoint_name(r"/_S__"))
+    self.assertEqual(r"b", self._get_checkpoint_name(r"b"))
+    self.assertEqual(r"c_S__", self._get_checkpoint_name(r"c/"))
+    self.assertEqual(r"d_S___S_._", self._get_checkpoint_name(r"d/_S__"))
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testNumberedPath(self):
-- 
GitLab


From f9a8009db5af7d1019d67f93890e8ad00773d91c Mon Sep 17 00:00:00 2001
From: Raghuraman Krishnamoorthi <raghuramank@google.com>
Date: Thu, 8 Feb 2018 15:21:22 -0800
Subject: [PATCH 876/966] Update fake quant op to support bitwidths in the
 range 2 to 16, from 2 to 8.

PiperOrigin-RevId: 185061307
---
 tensorflow/core/kernels/fake_quant_ops.cc     | 32 +++++++++++--------
 .../core/kernels/fake_quant_ops_functor.h     | 12 +++----
 2 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/tensorflow/core/kernels/fake_quant_ops.cc b/tensorflow/core/kernels/fake_quant_ops.cc
index 68762af8cf..f5e279eca4 100644
--- a/tensorflow/core/kernels/fake_quant_ops.cc
+++ b/tensorflow/core/kernels/fake_quant_ops.cc
@@ -45,7 +45,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 namespace {
-bool IsNumBitsValid(int num_bits) { return num_bits >= 2 && num_bits <= 8; }
+bool IsNumBitsValid(int num_bits) { return num_bits >= 2 && num_bits <= 16; }
 }  // namespace
 
 // -----------------------------------------------------------------------------
@@ -65,8 +65,9 @@ class FakeQuantWithMinMaxArgsOp
                                 " >= ", max_));
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
@@ -104,8 +105,9 @@ class FakeQuantWithMinMaxArgsGradientOp
                                 " >= ", max_));
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
@@ -175,8 +177,9 @@ class FakeQuantWithMinMaxVarsOp : public OpKernel {
       : OpKernel::OpKernel(context) {
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
@@ -213,8 +216,9 @@ class FakeQuantWithMinMaxVarsGradientOp : public OpKernel {
       : OpKernel::OpKernel(context) {
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
@@ -302,8 +306,9 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
       : OpKernel::OpKernel(context) {
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
@@ -348,8 +353,9 @@ class FakeQuantWithMinMaxVarsPerChannelGradientOp : public OpKernel {
       : OpKernel::OpKernel(context) {
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
diff --git a/tensorflow/core/kernels/fake_quant_ops_functor.h b/tensorflow/core/kernels/fake_quant_ops_functor.h
index 81189866c3..d51acc38ef 100644
--- a/tensorflow/core/kernels/fake_quant_ops_functor.h
+++ b/tensorflow/core/kernels/fake_quant_ops_functor.h
@@ -45,16 +45,16 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void Nudge(
   const float quant_max_float = static_cast<float>(quant_max);
   *scale = (max - min) / (quant_max_float - quant_min_float);
   const float zero_point_from_min = quant_min_float - min / *scale;
-  const uint8 nudged_zero_point = [zero_point_from_min, quant_min,
-                                   quant_min_float, quant_max,
-                                   quant_max_float] {
+  const uint16 nudged_zero_point = [zero_point_from_min, quant_min,
+                                    quant_min_float, quant_max,
+                                    quant_max_float] {
     if (zero_point_from_min < quant_min_float) {
-      return static_cast<uint8>(quant_min);
+      return static_cast<uint16>(quant_min);
     }
     if (zero_point_from_min > quant_max_float) {
-      return static_cast<uint8>(quant_max);
+      return static_cast<uint16>(quant_max);
     }
-    return static_cast<uint8>(StdRound(zero_point_from_min));
+    return static_cast<uint16>(StdRound(zero_point_from_min));
   }();
   *nudged_min = (quant_min_float - nudged_zero_point) * (*scale);
   *nudged_max = (quant_max_float - nudged_zero_point) * (*scale);
-- 
GitLab


From c0d8660b70d799114dbf236290ea42fba0b7525d Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 8 Feb 2018 15:28:30 -0800
Subject: [PATCH 877/966] Adding tf_export decorators/calls to TensorFlow
 functions and constants.

PiperOrigin-RevId: 185062323
---
 .../keras/layers/advanced_activations.py      |  6 ++++++
 .../keras/_impl/keras/layers/convolutional.py | 21 +++++++++++++++++++
 .../keras/layers/convolutional_recurrent.py   |  2 ++
 .../python/keras/_impl/keras/layers/core.py   | 14 +++++++++++++
 .../keras/_impl/keras/layers/embeddings.py    |  2 ++
 .../python/keras/_impl/keras/layers/local.py  |  3 +++
 .../python/keras/_impl/keras/layers/merge.py  | 13 ++++++++++++
 .../python/keras/_impl/keras/layers/noise.py  |  4 ++++
 .../keras/_impl/keras/layers/normalization.py |  2 ++
 .../keras/_impl/keras/layers/pooling.py       | 16 ++++++++++++++
 .../keras/_impl/keras/layers/recurrent.py     |  9 ++++++++
 .../keras/_impl/keras/layers/wrappers.py      |  4 ++++
 12 files changed, 96 insertions(+)

diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
index ffbf77c4b8..7cac17c51a 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
@@ -26,8 +26,10 @@ from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.LeakyReLU')
 class LeakyReLU(Layer):
   """Leaky version of a Rectified Linear Unit.
 
@@ -66,6 +68,7 @@ class LeakyReLU(Layer):
     return input_shape
 
 
+@tf_export('keras.layers.PReLU')
 class PReLU(Layer):
   """Parametric Rectified Linear Unit.
 
@@ -163,6 +166,7 @@ class PReLU(Layer):
     return input_shape
 
 
+@tf_export('keras.layers.ELU')
 class ELU(Layer):
   """Exponential Linear Unit.
 
@@ -201,6 +205,7 @@ class ELU(Layer):
     return input_shape
 
 
+@tf_export('keras.layers.ThresholdedReLU')
 class ThresholdedReLU(Layer):
   """Thresholded Rectified Linear Unit.
 
@@ -239,6 +244,7 @@ class ThresholdedReLU(Layer):
     return input_shape
 
 
+@tf_export('keras.layers.Softmax')
 class Softmax(Layer):
   """Softmax activation function.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 2ee0732775..bc43451114 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -38,8 +38,10 @@ from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D
 # pylint: enable=unused-import
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.layers import convolutional as tf_convolutional_layers
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.Conv1D', 'keras.layers.Convolution1D')
 class Conv1D(tf_convolutional_layers.Conv1D, Layer):
   """1D convolution layer (e.g. temporal convolution).
 
@@ -153,6 +155,7 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Conv2D', 'keras.layers.Convolution2D')
 class Conv2D(tf_convolutional_layers.Conv2D, Layer):
   """2D convolution layer (e.g. spatial convolution over images).
 
@@ -286,6 +289,7 @@ class Conv2D(tf_convolutional_layers.Conv2D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Conv3D', 'keras.layers.Convolution3D')
 class Conv3D(tf_convolutional_layers.Conv3D, Layer):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
@@ -426,6 +430,8 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Conv2DTranspose',
+           'keras.layers.Convolution2DTranspose')
 class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
   """Transposed convolution layer (sometimes called Deconvolution).
 
@@ -563,6 +569,8 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Conv3DTranspose',
+           'keras.layers.Convolution3DTranspose')
 class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
   """Transposed convolution layer (sometimes called Deconvolution).
 
@@ -711,6 +719,8 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.SeparableConv1D',
+           'keras.layers.SeparableConvolution1D')
 class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
   """Depthwise separable 1D convolution.
 
@@ -849,6 +859,8 @@ class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.SeparableConv2D',
+           'keras.layers.SeparableConvolution2D')
 class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
   """Depthwise separable 2D convolution.
 
@@ -1012,6 +1024,7 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.UpSampling1D')
 class UpSampling1D(Layer):
   """Upsampling layer for 1D inputs.
 
@@ -1047,6 +1060,7 @@ class UpSampling1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.UpSampling2D')
 class UpSampling2D(Layer):
   """Upsampling layer for 2D inputs.
 
@@ -1114,6 +1128,7 @@ class UpSampling2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.UpSampling3D')
 class UpSampling3D(Layer):
   """Upsampling layer for 3D inputs.
 
@@ -1186,6 +1201,7 @@ class UpSampling3D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.ZeroPadding1D')
 class ZeroPadding1D(Layer):
   """Zero-padding layer for 1D input (e.g. temporal sequence).
 
@@ -1226,6 +1242,7 @@ class ZeroPadding1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.ZeroPadding2D')
 class ZeroPadding2D(Layer):
   """Zero-padding layer for 2D input (e.g. picture).
 
@@ -1327,6 +1344,7 @@ class ZeroPadding2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.ZeroPadding3D')
 class ZeroPadding3D(Layer):
   """Zero-padding layer for 3D data (spatial or spatio-temporal).
 
@@ -1444,6 +1462,7 @@ class ZeroPadding3D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Cropping1D')
 class Cropping1D(Layer):
   """Cropping layer for 1D input (e.g. temporal sequence).
 
@@ -1488,6 +1507,7 @@ class Cropping1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Cropping2D')
 class Cropping2D(Layer):
   """Cropping layer for 2D input (e.g. picture).
 
@@ -1619,6 +1639,7 @@ class Cropping2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Cropping3D')
 class Cropping3D(Layer):
   """Cropping layer for 3D data (e.g.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index 565db19e41..a04c3a24bf 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -29,6 +29,7 @@ from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.layers.recurrent import Recurrent
 from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.util.tf_export import tf_export
 
 
 class ConvRecurrent2D(Recurrent):
@@ -190,6 +191,7 @@ class ConvRecurrent2D(Recurrent):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.ConvLSTM2D')
 class ConvLSTM2D(ConvRecurrent2D):
   """Convolutional LSTM.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index ea2d3f2f04..50a197c80c 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -37,8 +37,10 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import func_dump
 from tensorflow.python.keras._impl.keras.utils.generic_utils import func_load
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.layers import core as tf_core_layers
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.Masking')
 class Masking(Layer):
   """Masks a sequence by using a mask value to skip timesteps.
 
@@ -89,6 +91,7 @@ class Masking(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Dropout')
 class Dropout(tf_core_layers.Dropout, Layer):
   """Applies Dropout to the input.
 
@@ -135,6 +138,7 @@ class Dropout(tf_core_layers.Dropout, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.SpatialDropout1D')
 class SpatialDropout1D(Dropout):
   """Spatial 1D version of Dropout.
 
@@ -171,6 +175,7 @@ class SpatialDropout1D(Dropout):
     return noise_shape
 
 
+@tf_export('keras.layers.SpatialDropout2D')
 class SpatialDropout2D(Dropout):
   """Spatial 2D version of Dropout.
 
@@ -224,6 +229,7 @@ class SpatialDropout2D(Dropout):
       return (input_shape[0], 1, 1, input_shape[3])
 
 
+@tf_export('keras.layers.SpatialDropout3D')
 class SpatialDropout3D(Dropout):
   """Spatial 3D version of Dropout.
 
@@ -276,6 +282,7 @@ class SpatialDropout3D(Dropout):
       return (input_shape[0], 1, 1, 1, input_shape[4])
 
 
+@tf_export('keras.layers.Activation')
 class Activation(Layer):
   """Applies an activation function to an output.
 
@@ -309,6 +316,7 @@ class Activation(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Reshape')
 class Reshape(Layer):
   """Reshapes an output to a certain shape.
 
@@ -414,6 +422,7 @@ class Reshape(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Permute')
 class Permute(Layer):
   """Permutes the dimensions of the input according to a given pattern.
 
@@ -466,6 +475,7 @@ class Permute(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Flatten')
 class Flatten(tf_core_layers.Flatten, Layer):
   """Flattens the input. Does not affect the batch size.
 
@@ -485,6 +495,7 @@ class Flatten(tf_core_layers.Flatten, Layer):
   pass
 
 
+@tf_export('keras.layers.RepeatVector')
 class RepeatVector(Layer):
   """Repeats the input n times.
 
@@ -528,6 +539,7 @@ class RepeatVector(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Lambda')
 class Lambda(Layer):
   """Wraps arbitrary expression as a `Layer` object.
 
@@ -709,6 +721,7 @@ class Lambda(Layer):
     return cls(**config)
 
 
+@tf_export('keras.layers.Dense')
 class Dense(tf_core_layers.Dense, Layer):
   """Just your regular densely-connected NN layer.
 
@@ -813,6 +826,7 @@ class Dense(tf_core_layers.Dense, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.ActivityRegularization')
 class ActivityRegularization(Layer):
   """Layer that applies an update to the cost function based input activity.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index f8e31068f8..ca92899a45 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -24,8 +24,10 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.Embedding')
 class Embedding(Layer):
   """Turns positive integers (indexes) into dense vectors of fixed size.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/local.py b/tensorflow/python/keras/_impl/keras/layers/local.py
index b844b071e0..798ac236a3 100644
--- a/tensorflow/python/keras/_impl/keras/layers/local.py
+++ b/tensorflow/python/keras/_impl/keras/layers/local.py
@@ -27,8 +27,10 @@ from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.LocallyConnected1D')
 class LocallyConnected1D(Layer):
   """Locally-connected layer for 1D inputs.
 
@@ -193,6 +195,7 @@ class LocallyConnected1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.LocallyConnected2D')
 class LocallyConnected2D(Layer):
   """Locally-connected layer for 2D inputs.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py
index 38b0b30297..cdf2878e83 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine.topology import Layer
 from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _Merge(Layer):
@@ -210,6 +211,7 @@ class _Merge(Layer):
     return K.all(K.concatenate(masks, axis=0), axis=0, keepdims=False)
 
 
+@tf_export('keras.layers.Add')
 class Add(_Merge):
   """Layer that adds a list of inputs.
 
@@ -279,6 +281,7 @@ class Subtract(_Merge):
     return inputs[0] - inputs[1]
 
 
+@tf_export('keras.layers.Multiply')
 class Multiply(_Merge):
   """Layer that multiplies (element-wise) a list of inputs.
 
@@ -294,6 +297,7 @@ class Multiply(_Merge):
     return output
 
 
+@tf_export('keras.layers.Average')
 class Average(_Merge):
   """Layer that averages a list of inputs.
 
@@ -309,6 +313,7 @@ class Average(_Merge):
     return output / len(inputs)
 
 
+@tf_export('keras.layers.Maximum')
 class Maximum(_Merge):
   """Layer that computes the maximum (element-wise) a list of inputs.
 
@@ -339,6 +344,7 @@ class Minimum(_Merge):
     return output
 
 
+@tf_export('keras.layers.Concatenate')
 class Concatenate(_Merge):
   """Layer that concatenates a list of inputs.
 
@@ -429,6 +435,7 @@ class Concatenate(_Merge):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Dot')
 class Dot(_Merge):
   """Layer that computes a dot product between samples in two tensors.
 
@@ -543,6 +550,7 @@ class Dot(_Merge):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.add')
 def add(inputs, **kwargs):
   """Functional interface to the `Add` layer.
 
@@ -599,6 +607,7 @@ def subtract(inputs, **kwargs):
   return Subtract(**kwargs)(inputs)
 
 
+@tf_export('keras.layers.multiply')
 def multiply(inputs, **kwargs):
   """Functional interface to the `Multiply` layer.
 
@@ -612,6 +621,7 @@ def multiply(inputs, **kwargs):
   return Multiply(**kwargs)(inputs)
 
 
+@tf_export('keras.layers.average')
 def average(inputs, **kwargs):
   """Functional interface to the `Average` layer.
 
@@ -625,6 +635,7 @@ def average(inputs, **kwargs):
   return Average(**kwargs)(inputs)
 
 
+@tf_export('keras.layers.maximum')
 def maximum(inputs, **kwargs):
   """Functional interface to the `Maximum` layer.
 
@@ -651,6 +662,7 @@ def minimum(inputs, **kwargs):
   return Minimum(**kwargs)(inputs)
 
 
+@tf_export('keras.layers.concatenate')
 def concatenate(inputs, axis=-1, **kwargs):
   """Functional interface to the `Concatenate` layer.
 
@@ -665,6 +677,7 @@ def concatenate(inputs, axis=-1, **kwargs):
   return Concatenate(axis=axis, **kwargs)(inputs)
 
 
+@tf_export('keras.layers.dot')
 def dot(inputs, axes, normalize=False, **kwargs):
   """Functional interface to the `Dot` layer.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/_impl/keras/layers/noise.py
index 04fffcc384..9010f49615 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise.py
+++ b/tensorflow/python/keras/_impl/keras/layers/noise.py
@@ -23,8 +23,10 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.GaussianNoise')
 class GaussianNoise(Layer):
   """Apply additive zero-centered Gaussian noise.
 
@@ -70,6 +72,7 @@ class GaussianNoise(Layer):
     return input_shape
 
 
+@tf_export('keras.layers.GaussianDropout')
 class GaussianDropout(Layer):
   """Apply multiplicative 1-centered Gaussian noise.
 
@@ -116,6 +119,7 @@ class GaussianDropout(Layer):
     return input_shape
 
 
+@tf_export('keras.layers.AlphaDropout')
 class AlphaDropout(Layer):
   """Applies Alpha Dropout to the input.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index eecb14ceaa..0dedd5e8da 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -25,8 +25,10 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.layers import normalization as tf_normalization_layers
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.BatchNormalization')
 class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
   """Batch normalization layer (Ioffe and Szegedy, 2014).
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling.py b/tensorflow/python/keras/_impl/keras/layers/pooling.py
index b133e2dfaf..15d5337976 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling.py
+++ b/tensorflow/python/keras/_impl/keras/layers/pooling.py
@@ -24,8 +24,10 @@ from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.layers import pooling as tf_pooling_layers
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.MaxPool1D', 'keras.layers.MaxPooling1D')
 class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
   """Max pooling operation for temporal data.
 
@@ -58,6 +60,7 @@ class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.AveragePooling1D', 'keras.layers.AvgPool1D')
 class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
   """Average pooling for temporal data.
 
@@ -91,6 +94,7 @@ class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.MaxPool2D', 'keras.layers.MaxPooling2D')
 class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
   """Max pooling operation for spatial data.
 
@@ -156,6 +160,7 @@ class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.AveragePooling2D', 'keras.layers.AvgPool2D')
 class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
   """Average pooling operation for spatial data.
 
@@ -221,6 +226,7 @@ class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.MaxPool3D', 'keras.layers.MaxPooling3D')
 class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
   """Max pooling operation for 3D data (spatial or spatio-temporal).
 
@@ -282,6 +288,7 @@ class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.AveragePooling3D', 'keras.layers.AvgPool3D')
 class AveragePooling3D(tf_pooling_layers.AveragePooling3D, Layer):
   """Average pooling operation for 3D data (spatial or spatio-temporal).
 
@@ -359,6 +366,8 @@ class _GlobalPooling1D(Layer):
     raise NotImplementedError
 
 
+@tf_export('keras.layers.GlobalAveragePooling1D',
+           'keras.layers.GlobalAvgPool1D')
 class GlobalAveragePooling1D(_GlobalPooling1D):
   """Global average pooling operation for temporal data.
 
@@ -374,6 +383,7 @@ class GlobalAveragePooling1D(_GlobalPooling1D):
     return K.mean(inputs, axis=1)
 
 
+@tf_export('keras.layers.GlobalMaxPool1D', 'keras.layers.GlobalMaxPooling1D')
 class GlobalMaxPooling1D(_GlobalPooling1D):
   """Global max pooling operation for temporal data.
 
@@ -414,6 +424,8 @@ class _GlobalPooling2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.GlobalAveragePooling2D',
+           'keras.layers.GlobalAvgPool2D')
 class GlobalAveragePooling2D(_GlobalPooling2D):
   """Global average pooling operation for spatial data.
 
@@ -449,6 +461,7 @@ class GlobalAveragePooling2D(_GlobalPooling2D):
       return K.mean(inputs, axis=[2, 3])
 
 
+@tf_export('keras.layers.GlobalMaxPool2D', 'keras.layers.GlobalMaxPooling2D')
 class GlobalMaxPooling2D(_GlobalPooling2D):
   """Global max pooling operation for spatial data.
 
@@ -509,6 +522,8 @@ class _GlobalPooling3D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.GlobalAveragePooling3D',
+           'keras.layers.GlobalAvgPool3D')
 class GlobalAveragePooling3D(_GlobalPooling3D):
   """Global Average pooling operation for 3D data.
 
@@ -544,6 +559,7 @@ class GlobalAveragePooling3D(_GlobalPooling3D):
       return K.mean(inputs, axis=[2, 3, 4])
 
 
+@tf_export('keras.layers.GlobalMaxPool3D', 'keras.layers.GlobalMaxPooling3D')
 class GlobalMaxPooling3D(_GlobalPooling3D):
   """Global Max pooling operation for 3D data.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 1b0f6cb6cf..5c1b523aa3 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -32,8 +32,10 @@ from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.StackedRNNCells')
 class StackedRNNCells(Layer):
   """Wrapper allowing a stack of RNN cells to behave as a single cell.
 
@@ -213,6 +215,7 @@ class StackedRNNCells(Layer):
     return losses
 
 
+@tf_export('keras.layers.RNN')
 class RNN(Layer):
   """Base class for recurrent layers.
 
@@ -785,6 +788,7 @@ class RNN(Layer):
     return super(RNN, self).get_losses_for(inputs)
 
 
+@tf_export('keras.layers.SimpleRNNCell')
 class SimpleRNNCell(Layer):
   """Cell class for SimpleRNN.
 
@@ -954,6 +958,7 @@ class SimpleRNNCell(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.SimpleRNN')
 class SimpleRNN(RNN):
   """Fully-connected RNN where the output is to be fed back to input.
 
@@ -1163,6 +1168,7 @@ class SimpleRNN(RNN):
     return cls(**config)
 
 
+@tf_export('keras.layers.GRUCell')
 class GRUCell(Layer):
   """Cell class for the GRU layer.
 
@@ -1411,6 +1417,7 @@ class GRUCell(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.GRU')
 class GRU(RNN):
   """Gated Recurrent Unit - Cho et al.
 
@@ -1646,6 +1653,7 @@ class GRU(RNN):
     return cls(**config)
 
 
+@tf_export('keras.layers.LSTMCell')
 class LSTMCell(Layer):
   """Cell class for the LSTM layer.
 
@@ -1927,6 +1935,7 @@ class LSTMCell(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.LSTM')
 class LSTM(RNN):
   """Long-Short Term Memory layer - Hochreiter 1997.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index 3667956f80..c697bcefc9 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -28,8 +28,10 @@ from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.layers import utils as tf_layers_util
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.Wrapper')
 class Wrapper(Layer):
   """Abstract wrapper base class.
 
@@ -122,6 +124,7 @@ class Wrapper(Layer):
     return cls(layer, **config)
 
 
+@tf_export('keras.layers.TimeDistributed')
 class TimeDistributed(Wrapper):
   """This wrapper allows to apply a layer to every temporal slice of an input.
 
@@ -246,6 +249,7 @@ class TimeDistributed(Wrapper):
     return y
 
 
+@tf_export('keras.layers.Bidirectional')
 class Bidirectional(Wrapper):
   """Bidirectional wrapper for RNNs.
 
-- 
GitLab


From 5a2c810376ad4ea4d55f8eaa696250011c1f5005 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Thu, 8 Feb 2018 15:36:04 -0800
Subject: [PATCH 878/966] Changes behavior of optimizer.minimize(loss) when
 loss is a function.

Now supports both eager and graph mode, and loss is always a callable
with no arguments. Supports variables and non-variables in var_list.

PiperOrigin-RevId: 185063691
---
 tensorflow/python/eager/backprop.py          | 10 ++++-
 tensorflow/python/eager/pywrap_tfe_src.cc    |  3 +-
 tensorflow/python/training/momentum_test.py  |  4 +-
 tensorflow/python/training/optimizer.py      | 46 ++++++++------------
 tensorflow/python/training/optimizer_test.py | 35 ++++++---------
 5 files changed, 44 insertions(+), 54 deletions(-)

diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index d79d1fc0a6..d76fecf126 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -858,13 +858,18 @@ class GradientTape(object):
         t = t.handle
       tape.watch(t)
 
-  def gradient(self, target, sources):
+  def watched_variables(self):
+    return self._tape.watched_variables()
+
+  def gradient(self, target, sources, output_gradients=None):
     """Computes the gradient using information traced by the tape.
 
     Args:
       target: the tensor to be differentiated.
       sources: a list of Tensors or Variables, the target will be
        differentiated with respect to the sources.
+      output_gradients: a list of gradients, one for each element of
+       target. Defaults to None.
 
     Returns:
       a list of Tensors (or IndexedSlices, or None), one for each element in
@@ -882,7 +887,8 @@ class GradientTape(object):
                else x
                for x in sources]
     grad = imperative_grad.imperative_grad(
-        _default_vspace, self._tape, [target], sources)
+        _default_vspace, self._tape, [target], sources,
+        output_gradients=output_gradients)
     if not self._persistent:
       self._tape = None
     return grad
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 77a639be3b..cabbcc48fd 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1332,8 +1332,7 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
     }
     return py_result;
   }
-  Py_INCREF(Py_None);
-  return Py_None;
+  return PyList_New(0);
 }
 
 namespace {
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index 6865513b0e..cda421cef8 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -247,7 +247,7 @@ class MomentumOptimizerTest(test.TestCase):
       # pylint: enable=cell-var-from-loop
 
       opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
-      sgd_op = opt.minimize(loss if context.in_eager_mode() else loss())
+      sgd_op = opt.minimize(loss)
       self.evaluate(variables.global_variables_initializer())
       # Run 1 step of sgd
       self.evaluate(sgd_op)
@@ -262,7 +262,7 @@ class MomentumOptimizerTest(test.TestCase):
       return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
 
     opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
-    sgd_op = opt.minimize(loss if context.in_eager_mode() else loss())
+    sgd_op = opt.minimize(loss)
     self.evaluate(variables.global_variables_initializer())
     self.evaluate(sgd_op)
     self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index b806251f81..f05c40b32d 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -405,7 +405,9 @@ class Optimizer(object):
     given variable.
 
     Args:
-      loss: A Tensor containing the value to minimize.
+      loss: A Tensor containing the value to minimize or a callable taking
+        no arguments which returns the value to minimize. When eager execution
+        is enabled it must be a callable.
       var_list: Optional list or tuple of `tf.Variable` to update to minimize
         `loss`.  Defaults to the list of variables collected in the graph
         under the key `GraphKeys.TRAINABLE_VARIABLES`.
@@ -424,37 +426,27 @@ class Optimizer(object):
     Raises:
       TypeError: If `var_list` contains anything else than `Variable` objects.
       ValueError: If some arguments are invalid.
-      RuntimeError: If called with eager execution enabled and if `grad_loss`
-        is not `None` or `loss` is not callable.
+      RuntimeError: If called with eager execution enabled and `loss` is
+        not callable.
 
     @compatibility(eager)
-    When eager execution is enabled, `loss` should be a Python function that
-    takes elements of `var_list` as arguments and computes the value to be
-    minimized. If `var_list` is None, `loss` should take no arguments.
-    Gradient computation is done with respect to the elements of `var_list` if
-    not None, else with respect to any trainable variables created during the
-    execution of the `loss` function.
-    `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and
-    `grad_loss` are ignored when eager execution is enabled.
+    When eager execution is enabled, `gate_gradients`, `aggregation_method`,
+    and `colocate_gradients_with_ops` are ignored.
     @end_compatibility
     """
-    if context.in_eager_mode():
-      if grad_loss is not None:
-        raise RuntimeError(
-            "`grad_loss` argument to Optimizer.compute_gradients "
-            "not supported when eager execution is enabled.")
-      if not callable(loss):
-        raise RuntimeError(
-            "`loss` passed to Optimizer.compute_gradients should "
-            "be a function when eager execution is enabled.")
-      # TODO(agarwal): consider passing parameters to the `loss` function.
+    if callable(loss):
+      with backprop.GradientTape() as tape:
+        if var_list is not None:
+          tape.watch(var_list)
+        loss_value = loss()
       if var_list is None:
-        return backprop.implicit_grad(loss)()
-      else:
-        var_list = nest.flatten(var_list)
-        grads = backprop.gradients_function(loss)(*var_list)
-        grads_and_vars = list(zip(grads, var_list))
-        return grads_and_vars
+        var_list = tape.watched_variables()
+      grads = tape.gradient(loss_value, var_list, grad_loss)
+      return list(zip(grads, var_list))
+    if context.in_eager_mode():
+      raise RuntimeError(
+          "`loss` passed to Optimizer.compute_gradients should "
+          "be a function when eager execution is enabled.")
     if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                               Optimizer.GATE_GRAPH]:
       raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
diff --git a/tensorflow/python/training/optimizer_test.py b/tensorflow/python/training/optimizer_test.py
index 8652c61f01..0cab6410e8 100644
--- a/tensorflow/python/training/optimizer_test.py
+++ b/tensorflow/python/training/optimizer_test.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -44,11 +43,10 @@ class OptimizerTest(test.TestCase):
                                                     name='a_%d' % i)
       var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
                                                     name='b_%d' % i)
-      def loss(v0, v1):
-        return 5 * v0 + 3 * v1
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
       # Note that for eager execution, minimize expects a function instead of a
       # Tensor.
-      cost = loss if context.in_eager_mode() else loss(var0, var1)
       global_step = resource_variable_ops.ResourceVariable(
           array_ops.zeros([], dtypes.int64), name='global_step_%d' % i)
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
@@ -58,7 +56,7 @@ class OptimizerTest(test.TestCase):
       self.assertAllClose([1.0, 2.0], self.evaluate(var0))
       self.assertAllClose([3.0, 4.0], self.evaluate(var1))
       # Run 1 step of sgd through optimizer
-      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
+      opt_op = sgd_op.minimize(loss, global_step, [var0, var1])
       self.evaluate(opt_op)
       # Validate updated params
       self.assertAllClose([-14., -13.], self.evaluate(var0))
@@ -125,10 +123,9 @@ class OptimizerTest(test.TestCase):
             [3.0, 4.0], dtype=dtype, trainable=False, name='b')
         return 5 * var0 + var1
       # pylint: enable=cell-var-from-loop
-      cost = loss if context.in_eager_mode() else loss()
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
       with self.assertRaisesRegexp(ValueError, 'No.*variables'):
-        sgd_op.minimize(cost)
+        sgd_op.minimize(loss)
 
   @test_util.run_in_graph_and_eager_modes()
   def testNoGradients(self):
@@ -140,14 +137,13 @@ class OptimizerTest(test.TestCase):
       var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
                                                     name='b%d' % i)
       # pylint: disable=cell-var-from-loop
-      def loss(_):
+      def loss():
         return 5 * var0
       # pylint: enable=cell-var-from-loop
-      cost = loss if context.in_eager_mode() else loss(var1)
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
       with self.assertRaisesRegexp(ValueError, 'No gradients'):
         # var1 has no gradient
-        sgd_op.minimize(cost, var_list=[var1])
+        sgd_op.minimize(loss, var_list=[var1])
 
   @test_util.run_in_graph_and_eager_modes()
   def testNoGradientsForAnyVariables_Minimize(self):
@@ -158,13 +154,12 @@ class OptimizerTest(test.TestCase):
                                                     name='a_%d' % i)
       var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
                                                     name='b_%d' % i)
-      def loss(unused_v1, unused_v2):
+      def loss():
         return constant_op.constant(5.0)
-      cost = loss if context.in_eager_mode() else loss(var0, var1)
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
       with self.assertRaisesRegexp(ValueError,
                                    'No gradients provided for any variable'):
-        sgd_op.minimize(cost, var_list=[var0, var1])
+        sgd_op.minimize(loss, var_list=[var0, var1])
 
   @test_util.run_in_graph_and_eager_modes()
   def testNoGradientsForAnyVariables_ApplyGradients(self):
@@ -189,11 +184,10 @@ class OptimizerTest(test.TestCase):
                                                     name='a%d' % i)
       var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
                                                     name='b%d' % i)
-      def loss(v0, v1):
-        return 5 * v0 + 3 * v1
-      cost = loss if context.in_eager_mode() else loss(var0, var1)
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
-      grads_and_vars = sgd_op.compute_gradients(cost, [var0, var1])
+      grads_and_vars = sgd_op.compute_gradients(loss, [var0, var1])
       # Convert gradients to tf.Variables
       converted_grads = [
           resource_variable_ops.ResourceVariable(array_ops.zeros([2], dtype),
@@ -223,12 +217,11 @@ class OptimizerTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def testComputeGradientsWithTensors(self):
-    def f(x):
-      return x * x
     x = ops.convert_to_tensor(1.0)
-    y = f if context.in_eager_mode() else f(x)
+    def f():
+      return x * x
     sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
-    grads_and_vars = sgd_op.compute_gradients(y, [x])
+    grads_and_vars = sgd_op.compute_gradients(f, [x])
     self.assertEqual(1, len(grads_and_vars))
     grad, x_as_var = grads_and_vars[0]
     self.assertIs(x, x_as_var)
-- 
GitLab


From f87caf7cd62be25a9c7e390b55b4933fcdcc784c Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Thu, 8 Feb 2018 16:05:31 -0800
Subject: [PATCH 879/966] Make
 SavedModelTest.testStripDefaultAttrsInconsistentConsumerDefaults work with C
 API.

The test originally altered the Python version of the op registry,
which is not reflected in the C API. This changes the test to alter
the serialized node def instead of the op def, and renames the test to
testInconsistentConsumerDefaultAttrs.

PiperOrigin-RevId: 185067838
---
 tensorflow/python/framework/test_ops.cc       | 19 ++++
 tensorflow/python/saved_model/BUILD           |  1 +
 .../python/saved_model/saved_model_test.py    | 91 +++++++++++--------
 3 files changed, 72 insertions(+), 39 deletions(-)

diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index c6c6c2233c..070b5ac11f 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -76,6 +76,11 @@ REGISTER_OP("TestStringOutput")
     .Output("output2: string")
     .SetShapeFn(shape_inference::UnknownShape);
 
+REGISTER_OP("TestAttr")
+    .Output("out: T")
+    .Attr("T: {float, double}")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 namespace {
 enum KernelLabel { DEFAULT_LABEL, OVERLOAD_1_LABEL, OVERLOAD_2_LABEL };
 }  // namespace
@@ -188,6 +193,20 @@ class ResourceUsingOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("ResourceUsingOp").Device(DEVICE_CPU),
                         ResourceUsingOp);
 
+class TestAttrOp : public OpKernel {
+ public:
+  explicit TestAttrOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    Tensor* output;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    output->scalar<float>()() = 1.0;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("TestAttr").Device(DEVICE_CPU).TypeConstraint<float>("T"), TestAttrOp);
+
 // Various test ops without kernels. These are used to test graph construction.
 
 REGISTER_OP("A")
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index e34aa7cc2c..30e0a099d8 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -148,6 +148,7 @@ py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:saver_test_utils",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:test_ops",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index f92247d52e..d1f6bc27ef 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
@@ -28,8 +27,8 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import control_flow_ops
@@ -945,61 +944,75 @@ class SavedModelTest(test.TestCase):
     self.assertIn("T", node_def.attr)
     self.assertIn("Tout", node_def.attr)
 
-  def testStripDefaultAttrsInconsistentConsumerDefaults(self):
-    if ops._USE_C_API: return  # TODO(skyewm): get this working
-
+  # Tests the behavior of loading SavedModels that having missing attrs or attrs
+  # with incorrect types.
+  def testInconsistentConsumerDefaultAttrs(self):
     export_dir = self._get_export_dir(
         "test_strip_default_attrs_no_consumer_defaults")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    # Add a graph with two float32 variables and a Complex Op composing them
-    # with strip_default_attrs enabled. This must remove the following
-    # defaults for the "Complex" Op:
-    #   o "T"    : float32.   (input type)
-    #   o "Tout" : complex64. (output type)
+    # Add a graph with a single variable and a test op with a defaultless
+    # float32 attr, "test_attr".
     with session.Session(graph=ops.Graph()) as sess:
-      real_num = variables.Variable(1.0, dtype=dtypes.float32, name="real")
-      imag_num = variables.Variable(2.0, dtype=dtypes.float32, name="imag")
-      math_ops.complex(real_num, imag_num, name="complex")
+      variables.Variable(1.0, dtype=dtypes.float64, name="var")
+      test_ops.test_attr(T=dtypes.float32, name="test_attr")
       sess.run(variables.global_variables_initializer())
-      builder.add_meta_graph_and_variables(
-          sess, ["foo"], strip_default_attrs=True)
+      builder.add_meta_graph_and_variables(sess, ["foo"])
 
     # Save the SavedModel to disk in text format.
     builder.save(as_text=True)
 
-    # Update the Op registry to remove defaults for all attrs("T", "Tout") from
-    # the "Complex" OpDef.
-    complex_op_def = op_def_registry.get_registered_ops()["Complex"]
-    original_complex_op_def = op_def_pb2.OpDef()
-    original_complex_op_def.CopyFrom(complex_op_def)
-    for attr_def in complex_op_def.attr:
-      attr_def.ClearField("default_value")
+    # Rewrite the SavedModel to remove the T attr from "test_attr".
+    saved_model_file = os.path.join(
+        export_dir, constants.SAVED_MODEL_FILENAME_PBTXT)
+    with open(saved_model_file) as f:
+      original_saved_model = f.read()
+
+    no_attr_saved_model = original_saved_model.replace("""
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }""", "")
+    with open(saved_model_file, "w") as f:
+      f.write(no_attr_saved_model)
 
     # Loading the SavedModel via the loader must fail because the SavedModel
-    # does not have any attr values for the "Complex" node and the current
-    # op registry does not have have any default values for the "Complex" op.
+    # does not have any attr values for the "TestAttr" node, and there is no
+    # default specified in the TestAttr OpDef.
     sess = session.Session(graph=ops.Graph())
-    with self.assertRaisesRegexp(
-        ValueError,
-        "Expected one attr with name .*T(out)?.* in name: \"complex\".*"):
+    if ops._USE_C_API:
+      error_message = "NodeDef missing attr 'T' from Op<name=TestAttr"
+    else:
+      error_message = ("Expected one attr with name .*T(out)?.* in name: "
+                       "\"test_attr\".*")
+    with self.assertRaisesRegexp(ValueError, error_message):
       loader.load(sess, ["foo"], export_dir)
 
-    # Update the Op registry to change the defaults for attr "Tout"
-    # (complex64 -> complex128).
-    complex_op_def.CopyFrom(original_complex_op_def)
-    for attr_def in complex_op_def.attr:
-      if attr_def.name == "Tout":
-        attr_def.default_value.type = types_pb2.DT_COMPLEX128
-
-    # Loading the SavedModel via the loader must set "Tout" attr_value for the
-    # "Complex" node according to the latest defaults (complex128). This is
-    # expected to fail the model import as there is no OpKernel registered to
-    # handle attrs "T" (float32) and "Tout" (complex128).
+    # Rewrite the SavedModel to change the type of the T attr in "test_attr"
+    bad_type_saved_model = original_saved_model.replace("""
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }""", """
+      attr {
+        key: "T"
+        value {
+          type: DT_DOUBLE
+        }
+      }""")
+    with open(saved_model_file, "w") as f:
+      f.write(bad_type_saved_model)
+
+    # Loading the SavedModel via the loader must fail because there is no
+    # OpKernel registered to handle T = double.
     sess = session.Session(graph=ops.Graph())
     with self.assertRaisesRegexp(
         errors.InvalidArgumentError,
-        ".*No OpKernel was registered to support Op \'Complex\' with these "
+        ".*No OpKernel was registered to support Op \'TestAttr\' with these "
         "attrs..*"):
       loader.load(sess, ["foo"], export_dir)
 
-- 
GitLab


From 85bd37f98b5cc758f851f1c22a7a122175d60240 Mon Sep 17 00:00:00 2001
From: Brian Patton <bjp@google.com>
Date: Thu, 8 Feb 2018 16:09:05 -0800
Subject: [PATCH 880/966] Allow C64 infeeds.

PiperOrigin-RevId: 185068327
---
 tensorflow/contrib/tpu/python/ops/tpu_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index 1c970655d0..9787621679 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -47,7 +47,8 @@ if platform.system() != "Windows":
   # types are supported.
 
   _SUPPORTED_INFEED_DTYPES = set([
-      dtypes.bool, dtypes.int32, dtypes.bfloat16, dtypes.float32
+      dtypes.bool, dtypes.int32, dtypes.bfloat16, dtypes.float32,
+      dtypes.complex64
   ])
 
   def infeed_dequeue(dtype, shape, name=None):
-- 
GitLab


From 899b00b2dd594fc7772eb3917437a90256598e3d Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Thu, 8 Feb 2018 16:11:03 -0800
Subject: [PATCH 881/966] Add our own "global step" called quantization_step to
 avoid needing to reset the global step before running the graph rewrite.

PiperOrigin-RevId: 185068631
---
 tensorflow/contrib/quantize/BUILD             | 15 ++++-
 tensorflow/contrib/quantize/python/common.py  | 36 ++++++++++-
 .../contrib/quantize/python/common_test.py    | 59 +++++++++++++++++++
 .../quantize/python/fold_batch_norms.py       | 47 +++++++--------
 .../contrib/quantize/python/quantize.py       |  6 +-
 .../python/quantize_parameterized_test.py     | 13 ----
 6 files changed, 133 insertions(+), 43 deletions(-)
 create mode 100644 tensorflow/contrib/quantize/python/common_test.py

diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 42e295e622..aec9f47ccb 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -13,6 +13,20 @@ py_library(
     deps = [],
 )
 
+py_test(
+    name = "common_test",
+    size = "small",
+    srcs = ["python/common_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":common",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+    ],
+)
+
 py_library(
     name = "graph_matcher",
     srcs = [
@@ -198,7 +212,6 @@ py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
-        "//tensorflow/python:training",
     ],
 )
 
diff --git a/tensorflow/contrib/quantize/python/common.py b/tensorflow/contrib/quantize/python/common.py
index d0b0674c31..3a1fa61e43 100644
--- a/tensorflow/contrib/quantize/python/common.py
+++ b/tensorflow/contrib/quantize/python/common.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Constants used across this package."""
+"""Common utilities used across this package."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,6 +21,12 @@ from __future__ import print_function
 import collections
 import re
 
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+
 # Skip all operations that are backprop related or export summaries.
 SKIPPED_PREFIXES = (
     'gradients/', 'RMSProp/', 'Adagrad/', 'Const_', 'HistogramSummary',
@@ -86,3 +92,31 @@ def _GetOperationByNameDontThrow(graph, name):
     return graph.get_operation_by_name(name)
   except KeyError:
     return None
+
+
+def CreateOrGetQuantizationStep():
+  """Returns a Tensor of the number of steps the quantized graph has run.
+
+  Returns:
+    Quantization step Tensor.
+  """
+  quantization_step_name = 'fake_quantization_step'
+  quantization_step_tensor_name = quantization_step_name + '/AssignAdd:0'
+  g = ops.get_default_graph()
+  try:
+    return g.get_tensor_by_name(quantization_step_tensor_name)
+  except KeyError:
+    # Create in proper graph and base name_scope.
+    with g.name_scope(None):
+      quantization_step_tensor = variable_scope.get_variable(
+          quantization_step_name,
+          shape=[],
+          dtype=dtypes.int64,
+          initializer=init_ops.zeros_initializer(),
+          trainable=False,
+          collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+      with g.name_scope(quantization_step_tensor.op.name + '/'):
+        # We return the incremented variable tensor. Since this is used in conds
+        # for quant_delay and freeze_bn_delay, it will run once per graph
+        # execution.
+        return state_ops.assign_add(quantization_step_tensor, 1)
diff --git a/tensorflow/contrib/quantize/python/common_test.py b/tensorflow/contrib/quantize/python/common_test.py
new file mode 100644
index 0000000000..d6237fe5e3
--- /dev/null
+++ b/tensorflow/contrib/quantize/python/common_test.py
@@ -0,0 +1,59 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for common utilities in this package."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.quantize.python import common
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+class CommonTest(test_util.TensorFlowTestCase):
+
+  def testCreateOrGetQuantizationStep(self):
+    g = ops.Graph()
+    with session.Session(graph=g) as sess:
+      quantization_step_tensor = common.CreateOrGetQuantizationStep()
+
+      # Check that operations are added to the graph.
+      num_nodes = len(g.get_operations())
+      self.assertGreater(num_nodes, 0)
+
+      # Check that getting the quantization step doesn't change the graph.
+      get_quantization_step_tensor = common.CreateOrGetQuantizationStep()
+      self.assertEqual(quantization_step_tensor, get_quantization_step_tensor)
+      self.assertEqual(num_nodes, len(g.get_operations()))
+
+      # Ensure that running the graph increments the quantization step.
+      sess.run(variables.global_variables_initializer())
+      step_val = sess.run(quantization_step_tensor)
+      self.assertEqual(step_val, 1)
+
+      # Ensure that even running a graph that depends on the quantization step
+      # multiple times only executes it once.
+      a = quantization_step_tensor + 1
+      b = a + quantization_step_tensor
+      _, step_val = sess.run([b, quantization_step_tensor])
+      self.assertEqual(step_val, 2)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 7fa0d484ec..36a848d2a8 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
 
 
@@ -43,11 +42,10 @@ def FoldBatchNorms(graph, freeze_batch_norm_delay=None, is_training=True):
 
   Args:
     graph: Graph to walk and modify.
-    freeze_batch_norm_delay: How many steps to wait before freezing
-    moving mean and variance and using them for batch normalization. This value
-    is used only when is_training is True.
-    is_training: Bool, true if training
-
+    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
+      and variance and using them for batch normalization. This value is used
+      only when is_training is True.
+    is_training: Bool, true if training.
   Raises:
     ValueError: When batch norm folding fails.
   """
@@ -69,9 +67,9 @@ def _FoldFusedBatchNorms(graph, freeze_batch_norm_delay, is_training):
 
   Args:
     graph: Graph to walk and modify.
-    freeze_batch_norm_delay: How many steps to wait before freezing
-    moving mean and variance and using them for batch normalization
-    is_training: Bool, true if training
+    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
+      and variance and using them for batch normalization.
+    is_training: Bool, true if training.
 
   Raises:
     ValueError: When batch norm folding fails.
@@ -305,10 +303,10 @@ def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
   Args:
     context: The scope under which we look for batch norm params
     match: Object containg required batch norm tensors for correction
-      computation
+      computation.
     freeze_batch_norm_delay: Delay in steps at which computation switches
       from regular batch norm to frozen mean and variance.
-    fused_batch_norm: Bool, true if fused batch norm is used
+    fused_batch_norm: Bool, true if fused batch norm is used.
 
   Returns:
     A tuple of correction_scale, correction_recip, correction_offset
@@ -334,7 +332,7 @@ def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
 
     if freeze_batch_norm_delay is not None:
       use_mv_avg = math_ops.greater_equal(
-          training_util.get_or_create_global_step(),
+          common.CreateOrGetQuantizationStep(),
           freeze_batch_norm_delay,
           name='use_moving_average')
     else:
@@ -426,8 +424,8 @@ def _FoldUnfusedBatchNorms(graph, freeze_batch_norm_delay, is_training):
 
   Args:
     graph: Graph to walk and modify.
-    freeze_batch_norm_delay: How many steps to wait before freezing
-    moving mean and variance and using them for batch normalization
+    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
+      and variance and using them for batch normalization.
     is_training: Bool, True if training
 
   Raises:
@@ -472,11 +470,10 @@ def _GetBatchNormParams(graph, context, has_scaling):
   Args:
     graph: Graph to inspect.
     context: The scope under which we look for batch norm params
-    has_scaling: Bool that specifies if scaling is done as part of batch
-    norm
+    has_scaling: Bool that specifies if scaling is done as part of batch norm.
 
   Returns:
-   _BatchNormMatch containing all required batch norm parameters
+    _BatchNormMatch containing all required batch norm parameters.
   """
   gamma_tensor = None
   batch_mean_tensor = None
@@ -554,20 +551,20 @@ def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay,
   Args:
     graph: Graph to modify.
     context: String, batch norm context, i.e. node into which BatchNorm is
-        nested.
+      nested.
     has_scaling: Whether the batch norm has scaling enabled.
-    freeze_batch_norm_delay: How many steps to wait before freezing
-    moving mean and variance and using them for batch normalization
-    is_training: Bool, true if training
+    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
+      and variance and using them for batch normalization.
+    is_training: Bool, true if training.
 
   Raises:
     ValueError: When operation type is not supported, or input and output tensor
-        shapes mismatch for created operations: mul_fold, add_fold.
+      shapes mismatch for created operations: mul_fold, add_fold.
 
   Returns:
     A pair of Operations, the first is the original consumer node of the batch
-        norm (../BatchNorm/batchnorm/add_1), the second is the consumer node of
-        the folded graph (add_fold).
+      norm (../BatchNorm/batchnorm/add_1), the second is the consumer node of
+      the folded graph (add_fold).
   """
   mul_scale_name = 'mul_1' if has_scaling else 'mul'
   mul_scale = graph.get_operation_by_name(context +
@@ -642,7 +639,7 @@ def _CloneOp(op, new_name, new_inputs):
     op: Operation to modify.
     new_name: String, a new name to set on cloned op.
     new_inputs: A list of tuples (idx, tensor), each input with corresponding
-        index will be replaced by the given Tensor in the cloned op.
+      index will be replaced by the given Tensor in the cloned op.
 
   Returns:
     Operation, the cloned op.
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 1a63b0a2ce..e44b91f0d0 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 import re
 from tensorflow.contrib import graph_editor
+from tensorflow.contrib.quantize.python import common
 from tensorflow.contrib.quantize.python import graph_matcher
 from tensorflow.contrib.quantize.python import input_to_ops
 from tensorflow.contrib.quantize.python import quant_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.training import training_util
 
 # Quantizable operation types that are supported by the quantization rewrite.
 _QUANTIZABLE_TYPES = {'Conv2D', 'MatMul', 'DepthwiseConv2dNative'}
@@ -270,7 +270,7 @@ def _InsertQuantOp(context,
       quantization interval ends.
     is_training: (Optional) Whether quantizing training graph or eval graph.
     narrow_range: Whether to use the narrow quantization range
-        [1; 2^bits - 1] or wide range [0; 2^bits - 1].
+      [1; 2^bits - 1] or wide range [0; 2^bits - 1].
   Raises:
     ValueError: When producer operation is not directly connected to the
       consumer operation.
@@ -303,7 +303,7 @@ def _InsertQuantOp(context,
 
   if quant_delay and quant_delay > 0:
     activate_quant = math_ops.greater_equal(
-        training_util.get_or_create_global_step(),
+        common.CreateOrGetQuantizationStep(),
         quant_delay,
         name=name_prefix + '/activate_quant')
     quant = control_flow_ops.cond(
diff --git a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
index 2d5307799b..2e74f3b04d 100644
--- a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
@@ -29,7 +29,6 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import googletest
-from tensorflow.python.training import training
 
 batch_norm = layers.batch_norm
 conv2d = layers.conv2d
@@ -73,8 +72,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, height, width, depth = 5, 128, 128, 3
       inputs = array_ops.zeros((batch_size, height, width, depth))
       stride = 1 if with_bypass else 2
@@ -152,8 +149,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, depth = 5, 256
       inputs = array_ops.zeros((batch_size, depth))
       out_depth = 256 if with_bypass else 128
@@ -229,8 +224,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, height, width, depth = 5, 128, 128, 3
       inputs = array_ops.zeros((batch_size, height, width, depth))
       stride = 1 if with_bypass else 2
@@ -344,8 +337,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, height, width, depth = 5, 128, 128, 3
       inputs = array_ops.zeros((batch_size, height, width, depth))
       stride = 1 if with_bypass else 2
@@ -432,8 +423,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, depth = 5, 256
       inputs = array_ops.zeros((batch_size, depth))
       out_depth = 256 if with_bypass else 128
@@ -519,8 +508,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, height, width, depth = 5, 128, 128, 3
       inputs = array_ops.zeros((batch_size, height, width, depth))
       stride = 1 if with_bypass else 2
-- 
GitLab


From 06a93f8d14d5f154d96b5576e454eaa4272facb4 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Thu, 8 Feb 2018 16:15:41 -0800
Subject: [PATCH 882/966] Improve the flow to build TFLite iOS demo app

PiperOrigin-RevId: 185069356
---
 tensorflow/contrib/lite/README.md             |  2 +-
 .../contrib/lite/download_dependencies.sh     |  7 ---
 .../lite/examples/ios/download_models.sh      | 57 +++++++++++++++++++
 3 files changed, 58 insertions(+), 8 deletions(-)
 create mode 100755 tensorflow/contrib/lite/examples/ios/download_models.sh

diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
index 53140b5473..7cdf91da88 100644
--- a/tensorflow/contrib/lite/README.md
+++ b/tensorflow/contrib/lite/README.md
@@ -92,7 +92,7 @@ Similar to the Android demo app, there's an iOS camera app that uses exactly the
 
 This demo app requires a camera so it doesn't work with simulators. It need to be executed on a real iOS device. Follow the instructions to build and run the demo app:
 
-1.   Follow the Building section [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/ios.md#building) to build the universal iOS library for TensorFlow Lite.
+1.   Run `third_party/tensorflow/contrib/lite/examples/ios/download_models.sh` to download the model files used by the demo app.
 1.   Install [CocoaPods](https://cocoapods.org/) if it wasn't installed yet: `sudo gem install cocoapods`.
 1.   Run `pod install` in `tensorflow/contrib/lite/examples/ios/camera` to generate the workspace file.
 1.   Open the project by running `open tflite_camera_example.xcworkspace`, and build the app in XCode.
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index e1b7b3613a..a93ed201d6 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -36,8 +36,6 @@ ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_
 NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
 FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
 FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/master.zip"
-MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_ios_lite_float_2017_11_08.zip"
-QUANTIZED_MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
@@ -93,8 +91,6 @@ download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 download_and_extract "${NEON_2_SSE_URL}" "${DOWNLOADS_DIR}/neon_2_sse"
 download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
-download_and_extract "${MODELS_URL}" "${DOWNLOADS_DIR}/models"
-download_and_extract "${QUANTIZED_MODELS_URL}" "${DOWNLOADS_DIR}/quantized_models"
 
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
@@ -103,7 +99,4 @@ replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#s
 replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );#static uint64x2_t p2ul_CONJ_XOR;// = vld1q_u64( p2ul_conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
 
-cp ${DOWNLOADS_DIR}/models/models/* tensorflow/contrib/lite/examples/ios/simple/data/
-cp ${DOWNLOADS_DIR}/quantized_models/* tensorflow/contrib/lite/examples/ios/camera/data/
-
 echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/contrib/lite/examples/ios/download_models.sh b/tensorflow/contrib/lite/examples/ios/download_models.sh
new file mode 100755
index 0000000000..ccd163758c
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/download_models.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -ex
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_ios_lite_float_2017_11_08.zip"
+QUANTIZED_MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip"
+DOWNLOADS_DIR=$(mktemp -d)
+
+cd $SCRIPT_DIR
+
+download_and_extract() {
+  local usage="Usage: download_and_extract URL DIR"
+  local url="${1:?${usage}}"
+  local dir="${2:?${usage}}"
+  echo "downloading ${url}" >&2
+  mkdir -p "${dir}"
+  tempdir=$(mktemp -d)
+  tempdir2=$(mktemp -d)
+
+  curl -L ${url} > ${tempdir}/zipped.zip
+  unzip ${tempdir}/zipped.zip -d ${tempdir2}
+
+  # If the zip file contains nested directories, extract the files from the
+  # inner directory.
+  if ls ${tempdir2}/*/* 1> /dev/null 2>&1; then
+    # unzip has no strip components, so unzip to a temp dir, and move the
+    # files we want from the tempdir to destination.
+    cp -R ${tempdir2}/*/* ${dir}/
+  else
+    cp -R ${tempdir2}/* ${dir}/
+  fi
+  rm -rf ${tempdir2} ${tempdir}
+}
+
+download_and_extract "${MODELS_URL}" "${DOWNLOADS_DIR}/models"
+download_and_extract "${QUANTIZED_MODELS_URL}" "${DOWNLOADS_DIR}/quantized_models"
+
+file ${DOWNLOADS_DIR}/models
+
+cp ${DOWNLOADS_DIR}/models/models/* simple/data/
+cp ${DOWNLOADS_DIR}/quantized_models/* camera/data/
+
-- 
GitLab


From 829bf153a982f520419434c1b6425c6f20314a95 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Thu, 8 Feb 2018 16:31:20 -0800
Subject: [PATCH 883/966] Fix bug in checkpointing DT_VARIANT tensors. Also add
 an integration test that runs and checkpoints multiple input pipelines in one
 graph.

PiperOrigin-RevId: 185071397
---
 .../contrib/data/python/kernel_tests/BUILD    | 15 ++++
 .../serialization_integration_test.py         | 85 +++++++++++++++++++
 .../core/util/tensor_bundle/tensor_bundle.cc  |  2 +-
 3 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/data/python/kernel_tests/serialization_integration_test.py

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index f58872f2a8..7ca4a28dae 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -385,6 +385,21 @@ py_test(
     ],
 )
 
+py_test(
+    name = "serialization_integration_test",
+    size = "small",
+    srcs = ["serialization_integration_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/contrib/data/python/ops:iterator_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_test(
     name = "shard_dataset_op_test",
     size = "small",
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization_integration_test.py b/tensorflow/contrib/data/python/kernel_tests/serialization_integration_test.py
new file mode 100644
index 0000000000..0a6b74dc3e
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/serialization_integration_test.py
@@ -0,0 +1,85 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Integration test for input pipeline serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import saver as saver_lib
+
+
+class MultipleInputPipelinesTest(test.TestCase):
+
+  def _build_input_pipeline(self, name, num_outputs):
+    with ops.name_scope(name):
+      ds = dataset_ops.Dataset.range(num_outputs).shuffle(
+          10, reshuffle_each_iteration=False).prefetch(10)
+      iterator = ds.make_initializable_iterator()
+      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
+      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+      return iterator.initializer, iterator.get_next()
+
+  def _build_graph(self, num_pipelines, num_outputs):
+    init_ops = []
+    get_next_ops = []
+    for i in range(num_pipelines):
+      name = "input_pipeline_%d" % i
+      init_op, get_next_op = self._build_input_pipeline(name, num_outputs)
+      init_ops.append(init_op)
+      get_next_ops.append(get_next_op)
+    saver = saver_lib.Saver()
+    return init_ops, get_next_ops, saver
+
+  def _ckpt_path(self):
+    return os.path.join(self.get_temp_dir(), "iterator")
+
+  def testConcurrentSaves(self):
+    num_pipelines = 100
+    num_outputs = 100
+    break_point = 10
+    all_outputs = [[] for _ in range(num_pipelines)]
+    with ops.Graph().as_default() as g:
+      init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
+                                                        num_outputs)
+      with self.test_session(graph=g) as sess:
+        sess.run(init_ops)
+        for _ in range(break_point):
+          output = sess.run(get_next_ops)
+          for i in range(num_pipelines):
+            all_outputs[i].append(output[i])
+        saver.save(sess, self._ckpt_path())
+
+    with ops.Graph().as_default() as g:
+      init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
+                                                        num_outputs)
+      with self.test_session(graph=g) as sess:
+        saver.restore(sess, self._ckpt_path())
+        for _ in range(num_outputs - break_point):
+          output = sess.run(get_next_ops)
+          for i in range(num_pipelines):
+            all_outputs[i].append(output[i])
+
+    for output in all_outputs:
+      self.assertSequenceEqual(sorted(output), range(num_outputs))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index 462b420976..0426fee0e2 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -286,7 +286,7 @@ Status WriteVariantTensor(const Tensor& val, FileOutputBuffer* out,
     TF_RETURN_IF_ERROR(out->Append(len));
     *crc32c = crc32c::Extend(*crc32c, reinterpret_cast<const char*>(&elem_size),
                              sizeof(uint64));
-    *bytes_written += sizeof(uint64);
+    *bytes_written += len.size();
 
     // Write the serialized variant.
     TF_RETURN_IF_ERROR(out->Append(elem));
-- 
GitLab


From d47d9042f83ea74ad147c2a44cde6dfe4f3180f0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 16:39:42 -0800
Subject: [PATCH 884/966] Automated g4 rollback of changelist 185006374

PiperOrigin-RevId: 185072479
---
 tensorflow/core/grappler/optimizers/constant_folding.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index e27bd97325..1e6f11c8aa 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -1443,14 +1443,15 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
       graph_modified_ = true;
       continue;
     }
-
+    const bool safe_to_use_shapes =
+        use_shape_info && (feed_nodes_.empty() || is_aggressive);
     const bool is_mul = IsMul(*node);
     const bool is_matmul = IsMatMul(*node);
     const bool is_add = IsAdd(*node) || IsBiasAdd(*node);
     const bool is_sub = IsSub(*node);
     const bool is_any_div = IsAnyDiv(*node);
     // Simplify arithmetic operations with ones or zeros.
-    if (use_shape_info &&
+    if (safe_to_use_shapes &&
         (is_mul || is_matmul || is_add || is_sub || is_any_div) &&
         properties.HasInputProperties(node->name()) &&
         properties.HasOutputProperties(node->name())) {
-- 
GitLab


From 1fe6b129be3f7f8603b4cf36a6f39493b19f561e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 8 Feb 2018 16:47:58 -0800
Subject: [PATCH 885/966] Extended the Halton sequences to support
 randomization. Implemented the randomization scheme described in
 arXiv:1706.02808.

PiperOrigin-RevId: 185073515
---
 .../kernel_tests/halton_sequence_test.py      | 101 ++++++++--
 .../python/ops/halton_sequence_impl.py        | 185 +++++++++++++-----
 2 files changed, 219 insertions(+), 67 deletions(-)

diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py
index 0a85862abf..c516ce45e0 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py
@@ -36,29 +36,35 @@ class HaltonSequenceTest(test.TestCase):
 
   def test_known_values_small_bases(self):
     with self.test_session():
-      # The first five elements of the Halton sequence with base 2 and 3
+      # The first five elements of the non-randomized Halton sequence
+      # with base 2 and 3.
       expected = np.array(((1. / 2, 1. / 3),
                            (1. / 4, 2. / 3),
                            (3. / 4, 1. / 9),
                            (1. / 8, 4. / 9),
                            (5. / 8, 7. / 9)), dtype=np.float32)
-      sample = halton.sample(2, num_samples=5)
+      sample = halton.sample(2, num_results=5, randomized=False)
       self.assertAllClose(expected, sample.eval(), rtol=1e-6)
 
-  def test_sample_indices(self):
+  def test_sequence_indices(self):
+    """Tests access of sequence elements by index."""
     with self.test_session():
       dim = 5
       indices = math_ops.range(10, dtype=dtypes.int32)
-      sample_direct = halton.sample(dim, num_samples=10)
-      sample_from_indices = halton.sample(dim, sample_indices=indices)
+      sample_direct = halton.sample(dim, num_results=10, randomized=False)
+      sample_from_indices = halton.sample(dim, sequence_indices=indices,
+                                          randomized=False)
       self.assertAllClose(sample_direct.eval(), sample_from_indices.eval(),
                           rtol=1e-6)
 
   def test_dtypes_works_correctly(self):
+    """Tests that all supported dtypes work without error."""
     with self.test_session():
       dim = 3
-      sample_float32 = halton.sample(dim, num_samples=10, dtype=dtypes.float32)
-      sample_float64 = halton.sample(dim, num_samples=10, dtype=dtypes.float64)
+      sample_float32 = halton.sample(dim, num_results=10, dtype=dtypes.float32,
+                                     seed=11)
+      sample_float64 = halton.sample(dim, num_results=10, dtype=dtypes.float64,
+                                     seed=21)
       self.assertEqual(sample_float32.eval().dtype, np.float32)
       self.assertEqual(sample_float64.eval().dtype, np.float64)
 
@@ -79,7 +85,8 @@ class HaltonSequenceTest(test.TestCase):
       p = normal_lib.Normal(loc=mu_p, scale=sigma_p)
       q = normal_lib.Normal(loc=mu_q, scale=sigma_q)
 
-      cdf_sample = halton.sample(2, num_samples=n, dtype=dtypes.float64)
+      cdf_sample = halton.sample(2, num_results=n, dtype=dtypes.float64,
+                                 seed=1729)
       q_sample = q.quantile(cdf_sample)
 
       # Compute E_p[X].
@@ -90,7 +97,7 @@ class HaltonSequenceTest(test.TestCase):
       # Compute E_p[X^2].
       e_x2 = mc.expectation_importance_sampler(
           f=math_ops.square, log_p=p.log_prob, sampling_dist_q=q, z=q_sample,
-          seed=42)
+          seed=1412)
 
       stddev = math_ops.sqrt(e_x2 - math_ops.square(e_x))
       # Keep the tolerance levels the same as in monte_carlo_test.py.
@@ -100,10 +107,10 @@ class HaltonSequenceTest(test.TestCase):
 
   def test_docstring_example(self):
     # Produce the first 1000 members of the Halton sequence in 3 dimensions.
-    num_samples = 1000
+    num_results = 1000
     dim = 3
     with self.test_session():
-      sample = halton.sample(dim, num_samples=num_samples)
+      sample = halton.sample(dim, num_results=num_results, seed=127)
 
       # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three dimensional
       # hypercube.
@@ -115,16 +122,76 @@ class HaltonSequenceTest(test.TestCase):
       # Produces a relative absolute error of 1.7%.
       self.assertAllClose(integral.eval(), true_value.eval(), rtol=0.02)
 
-    # Now skip the first 1000 samples and recompute the integral with the next
-    # thousand samples. The sample_indices argument can be used to do this.
+      # Now skip the first 1000 samples and recompute the integral with the next
+      # thousand samples. The sequence_indices argument can be used to do this.
 
-      sample_indices = math_ops.range(start=1000, limit=1000 + num_samples,
-                                      dtype=dtypes.int32)
-      sample_leaped = halton.sample(dim, sample_indices=sample_indices)
+      sequence_indices = math_ops.range(start=1000, limit=1000 + num_results,
+                                        dtype=dtypes.int32)
+      sample_leaped = halton.sample(dim, sequence_indices=sequence_indices,
+                                    seed=111217)
 
       integral_leaped = math_ops.reduce_mean(
           math_ops.reduce_prod(sample_leaped ** powers, axis=-1))
-      self.assertAllClose(integral_leaped.eval(), true_value.eval(), rtol=0.001)
+      self.assertAllClose(integral_leaped.eval(), true_value.eval(), rtol=0.01)
+
+  def test_randomized_qmc_basic(self):
+    """Tests the randomization of the Halton sequences."""
+    # This test is identical to the example given in Owen (2017), Figure 5.
+
+    dim = 20
+    num_results = 5000
+    replica = 10
+
+    with self.test_session():
+      sample = halton.sample(dim, num_results=num_results, seed=121117)
+      f = math_ops.reduce_mean(math_ops.reduce_sum(sample, axis=1) ** 2)
+      values = [f.eval() for _ in range(replica)]
+      self.assertAllClose(np.mean(values), 101.6667, atol=np.std(values) * 2)
+
+  def test_partial_sum_func_qmc(self):
+    """Tests the QMC evaluation of (x_j + x_{j+1} ...+x_{n})^2.
+
+    A good test of QMC is provided by the function:
+
+      f(x_1,..x_n, x_{n+1}, ..., x_{n+m}) = (x_{n+1} + ... x_{n+m} - m / 2)^2
+
+    with the coordinates taking values in the unit interval. The mean and
+    variance of this function (with the uniform distribution over the
+    unit-hypercube) is exactly calculable:
+
+      <f> = m / 12, Var(f) = m (5m - 3) / 360
+
+    The purpose of the "shift" (if n > 0) in the coordinate dependence of the
+    function is to provide a test for Halton sequence which exhibit more
+    dependence in the higher axes.
+
+    This test confirms that the mean squared error of RQMC estimation falls
+    as O(N^(2-e)) for any e>0.
+    """
+
+    n, m = 10, 10
+    dim = n + m
+    num_results_lo, num_results_hi = 1000, 10000
+    replica = 20
+    true_mean = m / 12.
+
+    def func_estimate(x):
+      return math_ops.reduce_mean(
+          (math_ops.reduce_sum(x[:, -m:], axis=-1) - m / 2.0) ** 2)
+
+    with self.test_session():
+      sample_lo = halton.sample(dim, num_results=num_results_lo, seed=1925)
+      sample_hi = halton.sample(dim, num_results=num_results_hi, seed=898128)
+      f_lo, f_hi = func_estimate(sample_lo), func_estimate(sample_hi)
+
+      estimates = np.array([(f_lo.eval(), f_hi.eval()) for _ in range(replica)])
+      var_lo, var_hi = np.mean((estimates - true_mean) ** 2, axis=0)
+
+      # Expect that the variance scales as N^2 so var_hi / var_lo ~ k / 10^2
+      # with k a fudge factor accounting for the residual N dependence
+      # of the QMC error and the sampling error.
+      log_rel_err = np.log(100 * var_hi / var_lo)
+      self.assertAllClose(log_rel_err, 0.0, atol=1.2)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py b/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py
index 8cabf18903..57900d6818 100644
--- a/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py
@@ -26,8 +26,9 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
-
+from tensorflow.python.ops import random_ops
 
 __all__ = [
     'sample',
@@ -39,32 +40,45 @@ __all__ = [
 _MAX_DIMENSION = 1000
 
 
-def sample(dim, num_samples=None, sample_indices=None, dtype=None, name=None):
-  r"""Returns a sample from the `m` dimensional Halton sequence.
+def sample(dim,
+           num_results=None,
+           sequence_indices=None,
+           dtype=None,
+           randomized=True,
+           seed=None,
+           name=None):
+  r"""Returns a sample from the `dim` dimensional Halton sequence.
 
   Warning: The sequence elements take values only between 0 and 1. Care must be
   taken to appropriately transform the domain of a function if it differs from
   the unit cube before evaluating integrals using Halton samples. It is also
-  important to remember that quasi-random numbers are not a replacement for
-  pseudo-random numbers in every context. Quasi random numbers are completely
-  deterministic and typically have significant negative autocorrelation (unless
-  randomized).
+  important to remember that quasi-random numbers without randomization are not
+  a replacement for pseudo-random numbers in every context. Quasi random numbers
+  are completely deterministic and typically have significant negative
+  autocorrelation unless randomization is used.
 
   Computes the members of the low discrepancy Halton sequence in dimension
-  `dim`. The d-dimensional sequence takes values in the unit hypercube in d
-  dimensions. Currently, only dimensions up to 1000 are supported. The prime
-  base for the `k`-th axes is the k-th prime starting from 2. For example,
-  if dim = 3, then the bases will be [2, 3, 5] respectively and the first
-  element of the sequence will be: [0.5, 0.333, 0.2]. For a more complete
-  description of the Halton sequences see:
+  `dim`. The `dim`-dimensional sequence takes values in the unit hypercube in
+  `dim` dimensions. Currently, only dimensions up to 1000 are supported. The
+  prime base for the k-th axes is the k-th prime starting from 2. For example,
+  if `dim` = 3, then the bases will be [2, 3, 5] respectively and the first
+  element of the non-randomized sequence will be: [0.5, 0.333, 0.2]. For a more
+  complete description of the Halton sequences see:
   https://en.wikipedia.org/wiki/Halton_sequence. For low discrepancy sequences
   and their applications see:
   https://en.wikipedia.org/wiki/Low-discrepancy_sequence.
 
-  The user must supply either `num_samples` or `sample_indices` but not both.
+  If `randomized` is true, this function produces a scrambled version of the
+  Halton sequence introduced by Owen in arXiv:1706.02808. For the advantages of
+  randomization of low discrepancy sequences see:
+  https://en.wikipedia.org/wiki/Quasi-Monte_Carlo_method#Randomization_of_quasi-Monte_Carlo
+
+  The number of samples produced is controlled by the `num_results` and
+  `sequence_indices` parameters. The user must supply either `num_results` or
+  `sequence_indices` but not both.
   The former is the number of samples to produce starting from the first
-  element. If `sample_indices` is given instead, the specified elements of
-  the sequence are generated. For example, sample_indices=tf.range(10) is
+  element. If `sequence_indices` is given instead, the specified elements of
+  the sequence are generated. For example, sequence_indices=tf.range(10) is
   equivalent to specifying n=10.
 
   Example Use:
@@ -73,9 +87,9 @@ def sample(dim, num_samples=None, sample_indices=None, dtype=None, name=None):
   bf = tf.contrib.bayesflow
 
   # Produce the first 1000 members of the Halton sequence in 3 dimensions.
-  num_samples = 1000
+  num_results = 1000
   dim = 3
-  sample = bf.halton_sequence.sample(dim, num_samples=num_samples)
+  sample = bf.halton_sequence.sample(dim, num_results=num_results, seed=127)
 
   # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three dimensional
   # hypercube.
@@ -89,12 +103,13 @@ def sample(dim, num_samples=None, sample_indices=None, dtype=None, name=None):
   print ("Estimated: %f, True Value: %f" % values)
 
   # Now skip the first 1000 samples and recompute the integral with the next
-  # thousand samples. The sample_indices argument can be used to do this.
+  # thousand samples. The sequence_indices argument can be used to do this.
 
 
-  sample_indices = tf.range(start=1000, limit=1000 + num_samples,
-                            dtype=tf.int32)
-  sample_leaped = halton.sample(dim, sample_indices=sample_indices)
+  sequence_indices = tf.range(start=1000, limit=1000 + num_results,
+                              dtype=tf.int32)
+  sample_leaped = halton.sample(dim, sequence_indices=sequence_indices,
+                                seed=111217)
 
   integral_leaped = tf.reduce_mean(tf.reduce_prod(sample_leaped ** powers,
                                                   axis=-1))
@@ -107,51 +122,57 @@ def sample(dim, num_samples=None, sample_indices=None, dtype=None, name=None):
   Args:
     dim: Positive Python `int` representing each sample's `event_size.` Must
       not be greater than 1000.
-    num_samples: (Optional) positive Python `int`. The number of samples to
-      generate. Either this parameter or sample_indices must be specified but
+    num_results: (Optional) positive Python `int`. The number of samples to
+      generate. Either this parameter or sequence_indices must be specified but
       not both. If this parameter is None, then the behaviour is determined by
-      the `sample_indices`.
-    sample_indices: (Optional) `Tensor` of dtype int32 and rank 1. The elements
-      of the sequence to compute specified by their position in the sequence.
-      The entries index into the Halton sequence starting with 0 and hence,
-      must be whole numbers. For example, sample_indices=[0, 5, 6] will produce
-      the first, sixth and seventh elements of the sequence. If this parameter
-      is None, then the `num_samples` parameter must be specified which gives
-      the number of desired samples starting from the first sample.
+      the `sequence_indices`.
+    sequence_indices: (Optional) `Tensor` of dtype int32 and rank 1. The
+      elements of the sequence to compute specified by their position in the
+      sequence. The entries index into the Halton sequence starting with 0 and
+      hence, must be whole numbers. For example, sequence_indices=[0, 5, 6] will
+      produce the first, sixth and seventh elements of the sequence. If this
+      parameter is None, then the `num_results` parameter must be specified
+      which gives the number of desired samples starting from the first sample.
     dtype: (Optional) The dtype of the sample. One of `float32` or `float64`.
       Default is `float32`.
+    randomized: (Optional) bool indicating whether to produce a randomized
+      Halton sequence. If True, applies the randomization described in
+      Owen (2017) [arXiv:1706.02808].
+    seed: (Optional) Python integer to seed the random number generator. Only
+      used if `randomized` is True. If not supplied and `randomized` is True,
+      no seed is set.
     name:  (Optional) Python `str` describing ops managed by this function. If
     not supplied the name of this function is used.
 
   Returns:
     halton_elements: Elements of the Halton sequence. `Tensor` of supplied dtype
-    and `shape` `[num_samples, dim]` if `num_samples` was specified or shape
-    `[s, dim]` where s is the size of `sample_indices` if `sample_indices`
+    and `shape` `[num_results, dim]` if `num_results` was specified or shape
+    `[s, dim]` where s is the size of `sequence_indices` if `sequence_indices`
     were specified.
 
   Raises:
-    ValueError: if both `sample_indices` and `num_samples` were specified or
+    ValueError: if both `sequence_indices` and `num_results` were specified or
     if dimension `dim` is less than 1 or greater than 1000.
   """
   if dim < 1 or dim > _MAX_DIMENSION:
     raise ValueError(
         'Dimension must be between 1 and {}. Supplied {}'.format(_MAX_DIMENSION,
                                                                  dim))
-  if (num_samples is None) == (sample_indices is None):
-    raise ValueError('Either `num_samples` or `sample_indices` must be'
+  if (num_results is None) == (sequence_indices is None):
+    raise ValueError('Either `num_results` or `sequence_indices` must be'
                      ' specified but not both.')
 
   dtype = dtype or dtypes.float32
   if not dtype.is_floating:
     raise ValueError('dtype must be of `float`-type')
 
-  with ops.name_scope(name, 'sample', values=[sample_indices]):
+  with ops.name_scope(name, 'sample', values=[sequence_indices]):
     # Here and in the following, the shape layout is as follows:
     # [sample dimension, event dimension, coefficient dimension].
     # The coefficient dimension is an intermediate axes which will hold the
     # weights of the starting integer when expressed in the (prime) base for
     # an event dimension.
-    indices = _get_indices(num_samples, sample_indices, dtype)
+    indices = _get_indices(num_results, sequence_indices, dtype)
     radixes = array_ops.constant(_PRIMES[0:dim], dtype=dtype, shape=[dim, 1])
 
     max_sizes_by_axes = _base_expansion_size(math_ops.reduce_max(indices),
@@ -176,11 +197,74 @@ def sample(dim, num_samples=None, sample_indices=None, dtype=None, name=None):
     weights = radixes ** capped_exponents
     coeffs = math_ops.floor_div(indices, weights)
     coeffs *= 1 - math_ops.cast(weight_mask, dtype)
-    coeffs = (coeffs % radixes) / radixes
-    return math_ops.reduce_sum(coeffs / weights, axis=-1)
+    coeffs %= radixes
+    if not randomized:
+      coeffs /= radixes
+      return math_ops.reduce_sum(coeffs / weights, axis=-1)
+    coeffs = _randomize(coeffs, radixes, seed=seed)
+    coeffs *= 1 - math_ops.cast(weight_mask, dtype)
+    coeffs /= radixes
+    base_values = math_ops.reduce_sum(coeffs / weights, axis=-1)
+
+    # The randomization used in Owen (2017) does not leave 0 invariant. While
+    # we have accounted for the randomization of the first `max_size_by_axes`
+    # coefficients, we still need to correct for the trailing zeros. Luckily,
+    # this is equivalent to adding a uniform random value scaled so the first
+    # `max_size_by_axes` coefficients are zero. The following statements perform
+    # this correction.
+    zero_correction = random_ops.random_uniform([dim, 1], seed=seed,
+                                                dtype=dtype)
+    zero_correction /= (radixes ** max_sizes_by_axes)
+    return base_values + array_ops.reshape(zero_correction, [-1])
+
+
+def _randomize(coeffs, radixes, seed=None):
+  """Applies the Owen randomization to the coefficients."""
+  given_dtype = coeffs.dtype
+  coeffs = math_ops.to_int32(coeffs)
+  num_coeffs = array_ops.shape(coeffs)[-1]
+  radixes = array_ops.reshape(math_ops.to_int32(radixes), [-1])
+  perms = _get_permutations(num_coeffs, radixes, seed=seed)
+  perms = array_ops.reshape(perms, [-1])
+  radix_sum = math_ops.reduce_sum(radixes)
+  radix_offsets = array_ops.reshape(math_ops.cumsum(radixes, exclusive=True),
+                                    [-1, 1])
+  offsets = radix_offsets + math_ops.range(num_coeffs) * radix_sum
+  permuted_coeffs = array_ops.gather(perms, coeffs + offsets)
+  return math_ops.cast(permuted_coeffs, dtype=given_dtype)
+
+
+def _get_permutations(num_results, dims, seed=None):
+  """Uniform iid sample from the space of permutations.
+
+  Draws a sample of size `num_results` from the group of permutations of degrees
+  specified by the `dims` tensor. These are packed together into one tensor
+  such that each row is one sample from each of the dimensions in `dims`. For
+  example, if dims = [2,3] and num_results = 2, the result is a tensor of shape
+  [2, 2 + 3] and the first row of the result might look like:
+  [1, 0, 2, 0, 1]. The first two elements are a permutation over 2 elements
+  while the next three are a permutation over 3 elements.
 
+  Args:
+    num_results: A positive scalar `Tensor` of integral type. The number of
+      draws from the discrete uniform distribution over the permutation groups.
+    dims: A 1D `Tensor` of the same dtype as `num_results`. The degree of the
+      permutation groups from which to sample.
+    seed: (Optional) Python integer to seed the random number generator.
 
-def _get_indices(n, sample_indices, dtype, name=None):
+  Returns:
+    permutations: A `Tensor` of shape `[num_results, sum(dims)]` and the same
+    dtype as `dims`.
+  """
+  sample_range = math_ops.range(num_results)
+  def generate_one(d):
+    fn = lambda _: random_ops.random_shuffle(math_ops.range(d), seed=seed)
+    return functional_ops.map_fn(fn, sample_range)
+  return array_ops.concat([generate_one(d) for d in array_ops.unstack(dims)],
+                          axis=-1)
+
+
+def _get_indices(n, sequence_indices, dtype, name=None):
   """Generates starting points for the Halton sequence procedure.
 
   The k'th element of the sequence is generated starting from a positive integer
@@ -191,10 +275,10 @@ def _get_indices(n, sample_indices, dtype, name=None):
 
   Args:
     n: Positive `int`. The number of samples to generate. If this
-      parameter is supplied, then `sample_indices` should be None.
-    sample_indices: `Tensor` of dtype int32 and rank 1. The entries
+      parameter is supplied, then `sequence_indices` should be None.
+    sequence_indices: `Tensor` of dtype int32 and rank 1. The entries
       index into the Halton sequence starting with 0 and hence, must be whole
-      numbers. For example, sample_indices=[0, 5, 6] will produce the first,
+      numbers. For example, sequence_indices=[0, 5, 6] will produce the first,
       sixth and seventh elements of the sequence. If this parameter is not None
       then `n` must be None.
     dtype: The dtype of the sample. One of `float32` or `float64`.
@@ -204,14 +288,14 @@ def _get_indices(n, sample_indices, dtype, name=None):
   Returns:
     indices: `Tensor` of dtype `dtype` and shape = `[n, 1, 1]`.
   """
-  with ops.name_scope(name, 'get_indices', [n, sample_indices]):
-    if sample_indices is None:
-      sample_indices = math_ops.range(n, dtype=dtype)
+  with ops.name_scope(name, '_get_indices', [n, sequence_indices]):
+    if sequence_indices is None:
+      sequence_indices = math_ops.range(n, dtype=dtype)
     else:
-      sample_indices = math_ops.cast(sample_indices, dtype)
+      sequence_indices = math_ops.cast(sequence_indices, dtype)
 
     # Shift the indices so they are 1 based.
-    indices = sample_indices + 1
+    indices = sequence_indices + 1
 
     # Reshape to make space for the event dimension and the place value
     # coefficients.
@@ -261,4 +345,5 @@ def _primes_less_than(n):
 
 _PRIMES = _primes_less_than(7919+1)
 
+
 assert len(_PRIMES) == _MAX_DIMENSION
-- 
GitLab


From 7f1f8b6f75c03c80dd153d508ffe255da1b151c1 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 8 Feb 2018 17:02:48 -0800
Subject: [PATCH 886/966] Add tf_export decorators back to gen_*_ops.py files.

PiperOrigin-RevId: 185075262
---
 tensorflow/python/eager/python_eager_op_gen.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index 34e1e4cd8f..e6d03297e0 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -641,6 +641,7 @@ void GenEagerPythonOp::AddEagerFunctionTeardown(
 bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
     const string& parameters, const std::vector<string>& output_sizes,
     const string& eager_not_allowed_error) {
+  AddExport();
   AddDefLine(function_name_, parameters);
   AddDocStringDescription();
   AddDocStringArgs();
-- 
GitLab


From 72b1a058613c26938a57670b3f32e29ba0e58d23 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Thu, 8 Feb 2018 23:17:54 -0800
Subject: [PATCH 887/966] Only convert format if input is of layout-agnostic
 type.

PiperOrigin-RevId: 185103227
---
 .../grappler/optimizers/layout_optimizer.cc   | 43 ++++++++++++-------
 .../python/grappler/layout_optimizer_test.py  | 31 +++++++++++++
 2 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 433b3564fe..e1a2d65278 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -1400,39 +1400,44 @@ class HistogramSummaryProcessor : public AgnosticNodeProcessor {
 class IdentityNProcessor : public AgnosticNodeProcessor {
  public:
   explicit IdentityNProcessor(const OptimizeContext& opt_cxt)
-      : AgnosticNodeProcessor(opt_cxt) {}
-
- protected:
-  bool ShouldProcess() const override {
-    return !MustPreserve() && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           IsOnGPU();
-  }
-
-  std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos;
+      : AgnosticNodeProcessor(opt_cxt) {
+    std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
     for (int i = 0; i < node_->input_size(); i++) {
       auto input = node_map_->GetNode(node_->input(i));
       int port;
       ParseNodeName(node_->input(i), &port);
       // Skip control input.
       if (port != -1) {
+        bool is_agnostic =
+            ops_format_agnostic.find(input->op()) != ops_format_agnostic.end();
         if (IsPortDimsFour(*input, port) &&
-            (IsNodeAfterNCHWToNHWC(*input) ||
+            ((IsNodeAfterNCHWToNHWC(*input) && is_agnostic) ||
              IsTransposeNCHWToNHWC(input->name()))) {
-          input_pos.push_back(i);
+          input_pos_.push_back(i);
         }
       }
     }
-    return input_pos;
   }
 
+ protected:
+  bool ShouldProcess() const override {
+    return !MustPreserve() && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+           IsOnGPU();
+  }
+
+  std::vector<int> GetInputPos() const override { return input_pos_; }
+
   std::set<int> GetOutputPos() const override {
+    std::vector<int> input_poses;
     std::set<int> output_pos{};
-    for (const auto& input_pos : GetInputPos()) {
+    for (const auto& input_pos : input_pos_) {
       output_pos.insert(input_pos);
     }
     return output_pos;
   }
+
+ private:
+  std::vector<int> input_pos_;
 };
 
 class ShapeProcessor : public IdentityNProcessor {
@@ -1471,10 +1476,16 @@ class MergeProcessor : public AgnosticNodeProcessor {
 
  private:
   bool IsEveryInputAfterNCHWToNHWC() const {
+    std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
     for (const auto& input : node_->input()) {
       auto input_node = node_map_->GetNode(input);
-      if (IsNodeAfterNCHWToNHWC(*input_node) ||
-          IsTransposeNCHWToNHWC(input_node->name())) {
+      int port;
+      ParseNodeName(input, &port);
+      bool is_agnostic = ops_format_agnostic.find(input_node->op()) !=
+                         ops_format_agnostic.end();
+      if (IsPortDimsFour(*input_node, port) &&
+          ((IsNodeAfterNCHWToNHWC(*input_node) && is_agnostic) ||
+           IsTransposeNCHWToNHWC(input_node->name()))) {
         continue;
       }
       return false;
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 5bc9e4b803..25b1cdcbc5 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -1127,6 +1127,37 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_vec_nchw_to_nhwc('ShapeN-0-0', nodes)
       self.assertAllEqual(output_val_ref, output_val)
 
+  def testShapeNFollowedByNotConvertibleNodeReshape(self):
+    if test.is_gpu_available(cuda_only=True):
+      x = array_ops.placeholder(dtype='float32')
+      conv = _two_layer_model(x)
+      conv_reshape = array_ops.reshape(conv, [1, 1, 1, -1])
+      shapen = array_ops.shape_n([conv, conv_reshape])
+      shape = array_ops.identity(shapen[1])
+      ones = array_ops.ones(shape)
+      output = math_ops.add_n([conv_reshape, ones])
+
+      x_val = [1.7] * 784
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={x: x_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={x: x_val})
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self.assertAllEqual(output_val_ref, output_val)
+
   def testLoop(self):
     if test.is_gpu_available(cuda_only=True):
       output = _loop()
-- 
GitLab


From ca6eb7ef3b6f73d125ffb8b9e4e1a4f2d31f27fd Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Fri, 9 Feb 2018 07:53:43 -0800
Subject: [PATCH 888/966] Make build happy by adding missing OP_REQUIRES_OK and
 if_tensorrt

---
 tensorflow/contrib/tensorrt/BUILD             | 22 ++++++++++++-------
 .../contrib/tensorrt/kernels/trt_engine_op.cc |  5 +++--
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 3fb4553a51..91d58dfa75 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -52,8 +52,9 @@ tf_custom_op_library(
         ":trt_engine_op_kernel",
         ":trt_shape_function",
         "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
-    ],
+    ]),
 )
 
 tf_cuda_library(
@@ -63,8 +64,9 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = [
         ":trt_logging",
+    ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
-    ] + tf_custom_op_library_additional_deps(),
+    ]) + tf_custom_op_library_additional_deps(),
 )
 
 cc_library(
@@ -77,16 +79,17 @@ cc_library(
         "//tensorflow/core:gpu_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:stream_executor_headers_lib",
+    ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
-    ] + tf_custom_op_library_additional_deps(),
+    ]) + tf_custom_op_library_additional_deps(),
     alwayslink = 1,
 )
 
 tf_gen_op_libs(
     op_lib_names = ["trt_engine_op"],
-    deps = [
+    deps = if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
-    ],
+    ]),
 )
 
 tf_cuda_library(
@@ -96,8 +99,9 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
-    ],
+    ]),
 )
 
 tf_gen_op_wrapper_py(
@@ -114,8 +118,9 @@ tf_custom_op_py_library(
     srcs = ["python/ops/trt_engine_op.py"],
     dso = [
         ":python/ops/_trt_engine_op.so",
+    ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
-    ],
+    ]),
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -187,8 +192,9 @@ tf_cuda_library(
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/optimizers:constant_folding",
         "//tensorflow/core/grappler/optimizers:layout_optimizer",
+    ] + if_tensorrt([
         "@local_config_tensorrt//:nv_infer",
-    ] + tf_custom_op_library_additional_deps(),
+    ]) + tf_custom_op_library_additional_deps(),
 )
 
 # Library for the segmenting portion of TensorRT operation creation
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 1f25048f83..8efdf63ebe 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -96,8 +96,9 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
       std::vector<int> trt_shape(dims.nbDims + 1);
       trt_shape[0] = num_batch;
       for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j];
-      TensorShapeUtils::MakeShape(trt_shape.data(), trt_shape.size(),
-                                  &output_shape);
+      OP_REQUIRES_OK(context,
+                     TensorShapeUtils::MakeShape(
+                         trt_shape.data(), trt_shape.size(), &output_shape));
     } else {
       LOG(FATAL) << "output node not found, at " << output_nodes_[i];
       break;
-- 
GitLab


From 09aea16dcb347a6b01c6446da16a553ec260e21d Mon Sep 17 00:00:00 2001
From: Sukriti Ramesh <sukritiramesh@google.com>
Date: Fri, 9 Feb 2018 07:55:46 -0800
Subject: [PATCH 889/966] Comment update.

PiperOrigin-RevId: 185141668
---
 tensorflow/python/estimator/estimator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 5d36108bbf..7bf838e5a0 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -617,7 +617,6 @@ class Estimator(object):
             sharded=True)
         saver_for_restore.restore(session, checkpoint_path)
 
-        # TODO(b/36111876): replace legacy_init_op with main_op mechanism
         # pylint: disable=protected-access
         local_init_op = (
             estimator_spec.scaffold.local_init_op or
-- 
GitLab


From 9b9567adccde7ab08d683b5b64a8870e74635f62 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Fri, 9 Feb 2018 08:46:15 -0800
Subject: [PATCH 890/966] Fix more build dependencies and includes.

---
 tensorflow/contrib/tensorrt/BUILD                    | 9 +++++++++
 tensorflow/contrib/tensorrt/convert/convert_graph.cc | 2 --
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 91d58dfa75..cf67c27b70 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -184,8 +184,13 @@ tf_cuda_library(
     deps = [
         ":segment",
         ":trt_logging",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core:framework",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
@@ -208,6 +213,8 @@ cc_library(
     linkstatic = 1,
     deps = [
         "//tensorflow/core:graph",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
         "@protobuf_archive//:protobuf_headers",
     ],
 )
@@ -219,6 +226,8 @@ tf_cc_test(
     deps = [
         ":segment",
         "//tensorflow/c:c_api",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index e3364467f7..f81752907d 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -23,8 +23,6 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-- 
GitLab


From 2621012aaddd655c642ac347fe4b76184bf03f90 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 9 Feb 2018 09:12:19 -0800
Subject: [PATCH 891/966] Update for LLVM API change r324700

PiperOrigin-RevId: 185149198
---
 .../xla/service/cpu/simple_orc_jit.cc         | 25 +++++++++----------
 .../compiler/xla/service/cpu/simple_orc_jit.h | 10 ++++----
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index acbb7557fc..2f4468cca7 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -180,19 +180,18 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
           << " features: " << target_machine_->getTargetFeatureString().str();
 }
 
-SimpleOrcJIT::ModuleHandleT SimpleOrcJIT::AddModule(
+SimpleOrcJIT::VModuleKeyT SimpleOrcJIT::AddModule(
     std::unique_ptr<llvm::Module> module) {
-  auto handle = cantFail(compile_layer_.addModule(
-      execution_session_.allocateVModule(), std::move(module)));
-  module_handles_.push_back(handle);
-  return handle;
+  auto key = execution_session_.allocateVModule();
+  cantFail(compile_layer_.addModule(key, std::move(module)));
+  module_keys_.push_back(key);
+  return key;
 }
 
-void SimpleOrcJIT::RemoveModule(SimpleOrcJIT::ModuleHandleT handle) {
-  module_handles_.erase(
-      std::remove(module_handles_.begin(), module_handles_.end(), handle),
-      module_handles_.end());
-  cantFail(compile_layer_.removeModule(handle));
+void SimpleOrcJIT::RemoveModule(SimpleOrcJIT::VModuleKeyT key) {
+  module_keys_.erase(std::remove(module_keys_.begin(), module_keys_.end(), key),
+                     module_keys_.end());
+  cantFail(compile_layer_.removeModule(key));
 }
 
 llvm::JITSymbol SimpleOrcJIT::FindSymbol(const std::string& name) {
@@ -204,10 +203,10 @@ llvm::JITSymbol SimpleOrcJIT::FindSymbol(const std::string& name) {
 
   // Resolve symbol from last module to first, allowing later redefinitions of
   // symbols shadow earlier ones.
-  for (auto& handle :
-       llvm::make_range(module_handles_.rbegin(), module_handles_.rend())) {
+  for (auto& key :
+       llvm::make_range(module_keys_.rbegin(), module_keys_.rend())) {
     if (auto symbol =
-            compile_layer_.findSymbolIn(handle, mangled_name,
+            compile_layer_.findSymbolIn(key, mangled_name,
                                         /*ExportedSymbolsOnly=*/true)) {
       return symbol;
     }
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index 0f6456170f..50993afc8f 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -50,7 +50,7 @@ class SimpleOrcJIT {
       std::function<llvm::object::OwningBinary<llvm::object::ObjectFile>(
           llvm::Module&)>;
   using CompileLayerT = llvm::orc::IRCompileLayer<ObjLayerT, CompileFtor>;
-  using ModuleHandleT = CompileLayerT::ModuleHandleT;
+  using VModuleKeyT = llvm::orc::VModuleKey;
 
   // Create a new JIT, targeting the host architecture.
   // The |target_options| parameter allows customization of certain code
@@ -80,12 +80,12 @@ class SimpleOrcJIT {
     return target_machine_->getTargetTriple();
   }
 
-  // Add a module to the JIT. Returns an opaque handle that can be used to later
+  // Add a module to the JIT. Returns an opaque key that can be used to later
   // remove this module.
-  ModuleHandleT AddModule(std::unique_ptr<llvm::Module> module);
+  VModuleKeyT AddModule(std::unique_ptr<llvm::Module> module);
 
   // Remove a module from the JIT and free the memory associated with it.
-  void RemoveModule(ModuleHandleT handle);
+  void RemoveModule(VModuleKeyT key);
 
   // Get the runtime address of the compiled symbol whose name is given. Returns
   // nullptr if the symbol cannot be found.
@@ -98,7 +98,7 @@ class SimpleOrcJIT {
   }
 
  private:
-  std::vector<ModuleHandleT> module_handles_;
+  std::vector<VModuleKeyT> module_keys_;
   std::unique_ptr<llvm::TargetMachine> target_machine_;
   const Disassembler disassembler_;
   const llvm::DataLayout data_layout_;
-- 
GitLab


From 5e6e691ba320fba71aec0a426ff6e5608ddcad1d Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Fri, 9 Feb 2018 09:25:10 -0800
Subject: [PATCH 892/966] TFTS: Add exporting to SavedModel to the LSTM example

Was previously not using a state manager, which among other issues prevented
exporting.

Fixes #16590.

PiperOrigin-RevId: 185150900
---
 .../contrib/timeseries/examples/lstm.py       | 33 +++++++++++++++++--
 .../contrib/timeseries/examples/lstm_test.py  |  3 +-
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py
index c834430b95..630f4fc057 100644
--- a/tensorflow/contrib/timeseries/examples/lstm.py
+++ b/tensorflow/contrib/timeseries/examples/lstm.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 
 import functools
 from os import path
+import tempfile
 
 import numpy
 import tensorflow as tf
 
 from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators
 from tensorflow.contrib.timeseries.python.timeseries import model as ts_model
+from tensorflow.contrib.timeseries.python.timeseries import state_management
 
 try:
   import matplotlib  # pylint: disable=g-import-not-at-top
@@ -70,7 +72,7 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
     self._lstm_cell_run = None
     self._predict_from_lstm_output = None
 
-  def initialize_graph(self, input_statistics):
+  def initialize_graph(self, input_statistics=None):
     """Save templates for components, which can then be used repeatedly.
 
     This method is called every time a new graph is created. It's safe to start
@@ -168,12 +170,15 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
 
 
 def train_and_predict(
-    csv_file_name=_DATA_FILE, training_steps=200, estimator_config=None):
+    csv_file_name=_DATA_FILE, training_steps=200, estimator_config=None,
+    export_directory=None):
   """Train and predict using a custom time series model."""
   # Construct an Estimator from our LSTM model.
   estimator = ts_estimators.TimeSeriesRegressor(
       model=_LSTMModel(num_features=5, num_units=128),
-      optimizer=tf.train.AdamOptimizer(0.001), config=estimator_config)
+      optimizer=tf.train.AdamOptimizer(0.001), config=estimator_config,
+      # Set state to be saved across windows.
+      state_manager=state_management.ChainingStateManager())
   reader = tf.contrib.timeseries.CSVReader(
       csv_file_name,
       column_names=((tf.contrib.timeseries.TrainEvalFeatures.TIMES,)
@@ -192,6 +197,28 @@ def train_and_predict(
   predicted_mean = numpy.squeeze(numpy.concatenate(
       [evaluation["mean"][0], predictions["mean"]], axis=0))
   all_times = numpy.concatenate([times, predictions["times"]], axis=0)
+
+  # Export the model in SavedModel format.
+  if export_directory is None:
+    export_directory = tempfile.mkdtemp()
+  input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
+  export_location = estimator.export_savedmodel(
+      export_directory, input_receiver_fn)
+
+  # Predict using the SavedModel
+  with tf.Graph().as_default():
+    with tf.Session() as session:
+      signatures = tf.saved_model.loader.load(
+          session, [tf.saved_model.tag_constants.SERVING], export_location)
+      saved_model_output = (
+          tf.contrib.timeseries.saved_model_utils.predict_continuation(
+              continue_from=evaluation, signatures=signatures,
+              session=session, steps=100))
+      # The exported model gives the same results as the Estimator.predict()
+      # call above.
+      numpy.testing.assert_allclose(
+          predictions["mean"],
+          numpy.squeeze(saved_model_output["mean"], axis=0))
   return times, observed, all_times, predicted_mean
 
 
diff --git a/tensorflow/contrib/timeseries/examples/lstm_test.py b/tensorflow/contrib/timeseries/examples/lstm_test.py
index 3cace56726..ca56e38ca0 100644
--- a/tensorflow/contrib/timeseries/examples/lstm_test.py
+++ b/tensorflow/contrib/timeseries/examples/lstm_test.py
@@ -36,7 +36,8 @@ class LSTMExampleTest(test.TestCase):
   def test_periodicity_learned(self):
     (observed_times, observed_values,
      all_times, predicted_values) = lstm.train_and_predict(
-         training_steps=100, estimator_config=_SeedRunConfig())
+         training_steps=100, estimator_config=_SeedRunConfig(),
+         export_directory=self.get_temp_dir())
     self.assertAllEqual([100], observed_times.shape)
     self.assertAllEqual([100, 5], observed_values.shape)
     self.assertAllEqual([200], all_times.shape)
-- 
GitLab


From cf3c77f683b6bc27408a58ca9eb7ea5afb4568f6 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Fri, 9 Feb 2018 09:47:58 -0800
Subject: [PATCH 893/966] Fix LINT.IfChange comment

---
 tensorflow/contrib/tensorrt/convert/convert_graph.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index f81752907d..899448004f 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -56,8 +56,7 @@ static bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
       "Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu",
       "Add",      "Mul",   "Sub",    "Rsqrt",   "Pad"  // "Placeholder" ,"Mean"
   };
-  // LINT.ThenChange(
-  //    https://www.tensorflow.org/code/tensorflow/contrib/tensorrt/convert/convert_nodes.h)
+  // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
   return candidate_ops.count(node_def.op());
 }
 
-- 
GitLab


From 81b698d0dad88b0bb33af720247a3eba1d248e9d Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Fri, 9 Feb 2018 10:13:05 -0800
Subject: [PATCH 894/966] TFTS: Better handling of exogenous features

Adds (dummy) exogenous features to the LSTM model-building example, and adds some small methods needed to support that (fetching the shape of embedded exogenous features).

Also makes it more automatic to export a SavedModel with exogenous features (placeholder shapes will be inferred from the given FeatureColumns), which makes the LSTM example friendlier.

PiperOrigin-RevId: 185157085
---
 .../examples/data/multivariate_periods.csv    | 200 +++++++++---------
 .../contrib/timeseries/examples/lstm.py       |  56 +++--
 .../python/timeseries/estimators.py           |  34 +--
 .../timeseries/python/timeseries/model.py     |  23 ++
 .../state_space_model_test.py                 |  14 +-
 5 files changed, 191 insertions(+), 136 deletions(-)

diff --git a/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv b/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv
index 02a60d1cf6..b49a0662c2 100644
--- a/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv
+++ b/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv
@@ -1,100 +1,100 @@
-0,0.926906299771,1.99107237682,2.56546245685,3.07914768197,4.04839057867
-1,0.108010001864,1.41645361423,2.1686839775,2.94963962176,4.1263503303
-2,-0.800567600028,1.0172132907,1.96434754116,2.99885333086,4.04300485864
-3,0.0607042871898,0.719540073421,1.9765012584,2.89265588817,4.0951014426
-4,0.933712200629,0.28052120776,1.41018552514,2.69232603996,4.06481164223
-5,-0.171730652974,0.260054421028,1.48770816369,2.62199129293,4.44572807842
-6,-1.00180162933,0.333045158863,1.50006392277,2.88888309683,4.24755865606
-7,0.0580061875336,0.688929398826,1.56543458772,2.99840358953,4.52726873347
-8,0.764139447412,1.24704875327,1.77649279698,3.13578593851,4.63238922951
-9,-0.230331874785,1.47903998963,2.03547545751,3.20624030377,4.77980005228
-10,-1.03846045211,2.01133000781,2.31977503972,3.67951536251,5.09716775897
-11,0.188643592253,2.23285349038,2.68338482249,3.49817168611,5.24928239634
-12,0.91207302309,2.24244446841,2.71362604985,3.96332587625,5.37802271594
-13,-0.296588665881,2.02594634141,3.07733910479,3.99698324956,5.56365901394
-14,-0.959961476551,1.45078629833,3.18996420137,4.3763059609,5.65356015609
-15,0.46313530679,1.01141441548,3.4980215948,4.20224896882,5.88842247449
-16,0.929354125798,0.626635305936,3.70508262244,4.51791573544,5.73945973251
-17,-0.519110731957,0.269249223148,3.39866823332,4.46802003061,5.82768174382
-18,-0.924330981367,0.349602834684,3.21762413294,4.72803587499,5.94918925767
-19,0.253239387885,0.345158023497,3.11071425333,4.79311566935,5.9489259713
-20,0.637408390225,0.698996675371,3.25232492145,4.73814732384,5.9612010251
-21,-0.407396859412,1.17456342803,2.49526823723,4.59323415742,5.82501686811
-22,-0.967485452118,1.66655933642,2.47284606244,4.58316034754,5.88721406681
-23,0.474480867904,1.95018556323,2.0228950072,4.48651142819,5.8255943735
-24,1.04309652155,2.23519892356,1.91924131572,4.19094661783,5.87457348436
-25,-0.517861513772,2.12501967336,1.70266619979,4.05280882887,5.72160912899
-26,-0.945301585146,1.65464653549,1.81567174251,3.92309850635,5.58270493814
-27,0.501153868974,1.40600764889,1.53991387719,3.72853247942,5.60169001727
-28,0.972859524418,1.00344321868,1.5175642828,3.64092376655,5.10567722582
-29,-0.70553406135,0.465306263885,1.7038540803,3.33236870312,5.09182481555
-30,-0.946093634916,0.294539309453,1.88052827037,2.93011492669,4.97354922696
-31,0.47922123231,0.308465865031,2.03445883031,2.90772899045,4.86241793548
-32,0.754030014252,0.549752241167,2.46115815089,2.95063349534,4.71834614627
-33,-0.64875949826,0.894615488148,2.5922463381,2.81269864022,4.43480095104
-34,-0.757829951086,1.39123914261,2.69258079904,2.61834837315,4.36580046156
-35,0.565653301088,1.72360022693,2.97794913834,2.80403840334,4.27327248459
-36,0.867440092372,2.21100730052,3.38648090792,2.84057515729,4.12210169576
-37,-0.894567758095,2.17549105818,3.45532493329,2.90446025717,4.00251740584
-38,-0.715442356893,2.15105389965,3.52041791902,3.03650393392,4.12809249577
-39,0.80671703672,1.81504564517,3.60463324866,3.00747789871,3.98440762467
-40,0.527014790142,1.31803513865,3.43842186337,3.3332594663,4.03232406566
-41,-0.795936862129,0.847809114454,3.09875133548,3.52863155938,3.94883924909
-42,-0.610245806946,0.425530441018,2.92581949152,3.77238736123,4.27287245021
-43,0.611662279431,0.178432049837,2.48128214822,3.73212087883,4.17319013831
-44,0.650866553108,0.220341648392,2.41694642022,4.2609098519,4.27271645905
-45,-0.774156982023,0.632667602331,2.05474356052,4.32889204886,4.18029723271
-46,-0.714058448409,0.924562377599,1.75706135146,4.52492718422,4.3972678094
-47,0.889627293379,1.46207968841,1.78299357672,4.64466731095,4.56317887554
-48,0.520140662861,1.8996333843,1.41377633823,4.48899091177,4.78805049769
-49,-1.03816935616,2.08997002059,1.51218375351,4.84167764204,4.93026048606
-50,-0.40772951362,2.30878972136,1.44144415128,4.76854460997,5.01538444629
-51,0.792730684781,1.91367048509,1.58887384677,4.71739397335,5.25690012199
-52,0.371311881576,1.67565079528,1.81688563053,4.60353107555,5.44265822961
-53,-0.814398070371,1.13374634126,1.80328814859,4.72264252878,5.52674761122
-54,-0.469017949323,0.601244136627,2.29690896736,4.49859178859,5.54126153454
-55,0.871044371426,0.407597593794,2.7499112487,4.19060637761,5.57693767301
-56,0.523764933017,0.247705192709,3.09002071379,4.02095509006,5.80510362182
-57,-0.881326403531,0.31513103164,3.11358205718,3.96079100808,5.81000652365
-58,-0.357928025339,0.486163915865,3.17884556771,3.72634990659,5.85693642011
-59,0.853038779822,1.04218094475,3.45835384454,3.36703969978,5.9585988449
-60,0.435311516013,1.59715085283,3.63313338588,3.11276729421,5.93643818229
-61,-1.02703719138,1.92205832542,3.47606111735,3.06247155999,6.02106646259
-62,-0.246661325557,2.14653802542,3.29446326567,2.89936259181,5.67531541272
-63,1.02554736569,2.25943737733,3.07031591528,2.78176218013,5.78206328989
-64,0.337814475969,2.07589147224,2.80356226089,2.55888206331,5.7094075496
-65,-1.12023369929,1.25333011618,2.56497288445,2.77361359194,5.50799418376
-66,-0.178980246554,1.11937139901,2.51598681313,2.91438309151,5.47469577206
-67,0.97550951531,0.60553823137,2.11657741073,2.88081098981,5.37034999502
-68,0.136653357206,0.365828836075,1.97386033165,3.13217903204,5.07254490219
-69,-1.05607596951,0.153152115069,1.52110743825,3.01308794192,5.08902539125
-70,-0.13095280331,0.337113974483,1.52703079853,3.16687131599,4.86649398514
-71,1.07081057754,0.714247566736,1.53761382634,3.45151989484,4.75892309166
-72,0.0153410376082,1.24631231847,1.61690939161,3.85481994498,4.35683752832
-73,-0.912801257303,1.60791309476,1.8729264524,4.03037260012,4.36072588913
-74,-0.0894895640338,2.02535207407,1.93484909619,4.09557485132,4.35327025188
-75,0.978646999652,2.20085086625,2.09003440427,4.27542353033,4.1805058388
-76,-0.113312642876,2.2444100761,2.50789248839,4.4151861502,4.03267168136
-77,-1.00215099149,1.84305628445,2.61691237246,4.45425147595,3.81203553766
-78,-0.0183234614205,1.49573923116,2.99308471214,4.71134960112,4.0273804959
-79,1.0823738177,1.12211589848,3.27079386925,4.94288270502,4.01851068083
-80,0.124370187893,0.616474412808,3.4284236674,4.76942168327,3.9749536483
-81,-0.929423379352,0.290977090976,3.34131726136,4.78590392707,4.10190661656
-82,0.23766302648,0.155302052254,3.49779513794,4.64605656795,4.15571321107
-83,1.03531486192,0.359702776204,3.4880725919,4.48167586667,4.21134561991
-84,-0.261234571382,0.713877760378,3.42756426614,4.426443869,4.25208300527
-85,-1.03572442277,1.25001113691,2.96908341113,4.25500915322,4.25723010649
-86,0.380034261243,1.70543355622,2.73605932518,4.16703432307,4.63700400788
-87,1.03734873488,1.97544410562,2.55586572141,3.84976673263,4.55282864289
-88,-0.177344253372,2.22614526325,2.09565864891,3.77378097953,4.82577400298
-89,-0.976821526892,2.18385079177,1.78522284118,3.67768223554,5.06302440873
-90,0.264820472091,1.86981946157,1.50048403865,3.43619796921,5.05651761669
-91,1.05642344868,1.47568646076,1.51347671977,3.20898518885,5.50149047462
-92,-0.311607433358,1.04226467636,1.52089650905,3.02291865417,5.4889046232
-93,-0.724285777937,0.553052311957,1.48573560173,2.7365973598,5.72549174225
-94,0.519859192905,0.226520626591,1.61543723167,2.84102086852,5.69330622288
-95,1.0323195039,0.260873217055,1.81913034804,2.83951143848,5.90325028086
-96,-0.53285682538,0.387695521405,1.70935609313,2.57977050631,5.79579213161
-97,-0.975127997215,0.920948771589,2.51292643636,2.71004616612,5.87016469227
-98,0.540246804099,1.36445470181,2.61949412896,2.98482553485,6.02447664937
-99,0.987764008058,1.85581989607,2.84685706149,2.94760204892,6.0212151724
+0,0.926906299771,1.99107237682,2.56546245685,3.07914768197,4.04839057867,1.,0.
+1,0.108010001864,1.41645361423,2.1686839775,2.94963962176,4.1263503303,1.,0.
+2,-0.800567600028,1.0172132907,1.96434754116,2.99885333086,4.04300485864,1.,0.
+3,0.0607042871898,0.719540073421,1.9765012584,2.89265588817,4.0951014426,1.,0.
+4,0.933712200629,0.28052120776,1.41018552514,2.69232603996,4.06481164223,1.,0.
+5,-0.171730652974,0.260054421028,1.48770816369,2.62199129293,4.44572807842,1.,0.
+6,-1.00180162933,0.333045158863,1.50006392277,2.88888309683,4.24755865606,1.,0.
+7,0.0580061875336,0.688929398826,1.56543458772,2.99840358953,4.52726873347,1.,0.
+8,0.764139447412,1.24704875327,1.77649279698,3.13578593851,4.63238922951,1.,0.
+9,-0.230331874785,1.47903998963,2.03547545751,3.20624030377,4.77980005228,1.,0.
+10,-1.03846045211,2.01133000781,2.31977503972,3.67951536251,5.09716775897,1.,0.
+11,0.188643592253,2.23285349038,2.68338482249,3.49817168611,5.24928239634,1.,0.
+12,0.91207302309,2.24244446841,2.71362604985,3.96332587625,5.37802271594,1.,0.
+13,-0.296588665881,2.02594634141,3.07733910479,3.99698324956,5.56365901394,1.,0.
+14,-0.959961476551,1.45078629833,3.18996420137,4.3763059609,5.65356015609,1.,0.
+15,0.46313530679,1.01141441548,3.4980215948,4.20224896882,5.88842247449,1.,0.
+16,0.929354125798,0.626635305936,3.70508262244,4.51791573544,5.73945973251,1.,0.
+17,-0.519110731957,0.269249223148,3.39866823332,4.46802003061,5.82768174382,1.,0.
+18,-0.924330981367,0.349602834684,3.21762413294,4.72803587499,5.94918925767,1.,0.
+19,0.253239387885,0.345158023497,3.11071425333,4.79311566935,5.9489259713,1.,0.
+20,0.637408390225,0.698996675371,3.25232492145,4.73814732384,5.9612010251,1.,0.
+21,-0.407396859412,1.17456342803,2.49526823723,4.59323415742,5.82501686811,1.,0.
+22,-0.967485452118,1.66655933642,2.47284606244,4.58316034754,5.88721406681,1.,0.
+23,0.474480867904,1.95018556323,2.0228950072,4.48651142819,5.8255943735,1.,0.
+24,1.04309652155,2.23519892356,1.91924131572,4.19094661783,5.87457348436,1.,0.
+25,-0.517861513772,2.12501967336,1.70266619979,4.05280882887,5.72160912899,1.,0.
+26,-0.945301585146,1.65464653549,1.81567174251,3.92309850635,5.58270493814,1.,0.
+27,0.501153868974,1.40600764889,1.53991387719,3.72853247942,5.60169001727,1.,0.
+28,0.972859524418,1.00344321868,1.5175642828,3.64092376655,5.10567722582,1.,0.
+29,-0.70553406135,0.465306263885,1.7038540803,3.33236870312,5.09182481555,1.,0.
+30,-0.946093634916,0.294539309453,1.88052827037,2.93011492669,4.97354922696,1.,0.
+31,0.47922123231,0.308465865031,2.03445883031,2.90772899045,4.86241793548,1.,0.
+32,0.754030014252,0.549752241167,2.46115815089,2.95063349534,4.71834614627,1.,0.
+33,-0.64875949826,0.894615488148,2.5922463381,2.81269864022,4.43480095104,1.,0.
+34,-0.757829951086,1.39123914261,2.69258079904,2.61834837315,4.36580046156,1.,0.
+35,0.565653301088,1.72360022693,2.97794913834,2.80403840334,4.27327248459,1.,0.
+36,0.867440092372,2.21100730052,3.38648090792,2.84057515729,4.12210169576,1.,0.
+37,-0.894567758095,2.17549105818,3.45532493329,2.90446025717,4.00251740584,1.,0.
+38,-0.715442356893,2.15105389965,3.52041791902,3.03650393392,4.12809249577,1.,0.
+39,0.80671703672,1.81504564517,3.60463324866,3.00747789871,3.98440762467,1.,0.
+40,0.527014790142,1.31803513865,3.43842186337,3.3332594663,4.03232406566,1.,0.
+41,-0.795936862129,0.847809114454,3.09875133548,3.52863155938,3.94883924909,1.,0.
+42,-0.610245806946,0.425530441018,2.92581949152,3.77238736123,4.27287245021,1.,0.
+43,0.611662279431,0.178432049837,2.48128214822,3.73212087883,4.17319013831,1.,0.
+44,0.650866553108,0.220341648392,2.41694642022,4.2609098519,4.27271645905,1.,0.
+45,-0.774156982023,0.632667602331,2.05474356052,4.32889204886,4.18029723271,1.,0.
+46,-0.714058448409,0.924562377599,1.75706135146,4.52492718422,4.3972678094,1.,0.
+47,0.889627293379,1.46207968841,1.78299357672,4.64466731095,4.56317887554,1.,0.
+48,0.520140662861,1.8996333843,1.41377633823,4.48899091177,4.78805049769,1.,0.
+49,-1.03816935616,2.08997002059,1.51218375351,4.84167764204,4.93026048606,1.,0.
+50,-0.40772951362,2.30878972136,1.44144415128,4.76854460997,5.01538444629,1.,0.
+51,0.792730684781,1.91367048509,1.58887384677,4.71739397335,5.25690012199,1.,0.
+52,0.371311881576,1.67565079528,1.81688563053,4.60353107555,5.44265822961,1.,0.
+53,-0.814398070371,1.13374634126,1.80328814859,4.72264252878,5.52674761122,1.,0.
+54,-0.469017949323,0.601244136627,2.29690896736,4.49859178859,5.54126153454,1.,0.
+55,0.871044371426,0.407597593794,2.7499112487,4.19060637761,5.57693767301,1.,0.
+56,0.523764933017,0.247705192709,3.09002071379,4.02095509006,5.80510362182,1.,0.
+57,-0.881326403531,0.31513103164,3.11358205718,3.96079100808,5.81000652365,1.,0.
+58,-0.357928025339,0.486163915865,3.17884556771,3.72634990659,5.85693642011,1.,0.
+59,0.853038779822,1.04218094475,3.45835384454,3.36703969978,5.9585988449,1.,0.
+60,0.435311516013,1.59715085283,3.63313338588,3.11276729421,5.93643818229,1.,0.
+61,-1.02703719138,1.92205832542,3.47606111735,3.06247155999,6.02106646259,1.,0.
+62,-0.246661325557,2.14653802542,3.29446326567,2.89936259181,5.67531541272,1.,0.
+63,1.02554736569,2.25943737733,3.07031591528,2.78176218013,5.78206328989,1.,0.
+64,0.337814475969,2.07589147224,2.80356226089,2.55888206331,5.7094075496,1.,0.
+65,-1.12023369929,1.25333011618,2.56497288445,2.77361359194,5.50799418376,1.,0.
+66,-0.178980246554,1.11937139901,2.51598681313,2.91438309151,5.47469577206,1.,0.
+67,0.97550951531,0.60553823137,2.11657741073,2.88081098981,5.37034999502,1.,0.
+68,0.136653357206,0.365828836075,1.97386033165,3.13217903204,5.07254490219,1.,0.
+69,-1.05607596951,0.153152115069,1.52110743825,3.01308794192,5.08902539125,1.,0.
+70,-0.13095280331,0.337113974483,1.52703079853,3.16687131599,4.86649398514,1.,0.
+71,1.07081057754,0.714247566736,1.53761382634,3.45151989484,4.75892309166,1.,0.
+72,0.0153410376082,1.24631231847,1.61690939161,3.85481994498,4.35683752832,1.,0.
+73,-0.912801257303,1.60791309476,1.8729264524,4.03037260012,4.36072588913,1.,0.
+74,-0.0894895640338,2.02535207407,1.93484909619,4.09557485132,4.35327025188,1.,0.
+75,0.978646999652,2.20085086625,2.09003440427,4.27542353033,4.1805058388,1.,0.
+76,-0.113312642876,2.2444100761,2.50789248839,4.4151861502,4.03267168136,1.,0.
+77,-1.00215099149,1.84305628445,2.61691237246,4.45425147595,3.81203553766,1.,0.
+78,-0.0183234614205,1.49573923116,2.99308471214,4.71134960112,4.0273804959,1.,0.
+79,1.0823738177,1.12211589848,3.27079386925,4.94288270502,4.01851068083,1.,0.
+80,0.124370187893,0.616474412808,3.4284236674,4.76942168327,3.9749536483,1.,0.
+81,-0.929423379352,0.290977090976,3.34131726136,4.78590392707,4.10190661656,1.,0.
+82,0.23766302648,0.155302052254,3.49779513794,4.64605656795,4.15571321107,1.,0.
+83,1.03531486192,0.359702776204,3.4880725919,4.48167586667,4.21134561991,1.,0.
+84,-0.261234571382,0.713877760378,3.42756426614,4.426443869,4.25208300527,1.,0.
+85,-1.03572442277,1.25001113691,2.96908341113,4.25500915322,4.25723010649,1.,0.
+86,0.380034261243,1.70543355622,2.73605932518,4.16703432307,4.63700400788,1.,0.
+87,1.03734873488,1.97544410562,2.55586572141,3.84976673263,4.55282864289,1.,0.
+88,-0.177344253372,2.22614526325,2.09565864891,3.77378097953,4.82577400298,1.,0.
+89,-0.976821526892,2.18385079177,1.78522284118,3.67768223554,5.06302440873,1.,0.
+90,0.264820472091,1.86981946157,1.50048403865,3.43619796921,5.05651761669,1.,0.
+91,1.05642344868,1.47568646076,1.51347671977,3.20898518885,5.50149047462,1.,0.
+92,-0.311607433358,1.04226467636,1.52089650905,3.02291865417,5.4889046232,1.,0.
+93,-0.724285777937,0.553052311957,1.48573560173,2.7365973598,5.72549174225,1.,0.
+94,0.519859192905,0.226520626591,1.61543723167,2.84102086852,5.69330622288,1.,0.
+95,1.0323195039,0.260873217055,1.81913034804,2.83951143848,5.90325028086,1.,0.
+96,-0.53285682538,0.387695521405,1.70935609313,2.57977050631,5.79579213161,1.,0.
+97,-0.975127997215,0.920948771589,2.51292643636,2.71004616612,5.87016469227,1.,0.
+98,0.540246804099,1.36445470181,2.61949412896,2.98482553485,6.02447664937,1.,0.
+99,0.987764008058,1.85581989607,2.84685706149,2.94760204892,6.0212151724,1.,0.
diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py
index 630f4fc057..f37cafcc50 100644
--- a/tensorflow/contrib/timeseries/examples/lstm.py
+++ b/tensorflow/contrib/timeseries/examples/lstm.py
@@ -48,7 +48,8 @@ _DATA_FILE = path.join(_MODULE_PATH, "data/multivariate_periods.csv")
 class _LSTMModel(ts_model.SequentialTimeSeriesModel):
   """A time series model-building example using an RNNCell."""
 
-  def __init__(self, num_units, num_features, dtype=tf.float32):
+  def __init__(self, num_units, num_features, exogenous_feature_columns=None,
+               dtype=tf.float32):
     """Initialize/configure the model object.
 
     Note that we do not start graph building here. Rather, this object is a
@@ -58,6 +59,10 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
       num_units: The number of units in the model's LSTMCell.
       num_features: The dimensionality of the time series (features per
         timestep).
+      exogenous_feature_columns: A list of tf.contrib.layers.FeatureColumn
+          objects representing features which are inputs to the model but are
+          not predicted by it. These must then be present for training,
+          evaluation, and prediction.
       dtype: The floating point data type to use.
     """
     super(_LSTMModel, self).__init__(
@@ -65,6 +70,7 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
         train_output_names=["mean"],
         predict_output_names=["mean"],
         num_features=num_features,
+        exogenous_feature_columns=exogenous_feature_columns,
         dtype=dtype)
     self._num_units = num_units
     # Filled in by initialize_graph()
@@ -104,6 +110,8 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
         tf.zeros([], dtype=tf.int64),
         # The previous observation or prediction.
         tf.zeros([self.num_features], dtype=self.dtype),
+        # The most recently seen exogenous features.
+        tf.zeros(self._get_exogenous_embedding_shape(), dtype=self.dtype),
         # The state of the RNNCell (batch dimension removed since this parent
         # class will broadcast).
         [tf.squeeze(state_element, axis=0)
@@ -131,7 +139,7 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
       loss (note that we could also return other measures of goodness of fit,
       although only "loss" will be optimized).
     """
-    state_from_time, prediction, lstm_state = state
+    state_from_time, prediction, exogenous, lstm_state = state
     with tf.control_dependencies(
         [tf.assert_equal(current_times, state_from_time)]):
       # Subtract the mean and divide by the variance of the series.  Slightly
@@ -143,16 +151,22 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
           (prediction - transformed_values) ** 2, axis=-1)
       # Keep track of the new observation in model state. It won't be run
       # through the LSTM until the next _imputation_step.
-      new_state_tuple = (current_times, transformed_values, lstm_state)
+      new_state_tuple = (current_times, transformed_values,
+                         exogenous, lstm_state)
     return (new_state_tuple, predictions)
 
   def _prediction_step(self, current_times, state):
     """Advance the RNN state using a previous observation or prediction."""
-    _, previous_observation_or_prediction, lstm_state = state
+    _, previous_observation_or_prediction, exogenous, lstm_state = state
+    # Update LSTM state based on the most recent exogenous and endogenous
+    # features.
+    inputs = tf.concat([previous_observation_or_prediction, exogenous],
+                       axis=-1)
     lstm_output, new_lstm_state = self._lstm_cell_run(
-        inputs=previous_observation_or_prediction, state=lstm_state)
+        inputs=inputs, state=lstm_state)
     next_prediction = self._predict_from_lstm_output(lstm_output)
-    new_state_tuple = (current_times, next_prediction, new_lstm_state)
+    new_state_tuple = (current_times, next_prediction,
+                       exogenous, new_lstm_state)
     return new_state_tuple, {"mean": self._scale_back_data(next_prediction)}
 
   def _imputation_step(self, current_times, state):
@@ -164,9 +178,10 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
 
   def _exogenous_input_step(
       self, current_times, current_exogenous_regressors, state):
-    """Update model state based on exogenous regressors."""
-    raise NotImplementedError(
-        "Exogenous inputs are not implemented for this example.")
+    """Save exogenous regressors in model state for use in _prediction_step."""
+    state_from_time, prediction, _, lstm_state = state
+    return (state_from_time, prediction,
+            current_exogenous_regressors, lstm_state)
 
 
 def train_and_predict(
@@ -174,24 +189,37 @@ def train_and_predict(
     export_directory=None):
   """Train and predict using a custom time series model."""
   # Construct an Estimator from our LSTM model.
+  exogenous_feature_columns = [
+      # Exogenous features are not part of the loss, but can inform
+      # predictions. In this example the features have no extra information, but
+      # are included as an API example.
+      tf.contrib.layers.real_valued_column(
+          "2d_exogenous_feature", dimension=2)]
   estimator = ts_estimators.TimeSeriesRegressor(
-      model=_LSTMModel(num_features=5, num_units=128),
+      model=_LSTMModel(num_features=5, num_units=128,
+                       exogenous_feature_columns=exogenous_feature_columns),
       optimizer=tf.train.AdamOptimizer(0.001), config=estimator_config,
       # Set state to be saved across windows.
       state_manager=state_management.ChainingStateManager())
   reader = tf.contrib.timeseries.CSVReader(
       csv_file_name,
       column_names=((tf.contrib.timeseries.TrainEvalFeatures.TIMES,)
-                    + (tf.contrib.timeseries.TrainEvalFeatures.VALUES,) * 5))
+                    + (tf.contrib.timeseries.TrainEvalFeatures.VALUES,) * 5
+                    + ("2d_exogenous_feature",) * 2))
   train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
       reader, batch_size=4, window_size=32)
   estimator.train(input_fn=train_input_fn, steps=training_steps)
   evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
   evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1)
   # Predict starting after the evaluation
+  predict_exogenous_features = {
+      "2d_exogenous_feature": numpy.concatenate(
+          [numpy.ones([1, 100, 1]), numpy.zeros([1, 100, 1])],
+          axis=-1)}
   (predictions,) = tuple(estimator.predict(
       input_fn=tf.contrib.timeseries.predict_continuation_input_fn(
-          evaluation, steps=100)))
+          evaluation, steps=100,
+          exogenous_features=predict_exogenous_features)))
   times = evaluation["times"][0]
   observed = evaluation["observed"][0, :, :]
   predicted_mean = numpy.squeeze(numpy.concatenate(
@@ -204,7 +232,6 @@ def train_and_predict(
   input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
   export_location = estimator.export_savedmodel(
       export_directory, input_receiver_fn)
-
   # Predict using the SavedModel
   with tf.Graph().as_default():
     with tf.Session() as session:
@@ -213,7 +240,8 @@ def train_and_predict(
       saved_model_output = (
           tf.contrib.timeseries.saved_model_utils.predict_continuation(
               continue_from=evaluation, signatures=signatures,
-              session=session, steps=100))
+              session=session, steps=100,
+              exogenous_features=predict_exogenous_features))
       # The exported model gives the same results as the Estimator.predict()
       # call above.
       numpy.testing.assert_allclose(
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
index 3738dfa154..f8355f366f 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.layers.python.layers import feature_column
+
 from tensorflow.contrib.timeseries.python.timeseries import ar_model
 from tensorflow.contrib.timeseries.python.timeseries import feature_keys
 from tensorflow.contrib.timeseries.python.timeseries import head as ts_head_lib
@@ -72,15 +74,14 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
   # tf.Example containing all features (times, values, any exogenous features)
   # and serialized model state (possibly also as a tf.Example).
   def build_raw_serving_input_receiver_fn(self,
-                                          exogenous_features=None,
                                           default_batch_size=None,
                                           default_series_length=None):
     """Build an input_receiver_fn for export_savedmodel which accepts arrays.
 
+    Automatically creates placeholders for exogenous `FeatureColumn`s passed to
+    the model.
+
     Args:
-      exogenous_features: A dictionary mapping feature keys to exogenous
-        features (either Numpy arrays or Tensors). Used to determine the shapes
-        of placeholders for these features.
       default_batch_size: If specified, must be a scalar integer. Sets the batch
         size in the static shape information of all feature Tensors, which means
         only this batch size will be accepted by the exported model. If None
@@ -94,9 +95,6 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
       An input_receiver_fn which may be passed to the Estimator's
       export_savedmodel.
     """
-    if exogenous_features is None:
-      exogenous_features = {}
-
     def _serving_input_receiver_fn():
       """A receiver function to be passed to export_savedmodel."""
       placeholders = {}
@@ -119,14 +117,22 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
                   dtype=self._model.dtype),
               shape=(default_batch_size, default_series_length,
                      self._model.num_features)))
-      for feature_key, feature_value in exogenous_features.items():
-        value_tensor = ops.convert_to_tensor(feature_value)
-        value_tensor.get_shape().with_rank_at_least(2)
-        feature_shape = value_tensor.get_shape().as_list()
-        feature_shape[0] = default_batch_size
-        feature_shape[1] = default_series_length
+      with ops.Graph().as_default():
+        # Default placeholders have only an unknown batch dimension. Make them
+        # in a separate graph, then splice in the series length to the shapes
+        # and re-create them in the outer graph.
+        exogenous_feature_shapes = {
+            key: (value.get_shape(), value.dtype) for key, value
+            in feature_column.make_place_holder_tensors_for_base_features(
+                self._model.exogenous_feature_columns).items()}
+      for feature_key, (batch_only_feature_shape, value_dtype) in (
+          exogenous_feature_shapes.items()):
+        batch_only_feature_shape = batch_only_feature_shape.with_rank_at_least(
+            1).as_list()
+        feature_shape = ([default_batch_size, default_series_length]
+                         + batch_only_feature_shape[1:])
         placeholders[feature_key] = array_ops.placeholder(
-            dtype=value_tensor.dtype, name=feature_key, shape=feature_shape)
+            dtype=value_dtype, name=feature_key, shape=feature_shape)
       # Models may not know the shape of their state without creating some
       # variables/ops. Avoid polluting the default graph by making a new one. We
       # use only static metadata from the returned Tensors.
diff --git a/tensorflow/contrib/timeseries/python/timeseries/model.py b/tensorflow/contrib/timeseries/python/timeseries/model.py
index b32b5c5494..bac7d1ebf5 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/model.py
@@ -22,6 +22,7 @@ import abc
 import collections
 
 from tensorflow.contrib import layers
+from tensorflow.contrib.layers import feature_column
 
 from tensorflow.contrib.timeseries.python.timeseries import math_utils
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
@@ -83,6 +84,11 @@ class TimeSeriesModel(object):
     self._stats_means = None
     self._stats_sigmas = None
 
+  @property
+  def exogenous_feature_columns(self):
+    """`FeatureColumn` objects for features which are not predicted."""
+    return self._exogenous_feature_columns
+
   # TODO(allenl): Move more of the generic machinery for generating and
   # predicting into TimeSeriesModel, and possibly share it between generate()
   # and predict()
@@ -250,6 +256,23 @@ class TimeSeriesModel(object):
     """
     pass
 
+  def _get_exogenous_embedding_shape(self):
+    """Computes the shape of the vector returned by _process_exogenous_features.
+
+    Returns:
+      The shape as a list. Does not include a batch dimension.
+    """
+    if not self._exogenous_feature_columns:
+      return (0,)
+    with ops.Graph().as_default():
+      placeholder_features = (
+          feature_column.make_place_holder_tensors_for_base_features(
+              self._exogenous_feature_columns))
+      embedded = layers.input_from_feature_columns(
+          columns_to_tensors=placeholder_features,
+          feature_columns=self._exogenous_feature_columns)
+      return embedded.get_shape().as_list()[1:]
+
   def _process_exogenous_features(self, times, features):
     """Create a single vector from exogenous features.
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
index 5980fc5d5d..1fb4a3c121 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
@@ -187,9 +187,7 @@ class StateSpaceEquivalenceTests(test.TestCase):
     estimator.train(combined_input_fn, steps=1)
     export_location = estimator.export_savedmodel(
         self.get_temp_dir(),
-        estimator.build_raw_serving_input_receiver_fn(
-            exogenous_features={
-                "exogenous": numpy.zeros((0, 0), dtype=numpy.float32)}))
+        estimator.build_raw_serving_input_receiver_fn())
     with ops.Graph().as_default() as graph:
       random_model.initialize_graph()
       with self.test_session(graph=graph) as session:
@@ -209,7 +207,7 @@ class StateSpaceEquivalenceTests(test.TestCase):
             features={
                 feature_keys.FilteringFeatures.TIMES: [1, 2],
                 feature_keys.FilteringFeatures.VALUES: [1., 2.],
-                "exogenous": [-1., -2.]})
+                "exogenous": [[-1.], [-2.]]})
         second_split_filtering = saved_model_utils.filter_continuation(
             continue_from=first_split_filtering,
             signatures=signatures,
@@ -217,7 +215,7 @@ class StateSpaceEquivalenceTests(test.TestCase):
             features={
                 feature_keys.FilteringFeatures.TIMES: [3, 4],
                 feature_keys.FilteringFeatures.VALUES: [3., 4.],
-                "exogenous": [-3., -4.]
+                "exogenous": [[-3.], [-4.]]
             })
         combined_filtering = saved_model_utils.filter_continuation(
             continue_from={
@@ -227,7 +225,7 @@ class StateSpaceEquivalenceTests(test.TestCase):
             features={
                 feature_keys.FilteringFeatures.TIMES: [1, 2, 3, 4],
                 feature_keys.FilteringFeatures.VALUES: [1., 2., 3., 4.],
-                "exogenous": [-1., -2., -3., -4.]
+                "exogenous": [[-1.], [-2.], [-3.], [-4.]]
             })
         split_predict = saved_model_utils.predict_continuation(
             continue_from=second_split_filtering,
@@ -235,14 +233,14 @@ class StateSpaceEquivalenceTests(test.TestCase):
             session=session,
             steps=1,
             exogenous_features={
-                "exogenous": [[-5.]]})
+                "exogenous": [[[-5.]]]})
         combined_predict = saved_model_utils.predict_continuation(
             continue_from=combined_filtering,
             signatures=signatures,
             session=session,
             steps=1,
             exogenous_features={
-                "exogenous": [[-5.]]})
+                "exogenous": [[[-5.]]]})
     for state_key, combined_state_value in combined_filtering.items():
       if state_key == feature_keys.FilteringResults.TIMES:
         continue
-- 
GitLab


From 4a54f16a66d8d9b4df74ec810f130edf2373c444 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 9 Feb 2018 10:45:56 -0800
Subject: [PATCH 895/966]   regression_head accepts link_fn as parameter, so
 the use can use self defined link fn

PiperOrigin-RevId: 185161896
---
 tensorflow/contrib/learn/BUILD                |  1 +
 .../learn/python/learn/estimators/head.py     |  7 +++++--
 .../python/learn/estimators/head_test.py      | 20 +++++++++++++++++++
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 3c782b54a8..abf6e393bb 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -388,6 +388,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:session",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variables",
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index bc0e6fc009..9b124b2c19 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -181,7 +181,8 @@ def regression_head(label_name=None,
                     weight_column_name=None,
                     label_dimension=1,
                     enable_centered_bias=False,
-                    head_name=None):
+                    head_name=None,
+                    link_fn=None):
   """Creates a `Head` for linear regression.
 
   Args:
@@ -199,6 +200,8 @@ def regression_head(label_name=None,
     head_name: name of the head. If provided, predictions, summary and metrics
       keys will be suffixed by `"/" + head_name` and the default variable scope
       will be `head_name`.
+    link_fn: link function to convert logits to predictions. If provided,
+      this link function will be used instead of identity.
 
   Returns:
     An instance of `Head` for linear regression.
@@ -210,7 +213,7 @@ def regression_head(label_name=None,
       enable_centered_bias=enable_centered_bias,
       head_name=head_name,
       loss_fn=_mean_squared_loss,
-      link_fn=array_ops.identity)
+      link_fn=(link_fn if link_fn is not None else array_ops.identity))
 
 
 def poisson_regression_head(label_name=None,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head_test.py b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
index 3881bf533d..7c2d9bb076 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses as losses_lib
 from tensorflow.python.platform import test
@@ -153,6 +154,25 @@ class RegressionHeadTest(test.TestCase):
       _assert_no_variables(self)
       _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops)
 
+  def testRegressionWithLogitFn(self):
+    head = head_lib.regression_head(link_fn=math_ops.square)
+    def _assert_preditions(test_case, expected_predictions, model_fn_ops):
+      variables.initialize_local_variables().run()
+      test_case.assertAllClose(expected_predictions,
+                               model_fn_ops.predictions["scores"].eval())
+    with ops.Graph().as_default(), session.Session():
+      model_fn_ops = head.create_model_fn_ops(
+          {},
+          labels=((0.,), (1.,), (1.,)),
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=((1.,), (1.,), (3.,)))
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_summary_tags(self, ["loss"])
+      _assert_no_variables(self)
+      _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops)
+      _assert_preditions(self, ([1.0, 1.0, 9.0]), model_fn_ops)
+
   def testRegressionWithInvalidLogits(self):
     head = head_lib.regression_head()
     with ops.Graph().as_default(), session.Session():
-- 
GitLab


From eb049506e58b22ed62e5197b97bb4f9f7a5d92ec Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Fri, 9 Feb 2018 11:01:07 -0800
Subject: [PATCH 896/966] Fix some doc strings that were accidentally removed.

PiperOrigin-RevId: 185164375
---
 .../contrib/quantize/python/quantize_graph.py | 46 ++++++++++---------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 6120b89683..b91e045175 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -29,7 +29,7 @@ def _create_graph(input_graph=None,
                   activation_bits=8,
                   quant_delay=None,
                   freeze_bn_delay=None):
-  """Returns a transformed training input_graph for simulated quantization.
+  """Rewrites an input_graph in place for simulated quantization.
 
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
@@ -43,16 +43,15 @@ def _create_graph(input_graph=None,
     weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
     quant_delay: Number of steps after which weights and activations are
-                 quantized during training.
+      quantized during training.
     freeze_bn_delay: Number of steps after which moving mean and variance are
-                     frozen and used instead of batch statistics during
-                     training. freeze_bn_delay should be greater than
-                     quant_delay and should correspond to the number of steps
-                     when training has almost converged
+      frozen and used instead of batch statistics during training.
+      freeze_bn_delay should be greater than quant_delay and should correspond
+      to the number of steps when training has almost converged
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
-        tf.Operation.
+      tf.Operation.
   """
 
   if input_graph is None:
@@ -81,11 +80,11 @@ def create_training_graph(input_graph=None, quant_delay=250000):
   Args:
     input_graph: The tf.Graph to be transformed.
     quant_delay: Number of steps after which weights and activations are
-                 quantized during training.
+      quantized during training.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
-        tf.Operation.
+      tf.Operation.
   """
   # TODO(raghuramank) Need to have freeze_bn_delay be a function of batch size
   # Currently the values below are hardcoded for mobilenetV1 on imagenet
@@ -120,10 +119,9 @@ def create_eval_graph(input_graph=None):
     input_graph: The tf.Graph to be transformed, if None then defaults to the
       default graph.
 
-
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
-        tf.Operation.
+      tf.Operation.
   """
   _create_graph(input_graph=input_graph, is_training=False)
 
@@ -133,7 +131,10 @@ def experimental_create_training_graph(input_graph=None,
                                        activation_bits=8,
                                        quant_delay=250000,
                                        freeze_bn_delay=500000):
-  """Returns a transformed training input_graph for simulated quantization.
+  """Rewrites a training input_graph in place for simulated quantization.
+
+  This function has additional experimental options not (yet) available to
+  create_training_graph. The resulting behavior may be undefined.
 
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
@@ -143,16 +144,14 @@ def experimental_create_training_graph(input_graph=None,
   Args:
     input_graph: The tf.Graph to be transformed,if None then defaults to the
       default graph.
-    weight_bits:     Number of bits to use for quantizing weights.
+    weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
-    quant_delay:     Number of steps after which weights and
-                     activations are quantized during training.
+    quant_delay: Number of steps after which weights and activations are
+      quantized during training.
     freeze_bn_delay: Number of steps after which moving mean and variance are
-                     frozen and used instead of batch statistics during
-                     training. freeze_bn_delay should be greater than
-                     quant_delay and should correspond to when training
-                     has almost converged
-
+      frozen and used instead of batch statistics during training.
+      freeze_bn_delay should be greater than quant_delay and should correspond
+      to when training has almost converged
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -171,7 +170,10 @@ def experimental_create_training_graph(input_graph=None,
 def experimental_create_eval_graph(input_graph=None,
                                    weight_bits=8,
                                    activation_bits=8):
-  """Returns a transformed eval input_graph for simulated quantization.
+  """Rewrites an eval input_graph in place for simulated quantization.
+
+  This function has additional experimental options not (yet) available to
+  create_eval_graph. The resulting behavior may be undefined.
 
   The graph has fake quantization ops inserted to simulate the error
   introduced by quantization. Since the graph is transformed in place,
@@ -188,7 +190,7 @@ def experimental_create_eval_graph(input_graph=None,
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
-        tf.Operation.
+      tf.Operation.
   """
   _create_graph(
       input_graph=input_graph,
-- 
GitLab


From 249b67204e5c1eb4805b8b021386aa0364045063 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 9 Feb 2018 11:14:03 -0800
Subject: [PATCH 897/966] Remove the unneeded "tf" argument for utility
 functions.

PiperOrigin-RevId: 185166507
---
 .../py2tf/converters/side_effect_guards.py    |  6 +++---
 .../converters/side_effect_guards_test.py     | 20 ++++++-------------
 .../contrib/py2tf/utils/context_managers.py   |  7 ++++---
 .../py2tf/utils/context_managers_test.py      |  9 ++++-----
 tensorflow/contrib/py2tf/utils/misc.py        |  8 +++++---
 tensorflow/contrib/py2tf/utils/misc_test.py   | 14 ++-----------
 6 files changed, 24 insertions(+), 40 deletions(-)

diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
index 5895dc4954..30976b3ec6 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
@@ -160,8 +160,8 @@ class SideEffectGuardTransformer(transformer.Base):
               [alias_map.get(s, s).ast() for s in guarded_args], None)
 
         template = """
-          with py2tf_utils.control_dependency_on_returns(tf, call):
-            aliased_guarded_args = py2tf_utils.alias_tensors(tf, guarded_args)
+          with py2tf_utils.control_dependency_on_returns(call):
+            aliased_guarded_args = py2tf_utils.alias_tensors(guarded_args)
         """
         control_deps_guard = templates.replace(
             template,
@@ -172,7 +172,7 @@ class SideEffectGuardTransformer(transformer.Base):
         alias_map = {}
 
         template = """
-          with py2tf_utils.control_dependency_on_returns(tf, call):
+          with py2tf_utils.control_dependency_on_returns(call):
             pass
         """
         control_deps_guard = templates.replace(template, call=node.value)[-1]
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
index b41c6fa5b9..463db2e770 100644
--- a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
@@ -23,7 +23,6 @@ from tensorflow.contrib.py2tf.converters import side_effect_guards
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
@@ -43,8 +42,7 @@ class SideEffectGuardsTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {})
     node = side_effect_guards.transform(node, self.ctx)
 
-    with self.compiled(node, state_ops.assign, ops.control_dependencies,
-                       ops.Tensor) as result:
+    with self.compiled(node, state_ops.assign) as result:
       self.assertEqual(len(node.body[0].body), 1)
       with self.test_session() as sess:
         v = variables.Variable(2)
@@ -64,8 +62,7 @@ class SideEffectGuardsTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {})
     node = side_effect_guards.transform(node, self.ctx)
 
-    with self.compiled(node, state_ops.assign, ops.control_dependencies,
-                       ops.Tensor) as result:
+    with self.compiled(node, state_ops.assign) as result:
       self.assertEqual(len(node.body[0].body), 1)
       with self.test_session() as sess:
         v = variables.Variable(2)
@@ -86,8 +83,7 @@ class SideEffectGuardsTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {})
     node = side_effect_guards.transform(node, self.ctx)
 
-    with self.compiled(node, array_ops.identity, control_flow_ops.Assert,
-                       ops.control_dependencies, ops.Tensor) as result:
+    with self.compiled(node, control_flow_ops.Assert) as result:
       self.assertEqual(len(node.body[0].body), 1)
       with self.test_session() as sess:
         # NOTE: In this case we can also capture the side effect because the
@@ -111,8 +107,7 @@ class SideEffectGuardsTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {})
     node = side_effect_guards.transform(node, self.ctx)
 
-    with self.compiled(node, array_ops.identity, state_ops.assign,
-                       ops.control_dependencies, ops.Tensor) as result:
+    with self.compiled(node, state_ops.assign) as result:
       self.assertEqual(len(node.body[0].body), 1)
       with self.test_session() as sess:
         v = variables.Variable(2)
@@ -134,9 +129,7 @@ class SideEffectGuardsTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {})
     node = side_effect_guards.transform(node, self.ctx)
 
-    with self.compiled(node, array_ops.identity, state_ops.assign,
-                       ops.control_dependencies, ops.name_scope,
-                       ops.Tensor) as result:
+    with self.compiled(node, state_ops.assign, ops.name_scope) as result:
       self.assertEqual(len(node.body[0].body[0].body), 1)
       with self.test_session() as sess:
         v = variables.Variable(2)
@@ -158,8 +151,7 @@ class SideEffectGuardsTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {})
     node = side_effect_guards.transform(node, self.ctx)
 
-    with self.compiled(node, array_ops.identity, state_ops.assign,
-                       ops.control_dependencies, ops.Tensor) as result:
+    with self.compiled(node, state_ops.assign) as result:
       self.assertEqual(len(node.body[0].body), 1)
       with self.test_session() as sess:
         v = variables.Variable(2)
diff --git a/tensorflow/contrib/py2tf/utils/context_managers.py b/tensorflow/contrib/py2tf/utils/context_managers.py
index 47d9839997..38d9e11fe9 100644
--- a/tensorflow/contrib/py2tf/utils/context_managers.py
+++ b/tensorflow/contrib/py2tf/utils/context_managers.py
@@ -20,14 +20,15 @@ from __future__ import print_function
 
 import contextlib
 
+from tensorflow.python.framework import ops
 
-def control_dependency_on_returns(tf, return_value):
+
+def control_dependency_on_returns(return_value):
   """Create a TF control dependency on the return values of a function.
 
   If the function had no return value, a no-op context is returned.
 
   Args:
-    tf: The TensorFlow module.
     return_value: The return value to set as control dependency.
 
   Returns:
@@ -38,4 +39,4 @@ def control_dependency_on_returns(tf, return_value):
   # TODO(mdan): Filter to tensor objects.
   if not isinstance(return_value, (list, tuple)):
     return_value = (return_value,)
-  return tf.control_dependencies(return_value)
+  return ops.control_dependencies(return_value)
diff --git a/tensorflow/contrib/py2tf/utils/context_managers_test.py b/tensorflow/contrib/py2tf/utils/context_managers_test.py
index c903f08252..633ba93540 100644
--- a/tensorflow/contrib/py2tf/utils/context_managers_test.py
+++ b/tensorflow/contrib/py2tf/utils/context_managers_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.contrib.py2tf.utils import context_managers
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
 
 
@@ -28,14 +27,14 @@ class ContextManagersTest(test.TestCase):
 
   def test_control_dependency_on_returns(self):
     # Just dry run them.
-    with context_managers.control_dependency_on_returns(ops, None):
+    with context_managers.control_dependency_on_returns(None):
       pass
     with context_managers.control_dependency_on_returns(
-        ops, constant_op.constant(1)):
+        constant_op.constant(1)):
       pass
     with context_managers.control_dependency_on_returns(
-        ops, [constant_op.constant(1),
-              constant_op.constant(2)]):
+        [constant_op.constant(1),
+         constant_op.constant(2)]):
       pass
 
 
diff --git a/tensorflow/contrib/py2tf/utils/misc.py b/tensorflow/contrib/py2tf/utils/misc.py
index ee5cedd080..1b06caf0bd 100644
--- a/tensorflow/contrib/py2tf/utils/misc.py
+++ b/tensorflow/contrib/py2tf/utils/misc.py
@@ -18,14 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 
-def alias_tensors(tf, *args):
+
+def alias_tensors(*args):
   """Wrap any Tensor arguments with an identity op.
 
   Any other argument, including Variables, is returned unchanged.
 
   Args:
-    tf: The TensorFlow module.
     *args: Any arguments. Must contain at least one element.
 
   Returns:
@@ -36,7 +38,7 @@ def alias_tensors(tf, *args):
   """
 
   def alias_if_tensor(a):
-    return tf.identity(a) if isinstance(a, tf.Tensor) else a
+    return array_ops.identity(a) if isinstance(a, ops.Tensor) else a
 
   # TODO(mdan): Recurse into containers?
   # TODO(mdan): Anything we can do about variables? Fake a scope reuse?
diff --git a/tensorflow/contrib/py2tf/utils/misc_test.py b/tensorflow/contrib/py2tf/utils/misc_test.py
index 5613cc052a..bfcb304c83 100644
--- a/tensorflow/contrib/py2tf/utils/misc_test.py
+++ b/tensorflow/contrib/py2tf/utils/misc_test.py
@@ -18,12 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import imp
-
 from tensorflow.contrib.py2tf.utils import misc
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
@@ -32,11 +28,8 @@ class ContextManagersTest(test.TestCase):
 
   def test_alias_single_tensor(self):
     a = constant_op.constant(1)
-    fake_tf = imp.new_module('fake_tf')
-    fake_tf.identity = array_ops.identity
-    fake_tf.Tensor = ops.Tensor
 
-    new_a = misc.alias_tensors(fake_tf, a)
+    new_a = misc.alias_tensors(a)
     self.assertFalse(new_a is a)
     with self.test_session() as sess:
       self.assertEqual(1, sess.run(new_a))
@@ -46,11 +39,8 @@ class ContextManagersTest(test.TestCase):
     v = variables.Variable(2)
     s = 'a'
     l = [1, 2, 3]
-    fake_tf = imp.new_module('fake_tf')
-    fake_tf.identity = array_ops.identity
-    fake_tf.Tensor = ops.Tensor
 
-    new_a, new_v, new_s, new_l = misc.alias_tensors(fake_tf, a, v, s, l)
+    new_a, new_v, new_s, new_l = misc.alias_tensors(a, v, s, l)
 
     self.assertFalse(new_a is a)
     self.assertTrue(new_v is v)
-- 
GitLab


From 6c85a66a16f07bab9b5dc3df33bc6b8111b76615 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 9 Feb 2018 11:16:01 -0800
Subject: [PATCH 898/966] Add export calls for protos.

PiperOrigin-RevId: 185166764
---
 tensorflow/python/__init__.py          | 26 ++++++++++++++++++++++++++
 tensorflow/python/profiler/profiler.py |  7 +++++++
 tensorflow/python/training/training.py | 18 ++++++++++++++++++
 3 files changed, 51 insertions(+)

diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index ea7604d30f..6d405d04b1 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -116,6 +116,7 @@ from tensorflow.python.platform import test
 
 from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.all_util import make_all
+from tensorflow.python.util.tf_export import tf_export
 
 # Import modules whose docstrings contribute, for use by remove_undocumented
 # below.
@@ -167,6 +168,31 @@ _allowed_symbols = [
     'TensorInfo',  # Used for tf.saved_model functionality.
 ]
 
+# Export protos
+# pylint: disable=undefined-variable
+tf_export('AttrValue')(AttrValue)
+tf_export('ConfigProto')(ConfigProto)
+tf_export('Event', 'summary.Event')(Event)
+tf_export('GPUOptions')(GPUOptions)
+tf_export('GraphDef')(GraphDef)
+tf_export('GraphOptions')(GraphOptions)
+tf_export('HistogramProto')(HistogramProto)
+tf_export('LogMessage')(LogMessage)
+tf_export('MetaGraphDef')(MetaGraphDef)
+tf_export('NameAttrList')(NameAttrList)
+tf_export('NodeDef')(NodeDef)
+tf_export('OptimizerOptions')(OptimizerOptions)
+tf_export('RunMetadata')(RunMetadata)
+tf_export('RunOptions')(RunOptions)
+tf_export('SessionLog', 'summary.SessionLog')(SessionLog)
+tf_export('Summary', 'summary.Summary')(Summary)
+tf_export('summary.SummaryDescription')(SummaryDescription)
+tf_export('SummaryMetadata')(SummaryMetadata)
+tf_export('summary.TaggedRunMetadata')(TaggedRunMetadata)
+tf_export('TensorInfo')(TensorInfo)
+# pylint: enable=undefined-variable
+
+
 # The following symbols are kept for compatibility. It is our plan
 # to remove them in the future.
 _allowed_symbols.extend([
diff --git a/tensorflow/python/profiler/profiler.py b/tensorflow/python/profiler/profiler.py
index 130dcb5134..fa7f30b236 100644
--- a/tensorflow/python/profiler/profiler.py
+++ b/tensorflow/python/profiler/profiler.py
@@ -31,6 +31,7 @@ from tensorflow.python.profiler.option_builder import ProfileOptionBuilder
 from tensorflow.python.profiler.tfprof_logger import write_op_log
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 _allowed_symbols = [
@@ -48,6 +49,12 @@ _allowed_symbols.extend([
     'OpLogProto',
 ])
 
+# Export protos
+tf_export('profiler.GraphNodeProto')(GraphNodeProto)
+tf_export('profiler.MultiGraphNodeProto')(MultiGraphNodeProto)
+tf_export('profiler.AdviceProto')(AdviceProto)
+tf_export('profiler.OpLogProto')(OpLogProto)
+
 remove_undocumented(__name__, _allowed_symbols, [
     Profiler,
     profile,
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index 03811fa38d..cdba18af8c 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -189,6 +189,7 @@ from tensorflow.python.training.training_util import create_global_step
 from tensorflow.python.training.training_util import get_or_create_global_step
 from tensorflow.python.pywrap_tensorflow import do_quantize_training_on_graphdef
 from tensorflow.python.pywrap_tensorflow import NewCheckpointReader
+from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=wildcard-import
 # Training data protos.
@@ -239,6 +240,23 @@ _allowed_symbols = [
     "SequenceExample",  # from example_pb2.
     "ServerDef",
 ]
+
+# pylint: disable=undefined-variable
+tf_export("train.BytesList")(BytesList)
+tf_export("train.ClusterDef")(ClusterDef)
+tf_export("train.Example")(Example)
+tf_export("train.Feature")(Feature)
+tf_export("train.Features")(Features)
+tf_export("train.FeatureList")(FeatureList)
+tf_export("train.FeatureLists")(FeatureLists)
+tf_export("train.FloatList")(FloatList)
+tf_export("train.Int64List")(Int64List)
+tf_export("train.JobDef")(JobDef)
+tf_export("train.SaverDef")(SaverDef)
+tf_export("train.SequenceExample")(SequenceExample)
+tf_export("train.ServerDef")(ServerDef)
+# pylint: enable=undefined-variable
+
 # Include extra modules for docstrings because:
 # * Input methods in tf.train are documented in io_ops.
 # * Saver methods in tf.train are documented in state_ops.
-- 
GitLab


From b37913a6d1a2327b3aebcef857638e4ad2f465c3 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Fri, 9 Feb 2018 11:17:51 -0800
Subject: [PATCH 899/966] Fix lint warnings. NFC

PiperOrigin-RevId: 185167035
---
 tensorflow/python/kernel_tests/BUILD          |   2 +-
 .../python/ops/control_flow_ops_test.py       | 168 ++++++++++--------
 2 files changed, 98 insertions(+), 72 deletions(-)

diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 3a6058054b..d4ceb2e489 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -1294,7 +1294,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "control_flow_ops_py_test",
-    # TOOD(b/70473603): change this back to "small" once the C API is
+    # TODO(b/70473603): change this back to "small" once the C API is
     # permanently enabled
     size = "medium",
     srcs = ["control_flow_ops_py_test.py"],
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index f942f478f2..f22f3059d1 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -189,7 +189,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       zero = constant_op.constant(0)
       one = constant_op.constant(1)
       less_op = math_ops.less(zero, one)
-      switch_false, switch_true = control_flow_ops.switch(data, less_op)
+      _, switch_true = control_flow_ops.switch(data, less_op)
       self.assertAllEqual([1, 2, 3], switch_true.values.eval())
       self.assertAllEqual([0, 1], switch_true.indices.eval())
 
@@ -199,16 +199,17 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
           "embedding_matrix", [5, 5],
           initializer=init_ops.random_normal_initializer())
 
-      def Cond(it, _):
+      def cond(it, _):
         return it < 5
 
-      def Body(it, cost):
+      def body(it, cost):
         embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
         cost += math_ops.reduce_sum(embedding)
         return it + 1, cost
 
       _, cost = control_flow_ops.while_loop(
-          Cond, Body, [constant_op.constant(0), constant_op.constant(0.0)])
+          cond, body, [constant_op.constant(0),
+                       constant_op.constant(0.0)])
       optimizer = momentum.MomentumOptimizer(0.1, 0.9)
       train_op = optimizer.minimize(cost)
       with self.test_session() as sess:
@@ -223,16 +224,17 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
           initializer=[[2.0], [3.0]],
           use_resource=True)
 
-      def Cond(it, _):
+      def cond(it, _):
         return it < 5
 
-      def Body(it, cost):
+      def body(it, cost):
         embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
         cost += math_ops.reduce_sum(embedding)
         return it + 1, cost
 
       _, cost = control_flow_ops.while_loop(
-          Cond, Body, [constant_op.constant(0), constant_op.constant(0.0)])
+          cond, body, [constant_op.constant(0),
+                       constant_op.constant(0.0)])
       with self.test_session() as sess:
         sess.run(variables.global_variables_initializer())
         self.assertAllEqual(10.0, cost.eval())
@@ -244,10 +246,10 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
           initializer=init_ops.random_normal_initializer(),
           use_resource=use_resource)
 
-      def Cond(it, _):
+      def cond(it, _):
         return it < 5
 
-      def Body(it, cost):
+      def body(it, cost):
         embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
         cost = control_flow_ops.cond(
             math_ops.equal(it, 3), lambda: math_ops.square(cost),
@@ -255,7 +257,8 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
         return it + 1, cost
 
       _, cost = control_flow_ops.while_loop(
-          Cond, Body, [constant_op.constant(0), constant_op.constant(0.0)])
+          cond, body, [constant_op.constant(0),
+                       constant_op.constant(0.0)])
 
       dynamic_grads = gradients_impl.gradients(cost, [embedding_matrix])[0]
       dynamic_grads = math_ops.segment_sum(dynamic_grads.values,
@@ -289,15 +292,15 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
             dtype=dtype, size=num_steps)
         initial_i = constant_op.constant(0, dtype=dtypes.int32)
 
-        def Cond(i, _):
+        def cond(i, _):
           return i < num_steps  # pylint: disable=cell-var-from-loop
 
-        def Body(i, outputs):
+        def body(i, outputs):
           x = array_ops.gather(inputs, i)  # pylint: disable=cell-var-from-loop
           outputs = outputs.write(i, x)
           return i + 1, outputs
 
-        _, outputs = control_flow_ops.while_loop(Cond, Body,
+        _, outputs = control_flow_ops.while_loop(cond, body,
                                                  [initial_i, initial_outputs])
 
         outputs = math_ops.reduce_sum(outputs.stack())
@@ -316,15 +319,15 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
             dtype=dtype, dynamic_size=True, size=1)
         initial_i = constant_op.constant(0, dtype=dtypes.int32)
 
-        def Cond(i, _):
+        def cond(i, _):
           return i < array_ops.size(inputs)  # pylint: disable=cell-var-from-loop
 
-        def Body(i, outputs):
+        def body(i, outputs):
           x = array_ops.gather(inputs, i)  # pylint: disable=cell-var-from-loop
           outputs = outputs.write(i, x)
           return i + 1, outputs
 
-        _, outputs = control_flow_ops.while_loop(Cond, Body,
+        _, outputs = control_flow_ops.while_loop(cond, body,
                                                  [initial_i, initial_outputs])
 
         outputs = math_ops.reduce_sum(outputs.stack())
@@ -460,11 +463,12 @@ class ContextTest(test_util.TensorFlowTestCase):
       control_flow_ops.while_loop(
           c, b, [i], maximum_iterations=maximum_iterations)
       for op in sess.graph.get_operations():
-        context = op._get_control_flow_context()
-        if context:
-          self.assertProtoEquals(context.to_proto(),
-                                 control_flow_ops.WhileContext.from_proto(
-                                     context.to_proto()).to_proto())
+        control_flow_context = op._get_control_flow_context()
+        if control_flow_context:
+          self.assertProtoEquals(
+              control_flow_context.to_proto(),
+              control_flow_ops.WhileContext.from_proto(
+                  control_flow_context.to_proto()).to_proto())
 
   def testWhileContext(self):
     self._testWhileContextHelper()
@@ -498,8 +502,9 @@ class ContextTest(test_util.TensorFlowTestCase):
           c_with_scope._to_values_def(export_scope="test_scope"))
 
 
-def _GetNestedShape(nested):
-  def _GetShape(tensor):
+def _get_nested_shape(nested):
+
+  def _get_shape(tensor):
     if isinstance(tensor, tensor_array_ops.TensorArray):
       return tensor_array_ops.TensorArray
     elif isinstance(tensor, ops.IndexedSlices):
@@ -507,10 +512,10 @@ def _GetNestedShape(nested):
     else:
       return tensor.get_shape()
 
-  return nest.map_structure(_GetShape, nested)
+  return nest.map_structure(_get_shape, nested)
 
 
-def _CreateTensorArray(size, shape):
+def _create_tensor_array(size, shape):
   ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=size,
                                     clear_after_read=False)
   for i in range(size):
@@ -518,13 +523,15 @@ def _CreateTensorArray(size, shape):
   return ta
 
 
-def _RawNestedShape(nested_shape):
-  def _RawShape(shape):
+def _raw_nested_shape(nested_shape):
+
+  def _raw_shape(shape):
     if isinstance(shape, tensor_shape.TensorShape) and shape.ndims is not None:
       return [x.value for x in shape]
     else:
       return None
-  return nest.map_structure(_RawShape, nested_shape)
+
+  return nest.map_structure(_raw_shape, nested_shape)
 
 
 # TODO(yori): Add tests for indexed slices.
@@ -543,13 +550,15 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     condition = array_ops.placeholder(dtypes.bool)
     output_cond = control_flow_ops.cond(condition, fn_true, fn_false,
                                         strict=strict)
-    self.assertEqual(_RawNestedShape(_GetNestedShape(output_cond)),
-                     _RawNestedShape(expected_shape))
+    self.assertEqual(
+        _raw_nested_shape(_get_nested_shape(output_cond)),
+        _raw_nested_shape(expected_shape))
 
     output_case = control_flow_ops.case([(condition, fn_true)], fn_false,
                                         strict=strict)
-    self.assertEqual(_RawNestedShape(_GetNestedShape(output_case)),
-                     _RawNestedShape(expected_shape))
+    self.assertEqual(
+        _raw_nested_shape(_get_nested_shape(output_case)),
+        _raw_nested_shape(expected_shape))
 
   def _testReturnValues(self, fn_true, fn_false, expected_value_true,
                         expected_value_false, strict=False,
@@ -626,45 +635,55 @@ class DataTypesTest(test_util.TensorFlowTestCase):
       control_flow_ops.cond(constant_op.constant(True), fn_tensor, fn_none)
 
   def test_tensors(self):
-    def _BuildTrueBranch(dtype):
-      def _Build():
+
+    def _build_true_branch(dtype):
+
+      def _build():
         return (array_ops.zeros([2, 2], dtype=dtype),
                 array_ops.ones([3, 3], dtype=dtype))
-      return _Build
 
-    def _BuildFalseBranch(dtype):
-      def _Build():
+      return _build
+
+    def _build_false_branch(dtype):
+
+      def _build():
         return (array_ops.ones([2, 2], dtype=dtype),
                 array_ops.zeros([3, 3], dtype=dtype))
-      return _Build
+
+      return _build
 
     for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
       shape = (tensor_shape.TensorShape([2, 2]),
                tensor_shape.TensorShape([3, 3]))
-      fn_true = _BuildTrueBranch(dtype)
-      fn_false = _BuildFalseBranch(dtype)
+      fn_true = _build_true_branch(dtype)
+      fn_false = _build_false_branch(dtype)
       self._testShape(fn_true, fn_false, shape)
       self._testReturnValues(fn_true, fn_false,
                              (np.zeros([2, 2]), np.ones([3, 3])),
                              (np.ones([2, 2]), np.zeros([3, 3])))
 
   def test_tensors_unknown_shape(self):
-    def _BuildTrueBranch(dtype):
+
+    def _build_true_branch(dtype):
       tensor = array_ops.placeholder(dtype=dtype, shape=None)
-      def _Build():
+
+      def _build():
         return tensor
-      return _Build, tensor
 
-    def _BuildFalseBranch(dtype):
+      return _build, tensor
+
+    def _build_false_branch(dtype):
       tensor = array_ops.placeholder(dtype=dtype, shape=None)
-      def _Build():
+
+      def _build():
         return tensor
-      return _Build, tensor
+
+      return _build, tensor
 
     for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
       shape = tensor_shape.TensorShape(None)
-      fn_true, true_tensor = _BuildTrueBranch(dtype)
-      fn_false, false_tensor = _BuildFalseBranch(dtype)
+      fn_true, true_tensor = _build_true_branch(dtype)
+      fn_false, false_tensor = _build_false_branch(dtype)
       self._testShape(fn_true, fn_false, shape)
       self._testReturnValues(fn_true, fn_false,
                              np.zeros([2, 2]), np.ones([2, 2]),
@@ -674,11 +693,11 @@ class DataTypesTest(test_util.TensorFlowTestCase):
   def test_sparse_tensors(self):
     shape = tensor_shape.TensorShape([None, None])
 
-    def FnTrue():
+    def true_fn():
       return [sparse_tensor.SparseTensor(indices=[[0, 0], [1, 2]],
                                          values=[1, 2], dense_shape=[3, 4])]
 
-    def FnFalse():
+    def false_fn():
       return [sparse_tensor.SparseTensor(indices=[[0, 0], [2, 1]],
                                          values=[3, 4], dense_shape=[3, 4])]
 
@@ -686,26 +705,29 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                                              values=[1, 2], dense_shape=[3, 4])
     value2 = sparse_tensor.SparseTensorValue(indices=[[0, 0], [2, 1]],
                                              values=[3, 4], dense_shape=[3, 4])
-    self._testShape(FnTrue, FnFalse, shape)
-    self._testReturnValues(FnTrue, FnFalse, value1, value2)
-    self._testShape(FnTrue, FnFalse, [shape], strict=True)
-    self._testReturnValues(FnTrue, FnFalse, [value1], [value2], strict=True)
+    self._testShape(true_fn, false_fn, shape)
+    self._testReturnValues(true_fn, false_fn, value1, value2)
+    self._testShape(true_fn, false_fn, [shape], strict=True)
+    self._testReturnValues(true_fn, false_fn, [value1], [value2], strict=True)
 
   def test_tensors_with_partially_specified_shapes(self):
-    def _BuildBranch(dtype, shape):
+
+    def _build_branch(dtype, shape):
       a = array_ops.placeholder(dtype=dtype, shape=shape[0])
       b = array_ops.placeholder(dtype=dtype, shape=shape[1])
       c = array_ops.placeholder(dtype=dtype, shape=shape[2])
-      def _Build():
+
+      def _build():
         return a, b, c
-      return _Build, (a, b, c)
+
+      return _build, (a, b, c)
 
     for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
       shape = (tensor_shape.TensorShape([None, 2]),
                tensor_shape.TensorShape([None]),
                tensor_shape.TensorShape([3, None]))
-      fn_true, true_tensors = _BuildBranch(dtype, shape)
-      fn_false, false_tensors = _BuildBranch(dtype, shape)
+      fn_true, true_tensors = _build_branch(dtype, shape)
+      fn_false, false_tensors = _build_branch(dtype, shape)
       self._testShape(fn_true, fn_false, shape)
       self._testReturnValues(fn_true, fn_false,
                              (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])),
@@ -719,8 +741,8 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
   def test_tensor_arrays(self):
     element_shape = tensor_shape.TensorShape([2])
-    ta1 = _CreateTensorArray(4, element_shape)
-    ta2 = _CreateTensorArray(4, element_shape)
+    ta1 = _create_tensor_array(4, element_shape)
+    ta2 = _create_tensor_array(4, element_shape)
     shape = tensor_array_ops.TensorArray
     fn_true = lambda: ta1
     fn_false = lambda: ta2
@@ -728,7 +750,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
   def test_tensor_array_reads(self):
     shape = tensor_shape.TensorShape([2])
-    ta = _CreateTensorArray(4, shape)
+    ta = _create_tensor_array(4, shape)
     fn_true = lambda: ta.read(0)
     fn_false = lambda: ta.read(1)
     self._testShape(fn_true, fn_false, shape)
@@ -827,23 +849,26 @@ class DataTypesTest(test_util.TensorFlowTestCase):
              tensor_shape.TensorShape([5, 5]),
              tensor_shape.TensorShape([])]
 
-    def FnTrue():
+    def true_fn():
       return [constant_op.constant(1),
               TestTuple(constant_op.constant(2), [3, 4]),
               array_ops.zeros([5, 5]), 6]
 
-    def FnFalse():
+    def false_fn():
       return [constant_op.constant(11),
               TestTuple(constant_op.constant(12), [13, 14]),
               array_ops.ones([5, 5]), 16]
 
-    self._testShape(FnTrue, FnFalse, shape)
-    self._testReturnValues(FnTrue, FnFalse,
-                           [1, TestTuple(2, [3, 4]), np.zeros([5, 5]), 6],
-                           [11, TestTuple(12, [13, 14]), np.ones([5, 5]), 16])
+    self._testShape(true_fn, false_fn, shape)
+    self._testReturnValues(
+        true_fn, false_fn,
+        [1, TestTuple(2, [3, 4]), np.zeros([5, 5]), 6],
+        [11, TestTuple(12, [13, 14]),
+         np.ones([5, 5]), 16])
 
   def test_cond_inside_while_loop(self):
-    def Body(i, matrix):
+
+    def body(i, matrix):
       result_tuple, unused_matrix = control_flow_ops.cond(
           constant_op.constant(True),
           lambda: (TestTuple(matrix * 2, matrix * 4), matrix),
@@ -852,8 +877,9 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
     iteration, matrix = control_flow_ops.while_loop(
         lambda i, matrix: i < 10,
-        Body,
-        loop_vars=[constant_op.constant(0), array_ops.ones([2, 2])])
+        body,
+        loop_vars=[constant_op.constant(0),
+                   array_ops.ones([2, 2])])
 
     self.assertEqual(iteration.get_shape(), tensor_shape.TensorShape([]))
     self.assertEqual(matrix.get_shape(), tensor_shape.TensorShape([2, 2]))
-- 
GitLab


From 297d03703f400878a353918c5e1ae55ea927f6a9 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Fri, 9 Feb 2018 11:28:21 -0800
Subject: [PATCH 900/966] Simplify and extend the management of
 input-conditional losses and updates.

Instead of keeping track of dependencies manually, we rely on the TF graph structure to find dependencies. The resulting implementation is cleaner and more robust.

This does not change any existing behavior. It extends the current behavior by allowing `get_updates_for(inputs)` and `get_losses_for(inputs)` to be called from *any* tensors upstream of the layer, not just the immediate layer's inputs.

PiperOrigin-RevId: 185168680
---
 .../keras/_impl/keras/engine/topology_test.py |  30 ++--
 .../keras/_impl/keras/engine/training.py      |   9 +-
 .../_impl/keras/layers/normalization_test.py  |  12 +-
 .../keras/_impl/keras/layers/recurrent.py     |  33 ++--
 .../_impl/keras/layers/recurrent_test.py      |  40 +++--
 .../keras/_impl/keras/layers/wrappers.py      |  27 +---
 tensorflow/python/keras/_impl/keras/models.py |  30 ----
 tensorflow/python/layers/base.py              | 130 ++++++++--------
 tensorflow/python/layers/base_test.py         |  88 +++++++++++
 tensorflow/python/layers/network.py           | 120 +++++++++------
 tensorflow/python/layers/network_test.py      | 141 ++++++++++++++++--
 tensorflow/python/layers/utils.py             |  42 ++++++
 tensorflow/python/layers/utils_test.py        |  30 ++++
 ...nsorflow.keras.layers.-bidirectional.pbtxt |   4 +-
 .../tensorflow.keras.layers.-g-r-u.pbtxt      |   2 +-
 .../tensorflow.keras.layers.-l-s-t-m.pbtxt    |   2 +-
 .../tensorflow.keras.layers.-r-n-n.pbtxt      |   2 +-
 ...ensorflow.keras.layers.-simple-r-n-n.pbtxt |   2 +-
 ...ow.keras.layers.-stacked-r-n-n-cells.pbtxt |   2 +-
 ...rflow.keras.layers.-time-distributed.pbtxt |   4 +-
 .../tensorflow.keras.layers.-wrapper.pbtxt    |   4 +-
 21 files changed, 519 insertions(+), 235 deletions(-)

diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
index 85979d1745..0673e42376 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
@@ -26,6 +26,8 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 try:
@@ -42,22 +44,28 @@ except ImportError:
 class TopologyConstructionTest(test.TestCase):
 
   def test_get_updates_for(self):
-    a = keras.layers.Input(shape=(2,))
+    a = keras.layers.Input(shape=(1,))
     dense_layer = keras.layers.Dense(1)
-    dense_layer.add_update(0, inputs=a)
-    dense_layer.add_update(1, inputs=None)
+    dense_layer.build((None, 1))
+    update_1 = state_ops.assign_add(dense_layer.kernel, a)
+    update_2 = state_ops.assign_add(dense_layer.kernel, [[1.]])
+    dense_layer.add_update(update_1, inputs=a)
+    dense_layer.add_update(update_2, inputs=None)
 
-    self.assertListEqual(dense_layer.get_updates_for(a), [0])
-    self.assertListEqual(dense_layer.get_updates_for(None), [1])
+    self.assertListEqual(dense_layer.get_updates_for(a), [update_1])
+    self.assertListEqual(dense_layer.get_updates_for(None), [update_2])
 
   def test_get_losses_for(self):
-    a = keras.layers.Input(shape=(2,))
+    a = keras.layers.Input(shape=(1,))
     dense_layer = keras.layers.Dense(1)
-    dense_layer.add_loss(0, inputs=a)
-    dense_layer.add_loss(1, inputs=None)
-
-    self.assertListEqual(dense_layer.get_losses_for(a), [0])
-    self.assertListEqual(dense_layer.get_losses_for(None), [1])
+    dense_layer.build((None, 1))
+    loss_1 = math_ops.reduce_sum(a)
+    loss_2 = math_ops.reduce_sum(dense_layer.kernel)
+    dense_layer.add_loss(loss_1, inputs=a)
+    dense_layer.add_loss(loss_2, inputs=None)
+
+    self.assertListEqual(dense_layer.get_losses_for(a), [loss_1])
+    self.assertListEqual(dense_layer.get_losses_for(None), [loss_2])
 
   def test_trainable_weights(self):
     a = keras.layers.Input(shape=(2,))
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index faca964d4c..118598831d 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -1020,10 +1020,13 @@ class Model(Network):
 
       with K.name_scope('training'):
         with K.name_scope(self.optimizer.__class__.__name__):
-          training_updates = self.optimizer.get_updates(
+          # Training updates
+          updates = self.optimizer.get_updates(
               params=self._collected_trainable_weights, loss=self.total_loss)
-
-        updates = self.updates + training_updates
+        # Unconditional updates
+        updates += self.get_updates_for(None)
+        # Conditional updates relevant to this model
+        updates += self.get_updates_for(self._feed_inputs)
         # Gets loss and metrics. Updates weights at each call.
         self.train_function = K.function(
             inputs, [self.total_loss] + self.metrics_tensors,
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
index 39a90e5970..2b3628c3f1 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
@@ -132,13 +132,19 @@ class NormalizationLayersTest(test.TestCase):
       model.compile('sgd', 'mse')
       model.train_on_batch(x, x)
 
-      assert len(model.updates) == 2
+      self.assertEqual(len(bn.updates), 4)
+      self.assertEqual(len(model.updates), 2)
+      self.assertEqual(len(model.get_updates_for(x1)), 0)
+      self.assertEqual(len(model.get_updates_for(x2)), 2)
 
       # Test model-level reuse
       x3 = keras.layers.Input(shape=(10,))
       y3 = model(x3)
-      new_model = keras.models.Model(x3, y3)
-      assert len(model.updates) == 2
+      new_model = keras.models.Model(x3, y3, name='new_model')
+
+      self.assertEqual(len(new_model.updates), 2)
+      self.assertEqual(len(model.updates), 4)
+      self.assertEqual(len(new_model.get_updates_for(x3)), 2)
       new_model.compile('sgd', 'mse')
       new_model.train_on_batch(x, x)
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 5c1b523aa3..4bf6ae975f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -202,17 +202,16 @@ class StackedRNNCells(Layer):
     losses = []
     for cell in self.cells:
       if isinstance(cell, Layer):
-        cell_losses = cell.losses
-        losses += cell_losses
-    return losses
+        losses += cell.losses
+    return losses + self._losses
 
-  def get_losses_for(self, inputs=None):
-    losses = []
+  @property
+  def updates(self):
+    updates = []
     for cell in self.cells:
       if isinstance(cell, Layer):
-        cell_losses = cell.get_losses_for(inputs)
-        losses += cell_losses
-    return losses
+        updates += cell.updates
+    return updates + self._updates
 
 
 @tf_export('keras.layers.RNN')
@@ -617,7 +616,7 @@ class RNN(Layer):
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append((self.states[i], states[i]))
+        updates.append(K.update(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     if self.return_sequences:
@@ -777,15 +776,17 @@ class RNN(Layer):
 
   @property
   def losses(self):
+    losses = []
     if isinstance(self.cell, Layer):
-      return self.cell.losses
-    return []
+      losses += self.cell.losses
+    return losses + self._losses
 
-  def get_losses_for(self, inputs=None):
+  @property
+  def updates(self):
+    updates = []
     if isinstance(self.cell, Layer):
-      cell_losses = self.cell.get_losses_for(inputs)
-      return cell_losses + super(RNN, self).get_losses_for(inputs)
-    return super(RNN, self).get_losses_for(inputs)
+      updates += self.cell.updates
+    return updates + self._updates
 
 
 @tf_export('keras.layers.SimpleRNNCell')
@@ -2463,7 +2464,7 @@ class Recurrent(Layer):
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append((self.states[i], states[i]))
+        updates.append(K.update(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     # Properly set learning phase
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index a1407a24ea..ab48a63e35 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -353,13 +353,10 @@ class RNNTest(test.TestCase):
       self.assertAllClose(y_np, y_np_3, atol=1e-4)
 
   def test_stacked_rnn_attributes(self):
-    cells = [keras.layers.LSTMCell(3),
-             keras.layers.LSTMCell(3, kernel_regularizer='l2')]
+    cells = [keras.layers.LSTMCell(1),
+             keras.layers.LSTMCell(1)]
     layer = keras.layers.RNN(cells)
-    layer.build((None, None, 5))
-
-    # Test regularization losses
-    self.assertEqual(len(layer.losses), 1)
+    layer.build((None, None, 1))
 
     # Test weights
     self.assertEqual(len(layer.trainable_weights), 6)
@@ -367,11 +364,32 @@ class RNNTest(test.TestCase):
     self.assertEqual(len(layer.trainable_weights), 3)
     self.assertEqual(len(layer.non_trainable_weights), 3)
 
-    # Test `get_losses_for`
-    x = keras.Input((None, 5))
-    y = keras.backend.sum(x)
-    cells[0].add_loss(y, inputs=x)
-    self.assertEqual(layer.get_losses_for(x), [y])
+    # Test `get_losses_for` and `losses`
+    x = keras.Input((None, 1))
+    loss_1 = keras.backend.sum(x)
+    loss_2 = keras.backend.sum(cells[0].kernel)
+    cells[0].add_loss(loss_1, inputs=x)
+    cells[0].add_loss(loss_2)
+    self.assertEqual(len(layer.losses), 2)
+    self.assertEqual(layer.get_losses_for(None), [loss_2])
+    self.assertEqual(layer.get_losses_for(x), [loss_1])
+
+    # Test `get_updates_for` and `updates`
+    cells = [keras.layers.LSTMCell(1),
+             keras.layers.LSTMCell(1)]
+    layer = keras.layers.RNN(cells)
+    layer.build((None, None, 1))
+
+    x = keras.Input((None, 1))
+    update_1 = keras.backend.update_add(
+        cells[0].kernel, x[0, 0, 0] * cells[0].kernel)
+    update_2 = keras.backend.update_add(
+        cells[0].kernel, keras.backend.ones_like(cells[0].kernel))
+    cells[0].add_update(update_1, inputs=x)
+    cells[0].add_update(update_2)
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(layer.get_updates_for(None), [update_2])
+    self.assertEqual(layer.get_updates_for(x), [update_1])
 
   def test_rnn_dynamic_trainability(self):
     layer_class = keras.layers.SimpleRNN
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index c697bcefc9..f053aa1d09 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -71,34 +71,11 @@ class Wrapper(Layer):
 
   @property
   def updates(self):
-    if hasattr(self.layer, 'updates'):
-      return self.layer.updates
-    return []
-
-  def get_updates_for(self, inputs=None):
-    # If the wrapper modifies the inputs, use the modified inputs to
-    # get the updates from the inner layer.
-    inner_inputs = inputs
-    if inputs is not None:
-      uid = tf_layers_util.object_list_uid(inputs)
-      if uid in self._input_map:
-        inner_inputs = self._input_map[uid]
-
-    updates = self.layer.get_updates_for(inner_inputs)
-    updates += super(Wrapper, self).get_updates_for(inputs)
-    return updates
+    return self.layer.updates + self._updates
 
   @property
   def losses(self):
-    if hasattr(self.layer, 'losses'):
-      return self.layer.losses
-    return []
-
-  def get_losses_for(self, inputs=None):
-    if inputs is None:
-      losses = self.layer.get_losses_for(None)
-      return losses + super(Wrapper, self).get_losses_for(None)
-    return super(Wrapper, self).get_losses_for(inputs)
+    return self.layer.losses + self._losses
 
   def get_weights(self):
     return self.layer.get_weights()
diff --git a/tensorflow/python/keras/_impl/keras/models.py b/tensorflow/python/keras/_impl/keras/models.py
index 20736d234e..f5d44ef669 100644
--- a/tensorflow/python/keras/_impl/keras/models.py
+++ b/tensorflow/python/keras/_impl/keras/models.py
@@ -428,8 +428,6 @@ class Sequential(Model):
     # Used by Layer base class.
     self._dtype = None
     self._activity_regularizer = None
-    self._per_input_losses = {}
-    self._per_input_updates = {}
 
     # The following properties are not actually used by Keras;
     # they exist for compatibility with TF's variable scoping mechanism.
@@ -643,34 +641,6 @@ class Sequential(Model):
       return trainable_weights + weights
     return weights
 
-  @property
-  def updates(self):
-    if not self.built:
-      self.build()
-    return self.model.updates
-
-  @property
-  def state_updates(self):
-    if not self.built:
-      self.build()
-    return self.model.state_updates
-
-  def get_updates_for(self, inputs):
-    if not self.built:
-      self.build()
-    return self.model.get_updates_for(inputs)
-
-  @property
-  def losses(self):
-    if not self.built:
-      self.build()
-    return self.model.losses
-
-  def get_losses_for(self, inputs):
-    if not self.built:
-      self.build()
-    return self.model.get_losses_for(inputs)
-
   @property
   def regularizers(self):
     if not self.built:
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 3a3c559541..0d78ef25f2 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -127,8 +127,6 @@ class Layer(object):
     self._losses = []
     self._reuse = kwargs.get('_reuse')
     self._graph = ops.get_default_graph()
-    self._per_input_losses = {}
-    self._per_input_updates = {}
     self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
     call_fn_args = estimator_util.fn_args(self.call)
     self._compute_previous_mask = ('mask' in call_fn_args or
@@ -252,39 +250,32 @@ class Layer(object):
 
     Arguments:
       updates: Update op, or list/tuple of update ops.
-      inputs: Optional input tensor(s) that the update(s) depend on. Must
-        match the `inputs` argument passed to the `__call__` method at the time
-        the updates are created. If `None` is passed, the updates are assumed
-        to be unconditional, and will apply across all dataflows of the layer.
+      inputs: If anything other than None is passed, it signals the updates
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for BatchNormalization updates, for instance.
+        If None, the updates will be taken into account unconditionally,
+        and you are responsible for making sure that any dependency they might
+        have is available at runtime.
+        A step counter might fall into this category.
     """
     if context.in_eager_mode():
       return  # Updates already applied when in eager mode.
+
     updates = _to_list(updates)
-    if not updates:
-      return
     self._updates += updates
-    if inputs is not None:
-      inputs = nest.flatten(inputs)
-    if not inputs:
-      inputs = None
-    if inputs is not None:
-      # We compute an ID that uniquely identifies the list of tensors.
-      # This ID is order-sensitive.
-      inputs_hash = layers_util.object_list_uid(inputs)
+    if inputs is None:
+      for u in updates:
+        u._unconditional_update = True  # pylint: disable=protected-access
     else:
-      inputs_hash = None
-    if inputs_hash not in self._per_input_updates:
-      self._per_input_updates[inputs_hash] = []
-    self._per_input_updates[inputs_hash] += updates
+      for u in updates:
+        u._unconditional_update = False  # pylint: disable=protected-access
 
   def get_updates_for(self, inputs):
     """Retrieves updates relevant to a specific set of inputs.
 
     Arguments:
       inputs: Input tensor or list/tuple of input tensors.
-        Must match the `inputs` argument passed to the `__call__` method
-        at the time the updates were created.
-        If you pass `inputs=None`, unconditional updates are returned.
 
     Returns:
       List of update ops of the layer that depend on `inputs`.
@@ -293,18 +284,24 @@ class Layer(object):
       RuntimeError: If called in Eager mode.
     """
     if context.in_eager_mode():
-      raise RuntimeError('Layer.get_updates_for not supported in Eager mode.')
+      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
+
+    # Updates disabled if layer is not trainable and not explicitly stateful.
     if not self.trainable and not self.stateful:
       return []
-    if inputs is not None:
-      inputs = nest.flatten(inputs)
-    if not inputs:
-      inputs = None
-    if inputs is not None:
-      inputs_hash = layers_util.object_list_uid(inputs)
-    else:
-      inputs_hash = None
-    return self._per_input_updates.get(inputs_hash, [])
+
+    if inputs is None:
+      # Requesting unconditional updates.
+      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
+
+    # Requesting input-conditional updates.
+    inputs = nest.flatten(inputs)
+    reachable = layers_util.get_reachable_from_inputs(inputs, self.updates)
+    updates = []
+    for update in self.updates:
+      if update in reachable:
+        updates.append(update)
+    return updates
 
   @property
   def losses(self):
@@ -344,9 +341,11 @@ class Layer(object):
 
     Arguments:
       losses: Loss tensor, or list/tuple of tensors.
-      inputs: Optional input tensor(s) that the loss(es) depend on. Must
-        match the `inputs` argument passed to the `__call__` method at the time
-        the losses are created. If `None` is passed, the losses are assumed
+      inputs: If anything other than None is passed, it signals the losses
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for activity regularization losses, for instance.
+        If `None` is passed, the losses are assumed
         to be unconditional, and will apply across all dataflows of the layer
         (e.g. weight regularization losses).
 
@@ -354,24 +353,25 @@ class Layer(object):
       RuntimeError: If called in Eager mode.
     """
     if context.in_eager_mode():
+      # TODO(fchollet): it should be possible (and highly desirable) to support
+      # `add_loss` in eager mode. This allows great convenience and flexibility
+      # in defining custom losses on the fly (e.g. in VAEs).
+      # Simply appending the loss value to `self._losses`
+      # is the correct behavior.
+      # The only caveat is that we need to force the user to only call
+      # `add_loss` from inside a model or Layer's `call` method
+      # (otherwise the loss computation cannot be backproped through).
       raise RuntimeError('Layer.add_loss not supported in Eager mode.')
+
     losses = _to_list(losses)
-    if not losses:
-      return
     self._losses += losses
-    if inputs is not None:
-      inputs = nest.flatten(inputs)
-    if not inputs:
-      inputs = None
-    if inputs is not None:
-      # We compute an ID that uniquely identifies the list of tensors.
-      # This ID is order-sensitive.
-      inputs_hash = layers_util.object_list_uid(inputs)
+    if inputs is None:
+      for loss in losses:
+        loss._unconditional_loss = True  # pylint: disable=protected-access
     else:
-      inputs_hash = None
-    if inputs_hash not in self._per_input_losses:
-      self._per_input_losses[inputs_hash] = []
-    self._per_input_losses[inputs_hash] += losses
+      for loss in losses:
+        loss._unconditional_loss = False  # pylint: disable=protected-access
+    # TODO(fchollet): deprecate collection below.
     _add_elements_to_collection(losses, ops.GraphKeys.REGULARIZATION_LOSSES)
 
   def get_losses_for(self, inputs):
@@ -379,10 +379,6 @@ class Layer(object):
 
     Arguments:
       inputs: Input tensor or list/tuple of input tensors.
-        Must match the `inputs` argument passed to the `__call__`
-        method at the time the losses were created.
-        If you pass `inputs=None`, unconditional losses are returned,
-        such as weight regularization losses.
 
     Returns:
       List of loss tensors of the layer that depend on `inputs`.
@@ -392,15 +388,23 @@ class Layer(object):
     """
     if context.in_eager_mode():
       raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
-    if inputs is not None:
-      inputs = nest.flatten(inputs)
-    if not inputs:
-      inputs = None
-    if inputs is not None:
-      inputs_hash = layers_util.object_list_uid(inputs)
-    else:
-      inputs_hash = None
-    return self._per_input_losses.get(inputs_hash, [])
+
+    if inputs is None:
+      # Requesting unconditional losses.
+      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
+
+    # Requesting input-conditional losses.
+    inputs = nest.flatten(inputs)
+    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
+    # The losses we want to return will be part of this set.
+    # To avoid unnecessary work, we stop the search in case all of
+    # `self.losses` have been retrieved.
+    reachable = layers_util.get_reachable_from_inputs(inputs, self.losses)
+    losses = []
+    for loss in self.losses:
+      if loss in reachable:
+        losses.append(loss)
+    return losses
 
   def build(self, _):
     """Creates the variables of the layer."""
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 06ba214c0f..91b8988d31 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
@@ -555,6 +556,93 @@ class BaseLayerTest(test.TestCase):
         self.assertEqual(len(layer.trainable_variables), 1)
         self.assertEqual(layer.variables[0].graph, outer_graph)
 
+  def testGetUpdateFor(self):
+
+    class MyLayer(base_layers.Layer):
+
+      def build(self, input_shape):
+        self.a = self.add_variable('a',
+                                   (),
+                                   dtypes.float32,
+                                   trainable=False)
+        self.b = self.add_variable('b',
+                                   (),
+                                   dtypes.float32,
+                                   trainable=False)
+        self.add_update(state_ops.assign_add(self.a, 1., name='b_update'))
+        self.built = True
+
+      def call(self, inputs):
+        self.add_update(state_ops.assign_add(self.a, inputs, name='a_update'),
+                        inputs=True)
+        return inputs + 1
+
+    layer = MyLayer()
+    inputs = array_ops.placeholder(dtypes.float32, (), 'inputs')
+    intermediate_inputs = inputs + 1
+    outputs = layer.apply(intermediate_inputs)
+
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+    self.assertEqual(len(layer.get_updates_for([inputs])), 1)
+    self.assertEqual(len(layer.get_updates_for([intermediate_inputs])), 1)
+    self.assertEqual(len(layer.get_updates_for([outputs])), 0)
+
+    # Call same layer on new input, creating one more conditional update
+    inputs = array_ops.placeholder(dtypes.float32, (), 'inputs')
+    intermediate_inputs = inputs + 1
+    outputs = layer.apply(intermediate_inputs)
+
+    self.assertEqual(len(layer.updates), 3)
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+    # Check that we are successfully filtering out irrelevant updates
+    self.assertEqual(len(layer.get_updates_for([inputs])), 1)
+    self.assertEqual(len(layer.get_updates_for([intermediate_inputs])), 1)
+    self.assertEqual(len(layer.get_updates_for([outputs])), 0)
+
+  def testGetLossesFor(self):
+
+    class MyLayer(base_layers.Layer):
+
+      def build(self, input_shape):
+        self.a = self.add_variable('a',
+                                   (),
+                                   dtypes.float32,
+                                   trainable=False)
+        self.b = self.add_variable('b',
+                                   (),
+                                   dtypes.float32,
+                                   trainable=False)
+        self.add_loss(self.a)
+        self.built = True
+
+      def call(self, inputs):
+        self.add_loss(inputs, inputs=True)
+        return inputs + 1
+
+    layer = MyLayer()
+    inputs = array_ops.placeholder(dtypes.float32, (), 'inputs')
+    intermediate_inputs = inputs + 1
+    outputs = layer.apply(intermediate_inputs)
+
+    self.assertEqual(len(layer.losses), 2)
+    self.assertEqual(len(layer.get_losses_for(None)), 1)
+    self.assertEqual(len(layer.get_losses_for([inputs])), 1)
+    self.assertEqual(len(layer.get_losses_for([intermediate_inputs])), 1)
+    self.assertEqual(len(layer.get_losses_for([outputs])), 0)
+
+    # Call same layer on new input, creating one more conditional loss
+    inputs = array_ops.placeholder(dtypes.float32, (), 'inputs')
+    intermediate_inputs = inputs + 1
+    outputs = layer.apply(intermediate_inputs)
+
+    self.assertEqual(len(layer.losses), 3)
+    self.assertEqual(len(layer.get_losses_for(None)), 1)
+    # Check that we are successfully filtering out irrelevant losses
+    self.assertEqual(len(layer.get_losses_for([inputs])), 1)
+    self.assertEqual(len(layer.get_losses_for([intermediate_inputs])), 1)
+    self.assertEqual(len(layer.get_losses_for([outputs])), 0)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py
index 6de8f35502..499f53d21b 100644
--- a/tensorflow/python/layers/network.py
+++ b/tensorflow/python/layers/network.py
@@ -256,8 +256,6 @@ class GraphNetwork(base.Layer):
     # self.input_spec
 
     # Private attributes to implement compatibility with Layer.
-    self._per_input_losses = {}
-    self._per_input_updates = {}
     self._updates = []
     self._losses = []
     self._scope = None
@@ -587,28 +585,72 @@ class GraphNetwork(base.Layer):
 
     Will only include updates that are either
     unconditional, or conditional on inputs to this model
-    (e.g. will not include updates that depend on tensors
-    that aren't inputs to this model).
+    (e.g. will not include updates that were created by layers of this model
+    outside of the model).
+
+    Effectively, `network.updates` behaves like `layer.updates`.
+
+    Concrete example:
+
+    ```python
+      bn = keras.layers.BatchNormalization()
+      x1 = keras.layers.Input(shape=(10,))
+      _ = bn(x1)  # This creates 2 updates.
+
+      x2 = keras.layers.Input(shape=(10,))
+      y2 = bn(x2)  # This creates 2 more updates.
+
+      # The BN layer has now 4 updates.
+      self.assertEqual(len(bn.updates), 4)
+
+      # Let's create a model from x2 to y2.
+      model = keras.models.Model(x2, y2)
+
+      # The model does not list all updates from its underlying layers,
+      # but only the updates that are relevant to it. Updates created by layers
+      # outside of the model are discarded.
+      self.assertEqual(len(model.updates), 2)
+
+      # If you keep calling the model, you append to its updates, just like
+      # what happens for a layer.
+      x3 = keras.layers.Input(shape=(10,))
+      y3 = model(x3)
+      self.assertEqual(len(model.updates), 4)
+
+      # But if you call the inner BN layer independently, you don't affect
+      # the model's updates.
+      x4 = keras.layers.Input(shape=(10,))
+      _ = bn(x4)
+      self.assertEqual(len(model.updates), 4)
+    ```
 
     Returns:
         A list of update ops.
     """
     if not self.trainable and not self.stateful:
       return []
+
     updates = []
     for layer in self.layers:
-      if hasattr(layer, 'updates'):
-        # Collect updates that are dependent on inputs
-        # that are part of the model.
-        for node_index, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
-          node_key = _make_node_key(layer.name, node_index)
-          if node_key in self._network_nodes:
-            # The model owns this layer node.
-            inputs = node.input_tensors
-            updates += layer.get_updates_for(inputs)
-        # Collect unconditional updates.
-        updates += layer.get_updates_for(None)
-    return updates
+      updates += layer.updates
+
+    # `updates` might contain irrelevant updates, so it needs to be filtered
+    # with respect to inputs the model has been called on.
+    relevant_inputs = []
+    for i in range(len(self._inbound_nodes)):
+      inputs = self.get_input_at(i)
+      if isinstance(inputs, list):
+        relevant_inputs += inputs
+      else:
+        relevant_inputs.append(inputs)
+    reachable = layers_util.get_reachable_from_inputs(relevant_inputs, updates)
+    relevant_conditional_updates = [x for x in updates if x in reachable]
+    unconditional_updates = [
+        x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
+    # A layer could be used multiple times in a nested structure,
+    # so the updates list must be de-duped.
+    return list(set(
+        relevant_conditional_updates + unconditional_updates + self._updates))
 
   @property
   def losses(self):
@@ -628,22 +670,22 @@ class GraphNetwork(base.Layer):
         losses += layer.losses
       return losses
 
-    # Retrieve losses for all internal layers.
     for layer in self.layers:
-      if hasattr(layer, 'losses'):
-        # Collect losses that are dependent on inputs
-        # that are part of the model.
-        for node_index, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
-          node_key = _make_node_key(layer.name, node_index)
-          if node_key in self._network_nodes:
-            # The model owns this layer node.
-            inputs = node.input_tensors
-            losses += layer.get_losses_for(inputs)
-        # Collect unconditional losses.
-        losses += layer.get_losses_for(None)
-    # Add any potential unconditional model-level loss.
-    losses += self.get_losses_for(None)
-    return losses
+      losses += layer.losses
+
+    relevant_inputs = []
+    for i in range(len(self._inbound_nodes)):
+      inputs = self.get_input_at(i)
+      if isinstance(inputs, list):
+        relevant_inputs += inputs
+      else:
+        relevant_inputs.append(inputs)
+    reachable = layers_util.get_reachable_from_inputs(relevant_inputs, losses)
+    relevant_conditional_losses = [x for x in losses if x in reachable]
+    unconditional_losses = [
+        x for x in losses if x._unconditional_loss]  # pylint: disable=protected-access
+    return list(set(
+        relevant_conditional_losses + unconditional_losses + self._losses))
 
   @property
   def trainable_weights(self):
@@ -805,7 +847,6 @@ class GraphNetwork(base.Layer):
           layer, node_index, tensor_index = self._output_coordinates[i]
           shape_key = layer.name + '_%s_%s' % (node_index, tensor_index)
           output_shapes.append(layers_to_output_shapes[shape_key])
-
         # Store in cache.
         self._output_shape_cache[cache_key] = output_shapes
     else:
@@ -915,20 +956,6 @@ class GraphNetwork(base.Layer):
                 # Apply activity regularizer if any:
                 layer.add_loss(regularization_losses, computed_tensors)
 
-          if context.in_graph_mode():
-            # Update model updates and losses:
-            # Keep track of updates that depend on the inputs
-            # (e.g. BN updates).
-            self.add_update(layer.get_updates_for(computed_tensors), inputs)
-            # Keep track of unconditional updates (e.g. a counter).
-            self.add_update(layer.get_updates_for(None), None)
-            # Keep track of losses that depend on the inputs
-            # (e.g. activity regularizers).
-            self.add_loss(layer.get_losses_for(computed_tensors), inputs)
-            # Keep track of unconditional losses
-            # (e.g. weight regularizers).
-            self.add_loss(layer.get_losses_for(None), None)
-
           # Update tensor_map.
           for x, y, mask in zip(reference_output_tensors, output_tensors,
                                 output_masks):
@@ -958,6 +985,7 @@ class GraphNetwork(base.Layer):
                    + '_' + layers_util.object_list_uid(masks))
       self._output_tensor_cache[cache_key] = output_tensors
       self._output_mask_cache[cache_key] = output_masks
+
       if output_shapes is not None:
         input_shapes = [layers_util.static_shape(x) for x in inputs]
         cache_key = layers_util.object_list_uid(input_shapes)
diff --git a/tensorflow/python/layers/network_test.py b/tensorflow/python/layers/network_test.py
index 7a2c7fb3fc..f46ebdf2af 100644
--- a/tensorflow/python/layers/network_test.py
+++ b/tensorflow/python/layers/network_test.py
@@ -27,29 +27,137 @@ from tensorflow.python.layers import base as base_layers
 from tensorflow.python.layers import core as core_layers
 from tensorflow.python.layers import network as network_layers
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 
 class BaseLayerCompatibilityTest(test.TestCase):
 
-  def test_get_updates_for(self):
-    a = network_layers.Input(shape=(2,))
-    dense_layer = core_layers.Dense(1)
-    dense_layer.add_update(0, inputs=a)
-    dense_layer.add_update(1, inputs=None)
+  def test_get_updates(self):
 
-    self.assertEqual(dense_layer.get_updates_for(a), [0])
-    self.assertEqual(dense_layer.get_updates_for(None), [1])
+    class MyLayer(base_layers.Layer):
 
-  def test_get_losses_for(self):
-    a = network_layers.Input(shape=(2,))
-    dense_layer = core_layers.Dense(1)
-    dense_layer.add_loss(0, inputs=a)
-    dense_layer.add_loss(1, inputs=None)
+      def build(self, input_shape):
+        self.a = self.add_variable('a',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.b = self.add_variable('b',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.add_update(state_ops.assign_add(self.a, [[1.]]))
+        self.built = True
 
-    self.assertEqual(dense_layer.get_losses_for(a), [0])
-    self.assertEqual(dense_layer.get_losses_for(None), [1])
+      def call(self, inputs):
+        self.add_update(state_ops.assign_add(self.a, inputs),
+                        inputs=True)
+        return inputs + 1
+
+    x1 = network_layers.Input(shape=(1,))
+    layer = MyLayer()
+    _ = layer.apply(x1)
+
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.get_updates_for(x1)), 1)
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+
+    x2 = network_layers.Input(shape=(1,))
+    y2 = layer.apply(x2)
+
+    self.assertEqual(len(layer.updates), 3)
+    self.assertEqual(len(layer.get_updates_for(x1)), 1)
+    self.assertEqual(len(layer.get_updates_for(x2)), 1)
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+
+    network = network_layers.GraphNetwork(x2, y2)
+    self.assertEqual(len(network.updates), 2)
+    self.assertEqual(len(network.get_updates_for(x1)), 0)
+    self.assertEqual(len(network.get_updates_for(x2)), 1)
+    self.assertEqual(len(network.get_updates_for(None)), 1)
+
+    x3 = network_layers.Input(shape=(1,))
+    _ = layer.apply(x3)
+    self.assertEqual(len(network.updates), 2)
+
+    x4 = network_layers.Input(shape=(1,))
+    _ = network(x4)
+    self.assertEqual(len(network.updates), 3)
+    self.assertEqual(len(network.get_updates_for(x2)), 1)
+    self.assertEqual(len(network.get_updates_for(x4)), 1)
+    self.assertEqual(len(network.get_updates_for(None)), 1)
+
+    network.add_update(state_ops.assign_add(layer.a, [[1]]))
+    self.assertEqual(len(network.updates), 4)
+    self.assertEqual(len(network.get_updates_for(None)), 2)
+
+    network.add_update(state_ops.assign_add(layer.a, x4), inputs=True)
+    self.assertEqual(len(network.updates), 5)
+    self.assertEqual(len(network.get_updates_for(x4)), 2)
+
+  def test_get_losses(self):
+
+    class MyLayer(base_layers.Layer):
+
+      def build(self, input_shape):
+        self.a = self.add_variable('a',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.b = self.add_variable('b',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.add_loss(math_ops.reduce_sum(self.a))
+        self.built = True
+
+      def call(self, inputs):
+        self.add_loss(math_ops.reduce_sum(inputs),
+                      inputs=True)
+        return inputs + 1
+
+    x1 = network_layers.Input(shape=(1,))
+    layer = MyLayer()
+    _ = layer.apply(x1)
+
+    self.assertEqual(len(layer.losses), 2)
+    self.assertEqual(len(layer.get_losses_for(x1)), 1)
+    self.assertEqual(len(layer.get_losses_for(None)), 1)
+
+    x2 = network_layers.Input(shape=(1,))
+    y2 = layer.apply(x2)
+
+    self.assertEqual(len(layer.losses), 3)
+    self.assertEqual(len(layer.get_losses_for(x1)), 1)
+    self.assertEqual(len(layer.get_losses_for(x2)), 1)
+    self.assertEqual(len(layer.get_losses_for(None)), 1)
+
+    network = network_layers.GraphNetwork(x2, y2)
+    self.assertEqual(len(network.losses), 2)
+    self.assertEqual(len(network.get_losses_for(x1)), 0)
+    self.assertEqual(len(network.get_losses_for(x2)), 1)
+    self.assertEqual(len(network.get_losses_for(None)), 1)
+
+    x3 = network_layers.Input(shape=(1,))
+    _ = layer.apply(x3)
+    self.assertEqual(len(network.losses), 2)
+
+    x4 = network_layers.Input(shape=(1,))
+    _ = network(x4)
+    self.assertEqual(len(network.losses), 3)
+    self.assertEqual(len(network.get_losses_for(x2)), 1)
+    self.assertEqual(len(network.get_losses_for(x4)), 1)
+    self.assertEqual(len(network.get_losses_for(None)), 1)
+
+    network.add_loss(math_ops.reduce_sum(layer.a))
+    self.assertEqual(len(network.losses), 4)
+    self.assertEqual(len(network.get_losses_for(None)), 2)
+
+    network.add_loss(math_ops.reduce_sum(x4), inputs=True)
+    self.assertEqual(len(network.losses), 5)
+    self.assertEqual(len(network.get_losses_for(x4)), 2)
 
   def testTopologicalAttributes(self):
     # test layer attributes / methods related to cross-layer connectivity.
@@ -299,9 +407,10 @@ class NetworkTest(test.TestCase):
 
   def testNetworkAttributes(self):
     x = network_layers.Input(shape=(32,))
-    z = core_layers.Dense(2, kernel_regularizer=lambda x: 0.01 * (x**2))(x)
+    layer = core_layers.Dense(2, kernel_regularizer=lambda x: 0.01 * (x**2))
+    z = layer(x)
     dense = core_layers.Dense(2, name='dense')
-    dense.add_update(1)
+    dense.add_update(state_ops.assign_add(layer.kernel, layer.kernel * 2.))
     y = dense(z)
     net = network_layers.GraphNetwork(x, y)
 
diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py
index 7407d9a7b3..1bbf4e6dff 100644
--- a/tensorflow/python/layers/utils.py
+++ b/tensorflow/python/layers/utils.py
@@ -255,3 +255,45 @@ def static_shape(x):
     return tuple(x.get_shape().as_list())
   except ValueError:
     return None
+
+
+def get_reachable_from_inputs(inputs, targets=None):
+  """Returns the set of tensors reachable from `inputs`.
+
+  Stops if all targets have been found (target is optional).
+
+  Only valid in Symbolic mode, not Eager mode.
+
+  Args:
+    inputs: List of tensors.
+    targets: List of tensors.
+
+  Returns:
+    A set of tensors reachable from the inputs (includes the inputs themselves).
+  """
+  reachable = set(inputs)
+  if targets:
+    targets = set(targets)
+  queue = inputs[:]
+
+  while queue:
+    x = queue.pop()
+    outputs = []
+    try:
+      consumers = x.consumers()
+    except AttributeError:
+      # Case where x is a variable type
+      consumers = [x.op]
+    for z in consumers:
+      consumer_outputs = z.outputs
+      if consumer_outputs:  # May be None
+        outputs += consumer_outputs
+
+    for y in outputs:
+      if y not in reachable:
+        reachable.add(y)
+        queue.insert(0, y)
+
+    if targets and targets.issubset(reachable):
+      return reachable
+  return reachable
diff --git a/tensorflow/python/layers/utils_test.py b/tensorflow/python/layers/utils_test.py
index a560f6b6d2..c941aad7bc 100644
--- a/tensorflow/python/layers/utils_test.py
+++ b/tensorflow/python/layers/utils_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.layers import utils
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -87,5 +88,34 @@ class ConvUtilsTest(test.TestCase):
     self.assertEqual(3, utils.deconv_output_length(4, 2, 'full', 1))
     self.assertEqual(6, utils.deconv_output_length(4, 2, 'full', 2))
 
+
+class GraphUtilsTest(test.TestCase):
+
+  def testGetReachableFromInputs(self):
+
+    with self.test_session():
+      pl_1 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_2 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_3 = array_ops.placeholder(shape=None, dtype='float32')
+      x_1 = pl_1 + pl_2
+      x_2 = pl_2 * 2
+      x_3 = pl_3 + 1
+      x_4 = x_1 + x_2
+      x_5 = x_3 * pl_1
+
+      self.assertEqual(
+          utils.get_reachable_from_inputs([pl_1]),
+          {pl_1, x_1, x_4, x_5})
+      self.assertEqual(
+          utils.get_reachable_from_inputs([pl_1, pl_2]),
+          {pl_1, pl_2, x_1, x_2, x_4, x_5})
+      self.assertEqual(
+          utils.get_reachable_from_inputs([pl_3]),
+          {pl_3, x_3, x_5})
+      self.assertEqual(
+          utils.get_reachable_from_inputs([x_3]),
+          {x_3, x_5})
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
index 2f5e65a0c5..db26c3e568 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -159,7 +159,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
@@ -175,7 +175,7 @@ tf_class {
   }
   member_method {
     name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_weights"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index d0f6d2a14f..c741d4d6e6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -227,7 +227,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 0036d6805b..29d9cf78ab 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -231,7 +231,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
index b29f65d79d..ad539a7c4c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -162,7 +162,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index b875898a81..6fafc77b94 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -219,7 +219,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index db9f90caef..90c37bd986 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -158,7 +158,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
index 2a7059d9aa..40aa782a02 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -155,7 +155,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
@@ -171,7 +171,7 @@ tf_class {
   }
   member_method {
     name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_weights"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
index 58bffa0875..27a54382a4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
@@ -154,7 +154,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
@@ -170,7 +170,7 @@ tf_class {
   }
   member_method {
     name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_weights"
-- 
GitLab


From b357610f4d59953617e108f91e06c464aed144fa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 9 Feb 2018 11:33:00 -0800
Subject: [PATCH 901/966] Add dynamic_rnn support for CloudTPU Magenta RNN

PiperOrigin-RevId: 185169370
---
 tensorflow/contrib/tpu/python/tpu/tpu_sharding.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
index f8ba7d45e2..f5af03f33c 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
@@ -244,7 +244,8 @@ class ShardingPolicy(object):
               str(shapes), self.number_of_shards))
     unsharded_shapes = [self._unshard_shape(s) for s in shapes]
     for i in xrange(self.number_of_shards - 1):
-      if unsharded_shapes[i] != unsharded_shapes[self.number_of_shards - 1]:
+      if not unsharded_shapes[i].is_compatible_with(
+          unsharded_shapes[self.number_of_shards - 1]):
         raise ValueError(
             "sharded shapes %s are not consistent shards of a full shape "
             "sharded %d ways along dimension %d" % (
-- 
GitLab


From c967fed58227e919be4c844bae241874f8717933 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 9 Feb 2018 12:19:29 -0800
Subject: [PATCH 902/966] [XLA] Add a test for reduce window with large minor
 dimension.

PiperOrigin-RevId: 185175593
---
 tensorflow/compiler/xla/tests/reduce_window_test.cc | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 7f3c72671d..b11b64e40a 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -272,7 +272,7 @@ XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
 
   builder_.ReduceWindow(
       input,
-      CreateConstantFromLiteral(*Literal::CreateR0<float>(3.0f), &builder_),
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(0.0f), &builder_),
       reduce_fn,
       /*window_dimensions=*/{1, 1, 2, 1},
       /*window_strides=*/{1, 1, 1, 1}, padding);
@@ -282,7 +282,7 @@ XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   };
 
   auto expected =
-      ReferenceUtil::ReduceWindow4DGeneric(input_array, 3.0f, reduce_func,
+      ReferenceUtil::ReduceWindow4DGeneric(input_array, 0.0f, reduce_func,
                                            /*window=*/{1, 1, 2, 1},
                                            /*stride=*/{1, 1, 1, 1}, padding);
 
@@ -800,6 +800,14 @@ const R4ReduceWindowTestData kR4ReduceWindowLargeTestValues[] = {
                            /*pad_high=*/{1, 1, 0, 0},
                            /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
+
+    R4ReduceWindowTestData{/*base_bounds=*/{1, 1, 32768 - 3, 2},
+                           /*window_bounds=*/{1, 1, 4, 1},
+                           /*strides=*/{1, 1, 4, 1},
+                           /*pad_low=*/{0, 0, 1, 0},
+                           /*pad_high=*/{0, 0, 2, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kMax},
 };
 
 INSTANTIATE_TEST_CASE_P(
-- 
GitLab


From df982b8dea49eba273e33e4283c3b14eab171b04 Mon Sep 17 00:00:00 2001
From: Guangda Lai <laigd@google.com>
Date: Fri, 9 Feb 2018 12:20:38 -0800
Subject: [PATCH 903/966] Split gpu_id.h and GpuIdManager out from build target
 //tensorflow/core:gpu_runtime, to reduce the size of dependencies, so when
 other lightweight libraries like grappler utils needs the TfToCudaGpuId
 translation function it doesn't need to depend on things like stream executor
 and cuda libraries.

PiperOrigin-RevId: 185175757
---
 tensorflow/core/BUILD                         | 20 +++++++++--
 .../core/common_runtime/gpu/gpu_device.cc     | 15 +++++----
 .../core/common_runtime/gpu/gpu_device.h      |  3 +-
 .../{gpu_id_utils.cc => gpu_id_manager.cc}    | 18 +++++-----
 .../core/common_runtime/gpu/gpu_id_manager.h  | 33 +++++++++++++++++++
 ...d_utils_test.cc => gpu_id_manager_test.cc} | 24 +++++++-------
 .../core/common_runtime/gpu/gpu_id_utils.h    |  8 ++---
 .../core/common_runtime/gpu/process_state.cc  |  3 +-
 8 files changed, 86 insertions(+), 38 deletions(-)
 rename tensorflow/core/common_runtime/gpu/{gpu_id_utils.cc => gpu_id_manager.cc} (79%)
 create mode 100644 tensorflow/core/common_runtime/gpu/gpu_id_manager.h
 rename tensorflow/core/common_runtime/gpu/{gpu_id_utils_test.cc => gpu_id_manager_test.cc} (67%)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index a7f8533014..d1fb9f4445 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2255,12 +2255,23 @@ tf_cuda_library(
     ] + tf_additional_device_tracer_deps(),
 )
 
+cc_library(
+    name = "gpu_id",
+    srcs = ["common_runtime/gpu/gpu_id_manager.cc"],
+    hdrs = [
+        "common_runtime/gpu/gpu_id.h",
+        "common_runtime/gpu/gpu_id_manager.h",
+    ],
+    deps = [
+        ":lib",
+    ],
+)
+
 GPU_RUNTIME_HEADERS = [
     "common_runtime/gpu/gpu_bfc_allocator.h",
     "common_runtime/gpu/gpu_cudamalloc_allocator.h",
     "common_runtime/gpu/gpu_debug_allocator.h",
     "common_runtime/gpu/gpu_device.h",
-    "common_runtime/gpu/gpu_id.h",
     "common_runtime/gpu/gpu_id_utils.h",
     "common_runtime/gpu/gpu_init.h",
     "common_runtime/gpu/gpu_managed_allocator.h",
@@ -2279,7 +2290,6 @@ tf_cuda_library(
         "common_runtime/gpu/gpu_debug_allocator.cc",
         "common_runtime/gpu/gpu_device.cc",
         "common_runtime/gpu/gpu_device_factory.cc",
-        "common_runtime/gpu/gpu_id_utils.cc",
         "common_runtime/gpu/gpu_managed_allocator.cc",
         "common_runtime/gpu/gpu_stream_util.cc",
         "common_runtime/gpu/gpu_util.cc",
@@ -2294,6 +2304,7 @@ tf_cuda_library(
         ":core_cpu_lib",
         ":framework",
         ":framework_internal",
+        ":gpu_id",
         ":gpu_init_impl",
         ":gpu_lib",
         ":graph",
@@ -2883,6 +2894,7 @@ tf_cc_tests_gpu(
     linkstatic = tf_kernel_tests_linkstatic(),
     deps = [
         ":gpu_headers_lib",
+        ":gpu_id",
         ":gpu_runtime",
         ":test",
     ],
@@ -2894,7 +2906,7 @@ tf_cc_tests_gpu(
     srcs = glob(["user_ops/**/*_test.cc"]) + [
         "common_runtime/gpu/gpu_bfc_allocator_test.cc",
         "common_runtime/gpu/gpu_device_test.cc",
-        "common_runtime/gpu/gpu_id_utils_test.cc",
+        "common_runtime/gpu/gpu_id_manager_test.cc",
         "common_runtime/gpu/gpu_event_mgr_test.cc",
         "common_runtime/gpu/pool_allocator_test.cc",
     ],
@@ -2906,6 +2918,7 @@ tf_cc_tests_gpu(
         ":direct_session",
         ":framework",
         ":framework_internal",
+        ":gpu_id",
         ":gpu_runtime",
         ":lib",
         ":lib_internal",
@@ -3301,6 +3314,7 @@ tf_cc_test_gpu(
         ":direct_session",
         ":framework",
         ":framework_internal",
+        ":gpu_id",
         ":gpu_runtime",
         ":lib",
         ":lib_internal",
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index a9485a835e..0fb908b080 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
@@ -99,7 +100,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
         reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
     stream_ = cuda_stream;
     allocator_ = alloc;
-    const int cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id).value();
+    const int cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id).value();
     device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id];
   }
 
@@ -311,7 +312,7 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
   gpu_device_info_->stream = streams_[0]->compute;
   gpu_device_info_->default_context = device_contexts_[0];
   gpu_device_info_->event_mgr = em_.get();
-  gpu_device_info_->gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id_).value();
+  gpu_device_info_->gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value();
   set_tensorflow_gpu_device_info(gpu_device_info_);
 
   // Whether and how the GPU device uses its own threadpool.
@@ -955,7 +956,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
     while (next_tf_gpu_id < memory_limit_bytes.size()) {
       TfGpuId tf_gpu_id(next_tf_gpu_id);
       ++next_tf_gpu_id;
-      GpuIdUtil::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id);
+      GpuIdManager::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id);
     }
   }
   const int num_tf_gpus = next_tf_gpu_id;
@@ -1006,7 +1007,7 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
   const string device_name =
       strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
   GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
-  CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
+  CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
   int numa_node = dev_locality.numa_node();
   Bytes allocated_bytes = static_cast<Bytes>(memory_limit);
 
@@ -1078,7 +1079,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     all_tf_gpu_ids.push_back(TfGpuId(i));
   }
   for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
-    CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
+    CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
     // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
     // virtualized in some environments, we can't just use the GPU id.
     // NUMA locales are indexed from 0, buses are indexed from 1.
@@ -1106,7 +1107,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     LocalLinks* links = dev_locality.mutable_links();
     for (const InterconnectMap& imap : interconnects) {
       for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
-        CudaGpuId cuda_gpu_dst = GpuIdUtil::TfToCudaGpuId(tf_gpu_dst);
+        CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
         if (imap.directed_links.find({cuda_gpu_id, cuda_gpu_dst}) !=
             imap.directed_links.end()) {
           InterconnectLink* ilink = links->add_link();
@@ -1121,7 +1122,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     // add high strength links to the others.
     for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
       if (tf_gpu_id == tf_gpu_dst) continue;
-      CudaGpuId cuda_gpu_dst = GpuIdUtil::TfToCudaGpuId(tf_gpu_dst);
+      CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
       if (cuda_gpu_id == cuda_gpu_dst) {
         InterconnectLink* ilink = links->add_link();
         ilink->set_device_id(tf_gpu_dst.value());
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 82ce3a2e9d..c88daa8ff8 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/common_runtime/local_device.h"
@@ -88,7 +89,7 @@ class BaseGPUDevice : public LocalDevice {
 
   // Returns the CUDA GPU id of this device within the native driver system;
   // e.g., for CUDA this is the ordinal of the GPU within the system.
-  int gpu_id() const { return GpuIdUtil::TfToCudaGpuId(tf_gpu_id_).value(); }
+  int gpu_id() const { return GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value(); }
 
   // The executor that provides control for the device; e.g., for CUDA this
   // corresponds to the cuda context.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.cc b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
similarity index 79%
rename from tensorflow/core/common_runtime/gpu/gpu_id_utils.cc
rename to tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
index 92cd19453f..207afdca75 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_utils.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 
 #include <unordered_map>
 
@@ -24,10 +24,10 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 // Manages the map between TfGpuId and CUDA GPU id.
-class GpuIdManager {
+class TfToCudaGpuIdMap {
  public:
-  static GpuIdManager* singleton() {
-    static auto* manager = new GpuIdManager;
+  static TfToCudaGpuIdMap* singleton() {
+    static auto* manager = new TfToCudaGpuIdMap;
     return manager;
   }
 
@@ -62,13 +62,13 @@ class GpuIdManager {
 };
 }  // namespace
 
-void GpuIdUtil::InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id,
-                                      CudaGpuId cuda_gpu_id) {
-  GpuIdManager::singleton()->InsertOrDie(tf_gpu_id, cuda_gpu_id);
+void GpuIdManager::InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id,
+                                         CudaGpuId cuda_gpu_id) {
+  TfToCudaGpuIdMap::singleton()->InsertOrDie(tf_gpu_id, cuda_gpu_id);
 }
 
-CudaGpuId GpuIdUtil::TfToCudaGpuId(TfGpuId tf_gpu_id) {
-  return CudaGpuId(GpuIdManager::singleton()->FindOrDie(tf_gpu_id));
+CudaGpuId GpuIdManager::TfToCudaGpuId(TfGpuId tf_gpu_id) {
+  return CudaGpuId(TfToCudaGpuIdMap::singleton()->FindOrDie(tf_gpu_id));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager.h b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h
new file mode 100644
index 0000000000..33925d8c36
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h
@@ -0,0 +1,33 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_MANAGER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_MANAGER_H_
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+
+namespace tensorflow {
+
+// Class that manages the translation between Tensorflow GPU ids and CUDA GPU
+// ids.
+class GpuIdManager {
+ public:
+  static void InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id);
+  static CudaGpuId TfToCudaGpuId(TfGpuId tf_gpu_id);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_MANAGER_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils_test.cc b/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc
similarity index 67%
rename from tensorflow/core/common_runtime/gpu/gpu_id_utils_test.cc
rename to tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc
index bebe00a431..bdbd8d065b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_utils_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/platform/test.h"
@@ -21,33 +21,33 @@ limitations under the License.
 namespace tensorflow {
 namespace test {
 
-TEST(GpuIdTest, Basics) {
+TEST(GpuIdManagerTest, Basics) {
   TfGpuId key_0(0);
   CudaGpuId value_0(0);
-  GpuIdUtil::InsertTfCudaGpuIdPair(key_0, value_0);
-  EXPECT_EQ(value_0, GpuIdUtil::TfToCudaGpuId(key_0));
+  GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0);
+  EXPECT_EQ(value_0, GpuIdManager::TfToCudaGpuId(key_0));
 
   // Multiple calls to map the same value is ok.
-  GpuIdUtil::InsertTfCudaGpuIdPair(key_0, value_0);
-  EXPECT_EQ(value_0, GpuIdUtil::TfToCudaGpuId(key_0));
+  GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0);
+  EXPECT_EQ(value_0, GpuIdManager::TfToCudaGpuId(key_0));
 
   // Map a different TfGpuId to a different value.
   TfGpuId key_1(3);
   CudaGpuId value_1(2);
-  GpuIdUtil::InsertTfCudaGpuIdPair(key_1, value_1);
-  EXPECT_EQ(value_1, GpuIdUtil::TfToCudaGpuId(key_1));
+  GpuIdManager::InsertTfCudaGpuIdPair(key_1, value_1);
+  EXPECT_EQ(value_1, GpuIdManager::TfToCudaGpuId(key_1));
 
   // Mapping a different TfGpuId to the same value is ok.
   TfGpuId key_2(10);
-  GpuIdUtil::InsertTfCudaGpuIdPair(key_2, value_1);
-  EXPECT_EQ(value_1, GpuIdUtil::TfToCudaGpuId(key_2));
+  GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_1);
+  EXPECT_EQ(value_1, GpuIdManager::TfToCudaGpuId(key_2));
 
   // Mapping the same TfGpuId to a different value will crash the program.
-  ASSERT_DEATH(GpuIdUtil::InsertTfCudaGpuIdPair(key_2, value_0),
+  ASSERT_DEATH(GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_0),
                "Mapping the same TfGpuId to a different CUDA GPU id");
 
   // Getting an nonexistent mapping will crash the program.
-  ASSERT_DEATH(GpuIdUtil::TfToCudaGpuId(TfGpuId(100)),
+  ASSERT_DEATH(GpuIdManager::TfToCudaGpuId(TfGpuId(100)),
                "Could not find the mapping for TfGpuId");
 }
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
index 6d196b16ed..2e90687fe8 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/gtl/int_type.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -27,9 +28,6 @@ namespace gpu = ::perftools::gputools;
 // Utility methods for translation between Tensorflow GPU ids and CUDA GPU ids.
 class GpuIdUtil {
  public:
-  static void InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id);
-  static CudaGpuId TfToCudaGpuId(TfGpuId tf_gpu_id);
-
   // Convenient methods for getting the associated executor given a TfGpuId or
   // CudaGpuId.
   static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForCudaGpuId(
@@ -42,12 +40,12 @@ class GpuIdUtil {
   }
   static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForTfGpuId(
       TfGpuId tf_gpu_id) {
-    return ExecutorForCudaGpuId(GpuIdUtil::TfToCudaGpuId(tf_gpu_id));
+    return ExecutorForCudaGpuId(GpuIdManager::TfToCudaGpuId(tf_gpu_id));
   }
 
   // Verify that the cuda_gpu_id associated with a TfGpuId is legitimate.
   static void CheckValidTfGpuId(TfGpuId tf_gpu_id) {
-    const CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
+    const CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
     const int visible_device_count = GPUMachineManager()->VisibleDeviceCount();
     CHECK_LT(cuda_gpu_id.value(), visible_device_count)
         << "cuda_gpu_id is outside discovered device range."
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index b195de7cba..61013bd1ac 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/common_runtime/gpu/pool_allocator.h"
@@ -124,7 +125,7 @@ Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options,
       return nullptr;
     }
 
-    const CudaGpuId cuda_gpu_id = GpuIdUtil::TfToCudaGpuId(tf_gpu_id);
+    const CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
     gpu_allocator =
         new GPUBFCAllocator(cuda_gpu_id, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
-- 
GitLab


From b4223d0ea4fa9b18ee08fc027e2cce7de1e4c6bb Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Fri, 9 Feb 2018 13:07:00 -0800
Subject: [PATCH 904/966] Fixes issue with tfe.make_template when function
 objects don't have names

PiperOrigin-RevId: 185181846
---
 tensorflow/python/eager/function.py             |  6 +++++-
 tensorflow/python/kernel_tests/template_test.py | 13 +++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 246df9afef..767d719ea6 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -756,7 +756,11 @@ def defun(func):
      or more Tensor objects).
   """
   # TODO(apassos): deal with captured global state. Deal with control flow.
-  return tf_decorator.make_decorator(func, named_defun(func, func.__name__))
+  try:
+    name = func.__name__
+  except AttributeError:
+    name = "function"
+  return tf_decorator.make_decorator(func, named_defun(func, name))
 
 
 def make_defun_op(func, *args, **kwds):
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index 8792ab41a0..a519b69b22 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -17,10 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import traceback
 
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -412,6 +414,17 @@ class TemplateTest(test.TestCase):
     self.assertEqual("s1_1/nested/dummy:0", v3[0].name)
     self.assertEqual("s1_1/nested_1/dummy:0", v3[1].name)
 
+  def test_graph_function_no_name(self):
+    with context.eager_mode():
+
+      def f(_, y):
+        return y + 1
+
+      partial = functools.partial(f, 1.0)
+      tmpl = template.make_template_internal(
+          "a", partial, create_graph_function_=True)
+      self.assertAllEqual(tmpl(ops.convert_to_tensor(1.0)), 2.0)
+
   @test_util.run_in_graph_and_eager_modes()
   def test_immediate_scope_creation(self):
     # Create templates in scope a then call in scope b. make_template should
-- 
GitLab


From 4edeeb55a2513fd798e146ba3d7ea6313437b7a3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 9 Feb 2018 13:30:40 -0800
Subject: [PATCH 905/966] Update llvm revision to r324720. This is needed
 because r324700 introduces an Orc API change.

PiperOrigin-RevId: 185185088
---
 tensorflow/workspace.bzl    | 8 ++++----
 third_party/llvm/llvm.BUILD | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 53831db693..d0f9a8925f 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -473,11 +473,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/35ca32d346a6124ec5ddf66bd113d2ffc2ae4b66.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/35ca32d346a6124ec5ddf66bd113d2ffc2ae4b66.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/562d4e516ab92302b34b7f4c8833455699bb48de.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/562d4e516ab92302b34b7f4c8833455699bb48de.tar.gz",
       ],
-      sha256 = "f7f991a25f3b4acfe39520d5a548f01b687d17f0b6ba152634084a48de14be77",
-      strip_prefix = "llvm-35ca32d346a6124ec5ddf66bd113d2ffc2ae4b66",
+      sha256 = "cd041cda90f2e29fd3053f3faca182ad7ed871045d789c339d0f7c7d25310ef2",
+      strip_prefix = "llvm-562d4e516ab92302b34b7f4c8833455699bb48de",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
   )
 
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index a9e1341a03..28293a3659 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -1024,6 +1024,7 @@ cc_library(
     deps = [
         ":arm_desc",
         ":arm_info",
+        ":arm_utils",
         ":config",
         ":mc_disassembler",
         ":support",
-- 
GitLab


From 49561fecc51564f11611ec5a20d10676760b85c0 Mon Sep 17 00:00:00 2001
From: Noah Eisen <ncteisen@google.com>
Date: Fri, 9 Feb 2018 14:07:40 -0800
Subject: [PATCH 906/966] Removes const qualifier from ListWorkers

PiperOrigin-RevId: 185190346
---
 tensorflow/core/distributed_runtime/rpc/grpc_channel.cc | 4 ++--
 tensorflow/core/distributed_runtime/rpc/grpc_channel.h  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index 7efc0ba6d8..613188244f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -157,7 +157,7 @@ class MultiGrpcChannelCache : public CachingGrpcChannelCache {
     }
   }
 
-  void ListWorkers(std::vector<string>* workers) const override {
+  void ListWorkers(std::vector<string>* workers) override {
     for (GrpcChannelCache* cache : caches_) {
       cache->ListWorkers(workers);
     }
@@ -216,7 +216,7 @@ class SparseGrpcChannelCache : public CachingGrpcChannelCache {
   }
   ~SparseGrpcChannelCache() override {}
 
-  void ListWorkers(std::vector<string>* workers) const override {
+  void ListWorkers(std::vector<string>* workers) override {
     workers->reserve(workers->size() + host_ports_.size());
     for (const auto& id_host_port : host_ports_) {
       workers->emplace_back(MakeAddress(job_id_, id_host_port.first));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
index de9840fca8..48b9d958aa 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
@@ -65,7 +65,7 @@ class GrpcChannelCache {
   // was created to handle.  Worker names are in the format
   //  /job:<job identifier>/task:<task id>
   // e.g. /job:mnist/task:2
-  virtual void ListWorkers(std::vector<string>* workers) const = 0;
+  virtual void ListWorkers(std::vector<string>* workers) = 0;
 
   // If found, returns a gRPC channel that is connected to the remote
   // worker named by 'target'. 'target' is of the following
-- 
GitLab


From 68a767e0aaa6fb3f01f5ce8f4fe533f36fcf7f82 Mon Sep 17 00:00:00 2001
From: Ian Langmore <langmore@google.com>
Date: Fri, 9 Feb 2018 14:17:47 -0800
Subject: [PATCH 907/966] TEST:  test of kernel_results added to hmc_test.py
 PiperOrigin-RevId: 185191871

---
 .../bayesflow/python/kernel_tests/hmc_test.py | 92 +++++++++++++++++++
 1 file changed, 92 insertions(+)

diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
index 51aed6438d..5bd834e562 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
@@ -258,6 +258,98 @@ class HMCTest(test.TestCase):
   def testHMCChainExpectations2(self):
     self._chain_gets_correct_expectations_wrapper(2)
 
+  def testKernelResultsUsingTruncatedDistribution(self):
+    def log_prob(x):
+      return array_ops.where(
+          x >= 0.,
+          -x - x**2,  # Non-constant gradient.
+          array_ops.fill(x.shape, math_ops.cast(-np.inf, x.dtype)))
+    # This log_prob has the property that it is likely to attract
+    # the HMC flow toward, and below, zero...but for x <=0,
+    # log_prob(x) = -inf, which should result in rejection, as well
+    # as a non-finite log_prob.  Thus, this distribution gives us an opportunity
+    # to test out the kernel results ability to correctly capture rejections due
+    # to finite AND non-finite reasons.
+    # Why use a non-constant gradient?  This ensures the leapfrog integrator
+    # will not be exact.
+
+    num_results = 1000
+    # Large step size, will give rejections due to integration error in addition
+    # to rejection due to going into a region of log_prob = -inf.
+    step_size = 0.1
+    num_leapfrog_steps = 5
+    num_chains = 2
+
+    with self.test_session(graph=ops.Graph()) as sess:
+
+      # Start multiple independent chains.
+      initial_state = ops.convert_to_tensor([0.1] * num_chains)
+
+      states, kernel_results = hmc.sample_chain(
+          num_results=num_results,
+          target_log_prob_fn=log_prob,
+          current_state=initial_state,
+          step_size=step_size,
+          num_leapfrog_steps=num_leapfrog_steps,
+          seed=42)
+
+      states_, kernel_results_ = sess.run([states, kernel_results])
+      pstates_ = kernel_results_.proposed_state
+
+      neg_inf_mask = np.isneginf(kernel_results_.proposed_target_log_prob)
+
+      # First:  Test that the mathematical properties of the above log prob
+      # function in conjunction with HMC show up as expected in kernel_results_.
+
+      # We better have log_prob = -inf some of the time.
+      self.assertLess(0, neg_inf_mask.sum())
+      # We better have some rejections due to something other than -inf.
+      self.assertLess(neg_inf_mask.sum(), (~kernel_results_.is_accepted).sum())
+      # We better have been accepted a decent amount, even near the end of the
+      # chain, or else this HMC run just got stuck at some point.
+      self.assertLess(
+          0.1, kernel_results_.is_accepted[int(0.9 * num_results):].mean())
+      # We better not have any NaNs in proposed state or log_prob.
+      # We may have some NaN in grads, which involve multiplication/addition due
+      # to gradient rules.  This is the known "NaN grad issue with tf.where."
+      self.assertAllEqual(np.zeros_like(states_),
+                          np.isnan(kernel_results_.proposed_target_log_prob))
+      self.assertAllEqual(np.zeros_like(states_),
+                          np.isnan(states_))
+      # We better not have any +inf in states, grads, or log_prob.
+      self.assertAllEqual(np.zeros_like(states_),
+                          np.isposinf(kernel_results_.proposed_target_log_prob))
+      self.assertAllEqual(
+          np.zeros_like(states_),
+          np.isposinf(kernel_results_.proposed_grads_target_log_prob[0]))
+      self.assertAllEqual(np.zeros_like(states_),
+                          np.isposinf(states_))
+
+      # Second:  Test that kernel_results is congruent with itself and
+      # acceptance/rejection of states.
+
+      # Proposed state is negative iff proposed target log prob is -inf.
+      np.testing.assert_array_less(pstates_[neg_inf_mask], 0.)
+      np.testing.assert_array_less(0., pstates_[~neg_inf_mask])
+
+      # Acceptance probs are zero whenever proposed state is negative.
+      self.assertAllEqual(
+          np.zeros_like(pstates_[neg_inf_mask]),
+          kernel_results_.acceptance_probs[neg_inf_mask])
+
+      # The move is accepted ==> state = proposed state.
+      self.assertAllEqual(
+          states_[kernel_results_.is_accepted],
+          pstates_[kernel_results_.is_accepted],
+      )
+      # The move was rejected <==> state[t] == state[t - 1].
+      for t in range(1, num_results):
+        for i in range(num_chains):
+          if kernel_results_.is_accepted[t, i]:
+            self.assertNotEqual(states_[t, i], states_[t - 1, i])
+          else:
+            self.assertEqual(states_[t, i], states_[t - 1, i])
+
   def _kernel_leaves_target_invariant(self, initial_draws,
                                       independent_chain_ndims,
                                       sess, feed_dict=None):
-- 
GitLab


From ed5f003cc2c542c3c545369f71d4b57429da33fc Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Fri, 9 Feb 2018 14:25:28 -0800
Subject: [PATCH 908/966] Make import_graph_def add default attr values with
 the C API enabled.

It turns out that the original Python code modifies the graph_def argument
to add default attr values. I'm not sure if the behavior is covered by
our API guarantees since it's not documented, but let's keep the behavior
consistent for now.

PiperOrigin-RevId: 185193037
---
 tensorflow/python/framework/importer.py       | 42 ++++++++++++++-----
 .../python/saved_model/saved_model_test.py    |  2 -
 2 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index c26644362c..cc8f2392ba 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -150,7 +150,7 @@ def _MaybeDevice(device):
     yield
 
 
-def _ProcessGraphDefParam(graph_def):
+def _ProcessGraphDefParam(graph_def, op_dict):
   """Type-checks and possibly canonicalizes `graph_def`."""
   if not isinstance(graph_def, graph_pb2.GraphDef):
     # `graph_def` could be a dynamically-created message, so try a duck-typed
@@ -161,6 +161,22 @@ def _ProcessGraphDefParam(graph_def):
       graph_def.MergeFrom(old_graph_def)
     except TypeError:
       raise TypeError('graph_def must be a GraphDef proto.')
+  else:
+    # If we're using the graph_def provided by the caller, modify graph_def
+    # in-place to add attr defaults to the NodeDefs (this is visible to the
+    # caller).
+    # NOTE(skyewm): this is undocumented behavior that at least meta_graph.py
+    # depends on. It might make sense to move this to meta_graph.py and have
+    # import_graph_def not modify the graph_def argument (we'd have to make sure
+    # this doesn't break anything else.)
+    for node in graph_def.node:
+      if node.op not in op_dict:
+        # Assume unrecognized ops are functions for now. TF_ImportGraphDef will
+        # report an error if the op is actually missing.
+        continue
+      op_def = op_dict[node.op]
+      _SetDefaultAttrValues(node, op_def)
+
   return graph_def
 
 
@@ -369,6 +385,17 @@ def _GatherReturnElements(requested_return_elements, graph, results):
   return combined_return_elements
 
 
+def _SetDefaultAttrValues(node_def, op_def):
+  """Set any default attr values in `node_def` that aren't present."""
+  assert node_def.op == op_def.name
+  for attr_def in op_def.attr:
+    key = attr_def.name
+    if attr_def.HasField('default_value'):
+      value = node_def.attr[key]
+      if value is None or value.WhichOneof('value') is None:
+        node_def.attr[key].CopyFrom(attr_def.default_value)
+
+
 @tf_export('import_graph_def')
 @deprecated_args(None, 'Please file an issue at '
                  'https://github.com/tensorflow/tensorflow/issues if you depend'
@@ -420,12 +447,12 @@ def import_graph_def(graph_def,
       do not appear in `graph_def`, or `graph_def` is not well-formed (e.g.
       it refers to an unknown tensor).
   """
-  graph_def = _ProcessGraphDefParam(graph_def)
+  op_dict = op_def_registry.get_registered_ops()
+
+  graph_def = _ProcessGraphDefParam(graph_def, op_dict)
   input_map = _ProcessInputMapParam(input_map)
   return_elements = _ProcessReturnElementsParam(return_elements)
 
-  op_dict = op_def_registry.get_registered_ops()
-
   if producer_op_list is not None:
     # TODO(skyewm): make a copy of graph_def so we're not mutating the argument?
     _RemoveDefaultAttrs(op_dict, producer_op_list, graph_def)
@@ -535,16 +562,9 @@ def import_graph_def(graph_def,
         # Check to see if this op's name matches a previously seen op
         if node.name in name_to_op:
           raise ValueError('Duplicate name \'%s\' in GraphDef.' % node.name)
-        # Set any default attr values that aren't present.
         if node.op not in op_dict:
           raise ValueError('No op named %s in defined operations.' % node.op)
         op_def = op_dict[node.op]
-        for attr_def in op_def.attr:
-          key = attr_def.name
-          if attr_def.HasField('default_value'):
-            value = node.attr[key]
-            if value is None or value.WhichOneof('value') is None:
-              node.attr[key].CopyFrom(attr_def.default_value)
 
         output_types = _OutputTypes(node, op_dict)
         name_to_op[node.name] = g.create_op(
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index d1f6bc27ef..d9d3168825 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -873,8 +873,6 @@ class SavedModelTest(test.TestCase):
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
   def testStripDefaultAttrs(self):
-    if ops._USE_C_API: return  # TODO(skyewm): get this working
-
     export_dir = self._get_export_dir("test_strip_default_attrs")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-- 
GitLab


From 3590c452ea8485d063874138eec92411297a9abb Mon Sep 17 00:00:00 2001
From: Mingsheng Hong <hongm@google.com>
Date: Fri, 9 Feb 2018 14:27:03 -0800
Subject: [PATCH 909/966] Enabled XLA for TF C API.

Summary of changes:

1. Set MarkForCompilationPassFlags::tf_xla_cpu_global_jit default to true in
C_API unit test env when XLA-execute is intended. Together with setting session
config config.graph_options.optimizer_options.global_jit_level to > 0, this
turns on XLA for the entire graph (eligible nodes only, with _Arg and _RetVal
nodes excluded).

We decided against defaulting MarkForCompilationPassFlags::tf_xla_cpu_global_jit
to true, due to performance concerns with the single-threaded nature of the XLA
CPU backend (see
https://www.tensorflow.org/performance/xla/jit#turning_on_jit_compilation).

2. In FindCompilationCandidates() during MarkForCompilationPass, skip compiling
any '_Arg'-typed nodes. This is necessary to avoid hitting a "Invalid argument
number" error during MarkForCompilationPass.

3. Extended C API based build rules to link in XLA libraries, and added unit
test "CAPI.Session_Min_XLA_CPU".

Also added some misc improvements and debugging aids.

PiperOrigin-RevId: 185193314
---
 tensorflow/c/BUILD                            |  8 ++
 tensorflow/c/c_api_test.cc                    | 80 +++++++++++++++++++
 tensorflow/c/c_test_util.cc                   | 17 +++-
 tensorflow/c/c_test_util.h                    |  2 +-
 .../compiler/jit/mark_for_compilation_pass.cc | 11 +++
 tensorflow/compiler/tf2xla/const_analysis.cc  |  2 +-
 tensorflow/core/graph/graph.cc                |  6 ++
 tensorflow/core/graph/graph.h                 |  7 ++
 .../core/graph/graph_constructor_test.cc      |  4 +-
 tensorflow/tools/lib_package/BUILD            |  1 +
 10 files changed, 132 insertions(+), 6 deletions(-)

diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 314cbc657c..25a994be3e 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -91,6 +91,12 @@ tf_cuda_library(
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
         ],
+    }) + select({
+        "//tensorflow:with_xla_support": [
+            "//tensorflow/compiler/tf2xla:xla_compiler",
+            "//tensorflow/compiler/jit",
+        ],
+        "//conditions:default": [],
     }),
 )
 
@@ -141,8 +147,10 @@ tf_cuda_library(
     ],
     deps = [
         ":c_api",
+        "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
         "//tensorflow/core:test",
     ],
 )
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 0f71bd8f32..66d1ea8cad 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -923,6 +923,86 @@ TEST(CAPI, Session) {
   TF_DeleteStatus(s);
 }
 
+TEST(CAPI, Session_Min_CPU) {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  // Make a placeholder operation.
+  TF_Operation* feed = Placeholder(graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Make a constant operation with the scalar "0", for axis.
+  TF_Operation* one = ScalarConst(0, graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Add operation.
+  TF_Operation* min = Min(feed, one, graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Create a session for this graph.
+  CSession csession(graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Run the graph.
+  csession.SetInputs({{feed, Int32Tensor({3, 2, 5})}});
+  csession.SetOutputs({min});
+  csession.Run(s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Tensor* out = csession.output_tensor(0);
+  ASSERT_TRUE(out != nullptr);
+  EXPECT_EQ(TF_INT32, TF_TensorType(out));
+  EXPECT_EQ(0, TF_NumDims(out));  // scalar
+  ASSERT_EQ(sizeof(int32), TF_TensorByteSize(out));
+  int32* output_contents = static_cast<int32*>(TF_TensorData(out));
+  EXPECT_EQ(2, *output_contents);
+
+  // Clean up
+  csession.CloseAndDelete(s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
+TEST(CAPI, Session_Min_XLA_CPU) {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  // Make a placeholder operation.
+  TF_Operation* feed = Placeholder(graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Make a constant operation with the scalar "0", for axis.
+  TF_Operation* one = ScalarConst(0, graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Add operation.
+  TF_Operation* min = Min(feed, one, graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Create a session for this graph.
+  CSession csession(graph, s, /*use_XLA=*/true);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Run the graph.
+  csession.SetInputs({{feed, Int32Tensor({3, 2, 5})}});
+  csession.SetOutputs({min});
+  csession.Run(s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Tensor* out = csession.output_tensor(0);
+  ASSERT_TRUE(out != nullptr);
+  EXPECT_EQ(TF_INT32, TF_TensorType(out));
+  EXPECT_EQ(0, TF_NumDims(out));  // scalar
+  ASSERT_EQ(sizeof(int32), TF_TensorByteSize(out));
+  int32* output_contents = static_cast<int32*>(TF_TensorData(out));
+  EXPECT_EQ(2, *output_contents);
+
+  // Clean up
+  csession.CloseAndDelete(s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
 TEST(CAPI, SessionPRun) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc
index 3c1d5b5bf8..2c5f08d672 100644
--- a/tensorflow/c/c_test_util.cc
+++ b/tensorflow/c/c_test_util.cc
@@ -15,11 +15,13 @@ limitations under the License.
 
 #include "tensorflow/c/c_test_util.h"
 
+#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/session_options.h"
 
 using tensorflow::GraphDef;
 using tensorflow::NodeDef;
@@ -390,8 +392,21 @@ std::vector<string> GetFuncNames(const tensorflow::GraphDef& graph_def) {
   return names;
 }
 
-CSession::CSession(TF_Graph* graph, TF_Status* s) {
+CSession::CSession(TF_Graph* graph, TF_Status* s, bool use_XLA) {
   TF_SessionOptions* opts = TF_NewSessionOptions();
+  tensorflow::legacy_flags::MarkForCompilationPassFlags* flags =
+      tensorflow::legacy_flags::GetMarkForCompilationPassFlags();
+  flags->tf_xla_cpu_global_jit = use_XLA;
+  if (use_XLA) {
+    tensorflow::ConfigProto config;
+    config.mutable_graph_options()
+        ->mutable_optimizer_options()
+        ->set_global_jit_level(tensorflow::OptimizerOptions::ON_1);
+    std::string contents;
+    contents.resize(config.ByteSizeLong());
+    config.SerializeToArray(&contents[0], contents.size());
+    TF_SetConfig(opts, contents.data(), contents.size(), s);
+  }
   session_ = TF_NewSession(graph, opts, s);
   TF_DeleteSessionOptions(opts);
 }
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index 77520be010..805fafae05 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -111,7 +111,7 @@ std::vector<string> GetFuncNames(const tensorflow::GraphDef& graph_def);
 
 class CSession {
  public:
-  CSession(TF_Graph* graph, TF_Status* s);
+  CSession(TF_Graph* graph, TF_Status* s, bool use_XLA = false);
   explicit CSession(TF_Session* session);
 
   ~CSession();
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 79b02baba8..a0211acbbe 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -190,6 +190,9 @@ Status FindCompilationCandidates(
       pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
 
   for (Node* node : graph.op_nodes()) {
+    VLOG(2) << "FindCompilationCandidates(): Processing "
+            << node->DebugString();
+
     DeviceType device_type("");
     TF_RETURN_IF_ERROR(
         DeviceTypeOfDevice(node->assigned_device_name(), &device_type));
@@ -216,6 +219,13 @@ Status FindCompilationCandidates(
         !IsCompilableWhile(*node, jit_device_type, 0, lib_runtime)) {
       continue;
     }
+    // _Arg nodes in a top-level function represent feeds.
+    // Do not compile them.
+    if (node->type_string() == "_Arg") {
+      VLOG(2) << "Skipping jit compilation for '_Arg'-typed node "
+              << node->DebugString();
+      continue;
+    }
     // _Retval nodes in a top-level function represent fetches.
     // Do not compile them.
     if (node->type_string() == "_Retval") {
@@ -304,6 +314,7 @@ Status MarkForCompilationPass::Run(
         static_cast<OptimizerOptions::GlobalJitLevel>(flags->tf_xla_auto_jit);
   }
   bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
+  VLOG(1) << "flags->tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit;
   const FunctionLibraryDefinition* fld = options.flib_def;
 
   auto is_compilable = [global_jit_level, cpu_global_jit, fld](
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index 0249500910..82923722c5 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -64,7 +64,7 @@ Status BackwardsConstAnalysis(const Graph& g,
     // Mark any compile-time constant operator arguments as const.
     const std::unordered_set<string>* const_inputs =
         XlaOpRegistry::CompileTimeConstantInputs(node->type_string());
-    if (!const_inputs) return;
+    if (!const_inputs || const_inputs->empty()) return;
 
     NameRangeMap input_name_ranges;
     status =
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index fd1b5d33b9..9b56216f1f 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -522,6 +522,12 @@ void Graph::ToGraphDef(GraphDef* graph_def) const {
   ToGraphDefSubRange(graph_def, 0);
 }
 
+GraphDef Graph::ToGraphDefDebug() const {
+  GraphDef ret;
+  ToGraphDef(&ret);
+  return ret;
+}
+
 void Graph::ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const {
   graph_def->Clear();
   *graph_def->mutable_versions() = versions();
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 93d8dd6f11..9d96cd4654 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -494,6 +494,13 @@ class Graph {
   // Serialize to a GraphDef.
   void ToGraphDef(GraphDef* graph_def) const;
 
+  // This version can be called from debugger to inspect the graph content.
+  // Use the previous version outside debug context for efficiency reasons.
+  //
+  // Note: We do not expose a DebugString() API, since GraphDef.DebugString() is
+  // not defined in some TensorFlow builds.
+  GraphDef ToGraphDefDebug() const;
+
   // Generate new node name with the specified prefix that is unique
   // across this graph.
   string NewName(StringPiece prefix);
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 01bb1ac748..c59e478f15 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -160,9 +160,7 @@ class GraphConstructorTest : public ::testing::Test {
   }
 
   string GraphDebugString() const {
-    GraphDef def;
-    graph_.ToGraphDef(&def);
-    return def.DebugString();
+    return graph_.ToGraphDefDebug().DebugString();
   }
 
   Graph graph_;
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 35e55c0d31..614457e899 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -151,6 +151,7 @@ genrule(
         "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
         "@libxsmm_archive//:LICENSE",
+        "@llvm//:LICENSE.TXT",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@nasm//:LICENSE",
-- 
GitLab


From dc1e7b8a635426db809a094b313cc6be40ab9276 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Fri, 9 Feb 2018 14:36:17 -0800
Subject: [PATCH 910/966] Fix read_variable_op GPU test

PiperOrigin-RevId: 185194768
---
 tensorflow/python/eager/benchmarks_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 9aceaba4d5..b56cbe80a7 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -425,7 +425,7 @@ class MicroBenchmarks(test.Benchmark):
     if not context.num_gpus():
       return
     with context.device(GPU):
-      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2.gpu())
       self._benchmark_read_variable(m, num_iters=self._num_iters_2_by_2)
 
   def benchmark_read_variable_op_with_tape_2_by_2_CPU(self):
@@ -438,7 +438,7 @@ class MicroBenchmarks(test.Benchmark):
     if not context.num_gpus():
       return
     with context.device(GPU):
-      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2.gpu())
       self._benchmark_read_variable_with_tape(
           m, num_iters=self._num_iters_2_by_2)
 
-- 
GitLab


From 7e23850c2145ed565c668d6ba327dbcf064d4ed8 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Fri, 9 Feb 2018 14:37:24 -0800
Subject: [PATCH 911/966] Remove header dependence on cuda_config.h to fix
 opensource custom op support. Fixes #14454 Fixes #12860

PiperOrigin-RevId: 185194924
---
 tensorflow/core/common_runtime/gpu/gpu_device.cc | 4 ++++
 tensorflow/stream_executor/dso_loader.cc         | 4 ++++
 tensorflow/stream_executor/dso_loader.h          | 4 ----
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 0fb908b080..15ff15fd5a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -66,6 +66,10 @@ limitations under the License.
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
+#if !defined(PLATFORM_GOOGLE)
+#include "cuda/cuda_config.h"
+#endif
+
 namespace tensorflow {
 
 // Eigen Ops directly allocate memory only for temporary buffers used
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
index 0c642912b1..9516883627 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -33,6 +33,10 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
+#if !defined(PLATFORM_GOOGLE)
+#include "cuda/cuda_config.h"
+#endif
+
 namespace perftools {
 namespace gputools {
 namespace internal {
diff --git a/tensorflow/stream_executor/dso_loader.h b/tensorflow/stream_executor/dso_loader.h
index 9495f7253a..354c7b50b8 100644
--- a/tensorflow/stream_executor/dso_loader.h
+++ b/tensorflow/stream_executor/dso_loader.h
@@ -28,10 +28,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 
-#if !defined(PLATFORM_GOOGLE)
-#include "cuda/cuda_config.h"
-#endif
-
 namespace perftools {
 namespace gputools {
 namespace internal {
-- 
GitLab


From 894a00ad03f1304044377e2846199e5bc2580fc5 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Fri, 9 Feb 2018 14:52:15 -0800
Subject: [PATCH 912/966] Use TFLite CocoaPod in iOS Simple Demo app

PiperOrigin-RevId: 185196984
---
 .../contrib/lite/examples/ios/simple/Podfile  |   4 +-
 .../examples/ios/simple/RunModel-Info.plist   |   2 +-
 .../simple/simple.xcodeproj/project.pbxproj   | 104 ++++++++++++++----
 3 files changed, 83 insertions(+), 27 deletions(-)

diff --git a/tensorflow/contrib/lite/examples/ios/simple/Podfile b/tensorflow/contrib/lite/examples/ios/simple/Podfile
index 1740ad6457..e4aca2be82 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/Podfile
+++ b/tensorflow/contrib/lite/examples/ios/simple/Podfile
@@ -1,5 +1,5 @@
 platform :ios, '8.0'
 inhibit_all_warnings!
 
-target 'tf_simple_example'
-       pod 'TensorFlow-experimental'
+target 'tflite_simple_example'
+       pod 'TensorFlowLite'
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist b/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist
index 1a3eaa8a2c..a19a43a754 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist
@@ -7,7 +7,7 @@
 	<key>CFBundleDisplayName</key>
 	<string>tflite-simple-example</string>
 	<key>CFBundleExecutable</key>
-	<string>tf_simple_example</string>
+	<string>tflite_simple_example</string>
 	<key>CFBundleIdentifier</key>
 	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
 	<key>CFBundleInfoDictionaryVersion</key>
diff --git a/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj
index 9277c230b8..f5b8382d5a 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj
+++ b/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj
@@ -9,7 +9,7 @@
 /* Begin PBXBuildFile section */
 		1C0D734B1ECCC460008C1DAB /* CoreGraphics.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */; };
 		1CA45FFF1ECCC356002FA6A4 /* UIKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */; };
-		594C14AE1FB8F9B500EE8BFE /* libtensorflow-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 594C14AD1FB8F9B500EE8BFE /* libtensorflow-lite.a */; };
+		1E6F42DBB39A4A3871D4F848 /* libPods-tflite_simple_example.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 73DBC33C5DD9A526EE6D1EF2 /* libPods-tflite_simple_example.a */; };
 		594C14B11FB9037100EE8BFE /* labels.txt in Resources */ = {isa = PBXBuildFile; fileRef = 594C14AF1FB9037100EE8BFE /* labels.txt */; };
 		594C14B21FB9037100EE8BFE /* mobilenet_v1_1.0_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = 594C14B01FB9037100EE8BFE /* mobilenet_v1_1.0_224.tflite */; };
 		59A3D0011CF4E68100C4259F /* AppDelegate.mm in Sources */ = {isa = PBXBuildFile; fileRef = 59A3CFF21CF4E68100C4259F /* AppDelegate.mm */; };
@@ -24,8 +24,7 @@
 		1C0D73481ECCC41B008C1DAB /* CoreImage.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreImage.framework; path = System/Library/Frameworks/CoreImage.framework; sourceTree = SDKROOT; };
 		1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreGraphics.framework; path = System/Library/Frameworks/CoreGraphics.framework; sourceTree = SDKROOT; };
 		1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = System/Library/Frameworks/UIKit.framework; sourceTree = SDKROOT; };
-		5911579B1CF4011C00C31E3A /* tf_simple_example.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = tf_simple_example.app; sourceTree = BUILT_PRODUCTS_DIR; };
-		594C14AD1FB8F9B500EE8BFE /* libtensorflow-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libtensorflow-lite.a"; path = "../../../gen/lib/libtensorflow-lite.a"; sourceTree = "<group>"; };
+		5911579B1CF4011C00C31E3A /* tflite_simple_example.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = tflite_simple_example.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		594C14AF1FB9037100EE8BFE /* labels.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = labels.txt; sourceTree = "<group>"; };
 		594C14B01FB9037100EE8BFE /* mobilenet_v1_1.0_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_v1_1.0_224.tflite; sourceTree = "<group>"; };
 		59A3CFF11CF4E68100C4259F /* AppDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
@@ -38,7 +37,9 @@
 		59A3CFFE1CF4E68100C4259F /* RunModelViewController.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RunModelViewController.h; sourceTree = "<group>"; };
 		59A3CFFF1CF4E68100C4259F /* RunModelViewController.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RunModelViewController.mm; sourceTree = "<group>"; };
 		59A3D0001CF4E68100C4259F /* RunModelViewController.xib */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = file.xib; path = RunModelViewController.xib; sourceTree = "<group>"; };
-		73DBC33C5DD9A526EE6D1EF2 /* libPods-tf_simple_example.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-tf_simple_example.a"; sourceTree = BUILT_PRODUCTS_DIR; };
+		5D6203B9FAEEB9824194DBE8 /* Pods-tflite_simple_example.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_simple_example.release.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_simple_example/Pods-tflite_simple_example.release.xcconfig"; sourceTree = "<group>"; };
+		73DBC33C5DD9A526EE6D1EF2 /* libPods-tflite_simple_example.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-tflite_simple_example.a"; sourceTree = BUILT_PRODUCTS_DIR; };
+		987DD5BCAB2DD8B682674E20 /* Pods-tflite_simple_example.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_simple_example.debug.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_simple_example/Pods-tflite_simple_example.debug.xcconfig"; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
 /* Begin PBXFrameworksBuildPhase section */
@@ -46,9 +47,9 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				594C14AE1FB8F9B500EE8BFE /* libtensorflow-lite.a in Frameworks */,
 				1C0D734B1ECCC460008C1DAB /* CoreGraphics.framework in Frameworks */,
 				1CA45FFF1ECCC356002FA6A4 /* UIKit.framework in Frameworks */,
+				1E6F42DBB39A4A3871D4F848 /* libPods-tflite_simple_example.a in Frameworks */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -58,11 +59,10 @@
 		24D7686C331131624F4454A0 /* Frameworks */ = {
 			isa = PBXGroup;
 			children = (
-				594C14AD1FB8F9B500EE8BFE /* libtensorflow-lite.a */,
 				1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */,
 				1C0D73481ECCC41B008C1DAB /* CoreImage.framework */,
 				1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */,
-				73DBC33C5DD9A526EE6D1EF2 /* libPods-tf_simple_example.a */,
+				73DBC33C5DD9A526EE6D1EF2 /* libPods-tflite_simple_example.a */,
 			);
 			name = Frameworks;
 			sourceTree = "<group>";
@@ -82,13 +82,14 @@
 				59A3D0001CF4E68100C4259F /* RunModelViewController.xib */,
 				5911579C1CF4011C00C31E3A /* Products */,
 				24D7686C331131624F4454A0 /* Frameworks */,
+				5CE7E4179B26BF77944D8637 /* Pods */,
 			);
 			sourceTree = "<group>";
 		};
 		5911579C1CF4011C00C31E3A /* Products */ = {
 			isa = PBXGroup;
 			children = (
-				5911579B1CF4011C00C31E3A /* tf_simple_example.app */,
+				5911579B1CF4011C00C31E3A /* tflite_simple_example.app */,
 			);
 			name = Products;
 			sourceTree = "<group>";
@@ -103,24 +104,36 @@
 			path = data;
 			sourceTree = "<group>";
 		};
+		5CE7E4179B26BF77944D8637 /* Pods */ = {
+			isa = PBXGroup;
+			children = (
+				987DD5BCAB2DD8B682674E20 /* Pods-tflite_simple_example.debug.xcconfig */,
+				5D6203B9FAEEB9824194DBE8 /* Pods-tflite_simple_example.release.xcconfig */,
+			);
+			name = Pods;
+			sourceTree = "<group>";
+		};
 /* End PBXGroup section */
 
 /* Begin PBXNativeTarget section */
-		5911579A1CF4011C00C31E3A /* tf_simple_example */ = {
+		5911579A1CF4011C00C31E3A /* tflite_simple_example */ = {
 			isa = PBXNativeTarget;
-			buildConfigurationList = 591157B21CF4011D00C31E3A /* Build configuration list for PBXNativeTarget "tf_simple_example" */;
+			buildConfigurationList = 591157B21CF4011D00C31E3A /* Build configuration list for PBXNativeTarget "tflite_simple_example" */;
 			buildPhases = (
+				A507411BCC70190B9ABD2721 /* [CP] Check Pods Manifest.lock */,
 				591157971CF4011C00C31E3A /* Sources */,
 				591157981CF4011C00C31E3A /* Frameworks */,
 				591157991CF4011C00C31E3A /* Resources */,
+				25E1671BDC7334C678FB5DFB /* [CP] Embed Pods Frameworks */,
+				10976C49D86B7F8A59157601 /* [CP] Copy Pods Resources */,
 			);
 			buildRules = (
 			);
 			dependencies = (
 			);
-			name = tf_simple_example;
+			name = tflite_simple_example;
 			productName = tf_ios_makefile_example;
-			productReference = 5911579B1CF4011C00C31E3A /* tf_simple_example.app */;
+			productReference = 5911579B1CF4011C00C31E3A /* tflite_simple_example.app */;
 			productType = "com.apple.product-type.application";
 		};
 /* End PBXNativeTarget section */
@@ -152,7 +165,7 @@
 			projectDirPath = "";
 			projectRoot = "";
 			targets = (
-				5911579A1CF4011C00C31E3A /* tf_simple_example */,
+				5911579A1CF4011C00C31E3A /* tflite_simple_example */,
 			);
 		};
 /* End PBXProject section */
@@ -171,6 +184,57 @@
 		};
 /* End PBXResourcesBuildPhase section */
 
+/* Begin PBXShellScriptBuildPhase section */
+		10976C49D86B7F8A59157601 /* [CP] Copy Pods Resources */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputPaths = (
+			);
+			name = "[CP] Copy Pods Resources";
+			outputPaths = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_simple_example/Pods-tflite_simple_example-resources.sh\"\n";
+			showEnvVarsInLog = 0;
+		};
+		25E1671BDC7334C678FB5DFB /* [CP] Embed Pods Frameworks */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputPaths = (
+			);
+			name = "[CP] Embed Pods Frameworks";
+			outputPaths = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_simple_example/Pods-tflite_simple_example-frameworks.sh\"\n";
+			showEnvVarsInLog = 0;
+		};
+		A507411BCC70190B9ABD2721 /* [CP] Check Pods Manifest.lock */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputPaths = (
+				"${PODS_PODFILE_DIR_PATH}/Podfile.lock",
+				"${PODS_ROOT}/Manifest.lock",
+			);
+			name = "[CP] Check Pods Manifest.lock";
+			outputPaths = (
+				"$(DERIVED_FILE_DIR)/Pods-tflite_simple_example-checkManifestLockResult.txt",
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n    # print error to STDERR\n    echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n    exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n";
+			showEnvVarsInLog = 0;
+		};
+/* End PBXShellScriptBuildPhase section */
+
 /* Begin PBXSourcesBuildPhase section */
 		591157971CF4011C00C31E3A /* Sources */ = {
 			isa = PBXSourcesBuildPhase;
@@ -274,6 +338,7 @@
 		};
 		591157B31CF4011D00C31E3A /* Debug */ = {
 			isa = XCBuildConfiguration;
+			baseConfigurationReference = 987DD5BCAB2DD8B682674E20 /* Pods-tflite_simple_example.debug.xcconfig */;
 			buildSettings = {
 				CLANG_DEBUG_INFORMATION_LEVEL = default;
 				CODE_SIGN_IDENTITY = "iPhone Developer";
@@ -283,15 +348,10 @@
 				GCC_ENABLE_CPP_RTTI = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				INFOPLIST_FILE = "$(SRCROOT)/RunModel-Info.plist";
 				IPHONEOS_DEPLOYMENT_TARGET = 9.2;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
-				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
 				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
 				OTHER_LDFLAGS = "$(inherited)";
 				PRODUCT_BUNDLE_IDENTIFIER = "com.google.tflite-simple-example";
@@ -304,6 +364,7 @@
 		};
 		591157B41CF4011D00C31E3A /* Release */ = {
 			isa = XCBuildConfiguration;
+			baseConfigurationReference = 5D6203B9FAEEB9824194DBE8 /* Pods-tflite_simple_example.release.xcconfig */;
 			buildSettings = {
 				CLANG_DEBUG_INFORMATION_LEVEL = default;
 				CODE_SIGN_IDENTITY = "iPhone Developer";
@@ -313,15 +374,10 @@
 				GCC_ENABLE_CPP_RTTI = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				INFOPLIST_FILE = "$(SRCROOT)/RunModel-Info.plist";
 				IPHONEOS_DEPLOYMENT_TARGET = 9.2;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
-				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
 				ONLY_ACTIVE_ARCH = YES;
 				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
 				OTHER_LDFLAGS = "$(inherited)";
@@ -344,7 +400,7 @@
 			defaultConfigurationIsVisible = 0;
 			defaultConfigurationName = Release;
 		};
-		591157B21CF4011D00C31E3A /* Build configuration list for PBXNativeTarget "tf_simple_example" */ = {
+		591157B21CF4011D00C31E3A /* Build configuration list for PBXNativeTarget "tflite_simple_example" */ = {
 			isa = XCConfigurationList;
 			buildConfigurations = (
 				591157B31CF4011D00C31E3A /* Debug */,
-- 
GitLab


From 5fe3a9603a81f50d0b374bc26ad34bbd03f3b4c4 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 9 Feb 2018 14:52:24 -0800
Subject: [PATCH 913/966] [tf.data] Remove deprecated `tf.contrib.data.Dataset`
 class.

This change removes the following class:
* `tf.contrib.data.Dataset`.

IF THIS BREAKS YOU: Replace `tf.contrib.data.Dataset` with `tf.data.Dataset`
when constructing a dataset. Note that you may have to modify downstream
transformations to use the core API. See "tensorflow/contrib/data/README.md" for
details of how to update your code to use the core API.

PiperOrigin-RevId: 185197005
---
 tensorflow/contrib/data/__init__.py           |   3 +-
 .../contrib/data/python/kernel_tests/BUILD    |  45 --
 .../kernel_tests/batch_dataset_op_test.py     | 261 +------
 .../python/kernel_tests/bucketing_test.py     |   2 +-
 .../kernel_tests/cache_dataset_op_test.py     | 300 --------
 .../concatenate_dataset_op_test.py            | 109 +--
 .../dataset_constructor_op_test.py            | 695 +-----------------
 .../kernel_tests/filter_dataset_op_test.py    | 134 +---
 .../kernel_tests/flat_map_dataset_op_test.py  | 125 +---
 .../kernel_tests/get_single_element_test.py   |   5 +-
 .../interleave_dataset_op_test.py             | 179 +----
 .../list_files_dataset_op_test.py             | 159 ----
 .../kernel_tests/map_dataset_op_test.py       | 614 +---------------
 .../kernel_tests/range_dataset_op_test.py     | 128 +---
 .../data/python/kernel_tests/resample_test.py |   2 +-
 .../kernel_tests/sequence_dataset_op_test.py  | 186 +----
 .../kernel_tests/shard_dataset_op_test.py     | 111 ---
 .../kernel_tests/shuffle_dataset_op_test.py   | 128 ----
 .../kernel_tests/unique_dataset_op_test.py    |   2 +-
 .../kernel_tests/zip_dataset_op_test.py       |  89 +--
 tensorflow/contrib/data/python/ops/BUILD      |   2 +-
 .../contrib/data/python/ops/dataset_ops.py    | 691 -----------------
 .../data/python/ops/get_single_element.py     |  67 ++
 .../python/estimator/extenders_test.py        |   2 +-
 .../contrib/kfac/python/ops/op_queue.py       |   2 +-
 25 files changed, 107 insertions(+), 3934 deletions(-)
 delete mode 100644 tensorflow/contrib/data/python/kernel_tests/cache_dataset_op_test.py
 delete mode 100644 tensorflow/contrib/data/python/kernel_tests/list_files_dataset_op_test.py
 delete mode 100644 tensorflow/contrib/data/python/kernel_tests/shard_dataset_op_test.py
 delete mode 100644 tensorflow/contrib/data/python/ops/dataset_ops.py
 create mode 100644 tensorflow/contrib/data/python/ops/get_single_element.py

diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 21db1044b0..092c50c370 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -54,10 +54,9 @@ from tensorflow.contrib.data.python.ops.batching import map_and_batch
 from tensorflow.contrib.data.python.ops.batching import padded_batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import unbatch
 from tensorflow.contrib.data.python.ops.counter import Counter
-from tensorflow.contrib.data.python.ops.dataset_ops import Dataset
-from tensorflow.contrib.data.python.ops.dataset_ops import get_single_element
 from tensorflow.contrib.data.python.ops.enumerate_ops import enumerate_dataset
 from tensorflow.contrib.data.python.ops.error_ops import ignore_errors
+from tensorflow.contrib.data.python.ops.get_single_element import get_single_element
 from tensorflow.contrib.data.python.ops.grouping import group_by_window
 from tensorflow.contrib.data.python.ops.interleave_ops import parallel_interleave
 from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 7ca4a28dae..e51d57cc89 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -52,24 +52,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "cache_dataset_op_test",
-    size = "small",
-    srcs = ["cache_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
 py_test(
     name = "concatenate_dataset_op_test",
     size = "small",
@@ -223,21 +205,6 @@ tf_py_test(
     ],
 )
 
-py_test(
-    name = "list_files_dataset_op_test",
-    size = "small",
-    srcs = ["list_files_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:util",
-    ],
-)
-
 py_test(
     name = "map_dataset_op_test",
     size = "medium",
@@ -400,18 +367,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "shard_dataset_op_test",
-    size = "small",
-    srcs = ["shard_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-    ],
-)
-
 py_test(
     name = "shuffle_dataset_op_test",
     size = "medium",
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 0c2827b1e4..71dc1c1172 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -23,283 +23,24 @@ import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
-from tensorflow.python.util import compat
 
 
 class BatchDatasetTest(test.TestCase):
 
-  def testBatchDataset(self):
-    """Test an dataset that maps a TF function across its input elements."""
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(count) -> BatchDataset(batch_size).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(count).batch(batch_size).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
-                     [t.shape.as_list() for t in get_next])
-
-    with self.test_session() as sess:
-      # Batch of a finite input, where the batch_size divides the
-      # total number of elements.
-      sess.run(init_op, feed_dict={count: 28, batch_size: 14})
-      num_batches = (28 * 7) // 14
-      for i in range(num_batches):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(14):
-            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
-                                result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Batch of a finite input, where the batch_size does not
-      # divide the total number of elements.
-      sess.run(init_op, feed_dict={count: 14, batch_size: 8})
-
-      # We expect (num_batches - 1) full-sized batches.
-      num_batches = int(math.ceil((14 * 7) / 8))
-      for i in range(num_batches - 1):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(8):
-            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
-                                result_component[j])
-      result = sess.run(get_next)
-      for component, result_component in zip(components, result):
-        for j in range((14 * 7) % 8):
-          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
-                              result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Batch of an empty input should fail straight away.
-      sess.run(init_op, feed_dict={count: 0, batch_size: 8})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Empty batch should be an initialization time error.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={count: 14, batch_size: 0})
-
   def assertSparseValuesEqual(self, a, b):
     self.assertAllEqual(a.indices, b.indices)
     self.assertAllEqual(a.values, b.values)
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
-  def testBatchSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
-        5).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(2):
-        actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedBatchSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(5).batch(
-        2).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      actual = sess.run(get_next)
-      expected = sparse_tensor.SparseTensorValue(
-          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0],
-                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0]],
-          values=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
-          dense_shape=[2, 5, 1])
-      self.assertTrue(sparse_tensor.is_sparse(actual))
-      self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPaddedBatchDataset(self):
-    seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
-    padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(seq_lens)
-        .map(lambda x: array_ops.fill([x], x)).padded_batch(
-            4, padded_shapes=padded_shape).make_initializable_iterator())
-
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Test with random sequence lengths, and max padding.
-      random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(
-          init_op, feed_dict={
-              padded_shape: [-1],
-              seq_lens: random_seq_lens
-          })
-      for i in range(8):
-        result = sess.run(get_next)
-        padded_len = np.max(result)
-        self.assertEqual((4, padded_len), result.shape)
-        for j in range(4):
-          seq_len = random_seq_lens[(i * 4) + j]
-          self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test with random sequence lengths, and constant padding.
-      sess.run(
-          init_op, feed_dict={
-              padded_shape: [25],
-              seq_lens: random_seq_lens
-          })
-      for i in range(8):
-        result = sess.run(get_next)
-        self.assertEqual((4, 25), result.shape)
-        for j in range(4):
-          seq_len = random_seq_lens[(i * 4) + j]
-          self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[j, seq_len:], [0] * (25 - seq_len))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test correct handling of empty tensors.
-      sess.run(init_op, feed_dict={padded_shape: [-1], seq_lens: [0, 0, 0, 0]})
-      result = sess.run(get_next)
-      self.assertAllEqual([[], [], [], []], result)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test error handling with constant sequence lengths, and
-      # too-short padding.
-      sess.run(init_op, feed_dict={padded_shape: [5], seq_lens: [6, 5, 5, 5]})
-      with self.assertRaises(errors.DataLossError):
-        result = sess.run(get_next)
-
-  def testPaddedBatchDatasetNonDefaultPadding(self):
-    seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
-    padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
-
-    def fill_tuple(x):
-      filled = array_ops.fill([x], x)
-      return (filled, string_ops.as_string(filled))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
-        .padded_batch(
-            4,
-            padded_shapes=(padded_shape, padded_shape),
-            padding_values=(-1, "<end>")).make_initializable_iterator())
-
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Test with random sequence lengths, and max padding.
-      random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(
-          init_op, feed_dict={
-              padded_shape: [-1],
-              seq_lens: random_seq_lens
-          })
-      for i in range(8):
-        result = sess.run(get_next)
-        padded_len = np.max(result[0])
-        self.assertEqual((4, padded_len), result[0].shape)
-        self.assertEqual((4, padded_len), result[1].shape)
-        for j in range(4):
-          seq_len = random_seq_lens[(i * 4) + j]
-          self.assertAllEqual(result[0][j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[0][j, seq_len:],
-                              [-1] * (padded_len - seq_len))
-          self.assertAllEqual(result[1][j, :seq_len],
-                              [compat.as_bytes(str(seq_len))] * seq_len)
-          self.assertAllEqual(result[1][j, seq_len:],
-                              [b"<end>"] * (padded_len - seq_len))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPaddedBatchDatasetShapeSpecifications(self):
-    int_placeholder = array_ops.placeholder(dtypes.int32)
-    float_placeholder = array_ops.placeholder(dtypes.float32)
-    string_placeholder = array_ops.placeholder(dtypes.string)
-    input_dataset = dataset_ops.Dataset.from_tensors(
-        (int_placeholder, float_placeholder, string_placeholder))
-
-    # Test different ways of specifying the `padded_shapes` argument.
-    dynamic_padding_from_tensor_shapes = input_dataset.padded_batch(
-        32,
-        padded_shapes=(tensor_shape.TensorShape([None]),
-                       tensor_shape.TensorShape([None, None]),
-                       tensor_shape.TensorShape([37])))
-    dynamic_padding_from_lists = input_dataset.padded_batch(
-        32, padded_shapes=([None], [None, None], [37]))
-    dynamic_padding_from_lists_with_minus_one = input_dataset.padded_batch(
-        32, padded_shapes=([-1], [-1, -1], [37]))
-    dynamic_padding_from_tensors = input_dataset.padded_batch(
-        32,
-        padded_shapes=(constant_op.constant([-1], dtype=dtypes.int64),
-                       constant_op.constant([-1, -1], dtype=dtypes.int64),
-                       constant_op.constant([37], dtype=dtypes.int64)))
-
-    for dataset in [
-        dynamic_padding_from_tensor_shapes, dynamic_padding_from_lists,
-        dynamic_padding_from_lists_with_minus_one, dynamic_padding_from_tensors
-    ]:
-      self.assertEqual([None, None], dataset.output_shapes[0].as_list())
-      self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
-      self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
-
-  def testPaddedBatchSparseError(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
-
-    with self.assertRaises(TypeError):
-      _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10)
-
   def testDenseToSparseBatchDataset(self):
     components = np.random.randint(12, size=(100,)).astype(np.int32)
     iterator = (
diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
index 6de93059d8..f1b494e1a6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
diff --git a/tensorflow/contrib/data/python/kernel_tests/cache_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/cache_dataset_op_test.py
deleted file mode 100644
index 9818020680..0000000000
--- a/tensorflow/contrib/data/python/kernel_tests/cache_dataset_op_test.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from os import path
-import shutil
-import tempfile
-
-import numpy as np
-
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-
-class FilesystemCacheDatasetTest(test.TestCase):
-
-  def setUp(self):
-    self.tmp_dir = tempfile.mkdtemp()
-    self.cache_prefix = path.join(self.tmp_dir, "cache")
-
-  def tearDown(self):
-    if self.tmp_dir:
-      shutil.rmtree(self.tmp_dir, ignore_errors=True)
-
-  def testCacheDatasetPassthrough(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .repeat(count_placeholder))
-
-    cache_dataset = repeat_dataset.cache(filename_placeholder)
-
-    self.assertEqual(
-        tuple([c.shape[1:] for c in components]), cache_dataset.output_shapes)
-
-    # Create initialization ops for iterators without and with
-    # caching, respectively.
-    iterator = iterator_ops.Iterator.from_structure(cache_dataset.output_types,
-                                                    cache_dataset.output_shapes)
-    init_fifo_op = iterator.make_initializer(repeat_dataset)
-    init_cache_op = iterator.make_initializer(cache_dataset)
-
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # First run without caching to collect the "ground truth".
-      sess.run(init_fifo_op)
-      elements = []
-      for _ in range(20):
-        elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Assert that the cached dataset has the same elements as the
-      # "ground truth".
-      sess.run(
-          init_cache_op, feed_dict={filename_placeholder: self.cache_prefix})
-      cached_elements = []
-      for _ in range(20):
-        cached_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(elements, cached_elements)
-
-      # Re-initialize with an empty upstream (to throw errors.OutOfRangeError
-      # if we didn't use the cache).
-      sess.run(
-          init_cache_op,
-          feed_dict={
-              count_placeholder: 0,
-              filename_placeholder: self.cache_prefix
-          })
-      replayed_elements = []
-      for _ in range(20):
-        replayed_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(cached_elements, replayed_elements)
-
-      # Re-initialize with an empty upstream and a missing cache file (should
-      # throw errors.OutOfRangeError immediately).
-      sess.run(
-          init_cache_op,
-          feed_dict={
-              count_placeholder: 0,
-              filename_placeholder: self.cache_prefix + "nonsense"
-          })
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcurrentWriters(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    cache_dataset1 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-    cache_dataset2 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-
-    iterator1 = cache_dataset1.make_initializable_iterator()
-    iterator2 = cache_dataset2.make_initializable_iterator()
-    init_cache_op1 = iterator1.initializer
-    init_cache_op2 = iterator2.initializer
-
-    get_next1 = iterator1.get_next()
-    get_next2 = iterator2.get_next()
-
-    with self.test_session() as sess:
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      sess.run(get_next1)  # this should succeed
-
-      sess.run(
-          init_cache_op2, feed_dict={filename_placeholder: self.cache_prefix})
-      with self.assertRaises(errors.AlreadyExistsError):
-        sess.run(get_next2)
-
-      sess.run(get_next1)  # this should continue to succeed
-
-  def testConcurrentReaders(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    cache_dataset1 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-    cache_dataset2 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-
-    iterator1 = cache_dataset1.make_initializable_iterator()
-    iterator2 = cache_dataset2.make_initializable_iterator()
-    init_cache_op1 = iterator1.initializer
-    init_cache_op2 = iterator2.initializer
-
-    get_next1 = iterator1.get_next()
-    get_next2 = iterator2.get_next()
-
-    with self.test_session() as sess:
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      elements = []
-      for _ in range(4):
-        elements.append(sess.run(get_next1))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next1)
-
-      # Re-initialize
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      sess.run(
-          init_cache_op2, feed_dict={filename_placeholder: self.cache_prefix})
-
-      # Reading concurrently should succeed.
-      elements_itr1 = []
-      elements_itr2 = []
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      # Intentionally reversing the order
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next2)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next1)
-
-      self.assertAllEqual(elements, elements_itr1)
-      self.assertAllEqual(elements, elements_itr2)
-
-
-class MemoryCacheDatasetTest(test.TestCase):
-
-  def testCacheDatasetPassthrough(self):
-    repeat_count = variables.Variable(constant_op.constant(10, dtypes.int64))
-    dataset = dataset_ops.Dataset.range(3).flat_map(
-        lambda x: dataset_ops.Dataset.from_tensors(x).repeat(repeat_count))
-
-    cached_dataset = dataset.cache().repeat(2)
-    uncached_dataset = dataset.repeat(2)
-
-    # Needs to be initializable to capture the variable.
-    cached_iterator = cached_dataset.make_initializable_iterator()
-    cached_next = cached_iterator.get_next()
-    uncached_iterator = uncached_dataset.make_initializable_iterator()
-    uncached_next = uncached_iterator.get_next()
-
-    with self.test_session() as sess:
-
-      sess.run(repeat_count.initializer)
-      sess.run(cached_iterator.initializer)
-      sess.run(uncached_iterator.initializer)
-
-      for i in range(3):
-        for _ in range(10):
-          self.assertEqual(sess.run(cached_next), i)
-          self.assertEqual(sess.run(uncached_next), i)
-
-      sess.run(repeat_count.assign(0))
-
-      # The uncached iterator should now be empty.
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(uncached_next)
-
-      # The cached iterator replays from cache.
-      for i in range(3):
-        for _ in range(10):
-          self.assertEqual(sess.run(cached_next), i)
-
-      # The cached iterator should now be empty.
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(cached_next)
-
-  def testEmptyCacheReading(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-
-    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .repeat(count_placeholder))
-
-    cache_dataset = repeat_dataset.cache()
-
-    # Create initialization ops for iterators without and with
-    # caching, respectively.
-    iterator = cache_dataset.make_initializable_iterator()
-    init_cache_op = iterator.initializer
-
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Initialize with an empty upstream and a missing cache file (should
-      # throw errors.OutOfRangeError immediately).
-      sess.run(init_cache_op, feed_dict={count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcurrentReaders(self):
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    dataset = dataset_ops.Dataset.range(count_placeholder).cache()
-    d1 = dataset.map(lambda x: x + 1)
-    d2 = dataset.map(lambda x: x + 6)
-
-    i1 = d1.make_initializable_iterator()
-    i2 = d2.make_initializable_iterator()
-
-    with self.test_session() as sess:
-      sess.run(i1.initializer)
-
-      self.assertEqual(1, sess.run(i1.get_next()))
-      self.assertEqual(2, sess.run(i1.get_next()))
-      self.assertEqual(3, sess.run(i1.get_next()))
-
-      sess.run(i2.initializer, feed_dict={count_placeholder: 3})
-
-      self.assertEqual(6, sess.run(i2.get_next()))
-      self.assertEqual(7, sess.run(i2.get_next()))
-      self.assertEqual(4, sess.run(i1.get_next()))  # interleave execution
-      self.assertEqual([8, 5], sess.run([i2.get_next(), i1.get_next()]))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(i1.get_next())
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(i2.get_next())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/concatenate_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/concatenate_dataset_op_test.py
index 063c710636..17f2980157 100644
--- a/tensorflow/contrib/data/python/kernel_tests/concatenate_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/concatenate_dataset_op_test.py
@@ -20,117 +20,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
 
-class ConcatenateDatasetTest(test.TestCase):
-
-  def testConcatenateDataset(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 20),
-        np.tile(np.array([[12], [13], [14], [15]]), 15),
-        np.array([37.0, 38.0, 39.0, 40.0]))
-    to_concatenate_components = (
-        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
-        np.tile(np.array([[12], [13], [14], [15], [16]]), 15),
-        np.array([37.0, 38.0, 39.0, 40.0, 41.0]))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-    concatenated = input_dataset.concatenate(dataset_to_concatenate)
-    self.assertEqual(concatenated.output_shapes, (tensor_shape.TensorShape(
-        [20]), tensor_shape.TensorShape([15]), tensor_shape.TensorShape([])))
-
-    iterator = concatenated.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(9):
-        result = sess.run(get_next)
-        if i < 4:
-          for component, result_component in zip(input_components, result):
-            self.assertAllEqual(component[i], result_component)
-        else:
-          for component, result_component in zip(to_concatenate_components,
-                                                 result):
-            self.assertAllEqual(component[i - 4], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcatenateDatasetDifferentShape(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 20),
-        np.tile(np.array([[12], [13], [14], [15]]), 4))
-    to_concatenate_components = (
-        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
-        np.tile(np.array([[12], [13], [14], [15], [16]]), 15))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-    concatenated = input_dataset.concatenate(dataset_to_concatenate)
-    self.assertEqual(
-        [ts.as_list()
-         for ts in nest.flatten(concatenated.output_shapes)], [[20], [None]])
-
-    iterator = concatenated.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(9):
-        result = sess.run(get_next)
-        if i < 4:
-          for component, result_component in zip(input_components, result):
-            self.assertAllEqual(component[i], result_component)
-        else:
-          for component, result_component in zip(to_concatenate_components,
-                                                 result):
-            self.assertAllEqual(component[i - 4], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcatenateDatasetDifferentStructure(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 5),
-        np.tile(np.array([[12], [13], [14], [15]]), 4))
-    to_concatenate_components = (
-        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
-        np.tile(np.array([[12], [13], [14], [15], [16]]), 15),
-        np.array([37.0, 38.0, 39.0, 40.0, 41.0]))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-
-    with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same number of elements"):
-      input_dataset.concatenate(dataset_to_concatenate)
-
-  def testConcatenateDatasetDifferentType(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 5),
-        np.tile(np.array([[12], [13], [14], [15]]), 4))
-    to_concatenate_components = (
-        np.tile(np.array([[1.0], [2.0], [3.0], [4.0]]), 5),
-        np.tile(np.array([[12], [13], [14], [15]]), 15))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-
-    with self.assertRaisesRegexp(TypeError, "have different types"):
-      input_dataset.concatenate(dataset_to_concatenate)
-
-
 class ConcatenateDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
index a90ba30e60..a842502cc6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
@@ -17,713 +17,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
 
 class DatasetConstructorTest(test.TestCase):
 
-  def testFromTensors(self):
-    """Test an dataset that represents a single tuple of tensors."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-
-    iterator = (dataset_ops.Dataset.from_tensors(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def assertSparseValuesEqual(self, a, b):
-    self.assertAllEqual(a.indices, b.indices)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.dense_shape, b.dense_shape)
-
-  def testFromTensorsSparse(self):
-    """Test an dataset that represents a single tuple of tensors."""
-    components = (sparse_tensor.SparseTensorValue(
-        indices=np.array([[0]]),
-        values=np.array([0]),
-        dense_shape=np.array([1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1]]),
-                      values=np.array([-1, 1]),
-                      dense_shape=np.array([2, 2])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensors(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(
-        [tensor_shape.TensorShape(c.dense_shape) for c in components],
-        [shape for shape in iterator.output_shapes])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        self.assertSparseValuesEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorsMixed(self):
-    """Test an dataset that represents a single tuple of tensors."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0]]),
-                      values=np.array([0]),
-                      dense_shape=np.array([1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1]]),
-                      values=np.array([-1, 1]),
-                      dense_shape=np.array([2, 2])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensors(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([
-        tensor_shape.TensorShape(c.dense_shape)
-        if sparse_tensor.is_sparse(c) else c.shape for c in components
-    ], [shape for shape in iterator.output_shapes])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        if sparse_tensor.is_sparse(component):
-          self.assertSparseValuesEqual(component, result_component)
-        else:
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlices(self):
-    """Test an dataset that represents the slices from a tuple of tensors."""
-    components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
-            np.array([[12], [13], [14], [15]]), 22),
-        np.array([37.0, 38.0, 39.0, 40.0])
-    )
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(4):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesSparse(self):
-    """Test an dataset that represents the slices from a tuple of tensors."""
-    components = (sparse_tensor.SparseTensorValue(
-        indices=np.array([[0, 0], [1, 0], [2, 0]]),
-        values=np.array([0, 0, 0]),
-        dense_shape=np.array([3, 1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
-                      values=np.array([1, 2, 3]),
-                      dense_shape=np.array([3, 3])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(
-        [tensor_shape.TensorShape(c.dense_shape[1:]) for c in components],
-        [shape for shape in iterator.output_shapes])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      expected = [
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[0]]),
-               values=np.array([1]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[1]]),
-               values=np.array([2]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[2]]),
-               values=np.array([3]),
-               dense_shape=np.array([3]))),
-      ]
-      for i in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(expected[i], results):
-          self.assertSparseValuesEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesMixed(self):
-    """Test an dataset that represents the slices from a tuple of tensors."""
-    components = (np.tile(np.array([[1], [2], [3]]), 20),
-                  np.tile(np.array([[12], [13], [14]]), 22),
-                  np.array([37.0, 38.0, 39.0]),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 0], [2, 0]]),
-                      values=np.array([0, 0, 0]),
-                      dense_shape=np.array([3, 1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
-                      values=np.array([1, 2, 3]),
-                      dense_shape=np.array([3, 3])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([
-        tensor_shape.TensorShape(c.dense_shape[1:])
-        if sparse_tensor.is_sparse(c) else c.shape[1:] for c in components
-    ], [shape for shape in iterator.output_shapes])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      expected = [
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[0]]),
-               values=np.array([1]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[1]]),
-               values=np.array([2]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[2]]),
-               values=np.array([3]),
-               dense_shape=np.array([3]))),
-      ]
-      for i in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            (zip(*components[:3])[i] + expected[i]), results):
-          if sparse_tensor.is_sparse(component):
-            self.assertSparseValuesEqual(component, result_component)
-          else:
-            self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesWithDict(self):
-    components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(dtypes.int32, iterator.output_types["foo"])
-    self.assertEqual(dtypes.float32, iterator.output_types["bar"])
-    self.assertEqual((), iterator.output_shapes["foo"])
-    self.assertEqual((1,), iterator.output_shapes["bar"])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(3):
-        results = sess.run(get_next)
-        self.assertEqual(components["foo"][i], results["foo"])
-        self.assertEqual(components["bar"][i], results["bar"])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromSparseTensorSlices(self):
-    """Test a dataset based on slices of a `tf.SparseTensor`."""
-    st = array_ops.sparse_placeholder(dtypes.float64)
-    iterator = (dataset_ops.Dataset.from_sparse_tensor_slices(st)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
-
-    with self.test_session() as sess:
-      slices = [[1., 2., 3.], [1.], [1.], [1., 2.], [], [1., 2.], [], [], []]
-
-      # Test with sparse tensor in the appropriate order.
-      indices = np.array(
-          [[i, j] for i in range(len(slices)) for j in range(len(slices[i]))])
-      values = np.array([val for s in slices for val in s])
-      dense_shape = np.array([len(slices), max(len(s) for s in slices) + 1])
-      sparse_feed = sparse_tensor.SparseTensorValue(indices, values,
-                                                    dense_shape)
-      sess.run(init_op, feed_dict={st: sparse_feed})
-      for i, s in enumerate(slices):
-        results = sess.run(get_next)
-        self.assertAllEqual(s, results.values)
-        expected_indices = np.array(
-            [[j] for j in range(len(slices[i]))]).reshape([-1, 1])
-        self.assertAllEqual(expected_indices, results.indices)
-        self.assertAllEqual(dense_shape[1:], results.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test with sparse tensor in the reverse order, which is not
-      # currently supported.
-      reverse_order_indices = indices[::-1, :]
-      reverse_order_values = values[::-1]
-      sparse_feed = sparse_tensor.SparseTensorValue(
-          reverse_order_indices, reverse_order_values, dense_shape)
-      with self.assertRaises(errors.UnimplementedError):
-        sess.run(init_op, feed_dict={st: sparse_feed})
-
-      # Test with an empty sparse tensor.
-      empty_indices = np.empty((0, 4), dtype=np.int64)
-      empty_values = np.empty((0,), dtype=np.float64)
-      empty_dense_shape = [0, 4, 37, 9]
-      sparse_feed = sparse_tensor.SparseTensorValue(empty_indices, empty_values,
-                                                    empty_dense_shape)
-      sess.run(init_op, feed_dict={st: sparse_feed})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  # pylint: disable=g-long-lambda,unnecessary-lambda
-  def testNestedStructure(self):
-    components = (np.array([1, 2, 3]), (np.array([4., 5.]), np.array([6., 7.])),
-                  np.array([8, 9, 10]))
-
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.shuffle(10, 10)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.repeat(-1)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.filter(lambda x, y, z: True)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.take(5)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.map(lambda x, y, z: ((x, z), (y[0], y[1])))
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
-
-    dataset = dataset.flat_map(
-        lambda x, y: dataset_ops.Dataset.from_tensors(((x[0], x[1]),
-                                                       (y[0], y[1])))
-    )
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
-
-    dataset = dataset.batch(32)
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([None, 3], [None, 3]), ([None, 2], [None, 2])),
-                      nest.pack_sequence_as(dataset.output_shapes, [
-                          s.as_list()
-                          for s in nest.flatten(dataset.output_shapes)
-                      ]))
-
-    iterator = dataset.make_one_shot_iterator()
-    (w, x), (y, z) = iterator.get_next()
-    self.assertEquals(dtypes.int64, w.dtype)
-    self.assertEquals(dtypes.int64, x.dtype)
-    self.assertEquals(dtypes.float64, y.dtype)
-    self.assertEquals(dtypes.float64, z.dtype)
-    self.assertEquals([None, 3], w.shape.as_list())
-    self.assertEquals([None, 3], x.shape.as_list())
-    self.assertEquals([None, 2], y.shape.as_list())
-    self.assertEquals([None, 2], z.shape.as_list())
-
-    iterator = dataset.make_initializable_iterator()
-    (w, x), (y, z) = iterator.get_next()
-    self.assertEquals(dtypes.int64, w.dtype)
-    self.assertEquals(dtypes.int64, x.dtype)
-    self.assertEquals(dtypes.float64, y.dtype)
-    self.assertEquals(dtypes.float64, z.dtype)
-    self.assertEquals([None, 3], w.shape.as_list())
-    self.assertEquals([None, 3], x.shape.as_list())
-    self.assertEquals([None, 2], y.shape.as_list())
-    self.assertEquals([None, 2], z.shape.as_list())
-
-    # Define a separate set of components with matching leading
-    # dimension for the from-slices constructor.
-    components_for_slices = (np.array([1, 2, 3]), (np.array(
-        [4., 5., 6.]), np.array([7., 8., 9.])), np.array([10, 11, 12]))
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(components_for_slices)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([], ([], []), []), dataset.output_shapes)
-
-  def testNestedDict(self):
-    components = {"a": {"aa": 1, "ab": [2.0, 2.0]}, "b": [3, 3, 3]}
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals(dtypes.int32, dataset.output_types["a"]["aa"])
-    self.assertEquals(dtypes.float32, dataset.output_types["a"]["ab"])
-    self.assertEquals(dtypes.int32, dataset.output_types["b"])
-    self.assertEquals([], dataset.output_shapes["a"]["aa"])
-    self.assertEquals([2], dataset.output_shapes["a"]["ab"])
-    self.assertEquals([3], dataset.output_shapes["b"])
-
-  def testNonSequenceNestedStructure(self):
-    components = np.array([1, 2, 3])
-
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    dataset = dataset.filter(
-        lambda x: math_ops.reduce_all(math_ops.equal(x, components)))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    dataset = dataset.map(lambda x: array_ops.stack([x, x]))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([2, 3], dataset.output_shapes)
-
-    dataset = dataset.flat_map(
-        lambda x: dataset_ops.Dataset.from_tensor_slices(x))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    self.assertEquals(dtypes.int64, get_next.dtype)
-    self.assertEquals([3], get_next.shape)
-
-  def _testFromGenerator(self, generator, elem_sequence, num_repeats):
-    iterator = (
-        dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64)
-        .repeat(num_repeats)
-        .prefetch(5)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      for _ in range(2):  # Run twice to test reinitialization.
-        sess.run(init_op)
-        for _ in range(num_repeats):
-          for elem in elem_sequence:
-            self.assertAllEqual(elem, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def _testFromGeneratorOneShot(self, generator, elem_sequence, num_repeats):
-    iterator = (
-        dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64)
-        .repeat(num_repeats)
-        .prefetch(5)
-        .make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      for _ in range(num_repeats):
-        for elem in elem_sequence:
-          self.assertAllEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorUsingFunction(self):
-    def generator():
-      for i in range(1, 100):
-        yield [i] * i
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
-    self._testFromGeneratorOneShot(generator, elem_sequence, 1)
-    self._testFromGeneratorOneShot(generator, elem_sequence, 5)
-
-  def testFromGeneratorUsingList(self):
-    generator = lambda: [[i] * i for i in range(1, 100)]
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
-
-  def testFromGeneratorUsingNdarray(self):
-    generator = lambda: np.arange(100, dtype=np.int64)
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
-
-  def testFromGeneratorUsingGeneratorExpression(self):
-    # NOTE(mrry): Generator *expressions* are not repeatable (or in
-    # general reusable), because they eagerly evaluate the `for`
-    # expression as `iter(range(1, 100))` and discard the means of
-    # reconstructing `range(1, 100)`. Wrapping the generator
-    # expression in a `lambda` makes it repeatable.
-    generator = lambda: ([i] * i for i in range(1, 100))
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
-
-  def testFromMultipleConcurrentGenerators(self):
-    num_inner_repeats = 5
-    num_outer_repeats = 100
-
-    def generator():
-      for i in range(1, 10):
-        yield ([i] * i, [i, i ** 2, i ** 3])
-    input_list = list(generator())
-
-    # The interleave transformation is essentially a flat map that
-    # draws from multiple input datasets concurrently (in a cyclic
-    # fashion). By placing `Datsaet.from_generator()` inside an
-    # interleave, we test its behavior when multiple iterators are
-    # active at the same time; by additionally prefetching inside the
-    # interleave, we create the possibility of parallel (modulo GIL)
-    # invocations to several iterators created by the same dataset.
-    def interleave_fn(_):
-      return (dataset_ops.Dataset.from_generator(
-          generator, output_types=(dtypes.int64, dtypes.int64),
-          output_shapes=([None], [3]))
-              .repeat(num_inner_repeats).prefetch(5))
-
-    iterator = (
-        dataset_ops.Dataset.range(num_outer_repeats)
-        .interleave(interleave_fn, cycle_length=10,
-                    block_length=len(input_list))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for _ in range(num_inner_repeats * num_outer_repeats):
-        for elem in input_list:
-          val0, val1 = sess.run(get_next)
-          self.assertAllEqual(elem[0], val0)
-          self.assertAllEqual(elem[1], val1)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorsRunningInParallel(self):
-    num_parallel_iterators = 3
-
-    # Define shared state that multiple iterator instances will access to
-    # demonstrate their concurrent activity.
-    lock = threading.Lock()
-    condition = threading.Condition(lock)
-    next_ticket = [0]  # GUARDED_BY(lock)
-
-    def generator():
-      # NOTE(mrry): We yield one element before the barrier, because
-      # the current implementation of `Dataset.interleave()` must
-      # fetch one element from each incoming dataset to start the
-      # prefetching.
-      yield 0
-
-      # Define a barrier that `num_parallel_iterators` iterators must enter
-      # before any can proceed. Demonstrates that multiple iterators may be
-      # active at the same time.
-      condition.acquire()
-      ticket = next_ticket[0]
-      next_ticket[0] += 1
-      if ticket == num_parallel_iterators - 1:
-        # The last iterator to join the barrier notifies the others.
-        condition.notify_all()
-      else:
-        # Wait until the last iterator enters the barrier.
-        while next_ticket[0] < num_parallel_iterators:
-          condition.wait()
-      condition.release()
-
-      yield 1
-
-    # As in `testFromMultipleConcurrentGenerators()`, we use a combination of
-    # `Dataset.interleave()` and `Dataset.prefetch()` to cause multiple
-    # iterators to be active concurrently.
-    def interleave_fn(_):
-      return dataset_ops.Dataset.from_generator(
-          generator, output_types=dtypes.int64, output_shapes=[]).prefetch(2)
-
-    iterator = (
-        dataset_ops.Dataset.range(num_parallel_iterators)
-        .interleave(
-            interleave_fn, cycle_length=num_parallel_iterators, block_length=1)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for elem in [0, 1]:
-        for _ in range(num_parallel_iterators):
-          self.assertAllEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorImplicitConversion(self):
-    def generator():
-      yield [1]
-      yield [2]
-      yield [3]
-
-    for dtype in [dtypes.int8, dtypes.int32, dtypes.int64]:
-      iterator = (dataset_ops.Dataset.from_generator(
-          generator, output_types=dtype, output_shapes=[1])
-                  .make_initializable_iterator())
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-
-      self.assertEqual(dtype, get_next.dtype)
-
-      with self.test_session() as sess:
-        sess.run(init_op)
-        for expected in [[1], [2], [3]]:
-          next_val = sess.run(get_next)
-          self.assertEqual(dtype.as_numpy_dtype, next_val.dtype)
-          self.assertAllEqual(expected, next_val)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testFromGeneratorTypeError(self):
-    def generator():
-      yield np.array([1, 2, 3], dtype=np.int64)
-      yield np.array([4, 5, 6], dtype=np.int64)
-      yield "ERROR"
-      yield np.array([7, 8, 9], dtype=np.int64)
-
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.int64, output_shapes=[3])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
-      with self.assertRaisesOpError(r"invalid literal for long\(\)"):
-        sess.run(get_next)
-      self.assertAllEqual([7, 8, 9], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorShapeError(self):
-    def generator():
-      yield np.array([1, 2, 3], dtype=np.int64)
-      yield np.array([4, 5, 6], dtype=np.int64)
-      yield np.array([7, 8, 9, 10], dtype=np.int64)
-      yield np.array([11, 12, 13], dtype=np.int64)
-
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.int64, output_shapes=[3])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
-      with self.assertRaisesOpError(r"element of shape \(3,\) was expected"):
-        sess.run(get_next)
-      self.assertAllEqual([11, 12, 13], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSplitPipelineFailsWithPlacementError(self):
-    with session.Session(
-        target="",
-        config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
-
-      dataset = dataset_ops.Dataset.from_tensors(0)
-
-      # Define a pipeline that attempts to use variables on two
-      # different devices.
-      #
-      # Initialize the variables before creating to iterator, to avoid the
-      # placement algorithm overriding the DT_RESOURCE colocation constraints.
-      with ops.device("/cpu:0"):
-        var_0 = resource_variable_ops.ResourceVariable(initial_value=0)
-        dataset = dataset.map(lambda x: x + var_0.read_value())
-      sess.run(var_0.initializer)
-
-      with ops.device("/cpu:1"):
-        var_1 = resource_variable_ops.ResourceVariable(initial_value=0)
-        dataset = dataset.map(lambda x: x + var_1.read_value())
-      sess.run(var_1.initializer)
-
-      iterator = dataset.make_initializable_iterator()
-      sess.run(iterator.initializer)
-
-      with self.assertRaisesRegexp(
-          errors.FailedPreconditionError,
-          "Error while reading resource variable Variable"):
-        sess.run(iterator.get_next())
-
   def testRestructureDataset(self):
     components = (array_ops.placeholder(dtypes.int32),
                   (array_ops.placeholder(dtypes.int32, shape=[None]),
diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
index 06883934d0..b572d6ed77 100644
--- a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
@@ -20,144 +20,12 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class FilterDatasetTest(test.TestCase):
-
-  def testFilterDataset(self):
-    components = (
-        np.arange(7, dtype=np.int64),
-        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
-            7, dtype=np.int64)[:, np.newaxis],
-        np.array(37.0, dtype=np.float64) * np.arange(7)
-    )
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    modulus = array_ops.placeholder(dtypes.int64)
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(count)
-        .filter(lambda x, _y, _z: math_ops.equal(math_ops.mod(x, modulus), 0))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      # Test that we can dynamically feed a different modulus value for each
-      # iterator.
-      def do_test(count_val, modulus_val):
-        sess.run(init_op, feed_dict={count: count_val, modulus: modulus_val})
-        for _ in range(count_val):
-          for i in [x for x in range(7) if x**2 % modulus_val == 0]:
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-      do_test(14, 2)
-      do_test(4, 18)
-
-      # Test an empty dataset.
-      do_test(0, 1)
-
-  def testFilterRange(self):
-    dataset = dataset_ops.Dataset.range(100).filter(
-        lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      self.assertEqual(0, sess.run(get_next))
-      self.assertEqual(1, sess.run(get_next))
-      self.assertEqual(3, sess.run(get_next))
-
-  def testFilterDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .filter(lambda d: math_ops.equal(d["bar"] % 2, 0))
-                .map(lambda d: d["foo"] + d["bar"])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        if (i ** 2) % 2 == 0:
-          self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testUseStepContainerInFilter(self):
-    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
-
-    # Define a predicate that returns true for the first element of
-    # the sequence and not the second, and uses `tf.map_fn()`.
-    def _predicate(xs):
-      squared_xs = functional_ops.map_fn(lambda x: x * x, xs)
-      summed = math_ops.reduce_sum(squared_xs)
-      return math_ops.equal(summed, 1 + 4 + 9)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices([[1, 2, 3], [4, 5, 6]])
-        .filter(_predicate)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(input_data[0], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def assertSparseValuesEqual(self, a, b):
-    self.assertAllEqual(a.indices, b.indices)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.dense_shape, b.dense_shape)
-
-  def testSparse(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1])), i
-
-    def _filter_fn(_, i):
-      return math_ops.equal(i % 2, 0)
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
-            lambda x, i: x).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(5):
-        actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, _map_fn(i * 2)[0])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
 class FilterDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
index 86d69495ef..f3feecef32 100644
--- a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
@@ -17,13 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import random
-
-import numpy as np
-
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -34,124 +29,6 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
-from tensorflow.python.training import server_lib
-
-
-class FlatMapDatasetTest(test.TestCase):
-
-  # pylint: disable=g-long-lambda
-  def testFlatMapDataset(self):
-    repeats = [1, 2, 3, 4, 5, 0, 1]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensors([x]).repeat(x))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in repeats:
-        for _ in range(i):
-          self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedFlatMapDataset(self):
-    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for row in repeats:
-        for i in row:
-          for _ in range(i):
-            self.assertEqual(i, sess.run(get_next))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSharedResourceNestedFlatMapDataset(self):
-    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator(
-                                shared_name="shared_flat_map_iterator"))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    # Create two concurrent sessions that share the same iterator
-    # resource on the same server, and verify that a random
-    # interleaving of `Session.run(get_next)` calls on the two
-    # sessions yields the expected result.
-    server = server_lib.Server.create_local_server()
-    with session.Session(server.target) as sess1:
-      with session.Session(server.target) as sess2:
-        for _ in range(3):
-          sess = random.choice([sess1, sess2])
-          sess.run(init_op)
-          for row in repeats:
-            for i in row:
-              for _ in range(i):
-                sess = random.choice([sess1, sess2])
-                self.assertEqual(i, sess.run(get_next))
-
-        with self.assertRaises(errors.OutOfRangeError):
-          sess = random.choice([sess1, sess2])
-          sess.run(get_next)
-
-  def testMapDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .flat_map(lambda d: dataset_ops.Dataset.from_tensors(d["foo"])
-                          .repeat(d["bar"]))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for _ in range(i ** 2):
-          self.assertEqual(i * 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-  # pylint: enable=g-long-lambda
-
-  def testSparse(self):
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
-
-    def _flat_map_fn(x):
-      return dataset_ops.Dataset.from_tensor_slices(
-          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for j in range(2):
-          expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
 
 
 class FlatMapDatasetSerializationTest(
diff --git a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
index 03d30bd100..32ea44f7c7 100644
--- a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
@@ -17,7 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.contrib.data.python.ops import get_single_element
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -37,7 +38,7 @@ class GetSingleElementTest(test.TestCase):
                .map(lambda x: x * x)
                .take(take_value))
 
-    element = dataset_ops.get_single_element(dataset)
+    element = get_single_element.get_single_element(dataset)
 
     with self.test_session() as sess:
       self.assertEqual(0, sess.run(element, feed_dict={skip_value: 0}))
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index db8429512b..256ad8d94d 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -26,8 +26,8 @@ import numpy as np
 from six.moves import zip_longest
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
@@ -38,182 +38,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
-class InterleaveDatasetTest(test.TestCase):
-
-  def _interleave(self, lists, cycle_length, block_length):
-    # TODO(b/69678297): Consolidate python interleave implementations.
-    num_open = 0
-
-    # `all_iterators` acts as a queue of iterators over each element of `lists`.
-    all_iterators = [iter(l) for l in lists]
-
-    # `open_iterators` are the iterators whose elements are currently being
-    # interleaved.
-    open_iterators = []
-    for i in range(cycle_length):
-      if all_iterators:
-        open_iterators.append(all_iterators.pop(0))
-        num_open += 1
-      else:
-        open_iterators.append(None)
-
-    while num_open or all_iterators:
-      for i in range(cycle_length):
-        if open_iterators[i] is None:
-          if all_iterators:
-            open_iterators[i] = all_iterators.pop(0)
-            num_open += 1
-          else:
-            continue
-        for _ in range(block_length):
-          try:
-            yield next(open_iterators[i])
-          except StopIteration:
-            open_iterators[i] = None
-            num_open -= 1
-            break
-
-  def testPythonImplementation(self):
-    input_lists = [[4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6],
-                   [4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6]]
-
-    # Cycle length 1 acts like `Dataset.flat_map()`.
-    expected_elements = itertools.chain(*input_lists)
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 1, 1)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > 1.
-    expected_elements = [4, 5, 4, 5, 4, 5, 4,
-                         5, 5, 6, 6,  # NOTE(mrry): When we cycle back
-                                      # to a list and are already at
-                                      # the end of that list, we move
-                                      # on to the next element.
-                         4, 6, 4, 6, 4, 6, 4, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 2, 1)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > 1 and block length > 1.
-    expected_elements = [4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6,
-                         4, 5, 5, 5, 6, 6, 6, 5, 5, 6, 6, 6]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 2, 3)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > len(input_values).
-    expected_elements = [4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6,
-                         4, 4, 5, 5, 6, 6, 5, 6, 6, 5, 6, 6]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 7, 2)):
-      self.assertEqual(expected, produced)
-
-  def testInterleaveDataset(self):
-    input_values = array_ops.placeholder(dtypes.int64, shape=[None])
-    cycle_length = array_ops.placeholder(dtypes.int64, shape=[])
-    block_length = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_count = 2
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_values)
-        .repeat(repeat_count)
-        .interleave(lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
-                    cycle_length, block_length))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    next_element = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Cycle length 1 acts like `Dataset.flat_map()`.
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 1, block_length: 3})
-
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 1, 3):
-        self.assertEqual(expected_element, sess.run(next_element))
-
-      # Cycle length > 1.
-      # expected: [4, 5, 4, 5, 4, 5, 4, 5, 5, 6, 6, 4, 6, 4, 6, 4, 6, 4, 6, 5,
-      #            6, 5, 6, 5, 6, 5, 6, 5]
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 2, block_length: 1})
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 2, 1):
-        self.assertEqual(expected_element, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Cycle length > 1 and block length > 1.
-      # expected: [4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 5,
-      #            5, 5, 6, 6, 6, 5, 5, 6, 6, 6]
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 2, block_length: 3})
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 2, 3):
-        self.assertEqual(expected_element, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Cycle length > len(input_values) * repeat_count.
-      # expected: [4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4,
-      #            5, 5, 6, 6, 5, 6, 6, 5, 6, 6]
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 7, block_length: 2})
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 7, 2):
-        self.assertEqual(expected_element, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Empty input.
-      sess.run(init_op, feed_dict={input_values: [],
-                                   cycle_length: 2, block_length: 3})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Non-empty input leading to empty output.
-      sess.run(init_op, feed_dict={input_values: [0, 0, 0],
-                                   cycle_length: 2, block_length: 3})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Mixture of non-empty and empty interleaved datasets.
-      sess.run(init_op, feed_dict={input_values: [4, 0, 6],
-                                   cycle_length: 2, block_length: 3})
-      for expected_element in self._interleave(
-          [[4] * 4, [], [6] * 6] * repeat_count, 2, 3):
-        self.assertEqual(expected_element, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testSparse(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
-
-    def _interleave_fn(x):
-      return dataset_ops.Dataset.from_tensor_slices(
-          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).interleave(
-            _interleave_fn, cycle_length=1).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for j in range(2):
-          expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
-class InterleaveDatasetSeriazationTest(
+class InterleaveDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def _build_iterator_graph(self, input_values, cycle_length, block_length):
diff --git a/tensorflow/contrib/data/python/kernel_tests/list_files_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/list_files_dataset_op_test.py
deleted file mode 100644
index 27298de65f..0000000000
--- a/tensorflow/contrib/data/python/kernel_tests/list_files_dataset_op_test.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from os import path
-import shutil
-import tempfile
-
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-
-
-class ListFilesDatasetOpTest(test.TestCase):
-
-  def setUp(self):
-    self.tmp_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    shutil.rmtree(self.tmp_dir, ignore_errors=True)
-
-  def _touchTempFiles(self, filenames):
-    for filename in filenames:
-      open(path.join(self.tmp_dir, filename), 'a').close()
-
-  def testEmptyDirectory(self):
-    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
-    with self.test_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testSimpleDirectory(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
-    with self.test_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
-      self.assertItemsEqual(full_filenames, produced_filenames)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testEmptyDirectoryInitializer(self):
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.test_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testSimpleDirectoryInitializer(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.test_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
-
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testFileSuffixes(self):
-    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.test_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames[1:-1]:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testFileMiddles(self):
-    filenames = ['a.txt', 'b.py', 'c.pyc']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.test_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py*')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames[1:]:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
-
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index d3ce89298b..8d40429279 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -16,15 +16,12 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from collections import namedtuple
 
 import os
-import threading
 
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops as contrib_dataset_ops
 from tensorflow.contrib.data.python.ops import error_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -34,15 +31,9 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import script_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
@@ -50,231 +41,11 @@ from tensorflow.python.util import compat
 
 class MapDatasetTest(test.TestCase):
 
-  def _buildMapDataset(self, components, count):
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    return (
-        contrib_dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(count))
-
-  def testMapDataset(self):
-    """Test an dataset that maps a TF function across its input elements."""
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(count).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-
-    dataset = self._buildMapDataset(components, count)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      # Test single-threaded access to the iterator.
-      sess.run(init_op, feed_dict={count: 14})
-      for _ in range(14):
-        for i in range(7):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test multi-threaded access to the same iterator.
-      sess.run(init_op, feed_dict={count: 18})
-      results = []
-      def iterator_thread():
-        while True:
-          try:
-            results.append(sess.run(get_next))
-          except errors.OutOfRangeError:
-            return
-      threads = [self.checkedThread(target=iterator_thread) for _ in range(8)]
-      for t in threads:
-        t.start()
-      for t in threads:
-        t.join()
-
-      # `results` will contain the same elements components**2
-      # repeated 18 times, but in a non-deterministic order. Sort the
-      # results, and assert that each element of components**2 is
-      # produced 18 times.
-      results.sort(key=lambda x: x[0])
-      for i in range(7):
-        for j in range(18):
-          for component, result_component in zip(components,
-                                                 results[i * 18 + j]):
-            self.assertAllEqual(component[i]**2, result_component)
-
-  def _buildParallelMapDataset(self, components, count, num_threads,
-                               output_buffer_size):
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    return (contrib_dataset_ops.Dataset.from_tensor_slices(components).map(
-        _map_fn, num_threads=num_threads, output_buffer_size=output_buffer_size)
-            .repeat(count))
-
-  def testParallelMapDataset(self):
-    """Test an dataset that maps a TF function across its input elements."""
-    # The pipeline is TensorSliceDataset -> ParallelMapDataset(square_3) ->
-    # RepeatDataset(count).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    num_threads = array_ops.placeholder(dtypes.int32, shape=[])
-    output_buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    dataset = self._buildParallelMapDataset(components, count, num_threads,
-                                            output_buffer_size)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      def do_test(num_threads_val, output_buffer_size_val):
-        # Test single-threaded access to the iterator.
-        sess.run(init_op, feed_dict={
-            count: 14,
-            num_threads: num_threads_val,
-            output_buffer_size: output_buffer_size_val})
-        for _ in range(14):
-          for i in range(7):
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-        # Test multi-threaded access to the same iterator.
-        sess.run(init_op, feed_dict={
-            count: 18,
-            num_threads: num_threads_val,
-            output_buffer_size: output_buffer_size_val})
-        results = []
-        def iterator_thread():
-          while True:
-            try:
-              results.append(sess.run(get_next))
-            except errors.OutOfRangeError:
-              return
-        threads = [self.checkedThread(target=iterator_thread)
-                   for _ in range(64)]
-        for t in threads:
-          t.start()
-        for t in threads:
-          t.join()
-
-        # `results` will contain the same elements components**2
-        # repeated 18 times, but in a non-deterministic order. Sort the
-        # results, and assert that each element of components**2 is
-        # produced 18 times.
-        results.sort(key=lambda x: x[0])
-        for i in range(7):
-          for j in range(18):
-            for component, result_component in zip(components,
-                                                   results[i * 18 + j]):
-              self.assertAllEqual(component[i]**2, result_component)
-
-      for num_threads_val, output_buffer_size_val in [
-          (1, 1), (1, 2), (2, 2), (2, 4), (8, 8), (8, 16)]:
-        do_test(num_threads_val, output_buffer_size_val)
-
-  def testImplicitDisposeParallelMapDataset(self):
-    # Tests whether a parallel map dataset will be cleaned up correctly when
-    # the pipeline does not run it until exhaustion.
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(1000).
-    components = (np.arange(1000),
-                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
-                  np.array(37.0) * np.arange(1000))
-
-    dataset = self._buildParallelMapDataset(components, 1000, 100, 100)
-    # NOTE(mrry): Also test that the prefetching thread is cancelled correctly.
-    dataset = dataset.prefetch(100)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-
-  def testParallelMapUnspecifiedOutputSize(self):
-    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
-
-    dataset = (
-        contrib_dataset_ops.Dataset.from_tensor_slices(components).map(
-            lambda x: array_ops.check_numerics(x, "message"), num_threads=2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-
-  def testParallelMapError(self):
-    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
-
-    dataset = (
-        contrib_dataset_ops.Dataset.from_tensor_slices(components).map(
-            lambda x: array_ops.check_numerics(x, "message"),
-            num_threads=2,
-            output_buffer_size=2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPrefetchError(self):
-    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
-
-    dataset = (
-        contrib_dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.check_numerics(x, "message")).prefetch(2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
   def testMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
     dataset = (
-        contrib_dataset_ops.Dataset.from_tensor_slices(components)
+        dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.check_numerics(x, "message")).apply(
             error_ops.ignore_errors()))
     iterator = dataset.make_initializable_iterator()
@@ -292,10 +63,9 @@ class MapDatasetTest(test.TestCase):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
     dataset = (
-        contrib_dataset_ops.Dataset.from_tensor_slices(components).map(
+        dataset_ops.Dataset.from_tensor_slices(components).map(
             lambda x: array_ops.check_numerics(x, "message"),
-            num_threads=2,
-            output_buffer_size=2).apply(error_ops.ignore_errors()))
+            num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -317,8 +87,8 @@ class MapDatasetTest(test.TestCase):
       write_string_to_file(filename, filename)
 
     dataset = (
-        contrib_dataset_ops.Dataset.from_tensor_slices(filenames).map(
-            io_ops.read_file, num_threads=2, output_buffer_size=2).apply(
+        dataset_ops.Dataset.from_tensor_slices(filenames).map(
+            io_ops.read_file, num_parallel_calls=2).prefetch(2).apply(
                 error_ops.ignore_errors()))
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
@@ -343,350 +113,6 @@ class MapDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testCaptureHashTable(self):
-    # NOTE(mrry): We must use the V2 variants of `HashTable`
-    # etc. because these produce a `tf.resource`-typed output that is
-    # compatible with the in-graph function implementation.
-    default_val = -1
-    keys = constant_op.constant(["brain", "salad", "surgery"])
-    values = constant_op.constant([0, 1, 2], dtypes.int64)
-    table = lookup_ops.HashTable(
-        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-
-    input_sentences = contrib_dataset_ops.Dataset.from_tensor_slices(
-        ["brain brain tank salad surgery", "surgery brain"])
-
-    iterator = (input_sentences
-                .map(lambda x: string_ops.string_split([x]).values)
-                .map(table.lookup)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(table.init)
-      sess.run(init_op)
-
-      print(sess.run(get_next))
-      print(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testCaptureQueue(self):
-    elements = np.random.randint(100, size=[200])
-    queue = data_flow_ops.FIFOQueue(200, dtypes.int64, shapes=[])
-    enqueue_op = queue.enqueue_many(elements)
-    close_op = queue.close()
-    iterator = (
-        contrib_dataset_ops.Dataset.from_tensors(0).repeat(-1)
-        .map(lambda _: queue.dequeue()).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
-      for element in elements:
-        self.assertEqual(element, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testCaptureSameResourceMultipleTimes(self):
-    elements = np.random.randint(100, size=[200])
-    queue = data_flow_ops.FIFOQueue(
-        200, dtypes.int64, shapes=[], shared_name="shared_queue")
-    queue_2 = data_flow_ops.FIFOQueue(
-        200, dtypes.int64, shapes=[], shared_name="shared_queue")
-
-    enqueue_op = queue.enqueue_many(elements)
-    close_op = queue.close()
-
-    iterator = (
-        contrib_dataset_ops.Dataset.from_tensors(0).repeat(-1)
-        .map(lambda _: (queue.dequeue(), queue_2.dequeue()))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
-      for i in range(100):
-        self.assertEqual(sorted([elements[i * 2], elements[i * 2 + 1]]),
-                         sorted(sess.run(get_next)))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testCaptureVariable(self):
-    counter_var = variable_scope.get_variable(
-        "counter", (), dtypes.int32, use_resource=True)
-    iterator = (
-        contrib_dataset_ops.Dataset.from_tensors(0).repeat(10)
-        .map(lambda _: counter_var.assign_add(1)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(counter_var.initializer)
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i, sess.run(counter_var))
-        self.assertEqual(i + 1, sess.run(get_next))
-      self.assertEqual(10, sess.run(counter_var))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(10, sess.run(counter_var))
-
-  def testCaptureUninitializedVariableError(self):
-    counter_var = variable_scope.get_variable(
-        "counter", (), dtypes.int32, use_resource=True)
-    iterator = (
-        contrib_dataset_ops.Dataset.from_tensors(0).repeat(10)
-        .map(lambda _: counter_var.assign_add(1)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      with self.assertRaises(errors.NotFoundError):
-        sess.run(get_next)
-
-  def testSeededStatefulOperatorIsProperlyStateful(self):
-    iterator = (
-        contrib_dataset_ops.Dataset.from_tensors(0).repeat(10)
-        .map(lambda _: random_ops.random_uniform((), seed=11)).batch(2)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      random_values = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          random_values.extend(sess.run(get_next))
-      self.assertEqual(10, len(random_values))
-      self.assertGreater(np.abs(np.diff(random_values)).max(), 1e-6)
-      sess.run(init_op)
-      random_values_2 = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          random_values_2.extend(sess.run(get_next))
-
-      # Randomness is repeatable given same seed
-      self.assertAllClose(random_values, random_values_2)
-
-  def testMapDict(self):
-    iterator = (contrib_dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .map(lambda d: d["foo"] + d["bar"])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testMapNamedtuple(self, count=10):
-    # construct dataset of tuples
-    labels = contrib_dataset_ops.Dataset.range(count)
-    images = labels.map(lambda l: -l)
-    dataset_tuple = contrib_dataset_ops.Dataset.zip((labels, images))
-
-    # convert dataset of tuples to dataset of namedtuples
-    example = namedtuple("Example", ["label", "image"])
-    dataset_namedtuple = dataset_tuple.map(example)
-
-    def preprocess_tuple(label, image):
-      image = 2 * image
-      return label, image
-
-    def preprocess_namedtuple(example):
-      return example._replace(image=2 * example.image)
-
-    # preprocess both datasets
-    dataset_tuple = dataset_tuple.map(preprocess_tuple)
-    dataset_namedtuple = dataset_namedtuple.map(preprocess_namedtuple)
-
-    next_tuple = dataset_tuple.make_one_shot_iterator().get_next()
-    next_namedtuple = dataset_namedtuple.make_one_shot_iterator().get_next()
-
-    # make sure both datasets contain the same data
-    with self.test_session() as sess:
-      for i in range(count):
-        tuple_, namedtuple_ = sess.run([next_tuple, next_namedtuple])
-        self.assertEqual(tuple_, namedtuple_)
-        self.assertEqual(tuple_, (i, -2 * i))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_namedtuple)
-
-  def testUseStepContainerInMap(self):
-    row = np.arange(6)
-    iterator = (
-        contrib_dataset_ops.Dataset.from_tensors(row)
-        .map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(row ** 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPrefetch(self):
-    # We will use this event to test that `_map_py_func()` has been
-    # invoked a certain number of times (6 times, to be exact) after
-    # consuming fewer elements from the iterator.
-    ev = threading.Event()
-
-    set_event_during_invocation = 5
-
-    def _map_py_func(x):
-      if x == set_event_during_invocation:
-        ev.set()
-      return x * x
-
-    def _map_fn(x):
-      return script_ops.py_func(_map_py_func, [x], x.dtype)
-
-    buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
-        contrib_dataset_ops.Dataset.range(100).map(_map_fn)
-        .prefetch(buffer_size_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Simple test that prefetch yields the expected values in the
-      # expected order.
-      for buffer_size in [1, 10, 100, 1000]:
-        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
-        for i in range(100):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-      # We can indirectly observe that varying the buffer size has the
-      # intended effect by observing when `ev` is set (on the 6th
-      # invocation of `_map_py_func()`).
-      # NOTE(mrry): We do not test with `buffer_size ==
-      # set_event_during_invocation`, because we must consume at least
-      # one element to start the prefetching.
-      for buffer_size in range(1, set_event_during_invocation):
-        event_will_be_set_after_consuming = (
-            set_event_during_invocation - buffer_size + 1)
-
-        ev.clear()
-        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
-        for i in range(event_will_be_set_after_consuming):
-          self.assertFalse(ev.is_set())
-          self.assertEqual(i * i, sess.run(get_next))
-        ev.wait()
-        for i in range(event_will_be_set_after_consuming, 100):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testReturnList(self):
-    iterator = (
-        contrib_dataset_ops.Dataset.range(10)
-        .map(lambda x: [x, constant_op.constant(37.0)])
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testMultiOutputPyFunc(self):
-    # The `tf.py_func()` op returns a list of tensors for its outputs.
-    def _map_fn(x_tensor):
-      def _map_py_func(x):
-        return x, np.array(37.0, dtype=np.float64)
-      return script_ops.py_func(
-          _map_py_func, [x_tensor], [dtypes.int64, dtypes.float64])
-
-    iterator = (
-        contrib_dataset_ops.Dataset.range(10).map(_map_fn)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def assertSparseValuesEqual(self, a, b):
-    self.assertAllEqual(a.indices, b.indices)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.dense_shape, b.dense_shape)
-
-  def testSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1]))
-
-    iterator = (
-        contrib_dataset_ops.Dataset.range(10).map(_sparse)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, _sparse(i))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSparseChain(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1]))
-
-    def _check(i):
-      self.assertTrue(sparse_tensor.is_sparse(i))
-      return sparse_ops.sparse_concat(0, [i, i])
-
-    iterator = (
-        contrib_dataset_ops.Dataset.range(10).map(_sparse).map(_check)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
   def testCaptureResourceInMapFn(self):
 
     def _build_ds(iterator):
@@ -695,10 +121,10 @@ class MapDatasetTest(test.TestCase):
         get_next = iterator.get_next()
         return x * get_next
 
-      return contrib_dataset_ops.Dataset.range(10).map(_map_fn)
+      return dataset_ops.Dataset.range(10).map(_map_fn)
 
     def _build_graph():
-      captured_iterator = contrib_dataset_ops.Dataset.range(
+      captured_iterator = dataset_ops.Dataset.range(
           10).make_initializable_iterator()
       ds = _build_ds(captured_iterator)
       iterator = ds.make_initializable_iterator()
@@ -734,7 +160,7 @@ class MapDatasetSerializationTest(
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
     return (
-        contrib_dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
         .repeat(self._num_epochs))
 
   def testSaveRestoreCore(self):
@@ -751,7 +177,7 @@ class MapDatasetSerializationTest(
         return random_ops.random_uniform(
             (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
 
-      return contrib_dataset_ops.Dataset.range(100).map(_map_fn)
+      return dataset_ops.Dataset.range(100).map(_map_fn)
 
     self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
 
@@ -760,7 +186,7 @@ class MapDatasetSerializationTest(
     def _build_ds():
       counter_var = variable_scope.get_variable(
           "counter", (), dtypes.int32, use_resource=True)
-      return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map(
+      return (dataset_ops.Dataset.from_tensors(0).repeat(10).map(
           lambda _: counter_var.assign_add(1)))
 
     self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
@@ -769,7 +195,7 @@ class MapDatasetSerializationTest(
 
     def _build_ds():
       constant_var = constant_op.constant(5)
-      return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map(
+      return (dataset_ops.Dataset.from_tensors(0).repeat(10).map(
           lambda x: x + constant_var))
 
     self.run_core_tests(_build_ds, None, 10)
@@ -783,7 +209,7 @@ class MapDatasetSerializationTest(
       def defun_fn(x):
         return constant_op.constant(1000) + math_ops.to_int32(x)
 
-      return contrib_dataset_ops.Dataset.range(num_outputs).map(defun_fn)
+      return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
 
     self.run_core_tests(_build_ds, None, num_outputs)
 
@@ -801,7 +227,7 @@ class MapDatasetSerializationTest(
 
         return constant_op.constant(11000) + defun_fn_deep(math_ops.to_int32(x))
 
-      return contrib_dataset_ops.Dataset.range(num_outputs).map(defun_fn)
+      return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
 
     self.run_core_tests(_build_ds, None, num_outputs)
 
@@ -814,7 +240,7 @@ class MapDatasetSerializationTest(
           dense_shape=np.array([1, 1]))
 
     def _build_ds(num_outputs):
-      return contrib_dataset_ops.Dataset.range(num_outputs).map(_sparse)
+      return dataset_ops.Dataset.range(num_outputs).map(_sparse)
 
     num_outputs = 10
     self.run_core_tests(lambda: _build_ds(num_outputs),
@@ -866,7 +292,7 @@ class ParallelMapDatasetSerializationTest(
         return random_ops.random_uniform(
             (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
 
-      return contrib_dataset_ops.Dataset.range(100).map(
+      return dataset_ops.Dataset.range(100).map(
           _map_fn, num_parallel_calls=2).prefetch(2)
 
     self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
@@ -876,7 +302,7 @@ class ParallelMapDatasetSerializationTest(
     def _build_ds():
       counter_var = variable_scope.get_variable(
           "counter", (), dtypes.int32, use_resource=True)
-      return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map(
+      return (dataset_ops.Dataset.from_tensors(0).repeat(10).map(
           lambda _: counter_var.assign_add(1),
           num_parallel_calls=2).prefetch(2))
 
@@ -886,7 +312,7 @@ class ParallelMapDatasetSerializationTest(
 
     def _build_ds():
       constant_var = constant_op.constant(5)
-      return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map(
+      return (dataset_ops.Dataset.from_tensors(0).repeat(10).map(
           lambda x: x + constant_var, num_parallel_calls=2).prefetch(2))
 
     self.run_core_tests(_build_ds, None, 10)
@@ -900,7 +326,7 @@ class ParallelMapDatasetSerializationTest(
       def defun_fn(x):
         return constant_op.constant(1000) + math_ops.to_int32(x)
 
-      return contrib_dataset_ops.Dataset.range(num_outputs).map(
+      return dataset_ops.Dataset.range(num_outputs).map(
           defun_fn, num_parallel_calls=2).prefetch(2)
 
     self.run_core_tests(_build_ds, None, num_outputs)
@@ -919,7 +345,7 @@ class ParallelMapDatasetSerializationTest(
 
         return constant_op.constant(11000) + defun_fn_deep(math_ops.to_int32(x))
 
-      return contrib_dataset_ops.Dataset.range(num_outputs).map(
+      return dataset_ops.Dataset.range(num_outputs).map(
           defun_fn, num_parallel_calls=2).prefetch(2)
 
     self.run_core_tests(_build_ds, None, num_outputs)
@@ -929,7 +355,7 @@ class IgnoreErrorsSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def _build_ds(self, components):
-    return contrib_dataset_ops.Dataset.from_tensor_slices(components).map(
+    return dataset_ops.Dataset.from_tensor_slices(components).map(
         lambda x: array_ops.check_numerics(x, "message")).apply(
             error_ops.ignore_errors())
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
index a431670829..80e1cb0041 100644
--- a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
@@ -21,14 +21,13 @@ import os
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import counter
-from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import enumerate_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import parsing_ops
@@ -38,131 +37,6 @@ from tensorflow.python.platform import test
 
 class RangeDatasetTest(test.TestCase):
 
-  def testStop(self):
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={stop: 5})
-      for i in range(5):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStartStop(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start,
-                                         stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 5})
-      for i in range(2, 5):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStartStopStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 10, step: 2})
-      for i in range(2, 10, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testZeroStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-
-    with self.test_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={start: 2, stop: 10, step: 0})
-
-  def testNegativeStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 10, step: -1})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(2, 10, -1):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStopLessThanStart(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start,
-                                         stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(10, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStopLessThanStartWithPositiveStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2, step: 2})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(10, 2, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStopLessThanStartWithNegativeStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2, step: -1})
-      for i in range(10, 2, -1):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
   def testEnumerateDataset(self):
     components = (["a", "b"], [1, 2], [37.0, 38])
     start = constant_op.constant(20, dtype=dtypes.int64)
diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
index 0ac8d7359f..3c7b46629e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/resample_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import resampling
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
index 1a26da82e5..36ddf30042 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@@ -20,194 +20,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
 
-class SequenceDatasetTest(test.TestCase):
-
-  def testRepeatTensorDataset(self):
-    """Test a dataset that repeats its input multiple times."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    # This placeholder can be fed when dataset-definition subgraph
-    # runs (i.e. `init_op` below) to configure the number of
-    # repetitions used in a particular iterator.
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensors(components)
-                .repeat(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      # Test a finite repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 3})
-      for _ in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test a different finite repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 7})
-      for _ in range(7):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test an empty repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test an infinite repetition.
-      # NOTE(mrry): There's not a good way to test that the sequence
-      # actually is infinite.
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      for _ in range(17):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-
-  def testTakeTensorDataset(self):
-    components = (np.arange(10),)
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .take(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      # Take fewer than input size
-      sess.run(init_op, feed_dict={count_placeholder: 4})
-      for i in range(4):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take more than input size
-      sess.run(init_op, feed_dict={count_placeholder: 25})
-      for i in range(10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take all of input
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      for i in range(10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take nothing
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSkipTensorDataset(self):
-    components = (np.arange(10),)
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .skip(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      # Skip fewer than input size, we should skip
-      # the first 4 elements and then read the rest.
-      sess.run(init_op, feed_dict={count_placeholder: 4})
-      for i in range(4, 10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip more than input size: get nothing.
-      sess.run(init_op, feed_dict={count_placeholder: 25})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip exactly input size.
-      sess.run(init_op, feed_dict={count_placeholder: 10})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Set -1 for 'count': skip the entire dataset.
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip nothing
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-      for i in range(0, 10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testRepeatRepeatTensorDataset(self):
-    """Test the composition of repeat datasets."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    inner_count = array_ops.placeholder(dtypes.int64, shape=[])
-    outer_count = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensors(components).repeat(inner_count)
-                .repeat(outer_count).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={inner_count: 7, outer_count: 14})
-      for _ in range(7 * 14):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testRepeatEmptyDataset(self):
-    """Test that repeating an empty dataset does not hang."""
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10).skip(10)
-                .repeat(-1).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      with self.assertRaisesRegexp(
-          errors.OutOfRangeError,
-          "Attempted to repeat an empty dataset infinitely."):
-        sess.run(get_next)
-
-
 class SequenceDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/shard_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/shard_dataset_op_test.py
deleted file mode 100644
index 0b3c32c06e..0000000000
--- a/tensorflow/contrib/data/python/kernel_tests/shard_dataset_op_test.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import test
-
-
-class ShardDatasetOpTest(test.TestCase):
-
-  def testSimpleCase(self):
-    dataset = dataset_ops.Dataset.range(10).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      self.assertEqual(2, sess.run(iterator.get_next()))
-      self.assertEqual(7, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testNestedData(self):
-    dataset_a = dataset_ops.Dataset.range(10)
-    dataset_b = dataset_ops.Dataset.range(10, 0, -1)
-    dataset = dataset_ops.Dataset.zip((dataset_a, dataset_b)).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      self.assertEqual((2, 8), sess.run(iterator.get_next()))
-      self.assertEqual((7, 3), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testOffsetZero(self):
-    dataset = dataset_ops.Dataset.range(10).shard(5, 0)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      self.assertEqual(0, sess.run(iterator.get_next()))
-      self.assertEqual(5, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testOffsetGreaterNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, 7)
-
-  def testNegativeOffset(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, -3)
-
-  def testNegativeNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(-3, 1)
-
-  def testZeroNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(0, 1)
-
-  def testIteratorEndsBeforeFirstElem(self):
-    dataset = dataset_ops.Dataset.range(1).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testLargerWorkerPool(self):
-    dataset = dataset_ops.Dataset.range(10).shard(7, 5)
-    iterator = dataset.make_one_shot_iterator()
-    with self.test_session() as sess:
-      self.assertEqual(5, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testIndexEqualsNumShards(self):
-    dataset = dataset_ops.Dataset.range(10).shard(5, 4)
-    iterator = dataset.make_one_shot_iterator()
-    with self.test_session() as sess:
-      self.assertEqual(4, sess.run(iterator.get_next()))
-      self.assertEqual(9, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testIndexEqualsNumShards2(self):
-    dataset = dataset_ops.Dataset.range(10).shard(4, 3)
-    iterator = dataset.make_one_shot_iterator()
-    with self.test_session() as sess:
-      self.assertEqual(3, sess.run(iterator.get_next()))
-      self.assertEqual(7, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
index 45943d56ec..bcc644c097 100644
--- a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
@@ -17,144 +17,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops as contrib_dataset_ops
 from tensorflow.contrib.data.python.ops import shuffle_ops
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class ShuffleDatasetTest(test.TestCase):
-
-  def testShuffleDataset(self):
-    components = (
-        np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-        np.array([9.0, 10.0, 11.0, 12.0])
-    )
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = (
-        contrib_dataset_ops.Dataset.from_tensor_slices(components)
-        .repeat(count_placeholder))
-
-    shuffle_dataset = repeat_dataset.shuffle(buffer_size_placeholder,
-                                             seed_placeholder)
-
-    self.assertEqual(tuple([c.shape[1:] for c in components]),
-                     shuffle_dataset.output_shapes)
-
-    # Create initialization ops for iterators without and with
-    # shuffling, respectively.
-    iterator = iterator_ops.Iterator.from_structure(
-        shuffle_dataset.output_types, shuffle_dataset.output_shapes)
-    init_fifo_op = iterator.make_initializer(repeat_dataset)
-    init_shuffle_op = iterator.make_initializer(shuffle_dataset)
-
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # First run without shuffling to collect the "ground truth".
-      sess.run(init_fifo_op)
-      unshuffled_elements = []
-      for _ in range(20):
-        unshuffled_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Assert that the shuffled dataset has the same elements as the
-      # "ground truth".
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 37})
-      shuffled_elements = []
-      for _ in range(20):
-        shuffled_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(
-          sorted(unshuffled_elements), sorted(shuffled_elements))
-
-      # Assert that shuffling twice with the same seeds gives the same sequence.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 37})
-      reshuffled_elements_same_seed = []
-      for _ in range(20):
-        reshuffled_elements_same_seed.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(shuffled_elements, reshuffled_elements_same_seed)
-
-      # Assert that shuffling twice with a different seed gives a different
-      # permutation of the same elements.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 1037})
-      reshuffled_elements_different_seed = []
-      for _ in range(20):
-        reshuffled_elements_different_seed.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertNotEqual(shuffled_elements, reshuffled_elements_different_seed)
-      self.assertAllEqual(
-          sorted(shuffled_elements), sorted(reshuffled_elements_different_seed))
-
-      # Assert that the shuffled dataset has the same elements as the
-      # "ground truth" when the buffer size is smaller than the input
-      # dataset.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 2,
-                     seed_placeholder: 37})
-      reshuffled_elements_small_buffer = []
-      for _ in range(20):
-        reshuffled_elements_small_buffer.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(
-          sorted(unshuffled_elements), sorted(reshuffled_elements_small_buffer))
-
-      # Test the case of shuffling an empty dataset.
-      sess.run(init_shuffle_op, feed_dict={buffer_size_placeholder: 2,
-                                           seed_placeholder: 37,
-                                           count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testDefaultArguments(self):
-    components = [0, 1, 2, 3, 4]
-    iterator = (
-        contrib_dataset_ops.Dataset.from_tensor_slices(components).shuffle(5)
-        .repeat().make_one_shot_iterator())
-
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      counts = collections.defaultdict(lambda: 0)
-      for _ in range(10):
-        for _ in range(5):
-          counts[sess.run(get_next)] += 1
-
-    for i in range(5):
-      self.assertEqual(10, counts[i])
-
-
 class ShuffleDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py
index 55296d5710..3c436f7a0b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py
@@ -18,8 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import unique
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
index 5d34b0024c..e39fa957f0 100644
--- a/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
@@ -20,97 +20,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
 
-class ZipDatasetTest(test.TestCase):
-
-  def testZipDataset(self):
-    component_placeholders = [
-        array_ops.placeholder(dtypes.int64),
-        array_ops.placeholder(dtypes.int64),
-        array_ops.placeholder(dtypes.float64)
-    ]
-
-    datasets = tuple([
-        dataset_ops.Dataset.from_tensor_slices(component_placeholder)
-        for component_placeholder in component_placeholders
-    ])
-    zipped = dataset_ops.Dataset.zip(datasets)
-
-    iterator = zipped.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      equal_length_components = [
-          np.tile(np.array([[1], [2], [3], [4]]), 20),
-          np.tile(np.array([[12], [13], [14], [15]]), 22),
-          np.array([37.0, 38.0, 39.0, 40.0])
-      ]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, equal_length_components)})
-      for i in range(4):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            equal_length_components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      variable_length_components = [[1, 2, 3, 4], [1, 2, 3, 4, 5], [1.0, 2.0]]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, variable_length_components)})
-      for i in range(2):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            variable_length_components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedZipDataset(self):
-    component_placeholders = [
-        array_ops.placeholder(dtypes.int64, shape=[4, 20]),
-        array_ops.placeholder(dtypes.int64, shape=[4, 22]),
-        array_ops.placeholder(dtypes.float64, shape=[4])
-    ]
-
-    datasets = [
-        dataset_ops.Dataset.from_tensor_slices(component_placeholder)
-        for component_placeholder in component_placeholders
-    ]
-    zipped = dataset_ops.Dataset.zip((datasets[0], (datasets[1], datasets[2])))
-
-    iterator = zipped.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([20], get_next[0].shape)
-    self.assertEqual([22], get_next[1][0].shape)
-    self.assertEqual([], get_next[1][1].shape)
-
-    with self.test_session() as sess:
-      equal_length_components = [
-          np.tile(np.array([[1], [2], [3], [4]]), 20),
-          np.tile(np.array([[12], [13], [14], [15]]), 22),
-          np.array([37.0, 38.0, 39.0, 40.0])
-      ]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, equal_length_components)})
-      for i in range(4):
-        result1, (result2, result3) = sess.run(get_next)
-        self.assertAllEqual(equal_length_components[0][i], result1)
-        self.assertAllEqual(equal_length_components[1][i], result2)
-        self.assertAllEqual(equal_length_components[2][i], result3)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
 class ZipDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 0eb5d4635d..b488357f22 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -15,7 +15,7 @@ py_library(
     name = "dataset_ops",
     srcs = [
         "counter.py",
-        "dataset_ops.py",
+        "get_single_element.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py
deleted file mode 100644
index fafd231061..0000000000
--- a/tensorflow/contrib/data/python/ops/dataset_ops.py
+++ /dev/null
@@ -1,691 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Python wrappers for Datasets and Iterators."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import enumerate_ops
-from tensorflow.contrib.data.python.ops import error_ops
-from tensorflow.contrib.data.python.ops import grouping
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import gen_io_ops
-from tensorflow.python.util import deprecation
-
-
-class Dataset(dataset_ops.Dataset):
-  """Represents a potentially large set of elements.
-
-  A `Dataset` can be used to represent an input pipeline as a
-  collection of elements (nested structures of tensors) and a "logical
-  plan" of transformations that act on those elements.
-  """
-
-  def __init__(self, dataset):
-    super(Dataset, self).__init__()
-    self._dataset = dataset
-
-  @deprecation.deprecated(None, "Use `ds._as_variant_tensor()`.")
-  def make_dataset_resource(self):
-    return self._as_variant_tensor()
-
-  def _as_variant_tensor(self):
-    return self._dataset._as_variant_tensor()  # pylint: disable=protected-access
-
-  @property
-  def output_classes(self):
-    return self._dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._dataset.output_types
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensors()`.")
-  def from_tensors(tensors):
-    """Creates a `Dataset` with a single element, comprising the given tensors.
-
-    Args:
-      tensors: A nested structure of tensors.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.TensorDataset(tensors))
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensor_slices()`.")
-  def from_tensor_slices(tensors):
-    """Creates a `Dataset` whose elements are slices of the given tensors.
-
-    Args:
-      tensors: A nested structure of tensors, each having the same size in the
-        0th dimension.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.TensorSliceDataset(tensors))
-
-  @staticmethod
-  @deprecation.deprecated(None,
-                          "Use `tf.data.Dataset.from_sparse_tensor_slices()`.")
-  def from_sparse_tensor_slices(sparse_tensor):
-    """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
-
-    Args:
-      sparse_tensor: A `tf.SparseTensor`.
-
-    Returns:
-      A `Dataset` of rank-(N-1) sparse tensors.
-    """
-    return Dataset(dataset_ops.SparseTensorSliceDataset(sparse_tensor))
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_generator()`.")
-  def from_generator(generator, output_types, output_shapes=None):
-    """Creates a `Dataset` whose elements are generated by `generator`.
-
-    The `generator` argument must be a callable object that returns
-    an object that support the `iter()` protocol (e.g. a generator function).
-    The elements generated by `generator` must be compatible with the given
-    `output_types` and (optional) `output_shapes` arguments.
-
-    For example:
-
-    ```python
-    import itertools
-
-    def gen():
-      for i in itertools.count(1):
-        yield (i, [1] * i)
-
-    ds = Dataset.from_generator(
-        gen, (tf.int64, tf.int64), (tf.TensorShape([]), tf.TensorShape([None])))
-    value = ds.make_one_shot_iterator().get_next()
-
-    sess.run(value)  # (1, array([1]))
-    sess.run(value)  # (2, array([1, 1]))
-    ```
-
-    Args:
-      generator: A callable object that takes no arguments and returns an
-        object that supports the `iter()` protocol.
-      output_types: A nested structure of `tf.DType` objects corresponding to
-        each component of an element yielded by `generator`.
-      output_shapes: (Optional.) A nested structure of `tf.TensorShape`
-        objects corresponding to each component of an element yielded by
-        `generator`.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.Dataset.from_generator(
-        generator, output_types, output_shapes))
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.range()`.")
-  def range(*args):
-    """Creates a `Dataset` of a step-separated range of values.
-
-    For example:
-
-    ```python
-    Dataset.range(5) == [0, 1, 2, 3, 4]
-    Dataset.range(2, 5) == [2, 3, 4]
-    Dataset.range(1, 5, 2) == [1, 3]
-    Dataset.range(1, 5, -2) == []
-    Dataset.range(5, 1) == []
-    Dataset.range(5, 1, -2) == [5, 3]
-    ```
-
-    Args:
-      *args: follow same semantics as python's xrange.
-        len(args) == 1 -> start = 0, stop = args[0], step = 1
-        len(args) == 2 -> start = args[0], stop = args[1], step = 1
-        len(args) == 3 -> start = args[0], stop = args[1, stop = args[2]
-
-    Returns:
-      A `RangeDataset`.
-
-    Raises:
-      ValueError: if len(args) == 0.
-    """
-    return Dataset(dataset_ops.RangeDataset(*args))
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.zip()`.")
-  def zip(datasets):
-    """Creates a `Dataset` by zipping together the given datasets.
-
-    This method has similar semantics to the built-in `zip()` function
-    in Python, with the main difference being that the `datasets`
-    argument can be an arbitrary nested structure of `Dataset` objects.
-    For example:
-
-    ```python
-    # NOTE: The following examples use `{ ... }` to represent the
-    # contents of a dataset.
-    a = { 1, 2, 3 }
-    b = { 4, 5, 6 }
-    c = { (7, 8), (9, 10), (11, 12) }
-    d = { 13, 14 }
-
-    # The nested structure of the `datasets` argument determines the
-    # structure of elements in the resulting dataset.
-    Dataset.zip((a, b)) == { (1, 4), (2, 5), (3, 6) }
-    Dataset.zip((b, a)) == { (4, 1), (5, 2), (6, 3) }
-
-    # The `datasets` argument may contain an arbitrary number of
-    # datasets.
-    Dataset.zip((a, b, c)) == { (1, 4, (7, 8)),
-                                (2, 5, (9, 10)),
-                                (3, 6, (11, 12)) }
-
-    # The number of elements in the resulting dataset is the same as
-    # the size of the smallest dataset in `datasets`.
-    Dataset.zip((a, d)) == { (1, 13), (2, 14) }
-    ```
-
-    Args:
-      datasets: A nested structure of datasets.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.ZipDataset(datasets))
-
-  def concatenate(self, dataset):
-    """Creates a `Dataset` by concatenating given dataset with this dataset.
-
-    ```python
-    # NOTE: The following examples use `{ ... }` to represent the
-    # contents of a dataset.
-    a = { 1, 2, 3 }
-    b = { 4, 5, 6, 7 }
-
-    # Input dataset and dataset to be concatenated should have same
-    # nested structures and output types.
-    # c = { (8, 9), (10, 11), (12, 13) }
-    # d = { 14.0, 15.0, 16.0 }
-    # a.concatenate(c) and a.concatenate(d) would result in error.
-
-    a.concatenate(b) == { 1, 2, 3, 4, 5, 6, 7 }
-    ```
-
-    Args:
-      dataset: `Dataset` to be concatenated.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.ConcatenateDataset(self._dataset, dataset))
-
-  def prefetch(self, buffer_size):
-    """Creates a `Dataset` that prefetches elements from this dataset.
-
-    Args:
-      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
-        maximum number elements that will be buffered when prefetching.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.PrefetchDataset(self._dataset, buffer_size))
-
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.list_files()`.")
-  def list_files(file_pattern):
-    """A dataset of all files matching a pattern.
-
-    Example:
-      If we had the following files on our filesystem:
-        - /path/to/dir/a.txt
-        - /path/to/dir/b.py
-        - /path/to/dir/c.py
-      If we pass "/path/to/dir/*.py" as the directory, the dataset would
-      produce:
-        - /path/to/dir/b.py
-        - /path/to/dir/c.py
-
-    Args:
-      file_pattern: A string or scalar string `tf.Tensor`, representing
-        the filename pattern that will be matched.
-
-    Returns:
-     A `Dataset` of strings corresponding to file names.
-    """
-    return Dataset.from_tensor_slices(gen_io_ops.matching_files(file_pattern))
-
-  def repeat(self, count=None):
-    """Repeats this dataset `count` times.
-
-    Args:
-      count: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
-        number of times the elements of this dataset should be repeated. The
-        default behavior (if `count` is `None` or `-1`) is for the elements to
-        be repeated indefinitely.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.RepeatDataset(self._dataset, count))
-
-  @deprecation.deprecated(
-      None, "Use `ds.apply(tf.contrib.data.enumerate_dataset())`.")
-  def enumerate(self, start=0):
-    """Deprecated: Use `Dataset.apply(tf.contrib.data.enumerate_dataset(..)`."""
-
-    return self.apply(enumerate_ops.enumerate_dataset(start))
-
-  def shuffle(self, buffer_size, seed=None):
-    """Randomly shuffles the elements of this dataset.
-
-    Args:
-      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
-        number of elements from this dataset from which the new
-        dataset will sample.
-      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
-        random seed that will be used to create the distribution. See
-        @{tf.set_random_seed} for behavior.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.ShuffleDataset(self._dataset, buffer_size, seed))
-
-  def cache(self, filename=""):
-    """Caches the elements in this dataset.
-
-    Args:
-      filename: A `tf.string` scalar `tf.Tensor`, representing the name of a
-        directory on the filesystem to use for caching tensors in this Dataset.
-        If a filename is not provided, the dataset will be cached in memory.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.CacheDataset(self._dataset, filename))
-
-  def take(self, count):
-    """Creates a `Dataset` with at most `count` elements from this dataset.
-
-    Args:
-      count: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        elements of this dataset that should be taken to form the new dataset.
-        If `count` is -1, or if `count` is greater than the size of this
-        dataset, the new dataset will contain all elements of this dataset.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.TakeDataset(self._dataset, count))
-
-  def skip(self, count):
-    """Creates a `Dataset` that skips `count` elements from this dataset.
-
-    Args:
-      count: A `tf.int64` scalar `tf.Tensor`, representing the number
-        of elements of this dataset that should be skipped to form the
-        new dataset.  If `count` is greater than the size of this
-        dataset, the new dataset will contain no elements.  If `count`
-        is -1, skips the entire dataset.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.SkipDataset(self._dataset, count))
-
-  def shard(self, num_shards, index):
-    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
-
-    This dataset operator is very useful when running distributed training, as
-    it allows each worker to read a unique subset.
-
-    When reading a single input file, you can skip elements as follows:
-
-    ```python
-    d = tf.data.TFRecordDataset(FLAGS.input_file)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Important caveats:
-
-    - Be sure to shard before you use any randomizing operator (such as
-      shuffle).
-    - Generally it is best if the shard operator is used early in the dataset
-      pipeline. For example, when reading from a set of TFRecord files, shard
-      before converting the dataset to input samples. This avoids reading every
-      file on every worker. The following is an example of an efficient
-      sharding strategy within a complete pipeline:
-
-    ```python
-    d = tf.data.Dataset.list_files(FLAGS.pattern)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.interleave(tf.data.TFRecordDataset,
-                     cycle_length=FLAGS.num_readers, block_length=1)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Args:
-      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        shards operating in parallel.
-      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
-
-    Returns:
-      A `Dataset`.
-
-    Raises:
-      ValueError: if `num_shards` or `index` are illegal values. Note: error
-        checking is done on a best-effort basis, and aren't guaranteed to be
-        caught upon dataset creation. (e.g. providing in a placeholder tensor
-        bypasses the early checking, and will instead result in an error during
-        a session.run call.)
-    """
-    return Dataset(self._dataset.shard(num_shards, index))
-
-  @deprecation.deprecated(
-      None, "Use `ds.apply(tf.contrib.data.ignore_errors())`.")
-  def ignore_errors(self):
-    """Deprecated: Use `Dataset.apply(tf.contrib.data.ignore_errors())`."""
-
-    return self.apply(error_ops.ignore_errors())
-
-  def batch(self, batch_size):
-    """Combines consecutive elements of this dataset into batches.
-
-    Args:
-      batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        consecutive elements of this dataset to combine in a single batch.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.BatchDataset(self._dataset, batch_size))
-
-  def padded_batch(self, batch_size, padded_shapes, padding_values=None):
-    """Combines consecutive elements of this dataset into padded batches.
-
-    Like `Dataset.dense_to_sparse_batch()`, this method combines
-    multiple consecutive elements of this dataset, which might have
-    different shapes, into a single element. The tensors in the
-    resulting element have an additional outer dimension, and are
-    padded to the respective shape in `padded_shapes`.
-
-    Args:
-      batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        consecutive elements of this dataset to combine in a single batch.
-      padded_shapes: A nested structure of `tf.TensorShape` or
-        `tf.int64` vector tensor-like objects representing the shape
-        to which the respective component of each input element should
-        be padded prior to batching. Any unknown dimensions
-        (e.g. `tf.Dimension(None)` in a `tf.TensorShape` or `-1` in a
-        tensor-like object) will be padded to the maximum size of that
-        dimension in each batch.
-      padding_values: (Optional.) A nested structure of scalar-shaped
-        `tf.Tensor`, representing the padding values to use for the
-        respective components.  Defaults are `0` for numeric types and
-        the empty string for string types.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(
-        dataset_ops.PaddedBatchDataset(self._dataset, batch_size, padded_shapes,
-                                       padding_values))
-
-  @deprecation.deprecated(
-      None, "Use `ds.apply(tf.contrib.data.dense_to_sparse_batch())`.")
-  def dense_to_sparse_batch(self, batch_size, row_shape):
-    """Use: `Dataset.apply(tf.contrib.data.dense_to_sparse_batch(...))`."""
-
-    return self.apply(batching.dense_to_sparse_batch(batch_size, row_shape))
-
-  @deprecation.deprecated(
-      None, "Use `ds.apply(tf.contrib.data.group_by_window())`.")
-  def group_by_window(self, key_func, reduce_func, window_size):
-    """Deprecated: Use `Dataset.apply(tf.contrib.data.group_by_window(...))`."""
-
-    return self.apply(
-        grouping.group_by_window(key_func, reduce_func, window_size))
-
-  @deprecation.deprecated_args(
-      None,
-      "Replace `num_threads=T` with `num_parallel_calls=T`. Replace "
-      "`output_buffer_size=N` with `ds.prefetch(N)` on the returned dataset.",
-      "num_threads", "output_buffer_size")
-  def map(self,
-          map_func,
-          num_threads=None,
-          output_buffer_size=None,
-          num_parallel_calls=None):
-    """Maps `map_func` across this datset.
-
-    Args:
-      map_func: A function mapping a nested structure of tensors (having
-        shapes and types defined by `self.output_shapes` and
-       `self.output_types`) to another nested structure of tensors.
-      num_threads: (Optional.) Deprecated, use `num_parallel_calls` instead.
-      output_buffer_size: (Optional.) A `tf.int64` scalar `tf.Tensor`,
-        representing the maximum number of processed elements that will be
-        buffered.
-      num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
-        representing the number elements to process in parallel. If not
-        specified, elements will be processed sequentially.
-
-    Returns:
-      A `Dataset`.
-    """
-    if num_threads is None and num_parallel_calls is None:
-      ret = Dataset(dataset_ops.MapDataset(self._dataset, map_func))
-    else:
-      if num_threads is None:
-        ret = Dataset(
-            dataset_ops.ParallelMapDataset(self._dataset, map_func,
-                                           num_parallel_calls))
-      else:
-        ret = Dataset(
-            dataset_ops.ParallelMapDataset(self._dataset, map_func,
-                                           num_threads))
-    if output_buffer_size is not None:
-      ret = ret.prefetch(output_buffer_size)
-    return ret
-
-  def flat_map(self, map_func):
-    """Maps `map_func` across this dataset and flattens the result.
-
-    Args:
-      map_func: A function mapping a nested structure of tensors (having shapes
-        and types defined by `self.output_shapes` and `self.output_types`) to a
-        `Dataset`.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.FlatMapDataset(self._dataset, map_func))
-
-  def interleave(self, map_func, cycle_length, block_length=1):
-    """Maps `map_func` across this dataset, and interleaves the results.
-
-    For example, you can use `Dataset.interleave()` to process many input files
-    concurrently:
-
-    ```python
-    # Preprocess 4 files concurrently, and interleave blocks of 16 records from
-    # each file.
-    filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ...]
-    dataset = (Dataset.from_tensor_slices(filenames)
-               .interleave(lambda x:
-                   TextLineDataset(x).map(parse_fn, num_parallel_calls=1),
-                   cycle_length=4, block_length=16))
-    ```
-
-    The `cycle_length` and `block_length` arguments control the order in which
-    elements are produced. `cycle_length` controls the number of input elements
-    that are processed concurrently. If you set `cycle_length` to 1, this
-    transformation will handle one input element at a time, and will produce
-    identical results = to @{tf.data.Dataset.flat_map}. In general,
-    this transformation will apply `map_func` to `cycle_length` input elements,
-    open iterators on the returned `Dataset` objects, and cycle through them
-    producing `block_length` consecutive elements from each iterator, and
-    consuming the next input element each time it reaches the end of an
-    iterator.
-
-    For example:
-
-    ```python
-    # NOTE: The following examples use `{ ... }` to represent the
-    # contents of a dataset.
-    a = { 1, 2, 3, 4, 5 }
-
-    # NOTE: New lines indicate "block" boundaries.
-    a.interleave(lambda x: Dataset.from_tensors(x).repeat(6),
-                 cycle_length=2, block_length=4) == {
-        1, 1, 1, 1,
-        2, 2, 2, 2,
-        1, 1,
-        2, 2,
-        3, 3, 3, 3,
-        4, 4, 4, 4,
-        3, 3,
-        4, 4,
-        5, 5, 5, 5,
-        5, 5,
-    }
-    ```
-
-    NOTE: The order of elements yielded by this transformation is
-    deterministic, as long as `map_func` is a pure function. If
-    `map_func` contains any stateful operations, the order in which
-    that state is accessed is undefined.
-
-    Args:
-      map_func: A function mapping a nested structure of tensors (having shapes
-        and types defined by `self.output_shapes` and `self.output_types`) to a
-        `Dataset`.
-      cycle_length: The number of elements from this dataset that will be
-        processed concurrently.
-      block_length: The number of consecutive elements to produce from each
-        input element before cycling to another input element.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(
-        dataset_ops.InterleaveDataset(self._dataset, map_func, cycle_length,
-                                      block_length))
-
-  @deprecation.deprecated(None, "Use `ds.apply(tf.contrib.data.unbatch())`.")
-  def unbatch(self):
-    """Deprecated: Use `Dataset.apply(tf.contrib.data.unbatch()`."""
-
-    return self.apply(batching.unbatch())
-
-  def filter(self, predicate):
-    """Filters this dataset according to `predicate`.
-
-    Args:
-      predicate: A function mapping a nested structure of tensors (having shapes
-        and types defined by `self.output_shapes` and `self.output_types`) to a
-        scalar `tf.bool` tensor.
-
-    Returns:
-      A `Dataset`.
-    """
-    return Dataset(dataset_ops.FilterDataset(self._dataset, predicate))
-
-  def apply(self, transformation_func):
-    """Apply a transformation function to this dataset.
-
-    `apply` enables chaining of custom `Dataset` transformations, which are
-    represented as functions that take one `Dataset` argument and return a
-    transformed `Dataset`.
-
-    For example:
-
-    ```
-    dataset = (dataset.map(lambda x: x ** 2)
-               .(group_by_window(key_func, reduce_func, window_size))
-               .map(lambda x: x ** 3))
-    ```
-
-    Args:
-      transformation_func: A function that takes one `Dataset` argument and
-        returns a `Dataset`.
-
-    Returns:
-      The `Dataset` returned by applying `transformation_func` to this dataset.
-    """
-    dataset = transformation_func(self)
-    if not isinstance(dataset, dataset_ops.Dataset):
-      raise TypeError("`transformation_func` must return a Dataset.")
-    return Dataset(dataset)
-
-
-def get_single_element(dataset):
-  """Returns the single element in `dataset` as a nested structure of tensors.
-
-  This function enables you to use a @{tf.data.Dataset} in a stateless
-  "tensor-in tensor-out" expression, without creating a @{tf.data.Iterator}.
-  This can be useful when your preprocessing transformations are expressed
-  as a `Dataset`, and you want to use the transformation at serving time.
-  For example:
-
-  ```python
-  input_batch = tf.placeholder(tf.string, shape=[BATCH_SIZE])
-
-  def preprocessing_fn(input_str):
-    # ...
-    return image, label
-
-  dataset = (tf.data.Dataset.from_tensor_slices(input_batch)
-             .map(preprocessing_fn, num_parallel_calls=BATCH_SIZE)
-             .batch(BATCH_SIZE))
-
-  image_batch, label_batch = tf.contrib.data.get_single_element(dataset)
-  ```
-
-  Args:
-    dataset: A @{tf.data.Dataset} object containing a single element.
-
-  Returns:
-    A nested structure of @{tf.Tensor} objects, corresponding to the single
-    element of `dataset`.
-
-  Raises:
-    TypeError: if `dataset` is not a `tf.data.Dataset` object.
-    InvalidArgumentError (at runtime): if `dataset` does not contain exactly
-      one element.
-  """
-  if not isinstance(dataset, dataset_ops.Dataset):
-    raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
-  return nest.pack_sequence_as(
-      dataset.output_types,
-      gen_dataset_ops.dataset_to_single_element(
-          dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          output_types=nest.flatten(dataset.output_types),
-          output_shapes=nest.flatten(dataset.output_shapes)))
diff --git a/tensorflow/contrib/data/python/ops/get_single_element.py b/tensorflow/contrib/data/python/ops/get_single_element.py
new file mode 100644
index 0000000000..a817b45b71
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/get_single_element.py
@@ -0,0 +1,67 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for Datasets and Iterators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.ops import gen_dataset_ops
+
+
+def get_single_element(dataset):
+  """Returns the single element in `dataset` as a nested structure of tensors.
+
+  This function enables you to use a @{tf.data.Dataset} in a stateless
+  "tensor-in tensor-out" expression, without creating a @{tf.data.Iterator}.
+  This can be useful when your preprocessing transformations are expressed
+  as a `Dataset`, and you want to use the transformation at serving time.
+  For example:
+
+  ```python
+  input_batch = tf.placeholder(tf.string, shape=[BATCH_SIZE])
+
+  def preprocessing_fn(input_str):
+    # ...
+    return image, label
+
+  dataset = (tf.data.Dataset.from_tensor_slices(input_batch)
+             .map(preprocessing_fn, num_parallel_calls=BATCH_SIZE)
+             .batch(BATCH_SIZE))
+
+  image_batch, label_batch = tf.contrib.data.get_single_element(dataset)
+  ```
+
+  Args:
+    dataset: A @{tf.data.Dataset} object containing a single element.
+
+  Returns:
+    A nested structure of @{tf.Tensor} objects, corresponding to the single
+    element of `dataset`.
+
+  Raises:
+    TypeError: if `dataset` is not a `tf.data.Dataset` object.
+    InvalidArgumentError (at runtime): if `dataset` does not contain exactly
+      one element.
+  """
+  if not isinstance(dataset, dataset_ops.Dataset):
+    raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
+  return nest.pack_sequence_as(
+      dataset.output_types,
+      gen_dataset_ops.dataset_to_single_element(
+          dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          output_types=nest.flatten(dataset.output_types),
+          output_shapes=nest.flatten(dataset.output_shapes)))
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders_test.py b/tensorflow/contrib/estimator/python/estimator/extenders_test.py
index 5f4a3cc902..ad1a8ef152 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.estimator.python.estimator import extenders
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.estimator.canned import linear
 from tensorflow.python.feature_column import feature_column as fc
diff --git a/tensorflow/contrib/kfac/python/ops/op_queue.py b/tensorflow/contrib/kfac/python/ops/op_queue.py
index 831870fca4..b6d9d37a31 100644
--- a/tensorflow/contrib/kfac/python/ops/op_queue.py
+++ b/tensorflow/contrib/kfac/python/ops/op_queue.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import ops as tf_ops
 
 
-- 
GitLab


From 6c29ff7f62dae6e669fe9b7b59beec3fc39cbde8 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Fri, 9 Feb 2018 15:08:24 -0800
Subject: [PATCH 914/966] Recursively creating directories in
 CreateSummaryFileWriter

PiperOrigin-RevId: 185199219
---
 tensorflow/contrib/summary/summary_ops_test.py           | 7 -------
 tensorflow/contrib/tensorboard/db/summary_file_writer.cc | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index dfaa4182bb..bb7215f879 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -29,7 +29,6 @@ from tensorflow.core.framework import types_pb2
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
@@ -59,12 +58,6 @@ _NUMPY_NUMERIC_TYPES = {
 
 class TargetTest(test_util.TensorFlowTestCase):
 
-  def testInvalidDirectory(self):
-    logdir = '/tmp/apath/that/doesnt/exist'
-    self.assertFalse(gfile.Exists(logdir))
-    with self.assertRaises(errors.NotFoundError):
-      summary_ops.create_file_writer(logdir, max_queue=0, name='t0')
-
   def testShouldRecordSummary(self):
     self.assertFalse(summary_ops.should_record_summaries())
     with summary_ops.always_record_summaries():
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
index d891e86e53..3868b1172f 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
@@ -42,7 +42,7 @@ class SummaryFileWriter : public SummaryWriterInterface {
       if (is_dir.code() != tensorflow::error::NOT_FOUND) {
         return is_dir;
       }
-      TF_RETURN_IF_ERROR(env_->CreateDir(logdir));
+      TF_RETURN_IF_ERROR(env_->RecursivelyCreateDir(logdir));
     }
     mutex_lock ml(mu_);
     events_writer_ =
-- 
GitLab


From 40ec7202b63c32f2f5ed57116096e33677c4b5df Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 9 Feb 2018 15:26:13 -0800
Subject: [PATCH 915/966] [XLA] Use a real priority queue in list scheduling

PiperOrigin-RevId: 185201882
---
 .../compiler/xla/service/hlo_scheduling.cc    | 31 ++++++++-----------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc
index 2594c29efd..5f5a930dad 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 
+#include <queue>
 #include <utility>
 #include <vector>
 
@@ -217,32 +218,26 @@ class ListScheduler {
       }
     }
 
-    std::list<ReadyListEntry> ready_list;
+    auto priority_comparator = [this](const ReadyListEntry& lhs,
+                                      const ReadyListEntry& rhs) {
+      return GetPriority(lhs) < GetPriority(rhs);
+    };
+    std::priority_queue<ReadyListEntry, std::vector<ReadyListEntry>,
+                        decltype(priority_comparator)>
+        ready_queue(priority_comparator);
     for (auto* instruction : computation_.instructions()) {
       // Instruction with no operands or control predecessors will
       // not be in the map.
       if (unscheduled_pred_count.count(instruction) == 0) {
-        ready_list.push_back(MakeReadyListEntry(instruction));
+        ready_queue.emplace(MakeReadyListEntry(instruction));
       }
     }
 
-    while (!ready_list.empty()) {
-      // Select the highest priority HLO instruction from the ready list.
-      auto best_it = ready_list.begin();
-      Priority best_priority = GetPriority(*best_it);
-      for (auto ready_it = std::next(ready_list.begin());
-           ready_it != ready_list.end(); ++ready_it) {
-        Priority priority = GetPriority(*ready_it);
-        if (priority > best_priority) {
-          best_it = ready_it;
-          best_priority = priority;
-        }
-      }
-
+    while (!ready_queue.empty()) {
       // Remove the selected instruction from the ready list and add it to the
       // schedule.
-      const HloInstruction* best = best_it->instruction;
-      ready_list.erase(best_it);
+      const HloInstruction* best = ready_queue.top().instruction;
+      ready_queue.pop();
       schedule.push_back(best);
       scheduled_instructions_.insert(best);
 
@@ -257,7 +252,7 @@ class ListScheduler {
         int64 pred_count = --unscheduled_pred_count.at(inst);
         CHECK_GE(pred_count, 0);
         if (pred_count == 0) {
-          ready_list.push_back(MakeReadyListEntry(inst));
+          ready_queue.emplace(MakeReadyListEntry(inst));
         }
       };
       // TODO(b/34466113): Replace this and above with successors() or
-- 
GitLab


From 2adb6bbb1b4d31bba7113a4213bf5e7f0e154c78 Mon Sep 17 00:00:00 2001
From: Andrew Selle <aselle@google.com>
Date: Fri, 9 Feb 2018 15:45:00 -0800
Subject: [PATCH 916/966] Add delegate API to tflite.

- Context gets GetNodes, num_nodes and PartitionNodesIntoSubgraphs.
- TfLiteDelegate provides one function that need be implemented
- Delegates choose nodes and those nodes are all compacted into
  a new macro kernel.

PiperOrigin-RevId: 185204338
---
 tensorflow/contrib/lite/BUILD               |   2 +
 tensorflow/contrib/lite/context.h           | 103 +++++++++++---
 tensorflow/contrib/lite/interpreter.cc      | 111 +++++++++++++++
 tensorflow/contrib/lite/interpreter.h       |  49 +++++++
 tensorflow/contrib/lite/interpreter_test.cc | 146 +++++++++++++++++++-
 5 files changed, 387 insertions(+), 24 deletions(-)

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index d303154cc2..30664832fb 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -168,6 +168,8 @@ cc_test(
     deps = [
         ":framework",
         ":string_util",
+        "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index d6dfc20ae8..b0c4d3431f 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -38,6 +38,9 @@ extern "C" {
 
 typedef enum { kTfLiteOk = 0, kTfLiteError = 1 } TfLiteStatus;
 
+// Forward declare so GetNode can use this is in Context.
+typedef struct _TfLiteRegistration TfLiteRegistration;
+
 #define kOptionalTensor (-1)
 
 // Fixed size list of integers. Used for dimensions and inputs/outputs tensor
@@ -205,9 +208,56 @@ void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
 // Resize the allocated data of a (dynamic) tensor.
 void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor);
 
+// A structure representing an instance of a node.
+// This structure only exhibits the inputs, outputs and user defined data, not
+// other features like the type.
+typedef struct {
+  // Inputs to this node expressed as indices into the simulator's tensors.
+  TfLiteIntArray* inputs;
+
+  // Outputs to this node expressed as indices into the simulator's tensors.
+  TfLiteIntArray* outputs;
+
+  // Temporary tensors uses during the computations. This usually contains no
+  // tensors, but ops are allowed to change that if they need scratch space of
+  // any sort.
+  TfLiteIntArray* temporaries;
+
+  // Opaque data provided by the node implementer through `Registration.init`.
+  void* user_data;
+
+  // Opaque data provided to the node if the node is a builtin. This is usually
+  // a structure defined in builtin_op_data.h
+  void* builtin_data;
+
+  // Custom initial data. This is the opaque data provided in the flatbuffer.
+  // WARNING: This is an experimental interface that is subject to change.
+  const void* custom_initial_data;
+  int custom_initial_data_size;
+} TfLiteNode;
+
 typedef struct TfLiteContext {
   // Number of tensors in the context.
   int tensors_size;
+
+  // The execution plan contains a list of the node indices in execution
+  // order. execution_plan->size is the current number of nodes. And,
+  // execution_plan->data[0] is the first node that needs to be run.
+  // TfLiteDelegates can traverse the current execution plan by iterating
+  // through each member of this array and using GetNodeAndRegistration() to
+  // access details about a node. i.e.
+  // TfLiteIntArray* execution_plan;
+  // TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &execution_plan));
+  // for (int exec_index = 0; exec_index < execution_plan->size; exec_index++) {
+  //    int node_index = execution_plan->data[exec_index];
+  //    TfLiteNode* node;
+  //    TfLiteRegistration* reg;
+  //    context->GetNodeAndRegistration(context, node_index, &node, &reg);
+  // }
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus (*GetExecutionPlan)(struct TfLiteContext* context,
+                                   TfLiteIntArray** execution_plan);
+
   // An tensor of tensors in the interpreter context (of length `tensors_size`)
   TfLiteTensor* tensors;
 
@@ -227,34 +277,23 @@ typedef struct TfLiteContext {
   TfLiteStatus (*AddTensors)(struct TfLiteContext*, int tensors_to_add,
                              int* first_new_tensor_index);
 
+  // Get a Tensor node by node_index.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus (*GetNodeAndRegistration)(struct TfLiteContext*, int node_index,
+                                         TfLiteNode** node,
+                                         TfLiteRegistration** registration);
+
+  // Replace ops with delegate.
+  TfLiteStatus (*ReplaceSubgraphsWithDelegateKernels)(
+      struct TfLiteContext*, TfLiteRegistration registration,
+      const TfLiteIntArray* nodes_to_replace);
+
   // TODO(ahentz): we should create a more general mechanism for this sort of
   // library-global objects.
   void* gemm_context;
 } TfLiteContext;
 
-// A structure representing an instance of a node.
-// This structure only exhibits the inputs, outputs and user defined data, not
-// other features like the type.
-typedef struct {
-  // Inputs to this node expressed as indices into the simulator's tensors.
-  TfLiteIntArray* inputs;
-
-  // Outputs to this node expressed as indices into the simulator's tensors.
-  TfLiteIntArray* outputs;
-
-  // Temporary tensors uses during the computations. This usually contains no
-  // tensors, but ops are allowed to change that if they need scratch space of
-  // any sort.
-  TfLiteIntArray* temporaries;
-
-  // Opaque data provided by the node implementer through `Registration.init`.
-  void* user_data;
-
-  // Opaque data provided to the node if the node is a builtin.
-  void* builtin_data;
-} TfLiteNode;
-
-typedef struct {
+typedef struct _TfLiteRegistration {
   // Initializes the op from serialized data.
   // If a built-in op:
   //   `buffer` is the op's params data (TfLiteLSTMParams*).
@@ -291,8 +330,26 @@ typedef struct {
   // NN API. Note, it is the responsibility of the registration binder to
   // set this properly.
   int32_t builtin_code;
+
+  // Custom op name. If the op is a builtin, this will be null.
+  // WARNING: This is an experimental interface that is subject to change.
+  const char* custom_name;
 } TfLiteRegistration;
 
+// WARNING: This is an experimental interface that is subject to change.
+typedef struct {
+  // Data that delegate needs to identify itself. This data is owned by the
+  // delegate. The delegate is owned in the user code, so the delegate is
+  // responsible for doing this when it is destroyed.
+  void* data_;
+  // Invoked by ModifyGraphWithDelegate. This prepare is called, giving the
+  // delegate a view of the current graph through TfLiteContext*. It typically
+  // will look at the nodes and call ReplaceSubgraphsWithDelegateKernels()
+  // to ask the TensorFlow lite runtime to create macro-nodes to represent
+  // delegated subgraphs of the original graph.
+  TfLiteStatus (*Prepare)(TfLiteContext* context, void* data);
+} TfLiteDelegate;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 5aa0cbafd6..6dea4e5916 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -77,6 +77,12 @@ Interpreter::Interpreter(ErrorReporter* error_reporter)
   context_.tensors = nullptr;
   context_.tensors_size = 0;
   context_.gemm_context = nullptr;
+
+  // Invalid to call these these except from TfLiteDelegate
+  context_.GetNodeAndRegistration = nullptr;
+  context_.ReplaceSubgraphsWithDelegateKernels = nullptr;
+  context_.GetExecutionPlan = nullptr;
+
   // Reserve some space for the tensors to avoid excessive resizing.
   tensors_.reserve(kSlotsToReserve);
   nodes_and_registration_.reserve(kSlotsToReserve);
@@ -100,6 +106,78 @@ Interpreter::~Interpreter() {
   }
 }
 
+TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
+    TfLiteContext* context, TfLiteRegistration registration,
+    const TfLiteIntArray* nodes_to_replace) {
+  return static_cast<Interpreter*>(context->impl_)
+      ->ReplaceSubgraphsWithDelegateKernels(registration, nodes_to_replace);
+}
+
+TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
+    TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace) {
+  // Analyze the graph to find all independent subgraphs that are either
+  // fully not-this-delegate or this-delegate computation.
+  InterpreterInfo info(this);
+  std::vector<Subgraph> subgraphs;
+  PartitionGraphIntoIndependentSubgraphs(&info, nodes_to_replace, &subgraphs);
+
+  execution_plan_.clear();
+  for (auto& subgraph : subgraphs) {
+    // Turn subgraph.nodes into a TfLiteIntArray compatible data structure.
+    // TODO(aselle): Avoid this copy by constructing subgraph.nodes that way
+    // in the first place
+    subgraph.nodes.insert(subgraph.nodes.begin(),
+                          static_cast<int>(subgraph.nodes.size()));
+    // Subgraphs calimed by the delegate should have a "macro" op created, the
+    // other subgraphs (kTfNonPartition) just have their nodes added back to
+    // the execution plan.
+    switch (subgraph.type) {
+      case Subgraph::kTfNonPartition:
+        for (auto it = subgraph.nodes.begin() + 1; it != subgraph.nodes.end();
+             ++it) {
+          execution_plan_.push_back(*it);
+        }
+        break;
+      case Subgraph::kTfPartition: {
+        void* builtin_data = nullptr;
+        int node_index;
+        // Create a node that represents computation of this subgraph.
+        AddNodeWithParameters(
+            subgraph.input_tensors, subgraph.output_tensors,
+            reinterpret_cast<const char*>(subgraph.nodes.data()),
+            subgraph.nodes.size() * sizeof(subgraph.nodes[0]), builtin_data,
+            &registration, &node_index);
+      } break;
+      case Subgraph::kTfUnexplored:
+        return kTfLiteError;
+        break;
+    }
+  }
+  return kTfLiteOk;
+}
+
+// Gets an TfLiteIntArray* representing the execution plan. The interpreter owns
+// this memory and it is only guaranteed to exist during the invocation of the
+// delegate prepare.
+TfLiteStatus Interpreter::GetExecutionPlan(TfLiteIntArray** execution_plan) {
+  // TODO(aselle): Do not make a copy here
+  plan_cache_.reset(TfLiteIntArrayCreate(execution_plan_.size()));
+  *execution_plan = plan_cache_.get();
+  static_assert(sizeof(plan_cache_->data[0]) == sizeof(execution_plan_[0]),
+                "TfLiteIntArray and execution_plan do not contain same type.");
+  memcpy(plan_cache_->data, execution_plan_.data(),
+         sizeof(plan_cache_->data[0]));
+  return kTfLiteOk;
+}
+
+// WARNING: This is an experimental interface that is subject to change.
+// Entry point for C node plugin API to get the execution plan
+TfLiteStatus Interpreter::GetExecutionPlan(struct TfLiteContext* context,
+                                           TfLiteIntArray** execution_plan) {
+  return static_cast<Interpreter*>(context->impl_)
+      ->GetExecutionPlan(execution_plan);
+}
+
 TfLiteStatus Interpreter::SetInputs(std::vector<int> inputs) {
   TF_LITE_ENSURE_OK(&context_,
                     CheckTensorIndices("inputs", inputs.data(), inputs.size()));
@@ -200,6 +278,7 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
   int new_node_index = nodes_and_registration_.size();
   if (node_index) *node_index = new_node_index;
   nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
+
   auto& node_and_reg = nodes_and_registration_.back();
   TfLiteNode& node = node_and_reg.first;
   if (node.inputs) TfLiteIntArrayFree(node.inputs);
@@ -388,6 +467,22 @@ TfLiteStatus Interpreter::AddTensors(TfLiteContext* context, int tensors_to_add,
       ->AddTensors(tensors_to_add, first_new_tensor_index);
 }
 
+TfLiteStatus Interpreter::GetNodeAndRegistration(
+    int node_index, TfLiteNode** node, TfLiteRegistration** registration) {
+  TF_LITE_ENSURE(&context_, node_index < nodes_size() && node_index >= 0);
+  TF_LITE_ENSURE(&context_, node != nullptr && registration != nullptr);
+  *node = &nodes_and_registration_[node_index].first;
+  *registration = &nodes_and_registration_[node_index].second;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::GetNodeAndRegistration(
+    struct TfLiteContext* context, int node_index, TfLiteNode** node,
+    TfLiteRegistration** registration) {
+  return static_cast<Interpreter*>(context->impl_)
+      ->GetNodeAndRegistration(node_index, node, registration);
+}
+
 TfLiteStatus Interpreter::SetTensorParametersReadOnly(
     int tensor_index, TfLiteType type, const char* name,
     const std::vector<int>& dims, TfLiteQuantizationParams quantization,
@@ -498,4 +593,20 @@ void Interpreter::SetNumThreads(int num_threads) {
   tflite::gemm_support::SetMaxNumThreads(&context_, num_threads);
 }
 
+TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
+  // TODO(aselle): Consider if it is worth storing pointers to delegates.
+  // Setup additional context interface
+  context_.GetNodeAndRegistration = GetNodeAndRegistration;
+  context_.ReplaceSubgraphsWithDelegateKernels =
+      ReplaceSubgraphsWithDelegateKernels;
+  context_.GetExecutionPlan = GetExecutionPlan;
+
+  TfLiteStatus status = delegate->Prepare(&context_, delegate->data_);
+  // Remove additional context info.
+  context_.GetNodeAndRegistration = nullptr;
+  context_.ReplaceSubgraphsWithDelegateKernels = nullptr;
+  context_.GetExecutionPlan = nullptr;
+  return status;
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 3b077c7a35..bab56a9d72 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -80,6 +80,12 @@ class NNAPIDelegate;
 // foo.Invoke();
 //
 
+struct TfLiteIntArrayDeleter {
+  void operator()(TfLiteIntArray* a) {
+    if (a) TfLiteIntArrayFree(a);
+  }
+};
+
 class Interpreter {
  public:
   // Instantiate an interpreter. All errors associated with reading and
@@ -247,6 +253,11 @@ class Interpreter {
   // Set the number of threads available to the interpreter.
   void SetNumThreads(int num_threads);
 
+  // Allow a delegate to look at the graph and modify the graph to handle
+  // parts of the graph themselves. After this is called, the graph may
+  // contain new nodes that replace 1 more nodes.
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate);
+
  private:
   // Give 'op_reg' a chance to initialize itself using the contents of
   // 'buffer'.
@@ -325,6 +336,40 @@ class Interpreter {
   static TfLiteStatus AddTensors(TfLiteContext* context, int tensors_to_add,
                                  int* first_new_tensor_index);
 
+  // WARNING: This is an experimental API and subject to change.
+  // Entry point for C API ReplaceSubgraphsWithDelegateKernels
+  static TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
+      TfLiteContext* context, TfLiteRegistration registration,
+      const TfLiteIntArray* nodes_to_replace);
+
+  // Update the execution graph to replace some of the nodes with stub
+  // nodes. Specifically any node index that has `nodes[index]==1` will be
+  // slated for replacement with a delegate kernel specified by registration.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
+      TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Gets the internal pointer to a TensorFlow lite node by node_index.
+  TfLiteStatus GetNodeAndRegistration(int node_index, TfLiteNode** node,
+                                      TfLiteRegistration** registration);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Entry point for C node plugin API to get a node by index.
+  static TfLiteStatus GetNodeAndRegistration(struct TfLiteContext*,
+                                             int node_index, TfLiteNode** node,
+                                             TfLiteRegistration** registration);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Gets an TfLiteIntArray* representing the execution plan. The caller owns
+  // this memory and must free it with TfLiteIntArrayFree().
+  TfLiteStatus GetExecutionPlan(TfLiteIntArray** execution_plan);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Entry point for C node plugin API to get the execution plan
+  static TfLiteStatus GetExecutionPlan(struct TfLiteContext* context,
+                                       TfLiteIntArray** execution_plan);
+
   // A pure C data structure used to communicate with the pure C plugin
   // interface. To avoid copying tensor metadata, this is also the definitive
   // structure to store tensors.
@@ -372,6 +417,10 @@ class Interpreter {
   // subset of the node indices.
   std::vector<int> execution_plan_;
 
+  // In the future, we'd like a TfLiteIntArray compatible representation.
+  // TODO(aselle): replace execution_plan_ with this.
+  std::unique_ptr<TfLiteIntArray, TfLiteIntArrayDeleter> plan_cache_;
+
   // Whether to delegate to NN API
   std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
 
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index cfda19d72c..4b309748f7 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -16,9 +16,10 @@ limitations under the License.
 #include "tensorflow/contrib/lite/interpreter.h"
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/string_util.h"
 #include "tensorflow/contrib/lite/testing/util.h"
-
 namespace tflite {
 namespace {
 
@@ -687,6 +688,149 @@ TEST_F(TestExecutionPlan, NullExecutionPlan) {
   ASSERT_EQ(run_order_, std::vector<int>());
 }
 
+// Build a kernel registration for an op that copies its one input
+// to an output
+TfLiteRegistration AddOpRegistration() {
+  TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+  reg.custom_name = "my_add";
+  reg.builtin_code = tflite::BuiltinOperator_CUSTOM;
+
+  reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+    // Set output size to input size
+    TfLiteTensor* tensor0 = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* tensor1 = &context->tensors[node->inputs->data[1]];
+    TfLiteTensor* tensor2 = &context->tensors[node->outputs->data[0]];
+    TfLiteIntArray* newSize = TfLiteIntArrayCopy(tensor0->dims);
+    TfLiteIntArray* newSizeOther = TfLiteIntArrayCopy(tensor1->dims);
+    TF_LITE_ENSURE_EQ(context, newSize->size, newSizeOther->size);
+    TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, tensor2, newSize));
+    return kTfLiteOk;
+  };
+
+  reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+    // Copy input data to output data.
+    TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* a1 = &context->tensors[node->inputs->data[1]];
+    TfLiteTensor* out = &context->tensors[node->outputs->data[0]];
+    int num = a0->dims->data[0];
+    for (int i = 0; i < num; i++) {
+      out->data.f[i] = a0->data.f[i] + a1->data.f[i];
+    }
+    return kTfLiteOk;
+  };
+  return reg;
+}
+
+class TestDelegate : public ::testing::Test {
+ public:
+  TestDelegate() {
+    interpreter_.AddTensors(5);
+    interpreter_.SetInputs({0, 1});
+    interpreter_.SetOutputs({3, 4});
+    TfLiteQuantizationParams quant;
+    interpreter_.SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3},
+                                              quant);
+    interpreter_.SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3},
+                                              quant);
+    interpreter_.SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3},
+                                              quant);
+    interpreter_.SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3},
+                                              quant);
+    TfLiteRegistration reg = AddOpRegistration();
+    interpreter_.AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, &reg);
+    interpreter_.AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, &reg);
+    interpreter_.AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, &reg);
+  }
+
+ protected:
+  class SimpleDelegate {
+   public:
+    // Create a simple implementation of a TfLiteDelegate. We use the C++ class
+    // SimpleDelegate and it can produce a handle TfLiteDelegate that is
+    // value-copyable and compatible with TfLite.
+    explicit SimpleDelegate(const std::vector<int>& nodes) : nodes_(nodes) {
+      delegate_.Prepare = [](TfLiteContext* context,
+                             void* data) -> TfLiteStatus {
+        auto* simple = reinterpret_cast<SimpleDelegate*>(data);
+        TfLiteIntArray* nodes_to_separate =
+            TfLiteIntArrayCreate(simple->nodes_.size());
+        // Mark nodes that we want in TfLiteIntArray* structure.
+        int index = 0;
+        for (auto node_index : simple->nodes_) {
+          nodes_to_separate->data[index++] = node_index;
+          // make sure node is add
+          TfLiteNode* node;
+          TfLiteRegistration* reg;
+          context->GetNodeAndRegistration(context, node_index, &node, &reg);
+          TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM);
+          TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0);
+        }
+        // Check that all nodes are available
+        TfLiteIntArray* execution_plan;
+        TF_LITE_ENSURE_STATUS(
+            context->GetExecutionPlan(context, &execution_plan));
+        for (int exec_index = 0; exec_index < execution_plan->size;
+             exec_index++) {
+          int node_index = execution_plan->data[exec_index];
+          TfLiteNode* node;
+          TfLiteRegistration* reg;
+          context->GetNodeAndRegistration(context, node_index, &node, &reg);
+          TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM);
+          TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0);
+        }
+
+        context->ReplaceSubgraphsWithDelegateKernels(
+            context, FakeFusedRegistration(), nodes_to_separate);
+        TfLiteIntArrayFree(nodes_to_separate);
+        return kTfLiteOk;
+      };
+      // Store type-punned data SimpleDelegate structure.
+      delegate_.data_ = reinterpret_cast<void*>(this);
+    }
+
+    static TfLiteRegistration FakeFusedRegistration() {
+      TfLiteRegistration reg = {nullptr};
+      reg.custom_name = "fake_fused_op";
+      return reg;
+    }
+
+    TfLiteDelegate* get_tf_lite_delegate() { return &delegate_; }
+
+   private:
+    std::vector<int> nodes_;
+    TfLiteDelegate delegate_;
+  };
+  Interpreter interpreter_;
+};
+
+TEST_F(TestDelegate, BasicDelegate) {
+  interpreter_.Invoke();
+  SimpleDelegate simple({0, 1, 2});
+  interpreter_.ModifyGraphWithDelegate(simple.get_tf_lite_delegate());
+
+  ASSERT_EQ(interpreter_.execution_plan().size(), 1);
+  int node = interpreter_.execution_plan()[0];
+  const auto* node_and_reg = interpreter_.node_and_registration(node);
+  ASSERT_EQ(node_and_reg->second.custom_name,
+            SimpleDelegate::FakeFusedRegistration().custom_name);
+}
+
+TEST_F(TestDelegate, ComplexDeligate) {
+  interpreter_.Invoke();
+  SimpleDelegate simple({1, 2});
+  interpreter_.ModifyGraphWithDelegate(simple.get_tf_lite_delegate());
+
+  ASSERT_EQ(interpreter_.execution_plan().size(), 2);
+  // 0th should be a non-delegated original op
+  ASSERT_EQ(interpreter_.execution_plan()[0], 0);
+  // 1st should be a new macro op (3) which didn't exist)
+  ASSERT_EQ(interpreter_.execution_plan()[1], 3);
+  const auto* node_and_reg = interpreter_.node_and_registration(3);
+  ASSERT_EQ(node_and_reg->second.custom_name,
+            SimpleDelegate::FakeFusedRegistration().custom_name);
+}
+
 }  // namespace
 }  // namespace tflite
 
-- 
GitLab


From 940540ae4f6d9333dea693aaeabdd18996ed957b Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Fri, 9 Feb 2018 16:02:24 -0800
Subject: [PATCH 917/966] Add pylint check for W0622 redefined-builtin in
 ci_sanity.sh and fix existing pylint errors.

PiperOrigin-RevId: 185206494
---
 tensorflow/__init__.py                        |  2 +-
 .../linear_optimizer/python/ops/sdca_ops.py   |  6 ++--
 .../slim/python/slim/evaluation_test.py       |  2 +-
 tensorflow/contrib/specs/python/__init__.py   |  4 +--
 .../tutorials/word2vec/word2vec_basic.py      |  2 +-
 tensorflow/python/__init__.py                 |  2 +-
 tensorflow/python/framework/dtypes.py         |  6 ++--
 tensorflow/python/framework/framework_lib.py  |  2 +-
 .../python/framework/op_def_library_test.py   |  4 +--
 tensorflow/python/framework/ops.py            |  2 +-
 .../kernel_tests/decode_jpeg_op_test.py       |  2 +-
 tensorflow/python/kernel_tests/pool_test.py   |  2 +-
 .../python/kernel_tests/softmax_op_test.py    |  6 ++--
 tensorflow/python/ops/control_flow_ops.py     |  4 +--
 tensorflow/python/ops/math_ops.py             |  8 +++---
 tensorflow/python/ops/nn_grad.py              |  2 +-
 tensorflow/python/ops/nn_ops.py               | 28 +++++++++----------
 tensorflow/python/ops/standard_ops.py         |  3 +-
 tensorflow/python/training/training.py        |  2 +-
 tensorflow/tools/ci_build/ci_sanity.sh        |  3 +-
 .../tools/compatibility/tf_upgrade_test.py    |  2 +-
 21 files changed, 46 insertions(+), 48 deletions(-)

diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
index 083634bd79..78ad6aec19 100644
--- a/tensorflow/__init__.py
+++ b/tensorflow/__init__.py
@@ -21,7 +21,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
-from tensorflow.python import *
+from tensorflow.python import *  # pylint: disable=redefined-builtin
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.lazy_loader import LazyLoader
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index 7526f3ae0d..3f5fdc18bb 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -211,9 +211,8 @@ class SdcaModel(object):
             sums.append(
                 math_ops.reduce_sum(
                     math_ops.abs(math_ops.cast(weights, dtypes.float64))))
-      sum = math_ops.add_n(sums)
       # SDCA L1 regularization cost is: l1 * sum(|weights|)
-      return self._options['symmetric_l1_regularization'] * sum
+      return self._options['symmetric_l1_regularization'] * math_ops.add_n(sums)
 
   def _l2_loss(self, l2):
     """Computes the (un-normalized) l2 loss of the model."""
@@ -225,9 +224,8 @@ class SdcaModel(object):
             sums.append(
                 math_ops.reduce_sum(
                     math_ops.square(math_ops.cast(weights, dtypes.float64))))
-      sum = math_ops.add_n(sums)
       # SDCA L2 regularization cost is: l2 * sum(weights^2) / 2
-      return l2 * sum / 2.0
+      return l2 * math_ops.add_n(sums) / 2.0
 
   def _convert_n_to_tensor(self, input_list, as_ref=False):
     """Converts input list to a set of tensors."""
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index 8a267ddac7..7ab6805fac 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -41,7 +41,7 @@ from tensorflow.python.platform import flags
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary_iterator
-from tensorflow.python.training import input
+from tensorflow.python.training import input  # pylint: disable=redefined-builtin
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import session_run_hook
 
diff --git a/tensorflow/contrib/specs/python/__init__.py b/tensorflow/contrib/specs/python/__init__.py
index 52db61e421..b6cc754023 100644
--- a/tensorflow/contrib/specs/python/__init__.py
+++ b/tensorflow/contrib/specs/python/__init__.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=wildcard-import,g-importing-member
+# pylint: disable=wildcard-import,g-importing-member,redefined-builtin
 from tensorflow.contrib.specs.python.params_ops import *
 from tensorflow.contrib.specs.python.specs import *
 from tensorflow.contrib.specs.python.specs_lib import *
 from tensorflow.contrib.specs.python.specs_ops import *
 from tensorflow.contrib.specs.python.summaries import *
-# pylint: enable=wildcard-import
+# pylint: enable=wildcard-import,redefined-builtin
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index f6906b0f79..14ae7fbf35 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -131,7 +131,7 @@ def generate_batch(batch_size, num_skips, skip_window):
   batch = np.ndarray(shape=(batch_size), dtype=np.int32)
   labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
   span = 2 * skip_window + 1  # [ skip_window target skip_window ]
-  buffer = collections.deque(maxlen=span)
+  buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
   if data_index + span > len(data):
     data_index = 0
   buffer.extend(data[data_index:data_index + span])
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 6d405d04b1..02ed5517ca 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -60,7 +60,7 @@ from tensorflow.core.protobuf.tensorflow_server_pb2 import *
 from tensorflow.core.util.event_pb2 import *
 
 # Framework
-from tensorflow.python.framework.framework_lib import *
+from tensorflow.python.framework.framework_lib import *  # pylint: disable=redefined-builtin
 from tensorflow.python.framework.versions import *
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import graph_util
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index c825114483..99ae8b24f1 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -238,9 +238,9 @@ class DType(object):
       min, max : tuple
         Lower and upper intensity limits.
     """
-    min, max = dtype_range[self.as_numpy_dtype]
+    min, max = dtype_range[self.as_numpy_dtype]  # pylint: disable=redefined-builtin
     if clip_negative:
-      min = 0
+      min = 0  # pylint: disable=redefined-builtin
     return min, max
 
   def is_compatible_with(self, other):
@@ -356,7 +356,7 @@ complex128 = DType(types_pb2.DT_COMPLEX128)
 tf_export("complex128").export_constant(__name__, "complex128")
 int64 = DType(types_pb2.DT_INT64)
 tf_export("int64").export_constant(__name__, "int64")
-bool = DType(types_pb2.DT_BOOL)
+bool = DType(types_pb2.DT_BOOL)  # pylint: disable=redefined-builtin
 tf_export("bool").export_constant(__name__, "bool")
 qint8 = DType(types_pb2.DT_QINT8)
 tf_export("qint8").export_constant(__name__, "qint8")
diff --git a/tensorflow/python/framework/framework_lib.py b/tensorflow/python/framework/framework_lib.py
index d16fe979e6..3172f3c2c3 100644
--- a/tensorflow/python/framework/framework_lib.py
+++ b/tensorflow/python/framework/framework_lib.py
@@ -118,7 +118,7 @@ from tensorflow.python.framework.ops import register_tensor_conversion_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from tensorflow.python.framework.dtypes import *
+from tensorflow.python.framework.dtypes import *  # pylint: disable=redefined-builtin
 
 # Load a TensorFlow plugin
 from tensorflow.python.framework.load_library import *
diff --git a/tensorflow/python/framework/op_def_library_test.py b/tensorflow/python/framework/op_def_library_test.py
index 817007ce6c..84ca062ade 100644
--- a/tensorflow/python/framework/op_def_library_test.py
+++ b/tensorflow/python/framework/op_def_library_test.py
@@ -42,7 +42,7 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
   def setUp(self):
     self._lib = test_ops._op_def_lib
 
-  def _add_op(self, ascii):
+  def _add_op(self, ascii):  # pylint: disable=redefined-builtin
     op_def = op_def_pb2.OpDef()
     text_format.Merge(ascii, op_def)
     self._lib.add_op(op_def)
@@ -1336,7 +1336,7 @@ class OpDefLibraryGraphTest(test_util.TensorFlowTestCase):
   def setUp(self):
     self._lib = test_ops._op_def_lib
 
-  def _add_op(self, ascii):
+  def _add_op(self, ascii):  # pylint: disable=redefined-builtin
     op_def = op_def_pb2.OpDef()
     text_format.Merge(ascii, op_def)
     self._lib.add_op(op_def)
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 4d7dcdbee1..77e83554c9 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -3649,7 +3649,7 @@ class Graph(object):
   def _last_id(self):
     return self._next_id_counter
 
-  def _get_op_def(self, type):
+  def _get_op_def(self, type):  # pylint: disable=redefined-builtin
     """Returns the `OpDef` proto for `type`. `type` is a string."""
     if self._c_graph:
       with c_api_util.tf_buffer() as buf:
diff --git a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
index 89fd26c544..510daf79dc 100644
--- a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import os
 import time
 
-from six.moves import xrange
+from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/python/kernel_tests/pool_test.py b/tensorflow/python/kernel_tests/pool_test.py
index 6384897633..6ede654aad 100644
--- a/tensorflow/python/kernel_tests/pool_test.py
+++ b/tensorflow/python/kernel_tests/pool_test.py
@@ -96,7 +96,7 @@ def pool_direct_single_axis(
 
 
 def pool_direct(
-    input,
+    input,  # pylint: disable=redefined-builtin
     window_shape,
     pooling_type,
     padding,  # pylint: disable=redefined-builtin
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index ac08f2aec0..4d89831aae 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -99,10 +99,10 @@ class SoftmaxTest(test.TestCase):
 
   def _testOverflow(self, use_gpu=False):
     if use_gpu:
-      type = np.float32
+      type = np.float32  # pylint: disable=redefined-builtin
     else:
-      type = np.float64
-    max = np.finfo(type).max
+      type = np.float64  # pylint: disable=redefined-builtin
+    max = np.finfo(type).max  # pylint: disable=redefined-builtin
     features = np.array([[1., 1., 1., 1.], [max, 1., 2., 3.]]).astype(type)
     with self.test_session(use_gpu=use_gpu):
       tf_log_softmax = nn_ops.log_softmax(features)
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 3a6fdaafb9..c33f351289 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -313,7 +313,7 @@ def _Enter(data,
       return sparse_tensor.SparseTensor(indices, values, dense_shape)
 
 
-def exit(data, name=None):
+def exit(data, name=None):  # pylint: disable=redefined-builtin
   """Exits the current frame to its parent frame.
 
   Exit makes its input `data` available to the parent frame.
@@ -3343,7 +3343,7 @@ def group(*inputs, **kwargs):
 
 
 @tf_export("tuple")
-def tuple(tensors, name=None, control_inputs=None):
+def tuple(tensors, name=None, control_inputs=None):  # pylint: disable=redefined-builtin
   """Group tensors together.
 
   This creates a tuple of tensors with the same values as the `tensors`
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 448cc905d6..57b260ae91 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -237,7 +237,7 @@ def argmin(input,
 # pylint: disable=anomalous-backslash-in-string,protected-access
 # pylint: disable=g-docstring-has-escape
 @tf_export("abs")
-def abs(x, name=None):
+def abs(x, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the absolute value of a tensor.
 
   Given a tensor `x` of complex numbers, this operation returns a tensor of type
@@ -542,7 +542,7 @@ def scalar_mul(scalar, x):
 
 
 @tf_export("pow")
-def pow(x, y, name=None):
+def pow(x, y, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the power of one value to another.
 
   Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
@@ -712,7 +712,7 @@ def angle(input, name=None):
 
 
 @tf_export("round")
-def round(x, name=None):
+def round(x, name=None):  # pylint: disable=redefined-builtin
   """Rounds the values of a tensor to the nearest integer, element-wise.
 
   Rounds half to even.  Also known as bankers rounding. If you want to round
@@ -1207,7 +1207,7 @@ ops.Tensor._override_operator("__ge__", gen_math_ops.greater_equal)
 
 
 @tf_export("range")
-def range(start, limit=None, delta=1, dtype=None, name="range"):
+def range(start, limit=None, delta=1, dtype=None, name="range"):  # pylint: disable=redefined-builtin
   """Creates a sequence of numbers.
 
   Creates a sequence of numbers that begins at `start` and extends by
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 5e6cafd6aa..2a883eb0d5 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -1010,7 +1010,7 @@ def _NthElementGrad(op, grad):
     A list of two tensors, the first being the gradient w.r.t. the input,
     the second being the gradient w.r.t. the N (None).
   """
-  input = op.inputs[0]
+  input = op.inputs[0]  # pylint: disable=redefined-builtin
   output = op.outputs[0]
 
   # Compute the number of elements which equal to output in each reduction
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index a691e281ee..47f48a7e16 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -48,8 +48,8 @@ local_response_normalization = gen_nn_ops.lrn
 
 
 def _non_atrous_convolution(
-    input,
-    filter,
+    input,  # pylint: disable=redefined-builtin
+    filter,  # pylint: disable=redefined-builtin
     padding,
     data_format=None,  # pylint: disable=redefined-builtin
     strides=None,
@@ -94,9 +94,9 @@ def _non_atrous_convolution(
 
   """
   with ops.name_scope(name, "non_atrous_convolution", [input, filter]) as scope:
-    input = ops.convert_to_tensor(input, name="input")
+    input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
     input_shape = input.get_shape()
-    filter = ops.convert_to_tensor(filter, name="filter")
+    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
     filter_shape = filter.get_shape()
     op = _NonAtrousConvolution(
         input_shape,
@@ -348,7 +348,7 @@ def with_space_to_batch(
     ValueError: if `spatial_dims` are invalid.
 
   """
-  input = ops.convert_to_tensor(input, name="input")
+  input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
   input_shape = input.get_shape()
 
   def build_op(num_spatial_dims, padding):
@@ -645,7 +645,7 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate):
 
 @tf_export("nn.convolution")
 def convolution(
-    input,
+    input,  # pylint: disable=redefined-builtin
     filter,  # pylint: disable=redefined-builtin
     padding,
     strides=None,
@@ -766,9 +766,9 @@ def convolution(
   """
   # pylint: enable=line-too-long
   with ops.name_scope(name, "convolution", [input, filter]) as name:
-    input = ops.convert_to_tensor(input, name="input")
+    input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
     input_shape = input.get_shape()
-    filter = ops.convert_to_tensor(filter, name="filter")
+    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
     filter_shape = filter.get_shape()
     op = Convolution(
         input_shape,
@@ -962,7 +962,7 @@ def pool(
   # pylint: enable=line-too-long
   with ops.name_scope(name, "%s_pool" % (pooling_type.lower()),
                       [input]) as scope:
-    input = ops.convert_to_tensor(input, name="input")
+    input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
 
     num_spatial_dims = len(window_shape)
     if num_spatial_dims < 1 or num_spatial_dims > 3:
@@ -1223,7 +1223,7 @@ def conv2d_transpose(
     if data_format not in ("NCHW", "NHWC"):
       raise ValueError("data_format has to be either NCHW or NHWC.")
     value = ops.convert_to_tensor(value, name="value")
-    filter = ops.convert_to_tensor(filter, name="filter")
+    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
     axis = 3 if data_format == "NHWC" else 1
     if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[3]):
       raise ValueError("input channels does not match filter's input channels, "
@@ -1447,7 +1447,7 @@ def conv3d_transpose(
   with ops.name_scope(name, "conv3d_transpose",
                       [value, filter, output_shape]) as name:
     value = ops.convert_to_tensor(value, name="value")
-    filter = ops.convert_to_tensor(filter, name="filter")
+    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
     axis = 1 if data_format == "NCDHW" else 4
     if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[4]):
       raise ValueError("input channels does not match filter's input channels, "
@@ -2279,7 +2279,7 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
 
 
 @tf_export("nn.top_k")
-def top_k(input, k=1, sorted=True, name=None):
+def top_k(input, k=1, sorted=True, name=None):  # pylint: disable=redefined-builtin
   """Finds values and indices of the `k` largest entries for the last dimension.
 
   If the input is a vector (rank=1), finds the `k` largest entries in the vector
@@ -2308,7 +2308,7 @@ def top_k(input, k=1, sorted=True, name=None):
   return gen_nn_ops._top_kv2(input, k=k, sorted=sorted, name=name)
 
 
-def nth_element(input, n, reverse=False, name=None):
+def nth_element(input, n, reverse=False, name=None):  # pylint: disable=redefined-builtin
   r"""Finds values of the `n`-th order statistic for the last dmension.
 
   If the input is a vector (rank-1), finds the entries which is the nth-smallest
@@ -2505,7 +2505,7 @@ def conv1d_transpose(
       spatial_start_dim = 2
       strides = [1, 1, 1, stride]
     value = array_ops.expand_dims(value, spatial_start_dim)
-    filter = array_ops.expand_dims(filter, 0)
+    filter = array_ops.expand_dims(filter, 0)  # pylint: disable=redefined-builtin
 
     result = gen_nn_ops.conv2d_backprop_input(
         input_sizes=output_shape_,
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index 009d1dc3b9..a2164f78b8 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -47,8 +47,7 @@ from tensorflow.python.ops.control_flow_ops import case
 from tensorflow.python.ops.control_flow_ops import cond
 from tensorflow.python.ops.control_flow_ops import group
 from tensorflow.python.ops.control_flow_ops import no_op
-# pylint: disable=redefined-builtin
-from tensorflow.python.ops.control_flow_ops import tuple
+from tensorflow.python.ops.control_flow_ops import tuple  # pylint: disable=redefined-builtin
 # pylint: enable=redefined-builtin
 from tensorflow.python.ops.control_flow_ops import while_loop
 from tensorflow.python.ops.data_flow_ops import *
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index cdba18af8c..78c8ce9208 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -135,7 +135,7 @@ from tensorflow.python.training.queue_runner import *
 
 # For the module level doc.
 from tensorflow.python.training import input as _input
-from tensorflow.python.training.input import *
+from tensorflow.python.training.input import *  # pylint: disable=redefined-builtin
 # pylint: enable=wildcard-import
 
 from tensorflow.python.training.basic_session_run_hooks import SecondOrStepTimer
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 310c1b6248..f980ced2e4 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -186,7 +186,8 @@ do_pylint() {
   # C0301 line-too-long
   # C0326 bad-whitespace
   # W0611 unused-import
-  grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326|\[W0611)' ${OUTPUT_FILE} > ${ERRORS_FILE}
+  # W0622 redefined-builtin
+  grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326|\[W0611|\[W0622)' ${OUTPUT_FILE} > ${ERRORS_FILE}
 
   N_ERRORS=0
   while read -r LINE; do
diff --git a/tensorflow/tools/compatibility/tf_upgrade_test.py b/tensorflow/tools/compatibility/tf_upgrade_test.py
index a495f9883b..3d02eacba6 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_test.py
@@ -114,7 +114,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     self.assertEqual(errors, ["test.py:1: tf.reverse requires manual check."])
 
   def testListComprehension(self):
-    def _test(input, output):
+    def _test(input, output):  # pylint: disable=redefined-builtin
       _, unused_report, errors, new_text = self._upgrade(input)
       self.assertEqual(new_text, output)
     _test("tf.concat(0,  \t[x for x in y])\n",
-- 
GitLab


From 8ced9faa028ec050516f89f4f542f5aa25f6f3d0 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Fri, 9 Feb 2018 16:10:34 -0800
Subject: [PATCH 918/966] Disable flaky spinn_test

PiperOrigin-RevId: 185207742
---
 tensorflow/contrib/eager/python/examples/spinn/BUILD | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/eager/python/examples/spinn/BUILD b/tensorflow/contrib/eager/python/examples/spinn/BUILD
index a1f8a759e2..21055cfe11 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/BUILD
+++ b/tensorflow/contrib/eager/python/examples/spinn/BUILD
@@ -38,5 +38,9 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
     ],
-    tags = ["no_pip"],  # because spinn.py is under third_party/.
+    tags = [
+        "manual",
+        "no_gpu",
+        "no_pip",  # because spinn.py is under third_party/.
+    ],
 )
-- 
GitLab


From 26114e0dc164a797fcebba149b61a1dde89c3a09 Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Fri, 9 Feb 2018 16:14:00 -0800
Subject: [PATCH 919/966] Use x*x instead of x^2 to calculate square in huber
 loss implementation. The reason is d(x^y)/dy = x^y * log(x) and when x is
 zero, it becomes NaN. Even if y is constant, the check op would still report
 failure.

PiperOrigin-RevId: 185208180
---
 tensorflow/python/ops/losses/losses_impl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 84afbf0627..5222333d7e 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -422,7 +422,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
     # This is necessary to avoid doubling the gradient, since there is already a
     # nonzero contribution to the gradient from the quadratic term.
     linear = (abs_error - quadratic)
-    losses = 0.5 * quadratic**2 + delta * linear
+    losses = 0.5 * quadratic * quadratic + delta * linear
     return compute_weighted_loss(
         losses, weights, scope, loss_collection, reduction=reduction)
 
-- 
GitLab


From 7a5d775685c7c853a0d3dc5118016ae30e43f7a2 Mon Sep 17 00:00:00 2001
From: Kay Zhu <kayzhu@google.com>
Date: Fri, 9 Feb 2018 16:39:48 -0800
Subject: [PATCH 920/966] [XLA] Implement GeneralDot semantics in HloEvaluator.

Also:
- add a general matmul test, enable interpreter to run dot_operation_test.
- remove now redundant/obselete CHECKS for HandleDot in HloEvaluator.
- improve documentation for DotGeneral a bit.
PiperOrigin-RevId: 185211512
---
 .../compiler/xla/service/hlo_evaluator.cc     | 118 ++++++++++++++----
 tensorflow/compiler/xla/tests/BUILD           |   6 +
 .../compiler/xla/tests/dot_operation_test.cc  |  36 +++++-
 .../performance/xla/operation_semantics.md    |  10 +-
 4 files changed, 138 insertions(+), 32 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index ab604064d5..81212cda42 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -1028,55 +1028,119 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     CHECK(ShapeUtil::IsArray(lhs->shape()));
     CHECK(ShapeUtil::IsArray(rhs->shape()));
 
-    // Dot only supports operands of rank 1 and 2.
-    const auto dot_rank = ShapeUtil::Rank(dot->shape());
+    const auto& dnums = dot->dot_dimension_numbers();
+
     const auto lhs_rank = ShapeUtil::Rank(lhs->shape());
     const auto rhs_rank = ShapeUtil::Rank(rhs->shape());
-    CHECK(lhs_rank > 0 && lhs_rank <= 2);
-    CHECK(rhs_rank > 0 && rhs_rank <= 2);
-    CHECK_EQ(dot_rank, lhs_rank + rhs_rank - 2);
 
     CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape()));
     CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape()));
 
-    // Check contracted dimensions are the same.
-    //
-    // Determine the index of the contracted dimensions for input tensors.
-    // dimensions -1 of lhs and dimension 0 of rhs are contracted.
-    const int64 lhs_contracted_dimension =
-        ShapeUtil::GetDimensionNumber(lhs->shape(), -1);
-    const int64 rhs_contracted_dimension = 0;
-    CHECK_EQ(lhs->shape().dimensions(lhs_contracted_dimension),
-             rhs->shape().dimensions(rhs_contracted_dimension))
+    // There must be 1 and only 1 Contracting dimension for lhs and rhs.
+    CHECK_EQ(dnums.lhs_contracting_dimensions_size(), 1);
+    CHECK_EQ(dnums.rhs_contracting_dimensions_size(), 1);
+    const int64 lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0);
+    const int64 rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0);
+    // Contracted dimension sizes must be the same.
+    CHECK_EQ(lhs->shape().dimensions(lhs_contracting_dimension),
+             rhs->shape().dimensions(rhs_contracting_dimension))
         << "lhs contracted dimension: "
-        << lhs->shape().dimensions(lhs_contracted_dimension)
+        << lhs->shape().dimensions(lhs_contracting_dimension)
         << " rhs contracted dimension: "
-        << rhs->shape().dimensions(rhs_contracted_dimension);
+        << rhs->shape().dimensions(rhs_contracting_dimension);
     const int64 contracted_dimension_size =
-        lhs->shape().dimensions(lhs_contracted_dimension);
+        lhs->shape().dimensions(lhs_contracting_dimension);
 
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
 
     auto result = Literal::CreateFromShape(dot->shape());
+
+    CHECK_EQ(dnums.lhs_batch_dimensions_size(),
+             dnums.rhs_batch_dimensions_size());
+
+    std::vector<int64> lhs_non_contracting_dims;
+    for (int64 i = 0; i < lhs_rank; i++) {
+      if (i != lhs_contracting_dimension) {
+        lhs_non_contracting_dims.push_back(i);
+      }
+    }
+
+    std::vector<int64> rhs_non_batch_non_contracting_dims;
+    tensorflow::gtl::FlatSet<int64> batch_dims_set(
+        dnums.rhs_batch_dimensions().begin(),
+        dnums.rhs_batch_dimensions().end());
+    for (int64 i = 0; i < rhs_rank; i++) {
+      if (i != rhs_contracting_dimension && batch_dims_set.count(i) == 0) {
+        rhs_non_batch_non_contracting_dims.push_back(i);
+      }
+    }
+
+    const int64 batch_dim_size = dnums.lhs_batch_dimensions_size();
+    const int64 lhs_non_contracting_size = lhs_non_contracting_dims.size();
+
+    DimensionVector lhs_index(lhs_rank);
+    DimensionVector rhs_index(rhs_rank);
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+        [&](tensorflow::gtl::ArraySlice<int64> result_index) {
           ElementwiseT result_val = static_cast<ElementwiseT>(0);
 
-          std::vector<int64> lhs_index(lhs_rank, 0);
-          std::vector<int64> rhs_index(rhs_rank, 0);
-          // Set index for non-contracted dimension for lhs and rhs.
-          if (lhs_rank > 1) {
-            lhs_index[0] = multi_index[0];
+          // Find the corresponding non-contracting indices for lhs and rhs.
+          //
+          // For `result_index`, its batch dimension, if exists, will be at the
+          // same dimension as the batch dimension of lhs and rhs. More
+          // specifically:
+          // - For lhs, the non-contracting dimensions, including the batch
+          // dimension have the same index as the `result_index`.
+          // - For rhs, the batch dimension is set seperately from other
+          // non-contracting dimensions, since these other non-contracting
+          // dimensions in rhs follow the non-contracting dimensions of lhs in
+          // the resulting index.
+          //
+          // As an example, for a resulting index:
+          //  result_index [result_batch, result_x, result_y]
+          // the effecting lhs and rhs indices are:
+          //  lhs [result_batch, lhs_non_contracting_dim, contracting_dim
+          //  rhs [result_batch, contracting_dim, rhs_non_contracting_dim]
+          // `result_x` is only affected by the lhs_non_contracting_dim and
+          // likewise `result_y` only depends on rhs_non_contracting_dim.
+          //
+          // so we can look up the lhs and rhs indices by:
+          //
+          // lhs:
+          //  batch index is the same as `result_batch`.
+          //    non-contracting dimension is the same as
+          //    result_index[lhs_non_contracting_dim]
+          // rhs:
+          //  batch index: the same as `result_batch`.
+          //  non-contracting dimension index: *not* the same as
+          //    result_index[rhs_non_contractng_dim], since the
+          //    non-contracting dimensions of lhs are included in the
+          //    result_index first. Instead, the non_contracting_dim of rhs must
+          //    be calculated as following:
+          //      lhs_non_contracting_dimensions_size +
+          //      (rhs_non_batch_non_contracting_dim - batch_dim_size) - 1
+          //
+          //    Note that (rhs_non_batch_contracting_dim - batch_dim_size) is
+          //    the index offset to the result_index that only depends on
+          //    the non_batch and non-contracting dimensions of rhs. -1 at the
+          //    end translates size to index.
+          for (auto i : lhs_non_contracting_dims) {
+            lhs_index[i] = result_index[i];
+          }
+          for (auto i : dnums.rhs_batch_dimensions()) {
+            rhs_index[i] = result_index[i];
           }
-          if (rhs_rank > 1) {
-            rhs_index[1] = multi_index[multi_index.size() - 1];
+          for (auto i : rhs_non_batch_non_contracting_dims) {
+            const int64 rhs_non_batch_non_contracting_dim =
+                lhs_non_contracting_size + (i - batch_dim_size) - 1;
+            rhs_index[i] = result_index[rhs_non_batch_non_contracting_dim];
           }
 
           // Accumulates resulting product along the contracted dimension.
           for (int64 i = 0; i < contracted_dimension_size; ++i) {
-            lhs_index[lhs_contracted_dimension] = i;
-            rhs_index[rhs_contracted_dimension] = i;
+            lhs_index[lhs_contracting_dimension] = i;
+            rhs_index[rhs_contracting_dimension] = i;
 
             result_val +=
                 static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index d4820d1b6d..d43ef248aa 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -601,6 +601,9 @@ xla_test(
 xla_test(
     name = "dot_operation_test",
     srcs = ["dot_operation_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -623,6 +626,9 @@ xla_test(
 xla_test(
     name = "dot_operation_runtime_test",
     srcs = ["dot_operation_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index cc683701e6..6b0c04c2c0 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -520,9 +520,39 @@ XLA_TEST_F(DotOperationTest, BatchMatMul) {
 
   ComputeAndCompareR4<float>(
       &builder,
-      /*expected=*/{{{{1300, 2400}, {13, 24}}, {{11400, 13600}, {114, 136}}},
-                    {{{42900, 79200}, {429, 792}},
-                     {{250800, 299200}, {2508, 2992}}}},
+      /*expected=*/
+      {{{{1300, 2400}, {13, 24}}, {{11400, 13600}, {114, 136}}},
+       {{{42900, 79200}, {429, 792}}, {{250800, 299200}, {2508, 2992}}}},
+      {x_data.get(), y_data.get()}, error_spec_);
+}
+
+XLA_TEST_F(DotOperationTest, GeneralMatMul) {
+  ComputationBuilder builder(client_, TestName());
+  auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2, 2}), "x");
+  auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2, 2}), "y");
+
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(2);
+  dnums.add_rhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(0);
+  dnums.add_rhs_batch_dimensions(0);
+
+  auto out = builder.DotGeneral(x, y, dnums);
+
+  auto x_data = client_
+                    ->TransferToServer(*Literal::CreateR3<float>(
+                        {{{1.0, 2.0}, {3.0, 4.0}}, {{5.0, 6.0}, {7.0, 8.0}}}))
+                    .ConsumeValueOrDie();
+
+  auto y_data = client_
+                    ->TransferToServer(*Literal::CreateR3<float>(
+                        {{{1.0, 0.0}, {0.0, 1.0}}, {{1.0, 0.0}, {0.0, 1.0}}}))
+                    .ConsumeValueOrDie();
+
+  ComputeAndCompareR3<float>(
+      &builder,
+      /*expected=*/
+      {{{1.0, 2.0}, {3.0, 4.0}}, {{5.0, 6.0}, {7.0, 8.0}}},
       {x_data.get(), y_data.get()}, error_spec_);
 }
 
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index f865c30aa8..5431572db8 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -717,6 +717,7 @@ in 'dimension_numbers'.
 Associated contracting dimension numbers from the 'lhs' and 'rhs' do not need
 to be the same, but must be listed in the same order in both
 'lhs/rhs_contracting_dimensions' arrays and have the same dimension sizes.
+There must be exactly one contracting dimension on both 'lhs' and 'rhs'.
 
 Example with contracting dimension numbers:
 
@@ -736,8 +737,9 @@ DotGeneral(lhs, rhs, dnums) -> { {6.0, 12.0},
 ```
 
 Associated batch dimension numbers from the 'lhs' and 'rhs' must have the same
-dimension number, must be listed in the same order in both arrays, and must
-have the same dimension sizes.
+dimension number, must be listed in the same order in both arrays, must
+have the same dimension sizes, and must be ordered before contracting and
+non-contracting/non-batch dimension numbers.
 
 Example with batch dimension numbers (batch size 2, 2x2 matrices):
 
@@ -769,6 +771,10 @@ DotGeneral(lhs, rhs, dnums) -> { { {1.0, 2.0},
 | [b0, m, k] `dot` [b0, k, n]         | [b0, m, n]        |  batch matmul    |
 | [b0, b1, m, k] `dot` [b0, b1, k, n] | [b0, b1, m, n]    |  batch matmul    |
 
+It follows that the resulting dimension number starts with the batch dimension,
+then the 'lhs' non-contracting/non-batch dimension, and finally the 'rhs'
+non-contracting/non-batch dimension.
+
 ## DynamicSlice
 
 See also
-- 
GitLab


From 816f59e6ab53c4553b0325b872b7be5ea73da89b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 9 Feb 2018 16:57:14 -0800
Subject: [PATCH 921/966] [XLA:Tool] Make Hlo parser report the location of the
 already defined instrucion/computation.

PiperOrigin-RevId: 185213461
---
 .../compiler/xla/tools/parser/hlo_parser.cc   | 48 +++++++++++--------
 .../xla/tools/parser/hlo_parser_test.cc       | 29 +++++++++++
 2 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index d9c4d094b8..89def5d561 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -220,10 +220,13 @@ class HloParser {
   bool AddComputation(const string& name, HloComputation* computation,
                       LocTy name_loc);
 
-  // The map from the instruction name to the instruction. This does not own the
-  // instructions.
-  std::unordered_map<string, HloInstruction*> instruction_pool_;
-  std::unordered_map<string, HloComputation*> computation_pool_;
+  // The map from the instruction/computation name to the
+  // instruction/computation itself and it's location. This does not own the
+  // pointers.
+  std::unordered_map<string, std::pair<HloInstruction*, LocTy>>
+      instruction_pool_;
+  std::unordered_map<string, std::pair<HloComputation*, LocTy>>
+      computation_pool_;
 
   HloLexer lexer_;
   std::unique_ptr<HloModule> module_;
@@ -340,15 +343,16 @@ bool HloParser::ParseComputation(HloComputation** entry_computation) {
     return false;
   }
 
-  HloInstruction* root =
-      tensorflow::gtl::FindPtrOrNull(instruction_pool_, root_name);
+  std::pair<HloInstruction*, LocTy>* root_node =
+      tensorflow::gtl::FindOrNull(instruction_pool_, root_name);
   // This means some instruction was marked as ROOT but we didn't find it in the
   // pool, which should not happen.
-  if (!root_name.empty() && root == nullptr) {
+  if (!root_name.empty() && root_node == nullptr) {
     LOG(FATAL) << "instruction " << root_name
                << " was marked as ROOT but the parser has not seen it before";
   }
 
+  HloInstruction* root = root_node == nullptr ? nullptr : root_node->first;
   // Now root can be either an existing instruction or a nullptr. If it's a
   // nullptr, the implementation of Builder will set the last instruction as
   // root instruction.
@@ -1229,13 +1233,13 @@ bool HloParser::ParseInstructionNames(
     if (!ParseName(&name)) {
       return Error(loc, "expects a instruction name");
     }
-    HloInstruction* instr =
-        tensorflow::gtl::FindPtrOrNull(instruction_pool_, name);
+    std::pair<HloInstruction*, LocTy>* instr =
+        tensorflow::gtl::FindOrNull(instruction_pool_, name);
     if (!instr) {
       return TokenError(
           Printf("instruction '%s' is not defined", name.c_str()));
     }
-    instructions->push_back(instr);
+    instructions->push_back(instr->first);
   } while (EatIfPresent(TokKind::kComma));
 
   return ParseToken(TokKind::kRbrace,
@@ -1705,12 +1709,12 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
       if (!ParseName(&name)) {
         return false;
       }
-      HloInstruction* instruction =
-          tensorflow::gtl::FindPtrOrNull(instruction_pool_, name);
+      std::pair<HloInstruction*, LocTy>* instruction =
+          tensorflow::gtl::FindOrNull(instruction_pool_, name);
       if (!instruction) {
         return Error(loc, StrCat("instruction does not exist: ", name));
       }
-      operands->push_back(instruction);
+      operands->push_back(instruction->first);
     } while (EatIfPresent(TokKind::kComma));
   }
   return ParseToken(TokKind::kRparen, "expects ')' at the end of operands");
@@ -1957,10 +1961,12 @@ bool HloParser::ParseComputationName(HloComputation** value) {
   if (!ParseName(&name)) {
     return Error(loc, "expects computation name");
   }
-  *value = tensorflow::gtl::FindPtrOrNull(computation_pool_, name);
-  if (*value == nullptr) {
+  std::pair<HloComputation*, LocTy>* computation =
+      tensorflow::gtl::FindOrNull(computation_pool_, name);
+  if (computation == nullptr) {
     return Error(loc, StrCat("computation does not exist: ", name));
   }
+  *value = computation->first;
   return true;
 }
 
@@ -2576,18 +2582,22 @@ bool HloParser::EatIfPresent(TokKind kind) {
 
 bool HloParser::AddInstruction(const string& name, HloInstruction* instruction,
                                LocTy name_loc) {
-  auto result = instruction_pool_.insert({name, instruction});
+  auto result = instruction_pool_.insert({name, {instruction, name_loc}});
   if (!result.second) {
-    return Error(name_loc, StrCat("instruction already exists: ", name));
+    Error(name_loc, StrCat("instruction already exists: ", name));
+    return Error(/*loc=*/result.first->second.second,
+                 "instruction previously defined here");
   }
   return true;
 }
 
 bool HloParser::AddComputation(const string& name, HloComputation* computation,
                                LocTy name_loc) {
-  auto result = computation_pool_.insert({name, computation});
+  auto result = computation_pool_.insert({name, {computation, name_loc}});
   if (!result.second) {
-    return Error(name_loc, StrCat("computation already exists: ", name));
+    Error(name_loc, StrCat("computation already exists: ", name));
+    return Error(/*loc=*/result.first->second.second,
+                 "computation previously defined here");
   }
   return true;
 }
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index dd76d8d0fe..b8c6b59204 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -1275,6 +1275,35 @@ ENTRY consts {
                   "one computation should have only one ROOT");
 }
 
+TEST_F(HloParserTest, InstructionExists) {
+  const string original = R"(HloModule comp_exists
+c1 {
+  instr = f32[1]{0} constant({12345})
+}
+c2 {
+  instr = f32[1]{0} constant({67890})
+})";
+
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  R"(was parsing 3:3: error: instruction previously defined here
+  instr = f32[1]{0} constant({12345})
+  ^)");
+}
+
+TEST_F(HloParserTest, ComputationExists) {
+  const string original = R"(HloModule comp_exists
+comp {
+  const1 = f32[1]{0} constant({12345})
+}
+comp {
+  const2 = f32[1]{0} constant({67890})
+})";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  R"(was parsing 2:1: error: computation previously defined here
+comp {
+^)");
+}
+
 }  // namespace
 }  // namespace tools
 }  // namespace xla
-- 
GitLab


From 5b71a126c4f0733eccefee76b599e0315f052bef Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Fri, 9 Feb 2018 17:14:30 -0800
Subject: [PATCH 922/966] import_graph_def: support "absolute" names with the C
 API enabled.

Passing a name with a trailing '/' to import_graph_def causes that
name to be used as-is (i.e. it is not appended to the existing name
scope and not de-duped with any existing name scopes. This is in order
to re-use an existing name scope). This didn't work with the C API
enabled because it was set to always have the C API uniquify the
prefix.

The fix is to not uniquify the prefix, since calling name_scope in
import_graph_def already has the logic to uniquify the prefix if
necessary. I'm not sure why I thought we needed the C API to do this
to being with.

In addition, this changes the graph_constructor.cc logic to uniquify
names if the prefix cannot be guaranteed unique (see the new test case
in graph_constructor_test.cc for why/when this is necessary).

PiperOrigin-RevId: 185215326
---
 tensorflow/core/graph/graph_constructor.cc    | 16 +++++---------
 .../core/graph/graph_constructor_test.cc      | 22 +++++++++++++++++--
 tensorflow/python/framework/importer.py       |  1 -
 tensorflow/python/framework/importer_test.py  | 19 ++++++++++++++++
 4 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 2a52c7516e..0629ff32d0 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -374,15 +374,8 @@ Status GraphConstructor::EnsureNoNameCollisions() {
       return errors::InvalidArgument("Imported node name prefix '", prefix_,
                                      "' would lead to invalid node names");
     }
-    if (NameExistsInGraph(prefix_no_slash)) {
-      if (opts_.uniquify_prefix) {
-        prefix_ = strings::StrCat(FindUniqueName(prefix_no_slash), "/");
-      } else {
-        return errors::InvalidArgument("Import node name prefix '",
-                                       prefix_no_slash,
-                                       "' conflicts with "
-                                       "name already used in the graph");
-      }
+    if (NameExistsInGraph(prefix_no_slash) && opts_.uniquify_prefix) {
+      prefix_ = strings::StrCat(FindUniqueName(prefix_no_slash), "/");
     }
   }
   return Status::OK();
@@ -990,7 +983,10 @@ Status GraphConstructor::Convert() {
     if (opts_.importing) {
       if (!prefix_.empty()) {
         AddPrefixToNodeDef(input_already_exists, &imported_node_def);
-      } else if (opts_.uniquify_names) {
+      }
+      // Note: no need to uniquify names if the prefix already guarantees
+      // uniqueness
+      if (opts_.uniquify_names && (prefix_.empty() || !opts_.uniquify_prefix)) {
         UniquifyNames(input_already_exists, &imported_node_def);
       }
       TF_RETURN_IF_ERROR(ModifyNodeDefForImport(&imported_node_def));
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index c59e478f15..963c1dc024 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -1834,7 +1834,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames) {
   EXPECT_EQ(results.return_nodes[1]->name(), "B_2");
   EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_2:0");
 
-  // Import with an already-used prefix
+  // Import with an already-used prefix and uniquify_prefix = true
   opts.prefix = "A";
   opts.uniquify_prefix = true;
   results = ImportGraphDefResults();
@@ -1846,9 +1846,27 @@ TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames) {
   EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_3/A");
 
   // Create B_3 node to keep the A/B numbering in sync
-  opts = ImportGraphDefOptions();
   ExpectOK("node { name: 'B_3' op: 'TestInput' }");
 
+  // Import with an already-used prefix and uniquify_prefix = false
+  opts.uniquify_prefix = false;
+  results = ImportGraphDefResults();
+  ExpectOK(graph_def_str, opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A/A");
+  EXPECT_EQ(results.return_nodes[1]->name(), "A/B");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A/A");
+
+  // Repeat the same import
+  results = ImportGraphDefResults();
+  ExpectOK(graph_def_str, opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A/A_1");
+  EXPECT_EQ(results.return_nodes[1]->name(), "A/B_1");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A/A_1:0");
+
   // Import with existing de-duped node names
   opts = ImportGraphDefOptions();
   opts.uniquify_names = true;
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index cc8f2392ba..6ecc1a40ae 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -270,7 +270,6 @@ def _PopulateTFImportGraphDefOptions(options, prefix, input_map,
   """Populates the TF_ImportGraphDefOptions `options`."""
   c_api.TF_ImportGraphDefOptionsSetPrefix(options, prefix)
   c_api.TF_ImportGraphDefOptionsSetUniquifyNames(options, True)
-  c_api.TF_ImportGraphDefOptionsSetUniquifyPrefix(options, True)
 
   for input_src, input_dst in input_map.items():
     input_src = compat.as_str(input_src)
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index acaec37f81..bf5d9fe093 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -154,6 +154,25 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(b3.name, "A_3/B")
       self.assertEqual(list(b3.inputs), [a3.outputs[0]])
 
+      # Import with an already-used name but with a '/' to indicate an
+      # "absolute" name scope (see the Graph.name_scope docstring).
+      a_a, a_b = importer.import_graph_def(
+          graph_def,
+          return_elements=["A", "B"],
+          name="A/")
+      self.assertEqual(a_a.name, "A/A")
+      self.assertEqual(a_b.name, "A/B")
+      self.assertEqual(list(a_b.inputs), [a_a.outputs[0]])
+
+      # Repeat the same import.
+      a_a1, a_b1 = importer.import_graph_def(
+          graph_def,
+          return_elements=["A", "B"],
+          name="A/")
+      self.assertEqual(a_a1.name, "A/A_1")
+      self.assertEqual(a_b1.name, "A/B_1")
+      self.assertEqual(list(a_b1.inputs), [a_a1.outputs[0]])
+
       # Import with existing de-duped node names
       a1_1, b1_1 = importer.import_graph_def(
           self._MakeGraphDef("""
-- 
GitLab


From 2d7b1d4ca7140bdcdf5eda5db642357202337f98 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Fri, 9 Feb 2018 17:30:03 -0800
Subject: [PATCH 923/966] Add a test that exhaustively checks Log/Exp/Tanh

PiperOrigin-RevId: 185216684
---
 tensorflow/compiler/xla/tests/BUILD           |  20 +++
 .../exhaustive_f32_elementwise_op_test.cc     | 128 ++++++++++++++++++
 .../compiler/xla/tests/literal_test_util.cc   |  19 ++-
 .../compiler/xla/tests/literal_test_util.h    |   8 +-
 4 files changed, 168 insertions(+), 7 deletions(-)
 create mode 100644 tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc

diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index d43ef248aa..60f3e61807 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -575,6 +575,26 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "exhaustive_f32_elementwise_op_test",
+    srcs = ["exhaustive_f32_elementwise_op_test.cc"],
+    backends = [
+        "cpu",
+        "gpu",
+    ],
+    shard_count = 48,
+    tags = [
+        "enormous",
+        "manual",
+    ],
+    deps = [
+        ":client_library_test_base",
+        ":literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+    ],
+)
+
 xla_test(
     name = "reduce_precision_test",
     srcs = ["reduce_precision_test.cc"],
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
new file mode 100644
index 0000000000..6fe7737de7
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/core/casts.h"
+
+namespace xla {
+namespace {
+class ExhaustiveF32ElementwiseOpTest
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<std::pair<int64, int64>> {
+ protected:
+  ErrorSpec error_spec_{0.0001, 0.0001, /*relaxed_nans=*/true};
+
+  template <typename EnqueueOpTy>
+  void ExhaustivelyTestF32Op(EnqueueOpTy enqueue_op,
+                             float (*evaluate_op)(float),
+                             std::pair<int64, int64> known_incorrect_range) {
+    int64 begin, end;
+    std::tie(begin, end) = GetParam();
+    int64 input_size = end - begin;
+    LOG(INFO) << "Checking range [" << begin << ", " << end << ")";
+
+    ComputationBuilder builder(client_, TestName());
+
+    std::unique_ptr<Literal> input_literal =
+        Literal::CreateFromDimensions(F32, {input_size});
+    for (int64 i = begin; i < end; i++) {
+      if (i >= known_incorrect_range.first &&
+          i < known_incorrect_range.second) {
+        // If the operation is known to be buggy on a specific input clamp that
+        // input to 0 under the assumption that the op is at least correct on 0.
+        input_literal->Set({i - begin}, 0.0f);
+      } else {
+        input_literal->Set({i - begin}, tensorflow::bit_cast<float, int>(i));
+      }
+    }
+
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
+                            client_->TransferToServer(*input_literal));
+
+    auto input = builder.Parameter(0, input_literal->shape(), "input");
+    enqueue_op(&builder, input);
+
+    std::vector<float> expected_result;
+    expected_result.reserve(input_size);
+    for (int64 i = 0; i < input_size; i++) {
+      expected_result.push_back(evaluate_op(input_literal->Get<float>({i})));
+    }
+
+    ComputeAndCompareR1<float>(&builder, expected_result, {input_data.get()},
+                               error_spec_);
+  }
+};
+
+XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, LogF32) {
+#ifdef XLA_TEST_BACKEND_CPU
+  // TODO(b/73141998): The vectorized Log implementation gives results outside
+  // our error spec in this range (these numbers are bitwise representations of
+  // floats expressed as a zero extended int64):
+  std::pair<int64, int64> known_incorrect_range = {1, 8315654};
+#else
+  std::pair<int64, int64> known_incorrect_range = {0, 0};
+#endif
+
+  ExhaustivelyTestF32Op(
+      [](ComputationBuilder* builder, const ComputationDataHandle& input) {
+        builder->Log(input);
+      },
+      std::log, known_incorrect_range);
+}
+
+XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, ExpF32) {
+#ifdef XLA_TEST_BACKEND_CPU
+  // TODO(b/73142289): The vectorized Exp implementation gives results outside
+  // our error spec in this range (these numbers are bitwise representations of
+  // floats expressed as a zero extended int64):
+  std::pair<int64, int64> known_incorrect_range = {1107296256 + 11583654,
+                                                   1107296256 + 11629080};
+#else
+  std::pair<int64, int64> known_incorrect_range = {0, 0};
+#endif
+
+  ExhaustivelyTestF32Op(
+      [](ComputationBuilder* builder, const ComputationDataHandle& input) {
+        builder->Exp(input);
+      },
+      std::exp, known_incorrect_range);
+}
+
+XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, TanhF32) {
+  ExhaustivelyTestF32Op(
+      [](ComputationBuilder* builder, const ComputationDataHandle& input) {
+        builder->Tanh(input);
+      },
+      std::tanh, /*known_incorrect_range=*/{0, 0});
+}
+
+std::vector<std::pair<int64, int64>> CreateExhaustiveParameters() {
+  // We break up the 2^32-element space into small'ish chunks to keep peak
+  // memory usage low.
+  std::vector<std::pair<int64, int64>> result;
+  const int64 step = 1 << 25;
+  for (int64 i = 0; i < (1l << 32); i += step) {
+    result.push_back({i, i + step});
+  }
+  return result;
+}
+
+INSTANTIATE_TEST_CASE_P(ExhaustiveF32ElementwiseOpTestInstance,
+                        ExhaustiveF32ElementwiseOpTest,
+                        ::testing::ValuesIn(CreateExhaustiveParameters()));
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 474d2547ae..5aa71a9261 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -449,8 +449,12 @@ class NearComparator {
 
  private:
   template <typename NativeT>
-  bool NanMismatch(NativeT lhs, NativeT rhs) {
-    return std::isnan(lhs) != std::isnan(rhs);
+  bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
+    if (relaxed_nans) {
+      return !std::isnan(expected) && std::isnan(actual);
+    } else {
+      return std::isnan(expected) != std::isnan(actual);
+    }
   }
 
   template <typename NativeT>
@@ -472,7 +476,8 @@ class NearComparator {
 
     const float abs_diff = std::abs(actual - expected);
     const float rel_err = abs_diff / std::abs(expected);
-    const bool nan_mismatch = NanMismatch<NativeT>(expected, actual);
+    const bool nan_mismatch =
+        NanMismatch<NativeT>(expected, actual, error_.relaxed_nans);
     const bool mismatch =
         (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel));
     return !mismatch;
@@ -630,9 +635,11 @@ class NearComparator {
 };
 
 template <>
-bool NearComparator::NanMismatch<complex64>(complex64 lhs, complex64 rhs) {
-  return std::isnan(lhs.real()) != std::isnan(rhs.real()) ||
-         std::isnan(lhs.imag()) != std::isnan(rhs.imag());
+bool NearComparator::NanMismatch<complex64>(complex64 expected,
+                                            complex64 actual,
+                                            bool relaxed_nans) {
+  return NanMismatch(expected.real(), actual.real(), relaxed_nans) ||
+         NanMismatch(expected.imag(), actual.imag(), relaxed_nans);
 }
 
 template <>
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 9b0724262d..7b757a4bd7 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -40,10 +40,16 @@ namespace xla {
 
 // Structure describing permissible absolute and relative error bounds.
 struct ErrorSpec {
-  explicit ErrorSpec(float aabs, float arel = 0) : abs(aabs), rel(arel) {}
+  explicit ErrorSpec(float aabs, float arel = 0, bool relaxed_nans = false)
+      : abs(aabs), rel(arel), relaxed_nans(relaxed_nans) {}
 
   float abs;  // Absolute error bound.
   float rel;  // Relative error bound.
+
+  // If relaxed_nans is true then any result is valid if we are expecting NaNs.
+  // In effect, this allows the tested operation to produce incorrect results
+  // for inputs outside its mathematical domain.
+  bool relaxed_nans;
 };
 
 // Utility class for making expectations/assertions related to XLA literals.
-- 
GitLab


From 7ace7f14caf81c9acbac2e3ba26a754cbe78ead5 Mon Sep 17 00:00:00 2001
From: Guangda Lai <laigd@google.com>
Date: Fri, 9 Feb 2018 22:47:30 -0800
Subject: [PATCH 924/966] Fix grappler to use CudaGpuId instead of TfGpuId to
 query device states.

PiperOrigin-RevId: 185233116
---
 tensorflow/core/grappler/clusters/BUILD           |  6 +++---
 .../core/grappler/clusters/single_machine.cc      |  9 +++++++--
 tensorflow/core/grappler/clusters/utils.cc        | 15 +++++++++++----
 tensorflow/core/grappler/clusters/utils.h         |  3 ++-
 tensorflow/core/grappler/costs/BUILD              |  1 +
 tensorflow/core/grappler/costs/utils.cc           |  3 ++-
 6 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index 5b8ce373bc..b15a709c5b 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -26,13 +26,12 @@ config_setting(
 tf_cuda_library(
     name = "utils",
     srcs = ["utils.cc"],
-    hdrs = [
-        "utils.h",
-    ],
+    hdrs = ["utils.h"],
     visibility = ["//visibility:public"],
     deps = [
         "//third_party/eigen3",
         "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ] + select({
@@ -104,6 +103,7 @@ cc_library(
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:ops_util",
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 862ce4ae88..3e97b31f2c 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/cc/training/queue_runner.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/grappler/clusters/utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -79,13 +80,17 @@ Status SingleMachine::Provision() {
 
   std::vector<DeviceAttributes> devices;
   TF_RETURN_IF_ERROR(session_->ListDevices(&devices));
-  int gpu_id = 0;
   for (const auto& dev : devices) {
     DeviceProperties attr;
     if (dev.device_type() == "CPU") {
       attr = GetLocalCPUInfo();
     } else if (dev.device_type() == "GPU") {
-      attr = GetLocalGPUInfo(gpu_id++);
+      DeviceNameUtils::ParsedName parsed;
+      if (!DeviceNameUtils::ParseFullName(dev.name(), &parsed)) {
+        return errors::InvalidArgument(
+            strings::StrCat("Not able to parse GPU device name: ", dev.name()));
+      }
+      attr = GetLocalGPUInfo(TfGpuId(parsed.id));
     } else {
       attr.set_type(dev.device_type());
     }
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index aacd2ccb72..3e7a7a3356 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -27,6 +27,8 @@ limitations under the License.
 #include "include/libxsmm.h"
 #endif
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
@@ -66,13 +68,14 @@ DeviceProperties GetLocalCPUInfo() {
   return device;
 }
 
-DeviceProperties GetLocalGPUInfo(int gpu_id) {
+DeviceProperties GetLocalGPUInfo(TfGpuId tf_gpu_id) {
   DeviceProperties device;
   device.set_type("GPU");
 
 #if GOOGLE_CUDA
   cudaDeviceProp properties;
-  cudaError_t error = cudaGetDeviceProperties(&properties, gpu_id);
+  CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
+  cudaError_t error = cudaGetDeviceProperties(&properties, cuda_gpu_id.value());
   if (error == cudaSuccess) {
     device.set_vendor("NVidia");
     device.set_model(properties.name);
@@ -94,6 +97,10 @@ DeviceProperties GetLocalGPUInfo(int gpu_id) {
     // double data rate (DDR).
     device.set_bandwidth(properties.memoryBusWidth / 8 *
                          properties.memoryClockRate * 2);
+  } else {
+    LOG(ERROR) << "Failed to get device properties, error code: " << error;
+    device.set_type("UNKNOWN");
+    return device;
   }
 
   (*device.mutable_environment())["architecture"] =
@@ -110,9 +117,9 @@ DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device) {
     return GetLocalCPUInfo();
   } else if (device.type == "GPU") {
     if (device.has_id) {
-      return GetLocalGPUInfo(device.id);
+      return GetLocalGPUInfo(TfGpuId(device.id));
     } else {
-      return GetLocalGPUInfo(0);
+      return GetLocalGPUInfo(TfGpuId(0));
     }
   }
   DeviceProperties result;
diff --git a/tensorflow/core/grappler/clusters/utils.h b/tensorflow/core/grappler/clusters/utils.h
index 191942040a..4ea7e98390 100644
--- a/tensorflow/core/grappler/clusters/utils.h
+++ b/tensorflow/core/grappler/clusters/utils.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
 #define TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -27,7 +28,7 @@ DeviceProperties GetLocalCPUInfo();
 
 // Returns the DeviceProperties for the specified GPU attached to the server on
 // which grappler is running.
-DeviceProperties GetLocalGPUInfo(int gpu_id);
+DeviceProperties GetLocalGPUInfo(TfGpuId tf_gpu_id);
 
 // Returns the DeviceProperties of the specified device
 DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 0fe01e9c9e..5336df1f51 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -142,6 +142,7 @@ tf_cuda_library(
         "//third_party/eigen3",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
+        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index 602f69f12e..ac30090607 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "cuda/include/cudnn.h"
 #endif
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
@@ -203,7 +204,7 @@ DeviceProperties GetDeviceInfo(const string& device_str) {
   DeviceNameUtils::ParsedName parsed;
   if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
     if (parsed.type == "GPU") {
-      return GetLocalGPUInfo(parsed.id);
+      return GetLocalGPUInfo(TfGpuId(parsed.id));
     } else if (parsed.type == "CPU") {
       return GetLocalCPUInfo();
     }
-- 
GitLab


From 69fbb7717102dba11c471cd77088a5d6c1274b71 Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Sat, 10 Feb 2018 01:45:11 -0800
Subject: [PATCH 925/966] Do not convert layout for Select if condition input
 is of unknown shape.

PiperOrigin-RevId: 185242138
---
 .../grappler/optimizers/layout_optimizer.cc   | 11 ++++++-
 .../python/grappler/layout_optimizer_test.py  | 31 +++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index e1a2d65278..5a62b77327 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -1428,7 +1428,6 @@ class IdentityNProcessor : public AgnosticNodeProcessor {
   std::vector<int> GetInputPos() const override { return input_pos_; }
 
   std::set<int> GetOutputPos() const override {
-    std::vector<int> input_poses;
     std::set<int> output_pos{};
     for (const auto& input_pos : input_pos_) {
       output_pos.insert(input_pos);
@@ -1572,6 +1571,16 @@ class SelectProcessor : public AgnosticNodeProcessor {
       : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
+  bool ShouldProcess() const override {
+    auto input0 = node_map_->GetNode(node_->input(0));
+    int input0_port;
+    ParseNodeName(node_->input(0), &input0_port);
+    bool is_input0_scalar_vector_4d = IsPortDimsN(*input0, input0_port, 0) ||
+                                      IsPortDimsN(*input0, input0_port, 1) ||
+                                      IsPortDimsN(*input0, input0_port, 4);
+    return AgnosticNodeProcessor::ShouldProcess() && is_input0_scalar_vector_4d;
+  }
+
   std::vector<int> GetInputPos() const override {
     auto input0 = node_map_->GetNode(node_->input(0));
     int input0_port;
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 25b1cdcbc5..30dcdf31aa 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -771,6 +771,37 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('Select-0-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  def testSelectOpConditionUnknownShape(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      add = math_ops.add(conv, conv)
+      condition = array_ops.placeholder(dtype='bool')
+      select = gen_math_ops._select(condition, conv, add)
+      output = array_ops.identity(select)
+
+      condition_val = np.zeros((1, 7, 7, 64))
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={condition: condition_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={condition: condition_val})
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 3
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
   def testSelectOpScalarCondition(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
-- 
GitLab


From 00c135cc5f8e53316936fdad687d3b79f3561476 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 10 Feb 2018 03:47:15 -0800
Subject: [PATCH 926/966] Automated g4 rollback of changelist 185073515

PiperOrigin-RevId: 185246348
---
 .../kernel_tests/halton_sequence_test.py      | 101 ++--------
 .../python/ops/halton_sequence_impl.py        | 185 +++++-------------
 2 files changed, 67 insertions(+), 219 deletions(-)

diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py
index c516ce45e0..0a85862abf 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py
@@ -36,35 +36,29 @@ class HaltonSequenceTest(test.TestCase):
 
   def test_known_values_small_bases(self):
     with self.test_session():
-      # The first five elements of the non-randomized Halton sequence
-      # with base 2 and 3.
+      # The first five elements of the Halton sequence with base 2 and 3
       expected = np.array(((1. / 2, 1. / 3),
                            (1. / 4, 2. / 3),
                            (3. / 4, 1. / 9),
                            (1. / 8, 4. / 9),
                            (5. / 8, 7. / 9)), dtype=np.float32)
-      sample = halton.sample(2, num_results=5, randomized=False)
+      sample = halton.sample(2, num_samples=5)
       self.assertAllClose(expected, sample.eval(), rtol=1e-6)
 
-  def test_sequence_indices(self):
-    """Tests access of sequence elements by index."""
+  def test_sample_indices(self):
     with self.test_session():
       dim = 5
       indices = math_ops.range(10, dtype=dtypes.int32)
-      sample_direct = halton.sample(dim, num_results=10, randomized=False)
-      sample_from_indices = halton.sample(dim, sequence_indices=indices,
-                                          randomized=False)
+      sample_direct = halton.sample(dim, num_samples=10)
+      sample_from_indices = halton.sample(dim, sample_indices=indices)
       self.assertAllClose(sample_direct.eval(), sample_from_indices.eval(),
                           rtol=1e-6)
 
   def test_dtypes_works_correctly(self):
-    """Tests that all supported dtypes work without error."""
     with self.test_session():
       dim = 3
-      sample_float32 = halton.sample(dim, num_results=10, dtype=dtypes.float32,
-                                     seed=11)
-      sample_float64 = halton.sample(dim, num_results=10, dtype=dtypes.float64,
-                                     seed=21)
+      sample_float32 = halton.sample(dim, num_samples=10, dtype=dtypes.float32)
+      sample_float64 = halton.sample(dim, num_samples=10, dtype=dtypes.float64)
       self.assertEqual(sample_float32.eval().dtype, np.float32)
       self.assertEqual(sample_float64.eval().dtype, np.float64)
 
@@ -85,8 +79,7 @@ class HaltonSequenceTest(test.TestCase):
       p = normal_lib.Normal(loc=mu_p, scale=sigma_p)
       q = normal_lib.Normal(loc=mu_q, scale=sigma_q)
 
-      cdf_sample = halton.sample(2, num_results=n, dtype=dtypes.float64,
-                                 seed=1729)
+      cdf_sample = halton.sample(2, num_samples=n, dtype=dtypes.float64)
       q_sample = q.quantile(cdf_sample)
 
       # Compute E_p[X].
@@ -97,7 +90,7 @@ class HaltonSequenceTest(test.TestCase):
       # Compute E_p[X^2].
       e_x2 = mc.expectation_importance_sampler(
           f=math_ops.square, log_p=p.log_prob, sampling_dist_q=q, z=q_sample,
-          seed=1412)
+          seed=42)
 
       stddev = math_ops.sqrt(e_x2 - math_ops.square(e_x))
       # Keep the tolerance levels the same as in monte_carlo_test.py.
@@ -107,10 +100,10 @@ class HaltonSequenceTest(test.TestCase):
 
   def test_docstring_example(self):
     # Produce the first 1000 members of the Halton sequence in 3 dimensions.
-    num_results = 1000
+    num_samples = 1000
     dim = 3
     with self.test_session():
-      sample = halton.sample(dim, num_results=num_results, seed=127)
+      sample = halton.sample(dim, num_samples=num_samples)
 
       # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three dimensional
       # hypercube.
@@ -122,76 +115,16 @@ class HaltonSequenceTest(test.TestCase):
       # Produces a relative absolute error of 1.7%.
       self.assertAllClose(integral.eval(), true_value.eval(), rtol=0.02)
 
-      # Now skip the first 1000 samples and recompute the integral with the next
-      # thousand samples. The sequence_indices argument can be used to do this.
+    # Now skip the first 1000 samples and recompute the integral with the next
+    # thousand samples. The sample_indices argument can be used to do this.
 
-      sequence_indices = math_ops.range(start=1000, limit=1000 + num_results,
-                                        dtype=dtypes.int32)
-      sample_leaped = halton.sample(dim, sequence_indices=sequence_indices,
-                                    seed=111217)
+      sample_indices = math_ops.range(start=1000, limit=1000 + num_samples,
+                                      dtype=dtypes.int32)
+      sample_leaped = halton.sample(dim, sample_indices=sample_indices)
 
       integral_leaped = math_ops.reduce_mean(
           math_ops.reduce_prod(sample_leaped ** powers, axis=-1))
-      self.assertAllClose(integral_leaped.eval(), true_value.eval(), rtol=0.01)
-
-  def test_randomized_qmc_basic(self):
-    """Tests the randomization of the Halton sequences."""
-    # This test is identical to the example given in Owen (2017), Figure 5.
-
-    dim = 20
-    num_results = 5000
-    replica = 10
-
-    with self.test_session():
-      sample = halton.sample(dim, num_results=num_results, seed=121117)
-      f = math_ops.reduce_mean(math_ops.reduce_sum(sample, axis=1) ** 2)
-      values = [f.eval() for _ in range(replica)]
-      self.assertAllClose(np.mean(values), 101.6667, atol=np.std(values) * 2)
-
-  def test_partial_sum_func_qmc(self):
-    """Tests the QMC evaluation of (x_j + x_{j+1} ...+x_{n})^2.
-
-    A good test of QMC is provided by the function:
-
-      f(x_1,..x_n, x_{n+1}, ..., x_{n+m}) = (x_{n+1} + ... x_{n+m} - m / 2)^2
-
-    with the coordinates taking values in the unit interval. The mean and
-    variance of this function (with the uniform distribution over the
-    unit-hypercube) is exactly calculable:
-
-      <f> = m / 12, Var(f) = m (5m - 3) / 360
-
-    The purpose of the "shift" (if n > 0) in the coordinate dependence of the
-    function is to provide a test for Halton sequence which exhibit more
-    dependence in the higher axes.
-
-    This test confirms that the mean squared error of RQMC estimation falls
-    as O(N^(2-e)) for any e>0.
-    """
-
-    n, m = 10, 10
-    dim = n + m
-    num_results_lo, num_results_hi = 1000, 10000
-    replica = 20
-    true_mean = m / 12.
-
-    def func_estimate(x):
-      return math_ops.reduce_mean(
-          (math_ops.reduce_sum(x[:, -m:], axis=-1) - m / 2.0) ** 2)
-
-    with self.test_session():
-      sample_lo = halton.sample(dim, num_results=num_results_lo, seed=1925)
-      sample_hi = halton.sample(dim, num_results=num_results_hi, seed=898128)
-      f_lo, f_hi = func_estimate(sample_lo), func_estimate(sample_hi)
-
-      estimates = np.array([(f_lo.eval(), f_hi.eval()) for _ in range(replica)])
-      var_lo, var_hi = np.mean((estimates - true_mean) ** 2, axis=0)
-
-      # Expect that the variance scales as N^2 so var_hi / var_lo ~ k / 10^2
-      # with k a fudge factor accounting for the residual N dependence
-      # of the QMC error and the sampling error.
-      log_rel_err = np.log(100 * var_hi / var_lo)
-      self.assertAllClose(log_rel_err, 0.0, atol=1.2)
+      self.assertAllClose(integral_leaped.eval(), true_value.eval(), rtol=0.001)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py b/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py
index 57900d6818..8cabf18903 100644
--- a/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py
@@ -26,9 +26,8 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
+
 
 __all__ = [
     'sample',
@@ -40,45 +39,32 @@ __all__ = [
 _MAX_DIMENSION = 1000
 
 
-def sample(dim,
-           num_results=None,
-           sequence_indices=None,
-           dtype=None,
-           randomized=True,
-           seed=None,
-           name=None):
-  r"""Returns a sample from the `dim` dimensional Halton sequence.
+def sample(dim, num_samples=None, sample_indices=None, dtype=None, name=None):
+  r"""Returns a sample from the `m` dimensional Halton sequence.
 
   Warning: The sequence elements take values only between 0 and 1. Care must be
   taken to appropriately transform the domain of a function if it differs from
   the unit cube before evaluating integrals using Halton samples. It is also
-  important to remember that quasi-random numbers without randomization are not
-  a replacement for pseudo-random numbers in every context. Quasi random numbers
-  are completely deterministic and typically have significant negative
-  autocorrelation unless randomization is used.
+  important to remember that quasi-random numbers are not a replacement for
+  pseudo-random numbers in every context. Quasi random numbers are completely
+  deterministic and typically have significant negative autocorrelation (unless
+  randomized).
 
   Computes the members of the low discrepancy Halton sequence in dimension
-  `dim`. The `dim`-dimensional sequence takes values in the unit hypercube in
-  `dim` dimensions. Currently, only dimensions up to 1000 are supported. The
-  prime base for the k-th axes is the k-th prime starting from 2. For example,
-  if `dim` = 3, then the bases will be [2, 3, 5] respectively and the first
-  element of the non-randomized sequence will be: [0.5, 0.333, 0.2]. For a more
-  complete description of the Halton sequences see:
+  `dim`. The d-dimensional sequence takes values in the unit hypercube in d
+  dimensions. Currently, only dimensions up to 1000 are supported. The prime
+  base for the `k`-th axes is the k-th prime starting from 2. For example,
+  if dim = 3, then the bases will be [2, 3, 5] respectively and the first
+  element of the sequence will be: [0.5, 0.333, 0.2]. For a more complete
+  description of the Halton sequences see:
   https://en.wikipedia.org/wiki/Halton_sequence. For low discrepancy sequences
   and their applications see:
   https://en.wikipedia.org/wiki/Low-discrepancy_sequence.
 
-  If `randomized` is true, this function produces a scrambled version of the
-  Halton sequence introduced by Owen in arXiv:1706.02808. For the advantages of
-  randomization of low discrepancy sequences see:
-  https://en.wikipedia.org/wiki/Quasi-Monte_Carlo_method#Randomization_of_quasi-Monte_Carlo
-
-  The number of samples produced is controlled by the `num_results` and
-  `sequence_indices` parameters. The user must supply either `num_results` or
-  `sequence_indices` but not both.
+  The user must supply either `num_samples` or `sample_indices` but not both.
   The former is the number of samples to produce starting from the first
-  element. If `sequence_indices` is given instead, the specified elements of
-  the sequence are generated. For example, sequence_indices=tf.range(10) is
+  element. If `sample_indices` is given instead, the specified elements of
+  the sequence are generated. For example, sample_indices=tf.range(10) is
   equivalent to specifying n=10.
 
   Example Use:
@@ -87,9 +73,9 @@ def sample(dim,
   bf = tf.contrib.bayesflow
 
   # Produce the first 1000 members of the Halton sequence in 3 dimensions.
-  num_results = 1000
+  num_samples = 1000
   dim = 3
-  sample = bf.halton_sequence.sample(dim, num_results=num_results, seed=127)
+  sample = bf.halton_sequence.sample(dim, num_samples=num_samples)
 
   # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three dimensional
   # hypercube.
@@ -103,13 +89,12 @@ def sample(dim,
   print ("Estimated: %f, True Value: %f" % values)
 
   # Now skip the first 1000 samples and recompute the integral with the next
-  # thousand samples. The sequence_indices argument can be used to do this.
+  # thousand samples. The sample_indices argument can be used to do this.
 
 
-  sequence_indices = tf.range(start=1000, limit=1000 + num_results,
-                              dtype=tf.int32)
-  sample_leaped = halton.sample(dim, sequence_indices=sequence_indices,
-                                seed=111217)
+  sample_indices = tf.range(start=1000, limit=1000 + num_samples,
+                            dtype=tf.int32)
+  sample_leaped = halton.sample(dim, sample_indices=sample_indices)
 
   integral_leaped = tf.reduce_mean(tf.reduce_prod(sample_leaped ** powers,
                                                   axis=-1))
@@ -122,57 +107,51 @@ def sample(dim,
   Args:
     dim: Positive Python `int` representing each sample's `event_size.` Must
       not be greater than 1000.
-    num_results: (Optional) positive Python `int`. The number of samples to
-      generate. Either this parameter or sequence_indices must be specified but
+    num_samples: (Optional) positive Python `int`. The number of samples to
+      generate. Either this parameter or sample_indices must be specified but
       not both. If this parameter is None, then the behaviour is determined by
-      the `sequence_indices`.
-    sequence_indices: (Optional) `Tensor` of dtype int32 and rank 1. The
-      elements of the sequence to compute specified by their position in the
-      sequence. The entries index into the Halton sequence starting with 0 and
-      hence, must be whole numbers. For example, sequence_indices=[0, 5, 6] will
-      produce the first, sixth and seventh elements of the sequence. If this
-      parameter is None, then the `num_results` parameter must be specified
-      which gives the number of desired samples starting from the first sample.
+      the `sample_indices`.
+    sample_indices: (Optional) `Tensor` of dtype int32 and rank 1. The elements
+      of the sequence to compute specified by their position in the sequence.
+      The entries index into the Halton sequence starting with 0 and hence,
+      must be whole numbers. For example, sample_indices=[0, 5, 6] will produce
+      the first, sixth and seventh elements of the sequence. If this parameter
+      is None, then the `num_samples` parameter must be specified which gives
+      the number of desired samples starting from the first sample.
     dtype: (Optional) The dtype of the sample. One of `float32` or `float64`.
       Default is `float32`.
-    randomized: (Optional) bool indicating whether to produce a randomized
-      Halton sequence. If True, applies the randomization described in
-      Owen (2017) [arXiv:1706.02808].
-    seed: (Optional) Python integer to seed the random number generator. Only
-      used if `randomized` is True. If not supplied and `randomized` is True,
-      no seed is set.
     name:  (Optional) Python `str` describing ops managed by this function. If
     not supplied the name of this function is used.
 
   Returns:
     halton_elements: Elements of the Halton sequence. `Tensor` of supplied dtype
-    and `shape` `[num_results, dim]` if `num_results` was specified or shape
-    `[s, dim]` where s is the size of `sequence_indices` if `sequence_indices`
+    and `shape` `[num_samples, dim]` if `num_samples` was specified or shape
+    `[s, dim]` where s is the size of `sample_indices` if `sample_indices`
     were specified.
 
   Raises:
-    ValueError: if both `sequence_indices` and `num_results` were specified or
+    ValueError: if both `sample_indices` and `num_samples` were specified or
     if dimension `dim` is less than 1 or greater than 1000.
   """
   if dim < 1 or dim > _MAX_DIMENSION:
     raise ValueError(
         'Dimension must be between 1 and {}. Supplied {}'.format(_MAX_DIMENSION,
                                                                  dim))
-  if (num_results is None) == (sequence_indices is None):
-    raise ValueError('Either `num_results` or `sequence_indices` must be'
+  if (num_samples is None) == (sample_indices is None):
+    raise ValueError('Either `num_samples` or `sample_indices` must be'
                      ' specified but not both.')
 
   dtype = dtype or dtypes.float32
   if not dtype.is_floating:
     raise ValueError('dtype must be of `float`-type')
 
-  with ops.name_scope(name, 'sample', values=[sequence_indices]):
+  with ops.name_scope(name, 'sample', values=[sample_indices]):
     # Here and in the following, the shape layout is as follows:
     # [sample dimension, event dimension, coefficient dimension].
     # The coefficient dimension is an intermediate axes which will hold the
     # weights of the starting integer when expressed in the (prime) base for
     # an event dimension.
-    indices = _get_indices(num_results, sequence_indices, dtype)
+    indices = _get_indices(num_samples, sample_indices, dtype)
     radixes = array_ops.constant(_PRIMES[0:dim], dtype=dtype, shape=[dim, 1])
 
     max_sizes_by_axes = _base_expansion_size(math_ops.reduce_max(indices),
@@ -197,74 +176,11 @@ def sample(dim,
     weights = radixes ** capped_exponents
     coeffs = math_ops.floor_div(indices, weights)
     coeffs *= 1 - math_ops.cast(weight_mask, dtype)
-    coeffs %= radixes
-    if not randomized:
-      coeffs /= radixes
-      return math_ops.reduce_sum(coeffs / weights, axis=-1)
-    coeffs = _randomize(coeffs, radixes, seed=seed)
-    coeffs *= 1 - math_ops.cast(weight_mask, dtype)
-    coeffs /= radixes
-    base_values = math_ops.reduce_sum(coeffs / weights, axis=-1)
-
-    # The randomization used in Owen (2017) does not leave 0 invariant. While
-    # we have accounted for the randomization of the first `max_size_by_axes`
-    # coefficients, we still need to correct for the trailing zeros. Luckily,
-    # this is equivalent to adding a uniform random value scaled so the first
-    # `max_size_by_axes` coefficients are zero. The following statements perform
-    # this correction.
-    zero_correction = random_ops.random_uniform([dim, 1], seed=seed,
-                                                dtype=dtype)
-    zero_correction /= (radixes ** max_sizes_by_axes)
-    return base_values + array_ops.reshape(zero_correction, [-1])
-
-
-def _randomize(coeffs, radixes, seed=None):
-  """Applies the Owen randomization to the coefficients."""
-  given_dtype = coeffs.dtype
-  coeffs = math_ops.to_int32(coeffs)
-  num_coeffs = array_ops.shape(coeffs)[-1]
-  radixes = array_ops.reshape(math_ops.to_int32(radixes), [-1])
-  perms = _get_permutations(num_coeffs, radixes, seed=seed)
-  perms = array_ops.reshape(perms, [-1])
-  radix_sum = math_ops.reduce_sum(radixes)
-  radix_offsets = array_ops.reshape(math_ops.cumsum(radixes, exclusive=True),
-                                    [-1, 1])
-  offsets = radix_offsets + math_ops.range(num_coeffs) * radix_sum
-  permuted_coeffs = array_ops.gather(perms, coeffs + offsets)
-  return math_ops.cast(permuted_coeffs, dtype=given_dtype)
-
-
-def _get_permutations(num_results, dims, seed=None):
-  """Uniform iid sample from the space of permutations.
-
-  Draws a sample of size `num_results` from the group of permutations of degrees
-  specified by the `dims` tensor. These are packed together into one tensor
-  such that each row is one sample from each of the dimensions in `dims`. For
-  example, if dims = [2,3] and num_results = 2, the result is a tensor of shape
-  [2, 2 + 3] and the first row of the result might look like:
-  [1, 0, 2, 0, 1]. The first two elements are a permutation over 2 elements
-  while the next three are a permutation over 3 elements.
+    coeffs = (coeffs % radixes) / radixes
+    return math_ops.reduce_sum(coeffs / weights, axis=-1)
 
-  Args:
-    num_results: A positive scalar `Tensor` of integral type. The number of
-      draws from the discrete uniform distribution over the permutation groups.
-    dims: A 1D `Tensor` of the same dtype as `num_results`. The degree of the
-      permutation groups from which to sample.
-    seed: (Optional) Python integer to seed the random number generator.
 
-  Returns:
-    permutations: A `Tensor` of shape `[num_results, sum(dims)]` and the same
-    dtype as `dims`.
-  """
-  sample_range = math_ops.range(num_results)
-  def generate_one(d):
-    fn = lambda _: random_ops.random_shuffle(math_ops.range(d), seed=seed)
-    return functional_ops.map_fn(fn, sample_range)
-  return array_ops.concat([generate_one(d) for d in array_ops.unstack(dims)],
-                          axis=-1)
-
-
-def _get_indices(n, sequence_indices, dtype, name=None):
+def _get_indices(n, sample_indices, dtype, name=None):
   """Generates starting points for the Halton sequence procedure.
 
   The k'th element of the sequence is generated starting from a positive integer
@@ -275,10 +191,10 @@ def _get_indices(n, sequence_indices, dtype, name=None):
 
   Args:
     n: Positive `int`. The number of samples to generate. If this
-      parameter is supplied, then `sequence_indices` should be None.
-    sequence_indices: `Tensor` of dtype int32 and rank 1. The entries
+      parameter is supplied, then `sample_indices` should be None.
+    sample_indices: `Tensor` of dtype int32 and rank 1. The entries
       index into the Halton sequence starting with 0 and hence, must be whole
-      numbers. For example, sequence_indices=[0, 5, 6] will produce the first,
+      numbers. For example, sample_indices=[0, 5, 6] will produce the first,
       sixth and seventh elements of the sequence. If this parameter is not None
       then `n` must be None.
     dtype: The dtype of the sample. One of `float32` or `float64`.
@@ -288,14 +204,14 @@ def _get_indices(n, sequence_indices, dtype, name=None):
   Returns:
     indices: `Tensor` of dtype `dtype` and shape = `[n, 1, 1]`.
   """
-  with ops.name_scope(name, '_get_indices', [n, sequence_indices]):
-    if sequence_indices is None:
-      sequence_indices = math_ops.range(n, dtype=dtype)
+  with ops.name_scope(name, 'get_indices', [n, sample_indices]):
+    if sample_indices is None:
+      sample_indices = math_ops.range(n, dtype=dtype)
     else:
-      sequence_indices = math_ops.cast(sequence_indices, dtype)
+      sample_indices = math_ops.cast(sample_indices, dtype)
 
     # Shift the indices so they are 1 based.
-    indices = sequence_indices + 1
+    indices = sample_indices + 1
 
     # Reshape to make space for the event dimension and the place value
     # coefficients.
@@ -345,5 +261,4 @@ def _primes_less_than(n):
 
 _PRIMES = _primes_less_than(7919+1)
 
-
 assert len(_PRIMES) == _MAX_DIMENSION
-- 
GitLab


From 717869253945f1e55cd7afdbaa864fac25f43c5a Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 10 Feb 2018 04:00:39 -0800
Subject: [PATCH 927/966] Improve shape function of NonMaxSuppression (#16890)

* Improve shape function of NonMaxSuppression

In the docs for `tf.image.non_max_suppression`, the shapes
of the args `boxes` and `scores` are `[num_boxes, 4]` and `[num_boxes]`
respectively.

This fix improve the shape function of NonMaxSuppression so that
`boxes_shape[0] = scores_shape[0] = num_boxes`.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add additional test case for shape function of NonMaxSuppression

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/image_ops.cc        | 8 ++++++++
 tensorflow/python/ops/image_ops_test.py | 9 +++++++++
 2 files changed, 17 insertions(+)

diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 4c05b274fe..c3b08e067a 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -619,6 +619,10 @@ REGISTER_OP("NonMaxSuppression")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
       // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
       DimensionHandle unused;
+      // The boxes[0] and scores[0] are both num_boxes.
+      TF_RETURN_IF_ERROR(
+          c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
+      // The boxes[1] is 4.
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
 
       c->set_output(0, c->Vector(c->UnknownDim()));
@@ -643,6 +647,10 @@ REGISTER_OP("NonMaxSuppressionV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
       // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
       DimensionHandle unused;
+      // The boxes[0] and scores[0] are both num_boxes.
+      TF_RETURN_IF_ERROR(
+          c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
+      // The boxes[1] is 4.
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
 
       c->set_output(0, c->Vector(c->UnknownDim()));
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index e49af6cd92..91a7437652 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -3171,6 +3171,15 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
       scores = constant_op.constant([0.9])
       image_ops.non_max_suppression(boxes, scores, 3, 0.5)
 
+    # The boxes is of shape [num_boxes, 4], and the scores is
+    # of shape [num_boxes]. So an error will thrown.
+    with self.assertRaisesRegexp(
+        ValueError, 'Dimensions must be equal, but are 1 and 2'):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([0.9, 0.75])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, 3, 0.5)
+
     # The scores should be 1D of shape [num_boxes].
     with self.assertRaisesRegexp(ValueError,
                                  "Shape must be rank 1 but is rank 2"):
-- 
GitLab


From f669885e1c135b1e5ad7b2e936083860a84b0aea Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 10 Feb 2018 11:22:55 -0800
Subject: [PATCH 928/966] Add python/util/is_in_graph_mode.py

PiperOrigin-RevId: 185260675
---
 tensorflow/python/eager/context.py         |  8 ++++++++
 tensorflow/python/util/deprecation.py      |  4 ++--
 tensorflow/python/util/is_in_graph_mode.py | 22 ++++++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/python/util/is_in_graph_mode.py

diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index b6c7d82323..0e9c21b221 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
+from tensorflow.python.util import is_in_graph_mode
 from tensorflow.python.util import tf_contextlib
 
 GRAPH_MODE = 0
@@ -599,3 +600,10 @@ def export_run_metadata():
     A RunMetadata protocol buffer.
   """
   return context().export_run_metadata()
+
+
+# Not every user creates a Context via context.context()
+# (for example, enable_eager_execution in python/framework/ops.py),
+# but they do all import this file.  Note that IS_IN_GRAPH_MODE and
+# in_graph_mode are both parameterless functions.
+is_in_graph_mode.IS_IN_GRAPH_MODE = in_graph_mode
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index fbec8fd2d8..376be39978 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -22,9 +22,9 @@ import collections
 import functools
 import re
 
-from tensorflow.python.eager import context
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import decorator_utils
+from tensorflow.python.util import is_in_graph_mode
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
@@ -400,7 +400,7 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples,
       """Deprecation wrapper."""
       # TODO(apassos) figure out a way to have reasonable performance with
       # deprecation warnings and eager mode.
-      if context.in_graph_mode() and _PRINT_DEPRECATION_WARNINGS:
+      if is_in_graph_mode.IS_IN_GRAPH_MODE() and _PRINT_DEPRECATION_WARNINGS:
         invalid_args = []
         named_args = tf_inspect.getcallargs(func, *args, **kwargs)
         for arg_name, spec in iter(deprecated_positions.items()):
diff --git a/tensorflow/python/util/is_in_graph_mode.py b/tensorflow/python/util/is_in_graph_mode.py
new file mode 100644
index 0000000000..9ae89ecb71
--- /dev/null
+++ b/tensorflow/python/util/is_in_graph_mode.py
@@ -0,0 +1,22 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A function that tells you if the program is running in graph mode."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Call IS_IN_GRAPH_MODE() when you want to know whether the thread is in
+# graph mode.  By default, we always are.
+IS_IN_GRAPH_MODE = lambda: True
-- 
GitLab


From 45fae93d626e41c17fc988b88de0e2721771d222 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 10 Feb 2018 12:45:12 -0800
Subject: [PATCH 929/966] Getting rid of unnecessary GPUDevice typedef. Passing
 DepthwiseArgs by reference in host code.

PiperOrigin-RevId: 185263307
---
 tensorflow/core/kernels/depthwise_conv_op.h   |  2 +-
 .../core/kernels/depthwise_conv_op_gpu.cu.cc  | 70 ++++++++++---------
 2 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/tensorflow/core/kernels/depthwise_conv_op.h b/tensorflow/core/kernels/depthwise_conv_op.h
index ba262d56ee..b2d5898891 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.h
+++ b/tensorflow/core/kernels/depthwise_conv_op.h
@@ -83,7 +83,7 @@ struct LaunchDepthwiseConvBackpropFilterOp {
 #if GOOGLE_CUDA
 template <typename T>
 struct LaunchDepthwiseConvOp<Eigen::GpuDevice, T> {
-  void operator()(OpKernelContext* ctx, const DepthwiseArgs args,
+  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                   const T* input, const T* filter, T* output,
                   TensorFormat data_format);
 };
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 126b64f73d..1e9345828a 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -34,13 +34,12 @@ limitations under the License.
 
 namespace tensorflow {
 
-typedef Eigen::GpuDevice GPUDevice;
 using Eigen::GpuDevice;
 
 // Returns whether depthwise convolution forward or backward input pass can be
 // performed using the faster ('Small') variant of the kernel.
 EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
-    const DepthwiseArgs args) {
+    const DepthwiseArgs& args) {
   return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
          args.in_cols <= 32 && args.in_rows == args.out_rows &&
          args.in_cols == args.out_cols && args.pad_rows >= 0 &&
@@ -53,7 +52,7 @@ EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
 // Returns whether depthwise convolution backward filter pass can be performed
 // using the faster ('Small') variant of the kernel.
 EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(
-    const DepthwiseArgs args, const int block_rows) {
+    const DepthwiseArgs& args, const int block_rows) {
   return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
          args.in_cols <= 32 && args.in_rows == args.out_rows &&
          args.in_cols == args.out_cols && args.pad_rows >= 0 &&
@@ -565,8 +564,9 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
 template <typename T, DepthwiseConv2dDirection kDirection,
           int kKnownFilterWidth, int kKnownFilterHeight, int kBlockSlices,
           bool kKnownEvenRows>
-void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
-                                   const T* input, const T* filter, T* output,
+void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d,
+                                   const DepthwiseArgs& args, const T* input,
+                                   const T* filter, T* output,
                                    TensorFormat data_format) {
   const int block_rows = (args.in_rows + 1) / 2;
   dim3 block_dim;
@@ -602,8 +602,9 @@ void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
 
 template <typename T, DepthwiseConv2dDirection kDirection,
           int kKnownFilterWidth, int kKnownFilterHeight, int kBlockSlices>
-void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
-                                   const T* input, const T* filter, T* output,
+void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d,
+                                   const DepthwiseArgs& args, const T* input,
+                                   const T* filter, T* output,
                                    TensorFormat data_format) {
   if (args.in_rows & 1) {
     LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
@@ -618,8 +619,9 @@ void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
 
 template <typename T, DepthwiseConv2dDirection kDirection,
           int kKnownFilterWidth, int kKnownFilterHeight>
-void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
-                                   const T* input, const T* filter, T* output,
+void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d,
+                                   const DepthwiseArgs& args, const T* input,
+                                   const T* filter, T* output,
                                    TensorFormat data_format) {
   // Maximize (power of two) kBlockSlices while keeping a block within 1024
   // threads (2 pixels per thread).
@@ -641,7 +643,7 @@ void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
-void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
+void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs& args,
                               const T* input, const T* filter, T* output,
                               TensorFormat data_format) {
   void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int);
@@ -671,7 +673,7 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
-void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
+void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs& args,
                               const T* input, const T* filter, T* output,
                               TensorFormat data_format) {
   if (args.depth_multiplier == 1) {
@@ -692,12 +694,12 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
 
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
-void LaunchDepthwiseConvOp<GPUDevice, T>::operator()(OpKernelContext* ctx,
-                                                     const DepthwiseArgs args,
+void LaunchDepthwiseConvOp<GpuDevice, T>::operator()(OpKernelContext* ctx,
+                                                     const DepthwiseArgs& args,
                                                      const T* input,
                                                      const T* filter, T* output,
                                                      TensorFormat data_format) {
-  const GPUDevice& d = ctx->eigen_device<GPUDevice>();
+  const GpuDevice& d = ctx->eigen_device<GpuDevice>();
   if (args.filter_rows == 3 && args.filter_cols == 3) {
     LaunchDepthwiseConv2dGPU<T, 3, 3>(d, args, input, filter, output,
                                       data_format);
@@ -711,9 +713,9 @@ void LaunchDepthwiseConvOp<GPUDevice, T>::operator()(OpKernelContext* ctx,
                   "Launch of gpu kernel for DepthwiseConv2dGPULaunch failed"));
 }
 
-template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
-template struct LaunchDepthwiseConvOp<GPUDevice, float>;
-template struct LaunchDepthwiseConvOp<GPUDevice, double>;
+template struct LaunchDepthwiseConvOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvOp<GpuDevice, double>;
 
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
@@ -854,7 +856,7 @@ __global__ void __launch_bounds__(640, 2)
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
 void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
-                                           const DepthwiseArgs args,
+                                           const DepthwiseArgs& args,
                                            const T* out_backprop,
                                            const T* filter, T* in_backprop,
                                            TensorFormat data_format) {
@@ -879,7 +881,7 @@ void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
 void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
-                                           const DepthwiseArgs args,
+                                           const DepthwiseArgs& args,
                                            const T* out_backprop,
                                            const T* filter, T* in_backprop,
                                            TensorFormat data_format) {
@@ -903,10 +905,10 @@ void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
 
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
-void LaunchDepthwiseConvBackpropInputOp<GPUDevice, T>::operator()(
+void LaunchDepthwiseConvBackpropInputOp<GpuDevice, T>::operator()(
     OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop,
     const T* filter, T* in_backprop, TensorFormat data_format) {
-  const GPUDevice& d = ctx->eigen_device<GPUDevice>();
+  const GpuDevice& d = ctx->eigen_device<GpuDevice>();
   if (args.filter_rows == 3 && args.filter_cols == 3) {
     LaunchDepthwiseConv2dBackpropInputGPU<T, 3, 3>(
         d, args, out_backprop, filter, in_backprop, data_format);
@@ -921,9 +923,9 @@ void LaunchDepthwiseConvBackpropInputOp<GPUDevice, T>::operator()(
                                "utGPULaunch failed"));
 }
 
-template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, Eigen::half>;
-template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
-template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, double>;
 
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
@@ -1450,7 +1452,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kBlockSlices, int kAccumPixels>
 bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
-    const GpuDevice& d, const DepthwiseArgs args, const int block_rows,
+    const GpuDevice& d, const DepthwiseArgs& args, const int block_rows,
     const T* out_backprop, const T* input, T* filter_backprop,
     TensorFormat data_format) {
   const int tile_cols = args.in_cols + args.filter_cols - 1;
@@ -1490,7 +1492,7 @@ bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kBlockSlices>
 bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
-    const GpuDevice& d, const DepthwiseArgs args, const int block_rows,
+    const GpuDevice& d, const DepthwiseArgs& args, const int block_rows,
     const T* out_backprop, const T* input, T* filter_backprop,
     TensorFormat data_format) {
   // Minimize (power of two) kAccumPixels, while satisfying
@@ -1513,7 +1515,7 @@ bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
 bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
-    const GpuDevice& d, const DepthwiseArgs args, const T* out_backprop,
+    const GpuDevice& d, const DepthwiseArgs& args, const T* out_backprop,
     const T* input, T* filter_backprop, TensorFormat data_format) {
   // Maximize (power of two) kBlockSlices while keeping a block within 1024
   // threads (2 pixels per thread).
@@ -1560,7 +1562,7 @@ bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
 void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
-                                            const DepthwiseArgs args,
+                                            const DepthwiseArgs& args,
                                             const T* out_backprop,
                                             const T* input, T* filter_backprop,
                                             TensorFormat data_format) {
@@ -1585,7 +1587,7 @@ void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
 void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
-                                            const DepthwiseArgs args,
+                                            const DepthwiseArgs& args,
                                             const T* out_backprop,
                                             const T* input, T* filter_backprop,
                                             TensorFormat data_format) {
@@ -1608,10 +1610,10 @@ void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
 
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
-void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()(
+void LaunchDepthwiseConvBackpropFilterOp<GpuDevice, T>::operator()(
     OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop,
     const T* input, T* filter_backprop, TensorFormat data_format) {
-  const GPUDevice& d = ctx->eigen_device<GPUDevice>();
+  const GpuDevice& d = ctx->eigen_device<GpuDevice>();
   auto stream = ctx->op_device_context()->stream();
 
   // Initialize the results to 0.
@@ -1634,8 +1636,8 @@ void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()(
                                "terGPULaunch failed"));
 }
 
-template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, Eigen::half>;
-template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
-template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, double>;
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
-- 
GitLab


From f114d0b9f50ab9ce0e723d18f1c467079ece87b7 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 10 Feb 2018 19:14:41 -0800
Subject: [PATCH 930/966] Add S3 plugin to the list of file system plugin in
 doc (add_filesys.md) (#16920)

This fix adds S3 plugin to the list of file system plugin in
doc (add_filesys.md).

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/docs_src/extend/add_filesys.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/docs_src/extend/add_filesys.md b/tensorflow/docs_src/extend/add_filesys.md
index f0591b7b7d..06f11de4eb 100644
--- a/tensorflow/docs_src/extend/add_filesys.md
+++ b/tensorflow/docs_src/extend/add_filesys.md
@@ -81,6 +81,8 @@ filesystem implementations call their existing libraries. Examples include:
     plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hadoop/hadoop_file_system.h)
 *   [GCS
     plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/cloud/gcs_file_system.h)
+*   [S3
+    plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/s3/s3_file_system.h)
 
 #### The File interfaces
 
-- 
GitLab


From d438afa1c468afb3cc991bfad89c2844637cbf3b Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 10 Feb 2018 19:15:12 -0800
Subject: [PATCH 931/966] Fix the profiler python docstring link (#16916)

This fix fixes the python docstring link of the profiler:
`profilerg3doc` -> `profiler/g3doc`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/profiler/option_builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py
index 957ebe6ddd..2ad7adf769 100644
--- a/tensorflow/python/profiler/option_builder.py
+++ b/tensorflow/python/profiler/option_builder.py
@@ -300,7 +300,7 @@ class ProfileOptionBuilder(object):
     # pylint: disable=line-too-long
     """Only show profiler nodes consuming no less than 'min_float_ops'.
 
-    Please see https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profilerg3doc/profile_model_architecture.md
+    Please see https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/g3doc/profile_model_architecture.md
     on the caveats of calculating float operations.
 
     Args:
-- 
GitLab


From 1c10d35b0c492dcce798688819e7eeb9f5d58f87 Mon Sep 17 00:00:00 2001
From: Loo Rong Jie <loorongjie@gmail.com>
Date: Sun, 11 Feb 2018 11:16:47 +0800
Subject: [PATCH 932/966] [MSVC] Workaround MSVC template/lambda parsing bug
 (#16904)

---
 tensorflow/compiler/xla/literal_util.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 09db011719..e0a9b148b4 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -234,7 +234,8 @@ Status Literal::CopySliceFromInternal(
       int64 src_index = linear_index(src_literal.shape(), src_indexes);
       int64 dest_index = linear_index(shape(), dest_indexes);
 
-      StridedCopy(data<NativeT>(), dest_index, stride_config.dest_stride,
+      // `this->` is needed to workaround MSVC bug: #16882
+      StridedCopy(this->data<NativeT>(), dest_index, stride_config.dest_stride,
                   src_literal.data<NativeT>(), src_index,
                   stride_config.source_stride, stride_config.minor_loop_size);
       return true;
-- 
GitLab


From 84c355be0013eed01d2d8ccd139fa2d4bbf902ce Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Sun, 11 Feb 2018 12:17:00 +0900
Subject: [PATCH 933/966] Fix typo (#16908)

* fit typo

* fix typo
---
 tensorflow/c/c_api_test.cc                  | 2 +-
 tensorflow/core/common_runtime/gpu/gpu_id.h | 2 +-
 tensorflow/core/protobuf/config.proto       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 01954eb235..8a2a743958 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -1956,7 +1956,7 @@ TEST_F(CApiAttributesTest, Tensor) {
 }
 
 TEST_F(CApiAttributesTest, StringTensor) {
-  // Create the string-Tensor "atttribute" value.
+  // Create the string-Tensor "attribute" value.
   char encoded[] = {
       0,   0, 0, 0, 0, 0, 0, 0,  // array[uint64] offsets
       1,                         // varint encoded string length
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id.h b/tensorflow/core/common_runtime/gpu/gpu_id.h
index 4e9c4abce1..2a6caea296 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_id.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_id.h
@@ -40,7 +40,7 @@ namespace tensorflow {
 //   a BaseGPUDevice. Note that the configuration allows us to create multiple
 //   BaseGPUDevice per GPU hardware in order to use multi CUDA streams on the
 //   hardware, so the mapping between TF GPU id and CUDA GPU id is not a 1:1
-//   mappping, see the example below.
+//   mapping, see the example below.
 //
 // For example, assuming that in the machine we have GPU device with index 0, 1,
 // 2 and 3 (physical GPU id). Setting "CUDA_VISIBLE_DEVICES=1,2,3" will create
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index ccab69b9c0..3606c5f127 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -387,7 +387,7 @@ message RunOptions {
   // EXPERIMENTAL.  Options used to initialize DebuggerState, if enabled.
   DebugOptions debug_options = 6;
 
-  // When enabled, causes tensor alllocation information to be included in
+  // When enabled, causes tensor allocation information to be included in
   // the error message when the Run() call fails because the allocator ran
   // out of memory (OOM).
   //
-- 
GitLab


From 3378865c5509dfb6d18e6f95f28757437f67d3da Mon Sep 17 00:00:00 2001
From: DylanDmitri <d.dylan.g@gmail.com>
Date: Sat, 10 Feb 2018 21:17:13 -0600
Subject: [PATCH 934/966] typo fix (#16903)

Copied the new description from docstring on line 37. Used the phrase "spectogram timeslice" rather than "frequency window" for consistency with the tooltip on ```--window_size_ms```.
---
 tensorflow/examples/speech_commands/train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index a4e80041f8..f084931215 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -357,12 +357,12 @@ if __name__ == '__main__':
       '--window_size_ms',
       type=float,
       default=30.0,
-      help='How long each spectrogram timeslice is',)
+      help='How long each spectrogram timeslice is.',)
   parser.add_argument(
       '--window_stride_ms',
       type=float,
       default=10.0,
-      help='How long each spectrogram timeslice is',)
+      help='How far to move in time between spectogram timeslices.',)
   parser.add_argument(
       '--dct_coefficient_count',
       type=int,
-- 
GitLab


From 16ca0705376b6ba8952ac262569a5f3b162d3af6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 10 Feb 2018 20:48:19 -0800
Subject: [PATCH 935/966] Add support for kConditional to the module group
 scheduler.

PiperOrigin-RevId: 185279412
---
 tensorflow/compiler/xla/map_util.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tensorflow/compiler/xla/map_util.h b/tensorflow/compiler/xla/map_util.h
index 0ad0b91330..8db8c6f3de 100644
--- a/tensorflow/compiler/xla/map_util.h
+++ b/tensorflow/compiler/xla/map_util.h
@@ -65,6 +65,25 @@ MaybeFind(const Collection& collection,
   return {it->second};
 }
 
+// Returns a const reference to the value associated with the given key if it
+// exists, otherwise returns a const reference to the provided default value.
+//
+// WARNING: If a temporary object is passed as the default "value,"
+// this function will return a reference to that temporary object,
+// which will be destroyed at the end of the statement. A common
+// example: if you have a map with string values, and you pass a char*
+// as the default "value," either use the returned value immediately
+// or store it in a string (not string&).
+template <class Collection>
+const typename Collection::value_type::second_type& FindOrDefault(
+    const Collection& collection,
+    const typename Collection::value_type::first_type& key,
+    const typename Collection::value_type::second_type& value) {
+  auto it = collection.find(key);
+  if (it != collection.end()) return it->second;
+  return value;
+}
+
 // Inserts the key-value pair into the collection. Dies if key was already
 // present.
 template <class Collection>
-- 
GitLab


From cf4d066720bbf9c3c79aa62d9b1057939af5ff63 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Sat, 10 Feb 2018 22:44:01 -0800
Subject: [PATCH 936/966] Fix python lint errors internally. One important
 change is to rename CreateInferenceGraph to create_inference_graph.

---
 tensorflow/contrib/tensorrt/README.md         |  2 +-
 tensorflow/contrib/tensorrt/__init__.py       |  4 +++
 .../contrib/tensorrt/convert/convert_nodes.cc |  1 +
 .../contrib/tensorrt/python/__init__.py       | 22 ++++++++++++--
 .../tensorrt/python/ops/trt_engine_op.py      |  1 +
 .../contrib/tensorrt/python/trt_convert.py    | 30 ++++++++++++-------
 .../contrib/tensorrt/segment/segment_test.cc  |  2 +-
 .../contrib/tensorrt/test/test_tftrt.py       |  4 +--
 8 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index 1e9524c26b..dfcce0fd00 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -29,7 +29,7 @@ import tensorflow as tf
 import tensorflow.contrib.tensorrt as trt
 #... create and train or load model
 gdef = sess.graph.as_graph_def()
-trt_gdef = trt.CreateInferenceGraph(
+trt_gdef = trt.create_inference_graph(
     gdef, #original graph_def
     ["output"], #name of output node(s)
     max_batch_size, #maximum batch size to run the inference
diff --git a/tensorflow/contrib/tensorrt/__init__.py b/tensorflow/contrib/tensorrt/__init__.py
index 5072ab1196..fd551d70b4 100644
--- a/tensorflow/contrib/tensorrt/__init__.py
+++ b/tensorflow/contrib/tensorrt/__init__.py
@@ -12,8 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
+"""Exposes the python wrapper for TensorRT graph transforms."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.tensorrt.python import *
+# pylint: enable=unused-import,wildcard-import
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 5c22c6265e..9ee717dd7f 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"  // NOLINT
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
diff --git a/tensorflow/contrib/tensorrt/python/__init__.py b/tensorflow/contrib/tensorrt/python/__init__.py
index 4aeea48515..7e050a768c 100644
--- a/tensorflow/contrib/tensorrt/python/__init__.py
+++ b/tensorflow/contrib/tensorrt/python/__init__.py
@@ -1,8 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Exposes the python wrapper for TensorRT graph transforms."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,wildcard-import
+# pylint: disable=unused-import,line-too-long
 from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
-from tensorflow.contrib.tensorrt.python.trt_convert import CreateInferenceGraph
-# pylint: enable=unused-import,wildcard-import
+from tensorflow.contrib.tensorrt.python.trt_convert import create_inference_graph
+# pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py b/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
index 97db23797f..31a313182b 100644
--- a/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
+++ b/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
+"""Exposes the Python wrapper of TRTEngineOp."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 5161831282..69bbf451e0 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -13,24 +13,26 @@
 # limitations under the License.
 # =============================================================================
 """Exposes the Python wrapper conversion to trt_graph."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,wildcard-import, line-too-long
+# pylint: disable=unused-import,line-too-long
+import six as _six
+from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import errors_impl as _impl
-from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert
 from tensorflow.python.framework import ops
-import six as _six
+
 
 # TODO(skama): get outputs from session when implemented as c++
 # optimization pass
-def CreateInferenceGraph(input_graph_def,
-                         outputs,
-                         max_batch_size=1,
-                         max_workspace_size_bytes=2 << 20):
+def create_inference_graph(input_graph_def,
+                           outputs,
+                           max_batch_size=1,
+                           max_workspace_size_bytes=2 << 20):
   """Python wrapper for the TRT transormation.
 
 
@@ -42,11 +44,17 @@ def CreateInferenceGraph(input_graph_def,
 
   Returns:
     New GraphDef with TRTEngineOps placed in graph replacing subgraphs.
+
+  Raises:
+    RuntimeError: if the returned status message is malformed.
   """
+
   def py2bytes(inp):
     return inp
+
   def py3bytes(inp):
-    return inp.encode('utf-8', errors='surrogateescape')
+    return inp.encode("utf-8", errors="surrogateescape")
+
   if _six.PY2:
     to_bytes = py2bytes
   else:
@@ -70,16 +78,18 @@ def CreateInferenceGraph(input_graph_def,
                     max_workspace_size_bytes)
   status = out[0]
   output_graph_def_string = to_bytes(out[1])
-  del input_graph_def_str  #save some memory
+  del input_graph_def_str  # Save some memory
   if len(status) < 2:
     raise _impl.UnknownError(None, None, status)
   if status[:2] != "OK":
     msg = status.split(";")
     if len(msg) == 1:
       raise RuntimeError("Status message is malformed {}".format(status))
+    # pylint: disable=protected-access
     raise _impl._make_specific_exception(None, None, ";".join(msg[1:]),
                                          int(msg[0]))
+    # pylint: enable=protected-access
   output_graph_def = graph_pb2.GraphDef()
   output_graph_def.ParseFromString(output_graph_def_string)
-  del output_graph_def_string  #save some memory
+  del output_graph_def_string  # Save some memory
   return output_graph_def
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index d7e10c123d..93c113e6ea 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/c/c_api.h"
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorflow/c/c_api.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index ad7a85cf5a..927a3e4719 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -63,8 +63,8 @@ if "__main__" in __name__:
   inpDims = (100, 24, 24, 2)
   dummy_input = np.random.random_sample(inpDims)
   gdef = getSimpleGraphDef()
-  trt_graph = trt.CreateInferenceGraph(gdef, ["output"],
-                                       inpDims[0])  # Get optimized graph
+  trt_graph = trt.create_inference_graph(gdef, ["output"],
+                                         inpDims[0])  # Get optimized graph
   o1 = runGraph(gdef, dummy_input)
   o2 = runGraph(trt_graph, dummy_input)
   assert (np.array_equal(o1, o2))
-- 
GitLab


From c2a3935e2bd27d0befa7db5f9c050cfec057e5bb Mon Sep 17 00:00:00 2001
From: Loo Rong Jie <loorongjie@gmail.com>
Date: Sun, 11 Feb 2018 16:41:35 +0800
Subject: [PATCH 937/966] [MSVC] Use explicit func pointer to static method
 instead of lambda func

---
 tensorflow/compiler/xla/service/hlo_instruction.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 50931c563a..3170746157 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -571,12 +571,9 @@ class HloInstruction {
     if (opcode() != other.opcode()) {
       return false;
     }
-    auto eq_shapes = layout_sensitive
-                         ? [](const Shape& a,
-                              const Shape& b) { return ShapeUtil::Equal(a, b); }
-                         : [](const Shape& a, const Shape& b) {
-                             return ShapeUtil::Compatible(a, b);
-                           };
+    using EqShapeFuncType = bool (*)(const Shape&, const Shape&);
+    EqShapeFuncType eq_shapes =
+        layout_sensitive ? ShapeUtil::Equal : ShapeUtil::Compatible;
     if (!eq_shapes(shape(), other.shape())) {
       return false;
     }
-- 
GitLab


From 695ae5a3de96069c58a4041faa05fc4f68825035 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 11 Feb 2018 03:44:24 -0800
Subject: [PATCH 938/966] Disable flaky halton_sequence_test

PiperOrigin-RevId: 185294455
---
 tensorflow/contrib/bayesflow/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index 8c856bb0b7..74712aeb67 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -195,6 +195,7 @@ cuda_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
+    tags = ["no_mac"],  # b/73192243
 )
 
 cuda_py_test(
-- 
GitLab


From b1a934877a5e04b695f8bc15f3d723891abe9c89 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Sun, 11 Feb 2018 13:01:30 -0800
Subject: [PATCH 939/966] Fix segment_test and run clang-format on
 trt_conversion.i.

---
 .../contrib/tensorrt/segment/segment_test.cc  |  9 ++++
 tensorflow/contrib/tensorrt/trt_conversion.i  | 52 +++++++++----------
 2 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index 93c113e6ea..74cbc5f2b3 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -117,6 +117,7 @@ TEST_F(SegmentTest, Empty) {
 
   // Expect no segments/subgraphs.
   EXPECT_TRUE(segments.empty());
+  TF_DeleteGraph(graph);
 }
 
 TEST_F(SegmentTest, Simple) {
@@ -167,6 +168,8 @@ TEST_F(SegmentTest, Simple) {
     EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
         << "Missing expected node " << ex;
   }
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
 }
 
 TEST_F(SegmentTest, AvoidCycle) {
@@ -214,6 +217,8 @@ TEST_F(SegmentTest, AvoidCycle) {
 
   // Expect no subgraphs
   EXPECT_EQ(segments.size(), 0);
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
 }
 
 TEST_F(SegmentTest, Multiple) {
@@ -282,6 +287,8 @@ TEST_F(SegmentTest, Multiple) {
     EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
         << "Missing expected node " << ex;
   }
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
 }
 
 TEST_F(SegmentTest, BigIfElse) {
@@ -350,6 +357,8 @@ TEST_F(SegmentTest, BigIfElse) {
     EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
         << "Missing expected node " << ex;
   }
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
 }
 
 }  // namespace test
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index 40c33927ca..d679945d56 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -21,41 +21,39 @@ limitations under the License.
 %include "tensorflow/python/platform/base.i"
 
 %{
-  PyObject* pair_helper(std::pair<string,string>* in){
-    PyObject *first(nullptr), *second(nullptr), *tuple(nullptr);
-    first = PyBytes_FromStringAndSize(in->first.data(), in->first.length());
-    if(!first){
-      if(!PyErr_Occurred()){
-        PyErr_SetString(PyExc_TypeError,
-                        "Pair conversion first argument failed");
-      }
-      return NULL;
+PyObject* pair_helper(std::pair<string, string>* in) {
+  PyObject *first(nullptr), *second(nullptr), *tuple(nullptr);
+  first = PyBytes_FromStringAndSize(in->first.data(), in->first.length());
+  if (!first) {
+    if (!PyErr_Occurred()) {
+      PyErr_SetString(PyExc_TypeError, "Pair conversion first argument failed");
     }
-    second=PyBytes_FromStringAndSize(in->second.data(), in->second.length());
-    if(!second){
-      if(!PyErr_Occurred()){
-        PyErr_SetString(PyExc_TypeError,
-                        "Pair conversion second argument failed");
-      }
-      return NULL;
+    return NULL;
+  }
+  second = PyBytes_FromStringAndSize(in->second.data(), in->second.length());
+  if (!second) {
+    if (!PyErr_Occurred()) {
+      PyErr_SetString(PyExc_TypeError,
+                      "Pair conversion second argument failed");
     }
-    tuple=Py_BuildValue("(OO)", first, second);
-    if(!tuple){
-      if(!PyErr_Occurred()){
-        PyErr_SetString(PyExc_TypeError,
-                        "Tuple creation from pair<string,string> failed!");
-      }
-      return NULL;
+    return NULL;
+  }
+  tuple = Py_BuildValue("(OO)", first, second);
+  if (!tuple) {
+    if (!PyErr_Occurred()) {
+      PyErr_SetString(PyExc_TypeError,
+                      "Tuple creation from pair<string,string> failed!");
     }
-    return tuple;
+    return NULL;
   }
-
+  return tuple;
+}
 %}
 %typemap(out) std::pair<string, string> {
   PyObject *tuple = pair_helper(&$1);
-  if(!tuple) SWIG_fail;
+  if (!tuple) SWIG_fail;
   $result = tuple;
- }
+}
 %{
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-- 
GitLab


From 30bb88dba14d7bfb0472bd79f949014b6f2902a7 Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Sun, 11 Feb 2018 15:54:39 -0800
Subject: [PATCH 940/966] [TPUEstimator] Automatically detect the TPU system
 information, including topology for model parallelism.

PiperOrigin-RevId: 185318852
---
 tensorflow/contrib/tpu/BUILD                  |   2 +
 .../contrib/tpu/python/tpu/tpu_config.py      |  39 +-
 .../contrib/tpu/python/tpu/tpu_config_test.py |   5 -
 .../contrib/tpu/python/tpu/tpu_context.py     | 492 ++++++++++++++++++
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 393 +-------------
 .../tpu/python/tpu/tpu_system_metadata.py     | 139 +++++
 6 files changed, 676 insertions(+), 394 deletions(-)
 create mode 100644 tensorflow/contrib/tpu/python/tpu/tpu_context.py
 create mode 100644 tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py

diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index a7d54d8a0c..c48e84ddfa 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -36,7 +36,9 @@ py_library(
     name = "tpu_estimator",
     srcs = [
         "python/tpu/tpu_config.py",
+        "python/tpu/tpu_context.py",
         "python/tpu/tpu_estimator.py",
+        "python/tpu/tpu_system_metadata.py",
         "python/tpu/util.py",
     ],
     srcs_version = "PY2AND3",
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index a1076b763d..6440702182 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -55,17 +55,18 @@ class TPUConfig(
       system before returning to CPU host for each `Session.run`. This means
       global step is increased `iterations_per_loop` times in one `Session.run`.
       It is recommended to be set as number of global steps for next checkpoint.
-    num_shards: The number of model replicas in the system. For
-      non-model-parallelism case, this number equals the total number of TPU
-      cores. For model-parallelism, the total number of TPU cores equals
+    num_shards: (Deprecated, ignored by TPUEstimator).
+      The number of model replicas in the system. For non-model-parallelism
+      case, this number equals the total number of TPU cores. For
+      model-parallelism, the total number of TPU cores equals
       product(computation_shape) * num_shards.
-    computation_shape: A list of size 3 which describes the shape of a model
-      replica's block of cores. This is required by model-parallelism which
-      enables partitioning the model to multiple cores. For example, [2, 2, 1]
-        means the model
-      is partitioned across 4 cores which span two cores in both x and y
-      coordinates. Set it to `None` for non-model-parallelism. Please refer to
-      ${tf.contrib.tpu.TopologyProto} for the geometry of a TPU mesh.
+    computation_shape: Defaults to `None`, which disables model parallelism. A
+      list of size 3 which describes the shape of a model replica's block of
+      cores. This is required by model-parallelism which enables partitioning
+      the model to multiple cores. For example, [2, 2, 1] means the model is
+      partitioned across 4 cores which span two cores in both x and y
+      coordinates.  Please refer to ${tf.contrib.tpu.TopologyProto} for the
+      geometry of a TPU mesh.
     per_host_input_for_training: If `True`, `input_fn` is invoked Per-Host
       rather than Per-Core. With Per-Host input pipeline deployment, `input_fn`
       is invoked once on each host. With Per-Core input pipeline deployment, it
@@ -80,13 +81,14 @@ class TPUConfig(
     initial_infeed_sleep_secs: The number of seconds the infeed thread should
       wait before enqueueing the first batch. This helps avoid timeouts for
       models that require a long compilation time.
+
     Raises:
       ValueError: If `computation_shape` or `computation_shape` are invalid.
   """
 
   def __new__(cls,
               iterations_per_loop=2,
-              num_shards=2,
+              num_shards=None,
               computation_shape=None,
               per_host_input_for_training=True,
               tpu_job_name=None,
@@ -97,7 +99,8 @@ class TPUConfig(
                                     'TPUConfig iterations_per_loop')
 
     # Check num_shards.
-    util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
+    if num_shards is not None:
+      util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
 
     # Check computation_shape
     if computation_shape is not None and len(computation_shape) != 3:
@@ -112,18 +115,6 @@ class TPUConfig(
       if any(computation_shape_array < 1) or any(computation_shape_array > 2):
         raise ValueError('computation_shape elements can only be 1 or 2; got '
                          'computation_shape={}'.format(computation_shape))
-      max_replicas_per_host = (
-          _NUM_CORES_PER_HOST // np.prod(computation_shape_array))
-      if num_shards > max_replicas_per_host and (
-          num_shards % max_replicas_per_host != 0):
-        raise ValueError(
-            '{0} shards can not be evenly distributed across'
-            ' multiple hosts. Each shard needs {1} cores and each'
-            ' host has {2} cores. Thus {0} shards needs {3} hosts.'
-            ' Please adjust num shards so that num_shards is'
-            ' divisible by {4} or <= {4}.'.format(
-                num_shards, np.prod(computation_shape), _NUM_CORES_PER_HOST,
-                num_shards / max_replicas_per_host, max_replicas_per_host))
 
     # Check initial_infeed_sleep_secs.
     if initial_infeed_sleep_secs:
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
index 2c9d7be232..37ef3dbe1e 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
@@ -53,11 +53,6 @@ class TPURunConfigTest(test.TestCase):
                                  'computation_shape elements can only be'):
       tpu_config_lib.TPUConfig(computation_shape=[1, 3, 1])
 
-  def test_fail_with_invalid_shards(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 'shards can not be evenly distributed across'):
-      tpu_config_lib.TPUConfig(num_shards=6, computation_shape=[1, 1, 2])
-
 
 class TPURunConfigMasterTest(test.TestCase):
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
new file mode 100644
index 0000000000..8c65018d14
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -0,0 +1,492 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU system metdata and associated tooling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from contextlib import contextmanager
+import copy
+
+import numpy as np
+
+from tensorflow.contrib.tpu.python.tpu import device_assignment  as tpu_device_assignment
+from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.platform import tf_logging as logging
+
+
+_DEFAULT_JOB_NAME = 'tpu_worker'
+_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
+_LOCAL_MASTERS = ('', 'local')
+
+
+class _TPUContext(object):
+  """A context holds immutable states of TPU computation.
+
+  This immutable object holds TPUEstimator config, train/eval batch size, and
+  `TPUEstimator.use_tpu`, which is expected to be passed around. It also
+  provides utility functions, basded on the current state, to determine other
+  information commonly required by TPU computation, such as TPU device names,
+  TPU hosts, shard batch size, etc.
+
+  N.B. As `mode` is not immutable state in Estimator, but essential to
+  distinguish between TPU training and evaluation, a common usage for
+  _TPUContext with `mode` is as follows:
+  ```
+  with _ctx.with_mode(mode) as ctx:
+    if ctx.is_running_on_cpu():
+       ...
+  ```
+  """
+
+  def __init__(self, config, train_batch_size, eval_batch_size,
+               predict_batch_size, use_tpu):
+    self._config = config
+    self._train_batch_size = train_batch_size
+    self._eval_batch_size = eval_batch_size
+    self._predict_batch_size = predict_batch_size
+    self._use_tpu = use_tpu
+    self._model_parallelism_enabled = (
+        use_tpu and config.tpu_config.computation_shape)
+    self._mode = None
+
+    self._lazy_tpu_system_metadata_dict = {}  # key by master address
+    self._lazy_device_assignment_dict = {}  # key by master address
+    self._lazy_validation_dict = {}  # key by ModeKeys
+
+  def _assert_mode(self):
+    if self._mode is None:
+      raise RuntimeError(
+          '`mode` needs to be set via contextmanager `with_mode`.')
+    return self._mode
+
+  @contextmanager
+  def with_mode(self, mode):
+    # NOTE(xiejw): Shallow copy is enough. It will share he lazy dictionaries,
+    # such as _lazy_tpu_system_metadata_dict between new copy and the original
+    # one. Note that all lazy states stored in properties _lazy_foo are sort of
+    # immutable as they should be same for the process lifetime.
+    new_ctx = copy.copy(self)
+    new_ctx._mode = mode  # pylint: disable=protected-access
+    yield new_ctx
+
+  @property
+  def mode(self):
+    return self._assert_mode()
+
+  def _get_master_address(self):
+    mode = self._assert_mode()
+    config = self._config
+    master = (
+        config.master
+        if mode != model_fn_lib.ModeKeys.EVAL else config.evaluation_master)
+    return master
+
+  def _get_tpu_system_metadata(self):
+    """Gets the (maybe cached) TPU system metadata."""
+    master = self._get_master_address()
+    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
+    if tpu_system_metadata is not None:
+      return tpu_system_metadata
+
+    # pylint: disable=protected-access
+    tpu_system_metadata = (
+        tpu_system_metadata_lib._query_tpu_system_metadata(
+            master, query_topology=self.model_parallelism_enabled))
+
+    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
+    return tpu_system_metadata
+
+  def _get_device_assignment(self):
+    """Gets the (maybe cached) TPU device assignment."""
+    master = self._get_master_address()
+    device_assignment = self._lazy_device_assignment_dict.get(master)
+    if device_assignment is not None:
+      return device_assignment
+
+    tpu_system_metadata = self._get_tpu_system_metadata()
+
+    device_assignment = tpu_device_assignment.device_assignment(
+        tpu_system_metadata.topology,
+        computation_shape=self._config.tpu_config.computation_shape,
+        num_replicas=self.num_replicas)
+
+    logging.info('computation_shape: %s',
+                 str(self._config.tpu_config.computation_shape))
+    logging.info('num_replicas: %d', self.num_replicas)
+    logging.info('device_assignment.topology.device_coordinates: %s',
+                 str(device_assignment.topology.device_coordinates))
+    logging.info('device_assignment.core_assignment: %s',
+                 str(device_assignment.core_assignment))
+
+    self._lazy_device_assignment_dict[master] = device_assignment
+    return device_assignment
+
+  @property
+  def model_parallelism_enabled(self):
+    return self._model_parallelism_enabled
+
+  @property
+  def device_assignment(self):
+    return (self._get_device_assignment()
+            if self._model_parallelism_enabled else None)
+
+  @property
+  def num_of_cores_per_host(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_of_cores_per_host
+
+  @property
+  def num_cores(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_cores
+
+  @property
+  def num_of_replicas_per_host(self):
+    if self.model_parallelism_enabled:
+      return self.num_replicas // self.num_hosts
+    else:
+      return self.num_of_cores_per_host
+
+  @property
+  def num_replicas(self):
+    num_cores_in_system = self.num_cores
+
+    if self.model_parallelism_enabled:
+      computation_shape_array = np.asarray(
+          self._config.tpu_config.computation_shape, dtype=np.int32)
+      num_cores_per_replica = np.prod(computation_shape_array)
+      if num_cores_per_replica > num_cores_in_system:
+        raise ValueError(
+            'The num of cores required by the model parallelism, specified by '
+            'TPUConfig.computation_shape, is larger than the total num of '
+            'TPU cores in the system. computation_shape: {}, num cores '
+            'in the system: {}'.format(
+                self._config.tpu_config.computation_shape,
+                num_cores_in_system))
+
+      if num_cores_in_system % num_cores_per_replica != 0:
+        raise RuntimeError(
+            'The num of cores in the system ({}) is not divisible by the num '
+            'of cores ({}) required by the model parallelism, specified by '
+            'TPUConfig.computation_shape. This should never happen!'.format(
+                num_cores_in_system, num_cores_per_replica))
+
+      return num_cores_in_system // num_cores_per_replica
+    else:
+      return num_cores_in_system
+
+  @property
+  def num_hosts(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_hosts
+
+  @property
+  def config(self):
+    return self._config
+
+  def is_input_sharded_per_core(self):
+    """Return true if input_fn is invoked per-core (other than per-host)."""
+    mode = self._assert_mode()
+    return (mode == model_fn_lib.ModeKeys.TRAIN and
+            not self._config.tpu_config.per_host_input_for_training)
+
+  def is_running_on_cpu(self, is_export_mode=False):
+    """Determines whether the input_fn and model_fn should be invoked on CPU.
+
+    This API also validates user provided configuration, such as batch size,
+    according the lazy initialized TPU system metadata.
+
+    Args:
+      is_export_mode: Indicates whether the current mode is for exporting the
+        model, when mode == PREDICT. Only with this bool, we could
+        tell whether user is calling the Estimator.predict or
+        Estimator.export_savedmodel, which are running on TPU and CPU
+        respectively. Parent class Estimator does not distingush these two.
+
+    Returns:
+      bool, whether current input_fn or model_fn should be running on CPU.
+
+    Raises:
+      ValueError: any configuration is invalid.
+    """
+
+    is_running_on_cpu = self._is_running_on_cpu(is_export_mode)
+    if not is_running_on_cpu:
+      self._validate_tpu_configuration()
+    return is_running_on_cpu
+
+  def _is_running_on_cpu(self, is_export_mode):
+    """Determines whether the input_fn and model_fn should be invoked on CPU."""
+    mode = self._assert_mode()
+
+    if not self._use_tpu:
+      return True
+
+    if mode != model_fn_lib.ModeKeys.PREDICT:
+      return False
+
+    # There are actually 2 use cases when running with mode.PREDICT: prediction
+    # and saving the model.  We run actual predictions on the TPU, but
+    # model export is run on the CPU.
+    if is_export_mode:
+      return True
+
+    return False
+
+  @property
+  def global_batch_size(self):
+    mode = self._assert_mode()
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      return self._train_batch_size
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      return self._eval_batch_size
+    elif mode == model_fn_lib.ModeKeys.PREDICT:
+      return self._predict_batch_size
+    else:
+      return None
+
+  @property
+  def batch_size_for_input_fn(self):
+    """Returns the shard batch size for `input_fn`."""
+    global_batch_size = self.global_batch_size
+
+    if self.is_running_on_cpu():
+      return global_batch_size
+
+    # On TPU
+    if self.is_input_sharded_per_core():
+      # We prohibit per core input sharding for the model parallelism case,
+      # therefore it is safe to use num_cores here.
+      return global_batch_size // self.num_cores
+    else:
+      return global_batch_size // self.num_hosts
+
+  @property
+  def batch_size_for_model_fn(self):
+    """Returns the shard batch size for `model_fn`."""
+    global_batch_size = self.global_batch_size
+
+    if self.is_running_on_cpu():
+      return global_batch_size
+
+    # On TPU. always sharded per shard.
+    return global_batch_size // self.num_replicas
+
+  @property
+  def master_job(self):
+    """Returns the job name to use to place TPU computations on.
+
+    Returns:
+      A string containing the job name, or None if no job should be specified.
+
+    Raises:
+      ValueError: If the user needs to specify a tpu_job_name, because we are
+        unable to infer the job name automatically, or if the user-specified job
+        names are inappropriate.
+    """
+    run_config = self._config
+    # If the user specifies the tpu_job_name, use that.
+    if run_config.tpu_config.tpu_job_name:
+      return run_config.tpu_config.tpu_job_name
+
+    # The tpu job is determined by the run_config. Right now, this method is
+    # required as tpu_config is not part of the RunConfig.
+    mode = self._assert_mode()
+    master = (
+        run_config.evaluation_master
+        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
+    if master in _LOCAL_MASTERS:
+      return None
+
+    if (not run_config.session_config or
+        not run_config.session_config.cluster_def.job):
+      return _DEFAULT_JOB_NAME
+    cluster_def = run_config.session_config.cluster_def
+    job_names = set([job.name for job in cluster_def.job])
+    if _DEFAULT_JOB_NAME in job_names:
+      # b/37868888 tracks allowing ClusterSpec propagation to reuse job names.
+      raise ValueError('Currently, tpu_worker is not an allowed job name.')
+    if len(job_names) == 1:
+      return cluster_def.job[0].name
+    if len(job_names) == 2:
+      if _DEFAULT_COORDINATOR_JOB_NAME in job_names:
+        job_names.remove(_DEFAULT_COORDINATOR_JOB_NAME)
+        return job_names.pop()
+      # TODO(b/67716447): Include more sophisticated heuristics.
+    raise ValueError(
+        'Could not infer TPU job name. Please specify a tpu_job_name as part '
+        'of your TPUConfig.')
+
+  @property
+  def tpu_host_placement_function(self):
+    """Returns the TPU host place function."""
+    master = self.master_job
+
+    def _placement_function(_sentinal=None, core_id=None, host_id=None):  # pylint: disable=invalid-name
+      assert _sentinal is None
+      if core_id is not None and host_id is not None:
+        raise RuntimeError(
+            'core_id and host_id can have only one non-None value.')
+
+      if master is None:
+        return '/replica:0/task:0/device:CPU:0'
+      else:
+        if core_id is not None:
+          host_id = core_id / self.num_of_cores_per_host
+        return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
+
+    return _placement_function
+
+  @property
+  def tpu_device_placement_function(self):
+    """Returns a TPU device placement Fn."""
+    master = self.master_job
+    job_device = '' if master is None else ('/job:%s' % master)
+
+    def _placement_function(i):
+      if self.model_parallelism_enabled:
+        return self.device_assignment.tpu_device(replica=i, job=master)
+      else:
+        num_of_cores_per_host = self.num_of_cores_per_host
+        host_id = i / num_of_cores_per_host
+        ordinal_id = i % num_of_cores_per_host
+        return '%s/task:%d/device:TPU:%d' % (job_device, host_id, ordinal_id)
+
+    return _placement_function
+
+  @property
+  def tpu_ordinal_function(self):
+    """Returns the TPU ordinal fn."""
+
+    def _tpu_ordinal_function(index):
+      """Return the TPU ordinal associated with a shard.
+
+      Required because the enqueue ops are placed on CPU.
+
+      Args:
+        index: the shard index
+
+      Returns:
+        The ordinal of the TPU device the shard's infeed should be placed on.
+      """
+      if self.model_parallelism_enabled:
+        return self.device_assignment.tpu_ordinal(replica=index)
+      else:
+        return index % self.num_of_cores_per_host
+
+    return _tpu_ordinal_function
+
+  def _validate_tpu_configuration(self):
+    """Validates the configuration based on the TPU system metadata."""
+    mode = self._assert_mode()
+    if self._lazy_validation_dict.get(mode):
+      return
+
+    # All following information is obtained from TPU system metadata.
+    num_cores = self.num_cores
+    num_replicas = self.num_replicas
+    num_hosts = self.num_hosts
+
+    if not num_cores:
+      tpu_system_metadata = self._get_tpu_system_metadata()
+      raise RuntimeError(
+          'Cannot find any TPU cores in the system. Please double check '
+          'Tensorflow master address and TPU worker(s). Available devices '
+          'are {}.'.format(tpu_system_metadata.devices))
+
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      if self._train_batch_size % num_replicas != 0:
+        raise ValueError(
+            'train batch size {} must be divisible by number of replicas {}'
+            .format(self._train_batch_size, num_replicas))
+
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      if self._eval_batch_size is None:
+        raise ValueError(
+            'eval_batch_size in TPUEstimator constructor cannot be `None`'
+            'if .evaluate is running on TPU.')
+      if self._eval_batch_size % num_replicas != 0:
+        raise ValueError(
+            'eval batch size {} must be divisible by number of replicas {}'
+            .format(self._eval_batch_size, num_replicas))
+      if num_hosts > 1:
+        raise ValueError(
+            'TPUEstimator.evaluate should be running on single TPU worker. '
+            'got {}.'.format(num_hosts))
+    else:
+      assert mode == model_fn_lib.ModeKeys.PREDICT
+      if self._predict_batch_size is None:
+        raise ValueError(
+            'predict_batch_size in TPUEstimator constructor should not be '
+            '`None` if .predict is running on TPU.')
+      if self._predict_batch_size % num_replicas != 0:
+        raise ValueError(
+            'predict batch size {} must be divisible by number of replicas {}'
+            .format(self._predict_batch_size, num_replicas))
+      if num_hosts > 1:
+        raise ValueError(
+            'TPUEstimator.predict should be running on single TPU worker. '
+            'got {}.'.format(num_hosts))
+
+    # Record the state "validated" into lazy dictionary.
+    self._lazy_validation_dict[mode] = True
+
+
+class _OneCoreTPUContext(_TPUContext):
+  """Special _TPUContext for one core usage."""
+
+  def __init__(self, config, train_batch_size, eval_batch_size,
+               predict_batch_size, use_tpu):
+
+    super(_OneCoreTPUContext, self).__init__(
+        config, train_batch_size, eval_batch_size,
+        predict_batch_size, use_tpu)
+
+  def _get_tpu_system_metadata(self):
+    """Gets the (maybe cached) TPU system metadata."""
+    master = self._get_master_address()
+    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
+    if tpu_system_metadata is not None:
+      return tpu_system_metadata
+
+    tpu_system_metadata = (
+        tpu_system_metadata_lib._TPUSystemMetadata(  # pylint: disable=protected-access
+            num_cores=1,
+            num_hosts=1,
+            num_of_cores_per_host=1,
+            topology=None,
+            devices=[]))
+
+    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
+    return tpu_system_metadata
+
+
+def _get_tpu_context(config, train_batch_size, eval_batch_size,
+                     predict_batch_size, use_tpu):
+  """Returns an instance of `_TPUContext`."""
+
+  if (config.tpu_config.num_shards == 1 and
+      config.tpu_config.computation_shape is None):
+    logging.warning(
+        'Setting TPUConfig.num_shards==1 is an unsupported behavior. '
+        'Please fix as soon as possible (leaving num_shards as None.')
+    return _OneCoreTPUContext(config, train_batch_size, eval_batch_size,
+                              predict_batch_size, use_tpu)
+
+  return _TPUContext(config, train_batch_size, eval_batch_size,
+                     predict_batch_size, use_tpu)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 7d2f6556fb..ff53fe4f5d 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-from contextlib import contextmanager
 import copy
 import signal
 import threading
@@ -34,13 +33,12 @@ from tensorflow.contrib.summary import summary_ops as contrib_summary
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
+from tensorflow.contrib.tpu.python.tpu import tpu_context
 from tensorflow.contrib.tpu.python.tpu import tpu_feed
 from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
-from tensorflow.contrib.tpu.python.tpu.device_assignment import device_assignment as tpu_device_assignment
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session as tf_session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
@@ -71,7 +69,7 @@ _TPU_ESTIMATOR = 'tpu_estimator'
 _ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
 _BATCH_SIZE_KEY = 'batch_size'
 _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
-_PINGING_MASTER_TIMEOUT_IN_MS = 300 * 1000  # 5 minutes
+
 _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 
 
@@ -149,285 +147,6 @@ def _increase_eval_step_op(iterations_per_loop):
       use_locking=True)
 
 
-_DEFAULT_JOB_NAME = 'tpu_worker'
-_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
-_LOCAL_MASTERS = ('', 'local')
-
-
-class _TPUContext(object):
-  """A context holds immutable states of TPU computation.
-
-  This immutable object holds TPUEstimator config, train/eval batch size, and
-  `TPUEstimator.use_tpu`, which is expected to be passed around. It also
-  provides utility functions, basded on the current state, to determine other
-  information commonly required by TPU computation, such as TPU device names,
-  TPU hosts, shard batch size, etc.
-
-  N.B. As `mode` is not immutable state in Estimator, but essential to
-  distinguish between TPU training and evaluation, a common usage for
-  _TPUContext with `mode` is as follows:
-  ```
-  with _ctx.with_mode(mode) as ctx:
-    if ctx.is_running_on_cpu():
-       ...
-  ```
-  """
-
-  def __init__(self, config, train_batch_size, eval_batch_size,
-               predict_batch_size, use_tpu, device_assignment):
-    self._config = config
-    self._train_batch_size = train_batch_size
-    self._eval_batch_size = eval_batch_size
-    self._predict_batch_size = predict_batch_size
-    self._use_tpu = use_tpu
-    self._mode = None
-    self._device_assignment = device_assignment
-    self._max_cores_per_host = 8
-
-  def _assert_mode(self):
-    if self._mode is None:
-      raise RuntimeError(
-          '`mode` needs to be set via contextmanager `with_mode`.')
-    return self._mode
-
-  @property
-  def num_of_cores_per_host(self):
-    num_cores = self.num_cores
-    return min(num_cores, self._max_cores_per_host)
-
-  @property
-  def num_of_shards_per_host(self):
-    if self._device_assignment:
-      maximum_shards_per_host = (
-          self._max_cores_per_host //
-          self._device_assignment.num_cores_per_replica)
-      return min(self.num_shards, maximum_shards_per_host)
-    else:
-      num_cores = self.num_cores
-      return min(num_cores, self._max_cores_per_host)
-
-  @contextmanager
-  def with_mode(self, mode):
-    new_ctx = copy.copy(self)  # Shallow copy is enough.
-    new_ctx._mode = mode  # pylint: disable=protected-access
-    yield new_ctx
-
-  @property
-  def mode(self):
-    return self._assert_mode()
-
-  @property
-  def num_cores(self):
-    # TODO(xiejw): Adds lazy num_shards initialization.
-    if self._device_assignment:
-      return self._device_assignment.num_cores_per_replica * self.num_shards
-    else:
-      return self.num_shards
-
-  @property
-  def num_shards(self):
-    return self._config.tpu_config.num_shards
-
-  @property
-  def num_hosts(self):
-    return self.num_cores // self.num_of_cores_per_host
-
-  @property
-  def config(self):
-    return self._config
-
-  def is_input_sharded_per_core(self):
-    """Return true if input_fn is invoked per-core (other than per-host)."""
-    self._assert_mode()
-    return (self._mode == model_fn_lib.ModeKeys.TRAIN and
-            not self._config.tpu_config.per_host_input_for_training)
-
-  def is_running_on_cpu(self, is_export_mode=False):
-    """Determines whether the input_fn and model_fn should be invoked on CPU.
-
-    Args:
-      is_export_mode: Indicates whether the current mode is for exporting the
-        model, when mode == PREDICT. Only with this bool, we could
-        tell whether user is calling the Estimator.predict or
-        Estimator.export_savedmodel, which are running on TPU and CPU
-        respectively. Parent class Estimator does not distingush these two.
-
-    Returns:
-      bool, whether current input_fn or model_fn should be running on CPU.
-
-    Raises:
-      ValueError: any configuration is invalid.
-    """
-    mode = self._assert_mode()
-
-    if not self._use_tpu:
-      return True
-
-    if mode != model_fn_lib.ModeKeys.PREDICT:
-      return False
-
-    # There are actually 2 use cases when running with mode.PREDICT: prediction
-    # and saving the model.  We run actual predictions on the TPU, but
-    # model export is run on the CPU.
-    if is_export_mode:
-      return True
-
-    if self._predict_batch_size is None:
-      raise ValueError(
-          'predict_batch_size in TPUEstimator constructor should not be '
-          '`None` if .predict is running on TPU.')
-    if self.num_hosts > 1:
-      raise ValueError(
-          'TPUEstimator.predict should be running on single host.')
-
-    return False
-
-  @property
-  def global_batch_size(self):
-    mode = self._assert_mode()
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      return self._train_batch_size
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      return self._eval_batch_size
-    elif mode == model_fn_lib.ModeKeys.PREDICT:
-      return self._predict_batch_size
-    else:
-      return None
-
-  @property
-  def batch_size_for_input_fn(self):
-    """Returns the shard batch size for `input_fn`."""
-    global_batch_size = self.global_batch_size
-
-    if self.is_running_on_cpu():
-      return global_batch_size
-
-    # On TPU
-    if self.is_input_sharded_per_core():
-      # We prohibit per core input sharding for the model parallelism case,
-      # therefore it is safe to use num_cores here.
-      return global_batch_size // self.num_cores
-    else:
-      return global_batch_size // self.num_hosts
-
-  @property
-  def batch_size_for_model_fn(self):
-    """Returns the shard batch size for `model_fn`."""
-    global_batch_size = self.global_batch_size
-
-    if self.is_running_on_cpu():
-      return global_batch_size
-
-    return global_batch_size // self.num_shards
-
-  @property
-  def master_job(self):
-    """Returns the job name to use to place TPU computations on.
-
-    Returns:
-      A string containing the job name, or None if no job should be specified.
-
-    Raises:
-      ValueError: If the user needs to specify a tpu_job_name, because we are
-        unable to infer the job name automatically, or if the user-specified job
-        names are inappropriate.
-    """
-    run_config = self._config
-    # If the user specifies the tpu_job_name, use that.
-    if run_config.tpu_config.tpu_job_name:
-      return run_config.tpu_config.tpu_job_name
-
-    # The tpu job is determined by the run_config. Right now, this method is
-    # required as tpu_config is not part of the RunConfig.
-    mode = self._assert_mode()
-    master = (
-        run_config.evaluation_master
-        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
-    if master in _LOCAL_MASTERS:
-      return None
-
-    if (not run_config.session_config or
-        not run_config.session_config.cluster_def.job):
-      return _DEFAULT_JOB_NAME
-    cluster_def = run_config.session_config.cluster_def
-    job_names = set([job.name for job in cluster_def.job])
-    if _DEFAULT_JOB_NAME in job_names:
-      # b/37868888 tracks allowing ClusterSpec propagation to reuse job names.
-      raise ValueError('Currently, tpu_worker is not an allowed job name.')
-    if len(job_names) == 1:
-      return cluster_def.job[0].name
-    if len(job_names) == 2:
-      if _DEFAULT_COORDINATOR_JOB_NAME in job_names:
-        job_names.remove(_DEFAULT_COORDINATOR_JOB_NAME)
-        return job_names.pop()
-      # TODO(b/67716447): Include more sophisticated heuristics.
-    raise ValueError(
-        'Could not infer TPU job name. Please specify a tpu_job_name as part '
-        'of your TPUConfig.')
-
-  @property
-  def tpu_host_placement_function(self):
-    """Returns the TPU host place function."""
-    master = self.master_job
-
-    def _placement_function(_sentinal=None, core_id=None, host_id=None):  # pylint: disable=invalid-name
-      assert _sentinal is None
-      if core_id is not None and host_id is not None:
-        raise RuntimeError(
-            'core_id and host_id can have only one non-None value.')
-
-      if master is None:
-        return '/replica:0/task:0/device:CPU:0'
-      else:
-        # This assumes that if using more than 8 shards,
-        # the job configuration varies 'task'.
-        if core_id is not None:
-          host_id = core_id / 8
-        return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
-
-    return _placement_function
-
-  @property
-  def tpu_device_placement_function(self):
-    """Returns the TPU device place function."""
-    master = self.master_job
-    job_device = '' if master is None else ('/job:%s' % master)
-
-    def _placement_function(i):
-      if self._device_assignment:
-        return self._device_assignment.tpu_device(replica=i, job=master)
-      else:
-        return '%s/task:%d/device:TPU:%d' % (job_device, i / 8, i % 8)
-
-    return _placement_function
-
-  @property
-  def tpu_ordinal_function(self):
-    """Returns the TPU ordinal fn."""
-
-    def _tpu_ordinal_function(index):
-      """Return the TPU ordinal associated with a shard.
-
-      Required because the enqueue ops are placed on CPU.
-
-      Args:
-        index: the shard index
-
-      Returns:
-        The ordinal of the TPU device the shard's infeed should be placed on.
-      """
-      if self._device_assignment:
-        return self._device_assignment.tpu_ordinal(replica=index)
-      else:
-        return index % 8
-
-    return _tpu_ordinal_function
-
-  @property
-  def device_assignment(self):
-    return self._device_assignment
-
-
 class _SIGNAL(object):
   """Signal used to control the thread of infeed/outfeed.
 
@@ -963,20 +682,23 @@ def generate_per_host_enqueue_ops_fn_for_host(
     if is_dataset:
       hooks.append(inputs.dataset_initializer_hook())
 
+  # TODO(ylc): Refactoring the code to merge the tpu ordinal logic here and the
+  # _TPUContext.tpu_ordinal_function. We should either introduce another
+  # abstraction or a different helper method.
   def _tpu_ordinal_function_impl(shard_index_in_host):
     # We put both enqueue/dequeue op at tpu.core(0) in each replica.
     replica = ctx.device_assignment.lookup_replicas(
         host_id, (0, 0, 0))[shard_index_in_host]
     return ctx.device_assignment.tpu_ordinal(replica=replica)
 
-  if ctx.device_assignment:
+  if ctx.model_parallelism_enabled:
     tpu_ordinal_function = _tpu_ordinal_function_impl
   else:
     tpu_ordinal_function = None
 
   def enqueue_ops_fn():
     with ops.device(device):
-      num_of_shards_per_host = ctx.num_of_shards_per_host
+      num_of_replicas_per_host = ctx.num_of_replicas_per_host
       # Convert user input to features and labels.  If the user returns a
       # dataset, it is initialized and the features and labels extracted via
       # `dataset.iterator.get_next()`
@@ -994,7 +716,7 @@ def generate_per_host_enqueue_ops_fn_for_host(
           tuple_shapes=[t.shape for t in unsharded_tensor_list],
           shard_dimensions=batch_axis)
       captured_infeed_queue.capture(infeed_queue)
-      infeed_queue.set_number_of_shards(num_of_shards_per_host)
+      infeed_queue.set_number_of_shards(num_of_replicas_per_host)
       per_host_enqueue_ops = (
           infeed_queue.split_inputs_and_generate_enqueue_ops(
               unsharded_tensor_list,
@@ -1665,7 +1387,7 @@ class _OutfeedHostCall(object):
     # constraint it such that we have at most one outfeed dequeue and enqueue
     # per replica.
     tpu_device_placement_fn = self._ctx.tpu_device_placement_function
-    for i in xrange(self._ctx.num_shards):
+    for i in xrange(self._ctx.num_replicas):
       with ops.device(tpu_device_placement_fn(i)):
         outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
             dtypes=tensor_dtypes, shapes=tensor_shapes)
@@ -1890,12 +1612,12 @@ class TPUEstimator(estimator_lib.Estimator):
       train_batch_size: An int representing the global training batch size.
         TPUEstimator transforms this global batch size to a per-shard batch
         size, as params['batch_size'], when calling `input_fn` and `model_fn`.
-        Cannot be `None` if `use_tpu` is `True`. Must be divisible by
-        `config.tpu_config.num_shards`.
+        Cannot be `None` if `use_tpu` is `True`.
+        Must be divisible by total number of replicas.
       eval_batch_size: An int representing evaluation batch size.
-        Must be divisible by `config.tpu_config.num_shards`.
+        Must be divisible by total number of replicas.
       predict_batch_size: An int representing the prediction batch size.
-        Must be divisible by `config.tpu_config.num_shards`.
+        Must be divisible by total number of replicas.
       batch_axis: A python tuple of int values describing how each tensor
         produced by the Estimator `input_fn` should be split across the TPU
         compute shards. For example, if your input_fn produced (images, labels)
@@ -1919,45 +1641,24 @@ class TPUEstimator(estimator_lib.Estimator):
           _RESERVED_PARAMS_KEYS, params))
 
     if use_tpu:
+      # Perform some very basic validations. More validations will be found in
+      # _TPUContext.
       if train_batch_size is None:
         raise ValueError('`train_batch_size` cannot be `None`')
-      if not isinstance(train_batch_size, int):
-        raise ValueError('`train_batch_size` must be an int')
-      if train_batch_size < 1:
-        raise ValueError('`train_batch_size` must be positive')
-
-      # The specified batch size is the batch size for the entire computation.
-      # The input_fn and model_fn are called per-shard, so we want to calculate
-      # the per-shard batch size and pass that.
-      if train_batch_size % config.tpu_config.num_shards != 0:
-        raise ValueError(
-            'train batch size {} must be divisible by number of shards {}'
-            .format(train_batch_size, config.tpu_config.num_shards))
+      util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
 
       if (not config.tpu_config.per_host_input_for_training and
           config.tpu_config.computation_shape):
         raise ValueError(
-            'Model parallelism only supports per host input for training.')
+            'Model parallelism only supports per host input for training. '
+            'Please adjust TPURunconfig.per_host_input_for_training.')
 
       if eval_batch_size is not None:
-        if not isinstance(eval_batch_size, int):
-          raise ValueError('`eval_batch_size` must be an int')
-        if eval_batch_size < 1:
-          raise ValueError('`eval_batch_size` must be positive')
-        if eval_batch_size % config.tpu_config.num_shards != 0:
-          raise ValueError(
-              'eval batch size {} must be divisible by number of shards {}'
-              .format(eval_batch_size, config.tpu_config.num_shards))
+        util_lib.check_positive_integer(eval_batch_size, 'eval_batch_size')
 
       if predict_batch_size is not None:
-        if not isinstance(predict_batch_size, int):
-          raise ValueError('`predict_batch_size` must be an int')
-        if predict_batch_size < 1:
-          raise ValueError('`predict_batch_size` must be positive')
-        if predict_batch_size % config.tpu_config.num_shards != 0:
-          raise ValueError(
-              'predict batch size {} must be divisible by number of shards {}'
-              .format(predict_batch_size, config.tpu_config.num_shards))
+        util_lib.check_positive_integer(predict_batch_size,
+                                        'predict_batch_size')
 
     # Verifies the model_fn signature according to Estimator framework.
     estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
@@ -1976,35 +1677,12 @@ class TPUEstimator(estimator_lib.Estimator):
     self._iterations_per_training_loop = (
         self._config.tpu_config.iterations_per_loop)
 
-    if use_tpu and self._config.tpu_config.computation_shape:
-      try:
-        with tf_session.Session(
-            self._config.master,
-            config=config_pb2.ConfigProto(
-                operation_timeout_in_ms=_PINGING_MASTER_TIMEOUT_IN_MS)) as sess:
-          logging.info('Initializing TPU system to fetch topology for model '
-                       'parallelism.')
-          topology = sess.run(tpu.initialize_system())
-          device_assignment = tpu_device_assignment(
-              topology,
-              computation_shape=self._config.tpu_config.computation_shape,
-              num_replicas=self._config.tpu_config.num_shards)
-          logging.info('computation_shape: %s',
-                       str(self._config.tpu_config.computation_shape))
-          logging.info('num_replicas: %d', self._config.tpu_config.num_shards)
-          logging.info('device_assignment.topology.device_coordinates: %s',
-                       str(device_assignment.topology.device_coordinates))
-          logging.info('device_assignment.core_assignment: %s',
-                       str(device_assignment.core_assignment))
-      except errors.DeadlineExceededError:
-        raise ValueError(
-            'Fail to connect master (%s). Please double check %s is '
-            'correct.' % (self._config.master, self._config.master))
-    else:
-      device_assignment = None
     # All properties passed to _TPUContext are immutable.
-    self._ctx = _TPUContext(self._config, train_batch_size, eval_batch_size,
-                            predict_batch_size, use_tpu, device_assignment)
+    # pylint: disable=protected-access
+    self._ctx = tpu_context._get_tpu_context(
+        self._config, train_batch_size,
+        eval_batch_size, predict_batch_size,
+        use_tpu)
 
   def _create_global_step(self, graph):
     """Creates a global step suitable for TPUs.
@@ -2052,21 +1730,6 @@ class TPUEstimator(estimator_lib.Estimator):
 
     util_lib.check_positive_integer(steps, 'Eval steps')
 
-    # TODO(ylc): Support evaluating with model parallelism in different cluster.
-    if ctx.device_assignment and (self._config.evaluation_master !=
-                                  self._config.master):
-      raise ValueError(
-          'In the model-parallel case, both training and evaluation must run '
-          'in the same cluster.')
-
-    if self._config.tpu_config.num_shards > 8:
-      raise NotImplementedError(
-          'TPU evaluation is only supported with one host.')
-
-    if self._ctx._eval_batch_size is None:  # pylint: disable=protected-access
-      raise ValueError('`eval_batch_size` cannot be `None`'
-                       'if evaluate() is called on TPU.')
-
     return [
         evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
             num_evals=steps),
@@ -2320,7 +1983,7 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   (loss,) = tpu.shard(
       multi_tpu_eval_steps_on_single_shard,
       inputs=[],
-      num_shards=ctx.num_shards,
+      num_shards=ctx.num_replicas,
       outputs_from_all_shards=False,
       device_assignment=ctx.device_assignment)
 
@@ -2344,7 +2007,7 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   (loss,) = tpu.shard(
       multi_tpu_train_steps_on_single_shard,
       inputs=[],
-      num_shards=ctx.num_shards,
+      num_shards=ctx.num_replicas,
       outputs_from_all_shards=False,
       device_assignment=ctx.device_assignment)
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
new file mode 100644
index 0000000000..e003313667
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -0,0 +1,139 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU system metadata and associated tooling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from tensorflow.contrib.tpu.python.tpu import tpu
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+
+_PINGING_MASTER_TIMEOUT_IN_MS = 60 * 1000  # 1 min
+_RETRY_TIMES = 120
+_INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS = 300 * 1000  # 5 mins
+
+_TPU_DEVICE_REG = re.compile(r'.*task:(\d+)/.*device:TPU:(\d+)$')
+
+# _TPUSystemMetadata is used by TPUEstimator to hold TPU configuration,
+# including num_cores and num_hosts.
+_TPUSystemMetadata = collections.namedtuple('_TPUSystemMetadata', [
+    'num_cores',
+    'num_hosts',
+    'num_of_cores_per_host',
+    'topology',
+    'devices',
+])
+
+
+def _query_tpu_system_metadata(master_address, query_topology=False):
+  """Automatically detects the TPU system metadata in the system."""
+  tpu_core_count = 0
+  devices = []
+  device_dict = collections.defaultdict(list)
+
+  retry_count = 1
+  while True:
+    logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
+                 master_address)
+    try:
+      with ops.Graph().as_default():
+        with session_lib.Session(
+            master_address,
+            config=config_pb2.ConfigProto(
+                operation_timeout_in_ms=_PINGING_MASTER_TIMEOUT_IN_MS)) as sess:
+          devices = sess.list_devices()
+          for device in devices:
+            match = _TPU_DEVICE_REG.match(device.name)
+            if match:
+              host_id = match.group(1)
+              core_id = match.group(2)
+              device_dict[host_id].append(core_id)
+              tpu_core_count += 1
+          break
+    except errors.DeadlineExceededError:
+      msg = ('Fail to connect Tensorflow master. It could be the TPU worker is '
+             'not ready (still under scheduling) or Tensorflow '
+             'master address is correct: got (%s).' %
+             (master_address))
+
+      # TODO(xiejw): For local or grpc master we might not need retry logic
+      # here.
+      if retry_count <= _RETRY_TIMES:
+        logging.warning('%s', msg)
+        logging.warning('Retrying (%d/%d).', retry_count, _RETRY_TIMES)
+        retry_count += 1
+      else:
+        raise ValueError(msg)
+
+  num_of_cores_per_host = 0
+  if tpu_core_count:
+    num_cores_per_host_set = set(
+        [len(core_ids) for core_ids in device_dict.values()])
+    if len(num_cores_per_host_set) != 1:
+      raise RuntimeError(
+          'TPU cores on each host is not same. This should not happen!. '
+          'devices: {}'.format(devices))
+    num_of_cores_per_host = num_cores_per_host_set.pop()
+
+  topology = None
+  if query_topology:
+    if not tpu_core_count:
+      raise RuntimeError(
+          'Cannot find any TPU cores in the system (master address {}). '
+          'This usually means the master address is incorrect or the '
+          'TPU worker has some problems. Available devices: {}'.format(
+              master_address, devices))
+
+    topology = _obtain_topology(master_address)
+
+  metadata = _TPUSystemMetadata(
+      num_cores=tpu_core_count,
+      num_hosts=len(device_dict),
+      num_of_cores_per_host=num_of_cores_per_host,
+      topology=topology,
+      devices=devices)
+
+  msg = 'Found TPU system %s' if tpu_core_count else 'Failed to find TPU: %s'
+  logging.info(msg, metadata)
+  return metadata
+
+
+def _obtain_topology(master_address):
+  try:
+    logging.info('Initializing TPU system (master: %s) to fetch topology '
+                 'for model parallelism. This might take a while.',
+                 master_address)
+    with ops.Graph().as_default():
+      session_config = config_pb2.ConfigProto(
+          operation_timeout_in_ms=_INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS)
+      with session_lib.Session(
+          master_address, config=session_config) as sess:
+        topology = sess.run(tpu.initialize_system())
+        return topology
+  except errors.DeadlineExceededError:
+    raise ValueError(
+        'Fail to initialize TPU system with master (%s). '
+        'Please double check the TPU system is functional.' % (
+            master_address))
+
+
-- 
GitLab


From 2787540cf596df5da80dddec093210fc61ba7f77 Mon Sep 17 00:00:00 2001
From: Guangda Lai <laigd@google.com>
Date: Sun, 11 Feb 2018 18:09:11 -0800
Subject: [PATCH 941/966] Automated g4 rollback of changelist 185233116

PiperOrigin-RevId: 185324160
---
 tensorflow/core/grappler/clusters/BUILD           |  6 +++---
 .../core/grappler/clusters/single_machine.cc      |  9 ++-------
 tensorflow/core/grappler/clusters/utils.cc        | 15 ++++-----------
 tensorflow/core/grappler/clusters/utils.h         |  3 +--
 tensorflow/core/grappler/costs/BUILD              |  1 -
 tensorflow/core/grappler/costs/utils.cc           |  3 +--
 6 files changed, 11 insertions(+), 26 deletions(-)

diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index b15a709c5b..5b8ce373bc 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -26,12 +26,13 @@ config_setting(
 tf_cuda_library(
     name = "utils",
     srcs = ["utils.cc"],
-    hdrs = ["utils.h"],
+    hdrs = [
+        "utils.h",
+    ],
     visibility = ["//visibility:public"],
     deps = [
         "//third_party/eigen3",
         "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ] + select({
@@ -103,7 +104,6 @@ cc_library(
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:ops_util",
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 3e97b31f2c..862ce4ae88 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/cc/training/queue_runner.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/grappler/clusters/utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -80,17 +79,13 @@ Status SingleMachine::Provision() {
 
   std::vector<DeviceAttributes> devices;
   TF_RETURN_IF_ERROR(session_->ListDevices(&devices));
+  int gpu_id = 0;
   for (const auto& dev : devices) {
     DeviceProperties attr;
     if (dev.device_type() == "CPU") {
       attr = GetLocalCPUInfo();
     } else if (dev.device_type() == "GPU") {
-      DeviceNameUtils::ParsedName parsed;
-      if (!DeviceNameUtils::ParseFullName(dev.name(), &parsed)) {
-        return errors::InvalidArgument(
-            strings::StrCat("Not able to parse GPU device name: ", dev.name()));
-      }
-      attr = GetLocalGPUInfo(TfGpuId(parsed.id));
+      attr = GetLocalGPUInfo(gpu_id++);
     } else {
       attr.set_type(dev.device_type());
     }
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index 3e7a7a3356..aacd2ccb72 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -27,8 +27,6 @@ limitations under the License.
 #include "include/libxsmm.h"
 #endif
 
-#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
@@ -68,14 +66,13 @@ DeviceProperties GetLocalCPUInfo() {
   return device;
 }
 
-DeviceProperties GetLocalGPUInfo(TfGpuId tf_gpu_id) {
+DeviceProperties GetLocalGPUInfo(int gpu_id) {
   DeviceProperties device;
   device.set_type("GPU");
 
 #if GOOGLE_CUDA
   cudaDeviceProp properties;
-  CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
-  cudaError_t error = cudaGetDeviceProperties(&properties, cuda_gpu_id.value());
+  cudaError_t error = cudaGetDeviceProperties(&properties, gpu_id);
   if (error == cudaSuccess) {
     device.set_vendor("NVidia");
     device.set_model(properties.name);
@@ -97,10 +94,6 @@ DeviceProperties GetLocalGPUInfo(TfGpuId tf_gpu_id) {
     // double data rate (DDR).
     device.set_bandwidth(properties.memoryBusWidth / 8 *
                          properties.memoryClockRate * 2);
-  } else {
-    LOG(ERROR) << "Failed to get device properties, error code: " << error;
-    device.set_type("UNKNOWN");
-    return device;
   }
 
   (*device.mutable_environment())["architecture"] =
@@ -117,9 +110,9 @@ DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device) {
     return GetLocalCPUInfo();
   } else if (device.type == "GPU") {
     if (device.has_id) {
-      return GetLocalGPUInfo(TfGpuId(device.id));
+      return GetLocalGPUInfo(device.id);
     } else {
-      return GetLocalGPUInfo(TfGpuId(0));
+      return GetLocalGPUInfo(0);
     }
   }
   DeviceProperties result;
diff --git a/tensorflow/core/grappler/clusters/utils.h b/tensorflow/core/grappler/clusters/utils.h
index 4ea7e98390..191942040a 100644
--- a/tensorflow/core/grappler/clusters/utils.h
+++ b/tensorflow/core/grappler/clusters/utils.h
@@ -16,7 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
 #define TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
 
-#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -28,7 +27,7 @@ DeviceProperties GetLocalCPUInfo();
 
 // Returns the DeviceProperties for the specified GPU attached to the server on
 // which grappler is running.
-DeviceProperties GetLocalGPUInfo(TfGpuId tf_gpu_id);
+DeviceProperties GetLocalGPUInfo(int gpu_id);
 
 // Returns the DeviceProperties of the specified device
 DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 5336df1f51..0fe01e9c9e 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -142,7 +142,6 @@ tf_cuda_library(
         "//third_party/eigen3",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
-        "//tensorflow/core:gpu_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index ac30090607..602f69f12e 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "cuda/include/cudnn.h"
 #endif
 
-#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
@@ -204,7 +203,7 @@ DeviceProperties GetDeviceInfo(const string& device_str) {
   DeviceNameUtils::ParsedName parsed;
   if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
     if (parsed.type == "GPU") {
-      return GetLocalGPUInfo(TfGpuId(parsed.id));
+      return GetLocalGPUInfo(parsed.id);
     } else if (parsed.type == "CPU") {
       return GetLocalCPUInfo();
     }
-- 
GitLab


From aea6e3fa75a49f442a649332cc514f7cf31c38d2 Mon Sep 17 00:00:00 2001
From: Vijay Vasudevan <vrv@google.com>
Date: Sun, 11 Feb 2018 21:19:37 -0800
Subject: [PATCH 942/966] Provide more diagnostic shape information in output
 window error message.

PiperOrigin-RevId: 185331713
---
 tensorflow/core/framework/common_shape_fns.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 8bb87483e1..623248b6ce 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -49,7 +49,11 @@ Status GetWindowedOutputSizeVerboseV2(int64 input_size, int64 filter_size,
       break;
   }
   if (*output_size < 0) {
-    return errors::InvalidArgument("computed output size would be negative");
+    return errors::InvalidArgument(
+        "Computed output size would be negative: ", *output_size,
+        " [input_size: ", input_size,
+        ", effective_filter_size: ", effective_filter_size,
+        ", stride: ", stride, "]");
   }
   return Status::OK();
 }
-- 
GitLab


From 80df0867184077c3cf228e77d55074048e79e63a Mon Sep 17 00:00:00 2001
From: MyungJoo Ham <myungjoo.ham@gmail.com>
Date: Mon, 12 Feb 2018 16:04:43 +0900
Subject: [PATCH 943/966] CMAKE: optionally link to ZLIB as systemlib / shared
 objects. (#15382)

If the user has ZLIB (and devel pkg) installed at the system
and the user wants to keep using that ZLIB for tensorflow,
the cmake option "-Dsystemlib_ZLIB=ON" will allow to do so.

Another option "-Dsystemlib_ALL=ON" will turn on every
"systemlib_*" options.

Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 tensorflow/contrib/cmake/CMakeLists.txt      |  22 +++-
 tensorflow/contrib/cmake/external/zlib.cmake | 108 +++++++++++--------
 2 files changed, 82 insertions(+), 48 deletions(-)

diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 26cf8b18b4..16317f538f 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -52,6 +52,7 @@ if (NOT WIN32)
   # for targets that link ${CMAKE_THREAD_LIBS_INIT}.
   find_package (Threads)
 
+  # Options for linking CUDA/CUDNN libraries
   option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/)
   option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/)
   if (NOT tensorflow_CUDNN_INCLUDE)
@@ -73,6 +74,14 @@ if (NOT WIN32)
     # option's default value is OFF. Fill it with real default values
     set(tensorflow_CUDA_LIBRARY_PATH /usr/local/cuda/lib64)
   endif (NOT tensorflow_CUDA_LIBRARY_PATH)
+
+  # Options for linking other libraries
+  option(systemlib_ZLIB "Use the system installed library as shared objects instead of downloading ZLIB and statically linking to it: ZLIB" OFF)
+
+  option(systemlib_ALL "Turn on every possible systemlib_* options" OFF)
+  if (systemlib_ALL)
+    set (systmelib_ZLIB ON)
+  endif (systemlib_ALL)
 endif()
 
 if (WIN32)
@@ -188,8 +197,10 @@ if (tensorflow_BUILD_CC_TESTS)
   include(googletest)
 endif()
 
+add_definitions(${ADD_CFLAGS})
+link_directories(${ADD_LINK_DIRECTORY})
+
 set(tensorflow_EXTERNAL_LIBRARIES
-    ${zlib_STATIC_LIBRARIES}
     ${gif_STATIC_LIBRARIES}
     ${png_STATIC_LIBRARIES}
     ${jpeg_STATIC_LIBRARIES}
@@ -203,6 +214,15 @@ set(tensorflow_EXTERNAL_LIBRARIES
     ${re2_STATIC_LIBRARIES}
     ${sqlite_STATIC_LIBRARIES}
 )
+
+if (systemlib_ZLIB)
+  set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
+      ${ZLIB_LIBRARIES})
+else (systemlib_ZLIB)
+  set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
+    ${zlib_STATIC_LIBRARIES})
+endif (systemlib_ZLIB)
+
 set(tensorflow_EXTERNAL_DEPENDENCIES
     zlib_copy_headers_to_destination
     gif_copy_headers_to_destination
diff --git a/tensorflow/contrib/cmake/external/zlib.cmake b/tensorflow/contrib/cmake/external/zlib.cmake
index c5eb0cbcc7..116d423093 100644
--- a/tensorflow/contrib/cmake/external/zlib.cmake
+++ b/tensorflow/contrib/cmake/external/zlib.cmake
@@ -12,61 +12,75 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-include (ExternalProject)
+if (systemlib_ZLIB)
+  find_package(PkgConfig)
+  pkg_search_module(ZLIB REQUIRED zlib)
+  set(zlib_INCLUDE_DIR ${ZLIB_INCLUDE_DIRS})
+  set(ADD_LINK_DIRECTORY ${ADD_LINK_DIRECTORY} ${ZLIB_LIBRARY_DIRS})
+  set(ADD_CFLAGS ${ADD_CFLAGS} ${ZLIB_CFLAGS_OTHER})
 
-set(zlib_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/zlib_archive)
-set(ZLIB_URL https://github.com/madler/zlib)
-set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
-set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
-set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
+  # To meet DEPENDS zlib from other projects.
+  # If we hit this line, zlib is already built and installed to the system.
+  add_custom_target(zlib)
+  add_custom_target(zlib_copy_headers_to_destination)
 
-if(WIN32)
-  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
-    set(zlib_STATIC_LIBRARIES
-        debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
-        optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
-  else()
-    if(CMAKE_BUILD_TYPE EQUAL Debug)
+else (systemlib_ZLIB)
+  include (ExternalProject)
+
+  set(zlib_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/zlib_archive)
+  set(ZLIB_URL https://github.com/madler/zlib)
+  set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
+  set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
+  set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
+
+  if(WIN32)
+    if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
       set(zlib_STATIC_LIBRARIES
-          ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib)
+          debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
+          optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
     else()
-      set(zlib_STATIC_LIBRARIES
-          ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
+      if(CMAKE_BUILD_TYPE EQUAL Debug)
+        set(zlib_STATIC_LIBRARIES
+            ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib)
+      else()
+        set(zlib_STATIC_LIBRARIES
+            ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
+      endif()
     endif()
+  else()
+    set(zlib_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
   endif()
-else()
-  set(zlib_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
-endif()
 
-set(ZLIB_HEADERS
-    "${ZLIB_INSTALL}/include/zconf.h"
-    "${ZLIB_INSTALL}/include/zlib.h"
-)
+  set(ZLIB_HEADERS
+      "${ZLIB_INSTALL}/include/zconf.h"
+      "${ZLIB_INSTALL}/include/zlib.h"
+  )
 
-ExternalProject_Add(zlib
-    PREFIX zlib
-    GIT_REPOSITORY ${ZLIB_URL}
-    GIT_TAG ${ZLIB_TAG}
-    INSTALL_DIR ${ZLIB_INSTALL}
-    BUILD_IN_SOURCE 1
-    BUILD_BYPRODUCTS ${zlib_STATIC_LIBRARIES}
-    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
-    CMAKE_CACHE_ARGS
-        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
-        -DCMAKE_BUILD_TYPE:STRING=Release
-        -DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
-)
+  ExternalProject_Add(zlib
+      PREFIX zlib
+      GIT_REPOSITORY ${ZLIB_URL}
+      GIT_TAG ${ZLIB_TAG}
+      INSTALL_DIR ${ZLIB_INSTALL}
+      BUILD_IN_SOURCE 1
+      BUILD_BYPRODUCTS ${zlib_STATIC_LIBRARIES}
+      DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+      CMAKE_CACHE_ARGS
+          -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
+          -DCMAKE_BUILD_TYPE:STRING=Release
+          -DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
+  )
 
-# put zlib includes in the directory where they are expected
-add_custom_target(zlib_create_destination_dir
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${zlib_INCLUDE_DIR}
-    DEPENDS zlib)
+  # put zlib includes in the directory where they are expected
+  add_custom_target(zlib_create_destination_dir
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${zlib_INCLUDE_DIR}
+      DEPENDS zlib)
 
-add_custom_target(zlib_copy_headers_to_destination
-    DEPENDS zlib_create_destination_dir)
+  add_custom_target(zlib_copy_headers_to_destination
+      DEPENDS zlib_create_destination_dir)
 
-foreach(header_file ${ZLIB_HEADERS})
-    add_custom_command(TARGET zlib_copy_headers_to_destination PRE_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${zlib_INCLUDE_DIR})
-endforeach()
+  foreach(header_file ${ZLIB_HEADERS})
+      add_custom_command(TARGET zlib_copy_headers_to_destination PRE_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${zlib_INCLUDE_DIR})
+  endforeach()
+endif (systemlib_ZLIB)
-- 
GitLab


From 8330cce0556893a7236d87a09e7c495c55197de0 Mon Sep 17 00:00:00 2001
From: Christopher Yeh <chrisyeh96@users.noreply.github.com>
Date: Sun, 11 Feb 2018 23:05:17 -0800
Subject: [PATCH 944/966] Improve formatting of shapes in tf.losses
 documentation (#16921)

---
 tensorflow/python/ops/losses/losses_impl.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 3368285bc6..ceeabd08f4 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -377,7 +377,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
 
   `weights` acts as a coefficient for the loss. If a scalar is provided, then
   the loss is simply scaled by the given value. If `weights` is a tensor of size
-  [batch_size], then the total loss for each sample of the batch is rescaled
+  `[batch_size]`, then the total loss for each sample of the batch is rescaled
   by the corresponding element in the `weights` vector. If the shape of
   `weights` matches the shape of `predictions`, then the loss of each
   measurable element of `predictions` is scaled by the corresponding value of
@@ -435,7 +435,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
 
   `weights` acts as a coefficient for the loss. If a scalar is provided, then
   the loss is simply scaled by the given value. If `weights` is a tensor of size
-  [batch_size], then the total loss for each sample of the batch is rescaled
+  `[batch_size]`, then the total loss for each sample of the batch is rescaled
   by the corresponding element in the `weights` vector. If the shape of
   `weights` matches the shape of `predictions`, then the loss of each
   measurable element of `predictions` is scaled by the corresponding value of
@@ -502,7 +502,7 @@ def mean_pairwise_squared_error(
 
   `weights` acts as a coefficient for the loss. If a scalar is provided, then
   the loss is simply scaled by the given value. If `weights` is a tensor of size
-  [batch_size], then the total loss for each sample of the batch is rescaled
+  `[batch_size]`, then the total loss for each sample of the batch is rescaled
   by the corresponding element in the `weights` vector.
 
   Args:
@@ -576,7 +576,7 @@ def mean_squared_error(
 
   `weights` acts as a coefficient for the loss. If a scalar is provided, then
   the loss is simply scaled by the given value. If `weights` is a tensor of size
-  [batch_size], then the total loss for each sample of the batch is rescaled
+  `[batch_size]`, then the total loss for each sample of the batch is rescaled
   by the corresponding element in the `weights` vector. If the shape of
   `weights` matches the shape of `predictions`, then the loss of each
   measurable element of `predictions` is scaled by the corresponding value of
@@ -796,7 +796,7 @@ def sparse_softmax_cross_entropy(
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
   then the loss is simply scaled by the given value. If `weights` is a
-  tensor of shape [`batch_size`], then the loss weights apply to each
+  tensor of shape `[batch_size]`, then the loss weights apply to each
   corresponding sample.
 
   Args:
-- 
GitLab


From b9f548d041ba8d66102c6d195e645051f1bee52f Mon Sep 17 00:00:00 2001
From: Yukun Chen <cykustcc@users.noreply.github.com>
Date: Mon, 12 Feb 2018 02:05:37 -0500
Subject: [PATCH 945/966] Fix warning about keep_dims. keep_dims -> keepdims
 for tf.reduce_sum(). (#16876)

* Fix warning about keep_dims. keep_dims -> keepdims for tf.reduce_sum().

* fix test failure.
---
 .../boosted_trees/python/utils/losses.py      |  4 +-
 .../python/ops/relaxed_onehot_categorical.py  |  2 +-
 .../python/ops/clustering_ops.py              |  4 +-
 .../contrib/layers/python/layers/layers.py    |  8 +-
 .../python/metric_learning/metric_loss_ops.py | 26 +++---
 .../contrib/signal/python/ops/spectral_ops.py |  2 +-
 tensorflow/examples/udacity/5_word2vec.ipynb  |  2 +-
 tensorflow/python/framework/function_test.py  |  2 +-
 .../python/kernel_tests/reduction_ops_test.py | 84 +++++++++----------
 .../kernel_tests/reduction_ops_test_big.py    | 18 ++--
 tensorflow/python/ops/clip_ops.py             |  2 +-
 tensorflow/python/ops/losses/losses_impl.py   |  8 +-
 tensorflow/python/ops/nn_grad.py              | 14 ++--
 13 files changed, 88 insertions(+), 88 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses.py b/tensorflow/contrib/boosted_trees/python/utils/losses.py
index 1e8b3ac08a..ab7ac2aba6 100644
--- a/tensorflow/contrib/boosted_trees/python/utils/losses.py
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses.py
@@ -78,7 +78,7 @@ def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
 
   # Calculate softmax probabilities for each class.
   unnormalized_probs = math_ops.exp(logits)
-  normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keep_dims=True)
+  normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keepdims=True)
   softmax_predictions = math_ops.divide(unnormalized_probs,
                                         math_ops.add(normalizers, eps))
 
@@ -120,7 +120,7 @@ def per_example_squared_loss(labels, weights, predictions):
     update_op: An update operation to update the loss's internal state.
   """
   unweighted_loss = math_ops.reduce_sum(
-      math_ops.square(predictions - labels), 1, keep_dims=True)
+      math_ops.square(predictions - labels), 1, keepdims=True)
 
   return unweighted_loss * weights, control_flow_ops.no_op()
 
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index b6becfa9fc..2aa771a71e 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -278,7 +278,7 @@ class ExpRelaxedOneHotCategorical(distribution.Distribution):
                       * math_ops.log(self.temperature))
     # compute the unnormalized density
     log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self._temperature_2d)
-    log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keep_dims=False)
+    log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keepdims=False)
     # combine unnormalized density with normalization constant
     log_prob = log_norm_const + log_unnorm_prob
     # Reshapes log_prob to be consistent with shape of user-supplied logits
diff --git a/tensorflow/contrib/factorization/python/ops/clustering_ops.py b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
index 6d3acb2750..23137e0a97 100644
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@@ -192,11 +192,11 @@ class KMeans(object):
         # Computes Euclidean distance. Note the first and third terms are
         # broadcast additions.
         squared_distance = (
-            math_ops.reduce_sum(math_ops.square(inp), 1, keep_dims=True) -
+            math_ops.reduce_sum(math_ops.square(inp), 1, keepdims=True) -
             2 * math_ops.matmul(inp, clusters, transpose_b=True) +
             array_ops.transpose(
                 math_ops.reduce_sum(
-                    math_ops.square(clusters), 1, keep_dims=True)))
+                    math_ops.square(clusters), 1, keepdims=True)))
         output.append(squared_distance)
 
     return output
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 37338ea04f..c42eab4efc 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -779,7 +779,7 @@ def batch_norm(inputs,
       else:
         if data_format == DATA_FORMAT_NCHW:
           mean, variance = nn.weighted_moments(
-              inputs, moments_axes, batch_weights, keep_dims=True)
+              inputs, moments_axes, batch_weights, keepdims=True)
           mean = array_ops.reshape(mean, [-1])
           variance = array_ops.reshape(variance, [-1])
         else:
@@ -2833,9 +2833,9 @@ def spatial_softmax(features,
 
       softmax_attention = nn.softmax(features / temperature)
       expected_x = math_ops.reduce_sum(
-          pos_x * softmax_attention, [1], keep_dims=True)
+          pos_x * softmax_attention, [1], keepdims=True)
       expected_y = math_ops.reduce_sum(
-          pos_y * softmax_attention, [1], keep_dims=True)
+          pos_y * softmax_attention, [1], keepdims=True)
       expected_xy = array_ops.concat([expected_x, expected_y], 1)
       feature_keypoints = array_ops.reshape(expected_xy,
                                             [-1, num_channels.value * 2])
@@ -2968,7 +2968,7 @@ def poincare_normalize(x, axis=1, epsilon=1e-5, name=None):
   """
   with ops.name_scope(name, 'poincare_normalize', [x]) as name:
     x = ops.convert_to_tensor(x, name='x')
-    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keep_dims=True)
+    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
     x_inv_norm = math_ops.rsqrt(square_sum)
     x_inv_norm = math_ops.minimum((1. - epsilon) * x_inv_norm, 1.)
     return math_ops.multiply(x, x_inv_norm, name=name)
diff --git a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
index c3a57ba51b..6842bc38eb 100644
--- a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
+++ b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
@@ -53,12 +53,12 @@ def pairwise_distance(feature, squared=False):
       math_ops.reduce_sum(
           math_ops.square(feature),
           axis=[1],
-          keep_dims=True),
+          keepdims=True),
       math_ops.reduce_sum(
           math_ops.square(
               array_ops.transpose(feature)),
           axis=[0],
-          keep_dims=True)) - 2.0 * math_ops.matmul(
+          keepdims=True)) - 2.0 * math_ops.matmul(
               feature, array_ops.transpose(feature))
 
   # Deal with numerical inaccuracies. Set small negatives to zero.
@@ -132,10 +132,10 @@ def masked_maximum(data, mask, dim=1):
     masked_maximums: N-D `Tensor`.
       The maximized dimension is of size 1 after the operation.
   """
-  axis_minimums = math_ops.reduce_min(data, dim, keep_dims=True)
+  axis_minimums = math_ops.reduce_min(data, dim, keepdims=True)
   masked_maximums = math_ops.reduce_max(
       math_ops.multiply(
-          data - axis_minimums, mask), dim, keep_dims=True) + axis_minimums
+          data - axis_minimums, mask), dim, keepdims=True) + axis_minimums
   return masked_maximums
 
 
@@ -151,10 +151,10 @@ def masked_minimum(data, mask, dim=1):
     masked_minimums: N-D `Tensor`.
       The minimized dimension is of size 1 after the operation.
   """
-  axis_maximums = math_ops.reduce_max(data, dim, keep_dims=True)
+  axis_maximums = math_ops.reduce_max(data, dim, keepdims=True)
   masked_minimums = math_ops.reduce_min(
       math_ops.multiply(
-          data - axis_maximums, mask), dim, keep_dims=True) + axis_maximums
+          data - axis_maximums, mask), dim, keepdims=True) + axis_maximums
   return masked_minimums
 
 
@@ -203,7 +203,7 @@ def triplet_semihard_loss(labels, embeddings, margin=1.0):
       math_ops.greater(
           math_ops.reduce_sum(
               math_ops.cast(
-                  mask, dtype=dtypes.float32), 1, keep_dims=True),
+                  mask, dtype=dtypes.float32), 1, keepdims=True),
           0.0), [batch_size, batch_size])
   mask_final = array_ops.transpose(mask_final)
 
@@ -290,7 +290,7 @@ def npairs_loss(labels, embeddings_anchor, embeddings_positive,
 
   labels_remapped = math_ops.to_float(
       math_ops.equal(labels, array_ops.transpose(labels)))
-  labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keep_dims=True)
+  labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)
 
   # Add the softmax loss.
   xent_loss = nn.softmax_cross_entropy_with_logits(
@@ -395,7 +395,7 @@ def npairs_loss_multilabel(sparse_labels, embeddings_anchor,
 
     multilabel_adjacency_matrix = _build_multilabel_adjacency(sparse_labels)
     labels_remapped = math_ops.to_float(multilabel_adjacency_matrix)
-    labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keep_dims=True)
+    labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)
 
     # Add the softmax loss.
     xent_loss = nn.softmax_cross_entropy_with_logits(
@@ -448,10 +448,10 @@ def lifted_struct_loss(labels, embeddings, margin=1.0):
   # Safe maximum: Temporarily shift negative distances
   #   above zero before taking max.
   #     this is to take the max only among negatives.
-  row_minimums = math_ops.reduce_min(diff, 1, keep_dims=True)
+  row_minimums = math_ops.reduce_min(diff, 1, keepdims=True)
   row_negative_maximums = math_ops.reduce_max(
       math_ops.multiply(
-          diff - row_minimums, mask), 1, keep_dims=True) + row_minimums
+          diff - row_minimums, mask), 1, keepdims=True) + row_minimums
 
   # Compute the loss.
   # Keep track of matrix of maximums where M_ij = max(m_i, m_j)
@@ -470,7 +470,7 @@ def lifted_struct_loss(labels, embeddings, margin=1.0):
       math_ops.reduce_sum(math_ops.multiply(
           math_ops.exp(
               diff_tiled - max_elements_vect),
-          mask_tiled), 1, keep_dims=True), [batch_size, batch_size])
+          mask_tiled), 1, keepdims=True), [batch_size, batch_size])
 
   loss_mat = max_elements + math_ops.log(
       loss_exp_left + array_ops.transpose(loss_exp_left))
@@ -686,7 +686,7 @@ def _find_loss_augmented_facility_idx(pairwise_distances, labels, chosen_ids,
                   array_ops.reshape(pairwise_distances_candidate, [1, -1])
               ], 0),
               axis=0,
-              keep_dims=True), [num_candidates, -1]),
+              keepdims=True), [num_candidates, -1]),
       axis=1)
 
   nmi_scores = array_ops.zeros([num_candidates])
diff --git a/tensorflow/contrib/signal/python/ops/spectral_ops.py b/tensorflow/contrib/signal/python/ops/spectral_ops.py
index bca2e01d7b..a8b5deff6c 100644
--- a/tensorflow/contrib/signal/python/ops/spectral_ops.py
+++ b/tensorflow/contrib/signal/python/ops/spectral_ops.py
@@ -144,7 +144,7 @@ def inverse_stft_window_fn(frame_step,
       overlaps = -(-frame_length // frame_step)  # Ceiling division.
       denom = array_ops.pad(denom, [(0, overlaps * frame_step - frame_length)])
       denom = array_ops.reshape(denom, [overlaps, frame_step])
-      denom = math_ops.reduce_sum(denom, 0, keep_dims=True)
+      denom = math_ops.reduce_sum(denom, 0, keepdims=True)
       denom = array_ops.tile(denom, [overlaps, 1])
       denom = array_ops.reshape(denom, [overlaps * frame_step])
 
diff --git a/tensorflow/examples/udacity/5_word2vec.ipynb b/tensorflow/examples/udacity/5_word2vec.ipynb
index 18c456cad7..3b43d1fb55 100644
--- a/tensorflow/examples/udacity/5_word2vec.ipynb
+++ b/tensorflow/examples/udacity/5_word2vec.ipynb
@@ -455,7 +455,7 @@
         "  \n",
         "  # Compute the similarity between minibatch examples and all embeddings.\n",
         "  # We use the cosine distance:\n",
-        "  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n",
+        "  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))\n",
         "  normalized_embeddings = embeddings / norm\n",
         "  valid_embeddings = tf.nn.embedding_lookup(\n",
         "    normalized_embeddings, valid_dataset)\n",
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index b35cee0111..301a7f682d 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -1458,7 +1458,7 @@ class FunctionInlineControlTest(test.TestCase):
       def Cell(v):
         # If v is a vector [n, 1], x is a big square matrix.
         x = math_ops.tanh(v + array_ops.transpose(v, [1, 0]))
-        return math_ops.reduce_sum(x, 1, keep_dims=True)
+        return math_ops.reduce_sum(x, 1, keepdims=True)
 
       @function.Defun(dtype)
       def Forward(x):
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 4231a79b2d..5314781629 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -110,10 +110,10 @@ class ReductionUnknownShape(test.TestCase):
 
 class BaseReductionTest(test.TestCase):
 
-  def _tf_reduce(self, x, reduction_axes, keep_dims):
+  def _tf_reduce(self, x, reduction_axes, keepdims):
     raise NotImplementedError()
 
-  def _np_reduce(self, x, reduction_axes, keep_dims):
+  def _np_reduce(self, x, reduction_axes, keepdims):
     raise NotImplementedError()
 
   def _makeIncremental(self, shape, dtype):
@@ -128,10 +128,10 @@ class BaseReductionTest(test.TestCase):
       data -= 2j * data
     return data
 
-  def _compare(self, x, reduction_axes, keep_dims, feed_dict=None):
-    np_ans = self._np_reduce(x, reduction_axes, keep_dims)
+  def _compare(self, x, reduction_axes, keepdims, feed_dict=None):
+    np_ans = self._np_reduce(x, reduction_axes, keepdims)
     with self.test_session(use_gpu=True) as sess:
-      tf_ans = self._tf_reduce(x, reduction_axes, keep_dims)
+      tf_ans = self._tf_reduce(x, reduction_axes, keepdims)
       out = sess.run(tf_ans, feed_dict)
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -140,8 +140,8 @@ class BaseReductionTest(test.TestCase):
     if reduction_axes is not None and np.shape(reduction_axes) == (1,):
       # Test scalar reduction_axes argument
       self._compareAll(x, reduction_axes[0])
-    self._compare(x, reduction_axes, keep_dims=False, feed_dict=feed_dict)
-    self._compare(x, reduction_axes, keep_dims=True, feed_dict=feed_dict)
+    self._compare(x, reduction_axes, keepdims=False, feed_dict=feed_dict)
+    self._compare(x, reduction_axes, keepdims=True, feed_dict=feed_dict)
 
   def _compareAllAxes(self, x, feed_dict=None):
     self._compareAll(x, None)
@@ -171,14 +171,14 @@ class BaseReductionTest(test.TestCase):
 
 class SumReductionTest(BaseReductionTest):
 
-  def _tf_reduce(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_sum(x, reduction_axes, keep_dims)
+  def _tf_reduce(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_sum(x, reduction_axes, keepdims)
 
-  def _np_reduce(self, x, reduction_axes, keep_dims):
+  def _np_reduce(self, x, reduction_axes, keepdims):
     if isinstance(reduction_axes, list) or isinstance(reduction_axes,
                                                       np.ndarray):
       reduction_axes = tuple(reduction_axes)
-    return np.sum(x, axis=reduction_axes, keepdims=keep_dims)
+    return np.sum(x, axis=reduction_axes, keepdims=keepdims)
 
   def testAxesType(self):
     for dtype in [dtypes.int64, dtypes.int32]:
@@ -298,7 +298,7 @@ class SumReductionTest(BaseReductionTest):
     c_known_rank = array_ops.placeholder(dtypes.float32)
     c_known_rank.set_shape(tensor_shape.unknown_shape(ndims=3))
     s_known_rank = math_ops.reduce_sum(
-        c_known_rank, reduction_axes, keep_dims=True)
+        c_known_rank, reduction_axes, keepdims=True)
     self.assertEqual(3, s_known_rank.get_shape().ndims)
 
     np_input = np.random.randn(3, 3, 3)
@@ -308,11 +308,11 @@ class SumReductionTest(BaseReductionTest):
     unknown_indices = array_ops.placeholder(dtypes.int32)
     c_unknown_indices = constant_op.constant([[10.0], [20.0]])
     s_unknown_indices = math_ops.reduce_sum(
-        c_unknown_indices, unknown_indices, keep_dims=False)
+        c_unknown_indices, unknown_indices, keepdims=False)
     self.assertEqual(tensor_shape.unknown_shape(),
                      s_unknown_indices.get_shape())
     s_unknown_indices_keep = math_ops.reduce_sum(
-        c_unknown_indices, unknown_indices, keep_dims=True)
+        c_unknown_indices, unknown_indices, keepdims=True)
     self.assertEqual(2, s_unknown_indices_keep.get_shape().ndims)
 
   def testWrongShapeForReductionIndices(self):
@@ -372,10 +372,10 @@ class SumReductionTest(BaseReductionTest):
 
 class MeanReductionTest(BaseReductionTest):
 
-  def _tf_reduce(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_mean(x, reduction_axes, keep_dims)
+  def _tf_reduce(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_mean(x, reduction_axes, keepdims)
 
-  def _np_reduce(self, x, reduction_axes, keep_dims):
+  def _np_reduce(self, x, reduction_axes, keepdims):
     if isinstance(reduction_axes, list) or isinstance(reduction_axes,
                                                       np.ndarray):
       reduction_axes = tuple(reduction_axes)
@@ -389,7 +389,7 @@ class MeanReductionTest(BaseReductionTest):
     # np.mean automatically converts integer inputs to float, while TensorFlow's
     # reduce_mean does not. For integer inputs, we emulate TensorFlow's behavior
     # using np.sum and truncating division.
-    np_sum = np.sum(x, axis=reduction_axes, keepdims=keep_dims)
+    np_sum = np.sum(x, axis=reduction_axes, keepdims=keepdims)
     if np.issubdtype(x.dtype, np.integer):
       return np_sum // count
     return np_sum / count
@@ -458,14 +458,14 @@ class MeanReductionTest(BaseReductionTest):
 
 class ProdReductionTest(BaseReductionTest):
 
-  def _tf_reduce(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_prod(x, reduction_axes, keep_dims)
+  def _tf_reduce(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_prod(x, reduction_axes, keepdims)
 
-  def _np_reduce(self, x, reduction_axes, keep_dims):
+  def _np_reduce(self, x, reduction_axes, keepdims):
     if isinstance(reduction_axes, list) or isinstance(reduction_axes,
                                                       np.ndarray):
       reduction_axes = tuple(reduction_axes)
-    return np.prod(x, axis=reduction_axes, keepdims=keep_dims)
+    return np.prod(x, axis=reduction_axes, keepdims=keepdims)
 
   def testAxesType(self):
     for dtype in [dtypes.int64, dtypes.int32]:
@@ -549,17 +549,17 @@ class ProdReductionTest(BaseReductionTest):
 
 class MinReductionTest(test.TestCase):
 
-  def _compare(self, x, reduction_axes, keep_dims, use_gpu=False):
+  def _compare(self, x, reduction_axes, keepdims, use_gpu=False):
     np_ans = x
     if reduction_axes is None:
-      np_ans = np.amin(np_ans, keepdims=keep_dims)
+      np_ans = np.amin(np_ans, keepdims=keepdims)
     else:
       for ra in reduction_axes[::-1]:
-        np_ans = np.amin(np_ans, axis=ra, keepdims=keep_dims)
+        np_ans = np.amin(np_ans, axis=ra, keepdims=keepdims)
     with self.test_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
-      tf_ans = math_ops.reduce_min(x, reduction_axes, keep_dims)
+      tf_ans = math_ops.reduce_min(x, reduction_axes, keepdims)
       out = tf_ans.eval()
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -662,17 +662,17 @@ class MinReductionTest(test.TestCase):
 
 class MaxReductionTest(test.TestCase):
 
-  def _compare(self, x, reduction_axes, keep_dims, use_gpu=False):
+  def _compare(self, x, reduction_axes, keepdims, use_gpu=False):
     np_ans = x
     if reduction_axes is None:
-      np_ans = np.amax(np_ans, keepdims=keep_dims)
+      np_ans = np.amax(np_ans, keepdims=keepdims)
     else:
       for ra in reduction_axes[::-1]:
-        np_ans = np.amax(np_ans, axis=ra, keepdims=keep_dims)
+        np_ans = np.amax(np_ans, axis=ra, keepdims=keepdims)
     with self.test_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
-      tf_ans = math_ops.reduce_max(x, reduction_axes, keep_dims)
+      tf_ans = math_ops.reduce_max(x, reduction_axes, keepdims)
       out = tf_ans.eval()
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -789,17 +789,17 @@ class MaxReductionTest(test.TestCase):
 
 class AllReductionTest(test.TestCase):
 
-  def _compare(self, x, reduction_axes, keep_dims, use_gpu=False):
+  def _compare(self, x, reduction_axes, keepdims, use_gpu=False):
     np_ans = x
     if reduction_axes is None:
-      np_ans = np.all(np_ans, keepdims=keep_dims)
+      np_ans = np.all(np_ans, keepdims=keepdims)
     else:
       for ra in reduction_axes[::-1]:
-        np_ans = np.all(np_ans, axis=ra, keepdims=keep_dims)
+        np_ans = np.all(np_ans, axis=ra, keepdims=keepdims)
     with self.test_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
-      tf_ans = math_ops.reduce_all(x, reduction_axes, keep_dims)
+      tf_ans = math_ops.reduce_all(x, reduction_axes, keepdims)
       out = tf_ans.eval()
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -838,17 +838,17 @@ class AllReductionTest(test.TestCase):
 
 class AnyReductionTest(test.TestCase):
 
-  def _compare(self, x, reduction_axes, keep_dims, use_gpu=False):
+  def _compare(self, x, reduction_axes, keepdims, use_gpu=False):
     np_ans = x
     if reduction_axes is None:
-      np_ans = np.any(np_ans, keepdims=keep_dims)
+      np_ans = np.any(np_ans, keepdims=keepdims)
     else:
       for ra in reduction_axes[::-1]:
-        np_ans = np.any(np_ans, axis=ra, keepdims=keep_dims)
+        np_ans = np.any(np_ans, axis=ra, keepdims=keepdims)
     with self.test_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
-      tf_ans = math_ops.reduce_any(x, reduction_axes, keep_dims)
+      tf_ans = math_ops.reduce_any(x, reduction_axes, keepdims)
       out = tf_ans.eval()
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -890,18 +890,18 @@ class CountNonzeroReductionTest(test.TestCase):
   def _compare(self,
                x,
                reduction_axes,
-               keep_dims,
+               keepdims,
                use_gpu=False,
                feed_dict=None):
     np_ans = (x != 0).astype(np.int32)
     if reduction_axes is None:
-      np_ans = np.sum(np_ans, keepdims=keep_dims)
+      np_ans = np.sum(np_ans, keepdims=keepdims)
     else:
       reduction_axes = np.array(reduction_axes).astype(np.int32)
       for ra in reduction_axes.ravel()[::-1]:
-        np_ans = np.sum(np_ans, axis=ra, keepdims=keep_dims)
+        np_ans = np.sum(np_ans, axis=ra, keepdims=keepdims)
     with self.test_session(use_gpu=use_gpu) as sess:
-      tf_ans = math_ops.count_nonzero(x, reduction_axes, keep_dims)
+      tf_ans = math_ops.count_nonzero(x, reduction_axes, keepdims)
       out = sess.run(tf_ans, feed_dict)
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test_big.py b/tensorflow/python/kernel_tests/reduction_ops_test_big.py
index 0959adb026..d70360775a 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test_big.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test_big.py
@@ -27,24 +27,24 @@ from tensorflow.python.platform import test
 
 class BaseReductionTest(test.TestCase):
 
-  def _tf_reduce(self, x, reduction_axes, keep_dims):
+  def _tf_reduce(self, x, reduction_axes, keepdims):
     raise NotImplementedError()
 
 
 class BigReductionTest(BaseReductionTest):
   """Test reductions for sum and boolean all over a wide range of shapes."""
 
-  def _tf_reduce_max(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_max(x, reduction_axes, keep_dims)
+  def _tf_reduce_max(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_max(x, reduction_axes, keepdims)
 
-  def _tf_reduce_all(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_all(x, reduction_axes, keep_dims)
+  def _tf_reduce_all(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_all(x, reduction_axes, keepdims)
 
-  def _tf_reduce_mean(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_mean(x, reduction_axes, keep_dims)
+  def _tf_reduce_mean(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_mean(x, reduction_axes, keepdims)
 
-  def _tf_reduce_sum(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_sum(x, reduction_axes, keep_dims)
+  def _tf_reduce_sum(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_sum(x, reduction_axes, keepdims)
 
   def testFloat32Sum(self):
     # make sure we test all possible kernel invocations
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index dd8c33247c..49f8c66531 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -110,7 +110,7 @@ def clip_by_norm(t, clip_norm, axes=None, name=None):
     t = ops.convert_to_tensor(t, name="t")
 
     # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
-    l2norm = math_ops.sqrt(math_ops.reduce_sum(t * t, axes, keep_dims=True))
+    l2norm = math_ops.sqrt(math_ops.reduce_sum(t * t, axes, keepdims=True))
     intermediate = t * clip_norm
     # Assert that the shape is compatible with the initial shape,
     # to prevent unintentional broadcasting.
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index ceeabd08f4..36b8c23feb 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -144,7 +144,7 @@ def _num_present(losses, weights, per_batch=False):
     if per_batch:
       return math_ops.reduce_sum(
           present, axis=math_ops.range(1, array_ops.rank(present)),
-          keep_dims=True, name=scope)
+          keepdims=True, name=scope)
     return math_ops.reduce_sum(present, name=scope)
 
 
@@ -311,7 +311,7 @@ def cosine_distance(
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
     radial_diffs = math_ops.multiply(predictions, labels)
-    losses = 1 - math_ops.reduce_sum(radial_diffs, axis=(axis,), keep_dims=True)
+    losses = 1 - math_ops.reduce_sum(radial_diffs, axis=(axis,), keepdims=True)
     return compute_weighted_loss(
         losses, weights, scope, loss_collection, reduction=reduction)
 
@@ -543,14 +543,14 @@ def mean_pairwise_squared_error(
 
       sum_squares_diff_per_batch = math_ops.reduce_sum(
           math_ops.square(diffs), reduction_indices=reduction_indices,
-          keep_dims=True)
+          keepdims=True)
       num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
       term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
                               num_present_per_batch - 1)
 
       sum_diff = math_ops.reduce_sum(
-          diffs, reduction_indices=reduction_indices, keep_dims=True)
+          diffs, reduction_indices=reduction_indices, keepdims=True)
       term2 = 2.0 * _safe_div(
           math_ops.square(sum_diff),
           math_ops.multiply(num_present_per_batch, num_present_per_batch - 1))
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 5e6cafd6aa..e0fd1fe040 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -863,27 +863,27 @@ def _BatchNormGrad(grad_y,
     grad_y = math_ops.cast(grad_y, dtypes.float32)
   if is_training:
     if data_format == b"NHWC":
-      keep_dims = False
+      keepdims = False
       reduce_axis = [0, 1, 2]
     else:
-      keep_dims = True
+      keepdims = True
       reduce_axis = [0, 2, 3]
       shape = [1, array_ops.size(scale), 1, 1]
       scale = array_ops.reshape(scale, shape)
-    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keep_dims=keep_dims)
-    mean_x = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
+    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims)
+    mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
     var_x = math_ops.reduce_mean(
         math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
         reduce_axis,
-        keep_dims=keep_dims)
+        keepdims=keepdims)
     grad_y_offset = grad_y - mean_grad_y
     x_offset = x - mean_x
     mean = math_ops.reduce_mean(
-        grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
+        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
     grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
         grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
     grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
-        grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
+        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
     if data_format == b"NCHW":
       grad_scale = array_ops.squeeze(grad_scale)
     grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
-- 
GitLab


From 9384a14e7bf89bad7c05a3e784e6817a033c8beb Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 12 Feb 2018 12:04:41 +0000
Subject: [PATCH 946/966] Fix some warnings with keep_dims in
 `tf.contrib.distributions`

This fix fixes some warnings with keep_dims in `tf.contrib.distributions`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/distributions/util.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 5bc25128a8..0a3000ef5c 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -1041,14 +1041,14 @@ def reduce_weighted_logsumexp(
   with ops.name_scope(name, "reduce_weighted_logsumexp", [logx, w]):
     logx = ops.convert_to_tensor(logx, name="logx")
     if w is None:
-      lswe = math_ops.reduce_logsumexp(logx, axis=axis, keep_dims=keep_dims)
+      lswe = math_ops.reduce_logsumexp(logx, axis=axis, keepdims=keep_dims)
       if return_sign:
         sgn = array_ops.ones_like(lswe)
         return lswe, sgn
       return lswe
     w = ops.convert_to_tensor(w, dtype=logx.dtype, name="w")
     log_absw_x = logx + math_ops.log(math_ops.abs(w))
-    max_log_absw_x = math_ops.reduce_max(log_absw_x, axis=axis, keep_dims=True)
+    max_log_absw_x = math_ops.reduce_max(log_absw_x, axis=axis, keepdims=True)
     # If the largest element is `-inf` or `inf` then we don't bother subtracting
     # off the max. We do this because otherwise we'd get `inf - inf = NaN`. That
     # this is ok follows from the fact that we're actually free to subtract any
@@ -1062,7 +1062,7 @@ def reduce_weighted_logsumexp(
     sum_wx_over_max_absw_x = math_ops.reduce_sum(
         wx_over_max_absw_x,
         axis=axis,
-        keep_dims=keep_dims)
+        keepdims=keep_dims)
     if not keep_dims:
       max_log_absw_x = array_ops.squeeze(max_log_absw_x, axis)
     sgn = math_ops.sign(sum_wx_over_max_absw_x)
@@ -1180,7 +1180,7 @@ def process_quadrature_grid_and_probs(
     grid = ops.convert_to_tensor(grid, name="grid", dtype=dtype)
     probs = ops.convert_to_tensor(probs, name="unnormalized_probs",
                                   dtype=dtype)
-    probs /= linalg_ops.norm(probs, ord=1, axis=-1, keep_dims=True,
+    probs /= linalg_ops.norm(probs, ord=1, axis=-1, keepdims=True,
                              name="probs")
 
     def _static_event_size(x):
-- 
GitLab


From a09c5af9c94e2799ffe6162a342808742e88ab11 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 12 Feb 2018 12:07:24 +0000
Subject: [PATCH 947/966] Fix some keep_dims warnings in math_ops_tests

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index bd26ff6696..d314124ccd 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -105,7 +105,7 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
     for dtype in [np.float16, np.float32, np.double]:
       x_np = np.random.rand(5, 5).astype(dtype)
       with self.test_session(use_gpu=True):
-        y_tf_np = math_ops.reduce_logsumexp(x_np, keep_dims=True).eval()
+        y_tf_np = math_ops.reduce_logsumexp(x_np, keepdims=True).eval()
         self.assertEqual(y_tf_np.ndim, x_np.ndim)
         y_np = log(np.sum(exp(x_np), keepdims=True))
         self.assertAllClose(y_tf_np, y_np)
-- 
GitLab


From a2504ab91afcdeb917a272d2cc54f13dd67c04e7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 12 Feb 2018 05:34:05 -0800
Subject: [PATCH 948/966] [XLA] Support generating tuple shaped fake data in
 client testing

The previous implementation failed over in case of a tuple shaped input
what broke the replay computation tool for the case where the input is a
tuple.

PiperOrigin-RevId: 185366228
---
 tensorflow/compiler/xla/client/lib/testing.cc | 37 +++++++++++++++++--
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index 5f2b55713e..b63a1465ea 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -31,14 +31,43 @@ limitations under the License.
 namespace xla {
 namespace {
 
+// Calculates the number of bytes required to store the data within the
+// specified shape. In case of a (nested) tuple shape this is the total byte
+// size of all sub-shapes within the tuple.
+int64 DataSizeOfShape(const Shape& shape) {
+  if (ShapeUtil::IsArray(shape)) {
+    return ShapeUtil::ByteSizeOf(shape);
+  }
+
+  int64 total_size = 0;
+  for (const Shape& s : shape.tuple_shapes()) {
+    total_size += DataSizeOfShape(s);
+  }
+  return total_size;
+}
+
+// Create a ComputationDataHandle for an op what generates fake data with the
+// given shape.
+ComputationDataHandle BuildFakeDataOpOnDevice(const Shape& shape,
+                                              ComputationBuilder* builder) {
+  if (ShapeUtil::IsArray(shape)) {
+    return builder->Broadcast(
+        builder->ConstantLiteral(Literal::One(shape.element_type())),
+        AsInt64Slice(shape.dimensions()));
+  }
+  std::vector<ComputationDataHandle> parts;
+  for (const Shape& s : shape.tuple_shapes()) {
+    parts.push_back(BuildFakeDataOpOnDevice(s, builder));
+  }
+  return builder->Tuple(parts);
+}
+
 std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
                                                        Client* client) {
   ComputationBuilder b(
       client,
       tensorflow::strings::StrCat("make_fake_", ShapeUtil::HumanString(shape)));
-  // TODO(b/26811613): Replace this when RNG is supported on all backends.
-  b.Broadcast(b.ConstantLiteral(Literal::One(shape.element_type())),
-              AsInt64Slice(shape.dimensions()));
+  BuildFakeDataOpOnDevice(shape, &b);
   Computation computation = b.Build().ConsumeValueOrDie();
 
   auto execution_options = CreateDefaultExecutionOptions();
@@ -51,7 +80,7 @@ std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
 
 std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
                                               Client* client) {
-  if (ShapeUtil::ByteSizeOf(shape) < (1LL << 20)) {
+  if (DataSizeOfShape(shape) < (1LL << 20)) {
     StatusOr<std::unique_ptr<Literal>> literal_status = MakeFakeLiteral(shape);
     if (!literal_status.ok()) {
       // If we got an Unimplemented error, fall back to making the fake data via
-- 
GitLab


From dd2447af6afe43bf9b10d18a6f438f47963f43dc Mon Sep 17 00:00:00 2001
From: Brian Patton <bjp@google.com>
Date: Mon, 12 Feb 2018 06:40:26 -0800
Subject: [PATCH 949/966] For debugging purposes, it can be useful to know
 which ops are considered non-pure / non-constant.

PiperOrigin-RevId: 185371882
---
 tensorflow/compiler/xla/service/user_computation.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index ef9c80b043..fead9b9236 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -1997,6 +1997,9 @@ void PureFunctionalVisitor(const SessionComputation& session_computation,
     default:
       LOG(FATAL) << "Unexpected request type: " << request.request().op_case();
   }
+  if (!*is_functional) {
+    VLOG(1) << "Non-functional: " << request.request().DebugString();
+  }
   visited->insert(handle.handle());
 }
 
-- 
GitLab


From a8f50991d941b76e32ec3ebe3b3b80f6781e5e58 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 12 Feb 2018 08:38:17 -0800
Subject: [PATCH 950/966] Internal Change

PiperOrigin-RevId: 185382594
---
 tensorflow/contrib/lite/BUILD        | 24 ++++++++++++------------
 tensorflow/contrib/lite/models/BUILD | 26 ++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 12 deletions(-)
 create mode 100644 tensorflow/contrib/lite/models/BUILD

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 30664832fb..3520a4eaf0 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -235,18 +235,18 @@ cc_test(
 
 # Model tests
 
-cc_library(
-    name = "models_test_utils",
-    testonly = 1,
-    hdrs = ["models/test_utils.h"],
-    deps = select({
-        "//tensorflow:android": [],
-        "//conditions:default": [
-            "@com_google_absl//absl/strings",
-            "//tensorflow/core:test",
-        ],
-    }),
-)
+#cc_library(
+#    name = "models_test_utils",
+#    testonly = 1,
+#    hdrs = ["models/test_utils.h"],
+#    deps = select({
+#        "//tensorflow:android": [],
+#        "//conditions:default": [
+#            "@com_google_absl//absl/strings",
+#            "//tensorflow/core:test",
+#        ],
+#    }),
+#)
 
 filegroup(
     name = "all_files",
diff --git a/tensorflow/contrib/lite/models/BUILD b/tensorflow/contrib/lite/models/BUILD
new file mode 100644
index 0000000000..6a1255b586
--- /dev/null
+++ b/tensorflow/contrib/lite/models/BUILD
@@ -0,0 +1,26 @@
+# Model tests
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
+
+exports_files(glob([
+    "testdata/*",
+]))
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
-- 
GitLab


From ac8e4fb7417920f48e4718fb6cf3ea605583b451 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Mon, 12 Feb 2018 09:28:47 -0800
Subject: [PATCH 951/966] ParseNodeName fix.

ParseNodeName was skipping ops that started with an underscore, leading to warnings that input of an op was undefined and stopping grappler optimizations from being run on the graph.

PiperOrigin-RevId: 185388749
---
 tensorflow/core/grappler/utils.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 7bfaf36865..eb5a2c48dc 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -132,7 +132,7 @@ string ParseNodeName(const string& name, int* position) {
   strings::Scanner scan(name);
   scan.ZeroOrOneLiteral("^")
       .RestartCapture()
-      .One(strings::Scanner::LETTER_DIGIT_DOT)
+      .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
       .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
   StringPiece capture;
   StringPiece remaining;
-- 
GitLab


From 21a9ca4487d0d3ef9f2aa2ba5909b37c735c18e6 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Mon, 12 Feb 2018 09:41:43 -0800
Subject: [PATCH 952/966] Fix linter errors in test_tftrt.py

---
 .../contrib/tensorrt/test/test_tftrt.py       | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 927a3e4719..71956662de 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -18,33 +18,33 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
 import tensorflow as tf
 import tensorflow.contrib.tensorrt as trt
-import numpy as np
 
 
-def getSimpleGraphDef():
+def get_simple_graph_def():
   """Create a simple graph and return its graph_def"""
   g = tf.Graph()
   with g.as_default():
-    A = tf.placeholder(dtype=tf.float32, shape=(None, 24, 24, 2), name="input")
+    a = tf.placeholder(dtype=tf.float32, shape=(None, 24, 24, 2), name="input")
     e = tf.constant(
         [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
         name="weights",
         dtype=tf.float32)
     conv = tf.nn.conv2d(
-        input=A, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
+        input=a, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
     b = tf.constant([4., 1.5, 2., 3., 5., 7.], name="bias", dtype=tf.float32)
     t = tf.nn.bias_add(conv, b, name="biasAdd")
     relu = tf.nn.relu(t, "relu")
     idty = tf.identity(relu, "ID")
     v = tf.nn.max_pool(
         idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
-    out = tf.squeeze(v, name="output")
+    tf.squeeze(v, name="output")
   return g.as_graph_def()
 
 
-def runGraph(gdef, dumm_inp):
+def run_graph(gdef, dumm_inp):
   gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.50)
   tf.reset_default_graph()
   g = tf.Graph()
@@ -60,12 +60,12 @@ def runGraph(gdef, dumm_inp):
 
 
 if "__main__" in __name__:
-  inpDims = (100, 24, 24, 2)
-  dummy_input = np.random.random_sample(inpDims)
-  gdef = getSimpleGraphDef()
-  trt_graph = trt.create_inference_graph(gdef, ["output"],
-                                         inpDims[0])  # Get optimized graph
-  o1 = runGraph(gdef, dummy_input)
-  o2 = runGraph(trt_graph, dummy_input)
-  assert (np.array_equal(o1, o2))
+  inp_dims = (100, 24, 24, 2)
+  dummy_input = np.random.random_sample(inp_dims)
+  gdef = get_simple_graph_def()
+  # Get optimized graph
+  trt_graph = trt.create_inference_graph(gdef, ["output"], inp_dims[0])
+  o1 = run_graph(gdef, dummy_input)
+  o2 = run_graph(trt_graph, dummy_input)
+  assert np.array_equal(o1, o2)
   print("Pass")
-- 
GitLab


From 423aec8f3c891ee3d0bf15e8ee693b7bc0064d8b Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 12 Feb 2018 09:57:40 -0800
Subject: [PATCH 953/966] Update `tf.contrib.data` API docstring.

PiperOrigin-RevId: 185392564
---
 tensorflow/contrib/data/__init__.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 092c50c370..fcdccdd26c 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -12,15 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""`tf.contrib.data` API for input pipelines.
+"""Experimental API for building input pipelines.
 
-This module contains the experimental (less stable) counterpart to the
-`tf.data` API. See @{tf.data.Dataset} and @{tf.data.Iterator} for the
-stable classes.
+This module contains experimental `Dataset` sources and transformations that can
+be used in conjunction with the @{tf.data.Dataset} API. Note that the
+`tf.contrib.data` API is not subject to the same backwards compatibility
+guarantees as `tf.data`, but we will provide deprecation advice in advance of
+removing existing functionality.
 
 See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 
-@@Dataset
 @@Counter
 
 @@batch_and_drop_remainder
-- 
GitLab


From 3ee1721b46d0e61097d0ee72f01e3de9739f0b6f Mon Sep 17 00:00:00 2001
From: Yifei Feng <fengyifei2026@gmail.com>
Date: Mon, 12 Feb 2018 10:09:20 -0800
Subject: [PATCH 954/966] Update bazel version in docker (#16880)

---
 tensorflow/tools/ci_build/install/install_bazel.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh
index cf8737c2d8..1df6a84d7c 100755
--- a/tensorflow/tools/ci_build/install/install_bazel.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel.sh
@@ -15,7 +15,7 @@
 # ==============================================================================
 
 # Select bazel version.
-BAZEL_VERSION="0.8.0"
+BAZEL_VERSION="0.10.0"
 
 set +e
 local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
-- 
GitLab


From 5ae3ed1dd2096382f1bd044edfd0517d7905e0ad Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Mon, 12 Feb 2018 10:27:18 -0800
Subject: [PATCH 955/966] Fix shape inference bug in tensorlist

PiperOrigin-RevId: 185397219
---
 tensorflow/core/ops/list_ops.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index fa40f41bb9..3487c955cb 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -241,6 +241,7 @@ REGISTER_OP("TensorListSetItem")
       DataType t;
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
       auto* handle_data = c->input_handle_shapes_and_types(0);
+      c->set_output(0, c->Scalar());
       if (handle_data == nullptr) {
         c->set_output_handle_shapes_and_types(0, {{c->UnknownShape(), t}});
         return Status::OK();
-- 
GitLab


From 7ecc83de23df629fcebb541262925e34dd17cc84 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Mon, 12 Feb 2018 10:34:18 -0800
Subject: [PATCH 956/966] [TF:XLA] Add additional test case for tf.gather.

PiperOrigin-RevId: 185398368
---
 tensorflow/compiler/tests/gather_test.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tensorflow/compiler/tests/gather_test.py b/tensorflow/compiler/tests/gather_test.py
index 13cbe6f312..1a8c451911 100644
--- a/tensorflow/compiler/tests/gather_test.py
+++ b/tensorflow/compiler/tests/gather_test.py
@@ -122,6 +122,20 @@ class GatherTest(xla_test.XLATestCase):
             gather_np = np.take(params, indices, axis=axis)
             self.assertAllEqual(gather_np, gather_value)
 
+  def testIndicesWithDifferentDimensions(self):
+    with self.test_session():
+      for dtype in self.numeric_tf_types:
+        params = array_ops.placeholder(dtype=dtype)
+        indices = array_ops.placeholder(dtype=np.int32)
+        with self.test_scope():
+          gather = array_ops.gather(params, indices)
+        self.assertAllEqual(
+            7, gather.eval(feed_dict={params: [4, 7, 2], indices: 1}))
+        self.assertAllEqual(
+            [7], gather.eval(feed_dict={params: [4, 7, 2], indices: [1]}))
+        self.assertAllEqual(
+            [[7]], gather.eval(feed_dict={params: [4, 7, 2], indices: [[1]]}))
+
 
 class GatherBenchmark(test.Benchmark):
   """Microbenchmarks for the gather op."""
-- 
GitLab


From a55c133748b695778163035f098cf3ac34302872 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 12 Feb 2018 10:34:20 -0800
Subject: [PATCH 957/966] Add support for scalars in `tf.contrib.all_reduce`.

PiperOrigin-RevId: 185398372
---
 tensorflow/contrib/all_reduce/python/all_reduce.py   | 12 ++++++------
 .../contrib/all_reduce/python/all_reduce_test.py     |  4 +++-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index 28f60b3499..16617b7266 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -48,7 +48,7 @@ def _flatten_tensors(tensors):
   if shape.ndims is None:
     raise ValueError("At least one of the tensors in 'tensors' must have "
                      "statically known rank.")
-  if len(shape) > 1:
+  if len(shape) != 1:
     reshaped = []
     for t in tensors:
       with ops.colocate_with(t):
@@ -289,7 +289,7 @@ def build_ring_all_reduce(input_tensors, num_workers, num_subchunks,
                                        chunks_by_dev)
   if pad_len > 0:
     output_tensors = _strip_padding(output_tensors, pad_len)
-  if len(shape) > 1:
+  if len(shape) != 1:
     output_tensors = _reshape_tensors(output_tensors, shape)
   return output_tensors
 
@@ -466,7 +466,7 @@ def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None):
   if un_op:
     reduced_shards = [un_op(t) for t in reduced_shards]
   output_tensors = _build_recursive_hd_scatter(reduced_shards, devices)
-  if len(shape) > 1:
+  if len(shape) != 1:
     output_tensors = _reshape_tensors(output_tensors, shape)
   return output_tensors
 
@@ -578,7 +578,7 @@ def build_shuffle_all_reduce(input_tensors, gather_devices, red_op, un_op=None):
   reduced_shards = _build_shuffle_gather(input_tensors, gather_devices,
                                          red_op, un_op)
   output_tensors = _build_shuffle_scatter(reduced_shards, dst_devices)
-  if len(shape) > 1:
+  if len(shape) != 1:
     output_tensors = _reshape_tensors(output_tensors, shape)
   return output_tensors
 
@@ -752,7 +752,7 @@ def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
         dst_tensors.append(array_ops.identity(broadcast_src))
     down_values[w] = dst_tensors
   output_tensors = [v for sublist in down_values for v in sublist]
-  if len(shape) > 1:
+  if len(shape) != 1:
     output_tensors = _reshape_tensors(output_tensors, shape)
   return output_tensors
 
@@ -831,7 +831,7 @@ def _build_shuffle_hybrid(input_tensors, gather_devices, red_op, upper_level_f):
   for w in range(0, num_workers):
     output_tensors += _build_shuffle_scatter(
         [level_2_output[w]], per_worker_devices[w])
-  if len(shape) > 1:
+  if len(shape) != 1:
     output_tensors = _reshape_tensors(output_tensors, shape)
   return output_tensors
 
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce_test.py b/tensorflow/contrib/all_reduce/python/all_reduce_test.py
index 0802b27369..47bab0a367 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce_test.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce_test.py
@@ -119,7 +119,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
   def _buildInitialVars(self, shape, dev_list):
     values = []
     num_devices = len(dev_list)
-    dim = np.prod(shape)
+    dim = np.prod(shape) if shape else 1
     for d in range(0, num_devices):
       with ops.device(dev_list[d]):
         npt = np.zeros(shape).astype(np.float32)
@@ -164,6 +164,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
                     (num_workers, num_gpus, shape, subdiv, elapsed))
 
   def testRingAllReduce(self):
+    self._testRingAllReduce(1, 2, [], 1)
     self._testRingAllReduce(1, 2, [8], 1)
     self._testRingAllReduce(1, 2, [4, 4], 1)
     self._testRingAllReduce(6, 1, [8], 1)
@@ -192,6 +193,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
                     "elapsed=%f" % (num_workers, num_gpus, shape, elapsed))
 
   def testShuffleAllReduce(self):
+    self._testShuffleAllReduce(1, 2, [], 1)
     self._testShuffleAllReduce(1, 2, [8], 1)
     self._testShuffleAllReduce(1, 2, [4, 4], 1)
     self._testShuffleAllReduce(1, 8, [32], 1)
-- 
GitLab


From 35ba5d8dc8899e28ac789dc493f0dd205e169c74 Mon Sep 17 00:00:00 2001
From: Sami Kama <sami.kama.git@gmail.com>
Date: Mon, 12 Feb 2018 13:35:23 -0800
Subject: [PATCH 958/966] Fix Py3 byte and string issue after swig update.
 Clarify failure message on finding libnvinfer in configure.py

---
 configure.py                                  | 21 +++++++++++++------
 .../contrib/tensorrt/python/trt_convert.py    | 12 +++++++++--
 .../contrib/tensorrt/test/test_tftrt.py       |  2 ++
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/configure.py b/configure.py
index 9cf59024c4..3aa1a3e956 100644
--- a/configure.py
+++ b/configure.py
@@ -1078,12 +1078,21 @@ def set_tf_tensorrt_install_path(environ_cp):
           break
 
     # Reset and Retry
-    print('Invalid path to TensorRT. None of the following files can be found:')
-    print(trt_install_path)
-    print(os.path.join(trt_install_path, 'lib'))
-    print(os.path.join(trt_install_path, 'lib64'))
-    if search_result:
-      print(libnvinfer_path_from_ldconfig)
+    if len(possible_files):
+      print('TensorRT libraries found in one the following directories',
+            'are not compatible with selected cuda and cudnn installations')
+      print(trt_install_path)
+      print(os.path.join(trt_install_path, 'lib'))
+      print(os.path.join(trt_install_path, 'lib64'))
+      if search_result:
+        print(libnvinfer_path_from_ldconfig)
+    else:
+      print('Invalid path to TensorRT. None of the following files can be found:')
+      print(trt_install_path)
+      print(os.path.join(trt_install_path, 'lib'))
+      print(os.path.join(trt_install_path, 'lib64'))
+      if search_result:
+        print(libnvinfer_path_from_ldconfig)
 
   else:
     raise UserInputError('Invalid TF_TENSORRT setting was provided %d '
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 69bbf451e0..9454862f85 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -55,10 +55,18 @@ def create_inference_graph(input_graph_def,
   def py3bytes(inp):
     return inp.encode("utf-8", errors="surrogateescape")
 
+  def py2string(inp):
+    return inp
+
+  def py3string(inp):
+    return inp.decode("utf-8")
+
   if _six.PY2:
     to_bytes = py2bytes
+    to_string = py2string
   else:
     to_bytes = py3bytes
+    to_string = py3string
 
   out_names = []
   for i in outputs:
@@ -76,8 +84,8 @@ def create_inference_graph(input_graph_def,
   # one is the transformed graphs protobuf string.
   out = trt_convert(input_graph_def_str, out_names, max_batch_size,
                     max_workspace_size_bytes)
-  status = out[0]
-  output_graph_def_string = to_bytes(out[1])
+  status = to_string(out[0])
+  output_graph_def_string = out[1]
   del input_graph_def_str  # Save some memory
   if len(status) < 2:
     raise _impl.UnknownError(None, None, status)
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 927a3e4719..adf3438e4b 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -67,5 +67,7 @@ if "__main__" in __name__:
                                          inpDims[0])  # Get optimized graph
   o1 = runGraph(gdef, dummy_input)
   o2 = runGraph(trt_graph, dummy_input)
+  o3 = runGraph(trt_graph, dummy_input)
   assert (np.array_equal(o1, o2))
+  assert (np.array_equal(o2, o3))
   print("Pass")
-- 
GitLab


From 527ad28262d0289f05405c80d1870324594a214d Mon Sep 17 00:00:00 2001
From: Nick Felt <nickfelt@google.com>
Date: Mon, 12 Feb 2018 14:11:03 -0800
Subject: [PATCH 959/966] Update tb-nightly dep to >= 1.7.0a0, < 1.8.0a0

---
 tensorflow/tools/pip_package/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 0e6b32bb49..e4ca974e1b 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -62,7 +62,7 @@ else:
 if 'tf_nightly' in project_name:
   for i, pkg in enumerate(REQUIRED_PACKAGES):
     if 'tensorboard' in pkg:
-      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.5.0a0, < 1.6.0a0'
+      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.7.0a0, < 1.8.0a0'
       break
 
 # weakref.finalize and enum were introduced in Python 3.4
-- 
GitLab


From 8a96d5727b271e73c87f49817a6c913bc896a4ef Mon Sep 17 00:00:00 2001
From: Sami Kama <sami.kama.git@gmail.com>
Date: Mon, 12 Feb 2018 14:28:51 -0800
Subject: [PATCH 960/966] Make buildifier happy

---
 tensorflow/contrib/BUILD           | 1 -
 tensorflow/tools/pip_package/BUILD | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 916f4f1f48..4272d90b70 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -112,7 +112,6 @@ py_library(
     ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_tensorrt([
         "//tensorflow/contrib/tensorrt:init_py",
     ]),
-
 )
 
 cc_library(
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index bd2e3adb84..31a53eb7b8 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -195,7 +195,7 @@ sh_binary(
             "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
         ],
     }) + if_mkl(["//third_party/mkl:intel_binary_blob"]) + if_tensorrt([
-      "//tensorflow/contrib/tensorrt:init_py",
+        "//tensorflow/contrib/tensorrt:init_py",
     ]),
 )
 
-- 
GitLab


From 19eadc9bc8e2ae6cbd03800b323db92da4cdf9cc Mon Sep 17 00:00:00 2001
From: Sami Kama <sami.kama.git@gmail.com>
Date: Mon, 12 Feb 2018 15:22:19 -0800
Subject: [PATCH 961/966] Don't use tf as directly and import individual
 modules for internal builds

---
 .../contrib/tensorrt/test/test_tftrt.py       | 51 ++++++++++++-------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 69fccd3192..9ba8cbf4d5 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -19,42 +19,55 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-import tensorflow as tf
-import tensorflow.contrib.tensorrt as trt
+# normally we should do import tensorflow as tf and then
+# tf.placeholder, tf.constant, tf.nn.conv2d etc but
+# it looks like internal builds don't like it so
+# importing every module individually
+
+from tensorflow.contrib.tensorrt as trt
+from tensorflow.core.protobuf import config_pb2 as cpb2
+from tensorflow.python.client import session as csess
+from tensorflow.python.framework import constant_op as cop
+from tensorflow.python.framework import dtypes as dtypes
+from tensorflow.python.framework import importer as importer
+from tensorflow.python.framework import ops as ops
+from tensorflow.python.ops import array_ops as aops
+from tensorflow.python.ops import nn as nn
+from tensorflow.python.ops import nn_ops as nn_ops
 
 
 def get_simple_graph_def():
   """Create a simple graph and return its graph_def"""
-  g = tf.Graph()
+  g = ops.Graph()
   with g.as_default():
-    a = tf.placeholder(dtype=tf.float32, shape=(None, 24, 24, 2), name="input")
-    e = tf.constant(
+    a = aops.placeholder(dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
+    e = cop.constant(
         [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
         name="weights",
-        dtype=tf.float32)
-    conv = tf.nn.conv2d(
+        dtype=dtypes.float32)
+    conv = nn.conv2d(
         input=a, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
-    b = tf.constant([4., 1.5, 2., 3., 5., 7.], name="bias", dtype=tf.float32)
-    t = tf.nn.bias_add(conv, b, name="biasAdd")
-    relu = tf.nn.relu(t, "relu")
-    idty = tf.identity(relu, "ID")
-    v = tf.nn.max_pool(
+    b = cop.constant([4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
+    t = nn.bias_add(conv, b, name="biasAdd")
+    relu = nn.relu(t, "relu")
+    idty = aops.identity(relu, "ID")
+    v = nn_ops.max_pool(
         idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
-    tf.squeeze(v, name="output")
+    aops.squeeze(v, name="output")
   return g.as_graph_def()
 
 
 def run_graph(gdef, dumm_inp):
-  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.50)
-  tf.reset_default_graph()
-  g = tf.Graph()
+  gpu_options = cbp2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+  ops.reset_default_graph()
+  g = ops.Graph()
   with g.as_default():
-    inp, out = tf.import_graph_def(
+    inp, out = importer.import_graph_def(
         graph_def=gdef, return_elements=["input", "output"])
     inp = inp.outputs[0]
     out = out.outputs[0]
-  with tf.Session(
-      config=tf.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
+  with csess.Session(
+      config=cbp2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
     val = sess.run(out, {inp: dumm_inp})
   return val
 
-- 
GitLab


From a60ee657a6f1a0479e88b0c9c0f10b204e02ab7c Mon Sep 17 00:00:00 2001
From: Sami Kama <sami.kama.git@gmail.com>
Date: Mon, 12 Feb 2018 15:23:54 -0800
Subject: [PATCH 962/966] fix import

---
 tensorflow/contrib/tensorrt/test/test_tftrt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 9ba8cbf4d5..58f786c8fb 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -24,7 +24,7 @@ import numpy as np
 # it looks like internal builds don't like it so
 # importing every module individually
 
-from tensorflow.contrib.tensorrt as trt
+from tensorflow.contrib import tensorrt as trt
 from tensorflow.core.protobuf import config_pb2 as cpb2
 from tensorflow.python.client import session as csess
 from tensorflow.python.framework import constant_op as cop
-- 
GitLab


From 83b96092ae178171cd7c8c92b366869854b80158 Mon Sep 17 00:00:00 2001
From: Sami Kama <sami.kama.git@gmail.com>
Date: Mon, 12 Feb 2018 15:25:34 -0800
Subject: [PATCH 963/966] fix typo

---
 tensorflow/contrib/tensorrt/test/test_tftrt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 58f786c8fb..a3b904fdb6 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -67,7 +67,7 @@ def run_graph(gdef, dumm_inp):
     inp = inp.outputs[0]
     out = out.outputs[0]
   with csess.Session(
-      config=cbp2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
+      config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
     val = sess.run(out, {inp: dumm_inp})
   return val
 
-- 
GitLab


From 77fd169ca101cf8c7bda6128c32fd5422558828b Mon Sep 17 00:00:00 2001
From: Sami Kama <sami.kama.git@gmail.com>
Date: Mon, 12 Feb 2018 15:26:08 -0800
Subject: [PATCH 964/966] fix typo

---
 tensorflow/contrib/tensorrt/test/test_tftrt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index a3b904fdb6..8a61af4783 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -58,7 +58,7 @@ def get_simple_graph_def():
 
 
 def run_graph(gdef, dumm_inp):
-  gpu_options = cbp2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+  gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
   ops.reset_default_graph()
   g = ops.Graph()
   with g.as_default():
-- 
GitLab


From 71731eb8148ba7e666697f4956b85c7ffa64bd1d Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 12 Feb 2018 15:29:07 -0800
Subject: [PATCH 965/966] Add documentation for s3 usage with TensorFlow
 (#16923)

* Add documentation for s3 usage with TensorFlow

This fix adds a very priliminary documentation for s3 usage
with TensorFlow, as an attempt to address the comment 11089.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add s3 documentation to index page for deployment.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add left navigation link for s3 docs.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Change example to use tf.data instead of old input style

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/docs_src/deploy/index.md      |  2 ++
 tensorflow/docs_src/deploy/leftnav_files |  1 +
 tensorflow/docs_src/deploy/s3.md         | 40 ++++++++++++++++++++++++
 3 files changed, 43 insertions(+)
 create mode 100644 tensorflow/docs_src/deploy/s3.md

diff --git a/tensorflow/docs_src/deploy/index.md b/tensorflow/docs_src/deploy/index.md
index 5831960b4f..07b1bc9257 100644
--- a/tensorflow/docs_src/deploy/index.md
+++ b/tensorflow/docs_src/deploy/index.md
@@ -7,6 +7,8 @@ the following documents:
     a cluster of TensorFlow servers.
   * @{$hadoop$How to run TensorFlow on Hadoop}, which has a highly
     self-explanatory title.
+  * @{$s3$How to run TensorFlow with the S3 filesystem}, which explains how
+    to run TensorFlow with the S3 file system.
   * The entire document set for [TensorFlow serving](/serving), an open-source,
     flexible, high-performance serving system for machine-learned models
     designed for production environments. TensorFlow Serving provides
diff --git a/tensorflow/docs_src/deploy/leftnav_files b/tensorflow/docs_src/deploy/leftnav_files
index f8f8d578e6..c682e7add1 100644
--- a/tensorflow/docs_src/deploy/leftnav_files
+++ b/tensorflow/docs_src/deploy/leftnav_files
@@ -1,3 +1,4 @@
 index.md
 distributed.md
 hadoop.md
+s3.md
diff --git a/tensorflow/docs_src/deploy/s3.md b/tensorflow/docs_src/deploy/s3.md
new file mode 100644
index 0000000000..38f8428634
--- /dev/null
+++ b/tensorflow/docs_src/deploy/s3.md
@@ -0,0 +1,40 @@
+# How to run TensorFlow on S3
+
+This document describes how to run TensorFlow on S3 file system.
+
+## S3
+
+We assume that you are familiar with @{$reading_data$reading data}.
+
+To use S3 with TensorFlow, change the file paths you use to read and write
+data to an S3 path. For example:
+
+```python
+filenames = ["s3://bucketname/path/to/file1.tfrecord",
+             "s3://bucketname/path/to/file2.tfrecord"]
+dataset = tf.data.TFRecordDataset(filenames)
+```
+
+When reading or writing data on S3 with your TensorFlow program, the behavior
+could be controlled by various environmental variables:
+
+*   **AWS_REGION**: By default, regional endpoint is used for S3, with region
+    controlled by `AWS_REGION`. If `AWS_REGION` is not specified, then
+    `us-east-1` is used.
+*   **S3_ENDPOINT**: The endpoint could be overridden explicitly with
+    `S3_ENDPOINT` specified.
+*   **S3_USE_HTTPS**: HTTPS is used to access S3 by default, unless
+    `S3_USE_HTTPS=0`.
+*   **S3_VERIFY_SSL**: If HTTPS is used, SSL verification could be disabled
+    with `S3_VERIFY_SSL=0`.
+
+To read or write objects in a bucket that is no publicly accessible,
+AWS credentials must be provided through one of the following methods:
+
+*   Set credentials in the AWS credentials profile file on the local system,
+    located at: `~/.aws/credentials` on Linux, macOS, or Unix, or
+    `C:\Users\USERNAME\.aws\credentials` on Windows.
+*   Set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment
+    variables.
+*   If TensorFlow is deployed on an EC2 instance, specify an IAM role and then
+    give the EC2 instance access to that role.
-- 
GitLab


From 1e4b5b8c0cc1675b9ecac3569c91563a2a4f9984 Mon Sep 17 00:00:00 2001
From: gracehoney <31743510+aaroey@users.noreply.github.com>
Date: Mon, 12 Feb 2018 15:36:49 -0800
Subject: [PATCH 966/966] Fix test_tftrt.py format

---
 tensorflow/contrib/tensorrt/test/test_tftrt.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 8a61af4783..18dba94acb 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -40,14 +40,16 @@ def get_simple_graph_def():
   """Create a simple graph and return its graph_def"""
   g = ops.Graph()
   with g.as_default():
-    a = aops.placeholder(dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
+    a = aops.placeholder(
+        dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
     e = cop.constant(
         [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
         name="weights",
         dtype=dtypes.float32)
     conv = nn.conv2d(
         input=a, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
-    b = cop.constant([4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
+    b = cop.constant(
+        [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
     t = nn.bias_add(conv, b, name="biasAdd")
     relu = nn.relu(t, "relu")
     idty = aops.identity(relu, "ID")
-- 
GitLab